diff --git "a/checkpoint-57000/trainer_state.json" "b/checkpoint-57000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-57000/trainer_state.json" @@ -0,0 +1,399033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.6537700491188865, + "eval_steps": 500, + "global_step": 57000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.655818609306981e-05, + "grad_norm": 1.3708604284273813, + "learning_rate": 1.5518311607697085e-08, + "loss": 11.9661, + "step": 1 + }, + { + "epoch": 9.311637218613962e-05, + "grad_norm": 1.517282924209813, + "learning_rate": 3.103662321539417e-08, + "loss": 11.9619, + "step": 2 + }, + { + "epoch": 0.00013967455827920943, + "grad_norm": 1.381027223632402, + "learning_rate": 4.655493482309125e-08, + "loss": 11.9661, + "step": 3 + }, + { + "epoch": 0.00018623274437227924, + "grad_norm": 1.4347701445726384, + "learning_rate": 6.207324643078834e-08, + "loss": 11.9717, + "step": 4 + }, + { + "epoch": 0.00023279093046534908, + "grad_norm": 1.4515647037637918, + "learning_rate": 7.759155803848543e-08, + "loss": 11.9723, + "step": 5 + }, + { + "epoch": 0.00027934911655841887, + "grad_norm": 1.4602866872438616, + "learning_rate": 9.31098696461825e-08, + "loss": 11.9611, + "step": 6 + }, + { + "epoch": 0.0003259073026514887, + "grad_norm": 1.35912222926092, + "learning_rate": 1.0862818125387958e-07, + "loss": 11.964, + "step": 7 + }, + { + "epoch": 0.0003724654887445585, + "grad_norm": 1.450848286045753, + "learning_rate": 1.2414649286157668e-07, + "loss": 11.9621, + "step": 8 + }, + { + "epoch": 0.0004190236748376283, + "grad_norm": 1.3044425074824442, + "learning_rate": 1.3966480446927375e-07, + "loss": 11.9662, + "step": 9 + }, + { + "epoch": 0.00046558186093069816, + "grad_norm": 1.3131819667900089, + "learning_rate": 1.5518311607697085e-07, + "loss": 11.9684, + "step": 10 + }, + { + "epoch": 0.0005121400470237679, + "grad_norm": 1.4808624546147569, + "learning_rate": 1.707014276846679e-07, + "loss": 11.9701, + "step": 11 + }, + { + "epoch": 0.0005586982331168377, + "grad_norm": 1.4166049186968965, + "learning_rate": 1.86219739292365e-07, + "loss": 11.9702, + "step": 12 + }, + { + "epoch": 0.0006052564192099075, + "grad_norm": 1.4676660773041568, + "learning_rate": 2.0173805090006208e-07, + "loss": 11.9658, + "step": 13 + }, + { + "epoch": 0.0006518146053029774, + "grad_norm": 1.5740947261226104, + "learning_rate": 2.1725636250775915e-07, + "loss": 11.9655, + "step": 14 + }, + { + "epoch": 0.0006983727913960472, + "grad_norm": 1.400989580265354, + "learning_rate": 2.3277467411545626e-07, + "loss": 11.9678, + "step": 15 + }, + { + "epoch": 0.000744930977489117, + "grad_norm": 1.4842782747827559, + "learning_rate": 2.4829298572315336e-07, + "loss": 11.9667, + "step": 16 + }, + { + "epoch": 0.0007914891635821868, + "grad_norm": 1.3433668621533006, + "learning_rate": 2.638112973308504e-07, + "loss": 11.9691, + "step": 17 + }, + { + "epoch": 0.0008380473496752566, + "grad_norm": 1.3830758712906133, + "learning_rate": 2.793296089385475e-07, + "loss": 11.9755, + "step": 18 + }, + { + "epoch": 0.0008846055357683265, + "grad_norm": 1.3691410595011717, + "learning_rate": 2.9484792054624455e-07, + "loss": 11.9689, + "step": 19 + }, + { + "epoch": 0.0009311637218613963, + "grad_norm": 1.4604322038031345, + "learning_rate": 3.103662321539417e-07, + "loss": 11.9761, + "step": 20 + }, + { + "epoch": 0.0009777219079544661, + "grad_norm": 1.4371805576636691, + 
"learning_rate": 3.2588454376163876e-07, + "loss": 11.9609, + "step": 21 + }, + { + "epoch": 0.0010242800940475358, + "grad_norm": 1.4373782086422322, + "learning_rate": 3.414028553693358e-07, + "loss": 11.9659, + "step": 22 + }, + { + "epoch": 0.0010708382801406058, + "grad_norm": 1.4241093119460353, + "learning_rate": 3.569211669770329e-07, + "loss": 11.9598, + "step": 23 + }, + { + "epoch": 0.0011173964662336755, + "grad_norm": 1.4066259919845672, + "learning_rate": 3.7243947858473e-07, + "loss": 11.9664, + "step": 24 + }, + { + "epoch": 0.0011639546523267454, + "grad_norm": 1.4780693875771842, + "learning_rate": 3.8795779019242706e-07, + "loss": 11.9669, + "step": 25 + }, + { + "epoch": 0.001210512838419815, + "grad_norm": 1.4921147568065307, + "learning_rate": 4.0347610180012416e-07, + "loss": 11.9628, + "step": 26 + }, + { + "epoch": 0.001257071024512885, + "grad_norm": 1.4289736363183292, + "learning_rate": 4.1899441340782126e-07, + "loss": 11.9682, + "step": 27 + }, + { + "epoch": 0.0013036292106059547, + "grad_norm": 1.362648377603769, + "learning_rate": 4.345127250155183e-07, + "loss": 11.9583, + "step": 28 + }, + { + "epoch": 0.0013501873966990246, + "grad_norm": 1.3798786930303772, + "learning_rate": 4.5003103662321536e-07, + "loss": 11.962, + "step": 29 + }, + { + "epoch": 0.0013967455827920943, + "grad_norm": 1.401773141944224, + "learning_rate": 4.655493482309125e-07, + "loss": 11.9615, + "step": 30 + }, + { + "epoch": 0.0014433037688851643, + "grad_norm": 1.4686450864923506, + "learning_rate": 4.810676598386096e-07, + "loss": 11.9631, + "step": 31 + }, + { + "epoch": 0.001489861954978234, + "grad_norm": 1.4984663716253244, + "learning_rate": 4.965859714463067e-07, + "loss": 11.9606, + "step": 32 + }, + { + "epoch": 0.0015364201410713039, + "grad_norm": 1.4907975818814487, + "learning_rate": 5.121042830540038e-07, + "loss": 11.9579, + "step": 33 + }, + { + "epoch": 0.0015829783271643736, + "grad_norm": 1.3694941622293078, + "learning_rate": 5.276225946617008e-07, + "loss": 11.9558, + "step": 34 + }, + { + "epoch": 0.0016295365132574435, + "grad_norm": 1.4253452197381304, + "learning_rate": 5.43140906269398e-07, + "loss": 11.9512, + "step": 35 + }, + { + "epoch": 0.0016760946993505132, + "grad_norm": 1.4858283164258006, + "learning_rate": 5.58659217877095e-07, + "loss": 11.9488, + "step": 36 + }, + { + "epoch": 0.0017226528854435831, + "grad_norm": 1.4679913873000154, + "learning_rate": 5.741775294847921e-07, + "loss": 11.9474, + "step": 37 + }, + { + "epoch": 0.001769211071536653, + "grad_norm": 1.4738777975041721, + "learning_rate": 5.896958410924891e-07, + "loss": 11.948, + "step": 38 + }, + { + "epoch": 0.0018157692576297227, + "grad_norm": 1.3150789833931824, + "learning_rate": 6.052141527001863e-07, + "loss": 11.945, + "step": 39 + }, + { + "epoch": 0.0018623274437227927, + "grad_norm": 1.4454105078426955, + "learning_rate": 6.207324643078834e-07, + "loss": 11.9444, + "step": 40 + }, + { + "epoch": 0.0019088856298158624, + "grad_norm": 1.444520612540447, + "learning_rate": 6.362507759155805e-07, + "loss": 11.9373, + "step": 41 + }, + { + "epoch": 0.0019554438159089323, + "grad_norm": 1.4779617989003198, + "learning_rate": 6.517690875232775e-07, + "loss": 11.941, + "step": 42 + }, + { + "epoch": 0.002002002002002002, + "grad_norm": 1.5049406062936521, + "learning_rate": 6.672873991309746e-07, + "loss": 11.9381, + "step": 43 + }, + { + "epoch": 0.0020485601880950717, + "grad_norm": 1.3177127023583788, + "learning_rate": 6.828057107386716e-07, + "loss": 11.9423, + "step": 
44 + }, + { + "epoch": 0.002095118374188142, + "grad_norm": 1.3970609350885217, + "learning_rate": 6.983240223463687e-07, + "loss": 11.9404, + "step": 45 + }, + { + "epoch": 0.0021416765602812115, + "grad_norm": 1.351286483683469, + "learning_rate": 7.138423339540658e-07, + "loss": 11.9282, + "step": 46 + }, + { + "epoch": 0.0021882347463742812, + "grad_norm": 1.540865059656044, + "learning_rate": 7.29360645561763e-07, + "loss": 11.917, + "step": 47 + }, + { + "epoch": 0.002234792932467351, + "grad_norm": 1.451510855628316, + "learning_rate": 7.4487895716946e-07, + "loss": 11.9067, + "step": 48 + }, + { + "epoch": 0.002281351118560421, + "grad_norm": 1.513839230798165, + "learning_rate": 7.603972687771571e-07, + "loss": 11.898, + "step": 49 + }, + { + "epoch": 0.0023279093046534908, + "grad_norm": 1.4416570593606024, + "learning_rate": 7.759155803848541e-07, + "loss": 11.8971, + "step": 50 + }, + { + "epoch": 0.0023744674907465605, + "grad_norm": 1.5490182018611962, + "learning_rate": 7.914338919925513e-07, + "loss": 11.8886, + "step": 51 + }, + { + "epoch": 0.00242102567683963, + "grad_norm": 1.2752788656579526, + "learning_rate": 8.069522036002483e-07, + "loss": 11.8945, + "step": 52 + }, + { + "epoch": 0.0024675838629327003, + "grad_norm": 1.4177818453360893, + "learning_rate": 8.224705152079454e-07, + "loss": 11.8946, + "step": 53 + }, + { + "epoch": 0.00251414204902577, + "grad_norm": 1.4617964175233835, + "learning_rate": 8.379888268156425e-07, + "loss": 11.892, + "step": 54 + }, + { + "epoch": 0.0025607002351188397, + "grad_norm": 1.357060068875352, + "learning_rate": 8.535071384233396e-07, + "loss": 11.8939, + "step": 55 + }, + { + "epoch": 0.0026072584212119094, + "grad_norm": 1.485107605832194, + "learning_rate": 8.690254500310366e-07, + "loss": 11.8797, + "step": 56 + }, + { + "epoch": 0.0026538166073049796, + "grad_norm": 1.3927889115883838, + "learning_rate": 8.845437616387337e-07, + "loss": 11.8825, + "step": 57 + }, + { + "epoch": 0.0027003747933980493, + "grad_norm": 1.4800135431422752, + "learning_rate": 9.000620732464307e-07, + "loss": 11.8752, + "step": 58 + }, + { + "epoch": 0.002746932979491119, + "grad_norm": 1.4905918969566738, + "learning_rate": 9.15580384854128e-07, + "loss": 11.8771, + "step": 59 + }, + { + "epoch": 0.0027934911655841887, + "grad_norm": 1.5061304827184874, + "learning_rate": 9.31098696461825e-07, + "loss": 11.8705, + "step": 60 + }, + { + "epoch": 0.002840049351677259, + "grad_norm": 1.5341285319305773, + "learning_rate": 9.466170080695222e-07, + "loss": 11.8664, + "step": 61 + }, + { + "epoch": 0.0028866075377703285, + "grad_norm": 1.469650002827667, + "learning_rate": 9.621353196772191e-07, + "loss": 11.8592, + "step": 62 + }, + { + "epoch": 0.002933165723863398, + "grad_norm": 1.358540263308322, + "learning_rate": 9.776536312849163e-07, + "loss": 11.867, + "step": 63 + }, + { + "epoch": 0.002979723909956468, + "grad_norm": 1.5295181083833052, + "learning_rate": 9.931719428926134e-07, + "loss": 11.8412, + "step": 64 + }, + { + "epoch": 0.003026282096049538, + "grad_norm": 1.5758748772096505, + "learning_rate": 1.0086902545003104e-06, + "loss": 11.7985, + "step": 65 + }, + { + "epoch": 0.0030728402821426077, + "grad_norm": 1.5236646203653539, + "learning_rate": 1.0242085661080075e-06, + "loss": 11.7784, + "step": 66 + }, + { + "epoch": 0.0031193984682356775, + "grad_norm": 1.4617124328594029, + "learning_rate": 1.0397268777157045e-06, + "loss": 11.76, + "step": 67 + }, + { + "epoch": 0.003165956654328747, + "grad_norm": 1.542096808602436, + 
"learning_rate": 1.0552451893234016e-06, + "loss": 11.738, + "step": 68 + }, + { + "epoch": 0.0032125148404218173, + "grad_norm": 1.6246474727938334, + "learning_rate": 1.0707635009310988e-06, + "loss": 11.7153, + "step": 69 + }, + { + "epoch": 0.003259073026514887, + "grad_norm": 1.5129708161413855, + "learning_rate": 1.086281812538796e-06, + "loss": 11.7263, + "step": 70 + }, + { + "epoch": 0.0033056312126079567, + "grad_norm": 1.5157499538362975, + "learning_rate": 1.1018001241464929e-06, + "loss": 11.7135, + "step": 71 + }, + { + "epoch": 0.0033521893987010264, + "grad_norm": 1.3703807723942008, + "learning_rate": 1.11731843575419e-06, + "loss": 11.724, + "step": 72 + }, + { + "epoch": 0.0033987475847940965, + "grad_norm": 1.5226912266215868, + "learning_rate": 1.1328367473618872e-06, + "loss": 11.6961, + "step": 73 + }, + { + "epoch": 0.0034453057708871662, + "grad_norm": 1.3641858800827498, + "learning_rate": 1.1483550589695841e-06, + "loss": 11.7135, + "step": 74 + }, + { + "epoch": 0.003491863956980236, + "grad_norm": 1.4710563338041693, + "learning_rate": 1.1638733705772813e-06, + "loss": 11.6877, + "step": 75 + }, + { + "epoch": 0.003538422143073306, + "grad_norm": 1.422714753629224, + "learning_rate": 1.1793916821849782e-06, + "loss": 11.6997, + "step": 76 + }, + { + "epoch": 0.0035849803291663758, + "grad_norm": 1.4657301044238948, + "learning_rate": 1.1949099937926754e-06, + "loss": 11.6955, + "step": 77 + }, + { + "epoch": 0.0036315385152594455, + "grad_norm": 1.595862769409633, + "learning_rate": 1.2104283054003725e-06, + "loss": 11.6551, + "step": 78 + }, + { + "epoch": 0.003678096701352515, + "grad_norm": 1.5739833606984286, + "learning_rate": 1.2259466170080695e-06, + "loss": 11.6524, + "step": 79 + }, + { + "epoch": 0.0037246548874455853, + "grad_norm": 1.4482913051577482, + "learning_rate": 1.2414649286157668e-06, + "loss": 11.6666, + "step": 80 + }, + { + "epoch": 0.003771213073538655, + "grad_norm": 1.4629908906970026, + "learning_rate": 1.2569832402234638e-06, + "loss": 11.6572, + "step": 81 + }, + { + "epoch": 0.0038177712596317247, + "grad_norm": 1.5188243511852006, + "learning_rate": 1.272501551831161e-06, + "loss": 11.6304, + "step": 82 + }, + { + "epoch": 0.0038643294457247944, + "grad_norm": 1.6428341129162678, + "learning_rate": 1.2880198634388579e-06, + "loss": 11.6108, + "step": 83 + }, + { + "epoch": 0.0039108876318178646, + "grad_norm": 1.4498383968282371, + "learning_rate": 1.303538175046555e-06, + "loss": 11.6405, + "step": 84 + }, + { + "epoch": 0.003957445817910934, + "grad_norm": 1.584189459796871, + "learning_rate": 1.319056486654252e-06, + "loss": 11.5908, + "step": 85 + }, + { + "epoch": 0.004004004004004004, + "grad_norm": 1.5129703355627007, + "learning_rate": 1.3345747982619491e-06, + "loss": 11.6037, + "step": 86 + }, + { + "epoch": 0.004050562190097074, + "grad_norm": 1.6610675248055227, + "learning_rate": 1.3500931098696463e-06, + "loss": 11.5474, + "step": 87 + }, + { + "epoch": 0.004097120376190143, + "grad_norm": 1.5197798486315686, + "learning_rate": 1.3656114214773432e-06, + "loss": 11.5699, + "step": 88 + }, + { + "epoch": 0.0041436785622832135, + "grad_norm": 1.6699257040586764, + "learning_rate": 1.3811297330850404e-06, + "loss": 11.4856, + "step": 89 + }, + { + "epoch": 0.004190236748376284, + "grad_norm": 1.6997661626474914, + "learning_rate": 1.3966480446927373e-06, + "loss": 11.4404, + "step": 90 + }, + { + "epoch": 0.004236794934469353, + "grad_norm": 1.6605565701782765, + "learning_rate": 1.4121663563004347e-06, + "loss": 
11.4393, + "step": 91 + }, + { + "epoch": 0.004283353120562423, + "grad_norm": 1.6058736755746341, + "learning_rate": 1.4276846679081316e-06, + "loss": 11.3967, + "step": 92 + }, + { + "epoch": 0.004329911306655492, + "grad_norm": 1.5948037881921027, + "learning_rate": 1.4432029795158288e-06, + "loss": 11.3713, + "step": 93 + }, + { + "epoch": 0.0043764694927485625, + "grad_norm": 1.475393673339237, + "learning_rate": 1.458721291123526e-06, + "loss": 11.3739, + "step": 94 + }, + { + "epoch": 0.004423027678841633, + "grad_norm": 1.5694620446125722, + "learning_rate": 1.4742396027312229e-06, + "loss": 11.2971, + "step": 95 + }, + { + "epoch": 0.004469585864934702, + "grad_norm": 1.617089167368496, + "learning_rate": 1.48975791433892e-06, + "loss": 11.2606, + "step": 96 + }, + { + "epoch": 0.004516144051027772, + "grad_norm": 1.4919966995310878, + "learning_rate": 1.505276225946617e-06, + "loss": 11.2985, + "step": 97 + }, + { + "epoch": 0.004562702237120842, + "grad_norm": 1.4264899295134017, + "learning_rate": 1.5207945375543141e-06, + "loss": 11.2986, + "step": 98 + }, + { + "epoch": 0.004609260423213911, + "grad_norm": 1.5114911514161742, + "learning_rate": 1.5363128491620113e-06, + "loss": 11.2628, + "step": 99 + }, + { + "epoch": 0.0046558186093069815, + "grad_norm": 1.429660751428128, + "learning_rate": 1.5518311607697082e-06, + "loss": 11.2812, + "step": 100 + }, + { + "epoch": 0.004702376795400051, + "grad_norm": 1.40581919425212, + "learning_rate": 1.5673494723774056e-06, + "loss": 11.2548, + "step": 101 + }, + { + "epoch": 0.004748934981493121, + "grad_norm": 1.4156737439783769, + "learning_rate": 1.5828677839851025e-06, + "loss": 11.2164, + "step": 102 + }, + { + "epoch": 0.004795493167586191, + "grad_norm": 1.409157019308083, + "learning_rate": 1.5983860955927997e-06, + "loss": 11.2385, + "step": 103 + }, + { + "epoch": 0.00484205135367926, + "grad_norm": 1.464077718144328, + "learning_rate": 1.6139044072004966e-06, + "loss": 11.1704, + "step": 104 + }, + { + "epoch": 0.0048886095397723305, + "grad_norm": 1.3690782753909905, + "learning_rate": 1.6294227188081938e-06, + "loss": 11.171, + "step": 105 + }, + { + "epoch": 0.004935167725865401, + "grad_norm": 1.3853483983866206, + "learning_rate": 1.6449410304158907e-06, + "loss": 11.1754, + "step": 106 + }, + { + "epoch": 0.00498172591195847, + "grad_norm": 1.399799630714051, + "learning_rate": 1.6604593420235879e-06, + "loss": 11.1419, + "step": 107 + }, + { + "epoch": 0.00502828409805154, + "grad_norm": 1.4022083998698203, + "learning_rate": 1.675977653631285e-06, + "loss": 11.0958, + "step": 108 + }, + { + "epoch": 0.005074842284144609, + "grad_norm": 1.3242883311488585, + "learning_rate": 1.691495965238982e-06, + "loss": 11.124, + "step": 109 + }, + { + "epoch": 0.0051214004702376794, + "grad_norm": 1.3836050769516477, + "learning_rate": 1.7070142768466791e-06, + "loss": 11.0965, + "step": 110 + }, + { + "epoch": 0.00516795865633075, + "grad_norm": 1.3817451133501975, + "learning_rate": 1.722532588454376e-06, + "loss": 11.021, + "step": 111 + }, + { + "epoch": 0.005214516842423819, + "grad_norm": 1.3330601462642664, + "learning_rate": 1.7380509000620732e-06, + "loss": 11.0609, + "step": 112 + }, + { + "epoch": 0.005261075028516889, + "grad_norm": 1.3103903658295055, + "learning_rate": 1.7535692116697704e-06, + "loss": 11.101, + "step": 113 + }, + { + "epoch": 0.005307633214609959, + "grad_norm": 1.2545602734488988, + "learning_rate": 1.7690875232774673e-06, + "loss": 11.0631, + "step": 114 + }, + { + "epoch": 
0.005354191400703028, + "grad_norm": 1.3046136283844865, + "learning_rate": 1.7846058348851645e-06, + "loss": 11.0008, + "step": 115 + }, + { + "epoch": 0.0054007495867960985, + "grad_norm": 1.285627827178413, + "learning_rate": 1.8001241464928614e-06, + "loss": 10.9731, + "step": 116 + }, + { + "epoch": 0.005447307772889169, + "grad_norm": 1.2453837848323703, + "learning_rate": 1.815642458100559e-06, + "loss": 11.0192, + "step": 117 + }, + { + "epoch": 0.005493865958982238, + "grad_norm": 1.2602524644451933, + "learning_rate": 1.831160769708256e-06, + "loss": 11.0023, + "step": 118 + }, + { + "epoch": 0.005540424145075308, + "grad_norm": 1.2195527799462427, + "learning_rate": 1.846679081315953e-06, + "loss": 10.9221, + "step": 119 + }, + { + "epoch": 0.005586982331168377, + "grad_norm": 1.2116870254119112, + "learning_rate": 1.86219739292365e-06, + "loss": 10.9408, + "step": 120 + }, + { + "epoch": 0.0056335405172614475, + "grad_norm": 1.2314291259641865, + "learning_rate": 1.8777157045313472e-06, + "loss": 10.929, + "step": 121 + }, + { + "epoch": 0.005680098703354518, + "grad_norm": 1.2308756212028744, + "learning_rate": 1.8932340161390444e-06, + "loss": 10.9252, + "step": 122 + }, + { + "epoch": 0.005726656889447587, + "grad_norm": 1.1986638404572, + "learning_rate": 1.9087523277467415e-06, + "loss": 10.9208, + "step": 123 + }, + { + "epoch": 0.005773215075540657, + "grad_norm": 1.1526368634053268, + "learning_rate": 1.9242706393544382e-06, + "loss": 10.9363, + "step": 124 + }, + { + "epoch": 0.005819773261633727, + "grad_norm": 1.1814388878134068, + "learning_rate": 1.9397889509621354e-06, + "loss": 10.8366, + "step": 125 + }, + { + "epoch": 0.005866331447726796, + "grad_norm": 1.1427357441094759, + "learning_rate": 1.9553072625698325e-06, + "loss": 10.8787, + "step": 126 + }, + { + "epoch": 0.0059128896338198665, + "grad_norm": 1.1512447212113377, + "learning_rate": 1.9708255741775297e-06, + "loss": 10.8742, + "step": 127 + }, + { + "epoch": 0.005959447819912936, + "grad_norm": 1.158533742562125, + "learning_rate": 1.986343885785227e-06, + "loss": 10.8669, + "step": 128 + }, + { + "epoch": 0.006006006006006006, + "grad_norm": 1.1596045173411622, + "learning_rate": 2.0018621973929236e-06, + "loss": 10.7754, + "step": 129 + }, + { + "epoch": 0.006052564192099076, + "grad_norm": 1.1373138087967376, + "learning_rate": 2.0173805090006207e-06, + "loss": 10.8441, + "step": 130 + }, + { + "epoch": 0.006099122378192145, + "grad_norm": 1.1097587958698145, + "learning_rate": 2.032898820608318e-06, + "loss": 10.7972, + "step": 131 + }, + { + "epoch": 0.0061456805642852155, + "grad_norm": 1.10953573247759, + "learning_rate": 2.048417132216015e-06, + "loss": 10.7337, + "step": 132 + }, + { + "epoch": 0.006192238750378286, + "grad_norm": 1.0685350040782384, + "learning_rate": 2.0639354438237118e-06, + "loss": 10.7523, + "step": 133 + }, + { + "epoch": 0.006238796936471355, + "grad_norm": 1.0601374387483151, + "learning_rate": 2.079453755431409e-06, + "loss": 10.7849, + "step": 134 + }, + { + "epoch": 0.006285355122564425, + "grad_norm": 1.132554937132596, + "learning_rate": 2.094972067039106e-06, + "loss": 10.7163, + "step": 135 + }, + { + "epoch": 0.006331913308657494, + "grad_norm": 1.0872903649918424, + "learning_rate": 2.1104903786468032e-06, + "loss": 10.7264, + "step": 136 + }, + { + "epoch": 0.0063784714947505644, + "grad_norm": 1.052816531472274, + "learning_rate": 2.1260086902545004e-06, + "loss": 10.7205, + "step": 137 + }, + { + "epoch": 0.006425029680843635, + "grad_norm": 
1.0925283597566655, + "learning_rate": 2.1415270018621975e-06, + "loss": 10.7139, + "step": 138 + }, + { + "epoch": 0.006471587866936704, + "grad_norm": 1.03208133690534, + "learning_rate": 2.1570453134698947e-06, + "loss": 10.6965, + "step": 139 + }, + { + "epoch": 0.006518146053029774, + "grad_norm": 1.0217660957070638, + "learning_rate": 2.172563625077592e-06, + "loss": 10.74, + "step": 140 + }, + { + "epoch": 0.006564704239122844, + "grad_norm": 1.034765001722576, + "learning_rate": 2.188081936685289e-06, + "loss": 10.6604, + "step": 141 + }, + { + "epoch": 0.006611262425215913, + "grad_norm": 1.0315672709330623, + "learning_rate": 2.2036002482929857e-06, + "loss": 10.6395, + "step": 142 + }, + { + "epoch": 0.0066578206113089835, + "grad_norm": 1.0284614345732108, + "learning_rate": 2.219118559900683e-06, + "loss": 10.6518, + "step": 143 + }, + { + "epoch": 0.006704378797402053, + "grad_norm": 0.9852431643450547, + "learning_rate": 2.23463687150838e-06, + "loss": 10.683, + "step": 144 + }, + { + "epoch": 0.006750936983495123, + "grad_norm": 0.9974728455391493, + "learning_rate": 2.250155183116077e-06, + "loss": 10.6414, + "step": 145 + }, + { + "epoch": 0.006797495169588193, + "grad_norm": 1.0193941804667037, + "learning_rate": 2.2656734947237744e-06, + "loss": 10.6277, + "step": 146 + }, + { + "epoch": 0.006844053355681262, + "grad_norm": 0.9959495287551732, + "learning_rate": 2.281191806331471e-06, + "loss": 10.5662, + "step": 147 + }, + { + "epoch": 0.0068906115417743325, + "grad_norm": 0.9840066153338298, + "learning_rate": 2.2967101179391682e-06, + "loss": 10.6173, + "step": 148 + }, + { + "epoch": 0.006937169727867403, + "grad_norm": 0.9864199556652946, + "learning_rate": 2.3122284295468654e-06, + "loss": 10.6147, + "step": 149 + }, + { + "epoch": 0.006983727913960472, + "grad_norm": 1.0072568642854038, + "learning_rate": 2.3277467411545626e-06, + "loss": 10.5559, + "step": 150 + }, + { + "epoch": 0.007030286100053542, + "grad_norm": 1.0053987833431959, + "learning_rate": 2.3432650527622597e-06, + "loss": 10.516, + "step": 151 + }, + { + "epoch": 0.007076844286146612, + "grad_norm": 0.9889746873911764, + "learning_rate": 2.3587833643699564e-06, + "loss": 10.5815, + "step": 152 + }, + { + "epoch": 0.007123402472239681, + "grad_norm": 0.9940289365927889, + "learning_rate": 2.3743016759776536e-06, + "loss": 10.4911, + "step": 153 + }, + { + "epoch": 0.0071699606583327516, + "grad_norm": 1.0072834570861804, + "learning_rate": 2.3898199875853507e-06, + "loss": 10.5668, + "step": 154 + }, + { + "epoch": 0.007216518844425821, + "grad_norm": 0.9611079452170246, + "learning_rate": 2.405338299193048e-06, + "loss": 10.5689, + "step": 155 + }, + { + "epoch": 0.007263077030518891, + "grad_norm": 0.9922624395693586, + "learning_rate": 2.420856610800745e-06, + "loss": 10.4935, + "step": 156 + }, + { + "epoch": 0.007309635216611961, + "grad_norm": 0.956301845446188, + "learning_rate": 2.4363749224084418e-06, + "loss": 10.4921, + "step": 157 + }, + { + "epoch": 0.00735619340270503, + "grad_norm": 0.9478413893118777, + "learning_rate": 2.451893234016139e-06, + "loss": 10.5328, + "step": 158 + }, + { + "epoch": 0.0074027515887981005, + "grad_norm": 0.9297923950923886, + "learning_rate": 2.4674115456238365e-06, + "loss": 10.5253, + "step": 159 + }, + { + "epoch": 0.007449309774891171, + "grad_norm": 0.9198946296911947, + "learning_rate": 2.4829298572315337e-06, + "loss": 10.4933, + "step": 160 + }, + { + "epoch": 0.00749586796098424, + "grad_norm": 0.9147562168626459, + "learning_rate": 
2.4984481688392304e-06, + "loss": 10.5167, + "step": 161 + }, + { + "epoch": 0.00754242614707731, + "grad_norm": 0.9379570807855013, + "learning_rate": 2.5139664804469276e-06, + "loss": 10.4839, + "step": 162 + }, + { + "epoch": 0.007588984333170379, + "grad_norm": 0.9062103287961867, + "learning_rate": 2.5294847920546247e-06, + "loss": 10.5377, + "step": 163 + }, + { + "epoch": 0.0076355425192634494, + "grad_norm": 0.9269616985826348, + "learning_rate": 2.545003103662322e-06, + "loss": 10.4619, + "step": 164 + }, + { + "epoch": 0.00768210070535652, + "grad_norm": 0.9172979770554238, + "learning_rate": 2.560521415270019e-06, + "loss": 10.5003, + "step": 165 + }, + { + "epoch": 0.007728658891449589, + "grad_norm": 0.9043292076426647, + "learning_rate": 2.5760397268777158e-06, + "loss": 10.4974, + "step": 166 + }, + { + "epoch": 0.007775217077542659, + "grad_norm": 0.9043064241282794, + "learning_rate": 2.591558038485413e-06, + "loss": 10.4825, + "step": 167 + }, + { + "epoch": 0.007821775263635729, + "grad_norm": 0.9113595952658243, + "learning_rate": 2.60707635009311e-06, + "loss": 10.4249, + "step": 168 + }, + { + "epoch": 0.0078683334497288, + "grad_norm": 0.8898040535241469, + "learning_rate": 2.6225946617008072e-06, + "loss": 10.4895, + "step": 169 + }, + { + "epoch": 0.007914891635821868, + "grad_norm": 0.9155598948364991, + "learning_rate": 2.638112973308504e-06, + "loss": 10.4049, + "step": 170 + }, + { + "epoch": 0.007961449821914938, + "grad_norm": 0.9285408024290253, + "learning_rate": 2.653631284916201e-06, + "loss": 10.3781, + "step": 171 + }, + { + "epoch": 0.008008008008008008, + "grad_norm": 0.9044933606932509, + "learning_rate": 2.6691495965238983e-06, + "loss": 10.4185, + "step": 172 + }, + { + "epoch": 0.008054566194101078, + "grad_norm": 0.9296038655578656, + "learning_rate": 2.6846679081315954e-06, + "loss": 10.3779, + "step": 173 + }, + { + "epoch": 0.008101124380194148, + "grad_norm": 0.9409610474356832, + "learning_rate": 2.7001862197392926e-06, + "loss": 10.3511, + "step": 174 + }, + { + "epoch": 0.008147682566287217, + "grad_norm": 0.9062728161394892, + "learning_rate": 2.7157045313469893e-06, + "loss": 10.3676, + "step": 175 + }, + { + "epoch": 0.008194240752380287, + "grad_norm": 0.8845111536557827, + "learning_rate": 2.7312228429546864e-06, + "loss": 10.4267, + "step": 176 + }, + { + "epoch": 0.008240798938473357, + "grad_norm": 0.925091885064968, + "learning_rate": 2.7467411545623836e-06, + "loss": 10.372, + "step": 177 + }, + { + "epoch": 0.008287357124566427, + "grad_norm": 0.9005744352445538, + "learning_rate": 2.7622594661700808e-06, + "loss": 10.4223, + "step": 178 + }, + { + "epoch": 0.008333915310659497, + "grad_norm": 0.9175967399309095, + "learning_rate": 2.777777777777778e-06, + "loss": 10.3775, + "step": 179 + }, + { + "epoch": 0.008380473496752567, + "grad_norm": 0.9293712289114088, + "learning_rate": 2.7932960893854746e-06, + "loss": 10.2978, + "step": 180 + }, + { + "epoch": 0.008427031682845636, + "grad_norm": 0.9068152036291544, + "learning_rate": 2.8088144009931722e-06, + "loss": 10.317, + "step": 181 + }, + { + "epoch": 0.008473589868938706, + "grad_norm": 0.8597732007879302, + "learning_rate": 2.8243327126008694e-06, + "loss": 10.4084, + "step": 182 + }, + { + "epoch": 0.008520148055031776, + "grad_norm": 0.8528821135230492, + "learning_rate": 2.8398510242085665e-06, + "loss": 10.3805, + "step": 183 + }, + { + "epoch": 0.008566706241124846, + "grad_norm": 0.878645181594358, + "learning_rate": 2.8553693358162633e-06, + "loss": 10.3752, + 
"step": 184 + }, + { + "epoch": 0.008613264427217916, + "grad_norm": 0.8697769050926981, + "learning_rate": 2.8708876474239604e-06, + "loss": 10.3628, + "step": 185 + }, + { + "epoch": 0.008659822613310985, + "grad_norm": 0.8942145831959144, + "learning_rate": 2.8864059590316576e-06, + "loss": 10.336, + "step": 186 + }, + { + "epoch": 0.008706380799404055, + "grad_norm": 0.877017533424186, + "learning_rate": 2.9019242706393547e-06, + "loss": 10.2827, + "step": 187 + }, + { + "epoch": 0.008752938985497125, + "grad_norm": 0.8762681054511939, + "learning_rate": 2.917442582247052e-06, + "loss": 10.3459, + "step": 188 + }, + { + "epoch": 0.008799497171590195, + "grad_norm": 0.8907257953638913, + "learning_rate": 2.9329608938547486e-06, + "loss": 10.2516, + "step": 189 + }, + { + "epoch": 0.008846055357683265, + "grad_norm": 0.8953397513932821, + "learning_rate": 2.9484792054624458e-06, + "loss": 10.2535, + "step": 190 + }, + { + "epoch": 0.008892613543776335, + "grad_norm": 0.8748181965934891, + "learning_rate": 2.963997517070143e-06, + "loss": 10.2447, + "step": 191 + }, + { + "epoch": 0.008939171729869404, + "grad_norm": 0.8554886363908146, + "learning_rate": 2.97951582867784e-06, + "loss": 10.3488, + "step": 192 + }, + { + "epoch": 0.008985729915962474, + "grad_norm": 0.8935203185091557, + "learning_rate": 2.9950341402855372e-06, + "loss": 10.2154, + "step": 193 + }, + { + "epoch": 0.009032288102055544, + "grad_norm": 0.8391645818528363, + "learning_rate": 3.010552451893234e-06, + "loss": 10.3347, + "step": 194 + }, + { + "epoch": 0.009078846288148614, + "grad_norm": 0.8894433101186009, + "learning_rate": 3.026070763500931e-06, + "loss": 10.1949, + "step": 195 + }, + { + "epoch": 0.009125404474241684, + "grad_norm": 0.8772484822547878, + "learning_rate": 3.0415890751086283e-06, + "loss": 10.2422, + "step": 196 + }, + { + "epoch": 0.009171962660334753, + "grad_norm": 0.8588979675051513, + "learning_rate": 3.0571073867163254e-06, + "loss": 10.261, + "step": 197 + }, + { + "epoch": 0.009218520846427823, + "grad_norm": 0.8547841201841134, + "learning_rate": 3.0726256983240226e-06, + "loss": 10.2721, + "step": 198 + }, + { + "epoch": 0.009265079032520893, + "grad_norm": 0.8466149661810174, + "learning_rate": 3.0881440099317193e-06, + "loss": 10.2363, + "step": 199 + }, + { + "epoch": 0.009311637218613963, + "grad_norm": 0.867131674512745, + "learning_rate": 3.1036623215394165e-06, + "loss": 10.2413, + "step": 200 + }, + { + "epoch": 0.009358195404707033, + "grad_norm": 0.8848374308173081, + "learning_rate": 3.1191806331471136e-06, + "loss": 10.1955, + "step": 201 + }, + { + "epoch": 0.009404753590800102, + "grad_norm": 0.8560456447531867, + "learning_rate": 3.134698944754811e-06, + "loss": 10.2354, + "step": 202 + }, + { + "epoch": 0.009451311776893172, + "grad_norm": 0.8659250299143588, + "learning_rate": 3.150217256362508e-06, + "loss": 10.188, + "step": 203 + }, + { + "epoch": 0.009497869962986242, + "grad_norm": 0.8461325770531928, + "learning_rate": 3.165735567970205e-06, + "loss": 10.1449, + "step": 204 + }, + { + "epoch": 0.009544428149079312, + "grad_norm": 0.8098329221582864, + "learning_rate": 3.1812538795779022e-06, + "loss": 10.239, + "step": 205 + }, + { + "epoch": 0.009590986335172382, + "grad_norm": 0.8790234507532462, + "learning_rate": 3.1967721911855994e-06, + "loss": 10.1574, + "step": 206 + }, + { + "epoch": 0.009637544521265452, + "grad_norm": 0.823909730954989, + "learning_rate": 3.212290502793296e-06, + "loss": 10.2502, + "step": 207 + }, + { + "epoch": 0.00968410270735852, 
+ "grad_norm": 0.8346611978187417, + "learning_rate": 3.2278088144009933e-06, + "loss": 10.1905, + "step": 208 + }, + { + "epoch": 0.00973066089345159, + "grad_norm": 0.8604764809758039, + "learning_rate": 3.2433271260086904e-06, + "loss": 10.1861, + "step": 209 + }, + { + "epoch": 0.009777219079544661, + "grad_norm": 0.8619332584394843, + "learning_rate": 3.2588454376163876e-06, + "loss": 10.0859, + "step": 210 + }, + { + "epoch": 0.009823777265637731, + "grad_norm": 0.8648614025677002, + "learning_rate": 3.2743637492240847e-06, + "loss": 10.1356, + "step": 211 + }, + { + "epoch": 0.009870335451730801, + "grad_norm": 0.8540023496196106, + "learning_rate": 3.2898820608317815e-06, + "loss": 10.1182, + "step": 212 + }, + { + "epoch": 0.00991689363782387, + "grad_norm": 0.8174171985889019, + "learning_rate": 3.3054003724394786e-06, + "loss": 10.1658, + "step": 213 + }, + { + "epoch": 0.00996345182391694, + "grad_norm": 0.8371313194340357, + "learning_rate": 3.3209186840471758e-06, + "loss": 10.1863, + "step": 214 + }, + { + "epoch": 0.01001001001001001, + "grad_norm": 0.783213545796884, + "learning_rate": 3.336436995654873e-06, + "loss": 10.1774, + "step": 215 + }, + { + "epoch": 0.01005656819610308, + "grad_norm": 0.8555824474537045, + "learning_rate": 3.35195530726257e-06, + "loss": 10.1736, + "step": 216 + }, + { + "epoch": 0.01010312638219615, + "grad_norm": 0.8145783448671329, + "learning_rate": 3.367473618870267e-06, + "loss": 10.1448, + "step": 217 + }, + { + "epoch": 0.010149684568289219, + "grad_norm": 0.8721081567038838, + "learning_rate": 3.382991930477964e-06, + "loss": 10.0884, + "step": 218 + }, + { + "epoch": 0.010196242754382289, + "grad_norm": 0.7890349415139494, + "learning_rate": 3.398510242085661e-06, + "loss": 10.2292, + "step": 219 + }, + { + "epoch": 0.010242800940475359, + "grad_norm": 0.8756151082473433, + "learning_rate": 3.4140285536933583e-06, + "loss": 10.0227, + "step": 220 + }, + { + "epoch": 0.010289359126568429, + "grad_norm": 0.8219781930410619, + "learning_rate": 3.4295468653010554e-06, + "loss": 10.1816, + "step": 221 + }, + { + "epoch": 0.0103359173126615, + "grad_norm": 0.8088827361367288, + "learning_rate": 3.445065176908752e-06, + "loss": 10.0765, + "step": 222 + }, + { + "epoch": 0.01038247549875457, + "grad_norm": 0.8119792412872731, + "learning_rate": 3.4605834885164493e-06, + "loss": 10.1224, + "step": 223 + }, + { + "epoch": 0.010429033684847638, + "grad_norm": 0.8145204174008123, + "learning_rate": 3.4761018001241465e-06, + "loss": 10.0567, + "step": 224 + }, + { + "epoch": 0.010475591870940708, + "grad_norm": 0.8177544286124119, + "learning_rate": 3.4916201117318436e-06, + "loss": 10.1129, + "step": 225 + }, + { + "epoch": 0.010522150057033778, + "grad_norm": 0.8077741535293547, + "learning_rate": 3.5071384233395408e-06, + "loss": 10.0515, + "step": 226 + }, + { + "epoch": 0.010568708243126848, + "grad_norm": 0.8315710996647879, + "learning_rate": 3.5226567349472375e-06, + "loss": 10.1029, + "step": 227 + }, + { + "epoch": 0.010615266429219918, + "grad_norm": 0.813875213375446, + "learning_rate": 3.5381750465549347e-06, + "loss": 10.0625, + "step": 228 + }, + { + "epoch": 0.010661824615312987, + "grad_norm": 0.8491415984917048, + "learning_rate": 3.553693358162632e-06, + "loss": 10.0106, + "step": 229 + }, + { + "epoch": 0.010708382801406057, + "grad_norm": 0.7783493958942653, + "learning_rate": 3.569211669770329e-06, + "loss": 10.1222, + "step": 230 + }, + { + "epoch": 0.010754940987499127, + "grad_norm": 0.8290341322813608, + "learning_rate": 
3.584729981378026e-06, + "loss": 10.0758, + "step": 231 + }, + { + "epoch": 0.010801499173592197, + "grad_norm": 0.7817142211946463, + "learning_rate": 3.600248292985723e-06, + "loss": 10.1339, + "step": 232 + }, + { + "epoch": 0.010848057359685267, + "grad_norm": 0.8156996946855882, + "learning_rate": 3.61576660459342e-06, + "loss": 10.0356, + "step": 233 + }, + { + "epoch": 0.010894615545778337, + "grad_norm": 0.7968740433935626, + "learning_rate": 3.631284916201118e-06, + "loss": 10.073, + "step": 234 + }, + { + "epoch": 0.010941173731871406, + "grad_norm": 0.7833651146667605, + "learning_rate": 3.6468032278088147e-06, + "loss": 10.0315, + "step": 235 + }, + { + "epoch": 0.010987731917964476, + "grad_norm": 0.8054081701752892, + "learning_rate": 3.662321539416512e-06, + "loss": 10.0652, + "step": 236 + }, + { + "epoch": 0.011034290104057546, + "grad_norm": 0.793408776305064, + "learning_rate": 3.677839851024209e-06, + "loss": 10.069, + "step": 237 + }, + { + "epoch": 0.011080848290150616, + "grad_norm": 0.7738981828827082, + "learning_rate": 3.693358162631906e-06, + "loss": 10.0736, + "step": 238 + }, + { + "epoch": 0.011127406476243686, + "grad_norm": 0.8126282620607647, + "learning_rate": 3.7088764742396034e-06, + "loss": 9.9771, + "step": 239 + }, + { + "epoch": 0.011173964662336755, + "grad_norm": 0.7984730753017674, + "learning_rate": 3.7243947858473e-06, + "loss": 10.0421, + "step": 240 + }, + { + "epoch": 0.011220522848429825, + "grad_norm": 0.8006780444897709, + "learning_rate": 3.7399130974549972e-06, + "loss": 9.9695, + "step": 241 + }, + { + "epoch": 0.011267081034522895, + "grad_norm": 0.7816254940763119, + "learning_rate": 3.7554314090626944e-06, + "loss": 10.0221, + "step": 242 + }, + { + "epoch": 0.011313639220615965, + "grad_norm": 0.7928243507738107, + "learning_rate": 3.7709497206703915e-06, + "loss": 10.0417, + "step": 243 + }, + { + "epoch": 0.011360197406709035, + "grad_norm": 0.8100229198750354, + "learning_rate": 3.7864680322780887e-06, + "loss": 9.995, + "step": 244 + }, + { + "epoch": 0.011406755592802104, + "grad_norm": 0.7573323360393243, + "learning_rate": 3.8019863438857854e-06, + "loss": 10.0064, + "step": 245 + }, + { + "epoch": 0.011453313778895174, + "grad_norm": 0.8098782284914151, + "learning_rate": 3.817504655493483e-06, + "loss": 10.0094, + "step": 246 + }, + { + "epoch": 0.011499871964988244, + "grad_norm": 0.8224985456209083, + "learning_rate": 3.833022967101179e-06, + "loss": 9.9627, + "step": 247 + }, + { + "epoch": 0.011546430151081314, + "grad_norm": 0.8025551785297478, + "learning_rate": 3.8485412787088765e-06, + "loss": 10.0122, + "step": 248 + }, + { + "epoch": 0.011592988337174384, + "grad_norm": 0.8403906580235022, + "learning_rate": 3.864059590316574e-06, + "loss": 9.8943, + "step": 249 + }, + { + "epoch": 0.011639546523267454, + "grad_norm": 0.832882622384281, + "learning_rate": 3.879577901924271e-06, + "loss": 9.8665, + "step": 250 + }, + { + "epoch": 0.011686104709360523, + "grad_norm": 0.7758060446670616, + "learning_rate": 3.895096213531968e-06, + "loss": 9.9875, + "step": 251 + }, + { + "epoch": 0.011732662895453593, + "grad_norm": 0.7603783249020577, + "learning_rate": 3.910614525139665e-06, + "loss": 9.9764, + "step": 252 + }, + { + "epoch": 0.011779221081546663, + "grad_norm": 0.7621822018854629, + "learning_rate": 3.926132836747362e-06, + "loss": 9.998, + "step": 253 + }, + { + "epoch": 0.011825779267639733, + "grad_norm": 0.8235664078697521, + "learning_rate": 3.941651148355059e-06, + "loss": 9.8858, + "step": 254 + }, + { + 
"epoch": 0.011872337453732803, + "grad_norm": 0.7879012139278242, + "learning_rate": 3.9571694599627566e-06, + "loss": 9.8362, + "step": 255 + }, + { + "epoch": 0.011918895639825872, + "grad_norm": 0.776605405275299, + "learning_rate": 3.972687771570454e-06, + "loss": 9.8962, + "step": 256 + }, + { + "epoch": 0.011965453825918942, + "grad_norm": 0.7881545351597435, + "learning_rate": 3.98820608317815e-06, + "loss": 9.8473, + "step": 257 + }, + { + "epoch": 0.012012012012012012, + "grad_norm": 0.7522860752027696, + "learning_rate": 4.003724394785847e-06, + "loss": 9.919, + "step": 258 + }, + { + "epoch": 0.012058570198105082, + "grad_norm": 0.7773257860615893, + "learning_rate": 4.019242706393544e-06, + "loss": 9.8861, + "step": 259 + }, + { + "epoch": 0.012105128384198152, + "grad_norm": 0.7705323984798782, + "learning_rate": 4.0347610180012415e-06, + "loss": 9.9217, + "step": 260 + }, + { + "epoch": 0.012151686570291222, + "grad_norm": 0.7864140264417167, + "learning_rate": 4.050279329608939e-06, + "loss": 9.8979, + "step": 261 + }, + { + "epoch": 0.01219824475638429, + "grad_norm": 0.7506223826107817, + "learning_rate": 4.065797641216636e-06, + "loss": 9.9149, + "step": 262 + }, + { + "epoch": 0.01224480294247736, + "grad_norm": 0.7852688439119851, + "learning_rate": 4.081315952824333e-06, + "loss": 9.8459, + "step": 263 + }, + { + "epoch": 0.012291361128570431, + "grad_norm": 0.7507479460528043, + "learning_rate": 4.09683426443203e-06, + "loss": 9.9417, + "step": 264 + }, + { + "epoch": 0.012337919314663501, + "grad_norm": 0.8120512129530695, + "learning_rate": 4.112352576039727e-06, + "loss": 9.8186, + "step": 265 + }, + { + "epoch": 0.012384477500756571, + "grad_norm": 0.7882308086983317, + "learning_rate": 4.1278708876474236e-06, + "loss": 9.8199, + "step": 266 + }, + { + "epoch": 0.01243103568684964, + "grad_norm": 0.7899172886608683, + "learning_rate": 4.143389199255121e-06, + "loss": 9.8622, + "step": 267 + }, + { + "epoch": 0.01247759387294271, + "grad_norm": 0.7666453404819819, + "learning_rate": 4.158907510862818e-06, + "loss": 9.858, + "step": 268 + }, + { + "epoch": 0.01252415205903578, + "grad_norm": 0.7420500535557069, + "learning_rate": 4.174425822470515e-06, + "loss": 9.8918, + "step": 269 + }, + { + "epoch": 0.01257071024512885, + "grad_norm": 0.8050110916969554, + "learning_rate": 4.189944134078212e-06, + "loss": 9.842, + "step": 270 + }, + { + "epoch": 0.01261726843122192, + "grad_norm": 0.7705996561989802, + "learning_rate": 4.205462445685909e-06, + "loss": 9.7937, + "step": 271 + }, + { + "epoch": 0.012663826617314989, + "grad_norm": 0.7978602272756855, + "learning_rate": 4.2209807572936065e-06, + "loss": 9.8292, + "step": 272 + }, + { + "epoch": 0.012710384803408059, + "grad_norm": 0.7642634749224271, + "learning_rate": 4.236499068901304e-06, + "loss": 9.7982, + "step": 273 + }, + { + "epoch": 0.012756942989501129, + "grad_norm": 0.7941028364615562, + "learning_rate": 4.252017380509001e-06, + "loss": 9.8177, + "step": 274 + }, + { + "epoch": 0.012803501175594199, + "grad_norm": 0.7532797238922417, + "learning_rate": 4.267535692116698e-06, + "loss": 9.8112, + "step": 275 + }, + { + "epoch": 0.01285005936168727, + "grad_norm": 0.7543344498700375, + "learning_rate": 4.283054003724395e-06, + "loss": 9.8365, + "step": 276 + }, + { + "epoch": 0.01289661754778034, + "grad_norm": 0.7763487672270293, + "learning_rate": 4.298572315332092e-06, + "loss": 9.793, + "step": 277 + }, + { + "epoch": 0.012943175733873408, + "grad_norm": 0.7666162516305549, + "learning_rate": 
4.314090626939789e-06, + "loss": 9.7574, + "step": 278 + }, + { + "epoch": 0.012989733919966478, + "grad_norm": 0.7727690944853363, + "learning_rate": 4.3296089385474866e-06, + "loss": 9.7551, + "step": 279 + }, + { + "epoch": 0.013036292106059548, + "grad_norm": 0.7641842932271968, + "learning_rate": 4.345127250155184e-06, + "loss": 9.7287, + "step": 280 + }, + { + "epoch": 0.013082850292152618, + "grad_norm": 0.7520100245003488, + "learning_rate": 4.360645561762881e-06, + "loss": 9.7808, + "step": 281 + }, + { + "epoch": 0.013129408478245688, + "grad_norm": 0.7565896798703039, + "learning_rate": 4.376163873370578e-06, + "loss": 9.7718, + "step": 282 + }, + { + "epoch": 0.013175966664338757, + "grad_norm": 0.7287622367883638, + "learning_rate": 4.391682184978275e-06, + "loss": 9.8254, + "step": 283 + }, + { + "epoch": 0.013222524850431827, + "grad_norm": 0.7371721584543278, + "learning_rate": 4.4072004965859715e-06, + "loss": 9.7775, + "step": 284 + }, + { + "epoch": 0.013269083036524897, + "grad_norm": 0.7331597092074453, + "learning_rate": 4.422718808193669e-06, + "loss": 9.7598, + "step": 285 + }, + { + "epoch": 0.013315641222617967, + "grad_norm": 0.7544358972740158, + "learning_rate": 4.438237119801366e-06, + "loss": 9.6754, + "step": 286 + }, + { + "epoch": 0.013362199408711037, + "grad_norm": 0.7753353789988036, + "learning_rate": 4.453755431409063e-06, + "loss": 9.6755, + "step": 287 + }, + { + "epoch": 0.013408757594804106, + "grad_norm": 0.7500489242600129, + "learning_rate": 4.46927374301676e-06, + "loss": 9.7375, + "step": 288 + }, + { + "epoch": 0.013455315780897176, + "grad_norm": 0.7330759282935106, + "learning_rate": 4.484792054624457e-06, + "loss": 9.7789, + "step": 289 + }, + { + "epoch": 0.013501873966990246, + "grad_norm": 0.7359592111376084, + "learning_rate": 4.500310366232154e-06, + "loss": 9.7563, + "step": 290 + }, + { + "epoch": 0.013548432153083316, + "grad_norm": 0.7228651481119513, + "learning_rate": 4.5158286778398516e-06, + "loss": 9.7536, + "step": 291 + }, + { + "epoch": 0.013594990339176386, + "grad_norm": 0.7488441898222892, + "learning_rate": 4.531346989447549e-06, + "loss": 9.6831, + "step": 292 + }, + { + "epoch": 0.013641548525269456, + "grad_norm": 0.7721106087672064, + "learning_rate": 4.546865301055246e-06, + "loss": 9.7358, + "step": 293 + }, + { + "epoch": 0.013688106711362525, + "grad_norm": 0.760961033099896, + "learning_rate": 4.562383612662942e-06, + "loss": 9.5655, + "step": 294 + }, + { + "epoch": 0.013734664897455595, + "grad_norm": 0.7593099690548725, + "learning_rate": 4.577901924270639e-06, + "loss": 9.7084, + "step": 295 + }, + { + "epoch": 0.013781223083548665, + "grad_norm": 0.721563866310033, + "learning_rate": 4.5934202358783365e-06, + "loss": 9.6894, + "step": 296 + }, + { + "epoch": 0.013827781269641735, + "grad_norm": 0.698056905181288, + "learning_rate": 4.608938547486034e-06, + "loss": 9.775, + "step": 297 + }, + { + "epoch": 0.013874339455734805, + "grad_norm": 0.7344677045463243, + "learning_rate": 4.624456859093731e-06, + "loss": 9.6711, + "step": 298 + }, + { + "epoch": 0.013920897641827874, + "grad_norm": 0.7064956534836647, + "learning_rate": 4.639975170701428e-06, + "loss": 9.6434, + "step": 299 + }, + { + "epoch": 0.013967455827920944, + "grad_norm": 0.7254113077787065, + "learning_rate": 4.655493482309125e-06, + "loss": 9.6915, + "step": 300 + }, + { + "epoch": 0.014014014014014014, + "grad_norm": 0.7744173877062213, + "learning_rate": 4.671011793916822e-06, + "loss": 9.5764, + "step": 301 + }, + { + "epoch": 
0.014060572200107084, + "grad_norm": 0.7451444244141707, + "learning_rate": 4.686530105524519e-06, + "loss": 9.6247, + "step": 302 + }, + { + "epoch": 0.014107130386200154, + "grad_norm": 0.7766989157616826, + "learning_rate": 4.702048417132216e-06, + "loss": 9.6769, + "step": 303 + }, + { + "epoch": 0.014153688572293224, + "grad_norm": 0.7532617330293945, + "learning_rate": 4.717566728739913e-06, + "loss": 9.6337, + "step": 304 + }, + { + "epoch": 0.014200246758386293, + "grad_norm": 0.7039302997858976, + "learning_rate": 4.73308504034761e-06, + "loss": 9.6314, + "step": 305 + }, + { + "epoch": 0.014246804944479363, + "grad_norm": 0.7182566863357065, + "learning_rate": 4.748603351955307e-06, + "loss": 9.6259, + "step": 306 + }, + { + "epoch": 0.014293363130572433, + "grad_norm": 0.6756750675935356, + "learning_rate": 4.764121663563004e-06, + "loss": 9.678, + "step": 307 + }, + { + "epoch": 0.014339921316665503, + "grad_norm": 0.7036961461903851, + "learning_rate": 4.7796399751707015e-06, + "loss": 9.6393, + "step": 308 + }, + { + "epoch": 0.014386479502758573, + "grad_norm": 0.6947977818559564, + "learning_rate": 4.795158286778399e-06, + "loss": 9.6168, + "step": 309 + }, + { + "epoch": 0.014433037688851642, + "grad_norm": 0.7250766534694751, + "learning_rate": 4.810676598386096e-06, + "loss": 9.5395, + "step": 310 + }, + { + "epoch": 0.014479595874944712, + "grad_norm": 0.689950755019036, + "learning_rate": 4.826194909993793e-06, + "loss": 9.6897, + "step": 311 + }, + { + "epoch": 0.014526154061037782, + "grad_norm": 0.7131734878467589, + "learning_rate": 4.84171322160149e-06, + "loss": 9.537, + "step": 312 + }, + { + "epoch": 0.014572712247130852, + "grad_norm": 0.7094958930806955, + "learning_rate": 4.857231533209186e-06, + "loss": 9.6447, + "step": 313 + }, + { + "epoch": 0.014619270433223922, + "grad_norm": 0.7012291304051442, + "learning_rate": 4.8727498448168836e-06, + "loss": 9.6005, + "step": 314 + }, + { + "epoch": 0.01466582861931699, + "grad_norm": 0.7447376686645472, + "learning_rate": 4.888268156424581e-06, + "loss": 9.5368, + "step": 315 + }, + { + "epoch": 0.01471238680541006, + "grad_norm": 0.710469099500382, + "learning_rate": 4.903786468032278e-06, + "loss": 9.6117, + "step": 316 + }, + { + "epoch": 0.014758944991503131, + "grad_norm": 0.7188627499341534, + "learning_rate": 4.919304779639975e-06, + "loss": 9.5129, + "step": 317 + }, + { + "epoch": 0.014805503177596201, + "grad_norm": 0.6905706364558549, + "learning_rate": 4.934823091247673e-06, + "loss": 9.5642, + "step": 318 + }, + { + "epoch": 0.014852061363689271, + "grad_norm": 0.6957686280229269, + "learning_rate": 4.95034140285537e-06, + "loss": 9.5885, + "step": 319 + }, + { + "epoch": 0.014898619549782341, + "grad_norm": 0.6801422777651734, + "learning_rate": 4.965859714463067e-06, + "loss": 9.5665, + "step": 320 + }, + { + "epoch": 0.01494517773587541, + "grad_norm": 0.6923889585647361, + "learning_rate": 4.981378026070764e-06, + "loss": 9.4988, + "step": 321 + }, + { + "epoch": 0.01499173592196848, + "grad_norm": 0.7050348486052702, + "learning_rate": 4.996896337678461e-06, + "loss": 9.5306, + "step": 322 + }, + { + "epoch": 0.01503829410806155, + "grad_norm": 0.7003694114310757, + "learning_rate": 5.012414649286158e-06, + "loss": 9.5395, + "step": 323 + }, + { + "epoch": 0.01508485229415462, + "grad_norm": 0.6947515052017332, + "learning_rate": 5.027932960893855e-06, + "loss": 9.5417, + "step": 324 + }, + { + "epoch": 0.01513141048024769, + "grad_norm": 0.7079515384716829, + "learning_rate": 
5.043451272501552e-06, + "loss": 9.4791, + "step": 325 + }, + { + "epoch": 0.015177968666340759, + "grad_norm": 0.6992075602571445, + "learning_rate": 5.0589695841092494e-06, + "loss": 9.5795, + "step": 326 + }, + { + "epoch": 0.015224526852433829, + "grad_norm": 0.6979486698618595, + "learning_rate": 5.074487895716947e-06, + "loss": 9.5463, + "step": 327 + }, + { + "epoch": 0.015271085038526899, + "grad_norm": 0.6948080275260687, + "learning_rate": 5.090006207324644e-06, + "loss": 9.5505, + "step": 328 + }, + { + "epoch": 0.015317643224619969, + "grad_norm": 0.6830799917884035, + "learning_rate": 5.105524518932341e-06, + "loss": 9.5207, + "step": 329 + }, + { + "epoch": 0.01536420141071304, + "grad_norm": 0.6804595982164465, + "learning_rate": 5.121042830540038e-06, + "loss": 9.4955, + "step": 330 + }, + { + "epoch": 0.015410759596806108, + "grad_norm": 0.7065069094918891, + "learning_rate": 5.136561142147734e-06, + "loss": 9.5255, + "step": 331 + }, + { + "epoch": 0.015457317782899178, + "grad_norm": 0.6918011309494667, + "learning_rate": 5.1520794537554315e-06, + "loss": 9.397, + "step": 332 + }, + { + "epoch": 0.015503875968992248, + "grad_norm": 0.7054349058572548, + "learning_rate": 5.167597765363129e-06, + "loss": 9.4663, + "step": 333 + }, + { + "epoch": 0.015550434155085318, + "grad_norm": 0.661817414171618, + "learning_rate": 5.183116076970826e-06, + "loss": 9.5459, + "step": 334 + }, + { + "epoch": 0.015596992341178388, + "grad_norm": 0.6980605182415274, + "learning_rate": 5.198634388578523e-06, + "loss": 9.4133, + "step": 335 + }, + { + "epoch": 0.015643550527271458, + "grad_norm": 0.693412798252813, + "learning_rate": 5.21415270018622e-06, + "loss": 9.355, + "step": 336 + }, + { + "epoch": 0.01569010871336453, + "grad_norm": 0.6970086403494994, + "learning_rate": 5.229671011793917e-06, + "loss": 9.4508, + "step": 337 + }, + { + "epoch": 0.0157366668994576, + "grad_norm": 0.6714249609026101, + "learning_rate": 5.2451893234016144e-06, + "loss": 9.4857, + "step": 338 + }, + { + "epoch": 0.01578322508555067, + "grad_norm": 0.6724477483738567, + "learning_rate": 5.260707635009312e-06, + "loss": 9.3579, + "step": 339 + }, + { + "epoch": 0.015829783271643735, + "grad_norm": 0.7016582945839679, + "learning_rate": 5.276225946617008e-06, + "loss": 9.2865, + "step": 340 + }, + { + "epoch": 0.015876341457736805, + "grad_norm": 0.683884164666531, + "learning_rate": 5.291744258224705e-06, + "loss": 9.3695, + "step": 341 + }, + { + "epoch": 0.015922899643829876, + "grad_norm": 0.6706677374123015, + "learning_rate": 5.307262569832402e-06, + "loss": 9.3522, + "step": 342 + }, + { + "epoch": 0.015969457829922946, + "grad_norm": 0.6979656209725624, + "learning_rate": 5.322780881440099e-06, + "loss": 9.3425, + "step": 343 + }, + { + "epoch": 0.016016016016016016, + "grad_norm": 0.6534556404786572, + "learning_rate": 5.3382991930477965e-06, + "loss": 9.4577, + "step": 344 + }, + { + "epoch": 0.016062574202109086, + "grad_norm": 0.6730561543030591, + "learning_rate": 5.353817504655494e-06, + "loss": 9.3715, + "step": 345 + }, + { + "epoch": 0.016109132388202156, + "grad_norm": 0.6724339335454756, + "learning_rate": 5.369335816263191e-06, + "loss": 9.3743, + "step": 346 + }, + { + "epoch": 0.016155690574295226, + "grad_norm": 0.6709193532711722, + "learning_rate": 5.384854127870888e-06, + "loss": 9.4153, + "step": 347 + }, + { + "epoch": 0.016202248760388296, + "grad_norm": 0.6849223817419793, + "learning_rate": 5.400372439478585e-06, + "loss": 9.3253, + "step": 348 + }, + { + "epoch": 
0.016248806946481367, + "grad_norm": 0.6679927626158595, + "learning_rate": 5.415890751086282e-06, + "loss": 9.3853, + "step": 349 + }, + { + "epoch": 0.016295365132574433, + "grad_norm": 0.6855171838290249, + "learning_rate": 5.431409062693979e-06, + "loss": 9.3454, + "step": 350 + }, + { + "epoch": 0.016341923318667503, + "grad_norm": 0.6700939560227821, + "learning_rate": 5.446927374301676e-06, + "loss": 9.3475, + "step": 351 + }, + { + "epoch": 0.016388481504760573, + "grad_norm": 0.6677155729156595, + "learning_rate": 5.462445685909373e-06, + "loss": 9.3855, + "step": 352 + }, + { + "epoch": 0.016435039690853644, + "grad_norm": 0.6644976422466717, + "learning_rate": 5.47796399751707e-06, + "loss": 9.3447, + "step": 353 + }, + { + "epoch": 0.016481597876946714, + "grad_norm": 0.6677933362778764, + "learning_rate": 5.493482309124767e-06, + "loss": 9.2942, + "step": 354 + }, + { + "epoch": 0.016528156063039784, + "grad_norm": 0.6703718186277525, + "learning_rate": 5.509000620732464e-06, + "loss": 9.3474, + "step": 355 + }, + { + "epoch": 0.016574714249132854, + "grad_norm": 0.6393370177230121, + "learning_rate": 5.5245189323401615e-06, + "loss": 9.3675, + "step": 356 + }, + { + "epoch": 0.016621272435225924, + "grad_norm": 0.6501102752592777, + "learning_rate": 5.540037243947859e-06, + "loss": 9.3563, + "step": 357 + }, + { + "epoch": 0.016667830621318994, + "grad_norm": 0.6601525224286019, + "learning_rate": 5.555555555555556e-06, + "loss": 9.2648, + "step": 358 + }, + { + "epoch": 0.016714388807412064, + "grad_norm": 0.6345841458199475, + "learning_rate": 5.571073867163253e-06, + "loss": 9.3054, + "step": 359 + }, + { + "epoch": 0.016760946993505135, + "grad_norm": 0.6388317571404779, + "learning_rate": 5.586592178770949e-06, + "loss": 9.2915, + "step": 360 + }, + { + "epoch": 0.0168075051795982, + "grad_norm": 0.6428319660070392, + "learning_rate": 5.602110490378647e-06, + "loss": 9.2349, + "step": 361 + }, + { + "epoch": 0.01685406336569127, + "grad_norm": 0.6285572961325145, + "learning_rate": 5.6176288019863444e-06, + "loss": 9.3371, + "step": 362 + }, + { + "epoch": 0.01690062155178434, + "grad_norm": 0.6421362884174131, + "learning_rate": 5.633147113594042e-06, + "loss": 9.2744, + "step": 363 + }, + { + "epoch": 0.01694717973787741, + "grad_norm": 0.6590063921349176, + "learning_rate": 5.648665425201739e-06, + "loss": 9.2803, + "step": 364 + }, + { + "epoch": 0.016993737923970482, + "grad_norm": 0.6171936373704363, + "learning_rate": 5.664183736809436e-06, + "loss": 9.3216, + "step": 365 + }, + { + "epoch": 0.017040296110063552, + "grad_norm": 0.630873318128838, + "learning_rate": 5.679702048417133e-06, + "loss": 9.3507, + "step": 366 + }, + { + "epoch": 0.017086854296156622, + "grad_norm": 0.6494592831353134, + "learning_rate": 5.69522036002483e-06, + "loss": 9.2776, + "step": 367 + }, + { + "epoch": 0.017133412482249692, + "grad_norm": 0.64888572034199, + "learning_rate": 5.7107386716325265e-06, + "loss": 9.2714, + "step": 368 + }, + { + "epoch": 0.017179970668342762, + "grad_norm": 0.6714612651244516, + "learning_rate": 5.726256983240224e-06, + "loss": 9.2153, + "step": 369 + }, + { + "epoch": 0.017226528854435832, + "grad_norm": 0.6230806717329518, + "learning_rate": 5.741775294847921e-06, + "loss": 9.2078, + "step": 370 + }, + { + "epoch": 0.017273087040528903, + "grad_norm": 0.6346751275871169, + "learning_rate": 5.757293606455618e-06, + "loss": 9.1781, + "step": 371 + }, + { + "epoch": 0.01731964522662197, + "grad_norm": 0.6448894447001716, + "learning_rate": 
5.772811918063315e-06, + "loss": 9.1718, + "step": 372 + }, + { + "epoch": 0.01736620341271504, + "grad_norm": 0.629173309827016, + "learning_rate": 5.788330229671012e-06, + "loss": 9.2946, + "step": 373 + }, + { + "epoch": 0.01741276159880811, + "grad_norm": 0.631996502103521, + "learning_rate": 5.8038485412787094e-06, + "loss": 9.2235, + "step": 374 + }, + { + "epoch": 0.01745931978490118, + "grad_norm": 0.6598524322636844, + "learning_rate": 5.819366852886407e-06, + "loss": 9.168, + "step": 375 + }, + { + "epoch": 0.01750587797099425, + "grad_norm": 0.6541288141729082, + "learning_rate": 5.834885164494104e-06, + "loss": 9.1271, + "step": 376 + }, + { + "epoch": 0.01755243615708732, + "grad_norm": 0.6336512682968927, + "learning_rate": 5.8504034761018e-06, + "loss": 9.1892, + "step": 377 + }, + { + "epoch": 0.01759899434318039, + "grad_norm": 0.6314030505756737, + "learning_rate": 5.865921787709497e-06, + "loss": 9.2259, + "step": 378 + }, + { + "epoch": 0.01764555252927346, + "grad_norm": 0.6360420052958742, + "learning_rate": 5.881440099317194e-06, + "loss": 9.2371, + "step": 379 + }, + { + "epoch": 0.01769211071536653, + "grad_norm": 0.6349438378828433, + "learning_rate": 5.8969584109248915e-06, + "loss": 9.2179, + "step": 380 + }, + { + "epoch": 0.0177386689014596, + "grad_norm": 0.633665373814018, + "learning_rate": 5.912476722532589e-06, + "loss": 9.2134, + "step": 381 + }, + { + "epoch": 0.01778522708755267, + "grad_norm": 0.6603081081288801, + "learning_rate": 5.927995034140286e-06, + "loss": 9.1124, + "step": 382 + }, + { + "epoch": 0.017831785273645737, + "grad_norm": 0.5929642187779179, + "learning_rate": 5.943513345747983e-06, + "loss": 9.2189, + "step": 383 + }, + { + "epoch": 0.017878343459738807, + "grad_norm": 0.6272964983357461, + "learning_rate": 5.95903165735568e-06, + "loss": 9.1684, + "step": 384 + }, + { + "epoch": 0.017924901645831878, + "grad_norm": 0.6153759875875562, + "learning_rate": 5.974549968963377e-06, + "loss": 9.1808, + "step": 385 + }, + { + "epoch": 0.017971459831924948, + "grad_norm": 0.621869488432977, + "learning_rate": 5.9900682805710744e-06, + "loss": 9.1143, + "step": 386 + }, + { + "epoch": 0.018018018018018018, + "grad_norm": 0.6068277676554784, + "learning_rate": 6.005586592178771e-06, + "loss": 9.1013, + "step": 387 + }, + { + "epoch": 0.018064576204111088, + "grad_norm": 0.6545479589205443, + "learning_rate": 6.021104903786468e-06, + "loss": 9.0098, + "step": 388 + }, + { + "epoch": 0.018111134390204158, + "grad_norm": 0.6354193220810118, + "learning_rate": 6.036623215394165e-06, + "loss": 9.1716, + "step": 389 + }, + { + "epoch": 0.01815769257629723, + "grad_norm": 0.6058659796240694, + "learning_rate": 6.052141527001862e-06, + "loss": 9.1744, + "step": 390 + }, + { + "epoch": 0.0182042507623903, + "grad_norm": 0.6469274103168096, + "learning_rate": 6.067659838609559e-06, + "loss": 9.0995, + "step": 391 + }, + { + "epoch": 0.01825080894848337, + "grad_norm": 0.6034982376415466, + "learning_rate": 6.0831781502172565e-06, + "loss": 9.1128, + "step": 392 + }, + { + "epoch": 0.018297367134576435, + "grad_norm": 0.5938117361427198, + "learning_rate": 6.098696461824954e-06, + "loss": 9.1315, + "step": 393 + }, + { + "epoch": 0.018343925320669505, + "grad_norm": 0.5985872637688537, + "learning_rate": 6.114214773432651e-06, + "loss": 9.0153, + "step": 394 + }, + { + "epoch": 0.018390483506762575, + "grad_norm": 0.5909598636604557, + "learning_rate": 6.129733085040348e-06, + "loss": 9.1663, + "step": 395 + }, + { + "epoch": 0.018437041692855646, + 
"grad_norm": 0.5888578949469697, + "learning_rate": 6.145251396648045e-06, + "loss": 9.0824, + "step": 396 + }, + { + "epoch": 0.018483599878948716, + "grad_norm": 0.6255155464658297, + "learning_rate": 6.1607697082557415e-06, + "loss": 8.9432, + "step": 397 + }, + { + "epoch": 0.018530158065041786, + "grad_norm": 0.5935467124568805, + "learning_rate": 6.176288019863439e-06, + "loss": 9.0734, + "step": 398 + }, + { + "epoch": 0.018576716251134856, + "grad_norm": 0.6045456271801779, + "learning_rate": 6.191806331471136e-06, + "loss": 9.0145, + "step": 399 + }, + { + "epoch": 0.018623274437227926, + "grad_norm": 0.6214023502230163, + "learning_rate": 6.207324643078833e-06, + "loss": 9.0122, + "step": 400 + }, + { + "epoch": 0.018669832623320996, + "grad_norm": 0.6084308382625203, + "learning_rate": 6.22284295468653e-06, + "loss": 9.0268, + "step": 401 + }, + { + "epoch": 0.018716390809414066, + "grad_norm": 0.6302483842127996, + "learning_rate": 6.238361266294227e-06, + "loss": 8.9945, + "step": 402 + }, + { + "epoch": 0.018762948995507137, + "grad_norm": 0.6446354865640466, + "learning_rate": 6.253879577901925e-06, + "loss": 8.9416, + "step": 403 + }, + { + "epoch": 0.018809507181600203, + "grad_norm": 0.5863246665867181, + "learning_rate": 6.269397889509622e-06, + "loss": 9.02, + "step": 404 + }, + { + "epoch": 0.018856065367693273, + "grad_norm": 0.5984392738590424, + "learning_rate": 6.284916201117319e-06, + "loss": 9.1116, + "step": 405 + }, + { + "epoch": 0.018902623553786344, + "grad_norm": 0.6131517332532901, + "learning_rate": 6.300434512725016e-06, + "loss": 8.9889, + "step": 406 + }, + { + "epoch": 0.018949181739879414, + "grad_norm": 0.6113119942324936, + "learning_rate": 6.315952824332713e-06, + "loss": 9.001, + "step": 407 + }, + { + "epoch": 0.018995739925972484, + "grad_norm": 0.5887810261205353, + "learning_rate": 6.33147113594041e-06, + "loss": 8.9149, + "step": 408 + }, + { + "epoch": 0.019042298112065554, + "grad_norm": 0.6097720354260577, + "learning_rate": 6.346989447548107e-06, + "loss": 8.9765, + "step": 409 + }, + { + "epoch": 0.019088856298158624, + "grad_norm": 0.583842639717529, + "learning_rate": 6.3625077591558045e-06, + "loss": 8.9948, + "step": 410 + }, + { + "epoch": 0.019135414484251694, + "grad_norm": 0.5976346797787848, + "learning_rate": 6.378026070763502e-06, + "loss": 8.9721, + "step": 411 + }, + { + "epoch": 0.019181972670344764, + "grad_norm": 0.5951770501841839, + "learning_rate": 6.393544382371199e-06, + "loss": 9.0427, + "step": 412 + }, + { + "epoch": 0.019228530856437834, + "grad_norm": 0.596679870167453, + "learning_rate": 6.409062693978896e-06, + "loss": 8.9063, + "step": 413 + }, + { + "epoch": 0.019275089042530905, + "grad_norm": 0.5853043717357411, + "learning_rate": 6.424581005586592e-06, + "loss": 8.9782, + "step": 414 + }, + { + "epoch": 0.01932164722862397, + "grad_norm": 0.5799973066878915, + "learning_rate": 6.440099317194289e-06, + "loss": 9.028, + "step": 415 + }, + { + "epoch": 0.01936820541471704, + "grad_norm": 0.5946772874645814, + "learning_rate": 6.4556176288019865e-06, + "loss": 9.027, + "step": 416 + }, + { + "epoch": 0.01941476360081011, + "grad_norm": 0.5950061288365961, + "learning_rate": 6.471135940409684e-06, + "loss": 8.9722, + "step": 417 + }, + { + "epoch": 0.01946132178690318, + "grad_norm": 0.5842924523929383, + "learning_rate": 6.486654252017381e-06, + "loss": 8.8983, + "step": 418 + }, + { + "epoch": 0.019507879972996252, + "grad_norm": 0.5708011064666275, + "learning_rate": 6.502172563625078e-06, + "loss": 
9.0065, + "step": 419 + }, + { + "epoch": 0.019554438159089322, + "grad_norm": 0.5498755037779994, + "learning_rate": 6.517690875232775e-06, + "loss": 9.0263, + "step": 420 + }, + { + "epoch": 0.019600996345182392, + "grad_norm": 0.5781262556380635, + "learning_rate": 6.533209186840472e-06, + "loss": 8.9196, + "step": 421 + }, + { + "epoch": 0.019647554531275462, + "grad_norm": 0.5682707892395541, + "learning_rate": 6.5487274984481695e-06, + "loss": 8.9936, + "step": 422 + }, + { + "epoch": 0.019694112717368532, + "grad_norm": 0.6318846460860622, + "learning_rate": 6.564245810055867e-06, + "loss": 8.7841, + "step": 423 + }, + { + "epoch": 0.019740670903461602, + "grad_norm": 0.5788472163677638, + "learning_rate": 6.579764121663563e-06, + "loss": 8.8236, + "step": 424 + }, + { + "epoch": 0.019787229089554673, + "grad_norm": 0.5939365741882121, + "learning_rate": 6.59528243327126e-06, + "loss": 8.7908, + "step": 425 + }, + { + "epoch": 0.01983378727564774, + "grad_norm": 0.5755056330690865, + "learning_rate": 6.610800744878957e-06, + "loss": 8.8709, + "step": 426 + }, + { + "epoch": 0.01988034546174081, + "grad_norm": 0.5552737448850419, + "learning_rate": 6.626319056486654e-06, + "loss": 8.952, + "step": 427 + }, + { + "epoch": 0.01992690364783388, + "grad_norm": 0.5597386456883178, + "learning_rate": 6.6418373680943515e-06, + "loss": 8.9, + "step": 428 + }, + { + "epoch": 0.01997346183392695, + "grad_norm": 0.6066045350997208, + "learning_rate": 6.657355679702049e-06, + "loss": 8.8482, + "step": 429 + }, + { + "epoch": 0.02002002002002002, + "grad_norm": 0.5513710371814035, + "learning_rate": 6.672873991309746e-06, + "loss": 8.9307, + "step": 430 + }, + { + "epoch": 0.02006657820611309, + "grad_norm": 0.5490836204165914, + "learning_rate": 6.688392302917443e-06, + "loss": 8.8888, + "step": 431 + }, + { + "epoch": 0.02011313639220616, + "grad_norm": 0.5867640302799865, + "learning_rate": 6.70391061452514e-06, + "loss": 8.8538, + "step": 432 + }, + { + "epoch": 0.02015969457829923, + "grad_norm": 0.5703617974436525, + "learning_rate": 6.719428926132837e-06, + "loss": 8.8196, + "step": 433 + }, + { + "epoch": 0.0202062527643923, + "grad_norm": 0.581509880840696, + "learning_rate": 6.734947237740534e-06, + "loss": 8.8698, + "step": 434 + }, + { + "epoch": 0.02025281095048537, + "grad_norm": 0.540976783542496, + "learning_rate": 6.750465549348231e-06, + "loss": 8.8582, + "step": 435 + }, + { + "epoch": 0.020299369136578437, + "grad_norm": 0.5614501273853012, + "learning_rate": 6.765983860955928e-06, + "loss": 8.8503, + "step": 436 + }, + { + "epoch": 0.020345927322671507, + "grad_norm": 0.5654617147620372, + "learning_rate": 6.781502172563625e-06, + "loss": 8.8798, + "step": 437 + }, + { + "epoch": 0.020392485508764577, + "grad_norm": 0.5807867127779124, + "learning_rate": 6.797020484171322e-06, + "loss": 8.7212, + "step": 438 + }, + { + "epoch": 0.020439043694857648, + "grad_norm": 0.5788191693366995, + "learning_rate": 6.812538795779019e-06, + "loss": 8.8078, + "step": 439 + }, + { + "epoch": 0.020485601880950718, + "grad_norm": 0.5554144038732435, + "learning_rate": 6.8280571073867165e-06, + "loss": 8.8092, + "step": 440 + }, + { + "epoch": 0.020532160067043788, + "grad_norm": 0.5640448078395995, + "learning_rate": 6.843575418994414e-06, + "loss": 8.7646, + "step": 441 + }, + { + "epoch": 0.020578718253136858, + "grad_norm": 0.5612996408044892, + "learning_rate": 6.859093730602111e-06, + "loss": 8.8476, + "step": 442 + }, + { + "epoch": 0.020625276439229928, + "grad_norm": 
0.5710678454576622, + "learning_rate": 6.874612042209808e-06, + "loss": 8.8063, + "step": 443 + }, + { + "epoch": 0.020671834625323, + "grad_norm": 0.5329041357945961, + "learning_rate": 6.890130353817504e-06, + "loss": 8.7697, + "step": 444 + }, + { + "epoch": 0.02071839281141607, + "grad_norm": 0.5226776992834872, + "learning_rate": 6.9056486654252015e-06, + "loss": 8.7977, + "step": 445 + }, + { + "epoch": 0.02076495099750914, + "grad_norm": 0.5376913582692041, + "learning_rate": 6.921166977032899e-06, + "loss": 8.9064, + "step": 446 + }, + { + "epoch": 0.020811509183602205, + "grad_norm": 0.527285702742542, + "learning_rate": 6.936685288640596e-06, + "loss": 8.8755, + "step": 447 + }, + { + "epoch": 0.020858067369695275, + "grad_norm": 0.5366121859117784, + "learning_rate": 6.952203600248293e-06, + "loss": 8.8058, + "step": 448 + }, + { + "epoch": 0.020904625555788345, + "grad_norm": 0.5513168775677815, + "learning_rate": 6.96772191185599e-06, + "loss": 8.7976, + "step": 449 + }, + { + "epoch": 0.020951183741881416, + "grad_norm": 0.5212642783393423, + "learning_rate": 6.983240223463687e-06, + "loss": 8.7688, + "step": 450 + }, + { + "epoch": 0.020997741927974486, + "grad_norm": 0.5218436699861614, + "learning_rate": 6.998758535071384e-06, + "loss": 8.8543, + "step": 451 + }, + { + "epoch": 0.021044300114067556, + "grad_norm": 0.5389119649612837, + "learning_rate": 7.0142768466790815e-06, + "loss": 8.8283, + "step": 452 + }, + { + "epoch": 0.021090858300160626, + "grad_norm": 0.5231843557879038, + "learning_rate": 7.029795158286778e-06, + "loss": 8.7832, + "step": 453 + }, + { + "epoch": 0.021137416486253696, + "grad_norm": 0.5430231485197596, + "learning_rate": 7.045313469894475e-06, + "loss": 8.7846, + "step": 454 + }, + { + "epoch": 0.021183974672346766, + "grad_norm": 0.53908104115424, + "learning_rate": 7.060831781502172e-06, + "loss": 8.7932, + "step": 455 + }, + { + "epoch": 0.021230532858439836, + "grad_norm": 0.5269193524834919, + "learning_rate": 7.076350093109869e-06, + "loss": 8.7579, + "step": 456 + }, + { + "epoch": 0.021277091044532907, + "grad_norm": 0.5330699011431079, + "learning_rate": 7.0918684047175665e-06, + "loss": 8.6912, + "step": 457 + }, + { + "epoch": 0.021323649230625973, + "grad_norm": 0.5366111556864204, + "learning_rate": 7.107386716325264e-06, + "loss": 8.7235, + "step": 458 + }, + { + "epoch": 0.021370207416719043, + "grad_norm": 0.5399774015531547, + "learning_rate": 7.122905027932961e-06, + "loss": 8.7279, + "step": 459 + }, + { + "epoch": 0.021416765602812114, + "grad_norm": 0.5245826766556709, + "learning_rate": 7.138423339540658e-06, + "loss": 8.7559, + "step": 460 + }, + { + "epoch": 0.021463323788905184, + "grad_norm": 0.5257840434883655, + "learning_rate": 7.153941651148355e-06, + "loss": 8.6184, + "step": 461 + }, + { + "epoch": 0.021509881974998254, + "grad_norm": 0.5393514601119744, + "learning_rate": 7.169459962756052e-06, + "loss": 8.6786, + "step": 462 + }, + { + "epoch": 0.021556440161091324, + "grad_norm": 0.5127213195468944, + "learning_rate": 7.1849782743637486e-06, + "loss": 8.7237, + "step": 463 + }, + { + "epoch": 0.021602998347184394, + "grad_norm": 0.5685479560777068, + "learning_rate": 7.200496585971446e-06, + "loss": 8.6682, + "step": 464 + }, + { + "epoch": 0.021649556533277464, + "grad_norm": 0.537064136822898, + "learning_rate": 7.216014897579143e-06, + "loss": 8.722, + "step": 465 + }, + { + "epoch": 0.021696114719370534, + "grad_norm": 0.5147733344292178, + "learning_rate": 7.23153320918684e-06, + "loss": 8.662, + "step": 
466 + }, + { + "epoch": 0.021742672905463604, + "grad_norm": 0.5347759544223106, + "learning_rate": 7.247051520794539e-06, + "loss": 8.7237, + "step": 467 + }, + { + "epoch": 0.021789231091556675, + "grad_norm": 0.5482602165425612, + "learning_rate": 7.262569832402236e-06, + "loss": 8.6204, + "step": 468 + }, + { + "epoch": 0.02183578927764974, + "grad_norm": 0.5250724361567504, + "learning_rate": 7.278088144009932e-06, + "loss": 8.6668, + "step": 469 + }, + { + "epoch": 0.02188234746374281, + "grad_norm": 0.5114796120020922, + "learning_rate": 7.2936064556176295e-06, + "loss": 8.6221, + "step": 470 + }, + { + "epoch": 0.02192890564983588, + "grad_norm": 0.5263743530214181, + "learning_rate": 7.309124767225327e-06, + "loss": 8.6449, + "step": 471 + }, + { + "epoch": 0.02197546383592895, + "grad_norm": 0.5041955737736455, + "learning_rate": 7.324643078833024e-06, + "loss": 8.6658, + "step": 472 + }, + { + "epoch": 0.022022022022022022, + "grad_norm": 0.5513551023438878, + "learning_rate": 7.340161390440721e-06, + "loss": 8.597, + "step": 473 + }, + { + "epoch": 0.022068580208115092, + "grad_norm": 0.523952323864142, + "learning_rate": 7.355679702048418e-06, + "loss": 8.6533, + "step": 474 + }, + { + "epoch": 0.022115138394208162, + "grad_norm": 0.5133561705575611, + "learning_rate": 7.371198013656115e-06, + "loss": 8.555, + "step": 475 + }, + { + "epoch": 0.022161696580301232, + "grad_norm": 0.4982558186564148, + "learning_rate": 7.386716325263812e-06, + "loss": 8.6415, + "step": 476 + }, + { + "epoch": 0.022208254766394302, + "grad_norm": 0.5094518009501655, + "learning_rate": 7.4022346368715096e-06, + "loss": 8.6052, + "step": 477 + }, + { + "epoch": 0.022254812952487373, + "grad_norm": 0.4928417277594994, + "learning_rate": 7.417752948479207e-06, + "loss": 8.6991, + "step": 478 + }, + { + "epoch": 0.02230137113858044, + "grad_norm": 0.49437312032468844, + "learning_rate": 7.433271260086903e-06, + "loss": 8.6141, + "step": 479 + }, + { + "epoch": 0.02234792932467351, + "grad_norm": 0.4918043141969281, + "learning_rate": 7.4487895716946e-06, + "loss": 8.6735, + "step": 480 + }, + { + "epoch": 0.02239448751076658, + "grad_norm": 0.4966512250141325, + "learning_rate": 7.464307883302297e-06, + "loss": 8.6316, + "step": 481 + }, + { + "epoch": 0.02244104569685965, + "grad_norm": 0.524021451003587, + "learning_rate": 7.4798261949099945e-06, + "loss": 8.5539, + "step": 482 + }, + { + "epoch": 0.02248760388295272, + "grad_norm": 0.5083521088515269, + "learning_rate": 7.495344506517692e-06, + "loss": 8.5614, + "step": 483 + }, + { + "epoch": 0.02253416206904579, + "grad_norm": 0.5066652330806518, + "learning_rate": 7.510862818125389e-06, + "loss": 8.6091, + "step": 484 + }, + { + "epoch": 0.02258072025513886, + "grad_norm": 0.4971842035647404, + "learning_rate": 7.526381129733086e-06, + "loss": 8.5542, + "step": 485 + }, + { + "epoch": 0.02262727844123193, + "grad_norm": 0.49701970619526925, + "learning_rate": 7.541899441340783e-06, + "loss": 8.596, + "step": 486 + }, + { + "epoch": 0.022673836627325, + "grad_norm": 0.5069032386042411, + "learning_rate": 7.55741775294848e-06, + "loss": 8.6163, + "step": 487 + }, + { + "epoch": 0.02272039481341807, + "grad_norm": 0.5201343750447671, + "learning_rate": 7.572936064556177e-06, + "loss": 8.4805, + "step": 488 + }, + { + "epoch": 0.02276695299951114, + "grad_norm": 0.5197557274918099, + "learning_rate": 7.588454376163874e-06, + "loss": 8.5041, + "step": 489 + }, + { + "epoch": 0.022813511185604207, + "grad_norm": 0.5058839747002555, + "learning_rate": 
7.603972687771571e-06, + "loss": 8.5213, + "step": 490 + }, + { + "epoch": 0.022860069371697277, + "grad_norm": 0.5314662445868537, + "learning_rate": 7.619490999379268e-06, + "loss": 8.5448, + "step": 491 + }, + { + "epoch": 0.022906627557790347, + "grad_norm": 0.4791111101403479, + "learning_rate": 7.635009310986966e-06, + "loss": 8.5672, + "step": 492 + }, + { + "epoch": 0.022953185743883418, + "grad_norm": 0.4626232912483305, + "learning_rate": 7.650527622594662e-06, + "loss": 8.5628, + "step": 493 + }, + { + "epoch": 0.022999743929976488, + "grad_norm": 0.4895315760400803, + "learning_rate": 7.666045934202359e-06, + "loss": 8.4699, + "step": 494 + }, + { + "epoch": 0.023046302116069558, + "grad_norm": 0.4842858922066811, + "learning_rate": 7.681564245810057e-06, + "loss": 8.3624, + "step": 495 + }, + { + "epoch": 0.023092860302162628, + "grad_norm": 0.5001276403123834, + "learning_rate": 7.697082557417753e-06, + "loss": 8.5212, + "step": 496 + }, + { + "epoch": 0.023139418488255698, + "grad_norm": 0.5043769285867562, + "learning_rate": 7.712600869025451e-06, + "loss": 8.5015, + "step": 497 + }, + { + "epoch": 0.02318597667434877, + "grad_norm": 0.4889687745987486, + "learning_rate": 7.728119180633147e-06, + "loss": 8.5375, + "step": 498 + }, + { + "epoch": 0.02323253486044184, + "grad_norm": 0.5130875656005853, + "learning_rate": 7.743637492240845e-06, + "loss": 8.5122, + "step": 499 + }, + { + "epoch": 0.02327909304653491, + "grad_norm": 0.48716975770795323, + "learning_rate": 7.759155803848542e-06, + "loss": 8.4605, + "step": 500 + }, + { + "epoch": 0.023325651232627975, + "grad_norm": 0.4899372651350655, + "learning_rate": 7.77467411545624e-06, + "loss": 8.4998, + "step": 501 + }, + { + "epoch": 0.023372209418721045, + "grad_norm": 0.4652931275425053, + "learning_rate": 7.790192427063936e-06, + "loss": 8.5082, + "step": 502 + }, + { + "epoch": 0.023418767604814116, + "grad_norm": 0.4694956123126235, + "learning_rate": 7.805710738671632e-06, + "loss": 8.5222, + "step": 503 + }, + { + "epoch": 0.023465325790907186, + "grad_norm": 0.48446966083589893, + "learning_rate": 7.82122905027933e-06, + "loss": 8.3882, + "step": 504 + }, + { + "epoch": 0.023511883977000256, + "grad_norm": 0.4905760554258697, + "learning_rate": 7.836747361887026e-06, + "loss": 8.477, + "step": 505 + }, + { + "epoch": 0.023558442163093326, + "grad_norm": 0.5094695746905026, + "learning_rate": 7.852265673494724e-06, + "loss": 8.4077, + "step": 506 + }, + { + "epoch": 0.023605000349186396, + "grad_norm": 0.46749474249987494, + "learning_rate": 7.86778398510242e-06, + "loss": 8.4723, + "step": 507 + }, + { + "epoch": 0.023651558535279466, + "grad_norm": 0.47145535503040925, + "learning_rate": 7.883302296710119e-06, + "loss": 8.5296, + "step": 508 + }, + { + "epoch": 0.023698116721372536, + "grad_norm": 0.4843035446710819, + "learning_rate": 7.898820608317815e-06, + "loss": 8.426, + "step": 509 + }, + { + "epoch": 0.023744674907465606, + "grad_norm": 0.4830585348103063, + "learning_rate": 7.914338919925513e-06, + "loss": 8.4277, + "step": 510 + }, + { + "epoch": 0.023791233093558677, + "grad_norm": 0.49208825356823854, + "learning_rate": 7.92985723153321e-06, + "loss": 8.4357, + "step": 511 + }, + { + "epoch": 0.023837791279651743, + "grad_norm": 0.4906675146252165, + "learning_rate": 7.945375543140907e-06, + "loss": 8.3497, + "step": 512 + }, + { + "epoch": 0.023884349465744813, + "grad_norm": 0.5618721421012429, + "learning_rate": 7.960893854748604e-06, + "loss": 8.4279, + "step": 513 + }, + { + "epoch": 
0.023930907651837884, + "grad_norm": 0.4854428778207306, + "learning_rate": 7.9764121663563e-06, + "loss": 8.3712, + "step": 514 + }, + { + "epoch": 0.023977465837930954, + "grad_norm": 0.45466032524842526, + "learning_rate": 7.991930477963998e-06, + "loss": 8.4725, + "step": 515 + }, + { + "epoch": 0.024024024024024024, + "grad_norm": 0.45419458758871867, + "learning_rate": 8.007448789571694e-06, + "loss": 8.3743, + "step": 516 + }, + { + "epoch": 0.024070582210117094, + "grad_norm": 0.5597113559686218, + "learning_rate": 8.022967101179392e-06, + "loss": 8.3931, + "step": 517 + }, + { + "epoch": 0.024117140396210164, + "grad_norm": 0.47801981070708527, + "learning_rate": 8.038485412787089e-06, + "loss": 8.306, + "step": 518 + }, + { + "epoch": 0.024163698582303234, + "grad_norm": 0.47057243416657646, + "learning_rate": 8.054003724394787e-06, + "loss": 8.3549, + "step": 519 + }, + { + "epoch": 0.024210256768396304, + "grad_norm": 0.47527239660628295, + "learning_rate": 8.069522036002483e-06, + "loss": 8.363, + "step": 520 + }, + { + "epoch": 0.024256814954489374, + "grad_norm": 0.4580669776818313, + "learning_rate": 8.085040347610181e-06, + "loss": 8.3912, + "step": 521 + }, + { + "epoch": 0.024303373140582445, + "grad_norm": 0.46472919214182634, + "learning_rate": 8.100558659217877e-06, + "loss": 8.3486, + "step": 522 + }, + { + "epoch": 0.02434993132667551, + "grad_norm": 0.46291527787356945, + "learning_rate": 8.116076970825574e-06, + "loss": 8.3872, + "step": 523 + }, + { + "epoch": 0.02439648951276858, + "grad_norm": 0.44255195470244857, + "learning_rate": 8.131595282433272e-06, + "loss": 8.4047, + "step": 524 + }, + { + "epoch": 0.02444304769886165, + "grad_norm": 0.4658499481950748, + "learning_rate": 8.147113594040968e-06, + "loss": 8.351, + "step": 525 + }, + { + "epoch": 0.02448960588495472, + "grad_norm": 0.4413166514317405, + "learning_rate": 8.162631905648666e-06, + "loss": 8.3586, + "step": 526 + }, + { + "epoch": 0.024536164071047792, + "grad_norm": 0.47334839751152535, + "learning_rate": 8.178150217256362e-06, + "loss": 8.3419, + "step": 527 + }, + { + "epoch": 0.024582722257140862, + "grad_norm": 0.44721248899833954, + "learning_rate": 8.19366852886406e-06, + "loss": 8.3394, + "step": 528 + }, + { + "epoch": 0.024629280443233932, + "grad_norm": 0.4492174381612194, + "learning_rate": 8.209186840471756e-06, + "loss": 8.3186, + "step": 529 + }, + { + "epoch": 0.024675838629327002, + "grad_norm": 0.46579383324964596, + "learning_rate": 8.224705152079454e-06, + "loss": 8.3058, + "step": 530 + }, + { + "epoch": 0.024722396815420072, + "grad_norm": 0.45931065183497466, + "learning_rate": 8.24022346368715e-06, + "loss": 8.2644, + "step": 531 + }, + { + "epoch": 0.024768955001513143, + "grad_norm": 0.49555805110985957, + "learning_rate": 8.255741775294847e-06, + "loss": 8.2899, + "step": 532 + }, + { + "epoch": 0.02481551318760621, + "grad_norm": 0.4517152028268037, + "learning_rate": 8.271260086902545e-06, + "loss": 8.3603, + "step": 533 + }, + { + "epoch": 0.02486207137369928, + "grad_norm": 0.4088956635141987, + "learning_rate": 8.286778398510241e-06, + "loss": 8.3849, + "step": 534 + }, + { + "epoch": 0.02490862955979235, + "grad_norm": 0.4570536872208425, + "learning_rate": 8.30229671011794e-06, + "loss": 8.3047, + "step": 535 + }, + { + "epoch": 0.02495518774588542, + "grad_norm": 0.4585753624303468, + "learning_rate": 8.317815021725636e-06, + "loss": 8.2761, + "step": 536 + }, + { + "epoch": 0.02500174593197849, + "grad_norm": 0.452444906957823, + "learning_rate": 
8.333333333333334e-06, + "loss": 8.3162, + "step": 537 + }, + { + "epoch": 0.02504830411807156, + "grad_norm": 0.4602807674068385, + "learning_rate": 8.34885164494103e-06, + "loss": 8.3338, + "step": 538 + }, + { + "epoch": 0.02509486230416463, + "grad_norm": 0.4519284066282247, + "learning_rate": 8.364369956548728e-06, + "loss": 8.2605, + "step": 539 + }, + { + "epoch": 0.0251414204902577, + "grad_norm": 0.4401333219808544, + "learning_rate": 8.379888268156424e-06, + "loss": 8.265, + "step": 540 + }, + { + "epoch": 0.02518797867635077, + "grad_norm": 0.4383657416332876, + "learning_rate": 8.395406579764122e-06, + "loss": 8.3287, + "step": 541 + }, + { + "epoch": 0.02523453686244384, + "grad_norm": 0.46756183969598725, + "learning_rate": 8.410924891371819e-06, + "loss": 8.1975, + "step": 542 + }, + { + "epoch": 0.02528109504853691, + "grad_norm": 0.4565295046141926, + "learning_rate": 8.426443202979515e-06, + "loss": 8.2802, + "step": 543 + }, + { + "epoch": 0.025327653234629977, + "grad_norm": 0.44815547318405513, + "learning_rate": 8.441961514587213e-06, + "loss": 8.2998, + "step": 544 + }, + { + "epoch": 0.025374211420723047, + "grad_norm": 0.46474270628915254, + "learning_rate": 8.45747982619491e-06, + "loss": 8.2944, + "step": 545 + }, + { + "epoch": 0.025420769606816117, + "grad_norm": 0.43493199841415364, + "learning_rate": 8.472998137802607e-06, + "loss": 8.2694, + "step": 546 + }, + { + "epoch": 0.025467327792909188, + "grad_norm": 0.4840251295155541, + "learning_rate": 8.488516449410304e-06, + "loss": 8.2629, + "step": 547 + }, + { + "epoch": 0.025513885979002258, + "grad_norm": 0.47883242664682124, + "learning_rate": 8.504034761018002e-06, + "loss": 8.2997, + "step": 548 + }, + { + "epoch": 0.025560444165095328, + "grad_norm": 0.44099790803463595, + "learning_rate": 8.519553072625698e-06, + "loss": 8.2431, + "step": 549 + }, + { + "epoch": 0.025607002351188398, + "grad_norm": 0.49713325714068873, + "learning_rate": 8.535071384233396e-06, + "loss": 8.2538, + "step": 550 + }, + { + "epoch": 0.025653560537281468, + "grad_norm": 0.44764729014085275, + "learning_rate": 8.550589695841092e-06, + "loss": 8.1903, + "step": 551 + }, + { + "epoch": 0.02570011872337454, + "grad_norm": 0.4729635891545959, + "learning_rate": 8.56610800744879e-06, + "loss": 8.2085, + "step": 552 + }, + { + "epoch": 0.02574667690946761, + "grad_norm": 0.5126630213071719, + "learning_rate": 8.581626319056488e-06, + "loss": 8.1816, + "step": 553 + }, + { + "epoch": 0.02579323509556068, + "grad_norm": 0.44952236587367367, + "learning_rate": 8.597144630664185e-06, + "loss": 8.2553, + "step": 554 + }, + { + "epoch": 0.025839793281653745, + "grad_norm": 0.45593932614749044, + "learning_rate": 8.612662942271883e-06, + "loss": 8.299, + "step": 555 + }, + { + "epoch": 0.025886351467746815, + "grad_norm": 0.43762437483635946, + "learning_rate": 8.628181253879579e-06, + "loss": 8.2522, + "step": 556 + }, + { + "epoch": 0.025932909653839886, + "grad_norm": 0.44334216502572504, + "learning_rate": 8.643699565487277e-06, + "loss": 8.1925, + "step": 557 + }, + { + "epoch": 0.025979467839932956, + "grad_norm": 0.5435886366837787, + "learning_rate": 8.659217877094973e-06, + "loss": 8.0686, + "step": 558 + }, + { + "epoch": 0.026026026026026026, + "grad_norm": 0.4399402309047533, + "learning_rate": 8.67473618870267e-06, + "loss": 8.1489, + "step": 559 + }, + { + "epoch": 0.026072584212119096, + "grad_norm": 0.5841132340241131, + "learning_rate": 8.690254500310367e-06, + "loss": 8.1341, + "step": 560 + }, + { + "epoch": 
0.026119142398212166, + "grad_norm": 0.4868143972089643, + "learning_rate": 8.705772811918064e-06, + "loss": 8.1706, + "step": 561 + }, + { + "epoch": 0.026165700584305236, + "grad_norm": 0.5080384547217853, + "learning_rate": 8.721291123525762e-06, + "loss": 8.2538, + "step": 562 + }, + { + "epoch": 0.026212258770398306, + "grad_norm": 0.5689911287416477, + "learning_rate": 8.736809435133458e-06, + "loss": 8.1494, + "step": 563 + }, + { + "epoch": 0.026258816956491376, + "grad_norm": 0.4407832398384714, + "learning_rate": 8.752327746741156e-06, + "loss": 8.2455, + "step": 564 + }, + { + "epoch": 0.026305375142584447, + "grad_norm": 0.5291375796922599, + "learning_rate": 8.767846058348852e-06, + "loss": 8.1285, + "step": 565 + }, + { + "epoch": 0.026351933328677513, + "grad_norm": 0.44043126322197607, + "learning_rate": 8.78336436995655e-06, + "loss": 8.1628, + "step": 566 + }, + { + "epoch": 0.026398491514770583, + "grad_norm": 0.5409867459065124, + "learning_rate": 8.798882681564247e-06, + "loss": 7.9857, + "step": 567 + }, + { + "epoch": 0.026445049700863654, + "grad_norm": 0.4763587303493496, + "learning_rate": 8.814400993171943e-06, + "loss": 8.1446, + "step": 568 + }, + { + "epoch": 0.026491607886956724, + "grad_norm": 0.4414999470398085, + "learning_rate": 8.829919304779641e-06, + "loss": 8.1174, + "step": 569 + }, + { + "epoch": 0.026538166073049794, + "grad_norm": 0.4626169672344333, + "learning_rate": 8.845437616387337e-06, + "loss": 8.1568, + "step": 570 + }, + { + "epoch": 0.026584724259142864, + "grad_norm": 0.4312782282423608, + "learning_rate": 8.860955927995035e-06, + "loss": 8.1173, + "step": 571 + }, + { + "epoch": 0.026631282445235934, + "grad_norm": 0.4397905323558677, + "learning_rate": 8.876474239602732e-06, + "loss": 8.1374, + "step": 572 + }, + { + "epoch": 0.026677840631329004, + "grad_norm": 0.4605079568602714, + "learning_rate": 8.89199255121043e-06, + "loss": 8.1719, + "step": 573 + }, + { + "epoch": 0.026724398817422074, + "grad_norm": 0.4172754506940507, + "learning_rate": 8.907510862818126e-06, + "loss": 8.1057, + "step": 574 + }, + { + "epoch": 0.026770957003515145, + "grad_norm": 0.4223687565551764, + "learning_rate": 8.923029174425824e-06, + "loss": 8.1492, + "step": 575 + }, + { + "epoch": 0.02681751518960821, + "grad_norm": 0.4197252533077379, + "learning_rate": 8.93854748603352e-06, + "loss": 8.1764, + "step": 576 + }, + { + "epoch": 0.02686407337570128, + "grad_norm": 0.39442952515836305, + "learning_rate": 8.954065797641217e-06, + "loss": 8.0757, + "step": 577 + }, + { + "epoch": 0.02691063156179435, + "grad_norm": 0.42002633040525733, + "learning_rate": 8.969584109248915e-06, + "loss": 8.2232, + "step": 578 + }, + { + "epoch": 0.02695718974788742, + "grad_norm": 0.40275744366371413, + "learning_rate": 8.98510242085661e-06, + "loss": 8.145, + "step": 579 + }, + { + "epoch": 0.02700374793398049, + "grad_norm": 0.4150333791466974, + "learning_rate": 9.000620732464309e-06, + "loss": 8.1302, + "step": 580 + }, + { + "epoch": 0.027050306120073562, + "grad_norm": 0.42168345481540526, + "learning_rate": 9.016139044072005e-06, + "loss": 8.0653, + "step": 581 + }, + { + "epoch": 0.027096864306166632, + "grad_norm": 0.4099670912338954, + "learning_rate": 9.031657355679703e-06, + "loss": 8.0768, + "step": 582 + }, + { + "epoch": 0.027143422492259702, + "grad_norm": 0.3904660933217091, + "learning_rate": 9.0471756672874e-06, + "loss": 8.1026, + "step": 583 + }, + { + "epoch": 0.027189980678352772, + "grad_norm": 0.38480330004293867, + "learning_rate": 
9.062693978895097e-06, + "loss": 8.1035, + "step": 584 + }, + { + "epoch": 0.027236538864445842, + "grad_norm": 0.4209964206290683, + "learning_rate": 9.078212290502794e-06, + "loss": 8.1538, + "step": 585 + }, + { + "epoch": 0.027283097050538913, + "grad_norm": 0.3723818424722279, + "learning_rate": 9.093730602110492e-06, + "loss": 8.054, + "step": 586 + }, + { + "epoch": 0.02732965523663198, + "grad_norm": 0.45205784501044194, + "learning_rate": 9.109248913718188e-06, + "loss": 7.9876, + "step": 587 + }, + { + "epoch": 0.02737621342272505, + "grad_norm": 0.40643529021263974, + "learning_rate": 9.124767225325884e-06, + "loss": 8.1172, + "step": 588 + }, + { + "epoch": 0.02742277160881812, + "grad_norm": 0.4258764687693268, + "learning_rate": 9.140285536933582e-06, + "loss": 8.13, + "step": 589 + }, + { + "epoch": 0.02746932979491119, + "grad_norm": 0.38107077928785227, + "learning_rate": 9.155803848541279e-06, + "loss": 8.1796, + "step": 590 + }, + { + "epoch": 0.02751588798100426, + "grad_norm": 0.43757262806823366, + "learning_rate": 9.171322160148977e-06, + "loss": 8.0428, + "step": 591 + }, + { + "epoch": 0.02756244616709733, + "grad_norm": 0.4296090545790464, + "learning_rate": 9.186840471756673e-06, + "loss": 8.0832, + "step": 592 + }, + { + "epoch": 0.0276090043531904, + "grad_norm": 0.4252372572355349, + "learning_rate": 9.202358783364371e-06, + "loss": 8.1124, + "step": 593 + }, + { + "epoch": 0.02765556253928347, + "grad_norm": 0.4100203745613154, + "learning_rate": 9.217877094972067e-06, + "loss": 7.9846, + "step": 594 + }, + { + "epoch": 0.02770212072537654, + "grad_norm": 0.43442927551515964, + "learning_rate": 9.233395406579765e-06, + "loss": 8.0738, + "step": 595 + }, + { + "epoch": 0.02774867891146961, + "grad_norm": 0.3829969815434038, + "learning_rate": 9.248913718187462e-06, + "loss": 8.0961, + "step": 596 + }, + { + "epoch": 0.02779523709756268, + "grad_norm": 0.4336779852858108, + "learning_rate": 9.264432029795158e-06, + "loss": 8.045, + "step": 597 + }, + { + "epoch": 0.027841795283655747, + "grad_norm": 0.38572540404558264, + "learning_rate": 9.279950341402856e-06, + "loss": 8.0545, + "step": 598 + }, + { + "epoch": 0.027888353469748817, + "grad_norm": 0.4031054516941499, + "learning_rate": 9.295468653010552e-06, + "loss": 8.0345, + "step": 599 + }, + { + "epoch": 0.027934911655841888, + "grad_norm": 0.3898380503885283, + "learning_rate": 9.31098696461825e-06, + "loss": 8.0679, + "step": 600 + }, + { + "epoch": 0.027981469841934958, + "grad_norm": 0.3951237537481144, + "learning_rate": 9.326505276225947e-06, + "loss": 7.9501, + "step": 601 + }, + { + "epoch": 0.028028028028028028, + "grad_norm": 0.43197963968170405, + "learning_rate": 9.342023587833645e-06, + "loss": 7.9424, + "step": 602 + }, + { + "epoch": 0.028074586214121098, + "grad_norm": 0.39084882028316675, + "learning_rate": 9.35754189944134e-06, + "loss": 8.0805, + "step": 603 + }, + { + "epoch": 0.028121144400214168, + "grad_norm": 0.43080494582086115, + "learning_rate": 9.373060211049039e-06, + "loss": 8.0054, + "step": 604 + }, + { + "epoch": 0.028167702586307238, + "grad_norm": 0.42013944799725694, + "learning_rate": 9.388578522656735e-06, + "loss": 7.8514, + "step": 605 + }, + { + "epoch": 0.02821426077240031, + "grad_norm": 0.3983994920899406, + "learning_rate": 9.404096834264431e-06, + "loss": 7.9936, + "step": 606 + }, + { + "epoch": 0.02826081895849338, + "grad_norm": 0.41148869660987497, + "learning_rate": 9.41961514587213e-06, + "loss": 7.931, + "step": 607 + }, + { + "epoch": 
0.02830737714458645, + "grad_norm": 0.413474693345938, + "learning_rate": 9.435133457479826e-06, + "loss": 8.0369, + "step": 608 + }, + { + "epoch": 0.028353935330679515, + "grad_norm": 0.4516684520758093, + "learning_rate": 9.450651769087524e-06, + "loss": 7.8945, + "step": 609 + }, + { + "epoch": 0.028400493516772585, + "grad_norm": 0.3603566337074661, + "learning_rate": 9.46617008069522e-06, + "loss": 7.998, + "step": 610 + }, + { + "epoch": 0.028447051702865656, + "grad_norm": 0.3940598169122538, + "learning_rate": 9.481688392302918e-06, + "loss": 7.9436, + "step": 611 + }, + { + "epoch": 0.028493609888958726, + "grad_norm": 0.35758774076595334, + "learning_rate": 9.497206703910614e-06, + "loss": 7.9648, + "step": 612 + }, + { + "epoch": 0.028540168075051796, + "grad_norm": 0.35032912090452495, + "learning_rate": 9.512725015518312e-06, + "loss": 7.9654, + "step": 613 + }, + { + "epoch": 0.028586726261144866, + "grad_norm": 0.3702165761091532, + "learning_rate": 9.528243327126009e-06, + "loss": 7.9352, + "step": 614 + }, + { + "epoch": 0.028633284447237936, + "grad_norm": 0.33711213011526747, + "learning_rate": 9.543761638733707e-06, + "loss": 8.0226, + "step": 615 + }, + { + "epoch": 0.028679842633331006, + "grad_norm": 0.36881525538064946, + "learning_rate": 9.559279950341403e-06, + "loss": 7.9067, + "step": 616 + }, + { + "epoch": 0.028726400819424076, + "grad_norm": 0.3689356661949512, + "learning_rate": 9.5747982619491e-06, + "loss": 7.9551, + "step": 617 + }, + { + "epoch": 0.028772959005517146, + "grad_norm": 0.3868239350583347, + "learning_rate": 9.590316573556797e-06, + "loss": 7.8461, + "step": 618 + }, + { + "epoch": 0.028819517191610213, + "grad_norm": 0.3693889393404752, + "learning_rate": 9.605834885164494e-06, + "loss": 7.9601, + "step": 619 + }, + { + "epoch": 0.028866075377703283, + "grad_norm": 0.34724367779836973, + "learning_rate": 9.621353196772192e-06, + "loss": 7.935, + "step": 620 + }, + { + "epoch": 0.028912633563796353, + "grad_norm": 0.43035481054867797, + "learning_rate": 9.636871508379888e-06, + "loss": 7.9076, + "step": 621 + }, + { + "epoch": 0.028959191749889424, + "grad_norm": 0.4056556329696784, + "learning_rate": 9.652389819987586e-06, + "loss": 7.9031, + "step": 622 + }, + { + "epoch": 0.029005749935982494, + "grad_norm": 0.4429012271569537, + "learning_rate": 9.667908131595282e-06, + "loss": 7.8519, + "step": 623 + }, + { + "epoch": 0.029052308122075564, + "grad_norm": 0.4736819098465597, + "learning_rate": 9.68342644320298e-06, + "loss": 7.9039, + "step": 624 + }, + { + "epoch": 0.029098866308168634, + "grad_norm": 0.4113982651783671, + "learning_rate": 9.698944754810677e-06, + "loss": 7.9103, + "step": 625 + }, + { + "epoch": 0.029145424494261704, + "grad_norm": 0.5939915749168139, + "learning_rate": 9.714463066418373e-06, + "loss": 7.8075, + "step": 626 + }, + { + "epoch": 0.029191982680354774, + "grad_norm": 0.3912572156080289, + "learning_rate": 9.72998137802607e-06, + "loss": 7.8642, + "step": 627 + }, + { + "epoch": 0.029238540866447844, + "grad_norm": 0.5200645214256644, + "learning_rate": 9.745499689633767e-06, + "loss": 7.8282, + "step": 628 + }, + { + "epoch": 0.029285099052540915, + "grad_norm": 0.36810229909812553, + "learning_rate": 9.761018001241465e-06, + "loss": 7.9861, + "step": 629 + }, + { + "epoch": 0.02933165723863398, + "grad_norm": 0.5440854314311153, + "learning_rate": 9.776536312849161e-06, + "loss": 7.8282, + "step": 630 + }, + { + "epoch": 0.02937821542472705, + "grad_norm": 0.3885178324790208, + "learning_rate": 
9.79205462445686e-06, + "loss": 7.8332, + "step": 631 + }, + { + "epoch": 0.02942477361082012, + "grad_norm": 0.49762040979534816, + "learning_rate": 9.807572936064556e-06, + "loss": 7.875, + "step": 632 + }, + { + "epoch": 0.02947133179691319, + "grad_norm": 0.3695717092995512, + "learning_rate": 9.823091247672254e-06, + "loss": 8.012, + "step": 633 + }, + { + "epoch": 0.029517889983006262, + "grad_norm": 0.5059780175973693, + "learning_rate": 9.83860955927995e-06, + "loss": 7.8455, + "step": 634 + }, + { + "epoch": 0.029564448169099332, + "grad_norm": 0.5019599837109578, + "learning_rate": 9.854127870887646e-06, + "loss": 7.8831, + "step": 635 + }, + { + "epoch": 0.029611006355192402, + "grad_norm": 0.3898443546218907, + "learning_rate": 9.869646182495346e-06, + "loss": 7.8755, + "step": 636 + }, + { + "epoch": 0.029657564541285472, + "grad_norm": 0.4818645779349516, + "learning_rate": 9.885164494103042e-06, + "loss": 7.7821, + "step": 637 + }, + { + "epoch": 0.029704122727378542, + "grad_norm": 0.3666037939143093, + "learning_rate": 9.90068280571074e-06, + "loss": 7.8082, + "step": 638 + }, + { + "epoch": 0.029750680913471612, + "grad_norm": 0.4463709614640673, + "learning_rate": 9.916201117318437e-06, + "loss": 7.7965, + "step": 639 + }, + { + "epoch": 0.029797239099564683, + "grad_norm": 0.33339676057028306, + "learning_rate": 9.931719428926135e-06, + "loss": 7.8408, + "step": 640 + }, + { + "epoch": 0.02984379728565775, + "grad_norm": 0.5163037538808969, + "learning_rate": 9.947237740533831e-06, + "loss": 7.8655, + "step": 641 + }, + { + "epoch": 0.02989035547175082, + "grad_norm": 0.32257023646724475, + "learning_rate": 9.962756052141527e-06, + "loss": 7.8969, + "step": 642 + }, + { + "epoch": 0.02993691365784389, + "grad_norm": 0.5330610038037318, + "learning_rate": 9.978274363749225e-06, + "loss": 7.8281, + "step": 643 + }, + { + "epoch": 0.02998347184393696, + "grad_norm": 0.4012022872948946, + "learning_rate": 9.993792675356922e-06, + "loss": 7.7897, + "step": 644 + }, + { + "epoch": 0.03003003003003003, + "grad_norm": 0.44961823500358705, + "learning_rate": 1.000931098696462e-05, + "loss": 7.7166, + "step": 645 + }, + { + "epoch": 0.0300765882161231, + "grad_norm": 0.43697388132418047, + "learning_rate": 1.0024829298572316e-05, + "loss": 7.826, + "step": 646 + }, + { + "epoch": 0.03012314640221617, + "grad_norm": 0.3405362214019478, + "learning_rate": 1.0040347610180014e-05, + "loss": 7.8469, + "step": 647 + }, + { + "epoch": 0.03016970458830924, + "grad_norm": 0.36139043057194087, + "learning_rate": 1.005586592178771e-05, + "loss": 7.8985, + "step": 648 + }, + { + "epoch": 0.03021626277440231, + "grad_norm": 0.42406305428052293, + "learning_rate": 1.0071384233395408e-05, + "loss": 7.7279, + "step": 649 + }, + { + "epoch": 0.03026282096049538, + "grad_norm": 0.38792912733386575, + "learning_rate": 1.0086902545003105e-05, + "loss": 7.8373, + "step": 650 + }, + { + "epoch": 0.03030937914658845, + "grad_norm": 0.3758646021158689, + "learning_rate": 1.0102420856610801e-05, + "loss": 7.759, + "step": 651 + }, + { + "epoch": 0.030355937332681517, + "grad_norm": 0.4440131547955386, + "learning_rate": 1.0117939168218499e-05, + "loss": 7.753, + "step": 652 + }, + { + "epoch": 0.030402495518774587, + "grad_norm": 0.3581955855674197, + "learning_rate": 1.0133457479826195e-05, + "loss": 7.8216, + "step": 653 + }, + { + "epoch": 0.030449053704867658, + "grad_norm": 0.48546249048439283, + "learning_rate": 1.0148975791433893e-05, + "loss": 7.7167, + "step": 654 + }, + { + "epoch": 
0.030495611890960728, + "grad_norm": 0.41018450531905204, + "learning_rate": 1.016449410304159e-05, + "loss": 7.7139, + "step": 655 + }, + { + "epoch": 0.030542170077053798, + "grad_norm": 0.47425544472226944, + "learning_rate": 1.0180012414649287e-05, + "loss": 7.7977, + "step": 656 + }, + { + "epoch": 0.030588728263146868, + "grad_norm": 0.3888803942438728, + "learning_rate": 1.0195530726256984e-05, + "loss": 7.7765, + "step": 657 + }, + { + "epoch": 0.030635286449239938, + "grad_norm": 0.45318726504218526, + "learning_rate": 1.0211049037864682e-05, + "loss": 7.7378, + "step": 658 + }, + { + "epoch": 0.030681844635333008, + "grad_norm": 0.3765064342780437, + "learning_rate": 1.0226567349472378e-05, + "loss": 7.7488, + "step": 659 + }, + { + "epoch": 0.03072840282142608, + "grad_norm": 0.39832844985678945, + "learning_rate": 1.0242085661080076e-05, + "loss": 7.7716, + "step": 660 + }, + { + "epoch": 0.03077496100751915, + "grad_norm": 0.445894162763918, + "learning_rate": 1.0257603972687772e-05, + "loss": 7.7167, + "step": 661 + }, + { + "epoch": 0.030821519193612215, + "grad_norm": 0.38237320692637466, + "learning_rate": 1.0273122284295469e-05, + "loss": 7.7729, + "step": 662 + }, + { + "epoch": 0.030868077379705285, + "grad_norm": 0.4748809120472464, + "learning_rate": 1.0288640595903167e-05, + "loss": 7.7768, + "step": 663 + }, + { + "epoch": 0.030914635565798355, + "grad_norm": 0.3507706408760138, + "learning_rate": 1.0304158907510863e-05, + "loss": 7.599, + "step": 664 + }, + { + "epoch": 0.030961193751891426, + "grad_norm": 0.46949032185815825, + "learning_rate": 1.0319677219118561e-05, + "loss": 7.7002, + "step": 665 + }, + { + "epoch": 0.031007751937984496, + "grad_norm": 0.38357789016242366, + "learning_rate": 1.0335195530726257e-05, + "loss": 7.743, + "step": 666 + }, + { + "epoch": 0.031054310124077566, + "grad_norm": 0.3691207770765352, + "learning_rate": 1.0350713842333955e-05, + "loss": 7.7075, + "step": 667 + }, + { + "epoch": 0.031100868310170636, + "grad_norm": 0.38673480961958523, + "learning_rate": 1.0366232153941652e-05, + "loss": 7.6919, + "step": 668 + }, + { + "epoch": 0.031147426496263706, + "grad_norm": 0.3356137596180817, + "learning_rate": 1.038175046554935e-05, + "loss": 7.6969, + "step": 669 + }, + { + "epoch": 0.031193984682356776, + "grad_norm": 0.3860415675692152, + "learning_rate": 1.0397268777157046e-05, + "loss": 7.6272, + "step": 670 + }, + { + "epoch": 0.031240542868449846, + "grad_norm": 0.42196982150512513, + "learning_rate": 1.0412787088764742e-05, + "loss": 7.7043, + "step": 671 + }, + { + "epoch": 0.031287101054542917, + "grad_norm": 0.3435743728724822, + "learning_rate": 1.042830540037244e-05, + "loss": 7.7163, + "step": 672 + }, + { + "epoch": 0.03133365924063598, + "grad_norm": 0.3946695582555209, + "learning_rate": 1.0443823711980137e-05, + "loss": 7.578, + "step": 673 + }, + { + "epoch": 0.03138021742672906, + "grad_norm": 0.41833437955460184, + "learning_rate": 1.0459342023587835e-05, + "loss": 7.5712, + "step": 674 + }, + { + "epoch": 0.03142677561282212, + "grad_norm": 0.3449528722395783, + "learning_rate": 1.0474860335195531e-05, + "loss": 7.7683, + "step": 675 + }, + { + "epoch": 0.0314733337989152, + "grad_norm": 0.4430566456955111, + "learning_rate": 1.0490378646803229e-05, + "loss": 7.6681, + "step": 676 + }, + { + "epoch": 0.031519891985008264, + "grad_norm": 0.34204730208971135, + "learning_rate": 1.0505896958410925e-05, + "loss": 7.6828, + "step": 677 + }, + { + "epoch": 0.03156645017110134, + "grad_norm": 0.41022752636969606, + 
"learning_rate": 1.0521415270018623e-05, + "loss": 7.6726, + "step": 678 + }, + { + "epoch": 0.031613008357194404, + "grad_norm": 0.4329898475891176, + "learning_rate": 1.053693358162632e-05, + "loss": 7.6416, + "step": 679 + }, + { + "epoch": 0.03165956654328747, + "grad_norm": 0.4074095072450425, + "learning_rate": 1.0552451893234016e-05, + "loss": 7.7094, + "step": 680 + }, + { + "epoch": 0.031706124729380544, + "grad_norm": 0.4082380924964641, + "learning_rate": 1.0567970204841714e-05, + "loss": 7.6467, + "step": 681 + }, + { + "epoch": 0.03175268291547361, + "grad_norm": 0.3835506151629161, + "learning_rate": 1.058348851644941e-05, + "loss": 7.6861, + "step": 682 + }, + { + "epoch": 0.031799241101566685, + "grad_norm": 0.41881529712832677, + "learning_rate": 1.0599006828057108e-05, + "loss": 7.7049, + "step": 683 + }, + { + "epoch": 0.03184579928765975, + "grad_norm": 0.370922276914184, + "learning_rate": 1.0614525139664804e-05, + "loss": 7.6232, + "step": 684 + }, + { + "epoch": 0.031892357473752825, + "grad_norm": 0.44605313011820985, + "learning_rate": 1.0630043451272502e-05, + "loss": 7.5962, + "step": 685 + }, + { + "epoch": 0.03193891565984589, + "grad_norm": 0.32485482346034666, + "learning_rate": 1.0645561762880199e-05, + "loss": 7.6396, + "step": 686 + }, + { + "epoch": 0.031985473845938965, + "grad_norm": 0.5104675358712191, + "learning_rate": 1.0661080074487897e-05, + "loss": 7.689, + "step": 687 + }, + { + "epoch": 0.03203203203203203, + "grad_norm": 0.41657390617420786, + "learning_rate": 1.0676598386095593e-05, + "loss": 7.6817, + "step": 688 + }, + { + "epoch": 0.0320785902181251, + "grad_norm": 0.3436280512663696, + "learning_rate": 1.0692116697703291e-05, + "loss": 7.6732, + "step": 689 + }, + { + "epoch": 0.03212514840421817, + "grad_norm": 0.4348178917803161, + "learning_rate": 1.0707635009310987e-05, + "loss": 7.5987, + "step": 690 + }, + { + "epoch": 0.03217170659031124, + "grad_norm": 0.3308317710397928, + "learning_rate": 1.0723153320918684e-05, + "loss": 7.6795, + "step": 691 + }, + { + "epoch": 0.03221826477640431, + "grad_norm": 0.38136045071577057, + "learning_rate": 1.0738671632526382e-05, + "loss": 7.5336, + "step": 692 + }, + { + "epoch": 0.03226482296249738, + "grad_norm": 0.3615230864760535, + "learning_rate": 1.0754189944134078e-05, + "loss": 7.6293, + "step": 693 + }, + { + "epoch": 0.03231138114859045, + "grad_norm": 0.3905762946517811, + "learning_rate": 1.0769708255741776e-05, + "loss": 7.5911, + "step": 694 + }, + { + "epoch": 0.03235793933468352, + "grad_norm": 0.3549017442332701, + "learning_rate": 1.0785226567349472e-05, + "loss": 7.6159, + "step": 695 + }, + { + "epoch": 0.03240449752077659, + "grad_norm": 0.3557840799232559, + "learning_rate": 1.080074487895717e-05, + "loss": 7.5511, + "step": 696 + }, + { + "epoch": 0.03245105570686966, + "grad_norm": 0.4364740666772273, + "learning_rate": 1.0816263190564867e-05, + "loss": 7.6831, + "step": 697 + }, + { + "epoch": 0.03249761389296273, + "grad_norm": 0.3363374209218526, + "learning_rate": 1.0831781502172565e-05, + "loss": 7.5443, + "step": 698 + }, + { + "epoch": 0.0325441720790558, + "grad_norm": 0.36227758979988905, + "learning_rate": 1.0847299813780261e-05, + "loss": 7.6471, + "step": 699 + }, + { + "epoch": 0.032590730265148866, + "grad_norm": 0.37854185249390054, + "learning_rate": 1.0862818125387957e-05, + "loss": 7.5915, + "step": 700 + }, + { + "epoch": 0.03263728845124194, + "grad_norm": 0.38653313898832564, + "learning_rate": 1.0878336436995655e-05, + "loss": 7.5652, + "step": 701 + 
}, + { + "epoch": 0.03268384663733501, + "grad_norm": 0.3456152274080095, + "learning_rate": 1.0893854748603351e-05, + "loss": 7.4984, + "step": 702 + }, + { + "epoch": 0.03273040482342808, + "grad_norm": 0.342444114525436, + "learning_rate": 1.090937306021105e-05, + "loss": 7.5147, + "step": 703 + }, + { + "epoch": 0.03277696300952115, + "grad_norm": 0.3547302429286058, + "learning_rate": 1.0924891371818746e-05, + "loss": 7.5633, + "step": 704 + }, + { + "epoch": 0.03282352119561422, + "grad_norm": 0.4130743427811654, + "learning_rate": 1.0940409683426444e-05, + "loss": 7.624, + "step": 705 + }, + { + "epoch": 0.03287007938170729, + "grad_norm": 0.3630055503103083, + "learning_rate": 1.095592799503414e-05, + "loss": 7.6856, + "step": 706 + }, + { + "epoch": 0.03291663756780036, + "grad_norm": 0.4307330679997323, + "learning_rate": 1.0971446306641838e-05, + "loss": 7.6773, + "step": 707 + }, + { + "epoch": 0.03296319575389343, + "grad_norm": 0.42204874469682524, + "learning_rate": 1.0986964618249534e-05, + "loss": 7.4966, + "step": 708 + }, + { + "epoch": 0.0330097539399865, + "grad_norm": 0.3529348520423759, + "learning_rate": 1.100248292985723e-05, + "loss": 7.5387, + "step": 709 + }, + { + "epoch": 0.03305631212607957, + "grad_norm": 0.39151143800614513, + "learning_rate": 1.1018001241464929e-05, + "loss": 7.4764, + "step": 710 + }, + { + "epoch": 0.033102870312172634, + "grad_norm": 0.354211691662233, + "learning_rate": 1.1033519553072625e-05, + "loss": 7.5701, + "step": 711 + }, + { + "epoch": 0.03314942849826571, + "grad_norm": 0.3747867239225377, + "learning_rate": 1.1049037864680323e-05, + "loss": 7.5219, + "step": 712 + }, + { + "epoch": 0.033195986684358775, + "grad_norm": 0.32942232571704644, + "learning_rate": 1.106455617628802e-05, + "loss": 7.6681, + "step": 713 + }, + { + "epoch": 0.03324254487045185, + "grad_norm": 0.3225237474479856, + "learning_rate": 1.1080074487895717e-05, + "loss": 7.6105, + "step": 714 + }, + { + "epoch": 0.033289103056544915, + "grad_norm": 0.38090873405309034, + "learning_rate": 1.1095592799503414e-05, + "loss": 7.5812, + "step": 715 + }, + { + "epoch": 0.03333566124263799, + "grad_norm": 0.34851920527673086, + "learning_rate": 1.1111111111111112e-05, + "loss": 7.6123, + "step": 716 + }, + { + "epoch": 0.033382219428731055, + "grad_norm": 0.39167748537256736, + "learning_rate": 1.1126629422718808e-05, + "loss": 7.6062, + "step": 717 + }, + { + "epoch": 0.03342877761482413, + "grad_norm": 0.360851345992819, + "learning_rate": 1.1142147734326506e-05, + "loss": 7.5994, + "step": 718 + }, + { + "epoch": 0.033475335800917196, + "grad_norm": 0.41172419736435795, + "learning_rate": 1.1157666045934202e-05, + "loss": 7.4795, + "step": 719 + }, + { + "epoch": 0.03352189398701027, + "grad_norm": 0.3490558213174982, + "learning_rate": 1.1173184357541899e-05, + "loss": 7.46, + "step": 720 + }, + { + "epoch": 0.033568452173103336, + "grad_norm": 0.32949689308322916, + "learning_rate": 1.1188702669149598e-05, + "loss": 7.5753, + "step": 721 + }, + { + "epoch": 0.0336150103591964, + "grad_norm": 0.38376246312428003, + "learning_rate": 1.1204220980757295e-05, + "loss": 7.5189, + "step": 722 + }, + { + "epoch": 0.033661568545289476, + "grad_norm": 0.41291299626183736, + "learning_rate": 1.1219739292364993e-05, + "loss": 7.5427, + "step": 723 + }, + { + "epoch": 0.03370812673138254, + "grad_norm": 0.33597104273248973, + "learning_rate": 1.1235257603972689e-05, + "loss": 7.5941, + "step": 724 + }, + { + "epoch": 0.033754684917475616, + "grad_norm": 0.39945118150752973, 
+ "learning_rate": 1.1250775915580385e-05, + "loss": 7.505, + "step": 725 + }, + { + "epoch": 0.03380124310356868, + "grad_norm": 0.37222397463165, + "learning_rate": 1.1266294227188083e-05, + "loss": 7.4431, + "step": 726 + }, + { + "epoch": 0.03384780128966176, + "grad_norm": 0.3651240686552558, + "learning_rate": 1.128181253879578e-05, + "loss": 7.5015, + "step": 727 + }, + { + "epoch": 0.03389435947575482, + "grad_norm": 0.38237739638988877, + "learning_rate": 1.1297330850403477e-05, + "loss": 7.5434, + "step": 728 + }, + { + "epoch": 0.0339409176618479, + "grad_norm": 0.39295996176469056, + "learning_rate": 1.1312849162011174e-05, + "loss": 7.4416, + "step": 729 + }, + { + "epoch": 0.033987475847940964, + "grad_norm": 0.39774865083666194, + "learning_rate": 1.1328367473618872e-05, + "loss": 7.4853, + "step": 730 + }, + { + "epoch": 0.03403403403403404, + "grad_norm": 0.5558623667864665, + "learning_rate": 1.1343885785226568e-05, + "loss": 7.4917, + "step": 731 + }, + { + "epoch": 0.034080592220127104, + "grad_norm": 0.42994690800202545, + "learning_rate": 1.1359404096834266e-05, + "loss": 7.551, + "step": 732 + }, + { + "epoch": 0.03412715040622017, + "grad_norm": 0.42253750433311743, + "learning_rate": 1.1374922408441962e-05, + "loss": 7.647, + "step": 733 + }, + { + "epoch": 0.034173708592313244, + "grad_norm": 0.4465931021713608, + "learning_rate": 1.139044072004966e-05, + "loss": 7.4349, + "step": 734 + }, + { + "epoch": 0.03422026677840631, + "grad_norm": 0.39607205513331845, + "learning_rate": 1.1405959031657357e-05, + "loss": 7.4502, + "step": 735 + }, + { + "epoch": 0.034266824964499384, + "grad_norm": 0.43334020518474015, + "learning_rate": 1.1421477343265053e-05, + "loss": 7.4455, + "step": 736 + }, + { + "epoch": 0.03431338315059245, + "grad_norm": 0.5094540263665638, + "learning_rate": 1.1436995654872751e-05, + "loss": 7.5654, + "step": 737 + }, + { + "epoch": 0.034359941336685525, + "grad_norm": 0.43198635009326264, + "learning_rate": 1.1452513966480447e-05, + "loss": 7.4259, + "step": 738 + }, + { + "epoch": 0.03440649952277859, + "grad_norm": 0.3339219591912998, + "learning_rate": 1.1468032278088145e-05, + "loss": 7.4487, + "step": 739 + }, + { + "epoch": 0.034453057708871665, + "grad_norm": 0.5274064145199359, + "learning_rate": 1.1483550589695842e-05, + "loss": 7.4809, + "step": 740 + }, + { + "epoch": 0.03449961589496473, + "grad_norm": 0.4704002872891441, + "learning_rate": 1.149906890130354e-05, + "loss": 7.4748, + "step": 741 + }, + { + "epoch": 0.034546174081057805, + "grad_norm": 0.4153825003688203, + "learning_rate": 1.1514587212911236e-05, + "loss": 7.435, + "step": 742 + }, + { + "epoch": 0.03459273226715087, + "grad_norm": 0.5411157599085807, + "learning_rate": 1.1530105524518934e-05, + "loss": 7.5249, + "step": 743 + }, + { + "epoch": 0.03463929045324394, + "grad_norm": 0.4783832188528827, + "learning_rate": 1.154562383612663e-05, + "loss": 7.5344, + "step": 744 + }, + { + "epoch": 0.03468584863933701, + "grad_norm": 0.4125935505476577, + "learning_rate": 1.1561142147734327e-05, + "loss": 7.4871, + "step": 745 + }, + { + "epoch": 0.03473240682543008, + "grad_norm": 0.5677612881227598, + "learning_rate": 1.1576660459342025e-05, + "loss": 7.3958, + "step": 746 + }, + { + "epoch": 0.03477896501152315, + "grad_norm": 0.41227513966309026, + "learning_rate": 1.1592178770949721e-05, + "loss": 7.4276, + "step": 747 + }, + { + "epoch": 0.03482552319761622, + "grad_norm": 0.3675337870474722, + "learning_rate": 1.1607697082557419e-05, + "loss": 7.572, + "step": 748 + 
}, + { + "epoch": 0.03487208138370929, + "grad_norm": 0.48166925625985757, + "learning_rate": 1.1623215394165115e-05, + "loss": 7.3728, + "step": 749 + }, + { + "epoch": 0.03491863956980236, + "grad_norm": 0.4841833307605909, + "learning_rate": 1.1638733705772813e-05, + "loss": 7.3414, + "step": 750 + }, + { + "epoch": 0.03496519775589543, + "grad_norm": 0.4305519802668814, + "learning_rate": 1.165425201738051e-05, + "loss": 7.3828, + "step": 751 + }, + { + "epoch": 0.0350117559419885, + "grad_norm": 0.4601428524640583, + "learning_rate": 1.1669770328988208e-05, + "loss": 7.3843, + "step": 752 + }, + { + "epoch": 0.03505831412808157, + "grad_norm": 0.5846898161791247, + "learning_rate": 1.1685288640595904e-05, + "loss": 7.4851, + "step": 753 + }, + { + "epoch": 0.03510487231417464, + "grad_norm": 0.4090817911416376, + "learning_rate": 1.17008069522036e-05, + "loss": 7.3382, + "step": 754 + }, + { + "epoch": 0.03515143050026771, + "grad_norm": 0.5605333290427676, + "learning_rate": 1.1716325263811298e-05, + "loss": 7.4136, + "step": 755 + }, + { + "epoch": 0.03519798868636078, + "grad_norm": 0.5815452585347526, + "learning_rate": 1.1731843575418994e-05, + "loss": 7.4803, + "step": 756 + }, + { + "epoch": 0.03524454687245385, + "grad_norm": 0.41520449109036023, + "learning_rate": 1.1747361887026692e-05, + "loss": 7.3821, + "step": 757 + }, + { + "epoch": 0.03529110505854692, + "grad_norm": 0.5757270332472689, + "learning_rate": 1.1762880198634389e-05, + "loss": 7.4303, + "step": 758 + }, + { + "epoch": 0.03533766324463999, + "grad_norm": 0.539233635393083, + "learning_rate": 1.1778398510242087e-05, + "loss": 7.4387, + "step": 759 + }, + { + "epoch": 0.03538422143073306, + "grad_norm": 0.33346664987651753, + "learning_rate": 1.1793916821849783e-05, + "loss": 7.5421, + "step": 760 + }, + { + "epoch": 0.03543077961682613, + "grad_norm": 0.4714189153590612, + "learning_rate": 1.1809435133457481e-05, + "loss": 7.3851, + "step": 761 + }, + { + "epoch": 0.0354773378029192, + "grad_norm": 0.35250828906474163, + "learning_rate": 1.1824953445065177e-05, + "loss": 7.4683, + "step": 762 + }, + { + "epoch": 0.03552389598901227, + "grad_norm": 0.3891351018210691, + "learning_rate": 1.1840471756672875e-05, + "loss": 7.504, + "step": 763 + }, + { + "epoch": 0.03557045417510534, + "grad_norm": 0.3652508590781559, + "learning_rate": 1.1855990068280572e-05, + "loss": 7.5244, + "step": 764 + }, + { + "epoch": 0.03561701236119841, + "grad_norm": 0.3618428912963988, + "learning_rate": 1.1871508379888268e-05, + "loss": 7.3318, + "step": 765 + }, + { + "epoch": 0.035663570547291475, + "grad_norm": 0.459716639435288, + "learning_rate": 1.1887026691495966e-05, + "loss": 7.4112, + "step": 766 + }, + { + "epoch": 0.03571012873338455, + "grad_norm": 0.44045390076958896, + "learning_rate": 1.1902545003103662e-05, + "loss": 7.3392, + "step": 767 + }, + { + "epoch": 0.035756686919477615, + "grad_norm": 0.36646578650912437, + "learning_rate": 1.191806331471136e-05, + "loss": 7.4695, + "step": 768 + }, + { + "epoch": 0.03580324510557069, + "grad_norm": 0.42504426042482957, + "learning_rate": 1.1933581626319057e-05, + "loss": 7.4686, + "step": 769 + }, + { + "epoch": 0.035849803291663755, + "grad_norm": 0.5213347582442487, + "learning_rate": 1.1949099937926755e-05, + "loss": 7.4402, + "step": 770 + }, + { + "epoch": 0.03589636147775683, + "grad_norm": 0.4248181369979696, + "learning_rate": 1.1964618249534451e-05, + "loss": 7.4621, + "step": 771 + }, + { + "epoch": 0.035942919663849895, + "grad_norm": 0.3492296310402004, + 
"learning_rate": 1.1980136561142149e-05, + "loss": 7.3844, + "step": 772 + }, + { + "epoch": 0.03598947784994297, + "grad_norm": 0.342657141302458, + "learning_rate": 1.1995654872749845e-05, + "loss": 7.3393, + "step": 773 + }, + { + "epoch": 0.036036036036036036, + "grad_norm": 0.36908574456226584, + "learning_rate": 1.2011173184357542e-05, + "loss": 7.3571, + "step": 774 + }, + { + "epoch": 0.0360825942221291, + "grad_norm": 0.39637736419980346, + "learning_rate": 1.202669149596524e-05, + "loss": 7.3249, + "step": 775 + }, + { + "epoch": 0.036129152408222176, + "grad_norm": 0.4039710232746576, + "learning_rate": 1.2042209807572936e-05, + "loss": 7.3051, + "step": 776 + }, + { + "epoch": 0.03617571059431524, + "grad_norm": 0.336512397330284, + "learning_rate": 1.2057728119180634e-05, + "loss": 7.4411, + "step": 777 + }, + { + "epoch": 0.036222268780408316, + "grad_norm": 0.38541440580035974, + "learning_rate": 1.207324643078833e-05, + "loss": 7.3867, + "step": 778 + }, + { + "epoch": 0.03626882696650138, + "grad_norm": 0.38844476430080355, + "learning_rate": 1.2088764742396028e-05, + "loss": 7.3529, + "step": 779 + }, + { + "epoch": 0.03631538515259446, + "grad_norm": 0.3699377806627978, + "learning_rate": 1.2104283054003724e-05, + "loss": 7.3608, + "step": 780 + }, + { + "epoch": 0.03636194333868752, + "grad_norm": 0.32814485935277493, + "learning_rate": 1.2119801365611422e-05, + "loss": 7.4171, + "step": 781 + }, + { + "epoch": 0.0364085015247806, + "grad_norm": 0.41487024877512546, + "learning_rate": 1.2135319677219119e-05, + "loss": 7.4315, + "step": 782 + }, + { + "epoch": 0.036455059710873663, + "grad_norm": 0.482707165751038, + "learning_rate": 1.2150837988826817e-05, + "loss": 7.2684, + "step": 783 + }, + { + "epoch": 0.03650161789696674, + "grad_norm": 0.38982728906615, + "learning_rate": 1.2166356300434513e-05, + "loss": 7.402, + "step": 784 + }, + { + "epoch": 0.036548176083059804, + "grad_norm": 0.4294197902164815, + "learning_rate": 1.218187461204221e-05, + "loss": 7.2685, + "step": 785 + }, + { + "epoch": 0.03659473426915287, + "grad_norm": 0.6929236543525911, + "learning_rate": 1.2197392923649907e-05, + "loss": 7.2738, + "step": 786 + }, + { + "epoch": 0.036641292455245944, + "grad_norm": 0.5281445012278841, + "learning_rate": 1.2212911235257604e-05, + "loss": 7.305, + "step": 787 + }, + { + "epoch": 0.03668785064133901, + "grad_norm": 0.384699775587924, + "learning_rate": 1.2228429546865302e-05, + "loss": 7.2488, + "step": 788 + }, + { + "epoch": 0.036734408827432084, + "grad_norm": 0.5286093101380646, + "learning_rate": 1.2243947858472998e-05, + "loss": 7.2794, + "step": 789 + }, + { + "epoch": 0.03678096701352515, + "grad_norm": 0.4701841206772192, + "learning_rate": 1.2259466170080696e-05, + "loss": 7.2952, + "step": 790 + }, + { + "epoch": 0.036827525199618225, + "grad_norm": 0.40910167574986195, + "learning_rate": 1.2274984481688392e-05, + "loss": 7.4928, + "step": 791 + }, + { + "epoch": 0.03687408338571129, + "grad_norm": 0.5391405859344903, + "learning_rate": 1.229050279329609e-05, + "loss": 7.284, + "step": 792 + }, + { + "epoch": 0.036920641571804365, + "grad_norm": 0.46875922961064476, + "learning_rate": 1.2306021104903787e-05, + "loss": 7.4083, + "step": 793 + }, + { + "epoch": 0.03696719975789743, + "grad_norm": 0.46638702201591803, + "learning_rate": 1.2321539416511483e-05, + "loss": 7.3294, + "step": 794 + }, + { + "epoch": 0.037013757943990505, + "grad_norm": 0.42749443118973157, + "learning_rate": 1.2337057728119181e-05, + "loss": 7.3469, + "step": 795 + 
}, + { + "epoch": 0.03706031613008357, + "grad_norm": 0.5355959868212228, + "learning_rate": 1.2352576039726877e-05, + "loss": 7.3679, + "step": 796 + }, + { + "epoch": 0.03710687431617664, + "grad_norm": 0.4535804276844775, + "learning_rate": 1.2368094351334575e-05, + "loss": 7.3302, + "step": 797 + }, + { + "epoch": 0.03715343250226971, + "grad_norm": 0.5166873668909676, + "learning_rate": 1.2383612662942272e-05, + "loss": 7.1325, + "step": 798 + }, + { + "epoch": 0.03719999068836278, + "grad_norm": 0.47998283742478354, + "learning_rate": 1.239913097454997e-05, + "loss": 7.2472, + "step": 799 + }, + { + "epoch": 0.03724654887445585, + "grad_norm": 0.41234957867881894, + "learning_rate": 1.2414649286157666e-05, + "loss": 7.2864, + "step": 800 + }, + { + "epoch": 0.03729310706054892, + "grad_norm": 0.482976902849979, + "learning_rate": 1.2430167597765364e-05, + "loss": 7.4038, + "step": 801 + }, + { + "epoch": 0.03733966524664199, + "grad_norm": 0.794628776420594, + "learning_rate": 1.244568590937306e-05, + "loss": 7.2136, + "step": 802 + }, + { + "epoch": 0.03738622343273506, + "grad_norm": 0.4622698999505436, + "learning_rate": 1.2461204220980756e-05, + "loss": 7.2299, + "step": 803 + }, + { + "epoch": 0.03743278161882813, + "grad_norm": 0.48965760691814963, + "learning_rate": 1.2476722532588454e-05, + "loss": 7.2627, + "step": 804 + }, + { + "epoch": 0.0374793398049212, + "grad_norm": 0.3982021821604155, + "learning_rate": 1.249224084419615e-05, + "loss": 7.1239, + "step": 805 + }, + { + "epoch": 0.03752589799101427, + "grad_norm": 0.5765001623994744, + "learning_rate": 1.250775915580385e-05, + "loss": 7.2406, + "step": 806 + }, + { + "epoch": 0.03757245617710734, + "grad_norm": 0.4956527245317239, + "learning_rate": 1.2523277467411545e-05, + "loss": 7.3209, + "step": 807 + }, + { + "epoch": 0.037619014363200406, + "grad_norm": 0.44960532536266434, + "learning_rate": 1.2538795779019245e-05, + "loss": 7.1945, + "step": 808 + }, + { + "epoch": 0.03766557254929348, + "grad_norm": 0.40480477066775616, + "learning_rate": 1.255431409062694e-05, + "loss": 7.2816, + "step": 809 + }, + { + "epoch": 0.03771213073538655, + "grad_norm": 0.4857789638340877, + "learning_rate": 1.2569832402234637e-05, + "loss": 7.2338, + "step": 810 + }, + { + "epoch": 0.03775868892147962, + "grad_norm": 0.37924369698080623, + "learning_rate": 1.2585350713842334e-05, + "loss": 7.3116, + "step": 811 + }, + { + "epoch": 0.03780524710757269, + "grad_norm": 0.5073355351634521, + "learning_rate": 1.2600869025450032e-05, + "loss": 7.2334, + "step": 812 + }, + { + "epoch": 0.03785180529366576, + "grad_norm": 0.48418414026638024, + "learning_rate": 1.2616387337057728e-05, + "loss": 7.0868, + "step": 813 + }, + { + "epoch": 0.03789836347975883, + "grad_norm": 0.5224639951843418, + "learning_rate": 1.2631905648665426e-05, + "loss": 7.2433, + "step": 814 + }, + { + "epoch": 0.0379449216658519, + "grad_norm": 0.40628354123186455, + "learning_rate": 1.2647423960273122e-05, + "loss": 7.2123, + "step": 815 + }, + { + "epoch": 0.03799147985194497, + "grad_norm": 0.4724892888227922, + "learning_rate": 1.266294227188082e-05, + "loss": 7.2388, + "step": 816 + }, + { + "epoch": 0.03803803803803804, + "grad_norm": 0.5718857769459931, + "learning_rate": 1.2678460583488517e-05, + "loss": 7.2264, + "step": 817 + }, + { + "epoch": 0.03808459622413111, + "grad_norm": 0.5197907360573912, + "learning_rate": 1.2693978895096215e-05, + "loss": 7.3427, + "step": 818 + }, + { + "epoch": 0.038131154410224175, + "grad_norm": 0.5545654980898654, + 
"learning_rate": 1.2709497206703911e-05, + "loss": 7.1794, + "step": 819 + }, + { + "epoch": 0.03817771259631725, + "grad_norm": 0.44001204448953773, + "learning_rate": 1.2725015518311609e-05, + "loss": 7.2698, + "step": 820 + }, + { + "epoch": 0.038224270782410315, + "grad_norm": 0.5216939227442718, + "learning_rate": 1.2740533829919305e-05, + "loss": 7.3337, + "step": 821 + }, + { + "epoch": 0.03827082896850339, + "grad_norm": 0.44856891827090484, + "learning_rate": 1.2756052141527003e-05, + "loss": 7.1279, + "step": 822 + }, + { + "epoch": 0.038317387154596455, + "grad_norm": 0.5389745976216593, + "learning_rate": 1.2771570453134698e-05, + "loss": 7.1684, + "step": 823 + }, + { + "epoch": 0.03836394534068953, + "grad_norm": 0.48367685203431643, + "learning_rate": 1.2787088764742398e-05, + "loss": 7.2675, + "step": 824 + }, + { + "epoch": 0.038410503526782595, + "grad_norm": 0.4627042875351097, + "learning_rate": 1.2802607076350092e-05, + "loss": 7.2855, + "step": 825 + }, + { + "epoch": 0.03845706171287567, + "grad_norm": 0.5211895623997456, + "learning_rate": 1.2818125387957792e-05, + "loss": 7.2024, + "step": 826 + }, + { + "epoch": 0.038503619898968736, + "grad_norm": 0.4552738610296671, + "learning_rate": 1.2833643699565486e-05, + "loss": 7.1414, + "step": 827 + }, + { + "epoch": 0.03855017808506181, + "grad_norm": 0.41890621723000626, + "learning_rate": 1.2849162011173184e-05, + "loss": 7.2687, + "step": 828 + }, + { + "epoch": 0.038596736271154876, + "grad_norm": 0.6043252715009854, + "learning_rate": 1.286468032278088e-05, + "loss": 7.1774, + "step": 829 + }, + { + "epoch": 0.03864329445724794, + "grad_norm": 0.5229972274070174, + "learning_rate": 1.2880198634388579e-05, + "loss": 7.2238, + "step": 830 + }, + { + "epoch": 0.038689852643341016, + "grad_norm": 0.4447581227329473, + "learning_rate": 1.2895716945996275e-05, + "loss": 7.2075, + "step": 831 + }, + { + "epoch": 0.03873641082943408, + "grad_norm": 0.6882810753984989, + "learning_rate": 1.2911235257603973e-05, + "loss": 7.1475, + "step": 832 + }, + { + "epoch": 0.038782969015527156, + "grad_norm": 0.7321131531880091, + "learning_rate": 1.292675356921167e-05, + "loss": 7.2412, + "step": 833 + }, + { + "epoch": 0.03882952720162022, + "grad_norm": 0.383881605937177, + "learning_rate": 1.2942271880819367e-05, + "loss": 7.3921, + "step": 834 + }, + { + "epoch": 0.0388760853877133, + "grad_norm": 0.6365060380142951, + "learning_rate": 1.2957790192427064e-05, + "loss": 7.067, + "step": 835 + }, + { + "epoch": 0.03892264357380636, + "grad_norm": 0.6900114649595797, + "learning_rate": 1.2973308504034762e-05, + "loss": 7.1737, + "step": 836 + }, + { + "epoch": 0.03896920175989944, + "grad_norm": 0.4637687877458271, + "learning_rate": 1.2988826815642458e-05, + "loss": 7.1777, + "step": 837 + }, + { + "epoch": 0.039015759945992504, + "grad_norm": 0.5242659027584293, + "learning_rate": 1.3004345127250156e-05, + "loss": 7.3036, + "step": 838 + }, + { + "epoch": 0.03906231813208558, + "grad_norm": 0.42528661174301036, + "learning_rate": 1.3019863438857852e-05, + "loss": 7.1261, + "step": 839 + }, + { + "epoch": 0.039108876318178644, + "grad_norm": 0.42208892121787744, + "learning_rate": 1.303538175046555e-05, + "loss": 7.0312, + "step": 840 + }, + { + "epoch": 0.03915543450427171, + "grad_norm": 0.4344407791332236, + "learning_rate": 1.3050900062073247e-05, + "loss": 7.1618, + "step": 841 + }, + { + "epoch": 0.039201992690364784, + "grad_norm": 0.4500210577979816, + "learning_rate": 1.3066418373680945e-05, + "loss": 7.272, + "step": 842 
+ }, + { + "epoch": 0.03924855087645785, + "grad_norm": 0.423208930283336, + "learning_rate": 1.308193668528864e-05, + "loss": 7.1522, + "step": 843 + }, + { + "epoch": 0.039295109062550924, + "grad_norm": 0.42920543708856634, + "learning_rate": 1.3097454996896339e-05, + "loss": 7.0694, + "step": 844 + }, + { + "epoch": 0.03934166724864399, + "grad_norm": 0.3898381626903198, + "learning_rate": 1.3112973308504034e-05, + "loss": 7.1534, + "step": 845 + }, + { + "epoch": 0.039388225434737065, + "grad_norm": 0.4329306630478211, + "learning_rate": 1.3128491620111733e-05, + "loss": 7.1233, + "step": 846 + }, + { + "epoch": 0.03943478362083013, + "grad_norm": 0.48343835435039473, + "learning_rate": 1.3144009931719428e-05, + "loss": 7.2407, + "step": 847 + }, + { + "epoch": 0.039481341806923205, + "grad_norm": 0.3673843876281614, + "learning_rate": 1.3159528243327126e-05, + "loss": 7.0954, + "step": 848 + }, + { + "epoch": 0.03952789999301627, + "grad_norm": 0.4608009764579332, + "learning_rate": 1.3175046554934826e-05, + "loss": 7.1514, + "step": 849 + }, + { + "epoch": 0.039574458179109345, + "grad_norm": 0.515000996755759, + "learning_rate": 1.319056486654252e-05, + "loss": 7.1419, + "step": 850 + }, + { + "epoch": 0.03962101636520241, + "grad_norm": 0.47533311054031324, + "learning_rate": 1.320608317815022e-05, + "loss": 7.0841, + "step": 851 + }, + { + "epoch": 0.03966757455129548, + "grad_norm": 0.33300175785416863, + "learning_rate": 1.3221601489757914e-05, + "loss": 7.1752, + "step": 852 + }, + { + "epoch": 0.03971413273738855, + "grad_norm": 0.44513956486538936, + "learning_rate": 1.3237119801365614e-05, + "loss": 7.193, + "step": 853 + }, + { + "epoch": 0.03976069092348162, + "grad_norm": 0.37724540639617643, + "learning_rate": 1.3252638112973309e-05, + "loss": 7.1866, + "step": 854 + }, + { + "epoch": 0.03980724910957469, + "grad_norm": 0.39083734718251345, + "learning_rate": 1.3268156424581007e-05, + "loss": 7.176, + "step": 855 + }, + { + "epoch": 0.03985380729566776, + "grad_norm": 0.4821339240400701, + "learning_rate": 1.3283674736188703e-05, + "loss": 7.0938, + "step": 856 + }, + { + "epoch": 0.03990036548176083, + "grad_norm": 0.4582057484750414, + "learning_rate": 1.3299193047796401e-05, + "loss": 7.0462, + "step": 857 + }, + { + "epoch": 0.0399469236678539, + "grad_norm": 0.42746128014172674, + "learning_rate": 1.3314711359404097e-05, + "loss": 7.1739, + "step": 858 + }, + { + "epoch": 0.03999348185394697, + "grad_norm": 0.45215752193046826, + "learning_rate": 1.3330229671011795e-05, + "loss": 7.0992, + "step": 859 + }, + { + "epoch": 0.04004004004004004, + "grad_norm": 0.4766550575867761, + "learning_rate": 1.3345747982619492e-05, + "loss": 7.1502, + "step": 860 + }, + { + "epoch": 0.04008659822613311, + "grad_norm": 0.3900118383764763, + "learning_rate": 1.336126629422719e-05, + "loss": 7.1067, + "step": 861 + }, + { + "epoch": 0.04013315641222618, + "grad_norm": 0.43206308600093873, + "learning_rate": 1.3376784605834886e-05, + "loss": 6.9948, + "step": 862 + }, + { + "epoch": 0.04017971459831925, + "grad_norm": 0.406747650923422, + "learning_rate": 1.3392302917442584e-05, + "loss": 7.2069, + "step": 863 + }, + { + "epoch": 0.04022627278441232, + "grad_norm": 0.37933739303396485, + "learning_rate": 1.340782122905028e-05, + "loss": 7.0836, + "step": 864 + }, + { + "epoch": 0.04027283097050539, + "grad_norm": 0.4492167487839153, + "learning_rate": 1.3423339540657978e-05, + "loss": 7.0278, + "step": 865 + }, + { + "epoch": 0.04031938915659846, + "grad_norm": 0.38648471151099506, 
+ "learning_rate": 1.3438857852265675e-05, + "loss": 7.0813, + "step": 866 + }, + { + "epoch": 0.04036594734269153, + "grad_norm": 0.46548965298435174, + "learning_rate": 1.3454376163873373e-05, + "loss": 6.9548, + "step": 867 + }, + { + "epoch": 0.0404125055287846, + "grad_norm": 0.639629050617404, + "learning_rate": 1.3469894475481067e-05, + "loss": 7.1877, + "step": 868 + }, + { + "epoch": 0.04045906371487767, + "grad_norm": 0.5353957073924523, + "learning_rate": 1.3485412787088767e-05, + "loss": 7.128, + "step": 869 + }, + { + "epoch": 0.04050562190097074, + "grad_norm": 0.4671767245455621, + "learning_rate": 1.3500931098696462e-05, + "loss": 7.0646, + "step": 870 + }, + { + "epoch": 0.04055218008706381, + "grad_norm": 0.6964725262867114, + "learning_rate": 1.3516449410304161e-05, + "loss": 7.0309, + "step": 871 + }, + { + "epoch": 0.040598738273156874, + "grad_norm": 0.7347701860739855, + "learning_rate": 1.3531967721911856e-05, + "loss": 7.1963, + "step": 872 + }, + { + "epoch": 0.04064529645924995, + "grad_norm": 0.5271086619180505, + "learning_rate": 1.3547486033519554e-05, + "loss": 7.0185, + "step": 873 + }, + { + "epoch": 0.040691854645343015, + "grad_norm": 0.6064509854419831, + "learning_rate": 1.356300434512725e-05, + "loss": 7.0954, + "step": 874 + }, + { + "epoch": 0.04073841283143609, + "grad_norm": 0.791801659526004, + "learning_rate": 1.3578522656734948e-05, + "loss": 7.0585, + "step": 875 + }, + { + "epoch": 0.040784971017529155, + "grad_norm": 0.596008531755191, + "learning_rate": 1.3594040968342644e-05, + "loss": 7.2178, + "step": 876 + }, + { + "epoch": 0.04083152920362223, + "grad_norm": 0.5597221987526212, + "learning_rate": 1.3609559279950342e-05, + "loss": 7.1216, + "step": 877 + }, + { + "epoch": 0.040878087389715295, + "grad_norm": 0.7141906061721119, + "learning_rate": 1.3625077591558039e-05, + "loss": 7.1423, + "step": 878 + }, + { + "epoch": 0.04092464557580837, + "grad_norm": 0.4385738254694144, + "learning_rate": 1.3640595903165737e-05, + "loss": 7.0497, + "step": 879 + }, + { + "epoch": 0.040971203761901435, + "grad_norm": 0.5718599339055526, + "learning_rate": 1.3656114214773433e-05, + "loss": 7.0549, + "step": 880 + }, + { + "epoch": 0.04101776194799451, + "grad_norm": 0.7664153020100509, + "learning_rate": 1.3671632526381131e-05, + "loss": 7.0201, + "step": 881 + }, + { + "epoch": 0.041064320134087576, + "grad_norm": 0.5641603554330761, + "learning_rate": 1.3687150837988827e-05, + "loss": 7.1006, + "step": 882 + }, + { + "epoch": 0.04111087832018064, + "grad_norm": 0.4678922743635806, + "learning_rate": 1.3702669149596525e-05, + "loss": 7.1151, + "step": 883 + }, + { + "epoch": 0.041157436506273716, + "grad_norm": 0.6414798109520335, + "learning_rate": 1.3718187461204222e-05, + "loss": 7.0719, + "step": 884 + }, + { + "epoch": 0.04120399469236678, + "grad_norm": 0.462151529210526, + "learning_rate": 1.373370577281192e-05, + "loss": 7.1096, + "step": 885 + }, + { + "epoch": 0.041250552878459856, + "grad_norm": 0.5284438528366341, + "learning_rate": 1.3749224084419616e-05, + "loss": 7.0465, + "step": 886 + }, + { + "epoch": 0.04129711106455292, + "grad_norm": 0.6287630184742776, + "learning_rate": 1.3764742396027314e-05, + "loss": 7.0598, + "step": 887 + }, + { + "epoch": 0.041343669250646, + "grad_norm": 0.5623033767848394, + "learning_rate": 1.3780260707635009e-05, + "loss": 7.1121, + "step": 888 + }, + { + "epoch": 0.04139022743673906, + "grad_norm": 0.4854883881750437, + "learning_rate": 1.3795779019242708e-05, + "loss": 7.0549, + "step": 889 + }, + 
{ + "epoch": 0.04143678562283214, + "grad_norm": 0.5156127560975878, + "learning_rate": 1.3811297330850403e-05, + "loss": 7.0734, + "step": 890 + }, + { + "epoch": 0.041483343808925204, + "grad_norm": 0.5319989615524575, + "learning_rate": 1.3826815642458103e-05, + "loss": 7.007, + "step": 891 + }, + { + "epoch": 0.04152990199501828, + "grad_norm": 0.4016023346974095, + "learning_rate": 1.3842333954065797e-05, + "loss": 7.0039, + "step": 892 + }, + { + "epoch": 0.041576460181111344, + "grad_norm": 0.5412088608370337, + "learning_rate": 1.3857852265673495e-05, + "loss": 7.0586, + "step": 893 + }, + { + "epoch": 0.04162301836720441, + "grad_norm": 0.3624354124663661, + "learning_rate": 1.3873370577281192e-05, + "loss": 7.0038, + "step": 894 + }, + { + "epoch": 0.041669576553297484, + "grad_norm": 0.5566808839300752, + "learning_rate": 1.388888888888889e-05, + "loss": 6.944, + "step": 895 + }, + { + "epoch": 0.04171613473939055, + "grad_norm": 0.4064435663177523, + "learning_rate": 1.3904407200496586e-05, + "loss": 7.1307, + "step": 896 + }, + { + "epoch": 0.041762692925483624, + "grad_norm": 0.44212899759453794, + "learning_rate": 1.3919925512104284e-05, + "loss": 7.0855, + "step": 897 + }, + { + "epoch": 0.04180925111157669, + "grad_norm": 0.4651898846785921, + "learning_rate": 1.393544382371198e-05, + "loss": 7.0448, + "step": 898 + }, + { + "epoch": 0.041855809297669765, + "grad_norm": 0.45940961051734225, + "learning_rate": 1.3950962135319678e-05, + "loss": 7.0244, + "step": 899 + }, + { + "epoch": 0.04190236748376283, + "grad_norm": 0.5314858117948555, + "learning_rate": 1.3966480446927374e-05, + "loss": 7.0072, + "step": 900 + }, + { + "epoch": 0.041948925669855905, + "grad_norm": 0.48910374421941105, + "learning_rate": 1.3981998758535072e-05, + "loss": 6.9784, + "step": 901 + }, + { + "epoch": 0.04199548385594897, + "grad_norm": 0.46354571862095006, + "learning_rate": 1.3997517070142769e-05, + "loss": 7.0945, + "step": 902 + }, + { + "epoch": 0.042042042042042045, + "grad_norm": 0.4258221531222613, + "learning_rate": 1.4013035381750467e-05, + "loss": 7.0774, + "step": 903 + }, + { + "epoch": 0.04208860022813511, + "grad_norm": 0.5314112920109654, + "learning_rate": 1.4028553693358163e-05, + "loss": 7.1022, + "step": 904 + }, + { + "epoch": 0.04213515841422818, + "grad_norm": 0.4921378014056849, + "learning_rate": 1.4044072004965861e-05, + "loss": 7.0127, + "step": 905 + }, + { + "epoch": 0.04218171660032125, + "grad_norm": 0.43852415258907856, + "learning_rate": 1.4059590316573556e-05, + "loss": 6.9894, + "step": 906 + }, + { + "epoch": 0.04222827478641432, + "grad_norm": 0.5305389923245433, + "learning_rate": 1.4075108628181255e-05, + "loss": 6.9528, + "step": 907 + }, + { + "epoch": 0.04227483297250739, + "grad_norm": 0.6235653640564265, + "learning_rate": 1.409062693978895e-05, + "loss": 6.8461, + "step": 908 + }, + { + "epoch": 0.04232139115860046, + "grad_norm": 0.5010264289185236, + "learning_rate": 1.410614525139665e-05, + "loss": 7.0362, + "step": 909 + }, + { + "epoch": 0.04236794934469353, + "grad_norm": 0.44860778243111293, + "learning_rate": 1.4121663563004344e-05, + "loss": 6.9777, + "step": 910 + }, + { + "epoch": 0.0424145075307866, + "grad_norm": 0.5959617405541866, + "learning_rate": 1.4137181874612044e-05, + "loss": 7.0339, + "step": 911 + }, + { + "epoch": 0.04246106571687967, + "grad_norm": 0.5384769119100249, + "learning_rate": 1.4152700186219739e-05, + "loss": 6.9532, + "step": 912 + }, + { + "epoch": 0.04250762390297274, + "grad_norm": 0.48226360686489006, + 
"learning_rate": 1.4168218497827437e-05, + "loss": 6.9587, + "step": 913 + }, + { + "epoch": 0.04255418208906581, + "grad_norm": 0.6233056051515639, + "learning_rate": 1.4183736809435133e-05, + "loss": 6.9186, + "step": 914 + }, + { + "epoch": 0.04260074027515888, + "grad_norm": 0.5730812735831701, + "learning_rate": 1.4199255121042831e-05, + "loss": 6.97, + "step": 915 + }, + { + "epoch": 0.042647298461251947, + "grad_norm": 0.6214643587656342, + "learning_rate": 1.4214773432650527e-05, + "loss": 6.9471, + "step": 916 + }, + { + "epoch": 0.04269385664734502, + "grad_norm": 0.5038865435487558, + "learning_rate": 1.4230291744258225e-05, + "loss": 7.0204, + "step": 917 + }, + { + "epoch": 0.04274041483343809, + "grad_norm": 0.5651514366278788, + "learning_rate": 1.4245810055865922e-05, + "loss": 6.8888, + "step": 918 + }, + { + "epoch": 0.04278697301953116, + "grad_norm": 0.5331396639844674, + "learning_rate": 1.426132836747362e-05, + "loss": 6.9955, + "step": 919 + }, + { + "epoch": 0.04283353120562423, + "grad_norm": 0.5079759608304412, + "learning_rate": 1.4276846679081316e-05, + "loss": 6.8958, + "step": 920 + }, + { + "epoch": 0.0428800893917173, + "grad_norm": 0.44898332740653946, + "learning_rate": 1.4292364990689014e-05, + "loss": 7.0996, + "step": 921 + }, + { + "epoch": 0.04292664757781037, + "grad_norm": 0.6250451684432826, + "learning_rate": 1.430788330229671e-05, + "loss": 6.878, + "step": 922 + }, + { + "epoch": 0.04297320576390344, + "grad_norm": 0.546113276624858, + "learning_rate": 1.4323401613904408e-05, + "loss": 6.9565, + "step": 923 + }, + { + "epoch": 0.04301976394999651, + "grad_norm": 0.6655645649263643, + "learning_rate": 1.4338919925512104e-05, + "loss": 6.9828, + "step": 924 + }, + { + "epoch": 0.04306632213608958, + "grad_norm": 0.78412927869446, + "learning_rate": 1.4354438237119802e-05, + "loss": 7.0374, + "step": 925 + }, + { + "epoch": 0.04311288032218265, + "grad_norm": 0.9390513986111247, + "learning_rate": 1.4369956548727497e-05, + "loss": 7.0269, + "step": 926 + }, + { + "epoch": 0.043159438508275715, + "grad_norm": 0.6976384850163523, + "learning_rate": 1.4385474860335197e-05, + "loss": 7.0024, + "step": 927 + }, + { + "epoch": 0.04320599669436879, + "grad_norm": 0.5103198982159768, + "learning_rate": 1.4400993171942891e-05, + "loss": 7.0198, + "step": 928 + }, + { + "epoch": 0.043252554880461855, + "grad_norm": 0.7523534375094859, + "learning_rate": 1.4416511483550591e-05, + "loss": 7.0393, + "step": 929 + }, + { + "epoch": 0.04329911306655493, + "grad_norm": 0.6705521287331973, + "learning_rate": 1.4432029795158286e-05, + "loss": 7.0065, + "step": 930 + }, + { + "epoch": 0.043345671252647995, + "grad_norm": 0.51105427212142, + "learning_rate": 1.4447548106765985e-05, + "loss": 6.9293, + "step": 931 + }, + { + "epoch": 0.04339222943874107, + "grad_norm": 0.8404737462667512, + "learning_rate": 1.446306641837368e-05, + "loss": 6.9254, + "step": 932 + }, + { + "epoch": 0.043438787624834135, + "grad_norm": 0.5892552282385558, + "learning_rate": 1.4478584729981378e-05, + "loss": 6.8144, + "step": 933 + }, + { + "epoch": 0.04348534581092721, + "grad_norm": 0.5906496060361016, + "learning_rate": 1.4494103041589078e-05, + "loss": 6.9066, + "step": 934 + }, + { + "epoch": 0.043531903997020276, + "grad_norm": 0.7936381885466999, + "learning_rate": 1.4509621353196772e-05, + "loss": 6.9439, + "step": 935 + }, + { + "epoch": 0.04357846218311335, + "grad_norm": 0.5765713178264104, + "learning_rate": 1.4525139664804472e-05, + "loss": 6.9469, + "step": 936 + }, + { + 
"epoch": 0.043625020369206416, + "grad_norm": 0.642157297698964, + "learning_rate": 1.4540657976412167e-05, + "loss": 6.969, + "step": 937 + }, + { + "epoch": 0.04367157855529948, + "grad_norm": 0.4415926392753292, + "learning_rate": 1.4556176288019865e-05, + "loss": 6.8985, + "step": 938 + }, + { + "epoch": 0.043718136741392556, + "grad_norm": 0.8774167100734797, + "learning_rate": 1.4571694599627561e-05, + "loss": 6.8519, + "step": 939 + }, + { + "epoch": 0.04376469492748562, + "grad_norm": 0.7361217679262233, + "learning_rate": 1.4587212911235259e-05, + "loss": 6.8344, + "step": 940 + }, + { + "epoch": 0.043811253113578696, + "grad_norm": 0.5933622025840549, + "learning_rate": 1.4602731222842955e-05, + "loss": 7.0224, + "step": 941 + }, + { + "epoch": 0.04385781129967176, + "grad_norm": 0.7599072302171193, + "learning_rate": 1.4618249534450653e-05, + "loss": 6.9354, + "step": 942 + }, + { + "epoch": 0.04390436948576484, + "grad_norm": 0.6206232564869484, + "learning_rate": 1.463376784605835e-05, + "loss": 6.9039, + "step": 943 + }, + { + "epoch": 0.0439509276718579, + "grad_norm": 0.683688267257024, + "learning_rate": 1.4649286157666048e-05, + "loss": 6.9163, + "step": 944 + }, + { + "epoch": 0.04399748585795098, + "grad_norm": 0.5526940167167735, + "learning_rate": 1.4664804469273744e-05, + "loss": 6.8706, + "step": 945 + }, + { + "epoch": 0.044044044044044044, + "grad_norm": 0.6859618473759951, + "learning_rate": 1.4680322780881442e-05, + "loss": 6.9924, + "step": 946 + }, + { + "epoch": 0.04409060223013712, + "grad_norm": 0.5749908947150593, + "learning_rate": 1.4695841092489138e-05, + "loss": 6.7916, + "step": 947 + }, + { + "epoch": 0.044137160416230184, + "grad_norm": 0.6568132482520105, + "learning_rate": 1.4711359404096836e-05, + "loss": 6.9215, + "step": 948 + }, + { + "epoch": 0.04418371860232325, + "grad_norm": 0.5542801417886509, + "learning_rate": 1.4726877715704532e-05, + "loss": 6.8457, + "step": 949 + }, + { + "epoch": 0.044230276788416324, + "grad_norm": 0.6873455602661795, + "learning_rate": 1.474239602731223e-05, + "loss": 6.8885, + "step": 950 + }, + { + "epoch": 0.04427683497450939, + "grad_norm": 0.6111273578656531, + "learning_rate": 1.4757914338919925e-05, + "loss": 6.7533, + "step": 951 + }, + { + "epoch": 0.044323393160602464, + "grad_norm": 0.61301669655939, + "learning_rate": 1.4773432650527625e-05, + "loss": 6.8823, + "step": 952 + }, + { + "epoch": 0.04436995134669553, + "grad_norm": 0.6225015212896561, + "learning_rate": 1.478895096213532e-05, + "loss": 6.8227, + "step": 953 + }, + { + "epoch": 0.044416509532788605, + "grad_norm": 0.7841123228774871, + "learning_rate": 1.4804469273743019e-05, + "loss": 6.8867, + "step": 954 + }, + { + "epoch": 0.04446306771888167, + "grad_norm": 0.7295524989722996, + "learning_rate": 1.4819987585350714e-05, + "loss": 6.9793, + "step": 955 + }, + { + "epoch": 0.044509625904974745, + "grad_norm": 0.6306985980265205, + "learning_rate": 1.4835505896958413e-05, + "loss": 6.8168, + "step": 956 + }, + { + "epoch": 0.04455618409106781, + "grad_norm": 0.6709823640242045, + "learning_rate": 1.4851024208566108e-05, + "loss": 6.8658, + "step": 957 + }, + { + "epoch": 0.04460274227716088, + "grad_norm": 0.7344117405200579, + "learning_rate": 1.4866542520173806e-05, + "loss": 6.8116, + "step": 958 + }, + { + "epoch": 0.04464930046325395, + "grad_norm": 0.47009239836031297, + "learning_rate": 1.4882060831781502e-05, + "loss": 6.8228, + "step": 959 + }, + { + "epoch": 0.04469585864934702, + "grad_norm": 0.7430806575631398, + 
"learning_rate": 1.48975791433892e-05, + "loss": 6.7818, + "step": 960 + }, + { + "epoch": 0.04474241683544009, + "grad_norm": 0.7012378956251745, + "learning_rate": 1.4913097454996897e-05, + "loss": 6.9494, + "step": 961 + }, + { + "epoch": 0.04478897502153316, + "grad_norm": 0.5611950501558421, + "learning_rate": 1.4928615766604595e-05, + "loss": 6.8202, + "step": 962 + }, + { + "epoch": 0.04483553320762623, + "grad_norm": 0.6120427649132207, + "learning_rate": 1.4944134078212291e-05, + "loss": 6.7956, + "step": 963 + }, + { + "epoch": 0.0448820913937193, + "grad_norm": 0.6273491580501012, + "learning_rate": 1.4959652389819989e-05, + "loss": 6.734, + "step": 964 + }, + { + "epoch": 0.04492864957981237, + "grad_norm": 0.6803365965028556, + "learning_rate": 1.4975170701427685e-05, + "loss": 6.9031, + "step": 965 + }, + { + "epoch": 0.04497520776590544, + "grad_norm": 0.624426353408146, + "learning_rate": 1.4990689013035383e-05, + "loss": 6.8515, + "step": 966 + }, + { + "epoch": 0.04502176595199851, + "grad_norm": 0.6689764427591225, + "learning_rate": 1.500620732464308e-05, + "loss": 6.8739, + "step": 967 + }, + { + "epoch": 0.04506832413809158, + "grad_norm": 0.7231838371407245, + "learning_rate": 1.5021725636250778e-05, + "loss": 6.889, + "step": 968 + }, + { + "epoch": 0.045114882324184646, + "grad_norm": 0.5563038987771283, + "learning_rate": 1.5037243947858474e-05, + "loss": 6.8897, + "step": 969 + }, + { + "epoch": 0.04516144051027772, + "grad_norm": 0.659432799691328, + "learning_rate": 1.5052762259466172e-05, + "loss": 6.7521, + "step": 970 + }, + { + "epoch": 0.04520799869637079, + "grad_norm": 0.6467486268024072, + "learning_rate": 1.5068280571073867e-05, + "loss": 6.909, + "step": 971 + }, + { + "epoch": 0.04525455688246386, + "grad_norm": 0.7377807823269307, + "learning_rate": 1.5083798882681566e-05, + "loss": 6.7674, + "step": 972 + }, + { + "epoch": 0.04530111506855693, + "grad_norm": 0.630783829161864, + "learning_rate": 1.509931719428926e-05, + "loss": 6.813, + "step": 973 + }, + { + "epoch": 0.04534767325465, + "grad_norm": 0.6861347618020356, + "learning_rate": 1.511483550589696e-05, + "loss": 6.8035, + "step": 974 + }, + { + "epoch": 0.04539423144074307, + "grad_norm": 0.5881396148437491, + "learning_rate": 1.5130353817504655e-05, + "loss": 6.8217, + "step": 975 + }, + { + "epoch": 0.04544078962683614, + "grad_norm": 0.6599890764234698, + "learning_rate": 1.5145872129112355e-05, + "loss": 6.8948, + "step": 976 + }, + { + "epoch": 0.04548734781292921, + "grad_norm": 0.4931093593178824, + "learning_rate": 1.516139044072005e-05, + "loss": 6.8881, + "step": 977 + }, + { + "epoch": 0.04553390599902228, + "grad_norm": 0.7038718283579548, + "learning_rate": 1.5176908752327747e-05, + "loss": 6.8868, + "step": 978 + }, + { + "epoch": 0.04558046418511535, + "grad_norm": 0.6252184405861351, + "learning_rate": 1.5192427063935444e-05, + "loss": 6.7776, + "step": 979 + }, + { + "epoch": 0.045627022371208414, + "grad_norm": 0.6169580003209745, + "learning_rate": 1.5207945375543142e-05, + "loss": 6.8084, + "step": 980 + }, + { + "epoch": 0.04567358055730149, + "grad_norm": 0.49387300732559253, + "learning_rate": 1.5223463687150838e-05, + "loss": 6.8285, + "step": 981 + }, + { + "epoch": 0.045720138743394555, + "grad_norm": 0.606343512672415, + "learning_rate": 1.5238981998758536e-05, + "loss": 6.8916, + "step": 982 + }, + { + "epoch": 0.04576669692948763, + "grad_norm": 0.5177346770329755, + "learning_rate": 1.5254500310366232e-05, + "loss": 6.8303, + "step": 983 + }, + { + "epoch": 
0.045813255115580695, + "grad_norm": 0.7118395763829318, + "learning_rate": 1.5270018621973932e-05, + "loss": 6.8474, + "step": 984 + }, + { + "epoch": 0.04585981330167377, + "grad_norm": 0.562047464221103, + "learning_rate": 1.5285536933581625e-05, + "loss": 6.9065, + "step": 985 + }, + { + "epoch": 0.045906371487766835, + "grad_norm": 0.4675419361249735, + "learning_rate": 1.5301055245189325e-05, + "loss": 6.8684, + "step": 986 + }, + { + "epoch": 0.04595292967385991, + "grad_norm": 0.6701438513733561, + "learning_rate": 1.531657355679702e-05, + "loss": 6.842, + "step": 987 + }, + { + "epoch": 0.045999487859952976, + "grad_norm": 0.49590561601749483, + "learning_rate": 1.5332091868404717e-05, + "loss": 6.8025, + "step": 988 + }, + { + "epoch": 0.04604604604604605, + "grad_norm": 0.501542120553763, + "learning_rate": 1.5347610180012414e-05, + "loss": 6.902, + "step": 989 + }, + { + "epoch": 0.046092604232139116, + "grad_norm": 0.6065741184089855, + "learning_rate": 1.5363128491620113e-05, + "loss": 6.8084, + "step": 990 + }, + { + "epoch": 0.04613916241823218, + "grad_norm": 0.5803041108002803, + "learning_rate": 1.537864680322781e-05, + "loss": 6.8197, + "step": 991 + }, + { + "epoch": 0.046185720604325256, + "grad_norm": 0.6156823087901341, + "learning_rate": 1.5394165114835506e-05, + "loss": 6.7755, + "step": 992 + }, + { + "epoch": 0.04623227879041832, + "grad_norm": 0.5839227058687222, + "learning_rate": 1.5409683426443202e-05, + "loss": 6.7635, + "step": 993 + }, + { + "epoch": 0.046278836976511396, + "grad_norm": 0.5531456661693921, + "learning_rate": 1.5425201738050902e-05, + "loss": 6.7236, + "step": 994 + }, + { + "epoch": 0.04632539516260446, + "grad_norm": 0.6197204037645653, + "learning_rate": 1.5440720049658598e-05, + "loss": 6.8741, + "step": 995 + }, + { + "epoch": 0.04637195334869754, + "grad_norm": 0.67164173236946, + "learning_rate": 1.5456238361266295e-05, + "loss": 6.7752, + "step": 996 + }, + { + "epoch": 0.0464185115347906, + "grad_norm": 0.5240868469336281, + "learning_rate": 1.547175667287399e-05, + "loss": 6.7386, + "step": 997 + }, + { + "epoch": 0.04646506972088368, + "grad_norm": 0.7765130854190956, + "learning_rate": 1.548727498448169e-05, + "loss": 6.8348, + "step": 998 + }, + { + "epoch": 0.046511627906976744, + "grad_norm": 0.7031220922600268, + "learning_rate": 1.5502793296089387e-05, + "loss": 6.9079, + "step": 999 + }, + { + "epoch": 0.04655818609306982, + "grad_norm": 0.5407167301453516, + "learning_rate": 1.5518311607697083e-05, + "loss": 6.9259, + "step": 1000 + }, + { + "epoch": 0.046604744279162884, + "grad_norm": 0.7750696990378306, + "learning_rate": 1.553382991930478e-05, + "loss": 6.7265, + "step": 1001 + }, + { + "epoch": 0.04665130246525595, + "grad_norm": 0.7108521331821195, + "learning_rate": 1.554934823091248e-05, + "loss": 6.729, + "step": 1002 + }, + { + "epoch": 0.046697860651349024, + "grad_norm": 0.654517815747948, + "learning_rate": 1.5564866542520172e-05, + "loss": 6.7713, + "step": 1003 + }, + { + "epoch": 0.04674441883744209, + "grad_norm": 0.7763237792420511, + "learning_rate": 1.5580384854127872e-05, + "loss": 6.7255, + "step": 1004 + }, + { + "epoch": 0.046790977023535164, + "grad_norm": 0.6120874989177215, + "learning_rate": 1.5595903165735568e-05, + "loss": 6.7949, + "step": 1005 + }, + { + "epoch": 0.04683753520962823, + "grad_norm": 0.6986271352202851, + "learning_rate": 1.5611421477343264e-05, + "loss": 6.8834, + "step": 1006 + }, + { + "epoch": 0.046884093395721305, + "grad_norm": 0.6793884319639206, + "learning_rate": 
1.562693978895096e-05, + "loss": 6.782, + "step": 1007 + }, + { + "epoch": 0.04693065158181437, + "grad_norm": 0.6028197763243154, + "learning_rate": 1.564245810055866e-05, + "loss": 6.8538, + "step": 1008 + }, + { + "epoch": 0.046977209767907445, + "grad_norm": 0.6402204332915994, + "learning_rate": 1.5657976412166357e-05, + "loss": 6.7342, + "step": 1009 + }, + { + "epoch": 0.04702376795400051, + "grad_norm": 0.6449255748434667, + "learning_rate": 1.5673494723774053e-05, + "loss": 6.5772, + "step": 1010 + }, + { + "epoch": 0.047070326140093585, + "grad_norm": 0.6272405715850242, + "learning_rate": 1.568901303538175e-05, + "loss": 6.787, + "step": 1011 + }, + { + "epoch": 0.04711688432618665, + "grad_norm": 0.6244980298939953, + "learning_rate": 1.570453134698945e-05, + "loss": 6.8182, + "step": 1012 + }, + { + "epoch": 0.04716344251227972, + "grad_norm": 0.667347013494792, + "learning_rate": 1.5720049658597145e-05, + "loss": 6.7989, + "step": 1013 + }, + { + "epoch": 0.04721000069837279, + "grad_norm": 0.5420910158259544, + "learning_rate": 1.573556797020484e-05, + "loss": 6.7962, + "step": 1014 + }, + { + "epoch": 0.04725655888446586, + "grad_norm": 0.6951038124742231, + "learning_rate": 1.5751086281812538e-05, + "loss": 6.6946, + "step": 1015 + }, + { + "epoch": 0.04730311707055893, + "grad_norm": 0.6709138100291632, + "learning_rate": 1.5766604593420238e-05, + "loss": 6.7252, + "step": 1016 + }, + { + "epoch": 0.047349675256652, + "grad_norm": 0.6138282210840907, + "learning_rate": 1.5782122905027934e-05, + "loss": 6.7766, + "step": 1017 + }, + { + "epoch": 0.04739623344274507, + "grad_norm": 0.6725512111340876, + "learning_rate": 1.579764121663563e-05, + "loss": 6.797, + "step": 1018 + }, + { + "epoch": 0.04744279162883814, + "grad_norm": 0.5178646785654061, + "learning_rate": 1.581315952824333e-05, + "loss": 6.7336, + "step": 1019 + }, + { + "epoch": 0.04748934981493121, + "grad_norm": 0.7201551896527981, + "learning_rate": 1.5828677839851026e-05, + "loss": 6.623, + "step": 1020 + }, + { + "epoch": 0.04753590800102428, + "grad_norm": 0.5536084811539503, + "learning_rate": 1.5844196151458723e-05, + "loss": 6.7558, + "step": 1021 + }, + { + "epoch": 0.04758246618711735, + "grad_norm": 0.5806314824241198, + "learning_rate": 1.585971446306642e-05, + "loss": 6.7157, + "step": 1022 + }, + { + "epoch": 0.04762902437321042, + "grad_norm": 0.5996276456974796, + "learning_rate": 1.587523277467412e-05, + "loss": 6.8061, + "step": 1023 + }, + { + "epoch": 0.04767558255930349, + "grad_norm": 0.6313486403034412, + "learning_rate": 1.5890751086281815e-05, + "loss": 6.6623, + "step": 1024 + }, + { + "epoch": 0.04772214074539656, + "grad_norm": 0.5419742852799082, + "learning_rate": 1.590626939788951e-05, + "loss": 6.6264, + "step": 1025 + }, + { + "epoch": 0.04776869893148963, + "grad_norm": 0.5138070381257736, + "learning_rate": 1.5921787709497207e-05, + "loss": 6.8909, + "step": 1026 + }, + { + "epoch": 0.0478152571175827, + "grad_norm": 0.5287579469802118, + "learning_rate": 1.5937306021104907e-05, + "loss": 6.6469, + "step": 1027 + }, + { + "epoch": 0.04786181530367577, + "grad_norm": 0.44423154786295815, + "learning_rate": 1.59528243327126e-05, + "loss": 6.7549, + "step": 1028 + }, + { + "epoch": 0.04790837348976884, + "grad_norm": 0.5847222541377776, + "learning_rate": 1.59683426443203e-05, + "loss": 6.7925, + "step": 1029 + }, + { + "epoch": 0.04795493167586191, + "grad_norm": 0.5140799416713435, + "learning_rate": 1.5983860955927996e-05, + "loss": 6.6527, + "step": 1030 + }, + { + "epoch": 
0.04800148986195498, + "grad_norm": 0.6339846810107632, + "learning_rate": 1.5999379267535692e-05, + "loss": 6.7551, + "step": 1031 + }, + { + "epoch": 0.04804804804804805, + "grad_norm": 0.5696272825428376, + "learning_rate": 1.601489757914339e-05, + "loss": 6.7828, + "step": 1032 + }, + { + "epoch": 0.04809460623414112, + "grad_norm": 0.5902876265440836, + "learning_rate": 1.603041589075109e-05, + "loss": 6.614, + "step": 1033 + }, + { + "epoch": 0.04814116442023419, + "grad_norm": 0.6531711917830675, + "learning_rate": 1.6045934202358785e-05, + "loss": 6.8226, + "step": 1034 + }, + { + "epoch": 0.048187722606327255, + "grad_norm": 0.6746698620925723, + "learning_rate": 1.606145251396648e-05, + "loss": 6.7026, + "step": 1035 + }, + { + "epoch": 0.04823428079242033, + "grad_norm": 0.5810508504004633, + "learning_rate": 1.6076970825574177e-05, + "loss": 6.644, + "step": 1036 + }, + { + "epoch": 0.048280838978513395, + "grad_norm": 0.8091498112086657, + "learning_rate": 1.6092489137181877e-05, + "loss": 6.6868, + "step": 1037 + }, + { + "epoch": 0.04832739716460647, + "grad_norm": 0.6628520556363793, + "learning_rate": 1.6108007448789573e-05, + "loss": 6.7921, + "step": 1038 + }, + { + "epoch": 0.048373955350699535, + "grad_norm": 0.8577568261877674, + "learning_rate": 1.612352576039727e-05, + "loss": 6.7749, + "step": 1039 + }, + { + "epoch": 0.04842051353679261, + "grad_norm": 0.6456190246694996, + "learning_rate": 1.6139044072004966e-05, + "loss": 6.625, + "step": 1040 + }, + { + "epoch": 0.048467071722885675, + "grad_norm": 0.6484036591089481, + "learning_rate": 1.6154562383612666e-05, + "loss": 6.6713, + "step": 1041 + }, + { + "epoch": 0.04851362990897875, + "grad_norm": 0.6109910824556221, + "learning_rate": 1.6170080695220362e-05, + "loss": 6.7362, + "step": 1042 + }, + { + "epoch": 0.048560188095071816, + "grad_norm": 0.7670592548985071, + "learning_rate": 1.6185599006828058e-05, + "loss": 6.6215, + "step": 1043 + }, + { + "epoch": 0.04860674628116489, + "grad_norm": 0.571486404076043, + "learning_rate": 1.6201117318435755e-05, + "loss": 6.6672, + "step": 1044 + }, + { + "epoch": 0.048653304467257956, + "grad_norm": 0.6690932917547237, + "learning_rate": 1.6216635630043454e-05, + "loss": 6.7592, + "step": 1045 + }, + { + "epoch": 0.04869986265335102, + "grad_norm": 0.5984671460146995, + "learning_rate": 1.6232153941651147e-05, + "loss": 6.6525, + "step": 1046 + }, + { + "epoch": 0.048746420839444096, + "grad_norm": 0.5140730059592942, + "learning_rate": 1.6247672253258847e-05, + "loss": 6.6411, + "step": 1047 + }, + { + "epoch": 0.04879297902553716, + "grad_norm": 0.665113209664322, + "learning_rate": 1.6263190564866543e-05, + "loss": 6.6576, + "step": 1048 + }, + { + "epoch": 0.048839537211630236, + "grad_norm": 0.5407107484679586, + "learning_rate": 1.6278708876474243e-05, + "loss": 6.6926, + "step": 1049 + }, + { + "epoch": 0.0488860953977233, + "grad_norm": 0.6171684740528075, + "learning_rate": 1.6294227188081936e-05, + "loss": 6.6579, + "step": 1050 + }, + { + "epoch": 0.04893265358381638, + "grad_norm": 0.5856384522864987, + "learning_rate": 1.6309745499689635e-05, + "loss": 6.5827, + "step": 1051 + }, + { + "epoch": 0.04897921176990944, + "grad_norm": 0.5248936934809296, + "learning_rate": 1.6325263811297332e-05, + "loss": 6.6708, + "step": 1052 + }, + { + "epoch": 0.04902576995600252, + "grad_norm": 0.5913252496215555, + "learning_rate": 1.6340782122905028e-05, + "loss": 6.6851, + "step": 1053 + }, + { + "epoch": 0.049072328142095584, + "grad_norm": 0.47958584854779696, + 
"learning_rate": 1.6356300434512724e-05, + "loss": 6.6645, + "step": 1054 + }, + { + "epoch": 0.04911888632818865, + "grad_norm": 0.5218589403431474, + "learning_rate": 1.6371818746120424e-05, + "loss": 6.7876, + "step": 1055 + }, + { + "epoch": 0.049165444514281724, + "grad_norm": 0.5854135847290018, + "learning_rate": 1.638733705772812e-05, + "loss": 6.7509, + "step": 1056 + }, + { + "epoch": 0.04921200270037479, + "grad_norm": 0.5588809255191376, + "learning_rate": 1.6402855369335817e-05, + "loss": 6.7163, + "step": 1057 + }, + { + "epoch": 0.049258560886467864, + "grad_norm": 0.5459258917926142, + "learning_rate": 1.6418373680943513e-05, + "loss": 6.6847, + "step": 1058 + }, + { + "epoch": 0.04930511907256093, + "grad_norm": 0.6611472531165947, + "learning_rate": 1.6433891992551213e-05, + "loss": 6.749, + "step": 1059 + }, + { + "epoch": 0.049351677258654005, + "grad_norm": 0.47241175684468767, + "learning_rate": 1.644941030415891e-05, + "loss": 6.6841, + "step": 1060 + }, + { + "epoch": 0.04939823544474707, + "grad_norm": 0.644862823473111, + "learning_rate": 1.6464928615766605e-05, + "loss": 6.5921, + "step": 1061 + }, + { + "epoch": 0.049444793630840145, + "grad_norm": 0.808313929834897, + "learning_rate": 1.64804469273743e-05, + "loss": 6.8455, + "step": 1062 + }, + { + "epoch": 0.04949135181693321, + "grad_norm": 0.5622875500600815, + "learning_rate": 1.6495965238982e-05, + "loss": 6.7125, + "step": 1063 + }, + { + "epoch": 0.049537910003026285, + "grad_norm": 0.7372020900774064, + "learning_rate": 1.6511483550589694e-05, + "loss": 6.6274, + "step": 1064 + }, + { + "epoch": 0.04958446818911935, + "grad_norm": 0.7936582255157658, + "learning_rate": 1.6527001862197394e-05, + "loss": 6.7349, + "step": 1065 + }, + { + "epoch": 0.04963102637521242, + "grad_norm": 0.8538079793713469, + "learning_rate": 1.654252017380509e-05, + "loss": 6.6265, + "step": 1066 + }, + { + "epoch": 0.04967758456130549, + "grad_norm": 1.0468432793783082, + "learning_rate": 1.655803848541279e-05, + "loss": 6.5841, + "step": 1067 + }, + { + "epoch": 0.04972414274739856, + "grad_norm": 0.6549220045545616, + "learning_rate": 1.6573556797020483e-05, + "loss": 6.6162, + "step": 1068 + }, + { + "epoch": 0.04977070093349163, + "grad_norm": 0.7066008788635907, + "learning_rate": 1.6589075108628183e-05, + "loss": 6.5564, + "step": 1069 + }, + { + "epoch": 0.0498172591195847, + "grad_norm": 1.1244633337964252, + "learning_rate": 1.660459342023588e-05, + "loss": 6.4961, + "step": 1070 + }, + { + "epoch": 0.04986381730567777, + "grad_norm": 0.8594814152352819, + "learning_rate": 1.6620111731843575e-05, + "loss": 6.6384, + "step": 1071 + }, + { + "epoch": 0.04991037549177084, + "grad_norm": 0.6255424511973167, + "learning_rate": 1.663563004345127e-05, + "loss": 6.5945, + "step": 1072 + }, + { + "epoch": 0.04995693367786391, + "grad_norm": 0.649047569774406, + "learning_rate": 1.665114835505897e-05, + "loss": 6.7019, + "step": 1073 + }, + { + "epoch": 0.05000349186395698, + "grad_norm": 0.7266761142245985, + "learning_rate": 1.6666666666666667e-05, + "loss": 6.5619, + "step": 1074 + }, + { + "epoch": 0.05005005005005005, + "grad_norm": 0.8022908699364855, + "learning_rate": 1.6682184978274364e-05, + "loss": 6.5598, + "step": 1075 + }, + { + "epoch": 0.05009660823614312, + "grad_norm": 0.5637895012972713, + "learning_rate": 1.669770328988206e-05, + "loss": 6.5757, + "step": 1076 + }, + { + "epoch": 0.050143166422236186, + "grad_norm": 0.5554026960691626, + "learning_rate": 1.671322160148976e-05, + "loss": 6.6266, + "step": 
1077 + }, + { + "epoch": 0.05018972460832926, + "grad_norm": 0.6501206571137185, + "learning_rate": 1.6728739913097456e-05, + "loss": 6.6619, + "step": 1078 + }, + { + "epoch": 0.05023628279442233, + "grad_norm": 0.5591735320191523, + "learning_rate": 1.6744258224705152e-05, + "loss": 6.7152, + "step": 1079 + }, + { + "epoch": 0.0502828409805154, + "grad_norm": 0.5410567829686674, + "learning_rate": 1.675977653631285e-05, + "loss": 6.6414, + "step": 1080 + }, + { + "epoch": 0.05032939916660847, + "grad_norm": 0.5996745134170094, + "learning_rate": 1.677529484792055e-05, + "loss": 6.5962, + "step": 1081 + }, + { + "epoch": 0.05037595735270154, + "grad_norm": 0.7681220752693316, + "learning_rate": 1.6790813159528245e-05, + "loss": 6.5964, + "step": 1082 + }, + { + "epoch": 0.05042251553879461, + "grad_norm": 0.6429513821786306, + "learning_rate": 1.680633147113594e-05, + "loss": 6.5728, + "step": 1083 + }, + { + "epoch": 0.05046907372488768, + "grad_norm": 0.5915396025111491, + "learning_rate": 1.6821849782743637e-05, + "loss": 6.6659, + "step": 1084 + }, + { + "epoch": 0.05051563191098075, + "grad_norm": 0.609679721892854, + "learning_rate": 1.6837368094351337e-05, + "loss": 6.6825, + "step": 1085 + }, + { + "epoch": 0.05056219009707382, + "grad_norm": 0.6235056681169969, + "learning_rate": 1.685288640595903e-05, + "loss": 6.6505, + "step": 1086 + }, + { + "epoch": 0.05060874828316689, + "grad_norm": 0.5802790675460975, + "learning_rate": 1.686840471756673e-05, + "loss": 6.6347, + "step": 1087 + }, + { + "epoch": 0.050655306469259954, + "grad_norm": 0.6793163584315569, + "learning_rate": 1.6883923029174426e-05, + "loss": 6.6578, + "step": 1088 + }, + { + "epoch": 0.05070186465535303, + "grad_norm": 0.7211509397004431, + "learning_rate": 1.6899441340782122e-05, + "loss": 6.5813, + "step": 1089 + }, + { + "epoch": 0.050748422841446095, + "grad_norm": 0.749029010921408, + "learning_rate": 1.691495965238982e-05, + "loss": 6.421, + "step": 1090 + }, + { + "epoch": 0.05079498102753917, + "grad_norm": 0.6228041248854235, + "learning_rate": 1.6930477963997518e-05, + "loss": 6.6095, + "step": 1091 + }, + { + "epoch": 0.050841539213632235, + "grad_norm": 0.5073809077100427, + "learning_rate": 1.6945996275605215e-05, + "loss": 6.4911, + "step": 1092 + }, + { + "epoch": 0.05088809739972531, + "grad_norm": 0.6528409356533076, + "learning_rate": 1.696151458721291e-05, + "loss": 6.6573, + "step": 1093 + }, + { + "epoch": 0.050934655585818375, + "grad_norm": 0.668297389558271, + "learning_rate": 1.6977032898820607e-05, + "loss": 6.479, + "step": 1094 + }, + { + "epoch": 0.05098121377191145, + "grad_norm": 0.6897201839856405, + "learning_rate": 1.6992551210428307e-05, + "loss": 6.5297, + "step": 1095 + }, + { + "epoch": 0.051027771958004516, + "grad_norm": 0.6887897031803447, + "learning_rate": 1.7008069522036003e-05, + "loss": 6.5074, + "step": 1096 + }, + { + "epoch": 0.05107433014409759, + "grad_norm": 0.8112954095272169, + "learning_rate": 1.70235878336437e-05, + "loss": 6.5695, + "step": 1097 + }, + { + "epoch": 0.051120888330190656, + "grad_norm": 0.7463109798697479, + "learning_rate": 1.7039106145251396e-05, + "loss": 6.5939, + "step": 1098 + }, + { + "epoch": 0.05116744651628372, + "grad_norm": 0.5741864635745577, + "learning_rate": 1.7054624456859095e-05, + "loss": 6.7064, + "step": 1099 + }, + { + "epoch": 0.051214004702376796, + "grad_norm": 0.6800940124601166, + "learning_rate": 1.7070142768466792e-05, + "loss": 6.6027, + "step": 1100 + }, + { + "epoch": 0.05126056288846986, + "grad_norm": 
0.7180684923188099, + "learning_rate": 1.7085661080074488e-05, + "loss": 6.4839, + "step": 1101 + }, + { + "epoch": 0.051307121074562936, + "grad_norm": 0.7206163991212011, + "learning_rate": 1.7101179391682184e-05, + "loss": 6.5567, + "step": 1102 + }, + { + "epoch": 0.051353679260656, + "grad_norm": 0.5651241004903177, + "learning_rate": 1.7116697703289884e-05, + "loss": 6.6434, + "step": 1103 + }, + { + "epoch": 0.05140023744674908, + "grad_norm": 0.7023706843425687, + "learning_rate": 1.713221601489758e-05, + "loss": 6.5631, + "step": 1104 + }, + { + "epoch": 0.05144679563284214, + "grad_norm": 0.5802823038081734, + "learning_rate": 1.7147734326505277e-05, + "loss": 6.606, + "step": 1105 + }, + { + "epoch": 0.05149335381893522, + "grad_norm": 0.6072509364213576, + "learning_rate": 1.7163252638112976e-05, + "loss": 6.5177, + "step": 1106 + }, + { + "epoch": 0.051539912005028284, + "grad_norm": 0.7439680226801431, + "learning_rate": 1.7178770949720673e-05, + "loss": 6.5237, + "step": 1107 + }, + { + "epoch": 0.05158647019112136, + "grad_norm": 0.6605992401992944, + "learning_rate": 1.719428926132837e-05, + "loss": 6.559, + "step": 1108 + }, + { + "epoch": 0.051633028377214424, + "grad_norm": 0.5921274726867906, + "learning_rate": 1.7209807572936065e-05, + "loss": 6.6198, + "step": 1109 + }, + { + "epoch": 0.05167958656330749, + "grad_norm": 0.6988896996676537, + "learning_rate": 1.7225325884543765e-05, + "loss": 6.6108, + "step": 1110 + }, + { + "epoch": 0.051726144749400564, + "grad_norm": 0.6590841306688336, + "learning_rate": 1.7240844196151458e-05, + "loss": 6.6228, + "step": 1111 + }, + { + "epoch": 0.05177270293549363, + "grad_norm": 0.7011266815753552, + "learning_rate": 1.7256362507759158e-05, + "loss": 6.5279, + "step": 1112 + }, + { + "epoch": 0.051819261121586704, + "grad_norm": 0.5532814381181917, + "learning_rate": 1.7271880819366854e-05, + "loss": 6.5656, + "step": 1113 + }, + { + "epoch": 0.05186581930767977, + "grad_norm": 0.6480806705855315, + "learning_rate": 1.7287399130974554e-05, + "loss": 6.6048, + "step": 1114 + }, + { + "epoch": 0.051912377493772845, + "grad_norm": 0.7024569404469849, + "learning_rate": 1.7302917442582247e-05, + "loss": 6.5434, + "step": 1115 + }, + { + "epoch": 0.05195893567986591, + "grad_norm": 0.6315119498368967, + "learning_rate": 1.7318435754189946e-05, + "loss": 6.5418, + "step": 1116 + }, + { + "epoch": 0.052005493865958985, + "grad_norm": 0.7055569036918334, + "learning_rate": 1.7333954065797643e-05, + "loss": 6.5143, + "step": 1117 + }, + { + "epoch": 0.05205205205205205, + "grad_norm": 0.6533713279907046, + "learning_rate": 1.734947237740534e-05, + "loss": 6.604, + "step": 1118 + }, + { + "epoch": 0.052098610238145125, + "grad_norm": 0.48474818605794634, + "learning_rate": 1.7364990689013035e-05, + "loss": 6.4725, + "step": 1119 + }, + { + "epoch": 0.05214516842423819, + "grad_norm": 0.6939842381520318, + "learning_rate": 1.7380509000620735e-05, + "loss": 6.5766, + "step": 1120 + }, + { + "epoch": 0.05219172661033126, + "grad_norm": 0.6251713979054301, + "learning_rate": 1.739602731222843e-05, + "loss": 6.6059, + "step": 1121 + }, + { + "epoch": 0.05223828479642433, + "grad_norm": 0.6184780595041024, + "learning_rate": 1.7411545623836127e-05, + "loss": 6.6681, + "step": 1122 + }, + { + "epoch": 0.0522848429825174, + "grad_norm": 0.8072698993083789, + "learning_rate": 1.7427063935443824e-05, + "loss": 6.6241, + "step": 1123 + }, + { + "epoch": 0.05233140116861047, + "grad_norm": 0.7877270702091601, + "learning_rate": 
1.7442582247051523e-05, + "loss": 6.477, + "step": 1124 + }, + { + "epoch": 0.05237795935470354, + "grad_norm": 0.5630199038670715, + "learning_rate": 1.745810055865922e-05, + "loss": 6.4304, + "step": 1125 + }, + { + "epoch": 0.05242451754079661, + "grad_norm": 0.6476413839545299, + "learning_rate": 1.7473618870266916e-05, + "loss": 6.4697, + "step": 1126 + }, + { + "epoch": 0.05247107572688968, + "grad_norm": 0.7198999296237485, + "learning_rate": 1.7489137181874612e-05, + "loss": 6.5927, + "step": 1127 + }, + { + "epoch": 0.05251763391298275, + "grad_norm": 0.6470328342760168, + "learning_rate": 1.7504655493482312e-05, + "loss": 6.3416, + "step": 1128 + }, + { + "epoch": 0.05256419209907582, + "grad_norm": 0.763883915189629, + "learning_rate": 1.7520173805090005e-05, + "loss": 6.4987, + "step": 1129 + }, + { + "epoch": 0.05261075028516889, + "grad_norm": 0.5495716588035706, + "learning_rate": 1.7535692116697705e-05, + "loss": 6.6163, + "step": 1130 + }, + { + "epoch": 0.05265730847126196, + "grad_norm": 0.7864033093766836, + "learning_rate": 1.75512104283054e-05, + "loss": 6.4611, + "step": 1131 + }, + { + "epoch": 0.05270386665735503, + "grad_norm": 0.7037920284084609, + "learning_rate": 1.75667287399131e-05, + "loss": 6.6894, + "step": 1132 + }, + { + "epoch": 0.0527504248434481, + "grad_norm": 0.7313587134777249, + "learning_rate": 1.7582247051520794e-05, + "loss": 6.4689, + "step": 1133 + }, + { + "epoch": 0.05279698302954117, + "grad_norm": 0.7431081490250409, + "learning_rate": 1.7597765363128493e-05, + "loss": 6.4769, + "step": 1134 + }, + { + "epoch": 0.05284354121563424, + "grad_norm": 0.7503487419273044, + "learning_rate": 1.761328367473619e-05, + "loss": 6.4939, + "step": 1135 + }, + { + "epoch": 0.05289009940172731, + "grad_norm": 0.6126345269906711, + "learning_rate": 1.7628801986343886e-05, + "loss": 6.528, + "step": 1136 + }, + { + "epoch": 0.05293665758782038, + "grad_norm": 1.0627079983480434, + "learning_rate": 1.7644320297951582e-05, + "loss": 6.5757, + "step": 1137 + }, + { + "epoch": 0.05298321577391345, + "grad_norm": 1.0350697095720292, + "learning_rate": 1.7659838609559282e-05, + "loss": 6.4113, + "step": 1138 + }, + { + "epoch": 0.05302977396000652, + "grad_norm": 0.9112396087815857, + "learning_rate": 1.7675356921166978e-05, + "loss": 6.459, + "step": 1139 + }, + { + "epoch": 0.05307633214609959, + "grad_norm": 0.6160428346205371, + "learning_rate": 1.7690875232774675e-05, + "loss": 6.5012, + "step": 1140 + }, + { + "epoch": 0.053122890332192654, + "grad_norm": 0.6623257829665073, + "learning_rate": 1.770639354438237e-05, + "loss": 6.454, + "step": 1141 + }, + { + "epoch": 0.05316944851828573, + "grad_norm": 0.6692471162678854, + "learning_rate": 1.772191185599007e-05, + "loss": 6.4548, + "step": 1142 + }, + { + "epoch": 0.053216006704378795, + "grad_norm": 0.6260852219828833, + "learning_rate": 1.7737430167597767e-05, + "loss": 6.4799, + "step": 1143 + }, + { + "epoch": 0.05326256489047187, + "grad_norm": 0.7368212713011641, + "learning_rate": 1.7752948479205463e-05, + "loss": 6.5533, + "step": 1144 + }, + { + "epoch": 0.053309123076564935, + "grad_norm": 0.640369939633259, + "learning_rate": 1.776846679081316e-05, + "loss": 6.584, + "step": 1145 + }, + { + "epoch": 0.05335568126265801, + "grad_norm": 0.5534406334436222, + "learning_rate": 1.778398510242086e-05, + "loss": 6.5093, + "step": 1146 + }, + { + "epoch": 0.053402239448751075, + "grad_norm": 0.8039715020664648, + "learning_rate": 1.7799503414028555e-05, + "loss": 6.411, + "step": 1147 + }, + { + 
"epoch": 0.05344879763484415, + "grad_norm": 0.653229207822427, + "learning_rate": 1.7815021725636252e-05, + "loss": 6.6014, + "step": 1148 + }, + { + "epoch": 0.053495355820937215, + "grad_norm": 0.5852603419256136, + "learning_rate": 1.7830540037243948e-05, + "loss": 6.5528, + "step": 1149 + }, + { + "epoch": 0.05354191400703029, + "grad_norm": 0.5637145680385334, + "learning_rate": 1.7846058348851648e-05, + "loss": 6.3938, + "step": 1150 + }, + { + "epoch": 0.053588472193123356, + "grad_norm": 0.652176230919829, + "learning_rate": 1.786157666045934e-05, + "loss": 6.5398, + "step": 1151 + }, + { + "epoch": 0.05363503037921642, + "grad_norm": 0.8388753154468027, + "learning_rate": 1.787709497206704e-05, + "loss": 6.4051, + "step": 1152 + }, + { + "epoch": 0.053681588565309496, + "grad_norm": 0.8995739440529829, + "learning_rate": 1.7892613283674737e-05, + "loss": 6.4254, + "step": 1153 + }, + { + "epoch": 0.05372814675140256, + "grad_norm": 0.8457545135985012, + "learning_rate": 1.7908131595282433e-05, + "loss": 6.5147, + "step": 1154 + }, + { + "epoch": 0.053774704937495636, + "grad_norm": 0.6405830396794199, + "learning_rate": 1.792364990689013e-05, + "loss": 6.433, + "step": 1155 + }, + { + "epoch": 0.0538212631235887, + "grad_norm": 0.5474332587174668, + "learning_rate": 1.793916821849783e-05, + "loss": 6.3961, + "step": 1156 + }, + { + "epoch": 0.053867821309681777, + "grad_norm": 0.6462814266279212, + "learning_rate": 1.7954686530105525e-05, + "loss": 6.483, + "step": 1157 + }, + { + "epoch": 0.05391437949577484, + "grad_norm": 0.663979521280515, + "learning_rate": 1.797020484171322e-05, + "loss": 6.5041, + "step": 1158 + }, + { + "epoch": 0.05396093768186792, + "grad_norm": 0.6020263874802411, + "learning_rate": 1.7985723153320918e-05, + "loss": 6.4763, + "step": 1159 + }, + { + "epoch": 0.05400749586796098, + "grad_norm": 0.5881801767260544, + "learning_rate": 1.8001241464928618e-05, + "loss": 6.3411, + "step": 1160 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 0.5477396672105106, + "learning_rate": 1.8016759776536314e-05, + "loss": 6.4089, + "step": 1161 + }, + { + "epoch": 0.054100612240147124, + "grad_norm": 0.5764651812081582, + "learning_rate": 1.803227808814401e-05, + "loss": 6.5043, + "step": 1162 + }, + { + "epoch": 0.05414717042624019, + "grad_norm": 0.6335648551256547, + "learning_rate": 1.8047796399751707e-05, + "loss": 6.4357, + "step": 1163 + }, + { + "epoch": 0.054193728612333264, + "grad_norm": 0.6214024946763987, + "learning_rate": 1.8063314711359406e-05, + "loss": 6.6768, + "step": 1164 + }, + { + "epoch": 0.05424028679842633, + "grad_norm": 0.7031646460868396, + "learning_rate": 1.8078833022967103e-05, + "loss": 6.4853, + "step": 1165 + }, + { + "epoch": 0.054286844984519404, + "grad_norm": 0.5369447091606849, + "learning_rate": 1.80943513345748e-05, + "loss": 6.4884, + "step": 1166 + }, + { + "epoch": 0.05433340317061247, + "grad_norm": 0.7666218071717635, + "learning_rate": 1.8109869646182495e-05, + "loss": 6.4037, + "step": 1167 + }, + { + "epoch": 0.054379961356705545, + "grad_norm": 0.6858807245809795, + "learning_rate": 1.8125387957790195e-05, + "loss": 6.3981, + "step": 1168 + }, + { + "epoch": 0.05442651954279861, + "grad_norm": 0.6653782986768034, + "learning_rate": 1.8140906269397888e-05, + "loss": 6.4566, + "step": 1169 + }, + { + "epoch": 0.054473077728891685, + "grad_norm": 0.6339492793210395, + "learning_rate": 1.8156424581005588e-05, + "loss": 6.4272, + "step": 1170 + }, + { + "epoch": 0.05451963591498475, + "grad_norm": 
0.6272381808538153, + "learning_rate": 1.8171942892613284e-05, + "loss": 6.4448, + "step": 1171 + }, + { + "epoch": 0.054566194101077825, + "grad_norm": 0.6526539356965266, + "learning_rate": 1.8187461204220984e-05, + "loss": 6.4429, + "step": 1172 + }, + { + "epoch": 0.05461275228717089, + "grad_norm": 0.6388111918937304, + "learning_rate": 1.8202979515828676e-05, + "loss": 6.4922, + "step": 1173 + }, + { + "epoch": 0.05465931047326396, + "grad_norm": 0.6909866284452479, + "learning_rate": 1.8218497827436376e-05, + "loss": 6.5298, + "step": 1174 + }, + { + "epoch": 0.05470586865935703, + "grad_norm": 0.8358083773866345, + "learning_rate": 1.8234016139044072e-05, + "loss": 6.5544, + "step": 1175 + }, + { + "epoch": 0.0547524268454501, + "grad_norm": 0.5998073564594648, + "learning_rate": 1.824953445065177e-05, + "loss": 6.6006, + "step": 1176 + }, + { + "epoch": 0.05479898503154317, + "grad_norm": 0.5724032786589005, + "learning_rate": 1.8265052762259465e-05, + "loss": 6.4719, + "step": 1177 + }, + { + "epoch": 0.05484554321763624, + "grad_norm": 0.7199282427216886, + "learning_rate": 1.8280571073867165e-05, + "loss": 6.4455, + "step": 1178 + }, + { + "epoch": 0.05489210140372931, + "grad_norm": 0.7914893475204345, + "learning_rate": 1.829608938547486e-05, + "loss": 6.3706, + "step": 1179 + }, + { + "epoch": 0.05493865958982238, + "grad_norm": 0.8292922633767202, + "learning_rate": 1.8311607697082557e-05, + "loss": 6.4304, + "step": 1180 + }, + { + "epoch": 0.05498521777591545, + "grad_norm": 0.9412802176705901, + "learning_rate": 1.8327126008690254e-05, + "loss": 6.4329, + "step": 1181 + }, + { + "epoch": 0.05503177596200852, + "grad_norm": 0.8412245056532733, + "learning_rate": 1.8342644320297953e-05, + "loss": 6.4016, + "step": 1182 + }, + { + "epoch": 0.05507833414810159, + "grad_norm": 0.6133884694053663, + "learning_rate": 1.835816263190565e-05, + "loss": 6.452, + "step": 1183 + }, + { + "epoch": 0.05512489233419466, + "grad_norm": 0.6518881192821621, + "learning_rate": 1.8373680943513346e-05, + "loss": 6.3758, + "step": 1184 + }, + { + "epoch": 0.055171450520287726, + "grad_norm": 0.7427764368941137, + "learning_rate": 1.8389199255121042e-05, + "loss": 6.4061, + "step": 1185 + }, + { + "epoch": 0.0552180087063808, + "grad_norm": 0.7904640420451629, + "learning_rate": 1.8404717566728742e-05, + "loss": 6.4634, + "step": 1186 + }, + { + "epoch": 0.05526456689247387, + "grad_norm": 0.6890097880070453, + "learning_rate": 1.8420235878336435e-05, + "loss": 6.4532, + "step": 1187 + }, + { + "epoch": 0.05531112507856694, + "grad_norm": 0.8277875428020292, + "learning_rate": 1.8435754189944135e-05, + "loss": 6.2834, + "step": 1188 + }, + { + "epoch": 0.05535768326466001, + "grad_norm": 0.8242621474717133, + "learning_rate": 1.8451272501551834e-05, + "loss": 6.4964, + "step": 1189 + }, + { + "epoch": 0.05540424145075308, + "grad_norm": 0.700417990149289, + "learning_rate": 1.846679081315953e-05, + "loss": 6.3947, + "step": 1190 + }, + { + "epoch": 0.05545079963684615, + "grad_norm": 0.7593478380588746, + "learning_rate": 1.8482309124767227e-05, + "loss": 6.4303, + "step": 1191 + }, + { + "epoch": 0.05549735782293922, + "grad_norm": 0.7033213661514532, + "learning_rate": 1.8497827436374923e-05, + "loss": 6.3471, + "step": 1192 + }, + { + "epoch": 0.05554391600903229, + "grad_norm": 0.8622782873476059, + "learning_rate": 1.8513345747982623e-05, + "loss": 6.4277, + "step": 1193 + }, + { + "epoch": 0.05559047419512536, + "grad_norm": 0.7998871391357582, + "learning_rate": 1.8528864059590316e-05, + 
"loss": 6.4123, + "step": 1194 + }, + { + "epoch": 0.05563703238121843, + "grad_norm": 0.6135523378657933, + "learning_rate": 1.8544382371198016e-05, + "loss": 6.5161, + "step": 1195 + }, + { + "epoch": 0.055683590567311494, + "grad_norm": 0.7208253099836185, + "learning_rate": 1.8559900682805712e-05, + "loss": 6.4762, + "step": 1196 + }, + { + "epoch": 0.05573014875340457, + "grad_norm": 0.7975064275464997, + "learning_rate": 1.857541899441341e-05, + "loss": 6.2256, + "step": 1197 + }, + { + "epoch": 0.055776706939497635, + "grad_norm": 0.6690803722150486, + "learning_rate": 1.8590937306021104e-05, + "loss": 6.34, + "step": 1198 + }, + { + "epoch": 0.05582326512559071, + "grad_norm": 0.6025315697095033, + "learning_rate": 1.8606455617628804e-05, + "loss": 6.3477, + "step": 1199 + }, + { + "epoch": 0.055869823311683775, + "grad_norm": 0.7690184508035076, + "learning_rate": 1.86219739292365e-05, + "loss": 6.3888, + "step": 1200 + }, + { + "epoch": 0.05591638149777685, + "grad_norm": 0.7692662762487853, + "learning_rate": 1.8637492240844197e-05, + "loss": 6.3915, + "step": 1201 + }, + { + "epoch": 0.055962939683869915, + "grad_norm": 0.9209778996043965, + "learning_rate": 1.8653010552451893e-05, + "loss": 6.3532, + "step": 1202 + }, + { + "epoch": 0.05600949786996299, + "grad_norm": 0.8576265541688153, + "learning_rate": 1.8668528864059593e-05, + "loss": 6.3228, + "step": 1203 + }, + { + "epoch": 0.056056056056056056, + "grad_norm": 0.6489213384619998, + "learning_rate": 1.868404717566729e-05, + "loss": 6.3473, + "step": 1204 + }, + { + "epoch": 0.05610261424214913, + "grad_norm": 0.7765401903805444, + "learning_rate": 1.8699565487274985e-05, + "loss": 6.4306, + "step": 1205 + }, + { + "epoch": 0.056149172428242196, + "grad_norm": 0.7350719003597579, + "learning_rate": 1.871508379888268e-05, + "loss": 6.4024, + "step": 1206 + }, + { + "epoch": 0.05619573061433526, + "grad_norm": 0.6050392602723362, + "learning_rate": 1.873060211049038e-05, + "loss": 6.3181, + "step": 1207 + }, + { + "epoch": 0.056242288800428336, + "grad_norm": 0.8322189907084426, + "learning_rate": 1.8746120422098078e-05, + "loss": 6.3641, + "step": 1208 + }, + { + "epoch": 0.0562888469865214, + "grad_norm": 0.6743587948821544, + "learning_rate": 1.8761638733705774e-05, + "loss": 6.5194, + "step": 1209 + }, + { + "epoch": 0.056335405172614476, + "grad_norm": 0.7440937053682626, + "learning_rate": 1.877715704531347e-05, + "loss": 6.3191, + "step": 1210 + }, + { + "epoch": 0.05638196335870754, + "grad_norm": 0.5935229351336789, + "learning_rate": 1.879267535692117e-05, + "loss": 6.3187, + "step": 1211 + }, + { + "epoch": 0.05642852154480062, + "grad_norm": 0.5239980491203896, + "learning_rate": 1.8808193668528863e-05, + "loss": 6.5238, + "step": 1212 + }, + { + "epoch": 0.05647507973089368, + "grad_norm": 0.7096768794462892, + "learning_rate": 1.8823711980136563e-05, + "loss": 6.2497, + "step": 1213 + }, + { + "epoch": 0.05652163791698676, + "grad_norm": 0.7119630274210189, + "learning_rate": 1.883923029174426e-05, + "loss": 6.3922, + "step": 1214 + }, + { + "epoch": 0.056568196103079824, + "grad_norm": 0.6437239399355259, + "learning_rate": 1.885474860335196e-05, + "loss": 6.4005, + "step": 1215 + }, + { + "epoch": 0.0566147542891729, + "grad_norm": 0.560301347120155, + "learning_rate": 1.887026691495965e-05, + "loss": 6.5351, + "step": 1216 + }, + { + "epoch": 0.056661312475265964, + "grad_norm": 0.5882798421993322, + "learning_rate": 1.888578522656735e-05, + "loss": 6.417, + "step": 1217 + }, + { + "epoch": 
0.05670787066135903, + "grad_norm": 0.5591080553742092, + "learning_rate": 1.8901303538175048e-05, + "loss": 6.4207, + "step": 1218 + }, + { + "epoch": 0.056754428847452104, + "grad_norm": 0.6931740332872444, + "learning_rate": 1.8916821849782744e-05, + "loss": 6.4369, + "step": 1219 + }, + { + "epoch": 0.05680098703354517, + "grad_norm": 0.8180675766112022, + "learning_rate": 1.893234016139044e-05, + "loss": 6.2402, + "step": 1220 + }, + { + "epoch": 0.056847545219638244, + "grad_norm": 0.675877086521715, + "learning_rate": 1.894785847299814e-05, + "loss": 6.3905, + "step": 1221 + }, + { + "epoch": 0.05689410340573131, + "grad_norm": 0.4964746356954994, + "learning_rate": 1.8963376784605836e-05, + "loss": 6.5015, + "step": 1222 + }, + { + "epoch": 0.056940661591824385, + "grad_norm": 0.6581526208692766, + "learning_rate": 1.8978895096213532e-05, + "loss": 6.3593, + "step": 1223 + }, + { + "epoch": 0.05698721977791745, + "grad_norm": 0.7249569142334936, + "learning_rate": 1.899441340782123e-05, + "loss": 6.4033, + "step": 1224 + }, + { + "epoch": 0.057033777964010525, + "grad_norm": 0.7340136863813289, + "learning_rate": 1.900993171942893e-05, + "loss": 6.3058, + "step": 1225 + }, + { + "epoch": 0.05708033615010359, + "grad_norm": 0.5959452669018487, + "learning_rate": 1.9025450031036625e-05, + "loss": 6.3311, + "step": 1226 + }, + { + "epoch": 0.057126894336196665, + "grad_norm": 0.6885501883449298, + "learning_rate": 1.904096834264432e-05, + "loss": 6.2925, + "step": 1227 + }, + { + "epoch": 0.05717345252228973, + "grad_norm": 0.7501175729356705, + "learning_rate": 1.9056486654252017e-05, + "loss": 6.2888, + "step": 1228 + }, + { + "epoch": 0.0572200107083828, + "grad_norm": 0.7145927987205316, + "learning_rate": 1.9072004965859717e-05, + "loss": 6.4323, + "step": 1229 + }, + { + "epoch": 0.05726656889447587, + "grad_norm": 0.7264104967835293, + "learning_rate": 1.9087523277467413e-05, + "loss": 6.2949, + "step": 1230 + }, + { + "epoch": 0.05731312708056894, + "grad_norm": 0.6217239092992849, + "learning_rate": 1.910304158907511e-05, + "loss": 6.338, + "step": 1231 + }, + { + "epoch": 0.05735968526666201, + "grad_norm": 0.7686827299826945, + "learning_rate": 1.9118559900682806e-05, + "loss": 6.17, + "step": 1232 + }, + { + "epoch": 0.05740624345275508, + "grad_norm": 0.6871050330358077, + "learning_rate": 1.9134078212290506e-05, + "loss": 6.4565, + "step": 1233 + }, + { + "epoch": 0.05745280163884815, + "grad_norm": 0.565128766008799, + "learning_rate": 1.91495965238982e-05, + "loss": 6.2808, + "step": 1234 + }, + { + "epoch": 0.05749935982494122, + "grad_norm": 0.6487959056024154, + "learning_rate": 1.9165114835505898e-05, + "loss": 6.2711, + "step": 1235 + }, + { + "epoch": 0.05754591801103429, + "grad_norm": 0.6155058331228775, + "learning_rate": 1.9180633147113595e-05, + "loss": 6.4265, + "step": 1236 + }, + { + "epoch": 0.05759247619712736, + "grad_norm": 0.6339856952802494, + "learning_rate": 1.9196151458721294e-05, + "loss": 6.3474, + "step": 1237 + }, + { + "epoch": 0.057639034383220426, + "grad_norm": 0.500239852610753, + "learning_rate": 1.9211669770328987e-05, + "loss": 6.3565, + "step": 1238 + }, + { + "epoch": 0.0576855925693135, + "grad_norm": 0.6148622393835158, + "learning_rate": 1.9227188081936687e-05, + "loss": 6.3643, + "step": 1239 + }, + { + "epoch": 0.05773215075540657, + "grad_norm": 0.5991930926942471, + "learning_rate": 1.9242706393544383e-05, + "loss": 6.3039, + "step": 1240 + }, + { + "epoch": 0.05777870894149964, + "grad_norm": 0.6347171754106666, + 
"learning_rate": 1.925822470515208e-05, + "loss": 6.3585, + "step": 1241 + }, + { + "epoch": 0.05782526712759271, + "grad_norm": 0.5574629570230739, + "learning_rate": 1.9273743016759776e-05, + "loss": 6.3018, + "step": 1242 + }, + { + "epoch": 0.05787182531368578, + "grad_norm": 0.7046564692480454, + "learning_rate": 1.9289261328367476e-05, + "loss": 6.3841, + "step": 1243 + }, + { + "epoch": 0.05791838349977885, + "grad_norm": 0.8163884168598736, + "learning_rate": 1.9304779639975172e-05, + "loss": 6.4617, + "step": 1244 + }, + { + "epoch": 0.05796494168587192, + "grad_norm": 0.7741648299207653, + "learning_rate": 1.9320297951582868e-05, + "loss": 6.355, + "step": 1245 + }, + { + "epoch": 0.05801149987196499, + "grad_norm": 0.6682088600812196, + "learning_rate": 1.9335816263190564e-05, + "loss": 6.3774, + "step": 1246 + }, + { + "epoch": 0.05805805805805806, + "grad_norm": 0.7184494109630674, + "learning_rate": 1.9351334574798264e-05, + "loss": 6.3878, + "step": 1247 + }, + { + "epoch": 0.05810461624415113, + "grad_norm": 0.6324644726120034, + "learning_rate": 1.936685288640596e-05, + "loss": 6.4523, + "step": 1248 + }, + { + "epoch": 0.058151174430244194, + "grad_norm": 0.561409122376703, + "learning_rate": 1.9382371198013657e-05, + "loss": 6.2612, + "step": 1249 + }, + { + "epoch": 0.05819773261633727, + "grad_norm": 0.5030540647271404, + "learning_rate": 1.9397889509621353e-05, + "loss": 6.3916, + "step": 1250 + }, + { + "epoch": 0.058244290802430335, + "grad_norm": 0.55090593130108, + "learning_rate": 1.9413407821229053e-05, + "loss": 6.238, + "step": 1251 + }, + { + "epoch": 0.05829084898852341, + "grad_norm": 0.5690707260222048, + "learning_rate": 1.9428926132836746e-05, + "loss": 6.3198, + "step": 1252 + }, + { + "epoch": 0.058337407174616475, + "grad_norm": 0.6268545723871476, + "learning_rate": 1.9444444444444445e-05, + "loss": 6.3128, + "step": 1253 + }, + { + "epoch": 0.05838396536070955, + "grad_norm": 0.6970858193089399, + "learning_rate": 1.945996275605214e-05, + "loss": 6.3543, + "step": 1254 + }, + { + "epoch": 0.058430523546802615, + "grad_norm": 0.5980088939154956, + "learning_rate": 1.947548106765984e-05, + "loss": 6.4059, + "step": 1255 + }, + { + "epoch": 0.05847708173289569, + "grad_norm": 0.6579577699182515, + "learning_rate": 1.9490999379267534e-05, + "loss": 6.2878, + "step": 1256 + }, + { + "epoch": 0.058523639918988755, + "grad_norm": 0.9136318614498549, + "learning_rate": 1.9506517690875234e-05, + "loss": 6.2056, + "step": 1257 + }, + { + "epoch": 0.05857019810508183, + "grad_norm": 1.243872327591, + "learning_rate": 1.952203600248293e-05, + "loss": 6.4058, + "step": 1258 + }, + { + "epoch": 0.058616756291174896, + "grad_norm": 0.8145872789536523, + "learning_rate": 1.9537554314090627e-05, + "loss": 6.2293, + "step": 1259 + }, + { + "epoch": 0.05866331447726796, + "grad_norm": 0.6221357155452153, + "learning_rate": 1.9553072625698323e-05, + "loss": 6.4654, + "step": 1260 + }, + { + "epoch": 0.058709872663361036, + "grad_norm": 0.7533016639873577, + "learning_rate": 1.9568590937306023e-05, + "loss": 6.3993, + "step": 1261 + }, + { + "epoch": 0.0587564308494541, + "grad_norm": 0.7492988510046544, + "learning_rate": 1.958410924891372e-05, + "loss": 6.4388, + "step": 1262 + }, + { + "epoch": 0.058802989035547176, + "grad_norm": 0.6914076156187029, + "learning_rate": 1.9599627560521415e-05, + "loss": 6.2397, + "step": 1263 + }, + { + "epoch": 0.05884954722164024, + "grad_norm": 0.7667121381704649, + "learning_rate": 1.961514587212911e-05, + "loss": 6.3042, + 
"step": 1264 + }, + { + "epoch": 0.05889610540773332, + "grad_norm": 0.7271878281977537, + "learning_rate": 1.963066418373681e-05, + "loss": 6.2333, + "step": 1265 + }, + { + "epoch": 0.05894266359382638, + "grad_norm": 0.6295547745142489, + "learning_rate": 1.9646182495344508e-05, + "loss": 6.2472, + "step": 1266 + }, + { + "epoch": 0.05898922177991946, + "grad_norm": 0.7154789483194144, + "learning_rate": 1.9661700806952204e-05, + "loss": 6.3008, + "step": 1267 + }, + { + "epoch": 0.059035779966012523, + "grad_norm": 0.6464876865682709, + "learning_rate": 1.96772191185599e-05, + "loss": 6.3112, + "step": 1268 + }, + { + "epoch": 0.0590823381521056, + "grad_norm": 0.5831524289537808, + "learning_rate": 1.96927374301676e-05, + "loss": 6.2713, + "step": 1269 + }, + { + "epoch": 0.059128896338198664, + "grad_norm": 0.7024493831531972, + "learning_rate": 1.9708255741775293e-05, + "loss": 6.291, + "step": 1270 + }, + { + "epoch": 0.05917545452429173, + "grad_norm": 0.7005143805905513, + "learning_rate": 1.9723774053382992e-05, + "loss": 6.2998, + "step": 1271 + }, + { + "epoch": 0.059222012710384804, + "grad_norm": 0.7200077522526215, + "learning_rate": 1.9739292364990692e-05, + "loss": 6.3774, + "step": 1272 + }, + { + "epoch": 0.05926857089647787, + "grad_norm": 0.5988853315100849, + "learning_rate": 1.975481067659839e-05, + "loss": 6.307, + "step": 1273 + }, + { + "epoch": 0.059315129082570944, + "grad_norm": 0.5479933383345894, + "learning_rate": 1.9770328988206085e-05, + "loss": 6.3119, + "step": 1274 + }, + { + "epoch": 0.05936168726866401, + "grad_norm": 0.5903526229899355, + "learning_rate": 1.978584729981378e-05, + "loss": 6.2876, + "step": 1275 + }, + { + "epoch": 0.059408245454757085, + "grad_norm": 0.681137468845596, + "learning_rate": 1.980136561142148e-05, + "loss": 6.3779, + "step": 1276 + }, + { + "epoch": 0.05945480364085015, + "grad_norm": 0.7312065297180408, + "learning_rate": 1.9816883923029174e-05, + "loss": 6.2461, + "step": 1277 + }, + { + "epoch": 0.059501361826943225, + "grad_norm": 0.7220689239709784, + "learning_rate": 1.9832402234636873e-05, + "loss": 6.2286, + "step": 1278 + }, + { + "epoch": 0.05954792001303629, + "grad_norm": 0.684767042201151, + "learning_rate": 1.984792054624457e-05, + "loss": 6.3292, + "step": 1279 + }, + { + "epoch": 0.059594478199129365, + "grad_norm": 0.7262426658890504, + "learning_rate": 1.986343885785227e-05, + "loss": 6.1692, + "step": 1280 + }, + { + "epoch": 0.05964103638522243, + "grad_norm": 0.6951335789611552, + "learning_rate": 1.9878957169459962e-05, + "loss": 6.1993, + "step": 1281 + }, + { + "epoch": 0.0596875945713155, + "grad_norm": 0.5460597576047401, + "learning_rate": 1.9894475481067662e-05, + "loss": 6.3035, + "step": 1282 + }, + { + "epoch": 0.05973415275740857, + "grad_norm": 0.8323740011539847, + "learning_rate": 1.990999379267536e-05, + "loss": 6.2097, + "step": 1283 + }, + { + "epoch": 0.05978071094350164, + "grad_norm": 0.6129825857448793, + "learning_rate": 1.9925512104283055e-05, + "loss": 6.1066, + "step": 1284 + }, + { + "epoch": 0.05982726912959471, + "grad_norm": 0.7150876048223967, + "learning_rate": 1.994103041589075e-05, + "loss": 6.2947, + "step": 1285 + }, + { + "epoch": 0.05987382731568778, + "grad_norm": 0.7615321004775006, + "learning_rate": 1.995654872749845e-05, + "loss": 6.3246, + "step": 1286 + }, + { + "epoch": 0.05992038550178085, + "grad_norm": 0.7608253749052487, + "learning_rate": 1.9972067039106147e-05, + "loss": 6.3121, + "step": 1287 + }, + { + "epoch": 0.05996694368787392, + "grad_norm": 
0.7374443593492431, + "learning_rate": 1.9987585350713843e-05, + "loss": 6.1034, + "step": 1288 + }, + { + "epoch": 0.06001350187396699, + "grad_norm": 0.654941103056779, + "learning_rate": 2.000310366232154e-05, + "loss": 6.209, + "step": 1289 + }, + { + "epoch": 0.06006006006006006, + "grad_norm": 0.7378022420174993, + "learning_rate": 2.001862197392924e-05, + "loss": 6.1965, + "step": 1290 + }, + { + "epoch": 0.06010661824615313, + "grad_norm": 0.7481964811846269, + "learning_rate": 2.0034140285536936e-05, + "loss": 6.2501, + "step": 1291 + }, + { + "epoch": 0.0601531764322462, + "grad_norm": 0.7312313249188387, + "learning_rate": 2.0049658597144632e-05, + "loss": 6.2094, + "step": 1292 + }, + { + "epoch": 0.060199734618339266, + "grad_norm": 0.7811287871446825, + "learning_rate": 2.0065176908752328e-05, + "loss": 6.2169, + "step": 1293 + }, + { + "epoch": 0.06024629280443234, + "grad_norm": 0.75097180185914, + "learning_rate": 2.0080695220360028e-05, + "loss": 6.2636, + "step": 1294 + }, + { + "epoch": 0.06029285099052541, + "grad_norm": 0.76937195166477, + "learning_rate": 2.0096213531967724e-05, + "loss": 6.2641, + "step": 1295 + }, + { + "epoch": 0.06033940917661848, + "grad_norm": 0.7705737839457076, + "learning_rate": 2.011173184357542e-05, + "loss": 6.3497, + "step": 1296 + }, + { + "epoch": 0.06038596736271155, + "grad_norm": 0.7227486217285795, + "learning_rate": 2.0127250155183117e-05, + "loss": 6.3026, + "step": 1297 + }, + { + "epoch": 0.06043252554880462, + "grad_norm": 0.6870284988471899, + "learning_rate": 2.0142768466790816e-05, + "loss": 6.3528, + "step": 1298 + }, + { + "epoch": 0.06047908373489769, + "grad_norm": 0.8255377185819142, + "learning_rate": 2.015828677839851e-05, + "loss": 6.2046, + "step": 1299 + }, + { + "epoch": 0.06052564192099076, + "grad_norm": 0.8068488012355359, + "learning_rate": 2.017380509000621e-05, + "loss": 6.1197, + "step": 1300 + }, + { + "epoch": 0.06057220010708383, + "grad_norm": 0.8637395564212825, + "learning_rate": 2.0189323401613905e-05, + "loss": 6.2258, + "step": 1301 + }, + { + "epoch": 0.0606187582931769, + "grad_norm": 1.0105918582825335, + "learning_rate": 2.0204841713221602e-05, + "loss": 6.3282, + "step": 1302 + }, + { + "epoch": 0.06066531647926997, + "grad_norm": 0.9030674517842031, + "learning_rate": 2.0220360024829298e-05, + "loss": 6.2679, + "step": 1303 + }, + { + "epoch": 0.060711874665363035, + "grad_norm": 0.8303905368593595, + "learning_rate": 2.0235878336436998e-05, + "loss": 6.2163, + "step": 1304 + }, + { + "epoch": 0.06075843285145611, + "grad_norm": 1.0658071714122652, + "learning_rate": 2.0251396648044694e-05, + "loss": 6.2205, + "step": 1305 + }, + { + "epoch": 0.060804991037549175, + "grad_norm": 1.0047926454561071, + "learning_rate": 2.026691495965239e-05, + "loss": 6.3537, + "step": 1306 + }, + { + "epoch": 0.06085154922364225, + "grad_norm": 0.8220901380644117, + "learning_rate": 2.0282433271260087e-05, + "loss": 6.2733, + "step": 1307 + }, + { + "epoch": 0.060898107409735315, + "grad_norm": 0.647351377015692, + "learning_rate": 2.0297951582867786e-05, + "loss": 6.1696, + "step": 1308 + }, + { + "epoch": 0.06094466559582839, + "grad_norm": 0.7509860657614991, + "learning_rate": 2.0313469894475483e-05, + "loss": 6.2786, + "step": 1309 + }, + { + "epoch": 0.060991223781921455, + "grad_norm": 0.8303275264303329, + "learning_rate": 2.032898820608318e-05, + "loss": 6.1527, + "step": 1310 + }, + { + "epoch": 0.06103778196801453, + "grad_norm": 0.6571354777230081, + "learning_rate": 2.0344506517690875e-05, + 
"loss": 6.1869, + "step": 1311 + }, + { + "epoch": 0.061084340154107596, + "grad_norm": 0.6482384608837835, + "learning_rate": 2.0360024829298575e-05, + "loss": 6.3084, + "step": 1312 + }, + { + "epoch": 0.06113089834020067, + "grad_norm": 0.6166030602771242, + "learning_rate": 2.037554314090627e-05, + "loss": 6.2306, + "step": 1313 + }, + { + "epoch": 0.061177456526293736, + "grad_norm": 0.730803139848714, + "learning_rate": 2.0391061452513968e-05, + "loss": 6.1364, + "step": 1314 + }, + { + "epoch": 0.0612240147123868, + "grad_norm": 0.5920170221856013, + "learning_rate": 2.0406579764121664e-05, + "loss": 6.2274, + "step": 1315 + }, + { + "epoch": 0.061270572898479876, + "grad_norm": 0.6836475957182014, + "learning_rate": 2.0422098075729364e-05, + "loss": 6.2021, + "step": 1316 + }, + { + "epoch": 0.06131713108457294, + "grad_norm": 0.7771591725262048, + "learning_rate": 2.0437616387337056e-05, + "loss": 6.1322, + "step": 1317 + }, + { + "epoch": 0.061363689270666016, + "grad_norm": 0.6295894280590425, + "learning_rate": 2.0453134698944756e-05, + "loss": 6.2487, + "step": 1318 + }, + { + "epoch": 0.06141024745675908, + "grad_norm": 0.58723734179398, + "learning_rate": 2.0468653010552452e-05, + "loss": 6.255, + "step": 1319 + }, + { + "epoch": 0.06145680564285216, + "grad_norm": 0.6147318998456807, + "learning_rate": 2.0484171322160152e-05, + "loss": 6.2392, + "step": 1320 + }, + { + "epoch": 0.06150336382894522, + "grad_norm": 0.7169986097679937, + "learning_rate": 2.0499689633767845e-05, + "loss": 6.1475, + "step": 1321 + }, + { + "epoch": 0.0615499220150383, + "grad_norm": 0.7253634611418845, + "learning_rate": 2.0515207945375545e-05, + "loss": 6.0956, + "step": 1322 + }, + { + "epoch": 0.061596480201131364, + "grad_norm": 0.5467202857063435, + "learning_rate": 2.053072625698324e-05, + "loss": 6.2886, + "step": 1323 + }, + { + "epoch": 0.06164303838722443, + "grad_norm": 0.6872231502942178, + "learning_rate": 2.0546244568590937e-05, + "loss": 6.1968, + "step": 1324 + }, + { + "epoch": 0.061689596573317504, + "grad_norm": 0.5774833644382681, + "learning_rate": 2.0561762880198634e-05, + "loss": 6.2131, + "step": 1325 + }, + { + "epoch": 0.06173615475941057, + "grad_norm": 0.5763927604099401, + "learning_rate": 2.0577281191806333e-05, + "loss": 6.1314, + "step": 1326 + }, + { + "epoch": 0.061782712945503644, + "grad_norm": 0.6785044259406757, + "learning_rate": 2.059279950341403e-05, + "loss": 6.1247, + "step": 1327 + }, + { + "epoch": 0.06182927113159671, + "grad_norm": 0.687679497922484, + "learning_rate": 2.0608317815021726e-05, + "loss": 6.1408, + "step": 1328 + }, + { + "epoch": 0.061875829317689784, + "grad_norm": 0.6224959542822553, + "learning_rate": 2.0623836126629422e-05, + "loss": 6.2031, + "step": 1329 + }, + { + "epoch": 0.06192238750378285, + "grad_norm": 0.6485408801532406, + "learning_rate": 2.0639354438237122e-05, + "loss": 6.2548, + "step": 1330 + }, + { + "epoch": 0.061968945689875925, + "grad_norm": 0.769215230058325, + "learning_rate": 2.065487274984482e-05, + "loss": 6.2786, + "step": 1331 + }, + { + "epoch": 0.06201550387596899, + "grad_norm": 0.9600608162033986, + "learning_rate": 2.0670391061452515e-05, + "loss": 6.208, + "step": 1332 + }, + { + "epoch": 0.062062062062062065, + "grad_norm": 0.8770712952811274, + "learning_rate": 2.068590937306021e-05, + "loss": 6.0972, + "step": 1333 + }, + { + "epoch": 0.06210862024815513, + "grad_norm": 0.7680245107616164, + "learning_rate": 2.070142768466791e-05, + "loss": 6.1415, + "step": 1334 + }, + { + "epoch": 
0.0621551784342482, + "grad_norm": 0.7456456457675648, + "learning_rate": 2.0716945996275604e-05, + "loss": 6.2024, + "step": 1335 + }, + { + "epoch": 0.06220173662034127, + "grad_norm": 0.7634118013609238, + "learning_rate": 2.0732464307883303e-05, + "loss": 6.2509, + "step": 1336 + }, + { + "epoch": 0.06224829480643434, + "grad_norm": 0.6982536980612091, + "learning_rate": 2.0747982619491e-05, + "loss": 6.1411, + "step": 1337 + }, + { + "epoch": 0.06229485299252741, + "grad_norm": 0.6686080466647104, + "learning_rate": 2.07635009310987e-05, + "loss": 6.1821, + "step": 1338 + }, + { + "epoch": 0.06234141117862048, + "grad_norm": 0.6899089689399113, + "learning_rate": 2.0779019242706392e-05, + "loss": 6.137, + "step": 1339 + }, + { + "epoch": 0.06238796936471355, + "grad_norm": 0.7500843584227981, + "learning_rate": 2.0794537554314092e-05, + "loss": 6.2486, + "step": 1340 + }, + { + "epoch": 0.06243452755080662, + "grad_norm": 0.7211447902730068, + "learning_rate": 2.0810055865921788e-05, + "loss": 6.115, + "step": 1341 + }, + { + "epoch": 0.06248108573689969, + "grad_norm": 0.687136856445477, + "learning_rate": 2.0825574177529484e-05, + "loss": 6.0839, + "step": 1342 + }, + { + "epoch": 0.06252764392299276, + "grad_norm": 0.6766326310798402, + "learning_rate": 2.084109248913718e-05, + "loss": 6.1212, + "step": 1343 + }, + { + "epoch": 0.06257420210908583, + "grad_norm": 0.8648322302027096, + "learning_rate": 2.085661080074488e-05, + "loss": 6.2581, + "step": 1344 + }, + { + "epoch": 0.0626207602951789, + "grad_norm": 0.8841842958744891, + "learning_rate": 2.0872129112352577e-05, + "loss": 6.1282, + "step": 1345 + }, + { + "epoch": 0.06266731848127197, + "grad_norm": 0.6752062244067123, + "learning_rate": 2.0887647423960273e-05, + "loss": 6.2442, + "step": 1346 + }, + { + "epoch": 0.06271387666736504, + "grad_norm": 0.6801062872551623, + "learning_rate": 2.090316573556797e-05, + "loss": 6.1002, + "step": 1347 + }, + { + "epoch": 0.06276043485345811, + "grad_norm": 0.7067616752747568, + "learning_rate": 2.091868404717567e-05, + "loss": 6.1831, + "step": 1348 + }, + { + "epoch": 0.06280699303955117, + "grad_norm": 0.648226850161368, + "learning_rate": 2.0934202358783365e-05, + "loss": 6.2739, + "step": 1349 + }, + { + "epoch": 0.06285355122564425, + "grad_norm": 0.6507155577560055, + "learning_rate": 2.0949720670391062e-05, + "loss": 6.2193, + "step": 1350 + }, + { + "epoch": 0.06290010941173732, + "grad_norm": 0.7588776176290221, + "learning_rate": 2.0965238981998758e-05, + "loss": 6.2086, + "step": 1351 + }, + { + "epoch": 0.0629466675978304, + "grad_norm": 0.7927972081570742, + "learning_rate": 2.0980757293606458e-05, + "loss": 6.1055, + "step": 1352 + }, + { + "epoch": 0.06299322578392345, + "grad_norm": 0.9520924590760093, + "learning_rate": 2.0996275605214154e-05, + "loss": 6.1247, + "step": 1353 + }, + { + "epoch": 0.06303978397001653, + "grad_norm": 1.1258662415062384, + "learning_rate": 2.101179391682185e-05, + "loss": 6.2782, + "step": 1354 + }, + { + "epoch": 0.0630863421561096, + "grad_norm": 0.8169547330940561, + "learning_rate": 2.1027312228429547e-05, + "loss": 6.1408, + "step": 1355 + }, + { + "epoch": 0.06313290034220267, + "grad_norm": 0.7936752575763695, + "learning_rate": 2.1042830540037246e-05, + "loss": 6.1999, + "step": 1356 + }, + { + "epoch": 0.06317945852829573, + "grad_norm": 0.6801463826305967, + "learning_rate": 2.1058348851644943e-05, + "loss": 6.1946, + "step": 1357 + }, + { + "epoch": 0.06322601671438881, + "grad_norm": 0.8580856236868434, + "learning_rate": 
2.107386716325264e-05, + "loss": 6.1784, + "step": 1358 + }, + { + "epoch": 0.06327257490048188, + "grad_norm": 0.8393191772619794, + "learning_rate": 2.108938547486034e-05, + "loss": 6.2403, + "step": 1359 + }, + { + "epoch": 0.06331913308657494, + "grad_norm": 0.6661042311332871, + "learning_rate": 2.110490378646803e-05, + "loss": 6.0668, + "step": 1360 + }, + { + "epoch": 0.06336569127266801, + "grad_norm": 0.8429956960193301, + "learning_rate": 2.112042209807573e-05, + "loss": 6.0898, + "step": 1361 + }, + { + "epoch": 0.06341224945876109, + "grad_norm": 0.9027937873057506, + "learning_rate": 2.1135940409683428e-05, + "loss": 6.2946, + "step": 1362 + }, + { + "epoch": 0.06345880764485416, + "grad_norm": 0.81102573293559, + "learning_rate": 2.1151458721291127e-05, + "loss": 6.1478, + "step": 1363 + }, + { + "epoch": 0.06350536583094722, + "grad_norm": 0.6417148329199566, + "learning_rate": 2.116697703289882e-05, + "loss": 6.2462, + "step": 1364 + }, + { + "epoch": 0.0635519240170403, + "grad_norm": 0.6760238644624427, + "learning_rate": 2.118249534450652e-05, + "loss": 6.1103, + "step": 1365 + }, + { + "epoch": 0.06359848220313337, + "grad_norm": 0.6737837419281294, + "learning_rate": 2.1198013656114216e-05, + "loss": 6.2529, + "step": 1366 + }, + { + "epoch": 0.06364504038922643, + "grad_norm": 0.6840642612182842, + "learning_rate": 2.1213531967721912e-05, + "loss": 6.0782, + "step": 1367 + }, + { + "epoch": 0.0636915985753195, + "grad_norm": 0.8298025292367743, + "learning_rate": 2.122905027932961e-05, + "loss": 6.1658, + "step": 1368 + }, + { + "epoch": 0.06373815676141258, + "grad_norm": 0.8059070750903318, + "learning_rate": 2.124456859093731e-05, + "loss": 6.2474, + "step": 1369 + }, + { + "epoch": 0.06378471494750565, + "grad_norm": 0.7532461838342209, + "learning_rate": 2.1260086902545005e-05, + "loss": 6.1739, + "step": 1370 + }, + { + "epoch": 0.06383127313359871, + "grad_norm": 0.7407230568952936, + "learning_rate": 2.12756052141527e-05, + "loss": 6.1748, + "step": 1371 + }, + { + "epoch": 0.06387783131969178, + "grad_norm": 0.650903184437211, + "learning_rate": 2.1291123525760397e-05, + "loss": 6.0728, + "step": 1372 + }, + { + "epoch": 0.06392438950578486, + "grad_norm": 0.5921011877176497, + "learning_rate": 2.1306641837368097e-05, + "loss": 6.1308, + "step": 1373 + }, + { + "epoch": 0.06397094769187793, + "grad_norm": 0.703473637847032, + "learning_rate": 2.1322160148975793e-05, + "loss": 6.1396, + "step": 1374 + }, + { + "epoch": 0.06401750587797099, + "grad_norm": 0.6796485029469579, + "learning_rate": 2.133767846058349e-05, + "loss": 6.2443, + "step": 1375 + }, + { + "epoch": 0.06406406406406406, + "grad_norm": 0.6774944092021981, + "learning_rate": 2.1353196772191186e-05, + "loss": 6.1163, + "step": 1376 + }, + { + "epoch": 0.06411062225015714, + "grad_norm": 0.7429065795204925, + "learning_rate": 2.1368715083798886e-05, + "loss": 6.1789, + "step": 1377 + }, + { + "epoch": 0.0641571804362502, + "grad_norm": 0.7060562081174825, + "learning_rate": 2.1384233395406582e-05, + "loss": 6.1677, + "step": 1378 + }, + { + "epoch": 0.06420373862234327, + "grad_norm": 0.698256927442398, + "learning_rate": 2.139975170701428e-05, + "loss": 6.0575, + "step": 1379 + }, + { + "epoch": 0.06425029680843634, + "grad_norm": 0.7000343647674999, + "learning_rate": 2.1415270018621975e-05, + "loss": 6.1758, + "step": 1380 + }, + { + "epoch": 0.06429685499452942, + "grad_norm": 0.5931802047029774, + "learning_rate": 2.1430788330229674e-05, + "loss": 6.1141, + "step": 1381 + }, + { + "epoch": 
0.06434341318062248, + "grad_norm": 0.7705475762265437, + "learning_rate": 2.1446306641837367e-05, + "loss": 6.2539, + "step": 1382 + }, + { + "epoch": 0.06438997136671555, + "grad_norm": 0.7414109663437832, + "learning_rate": 2.1461824953445067e-05, + "loss": 6.2914, + "step": 1383 + }, + { + "epoch": 0.06443652955280862, + "grad_norm": 0.5806040291639402, + "learning_rate": 2.1477343265052763e-05, + "loss": 6.2055, + "step": 1384 + }, + { + "epoch": 0.0644830877389017, + "grad_norm": 0.8138032638689164, + "learning_rate": 2.1492861576660463e-05, + "loss": 6.016, + "step": 1385 + }, + { + "epoch": 0.06452964592499476, + "grad_norm": 0.6607230966001687, + "learning_rate": 2.1508379888268156e-05, + "loss": 6.1584, + "step": 1386 + }, + { + "epoch": 0.06457620411108783, + "grad_norm": 0.9698574980107538, + "learning_rate": 2.1523898199875856e-05, + "loss": 6.1204, + "step": 1387 + }, + { + "epoch": 0.0646227622971809, + "grad_norm": 0.911617936523809, + "learning_rate": 2.1539416511483552e-05, + "loss": 6.1348, + "step": 1388 + }, + { + "epoch": 0.06466932048327396, + "grad_norm": 0.6373188555649641, + "learning_rate": 2.1554934823091248e-05, + "loss": 6.1408, + "step": 1389 + }, + { + "epoch": 0.06471587866936704, + "grad_norm": 0.8528928781159736, + "learning_rate": 2.1570453134698945e-05, + "loss": 6.227, + "step": 1390 + }, + { + "epoch": 0.06476243685546011, + "grad_norm": 0.6931586657059707, + "learning_rate": 2.1585971446306644e-05, + "loss": 6.0466, + "step": 1391 + }, + { + "epoch": 0.06480899504155319, + "grad_norm": 0.8883522351697715, + "learning_rate": 2.160148975791434e-05, + "loss": 6.1506, + "step": 1392 + }, + { + "epoch": 0.06485555322764625, + "grad_norm": 0.8687956399717325, + "learning_rate": 2.1617008069522037e-05, + "loss": 6.1204, + "step": 1393 + }, + { + "epoch": 0.06490211141373932, + "grad_norm": 0.895843397122225, + "learning_rate": 2.1632526381129733e-05, + "loss": 6.1729, + "step": 1394 + }, + { + "epoch": 0.06494866959983239, + "grad_norm": 0.9033125844826678, + "learning_rate": 2.1648044692737433e-05, + "loss": 5.9963, + "step": 1395 + }, + { + "epoch": 0.06499522778592547, + "grad_norm": 0.6848075100032349, + "learning_rate": 2.166356300434513e-05, + "loss": 6.1847, + "step": 1396 + }, + { + "epoch": 0.06504178597201853, + "grad_norm": 0.7006707460272938, + "learning_rate": 2.1679081315952825e-05, + "loss": 6.1592, + "step": 1397 + }, + { + "epoch": 0.0650883441581116, + "grad_norm": 0.644992710461075, + "learning_rate": 2.1694599627560522e-05, + "loss": 6.0927, + "step": 1398 + }, + { + "epoch": 0.06513490234420467, + "grad_norm": 0.7358823368285874, + "learning_rate": 2.171011793916822e-05, + "loss": 6.0929, + "step": 1399 + }, + { + "epoch": 0.06518146053029773, + "grad_norm": 0.7028884220770889, + "learning_rate": 2.1725636250775914e-05, + "loss": 6.214, + "step": 1400 + }, + { + "epoch": 0.0652280187163908, + "grad_norm": 0.5765585159049801, + "learning_rate": 2.1741154562383614e-05, + "loss": 6.3164, + "step": 1401 + }, + { + "epoch": 0.06527457690248388, + "grad_norm": 0.7806855352922016, + "learning_rate": 2.175667287399131e-05, + "loss": 6.0492, + "step": 1402 + }, + { + "epoch": 0.06532113508857695, + "grad_norm": 0.9769869970367894, + "learning_rate": 2.177219118559901e-05, + "loss": 6.2143, + "step": 1403 + }, + { + "epoch": 0.06536769327467001, + "grad_norm": 1.0507623456297293, + "learning_rate": 2.1787709497206703e-05, + "loss": 6.128, + "step": 1404 + }, + { + "epoch": 0.06541425146076309, + "grad_norm": 0.7519476351045203, + "learning_rate": 
2.1803227808814403e-05, + "loss": 6.1642, + "step": 1405 + }, + { + "epoch": 0.06546080964685616, + "grad_norm": 0.7424400481963169, + "learning_rate": 2.18187461204221e-05, + "loss": 6.0921, + "step": 1406 + }, + { + "epoch": 0.06550736783294923, + "grad_norm": 0.9257335426523406, + "learning_rate": 2.1834264432029795e-05, + "loss": 6.1507, + "step": 1407 + }, + { + "epoch": 0.0655539260190423, + "grad_norm": 0.9688154370687347, + "learning_rate": 2.184978274363749e-05, + "loss": 6.1581, + "step": 1408 + }, + { + "epoch": 0.06560048420513537, + "grad_norm": 0.9928020569428421, + "learning_rate": 2.186530105524519e-05, + "loss": 5.896, + "step": 1409 + }, + { + "epoch": 0.06564704239122844, + "grad_norm": 1.1990662578845572, + "learning_rate": 2.1880819366852888e-05, + "loss": 6.0493, + "step": 1410 + }, + { + "epoch": 0.0656936005773215, + "grad_norm": 0.6349226246880854, + "learning_rate": 2.1896337678460584e-05, + "loss": 6.174, + "step": 1411 + }, + { + "epoch": 0.06574015876341457, + "grad_norm": 0.7533605884988993, + "learning_rate": 2.191185599006828e-05, + "loss": 6.1392, + "step": 1412 + }, + { + "epoch": 0.06578671694950765, + "grad_norm": 1.122059144300556, + "learning_rate": 2.192737430167598e-05, + "loss": 6.0713, + "step": 1413 + }, + { + "epoch": 0.06583327513560072, + "grad_norm": 0.9599645143426182, + "learning_rate": 2.1942892613283676e-05, + "loss": 6.1662, + "step": 1414 + }, + { + "epoch": 0.06587983332169378, + "grad_norm": 0.8169488052509147, + "learning_rate": 2.1958410924891373e-05, + "loss": 5.9966, + "step": 1415 + }, + { + "epoch": 0.06592639150778686, + "grad_norm": 0.5398787846700986, + "learning_rate": 2.197392923649907e-05, + "loss": 6.0343, + "step": 1416 + }, + { + "epoch": 0.06597294969387993, + "grad_norm": 0.672768993274694, + "learning_rate": 2.198944754810677e-05, + "loss": 6.0393, + "step": 1417 + }, + { + "epoch": 0.066019507879973, + "grad_norm": 0.7962548278576558, + "learning_rate": 2.200496585971446e-05, + "loss": 6.0499, + "step": 1418 + }, + { + "epoch": 0.06606606606606606, + "grad_norm": 0.6605915841166744, + "learning_rate": 2.202048417132216e-05, + "loss": 5.9571, + "step": 1419 + }, + { + "epoch": 0.06611262425215914, + "grad_norm": 0.6515621325864533, + "learning_rate": 2.2036002482929857e-05, + "loss": 6.0413, + "step": 1420 + }, + { + "epoch": 0.06615918243825221, + "grad_norm": 0.7406872477811965, + "learning_rate": 2.2051520794537557e-05, + "loss": 6.0947, + "step": 1421 + }, + { + "epoch": 0.06620574062434527, + "grad_norm": 0.7568613560064998, + "learning_rate": 2.206703910614525e-05, + "loss": 6.027, + "step": 1422 + }, + { + "epoch": 0.06625229881043834, + "grad_norm": 0.6808655581138766, + "learning_rate": 2.208255741775295e-05, + "loss": 5.9566, + "step": 1423 + }, + { + "epoch": 0.06629885699653142, + "grad_norm": 0.5806544280031103, + "learning_rate": 2.2098075729360646e-05, + "loss": 6.0889, + "step": 1424 + }, + { + "epoch": 0.06634541518262449, + "grad_norm": 0.7419580255565701, + "learning_rate": 2.2113594040968342e-05, + "loss": 6.0168, + "step": 1425 + }, + { + "epoch": 0.06639197336871755, + "grad_norm": 0.7933793324493793, + "learning_rate": 2.212911235257604e-05, + "loss": 6.0198, + "step": 1426 + }, + { + "epoch": 0.06643853155481062, + "grad_norm": 0.7642276096096984, + "learning_rate": 2.214463066418374e-05, + "loss": 6.0771, + "step": 1427 + }, + { + "epoch": 0.0664850897409037, + "grad_norm": 0.7292022311784555, + "learning_rate": 2.2160148975791435e-05, + "loss": 6.1296, + "step": 1428 + }, + { + "epoch": 
0.06653164792699677, + "grad_norm": 0.7494313634089341, + "learning_rate": 2.217566728739913e-05, + "loss": 6.1823, + "step": 1429 + }, + { + "epoch": 0.06657820611308983, + "grad_norm": 0.818130202001924, + "learning_rate": 2.2191185599006827e-05, + "loss": 6.0372, + "step": 1430 + }, + { + "epoch": 0.0666247642991829, + "grad_norm": 0.7457968091811871, + "learning_rate": 2.2206703910614527e-05, + "loss": 6.149, + "step": 1431 + }, + { + "epoch": 0.06667132248527598, + "grad_norm": 0.7192289241416067, + "learning_rate": 2.2222222222222223e-05, + "loss": 6.1832, + "step": 1432 + }, + { + "epoch": 0.06671788067136904, + "grad_norm": 0.9358100152977759, + "learning_rate": 2.223774053382992e-05, + "loss": 6.0888, + "step": 1433 + }, + { + "epoch": 0.06676443885746211, + "grad_norm": 0.8657241389797187, + "learning_rate": 2.2253258845437616e-05, + "loss": 6.1226, + "step": 1434 + }, + { + "epoch": 0.06681099704355518, + "grad_norm": 0.6221145434540217, + "learning_rate": 2.2268777157045316e-05, + "loss": 6.0512, + "step": 1435 + }, + { + "epoch": 0.06685755522964826, + "grad_norm": 0.7309864661337896, + "learning_rate": 2.2284295468653012e-05, + "loss": 6.0076, + "step": 1436 + }, + { + "epoch": 0.06690411341574132, + "grad_norm": 0.6635243168436654, + "learning_rate": 2.2299813780260708e-05, + "loss": 6.0897, + "step": 1437 + }, + { + "epoch": 0.06695067160183439, + "grad_norm": 0.5691400512233171, + "learning_rate": 2.2315332091868405e-05, + "loss": 6.0354, + "step": 1438 + }, + { + "epoch": 0.06699722978792746, + "grad_norm": 0.6619614957484472, + "learning_rate": 2.2330850403476104e-05, + "loss": 6.0394, + "step": 1439 + }, + { + "epoch": 0.06704378797402054, + "grad_norm": 0.6827517402100792, + "learning_rate": 2.2346368715083797e-05, + "loss": 6.0171, + "step": 1440 + }, + { + "epoch": 0.0670903461601136, + "grad_norm": 0.6674301207226152, + "learning_rate": 2.2361887026691497e-05, + "loss": 6.0663, + "step": 1441 + }, + { + "epoch": 0.06713690434620667, + "grad_norm": 0.7922118267913472, + "learning_rate": 2.2377405338299197e-05, + "loss": 6.0014, + "step": 1442 + }, + { + "epoch": 0.06718346253229975, + "grad_norm": 0.68480093221736, + "learning_rate": 2.2392923649906893e-05, + "loss": 6.1416, + "step": 1443 + }, + { + "epoch": 0.0672300207183928, + "grad_norm": 0.6061668622099068, + "learning_rate": 2.240844196151459e-05, + "loss": 6.1296, + "step": 1444 + }, + { + "epoch": 0.06727657890448588, + "grad_norm": 0.7355219651702408, + "learning_rate": 2.2423960273122285e-05, + "loss": 6.1033, + "step": 1445 + }, + { + "epoch": 0.06732313709057895, + "grad_norm": 0.7066771090331968, + "learning_rate": 2.2439478584729985e-05, + "loss": 5.9442, + "step": 1446 + }, + { + "epoch": 0.06736969527667203, + "grad_norm": 0.670258599480405, + "learning_rate": 2.2454996896337678e-05, + "loss": 6.0563, + "step": 1447 + }, + { + "epoch": 0.06741625346276509, + "grad_norm": 0.910354903017208, + "learning_rate": 2.2470515207945378e-05, + "loss": 6.1195, + "step": 1448 + }, + { + "epoch": 0.06746281164885816, + "grad_norm": 0.7814049236942331, + "learning_rate": 2.2486033519553074e-05, + "loss": 5.9702, + "step": 1449 + }, + { + "epoch": 0.06750936983495123, + "grad_norm": 0.7228213227934791, + "learning_rate": 2.250155183116077e-05, + "loss": 6.0797, + "step": 1450 + }, + { + "epoch": 0.0675559280210443, + "grad_norm": 0.8323589743734803, + "learning_rate": 2.2517070142768467e-05, + "loss": 6.0658, + "step": 1451 + }, + { + "epoch": 0.06760248620713737, + "grad_norm": 0.5907958185647235, + 
"learning_rate": 2.2532588454376166e-05, + "loss": 6.0988, + "step": 1452 + }, + { + "epoch": 0.06764904439323044, + "grad_norm": 0.6076046058541296, + "learning_rate": 2.2548106765983863e-05, + "loss": 5.9973, + "step": 1453 + }, + { + "epoch": 0.06769560257932351, + "grad_norm": 0.6840424104560496, + "learning_rate": 2.256362507759156e-05, + "loss": 6.0963, + "step": 1454 + }, + { + "epoch": 0.06774216076541657, + "grad_norm": 0.7614288886164987, + "learning_rate": 2.2579143389199255e-05, + "loss": 6.0791, + "step": 1455 + }, + { + "epoch": 0.06778871895150965, + "grad_norm": 0.9046954561429771, + "learning_rate": 2.2594661700806955e-05, + "loss": 6.0863, + "step": 1456 + }, + { + "epoch": 0.06783527713760272, + "grad_norm": 0.6229089454623609, + "learning_rate": 2.261018001241465e-05, + "loss": 5.9789, + "step": 1457 + }, + { + "epoch": 0.0678818353236958, + "grad_norm": 0.8426364742990868, + "learning_rate": 2.2625698324022348e-05, + "loss": 6.0958, + "step": 1458 + }, + { + "epoch": 0.06792839350978885, + "grad_norm": 0.8444364652700108, + "learning_rate": 2.2641216635630044e-05, + "loss": 6.0862, + "step": 1459 + }, + { + "epoch": 0.06797495169588193, + "grad_norm": 0.8132966789233589, + "learning_rate": 2.2656734947237744e-05, + "loss": 6.0634, + "step": 1460 + }, + { + "epoch": 0.068021509881975, + "grad_norm": 0.8493014696903661, + "learning_rate": 2.267225325884544e-05, + "loss": 6.0021, + "step": 1461 + }, + { + "epoch": 0.06806806806806807, + "grad_norm": 0.9168405282350098, + "learning_rate": 2.2687771570453136e-05, + "loss": 6.0857, + "step": 1462 + }, + { + "epoch": 0.06811462625416113, + "grad_norm": 0.9759733088555468, + "learning_rate": 2.2703289882060833e-05, + "loss": 5.9711, + "step": 1463 + }, + { + "epoch": 0.06816118444025421, + "grad_norm": 0.6787890762737964, + "learning_rate": 2.2718808193668532e-05, + "loss": 5.9748, + "step": 1464 + }, + { + "epoch": 0.06820774262634728, + "grad_norm": 0.7662856251200494, + "learning_rate": 2.2734326505276225e-05, + "loss": 6.0148, + "step": 1465 + }, + { + "epoch": 0.06825430081244034, + "grad_norm": 0.6337273476864449, + "learning_rate": 2.2749844816883925e-05, + "loss": 6.1228, + "step": 1466 + }, + { + "epoch": 0.06830085899853341, + "grad_norm": 0.6990085587038067, + "learning_rate": 2.276536312849162e-05, + "loss": 6.0347, + "step": 1467 + }, + { + "epoch": 0.06834741718462649, + "grad_norm": 0.7912103082463547, + "learning_rate": 2.278088144009932e-05, + "loss": 5.9913, + "step": 1468 + }, + { + "epoch": 0.06839397537071956, + "grad_norm": 0.7278758793530137, + "learning_rate": 2.2796399751707014e-05, + "loss": 5.9858, + "step": 1469 + }, + { + "epoch": 0.06844053355681262, + "grad_norm": 0.6763277603136141, + "learning_rate": 2.2811918063314713e-05, + "loss": 6.0884, + "step": 1470 + }, + { + "epoch": 0.0684870917429057, + "grad_norm": 0.7235543966698148, + "learning_rate": 2.282743637492241e-05, + "loss": 6.0253, + "step": 1471 + }, + { + "epoch": 0.06853364992899877, + "grad_norm": 0.614112352933398, + "learning_rate": 2.2842954686530106e-05, + "loss": 6.1067, + "step": 1472 + }, + { + "epoch": 0.06858020811509184, + "grad_norm": 0.6321650344186108, + "learning_rate": 2.2858472998137802e-05, + "loss": 6.0342, + "step": 1473 + }, + { + "epoch": 0.0686267663011849, + "grad_norm": 0.6550416820458502, + "learning_rate": 2.2873991309745502e-05, + "loss": 6.1111, + "step": 1474 + }, + { + "epoch": 0.06867332448727798, + "grad_norm": 0.6183882882286503, + "learning_rate": 2.28895096213532e-05, + "loss": 6.1019, + "step": 
1475 + }, + { + "epoch": 0.06871988267337105, + "grad_norm": 0.6251883355509026, + "learning_rate": 2.2905027932960895e-05, + "loss": 6.1545, + "step": 1476 + }, + { + "epoch": 0.06876644085946411, + "grad_norm": 0.6135299834548091, + "learning_rate": 2.292054624456859e-05, + "loss": 5.9652, + "step": 1477 + }, + { + "epoch": 0.06881299904555718, + "grad_norm": 0.6149899817360133, + "learning_rate": 2.293606455617629e-05, + "loss": 6.074, + "step": 1478 + }, + { + "epoch": 0.06885955723165026, + "grad_norm": 0.7520247266775094, + "learning_rate": 2.2951582867783987e-05, + "loss": 6.0275, + "step": 1479 + }, + { + "epoch": 0.06890611541774333, + "grad_norm": 0.7124135764058, + "learning_rate": 2.2967101179391683e-05, + "loss": 6.0349, + "step": 1480 + }, + { + "epoch": 0.06895267360383639, + "grad_norm": 0.6680391307466349, + "learning_rate": 2.298261949099938e-05, + "loss": 5.9366, + "step": 1481 + }, + { + "epoch": 0.06899923178992946, + "grad_norm": 0.7957095569357405, + "learning_rate": 2.299813780260708e-05, + "loss": 6.0275, + "step": 1482 + }, + { + "epoch": 0.06904578997602254, + "grad_norm": 0.7105959206695466, + "learning_rate": 2.3013656114214772e-05, + "loss": 6.0742, + "step": 1483 + }, + { + "epoch": 0.06909234816211561, + "grad_norm": 0.5996260073353893, + "learning_rate": 2.3029174425822472e-05, + "loss": 6.1542, + "step": 1484 + }, + { + "epoch": 0.06913890634820867, + "grad_norm": 0.6081146879016882, + "learning_rate": 2.3044692737430168e-05, + "loss": 5.9536, + "step": 1485 + }, + { + "epoch": 0.06918546453430174, + "grad_norm": 0.899152023339351, + "learning_rate": 2.3060211049037868e-05, + "loss": 5.9675, + "step": 1486 + }, + { + "epoch": 0.06923202272039482, + "grad_norm": 1.088961177155693, + "learning_rate": 2.307572936064556e-05, + "loss": 5.9481, + "step": 1487 + }, + { + "epoch": 0.06927858090648788, + "grad_norm": 0.9384902506407606, + "learning_rate": 2.309124767225326e-05, + "loss": 6.0621, + "step": 1488 + }, + { + "epoch": 0.06932513909258095, + "grad_norm": 0.7829705876822768, + "learning_rate": 2.3106765983860957e-05, + "loss": 6.0956, + "step": 1489 + }, + { + "epoch": 0.06937169727867402, + "grad_norm": 0.6642070439554743, + "learning_rate": 2.3122284295468653e-05, + "loss": 5.9874, + "step": 1490 + }, + { + "epoch": 0.0694182554647671, + "grad_norm": 1.0120134178174043, + "learning_rate": 2.313780260707635e-05, + "loss": 5.937, + "step": 1491 + }, + { + "epoch": 0.06946481365086016, + "grad_norm": 1.0544922904236311, + "learning_rate": 2.315332091868405e-05, + "loss": 5.9777, + "step": 1492 + }, + { + "epoch": 0.06951137183695323, + "grad_norm": 0.7158863415327468, + "learning_rate": 2.3168839230291745e-05, + "loss": 6.0468, + "step": 1493 + }, + { + "epoch": 0.0695579300230463, + "grad_norm": 0.7374866255240005, + "learning_rate": 2.3184357541899442e-05, + "loss": 6.1003, + "step": 1494 + }, + { + "epoch": 0.06960448820913938, + "grad_norm": 0.8550244181949099, + "learning_rate": 2.3199875853507138e-05, + "loss": 5.9422, + "step": 1495 + }, + { + "epoch": 0.06965104639523244, + "grad_norm": 0.6936233807200863, + "learning_rate": 2.3215394165114838e-05, + "loss": 6.0568, + "step": 1496 + }, + { + "epoch": 0.06969760458132551, + "grad_norm": 0.7706301064727444, + "learning_rate": 2.3230912476722534e-05, + "loss": 5.9221, + "step": 1497 + }, + { + "epoch": 0.06974416276741859, + "grad_norm": 0.7876744019330344, + "learning_rate": 2.324643078833023e-05, + "loss": 6.0254, + "step": 1498 + }, + { + "epoch": 0.06979072095351165, + "grad_norm": 
0.7422064191014008, + "learning_rate": 2.3261949099937927e-05, + "loss": 5.9464, + "step": 1499 + }, + { + "epoch": 0.06983727913960472, + "grad_norm": 0.6585817965561468, + "learning_rate": 2.3277467411545626e-05, + "loss": 5.9413, + "step": 1500 + }, + { + "epoch": 0.06988383732569779, + "grad_norm": 0.7678118052710909, + "learning_rate": 2.3292985723153323e-05, + "loss": 5.9331, + "step": 1501 + }, + { + "epoch": 0.06993039551179087, + "grad_norm": 0.6870717146044155, + "learning_rate": 2.330850403476102e-05, + "loss": 6.0862, + "step": 1502 + }, + { + "epoch": 0.06997695369788393, + "grad_norm": 0.5488258426379458, + "learning_rate": 2.3324022346368715e-05, + "loss": 6.0274, + "step": 1503 + }, + { + "epoch": 0.070023511883977, + "grad_norm": 0.76850800824856, + "learning_rate": 2.3339540657976415e-05, + "loss": 5.8609, + "step": 1504 + }, + { + "epoch": 0.07007007007007007, + "grad_norm": 0.6976159245364101, + "learning_rate": 2.3355058969584108e-05, + "loss": 5.887, + "step": 1505 + }, + { + "epoch": 0.07011662825616315, + "grad_norm": 0.8035347398069256, + "learning_rate": 2.3370577281191808e-05, + "loss": 6.004, + "step": 1506 + }, + { + "epoch": 0.0701631864422562, + "grad_norm": 0.9013914991536627, + "learning_rate": 2.3386095592799504e-05, + "loss": 5.9563, + "step": 1507 + }, + { + "epoch": 0.07020974462834928, + "grad_norm": 0.9988970691506449, + "learning_rate": 2.34016139044072e-05, + "loss": 6.0378, + "step": 1508 + }, + { + "epoch": 0.07025630281444235, + "grad_norm": 0.9582413327959646, + "learning_rate": 2.3417132216014897e-05, + "loss": 5.9614, + "step": 1509 + }, + { + "epoch": 0.07030286100053541, + "grad_norm": 0.8980755567548733, + "learning_rate": 2.3432650527622596e-05, + "loss": 5.8196, + "step": 1510 + }, + { + "epoch": 0.07034941918662849, + "grad_norm": 0.6736787991512365, + "learning_rate": 2.3448168839230293e-05, + "loss": 5.8899, + "step": 1511 + }, + { + "epoch": 0.07039597737272156, + "grad_norm": 0.7077524869868455, + "learning_rate": 2.346368715083799e-05, + "loss": 5.9191, + "step": 1512 + }, + { + "epoch": 0.07044253555881463, + "grad_norm": 0.8452846937749114, + "learning_rate": 2.3479205462445685e-05, + "loss": 5.922, + "step": 1513 + }, + { + "epoch": 0.0704890937449077, + "grad_norm": 1.0857971219044344, + "learning_rate": 2.3494723774053385e-05, + "loss": 5.8728, + "step": 1514 + }, + { + "epoch": 0.07053565193100077, + "grad_norm": 0.7405356378947919, + "learning_rate": 2.351024208566108e-05, + "loss": 5.9204, + "step": 1515 + }, + { + "epoch": 0.07058221011709384, + "grad_norm": 0.5439664524855619, + "learning_rate": 2.3525760397268777e-05, + "loss": 5.9173, + "step": 1516 + }, + { + "epoch": 0.07062876830318691, + "grad_norm": 0.7946040975626111, + "learning_rate": 2.3541278708876474e-05, + "loss": 5.807, + "step": 1517 + }, + { + "epoch": 0.07067532648927997, + "grad_norm": 0.7594681636016971, + "learning_rate": 2.3556797020484173e-05, + "loss": 6.0227, + "step": 1518 + }, + { + "epoch": 0.07072188467537305, + "grad_norm": 0.5998682024735214, + "learning_rate": 2.357231533209187e-05, + "loss": 5.9333, + "step": 1519 + }, + { + "epoch": 0.07076844286146612, + "grad_norm": 0.6900277133464825, + "learning_rate": 2.3587833643699566e-05, + "loss": 6.0885, + "step": 1520 + }, + { + "epoch": 0.07081500104755918, + "grad_norm": 0.8365101391434859, + "learning_rate": 2.3603351955307262e-05, + "loss": 5.829, + "step": 1521 + }, + { + "epoch": 0.07086155923365225, + "grad_norm": 0.536904777331685, + "learning_rate": 2.3618870266914962e-05, + "loss": 
6.0342, + "step": 1522 + }, + { + "epoch": 0.07090811741974533, + "grad_norm": 0.7778066549409949, + "learning_rate": 2.3634388578522655e-05, + "loss": 5.8929, + "step": 1523 + }, + { + "epoch": 0.0709546756058384, + "grad_norm": 0.695173488403722, + "learning_rate": 2.3649906890130355e-05, + "loss": 5.8626, + "step": 1524 + }, + { + "epoch": 0.07100123379193146, + "grad_norm": 0.7043807780936743, + "learning_rate": 2.366542520173805e-05, + "loss": 5.8965, + "step": 1525 + }, + { + "epoch": 0.07104779197802454, + "grad_norm": 0.5811871028999548, + "learning_rate": 2.368094351334575e-05, + "loss": 6.0122, + "step": 1526 + }, + { + "epoch": 0.07109435016411761, + "grad_norm": 0.6215669988306112, + "learning_rate": 2.3696461824953447e-05, + "loss": 5.9437, + "step": 1527 + }, + { + "epoch": 0.07114090835021068, + "grad_norm": 0.5324845053634573, + "learning_rate": 2.3711980136561143e-05, + "loss": 5.9518, + "step": 1528 + }, + { + "epoch": 0.07118746653630374, + "grad_norm": 0.563331034381991, + "learning_rate": 2.3727498448168843e-05, + "loss": 6.0356, + "step": 1529 + }, + { + "epoch": 0.07123402472239682, + "grad_norm": 0.5439931725513858, + "learning_rate": 2.3743016759776536e-05, + "loss": 5.8223, + "step": 1530 + }, + { + "epoch": 0.07128058290848989, + "grad_norm": 0.6362047887891105, + "learning_rate": 2.3758535071384236e-05, + "loss": 5.9883, + "step": 1531 + }, + { + "epoch": 0.07132714109458295, + "grad_norm": 0.5502742302018496, + "learning_rate": 2.3774053382991932e-05, + "loss": 5.9511, + "step": 1532 + }, + { + "epoch": 0.07137369928067602, + "grad_norm": 0.673022875274332, + "learning_rate": 2.378957169459963e-05, + "loss": 5.981, + "step": 1533 + }, + { + "epoch": 0.0714202574667691, + "grad_norm": 0.6591432272342598, + "learning_rate": 2.3805090006207325e-05, + "loss": 5.8883, + "step": 1534 + }, + { + "epoch": 0.07146681565286217, + "grad_norm": 0.6590478220455643, + "learning_rate": 2.3820608317815024e-05, + "loss": 5.9925, + "step": 1535 + }, + { + "epoch": 0.07151337383895523, + "grad_norm": 0.6581359880030528, + "learning_rate": 2.383612662942272e-05, + "loss": 6.0704, + "step": 1536 + }, + { + "epoch": 0.0715599320250483, + "grad_norm": 0.7623314967745499, + "learning_rate": 2.3851644941030417e-05, + "loss": 5.9468, + "step": 1537 + }, + { + "epoch": 0.07160649021114138, + "grad_norm": 0.6883979102678789, + "learning_rate": 2.3867163252638113e-05, + "loss": 5.976, + "step": 1538 + }, + { + "epoch": 0.07165304839723445, + "grad_norm": 0.82625513784746, + "learning_rate": 2.3882681564245813e-05, + "loss": 5.9158, + "step": 1539 + }, + { + "epoch": 0.07169960658332751, + "grad_norm": 0.801777529813102, + "learning_rate": 2.389819987585351e-05, + "loss": 5.9172, + "step": 1540 + }, + { + "epoch": 0.07174616476942058, + "grad_norm": 0.6716326119746844, + "learning_rate": 2.3913718187461205e-05, + "loss": 5.9755, + "step": 1541 + }, + { + "epoch": 0.07179272295551366, + "grad_norm": 0.6704686558770426, + "learning_rate": 2.3929236499068902e-05, + "loss": 6.018, + "step": 1542 + }, + { + "epoch": 0.07183928114160672, + "grad_norm": 0.7352607005444383, + "learning_rate": 2.39447548106766e-05, + "loss": 5.8381, + "step": 1543 + }, + { + "epoch": 0.07188583932769979, + "grad_norm": 0.6046467809398888, + "learning_rate": 2.3960273122284298e-05, + "loss": 5.8511, + "step": 1544 + }, + { + "epoch": 0.07193239751379286, + "grad_norm": 0.818542549338031, + "learning_rate": 2.3975791433891994e-05, + "loss": 5.8984, + "step": 1545 + }, + { + "epoch": 0.07197895569988594, + "grad_norm": 
0.8022991941726143, + "learning_rate": 2.399130974549969e-05, + "loss": 5.7514, + "step": 1546 + }, + { + "epoch": 0.072025513885979, + "grad_norm": 0.6395362629404586, + "learning_rate": 2.400682805710739e-05, + "loss": 5.8142, + "step": 1547 + }, + { + "epoch": 0.07207207207207207, + "grad_norm": 0.8288259249566714, + "learning_rate": 2.4022346368715083e-05, + "loss": 5.9047, + "step": 1548 + }, + { + "epoch": 0.07211863025816515, + "grad_norm": 0.7312820355772202, + "learning_rate": 2.4037864680322783e-05, + "loss": 6.046, + "step": 1549 + }, + { + "epoch": 0.0721651884442582, + "grad_norm": 0.7998535538813825, + "learning_rate": 2.405338299193048e-05, + "loss": 5.9792, + "step": 1550 + }, + { + "epoch": 0.07221174663035128, + "grad_norm": 0.7325283643860476, + "learning_rate": 2.406890130353818e-05, + "loss": 5.9426, + "step": 1551 + }, + { + "epoch": 0.07225830481644435, + "grad_norm": 0.78599300305126, + "learning_rate": 2.408441961514587e-05, + "loss": 5.9865, + "step": 1552 + }, + { + "epoch": 0.07230486300253743, + "grad_norm": 0.7023202258761898, + "learning_rate": 2.409993792675357e-05, + "loss": 5.9518, + "step": 1553 + }, + { + "epoch": 0.07235142118863049, + "grad_norm": 0.7302351233401009, + "learning_rate": 2.4115456238361268e-05, + "loss": 5.8109, + "step": 1554 + }, + { + "epoch": 0.07239797937472356, + "grad_norm": 0.7411075773806606, + "learning_rate": 2.4130974549968964e-05, + "loss": 5.8987, + "step": 1555 + }, + { + "epoch": 0.07244453756081663, + "grad_norm": 0.7945544655927558, + "learning_rate": 2.414649286157666e-05, + "loss": 5.9644, + "step": 1556 + }, + { + "epoch": 0.0724910957469097, + "grad_norm": 0.6179072448995512, + "learning_rate": 2.416201117318436e-05, + "loss": 5.9163, + "step": 1557 + }, + { + "epoch": 0.07253765393300277, + "grad_norm": 0.7097450575168622, + "learning_rate": 2.4177529484792056e-05, + "loss": 5.9151, + "step": 1558 + }, + { + "epoch": 0.07258421211909584, + "grad_norm": 0.660251535148705, + "learning_rate": 2.4193047796399753e-05, + "loss": 5.9877, + "step": 1559 + }, + { + "epoch": 0.07263077030518891, + "grad_norm": 0.6693483386968752, + "learning_rate": 2.420856610800745e-05, + "loss": 5.985, + "step": 1560 + }, + { + "epoch": 0.07267732849128197, + "grad_norm": 0.6325987400516823, + "learning_rate": 2.422408441961515e-05, + "loss": 5.894, + "step": 1561 + }, + { + "epoch": 0.07272388667737505, + "grad_norm": 0.6068024966930871, + "learning_rate": 2.4239602731222845e-05, + "loss": 5.8759, + "step": 1562 + }, + { + "epoch": 0.07277044486346812, + "grad_norm": 0.7276771705280164, + "learning_rate": 2.425512104283054e-05, + "loss": 5.9063, + "step": 1563 + }, + { + "epoch": 0.0728170030495612, + "grad_norm": 0.6928784083962777, + "learning_rate": 2.4270639354438237e-05, + "loss": 5.8615, + "step": 1564 + }, + { + "epoch": 0.07286356123565425, + "grad_norm": 0.7670271883962401, + "learning_rate": 2.4286157666045937e-05, + "loss": 5.9485, + "step": 1565 + }, + { + "epoch": 0.07291011942174733, + "grad_norm": 0.7095285564849014, + "learning_rate": 2.4301675977653633e-05, + "loss": 5.8917, + "step": 1566 + }, + { + "epoch": 0.0729566776078404, + "grad_norm": 0.7408066912831929, + "learning_rate": 2.431719428926133e-05, + "loss": 5.9652, + "step": 1567 + }, + { + "epoch": 0.07300323579393347, + "grad_norm": 0.741103484224209, + "learning_rate": 2.4332712600869026e-05, + "loss": 5.9159, + "step": 1568 + }, + { + "epoch": 0.07304979398002653, + "grad_norm": 0.7063003219379159, + "learning_rate": 2.4348230912476726e-05, + "loss": 5.9732, + 
"step": 1569 + }, + { + "epoch": 0.07309635216611961, + "grad_norm": 0.6431185305816309, + "learning_rate": 2.436374922408442e-05, + "loss": 5.9245, + "step": 1570 + }, + { + "epoch": 0.07314291035221268, + "grad_norm": 0.6725993903486213, + "learning_rate": 2.437926753569212e-05, + "loss": 5.8384, + "step": 1571 + }, + { + "epoch": 0.07318946853830574, + "grad_norm": 0.656163099392436, + "learning_rate": 2.4394785847299815e-05, + "loss": 5.9805, + "step": 1572 + }, + { + "epoch": 0.07323602672439881, + "grad_norm": 0.753163624568708, + "learning_rate": 2.441030415890751e-05, + "loss": 5.9813, + "step": 1573 + }, + { + "epoch": 0.07328258491049189, + "grad_norm": 0.8046057768192669, + "learning_rate": 2.4425822470515207e-05, + "loss": 5.7312, + "step": 1574 + }, + { + "epoch": 0.07332914309658496, + "grad_norm": 0.8963848343670441, + "learning_rate": 2.4441340782122907e-05, + "loss": 5.8786, + "step": 1575 + }, + { + "epoch": 0.07337570128267802, + "grad_norm": 0.7614521533058337, + "learning_rate": 2.4456859093730603e-05, + "loss": 5.921, + "step": 1576 + }, + { + "epoch": 0.0734222594687711, + "grad_norm": 0.799212027237626, + "learning_rate": 2.44723774053383e-05, + "loss": 5.9809, + "step": 1577 + }, + { + "epoch": 0.07346881765486417, + "grad_norm": 0.7079113687542895, + "learning_rate": 2.4487895716945996e-05, + "loss": 5.9439, + "step": 1578 + }, + { + "epoch": 0.07351537584095724, + "grad_norm": 0.7261442267979492, + "learning_rate": 2.4503414028553696e-05, + "loss": 5.9063, + "step": 1579 + }, + { + "epoch": 0.0735619340270503, + "grad_norm": 0.6708669651921885, + "learning_rate": 2.4518932340161392e-05, + "loss": 5.8935, + "step": 1580 + }, + { + "epoch": 0.07360849221314338, + "grad_norm": 0.6808317697696534, + "learning_rate": 2.4534450651769088e-05, + "loss": 5.8521, + "step": 1581 + }, + { + "epoch": 0.07365505039923645, + "grad_norm": 0.6829012099195195, + "learning_rate": 2.4549968963376785e-05, + "loss": 5.7803, + "step": 1582 + }, + { + "epoch": 0.07370160858532951, + "grad_norm": 0.7404818828322045, + "learning_rate": 2.4565487274984484e-05, + "loss": 5.817, + "step": 1583 + }, + { + "epoch": 0.07374816677142258, + "grad_norm": 0.7975273365419685, + "learning_rate": 2.458100558659218e-05, + "loss": 5.855, + "step": 1584 + }, + { + "epoch": 0.07379472495751566, + "grad_norm": 0.7324711019529936, + "learning_rate": 2.4596523898199877e-05, + "loss": 5.8876, + "step": 1585 + }, + { + "epoch": 0.07384128314360873, + "grad_norm": 0.6465178543336966, + "learning_rate": 2.4612042209807573e-05, + "loss": 5.9208, + "step": 1586 + }, + { + "epoch": 0.07388784132970179, + "grad_norm": 0.7111582536292205, + "learning_rate": 2.4627560521415273e-05, + "loss": 5.8152, + "step": 1587 + }, + { + "epoch": 0.07393439951579486, + "grad_norm": 0.7714181159723864, + "learning_rate": 2.4643078833022966e-05, + "loss": 5.916, + "step": 1588 + }, + { + "epoch": 0.07398095770188794, + "grad_norm": 0.6650616946414764, + "learning_rate": 2.4658597144630665e-05, + "loss": 5.7958, + "step": 1589 + }, + { + "epoch": 0.07402751588798101, + "grad_norm": 0.6355789393620507, + "learning_rate": 2.4674115456238362e-05, + "loss": 5.7834, + "step": 1590 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 0.666403415079417, + "learning_rate": 2.468963376784606e-05, + "loss": 5.8384, + "step": 1591 + }, + { + "epoch": 0.07412063226016714, + "grad_norm": 0.8813607977285566, + "learning_rate": 2.4705152079453754e-05, + "loss": 5.8504, + "step": 1592 + }, + { + "epoch": 0.07416719044626022, + "grad_norm": 
0.9867366284978716, + "learning_rate": 2.4720670391061454e-05, + "loss": 5.824, + "step": 1593 + }, + { + "epoch": 0.07421374863235328, + "grad_norm": 1.063990822375288, + "learning_rate": 2.473618870266915e-05, + "loss": 6.0515, + "step": 1594 + }, + { + "epoch": 0.07426030681844635, + "grad_norm": 0.8343709549086538, + "learning_rate": 2.4751707014276847e-05, + "loss": 5.8839, + "step": 1595 + }, + { + "epoch": 0.07430686500453942, + "grad_norm": 0.9881594052511816, + "learning_rate": 2.4767225325884543e-05, + "loss": 5.9341, + "step": 1596 + }, + { + "epoch": 0.0743534231906325, + "grad_norm": 1.1050909139547598, + "learning_rate": 2.4782743637492243e-05, + "loss": 5.89, + "step": 1597 + }, + { + "epoch": 0.07439998137672556, + "grad_norm": 0.79911677600322, + "learning_rate": 2.479826194909994e-05, + "loss": 5.9961, + "step": 1598 + }, + { + "epoch": 0.07444653956281863, + "grad_norm": 1.0195705523803607, + "learning_rate": 2.4813780260707635e-05, + "loss": 5.9929, + "step": 1599 + }, + { + "epoch": 0.0744930977489117, + "grad_norm": 0.878471703960844, + "learning_rate": 2.482929857231533e-05, + "loss": 5.8833, + "step": 1600 + }, + { + "epoch": 0.07453965593500478, + "grad_norm": 0.9444113656948345, + "learning_rate": 2.484481688392303e-05, + "loss": 5.8124, + "step": 1601 + }, + { + "epoch": 0.07458621412109784, + "grad_norm": 0.6764473805701752, + "learning_rate": 2.4860335195530728e-05, + "loss": 5.801, + "step": 1602 + }, + { + "epoch": 0.07463277230719091, + "grad_norm": 0.7417894499336721, + "learning_rate": 2.4875853507138424e-05, + "loss": 5.8827, + "step": 1603 + }, + { + "epoch": 0.07467933049328399, + "grad_norm": 0.8054277003861536, + "learning_rate": 2.489137181874612e-05, + "loss": 5.7599, + "step": 1604 + }, + { + "epoch": 0.07472588867937704, + "grad_norm": 0.8243085029472546, + "learning_rate": 2.490689013035382e-05, + "loss": 5.8229, + "step": 1605 + }, + { + "epoch": 0.07477244686547012, + "grad_norm": 0.8535766517202373, + "learning_rate": 2.4922408441961513e-05, + "loss": 5.8055, + "step": 1606 + }, + { + "epoch": 0.07481900505156319, + "grad_norm": 0.7257318636250437, + "learning_rate": 2.4937926753569213e-05, + "loss": 5.9686, + "step": 1607 + }, + { + "epoch": 0.07486556323765627, + "grad_norm": 0.7555890213099357, + "learning_rate": 2.495344506517691e-05, + "loss": 5.8277, + "step": 1608 + }, + { + "epoch": 0.07491212142374933, + "grad_norm": 0.8452070709613291, + "learning_rate": 2.496896337678461e-05, + "loss": 5.9355, + "step": 1609 + }, + { + "epoch": 0.0749586796098424, + "grad_norm": 0.8113940213695996, + "learning_rate": 2.49844816883923e-05, + "loss": 5.8417, + "step": 1610 + }, + { + "epoch": 0.07500523779593547, + "grad_norm": 0.8242287296316314, + "learning_rate": 2.5e-05, + "loss": 5.9063, + "step": 1611 + }, + { + "epoch": 0.07505179598202855, + "grad_norm": 0.8643708564843494, + "learning_rate": 2.50155183116077e-05, + "loss": 5.8943, + "step": 1612 + }, + { + "epoch": 0.0750983541681216, + "grad_norm": 0.7577206320267418, + "learning_rate": 2.5031036623215397e-05, + "loss": 5.8003, + "step": 1613 + }, + { + "epoch": 0.07514491235421468, + "grad_norm": 0.8533028228261791, + "learning_rate": 2.504655493482309e-05, + "loss": 5.815, + "step": 1614 + }, + { + "epoch": 0.07519147054030775, + "grad_norm": 0.8009630476385792, + "learning_rate": 2.506207324643079e-05, + "loss": 5.8928, + "step": 1615 + }, + { + "epoch": 0.07523802872640081, + "grad_norm": 1.057825079867267, + "learning_rate": 2.507759155803849e-05, + "loss": 5.8269, + "step": 1616 + }, + 
{ + "epoch": 0.07528458691249389, + "grad_norm": 1.0304030761381713, + "learning_rate": 2.5093109869646186e-05, + "loss": 5.8002, + "step": 1617 + }, + { + "epoch": 0.07533114509858696, + "grad_norm": 0.8129143405819066, + "learning_rate": 2.510862818125388e-05, + "loss": 5.8964, + "step": 1618 + }, + { + "epoch": 0.07537770328468003, + "grad_norm": 0.8178272991653525, + "learning_rate": 2.512414649286158e-05, + "loss": 5.8682, + "step": 1619 + }, + { + "epoch": 0.0754242614707731, + "grad_norm": 1.1801269413730429, + "learning_rate": 2.5139664804469275e-05, + "loss": 5.9362, + "step": 1620 + }, + { + "epoch": 0.07547081965686617, + "grad_norm": 0.7578156645468807, + "learning_rate": 2.5155183116076974e-05, + "loss": 5.8627, + "step": 1621 + }, + { + "epoch": 0.07551737784295924, + "grad_norm": 0.7847796080597205, + "learning_rate": 2.5170701427684667e-05, + "loss": 5.8479, + "step": 1622 + }, + { + "epoch": 0.07556393602905231, + "grad_norm": 0.9435819664269587, + "learning_rate": 2.5186219739292367e-05, + "loss": 5.8716, + "step": 1623 + }, + { + "epoch": 0.07561049421514537, + "grad_norm": 0.7460246088907381, + "learning_rate": 2.5201738050900063e-05, + "loss": 5.7692, + "step": 1624 + }, + { + "epoch": 0.07565705240123845, + "grad_norm": 0.7194966468155206, + "learning_rate": 2.5217256362507763e-05, + "loss": 5.8038, + "step": 1625 + }, + { + "epoch": 0.07570361058733152, + "grad_norm": 0.8373280368100287, + "learning_rate": 2.5232774674115456e-05, + "loss": 5.7738, + "step": 1626 + }, + { + "epoch": 0.07575016877342458, + "grad_norm": 0.7941301129504843, + "learning_rate": 2.5248292985723156e-05, + "loss": 5.7717, + "step": 1627 + }, + { + "epoch": 0.07579672695951765, + "grad_norm": 0.7222651668489756, + "learning_rate": 2.5263811297330852e-05, + "loss": 5.8093, + "step": 1628 + }, + { + "epoch": 0.07584328514561073, + "grad_norm": 0.7002151400636065, + "learning_rate": 2.527932960893855e-05, + "loss": 5.9027, + "step": 1629 + }, + { + "epoch": 0.0758898433317038, + "grad_norm": 0.6877260481700717, + "learning_rate": 2.5294847920546245e-05, + "loss": 5.8684, + "step": 1630 + }, + { + "epoch": 0.07593640151779686, + "grad_norm": 0.7505010201202431, + "learning_rate": 2.531036623215394e-05, + "loss": 5.8144, + "step": 1631 + }, + { + "epoch": 0.07598295970388994, + "grad_norm": 0.6548757164688238, + "learning_rate": 2.532588454376164e-05, + "loss": 5.8628, + "step": 1632 + }, + { + "epoch": 0.07602951788998301, + "grad_norm": 0.7812050106045846, + "learning_rate": 2.534140285536934e-05, + "loss": 5.9077, + "step": 1633 + }, + { + "epoch": 0.07607607607607608, + "grad_norm": 0.8554338174735644, + "learning_rate": 2.5356921166977033e-05, + "loss": 5.8232, + "step": 1634 + }, + { + "epoch": 0.07612263426216914, + "grad_norm": 0.9996563298706621, + "learning_rate": 2.537243947858473e-05, + "loss": 5.9509, + "step": 1635 + }, + { + "epoch": 0.07616919244826222, + "grad_norm": 1.0037852995744097, + "learning_rate": 2.538795779019243e-05, + "loss": 5.8618, + "step": 1636 + }, + { + "epoch": 0.07621575063435529, + "grad_norm": 0.8621076741621467, + "learning_rate": 2.540347610180013e-05, + "loss": 5.7343, + "step": 1637 + }, + { + "epoch": 0.07626230882044835, + "grad_norm": 0.7190279762166191, + "learning_rate": 2.5418994413407822e-05, + "loss": 5.8031, + "step": 1638 + }, + { + "epoch": 0.07630886700654142, + "grad_norm": 0.9582916441458821, + "learning_rate": 2.5434512725015518e-05, + "loss": 5.8755, + "step": 1639 + }, + { + "epoch": 0.0763554251926345, + "grad_norm": 1.2683351912317835, + 
"learning_rate": 2.5450031036623218e-05, + "loss": 5.8393, + "step": 1640 + }, + { + "epoch": 0.07640198337872757, + "grad_norm": 0.6192236492719058, + "learning_rate": 2.5465549348230918e-05, + "loss": 5.7697, + "step": 1641 + }, + { + "epoch": 0.07644854156482063, + "grad_norm": 1.078360814608427, + "learning_rate": 2.548106765983861e-05, + "loss": 5.7611, + "step": 1642 + }, + { + "epoch": 0.0764950997509137, + "grad_norm": 1.1573756691288386, + "learning_rate": 2.5496585971446307e-05, + "loss": 5.8205, + "step": 1643 + }, + { + "epoch": 0.07654165793700678, + "grad_norm": 0.685497791274457, + "learning_rate": 2.5512104283054006e-05, + "loss": 5.6213, + "step": 1644 + }, + { + "epoch": 0.07658821612309985, + "grad_norm": 1.0597395440796515, + "learning_rate": 2.5527622594661703e-05, + "loss": 5.9239, + "step": 1645 + }, + { + "epoch": 0.07663477430919291, + "grad_norm": 1.0183408835428502, + "learning_rate": 2.5543140906269396e-05, + "loss": 5.7932, + "step": 1646 + }, + { + "epoch": 0.07668133249528598, + "grad_norm": 0.8173308685928249, + "learning_rate": 2.5558659217877095e-05, + "loss": 5.7582, + "step": 1647 + }, + { + "epoch": 0.07672789068137906, + "grad_norm": 0.7662123477573665, + "learning_rate": 2.5574177529484795e-05, + "loss": 5.8533, + "step": 1648 + }, + { + "epoch": 0.07677444886747212, + "grad_norm": 0.8010551172430984, + "learning_rate": 2.558969584109249e-05, + "loss": 5.7848, + "step": 1649 + }, + { + "epoch": 0.07682100705356519, + "grad_norm": 1.0021989823768362, + "learning_rate": 2.5605214152700184e-05, + "loss": 5.8331, + "step": 1650 + }, + { + "epoch": 0.07686756523965826, + "grad_norm": 0.8453861366536748, + "learning_rate": 2.5620732464307884e-05, + "loss": 5.7967, + "step": 1651 + }, + { + "epoch": 0.07691412342575134, + "grad_norm": 0.7088317319113825, + "learning_rate": 2.5636250775915584e-05, + "loss": 5.8051, + "step": 1652 + }, + { + "epoch": 0.0769606816118444, + "grad_norm": 0.7992072370290658, + "learning_rate": 2.565176908752328e-05, + "loss": 5.684, + "step": 1653 + }, + { + "epoch": 0.07700723979793747, + "grad_norm": 0.6909618950589435, + "learning_rate": 2.5667287399130973e-05, + "loss": 5.7372, + "step": 1654 + }, + { + "epoch": 0.07705379798403054, + "grad_norm": 0.6685480948461109, + "learning_rate": 2.5682805710738673e-05, + "loss": 5.7056, + "step": 1655 + }, + { + "epoch": 0.07710035617012362, + "grad_norm": 0.719452193013276, + "learning_rate": 2.569832402234637e-05, + "loss": 5.8352, + "step": 1656 + }, + { + "epoch": 0.07714691435621668, + "grad_norm": 0.7711256600786977, + "learning_rate": 2.571384233395407e-05, + "loss": 5.8619, + "step": 1657 + }, + { + "epoch": 0.07719347254230975, + "grad_norm": 0.6371117980956876, + "learning_rate": 2.572936064556176e-05, + "loss": 5.7272, + "step": 1658 + }, + { + "epoch": 0.07724003072840283, + "grad_norm": 0.7609000497881088, + "learning_rate": 2.574487895716946e-05, + "loss": 5.803, + "step": 1659 + }, + { + "epoch": 0.07728658891449589, + "grad_norm": 0.6762182414016862, + "learning_rate": 2.5760397268777158e-05, + "loss": 5.8043, + "step": 1660 + }, + { + "epoch": 0.07733314710058896, + "grad_norm": 0.6806769685266937, + "learning_rate": 2.5775915580384857e-05, + "loss": 5.6978, + "step": 1661 + }, + { + "epoch": 0.07737970528668203, + "grad_norm": 0.7024995550806885, + "learning_rate": 2.579143389199255e-05, + "loss": 5.9341, + "step": 1662 + }, + { + "epoch": 0.0774262634727751, + "grad_norm": 0.5361675350649338, + "learning_rate": 2.580695220360025e-05, + "loss": 5.8327, + "step": 1663 + 
}, + { + "epoch": 0.07747282165886817, + "grad_norm": 0.5510837788679845, + "learning_rate": 2.5822470515207946e-05, + "loss": 5.8181, + "step": 1664 + }, + { + "epoch": 0.07751937984496124, + "grad_norm": 0.6190977751671343, + "learning_rate": 2.5837988826815646e-05, + "loss": 5.7306, + "step": 1665 + }, + { + "epoch": 0.07756593803105431, + "grad_norm": 0.7106827955841932, + "learning_rate": 2.585350713842334e-05, + "loss": 5.8724, + "step": 1666 + }, + { + "epoch": 0.07761249621714739, + "grad_norm": 0.6883711844595547, + "learning_rate": 2.586902545003104e-05, + "loss": 5.8043, + "step": 1667 + }, + { + "epoch": 0.07765905440324045, + "grad_norm": 0.624793188133635, + "learning_rate": 2.5884543761638735e-05, + "loss": 5.8666, + "step": 1668 + }, + { + "epoch": 0.07770561258933352, + "grad_norm": 0.5944083514015314, + "learning_rate": 2.5900062073246434e-05, + "loss": 5.8505, + "step": 1669 + }, + { + "epoch": 0.0777521707754266, + "grad_norm": 0.6559520303627023, + "learning_rate": 2.5915580384854127e-05, + "loss": 5.7357, + "step": 1670 + }, + { + "epoch": 0.07779872896151965, + "grad_norm": 0.7295257614639948, + "learning_rate": 2.5931098696461824e-05, + "loss": 5.8395, + "step": 1671 + }, + { + "epoch": 0.07784528714761273, + "grad_norm": 0.8460619618553455, + "learning_rate": 2.5946617008069523e-05, + "loss": 5.7801, + "step": 1672 + }, + { + "epoch": 0.0778918453337058, + "grad_norm": 0.7797313914541182, + "learning_rate": 2.5962135319677223e-05, + "loss": 5.761, + "step": 1673 + }, + { + "epoch": 0.07793840351979887, + "grad_norm": 0.7147156944460157, + "learning_rate": 2.5977653631284916e-05, + "loss": 5.8609, + "step": 1674 + }, + { + "epoch": 0.07798496170589193, + "grad_norm": 0.6516010944767211, + "learning_rate": 2.5993171942892612e-05, + "loss": 5.8237, + "step": 1675 + }, + { + "epoch": 0.07803151989198501, + "grad_norm": 0.6050667007426672, + "learning_rate": 2.6008690254500312e-05, + "loss": 5.6913, + "step": 1676 + }, + { + "epoch": 0.07807807807807808, + "grad_norm": 0.7518049686102704, + "learning_rate": 2.602420856610801e-05, + "loss": 5.8776, + "step": 1677 + }, + { + "epoch": 0.07812463626417115, + "grad_norm": 0.7768171530755699, + "learning_rate": 2.6039726877715705e-05, + "loss": 5.8842, + "step": 1678 + }, + { + "epoch": 0.07817119445026421, + "grad_norm": 0.8050911151535557, + "learning_rate": 2.60552451893234e-05, + "loss": 5.7703, + "step": 1679 + }, + { + "epoch": 0.07821775263635729, + "grad_norm": 0.8503273091733229, + "learning_rate": 2.60707635009311e-05, + "loss": 5.7988, + "step": 1680 + }, + { + "epoch": 0.07826431082245036, + "grad_norm": 0.6976262854242851, + "learning_rate": 2.60862818125388e-05, + "loss": 5.6627, + "step": 1681 + }, + { + "epoch": 0.07831086900854342, + "grad_norm": 0.7047119633275286, + "learning_rate": 2.6101800124146493e-05, + "loss": 5.7505, + "step": 1682 + }, + { + "epoch": 0.0783574271946365, + "grad_norm": 0.8402240103815714, + "learning_rate": 2.611731843575419e-05, + "loss": 5.7099, + "step": 1683 + }, + { + "epoch": 0.07840398538072957, + "grad_norm": 0.8084183758666943, + "learning_rate": 2.613283674736189e-05, + "loss": 5.8141, + "step": 1684 + }, + { + "epoch": 0.07845054356682264, + "grad_norm": 0.7026402581723425, + "learning_rate": 2.6148355058969586e-05, + "loss": 5.6946, + "step": 1685 + }, + { + "epoch": 0.0784971017529157, + "grad_norm": 0.7992258755628345, + "learning_rate": 2.616387337057728e-05, + "loss": 5.7559, + "step": 1686 + }, + { + "epoch": 0.07854365993900878, + "grad_norm": 0.7250182135520558, + 
"learning_rate": 2.6179391682184978e-05, + "loss": 5.7265, + "step": 1687 + }, + { + "epoch": 0.07859021812510185, + "grad_norm": 0.793270981371217, + "learning_rate": 2.6194909993792678e-05, + "loss": 5.6356, + "step": 1688 + }, + { + "epoch": 0.07863677631119492, + "grad_norm": 0.9165043943002704, + "learning_rate": 2.6210428305400374e-05, + "loss": 5.7208, + "step": 1689 + }, + { + "epoch": 0.07868333449728798, + "grad_norm": 0.8546873546747994, + "learning_rate": 2.6225946617008067e-05, + "loss": 5.9145, + "step": 1690 + }, + { + "epoch": 0.07872989268338106, + "grad_norm": 0.8233828757221856, + "learning_rate": 2.6241464928615767e-05, + "loss": 5.8217, + "step": 1691 + }, + { + "epoch": 0.07877645086947413, + "grad_norm": 0.7322842903043564, + "learning_rate": 2.6256983240223466e-05, + "loss": 5.9636, + "step": 1692 + }, + { + "epoch": 0.07882300905556719, + "grad_norm": 0.8280787990665767, + "learning_rate": 2.6272501551831163e-05, + "loss": 5.8031, + "step": 1693 + }, + { + "epoch": 0.07886956724166026, + "grad_norm": 0.7056600596815296, + "learning_rate": 2.6288019863438856e-05, + "loss": 5.8379, + "step": 1694 + }, + { + "epoch": 0.07891612542775334, + "grad_norm": 0.9187794429537257, + "learning_rate": 2.6303538175046555e-05, + "loss": 5.7192, + "step": 1695 + }, + { + "epoch": 0.07896268361384641, + "grad_norm": 0.8520425843655387, + "learning_rate": 2.631905648665425e-05, + "loss": 5.7376, + "step": 1696 + }, + { + "epoch": 0.07900924179993947, + "grad_norm": 0.7041165650099747, + "learning_rate": 2.633457479826195e-05, + "loss": 5.7561, + "step": 1697 + }, + { + "epoch": 0.07905579998603254, + "grad_norm": 0.7753537717311747, + "learning_rate": 2.635009310986965e-05, + "loss": 5.7815, + "step": 1698 + }, + { + "epoch": 0.07910235817212562, + "grad_norm": 0.7964122697885226, + "learning_rate": 2.6365611421477344e-05, + "loss": 5.6281, + "step": 1699 + }, + { + "epoch": 0.07914891635821869, + "grad_norm": 0.7648981822321662, + "learning_rate": 2.638112973308504e-05, + "loss": 5.776, + "step": 1700 + }, + { + "epoch": 0.07919547454431175, + "grad_norm": 0.8605741749191677, + "learning_rate": 2.639664804469274e-05, + "loss": 5.7346, + "step": 1701 + }, + { + "epoch": 0.07924203273040482, + "grad_norm": 0.8887490842259371, + "learning_rate": 2.641216635630044e-05, + "loss": 5.8041, + "step": 1702 + }, + { + "epoch": 0.0792885909164979, + "grad_norm": 1.1493180880552867, + "learning_rate": 2.6427684667908133e-05, + "loss": 5.8087, + "step": 1703 + }, + { + "epoch": 0.07933514910259096, + "grad_norm": 0.9278176434022894, + "learning_rate": 2.644320297951583e-05, + "loss": 5.8537, + "step": 1704 + }, + { + "epoch": 0.07938170728868403, + "grad_norm": 0.725909054776562, + "learning_rate": 2.645872129112353e-05, + "loss": 5.829, + "step": 1705 + }, + { + "epoch": 0.0794282654747771, + "grad_norm": 1.1013149551492782, + "learning_rate": 2.647423960273123e-05, + "loss": 5.648, + "step": 1706 + }, + { + "epoch": 0.07947482366087018, + "grad_norm": 0.8077755054450861, + "learning_rate": 2.648975791433892e-05, + "loss": 5.7149, + "step": 1707 + }, + { + "epoch": 0.07952138184696324, + "grad_norm": 0.7934653803256383, + "learning_rate": 2.6505276225946618e-05, + "loss": 5.7344, + "step": 1708 + }, + { + "epoch": 0.07956794003305631, + "grad_norm": 0.8575962866617735, + "learning_rate": 2.6520794537554317e-05, + "loss": 5.7653, + "step": 1709 + }, + { + "epoch": 0.07961449821914938, + "grad_norm": 0.784800941684699, + "learning_rate": 2.6536312849162014e-05, + "loss": 5.817, + "step": 1710 + 
}, + { + "epoch": 0.07966105640524246, + "grad_norm": 0.7070121945447185, + "learning_rate": 2.6551831160769706e-05, + "loss": 5.6857, + "step": 1711 + }, + { + "epoch": 0.07970761459133552, + "grad_norm": 0.5899851574600682, + "learning_rate": 2.6567349472377406e-05, + "loss": 5.7814, + "step": 1712 + }, + { + "epoch": 0.07975417277742859, + "grad_norm": 0.7344427516115921, + "learning_rate": 2.6582867783985106e-05, + "loss": 5.824, + "step": 1713 + }, + { + "epoch": 0.07980073096352167, + "grad_norm": 0.8382912465072438, + "learning_rate": 2.6598386095592802e-05, + "loss": 5.8344, + "step": 1714 + }, + { + "epoch": 0.07984728914961473, + "grad_norm": 0.6965012491200467, + "learning_rate": 2.6613904407200495e-05, + "loss": 5.7707, + "step": 1715 + }, + { + "epoch": 0.0798938473357078, + "grad_norm": 0.6924131462716149, + "learning_rate": 2.6629422718808195e-05, + "loss": 5.865, + "step": 1716 + }, + { + "epoch": 0.07994040552180087, + "grad_norm": 0.6017916953812049, + "learning_rate": 2.6644941030415894e-05, + "loss": 5.6958, + "step": 1717 + }, + { + "epoch": 0.07998696370789395, + "grad_norm": 0.6142386549362895, + "learning_rate": 2.666045934202359e-05, + "loss": 5.7514, + "step": 1718 + }, + { + "epoch": 0.080033521893987, + "grad_norm": 0.6765157546394323, + "learning_rate": 2.6675977653631284e-05, + "loss": 5.7417, + "step": 1719 + }, + { + "epoch": 0.08008008008008008, + "grad_norm": 0.7306092342746103, + "learning_rate": 2.6691495965238983e-05, + "loss": 5.6257, + "step": 1720 + }, + { + "epoch": 0.08012663826617315, + "grad_norm": 0.7142702223846342, + "learning_rate": 2.670701427684668e-05, + "loss": 5.8207, + "step": 1721 + }, + { + "epoch": 0.08017319645226623, + "grad_norm": 0.806494939395157, + "learning_rate": 2.672253258845438e-05, + "loss": 5.8018, + "step": 1722 + }, + { + "epoch": 0.08021975463835929, + "grad_norm": 0.7532468384309018, + "learning_rate": 2.6738050900062072e-05, + "loss": 5.7285, + "step": 1723 + }, + { + "epoch": 0.08026631282445236, + "grad_norm": 0.6908473850589035, + "learning_rate": 2.6753569211669772e-05, + "loss": 5.7813, + "step": 1724 + }, + { + "epoch": 0.08031287101054543, + "grad_norm": 0.9008287013071611, + "learning_rate": 2.676908752327747e-05, + "loss": 5.6379, + "step": 1725 + }, + { + "epoch": 0.0803594291966385, + "grad_norm": 0.7579911278140808, + "learning_rate": 2.6784605834885168e-05, + "loss": 5.7701, + "step": 1726 + }, + { + "epoch": 0.08040598738273157, + "grad_norm": 0.7418925228078093, + "learning_rate": 2.680012414649286e-05, + "loss": 5.6555, + "step": 1727 + }, + { + "epoch": 0.08045254556882464, + "grad_norm": 0.6721838706721374, + "learning_rate": 2.681564245810056e-05, + "loss": 5.706, + "step": 1728 + }, + { + "epoch": 0.08049910375491771, + "grad_norm": 0.6730108697265175, + "learning_rate": 2.6831160769708257e-05, + "loss": 5.7687, + "step": 1729 + }, + { + "epoch": 0.08054566194101077, + "grad_norm": 0.7483919769659811, + "learning_rate": 2.6846679081315957e-05, + "loss": 5.6915, + "step": 1730 + }, + { + "epoch": 0.08059222012710385, + "grad_norm": 0.8768110853763493, + "learning_rate": 2.686219739292365e-05, + "loss": 5.794, + "step": 1731 + }, + { + "epoch": 0.08063877831319692, + "grad_norm": 0.8131069132379557, + "learning_rate": 2.687771570453135e-05, + "loss": 5.739, + "step": 1732 + }, + { + "epoch": 0.08068533649928998, + "grad_norm": 0.6636220800617608, + "learning_rate": 2.6893234016139046e-05, + "loss": 5.8137, + "step": 1733 + }, + { + "epoch": 0.08073189468538305, + "grad_norm": 0.7277984082623672, + 
"learning_rate": 2.6908752327746745e-05, + "loss": 5.7387, + "step": 1734 + }, + { + "epoch": 0.08077845287147613, + "grad_norm": 0.7299013244795496, + "learning_rate": 2.6924270639354438e-05, + "loss": 5.8668, + "step": 1735 + }, + { + "epoch": 0.0808250110575692, + "grad_norm": 0.7329275958883422, + "learning_rate": 2.6939788950962134e-05, + "loss": 5.7957, + "step": 1736 + }, + { + "epoch": 0.08087156924366226, + "grad_norm": 0.768707984343, + "learning_rate": 2.6955307262569834e-05, + "loss": 5.692, + "step": 1737 + }, + { + "epoch": 0.08091812742975533, + "grad_norm": 0.8398274620164513, + "learning_rate": 2.6970825574177534e-05, + "loss": 5.7222, + "step": 1738 + }, + { + "epoch": 0.08096468561584841, + "grad_norm": 0.8337895856361399, + "learning_rate": 2.6986343885785227e-05, + "loss": 5.6974, + "step": 1739 + }, + { + "epoch": 0.08101124380194148, + "grad_norm": 0.8565438870311992, + "learning_rate": 2.7001862197392923e-05, + "loss": 5.6557, + "step": 1740 + }, + { + "epoch": 0.08105780198803454, + "grad_norm": 0.891050112178886, + "learning_rate": 2.7017380509000623e-05, + "loss": 5.7784, + "step": 1741 + }, + { + "epoch": 0.08110436017412762, + "grad_norm": 0.9345408493439715, + "learning_rate": 2.7032898820608322e-05, + "loss": 5.7407, + "step": 1742 + }, + { + "epoch": 0.08115091836022069, + "grad_norm": 0.9751203798380826, + "learning_rate": 2.7048417132216015e-05, + "loss": 5.8729, + "step": 1743 + }, + { + "epoch": 0.08119747654631375, + "grad_norm": 0.8226723729684584, + "learning_rate": 2.7063935443823712e-05, + "loss": 5.6299, + "step": 1744 + }, + { + "epoch": 0.08124403473240682, + "grad_norm": 0.688052419460772, + "learning_rate": 2.707945375543141e-05, + "loss": 5.8572, + "step": 1745 + }, + { + "epoch": 0.0812905929184999, + "grad_norm": 0.8935781440547819, + "learning_rate": 2.7094972067039108e-05, + "loss": 5.7155, + "step": 1746 + }, + { + "epoch": 0.08133715110459297, + "grad_norm": 0.7050981043997572, + "learning_rate": 2.71104903786468e-05, + "loss": 5.7283, + "step": 1747 + }, + { + "epoch": 0.08138370929068603, + "grad_norm": 0.6508042927089456, + "learning_rate": 2.71260086902545e-05, + "loss": 5.7706, + "step": 1748 + }, + { + "epoch": 0.0814302674767791, + "grad_norm": 0.8615731458904741, + "learning_rate": 2.71415270018622e-05, + "loss": 5.7405, + "step": 1749 + }, + { + "epoch": 0.08147682566287218, + "grad_norm": 0.5603613668049955, + "learning_rate": 2.7157045313469896e-05, + "loss": 5.7583, + "step": 1750 + }, + { + "epoch": 0.08152338384896525, + "grad_norm": 0.7330638185120926, + "learning_rate": 2.717256362507759e-05, + "loss": 5.6656, + "step": 1751 + }, + { + "epoch": 0.08156994203505831, + "grad_norm": 0.7150509090011656, + "learning_rate": 2.718808193668529e-05, + "loss": 5.6987, + "step": 1752 + }, + { + "epoch": 0.08161650022115138, + "grad_norm": 0.791069514135688, + "learning_rate": 2.720360024829299e-05, + "loss": 5.8275, + "step": 1753 + }, + { + "epoch": 0.08166305840724446, + "grad_norm": 0.8828709725663317, + "learning_rate": 2.7219118559900685e-05, + "loss": 5.6639, + "step": 1754 + }, + { + "epoch": 0.08170961659333752, + "grad_norm": 0.7506469357595131, + "learning_rate": 2.7234636871508378e-05, + "loss": 5.7505, + "step": 1755 + }, + { + "epoch": 0.08175617477943059, + "grad_norm": 0.6837505426013993, + "learning_rate": 2.7250155183116078e-05, + "loss": 5.7483, + "step": 1756 + }, + { + "epoch": 0.08180273296552366, + "grad_norm": 0.6693286805122064, + "learning_rate": 2.7265673494723777e-05, + "loss": 5.7188, + "step": 1757 + }, 
+ { + "epoch": 0.08184929115161674, + "grad_norm": 0.8310712777406531, + "learning_rate": 2.7281191806331474e-05, + "loss": 5.6223, + "step": 1758 + }, + { + "epoch": 0.0818958493377098, + "grad_norm": 0.7048825937550259, + "learning_rate": 2.7296710117939166e-05, + "loss": 5.6956, + "step": 1759 + }, + { + "epoch": 0.08194240752380287, + "grad_norm": 0.7741238730665624, + "learning_rate": 2.7312228429546866e-05, + "loss": 5.7388, + "step": 1760 + }, + { + "epoch": 0.08198896570989594, + "grad_norm": 0.7498164963625754, + "learning_rate": 2.7327746741154562e-05, + "loss": 5.7484, + "step": 1761 + }, + { + "epoch": 0.08203552389598902, + "grad_norm": 0.8516468147880037, + "learning_rate": 2.7343265052762262e-05, + "loss": 5.8747, + "step": 1762 + }, + { + "epoch": 0.08208208208208208, + "grad_norm": 0.7520219134211084, + "learning_rate": 2.7358783364369955e-05, + "loss": 5.6802, + "step": 1763 + }, + { + "epoch": 0.08212864026817515, + "grad_norm": 0.6529536124653801, + "learning_rate": 2.7374301675977655e-05, + "loss": 5.7855, + "step": 1764 + }, + { + "epoch": 0.08217519845426823, + "grad_norm": 0.6409904579154043, + "learning_rate": 2.738981998758535e-05, + "loss": 5.7222, + "step": 1765 + }, + { + "epoch": 0.08222175664036128, + "grad_norm": 0.761989063581881, + "learning_rate": 2.740533829919305e-05, + "loss": 5.5928, + "step": 1766 + }, + { + "epoch": 0.08226831482645436, + "grad_norm": 0.6320460348976025, + "learning_rate": 2.7420856610800744e-05, + "loss": 5.7758, + "step": 1767 + }, + { + "epoch": 0.08231487301254743, + "grad_norm": 0.7892748271272984, + "learning_rate": 2.7436374922408443e-05, + "loss": 5.8048, + "step": 1768 + }, + { + "epoch": 0.0823614311986405, + "grad_norm": 0.7680650263201577, + "learning_rate": 2.745189323401614e-05, + "loss": 5.6567, + "step": 1769 + }, + { + "epoch": 0.08240798938473357, + "grad_norm": 0.7549415548188227, + "learning_rate": 2.746741154562384e-05, + "loss": 5.6301, + "step": 1770 + }, + { + "epoch": 0.08245454757082664, + "grad_norm": 0.6786574133564156, + "learning_rate": 2.7482929857231532e-05, + "loss": 5.5803, + "step": 1771 + }, + { + "epoch": 0.08250110575691971, + "grad_norm": 0.7199233041352332, + "learning_rate": 2.7498448168839232e-05, + "loss": 5.8112, + "step": 1772 + }, + { + "epoch": 0.08254766394301279, + "grad_norm": 0.7784125612274817, + "learning_rate": 2.751396648044693e-05, + "loss": 5.8033, + "step": 1773 + }, + { + "epoch": 0.08259422212910585, + "grad_norm": 0.7148474011018322, + "learning_rate": 2.7529484792054628e-05, + "loss": 5.6469, + "step": 1774 + }, + { + "epoch": 0.08264078031519892, + "grad_norm": 0.7598688426317508, + "learning_rate": 2.754500310366232e-05, + "loss": 5.7309, + "step": 1775 + }, + { + "epoch": 0.082687338501292, + "grad_norm": 0.8286052431960197, + "learning_rate": 2.7560521415270017e-05, + "loss": 5.6917, + "step": 1776 + }, + { + "epoch": 0.08273389668738505, + "grad_norm": 0.7803291961040544, + "learning_rate": 2.7576039726877717e-05, + "loss": 5.6697, + "step": 1777 + }, + { + "epoch": 0.08278045487347813, + "grad_norm": 0.8169416507775763, + "learning_rate": 2.7591558038485417e-05, + "loss": 5.6425, + "step": 1778 + }, + { + "epoch": 0.0828270130595712, + "grad_norm": 0.8222743260605805, + "learning_rate": 2.760707635009311e-05, + "loss": 5.5924, + "step": 1779 + }, + { + "epoch": 0.08287357124566427, + "grad_norm": 0.6531683508425242, + "learning_rate": 2.7622594661700806e-05, + "loss": 5.7048, + "step": 1780 + }, + { + "epoch": 0.08292012943175733, + "grad_norm": 0.8096784226432125, 
+ "learning_rate": 2.7638112973308506e-05, + "loss": 5.6922, + "step": 1781 + }, + { + "epoch": 0.08296668761785041, + "grad_norm": 0.9406520369656182, + "learning_rate": 2.7653631284916205e-05, + "loss": 5.6098, + "step": 1782 + }, + { + "epoch": 0.08301324580394348, + "grad_norm": 0.7714265670933276, + "learning_rate": 2.76691495965239e-05, + "loss": 5.5985, + "step": 1783 + }, + { + "epoch": 0.08305980399003655, + "grad_norm": 0.8184773344033981, + "learning_rate": 2.7684667908131594e-05, + "loss": 5.6559, + "step": 1784 + }, + { + "epoch": 0.08310636217612961, + "grad_norm": 1.0427092984457174, + "learning_rate": 2.7700186219739294e-05, + "loss": 5.7647, + "step": 1785 + }, + { + "epoch": 0.08315292036222269, + "grad_norm": 0.9943532580150299, + "learning_rate": 2.771570453134699e-05, + "loss": 5.6674, + "step": 1786 + }, + { + "epoch": 0.08319947854831576, + "grad_norm": 0.5776750182722141, + "learning_rate": 2.773122284295469e-05, + "loss": 5.633, + "step": 1787 + }, + { + "epoch": 0.08324603673440882, + "grad_norm": 0.8866123261715055, + "learning_rate": 2.7746741154562383e-05, + "loss": 5.6758, + "step": 1788 + }, + { + "epoch": 0.0832925949205019, + "grad_norm": 0.9004381995639446, + "learning_rate": 2.7762259466170083e-05, + "loss": 5.6593, + "step": 1789 + }, + { + "epoch": 0.08333915310659497, + "grad_norm": 0.863725318088448, + "learning_rate": 2.777777777777778e-05, + "loss": 5.7634, + "step": 1790 + }, + { + "epoch": 0.08338571129268804, + "grad_norm": 0.7565104758781334, + "learning_rate": 2.779329608938548e-05, + "loss": 5.7476, + "step": 1791 + }, + { + "epoch": 0.0834322694787811, + "grad_norm": 0.9020656203350335, + "learning_rate": 2.7808814400993172e-05, + "loss": 5.678, + "step": 1792 + }, + { + "epoch": 0.08347882766487418, + "grad_norm": 0.837062317922284, + "learning_rate": 2.782433271260087e-05, + "loss": 5.6466, + "step": 1793 + }, + { + "epoch": 0.08352538585096725, + "grad_norm": 0.6240492106011151, + "learning_rate": 2.7839851024208568e-05, + "loss": 5.6161, + "step": 1794 + }, + { + "epoch": 0.08357194403706032, + "grad_norm": 0.7554355643767803, + "learning_rate": 2.7855369335816267e-05, + "loss": 5.8132, + "step": 1795 + }, + { + "epoch": 0.08361850222315338, + "grad_norm": 0.6230764788893005, + "learning_rate": 2.787088764742396e-05, + "loss": 5.6151, + "step": 1796 + }, + { + "epoch": 0.08366506040924646, + "grad_norm": 0.5872641915145088, + "learning_rate": 2.788640595903166e-05, + "loss": 5.7017, + "step": 1797 + }, + { + "epoch": 0.08371161859533953, + "grad_norm": 0.7521590263493287, + "learning_rate": 2.7901924270639356e-05, + "loss": 5.646, + "step": 1798 + }, + { + "epoch": 0.08375817678143259, + "grad_norm": 0.6669425638359852, + "learning_rate": 2.7917442582247056e-05, + "loss": 5.5534, + "step": 1799 + }, + { + "epoch": 0.08380473496752566, + "grad_norm": 0.8162806674498093, + "learning_rate": 2.793296089385475e-05, + "loss": 5.6786, + "step": 1800 + }, + { + "epoch": 0.08385129315361874, + "grad_norm": 0.9068262253245193, + "learning_rate": 2.7948479205462445e-05, + "loss": 5.6822, + "step": 1801 + }, + { + "epoch": 0.08389785133971181, + "grad_norm": 0.7260055708305352, + "learning_rate": 2.7963997517070145e-05, + "loss": 5.6486, + "step": 1802 + }, + { + "epoch": 0.08394440952580487, + "grad_norm": 0.7791137495414552, + "learning_rate": 2.7979515828677845e-05, + "loss": 5.6445, + "step": 1803 + }, + { + "epoch": 0.08399096771189794, + "grad_norm": 0.7484489524588495, + "learning_rate": 2.7995034140285538e-05, + "loss": 5.7241, + "step": 1804 
+ }, + { + "epoch": 0.08403752589799102, + "grad_norm": 0.8051788980649942, + "learning_rate": 2.8010552451893234e-05, + "loss": 5.5822, + "step": 1805 + }, + { + "epoch": 0.08408408408408409, + "grad_norm": 0.6978680635020319, + "learning_rate": 2.8026070763500934e-05, + "loss": 5.6842, + "step": 1806 + }, + { + "epoch": 0.08413064227017715, + "grad_norm": 0.7432634348157133, + "learning_rate": 2.8041589075108633e-05, + "loss": 5.737, + "step": 1807 + }, + { + "epoch": 0.08417720045627022, + "grad_norm": 0.8330588298721135, + "learning_rate": 2.8057107386716326e-05, + "loss": 5.6917, + "step": 1808 + }, + { + "epoch": 0.0842237586423633, + "grad_norm": 0.9592152521047989, + "learning_rate": 2.8072625698324022e-05, + "loss": 5.7016, + "step": 1809 + }, + { + "epoch": 0.08427031682845636, + "grad_norm": 1.0139094848655152, + "learning_rate": 2.8088144009931722e-05, + "loss": 5.7527, + "step": 1810 + }, + { + "epoch": 0.08431687501454943, + "grad_norm": 0.8937342767118416, + "learning_rate": 2.810366232153942e-05, + "loss": 5.6744, + "step": 1811 + }, + { + "epoch": 0.0843634332006425, + "grad_norm": 0.7852573905058681, + "learning_rate": 2.811918063314711e-05, + "loss": 5.6165, + "step": 1812 + }, + { + "epoch": 0.08440999138673558, + "grad_norm": 0.6969516424952142, + "learning_rate": 2.813469894475481e-05, + "loss": 5.5986, + "step": 1813 + }, + { + "epoch": 0.08445654957282864, + "grad_norm": 0.93883816956931, + "learning_rate": 2.815021725636251e-05, + "loss": 5.6766, + "step": 1814 + }, + { + "epoch": 0.08450310775892171, + "grad_norm": 0.817831070366924, + "learning_rate": 2.8165735567970207e-05, + "loss": 5.5864, + "step": 1815 + }, + { + "epoch": 0.08454966594501478, + "grad_norm": 0.7358310994893666, + "learning_rate": 2.81812538795779e-05, + "loss": 5.6419, + "step": 1816 + }, + { + "epoch": 0.08459622413110786, + "grad_norm": 0.7430787211253604, + "learning_rate": 2.81967721911856e-05, + "loss": 5.6925, + "step": 1817 + }, + { + "epoch": 0.08464278231720092, + "grad_norm": 0.7976207344651146, + "learning_rate": 2.82122905027933e-05, + "loss": 5.7533, + "step": 1818 + }, + { + "epoch": 0.08468934050329399, + "grad_norm": 0.8114370038613943, + "learning_rate": 2.8227808814400996e-05, + "loss": 5.6336, + "step": 1819 + }, + { + "epoch": 0.08473589868938707, + "grad_norm": 0.7615593545301148, + "learning_rate": 2.824332712600869e-05, + "loss": 5.5959, + "step": 1820 + }, + { + "epoch": 0.08478245687548012, + "grad_norm": 0.7292576410210966, + "learning_rate": 2.825884543761639e-05, + "loss": 5.6282, + "step": 1821 + }, + { + "epoch": 0.0848290150615732, + "grad_norm": 0.7725897958771187, + "learning_rate": 2.8274363749224088e-05, + "loss": 5.7063, + "step": 1822 + }, + { + "epoch": 0.08487557324766627, + "grad_norm": 0.7861329538314913, + "learning_rate": 2.8289882060831784e-05, + "loss": 5.6507, + "step": 1823 + }, + { + "epoch": 0.08492213143375935, + "grad_norm": 0.8879630853341519, + "learning_rate": 2.8305400372439477e-05, + "loss": 5.703, + "step": 1824 + }, + { + "epoch": 0.0849686896198524, + "grad_norm": 0.9335331012176022, + "learning_rate": 2.8320918684047177e-05, + "loss": 5.6831, + "step": 1825 + }, + { + "epoch": 0.08501524780594548, + "grad_norm": 0.8186718123742568, + "learning_rate": 2.8336436995654873e-05, + "loss": 5.8263, + "step": 1826 + }, + { + "epoch": 0.08506180599203855, + "grad_norm": 0.8941183456543943, + "learning_rate": 2.8351955307262573e-05, + "loss": 5.557, + "step": 1827 + }, + { + "epoch": 0.08510836417813163, + "grad_norm": 0.9905636028824838, + 
"learning_rate": 2.8367473618870266e-05, + "loss": 5.665, + "step": 1828 + }, + { + "epoch": 0.08515492236422469, + "grad_norm": 1.0337491866073938, + "learning_rate": 2.8382991930477966e-05, + "loss": 5.5974, + "step": 1829 + }, + { + "epoch": 0.08520148055031776, + "grad_norm": 0.6394233690398661, + "learning_rate": 2.8398510242085662e-05, + "loss": 5.6515, + "step": 1830 + }, + { + "epoch": 0.08524803873641083, + "grad_norm": 0.8060504307750661, + "learning_rate": 2.841402855369336e-05, + "loss": 5.7482, + "step": 1831 + }, + { + "epoch": 0.08529459692250389, + "grad_norm": 1.0900301057181203, + "learning_rate": 2.8429546865301055e-05, + "loss": 5.5916, + "step": 1832 + }, + { + "epoch": 0.08534115510859697, + "grad_norm": 0.8179083262602517, + "learning_rate": 2.8445065176908754e-05, + "loss": 5.6625, + "step": 1833 + }, + { + "epoch": 0.08538771329469004, + "grad_norm": 0.778677053745988, + "learning_rate": 2.846058348851645e-05, + "loss": 5.5431, + "step": 1834 + }, + { + "epoch": 0.08543427148078311, + "grad_norm": 0.8889927228503413, + "learning_rate": 2.847610180012415e-05, + "loss": 5.6242, + "step": 1835 + }, + { + "epoch": 0.08548082966687617, + "grad_norm": 0.7752809771693466, + "learning_rate": 2.8491620111731843e-05, + "loss": 5.6635, + "step": 1836 + }, + { + "epoch": 0.08552738785296925, + "grad_norm": 0.9666223147461838, + "learning_rate": 2.850713842333954e-05, + "loss": 5.5298, + "step": 1837 + }, + { + "epoch": 0.08557394603906232, + "grad_norm": 0.8609008551612921, + "learning_rate": 2.852265673494724e-05, + "loss": 5.545, + "step": 1838 + }, + { + "epoch": 0.0856205042251554, + "grad_norm": 0.9478346721309721, + "learning_rate": 2.853817504655494e-05, + "loss": 5.7062, + "step": 1839 + }, + { + "epoch": 0.08566706241124845, + "grad_norm": 0.7690108275370713, + "learning_rate": 2.8553693358162632e-05, + "loss": 5.5739, + "step": 1840 + }, + { + "epoch": 0.08571362059734153, + "grad_norm": 0.8582141617518252, + "learning_rate": 2.8569211669770328e-05, + "loss": 5.5847, + "step": 1841 + }, + { + "epoch": 0.0857601787834346, + "grad_norm": 0.9250487146821734, + "learning_rate": 2.8584729981378028e-05, + "loss": 5.6862, + "step": 1842 + }, + { + "epoch": 0.08580673696952766, + "grad_norm": 0.7854706732442618, + "learning_rate": 2.8600248292985727e-05, + "loss": 5.5718, + "step": 1843 + }, + { + "epoch": 0.08585329515562073, + "grad_norm": 0.7580693850180612, + "learning_rate": 2.861576660459342e-05, + "loss": 5.7036, + "step": 1844 + }, + { + "epoch": 0.08589985334171381, + "grad_norm": 0.7780190978437008, + "learning_rate": 2.8631284916201117e-05, + "loss": 5.575, + "step": 1845 + }, + { + "epoch": 0.08594641152780688, + "grad_norm": 0.7463482278305463, + "learning_rate": 2.8646803227808816e-05, + "loss": 5.7004, + "step": 1846 + }, + { + "epoch": 0.08599296971389994, + "grad_norm": 0.6958842357364933, + "learning_rate": 2.8662321539416516e-05, + "loss": 5.6704, + "step": 1847 + }, + { + "epoch": 0.08603952789999302, + "grad_norm": 0.7083123079675654, + "learning_rate": 2.867783985102421e-05, + "loss": 5.6098, + "step": 1848 + }, + { + "epoch": 0.08608608608608609, + "grad_norm": 0.8301635444521271, + "learning_rate": 2.8693358162631905e-05, + "loss": 5.6553, + "step": 1849 + }, + { + "epoch": 0.08613264427217916, + "grad_norm": 0.7605140677217564, + "learning_rate": 2.8708876474239605e-05, + "loss": 5.671, + "step": 1850 + }, + { + "epoch": 0.08617920245827222, + "grad_norm": 0.8179845094728212, + "learning_rate": 2.87243947858473e-05, + "loss": 5.6377, + "step": 1851 + 
}, + { + "epoch": 0.0862257606443653, + "grad_norm": 0.8275575210403706, + "learning_rate": 2.8739913097454994e-05, + "loss": 5.6533, + "step": 1852 + }, + { + "epoch": 0.08627231883045837, + "grad_norm": 0.6218399716883994, + "learning_rate": 2.8755431409062694e-05, + "loss": 5.6214, + "step": 1853 + }, + { + "epoch": 0.08631887701655143, + "grad_norm": 0.8362838649449841, + "learning_rate": 2.8770949720670394e-05, + "loss": 5.6702, + "step": 1854 + }, + { + "epoch": 0.0863654352026445, + "grad_norm": 0.8129890446565755, + "learning_rate": 2.878646803227809e-05, + "loss": 5.5796, + "step": 1855 + }, + { + "epoch": 0.08641199338873758, + "grad_norm": 0.6114665244535991, + "learning_rate": 2.8801986343885783e-05, + "loss": 5.6532, + "step": 1856 + }, + { + "epoch": 0.08645855157483065, + "grad_norm": 0.6945963022824346, + "learning_rate": 2.8817504655493483e-05, + "loss": 5.6813, + "step": 1857 + }, + { + "epoch": 0.08650510976092371, + "grad_norm": 0.8493864978000852, + "learning_rate": 2.8833022967101182e-05, + "loss": 5.6291, + "step": 1858 + }, + { + "epoch": 0.08655166794701678, + "grad_norm": 0.6552372499781814, + "learning_rate": 2.884854127870888e-05, + "loss": 5.625, + "step": 1859 + }, + { + "epoch": 0.08659822613310986, + "grad_norm": 0.6611416408198991, + "learning_rate": 2.886405959031657e-05, + "loss": 5.649, + "step": 1860 + }, + { + "epoch": 0.08664478431920293, + "grad_norm": 0.7044123810981284, + "learning_rate": 2.887957790192427e-05, + "loss": 5.6943, + "step": 1861 + }, + { + "epoch": 0.08669134250529599, + "grad_norm": 0.737615987150976, + "learning_rate": 2.889509621353197e-05, + "loss": 5.5623, + "step": 1862 + }, + { + "epoch": 0.08673790069138906, + "grad_norm": 0.5451834760553242, + "learning_rate": 2.8910614525139667e-05, + "loss": 5.5852, + "step": 1863 + }, + { + "epoch": 0.08678445887748214, + "grad_norm": 0.7635202391082536, + "learning_rate": 2.892613283674736e-05, + "loss": 5.5585, + "step": 1864 + }, + { + "epoch": 0.0868310170635752, + "grad_norm": 0.824137657959961, + "learning_rate": 2.894165114835506e-05, + "loss": 5.6887, + "step": 1865 + }, + { + "epoch": 0.08687757524966827, + "grad_norm": 0.7867800982641315, + "learning_rate": 2.8957169459962756e-05, + "loss": 5.7575, + "step": 1866 + }, + { + "epoch": 0.08692413343576134, + "grad_norm": 0.765657289810297, + "learning_rate": 2.8972687771570456e-05, + "loss": 5.6403, + "step": 1867 + }, + { + "epoch": 0.08697069162185442, + "grad_norm": 0.7571720230544968, + "learning_rate": 2.8988206083178155e-05, + "loss": 5.6346, + "step": 1868 + }, + { + "epoch": 0.08701724980794748, + "grad_norm": 0.6709457004765299, + "learning_rate": 2.900372439478585e-05, + "loss": 5.7093, + "step": 1869 + }, + { + "epoch": 0.08706380799404055, + "grad_norm": 0.6394714969873843, + "learning_rate": 2.9019242706393545e-05, + "loss": 5.5074, + "step": 1870 + }, + { + "epoch": 0.08711036618013362, + "grad_norm": 0.6893968057129879, + "learning_rate": 2.9034761018001244e-05, + "loss": 5.6553, + "step": 1871 + }, + { + "epoch": 0.0871569243662267, + "grad_norm": 0.8411082138164483, + "learning_rate": 2.9050279329608944e-05, + "loss": 5.5994, + "step": 1872 + }, + { + "epoch": 0.08720348255231976, + "grad_norm": 0.8147409453150609, + "learning_rate": 2.9065797641216637e-05, + "loss": 5.5495, + "step": 1873 + }, + { + "epoch": 0.08725004073841283, + "grad_norm": 0.6925563551168255, + "learning_rate": 2.9081315952824333e-05, + "loss": 5.63, + "step": 1874 + }, + { + "epoch": 0.0872965989245059, + "grad_norm": 0.8237722085959855, + 
"learning_rate": 2.9096834264432033e-05, + "loss": 5.6773, + "step": 1875 + }, + { + "epoch": 0.08734315711059897, + "grad_norm": 0.7422554052024147, + "learning_rate": 2.911235257603973e-05, + "loss": 5.5698, + "step": 1876 + }, + { + "epoch": 0.08738971529669204, + "grad_norm": 0.6908392459212098, + "learning_rate": 2.9127870887647422e-05, + "loss": 5.6063, + "step": 1877 + }, + { + "epoch": 0.08743627348278511, + "grad_norm": 0.6884320883841346, + "learning_rate": 2.9143389199255122e-05, + "loss": 5.5376, + "step": 1878 + }, + { + "epoch": 0.08748283166887819, + "grad_norm": 0.6852246466814005, + "learning_rate": 2.915890751086282e-05, + "loss": 5.726, + "step": 1879 + }, + { + "epoch": 0.08752938985497125, + "grad_norm": 0.6160564580436151, + "learning_rate": 2.9174425822470518e-05, + "loss": 5.6048, + "step": 1880 + }, + { + "epoch": 0.08757594804106432, + "grad_norm": 0.6135762150464287, + "learning_rate": 2.918994413407821e-05, + "loss": 5.6559, + "step": 1881 + }, + { + "epoch": 0.08762250622715739, + "grad_norm": 0.6392025899926642, + "learning_rate": 2.920546244568591e-05, + "loss": 5.5913, + "step": 1882 + }, + { + "epoch": 0.08766906441325047, + "grad_norm": 0.646514617450301, + "learning_rate": 2.922098075729361e-05, + "loss": 5.5574, + "step": 1883 + }, + { + "epoch": 0.08771562259934353, + "grad_norm": 0.5134834205315291, + "learning_rate": 2.9236499068901307e-05, + "loss": 5.7015, + "step": 1884 + }, + { + "epoch": 0.0877621807854366, + "grad_norm": 0.7190806178756947, + "learning_rate": 2.9252017380509e-05, + "loss": 5.6366, + "step": 1885 + }, + { + "epoch": 0.08780873897152967, + "grad_norm": 0.6621764428491176, + "learning_rate": 2.92675356921167e-05, + "loss": 5.5346, + "step": 1886 + }, + { + "epoch": 0.08785529715762273, + "grad_norm": 0.7111640230553169, + "learning_rate": 2.92830540037244e-05, + "loss": 5.5754, + "step": 1887 + }, + { + "epoch": 0.0879018553437158, + "grad_norm": 0.8440641369167317, + "learning_rate": 2.9298572315332095e-05, + "loss": 5.5245, + "step": 1888 + }, + { + "epoch": 0.08794841352980888, + "grad_norm": 0.8603170289230759, + "learning_rate": 2.9314090626939788e-05, + "loss": 5.5121, + "step": 1889 + }, + { + "epoch": 0.08799497171590195, + "grad_norm": 0.827781528434688, + "learning_rate": 2.9329608938547488e-05, + "loss": 5.7217, + "step": 1890 + }, + { + "epoch": 0.08804152990199501, + "grad_norm": 0.8192967753310557, + "learning_rate": 2.9345127250155184e-05, + "loss": 5.5893, + "step": 1891 + }, + { + "epoch": 0.08808808808808809, + "grad_norm": 0.7690100151322772, + "learning_rate": 2.9360645561762884e-05, + "loss": 5.6003, + "step": 1892 + }, + { + "epoch": 0.08813464627418116, + "grad_norm": 1.0383823313501892, + "learning_rate": 2.9376163873370577e-05, + "loss": 5.5487, + "step": 1893 + }, + { + "epoch": 0.08818120446027423, + "grad_norm": 1.004777657194116, + "learning_rate": 2.9391682184978276e-05, + "loss": 5.5551, + "step": 1894 + }, + { + "epoch": 0.0882277626463673, + "grad_norm": 0.7749229228115221, + "learning_rate": 2.9407200496585973e-05, + "loss": 5.6261, + "step": 1895 + }, + { + "epoch": 0.08827432083246037, + "grad_norm": 0.6937369203304308, + "learning_rate": 2.9422718808193672e-05, + "loss": 5.6009, + "step": 1896 + }, + { + "epoch": 0.08832087901855344, + "grad_norm": 0.7996826549950732, + "learning_rate": 2.9438237119801365e-05, + "loss": 5.5347, + "step": 1897 + }, + { + "epoch": 0.0883674372046465, + "grad_norm": 0.7067829097625665, + "learning_rate": 2.9453755431409065e-05, + "loss": 5.571, + "step": 1898 + }, 
+ { + "epoch": 0.08841399539073957, + "grad_norm": 0.8668514211273615, + "learning_rate": 2.946927374301676e-05, + "loss": 5.6634, + "step": 1899 + }, + { + "epoch": 0.08846055357683265, + "grad_norm": 0.9565818829706979, + "learning_rate": 2.948479205462446e-05, + "loss": 5.727, + "step": 1900 + }, + { + "epoch": 0.08850711176292572, + "grad_norm": 0.8516426215362469, + "learning_rate": 2.9500310366232154e-05, + "loss": 5.5708, + "step": 1901 + }, + { + "epoch": 0.08855366994901878, + "grad_norm": 0.6317844538218985, + "learning_rate": 2.951582867783985e-05, + "loss": 5.618, + "step": 1902 + }, + { + "epoch": 0.08860022813511186, + "grad_norm": 0.7758449198431692, + "learning_rate": 2.953134698944755e-05, + "loss": 5.6264, + "step": 1903 + }, + { + "epoch": 0.08864678632120493, + "grad_norm": 0.922322434067587, + "learning_rate": 2.954686530105525e-05, + "loss": 5.5577, + "step": 1904 + }, + { + "epoch": 0.088693344507298, + "grad_norm": 1.05440317412194, + "learning_rate": 2.9562383612662943e-05, + "loss": 5.5094, + "step": 1905 + }, + { + "epoch": 0.08873990269339106, + "grad_norm": 0.8862765365126525, + "learning_rate": 2.957790192427064e-05, + "loss": 5.5827, + "step": 1906 + }, + { + "epoch": 0.08878646087948414, + "grad_norm": 0.856085711017953, + "learning_rate": 2.959342023587834e-05, + "loss": 5.5961, + "step": 1907 + }, + { + "epoch": 0.08883301906557721, + "grad_norm": 0.8867997503980862, + "learning_rate": 2.9608938547486038e-05, + "loss": 5.481, + "step": 1908 + }, + { + "epoch": 0.08887957725167027, + "grad_norm": 0.8500633296509718, + "learning_rate": 2.962445685909373e-05, + "loss": 5.6722, + "step": 1909 + }, + { + "epoch": 0.08892613543776334, + "grad_norm": 0.7634919353305633, + "learning_rate": 2.9639975170701427e-05, + "loss": 5.5543, + "step": 1910 + }, + { + "epoch": 0.08897269362385642, + "grad_norm": 0.6775882354599521, + "learning_rate": 2.9655493482309127e-05, + "loss": 5.4839, + "step": 1911 + }, + { + "epoch": 0.08901925180994949, + "grad_norm": 0.6957864177372448, + "learning_rate": 2.9671011793916827e-05, + "loss": 5.5037, + "step": 1912 + }, + { + "epoch": 0.08906580999604255, + "grad_norm": 0.8112209681728683, + "learning_rate": 2.968653010552452e-05, + "loss": 5.5321, + "step": 1913 + }, + { + "epoch": 0.08911236818213562, + "grad_norm": 0.6958687822686576, + "learning_rate": 2.9702048417132216e-05, + "loss": 5.5894, + "step": 1914 + }, + { + "epoch": 0.0891589263682287, + "grad_norm": 0.7370759063647345, + "learning_rate": 2.9717566728739916e-05, + "loss": 5.5581, + "step": 1915 + }, + { + "epoch": 0.08920548455432176, + "grad_norm": 0.5680144279507562, + "learning_rate": 2.9733085040347612e-05, + "loss": 5.5914, + "step": 1916 + }, + { + "epoch": 0.08925204274041483, + "grad_norm": 0.6940711180378859, + "learning_rate": 2.9748603351955305e-05, + "loss": 5.6189, + "step": 1917 + }, + { + "epoch": 0.0892986009265079, + "grad_norm": 0.7432144048651265, + "learning_rate": 2.9764121663563005e-05, + "loss": 5.4783, + "step": 1918 + }, + { + "epoch": 0.08934515911260098, + "grad_norm": 0.7983509055842892, + "learning_rate": 2.9779639975170704e-05, + "loss": 5.6518, + "step": 1919 + }, + { + "epoch": 0.08939171729869404, + "grad_norm": 0.6368592281757977, + "learning_rate": 2.97951582867784e-05, + "loss": 5.4463, + "step": 1920 + }, + { + "epoch": 0.08943827548478711, + "grad_norm": 0.6648767117380764, + "learning_rate": 2.9810676598386094e-05, + "loss": 5.5843, + "step": 1921 + }, + { + "epoch": 0.08948483367088018, + "grad_norm": 0.7572088725383231, + 
"learning_rate": 2.9826194909993793e-05, + "loss": 5.5725, + "step": 1922 + }, + { + "epoch": 0.08953139185697326, + "grad_norm": 0.7873773916412898, + "learning_rate": 2.9841713221601493e-05, + "loss": 5.644, + "step": 1923 + }, + { + "epoch": 0.08957795004306632, + "grad_norm": 0.5900849697003538, + "learning_rate": 2.985723153320919e-05, + "loss": 5.6567, + "step": 1924 + }, + { + "epoch": 0.08962450822915939, + "grad_norm": 0.7249418255695877, + "learning_rate": 2.9872749844816882e-05, + "loss": 5.7177, + "step": 1925 + }, + { + "epoch": 0.08967106641525247, + "grad_norm": 0.7356473497042829, + "learning_rate": 2.9888268156424582e-05, + "loss": 5.588, + "step": 1926 + }, + { + "epoch": 0.08971762460134552, + "grad_norm": 0.7525793532943549, + "learning_rate": 2.9903786468032278e-05, + "loss": 5.5887, + "step": 1927 + }, + { + "epoch": 0.0897641827874386, + "grad_norm": 0.7607397714661135, + "learning_rate": 2.9919304779639978e-05, + "loss": 5.607, + "step": 1928 + }, + { + "epoch": 0.08981074097353167, + "grad_norm": 0.68056110805037, + "learning_rate": 2.993482309124767e-05, + "loss": 5.5688, + "step": 1929 + }, + { + "epoch": 0.08985729915962475, + "grad_norm": 0.6902069200244675, + "learning_rate": 2.995034140285537e-05, + "loss": 5.5246, + "step": 1930 + }, + { + "epoch": 0.0899038573457178, + "grad_norm": 0.7430142203107225, + "learning_rate": 2.9965859714463067e-05, + "loss": 5.6254, + "step": 1931 + }, + { + "epoch": 0.08995041553181088, + "grad_norm": 0.8462856851656131, + "learning_rate": 2.9981378026070767e-05, + "loss": 5.5316, + "step": 1932 + }, + { + "epoch": 0.08999697371790395, + "grad_norm": 0.7879542643215856, + "learning_rate": 2.999689633767846e-05, + "loss": 5.5792, + "step": 1933 + }, + { + "epoch": 0.09004353190399703, + "grad_norm": 0.7578423433200348, + "learning_rate": 3.001241464928616e-05, + "loss": 5.5648, + "step": 1934 + }, + { + "epoch": 0.09009009009009009, + "grad_norm": 0.7326866871305803, + "learning_rate": 3.0027932960893855e-05, + "loss": 5.5295, + "step": 1935 + }, + { + "epoch": 0.09013664827618316, + "grad_norm": 0.7774840402197999, + "learning_rate": 3.0043451272501555e-05, + "loss": 5.5467, + "step": 1936 + }, + { + "epoch": 0.09018320646227623, + "grad_norm": 0.9250499689739928, + "learning_rate": 3.0058969584109248e-05, + "loss": 5.5874, + "step": 1937 + }, + { + "epoch": 0.09022976464836929, + "grad_norm": 0.7226255081622043, + "learning_rate": 3.0074487895716948e-05, + "loss": 5.5192, + "step": 1938 + }, + { + "epoch": 0.09027632283446237, + "grad_norm": 0.628289780879942, + "learning_rate": 3.0090006207324644e-05, + "loss": 5.64, + "step": 1939 + }, + { + "epoch": 0.09032288102055544, + "grad_norm": 0.7079210023959386, + "learning_rate": 3.0105524518932344e-05, + "loss": 5.6358, + "step": 1940 + }, + { + "epoch": 0.09036943920664851, + "grad_norm": 0.7856403475114099, + "learning_rate": 3.0121042830540037e-05, + "loss": 5.6918, + "step": 1941 + }, + { + "epoch": 0.09041599739274157, + "grad_norm": 0.7299863459862357, + "learning_rate": 3.0136561142147733e-05, + "loss": 5.6159, + "step": 1942 + }, + { + "epoch": 0.09046255557883465, + "grad_norm": 0.6955725141916074, + "learning_rate": 3.0152079453755433e-05, + "loss": 5.5872, + "step": 1943 + }, + { + "epoch": 0.09050911376492772, + "grad_norm": 0.6795586045174776, + "learning_rate": 3.0167597765363132e-05, + "loss": 5.5881, + "step": 1944 + }, + { + "epoch": 0.0905556719510208, + "grad_norm": 0.6370088868232187, + "learning_rate": 3.0183116076970825e-05, + "loss": 5.4696, + "step": 1945 
+ }, + { + "epoch": 0.09060223013711385, + "grad_norm": 0.651950057852523, + "learning_rate": 3.019863438857852e-05, + "loss": 5.5288, + "step": 1946 + }, + { + "epoch": 0.09064878832320693, + "grad_norm": 0.7669744895648893, + "learning_rate": 3.021415270018622e-05, + "loss": 5.6035, + "step": 1947 + }, + { + "epoch": 0.0906953465093, + "grad_norm": 0.630094129417523, + "learning_rate": 3.022967101179392e-05, + "loss": 5.5357, + "step": 1948 + }, + { + "epoch": 0.09074190469539306, + "grad_norm": 0.7557256669339629, + "learning_rate": 3.0245189323401614e-05, + "loss": 5.5931, + "step": 1949 + }, + { + "epoch": 0.09078846288148613, + "grad_norm": 0.9835862641201876, + "learning_rate": 3.026070763500931e-05, + "loss": 5.535, + "step": 1950 + }, + { + "epoch": 0.09083502106757921, + "grad_norm": 1.1755681172083206, + "learning_rate": 3.027622594661701e-05, + "loss": 5.4741, + "step": 1951 + }, + { + "epoch": 0.09088157925367228, + "grad_norm": 0.6938885786349052, + "learning_rate": 3.029174425822471e-05, + "loss": 5.6213, + "step": 1952 + }, + { + "epoch": 0.09092813743976534, + "grad_norm": 0.8690745490615884, + "learning_rate": 3.0307262569832406e-05, + "loss": 5.5086, + "step": 1953 + }, + { + "epoch": 0.09097469562585841, + "grad_norm": 0.9731559178816664, + "learning_rate": 3.03227808814401e-05, + "loss": 5.5808, + "step": 1954 + }, + { + "epoch": 0.09102125381195149, + "grad_norm": 0.8332639911670896, + "learning_rate": 3.03382991930478e-05, + "loss": 5.5237, + "step": 1955 + }, + { + "epoch": 0.09106781199804456, + "grad_norm": 0.9193880887673945, + "learning_rate": 3.0353817504655495e-05, + "loss": 5.4002, + "step": 1956 + }, + { + "epoch": 0.09111437018413762, + "grad_norm": 0.8475422047174741, + "learning_rate": 3.0369335816263195e-05, + "loss": 5.5954, + "step": 1957 + }, + { + "epoch": 0.0911609283702307, + "grad_norm": 0.781478586970659, + "learning_rate": 3.0384854127870887e-05, + "loss": 5.6108, + "step": 1958 + }, + { + "epoch": 0.09120748655632377, + "grad_norm": 0.8295037513840093, + "learning_rate": 3.0400372439478587e-05, + "loss": 5.6315, + "step": 1959 + }, + { + "epoch": 0.09125404474241683, + "grad_norm": 0.653210282428843, + "learning_rate": 3.0415890751086283e-05, + "loss": 5.4223, + "step": 1960 + }, + { + "epoch": 0.0913006029285099, + "grad_norm": 0.7575298778723684, + "learning_rate": 3.0431409062693983e-05, + "loss": 5.4391, + "step": 1961 + }, + { + "epoch": 0.09134716111460298, + "grad_norm": 0.7767775428010877, + "learning_rate": 3.0446927374301676e-05, + "loss": 5.6257, + "step": 1962 + }, + { + "epoch": 0.09139371930069605, + "grad_norm": 0.6500536941949372, + "learning_rate": 3.0462445685909376e-05, + "loss": 5.4994, + "step": 1963 + }, + { + "epoch": 0.09144027748678911, + "grad_norm": 0.6781265010091525, + "learning_rate": 3.0477963997517072e-05, + "loss": 5.5446, + "step": 1964 + }, + { + "epoch": 0.09148683567288218, + "grad_norm": 0.758235581012379, + "learning_rate": 3.0493482309124772e-05, + "loss": 5.4854, + "step": 1965 + }, + { + "epoch": 0.09153339385897526, + "grad_norm": 0.7237987964484485, + "learning_rate": 3.0509000620732465e-05, + "loss": 5.4481, + "step": 1966 + }, + { + "epoch": 0.09157995204506833, + "grad_norm": 0.6533127728669784, + "learning_rate": 3.052451893234016e-05, + "loss": 5.514, + "step": 1967 + }, + { + "epoch": 0.09162651023116139, + "grad_norm": 0.629281779104417, + "learning_rate": 3.0540037243947864e-05, + "loss": 5.5798, + "step": 1968 + }, + { + "epoch": 0.09167306841725446, + "grad_norm": 0.7132170444638339, + 
"learning_rate": 3.055555555555556e-05, + "loss": 5.5692, + "step": 1969 + }, + { + "epoch": 0.09171962660334754, + "grad_norm": 0.7822329204956902, + "learning_rate": 3.057107386716325e-05, + "loss": 5.4213, + "step": 1970 + }, + { + "epoch": 0.0917661847894406, + "grad_norm": 0.8831790323278909, + "learning_rate": 3.058659217877095e-05, + "loss": 5.5932, + "step": 1971 + }, + { + "epoch": 0.09181274297553367, + "grad_norm": 0.8922981600989531, + "learning_rate": 3.060211049037865e-05, + "loss": 5.5974, + "step": 1972 + }, + { + "epoch": 0.09185930116162674, + "grad_norm": 0.9154789425248772, + "learning_rate": 3.0617628801986346e-05, + "loss": 5.5305, + "step": 1973 + }, + { + "epoch": 0.09190585934771982, + "grad_norm": 0.8505954915265098, + "learning_rate": 3.063314711359404e-05, + "loss": 5.4311, + "step": 1974 + }, + { + "epoch": 0.09195241753381288, + "grad_norm": 0.679319924466369, + "learning_rate": 3.064866542520174e-05, + "loss": 5.5264, + "step": 1975 + }, + { + "epoch": 0.09199897571990595, + "grad_norm": 0.6999423969942129, + "learning_rate": 3.0664183736809435e-05, + "loss": 5.5264, + "step": 1976 + }, + { + "epoch": 0.09204553390599902, + "grad_norm": 0.6232402529298515, + "learning_rate": 3.067970204841714e-05, + "loss": 5.458, + "step": 1977 + }, + { + "epoch": 0.0920920920920921, + "grad_norm": 0.7208716746617155, + "learning_rate": 3.069522036002483e-05, + "loss": 5.512, + "step": 1978 + }, + { + "epoch": 0.09213865027818516, + "grad_norm": 0.7571463948536187, + "learning_rate": 3.071073867163253e-05, + "loss": 5.5942, + "step": 1979 + }, + { + "epoch": 0.09218520846427823, + "grad_norm": 0.6523399225829043, + "learning_rate": 3.0726256983240227e-05, + "loss": 5.4863, + "step": 1980 + }, + { + "epoch": 0.0922317666503713, + "grad_norm": 0.6199512899883541, + "learning_rate": 3.074177529484792e-05, + "loss": 5.6017, + "step": 1981 + }, + { + "epoch": 0.09227832483646436, + "grad_norm": 0.6483187913731361, + "learning_rate": 3.075729360645562e-05, + "loss": 5.5847, + "step": 1982 + }, + { + "epoch": 0.09232488302255744, + "grad_norm": 0.6232819377955665, + "learning_rate": 3.0772811918063315e-05, + "loss": 5.5301, + "step": 1983 + }, + { + "epoch": 0.09237144120865051, + "grad_norm": 0.6294698242812704, + "learning_rate": 3.078833022967101e-05, + "loss": 5.5984, + "step": 1984 + }, + { + "epoch": 0.09241799939474359, + "grad_norm": 0.674442582719515, + "learning_rate": 3.0803848541278715e-05, + "loss": 5.5083, + "step": 1985 + }, + { + "epoch": 0.09246455758083665, + "grad_norm": 0.7371751744228199, + "learning_rate": 3.0819366852886404e-05, + "loss": 5.5984, + "step": 1986 + }, + { + "epoch": 0.09251111576692972, + "grad_norm": 0.8497106765593828, + "learning_rate": 3.08348851644941e-05, + "loss": 5.4701, + "step": 1987 + }, + { + "epoch": 0.09255767395302279, + "grad_norm": 0.7123691823240355, + "learning_rate": 3.0850403476101804e-05, + "loss": 5.5665, + "step": 1988 + }, + { + "epoch": 0.09260423213911587, + "grad_norm": 0.7015603218174028, + "learning_rate": 3.08659217877095e-05, + "loss": 5.6093, + "step": 1989 + }, + { + "epoch": 0.09265079032520893, + "grad_norm": 0.7059441785840654, + "learning_rate": 3.0881440099317196e-05, + "loss": 5.4759, + "step": 1990 + }, + { + "epoch": 0.092697348511302, + "grad_norm": 0.6788329132562468, + "learning_rate": 3.089695841092489e-05, + "loss": 5.5066, + "step": 1991 + }, + { + "epoch": 0.09274390669739507, + "grad_norm": 0.8040281685260445, + "learning_rate": 3.091247672253259e-05, + "loss": 5.5947, + "step": 1992 + }, + { + 
"epoch": 0.09279046488348813, + "grad_norm": 0.8630570849727176, + "learning_rate": 3.092799503414029e-05, + "loss": 5.499, + "step": 1993 + }, + { + "epoch": 0.0928370230695812, + "grad_norm": 0.7664404204888459, + "learning_rate": 3.094351334574798e-05, + "loss": 5.6404, + "step": 1994 + }, + { + "epoch": 0.09288358125567428, + "grad_norm": 0.7671850169499503, + "learning_rate": 3.095903165735568e-05, + "loss": 5.5588, + "step": 1995 + }, + { + "epoch": 0.09293013944176735, + "grad_norm": 0.6582453748899981, + "learning_rate": 3.097454996896338e-05, + "loss": 5.4526, + "step": 1996 + }, + { + "epoch": 0.09297669762786041, + "grad_norm": 0.8612843265903323, + "learning_rate": 3.099006828057108e-05, + "loss": 5.517, + "step": 1997 + }, + { + "epoch": 0.09302325581395349, + "grad_norm": 0.940627975401497, + "learning_rate": 3.1005586592178774e-05, + "loss": 5.5397, + "step": 1998 + }, + { + "epoch": 0.09306981400004656, + "grad_norm": 0.9826515437685127, + "learning_rate": 3.102110490378647e-05, + "loss": 5.3765, + "step": 1999 + }, + { + "epoch": 0.09311637218613963, + "grad_norm": 0.9170053494813368, + "learning_rate": 3.1036623215394166e-05, + "loss": 5.4247, + "step": 2000 + }, + { + "epoch": 0.0931629303722327, + "grad_norm": 0.9043124130948939, + "learning_rate": 3.105214152700186e-05, + "loss": 5.5144, + "step": 2001 + }, + { + "epoch": 0.09320948855832577, + "grad_norm": 0.9825634341623806, + "learning_rate": 3.106765983860956e-05, + "loss": 5.4863, + "step": 2002 + }, + { + "epoch": 0.09325604674441884, + "grad_norm": 0.7867076221755673, + "learning_rate": 3.1083178150217255e-05, + "loss": 5.5208, + "step": 2003 + }, + { + "epoch": 0.0933026049305119, + "grad_norm": 0.8714135096815101, + "learning_rate": 3.109869646182496e-05, + "loss": 5.4533, + "step": 2004 + }, + { + "epoch": 0.09334916311660497, + "grad_norm": 0.7507581829654182, + "learning_rate": 3.1114214773432655e-05, + "loss": 5.47, + "step": 2005 + }, + { + "epoch": 0.09339572130269805, + "grad_norm": 0.7657400804497468, + "learning_rate": 3.1129733085040344e-05, + "loss": 5.5632, + "step": 2006 + }, + { + "epoch": 0.09344227948879112, + "grad_norm": 0.7649746315083483, + "learning_rate": 3.114525139664805e-05, + "loss": 5.588, + "step": 2007 + }, + { + "epoch": 0.09348883767488418, + "grad_norm": 0.7064388999904251, + "learning_rate": 3.1160769708255743e-05, + "loss": 5.4073, + "step": 2008 + }, + { + "epoch": 0.09353539586097726, + "grad_norm": 0.6394324846052116, + "learning_rate": 3.117628801986344e-05, + "loss": 5.5014, + "step": 2009 + }, + { + "epoch": 0.09358195404707033, + "grad_norm": 0.792430647403061, + "learning_rate": 3.1191806331471136e-05, + "loss": 5.5328, + "step": 2010 + }, + { + "epoch": 0.0936285122331634, + "grad_norm": 0.7740014786816583, + "learning_rate": 3.120732464307883e-05, + "loss": 5.5071, + "step": 2011 + }, + { + "epoch": 0.09367507041925646, + "grad_norm": 0.6830639063592907, + "learning_rate": 3.122284295468653e-05, + "loss": 5.4698, + "step": 2012 + }, + { + "epoch": 0.09372162860534954, + "grad_norm": 0.6391042992135568, + "learning_rate": 3.123836126629423e-05, + "loss": 5.3995, + "step": 2013 + }, + { + "epoch": 0.09376818679144261, + "grad_norm": 0.683304660785396, + "learning_rate": 3.125387957790192e-05, + "loss": 5.3917, + "step": 2014 + }, + { + "epoch": 0.09381474497753567, + "grad_norm": 0.6487597789085952, + "learning_rate": 3.1269397889509624e-05, + "loss": 5.5831, + "step": 2015 + }, + { + "epoch": 0.09386130316362874, + "grad_norm": 0.6550023323583538, + "learning_rate": 
3.128491620111732e-05, + "loss": 5.5434, + "step": 2016 + }, + { + "epoch": 0.09390786134972182, + "grad_norm": 0.6196749633662026, + "learning_rate": 3.130043451272502e-05, + "loss": 5.5438, + "step": 2017 + }, + { + "epoch": 0.09395441953581489, + "grad_norm": 0.5913071859256668, + "learning_rate": 3.131595282433271e-05, + "loss": 5.5087, + "step": 2018 + }, + { + "epoch": 0.09400097772190795, + "grad_norm": 0.7181240135759702, + "learning_rate": 3.133147113594041e-05, + "loss": 5.4837, + "step": 2019 + }, + { + "epoch": 0.09404753590800102, + "grad_norm": 0.715734488622528, + "learning_rate": 3.1346989447548106e-05, + "loss": 5.4332, + "step": 2020 + }, + { + "epoch": 0.0940940940940941, + "grad_norm": 0.6747860556081424, + "learning_rate": 3.136250775915581e-05, + "loss": 5.4745, + "step": 2021 + }, + { + "epoch": 0.09414065228018717, + "grad_norm": 0.6656240256475393, + "learning_rate": 3.13780260707635e-05, + "loss": 5.4597, + "step": 2022 + }, + { + "epoch": 0.09418721046628023, + "grad_norm": 0.613824631551493, + "learning_rate": 3.13935443823712e-05, + "loss": 5.3477, + "step": 2023 + }, + { + "epoch": 0.0942337686523733, + "grad_norm": 0.6471375481150664, + "learning_rate": 3.14090626939789e-05, + "loss": 5.4157, + "step": 2024 + }, + { + "epoch": 0.09428032683846638, + "grad_norm": 0.6326633110064277, + "learning_rate": 3.1424581005586594e-05, + "loss": 5.5156, + "step": 2025 + }, + { + "epoch": 0.09432688502455944, + "grad_norm": 0.7665977795016903, + "learning_rate": 3.144009931719429e-05, + "loss": 5.4865, + "step": 2026 + }, + { + "epoch": 0.09437344321065251, + "grad_norm": 0.8593596944568087, + "learning_rate": 3.145561762880199e-05, + "loss": 5.4245, + "step": 2027 + }, + { + "epoch": 0.09442000139674558, + "grad_norm": 0.8017242997633972, + "learning_rate": 3.147113594040968e-05, + "loss": 5.4589, + "step": 2028 + }, + { + "epoch": 0.09446655958283866, + "grad_norm": 0.7953806062992725, + "learning_rate": 3.1486654252017386e-05, + "loss": 5.5386, + "step": 2029 + }, + { + "epoch": 0.09451311776893172, + "grad_norm": 0.7316122957780088, + "learning_rate": 3.1502172563625076e-05, + "loss": 5.4861, + "step": 2030 + }, + { + "epoch": 0.09455967595502479, + "grad_norm": 0.7108182379417892, + "learning_rate": 3.151769087523277e-05, + "loss": 5.5029, + "step": 2031 + }, + { + "epoch": 0.09460623414111786, + "grad_norm": 0.6897654223572257, + "learning_rate": 3.1533209186840475e-05, + "loss": 5.4941, + "step": 2032 + }, + { + "epoch": 0.09465279232721094, + "grad_norm": 0.6297689520118401, + "learning_rate": 3.154872749844817e-05, + "loss": 5.5304, + "step": 2033 + }, + { + "epoch": 0.094699350513304, + "grad_norm": 0.6864418198684411, + "learning_rate": 3.156424581005587e-05, + "loss": 5.4496, + "step": 2034 + }, + { + "epoch": 0.09474590869939707, + "grad_norm": 0.6755773205191958, + "learning_rate": 3.1579764121663564e-05, + "loss": 5.4131, + "step": 2035 + }, + { + "epoch": 0.09479246688549015, + "grad_norm": 0.6871684061615659, + "learning_rate": 3.159528243327126e-05, + "loss": 5.5041, + "step": 2036 + }, + { + "epoch": 0.0948390250715832, + "grad_norm": 0.5667659089170108, + "learning_rate": 3.161080074487896e-05, + "loss": 5.4471, + "step": 2037 + }, + { + "epoch": 0.09488558325767628, + "grad_norm": 0.6862495653061996, + "learning_rate": 3.162631905648666e-05, + "loss": 5.4673, + "step": 2038 + }, + { + "epoch": 0.09493214144376935, + "grad_norm": 0.6748408650785077, + "learning_rate": 3.164183736809435e-05, + "loss": 5.5241, + "step": 2039 + }, + { + "epoch": 
0.09497869962986243, + "grad_norm": 0.7741400628512435, + "learning_rate": 3.165735567970205e-05, + "loss": 5.4345, + "step": 2040 + }, + { + "epoch": 0.09502525781595549, + "grad_norm": 0.7274086301833415, + "learning_rate": 3.167287399130975e-05, + "loss": 5.4488, + "step": 2041 + }, + { + "epoch": 0.09507181600204856, + "grad_norm": 0.659062453161927, + "learning_rate": 3.1688392302917445e-05, + "loss": 5.3915, + "step": 2042 + }, + { + "epoch": 0.09511837418814163, + "grad_norm": 0.7472577919273168, + "learning_rate": 3.170391061452514e-05, + "loss": 5.443, + "step": 2043 + }, + { + "epoch": 0.0951649323742347, + "grad_norm": 0.6795283642660918, + "learning_rate": 3.171942892613284e-05, + "loss": 5.4973, + "step": 2044 + }, + { + "epoch": 0.09521149056032777, + "grad_norm": 0.7228735800364706, + "learning_rate": 3.1734947237740534e-05, + "loss": 5.4051, + "step": 2045 + }, + { + "epoch": 0.09525804874642084, + "grad_norm": 0.6878713796625887, + "learning_rate": 3.175046554934824e-05, + "loss": 5.3435, + "step": 2046 + }, + { + "epoch": 0.09530460693251391, + "grad_norm": 0.6935861148303738, + "learning_rate": 3.1765983860955927e-05, + "loss": 5.4078, + "step": 2047 + }, + { + "epoch": 0.09535116511860697, + "grad_norm": 0.7616581812876406, + "learning_rate": 3.178150217256363e-05, + "loss": 5.4946, + "step": 2048 + }, + { + "epoch": 0.09539772330470005, + "grad_norm": 0.632319110652569, + "learning_rate": 3.1797020484171326e-05, + "loss": 5.376, + "step": 2049 + }, + { + "epoch": 0.09544428149079312, + "grad_norm": 0.6149826428547617, + "learning_rate": 3.181253879577902e-05, + "loss": 5.6213, + "step": 2050 + }, + { + "epoch": 0.0954908396768862, + "grad_norm": 0.7450522447067004, + "learning_rate": 3.182805710738672e-05, + "loss": 5.4316, + "step": 2051 + }, + { + "epoch": 0.09553739786297925, + "grad_norm": 0.6868272202931508, + "learning_rate": 3.1843575418994415e-05, + "loss": 5.4189, + "step": 2052 + }, + { + "epoch": 0.09558395604907233, + "grad_norm": 0.7915653196578502, + "learning_rate": 3.185909373060211e-05, + "loss": 5.443, + "step": 2053 + }, + { + "epoch": 0.0956305142351654, + "grad_norm": 0.7863420494081199, + "learning_rate": 3.1874612042209814e-05, + "loss": 5.4181, + "step": 2054 + }, + { + "epoch": 0.09567707242125847, + "grad_norm": 0.767734412618603, + "learning_rate": 3.1890130353817504e-05, + "loss": 5.4517, + "step": 2055 + }, + { + "epoch": 0.09572363060735153, + "grad_norm": 0.7544495854385168, + "learning_rate": 3.19056486654252e-05, + "loss": 5.5127, + "step": 2056 + }, + { + "epoch": 0.09577018879344461, + "grad_norm": 0.6470237643806653, + "learning_rate": 3.19211669770329e-05, + "loss": 5.5073, + "step": 2057 + }, + { + "epoch": 0.09581674697953768, + "grad_norm": 0.815137283002879, + "learning_rate": 3.19366852886406e-05, + "loss": 5.4467, + "step": 2058 + }, + { + "epoch": 0.09586330516563074, + "grad_norm": 0.9635826202563684, + "learning_rate": 3.1952203600248296e-05, + "loss": 5.5359, + "step": 2059 + }, + { + "epoch": 0.09590986335172381, + "grad_norm": 1.0206424084184387, + "learning_rate": 3.196772191185599e-05, + "loss": 5.4256, + "step": 2060 + }, + { + "epoch": 0.09595642153781689, + "grad_norm": 0.7546354655682682, + "learning_rate": 3.198324022346369e-05, + "loss": 5.4798, + "step": 2061 + }, + { + "epoch": 0.09600297972390996, + "grad_norm": 0.6584037381046721, + "learning_rate": 3.1998758535071385e-05, + "loss": 5.5849, + "step": 2062 + }, + { + "epoch": 0.09604953791000302, + "grad_norm": 0.8217488159796545, + "learning_rate": 
3.201427684667908e-05, + "loss": 5.4521, + "step": 2063 + }, + { + "epoch": 0.0960960960960961, + "grad_norm": 0.8576287462100981, + "learning_rate": 3.202979515828678e-05, + "loss": 5.5323, + "step": 2064 + }, + { + "epoch": 0.09614265428218917, + "grad_norm": 0.8093372011413045, + "learning_rate": 3.204531346989448e-05, + "loss": 5.5391, + "step": 2065 + }, + { + "epoch": 0.09618921246828224, + "grad_norm": 0.8495867102183643, + "learning_rate": 3.206083178150218e-05, + "loss": 5.3963, + "step": 2066 + }, + { + "epoch": 0.0962357706543753, + "grad_norm": 0.9226539327058356, + "learning_rate": 3.2076350093109866e-05, + "loss": 5.5765, + "step": 2067 + }, + { + "epoch": 0.09628232884046838, + "grad_norm": 0.8708156622923079, + "learning_rate": 3.209186840471757e-05, + "loss": 5.4016, + "step": 2068 + }, + { + "epoch": 0.09632888702656145, + "grad_norm": 0.8744502598400337, + "learning_rate": 3.2107386716325266e-05, + "loss": 5.3317, + "step": 2069 + }, + { + "epoch": 0.09637544521265451, + "grad_norm": 1.0283935782093345, + "learning_rate": 3.212290502793296e-05, + "loss": 5.5226, + "step": 2070 + }, + { + "epoch": 0.09642200339874758, + "grad_norm": 1.0908183542520156, + "learning_rate": 3.213842333954066e-05, + "loss": 5.4005, + "step": 2071 + }, + { + "epoch": 0.09646856158484066, + "grad_norm": 0.7832964889004075, + "learning_rate": 3.2153941651148355e-05, + "loss": 5.469, + "step": 2072 + }, + { + "epoch": 0.09651511977093373, + "grad_norm": 1.0115553149489092, + "learning_rate": 3.216945996275606e-05, + "loss": 5.4658, + "step": 2073 + }, + { + "epoch": 0.09656167795702679, + "grad_norm": 1.0371151767560836, + "learning_rate": 3.2184978274363754e-05, + "loss": 5.4619, + "step": 2074 + }, + { + "epoch": 0.09660823614311986, + "grad_norm": 0.9391529778885108, + "learning_rate": 3.2200496585971444e-05, + "loss": 5.5626, + "step": 2075 + }, + { + "epoch": 0.09665479432921294, + "grad_norm": 0.9240950054764541, + "learning_rate": 3.2216014897579147e-05, + "loss": 5.4366, + "step": 2076 + }, + { + "epoch": 0.09670135251530601, + "grad_norm": 0.7879701147337727, + "learning_rate": 3.223153320918684e-05, + "loss": 5.4349, + "step": 2077 + }, + { + "epoch": 0.09674791070139907, + "grad_norm": 0.7491248978122907, + "learning_rate": 3.224705152079454e-05, + "loss": 5.484, + "step": 2078 + }, + { + "epoch": 0.09679446888749214, + "grad_norm": 0.8054904867479733, + "learning_rate": 3.2262569832402236e-05, + "loss": 5.5242, + "step": 2079 + }, + { + "epoch": 0.09684102707358522, + "grad_norm": 0.7523348877015715, + "learning_rate": 3.227808814400993e-05, + "loss": 5.3808, + "step": 2080 + }, + { + "epoch": 0.09688758525967828, + "grad_norm": 0.763055195948239, + "learning_rate": 3.229360645561763e-05, + "loss": 5.434, + "step": 2081 + }, + { + "epoch": 0.09693414344577135, + "grad_norm": 0.9456416352961088, + "learning_rate": 3.230912476722533e-05, + "loss": 5.3913, + "step": 2082 + }, + { + "epoch": 0.09698070163186442, + "grad_norm": 0.9612883728536061, + "learning_rate": 3.232464307883302e-05, + "loss": 5.4192, + "step": 2083 + }, + { + "epoch": 0.0970272598179575, + "grad_norm": 0.8463456326056613, + "learning_rate": 3.2340161390440724e-05, + "loss": 5.4136, + "step": 2084 + }, + { + "epoch": 0.09707381800405056, + "grad_norm": 0.8674355668560957, + "learning_rate": 3.235567970204842e-05, + "loss": 5.32, + "step": 2085 + }, + { + "epoch": 0.09712037619014363, + "grad_norm": 0.776599176056574, + "learning_rate": 3.2371198013656116e-05, + "loss": 5.4358, + "step": 2086 + }, + { + "epoch": 
0.0971669343762367, + "grad_norm": 0.9354517689648465, + "learning_rate": 3.238671632526381e-05, + "loss": 5.328, + "step": 2087 + }, + { + "epoch": 0.09721349256232978, + "grad_norm": 0.8466969084868906, + "learning_rate": 3.240223463687151e-05, + "loss": 5.398, + "step": 2088 + }, + { + "epoch": 0.09726005074842284, + "grad_norm": 0.819653071970095, + "learning_rate": 3.2417752948479205e-05, + "loss": 5.5662, + "step": 2089 + }, + { + "epoch": 0.09730660893451591, + "grad_norm": 0.9627538370856293, + "learning_rate": 3.243327126008691e-05, + "loss": 5.4538, + "step": 2090 + }, + { + "epoch": 0.09735316712060899, + "grad_norm": 0.7976780606026511, + "learning_rate": 3.24487895716946e-05, + "loss": 5.3446, + "step": 2091 + }, + { + "epoch": 0.09739972530670205, + "grad_norm": 0.7514796316659534, + "learning_rate": 3.2464307883302294e-05, + "loss": 5.4399, + "step": 2092 + }, + { + "epoch": 0.09744628349279512, + "grad_norm": 0.6986836536239048, + "learning_rate": 3.247982619491e-05, + "loss": 5.3924, + "step": 2093 + }, + { + "epoch": 0.09749284167888819, + "grad_norm": 0.7506315715143965, + "learning_rate": 3.2495344506517694e-05, + "loss": 5.4537, + "step": 2094 + }, + { + "epoch": 0.09753939986498127, + "grad_norm": 0.857208763218546, + "learning_rate": 3.251086281812539e-05, + "loss": 5.3684, + "step": 2095 + }, + { + "epoch": 0.09758595805107433, + "grad_norm": 0.6500221895882673, + "learning_rate": 3.2526381129733086e-05, + "loss": 5.4923, + "step": 2096 + }, + { + "epoch": 0.0976325162371674, + "grad_norm": 0.7810501024666229, + "learning_rate": 3.254189944134078e-05, + "loss": 5.3444, + "step": 2097 + }, + { + "epoch": 0.09767907442326047, + "grad_norm": 0.8005275453538311, + "learning_rate": 3.2557417752948486e-05, + "loss": 5.4487, + "step": 2098 + }, + { + "epoch": 0.09772563260935353, + "grad_norm": 0.6637469347544424, + "learning_rate": 3.2572936064556175e-05, + "loss": 5.3084, + "step": 2099 + }, + { + "epoch": 0.0977721907954466, + "grad_norm": 0.863047343264663, + "learning_rate": 3.258845437616387e-05, + "loss": 5.4742, + "step": 2100 + }, + { + "epoch": 0.09781874898153968, + "grad_norm": 0.7735784226093793, + "learning_rate": 3.2603972687771575e-05, + "loss": 5.5825, + "step": 2101 + }, + { + "epoch": 0.09786530716763275, + "grad_norm": 0.8592242041778161, + "learning_rate": 3.261949099937927e-05, + "loss": 5.4363, + "step": 2102 + }, + { + "epoch": 0.09791186535372581, + "grad_norm": 0.9163132607822887, + "learning_rate": 3.263500931098696e-05, + "loss": 5.5077, + "step": 2103 + }, + { + "epoch": 0.09795842353981889, + "grad_norm": 0.9393090922388708, + "learning_rate": 3.2650527622594664e-05, + "loss": 5.5237, + "step": 2104 + }, + { + "epoch": 0.09800498172591196, + "grad_norm": 0.8477993297061978, + "learning_rate": 3.266604593420236e-05, + "loss": 5.4105, + "step": 2105 + }, + { + "epoch": 0.09805153991200503, + "grad_norm": 0.9453079567639139, + "learning_rate": 3.2681564245810056e-05, + "loss": 5.4709, + "step": 2106 + }, + { + "epoch": 0.0980980980980981, + "grad_norm": 1.1902742646807145, + "learning_rate": 3.269708255741775e-05, + "loss": 5.365, + "step": 2107 + }, + { + "epoch": 0.09814465628419117, + "grad_norm": 0.9771539981935106, + "learning_rate": 3.271260086902545e-05, + "loss": 5.4476, + "step": 2108 + }, + { + "epoch": 0.09819121447028424, + "grad_norm": 0.8259385605413632, + "learning_rate": 3.272811918063315e-05, + "loss": 5.5347, + "step": 2109 + }, + { + "epoch": 0.0982377726563773, + "grad_norm": 0.8871797797338448, + "learning_rate": 
3.274363749224085e-05, + "loss": 5.3708, + "step": 2110 + }, + { + "epoch": 0.09828433084247037, + "grad_norm": 1.0154411759793804, + "learning_rate": 3.275915580384854e-05, + "loss": 5.4083, + "step": 2111 + }, + { + "epoch": 0.09833088902856345, + "grad_norm": 0.9205980969853133, + "learning_rate": 3.277467411545624e-05, + "loss": 5.4444, + "step": 2112 + }, + { + "epoch": 0.09837744721465652, + "grad_norm": 0.8007452905565049, + "learning_rate": 3.279019242706394e-05, + "loss": 5.3867, + "step": 2113 + }, + { + "epoch": 0.09842400540074958, + "grad_norm": 0.7897376017851219, + "learning_rate": 3.280571073867163e-05, + "loss": 5.3556, + "step": 2114 + }, + { + "epoch": 0.09847056358684265, + "grad_norm": 0.9823111063434209, + "learning_rate": 3.282122905027933e-05, + "loss": 5.4249, + "step": 2115 + }, + { + "epoch": 0.09851712177293573, + "grad_norm": 0.8143634476705595, + "learning_rate": 3.2836747361887026e-05, + "loss": 5.426, + "step": 2116 + }, + { + "epoch": 0.0985636799590288, + "grad_norm": 0.6972018284740952, + "learning_rate": 3.285226567349472e-05, + "loss": 5.3938, + "step": 2117 + }, + { + "epoch": 0.09861023814512186, + "grad_norm": 1.0449288146085691, + "learning_rate": 3.2867783985102425e-05, + "loss": 5.4812, + "step": 2118 + }, + { + "epoch": 0.09865679633121494, + "grad_norm": 0.8337260989685373, + "learning_rate": 3.2883302296710115e-05, + "loss": 5.387, + "step": 2119 + }, + { + "epoch": 0.09870335451730801, + "grad_norm": 0.7260832836017211, + "learning_rate": 3.289882060831782e-05, + "loss": 5.3897, + "step": 2120 + }, + { + "epoch": 0.09874991270340107, + "grad_norm": 0.886018762412838, + "learning_rate": 3.2914338919925514e-05, + "loss": 5.4554, + "step": 2121 + }, + { + "epoch": 0.09879647088949414, + "grad_norm": 0.6369965357476606, + "learning_rate": 3.292985723153321e-05, + "loss": 5.441, + "step": 2122 + }, + { + "epoch": 0.09884302907558722, + "grad_norm": 0.7803723671776632, + "learning_rate": 3.2945375543140914e-05, + "loss": 5.2592, + "step": 2123 + }, + { + "epoch": 0.09888958726168029, + "grad_norm": 0.7729147838831371, + "learning_rate": 3.29608938547486e-05, + "loss": 5.4988, + "step": 2124 + }, + { + "epoch": 0.09893614544777335, + "grad_norm": 0.7496232277859375, + "learning_rate": 3.29764121663563e-05, + "loss": 5.4003, + "step": 2125 + }, + { + "epoch": 0.09898270363386642, + "grad_norm": 0.7479103415300271, + "learning_rate": 3.2991930477964e-05, + "loss": 5.4588, + "step": 2126 + }, + { + "epoch": 0.0990292618199595, + "grad_norm": 0.6489904446245226, + "learning_rate": 3.30074487895717e-05, + "loss": 5.3516, + "step": 2127 + }, + { + "epoch": 0.09907582000605257, + "grad_norm": 0.8422839803417016, + "learning_rate": 3.302296710117939e-05, + "loss": 5.4571, + "step": 2128 + }, + { + "epoch": 0.09912237819214563, + "grad_norm": 0.9560140692452295, + "learning_rate": 3.303848541278709e-05, + "loss": 5.4249, + "step": 2129 + }, + { + "epoch": 0.0991689363782387, + "grad_norm": 0.8420500294846269, + "learning_rate": 3.305400372439479e-05, + "loss": 5.4079, + "step": 2130 + }, + { + "epoch": 0.09921549456433178, + "grad_norm": 0.818975254518113, + "learning_rate": 3.3069522036002484e-05, + "loss": 5.4088, + "step": 2131 + }, + { + "epoch": 0.09926205275042484, + "grad_norm": 0.868085055965954, + "learning_rate": 3.308504034761018e-05, + "loss": 5.4233, + "step": 2132 + }, + { + "epoch": 0.09930861093651791, + "grad_norm": 0.9253722797125942, + "learning_rate": 3.310055865921788e-05, + "loss": 5.3666, + "step": 2133 + }, + { + "epoch": 
0.09935516912261098, + "grad_norm": 0.9689225436903324, + "learning_rate": 3.311607697082558e-05, + "loss": 5.4221, + "step": 2134 + }, + { + "epoch": 0.09940172730870406, + "grad_norm": 0.8157066868335494, + "learning_rate": 3.3131595282433276e-05, + "loss": 5.5876, + "step": 2135 + }, + { + "epoch": 0.09944828549479712, + "grad_norm": 0.9417522709395438, + "learning_rate": 3.3147113594040966e-05, + "loss": 5.4816, + "step": 2136 + }, + { + "epoch": 0.09949484368089019, + "grad_norm": 0.8082090179584906, + "learning_rate": 3.316263190564867e-05, + "loss": 5.413, + "step": 2137 + }, + { + "epoch": 0.09954140186698326, + "grad_norm": 0.7372025893935589, + "learning_rate": 3.3178150217256365e-05, + "loss": 5.3821, + "step": 2138 + }, + { + "epoch": 0.09958796005307634, + "grad_norm": 0.7812748655317177, + "learning_rate": 3.319366852886406e-05, + "loss": 5.3902, + "step": 2139 + }, + { + "epoch": 0.0996345182391694, + "grad_norm": 0.8481006986041254, + "learning_rate": 3.320918684047176e-05, + "loss": 5.381, + "step": 2140 + }, + { + "epoch": 0.09968107642526247, + "grad_norm": 0.7557529619867895, + "learning_rate": 3.3224705152079454e-05, + "loss": 5.3587, + "step": 2141 + }, + { + "epoch": 0.09972763461135555, + "grad_norm": 0.7857935905631557, + "learning_rate": 3.324022346368715e-05, + "loss": 5.3818, + "step": 2142 + }, + { + "epoch": 0.0997741927974486, + "grad_norm": 0.7811341671955595, + "learning_rate": 3.3255741775294853e-05, + "loss": 5.3992, + "step": 2143 + }, + { + "epoch": 0.09982075098354168, + "grad_norm": 0.6898288371997354, + "learning_rate": 3.327126008690254e-05, + "loss": 5.3971, + "step": 2144 + }, + { + "epoch": 0.09986730916963475, + "grad_norm": 0.618990319719707, + "learning_rate": 3.3286778398510246e-05, + "loss": 5.4022, + "step": 2145 + }, + { + "epoch": 0.09991386735572783, + "grad_norm": 0.7388340555336812, + "learning_rate": 3.330229671011794e-05, + "loss": 5.3703, + "step": 2146 + }, + { + "epoch": 0.09996042554182089, + "grad_norm": 0.7039955957437405, + "learning_rate": 3.331781502172564e-05, + "loss": 5.3939, + "step": 2147 + }, + { + "epoch": 0.10000698372791396, + "grad_norm": 0.7405268034494326, + "learning_rate": 3.3333333333333335e-05, + "loss": 5.3962, + "step": 2148 + }, + { + "epoch": 0.10005354191400703, + "grad_norm": 0.6903909773159458, + "learning_rate": 3.334885164494103e-05, + "loss": 5.4066, + "step": 2149 + }, + { + "epoch": 0.1001001001001001, + "grad_norm": 0.7964463869686838, + "learning_rate": 3.336436995654873e-05, + "loss": 5.321, + "step": 2150 + }, + { + "epoch": 0.10014665828619317, + "grad_norm": 0.7760377407090047, + "learning_rate": 3.337988826815643e-05, + "loss": 5.3416, + "step": 2151 + }, + { + "epoch": 0.10019321647228624, + "grad_norm": 0.6682044273498207, + "learning_rate": 3.339540657976412e-05, + "loss": 5.429, + "step": 2152 + }, + { + "epoch": 0.10023977465837931, + "grad_norm": 0.5881340525332729, + "learning_rate": 3.3410924891371816e-05, + "loss": 5.4585, + "step": 2153 + }, + { + "epoch": 0.10028633284447237, + "grad_norm": 0.7703069436714824, + "learning_rate": 3.342644320297952e-05, + "loss": 5.3568, + "step": 2154 + }, + { + "epoch": 0.10033289103056545, + "grad_norm": 0.7758660020300716, + "learning_rate": 3.3441961514587216e-05, + "loss": 5.3032, + "step": 2155 + }, + { + "epoch": 0.10037944921665852, + "grad_norm": 0.5685509388334312, + "learning_rate": 3.345747982619491e-05, + "loss": 5.3636, + "step": 2156 + }, + { + "epoch": 0.1004260074027516, + "grad_norm": 0.6847238790964421, + "learning_rate": 
3.347299813780261e-05, + "loss": 5.3782, + "step": 2157 + }, + { + "epoch": 0.10047256558884465, + "grad_norm": 0.6139006876495428, + "learning_rate": 3.3488516449410305e-05, + "loss": 5.3662, + "step": 2158 + }, + { + "epoch": 0.10051912377493773, + "grad_norm": 0.6597623563973267, + "learning_rate": 3.350403476101801e-05, + "loss": 5.3691, + "step": 2159 + }, + { + "epoch": 0.1005656819610308, + "grad_norm": 0.6513781953685653, + "learning_rate": 3.35195530726257e-05, + "loss": 5.3746, + "step": 2160 + }, + { + "epoch": 0.10061224014712387, + "grad_norm": 0.7147948112722353, + "learning_rate": 3.3535071384233394e-05, + "loss": 5.3973, + "step": 2161 + }, + { + "epoch": 0.10065879833321693, + "grad_norm": 0.7541059484286348, + "learning_rate": 3.35505896958411e-05, + "loss": 5.2575, + "step": 2162 + }, + { + "epoch": 0.10070535651931001, + "grad_norm": 0.6468053802904228, + "learning_rate": 3.356610800744879e-05, + "loss": 5.5362, + "step": 2163 + }, + { + "epoch": 0.10075191470540308, + "grad_norm": 0.7492860727759062, + "learning_rate": 3.358162631905649e-05, + "loss": 5.334, + "step": 2164 + }, + { + "epoch": 0.10079847289149614, + "grad_norm": 0.6785885836591642, + "learning_rate": 3.3597144630664186e-05, + "loss": 5.4893, + "step": 2165 + }, + { + "epoch": 0.10084503107758921, + "grad_norm": 0.7441520067315942, + "learning_rate": 3.361266294227188e-05, + "loss": 5.3444, + "step": 2166 + }, + { + "epoch": 0.10089158926368229, + "grad_norm": 0.6304714899447437, + "learning_rate": 3.362818125387958e-05, + "loss": 5.3367, + "step": 2167 + }, + { + "epoch": 0.10093814744977536, + "grad_norm": 0.6676721133011861, + "learning_rate": 3.3643699565487275e-05, + "loss": 5.2943, + "step": 2168 + }, + { + "epoch": 0.10098470563586842, + "grad_norm": 0.6286025808212179, + "learning_rate": 3.365921787709497e-05, + "loss": 5.3504, + "step": 2169 + }, + { + "epoch": 0.1010312638219615, + "grad_norm": 0.6569532063323099, + "learning_rate": 3.3674736188702674e-05, + "loss": 5.3126, + "step": 2170 + }, + { + "epoch": 0.10107782200805457, + "grad_norm": 0.6684496688687382, + "learning_rate": 3.369025450031037e-05, + "loss": 5.4278, + "step": 2171 + }, + { + "epoch": 0.10112438019414764, + "grad_norm": 0.7267544512719635, + "learning_rate": 3.370577281191806e-05, + "loss": 5.4322, + "step": 2172 + }, + { + "epoch": 0.1011709383802407, + "grad_norm": 0.5949073215164684, + "learning_rate": 3.372129112352576e-05, + "loss": 5.3753, + "step": 2173 + }, + { + "epoch": 0.10121749656633378, + "grad_norm": 0.6375561379482905, + "learning_rate": 3.373680943513346e-05, + "loss": 5.3724, + "step": 2174 + }, + { + "epoch": 0.10126405475242685, + "grad_norm": 0.7528866061584675, + "learning_rate": 3.3752327746741156e-05, + "loss": 5.3322, + "step": 2175 + }, + { + "epoch": 0.10131061293851991, + "grad_norm": 0.8226092717501584, + "learning_rate": 3.376784605834885e-05, + "loss": 5.3572, + "step": 2176 + }, + { + "epoch": 0.10135717112461298, + "grad_norm": 0.8065830713483888, + "learning_rate": 3.378336436995655e-05, + "loss": 5.3677, + "step": 2177 + }, + { + "epoch": 0.10140372931070606, + "grad_norm": 0.9142561721772723, + "learning_rate": 3.3798882681564244e-05, + "loss": 5.277, + "step": 2178 + }, + { + "epoch": 0.10145028749679913, + "grad_norm": 0.9152772605077601, + "learning_rate": 3.381440099317195e-05, + "loss": 5.2644, + "step": 2179 + }, + { + "epoch": 0.10149684568289219, + "grad_norm": 0.7862073198566724, + "learning_rate": 3.382991930477964e-05, + "loss": 5.3267, + "step": 2180 + }, + { + "epoch": 
0.10154340386898526, + "grad_norm": 0.6690707952544361, + "learning_rate": 3.384543761638734e-05, + "loss": 5.3067, + "step": 2181 + }, + { + "epoch": 0.10158996205507834, + "grad_norm": 0.7507852135661223, + "learning_rate": 3.3860955927995036e-05, + "loss": 5.3353, + "step": 2182 + }, + { + "epoch": 0.10163652024117141, + "grad_norm": 0.802089531035739, + "learning_rate": 3.387647423960273e-05, + "loss": 5.2941, + "step": 2183 + }, + { + "epoch": 0.10168307842726447, + "grad_norm": 0.7863237357856485, + "learning_rate": 3.389199255121043e-05, + "loss": 5.3594, + "step": 2184 + }, + { + "epoch": 0.10172963661335754, + "grad_norm": 0.6943303425123402, + "learning_rate": 3.3907510862818125e-05, + "loss": 5.3535, + "step": 2185 + }, + { + "epoch": 0.10177619479945062, + "grad_norm": 0.9337720474228547, + "learning_rate": 3.392302917442582e-05, + "loss": 5.3173, + "step": 2186 + }, + { + "epoch": 0.10182275298554368, + "grad_norm": 0.924842953399407, + "learning_rate": 3.3938547486033525e-05, + "loss": 5.2962, + "step": 2187 + }, + { + "epoch": 0.10186931117163675, + "grad_norm": 0.7066245065912748, + "learning_rate": 3.3954065797641214e-05, + "loss": 5.3614, + "step": 2188 + }, + { + "epoch": 0.10191586935772982, + "grad_norm": 0.8052808591249031, + "learning_rate": 3.396958410924892e-05, + "loss": 5.42, + "step": 2189 + }, + { + "epoch": 0.1019624275438229, + "grad_norm": 0.9659977648979514, + "learning_rate": 3.3985102420856614e-05, + "loss": 5.3049, + "step": 2190 + }, + { + "epoch": 0.10200898572991596, + "grad_norm": 0.8438459894920506, + "learning_rate": 3.400062073246431e-05, + "loss": 5.3317, + "step": 2191 + }, + { + "epoch": 0.10205554391600903, + "grad_norm": 0.7492908400341401, + "learning_rate": 3.4016139044072006e-05, + "loss": 5.402, + "step": 2192 + }, + { + "epoch": 0.1021021021021021, + "grad_norm": 0.728655470191162, + "learning_rate": 3.40316573556797e-05, + "loss": 5.3714, + "step": 2193 + }, + { + "epoch": 0.10214866028819518, + "grad_norm": 0.6880732279996958, + "learning_rate": 3.40471756672874e-05, + "loss": 5.3542, + "step": 2194 + }, + { + "epoch": 0.10219521847428824, + "grad_norm": 0.6993335195996818, + "learning_rate": 3.40626939788951e-05, + "loss": 5.3401, + "step": 2195 + }, + { + "epoch": 0.10224177666038131, + "grad_norm": 0.6736877771013461, + "learning_rate": 3.407821229050279e-05, + "loss": 5.3019, + "step": 2196 + }, + { + "epoch": 0.10228833484647439, + "grad_norm": 0.593027373297827, + "learning_rate": 3.409373060211049e-05, + "loss": 5.3513, + "step": 2197 + }, + { + "epoch": 0.10233489303256744, + "grad_norm": 0.7320753081887109, + "learning_rate": 3.410924891371819e-05, + "loss": 5.2314, + "step": 2198 + }, + { + "epoch": 0.10238145121866052, + "grad_norm": 0.732656019384533, + "learning_rate": 3.412476722532589e-05, + "loss": 5.3792, + "step": 2199 + }, + { + "epoch": 0.10242800940475359, + "grad_norm": 0.7235417489961586, + "learning_rate": 3.4140285536933584e-05, + "loss": 5.311, + "step": 2200 + }, + { + "epoch": 0.10247456759084667, + "grad_norm": 0.6636379001779439, + "learning_rate": 3.415580384854128e-05, + "loss": 5.3631, + "step": 2201 + }, + { + "epoch": 0.10252112577693973, + "grad_norm": 0.6978912671232538, + "learning_rate": 3.4171322160148976e-05, + "loss": 5.2856, + "step": 2202 + }, + { + "epoch": 0.1025676839630328, + "grad_norm": 0.7338706344875882, + "learning_rate": 3.418684047175668e-05, + "loss": 5.2684, + "step": 2203 + }, + { + "epoch": 0.10261424214912587, + "grad_norm": 0.8102934222715836, + "learning_rate": 
3.420235878336437e-05, + "loss": 5.3824, + "step": 2204 + }, + { + "epoch": 0.10266080033521895, + "grad_norm": 0.7917716607950129, + "learning_rate": 3.4217877094972065e-05, + "loss": 5.4085, + "step": 2205 + }, + { + "epoch": 0.102707358521312, + "grad_norm": 0.8080964370909205, + "learning_rate": 3.423339540657977e-05, + "loss": 5.326, + "step": 2206 + }, + { + "epoch": 0.10275391670740508, + "grad_norm": 0.7627180702677906, + "learning_rate": 3.4248913718187464e-05, + "loss": 5.3343, + "step": 2207 + }, + { + "epoch": 0.10280047489349815, + "grad_norm": 0.6931968708151831, + "learning_rate": 3.426443202979516e-05, + "loss": 5.3744, + "step": 2208 + }, + { + "epoch": 0.10284703307959121, + "grad_norm": 0.8414054290378477, + "learning_rate": 3.427995034140286e-05, + "loss": 5.3144, + "step": 2209 + }, + { + "epoch": 0.10289359126568429, + "grad_norm": 0.7360843210054036, + "learning_rate": 3.4295468653010553e-05, + "loss": 5.2394, + "step": 2210 + }, + { + "epoch": 0.10294014945177736, + "grad_norm": 0.7056336932034595, + "learning_rate": 3.431098696461825e-05, + "loss": 5.2457, + "step": 2211 + }, + { + "epoch": 0.10298670763787043, + "grad_norm": 0.7316789389510423, + "learning_rate": 3.432650527622595e-05, + "loss": 5.3695, + "step": 2212 + }, + { + "epoch": 0.1030332658239635, + "grad_norm": 0.7596169133694147, + "learning_rate": 3.434202358783364e-05, + "loss": 5.2346, + "step": 2213 + }, + { + "epoch": 0.10307982401005657, + "grad_norm": 0.6770209702021797, + "learning_rate": 3.4357541899441345e-05, + "loss": 5.4009, + "step": 2214 + }, + { + "epoch": 0.10312638219614964, + "grad_norm": 0.6471864923381014, + "learning_rate": 3.437306021104904e-05, + "loss": 5.256, + "step": 2215 + }, + { + "epoch": 0.10317294038224271, + "grad_norm": 0.6465086739832375, + "learning_rate": 3.438857852265674e-05, + "loss": 5.3242, + "step": 2216 + }, + { + "epoch": 0.10321949856833577, + "grad_norm": 0.7684928789438215, + "learning_rate": 3.4404096834264434e-05, + "loss": 5.2696, + "step": 2217 + }, + { + "epoch": 0.10326605675442885, + "grad_norm": 0.8440860623499276, + "learning_rate": 3.441961514587213e-05, + "loss": 5.469, + "step": 2218 + }, + { + "epoch": 0.10331261494052192, + "grad_norm": 0.7888420162257843, + "learning_rate": 3.443513345747983e-05, + "loss": 5.2857, + "step": 2219 + }, + { + "epoch": 0.10335917312661498, + "grad_norm": 1.0219821137473162, + "learning_rate": 3.445065176908753e-05, + "loss": 5.381, + "step": 2220 + }, + { + "epoch": 0.10340573131270805, + "grad_norm": 0.9047458471583513, + "learning_rate": 3.446617008069522e-05, + "loss": 5.4022, + "step": 2221 + }, + { + "epoch": 0.10345228949880113, + "grad_norm": 0.8177880669590656, + "learning_rate": 3.4481688392302916e-05, + "loss": 5.2882, + "step": 2222 + }, + { + "epoch": 0.1034988476848942, + "grad_norm": 0.9281127556580372, + "learning_rate": 3.449720670391062e-05, + "loss": 5.3351, + "step": 2223 + }, + { + "epoch": 0.10354540587098726, + "grad_norm": 0.8593809219705705, + "learning_rate": 3.4512725015518315e-05, + "loss": 5.3154, + "step": 2224 + }, + { + "epoch": 0.10359196405708034, + "grad_norm": 0.6147906242101948, + "learning_rate": 3.452824332712601e-05, + "loss": 5.3619, + "step": 2225 + }, + { + "epoch": 0.10363852224317341, + "grad_norm": 0.9649696420761845, + "learning_rate": 3.454376163873371e-05, + "loss": 5.2504, + "step": 2226 + }, + { + "epoch": 0.10368508042926648, + "grad_norm": 0.8123952316601365, + "learning_rate": 3.4559279950341404e-05, + "loss": 5.2929, + "step": 2227 + }, + { + "epoch": 
0.10373163861535954, + "grad_norm": 0.7467022927688447, + "learning_rate": 3.457479826194911e-05, + "loss": 5.377, + "step": 2228 + }, + { + "epoch": 0.10377819680145262, + "grad_norm": 0.8231492307667637, + "learning_rate": 3.45903165735568e-05, + "loss": 5.3944, + "step": 2229 + }, + { + "epoch": 0.10382475498754569, + "grad_norm": 0.9122629602617277, + "learning_rate": 3.460583488516449e-05, + "loss": 5.3134, + "step": 2230 + }, + { + "epoch": 0.10387131317363875, + "grad_norm": 0.8476687588412708, + "learning_rate": 3.4621353196772196e-05, + "loss": 5.3876, + "step": 2231 + }, + { + "epoch": 0.10391787135973182, + "grad_norm": 0.7559298241376128, + "learning_rate": 3.463687150837989e-05, + "loss": 5.2339, + "step": 2232 + }, + { + "epoch": 0.1039644295458249, + "grad_norm": 0.7067648767940782, + "learning_rate": 3.465238981998758e-05, + "loss": 5.2461, + "step": 2233 + }, + { + "epoch": 0.10401098773191797, + "grad_norm": 0.6712177794036109, + "learning_rate": 3.4667908131595285e-05, + "loss": 5.2235, + "step": 2234 + }, + { + "epoch": 0.10405754591801103, + "grad_norm": 0.758241124357976, + "learning_rate": 3.468342644320298e-05, + "loss": 5.3006, + "step": 2235 + }, + { + "epoch": 0.1041041041041041, + "grad_norm": 0.7003681996869825, + "learning_rate": 3.469894475481068e-05, + "loss": 5.2872, + "step": 2236 + }, + { + "epoch": 0.10415066229019718, + "grad_norm": 0.6036019898230581, + "learning_rate": 3.4714463066418374e-05, + "loss": 5.224, + "step": 2237 + }, + { + "epoch": 0.10419722047629025, + "grad_norm": 0.6813486336638969, + "learning_rate": 3.472998137802607e-05, + "loss": 5.3283, + "step": 2238 + }, + { + "epoch": 0.10424377866238331, + "grad_norm": 0.7131038414076618, + "learning_rate": 3.4745499689633773e-05, + "loss": 5.2511, + "step": 2239 + }, + { + "epoch": 0.10429033684847638, + "grad_norm": 0.7481355519671437, + "learning_rate": 3.476101800124147e-05, + "loss": 5.2935, + "step": 2240 + }, + { + "epoch": 0.10433689503456946, + "grad_norm": 0.7788804557098182, + "learning_rate": 3.477653631284916e-05, + "loss": 5.3192, + "step": 2241 + }, + { + "epoch": 0.10438345322066252, + "grad_norm": 0.6439950384371118, + "learning_rate": 3.479205462445686e-05, + "loss": 5.2464, + "step": 2242 + }, + { + "epoch": 0.10443001140675559, + "grad_norm": 0.6522598312189075, + "learning_rate": 3.480757293606456e-05, + "loss": 5.1979, + "step": 2243 + }, + { + "epoch": 0.10447656959284866, + "grad_norm": 0.7189498600018134, + "learning_rate": 3.4823091247672255e-05, + "loss": 5.3334, + "step": 2244 + }, + { + "epoch": 0.10452312777894174, + "grad_norm": 0.7367398963603679, + "learning_rate": 3.483860955927995e-05, + "loss": 5.2761, + "step": 2245 + }, + { + "epoch": 0.1045696859650348, + "grad_norm": 0.6441989080755299, + "learning_rate": 3.485412787088765e-05, + "loss": 5.1903, + "step": 2246 + }, + { + "epoch": 0.10461624415112787, + "grad_norm": 0.6022285568619092, + "learning_rate": 3.4869646182495344e-05, + "loss": 5.283, + "step": 2247 + }, + { + "epoch": 0.10466280233722094, + "grad_norm": 0.8063049844636355, + "learning_rate": 3.488516449410305e-05, + "loss": 5.1973, + "step": 2248 + }, + { + "epoch": 0.10470936052331402, + "grad_norm": 0.8430378966127542, + "learning_rate": 3.4900682805710737e-05, + "loss": 5.2442, + "step": 2249 + }, + { + "epoch": 0.10475591870940708, + "grad_norm": 0.7169143628478926, + "learning_rate": 3.491620111731844e-05, + "loss": 5.2549, + "step": 2250 + }, + { + "epoch": 0.10480247689550015, + "grad_norm": 0.6996521054608489, + "learning_rate": 
3.4931719428926136e-05, + "loss": 5.2151, + "step": 2251 + }, + { + "epoch": 0.10484903508159323, + "grad_norm": 0.6262937152895123, + "learning_rate": 3.494723774053383e-05, + "loss": 5.3771, + "step": 2252 + }, + { + "epoch": 0.10489559326768629, + "grad_norm": 0.6263440206211153, + "learning_rate": 3.496275605214153e-05, + "loss": 5.364, + "step": 2253 + }, + { + "epoch": 0.10494215145377936, + "grad_norm": 0.7540510719554762, + "learning_rate": 3.4978274363749225e-05, + "loss": 5.3165, + "step": 2254 + }, + { + "epoch": 0.10498870963987243, + "grad_norm": 0.863051020785272, + "learning_rate": 3.499379267535692e-05, + "loss": 5.3765, + "step": 2255 + }, + { + "epoch": 0.1050352678259655, + "grad_norm": 0.9988244712756698, + "learning_rate": 3.5009310986964624e-05, + "loss": 5.1755, + "step": 2256 + }, + { + "epoch": 0.10508182601205857, + "grad_norm": 0.8969342887427241, + "learning_rate": 3.5024829298572314e-05, + "loss": 5.262, + "step": 2257 + }, + { + "epoch": 0.10512838419815164, + "grad_norm": 0.8200807055492421, + "learning_rate": 3.504034761018001e-05, + "loss": 5.1555, + "step": 2258 + }, + { + "epoch": 0.10517494238424471, + "grad_norm": 0.7816542635949962, + "learning_rate": 3.505586592178771e-05, + "loss": 5.2598, + "step": 2259 + }, + { + "epoch": 0.10522150057033779, + "grad_norm": 0.7016335747082775, + "learning_rate": 3.507138423339541e-05, + "loss": 5.3536, + "step": 2260 + }, + { + "epoch": 0.10526805875643085, + "grad_norm": 0.6747040010165888, + "learning_rate": 3.5086902545003106e-05, + "loss": 5.323, + "step": 2261 + }, + { + "epoch": 0.10531461694252392, + "grad_norm": 0.6050164050277184, + "learning_rate": 3.51024208566108e-05, + "loss": 5.2876, + "step": 2262 + }, + { + "epoch": 0.105361175128617, + "grad_norm": 0.6654725498309886, + "learning_rate": 3.51179391682185e-05, + "loss": 5.205, + "step": 2263 + }, + { + "epoch": 0.10540773331471005, + "grad_norm": 0.853654241006906, + "learning_rate": 3.51334574798262e-05, + "loss": 5.2788, + "step": 2264 + }, + { + "epoch": 0.10545429150080313, + "grad_norm": 0.8947716910858129, + "learning_rate": 3.514897579143389e-05, + "loss": 5.3752, + "step": 2265 + }, + { + "epoch": 0.1055008496868962, + "grad_norm": 0.9467197293382601, + "learning_rate": 3.516449410304159e-05, + "loss": 5.0684, + "step": 2266 + }, + { + "epoch": 0.10554740787298927, + "grad_norm": 0.9085365720261737, + "learning_rate": 3.518001241464929e-05, + "loss": 5.2583, + "step": 2267 + }, + { + "epoch": 0.10559396605908233, + "grad_norm": 0.7964966843823722, + "learning_rate": 3.519553072625699e-05, + "loss": 5.1856, + "step": 2268 + }, + { + "epoch": 0.10564052424517541, + "grad_norm": 1.0164556747749178, + "learning_rate": 3.5211049037864676e-05, + "loss": 5.2223, + "step": 2269 + }, + { + "epoch": 0.10568708243126848, + "grad_norm": 1.169315436538769, + "learning_rate": 3.522656734947238e-05, + "loss": 5.1124, + "step": 2270 + }, + { + "epoch": 0.10573364061736155, + "grad_norm": 0.6939890334310856, + "learning_rate": 3.5242085661080076e-05, + "loss": 5.2326, + "step": 2271 + }, + { + "epoch": 0.10578019880345461, + "grad_norm": 1.094048006758683, + "learning_rate": 3.525760397268777e-05, + "loss": 5.2286, + "step": 2272 + }, + { + "epoch": 0.10582675698954769, + "grad_norm": 0.7274569611540613, + "learning_rate": 3.527312228429547e-05, + "loss": 5.3418, + "step": 2273 + }, + { + "epoch": 0.10587331517564076, + "grad_norm": 0.7490762888447434, + "learning_rate": 3.5288640595903165e-05, + "loss": 5.2889, + "step": 2274 + }, + { + "epoch": 
0.10591987336173382, + "grad_norm": 0.8885313786788263, + "learning_rate": 3.530415890751087e-05, + "loss": 5.1967, + "step": 2275 + }, + { + "epoch": 0.1059664315478269, + "grad_norm": 0.7064508543357966, + "learning_rate": 3.5319677219118564e-05, + "loss": 5.3172, + "step": 2276 + }, + { + "epoch": 0.10601298973391997, + "grad_norm": 0.8239839809575009, + "learning_rate": 3.5335195530726253e-05, + "loss": 5.2819, + "step": 2277 + }, + { + "epoch": 0.10605954792001304, + "grad_norm": 0.7623630348240302, + "learning_rate": 3.5350713842333957e-05, + "loss": 5.2401, + "step": 2278 + }, + { + "epoch": 0.1061061061061061, + "grad_norm": 0.824557496181195, + "learning_rate": 3.536623215394165e-05, + "loss": 5.3874, + "step": 2279 + }, + { + "epoch": 0.10615266429219918, + "grad_norm": 0.9936056054743336, + "learning_rate": 3.538175046554935e-05, + "loss": 5.242, + "step": 2280 + }, + { + "epoch": 0.10619922247829225, + "grad_norm": 0.8789102948554348, + "learning_rate": 3.5397268777157045e-05, + "loss": 5.187, + "step": 2281 + }, + { + "epoch": 0.10624578066438531, + "grad_norm": 0.7072527044899565, + "learning_rate": 3.541278708876474e-05, + "loss": 5.1828, + "step": 2282 + }, + { + "epoch": 0.10629233885047838, + "grad_norm": 0.745173690364008, + "learning_rate": 3.542830540037244e-05, + "loss": 5.2505, + "step": 2283 + }, + { + "epoch": 0.10633889703657146, + "grad_norm": 0.7031831018012473, + "learning_rate": 3.544382371198014e-05, + "loss": 5.339, + "step": 2284 + }, + { + "epoch": 0.10638545522266453, + "grad_norm": 0.8324024914635103, + "learning_rate": 3.545934202358783e-05, + "loss": 5.2763, + "step": 2285 + }, + { + "epoch": 0.10643201340875759, + "grad_norm": 0.9587387687351725, + "learning_rate": 3.5474860335195534e-05, + "loss": 5.2466, + "step": 2286 + }, + { + "epoch": 0.10647857159485066, + "grad_norm": 0.9173655886225056, + "learning_rate": 3.549037864680323e-05, + "loss": 5.2726, + "step": 2287 + }, + { + "epoch": 0.10652512978094374, + "grad_norm": 0.929848701186986, + "learning_rate": 3.5505896958410926e-05, + "loss": 5.3004, + "step": 2288 + }, + { + "epoch": 0.10657168796703681, + "grad_norm": 0.7013129711921726, + "learning_rate": 3.552141527001862e-05, + "loss": 5.32, + "step": 2289 + }, + { + "epoch": 0.10661824615312987, + "grad_norm": 0.7317679952853486, + "learning_rate": 3.553693358162632e-05, + "loss": 5.2967, + "step": 2290 + }, + { + "epoch": 0.10666480433922294, + "grad_norm": 0.8726696709938623, + "learning_rate": 3.5552451893234015e-05, + "loss": 5.2634, + "step": 2291 + }, + { + "epoch": 0.10671136252531602, + "grad_norm": 0.8854224075319196, + "learning_rate": 3.556797020484172e-05, + "loss": 5.3264, + "step": 2292 + }, + { + "epoch": 0.10675792071140908, + "grad_norm": 0.7458059941169585, + "learning_rate": 3.5583488516449415e-05, + "loss": 5.2866, + "step": 2293 + }, + { + "epoch": 0.10680447889750215, + "grad_norm": 0.7326855049577206, + "learning_rate": 3.559900682805711e-05, + "loss": 5.2111, + "step": 2294 + }, + { + "epoch": 0.10685103708359522, + "grad_norm": 0.7769624750388706, + "learning_rate": 3.561452513966481e-05, + "loss": 5.3467, + "step": 2295 + }, + { + "epoch": 0.1068975952696883, + "grad_norm": 0.7286611527727973, + "learning_rate": 3.5630043451272504e-05, + "loss": 5.2445, + "step": 2296 + }, + { + "epoch": 0.10694415345578136, + "grad_norm": 0.6910870966555808, + "learning_rate": 3.56455617628802e-05, + "loss": 5.0954, + "step": 2297 + }, + { + "epoch": 0.10699071164187443, + "grad_norm": 0.7518617546545152, + "learning_rate": 
3.5661080074487896e-05, + "loss": 5.2683, + "step": 2298 + }, + { + "epoch": 0.1070372698279675, + "grad_norm": 0.7240644793930381, + "learning_rate": 3.567659838609559e-05, + "loss": 5.1287, + "step": 2299 + }, + { + "epoch": 0.10708382801406058, + "grad_norm": 0.725055771644803, + "learning_rate": 3.5692116697703296e-05, + "loss": 5.1581, + "step": 2300 + }, + { + "epoch": 0.10713038620015364, + "grad_norm": 0.8257908790814731, + "learning_rate": 3.570763500931099e-05, + "loss": 5.2435, + "step": 2301 + }, + { + "epoch": 0.10717694438624671, + "grad_norm": 0.9782268111878651, + "learning_rate": 3.572315332091868e-05, + "loss": 5.2726, + "step": 2302 + }, + { + "epoch": 0.10722350257233978, + "grad_norm": 1.0198026292158031, + "learning_rate": 3.5738671632526385e-05, + "loss": 5.3602, + "step": 2303 + }, + { + "epoch": 0.10727006075843284, + "grad_norm": 0.9707388029456471, + "learning_rate": 3.575418994413408e-05, + "loss": 5.1973, + "step": 2304 + }, + { + "epoch": 0.10731661894452592, + "grad_norm": 0.945483163984769, + "learning_rate": 3.576970825574178e-05, + "loss": 5.3072, + "step": 2305 + }, + { + "epoch": 0.10736317713061899, + "grad_norm": 0.8533848653464842, + "learning_rate": 3.5785226567349473e-05, + "loss": 5.2187, + "step": 2306 + }, + { + "epoch": 0.10740973531671207, + "grad_norm": 0.7493246818653168, + "learning_rate": 3.580074487895717e-05, + "loss": 5.2345, + "step": 2307 + }, + { + "epoch": 0.10745629350280513, + "grad_norm": 0.9013326900230538, + "learning_rate": 3.5816263190564866e-05, + "loss": 5.2323, + "step": 2308 + }, + { + "epoch": 0.1075028516888982, + "grad_norm": 0.8527891627563594, + "learning_rate": 3.583178150217257e-05, + "loss": 5.275, + "step": 2309 + }, + { + "epoch": 0.10754940987499127, + "grad_norm": 0.5998930722236014, + "learning_rate": 3.584729981378026e-05, + "loss": 5.3191, + "step": 2310 + }, + { + "epoch": 0.10759596806108435, + "grad_norm": 0.8963516545121811, + "learning_rate": 3.586281812538796e-05, + "loss": 5.1626, + "step": 2311 + }, + { + "epoch": 0.1076425262471774, + "grad_norm": 0.961994760686145, + "learning_rate": 3.587833643699566e-05, + "loss": 5.2145, + "step": 2312 + }, + { + "epoch": 0.10768908443327048, + "grad_norm": 0.9404008597736686, + "learning_rate": 3.5893854748603354e-05, + "loss": 5.2738, + "step": 2313 + }, + { + "epoch": 0.10773564261936355, + "grad_norm": 0.9732454468973886, + "learning_rate": 3.590937306021105e-05, + "loss": 5.1359, + "step": 2314 + }, + { + "epoch": 0.10778220080545661, + "grad_norm": 0.7790138835668483, + "learning_rate": 3.592489137181875e-05, + "loss": 5.1886, + "step": 2315 + }, + { + "epoch": 0.10782875899154969, + "grad_norm": 0.8310571316369012, + "learning_rate": 3.594040968342644e-05, + "loss": 5.2307, + "step": 2316 + }, + { + "epoch": 0.10787531717764276, + "grad_norm": 0.8786974578065698, + "learning_rate": 3.5955927995034146e-05, + "loss": 5.332, + "step": 2317 + }, + { + "epoch": 0.10792187536373583, + "grad_norm": 0.8126984244147643, + "learning_rate": 3.5971446306641836e-05, + "loss": 5.2903, + "step": 2318 + }, + { + "epoch": 0.1079684335498289, + "grad_norm": 0.7374373261120936, + "learning_rate": 3.598696461824954e-05, + "loss": 5.2799, + "step": 2319 + }, + { + "epoch": 0.10801499173592197, + "grad_norm": 0.8229088830529174, + "learning_rate": 3.6002482929857235e-05, + "loss": 5.179, + "step": 2320 + }, + { + "epoch": 0.10806154992201504, + "grad_norm": 0.7152393852076576, + "learning_rate": 3.601800124146493e-05, + "loss": 5.2226, + "step": 2321 + }, + { + "epoch": 
0.10810810810810811, + "grad_norm": 0.7790330315602116, + "learning_rate": 3.603351955307263e-05, + "loss": 5.1927, + "step": 2322 + }, + { + "epoch": 0.10815466629420117, + "grad_norm": 0.9854843443559947, + "learning_rate": 3.6049037864680324e-05, + "loss": 5.2276, + "step": 2323 + }, + { + "epoch": 0.10820122448029425, + "grad_norm": 0.9077957018636321, + "learning_rate": 3.606455617628802e-05, + "loss": 5.149, + "step": 2324 + }, + { + "epoch": 0.10824778266638732, + "grad_norm": 0.9000119183278058, + "learning_rate": 3.6080074487895724e-05, + "loss": 5.157, + "step": 2325 + }, + { + "epoch": 0.10829434085248038, + "grad_norm": 0.8574216388161252, + "learning_rate": 3.609559279950341e-05, + "loss": 5.0917, + "step": 2326 + }, + { + "epoch": 0.10834089903857345, + "grad_norm": 0.7753983856821882, + "learning_rate": 3.611111111111111e-05, + "loss": 5.2333, + "step": 2327 + }, + { + "epoch": 0.10838745722466653, + "grad_norm": 0.9304918875782393, + "learning_rate": 3.612662942271881e-05, + "loss": 5.2342, + "step": 2328 + }, + { + "epoch": 0.1084340154107596, + "grad_norm": 0.9272709197794439, + "learning_rate": 3.614214773432651e-05, + "loss": 5.1991, + "step": 2329 + }, + { + "epoch": 0.10848057359685266, + "grad_norm": 0.8624423708732164, + "learning_rate": 3.6157666045934205e-05, + "loss": 5.1903, + "step": 2330 + }, + { + "epoch": 0.10852713178294573, + "grad_norm": 0.7277738965967631, + "learning_rate": 3.61731843575419e-05, + "loss": 5.3476, + "step": 2331 + }, + { + "epoch": 0.10857368996903881, + "grad_norm": 0.8033101766018993, + "learning_rate": 3.61887026691496e-05, + "loss": 5.1092, + "step": 2332 + }, + { + "epoch": 0.10862024815513188, + "grad_norm": 0.8195588686825288, + "learning_rate": 3.6204220980757294e-05, + "loss": 5.104, + "step": 2333 + }, + { + "epoch": 0.10866680634122494, + "grad_norm": 0.7724120968766006, + "learning_rate": 3.621973929236499e-05, + "loss": 5.0631, + "step": 2334 + }, + { + "epoch": 0.10871336452731802, + "grad_norm": 0.7726004486677394, + "learning_rate": 3.623525760397269e-05, + "loss": 5.1841, + "step": 2335 + }, + { + "epoch": 0.10875992271341109, + "grad_norm": 0.7380148951605666, + "learning_rate": 3.625077591558039e-05, + "loss": 5.2348, + "step": 2336 + }, + { + "epoch": 0.10880648089950415, + "grad_norm": 0.7022051651938593, + "learning_rate": 3.6266294227188086e-05, + "loss": 5.183, + "step": 2337 + }, + { + "epoch": 0.10885303908559722, + "grad_norm": 0.693301283915222, + "learning_rate": 3.6281812538795776e-05, + "loss": 5.1609, + "step": 2338 + }, + { + "epoch": 0.1088995972716903, + "grad_norm": 0.6086093132321464, + "learning_rate": 3.629733085040348e-05, + "loss": 5.3049, + "step": 2339 + }, + { + "epoch": 0.10894615545778337, + "grad_norm": 0.7437864695960773, + "learning_rate": 3.6312849162011175e-05, + "loss": 5.165, + "step": 2340 + }, + { + "epoch": 0.10899271364387643, + "grad_norm": 0.9033931140680057, + "learning_rate": 3.632836747361887e-05, + "loss": 5.1611, + "step": 2341 + }, + { + "epoch": 0.1090392718299695, + "grad_norm": 0.8849743632621647, + "learning_rate": 3.634388578522657e-05, + "loss": 5.2181, + "step": 2342 + }, + { + "epoch": 0.10908583001606258, + "grad_norm": 0.8103765270050465, + "learning_rate": 3.6359404096834264e-05, + "loss": 5.2517, + "step": 2343 + }, + { + "epoch": 0.10913238820215565, + "grad_norm": 0.8300441611593714, + "learning_rate": 3.637492240844197e-05, + "loss": 5.2818, + "step": 2344 + }, + { + "epoch": 0.10917894638824871, + "grad_norm": 0.9008700434580309, + "learning_rate": 
3.639044072004966e-05, + "loss": 5.2938, + "step": 2345 + }, + { + "epoch": 0.10922550457434178, + "grad_norm": 0.8290354009542843, + "learning_rate": 3.640595903165735e-05, + "loss": 5.1683, + "step": 2346 + }, + { + "epoch": 0.10927206276043486, + "grad_norm": 0.8102969512297155, + "learning_rate": 3.6421477343265056e-05, + "loss": 5.041, + "step": 2347 + }, + { + "epoch": 0.10931862094652792, + "grad_norm": 0.7497782355487012, + "learning_rate": 3.643699565487275e-05, + "loss": 5.1392, + "step": 2348 + }, + { + "epoch": 0.10936517913262099, + "grad_norm": 0.783399858980573, + "learning_rate": 3.645251396648045e-05, + "loss": 5.1182, + "step": 2349 + }, + { + "epoch": 0.10941173731871406, + "grad_norm": 0.7114977763263421, + "learning_rate": 3.6468032278088145e-05, + "loss": 5.2918, + "step": 2350 + }, + { + "epoch": 0.10945829550480714, + "grad_norm": 0.7101217610066575, + "learning_rate": 3.648355058969584e-05, + "loss": 5.2215, + "step": 2351 + }, + { + "epoch": 0.1095048536909002, + "grad_norm": 0.8708447384685736, + "learning_rate": 3.649906890130354e-05, + "loss": 5.2348, + "step": 2352 + }, + { + "epoch": 0.10955141187699327, + "grad_norm": 0.8901974315839688, + "learning_rate": 3.651458721291124e-05, + "loss": 5.1558, + "step": 2353 + }, + { + "epoch": 0.10959797006308634, + "grad_norm": 0.7708019470933984, + "learning_rate": 3.653010552451893e-05, + "loss": 5.2403, + "step": 2354 + }, + { + "epoch": 0.10964452824917942, + "grad_norm": 0.7242442340127824, + "learning_rate": 3.654562383612663e-05, + "loss": 5.2413, + "step": 2355 + }, + { + "epoch": 0.10969108643527248, + "grad_norm": 0.8373428975324275, + "learning_rate": 3.656114214773433e-05, + "loss": 5.1648, + "step": 2356 + }, + { + "epoch": 0.10973764462136555, + "grad_norm": 0.7395928542022794, + "learning_rate": 3.6576660459342026e-05, + "loss": 5.1396, + "step": 2357 + }, + { + "epoch": 0.10978420280745863, + "grad_norm": 0.8081004477889768, + "learning_rate": 3.659217877094972e-05, + "loss": 5.2062, + "step": 2358 + }, + { + "epoch": 0.10983076099355168, + "grad_norm": 0.6775065015961184, + "learning_rate": 3.660769708255742e-05, + "loss": 5.2021, + "step": 2359 + }, + { + "epoch": 0.10987731917964476, + "grad_norm": 0.6342188783747342, + "learning_rate": 3.6623215394165115e-05, + "loss": 5.1638, + "step": 2360 + }, + { + "epoch": 0.10992387736573783, + "grad_norm": 0.7549760575657635, + "learning_rate": 3.663873370577282e-05, + "loss": 5.2077, + "step": 2361 + }, + { + "epoch": 0.1099704355518309, + "grad_norm": 0.8746183746577091, + "learning_rate": 3.665425201738051e-05, + "loss": 5.2059, + "step": 2362 + }, + { + "epoch": 0.11001699373792397, + "grad_norm": 0.8409982037440846, + "learning_rate": 3.6669770328988204e-05, + "loss": 5.2118, + "step": 2363 + }, + { + "epoch": 0.11006355192401704, + "grad_norm": 0.7444260019452055, + "learning_rate": 3.668528864059591e-05, + "loss": 5.2177, + "step": 2364 + }, + { + "epoch": 0.11011011011011011, + "grad_norm": 0.6996767423392335, + "learning_rate": 3.67008069522036e-05, + "loss": 5.1779, + "step": 2365 + }, + { + "epoch": 0.11015666829620319, + "grad_norm": 0.7529955233148621, + "learning_rate": 3.67163252638113e-05, + "loss": 5.0924, + "step": 2366 + }, + { + "epoch": 0.11020322648229625, + "grad_norm": 0.5944999771435968, + "learning_rate": 3.6731843575418996e-05, + "loss": 5.1804, + "step": 2367 + }, + { + "epoch": 0.11024978466838932, + "grad_norm": 0.7549570150572629, + "learning_rate": 3.674736188702669e-05, + "loss": 5.2333, + "step": 2368 + }, + { + "epoch": 
0.1102963428544824, + "grad_norm": 0.7447419603502468, + "learning_rate": 3.6762880198634395e-05, + "loss": 5.259, + "step": 2369 + }, + { + "epoch": 0.11034290104057545, + "grad_norm": 0.8838164835417109, + "learning_rate": 3.6778398510242085e-05, + "loss": 5.173, + "step": 2370 + }, + { + "epoch": 0.11038945922666853, + "grad_norm": 0.782273072575536, + "learning_rate": 3.679391682184978e-05, + "loss": 5.1666, + "step": 2371 + }, + { + "epoch": 0.1104360174127616, + "grad_norm": 0.7239261193203353, + "learning_rate": 3.6809435133457484e-05, + "loss": 5.072, + "step": 2372 + }, + { + "epoch": 0.11048257559885467, + "grad_norm": 0.7617311406334528, + "learning_rate": 3.682495344506518e-05, + "loss": 4.9625, + "step": 2373 + }, + { + "epoch": 0.11052913378494773, + "grad_norm": 0.7960046761393751, + "learning_rate": 3.684047175667287e-05, + "loss": 5.2744, + "step": 2374 + }, + { + "epoch": 0.11057569197104081, + "grad_norm": 0.8922723365382573, + "learning_rate": 3.685599006828057e-05, + "loss": 5.1349, + "step": 2375 + }, + { + "epoch": 0.11062225015713388, + "grad_norm": 1.0146965244301422, + "learning_rate": 3.687150837988827e-05, + "loss": 5.2066, + "step": 2376 + }, + { + "epoch": 0.11066880834322695, + "grad_norm": 0.9461529056061168, + "learning_rate": 3.6887026691495965e-05, + "loss": 5.0923, + "step": 2377 + }, + { + "epoch": 0.11071536652932001, + "grad_norm": 0.8427493469922525, + "learning_rate": 3.690254500310367e-05, + "loss": 5.1581, + "step": 2378 + }, + { + "epoch": 0.11076192471541309, + "grad_norm": 0.7971117667170692, + "learning_rate": 3.691806331471136e-05, + "loss": 5.1562, + "step": 2379 + }, + { + "epoch": 0.11080848290150616, + "grad_norm": 0.8306819832757983, + "learning_rate": 3.693358162631906e-05, + "loss": 5.2125, + "step": 2380 + }, + { + "epoch": 0.11085504108759922, + "grad_norm": 0.8250734522791751, + "learning_rate": 3.694909993792676e-05, + "loss": 5.1733, + "step": 2381 + }, + { + "epoch": 0.1109015992736923, + "grad_norm": 0.7725336983279506, + "learning_rate": 3.6964618249534454e-05, + "loss": 5.1603, + "step": 2382 + }, + { + "epoch": 0.11094815745978537, + "grad_norm": 0.9354139391327517, + "learning_rate": 3.698013656114215e-05, + "loss": 5.2821, + "step": 2383 + }, + { + "epoch": 0.11099471564587844, + "grad_norm": 1.0146436403006098, + "learning_rate": 3.6995654872749846e-05, + "loss": 5.1472, + "step": 2384 + }, + { + "epoch": 0.1110412738319715, + "grad_norm": 0.7929642049258893, + "learning_rate": 3.701117318435754e-05, + "loss": 5.1443, + "step": 2385 + }, + { + "epoch": 0.11108783201806458, + "grad_norm": 0.8508606380226909, + "learning_rate": 3.7026691495965246e-05, + "loss": 5.3352, + "step": 2386 + }, + { + "epoch": 0.11113439020415765, + "grad_norm": 0.767802382480292, + "learning_rate": 3.7042209807572935e-05, + "loss": 5.1227, + "step": 2387 + }, + { + "epoch": 0.11118094839025072, + "grad_norm": 0.6829604754707727, + "learning_rate": 3.705772811918063e-05, + "loss": 5.0981, + "step": 2388 + }, + { + "epoch": 0.11122750657634378, + "grad_norm": 0.6704489998103809, + "learning_rate": 3.7073246430788335e-05, + "loss": 5.2002, + "step": 2389 + }, + { + "epoch": 0.11127406476243686, + "grad_norm": 0.7396767954213774, + "learning_rate": 3.708876474239603e-05, + "loss": 5.0493, + "step": 2390 + }, + { + "epoch": 0.11132062294852993, + "grad_norm": 0.6798906114440135, + "learning_rate": 3.710428305400373e-05, + "loss": 5.2248, + "step": 2391 + }, + { + "epoch": 0.11136718113462299, + "grad_norm": 0.7626630920363376, + "learning_rate": 
3.7119801365611424e-05, + "loss": 5.0986, + "step": 2392 + }, + { + "epoch": 0.11141373932071606, + "grad_norm": 0.755009420444494, + "learning_rate": 3.713531967721912e-05, + "loss": 5.0326, + "step": 2393 + }, + { + "epoch": 0.11146029750680914, + "grad_norm": 0.7486001144510583, + "learning_rate": 3.715083798882682e-05, + "loss": 5.1386, + "step": 2394 + }, + { + "epoch": 0.11150685569290221, + "grad_norm": 0.7976982232598383, + "learning_rate": 3.716635630043451e-05, + "loss": 5.1205, + "step": 2395 + }, + { + "epoch": 0.11155341387899527, + "grad_norm": 0.7531516420466241, + "learning_rate": 3.718187461204221e-05, + "loss": 5.1549, + "step": 2396 + }, + { + "epoch": 0.11159997206508834, + "grad_norm": 0.774875605195332, + "learning_rate": 3.719739292364991e-05, + "loss": 5.1771, + "step": 2397 + }, + { + "epoch": 0.11164653025118142, + "grad_norm": 0.8195896284670963, + "learning_rate": 3.721291123525761e-05, + "loss": 5.2112, + "step": 2398 + }, + { + "epoch": 0.11169308843727449, + "grad_norm": 0.838654130337697, + "learning_rate": 3.72284295468653e-05, + "loss": 5.1019, + "step": 2399 + }, + { + "epoch": 0.11173964662336755, + "grad_norm": 0.7351229688013994, + "learning_rate": 3.7243947858473e-05, + "loss": 5.1555, + "step": 2400 + }, + { + "epoch": 0.11178620480946062, + "grad_norm": 0.8224958473571662, + "learning_rate": 3.72594661700807e-05, + "loss": 4.9282, + "step": 2401 + }, + { + "epoch": 0.1118327629955537, + "grad_norm": 0.8518969855129563, + "learning_rate": 3.7274984481688393e-05, + "loss": 5.1884, + "step": 2402 + }, + { + "epoch": 0.11187932118164676, + "grad_norm": 0.8189726647668454, + "learning_rate": 3.729050279329609e-05, + "loss": 5.1847, + "step": 2403 + }, + { + "epoch": 0.11192587936773983, + "grad_norm": 0.7977948734043703, + "learning_rate": 3.7306021104903786e-05, + "loss": 5.1045, + "step": 2404 + }, + { + "epoch": 0.1119724375538329, + "grad_norm": 0.8302996073579322, + "learning_rate": 3.732153941651149e-05, + "loss": 5.2283, + "step": 2405 + }, + { + "epoch": 0.11201899573992598, + "grad_norm": 0.8647777713269555, + "learning_rate": 3.7337057728119185e-05, + "loss": 5.1679, + "step": 2406 + }, + { + "epoch": 0.11206555392601904, + "grad_norm": 0.7928917655716109, + "learning_rate": 3.7352576039726875e-05, + "loss": 5.2029, + "step": 2407 + }, + { + "epoch": 0.11211211211211211, + "grad_norm": 0.8133297612338009, + "learning_rate": 3.736809435133458e-05, + "loss": 5.08, + "step": 2408 + }, + { + "epoch": 0.11215867029820518, + "grad_norm": 0.7849565728703763, + "learning_rate": 3.7383612662942274e-05, + "loss": 5.1296, + "step": 2409 + }, + { + "epoch": 0.11220522848429826, + "grad_norm": 0.8717675499654465, + "learning_rate": 3.739913097454997e-05, + "loss": 5.1537, + "step": 2410 + }, + { + "epoch": 0.11225178667039132, + "grad_norm": 0.8606548907145922, + "learning_rate": 3.741464928615767e-05, + "loss": 5.0111, + "step": 2411 + }, + { + "epoch": 0.11229834485648439, + "grad_norm": 0.9606881014533625, + "learning_rate": 3.743016759776536e-05, + "loss": 5.1995, + "step": 2412 + }, + { + "epoch": 0.11234490304257747, + "grad_norm": 0.9839050175079718, + "learning_rate": 3.744568590937306e-05, + "loss": 5.2198, + "step": 2413 + }, + { + "epoch": 0.11239146122867053, + "grad_norm": 0.7801904956807032, + "learning_rate": 3.746120422098076e-05, + "loss": 5.1801, + "step": 2414 + }, + { + "epoch": 0.1124380194147636, + "grad_norm": 0.741720817795378, + "learning_rate": 3.747672253258845e-05, + "loss": 5.1299, + "step": 2415 + }, + { + "epoch": 
0.11248457760085667, + "grad_norm": 0.81665729601188, + "learning_rate": 3.7492240844196155e-05, + "loss": 5.1791, + "step": 2416 + }, + { + "epoch": 0.11253113578694975, + "grad_norm": 0.9021443403578101, + "learning_rate": 3.750775915580385e-05, + "loss": 5.062, + "step": 2417 + }, + { + "epoch": 0.1125776939730428, + "grad_norm": 0.8152501838060596, + "learning_rate": 3.752327746741155e-05, + "loss": 5.0488, + "step": 2418 + }, + { + "epoch": 0.11262425215913588, + "grad_norm": 0.8199327673815445, + "learning_rate": 3.7538795779019244e-05, + "loss": 5.102, + "step": 2419 + }, + { + "epoch": 0.11267081034522895, + "grad_norm": 0.9149370131587828, + "learning_rate": 3.755431409062694e-05, + "loss": 5.0792, + "step": 2420 + }, + { + "epoch": 0.11271736853132203, + "grad_norm": 0.8823749779723764, + "learning_rate": 3.756983240223464e-05, + "loss": 4.9436, + "step": 2421 + }, + { + "epoch": 0.11276392671741509, + "grad_norm": 0.747706969885862, + "learning_rate": 3.758535071384234e-05, + "loss": 5.1311, + "step": 2422 + }, + { + "epoch": 0.11281048490350816, + "grad_norm": 1.0027624708337657, + "learning_rate": 3.760086902545003e-05, + "loss": 5.0944, + "step": 2423 + }, + { + "epoch": 0.11285704308960123, + "grad_norm": 1.0320590123507138, + "learning_rate": 3.7616387337057726e-05, + "loss": 5.1807, + "step": 2424 + }, + { + "epoch": 0.11290360127569429, + "grad_norm": 0.801351073857042, + "learning_rate": 3.763190564866543e-05, + "loss": 5.1672, + "step": 2425 + }, + { + "epoch": 0.11295015946178737, + "grad_norm": 0.7911889429511056, + "learning_rate": 3.7647423960273125e-05, + "loss": 4.9327, + "step": 2426 + }, + { + "epoch": 0.11299671764788044, + "grad_norm": 0.8938346623599948, + "learning_rate": 3.766294227188082e-05, + "loss": 5.0963, + "step": 2427 + }, + { + "epoch": 0.11304327583397351, + "grad_norm": 0.8460694532252762, + "learning_rate": 3.767846058348852e-05, + "loss": 5.1197, + "step": 2428 + }, + { + "epoch": 0.11308983402006657, + "grad_norm": 0.7408492275288359, + "learning_rate": 3.7693978895096214e-05, + "loss": 5.1205, + "step": 2429 + }, + { + "epoch": 0.11313639220615965, + "grad_norm": 0.8484028542711138, + "learning_rate": 3.770949720670392e-05, + "loss": 5.0457, + "step": 2430 + }, + { + "epoch": 0.11318295039225272, + "grad_norm": 0.6935344336884605, + "learning_rate": 3.772501551831161e-05, + "loss": 5.1335, + "step": 2431 + }, + { + "epoch": 0.1132295085783458, + "grad_norm": 0.7132713928862143, + "learning_rate": 3.77405338299193e-05, + "loss": 5.0637, + "step": 2432 + }, + { + "epoch": 0.11327606676443885, + "grad_norm": 0.8156848076270877, + "learning_rate": 3.7756052141527006e-05, + "loss": 5.1919, + "step": 2433 + }, + { + "epoch": 0.11332262495053193, + "grad_norm": 0.7112976486730885, + "learning_rate": 3.77715704531347e-05, + "loss": 5.0468, + "step": 2434 + }, + { + "epoch": 0.113369183136625, + "grad_norm": 0.7961973688844052, + "learning_rate": 3.77870887647424e-05, + "loss": 5.0694, + "step": 2435 + }, + { + "epoch": 0.11341574132271806, + "grad_norm": 0.779847208897717, + "learning_rate": 3.7802607076350095e-05, + "loss": 5.1158, + "step": 2436 + }, + { + "epoch": 0.11346229950881113, + "grad_norm": 0.7239660928869652, + "learning_rate": 3.781812538795779e-05, + "loss": 5.01, + "step": 2437 + }, + { + "epoch": 0.11350885769490421, + "grad_norm": 0.6858372911752295, + "learning_rate": 3.783364369956549e-05, + "loss": 5.1451, + "step": 2438 + }, + { + "epoch": 0.11355541588099728, + "grad_norm": 0.709577808691613, + "learning_rate": 
3.7849162011173184e-05, + "loss": 5.2104, + "step": 2439 + }, + { + "epoch": 0.11360197406709034, + "grad_norm": 0.7747774108276833, + "learning_rate": 3.786468032278088e-05, + "loss": 5.0635, + "step": 2440 + }, + { + "epoch": 0.11364853225318342, + "grad_norm": 0.8376656715414171, + "learning_rate": 3.788019863438858e-05, + "loss": 5.0412, + "step": 2441 + }, + { + "epoch": 0.11369509043927649, + "grad_norm": 0.7055586706025423, + "learning_rate": 3.789571694599628e-05, + "loss": 5.1244, + "step": 2442 + }, + { + "epoch": 0.11374164862536956, + "grad_norm": 0.6591627897255011, + "learning_rate": 3.791123525760397e-05, + "loss": 5.0028, + "step": 2443 + }, + { + "epoch": 0.11378820681146262, + "grad_norm": 0.6321566624258618, + "learning_rate": 3.792675356921167e-05, + "loss": 5.1584, + "step": 2444 + }, + { + "epoch": 0.1138347649975557, + "grad_norm": 0.6777934135907566, + "learning_rate": 3.794227188081937e-05, + "loss": 5.1229, + "step": 2445 + }, + { + "epoch": 0.11388132318364877, + "grad_norm": 0.7367068186786053, + "learning_rate": 3.7957790192427065e-05, + "loss": 5.0444, + "step": 2446 + }, + { + "epoch": 0.11392788136974183, + "grad_norm": 0.6844264373123031, + "learning_rate": 3.797330850403476e-05, + "loss": 5.1414, + "step": 2447 + }, + { + "epoch": 0.1139744395558349, + "grad_norm": 0.7854537407725108, + "learning_rate": 3.798882681564246e-05, + "loss": 5.0997, + "step": 2448 + }, + { + "epoch": 0.11402099774192798, + "grad_norm": 0.8872914399233097, + "learning_rate": 3.8004345127250154e-05, + "loss": 5.0639, + "step": 2449 + }, + { + "epoch": 0.11406755592802105, + "grad_norm": 0.8978869122785859, + "learning_rate": 3.801986343885786e-05, + "loss": 5.1117, + "step": 2450 + }, + { + "epoch": 0.11411411411411411, + "grad_norm": 1.1108374582521467, + "learning_rate": 3.8035381750465546e-05, + "loss": 5.1552, + "step": 2451 + }, + { + "epoch": 0.11416067230020718, + "grad_norm": 0.9374854907539614, + "learning_rate": 3.805090006207325e-05, + "loss": 5.0356, + "step": 2452 + }, + { + "epoch": 0.11420723048630026, + "grad_norm": 0.9039706332721206, + "learning_rate": 3.8066418373680946e-05, + "loss": 5.1306, + "step": 2453 + }, + { + "epoch": 0.11425378867239333, + "grad_norm": 0.8881316415855522, + "learning_rate": 3.808193668528864e-05, + "loss": 5.0379, + "step": 2454 + }, + { + "epoch": 0.11430034685848639, + "grad_norm": 0.9870580189916014, + "learning_rate": 3.809745499689634e-05, + "loss": 5.1043, + "step": 2455 + }, + { + "epoch": 0.11434690504457946, + "grad_norm": 1.1085298339064604, + "learning_rate": 3.8112973308504035e-05, + "loss": 5.2124, + "step": 2456 + }, + { + "epoch": 0.11439346323067254, + "grad_norm": 1.0087805010284507, + "learning_rate": 3.812849162011173e-05, + "loss": 5.1771, + "step": 2457 + }, + { + "epoch": 0.1144400214167656, + "grad_norm": 1.0552937028940073, + "learning_rate": 3.8144009931719434e-05, + "loss": 5.1628, + "step": 2458 + }, + { + "epoch": 0.11448657960285867, + "grad_norm": 0.8456298621497589, + "learning_rate": 3.815952824332713e-05, + "loss": 5.0594, + "step": 2459 + }, + { + "epoch": 0.11453313778895174, + "grad_norm": 0.9678010623736468, + "learning_rate": 3.817504655493483e-05, + "loss": 5.0126, + "step": 2460 + }, + { + "epoch": 0.11457969597504482, + "grad_norm": 0.8814497976026965, + "learning_rate": 3.819056486654252e-05, + "loss": 5.0458, + "step": 2461 + }, + { + "epoch": 0.11462625416113788, + "grad_norm": 0.7304259102912926, + "learning_rate": 3.820608317815022e-05, + "loss": 5.1085, + "step": 2462 + }, + { + "epoch": 
0.11467281234723095, + "grad_norm": 0.8498023584691302, + "learning_rate": 3.8221601489757916e-05, + "loss": 5.0312, + "step": 2463 + }, + { + "epoch": 0.11471937053332402, + "grad_norm": 0.8655783133296892, + "learning_rate": 3.823711980136561e-05, + "loss": 5.0151, + "step": 2464 + }, + { + "epoch": 0.11476592871941708, + "grad_norm": 0.768136215750769, + "learning_rate": 3.825263811297331e-05, + "loss": 4.9853, + "step": 2465 + }, + { + "epoch": 0.11481248690551016, + "grad_norm": 0.7962600965633161, + "learning_rate": 3.826815642458101e-05, + "loss": 5.0374, + "step": 2466 + }, + { + "epoch": 0.11485904509160323, + "grad_norm": 0.7495350531030325, + "learning_rate": 3.828367473618871e-05, + "loss": 4.9571, + "step": 2467 + }, + { + "epoch": 0.1149056032776963, + "grad_norm": 0.7872840098594601, + "learning_rate": 3.82991930477964e-05, + "loss": 5.0071, + "step": 2468 + }, + { + "epoch": 0.11495216146378937, + "grad_norm": 0.801570785611645, + "learning_rate": 3.83147113594041e-05, + "loss": 5.0808, + "step": 2469 + }, + { + "epoch": 0.11499871964988244, + "grad_norm": 0.6760083892518752, + "learning_rate": 3.8330229671011797e-05, + "loss": 5.0976, + "step": 2470 + }, + { + "epoch": 0.11504527783597551, + "grad_norm": 0.8143148835993892, + "learning_rate": 3.834574798261949e-05, + "loss": 5.0959, + "step": 2471 + }, + { + "epoch": 0.11509183602206859, + "grad_norm": 0.8105278309914701, + "learning_rate": 3.836126629422719e-05, + "loss": 5.0403, + "step": 2472 + }, + { + "epoch": 0.11513839420816165, + "grad_norm": 0.8308769690334677, + "learning_rate": 3.8376784605834886e-05, + "loss": 4.969, + "step": 2473 + }, + { + "epoch": 0.11518495239425472, + "grad_norm": 1.0409251338418604, + "learning_rate": 3.839230291744259e-05, + "loss": 4.9003, + "step": 2474 + }, + { + "epoch": 0.11523151058034779, + "grad_norm": 0.8726031620288676, + "learning_rate": 3.8407821229050285e-05, + "loss": 5.0096, + "step": 2475 + }, + { + "epoch": 0.11527806876644085, + "grad_norm": 0.9045864215109668, + "learning_rate": 3.8423339540657974e-05, + "loss": 5.0189, + "step": 2476 + }, + { + "epoch": 0.11532462695253393, + "grad_norm": 0.8347420274240833, + "learning_rate": 3.843885785226568e-05, + "loss": 5.0031, + "step": 2477 + }, + { + "epoch": 0.115371185138627, + "grad_norm": 0.8384707042537284, + "learning_rate": 3.8454376163873374e-05, + "loss": 5.0799, + "step": 2478 + }, + { + "epoch": 0.11541774332472007, + "grad_norm": 0.6694404354846373, + "learning_rate": 3.846989447548107e-05, + "loss": 5.0796, + "step": 2479 + }, + { + "epoch": 0.11546430151081313, + "grad_norm": 0.820780029660952, + "learning_rate": 3.8485412787088766e-05, + "loss": 5.1228, + "step": 2480 + }, + { + "epoch": 0.1155108596969062, + "grad_norm": 0.9355270532592728, + "learning_rate": 3.850093109869646e-05, + "loss": 5.1646, + "step": 2481 + }, + { + "epoch": 0.11555741788299928, + "grad_norm": 0.9021344226970305, + "learning_rate": 3.851644941030416e-05, + "loss": 5.1536, + "step": 2482 + }, + { + "epoch": 0.11560397606909235, + "grad_norm": 0.8431903617782176, + "learning_rate": 3.853196772191186e-05, + "loss": 5.0594, + "step": 2483 + }, + { + "epoch": 0.11565053425518541, + "grad_norm": 0.9120033469160702, + "learning_rate": 3.854748603351955e-05, + "loss": 5.0961, + "step": 2484 + }, + { + "epoch": 0.11569709244127849, + "grad_norm": 1.0113753333818993, + "learning_rate": 3.8563004345127255e-05, + "loss": 5.1703, + "step": 2485 + }, + { + "epoch": 0.11574365062737156, + "grad_norm": 0.912117920971664, + "learning_rate": 
3.857852265673495e-05, + "loss": 5.0858, + "step": 2486 + }, + { + "epoch": 0.11579020881346462, + "grad_norm": 0.8718619485582554, + "learning_rate": 3.859404096834265e-05, + "loss": 5.1812, + "step": 2487 + }, + { + "epoch": 0.1158367669995577, + "grad_norm": 0.9025189915903995, + "learning_rate": 3.8609559279950344e-05, + "loss": 5.0567, + "step": 2488 + }, + { + "epoch": 0.11588332518565077, + "grad_norm": 0.9265741418250709, + "learning_rate": 3.862507759155804e-05, + "loss": 5.1068, + "step": 2489 + }, + { + "epoch": 0.11592988337174384, + "grad_norm": 0.8612150958217891, + "learning_rate": 3.8640595903165736e-05, + "loss": 5.066, + "step": 2490 + }, + { + "epoch": 0.1159764415578369, + "grad_norm": 0.7681409628107881, + "learning_rate": 3.865611421477344e-05, + "loss": 5.1157, + "step": 2491 + }, + { + "epoch": 0.11602299974392997, + "grad_norm": 0.6422948662361891, + "learning_rate": 3.867163252638113e-05, + "loss": 5.1477, + "step": 2492 + }, + { + "epoch": 0.11606955793002305, + "grad_norm": 0.7153302054827605, + "learning_rate": 3.8687150837988825e-05, + "loss": 5.1366, + "step": 2493 + }, + { + "epoch": 0.11611611611611612, + "grad_norm": 0.8146593860899396, + "learning_rate": 3.870266914959653e-05, + "loss": 5.025, + "step": 2494 + }, + { + "epoch": 0.11616267430220918, + "grad_norm": 0.8623032312979605, + "learning_rate": 3.8718187461204225e-05, + "loss": 5.0246, + "step": 2495 + }, + { + "epoch": 0.11620923248830226, + "grad_norm": 0.8016103987159249, + "learning_rate": 3.873370577281192e-05, + "loss": 4.9702, + "step": 2496 + }, + { + "epoch": 0.11625579067439533, + "grad_norm": 0.6850494273092106, + "learning_rate": 3.874922408441962e-05, + "loss": 5.0806, + "step": 2497 + }, + { + "epoch": 0.11630234886048839, + "grad_norm": 0.889216506294567, + "learning_rate": 3.8764742396027314e-05, + "loss": 5.0637, + "step": 2498 + }, + { + "epoch": 0.11634890704658146, + "grad_norm": 0.7415655623014623, + "learning_rate": 3.8780260707635017e-05, + "loss": 5.032, + "step": 2499 + }, + { + "epoch": 0.11639546523267454, + "grad_norm": 0.8436829069740034, + "learning_rate": 3.8795779019242706e-05, + "loss": 4.9933, + "step": 2500 + }, + { + "epoch": 0.11644202341876761, + "grad_norm": 0.9087260948142513, + "learning_rate": 3.88112973308504e-05, + "loss": 5.091, + "step": 2501 + }, + { + "epoch": 0.11648858160486067, + "grad_norm": 0.7346154265552286, + "learning_rate": 3.8826815642458106e-05, + "loss": 5.0049, + "step": 2502 + }, + { + "epoch": 0.11653513979095374, + "grad_norm": 0.8571602440978437, + "learning_rate": 3.88423339540658e-05, + "loss": 4.9625, + "step": 2503 + }, + { + "epoch": 0.11658169797704682, + "grad_norm": 0.8254427227329096, + "learning_rate": 3.885785226567349e-05, + "loss": 5.1319, + "step": 2504 + }, + { + "epoch": 0.11662825616313989, + "grad_norm": 0.8126898353304653, + "learning_rate": 3.8873370577281194e-05, + "loss": 4.9837, + "step": 2505 + }, + { + "epoch": 0.11667481434923295, + "grad_norm": 0.998915139538732, + "learning_rate": 3.888888888888889e-05, + "loss": 5.0506, + "step": 2506 + }, + { + "epoch": 0.11672137253532602, + "grad_norm": 0.8953196412965637, + "learning_rate": 3.890440720049659e-05, + "loss": 5.0857, + "step": 2507 + }, + { + "epoch": 0.1167679307214191, + "grad_norm": 0.7727300996863378, + "learning_rate": 3.891992551210428e-05, + "loss": 5.0436, + "step": 2508 + }, + { + "epoch": 0.11681448890751216, + "grad_norm": 0.8462383563475203, + "learning_rate": 3.893544382371198e-05, + "loss": 4.9255, + "step": 2509 + }, + { + "epoch": 
0.11686104709360523, + "grad_norm": 0.9276202634213941, + "learning_rate": 3.895096213531968e-05, + "loss": 5.0177, + "step": 2510 + }, + { + "epoch": 0.1169076052796983, + "grad_norm": 0.7491004491177756, + "learning_rate": 3.896648044692738e-05, + "loss": 5.1762, + "step": 2511 + }, + { + "epoch": 0.11695416346579138, + "grad_norm": 0.7607053378299928, + "learning_rate": 3.898199875853507e-05, + "loss": 4.9929, + "step": 2512 + }, + { + "epoch": 0.11700072165188444, + "grad_norm": 0.7570305576895322, + "learning_rate": 3.899751707014277e-05, + "loss": 5.0678, + "step": 2513 + }, + { + "epoch": 0.11704727983797751, + "grad_norm": 0.7741698005623442, + "learning_rate": 3.901303538175047e-05, + "loss": 4.9625, + "step": 2514 + }, + { + "epoch": 0.11709383802407058, + "grad_norm": 0.7612339521594762, + "learning_rate": 3.9028553693358164e-05, + "loss": 4.8683, + "step": 2515 + }, + { + "epoch": 0.11714039621016366, + "grad_norm": 0.7646361832013494, + "learning_rate": 3.904407200496586e-05, + "loss": 5.0516, + "step": 2516 + }, + { + "epoch": 0.11718695439625672, + "grad_norm": 0.8302487760958853, + "learning_rate": 3.905959031657356e-05, + "loss": 4.963, + "step": 2517 + }, + { + "epoch": 0.11723351258234979, + "grad_norm": 0.8093213101257047, + "learning_rate": 3.907510862818125e-05, + "loss": 4.9586, + "step": 2518 + }, + { + "epoch": 0.11728007076844287, + "grad_norm": 0.850564392798992, + "learning_rate": 3.9090626939788956e-05, + "loss": 5.0094, + "step": 2519 + }, + { + "epoch": 0.11732662895453592, + "grad_norm": 0.722089696968699, + "learning_rate": 3.9106145251396646e-05, + "loss": 4.9879, + "step": 2520 + }, + { + "epoch": 0.117373187140629, + "grad_norm": 0.8459473187015664, + "learning_rate": 3.912166356300435e-05, + "loss": 5.0879, + "step": 2521 + }, + { + "epoch": 0.11741974532672207, + "grad_norm": 0.8922569989658864, + "learning_rate": 3.9137181874612045e-05, + "loss": 4.9439, + "step": 2522 + }, + { + "epoch": 0.11746630351281515, + "grad_norm": 0.9244896630731141, + "learning_rate": 3.915270018621974e-05, + "loss": 4.8928, + "step": 2523 + }, + { + "epoch": 0.1175128616989082, + "grad_norm": 0.8177465254893765, + "learning_rate": 3.916821849782744e-05, + "loss": 5.0084, + "step": 2524 + }, + { + "epoch": 0.11755941988500128, + "grad_norm": 0.81980200236346, + "learning_rate": 3.9183736809435134e-05, + "loss": 4.9951, + "step": 2525 + }, + { + "epoch": 0.11760597807109435, + "grad_norm": 0.9411687720511437, + "learning_rate": 3.919925512104283e-05, + "loss": 5.1458, + "step": 2526 + }, + { + "epoch": 0.11765253625718743, + "grad_norm": 0.9529921641432794, + "learning_rate": 3.9214773432650534e-05, + "loss": 5.082, + "step": 2527 + }, + { + "epoch": 0.11769909444328049, + "grad_norm": 1.0251618035380268, + "learning_rate": 3.923029174425822e-05, + "loss": 4.9907, + "step": 2528 + }, + { + "epoch": 0.11774565262937356, + "grad_norm": 0.9269881238158605, + "learning_rate": 3.924581005586592e-05, + "loss": 5.0735, + "step": 2529 + }, + { + "epoch": 0.11779221081546663, + "grad_norm": 0.9339131677066629, + "learning_rate": 3.926132836747362e-05, + "loss": 5.0247, + "step": 2530 + }, + { + "epoch": 0.11783876900155969, + "grad_norm": 0.9533709958766925, + "learning_rate": 3.927684667908132e-05, + "loss": 5.0271, + "step": 2531 + }, + { + "epoch": 0.11788532718765277, + "grad_norm": 0.8002896163100126, + "learning_rate": 3.9292364990689015e-05, + "loss": 4.8966, + "step": 2532 + }, + { + "epoch": 0.11793188537374584, + "grad_norm": 0.726195750941081, + "learning_rate": 
3.930788330229671e-05, + "loss": 5.0647, + "step": 2533 + }, + { + "epoch": 0.11797844355983891, + "grad_norm": 0.806045398874243, + "learning_rate": 3.932340161390441e-05, + "loss": 4.9768, + "step": 2534 + }, + { + "epoch": 0.11802500174593197, + "grad_norm": 0.8658690437119724, + "learning_rate": 3.933891992551211e-05, + "loss": 4.9772, + "step": 2535 + }, + { + "epoch": 0.11807155993202505, + "grad_norm": 0.9849675059047976, + "learning_rate": 3.93544382371198e-05, + "loss": 5.0871, + "step": 2536 + }, + { + "epoch": 0.11811811811811812, + "grad_norm": 0.9517985293487979, + "learning_rate": 3.9369956548727497e-05, + "loss": 5.0268, + "step": 2537 + }, + { + "epoch": 0.1181646763042112, + "grad_norm": 0.7720712344870945, + "learning_rate": 3.93854748603352e-05, + "loss": 4.9923, + "step": 2538 + }, + { + "epoch": 0.11821123449030425, + "grad_norm": 0.7403107589120883, + "learning_rate": 3.9400993171942896e-05, + "loss": 4.9912, + "step": 2539 + }, + { + "epoch": 0.11825779267639733, + "grad_norm": 0.8687999013120279, + "learning_rate": 3.9416511483550586e-05, + "loss": 4.969, + "step": 2540 + }, + { + "epoch": 0.1183043508624904, + "grad_norm": 0.7500621495115205, + "learning_rate": 3.943202979515829e-05, + "loss": 5.0585, + "step": 2541 + }, + { + "epoch": 0.11835090904858346, + "grad_norm": 0.7799405265589467, + "learning_rate": 3.9447548106765985e-05, + "loss": 4.9895, + "step": 2542 + }, + { + "epoch": 0.11839746723467653, + "grad_norm": 0.7715104754233831, + "learning_rate": 3.946306641837368e-05, + "loss": 4.9801, + "step": 2543 + }, + { + "epoch": 0.11844402542076961, + "grad_norm": 0.6937991285077149, + "learning_rate": 3.9478584729981384e-05, + "loss": 5.0181, + "step": 2544 + }, + { + "epoch": 0.11849058360686268, + "grad_norm": 0.8950045006689051, + "learning_rate": 3.9494103041589074e-05, + "loss": 5.118, + "step": 2545 + }, + { + "epoch": 0.11853714179295574, + "grad_norm": 1.038334094992165, + "learning_rate": 3.950962135319678e-05, + "loss": 5.0804, + "step": 2546 + }, + { + "epoch": 0.11858369997904881, + "grad_norm": 0.9585371035372283, + "learning_rate": 3.952513966480447e-05, + "loss": 5.0171, + "step": 2547 + }, + { + "epoch": 0.11863025816514189, + "grad_norm": 0.7573380595941215, + "learning_rate": 3.954065797641217e-05, + "loss": 4.9736, + "step": 2548 + }, + { + "epoch": 0.11867681635123496, + "grad_norm": 0.834361316116084, + "learning_rate": 3.9556176288019866e-05, + "loss": 4.9576, + "step": 2549 + }, + { + "epoch": 0.11872337453732802, + "grad_norm": 0.9298906106725772, + "learning_rate": 3.957169459962756e-05, + "loss": 5.0636, + "step": 2550 + }, + { + "epoch": 0.1187699327234211, + "grad_norm": 0.8235458923573654, + "learning_rate": 3.958721291123526e-05, + "loss": 5.0059, + "step": 2551 + }, + { + "epoch": 0.11881649090951417, + "grad_norm": 1.0270646621654667, + "learning_rate": 3.960273122284296e-05, + "loss": 4.9781, + "step": 2552 + }, + { + "epoch": 0.11886304909560723, + "grad_norm": 0.9781931802103777, + "learning_rate": 3.961824953445065e-05, + "loss": 5.1243, + "step": 2553 + }, + { + "epoch": 0.1189096072817003, + "grad_norm": 0.7342425082935667, + "learning_rate": 3.963376784605835e-05, + "loss": 5.0248, + "step": 2554 + }, + { + "epoch": 0.11895616546779338, + "grad_norm": 0.7204585653322536, + "learning_rate": 3.964928615766605e-05, + "loss": 4.8823, + "step": 2555 + }, + { + "epoch": 0.11900272365388645, + "grad_norm": 0.8305475917774635, + "learning_rate": 3.966480446927375e-05, + "loss": 5.003, + "step": 2556 + }, + { + "epoch": 
0.11904928183997951, + "grad_norm": 0.774325259354402, + "learning_rate": 3.968032278088144e-05, + "loss": 4.9467, + "step": 2557 + }, + { + "epoch": 0.11909584002607258, + "grad_norm": 0.913329760471223, + "learning_rate": 3.969584109248914e-05, + "loss": 5.039, + "step": 2558 + }, + { + "epoch": 0.11914239821216566, + "grad_norm": 0.8944214845197148, + "learning_rate": 3.9711359404096836e-05, + "loss": 5.0853, + "step": 2559 + }, + { + "epoch": 0.11918895639825873, + "grad_norm": 0.9285298381276695, + "learning_rate": 3.972687771570454e-05, + "loss": 4.862, + "step": 2560 + }, + { + "epoch": 0.11923551458435179, + "grad_norm": 0.9214442130393615, + "learning_rate": 3.974239602731223e-05, + "loss": 4.9415, + "step": 2561 + }, + { + "epoch": 0.11928207277044486, + "grad_norm": 0.8096540567973146, + "learning_rate": 3.9757914338919925e-05, + "loss": 4.8709, + "step": 2562 + }, + { + "epoch": 0.11932863095653794, + "grad_norm": 0.7329285508849716, + "learning_rate": 3.977343265052763e-05, + "loss": 4.963, + "step": 2563 + }, + { + "epoch": 0.119375189142631, + "grad_norm": 0.8882188653007194, + "learning_rate": 3.9788950962135324e-05, + "loss": 5.0328, + "step": 2564 + }, + { + "epoch": 0.11942174732872407, + "grad_norm": 0.9451245789778725, + "learning_rate": 3.980446927374302e-05, + "loss": 5.0484, + "step": 2565 + }, + { + "epoch": 0.11946830551481714, + "grad_norm": 0.8885833261738103, + "learning_rate": 3.981998758535072e-05, + "loss": 4.9712, + "step": 2566 + }, + { + "epoch": 0.11951486370091022, + "grad_norm": 0.9275377607477113, + "learning_rate": 3.983550589695841e-05, + "loss": 4.9652, + "step": 2567 + }, + { + "epoch": 0.11956142188700328, + "grad_norm": 0.7734308257590828, + "learning_rate": 3.985102420856611e-05, + "loss": 4.8978, + "step": 2568 + }, + { + "epoch": 0.11960798007309635, + "grad_norm": 0.7101616968335112, + "learning_rate": 3.9866542520173806e-05, + "loss": 4.9675, + "step": 2569 + }, + { + "epoch": 0.11965453825918942, + "grad_norm": 0.8331943757705301, + "learning_rate": 3.98820608317815e-05, + "loss": 5.0402, + "step": 2570 + }, + { + "epoch": 0.1197010964452825, + "grad_norm": 0.8020117956118714, + "learning_rate": 3.9897579143389205e-05, + "loss": 4.9833, + "step": 2571 + }, + { + "epoch": 0.11974765463137556, + "grad_norm": 0.7352721569038166, + "learning_rate": 3.99130974549969e-05, + "loss": 5.0027, + "step": 2572 + }, + { + "epoch": 0.11979421281746863, + "grad_norm": 0.7473346999381052, + "learning_rate": 3.992861576660459e-05, + "loss": 5.0101, + "step": 2573 + }, + { + "epoch": 0.1198407710035617, + "grad_norm": 0.8103369571414824, + "learning_rate": 3.9944134078212294e-05, + "loss": 5.0296, + "step": 2574 + }, + { + "epoch": 0.11988732918965476, + "grad_norm": 0.7584495920070486, + "learning_rate": 3.995965238981999e-05, + "loss": 4.9294, + "step": 2575 + }, + { + "epoch": 0.11993388737574784, + "grad_norm": 0.8084992159911083, + "learning_rate": 3.9975170701427686e-05, + "loss": 4.992, + "step": 2576 + }, + { + "epoch": 0.11998044556184091, + "grad_norm": 0.935922039414036, + "learning_rate": 3.999068901303538e-05, + "loss": 4.9955, + "step": 2577 + }, + { + "epoch": 0.12002700374793399, + "grad_norm": 1.057345850949869, + "learning_rate": 4.000620732464308e-05, + "loss": 4.9901, + "step": 2578 + }, + { + "epoch": 0.12007356193402705, + "grad_norm": 0.8720583013480203, + "learning_rate": 4.0021725636250775e-05, + "loss": 4.9434, + "step": 2579 + }, + { + "epoch": 0.12012012012012012, + "grad_norm": 0.6843928314640112, + "learning_rate": 
4.003724394785848e-05, + "loss": 4.9469, + "step": 2580 + }, + { + "epoch": 0.12016667830621319, + "grad_norm": 0.7343506520535338, + "learning_rate": 4.005276225946617e-05, + "loss": 5.0153, + "step": 2581 + }, + { + "epoch": 0.12021323649230627, + "grad_norm": 0.7847661024373092, + "learning_rate": 4.006828057107387e-05, + "loss": 4.9842, + "step": 2582 + }, + { + "epoch": 0.12025979467839933, + "grad_norm": 0.7153435415636128, + "learning_rate": 4.008379888268157e-05, + "loss": 4.9705, + "step": 2583 + }, + { + "epoch": 0.1203063528644924, + "grad_norm": 0.7770277659446284, + "learning_rate": 4.0099317194289264e-05, + "loss": 4.9876, + "step": 2584 + }, + { + "epoch": 0.12035291105058547, + "grad_norm": 0.7396772983000333, + "learning_rate": 4.011483550589696e-05, + "loss": 4.9831, + "step": 2585 + }, + { + "epoch": 0.12039946923667853, + "grad_norm": 0.8032639623471335, + "learning_rate": 4.0130353817504656e-05, + "loss": 4.8635, + "step": 2586 + }, + { + "epoch": 0.1204460274227716, + "grad_norm": 0.7565144626170431, + "learning_rate": 4.014587212911235e-05, + "loss": 4.8359, + "step": 2587 + }, + { + "epoch": 0.12049258560886468, + "grad_norm": 0.7869471165835189, + "learning_rate": 4.0161390440720056e-05, + "loss": 5.0435, + "step": 2588 + }, + { + "epoch": 0.12053914379495775, + "grad_norm": 0.8787589906887663, + "learning_rate": 4.0176908752327745e-05, + "loss": 4.8743, + "step": 2589 + }, + { + "epoch": 0.12058570198105081, + "grad_norm": 0.8911880189899926, + "learning_rate": 4.019242706393545e-05, + "loss": 4.9452, + "step": 2590 + }, + { + "epoch": 0.12063226016714389, + "grad_norm": 0.7265908166472302, + "learning_rate": 4.0207945375543145e-05, + "loss": 4.8549, + "step": 2591 + }, + { + "epoch": 0.12067881835323696, + "grad_norm": 0.7606188180863671, + "learning_rate": 4.022346368715084e-05, + "loss": 4.9152, + "step": 2592 + }, + { + "epoch": 0.12072537653933003, + "grad_norm": 0.8040405287774692, + "learning_rate": 4.023898199875854e-05, + "loss": 4.9701, + "step": 2593 + }, + { + "epoch": 0.1207719347254231, + "grad_norm": 0.961712102729068, + "learning_rate": 4.0254500310366234e-05, + "loss": 4.998, + "step": 2594 + }, + { + "epoch": 0.12081849291151617, + "grad_norm": 0.9589919135983366, + "learning_rate": 4.027001862197393e-05, + "loss": 4.9498, + "step": 2595 + }, + { + "epoch": 0.12086505109760924, + "grad_norm": 0.8450412346852219, + "learning_rate": 4.028553693358163e-05, + "loss": 4.95, + "step": 2596 + }, + { + "epoch": 0.1209116092837023, + "grad_norm": 0.9219400019679824, + "learning_rate": 4.030105524518932e-05, + "loss": 5.129, + "step": 2597 + }, + { + "epoch": 0.12095816746979537, + "grad_norm": 0.9874936863798988, + "learning_rate": 4.031657355679702e-05, + "loss": 4.9755, + "step": 2598 + }, + { + "epoch": 0.12100472565588845, + "grad_norm": 0.914898372618779, + "learning_rate": 4.033209186840472e-05, + "loss": 4.8892, + "step": 2599 + }, + { + "epoch": 0.12105128384198152, + "grad_norm": 0.828638791393521, + "learning_rate": 4.034761018001242e-05, + "loss": 4.9151, + "step": 2600 + }, + { + "epoch": 0.12109784202807458, + "grad_norm": 0.9534569170131567, + "learning_rate": 4.0363128491620114e-05, + "loss": 4.9843, + "step": 2601 + }, + { + "epoch": 0.12114440021416766, + "grad_norm": 0.8141049758104747, + "learning_rate": 4.037864680322781e-05, + "loss": 4.9353, + "step": 2602 + }, + { + "epoch": 0.12119095840026073, + "grad_norm": 0.6472556156552672, + "learning_rate": 4.039416511483551e-05, + "loss": 4.9832, + "step": 2603 + }, + { + "epoch": 
0.1212375165863538, + "grad_norm": 0.8842480832275358, + "learning_rate": 4.0409683426443203e-05, + "loss": 4.9828, + "step": 2604 + }, + { + "epoch": 0.12128407477244686, + "grad_norm": 0.8965835826048936, + "learning_rate": 4.04252017380509e-05, + "loss": 4.9097, + "step": 2605 + }, + { + "epoch": 0.12133063295853994, + "grad_norm": 0.7762792438402065, + "learning_rate": 4.0440720049658596e-05, + "loss": 4.946, + "step": 2606 + }, + { + "epoch": 0.12137719114463301, + "grad_norm": 0.8400847144706385, + "learning_rate": 4.04562383612663e-05, + "loss": 4.9644, + "step": 2607 + }, + { + "epoch": 0.12142374933072607, + "grad_norm": 0.8070449635739094, + "learning_rate": 4.0471756672873995e-05, + "loss": 4.902, + "step": 2608 + }, + { + "epoch": 0.12147030751681914, + "grad_norm": 0.9998704706450533, + "learning_rate": 4.0487274984481685e-05, + "loss": 5.0142, + "step": 2609 + }, + { + "epoch": 0.12151686570291222, + "grad_norm": 1.1442572352633458, + "learning_rate": 4.050279329608939e-05, + "loss": 4.8906, + "step": 2610 + }, + { + "epoch": 0.12156342388900529, + "grad_norm": 0.9156126517000345, + "learning_rate": 4.0518311607697084e-05, + "loss": 5.0344, + "step": 2611 + }, + { + "epoch": 0.12160998207509835, + "grad_norm": 1.0169558294099958, + "learning_rate": 4.053382991930478e-05, + "loss": 4.9358, + "step": 2612 + }, + { + "epoch": 0.12165654026119142, + "grad_norm": 1.1618732447683084, + "learning_rate": 4.054934823091248e-05, + "loss": 4.8885, + "step": 2613 + }, + { + "epoch": 0.1217030984472845, + "grad_norm": 0.8511413218459317, + "learning_rate": 4.056486654252017e-05, + "loss": 4.8368, + "step": 2614 + }, + { + "epoch": 0.12174965663337757, + "grad_norm": 1.0463418372353934, + "learning_rate": 4.0580384854127876e-05, + "loss": 4.9825, + "step": 2615 + }, + { + "epoch": 0.12179621481947063, + "grad_norm": 0.8543442949331337, + "learning_rate": 4.059590316573557e-05, + "loss": 4.8806, + "step": 2616 + }, + { + "epoch": 0.1218427730055637, + "grad_norm": 0.7192162047740702, + "learning_rate": 4.061142147734326e-05, + "loss": 5.0149, + "step": 2617 + }, + { + "epoch": 0.12188933119165678, + "grad_norm": 0.9626411841716611, + "learning_rate": 4.0626939788950965e-05, + "loss": 4.9575, + "step": 2618 + }, + { + "epoch": 0.12193588937774984, + "grad_norm": 0.9338575871833835, + "learning_rate": 4.064245810055866e-05, + "loss": 4.9656, + "step": 2619 + }, + { + "epoch": 0.12198244756384291, + "grad_norm": 0.9068957561256715, + "learning_rate": 4.065797641216636e-05, + "loss": 5.0512, + "step": 2620 + }, + { + "epoch": 0.12202900574993598, + "grad_norm": 0.7544841551978904, + "learning_rate": 4.0673494723774054e-05, + "loss": 4.9671, + "step": 2621 + }, + { + "epoch": 0.12207556393602906, + "grad_norm": 0.9504441697951307, + "learning_rate": 4.068901303538175e-05, + "loss": 4.8878, + "step": 2622 + }, + { + "epoch": 0.12212212212212212, + "grad_norm": 0.9337124319339575, + "learning_rate": 4.070453134698945e-05, + "loss": 4.9054, + "step": 2623 + }, + { + "epoch": 0.12216868030821519, + "grad_norm": 1.141982597790208, + "learning_rate": 4.072004965859715e-05, + "loss": 4.881, + "step": 2624 + }, + { + "epoch": 0.12221523849430826, + "grad_norm": 0.8851172432177769, + "learning_rate": 4.073556797020484e-05, + "loss": 4.9493, + "step": 2625 + }, + { + "epoch": 0.12226179668040134, + "grad_norm": 0.8042712647982553, + "learning_rate": 4.075108628181254e-05, + "loss": 4.981, + "step": 2626 + }, + { + "epoch": 0.1223083548664944, + "grad_norm": 0.9073718978214569, + "learning_rate": 
4.076660459342024e-05, + "loss": 4.8911, + "step": 2627 + }, + { + "epoch": 0.12235491305258747, + "grad_norm": 0.8677401547947982, + "learning_rate": 4.0782122905027935e-05, + "loss": 4.877, + "step": 2628 + }, + { + "epoch": 0.12240147123868055, + "grad_norm": 0.8695907046724808, + "learning_rate": 4.079764121663563e-05, + "loss": 4.8631, + "step": 2629 + }, + { + "epoch": 0.1224480294247736, + "grad_norm": 1.0106511097788389, + "learning_rate": 4.081315952824333e-05, + "loss": 4.8916, + "step": 2630 + }, + { + "epoch": 0.12249458761086668, + "grad_norm": 0.9928214642230831, + "learning_rate": 4.0828677839851024e-05, + "loss": 4.8293, + "step": 2631 + }, + { + "epoch": 0.12254114579695975, + "grad_norm": 1.00072285319201, + "learning_rate": 4.084419615145873e-05, + "loss": 4.8792, + "step": 2632 + }, + { + "epoch": 0.12258770398305283, + "grad_norm": 0.8583812029809316, + "learning_rate": 4.0859714463066423e-05, + "loss": 5.0064, + "step": 2633 + }, + { + "epoch": 0.12263426216914589, + "grad_norm": 0.7711205463199866, + "learning_rate": 4.087523277467411e-05, + "loss": 4.7942, + "step": 2634 + }, + { + "epoch": 0.12268082035523896, + "grad_norm": 0.8985215463564464, + "learning_rate": 4.0890751086281816e-05, + "loss": 4.9311, + "step": 2635 + }, + { + "epoch": 0.12272737854133203, + "grad_norm": 0.7878408467933209, + "learning_rate": 4.090626939788951e-05, + "loss": 4.964, + "step": 2636 + }, + { + "epoch": 0.1227739367274251, + "grad_norm": 0.8538463634374771, + "learning_rate": 4.092178770949721e-05, + "loss": 5.0601, + "step": 2637 + }, + { + "epoch": 0.12282049491351817, + "grad_norm": 0.760863238190342, + "learning_rate": 4.0937306021104905e-05, + "loss": 4.8762, + "step": 2638 + }, + { + "epoch": 0.12286705309961124, + "grad_norm": 0.8247747199824129, + "learning_rate": 4.09528243327126e-05, + "loss": 4.9977, + "step": 2639 + }, + { + "epoch": 0.12291361128570431, + "grad_norm": 0.8087537225101828, + "learning_rate": 4.0968342644320304e-05, + "loss": 4.8866, + "step": 2640 + }, + { + "epoch": 0.12296016947179737, + "grad_norm": 0.7883547422767305, + "learning_rate": 4.0983860955928e-05, + "loss": 5.0027, + "step": 2641 + }, + { + "epoch": 0.12300672765789045, + "grad_norm": 0.9687568879997359, + "learning_rate": 4.099937926753569e-05, + "loss": 5.0062, + "step": 2642 + }, + { + "epoch": 0.12305328584398352, + "grad_norm": 1.0530018488271788, + "learning_rate": 4.101489757914339e-05, + "loss": 4.8607, + "step": 2643 + }, + { + "epoch": 0.1230998440300766, + "grad_norm": 1.0960082472437795, + "learning_rate": 4.103041589075109e-05, + "loss": 4.8672, + "step": 2644 + }, + { + "epoch": 0.12314640221616965, + "grad_norm": 0.8110094989143887, + "learning_rate": 4.1045934202358786e-05, + "loss": 4.9463, + "step": 2645 + }, + { + "epoch": 0.12319296040226273, + "grad_norm": 0.6722556783155591, + "learning_rate": 4.106145251396648e-05, + "loss": 4.8532, + "step": 2646 + }, + { + "epoch": 0.1232395185883558, + "grad_norm": 0.7970171899133425, + "learning_rate": 4.107697082557418e-05, + "loss": 5.0086, + "step": 2647 + }, + { + "epoch": 0.12328607677444886, + "grad_norm": 0.826529446241297, + "learning_rate": 4.1092489137181875e-05, + "loss": 5.0071, + "step": 2648 + }, + { + "epoch": 0.12333263496054193, + "grad_norm": 0.867711483445096, + "learning_rate": 4.110800744878958e-05, + "loss": 4.9307, + "step": 2649 + }, + { + "epoch": 0.12337919314663501, + "grad_norm": 0.9856085970183479, + "learning_rate": 4.112352576039727e-05, + "loss": 4.9355, + "step": 2650 + }, + { + "epoch": 
0.12342575133272808, + "grad_norm": 0.9784067962569764, + "learning_rate": 4.113904407200497e-05, + "loss": 4.9496, + "step": 2651 + }, + { + "epoch": 0.12347230951882114, + "grad_norm": 0.996351547324134, + "learning_rate": 4.115456238361267e-05, + "loss": 4.8667, + "step": 2652 + }, + { + "epoch": 0.12351886770491421, + "grad_norm": 1.177692462580086, + "learning_rate": 4.117008069522036e-05, + "loss": 4.9394, + "step": 2653 + }, + { + "epoch": 0.12356542589100729, + "grad_norm": 0.770333627574338, + "learning_rate": 4.118559900682806e-05, + "loss": 4.8125, + "step": 2654 + }, + { + "epoch": 0.12361198407710036, + "grad_norm": 0.9408774891020674, + "learning_rate": 4.1201117318435756e-05, + "loss": 4.934, + "step": 2655 + }, + { + "epoch": 0.12365854226319342, + "grad_norm": 1.2141149861285727, + "learning_rate": 4.121663563004345e-05, + "loss": 5.0151, + "step": 2656 + }, + { + "epoch": 0.1237051004492865, + "grad_norm": 0.7379417347081493, + "learning_rate": 4.1232153941651155e-05, + "loss": 4.8291, + "step": 2657 + }, + { + "epoch": 0.12375165863537957, + "grad_norm": 1.0724834222301582, + "learning_rate": 4.1247672253258845e-05, + "loss": 5.0386, + "step": 2658 + }, + { + "epoch": 0.12379821682147263, + "grad_norm": 0.9304003599882185, + "learning_rate": 4.126319056486654e-05, + "loss": 5.0184, + "step": 2659 + }, + { + "epoch": 0.1238447750075657, + "grad_norm": 0.6222403526387054, + "learning_rate": 4.1278708876474244e-05, + "loss": 5.02, + "step": 2660 + }, + { + "epoch": 0.12389133319365878, + "grad_norm": 0.9237050909928954, + "learning_rate": 4.129422718808194e-05, + "loss": 4.8431, + "step": 2661 + }, + { + "epoch": 0.12393789137975185, + "grad_norm": 0.9225183868353831, + "learning_rate": 4.130974549968964e-05, + "loss": 4.9421, + "step": 2662 + }, + { + "epoch": 0.12398444956584491, + "grad_norm": 0.8383989342892578, + "learning_rate": 4.132526381129733e-05, + "loss": 4.8496, + "step": 2663 + }, + { + "epoch": 0.12403100775193798, + "grad_norm": 0.9367379262119893, + "learning_rate": 4.134078212290503e-05, + "loss": 4.8966, + "step": 2664 + }, + { + "epoch": 0.12407756593803106, + "grad_norm": 0.7692446494969879, + "learning_rate": 4.135630043451273e-05, + "loss": 4.9772, + "step": 2665 + }, + { + "epoch": 0.12412412412412413, + "grad_norm": 0.918023404357128, + "learning_rate": 4.137181874612042e-05, + "loss": 4.8879, + "step": 2666 + }, + { + "epoch": 0.12417068231021719, + "grad_norm": 0.9545660657543578, + "learning_rate": 4.138733705772812e-05, + "loss": 4.9047, + "step": 2667 + }, + { + "epoch": 0.12421724049631026, + "grad_norm": 0.7651144845172564, + "learning_rate": 4.140285536933582e-05, + "loss": 4.8873, + "step": 2668 + }, + { + "epoch": 0.12426379868240334, + "grad_norm": 0.7834245558768874, + "learning_rate": 4.141837368094352e-05, + "loss": 4.9012, + "step": 2669 + }, + { + "epoch": 0.1243103568684964, + "grad_norm": 0.7153048944091634, + "learning_rate": 4.143389199255121e-05, + "loss": 4.8485, + "step": 2670 + }, + { + "epoch": 0.12435691505458947, + "grad_norm": 0.6605451383080413, + "learning_rate": 4.144941030415891e-05, + "loss": 4.9117, + "step": 2671 + }, + { + "epoch": 0.12440347324068254, + "grad_norm": 0.7422434003995254, + "learning_rate": 4.1464928615766607e-05, + "loss": 4.9455, + "step": 2672 + }, + { + "epoch": 0.12445003142677562, + "grad_norm": 0.8075987994325488, + "learning_rate": 4.14804469273743e-05, + "loss": 4.9234, + "step": 2673 + }, + { + "epoch": 0.12449658961286868, + "grad_norm": 0.8252851183901508, + "learning_rate": 
4.1495965238982e-05, + "loss": 4.8793, + "step": 2674 + }, + { + "epoch": 0.12454314779896175, + "grad_norm": 0.815865194461191, + "learning_rate": 4.1511483550589695e-05, + "loss": 4.8919, + "step": 2675 + }, + { + "epoch": 0.12458970598505482, + "grad_norm": 0.9092847013119674, + "learning_rate": 4.15270018621974e-05, + "loss": 4.8584, + "step": 2676 + }, + { + "epoch": 0.1246362641711479, + "grad_norm": 0.9024063342684019, + "learning_rate": 4.1542520173805095e-05, + "loss": 4.973, + "step": 2677 + }, + { + "epoch": 0.12468282235724096, + "grad_norm": 0.9986970663091269, + "learning_rate": 4.1558038485412784e-05, + "loss": 4.8216, + "step": 2678 + }, + { + "epoch": 0.12472938054333403, + "grad_norm": 0.916557117306804, + "learning_rate": 4.157355679702049e-05, + "loss": 4.913, + "step": 2679 + }, + { + "epoch": 0.1247759387294271, + "grad_norm": 0.841434551290559, + "learning_rate": 4.1589075108628184e-05, + "loss": 4.8398, + "step": 2680 + }, + { + "epoch": 0.12482249691552016, + "grad_norm": 0.9267055851974223, + "learning_rate": 4.160459342023588e-05, + "loss": 4.824, + "step": 2681 + }, + { + "epoch": 0.12486905510161324, + "grad_norm": 0.9167541110909448, + "learning_rate": 4.1620111731843576e-05, + "loss": 4.967, + "step": 2682 + }, + { + "epoch": 0.12491561328770631, + "grad_norm": 0.7546525480599862, + "learning_rate": 4.163563004345127e-05, + "loss": 4.8819, + "step": 2683 + }, + { + "epoch": 0.12496217147379939, + "grad_norm": 0.8173535465574733, + "learning_rate": 4.165114835505897e-05, + "loss": 4.9096, + "step": 2684 + }, + { + "epoch": 0.12500872965989246, + "grad_norm": 0.7652721390919147, + "learning_rate": 4.166666666666667e-05, + "loss": 4.8901, + "step": 2685 + }, + { + "epoch": 0.12505528784598552, + "grad_norm": 0.8815949412009698, + "learning_rate": 4.168218497827436e-05, + "loss": 4.9566, + "step": 2686 + }, + { + "epoch": 0.12510184603207858, + "grad_norm": 1.0181749616524256, + "learning_rate": 4.1697703289882065e-05, + "loss": 4.9728, + "step": 2687 + }, + { + "epoch": 0.12514840421817167, + "grad_norm": 0.7413711259473422, + "learning_rate": 4.171322160148976e-05, + "loss": 4.9926, + "step": 2688 + }, + { + "epoch": 0.12519496240426473, + "grad_norm": 0.7550945551106625, + "learning_rate": 4.172873991309746e-05, + "loss": 4.9807, + "step": 2689 + }, + { + "epoch": 0.1252415205903578, + "grad_norm": 0.840776231454145, + "learning_rate": 4.1744258224705154e-05, + "loss": 4.8433, + "step": 2690 + }, + { + "epoch": 0.12528807877645087, + "grad_norm": 0.7739178702455289, + "learning_rate": 4.175977653631285e-05, + "loss": 4.7929, + "step": 2691 + }, + { + "epoch": 0.12533463696254393, + "grad_norm": 0.8673917728487918, + "learning_rate": 4.1775294847920546e-05, + "loss": 4.7883, + "step": 2692 + }, + { + "epoch": 0.12538119514863702, + "grad_norm": 0.7549381715360681, + "learning_rate": 4.179081315952825e-05, + "loss": 4.789, + "step": 2693 + }, + { + "epoch": 0.12542775333473008, + "grad_norm": 0.8107229685763814, + "learning_rate": 4.180633147113594e-05, + "loss": 4.8809, + "step": 2694 + }, + { + "epoch": 0.12547431152082314, + "grad_norm": 0.9344106476314507, + "learning_rate": 4.1821849782743635e-05, + "loss": 4.7612, + "step": 2695 + }, + { + "epoch": 0.12552086970691623, + "grad_norm": 0.7823293004928611, + "learning_rate": 4.183736809435134e-05, + "loss": 4.7582, + "step": 2696 + }, + { + "epoch": 0.1255674278930093, + "grad_norm": 0.8454105814536212, + "learning_rate": 4.1852886405959035e-05, + "loss": 4.8571, + "step": 2697 + }, + { + "epoch": 
0.12561398607910235, + "grad_norm": 0.9977953668911927, + "learning_rate": 4.186840471756673e-05, + "loss": 4.8724, + "step": 2698 + }, + { + "epoch": 0.12566054426519543, + "grad_norm": 1.170607018673931, + "learning_rate": 4.188392302917443e-05, + "loss": 4.9314, + "step": 2699 + }, + { + "epoch": 0.1257071024512885, + "grad_norm": 0.8615552136657284, + "learning_rate": 4.1899441340782123e-05, + "loss": 4.9038, + "step": 2700 + }, + { + "epoch": 0.12575366063738158, + "grad_norm": 0.8470783418811595, + "learning_rate": 4.1914959652389827e-05, + "loss": 4.8706, + "step": 2701 + }, + { + "epoch": 0.12580021882347464, + "grad_norm": 0.8719646799058672, + "learning_rate": 4.1930477963997516e-05, + "loss": 4.8836, + "step": 2702 + }, + { + "epoch": 0.1258467770095677, + "grad_norm": 1.0274938149449473, + "learning_rate": 4.194599627560521e-05, + "loss": 4.8579, + "step": 2703 + }, + { + "epoch": 0.1258933351956608, + "grad_norm": 0.9524955540594547, + "learning_rate": 4.1961514587212915e-05, + "loss": 4.9209, + "step": 2704 + }, + { + "epoch": 0.12593989338175385, + "grad_norm": 0.7765840172189998, + "learning_rate": 4.197703289882061e-05, + "loss": 4.8543, + "step": 2705 + }, + { + "epoch": 0.1259864515678469, + "grad_norm": 0.7692455694081343, + "learning_rate": 4.199255121042831e-05, + "loss": 4.8265, + "step": 2706 + }, + { + "epoch": 0.12603300975394, + "grad_norm": 0.9368006138070308, + "learning_rate": 4.2008069522036004e-05, + "loss": 4.8537, + "step": 2707 + }, + { + "epoch": 0.12607956794003305, + "grad_norm": 0.9346931925568449, + "learning_rate": 4.20235878336437e-05, + "loss": 4.8544, + "step": 2708 + }, + { + "epoch": 0.12612612612612611, + "grad_norm": 0.8738691467092479, + "learning_rate": 4.20391061452514e-05, + "loss": 4.8233, + "step": 2709 + }, + { + "epoch": 0.1261726843122192, + "grad_norm": 0.8266807434555379, + "learning_rate": 4.205462445685909e-05, + "loss": 4.8969, + "step": 2710 + }, + { + "epoch": 0.12621924249831226, + "grad_norm": 0.981232820057412, + "learning_rate": 4.207014276846679e-05, + "loss": 4.8966, + "step": 2711 + }, + { + "epoch": 0.12626580068440535, + "grad_norm": 1.2207121854212115, + "learning_rate": 4.208566108007449e-05, + "loss": 4.747, + "step": 2712 + }, + { + "epoch": 0.1263123588704984, + "grad_norm": 0.8383760112999337, + "learning_rate": 4.210117939168219e-05, + "loss": 4.8258, + "step": 2713 + }, + { + "epoch": 0.12635891705659147, + "grad_norm": 1.0118990645309862, + "learning_rate": 4.2116697703289885e-05, + "loss": 4.8213, + "step": 2714 + }, + { + "epoch": 0.12640547524268456, + "grad_norm": 1.0068711031697328, + "learning_rate": 4.213221601489758e-05, + "loss": 4.782, + "step": 2715 + }, + { + "epoch": 0.12645203342877762, + "grad_norm": 0.9528420682473879, + "learning_rate": 4.214773432650528e-05, + "loss": 4.9057, + "step": 2716 + }, + { + "epoch": 0.12649859161487068, + "grad_norm": 1.0558525440249509, + "learning_rate": 4.2163252638112974e-05, + "loss": 5.081, + "step": 2717 + }, + { + "epoch": 0.12654514980096376, + "grad_norm": 0.9299040768429889, + "learning_rate": 4.217877094972068e-05, + "loss": 4.8619, + "step": 2718 + }, + { + "epoch": 0.12659170798705682, + "grad_norm": 0.89916048517845, + "learning_rate": 4.219428926132837e-05, + "loss": 4.8585, + "step": 2719 + }, + { + "epoch": 0.12663826617314988, + "grad_norm": 1.0774096438964182, + "learning_rate": 4.220980757293606e-05, + "loss": 4.6814, + "step": 2720 + }, + { + "epoch": 0.12668482435924297, + "grad_norm": 1.1308883729793602, + "learning_rate": 
4.2225325884543766e-05, + "loss": 4.9175, + "step": 2721 + }, + { + "epoch": 0.12673138254533603, + "grad_norm": 0.8707061101088851, + "learning_rate": 4.224084419615146e-05, + "loss": 4.7394, + "step": 2722 + }, + { + "epoch": 0.12677794073142912, + "grad_norm": 0.782560273016422, + "learning_rate": 4.225636250775916e-05, + "loss": 4.7812, + "step": 2723 + }, + { + "epoch": 0.12682449891752218, + "grad_norm": 0.8061134803957728, + "learning_rate": 4.2271880819366855e-05, + "loss": 4.7234, + "step": 2724 + }, + { + "epoch": 0.12687105710361524, + "grad_norm": 0.8969527865432384, + "learning_rate": 4.228739913097455e-05, + "loss": 4.782, + "step": 2725 + }, + { + "epoch": 0.12691761528970832, + "grad_norm": 0.9156617517341343, + "learning_rate": 4.2302917442582255e-05, + "loss": 4.7966, + "step": 2726 + }, + { + "epoch": 0.12696417347580138, + "grad_norm": 0.9122677146836741, + "learning_rate": 4.2318435754189944e-05, + "loss": 4.6909, + "step": 2727 + }, + { + "epoch": 0.12701073166189444, + "grad_norm": 0.7758707500703437, + "learning_rate": 4.233395406579764e-05, + "loss": 4.8407, + "step": 2728 + }, + { + "epoch": 0.12705728984798753, + "grad_norm": 0.7268073206581791, + "learning_rate": 4.2349472377405343e-05, + "loss": 4.735, + "step": 2729 + }, + { + "epoch": 0.1271038480340806, + "grad_norm": 0.9375814849356556, + "learning_rate": 4.236499068901304e-05, + "loss": 4.8856, + "step": 2730 + }, + { + "epoch": 0.12715040622017365, + "grad_norm": 1.1925567283574883, + "learning_rate": 4.2380509000620736e-05, + "loss": 4.8439, + "step": 2731 + }, + { + "epoch": 0.12719696440626674, + "grad_norm": 0.8350048914664627, + "learning_rate": 4.239602731222843e-05, + "loss": 4.9011, + "step": 2732 + }, + { + "epoch": 0.1272435225923598, + "grad_norm": 0.9686633414754805, + "learning_rate": 4.241154562383613e-05, + "loss": 4.8364, + "step": 2733 + }, + { + "epoch": 0.12729008077845286, + "grad_norm": 1.2936447616733047, + "learning_rate": 4.2427063935443825e-05, + "loss": 4.8271, + "step": 2734 + }, + { + "epoch": 0.12733663896454595, + "grad_norm": 0.6366489222335578, + "learning_rate": 4.244258224705152e-05, + "loss": 4.948, + "step": 2735 + }, + { + "epoch": 0.127383197150639, + "grad_norm": 1.3583847665861517, + "learning_rate": 4.245810055865922e-05, + "loss": 4.7574, + "step": 2736 + }, + { + "epoch": 0.1274297553367321, + "grad_norm": 0.7539851538242279, + "learning_rate": 4.247361887026692e-05, + "loss": 4.7664, + "step": 2737 + }, + { + "epoch": 0.12747631352282515, + "grad_norm": 1.2024219969504424, + "learning_rate": 4.248913718187462e-05, + "loss": 4.8061, + "step": 2738 + }, + { + "epoch": 0.1275228717089182, + "grad_norm": 0.8058841859968038, + "learning_rate": 4.2504655493482307e-05, + "loss": 4.8223, + "step": 2739 + }, + { + "epoch": 0.1275694298950113, + "grad_norm": 0.9740979325768201, + "learning_rate": 4.252017380509001e-05, + "loss": 4.7602, + "step": 2740 + }, + { + "epoch": 0.12761598808110436, + "grad_norm": 0.8663956735323202, + "learning_rate": 4.2535692116697706e-05, + "loss": 4.8394, + "step": 2741 + }, + { + "epoch": 0.12766254626719742, + "grad_norm": 0.9435819677812223, + "learning_rate": 4.25512104283054e-05, + "loss": 4.8057, + "step": 2742 + }, + { + "epoch": 0.1277091044532905, + "grad_norm": 0.980170417185387, + "learning_rate": 4.25667287399131e-05, + "loss": 4.8633, + "step": 2743 + }, + { + "epoch": 0.12775566263938357, + "grad_norm": 0.8995573271376702, + "learning_rate": 4.2582247051520795e-05, + "loss": 4.747, + "step": 2744 + }, + { + "epoch": 
0.12780222082547663, + "grad_norm": 1.029052335750342, + "learning_rate": 4.259776536312849e-05, + "loss": 4.8343, + "step": 2745 + }, + { + "epoch": 0.1278487790115697, + "grad_norm": 0.8396619470295855, + "learning_rate": 4.2613283674736194e-05, + "loss": 4.828, + "step": 2746 + }, + { + "epoch": 0.12789533719766277, + "grad_norm": 1.0010542012481707, + "learning_rate": 4.2628801986343884e-05, + "loss": 4.7639, + "step": 2747 + }, + { + "epoch": 0.12794189538375586, + "grad_norm": 1.361726628017166, + "learning_rate": 4.264432029795159e-05, + "loss": 4.6815, + "step": 2748 + }, + { + "epoch": 0.12798845356984892, + "grad_norm": 0.6950557020850753, + "learning_rate": 4.265983860955928e-05, + "loss": 4.8774, + "step": 2749 + }, + { + "epoch": 0.12803501175594198, + "grad_norm": 1.3290469936936105, + "learning_rate": 4.267535692116698e-05, + "loss": 4.8962, + "step": 2750 + }, + { + "epoch": 0.12808156994203507, + "grad_norm": 0.8624101139722261, + "learning_rate": 4.2690875232774676e-05, + "loss": 4.844, + "step": 2751 + }, + { + "epoch": 0.12812812812812813, + "grad_norm": 0.802632064770572, + "learning_rate": 4.270639354438237e-05, + "loss": 4.7717, + "step": 2752 + }, + { + "epoch": 0.1281746863142212, + "grad_norm": 0.9724191664511138, + "learning_rate": 4.272191185599007e-05, + "loss": 4.7039, + "step": 2753 + }, + { + "epoch": 0.12822124450031427, + "grad_norm": 0.889209735817561, + "learning_rate": 4.273743016759777e-05, + "loss": 4.8395, + "step": 2754 + }, + { + "epoch": 0.12826780268640733, + "grad_norm": 0.8106073937155232, + "learning_rate": 4.275294847920546e-05, + "loss": 4.7551, + "step": 2755 + }, + { + "epoch": 0.1283143608725004, + "grad_norm": 0.9828480840347945, + "learning_rate": 4.2768466790813164e-05, + "loss": 4.8954, + "step": 2756 + }, + { + "epoch": 0.12836091905859348, + "grad_norm": 0.8495266783424291, + "learning_rate": 4.278398510242086e-05, + "loss": 4.7806, + "step": 2757 + }, + { + "epoch": 0.12840747724468654, + "grad_norm": 0.6468034424120218, + "learning_rate": 4.279950341402856e-05, + "loss": 4.7761, + "step": 2758 + }, + { + "epoch": 0.12845403543077963, + "grad_norm": 0.7768354835570608, + "learning_rate": 4.281502172563625e-05, + "loss": 4.7906, + "step": 2759 + }, + { + "epoch": 0.1285005936168727, + "grad_norm": 0.7777674751490496, + "learning_rate": 4.283054003724395e-05, + "loss": 4.7947, + "step": 2760 + }, + { + "epoch": 0.12854715180296575, + "grad_norm": 0.8854155700664524, + "learning_rate": 4.2846058348851646e-05, + "loss": 4.932, + "step": 2761 + }, + { + "epoch": 0.12859370998905884, + "grad_norm": 0.820381016124698, + "learning_rate": 4.286157666045935e-05, + "loss": 4.8445, + "step": 2762 + }, + { + "epoch": 0.1286402681751519, + "grad_norm": 0.845830691801744, + "learning_rate": 4.287709497206704e-05, + "loss": 4.885, + "step": 2763 + }, + { + "epoch": 0.12868682636124495, + "grad_norm": 0.8725509422025446, + "learning_rate": 4.2892613283674735e-05, + "loss": 4.665, + "step": 2764 + }, + { + "epoch": 0.12873338454733804, + "grad_norm": 0.9412220391761814, + "learning_rate": 4.290813159528244e-05, + "loss": 4.8532, + "step": 2765 + }, + { + "epoch": 0.1287799427334311, + "grad_norm": 1.0698839185751379, + "learning_rate": 4.2923649906890134e-05, + "loss": 4.8743, + "step": 2766 + }, + { + "epoch": 0.12882650091952416, + "grad_norm": 0.8259118534971871, + "learning_rate": 4.293916821849783e-05, + "loss": 4.7352, + "step": 2767 + }, + { + "epoch": 0.12887305910561725, + "grad_norm": 0.8269400694389787, + "learning_rate": 
4.2954686530105527e-05, + "loss": 4.8654, + "step": 2768 + }, + { + "epoch": 0.1289196172917103, + "grad_norm": 0.8867214988981265, + "learning_rate": 4.297020484171322e-05, + "loss": 4.7943, + "step": 2769 + }, + { + "epoch": 0.1289661754778034, + "grad_norm": 0.7685143006503843, + "learning_rate": 4.2985723153320926e-05, + "loss": 4.7791, + "step": 2770 + }, + { + "epoch": 0.12901273366389646, + "grad_norm": 0.7320437098372862, + "learning_rate": 4.3001241464928615e-05, + "loss": 4.8129, + "step": 2771 + }, + { + "epoch": 0.12905929184998952, + "grad_norm": 0.8050702505707135, + "learning_rate": 4.301675977653631e-05, + "loss": 4.8262, + "step": 2772 + }, + { + "epoch": 0.1291058500360826, + "grad_norm": 0.8310948985535732, + "learning_rate": 4.3032278088144015e-05, + "loss": 4.7258, + "step": 2773 + }, + { + "epoch": 0.12915240822217566, + "grad_norm": 0.7385409186430264, + "learning_rate": 4.304779639975171e-05, + "loss": 4.8095, + "step": 2774 + }, + { + "epoch": 0.12919896640826872, + "grad_norm": 0.8395033138149072, + "learning_rate": 4.30633147113594e-05, + "loss": 4.734, + "step": 2775 + }, + { + "epoch": 0.1292455245943618, + "grad_norm": 0.7455289687976785, + "learning_rate": 4.3078833022967104e-05, + "loss": 4.7137, + "step": 2776 + }, + { + "epoch": 0.12929208278045487, + "grad_norm": 0.748351907569335, + "learning_rate": 4.30943513345748e-05, + "loss": 4.8099, + "step": 2777 + }, + { + "epoch": 0.12933864096654793, + "grad_norm": 0.8485867154360214, + "learning_rate": 4.3109869646182496e-05, + "loss": 4.7886, + "step": 2778 + }, + { + "epoch": 0.12938519915264102, + "grad_norm": 0.8749750992965194, + "learning_rate": 4.312538795779019e-05, + "loss": 4.8167, + "step": 2779 + }, + { + "epoch": 0.12943175733873408, + "grad_norm": 0.7336210196058534, + "learning_rate": 4.314090626939789e-05, + "loss": 4.8048, + "step": 2780 + }, + { + "epoch": 0.12947831552482716, + "grad_norm": 0.7897809265213982, + "learning_rate": 4.315642458100559e-05, + "loss": 4.721, + "step": 2781 + }, + { + "epoch": 0.12952487371092022, + "grad_norm": 0.8181314714408823, + "learning_rate": 4.317194289261329e-05, + "loss": 4.7339, + "step": 2782 + }, + { + "epoch": 0.12957143189701328, + "grad_norm": 0.7434693381477568, + "learning_rate": 4.318746120422098e-05, + "loss": 4.7347, + "step": 2783 + }, + { + "epoch": 0.12961799008310637, + "grad_norm": 0.6755817088520462, + "learning_rate": 4.320297951582868e-05, + "loss": 4.8928, + "step": 2784 + }, + { + "epoch": 0.12966454826919943, + "grad_norm": 0.8070424920211314, + "learning_rate": 4.321849782743638e-05, + "loss": 4.7849, + "step": 2785 + }, + { + "epoch": 0.1297111064552925, + "grad_norm": 0.7508752189280451, + "learning_rate": 4.3234016139044074e-05, + "loss": 4.8782, + "step": 2786 + }, + { + "epoch": 0.12975766464138558, + "grad_norm": 0.6889556179690256, + "learning_rate": 4.324953445065177e-05, + "loss": 4.7685, + "step": 2787 + }, + { + "epoch": 0.12980422282747864, + "grad_norm": 0.6900308893985424, + "learning_rate": 4.3265052762259466e-05, + "loss": 4.7258, + "step": 2788 + }, + { + "epoch": 0.1298507810135717, + "grad_norm": 0.6731553286262023, + "learning_rate": 4.328057107386716e-05, + "loss": 4.796, + "step": 2789 + }, + { + "epoch": 0.12989733919966479, + "grad_norm": 0.6556047850290916, + "learning_rate": 4.3296089385474866e-05, + "loss": 4.7286, + "step": 2790 + }, + { + "epoch": 0.12994389738575784, + "grad_norm": 0.6596244477930296, + "learning_rate": 4.3311607697082555e-05, + "loss": 4.5666, + "step": 2791 + }, + { + "epoch": 
0.12999045557185093, + "grad_norm": 0.6538035065863326, + "learning_rate": 4.332712600869026e-05, + "loss": 4.7994, + "step": 2792 + }, + { + "epoch": 0.130037013757944, + "grad_norm": 0.7868987599844526, + "learning_rate": 4.3342644320297955e-05, + "loss": 4.7953, + "step": 2793 + }, + { + "epoch": 0.13008357194403705, + "grad_norm": 1.0629601265451196, + "learning_rate": 4.335816263190565e-05, + "loss": 4.8118, + "step": 2794 + }, + { + "epoch": 0.13013013013013014, + "grad_norm": 1.030656144533183, + "learning_rate": 4.337368094351335e-05, + "loss": 4.8689, + "step": 2795 + }, + { + "epoch": 0.1301766883162232, + "grad_norm": 0.9401948030332339, + "learning_rate": 4.3389199255121043e-05, + "loss": 4.756, + "step": 2796 + }, + { + "epoch": 0.13022324650231626, + "grad_norm": 0.8926146608933236, + "learning_rate": 4.340471756672874e-05, + "loss": 4.826, + "step": 2797 + }, + { + "epoch": 0.13026980468840935, + "grad_norm": 0.9249821735416128, + "learning_rate": 4.342023587833644e-05, + "loss": 4.785, + "step": 2798 + }, + { + "epoch": 0.1303163628745024, + "grad_norm": 0.9276571738144802, + "learning_rate": 4.343575418994414e-05, + "loss": 4.8933, + "step": 2799 + }, + { + "epoch": 0.13036292106059547, + "grad_norm": 0.8529795537051544, + "learning_rate": 4.345127250155183e-05, + "loss": 4.6982, + "step": 2800 + }, + { + "epoch": 0.13040947924668855, + "grad_norm": 0.8144344518501367, + "learning_rate": 4.346679081315953e-05, + "loss": 4.7398, + "step": 2801 + }, + { + "epoch": 0.1304560374327816, + "grad_norm": 0.9031128632524574, + "learning_rate": 4.348230912476723e-05, + "loss": 4.8656, + "step": 2802 + }, + { + "epoch": 0.1305025956188747, + "grad_norm": 1.0297616429564271, + "learning_rate": 4.3497827436374924e-05, + "loss": 4.792, + "step": 2803 + }, + { + "epoch": 0.13054915380496776, + "grad_norm": 0.9543793010806308, + "learning_rate": 4.351334574798262e-05, + "loss": 4.842, + "step": 2804 + }, + { + "epoch": 0.13059571199106082, + "grad_norm": 0.9307065631208045, + "learning_rate": 4.352886405959032e-05, + "loss": 4.8206, + "step": 2805 + }, + { + "epoch": 0.1306422701771539, + "grad_norm": 1.089881514358782, + "learning_rate": 4.354438237119802e-05, + "loss": 4.5822, + "step": 2806 + }, + { + "epoch": 0.13068882836324697, + "grad_norm": 0.8774296017166181, + "learning_rate": 4.3559900682805716e-05, + "loss": 4.8395, + "step": 2807 + }, + { + "epoch": 0.13073538654934003, + "grad_norm": 0.7555616317909334, + "learning_rate": 4.3575418994413406e-05, + "loss": 4.7801, + "step": 2808 + }, + { + "epoch": 0.13078194473543311, + "grad_norm": 1.0089827041903972, + "learning_rate": 4.359093730602111e-05, + "loss": 4.7978, + "step": 2809 + }, + { + "epoch": 0.13082850292152617, + "grad_norm": 1.0930261254478522, + "learning_rate": 4.3606455617628805e-05, + "loss": 4.7497, + "step": 2810 + }, + { + "epoch": 0.13087506110761923, + "grad_norm": 0.9018481212092657, + "learning_rate": 4.36219739292365e-05, + "loss": 4.7954, + "step": 2811 + }, + { + "epoch": 0.13092161929371232, + "grad_norm": 0.9657622882933623, + "learning_rate": 4.36374922408442e-05, + "loss": 4.8279, + "step": 2812 + }, + { + "epoch": 0.13096817747980538, + "grad_norm": 1.0771394634523201, + "learning_rate": 4.3653010552451894e-05, + "loss": 4.8429, + "step": 2813 + }, + { + "epoch": 0.13101473566589847, + "grad_norm": 0.7741209804330789, + "learning_rate": 4.366852886405959e-05, + "loss": 4.7528, + "step": 2814 + }, + { + "epoch": 0.13106129385199153, + "grad_norm": 0.9990439460071457, + "learning_rate": 
4.3684047175667294e-05, + "loss": 4.7381, + "step": 2815 + }, + { + "epoch": 0.1311078520380846, + "grad_norm": 0.8873698888421897, + "learning_rate": 4.369956548727498e-05, + "loss": 4.6166, + "step": 2816 + }, + { + "epoch": 0.13115441022417768, + "grad_norm": 0.7193575758715539, + "learning_rate": 4.3715083798882686e-05, + "loss": 4.8806, + "step": 2817 + }, + { + "epoch": 0.13120096841027074, + "grad_norm": 0.9542959545025423, + "learning_rate": 4.373060211049038e-05, + "loss": 4.7005, + "step": 2818 + }, + { + "epoch": 0.1312475265963638, + "grad_norm": 0.9181144983400851, + "learning_rate": 4.374612042209808e-05, + "loss": 4.8199, + "step": 2819 + }, + { + "epoch": 0.13129408478245688, + "grad_norm": 1.226956775096458, + "learning_rate": 4.3761638733705775e-05, + "loss": 4.7415, + "step": 2820 + }, + { + "epoch": 0.13134064296854994, + "grad_norm": 0.961989059749543, + "learning_rate": 4.377715704531347e-05, + "loss": 4.7911, + "step": 2821 + }, + { + "epoch": 0.131387201154643, + "grad_norm": 0.9025094522911136, + "learning_rate": 4.379267535692117e-05, + "loss": 4.6715, + "step": 2822 + }, + { + "epoch": 0.1314337593407361, + "grad_norm": 0.7913051875126521, + "learning_rate": 4.380819366852887e-05, + "loss": 4.8086, + "step": 2823 + }, + { + "epoch": 0.13148031752682915, + "grad_norm": 0.8391880152355446, + "learning_rate": 4.382371198013656e-05, + "loss": 4.8605, + "step": 2824 + }, + { + "epoch": 0.13152687571292224, + "grad_norm": 0.9419481523077586, + "learning_rate": 4.383923029174426e-05, + "loss": 4.7436, + "step": 2825 + }, + { + "epoch": 0.1315734338990153, + "grad_norm": 0.7662614332404354, + "learning_rate": 4.385474860335196e-05, + "loss": 4.8892, + "step": 2826 + }, + { + "epoch": 0.13161999208510836, + "grad_norm": 0.7673167846499647, + "learning_rate": 4.3870266914959656e-05, + "loss": 4.6729, + "step": 2827 + }, + { + "epoch": 0.13166655027120144, + "grad_norm": 0.7241832397071909, + "learning_rate": 4.388578522656735e-05, + "loss": 4.5537, + "step": 2828 + }, + { + "epoch": 0.1317131084572945, + "grad_norm": 0.7761285754428481, + "learning_rate": 4.390130353817505e-05, + "loss": 4.7876, + "step": 2829 + }, + { + "epoch": 0.13175966664338756, + "grad_norm": 0.7683788328067772, + "learning_rate": 4.3916821849782745e-05, + "loss": 4.7446, + "step": 2830 + }, + { + "epoch": 0.13180622482948065, + "grad_norm": 0.8907365219404226, + "learning_rate": 4.393234016139045e-05, + "loss": 4.8241, + "step": 2831 + }, + { + "epoch": 0.1318527830155737, + "grad_norm": 0.9238867184967112, + "learning_rate": 4.394785847299814e-05, + "loss": 4.7415, + "step": 2832 + }, + { + "epoch": 0.13189934120166677, + "grad_norm": 0.9194593332886217, + "learning_rate": 4.3963376784605834e-05, + "loss": 4.7412, + "step": 2833 + }, + { + "epoch": 0.13194589938775986, + "grad_norm": 0.8025768631381384, + "learning_rate": 4.397889509621354e-05, + "loss": 4.8106, + "step": 2834 + }, + { + "epoch": 0.13199245757385292, + "grad_norm": 0.7887791731311476, + "learning_rate": 4.399441340782123e-05, + "loss": 4.7895, + "step": 2835 + }, + { + "epoch": 0.132039015759946, + "grad_norm": 0.9198291152187389, + "learning_rate": 4.400993171942892e-05, + "loss": 4.7418, + "step": 2836 + }, + { + "epoch": 0.13208557394603906, + "grad_norm": 0.8352230024113898, + "learning_rate": 4.4025450031036626e-05, + "loss": 4.7748, + "step": 2837 + }, + { + "epoch": 0.13213213213213212, + "grad_norm": 0.6950298814237272, + "learning_rate": 4.404096834264432e-05, + "loss": 4.7677, + "step": 2838 + }, + { + "epoch": 
0.1321786903182252, + "grad_norm": 0.7705372304308196, + "learning_rate": 4.405648665425202e-05, + "loss": 4.7621, + "step": 2839 + }, + { + "epoch": 0.13222524850431827, + "grad_norm": 0.916825252311995, + "learning_rate": 4.4072004965859715e-05, + "loss": 4.7308, + "step": 2840 + }, + { + "epoch": 0.13227180669041133, + "grad_norm": 0.9485278473439802, + "learning_rate": 4.408752327746741e-05, + "loss": 4.8161, + "step": 2841 + }, + { + "epoch": 0.13231836487650442, + "grad_norm": 0.8471263607679573, + "learning_rate": 4.4103041589075114e-05, + "loss": 4.8148, + "step": 2842 + }, + { + "epoch": 0.13236492306259748, + "grad_norm": 0.8573887196609228, + "learning_rate": 4.411855990068281e-05, + "loss": 4.8451, + "step": 2843 + }, + { + "epoch": 0.13241148124869054, + "grad_norm": 0.8601932547480508, + "learning_rate": 4.41340782122905e-05, + "loss": 4.5131, + "step": 2844 + }, + { + "epoch": 0.13245803943478363, + "grad_norm": 0.7549044539728804, + "learning_rate": 4.41495965238982e-05, + "loss": 4.7549, + "step": 2845 + }, + { + "epoch": 0.13250459762087669, + "grad_norm": 0.8788415448191074, + "learning_rate": 4.41651148355059e-05, + "loss": 4.736, + "step": 2846 + }, + { + "epoch": 0.13255115580696977, + "grad_norm": 1.011142233864381, + "learning_rate": 4.4180633147113596e-05, + "loss": 4.7813, + "step": 2847 + }, + { + "epoch": 0.13259771399306283, + "grad_norm": 0.9796609081567509, + "learning_rate": 4.419615145872129e-05, + "loss": 4.7957, + "step": 2848 + }, + { + "epoch": 0.1326442721791559, + "grad_norm": 1.1543418462263706, + "learning_rate": 4.421166977032899e-05, + "loss": 4.7193, + "step": 2849 + }, + { + "epoch": 0.13269083036524898, + "grad_norm": 0.8802392448290423, + "learning_rate": 4.4227188081936685e-05, + "loss": 4.7964, + "step": 2850 + }, + { + "epoch": 0.13273738855134204, + "grad_norm": 0.8221125741047929, + "learning_rate": 4.424270639354439e-05, + "loss": 4.8408, + "step": 2851 + }, + { + "epoch": 0.1327839467374351, + "grad_norm": 0.7970586002883483, + "learning_rate": 4.425822470515208e-05, + "loss": 4.8313, + "step": 2852 + }, + { + "epoch": 0.1328305049235282, + "grad_norm": 0.9215236746307134, + "learning_rate": 4.427374301675978e-05, + "loss": 4.717, + "step": 2853 + }, + { + "epoch": 0.13287706310962125, + "grad_norm": 0.8640756864395863, + "learning_rate": 4.428926132836748e-05, + "loss": 4.7882, + "step": 2854 + }, + { + "epoch": 0.1329236212957143, + "grad_norm": 0.8046683354179273, + "learning_rate": 4.430477963997517e-05, + "loss": 4.7114, + "step": 2855 + }, + { + "epoch": 0.1329701794818074, + "grad_norm": 0.995008784132999, + "learning_rate": 4.432029795158287e-05, + "loss": 4.6636, + "step": 2856 + }, + { + "epoch": 0.13301673766790045, + "grad_norm": 0.887706071285602, + "learning_rate": 4.4335816263190566e-05, + "loss": 4.7887, + "step": 2857 + }, + { + "epoch": 0.13306329585399354, + "grad_norm": 0.6686499424699309, + "learning_rate": 4.435133457479826e-05, + "loss": 4.6926, + "step": 2858 + }, + { + "epoch": 0.1331098540400866, + "grad_norm": 0.7817504297551169, + "learning_rate": 4.4366852886405965e-05, + "loss": 4.7465, + "step": 2859 + }, + { + "epoch": 0.13315641222617966, + "grad_norm": 0.8983130670639736, + "learning_rate": 4.4382371198013655e-05, + "loss": 4.6907, + "step": 2860 + }, + { + "epoch": 0.13320297041227275, + "grad_norm": 0.8009560945969512, + "learning_rate": 4.439788950962136e-05, + "loss": 4.7676, + "step": 2861 + }, + { + "epoch": 0.1332495285983658, + "grad_norm": 0.7224243327673424, + "learning_rate": 
4.4413407821229054e-05, + "loss": 4.7449, + "step": 2862 + }, + { + "epoch": 0.13329608678445887, + "grad_norm": 0.698832831577228, + "learning_rate": 4.442892613283675e-05, + "loss": 4.8878, + "step": 2863 + }, + { + "epoch": 0.13334264497055195, + "grad_norm": 0.6937951422453612, + "learning_rate": 4.4444444444444447e-05, + "loss": 4.7752, + "step": 2864 + }, + { + "epoch": 0.13338920315664501, + "grad_norm": 0.757071883766874, + "learning_rate": 4.445996275605214e-05, + "loss": 4.7171, + "step": 2865 + }, + { + "epoch": 0.13343576134273807, + "grad_norm": 0.740629096447978, + "learning_rate": 4.447548106765984e-05, + "loss": 4.7241, + "step": 2866 + }, + { + "epoch": 0.13348231952883116, + "grad_norm": 0.8462530578351591, + "learning_rate": 4.449099937926754e-05, + "loss": 4.8116, + "step": 2867 + }, + { + "epoch": 0.13352887771492422, + "grad_norm": 0.9967681561464817, + "learning_rate": 4.450651769087523e-05, + "loss": 4.753, + "step": 2868 + }, + { + "epoch": 0.1335754359010173, + "grad_norm": 1.0431010164598054, + "learning_rate": 4.452203600248293e-05, + "loss": 4.6367, + "step": 2869 + }, + { + "epoch": 0.13362199408711037, + "grad_norm": 1.1051112310249303, + "learning_rate": 4.453755431409063e-05, + "loss": 4.7071, + "step": 2870 + }, + { + "epoch": 0.13366855227320343, + "grad_norm": 0.7936270937689437, + "learning_rate": 4.455307262569833e-05, + "loss": 4.6343, + "step": 2871 + }, + { + "epoch": 0.13371511045929652, + "grad_norm": 0.8093462312992602, + "learning_rate": 4.4568590937306024e-05, + "loss": 4.6526, + "step": 2872 + }, + { + "epoch": 0.13376166864538958, + "grad_norm": 0.9311844188502635, + "learning_rate": 4.458410924891372e-05, + "loss": 4.7958, + "step": 2873 + }, + { + "epoch": 0.13380822683148264, + "grad_norm": 0.8098666382551136, + "learning_rate": 4.4599627560521416e-05, + "loss": 4.7885, + "step": 2874 + }, + { + "epoch": 0.13385478501757572, + "grad_norm": 0.9247778834801808, + "learning_rate": 4.461514587212911e-05, + "loss": 4.6693, + "step": 2875 + }, + { + "epoch": 0.13390134320366878, + "grad_norm": 1.068405441121235, + "learning_rate": 4.463066418373681e-05, + "loss": 4.8088, + "step": 2876 + }, + { + "epoch": 0.13394790138976184, + "grad_norm": 0.835626674345068, + "learning_rate": 4.4646182495344505e-05, + "loss": 4.6079, + "step": 2877 + }, + { + "epoch": 0.13399445957585493, + "grad_norm": 0.8518327359646249, + "learning_rate": 4.466170080695221e-05, + "loss": 4.6425, + "step": 2878 + }, + { + "epoch": 0.134041017761948, + "grad_norm": 0.8162437745387758, + "learning_rate": 4.4677219118559905e-05, + "loss": 4.7084, + "step": 2879 + }, + { + "epoch": 0.13408757594804108, + "grad_norm": 0.7045356265659583, + "learning_rate": 4.4692737430167594e-05, + "loss": 4.7723, + "step": 2880 + }, + { + "epoch": 0.13413413413413414, + "grad_norm": 0.6427628631867179, + "learning_rate": 4.47082557417753e-05, + "loss": 4.8421, + "step": 2881 + }, + { + "epoch": 0.1341806923202272, + "grad_norm": 0.7343799012670884, + "learning_rate": 4.4723774053382994e-05, + "loss": 4.6613, + "step": 2882 + }, + { + "epoch": 0.13422725050632028, + "grad_norm": 0.7287143365851236, + "learning_rate": 4.473929236499069e-05, + "loss": 4.6451, + "step": 2883 + }, + { + "epoch": 0.13427380869241334, + "grad_norm": 0.8593309384526868, + "learning_rate": 4.475481067659839e-05, + "loss": 4.6012, + "step": 2884 + }, + { + "epoch": 0.1343203668785064, + "grad_norm": 0.8318551590526598, + "learning_rate": 4.477032898820608e-05, + "loss": 4.7196, + "step": 2885 + }, + { + "epoch": 
0.1343669250645995, + "grad_norm": 0.8028037255261984, + "learning_rate": 4.4785847299813786e-05, + "loss": 4.9976, + "step": 2886 + }, + { + "epoch": 0.13441348325069255, + "grad_norm": 0.7966731761203639, + "learning_rate": 4.480136561142148e-05, + "loss": 4.7103, + "step": 2887 + }, + { + "epoch": 0.1344600414367856, + "grad_norm": 0.7764403151773119, + "learning_rate": 4.481688392302918e-05, + "loss": 4.7395, + "step": 2888 + }, + { + "epoch": 0.1345065996228787, + "grad_norm": 0.9231861916967701, + "learning_rate": 4.4832402234636875e-05, + "loss": 4.7224, + "step": 2889 + }, + { + "epoch": 0.13455315780897176, + "grad_norm": 1.2314718330449808, + "learning_rate": 4.484792054624457e-05, + "loss": 4.6989, + "step": 2890 + }, + { + "epoch": 0.13459971599506484, + "grad_norm": 0.9133936626344921, + "learning_rate": 4.486343885785227e-05, + "loss": 4.7829, + "step": 2891 + }, + { + "epoch": 0.1346462741811579, + "grad_norm": 0.9522498408354418, + "learning_rate": 4.487895716945997e-05, + "loss": 4.6897, + "step": 2892 + }, + { + "epoch": 0.13469283236725096, + "grad_norm": 0.8376242873263574, + "learning_rate": 4.489447548106766e-05, + "loss": 4.7756, + "step": 2893 + }, + { + "epoch": 0.13473939055334405, + "grad_norm": 0.7165819515950397, + "learning_rate": 4.4909993792675356e-05, + "loss": 4.6735, + "step": 2894 + }, + { + "epoch": 0.1347859487394371, + "grad_norm": 0.8468671968228486, + "learning_rate": 4.492551210428306e-05, + "loss": 4.6084, + "step": 2895 + }, + { + "epoch": 0.13483250692553017, + "grad_norm": 0.8714991063213929, + "learning_rate": 4.4941030415890756e-05, + "loss": 4.7936, + "step": 2896 + }, + { + "epoch": 0.13487906511162326, + "grad_norm": 0.8455342151779109, + "learning_rate": 4.495654872749845e-05, + "loss": 4.5518, + "step": 2897 + }, + { + "epoch": 0.13492562329771632, + "grad_norm": 0.679329839284384, + "learning_rate": 4.497206703910615e-05, + "loss": 4.7371, + "step": 2898 + }, + { + "epoch": 0.13497218148380938, + "grad_norm": 0.9158281724530927, + "learning_rate": 4.4987585350713844e-05, + "loss": 4.5827, + "step": 2899 + }, + { + "epoch": 0.13501873966990247, + "grad_norm": 0.8537550966433587, + "learning_rate": 4.500310366232154e-05, + "loss": 4.8381, + "step": 2900 + }, + { + "epoch": 0.13506529785599553, + "grad_norm": 0.8782403273869109, + "learning_rate": 4.501862197392924e-05, + "loss": 4.6485, + "step": 2901 + }, + { + "epoch": 0.1351118560420886, + "grad_norm": 0.7982227674887057, + "learning_rate": 4.503414028553693e-05, + "loss": 4.6911, + "step": 2902 + }, + { + "epoch": 0.13515841422818167, + "grad_norm": 0.7991742358455624, + "learning_rate": 4.5049658597144636e-05, + "loss": 4.6978, + "step": 2903 + }, + { + "epoch": 0.13520497241427473, + "grad_norm": 0.791271698511254, + "learning_rate": 4.506517690875233e-05, + "loss": 4.6732, + "step": 2904 + }, + { + "epoch": 0.13525153060036782, + "grad_norm": 0.722768669389798, + "learning_rate": 4.508069522036002e-05, + "loss": 4.7684, + "step": 2905 + }, + { + "epoch": 0.13529808878646088, + "grad_norm": 0.8126923065770841, + "learning_rate": 4.5096213531967725e-05, + "loss": 4.8339, + "step": 2906 + }, + { + "epoch": 0.13534464697255394, + "grad_norm": 0.9058107660373527, + "learning_rate": 4.511173184357542e-05, + "loss": 4.6924, + "step": 2907 + }, + { + "epoch": 0.13539120515864703, + "grad_norm": 1.0029422749238606, + "learning_rate": 4.512725015518312e-05, + "loss": 4.8247, + "step": 2908 + }, + { + "epoch": 0.1354377633447401, + "grad_norm": 0.8088002896571784, + "learning_rate": 
4.5142768466790814e-05, + "loss": 4.8592, + "step": 2909 + }, + { + "epoch": 0.13548432153083315, + "grad_norm": 0.7093950242455492, + "learning_rate": 4.515828677839851e-05, + "loss": 4.7186, + "step": 2910 + }, + { + "epoch": 0.13553087971692623, + "grad_norm": 0.7941948232414844, + "learning_rate": 4.5173805090006214e-05, + "loss": 4.7961, + "step": 2911 + }, + { + "epoch": 0.1355774379030193, + "grad_norm": 0.861602217191964, + "learning_rate": 4.518932340161391e-05, + "loss": 4.6739, + "step": 2912 + }, + { + "epoch": 0.13562399608911238, + "grad_norm": 0.8385929159116741, + "learning_rate": 4.52048417132216e-05, + "loss": 4.6888, + "step": 2913 + }, + { + "epoch": 0.13567055427520544, + "grad_norm": 0.9397092033648576, + "learning_rate": 4.52203600248293e-05, + "loss": 4.6565, + "step": 2914 + }, + { + "epoch": 0.1357171124612985, + "grad_norm": 1.2424667719057922, + "learning_rate": 4.5235878336437e-05, + "loss": 4.7014, + "step": 2915 + }, + { + "epoch": 0.1357636706473916, + "grad_norm": 0.7941636573590309, + "learning_rate": 4.5251396648044695e-05, + "loss": 4.6762, + "step": 2916 + }, + { + "epoch": 0.13581022883348465, + "grad_norm": 1.0619229641627213, + "learning_rate": 4.526691495965239e-05, + "loss": 4.6834, + "step": 2917 + }, + { + "epoch": 0.1358567870195777, + "grad_norm": 0.9893810862486426, + "learning_rate": 4.528243327126009e-05, + "loss": 4.7316, + "step": 2918 + }, + { + "epoch": 0.1359033452056708, + "grad_norm": 1.1303850879728607, + "learning_rate": 4.5297951582867784e-05, + "loss": 4.7831, + "step": 2919 + }, + { + "epoch": 0.13594990339176385, + "grad_norm": 0.8659426504892924, + "learning_rate": 4.531346989447549e-05, + "loss": 4.6653, + "step": 2920 + }, + { + "epoch": 0.13599646157785691, + "grad_norm": 0.8620220600375536, + "learning_rate": 4.532898820608318e-05, + "loss": 4.6247, + "step": 2921 + }, + { + "epoch": 0.13604301976395, + "grad_norm": 0.9748432021973975, + "learning_rate": 4.534450651769088e-05, + "loss": 4.7666, + "step": 2922 + }, + { + "epoch": 0.13608957795004306, + "grad_norm": 1.0506086891391229, + "learning_rate": 4.5360024829298576e-05, + "loss": 4.7556, + "step": 2923 + }, + { + "epoch": 0.13613613613613615, + "grad_norm": 0.9830052616028753, + "learning_rate": 4.537554314090627e-05, + "loss": 4.786, + "step": 2924 + }, + { + "epoch": 0.1361826943222292, + "grad_norm": 0.969262934005879, + "learning_rate": 4.539106145251397e-05, + "loss": 4.6599, + "step": 2925 + }, + { + "epoch": 0.13622925250832227, + "grad_norm": 0.8572420337658799, + "learning_rate": 4.5406579764121665e-05, + "loss": 4.7672, + "step": 2926 + }, + { + "epoch": 0.13627581069441536, + "grad_norm": 0.91958856554729, + "learning_rate": 4.542209807572936e-05, + "loss": 4.761, + "step": 2927 + }, + { + "epoch": 0.13632236888050842, + "grad_norm": 0.886628355000173, + "learning_rate": 4.5437616387337064e-05, + "loss": 4.7498, + "step": 2928 + }, + { + "epoch": 0.13636892706660148, + "grad_norm": 1.0004808400683824, + "learning_rate": 4.5453134698944754e-05, + "loss": 4.7534, + "step": 2929 + }, + { + "epoch": 0.13641548525269456, + "grad_norm": 1.1268989147288013, + "learning_rate": 4.546865301055245e-05, + "loss": 4.6896, + "step": 2930 + }, + { + "epoch": 0.13646204343878762, + "grad_norm": 0.841875023138089, + "learning_rate": 4.548417132216015e-05, + "loss": 4.6821, + "step": 2931 + }, + { + "epoch": 0.13650860162488068, + "grad_norm": 0.89129014375875, + "learning_rate": 4.549968963376785e-05, + "loss": 4.6275, + "step": 2932 + }, + { + "epoch": 0.13655515981097377, 
+ "grad_norm": 0.841872842837764, + "learning_rate": 4.5515207945375546e-05, + "loss": 4.614, + "step": 2933 + }, + { + "epoch": 0.13660171799706683, + "grad_norm": 0.7248148595654752, + "learning_rate": 4.553072625698324e-05, + "loss": 4.7721, + "step": 2934 + }, + { + "epoch": 0.13664827618315992, + "grad_norm": 0.9946474831723396, + "learning_rate": 4.554624456859094e-05, + "loss": 4.6856, + "step": 2935 + }, + { + "epoch": 0.13669483436925298, + "grad_norm": 1.0720080416489473, + "learning_rate": 4.556176288019864e-05, + "loss": 4.8816, + "step": 2936 + }, + { + "epoch": 0.13674139255534604, + "grad_norm": 1.1247148162164053, + "learning_rate": 4.557728119180633e-05, + "loss": 4.7213, + "step": 2937 + }, + { + "epoch": 0.13678795074143912, + "grad_norm": 1.0562923745305586, + "learning_rate": 4.559279950341403e-05, + "loss": 4.695, + "step": 2938 + }, + { + "epoch": 0.13683450892753218, + "grad_norm": 1.090953648695689, + "learning_rate": 4.560831781502173e-05, + "loss": 4.7016, + "step": 2939 + }, + { + "epoch": 0.13688106711362524, + "grad_norm": 0.804890527522558, + "learning_rate": 4.562383612662943e-05, + "loss": 4.6963, + "step": 2940 + }, + { + "epoch": 0.13692762529971833, + "grad_norm": 0.9283358177619735, + "learning_rate": 4.5639354438237116e-05, + "loss": 4.698, + "step": 2941 + }, + { + "epoch": 0.1369741834858114, + "grad_norm": 1.124770955129977, + "learning_rate": 4.565487274984482e-05, + "loss": 4.653, + "step": 2942 + }, + { + "epoch": 0.13702074167190445, + "grad_norm": 0.9098898125816588, + "learning_rate": 4.5670391061452516e-05, + "loss": 4.7936, + "step": 2943 + }, + { + "epoch": 0.13706729985799754, + "grad_norm": 0.9100715029838139, + "learning_rate": 4.568590937306021e-05, + "loss": 4.6427, + "step": 2944 + }, + { + "epoch": 0.1371138580440906, + "grad_norm": 0.9659028799336398, + "learning_rate": 4.570142768466791e-05, + "loss": 4.6408, + "step": 2945 + }, + { + "epoch": 0.13716041623018368, + "grad_norm": 0.9564549854601703, + "learning_rate": 4.5716945996275605e-05, + "loss": 4.661, + "step": 2946 + }, + { + "epoch": 0.13720697441627674, + "grad_norm": 0.9363784622051734, + "learning_rate": 4.573246430788331e-05, + "loss": 4.6528, + "step": 2947 + }, + { + "epoch": 0.1372535326023698, + "grad_norm": 0.9393612928836873, + "learning_rate": 4.5747982619491004e-05, + "loss": 4.6018, + "step": 2948 + }, + { + "epoch": 0.1373000907884629, + "grad_norm": 0.7199631395173594, + "learning_rate": 4.5763500931098694e-05, + "loss": 4.5451, + "step": 2949 + }, + { + "epoch": 0.13734664897455595, + "grad_norm": 0.8497106533932791, + "learning_rate": 4.57790192427064e-05, + "loss": 4.6411, + "step": 2950 + }, + { + "epoch": 0.137393207160649, + "grad_norm": 0.8650324951435906, + "learning_rate": 4.579453755431409e-05, + "loss": 4.7855, + "step": 2951 + }, + { + "epoch": 0.1374397653467421, + "grad_norm": 0.8625234629559998, + "learning_rate": 4.581005586592179e-05, + "loss": 4.5952, + "step": 2952 + }, + { + "epoch": 0.13748632353283516, + "grad_norm": 0.8508534332862681, + "learning_rate": 4.5825574177529486e-05, + "loss": 4.6516, + "step": 2953 + }, + { + "epoch": 0.13753288171892822, + "grad_norm": 0.7475768382863646, + "learning_rate": 4.584109248913718e-05, + "loss": 4.6021, + "step": 2954 + }, + { + "epoch": 0.1375794399050213, + "grad_norm": 0.8330220019883341, + "learning_rate": 4.585661080074488e-05, + "loss": 4.5394, + "step": 2955 + }, + { + "epoch": 0.13762599809111437, + "grad_norm": 0.9253089303351674, + "learning_rate": 4.587212911235258e-05, + "loss": 
4.6319, + "step": 2956 + }, + { + "epoch": 0.13767255627720745, + "grad_norm": 0.9530025354737773, + "learning_rate": 4.588764742396027e-05, + "loss": 4.5898, + "step": 2957 + }, + { + "epoch": 0.1377191144633005, + "grad_norm": 0.8931355574394508, + "learning_rate": 4.5903165735567974e-05, + "loss": 4.5521, + "step": 2958 + }, + { + "epoch": 0.13776567264939357, + "grad_norm": 0.9099244519158985, + "learning_rate": 4.591868404717567e-05, + "loss": 4.6703, + "step": 2959 + }, + { + "epoch": 0.13781223083548666, + "grad_norm": 0.9172714112636378, + "learning_rate": 4.5934202358783367e-05, + "loss": 4.7488, + "step": 2960 + }, + { + "epoch": 0.13785878902157972, + "grad_norm": 0.9906820424545628, + "learning_rate": 4.594972067039106e-05, + "loss": 4.7074, + "step": 2961 + }, + { + "epoch": 0.13790534720767278, + "grad_norm": 0.9213743737792572, + "learning_rate": 4.596523898199876e-05, + "loss": 4.6855, + "step": 2962 + }, + { + "epoch": 0.13795190539376587, + "grad_norm": 0.7977907126277411, + "learning_rate": 4.5980757293606456e-05, + "loss": 4.7469, + "step": 2963 + }, + { + "epoch": 0.13799846357985893, + "grad_norm": 0.8601729705299166, + "learning_rate": 4.599627560521416e-05, + "loss": 4.5512, + "step": 2964 + }, + { + "epoch": 0.138045021765952, + "grad_norm": 1.1673169080204164, + "learning_rate": 4.601179391682185e-05, + "loss": 4.6667, + "step": 2965 + }, + { + "epoch": 0.13809157995204507, + "grad_norm": 0.7943283450834245, + "learning_rate": 4.6027312228429544e-05, + "loss": 4.6125, + "step": 2966 + }, + { + "epoch": 0.13813813813813813, + "grad_norm": 0.7093914048675873, + "learning_rate": 4.604283054003725e-05, + "loss": 4.6961, + "step": 2967 + }, + { + "epoch": 0.13818469632423122, + "grad_norm": 0.9115114320960603, + "learning_rate": 4.6058348851644944e-05, + "loss": 4.8095, + "step": 2968 + }, + { + "epoch": 0.13823125451032428, + "grad_norm": 0.7006797516099542, + "learning_rate": 4.607386716325264e-05, + "loss": 4.6019, + "step": 2969 + }, + { + "epoch": 0.13827781269641734, + "grad_norm": 0.7642793058917233, + "learning_rate": 4.6089385474860336e-05, + "loss": 4.6478, + "step": 2970 + }, + { + "epoch": 0.13832437088251043, + "grad_norm": 0.6544834405218439, + "learning_rate": 4.610490378646803e-05, + "loss": 4.5582, + "step": 2971 + }, + { + "epoch": 0.1383709290686035, + "grad_norm": 0.6858186377234088, + "learning_rate": 4.6120422098075736e-05, + "loss": 4.5986, + "step": 2972 + }, + { + "epoch": 0.13841748725469655, + "grad_norm": 0.7577624269554022, + "learning_rate": 4.613594040968343e-05, + "loss": 4.6845, + "step": 2973 + }, + { + "epoch": 0.13846404544078963, + "grad_norm": 0.7824133524204875, + "learning_rate": 4.615145872129112e-05, + "loss": 4.861, + "step": 2974 + }, + { + "epoch": 0.1385106036268827, + "grad_norm": 0.8939834101565479, + "learning_rate": 4.6166977032898825e-05, + "loss": 4.643, + "step": 2975 + }, + { + "epoch": 0.13855716181297575, + "grad_norm": 0.8637915501933454, + "learning_rate": 4.618249534450652e-05, + "loss": 4.6836, + "step": 2976 + }, + { + "epoch": 0.13860371999906884, + "grad_norm": 0.7788294197646378, + "learning_rate": 4.619801365611422e-05, + "loss": 4.7385, + "step": 2977 + }, + { + "epoch": 0.1386502781851619, + "grad_norm": 0.834266377969605, + "learning_rate": 4.6213531967721914e-05, + "loss": 4.6235, + "step": 2978 + }, + { + "epoch": 0.138696836371255, + "grad_norm": 0.906382630743968, + "learning_rate": 4.622905027932961e-05, + "loss": 4.6891, + "step": 2979 + }, + { + "epoch": 0.13874339455734805, + "grad_norm": 
0.8438897034902391, + "learning_rate": 4.6244568590937306e-05, + "loss": 4.6026, + "step": 2980 + }, + { + "epoch": 0.1387899527434411, + "grad_norm": 0.9265339712093013, + "learning_rate": 4.626008690254501e-05, + "loss": 4.6466, + "step": 2981 + }, + { + "epoch": 0.1388365109295342, + "grad_norm": 0.7784077224609636, + "learning_rate": 4.62756052141527e-05, + "loss": 4.6386, + "step": 2982 + }, + { + "epoch": 0.13888306911562726, + "grad_norm": 0.8821233768701524, + "learning_rate": 4.62911235257604e-05, + "loss": 4.5489, + "step": 2983 + }, + { + "epoch": 0.13892962730172032, + "grad_norm": 0.8111658679680919, + "learning_rate": 4.63066418373681e-05, + "loss": 4.6424, + "step": 2984 + }, + { + "epoch": 0.1389761854878134, + "grad_norm": 0.8516947404714962, + "learning_rate": 4.6322160148975795e-05, + "loss": 4.5723, + "step": 2985 + }, + { + "epoch": 0.13902274367390646, + "grad_norm": 0.656838605712258, + "learning_rate": 4.633767846058349e-05, + "loss": 4.5628, + "step": 2986 + }, + { + "epoch": 0.13906930185999952, + "grad_norm": 0.809387413354395, + "learning_rate": 4.635319677219119e-05, + "loss": 4.7176, + "step": 2987 + }, + { + "epoch": 0.1391158600460926, + "grad_norm": 0.8118621139059874, + "learning_rate": 4.6368715083798884e-05, + "loss": 4.5812, + "step": 2988 + }, + { + "epoch": 0.13916241823218567, + "grad_norm": 0.7211842479645318, + "learning_rate": 4.638423339540659e-05, + "loss": 4.674, + "step": 2989 + }, + { + "epoch": 0.13920897641827876, + "grad_norm": 0.7935125288935184, + "learning_rate": 4.6399751707014276e-05, + "loss": 4.5628, + "step": 2990 + }, + { + "epoch": 0.13925553460437182, + "grad_norm": 0.7271456333100405, + "learning_rate": 4.641527001862197e-05, + "loss": 4.6319, + "step": 2991 + }, + { + "epoch": 0.13930209279046488, + "grad_norm": 0.7017373042301983, + "learning_rate": 4.6430788330229676e-05, + "loss": 4.6019, + "step": 2992 + }, + { + "epoch": 0.13934865097655796, + "grad_norm": 0.8235033957002503, + "learning_rate": 4.644630664183737e-05, + "loss": 4.5875, + "step": 2993 + }, + { + "epoch": 0.13939520916265102, + "grad_norm": 0.9869471022205581, + "learning_rate": 4.646182495344507e-05, + "loss": 4.7331, + "step": 2994 + }, + { + "epoch": 0.13944176734874408, + "grad_norm": 0.7906233473836384, + "learning_rate": 4.6477343265052764e-05, + "loss": 4.7152, + "step": 2995 + }, + { + "epoch": 0.13948832553483717, + "grad_norm": 0.8023613866877859, + "learning_rate": 4.649286157666046e-05, + "loss": 4.6643, + "step": 2996 + }, + { + "epoch": 0.13953488372093023, + "grad_norm": 0.7838985161210285, + "learning_rate": 4.6508379888268164e-05, + "loss": 4.5031, + "step": 2997 + }, + { + "epoch": 0.1395814419070233, + "grad_norm": 0.806978438008237, + "learning_rate": 4.6523898199875853e-05, + "loss": 4.6463, + "step": 2998 + }, + { + "epoch": 0.13962800009311638, + "grad_norm": 0.8834785492240935, + "learning_rate": 4.653941651148355e-05, + "loss": 4.6872, + "step": 2999 + }, + { + "epoch": 0.13967455827920944, + "grad_norm": 0.7466922312645999, + "learning_rate": 4.655493482309125e-05, + "loss": 4.529, + "step": 3000 + }, + { + "epoch": 0.13972111646530253, + "grad_norm": 0.8789037165397409, + "learning_rate": 4.657045313469895e-05, + "loss": 4.5934, + "step": 3001 + }, + { + "epoch": 0.13976767465139558, + "grad_norm": 0.8604862311470408, + "learning_rate": 4.6585971446306645e-05, + "loss": 4.6536, + "step": 3002 + }, + { + "epoch": 0.13981423283748864, + "grad_norm": 0.795795627656858, + "learning_rate": 4.660148975791434e-05, + "loss": 4.5038, + 
"step": 3003 + }, + { + "epoch": 0.13986079102358173, + "grad_norm": 0.7480681711592075, + "learning_rate": 4.661700806952204e-05, + "loss": 4.6901, + "step": 3004 + }, + { + "epoch": 0.1399073492096748, + "grad_norm": 1.0213591258854957, + "learning_rate": 4.6632526381129734e-05, + "loss": 4.6728, + "step": 3005 + }, + { + "epoch": 0.13995390739576785, + "grad_norm": 0.8433590928279346, + "learning_rate": 4.664804469273743e-05, + "loss": 4.6318, + "step": 3006 + }, + { + "epoch": 0.14000046558186094, + "grad_norm": 0.933820069625719, + "learning_rate": 4.666356300434513e-05, + "loss": 4.6476, + "step": 3007 + }, + { + "epoch": 0.140047023767954, + "grad_norm": 0.9880458821537522, + "learning_rate": 4.667908131595283e-05, + "loss": 4.5474, + "step": 3008 + }, + { + "epoch": 0.14009358195404706, + "grad_norm": 0.934329830042534, + "learning_rate": 4.6694599627560526e-05, + "loss": 4.6354, + "step": 3009 + }, + { + "epoch": 0.14014014014014015, + "grad_norm": 1.0419552945612922, + "learning_rate": 4.6710117939168216e-05, + "loss": 4.689, + "step": 3010 + }, + { + "epoch": 0.1401866983262332, + "grad_norm": 1.089537524617516, + "learning_rate": 4.672563625077592e-05, + "loss": 4.6315, + "step": 3011 + }, + { + "epoch": 0.1402332565123263, + "grad_norm": 0.8173132455965076, + "learning_rate": 4.6741154562383615e-05, + "loss": 4.6743, + "step": 3012 + }, + { + "epoch": 0.14027981469841935, + "grad_norm": 1.031020852802846, + "learning_rate": 4.675667287399131e-05, + "loss": 4.6499, + "step": 3013 + }, + { + "epoch": 0.1403263728845124, + "grad_norm": 1.0454540743263214, + "learning_rate": 4.677219118559901e-05, + "loss": 4.6233, + "step": 3014 + }, + { + "epoch": 0.1403729310706055, + "grad_norm": 0.9686342158955047, + "learning_rate": 4.6787709497206704e-05, + "loss": 4.7618, + "step": 3015 + }, + { + "epoch": 0.14041948925669856, + "grad_norm": 1.2067287390556103, + "learning_rate": 4.68032278088144e-05, + "loss": 4.7397, + "step": 3016 + }, + { + "epoch": 0.14046604744279162, + "grad_norm": 0.796217378226144, + "learning_rate": 4.6818746120422104e-05, + "loss": 4.6306, + "step": 3017 + }, + { + "epoch": 0.1405126056288847, + "grad_norm": 0.8895733101358692, + "learning_rate": 4.683426443202979e-05, + "loss": 4.5342, + "step": 3018 + }, + { + "epoch": 0.14055916381497777, + "grad_norm": 1.0642468734736046, + "learning_rate": 4.6849782743637496e-05, + "loss": 4.7062, + "step": 3019 + }, + { + "epoch": 0.14060572200107083, + "grad_norm": 0.9531776846862018, + "learning_rate": 4.686530105524519e-05, + "loss": 4.7767, + "step": 3020 + }, + { + "epoch": 0.14065228018716391, + "grad_norm": 0.8271110756869241, + "learning_rate": 4.688081936685289e-05, + "loss": 4.5561, + "step": 3021 + }, + { + "epoch": 0.14069883837325697, + "grad_norm": 0.7577321193422712, + "learning_rate": 4.6896337678460585e-05, + "loss": 4.6047, + "step": 3022 + }, + { + "epoch": 0.14074539655935006, + "grad_norm": 0.7897867184179274, + "learning_rate": 4.691185599006828e-05, + "loss": 4.5348, + "step": 3023 + }, + { + "epoch": 0.14079195474544312, + "grad_norm": 0.9713121438618593, + "learning_rate": 4.692737430167598e-05, + "loss": 4.6936, + "step": 3024 + }, + { + "epoch": 0.14083851293153618, + "grad_norm": 0.8206228902045697, + "learning_rate": 4.694289261328368e-05, + "loss": 4.6187, + "step": 3025 + }, + { + "epoch": 0.14088507111762927, + "grad_norm": 0.76733636074618, + "learning_rate": 4.695841092489137e-05, + "loss": 4.7035, + "step": 3026 + }, + { + "epoch": 0.14093162930372233, + "grad_norm": 0.9239859548781031, + 
"learning_rate": 4.6973929236499073e-05, + "loss": 4.7, + "step": 3027 + }, + { + "epoch": 0.1409781874898154, + "grad_norm": 0.8596338869116013, + "learning_rate": 4.698944754810677e-05, + "loss": 4.5737, + "step": 3028 + }, + { + "epoch": 0.14102474567590847, + "grad_norm": 0.7645015538686067, + "learning_rate": 4.7004965859714466e-05, + "loss": 4.6537, + "step": 3029 + }, + { + "epoch": 0.14107130386200153, + "grad_norm": 0.7814631170951005, + "learning_rate": 4.702048417132216e-05, + "loss": 4.6361, + "step": 3030 + }, + { + "epoch": 0.1411178620480946, + "grad_norm": 0.6842416167053909, + "learning_rate": 4.703600248292986e-05, + "loss": 4.4994, + "step": 3031 + }, + { + "epoch": 0.14116442023418768, + "grad_norm": 0.7530063191455926, + "learning_rate": 4.7051520794537555e-05, + "loss": 4.6307, + "step": 3032 + }, + { + "epoch": 0.14121097842028074, + "grad_norm": 0.6040384264964037, + "learning_rate": 4.706703910614526e-05, + "loss": 4.5698, + "step": 3033 + }, + { + "epoch": 0.14125753660637383, + "grad_norm": 0.7568565421223636, + "learning_rate": 4.708255741775295e-05, + "loss": 4.617, + "step": 3034 + }, + { + "epoch": 0.1413040947924669, + "grad_norm": 0.825430159296865, + "learning_rate": 4.7098075729360644e-05, + "loss": 4.7388, + "step": 3035 + }, + { + "epoch": 0.14135065297855995, + "grad_norm": 0.8194118285411734, + "learning_rate": 4.711359404096835e-05, + "loss": 4.5756, + "step": 3036 + }, + { + "epoch": 0.14139721116465304, + "grad_norm": 0.7752809623732472, + "learning_rate": 4.712911235257604e-05, + "loss": 4.6427, + "step": 3037 + }, + { + "epoch": 0.1414437693507461, + "grad_norm": 0.7911352432819385, + "learning_rate": 4.714463066418374e-05, + "loss": 4.6443, + "step": 3038 + }, + { + "epoch": 0.14149032753683916, + "grad_norm": 0.7317880133194505, + "learning_rate": 4.7160148975791436e-05, + "loss": 4.6282, + "step": 3039 + }, + { + "epoch": 0.14153688572293224, + "grad_norm": 0.7770878092594157, + "learning_rate": 4.717566728739913e-05, + "loss": 4.6411, + "step": 3040 + }, + { + "epoch": 0.1415834439090253, + "grad_norm": 0.7035358899102993, + "learning_rate": 4.7191185599006835e-05, + "loss": 4.6172, + "step": 3041 + }, + { + "epoch": 0.14163000209511836, + "grad_norm": 0.6126391147737281, + "learning_rate": 4.7206703910614525e-05, + "loss": 4.6457, + "step": 3042 + }, + { + "epoch": 0.14167656028121145, + "grad_norm": 0.6108334350538851, + "learning_rate": 4.722222222222222e-05, + "loss": 4.6068, + "step": 3043 + }, + { + "epoch": 0.1417231184673045, + "grad_norm": 0.6980585200977681, + "learning_rate": 4.7237740533829924e-05, + "loss": 4.6009, + "step": 3044 + }, + { + "epoch": 0.1417696766533976, + "grad_norm": 0.6719970824635003, + "learning_rate": 4.725325884543762e-05, + "loss": 4.6692, + "step": 3045 + }, + { + "epoch": 0.14181623483949066, + "grad_norm": 0.725565904831366, + "learning_rate": 4.726877715704531e-05, + "loss": 4.5914, + "step": 3046 + }, + { + "epoch": 0.14186279302558372, + "grad_norm": 0.7596925581547046, + "learning_rate": 4.728429546865301e-05, + "loss": 4.6773, + "step": 3047 + }, + { + "epoch": 0.1419093512116768, + "grad_norm": 0.8416992272323015, + "learning_rate": 4.729981378026071e-05, + "loss": 4.6214, + "step": 3048 + }, + { + "epoch": 0.14195590939776986, + "grad_norm": 0.9751440977369692, + "learning_rate": 4.7315332091868406e-05, + "loss": 4.6044, + "step": 3049 + }, + { + "epoch": 0.14200246758386292, + "grad_norm": 0.9733352431271597, + "learning_rate": 4.73308504034761e-05, + "loss": 4.6379, + "step": 3050 + }, + { + 
"epoch": 0.142049025769956, + "grad_norm": 0.8943469940874805, + "learning_rate": 4.73463687150838e-05, + "loss": 4.5863, + "step": 3051 + }, + { + "epoch": 0.14209558395604907, + "grad_norm": 0.9108969203653932, + "learning_rate": 4.73618870266915e-05, + "loss": 4.5363, + "step": 3052 + }, + { + "epoch": 0.14214214214214213, + "grad_norm": 0.904312199669178, + "learning_rate": 4.73774053382992e-05, + "loss": 4.5875, + "step": 3053 + }, + { + "epoch": 0.14218870032823522, + "grad_norm": 0.79314818353502, + "learning_rate": 4.7392923649906894e-05, + "loss": 4.5569, + "step": 3054 + }, + { + "epoch": 0.14223525851432828, + "grad_norm": 0.8005163065511877, + "learning_rate": 4.740844196151459e-05, + "loss": 4.6805, + "step": 3055 + }, + { + "epoch": 0.14228181670042137, + "grad_norm": 1.1013954265579657, + "learning_rate": 4.742396027312229e-05, + "loss": 4.6054, + "step": 3056 + }, + { + "epoch": 0.14232837488651442, + "grad_norm": 1.0379921791725784, + "learning_rate": 4.743947858472998e-05, + "loss": 4.6045, + "step": 3057 + }, + { + "epoch": 0.14237493307260748, + "grad_norm": 0.9444705542702728, + "learning_rate": 4.7454996896337686e-05, + "loss": 4.4815, + "step": 3058 + }, + { + "epoch": 0.14242149125870057, + "grad_norm": 0.8968252605685941, + "learning_rate": 4.7470515207945376e-05, + "loss": 4.6118, + "step": 3059 + }, + { + "epoch": 0.14246804944479363, + "grad_norm": 0.8772576096277199, + "learning_rate": 4.748603351955307e-05, + "loss": 4.7504, + "step": 3060 + }, + { + "epoch": 0.1425146076308867, + "grad_norm": 0.8720833047319705, + "learning_rate": 4.7501551831160775e-05, + "loss": 4.6108, + "step": 3061 + }, + { + "epoch": 0.14256116581697978, + "grad_norm": 0.7690421316503232, + "learning_rate": 4.751707014276847e-05, + "loss": 4.5435, + "step": 3062 + }, + { + "epoch": 0.14260772400307284, + "grad_norm": 0.8684495716470578, + "learning_rate": 4.753258845437617e-05, + "loss": 4.525, + "step": 3063 + }, + { + "epoch": 0.1426542821891659, + "grad_norm": 0.7122907490952639, + "learning_rate": 4.7548106765983864e-05, + "loss": 4.6102, + "step": 3064 + }, + { + "epoch": 0.14270084037525899, + "grad_norm": 0.7426635247144961, + "learning_rate": 4.756362507759156e-05, + "loss": 4.5578, + "step": 3065 + }, + { + "epoch": 0.14274739856135205, + "grad_norm": 0.9132154670074915, + "learning_rate": 4.757914338919926e-05, + "loss": 4.6923, + "step": 3066 + }, + { + "epoch": 0.14279395674744513, + "grad_norm": 1.085843810625245, + "learning_rate": 4.759466170080695e-05, + "loss": 4.7445, + "step": 3067 + }, + { + "epoch": 0.1428405149335382, + "grad_norm": 0.8805930070046049, + "learning_rate": 4.761018001241465e-05, + "loss": 4.6437, + "step": 3068 + }, + { + "epoch": 0.14288707311963125, + "grad_norm": 0.8460000627560684, + "learning_rate": 4.762569832402235e-05, + "loss": 4.6828, + "step": 3069 + }, + { + "epoch": 0.14293363130572434, + "grad_norm": 0.905058321410648, + "learning_rate": 4.764121663563005e-05, + "loss": 4.7638, + "step": 3070 + }, + { + "epoch": 0.1429801894918174, + "grad_norm": 0.924374957566882, + "learning_rate": 4.765673494723774e-05, + "loss": 4.5248, + "step": 3071 + }, + { + "epoch": 0.14302674767791046, + "grad_norm": 1.0940822847608578, + "learning_rate": 4.767225325884544e-05, + "loss": 4.6153, + "step": 3072 + }, + { + "epoch": 0.14307330586400355, + "grad_norm": 0.8995379537499293, + "learning_rate": 4.768777157045314e-05, + "loss": 4.6104, + "step": 3073 + }, + { + "epoch": 0.1431198640500966, + "grad_norm": 0.89993076422944, + "learning_rate": 
4.7703289882060834e-05, + "loss": 4.6478, + "step": 3074 + }, + { + "epoch": 0.14316642223618967, + "grad_norm": 0.9596478558999095, + "learning_rate": 4.771880819366853e-05, + "loss": 4.6461, + "step": 3075 + }, + { + "epoch": 0.14321298042228275, + "grad_norm": 0.7308760123202159, + "learning_rate": 4.7734326505276226e-05, + "loss": 4.5462, + "step": 3076 + }, + { + "epoch": 0.1432595386083758, + "grad_norm": 0.8041406645816653, + "learning_rate": 4.774984481688393e-05, + "loss": 4.6389, + "step": 3077 + }, + { + "epoch": 0.1433060967944689, + "grad_norm": 0.8721419067930626, + "learning_rate": 4.7765363128491626e-05, + "loss": 4.6406, + "step": 3078 + }, + { + "epoch": 0.14335265498056196, + "grad_norm": 0.6731758917010017, + "learning_rate": 4.7780881440099315e-05, + "loss": 4.5244, + "step": 3079 + }, + { + "epoch": 0.14339921316665502, + "grad_norm": 0.797759440306949, + "learning_rate": 4.779639975170702e-05, + "loss": 4.4308, + "step": 3080 + }, + { + "epoch": 0.1434457713527481, + "grad_norm": 0.731934139560502, + "learning_rate": 4.7811918063314715e-05, + "loss": 4.6567, + "step": 3081 + }, + { + "epoch": 0.14349232953884117, + "grad_norm": 0.7482349952711456, + "learning_rate": 4.782743637492241e-05, + "loss": 4.5647, + "step": 3082 + }, + { + "epoch": 0.14353888772493423, + "grad_norm": 0.818746275230317, + "learning_rate": 4.784295468653011e-05, + "loss": 4.6229, + "step": 3083 + }, + { + "epoch": 0.14358544591102732, + "grad_norm": 0.7544092378817757, + "learning_rate": 4.7858472998137804e-05, + "loss": 4.6642, + "step": 3084 + }, + { + "epoch": 0.14363200409712037, + "grad_norm": 0.8453056913147869, + "learning_rate": 4.78739913097455e-05, + "loss": 4.5903, + "step": 3085 + }, + { + "epoch": 0.14367856228321343, + "grad_norm": 0.8563637403811668, + "learning_rate": 4.78895096213532e-05, + "loss": 4.6005, + "step": 3086 + }, + { + "epoch": 0.14372512046930652, + "grad_norm": 1.0177728838892495, + "learning_rate": 4.790502793296089e-05, + "loss": 4.6984, + "step": 3087 + }, + { + "epoch": 0.14377167865539958, + "grad_norm": 1.1135416842093366, + "learning_rate": 4.7920546244568596e-05, + "loss": 4.7083, + "step": 3088 + }, + { + "epoch": 0.14381823684149267, + "grad_norm": 0.8333952180996881, + "learning_rate": 4.793606455617629e-05, + "loss": 4.5915, + "step": 3089 + }, + { + "epoch": 0.14386479502758573, + "grad_norm": 0.7417814085210808, + "learning_rate": 4.795158286778399e-05, + "loss": 4.5397, + "step": 3090 + }, + { + "epoch": 0.1439113532136788, + "grad_norm": 0.9139567714418803, + "learning_rate": 4.7967101179391685e-05, + "loss": 4.4969, + "step": 3091 + }, + { + "epoch": 0.14395791139977188, + "grad_norm": 0.7578662547684741, + "learning_rate": 4.798261949099938e-05, + "loss": 4.6898, + "step": 3092 + }, + { + "epoch": 0.14400446958586494, + "grad_norm": 0.7710497064821166, + "learning_rate": 4.799813780260708e-05, + "loss": 4.6745, + "step": 3093 + }, + { + "epoch": 0.144051027771958, + "grad_norm": 0.8191580215721811, + "learning_rate": 4.801365611421478e-05, + "loss": 4.5776, + "step": 3094 + }, + { + "epoch": 0.14409758595805108, + "grad_norm": 0.7674103813937206, + "learning_rate": 4.802917442582247e-05, + "loss": 4.6424, + "step": 3095 + }, + { + "epoch": 0.14414414414414414, + "grad_norm": 0.7367418894793576, + "learning_rate": 4.8044692737430166e-05, + "loss": 4.5515, + "step": 3096 + }, + { + "epoch": 0.1441907023302372, + "grad_norm": 0.7352473420400877, + "learning_rate": 4.806021104903787e-05, + "loss": 4.7522, + "step": 3097 + }, + { + "epoch": 
0.1442372605163303, + "grad_norm": 0.838299929606431, + "learning_rate": 4.8075729360645565e-05, + "loss": 4.5791, + "step": 3098 + }, + { + "epoch": 0.14428381870242335, + "grad_norm": 0.9229157763662295, + "learning_rate": 4.809124767225326e-05, + "loss": 4.5656, + "step": 3099 + }, + { + "epoch": 0.1443303768885164, + "grad_norm": 0.9310733280460439, + "learning_rate": 4.810676598386096e-05, + "loss": 4.4512, + "step": 3100 + }, + { + "epoch": 0.1443769350746095, + "grad_norm": 0.8470834537890493, + "learning_rate": 4.8122284295468654e-05, + "loss": 4.5674, + "step": 3101 + }, + { + "epoch": 0.14442349326070256, + "grad_norm": 0.6868966149428061, + "learning_rate": 4.813780260707636e-05, + "loss": 4.6044, + "step": 3102 + }, + { + "epoch": 0.14447005144679564, + "grad_norm": 0.7752236048838506, + "learning_rate": 4.815332091868405e-05, + "loss": 4.7436, + "step": 3103 + }, + { + "epoch": 0.1445166096328887, + "grad_norm": 0.827641949881285, + "learning_rate": 4.816883923029174e-05, + "loss": 4.6329, + "step": 3104 + }, + { + "epoch": 0.14456316781898176, + "grad_norm": 0.7544014422816716, + "learning_rate": 4.8184357541899446e-05, + "loss": 4.5611, + "step": 3105 + }, + { + "epoch": 0.14460972600507485, + "grad_norm": 0.7548873632092574, + "learning_rate": 4.819987585350714e-05, + "loss": 4.5939, + "step": 3106 + }, + { + "epoch": 0.1446562841911679, + "grad_norm": 0.9660202161473213, + "learning_rate": 4.821539416511483e-05, + "loss": 4.6109, + "step": 3107 + }, + { + "epoch": 0.14470284237726097, + "grad_norm": 1.0035420333300185, + "learning_rate": 4.8230912476722535e-05, + "loss": 4.6207, + "step": 3108 + }, + { + "epoch": 0.14474940056335406, + "grad_norm": 1.0986745089001826, + "learning_rate": 4.824643078833023e-05, + "loss": 4.5332, + "step": 3109 + }, + { + "epoch": 0.14479595874944712, + "grad_norm": 0.8665749715851875, + "learning_rate": 4.826194909993793e-05, + "loss": 4.5168, + "step": 3110 + }, + { + "epoch": 0.14484251693554018, + "grad_norm": 0.7455707560019591, + "learning_rate": 4.8277467411545624e-05, + "loss": 4.4862, + "step": 3111 + }, + { + "epoch": 0.14488907512163327, + "grad_norm": 1.0152029333287922, + "learning_rate": 4.829298572315332e-05, + "loss": 4.5517, + "step": 3112 + }, + { + "epoch": 0.14493563330772632, + "grad_norm": 0.9608469901639033, + "learning_rate": 4.8308504034761024e-05, + "loss": 4.5755, + "step": 3113 + }, + { + "epoch": 0.1449821914938194, + "grad_norm": 1.0406184693032428, + "learning_rate": 4.832402234636872e-05, + "loss": 4.5031, + "step": 3114 + }, + { + "epoch": 0.14502874967991247, + "grad_norm": 0.92157769591176, + "learning_rate": 4.833954065797641e-05, + "loss": 4.455, + "step": 3115 + }, + { + "epoch": 0.14507530786600553, + "grad_norm": 0.738388933608529, + "learning_rate": 4.835505896958411e-05, + "loss": 4.5876, + "step": 3116 + }, + { + "epoch": 0.14512186605209862, + "grad_norm": 0.9725403531670839, + "learning_rate": 4.837057728119181e-05, + "loss": 4.5536, + "step": 3117 + }, + { + "epoch": 0.14516842423819168, + "grad_norm": 0.987985934236044, + "learning_rate": 4.8386095592799505e-05, + "loss": 4.5238, + "step": 3118 + }, + { + "epoch": 0.14521498242428474, + "grad_norm": 1.0714269135820196, + "learning_rate": 4.84016139044072e-05, + "loss": 4.6417, + "step": 3119 + }, + { + "epoch": 0.14526154061037783, + "grad_norm": 0.9174714596803623, + "learning_rate": 4.84171322160149e-05, + "loss": 4.5867, + "step": 3120 + }, + { + "epoch": 0.14530809879647089, + "grad_norm": 0.8586267335241586, + "learning_rate": 
4.8432650527622594e-05, + "loss": 4.6397, + "step": 3121 + }, + { + "epoch": 0.14535465698256395, + "grad_norm": 0.8485819984893939, + "learning_rate": 4.84481688392303e-05, + "loss": 4.567, + "step": 3122 + }, + { + "epoch": 0.14540121516865703, + "grad_norm": 0.6503964866955553, + "learning_rate": 4.846368715083799e-05, + "loss": 4.5499, + "step": 3123 + }, + { + "epoch": 0.1454477733547501, + "grad_norm": 0.8993139201333239, + "learning_rate": 4.847920546244569e-05, + "loss": 4.6009, + "step": 3124 + }, + { + "epoch": 0.14549433154084318, + "grad_norm": 0.8303048174363641, + "learning_rate": 4.8494723774053386e-05, + "loss": 4.5377, + "step": 3125 + }, + { + "epoch": 0.14554088972693624, + "grad_norm": 0.8402149939568815, + "learning_rate": 4.851024208566108e-05, + "loss": 4.5565, + "step": 3126 + }, + { + "epoch": 0.1455874479130293, + "grad_norm": 0.9420515647958365, + "learning_rate": 4.852576039726878e-05, + "loss": 4.5732, + "step": 3127 + }, + { + "epoch": 0.1456340060991224, + "grad_norm": 0.7770991316131979, + "learning_rate": 4.8541278708876475e-05, + "loss": 4.6657, + "step": 3128 + }, + { + "epoch": 0.14568056428521545, + "grad_norm": 0.7279319172494044, + "learning_rate": 4.855679702048417e-05, + "loss": 4.5509, + "step": 3129 + }, + { + "epoch": 0.1457271224713085, + "grad_norm": 0.7792461373166495, + "learning_rate": 4.8572315332091874e-05, + "loss": 4.5821, + "step": 3130 + }, + { + "epoch": 0.1457736806574016, + "grad_norm": 0.8752481284959805, + "learning_rate": 4.8587833643699564e-05, + "loss": 4.5031, + "step": 3131 + }, + { + "epoch": 0.14582023884349465, + "grad_norm": 0.7347003706502462, + "learning_rate": 4.860335195530727e-05, + "loss": 4.5713, + "step": 3132 + }, + { + "epoch": 0.1458667970295877, + "grad_norm": 0.7270786831269636, + "learning_rate": 4.861887026691496e-05, + "loss": 4.3544, + "step": 3133 + }, + { + "epoch": 0.1459133552156808, + "grad_norm": 0.5612174975119534, + "learning_rate": 4.863438857852266e-05, + "loss": 4.5459, + "step": 3134 + }, + { + "epoch": 0.14595991340177386, + "grad_norm": 0.7319201483693639, + "learning_rate": 4.8649906890130356e-05, + "loss": 4.5523, + "step": 3135 + }, + { + "epoch": 0.14600647158786695, + "grad_norm": 0.7547599608816224, + "learning_rate": 4.866542520173805e-05, + "loss": 4.5398, + "step": 3136 + }, + { + "epoch": 0.14605302977396, + "grad_norm": 0.8132808978783644, + "learning_rate": 4.868094351334575e-05, + "loss": 4.5752, + "step": 3137 + }, + { + "epoch": 0.14609958796005307, + "grad_norm": 0.7777077753175171, + "learning_rate": 4.869646182495345e-05, + "loss": 4.6079, + "step": 3138 + }, + { + "epoch": 0.14614614614614616, + "grad_norm": 0.7031581944606402, + "learning_rate": 4.871198013656115e-05, + "loss": 4.5241, + "step": 3139 + }, + { + "epoch": 0.14619270433223922, + "grad_norm": 0.9124476628747537, + "learning_rate": 4.872749844816884e-05, + "loss": 4.5551, + "step": 3140 + }, + { + "epoch": 0.14623926251833227, + "grad_norm": 0.9982585627136196, + "learning_rate": 4.874301675977654e-05, + "loss": 4.6205, + "step": 3141 + }, + { + "epoch": 0.14628582070442536, + "grad_norm": 0.9620037717837311, + "learning_rate": 4.875853507138424e-05, + "loss": 4.563, + "step": 3142 + }, + { + "epoch": 0.14633237889051842, + "grad_norm": 0.9314713531529866, + "learning_rate": 4.877405338299193e-05, + "loss": 4.4857, + "step": 3143 + }, + { + "epoch": 0.14637893707661148, + "grad_norm": 0.9117821788555495, + "learning_rate": 4.878957169459963e-05, + "loss": 4.4923, + "step": 3144 + }, + { + "epoch": 
0.14642549526270457, + "grad_norm": 0.9714370690182161, + "learning_rate": 4.8805090006207326e-05, + "loss": 4.5096, + "step": 3145 + }, + { + "epoch": 0.14647205344879763, + "grad_norm": 0.7800137743802598, + "learning_rate": 4.882060831781502e-05, + "loss": 4.4479, + "step": 3146 + }, + { + "epoch": 0.14651861163489072, + "grad_norm": 0.8069750114630405, + "learning_rate": 4.8836126629422725e-05, + "loss": 4.5932, + "step": 3147 + }, + { + "epoch": 0.14656516982098378, + "grad_norm": 0.7753889805511709, + "learning_rate": 4.8851644941030415e-05, + "loss": 4.4389, + "step": 3148 + }, + { + "epoch": 0.14661172800707684, + "grad_norm": 0.6867273254963967, + "learning_rate": 4.886716325263812e-05, + "loss": 4.6164, + "step": 3149 + }, + { + "epoch": 0.14665828619316992, + "grad_norm": 0.8766120097243576, + "learning_rate": 4.8882681564245814e-05, + "loss": 4.5653, + "step": 3150 + }, + { + "epoch": 0.14670484437926298, + "grad_norm": 0.8031484194565661, + "learning_rate": 4.889819987585351e-05, + "loss": 4.5657, + "step": 3151 + }, + { + "epoch": 0.14675140256535604, + "grad_norm": 0.5810446215002599, + "learning_rate": 4.891371818746121e-05, + "loss": 4.5474, + "step": 3152 + }, + { + "epoch": 0.14679796075144913, + "grad_norm": 0.7672270925583827, + "learning_rate": 4.89292364990689e-05, + "loss": 4.5419, + "step": 3153 + }, + { + "epoch": 0.1468445189375422, + "grad_norm": 0.8821408186062816, + "learning_rate": 4.89447548106766e-05, + "loss": 4.5153, + "step": 3154 + }, + { + "epoch": 0.14689107712363525, + "grad_norm": 0.836182326859593, + "learning_rate": 4.89602731222843e-05, + "loss": 4.5057, + "step": 3155 + }, + { + "epoch": 0.14693763530972834, + "grad_norm": 0.8824897477857445, + "learning_rate": 4.897579143389199e-05, + "loss": 4.5381, + "step": 3156 + }, + { + "epoch": 0.1469841934958214, + "grad_norm": 1.0304324991554294, + "learning_rate": 4.8991309745499695e-05, + "loss": 4.5274, + "step": 3157 + }, + { + "epoch": 0.14703075168191448, + "grad_norm": 1.0164341240640924, + "learning_rate": 4.900682805710739e-05, + "loss": 4.4759, + "step": 3158 + }, + { + "epoch": 0.14707730986800754, + "grad_norm": 0.8973551245799875, + "learning_rate": 4.902234636871509e-05, + "loss": 4.6239, + "step": 3159 + }, + { + "epoch": 0.1471238680541006, + "grad_norm": 0.9225766288339938, + "learning_rate": 4.9037864680322784e-05, + "loss": 4.5184, + "step": 3160 + }, + { + "epoch": 0.1471704262401937, + "grad_norm": 0.9166768043021507, + "learning_rate": 4.905338299193048e-05, + "loss": 4.4915, + "step": 3161 + }, + { + "epoch": 0.14721698442628675, + "grad_norm": 1.1091144497402379, + "learning_rate": 4.9068901303538177e-05, + "loss": 4.6446, + "step": 3162 + }, + { + "epoch": 0.1472635426123798, + "grad_norm": 1.0899200083744487, + "learning_rate": 4.908441961514588e-05, + "loss": 4.7243, + "step": 3163 + }, + { + "epoch": 0.1473101007984729, + "grad_norm": 0.6947102206696344, + "learning_rate": 4.909993792675357e-05, + "loss": 4.635, + "step": 3164 + }, + { + "epoch": 0.14735665898456596, + "grad_norm": 0.9793348457397513, + "learning_rate": 4.9115456238361265e-05, + "loss": 4.5398, + "step": 3165 + }, + { + "epoch": 0.14740321717065902, + "grad_norm": 1.2568544013890002, + "learning_rate": 4.913097454996897e-05, + "loss": 4.6683, + "step": 3166 + }, + { + "epoch": 0.1474497753567521, + "grad_norm": 0.8068450009822624, + "learning_rate": 4.9146492861576665e-05, + "loss": 4.6039, + "step": 3167 + }, + { + "epoch": 0.14749633354284516, + "grad_norm": 0.8023581827526298, + "learning_rate": 
4.916201117318436e-05, + "loss": 4.4676, + "step": 3168 + }, + { + "epoch": 0.14754289172893825, + "grad_norm": 0.8551313608262867, + "learning_rate": 4.917752948479206e-05, + "loss": 4.5198, + "step": 3169 + }, + { + "epoch": 0.1475894499150313, + "grad_norm": 0.7832844550167695, + "learning_rate": 4.9193047796399754e-05, + "loss": 4.721, + "step": 3170 + }, + { + "epoch": 0.14763600810112437, + "grad_norm": 0.8513733634808993, + "learning_rate": 4.920856610800745e-05, + "loss": 4.6096, + "step": 3171 + }, + { + "epoch": 0.14768256628721746, + "grad_norm": 0.8581193507607564, + "learning_rate": 4.9224084419615146e-05, + "loss": 4.64, + "step": 3172 + }, + { + "epoch": 0.14772912447331052, + "grad_norm": 1.025796947719785, + "learning_rate": 4.923960273122284e-05, + "loss": 4.5971, + "step": 3173 + }, + { + "epoch": 0.14777568265940358, + "grad_norm": 0.904993419354095, + "learning_rate": 4.9255121042830546e-05, + "loss": 4.5596, + "step": 3174 + }, + { + "epoch": 0.14782224084549667, + "grad_norm": 0.885391601064387, + "learning_rate": 4.927063935443824e-05, + "loss": 4.5558, + "step": 3175 + }, + { + "epoch": 0.14786879903158973, + "grad_norm": 0.788586797871616, + "learning_rate": 4.928615766604593e-05, + "loss": 4.4006, + "step": 3176 + }, + { + "epoch": 0.14791535721768279, + "grad_norm": 0.7789717425527389, + "learning_rate": 4.9301675977653635e-05, + "loss": 4.5102, + "step": 3177 + }, + { + "epoch": 0.14796191540377587, + "grad_norm": 0.930475361697358, + "learning_rate": 4.931719428926133e-05, + "loss": 4.5674, + "step": 3178 + }, + { + "epoch": 0.14800847358986893, + "grad_norm": 0.9145774872716316, + "learning_rate": 4.933271260086903e-05, + "loss": 4.5445, + "step": 3179 + }, + { + "epoch": 0.14805503177596202, + "grad_norm": 0.8459288474330341, + "learning_rate": 4.9348230912476724e-05, + "loss": 4.4704, + "step": 3180 + }, + { + "epoch": 0.14810158996205508, + "grad_norm": 0.7553449550263374, + "learning_rate": 4.936374922408442e-05, + "loss": 4.4481, + "step": 3181 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.7217217749539577, + "learning_rate": 4.937926753569212e-05, + "loss": 4.4834, + "step": 3182 + }, + { + "epoch": 0.14819470633424123, + "grad_norm": 0.7415322052873197, + "learning_rate": 4.939478584729982e-05, + "loss": 4.6399, + "step": 3183 + }, + { + "epoch": 0.1482412645203343, + "grad_norm": 0.7835125734297282, + "learning_rate": 4.941030415890751e-05, + "loss": 4.5795, + "step": 3184 + }, + { + "epoch": 0.14828782270642735, + "grad_norm": 0.8880792489023351, + "learning_rate": 4.942582247051521e-05, + "loss": 4.5114, + "step": 3185 + }, + { + "epoch": 0.14833438089252043, + "grad_norm": 0.8183495356713636, + "learning_rate": 4.944134078212291e-05, + "loss": 4.5259, + "step": 3186 + }, + { + "epoch": 0.1483809390786135, + "grad_norm": 0.7750931818651869, + "learning_rate": 4.9456859093730605e-05, + "loss": 4.6263, + "step": 3187 + }, + { + "epoch": 0.14842749726470655, + "grad_norm": 0.9220186080192576, + "learning_rate": 4.94723774053383e-05, + "loss": 4.6113, + "step": 3188 + }, + { + "epoch": 0.14847405545079964, + "grad_norm": 0.8969388561250776, + "learning_rate": 4.9487895716946e-05, + "loss": 4.6243, + "step": 3189 + }, + { + "epoch": 0.1485206136368927, + "grad_norm": 0.7338571531421292, + "learning_rate": 4.9503414028553693e-05, + "loss": 4.5335, + "step": 3190 + }, + { + "epoch": 0.1485671718229858, + "grad_norm": 0.9798853390020937, + "learning_rate": 4.9518932340161397e-05, + "loss": 4.4522, + "step": 3191 + }, + { + "epoch": 
0.14861373000907885, + "grad_norm": 1.0242585375238702, + "learning_rate": 4.9534450651769086e-05, + "loss": 4.528, + "step": 3192 + }, + { + "epoch": 0.1486602881951719, + "grad_norm": 0.8240995887761282, + "learning_rate": 4.954996896337679e-05, + "loss": 4.4547, + "step": 3193 + }, + { + "epoch": 0.148706846381265, + "grad_norm": 0.89121892301862, + "learning_rate": 4.9565487274984485e-05, + "loss": 4.5343, + "step": 3194 + }, + { + "epoch": 0.14875340456735806, + "grad_norm": 0.9573295158626403, + "learning_rate": 4.958100558659218e-05, + "loss": 4.4426, + "step": 3195 + }, + { + "epoch": 0.14879996275345111, + "grad_norm": 0.8505099909759775, + "learning_rate": 4.959652389819988e-05, + "loss": 4.4744, + "step": 3196 + }, + { + "epoch": 0.1488465209395442, + "grad_norm": 0.9521177596724824, + "learning_rate": 4.9612042209807574e-05, + "loss": 4.5473, + "step": 3197 + }, + { + "epoch": 0.14889307912563726, + "grad_norm": 1.1536971973370573, + "learning_rate": 4.962756052141527e-05, + "loss": 4.5966, + "step": 3198 + }, + { + "epoch": 0.14893963731173032, + "grad_norm": 0.7357402771240519, + "learning_rate": 4.9643078833022974e-05, + "loss": 4.4484, + "step": 3199 + }, + { + "epoch": 0.1489861954978234, + "grad_norm": 0.8443513448778265, + "learning_rate": 4.965859714463066e-05, + "loss": 4.3995, + "step": 3200 + }, + { + "epoch": 0.14903275368391647, + "grad_norm": 0.9308007390551241, + "learning_rate": 4.967411545623836e-05, + "loss": 4.4296, + "step": 3201 + }, + { + "epoch": 0.14907931187000956, + "grad_norm": 0.7292421935799541, + "learning_rate": 4.968963376784606e-05, + "loss": 4.4139, + "step": 3202 + }, + { + "epoch": 0.14912587005610262, + "grad_norm": 0.8374398525161497, + "learning_rate": 4.970515207945376e-05, + "loss": 4.616, + "step": 3203 + }, + { + "epoch": 0.14917242824219568, + "grad_norm": 0.8105153514051032, + "learning_rate": 4.9720670391061455e-05, + "loss": 4.4954, + "step": 3204 + }, + { + "epoch": 0.14921898642828876, + "grad_norm": 0.8037050749565006, + "learning_rate": 4.973618870266915e-05, + "loss": 4.6403, + "step": 3205 + }, + { + "epoch": 0.14926554461438182, + "grad_norm": 0.8935393221692532, + "learning_rate": 4.975170701427685e-05, + "loss": 4.3974, + "step": 3206 + }, + { + "epoch": 0.14931210280047488, + "grad_norm": 0.8770306533262466, + "learning_rate": 4.976722532588455e-05, + "loss": 4.4596, + "step": 3207 + }, + { + "epoch": 0.14935866098656797, + "grad_norm": 0.7755922030532316, + "learning_rate": 4.978274363749224e-05, + "loss": 4.5336, + "step": 3208 + }, + { + "epoch": 0.14940521917266103, + "grad_norm": 0.640405311114691, + "learning_rate": 4.979826194909994e-05, + "loss": 4.4793, + "step": 3209 + }, + { + "epoch": 0.1494517773587541, + "grad_norm": 0.6408756330770622, + "learning_rate": 4.981378026070764e-05, + "loss": 4.4996, + "step": 3210 + }, + { + "epoch": 0.14949833554484718, + "grad_norm": 0.7640113335238815, + "learning_rate": 4.9829298572315336e-05, + "loss": 4.4986, + "step": 3211 + }, + { + "epoch": 0.14954489373094024, + "grad_norm": 0.6645124623778488, + "learning_rate": 4.9844816883923026e-05, + "loss": 4.4496, + "step": 3212 + }, + { + "epoch": 0.14959145191703332, + "grad_norm": 0.6171261719550474, + "learning_rate": 4.986033519553073e-05, + "loss": 4.4794, + "step": 3213 + }, + { + "epoch": 0.14963801010312638, + "grad_norm": 0.6307271999740026, + "learning_rate": 4.9875853507138425e-05, + "loss": 4.5648, + "step": 3214 + }, + { + "epoch": 0.14968456828921944, + "grad_norm": 0.7099274184062784, + "learning_rate": 
4.989137181874612e-05, + "loss": 4.5505, + "step": 3215 + }, + { + "epoch": 0.14973112647531253, + "grad_norm": 0.7911552575379573, + "learning_rate": 4.990689013035382e-05, + "loss": 4.446, + "step": 3216 + }, + { + "epoch": 0.1497776846614056, + "grad_norm": 0.8052019142579315, + "learning_rate": 4.9922408441961514e-05, + "loss": 4.3966, + "step": 3217 + }, + { + "epoch": 0.14982424284749865, + "grad_norm": 0.7036534100577809, + "learning_rate": 4.993792675356922e-05, + "loss": 4.6195, + "step": 3218 + }, + { + "epoch": 0.14987080103359174, + "grad_norm": 0.7170397727298569, + "learning_rate": 4.9953445065176913e-05, + "loss": 4.4166, + "step": 3219 + }, + { + "epoch": 0.1499173592196848, + "grad_norm": 0.6855830001899061, + "learning_rate": 4.99689633767846e-05, + "loss": 4.5458, + "step": 3220 + }, + { + "epoch": 0.14996391740577786, + "grad_norm": 0.6965760732126557, + "learning_rate": 4.9984481688392306e-05, + "loss": 4.4487, + "step": 3221 + }, + { + "epoch": 0.15001047559187095, + "grad_norm": 0.6585555264147384, + "learning_rate": 5e-05, + "loss": 4.3953, + "step": 3222 + }, + { + "epoch": 0.150057033777964, + "grad_norm": 0.7323920884055306, + "learning_rate": 5.00155183116077e-05, + "loss": 4.3016, + "step": 3223 + }, + { + "epoch": 0.1501035919640571, + "grad_norm": 0.9019536297823638, + "learning_rate": 5.00310366232154e-05, + "loss": 4.3627, + "step": 3224 + }, + { + "epoch": 0.15015015015015015, + "grad_norm": 1.1041748068036144, + "learning_rate": 5.00465549348231e-05, + "loss": 4.7324, + "step": 3225 + }, + { + "epoch": 0.1501967083362432, + "grad_norm": 0.8947329265194476, + "learning_rate": 5.0062073246430794e-05, + "loss": 4.4931, + "step": 3226 + }, + { + "epoch": 0.1502432665223363, + "grad_norm": 0.8187522485311526, + "learning_rate": 5.0077591558038484e-05, + "loss": 4.5205, + "step": 3227 + }, + { + "epoch": 0.15028982470842936, + "grad_norm": 0.8197639992066891, + "learning_rate": 5.009310986964618e-05, + "loss": 4.5547, + "step": 3228 + }, + { + "epoch": 0.15033638289452242, + "grad_norm": 0.7032335969314595, + "learning_rate": 5.010862818125388e-05, + "loss": 4.346, + "step": 3229 + }, + { + "epoch": 0.1503829410806155, + "grad_norm": 0.8108198450926728, + "learning_rate": 5.012414649286158e-05, + "loss": 4.5098, + "step": 3230 + }, + { + "epoch": 0.15042949926670857, + "grad_norm": 0.9106800028764309, + "learning_rate": 5.0139664804469276e-05, + "loss": 4.4924, + "step": 3231 + }, + { + "epoch": 0.15047605745280163, + "grad_norm": 0.790476778855286, + "learning_rate": 5.015518311607698e-05, + "loss": 4.4508, + "step": 3232 + }, + { + "epoch": 0.1505226156388947, + "grad_norm": 0.6414233030751628, + "learning_rate": 5.0170701427684675e-05, + "loss": 4.4672, + "step": 3233 + }, + { + "epoch": 0.15056917382498777, + "grad_norm": 0.8680447768145281, + "learning_rate": 5.018621973929237e-05, + "loss": 4.3147, + "step": 3234 + }, + { + "epoch": 0.15061573201108086, + "grad_norm": 0.8648547556497966, + "learning_rate": 5.020173805090006e-05, + "loss": 4.425, + "step": 3235 + }, + { + "epoch": 0.15066229019717392, + "grad_norm": 0.88129618350203, + "learning_rate": 5.021725636250776e-05, + "loss": 4.4974, + "step": 3236 + }, + { + "epoch": 0.15070884838326698, + "grad_norm": 0.6864245403672176, + "learning_rate": 5.0232774674115454e-05, + "loss": 4.5197, + "step": 3237 + }, + { + "epoch": 0.15075540656936007, + "grad_norm": 0.7031414657438271, + "learning_rate": 5.024829298572316e-05, + "loss": 4.567, + "step": 3238 + }, + { + "epoch": 0.15080196475545313, + 
"grad_norm": 0.8423682211689711, + "learning_rate": 5.026381129733085e-05, + "loss": 4.5672, + "step": 3239 + }, + { + "epoch": 0.1508485229415462, + "grad_norm": 0.8612472712861963, + "learning_rate": 5.027932960893855e-05, + "loss": 4.5512, + "step": 3240 + }, + { + "epoch": 0.15089508112763927, + "grad_norm": 0.9033823542427762, + "learning_rate": 5.029484792054625e-05, + "loss": 4.52, + "step": 3241 + }, + { + "epoch": 0.15094163931373233, + "grad_norm": 0.9202326420645901, + "learning_rate": 5.031036623215395e-05, + "loss": 4.4591, + "step": 3242 + }, + { + "epoch": 0.1509881974998254, + "grad_norm": 1.0827221909839349, + "learning_rate": 5.032588454376164e-05, + "loss": 4.4613, + "step": 3243 + }, + { + "epoch": 0.15103475568591848, + "grad_norm": 0.7608167158808485, + "learning_rate": 5.0341402855369335e-05, + "loss": 4.3979, + "step": 3244 + }, + { + "epoch": 0.15108131387201154, + "grad_norm": 0.742375963789992, + "learning_rate": 5.035692116697703e-05, + "loss": 4.5077, + "step": 3245 + }, + { + "epoch": 0.15112787205810463, + "grad_norm": 0.8575674207329224, + "learning_rate": 5.0372439478584734e-05, + "loss": 4.5917, + "step": 3246 + }, + { + "epoch": 0.1511744302441977, + "grad_norm": 1.030424914322597, + "learning_rate": 5.038795779019243e-05, + "loss": 4.5761, + "step": 3247 + }, + { + "epoch": 0.15122098843029075, + "grad_norm": 0.8705661035895039, + "learning_rate": 5.040347610180013e-05, + "loss": 4.6177, + "step": 3248 + }, + { + "epoch": 0.15126754661638384, + "grad_norm": 0.650960610936475, + "learning_rate": 5.041899441340783e-05, + "loss": 4.5383, + "step": 3249 + }, + { + "epoch": 0.1513141048024769, + "grad_norm": 0.7589157625467771, + "learning_rate": 5.0434512725015526e-05, + "loss": 4.4758, + "step": 3250 + }, + { + "epoch": 0.15136066298856996, + "grad_norm": 0.8474721895759889, + "learning_rate": 5.0450031036623216e-05, + "loss": 4.3974, + "step": 3251 + }, + { + "epoch": 0.15140722117466304, + "grad_norm": 0.7968019470946041, + "learning_rate": 5.046554934823091e-05, + "loss": 4.5948, + "step": 3252 + }, + { + "epoch": 0.1514537793607561, + "grad_norm": 0.7036352617985566, + "learning_rate": 5.048106765983861e-05, + "loss": 4.5041, + "step": 3253 + }, + { + "epoch": 0.15150033754684916, + "grad_norm": 0.7442060714239908, + "learning_rate": 5.049658597144631e-05, + "loss": 4.4332, + "step": 3254 + }, + { + "epoch": 0.15154689573294225, + "grad_norm": 0.639416676251619, + "learning_rate": 5.051210428305401e-05, + "loss": 4.5811, + "step": 3255 + }, + { + "epoch": 0.1515934539190353, + "grad_norm": 0.6011024525851604, + "learning_rate": 5.0527622594661704e-05, + "loss": 4.5425, + "step": 3256 + }, + { + "epoch": 0.1516400121051284, + "grad_norm": 0.6529212566808192, + "learning_rate": 5.054314090626941e-05, + "loss": 4.6413, + "step": 3257 + }, + { + "epoch": 0.15168657029122146, + "grad_norm": 0.7561490953321659, + "learning_rate": 5.05586592178771e-05, + "loss": 4.3799, + "step": 3258 + }, + { + "epoch": 0.15173312847731452, + "grad_norm": 0.710122374347566, + "learning_rate": 5.057417752948479e-05, + "loss": 4.4946, + "step": 3259 + }, + { + "epoch": 0.1517796866634076, + "grad_norm": 0.7317176042517164, + "learning_rate": 5.058969584109249e-05, + "loss": 4.344, + "step": 3260 + }, + { + "epoch": 0.15182624484950066, + "grad_norm": 0.6102026612991505, + "learning_rate": 5.0605214152700185e-05, + "loss": 4.449, + "step": 3261 + }, + { + "epoch": 0.15187280303559372, + "grad_norm": 0.7565808312950255, + "learning_rate": 5.062073246430788e-05, + "loss": 4.5025, 
+ "step": 3262 + }, + { + "epoch": 0.1519193612216868, + "grad_norm": 0.7408245595651217, + "learning_rate": 5.0636250775915585e-05, + "loss": 4.5524, + "step": 3263 + }, + { + "epoch": 0.15196591940777987, + "grad_norm": 0.7716529558005637, + "learning_rate": 5.065176908752328e-05, + "loss": 4.4177, + "step": 3264 + }, + { + "epoch": 0.15201247759387293, + "grad_norm": 0.7962415654889433, + "learning_rate": 5.066728739913098e-05, + "loss": 4.4692, + "step": 3265 + }, + { + "epoch": 0.15205903577996602, + "grad_norm": 0.9765079428933282, + "learning_rate": 5.068280571073868e-05, + "loss": 4.3573, + "step": 3266 + }, + { + "epoch": 0.15210559396605908, + "grad_norm": 1.1250553524763587, + "learning_rate": 5.069832402234636e-05, + "loss": 4.4737, + "step": 3267 + }, + { + "epoch": 0.15215215215215216, + "grad_norm": 0.8460952473279018, + "learning_rate": 5.0713842333954066e-05, + "loss": 4.4398, + "step": 3268 + }, + { + "epoch": 0.15219871033824522, + "grad_norm": 0.7867343390506927, + "learning_rate": 5.072936064556176e-05, + "loss": 4.4824, + "step": 3269 + }, + { + "epoch": 0.15224526852433828, + "grad_norm": 0.9125588069365489, + "learning_rate": 5.074487895716946e-05, + "loss": 4.4026, + "step": 3270 + }, + { + "epoch": 0.15229182671043137, + "grad_norm": 0.9750293663126959, + "learning_rate": 5.076039726877716e-05, + "loss": 4.4154, + "step": 3271 + }, + { + "epoch": 0.15233838489652443, + "grad_norm": 0.7732805645571462, + "learning_rate": 5.077591558038486e-05, + "loss": 4.4412, + "step": 3272 + }, + { + "epoch": 0.1523849430826175, + "grad_norm": 0.7602443340552091, + "learning_rate": 5.0791433891992555e-05, + "loss": 4.3665, + "step": 3273 + }, + { + "epoch": 0.15243150126871058, + "grad_norm": 0.9785516029559863, + "learning_rate": 5.080695220360026e-05, + "loss": 4.3635, + "step": 3274 + }, + { + "epoch": 0.15247805945480364, + "grad_norm": 0.8821883248037786, + "learning_rate": 5.082247051520794e-05, + "loss": 4.575, + "step": 3275 + }, + { + "epoch": 0.1525246176408967, + "grad_norm": 0.8849059996906107, + "learning_rate": 5.0837988826815644e-05, + "loss": 4.4601, + "step": 3276 + }, + { + "epoch": 0.15257117582698979, + "grad_norm": 0.7845805034816252, + "learning_rate": 5.085350713842334e-05, + "loss": 4.4477, + "step": 3277 + }, + { + "epoch": 0.15261773401308285, + "grad_norm": 0.7911953108415126, + "learning_rate": 5.0869025450031036e-05, + "loss": 4.445, + "step": 3278 + }, + { + "epoch": 0.15266429219917593, + "grad_norm": 0.7689513296296882, + "learning_rate": 5.088454376163874e-05, + "loss": 4.3643, + "step": 3279 + }, + { + "epoch": 0.152710850385269, + "grad_norm": 0.6856959713760626, + "learning_rate": 5.0900062073246436e-05, + "loss": 4.5248, + "step": 3280 + }, + { + "epoch": 0.15275740857136205, + "grad_norm": 0.7169479628458989, + "learning_rate": 5.091558038485413e-05, + "loss": 4.524, + "step": 3281 + }, + { + "epoch": 0.15280396675745514, + "grad_norm": 0.772189635100576, + "learning_rate": 5.0931098696461835e-05, + "loss": 4.4789, + "step": 3282 + }, + { + "epoch": 0.1528505249435482, + "grad_norm": 0.9212058030419155, + "learning_rate": 5.094661700806952e-05, + "loss": 4.5544, + "step": 3283 + }, + { + "epoch": 0.15289708312964126, + "grad_norm": 0.8902664899653979, + "learning_rate": 5.096213531967722e-05, + "loss": 4.4467, + "step": 3284 + }, + { + "epoch": 0.15294364131573435, + "grad_norm": 0.7892079638449835, + "learning_rate": 5.097765363128492e-05, + "loss": 4.3225, + "step": 3285 + }, + { + "epoch": 0.1529901995018274, + "grad_norm": 
0.7451713087440718, + "learning_rate": 5.0993171942892613e-05, + "loss": 4.3988, + "step": 3286 + }, + { + "epoch": 0.15303675768792047, + "grad_norm": 0.6865696227446023, + "learning_rate": 5.100869025450031e-05, + "loss": 4.337, + "step": 3287 + }, + { + "epoch": 0.15308331587401355, + "grad_norm": 0.6758922216849521, + "learning_rate": 5.102420856610801e-05, + "loss": 4.4551, + "step": 3288 + }, + { + "epoch": 0.1531298740601066, + "grad_norm": 0.7692789218276054, + "learning_rate": 5.103972687771571e-05, + "loss": 4.5455, + "step": 3289 + }, + { + "epoch": 0.1531764322461997, + "grad_norm": 0.8112948408513799, + "learning_rate": 5.1055245189323405e-05, + "loss": 4.4634, + "step": 3290 + }, + { + "epoch": 0.15322299043229276, + "grad_norm": 0.8060986059399197, + "learning_rate": 5.1070763500931095e-05, + "loss": 4.4785, + "step": 3291 + }, + { + "epoch": 0.15326954861838582, + "grad_norm": 0.8031595442618824, + "learning_rate": 5.108628181253879e-05, + "loss": 4.4956, + "step": 3292 + }, + { + "epoch": 0.1533161068044789, + "grad_norm": 0.7944632545684711, + "learning_rate": 5.1101800124146494e-05, + "loss": 4.5203, + "step": 3293 + }, + { + "epoch": 0.15336266499057197, + "grad_norm": 0.6736445777830405, + "learning_rate": 5.111731843575419e-05, + "loss": 4.4697, + "step": 3294 + }, + { + "epoch": 0.15340922317666503, + "grad_norm": 0.8064396247717812, + "learning_rate": 5.113283674736189e-05, + "loss": 4.5273, + "step": 3295 + }, + { + "epoch": 0.15345578136275811, + "grad_norm": 1.0079831466312017, + "learning_rate": 5.114835505896959e-05, + "loss": 4.4382, + "step": 3296 + }, + { + "epoch": 0.15350233954885117, + "grad_norm": 0.906032759484516, + "learning_rate": 5.1163873370577286e-05, + "loss": 4.6282, + "step": 3297 + }, + { + "epoch": 0.15354889773494423, + "grad_norm": 0.708616330568235, + "learning_rate": 5.117939168218498e-05, + "loss": 4.3197, + "step": 3298 + }, + { + "epoch": 0.15359545592103732, + "grad_norm": 0.6725502221124009, + "learning_rate": 5.119490999379267e-05, + "loss": 4.4881, + "step": 3299 + }, + { + "epoch": 0.15364201410713038, + "grad_norm": 0.8056130446979954, + "learning_rate": 5.121042830540037e-05, + "loss": 4.4416, + "step": 3300 + }, + { + "epoch": 0.15368857229322347, + "grad_norm": 0.7968622128508814, + "learning_rate": 5.122594661700807e-05, + "loss": 4.4944, + "step": 3301 + }, + { + "epoch": 0.15373513047931653, + "grad_norm": 0.7612683513812696, + "learning_rate": 5.124146492861577e-05, + "loss": 4.4519, + "step": 3302 + }, + { + "epoch": 0.1537816886654096, + "grad_norm": 0.8057818017173153, + "learning_rate": 5.1256983240223464e-05, + "loss": 4.4822, + "step": 3303 + }, + { + "epoch": 0.15382824685150268, + "grad_norm": 0.8473145574955414, + "learning_rate": 5.127250155183117e-05, + "loss": 4.501, + "step": 3304 + }, + { + "epoch": 0.15387480503759574, + "grad_norm": 0.8673293867581366, + "learning_rate": 5.1288019863438864e-05, + "loss": 4.5767, + "step": 3305 + }, + { + "epoch": 0.1539213632236888, + "grad_norm": 0.821484207812527, + "learning_rate": 5.130353817504656e-05, + "loss": 4.4184, + "step": 3306 + }, + { + "epoch": 0.15396792140978188, + "grad_norm": 0.7742335397799824, + "learning_rate": 5.131905648665426e-05, + "loss": 4.5716, + "step": 3307 + }, + { + "epoch": 0.15401447959587494, + "grad_norm": 0.9227159788206187, + "learning_rate": 5.1334574798261946e-05, + "loss": 4.3914, + "step": 3308 + }, + { + "epoch": 0.154061037781968, + "grad_norm": 0.8436073965379625, + "learning_rate": 5.135009310986965e-05, + "loss": 4.4507, + 
"step": 3309 + }, + { + "epoch": 0.1541075959680611, + "grad_norm": 0.8820257238780288, + "learning_rate": 5.1365611421477345e-05, + "loss": 4.5134, + "step": 3310 + }, + { + "epoch": 0.15415415415415415, + "grad_norm": 0.9124702625454713, + "learning_rate": 5.138112973308504e-05, + "loss": 4.435, + "step": 3311 + }, + { + "epoch": 0.15420071234024724, + "grad_norm": 0.7576918465164236, + "learning_rate": 5.139664804469274e-05, + "loss": 4.382, + "step": 3312 + }, + { + "epoch": 0.1542472705263403, + "grad_norm": 0.6635809173348738, + "learning_rate": 5.141216635630044e-05, + "loss": 4.2674, + "step": 3313 + }, + { + "epoch": 0.15429382871243336, + "grad_norm": 0.7370919423017506, + "learning_rate": 5.142768466790814e-05, + "loss": 4.4886, + "step": 3314 + }, + { + "epoch": 0.15434038689852644, + "grad_norm": 0.746719470918261, + "learning_rate": 5.1443202979515834e-05, + "loss": 4.4657, + "step": 3315 + }, + { + "epoch": 0.1543869450846195, + "grad_norm": 0.7937381369556462, + "learning_rate": 5.145872129112352e-05, + "loss": 4.3781, + "step": 3316 + }, + { + "epoch": 0.15443350327071256, + "grad_norm": 0.8223891065777225, + "learning_rate": 5.147423960273122e-05, + "loss": 4.4419, + "step": 3317 + }, + { + "epoch": 0.15448006145680565, + "grad_norm": 0.7446467082529353, + "learning_rate": 5.148975791433892e-05, + "loss": 4.2839, + "step": 3318 + }, + { + "epoch": 0.1545266196428987, + "grad_norm": 0.7605236658346077, + "learning_rate": 5.150527622594662e-05, + "loss": 4.481, + "step": 3319 + }, + { + "epoch": 0.15457317782899177, + "grad_norm": 0.6887389146829455, + "learning_rate": 5.1520794537554315e-05, + "loss": 4.3995, + "step": 3320 + }, + { + "epoch": 0.15461973601508486, + "grad_norm": 0.7562253288575265, + "learning_rate": 5.153631284916202e-05, + "loss": 4.463, + "step": 3321 + }, + { + "epoch": 0.15466629420117792, + "grad_norm": 0.8597045978954566, + "learning_rate": 5.1551831160769714e-05, + "loss": 4.5156, + "step": 3322 + }, + { + "epoch": 0.154712852387271, + "grad_norm": 0.8658659405633882, + "learning_rate": 5.156734947237741e-05, + "loss": 4.5187, + "step": 3323 + }, + { + "epoch": 0.15475941057336406, + "grad_norm": 0.7677598713274686, + "learning_rate": 5.15828677839851e-05, + "loss": 4.5384, + "step": 3324 + }, + { + "epoch": 0.15480596875945712, + "grad_norm": 0.6848325898717961, + "learning_rate": 5.1598386095592797e-05, + "loss": 4.5597, + "step": 3325 + }, + { + "epoch": 0.1548525269455502, + "grad_norm": 0.7308498630341715, + "learning_rate": 5.16139044072005e-05, + "loss": 4.3754, + "step": 3326 + }, + { + "epoch": 0.15489908513164327, + "grad_norm": 0.6309928389262828, + "learning_rate": 5.1629422718808196e-05, + "loss": 4.4338, + "step": 3327 + }, + { + "epoch": 0.15494564331773633, + "grad_norm": 0.69657917134038, + "learning_rate": 5.164494103041589e-05, + "loss": 4.479, + "step": 3328 + }, + { + "epoch": 0.15499220150382942, + "grad_norm": 0.7952576226555693, + "learning_rate": 5.1660459342023595e-05, + "loss": 4.4975, + "step": 3329 + }, + { + "epoch": 0.15503875968992248, + "grad_norm": 0.8387671329849271, + "learning_rate": 5.167597765363129e-05, + "loss": 4.4237, + "step": 3330 + }, + { + "epoch": 0.15508531787601554, + "grad_norm": 0.8899235126128389, + "learning_rate": 5.169149596523899e-05, + "loss": 4.5021, + "step": 3331 + }, + { + "epoch": 0.15513187606210863, + "grad_norm": 0.8756242554288113, + "learning_rate": 5.170701427684668e-05, + "loss": 4.4253, + "step": 3332 + }, + { + "epoch": 0.15517843424820169, + "grad_norm": 0.9847620158617678, + 
"learning_rate": 5.1722532588454374e-05, + "loss": 4.4678, + "step": 3333 + }, + { + "epoch": 0.15522499243429477, + "grad_norm": 0.836630487074168, + "learning_rate": 5.173805090006208e-05, + "loss": 4.3353, + "step": 3334 + }, + { + "epoch": 0.15527155062038783, + "grad_norm": 0.6570230836234422, + "learning_rate": 5.175356921166977e-05, + "loss": 4.3496, + "step": 3335 + }, + { + "epoch": 0.1553181088064809, + "grad_norm": 0.7714445178348526, + "learning_rate": 5.176908752327747e-05, + "loss": 4.4762, + "step": 3336 + }, + { + "epoch": 0.15536466699257398, + "grad_norm": 0.8193244313943705, + "learning_rate": 5.178460583488517e-05, + "loss": 4.4766, + "step": 3337 + }, + { + "epoch": 0.15541122517866704, + "grad_norm": 0.7721121832988648, + "learning_rate": 5.180012414649287e-05, + "loss": 4.4332, + "step": 3338 + }, + { + "epoch": 0.1554577833647601, + "grad_norm": 0.7955648489669798, + "learning_rate": 5.1815642458100565e-05, + "loss": 4.4728, + "step": 3339 + }, + { + "epoch": 0.1555043415508532, + "grad_norm": 0.7973807753374713, + "learning_rate": 5.1831160769708255e-05, + "loss": 4.4972, + "step": 3340 + }, + { + "epoch": 0.15555089973694625, + "grad_norm": 0.7759991052625779, + "learning_rate": 5.184667908131595e-05, + "loss": 4.5963, + "step": 3341 + }, + { + "epoch": 0.1555974579230393, + "grad_norm": 0.9514204234498824, + "learning_rate": 5.186219739292365e-05, + "loss": 4.4086, + "step": 3342 + }, + { + "epoch": 0.1556440161091324, + "grad_norm": 1.0389163013931313, + "learning_rate": 5.187771570453135e-05, + "loss": 4.4644, + "step": 3343 + }, + { + "epoch": 0.15569057429522545, + "grad_norm": 1.1565883612209207, + "learning_rate": 5.189323401613905e-05, + "loss": 4.4531, + "step": 3344 + }, + { + "epoch": 0.15573713248131854, + "grad_norm": 0.650073162525456, + "learning_rate": 5.190875232774674e-05, + "loss": 4.4169, + "step": 3345 + }, + { + "epoch": 0.1557836906674116, + "grad_norm": 0.8476204398074071, + "learning_rate": 5.1924270639354446e-05, + "loss": 4.3257, + "step": 3346 + }, + { + "epoch": 0.15583024885350466, + "grad_norm": 1.0830836114891547, + "learning_rate": 5.193978895096214e-05, + "loss": 4.3972, + "step": 3347 + }, + { + "epoch": 0.15587680703959775, + "grad_norm": 0.7949395894705411, + "learning_rate": 5.195530726256983e-05, + "loss": 4.359, + "step": 3348 + }, + { + "epoch": 0.1559233652256908, + "grad_norm": 0.937135752183072, + "learning_rate": 5.197082557417753e-05, + "loss": 4.3939, + "step": 3349 + }, + { + "epoch": 0.15596992341178387, + "grad_norm": 0.9754245609339581, + "learning_rate": 5.1986343885785225e-05, + "loss": 4.4722, + "step": 3350 + }, + { + "epoch": 0.15601648159787695, + "grad_norm": 1.050044253879841, + "learning_rate": 5.200186219739293e-05, + "loss": 4.5493, + "step": 3351 + }, + { + "epoch": 0.15606303978397001, + "grad_norm": 0.9056673488115919, + "learning_rate": 5.2017380509000624e-05, + "loss": 4.4639, + "step": 3352 + }, + { + "epoch": 0.15610959797006307, + "grad_norm": 0.8486738492392221, + "learning_rate": 5.203289882060832e-05, + "loss": 4.4978, + "step": 3353 + }, + { + "epoch": 0.15615615615615616, + "grad_norm": 0.7818974466585935, + "learning_rate": 5.204841713221602e-05, + "loss": 4.4263, + "step": 3354 + }, + { + "epoch": 0.15620271434224922, + "grad_norm": 0.7538923237454072, + "learning_rate": 5.206393544382372e-05, + "loss": 4.274, + "step": 3355 + }, + { + "epoch": 0.1562492725283423, + "grad_norm": 0.8660994359482033, + "learning_rate": 5.207945375543141e-05, + "loss": 4.4274, + "step": 3356 + }, + { + 
"epoch": 0.15629583071443537, + "grad_norm": 0.8125215861919487, + "learning_rate": 5.2094972067039106e-05, + "loss": 4.4746, + "step": 3357 + }, + { + "epoch": 0.15634238890052843, + "grad_norm": 0.8296298593330012, + "learning_rate": 5.21104903786468e-05, + "loss": 4.4984, + "step": 3358 + }, + { + "epoch": 0.15638894708662152, + "grad_norm": 0.7294401005837353, + "learning_rate": 5.2126008690254505e-05, + "loss": 4.4921, + "step": 3359 + }, + { + "epoch": 0.15643550527271458, + "grad_norm": 0.788003526385373, + "learning_rate": 5.21415270018622e-05, + "loss": 4.4307, + "step": 3360 + }, + { + "epoch": 0.15648206345880764, + "grad_norm": 0.9597142005136995, + "learning_rate": 5.21570453134699e-05, + "loss": 4.4195, + "step": 3361 + }, + { + "epoch": 0.15652862164490072, + "grad_norm": 0.9236874407059028, + "learning_rate": 5.21725636250776e-05, + "loss": 4.4891, + "step": 3362 + }, + { + "epoch": 0.15657517983099378, + "grad_norm": 0.7925460363773003, + "learning_rate": 5.21880819366853e-05, + "loss": 4.5446, + "step": 3363 + }, + { + "epoch": 0.15662173801708684, + "grad_norm": 0.8133604033090392, + "learning_rate": 5.2203600248292986e-05, + "loss": 4.3166, + "step": 3364 + }, + { + "epoch": 0.15666829620317993, + "grad_norm": 0.8500406225498398, + "learning_rate": 5.221911855990068e-05, + "loss": 4.4746, + "step": 3365 + }, + { + "epoch": 0.156714854389273, + "grad_norm": 0.9691398287415232, + "learning_rate": 5.223463687150838e-05, + "loss": 4.4094, + "step": 3366 + }, + { + "epoch": 0.15676141257536608, + "grad_norm": 0.8332407705213779, + "learning_rate": 5.2250155183116075e-05, + "loss": 4.5233, + "step": 3367 + }, + { + "epoch": 0.15680797076145914, + "grad_norm": 0.8257886902927459, + "learning_rate": 5.226567349472378e-05, + "loss": 4.2825, + "step": 3368 + }, + { + "epoch": 0.1568545289475522, + "grad_norm": 0.9260616225139041, + "learning_rate": 5.2281191806331475e-05, + "loss": 4.3928, + "step": 3369 + }, + { + "epoch": 0.15690108713364528, + "grad_norm": 0.7863106775487315, + "learning_rate": 5.229671011793917e-05, + "loss": 4.4251, + "step": 3370 + }, + { + "epoch": 0.15694764531973834, + "grad_norm": 0.8801431373707065, + "learning_rate": 5.2312228429546874e-05, + "loss": 4.3841, + "step": 3371 + }, + { + "epoch": 0.1569942035058314, + "grad_norm": 0.8827774490293656, + "learning_rate": 5.232774674115456e-05, + "loss": 4.448, + "step": 3372 + }, + { + "epoch": 0.1570407616919245, + "grad_norm": 0.8432703981135177, + "learning_rate": 5.234326505276226e-05, + "loss": 4.4224, + "step": 3373 + }, + { + "epoch": 0.15708731987801755, + "grad_norm": 0.9250244257320402, + "learning_rate": 5.2358783364369956e-05, + "loss": 4.5096, + "step": 3374 + }, + { + "epoch": 0.1571338780641106, + "grad_norm": 0.8772064378014356, + "learning_rate": 5.237430167597765e-05, + "loss": 4.5492, + "step": 3375 + }, + { + "epoch": 0.1571804362502037, + "grad_norm": 0.8329189432744601, + "learning_rate": 5.2389819987585356e-05, + "loss": 4.4643, + "step": 3376 + }, + { + "epoch": 0.15722699443629676, + "grad_norm": 0.8669380565813001, + "learning_rate": 5.240533829919305e-05, + "loss": 4.4477, + "step": 3377 + }, + { + "epoch": 0.15727355262238985, + "grad_norm": 0.7789356059778902, + "learning_rate": 5.242085661080075e-05, + "loss": 4.4527, + "step": 3378 + }, + { + "epoch": 0.1573201108084829, + "grad_norm": 0.7289995399004009, + "learning_rate": 5.243637492240845e-05, + "loss": 4.3749, + "step": 3379 + }, + { + "epoch": 0.15736666899457596, + "grad_norm": 0.8622159298995055, + "learning_rate": 
5.2451893234016134e-05, + "loss": 4.3738, + "step": 3380 + }, + { + "epoch": 0.15741322718066905, + "grad_norm": 0.9398064999011209, + "learning_rate": 5.246741154562384e-05, + "loss": 4.409, + "step": 3381 + }, + { + "epoch": 0.1574597853667621, + "grad_norm": 0.9332787917074054, + "learning_rate": 5.2482929857231534e-05, + "loss": 4.3603, + "step": 3382 + }, + { + "epoch": 0.15750634355285517, + "grad_norm": 0.6392670013128752, + "learning_rate": 5.249844816883923e-05, + "loss": 4.4834, + "step": 3383 + }, + { + "epoch": 0.15755290173894826, + "grad_norm": 0.6945524247561902, + "learning_rate": 5.251396648044693e-05, + "loss": 4.3395, + "step": 3384 + }, + { + "epoch": 0.15759945992504132, + "grad_norm": 0.9402435672739102, + "learning_rate": 5.252948479205463e-05, + "loss": 4.4103, + "step": 3385 + }, + { + "epoch": 0.15764601811113438, + "grad_norm": 0.9605214754516577, + "learning_rate": 5.2545003103662326e-05, + "loss": 4.5007, + "step": 3386 + }, + { + "epoch": 0.15769257629722747, + "grad_norm": 0.823584733764108, + "learning_rate": 5.256052141527003e-05, + "loss": 4.2814, + "step": 3387 + }, + { + "epoch": 0.15773913448332053, + "grad_norm": 0.9044629704693015, + "learning_rate": 5.257603972687771e-05, + "loss": 4.3321, + "step": 3388 + }, + { + "epoch": 0.1577856926694136, + "grad_norm": 0.8797091377627726, + "learning_rate": 5.2591558038485414e-05, + "loss": 4.3758, + "step": 3389 + }, + { + "epoch": 0.15783225085550667, + "grad_norm": 0.9186350931134588, + "learning_rate": 5.260707635009311e-05, + "loss": 4.3657, + "step": 3390 + }, + { + "epoch": 0.15787880904159973, + "grad_norm": 0.9206680524277979, + "learning_rate": 5.262259466170081e-05, + "loss": 4.5046, + "step": 3391 + }, + { + "epoch": 0.15792536722769282, + "grad_norm": 0.8032841918438954, + "learning_rate": 5.26381129733085e-05, + "loss": 4.5597, + "step": 3392 + }, + { + "epoch": 0.15797192541378588, + "grad_norm": 0.7860853769299192, + "learning_rate": 5.2653631284916206e-05, + "loss": 4.4263, + "step": 3393 + }, + { + "epoch": 0.15801848359987894, + "grad_norm": 0.7422384836398306, + "learning_rate": 5.26691495965239e-05, + "loss": 4.3673, + "step": 3394 + }, + { + "epoch": 0.15806504178597203, + "grad_norm": 0.741562500493458, + "learning_rate": 5.26846679081316e-05, + "loss": 4.3665, + "step": 3395 + }, + { + "epoch": 0.1581115999720651, + "grad_norm": 0.8164383139791906, + "learning_rate": 5.27001862197393e-05, + "loss": 4.4165, + "step": 3396 + }, + { + "epoch": 0.15815815815815815, + "grad_norm": 0.7450684133346436, + "learning_rate": 5.2715704531346985e-05, + "loss": 4.4343, + "step": 3397 + }, + { + "epoch": 0.15820471634425123, + "grad_norm": 0.7049650753432614, + "learning_rate": 5.273122284295469e-05, + "loss": 4.3778, + "step": 3398 + }, + { + "epoch": 0.1582512745303443, + "grad_norm": 0.7484437341500256, + "learning_rate": 5.2746741154562384e-05, + "loss": 4.4622, + "step": 3399 + }, + { + "epoch": 0.15829783271643738, + "grad_norm": 0.7725174023991347, + "learning_rate": 5.276225946617008e-05, + "loss": 4.4081, + "step": 3400 + }, + { + "epoch": 0.15834439090253044, + "grad_norm": 0.9569751525970074, + "learning_rate": 5.2777777777777784e-05, + "loss": 4.3942, + "step": 3401 + }, + { + "epoch": 0.1583909490886235, + "grad_norm": 0.9816466137845178, + "learning_rate": 5.279329608938548e-05, + "loss": 4.4586, + "step": 3402 + }, + { + "epoch": 0.1584375072747166, + "grad_norm": 0.7618474947104829, + "learning_rate": 5.2808814400993176e-05, + "loss": 4.3892, + "step": 3403 + }, + { + "epoch": 
0.15848406546080965, + "grad_norm": 0.7050436198280801, + "learning_rate": 5.282433271260088e-05, + "loss": 4.5408, + "step": 3404 + }, + { + "epoch": 0.1585306236469027, + "grad_norm": 0.7770756286249301, + "learning_rate": 5.283985102420856e-05, + "loss": 4.3964, + "step": 3405 + }, + { + "epoch": 0.1585771818329958, + "grad_norm": 0.7677325549209943, + "learning_rate": 5.2855369335816265e-05, + "loss": 4.4346, + "step": 3406 + }, + { + "epoch": 0.15862374001908885, + "grad_norm": 0.7493836185189011, + "learning_rate": 5.287088764742396e-05, + "loss": 4.4229, + "step": 3407 + }, + { + "epoch": 0.15867029820518191, + "grad_norm": 0.7589412837811695, + "learning_rate": 5.288640595903166e-05, + "loss": 4.4191, + "step": 3408 + }, + { + "epoch": 0.158716856391275, + "grad_norm": 0.8377478143240432, + "learning_rate": 5.290192427063936e-05, + "loss": 4.4667, + "step": 3409 + }, + { + "epoch": 0.15876341457736806, + "grad_norm": 0.6481891202152857, + "learning_rate": 5.291744258224706e-05, + "loss": 4.361, + "step": 3410 + }, + { + "epoch": 0.15880997276346115, + "grad_norm": 0.7735783058193505, + "learning_rate": 5.2932960893854754e-05, + "loss": 4.4647, + "step": 3411 + }, + { + "epoch": 0.1588565309495542, + "grad_norm": 0.7659731301615388, + "learning_rate": 5.294847920546246e-05, + "loss": 4.3659, + "step": 3412 + }, + { + "epoch": 0.15890308913564727, + "grad_norm": 0.6949307684665613, + "learning_rate": 5.296399751707014e-05, + "loss": 4.3993, + "step": 3413 + }, + { + "epoch": 0.15894964732174036, + "grad_norm": 0.6750815127645674, + "learning_rate": 5.297951582867784e-05, + "loss": 4.3384, + "step": 3414 + }, + { + "epoch": 0.15899620550783342, + "grad_norm": 0.6390242063256187, + "learning_rate": 5.299503414028554e-05, + "loss": 4.3062, + "step": 3415 + }, + { + "epoch": 0.15904276369392648, + "grad_norm": 0.8478202609575766, + "learning_rate": 5.3010552451893235e-05, + "loss": 4.2509, + "step": 3416 + }, + { + "epoch": 0.15908932188001956, + "grad_norm": 0.8184147802878529, + "learning_rate": 5.302607076350093e-05, + "loss": 4.3871, + "step": 3417 + }, + { + "epoch": 0.15913588006611262, + "grad_norm": 0.8247033889349603, + "learning_rate": 5.3041589075108634e-05, + "loss": 4.3338, + "step": 3418 + }, + { + "epoch": 0.15918243825220568, + "grad_norm": 0.7556354118380517, + "learning_rate": 5.305710738671633e-05, + "loss": 4.4209, + "step": 3419 + }, + { + "epoch": 0.15922899643829877, + "grad_norm": 0.8018788046483204, + "learning_rate": 5.307262569832403e-05, + "loss": 4.3353, + "step": 3420 + }, + { + "epoch": 0.15927555462439183, + "grad_norm": 0.7747460107605996, + "learning_rate": 5.308814400993172e-05, + "loss": 4.4286, + "step": 3421 + }, + { + "epoch": 0.15932211281048492, + "grad_norm": 0.832230645808079, + "learning_rate": 5.310366232153941e-05, + "loss": 4.4646, + "step": 3422 + }, + { + "epoch": 0.15936867099657798, + "grad_norm": 0.8414632649358877, + "learning_rate": 5.3119180633147116e-05, + "loss": 4.2922, + "step": 3423 + }, + { + "epoch": 0.15941522918267104, + "grad_norm": 0.794886685267759, + "learning_rate": 5.313469894475481e-05, + "loss": 4.344, + "step": 3424 + }, + { + "epoch": 0.15946178736876412, + "grad_norm": 0.6350633544248578, + "learning_rate": 5.315021725636251e-05, + "loss": 4.4392, + "step": 3425 + }, + { + "epoch": 0.15950834555485718, + "grad_norm": 0.6288643450199211, + "learning_rate": 5.316573556797021e-05, + "loss": 4.3355, + "step": 3426 + }, + { + "epoch": 0.15955490374095024, + "grad_norm": 0.7825201123272713, + "learning_rate": 
5.318125387957791e-05, + "loss": 4.3924, + "step": 3427 + }, + { + "epoch": 0.15960146192704333, + "grad_norm": 0.8912384159305606, + "learning_rate": 5.3196772191185604e-05, + "loss": 4.3694, + "step": 3428 + }, + { + "epoch": 0.1596480201131364, + "grad_norm": 0.7929395185371539, + "learning_rate": 5.3212290502793294e-05, + "loss": 4.4354, + "step": 3429 + }, + { + "epoch": 0.15969457829922945, + "grad_norm": 0.647495511632656, + "learning_rate": 5.322780881440099e-05, + "loss": 4.2833, + "step": 3430 + }, + { + "epoch": 0.15974113648532254, + "grad_norm": 0.7780254788681393, + "learning_rate": 5.324332712600869e-05, + "loss": 4.4728, + "step": 3431 + }, + { + "epoch": 0.1597876946714156, + "grad_norm": 0.7973790373120241, + "learning_rate": 5.325884543761639e-05, + "loss": 4.405, + "step": 3432 + }, + { + "epoch": 0.15983425285750869, + "grad_norm": 0.781153564063023, + "learning_rate": 5.3274363749224086e-05, + "loss": 4.3238, + "step": 3433 + }, + { + "epoch": 0.15988081104360174, + "grad_norm": 0.7147732340051173, + "learning_rate": 5.328988206083179e-05, + "loss": 4.4108, + "step": 3434 + }, + { + "epoch": 0.1599273692296948, + "grad_norm": 0.6923590255831287, + "learning_rate": 5.3305400372439485e-05, + "loss": 4.5057, + "step": 3435 + }, + { + "epoch": 0.1599739274157879, + "grad_norm": 0.7106927044619737, + "learning_rate": 5.332091868404718e-05, + "loss": 4.454, + "step": 3436 + }, + { + "epoch": 0.16002048560188095, + "grad_norm": 0.7030973328273191, + "learning_rate": 5.333643699565487e-05, + "loss": 4.3749, + "step": 3437 + }, + { + "epoch": 0.160067043787974, + "grad_norm": 0.8812913748838609, + "learning_rate": 5.335195530726257e-05, + "loss": 4.4105, + "step": 3438 + }, + { + "epoch": 0.1601136019740671, + "grad_norm": 0.951845281468237, + "learning_rate": 5.336747361887027e-05, + "loss": 4.3017, + "step": 3439 + }, + { + "epoch": 0.16016016016016016, + "grad_norm": 0.9510713082206312, + "learning_rate": 5.338299193047797e-05, + "loss": 4.3186, + "step": 3440 + }, + { + "epoch": 0.16020671834625322, + "grad_norm": 1.0406922810181973, + "learning_rate": 5.339851024208566e-05, + "loss": 4.3994, + "step": 3441 + }, + { + "epoch": 0.1602532765323463, + "grad_norm": 0.8052877313316824, + "learning_rate": 5.341402855369336e-05, + "loss": 4.3761, + "step": 3442 + }, + { + "epoch": 0.16029983471843937, + "grad_norm": 0.6626627943278627, + "learning_rate": 5.342954686530106e-05, + "loss": 4.3614, + "step": 3443 + }, + { + "epoch": 0.16034639290453245, + "grad_norm": 0.882674017623859, + "learning_rate": 5.344506517690876e-05, + "loss": 4.3506, + "step": 3444 + }, + { + "epoch": 0.1603929510906255, + "grad_norm": 0.9211772610196466, + "learning_rate": 5.346058348851645e-05, + "loss": 4.3666, + "step": 3445 + }, + { + "epoch": 0.16043950927671857, + "grad_norm": 1.0117465259159677, + "learning_rate": 5.3476101800124145e-05, + "loss": 4.4222, + "step": 3446 + }, + { + "epoch": 0.16048606746281166, + "grad_norm": 0.9510901123222312, + "learning_rate": 5.349162011173184e-05, + "loss": 4.4758, + "step": 3447 + }, + { + "epoch": 0.16053262564890472, + "grad_norm": 0.9246850878896293, + "learning_rate": 5.3507138423339544e-05, + "loss": 4.346, + "step": 3448 + }, + { + "epoch": 0.16057918383499778, + "grad_norm": 0.8630405582297579, + "learning_rate": 5.352265673494724e-05, + "loss": 4.3127, + "step": 3449 + }, + { + "epoch": 0.16062574202109087, + "grad_norm": 0.805126751021584, + "learning_rate": 5.353817504655494e-05, + "loss": 4.3052, + "step": 3450 + }, + { + "epoch": 
0.16067230020718393, + "grad_norm": 0.7125576234728754, + "learning_rate": 5.355369335816264e-05, + "loss": 4.3727, + "step": 3451 + }, + { + "epoch": 0.160718858393277, + "grad_norm": 0.7018243944000019, + "learning_rate": 5.3569211669770336e-05, + "loss": 4.2794, + "step": 3452 + }, + { + "epoch": 0.16076541657937007, + "grad_norm": 0.819808590409431, + "learning_rate": 5.3584729981378026e-05, + "loss": 4.3521, + "step": 3453 + }, + { + "epoch": 0.16081197476546313, + "grad_norm": 0.7443539965303767, + "learning_rate": 5.360024829298572e-05, + "loss": 4.3518, + "step": 3454 + }, + { + "epoch": 0.16085853295155622, + "grad_norm": 0.6018276275092397, + "learning_rate": 5.361576660459342e-05, + "loss": 4.3242, + "step": 3455 + }, + { + "epoch": 0.16090509113764928, + "grad_norm": 0.7357076356666193, + "learning_rate": 5.363128491620112e-05, + "loss": 4.4575, + "step": 3456 + }, + { + "epoch": 0.16095164932374234, + "grad_norm": 0.8185673100553317, + "learning_rate": 5.364680322780882e-05, + "loss": 4.3906, + "step": 3457 + }, + { + "epoch": 0.16099820750983543, + "grad_norm": 0.9204479775988779, + "learning_rate": 5.3662321539416514e-05, + "loss": 4.4512, + "step": 3458 + }, + { + "epoch": 0.1610447656959285, + "grad_norm": 1.0427872964267833, + "learning_rate": 5.367783985102422e-05, + "loss": 4.3696, + "step": 3459 + }, + { + "epoch": 0.16109132388202155, + "grad_norm": 0.793608525982375, + "learning_rate": 5.369335816263191e-05, + "loss": 4.3519, + "step": 3460 + }, + { + "epoch": 0.16113788206811464, + "grad_norm": 0.7035449682326105, + "learning_rate": 5.37088764742396e-05, + "loss": 4.4803, + "step": 3461 + }, + { + "epoch": 0.1611844402542077, + "grad_norm": 0.9571191165503925, + "learning_rate": 5.37243947858473e-05, + "loss": 4.4647, + "step": 3462 + }, + { + "epoch": 0.16123099844030075, + "grad_norm": 0.8977334866589358, + "learning_rate": 5.3739913097454995e-05, + "loss": 4.3926, + "step": 3463 + }, + { + "epoch": 0.16127755662639384, + "grad_norm": 0.7807921085294223, + "learning_rate": 5.37554314090627e-05, + "loss": 4.3669, + "step": 3464 + }, + { + "epoch": 0.1613241148124869, + "grad_norm": 0.9834079380991848, + "learning_rate": 5.3770949720670395e-05, + "loss": 4.3214, + "step": 3465 + }, + { + "epoch": 0.16137067299857996, + "grad_norm": 0.9353957810520374, + "learning_rate": 5.378646803227809e-05, + "loss": 4.4621, + "step": 3466 + }, + { + "epoch": 0.16141723118467305, + "grad_norm": 0.9171140228816672, + "learning_rate": 5.380198634388579e-05, + "loss": 4.5397, + "step": 3467 + }, + { + "epoch": 0.1614637893707661, + "grad_norm": 0.9160353164623262, + "learning_rate": 5.381750465549349e-05, + "loss": 4.3847, + "step": 3468 + }, + { + "epoch": 0.1615103475568592, + "grad_norm": 0.8696774787678836, + "learning_rate": 5.383302296710117e-05, + "loss": 4.3819, + "step": 3469 + }, + { + "epoch": 0.16155690574295226, + "grad_norm": 1.1814621274838841, + "learning_rate": 5.3848541278708876e-05, + "loss": 4.319, + "step": 3470 + }, + { + "epoch": 0.16160346392904532, + "grad_norm": 0.757595636091906, + "learning_rate": 5.386405959031657e-05, + "loss": 4.5354, + "step": 3471 + }, + { + "epoch": 0.1616500221151384, + "grad_norm": 0.8314112682300858, + "learning_rate": 5.387957790192427e-05, + "loss": 4.3541, + "step": 3472 + }, + { + "epoch": 0.16169658030123146, + "grad_norm": 0.9848091567517708, + "learning_rate": 5.389509621353197e-05, + "loss": 4.3925, + "step": 3473 + }, + { + "epoch": 0.16174313848732452, + "grad_norm": 1.074451226723814, + "learning_rate": 
5.391061452513967e-05, + "loss": 4.3943, + "step": 3474 + }, + { + "epoch": 0.1617896966734176, + "grad_norm": 0.7173848502326635, + "learning_rate": 5.3926132836747365e-05, + "loss": 4.4616, + "step": 3475 + }, + { + "epoch": 0.16183625485951067, + "grad_norm": 0.8912829836837585, + "learning_rate": 5.394165114835507e-05, + "loss": 4.3035, + "step": 3476 + }, + { + "epoch": 0.16188281304560373, + "grad_norm": 0.9413873001763812, + "learning_rate": 5.3957169459962764e-05, + "loss": 4.4839, + "step": 3477 + }, + { + "epoch": 0.16192937123169682, + "grad_norm": 0.7058051073387468, + "learning_rate": 5.3972687771570454e-05, + "loss": 4.3731, + "step": 3478 + }, + { + "epoch": 0.16197592941778988, + "grad_norm": 0.9132641888276638, + "learning_rate": 5.398820608317815e-05, + "loss": 4.3173, + "step": 3479 + }, + { + "epoch": 0.16202248760388296, + "grad_norm": 0.9276196489890093, + "learning_rate": 5.4003724394785846e-05, + "loss": 4.378, + "step": 3480 + }, + { + "epoch": 0.16206904578997602, + "grad_norm": 0.7910371651259266, + "learning_rate": 5.401924270639355e-05, + "loss": 4.3093, + "step": 3481 + }, + { + "epoch": 0.16211560397606908, + "grad_norm": 0.9668436859395995, + "learning_rate": 5.4034761018001246e-05, + "loss": 4.3337, + "step": 3482 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 0.8912470654497674, + "learning_rate": 5.405027932960894e-05, + "loss": 4.2898, + "step": 3483 + }, + { + "epoch": 0.16220872034825523, + "grad_norm": 0.831573782617506, + "learning_rate": 5.4065797641216645e-05, + "loss": 4.4639, + "step": 3484 + }, + { + "epoch": 0.1622552785343483, + "grad_norm": 0.7862306941962272, + "learning_rate": 5.408131595282434e-05, + "loss": 4.324, + "step": 3485 + }, + { + "epoch": 0.16230183672044138, + "grad_norm": 0.7608756011876097, + "learning_rate": 5.409683426443203e-05, + "loss": 4.2796, + "step": 3486 + }, + { + "epoch": 0.16234839490653444, + "grad_norm": 0.8817827585763353, + "learning_rate": 5.411235257603973e-05, + "loss": 4.3899, + "step": 3487 + }, + { + "epoch": 0.1623949530926275, + "grad_norm": 0.8788720352235896, + "learning_rate": 5.4127870887647423e-05, + "loss": 4.293, + "step": 3488 + }, + { + "epoch": 0.16244151127872059, + "grad_norm": 0.8792799110191375, + "learning_rate": 5.4143389199255126e-05, + "loss": 4.4301, + "step": 3489 + }, + { + "epoch": 0.16248806946481364, + "grad_norm": 0.8009180569694146, + "learning_rate": 5.415890751086282e-05, + "loss": 4.474, + "step": 3490 + }, + { + "epoch": 0.16253462765090673, + "grad_norm": 0.8393593108691537, + "learning_rate": 5.417442582247052e-05, + "loss": 4.2661, + "step": 3491 + }, + { + "epoch": 0.1625811858369998, + "grad_norm": 0.7701725326889896, + "learning_rate": 5.4189944134078215e-05, + "loss": 4.3148, + "step": 3492 + }, + { + "epoch": 0.16262774402309285, + "grad_norm": 0.7212912818638455, + "learning_rate": 5.420546244568592e-05, + "loss": 4.2977, + "step": 3493 + }, + { + "epoch": 0.16267430220918594, + "grad_norm": 0.7817533003712773, + "learning_rate": 5.42209807572936e-05, + "loss": 4.1716, + "step": 3494 + }, + { + "epoch": 0.162720860395279, + "grad_norm": 0.8852774584385404, + "learning_rate": 5.4236499068901304e-05, + "loss": 4.245, + "step": 3495 + }, + { + "epoch": 0.16276741858137206, + "grad_norm": 0.8794207887971013, + "learning_rate": 5.4252017380509e-05, + "loss": 4.3163, + "step": 3496 + }, + { + "epoch": 0.16281397676746515, + "grad_norm": 0.8271624008125763, + "learning_rate": 5.42675356921167e-05, + "loss": 4.4315, + "step": 3497 + }, + { + "epoch": 
0.1628605349535582, + "grad_norm": 0.7748276394608183, + "learning_rate": 5.42830540037244e-05, + "loss": 4.3947, + "step": 3498 + }, + { + "epoch": 0.16290709313965127, + "grad_norm": 0.8298284392509151, + "learning_rate": 5.4298572315332096e-05, + "loss": 4.2682, + "step": 3499 + }, + { + "epoch": 0.16295365132574435, + "grad_norm": 0.9344108101944629, + "learning_rate": 5.431409062693979e-05, + "loss": 4.4001, + "step": 3500 + }, + { + "epoch": 0.1630002095118374, + "grad_norm": 0.8805304980210626, + "learning_rate": 5.4329608938547496e-05, + "loss": 4.3321, + "step": 3501 + }, + { + "epoch": 0.1630467676979305, + "grad_norm": 0.7822955679571166, + "learning_rate": 5.434512725015518e-05, + "loss": 4.3533, + "step": 3502 + }, + { + "epoch": 0.16309332588402356, + "grad_norm": 0.9660653430171656, + "learning_rate": 5.436064556176288e-05, + "loss": 4.3661, + "step": 3503 + }, + { + "epoch": 0.16313988407011662, + "grad_norm": 0.9825751658439119, + "learning_rate": 5.437616387337058e-05, + "loss": 4.3728, + "step": 3504 + }, + { + "epoch": 0.1631864422562097, + "grad_norm": 0.8637464319310102, + "learning_rate": 5.4391682184978274e-05, + "loss": 4.2537, + "step": 3505 + }, + { + "epoch": 0.16323300044230277, + "grad_norm": 0.91661088632259, + "learning_rate": 5.440720049658598e-05, + "loss": 4.3077, + "step": 3506 + }, + { + "epoch": 0.16327955862839583, + "grad_norm": 0.8713016922895398, + "learning_rate": 5.4422718808193674e-05, + "loss": 4.4242, + "step": 3507 + }, + { + "epoch": 0.16332611681448891, + "grad_norm": 0.8871711364839414, + "learning_rate": 5.443823711980137e-05, + "loss": 4.391, + "step": 3508 + }, + { + "epoch": 0.16337267500058197, + "grad_norm": 0.8966859461201996, + "learning_rate": 5.445375543140907e-05, + "loss": 4.2706, + "step": 3509 + }, + { + "epoch": 0.16341923318667503, + "grad_norm": 0.9478539800631736, + "learning_rate": 5.4469273743016756e-05, + "loss": 4.3743, + "step": 3510 + }, + { + "epoch": 0.16346579137276812, + "grad_norm": 0.7764647185287722, + "learning_rate": 5.448479205462446e-05, + "loss": 4.4307, + "step": 3511 + }, + { + "epoch": 0.16351234955886118, + "grad_norm": 0.9716299463423326, + "learning_rate": 5.4500310366232155e-05, + "loss": 4.5114, + "step": 3512 + }, + { + "epoch": 0.16355890774495427, + "grad_norm": 1.06919661132965, + "learning_rate": 5.451582867783985e-05, + "loss": 4.4371, + "step": 3513 + }, + { + "epoch": 0.16360546593104733, + "grad_norm": 0.9730006715824598, + "learning_rate": 5.4531346989447555e-05, + "loss": 4.3545, + "step": 3514 + }, + { + "epoch": 0.1636520241171404, + "grad_norm": 1.0947489815885676, + "learning_rate": 5.454686530105525e-05, + "loss": 4.4373, + "step": 3515 + }, + { + "epoch": 0.16369858230323348, + "grad_norm": 0.7981937786028832, + "learning_rate": 5.456238361266295e-05, + "loss": 4.2767, + "step": 3516 + }, + { + "epoch": 0.16374514048932653, + "grad_norm": 0.7613501770027933, + "learning_rate": 5.457790192427065e-05, + "loss": 4.2704, + "step": 3517 + }, + { + "epoch": 0.1637916986754196, + "grad_norm": 0.9184781254095116, + "learning_rate": 5.459342023587833e-05, + "loss": 4.3932, + "step": 3518 + }, + { + "epoch": 0.16383825686151268, + "grad_norm": 0.8231403362984103, + "learning_rate": 5.4608938547486036e-05, + "loss": 4.3331, + "step": 3519 + }, + { + "epoch": 0.16388481504760574, + "grad_norm": 0.8698809823039553, + "learning_rate": 5.462445685909373e-05, + "loss": 4.3556, + "step": 3520 + }, + { + "epoch": 0.1639313732336988, + "grad_norm": 0.8001988230030764, + "learning_rate": 
5.463997517070143e-05, + "loss": 4.3403, + "step": 3521 + }, + { + "epoch": 0.1639779314197919, + "grad_norm": 0.6872935374472442, + "learning_rate": 5.4655493482309125e-05, + "loss": 4.3419, + "step": 3522 + }, + { + "epoch": 0.16402448960588495, + "grad_norm": 0.7357105918512896, + "learning_rate": 5.467101179391683e-05, + "loss": 4.3159, + "step": 3523 + }, + { + "epoch": 0.16407104779197804, + "grad_norm": 0.675185169177603, + "learning_rate": 5.4686530105524524e-05, + "loss": 4.4695, + "step": 3524 + }, + { + "epoch": 0.1641176059780711, + "grad_norm": 0.6734379937435145, + "learning_rate": 5.470204841713222e-05, + "loss": 4.3776, + "step": 3525 + }, + { + "epoch": 0.16416416416416416, + "grad_norm": 0.6811859173900405, + "learning_rate": 5.471756672873991e-05, + "loss": 4.4236, + "step": 3526 + }, + { + "epoch": 0.16421072235025724, + "grad_norm": 0.6455384835349712, + "learning_rate": 5.4733085040347606e-05, + "loss": 4.4158, + "step": 3527 + }, + { + "epoch": 0.1642572805363503, + "grad_norm": 0.6458045488910394, + "learning_rate": 5.474860335195531e-05, + "loss": 4.3484, + "step": 3528 + }, + { + "epoch": 0.16430383872244336, + "grad_norm": 0.5950616148667676, + "learning_rate": 5.4764121663563006e-05, + "loss": 4.3334, + "step": 3529 + }, + { + "epoch": 0.16435039690853645, + "grad_norm": 0.7107582547747479, + "learning_rate": 5.47796399751707e-05, + "loss": 4.4024, + "step": 3530 + }, + { + "epoch": 0.1643969550946295, + "grad_norm": 0.6927464696301946, + "learning_rate": 5.4795158286778405e-05, + "loss": 4.2547, + "step": 3531 + }, + { + "epoch": 0.16444351328072257, + "grad_norm": 0.7999323615601156, + "learning_rate": 5.48106765983861e-05, + "loss": 4.317, + "step": 3532 + }, + { + "epoch": 0.16449007146681566, + "grad_norm": 0.846377642580368, + "learning_rate": 5.48261949099938e-05, + "loss": 4.3338, + "step": 3533 + }, + { + "epoch": 0.16453662965290872, + "grad_norm": 0.863046097016371, + "learning_rate": 5.484171322160149e-05, + "loss": 4.3075, + "step": 3534 + }, + { + "epoch": 0.1645831878390018, + "grad_norm": 0.7800965062336658, + "learning_rate": 5.4857231533209184e-05, + "loss": 4.3312, + "step": 3535 + }, + { + "epoch": 0.16462974602509486, + "grad_norm": 0.7991451998349715, + "learning_rate": 5.487274984481689e-05, + "loss": 4.372, + "step": 3536 + }, + { + "epoch": 0.16467630421118792, + "grad_norm": 0.7108381430863854, + "learning_rate": 5.488826815642458e-05, + "loss": 4.2676, + "step": 3537 + }, + { + "epoch": 0.164722862397281, + "grad_norm": 0.6962614663589834, + "learning_rate": 5.490378646803228e-05, + "loss": 4.3702, + "step": 3538 + }, + { + "epoch": 0.16476942058337407, + "grad_norm": 0.770896957574041, + "learning_rate": 5.491930477963998e-05, + "loss": 4.3393, + "step": 3539 + }, + { + "epoch": 0.16481597876946713, + "grad_norm": 0.7471413515593559, + "learning_rate": 5.493482309124768e-05, + "loss": 4.2591, + "step": 3540 + }, + { + "epoch": 0.16486253695556022, + "grad_norm": 0.6706245425390303, + "learning_rate": 5.4950341402855375e-05, + "loss": 4.1339, + "step": 3541 + }, + { + "epoch": 0.16490909514165328, + "grad_norm": 0.6584008813851872, + "learning_rate": 5.4965859714463065e-05, + "loss": 4.2986, + "step": 3542 + }, + { + "epoch": 0.16495565332774634, + "grad_norm": 0.7492373138737867, + "learning_rate": 5.498137802607076e-05, + "loss": 4.4169, + "step": 3543 + }, + { + "epoch": 0.16500221151383943, + "grad_norm": 0.6633112942393442, + "learning_rate": 5.4996896337678464e-05, + "loss": 4.3496, + "step": 3544 + }, + { + "epoch": 
0.16504876969993248, + "grad_norm": 0.7739133844148739, + "learning_rate": 5.501241464928616e-05, + "loss": 4.3829, + "step": 3545 + }, + { + "epoch": 0.16509532788602557, + "grad_norm": 0.7454180522202253, + "learning_rate": 5.502793296089386e-05, + "loss": 4.3548, + "step": 3546 + }, + { + "epoch": 0.16514188607211863, + "grad_norm": 0.8504106426638061, + "learning_rate": 5.504345127250155e-05, + "loss": 4.3722, + "step": 3547 + }, + { + "epoch": 0.1651884442582117, + "grad_norm": 1.0767095586435038, + "learning_rate": 5.5058969584109256e-05, + "loss": 4.3582, + "step": 3548 + }, + { + "epoch": 0.16523500244430478, + "grad_norm": 1.0752969407853064, + "learning_rate": 5.507448789571695e-05, + "loss": 4.3695, + "step": 3549 + }, + { + "epoch": 0.16528156063039784, + "grad_norm": 0.788948067738338, + "learning_rate": 5.509000620732464e-05, + "loss": 4.3631, + "step": 3550 + }, + { + "epoch": 0.1653281188164909, + "grad_norm": 0.8520890431905562, + "learning_rate": 5.510552451893234e-05, + "loss": 4.4092, + "step": 3551 + }, + { + "epoch": 0.165374677002584, + "grad_norm": 0.9707397046179916, + "learning_rate": 5.5121042830540035e-05, + "loss": 4.2696, + "step": 3552 + }, + { + "epoch": 0.16542123518867705, + "grad_norm": 0.8347652261608222, + "learning_rate": 5.513656114214774e-05, + "loss": 4.3, + "step": 3553 + }, + { + "epoch": 0.1654677933747701, + "grad_norm": 0.7115077266211706, + "learning_rate": 5.5152079453755434e-05, + "loss": 4.3252, + "step": 3554 + }, + { + "epoch": 0.1655143515608632, + "grad_norm": 0.893608035528768, + "learning_rate": 5.516759776536313e-05, + "loss": 4.212, + "step": 3555 + }, + { + "epoch": 0.16556090974695625, + "grad_norm": 0.8938473508668396, + "learning_rate": 5.518311607697083e-05, + "loss": 4.4187, + "step": 3556 + }, + { + "epoch": 0.16560746793304934, + "grad_norm": 0.748212693341169, + "learning_rate": 5.519863438857853e-05, + "loss": 4.4922, + "step": 3557 + }, + { + "epoch": 0.1656540261191424, + "grad_norm": 0.7800877981024605, + "learning_rate": 5.521415270018622e-05, + "loss": 4.3521, + "step": 3558 + }, + { + "epoch": 0.16570058430523546, + "grad_norm": 0.8218783902157765, + "learning_rate": 5.5229671011793915e-05, + "loss": 4.3018, + "step": 3559 + }, + { + "epoch": 0.16574714249132855, + "grad_norm": 0.8957727870481731, + "learning_rate": 5.524518932340161e-05, + "loss": 4.3971, + "step": 3560 + }, + { + "epoch": 0.1657937006774216, + "grad_norm": 0.9741767419861092, + "learning_rate": 5.5260707635009315e-05, + "loss": 4.3961, + "step": 3561 + }, + { + "epoch": 0.16584025886351467, + "grad_norm": 0.8013613073293034, + "learning_rate": 5.527622594661701e-05, + "loss": 4.3734, + "step": 3562 + }, + { + "epoch": 0.16588681704960775, + "grad_norm": 0.7854432071748912, + "learning_rate": 5.529174425822471e-05, + "loss": 4.2363, + "step": 3563 + }, + { + "epoch": 0.16593337523570081, + "grad_norm": 0.809575430439254, + "learning_rate": 5.530726256983241e-05, + "loss": 4.3847, + "step": 3564 + }, + { + "epoch": 0.16597993342179387, + "grad_norm": 0.6713260923982828, + "learning_rate": 5.532278088144011e-05, + "loss": 4.4094, + "step": 3565 + }, + { + "epoch": 0.16602649160788696, + "grad_norm": 0.8092688343865966, + "learning_rate": 5.53382991930478e-05, + "loss": 4.3752, + "step": 3566 + }, + { + "epoch": 0.16607304979398002, + "grad_norm": 0.7089083659459936, + "learning_rate": 5.535381750465549e-05, + "loss": 4.3127, + "step": 3567 + }, + { + "epoch": 0.1661196079800731, + "grad_norm": 0.7167828986877854, + "learning_rate": 
5.536933581626319e-05, + "loss": 4.4004, + "step": 3568 + }, + { + "epoch": 0.16616616616616617, + "grad_norm": 0.697222389956952, + "learning_rate": 5.538485412787089e-05, + "loss": 4.353, + "step": 3569 + }, + { + "epoch": 0.16621272435225923, + "grad_norm": 0.7869895329878736, + "learning_rate": 5.540037243947859e-05, + "loss": 4.3657, + "step": 3570 + }, + { + "epoch": 0.16625928253835232, + "grad_norm": 0.7327638382397353, + "learning_rate": 5.5415890751086285e-05, + "loss": 4.3672, + "step": 3571 + }, + { + "epoch": 0.16630584072444538, + "grad_norm": 0.7094514246710966, + "learning_rate": 5.543140906269398e-05, + "loss": 4.3937, + "step": 3572 + }, + { + "epoch": 0.16635239891053843, + "grad_norm": 0.6790425987508933, + "learning_rate": 5.5446927374301684e-05, + "loss": 4.226, + "step": 3573 + }, + { + "epoch": 0.16639895709663152, + "grad_norm": 0.6280489371045338, + "learning_rate": 5.546244568590938e-05, + "loss": 4.3384, + "step": 3574 + }, + { + "epoch": 0.16644551528272458, + "grad_norm": 0.618074773895394, + "learning_rate": 5.547796399751707e-05, + "loss": 4.2864, + "step": 3575 + }, + { + "epoch": 0.16649207346881764, + "grad_norm": 0.6670307884242884, + "learning_rate": 5.5493482309124766e-05, + "loss": 4.3249, + "step": 3576 + }, + { + "epoch": 0.16653863165491073, + "grad_norm": 0.6606977641165576, + "learning_rate": 5.550900062073246e-05, + "loss": 4.3325, + "step": 3577 + }, + { + "epoch": 0.1665851898410038, + "grad_norm": 0.6528210560753869, + "learning_rate": 5.5524518932340166e-05, + "loss": 4.2691, + "step": 3578 + }, + { + "epoch": 0.16663174802709688, + "grad_norm": 0.7146692643447723, + "learning_rate": 5.554003724394786e-05, + "loss": 4.4402, + "step": 3579 + }, + { + "epoch": 0.16667830621318994, + "grad_norm": 0.8490572191228138, + "learning_rate": 5.555555555555556e-05, + "loss": 4.336, + "step": 3580 + }, + { + "epoch": 0.166724864399283, + "grad_norm": 1.0241576557201868, + "learning_rate": 5.557107386716326e-05, + "loss": 4.3561, + "step": 3581 + }, + { + "epoch": 0.16677142258537608, + "grad_norm": 0.9063605926011676, + "learning_rate": 5.558659217877096e-05, + "loss": 4.2751, + "step": 3582 + }, + { + "epoch": 0.16681798077146914, + "grad_norm": 0.7299479981600686, + "learning_rate": 5.560211049037865e-05, + "loss": 4.3942, + "step": 3583 + }, + { + "epoch": 0.1668645389575622, + "grad_norm": 0.6254249169672313, + "learning_rate": 5.5617628801986343e-05, + "loss": 4.3194, + "step": 3584 + }, + { + "epoch": 0.1669110971436553, + "grad_norm": 0.8048899493162577, + "learning_rate": 5.563314711359404e-05, + "loss": 4.4122, + "step": 3585 + }, + { + "epoch": 0.16695765532974835, + "grad_norm": 0.9144429655252937, + "learning_rate": 5.564866542520174e-05, + "loss": 4.3584, + "step": 3586 + }, + { + "epoch": 0.1670042135158414, + "grad_norm": 0.8765607590805536, + "learning_rate": 5.566418373680944e-05, + "loss": 4.3773, + "step": 3587 + }, + { + "epoch": 0.1670507717019345, + "grad_norm": 0.9941049771565581, + "learning_rate": 5.5679702048417135e-05, + "loss": 4.3134, + "step": 3588 + }, + { + "epoch": 0.16709732988802756, + "grad_norm": 0.8812769849171355, + "learning_rate": 5.569522036002484e-05, + "loss": 4.2091, + "step": 3589 + }, + { + "epoch": 0.16714388807412064, + "grad_norm": 0.7016733112557193, + "learning_rate": 5.5710738671632535e-05, + "loss": 4.2771, + "step": 3590 + }, + { + "epoch": 0.1671904462602137, + "grad_norm": 0.7961512836483862, + "learning_rate": 5.5726256983240224e-05, + "loss": 4.2793, + "step": 3591 + }, + { + "epoch": 
0.16723700444630676, + "grad_norm": 0.7382808794729243, + "learning_rate": 5.574177529484792e-05, + "loss": 4.4113, + "step": 3592 + }, + { + "epoch": 0.16728356263239985, + "grad_norm": 0.7873819926352505, + "learning_rate": 5.575729360645562e-05, + "loss": 4.5202, + "step": 3593 + }, + { + "epoch": 0.1673301208184929, + "grad_norm": 0.8050774997026645, + "learning_rate": 5.577281191806332e-05, + "loss": 4.2985, + "step": 3594 + }, + { + "epoch": 0.16737667900458597, + "grad_norm": 0.8287887456064305, + "learning_rate": 5.5788330229671016e-05, + "loss": 4.4022, + "step": 3595 + }, + { + "epoch": 0.16742323719067906, + "grad_norm": 0.7790454638634989, + "learning_rate": 5.580384854127871e-05, + "loss": 4.4171, + "step": 3596 + }, + { + "epoch": 0.16746979537677212, + "grad_norm": 0.7090497592537184, + "learning_rate": 5.581936685288641e-05, + "loss": 4.3061, + "step": 3597 + }, + { + "epoch": 0.16751635356286518, + "grad_norm": 0.7193817693493046, + "learning_rate": 5.583488516449411e-05, + "loss": 4.3814, + "step": 3598 + }, + { + "epoch": 0.16756291174895827, + "grad_norm": 0.7599839420011457, + "learning_rate": 5.5850403476101795e-05, + "loss": 4.4286, + "step": 3599 + }, + { + "epoch": 0.16760946993505133, + "grad_norm": 0.8407710240184195, + "learning_rate": 5.58659217877095e-05, + "loss": 4.2776, + "step": 3600 + }, + { + "epoch": 0.1676560281211444, + "grad_norm": 0.8137848960111327, + "learning_rate": 5.5881440099317194e-05, + "loss": 4.2999, + "step": 3601 + }, + { + "epoch": 0.16770258630723747, + "grad_norm": 0.7647879301587025, + "learning_rate": 5.589695841092489e-05, + "loss": 4.3922, + "step": 3602 + }, + { + "epoch": 0.16774914449333053, + "grad_norm": 0.8007631466140895, + "learning_rate": 5.5912476722532594e-05, + "loss": 4.3695, + "step": 3603 + }, + { + "epoch": 0.16779570267942362, + "grad_norm": 0.8051476401819762, + "learning_rate": 5.592799503414029e-05, + "loss": 4.3638, + "step": 3604 + }, + { + "epoch": 0.16784226086551668, + "grad_norm": 0.7857068260691173, + "learning_rate": 5.5943513345747986e-05, + "loss": 4.3008, + "step": 3605 + }, + { + "epoch": 0.16788881905160974, + "grad_norm": 0.8772375095542485, + "learning_rate": 5.595903165735569e-05, + "loss": 4.3167, + "step": 3606 + }, + { + "epoch": 0.16793537723770283, + "grad_norm": 0.8418208665723064, + "learning_rate": 5.597454996896337e-05, + "loss": 4.4376, + "step": 3607 + }, + { + "epoch": 0.1679819354237959, + "grad_norm": 0.8276012653612281, + "learning_rate": 5.5990068280571075e-05, + "loss": 4.2361, + "step": 3608 + }, + { + "epoch": 0.16802849360988895, + "grad_norm": 0.6750280872976829, + "learning_rate": 5.600558659217877e-05, + "loss": 4.2868, + "step": 3609 + }, + { + "epoch": 0.16807505179598203, + "grad_norm": 0.6719024186826245, + "learning_rate": 5.602110490378647e-05, + "loss": 4.3478, + "step": 3610 + }, + { + "epoch": 0.1681216099820751, + "grad_norm": 0.9040365617580138, + "learning_rate": 5.603662321539417e-05, + "loss": 4.3279, + "step": 3611 + }, + { + "epoch": 0.16816816816816818, + "grad_norm": 0.804275951773957, + "learning_rate": 5.605214152700187e-05, + "loss": 4.32, + "step": 3612 + }, + { + "epoch": 0.16821472635426124, + "grad_norm": 0.572966209227738, + "learning_rate": 5.6067659838609563e-05, + "loss": 4.3032, + "step": 3613 + }, + { + "epoch": 0.1682612845403543, + "grad_norm": 0.8013407901286724, + "learning_rate": 5.6083178150217267e-05, + "loss": 4.2537, + "step": 3614 + }, + { + "epoch": 0.1683078427264474, + "grad_norm": 0.7877751398824584, + "learning_rate": 
5.609869646182495e-05, + "loss": 4.2283, + "step": 3615 + }, + { + "epoch": 0.16835440091254045, + "grad_norm": 0.8162922061144053, + "learning_rate": 5.611421477343265e-05, + "loss": 4.4519, + "step": 3616 + }, + { + "epoch": 0.1684009590986335, + "grad_norm": 0.9918626059616131, + "learning_rate": 5.612973308504035e-05, + "loss": 4.3608, + "step": 3617 + }, + { + "epoch": 0.1684475172847266, + "grad_norm": 1.0777945392888137, + "learning_rate": 5.6145251396648045e-05, + "loss": 4.3665, + "step": 3618 + }, + { + "epoch": 0.16849407547081965, + "grad_norm": 0.9208484000264621, + "learning_rate": 5.616076970825575e-05, + "loss": 4.3445, + "step": 3619 + }, + { + "epoch": 0.16854063365691271, + "grad_norm": 0.9931693314166439, + "learning_rate": 5.6176288019863444e-05, + "loss": 4.2362, + "step": 3620 + }, + { + "epoch": 0.1685871918430058, + "grad_norm": 0.9862579006336506, + "learning_rate": 5.619180633147114e-05, + "loss": 4.3833, + "step": 3621 + }, + { + "epoch": 0.16863375002909886, + "grad_norm": 0.7897325192411315, + "learning_rate": 5.620732464307884e-05, + "loss": 4.3365, + "step": 3622 + }, + { + "epoch": 0.16868030821519195, + "grad_norm": 0.964954209658312, + "learning_rate": 5.6222842954686527e-05, + "loss": 4.4174, + "step": 3623 + }, + { + "epoch": 0.168726866401285, + "grad_norm": 1.0509668285966625, + "learning_rate": 5.623836126629422e-05, + "loss": 4.371, + "step": 3624 + }, + { + "epoch": 0.16877342458737807, + "grad_norm": 0.8120605249480961, + "learning_rate": 5.6253879577901926e-05, + "loss": 4.3885, + "step": 3625 + }, + { + "epoch": 0.16881998277347116, + "grad_norm": 0.8928403152530587, + "learning_rate": 5.626939788950962e-05, + "loss": 4.2625, + "step": 3626 + }, + { + "epoch": 0.16886654095956422, + "grad_norm": 0.8695272417103678, + "learning_rate": 5.628491620111732e-05, + "loss": 4.3322, + "step": 3627 + }, + { + "epoch": 0.16891309914565728, + "grad_norm": 0.7011205661699005, + "learning_rate": 5.630043451272502e-05, + "loss": 4.3883, + "step": 3628 + }, + { + "epoch": 0.16895965733175036, + "grad_norm": 0.8341153229985455, + "learning_rate": 5.631595282433272e-05, + "loss": 4.2221, + "step": 3629 + }, + { + "epoch": 0.16900621551784342, + "grad_norm": 0.8636422601572483, + "learning_rate": 5.6331471135940414e-05, + "loss": 4.3558, + "step": 3630 + }, + { + "epoch": 0.16905277370393648, + "grad_norm": 0.7846749290565171, + "learning_rate": 5.6346989447548104e-05, + "loss": 4.3775, + "step": 3631 + }, + { + "epoch": 0.16909933189002957, + "grad_norm": 0.7447300550356527, + "learning_rate": 5.63625077591558e-05, + "loss": 4.2584, + "step": 3632 + }, + { + "epoch": 0.16914589007612263, + "grad_norm": 0.722549565221077, + "learning_rate": 5.63780260707635e-05, + "loss": 4.3118, + "step": 3633 + }, + { + "epoch": 0.16919244826221572, + "grad_norm": 0.8062989960317382, + "learning_rate": 5.63935443823712e-05, + "loss": 4.2966, + "step": 3634 + }, + { + "epoch": 0.16923900644830878, + "grad_norm": 0.7075664810649128, + "learning_rate": 5.6409062693978896e-05, + "loss": 4.3862, + "step": 3635 + }, + { + "epoch": 0.16928556463440184, + "grad_norm": 0.6410193762248383, + "learning_rate": 5.64245810055866e-05, + "loss": 4.3943, + "step": 3636 + }, + { + "epoch": 0.16933212282049492, + "grad_norm": 0.7394203964701197, + "learning_rate": 5.6440099317194295e-05, + "loss": 4.2679, + "step": 3637 + }, + { + "epoch": 0.16937868100658798, + "grad_norm": 0.7084069318827085, + "learning_rate": 5.645561762880199e-05, + "loss": 4.3566, + "step": 3638 + }, + { + "epoch": 
0.16942523919268104, + "grad_norm": 0.5811841205558723, + "learning_rate": 5.647113594040968e-05, + "loss": 4.2203, + "step": 3639 + }, + { + "epoch": 0.16947179737877413, + "grad_norm": 0.6961362404727476, + "learning_rate": 5.648665425201738e-05, + "loss": 4.3342, + "step": 3640 + }, + { + "epoch": 0.1695183555648672, + "grad_norm": 0.7310109636223873, + "learning_rate": 5.650217256362508e-05, + "loss": 4.2848, + "step": 3641 + }, + { + "epoch": 0.16956491375096025, + "grad_norm": 0.6805537277904329, + "learning_rate": 5.651769087523278e-05, + "loss": 4.4413, + "step": 3642 + }, + { + "epoch": 0.16961147193705334, + "grad_norm": 0.6879186511735615, + "learning_rate": 5.653320918684047e-05, + "loss": 4.2537, + "step": 3643 + }, + { + "epoch": 0.1696580301231464, + "grad_norm": 0.6600367404040266, + "learning_rate": 5.6548727498448176e-05, + "loss": 4.2892, + "step": 3644 + }, + { + "epoch": 0.16970458830923948, + "grad_norm": 0.5488591621994034, + "learning_rate": 5.656424581005587e-05, + "loss": 4.3631, + "step": 3645 + }, + { + "epoch": 0.16975114649533254, + "grad_norm": 0.6134838891395507, + "learning_rate": 5.657976412166357e-05, + "loss": 4.3367, + "step": 3646 + }, + { + "epoch": 0.1697977046814256, + "grad_norm": 0.7165985198604459, + "learning_rate": 5.6595282433271265e-05, + "loss": 4.1984, + "step": 3647 + }, + { + "epoch": 0.1698442628675187, + "grad_norm": 0.6188226458630796, + "learning_rate": 5.6610800744878955e-05, + "loss": 4.3234, + "step": 3648 + }, + { + "epoch": 0.16989082105361175, + "grad_norm": 0.5994598428379075, + "learning_rate": 5.662631905648665e-05, + "loss": 4.443, + "step": 3649 + }, + { + "epoch": 0.1699373792397048, + "grad_norm": 0.749782488134663, + "learning_rate": 5.6641837368094354e-05, + "loss": 4.2241, + "step": 3650 + }, + { + "epoch": 0.1699839374257979, + "grad_norm": 0.8205170405967519, + "learning_rate": 5.665735567970205e-05, + "loss": 4.2941, + "step": 3651 + }, + { + "epoch": 0.17003049561189096, + "grad_norm": 0.6557799531743879, + "learning_rate": 5.6672873991309747e-05, + "loss": 4.2893, + "step": 3652 + }, + { + "epoch": 0.17007705379798402, + "grad_norm": 0.6292398533125002, + "learning_rate": 5.668839230291745e-05, + "loss": 4.2225, + "step": 3653 + }, + { + "epoch": 0.1701236119840771, + "grad_norm": 0.7115142876814131, + "learning_rate": 5.6703910614525146e-05, + "loss": 4.389, + "step": 3654 + }, + { + "epoch": 0.17017017017017017, + "grad_norm": 0.9353533597219801, + "learning_rate": 5.671942892613284e-05, + "loss": 4.2242, + "step": 3655 + }, + { + "epoch": 0.17021672835626325, + "grad_norm": 0.8541488228497186, + "learning_rate": 5.673494723774053e-05, + "loss": 4.3326, + "step": 3656 + }, + { + "epoch": 0.1702632865423563, + "grad_norm": 0.6509838762871971, + "learning_rate": 5.675046554934823e-05, + "loss": 4.2045, + "step": 3657 + }, + { + "epoch": 0.17030984472844937, + "grad_norm": 0.7806250924443233, + "learning_rate": 5.676598386095593e-05, + "loss": 4.2722, + "step": 3658 + }, + { + "epoch": 0.17035640291454246, + "grad_norm": 0.9068797629542394, + "learning_rate": 5.678150217256363e-05, + "loss": 4.2153, + "step": 3659 + }, + { + "epoch": 0.17040296110063552, + "grad_norm": 0.7766996418044181, + "learning_rate": 5.6797020484171324e-05, + "loss": 4.3054, + "step": 3660 + }, + { + "epoch": 0.17044951928672858, + "grad_norm": 0.6767573296715086, + "learning_rate": 5.681253879577903e-05, + "loss": 4.1975, + "step": 3661 + }, + { + "epoch": 0.17049607747282167, + "grad_norm": 0.7996669890764658, + "learning_rate": 
5.682805710738672e-05, + "loss": 4.1201, + "step": 3662 + }, + { + "epoch": 0.17054263565891473, + "grad_norm": 0.7831895121442253, + "learning_rate": 5.684357541899442e-05, + "loss": 4.2632, + "step": 3663 + }, + { + "epoch": 0.17058919384500779, + "grad_norm": 0.7368985895371829, + "learning_rate": 5.685909373060211e-05, + "loss": 4.2856, + "step": 3664 + }, + { + "epoch": 0.17063575203110087, + "grad_norm": 0.8847412485570862, + "learning_rate": 5.6874612042209805e-05, + "loss": 4.2021, + "step": 3665 + }, + { + "epoch": 0.17068231021719393, + "grad_norm": 0.749229800124781, + "learning_rate": 5.689013035381751e-05, + "loss": 4.2491, + "step": 3666 + }, + { + "epoch": 0.17072886840328702, + "grad_norm": 0.6323597157506613, + "learning_rate": 5.6905648665425205e-05, + "loss": 4.2054, + "step": 3667 + }, + { + "epoch": 0.17077542658938008, + "grad_norm": 0.7710632760101517, + "learning_rate": 5.69211669770329e-05, + "loss": 4.265, + "step": 3668 + }, + { + "epoch": 0.17082198477547314, + "grad_norm": 0.917964631656785, + "learning_rate": 5.6936685288640604e-05, + "loss": 4.3792, + "step": 3669 + }, + { + "epoch": 0.17086854296156623, + "grad_norm": 1.1385111295710229, + "learning_rate": 5.69522036002483e-05, + "loss": 4.2959, + "step": 3670 + }, + { + "epoch": 0.1709151011476593, + "grad_norm": 0.9376970199059992, + "learning_rate": 5.6967721911856e-05, + "loss": 4.2898, + "step": 3671 + }, + { + "epoch": 0.17096165933375235, + "grad_norm": 0.6528142155132869, + "learning_rate": 5.6983240223463686e-05, + "loss": 4.2232, + "step": 3672 + }, + { + "epoch": 0.17100821751984543, + "grad_norm": 0.8072629514779276, + "learning_rate": 5.699875853507138e-05, + "loss": 4.2421, + "step": 3673 + }, + { + "epoch": 0.1710547757059385, + "grad_norm": 1.0207200979027538, + "learning_rate": 5.701427684667908e-05, + "loss": 4.3075, + "step": 3674 + }, + { + "epoch": 0.17110133389203155, + "grad_norm": 0.7480549008765363, + "learning_rate": 5.702979515828678e-05, + "loss": 4.1877, + "step": 3675 + }, + { + "epoch": 0.17114789207812464, + "grad_norm": 0.6338104331144366, + "learning_rate": 5.704531346989448e-05, + "loss": 4.0597, + "step": 3676 + }, + { + "epoch": 0.1711944502642177, + "grad_norm": 0.8516496093502177, + "learning_rate": 5.7060831781502175e-05, + "loss": 4.4527, + "step": 3677 + }, + { + "epoch": 0.1712410084503108, + "grad_norm": 0.8206706594868681, + "learning_rate": 5.707635009310988e-05, + "loss": 4.2892, + "step": 3678 + }, + { + "epoch": 0.17128756663640385, + "grad_norm": 0.6696030710399113, + "learning_rate": 5.7091868404717574e-05, + "loss": 4.3636, + "step": 3679 + }, + { + "epoch": 0.1713341248224969, + "grad_norm": 0.8285833857401828, + "learning_rate": 5.7107386716325263e-05, + "loss": 4.1362, + "step": 3680 + }, + { + "epoch": 0.17138068300859, + "grad_norm": 0.844697737684298, + "learning_rate": 5.712290502793296e-05, + "loss": 4.2139, + "step": 3681 + }, + { + "epoch": 0.17142724119468306, + "grad_norm": 0.7643395758572806, + "learning_rate": 5.7138423339540656e-05, + "loss": 4.2415, + "step": 3682 + }, + { + "epoch": 0.17147379938077612, + "grad_norm": 0.6586577339985323, + "learning_rate": 5.715394165114836e-05, + "loss": 4.3334, + "step": 3683 + }, + { + "epoch": 0.1715203575668692, + "grad_norm": 0.7017617975677116, + "learning_rate": 5.7169459962756055e-05, + "loss": 4.3468, + "step": 3684 + }, + { + "epoch": 0.17156691575296226, + "grad_norm": 0.7217008835524205, + "learning_rate": 5.718497827436375e-05, + "loss": 4.3148, + "step": 3685 + }, + { + "epoch": 
0.17161347393905532, + "grad_norm": 0.6829701480019801, + "learning_rate": 5.7200496585971455e-05, + "loss": 4.3043, + "step": 3686 + }, + { + "epoch": 0.1716600321251484, + "grad_norm": 0.7811463414611702, + "learning_rate": 5.721601489757915e-05, + "loss": 4.389, + "step": 3687 + }, + { + "epoch": 0.17170659031124147, + "grad_norm": 0.9316751739559709, + "learning_rate": 5.723153320918684e-05, + "loss": 4.2172, + "step": 3688 + }, + { + "epoch": 0.17175314849733456, + "grad_norm": 0.8438584401440575, + "learning_rate": 5.724705152079454e-05, + "loss": 4.2154, + "step": 3689 + }, + { + "epoch": 0.17179970668342762, + "grad_norm": 0.78298638063208, + "learning_rate": 5.726256983240223e-05, + "loss": 4.272, + "step": 3690 + }, + { + "epoch": 0.17184626486952068, + "grad_norm": 0.7320728661339723, + "learning_rate": 5.7278088144009936e-05, + "loss": 4.298, + "step": 3691 + }, + { + "epoch": 0.17189282305561376, + "grad_norm": 0.7935805022107305, + "learning_rate": 5.729360645561763e-05, + "loss": 4.317, + "step": 3692 + }, + { + "epoch": 0.17193938124170682, + "grad_norm": 0.7346818102395932, + "learning_rate": 5.730912476722533e-05, + "loss": 4.338, + "step": 3693 + }, + { + "epoch": 0.17198593942779988, + "grad_norm": 0.7780368955707668, + "learning_rate": 5.732464307883303e-05, + "loss": 4.3146, + "step": 3694 + }, + { + "epoch": 0.17203249761389297, + "grad_norm": 0.7717527558749544, + "learning_rate": 5.734016139044073e-05, + "loss": 4.26, + "step": 3695 + }, + { + "epoch": 0.17207905579998603, + "grad_norm": 0.745437289464954, + "learning_rate": 5.735567970204842e-05, + "loss": 4.3595, + "step": 3696 + }, + { + "epoch": 0.1721256139860791, + "grad_norm": 0.7300559651650821, + "learning_rate": 5.7371198013656114e-05, + "loss": 4.2223, + "step": 3697 + }, + { + "epoch": 0.17217217217217218, + "grad_norm": 0.7461020776915682, + "learning_rate": 5.738671632526381e-05, + "loss": 4.1904, + "step": 3698 + }, + { + "epoch": 0.17221873035826524, + "grad_norm": 0.8781445128300626, + "learning_rate": 5.7402234636871514e-05, + "loss": 4.287, + "step": 3699 + }, + { + "epoch": 0.17226528854435832, + "grad_norm": 0.9614117511422503, + "learning_rate": 5.741775294847921e-05, + "loss": 4.3271, + "step": 3700 + }, + { + "epoch": 0.17231184673045138, + "grad_norm": 0.9372406011759222, + "learning_rate": 5.7433271260086906e-05, + "loss": 4.3152, + "step": 3701 + }, + { + "epoch": 0.17235840491654444, + "grad_norm": 0.8867576085615511, + "learning_rate": 5.74487895716946e-05, + "loss": 4.2817, + "step": 3702 + }, + { + "epoch": 0.17240496310263753, + "grad_norm": 0.8026827842580818, + "learning_rate": 5.7464307883302306e-05, + "loss": 4.2906, + "step": 3703 + }, + { + "epoch": 0.1724515212887306, + "grad_norm": 0.8083157775839304, + "learning_rate": 5.747982619490999e-05, + "loss": 4.1784, + "step": 3704 + }, + { + "epoch": 0.17249807947482365, + "grad_norm": 0.8173628303222171, + "learning_rate": 5.749534450651769e-05, + "loss": 4.2156, + "step": 3705 + }, + { + "epoch": 0.17254463766091674, + "grad_norm": 0.6282187299073999, + "learning_rate": 5.751086281812539e-05, + "loss": 4.3944, + "step": 3706 + }, + { + "epoch": 0.1725911958470098, + "grad_norm": 0.7320278724759347, + "learning_rate": 5.7526381129733084e-05, + "loss": 4.2134, + "step": 3707 + }, + { + "epoch": 0.17263775403310286, + "grad_norm": 0.7186548761136805, + "learning_rate": 5.754189944134079e-05, + "loss": 4.3623, + "step": 3708 + }, + { + "epoch": 0.17268431221919595, + "grad_norm": 0.6771451613638945, + "learning_rate": 
5.7557417752948483e-05, + "loss": 4.3427, + "step": 3709 + }, + { + "epoch": 0.172730870405289, + "grad_norm": 0.7544267644175723, + "learning_rate": 5.757293606455618e-05, + "loss": 4.324, + "step": 3710 + }, + { + "epoch": 0.1727774285913821, + "grad_norm": 0.6219856904606027, + "learning_rate": 5.758845437616388e-05, + "loss": 4.2007, + "step": 3711 + }, + { + "epoch": 0.17282398677747515, + "grad_norm": 0.6493497544588867, + "learning_rate": 5.7603972687771566e-05, + "loss": 4.1902, + "step": 3712 + }, + { + "epoch": 0.1728705449635682, + "grad_norm": 0.7059724600882559, + "learning_rate": 5.761949099937927e-05, + "loss": 4.2251, + "step": 3713 + }, + { + "epoch": 0.1729171031496613, + "grad_norm": 0.680449915838416, + "learning_rate": 5.7635009310986965e-05, + "loss": 4.1478, + "step": 3714 + }, + { + "epoch": 0.17296366133575436, + "grad_norm": 0.6964141800635018, + "learning_rate": 5.765052762259466e-05, + "loss": 4.1754, + "step": 3715 + }, + { + "epoch": 0.17301021952184742, + "grad_norm": 0.5944867672200377, + "learning_rate": 5.7666045934202364e-05, + "loss": 4.2762, + "step": 3716 + }, + { + "epoch": 0.1730567777079405, + "grad_norm": 0.7059244611498162, + "learning_rate": 5.768156424581006e-05, + "loss": 4.2573, + "step": 3717 + }, + { + "epoch": 0.17310333589403357, + "grad_norm": 0.713686116521856, + "learning_rate": 5.769708255741776e-05, + "loss": 4.3164, + "step": 3718 + }, + { + "epoch": 0.17314989408012663, + "grad_norm": 0.6936741720273194, + "learning_rate": 5.771260086902546e-05, + "loss": 4.25, + "step": 3719 + }, + { + "epoch": 0.1731964522662197, + "grad_norm": 0.7592687161105725, + "learning_rate": 5.772811918063314e-05, + "loss": 4.2479, + "step": 3720 + }, + { + "epoch": 0.17324301045231277, + "grad_norm": 0.7429651859017604, + "learning_rate": 5.7743637492240846e-05, + "loss": 4.2655, + "step": 3721 + }, + { + "epoch": 0.17328956863840586, + "grad_norm": 0.8005905292508195, + "learning_rate": 5.775915580384854e-05, + "loss": 4.2413, + "step": 3722 + }, + { + "epoch": 0.17333612682449892, + "grad_norm": 0.9362693105257913, + "learning_rate": 5.777467411545624e-05, + "loss": 4.3042, + "step": 3723 + }, + { + "epoch": 0.17338268501059198, + "grad_norm": 0.9280377468846212, + "learning_rate": 5.779019242706394e-05, + "loss": 4.3166, + "step": 3724 + }, + { + "epoch": 0.17342924319668507, + "grad_norm": 0.7389309413246399, + "learning_rate": 5.780571073867164e-05, + "loss": 4.1562, + "step": 3725 + }, + { + "epoch": 0.17347580138277813, + "grad_norm": 0.6876832311190092, + "learning_rate": 5.7821229050279334e-05, + "loss": 4.2469, + "step": 3726 + }, + { + "epoch": 0.1735223595688712, + "grad_norm": 0.7257987444951871, + "learning_rate": 5.783674736188703e-05, + "loss": 4.2615, + "step": 3727 + }, + { + "epoch": 0.17356891775496427, + "grad_norm": 0.8808209473085137, + "learning_rate": 5.785226567349472e-05, + "loss": 4.3906, + "step": 3728 + }, + { + "epoch": 0.17361547594105733, + "grad_norm": 0.6666957305152226, + "learning_rate": 5.7867783985102416e-05, + "loss": 4.3162, + "step": 3729 + }, + { + "epoch": 0.1736620341271504, + "grad_norm": 0.6803001971025069, + "learning_rate": 5.788330229671012e-05, + "loss": 4.169, + "step": 3730 + }, + { + "epoch": 0.17370859231324348, + "grad_norm": 0.7201073575888548, + "learning_rate": 5.7898820608317816e-05, + "loss": 4.2299, + "step": 3731 + }, + { + "epoch": 0.17375515049933654, + "grad_norm": 0.7370209806378502, + "learning_rate": 5.791433891992551e-05, + "loss": 4.2415, + "step": 3732 + }, + { + "epoch": 
0.17380170868542963, + "grad_norm": 0.7742349631697926, + "learning_rate": 5.7929857231533215e-05, + "loss": 4.3735, + "step": 3733 + }, + { + "epoch": 0.1738482668715227, + "grad_norm": 0.8083493315867806, + "learning_rate": 5.794537554314091e-05, + "loss": 4.2089, + "step": 3734 + }, + { + "epoch": 0.17389482505761575, + "grad_norm": 0.7327667876237545, + "learning_rate": 5.796089385474861e-05, + "loss": 4.1187, + "step": 3735 + }, + { + "epoch": 0.17394138324370884, + "grad_norm": 0.8189265678755312, + "learning_rate": 5.797641216635631e-05, + "loss": 4.2286, + "step": 3736 + }, + { + "epoch": 0.1739879414298019, + "grad_norm": 0.8661549874513017, + "learning_rate": 5.7991930477963994e-05, + "loss": 4.2556, + "step": 3737 + }, + { + "epoch": 0.17403449961589496, + "grad_norm": 0.8639463297281348, + "learning_rate": 5.80074487895717e-05, + "loss": 4.3175, + "step": 3738 + }, + { + "epoch": 0.17408105780198804, + "grad_norm": 0.7844080988941403, + "learning_rate": 5.802296710117939e-05, + "loss": 4.2883, + "step": 3739 + }, + { + "epoch": 0.1741276159880811, + "grad_norm": 0.8204074806578167, + "learning_rate": 5.803848541278709e-05, + "loss": 4.2961, + "step": 3740 + }, + { + "epoch": 0.17417417417417416, + "grad_norm": 0.9196845883966358, + "learning_rate": 5.805400372439479e-05, + "loss": 4.2629, + "step": 3741 + }, + { + "epoch": 0.17422073236026725, + "grad_norm": 0.9504903584995329, + "learning_rate": 5.806952203600249e-05, + "loss": 4.3636, + "step": 3742 + }, + { + "epoch": 0.1742672905463603, + "grad_norm": 0.9526967116907747, + "learning_rate": 5.8085040347610185e-05, + "loss": 4.3612, + "step": 3743 + }, + { + "epoch": 0.1743138487324534, + "grad_norm": 1.0195913926687643, + "learning_rate": 5.810055865921789e-05, + "loss": 4.2879, + "step": 3744 + }, + { + "epoch": 0.17436040691854646, + "grad_norm": 0.7970494807935721, + "learning_rate": 5.811607697082557e-05, + "loss": 4.3016, + "step": 3745 + }, + { + "epoch": 0.17440696510463952, + "grad_norm": 0.7943480545447688, + "learning_rate": 5.8131595282433274e-05, + "loss": 4.1759, + "step": 3746 + }, + { + "epoch": 0.1744535232907326, + "grad_norm": 0.783586628047029, + "learning_rate": 5.814711359404097e-05, + "loss": 4.2899, + "step": 3747 + }, + { + "epoch": 0.17450008147682566, + "grad_norm": 0.6500551304663887, + "learning_rate": 5.8162631905648667e-05, + "loss": 4.2908, + "step": 3748 + }, + { + "epoch": 0.17454663966291872, + "grad_norm": 0.6893010612320049, + "learning_rate": 5.817815021725637e-05, + "loss": 4.2391, + "step": 3749 + }, + { + "epoch": 0.1745931978490118, + "grad_norm": 0.8759832152348036, + "learning_rate": 5.8193668528864066e-05, + "loss": 4.1168, + "step": 3750 + }, + { + "epoch": 0.17463975603510487, + "grad_norm": 1.0152852375913575, + "learning_rate": 5.820918684047176e-05, + "loss": 4.2672, + "step": 3751 + }, + { + "epoch": 0.17468631422119793, + "grad_norm": 0.7795544673644351, + "learning_rate": 5.822470515207946e-05, + "loss": 4.1721, + "step": 3752 + }, + { + "epoch": 0.17473287240729102, + "grad_norm": 0.8461920279749379, + "learning_rate": 5.824022346368715e-05, + "loss": 4.1732, + "step": 3753 + }, + { + "epoch": 0.17477943059338408, + "grad_norm": 1.0343525337171058, + "learning_rate": 5.8255741775294844e-05, + "loss": 4.381, + "step": 3754 + }, + { + "epoch": 0.17482598877947716, + "grad_norm": 0.8823413406072614, + "learning_rate": 5.827126008690255e-05, + "loss": 4.2562, + "step": 3755 + }, + { + "epoch": 0.17487254696557022, + "grad_norm": 0.8604202274082029, + "learning_rate": 
5.8286778398510244e-05, + "loss": 4.2271, + "step": 3756 + }, + { + "epoch": 0.17491910515166328, + "grad_norm": 0.7760562910536248, + "learning_rate": 5.830229671011794e-05, + "loss": 4.2069, + "step": 3757 + }, + { + "epoch": 0.17496566333775637, + "grad_norm": 0.7301039141004938, + "learning_rate": 5.831781502172564e-05, + "loss": 4.2517, + "step": 3758 + }, + { + "epoch": 0.17501222152384943, + "grad_norm": 0.8954676548712879, + "learning_rate": 5.833333333333334e-05, + "loss": 4.1892, + "step": 3759 + }, + { + "epoch": 0.1750587797099425, + "grad_norm": 0.9756406548949028, + "learning_rate": 5.8348851644941036e-05, + "loss": 4.3869, + "step": 3760 + }, + { + "epoch": 0.17510533789603558, + "grad_norm": 0.9127024389785918, + "learning_rate": 5.8364369956548725e-05, + "loss": 4.2405, + "step": 3761 + }, + { + "epoch": 0.17515189608212864, + "grad_norm": 0.8845785335974432, + "learning_rate": 5.837988826815642e-05, + "loss": 4.2339, + "step": 3762 + }, + { + "epoch": 0.1751984542682217, + "grad_norm": 0.9698515787243271, + "learning_rate": 5.8395406579764125e-05, + "loss": 4.338, + "step": 3763 + }, + { + "epoch": 0.17524501245431479, + "grad_norm": 0.8845000244207298, + "learning_rate": 5.841092489137182e-05, + "loss": 4.1478, + "step": 3764 + }, + { + "epoch": 0.17529157064040785, + "grad_norm": 0.8142362725174018, + "learning_rate": 5.842644320297952e-05, + "loss": 4.2159, + "step": 3765 + }, + { + "epoch": 0.17533812882650093, + "grad_norm": 0.8381091725867977, + "learning_rate": 5.844196151458722e-05, + "loss": 4.3007, + "step": 3766 + }, + { + "epoch": 0.175384687012594, + "grad_norm": 0.9482459057371416, + "learning_rate": 5.845747982619492e-05, + "loss": 4.3799, + "step": 3767 + }, + { + "epoch": 0.17543124519868705, + "grad_norm": 0.9485203899038541, + "learning_rate": 5.847299813780261e-05, + "loss": 4.1263, + "step": 3768 + }, + { + "epoch": 0.17547780338478014, + "grad_norm": 0.8304717063957062, + "learning_rate": 5.84885164494103e-05, + "loss": 4.2308, + "step": 3769 + }, + { + "epoch": 0.1755243615708732, + "grad_norm": 0.8456994470102737, + "learning_rate": 5.8504034761018e-05, + "loss": 4.232, + "step": 3770 + }, + { + "epoch": 0.17557091975696626, + "grad_norm": 0.8667363653556512, + "learning_rate": 5.85195530726257e-05, + "loss": 4.2477, + "step": 3771 + }, + { + "epoch": 0.17561747794305935, + "grad_norm": 0.7050374554993056, + "learning_rate": 5.85350713842334e-05, + "loss": 4.1803, + "step": 3772 + }, + { + "epoch": 0.1756640361291524, + "grad_norm": 0.8231716016090098, + "learning_rate": 5.8550589695841095e-05, + "loss": 4.2392, + "step": 3773 + }, + { + "epoch": 0.17571059431524547, + "grad_norm": 0.8297562496687668, + "learning_rate": 5.85661080074488e-05, + "loss": 4.2623, + "step": 3774 + }, + { + "epoch": 0.17575715250133855, + "grad_norm": 0.603284609327173, + "learning_rate": 5.8581626319056494e-05, + "loss": 4.3744, + "step": 3775 + }, + { + "epoch": 0.1758037106874316, + "grad_norm": 0.7576289452745608, + "learning_rate": 5.859714463066419e-05, + "loss": 4.3775, + "step": 3776 + }, + { + "epoch": 0.1758502688735247, + "grad_norm": 0.784144599055644, + "learning_rate": 5.861266294227188e-05, + "loss": 4.2384, + "step": 3777 + }, + { + "epoch": 0.17589682705961776, + "grad_norm": 0.7735125418524964, + "learning_rate": 5.8628181253879576e-05, + "loss": 4.3442, + "step": 3778 + }, + { + "epoch": 0.17594338524571082, + "grad_norm": 0.7190619514373509, + "learning_rate": 5.864369956548727e-05, + "loss": 4.2553, + "step": 3779 + }, + { + "epoch": 
0.1759899434318039, + "grad_norm": 0.68583327943527, + "learning_rate": 5.8659217877094976e-05, + "loss": 4.234, + "step": 3780 + }, + { + "epoch": 0.17603650161789697, + "grad_norm": 0.709224043745559, + "learning_rate": 5.867473618870267e-05, + "loss": 4.1275, + "step": 3781 + }, + { + "epoch": 0.17608305980399003, + "grad_norm": 0.5986083239216385, + "learning_rate": 5.869025450031037e-05, + "loss": 4.2689, + "step": 3782 + }, + { + "epoch": 0.17612961799008311, + "grad_norm": 0.708007496847629, + "learning_rate": 5.870577281191807e-05, + "loss": 4.1904, + "step": 3783 + }, + { + "epoch": 0.17617617617617617, + "grad_norm": 0.8152681179582396, + "learning_rate": 5.872129112352577e-05, + "loss": 4.2455, + "step": 3784 + }, + { + "epoch": 0.17622273436226923, + "grad_norm": 0.7798387284744596, + "learning_rate": 5.873680943513346e-05, + "loss": 4.2015, + "step": 3785 + }, + { + "epoch": 0.17626929254836232, + "grad_norm": 0.7096402818127227, + "learning_rate": 5.875232774674115e-05, + "loss": 4.2767, + "step": 3786 + }, + { + "epoch": 0.17631585073445538, + "grad_norm": 0.7924569836448044, + "learning_rate": 5.876784605834885e-05, + "loss": 4.2918, + "step": 3787 + }, + { + "epoch": 0.17636240892054847, + "grad_norm": 0.8809432677402642, + "learning_rate": 5.878336436995655e-05, + "loss": 4.3345, + "step": 3788 + }, + { + "epoch": 0.17640896710664153, + "grad_norm": 0.7691325574602325, + "learning_rate": 5.879888268156425e-05, + "loss": 4.3521, + "step": 3789 + }, + { + "epoch": 0.1764555252927346, + "grad_norm": 0.7185268704345704, + "learning_rate": 5.8814400993171945e-05, + "loss": 4.3302, + "step": 3790 + }, + { + "epoch": 0.17650208347882768, + "grad_norm": 0.7120133679294516, + "learning_rate": 5.882991930477965e-05, + "loss": 4.2604, + "step": 3791 + }, + { + "epoch": 0.17654864166492074, + "grad_norm": 0.665180437692759, + "learning_rate": 5.8845437616387345e-05, + "loss": 4.1439, + "step": 3792 + }, + { + "epoch": 0.1765951998510138, + "grad_norm": 0.7906437065534629, + "learning_rate": 5.8860955927995034e-05, + "loss": 4.1149, + "step": 3793 + }, + { + "epoch": 0.17664175803710688, + "grad_norm": 0.8061819310409682, + "learning_rate": 5.887647423960273e-05, + "loss": 4.1165, + "step": 3794 + }, + { + "epoch": 0.17668831622319994, + "grad_norm": 0.7948982450380467, + "learning_rate": 5.889199255121043e-05, + "loss": 4.1678, + "step": 3795 + }, + { + "epoch": 0.176734874409293, + "grad_norm": 0.7027887649511602, + "learning_rate": 5.890751086281813e-05, + "loss": 4.3607, + "step": 3796 + }, + { + "epoch": 0.1767814325953861, + "grad_norm": 0.7274903029030156, + "learning_rate": 5.8923029174425826e-05, + "loss": 4.1161, + "step": 3797 + }, + { + "epoch": 0.17682799078147915, + "grad_norm": 0.7529580508857809, + "learning_rate": 5.893854748603352e-05, + "loss": 4.2469, + "step": 3798 + }, + { + "epoch": 0.17687454896757224, + "grad_norm": 0.6723178154639549, + "learning_rate": 5.8954065797641226e-05, + "loss": 4.3366, + "step": 3799 + }, + { + "epoch": 0.1769211071536653, + "grad_norm": 0.6113468489267573, + "learning_rate": 5.896958410924892e-05, + "loss": 4.2419, + "step": 3800 + }, + { + "epoch": 0.17696766533975836, + "grad_norm": 0.6984877073587827, + "learning_rate": 5.898510242085661e-05, + "loss": 4.1483, + "step": 3801 + }, + { + "epoch": 0.17701422352585144, + "grad_norm": 0.729449696253326, + "learning_rate": 5.900062073246431e-05, + "loss": 4.273, + "step": 3802 + }, + { + "epoch": 0.1770607817119445, + "grad_norm": 0.758552515297829, + "learning_rate": 
5.9016139044072004e-05, + "loss": 4.2803, + "step": 3803 + }, + { + "epoch": 0.17710733989803756, + "grad_norm": 0.6925955054160731, + "learning_rate": 5.90316573556797e-05, + "loss": 4.0711, + "step": 3804 + }, + { + "epoch": 0.17715389808413065, + "grad_norm": 0.6782134894778694, + "learning_rate": 5.9047175667287404e-05, + "loss": 4.2494, + "step": 3805 + }, + { + "epoch": 0.1772004562702237, + "grad_norm": 0.7091592423353394, + "learning_rate": 5.90626939788951e-05, + "loss": 4.1641, + "step": 3806 + }, + { + "epoch": 0.17724701445631677, + "grad_norm": 0.7176620116152551, + "learning_rate": 5.9078212290502796e-05, + "loss": 4.3279, + "step": 3807 + }, + { + "epoch": 0.17729357264240986, + "grad_norm": 0.6766652775961538, + "learning_rate": 5.90937306021105e-05, + "loss": 4.1818, + "step": 3808 + }, + { + "epoch": 0.17734013082850292, + "grad_norm": 0.7059484366460622, + "learning_rate": 5.910924891371818e-05, + "loss": 4.2239, + "step": 3809 + }, + { + "epoch": 0.177386689014596, + "grad_norm": 0.7649892970565992, + "learning_rate": 5.9124767225325885e-05, + "loss": 4.2788, + "step": 3810 + }, + { + "epoch": 0.17743324720068906, + "grad_norm": 0.8091614828893807, + "learning_rate": 5.914028553693358e-05, + "loss": 4.2924, + "step": 3811 + }, + { + "epoch": 0.17747980538678212, + "grad_norm": 0.6608611682577281, + "learning_rate": 5.915580384854128e-05, + "loss": 4.2737, + "step": 3812 + }, + { + "epoch": 0.1775263635728752, + "grad_norm": 0.6589701254462523, + "learning_rate": 5.917132216014898e-05, + "loss": 4.2523, + "step": 3813 + }, + { + "epoch": 0.17757292175896827, + "grad_norm": 0.5871503003443718, + "learning_rate": 5.918684047175668e-05, + "loss": 4.2411, + "step": 3814 + }, + { + "epoch": 0.17761947994506133, + "grad_norm": 0.70619383264204, + "learning_rate": 5.920235878336437e-05, + "loss": 4.2247, + "step": 3815 + }, + { + "epoch": 0.17766603813115442, + "grad_norm": 0.7033876173888982, + "learning_rate": 5.9217877094972076e-05, + "loss": 4.2742, + "step": 3816 + }, + { + "epoch": 0.17771259631724748, + "grad_norm": 0.6408263663413075, + "learning_rate": 5.923339540657977e-05, + "loss": 4.108, + "step": 3817 + }, + { + "epoch": 0.17775915450334054, + "grad_norm": 0.6575283441261287, + "learning_rate": 5.924891371818746e-05, + "loss": 4.1745, + "step": 3818 + }, + { + "epoch": 0.17780571268943363, + "grad_norm": 0.678427716807931, + "learning_rate": 5.926443202979516e-05, + "loss": 4.2226, + "step": 3819 + }, + { + "epoch": 0.17785227087552669, + "grad_norm": 0.7432245406964357, + "learning_rate": 5.9279950341402855e-05, + "loss": 4.2045, + "step": 3820 + }, + { + "epoch": 0.17789882906161977, + "grad_norm": 0.9106488504563915, + "learning_rate": 5.929546865301056e-05, + "loss": 4.2089, + "step": 3821 + }, + { + "epoch": 0.17794538724771283, + "grad_norm": 0.9809383862810824, + "learning_rate": 5.9310986964618254e-05, + "loss": 4.3, + "step": 3822 + }, + { + "epoch": 0.1779919454338059, + "grad_norm": 0.8247051583911859, + "learning_rate": 5.932650527622595e-05, + "loss": 4.2524, + "step": 3823 + }, + { + "epoch": 0.17803850361989898, + "grad_norm": 0.7477352960768189, + "learning_rate": 5.9342023587833654e-05, + "loss": 4.2907, + "step": 3824 + }, + { + "epoch": 0.17808506180599204, + "grad_norm": 0.6192090205512047, + "learning_rate": 5.935754189944135e-05, + "loss": 4.2477, + "step": 3825 + }, + { + "epoch": 0.1781316199920851, + "grad_norm": 0.70106914545705, + "learning_rate": 5.937306021104904e-05, + "loss": 4.1457, + "step": 3826 + }, + { + "epoch": 
0.1781781781781782, + "grad_norm": 0.7437250001083582, + "learning_rate": 5.9388578522656736e-05, + "loss": 4.1887, + "step": 3827 + }, + { + "epoch": 0.17822473636427125, + "grad_norm": 0.8301337721496481, + "learning_rate": 5.940409683426443e-05, + "loss": 4.3242, + "step": 3828 + }, + { + "epoch": 0.1782712945503643, + "grad_norm": 0.9965873044123397, + "learning_rate": 5.941961514587213e-05, + "loss": 4.3263, + "step": 3829 + }, + { + "epoch": 0.1783178527364574, + "grad_norm": 0.9954471942597153, + "learning_rate": 5.943513345747983e-05, + "loss": 4.1706, + "step": 3830 + }, + { + "epoch": 0.17836441092255045, + "grad_norm": 0.7703569744527963, + "learning_rate": 5.945065176908753e-05, + "loss": 4.1101, + "step": 3831 + }, + { + "epoch": 0.1784109691086435, + "grad_norm": 0.7720691040251132, + "learning_rate": 5.9466170080695224e-05, + "loss": 4.3316, + "step": 3832 + }, + { + "epoch": 0.1784575272947366, + "grad_norm": 0.996728372723186, + "learning_rate": 5.948168839230293e-05, + "loss": 4.1955, + "step": 3833 + }, + { + "epoch": 0.17850408548082966, + "grad_norm": 0.8949227457742617, + "learning_rate": 5.949720670391061e-05, + "loss": 4.1366, + "step": 3834 + }, + { + "epoch": 0.17855064366692275, + "grad_norm": 0.9081920110221556, + "learning_rate": 5.951272501551831e-05, + "loss": 4.242, + "step": 3835 + }, + { + "epoch": 0.1785972018530158, + "grad_norm": 0.9282135750005069, + "learning_rate": 5.952824332712601e-05, + "loss": 4.2648, + "step": 3836 + }, + { + "epoch": 0.17864376003910887, + "grad_norm": 0.8379235327035679, + "learning_rate": 5.9543761638733706e-05, + "loss": 4.2067, + "step": 3837 + }, + { + "epoch": 0.17869031822520196, + "grad_norm": 0.7552071094981911, + "learning_rate": 5.955927995034141e-05, + "loss": 4.256, + "step": 3838 + }, + { + "epoch": 0.17873687641129501, + "grad_norm": 0.913930508102101, + "learning_rate": 5.9574798261949105e-05, + "loss": 4.1856, + "step": 3839 + }, + { + "epoch": 0.17878343459738807, + "grad_norm": 0.9515212372366545, + "learning_rate": 5.95903165735568e-05, + "loss": 4.1532, + "step": 3840 + }, + { + "epoch": 0.17882999278348116, + "grad_norm": 0.806491640401118, + "learning_rate": 5.9605834885164504e-05, + "loss": 4.3485, + "step": 3841 + }, + { + "epoch": 0.17887655096957422, + "grad_norm": 0.8503841195110501, + "learning_rate": 5.962135319677219e-05, + "loss": 4.2264, + "step": 3842 + }, + { + "epoch": 0.17892310915566728, + "grad_norm": 0.7656493938234745, + "learning_rate": 5.963687150837989e-05, + "loss": 4.1202, + "step": 3843 + }, + { + "epoch": 0.17896966734176037, + "grad_norm": 0.8130394960832797, + "learning_rate": 5.965238981998759e-05, + "loss": 4.1253, + "step": 3844 + }, + { + "epoch": 0.17901622552785343, + "grad_norm": 0.8928171416047422, + "learning_rate": 5.966790813159528e-05, + "loss": 4.4513, + "step": 3845 + }, + { + "epoch": 0.17906278371394652, + "grad_norm": 0.9317974193250864, + "learning_rate": 5.9683426443202986e-05, + "loss": 4.2227, + "step": 3846 + }, + { + "epoch": 0.17910934190003958, + "grad_norm": 0.9179169701593529, + "learning_rate": 5.969894475481068e-05, + "loss": 4.2428, + "step": 3847 + }, + { + "epoch": 0.17915590008613264, + "grad_norm": 0.8017613622054821, + "learning_rate": 5.971446306641838e-05, + "loss": 4.1762, + "step": 3848 + }, + { + "epoch": 0.17920245827222572, + "grad_norm": 0.7314726674029486, + "learning_rate": 5.972998137802608e-05, + "loss": 4.3195, + "step": 3849 + }, + { + "epoch": 0.17924901645831878, + "grad_norm": 0.7417441437627389, + "learning_rate": 
5.9745499689633764e-05, + "loss": 4.1743, + "step": 3850 + }, + { + "epoch": 0.17929557464441184, + "grad_norm": 0.7753192340710056, + "learning_rate": 5.976101800124147e-05, + "loss": 4.1459, + "step": 3851 + }, + { + "epoch": 0.17934213283050493, + "grad_norm": 0.5727368110431118, + "learning_rate": 5.9776536312849164e-05, + "loss": 4.3219, + "step": 3852 + }, + { + "epoch": 0.179388691016598, + "grad_norm": 0.6318009083477474, + "learning_rate": 5.979205462445686e-05, + "loss": 4.1768, + "step": 3853 + }, + { + "epoch": 0.17943524920269105, + "grad_norm": 0.7539465694116433, + "learning_rate": 5.9807572936064556e-05, + "loss": 4.1738, + "step": 3854 + }, + { + "epoch": 0.17948180738878414, + "grad_norm": 0.6298040913044742, + "learning_rate": 5.982309124767226e-05, + "loss": 4.288, + "step": 3855 + }, + { + "epoch": 0.1795283655748772, + "grad_norm": 0.6717335916816202, + "learning_rate": 5.9838609559279956e-05, + "loss": 4.2042, + "step": 3856 + }, + { + "epoch": 0.17957492376097028, + "grad_norm": 0.7423499862788928, + "learning_rate": 5.985412787088765e-05, + "loss": 4.2208, + "step": 3857 + }, + { + "epoch": 0.17962148194706334, + "grad_norm": 0.7408635715567744, + "learning_rate": 5.986964618249534e-05, + "loss": 4.0906, + "step": 3858 + }, + { + "epoch": 0.1796680401331564, + "grad_norm": 0.8515407679028092, + "learning_rate": 5.988516449410304e-05, + "loss": 4.0411, + "step": 3859 + }, + { + "epoch": 0.1797145983192495, + "grad_norm": 0.7721369668127728, + "learning_rate": 5.990068280571074e-05, + "loss": 4.1322, + "step": 3860 + }, + { + "epoch": 0.17976115650534255, + "grad_norm": 0.7338623097599962, + "learning_rate": 5.991620111731844e-05, + "loss": 4.2899, + "step": 3861 + }, + { + "epoch": 0.1798077146914356, + "grad_norm": 0.8003301672392344, + "learning_rate": 5.9931719428926134e-05, + "loss": 4.2169, + "step": 3862 + }, + { + "epoch": 0.1798542728775287, + "grad_norm": 0.7542121571902406, + "learning_rate": 5.994723774053384e-05, + "loss": 4.0534, + "step": 3863 + }, + { + "epoch": 0.17990083106362176, + "grad_norm": 0.688399029279156, + "learning_rate": 5.996275605214153e-05, + "loss": 4.42, + "step": 3864 + }, + { + "epoch": 0.17994738924971482, + "grad_norm": 0.6957655640230259, + "learning_rate": 5.997827436374923e-05, + "loss": 4.1696, + "step": 3865 + }, + { + "epoch": 0.1799939474358079, + "grad_norm": 0.8093243000696966, + "learning_rate": 5.999379267535692e-05, + "loss": 4.2603, + "step": 3866 + }, + { + "epoch": 0.18004050562190096, + "grad_norm": 0.6990861560072209, + "learning_rate": 6.0009310986964615e-05, + "loss": 4.2072, + "step": 3867 + }, + { + "epoch": 0.18008706380799405, + "grad_norm": 0.8045517776011205, + "learning_rate": 6.002482929857232e-05, + "loss": 4.1172, + "step": 3868 + }, + { + "epoch": 0.1801336219940871, + "grad_norm": 0.8448938382945369, + "learning_rate": 6.0040347610180015e-05, + "loss": 4.1429, + "step": 3869 + }, + { + "epoch": 0.18018018018018017, + "grad_norm": 0.8506048915960782, + "learning_rate": 6.005586592178771e-05, + "loss": 4.2606, + "step": 3870 + }, + { + "epoch": 0.18022673836627326, + "grad_norm": 0.993693209135509, + "learning_rate": 6.0071384233395414e-05, + "loss": 4.1758, + "step": 3871 + }, + { + "epoch": 0.18027329655236632, + "grad_norm": 0.9880331754173068, + "learning_rate": 6.008690254500311e-05, + "loss": 4.209, + "step": 3872 + }, + { + "epoch": 0.18031985473845938, + "grad_norm": 0.7878043274054488, + "learning_rate": 6.010242085661081e-05, + "loss": 4.2431, + "step": 3873 + }, + { + "epoch": 
0.18036641292455247, + "grad_norm": 0.8452611946658347, + "learning_rate": 6.0117939168218496e-05, + "loss": 4.3316, + "step": 3874 + }, + { + "epoch": 0.18041297111064553, + "grad_norm": 0.9452995628098069, + "learning_rate": 6.013345747982619e-05, + "loss": 4.2764, + "step": 3875 + }, + { + "epoch": 0.18045952929673859, + "grad_norm": 0.9235796535068934, + "learning_rate": 6.0148975791433896e-05, + "loss": 4.2493, + "step": 3876 + }, + { + "epoch": 0.18050608748283167, + "grad_norm": 0.9062017258893825, + "learning_rate": 6.016449410304159e-05, + "loss": 4.291, + "step": 3877 + }, + { + "epoch": 0.18055264566892473, + "grad_norm": 0.7861198166537434, + "learning_rate": 6.018001241464929e-05, + "loss": 4.2176, + "step": 3878 + }, + { + "epoch": 0.18059920385501782, + "grad_norm": 0.8424442595174734, + "learning_rate": 6.0195530726256984e-05, + "loss": 4.2449, + "step": 3879 + }, + { + "epoch": 0.18064576204111088, + "grad_norm": 0.9857723788974427, + "learning_rate": 6.021104903786469e-05, + "loss": 4.2363, + "step": 3880 + }, + { + "epoch": 0.18069232022720394, + "grad_norm": 0.8161480341349922, + "learning_rate": 6.0226567349472384e-05, + "loss": 4.0656, + "step": 3881 + }, + { + "epoch": 0.18073887841329703, + "grad_norm": 0.8529432267156501, + "learning_rate": 6.0242085661080073e-05, + "loss": 4.0416, + "step": 3882 + }, + { + "epoch": 0.1807854365993901, + "grad_norm": 0.80582431561423, + "learning_rate": 6.025760397268777e-05, + "loss": 4.1992, + "step": 3883 + }, + { + "epoch": 0.18083199478548315, + "grad_norm": 0.7458948637758097, + "learning_rate": 6.0273122284295466e-05, + "loss": 4.1215, + "step": 3884 + }, + { + "epoch": 0.18087855297157623, + "grad_norm": 0.757985365925119, + "learning_rate": 6.028864059590317e-05, + "loss": 4.2328, + "step": 3885 + }, + { + "epoch": 0.1809251111576693, + "grad_norm": 0.7547315135517676, + "learning_rate": 6.0304158907510865e-05, + "loss": 4.2512, + "step": 3886 + }, + { + "epoch": 0.18097166934376235, + "grad_norm": 0.7626695430925317, + "learning_rate": 6.031967721911856e-05, + "loss": 4.1618, + "step": 3887 + }, + { + "epoch": 0.18101822752985544, + "grad_norm": 0.7031225186266488, + "learning_rate": 6.0335195530726265e-05, + "loss": 4.243, + "step": 3888 + }, + { + "epoch": 0.1810647857159485, + "grad_norm": 0.7006544304383493, + "learning_rate": 6.035071384233396e-05, + "loss": 4.2208, + "step": 3889 + }, + { + "epoch": 0.1811113439020416, + "grad_norm": 0.6974158181598236, + "learning_rate": 6.036623215394165e-05, + "loss": 4.188, + "step": 3890 + }, + { + "epoch": 0.18115790208813465, + "grad_norm": 0.7553645289605829, + "learning_rate": 6.038175046554935e-05, + "loss": 4.247, + "step": 3891 + }, + { + "epoch": 0.1812044602742277, + "grad_norm": 0.7026316802501742, + "learning_rate": 6.039726877715704e-05, + "loss": 4.0679, + "step": 3892 + }, + { + "epoch": 0.1812510184603208, + "grad_norm": 0.566220636730432, + "learning_rate": 6.0412787088764746e-05, + "loss": 4.0331, + "step": 3893 + }, + { + "epoch": 0.18129757664641385, + "grad_norm": 0.6876211762904998, + "learning_rate": 6.042830540037244e-05, + "loss": 4.1878, + "step": 3894 + }, + { + "epoch": 0.18134413483250691, + "grad_norm": 0.6566773641923934, + "learning_rate": 6.044382371198014e-05, + "loss": 4.2815, + "step": 3895 + }, + { + "epoch": 0.1813906930186, + "grad_norm": 0.704307167693115, + "learning_rate": 6.045934202358784e-05, + "loss": 4.1846, + "step": 3896 + }, + { + "epoch": 0.18143725120469306, + "grad_norm": 0.7420444048389193, + "learning_rate": 
6.047486033519554e-05, + "loss": 4.1776, + "step": 3897 + }, + { + "epoch": 0.18148380939078612, + "grad_norm": 0.6607033234946992, + "learning_rate": 6.049037864680323e-05, + "loss": 4.2361, + "step": 3898 + }, + { + "epoch": 0.1815303675768792, + "grad_norm": 0.667846893095003, + "learning_rate": 6.0505896958410924e-05, + "loss": 4.2474, + "step": 3899 + }, + { + "epoch": 0.18157692576297227, + "grad_norm": 0.8381074635003845, + "learning_rate": 6.052141527001862e-05, + "loss": 4.1784, + "step": 3900 + }, + { + "epoch": 0.18162348394906536, + "grad_norm": 0.7999284158167296, + "learning_rate": 6.0536933581626324e-05, + "loss": 4.3033, + "step": 3901 + }, + { + "epoch": 0.18167004213515842, + "grad_norm": 0.797579548211346, + "learning_rate": 6.055245189323402e-05, + "loss": 4.2718, + "step": 3902 + }, + { + "epoch": 0.18171660032125148, + "grad_norm": 0.7337445468133802, + "learning_rate": 6.0567970204841716e-05, + "loss": 4.1679, + "step": 3903 + }, + { + "epoch": 0.18176315850734456, + "grad_norm": 0.6938544043076895, + "learning_rate": 6.058348851644942e-05, + "loss": 4.1486, + "step": 3904 + }, + { + "epoch": 0.18180971669343762, + "grad_norm": 0.6751524009387101, + "learning_rate": 6.0599006828057116e-05, + "loss": 4.1171, + "step": 3905 + }, + { + "epoch": 0.18185627487953068, + "grad_norm": 0.6504363922397705, + "learning_rate": 6.061452513966481e-05, + "loss": 4.1143, + "step": 3906 + }, + { + "epoch": 0.18190283306562377, + "grad_norm": 0.7056939408816114, + "learning_rate": 6.06300434512725e-05, + "loss": 4.1774, + "step": 3907 + }, + { + "epoch": 0.18194939125171683, + "grad_norm": 0.7962547494652851, + "learning_rate": 6.06455617628802e-05, + "loss": 4.1688, + "step": 3908 + }, + { + "epoch": 0.1819959494378099, + "grad_norm": 0.8597061657867326, + "learning_rate": 6.0661080074487894e-05, + "loss": 4.3177, + "step": 3909 + }, + { + "epoch": 0.18204250762390298, + "grad_norm": 0.8780899821535564, + "learning_rate": 6.06765983860956e-05, + "loss": 4.0677, + "step": 3910 + }, + { + "epoch": 0.18208906580999604, + "grad_norm": 0.7085079108984221, + "learning_rate": 6.0692116697703293e-05, + "loss": 4.2406, + "step": 3911 + }, + { + "epoch": 0.18213562399608912, + "grad_norm": 0.8415253061434667, + "learning_rate": 6.070763500931099e-05, + "loss": 4.1991, + "step": 3912 + }, + { + "epoch": 0.18218218218218218, + "grad_norm": 0.8963406038756172, + "learning_rate": 6.072315332091869e-05, + "loss": 4.2797, + "step": 3913 + }, + { + "epoch": 0.18222874036827524, + "grad_norm": 0.7593049274364481, + "learning_rate": 6.073867163252639e-05, + "loss": 4.1611, + "step": 3914 + }, + { + "epoch": 0.18227529855436833, + "grad_norm": 0.76561011622583, + "learning_rate": 6.075418994413408e-05, + "loss": 4.1569, + "step": 3915 + }, + { + "epoch": 0.1823218567404614, + "grad_norm": 0.7809715580166667, + "learning_rate": 6.0769708255741775e-05, + "loss": 4.1895, + "step": 3916 + }, + { + "epoch": 0.18236841492655445, + "grad_norm": 0.7216626071165699, + "learning_rate": 6.078522656734947e-05, + "loss": 4.2493, + "step": 3917 + }, + { + "epoch": 0.18241497311264754, + "grad_norm": 0.720823102213304, + "learning_rate": 6.0800744878957174e-05, + "loss": 4.2418, + "step": 3918 + }, + { + "epoch": 0.1824615312987406, + "grad_norm": 0.7546569871506793, + "learning_rate": 6.081626319056487e-05, + "loss": 4.127, + "step": 3919 + }, + { + "epoch": 0.18250808948483366, + "grad_norm": 0.6991437975119192, + "learning_rate": 6.083178150217257e-05, + "loss": 4.0123, + "step": 3920 + }, + { + "epoch": 
0.18255464767092675, + "grad_norm": 0.690699125955165, + "learning_rate": 6.084729981378027e-05, + "loss": 4.1904, + "step": 3921 + }, + { + "epoch": 0.1826012058570198, + "grad_norm": 0.6853690420586893, + "learning_rate": 6.0862818125387966e-05, + "loss": 4.1939, + "step": 3922 + }, + { + "epoch": 0.1826477640431129, + "grad_norm": 0.7665448586932964, + "learning_rate": 6.0878336436995656e-05, + "loss": 4.0779, + "step": 3923 + }, + { + "epoch": 0.18269432222920595, + "grad_norm": 0.6825025086825263, + "learning_rate": 6.089385474860335e-05, + "loss": 4.086, + "step": 3924 + }, + { + "epoch": 0.182740880415299, + "grad_norm": 0.6836986164271966, + "learning_rate": 6.090937306021105e-05, + "loss": 4.3357, + "step": 3925 + }, + { + "epoch": 0.1827874386013921, + "grad_norm": 0.7923736293450993, + "learning_rate": 6.092489137181875e-05, + "loss": 4.0325, + "step": 3926 + }, + { + "epoch": 0.18283399678748516, + "grad_norm": 0.8685478426192688, + "learning_rate": 6.094040968342645e-05, + "loss": 4.1566, + "step": 3927 + }, + { + "epoch": 0.18288055497357822, + "grad_norm": 0.9770473724035246, + "learning_rate": 6.0955927995034144e-05, + "loss": 4.2797, + "step": 3928 + }, + { + "epoch": 0.1829271131596713, + "grad_norm": 0.8469065011432945, + "learning_rate": 6.097144630664185e-05, + "loss": 4.1143, + "step": 3929 + }, + { + "epoch": 0.18297367134576437, + "grad_norm": 0.6903886131356881, + "learning_rate": 6.0986964618249544e-05, + "loss": 4.2023, + "step": 3930 + }, + { + "epoch": 0.18302022953185743, + "grad_norm": 0.6003948085377285, + "learning_rate": 6.100248292985723e-05, + "loss": 4.249, + "step": 3931 + }, + { + "epoch": 0.1830667877179505, + "grad_norm": 0.6958351686929244, + "learning_rate": 6.101800124146493e-05, + "loss": 4.1645, + "step": 3932 + }, + { + "epoch": 0.18311334590404357, + "grad_norm": 0.780636342753686, + "learning_rate": 6.1033519553072626e-05, + "loss": 4.2152, + "step": 3933 + }, + { + "epoch": 0.18315990409013666, + "grad_norm": 0.7319786917354864, + "learning_rate": 6.104903786468032e-05, + "loss": 4.24, + "step": 3934 + }, + { + "epoch": 0.18320646227622972, + "grad_norm": 0.611968813171826, + "learning_rate": 6.106455617628802e-05, + "loss": 4.1301, + "step": 3935 + }, + { + "epoch": 0.18325302046232278, + "grad_norm": 0.7911861156984469, + "learning_rate": 6.108007448789573e-05, + "loss": 4.0005, + "step": 3936 + }, + { + "epoch": 0.18329957864841587, + "grad_norm": 0.7692833505524698, + "learning_rate": 6.109559279950342e-05, + "loss": 4.2295, + "step": 3937 + }, + { + "epoch": 0.18334613683450893, + "grad_norm": 0.7304245348827282, + "learning_rate": 6.111111111111112e-05, + "loss": 4.1542, + "step": 3938 + }, + { + "epoch": 0.183392695020602, + "grad_norm": 0.7012343383430681, + "learning_rate": 6.11266294227188e-05, + "loss": 4.182, + "step": 3939 + }, + { + "epoch": 0.18343925320669507, + "grad_norm": 0.7108242259111756, + "learning_rate": 6.11421477343265e-05, + "loss": 4.1467, + "step": 3940 + }, + { + "epoch": 0.18348581139278813, + "grad_norm": 0.9214874440382144, + "learning_rate": 6.115766604593421e-05, + "loss": 4.2722, + "step": 3941 + }, + { + "epoch": 0.1835323695788812, + "grad_norm": 0.9002249895920695, + "learning_rate": 6.11731843575419e-05, + "loss": 4.2828, + "step": 3942 + }, + { + "epoch": 0.18357892776497428, + "grad_norm": 0.8816157943676983, + "learning_rate": 6.11887026691496e-05, + "loss": 4.2112, + "step": 3943 + }, + { + "epoch": 0.18362548595106734, + "grad_norm": 0.8399257228596864, + "learning_rate": 6.12042209807573e-05, + 
"loss": 4.1981, + "step": 3944 + }, + { + "epoch": 0.18367204413716043, + "grad_norm": 0.6016841369951648, + "learning_rate": 6.1219739292365e-05, + "loss": 4.1337, + "step": 3945 + }, + { + "epoch": 0.1837186023232535, + "grad_norm": 0.8387829796611629, + "learning_rate": 6.123525760397269e-05, + "loss": 4.1758, + "step": 3946 + }, + { + "epoch": 0.18376516050934655, + "grad_norm": 0.9077119240743791, + "learning_rate": 6.125077591558039e-05, + "loss": 4.2016, + "step": 3947 + }, + { + "epoch": 0.18381171869543964, + "grad_norm": 0.7713060824114936, + "learning_rate": 6.126629422718808e-05, + "loss": 4.1943, + "step": 3948 + }, + { + "epoch": 0.1838582768815327, + "grad_norm": 0.6380031276891389, + "learning_rate": 6.128181253879578e-05, + "loss": 4.2345, + "step": 3949 + }, + { + "epoch": 0.18390483506762575, + "grad_norm": 0.7514863271558623, + "learning_rate": 6.129733085040348e-05, + "loss": 4.2009, + "step": 3950 + }, + { + "epoch": 0.18395139325371884, + "grad_norm": 0.7575236016136548, + "learning_rate": 6.131284916201117e-05, + "loss": 4.2872, + "step": 3951 + }, + { + "epoch": 0.1839979514398119, + "grad_norm": 0.745543393641894, + "learning_rate": 6.132836747361887e-05, + "loss": 4.2707, + "step": 3952 + }, + { + "epoch": 0.18404450962590496, + "grad_norm": 0.7444070299712902, + "learning_rate": 6.134388578522658e-05, + "loss": 4.3163, + "step": 3953 + }, + { + "epoch": 0.18409106781199805, + "grad_norm": 0.6895205276449621, + "learning_rate": 6.135940409683428e-05, + "loss": 4.1401, + "step": 3954 + }, + { + "epoch": 0.1841376259980911, + "grad_norm": 0.662267302198477, + "learning_rate": 6.137492240844196e-05, + "loss": 4.0335, + "step": 3955 + }, + { + "epoch": 0.1841841841841842, + "grad_norm": 0.6170284803317094, + "learning_rate": 6.139044072004965e-05, + "loss": 4.2407, + "step": 3956 + }, + { + "epoch": 0.18423074237027726, + "grad_norm": 0.6717644511060149, + "learning_rate": 6.140595903165735e-05, + "loss": 4.2351, + "step": 3957 + }, + { + "epoch": 0.18427730055637032, + "grad_norm": 0.7411755914690208, + "learning_rate": 6.142147734326506e-05, + "loss": 4.1056, + "step": 3958 + }, + { + "epoch": 0.1843238587424634, + "grad_norm": 0.6795476741325068, + "learning_rate": 6.143699565487276e-05, + "loss": 4.2765, + "step": 3959 + }, + { + "epoch": 0.18437041692855646, + "grad_norm": 0.6697847975080197, + "learning_rate": 6.145251396648045e-05, + "loss": 4.1984, + "step": 3960 + }, + { + "epoch": 0.18441697511464952, + "grad_norm": 0.7610413761640865, + "learning_rate": 6.146803227808815e-05, + "loss": 4.3048, + "step": 3961 + }, + { + "epoch": 0.1844635333007426, + "grad_norm": 0.7718194632303379, + "learning_rate": 6.148355058969585e-05, + "loss": 4.1238, + "step": 3962 + }, + { + "epoch": 0.18451009148683567, + "grad_norm": 0.7848180977144472, + "learning_rate": 6.149906890130354e-05, + "loss": 4.1788, + "step": 3963 + }, + { + "epoch": 0.18455664967292873, + "grad_norm": 0.6315735344139992, + "learning_rate": 6.151458721291124e-05, + "loss": 4.1617, + "step": 3964 + }, + { + "epoch": 0.18460320785902182, + "grad_norm": 0.6792905602491618, + "learning_rate": 6.153010552451893e-05, + "loss": 4.2978, + "step": 3965 + }, + { + "epoch": 0.18464976604511488, + "grad_norm": 0.6866257639815089, + "learning_rate": 6.154562383612663e-05, + "loss": 4.2318, + "step": 3966 + }, + { + "epoch": 0.18469632423120796, + "grad_norm": 0.7051450748439803, + "learning_rate": 6.156114214773433e-05, + "loss": 4.1677, + "step": 3967 + }, + { + "epoch": 0.18474288241730102, + "grad_norm": 
0.7869985317185285, + "learning_rate": 6.157666045934202e-05, + "loss": 4.1054, + "step": 3968 + }, + { + "epoch": 0.18478944060339408, + "grad_norm": 0.7750524115147266, + "learning_rate": 6.159217877094972e-05, + "loss": 4.1563, + "step": 3969 + }, + { + "epoch": 0.18483599878948717, + "grad_norm": 0.7063699491204396, + "learning_rate": 6.160769708255743e-05, + "loss": 4.1434, + "step": 3970 + }, + { + "epoch": 0.18488255697558023, + "grad_norm": 0.7218300963485614, + "learning_rate": 6.162321539416511e-05, + "loss": 4.173, + "step": 3971 + }, + { + "epoch": 0.1849291151616733, + "grad_norm": 0.810512342779486, + "learning_rate": 6.163873370577281e-05, + "loss": 4.0895, + "step": 3972 + }, + { + "epoch": 0.18497567334776638, + "grad_norm": 0.8651682114734073, + "learning_rate": 6.16542520173805e-05, + "loss": 4.1869, + "step": 3973 + }, + { + "epoch": 0.18502223153385944, + "grad_norm": 0.878787728627247, + "learning_rate": 6.16697703289882e-05, + "loss": 4.1983, + "step": 3974 + }, + { + "epoch": 0.1850687897199525, + "grad_norm": 0.8507247571494204, + "learning_rate": 6.168528864059591e-05, + "loss": 4.1964, + "step": 3975 + }, + { + "epoch": 0.18511534790604559, + "grad_norm": 0.7739364477058391, + "learning_rate": 6.170080695220361e-05, + "loss": 4.1788, + "step": 3976 + }, + { + "epoch": 0.18516190609213865, + "grad_norm": 0.8241208501610361, + "learning_rate": 6.17163252638113e-05, + "loss": 4.0271, + "step": 3977 + }, + { + "epoch": 0.18520846427823173, + "grad_norm": 0.8525478285193557, + "learning_rate": 6.1731843575419e-05, + "loss": 4.2006, + "step": 3978 + }, + { + "epoch": 0.1852550224643248, + "grad_norm": 0.835701890155023, + "learning_rate": 6.174736188702668e-05, + "loss": 4.1846, + "step": 3979 + }, + { + "epoch": 0.18530158065041785, + "grad_norm": 0.745425208040336, + "learning_rate": 6.176288019863439e-05, + "loss": 4.1309, + "step": 3980 + }, + { + "epoch": 0.18534813883651094, + "grad_norm": 0.7571437442115034, + "learning_rate": 6.177839851024209e-05, + "loss": 4.2007, + "step": 3981 + }, + { + "epoch": 0.185394697022604, + "grad_norm": 0.8610626606295488, + "learning_rate": 6.179391682184979e-05, + "loss": 4.1183, + "step": 3982 + }, + { + "epoch": 0.18544125520869706, + "grad_norm": 0.8677158442265915, + "learning_rate": 6.180943513345748e-05, + "loss": 4.136, + "step": 3983 + }, + { + "epoch": 0.18548781339479015, + "grad_norm": 0.7949053429243524, + "learning_rate": 6.182495344506518e-05, + "loss": 4.1602, + "step": 3984 + }, + { + "epoch": 0.1855343715808832, + "grad_norm": 0.7290767382065838, + "learning_rate": 6.184047175667287e-05, + "loss": 4.1748, + "step": 3985 + }, + { + "epoch": 0.18558092976697627, + "grad_norm": 0.7486454119290777, + "learning_rate": 6.185599006828058e-05, + "loss": 4.0859, + "step": 3986 + }, + { + "epoch": 0.18562748795306935, + "grad_norm": 0.9185195830745364, + "learning_rate": 6.187150837988828e-05, + "loss": 4.028, + "step": 3987 + }, + { + "epoch": 0.1856740461391624, + "grad_norm": 0.759346879644854, + "learning_rate": 6.188702669149596e-05, + "loss": 4.1119, + "step": 3988 + }, + { + "epoch": 0.1857206043252555, + "grad_norm": 0.7729670280894924, + "learning_rate": 6.190254500310366e-05, + "loss": 4.238, + "step": 3989 + }, + { + "epoch": 0.18576716251134856, + "grad_norm": 0.8051738156951358, + "learning_rate": 6.191806331471136e-05, + "loss": 4.1428, + "step": 3990 + }, + { + "epoch": 0.18581372069744162, + "grad_norm": 0.8523680749840709, + "learning_rate": 6.193358162631907e-05, + "loss": 4.1958, + "step": 3991 + }, + { 
+ "epoch": 0.1858602788835347, + "grad_norm": 0.8296903628479632, + "learning_rate": 6.194909993792676e-05, + "loss": 4.1607, + "step": 3992 + }, + { + "epoch": 0.18590683706962777, + "grad_norm": 0.7886910975698116, + "learning_rate": 6.196461824953446e-05, + "loss": 4.1956, + "step": 3993 + }, + { + "epoch": 0.18595339525572083, + "grad_norm": 0.8874911055924134, + "learning_rate": 6.198013656114215e-05, + "loss": 4.2123, + "step": 3994 + }, + { + "epoch": 0.18599995344181391, + "grad_norm": 0.9057644837594826, + "learning_rate": 6.199565487274985e-05, + "loss": 4.0838, + "step": 3995 + }, + { + "epoch": 0.18604651162790697, + "grad_norm": 0.7594864645514413, + "learning_rate": 6.201117318435755e-05, + "loss": 4.1912, + "step": 3996 + }, + { + "epoch": 0.18609306981400003, + "grad_norm": 0.7014791025170363, + "learning_rate": 6.202669149596524e-05, + "loss": 4.0857, + "step": 3997 + }, + { + "epoch": 0.18613962800009312, + "grad_norm": 0.822766118949515, + "learning_rate": 6.204220980757294e-05, + "loss": 4.1475, + "step": 3998 + }, + { + "epoch": 0.18618618618618618, + "grad_norm": 0.8666095040991393, + "learning_rate": 6.205772811918064e-05, + "loss": 4.0561, + "step": 3999 + }, + { + "epoch": 0.18623274437227927, + "grad_norm": 0.6408297415212535, + "learning_rate": 6.207324643078833e-05, + "loss": 4.1802, + "step": 4000 + }, + { + "epoch": 0.18627930255837233, + "grad_norm": 0.5728749621923871, + "learning_rate": 6.208876474239603e-05, + "loss": 4.2921, + "step": 4001 + }, + { + "epoch": 0.1863258607444654, + "grad_norm": 0.8055986661642176, + "learning_rate": 6.210428305400373e-05, + "loss": 4.1252, + "step": 4002 + }, + { + "epoch": 0.18637241893055848, + "grad_norm": 0.6740123236614911, + "learning_rate": 6.211980136561143e-05, + "loss": 4.197, + "step": 4003 + }, + { + "epoch": 0.18641897711665154, + "grad_norm": 0.6780818564533784, + "learning_rate": 6.213531967721912e-05, + "loss": 4.0465, + "step": 4004 + }, + { + "epoch": 0.1864655353027446, + "grad_norm": 0.6220178171735988, + "learning_rate": 6.215083798882681e-05, + "loss": 4.1325, + "step": 4005 + }, + { + "epoch": 0.18651209348883768, + "grad_norm": 0.66671113015331, + "learning_rate": 6.216635630043451e-05, + "loss": 4.1449, + "step": 4006 + }, + { + "epoch": 0.18655865167493074, + "grad_norm": 0.630108237322099, + "learning_rate": 6.21818746120422e-05, + "loss": 4.1774, + "step": 4007 + }, + { + "epoch": 0.1866052098610238, + "grad_norm": 0.6756020657091563, + "learning_rate": 6.219739292364992e-05, + "loss": 4.0956, + "step": 4008 + }, + { + "epoch": 0.1866517680471169, + "grad_norm": 0.7885837930792755, + "learning_rate": 6.221291123525761e-05, + "loss": 4.2117, + "step": 4009 + }, + { + "epoch": 0.18669832623320995, + "grad_norm": 0.7707996622458259, + "learning_rate": 6.222842954686531e-05, + "loss": 4.1429, + "step": 4010 + }, + { + "epoch": 0.18674488441930304, + "grad_norm": 0.6882548300336012, + "learning_rate": 6.2243947858473e-05, + "loss": 4.0775, + "step": 4011 + }, + { + "epoch": 0.1867914426053961, + "grad_norm": 0.7089294064381979, + "learning_rate": 6.225946617008069e-05, + "loss": 4.2186, + "step": 4012 + }, + { + "epoch": 0.18683800079148916, + "grad_norm": 0.6542719331569343, + "learning_rate": 6.22749844816884e-05, + "loss": 4.1756, + "step": 4013 + }, + { + "epoch": 0.18688455897758224, + "grad_norm": 0.6847784844990806, + "learning_rate": 6.22905027932961e-05, + "loss": 4.1769, + "step": 4014 + }, + { + "epoch": 0.1869311171636753, + "grad_norm": 0.746711996665422, + "learning_rate": 
6.230602110490379e-05, + "loss": 4.1234, + "step": 4015 + }, + { + "epoch": 0.18697767534976836, + "grad_norm": 0.771044935413645, + "learning_rate": 6.232153941651149e-05, + "loss": 4.1907, + "step": 4016 + }, + { + "epoch": 0.18702423353586145, + "grad_norm": 0.8333596086426092, + "learning_rate": 6.233705772811918e-05, + "loss": 4.2416, + "step": 4017 + }, + { + "epoch": 0.1870707917219545, + "grad_norm": 0.8534883721204307, + "learning_rate": 6.235257603972688e-05, + "loss": 4.2129, + "step": 4018 + }, + { + "epoch": 0.18711734990804757, + "grad_norm": 0.8054043944647545, + "learning_rate": 6.236809435133459e-05, + "loss": 3.9516, + "step": 4019 + }, + { + "epoch": 0.18716390809414066, + "grad_norm": 0.7584075211701733, + "learning_rate": 6.238361266294227e-05, + "loss": 4.1001, + "step": 4020 + }, + { + "epoch": 0.18721046628023372, + "grad_norm": 0.7046179132483268, + "learning_rate": 6.239913097454997e-05, + "loss": 4.1276, + "step": 4021 + }, + { + "epoch": 0.1872570244663268, + "grad_norm": 0.7213635169607624, + "learning_rate": 6.241464928615766e-05, + "loss": 4.2497, + "step": 4022 + }, + { + "epoch": 0.18730358265241986, + "grad_norm": 0.8212911127287905, + "learning_rate": 6.243016759776536e-05, + "loss": 4.1293, + "step": 4023 + }, + { + "epoch": 0.18735014083851292, + "grad_norm": 0.7652904781469578, + "learning_rate": 6.244568590937306e-05, + "loss": 4.186, + "step": 4024 + }, + { + "epoch": 0.187396699024606, + "grad_norm": 0.6834612117848685, + "learning_rate": 6.246120422098077e-05, + "loss": 4.2406, + "step": 4025 + }, + { + "epoch": 0.18744325721069907, + "grad_norm": 0.7139447621877224, + "learning_rate": 6.247672253258846e-05, + "loss": 4.1018, + "step": 4026 + }, + { + "epoch": 0.18748981539679213, + "grad_norm": 0.6756614867868991, + "learning_rate": 6.249224084419616e-05, + "loss": 4.2467, + "step": 4027 + }, + { + "epoch": 0.18753637358288522, + "grad_norm": 0.7445363746455813, + "learning_rate": 6.250775915580384e-05, + "loss": 4.1195, + "step": 4028 + }, + { + "epoch": 0.18758293176897828, + "grad_norm": 0.7511071712464239, + "learning_rate": 6.252327746741154e-05, + "loss": 4.1648, + "step": 4029 + }, + { + "epoch": 0.18762948995507134, + "grad_norm": 0.6599759572808724, + "learning_rate": 6.253879577901925e-05, + "loss": 4.1711, + "step": 4030 + }, + { + "epoch": 0.18767604814116443, + "grad_norm": 0.7702681891249272, + "learning_rate": 6.255431409062695e-05, + "loss": 4.2072, + "step": 4031 + }, + { + "epoch": 0.18772260632725749, + "grad_norm": 0.8357475574804341, + "learning_rate": 6.256983240223464e-05, + "loss": 4.159, + "step": 4032 + }, + { + "epoch": 0.18776916451335057, + "grad_norm": 0.8286171235899772, + "learning_rate": 6.258535071384234e-05, + "loss": 4.1852, + "step": 4033 + }, + { + "epoch": 0.18781572269944363, + "grad_norm": 0.6635072979857012, + "learning_rate": 6.260086902545003e-05, + "loss": 4.0245, + "step": 4034 + }, + { + "epoch": 0.1878622808855367, + "grad_norm": 0.6466782893412993, + "learning_rate": 6.261638733705773e-05, + "loss": 4.1261, + "step": 4035 + }, + { + "epoch": 0.18790883907162978, + "grad_norm": 0.7843978885586533, + "learning_rate": 6.263190564866543e-05, + "loss": 4.1118, + "step": 4036 + }, + { + "epoch": 0.18795539725772284, + "grad_norm": 0.8416539463997713, + "learning_rate": 6.264742396027312e-05, + "loss": 4.3112, + "step": 4037 + }, + { + "epoch": 0.1880019554438159, + "grad_norm": 0.8014910789013927, + "learning_rate": 6.266294227188082e-05, + "loss": 4.2101, + "step": 4038 + }, + { + "epoch": 
0.188048513629909, + "grad_norm": 0.672830701559121, + "learning_rate": 6.267846058348852e-05, + "loss": 4.2212, + "step": 4039 + }, + { + "epoch": 0.18809507181600205, + "grad_norm": 0.735228465429786, + "learning_rate": 6.269397889509621e-05, + "loss": 4.0827, + "step": 4040 + }, + { + "epoch": 0.1881416300020951, + "grad_norm": 0.7974000825867972, + "learning_rate": 6.270949720670392e-05, + "loss": 4.2534, + "step": 4041 + }, + { + "epoch": 0.1881881881881882, + "grad_norm": 0.7795057421590793, + "learning_rate": 6.272501551831162e-05, + "loss": 4.1651, + "step": 4042 + }, + { + "epoch": 0.18823474637428125, + "grad_norm": 0.8264346515999738, + "learning_rate": 6.274053382991931e-05, + "loss": 4.2694, + "step": 4043 + }, + { + "epoch": 0.18828130456037434, + "grad_norm": 0.894308588475187, + "learning_rate": 6.2756052141527e-05, + "loss": 4.1667, + "step": 4044 + }, + { + "epoch": 0.1883278627464674, + "grad_norm": 0.8214533852846893, + "learning_rate": 6.27715704531347e-05, + "loss": 4.2869, + "step": 4045 + }, + { + "epoch": 0.18837442093256046, + "grad_norm": 0.789629296169203, + "learning_rate": 6.27870887647424e-05, + "loss": 4.2101, + "step": 4046 + }, + { + "epoch": 0.18842097911865355, + "grad_norm": 0.86541960662306, + "learning_rate": 6.28026070763501e-05, + "loss": 4.1577, + "step": 4047 + }, + { + "epoch": 0.1884675373047466, + "grad_norm": 0.8369413897820789, + "learning_rate": 6.28181253879578e-05, + "loss": 4.1348, + "step": 4048 + }, + { + "epoch": 0.18851409549083967, + "grad_norm": 0.692627400422643, + "learning_rate": 6.283364369956549e-05, + "loss": 4.1597, + "step": 4049 + }, + { + "epoch": 0.18856065367693275, + "grad_norm": 0.8354904907431329, + "learning_rate": 6.284916201117319e-05, + "loss": 4.1516, + "step": 4050 + }, + { + "epoch": 0.18860721186302581, + "grad_norm": 0.9503385203796904, + "learning_rate": 6.286468032278088e-05, + "loss": 4.1565, + "step": 4051 + }, + { + "epoch": 0.18865377004911887, + "grad_norm": 0.8315662809280958, + "learning_rate": 6.288019863438858e-05, + "loss": 4.1376, + "step": 4052 + }, + { + "epoch": 0.18870032823521196, + "grad_norm": 0.7344579644360374, + "learning_rate": 6.289571694599628e-05, + "loss": 4.2094, + "step": 4053 + }, + { + "epoch": 0.18874688642130502, + "grad_norm": 0.7806422296118244, + "learning_rate": 6.291123525760397e-05, + "loss": 4.0652, + "step": 4054 + }, + { + "epoch": 0.1887934446073981, + "grad_norm": 0.7278393463340084, + "learning_rate": 6.292675356921167e-05, + "loss": 4.2029, + "step": 4055 + }, + { + "epoch": 0.18884000279349117, + "grad_norm": 0.8077609250372254, + "learning_rate": 6.294227188081937e-05, + "loss": 4.0522, + "step": 4056 + }, + { + "epoch": 0.18888656097958423, + "grad_norm": 0.730514872086241, + "learning_rate": 6.295779019242706e-05, + "loss": 4.1275, + "step": 4057 + }, + { + "epoch": 0.18893311916567732, + "grad_norm": 0.7133288847663001, + "learning_rate": 6.297330850403477e-05, + "loss": 4.0792, + "step": 4058 + }, + { + "epoch": 0.18897967735177038, + "grad_norm": 0.8543436125342277, + "learning_rate": 6.298882681564247e-05, + "loss": 4.1127, + "step": 4059 + }, + { + "epoch": 0.18902623553786344, + "grad_norm": 0.8465139582339302, + "learning_rate": 6.300434512725015e-05, + "loss": 4.004, + "step": 4060 + }, + { + "epoch": 0.18907279372395652, + "grad_norm": 0.8390770369639395, + "learning_rate": 6.301986343885785e-05, + "loss": 4.0317, + "step": 4061 + }, + { + "epoch": 0.18911935191004958, + "grad_norm": 0.6991775939163866, + "learning_rate": 6.303538175046554e-05, + 
"loss": 4.1461, + "step": 4062 + }, + { + "epoch": 0.18916591009614264, + "grad_norm": 0.6858986694647746, + "learning_rate": 6.305090006207325e-05, + "loss": 4.0907, + "step": 4063 + }, + { + "epoch": 0.18921246828223573, + "grad_norm": 0.8290993991391429, + "learning_rate": 6.306641837368095e-05, + "loss": 4.1214, + "step": 4064 + }, + { + "epoch": 0.1892590264683288, + "grad_norm": 0.8541064226240626, + "learning_rate": 6.308193668528865e-05, + "loss": 4.2061, + "step": 4065 + }, + { + "epoch": 0.18930558465442188, + "grad_norm": 0.9423964766563608, + "learning_rate": 6.309745499689634e-05, + "loss": 4.1867, + "step": 4066 + }, + { + "epoch": 0.18935214284051494, + "grad_norm": 1.0300404542620936, + "learning_rate": 6.311297330850404e-05, + "loss": 4.1967, + "step": 4067 + }, + { + "epoch": 0.189398701026608, + "grad_norm": 0.9405051049519003, + "learning_rate": 6.312849162011174e-05, + "loss": 4.1674, + "step": 4068 + }, + { + "epoch": 0.18944525921270108, + "grad_norm": 0.7860878269512304, + "learning_rate": 6.314400993171943e-05, + "loss": 4.1544, + "step": 4069 + }, + { + "epoch": 0.18949181739879414, + "grad_norm": 0.6743787650066745, + "learning_rate": 6.315952824332713e-05, + "loss": 4.257, + "step": 4070 + }, + { + "epoch": 0.1895383755848872, + "grad_norm": 0.7579080672778646, + "learning_rate": 6.317504655493482e-05, + "loss": 4.1456, + "step": 4071 + }, + { + "epoch": 0.1895849337709803, + "grad_norm": 0.6975811759491106, + "learning_rate": 6.319056486654252e-05, + "loss": 4.2176, + "step": 4072 + }, + { + "epoch": 0.18963149195707335, + "grad_norm": 0.7840237916665231, + "learning_rate": 6.320608317815022e-05, + "loss": 4.2005, + "step": 4073 + }, + { + "epoch": 0.1896780501431664, + "grad_norm": 0.764017464717214, + "learning_rate": 6.322160148975791e-05, + "loss": 4.222, + "step": 4074 + }, + { + "epoch": 0.1897246083292595, + "grad_norm": 0.7355764738526458, + "learning_rate": 6.323711980136562e-05, + "loss": 4.2719, + "step": 4075 + }, + { + "epoch": 0.18977116651535256, + "grad_norm": 0.7702067883308888, + "learning_rate": 6.325263811297332e-05, + "loss": 4.1188, + "step": 4076 + }, + { + "epoch": 0.18981772470144564, + "grad_norm": 0.738358234699201, + "learning_rate": 6.3268156424581e-05, + "loss": 4.1171, + "step": 4077 + }, + { + "epoch": 0.1898642828875387, + "grad_norm": 0.729673465060062, + "learning_rate": 6.32836747361887e-05, + "loss": 4.0764, + "step": 4078 + }, + { + "epoch": 0.18991084107363176, + "grad_norm": 0.91222962107939, + "learning_rate": 6.32991930477964e-05, + "loss": 4.1596, + "step": 4079 + }, + { + "epoch": 0.18995739925972485, + "grad_norm": 0.8693015376165459, + "learning_rate": 6.33147113594041e-05, + "loss": 4.028, + "step": 4080 + }, + { + "epoch": 0.1900039574458179, + "grad_norm": 0.7330269263598284, + "learning_rate": 6.33302296710118e-05, + "loss": 4.1578, + "step": 4081 + }, + { + "epoch": 0.19005051563191097, + "grad_norm": 0.7011639056242897, + "learning_rate": 6.33457479826195e-05, + "loss": 4.0593, + "step": 4082 + }, + { + "epoch": 0.19009707381800406, + "grad_norm": 0.7003090457397176, + "learning_rate": 6.33612662942272e-05, + "loss": 4.1821, + "step": 4083 + }, + { + "epoch": 0.19014363200409712, + "grad_norm": 0.5847297074306602, + "learning_rate": 6.337678460583489e-05, + "loss": 4.043, + "step": 4084 + }, + { + "epoch": 0.19019019019019018, + "grad_norm": 0.6058040376114491, + "learning_rate": 6.339230291744259e-05, + "loss": 4.1676, + "step": 4085 + }, + { + "epoch": 0.19023674837628327, + "grad_norm": 0.64244037558356, + 
"learning_rate": 6.340782122905028e-05, + "loss": 4.1299, + "step": 4086 + }, + { + "epoch": 0.19028330656237633, + "grad_norm": 0.5983451711775477, + "learning_rate": 6.342333954065798e-05, + "loss": 4.1712, + "step": 4087 + }, + { + "epoch": 0.1903298647484694, + "grad_norm": 0.6929478311955273, + "learning_rate": 6.343885785226568e-05, + "loss": 4.229, + "step": 4088 + }, + { + "epoch": 0.19037642293456247, + "grad_norm": 0.6520360956274497, + "learning_rate": 6.345437616387337e-05, + "loss": 4.0913, + "step": 4089 + }, + { + "epoch": 0.19042298112065553, + "grad_norm": 0.5964482004721838, + "learning_rate": 6.346989447548107e-05, + "loss": 4.0407, + "step": 4090 + }, + { + "epoch": 0.19046953930674862, + "grad_norm": 0.5623739209255435, + "learning_rate": 6.348541278708878e-05, + "loss": 4.1868, + "step": 4091 + }, + { + "epoch": 0.19051609749284168, + "grad_norm": 0.6262272986612123, + "learning_rate": 6.350093109869647e-05, + "loss": 3.9795, + "step": 4092 + }, + { + "epoch": 0.19056265567893474, + "grad_norm": 0.6450456249541183, + "learning_rate": 6.351644941030416e-05, + "loss": 4.1226, + "step": 4093 + }, + { + "epoch": 0.19060921386502783, + "grad_norm": 0.7030668087456539, + "learning_rate": 6.353196772191185e-05, + "loss": 4.1151, + "step": 4094 + }, + { + "epoch": 0.1906557720511209, + "grad_norm": 0.7210423633152396, + "learning_rate": 6.354748603351955e-05, + "loss": 4.1892, + "step": 4095 + }, + { + "epoch": 0.19070233023721395, + "grad_norm": 0.740294306388192, + "learning_rate": 6.356300434512726e-05, + "loss": 4.1304, + "step": 4096 + }, + { + "epoch": 0.19074888842330703, + "grad_norm": 0.645645942580855, + "learning_rate": 6.357852265673496e-05, + "loss": 4.0806, + "step": 4097 + }, + { + "epoch": 0.1907954466094001, + "grad_norm": 0.6067096018613718, + "learning_rate": 6.359404096834265e-05, + "loss": 4.1099, + "step": 4098 + }, + { + "epoch": 0.19084200479549318, + "grad_norm": 0.5678646570765754, + "learning_rate": 6.360955927995035e-05, + "loss": 3.9933, + "step": 4099 + }, + { + "epoch": 0.19088856298158624, + "grad_norm": 0.6216129324898577, + "learning_rate": 6.362507759155804e-05, + "loss": 4.2039, + "step": 4100 + }, + { + "epoch": 0.1909351211676793, + "grad_norm": 0.6481713061684334, + "learning_rate": 6.364059590316573e-05, + "loss": 4.239, + "step": 4101 + }, + { + "epoch": 0.1909816793537724, + "grad_norm": 0.6869609152146791, + "learning_rate": 6.365611421477344e-05, + "loss": 3.9784, + "step": 4102 + }, + { + "epoch": 0.19102823753986545, + "grad_norm": 0.6243712267914149, + "learning_rate": 6.367163252638113e-05, + "loss": 4.1125, + "step": 4103 + }, + { + "epoch": 0.1910747957259585, + "grad_norm": 0.6364918149373529, + "learning_rate": 6.368715083798883e-05, + "loss": 4.1162, + "step": 4104 + }, + { + "epoch": 0.1911213539120516, + "grad_norm": 0.8021432422308652, + "learning_rate": 6.370266914959653e-05, + "loss": 4.2486, + "step": 4105 + }, + { + "epoch": 0.19116791209814465, + "grad_norm": 0.7810436201570511, + "learning_rate": 6.371818746120422e-05, + "loss": 4.1255, + "step": 4106 + }, + { + "epoch": 0.19121447028423771, + "grad_norm": 0.7765575604116055, + "learning_rate": 6.373370577281192e-05, + "loss": 4.1592, + "step": 4107 + }, + { + "epoch": 0.1912610284703308, + "grad_norm": 1.071692126977851, + "learning_rate": 6.374922408441963e-05, + "loss": 4.1522, + "step": 4108 + }, + { + "epoch": 0.19130758665642386, + "grad_norm": 0.9356658194537021, + "learning_rate": 6.376474239602731e-05, + "loss": 4.1322, + "step": 4109 + }, + { + "epoch": 
0.19135414484251695, + "grad_norm": 0.7680931877303133, + "learning_rate": 6.378026070763501e-05, + "loss": 4.1226, + "step": 4110 + }, + { + "epoch": 0.19140070302861, + "grad_norm": 0.7466185681694001, + "learning_rate": 6.37957790192427e-05, + "loss": 4.1144, + "step": 4111 + }, + { + "epoch": 0.19144726121470307, + "grad_norm": 0.6791054431693127, + "learning_rate": 6.38112973308504e-05, + "loss": 4.1493, + "step": 4112 + }, + { + "epoch": 0.19149381940079616, + "grad_norm": 0.6978610451131491, + "learning_rate": 6.382681564245811e-05, + "loss": 4.1505, + "step": 4113 + }, + { + "epoch": 0.19154037758688922, + "grad_norm": 0.6908948651378242, + "learning_rate": 6.38423339540658e-05, + "loss": 4.1901, + "step": 4114 + }, + { + "epoch": 0.19158693577298228, + "grad_norm": 0.5749425480038505, + "learning_rate": 6.38578522656735e-05, + "loss": 4.0595, + "step": 4115 + }, + { + "epoch": 0.19163349395907536, + "grad_norm": 0.7233961442951893, + "learning_rate": 6.38733705772812e-05, + "loss": 4.1003, + "step": 4116 + }, + { + "epoch": 0.19168005214516842, + "grad_norm": 0.6751987234309695, + "learning_rate": 6.388888888888888e-05, + "loss": 4.23, + "step": 4117 + }, + { + "epoch": 0.19172661033126148, + "grad_norm": 0.6092210513189855, + "learning_rate": 6.390440720049659e-05, + "loss": 4.0625, + "step": 4118 + }, + { + "epoch": 0.19177316851735457, + "grad_norm": 0.6838737762330949, + "learning_rate": 6.391992551210429e-05, + "loss": 4.0552, + "step": 4119 + }, + { + "epoch": 0.19181972670344763, + "grad_norm": 0.7148312462610708, + "learning_rate": 6.393544382371198e-05, + "loss": 4.0282, + "step": 4120 + }, + { + "epoch": 0.19186628488954072, + "grad_norm": 0.7210844784281714, + "learning_rate": 6.395096213531968e-05, + "loss": 3.9915, + "step": 4121 + }, + { + "epoch": 0.19191284307563378, + "grad_norm": 0.7333915275527819, + "learning_rate": 6.396648044692738e-05, + "loss": 4.041, + "step": 4122 + }, + { + "epoch": 0.19195940126172684, + "grad_norm": 0.8454302169856257, + "learning_rate": 6.398199875853507e-05, + "loss": 4.1589, + "step": 4123 + }, + { + "epoch": 0.19200595944781992, + "grad_norm": 0.7879321340126457, + "learning_rate": 6.399751707014277e-05, + "loss": 4.046, + "step": 4124 + }, + { + "epoch": 0.19205251763391298, + "grad_norm": 0.7154460805143492, + "learning_rate": 6.401303538175047e-05, + "loss": 4.157, + "step": 4125 + }, + { + "epoch": 0.19209907582000604, + "grad_norm": 0.7826146642961368, + "learning_rate": 6.402855369335816e-05, + "loss": 4.1488, + "step": 4126 + }, + { + "epoch": 0.19214563400609913, + "grad_norm": 0.8327392476941636, + "learning_rate": 6.404407200496586e-05, + "loss": 4.1086, + "step": 4127 + }, + { + "epoch": 0.1921921921921922, + "grad_norm": 0.7961578393365908, + "learning_rate": 6.405959031657355e-05, + "loss": 4.1157, + "step": 4128 + }, + { + "epoch": 0.19223875037828525, + "grad_norm": 0.7265949167232442, + "learning_rate": 6.407510862818125e-05, + "loss": 4.1623, + "step": 4129 + }, + { + "epoch": 0.19228530856437834, + "grad_norm": 0.7196578597265663, + "learning_rate": 6.409062693978896e-05, + "loss": 4.128, + "step": 4130 + }, + { + "epoch": 0.1923318667504714, + "grad_norm": 0.7053623657834959, + "learning_rate": 6.410614525139666e-05, + "loss": 4.1573, + "step": 4131 + }, + { + "epoch": 0.19237842493656448, + "grad_norm": 0.7563939432295701, + "learning_rate": 6.412166356300435e-05, + "loss": 4.0937, + "step": 4132 + }, + { + "epoch": 0.19242498312265754, + "grad_norm": 0.805529439231568, + "learning_rate": 6.413718187461204e-05, 
+ "loss": 4.011, + "step": 4133 + }, + { + "epoch": 0.1924715413087506, + "grad_norm": 0.806986628085349, + "learning_rate": 6.415270018621973e-05, + "loss": 4.1235, + "step": 4134 + }, + { + "epoch": 0.1925180994948437, + "grad_norm": 0.7787440285705578, + "learning_rate": 6.416821849782744e-05, + "loss": 4.1181, + "step": 4135 + }, + { + "epoch": 0.19256465768093675, + "grad_norm": 0.7936482275035984, + "learning_rate": 6.418373680943514e-05, + "loss": 4.072, + "step": 4136 + }, + { + "epoch": 0.1926112158670298, + "grad_norm": 0.7869266633836373, + "learning_rate": 6.419925512104284e-05, + "loss": 4.1041, + "step": 4137 + }, + { + "epoch": 0.1926577740531229, + "grad_norm": 0.6651414626489011, + "learning_rate": 6.421477343265053e-05, + "loss": 4.2449, + "step": 4138 + }, + { + "epoch": 0.19270433223921596, + "grad_norm": 0.8763453734831014, + "learning_rate": 6.423029174425823e-05, + "loss": 4.1265, + "step": 4139 + }, + { + "epoch": 0.19275089042530902, + "grad_norm": 0.9322434761503801, + "learning_rate": 6.424581005586592e-05, + "loss": 4.094, + "step": 4140 + }, + { + "epoch": 0.1927974486114021, + "grad_norm": 0.9346202548232287, + "learning_rate": 6.426132836747362e-05, + "loss": 4.1475, + "step": 4141 + }, + { + "epoch": 0.19284400679749517, + "grad_norm": 0.8243547110091728, + "learning_rate": 6.427684667908132e-05, + "loss": 4.1839, + "step": 4142 + }, + { + "epoch": 0.19289056498358825, + "grad_norm": 0.7659134836328283, + "learning_rate": 6.429236499068901e-05, + "loss": 4.2028, + "step": 4143 + }, + { + "epoch": 0.1929371231696813, + "grad_norm": 0.734714981220425, + "learning_rate": 6.430788330229671e-05, + "loss": 3.9958, + "step": 4144 + }, + { + "epoch": 0.19298368135577437, + "grad_norm": 0.7213093810233142, + "learning_rate": 6.43234016139044e-05, + "loss": 4.0527, + "step": 4145 + }, + { + "epoch": 0.19303023954186746, + "grad_norm": 0.7921736414739884, + "learning_rate": 6.433891992551212e-05, + "loss": 4.0992, + "step": 4146 + }, + { + "epoch": 0.19307679772796052, + "grad_norm": 0.813573390936768, + "learning_rate": 6.435443823711981e-05, + "loss": 4.1977, + "step": 4147 + }, + { + "epoch": 0.19312335591405358, + "grad_norm": 0.798063117236429, + "learning_rate": 6.436995654872751e-05, + "loss": 4.0503, + "step": 4148 + }, + { + "epoch": 0.19316991410014667, + "grad_norm": 0.6898100897693885, + "learning_rate": 6.438547486033519e-05, + "loss": 4.1643, + "step": 4149 + }, + { + "epoch": 0.19321647228623973, + "grad_norm": 0.8395232118720444, + "learning_rate": 6.440099317194289e-05, + "loss": 4.1914, + "step": 4150 + }, + { + "epoch": 0.1932630304723328, + "grad_norm": 0.8217125885429023, + "learning_rate": 6.441651148355058e-05, + "loss": 4.1362, + "step": 4151 + }, + { + "epoch": 0.19330958865842587, + "grad_norm": 0.7959894344842183, + "learning_rate": 6.443202979515829e-05, + "loss": 3.9147, + "step": 4152 + }, + { + "epoch": 0.19335614684451893, + "grad_norm": 0.7057776183864317, + "learning_rate": 6.444754810676599e-05, + "loss": 4.1827, + "step": 4153 + }, + { + "epoch": 0.19340270503061202, + "grad_norm": 0.7957670073673395, + "learning_rate": 6.446306641837369e-05, + "loss": 4.0654, + "step": 4154 + }, + { + "epoch": 0.19344926321670508, + "grad_norm": 0.7364765655822828, + "learning_rate": 6.447858472998138e-05, + "loss": 4.1993, + "step": 4155 + }, + { + "epoch": 0.19349582140279814, + "grad_norm": 0.7389179784183226, + "learning_rate": 6.449410304158908e-05, + "loss": 4.1806, + "step": 4156 + }, + { + "epoch": 0.19354237958889123, + "grad_norm": 
0.6562367359201906, + "learning_rate": 6.450962135319677e-05, + "loss": 4.1165, + "step": 4157 + }, + { + "epoch": 0.1935889377749843, + "grad_norm": 0.6707320766324514, + "learning_rate": 6.452513966480447e-05, + "loss": 4.0797, + "step": 4158 + }, + { + "epoch": 0.19363549596107735, + "grad_norm": 0.6983742177931663, + "learning_rate": 6.454065797641217e-05, + "loss": 4.1238, + "step": 4159 + }, + { + "epoch": 0.19368205414717043, + "grad_norm": 0.6676080777770651, + "learning_rate": 6.455617628801986e-05, + "loss": 4.0814, + "step": 4160 + }, + { + "epoch": 0.1937286123332635, + "grad_norm": 0.6770796022615304, + "learning_rate": 6.457169459962756e-05, + "loss": 4.0362, + "step": 4161 + }, + { + "epoch": 0.19377517051935655, + "grad_norm": 0.6019021936070122, + "learning_rate": 6.458721291123526e-05, + "loss": 4.1436, + "step": 4162 + }, + { + "epoch": 0.19382172870544964, + "grad_norm": 0.6638748897185771, + "learning_rate": 6.460273122284297e-05, + "loss": 4.0895, + "step": 4163 + }, + { + "epoch": 0.1938682868915427, + "grad_norm": 0.6919086544265324, + "learning_rate": 6.461824953445066e-05, + "loss": 4.0665, + "step": 4164 + }, + { + "epoch": 0.1939148450776358, + "grad_norm": 0.7407838580149502, + "learning_rate": 6.463376784605836e-05, + "loss": 4.1257, + "step": 4165 + }, + { + "epoch": 0.19396140326372885, + "grad_norm": 0.7401865438221251, + "learning_rate": 6.464928615766604e-05, + "loss": 4.0254, + "step": 4166 + }, + { + "epoch": 0.1940079614498219, + "grad_norm": 0.7735598202906473, + "learning_rate": 6.466480446927374e-05, + "loss": 4.1117, + "step": 4167 + }, + { + "epoch": 0.194054519635915, + "grad_norm": 1.0515404986094758, + "learning_rate": 6.468032278088145e-05, + "loss": 4.1004, + "step": 4168 + }, + { + "epoch": 0.19410107782200806, + "grad_norm": 1.0156759187645314, + "learning_rate": 6.469584109248914e-05, + "loss": 4.0693, + "step": 4169 + }, + { + "epoch": 0.19414763600810112, + "grad_norm": 0.8464578681890704, + "learning_rate": 6.471135940409684e-05, + "loss": 4.1139, + "step": 4170 + }, + { + "epoch": 0.1941941941941942, + "grad_norm": 0.9605069966660815, + "learning_rate": 6.472687771570454e-05, + "loss": 4.1371, + "step": 4171 + }, + { + "epoch": 0.19424075238028726, + "grad_norm": 0.8985751317738302, + "learning_rate": 6.474239602731223e-05, + "loss": 4.1626, + "step": 4172 + }, + { + "epoch": 0.19428731056638032, + "grad_norm": 0.8872751815969729, + "learning_rate": 6.475791433891993e-05, + "loss": 4.0026, + "step": 4173 + }, + { + "epoch": 0.1943338687524734, + "grad_norm": 0.9726638186093769, + "learning_rate": 6.477343265052763e-05, + "loss": 4.131, + "step": 4174 + }, + { + "epoch": 0.19438042693856647, + "grad_norm": 0.9100914623193274, + "learning_rate": 6.478895096213532e-05, + "loss": 4.2546, + "step": 4175 + }, + { + "epoch": 0.19442698512465956, + "grad_norm": 0.8161444340153023, + "learning_rate": 6.480446927374302e-05, + "loss": 4.1114, + "step": 4176 + }, + { + "epoch": 0.19447354331075262, + "grad_norm": 0.8413476451436144, + "learning_rate": 6.481998758535071e-05, + "loss": 4.2473, + "step": 4177 + }, + { + "epoch": 0.19452010149684568, + "grad_norm": 0.7554309723931966, + "learning_rate": 6.483550589695841e-05, + "loss": 4.261, + "step": 4178 + }, + { + "epoch": 0.19456665968293876, + "grad_norm": 0.6840998483980959, + "learning_rate": 6.485102420856611e-05, + "loss": 4.113, + "step": 4179 + }, + { + "epoch": 0.19461321786903182, + "grad_norm": 0.6510329088080539, + "learning_rate": 6.486654252017382e-05, + "loss": 4.1308, + "step": 
4180 + }, + { + "epoch": 0.19465977605512488, + "grad_norm": 0.6580046228482013, + "learning_rate": 6.488206083178151e-05, + "loss": 4.1478, + "step": 4181 + }, + { + "epoch": 0.19470633424121797, + "grad_norm": 0.6574154730312153, + "learning_rate": 6.48975791433892e-05, + "loss": 4.0278, + "step": 4182 + }, + { + "epoch": 0.19475289242731103, + "grad_norm": 0.6140625663758303, + "learning_rate": 6.491309745499689e-05, + "loss": 4.0069, + "step": 4183 + }, + { + "epoch": 0.1947994506134041, + "grad_norm": 0.5733062888646809, + "learning_rate": 6.492861576660459e-05, + "loss": 4.11, + "step": 4184 + }, + { + "epoch": 0.19484600879949718, + "grad_norm": 0.656781472992855, + "learning_rate": 6.49441340782123e-05, + "loss": 4.058, + "step": 4185 + }, + { + "epoch": 0.19489256698559024, + "grad_norm": 0.7179303251233912, + "learning_rate": 6.495965238982e-05, + "loss": 4.127, + "step": 4186 + }, + { + "epoch": 0.1949391251716833, + "grad_norm": 0.6881488797569025, + "learning_rate": 6.497517070142769e-05, + "loss": 4.0523, + "step": 4187 + }, + { + "epoch": 0.19498568335777638, + "grad_norm": 0.617975577235413, + "learning_rate": 6.499068901303539e-05, + "loss": 4.1067, + "step": 4188 + }, + { + "epoch": 0.19503224154386944, + "grad_norm": 0.7354767200164543, + "learning_rate": 6.500620732464308e-05, + "loss": 4.1328, + "step": 4189 + }, + { + "epoch": 0.19507879972996253, + "grad_norm": 0.6515750030926878, + "learning_rate": 6.502172563625078e-05, + "loss": 4.0379, + "step": 4190 + }, + { + "epoch": 0.1951253579160556, + "grad_norm": 0.6550250322326616, + "learning_rate": 6.503724394785848e-05, + "loss": 4.2345, + "step": 4191 + }, + { + "epoch": 0.19517191610214865, + "grad_norm": 0.6657876307195453, + "learning_rate": 6.505276225946617e-05, + "loss": 4.0574, + "step": 4192 + }, + { + "epoch": 0.19521847428824174, + "grad_norm": 0.6281447602390836, + "learning_rate": 6.506828057107387e-05, + "loss": 4.1065, + "step": 4193 + }, + { + "epoch": 0.1952650324743348, + "grad_norm": 0.6483586522067608, + "learning_rate": 6.508379888268157e-05, + "loss": 4.1408, + "step": 4194 + }, + { + "epoch": 0.19531159066042786, + "grad_norm": 0.6532096356807874, + "learning_rate": 6.509931719428926e-05, + "loss": 4.1056, + "step": 4195 + }, + { + "epoch": 0.19535814884652095, + "grad_norm": 0.6649313058129157, + "learning_rate": 6.511483550589697e-05, + "loss": 3.9528, + "step": 4196 + }, + { + "epoch": 0.195404707032614, + "grad_norm": 0.6833903057512363, + "learning_rate": 6.513035381750467e-05, + "loss": 4.1353, + "step": 4197 + }, + { + "epoch": 0.19545126521870707, + "grad_norm": 0.6405372580536397, + "learning_rate": 6.514587212911235e-05, + "loss": 4.0625, + "step": 4198 + }, + { + "epoch": 0.19549782340480015, + "grad_norm": 0.6241740543425109, + "learning_rate": 6.516139044072005e-05, + "loss": 4.0458, + "step": 4199 + }, + { + "epoch": 0.1955443815908932, + "grad_norm": 0.6919044788942976, + "learning_rate": 6.517690875232774e-05, + "loss": 3.9793, + "step": 4200 + }, + { + "epoch": 0.1955909397769863, + "grad_norm": 0.737794180044529, + "learning_rate": 6.519242706393545e-05, + "loss": 4.0828, + "step": 4201 + }, + { + "epoch": 0.19563749796307936, + "grad_norm": 0.6479665064170811, + "learning_rate": 6.520794537554315e-05, + "loss": 4.0848, + "step": 4202 + }, + { + "epoch": 0.19568405614917242, + "grad_norm": 0.6145630826867311, + "learning_rate": 6.522346368715085e-05, + "loss": 4.0039, + "step": 4203 + }, + { + "epoch": 0.1957306143352655, + "grad_norm": 0.7418006820558726, + "learning_rate": 
6.523898199875854e-05, + "loss": 4.0464, + "step": 4204 + }, + { + "epoch": 0.19577717252135857, + "grad_norm": 0.8189933358605948, + "learning_rate": 6.525450031036624e-05, + "loss": 4.1189, + "step": 4205 + }, + { + "epoch": 0.19582373070745163, + "grad_norm": 0.7670938437809893, + "learning_rate": 6.527001862197392e-05, + "loss": 4.1214, + "step": 4206 + }, + { + "epoch": 0.19587028889354471, + "grad_norm": 0.7671654599789881, + "learning_rate": 6.528553693358163e-05, + "loss": 4.1223, + "step": 4207 + }, + { + "epoch": 0.19591684707963777, + "grad_norm": 0.7085658136285387, + "learning_rate": 6.530105524518933e-05, + "loss": 4.0361, + "step": 4208 + }, + { + "epoch": 0.19596340526573083, + "grad_norm": 0.8054496798287619, + "learning_rate": 6.531657355679702e-05, + "loss": 4.0989, + "step": 4209 + }, + { + "epoch": 0.19600996345182392, + "grad_norm": 0.8054597086073195, + "learning_rate": 6.533209186840472e-05, + "loss": 4.0499, + "step": 4210 + }, + { + "epoch": 0.19605652163791698, + "grad_norm": 0.7178463366529854, + "learning_rate": 6.534761018001242e-05, + "loss": 4.0731, + "step": 4211 + }, + { + "epoch": 0.19610307982401007, + "grad_norm": 0.719456306741879, + "learning_rate": 6.536312849162011e-05, + "loss": 4.2091, + "step": 4212 + }, + { + "epoch": 0.19614963801010313, + "grad_norm": 0.7717509547500935, + "learning_rate": 6.537864680322782e-05, + "loss": 4.0385, + "step": 4213 + }, + { + "epoch": 0.1961961961961962, + "grad_norm": 0.6897356133477246, + "learning_rate": 6.53941651148355e-05, + "loss": 4.1496, + "step": 4214 + }, + { + "epoch": 0.19624275438228928, + "grad_norm": 0.732244730977832, + "learning_rate": 6.54096834264432e-05, + "loss": 4.1319, + "step": 4215 + }, + { + "epoch": 0.19628931256838233, + "grad_norm": 0.9134687641400414, + "learning_rate": 6.54252017380509e-05, + "loss": 4.1488, + "step": 4216 + }, + { + "epoch": 0.1963358707544754, + "grad_norm": 0.948543391788892, + "learning_rate": 6.54407200496586e-05, + "loss": 4.1417, + "step": 4217 + }, + { + "epoch": 0.19638242894056848, + "grad_norm": 0.8033145471230595, + "learning_rate": 6.54562383612663e-05, + "loss": 4.1469, + "step": 4218 + }, + { + "epoch": 0.19642898712666154, + "grad_norm": 0.803522503635882, + "learning_rate": 6.5471756672874e-05, + "loss": 3.9854, + "step": 4219 + }, + { + "epoch": 0.1964755453127546, + "grad_norm": 0.7803532126724313, + "learning_rate": 6.54872749844817e-05, + "loss": 4.0662, + "step": 4220 + }, + { + "epoch": 0.1965221034988477, + "grad_norm": 0.6542304112784038, + "learning_rate": 6.550279329608939e-05, + "loss": 3.9872, + "step": 4221 + }, + { + "epoch": 0.19656866168494075, + "grad_norm": 0.716710102754414, + "learning_rate": 6.551831160769708e-05, + "loss": 4.1681, + "step": 4222 + }, + { + "epoch": 0.19661521987103384, + "grad_norm": 0.8214467788427581, + "learning_rate": 6.553382991930479e-05, + "loss": 4.195, + "step": 4223 + }, + { + "epoch": 0.1966617780571269, + "grad_norm": 0.7086119834655307, + "learning_rate": 6.554934823091248e-05, + "loss": 4.0171, + "step": 4224 + }, + { + "epoch": 0.19670833624321996, + "grad_norm": 0.6881439624441, + "learning_rate": 6.556486654252018e-05, + "loss": 4.1213, + "step": 4225 + }, + { + "epoch": 0.19675489442931304, + "grad_norm": 0.7287962271068652, + "learning_rate": 6.558038485412787e-05, + "loss": 4.142, + "step": 4226 + }, + { + "epoch": 0.1968014526154061, + "grad_norm": 0.6475615734828505, + "learning_rate": 6.559590316573557e-05, + "loss": 4.0471, + "step": 4227 + }, + { + "epoch": 0.19684801080149916, + 
"grad_norm": 0.6833768451550292, + "learning_rate": 6.561142147734327e-05, + "loss": 4.1265, + "step": 4228 + }, + { + "epoch": 0.19689456898759225, + "grad_norm": 0.6304469439398579, + "learning_rate": 6.562693978895096e-05, + "loss": 4.0055, + "step": 4229 + }, + { + "epoch": 0.1969411271736853, + "grad_norm": 0.7270825077862878, + "learning_rate": 6.564245810055866e-05, + "loss": 4.0473, + "step": 4230 + }, + { + "epoch": 0.19698768535977837, + "grad_norm": 0.6739734527668985, + "learning_rate": 6.565797641216636e-05, + "loss": 4.0562, + "step": 4231 + }, + { + "epoch": 0.19703424354587146, + "grad_norm": 0.7791360571840387, + "learning_rate": 6.567349472377405e-05, + "loss": 4.0082, + "step": 4232 + }, + { + "epoch": 0.19708080173196452, + "grad_norm": 0.848315610127971, + "learning_rate": 6.568901303538175e-05, + "loss": 4.0733, + "step": 4233 + }, + { + "epoch": 0.1971273599180576, + "grad_norm": 0.9349547942322144, + "learning_rate": 6.570453134698944e-05, + "loss": 4.207, + "step": 4234 + }, + { + "epoch": 0.19717391810415066, + "grad_norm": 0.9808099945191683, + "learning_rate": 6.572004965859715e-05, + "loss": 4.1062, + "step": 4235 + }, + { + "epoch": 0.19722047629024372, + "grad_norm": 0.9748031330587972, + "learning_rate": 6.573556797020485e-05, + "loss": 4.1092, + "step": 4236 + }, + { + "epoch": 0.1972670344763368, + "grad_norm": 0.7089025254775927, + "learning_rate": 6.575108628181255e-05, + "loss": 4.1867, + "step": 4237 + }, + { + "epoch": 0.19731359266242987, + "grad_norm": 0.6799263985870996, + "learning_rate": 6.576660459342023e-05, + "loss": 4.1026, + "step": 4238 + }, + { + "epoch": 0.19736015084852293, + "grad_norm": 0.7373792004497972, + "learning_rate": 6.578212290502793e-05, + "loss": 4.0814, + "step": 4239 + }, + { + "epoch": 0.19740670903461602, + "grad_norm": 0.8537910230320664, + "learning_rate": 6.579764121663564e-05, + "loss": 4.1151, + "step": 4240 + }, + { + "epoch": 0.19745326722070908, + "grad_norm": 0.7117292327280734, + "learning_rate": 6.581315952824333e-05, + "loss": 4.2067, + "step": 4241 + }, + { + "epoch": 0.19749982540680214, + "grad_norm": 0.6608550511001989, + "learning_rate": 6.582867783985103e-05, + "loss": 4.014, + "step": 4242 + }, + { + "epoch": 0.19754638359289522, + "grad_norm": 0.6770913081028694, + "learning_rate": 6.584419615145872e-05, + "loss": 3.9772, + "step": 4243 + }, + { + "epoch": 0.19759294177898828, + "grad_norm": 0.7291915549412323, + "learning_rate": 6.585971446306642e-05, + "loss": 4.0717, + "step": 4244 + }, + { + "epoch": 0.19763949996508137, + "grad_norm": 0.6675762381685618, + "learning_rate": 6.587523277467412e-05, + "loss": 4.1235, + "step": 4245 + }, + { + "epoch": 0.19768605815117443, + "grad_norm": 0.6288491567282604, + "learning_rate": 6.589075108628183e-05, + "loss": 4.0291, + "step": 4246 + }, + { + "epoch": 0.1977326163372675, + "grad_norm": 0.6501807033179932, + "learning_rate": 6.590626939788951e-05, + "loss": 3.9891, + "step": 4247 + }, + { + "epoch": 0.19777917452336058, + "grad_norm": 0.6740758756569037, + "learning_rate": 6.59217877094972e-05, + "loss": 4.0548, + "step": 4248 + }, + { + "epoch": 0.19782573270945364, + "grad_norm": 0.6588266260332081, + "learning_rate": 6.59373060211049e-05, + "loss": 4.0708, + "step": 4249 + }, + { + "epoch": 0.1978722908955467, + "grad_norm": 0.7008662594279461, + "learning_rate": 6.59528243327126e-05, + "loss": 4.0653, + "step": 4250 + }, + { + "epoch": 0.19791884908163979, + "grad_norm": 0.7011547824212456, + "learning_rate": 6.596834264432031e-05, + "loss": 4.0145, 
+ "step": 4251 + }, + { + "epoch": 0.19796540726773285, + "grad_norm": 0.6232787282346571, + "learning_rate": 6.5983860955928e-05, + "loss": 4.1502, + "step": 4252 + }, + { + "epoch": 0.1980119654538259, + "grad_norm": 0.5776418538662291, + "learning_rate": 6.59993792675357e-05, + "loss": 3.9948, + "step": 4253 + }, + { + "epoch": 0.198058523639919, + "grad_norm": 0.5997929332598255, + "learning_rate": 6.60148975791434e-05, + "loss": 4.0476, + "step": 4254 + }, + { + "epoch": 0.19810508182601205, + "grad_norm": 0.6688775690363723, + "learning_rate": 6.603041589075108e-05, + "loss": 4.144, + "step": 4255 + }, + { + "epoch": 0.19815164001210514, + "grad_norm": 0.7117762760100806, + "learning_rate": 6.604593420235878e-05, + "loss": 3.9724, + "step": 4256 + }, + { + "epoch": 0.1981981981981982, + "grad_norm": 0.6618887034937895, + "learning_rate": 6.606145251396649e-05, + "loss": 3.9576, + "step": 4257 + }, + { + "epoch": 0.19824475638429126, + "grad_norm": 0.7644511553481909, + "learning_rate": 6.607697082557418e-05, + "loss": 4.0628, + "step": 4258 + }, + { + "epoch": 0.19829131457038435, + "grad_norm": 0.9272713104894053, + "learning_rate": 6.609248913718188e-05, + "loss": 4.0465, + "step": 4259 + }, + { + "epoch": 0.1983378727564774, + "grad_norm": 1.093154044396487, + "learning_rate": 6.610800744878958e-05, + "loss": 4.0026, + "step": 4260 + }, + { + "epoch": 0.19838443094257047, + "grad_norm": 0.647089169392201, + "learning_rate": 6.612352576039727e-05, + "loss": 4.111, + "step": 4261 + }, + { + "epoch": 0.19843098912866355, + "grad_norm": 0.8180331539343197, + "learning_rate": 6.613904407200497e-05, + "loss": 4.1803, + "step": 4262 + }, + { + "epoch": 0.1984775473147566, + "grad_norm": 1.0662459403764328, + "learning_rate": 6.615456238361266e-05, + "loss": 3.9411, + "step": 4263 + }, + { + "epoch": 0.19852410550084967, + "grad_norm": 0.7621036013278215, + "learning_rate": 6.617008069522036e-05, + "loss": 4.0852, + "step": 4264 + }, + { + "epoch": 0.19857066368694276, + "grad_norm": 0.7076620347656751, + "learning_rate": 6.618559900682806e-05, + "loss": 3.8652, + "step": 4265 + }, + { + "epoch": 0.19861722187303582, + "grad_norm": 0.8108996544589572, + "learning_rate": 6.620111731843575e-05, + "loss": 4.0594, + "step": 4266 + }, + { + "epoch": 0.1986637800591289, + "grad_norm": 0.7115979147138967, + "learning_rate": 6.621663563004345e-05, + "loss": 4.0835, + "step": 4267 + }, + { + "epoch": 0.19871033824522197, + "grad_norm": 0.6104641024087499, + "learning_rate": 6.623215394165116e-05, + "loss": 4.0788, + "step": 4268 + }, + { + "epoch": 0.19875689643131503, + "grad_norm": 0.7148209789442618, + "learning_rate": 6.624767225325886e-05, + "loss": 3.9901, + "step": 4269 + }, + { + "epoch": 0.19880345461740812, + "grad_norm": 0.6772387897723144, + "learning_rate": 6.626319056486655e-05, + "loss": 4.0842, + "step": 4270 + }, + { + "epoch": 0.19885001280350117, + "grad_norm": 0.6703051561549127, + "learning_rate": 6.627870887647424e-05, + "loss": 4.039, + "step": 4271 + }, + { + "epoch": 0.19889657098959423, + "grad_norm": 0.8425449306473786, + "learning_rate": 6.629422718808193e-05, + "loss": 4.1786, + "step": 4272 + }, + { + "epoch": 0.19894312917568732, + "grad_norm": 1.0312820649168453, + "learning_rate": 6.630974549968964e-05, + "loss": 4.0563, + "step": 4273 + }, + { + "epoch": 0.19898968736178038, + "grad_norm": 0.8834196508432595, + "learning_rate": 6.632526381129734e-05, + "loss": 4.0897, + "step": 4274 + }, + { + "epoch": 0.19903624554787344, + "grad_norm": 0.7348508088907122, + 
"learning_rate": 6.634078212290503e-05, + "loss": 3.9172, + "step": 4275 + }, + { + "epoch": 0.19908280373396653, + "grad_norm": 0.8362123647162208, + "learning_rate": 6.635630043451273e-05, + "loss": 4.0479, + "step": 4276 + }, + { + "epoch": 0.1991293619200596, + "grad_norm": 0.8084440263883478, + "learning_rate": 6.637181874612043e-05, + "loss": 3.9592, + "step": 4277 + }, + { + "epoch": 0.19917592010615268, + "grad_norm": 0.630983709935604, + "learning_rate": 6.638733705772812e-05, + "loss": 3.936, + "step": 4278 + }, + { + "epoch": 0.19922247829224574, + "grad_norm": 0.7774860789843598, + "learning_rate": 6.640285536933582e-05, + "loss": 4.2067, + "step": 4279 + }, + { + "epoch": 0.1992690364783388, + "grad_norm": 0.7329250524350607, + "learning_rate": 6.641837368094352e-05, + "loss": 4.0814, + "step": 4280 + }, + { + "epoch": 0.19931559466443188, + "grad_norm": 0.8612071654047551, + "learning_rate": 6.643389199255121e-05, + "loss": 4.1574, + "step": 4281 + }, + { + "epoch": 0.19936215285052494, + "grad_norm": 0.8156539157729976, + "learning_rate": 6.644941030415891e-05, + "loss": 4.0256, + "step": 4282 + }, + { + "epoch": 0.199408711036618, + "grad_norm": 0.7060762151901034, + "learning_rate": 6.64649286157666e-05, + "loss": 3.9566, + "step": 4283 + }, + { + "epoch": 0.1994552692227111, + "grad_norm": 0.7903062705990271, + "learning_rate": 6.64804469273743e-05, + "loss": 4.0234, + "step": 4284 + }, + { + "epoch": 0.19950182740880415, + "grad_norm": 0.7924498661524543, + "learning_rate": 6.649596523898201e-05, + "loss": 4.1434, + "step": 4285 + }, + { + "epoch": 0.1995483855948972, + "grad_norm": 0.6768160150019455, + "learning_rate": 6.651148355058971e-05, + "loss": 4.0691, + "step": 4286 + }, + { + "epoch": 0.1995949437809903, + "grad_norm": 0.667769114353794, + "learning_rate": 6.652700186219739e-05, + "loss": 4.028, + "step": 4287 + }, + { + "epoch": 0.19964150196708336, + "grad_norm": 0.6315402300692137, + "learning_rate": 6.654252017380509e-05, + "loss": 4.0198, + "step": 4288 + }, + { + "epoch": 0.19968806015317644, + "grad_norm": 0.6755898323603879, + "learning_rate": 6.655803848541278e-05, + "loss": 3.9323, + "step": 4289 + }, + { + "epoch": 0.1997346183392695, + "grad_norm": 0.6914813268727601, + "learning_rate": 6.657355679702049e-05, + "loss": 4.1454, + "step": 4290 + }, + { + "epoch": 0.19978117652536256, + "grad_norm": 0.6959143958878402, + "learning_rate": 6.658907510862819e-05, + "loss": 4.0149, + "step": 4291 + }, + { + "epoch": 0.19982773471145565, + "grad_norm": 0.7114434469534971, + "learning_rate": 6.660459342023588e-05, + "loss": 4.1631, + "step": 4292 + }, + { + "epoch": 0.1998742928975487, + "grad_norm": 0.7147160028661858, + "learning_rate": 6.662011173184358e-05, + "loss": 4.116, + "step": 4293 + }, + { + "epoch": 0.19992085108364177, + "grad_norm": 0.6763566204729957, + "learning_rate": 6.663563004345128e-05, + "loss": 4.0596, + "step": 4294 + }, + { + "epoch": 0.19996740926973486, + "grad_norm": 0.7226083565765395, + "learning_rate": 6.665114835505897e-05, + "loss": 4.1405, + "step": 4295 + }, + { + "epoch": 0.20001396745582792, + "grad_norm": 0.6715188001875505, + "learning_rate": 6.666666666666667e-05, + "loss": 4.1099, + "step": 4296 + }, + { + "epoch": 0.20006052564192098, + "grad_norm": 0.6455758213058357, + "learning_rate": 6.668218497827437e-05, + "loss": 4.0095, + "step": 4297 + }, + { + "epoch": 0.20010708382801407, + "grad_norm": 0.7534732232631699, + "learning_rate": 6.669770328988206e-05, + "loss": 4.0823, + "step": 4298 + }, + { + "epoch": 
0.20015364201410712, + "grad_norm": 0.8258945482736192, + "learning_rate": 6.671322160148976e-05, + "loss": 4.1699, + "step": 4299 + }, + { + "epoch": 0.2002002002002002, + "grad_norm": 0.8763864436796247, + "learning_rate": 6.672873991309746e-05, + "loss": 4.098, + "step": 4300 + }, + { + "epoch": 0.20024675838629327, + "grad_norm": 0.8947509420896317, + "learning_rate": 6.674425822470516e-05, + "loss": 4.0558, + "step": 4301 + }, + { + "epoch": 0.20029331657238633, + "grad_norm": 0.7018673627371185, + "learning_rate": 6.675977653631286e-05, + "loss": 4.0671, + "step": 4302 + }, + { + "epoch": 0.20033987475847942, + "grad_norm": 0.6403656094253123, + "learning_rate": 6.677529484792054e-05, + "loss": 4.0683, + "step": 4303 + }, + { + "epoch": 0.20038643294457248, + "grad_norm": 0.7320845631156625, + "learning_rate": 6.679081315952824e-05, + "loss": 4.1561, + "step": 4304 + }, + { + "epoch": 0.20043299113066554, + "grad_norm": 0.6012662784127764, + "learning_rate": 6.680633147113594e-05, + "loss": 4.0498, + "step": 4305 + }, + { + "epoch": 0.20047954931675863, + "grad_norm": 0.7576568218743748, + "learning_rate": 6.682184978274363e-05, + "loss": 4.009, + "step": 4306 + }, + { + "epoch": 0.20052610750285169, + "grad_norm": 0.7301115496721684, + "learning_rate": 6.683736809435134e-05, + "loss": 3.9065, + "step": 4307 + }, + { + "epoch": 0.20057266568894475, + "grad_norm": 0.6272824604038346, + "learning_rate": 6.685288640595904e-05, + "loss": 4.0385, + "step": 4308 + }, + { + "epoch": 0.20061922387503783, + "grad_norm": 0.6882069914831299, + "learning_rate": 6.686840471756674e-05, + "loss": 4.0834, + "step": 4309 + }, + { + "epoch": 0.2006657820611309, + "grad_norm": 0.8611922027859796, + "learning_rate": 6.688392302917443e-05, + "loss": 4.159, + "step": 4310 + }, + { + "epoch": 0.20071234024722398, + "grad_norm": 1.13786434213662, + "learning_rate": 6.689944134078211e-05, + "loss": 4.1292, + "step": 4311 + }, + { + "epoch": 0.20075889843331704, + "grad_norm": 0.6695337956339346, + "learning_rate": 6.691495965238982e-05, + "loss": 4.1303, + "step": 4312 + }, + { + "epoch": 0.2008054566194101, + "grad_norm": 0.7249976407561249, + "learning_rate": 6.693047796399752e-05, + "loss": 4.0435, + "step": 4313 + }, + { + "epoch": 0.2008520148055032, + "grad_norm": 0.7890925953696141, + "learning_rate": 6.694599627560522e-05, + "loss": 4.1028, + "step": 4314 + }, + { + "epoch": 0.20089857299159625, + "grad_norm": 0.789034624271774, + "learning_rate": 6.696151458721291e-05, + "loss": 4.0826, + "step": 4315 + }, + { + "epoch": 0.2009451311776893, + "grad_norm": 0.66480612297092, + "learning_rate": 6.697703289882061e-05, + "loss": 3.9088, + "step": 4316 + }, + { + "epoch": 0.2009916893637824, + "grad_norm": 0.596102978172993, + "learning_rate": 6.69925512104283e-05, + "loss": 4.0558, + "step": 4317 + }, + { + "epoch": 0.20103824754987545, + "grad_norm": 0.6884547789426088, + "learning_rate": 6.700806952203602e-05, + "loss": 4.177, + "step": 4318 + }, + { + "epoch": 0.2010848057359685, + "grad_norm": 0.6669199428132543, + "learning_rate": 6.70235878336437e-05, + "loss": 3.9202, + "step": 4319 + }, + { + "epoch": 0.2011313639220616, + "grad_norm": 0.5934358747736957, + "learning_rate": 6.70391061452514e-05, + "loss": 3.946, + "step": 4320 + }, + { + "epoch": 0.20117792210815466, + "grad_norm": 0.6488398087062854, + "learning_rate": 6.705462445685909e-05, + "loss": 3.9428, + "step": 4321 + }, + { + "epoch": 0.20122448029424775, + "grad_norm": 0.7022215207843729, + "learning_rate": 6.707014276846679e-05, + 
"loss": 4.1359, + "step": 4322 + }, + { + "epoch": 0.2012710384803408, + "grad_norm": 0.8269926778101588, + "learning_rate": 6.70856610800745e-05, + "loss": 3.9488, + "step": 4323 + }, + { + "epoch": 0.20131759666643387, + "grad_norm": 0.758029748125449, + "learning_rate": 6.71011793916822e-05, + "loss": 3.9794, + "step": 4324 + }, + { + "epoch": 0.20136415485252696, + "grad_norm": 0.6795661402831485, + "learning_rate": 6.711669770328989e-05, + "loss": 4.0123, + "step": 4325 + }, + { + "epoch": 0.20141071303862002, + "grad_norm": 0.7305605072404302, + "learning_rate": 6.713221601489759e-05, + "loss": 4.0907, + "step": 4326 + }, + { + "epoch": 0.20145727122471307, + "grad_norm": 0.6701656199378593, + "learning_rate": 6.714773432650528e-05, + "loss": 4.0146, + "step": 4327 + }, + { + "epoch": 0.20150382941080616, + "grad_norm": 0.7791467938252532, + "learning_rate": 6.716325263811298e-05, + "loss": 4.0078, + "step": 4328 + }, + { + "epoch": 0.20155038759689922, + "grad_norm": 0.8766624173279277, + "learning_rate": 6.717877094972068e-05, + "loss": 4.093, + "step": 4329 + }, + { + "epoch": 0.20159694578299228, + "grad_norm": 0.7930142538014551, + "learning_rate": 6.719428926132837e-05, + "loss": 4.1035, + "step": 4330 + }, + { + "epoch": 0.20164350396908537, + "grad_norm": 0.7442506210205326, + "learning_rate": 6.720980757293607e-05, + "loss": 4.0163, + "step": 4331 + }, + { + "epoch": 0.20169006215517843, + "grad_norm": 0.6932500807137504, + "learning_rate": 6.722532588454376e-05, + "loss": 4.0216, + "step": 4332 + }, + { + "epoch": 0.20173662034127152, + "grad_norm": 0.8194105398192896, + "learning_rate": 6.724084419615146e-05, + "loss": 4.1473, + "step": 4333 + }, + { + "epoch": 0.20178317852736458, + "grad_norm": 0.7603273709697139, + "learning_rate": 6.725636250775916e-05, + "loss": 4.0811, + "step": 4334 + }, + { + "epoch": 0.20182973671345764, + "grad_norm": 0.7540142586379758, + "learning_rate": 6.727188081936687e-05, + "loss": 4.0645, + "step": 4335 + }, + { + "epoch": 0.20187629489955072, + "grad_norm": 0.8703127314830106, + "learning_rate": 6.728739913097455e-05, + "loss": 4.055, + "step": 4336 + }, + { + "epoch": 0.20192285308564378, + "grad_norm": 0.8954569583428877, + "learning_rate": 6.730291744258225e-05, + "loss": 4.0286, + "step": 4337 + }, + { + "epoch": 0.20196941127173684, + "grad_norm": 0.8506332163326433, + "learning_rate": 6.731843575418994e-05, + "loss": 4.1481, + "step": 4338 + }, + { + "epoch": 0.20201596945782993, + "grad_norm": 0.7349262367271884, + "learning_rate": 6.733395406579764e-05, + "loss": 4.0355, + "step": 4339 + }, + { + "epoch": 0.202062527643923, + "grad_norm": 0.7439801049231307, + "learning_rate": 6.734947237740535e-05, + "loss": 4.0882, + "step": 4340 + }, + { + "epoch": 0.20210908583001605, + "grad_norm": 0.7948224638602177, + "learning_rate": 6.736499068901304e-05, + "loss": 4.0719, + "step": 4341 + }, + { + "epoch": 0.20215564401610914, + "grad_norm": 0.766259410940412, + "learning_rate": 6.738050900062074e-05, + "loss": 4.0726, + "step": 4342 + }, + { + "epoch": 0.2022022022022022, + "grad_norm": 0.6770136991338446, + "learning_rate": 6.739602731222844e-05, + "loss": 4.0458, + "step": 4343 + }, + { + "epoch": 0.20224876038829528, + "grad_norm": 0.7157190118447494, + "learning_rate": 6.741154562383612e-05, + "loss": 4.1177, + "step": 4344 + }, + { + "epoch": 0.20229531857438834, + "grad_norm": 0.8046860780631222, + "learning_rate": 6.742706393544383e-05, + "loss": 4.0202, + "step": 4345 + }, + { + "epoch": 0.2023418767604814, + "grad_norm": 
0.7514221225438659, + "learning_rate": 6.744258224705153e-05, + "loss": 4.1536, + "step": 4346 + }, + { + "epoch": 0.2023884349465745, + "grad_norm": 0.6902390423073181, + "learning_rate": 6.745810055865922e-05, + "loss": 3.9567, + "step": 4347 + }, + { + "epoch": 0.20243499313266755, + "grad_norm": 0.6541966032072122, + "learning_rate": 6.747361887026692e-05, + "loss": 4.0941, + "step": 4348 + }, + { + "epoch": 0.2024815513187606, + "grad_norm": 0.7531106247459676, + "learning_rate": 6.748913718187461e-05, + "loss": 4.0775, + "step": 4349 + }, + { + "epoch": 0.2025281095048537, + "grad_norm": 0.5910326142956597, + "learning_rate": 6.750465549348231e-05, + "loss": 4.0376, + "step": 4350 + }, + { + "epoch": 0.20257466769094676, + "grad_norm": 0.6371337040964564, + "learning_rate": 6.752017380509002e-05, + "loss": 4.0596, + "step": 4351 + }, + { + "epoch": 0.20262122587703982, + "grad_norm": 0.7421905753389653, + "learning_rate": 6.75356921166977e-05, + "loss": 4.1603, + "step": 4352 + }, + { + "epoch": 0.2026677840631329, + "grad_norm": 0.7351512498407752, + "learning_rate": 6.75512104283054e-05, + "loss": 4.0085, + "step": 4353 + }, + { + "epoch": 0.20271434224922597, + "grad_norm": 0.6832153755084313, + "learning_rate": 6.75667287399131e-05, + "loss": 3.9916, + "step": 4354 + }, + { + "epoch": 0.20276090043531905, + "grad_norm": 0.6305410055526135, + "learning_rate": 6.758224705152079e-05, + "loss": 4.0119, + "step": 4355 + }, + { + "epoch": 0.2028074586214121, + "grad_norm": 0.6338640148800098, + "learning_rate": 6.759776536312849e-05, + "loss": 3.9633, + "step": 4356 + }, + { + "epoch": 0.20285401680750517, + "grad_norm": 0.7414197969098607, + "learning_rate": 6.76132836747362e-05, + "loss": 3.9245, + "step": 4357 + }, + { + "epoch": 0.20290057499359826, + "grad_norm": 0.7362703190632384, + "learning_rate": 6.76288019863439e-05, + "loss": 4.0232, + "step": 4358 + }, + { + "epoch": 0.20294713317969132, + "grad_norm": 0.7540864308233712, + "learning_rate": 6.764432029795159e-05, + "loss": 4.0899, + "step": 4359 + }, + { + "epoch": 0.20299369136578438, + "grad_norm": 0.8143904532759982, + "learning_rate": 6.765983860955927e-05, + "loss": 3.9938, + "step": 4360 + }, + { + "epoch": 0.20304024955187747, + "grad_norm": 0.7022248292887936, + "learning_rate": 6.767535692116697e-05, + "loss": 3.9683, + "step": 4361 + }, + { + "epoch": 0.20308680773797053, + "grad_norm": 0.5874971818624293, + "learning_rate": 6.769087523277468e-05, + "loss": 4.0918, + "step": 4362 + }, + { + "epoch": 0.20313336592406359, + "grad_norm": 0.8003923491841033, + "learning_rate": 6.770639354438238e-05, + "loss": 4.0226, + "step": 4363 + }, + { + "epoch": 0.20317992411015667, + "grad_norm": 0.9396189576232425, + "learning_rate": 6.772191185599007e-05, + "loss": 4.1153, + "step": 4364 + }, + { + "epoch": 0.20322648229624973, + "grad_norm": 1.04063583866896, + "learning_rate": 6.773743016759777e-05, + "loss": 4.1264, + "step": 4365 + }, + { + "epoch": 0.20327304048234282, + "grad_norm": 0.864999648138635, + "learning_rate": 6.775294847920547e-05, + "loss": 4.0593, + "step": 4366 + }, + { + "epoch": 0.20331959866843588, + "grad_norm": 0.7186435569690258, + "learning_rate": 6.776846679081316e-05, + "loss": 4.031, + "step": 4367 + }, + { + "epoch": 0.20336615685452894, + "grad_norm": 0.8466697377809916, + "learning_rate": 6.778398510242086e-05, + "loss": 4.03, + "step": 4368 + }, + { + "epoch": 0.20341271504062203, + "grad_norm": 0.8389209854755589, + "learning_rate": 6.779950341402855e-05, + "loss": 3.9382, + "step": 4369 + 
}, + { + "epoch": 0.2034592732267151, + "grad_norm": 0.6573063351621861, + "learning_rate": 6.781502172563625e-05, + "loss": 4.0473, + "step": 4370 + }, + { + "epoch": 0.20350583141280815, + "grad_norm": 0.7972745151914399, + "learning_rate": 6.783054003724395e-05, + "loss": 4.0533, + "step": 4371 + }, + { + "epoch": 0.20355238959890123, + "grad_norm": 0.7770875679989236, + "learning_rate": 6.784605834885164e-05, + "loss": 4.0834, + "step": 4372 + }, + { + "epoch": 0.2035989477849943, + "grad_norm": 0.7097964694602283, + "learning_rate": 6.786157666045935e-05, + "loss": 4.0987, + "step": 4373 + }, + { + "epoch": 0.20364550597108735, + "grad_norm": 0.6398430808381504, + "learning_rate": 6.787709497206705e-05, + "loss": 4.0944, + "step": 4374 + }, + { + "epoch": 0.20369206415718044, + "grad_norm": 0.6816600001595167, + "learning_rate": 6.789261328367475e-05, + "loss": 4.0446, + "step": 4375 + }, + { + "epoch": 0.2037386223432735, + "grad_norm": 0.6223352433449371, + "learning_rate": 6.790813159528243e-05, + "loss": 3.9316, + "step": 4376 + }, + { + "epoch": 0.2037851805293666, + "grad_norm": 0.6282955079089046, + "learning_rate": 6.792364990689012e-05, + "loss": 4.1583, + "step": 4377 + }, + { + "epoch": 0.20383173871545965, + "grad_norm": 0.7499944895347913, + "learning_rate": 6.793916821849783e-05, + "loss": 4.06, + "step": 4378 + }, + { + "epoch": 0.2038782969015527, + "grad_norm": 0.9435782670150743, + "learning_rate": 6.795468653010553e-05, + "loss": 4.026, + "step": 4379 + }, + { + "epoch": 0.2039248550876458, + "grad_norm": 0.7931748855011149, + "learning_rate": 6.797020484171323e-05, + "loss": 4.0202, + "step": 4380 + }, + { + "epoch": 0.20397141327373886, + "grad_norm": 0.814203897013613, + "learning_rate": 6.798572315332092e-05, + "loss": 4.1462, + "step": 4381 + }, + { + "epoch": 0.20401797145983191, + "grad_norm": 0.7381099255515791, + "learning_rate": 6.800124146492862e-05, + "loss": 3.9807, + "step": 4382 + }, + { + "epoch": 0.204064529645925, + "grad_norm": 0.6599829124885704, + "learning_rate": 6.801675977653632e-05, + "loss": 4.098, + "step": 4383 + }, + { + "epoch": 0.20411108783201806, + "grad_norm": 0.8515671791213568, + "learning_rate": 6.803227808814401e-05, + "loss": 3.9724, + "step": 4384 + }, + { + "epoch": 0.20415764601811112, + "grad_norm": 0.8598872591125095, + "learning_rate": 6.804779639975171e-05, + "loss": 3.9536, + "step": 4385 + }, + { + "epoch": 0.2042042042042042, + "grad_norm": 0.6986953792938132, + "learning_rate": 6.80633147113594e-05, + "loss": 4.0523, + "step": 4386 + }, + { + "epoch": 0.20425076239029727, + "grad_norm": 0.7996155991322198, + "learning_rate": 6.80788330229671e-05, + "loss": 4.063, + "step": 4387 + }, + { + "epoch": 0.20429732057639036, + "grad_norm": 0.9895418549407584, + "learning_rate": 6.80943513345748e-05, + "loss": 3.9624, + "step": 4388 + }, + { + "epoch": 0.20434387876248342, + "grad_norm": 0.8600242570132193, + "learning_rate": 6.81098696461825e-05, + "loss": 3.9071, + "step": 4389 + }, + { + "epoch": 0.20439043694857648, + "grad_norm": 0.7134925280258142, + "learning_rate": 6.81253879577902e-05, + "loss": 3.9843, + "step": 4390 + }, + { + "epoch": 0.20443699513466956, + "grad_norm": 0.7527951325944674, + "learning_rate": 6.81409062693979e-05, + "loss": 4.0063, + "step": 4391 + }, + { + "epoch": 0.20448355332076262, + "grad_norm": 0.7386249109428348, + "learning_rate": 6.815642458100558e-05, + "loss": 4.0284, + "step": 4392 + }, + { + "epoch": 0.20453011150685568, + "grad_norm": 0.7037227856556145, + "learning_rate": 
6.817194289261328e-05, + "loss": 4.0471, + "step": 4393 + }, + { + "epoch": 0.20457666969294877, + "grad_norm": 0.7146340848417209, + "learning_rate": 6.818746120422098e-05, + "loss": 4.0915, + "step": 4394 + }, + { + "epoch": 0.20462322787904183, + "grad_norm": 0.707121531387429, + "learning_rate": 6.820297951582869e-05, + "loss": 4.0654, + "step": 4395 + }, + { + "epoch": 0.2046697860651349, + "grad_norm": 0.6746250286320709, + "learning_rate": 6.821849782743638e-05, + "loss": 4.1253, + "step": 4396 + }, + { + "epoch": 0.20471634425122798, + "grad_norm": 0.7199949394408152, + "learning_rate": 6.823401613904408e-05, + "loss": 3.9086, + "step": 4397 + }, + { + "epoch": 0.20476290243732104, + "grad_norm": 0.706450245059166, + "learning_rate": 6.824953445065177e-05, + "loss": 3.8644, + "step": 4398 + }, + { + "epoch": 0.20480946062341412, + "grad_norm": 0.6309629956929219, + "learning_rate": 6.826505276225947e-05, + "loss": 3.8184, + "step": 4399 + }, + { + "epoch": 0.20485601880950718, + "grad_norm": 0.6940479534977549, + "learning_rate": 6.828057107386717e-05, + "loss": 4.057, + "step": 4400 + }, + { + "epoch": 0.20490257699560024, + "grad_norm": 0.640026200481369, + "learning_rate": 6.829608938547486e-05, + "loss": 3.9353, + "step": 4401 + }, + { + "epoch": 0.20494913518169333, + "grad_norm": 0.6055005709323803, + "learning_rate": 6.831160769708256e-05, + "loss": 4.0265, + "step": 4402 + }, + { + "epoch": 0.2049956933677864, + "grad_norm": 0.5977628724307106, + "learning_rate": 6.832712600869026e-05, + "loss": 3.8722, + "step": 4403 + }, + { + "epoch": 0.20504225155387945, + "grad_norm": 0.6944368410691895, + "learning_rate": 6.834264432029795e-05, + "loss": 4.0665, + "step": 4404 + }, + { + "epoch": 0.20508880973997254, + "grad_norm": 0.772043543613468, + "learning_rate": 6.835816263190565e-05, + "loss": 4.0747, + "step": 4405 + }, + { + "epoch": 0.2051353679260656, + "grad_norm": 0.727389991278572, + "learning_rate": 6.837368094351336e-05, + "loss": 4.0626, + "step": 4406 + }, + { + "epoch": 0.20518192611215866, + "grad_norm": 0.6331119724841919, + "learning_rate": 6.838919925512105e-05, + "loss": 4.1319, + "step": 4407 + }, + { + "epoch": 0.20522848429825175, + "grad_norm": 0.7562452174074519, + "learning_rate": 6.840471756672874e-05, + "loss": 4.0519, + "step": 4408 + }, + { + "epoch": 0.2052750424843448, + "grad_norm": 0.8404248333821405, + "learning_rate": 6.842023587833643e-05, + "loss": 4.1069, + "step": 4409 + }, + { + "epoch": 0.2053216006704379, + "grad_norm": 0.768213425877403, + "learning_rate": 6.843575418994413e-05, + "loss": 4.102, + "step": 4410 + }, + { + "epoch": 0.20536815885653095, + "grad_norm": 0.6840954972483261, + "learning_rate": 6.845127250155183e-05, + "loss": 4.0689, + "step": 4411 + }, + { + "epoch": 0.205414717042624, + "grad_norm": 0.6722324625352303, + "learning_rate": 6.846679081315954e-05, + "loss": 4.0645, + "step": 4412 + }, + { + "epoch": 0.2054612752287171, + "grad_norm": 0.7535719858877916, + "learning_rate": 6.848230912476723e-05, + "loss": 3.9951, + "step": 4413 + }, + { + "epoch": 0.20550783341481016, + "grad_norm": 0.6743387048928532, + "learning_rate": 6.849782743637493e-05, + "loss": 3.9929, + "step": 4414 + }, + { + "epoch": 0.20555439160090322, + "grad_norm": 0.7443321925962451, + "learning_rate": 6.851334574798263e-05, + "loss": 4.0011, + "step": 4415 + }, + { + "epoch": 0.2056009497869963, + "grad_norm": 0.7730625629210994, + "learning_rate": 6.852886405959032e-05, + "loss": 3.9806, + "step": 4416 + }, + { + "epoch": 0.20564750797308937, + 
"grad_norm": 0.8297436177943861, + "learning_rate": 6.854438237119802e-05, + "loss": 4.0174, + "step": 4417 + }, + { + "epoch": 0.20569406615918243, + "grad_norm": 0.7942867331299067, + "learning_rate": 6.855990068280571e-05, + "loss": 4.0386, + "step": 4418 + }, + { + "epoch": 0.2057406243452755, + "grad_norm": 0.862000121906079, + "learning_rate": 6.857541899441341e-05, + "loss": 4.0186, + "step": 4419 + }, + { + "epoch": 0.20578718253136857, + "grad_norm": 0.9730764633551362, + "learning_rate": 6.859093730602111e-05, + "loss": 4.0872, + "step": 4420 + }, + { + "epoch": 0.20583374071746166, + "grad_norm": 0.9457226942680355, + "learning_rate": 6.86064556176288e-05, + "loss": 4.0294, + "step": 4421 + }, + { + "epoch": 0.20588029890355472, + "grad_norm": 0.838635023306501, + "learning_rate": 6.86219739292365e-05, + "loss": 3.9727, + "step": 4422 + }, + { + "epoch": 0.20592685708964778, + "grad_norm": 0.6991778612425305, + "learning_rate": 6.863749224084421e-05, + "loss": 3.9731, + "step": 4423 + }, + { + "epoch": 0.20597341527574087, + "grad_norm": 0.7069530536291239, + "learning_rate": 6.86530105524519e-05, + "loss": 4.0618, + "step": 4424 + }, + { + "epoch": 0.20601997346183393, + "grad_norm": 0.7689338071300859, + "learning_rate": 6.866852886405959e-05, + "loss": 4.0029, + "step": 4425 + }, + { + "epoch": 0.206066531647927, + "grad_norm": 0.6466320987673042, + "learning_rate": 6.868404717566728e-05, + "loss": 4.0198, + "step": 4426 + }, + { + "epoch": 0.20611308983402007, + "grad_norm": 0.6434707821700649, + "learning_rate": 6.869956548727498e-05, + "loss": 3.9988, + "step": 4427 + }, + { + "epoch": 0.20615964802011313, + "grad_norm": 0.5926886453117758, + "learning_rate": 6.871508379888269e-05, + "loss": 4.0445, + "step": 4428 + }, + { + "epoch": 0.2062062062062062, + "grad_norm": 0.7038778249298256, + "learning_rate": 6.873060211049039e-05, + "loss": 3.8576, + "step": 4429 + }, + { + "epoch": 0.20625276439229928, + "grad_norm": 0.6086028713143229, + "learning_rate": 6.874612042209808e-05, + "loss": 4.0762, + "step": 4430 + }, + { + "epoch": 0.20629932257839234, + "grad_norm": 0.6685246842064215, + "learning_rate": 6.876163873370578e-05, + "loss": 3.9737, + "step": 4431 + }, + { + "epoch": 0.20634588076448543, + "grad_norm": 0.610879983284359, + "learning_rate": 6.877715704531348e-05, + "loss": 3.941, + "step": 4432 + }, + { + "epoch": 0.2063924389505785, + "grad_norm": 0.6320096195496306, + "learning_rate": 6.879267535692117e-05, + "loss": 3.9097, + "step": 4433 + }, + { + "epoch": 0.20643899713667155, + "grad_norm": 0.6682917610246818, + "learning_rate": 6.880819366852887e-05, + "loss": 4.0451, + "step": 4434 + }, + { + "epoch": 0.20648555532276464, + "grad_norm": 0.7726151475264406, + "learning_rate": 6.882371198013656e-05, + "loss": 4.1767, + "step": 4435 + }, + { + "epoch": 0.2065321135088577, + "grad_norm": 0.7539265814537944, + "learning_rate": 6.883923029174426e-05, + "loss": 4.0116, + "step": 4436 + }, + { + "epoch": 0.20657867169495076, + "grad_norm": 0.6638118272733684, + "learning_rate": 6.885474860335196e-05, + "loss": 4.042, + "step": 4437 + }, + { + "epoch": 0.20662522988104384, + "grad_norm": 0.6652047195103513, + "learning_rate": 6.887026691495965e-05, + "loss": 4.0214, + "step": 4438 + }, + { + "epoch": 0.2066717880671369, + "grad_norm": 0.7216134189631742, + "learning_rate": 6.888578522656735e-05, + "loss": 3.9719, + "step": 4439 + }, + { + "epoch": 0.20671834625322996, + "grad_norm": 0.6574401517537415, + "learning_rate": 6.890130353817506e-05, + "loss": 4.0804, + 
"step": 4440 + }, + { + "epoch": 0.20676490443932305, + "grad_norm": 0.6406518425868958, + "learning_rate": 6.891682184978274e-05, + "loss": 4.0362, + "step": 4441 + }, + { + "epoch": 0.2068114626254161, + "grad_norm": 0.8296265559204961, + "learning_rate": 6.893234016139044e-05, + "loss": 3.9908, + "step": 4442 + }, + { + "epoch": 0.2068580208115092, + "grad_norm": 1.0248431180420057, + "learning_rate": 6.894785847299814e-05, + "loss": 4.0794, + "step": 4443 + }, + { + "epoch": 0.20690457899760226, + "grad_norm": 0.925338122509444, + "learning_rate": 6.896337678460583e-05, + "loss": 3.8707, + "step": 4444 + }, + { + "epoch": 0.20695113718369532, + "grad_norm": 0.741769567939621, + "learning_rate": 6.897889509621354e-05, + "loss": 3.9831, + "step": 4445 + }, + { + "epoch": 0.2069976953697884, + "grad_norm": 0.6199231697171326, + "learning_rate": 6.899441340782124e-05, + "loss": 4.0866, + "step": 4446 + }, + { + "epoch": 0.20704425355588146, + "grad_norm": 0.7117970749568817, + "learning_rate": 6.900993171942893e-05, + "loss": 4.0806, + "step": 4447 + }, + { + "epoch": 0.20709081174197452, + "grad_norm": 0.7104035804178432, + "learning_rate": 6.902545003103663e-05, + "loss": 3.967, + "step": 4448 + }, + { + "epoch": 0.2071373699280676, + "grad_norm": 0.7803258902636603, + "learning_rate": 6.904096834264431e-05, + "loss": 4.0902, + "step": 4449 + }, + { + "epoch": 0.20718392811416067, + "grad_norm": 0.7310259112642709, + "learning_rate": 6.905648665425202e-05, + "loss": 3.965, + "step": 4450 + }, + { + "epoch": 0.20723048630025373, + "grad_norm": 0.6592293637686566, + "learning_rate": 6.907200496585972e-05, + "loss": 4.0623, + "step": 4451 + }, + { + "epoch": 0.20727704448634682, + "grad_norm": 0.6201824285197195, + "learning_rate": 6.908752327746742e-05, + "loss": 4.1585, + "step": 4452 + }, + { + "epoch": 0.20732360267243988, + "grad_norm": 0.5408684573613843, + "learning_rate": 6.910304158907511e-05, + "loss": 3.9109, + "step": 4453 + }, + { + "epoch": 0.20737016085853296, + "grad_norm": 0.6315360312319518, + "learning_rate": 6.911855990068281e-05, + "loss": 4.0953, + "step": 4454 + }, + { + "epoch": 0.20741671904462602, + "grad_norm": 0.7189235464644054, + "learning_rate": 6.91340782122905e-05, + "loss": 4.0231, + "step": 4455 + }, + { + "epoch": 0.20746327723071908, + "grad_norm": 0.7412083461009231, + "learning_rate": 6.914959652389821e-05, + "loss": 4.0499, + "step": 4456 + }, + { + "epoch": 0.20750983541681217, + "grad_norm": 0.6775578935886302, + "learning_rate": 6.91651148355059e-05, + "loss": 4.0051, + "step": 4457 + }, + { + "epoch": 0.20755639360290523, + "grad_norm": 0.6487796796423314, + "learning_rate": 6.91806331471136e-05, + "loss": 4.0063, + "step": 4458 + }, + { + "epoch": 0.2076029517889983, + "grad_norm": 0.5237788031036069, + "learning_rate": 6.919615145872129e-05, + "loss": 3.9967, + "step": 4459 + }, + { + "epoch": 0.20764950997509138, + "grad_norm": 0.5815149840365573, + "learning_rate": 6.921166977032899e-05, + "loss": 4.0007, + "step": 4460 + }, + { + "epoch": 0.20769606816118444, + "grad_norm": 0.6111652704783133, + "learning_rate": 6.922718808193668e-05, + "loss": 3.935, + "step": 4461 + }, + { + "epoch": 0.2077426263472775, + "grad_norm": 0.6436835478041993, + "learning_rate": 6.924270639354439e-05, + "loss": 3.9304, + "step": 4462 + }, + { + "epoch": 0.20778918453337059, + "grad_norm": 0.5606574410619991, + "learning_rate": 6.925822470515209e-05, + "loss": 3.8803, + "step": 4463 + }, + { + "epoch": 0.20783574271946365, + "grad_norm": 0.6612389847500167, + 
"learning_rate": 6.927374301675979e-05, + "loss": 3.987, + "step": 4464 + }, + { + "epoch": 0.20788230090555673, + "grad_norm": 0.6542653928304472, + "learning_rate": 6.928926132836747e-05, + "loss": 3.9898, + "step": 4465 + }, + { + "epoch": 0.2079288590916498, + "grad_norm": 0.6730383578411056, + "learning_rate": 6.930477963997516e-05, + "loss": 3.9887, + "step": 4466 + }, + { + "epoch": 0.20797541727774285, + "grad_norm": 0.8464288692993807, + "learning_rate": 6.932029795158287e-05, + "loss": 4.0512, + "step": 4467 + }, + { + "epoch": 0.20802197546383594, + "grad_norm": 1.0279348380033757, + "learning_rate": 6.933581626319057e-05, + "loss": 4.0314, + "step": 4468 + }, + { + "epoch": 0.208068533649929, + "grad_norm": 1.0428216751240549, + "learning_rate": 6.935133457479827e-05, + "loss": 4.0343, + "step": 4469 + }, + { + "epoch": 0.20811509183602206, + "grad_norm": 0.8860535503903102, + "learning_rate": 6.936685288640596e-05, + "loss": 4.036, + "step": 4470 + }, + { + "epoch": 0.20816165002211515, + "grad_norm": 0.9425368771607671, + "learning_rate": 6.938237119801366e-05, + "loss": 4.058, + "step": 4471 + }, + { + "epoch": 0.2082082082082082, + "grad_norm": 0.8978827923619515, + "learning_rate": 6.939788950962136e-05, + "loss": 4.0367, + "step": 4472 + }, + { + "epoch": 0.20825476639430127, + "grad_norm": 0.6671020552744644, + "learning_rate": 6.941340782122905e-05, + "loss": 4.0359, + "step": 4473 + }, + { + "epoch": 0.20830132458039435, + "grad_norm": 0.6824373006415888, + "learning_rate": 6.942892613283675e-05, + "loss": 3.9724, + "step": 4474 + }, + { + "epoch": 0.2083478827664874, + "grad_norm": 0.7538294461218785, + "learning_rate": 6.944444444444444e-05, + "loss": 3.8485, + "step": 4475 + }, + { + "epoch": 0.2083944409525805, + "grad_norm": 0.6361056062726566, + "learning_rate": 6.945996275605214e-05, + "loss": 3.9762, + "step": 4476 + }, + { + "epoch": 0.20844099913867356, + "grad_norm": 0.66659745853074, + "learning_rate": 6.947548106765984e-05, + "loss": 4.0966, + "step": 4477 + }, + { + "epoch": 0.20848755732476662, + "grad_norm": 0.7948986755526615, + "learning_rate": 6.949099937926755e-05, + "loss": 3.9843, + "step": 4478 + }, + { + "epoch": 0.2085341155108597, + "grad_norm": 0.7223347241341289, + "learning_rate": 6.950651769087524e-05, + "loss": 3.9298, + "step": 4479 + }, + { + "epoch": 0.20858067369695277, + "grad_norm": 0.6707740258083996, + "learning_rate": 6.952203600248294e-05, + "loss": 3.8759, + "step": 4480 + }, + { + "epoch": 0.20862723188304583, + "grad_norm": 0.7163191341278136, + "learning_rate": 6.953755431409062e-05, + "loss": 4.1137, + "step": 4481 + }, + { + "epoch": 0.20867379006913891, + "grad_norm": 0.7084107180239155, + "learning_rate": 6.955307262569832e-05, + "loss": 3.9813, + "step": 4482 + }, + { + "epoch": 0.20872034825523197, + "grad_norm": 0.769337353937191, + "learning_rate": 6.956859093730603e-05, + "loss": 4.1491, + "step": 4483 + }, + { + "epoch": 0.20876690644132503, + "grad_norm": 0.8242568566336583, + "learning_rate": 6.958410924891372e-05, + "loss": 4.1046, + "step": 4484 + }, + { + "epoch": 0.20881346462741812, + "grad_norm": 0.8379415804998843, + "learning_rate": 6.959962756052142e-05, + "loss": 3.9839, + "step": 4485 + }, + { + "epoch": 0.20886002281351118, + "grad_norm": 0.8588834836664622, + "learning_rate": 6.961514587212912e-05, + "loss": 3.9396, + "step": 4486 + }, + { + "epoch": 0.20890658099960427, + "grad_norm": 0.7076216906430566, + "learning_rate": 6.963066418373681e-05, + "loss": 4.0782, + "step": 4487 + }, + { + "epoch": 
0.20895313918569733, + "grad_norm": 0.8004388262502573, + "learning_rate": 6.964618249534451e-05, + "loss": 4.0752, + "step": 4488 + }, + { + "epoch": 0.2089996973717904, + "grad_norm": 0.7863398265262881, + "learning_rate": 6.96617008069522e-05, + "loss": 4.0117, + "step": 4489 + }, + { + "epoch": 0.20904625555788348, + "grad_norm": 0.7040321162708264, + "learning_rate": 6.96772191185599e-05, + "loss": 3.8844, + "step": 4490 + }, + { + "epoch": 0.20909281374397654, + "grad_norm": 0.7465096577355916, + "learning_rate": 6.96927374301676e-05, + "loss": 4.0587, + "step": 4491 + }, + { + "epoch": 0.2091393719300696, + "grad_norm": 0.8335368299063005, + "learning_rate": 6.97082557417753e-05, + "loss": 4.0253, + "step": 4492 + }, + { + "epoch": 0.20918593011616268, + "grad_norm": 0.8316923522916222, + "learning_rate": 6.972377405338299e-05, + "loss": 3.9867, + "step": 4493 + }, + { + "epoch": 0.20923248830225574, + "grad_norm": 0.6443131797240247, + "learning_rate": 6.973929236499069e-05, + "loss": 3.9741, + "step": 4494 + }, + { + "epoch": 0.2092790464883488, + "grad_norm": 0.7378052015372435, + "learning_rate": 6.97548106765984e-05, + "loss": 4.0592, + "step": 4495 + }, + { + "epoch": 0.2093256046744419, + "grad_norm": 0.6996329070206095, + "learning_rate": 6.97703289882061e-05, + "loss": 3.8892, + "step": 4496 + }, + { + "epoch": 0.20937216286053495, + "grad_norm": 0.6827716340426128, + "learning_rate": 6.978584729981379e-05, + "loss": 3.8176, + "step": 4497 + }, + { + "epoch": 0.20941872104662804, + "grad_norm": 0.6895758742367714, + "learning_rate": 6.980136561142147e-05, + "loss": 3.826, + "step": 4498 + }, + { + "epoch": 0.2094652792327211, + "grad_norm": 0.652023756873271, + "learning_rate": 6.981688392302917e-05, + "loss": 3.9803, + "step": 4499 + }, + { + "epoch": 0.20951183741881416, + "grad_norm": 0.6488418623699805, + "learning_rate": 6.983240223463688e-05, + "loss": 4.18, + "step": 4500 + }, + { + "epoch": 0.20955839560490724, + "grad_norm": 0.5620012247017404, + "learning_rate": 6.984792054624458e-05, + "loss": 3.9737, + "step": 4501 + }, + { + "epoch": 0.2096049537910003, + "grad_norm": 0.6400087878718159, + "learning_rate": 6.986343885785227e-05, + "loss": 4.079, + "step": 4502 + }, + { + "epoch": 0.20965151197709336, + "grad_norm": 0.7046016103999868, + "learning_rate": 6.987895716945997e-05, + "loss": 4.011, + "step": 4503 + }, + { + "epoch": 0.20969807016318645, + "grad_norm": 0.7450819058995193, + "learning_rate": 6.989447548106766e-05, + "loss": 3.967, + "step": 4504 + }, + { + "epoch": 0.2097446283492795, + "grad_norm": 0.7258119017635875, + "learning_rate": 6.990999379267536e-05, + "loss": 3.9923, + "step": 4505 + }, + { + "epoch": 0.20979118653537257, + "grad_norm": 0.603472610018283, + "learning_rate": 6.992551210428306e-05, + "loss": 3.923, + "step": 4506 + }, + { + "epoch": 0.20983774472146566, + "grad_norm": 0.579000645423754, + "learning_rate": 6.994103041589075e-05, + "loss": 4.018, + "step": 4507 + }, + { + "epoch": 0.20988430290755872, + "grad_norm": 0.6606776989231363, + "learning_rate": 6.995654872749845e-05, + "loss": 3.8987, + "step": 4508 + }, + { + "epoch": 0.2099308610936518, + "grad_norm": 0.7230770054602452, + "learning_rate": 6.997206703910615e-05, + "loss": 3.9882, + "step": 4509 + }, + { + "epoch": 0.20997741927974486, + "grad_norm": 0.6300451122624031, + "learning_rate": 6.998758535071384e-05, + "loss": 3.9441, + "step": 4510 + }, + { + "epoch": 0.21002397746583792, + "grad_norm": 0.6202157916455863, + "learning_rate": 7.000310366232154e-05, + 
"loss": 3.9887, + "step": 4511 + }, + { + "epoch": 0.210070535651931, + "grad_norm": 0.4908586357753481, + "learning_rate": 7.001862197392925e-05, + "loss": 3.9518, + "step": 4512 + }, + { + "epoch": 0.21011709383802407, + "grad_norm": 0.7229528762707534, + "learning_rate": 7.003414028553694e-05, + "loss": 3.8926, + "step": 4513 + }, + { + "epoch": 0.21016365202411713, + "grad_norm": 0.9222523525718328, + "learning_rate": 7.004965859714463e-05, + "loss": 3.9858, + "step": 4514 + }, + { + "epoch": 0.21021021021021022, + "grad_norm": 0.786503762977706, + "learning_rate": 7.006517690875232e-05, + "loss": 4.0109, + "step": 4515 + }, + { + "epoch": 0.21025676839630328, + "grad_norm": 0.6866678134360259, + "learning_rate": 7.008069522036002e-05, + "loss": 3.9976, + "step": 4516 + }, + { + "epoch": 0.21030332658239634, + "grad_norm": 0.8274003632172879, + "learning_rate": 7.009621353196773e-05, + "loss": 3.9733, + "step": 4517 + }, + { + "epoch": 0.21034988476848943, + "grad_norm": 0.8758438438994336, + "learning_rate": 7.011173184357543e-05, + "loss": 3.9618, + "step": 4518 + }, + { + "epoch": 0.21039644295458249, + "grad_norm": 0.7927895410606932, + "learning_rate": 7.012725015518312e-05, + "loss": 4.026, + "step": 4519 + }, + { + "epoch": 0.21044300114067557, + "grad_norm": 0.8540354965633378, + "learning_rate": 7.014276846679082e-05, + "loss": 3.9996, + "step": 4520 + }, + { + "epoch": 0.21048955932676863, + "grad_norm": 0.7006999123218097, + "learning_rate": 7.015828677839852e-05, + "loss": 3.9497, + "step": 4521 + }, + { + "epoch": 0.2105361175128617, + "grad_norm": 0.7573986218422881, + "learning_rate": 7.017380509000621e-05, + "loss": 4.0169, + "step": 4522 + }, + { + "epoch": 0.21058267569895478, + "grad_norm": 0.816930038695086, + "learning_rate": 7.018932340161391e-05, + "loss": 4.0064, + "step": 4523 + }, + { + "epoch": 0.21062923388504784, + "grad_norm": 0.7096490940490148, + "learning_rate": 7.02048417132216e-05, + "loss": 3.8609, + "step": 4524 + }, + { + "epoch": 0.2106757920711409, + "grad_norm": 0.6542646239145935, + "learning_rate": 7.02203600248293e-05, + "loss": 3.8529, + "step": 4525 + }, + { + "epoch": 0.210722350257234, + "grad_norm": 0.7458075439505786, + "learning_rate": 7.0235878336437e-05, + "loss": 3.9491, + "step": 4526 + }, + { + "epoch": 0.21076890844332705, + "grad_norm": 0.7618571570964023, + "learning_rate": 7.025139664804469e-05, + "loss": 3.8942, + "step": 4527 + }, + { + "epoch": 0.2108154666294201, + "grad_norm": 0.8393875931070578, + "learning_rate": 7.02669149596524e-05, + "loss": 4.0938, + "step": 4528 + }, + { + "epoch": 0.2108620248155132, + "grad_norm": 0.693848191548225, + "learning_rate": 7.02824332712601e-05, + "loss": 4.1071, + "step": 4529 + }, + { + "epoch": 0.21090858300160625, + "grad_norm": 0.6907818615436417, + "learning_rate": 7.029795158286778e-05, + "loss": 4.193, + "step": 4530 + }, + { + "epoch": 0.21095514118769934, + "grad_norm": 0.7362363260172621, + "learning_rate": 7.031346989447548e-05, + "loss": 4.0845, + "step": 4531 + }, + { + "epoch": 0.2110016993737924, + "grad_norm": 0.7535811909491967, + "learning_rate": 7.032898820608317e-05, + "loss": 3.9874, + "step": 4532 + }, + { + "epoch": 0.21104825755988546, + "grad_norm": 0.7650610950807526, + "learning_rate": 7.034450651769088e-05, + "loss": 3.9643, + "step": 4533 + }, + { + "epoch": 0.21109481574597855, + "grad_norm": 0.7389543898760892, + "learning_rate": 7.036002482929858e-05, + "loss": 3.9631, + "step": 4534 + }, + { + "epoch": 0.2111413739320716, + "grad_norm": 
0.6320282896121416, + "learning_rate": 7.037554314090628e-05, + "loss": 3.9028, + "step": 4535 + }, + { + "epoch": 0.21118793211816467, + "grad_norm": 0.6995407884754455, + "learning_rate": 7.039106145251397e-05, + "loss": 3.8925, + "step": 4536 + }, + { + "epoch": 0.21123449030425775, + "grad_norm": 0.7781334769530693, + "learning_rate": 7.040657976412167e-05, + "loss": 4.0257, + "step": 4537 + }, + { + "epoch": 0.21128104849035081, + "grad_norm": 0.7204479846195124, + "learning_rate": 7.042209807572935e-05, + "loss": 4.0458, + "step": 4538 + }, + { + "epoch": 0.21132760667644387, + "grad_norm": 0.6855388028113876, + "learning_rate": 7.043761638733706e-05, + "loss": 3.9791, + "step": 4539 + }, + { + "epoch": 0.21137416486253696, + "grad_norm": 0.6585449664340568, + "learning_rate": 7.045313469894476e-05, + "loss": 4.0677, + "step": 4540 + }, + { + "epoch": 0.21142072304863002, + "grad_norm": 0.6866933261300262, + "learning_rate": 7.046865301055245e-05, + "loss": 3.8361, + "step": 4541 + }, + { + "epoch": 0.2114672812347231, + "grad_norm": 0.675684812238362, + "learning_rate": 7.048417132216015e-05, + "loss": 4.0599, + "step": 4542 + }, + { + "epoch": 0.21151383942081617, + "grad_norm": 0.7196148227900945, + "learning_rate": 7.049968963376785e-05, + "loss": 4.1714, + "step": 4543 + }, + { + "epoch": 0.21156039760690923, + "grad_norm": 0.7273966232361262, + "learning_rate": 7.051520794537554e-05, + "loss": 3.8478, + "step": 4544 + }, + { + "epoch": 0.21160695579300232, + "grad_norm": 0.655516930582414, + "learning_rate": 7.053072625698325e-05, + "loss": 3.9274, + "step": 4545 + }, + { + "epoch": 0.21165351397909538, + "grad_norm": 0.5795340908884776, + "learning_rate": 7.054624456859094e-05, + "loss": 3.87, + "step": 4546 + }, + { + "epoch": 0.21170007216518844, + "grad_norm": 0.6527935134544897, + "learning_rate": 7.056176288019863e-05, + "loss": 4.0103, + "step": 4547 + }, + { + "epoch": 0.21174663035128152, + "grad_norm": 0.7396046768057538, + "learning_rate": 7.057728119180633e-05, + "loss": 3.947, + "step": 4548 + }, + { + "epoch": 0.21179318853737458, + "grad_norm": 0.6370970746321856, + "learning_rate": 7.059279950341403e-05, + "loss": 3.9945, + "step": 4549 + }, + { + "epoch": 0.21183974672346764, + "grad_norm": 0.653495254404399, + "learning_rate": 7.060831781502174e-05, + "loss": 4.0012, + "step": 4550 + }, + { + "epoch": 0.21188630490956073, + "grad_norm": 0.5239714688739976, + "learning_rate": 7.062383612662943e-05, + "loss": 3.9299, + "step": 4551 + }, + { + "epoch": 0.2119328630956538, + "grad_norm": 0.5867581304199692, + "learning_rate": 7.063935443823713e-05, + "loss": 3.9214, + "step": 4552 + }, + { + "epoch": 0.21197942128174685, + "grad_norm": 0.6001861908888871, + "learning_rate": 7.065487274984482e-05, + "loss": 4.026, + "step": 4553 + }, + { + "epoch": 0.21202597946783994, + "grad_norm": 0.7458875600328619, + "learning_rate": 7.067039106145251e-05, + "loss": 3.8667, + "step": 4554 + }, + { + "epoch": 0.212072537653933, + "grad_norm": 0.710605430800518, + "learning_rate": 7.068590937306022e-05, + "loss": 3.9639, + "step": 4555 + }, + { + "epoch": 0.21211909584002608, + "grad_norm": 0.5644592722586972, + "learning_rate": 7.070142768466791e-05, + "loss": 3.9634, + "step": 4556 + }, + { + "epoch": 0.21216565402611914, + "grad_norm": 0.5945311327128071, + "learning_rate": 7.071694599627561e-05, + "loss": 3.9961, + "step": 4557 + }, + { + "epoch": 0.2122122122122122, + "grad_norm": 0.6760138893144473, + "learning_rate": 7.07324643078833e-05, + "loss": 3.9334, + "step": 4558 
+ }, + { + "epoch": 0.2122587703983053, + "grad_norm": 0.6879533407836623, + "learning_rate": 7.0747982619491e-05, + "loss": 3.9561, + "step": 4559 + }, + { + "epoch": 0.21230532858439835, + "grad_norm": 0.6274514649658056, + "learning_rate": 7.07635009310987e-05, + "loss": 4.015, + "step": 4560 + }, + { + "epoch": 0.2123518867704914, + "grad_norm": 0.7337227487872744, + "learning_rate": 7.077901924270641e-05, + "loss": 4.0964, + "step": 4561 + }, + { + "epoch": 0.2123984449565845, + "grad_norm": 0.7557833150623366, + "learning_rate": 7.079453755431409e-05, + "loss": 4.0712, + "step": 4562 + }, + { + "epoch": 0.21244500314267756, + "grad_norm": 0.5807118162882747, + "learning_rate": 7.081005586592179e-05, + "loss": 3.8843, + "step": 4563 + }, + { + "epoch": 0.21249156132877062, + "grad_norm": 0.6684491957160663, + "learning_rate": 7.082557417752948e-05, + "loss": 3.9891, + "step": 4564 + }, + { + "epoch": 0.2125381195148637, + "grad_norm": 0.7830253988807512, + "learning_rate": 7.084109248913718e-05, + "loss": 4.0043, + "step": 4565 + }, + { + "epoch": 0.21258467770095676, + "grad_norm": 0.7097620511008865, + "learning_rate": 7.085661080074488e-05, + "loss": 3.9919, + "step": 4566 + }, + { + "epoch": 0.21263123588704985, + "grad_norm": 0.7240728622575907, + "learning_rate": 7.087212911235259e-05, + "loss": 3.9695, + "step": 4567 + }, + { + "epoch": 0.2126777940731429, + "grad_norm": 0.7709833265440509, + "learning_rate": 7.088764742396028e-05, + "loss": 4.0229, + "step": 4568 + }, + { + "epoch": 0.21272435225923597, + "grad_norm": 0.8212945078560988, + "learning_rate": 7.090316573556798e-05, + "loss": 3.9459, + "step": 4569 + }, + { + "epoch": 0.21277091044532906, + "grad_norm": 0.6779280000395206, + "learning_rate": 7.091868404717566e-05, + "loss": 4.0167, + "step": 4570 + }, + { + "epoch": 0.21281746863142212, + "grad_norm": 0.621929681474084, + "learning_rate": 7.093420235878336e-05, + "loss": 3.9038, + "step": 4571 + }, + { + "epoch": 0.21286402681751518, + "grad_norm": 0.7230027550231265, + "learning_rate": 7.094972067039107e-05, + "loss": 3.9371, + "step": 4572 + }, + { + "epoch": 0.21291058500360827, + "grad_norm": 0.8234352128364956, + "learning_rate": 7.096523898199876e-05, + "loss": 3.9451, + "step": 4573 + }, + { + "epoch": 0.21295714318970133, + "grad_norm": 0.6345961689388705, + "learning_rate": 7.098075729360646e-05, + "loss": 3.9694, + "step": 4574 + }, + { + "epoch": 0.21300370137579439, + "grad_norm": 0.7652464493328317, + "learning_rate": 7.099627560521416e-05, + "loss": 3.9567, + "step": 4575 + }, + { + "epoch": 0.21305025956188747, + "grad_norm": 0.7785130827913247, + "learning_rate": 7.101179391682185e-05, + "loss": 4.0932, + "step": 4576 + }, + { + "epoch": 0.21309681774798053, + "grad_norm": 0.7917597775452085, + "learning_rate": 7.102731222842955e-05, + "loss": 4.0012, + "step": 4577 + }, + { + "epoch": 0.21314337593407362, + "grad_norm": 0.6927008836523857, + "learning_rate": 7.104283054003725e-05, + "loss": 3.8962, + "step": 4578 + }, + { + "epoch": 0.21318993412016668, + "grad_norm": 0.6976212466715462, + "learning_rate": 7.105834885164494e-05, + "loss": 3.9859, + "step": 4579 + }, + { + "epoch": 0.21323649230625974, + "grad_norm": 0.7517564718626432, + "learning_rate": 7.107386716325264e-05, + "loss": 4.0312, + "step": 4580 + }, + { + "epoch": 0.21328305049235283, + "grad_norm": 0.6595100685966063, + "learning_rate": 7.108938547486033e-05, + "loss": 3.9749, + "step": 4581 + }, + { + "epoch": 0.2133296086784459, + "grad_norm": 0.6262844566823192, + 
"learning_rate": 7.110490378646803e-05, + "loss": 3.9314, + "step": 4582 + }, + { + "epoch": 0.21337616686453895, + "grad_norm": 0.6289522379003263, + "learning_rate": 7.112042209807574e-05, + "loss": 3.8754, + "step": 4583 + }, + { + "epoch": 0.21342272505063203, + "grad_norm": 0.6388855079126091, + "learning_rate": 7.113594040968344e-05, + "loss": 3.9746, + "step": 4584 + }, + { + "epoch": 0.2134692832367251, + "grad_norm": 0.5768475263935144, + "learning_rate": 7.115145872129113e-05, + "loss": 3.9636, + "step": 4585 + }, + { + "epoch": 0.21351584142281815, + "grad_norm": 0.5856389937817774, + "learning_rate": 7.116697703289883e-05, + "loss": 3.9002, + "step": 4586 + }, + { + "epoch": 0.21356239960891124, + "grad_norm": 0.5578430300340257, + "learning_rate": 7.118249534450651e-05, + "loss": 3.9541, + "step": 4587 + }, + { + "epoch": 0.2136089577950043, + "grad_norm": 0.6128661901590674, + "learning_rate": 7.119801365611422e-05, + "loss": 4.129, + "step": 4588 + }, + { + "epoch": 0.2136555159810974, + "grad_norm": 0.5810652106038093, + "learning_rate": 7.121353196772192e-05, + "loss": 4.0236, + "step": 4589 + }, + { + "epoch": 0.21370207416719045, + "grad_norm": 0.6360625867878784, + "learning_rate": 7.122905027932961e-05, + "loss": 4.0509, + "step": 4590 + }, + { + "epoch": 0.2137486323532835, + "grad_norm": 0.7134677495325533, + "learning_rate": 7.124456859093731e-05, + "loss": 3.9855, + "step": 4591 + }, + { + "epoch": 0.2137951905393766, + "grad_norm": 0.6983636097284516, + "learning_rate": 7.126008690254501e-05, + "loss": 3.9227, + "step": 4592 + }, + { + "epoch": 0.21384174872546965, + "grad_norm": 0.6449468425763087, + "learning_rate": 7.12756052141527e-05, + "loss": 3.9646, + "step": 4593 + }, + { + "epoch": 0.21388830691156271, + "grad_norm": 0.6923287192812324, + "learning_rate": 7.12911235257604e-05, + "loss": 3.9053, + "step": 4594 + }, + { + "epoch": 0.2139348650976558, + "grad_norm": 0.7243179946065746, + "learning_rate": 7.13066418373681e-05, + "loss": 3.9428, + "step": 4595 + }, + { + "epoch": 0.21398142328374886, + "grad_norm": 0.7668426416870324, + "learning_rate": 7.132216014897579e-05, + "loss": 4.0253, + "step": 4596 + }, + { + "epoch": 0.21402798146984192, + "grad_norm": 0.7776809367697766, + "learning_rate": 7.133767846058349e-05, + "loss": 3.9751, + "step": 4597 + }, + { + "epoch": 0.214074539655935, + "grad_norm": 0.7300308565721865, + "learning_rate": 7.135319677219119e-05, + "loss": 3.9726, + "step": 4598 + }, + { + "epoch": 0.21412109784202807, + "grad_norm": 0.5917104902546303, + "learning_rate": 7.136871508379888e-05, + "loss": 4.1038, + "step": 4599 + }, + { + "epoch": 0.21416765602812116, + "grad_norm": 0.6110390345983929, + "learning_rate": 7.138423339540659e-05, + "loss": 3.9205, + "step": 4600 + }, + { + "epoch": 0.21421421421421422, + "grad_norm": 0.545860987868886, + "learning_rate": 7.139975170701429e-05, + "loss": 3.8904, + "step": 4601 + }, + { + "epoch": 0.21426077240030728, + "grad_norm": 0.5865917044957777, + "learning_rate": 7.141527001862198e-05, + "loss": 3.8532, + "step": 4602 + }, + { + "epoch": 0.21430733058640036, + "grad_norm": 0.6290059659997855, + "learning_rate": 7.143078833022967e-05, + "loss": 3.9276, + "step": 4603 + }, + { + "epoch": 0.21435388877249342, + "grad_norm": 0.6776019866620848, + "learning_rate": 7.144630664183736e-05, + "loss": 4.0084, + "step": 4604 + }, + { + "epoch": 0.21440044695858648, + "grad_norm": 0.6611852636654713, + "learning_rate": 7.146182495344507e-05, + "loss": 3.9283, + "step": 4605 + }, + { + "epoch": 
0.21444700514467957, + "grad_norm": 0.6489249141793868, + "learning_rate": 7.147734326505277e-05, + "loss": 3.949, + "step": 4606 + }, + { + "epoch": 0.21449356333077263, + "grad_norm": 0.640497937711053, + "learning_rate": 7.149286157666047e-05, + "loss": 4.2102, + "step": 4607 + }, + { + "epoch": 0.2145401215168657, + "grad_norm": 0.7585821973597527, + "learning_rate": 7.150837988826816e-05, + "loss": 3.9931, + "step": 4608 + }, + { + "epoch": 0.21458667970295878, + "grad_norm": 0.7155302725843828, + "learning_rate": 7.152389819987586e-05, + "loss": 3.945, + "step": 4609 + }, + { + "epoch": 0.21463323788905184, + "grad_norm": 0.6159491306636603, + "learning_rate": 7.153941651148355e-05, + "loss": 4.0406, + "step": 4610 + }, + { + "epoch": 0.21467979607514492, + "grad_norm": 0.7849222621132035, + "learning_rate": 7.155493482309125e-05, + "loss": 3.9354, + "step": 4611 + }, + { + "epoch": 0.21472635426123798, + "grad_norm": 0.8553560396271173, + "learning_rate": 7.157045313469895e-05, + "loss": 3.9067, + "step": 4612 + }, + { + "epoch": 0.21477291244733104, + "grad_norm": 0.6775074374368724, + "learning_rate": 7.158597144630664e-05, + "loss": 3.8592, + "step": 4613 + }, + { + "epoch": 0.21481947063342413, + "grad_norm": 0.6756158424008305, + "learning_rate": 7.160148975791434e-05, + "loss": 4.0115, + "step": 4614 + }, + { + "epoch": 0.2148660288195172, + "grad_norm": 0.7121384056864691, + "learning_rate": 7.161700806952204e-05, + "loss": 3.9125, + "step": 4615 + }, + { + "epoch": 0.21491258700561025, + "grad_norm": 0.7205978921135371, + "learning_rate": 7.163252638112973e-05, + "loss": 4.0392, + "step": 4616 + }, + { + "epoch": 0.21495914519170334, + "grad_norm": 0.6948304529859484, + "learning_rate": 7.164804469273744e-05, + "loss": 4.0086, + "step": 4617 + }, + { + "epoch": 0.2150057033777964, + "grad_norm": 0.627173569878286, + "learning_rate": 7.166356300434514e-05, + "loss": 3.9743, + "step": 4618 + }, + { + "epoch": 0.21505226156388946, + "grad_norm": 0.686960741809398, + "learning_rate": 7.167908131595282e-05, + "loss": 3.9239, + "step": 4619 + }, + { + "epoch": 0.21509881974998254, + "grad_norm": 0.6910557552506539, + "learning_rate": 7.169459962756052e-05, + "loss": 3.9491, + "step": 4620 + }, + { + "epoch": 0.2151453779360756, + "grad_norm": 0.8051880522355078, + "learning_rate": 7.171011793916821e-05, + "loss": 4.0146, + "step": 4621 + }, + { + "epoch": 0.2151919361221687, + "grad_norm": 0.7692685566271094, + "learning_rate": 7.172563625077592e-05, + "loss": 4.1341, + "step": 4622 + }, + { + "epoch": 0.21523849430826175, + "grad_norm": 0.6974003582656106, + "learning_rate": 7.174115456238362e-05, + "loss": 3.8898, + "step": 4623 + }, + { + "epoch": 0.2152850524943548, + "grad_norm": 0.6079839966310914, + "learning_rate": 7.175667287399132e-05, + "loss": 3.921, + "step": 4624 + }, + { + "epoch": 0.2153316106804479, + "grad_norm": 0.6958521911747994, + "learning_rate": 7.177219118559901e-05, + "loss": 3.9825, + "step": 4625 + }, + { + "epoch": 0.21537816886654096, + "grad_norm": 0.6373433083507047, + "learning_rate": 7.178770949720671e-05, + "loss": 3.9012, + "step": 4626 + }, + { + "epoch": 0.21542472705263402, + "grad_norm": 0.6357090534413247, + "learning_rate": 7.18032278088144e-05, + "loss": 4.0003, + "step": 4627 + }, + { + "epoch": 0.2154712852387271, + "grad_norm": 0.8116468667282567, + "learning_rate": 7.18187461204221e-05, + "loss": 3.9471, + "step": 4628 + }, + { + "epoch": 0.21551784342482017, + "grad_norm": 0.8725182033426203, + "learning_rate": 7.18342644320298e-05, 
+ "loss": 4.0188, + "step": 4629 + }, + { + "epoch": 0.21556440161091323, + "grad_norm": 0.8948780303632232, + "learning_rate": 7.18497827436375e-05, + "loss": 4.0493, + "step": 4630 + }, + { + "epoch": 0.2156109597970063, + "grad_norm": 0.8041747570857282, + "learning_rate": 7.186530105524519e-05, + "loss": 3.9089, + "step": 4631 + }, + { + "epoch": 0.21565751798309937, + "grad_norm": 0.6146943635821468, + "learning_rate": 7.188081936685289e-05, + "loss": 4.0631, + "step": 4632 + }, + { + "epoch": 0.21570407616919246, + "grad_norm": 0.7377833190180599, + "learning_rate": 7.18963376784606e-05, + "loss": 3.9936, + "step": 4633 + }, + { + "epoch": 0.21575063435528552, + "grad_norm": 0.6376879687881449, + "learning_rate": 7.191185599006829e-05, + "loss": 3.9199, + "step": 4634 + }, + { + "epoch": 0.21579719254137858, + "grad_norm": 0.6839845832586018, + "learning_rate": 7.192737430167598e-05, + "loss": 3.9154, + "step": 4635 + }, + { + "epoch": 0.21584375072747167, + "grad_norm": 0.6829532951828172, + "learning_rate": 7.194289261328367e-05, + "loss": 3.9146, + "step": 4636 + }, + { + "epoch": 0.21589030891356473, + "grad_norm": 0.6394694451895602, + "learning_rate": 7.195841092489137e-05, + "loss": 4.0491, + "step": 4637 + }, + { + "epoch": 0.2159368670996578, + "grad_norm": 0.8829474173653946, + "learning_rate": 7.197392923649908e-05, + "loss": 3.8661, + "step": 4638 + }, + { + "epoch": 0.21598342528575087, + "grad_norm": 0.8464671204733129, + "learning_rate": 7.198944754810677e-05, + "loss": 3.8917, + "step": 4639 + }, + { + "epoch": 0.21602998347184393, + "grad_norm": 0.9115283274174247, + "learning_rate": 7.200496585971447e-05, + "loss": 3.8628, + "step": 4640 + }, + { + "epoch": 0.216076541657937, + "grad_norm": 0.8866292774358598, + "learning_rate": 7.202048417132217e-05, + "loss": 4.055, + "step": 4641 + }, + { + "epoch": 0.21612309984403008, + "grad_norm": 0.8605603156494387, + "learning_rate": 7.203600248292986e-05, + "loss": 4.0724, + "step": 4642 + }, + { + "epoch": 0.21616965803012314, + "grad_norm": 0.7551963510782413, + "learning_rate": 7.205152079453755e-05, + "loss": 4.0051, + "step": 4643 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 0.7939720945841285, + "learning_rate": 7.206703910614526e-05, + "loss": 3.9035, + "step": 4644 + }, + { + "epoch": 0.2162627744023093, + "grad_norm": 0.8271101527414217, + "learning_rate": 7.208255741775295e-05, + "loss": 3.9912, + "step": 4645 + }, + { + "epoch": 0.21630933258840235, + "grad_norm": 0.8133139058661903, + "learning_rate": 7.209807572936065e-05, + "loss": 4.0181, + "step": 4646 + }, + { + "epoch": 0.21635589077449544, + "grad_norm": 0.6978417640951772, + "learning_rate": 7.211359404096834e-05, + "loss": 3.92, + "step": 4647 + }, + { + "epoch": 0.2164024489605885, + "grad_norm": 0.7023550421149773, + "learning_rate": 7.212911235257604e-05, + "loss": 4.0133, + "step": 4648 + }, + { + "epoch": 0.21644900714668155, + "grad_norm": 0.8042791804897329, + "learning_rate": 7.214463066418374e-05, + "loss": 4.0006, + "step": 4649 + }, + { + "epoch": 0.21649556533277464, + "grad_norm": 0.8418409552881434, + "learning_rate": 7.216014897579145e-05, + "loss": 3.9432, + "step": 4650 + }, + { + "epoch": 0.2165421235188677, + "grad_norm": 0.7480523168023098, + "learning_rate": 7.217566728739913e-05, + "loss": 3.9223, + "step": 4651 + }, + { + "epoch": 0.21658868170496076, + "grad_norm": 0.7547710759490963, + "learning_rate": 7.219118559900683e-05, + "loss": 4.0837, + "step": 4652 + }, + { + "epoch": 0.21663523989105385, + "grad_norm": 
0.8667508080644838, + "learning_rate": 7.220670391061452e-05, + "loss": 4.0486, + "step": 4653 + }, + { + "epoch": 0.2166817980771469, + "grad_norm": 0.8504585288259341, + "learning_rate": 7.222222222222222e-05, + "loss": 3.9414, + "step": 4654 + }, + { + "epoch": 0.21672835626324, + "grad_norm": 0.8659516458748026, + "learning_rate": 7.223774053382993e-05, + "loss": 3.8828, + "step": 4655 + }, + { + "epoch": 0.21677491444933306, + "grad_norm": 0.7690387932911907, + "learning_rate": 7.225325884543763e-05, + "loss": 3.9106, + "step": 4656 + }, + { + "epoch": 0.21682147263542612, + "grad_norm": 0.7799937546314797, + "learning_rate": 7.226877715704532e-05, + "loss": 3.9174, + "step": 4657 + }, + { + "epoch": 0.2168680308215192, + "grad_norm": 0.7225792641899094, + "learning_rate": 7.228429546865302e-05, + "loss": 3.846, + "step": 4658 + }, + { + "epoch": 0.21691458900761226, + "grad_norm": 0.6996542545399713, + "learning_rate": 7.22998137802607e-05, + "loss": 3.9333, + "step": 4659 + }, + { + "epoch": 0.21696114719370532, + "grad_norm": 0.7069414828477779, + "learning_rate": 7.231533209186841e-05, + "loss": 3.8889, + "step": 4660 + }, + { + "epoch": 0.2170077053797984, + "grad_norm": 0.7292244551330419, + "learning_rate": 7.23308504034761e-05, + "loss": 3.9539, + "step": 4661 + }, + { + "epoch": 0.21705426356589147, + "grad_norm": 0.5690687209811163, + "learning_rate": 7.23463687150838e-05, + "loss": 3.9214, + "step": 4662 + }, + { + "epoch": 0.21710082175198453, + "grad_norm": 0.7078308579924444, + "learning_rate": 7.23618870266915e-05, + "loss": 3.9954, + "step": 4663 + }, + { + "epoch": 0.21714737993807762, + "grad_norm": 0.7088524940618003, + "learning_rate": 7.23774053382992e-05, + "loss": 3.95, + "step": 4664 + }, + { + "epoch": 0.21719393812417068, + "grad_norm": 0.5247296396244352, + "learning_rate": 7.239292364990689e-05, + "loss": 3.8365, + "step": 4665 + }, + { + "epoch": 0.21724049631026376, + "grad_norm": 0.6816956562509907, + "learning_rate": 7.240844196151459e-05, + "loss": 3.9951, + "step": 4666 + }, + { + "epoch": 0.21728705449635682, + "grad_norm": 0.72164275101046, + "learning_rate": 7.24239602731223e-05, + "loss": 3.9035, + "step": 4667 + }, + { + "epoch": 0.21733361268244988, + "grad_norm": 0.5672543299892189, + "learning_rate": 7.243947858472998e-05, + "loss": 3.9226, + "step": 4668 + }, + { + "epoch": 0.21738017086854297, + "grad_norm": 0.6365494863079052, + "learning_rate": 7.245499689633768e-05, + "loss": 3.9919, + "step": 4669 + }, + { + "epoch": 0.21742672905463603, + "grad_norm": 0.6789471498095072, + "learning_rate": 7.247051520794537e-05, + "loss": 4.0847, + "step": 4670 + }, + { + "epoch": 0.2174732872407291, + "grad_norm": 0.7020128021591069, + "learning_rate": 7.248603351955307e-05, + "loss": 3.8998, + "step": 4671 + }, + { + "epoch": 0.21751984542682218, + "grad_norm": 0.6894126878549505, + "learning_rate": 7.250155183116078e-05, + "loss": 3.8534, + "step": 4672 + }, + { + "epoch": 0.21756640361291524, + "grad_norm": 0.6540072378997456, + "learning_rate": 7.251707014276848e-05, + "loss": 3.9763, + "step": 4673 + }, + { + "epoch": 0.2176129617990083, + "grad_norm": 0.5655405664019938, + "learning_rate": 7.253258845437617e-05, + "loss": 3.8688, + "step": 4674 + }, + { + "epoch": 0.21765951998510139, + "grad_norm": 0.632195276177308, + "learning_rate": 7.254810676598387e-05, + "loss": 3.9199, + "step": 4675 + }, + { + "epoch": 0.21770607817119444, + "grad_norm": 0.6394959758386781, + "learning_rate": 7.256362507759155e-05, + "loss": 3.9839, + "step": 4676 + }, + 
{ + "epoch": 0.21775263635728753, + "grad_norm": 0.6217553416179951, + "learning_rate": 7.257914338919926e-05, + "loss": 3.9451, + "step": 4677 + }, + { + "epoch": 0.2177991945433806, + "grad_norm": 0.624545847895362, + "learning_rate": 7.259466170080696e-05, + "loss": 3.9619, + "step": 4678 + }, + { + "epoch": 0.21784575272947365, + "grad_norm": 0.6234216304200522, + "learning_rate": 7.261018001241465e-05, + "loss": 3.9339, + "step": 4679 + }, + { + "epoch": 0.21789231091556674, + "grad_norm": 0.6407026303098335, + "learning_rate": 7.262569832402235e-05, + "loss": 3.8586, + "step": 4680 + }, + { + "epoch": 0.2179388691016598, + "grad_norm": 0.6334263389909371, + "learning_rate": 7.264121663563005e-05, + "loss": 3.8853, + "step": 4681 + }, + { + "epoch": 0.21798542728775286, + "grad_norm": 0.5784084997294777, + "learning_rate": 7.265673494723774e-05, + "loss": 3.904, + "step": 4682 + }, + { + "epoch": 0.21803198547384595, + "grad_norm": 0.5592781230573861, + "learning_rate": 7.267225325884545e-05, + "loss": 4.0502, + "step": 4683 + }, + { + "epoch": 0.218078543659939, + "grad_norm": 0.5800105881716839, + "learning_rate": 7.268777157045314e-05, + "loss": 3.9483, + "step": 4684 + }, + { + "epoch": 0.21812510184603207, + "grad_norm": 0.5089925617474833, + "learning_rate": 7.270328988206083e-05, + "loss": 4.0221, + "step": 4685 + }, + { + "epoch": 0.21817166003212515, + "grad_norm": 0.5960493652977539, + "learning_rate": 7.271880819366853e-05, + "loss": 3.9701, + "step": 4686 + }, + { + "epoch": 0.2182182182182182, + "grad_norm": 0.6606011607668115, + "learning_rate": 7.273432650527622e-05, + "loss": 3.9889, + "step": 4687 + }, + { + "epoch": 0.2182647764043113, + "grad_norm": 0.5946809054483327, + "learning_rate": 7.274984481688393e-05, + "loss": 4.004, + "step": 4688 + }, + { + "epoch": 0.21831133459040436, + "grad_norm": 0.6269118081613952, + "learning_rate": 7.276536312849163e-05, + "loss": 3.9598, + "step": 4689 + }, + { + "epoch": 0.21835789277649742, + "grad_norm": 0.7574365040993802, + "learning_rate": 7.278088144009933e-05, + "loss": 3.9141, + "step": 4690 + }, + { + "epoch": 0.2184044509625905, + "grad_norm": 0.6987605643959008, + "learning_rate": 7.279639975170702e-05, + "loss": 3.926, + "step": 4691 + }, + { + "epoch": 0.21845100914868357, + "grad_norm": 0.673097833668544, + "learning_rate": 7.28119180633147e-05, + "loss": 3.8403, + "step": 4692 + }, + { + "epoch": 0.21849756733477663, + "grad_norm": 0.7659179914884183, + "learning_rate": 7.28274363749224e-05, + "loss": 3.9717, + "step": 4693 + }, + { + "epoch": 0.21854412552086971, + "grad_norm": 0.623335029529434, + "learning_rate": 7.284295468653011e-05, + "loss": 3.8312, + "step": 4694 + }, + { + "epoch": 0.21859068370696277, + "grad_norm": 0.7097085690684142, + "learning_rate": 7.285847299813781e-05, + "loss": 4.0076, + "step": 4695 + }, + { + "epoch": 0.21863724189305583, + "grad_norm": 0.7749736940860013, + "learning_rate": 7.28739913097455e-05, + "loss": 3.8434, + "step": 4696 + }, + { + "epoch": 0.21868380007914892, + "grad_norm": 0.6359307676349956, + "learning_rate": 7.28895096213532e-05, + "loss": 3.8805, + "step": 4697 + }, + { + "epoch": 0.21873035826524198, + "grad_norm": 0.7425267016970992, + "learning_rate": 7.29050279329609e-05, + "loss": 3.8453, + "step": 4698 + }, + { + "epoch": 0.21877691645133507, + "grad_norm": 0.7225816282766544, + "learning_rate": 7.29205462445686e-05, + "loss": 4.0497, + "step": 4699 + }, + { + "epoch": 0.21882347463742813, + "grad_norm": 0.7753067833373523, + "learning_rate": 
7.293606455617629e-05, + "loss": 3.9785, + "step": 4700 + }, + { + "epoch": 0.2188700328235212, + "grad_norm": 0.7473577276578705, + "learning_rate": 7.295158286778399e-05, + "loss": 3.9948, + "step": 4701 + }, + { + "epoch": 0.21891659100961428, + "grad_norm": 0.6679917908250615, + "learning_rate": 7.296710117939168e-05, + "loss": 4.0002, + "step": 4702 + }, + { + "epoch": 0.21896314919570734, + "grad_norm": 0.5905677011964728, + "learning_rate": 7.298261949099938e-05, + "loss": 3.9561, + "step": 4703 + }, + { + "epoch": 0.2190097073818004, + "grad_norm": 0.6737934649363784, + "learning_rate": 7.299813780260707e-05, + "loss": 3.9461, + "step": 4704 + }, + { + "epoch": 0.21905626556789348, + "grad_norm": 0.665318790927408, + "learning_rate": 7.301365611421478e-05, + "loss": 4.0088, + "step": 4705 + }, + { + "epoch": 0.21910282375398654, + "grad_norm": 0.6735667292152875, + "learning_rate": 7.302917442582248e-05, + "loss": 3.8535, + "step": 4706 + }, + { + "epoch": 0.2191493819400796, + "grad_norm": 0.5850444982003873, + "learning_rate": 7.304469273743018e-05, + "loss": 4.1037, + "step": 4707 + }, + { + "epoch": 0.2191959401261727, + "grad_norm": 0.6068111322620046, + "learning_rate": 7.306021104903786e-05, + "loss": 3.901, + "step": 4708 + }, + { + "epoch": 0.21924249831226575, + "grad_norm": 0.7684432933906671, + "learning_rate": 7.307572936064556e-05, + "loss": 3.8239, + "step": 4709 + }, + { + "epoch": 0.21928905649835884, + "grad_norm": 0.744359722477903, + "learning_rate": 7.309124767225327e-05, + "loss": 3.9981, + "step": 4710 + }, + { + "epoch": 0.2193356146844519, + "grad_norm": 0.9108731967347742, + "learning_rate": 7.310676598386096e-05, + "loss": 3.9876, + "step": 4711 + }, + { + "epoch": 0.21938217287054496, + "grad_norm": 0.9096386470765728, + "learning_rate": 7.312228429546866e-05, + "loss": 3.8735, + "step": 4712 + }, + { + "epoch": 0.21942873105663804, + "grad_norm": 0.8273065841051689, + "learning_rate": 7.313780260707636e-05, + "loss": 3.9311, + "step": 4713 + }, + { + "epoch": 0.2194752892427311, + "grad_norm": 0.755614882304124, + "learning_rate": 7.315332091868405e-05, + "loss": 3.9539, + "step": 4714 + }, + { + "epoch": 0.21952184742882416, + "grad_norm": 0.9135721920480927, + "learning_rate": 7.316883923029175e-05, + "loss": 4.0088, + "step": 4715 + }, + { + "epoch": 0.21956840561491725, + "grad_norm": 0.953212889252031, + "learning_rate": 7.318435754189944e-05, + "loss": 3.9075, + "step": 4716 + }, + { + "epoch": 0.2196149638010103, + "grad_norm": 0.7369351533062715, + "learning_rate": 7.319987585350714e-05, + "loss": 4.0288, + "step": 4717 + }, + { + "epoch": 0.21966152198710337, + "grad_norm": 0.8055980726023337, + "learning_rate": 7.321539416511484e-05, + "loss": 3.9623, + "step": 4718 + }, + { + "epoch": 0.21970808017319646, + "grad_norm": 0.855245524752236, + "learning_rate": 7.323091247672253e-05, + "loss": 3.848, + "step": 4719 + }, + { + "epoch": 0.21975463835928952, + "grad_norm": 0.7857771372502863, + "learning_rate": 7.324643078833023e-05, + "loss": 3.906, + "step": 4720 + }, + { + "epoch": 0.2198011965453826, + "grad_norm": 0.7619956556166707, + "learning_rate": 7.326194909993793e-05, + "loss": 3.9097, + "step": 4721 + }, + { + "epoch": 0.21984775473147566, + "grad_norm": 0.8306258613093087, + "learning_rate": 7.327746741154564e-05, + "loss": 3.8423, + "step": 4722 + }, + { + "epoch": 0.21989431291756872, + "grad_norm": 0.7483098602536095, + "learning_rate": 7.329298572315333e-05, + "loss": 3.9968, + "step": 4723 + }, + { + "epoch": 0.2199408711036618, + 
"grad_norm": 0.7023831841549202, + "learning_rate": 7.330850403476101e-05, + "loss": 3.9275, + "step": 4724 + }, + { + "epoch": 0.21998742928975487, + "grad_norm": 0.700060191602098, + "learning_rate": 7.332402234636871e-05, + "loss": 3.9824, + "step": 4725 + }, + { + "epoch": 0.22003398747584793, + "grad_norm": 0.7363284223913362, + "learning_rate": 7.333954065797641e-05, + "loss": 3.947, + "step": 4726 + }, + { + "epoch": 0.22008054566194102, + "grad_norm": 0.624064770063571, + "learning_rate": 7.335505896958412e-05, + "loss": 3.8359, + "step": 4727 + }, + { + "epoch": 0.22012710384803408, + "grad_norm": 0.608687009111786, + "learning_rate": 7.337057728119181e-05, + "loss": 3.8715, + "step": 4728 + }, + { + "epoch": 0.22017366203412714, + "grad_norm": 0.6079275819136517, + "learning_rate": 7.338609559279951e-05, + "loss": 4.0194, + "step": 4729 + }, + { + "epoch": 0.22022022022022023, + "grad_norm": 0.6741935262713414, + "learning_rate": 7.34016139044072e-05, + "loss": 3.9916, + "step": 4730 + }, + { + "epoch": 0.22026677840631328, + "grad_norm": 0.6446431159570165, + "learning_rate": 7.34171322160149e-05, + "loss": 4.0191, + "step": 4731 + }, + { + "epoch": 0.22031333659240637, + "grad_norm": 0.6984503648844526, + "learning_rate": 7.34326505276226e-05, + "loss": 3.9707, + "step": 4732 + }, + { + "epoch": 0.22035989477849943, + "grad_norm": 0.6859484955832403, + "learning_rate": 7.34481688392303e-05, + "loss": 3.8308, + "step": 4733 + }, + { + "epoch": 0.2204064529645925, + "grad_norm": 0.6618476192454769, + "learning_rate": 7.346368715083799e-05, + "loss": 3.9713, + "step": 4734 + }, + { + "epoch": 0.22045301115068558, + "grad_norm": 0.6273403459069854, + "learning_rate": 7.347920546244569e-05, + "loss": 3.9264, + "step": 4735 + }, + { + "epoch": 0.22049956933677864, + "grad_norm": 0.5790144519905226, + "learning_rate": 7.349472377405338e-05, + "loss": 3.9005, + "step": 4736 + }, + { + "epoch": 0.2205461275228717, + "grad_norm": 0.7001825596561064, + "learning_rate": 7.351024208566108e-05, + "loss": 3.9333, + "step": 4737 + }, + { + "epoch": 0.2205926857089648, + "grad_norm": 0.6794221208057261, + "learning_rate": 7.352576039726879e-05, + "loss": 3.7909, + "step": 4738 + }, + { + "epoch": 0.22063924389505785, + "grad_norm": 0.6468867939660222, + "learning_rate": 7.354127870887649e-05, + "loss": 3.9148, + "step": 4739 + }, + { + "epoch": 0.2206858020811509, + "grad_norm": 0.6246665926453778, + "learning_rate": 7.355679702048417e-05, + "loss": 3.9594, + "step": 4740 + }, + { + "epoch": 0.220732360267244, + "grad_norm": 0.6820833115257383, + "learning_rate": 7.357231533209187e-05, + "loss": 3.8179, + "step": 4741 + }, + { + "epoch": 0.22077891845333705, + "grad_norm": 0.7908696138239375, + "learning_rate": 7.358783364369956e-05, + "loss": 3.9411, + "step": 4742 + }, + { + "epoch": 0.22082547663943014, + "grad_norm": 0.9778396143418381, + "learning_rate": 7.360335195530727e-05, + "loss": 3.8542, + "step": 4743 + }, + { + "epoch": 0.2208720348255232, + "grad_norm": 0.8259295134118129, + "learning_rate": 7.361887026691497e-05, + "loss": 3.9295, + "step": 4744 + }, + { + "epoch": 0.22091859301161626, + "grad_norm": 0.739017929787154, + "learning_rate": 7.363438857852266e-05, + "loss": 3.9146, + "step": 4745 + }, + { + "epoch": 0.22096515119770935, + "grad_norm": 0.7584413910101452, + "learning_rate": 7.364990689013036e-05, + "loss": 3.8349, + "step": 4746 + }, + { + "epoch": 0.2210117093838024, + "grad_norm": 0.7752061954204008, + "learning_rate": 7.366542520173806e-05, + "loss": 3.995, + 
"step": 4747 + }, + { + "epoch": 0.22105826756989547, + "grad_norm": 0.814290553439009, + "learning_rate": 7.368094351334574e-05, + "loss": 3.8816, + "step": 4748 + }, + { + "epoch": 0.22110482575598855, + "grad_norm": 0.6416637411632039, + "learning_rate": 7.369646182495345e-05, + "loss": 4.0187, + "step": 4749 + }, + { + "epoch": 0.22115138394208161, + "grad_norm": 0.7097086346637352, + "learning_rate": 7.371198013656115e-05, + "loss": 4.0095, + "step": 4750 + }, + { + "epoch": 0.22119794212817467, + "grad_norm": 0.8293033571745028, + "learning_rate": 7.372749844816884e-05, + "loss": 3.8928, + "step": 4751 + }, + { + "epoch": 0.22124450031426776, + "grad_norm": 0.7737024817843415, + "learning_rate": 7.374301675977654e-05, + "loss": 3.8377, + "step": 4752 + }, + { + "epoch": 0.22129105850036082, + "grad_norm": 0.694796624249519, + "learning_rate": 7.375853507138423e-05, + "loss": 3.8274, + "step": 4753 + }, + { + "epoch": 0.2213376166864539, + "grad_norm": 0.6877633784333725, + "learning_rate": 7.377405338299193e-05, + "loss": 3.9343, + "step": 4754 + }, + { + "epoch": 0.22138417487254697, + "grad_norm": 0.6448065736663761, + "learning_rate": 7.378957169459964e-05, + "loss": 3.9879, + "step": 4755 + }, + { + "epoch": 0.22143073305864003, + "grad_norm": 0.8197645560148443, + "learning_rate": 7.380509000620734e-05, + "loss": 3.9021, + "step": 4756 + }, + { + "epoch": 0.22147729124473312, + "grad_norm": 0.9458846574578933, + "learning_rate": 7.382060831781502e-05, + "loss": 3.9098, + "step": 4757 + }, + { + "epoch": 0.22152384943082618, + "grad_norm": 0.8155679518053953, + "learning_rate": 7.383612662942272e-05, + "loss": 3.9834, + "step": 4758 + }, + { + "epoch": 0.22157040761691923, + "grad_norm": 0.76778690766974, + "learning_rate": 7.385164494103041e-05, + "loss": 3.9849, + "step": 4759 + }, + { + "epoch": 0.22161696580301232, + "grad_norm": 0.7184213952527033, + "learning_rate": 7.386716325263812e-05, + "loss": 3.9194, + "step": 4760 + }, + { + "epoch": 0.22166352398910538, + "grad_norm": 0.7047498358325494, + "learning_rate": 7.388268156424582e-05, + "loss": 4.0185, + "step": 4761 + }, + { + "epoch": 0.22171008217519844, + "grad_norm": 0.7714876202647936, + "learning_rate": 7.389819987585351e-05, + "loss": 3.8925, + "step": 4762 + }, + { + "epoch": 0.22175664036129153, + "grad_norm": 0.8536266312048836, + "learning_rate": 7.391371818746121e-05, + "loss": 3.8616, + "step": 4763 + }, + { + "epoch": 0.2218031985473846, + "grad_norm": 0.7469569488601202, + "learning_rate": 7.392923649906891e-05, + "loss": 3.8749, + "step": 4764 + }, + { + "epoch": 0.22184975673347768, + "grad_norm": 0.6172046981841656, + "learning_rate": 7.39447548106766e-05, + "loss": 3.9161, + "step": 4765 + }, + { + "epoch": 0.22189631491957074, + "grad_norm": 0.5972546760932272, + "learning_rate": 7.39602731222843e-05, + "loss": 3.9528, + "step": 4766 + }, + { + "epoch": 0.2219428731056638, + "grad_norm": 0.6824111713536184, + "learning_rate": 7.3975791433892e-05, + "loss": 3.9592, + "step": 4767 + }, + { + "epoch": 0.22198943129175688, + "grad_norm": 0.6797816615942213, + "learning_rate": 7.399130974549969e-05, + "loss": 3.9779, + "step": 4768 + }, + { + "epoch": 0.22203598947784994, + "grad_norm": 0.5746810653980233, + "learning_rate": 7.400682805710739e-05, + "loss": 3.9252, + "step": 4769 + }, + { + "epoch": 0.222082547663943, + "grad_norm": 0.5974822161360148, + "learning_rate": 7.402234636871509e-05, + "loss": 4.0028, + "step": 4770 + }, + { + "epoch": 0.2221291058500361, + "grad_norm": 0.5900279641634396, + 
"learning_rate": 7.403786468032278e-05, + "loss": 3.9455, + "step": 4771 + }, + { + "epoch": 0.22217566403612915, + "grad_norm": 0.6587455560846113, + "learning_rate": 7.405338299193049e-05, + "loss": 4.0086, + "step": 4772 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.5669772745562051, + "learning_rate": 7.406890130353817e-05, + "loss": 3.8306, + "step": 4773 + }, + { + "epoch": 0.2222687804083153, + "grad_norm": 0.5714050509248456, + "learning_rate": 7.408441961514587e-05, + "loss": 3.9362, + "step": 4774 + }, + { + "epoch": 0.22231533859440836, + "grad_norm": 0.5926286216744004, + "learning_rate": 7.409993792675357e-05, + "loss": 3.8405, + "step": 4775 + }, + { + "epoch": 0.22236189678050144, + "grad_norm": 0.6766207431007539, + "learning_rate": 7.411545623836126e-05, + "loss": 3.7173, + "step": 4776 + }, + { + "epoch": 0.2224084549665945, + "grad_norm": 0.707697081992131, + "learning_rate": 7.413097454996897e-05, + "loss": 3.9408, + "step": 4777 + }, + { + "epoch": 0.22245501315268756, + "grad_norm": 0.7568882856971082, + "learning_rate": 7.414649286157667e-05, + "loss": 3.9348, + "step": 4778 + }, + { + "epoch": 0.22250157133878065, + "grad_norm": 0.5729493442168323, + "learning_rate": 7.416201117318437e-05, + "loss": 3.967, + "step": 4779 + }, + { + "epoch": 0.2225481295248737, + "grad_norm": 0.6126968225873571, + "learning_rate": 7.417752948479206e-05, + "loss": 4.0174, + "step": 4780 + }, + { + "epoch": 0.22259468771096677, + "grad_norm": 0.6528267619209243, + "learning_rate": 7.419304779639974e-05, + "loss": 3.863, + "step": 4781 + }, + { + "epoch": 0.22264124589705986, + "grad_norm": 0.5973312664922189, + "learning_rate": 7.420856610800745e-05, + "loss": 3.9237, + "step": 4782 + }, + { + "epoch": 0.22268780408315292, + "grad_norm": 0.5657367778576881, + "learning_rate": 7.422408441961515e-05, + "loss": 3.8965, + "step": 4783 + }, + { + "epoch": 0.22273436226924598, + "grad_norm": 0.5817271579175407, + "learning_rate": 7.423960273122285e-05, + "loss": 3.9482, + "step": 4784 + }, + { + "epoch": 0.22278092045533907, + "grad_norm": 0.5623496166484083, + "learning_rate": 7.425512104283054e-05, + "loss": 3.9753, + "step": 4785 + }, + { + "epoch": 0.22282747864143213, + "grad_norm": 0.6642607589767373, + "learning_rate": 7.427063935443824e-05, + "loss": 3.9301, + "step": 4786 + }, + { + "epoch": 0.2228740368275252, + "grad_norm": 0.5518888438603143, + "learning_rate": 7.428615766604594e-05, + "loss": 3.8186, + "step": 4787 + }, + { + "epoch": 0.22292059501361827, + "grad_norm": 0.5207123556843581, + "learning_rate": 7.430167597765365e-05, + "loss": 4.0557, + "step": 4788 + }, + { + "epoch": 0.22296715319971133, + "grad_norm": 0.591321905437837, + "learning_rate": 7.431719428926133e-05, + "loss": 3.9991, + "step": 4789 + }, + { + "epoch": 0.22301371138580442, + "grad_norm": 0.5861164642969285, + "learning_rate": 7.433271260086903e-05, + "loss": 3.9928, + "step": 4790 + }, + { + "epoch": 0.22306026957189748, + "grad_norm": 0.5991162493525393, + "learning_rate": 7.434823091247672e-05, + "loss": 3.8162, + "step": 4791 + }, + { + "epoch": 0.22310682775799054, + "grad_norm": 0.5454571223225814, + "learning_rate": 7.436374922408442e-05, + "loss": 3.8422, + "step": 4792 + }, + { + "epoch": 0.22315338594408363, + "grad_norm": 0.6298170315984054, + "learning_rate": 7.437926753569213e-05, + "loss": 3.8411, + "step": 4793 + }, + { + "epoch": 0.2231999441301767, + "grad_norm": 0.7428502258821268, + "learning_rate": 7.439478584729982e-05, + "loss": 3.9736, + "step": 4794 + }, + { + 
"epoch": 0.22324650231626975, + "grad_norm": 0.6743681182415422, + "learning_rate": 7.441030415890752e-05, + "loss": 3.8585, + "step": 4795 + }, + { + "epoch": 0.22329306050236283, + "grad_norm": 0.5426576491753308, + "learning_rate": 7.442582247051522e-05, + "loss": 4.0233, + "step": 4796 + }, + { + "epoch": 0.2233396186884559, + "grad_norm": 0.6838760117050177, + "learning_rate": 7.44413407821229e-05, + "loss": 3.9075, + "step": 4797 + }, + { + "epoch": 0.22338617687454898, + "grad_norm": 0.7838285306684906, + "learning_rate": 7.44568590937306e-05, + "loss": 3.8945, + "step": 4798 + }, + { + "epoch": 0.22343273506064204, + "grad_norm": 0.7246355037320851, + "learning_rate": 7.44723774053383e-05, + "loss": 3.9658, + "step": 4799 + }, + { + "epoch": 0.2234792932467351, + "grad_norm": 0.6514206024002647, + "learning_rate": 7.4487895716946e-05, + "loss": 3.9185, + "step": 4800 + }, + { + "epoch": 0.2235258514328282, + "grad_norm": 0.5915896940955396, + "learning_rate": 7.45034140285537e-05, + "loss": 3.834, + "step": 4801 + }, + { + "epoch": 0.22357240961892125, + "grad_norm": 0.5847368407914249, + "learning_rate": 7.45189323401614e-05, + "loss": 3.859, + "step": 4802 + }, + { + "epoch": 0.2236189678050143, + "grad_norm": 0.7191205435201908, + "learning_rate": 7.453445065176909e-05, + "loss": 4.0242, + "step": 4803 + }, + { + "epoch": 0.2236655259911074, + "grad_norm": 0.6529437287532871, + "learning_rate": 7.454996896337679e-05, + "loss": 3.7793, + "step": 4804 + }, + { + "epoch": 0.22371208417720045, + "grad_norm": 0.6013088088275415, + "learning_rate": 7.456548727498448e-05, + "loss": 3.9315, + "step": 4805 + }, + { + "epoch": 0.22375864236329351, + "grad_norm": 0.7084754232990368, + "learning_rate": 7.458100558659218e-05, + "loss": 3.9527, + "step": 4806 + }, + { + "epoch": 0.2238052005493866, + "grad_norm": 0.7198474447528403, + "learning_rate": 7.459652389819988e-05, + "loss": 3.9928, + "step": 4807 + }, + { + "epoch": 0.22385175873547966, + "grad_norm": 0.6493001565244528, + "learning_rate": 7.461204220980757e-05, + "loss": 3.7949, + "step": 4808 + }, + { + "epoch": 0.22389831692157275, + "grad_norm": 0.6096003572509544, + "learning_rate": 7.462756052141527e-05, + "loss": 3.9712, + "step": 4809 + }, + { + "epoch": 0.2239448751076658, + "grad_norm": 0.7702087516369954, + "learning_rate": 7.464307883302298e-05, + "loss": 3.9396, + "step": 4810 + }, + { + "epoch": 0.22399143329375887, + "grad_norm": 0.703885585762125, + "learning_rate": 7.465859714463067e-05, + "loss": 3.8932, + "step": 4811 + }, + { + "epoch": 0.22403799147985196, + "grad_norm": 0.6394079407146703, + "learning_rate": 7.467411545623837e-05, + "loss": 3.8867, + "step": 4812 + }, + { + "epoch": 0.22408454966594502, + "grad_norm": 0.6073186406131154, + "learning_rate": 7.468963376784605e-05, + "loss": 3.9961, + "step": 4813 + }, + { + "epoch": 0.22413110785203808, + "grad_norm": 0.6196943774154542, + "learning_rate": 7.470515207945375e-05, + "loss": 3.805, + "step": 4814 + }, + { + "epoch": 0.22417766603813116, + "grad_norm": 0.642759416616912, + "learning_rate": 7.472067039106146e-05, + "loss": 3.9231, + "step": 4815 + }, + { + "epoch": 0.22422422422422422, + "grad_norm": 0.6387829221531337, + "learning_rate": 7.473618870266916e-05, + "loss": 3.8305, + "step": 4816 + }, + { + "epoch": 0.22427078241031728, + "grad_norm": 0.6354814953972134, + "learning_rate": 7.475170701427685e-05, + "loss": 3.9023, + "step": 4817 + }, + { + "epoch": 0.22431734059641037, + "grad_norm": 0.6182304339999416, + "learning_rate": 
7.476722532588455e-05, + "loss": 3.9791, + "step": 4818 + }, + { + "epoch": 0.22436389878250343, + "grad_norm": 0.5449347872858677, + "learning_rate": 7.478274363749225e-05, + "loss": 3.938, + "step": 4819 + }, + { + "epoch": 0.22441045696859652, + "grad_norm": 0.6325790111947855, + "learning_rate": 7.479826194909994e-05, + "loss": 3.8482, + "step": 4820 + }, + { + "epoch": 0.22445701515468958, + "grad_norm": 0.6170090143161183, + "learning_rate": 7.481378026070764e-05, + "loss": 3.9502, + "step": 4821 + }, + { + "epoch": 0.22450357334078264, + "grad_norm": 0.6887731402918941, + "learning_rate": 7.482929857231533e-05, + "loss": 3.8644, + "step": 4822 + }, + { + "epoch": 0.22455013152687572, + "grad_norm": 0.5809081221851766, + "learning_rate": 7.484481688392303e-05, + "loss": 3.9123, + "step": 4823 + }, + { + "epoch": 0.22459668971296878, + "grad_norm": 0.606703842402609, + "learning_rate": 7.486033519553073e-05, + "loss": 4.0083, + "step": 4824 + }, + { + "epoch": 0.22464324789906184, + "grad_norm": 0.5992046208138196, + "learning_rate": 7.487585350713842e-05, + "loss": 3.9923, + "step": 4825 + }, + { + "epoch": 0.22468980608515493, + "grad_norm": 0.5918836864745315, + "learning_rate": 7.489137181874612e-05, + "loss": 3.8817, + "step": 4826 + }, + { + "epoch": 0.224736364271248, + "grad_norm": 0.6646889945304024, + "learning_rate": 7.490689013035383e-05, + "loss": 3.7725, + "step": 4827 + }, + { + "epoch": 0.22478292245734105, + "grad_norm": 0.6090686195117224, + "learning_rate": 7.492240844196153e-05, + "loss": 3.8581, + "step": 4828 + }, + { + "epoch": 0.22482948064343414, + "grad_norm": 0.6121668084867397, + "learning_rate": 7.493792675356921e-05, + "loss": 3.9088, + "step": 4829 + }, + { + "epoch": 0.2248760388295272, + "grad_norm": 0.6324606619231864, + "learning_rate": 7.49534450651769e-05, + "loss": 3.9055, + "step": 4830 + }, + { + "epoch": 0.22492259701562028, + "grad_norm": 0.6226004521001309, + "learning_rate": 7.49689633767846e-05, + "loss": 3.9699, + "step": 4831 + }, + { + "epoch": 0.22496915520171334, + "grad_norm": 0.6641109811341736, + "learning_rate": 7.498448168839231e-05, + "loss": 3.8423, + "step": 4832 + }, + { + "epoch": 0.2250157133878064, + "grad_norm": 0.6902008096644654, + "learning_rate": 7.500000000000001e-05, + "loss": 3.8445, + "step": 4833 + }, + { + "epoch": 0.2250622715738995, + "grad_norm": 0.6677976697581428, + "learning_rate": 7.50155183116077e-05, + "loss": 3.9616, + "step": 4834 + }, + { + "epoch": 0.22510882975999255, + "grad_norm": 0.6870310824130937, + "learning_rate": 7.50310366232154e-05, + "loss": 3.8943, + "step": 4835 + }, + { + "epoch": 0.2251553879460856, + "grad_norm": 0.712442117111326, + "learning_rate": 7.50465549348231e-05, + "loss": 3.8403, + "step": 4836 + }, + { + "epoch": 0.2252019461321787, + "grad_norm": 0.6001830703430024, + "learning_rate": 7.506207324643079e-05, + "loss": 3.8922, + "step": 4837 + }, + { + "epoch": 0.22524850431827176, + "grad_norm": 0.6984920985783798, + "learning_rate": 7.507759155803849e-05, + "loss": 3.9288, + "step": 4838 + }, + { + "epoch": 0.22529506250436482, + "grad_norm": 0.7472082565120812, + "learning_rate": 7.509310986964618e-05, + "loss": 3.972, + "step": 4839 + }, + { + "epoch": 0.2253416206904579, + "grad_norm": 0.6470648039790386, + "learning_rate": 7.510862818125388e-05, + "loss": 3.8413, + "step": 4840 + }, + { + "epoch": 0.22538817887655097, + "grad_norm": 0.6530040845629208, + "learning_rate": 7.512414649286158e-05, + "loss": 3.8842, + "step": 4841 + }, + { + "epoch": 0.22543473706264405, + 
"grad_norm": 0.5958231428699016, + "learning_rate": 7.513966480446927e-05, + "loss": 3.8544, + "step": 4842 + }, + { + "epoch": 0.2254812952487371, + "grad_norm": 0.6742996317392311, + "learning_rate": 7.515518311607698e-05, + "loss": 3.9651, + "step": 4843 + }, + { + "epoch": 0.22552785343483017, + "grad_norm": 0.6833017484964958, + "learning_rate": 7.517070142768468e-05, + "loss": 3.9712, + "step": 4844 + }, + { + "epoch": 0.22557441162092326, + "grad_norm": 0.7118115179593554, + "learning_rate": 7.518621973929238e-05, + "loss": 3.9858, + "step": 4845 + }, + { + "epoch": 0.22562096980701632, + "grad_norm": 0.8205661403882663, + "learning_rate": 7.520173805090006e-05, + "loss": 3.9609, + "step": 4846 + }, + { + "epoch": 0.22566752799310938, + "grad_norm": 0.8842861404031896, + "learning_rate": 7.521725636250776e-05, + "loss": 3.854, + "step": 4847 + }, + { + "epoch": 0.22571408617920247, + "grad_norm": 1.0186240937012114, + "learning_rate": 7.523277467411545e-05, + "loss": 3.9222, + "step": 4848 + }, + { + "epoch": 0.22576064436529553, + "grad_norm": 0.7916040626192692, + "learning_rate": 7.524829298572316e-05, + "loss": 3.9068, + "step": 4849 + }, + { + "epoch": 0.22580720255138859, + "grad_norm": 0.7434629299947639, + "learning_rate": 7.526381129733086e-05, + "loss": 3.9151, + "step": 4850 + }, + { + "epoch": 0.22585376073748167, + "grad_norm": 0.7376492675815294, + "learning_rate": 7.527932960893855e-05, + "loss": 3.9614, + "step": 4851 + }, + { + "epoch": 0.22590031892357473, + "grad_norm": 0.6903684836568309, + "learning_rate": 7.529484792054625e-05, + "loss": 3.9584, + "step": 4852 + }, + { + "epoch": 0.22594687710966782, + "grad_norm": 0.5924617178718676, + "learning_rate": 7.531036623215395e-05, + "loss": 3.8763, + "step": 4853 + }, + { + "epoch": 0.22599343529576088, + "grad_norm": 0.6600248484480566, + "learning_rate": 7.532588454376164e-05, + "loss": 3.8659, + "step": 4854 + }, + { + "epoch": 0.22603999348185394, + "grad_norm": 0.7451215371451368, + "learning_rate": 7.534140285536934e-05, + "loss": 4.0201, + "step": 4855 + }, + { + "epoch": 0.22608655166794703, + "grad_norm": 0.6629670390735848, + "learning_rate": 7.535692116697704e-05, + "loss": 3.7871, + "step": 4856 + }, + { + "epoch": 0.2261331098540401, + "grad_norm": 0.5533048400570322, + "learning_rate": 7.537243947858473e-05, + "loss": 3.9707, + "step": 4857 + }, + { + "epoch": 0.22617966804013315, + "grad_norm": 0.7026688868180068, + "learning_rate": 7.538795779019243e-05, + "loss": 3.9745, + "step": 4858 + }, + { + "epoch": 0.22622622622622623, + "grad_norm": 0.75325083797282, + "learning_rate": 7.540347610180012e-05, + "loss": 3.8299, + "step": 4859 + }, + { + "epoch": 0.2262727844123193, + "grad_norm": 0.6104468221740443, + "learning_rate": 7.541899441340783e-05, + "loss": 3.9, + "step": 4860 + }, + { + "epoch": 0.22631934259841235, + "grad_norm": 0.7334686801186493, + "learning_rate": 7.543451272501553e-05, + "loss": 4.025, + "step": 4861 + }, + { + "epoch": 0.22636590078450544, + "grad_norm": 0.84332110561418, + "learning_rate": 7.545003103662321e-05, + "loss": 3.8506, + "step": 4862 + }, + { + "epoch": 0.2264124589705985, + "grad_norm": 0.8445233638593904, + "learning_rate": 7.546554934823091e-05, + "loss": 4.0007, + "step": 4863 + }, + { + "epoch": 0.2264590171566916, + "grad_norm": 0.7713745795007717, + "learning_rate": 7.54810676598386e-05, + "loss": 3.8048, + "step": 4864 + }, + { + "epoch": 0.22650557534278465, + "grad_norm": 0.7126567149318505, + "learning_rate": 7.549658597144632e-05, + "loss": 3.9134, + 
"step": 4865 + }, + { + "epoch": 0.2265521335288777, + "grad_norm": 0.8817363258597369, + "learning_rate": 7.551210428305401e-05, + "loss": 3.9337, + "step": 4866 + }, + { + "epoch": 0.2265986917149708, + "grad_norm": 1.005518538397221, + "learning_rate": 7.552762259466171e-05, + "loss": 3.7988, + "step": 4867 + }, + { + "epoch": 0.22664524990106386, + "grad_norm": 0.7785150745241227, + "learning_rate": 7.55431409062694e-05, + "loss": 3.8612, + "step": 4868 + }, + { + "epoch": 0.22669180808715692, + "grad_norm": 0.7851901701269239, + "learning_rate": 7.55586592178771e-05, + "loss": 3.9475, + "step": 4869 + }, + { + "epoch": 0.22673836627325, + "grad_norm": 0.8369643758953107, + "learning_rate": 7.55741775294848e-05, + "loss": 3.8758, + "step": 4870 + }, + { + "epoch": 0.22678492445934306, + "grad_norm": 0.6609459931834141, + "learning_rate": 7.55896958410925e-05, + "loss": 3.9496, + "step": 4871 + }, + { + "epoch": 0.22683148264543612, + "grad_norm": 0.6461239642925093, + "learning_rate": 7.560521415270019e-05, + "loss": 3.915, + "step": 4872 + }, + { + "epoch": 0.2268780408315292, + "grad_norm": 0.7354333771307796, + "learning_rate": 7.562073246430789e-05, + "loss": 4.0548, + "step": 4873 + }, + { + "epoch": 0.22692459901762227, + "grad_norm": 0.6998067180643596, + "learning_rate": 7.563625077591558e-05, + "loss": 3.7798, + "step": 4874 + }, + { + "epoch": 0.22697115720371536, + "grad_norm": 0.6835459220403027, + "learning_rate": 7.565176908752328e-05, + "loss": 3.8572, + "step": 4875 + }, + { + "epoch": 0.22701771538980842, + "grad_norm": 0.7012363851261647, + "learning_rate": 7.566728739913098e-05, + "loss": 3.8192, + "step": 4876 + }, + { + "epoch": 0.22706427357590148, + "grad_norm": 0.6927793639568123, + "learning_rate": 7.568280571073869e-05, + "loss": 3.9714, + "step": 4877 + }, + { + "epoch": 0.22711083176199456, + "grad_norm": 0.6381102708908728, + "learning_rate": 7.569832402234637e-05, + "loss": 3.8298, + "step": 4878 + }, + { + "epoch": 0.22715738994808762, + "grad_norm": 0.6279652032688116, + "learning_rate": 7.571384233395406e-05, + "loss": 3.782, + "step": 4879 + }, + { + "epoch": 0.22720394813418068, + "grad_norm": 0.5734590522576295, + "learning_rate": 7.572936064556176e-05, + "loss": 3.9543, + "step": 4880 + }, + { + "epoch": 0.22725050632027377, + "grad_norm": 0.6157991485502553, + "learning_rate": 7.574487895716946e-05, + "loss": 3.9672, + "step": 4881 + }, + { + "epoch": 0.22729706450636683, + "grad_norm": 0.6968662413632891, + "learning_rate": 7.576039726877717e-05, + "loss": 3.9354, + "step": 4882 + }, + { + "epoch": 0.2273436226924599, + "grad_norm": 0.736585260283314, + "learning_rate": 7.577591558038486e-05, + "loss": 3.872, + "step": 4883 + }, + { + "epoch": 0.22739018087855298, + "grad_norm": 0.5397447346433478, + "learning_rate": 7.579143389199256e-05, + "loss": 3.8613, + "step": 4884 + }, + { + "epoch": 0.22743673906464604, + "grad_norm": 0.7140142368214836, + "learning_rate": 7.580695220360026e-05, + "loss": 3.8078, + "step": 4885 + }, + { + "epoch": 0.22748329725073912, + "grad_norm": 0.8185864373655205, + "learning_rate": 7.582247051520794e-05, + "loss": 3.8856, + "step": 4886 + }, + { + "epoch": 0.22752985543683218, + "grad_norm": 0.7320574697606781, + "learning_rate": 7.583798882681565e-05, + "loss": 3.8087, + "step": 4887 + }, + { + "epoch": 0.22757641362292524, + "grad_norm": 0.7412164801175836, + "learning_rate": 7.585350713842334e-05, + "loss": 3.8578, + "step": 4888 + }, + { + "epoch": 0.22762297180901833, + "grad_norm": 0.6521547664705653, + 
"learning_rate": 7.586902545003104e-05, + "loss": 4.0491, + "step": 4889 + }, + { + "epoch": 0.2276695299951114, + "grad_norm": 0.7376884790388935, + "learning_rate": 7.588454376163874e-05, + "loss": 3.9314, + "step": 4890 + }, + { + "epoch": 0.22771608818120445, + "grad_norm": 0.6028195601015204, + "learning_rate": 7.590006207324643e-05, + "loss": 3.9912, + "step": 4891 + }, + { + "epoch": 0.22776264636729754, + "grad_norm": 0.6365001056492805, + "learning_rate": 7.591558038485413e-05, + "loss": 4.0577, + "step": 4892 + }, + { + "epoch": 0.2278092045533906, + "grad_norm": 0.7026443316686763, + "learning_rate": 7.593109869646184e-05, + "loss": 3.9913, + "step": 4893 + }, + { + "epoch": 0.22785576273948366, + "grad_norm": 0.6697582509094544, + "learning_rate": 7.594661700806952e-05, + "loss": 3.89, + "step": 4894 + }, + { + "epoch": 0.22790232092557675, + "grad_norm": 0.6730123639249767, + "learning_rate": 7.596213531967722e-05, + "loss": 3.8136, + "step": 4895 + }, + { + "epoch": 0.2279488791116698, + "grad_norm": 0.6924671630294752, + "learning_rate": 7.597765363128491e-05, + "loss": 3.9813, + "step": 4896 + }, + { + "epoch": 0.2279954372977629, + "grad_norm": 0.6289593074715643, + "learning_rate": 7.599317194289261e-05, + "loss": 3.917, + "step": 4897 + }, + { + "epoch": 0.22804199548385595, + "grad_norm": 0.6139069063332785, + "learning_rate": 7.600869025450031e-05, + "loss": 4.0242, + "step": 4898 + }, + { + "epoch": 0.228088553669949, + "grad_norm": 0.743071632503203, + "learning_rate": 7.602420856610802e-05, + "loss": 3.9483, + "step": 4899 + }, + { + "epoch": 0.2281351118560421, + "grad_norm": 0.7420827351957268, + "learning_rate": 7.603972687771571e-05, + "loss": 3.8585, + "step": 4900 + }, + { + "epoch": 0.22818167004213516, + "grad_norm": 0.6783041923384102, + "learning_rate": 7.605524518932341e-05, + "loss": 3.88, + "step": 4901 + }, + { + "epoch": 0.22822822822822822, + "grad_norm": 0.6953945229596266, + "learning_rate": 7.607076350093109e-05, + "loss": 3.9957, + "step": 4902 + }, + { + "epoch": 0.2282747864143213, + "grad_norm": 0.6515529443522377, + "learning_rate": 7.608628181253879e-05, + "loss": 3.8921, + "step": 4903 + }, + { + "epoch": 0.22832134460041437, + "grad_norm": 0.6003064426794749, + "learning_rate": 7.61018001241465e-05, + "loss": 3.9615, + "step": 4904 + }, + { + "epoch": 0.22836790278650743, + "grad_norm": 0.5915204039146286, + "learning_rate": 7.61173184357542e-05, + "loss": 3.9242, + "step": 4905 + }, + { + "epoch": 0.2284144609726005, + "grad_norm": 0.6665128828753198, + "learning_rate": 7.613283674736189e-05, + "loss": 4.113, + "step": 4906 + }, + { + "epoch": 0.22846101915869357, + "grad_norm": 0.7645888950767685, + "learning_rate": 7.614835505896959e-05, + "loss": 3.8625, + "step": 4907 + }, + { + "epoch": 0.22850757734478666, + "grad_norm": 0.7341996039652927, + "learning_rate": 7.616387337057728e-05, + "loss": 3.98, + "step": 4908 + }, + { + "epoch": 0.22855413553087972, + "grad_norm": 0.638919665555699, + "learning_rate": 7.617939168218498e-05, + "loss": 3.8923, + "step": 4909 + }, + { + "epoch": 0.22860069371697278, + "grad_norm": 0.5270962065973133, + "learning_rate": 7.619490999379268e-05, + "loss": 3.9271, + "step": 4910 + }, + { + "epoch": 0.22864725190306587, + "grad_norm": 0.6351697993679074, + "learning_rate": 7.621042830540037e-05, + "loss": 3.8772, + "step": 4911 + }, + { + "epoch": 0.22869381008915893, + "grad_norm": 0.7203902817783856, + "learning_rate": 7.622594661700807e-05, + "loss": 3.9296, + "step": 4912 + }, + { + "epoch": 
0.228740368275252, + "grad_norm": 0.6097624034236592, + "learning_rate": 7.624146492861577e-05, + "loss": 3.8503, + "step": 4913 + }, + { + "epoch": 0.22878692646134507, + "grad_norm": 0.5420442911723389, + "learning_rate": 7.625698324022346e-05, + "loss": 3.8984, + "step": 4914 + }, + { + "epoch": 0.22883348464743813, + "grad_norm": 0.7355087093504882, + "learning_rate": 7.627250155183117e-05, + "loss": 3.8851, + "step": 4915 + }, + { + "epoch": 0.2288800428335312, + "grad_norm": 0.6381805129480201, + "learning_rate": 7.628801986343887e-05, + "loss": 3.7938, + "step": 4916 + }, + { + "epoch": 0.22892660101962428, + "grad_norm": 0.5228752595772017, + "learning_rate": 7.630353817504656e-05, + "loss": 3.8685, + "step": 4917 + }, + { + "epoch": 0.22897315920571734, + "grad_norm": 0.6111797956554511, + "learning_rate": 7.631905648665426e-05, + "loss": 4.0007, + "step": 4918 + }, + { + "epoch": 0.2290197173918104, + "grad_norm": 0.7839406240413977, + "learning_rate": 7.633457479826194e-05, + "loss": 3.9369, + "step": 4919 + }, + { + "epoch": 0.2290662755779035, + "grad_norm": 0.8259374707586644, + "learning_rate": 7.635009310986965e-05, + "loss": 3.8643, + "step": 4920 + }, + { + "epoch": 0.22911283376399655, + "grad_norm": 0.7683938810831441, + "learning_rate": 7.636561142147735e-05, + "loss": 3.9238, + "step": 4921 + }, + { + "epoch": 0.22915939195008964, + "grad_norm": 0.5376214647325291, + "learning_rate": 7.638112973308505e-05, + "loss": 3.8798, + "step": 4922 + }, + { + "epoch": 0.2292059501361827, + "grad_norm": 0.5531742108029649, + "learning_rate": 7.639664804469274e-05, + "loss": 3.9822, + "step": 4923 + }, + { + "epoch": 0.22925250832227576, + "grad_norm": 0.6341235012649749, + "learning_rate": 7.641216635630044e-05, + "loss": 3.8943, + "step": 4924 + }, + { + "epoch": 0.22929906650836884, + "grad_norm": 0.5446821397242548, + "learning_rate": 7.642768466790814e-05, + "loss": 3.8894, + "step": 4925 + }, + { + "epoch": 0.2293456246944619, + "grad_norm": 0.6386294525271271, + "learning_rate": 7.644320297951583e-05, + "loss": 3.7555, + "step": 4926 + }, + { + "epoch": 0.22939218288055496, + "grad_norm": 0.6522613702140174, + "learning_rate": 7.645872129112353e-05, + "loss": 3.7759, + "step": 4927 + }, + { + "epoch": 0.22943874106664805, + "grad_norm": 0.6912050819156009, + "learning_rate": 7.647423960273122e-05, + "loss": 3.9036, + "step": 4928 + }, + { + "epoch": 0.2294852992527411, + "grad_norm": 0.6186399442351177, + "learning_rate": 7.648975791433892e-05, + "loss": 3.8746, + "step": 4929 + }, + { + "epoch": 0.22953185743883417, + "grad_norm": 0.659926706162954, + "learning_rate": 7.650527622594662e-05, + "loss": 3.8867, + "step": 4930 + }, + { + "epoch": 0.22957841562492726, + "grad_norm": 0.8183662459458555, + "learning_rate": 7.652079453755431e-05, + "loss": 3.7341, + "step": 4931 + }, + { + "epoch": 0.22962497381102032, + "grad_norm": 0.8261456412162934, + "learning_rate": 7.653631284916202e-05, + "loss": 3.8918, + "step": 4932 + }, + { + "epoch": 0.2296715319971134, + "grad_norm": 0.665224821019588, + "learning_rate": 7.655183116076972e-05, + "loss": 3.9303, + "step": 4933 + }, + { + "epoch": 0.22971809018320646, + "grad_norm": 0.696466394136507, + "learning_rate": 7.656734947237742e-05, + "loss": 3.9564, + "step": 4934 + }, + { + "epoch": 0.22976464836929952, + "grad_norm": 0.764445173497353, + "learning_rate": 7.65828677839851e-05, + "loss": 3.9708, + "step": 4935 + }, + { + "epoch": 0.2298112065553926, + "grad_norm": 0.733655180900452, + "learning_rate": 7.65983860955928e-05, 
+ "loss": 3.9606, + "step": 4936 + }, + { + "epoch": 0.22985776474148567, + "grad_norm": 0.7236820470326816, + "learning_rate": 7.66139044072005e-05, + "loss": 3.8964, + "step": 4937 + }, + { + "epoch": 0.22990432292757873, + "grad_norm": 0.8010627911272299, + "learning_rate": 7.66294227188082e-05, + "loss": 3.7892, + "step": 4938 + }, + { + "epoch": 0.22995088111367182, + "grad_norm": 0.749431155942107, + "learning_rate": 7.66449410304159e-05, + "loss": 3.8227, + "step": 4939 + }, + { + "epoch": 0.22999743929976488, + "grad_norm": 0.6628815638514964, + "learning_rate": 7.666045934202359e-05, + "loss": 3.8715, + "step": 4940 + }, + { + "epoch": 0.23004399748585794, + "grad_norm": 0.5563938732430583, + "learning_rate": 7.667597765363129e-05, + "loss": 3.9731, + "step": 4941 + }, + { + "epoch": 0.23009055567195102, + "grad_norm": 0.6982774313283543, + "learning_rate": 7.669149596523899e-05, + "loss": 3.9989, + "step": 4942 + }, + { + "epoch": 0.23013711385804408, + "grad_norm": 0.7118777206102468, + "learning_rate": 7.670701427684668e-05, + "loss": 3.9769, + "step": 4943 + }, + { + "epoch": 0.23018367204413717, + "grad_norm": 0.7118052049784401, + "learning_rate": 7.672253258845438e-05, + "loss": 3.8881, + "step": 4944 + }, + { + "epoch": 0.23023023023023023, + "grad_norm": 0.6289869235324208, + "learning_rate": 7.673805090006207e-05, + "loss": 3.6988, + "step": 4945 + }, + { + "epoch": 0.2302767884163233, + "grad_norm": 0.6982812410815292, + "learning_rate": 7.675356921166977e-05, + "loss": 3.8673, + "step": 4946 + }, + { + "epoch": 0.23032334660241638, + "grad_norm": 0.5581322963836396, + "learning_rate": 7.676908752327747e-05, + "loss": 3.8882, + "step": 4947 + }, + { + "epoch": 0.23036990478850944, + "grad_norm": 0.680924024576682, + "learning_rate": 7.678460583488518e-05, + "loss": 3.7613, + "step": 4948 + }, + { + "epoch": 0.2304164629746025, + "grad_norm": 0.8183361928298125, + "learning_rate": 7.680012414649287e-05, + "loss": 3.9505, + "step": 4949 + }, + { + "epoch": 0.23046302116069559, + "grad_norm": 0.8583753778415694, + "learning_rate": 7.681564245810057e-05, + "loss": 3.8013, + "step": 4950 + }, + { + "epoch": 0.23050957934678865, + "grad_norm": 0.9165124602429493, + "learning_rate": 7.683116076970825e-05, + "loss": 3.861, + "step": 4951 + }, + { + "epoch": 0.2305561375328817, + "grad_norm": 0.8855281483100089, + "learning_rate": 7.684667908131595e-05, + "loss": 3.8783, + "step": 4952 + }, + { + "epoch": 0.2306026957189748, + "grad_norm": 0.7518521971581913, + "learning_rate": 7.686219739292365e-05, + "loss": 3.7258, + "step": 4953 + }, + { + "epoch": 0.23064925390506785, + "grad_norm": 0.7428010416171943, + "learning_rate": 7.687771570453136e-05, + "loss": 3.8647, + "step": 4954 + }, + { + "epoch": 0.23069581209116094, + "grad_norm": 0.6961561603064621, + "learning_rate": 7.689323401613905e-05, + "loss": 3.8061, + "step": 4955 + }, + { + "epoch": 0.230742370277254, + "grad_norm": 0.6037374775902552, + "learning_rate": 7.690875232774675e-05, + "loss": 3.9851, + "step": 4956 + }, + { + "epoch": 0.23078892846334706, + "grad_norm": 0.5922736443059065, + "learning_rate": 7.692427063935444e-05, + "loss": 3.8144, + "step": 4957 + }, + { + "epoch": 0.23083548664944015, + "grad_norm": 0.6276728687547131, + "learning_rate": 7.693978895096214e-05, + "loss": 3.8941, + "step": 4958 + }, + { + "epoch": 0.2308820448355332, + "grad_norm": 0.6517232536813597, + "learning_rate": 7.695530726256984e-05, + "loss": 3.9007, + "step": 4959 + }, + { + "epoch": 0.23092860302162627, + "grad_norm": 
0.6055195084189832, + "learning_rate": 7.697082557417753e-05, + "loss": 3.9855, + "step": 4960 + }, + { + "epoch": 0.23097516120771935, + "grad_norm": 0.7732018056123242, + "learning_rate": 7.698634388578523e-05, + "loss": 3.8496, + "step": 4961 + }, + { + "epoch": 0.2310217193938124, + "grad_norm": 0.7472221812388177, + "learning_rate": 7.700186219739293e-05, + "loss": 3.8691, + "step": 4962 + }, + { + "epoch": 0.23106827757990547, + "grad_norm": 0.6190004025653796, + "learning_rate": 7.701738050900062e-05, + "loss": 3.9489, + "step": 4963 + }, + { + "epoch": 0.23111483576599856, + "grad_norm": 0.6996210074868776, + "learning_rate": 7.703289882060832e-05, + "loss": 3.8127, + "step": 4964 + }, + { + "epoch": 0.23116139395209162, + "grad_norm": 0.7188001532224366, + "learning_rate": 7.704841713221603e-05, + "loss": 3.809, + "step": 4965 + }, + { + "epoch": 0.2312079521381847, + "grad_norm": 0.6273662031974104, + "learning_rate": 7.706393544382372e-05, + "loss": 3.9217, + "step": 4966 + }, + { + "epoch": 0.23125451032427777, + "grad_norm": 0.60866400881945, + "learning_rate": 7.707945375543141e-05, + "loss": 3.6775, + "step": 4967 + }, + { + "epoch": 0.23130106851037083, + "grad_norm": 0.6110460140668365, + "learning_rate": 7.70949720670391e-05, + "loss": 3.8404, + "step": 4968 + }, + { + "epoch": 0.23134762669646391, + "grad_norm": 0.6569679648098369, + "learning_rate": 7.71104903786468e-05, + "loss": 3.8925, + "step": 4969 + }, + { + "epoch": 0.23139418488255697, + "grad_norm": 0.6784842690812305, + "learning_rate": 7.712600869025451e-05, + "loss": 3.9951, + "step": 4970 + }, + { + "epoch": 0.23144074306865003, + "grad_norm": 0.6032004842405877, + "learning_rate": 7.71415270018622e-05, + "loss": 3.7291, + "step": 4971 + }, + { + "epoch": 0.23148730125474312, + "grad_norm": 0.6741651991934944, + "learning_rate": 7.71570453134699e-05, + "loss": 3.8613, + "step": 4972 + }, + { + "epoch": 0.23153385944083618, + "grad_norm": 0.7326711922231404, + "learning_rate": 7.71725636250776e-05, + "loss": 3.841, + "step": 4973 + }, + { + "epoch": 0.23158041762692924, + "grad_norm": 0.7083151740586964, + "learning_rate": 7.71880819366853e-05, + "loss": 3.8622, + "step": 4974 + }, + { + "epoch": 0.23162697581302233, + "grad_norm": 0.7105413575754385, + "learning_rate": 7.720360024829299e-05, + "loss": 3.8935, + "step": 4975 + }, + { + "epoch": 0.2316735339991154, + "grad_norm": 0.6214645050379789, + "learning_rate": 7.721911855990069e-05, + "loss": 3.875, + "step": 4976 + }, + { + "epoch": 0.23172009218520848, + "grad_norm": 0.6359427199756347, + "learning_rate": 7.723463687150838e-05, + "loss": 3.8578, + "step": 4977 + }, + { + "epoch": 0.23176665037130154, + "grad_norm": 0.6326047280238208, + "learning_rate": 7.725015518311608e-05, + "loss": 3.835, + "step": 4978 + }, + { + "epoch": 0.2318132085573946, + "grad_norm": 0.6151416042601429, + "learning_rate": 7.726567349472378e-05, + "loss": 3.9818, + "step": 4979 + }, + { + "epoch": 0.23185976674348768, + "grad_norm": 0.6413647948763077, + "learning_rate": 7.728119180633147e-05, + "loss": 3.7998, + "step": 4980 + }, + { + "epoch": 0.23190632492958074, + "grad_norm": 0.6745849631801157, + "learning_rate": 7.729671011793917e-05, + "loss": 3.8723, + "step": 4981 + }, + { + "epoch": 0.2319528831156738, + "grad_norm": 0.6189232597237475, + "learning_rate": 7.731222842954688e-05, + "loss": 3.8457, + "step": 4982 + }, + { + "epoch": 0.2319994413017669, + "grad_norm": 0.6673208962516856, + "learning_rate": 7.732774674115456e-05, + "loss": 3.7967, + "step": 4983 + }, 
+ { + "epoch": 0.23204599948785995, + "grad_norm": 0.7659373741416297, + "learning_rate": 7.734326505276226e-05, + "loss": 3.832, + "step": 4984 + }, + { + "epoch": 0.232092557673953, + "grad_norm": 0.879505364792418, + "learning_rate": 7.735878336436995e-05, + "loss": 3.8737, + "step": 4985 + }, + { + "epoch": 0.2321391158600461, + "grad_norm": 0.7717371864337086, + "learning_rate": 7.737430167597765e-05, + "loss": 3.9125, + "step": 4986 + }, + { + "epoch": 0.23218567404613916, + "grad_norm": 0.6362195022070869, + "learning_rate": 7.738981998758536e-05, + "loss": 3.9754, + "step": 4987 + }, + { + "epoch": 0.23223223223223224, + "grad_norm": 0.7074510402206687, + "learning_rate": 7.740533829919306e-05, + "loss": 3.8478, + "step": 4988 + }, + { + "epoch": 0.2322787904183253, + "grad_norm": 0.5517709054346115, + "learning_rate": 7.742085661080075e-05, + "loss": 3.9069, + "step": 4989 + }, + { + "epoch": 0.23232534860441836, + "grad_norm": 0.6998588986105821, + "learning_rate": 7.743637492240845e-05, + "loss": 3.8889, + "step": 4990 + }, + { + "epoch": 0.23237190679051145, + "grad_norm": 0.6990580402390218, + "learning_rate": 7.745189323401613e-05, + "loss": 3.8951, + "step": 4991 + }, + { + "epoch": 0.2324184649766045, + "grad_norm": 0.6768339316743909, + "learning_rate": 7.746741154562384e-05, + "loss": 3.808, + "step": 4992 + }, + { + "epoch": 0.23246502316269757, + "grad_norm": 0.5132610658379385, + "learning_rate": 7.748292985723154e-05, + "loss": 3.8735, + "step": 4993 + }, + { + "epoch": 0.23251158134879066, + "grad_norm": 0.7603956050199837, + "learning_rate": 7.749844816883923e-05, + "loss": 3.9779, + "step": 4994 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.7561637518369639, + "learning_rate": 7.751396648044693e-05, + "loss": 3.8736, + "step": 4995 + }, + { + "epoch": 0.23260469772097678, + "grad_norm": 0.7337081846572157, + "learning_rate": 7.752948479205463e-05, + "loss": 3.9539, + "step": 4996 + }, + { + "epoch": 0.23265125590706986, + "grad_norm": 0.6375034578112412, + "learning_rate": 7.754500310366232e-05, + "loss": 3.8364, + "step": 4997 + }, + { + "epoch": 0.23269781409316292, + "grad_norm": 0.7065585815547976, + "learning_rate": 7.756052141527003e-05, + "loss": 3.9048, + "step": 4998 + }, + { + "epoch": 0.232744372279256, + "grad_norm": 0.6515784249239857, + "learning_rate": 7.757603972687772e-05, + "loss": 3.8398, + "step": 4999 + }, + { + "epoch": 0.23279093046534907, + "grad_norm": 0.6214924802105768, + "learning_rate": 7.759155803848541e-05, + "loss": 3.8149, + "step": 5000 + }, + { + "epoch": 0.23283748865144213, + "grad_norm": 0.6002277257460154, + "learning_rate": 7.760707635009311e-05, + "loss": 3.9341, + "step": 5001 + }, + { + "epoch": 0.23288404683753522, + "grad_norm": 0.6462018866749593, + "learning_rate": 7.76225946617008e-05, + "loss": 3.8882, + "step": 5002 + }, + { + "epoch": 0.23293060502362828, + "grad_norm": 0.5253689377301891, + "learning_rate": 7.76381129733085e-05, + "loss": 3.8722, + "step": 5003 + }, + { + "epoch": 0.23297716320972134, + "grad_norm": 0.5482598239157014, + "learning_rate": 7.765363128491621e-05, + "loss": 3.8718, + "step": 5004 + }, + { + "epoch": 0.23302372139581443, + "grad_norm": 0.5312043517009969, + "learning_rate": 7.766914959652391e-05, + "loss": 3.8965, + "step": 5005 + }, + { + "epoch": 0.23307027958190749, + "grad_norm": 0.5508129017957695, + "learning_rate": 7.76846679081316e-05, + "loss": 3.8222, + "step": 5006 + }, + { + "epoch": 0.23311683776800055, + "grad_norm": 0.5545675255248685, + "learning_rate": 
7.77001862197393e-05, + "loss": 3.9632, + "step": 5007 + }, + { + "epoch": 0.23316339595409363, + "grad_norm": 0.527706441447963, + "learning_rate": 7.771570453134698e-05, + "loss": 3.7935, + "step": 5008 + }, + { + "epoch": 0.2332099541401867, + "grad_norm": 0.5491123756472959, + "learning_rate": 7.773122284295469e-05, + "loss": 3.9286, + "step": 5009 + }, + { + "epoch": 0.23325651232627978, + "grad_norm": 0.6022259816090701, + "learning_rate": 7.774674115456239e-05, + "loss": 3.7594, + "step": 5010 + }, + { + "epoch": 0.23330307051237284, + "grad_norm": 0.591390081148705, + "learning_rate": 7.776225946617009e-05, + "loss": 3.8906, + "step": 5011 + }, + { + "epoch": 0.2333496286984659, + "grad_norm": 0.6006120957954929, + "learning_rate": 7.777777777777778e-05, + "loss": 3.8318, + "step": 5012 + }, + { + "epoch": 0.233396186884559, + "grad_norm": 0.5329560569204446, + "learning_rate": 7.779329608938548e-05, + "loss": 3.8925, + "step": 5013 + }, + { + "epoch": 0.23344274507065205, + "grad_norm": 0.54891255306422, + "learning_rate": 7.780881440099317e-05, + "loss": 3.8706, + "step": 5014 + }, + { + "epoch": 0.2334893032567451, + "grad_norm": 0.651174674598507, + "learning_rate": 7.782433271260088e-05, + "loss": 3.8622, + "step": 5015 + }, + { + "epoch": 0.2335358614428382, + "grad_norm": 0.736268277517622, + "learning_rate": 7.783985102420857e-05, + "loss": 3.9338, + "step": 5016 + }, + { + "epoch": 0.23358241962893125, + "grad_norm": 0.7327961809458585, + "learning_rate": 7.785536933581626e-05, + "loss": 3.8955, + "step": 5017 + }, + { + "epoch": 0.2336289778150243, + "grad_norm": 0.7767360010618216, + "learning_rate": 7.787088764742396e-05, + "loss": 3.9306, + "step": 5018 + }, + { + "epoch": 0.2336755360011174, + "grad_norm": 0.890213909521022, + "learning_rate": 7.788640595903166e-05, + "loss": 3.9387, + "step": 5019 + }, + { + "epoch": 0.23372209418721046, + "grad_norm": 0.6829020254496428, + "learning_rate": 7.790192427063937e-05, + "loss": 3.7928, + "step": 5020 + }, + { + "epoch": 0.23376865237330355, + "grad_norm": 0.5975099661556879, + "learning_rate": 7.791744258224706e-05, + "loss": 3.8218, + "step": 5021 + }, + { + "epoch": 0.2338152105593966, + "grad_norm": 0.7497828616217971, + "learning_rate": 7.793296089385476e-05, + "loss": 3.8434, + "step": 5022 + }, + { + "epoch": 0.23386176874548967, + "grad_norm": 0.6952236094433446, + "learning_rate": 7.794847920546245e-05, + "loss": 3.9026, + "step": 5023 + }, + { + "epoch": 0.23390832693158276, + "grad_norm": 0.6172678199545341, + "learning_rate": 7.796399751707014e-05, + "loss": 3.9113, + "step": 5024 + }, + { + "epoch": 0.23395488511767581, + "grad_norm": 0.6507570136267938, + "learning_rate": 7.797951582867785e-05, + "loss": 3.8542, + "step": 5025 + }, + { + "epoch": 0.23400144330376887, + "grad_norm": 0.6387184671954778, + "learning_rate": 7.799503414028554e-05, + "loss": 3.7944, + "step": 5026 + }, + { + "epoch": 0.23404800148986196, + "grad_norm": 0.5741624355136797, + "learning_rate": 7.801055245189324e-05, + "loss": 3.932, + "step": 5027 + }, + { + "epoch": 0.23409455967595502, + "grad_norm": 0.7072555228570393, + "learning_rate": 7.802607076350094e-05, + "loss": 3.9204, + "step": 5028 + }, + { + "epoch": 0.23414111786204808, + "grad_norm": 0.7296935848168864, + "learning_rate": 7.804158907510863e-05, + "loss": 3.8564, + "step": 5029 + }, + { + "epoch": 0.23418767604814117, + "grad_norm": 0.6727373692774012, + "learning_rate": 7.805710738671633e-05, + "loss": 3.8152, + "step": 5030 + }, + { + "epoch": 0.23423423423423423, + 
"grad_norm": 0.6273716838889355, + "learning_rate": 7.807262569832402e-05, + "loss": 3.8837, + "step": 5031 + }, + { + "epoch": 0.23428079242032732, + "grad_norm": 0.589038017577128, + "learning_rate": 7.808814400993172e-05, + "loss": 3.8719, + "step": 5032 + }, + { + "epoch": 0.23432735060642038, + "grad_norm": 0.6541104321426029, + "learning_rate": 7.810366232153942e-05, + "loss": 3.863, + "step": 5033 + }, + { + "epoch": 0.23437390879251344, + "grad_norm": 0.6531935320565554, + "learning_rate": 7.811918063314711e-05, + "loss": 3.887, + "step": 5034 + }, + { + "epoch": 0.23442046697860652, + "grad_norm": 0.845604919334425, + "learning_rate": 7.813469894475481e-05, + "loss": 3.9091, + "step": 5035 + }, + { + "epoch": 0.23446702516469958, + "grad_norm": 0.8215568167270008, + "learning_rate": 7.81502172563625e-05, + "loss": 3.9706, + "step": 5036 + }, + { + "epoch": 0.23451358335079264, + "grad_norm": 0.7882467870876985, + "learning_rate": 7.816573556797022e-05, + "loss": 3.7737, + "step": 5037 + }, + { + "epoch": 0.23456014153688573, + "grad_norm": 0.6739333984473027, + "learning_rate": 7.818125387957791e-05, + "loss": 3.8636, + "step": 5038 + }, + { + "epoch": 0.2346066997229788, + "grad_norm": 0.6946012238324047, + "learning_rate": 7.819677219118561e-05, + "loss": 3.7087, + "step": 5039 + }, + { + "epoch": 0.23465325790907185, + "grad_norm": 0.7401685847341998, + "learning_rate": 7.821229050279329e-05, + "loss": 3.8283, + "step": 5040 + }, + { + "epoch": 0.23469981609516494, + "grad_norm": 0.6588870763725481, + "learning_rate": 7.822780881440099e-05, + "loss": 3.7791, + "step": 5041 + }, + { + "epoch": 0.234746374281258, + "grad_norm": 0.6551427856725219, + "learning_rate": 7.82433271260087e-05, + "loss": 3.7675, + "step": 5042 + }, + { + "epoch": 0.23479293246735108, + "grad_norm": 0.637794977231905, + "learning_rate": 7.82588454376164e-05, + "loss": 3.7606, + "step": 5043 + }, + { + "epoch": 0.23483949065344414, + "grad_norm": 0.5194055194406175, + "learning_rate": 7.827436374922409e-05, + "loss": 3.7788, + "step": 5044 + }, + { + "epoch": 0.2348860488395372, + "grad_norm": 0.6764056266194253, + "learning_rate": 7.828988206083179e-05, + "loss": 3.8305, + "step": 5045 + }, + { + "epoch": 0.2349326070256303, + "grad_norm": 0.8952994303554282, + "learning_rate": 7.830540037243948e-05, + "loss": 3.8196, + "step": 5046 + }, + { + "epoch": 0.23497916521172335, + "grad_norm": 0.8375367331877251, + "learning_rate": 7.832091868404718e-05, + "loss": 3.7886, + "step": 5047 + }, + { + "epoch": 0.2350257233978164, + "grad_norm": 0.6495443288443598, + "learning_rate": 7.833643699565488e-05, + "loss": 3.9156, + "step": 5048 + }, + { + "epoch": 0.2350722815839095, + "grad_norm": 0.6566073414286693, + "learning_rate": 7.835195530726257e-05, + "loss": 3.9948, + "step": 5049 + }, + { + "epoch": 0.23511883977000256, + "grad_norm": 0.8336359592067768, + "learning_rate": 7.836747361887027e-05, + "loss": 3.7089, + "step": 5050 + }, + { + "epoch": 0.23516539795609562, + "grad_norm": 0.695154693720495, + "learning_rate": 7.838299193047796e-05, + "loss": 3.7317, + "step": 5051 + }, + { + "epoch": 0.2352119561421887, + "grad_norm": 0.5976362164795852, + "learning_rate": 7.839851024208566e-05, + "loss": 3.9407, + "step": 5052 + }, + { + "epoch": 0.23525851432828176, + "grad_norm": 0.6181664379055902, + "learning_rate": 7.841402855369336e-05, + "loss": 3.7562, + "step": 5053 + }, + { + "epoch": 0.23530507251437485, + "grad_norm": 0.6893196965435007, + "learning_rate": 7.842954686530107e-05, + "loss": 3.821, + 
"step": 5054 + }, + { + "epoch": 0.2353516307004679, + "grad_norm": 0.673128946793382, + "learning_rate": 7.844506517690876e-05, + "loss": 3.795, + "step": 5055 + }, + { + "epoch": 0.23539818888656097, + "grad_norm": 0.7315291932133795, + "learning_rate": 7.846058348851645e-05, + "loss": 3.913, + "step": 5056 + }, + { + "epoch": 0.23544474707265406, + "grad_norm": 0.7548917727525105, + "learning_rate": 7.847610180012414e-05, + "loss": 3.8632, + "step": 5057 + }, + { + "epoch": 0.23549130525874712, + "grad_norm": 0.7337582685215331, + "learning_rate": 7.849162011173184e-05, + "loss": 3.7666, + "step": 5058 + }, + { + "epoch": 0.23553786344484018, + "grad_norm": 0.678087730824423, + "learning_rate": 7.850713842333955e-05, + "loss": 3.8196, + "step": 5059 + }, + { + "epoch": 0.23558442163093327, + "grad_norm": 0.6824564933906512, + "learning_rate": 7.852265673494724e-05, + "loss": 3.8951, + "step": 5060 + }, + { + "epoch": 0.23563097981702633, + "grad_norm": 0.6632039337475124, + "learning_rate": 7.853817504655494e-05, + "loss": 3.8211, + "step": 5061 + }, + { + "epoch": 0.23567753800311939, + "grad_norm": 0.7123472961488279, + "learning_rate": 7.855369335816264e-05, + "loss": 3.9552, + "step": 5062 + }, + { + "epoch": 0.23572409618921247, + "grad_norm": 0.7187747948643536, + "learning_rate": 7.856921166977033e-05, + "loss": 3.8553, + "step": 5063 + }, + { + "epoch": 0.23577065437530553, + "grad_norm": 0.5917027115435401, + "learning_rate": 7.858472998137803e-05, + "loss": 3.8581, + "step": 5064 + }, + { + "epoch": 0.23581721256139862, + "grad_norm": 0.5795249420453713, + "learning_rate": 7.860024829298573e-05, + "loss": 3.9039, + "step": 5065 + }, + { + "epoch": 0.23586377074749168, + "grad_norm": 0.606957553786326, + "learning_rate": 7.861576660459342e-05, + "loss": 3.7321, + "step": 5066 + }, + { + "epoch": 0.23591032893358474, + "grad_norm": 0.6367461426005091, + "learning_rate": 7.863128491620112e-05, + "loss": 3.8483, + "step": 5067 + }, + { + "epoch": 0.23595688711967783, + "grad_norm": 0.5909416284616751, + "learning_rate": 7.864680322780882e-05, + "loss": 3.8496, + "step": 5068 + }, + { + "epoch": 0.2360034453057709, + "grad_norm": 0.5618348021569588, + "learning_rate": 7.866232153941651e-05, + "loss": 3.8881, + "step": 5069 + }, + { + "epoch": 0.23605000349186395, + "grad_norm": 0.605243855136253, + "learning_rate": 7.867783985102422e-05, + "loss": 3.8637, + "step": 5070 + }, + { + "epoch": 0.23609656167795703, + "grad_norm": 0.5962060024254088, + "learning_rate": 7.869335816263192e-05, + "loss": 3.8307, + "step": 5071 + }, + { + "epoch": 0.2361431198640501, + "grad_norm": 0.5863171207509021, + "learning_rate": 7.87088764742396e-05, + "loss": 3.7138, + "step": 5072 + }, + { + "epoch": 0.23618967805014315, + "grad_norm": 0.6385149569788962, + "learning_rate": 7.87243947858473e-05, + "loss": 3.8692, + "step": 5073 + }, + { + "epoch": 0.23623623623623624, + "grad_norm": 0.7428295783670742, + "learning_rate": 7.873991309745499e-05, + "loss": 3.898, + "step": 5074 + }, + { + "epoch": 0.2362827944223293, + "grad_norm": 0.6726677246600633, + "learning_rate": 7.87554314090627e-05, + "loss": 3.8267, + "step": 5075 + }, + { + "epoch": 0.2363293526084224, + "grad_norm": 0.569908790517138, + "learning_rate": 7.87709497206704e-05, + "loss": 3.8995, + "step": 5076 + }, + { + "epoch": 0.23637591079451545, + "grad_norm": 0.5738343261361917, + "learning_rate": 7.87864680322781e-05, + "loss": 3.7773, + "step": 5077 + }, + { + "epoch": 0.2364224689806085, + "grad_norm": 0.5627635366734406, + 
"learning_rate": 7.880198634388579e-05, + "loss": 3.9798, + "step": 5078 + }, + { + "epoch": 0.2364690271667016, + "grad_norm": 0.5545614062183462, + "learning_rate": 7.881750465549349e-05, + "loss": 3.7871, + "step": 5079 + }, + { + "epoch": 0.23651558535279466, + "grad_norm": 0.5524419377469435, + "learning_rate": 7.883302296710117e-05, + "loss": 3.8119, + "step": 5080 + }, + { + "epoch": 0.23656214353888771, + "grad_norm": 0.5977429916649433, + "learning_rate": 7.884854127870888e-05, + "loss": 4.0193, + "step": 5081 + }, + { + "epoch": 0.2366087017249808, + "grad_norm": 0.559450872746846, + "learning_rate": 7.886405959031658e-05, + "loss": 3.8594, + "step": 5082 + }, + { + "epoch": 0.23665525991107386, + "grad_norm": 0.6170552713317304, + "learning_rate": 7.887957790192427e-05, + "loss": 3.6993, + "step": 5083 + }, + { + "epoch": 0.23670181809716692, + "grad_norm": 0.5466562707186324, + "learning_rate": 7.889509621353197e-05, + "loss": 3.7726, + "step": 5084 + }, + { + "epoch": 0.23674837628326, + "grad_norm": 0.5311310775812935, + "learning_rate": 7.891061452513967e-05, + "loss": 3.6863, + "step": 5085 + }, + { + "epoch": 0.23679493446935307, + "grad_norm": 0.692934451542569, + "learning_rate": 7.892613283674736e-05, + "loss": 3.7302, + "step": 5086 + }, + { + "epoch": 0.23684149265544616, + "grad_norm": 0.6380601426137634, + "learning_rate": 7.894165114835507e-05, + "loss": 3.9802, + "step": 5087 + }, + { + "epoch": 0.23688805084153922, + "grad_norm": 0.588460696005938, + "learning_rate": 7.895716945996277e-05, + "loss": 3.7147, + "step": 5088 + }, + { + "epoch": 0.23693460902763228, + "grad_norm": 0.609426836388895, + "learning_rate": 7.897268777157045e-05, + "loss": 3.8295, + "step": 5089 + }, + { + "epoch": 0.23698116721372536, + "grad_norm": 0.5598915792582658, + "learning_rate": 7.898820608317815e-05, + "loss": 3.8534, + "step": 5090 + }, + { + "epoch": 0.23702772539981842, + "grad_norm": 0.5788214932157731, + "learning_rate": 7.900372439478584e-05, + "loss": 3.8347, + "step": 5091 + }, + { + "epoch": 0.23707428358591148, + "grad_norm": 0.5740002858847331, + "learning_rate": 7.901924270639355e-05, + "loss": 3.8274, + "step": 5092 + }, + { + "epoch": 0.23712084177200457, + "grad_norm": 0.5633016400276045, + "learning_rate": 7.903476101800125e-05, + "loss": 3.9051, + "step": 5093 + }, + { + "epoch": 0.23716739995809763, + "grad_norm": 0.5605638758774811, + "learning_rate": 7.905027932960895e-05, + "loss": 3.9842, + "step": 5094 + }, + { + "epoch": 0.2372139581441907, + "grad_norm": 0.6247170531711154, + "learning_rate": 7.906579764121664e-05, + "loss": 3.8348, + "step": 5095 + }, + { + "epoch": 0.23726051633028378, + "grad_norm": 0.6630195539673235, + "learning_rate": 7.908131595282434e-05, + "loss": 3.9183, + "step": 5096 + }, + { + "epoch": 0.23730707451637684, + "grad_norm": 0.5928642098580686, + "learning_rate": 7.909683426443204e-05, + "loss": 3.8207, + "step": 5097 + }, + { + "epoch": 0.23735363270246992, + "grad_norm": 0.5994116568596918, + "learning_rate": 7.911235257603973e-05, + "loss": 3.8632, + "step": 5098 + }, + { + "epoch": 0.23740019088856298, + "grad_norm": 0.5990106692474307, + "learning_rate": 7.912787088764743e-05, + "loss": 3.7719, + "step": 5099 + }, + { + "epoch": 0.23744674907465604, + "grad_norm": 0.5940941158140076, + "learning_rate": 7.914338919925512e-05, + "loss": 3.7933, + "step": 5100 + }, + { + "epoch": 0.23749330726074913, + "grad_norm": 0.6290338968465286, + "learning_rate": 7.915890751086282e-05, + "loss": 3.8562, + "step": 5101 + }, + { + 
"epoch": 0.2375398654468422, + "grad_norm": 0.6525494619332897, + "learning_rate": 7.917442582247052e-05, + "loss": 3.9022, + "step": 5102 + }, + { + "epoch": 0.23758642363293525, + "grad_norm": 0.692300217148437, + "learning_rate": 7.918994413407821e-05, + "loss": 3.7896, + "step": 5103 + }, + { + "epoch": 0.23763298181902834, + "grad_norm": 0.7723876425733669, + "learning_rate": 7.920546244568592e-05, + "loss": 3.7989, + "step": 5104 + }, + { + "epoch": 0.2376795400051214, + "grad_norm": 0.7972247817083251, + "learning_rate": 7.92209807572936e-05, + "loss": 3.7035, + "step": 5105 + }, + { + "epoch": 0.23772609819121446, + "grad_norm": 0.629331439740094, + "learning_rate": 7.92364990689013e-05, + "loss": 3.8721, + "step": 5106 + }, + { + "epoch": 0.23777265637730755, + "grad_norm": 0.5704771531351465, + "learning_rate": 7.9252017380509e-05, + "loss": 3.9029, + "step": 5107 + }, + { + "epoch": 0.2378192145634006, + "grad_norm": 0.6304245483987524, + "learning_rate": 7.92675356921167e-05, + "loss": 3.809, + "step": 5108 + }, + { + "epoch": 0.2378657727494937, + "grad_norm": 0.6791150249223374, + "learning_rate": 7.92830540037244e-05, + "loss": 3.9071, + "step": 5109 + }, + { + "epoch": 0.23791233093558675, + "grad_norm": 0.6041240564470369, + "learning_rate": 7.92985723153321e-05, + "loss": 3.9087, + "step": 5110 + }, + { + "epoch": 0.2379588891216798, + "grad_norm": 0.7704503845093684, + "learning_rate": 7.93140906269398e-05, + "loss": 3.8221, + "step": 5111 + }, + { + "epoch": 0.2380054473077729, + "grad_norm": 0.6892031042540446, + "learning_rate": 7.93296089385475e-05, + "loss": 3.8256, + "step": 5112 + }, + { + "epoch": 0.23805200549386596, + "grad_norm": 0.682921856919118, + "learning_rate": 7.934512725015518e-05, + "loss": 3.8556, + "step": 5113 + }, + { + "epoch": 0.23809856367995902, + "grad_norm": 0.6837519773854762, + "learning_rate": 7.936064556176289e-05, + "loss": 3.8921, + "step": 5114 + }, + { + "epoch": 0.2381451218660521, + "grad_norm": 0.7005527949283196, + "learning_rate": 7.937616387337058e-05, + "loss": 3.7695, + "step": 5115 + }, + { + "epoch": 0.23819168005214517, + "grad_norm": 0.6288224524431961, + "learning_rate": 7.939168218497828e-05, + "loss": 3.8551, + "step": 5116 + }, + { + "epoch": 0.23823823823823823, + "grad_norm": 0.6855395254814742, + "learning_rate": 7.940720049658598e-05, + "loss": 3.9409, + "step": 5117 + }, + { + "epoch": 0.2382847964243313, + "grad_norm": 0.6930730772290573, + "learning_rate": 7.942271880819367e-05, + "loss": 3.9585, + "step": 5118 + }, + { + "epoch": 0.23833135461042437, + "grad_norm": 0.7276506641506019, + "learning_rate": 7.943823711980137e-05, + "loss": 3.9076, + "step": 5119 + }, + { + "epoch": 0.23837791279651746, + "grad_norm": 0.7875880184560254, + "learning_rate": 7.945375543140908e-05, + "loss": 3.8295, + "step": 5120 + }, + { + "epoch": 0.23842447098261052, + "grad_norm": 0.7345931083908674, + "learning_rate": 7.946927374301676e-05, + "loss": 3.8297, + "step": 5121 + }, + { + "epoch": 0.23847102916870358, + "grad_norm": 0.59112156146063, + "learning_rate": 7.948479205462446e-05, + "loss": 3.7045, + "step": 5122 + }, + { + "epoch": 0.23851758735479667, + "grad_norm": 0.6137394837311684, + "learning_rate": 7.950031036623215e-05, + "loss": 3.823, + "step": 5123 + }, + { + "epoch": 0.23856414554088973, + "grad_norm": 0.6759051690459438, + "learning_rate": 7.951582867783985e-05, + "loss": 3.7199, + "step": 5124 + }, + { + "epoch": 0.2386107037269828, + "grad_norm": 0.5427804829673396, + "learning_rate": 7.953134698944756e-05, 
+ "loss": 3.9651, + "step": 5125 + }, + { + "epoch": 0.23865726191307587, + "grad_norm": 0.5696921190313292, + "learning_rate": 7.954686530105526e-05, + "loss": 3.7854, + "step": 5126 + }, + { + "epoch": 0.23870382009916893, + "grad_norm": 0.47617688338630637, + "learning_rate": 7.956238361266295e-05, + "loss": 3.8531, + "step": 5127 + }, + { + "epoch": 0.238750378285262, + "grad_norm": 0.6333700620929081, + "learning_rate": 7.957790192427065e-05, + "loss": 3.5941, + "step": 5128 + }, + { + "epoch": 0.23879693647135508, + "grad_norm": 0.6461834734479417, + "learning_rate": 7.959342023587833e-05, + "loss": 3.8218, + "step": 5129 + }, + { + "epoch": 0.23884349465744814, + "grad_norm": 0.7744667584726243, + "learning_rate": 7.960893854748604e-05, + "loss": 3.7939, + "step": 5130 + }, + { + "epoch": 0.23889005284354123, + "grad_norm": 0.7964759642987559, + "learning_rate": 7.962445685909374e-05, + "loss": 3.9735, + "step": 5131 + }, + { + "epoch": 0.2389366110296343, + "grad_norm": 0.7410509880324447, + "learning_rate": 7.963997517070143e-05, + "loss": 3.9345, + "step": 5132 + }, + { + "epoch": 0.23898316921572735, + "grad_norm": 0.6243974255222386, + "learning_rate": 7.965549348230913e-05, + "loss": 3.7599, + "step": 5133 + }, + { + "epoch": 0.23902972740182044, + "grad_norm": 0.6648064771194692, + "learning_rate": 7.967101179391683e-05, + "loss": 3.7606, + "step": 5134 + }, + { + "epoch": 0.2390762855879135, + "grad_norm": 0.6734669900438673, + "learning_rate": 7.968653010552452e-05, + "loss": 3.7426, + "step": 5135 + }, + { + "epoch": 0.23912284377400655, + "grad_norm": 0.5939158547564497, + "learning_rate": 7.970204841713222e-05, + "loss": 3.9435, + "step": 5136 + }, + { + "epoch": 0.23916940196009964, + "grad_norm": 0.5546190405403599, + "learning_rate": 7.971756672873991e-05, + "loss": 3.856, + "step": 5137 + }, + { + "epoch": 0.2392159601461927, + "grad_norm": 0.5403492076369478, + "learning_rate": 7.973308504034761e-05, + "loss": 3.891, + "step": 5138 + }, + { + "epoch": 0.23926251833228576, + "grad_norm": 0.5400696910060986, + "learning_rate": 7.974860335195531e-05, + "loss": 3.8243, + "step": 5139 + }, + { + "epoch": 0.23930907651837885, + "grad_norm": 0.5817563777062027, + "learning_rate": 7.9764121663563e-05, + "loss": 3.8168, + "step": 5140 + }, + { + "epoch": 0.2393556347044719, + "grad_norm": 0.5341389346594743, + "learning_rate": 7.97796399751707e-05, + "loss": 3.9253, + "step": 5141 + }, + { + "epoch": 0.239402192890565, + "grad_norm": 0.6221741659302759, + "learning_rate": 7.979515828677841e-05, + "loss": 3.8066, + "step": 5142 + }, + { + "epoch": 0.23944875107665806, + "grad_norm": 0.6215885911594068, + "learning_rate": 7.98106765983861e-05, + "loss": 3.8156, + "step": 5143 + }, + { + "epoch": 0.23949530926275112, + "grad_norm": 0.6530092337001187, + "learning_rate": 7.98261949099938e-05, + "loss": 3.7839, + "step": 5144 + }, + { + "epoch": 0.2395418674488442, + "grad_norm": 0.5775314015703276, + "learning_rate": 7.984171322160149e-05, + "loss": 3.7591, + "step": 5145 + }, + { + "epoch": 0.23958842563493726, + "grad_norm": 0.5704507325580038, + "learning_rate": 7.985723153320918e-05, + "loss": 3.8136, + "step": 5146 + }, + { + "epoch": 0.23963498382103032, + "grad_norm": 0.5935542922508187, + "learning_rate": 7.987274984481689e-05, + "loss": 3.89, + "step": 5147 + }, + { + "epoch": 0.2396815420071234, + "grad_norm": 0.6510978643846916, + "learning_rate": 7.988826815642459e-05, + "loss": 3.8038, + "step": 5148 + }, + { + "epoch": 0.23972810019321647, + "grad_norm": 
0.6922479534377696, + "learning_rate": 7.990378646803228e-05, + "loss": 3.8703, + "step": 5149 + }, + { + "epoch": 0.23977465837930953, + "grad_norm": 0.6398902757494422, + "learning_rate": 7.991930477963998e-05, + "loss": 3.8663, + "step": 5150 + }, + { + "epoch": 0.23982121656540262, + "grad_norm": 0.6195088804948604, + "learning_rate": 7.993482309124768e-05, + "loss": 3.7944, + "step": 5151 + }, + { + "epoch": 0.23986777475149568, + "grad_norm": 0.5459591839729945, + "learning_rate": 7.995034140285537e-05, + "loss": 3.88, + "step": 5152 + }, + { + "epoch": 0.23991433293758876, + "grad_norm": 0.5641096590746123, + "learning_rate": 7.996585971446307e-05, + "loss": 3.7438, + "step": 5153 + }, + { + "epoch": 0.23996089112368182, + "grad_norm": 0.619547291616166, + "learning_rate": 7.998137802607077e-05, + "loss": 3.8926, + "step": 5154 + }, + { + "epoch": 0.24000744930977488, + "grad_norm": 0.5930452973755335, + "learning_rate": 7.999689633767846e-05, + "loss": 3.9215, + "step": 5155 + }, + { + "epoch": 0.24005400749586797, + "grad_norm": 0.6445724070529516, + "learning_rate": 8.001241464928616e-05, + "loss": 3.8997, + "step": 5156 + }, + { + "epoch": 0.24010056568196103, + "grad_norm": 0.6492111373008362, + "learning_rate": 8.002793296089385e-05, + "loss": 3.8768, + "step": 5157 + }, + { + "epoch": 0.2401471238680541, + "grad_norm": 0.5239071443302878, + "learning_rate": 8.004345127250155e-05, + "loss": 3.7771, + "step": 5158 + }, + { + "epoch": 0.24019368205414718, + "grad_norm": 0.5517517416012373, + "learning_rate": 8.005896958410926e-05, + "loss": 3.7708, + "step": 5159 + }, + { + "epoch": 0.24024024024024024, + "grad_norm": 0.5594753629381888, + "learning_rate": 8.007448789571696e-05, + "loss": 3.7841, + "step": 5160 + }, + { + "epoch": 0.2402867984263333, + "grad_norm": 0.5608441818916047, + "learning_rate": 8.009000620732464e-05, + "loss": 3.689, + "step": 5161 + }, + { + "epoch": 0.24033335661242639, + "grad_norm": 0.5117537088749302, + "learning_rate": 8.010552451893234e-05, + "loss": 3.8149, + "step": 5162 + }, + { + "epoch": 0.24037991479851945, + "grad_norm": 0.5177291953931638, + "learning_rate": 8.012104283054003e-05, + "loss": 3.8635, + "step": 5163 + }, + { + "epoch": 0.24042647298461253, + "grad_norm": 0.516638171852876, + "learning_rate": 8.013656114214774e-05, + "loss": 3.6893, + "step": 5164 + }, + { + "epoch": 0.2404730311707056, + "grad_norm": 0.594669853655644, + "learning_rate": 8.015207945375544e-05, + "loss": 3.9558, + "step": 5165 + }, + { + "epoch": 0.24051958935679865, + "grad_norm": 0.6713214621356135, + "learning_rate": 8.016759776536313e-05, + "loss": 3.7855, + "step": 5166 + }, + { + "epoch": 0.24056614754289174, + "grad_norm": 0.6141389325029473, + "learning_rate": 8.018311607697083e-05, + "loss": 3.909, + "step": 5167 + }, + { + "epoch": 0.2406127057289848, + "grad_norm": 0.6389198779613956, + "learning_rate": 8.019863438857853e-05, + "loss": 3.8374, + "step": 5168 + }, + { + "epoch": 0.24065926391507786, + "grad_norm": 0.755250021221719, + "learning_rate": 8.021415270018622e-05, + "loss": 3.8699, + "step": 5169 + }, + { + "epoch": 0.24070582210117095, + "grad_norm": 1.0092623703058659, + "learning_rate": 8.022967101179392e-05, + "loss": 3.8473, + "step": 5170 + }, + { + "epoch": 0.240752380287264, + "grad_norm": 0.934865765303248, + "learning_rate": 8.024518932340162e-05, + "loss": 3.8348, + "step": 5171 + }, + { + "epoch": 0.24079893847335707, + "grad_norm": 0.8108130765211936, + "learning_rate": 8.026070763500931e-05, + "loss": 3.7193, + "step": 5172 + 
}, + { + "epoch": 0.24084549665945015, + "grad_norm": 0.6489523986609993, + "learning_rate": 8.027622594661701e-05, + "loss": 3.8868, + "step": 5173 + }, + { + "epoch": 0.2408920548455432, + "grad_norm": 0.69555594225151, + "learning_rate": 8.02917442582247e-05, + "loss": 3.8353, + "step": 5174 + }, + { + "epoch": 0.2409386130316363, + "grad_norm": 0.7454512217613263, + "learning_rate": 8.030726256983242e-05, + "loss": 3.7375, + "step": 5175 + }, + { + "epoch": 0.24098517121772936, + "grad_norm": 0.6401012816050095, + "learning_rate": 8.032278088144011e-05, + "loss": 3.8988, + "step": 5176 + }, + { + "epoch": 0.24103172940382242, + "grad_norm": 0.7542646405859342, + "learning_rate": 8.033829919304781e-05, + "loss": 3.8512, + "step": 5177 + }, + { + "epoch": 0.2410782875899155, + "grad_norm": 0.7407684390779842, + "learning_rate": 8.035381750465549e-05, + "loss": 3.7611, + "step": 5178 + }, + { + "epoch": 0.24112484577600857, + "grad_norm": 0.737213973982457, + "learning_rate": 8.036933581626319e-05, + "loss": 3.9152, + "step": 5179 + }, + { + "epoch": 0.24117140396210163, + "grad_norm": 0.6869287469141039, + "learning_rate": 8.03848541278709e-05, + "loss": 3.8454, + "step": 5180 + }, + { + "epoch": 0.24121796214819471, + "grad_norm": 0.652779477139148, + "learning_rate": 8.040037243947859e-05, + "loss": 3.8121, + "step": 5181 + }, + { + "epoch": 0.24126452033428777, + "grad_norm": 0.7255946502777748, + "learning_rate": 8.041589075108629e-05, + "loss": 3.8844, + "step": 5182 + }, + { + "epoch": 0.24131107852038083, + "grad_norm": 0.7324852013629598, + "learning_rate": 8.043140906269399e-05, + "loss": 3.8528, + "step": 5183 + }, + { + "epoch": 0.24135763670647392, + "grad_norm": 0.7362061406771019, + "learning_rate": 8.044692737430168e-05, + "loss": 4.0217, + "step": 5184 + }, + { + "epoch": 0.24140419489256698, + "grad_norm": 0.711555554707707, + "learning_rate": 8.046244568590938e-05, + "loss": 3.781, + "step": 5185 + }, + { + "epoch": 0.24145075307866007, + "grad_norm": 0.641861639655367, + "learning_rate": 8.047796399751707e-05, + "loss": 3.7507, + "step": 5186 + }, + { + "epoch": 0.24149731126475313, + "grad_norm": 0.7519647093101873, + "learning_rate": 8.049348230912477e-05, + "loss": 3.7804, + "step": 5187 + }, + { + "epoch": 0.2415438694508462, + "grad_norm": 0.7782782716104658, + "learning_rate": 8.050900062073247e-05, + "loss": 3.7599, + "step": 5188 + }, + { + "epoch": 0.24159042763693928, + "grad_norm": 0.7190354379981584, + "learning_rate": 8.052451893234016e-05, + "loss": 3.7848, + "step": 5189 + }, + { + "epoch": 0.24163698582303234, + "grad_norm": 0.6428967582620595, + "learning_rate": 8.054003724394786e-05, + "loss": 3.7644, + "step": 5190 + }, + { + "epoch": 0.2416835440091254, + "grad_norm": 0.6229575661813322, + "learning_rate": 8.055555555555556e-05, + "loss": 3.8392, + "step": 5191 + }, + { + "epoch": 0.24173010219521848, + "grad_norm": 0.6115168351253194, + "learning_rate": 8.057107386716327e-05, + "loss": 3.9717, + "step": 5192 + }, + { + "epoch": 0.24177666038131154, + "grad_norm": 0.6425611505608039, + "learning_rate": 8.058659217877096e-05, + "loss": 3.8611, + "step": 5193 + }, + { + "epoch": 0.2418232185674046, + "grad_norm": 0.6989134577433395, + "learning_rate": 8.060211049037864e-05, + "loss": 3.8917, + "step": 5194 + }, + { + "epoch": 0.2418697767534977, + "grad_norm": 0.7776351301457228, + "learning_rate": 8.061762880198634e-05, + "loss": 3.7599, + "step": 5195 + }, + { + "epoch": 0.24191633493959075, + "grad_norm": 0.7476211005471878, + "learning_rate": 
8.063314711359404e-05, + "loss": 3.8729, + "step": 5196 + }, + { + "epoch": 0.24196289312568384, + "grad_norm": 0.5235415380037249, + "learning_rate": 8.064866542520175e-05, + "loss": 3.7917, + "step": 5197 + }, + { + "epoch": 0.2420094513117769, + "grad_norm": 0.6184018036591437, + "learning_rate": 8.066418373680944e-05, + "loss": 3.7008, + "step": 5198 + }, + { + "epoch": 0.24205600949786996, + "grad_norm": 0.6734572065285627, + "learning_rate": 8.067970204841714e-05, + "loss": 3.9192, + "step": 5199 + }, + { + "epoch": 0.24210256768396304, + "grad_norm": 0.6332402536611814, + "learning_rate": 8.069522036002484e-05, + "loss": 3.8721, + "step": 5200 + }, + { + "epoch": 0.2421491258700561, + "grad_norm": 0.6872263888634866, + "learning_rate": 8.071073867163253e-05, + "loss": 3.7661, + "step": 5201 + }, + { + "epoch": 0.24219568405614916, + "grad_norm": 0.6581489638918209, + "learning_rate": 8.072625698324023e-05, + "loss": 3.8895, + "step": 5202 + }, + { + "epoch": 0.24224224224224225, + "grad_norm": 0.6414158278866517, + "learning_rate": 8.074177529484793e-05, + "loss": 3.8562, + "step": 5203 + }, + { + "epoch": 0.2422888004283353, + "grad_norm": 0.6436382121129925, + "learning_rate": 8.075729360645562e-05, + "loss": 3.8718, + "step": 5204 + }, + { + "epoch": 0.24233535861442837, + "grad_norm": 0.6804927920431455, + "learning_rate": 8.077281191806332e-05, + "loss": 3.7904, + "step": 5205 + }, + { + "epoch": 0.24238191680052146, + "grad_norm": 0.6897488953131826, + "learning_rate": 8.078833022967101e-05, + "loss": 3.8006, + "step": 5206 + }, + { + "epoch": 0.24242847498661452, + "grad_norm": 0.7237414846040733, + "learning_rate": 8.080384854127871e-05, + "loss": 3.863, + "step": 5207 + }, + { + "epoch": 0.2424750331727076, + "grad_norm": 0.6635941830381488, + "learning_rate": 8.081936685288641e-05, + "loss": 3.8901, + "step": 5208 + }, + { + "epoch": 0.24252159135880066, + "grad_norm": 0.6272622015006043, + "learning_rate": 8.083488516449412e-05, + "loss": 3.8242, + "step": 5209 + }, + { + "epoch": 0.24256814954489372, + "grad_norm": 0.5410865067127792, + "learning_rate": 8.08504034761018e-05, + "loss": 3.75, + "step": 5210 + }, + { + "epoch": 0.2426147077309868, + "grad_norm": 0.5972130601596288, + "learning_rate": 8.08659217877095e-05, + "loss": 3.8216, + "step": 5211 + }, + { + "epoch": 0.24266126591707987, + "grad_norm": 0.6672919811910142, + "learning_rate": 8.088144009931719e-05, + "loss": 3.8042, + "step": 5212 + }, + { + "epoch": 0.24270782410317293, + "grad_norm": 0.6921478963652785, + "learning_rate": 8.089695841092489e-05, + "loss": 3.8143, + "step": 5213 + }, + { + "epoch": 0.24275438228926602, + "grad_norm": 0.6523301749676081, + "learning_rate": 8.09124767225326e-05, + "loss": 3.8304, + "step": 5214 + }, + { + "epoch": 0.24280094047535908, + "grad_norm": 0.6200810419401674, + "learning_rate": 8.09279950341403e-05, + "loss": 3.9125, + "step": 5215 + }, + { + "epoch": 0.24284749866145214, + "grad_norm": 0.5793185850174818, + "learning_rate": 8.094351334574799e-05, + "loss": 3.7741, + "step": 5216 + }, + { + "epoch": 0.24289405684754523, + "grad_norm": 0.60199779042255, + "learning_rate": 8.095903165735569e-05, + "loss": 3.7469, + "step": 5217 + }, + { + "epoch": 0.24294061503363829, + "grad_norm": 0.6522472037038053, + "learning_rate": 8.097454996896337e-05, + "loss": 3.7594, + "step": 5218 + }, + { + "epoch": 0.24298717321973137, + "grad_norm": 0.6674011690857266, + "learning_rate": 8.099006828057108e-05, + "loss": 3.8631, + "step": 5219 + }, + { + "epoch": 
0.24303373140582443, + "grad_norm": 0.5394753119630503, + "learning_rate": 8.100558659217878e-05, + "loss": 3.7853, + "step": 5220 + }, + { + "epoch": 0.2430802895919175, + "grad_norm": 0.5480374113220217, + "learning_rate": 8.102110490378647e-05, + "loss": 3.693, + "step": 5221 + }, + { + "epoch": 0.24312684777801058, + "grad_norm": 0.6457790499813408, + "learning_rate": 8.103662321539417e-05, + "loss": 3.9399, + "step": 5222 + }, + { + "epoch": 0.24317340596410364, + "grad_norm": 0.7209742244241036, + "learning_rate": 8.105214152700186e-05, + "loss": 3.8019, + "step": 5223 + }, + { + "epoch": 0.2432199641501967, + "grad_norm": 0.7386803041113447, + "learning_rate": 8.106765983860956e-05, + "loss": 3.8264, + "step": 5224 + }, + { + "epoch": 0.2432665223362898, + "grad_norm": 0.5720216068266609, + "learning_rate": 8.108317815021727e-05, + "loss": 3.7782, + "step": 5225 + }, + { + "epoch": 0.24331308052238285, + "grad_norm": 0.6490626509810304, + "learning_rate": 8.109869646182495e-05, + "loss": 3.8123, + "step": 5226 + }, + { + "epoch": 0.2433596387084759, + "grad_norm": 0.6042513689501637, + "learning_rate": 8.111421477343265e-05, + "loss": 3.8123, + "step": 5227 + }, + { + "epoch": 0.243406196894569, + "grad_norm": 0.6725266744221302, + "learning_rate": 8.112973308504035e-05, + "loss": 3.8532, + "step": 5228 + }, + { + "epoch": 0.24345275508066205, + "grad_norm": 0.7527666488724972, + "learning_rate": 8.114525139664804e-05, + "loss": 3.9059, + "step": 5229 + }, + { + "epoch": 0.24349931326675514, + "grad_norm": 0.5704125396300395, + "learning_rate": 8.116076970825575e-05, + "loss": 3.7462, + "step": 5230 + }, + { + "epoch": 0.2435458714528482, + "grad_norm": 0.6898464219262415, + "learning_rate": 8.117628801986345e-05, + "loss": 3.7712, + "step": 5231 + }, + { + "epoch": 0.24359242963894126, + "grad_norm": 0.5931064442694262, + "learning_rate": 8.119180633147115e-05, + "loss": 3.7842, + "step": 5232 + }, + { + "epoch": 0.24363898782503435, + "grad_norm": 0.6477853404292175, + "learning_rate": 8.120732464307884e-05, + "loss": 3.8038, + "step": 5233 + }, + { + "epoch": 0.2436855460111274, + "grad_norm": 0.5873421880051148, + "learning_rate": 8.122284295468652e-05, + "loss": 3.74, + "step": 5234 + }, + { + "epoch": 0.24373210419722047, + "grad_norm": 0.5694885386285383, + "learning_rate": 8.123836126629422e-05, + "loss": 3.8715, + "step": 5235 + }, + { + "epoch": 0.24377866238331355, + "grad_norm": 0.6001480474696533, + "learning_rate": 8.125387957790193e-05, + "loss": 3.7203, + "step": 5236 + }, + { + "epoch": 0.24382522056940661, + "grad_norm": 0.6108738967223718, + "learning_rate": 8.126939788950963e-05, + "loss": 3.7063, + "step": 5237 + }, + { + "epoch": 0.24387177875549967, + "grad_norm": 0.6129685948329445, + "learning_rate": 8.128491620111732e-05, + "loss": 3.7051, + "step": 5238 + }, + { + "epoch": 0.24391833694159276, + "grad_norm": 0.6383164191444731, + "learning_rate": 8.130043451272502e-05, + "loss": 3.7069, + "step": 5239 + }, + { + "epoch": 0.24396489512768582, + "grad_norm": 0.5617980243700876, + "learning_rate": 8.131595282433272e-05, + "loss": 3.7775, + "step": 5240 + }, + { + "epoch": 0.2440114533137789, + "grad_norm": 0.5765265641809433, + "learning_rate": 8.133147113594041e-05, + "loss": 3.7931, + "step": 5241 + }, + { + "epoch": 0.24405801149987197, + "grad_norm": 0.6706154185229342, + "learning_rate": 8.134698944754811e-05, + "loss": 3.8246, + "step": 5242 + }, + { + "epoch": 0.24410456968596503, + "grad_norm": 0.759959154640195, + "learning_rate": 
8.13625077591558e-05, + "loss": 3.8309, + "step": 5243 + }, + { + "epoch": 0.24415112787205812, + "grad_norm": 0.782617413571973, + "learning_rate": 8.13780260707635e-05, + "loss": 3.7497, + "step": 5244 + }, + { + "epoch": 0.24419768605815118, + "grad_norm": 0.651880088896546, + "learning_rate": 8.13935443823712e-05, + "loss": 3.7791, + "step": 5245 + }, + { + "epoch": 0.24424424424424424, + "grad_norm": 0.6518956121657196, + "learning_rate": 8.14090626939789e-05, + "loss": 3.8482, + "step": 5246 + }, + { + "epoch": 0.24429080243033732, + "grad_norm": 0.7238162799221649, + "learning_rate": 8.14245810055866e-05, + "loss": 3.8295, + "step": 5247 + }, + { + "epoch": 0.24433736061643038, + "grad_norm": 0.6228475830686152, + "learning_rate": 8.14400993171943e-05, + "loss": 3.6681, + "step": 5248 + }, + { + "epoch": 0.24438391880252344, + "grad_norm": 0.569330218210861, + "learning_rate": 8.1455617628802e-05, + "loss": 3.7307, + "step": 5249 + }, + { + "epoch": 0.24443047698861653, + "grad_norm": 0.5997748330091522, + "learning_rate": 8.147113594040968e-05, + "loss": 3.7787, + "step": 5250 + }, + { + "epoch": 0.2444770351747096, + "grad_norm": 0.5640620829539199, + "learning_rate": 8.148665425201738e-05, + "loss": 3.849, + "step": 5251 + }, + { + "epoch": 0.24452359336080268, + "grad_norm": 0.6809929843758635, + "learning_rate": 8.150217256362508e-05, + "loss": 3.8931, + "step": 5252 + }, + { + "epoch": 0.24457015154689574, + "grad_norm": 0.675635541907085, + "learning_rate": 8.151769087523278e-05, + "loss": 3.7978, + "step": 5253 + }, + { + "epoch": 0.2446167097329888, + "grad_norm": 0.716495883255713, + "learning_rate": 8.153320918684048e-05, + "loss": 3.8443, + "step": 5254 + }, + { + "epoch": 0.24466326791908188, + "grad_norm": 0.889264779367484, + "learning_rate": 8.154872749844817e-05, + "loss": 3.7147, + "step": 5255 + }, + { + "epoch": 0.24470982610517494, + "grad_norm": 0.7973881700710833, + "learning_rate": 8.156424581005587e-05, + "loss": 3.7339, + "step": 5256 + }, + { + "epoch": 0.244756384291268, + "grad_norm": 0.6823422367472698, + "learning_rate": 8.157976412166357e-05, + "loss": 3.9091, + "step": 5257 + }, + { + "epoch": 0.2448029424773611, + "grad_norm": 0.7906434384666372, + "learning_rate": 8.159528243327126e-05, + "loss": 3.8153, + "step": 5258 + }, + { + "epoch": 0.24484950066345415, + "grad_norm": 0.7609703253283854, + "learning_rate": 8.161080074487896e-05, + "loss": 3.8895, + "step": 5259 + }, + { + "epoch": 0.2448960588495472, + "grad_norm": 0.7160645720115908, + "learning_rate": 8.162631905648666e-05, + "loss": 3.8776, + "step": 5260 + }, + { + "epoch": 0.2449426170356403, + "grad_norm": 0.900393296158135, + "learning_rate": 8.164183736809435e-05, + "loss": 3.7954, + "step": 5261 + }, + { + "epoch": 0.24498917522173336, + "grad_norm": 0.7829372515420382, + "learning_rate": 8.165735567970205e-05, + "loss": 3.821, + "step": 5262 + }, + { + "epoch": 0.24503573340782644, + "grad_norm": 0.6056097594672414, + "learning_rate": 8.167287399130974e-05, + "loss": 3.6872, + "step": 5263 + }, + { + "epoch": 0.2450822915939195, + "grad_norm": 0.696747102069845, + "learning_rate": 8.168839230291745e-05, + "loss": 3.7906, + "step": 5264 + }, + { + "epoch": 0.24512884978001256, + "grad_norm": 0.761527997042721, + "learning_rate": 8.170391061452515e-05, + "loss": 3.817, + "step": 5265 + }, + { + "epoch": 0.24517540796610565, + "grad_norm": 0.7742422232036221, + "learning_rate": 8.171942892613285e-05, + "loss": 3.8495, + "step": 5266 + }, + { + "epoch": 0.2452219661521987, + 
"grad_norm": 0.6299209282745973, + "learning_rate": 8.173494723774053e-05, + "loss": 3.7658, + "step": 5267 + }, + { + "epoch": 0.24526852433829177, + "grad_norm": 0.7298220467313593, + "learning_rate": 8.175046554934823e-05, + "loss": 3.8716, + "step": 5268 + }, + { + "epoch": 0.24531508252438486, + "grad_norm": 0.8813989285530145, + "learning_rate": 8.176598386095594e-05, + "loss": 3.8092, + "step": 5269 + }, + { + "epoch": 0.24536164071047792, + "grad_norm": 0.8671233414947846, + "learning_rate": 8.178150217256363e-05, + "loss": 3.8151, + "step": 5270 + }, + { + "epoch": 0.24540819889657098, + "grad_norm": 0.7993365570017731, + "learning_rate": 8.179702048417133e-05, + "loss": 3.7034, + "step": 5271 + }, + { + "epoch": 0.24545475708266407, + "grad_norm": 0.7072085405561004, + "learning_rate": 8.181253879577902e-05, + "loss": 3.8963, + "step": 5272 + }, + { + "epoch": 0.24550131526875713, + "grad_norm": 0.6847839987222821, + "learning_rate": 8.182805710738672e-05, + "loss": 3.6783, + "step": 5273 + }, + { + "epoch": 0.2455478734548502, + "grad_norm": 0.7338193005513264, + "learning_rate": 8.184357541899442e-05, + "loss": 3.7499, + "step": 5274 + }, + { + "epoch": 0.24559443164094327, + "grad_norm": 0.7562792168861691, + "learning_rate": 8.185909373060211e-05, + "loss": 3.9117, + "step": 5275 + }, + { + "epoch": 0.24564098982703633, + "grad_norm": 0.7674096436979747, + "learning_rate": 8.187461204220981e-05, + "loss": 3.8492, + "step": 5276 + }, + { + "epoch": 0.24568754801312942, + "grad_norm": 0.7096471500722802, + "learning_rate": 8.18901303538175e-05, + "loss": 3.8477, + "step": 5277 + }, + { + "epoch": 0.24573410619922248, + "grad_norm": 0.6264633960480602, + "learning_rate": 8.19056486654252e-05, + "loss": 3.8114, + "step": 5278 + }, + { + "epoch": 0.24578066438531554, + "grad_norm": 0.6358109354272817, + "learning_rate": 8.19211669770329e-05, + "loss": 3.8786, + "step": 5279 + }, + { + "epoch": 0.24582722257140863, + "grad_norm": 0.6838262392716569, + "learning_rate": 8.193668528864061e-05, + "loss": 3.7416, + "step": 5280 + }, + { + "epoch": 0.2458737807575017, + "grad_norm": 0.5331985951821749, + "learning_rate": 8.19522036002483e-05, + "loss": 3.7067, + "step": 5281 + }, + { + "epoch": 0.24592033894359475, + "grad_norm": 0.5503960991681165, + "learning_rate": 8.1967721911856e-05, + "loss": 3.7317, + "step": 5282 + }, + { + "epoch": 0.24596689712968783, + "grad_norm": 0.6506388902976566, + "learning_rate": 8.198324022346368e-05, + "loss": 3.7378, + "step": 5283 + }, + { + "epoch": 0.2460134553157809, + "grad_norm": 0.5703296720437633, + "learning_rate": 8.199875853507138e-05, + "loss": 3.8291, + "step": 5284 + }, + { + "epoch": 0.24606001350187395, + "grad_norm": 0.6229585157723274, + "learning_rate": 8.201427684667908e-05, + "loss": 3.7131, + "step": 5285 + }, + { + "epoch": 0.24610657168796704, + "grad_norm": 0.5725533228884503, + "learning_rate": 8.202979515828679e-05, + "loss": 3.7184, + "step": 5286 + }, + { + "epoch": 0.2461531298740601, + "grad_norm": 0.484060679695461, + "learning_rate": 8.204531346989448e-05, + "loss": 3.8473, + "step": 5287 + }, + { + "epoch": 0.2461996880601532, + "grad_norm": 0.5711290733708128, + "learning_rate": 8.206083178150218e-05, + "loss": 3.7388, + "step": 5288 + }, + { + "epoch": 0.24624624624624625, + "grad_norm": 0.5587073754921635, + "learning_rate": 8.207635009310988e-05, + "loss": 3.738, + "step": 5289 + }, + { + "epoch": 0.2462928044323393, + "grad_norm": 0.6278398943569964, + "learning_rate": 8.209186840471757e-05, + "loss": 3.7852, + 
"step": 5290 + }, + { + "epoch": 0.2463393626184324, + "grad_norm": 0.5841458273175782, + "learning_rate": 8.210738671632527e-05, + "loss": 3.8234, + "step": 5291 + }, + { + "epoch": 0.24638592080452545, + "grad_norm": 0.7264663682049127, + "learning_rate": 8.212290502793296e-05, + "loss": 3.814, + "step": 5292 + }, + { + "epoch": 0.24643247899061851, + "grad_norm": 0.6851241864463815, + "learning_rate": 8.213842333954066e-05, + "loss": 3.842, + "step": 5293 + }, + { + "epoch": 0.2464790371767116, + "grad_norm": 0.6216948444415361, + "learning_rate": 8.215394165114836e-05, + "loss": 3.8103, + "step": 5294 + }, + { + "epoch": 0.24652559536280466, + "grad_norm": 0.5992585348886761, + "learning_rate": 8.216945996275605e-05, + "loss": 3.7977, + "step": 5295 + }, + { + "epoch": 0.24657215354889772, + "grad_norm": 0.561505135134723, + "learning_rate": 8.218497827436375e-05, + "loss": 3.7663, + "step": 5296 + }, + { + "epoch": 0.2466187117349908, + "grad_norm": 0.5641496653475003, + "learning_rate": 8.220049658597146e-05, + "loss": 3.7604, + "step": 5297 + }, + { + "epoch": 0.24666526992108387, + "grad_norm": 0.6244567009875286, + "learning_rate": 8.221601489757916e-05, + "loss": 3.6249, + "step": 5298 + }, + { + "epoch": 0.24671182810717696, + "grad_norm": 0.5389727295073012, + "learning_rate": 8.223153320918684e-05, + "loss": 3.6975, + "step": 5299 + }, + { + "epoch": 0.24675838629327002, + "grad_norm": 0.5531314365148062, + "learning_rate": 8.224705152079453e-05, + "loss": 3.899, + "step": 5300 + }, + { + "epoch": 0.24680494447936308, + "grad_norm": 0.5757008696281499, + "learning_rate": 8.226256983240223e-05, + "loss": 3.9013, + "step": 5301 + }, + { + "epoch": 0.24685150266545616, + "grad_norm": 0.5670438297639412, + "learning_rate": 8.227808814400994e-05, + "loss": 3.8137, + "step": 5302 + }, + { + "epoch": 0.24689806085154922, + "grad_norm": 0.46517149307804595, + "learning_rate": 8.229360645561764e-05, + "loss": 3.7191, + "step": 5303 + }, + { + "epoch": 0.24694461903764228, + "grad_norm": 0.5237919210430582, + "learning_rate": 8.230912476722533e-05, + "loss": 3.7831, + "step": 5304 + }, + { + "epoch": 0.24699117722373537, + "grad_norm": 0.5784013561784148, + "learning_rate": 8.232464307883303e-05, + "loss": 3.7303, + "step": 5305 + }, + { + "epoch": 0.24703773540982843, + "grad_norm": 0.6013925729186959, + "learning_rate": 8.234016139044073e-05, + "loss": 3.6614, + "step": 5306 + }, + { + "epoch": 0.2470842935959215, + "grad_norm": 0.5995199543812986, + "learning_rate": 8.235567970204842e-05, + "loss": 3.8025, + "step": 5307 + }, + { + "epoch": 0.24713085178201458, + "grad_norm": 0.5684697485794333, + "learning_rate": 8.237119801365612e-05, + "loss": 3.7364, + "step": 5308 + }, + { + "epoch": 0.24717740996810764, + "grad_norm": 0.5643056025615883, + "learning_rate": 8.238671632526382e-05, + "loss": 3.7469, + "step": 5309 + }, + { + "epoch": 0.24722396815420072, + "grad_norm": 0.6671308090571836, + "learning_rate": 8.240223463687151e-05, + "loss": 3.8344, + "step": 5310 + }, + { + "epoch": 0.24727052634029378, + "grad_norm": 0.7341838801668158, + "learning_rate": 8.241775294847921e-05, + "loss": 3.8416, + "step": 5311 + }, + { + "epoch": 0.24731708452638684, + "grad_norm": 0.6222008858737951, + "learning_rate": 8.24332712600869e-05, + "loss": 3.7024, + "step": 5312 + }, + { + "epoch": 0.24736364271247993, + "grad_norm": 0.5205725002100196, + "learning_rate": 8.24487895716946e-05, + "loss": 3.8859, + "step": 5313 + }, + { + "epoch": 0.247410200898573, + "grad_norm": 0.6769908494495233, + 
"learning_rate": 8.246430788330231e-05, + "loss": 3.786, + "step": 5314 + }, + { + "epoch": 0.24745675908466605, + "grad_norm": 0.6515874364802794, + "learning_rate": 8.247982619490999e-05, + "loss": 3.8682, + "step": 5315 + }, + { + "epoch": 0.24750331727075914, + "grad_norm": 0.6626811942711096, + "learning_rate": 8.249534450651769e-05, + "loss": 3.7454, + "step": 5316 + }, + { + "epoch": 0.2475498754568522, + "grad_norm": 0.6240209747097138, + "learning_rate": 8.251086281812539e-05, + "loss": 3.7794, + "step": 5317 + }, + { + "epoch": 0.24759643364294526, + "grad_norm": 0.5818806530684192, + "learning_rate": 8.252638112973308e-05, + "loss": 3.7424, + "step": 5318 + }, + { + "epoch": 0.24764299182903834, + "grad_norm": 0.5915979064740574, + "learning_rate": 8.254189944134079e-05, + "loss": 3.7849, + "step": 5319 + }, + { + "epoch": 0.2476895500151314, + "grad_norm": 0.5972877138992789, + "learning_rate": 8.255741775294849e-05, + "loss": 3.8207, + "step": 5320 + }, + { + "epoch": 0.2477361082012245, + "grad_norm": 0.7382635691788574, + "learning_rate": 8.257293606455618e-05, + "loss": 3.7395, + "step": 5321 + }, + { + "epoch": 0.24778266638731755, + "grad_norm": 0.6916114096999022, + "learning_rate": 8.258845437616388e-05, + "loss": 3.812, + "step": 5322 + }, + { + "epoch": 0.2478292245734106, + "grad_norm": 0.6502314999083393, + "learning_rate": 8.260397268777156e-05, + "loss": 3.7089, + "step": 5323 + }, + { + "epoch": 0.2478757827595037, + "grad_norm": 0.6253518500313211, + "learning_rate": 8.261949099937927e-05, + "loss": 3.8654, + "step": 5324 + }, + { + "epoch": 0.24792234094559676, + "grad_norm": 0.5534288322603291, + "learning_rate": 8.263500931098697e-05, + "loss": 3.8396, + "step": 5325 + }, + { + "epoch": 0.24796889913168982, + "grad_norm": 0.6385450640647862, + "learning_rate": 8.265052762259467e-05, + "loss": 3.8473, + "step": 5326 + }, + { + "epoch": 0.2480154573177829, + "grad_norm": 0.6838107211479556, + "learning_rate": 8.266604593420236e-05, + "loss": 3.8354, + "step": 5327 + }, + { + "epoch": 0.24806201550387597, + "grad_norm": 0.7135674878955905, + "learning_rate": 8.268156424581006e-05, + "loss": 3.8673, + "step": 5328 + }, + { + "epoch": 0.24810857368996903, + "grad_norm": 0.5396303195066956, + "learning_rate": 8.269708255741775e-05, + "loss": 3.7037, + "step": 5329 + }, + { + "epoch": 0.2481551318760621, + "grad_norm": 0.6230339063208096, + "learning_rate": 8.271260086902546e-05, + "loss": 3.7196, + "step": 5330 + }, + { + "epoch": 0.24820169006215517, + "grad_norm": 0.8939379867721836, + "learning_rate": 8.272811918063315e-05, + "loss": 3.6326, + "step": 5331 + }, + { + "epoch": 0.24824824824824826, + "grad_norm": 0.9199916637088152, + "learning_rate": 8.274363749224084e-05, + "loss": 3.8029, + "step": 5332 + }, + { + "epoch": 0.24829480643434132, + "grad_norm": 0.9783603683275796, + "learning_rate": 8.275915580384854e-05, + "loss": 3.9032, + "step": 5333 + }, + { + "epoch": 0.24834136462043438, + "grad_norm": 0.9290251565426418, + "learning_rate": 8.277467411545624e-05, + "loss": 3.7365, + "step": 5334 + }, + { + "epoch": 0.24838792280652747, + "grad_norm": 0.8149411387216545, + "learning_rate": 8.279019242706395e-05, + "loss": 3.7707, + "step": 5335 + }, + { + "epoch": 0.24843448099262053, + "grad_norm": 0.715254652559341, + "learning_rate": 8.280571073867164e-05, + "loss": 3.747, + "step": 5336 + }, + { + "epoch": 0.2484810391787136, + "grad_norm": 0.6740084459068403, + "learning_rate": 8.282122905027934e-05, + "loss": 3.7641, + "step": 5337 + }, + { + "epoch": 
0.24852759736480667, + "grad_norm": 0.7060969479214538, + "learning_rate": 8.283674736188704e-05, + "loss": 3.7816, + "step": 5338 + }, + { + "epoch": 0.24857415555089973, + "grad_norm": 0.6136145908364962, + "learning_rate": 8.285226567349472e-05, + "loss": 3.7118, + "step": 5339 + }, + { + "epoch": 0.2486207137369928, + "grad_norm": 0.5973622085306052, + "learning_rate": 8.286778398510241e-05, + "loss": 3.7395, + "step": 5340 + }, + { + "epoch": 0.24866727192308588, + "grad_norm": 0.5958107897440892, + "learning_rate": 8.288330229671012e-05, + "loss": 3.8054, + "step": 5341 + }, + { + "epoch": 0.24871383010917894, + "grad_norm": 0.6529038189775598, + "learning_rate": 8.289882060831782e-05, + "loss": 3.8653, + "step": 5342 + }, + { + "epoch": 0.24876038829527203, + "grad_norm": 0.6271755172781193, + "learning_rate": 8.291433891992552e-05, + "loss": 3.8371, + "step": 5343 + }, + { + "epoch": 0.2488069464813651, + "grad_norm": 0.7886895727046136, + "learning_rate": 8.292985723153321e-05, + "loss": 3.7928, + "step": 5344 + }, + { + "epoch": 0.24885350466745815, + "grad_norm": 0.6123357626011128, + "learning_rate": 8.294537554314091e-05, + "loss": 3.6656, + "step": 5345 + }, + { + "epoch": 0.24890006285355123, + "grad_norm": 0.6581370320326306, + "learning_rate": 8.29608938547486e-05, + "loss": 3.7935, + "step": 5346 + }, + { + "epoch": 0.2489466210396443, + "grad_norm": 0.5627877972618783, + "learning_rate": 8.297641216635632e-05, + "loss": 3.8501, + "step": 5347 + }, + { + "epoch": 0.24899317922573735, + "grad_norm": 0.6394552703933281, + "learning_rate": 8.2991930477964e-05, + "loss": 3.7672, + "step": 5348 + }, + { + "epoch": 0.24903973741183044, + "grad_norm": 0.5927459153040937, + "learning_rate": 8.30074487895717e-05, + "loss": 3.8859, + "step": 5349 + }, + { + "epoch": 0.2490862955979235, + "grad_norm": 0.6834406677086319, + "learning_rate": 8.302296710117939e-05, + "loss": 3.8776, + "step": 5350 + }, + { + "epoch": 0.24913285378401656, + "grad_norm": 0.6633571284813686, + "learning_rate": 8.303848541278709e-05, + "loss": 3.8366, + "step": 5351 + }, + { + "epoch": 0.24917941197010965, + "grad_norm": 0.7363524209208808, + "learning_rate": 8.30540037243948e-05, + "loss": 3.8364, + "step": 5352 + }, + { + "epoch": 0.2492259701562027, + "grad_norm": 0.62451607661743, + "learning_rate": 8.30695220360025e-05, + "loss": 3.8578, + "step": 5353 + }, + { + "epoch": 0.2492725283422958, + "grad_norm": 0.6523906473899997, + "learning_rate": 8.308504034761019e-05, + "loss": 3.9871, + "step": 5354 + }, + { + "epoch": 0.24931908652838886, + "grad_norm": 0.7155209744698036, + "learning_rate": 8.310055865921789e-05, + "loss": 3.9103, + "step": 5355 + }, + { + "epoch": 0.24936564471448192, + "grad_norm": 0.7094141043981906, + "learning_rate": 8.311607697082557e-05, + "loss": 3.8353, + "step": 5356 + }, + { + "epoch": 0.249412202900575, + "grad_norm": 0.6042462972713295, + "learning_rate": 8.313159528243328e-05, + "loss": 3.8876, + "step": 5357 + }, + { + "epoch": 0.24945876108666806, + "grad_norm": 0.5963977330088336, + "learning_rate": 8.314711359404097e-05, + "loss": 3.9037, + "step": 5358 + }, + { + "epoch": 0.24950531927276112, + "grad_norm": 0.5819703278033204, + "learning_rate": 8.316263190564867e-05, + "loss": 3.7417, + "step": 5359 + }, + { + "epoch": 0.2495518774588542, + "grad_norm": 0.5831560695583818, + "learning_rate": 8.317815021725637e-05, + "loss": 3.6976, + "step": 5360 + }, + { + "epoch": 0.24959843564494727, + "grad_norm": 0.6373485715593733, + "learning_rate": 8.319366852886406e-05, 
+ "loss": 3.7979, + "step": 5361 + }, + { + "epoch": 0.24964499383104033, + "grad_norm": 0.7074952891772117, + "learning_rate": 8.320918684047176e-05, + "loss": 3.7982, + "step": 5362 + }, + { + "epoch": 0.24969155201713342, + "grad_norm": 0.7366514605956773, + "learning_rate": 8.322470515207946e-05, + "loss": 3.7896, + "step": 5363 + }, + { + "epoch": 0.24973811020322648, + "grad_norm": 0.8512005618173228, + "learning_rate": 8.324022346368715e-05, + "loss": 3.8104, + "step": 5364 + }, + { + "epoch": 0.24978466838931956, + "grad_norm": 0.7134888437515182, + "learning_rate": 8.325574177529485e-05, + "loss": 3.7226, + "step": 5365 + }, + { + "epoch": 0.24983122657541262, + "grad_norm": 0.5685579019726064, + "learning_rate": 8.327126008690255e-05, + "loss": 3.8232, + "step": 5366 + }, + { + "epoch": 0.24987778476150568, + "grad_norm": 0.8177420278088647, + "learning_rate": 8.328677839851024e-05, + "loss": 3.7574, + "step": 5367 + }, + { + "epoch": 0.24992434294759877, + "grad_norm": 0.8485708259586995, + "learning_rate": 8.330229671011794e-05, + "loss": 3.8183, + "step": 5368 + }, + { + "epoch": 0.24997090113369183, + "grad_norm": 0.5876235137482149, + "learning_rate": 8.331781502172565e-05, + "loss": 3.8156, + "step": 5369 + }, + { + "epoch": 0.2500174593197849, + "grad_norm": 0.7146191787051729, + "learning_rate": 8.333333333333334e-05, + "loss": 3.7059, + "step": 5370 + }, + { + "epoch": 0.250064017505878, + "grad_norm": 0.6911967016708044, + "learning_rate": 8.334885164494104e-05, + "loss": 3.7801, + "step": 5371 + }, + { + "epoch": 0.25011057569197104, + "grad_norm": 0.7215696732216518, + "learning_rate": 8.336436995654872e-05, + "loss": 3.7248, + "step": 5372 + }, + { + "epoch": 0.2501571338780641, + "grad_norm": 0.6465578914743518, + "learning_rate": 8.337988826815642e-05, + "loss": 3.8294, + "step": 5373 + }, + { + "epoch": 0.25020369206415716, + "grad_norm": 0.6669026970605137, + "learning_rate": 8.339540657976413e-05, + "loss": 3.7046, + "step": 5374 + }, + { + "epoch": 0.2502502502502503, + "grad_norm": 0.5983505610252182, + "learning_rate": 8.341092489137183e-05, + "loss": 3.729, + "step": 5375 + }, + { + "epoch": 0.25029680843634333, + "grad_norm": 0.5292792223233608, + "learning_rate": 8.342644320297952e-05, + "loss": 3.7372, + "step": 5376 + }, + { + "epoch": 0.2503433666224364, + "grad_norm": 0.5346844041059549, + "learning_rate": 8.344196151458722e-05, + "loss": 3.6961, + "step": 5377 + }, + { + "epoch": 0.25038992480852945, + "grad_norm": 0.5558739524048731, + "learning_rate": 8.345747982619491e-05, + "loss": 3.8316, + "step": 5378 + }, + { + "epoch": 0.2504364829946225, + "grad_norm": 0.6486891647865782, + "learning_rate": 8.347299813780261e-05, + "loss": 3.8227, + "step": 5379 + }, + { + "epoch": 0.2504830411807156, + "grad_norm": 0.5635305248911929, + "learning_rate": 8.348851644941031e-05, + "loss": 3.7863, + "step": 5380 + }, + { + "epoch": 0.2505295993668087, + "grad_norm": 0.545695265548656, + "learning_rate": 8.3504034761018e-05, + "loss": 3.6972, + "step": 5381 + }, + { + "epoch": 0.25057615755290175, + "grad_norm": 0.7017849441617905, + "learning_rate": 8.35195530726257e-05, + "loss": 3.8853, + "step": 5382 + }, + { + "epoch": 0.2506227157389948, + "grad_norm": 0.8676775730050947, + "learning_rate": 8.35350713842334e-05, + "loss": 3.6709, + "step": 5383 + }, + { + "epoch": 0.25066927392508787, + "grad_norm": 0.7928346165300011, + "learning_rate": 8.355058969584109e-05, + "loss": 3.7199, + "step": 5384 + }, + { + "epoch": 0.2507158321111809, + "grad_norm": 
0.7139236928280355, + "learning_rate": 8.35661080074488e-05, + "loss": 3.704, + "step": 5385 + }, + { + "epoch": 0.25076239029727404, + "grad_norm": 0.840151786315354, + "learning_rate": 8.35816263190565e-05, + "loss": 3.902, + "step": 5386 + }, + { + "epoch": 0.2508089484833671, + "grad_norm": 0.7590848013758161, + "learning_rate": 8.35971446306642e-05, + "loss": 3.87, + "step": 5387 + }, + { + "epoch": 0.25085550666946016, + "grad_norm": 0.7043519610156107, + "learning_rate": 8.361266294227188e-05, + "loss": 3.7316, + "step": 5388 + }, + { + "epoch": 0.2509020648555532, + "grad_norm": 0.6787021494694381, + "learning_rate": 8.362818125387957e-05, + "loss": 3.7709, + "step": 5389 + }, + { + "epoch": 0.2509486230416463, + "grad_norm": 0.7369324294255158, + "learning_rate": 8.364369956548727e-05, + "loss": 3.7832, + "step": 5390 + }, + { + "epoch": 0.2509951812277394, + "grad_norm": 0.703906565511497, + "learning_rate": 8.365921787709498e-05, + "loss": 3.7763, + "step": 5391 + }, + { + "epoch": 0.25104173941383245, + "grad_norm": 0.6347204492047702, + "learning_rate": 8.367473618870268e-05, + "loss": 3.9223, + "step": 5392 + }, + { + "epoch": 0.2510882975999255, + "grad_norm": 0.7325937263352456, + "learning_rate": 8.369025450031037e-05, + "loss": 3.7618, + "step": 5393 + }, + { + "epoch": 0.2511348557860186, + "grad_norm": 0.5771554483323443, + "learning_rate": 8.370577281191807e-05, + "loss": 3.6713, + "step": 5394 + }, + { + "epoch": 0.25118141397211163, + "grad_norm": 0.6179683731904798, + "learning_rate": 8.372129112352577e-05, + "loss": 3.742, + "step": 5395 + }, + { + "epoch": 0.2512279721582047, + "grad_norm": 0.6609868782167375, + "learning_rate": 8.373680943513346e-05, + "loss": 3.7428, + "step": 5396 + }, + { + "epoch": 0.2512745303442978, + "grad_norm": 0.5734750275449447, + "learning_rate": 8.375232774674116e-05, + "loss": 3.7875, + "step": 5397 + }, + { + "epoch": 0.25132108853039087, + "grad_norm": 0.5432329187128978, + "learning_rate": 8.376784605834885e-05, + "loss": 3.812, + "step": 5398 + }, + { + "epoch": 0.25136764671648393, + "grad_norm": 0.5823725423469626, + "learning_rate": 8.378336436995655e-05, + "loss": 3.8665, + "step": 5399 + }, + { + "epoch": 0.251414204902577, + "grad_norm": 0.6163820701944368, + "learning_rate": 8.379888268156425e-05, + "loss": 3.8349, + "step": 5400 + }, + { + "epoch": 0.25146076308867005, + "grad_norm": 0.7490182622137513, + "learning_rate": 8.381440099317194e-05, + "loss": 3.8068, + "step": 5401 + }, + { + "epoch": 0.25150732127476316, + "grad_norm": 0.8014748275425817, + "learning_rate": 8.382991930477965e-05, + "loss": 3.8629, + "step": 5402 + }, + { + "epoch": 0.2515538794608562, + "grad_norm": 0.6800052314817426, + "learning_rate": 8.384543761638735e-05, + "loss": 3.8517, + "step": 5403 + }, + { + "epoch": 0.2516004376469493, + "grad_norm": 0.5938567771223692, + "learning_rate": 8.386095592799503e-05, + "loss": 3.7352, + "step": 5404 + }, + { + "epoch": 0.25164699583304234, + "grad_norm": 0.6655578866992184, + "learning_rate": 8.387647423960273e-05, + "loss": 3.6905, + "step": 5405 + }, + { + "epoch": 0.2516935540191354, + "grad_norm": 0.6352606971778554, + "learning_rate": 8.389199255121042e-05, + "loss": 3.7543, + "step": 5406 + }, + { + "epoch": 0.25174011220522846, + "grad_norm": 0.5921393738340327, + "learning_rate": 8.390751086281813e-05, + "loss": 3.7565, + "step": 5407 + }, + { + "epoch": 0.2517866703913216, + "grad_norm": 0.6571705615531228, + "learning_rate": 8.392302917442583e-05, + "loss": 3.6953, + "step": 5408 + }, + { + 
"epoch": 0.25183322857741464, + "grad_norm": 0.7134479062789227, + "learning_rate": 8.393854748603353e-05, + "loss": 3.7953, + "step": 5409 + }, + { + "epoch": 0.2518797867635077, + "grad_norm": 0.5989566837721235, + "learning_rate": 8.395406579764122e-05, + "loss": 3.7439, + "step": 5410 + }, + { + "epoch": 0.25192634494960076, + "grad_norm": 0.5308603180076662, + "learning_rate": 8.396958410924892e-05, + "loss": 3.6569, + "step": 5411 + }, + { + "epoch": 0.2519729031356938, + "grad_norm": 0.5764622693384545, + "learning_rate": 8.398510242085662e-05, + "loss": 3.7402, + "step": 5412 + }, + { + "epoch": 0.25201946132178693, + "grad_norm": 0.6471559259813319, + "learning_rate": 8.400062073246431e-05, + "loss": 3.6573, + "step": 5413 + }, + { + "epoch": 0.25206601950788, + "grad_norm": 0.6650052329139287, + "learning_rate": 8.401613904407201e-05, + "loss": 3.8192, + "step": 5414 + }, + { + "epoch": 0.25211257769397305, + "grad_norm": 0.6499795247584625, + "learning_rate": 8.40316573556797e-05, + "loss": 3.7678, + "step": 5415 + }, + { + "epoch": 0.2521591358800661, + "grad_norm": 0.581222382083589, + "learning_rate": 8.40471756672874e-05, + "loss": 3.8472, + "step": 5416 + }, + { + "epoch": 0.25220569406615917, + "grad_norm": 0.58552460423106, + "learning_rate": 8.40626939788951e-05, + "loss": 3.7243, + "step": 5417 + }, + { + "epoch": 0.25225225225225223, + "grad_norm": 0.6099172148521887, + "learning_rate": 8.40782122905028e-05, + "loss": 3.7349, + "step": 5418 + }, + { + "epoch": 0.25229881043834534, + "grad_norm": 0.6315489457326761, + "learning_rate": 8.40937306021105e-05, + "loss": 3.7943, + "step": 5419 + }, + { + "epoch": 0.2523453686244384, + "grad_norm": 0.6242270080903889, + "learning_rate": 8.410924891371819e-05, + "loss": 3.8259, + "step": 5420 + }, + { + "epoch": 0.25239192681053146, + "grad_norm": 0.6936761749744962, + "learning_rate": 8.412476722532588e-05, + "loss": 3.7205, + "step": 5421 + }, + { + "epoch": 0.2524384849966245, + "grad_norm": 0.6451990858110559, + "learning_rate": 8.414028553693358e-05, + "loss": 3.8502, + "step": 5422 + }, + { + "epoch": 0.2524850431827176, + "grad_norm": 0.5504621971126202, + "learning_rate": 8.415580384854128e-05, + "loss": 3.7349, + "step": 5423 + }, + { + "epoch": 0.2525316013688107, + "grad_norm": 0.6014908908585614, + "learning_rate": 8.417132216014899e-05, + "loss": 3.6703, + "step": 5424 + }, + { + "epoch": 0.25257815955490376, + "grad_norm": 0.5749965122165138, + "learning_rate": 8.418684047175668e-05, + "loss": 3.7081, + "step": 5425 + }, + { + "epoch": 0.2526247177409968, + "grad_norm": 0.4823822117281424, + "learning_rate": 8.420235878336438e-05, + "loss": 3.9495, + "step": 5426 + }, + { + "epoch": 0.2526712759270899, + "grad_norm": 0.6188193363066284, + "learning_rate": 8.421787709497207e-05, + "loss": 3.9392, + "step": 5427 + }, + { + "epoch": 0.25271783411318294, + "grad_norm": 0.6163615864970126, + "learning_rate": 8.423339540657977e-05, + "loss": 3.7626, + "step": 5428 + }, + { + "epoch": 0.252764392299276, + "grad_norm": 0.5945210578547812, + "learning_rate": 8.424891371818747e-05, + "loss": 3.7516, + "step": 5429 + }, + { + "epoch": 0.2528109504853691, + "grad_norm": 0.527269868196921, + "learning_rate": 8.426443202979516e-05, + "loss": 3.6788, + "step": 5430 + }, + { + "epoch": 0.2528575086714622, + "grad_norm": 0.5371755970571456, + "learning_rate": 8.427995034140286e-05, + "loss": 3.8747, + "step": 5431 + }, + { + "epoch": 0.25290406685755523, + "grad_norm": 0.539996988351808, + "learning_rate": 8.429546865301056e-05, 
+ "loss": 3.8171, + "step": 5432 + }, + { + "epoch": 0.2529506250436483, + "grad_norm": 0.5026154831889311, + "learning_rate": 8.431098696461825e-05, + "loss": 3.7825, + "step": 5433 + }, + { + "epoch": 0.25299718322974135, + "grad_norm": 0.6042970710696325, + "learning_rate": 8.432650527622595e-05, + "loss": 3.7429, + "step": 5434 + }, + { + "epoch": 0.25304374141583447, + "grad_norm": 0.645593966550962, + "learning_rate": 8.434202358783366e-05, + "loss": 3.775, + "step": 5435 + }, + { + "epoch": 0.2530902996019275, + "grad_norm": 0.6447864297534465, + "learning_rate": 8.435754189944135e-05, + "loss": 3.7945, + "step": 5436 + }, + { + "epoch": 0.2531368577880206, + "grad_norm": 0.5355030514409859, + "learning_rate": 8.437306021104904e-05, + "loss": 3.701, + "step": 5437 + }, + { + "epoch": 0.25318341597411365, + "grad_norm": 0.5574931374791194, + "learning_rate": 8.438857852265673e-05, + "loss": 3.7076, + "step": 5438 + }, + { + "epoch": 0.2532299741602067, + "grad_norm": 0.6398823923219475, + "learning_rate": 8.440409683426443e-05, + "loss": 3.7367, + "step": 5439 + }, + { + "epoch": 0.25327653234629977, + "grad_norm": 0.5440189884603199, + "learning_rate": 8.441961514587213e-05, + "loss": 3.6573, + "step": 5440 + }, + { + "epoch": 0.2533230905323929, + "grad_norm": 0.46526573915448566, + "learning_rate": 8.443513345747984e-05, + "loss": 3.7392, + "step": 5441 + }, + { + "epoch": 0.25336964871848594, + "grad_norm": 0.7054351098560094, + "learning_rate": 8.445065176908753e-05, + "loss": 3.8243, + "step": 5442 + }, + { + "epoch": 0.253416206904579, + "grad_norm": 0.6952338222043019, + "learning_rate": 8.446617008069523e-05, + "loss": 3.753, + "step": 5443 + }, + { + "epoch": 0.25346276509067206, + "grad_norm": 0.651495127961516, + "learning_rate": 8.448168839230293e-05, + "loss": 3.6782, + "step": 5444 + }, + { + "epoch": 0.2535093232767651, + "grad_norm": 0.6865083858963771, + "learning_rate": 8.449720670391061e-05, + "loss": 3.778, + "step": 5445 + }, + { + "epoch": 0.25355588146285823, + "grad_norm": 0.7394746078367385, + "learning_rate": 8.451272501551832e-05, + "loss": 3.765, + "step": 5446 + }, + { + "epoch": 0.2536024396489513, + "grad_norm": 0.770133304763489, + "learning_rate": 8.452824332712601e-05, + "loss": 3.7729, + "step": 5447 + }, + { + "epoch": 0.25364899783504435, + "grad_norm": 0.6674938273395844, + "learning_rate": 8.454376163873371e-05, + "loss": 3.7462, + "step": 5448 + }, + { + "epoch": 0.2536955560211374, + "grad_norm": 0.6324239576901801, + "learning_rate": 8.45592799503414e-05, + "loss": 3.7294, + "step": 5449 + }, + { + "epoch": 0.2537421142072305, + "grad_norm": 0.6913309971652638, + "learning_rate": 8.45747982619491e-05, + "loss": 3.8067, + "step": 5450 + }, + { + "epoch": 0.25378867239332353, + "grad_norm": 0.588433713980119, + "learning_rate": 8.45903165735568e-05, + "loss": 3.7077, + "step": 5451 + }, + { + "epoch": 0.25383523057941665, + "grad_norm": 0.6251343719857698, + "learning_rate": 8.460583488516451e-05, + "loss": 3.8415, + "step": 5452 + }, + { + "epoch": 0.2538817887655097, + "grad_norm": 0.6170907180700914, + "learning_rate": 8.462135319677219e-05, + "loss": 3.6757, + "step": 5453 + }, + { + "epoch": 0.25392834695160277, + "grad_norm": 0.5425885227901267, + "learning_rate": 8.463687150837989e-05, + "loss": 3.7096, + "step": 5454 + }, + { + "epoch": 0.25397490513769583, + "grad_norm": 0.6053331702171539, + "learning_rate": 8.465238981998758e-05, + "loss": 3.9134, + "step": 5455 + }, + { + "epoch": 0.2540214633237889, + "grad_norm": 
0.5454503210996824, + "learning_rate": 8.466790813159528e-05, + "loss": 3.7274, + "step": 5456 + }, + { + "epoch": 0.254068021509882, + "grad_norm": 0.535732890578042, + "learning_rate": 8.468342644320299e-05, + "loss": 3.8746, + "step": 5457 + }, + { + "epoch": 0.25411457969597506, + "grad_norm": 0.5681367320440399, + "learning_rate": 8.469894475481069e-05, + "loss": 3.6465, + "step": 5458 + }, + { + "epoch": 0.2541611378820681, + "grad_norm": 0.5176928141296407, + "learning_rate": 8.471446306641838e-05, + "loss": 3.705, + "step": 5459 + }, + { + "epoch": 0.2542076960681612, + "grad_norm": 0.5492045383261268, + "learning_rate": 8.472998137802608e-05, + "loss": 3.6454, + "step": 5460 + }, + { + "epoch": 0.25425425425425424, + "grad_norm": 0.5615711835504301, + "learning_rate": 8.474549968963376e-05, + "loss": 3.7246, + "step": 5461 + }, + { + "epoch": 0.2543008124403473, + "grad_norm": 0.5191107937498963, + "learning_rate": 8.476101800124147e-05, + "loss": 3.7371, + "step": 5462 + }, + { + "epoch": 0.2543473706264404, + "grad_norm": 0.6224340354298487, + "learning_rate": 8.477653631284917e-05, + "loss": 3.8672, + "step": 5463 + }, + { + "epoch": 0.2543939288125335, + "grad_norm": 0.7401634925383669, + "learning_rate": 8.479205462445686e-05, + "loss": 3.8722, + "step": 5464 + }, + { + "epoch": 0.25444048699862654, + "grad_norm": 0.6496199852015575, + "learning_rate": 8.480757293606456e-05, + "loss": 3.7141, + "step": 5465 + }, + { + "epoch": 0.2544870451847196, + "grad_norm": 0.6177134763742869, + "learning_rate": 8.482309124767226e-05, + "loss": 3.6961, + "step": 5466 + }, + { + "epoch": 0.25453360337081266, + "grad_norm": 0.6271788754155309, + "learning_rate": 8.483860955927995e-05, + "loss": 3.7389, + "step": 5467 + }, + { + "epoch": 0.2545801615569057, + "grad_norm": 0.5982587084990809, + "learning_rate": 8.485412787088765e-05, + "loss": 3.8723, + "step": 5468 + }, + { + "epoch": 0.25462671974299883, + "grad_norm": 0.6180365193089961, + "learning_rate": 8.486964618249535e-05, + "loss": 3.6428, + "step": 5469 + }, + { + "epoch": 0.2546732779290919, + "grad_norm": 0.5407384621850824, + "learning_rate": 8.488516449410304e-05, + "loss": 3.7712, + "step": 5470 + }, + { + "epoch": 0.25471983611518495, + "grad_norm": 0.5265641650413829, + "learning_rate": 8.490068280571074e-05, + "loss": 3.7425, + "step": 5471 + }, + { + "epoch": 0.254766394301278, + "grad_norm": 0.5880996091609015, + "learning_rate": 8.491620111731844e-05, + "loss": 3.9686, + "step": 5472 + }, + { + "epoch": 0.25481295248737107, + "grad_norm": 0.5216546451988109, + "learning_rate": 8.493171942892613e-05, + "loss": 3.7763, + "step": 5473 + }, + { + "epoch": 0.2548595106734642, + "grad_norm": 0.5944509743763299, + "learning_rate": 8.494723774053384e-05, + "loss": 3.6455, + "step": 5474 + }, + { + "epoch": 0.25490606885955724, + "grad_norm": 0.8434921546872576, + "learning_rate": 8.496275605214154e-05, + "loss": 3.8089, + "step": 5475 + }, + { + "epoch": 0.2549526270456503, + "grad_norm": 0.99274931513577, + "learning_rate": 8.497827436374923e-05, + "loss": 3.8063, + "step": 5476 + }, + { + "epoch": 0.25499918523174336, + "grad_norm": 0.927196189377291, + "learning_rate": 8.499379267535692e-05, + "loss": 3.8013, + "step": 5477 + }, + { + "epoch": 0.2550457434178364, + "grad_norm": 0.6924156544357111, + "learning_rate": 8.500931098696461e-05, + "loss": 3.7209, + "step": 5478 + }, + { + "epoch": 0.2550923016039295, + "grad_norm": 0.6481742858856572, + "learning_rate": 8.502482929857232e-05, + "loss": 3.6793, + "step": 5479 + }, + 
{ + "epoch": 0.2551388597900226, + "grad_norm": 0.6078650389949111, + "learning_rate": 8.504034761018002e-05, + "loss": 3.6767, + "step": 5480 + }, + { + "epoch": 0.25518541797611566, + "grad_norm": 0.5283751719242405, + "learning_rate": 8.505586592178772e-05, + "loss": 3.5898, + "step": 5481 + }, + { + "epoch": 0.2552319761622087, + "grad_norm": 0.6597317855991006, + "learning_rate": 8.507138423339541e-05, + "loss": 3.7894, + "step": 5482 + }, + { + "epoch": 0.2552785343483018, + "grad_norm": 0.5955857201022303, + "learning_rate": 8.508690254500311e-05, + "loss": 3.7398, + "step": 5483 + }, + { + "epoch": 0.25532509253439484, + "grad_norm": 0.5736331145685697, + "learning_rate": 8.51024208566108e-05, + "loss": 3.7848, + "step": 5484 + }, + { + "epoch": 0.25537165072048795, + "grad_norm": 0.5160244281657131, + "learning_rate": 8.51179391682185e-05, + "loss": 3.7408, + "step": 5485 + }, + { + "epoch": 0.255418208906581, + "grad_norm": 0.5539031119530787, + "learning_rate": 8.51334574798262e-05, + "loss": 3.7774, + "step": 5486 + }, + { + "epoch": 0.25546476709267407, + "grad_norm": 0.5401658404178804, + "learning_rate": 8.51489757914339e-05, + "loss": 3.8045, + "step": 5487 + }, + { + "epoch": 0.25551132527876713, + "grad_norm": 0.6341885426329076, + "learning_rate": 8.516449410304159e-05, + "loss": 3.7638, + "step": 5488 + }, + { + "epoch": 0.2555578834648602, + "grad_norm": 0.5813114223879096, + "learning_rate": 8.518001241464929e-05, + "loss": 3.7957, + "step": 5489 + }, + { + "epoch": 0.25560444165095325, + "grad_norm": 0.6349431835585634, + "learning_rate": 8.519553072625698e-05, + "loss": 3.7936, + "step": 5490 + }, + { + "epoch": 0.25565099983704637, + "grad_norm": 0.6999209540820648, + "learning_rate": 8.521104903786469e-05, + "loss": 3.9269, + "step": 5491 + }, + { + "epoch": 0.2556975580231394, + "grad_norm": 0.7317738909305762, + "learning_rate": 8.522656734947239e-05, + "loss": 3.7963, + "step": 5492 + }, + { + "epoch": 0.2557441162092325, + "grad_norm": 0.6736861149645996, + "learning_rate": 8.524208566108007e-05, + "loss": 3.6318, + "step": 5493 + }, + { + "epoch": 0.25579067439532555, + "grad_norm": 0.5272155331145311, + "learning_rate": 8.525760397268777e-05, + "loss": 3.8322, + "step": 5494 + }, + { + "epoch": 0.2558372325814186, + "grad_norm": 0.578913900680189, + "learning_rate": 8.527312228429546e-05, + "loss": 3.7458, + "step": 5495 + }, + { + "epoch": 0.2558837907675117, + "grad_norm": 0.7367201425119271, + "learning_rate": 8.528864059590317e-05, + "loss": 3.8487, + "step": 5496 + }, + { + "epoch": 0.2559303489536048, + "grad_norm": 0.7053485973325078, + "learning_rate": 8.530415890751087e-05, + "loss": 3.7672, + "step": 5497 + }, + { + "epoch": 0.25597690713969784, + "grad_norm": 0.6276055404209143, + "learning_rate": 8.531967721911857e-05, + "loss": 3.7482, + "step": 5498 + }, + { + "epoch": 0.2560234653257909, + "grad_norm": 0.5438109785817016, + "learning_rate": 8.533519553072626e-05, + "loss": 3.6517, + "step": 5499 + }, + { + "epoch": 0.25607002351188396, + "grad_norm": 0.6499216231952833, + "learning_rate": 8.535071384233396e-05, + "loss": 3.6489, + "step": 5500 + }, + { + "epoch": 0.256116581697977, + "grad_norm": 0.6307682238845947, + "learning_rate": 8.536623215394166e-05, + "loss": 3.6347, + "step": 5501 + }, + { + "epoch": 0.25616313988407013, + "grad_norm": 0.5406755519106756, + "learning_rate": 8.538175046554935e-05, + "loss": 3.808, + "step": 5502 + }, + { + "epoch": 0.2562096980701632, + "grad_norm": 0.5843044568288452, + "learning_rate": 
8.539726877715705e-05, + "loss": 3.8362, + "step": 5503 + }, + { + "epoch": 0.25625625625625625, + "grad_norm": 0.5822141290582362, + "learning_rate": 8.541278708876474e-05, + "loss": 3.662, + "step": 5504 + }, + { + "epoch": 0.2563028144423493, + "grad_norm": 0.5470581142766394, + "learning_rate": 8.542830540037244e-05, + "loss": 3.7261, + "step": 5505 + }, + { + "epoch": 0.2563493726284424, + "grad_norm": 0.6757628235434092, + "learning_rate": 8.544382371198014e-05, + "loss": 3.693, + "step": 5506 + }, + { + "epoch": 0.2563959308145355, + "grad_norm": 0.7673329472030315, + "learning_rate": 8.545934202358785e-05, + "loss": 3.6551, + "step": 5507 + }, + { + "epoch": 0.25644248900062855, + "grad_norm": 0.5932011925937771, + "learning_rate": 8.547486033519554e-05, + "loss": 3.6255, + "step": 5508 + }, + { + "epoch": 0.2564890471867216, + "grad_norm": 0.5617920014142608, + "learning_rate": 8.549037864680323e-05, + "loss": 3.6721, + "step": 5509 + }, + { + "epoch": 0.25653560537281467, + "grad_norm": 0.6161464472135182, + "learning_rate": 8.550589695841092e-05, + "loss": 3.7897, + "step": 5510 + }, + { + "epoch": 0.2565821635589077, + "grad_norm": 0.6463227539115481, + "learning_rate": 8.552141527001862e-05, + "loss": 3.6896, + "step": 5511 + }, + { + "epoch": 0.2566287217450008, + "grad_norm": 0.6501451461679665, + "learning_rate": 8.553693358162633e-05, + "loss": 3.7927, + "step": 5512 + }, + { + "epoch": 0.2566752799310939, + "grad_norm": 0.6558520837366614, + "learning_rate": 8.555245189323402e-05, + "loss": 3.7387, + "step": 5513 + }, + { + "epoch": 0.25672183811718696, + "grad_norm": 0.6041623650362415, + "learning_rate": 8.556797020484172e-05, + "loss": 3.8305, + "step": 5514 + }, + { + "epoch": 0.25676839630328, + "grad_norm": 0.6469972123215182, + "learning_rate": 8.558348851644942e-05, + "loss": 3.8192, + "step": 5515 + }, + { + "epoch": 0.2568149544893731, + "grad_norm": 0.6601691600129493, + "learning_rate": 8.559900682805711e-05, + "loss": 3.8961, + "step": 5516 + }, + { + "epoch": 0.25686151267546614, + "grad_norm": 0.6402871735593672, + "learning_rate": 8.561452513966481e-05, + "loss": 3.8149, + "step": 5517 + }, + { + "epoch": 0.25690807086155926, + "grad_norm": 0.6332135594814917, + "learning_rate": 8.56300434512725e-05, + "loss": 3.7538, + "step": 5518 + }, + { + "epoch": 0.2569546290476523, + "grad_norm": 0.6848389071132448, + "learning_rate": 8.56455617628802e-05, + "loss": 3.8138, + "step": 5519 + }, + { + "epoch": 0.2570011872337454, + "grad_norm": 0.6113618116230944, + "learning_rate": 8.56610800744879e-05, + "loss": 3.7216, + "step": 5520 + }, + { + "epoch": 0.25704774541983844, + "grad_norm": 0.6167188425326589, + "learning_rate": 8.56765983860956e-05, + "loss": 3.7343, + "step": 5521 + }, + { + "epoch": 0.2570943036059315, + "grad_norm": 0.6511155720798056, + "learning_rate": 8.569211669770329e-05, + "loss": 3.8316, + "step": 5522 + }, + { + "epoch": 0.25714086179202456, + "grad_norm": 0.5742635826908765, + "learning_rate": 8.570763500931099e-05, + "loss": 3.5958, + "step": 5523 + }, + { + "epoch": 0.25718741997811767, + "grad_norm": 0.6859358215792997, + "learning_rate": 8.57231533209187e-05, + "loss": 3.8296, + "step": 5524 + }, + { + "epoch": 0.25723397816421073, + "grad_norm": 0.6192198389065894, + "learning_rate": 8.57386716325264e-05, + "loss": 3.6784, + "step": 5525 + }, + { + "epoch": 0.2572805363503038, + "grad_norm": 0.6153415723232842, + "learning_rate": 8.575418994413408e-05, + "loss": 3.7707, + "step": 5526 + }, + { + "epoch": 0.25732709453639685, + 
"grad_norm": 0.6174927950792084, + "learning_rate": 8.576970825574177e-05, + "loss": 3.7452, + "step": 5527 + }, + { + "epoch": 0.2573736527224899, + "grad_norm": 0.5956707322371029, + "learning_rate": 8.578522656734947e-05, + "loss": 3.8652, + "step": 5528 + }, + { + "epoch": 0.257420210908583, + "grad_norm": 0.6458891371458871, + "learning_rate": 8.580074487895718e-05, + "loss": 3.7455, + "step": 5529 + }, + { + "epoch": 0.2574667690946761, + "grad_norm": 0.6323919235355918, + "learning_rate": 8.581626319056488e-05, + "loss": 3.7336, + "step": 5530 + }, + { + "epoch": 0.25751332728076914, + "grad_norm": 0.6234542800613122, + "learning_rate": 8.583178150217257e-05, + "loss": 3.721, + "step": 5531 + }, + { + "epoch": 0.2575598854668622, + "grad_norm": 0.5415832593961051, + "learning_rate": 8.584729981378027e-05, + "loss": 3.6856, + "step": 5532 + }, + { + "epoch": 0.25760644365295526, + "grad_norm": 0.6251089948371196, + "learning_rate": 8.586281812538796e-05, + "loss": 3.792, + "step": 5533 + }, + { + "epoch": 0.2576530018390483, + "grad_norm": 0.6403752420766903, + "learning_rate": 8.587833643699566e-05, + "loss": 3.6904, + "step": 5534 + }, + { + "epoch": 0.25769956002514144, + "grad_norm": 0.6437330159329543, + "learning_rate": 8.589385474860336e-05, + "loss": 3.8021, + "step": 5535 + }, + { + "epoch": 0.2577461182112345, + "grad_norm": 0.52169930040787, + "learning_rate": 8.590937306021105e-05, + "loss": 3.7341, + "step": 5536 + }, + { + "epoch": 0.25779267639732756, + "grad_norm": 0.6428898681864018, + "learning_rate": 8.592489137181875e-05, + "loss": 3.6589, + "step": 5537 + }, + { + "epoch": 0.2578392345834206, + "grad_norm": 0.6691620779038082, + "learning_rate": 8.594040968342645e-05, + "loss": 3.717, + "step": 5538 + }, + { + "epoch": 0.2578857927695137, + "grad_norm": 0.6991137142922301, + "learning_rate": 8.595592799503414e-05, + "loss": 3.8683, + "step": 5539 + }, + { + "epoch": 0.2579323509556068, + "grad_norm": 0.6216463725444645, + "learning_rate": 8.597144630664185e-05, + "loss": 3.6938, + "step": 5540 + }, + { + "epoch": 0.25797890914169985, + "grad_norm": 0.6583982128274026, + "learning_rate": 8.598696461824955e-05, + "loss": 3.7533, + "step": 5541 + }, + { + "epoch": 0.2580254673277929, + "grad_norm": 0.6647347781012011, + "learning_rate": 8.600248292985723e-05, + "loss": 3.7684, + "step": 5542 + }, + { + "epoch": 0.25807202551388597, + "grad_norm": 0.6496106856627051, + "learning_rate": 8.601800124146493e-05, + "loss": 3.7438, + "step": 5543 + }, + { + "epoch": 0.25811858369997903, + "grad_norm": 0.5435042632627182, + "learning_rate": 8.603351955307262e-05, + "loss": 3.6425, + "step": 5544 + }, + { + "epoch": 0.2581651418860721, + "grad_norm": 0.6369342784099634, + "learning_rate": 8.604903786468032e-05, + "loss": 3.7417, + "step": 5545 + }, + { + "epoch": 0.2582117000721652, + "grad_norm": 0.7770295323828467, + "learning_rate": 8.606455617628803e-05, + "loss": 3.6506, + "step": 5546 + }, + { + "epoch": 0.25825825825825827, + "grad_norm": 0.6560457163178233, + "learning_rate": 8.608007448789573e-05, + "loss": 3.7095, + "step": 5547 + }, + { + "epoch": 0.2583048164443513, + "grad_norm": 0.7484676475273546, + "learning_rate": 8.609559279950342e-05, + "loss": 3.7446, + "step": 5548 + }, + { + "epoch": 0.2583513746304444, + "grad_norm": 0.7856426630022398, + "learning_rate": 8.611111111111112e-05, + "loss": 3.6063, + "step": 5549 + }, + { + "epoch": 0.25839793281653745, + "grad_norm": 0.5630743259848409, + "learning_rate": 8.61266294227188e-05, + "loss": 3.6469, + "step": 
5550 + }, + { + "epoch": 0.25844449100263056, + "grad_norm": 0.6629577265585234, + "learning_rate": 8.614214773432651e-05, + "loss": 3.8201, + "step": 5551 + }, + { + "epoch": 0.2584910491887236, + "grad_norm": 0.7886015142433883, + "learning_rate": 8.615766604593421e-05, + "loss": 3.7571, + "step": 5552 + }, + { + "epoch": 0.2585376073748167, + "grad_norm": 0.6843991086194225, + "learning_rate": 8.61731843575419e-05, + "loss": 3.8462, + "step": 5553 + }, + { + "epoch": 0.25858416556090974, + "grad_norm": 0.6973960944742035, + "learning_rate": 8.61887026691496e-05, + "loss": 3.8294, + "step": 5554 + }, + { + "epoch": 0.2586307237470028, + "grad_norm": 0.6578678577196263, + "learning_rate": 8.62042209807573e-05, + "loss": 3.7251, + "step": 5555 + }, + { + "epoch": 0.25867728193309586, + "grad_norm": 0.6566886213694624, + "learning_rate": 8.621973929236499e-05, + "loss": 3.6853, + "step": 5556 + }, + { + "epoch": 0.258723840119189, + "grad_norm": 0.60558729405943, + "learning_rate": 8.62352576039727e-05, + "loss": 3.6293, + "step": 5557 + }, + { + "epoch": 0.25877039830528203, + "grad_norm": 0.6379502377883033, + "learning_rate": 8.625077591558039e-05, + "loss": 3.7679, + "step": 5558 + }, + { + "epoch": 0.2588169564913751, + "grad_norm": 0.7100786711619617, + "learning_rate": 8.626629422718808e-05, + "loss": 3.7634, + "step": 5559 + }, + { + "epoch": 0.25886351467746815, + "grad_norm": 0.5941035137595637, + "learning_rate": 8.628181253879578e-05, + "loss": 3.6924, + "step": 5560 + }, + { + "epoch": 0.2589100728635612, + "grad_norm": 0.6675685556450799, + "learning_rate": 8.629733085040347e-05, + "loss": 3.6466, + "step": 5561 + }, + { + "epoch": 0.25895663104965433, + "grad_norm": 0.5013534396597373, + "learning_rate": 8.631284916201118e-05, + "loss": 3.8213, + "step": 5562 + }, + { + "epoch": 0.2590031892357474, + "grad_norm": 0.5832927542243864, + "learning_rate": 8.632836747361888e-05, + "loss": 3.6735, + "step": 5563 + }, + { + "epoch": 0.25904974742184045, + "grad_norm": 0.6256067975172559, + "learning_rate": 8.634388578522658e-05, + "loss": 3.7813, + "step": 5564 + }, + { + "epoch": 0.2590963056079335, + "grad_norm": 0.6417704611662378, + "learning_rate": 8.635940409683427e-05, + "loss": 3.7694, + "step": 5565 + }, + { + "epoch": 0.25914286379402657, + "grad_norm": 0.5437763361710741, + "learning_rate": 8.637492240844196e-05, + "loss": 3.7761, + "step": 5566 + }, + { + "epoch": 0.2591894219801196, + "grad_norm": 0.5648757265420843, + "learning_rate": 8.639044072004967e-05, + "loss": 3.5646, + "step": 5567 + }, + { + "epoch": 0.25923598016621274, + "grad_norm": 0.5284754209224791, + "learning_rate": 8.640595903165736e-05, + "loss": 3.8973, + "step": 5568 + }, + { + "epoch": 0.2592825383523058, + "grad_norm": 0.5618357319876888, + "learning_rate": 8.642147734326506e-05, + "loss": 3.766, + "step": 5569 + }, + { + "epoch": 0.25932909653839886, + "grad_norm": 0.5258132012389978, + "learning_rate": 8.643699565487275e-05, + "loss": 3.6514, + "step": 5570 + }, + { + "epoch": 0.2593756547244919, + "grad_norm": 0.5488173811077536, + "learning_rate": 8.645251396648045e-05, + "loss": 3.698, + "step": 5571 + }, + { + "epoch": 0.259422212910585, + "grad_norm": 0.6578688911803536, + "learning_rate": 8.646803227808815e-05, + "loss": 3.6278, + "step": 5572 + }, + { + "epoch": 0.2594687710966781, + "grad_norm": 0.6689142629506475, + "learning_rate": 8.648355058969584e-05, + "loss": 3.6737, + "step": 5573 + }, + { + "epoch": 0.25951532928277116, + "grad_norm": 0.7187484783568163, + "learning_rate": 
8.649906890130354e-05, + "loss": 3.7204, + "step": 5574 + }, + { + "epoch": 0.2595618874688642, + "grad_norm": 0.711258655866655, + "learning_rate": 8.651458721291124e-05, + "loss": 3.7761, + "step": 5575 + }, + { + "epoch": 0.2596084456549573, + "grad_norm": 0.7026257746391638, + "learning_rate": 8.653010552451893e-05, + "loss": 3.7004, + "step": 5576 + }, + { + "epoch": 0.25965500384105034, + "grad_norm": 0.684760467796231, + "learning_rate": 8.654562383612663e-05, + "loss": 3.8212, + "step": 5577 + }, + { + "epoch": 0.2597015620271434, + "grad_norm": 0.6504329620372099, + "learning_rate": 8.656114214773433e-05, + "loss": 3.7158, + "step": 5578 + }, + { + "epoch": 0.2597481202132365, + "grad_norm": 0.6608508069617992, + "learning_rate": 8.657666045934203e-05, + "loss": 3.9261, + "step": 5579 + }, + { + "epoch": 0.25979467839932957, + "grad_norm": 0.7843132529807476, + "learning_rate": 8.659217877094973e-05, + "loss": 3.7255, + "step": 5580 + }, + { + "epoch": 0.25984123658542263, + "grad_norm": 0.6449436436061461, + "learning_rate": 8.660769708255743e-05, + "loss": 3.7296, + "step": 5581 + }, + { + "epoch": 0.2598877947715157, + "grad_norm": 0.6261378475610536, + "learning_rate": 8.662321539416511e-05, + "loss": 3.9127, + "step": 5582 + }, + { + "epoch": 0.25993435295760875, + "grad_norm": 0.7875540328461802, + "learning_rate": 8.66387337057728e-05, + "loss": 3.7776, + "step": 5583 + }, + { + "epoch": 0.25998091114370186, + "grad_norm": 0.6654029467724263, + "learning_rate": 8.665425201738052e-05, + "loss": 3.7674, + "step": 5584 + }, + { + "epoch": 0.2600274693297949, + "grad_norm": 0.5811263201266356, + "learning_rate": 8.666977032898821e-05, + "loss": 3.7238, + "step": 5585 + }, + { + "epoch": 0.260074027515888, + "grad_norm": 0.6645559267595474, + "learning_rate": 8.668528864059591e-05, + "loss": 3.8208, + "step": 5586 + }, + { + "epoch": 0.26012058570198104, + "grad_norm": 0.6341485625061514, + "learning_rate": 8.67008069522036e-05, + "loss": 3.7422, + "step": 5587 + }, + { + "epoch": 0.2601671438880741, + "grad_norm": 0.5744486949897258, + "learning_rate": 8.67163252638113e-05, + "loss": 3.7083, + "step": 5588 + }, + { + "epoch": 0.26021370207416716, + "grad_norm": 0.6347473771216701, + "learning_rate": 8.6731843575419e-05, + "loss": 3.6821, + "step": 5589 + }, + { + "epoch": 0.2602602602602603, + "grad_norm": 0.6236293148377104, + "learning_rate": 8.67473618870267e-05, + "loss": 3.6941, + "step": 5590 + }, + { + "epoch": 0.26030681844635334, + "grad_norm": 0.5133142339040825, + "learning_rate": 8.676288019863439e-05, + "loss": 3.793, + "step": 5591 + }, + { + "epoch": 0.2603533766324464, + "grad_norm": 0.5762275225852709, + "learning_rate": 8.677839851024209e-05, + "loss": 3.8066, + "step": 5592 + }, + { + "epoch": 0.26039993481853946, + "grad_norm": 0.5622518479128411, + "learning_rate": 8.679391682184978e-05, + "loss": 3.6637, + "step": 5593 + }, + { + "epoch": 0.2604464930046325, + "grad_norm": 0.5604789751447192, + "learning_rate": 8.680943513345748e-05, + "loss": 3.7749, + "step": 5594 + }, + { + "epoch": 0.26049305119072563, + "grad_norm": 0.5508230829955031, + "learning_rate": 8.682495344506518e-05, + "loss": 3.7006, + "step": 5595 + }, + { + "epoch": 0.2605396093768187, + "grad_norm": 0.5844689218695741, + "learning_rate": 8.684047175667289e-05, + "loss": 3.7563, + "step": 5596 + }, + { + "epoch": 0.26058616756291175, + "grad_norm": 0.562374425736271, + "learning_rate": 8.685599006828058e-05, + "loss": 3.7576, + "step": 5597 + }, + { + "epoch": 0.2606327257490048, + 
"grad_norm": 0.5006424528481078, + "learning_rate": 8.687150837988828e-05, + "loss": 3.7915, + "step": 5598 + }, + { + "epoch": 0.26067928393509787, + "grad_norm": 0.5416025016245252, + "learning_rate": 8.688702669149596e-05, + "loss": 3.7223, + "step": 5599 + }, + { + "epoch": 0.26072584212119093, + "grad_norm": 0.51692772713631, + "learning_rate": 8.690254500310366e-05, + "loss": 3.668, + "step": 5600 + }, + { + "epoch": 0.26077240030728405, + "grad_norm": 0.5450839167458518, + "learning_rate": 8.691806331471137e-05, + "loss": 3.6798, + "step": 5601 + }, + { + "epoch": 0.2608189584933771, + "grad_norm": 0.6088190040242342, + "learning_rate": 8.693358162631906e-05, + "loss": 3.6168, + "step": 5602 + }, + { + "epoch": 0.26086551667947017, + "grad_norm": 0.5961606897830992, + "learning_rate": 8.694909993792676e-05, + "loss": 3.7778, + "step": 5603 + }, + { + "epoch": 0.2609120748655632, + "grad_norm": 0.5561000355676057, + "learning_rate": 8.696461824953446e-05, + "loss": 3.6259, + "step": 5604 + }, + { + "epoch": 0.2609586330516563, + "grad_norm": 0.4866370355656495, + "learning_rate": 8.698013656114215e-05, + "loss": 3.7669, + "step": 5605 + }, + { + "epoch": 0.2610051912377494, + "grad_norm": 0.5344671835306037, + "learning_rate": 8.699565487274985e-05, + "loss": 3.6382, + "step": 5606 + }, + { + "epoch": 0.26105174942384246, + "grad_norm": 0.6388278941854014, + "learning_rate": 8.701117318435755e-05, + "loss": 3.8527, + "step": 5607 + }, + { + "epoch": 0.2610983076099355, + "grad_norm": 0.6106365607820163, + "learning_rate": 8.702669149596524e-05, + "loss": 3.7205, + "step": 5608 + }, + { + "epoch": 0.2611448657960286, + "grad_norm": 0.5585741056928492, + "learning_rate": 8.704220980757294e-05, + "loss": 3.769, + "step": 5609 + }, + { + "epoch": 0.26119142398212164, + "grad_norm": 0.6072801823982461, + "learning_rate": 8.705772811918063e-05, + "loss": 3.6712, + "step": 5610 + }, + { + "epoch": 0.2612379821682147, + "grad_norm": 0.6197249179332811, + "learning_rate": 8.707324643078833e-05, + "loss": 3.6894, + "step": 5611 + }, + { + "epoch": 0.2612845403543078, + "grad_norm": 0.6650827642041639, + "learning_rate": 8.708876474239604e-05, + "loss": 3.6882, + "step": 5612 + }, + { + "epoch": 0.2613310985404009, + "grad_norm": 0.6073248358272697, + "learning_rate": 8.710428305400374e-05, + "loss": 3.6712, + "step": 5613 + }, + { + "epoch": 0.26137765672649393, + "grad_norm": 0.6228705305669877, + "learning_rate": 8.711980136561143e-05, + "loss": 3.7127, + "step": 5614 + }, + { + "epoch": 0.261424214912587, + "grad_norm": 0.6265207542644501, + "learning_rate": 8.713531967721912e-05, + "loss": 3.663, + "step": 5615 + }, + { + "epoch": 0.26147077309868005, + "grad_norm": 0.6292431317008056, + "learning_rate": 8.715083798882681e-05, + "loss": 3.9188, + "step": 5616 + }, + { + "epoch": 0.26151733128477317, + "grad_norm": 0.7186061915933687, + "learning_rate": 8.716635630043452e-05, + "loss": 3.7873, + "step": 5617 + }, + { + "epoch": 0.26156388947086623, + "grad_norm": 0.7187912698555087, + "learning_rate": 8.718187461204222e-05, + "loss": 3.5975, + "step": 5618 + }, + { + "epoch": 0.2616104476569593, + "grad_norm": 0.7603840677205163, + "learning_rate": 8.719739292364991e-05, + "loss": 3.7307, + "step": 5619 + }, + { + "epoch": 0.26165700584305235, + "grad_norm": 0.6350014795916363, + "learning_rate": 8.721291123525761e-05, + "loss": 3.7807, + "step": 5620 + }, + { + "epoch": 0.2617035640291454, + "grad_norm": 0.6201316824577154, + "learning_rate": 8.722842954686531e-05, + "loss": 3.7285, + 
"step": 5621 + }, + { + "epoch": 0.26175012221523847, + "grad_norm": 0.6384232900839336, + "learning_rate": 8.7243947858473e-05, + "loss": 3.704, + "step": 5622 + }, + { + "epoch": 0.2617966804013316, + "grad_norm": 0.6948962088920145, + "learning_rate": 8.72594661700807e-05, + "loss": 3.8089, + "step": 5623 + }, + { + "epoch": 0.26184323858742464, + "grad_norm": 0.6416316137103678, + "learning_rate": 8.72749844816884e-05, + "loss": 3.7339, + "step": 5624 + }, + { + "epoch": 0.2618897967735177, + "grad_norm": 0.6258470379069707, + "learning_rate": 8.729050279329609e-05, + "loss": 3.7259, + "step": 5625 + }, + { + "epoch": 0.26193635495961076, + "grad_norm": 0.5700538765512775, + "learning_rate": 8.730602110490379e-05, + "loss": 3.7562, + "step": 5626 + }, + { + "epoch": 0.2619829131457038, + "grad_norm": 0.5893491934796614, + "learning_rate": 8.732153941651148e-05, + "loss": 3.8107, + "step": 5627 + }, + { + "epoch": 0.26202947133179694, + "grad_norm": 0.6116637295674684, + "learning_rate": 8.733705772811918e-05, + "loss": 3.7644, + "step": 5628 + }, + { + "epoch": 0.26207602951789, + "grad_norm": 0.6474978070527342, + "learning_rate": 8.735257603972689e-05, + "loss": 3.7804, + "step": 5629 + }, + { + "epoch": 0.26212258770398306, + "grad_norm": 0.5947321265425471, + "learning_rate": 8.736809435133459e-05, + "loss": 3.7115, + "step": 5630 + }, + { + "epoch": 0.2621691458900761, + "grad_norm": 0.6657863954986396, + "learning_rate": 8.738361266294227e-05, + "loss": 3.7422, + "step": 5631 + }, + { + "epoch": 0.2622157040761692, + "grad_norm": 0.6588721856626372, + "learning_rate": 8.739913097454997e-05, + "loss": 3.7367, + "step": 5632 + }, + { + "epoch": 0.26226226226226224, + "grad_norm": 0.6375892637311412, + "learning_rate": 8.741464928615766e-05, + "loss": 3.829, + "step": 5633 + }, + { + "epoch": 0.26230882044835535, + "grad_norm": 0.5922789181509888, + "learning_rate": 8.743016759776537e-05, + "loss": 3.7782, + "step": 5634 + }, + { + "epoch": 0.2623553786344484, + "grad_norm": 0.5679237790304856, + "learning_rate": 8.744568590937307e-05, + "loss": 3.7117, + "step": 5635 + }, + { + "epoch": 0.26240193682054147, + "grad_norm": 0.5619090897702321, + "learning_rate": 8.746120422098077e-05, + "loss": 3.8131, + "step": 5636 + }, + { + "epoch": 0.26244849500663453, + "grad_norm": 0.5916207641326553, + "learning_rate": 8.747672253258846e-05, + "loss": 3.8148, + "step": 5637 + }, + { + "epoch": 0.2624950531927276, + "grad_norm": 0.7058977223994988, + "learning_rate": 8.749224084419616e-05, + "loss": 3.7713, + "step": 5638 + }, + { + "epoch": 0.2625416113788207, + "grad_norm": 0.6301570770422265, + "learning_rate": 8.750775915580385e-05, + "loss": 3.6934, + "step": 5639 + }, + { + "epoch": 0.26258816956491376, + "grad_norm": 0.6285940485474227, + "learning_rate": 8.752327746741155e-05, + "loss": 3.7303, + "step": 5640 + }, + { + "epoch": 0.2626347277510068, + "grad_norm": 0.5586532611467877, + "learning_rate": 8.753879577901925e-05, + "loss": 3.6877, + "step": 5641 + }, + { + "epoch": 0.2626812859370999, + "grad_norm": 0.5425252102568043, + "learning_rate": 8.755431409062694e-05, + "loss": 3.6246, + "step": 5642 + }, + { + "epoch": 0.26272784412319294, + "grad_norm": 0.6119731367167495, + "learning_rate": 8.756983240223464e-05, + "loss": 3.7747, + "step": 5643 + }, + { + "epoch": 0.262774402309286, + "grad_norm": 0.6030309617317833, + "learning_rate": 8.758535071384234e-05, + "loss": 3.7072, + "step": 5644 + }, + { + "epoch": 0.2628209604953791, + "grad_norm": 0.544343711046905, + 
"learning_rate": 8.760086902545003e-05, + "loss": 3.6597, + "step": 5645 + }, + { + "epoch": 0.2628675186814722, + "grad_norm": 0.5853630336966953, + "learning_rate": 8.761638733705774e-05, + "loss": 3.7136, + "step": 5646 + }, + { + "epoch": 0.26291407686756524, + "grad_norm": 0.6343253568623226, + "learning_rate": 8.763190564866542e-05, + "loss": 3.7204, + "step": 5647 + }, + { + "epoch": 0.2629606350536583, + "grad_norm": 0.6789649079468383, + "learning_rate": 8.764742396027312e-05, + "loss": 3.6912, + "step": 5648 + }, + { + "epoch": 0.26300719323975136, + "grad_norm": 0.5136049602955935, + "learning_rate": 8.766294227188082e-05, + "loss": 3.6026, + "step": 5649 + }, + { + "epoch": 0.2630537514258445, + "grad_norm": 0.5867647967916231, + "learning_rate": 8.767846058348851e-05, + "loss": 3.6866, + "step": 5650 + }, + { + "epoch": 0.26310030961193753, + "grad_norm": 0.552003643960117, + "learning_rate": 8.769397889509622e-05, + "loss": 3.7629, + "step": 5651 + }, + { + "epoch": 0.2631468677980306, + "grad_norm": 0.5100502723852403, + "learning_rate": 8.770949720670392e-05, + "loss": 3.7409, + "step": 5652 + }, + { + "epoch": 0.26319342598412365, + "grad_norm": 0.6111173782928334, + "learning_rate": 8.772501551831162e-05, + "loss": 3.7599, + "step": 5653 + }, + { + "epoch": 0.2632399841702167, + "grad_norm": 0.699203826162395, + "learning_rate": 8.774053382991931e-05, + "loss": 3.7768, + "step": 5654 + }, + { + "epoch": 0.26328654235630977, + "grad_norm": 0.6578340541971657, + "learning_rate": 8.7756052141527e-05, + "loss": 3.7558, + "step": 5655 + }, + { + "epoch": 0.2633331005424029, + "grad_norm": 0.5198460620008636, + "learning_rate": 8.77715704531347e-05, + "loss": 3.6737, + "step": 5656 + }, + { + "epoch": 0.26337965872849595, + "grad_norm": 0.5602640900638253, + "learning_rate": 8.77870887647424e-05, + "loss": 3.6594, + "step": 5657 + }, + { + "epoch": 0.263426216914589, + "grad_norm": 0.607715781546003, + "learning_rate": 8.78026070763501e-05, + "loss": 3.657, + "step": 5658 + }, + { + "epoch": 0.26347277510068207, + "grad_norm": 0.6194031160853212, + "learning_rate": 8.78181253879578e-05, + "loss": 3.704, + "step": 5659 + }, + { + "epoch": 0.2635193332867751, + "grad_norm": 0.531208857613554, + "learning_rate": 8.783364369956549e-05, + "loss": 3.8268, + "step": 5660 + }, + { + "epoch": 0.26356589147286824, + "grad_norm": 0.5556652052593917, + "learning_rate": 8.784916201117319e-05, + "loss": 3.8205, + "step": 5661 + }, + { + "epoch": 0.2636124496589613, + "grad_norm": 0.6046601928310837, + "learning_rate": 8.78646803227809e-05, + "loss": 3.865, + "step": 5662 + }, + { + "epoch": 0.26365900784505436, + "grad_norm": 0.5033042339390165, + "learning_rate": 8.788019863438858e-05, + "loss": 3.6527, + "step": 5663 + }, + { + "epoch": 0.2637055660311474, + "grad_norm": 0.5644123577044524, + "learning_rate": 8.789571694599628e-05, + "loss": 3.7011, + "step": 5664 + }, + { + "epoch": 0.2637521242172405, + "grad_norm": 0.6187575436156111, + "learning_rate": 8.791123525760397e-05, + "loss": 3.7516, + "step": 5665 + }, + { + "epoch": 0.26379868240333354, + "grad_norm": 0.5470570095069736, + "learning_rate": 8.792675356921167e-05, + "loss": 3.7107, + "step": 5666 + }, + { + "epoch": 0.26384524058942666, + "grad_norm": 0.5120901060838159, + "learning_rate": 8.794227188081938e-05, + "loss": 3.7393, + "step": 5667 + }, + { + "epoch": 0.2638917987755197, + "grad_norm": 0.546279693555752, + "learning_rate": 8.795779019242707e-05, + "loss": 3.7442, + "step": 5668 + }, + { + "epoch": 
0.2639383569616128, + "grad_norm": 0.6156238171687276, + "learning_rate": 8.797330850403477e-05, + "loss": 3.7016, + "step": 5669 + }, + { + "epoch": 0.26398491514770583, + "grad_norm": 0.6449018885696556, + "learning_rate": 8.798882681564247e-05, + "loss": 3.6363, + "step": 5670 + }, + { + "epoch": 0.2640314733337989, + "grad_norm": 0.5443614581096944, + "learning_rate": 8.800434512725015e-05, + "loss": 3.7906, + "step": 5671 + }, + { + "epoch": 0.264078031519892, + "grad_norm": 0.6597155440691471, + "learning_rate": 8.801986343885785e-05, + "loss": 3.7668, + "step": 5672 + }, + { + "epoch": 0.26412458970598507, + "grad_norm": 0.704311649809154, + "learning_rate": 8.803538175046556e-05, + "loss": 3.5816, + "step": 5673 + }, + { + "epoch": 0.26417114789207813, + "grad_norm": 0.5852323140582845, + "learning_rate": 8.805090006207325e-05, + "loss": 3.5786, + "step": 5674 + }, + { + "epoch": 0.2642177060781712, + "grad_norm": 0.5881969685555974, + "learning_rate": 8.806641837368095e-05, + "loss": 3.6228, + "step": 5675 + }, + { + "epoch": 0.26426426426426425, + "grad_norm": 0.5861845617166075, + "learning_rate": 8.808193668528864e-05, + "loss": 3.663, + "step": 5676 + }, + { + "epoch": 0.2643108224503573, + "grad_norm": 0.5923666383167256, + "learning_rate": 8.809745499689634e-05, + "loss": 3.7752, + "step": 5677 + }, + { + "epoch": 0.2643573806364504, + "grad_norm": 0.5735336195779093, + "learning_rate": 8.811297330850404e-05, + "loss": 3.6609, + "step": 5678 + }, + { + "epoch": 0.2644039388225435, + "grad_norm": 0.6343039196545647, + "learning_rate": 8.812849162011173e-05, + "loss": 3.7304, + "step": 5679 + }, + { + "epoch": 0.26445049700863654, + "grad_norm": 0.6933125781039323, + "learning_rate": 8.814400993171943e-05, + "loss": 3.656, + "step": 5680 + }, + { + "epoch": 0.2644970551947296, + "grad_norm": 0.6303728169506249, + "learning_rate": 8.815952824332713e-05, + "loss": 3.8338, + "step": 5681 + }, + { + "epoch": 0.26454361338082266, + "grad_norm": 0.5900025020887589, + "learning_rate": 8.817504655493482e-05, + "loss": 3.8049, + "step": 5682 + }, + { + "epoch": 0.2645901715669158, + "grad_norm": 0.617666753504467, + "learning_rate": 8.819056486654252e-05, + "loss": 3.7198, + "step": 5683 + }, + { + "epoch": 0.26463672975300884, + "grad_norm": 0.6002867728250385, + "learning_rate": 8.820608317815023e-05, + "loss": 3.6881, + "step": 5684 + }, + { + "epoch": 0.2646832879391019, + "grad_norm": 0.5632219419054965, + "learning_rate": 8.822160148975792e-05, + "loss": 3.6745, + "step": 5685 + }, + { + "epoch": 0.26472984612519496, + "grad_norm": 0.6009699885661391, + "learning_rate": 8.823711980136562e-05, + "loss": 3.6566, + "step": 5686 + }, + { + "epoch": 0.264776404311288, + "grad_norm": 0.6125413946913275, + "learning_rate": 8.825263811297332e-05, + "loss": 3.7057, + "step": 5687 + }, + { + "epoch": 0.2648229624973811, + "grad_norm": 0.6451951198491954, + "learning_rate": 8.8268156424581e-05, + "loss": 3.7733, + "step": 5688 + }, + { + "epoch": 0.2648695206834742, + "grad_norm": 0.5806603072800491, + "learning_rate": 8.828367473618871e-05, + "loss": 3.6104, + "step": 5689 + }, + { + "epoch": 0.26491607886956725, + "grad_norm": 0.6492979496914032, + "learning_rate": 8.82991930477964e-05, + "loss": 3.6869, + "step": 5690 + }, + { + "epoch": 0.2649626370556603, + "grad_norm": 0.69983051984613, + "learning_rate": 8.83147113594041e-05, + "loss": 3.8128, + "step": 5691 + }, + { + "epoch": 0.26500919524175337, + "grad_norm": 0.6470931750592008, + "learning_rate": 8.83302296710118e-05, + "loss": 
3.7774, + "step": 5692 + }, + { + "epoch": 0.26505575342784643, + "grad_norm": 0.5757753046557632, + "learning_rate": 8.83457479826195e-05, + "loss": 3.6974, + "step": 5693 + }, + { + "epoch": 0.26510231161393955, + "grad_norm": 0.6039807542761351, + "learning_rate": 8.836126629422719e-05, + "loss": 3.7463, + "step": 5694 + }, + { + "epoch": 0.2651488698000326, + "grad_norm": 0.5403598531712547, + "learning_rate": 8.83767846058349e-05, + "loss": 3.6256, + "step": 5695 + }, + { + "epoch": 0.26519542798612566, + "grad_norm": 0.51812188378698, + "learning_rate": 8.839230291744258e-05, + "loss": 3.75, + "step": 5696 + }, + { + "epoch": 0.2652419861722187, + "grad_norm": 0.5813343584469406, + "learning_rate": 8.840782122905028e-05, + "loss": 3.8471, + "step": 5697 + }, + { + "epoch": 0.2652885443583118, + "grad_norm": 0.6352592737012284, + "learning_rate": 8.842333954065798e-05, + "loss": 3.7928, + "step": 5698 + }, + { + "epoch": 0.26533510254440484, + "grad_norm": 0.6776565530656082, + "learning_rate": 8.843885785226567e-05, + "loss": 3.6688, + "step": 5699 + }, + { + "epoch": 0.26538166073049796, + "grad_norm": 0.6236543620359958, + "learning_rate": 8.845437616387337e-05, + "loss": 3.7488, + "step": 5700 + }, + { + "epoch": 0.265428218916591, + "grad_norm": 0.6503963965189256, + "learning_rate": 8.846989447548108e-05, + "loss": 3.6837, + "step": 5701 + }, + { + "epoch": 0.2654747771026841, + "grad_norm": 0.6435450978954552, + "learning_rate": 8.848541278708878e-05, + "loss": 3.6659, + "step": 5702 + }, + { + "epoch": 0.26552133528877714, + "grad_norm": 0.585947449057985, + "learning_rate": 8.850093109869647e-05, + "loss": 3.6204, + "step": 5703 + }, + { + "epoch": 0.2655678934748702, + "grad_norm": 0.5357995775853787, + "learning_rate": 8.851644941030415e-05, + "loss": 3.6236, + "step": 5704 + }, + { + "epoch": 0.2656144516609633, + "grad_norm": 0.5493236852009168, + "learning_rate": 8.853196772191185e-05, + "loss": 3.6779, + "step": 5705 + }, + { + "epoch": 0.2656610098470564, + "grad_norm": 0.5721010545343999, + "learning_rate": 8.854748603351956e-05, + "loss": 3.6915, + "step": 5706 + }, + { + "epoch": 0.26570756803314943, + "grad_norm": 0.6198367284713434, + "learning_rate": 8.856300434512726e-05, + "loss": 3.8338, + "step": 5707 + }, + { + "epoch": 0.2657541262192425, + "grad_norm": 0.6421941779619214, + "learning_rate": 8.857852265673495e-05, + "loss": 3.7498, + "step": 5708 + }, + { + "epoch": 0.26580068440533555, + "grad_norm": 0.6956685533357089, + "learning_rate": 8.859404096834265e-05, + "loss": 3.7332, + "step": 5709 + }, + { + "epoch": 0.2658472425914286, + "grad_norm": 0.6412956307602399, + "learning_rate": 8.860955927995035e-05, + "loss": 3.6708, + "step": 5710 + }, + { + "epoch": 0.2658938007775217, + "grad_norm": 0.6381060243639167, + "learning_rate": 8.862507759155804e-05, + "loss": 3.7237, + "step": 5711 + }, + { + "epoch": 0.2659403589636148, + "grad_norm": 0.7119070204092767, + "learning_rate": 8.864059590316574e-05, + "loss": 3.7521, + "step": 5712 + }, + { + "epoch": 0.26598691714970785, + "grad_norm": 0.8133011651829193, + "learning_rate": 8.865611421477343e-05, + "loss": 3.7413, + "step": 5713 + }, + { + "epoch": 0.2660334753358009, + "grad_norm": 0.6614809151029709, + "learning_rate": 8.867163252638113e-05, + "loss": 3.7715, + "step": 5714 + }, + { + "epoch": 0.26608003352189397, + "grad_norm": 0.5878474970542439, + "learning_rate": 8.868715083798883e-05, + "loss": 3.6362, + "step": 5715 + }, + { + "epoch": 0.2661265917079871, + "grad_norm": 0.6847716962736669, + 
"learning_rate": 8.870266914959652e-05, + "loss": 3.7265, + "step": 5716 + }, + { + "epoch": 0.26617314989408014, + "grad_norm": 0.7327296265014985, + "learning_rate": 8.871818746120423e-05, + "loss": 3.721, + "step": 5717 + }, + { + "epoch": 0.2662197080801732, + "grad_norm": 0.6921491981496178, + "learning_rate": 8.873370577281193e-05, + "loss": 3.7121, + "step": 5718 + }, + { + "epoch": 0.26626626626626626, + "grad_norm": 0.6649268828461017, + "learning_rate": 8.874922408441963e-05, + "loss": 3.6138, + "step": 5719 + }, + { + "epoch": 0.2663128244523593, + "grad_norm": 0.6574927972546799, + "learning_rate": 8.876474239602731e-05, + "loss": 3.7449, + "step": 5720 + }, + { + "epoch": 0.2663593826384524, + "grad_norm": 0.6985367775253942, + "learning_rate": 8.8780260707635e-05, + "loss": 3.7241, + "step": 5721 + }, + { + "epoch": 0.2664059408245455, + "grad_norm": 0.7568030389927243, + "learning_rate": 8.879577901924272e-05, + "loss": 3.7236, + "step": 5722 + }, + { + "epoch": 0.26645249901063855, + "grad_norm": 0.5548640264503373, + "learning_rate": 8.881129733085041e-05, + "loss": 3.7229, + "step": 5723 + }, + { + "epoch": 0.2664990571967316, + "grad_norm": 0.5136018523645673, + "learning_rate": 8.882681564245811e-05, + "loss": 3.7084, + "step": 5724 + }, + { + "epoch": 0.2665456153828247, + "grad_norm": 0.5726720058372095, + "learning_rate": 8.88423339540658e-05, + "loss": 3.7234, + "step": 5725 + }, + { + "epoch": 0.26659217356891773, + "grad_norm": 0.4894214351130146, + "learning_rate": 8.88578522656735e-05, + "loss": 3.5756, + "step": 5726 + }, + { + "epoch": 0.26663873175501085, + "grad_norm": 0.5086434128397312, + "learning_rate": 8.88733705772812e-05, + "loss": 3.6955, + "step": 5727 + }, + { + "epoch": 0.2666852899411039, + "grad_norm": 0.6312304436552508, + "learning_rate": 8.888888888888889e-05, + "loss": 3.7482, + "step": 5728 + }, + { + "epoch": 0.26673184812719697, + "grad_norm": 0.6177924890576159, + "learning_rate": 8.890440720049659e-05, + "loss": 3.7213, + "step": 5729 + }, + { + "epoch": 0.26677840631329003, + "grad_norm": 0.5871940321320329, + "learning_rate": 8.891992551210429e-05, + "loss": 3.6626, + "step": 5730 + }, + { + "epoch": 0.2668249644993831, + "grad_norm": 0.5525564512595095, + "learning_rate": 8.893544382371198e-05, + "loss": 3.6705, + "step": 5731 + }, + { + "epoch": 0.26687152268547615, + "grad_norm": 0.6027260121555104, + "learning_rate": 8.895096213531968e-05, + "loss": 3.7235, + "step": 5732 + }, + { + "epoch": 0.26691808087156926, + "grad_norm": 0.5887243245504432, + "learning_rate": 8.896648044692737e-05, + "loss": 3.7824, + "step": 5733 + }, + { + "epoch": 0.2669646390576623, + "grad_norm": 0.5884851600430306, + "learning_rate": 8.898199875853508e-05, + "loss": 3.8752, + "step": 5734 + }, + { + "epoch": 0.2670111972437554, + "grad_norm": 0.6487023831122126, + "learning_rate": 8.899751707014278e-05, + "loss": 3.6913, + "step": 5735 + }, + { + "epoch": 0.26705775542984844, + "grad_norm": 0.6380328403663942, + "learning_rate": 8.901303538175046e-05, + "loss": 3.8254, + "step": 5736 + }, + { + "epoch": 0.2671043136159415, + "grad_norm": 0.6638603090583926, + "learning_rate": 8.902855369335816e-05, + "loss": 3.8099, + "step": 5737 + }, + { + "epoch": 0.2671508718020346, + "grad_norm": 0.6564304771737852, + "learning_rate": 8.904407200496586e-05, + "loss": 3.727, + "step": 5738 + }, + { + "epoch": 0.2671974299881277, + "grad_norm": 0.5888620236485543, + "learning_rate": 8.905959031657357e-05, + "loss": 3.7495, + "step": 5739 + }, + { + "epoch": 
0.26724398817422074, + "grad_norm": 0.5929933535097828, + "learning_rate": 8.907510862818126e-05, + "loss": 3.6131, + "step": 5740 + }, + { + "epoch": 0.2672905463603138, + "grad_norm": 0.5971397580259474, + "learning_rate": 8.909062693978896e-05, + "loss": 3.7942, + "step": 5741 + }, + { + "epoch": 0.26733710454640686, + "grad_norm": 0.6118897564038189, + "learning_rate": 8.910614525139666e-05, + "loss": 3.7099, + "step": 5742 + }, + { + "epoch": 0.2673836627324999, + "grad_norm": 0.5998084497426113, + "learning_rate": 8.912166356300435e-05, + "loss": 3.7302, + "step": 5743 + }, + { + "epoch": 0.26743022091859303, + "grad_norm": 0.6033741887084574, + "learning_rate": 8.913718187461205e-05, + "loss": 3.7, + "step": 5744 + }, + { + "epoch": 0.2674767791046861, + "grad_norm": 0.5408297843590906, + "learning_rate": 8.915270018621974e-05, + "loss": 3.7981, + "step": 5745 + }, + { + "epoch": 0.26752333729077915, + "grad_norm": 0.5651640073679678, + "learning_rate": 8.916821849782744e-05, + "loss": 3.6503, + "step": 5746 + }, + { + "epoch": 0.2675698954768722, + "grad_norm": 0.5611755166571669, + "learning_rate": 8.918373680943514e-05, + "loss": 3.6647, + "step": 5747 + }, + { + "epoch": 0.26761645366296527, + "grad_norm": 0.5340517489912426, + "learning_rate": 8.919925512104283e-05, + "loss": 3.6514, + "step": 5748 + }, + { + "epoch": 0.2676630118490584, + "grad_norm": 0.6827295288182395, + "learning_rate": 8.921477343265053e-05, + "loss": 3.7832, + "step": 5749 + }, + { + "epoch": 0.26770957003515145, + "grad_norm": 0.6985027021372519, + "learning_rate": 8.923029174425823e-05, + "loss": 3.6796, + "step": 5750 + }, + { + "epoch": 0.2677561282212445, + "grad_norm": 0.6916554048415153, + "learning_rate": 8.924581005586594e-05, + "loss": 3.6462, + "step": 5751 + }, + { + "epoch": 0.26780268640733756, + "grad_norm": 0.7290345847945726, + "learning_rate": 8.926132836747362e-05, + "loss": 3.6638, + "step": 5752 + }, + { + "epoch": 0.2678492445934306, + "grad_norm": 0.6750225533187192, + "learning_rate": 8.927684667908131e-05, + "loss": 3.6674, + "step": 5753 + }, + { + "epoch": 0.2678958027795237, + "grad_norm": 0.5392500510919839, + "learning_rate": 8.929236499068901e-05, + "loss": 3.6787, + "step": 5754 + }, + { + "epoch": 0.2679423609656168, + "grad_norm": 0.6350408070189263, + "learning_rate": 8.930788330229671e-05, + "loss": 3.7928, + "step": 5755 + }, + { + "epoch": 0.26798891915170986, + "grad_norm": 0.6278675697821418, + "learning_rate": 8.932340161390442e-05, + "loss": 3.6799, + "step": 5756 + }, + { + "epoch": 0.2680354773378029, + "grad_norm": 0.6944031902822619, + "learning_rate": 8.933891992551211e-05, + "loss": 3.7665, + "step": 5757 + }, + { + "epoch": 0.268082035523896, + "grad_norm": 0.7420287181665527, + "learning_rate": 8.935443823711981e-05, + "loss": 3.7193, + "step": 5758 + }, + { + "epoch": 0.26812859370998904, + "grad_norm": 0.690546512197865, + "learning_rate": 8.93699565487275e-05, + "loss": 3.7161, + "step": 5759 + }, + { + "epoch": 0.26817515189608215, + "grad_norm": 0.49720148544674314, + "learning_rate": 8.938547486033519e-05, + "loss": 3.6726, + "step": 5760 + }, + { + "epoch": 0.2682217100821752, + "grad_norm": 0.6193037567931918, + "learning_rate": 8.94009931719429e-05, + "loss": 3.6648, + "step": 5761 + }, + { + "epoch": 0.2682682682682683, + "grad_norm": 0.7220150024731041, + "learning_rate": 8.94165114835506e-05, + "loss": 3.6683, + "step": 5762 + }, + { + "epoch": 0.26831482645436133, + "grad_norm": 0.6356821276226644, + "learning_rate": 8.943202979515829e-05, + 
"loss": 3.7354, + "step": 5763 + }, + { + "epoch": 0.2683613846404544, + "grad_norm": 0.5550521406710176, + "learning_rate": 8.944754810676599e-05, + "loss": 3.7108, + "step": 5764 + }, + { + "epoch": 0.26840794282654745, + "grad_norm": 0.7672900916934796, + "learning_rate": 8.946306641837368e-05, + "loss": 3.6473, + "step": 5765 + }, + { + "epoch": 0.26845450101264057, + "grad_norm": 0.7139216792403457, + "learning_rate": 8.947858472998138e-05, + "loss": 3.5948, + "step": 5766 + }, + { + "epoch": 0.2685010591987336, + "grad_norm": 0.5704025925516186, + "learning_rate": 8.949410304158909e-05, + "loss": 3.774, + "step": 5767 + }, + { + "epoch": 0.2685476173848267, + "grad_norm": 0.5890815043595818, + "learning_rate": 8.950962135319679e-05, + "loss": 3.746, + "step": 5768 + }, + { + "epoch": 0.26859417557091975, + "grad_norm": 0.6249332087964975, + "learning_rate": 8.952513966480447e-05, + "loss": 3.7307, + "step": 5769 + }, + { + "epoch": 0.2686407337570128, + "grad_norm": 0.6513684318225179, + "learning_rate": 8.954065797641217e-05, + "loss": 3.721, + "step": 5770 + }, + { + "epoch": 0.2686872919431059, + "grad_norm": 0.6053235428729387, + "learning_rate": 8.955617628801986e-05, + "loss": 3.6808, + "step": 5771 + }, + { + "epoch": 0.268733850129199, + "grad_norm": 0.5772633483093874, + "learning_rate": 8.957169459962757e-05, + "loss": 3.7257, + "step": 5772 + }, + { + "epoch": 0.26878040831529204, + "grad_norm": 0.6489792548373208, + "learning_rate": 8.958721291123527e-05, + "loss": 3.8486, + "step": 5773 + }, + { + "epoch": 0.2688269665013851, + "grad_norm": 0.7546209892755232, + "learning_rate": 8.960273122284296e-05, + "loss": 3.6299, + "step": 5774 + }, + { + "epoch": 0.26887352468747816, + "grad_norm": 0.7187847737123512, + "learning_rate": 8.961824953445066e-05, + "loss": 3.6662, + "step": 5775 + }, + { + "epoch": 0.2689200828735712, + "grad_norm": 0.6373816801870693, + "learning_rate": 8.963376784605836e-05, + "loss": 3.6555, + "step": 5776 + }, + { + "epoch": 0.26896664105966434, + "grad_norm": 0.5589369376322821, + "learning_rate": 8.964928615766604e-05, + "loss": 3.7652, + "step": 5777 + }, + { + "epoch": 0.2690131992457574, + "grad_norm": 0.6075816814449988, + "learning_rate": 8.966480446927375e-05, + "loss": 3.7878, + "step": 5778 + }, + { + "epoch": 0.26905975743185045, + "grad_norm": 0.6863982911232542, + "learning_rate": 8.968032278088145e-05, + "loss": 3.6951, + "step": 5779 + }, + { + "epoch": 0.2691063156179435, + "grad_norm": 0.5970652282005651, + "learning_rate": 8.969584109248914e-05, + "loss": 3.6, + "step": 5780 + }, + { + "epoch": 0.2691528738040366, + "grad_norm": 0.5860339458512546, + "learning_rate": 8.971135940409684e-05, + "loss": 3.7187, + "step": 5781 + }, + { + "epoch": 0.2691994319901297, + "grad_norm": 0.6582290098206284, + "learning_rate": 8.972687771570453e-05, + "loss": 3.667, + "step": 5782 + }, + { + "epoch": 0.26924599017622275, + "grad_norm": 0.6220332724777481, + "learning_rate": 8.974239602731223e-05, + "loss": 3.742, + "step": 5783 + }, + { + "epoch": 0.2692925483623158, + "grad_norm": 0.5952433356023168, + "learning_rate": 8.975791433891994e-05, + "loss": 3.7327, + "step": 5784 + }, + { + "epoch": 0.26933910654840887, + "grad_norm": 0.5960261639043112, + "learning_rate": 8.977343265052762e-05, + "loss": 3.6827, + "step": 5785 + }, + { + "epoch": 0.26938566473450193, + "grad_norm": 0.5050397601395774, + "learning_rate": 8.978895096213532e-05, + "loss": 3.6102, + "step": 5786 + }, + { + "epoch": 0.269432222920595, + "grad_norm": 
0.5580617121391629, + "learning_rate": 8.980446927374302e-05, + "loss": 3.72, + "step": 5787 + }, + { + "epoch": 0.2694787811066881, + "grad_norm": 0.541487476056646, + "learning_rate": 8.981998758535071e-05, + "loss": 3.7145, + "step": 5788 + }, + { + "epoch": 0.26952533929278116, + "grad_norm": 0.57936419881589, + "learning_rate": 8.983550589695842e-05, + "loss": 3.6569, + "step": 5789 + }, + { + "epoch": 0.2695718974788742, + "grad_norm": 0.5544247288181645, + "learning_rate": 8.985102420856612e-05, + "loss": 3.6966, + "step": 5790 + }, + { + "epoch": 0.2696184556649673, + "grad_norm": 0.5439461410733328, + "learning_rate": 8.986654252017381e-05, + "loss": 3.8175, + "step": 5791 + }, + { + "epoch": 0.26966501385106034, + "grad_norm": 0.6544651637388779, + "learning_rate": 8.988206083178151e-05, + "loss": 3.6642, + "step": 5792 + }, + { + "epoch": 0.26971157203715346, + "grad_norm": 0.4838756287730116, + "learning_rate": 8.98975791433892e-05, + "loss": 3.6753, + "step": 5793 + }, + { + "epoch": 0.2697581302232465, + "grad_norm": 0.5707672569784902, + "learning_rate": 8.99130974549969e-05, + "loss": 3.648, + "step": 5794 + }, + { + "epoch": 0.2698046884093396, + "grad_norm": 0.6421767658120717, + "learning_rate": 8.99286157666046e-05, + "loss": 3.7593, + "step": 5795 + }, + { + "epoch": 0.26985124659543264, + "grad_norm": 0.6030625650731515, + "learning_rate": 8.99441340782123e-05, + "loss": 3.6549, + "step": 5796 + }, + { + "epoch": 0.2698978047815257, + "grad_norm": 0.543516406516388, + "learning_rate": 8.995965238981999e-05, + "loss": 3.6423, + "step": 5797 + }, + { + "epoch": 0.26994436296761876, + "grad_norm": 0.5690234643585236, + "learning_rate": 8.997517070142769e-05, + "loss": 3.708, + "step": 5798 + }, + { + "epoch": 0.26999092115371187, + "grad_norm": 0.5532750591426737, + "learning_rate": 8.999068901303539e-05, + "loss": 3.7977, + "step": 5799 + }, + { + "epoch": 0.27003747933980493, + "grad_norm": 0.6224820413087032, + "learning_rate": 9.000620732464308e-05, + "loss": 3.6622, + "step": 5800 + }, + { + "epoch": 0.270084037525898, + "grad_norm": 0.6891647415048919, + "learning_rate": 9.002172563625078e-05, + "loss": 3.6285, + "step": 5801 + }, + { + "epoch": 0.27013059571199105, + "grad_norm": 0.6650933554878438, + "learning_rate": 9.003724394785847e-05, + "loss": 3.7088, + "step": 5802 + }, + { + "epoch": 0.2701771538980841, + "grad_norm": 0.6813879175464973, + "learning_rate": 9.005276225946617e-05, + "loss": 3.7518, + "step": 5803 + }, + { + "epoch": 0.2702237120841772, + "grad_norm": 0.6111051597521665, + "learning_rate": 9.006828057107387e-05, + "loss": 3.542, + "step": 5804 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.6258314996573092, + "learning_rate": 9.008379888268156e-05, + "loss": 3.8008, + "step": 5805 + }, + { + "epoch": 0.27031682845636335, + "grad_norm": 0.612865975101147, + "learning_rate": 9.009931719428927e-05, + "loss": 3.7297, + "step": 5806 + }, + { + "epoch": 0.2703633866424564, + "grad_norm": 0.48991758698629173, + "learning_rate": 9.011483550589697e-05, + "loss": 3.758, + "step": 5807 + }, + { + "epoch": 0.27040994482854946, + "grad_norm": 0.6219088117519347, + "learning_rate": 9.013035381750467e-05, + "loss": 3.6856, + "step": 5808 + }, + { + "epoch": 0.2704565030146425, + "grad_norm": 0.6468471506562719, + "learning_rate": 9.014587212911235e-05, + "loss": 3.7353, + "step": 5809 + }, + { + "epoch": 0.27050306120073564, + "grad_norm": 0.6272168828127813, + "learning_rate": 9.016139044072004e-05, + "loss": 3.5975, + "step": 5810 + }, + { + 
"epoch": 0.2705496193868287, + "grad_norm": 0.6179016299943584, + "learning_rate": 9.017690875232775e-05, + "loss": 3.7932, + "step": 5811 + }, + { + "epoch": 0.27059617757292176, + "grad_norm": 0.6970373387587351, + "learning_rate": 9.019242706393545e-05, + "loss": 3.7312, + "step": 5812 + }, + { + "epoch": 0.2706427357590148, + "grad_norm": 0.6330236718180928, + "learning_rate": 9.020794537554315e-05, + "loss": 3.6744, + "step": 5813 + }, + { + "epoch": 0.2706892939451079, + "grad_norm": 0.5427612758760044, + "learning_rate": 9.022346368715084e-05, + "loss": 3.6541, + "step": 5814 + }, + { + "epoch": 0.270735852131201, + "grad_norm": 0.5807236326013543, + "learning_rate": 9.023898199875854e-05, + "loss": 3.7124, + "step": 5815 + }, + { + "epoch": 0.27078241031729405, + "grad_norm": 0.6735231369875605, + "learning_rate": 9.025450031036624e-05, + "loss": 3.6617, + "step": 5816 + }, + { + "epoch": 0.2708289685033871, + "grad_norm": 0.7180489907033034, + "learning_rate": 9.027001862197393e-05, + "loss": 3.7225, + "step": 5817 + }, + { + "epoch": 0.2708755266894802, + "grad_norm": 0.6750216329714412, + "learning_rate": 9.028553693358163e-05, + "loss": 3.7059, + "step": 5818 + }, + { + "epoch": 0.27092208487557323, + "grad_norm": 0.6622082572637343, + "learning_rate": 9.030105524518932e-05, + "loss": 3.6423, + "step": 5819 + }, + { + "epoch": 0.2709686430616663, + "grad_norm": 0.6135115943523236, + "learning_rate": 9.031657355679702e-05, + "loss": 3.6892, + "step": 5820 + }, + { + "epoch": 0.2710152012477594, + "grad_norm": 0.5765317587093421, + "learning_rate": 9.033209186840472e-05, + "loss": 3.7925, + "step": 5821 + }, + { + "epoch": 0.27106175943385247, + "grad_norm": 0.6169339166347955, + "learning_rate": 9.034761018001243e-05, + "loss": 3.7029, + "step": 5822 + }, + { + "epoch": 0.2711083176199455, + "grad_norm": 0.619519902336862, + "learning_rate": 9.036312849162012e-05, + "loss": 3.6903, + "step": 5823 + }, + { + "epoch": 0.2711548758060386, + "grad_norm": 0.5916689005849163, + "learning_rate": 9.037864680322782e-05, + "loss": 3.6877, + "step": 5824 + }, + { + "epoch": 0.27120143399213165, + "grad_norm": 0.567916050125072, + "learning_rate": 9.03941651148355e-05, + "loss": 3.6017, + "step": 5825 + }, + { + "epoch": 0.27124799217822476, + "grad_norm": 0.6119730637254409, + "learning_rate": 9.04096834264432e-05, + "loss": 3.7235, + "step": 5826 + }, + { + "epoch": 0.2712945503643178, + "grad_norm": 0.7992536081594238, + "learning_rate": 9.04252017380509e-05, + "loss": 3.7576, + "step": 5827 + }, + { + "epoch": 0.2713411085504109, + "grad_norm": 0.7472876540489456, + "learning_rate": 9.04407200496586e-05, + "loss": 3.7342, + "step": 5828 + }, + { + "epoch": 0.27138766673650394, + "grad_norm": 0.6508178086519446, + "learning_rate": 9.04562383612663e-05, + "loss": 3.6565, + "step": 5829 + }, + { + "epoch": 0.271434224922597, + "grad_norm": 0.6263700565834097, + "learning_rate": 9.0471756672874e-05, + "loss": 3.7128, + "step": 5830 + }, + { + "epoch": 0.27148078310869006, + "grad_norm": 0.7002965331836908, + "learning_rate": 9.04872749844817e-05, + "loss": 3.6553, + "step": 5831 + }, + { + "epoch": 0.2715273412947832, + "grad_norm": 0.5936140727545071, + "learning_rate": 9.050279329608939e-05, + "loss": 3.6657, + "step": 5832 + }, + { + "epoch": 0.27157389948087624, + "grad_norm": 0.579410426355414, + "learning_rate": 9.051831160769709e-05, + "loss": 3.7036, + "step": 5833 + }, + { + "epoch": 0.2716204576669693, + "grad_norm": 0.6142624129022171, + "learning_rate": 9.053382991930478e-05, + 
"loss": 3.6437, + "step": 5834 + }, + { + "epoch": 0.27166701585306235, + "grad_norm": 0.6962139026098282, + "learning_rate": 9.054934823091248e-05, + "loss": 3.6676, + "step": 5835 + }, + { + "epoch": 0.2717135740391554, + "grad_norm": 0.5127790923753263, + "learning_rate": 9.056486654252018e-05, + "loss": 3.6642, + "step": 5836 + }, + { + "epoch": 0.27176013222524853, + "grad_norm": 0.4933017458486024, + "learning_rate": 9.058038485412787e-05, + "loss": 3.6857, + "step": 5837 + }, + { + "epoch": 0.2718066904113416, + "grad_norm": 0.6031797123479082, + "learning_rate": 9.059590316573557e-05, + "loss": 3.6286, + "step": 5838 + }, + { + "epoch": 0.27185324859743465, + "grad_norm": 0.6033489589876398, + "learning_rate": 9.061142147734328e-05, + "loss": 3.5633, + "step": 5839 + }, + { + "epoch": 0.2718998067835277, + "grad_norm": 0.5211214338342663, + "learning_rate": 9.062693978895097e-05, + "loss": 3.6146, + "step": 5840 + }, + { + "epoch": 0.27194636496962077, + "grad_norm": 0.48513688321265525, + "learning_rate": 9.064245810055866e-05, + "loss": 3.7154, + "step": 5841 + }, + { + "epoch": 0.27199292315571383, + "grad_norm": 0.6031154892439777, + "learning_rate": 9.065797641216635e-05, + "loss": 3.7754, + "step": 5842 + }, + { + "epoch": 0.27203948134180694, + "grad_norm": 0.5253566790791473, + "learning_rate": 9.067349472377405e-05, + "loss": 3.7378, + "step": 5843 + }, + { + "epoch": 0.2720860395279, + "grad_norm": 0.5066244130385857, + "learning_rate": 9.068901303538176e-05, + "loss": 3.6019, + "step": 5844 + }, + { + "epoch": 0.27213259771399306, + "grad_norm": 0.6128015505629386, + "learning_rate": 9.070453134698946e-05, + "loss": 3.5553, + "step": 5845 + }, + { + "epoch": 0.2721791559000861, + "grad_norm": 0.6361082425607929, + "learning_rate": 9.072004965859715e-05, + "loss": 3.6484, + "step": 5846 + }, + { + "epoch": 0.2722257140861792, + "grad_norm": 0.5525535720621423, + "learning_rate": 9.073556797020485e-05, + "loss": 3.7438, + "step": 5847 + }, + { + "epoch": 0.2722722722722723, + "grad_norm": 0.5140101987162201, + "learning_rate": 9.075108628181254e-05, + "loss": 3.562, + "step": 5848 + }, + { + "epoch": 0.27231883045836536, + "grad_norm": 0.5219210531767143, + "learning_rate": 9.076660459342024e-05, + "loss": 3.7204, + "step": 5849 + }, + { + "epoch": 0.2723653886444584, + "grad_norm": 0.6980801742239156, + "learning_rate": 9.078212290502794e-05, + "loss": 3.6077, + "step": 5850 + }, + { + "epoch": 0.2724119468305515, + "grad_norm": 0.7813500562960458, + "learning_rate": 9.079764121663563e-05, + "loss": 3.7207, + "step": 5851 + }, + { + "epoch": 0.27245850501664454, + "grad_norm": 0.602656590381301, + "learning_rate": 9.081315952824333e-05, + "loss": 3.6711, + "step": 5852 + }, + { + "epoch": 0.2725050632027376, + "grad_norm": 0.6078183214072744, + "learning_rate": 9.082867783985103e-05, + "loss": 3.6277, + "step": 5853 + }, + { + "epoch": 0.2725516213888307, + "grad_norm": 0.6780046746567884, + "learning_rate": 9.084419615145872e-05, + "loss": 3.7432, + "step": 5854 + }, + { + "epoch": 0.27259817957492377, + "grad_norm": 0.6251170460223755, + "learning_rate": 9.085971446306642e-05, + "loss": 3.6338, + "step": 5855 + }, + { + "epoch": 0.27264473776101683, + "grad_norm": 0.6038584799288276, + "learning_rate": 9.087523277467413e-05, + "loss": 3.7574, + "step": 5856 + }, + { + "epoch": 0.2726912959471099, + "grad_norm": 0.6039810015021395, + "learning_rate": 9.089075108628183e-05, + "loss": 3.5731, + "step": 5857 + }, + { + "epoch": 0.27273785413320295, + "grad_norm": 
0.5893346258929913, + "learning_rate": 9.090626939788951e-05, + "loss": 3.7323, + "step": 5858 + }, + { + "epoch": 0.27278441231929607, + "grad_norm": 0.5421686877188556, + "learning_rate": 9.09217877094972e-05, + "loss": 3.6027, + "step": 5859 + }, + { + "epoch": 0.2728309705053891, + "grad_norm": 0.5402296556070674, + "learning_rate": 9.09373060211049e-05, + "loss": 3.6362, + "step": 5860 + }, + { + "epoch": 0.2728775286914822, + "grad_norm": 0.5699821100673403, + "learning_rate": 9.095282433271261e-05, + "loss": 3.8252, + "step": 5861 + }, + { + "epoch": 0.27292408687757524, + "grad_norm": 0.6928773256261718, + "learning_rate": 9.09683426443203e-05, + "loss": 3.7252, + "step": 5862 + }, + { + "epoch": 0.2729706450636683, + "grad_norm": 0.6270804762130291, + "learning_rate": 9.0983860955928e-05, + "loss": 3.6675, + "step": 5863 + }, + { + "epoch": 0.27301720324976136, + "grad_norm": 0.6448468764788255, + "learning_rate": 9.09993792675357e-05, + "loss": 3.7401, + "step": 5864 + }, + { + "epoch": 0.2730637614358545, + "grad_norm": 0.5479138549641966, + "learning_rate": 9.10148975791434e-05, + "loss": 3.5815, + "step": 5865 + }, + { + "epoch": 0.27311031962194754, + "grad_norm": 0.598470778194659, + "learning_rate": 9.103041589075109e-05, + "loss": 3.6931, + "step": 5866 + }, + { + "epoch": 0.2731568778080406, + "grad_norm": 0.5709230724476922, + "learning_rate": 9.104593420235879e-05, + "loss": 3.7043, + "step": 5867 + }, + { + "epoch": 0.27320343599413366, + "grad_norm": 0.5282398959472918, + "learning_rate": 9.106145251396648e-05, + "loss": 3.7052, + "step": 5868 + }, + { + "epoch": 0.2732499941802267, + "grad_norm": 0.5628418173711999, + "learning_rate": 9.107697082557418e-05, + "loss": 3.7046, + "step": 5869 + }, + { + "epoch": 0.27329655236631983, + "grad_norm": 0.534323986634162, + "learning_rate": 9.109248913718188e-05, + "loss": 3.5736, + "step": 5870 + }, + { + "epoch": 0.2733431105524129, + "grad_norm": 0.46828126486591237, + "learning_rate": 9.110800744878957e-05, + "loss": 3.5526, + "step": 5871 + }, + { + "epoch": 0.27338966873850595, + "grad_norm": 0.5954200830392552, + "learning_rate": 9.112352576039728e-05, + "loss": 3.6404, + "step": 5872 + }, + { + "epoch": 0.273436226924599, + "grad_norm": 0.6734133958109901, + "learning_rate": 9.113904407200498e-05, + "loss": 3.7595, + "step": 5873 + }, + { + "epoch": 0.2734827851106921, + "grad_norm": 0.645054307383148, + "learning_rate": 9.115456238361266e-05, + "loss": 3.7679, + "step": 5874 + }, + { + "epoch": 0.27352934329678513, + "grad_norm": 0.520088747932217, + "learning_rate": 9.117008069522036e-05, + "loss": 3.6785, + "step": 5875 + }, + { + "epoch": 0.27357590148287825, + "grad_norm": 0.544775360616537, + "learning_rate": 9.118559900682806e-05, + "loss": 3.7008, + "step": 5876 + }, + { + "epoch": 0.2736224596689713, + "grad_norm": 0.5368332478198357, + "learning_rate": 9.120111731843576e-05, + "loss": 3.6702, + "step": 5877 + }, + { + "epoch": 0.27366901785506437, + "grad_norm": 0.49823235112563163, + "learning_rate": 9.121663563004346e-05, + "loss": 3.6373, + "step": 5878 + }, + { + "epoch": 0.2737155760411574, + "grad_norm": 0.5626868098724986, + "learning_rate": 9.123215394165116e-05, + "loss": 3.7624, + "step": 5879 + }, + { + "epoch": 0.2737621342272505, + "grad_norm": 0.5765141675808252, + "learning_rate": 9.124767225325885e-05, + "loss": 3.6638, + "step": 5880 + }, + { + "epoch": 0.2738086924133436, + "grad_norm": 0.5872618524654214, + "learning_rate": 9.126319056486655e-05, + "loss": 3.712, + "step": 5881 + }, + { + 
"epoch": 0.27385525059943666, + "grad_norm": 0.5557668387720585, + "learning_rate": 9.127870887647423e-05, + "loss": 3.6577, + "step": 5882 + }, + { + "epoch": 0.2739018087855297, + "grad_norm": 0.5577868738616705, + "learning_rate": 9.129422718808194e-05, + "loss": 3.6324, + "step": 5883 + }, + { + "epoch": 0.2739483669716228, + "grad_norm": 0.5980656444441349, + "learning_rate": 9.130974549968964e-05, + "loss": 3.6618, + "step": 5884 + }, + { + "epoch": 0.27399492515771584, + "grad_norm": 0.5400677471271227, + "learning_rate": 9.132526381129734e-05, + "loss": 3.6639, + "step": 5885 + }, + { + "epoch": 0.2740414833438089, + "grad_norm": 0.5732399132760625, + "learning_rate": 9.134078212290503e-05, + "loss": 3.6419, + "step": 5886 + }, + { + "epoch": 0.274088041529902, + "grad_norm": 0.5858135872611471, + "learning_rate": 9.135630043451273e-05, + "loss": 3.661, + "step": 5887 + }, + { + "epoch": 0.2741345997159951, + "grad_norm": 0.5661011298198078, + "learning_rate": 9.137181874612042e-05, + "loss": 3.5492, + "step": 5888 + }, + { + "epoch": 0.27418115790208814, + "grad_norm": 0.5678500055660236, + "learning_rate": 9.138733705772813e-05, + "loss": 3.7202, + "step": 5889 + }, + { + "epoch": 0.2742277160881812, + "grad_norm": 0.5641025753835255, + "learning_rate": 9.140285536933582e-05, + "loss": 3.5695, + "step": 5890 + }, + { + "epoch": 0.27427427427427425, + "grad_norm": 0.6181327410864372, + "learning_rate": 9.141837368094351e-05, + "loss": 3.6821, + "step": 5891 + }, + { + "epoch": 0.27432083246036737, + "grad_norm": 0.5483061184276832, + "learning_rate": 9.143389199255121e-05, + "loss": 3.7589, + "step": 5892 + }, + { + "epoch": 0.27436739064646043, + "grad_norm": 0.5404047312442141, + "learning_rate": 9.14494103041589e-05, + "loss": 3.732, + "step": 5893 + }, + { + "epoch": 0.2744139488325535, + "grad_norm": 0.576566803026233, + "learning_rate": 9.146492861576662e-05, + "loss": 3.6402, + "step": 5894 + }, + { + "epoch": 0.27446050701864655, + "grad_norm": 0.5532057562932446, + "learning_rate": 9.148044692737431e-05, + "loss": 3.5746, + "step": 5895 + }, + { + "epoch": 0.2745070652047396, + "grad_norm": 0.5923044173321267, + "learning_rate": 9.149596523898201e-05, + "loss": 3.6362, + "step": 5896 + }, + { + "epoch": 0.27455362339083267, + "grad_norm": 0.5862178713597388, + "learning_rate": 9.15114835505897e-05, + "loss": 3.5015, + "step": 5897 + }, + { + "epoch": 0.2746001815769258, + "grad_norm": 0.5573513865669112, + "learning_rate": 9.152700186219739e-05, + "loss": 3.6667, + "step": 5898 + }, + { + "epoch": 0.27464673976301884, + "grad_norm": 0.5822555072981229, + "learning_rate": 9.15425201738051e-05, + "loss": 3.6452, + "step": 5899 + }, + { + "epoch": 0.2746932979491119, + "grad_norm": 0.5631765484875301, + "learning_rate": 9.15580384854128e-05, + "loss": 3.5497, + "step": 5900 + }, + { + "epoch": 0.27473985613520496, + "grad_norm": 0.6145428710496216, + "learning_rate": 9.157355679702049e-05, + "loss": 3.7121, + "step": 5901 + }, + { + "epoch": 0.274786414321298, + "grad_norm": 0.5773734380369239, + "learning_rate": 9.158907510862819e-05, + "loss": 3.682, + "step": 5902 + }, + { + "epoch": 0.27483297250739114, + "grad_norm": 0.6020376743057142, + "learning_rate": 9.160459342023588e-05, + "loss": 3.5076, + "step": 5903 + }, + { + "epoch": 0.2748795306934842, + "grad_norm": 0.6060455265544813, + "learning_rate": 9.162011173184358e-05, + "loss": 3.6523, + "step": 5904 + }, + { + "epoch": 0.27492608887957726, + "grad_norm": 0.6097747153146711, + "learning_rate": 
9.163563004345128e-05, + "loss": 3.7556, + "step": 5905 + }, + { + "epoch": 0.2749726470656703, + "grad_norm": 0.5881589882984558, + "learning_rate": 9.165114835505897e-05, + "loss": 3.7494, + "step": 5906 + }, + { + "epoch": 0.2750192052517634, + "grad_norm": 0.5858996906306292, + "learning_rate": 9.166666666666667e-05, + "loss": 3.6977, + "step": 5907 + }, + { + "epoch": 0.27506576343785644, + "grad_norm": 0.48872236666146196, + "learning_rate": 9.168218497827436e-05, + "loss": 3.5604, + "step": 5908 + }, + { + "epoch": 0.27511232162394955, + "grad_norm": 0.5142704754572609, + "learning_rate": 9.169770328988206e-05, + "loss": 3.6588, + "step": 5909 + }, + { + "epoch": 0.2751588798100426, + "grad_norm": 0.5425941123482737, + "learning_rate": 9.171322160148976e-05, + "loss": 3.6524, + "step": 5910 + }, + { + "epoch": 0.27520543799613567, + "grad_norm": 0.6224365978343872, + "learning_rate": 9.172873991309747e-05, + "loss": 3.8034, + "step": 5911 + }, + { + "epoch": 0.27525199618222873, + "grad_norm": 0.6920842493460876, + "learning_rate": 9.174425822470516e-05, + "loss": 3.6402, + "step": 5912 + }, + { + "epoch": 0.2752985543683218, + "grad_norm": 0.6485637488628786, + "learning_rate": 9.175977653631286e-05, + "loss": 3.6675, + "step": 5913 + }, + { + "epoch": 0.2753451125544149, + "grad_norm": 0.6521321402428637, + "learning_rate": 9.177529484792054e-05, + "loss": 3.6615, + "step": 5914 + }, + { + "epoch": 0.27539167074050797, + "grad_norm": 0.673756905489888, + "learning_rate": 9.179081315952824e-05, + "loss": 3.6388, + "step": 5915 + }, + { + "epoch": 0.275438228926601, + "grad_norm": 0.6450891039867798, + "learning_rate": 9.180633147113595e-05, + "loss": 3.7736, + "step": 5916 + }, + { + "epoch": 0.2754847871126941, + "grad_norm": 0.644090363801296, + "learning_rate": 9.182184978274364e-05, + "loss": 3.7433, + "step": 5917 + }, + { + "epoch": 0.27553134529878714, + "grad_norm": 0.573408458121482, + "learning_rate": 9.183736809435134e-05, + "loss": 3.6551, + "step": 5918 + }, + { + "epoch": 0.2755779034848802, + "grad_norm": 0.5945126091660219, + "learning_rate": 9.185288640595904e-05, + "loss": 3.7624, + "step": 5919 + }, + { + "epoch": 0.2756244616709733, + "grad_norm": 0.6337409840932611, + "learning_rate": 9.186840471756673e-05, + "loss": 3.8409, + "step": 5920 + }, + { + "epoch": 0.2756710198570664, + "grad_norm": 0.5497259689628744, + "learning_rate": 9.188392302917443e-05, + "loss": 3.6121, + "step": 5921 + }, + { + "epoch": 0.27571757804315944, + "grad_norm": 0.6346349798982074, + "learning_rate": 9.189944134078213e-05, + "loss": 3.7538, + "step": 5922 + }, + { + "epoch": 0.2757641362292525, + "grad_norm": 0.7289691628647209, + "learning_rate": 9.191495965238982e-05, + "loss": 3.6014, + "step": 5923 + }, + { + "epoch": 0.27581069441534556, + "grad_norm": 0.8223619487867871, + "learning_rate": 9.193047796399752e-05, + "loss": 3.6734, + "step": 5924 + }, + { + "epoch": 0.2758572526014387, + "grad_norm": 0.8886519982050058, + "learning_rate": 9.194599627560521e-05, + "loss": 3.6544, + "step": 5925 + }, + { + "epoch": 0.27590381078753173, + "grad_norm": 0.6932253232026446, + "learning_rate": 9.196151458721291e-05, + "loss": 3.5563, + "step": 5926 + }, + { + "epoch": 0.2759503689736248, + "grad_norm": 0.6158974310033721, + "learning_rate": 9.197703289882062e-05, + "loss": 3.6603, + "step": 5927 + }, + { + "epoch": 0.27599692715971785, + "grad_norm": 0.5525297442056529, + "learning_rate": 9.199255121042832e-05, + "loss": 3.6542, + "step": 5928 + }, + { + "epoch": 0.2760434853458109, + 
"grad_norm": 0.5567348039113109, + "learning_rate": 9.200806952203601e-05, + "loss": 3.6563, + "step": 5929 + }, + { + "epoch": 0.276090043531904, + "grad_norm": 0.5593038017004797, + "learning_rate": 9.20235878336437e-05, + "loss": 3.5129, + "step": 5930 + }, + { + "epoch": 0.2761366017179971, + "grad_norm": 0.5664327311884725, + "learning_rate": 9.203910614525139e-05, + "loss": 3.6929, + "step": 5931 + }, + { + "epoch": 0.27618315990409015, + "grad_norm": 0.4991191730279741, + "learning_rate": 9.205462445685909e-05, + "loss": 3.6359, + "step": 5932 + }, + { + "epoch": 0.2762297180901832, + "grad_norm": 0.5511739426235839, + "learning_rate": 9.20701427684668e-05, + "loss": 3.6077, + "step": 5933 + }, + { + "epoch": 0.27627627627627627, + "grad_norm": 0.593833236578377, + "learning_rate": 9.20856610800745e-05, + "loss": 3.4995, + "step": 5934 + }, + { + "epoch": 0.2763228344623693, + "grad_norm": 0.5146170805507276, + "learning_rate": 9.210117939168219e-05, + "loss": 3.8287, + "step": 5935 + }, + { + "epoch": 0.27636939264846244, + "grad_norm": 0.5817485535948764, + "learning_rate": 9.211669770328989e-05, + "loss": 3.7656, + "step": 5936 + }, + { + "epoch": 0.2764159508345555, + "grad_norm": 0.6717515422539363, + "learning_rate": 9.213221601489758e-05, + "loss": 3.7582, + "step": 5937 + }, + { + "epoch": 0.27646250902064856, + "grad_norm": 0.7473101254937077, + "learning_rate": 9.214773432650528e-05, + "loss": 3.7916, + "step": 5938 + }, + { + "epoch": 0.2765090672067416, + "grad_norm": 0.6058450583911565, + "learning_rate": 9.216325263811298e-05, + "loss": 3.6699, + "step": 5939 + }, + { + "epoch": 0.2765556253928347, + "grad_norm": 0.591349546648795, + "learning_rate": 9.217877094972067e-05, + "loss": 3.748, + "step": 5940 + }, + { + "epoch": 0.27660218357892774, + "grad_norm": 0.5575535825640653, + "learning_rate": 9.219428926132837e-05, + "loss": 3.7327, + "step": 5941 + }, + { + "epoch": 0.27664874176502086, + "grad_norm": 0.5525591850017566, + "learning_rate": 9.220980757293607e-05, + "loss": 3.6819, + "step": 5942 + }, + { + "epoch": 0.2766952999511139, + "grad_norm": 0.532340574382103, + "learning_rate": 9.222532588454376e-05, + "loss": 3.717, + "step": 5943 + }, + { + "epoch": 0.276741858137207, + "grad_norm": 0.5185223252805696, + "learning_rate": 9.224084419615147e-05, + "loss": 3.7579, + "step": 5944 + }, + { + "epoch": 0.27678841632330003, + "grad_norm": 0.5357166332301587, + "learning_rate": 9.225636250775917e-05, + "loss": 3.6216, + "step": 5945 + }, + { + "epoch": 0.2768349745093931, + "grad_norm": 0.5263481453009358, + "learning_rate": 9.227188081936686e-05, + "loss": 3.7156, + "step": 5946 + }, + { + "epoch": 0.2768815326954862, + "grad_norm": 0.48906761376802904, + "learning_rate": 9.228739913097455e-05, + "loss": 3.6655, + "step": 5947 + }, + { + "epoch": 0.27692809088157927, + "grad_norm": 0.5096162497875373, + "learning_rate": 9.230291744258224e-05, + "loss": 3.6878, + "step": 5948 + }, + { + "epoch": 0.27697464906767233, + "grad_norm": 0.5390700019954727, + "learning_rate": 9.231843575418995e-05, + "loss": 3.6074, + "step": 5949 + }, + { + "epoch": 0.2770212072537654, + "grad_norm": 0.6171607602429011, + "learning_rate": 9.233395406579765e-05, + "loss": 3.6936, + "step": 5950 + }, + { + "epoch": 0.27706776543985845, + "grad_norm": 0.5613836405879064, + "learning_rate": 9.234947237740535e-05, + "loss": 3.6325, + "step": 5951 + }, + { + "epoch": 0.2771143236259515, + "grad_norm": 0.5610007733091192, + "learning_rate": 9.236499068901304e-05, + "loss": 3.7091, + "step": 
5952 + }, + { + "epoch": 0.2771608818120446, + "grad_norm": 0.6041587890537566, + "learning_rate": 9.238050900062074e-05, + "loss": 3.6246, + "step": 5953 + }, + { + "epoch": 0.2772074399981377, + "grad_norm": 0.5552067757629734, + "learning_rate": 9.239602731222843e-05, + "loss": 3.5852, + "step": 5954 + }, + { + "epoch": 0.27725399818423074, + "grad_norm": 0.53677295660241, + "learning_rate": 9.241154562383613e-05, + "loss": 3.6674, + "step": 5955 + }, + { + "epoch": 0.2773005563703238, + "grad_norm": 0.5766831371328572, + "learning_rate": 9.242706393544383e-05, + "loss": 3.7191, + "step": 5956 + }, + { + "epoch": 0.27734711455641686, + "grad_norm": 0.6897550293160767, + "learning_rate": 9.244258224705152e-05, + "loss": 3.5495, + "step": 5957 + }, + { + "epoch": 0.27739367274251, + "grad_norm": 0.7221348543720214, + "learning_rate": 9.245810055865922e-05, + "loss": 3.7392, + "step": 5958 + }, + { + "epoch": 0.27744023092860304, + "grad_norm": 0.6068432576061851, + "learning_rate": 9.247361887026692e-05, + "loss": 3.5531, + "step": 5959 + }, + { + "epoch": 0.2774867891146961, + "grad_norm": 0.6659773977515304, + "learning_rate": 9.248913718187461e-05, + "loss": 3.6956, + "step": 5960 + }, + { + "epoch": 0.27753334730078916, + "grad_norm": 0.6784455850059394, + "learning_rate": 9.250465549348232e-05, + "loss": 3.6335, + "step": 5961 + }, + { + "epoch": 0.2775799054868822, + "grad_norm": 0.7171335182641696, + "learning_rate": 9.252017380509002e-05, + "loss": 3.6982, + "step": 5962 + }, + { + "epoch": 0.2776264636729753, + "grad_norm": 0.6038237770838927, + "learning_rate": 9.25356921166977e-05, + "loss": 3.7238, + "step": 5963 + }, + { + "epoch": 0.2776730218590684, + "grad_norm": 0.5971664060486954, + "learning_rate": 9.25512104283054e-05, + "loss": 3.6056, + "step": 5964 + }, + { + "epoch": 0.27771958004516145, + "grad_norm": 0.7475288612948319, + "learning_rate": 9.25667287399131e-05, + "loss": 3.6411, + "step": 5965 + }, + { + "epoch": 0.2777661382312545, + "grad_norm": 0.7989061402233738, + "learning_rate": 9.25822470515208e-05, + "loss": 3.6027, + "step": 5966 + }, + { + "epoch": 0.27781269641734757, + "grad_norm": 0.610193512803013, + "learning_rate": 9.25977653631285e-05, + "loss": 3.6672, + "step": 5967 + }, + { + "epoch": 0.27785925460344063, + "grad_norm": 0.6193790627884438, + "learning_rate": 9.26132836747362e-05, + "loss": 3.6236, + "step": 5968 + }, + { + "epoch": 0.27790581278953375, + "grad_norm": 0.6401548115870163, + "learning_rate": 9.262880198634389e-05, + "loss": 3.5633, + "step": 5969 + }, + { + "epoch": 0.2779523709756268, + "grad_norm": 0.6882694779114193, + "learning_rate": 9.264432029795159e-05, + "loss": 3.6319, + "step": 5970 + }, + { + "epoch": 0.27799892916171987, + "grad_norm": 0.6054147220440598, + "learning_rate": 9.265983860955929e-05, + "loss": 3.575, + "step": 5971 + }, + { + "epoch": 0.2780454873478129, + "grad_norm": 0.6070348827024583, + "learning_rate": 9.267535692116698e-05, + "loss": 3.8015, + "step": 5972 + }, + { + "epoch": 0.278092045533906, + "grad_norm": 0.6263364621565545, + "learning_rate": 9.269087523277468e-05, + "loss": 3.6361, + "step": 5973 + }, + { + "epoch": 0.27813860371999904, + "grad_norm": 0.5567294075525805, + "learning_rate": 9.270639354438237e-05, + "loss": 3.6239, + "step": 5974 + }, + { + "epoch": 0.27818516190609216, + "grad_norm": 0.5303412918571356, + "learning_rate": 9.272191185599007e-05, + "loss": 3.6425, + "step": 5975 + }, + { + "epoch": 0.2782317200921852, + "grad_norm": 0.6074522140602877, + "learning_rate": 
9.273743016759777e-05, + "loss": 3.6474, + "step": 5976 + }, + { + "epoch": 0.2782782782782783, + "grad_norm": 0.572748936659884, + "learning_rate": 9.275294847920548e-05, + "loss": 3.6503, + "step": 5977 + }, + { + "epoch": 0.27832483646437134, + "grad_norm": 0.6025870351959737, + "learning_rate": 9.276846679081317e-05, + "loss": 3.6582, + "step": 5978 + }, + { + "epoch": 0.2783713946504644, + "grad_norm": 0.6943368778103217, + "learning_rate": 9.278398510242086e-05, + "loss": 3.5638, + "step": 5979 + }, + { + "epoch": 0.2784179528365575, + "grad_norm": 0.6347483987421635, + "learning_rate": 9.279950341402855e-05, + "loss": 3.5956, + "step": 5980 + }, + { + "epoch": 0.2784645110226506, + "grad_norm": 0.5987240833041946, + "learning_rate": 9.281502172563625e-05, + "loss": 3.5574, + "step": 5981 + }, + { + "epoch": 0.27851106920874363, + "grad_norm": 0.5654516172734357, + "learning_rate": 9.283054003724394e-05, + "loss": 3.6641, + "step": 5982 + }, + { + "epoch": 0.2785576273948367, + "grad_norm": 0.5702773848738102, + "learning_rate": 9.284605834885165e-05, + "loss": 3.7097, + "step": 5983 + }, + { + "epoch": 0.27860418558092975, + "grad_norm": 0.6093154608953552, + "learning_rate": 9.286157666045935e-05, + "loss": 3.6959, + "step": 5984 + }, + { + "epoch": 0.2786507437670228, + "grad_norm": 0.6115814579908325, + "learning_rate": 9.287709497206705e-05, + "loss": 3.7494, + "step": 5985 + }, + { + "epoch": 0.27869730195311593, + "grad_norm": 0.5688403920478388, + "learning_rate": 9.289261328367474e-05, + "loss": 3.5431, + "step": 5986 + }, + { + "epoch": 0.278743860139209, + "grad_norm": 0.5183069716337597, + "learning_rate": 9.290813159528243e-05, + "loss": 3.6061, + "step": 5987 + }, + { + "epoch": 0.27879041832530205, + "grad_norm": 0.5990119260259266, + "learning_rate": 9.292364990689014e-05, + "loss": 3.5461, + "step": 5988 + }, + { + "epoch": 0.2788369765113951, + "grad_norm": 0.6697575985742469, + "learning_rate": 9.293916821849783e-05, + "loss": 3.7161, + "step": 5989 + }, + { + "epoch": 0.27888353469748817, + "grad_norm": 0.5482219590746249, + "learning_rate": 9.295468653010553e-05, + "loss": 3.5824, + "step": 5990 + }, + { + "epoch": 0.2789300928835813, + "grad_norm": 0.5278528661410978, + "learning_rate": 9.297020484171323e-05, + "loss": 3.7064, + "step": 5991 + }, + { + "epoch": 0.27897665106967434, + "grad_norm": 0.52709250646279, + "learning_rate": 9.298572315332092e-05, + "loss": 3.6302, + "step": 5992 + }, + { + "epoch": 0.2790232092557674, + "grad_norm": 0.5089952078605027, + "learning_rate": 9.300124146492862e-05, + "loss": 3.5877, + "step": 5993 + }, + { + "epoch": 0.27906976744186046, + "grad_norm": 0.5816601230572133, + "learning_rate": 9.301675977653633e-05, + "loss": 3.7256, + "step": 5994 + }, + { + "epoch": 0.2791163256279535, + "grad_norm": 0.5827915461550973, + "learning_rate": 9.303227808814401e-05, + "loss": 3.577, + "step": 5995 + }, + { + "epoch": 0.2791628838140466, + "grad_norm": 0.5661351856492994, + "learning_rate": 9.304779639975171e-05, + "loss": 3.771, + "step": 5996 + }, + { + "epoch": 0.2792094420001397, + "grad_norm": 0.5882215045961814, + "learning_rate": 9.30633147113594e-05, + "loss": 3.6059, + "step": 5997 + }, + { + "epoch": 0.27925600018623276, + "grad_norm": 0.6104837350881585, + "learning_rate": 9.30788330229671e-05, + "loss": 3.7215, + "step": 5998 + }, + { + "epoch": 0.2793025583723258, + "grad_norm": 0.545719309542999, + "learning_rate": 9.309435133457481e-05, + "loss": 3.5747, + "step": 5999 + }, + { + "epoch": 0.2793491165584189, + 
"grad_norm": 0.6211898851304596, + "learning_rate": 9.31098696461825e-05, + "loss": 3.7282, + "step": 6000 + }, + { + "epoch": 0.27939567474451193, + "grad_norm": 0.6503408071136658, + "learning_rate": 9.31253879577902e-05, + "loss": 3.7091, + "step": 6001 + }, + { + "epoch": 0.27944223293060505, + "grad_norm": 0.6439531060604422, + "learning_rate": 9.31409062693979e-05, + "loss": 3.6932, + "step": 6002 + }, + { + "epoch": 0.2794887911166981, + "grad_norm": 0.6116490967289133, + "learning_rate": 9.315642458100558e-05, + "loss": 3.5638, + "step": 6003 + }, + { + "epoch": 0.27953534930279117, + "grad_norm": 0.5979053662634628, + "learning_rate": 9.317194289261329e-05, + "loss": 3.6589, + "step": 6004 + }, + { + "epoch": 0.27958190748888423, + "grad_norm": 0.5469802520010993, + "learning_rate": 9.318746120422099e-05, + "loss": 3.6668, + "step": 6005 + }, + { + "epoch": 0.2796284656749773, + "grad_norm": 0.6511343731760532, + "learning_rate": 9.320297951582868e-05, + "loss": 3.7167, + "step": 6006 + }, + { + "epoch": 0.27967502386107035, + "grad_norm": 0.6292167610645345, + "learning_rate": 9.321849782743638e-05, + "loss": 3.6817, + "step": 6007 + }, + { + "epoch": 0.27972158204716346, + "grad_norm": 0.5559794778800811, + "learning_rate": 9.323401613904408e-05, + "loss": 3.6171, + "step": 6008 + }, + { + "epoch": 0.2797681402332565, + "grad_norm": 0.6324596452110071, + "learning_rate": 9.324953445065177e-05, + "loss": 3.7353, + "step": 6009 + }, + { + "epoch": 0.2798146984193496, + "grad_norm": 0.6622564684529898, + "learning_rate": 9.326505276225947e-05, + "loss": 3.6195, + "step": 6010 + }, + { + "epoch": 0.27986125660544264, + "grad_norm": 0.5013905278522247, + "learning_rate": 9.328057107386716e-05, + "loss": 3.6896, + "step": 6011 + }, + { + "epoch": 0.2799078147915357, + "grad_norm": 0.6413570484558027, + "learning_rate": 9.329608938547486e-05, + "loss": 3.6566, + "step": 6012 + }, + { + "epoch": 0.2799543729776288, + "grad_norm": 0.6096099405677115, + "learning_rate": 9.331160769708256e-05, + "loss": 3.7589, + "step": 6013 + }, + { + "epoch": 0.2800009311637219, + "grad_norm": 0.4814250199885203, + "learning_rate": 9.332712600869025e-05, + "loss": 3.5079, + "step": 6014 + }, + { + "epoch": 0.28004748934981494, + "grad_norm": 0.4986528473727704, + "learning_rate": 9.334264432029795e-05, + "loss": 3.6098, + "step": 6015 + }, + { + "epoch": 0.280094047535908, + "grad_norm": 0.5666666576319715, + "learning_rate": 9.335816263190566e-05, + "loss": 3.6336, + "step": 6016 + }, + { + "epoch": 0.28014060572200106, + "grad_norm": 0.5690244437427733, + "learning_rate": 9.337368094351336e-05, + "loss": 3.8246, + "step": 6017 + }, + { + "epoch": 0.2801871639080941, + "grad_norm": 0.5960624511617024, + "learning_rate": 9.338919925512105e-05, + "loss": 3.8077, + "step": 6018 + }, + { + "epoch": 0.28023372209418723, + "grad_norm": 0.6844714322361778, + "learning_rate": 9.340471756672874e-05, + "loss": 3.7807, + "step": 6019 + }, + { + "epoch": 0.2802802802802803, + "grad_norm": 0.7823584821796797, + "learning_rate": 9.342023587833643e-05, + "loss": 3.5969, + "step": 6020 + }, + { + "epoch": 0.28032683846637335, + "grad_norm": 0.7070824694687589, + "learning_rate": 9.343575418994414e-05, + "loss": 3.7356, + "step": 6021 + }, + { + "epoch": 0.2803733966524664, + "grad_norm": 0.5157759770567003, + "learning_rate": 9.345127250155184e-05, + "loss": 3.8099, + "step": 6022 + }, + { + "epoch": 0.28041995483855947, + "grad_norm": 0.6053414537978313, + "learning_rate": 9.346679081315953e-05, + "loss": 3.754, + 
"step": 6023 + }, + { + "epoch": 0.2804665130246526, + "grad_norm": 0.7173792841493046, + "learning_rate": 9.348230912476723e-05, + "loss": 3.8195, + "step": 6024 + }, + { + "epoch": 0.28051307121074565, + "grad_norm": 0.5895362004786863, + "learning_rate": 9.349782743637493e-05, + "loss": 3.6449, + "step": 6025 + }, + { + "epoch": 0.2805596293968387, + "grad_norm": 0.5817052909374, + "learning_rate": 9.351334574798262e-05, + "loss": 3.6865, + "step": 6026 + }, + { + "epoch": 0.28060618758293177, + "grad_norm": 0.5946033898428535, + "learning_rate": 9.352886405959033e-05, + "loss": 3.6123, + "step": 6027 + }, + { + "epoch": 0.2806527457690248, + "grad_norm": 0.6305260881823819, + "learning_rate": 9.354438237119802e-05, + "loss": 3.565, + "step": 6028 + }, + { + "epoch": 0.2806993039551179, + "grad_norm": 0.5463698238586608, + "learning_rate": 9.355990068280571e-05, + "loss": 3.636, + "step": 6029 + }, + { + "epoch": 0.280745862141211, + "grad_norm": 0.56327299694416, + "learning_rate": 9.357541899441341e-05, + "loss": 3.7076, + "step": 6030 + }, + { + "epoch": 0.28079242032730406, + "grad_norm": 0.5002733724216154, + "learning_rate": 9.35909373060211e-05, + "loss": 3.6606, + "step": 6031 + }, + { + "epoch": 0.2808389785133971, + "grad_norm": 0.5810830580051799, + "learning_rate": 9.36064556176288e-05, + "loss": 3.6535, + "step": 6032 + }, + { + "epoch": 0.2808855366994902, + "grad_norm": 0.5935237867628113, + "learning_rate": 9.362197392923651e-05, + "loss": 3.6357, + "step": 6033 + }, + { + "epoch": 0.28093209488558324, + "grad_norm": 0.5758825035699301, + "learning_rate": 9.363749224084421e-05, + "loss": 3.6855, + "step": 6034 + }, + { + "epoch": 0.28097865307167635, + "grad_norm": 0.5536723451272448, + "learning_rate": 9.36530105524519e-05, + "loss": 3.7103, + "step": 6035 + }, + { + "epoch": 0.2810252112577694, + "grad_norm": 0.4529407159435145, + "learning_rate": 9.366852886405959e-05, + "loss": 3.6311, + "step": 6036 + }, + { + "epoch": 0.2810717694438625, + "grad_norm": 0.4995283311115475, + "learning_rate": 9.368404717566728e-05, + "loss": 3.7035, + "step": 6037 + }, + { + "epoch": 0.28111832762995553, + "grad_norm": 0.6498527383440185, + "learning_rate": 9.369956548727499e-05, + "loss": 3.5928, + "step": 6038 + }, + { + "epoch": 0.2811648858160486, + "grad_norm": 0.8075640132244027, + "learning_rate": 9.371508379888269e-05, + "loss": 3.6028, + "step": 6039 + }, + { + "epoch": 0.28121144400214165, + "grad_norm": 0.7155462788074664, + "learning_rate": 9.373060211049038e-05, + "loss": 3.5568, + "step": 6040 + }, + { + "epoch": 0.28125800218823477, + "grad_norm": 0.5842040721732042, + "learning_rate": 9.374612042209808e-05, + "loss": 3.6832, + "step": 6041 + }, + { + "epoch": 0.28130456037432783, + "grad_norm": 0.5831978451842096, + "learning_rate": 9.376163873370578e-05, + "loss": 3.6663, + "step": 6042 + }, + { + "epoch": 0.2813511185604209, + "grad_norm": 0.5796394087849609, + "learning_rate": 9.377715704531347e-05, + "loss": 3.7192, + "step": 6043 + }, + { + "epoch": 0.28139767674651395, + "grad_norm": 0.555416108819874, + "learning_rate": 9.379267535692117e-05, + "loss": 3.6797, + "step": 6044 + }, + { + "epoch": 0.281444234932607, + "grad_norm": 0.5896236486326943, + "learning_rate": 9.380819366852887e-05, + "loss": 3.7045, + "step": 6045 + }, + { + "epoch": 0.2814907931187001, + "grad_norm": 0.5687148401657116, + "learning_rate": 9.382371198013656e-05, + "loss": 3.6415, + "step": 6046 + }, + { + "epoch": 0.2815373513047932, + "grad_norm": 0.5715961545916161, + "learning_rate": 
9.383923029174426e-05, + "loss": 3.6613, + "step": 6047 + }, + { + "epoch": 0.28158390949088624, + "grad_norm": 0.6012430308752653, + "learning_rate": 9.385474860335196e-05, + "loss": 3.7052, + "step": 6048 + }, + { + "epoch": 0.2816304676769793, + "grad_norm": 0.6027010435605904, + "learning_rate": 9.387026691495967e-05, + "loss": 3.7002, + "step": 6049 + }, + { + "epoch": 0.28167702586307236, + "grad_norm": 0.5351394397230325, + "learning_rate": 9.388578522656736e-05, + "loss": 3.7052, + "step": 6050 + }, + { + "epoch": 0.2817235840491654, + "grad_norm": 0.5284687178495048, + "learning_rate": 9.390130353817506e-05, + "loss": 3.631, + "step": 6051 + }, + { + "epoch": 0.28177014223525854, + "grad_norm": 0.4987901758941621, + "learning_rate": 9.391682184978274e-05, + "loss": 3.5899, + "step": 6052 + }, + { + "epoch": 0.2818167004213516, + "grad_norm": 0.5490265583994309, + "learning_rate": 9.393234016139044e-05, + "loss": 3.6344, + "step": 6053 + }, + { + "epoch": 0.28186325860744466, + "grad_norm": 0.49768962863728194, + "learning_rate": 9.394785847299815e-05, + "loss": 3.7034, + "step": 6054 + }, + { + "epoch": 0.2819098167935377, + "grad_norm": 0.5851235014363857, + "learning_rate": 9.396337678460584e-05, + "loss": 3.5898, + "step": 6055 + }, + { + "epoch": 0.2819563749796308, + "grad_norm": 0.5413842947330596, + "learning_rate": 9.397889509621354e-05, + "loss": 3.6233, + "step": 6056 + }, + { + "epoch": 0.2820029331657239, + "grad_norm": 0.560781291172595, + "learning_rate": 9.399441340782124e-05, + "loss": 3.6823, + "step": 6057 + }, + { + "epoch": 0.28204949135181695, + "grad_norm": 0.5176127370768872, + "learning_rate": 9.400993171942893e-05, + "loss": 3.6476, + "step": 6058 + }, + { + "epoch": 0.28209604953791, + "grad_norm": 0.546073742874132, + "learning_rate": 9.402545003103663e-05, + "loss": 3.6336, + "step": 6059 + }, + { + "epoch": 0.28214260772400307, + "grad_norm": 0.5661831035643434, + "learning_rate": 9.404096834264432e-05, + "loss": 3.5159, + "step": 6060 + }, + { + "epoch": 0.28218916591009613, + "grad_norm": 0.6281426934220607, + "learning_rate": 9.405648665425202e-05, + "loss": 3.6698, + "step": 6061 + }, + { + "epoch": 0.2822357240961892, + "grad_norm": 0.624750967077533, + "learning_rate": 9.407200496585972e-05, + "loss": 3.6139, + "step": 6062 + }, + { + "epoch": 0.2822822822822823, + "grad_norm": 0.6320677125416796, + "learning_rate": 9.408752327746741e-05, + "loss": 3.5867, + "step": 6063 + }, + { + "epoch": 0.28232884046837536, + "grad_norm": 0.5950061898336547, + "learning_rate": 9.410304158907511e-05, + "loss": 3.5909, + "step": 6064 + }, + { + "epoch": 0.2823753986544684, + "grad_norm": 0.464022235164863, + "learning_rate": 9.41185599006828e-05, + "loss": 3.5832, + "step": 6065 + }, + { + "epoch": 0.2824219568405615, + "grad_norm": 0.486527039694253, + "learning_rate": 9.413407821229052e-05, + "loss": 3.6897, + "step": 6066 + }, + { + "epoch": 0.28246851502665454, + "grad_norm": 0.4767519144230031, + "learning_rate": 9.414959652389821e-05, + "loss": 3.7267, + "step": 6067 + }, + { + "epoch": 0.28251507321274766, + "grad_norm": 0.4933868483408033, + "learning_rate": 9.41651148355059e-05, + "loss": 3.6781, + "step": 6068 + }, + { + "epoch": 0.2825616313988407, + "grad_norm": 0.5300650763157456, + "learning_rate": 9.418063314711359e-05, + "loss": 3.5165, + "step": 6069 + }, + { + "epoch": 0.2826081895849338, + "grad_norm": 0.5113115103824857, + "learning_rate": 9.419615145872129e-05, + "loss": 3.6605, + "step": 6070 + }, + { + "epoch": 0.28265474777102684, + 
"grad_norm": 0.6055646224734891, + "learning_rate": 9.4211669770329e-05, + "loss": 3.7408, + "step": 6071 + }, + { + "epoch": 0.2827013059571199, + "grad_norm": 0.6404185234270949, + "learning_rate": 9.42271880819367e-05, + "loss": 3.4698, + "step": 6072 + }, + { + "epoch": 0.28274786414321296, + "grad_norm": 0.6127187101532828, + "learning_rate": 9.424270639354439e-05, + "loss": 3.5458, + "step": 6073 + }, + { + "epoch": 0.28279442232930607, + "grad_norm": 0.5452200235433847, + "learning_rate": 9.425822470515209e-05, + "loss": 3.7938, + "step": 6074 + }, + { + "epoch": 0.28284098051539913, + "grad_norm": 0.5075274398692214, + "learning_rate": 9.427374301675978e-05, + "loss": 3.5976, + "step": 6075 + }, + { + "epoch": 0.2828875387014922, + "grad_norm": 0.5702656310206948, + "learning_rate": 9.428926132836748e-05, + "loss": 3.6076, + "step": 6076 + }, + { + "epoch": 0.28293409688758525, + "grad_norm": 0.6054118006712955, + "learning_rate": 9.430477963997518e-05, + "loss": 3.6415, + "step": 6077 + }, + { + "epoch": 0.2829806550736783, + "grad_norm": 0.590661194710242, + "learning_rate": 9.432029795158287e-05, + "loss": 3.7212, + "step": 6078 + }, + { + "epoch": 0.2830272132597714, + "grad_norm": 0.5246038277390315, + "learning_rate": 9.433581626319057e-05, + "loss": 3.6094, + "step": 6079 + }, + { + "epoch": 0.2830737714458645, + "grad_norm": 0.5220462192734815, + "learning_rate": 9.435133457479826e-05, + "loss": 3.6619, + "step": 6080 + }, + { + "epoch": 0.28312032963195755, + "grad_norm": 0.5028582363781436, + "learning_rate": 9.436685288640596e-05, + "loss": 3.6072, + "step": 6081 + }, + { + "epoch": 0.2831668878180506, + "grad_norm": 0.49033961070680765, + "learning_rate": 9.438237119801367e-05, + "loss": 3.4738, + "step": 6082 + }, + { + "epoch": 0.28321344600414367, + "grad_norm": 0.5061455240848004, + "learning_rate": 9.439788950962137e-05, + "loss": 3.6979, + "step": 6083 + }, + { + "epoch": 0.2832600041902367, + "grad_norm": 0.6281090319713164, + "learning_rate": 9.441340782122905e-05, + "loss": 3.5353, + "step": 6084 + }, + { + "epoch": 0.28330656237632984, + "grad_norm": 0.6436896201062313, + "learning_rate": 9.442892613283675e-05, + "loss": 3.5857, + "step": 6085 + }, + { + "epoch": 0.2833531205624229, + "grad_norm": 0.5843730142013996, + "learning_rate": 9.444444444444444e-05, + "loss": 3.5839, + "step": 6086 + }, + { + "epoch": 0.28339967874851596, + "grad_norm": 0.5928099466403335, + "learning_rate": 9.445996275605214e-05, + "loss": 3.5706, + "step": 6087 + }, + { + "epoch": 0.283446236934609, + "grad_norm": 0.5677340576021052, + "learning_rate": 9.447548106765985e-05, + "loss": 3.6396, + "step": 6088 + }, + { + "epoch": 0.2834927951207021, + "grad_norm": 0.5757335455154723, + "learning_rate": 9.449099937926754e-05, + "loss": 3.5156, + "step": 6089 + }, + { + "epoch": 0.2835393533067952, + "grad_norm": 0.5214362083813892, + "learning_rate": 9.450651769087524e-05, + "loss": 3.5914, + "step": 6090 + }, + { + "epoch": 0.28358591149288825, + "grad_norm": 0.5712255349763049, + "learning_rate": 9.452203600248294e-05, + "loss": 3.6834, + "step": 6091 + }, + { + "epoch": 0.2836324696789813, + "grad_norm": 0.7502264454701464, + "learning_rate": 9.453755431409062e-05, + "loss": 3.7077, + "step": 6092 + }, + { + "epoch": 0.2836790278650744, + "grad_norm": 0.7651168063228517, + "learning_rate": 9.455307262569833e-05, + "loss": 3.6498, + "step": 6093 + }, + { + "epoch": 0.28372558605116743, + "grad_norm": 0.6201868909409124, + "learning_rate": 9.456859093730603e-05, + "loss": 3.6178, + 
"step": 6094 + }, + { + "epoch": 0.2837721442372605, + "grad_norm": 0.5573782723557728, + "learning_rate": 9.458410924891372e-05, + "loss": 3.6716, + "step": 6095 + }, + { + "epoch": 0.2838187024233536, + "grad_norm": 0.5684859319960888, + "learning_rate": 9.459962756052142e-05, + "loss": 3.6327, + "step": 6096 + }, + { + "epoch": 0.28386526060944667, + "grad_norm": 0.5640975784603659, + "learning_rate": 9.461514587212912e-05, + "loss": 3.7837, + "step": 6097 + }, + { + "epoch": 0.2839118187955397, + "grad_norm": 0.5688560487648152, + "learning_rate": 9.463066418373681e-05, + "loss": 3.6515, + "step": 6098 + }, + { + "epoch": 0.2839583769816328, + "grad_norm": 0.5290216374511076, + "learning_rate": 9.464618249534452e-05, + "loss": 3.6678, + "step": 6099 + }, + { + "epoch": 0.28400493516772585, + "grad_norm": 0.542106132065835, + "learning_rate": 9.46617008069522e-05, + "loss": 3.6292, + "step": 6100 + }, + { + "epoch": 0.28405149335381896, + "grad_norm": 0.5396354061604336, + "learning_rate": 9.46772191185599e-05, + "loss": 3.6171, + "step": 6101 + }, + { + "epoch": 0.284098051539912, + "grad_norm": 0.622969251728005, + "learning_rate": 9.46927374301676e-05, + "loss": 3.6637, + "step": 6102 + }, + { + "epoch": 0.2841446097260051, + "grad_norm": 0.6234562266736334, + "learning_rate": 9.470825574177529e-05, + "loss": 3.6309, + "step": 6103 + }, + { + "epoch": 0.28419116791209814, + "grad_norm": 0.553239251796183, + "learning_rate": 9.4723774053383e-05, + "loss": 3.61, + "step": 6104 + }, + { + "epoch": 0.2842377260981912, + "grad_norm": 0.5961046438229691, + "learning_rate": 9.47392923649907e-05, + "loss": 3.7277, + "step": 6105 + }, + { + "epoch": 0.28428428428428426, + "grad_norm": 0.6112602218785148, + "learning_rate": 9.47548106765984e-05, + "loss": 3.6392, + "step": 6106 + }, + { + "epoch": 0.2843308424703774, + "grad_norm": 0.6739521721118462, + "learning_rate": 9.477032898820609e-05, + "loss": 3.6534, + "step": 6107 + }, + { + "epoch": 0.28437740065647044, + "grad_norm": 0.6196270677168311, + "learning_rate": 9.478584729981379e-05, + "loss": 3.7013, + "step": 6108 + }, + { + "epoch": 0.2844239588425635, + "grad_norm": 0.5425109271770158, + "learning_rate": 9.480136561142148e-05, + "loss": 3.5584, + "step": 6109 + }, + { + "epoch": 0.28447051702865656, + "grad_norm": 0.5862861856958368, + "learning_rate": 9.481688392302918e-05, + "loss": 3.7006, + "step": 6110 + }, + { + "epoch": 0.2845170752147496, + "grad_norm": 0.5135820619682074, + "learning_rate": 9.483240223463688e-05, + "loss": 3.6032, + "step": 6111 + }, + { + "epoch": 0.28456363340084273, + "grad_norm": 0.5623335509551216, + "learning_rate": 9.484792054624457e-05, + "loss": 3.6794, + "step": 6112 + }, + { + "epoch": 0.2846101915869358, + "grad_norm": 0.5752493547985597, + "learning_rate": 9.486343885785227e-05, + "loss": 3.6382, + "step": 6113 + }, + { + "epoch": 0.28465674977302885, + "grad_norm": 0.5730619995272969, + "learning_rate": 9.487895716945997e-05, + "loss": 3.7583, + "step": 6114 + }, + { + "epoch": 0.2847033079591219, + "grad_norm": 0.6622908852865325, + "learning_rate": 9.489447548106766e-05, + "loss": 3.6317, + "step": 6115 + }, + { + "epoch": 0.28474986614521497, + "grad_norm": 0.5427318178465952, + "learning_rate": 9.490999379267537e-05, + "loss": 3.6504, + "step": 6116 + }, + { + "epoch": 0.28479642433130803, + "grad_norm": 0.4618554497048231, + "learning_rate": 9.492551210428305e-05, + "loss": 3.5854, + "step": 6117 + }, + { + "epoch": 0.28484298251740114, + "grad_norm": 0.5077236959067442, + 
"learning_rate": 9.494103041589075e-05, + "loss": 3.6774, + "step": 6118 + }, + { + "epoch": 0.2848895407034942, + "grad_norm": 0.5092106274958731, + "learning_rate": 9.495654872749845e-05, + "loss": 3.6743, + "step": 6119 + }, + { + "epoch": 0.28493609888958726, + "grad_norm": 0.4811663577502313, + "learning_rate": 9.497206703910614e-05, + "loss": 3.5967, + "step": 6120 + }, + { + "epoch": 0.2849826570756803, + "grad_norm": 0.5468555525816917, + "learning_rate": 9.498758535071385e-05, + "loss": 3.6708, + "step": 6121 + }, + { + "epoch": 0.2850292152617734, + "grad_norm": 0.635677259579925, + "learning_rate": 9.500310366232155e-05, + "loss": 3.6584, + "step": 6122 + }, + { + "epoch": 0.2850757734478665, + "grad_norm": 0.7563153065402377, + "learning_rate": 9.501862197392925e-05, + "loss": 3.7613, + "step": 6123 + }, + { + "epoch": 0.28512233163395956, + "grad_norm": 0.7736202878911949, + "learning_rate": 9.503414028553694e-05, + "loss": 3.6, + "step": 6124 + }, + { + "epoch": 0.2851688898200526, + "grad_norm": 0.7029423219823149, + "learning_rate": 9.504965859714463e-05, + "loss": 3.6384, + "step": 6125 + }, + { + "epoch": 0.2852154480061457, + "grad_norm": 0.5883701928890042, + "learning_rate": 9.506517690875234e-05, + "loss": 3.659, + "step": 6126 + }, + { + "epoch": 0.28526200619223874, + "grad_norm": 0.7394417956120776, + "learning_rate": 9.508069522036003e-05, + "loss": 3.6819, + "step": 6127 + }, + { + "epoch": 0.2853085643783318, + "grad_norm": 0.7548715960235423, + "learning_rate": 9.509621353196773e-05, + "loss": 3.6269, + "step": 6128 + }, + { + "epoch": 0.2853551225644249, + "grad_norm": 0.6320439900729967, + "learning_rate": 9.511173184357542e-05, + "loss": 3.6776, + "step": 6129 + }, + { + "epoch": 0.28540168075051797, + "grad_norm": 0.630397131411245, + "learning_rate": 9.512725015518312e-05, + "loss": 3.7062, + "step": 6130 + }, + { + "epoch": 0.28544823893661103, + "grad_norm": 0.6625100162575381, + "learning_rate": 9.514276846679082e-05, + "loss": 3.5755, + "step": 6131 + }, + { + "epoch": 0.2854947971227041, + "grad_norm": 0.600834586254908, + "learning_rate": 9.515828677839853e-05, + "loss": 3.6171, + "step": 6132 + }, + { + "epoch": 0.28554135530879715, + "grad_norm": 0.5519301334497306, + "learning_rate": 9.517380509000621e-05, + "loss": 3.4983, + "step": 6133 + }, + { + "epoch": 0.28558791349489027, + "grad_norm": 0.6165960205730218, + "learning_rate": 9.51893234016139e-05, + "loss": 3.6566, + "step": 6134 + }, + { + "epoch": 0.2856344716809833, + "grad_norm": 0.6223737939681127, + "learning_rate": 9.52048417132216e-05, + "loss": 3.7062, + "step": 6135 + }, + { + "epoch": 0.2856810298670764, + "grad_norm": 0.5938301534094137, + "learning_rate": 9.52203600248293e-05, + "loss": 3.5264, + "step": 6136 + }, + { + "epoch": 0.28572758805316945, + "grad_norm": 0.478135312287589, + "learning_rate": 9.5235878336437e-05, + "loss": 3.6413, + "step": 6137 + }, + { + "epoch": 0.2857741462392625, + "grad_norm": 0.5648698851598897, + "learning_rate": 9.52513966480447e-05, + "loss": 3.5254, + "step": 6138 + }, + { + "epoch": 0.28582070442535557, + "grad_norm": 0.6370766174149546, + "learning_rate": 9.52669149596524e-05, + "loss": 3.6757, + "step": 6139 + }, + { + "epoch": 0.2858672626114487, + "grad_norm": 0.7025464821991663, + "learning_rate": 9.52824332712601e-05, + "loss": 3.7209, + "step": 6140 + }, + { + "epoch": 0.28591382079754174, + "grad_norm": 0.5975562885447053, + "learning_rate": 9.529795158286778e-05, + "loss": 3.7401, + "step": 6141 + }, + { + "epoch": 
0.2859603789836348, + "grad_norm": 0.5063940036396226, + "learning_rate": 9.531346989447548e-05, + "loss": 3.6282, + "step": 6142 + }, + { + "epoch": 0.28600693716972786, + "grad_norm": 0.4816433099925703, + "learning_rate": 9.532898820608319e-05, + "loss": 3.7467, + "step": 6143 + }, + { + "epoch": 0.2860534953558209, + "grad_norm": 0.4775638490498531, + "learning_rate": 9.534450651769088e-05, + "loss": 3.4957, + "step": 6144 + }, + { + "epoch": 0.28610005354191403, + "grad_norm": 0.5567042270154383, + "learning_rate": 9.536002482929858e-05, + "loss": 3.5861, + "step": 6145 + }, + { + "epoch": 0.2861466117280071, + "grad_norm": 0.6219326511728965, + "learning_rate": 9.537554314090627e-05, + "loss": 3.7247, + "step": 6146 + }, + { + "epoch": 0.28619316991410015, + "grad_norm": 0.6851521198000136, + "learning_rate": 9.539106145251397e-05, + "loss": 3.5556, + "step": 6147 + }, + { + "epoch": 0.2862397281001932, + "grad_norm": 0.5861887186888163, + "learning_rate": 9.540657976412167e-05, + "loss": 3.7652, + "step": 6148 + }, + { + "epoch": 0.2862862862862863, + "grad_norm": 0.4999577492361278, + "learning_rate": 9.542209807572936e-05, + "loss": 3.6795, + "step": 6149 + }, + { + "epoch": 0.28633284447237933, + "grad_norm": 0.48464779068078667, + "learning_rate": 9.543761638733706e-05, + "loss": 3.6351, + "step": 6150 + }, + { + "epoch": 0.28637940265847245, + "grad_norm": 0.5546552436904827, + "learning_rate": 9.545313469894476e-05, + "loss": 3.6625, + "step": 6151 + }, + { + "epoch": 0.2864259608445655, + "grad_norm": 0.5588768856643954, + "learning_rate": 9.546865301055245e-05, + "loss": 3.6518, + "step": 6152 + }, + { + "epoch": 0.28647251903065857, + "grad_norm": 0.5743749764463368, + "learning_rate": 9.548417132216015e-05, + "loss": 3.5412, + "step": 6153 + }, + { + "epoch": 0.2865190772167516, + "grad_norm": 0.6102039891471782, + "learning_rate": 9.549968963376786e-05, + "loss": 3.718, + "step": 6154 + }, + { + "epoch": 0.2865656354028447, + "grad_norm": 0.591983728573526, + "learning_rate": 9.551520794537556e-05, + "loss": 3.6124, + "step": 6155 + }, + { + "epoch": 0.2866121935889378, + "grad_norm": 0.6322531570354373, + "learning_rate": 9.553072625698325e-05, + "loss": 3.67, + "step": 6156 + }, + { + "epoch": 0.28665875177503086, + "grad_norm": 0.7450747356432463, + "learning_rate": 9.554624456859093e-05, + "loss": 3.7256, + "step": 6157 + }, + { + "epoch": 0.2867053099611239, + "grad_norm": 0.8563528902334141, + "learning_rate": 9.556176288019863e-05, + "loss": 3.6438, + "step": 6158 + }, + { + "epoch": 0.286751868147217, + "grad_norm": 0.6501804773143738, + "learning_rate": 9.557728119180634e-05, + "loss": 3.6175, + "step": 6159 + }, + { + "epoch": 0.28679842633331004, + "grad_norm": 0.563902005738105, + "learning_rate": 9.559279950341404e-05, + "loss": 3.5279, + "step": 6160 + }, + { + "epoch": 0.2868449845194031, + "grad_norm": 0.6322019458696932, + "learning_rate": 9.560831781502173e-05, + "loss": 3.6584, + "step": 6161 + }, + { + "epoch": 0.2868915427054962, + "grad_norm": 0.5920078915680067, + "learning_rate": 9.562383612662943e-05, + "loss": 3.6754, + "step": 6162 + }, + { + "epoch": 0.2869381008915893, + "grad_norm": 0.5418807204693469, + "learning_rate": 9.563935443823713e-05, + "loss": 3.5537, + "step": 6163 + }, + { + "epoch": 0.28698465907768234, + "grad_norm": 0.572471536926373, + "learning_rate": 9.565487274984482e-05, + "loss": 3.5916, + "step": 6164 + }, + { + "epoch": 0.2870312172637754, + "grad_norm": 0.529758631459736, + "learning_rate": 9.567039106145252e-05, + 
"loss": 3.6229, + "step": 6165 + }, + { + "epoch": 0.28707777544986846, + "grad_norm": 0.5345552204711637, + "learning_rate": 9.568590937306021e-05, + "loss": 3.4564, + "step": 6166 + }, + { + "epoch": 0.28712433363596157, + "grad_norm": 0.48878333503077276, + "learning_rate": 9.570142768466791e-05, + "loss": 3.6282, + "step": 6167 + }, + { + "epoch": 0.28717089182205463, + "grad_norm": 0.4757125389710588, + "learning_rate": 9.571694599627561e-05, + "loss": 3.5121, + "step": 6168 + }, + { + "epoch": 0.2872174500081477, + "grad_norm": 0.49065367744667104, + "learning_rate": 9.57324643078833e-05, + "loss": 3.6647, + "step": 6169 + }, + { + "epoch": 0.28726400819424075, + "grad_norm": 0.511298948855791, + "learning_rate": 9.5747982619491e-05, + "loss": 3.6072, + "step": 6170 + }, + { + "epoch": 0.2873105663803338, + "grad_norm": 0.5590729982869072, + "learning_rate": 9.576350093109871e-05, + "loss": 3.6043, + "step": 6171 + }, + { + "epoch": 0.28735712456642687, + "grad_norm": 0.6353891848266873, + "learning_rate": 9.57790192427064e-05, + "loss": 3.5574, + "step": 6172 + }, + { + "epoch": 0.28740368275252, + "grad_norm": 0.5823788791879835, + "learning_rate": 9.579453755431409e-05, + "loss": 3.6757, + "step": 6173 + }, + { + "epoch": 0.28745024093861304, + "grad_norm": 0.5047216104517624, + "learning_rate": 9.581005586592178e-05, + "loss": 3.4481, + "step": 6174 + }, + { + "epoch": 0.2874967991247061, + "grad_norm": 0.606812959124547, + "learning_rate": 9.582557417752948e-05, + "loss": 3.6845, + "step": 6175 + }, + { + "epoch": 0.28754335731079916, + "grad_norm": 0.6632118025736602, + "learning_rate": 9.584109248913719e-05, + "loss": 3.6773, + "step": 6176 + }, + { + "epoch": 0.2875899154968922, + "grad_norm": 0.5415454079347941, + "learning_rate": 9.585661080074489e-05, + "loss": 3.6142, + "step": 6177 + }, + { + "epoch": 0.28763647368298534, + "grad_norm": 0.5339074830308432, + "learning_rate": 9.587212911235258e-05, + "loss": 3.6498, + "step": 6178 + }, + { + "epoch": 0.2876830318690784, + "grad_norm": 0.576595466255615, + "learning_rate": 9.588764742396028e-05, + "loss": 3.6462, + "step": 6179 + }, + { + "epoch": 0.28772959005517146, + "grad_norm": 0.555770548522818, + "learning_rate": 9.590316573556798e-05, + "loss": 3.7116, + "step": 6180 + }, + { + "epoch": 0.2877761482412645, + "grad_norm": 0.5751294728419081, + "learning_rate": 9.591868404717567e-05, + "loss": 3.58, + "step": 6181 + }, + { + "epoch": 0.2878227064273576, + "grad_norm": 0.5526587094126153, + "learning_rate": 9.593420235878337e-05, + "loss": 3.5814, + "step": 6182 + }, + { + "epoch": 0.28786926461345064, + "grad_norm": 0.5096062752675599, + "learning_rate": 9.594972067039107e-05, + "loss": 3.5741, + "step": 6183 + }, + { + "epoch": 0.28791582279954375, + "grad_norm": 0.49070767341297483, + "learning_rate": 9.596523898199876e-05, + "loss": 3.6245, + "step": 6184 + }, + { + "epoch": 0.2879623809856368, + "grad_norm": 0.6026126759399546, + "learning_rate": 9.598075729360646e-05, + "loss": 3.5956, + "step": 6185 + }, + { + "epoch": 0.28800893917172987, + "grad_norm": 0.7190715425564774, + "learning_rate": 9.599627560521415e-05, + "loss": 3.582, + "step": 6186 + }, + { + "epoch": 0.28805549735782293, + "grad_norm": 0.6806334719415726, + "learning_rate": 9.601179391682185e-05, + "loss": 3.646, + "step": 6187 + }, + { + "epoch": 0.288102055543916, + "grad_norm": 0.6684149764160349, + "learning_rate": 9.602731222842956e-05, + "loss": 3.5463, + "step": 6188 + }, + { + "epoch": 0.28814861373000905, + "grad_norm": 
0.6595462465276422, + "learning_rate": 9.604283054003724e-05, + "loss": 3.6064, + "step": 6189 + }, + { + "epoch": 0.28819517191610217, + "grad_norm": 0.6124701391086051, + "learning_rate": 9.605834885164494e-05, + "loss": 3.7438, + "step": 6190 + }, + { + "epoch": 0.2882417301021952, + "grad_norm": 0.5188865654533055, + "learning_rate": 9.607386716325264e-05, + "loss": 3.7024, + "step": 6191 + }, + { + "epoch": 0.2882882882882883, + "grad_norm": 0.7063776812513871, + "learning_rate": 9.608938547486033e-05, + "loss": 3.7592, + "step": 6192 + }, + { + "epoch": 0.28833484647438135, + "grad_norm": 0.9087705104343285, + "learning_rate": 9.610490378646804e-05, + "loss": 3.6419, + "step": 6193 + }, + { + "epoch": 0.2883814046604744, + "grad_norm": 0.7129463910033921, + "learning_rate": 9.612042209807574e-05, + "loss": 3.6253, + "step": 6194 + }, + { + "epoch": 0.2884279628465675, + "grad_norm": 0.6071566016166614, + "learning_rate": 9.613594040968343e-05, + "loss": 3.5825, + "step": 6195 + }, + { + "epoch": 0.2884745210326606, + "grad_norm": 0.609458806033633, + "learning_rate": 9.615145872129113e-05, + "loss": 3.6355, + "step": 6196 + }, + { + "epoch": 0.28852107921875364, + "grad_norm": 0.6146638055642537, + "learning_rate": 9.616697703289883e-05, + "loss": 3.5571, + "step": 6197 + }, + { + "epoch": 0.2885676374048467, + "grad_norm": 0.6321313121330435, + "learning_rate": 9.618249534450652e-05, + "loss": 3.5751, + "step": 6198 + }, + { + "epoch": 0.28861419559093976, + "grad_norm": 0.6474847543016872, + "learning_rate": 9.619801365611422e-05, + "loss": 3.5876, + "step": 6199 + }, + { + "epoch": 0.2886607537770328, + "grad_norm": 0.5530363646373103, + "learning_rate": 9.621353196772192e-05, + "loss": 3.541, + "step": 6200 + }, + { + "epoch": 0.28870731196312593, + "grad_norm": 0.5976648096937804, + "learning_rate": 9.622905027932961e-05, + "loss": 3.6937, + "step": 6201 + }, + { + "epoch": 0.288753870149219, + "grad_norm": 0.6293112677722206, + "learning_rate": 9.624456859093731e-05, + "loss": 3.7246, + "step": 6202 + }, + { + "epoch": 0.28880042833531205, + "grad_norm": 0.5862846007415001, + "learning_rate": 9.6260086902545e-05, + "loss": 3.6493, + "step": 6203 + }, + { + "epoch": 0.2888469865214051, + "grad_norm": 0.5739393339732245, + "learning_rate": 9.627560521415271e-05, + "loss": 3.6723, + "step": 6204 + }, + { + "epoch": 0.2888935447074982, + "grad_norm": 0.6385335353497712, + "learning_rate": 9.629112352576041e-05, + "loss": 3.6472, + "step": 6205 + }, + { + "epoch": 0.2889401028935913, + "grad_norm": 0.5400452300664058, + "learning_rate": 9.63066418373681e-05, + "loss": 3.5851, + "step": 6206 + }, + { + "epoch": 0.28898666107968435, + "grad_norm": 0.5664031189545905, + "learning_rate": 9.632216014897579e-05, + "loss": 3.5689, + "step": 6207 + }, + { + "epoch": 0.2890332192657774, + "grad_norm": 0.5910797826630685, + "learning_rate": 9.633767846058349e-05, + "loss": 3.7281, + "step": 6208 + }, + { + "epoch": 0.28907977745187047, + "grad_norm": 0.5783025767746488, + "learning_rate": 9.63531967721912e-05, + "loss": 3.634, + "step": 6209 + }, + { + "epoch": 0.2891263356379635, + "grad_norm": 0.5656504283070105, + "learning_rate": 9.636871508379889e-05, + "loss": 3.6822, + "step": 6210 + }, + { + "epoch": 0.2891728938240566, + "grad_norm": 0.5889623456996251, + "learning_rate": 9.638423339540659e-05, + "loss": 3.5982, + "step": 6211 + }, + { + "epoch": 0.2892194520101497, + "grad_norm": 0.49750680368965977, + "learning_rate": 9.639975170701429e-05, + "loss": 3.6997, + "step": 6212 + }, + { 
+ "epoch": 0.28926601019624276, + "grad_norm": 0.5638381172468976, + "learning_rate": 9.641527001862198e-05, + "loss": 3.7213, + "step": 6213 + }, + { + "epoch": 0.2893125683823358, + "grad_norm": 0.5824252333212228, + "learning_rate": 9.643078833022966e-05, + "loss": 3.6571, + "step": 6214 + }, + { + "epoch": 0.2893591265684289, + "grad_norm": 0.5963346693849856, + "learning_rate": 9.644630664183737e-05, + "loss": 3.6208, + "step": 6215 + }, + { + "epoch": 0.28940568475452194, + "grad_norm": 0.5305875783973864, + "learning_rate": 9.646182495344507e-05, + "loss": 3.7292, + "step": 6216 + }, + { + "epoch": 0.28945224294061506, + "grad_norm": 0.46649418069914933, + "learning_rate": 9.647734326505277e-05, + "loss": 3.7625, + "step": 6217 + }, + { + "epoch": 0.2894988011267081, + "grad_norm": 0.5634654455424004, + "learning_rate": 9.649286157666046e-05, + "loss": 3.6534, + "step": 6218 + }, + { + "epoch": 0.2895453593128012, + "grad_norm": 0.5233409546576151, + "learning_rate": 9.650837988826816e-05, + "loss": 3.721, + "step": 6219 + }, + { + "epoch": 0.28959191749889424, + "grad_norm": 0.57119736656223, + "learning_rate": 9.652389819987586e-05, + "loss": 3.6899, + "step": 6220 + }, + { + "epoch": 0.2896384756849873, + "grad_norm": 0.557028748973944, + "learning_rate": 9.653941651148357e-05, + "loss": 3.683, + "step": 6221 + }, + { + "epoch": 0.28968503387108036, + "grad_norm": 0.5746627738054021, + "learning_rate": 9.655493482309125e-05, + "loss": 3.6854, + "step": 6222 + }, + { + "epoch": 0.28973159205717347, + "grad_norm": 0.570066475309967, + "learning_rate": 9.657045313469894e-05, + "loss": 3.7393, + "step": 6223 + }, + { + "epoch": 0.28977815024326653, + "grad_norm": 0.5177132508575548, + "learning_rate": 9.658597144630664e-05, + "loss": 3.6268, + "step": 6224 + }, + { + "epoch": 0.2898247084293596, + "grad_norm": 0.5066256993834899, + "learning_rate": 9.660148975791434e-05, + "loss": 3.5284, + "step": 6225 + }, + { + "epoch": 0.28987126661545265, + "grad_norm": 0.6115877285918726, + "learning_rate": 9.661700806952205e-05, + "loss": 3.6696, + "step": 6226 + }, + { + "epoch": 0.2899178248015457, + "grad_norm": 0.6471283433100402, + "learning_rate": 9.663252638112974e-05, + "loss": 3.6152, + "step": 6227 + }, + { + "epoch": 0.2899643829876388, + "grad_norm": 0.6536686474855867, + "learning_rate": 9.664804469273744e-05, + "loss": 3.6677, + "step": 6228 + }, + { + "epoch": 0.2900109411737319, + "grad_norm": 0.6322465347934109, + "learning_rate": 9.666356300434514e-05, + "loss": 3.5846, + "step": 6229 + }, + { + "epoch": 0.29005749935982494, + "grad_norm": 0.7839917365982678, + "learning_rate": 9.667908131595282e-05, + "loss": 3.5919, + "step": 6230 + }, + { + "epoch": 0.290104057545918, + "grad_norm": 0.7533203604966918, + "learning_rate": 9.669459962756053e-05, + "loss": 3.5828, + "step": 6231 + }, + { + "epoch": 0.29015061573201106, + "grad_norm": 0.7412971570704738, + "learning_rate": 9.671011793916823e-05, + "loss": 3.6407, + "step": 6232 + }, + { + "epoch": 0.2901971739181041, + "grad_norm": 0.5901143612958574, + "learning_rate": 9.672563625077592e-05, + "loss": 3.628, + "step": 6233 + }, + { + "epoch": 0.29024373210419724, + "grad_norm": 0.6555980392727566, + "learning_rate": 9.674115456238362e-05, + "loss": 3.5486, + "step": 6234 + }, + { + "epoch": 0.2902902902902903, + "grad_norm": 0.5893333086831934, + "learning_rate": 9.675667287399131e-05, + "loss": 3.6798, + "step": 6235 + }, + { + "epoch": 0.29033684847638336, + "grad_norm": 0.6082142070185834, + "learning_rate": 
9.677219118559901e-05, + "loss": 3.7494, + "step": 6236 + }, + { + "epoch": 0.2903834066624764, + "grad_norm": 0.6124752363877574, + "learning_rate": 9.67877094972067e-05, + "loss": 3.6354, + "step": 6237 + }, + { + "epoch": 0.2904299648485695, + "grad_norm": 0.6983165511520569, + "learning_rate": 9.68032278088144e-05, + "loss": 3.6517, + "step": 6238 + }, + { + "epoch": 0.2904765230346626, + "grad_norm": 0.5347257935249575, + "learning_rate": 9.68187461204221e-05, + "loss": 3.6227, + "step": 6239 + }, + { + "epoch": 0.29052308122075565, + "grad_norm": 0.501134928650455, + "learning_rate": 9.68342644320298e-05, + "loss": 3.5994, + "step": 6240 + }, + { + "epoch": 0.2905696394068487, + "grad_norm": 0.5413708856159095, + "learning_rate": 9.684978274363749e-05, + "loss": 3.5123, + "step": 6241 + }, + { + "epoch": 0.29061619759294177, + "grad_norm": 0.4803650643463497, + "learning_rate": 9.686530105524519e-05, + "loss": 3.6234, + "step": 6242 + }, + { + "epoch": 0.29066275577903483, + "grad_norm": 0.5078205777563543, + "learning_rate": 9.68808193668529e-05, + "loss": 3.6316, + "step": 6243 + }, + { + "epoch": 0.2907093139651279, + "grad_norm": 0.5226663450080938, + "learning_rate": 9.68963376784606e-05, + "loss": 3.6081, + "step": 6244 + }, + { + "epoch": 0.290755872151221, + "grad_norm": 0.556820266171919, + "learning_rate": 9.691185599006829e-05, + "loss": 3.5386, + "step": 6245 + }, + { + "epoch": 0.29080243033731407, + "grad_norm": 0.5139082736672769, + "learning_rate": 9.692737430167597e-05, + "loss": 3.7393, + "step": 6246 + }, + { + "epoch": 0.2908489885234071, + "grad_norm": 0.5755935071771369, + "learning_rate": 9.694289261328367e-05, + "loss": 3.6672, + "step": 6247 + }, + { + "epoch": 0.2908955467095002, + "grad_norm": 0.522796497038344, + "learning_rate": 9.695841092489138e-05, + "loss": 3.5946, + "step": 6248 + }, + { + "epoch": 0.29094210489559325, + "grad_norm": 0.5098576451478948, + "learning_rate": 9.697392923649908e-05, + "loss": 3.6426, + "step": 6249 + }, + { + "epoch": 0.29098866308168636, + "grad_norm": 0.5194975598061646, + "learning_rate": 9.698944754810677e-05, + "loss": 3.7546, + "step": 6250 + }, + { + "epoch": 0.2910352212677794, + "grad_norm": 0.5569450005500661, + "learning_rate": 9.700496585971447e-05, + "loss": 3.7312, + "step": 6251 + }, + { + "epoch": 0.2910817794538725, + "grad_norm": 0.5583041615639471, + "learning_rate": 9.702048417132216e-05, + "loss": 3.5946, + "step": 6252 + }, + { + "epoch": 0.29112833763996554, + "grad_norm": 0.5626649184271716, + "learning_rate": 9.703600248292986e-05, + "loss": 3.5916, + "step": 6253 + }, + { + "epoch": 0.2911748958260586, + "grad_norm": 0.575328942415846, + "learning_rate": 9.705152079453756e-05, + "loss": 3.6546, + "step": 6254 + }, + { + "epoch": 0.29122145401215166, + "grad_norm": 0.5460349422759616, + "learning_rate": 9.706703910614525e-05, + "loss": 3.6235, + "step": 6255 + }, + { + "epoch": 0.2912680121982448, + "grad_norm": 0.4636811943008671, + "learning_rate": 9.708255741775295e-05, + "loss": 3.463, + "step": 6256 + }, + { + "epoch": 0.29131457038433783, + "grad_norm": 0.4805663106822257, + "learning_rate": 9.709807572936065e-05, + "loss": 3.5738, + "step": 6257 + }, + { + "epoch": 0.2913611285704309, + "grad_norm": 0.4894434509769517, + "learning_rate": 9.711359404096834e-05, + "loss": 3.6942, + "step": 6258 + }, + { + "epoch": 0.29140768675652395, + "grad_norm": 0.5553100592472141, + "learning_rate": 9.712911235257605e-05, + "loss": 3.6394, + "step": 6259 + }, + { + "epoch": 0.291454244942617, + 
"grad_norm": 0.5093533207923667, + "learning_rate": 9.714463066418375e-05, + "loss": 3.5741, + "step": 6260 + }, + { + "epoch": 0.29150080312871013, + "grad_norm": 0.5640040695436105, + "learning_rate": 9.716014897579145e-05, + "loss": 3.6039, + "step": 6261 + }, + { + "epoch": 0.2915473613148032, + "grad_norm": 0.5570471186909767, + "learning_rate": 9.717566728739913e-05, + "loss": 3.5593, + "step": 6262 + }, + { + "epoch": 0.29159391950089625, + "grad_norm": 0.5459588179451061, + "learning_rate": 9.719118559900682e-05, + "loss": 3.6057, + "step": 6263 + }, + { + "epoch": 0.2916404776869893, + "grad_norm": 0.48052879383646996, + "learning_rate": 9.720670391061453e-05, + "loss": 3.5768, + "step": 6264 + }, + { + "epoch": 0.29168703587308237, + "grad_norm": 0.5230229663178811, + "learning_rate": 9.722222222222223e-05, + "loss": 3.5314, + "step": 6265 + }, + { + "epoch": 0.2917335940591754, + "grad_norm": 0.5354703942384619, + "learning_rate": 9.723774053382993e-05, + "loss": 3.6772, + "step": 6266 + }, + { + "epoch": 0.29178015224526854, + "grad_norm": 0.564922476606211, + "learning_rate": 9.725325884543762e-05, + "loss": 3.6084, + "step": 6267 + }, + { + "epoch": 0.2918267104313616, + "grad_norm": 0.6021162201196697, + "learning_rate": 9.726877715704532e-05, + "loss": 3.6329, + "step": 6268 + }, + { + "epoch": 0.29187326861745466, + "grad_norm": 0.5771802988137027, + "learning_rate": 9.728429546865302e-05, + "loss": 3.7369, + "step": 6269 + }, + { + "epoch": 0.2919198268035477, + "grad_norm": 0.6433012959455751, + "learning_rate": 9.729981378026071e-05, + "loss": 3.6926, + "step": 6270 + }, + { + "epoch": 0.2919663849896408, + "grad_norm": 0.7041675449948476, + "learning_rate": 9.731533209186841e-05, + "loss": 3.6645, + "step": 6271 + }, + { + "epoch": 0.2920129431757339, + "grad_norm": 0.6582604703761046, + "learning_rate": 9.73308504034761e-05, + "loss": 3.685, + "step": 6272 + }, + { + "epoch": 0.29205950136182696, + "grad_norm": 0.6921600867002414, + "learning_rate": 9.73463687150838e-05, + "loss": 3.6542, + "step": 6273 + }, + { + "epoch": 0.29210605954792, + "grad_norm": 0.7533091539087092, + "learning_rate": 9.73618870266915e-05, + "loss": 3.5858, + "step": 6274 + }, + { + "epoch": 0.2921526177340131, + "grad_norm": 0.5341804257749055, + "learning_rate": 9.73774053382992e-05, + "loss": 3.5984, + "step": 6275 + }, + { + "epoch": 0.29219917592010614, + "grad_norm": 0.5555565925187174, + "learning_rate": 9.73929236499069e-05, + "loss": 3.5812, + "step": 6276 + }, + { + "epoch": 0.2922457341061992, + "grad_norm": 0.7710072452287647, + "learning_rate": 9.74084419615146e-05, + "loss": 3.6163, + "step": 6277 + }, + { + "epoch": 0.2922922922922923, + "grad_norm": 0.6824634162828466, + "learning_rate": 9.74239602731223e-05, + "loss": 3.6857, + "step": 6278 + }, + { + "epoch": 0.29233885047838537, + "grad_norm": 0.6450865535936101, + "learning_rate": 9.743947858472998e-05, + "loss": 3.6259, + "step": 6279 + }, + { + "epoch": 0.29238540866447843, + "grad_norm": 0.6212976272464964, + "learning_rate": 9.745499689633767e-05, + "loss": 3.5677, + "step": 6280 + }, + { + "epoch": 0.2924319668505715, + "grad_norm": 0.7118235242098924, + "learning_rate": 9.747051520794538e-05, + "loss": 3.5734, + "step": 6281 + }, + { + "epoch": 0.29247852503666455, + "grad_norm": 0.6813029432100832, + "learning_rate": 9.748603351955308e-05, + "loss": 3.5431, + "step": 6282 + }, + { + "epoch": 0.29252508322275766, + "grad_norm": 0.5120998655394745, + "learning_rate": 9.750155183116078e-05, + "loss": 3.6413, + "step": 
6283 + }, + { + "epoch": 0.2925716414088507, + "grad_norm": 0.5707414041875173, + "learning_rate": 9.751707014276847e-05, + "loss": 3.5138, + "step": 6284 + }, + { + "epoch": 0.2926181995949438, + "grad_norm": 0.6037342954332577, + "learning_rate": 9.753258845437617e-05, + "loss": 3.5881, + "step": 6285 + }, + { + "epoch": 0.29266475778103684, + "grad_norm": 0.4756733043164846, + "learning_rate": 9.754810676598387e-05, + "loss": 3.4967, + "step": 6286 + }, + { + "epoch": 0.2927113159671299, + "grad_norm": 0.5010373384689352, + "learning_rate": 9.756362507759156e-05, + "loss": 3.5937, + "step": 6287 + }, + { + "epoch": 0.29275787415322296, + "grad_norm": 0.5816886196584568, + "learning_rate": 9.757914338919926e-05, + "loss": 3.5962, + "step": 6288 + }, + { + "epoch": 0.2928044323393161, + "grad_norm": 0.5557887291344756, + "learning_rate": 9.759466170080696e-05, + "loss": 3.6481, + "step": 6289 + }, + { + "epoch": 0.29285099052540914, + "grad_norm": 0.5789547923842833, + "learning_rate": 9.761018001241465e-05, + "loss": 3.5329, + "step": 6290 + }, + { + "epoch": 0.2928975487115022, + "grad_norm": 0.5528241229273505, + "learning_rate": 9.762569832402235e-05, + "loss": 3.63, + "step": 6291 + }, + { + "epoch": 0.29294410689759526, + "grad_norm": 0.49462535889216697, + "learning_rate": 9.764121663563004e-05, + "loss": 3.582, + "step": 6292 + }, + { + "epoch": 0.2929906650836883, + "grad_norm": 0.5965331831333999, + "learning_rate": 9.765673494723775e-05, + "loss": 3.5298, + "step": 6293 + }, + { + "epoch": 0.29303722326978143, + "grad_norm": 0.48643737655730523, + "learning_rate": 9.767225325884545e-05, + "loss": 3.5869, + "step": 6294 + }, + { + "epoch": 0.2930837814558745, + "grad_norm": 0.6027678442963953, + "learning_rate": 9.768777157045313e-05, + "loss": 3.6771, + "step": 6295 + }, + { + "epoch": 0.29313033964196755, + "grad_norm": 0.6550106821410344, + "learning_rate": 9.770328988206083e-05, + "loss": 3.6215, + "step": 6296 + }, + { + "epoch": 0.2931768978280606, + "grad_norm": 0.5983183553995043, + "learning_rate": 9.771880819366853e-05, + "loss": 3.5558, + "step": 6297 + }, + { + "epoch": 0.29322345601415367, + "grad_norm": 0.5578434364578324, + "learning_rate": 9.773432650527624e-05, + "loss": 3.6444, + "step": 6298 + }, + { + "epoch": 0.29327001420024673, + "grad_norm": 0.4956301093035844, + "learning_rate": 9.774984481688393e-05, + "loss": 3.6791, + "step": 6299 + }, + { + "epoch": 0.29331657238633985, + "grad_norm": 0.5349747610875513, + "learning_rate": 9.776536312849163e-05, + "loss": 3.5012, + "step": 6300 + }, + { + "epoch": 0.2933631305724329, + "grad_norm": 0.5168505742817119, + "learning_rate": 9.778088144009932e-05, + "loss": 3.6487, + "step": 6301 + }, + { + "epoch": 0.29340968875852597, + "grad_norm": 0.5347410526237758, + "learning_rate": 9.779639975170702e-05, + "loss": 3.644, + "step": 6302 + }, + { + "epoch": 0.293456246944619, + "grad_norm": 0.4816576385575291, + "learning_rate": 9.781191806331472e-05, + "loss": 3.5008, + "step": 6303 + }, + { + "epoch": 0.2935028051307121, + "grad_norm": 0.5268488983724254, + "learning_rate": 9.782743637492241e-05, + "loss": 3.5738, + "step": 6304 + }, + { + "epoch": 0.2935493633168052, + "grad_norm": 0.5029097926215711, + "learning_rate": 9.784295468653011e-05, + "loss": 3.6202, + "step": 6305 + }, + { + "epoch": 0.29359592150289826, + "grad_norm": 0.5960093071804995, + "learning_rate": 9.78584729981378e-05, + "loss": 3.636, + "step": 6306 + }, + { + "epoch": 0.2936424796889913, + "grad_norm": 0.5411817260437124, + "learning_rate": 
9.78739913097455e-05, + "loss": 3.5689, + "step": 6307 + }, + { + "epoch": 0.2936890378750844, + "grad_norm": 0.5532004981225029, + "learning_rate": 9.78895096213532e-05, + "loss": 3.7663, + "step": 6308 + }, + { + "epoch": 0.29373559606117744, + "grad_norm": 0.5984105870953219, + "learning_rate": 9.790502793296091e-05, + "loss": 3.5429, + "step": 6309 + }, + { + "epoch": 0.2937821542472705, + "grad_norm": 0.6492813378322024, + "learning_rate": 9.79205462445686e-05, + "loss": 3.6927, + "step": 6310 + }, + { + "epoch": 0.2938287124333636, + "grad_norm": 0.6210040657846994, + "learning_rate": 9.793606455617629e-05, + "loss": 3.6573, + "step": 6311 + }, + { + "epoch": 0.2938752706194567, + "grad_norm": 0.6764645277077055, + "learning_rate": 9.795158286778398e-05, + "loss": 3.6374, + "step": 6312 + }, + { + "epoch": 0.29392182880554973, + "grad_norm": 0.7132414639088254, + "learning_rate": 9.796710117939168e-05, + "loss": 3.6052, + "step": 6313 + }, + { + "epoch": 0.2939683869916428, + "grad_norm": 0.6388598976839175, + "learning_rate": 9.798261949099939e-05, + "loss": 3.7411, + "step": 6314 + }, + { + "epoch": 0.29401494517773585, + "grad_norm": 0.5725541062201244, + "learning_rate": 9.799813780260709e-05, + "loss": 3.6762, + "step": 6315 + }, + { + "epoch": 0.29406150336382897, + "grad_norm": 0.4890705999368575, + "learning_rate": 9.801365611421478e-05, + "loss": 3.5898, + "step": 6316 + }, + { + "epoch": 0.29410806154992203, + "grad_norm": 0.5169359535249677, + "learning_rate": 9.802917442582248e-05, + "loss": 3.4851, + "step": 6317 + }, + { + "epoch": 0.2941546197360151, + "grad_norm": 0.5998307436751688, + "learning_rate": 9.804469273743018e-05, + "loss": 3.6187, + "step": 6318 + }, + { + "epoch": 0.29420117792210815, + "grad_norm": 0.49932451707149605, + "learning_rate": 9.806021104903786e-05, + "loss": 3.5201, + "step": 6319 + }, + { + "epoch": 0.2942477361082012, + "grad_norm": 0.5039993298850763, + "learning_rate": 9.807572936064557e-05, + "loss": 3.5082, + "step": 6320 + }, + { + "epoch": 0.29429429429429427, + "grad_norm": 0.6246643240289613, + "learning_rate": 9.809124767225326e-05, + "loss": 3.6533, + "step": 6321 + }, + { + "epoch": 0.2943408524803874, + "grad_norm": 0.6968219480430989, + "learning_rate": 9.810676598386096e-05, + "loss": 3.7012, + "step": 6322 + }, + { + "epoch": 0.29438741066648044, + "grad_norm": 0.5987760753826696, + "learning_rate": 9.812228429546866e-05, + "loss": 3.6551, + "step": 6323 + }, + { + "epoch": 0.2944339688525735, + "grad_norm": 0.5325944997947298, + "learning_rate": 9.813780260707635e-05, + "loss": 3.5953, + "step": 6324 + }, + { + "epoch": 0.29448052703866656, + "grad_norm": 0.5373703353929452, + "learning_rate": 9.815332091868405e-05, + "loss": 3.6549, + "step": 6325 + }, + { + "epoch": 0.2945270852247596, + "grad_norm": 0.5151752705644191, + "learning_rate": 9.816883923029176e-05, + "loss": 3.6189, + "step": 6326 + }, + { + "epoch": 0.29457364341085274, + "grad_norm": 0.5724706201733634, + "learning_rate": 9.818435754189944e-05, + "loss": 3.5797, + "step": 6327 + }, + { + "epoch": 0.2946202015969458, + "grad_norm": 0.5409689674476641, + "learning_rate": 9.819987585350714e-05, + "loss": 3.6146, + "step": 6328 + }, + { + "epoch": 0.29466675978303886, + "grad_norm": 0.5378411607314068, + "learning_rate": 9.821539416511483e-05, + "loss": 3.4778, + "step": 6329 + }, + { + "epoch": 0.2947133179691319, + "grad_norm": 0.48474379798206196, + "learning_rate": 9.823091247672253e-05, + "loss": 3.656, + "step": 6330 + }, + { + "epoch": 0.294759876155225, 
+ "grad_norm": 0.513843542013276, + "learning_rate": 9.824643078833024e-05, + "loss": 3.5059, + "step": 6331 + }, + { + "epoch": 0.29480643434131804, + "grad_norm": 0.5076623961283158, + "learning_rate": 9.826194909993794e-05, + "loss": 3.6253, + "step": 6332 + }, + { + "epoch": 0.29485299252741115, + "grad_norm": 0.500474240898729, + "learning_rate": 9.827746741154563e-05, + "loss": 3.7471, + "step": 6333 + }, + { + "epoch": 0.2948995507135042, + "grad_norm": 0.48800839737787405, + "learning_rate": 9.829298572315333e-05, + "loss": 3.5679, + "step": 6334 + }, + { + "epoch": 0.29494610889959727, + "grad_norm": 0.4275608713363109, + "learning_rate": 9.830850403476101e-05, + "loss": 3.5789, + "step": 6335 + }, + { + "epoch": 0.29499266708569033, + "grad_norm": 0.5705101350988223, + "learning_rate": 9.832402234636872e-05, + "loss": 3.6299, + "step": 6336 + }, + { + "epoch": 0.2950392252717834, + "grad_norm": 0.5052387331578667, + "learning_rate": 9.833954065797642e-05, + "loss": 3.5891, + "step": 6337 + }, + { + "epoch": 0.2950857834578765, + "grad_norm": 0.5409218714836671, + "learning_rate": 9.835505896958411e-05, + "loss": 3.5272, + "step": 6338 + }, + { + "epoch": 0.29513234164396956, + "grad_norm": 0.5760591650377205, + "learning_rate": 9.837057728119181e-05, + "loss": 3.6654, + "step": 6339 + }, + { + "epoch": 0.2951788998300626, + "grad_norm": 0.6716810442425528, + "learning_rate": 9.838609559279951e-05, + "loss": 3.543, + "step": 6340 + }, + { + "epoch": 0.2952254580161557, + "grad_norm": 0.7106118867923543, + "learning_rate": 9.84016139044072e-05, + "loss": 3.5586, + "step": 6341 + }, + { + "epoch": 0.29527201620224874, + "grad_norm": 0.6025428710849109, + "learning_rate": 9.84171322160149e-05, + "loss": 3.6254, + "step": 6342 + }, + { + "epoch": 0.2953185743883418, + "grad_norm": 0.5665853644302943, + "learning_rate": 9.84326505276226e-05, + "loss": 3.5657, + "step": 6343 + }, + { + "epoch": 0.2953651325744349, + "grad_norm": 0.5159714995876943, + "learning_rate": 9.844816883923029e-05, + "loss": 3.6548, + "step": 6344 + }, + { + "epoch": 0.295411690760528, + "grad_norm": 0.5577940502169288, + "learning_rate": 9.846368715083799e-05, + "loss": 3.5885, + "step": 6345 + }, + { + "epoch": 0.29545824894662104, + "grad_norm": 0.6251818491656678, + "learning_rate": 9.847920546244569e-05, + "loss": 3.5735, + "step": 6346 + }, + { + "epoch": 0.2955048071327141, + "grad_norm": 0.5060557142537867, + "learning_rate": 9.849472377405338e-05, + "loss": 3.5167, + "step": 6347 + }, + { + "epoch": 0.29555136531880716, + "grad_norm": 0.5783099257567217, + "learning_rate": 9.851024208566109e-05, + "loss": 3.6052, + "step": 6348 + }, + { + "epoch": 0.2955979235049003, + "grad_norm": 0.6558656588636703, + "learning_rate": 9.852576039726879e-05, + "loss": 3.5761, + "step": 6349 + }, + { + "epoch": 0.29564448169099333, + "grad_norm": 0.5486322985154212, + "learning_rate": 9.854127870887648e-05, + "loss": 3.5539, + "step": 6350 + }, + { + "epoch": 0.2956910398770864, + "grad_norm": 0.5177732422299387, + "learning_rate": 9.855679702048417e-05, + "loss": 3.6383, + "step": 6351 + }, + { + "epoch": 0.29573759806317945, + "grad_norm": 0.6970060778694, + "learning_rate": 9.857231533209186e-05, + "loss": 3.5901, + "step": 6352 + }, + { + "epoch": 0.2957841562492725, + "grad_norm": 0.518862711962688, + "learning_rate": 9.858783364369957e-05, + "loss": 3.6538, + "step": 6353 + }, + { + "epoch": 0.29583071443536557, + "grad_norm": 0.5556450967461292, + "learning_rate": 9.860335195530727e-05, + "loss": 3.576, + "step": 
6354 + }, + { + "epoch": 0.2958772726214587, + "grad_norm": 0.6451283068562516, + "learning_rate": 9.861887026691497e-05, + "loss": 3.6715, + "step": 6355 + }, + { + "epoch": 0.29592383080755175, + "grad_norm": 0.5559289845500238, + "learning_rate": 9.863438857852266e-05, + "loss": 3.657, + "step": 6356 + }, + { + "epoch": 0.2959703889936448, + "grad_norm": 0.7574037563323374, + "learning_rate": 9.864990689013036e-05, + "loss": 3.6432, + "step": 6357 + }, + { + "epoch": 0.29601694717973787, + "grad_norm": 0.7337186350017604, + "learning_rate": 9.866542520173805e-05, + "loss": 3.5528, + "step": 6358 + }, + { + "epoch": 0.2960635053658309, + "grad_norm": 0.5308830993982306, + "learning_rate": 9.868094351334575e-05, + "loss": 3.6402, + "step": 6359 + }, + { + "epoch": 0.29611006355192404, + "grad_norm": 0.5824259511856096, + "learning_rate": 9.869646182495345e-05, + "loss": 3.5349, + "step": 6360 + }, + { + "epoch": 0.2961566217380171, + "grad_norm": 0.7248549649818696, + "learning_rate": 9.871198013656114e-05, + "loss": 3.7665, + "step": 6361 + }, + { + "epoch": 0.29620317992411016, + "grad_norm": 0.6671755854933017, + "learning_rate": 9.872749844816884e-05, + "loss": 3.5721, + "step": 6362 + }, + { + "epoch": 0.2962497381102032, + "grad_norm": 0.6778352838530739, + "learning_rate": 9.874301675977654e-05, + "loss": 3.6083, + "step": 6363 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.6049067029925416, + "learning_rate": 9.875853507138425e-05, + "loss": 3.6856, + "step": 6364 + }, + { + "epoch": 0.29634285448238934, + "grad_norm": 0.58679490989774, + "learning_rate": 9.877405338299194e-05, + "loss": 3.6797, + "step": 6365 + }, + { + "epoch": 0.29638941266848245, + "grad_norm": 0.739394170043233, + "learning_rate": 9.878957169459964e-05, + "loss": 3.5495, + "step": 6366 + }, + { + "epoch": 0.2964359708545755, + "grad_norm": 0.6145823217002342, + "learning_rate": 9.880509000620733e-05, + "loss": 3.5126, + "step": 6367 + }, + { + "epoch": 0.2964825290406686, + "grad_norm": 0.5431429435840615, + "learning_rate": 9.882060831781502e-05, + "loss": 3.5998, + "step": 6368 + }, + { + "epoch": 0.29652908722676163, + "grad_norm": 0.5816006511233056, + "learning_rate": 9.883612662942271e-05, + "loss": 3.591, + "step": 6369 + }, + { + "epoch": 0.2965756454128547, + "grad_norm": 0.5665439003696846, + "learning_rate": 9.885164494103042e-05, + "loss": 3.6265, + "step": 6370 + }, + { + "epoch": 0.2966222035989478, + "grad_norm": 0.5766383425357082, + "learning_rate": 9.886716325263812e-05, + "loss": 3.698, + "step": 6371 + }, + { + "epoch": 0.29666876178504087, + "grad_norm": 0.6218370900530983, + "learning_rate": 9.888268156424582e-05, + "loss": 3.5684, + "step": 6372 + }, + { + "epoch": 0.29671531997113393, + "grad_norm": 0.5613679634268914, + "learning_rate": 9.889819987585351e-05, + "loss": 3.568, + "step": 6373 + }, + { + "epoch": 0.296761878157227, + "grad_norm": 0.4734860201196043, + "learning_rate": 9.891371818746121e-05, + "loss": 3.6296, + "step": 6374 + }, + { + "epoch": 0.29680843634332005, + "grad_norm": 0.5610419516340802, + "learning_rate": 9.89292364990689e-05, + "loss": 3.5962, + "step": 6375 + }, + { + "epoch": 0.2968549945294131, + "grad_norm": 0.5145830107087306, + "learning_rate": 9.89447548106766e-05, + "loss": 3.5848, + "step": 6376 + }, + { + "epoch": 0.2969015527155062, + "grad_norm": 0.5182315023745876, + "learning_rate": 9.89602731222843e-05, + "loss": 3.6403, + "step": 6377 + }, + { + "epoch": 0.2969481109015993, + "grad_norm": 0.5339831383912805, + "learning_rate": 
9.8975791433892e-05, + "loss": 3.6596, + "step": 6378 + }, + { + "epoch": 0.29699466908769234, + "grad_norm": 0.5015127122946148, + "learning_rate": 9.899130974549969e-05, + "loss": 3.6015, + "step": 6379 + }, + { + "epoch": 0.2970412272737854, + "grad_norm": 0.5674869107084095, + "learning_rate": 9.900682805710739e-05, + "loss": 3.5722, + "step": 6380 + }, + { + "epoch": 0.29708778545987846, + "grad_norm": 0.44356496328890893, + "learning_rate": 9.90223463687151e-05, + "loss": 3.5758, + "step": 6381 + }, + { + "epoch": 0.2971343436459716, + "grad_norm": 0.47783797416467866, + "learning_rate": 9.903786468032279e-05, + "loss": 3.5782, + "step": 6382 + }, + { + "epoch": 0.29718090183206464, + "grad_norm": 0.4399714287925493, + "learning_rate": 9.905338299193049e-05, + "loss": 3.5975, + "step": 6383 + }, + { + "epoch": 0.2972274600181577, + "grad_norm": 0.44523020812523134, + "learning_rate": 9.906890130353817e-05, + "loss": 3.5462, + "step": 6384 + }, + { + "epoch": 0.29727401820425076, + "grad_norm": 0.5334920306939737, + "learning_rate": 9.908441961514587e-05, + "loss": 3.4818, + "step": 6385 + }, + { + "epoch": 0.2973205763903438, + "grad_norm": 0.6478981973429557, + "learning_rate": 9.909993792675358e-05, + "loss": 3.7369, + "step": 6386 + }, + { + "epoch": 0.2973671345764369, + "grad_norm": 0.6965442949137122, + "learning_rate": 9.911545623836127e-05, + "loss": 3.7375, + "step": 6387 + }, + { + "epoch": 0.29741369276253, + "grad_norm": 0.5684114583547836, + "learning_rate": 9.913097454996897e-05, + "loss": 3.7049, + "step": 6388 + }, + { + "epoch": 0.29746025094862305, + "grad_norm": 0.6150584143684702, + "learning_rate": 9.914649286157667e-05, + "loss": 3.6468, + "step": 6389 + }, + { + "epoch": 0.2975068091347161, + "grad_norm": 0.6157310415833158, + "learning_rate": 9.916201117318436e-05, + "loss": 3.6629, + "step": 6390 + }, + { + "epoch": 0.29755336732080917, + "grad_norm": 0.5123154346674057, + "learning_rate": 9.917752948479206e-05, + "loss": 3.5347, + "step": 6391 + }, + { + "epoch": 0.29759992550690223, + "grad_norm": 0.5752722112145306, + "learning_rate": 9.919304779639976e-05, + "loss": 3.6525, + "step": 6392 + }, + { + "epoch": 0.29764648369299535, + "grad_norm": 0.5420031116589089, + "learning_rate": 9.920856610800745e-05, + "loss": 3.6959, + "step": 6393 + }, + { + "epoch": 0.2976930418790884, + "grad_norm": 0.49566281663623507, + "learning_rate": 9.922408441961515e-05, + "loss": 3.5663, + "step": 6394 + }, + { + "epoch": 0.29773960006518146, + "grad_norm": 0.5362882759904991, + "learning_rate": 9.923960273122285e-05, + "loss": 3.6707, + "step": 6395 + }, + { + "epoch": 0.2977861582512745, + "grad_norm": 0.6031085503823439, + "learning_rate": 9.925512104283054e-05, + "loss": 3.5147, + "step": 6396 + }, + { + "epoch": 0.2978327164373676, + "grad_norm": 0.4777016509306535, + "learning_rate": 9.927063935443824e-05, + "loss": 3.5278, + "step": 6397 + }, + { + "epoch": 0.29787927462346064, + "grad_norm": 0.5317624010211787, + "learning_rate": 9.928615766604595e-05, + "loss": 3.4587, + "step": 6398 + }, + { + "epoch": 0.29792583280955376, + "grad_norm": 0.5304505781116355, + "learning_rate": 9.930167597765364e-05, + "loss": 3.5652, + "step": 6399 + }, + { + "epoch": 0.2979723909956468, + "grad_norm": 0.5554993169625164, + "learning_rate": 9.931719428926133e-05, + "loss": 3.6481, + "step": 6400 + }, + { + "epoch": 0.2980189491817399, + "grad_norm": 0.5559975072677233, + "learning_rate": 9.933271260086902e-05, + "loss": 3.7296, + "step": 6401 + }, + { + "epoch": 
0.29806550736783294, + "grad_norm": 0.5120666627872531, + "learning_rate": 9.934823091247672e-05, + "loss": 3.395, + "step": 6402 + }, + { + "epoch": 0.298112065553926, + "grad_norm": 0.5026168831459357, + "learning_rate": 9.936374922408443e-05, + "loss": 3.6223, + "step": 6403 + }, + { + "epoch": 0.2981586237400191, + "grad_norm": 0.6185201215265979, + "learning_rate": 9.937926753569213e-05, + "loss": 3.6204, + "step": 6404 + }, + { + "epoch": 0.2982051819261122, + "grad_norm": 0.5952141395861446, + "learning_rate": 9.939478584729982e-05, + "loss": 3.5946, + "step": 6405 + }, + { + "epoch": 0.29825174011220523, + "grad_norm": 0.5468803209574828, + "learning_rate": 9.941030415890752e-05, + "loss": 3.5907, + "step": 6406 + }, + { + "epoch": 0.2982982982982983, + "grad_norm": 0.6289689273585592, + "learning_rate": 9.942582247051521e-05, + "loss": 3.493, + "step": 6407 + }, + { + "epoch": 0.29834485648439135, + "grad_norm": 0.6398380870030167, + "learning_rate": 9.944134078212291e-05, + "loss": 3.6541, + "step": 6408 + }, + { + "epoch": 0.2983914146704844, + "grad_norm": 0.6115185443017027, + "learning_rate": 9.945685909373061e-05, + "loss": 3.6501, + "step": 6409 + }, + { + "epoch": 0.2984379728565775, + "grad_norm": 0.568336343860918, + "learning_rate": 9.94723774053383e-05, + "loss": 3.7108, + "step": 6410 + }, + { + "epoch": 0.2984845310426706, + "grad_norm": 0.518986385972094, + "learning_rate": 9.9487895716946e-05, + "loss": 3.5457, + "step": 6411 + }, + { + "epoch": 0.29853108922876365, + "grad_norm": 0.48631918055328055, + "learning_rate": 9.95034140285537e-05, + "loss": 3.6257, + "step": 6412 + }, + { + "epoch": 0.2985776474148567, + "grad_norm": 0.5276906987450937, + "learning_rate": 9.951893234016139e-05, + "loss": 3.5729, + "step": 6413 + }, + { + "epoch": 0.29862420560094977, + "grad_norm": 0.5276361924824828, + "learning_rate": 9.95344506517691e-05, + "loss": 3.6421, + "step": 6414 + }, + { + "epoch": 0.2986707637870429, + "grad_norm": 0.6206735503120867, + "learning_rate": 9.95499689633768e-05, + "loss": 3.5805, + "step": 6415 + }, + { + "epoch": 0.29871732197313594, + "grad_norm": 0.5883877256487686, + "learning_rate": 9.956548727498448e-05, + "loss": 3.6989, + "step": 6416 + }, + { + "epoch": 0.298763880159229, + "grad_norm": 0.647845594499638, + "learning_rate": 9.958100558659218e-05, + "loss": 3.6054, + "step": 6417 + }, + { + "epoch": 0.29881043834532206, + "grad_norm": 0.5458969612601039, + "learning_rate": 9.959652389819987e-05, + "loss": 3.6358, + "step": 6418 + }, + { + "epoch": 0.2988569965314151, + "grad_norm": 0.4749321123000314, + "learning_rate": 9.961204220980757e-05, + "loss": 3.5629, + "step": 6419 + }, + { + "epoch": 0.2989035547175082, + "grad_norm": 0.4487151823055027, + "learning_rate": 9.962756052141528e-05, + "loss": 3.5349, + "step": 6420 + }, + { + "epoch": 0.2989501129036013, + "grad_norm": 0.522419001987389, + "learning_rate": 9.964307883302298e-05, + "loss": 3.5766, + "step": 6421 + }, + { + "epoch": 0.29899667108969435, + "grad_norm": 0.5105503840318011, + "learning_rate": 9.965859714463067e-05, + "loss": 3.6418, + "step": 6422 + }, + { + "epoch": 0.2990432292757874, + "grad_norm": 0.46242188660823863, + "learning_rate": 9.967411545623837e-05, + "loss": 3.6652, + "step": 6423 + }, + { + "epoch": 0.2990897874618805, + "grad_norm": 0.4631135169229558, + "learning_rate": 9.968963376784605e-05, + "loss": 3.6313, + "step": 6424 + }, + { + "epoch": 0.29913634564797353, + "grad_norm": 0.4880104845371607, + "learning_rate": 9.970515207945376e-05, + "loss": 
3.6048, + "step": 6425 + }, + { + "epoch": 0.29918290383406665, + "grad_norm": 0.5227045103180774, + "learning_rate": 9.972067039106146e-05, + "loss": 3.6502, + "step": 6426 + }, + { + "epoch": 0.2992294620201597, + "grad_norm": 0.5435997444242603, + "learning_rate": 9.973618870266915e-05, + "loss": 3.5682, + "step": 6427 + }, + { + "epoch": 0.29927602020625277, + "grad_norm": 0.4772563484162545, + "learning_rate": 9.975170701427685e-05, + "loss": 3.6319, + "step": 6428 + }, + { + "epoch": 0.29932257839234583, + "grad_norm": 0.5187876974313794, + "learning_rate": 9.976722532588455e-05, + "loss": 3.6079, + "step": 6429 + }, + { + "epoch": 0.2993691365784389, + "grad_norm": 0.6213565612493711, + "learning_rate": 9.978274363749224e-05, + "loss": 3.5262, + "step": 6430 + }, + { + "epoch": 0.29941569476453195, + "grad_norm": 0.6208464076884463, + "learning_rate": 9.979826194909995e-05, + "loss": 3.6255, + "step": 6431 + }, + { + "epoch": 0.29946225295062506, + "grad_norm": 0.6212187876136483, + "learning_rate": 9.981378026070764e-05, + "loss": 3.621, + "step": 6432 + }, + { + "epoch": 0.2995088111367181, + "grad_norm": 0.6144594137992263, + "learning_rate": 9.982929857231533e-05, + "loss": 3.6213, + "step": 6433 + }, + { + "epoch": 0.2995553693228112, + "grad_norm": 0.6382413295885807, + "learning_rate": 9.984481688392303e-05, + "loss": 3.6862, + "step": 6434 + }, + { + "epoch": 0.29960192750890424, + "grad_norm": 0.6333443638230304, + "learning_rate": 9.986033519553072e-05, + "loss": 3.6504, + "step": 6435 + }, + { + "epoch": 0.2996484856949973, + "grad_norm": 0.6554852827027693, + "learning_rate": 9.987585350713843e-05, + "loss": 3.5991, + "step": 6436 + }, + { + "epoch": 0.2996950438810904, + "grad_norm": 0.6182794898010102, + "learning_rate": 9.989137181874613e-05, + "loss": 3.6726, + "step": 6437 + }, + { + "epoch": 0.2997416020671835, + "grad_norm": 0.6397451091742826, + "learning_rate": 9.990689013035383e-05, + "loss": 3.6306, + "step": 6438 + }, + { + "epoch": 0.29978816025327654, + "grad_norm": 0.6207931624946555, + "learning_rate": 9.992240844196152e-05, + "loss": 3.701, + "step": 6439 + }, + { + "epoch": 0.2998347184393696, + "grad_norm": 0.6420480045258432, + "learning_rate": 9.99379267535692e-05, + "loss": 3.6119, + "step": 6440 + }, + { + "epoch": 0.29988127662546266, + "grad_norm": 0.5425263539997744, + "learning_rate": 9.995344506517692e-05, + "loss": 3.5774, + "step": 6441 + }, + { + "epoch": 0.2999278348115557, + "grad_norm": 0.5673070931187352, + "learning_rate": 9.996896337678461e-05, + "loss": 3.6446, + "step": 6442 + }, + { + "epoch": 0.29997439299764883, + "grad_norm": 0.5676434863237904, + "learning_rate": 9.998448168839231e-05, + "loss": 3.5529, + "step": 6443 + }, + { + "epoch": 0.3000209511837419, + "grad_norm": 0.5637059020648388, + "learning_rate": 0.0001, + "loss": 3.6578, + "step": 6444 + }, + { + "epoch": 0.30006750936983495, + "grad_norm": 0.5197471445989889, + "learning_rate": 9.999999992662748e-05, + "loss": 3.6027, + "step": 6445 + }, + { + "epoch": 0.300114067555928, + "grad_norm": 0.6184994821567277, + "learning_rate": 9.999999970650986e-05, + "loss": 3.4349, + "step": 6446 + }, + { + "epoch": 0.30016062574202107, + "grad_norm": 0.6039759097435385, + "learning_rate": 9.999999933964717e-05, + "loss": 3.581, + "step": 6447 + }, + { + "epoch": 0.3002071839281142, + "grad_norm": 0.5519296246574298, + "learning_rate": 9.99999988260394e-05, + "loss": 3.6533, + "step": 6448 + }, + { + "epoch": 0.30025374211420724, + "grad_norm": 0.510112724184124, + 
"learning_rate": 9.99999981656866e-05, + "loss": 3.514, + "step": 6449 + }, + { + "epoch": 0.3003003003003003, + "grad_norm": 0.5093171651740079, + "learning_rate": 9.999999735858869e-05, + "loss": 3.552, + "step": 6450 + }, + { + "epoch": 0.30034685848639336, + "grad_norm": 0.7258138579600901, + "learning_rate": 9.999999640474573e-05, + "loss": 3.6342, + "step": 6451 + }, + { + "epoch": 0.3003934166724864, + "grad_norm": 0.6991321478746971, + "learning_rate": 9.99999953041577e-05, + "loss": 3.6597, + "step": 6452 + }, + { + "epoch": 0.3004399748585795, + "grad_norm": 0.5204287379929292, + "learning_rate": 9.999999405682461e-05, + "loss": 3.689, + "step": 6453 + }, + { + "epoch": 0.3004865330446726, + "grad_norm": 0.5667399239518132, + "learning_rate": 9.999999266274647e-05, + "loss": 3.5985, + "step": 6454 + }, + { + "epoch": 0.30053309123076566, + "grad_norm": 0.597794347496735, + "learning_rate": 9.999999112192328e-05, + "loss": 3.5401, + "step": 6455 + }, + { + "epoch": 0.3005796494168587, + "grad_norm": 0.5289110899866896, + "learning_rate": 9.999998943435503e-05, + "loss": 3.5849, + "step": 6456 + }, + { + "epoch": 0.3006262076029518, + "grad_norm": 0.47080841955613534, + "learning_rate": 9.999998760004175e-05, + "loss": 3.4462, + "step": 6457 + }, + { + "epoch": 0.30067276578904484, + "grad_norm": 0.4763145776195962, + "learning_rate": 9.999998561898343e-05, + "loss": 3.5575, + "step": 6458 + }, + { + "epoch": 0.30071932397513795, + "grad_norm": 0.4622388814534723, + "learning_rate": 9.999998349118007e-05, + "loss": 3.573, + "step": 6459 + }, + { + "epoch": 0.300765882161231, + "grad_norm": 0.529723597429195, + "learning_rate": 9.999998121663167e-05, + "loss": 3.5886, + "step": 6460 + }, + { + "epoch": 0.3008124403473241, + "grad_norm": 0.5327988271376708, + "learning_rate": 9.999997879533828e-05, + "loss": 3.5786, + "step": 6461 + }, + { + "epoch": 0.30085899853341713, + "grad_norm": 0.6046128323482091, + "learning_rate": 9.999997622729986e-05, + "loss": 3.6507, + "step": 6462 + }, + { + "epoch": 0.3009055567195102, + "grad_norm": 0.6461229592167085, + "learning_rate": 9.999997351251644e-05, + "loss": 3.6364, + "step": 6463 + }, + { + "epoch": 0.30095211490560325, + "grad_norm": 0.5673103809992499, + "learning_rate": 9.999997065098803e-05, + "loss": 3.5162, + "step": 6464 + }, + { + "epoch": 0.30099867309169637, + "grad_norm": 0.46104149035579667, + "learning_rate": 9.999996764271463e-05, + "loss": 3.587, + "step": 6465 + }, + { + "epoch": 0.3010452312777894, + "grad_norm": 0.5723764499377736, + "learning_rate": 9.999996448769624e-05, + "loss": 3.5946, + "step": 6466 + }, + { + "epoch": 0.3010917894638825, + "grad_norm": 0.5527363426774685, + "learning_rate": 9.999996118593289e-05, + "loss": 3.586, + "step": 6467 + }, + { + "epoch": 0.30113834764997555, + "grad_norm": 0.5220807191345427, + "learning_rate": 9.99999577374246e-05, + "loss": 3.6029, + "step": 6468 + }, + { + "epoch": 0.3011849058360686, + "grad_norm": 0.5314927118610714, + "learning_rate": 9.999995414217132e-05, + "loss": 3.5262, + "step": 6469 + }, + { + "epoch": 0.3012314640221617, + "grad_norm": 0.5296709502258705, + "learning_rate": 9.999995040017311e-05, + "loss": 3.5954, + "step": 6470 + }, + { + "epoch": 0.3012780222082548, + "grad_norm": 0.5953946590535187, + "learning_rate": 9.999994651142999e-05, + "loss": 3.5146, + "step": 6471 + }, + { + "epoch": 0.30132458039434784, + "grad_norm": 0.573839945478337, + "learning_rate": 9.999994247594194e-05, + "loss": 3.5971, + "step": 6472 + }, + { + "epoch": 
0.3013711385804409, + "grad_norm": 0.589145636937745, + "learning_rate": 9.9999938293709e-05, + "loss": 3.5847, + "step": 6473 + }, + { + "epoch": 0.30141769676653396, + "grad_norm": 0.5303283107988311, + "learning_rate": 9.999993396473114e-05, + "loss": 3.508, + "step": 6474 + }, + { + "epoch": 0.301464254952627, + "grad_norm": 0.472754639810458, + "learning_rate": 9.999992948900841e-05, + "loss": 3.5474, + "step": 6475 + }, + { + "epoch": 0.30151081313872014, + "grad_norm": 0.5022792428216316, + "learning_rate": 9.999992486654082e-05, + "loss": 3.4604, + "step": 6476 + }, + { + "epoch": 0.3015573713248132, + "grad_norm": 0.5848431178234607, + "learning_rate": 9.999992009732837e-05, + "loss": 3.6279, + "step": 6477 + }, + { + "epoch": 0.30160392951090625, + "grad_norm": 0.48260808309013675, + "learning_rate": 9.999991518137108e-05, + "loss": 3.5653, + "step": 6478 + }, + { + "epoch": 0.3016504876969993, + "grad_norm": 0.4809716691527688, + "learning_rate": 9.999991011866899e-05, + "loss": 3.4655, + "step": 6479 + }, + { + "epoch": 0.3016970458830924, + "grad_norm": 0.5377551554091105, + "learning_rate": 9.999990490922205e-05, + "loss": 3.5436, + "step": 6480 + }, + { + "epoch": 0.3017436040691855, + "grad_norm": 0.5317647704122909, + "learning_rate": 9.999989955303033e-05, + "loss": 3.6686, + "step": 6481 + }, + { + "epoch": 0.30179016225527855, + "grad_norm": 0.5139667449603007, + "learning_rate": 9.999989405009385e-05, + "loss": 3.5504, + "step": 6482 + }, + { + "epoch": 0.3018367204413716, + "grad_norm": 0.5199984005067279, + "learning_rate": 9.999988840041258e-05, + "loss": 3.5742, + "step": 6483 + }, + { + "epoch": 0.30188327862746467, + "grad_norm": 0.44356492460173635, + "learning_rate": 9.999988260398658e-05, + "loss": 3.589, + "step": 6484 + }, + { + "epoch": 0.30192983681355773, + "grad_norm": 0.5055919996887829, + "learning_rate": 9.999987666081583e-05, + "loss": 3.5581, + "step": 6485 + }, + { + "epoch": 0.3019763949996508, + "grad_norm": 0.5954399671231386, + "learning_rate": 9.999987057090039e-05, + "loss": 3.5437, + "step": 6486 + }, + { + "epoch": 0.3020229531857439, + "grad_norm": 0.6607568886561095, + "learning_rate": 9.999986433424025e-05, + "loss": 3.5589, + "step": 6487 + }, + { + "epoch": 0.30206951137183696, + "grad_norm": 0.5121591942281508, + "learning_rate": 9.999985795083542e-05, + "loss": 3.664, + "step": 6488 + }, + { + "epoch": 0.30211606955793, + "grad_norm": 0.4953958470141625, + "learning_rate": 9.999985142068595e-05, + "loss": 3.5081, + "step": 6489 + }, + { + "epoch": 0.3021626277440231, + "grad_norm": 0.5860568675306056, + "learning_rate": 9.999984474379183e-05, + "loss": 3.6074, + "step": 6490 + }, + { + "epoch": 0.30220918593011614, + "grad_norm": 0.6053419360880763, + "learning_rate": 9.99998379201531e-05, + "loss": 3.5826, + "step": 6491 + }, + { + "epoch": 0.30225574411620926, + "grad_norm": 0.5157278236291771, + "learning_rate": 9.999983094976977e-05, + "loss": 3.4653, + "step": 6492 + }, + { + "epoch": 0.3023023023023023, + "grad_norm": 0.49765637168011234, + "learning_rate": 9.999982383264186e-05, + "loss": 3.5601, + "step": 6493 + }, + { + "epoch": 0.3023488604883954, + "grad_norm": 0.5939041737885813, + "learning_rate": 9.999981656876939e-05, + "loss": 3.4939, + "step": 6494 + }, + { + "epoch": 0.30239541867448844, + "grad_norm": 0.4969563975924455, + "learning_rate": 9.999980915815239e-05, + "loss": 3.6427, + "step": 6495 + }, + { + "epoch": 0.3024419768605815, + "grad_norm": 0.5360918387760771, + "learning_rate": 9.999980160079087e-05, + 
"loss": 3.649, + "step": 6496 + }, + { + "epoch": 0.30248853504667456, + "grad_norm": 0.4994397656975131, + "learning_rate": 9.999979389668486e-05, + "loss": 3.718, + "step": 6497 + }, + { + "epoch": 0.30253509323276767, + "grad_norm": 0.5501971882972306, + "learning_rate": 9.999978604583438e-05, + "loss": 3.6268, + "step": 6498 + }, + { + "epoch": 0.30258165141886073, + "grad_norm": 0.5464732309458564, + "learning_rate": 9.999977804823946e-05, + "loss": 3.5859, + "step": 6499 + }, + { + "epoch": 0.3026282096049538, + "grad_norm": 0.5558503818194884, + "learning_rate": 9.999976990390012e-05, + "loss": 3.5318, + "step": 6500 + }, + { + "epoch": 0.30267476779104685, + "grad_norm": 0.5587025315696281, + "learning_rate": 9.999976161281637e-05, + "loss": 3.6107, + "step": 6501 + }, + { + "epoch": 0.3027213259771399, + "grad_norm": 0.5202774302988589, + "learning_rate": 9.999975317498824e-05, + "loss": 3.4749, + "step": 6502 + }, + { + "epoch": 0.302767884163233, + "grad_norm": 0.49795127995594846, + "learning_rate": 9.999974459041578e-05, + "loss": 3.6095, + "step": 6503 + }, + { + "epoch": 0.3028144423493261, + "grad_norm": 0.5804728220801756, + "learning_rate": 9.999973585909899e-05, + "loss": 3.5884, + "step": 6504 + }, + { + "epoch": 0.30286100053541914, + "grad_norm": 0.6157035099248331, + "learning_rate": 9.999972698103788e-05, + "loss": 3.5535, + "step": 6505 + }, + { + "epoch": 0.3029075587215122, + "grad_norm": 0.5563943443626862, + "learning_rate": 9.999971795623253e-05, + "loss": 3.6187, + "step": 6506 + }, + { + "epoch": 0.30295411690760526, + "grad_norm": 0.5877412085422691, + "learning_rate": 9.999970878468291e-05, + "loss": 3.5968, + "step": 6507 + }, + { + "epoch": 0.3030006750936983, + "grad_norm": 0.6367190548696005, + "learning_rate": 9.999969946638907e-05, + "loss": 3.6408, + "step": 6508 + }, + { + "epoch": 0.30304723327979144, + "grad_norm": 0.5854522836114372, + "learning_rate": 9.999969000135105e-05, + "loss": 3.5148, + "step": 6509 + }, + { + "epoch": 0.3030937914658845, + "grad_norm": 0.6377502925519004, + "learning_rate": 9.999968038956887e-05, + "loss": 3.626, + "step": 6510 + }, + { + "epoch": 0.30314034965197756, + "grad_norm": 0.69315230767395, + "learning_rate": 9.999967063104254e-05, + "loss": 3.6497, + "step": 6511 + }, + { + "epoch": 0.3031869078380706, + "grad_norm": 0.6773161293150711, + "learning_rate": 9.99996607257721e-05, + "loss": 3.7143, + "step": 6512 + }, + { + "epoch": 0.3032334660241637, + "grad_norm": 0.6473170499059651, + "learning_rate": 9.999965067375759e-05, + "loss": 3.6226, + "step": 6513 + }, + { + "epoch": 0.3032800242102568, + "grad_norm": 0.5196461474849494, + "learning_rate": 9.999964047499903e-05, + "loss": 3.6036, + "step": 6514 + }, + { + "epoch": 0.30332658239634985, + "grad_norm": 0.579656438362843, + "learning_rate": 9.999963012949646e-05, + "loss": 3.5686, + "step": 6515 + }, + { + "epoch": 0.3033731405824429, + "grad_norm": 0.5350527952467974, + "learning_rate": 9.999961963724989e-05, + "loss": 3.4635, + "step": 6516 + }, + { + "epoch": 0.303419698768536, + "grad_norm": 0.4682264606526989, + "learning_rate": 9.999960899825936e-05, + "loss": 3.482, + "step": 6517 + }, + { + "epoch": 0.30346625695462903, + "grad_norm": 0.49929763207387734, + "learning_rate": 9.999959821252492e-05, + "loss": 3.58, + "step": 6518 + }, + { + "epoch": 0.3035128151407221, + "grad_norm": 0.4891474932266641, + "learning_rate": 9.999958728004656e-05, + "loss": 3.4994, + "step": 6519 + }, + { + "epoch": 0.3035593733268152, + "grad_norm": 
0.46267095587272095, + "learning_rate": 9.999957620082435e-05, + "loss": 3.4599, + "step": 6520 + }, + { + "epoch": 0.30360593151290827, + "grad_norm": 0.45313825136572233, + "learning_rate": 9.999956497485832e-05, + "loss": 3.5303, + "step": 6521 + }, + { + "epoch": 0.3036524896990013, + "grad_norm": 0.4584975881937224, + "learning_rate": 9.999955360214847e-05, + "loss": 3.4228, + "step": 6522 + }, + { + "epoch": 0.3036990478850944, + "grad_norm": 0.4357287890613321, + "learning_rate": 9.999954208269487e-05, + "loss": 3.5773, + "step": 6523 + }, + { + "epoch": 0.30374560607118745, + "grad_norm": 0.44543464794148546, + "learning_rate": 9.999953041649755e-05, + "loss": 3.5419, + "step": 6524 + }, + { + "epoch": 0.30379216425728056, + "grad_norm": 0.45076194133758796, + "learning_rate": 9.999951860355652e-05, + "loss": 3.5539, + "step": 6525 + }, + { + "epoch": 0.3038387224433736, + "grad_norm": 0.4474078323837452, + "learning_rate": 9.999950664387182e-05, + "loss": 3.5174, + "step": 6526 + }, + { + "epoch": 0.3038852806294667, + "grad_norm": 0.48013641431622406, + "learning_rate": 9.999949453744351e-05, + "loss": 3.5711, + "step": 6527 + }, + { + "epoch": 0.30393183881555974, + "grad_norm": 0.4585421970707718, + "learning_rate": 9.99994822842716e-05, + "loss": 3.6155, + "step": 6528 + }, + { + "epoch": 0.3039783970016528, + "grad_norm": 0.5037591535665462, + "learning_rate": 9.999946988435613e-05, + "loss": 3.5207, + "step": 6529 + }, + { + "epoch": 0.30402495518774586, + "grad_norm": 0.5136022491149048, + "learning_rate": 9.999945733769716e-05, + "loss": 3.6399, + "step": 6530 + }, + { + "epoch": 0.304071513373839, + "grad_norm": 0.477200901271168, + "learning_rate": 9.99994446442947e-05, + "loss": 3.5522, + "step": 6531 + }, + { + "epoch": 0.30411807155993203, + "grad_norm": 0.46503545040606115, + "learning_rate": 9.99994318041488e-05, + "loss": 3.6545, + "step": 6532 + }, + { + "epoch": 0.3041646297460251, + "grad_norm": 0.5067323211639955, + "learning_rate": 9.999941881725948e-05, + "loss": 3.5611, + "step": 6533 + }, + { + "epoch": 0.30421118793211815, + "grad_norm": 0.4805108183177251, + "learning_rate": 9.99994056836268e-05, + "loss": 3.5534, + "step": 6534 + }, + { + "epoch": 0.3042577461182112, + "grad_norm": 0.5159562608721066, + "learning_rate": 9.999939240325079e-05, + "loss": 3.5389, + "step": 6535 + }, + { + "epoch": 0.30430430430430433, + "grad_norm": 0.7026032336569714, + "learning_rate": 9.99993789761315e-05, + "loss": 3.5559, + "step": 6536 + }, + { + "epoch": 0.3043508624903974, + "grad_norm": 0.7068842303797737, + "learning_rate": 9.999936540226894e-05, + "loss": 3.629, + "step": 6537 + }, + { + "epoch": 0.30439742067649045, + "grad_norm": 0.6445599063446731, + "learning_rate": 9.999935168166316e-05, + "loss": 3.5561, + "step": 6538 + }, + { + "epoch": 0.3044439788625835, + "grad_norm": 0.5795318727762132, + "learning_rate": 9.999933781431423e-05, + "loss": 3.5795, + "step": 6539 + }, + { + "epoch": 0.30449053704867657, + "grad_norm": 0.5685694655384426, + "learning_rate": 9.999932380022217e-05, + "loss": 3.6074, + "step": 6540 + }, + { + "epoch": 0.30453709523476963, + "grad_norm": 0.6355933940948741, + "learning_rate": 9.999930963938702e-05, + "loss": 3.5143, + "step": 6541 + }, + { + "epoch": 0.30458365342086274, + "grad_norm": 0.629549852243403, + "learning_rate": 9.99992953318088e-05, + "loss": 3.6395, + "step": 6542 + }, + { + "epoch": 0.3046302116069558, + "grad_norm": 0.5227306463902888, + "learning_rate": 9.99992808774876e-05, + "loss": 3.6674, + "step": 6543 + 
}, + { + "epoch": 0.30467676979304886, + "grad_norm": 0.5358281882306563, + "learning_rate": 9.999926627642343e-05, + "loss": 3.6324, + "step": 6544 + }, + { + "epoch": 0.3047233279791419, + "grad_norm": 0.5695107022562372, + "learning_rate": 9.999925152861634e-05, + "loss": 3.5989, + "step": 6545 + }, + { + "epoch": 0.304769886165235, + "grad_norm": 0.5613081371787648, + "learning_rate": 9.999923663406636e-05, + "loss": 3.636, + "step": 6546 + }, + { + "epoch": 0.3048164443513281, + "grad_norm": 0.4831052113108126, + "learning_rate": 9.999922159277355e-05, + "loss": 3.5998, + "step": 6547 + }, + { + "epoch": 0.30486300253742116, + "grad_norm": 0.6045852582972759, + "learning_rate": 9.999920640473796e-05, + "loss": 3.5831, + "step": 6548 + }, + { + "epoch": 0.3049095607235142, + "grad_norm": 0.52703530133698, + "learning_rate": 9.999919106995962e-05, + "loss": 3.5217, + "step": 6549 + }, + { + "epoch": 0.3049561189096073, + "grad_norm": 0.4313795207385867, + "learning_rate": 9.999917558843858e-05, + "loss": 3.5388, + "step": 6550 + }, + { + "epoch": 0.30500267709570034, + "grad_norm": 0.5529800977911153, + "learning_rate": 9.999915996017488e-05, + "loss": 3.6315, + "step": 6551 + }, + { + "epoch": 0.3050492352817934, + "grad_norm": 0.5494689589690519, + "learning_rate": 9.999914418516858e-05, + "loss": 3.5493, + "step": 6552 + }, + { + "epoch": 0.3050957934678865, + "grad_norm": 0.44325642924214276, + "learning_rate": 9.99991282634197e-05, + "loss": 3.5227, + "step": 6553 + }, + { + "epoch": 0.30514235165397957, + "grad_norm": 0.5406833964420487, + "learning_rate": 9.999911219492832e-05, + "loss": 3.6217, + "step": 6554 + }, + { + "epoch": 0.30518890984007263, + "grad_norm": 0.6651328551778308, + "learning_rate": 9.999909597969448e-05, + "loss": 3.6084, + "step": 6555 + }, + { + "epoch": 0.3052354680261657, + "grad_norm": 0.5760669694128344, + "learning_rate": 9.999907961771821e-05, + "loss": 3.414, + "step": 6556 + }, + { + "epoch": 0.30528202621225875, + "grad_norm": 0.5391034642404726, + "learning_rate": 9.999906310899956e-05, + "loss": 3.6335, + "step": 6557 + }, + { + "epoch": 0.30532858439835187, + "grad_norm": 0.6057599678094492, + "learning_rate": 9.99990464535386e-05, + "loss": 3.6031, + "step": 6558 + }, + { + "epoch": 0.3053751425844449, + "grad_norm": 0.6534624004643188, + "learning_rate": 9.999902965133536e-05, + "loss": 3.3348, + "step": 6559 + }, + { + "epoch": 0.305421700770538, + "grad_norm": 0.6251277273096079, + "learning_rate": 9.999901270238987e-05, + "loss": 3.5528, + "step": 6560 + }, + { + "epoch": 0.30546825895663104, + "grad_norm": 0.5604447004015185, + "learning_rate": 9.999899560670222e-05, + "loss": 3.6018, + "step": 6561 + }, + { + "epoch": 0.3055148171427241, + "grad_norm": 0.6775534366725346, + "learning_rate": 9.999897836427245e-05, + "loss": 3.7019, + "step": 6562 + }, + { + "epoch": 0.30556137532881716, + "grad_norm": 0.6570366683060204, + "learning_rate": 9.99989609751006e-05, + "loss": 3.4884, + "step": 6563 + }, + { + "epoch": 0.3056079335149103, + "grad_norm": 0.5750927648217788, + "learning_rate": 9.999894343918673e-05, + "loss": 3.5132, + "step": 6564 + }, + { + "epoch": 0.30565449170100334, + "grad_norm": 0.5785588544544581, + "learning_rate": 9.99989257565309e-05, + "loss": 3.6519, + "step": 6565 + }, + { + "epoch": 0.3057010498870964, + "grad_norm": 0.6801555477573911, + "learning_rate": 9.999890792713311e-05, + "loss": 3.665, + "step": 6566 + }, + { + "epoch": 0.30574760807318946, + "grad_norm": 0.549821707932245, + "learning_rate": 
9.999888995099347e-05, + "loss": 3.6454, + "step": 6567 + }, + { + "epoch": 0.3057941662592825, + "grad_norm": 0.5408753969813327, + "learning_rate": 9.999887182811202e-05, + "loss": 3.4675, + "step": 6568 + }, + { + "epoch": 0.30584072444537563, + "grad_norm": 0.583425284396813, + "learning_rate": 9.999885355848881e-05, + "loss": 3.5922, + "step": 6569 + }, + { + "epoch": 0.3058872826314687, + "grad_norm": 0.5917211136884941, + "learning_rate": 9.999883514212387e-05, + "loss": 3.496, + "step": 6570 + }, + { + "epoch": 0.30593384081756175, + "grad_norm": 0.5771436243703132, + "learning_rate": 9.99988165790173e-05, + "loss": 3.5309, + "step": 6571 + }, + { + "epoch": 0.3059803990036548, + "grad_norm": 0.5567775993844579, + "learning_rate": 9.999879786916912e-05, + "loss": 3.4547, + "step": 6572 + }, + { + "epoch": 0.3060269571897479, + "grad_norm": 0.6316611153479094, + "learning_rate": 9.999877901257938e-05, + "loss": 3.5359, + "step": 6573 + }, + { + "epoch": 0.30607351537584093, + "grad_norm": 0.5635512645090922, + "learning_rate": 9.999876000924816e-05, + "loss": 3.4841, + "step": 6574 + }, + { + "epoch": 0.30612007356193405, + "grad_norm": 0.543303565930203, + "learning_rate": 9.999874085917551e-05, + "loss": 3.5502, + "step": 6575 + }, + { + "epoch": 0.3061666317480271, + "grad_norm": 0.5639295877916009, + "learning_rate": 9.999872156236147e-05, + "loss": 3.506, + "step": 6576 + }, + { + "epoch": 0.30621318993412017, + "grad_norm": 0.5552713514003982, + "learning_rate": 9.99987021188061e-05, + "loss": 3.4697, + "step": 6577 + }, + { + "epoch": 0.3062597481202132, + "grad_norm": 0.5428415607036032, + "learning_rate": 9.99986825285095e-05, + "loss": 3.5862, + "step": 6578 + }, + { + "epoch": 0.3063063063063063, + "grad_norm": 0.520415082328469, + "learning_rate": 9.999866279147166e-05, + "loss": 3.5369, + "step": 6579 + }, + { + "epoch": 0.3063528644923994, + "grad_norm": 0.4913959052511726, + "learning_rate": 9.99986429076927e-05, + "loss": 3.6727, + "step": 6580 + }, + { + "epoch": 0.30639942267849246, + "grad_norm": 0.5417727007835097, + "learning_rate": 9.999862287717262e-05, + "loss": 3.4264, + "step": 6581 + }, + { + "epoch": 0.3064459808645855, + "grad_norm": 0.47042411823466007, + "learning_rate": 9.999860269991153e-05, + "loss": 3.5038, + "step": 6582 + }, + { + "epoch": 0.3064925390506786, + "grad_norm": 0.47007481368727894, + "learning_rate": 9.999858237590945e-05, + "loss": 3.5084, + "step": 6583 + }, + { + "epoch": 0.30653909723677164, + "grad_norm": 0.47973857726957253, + "learning_rate": 9.999856190516645e-05, + "loss": 3.4197, + "step": 6584 + }, + { + "epoch": 0.3065856554228647, + "grad_norm": 0.4146073810141982, + "learning_rate": 9.999854128768262e-05, + "loss": 3.4913, + "step": 6585 + }, + { + "epoch": 0.3066322136089578, + "grad_norm": 0.48447658811041955, + "learning_rate": 9.999852052345797e-05, + "loss": 3.5542, + "step": 6586 + }, + { + "epoch": 0.3066787717950509, + "grad_norm": 0.5024902758044724, + "learning_rate": 9.999849961249261e-05, + "loss": 3.519, + "step": 6587 + }, + { + "epoch": 0.30672532998114393, + "grad_norm": 0.6347066142757963, + "learning_rate": 9.999847855478656e-05, + "loss": 3.6577, + "step": 6588 + }, + { + "epoch": 0.306771888167237, + "grad_norm": 0.5017273027471764, + "learning_rate": 9.999845735033992e-05, + "loss": 3.5077, + "step": 6589 + }, + { + "epoch": 0.30681844635333005, + "grad_norm": 0.543952441685907, + "learning_rate": 9.999843599915273e-05, + "loss": 3.5416, + "step": 6590 + }, + { + "epoch": 0.30686500453942317, + 
"grad_norm": 0.5990067191830625, + "learning_rate": 9.999841450122505e-05, + "loss": 3.64, + "step": 6591 + }, + { + "epoch": 0.30691156272551623, + "grad_norm": 0.4655954268547387, + "learning_rate": 9.999839285655695e-05, + "loss": 3.6578, + "step": 6592 + }, + { + "epoch": 0.3069581209116093, + "grad_norm": 0.5304371472030899, + "learning_rate": 9.99983710651485e-05, + "loss": 3.5201, + "step": 6593 + }, + { + "epoch": 0.30700467909770235, + "grad_norm": 0.6155044777411842, + "learning_rate": 9.999834912699974e-05, + "loss": 3.5714, + "step": 6594 + }, + { + "epoch": 0.3070512372837954, + "grad_norm": 0.5278004620747365, + "learning_rate": 9.999832704211077e-05, + "loss": 3.6206, + "step": 6595 + }, + { + "epoch": 0.30709779546988847, + "grad_norm": 0.47069249850150147, + "learning_rate": 9.999830481048162e-05, + "loss": 3.5024, + "step": 6596 + }, + { + "epoch": 0.3071443536559816, + "grad_norm": 0.6239973513995283, + "learning_rate": 9.999828243211239e-05, + "loss": 3.6118, + "step": 6597 + }, + { + "epoch": 0.30719091184207464, + "grad_norm": 0.5350724451578606, + "learning_rate": 9.99982599070031e-05, + "loss": 3.5, + "step": 6598 + }, + { + "epoch": 0.3072374700281677, + "grad_norm": 0.5039502215072722, + "learning_rate": 9.999823723515387e-05, + "loss": 3.6107, + "step": 6599 + }, + { + "epoch": 0.30728402821426076, + "grad_norm": 0.5563859801185109, + "learning_rate": 9.999821441656473e-05, + "loss": 3.495, + "step": 6600 + }, + { + "epoch": 0.3073305864003538, + "grad_norm": 0.5159440652364854, + "learning_rate": 9.999819145123575e-05, + "loss": 3.6143, + "step": 6601 + }, + { + "epoch": 0.30737714458644694, + "grad_norm": 0.4804919456284715, + "learning_rate": 9.999816833916702e-05, + "loss": 3.5668, + "step": 6602 + }, + { + "epoch": 0.30742370277254, + "grad_norm": 0.45157802209254605, + "learning_rate": 9.999814508035856e-05, + "loss": 3.5983, + "step": 6603 + }, + { + "epoch": 0.30747026095863306, + "grad_norm": 0.5080877136065155, + "learning_rate": 9.99981216748105e-05, + "loss": 3.6541, + "step": 6604 + }, + { + "epoch": 0.3075168191447261, + "grad_norm": 0.5258883526573759, + "learning_rate": 9.999809812252287e-05, + "loss": 3.573, + "step": 6605 + }, + { + "epoch": 0.3075633773308192, + "grad_norm": 0.4841516161847843, + "learning_rate": 9.999807442349574e-05, + "loss": 3.5447, + "step": 6606 + }, + { + "epoch": 0.30760993551691224, + "grad_norm": 0.5177607356682357, + "learning_rate": 9.99980505777292e-05, + "loss": 3.561, + "step": 6607 + }, + { + "epoch": 0.30765649370300535, + "grad_norm": 0.5854093581904155, + "learning_rate": 9.99980265852233e-05, + "loss": 3.5418, + "step": 6608 + }, + { + "epoch": 0.3077030518890984, + "grad_norm": 0.5330027277500836, + "learning_rate": 9.99980024459781e-05, + "loss": 3.5814, + "step": 6609 + }, + { + "epoch": 0.30774961007519147, + "grad_norm": 0.5470374833064402, + "learning_rate": 9.999797815999372e-05, + "loss": 3.6321, + "step": 6610 + }, + { + "epoch": 0.30779616826128453, + "grad_norm": 0.5497746532538935, + "learning_rate": 9.999795372727017e-05, + "loss": 3.5704, + "step": 6611 + }, + { + "epoch": 0.3078427264473776, + "grad_norm": 0.4931369527373499, + "learning_rate": 9.999792914780757e-05, + "loss": 3.5088, + "step": 6612 + }, + { + "epoch": 0.3078892846334707, + "grad_norm": 0.5704812540102049, + "learning_rate": 9.999790442160596e-05, + "loss": 3.5396, + "step": 6613 + }, + { + "epoch": 0.30793584281956377, + "grad_norm": 0.49745502306244216, + "learning_rate": 9.999787954866545e-05, + "loss": 3.5179, + "step": 
6614 + }, + { + "epoch": 0.3079824010056568, + "grad_norm": 0.49066437839646165, + "learning_rate": 9.999785452898608e-05, + "loss": 3.5711, + "step": 6615 + }, + { + "epoch": 0.3080289591917499, + "grad_norm": 0.5727036796781751, + "learning_rate": 9.999782936256792e-05, + "loss": 3.574, + "step": 6616 + }, + { + "epoch": 0.30807551737784294, + "grad_norm": 0.5603044280688633, + "learning_rate": 9.999780404941107e-05, + "loss": 3.6383, + "step": 6617 + }, + { + "epoch": 0.308122075563936, + "grad_norm": 0.5274836081763314, + "learning_rate": 9.999777858951558e-05, + "loss": 3.5216, + "step": 6618 + }, + { + "epoch": 0.3081686337500291, + "grad_norm": 0.5048139071770948, + "learning_rate": 9.999775298288154e-05, + "loss": 3.59, + "step": 6619 + }, + { + "epoch": 0.3082151919361222, + "grad_norm": 0.5649500345748466, + "learning_rate": 9.999772722950902e-05, + "loss": 3.5139, + "step": 6620 + }, + { + "epoch": 0.30826175012221524, + "grad_norm": 0.5300635855793372, + "learning_rate": 9.999770132939809e-05, + "loss": 3.6527, + "step": 6621 + }, + { + "epoch": 0.3083083083083083, + "grad_norm": 0.49258417921463127, + "learning_rate": 9.999767528254886e-05, + "loss": 3.6829, + "step": 6622 + }, + { + "epoch": 0.30835486649440136, + "grad_norm": 0.5650936469882345, + "learning_rate": 9.999764908896136e-05, + "loss": 3.6073, + "step": 6623 + }, + { + "epoch": 0.3084014246804945, + "grad_norm": 0.5047018150878797, + "learning_rate": 9.999762274863567e-05, + "loss": 3.5481, + "step": 6624 + }, + { + "epoch": 0.30844798286658753, + "grad_norm": 0.5687471898917594, + "learning_rate": 9.999759626157191e-05, + "loss": 3.5341, + "step": 6625 + }, + { + "epoch": 0.3084945410526806, + "grad_norm": 0.5496448086246054, + "learning_rate": 9.999756962777012e-05, + "loss": 3.5337, + "step": 6626 + }, + { + "epoch": 0.30854109923877365, + "grad_norm": 0.5683650117504019, + "learning_rate": 9.999754284723037e-05, + "loss": 3.6295, + "step": 6627 + }, + { + "epoch": 0.3085876574248667, + "grad_norm": 0.49045635844831176, + "learning_rate": 9.999751591995278e-05, + "loss": 3.5496, + "step": 6628 + }, + { + "epoch": 0.3086342156109598, + "grad_norm": 0.5177977791992444, + "learning_rate": 9.99974888459374e-05, + "loss": 3.5174, + "step": 6629 + }, + { + "epoch": 0.3086807737970529, + "grad_norm": 0.518201336141404, + "learning_rate": 9.999746162518431e-05, + "loss": 3.6399, + "step": 6630 + }, + { + "epoch": 0.30872733198314595, + "grad_norm": 0.5346289734636771, + "learning_rate": 9.99974342576936e-05, + "loss": 3.5334, + "step": 6631 + }, + { + "epoch": 0.308773890169239, + "grad_norm": 0.44955892392831676, + "learning_rate": 9.999740674346535e-05, + "loss": 3.5675, + "step": 6632 + }, + { + "epoch": 0.30882044835533207, + "grad_norm": 0.5382461988101366, + "learning_rate": 9.999737908249963e-05, + "loss": 3.6189, + "step": 6633 + }, + { + "epoch": 0.3088670065414251, + "grad_norm": 0.5603689266070908, + "learning_rate": 9.999735127479653e-05, + "loss": 3.5103, + "step": 6634 + }, + { + "epoch": 0.30891356472751824, + "grad_norm": 0.5289977705187432, + "learning_rate": 9.999732332035614e-05, + "loss": 3.5888, + "step": 6635 + }, + { + "epoch": 0.3089601229136113, + "grad_norm": 0.42626464460047503, + "learning_rate": 9.999729521917852e-05, + "loss": 3.5869, + "step": 6636 + }, + { + "epoch": 0.30900668109970436, + "grad_norm": 0.48663423470500705, + "learning_rate": 9.999726697126377e-05, + "loss": 3.5273, + "step": 6637 + }, + { + "epoch": 0.3090532392857974, + "grad_norm": 0.4727774144793558, + 
"learning_rate": 9.999723857661197e-05, + "loss": 3.3724, + "step": 6638 + }, + { + "epoch": 0.3090997974718905, + "grad_norm": 0.45893481996207774, + "learning_rate": 9.999721003522318e-05, + "loss": 3.4707, + "step": 6639 + }, + { + "epoch": 0.30914635565798354, + "grad_norm": 0.4702199634684127, + "learning_rate": 9.999718134709752e-05, + "loss": 3.5412, + "step": 6640 + }, + { + "epoch": 0.30919291384407666, + "grad_norm": 0.5396595910610104, + "learning_rate": 9.999715251223506e-05, + "loss": 3.5161, + "step": 6641 + }, + { + "epoch": 0.3092394720301697, + "grad_norm": 0.4926825212337821, + "learning_rate": 9.999712353063589e-05, + "loss": 3.4972, + "step": 6642 + }, + { + "epoch": 0.3092860302162628, + "grad_norm": 0.4465622897945334, + "learning_rate": 9.999709440230008e-05, + "loss": 3.4621, + "step": 6643 + }, + { + "epoch": 0.30933258840235583, + "grad_norm": 0.4368202940880474, + "learning_rate": 9.99970651272277e-05, + "loss": 3.49, + "step": 6644 + }, + { + "epoch": 0.3093791465884489, + "grad_norm": 0.48261619305513237, + "learning_rate": 9.99970357054189e-05, + "loss": 3.4883, + "step": 6645 + }, + { + "epoch": 0.309425704774542, + "grad_norm": 0.43412780369132176, + "learning_rate": 9.99970061368737e-05, + "loss": 3.658, + "step": 6646 + }, + { + "epoch": 0.30947226296063507, + "grad_norm": 0.5246994831902251, + "learning_rate": 9.999697642159222e-05, + "loss": 3.5113, + "step": 6647 + }, + { + "epoch": 0.30951882114672813, + "grad_norm": 0.5079829238782821, + "learning_rate": 9.999694655957453e-05, + "loss": 3.3995, + "step": 6648 + }, + { + "epoch": 0.3095653793328212, + "grad_norm": 0.5138237063972121, + "learning_rate": 9.999691655082072e-05, + "loss": 3.5139, + "step": 6649 + }, + { + "epoch": 0.30961193751891425, + "grad_norm": 0.5348781040892516, + "learning_rate": 9.999688639533091e-05, + "loss": 3.5781, + "step": 6650 + }, + { + "epoch": 0.3096584957050073, + "grad_norm": 0.6441295423710948, + "learning_rate": 9.999685609310516e-05, + "loss": 3.5863, + "step": 6651 + }, + { + "epoch": 0.3097050538911004, + "grad_norm": 0.5693629201853879, + "learning_rate": 9.999682564414357e-05, + "loss": 3.4124, + "step": 6652 + }, + { + "epoch": 0.3097516120771935, + "grad_norm": 0.4647426992220892, + "learning_rate": 9.99967950484462e-05, + "loss": 3.5831, + "step": 6653 + }, + { + "epoch": 0.30979817026328654, + "grad_norm": 0.5594679753410144, + "learning_rate": 9.999676430601317e-05, + "loss": 3.6534, + "step": 6654 + }, + { + "epoch": 0.3098447284493796, + "grad_norm": 0.6938752686820082, + "learning_rate": 9.999673341684457e-05, + "loss": 3.596, + "step": 6655 + }, + { + "epoch": 0.30989128663547266, + "grad_norm": 0.5828754098741193, + "learning_rate": 9.999670238094047e-05, + "loss": 3.6865, + "step": 6656 + }, + { + "epoch": 0.3099378448215658, + "grad_norm": 0.4728099952929618, + "learning_rate": 9.999667119830098e-05, + "loss": 3.5727, + "step": 6657 + }, + { + "epoch": 0.30998440300765884, + "grad_norm": 0.5399635791595854, + "learning_rate": 9.99966398689262e-05, + "loss": 3.501, + "step": 6658 + }, + { + "epoch": 0.3100309611937519, + "grad_norm": 0.5870849193522159, + "learning_rate": 9.99966083928162e-05, + "loss": 3.4688, + "step": 6659 + }, + { + "epoch": 0.31007751937984496, + "grad_norm": 0.5275859528148301, + "learning_rate": 9.999657676997107e-05, + "loss": 3.6283, + "step": 6660 + }, + { + "epoch": 0.310124077565938, + "grad_norm": 0.5096367785613828, + "learning_rate": 9.999654500039091e-05, + "loss": 3.5617, + "step": 6661 + }, + { + "epoch": 
0.3101706357520311, + "grad_norm": 0.5997139297188826, + "learning_rate": 9.999651308407583e-05, + "loss": 3.6253, + "step": 6662 + }, + { + "epoch": 0.3102171939381242, + "grad_norm": 0.6440464573328679, + "learning_rate": 9.99964810210259e-05, + "loss": 3.6881, + "step": 6663 + }, + { + "epoch": 0.31026375212421725, + "grad_norm": 0.47274893563463083, + "learning_rate": 9.999644881124122e-05, + "loss": 3.5616, + "step": 6664 + }, + { + "epoch": 0.3103103103103103, + "grad_norm": 0.5058001658392592, + "learning_rate": 9.99964164547219e-05, + "loss": 3.6006, + "step": 6665 + }, + { + "epoch": 0.31035686849640337, + "grad_norm": 0.514488423426204, + "learning_rate": 9.999638395146801e-05, + "loss": 3.5426, + "step": 6666 + }, + { + "epoch": 0.31040342668249643, + "grad_norm": 0.5145785150995699, + "learning_rate": 9.999635130147967e-05, + "loss": 3.6471, + "step": 6667 + }, + { + "epoch": 0.31044998486858955, + "grad_norm": 0.4515329981465099, + "learning_rate": 9.999631850475696e-05, + "loss": 3.5202, + "step": 6668 + }, + { + "epoch": 0.3104965430546826, + "grad_norm": 0.5122039491841746, + "learning_rate": 9.999628556129998e-05, + "loss": 3.5569, + "step": 6669 + }, + { + "epoch": 0.31054310124077567, + "grad_norm": 0.47436431481644487, + "learning_rate": 9.999625247110883e-05, + "loss": 3.634, + "step": 6670 + }, + { + "epoch": 0.3105896594268687, + "grad_norm": 0.5062932806127615, + "learning_rate": 9.99962192341836e-05, + "loss": 3.6829, + "step": 6671 + }, + { + "epoch": 0.3106362176129618, + "grad_norm": 0.5666414882335437, + "learning_rate": 9.999618585052439e-05, + "loss": 3.5976, + "step": 6672 + }, + { + "epoch": 0.31068277579905484, + "grad_norm": 0.5184475247041113, + "learning_rate": 9.99961523201313e-05, + "loss": 3.4301, + "step": 6673 + }, + { + "epoch": 0.31072933398514796, + "grad_norm": 0.5178374959511236, + "learning_rate": 9.999611864300445e-05, + "loss": 3.5204, + "step": 6674 + }, + { + "epoch": 0.310775892171241, + "grad_norm": 0.5238675557201277, + "learning_rate": 9.999608481914389e-05, + "loss": 3.5484, + "step": 6675 + }, + { + "epoch": 0.3108224503573341, + "grad_norm": 0.5362799649371316, + "learning_rate": 9.999605084854976e-05, + "loss": 3.3847, + "step": 6676 + }, + { + "epoch": 0.31086900854342714, + "grad_norm": 0.4570094449710533, + "learning_rate": 9.999601673122214e-05, + "loss": 3.5467, + "step": 6677 + }, + { + "epoch": 0.3109155667295202, + "grad_norm": 0.5062361607444952, + "learning_rate": 9.999598246716114e-05, + "loss": 3.5976, + "step": 6678 + }, + { + "epoch": 0.3109621249156133, + "grad_norm": 0.49726443682973864, + "learning_rate": 9.999594805636686e-05, + "loss": 3.3888, + "step": 6679 + }, + { + "epoch": 0.3110086831017064, + "grad_norm": 0.4331852371746275, + "learning_rate": 9.999591349883939e-05, + "loss": 3.5135, + "step": 6680 + }, + { + "epoch": 0.31105524128779943, + "grad_norm": 0.4763257628822235, + "learning_rate": 9.999587879457886e-05, + "loss": 3.6451, + "step": 6681 + }, + { + "epoch": 0.3111017994738925, + "grad_norm": 0.5198486417228215, + "learning_rate": 9.999584394358532e-05, + "loss": 3.4103, + "step": 6682 + }, + { + "epoch": 0.31114835765998555, + "grad_norm": 0.5522685587535271, + "learning_rate": 9.999580894585891e-05, + "loss": 3.4701, + "step": 6683 + }, + { + "epoch": 0.3111949158460786, + "grad_norm": 0.5356724311089105, + "learning_rate": 9.999577380139975e-05, + "loss": 3.5663, + "step": 6684 + }, + { + "epoch": 0.3112414740321717, + "grad_norm": 0.5690287989248392, + "learning_rate": 9.99957385102079e-05, + 
"loss": 3.5093, + "step": 6685 + }, + { + "epoch": 0.3112880322182648, + "grad_norm": 0.5589139474418858, + "learning_rate": 9.999570307228349e-05, + "loss": 3.6118, + "step": 6686 + }, + { + "epoch": 0.31133459040435785, + "grad_norm": 0.5260638104625533, + "learning_rate": 9.999566748762661e-05, + "loss": 3.5816, + "step": 6687 + }, + { + "epoch": 0.3113811485904509, + "grad_norm": 0.511753093526167, + "learning_rate": 9.999563175623738e-05, + "loss": 3.4457, + "step": 6688 + }, + { + "epoch": 0.31142770677654397, + "grad_norm": 0.5397936676190673, + "learning_rate": 9.999559587811589e-05, + "loss": 3.5663, + "step": 6689 + }, + { + "epoch": 0.3114742649626371, + "grad_norm": 0.5558909691166303, + "learning_rate": 9.999555985326224e-05, + "loss": 3.4508, + "step": 6690 + }, + { + "epoch": 0.31152082314873014, + "grad_norm": 0.5498889287191152, + "learning_rate": 9.999552368167658e-05, + "loss": 3.4866, + "step": 6691 + }, + { + "epoch": 0.3115673813348232, + "grad_norm": 0.5701458886286249, + "learning_rate": 9.999548736335895e-05, + "loss": 3.5472, + "step": 6692 + }, + { + "epoch": 0.31161393952091626, + "grad_norm": 0.5800455473362378, + "learning_rate": 9.99954508983095e-05, + "loss": 3.5686, + "step": 6693 + }, + { + "epoch": 0.3116604977070093, + "grad_norm": 0.4822687893771743, + "learning_rate": 9.999541428652834e-05, + "loss": 3.5308, + "step": 6694 + }, + { + "epoch": 0.3117070558931024, + "grad_norm": 0.5090614762181277, + "learning_rate": 9.999537752801554e-05, + "loss": 3.4864, + "step": 6695 + }, + { + "epoch": 0.3117536140791955, + "grad_norm": 0.4781729402778489, + "learning_rate": 9.999534062277125e-05, + "loss": 3.4807, + "step": 6696 + }, + { + "epoch": 0.31180017226528856, + "grad_norm": 0.49384662143003816, + "learning_rate": 9.999530357079556e-05, + "loss": 3.5609, + "step": 6697 + }, + { + "epoch": 0.3118467304513816, + "grad_norm": 0.5015500719253001, + "learning_rate": 9.999526637208858e-05, + "loss": 3.4788, + "step": 6698 + }, + { + "epoch": 0.3118932886374747, + "grad_norm": 0.5138057118559375, + "learning_rate": 9.999522902665042e-05, + "loss": 3.6154, + "step": 6699 + }, + { + "epoch": 0.31193984682356773, + "grad_norm": 0.5627742509083101, + "learning_rate": 9.999519153448118e-05, + "loss": 3.6176, + "step": 6700 + }, + { + "epoch": 0.31198640500966085, + "grad_norm": 0.4633525138217006, + "learning_rate": 9.999515389558098e-05, + "loss": 3.4974, + "step": 6701 + }, + { + "epoch": 0.3120329631957539, + "grad_norm": 0.46692595329846215, + "learning_rate": 9.999511610994992e-05, + "loss": 3.5517, + "step": 6702 + }, + { + "epoch": 0.31207952138184697, + "grad_norm": 0.5062401353462437, + "learning_rate": 9.999507817758814e-05, + "loss": 3.5342, + "step": 6703 + }, + { + "epoch": 0.31212607956794003, + "grad_norm": 0.5253424916560316, + "learning_rate": 9.999504009849571e-05, + "loss": 3.6285, + "step": 6704 + }, + { + "epoch": 0.3121726377540331, + "grad_norm": 0.500372448138012, + "learning_rate": 9.999500187267277e-05, + "loss": 3.5239, + "step": 6705 + }, + { + "epoch": 0.31221919594012615, + "grad_norm": 0.48726988046373293, + "learning_rate": 9.999496350011943e-05, + "loss": 3.4632, + "step": 6706 + }, + { + "epoch": 0.31226575412621926, + "grad_norm": 0.4887597902537717, + "learning_rate": 9.99949249808358e-05, + "loss": 3.6885, + "step": 6707 + }, + { + "epoch": 0.3123123123123123, + "grad_norm": 0.4922797629768826, + "learning_rate": 9.9994886314822e-05, + "loss": 3.5588, + "step": 6708 + }, + { + "epoch": 0.3123588704984054, + "grad_norm": 
0.5275033604561201, + "learning_rate": 9.99948475020781e-05, + "loss": 3.5728, + "step": 6709 + }, + { + "epoch": 0.31240542868449844, + "grad_norm": 0.48872770787293207, + "learning_rate": 9.999480854260428e-05, + "loss": 3.5508, + "step": 6710 + }, + { + "epoch": 0.3124519868705915, + "grad_norm": 0.43332428506048065, + "learning_rate": 9.99947694364006e-05, + "loss": 3.5705, + "step": 6711 + }, + { + "epoch": 0.3124985450566846, + "grad_norm": 0.4801491450002416, + "learning_rate": 9.999473018346722e-05, + "loss": 3.4839, + "step": 6712 + }, + { + "epoch": 0.3125451032427777, + "grad_norm": 0.3893397153538327, + "learning_rate": 9.999469078380421e-05, + "loss": 3.5635, + "step": 6713 + }, + { + "epoch": 0.31259166142887074, + "grad_norm": 0.47301590118596937, + "learning_rate": 9.999465123741171e-05, + "loss": 3.4558, + "step": 6714 + }, + { + "epoch": 0.3126382196149638, + "grad_norm": 0.5587204448100307, + "learning_rate": 9.999461154428985e-05, + "loss": 3.5076, + "step": 6715 + }, + { + "epoch": 0.31268477780105686, + "grad_norm": 0.5262616946792246, + "learning_rate": 9.999457170443872e-05, + "loss": 3.4889, + "step": 6716 + }, + { + "epoch": 0.3127313359871499, + "grad_norm": 0.5277309498778197, + "learning_rate": 9.999453171785846e-05, + "loss": 3.4151, + "step": 6717 + }, + { + "epoch": 0.31277789417324303, + "grad_norm": 0.5636707033663614, + "learning_rate": 9.999449158454915e-05, + "loss": 3.5623, + "step": 6718 + }, + { + "epoch": 0.3128244523593361, + "grad_norm": 0.6873896003193005, + "learning_rate": 9.999445130451095e-05, + "loss": 3.6027, + "step": 6719 + }, + { + "epoch": 0.31287101054542915, + "grad_norm": 0.5869262471694838, + "learning_rate": 9.999441087774394e-05, + "loss": 3.3695, + "step": 6720 + }, + { + "epoch": 0.3129175687315222, + "grad_norm": 0.48449585011912033, + "learning_rate": 9.99943703042483e-05, + "loss": 3.5298, + "step": 6721 + }, + { + "epoch": 0.31296412691761527, + "grad_norm": 0.5849425558020742, + "learning_rate": 9.999432958402409e-05, + "loss": 3.5851, + "step": 6722 + }, + { + "epoch": 0.3130106851037084, + "grad_norm": 0.5390347222129794, + "learning_rate": 9.999428871707143e-05, + "loss": 3.4569, + "step": 6723 + }, + { + "epoch": 0.31305724328980145, + "grad_norm": 0.4865727250076929, + "learning_rate": 9.999424770339048e-05, + "loss": 3.4833, + "step": 6724 + }, + { + "epoch": 0.3131038014758945, + "grad_norm": 0.5313980535230158, + "learning_rate": 9.999420654298132e-05, + "loss": 3.5544, + "step": 6725 + }, + { + "epoch": 0.31315035966198757, + "grad_norm": 0.6840790226071278, + "learning_rate": 9.99941652358441e-05, + "loss": 3.5885, + "step": 6726 + }, + { + "epoch": 0.3131969178480806, + "grad_norm": 0.6125172820473431, + "learning_rate": 9.999412378197892e-05, + "loss": 3.5381, + "step": 6727 + }, + { + "epoch": 0.3132434760341737, + "grad_norm": 0.5929512680659356, + "learning_rate": 9.999408218138593e-05, + "loss": 3.5933, + "step": 6728 + }, + { + "epoch": 0.3132900342202668, + "grad_norm": 0.64318274297572, + "learning_rate": 9.999404043406521e-05, + "loss": 3.6238, + "step": 6729 + }, + { + "epoch": 0.31333659240635986, + "grad_norm": 0.5946229336429111, + "learning_rate": 9.999399854001692e-05, + "loss": 3.6138, + "step": 6730 + }, + { + "epoch": 0.3133831505924529, + "grad_norm": 0.5729697830428476, + "learning_rate": 9.999395649924117e-05, + "loss": 3.5629, + "step": 6731 + }, + { + "epoch": 0.313429708778546, + "grad_norm": 0.5865281069939814, + "learning_rate": 9.999391431173809e-05, + "loss": 3.4339, + "step": 6732 + 
}, + { + "epoch": 0.31347626696463904, + "grad_norm": 0.5081526894906068, + "learning_rate": 9.999387197750778e-05, + "loss": 3.5338, + "step": 6733 + }, + { + "epoch": 0.31352282515073215, + "grad_norm": 0.5657769069085001, + "learning_rate": 9.99938294965504e-05, + "loss": 3.4678, + "step": 6734 + }, + { + "epoch": 0.3135693833368252, + "grad_norm": 0.5535861939164176, + "learning_rate": 9.999378686886605e-05, + "loss": 3.6092, + "step": 6735 + }, + { + "epoch": 0.3136159415229183, + "grad_norm": 0.5149797589764741, + "learning_rate": 9.999374409445485e-05, + "loss": 3.5998, + "step": 6736 + }, + { + "epoch": 0.31366249970901133, + "grad_norm": 0.49463907448821925, + "learning_rate": 9.999370117331695e-05, + "loss": 3.4795, + "step": 6737 + }, + { + "epoch": 0.3137090578951044, + "grad_norm": 0.4940534485593699, + "learning_rate": 9.999365810545245e-05, + "loss": 3.5413, + "step": 6738 + }, + { + "epoch": 0.31375561608119745, + "grad_norm": 0.5038854214711319, + "learning_rate": 9.99936148908615e-05, + "loss": 3.5112, + "step": 6739 + }, + { + "epoch": 0.31380217426729057, + "grad_norm": 0.488469981370805, + "learning_rate": 9.999357152954421e-05, + "loss": 3.6473, + "step": 6740 + }, + { + "epoch": 0.3138487324533836, + "grad_norm": 0.4856075061086257, + "learning_rate": 9.99935280215007e-05, + "loss": 3.5422, + "step": 6741 + }, + { + "epoch": 0.3138952906394767, + "grad_norm": 0.4859238029703879, + "learning_rate": 9.999348436673112e-05, + "loss": 3.5037, + "step": 6742 + }, + { + "epoch": 0.31394184882556975, + "grad_norm": 0.5420482328114432, + "learning_rate": 9.99934405652356e-05, + "loss": 3.538, + "step": 6743 + }, + { + "epoch": 0.3139884070116628, + "grad_norm": 0.5374825856943511, + "learning_rate": 9.999339661701424e-05, + "loss": 3.5127, + "step": 6744 + }, + { + "epoch": 0.3140349651977559, + "grad_norm": 0.5653798433231424, + "learning_rate": 9.99933525220672e-05, + "loss": 3.5729, + "step": 6745 + }, + { + "epoch": 0.314081523383849, + "grad_norm": 0.4357751976196311, + "learning_rate": 9.99933082803946e-05, + "loss": 3.5813, + "step": 6746 + }, + { + "epoch": 0.31412808156994204, + "grad_norm": 0.500534598654961, + "learning_rate": 9.999326389199654e-05, + "loss": 3.5258, + "step": 6747 + }, + { + "epoch": 0.3141746397560351, + "grad_norm": 0.4043440346871604, + "learning_rate": 9.99932193568732e-05, + "loss": 3.4829, + "step": 6748 + }, + { + "epoch": 0.31422119794212816, + "grad_norm": 0.5121269917323285, + "learning_rate": 9.999317467502466e-05, + "loss": 3.4438, + "step": 6749 + }, + { + "epoch": 0.3142677561282212, + "grad_norm": 0.5462866040384847, + "learning_rate": 9.99931298464511e-05, + "loss": 3.5092, + "step": 6750 + }, + { + "epoch": 0.31431431431431434, + "grad_norm": 0.4665625487848, + "learning_rate": 9.999308487115263e-05, + "loss": 3.5517, + "step": 6751 + }, + { + "epoch": 0.3143608725004074, + "grad_norm": 0.5612800123364209, + "learning_rate": 9.999303974912938e-05, + "loss": 3.5662, + "step": 6752 + }, + { + "epoch": 0.31440743068650046, + "grad_norm": 0.5827299243637778, + "learning_rate": 9.999299448038147e-05, + "loss": 3.4755, + "step": 6753 + }, + { + "epoch": 0.3144539888725935, + "grad_norm": 0.6229853020652993, + "learning_rate": 9.999294906490906e-05, + "loss": 3.5647, + "step": 6754 + }, + { + "epoch": 0.3145005470586866, + "grad_norm": 0.5632188636413535, + "learning_rate": 9.999290350271226e-05, + "loss": 3.5459, + "step": 6755 + }, + { + "epoch": 0.3145471052447797, + "grad_norm": 0.55989714022437, + "learning_rate": 
9.999285779379121e-05, + "loss": 3.5813, + "step": 6756 + }, + { + "epoch": 0.31459366343087275, + "grad_norm": 0.5995970510517614, + "learning_rate": 9.999281193814607e-05, + "loss": 3.5961, + "step": 6757 + }, + { + "epoch": 0.3146402216169658, + "grad_norm": 0.5922071725545408, + "learning_rate": 9.999276593577693e-05, + "loss": 3.4476, + "step": 6758 + }, + { + "epoch": 0.31468677980305887, + "grad_norm": 0.6119072407454808, + "learning_rate": 9.999271978668396e-05, + "loss": 3.506, + "step": 6759 + }, + { + "epoch": 0.31473333798915193, + "grad_norm": 0.5827341219870454, + "learning_rate": 9.999267349086727e-05, + "loss": 3.5733, + "step": 6760 + }, + { + "epoch": 0.314779896175245, + "grad_norm": 0.5927912432724319, + "learning_rate": 9.999262704832701e-05, + "loss": 3.6442, + "step": 6761 + }, + { + "epoch": 0.3148264543613381, + "grad_norm": 0.5989745710643523, + "learning_rate": 9.999258045906332e-05, + "loss": 3.5856, + "step": 6762 + }, + { + "epoch": 0.31487301254743116, + "grad_norm": 0.5964680682506475, + "learning_rate": 9.999253372307634e-05, + "loss": 3.4523, + "step": 6763 + }, + { + "epoch": 0.3149195707335242, + "grad_norm": 0.5630016790074537, + "learning_rate": 9.99924868403662e-05, + "loss": 3.5043, + "step": 6764 + }, + { + "epoch": 0.3149661289196173, + "grad_norm": 0.6007845908011125, + "learning_rate": 9.999243981093302e-05, + "loss": 3.5268, + "step": 6765 + }, + { + "epoch": 0.31501268710571034, + "grad_norm": 0.5765120494682909, + "learning_rate": 9.999239263477696e-05, + "loss": 3.6014, + "step": 6766 + }, + { + "epoch": 0.31505924529180346, + "grad_norm": 0.6516562670554029, + "learning_rate": 9.999234531189815e-05, + "loss": 3.5637, + "step": 6767 + }, + { + "epoch": 0.3151058034778965, + "grad_norm": 0.679552151066458, + "learning_rate": 9.999229784229674e-05, + "loss": 3.5672, + "step": 6768 + }, + { + "epoch": 0.3151523616639896, + "grad_norm": 0.535461663608937, + "learning_rate": 9.999225022597285e-05, + "loss": 3.4826, + "step": 6769 + }, + { + "epoch": 0.31519891985008264, + "grad_norm": 0.5415267442237053, + "learning_rate": 9.999220246292665e-05, + "loss": 3.5393, + "step": 6770 + }, + { + "epoch": 0.3152454780361757, + "grad_norm": 0.5296998399442661, + "learning_rate": 9.999215455315824e-05, + "loss": 3.5409, + "step": 6771 + }, + { + "epoch": 0.31529203622226876, + "grad_norm": 0.40373590853202823, + "learning_rate": 9.999210649666778e-05, + "loss": 3.4628, + "step": 6772 + }, + { + "epoch": 0.31533859440836187, + "grad_norm": 0.4665479776085227, + "learning_rate": 9.999205829345543e-05, + "loss": 3.642, + "step": 6773 + }, + { + "epoch": 0.31538515259445493, + "grad_norm": 0.5377512712848319, + "learning_rate": 9.99920099435213e-05, + "loss": 3.5855, + "step": 6774 + }, + { + "epoch": 0.315431710780548, + "grad_norm": 0.5467505527056664, + "learning_rate": 9.999196144686556e-05, + "loss": 3.5758, + "step": 6775 + }, + { + "epoch": 0.31547826896664105, + "grad_norm": 0.47325015186958663, + "learning_rate": 9.999191280348833e-05, + "loss": 3.4628, + "step": 6776 + }, + { + "epoch": 0.3155248271527341, + "grad_norm": 0.4661059854698323, + "learning_rate": 9.999186401338975e-05, + "loss": 3.5562, + "step": 6777 + }, + { + "epoch": 0.3155713853388272, + "grad_norm": 0.607154085959551, + "learning_rate": 9.999181507656998e-05, + "loss": 3.5607, + "step": 6778 + }, + { + "epoch": 0.3156179435249203, + "grad_norm": 0.513775646850281, + "learning_rate": 9.999176599302916e-05, + "loss": 3.6224, + "step": 6779 + }, + { + "epoch": 0.31566450171101335, + 
"grad_norm": 0.464731177842669, + "learning_rate": 9.999171676276744e-05, + "loss": 3.4333, + "step": 6780 + }, + { + "epoch": 0.3157110598971064, + "grad_norm": 0.5280157139995084, + "learning_rate": 9.999166738578495e-05, + "loss": 3.5468, + "step": 6781 + }, + { + "epoch": 0.31575761808319946, + "grad_norm": 0.5586839448218622, + "learning_rate": 9.999161786208185e-05, + "loss": 3.5332, + "step": 6782 + }, + { + "epoch": 0.3158041762692925, + "grad_norm": 0.5133493256240191, + "learning_rate": 9.999156819165825e-05, + "loss": 3.4367, + "step": 6783 + }, + { + "epoch": 0.31585073445538564, + "grad_norm": 0.45662086912347133, + "learning_rate": 9.999151837451435e-05, + "loss": 3.617, + "step": 6784 + }, + { + "epoch": 0.3158972926414787, + "grad_norm": 0.5410305933058414, + "learning_rate": 9.999146841065026e-05, + "loss": 3.4847, + "step": 6785 + }, + { + "epoch": 0.31594385082757176, + "grad_norm": 0.6773029503339815, + "learning_rate": 9.999141830006613e-05, + "loss": 3.5571, + "step": 6786 + }, + { + "epoch": 0.3159904090136648, + "grad_norm": 0.681537153867895, + "learning_rate": 9.999136804276212e-05, + "loss": 3.5483, + "step": 6787 + }, + { + "epoch": 0.3160369671997579, + "grad_norm": 0.6500681012717328, + "learning_rate": 9.999131763873836e-05, + "loss": 3.5167, + "step": 6788 + }, + { + "epoch": 0.316083525385851, + "grad_norm": 0.5869256073354666, + "learning_rate": 9.999126708799502e-05, + "loss": 3.4604, + "step": 6789 + }, + { + "epoch": 0.31613008357194405, + "grad_norm": 0.5533451706609407, + "learning_rate": 9.999121639053221e-05, + "loss": 3.5414, + "step": 6790 + }, + { + "epoch": 0.3161766417580371, + "grad_norm": 0.5446106368436822, + "learning_rate": 9.999116554635013e-05, + "loss": 3.5115, + "step": 6791 + }, + { + "epoch": 0.3162231999441302, + "grad_norm": 0.5171160994451525, + "learning_rate": 9.99911145554489e-05, + "loss": 3.5539, + "step": 6792 + }, + { + "epoch": 0.31626975813022323, + "grad_norm": 0.513748505094888, + "learning_rate": 9.999106341782867e-05, + "loss": 3.453, + "step": 6793 + }, + { + "epoch": 0.3163163163163163, + "grad_norm": 0.549146114983969, + "learning_rate": 9.99910121334896e-05, + "loss": 3.5063, + "step": 6794 + }, + { + "epoch": 0.3163628745024094, + "grad_norm": 0.514077188252268, + "learning_rate": 9.999096070243183e-05, + "loss": 3.586, + "step": 6795 + }, + { + "epoch": 0.31640943268850247, + "grad_norm": 0.45636052279490646, + "learning_rate": 9.999090912465552e-05, + "loss": 3.52, + "step": 6796 + }, + { + "epoch": 0.3164559908745955, + "grad_norm": 0.5202608694993192, + "learning_rate": 9.999085740016081e-05, + "loss": 3.4932, + "step": 6797 + }, + { + "epoch": 0.3165025490606886, + "grad_norm": 0.45748755609354463, + "learning_rate": 9.999080552894786e-05, + "loss": 3.6175, + "step": 6798 + }, + { + "epoch": 0.31654910724678165, + "grad_norm": 0.47018836895771776, + "learning_rate": 9.999075351101682e-05, + "loss": 3.5815, + "step": 6799 + }, + { + "epoch": 0.31659566543287476, + "grad_norm": 0.5316960216781609, + "learning_rate": 9.999070134636786e-05, + "loss": 3.6098, + "step": 6800 + }, + { + "epoch": 0.3166422236189678, + "grad_norm": 0.5134231422246441, + "learning_rate": 9.999064903500109e-05, + "loss": 3.5839, + "step": 6801 + }, + { + "epoch": 0.3166887818050609, + "grad_norm": 0.6115904771565361, + "learning_rate": 9.99905965769167e-05, + "loss": 3.5251, + "step": 6802 + }, + { + "epoch": 0.31673533999115394, + "grad_norm": 0.6552637562969978, + "learning_rate": 9.999054397211483e-05, + "loss": 3.449, + "step": 
6803 + }, + { + "epoch": 0.316781898177247, + "grad_norm": 0.512007076708839, + "learning_rate": 9.999049122059565e-05, + "loss": 3.6015, + "step": 6804 + }, + { + "epoch": 0.31682845636334006, + "grad_norm": 0.586125730422755, + "learning_rate": 9.99904383223593e-05, + "loss": 3.5456, + "step": 6805 + }, + { + "epoch": 0.3168750145494332, + "grad_norm": 0.5948786089046904, + "learning_rate": 9.999038527740592e-05, + "loss": 3.5126, + "step": 6806 + }, + { + "epoch": 0.31692157273552624, + "grad_norm": 0.5738518398539669, + "learning_rate": 9.999033208573571e-05, + "loss": 3.5824, + "step": 6807 + }, + { + "epoch": 0.3169681309216193, + "grad_norm": 0.5882806897892597, + "learning_rate": 9.999027874734878e-05, + "loss": 3.6076, + "step": 6808 + }, + { + "epoch": 0.31701468910771236, + "grad_norm": 0.541887957475745, + "learning_rate": 9.999022526224534e-05, + "loss": 3.496, + "step": 6809 + }, + { + "epoch": 0.3170612472938054, + "grad_norm": 0.45494114745943803, + "learning_rate": 9.999017163042547e-05, + "loss": 3.5827, + "step": 6810 + }, + { + "epoch": 0.31710780547989853, + "grad_norm": 0.4984578177997869, + "learning_rate": 9.999011785188939e-05, + "loss": 3.577, + "step": 6811 + }, + { + "epoch": 0.3171543636659916, + "grad_norm": 0.44445236925106696, + "learning_rate": 9.999006392663723e-05, + "loss": 3.4983, + "step": 6812 + }, + { + "epoch": 0.31720092185208465, + "grad_norm": 0.49632816268079377, + "learning_rate": 9.999000985466917e-05, + "loss": 3.4027, + "step": 6813 + }, + { + "epoch": 0.3172474800381777, + "grad_norm": 0.46891702081853953, + "learning_rate": 9.998995563598536e-05, + "loss": 3.5644, + "step": 6814 + }, + { + "epoch": 0.31729403822427077, + "grad_norm": 0.4651695787067736, + "learning_rate": 9.998990127058595e-05, + "loss": 3.4195, + "step": 6815 + }, + { + "epoch": 0.31734059641036383, + "grad_norm": 0.48315017076210776, + "learning_rate": 9.99898467584711e-05, + "loss": 3.5253, + "step": 6816 + }, + { + "epoch": 0.31738715459645694, + "grad_norm": 0.5939164286282846, + "learning_rate": 9.998979209964097e-05, + "loss": 3.5746, + "step": 6817 + }, + { + "epoch": 0.31743371278255, + "grad_norm": 0.5565997216702266, + "learning_rate": 9.998973729409574e-05, + "loss": 3.403, + "step": 6818 + }, + { + "epoch": 0.31748027096864306, + "grad_norm": 0.411305426434964, + "learning_rate": 9.998968234183555e-05, + "loss": 3.5563, + "step": 6819 + }, + { + "epoch": 0.3175268291547361, + "grad_norm": 0.5206554678467797, + "learning_rate": 9.998962724286056e-05, + "loss": 3.5409, + "step": 6820 + }, + { + "epoch": 0.3175733873408292, + "grad_norm": 0.54949947697293, + "learning_rate": 9.998957199717094e-05, + "loss": 3.5134, + "step": 6821 + }, + { + "epoch": 0.3176199455269223, + "grad_norm": 0.5242349833565955, + "learning_rate": 9.998951660476687e-05, + "loss": 3.4707, + "step": 6822 + }, + { + "epoch": 0.31766650371301536, + "grad_norm": 0.4528294395491294, + "learning_rate": 9.998946106564848e-05, + "loss": 3.4961, + "step": 6823 + }, + { + "epoch": 0.3177130618991084, + "grad_norm": 0.5320068114327146, + "learning_rate": 9.998940537981593e-05, + "loss": 3.4214, + "step": 6824 + }, + { + "epoch": 0.3177596200852015, + "grad_norm": 0.5024170368229716, + "learning_rate": 9.998934954726942e-05, + "loss": 3.5262, + "step": 6825 + }, + { + "epoch": 0.31780617827129454, + "grad_norm": 0.4426924566983365, + "learning_rate": 9.998929356800909e-05, + "loss": 3.4788, + "step": 6826 + }, + { + "epoch": 0.3178527364573876, + "grad_norm": 0.5342717312290248, + "learning_rate": 
9.99892374420351e-05, + "loss": 3.5259, + "step": 6827 + }, + { + "epoch": 0.3178992946434807, + "grad_norm": 0.6052281254508418, + "learning_rate": 9.998918116934764e-05, + "loss": 3.59, + "step": 6828 + }, + { + "epoch": 0.31794585282957377, + "grad_norm": 0.5520110518710574, + "learning_rate": 9.998912474994684e-05, + "loss": 3.637, + "step": 6829 + }, + { + "epoch": 0.31799241101566683, + "grad_norm": 0.6116765743851506, + "learning_rate": 9.99890681838329e-05, + "loss": 3.5454, + "step": 6830 + }, + { + "epoch": 0.3180389692017599, + "grad_norm": 0.4866561250644095, + "learning_rate": 9.998901147100596e-05, + "loss": 3.4801, + "step": 6831 + }, + { + "epoch": 0.31808552738785295, + "grad_norm": 0.5110485524315058, + "learning_rate": 9.998895461146619e-05, + "loss": 3.5304, + "step": 6832 + }, + { + "epoch": 0.31813208557394607, + "grad_norm": 0.49746437324260184, + "learning_rate": 9.998889760521377e-05, + "loss": 3.555, + "step": 6833 + }, + { + "epoch": 0.3181786437600391, + "grad_norm": 0.45342389643538533, + "learning_rate": 9.998884045224886e-05, + "loss": 3.4814, + "step": 6834 + }, + { + "epoch": 0.3182252019461322, + "grad_norm": 0.46124884448566633, + "learning_rate": 9.998878315257163e-05, + "loss": 3.4961, + "step": 6835 + }, + { + "epoch": 0.31827176013222525, + "grad_norm": 0.48104047738490435, + "learning_rate": 9.998872570618225e-05, + "loss": 3.529, + "step": 6836 + }, + { + "epoch": 0.3183183183183183, + "grad_norm": 0.4870552060848134, + "learning_rate": 9.998866811308087e-05, + "loss": 3.4479, + "step": 6837 + }, + { + "epoch": 0.31836487650441136, + "grad_norm": 0.42776552765833, + "learning_rate": 9.99886103732677e-05, + "loss": 3.4271, + "step": 6838 + }, + { + "epoch": 0.3184114346905045, + "grad_norm": 0.410903643989048, + "learning_rate": 9.998855248674286e-05, + "loss": 3.4266, + "step": 6839 + }, + { + "epoch": 0.31845799287659754, + "grad_norm": 0.4689842909993842, + "learning_rate": 9.998849445350655e-05, + "loss": 3.4084, + "step": 6840 + }, + { + "epoch": 0.3185045510626906, + "grad_norm": 0.4840039319590851, + "learning_rate": 9.998843627355892e-05, + "loss": 3.6104, + "step": 6841 + }, + { + "epoch": 0.31855110924878366, + "grad_norm": 0.4777162668560733, + "learning_rate": 9.998837794690017e-05, + "loss": 3.4532, + "step": 6842 + }, + { + "epoch": 0.3185976674348767, + "grad_norm": 0.5389030718459019, + "learning_rate": 9.998831947353045e-05, + "loss": 3.5369, + "step": 6843 + }, + { + "epoch": 0.31864422562096983, + "grad_norm": 0.6131566367357744, + "learning_rate": 9.998826085344994e-05, + "loss": 3.4649, + "step": 6844 + }, + { + "epoch": 0.3186907838070629, + "grad_norm": 0.5615107168174124, + "learning_rate": 9.998820208665881e-05, + "loss": 3.5246, + "step": 6845 + }, + { + "epoch": 0.31873734199315595, + "grad_norm": 0.6251978402409758, + "learning_rate": 9.998814317315723e-05, + "loss": 3.5471, + "step": 6846 + }, + { + "epoch": 0.318783900179249, + "grad_norm": 0.48051453341270856, + "learning_rate": 9.998808411294537e-05, + "loss": 3.5427, + "step": 6847 + }, + { + "epoch": 0.3188304583653421, + "grad_norm": 0.4779972763138907, + "learning_rate": 9.998802490602342e-05, + "loss": 3.4967, + "step": 6848 + }, + { + "epoch": 0.31887701655143513, + "grad_norm": 0.4889613836287104, + "learning_rate": 9.998796555239154e-05, + "loss": 3.4332, + "step": 6849 + }, + { + "epoch": 0.31892357473752825, + "grad_norm": 0.5229710450872915, + "learning_rate": 9.998790605204989e-05, + "loss": 3.4367, + "step": 6850 + }, + { + "epoch": 0.3189701329236213, + 
"grad_norm": 0.49190076183277115, + "learning_rate": 9.998784640499868e-05, + "loss": 3.5543, + "step": 6851 + }, + { + "epoch": 0.31901669110971437, + "grad_norm": 0.5769750992334778, + "learning_rate": 9.998778661123805e-05, + "loss": 3.4155, + "step": 6852 + }, + { + "epoch": 0.3190632492958074, + "grad_norm": 0.5893545058731668, + "learning_rate": 9.998772667076819e-05, + "loss": 3.4925, + "step": 6853 + }, + { + "epoch": 0.3191098074819005, + "grad_norm": 0.4737286847291865, + "learning_rate": 9.998766658358929e-05, + "loss": 3.657, + "step": 6854 + }, + { + "epoch": 0.3191563656679936, + "grad_norm": 0.49640704855518447, + "learning_rate": 9.99876063497015e-05, + "loss": 3.4565, + "step": 6855 + }, + { + "epoch": 0.31920292385408666, + "grad_norm": 0.5020223134444395, + "learning_rate": 9.998754596910503e-05, + "loss": 3.475, + "step": 6856 + }, + { + "epoch": 0.3192494820401797, + "grad_norm": 0.4506043034091593, + "learning_rate": 9.998748544180002e-05, + "loss": 3.4234, + "step": 6857 + }, + { + "epoch": 0.3192960402262728, + "grad_norm": 0.5013998871791654, + "learning_rate": 9.998742476778666e-05, + "loss": 3.4656, + "step": 6858 + }, + { + "epoch": 0.31934259841236584, + "grad_norm": 0.4602888596901336, + "learning_rate": 9.998736394706514e-05, + "loss": 3.4929, + "step": 6859 + }, + { + "epoch": 0.3193891565984589, + "grad_norm": 0.4782128640600509, + "learning_rate": 9.998730297963563e-05, + "loss": 3.5308, + "step": 6860 + }, + { + "epoch": 0.319435714784552, + "grad_norm": 0.4934759033315797, + "learning_rate": 9.998724186549832e-05, + "loss": 3.5192, + "step": 6861 + }, + { + "epoch": 0.3194822729706451, + "grad_norm": 0.4015680733862524, + "learning_rate": 9.998718060465337e-05, + "loss": 3.3763, + "step": 6862 + }, + { + "epoch": 0.31952883115673814, + "grad_norm": 0.46103448273310643, + "learning_rate": 9.998711919710098e-05, + "loss": 3.5397, + "step": 6863 + }, + { + "epoch": 0.3195753893428312, + "grad_norm": 0.49555089884207904, + "learning_rate": 9.998705764284131e-05, + "loss": 3.5646, + "step": 6864 + }, + { + "epoch": 0.31962194752892426, + "grad_norm": 0.46234232643461254, + "learning_rate": 9.998699594187454e-05, + "loss": 3.556, + "step": 6865 + }, + { + "epoch": 0.31966850571501737, + "grad_norm": 0.4586709059582017, + "learning_rate": 9.998693409420087e-05, + "loss": 3.6263, + "step": 6866 + }, + { + "epoch": 0.31971506390111043, + "grad_norm": 0.41197512553170296, + "learning_rate": 9.998687209982049e-05, + "loss": 3.5137, + "step": 6867 + }, + { + "epoch": 0.3197616220872035, + "grad_norm": 0.46904018759606914, + "learning_rate": 9.998680995873355e-05, + "loss": 3.532, + "step": 6868 + }, + { + "epoch": 0.31980818027329655, + "grad_norm": 0.535257871888125, + "learning_rate": 9.998674767094026e-05, + "loss": 3.5085, + "step": 6869 + }, + { + "epoch": 0.3198547384593896, + "grad_norm": 0.5793502635459553, + "learning_rate": 9.998668523644077e-05, + "loss": 3.4012, + "step": 6870 + }, + { + "epoch": 0.31990129664548267, + "grad_norm": 0.5611447794175974, + "learning_rate": 9.998662265523529e-05, + "loss": 3.5137, + "step": 6871 + }, + { + "epoch": 0.3199478548315758, + "grad_norm": 0.49870286673465264, + "learning_rate": 9.9986559927324e-05, + "loss": 3.5959, + "step": 6872 + }, + { + "epoch": 0.31999441301766884, + "grad_norm": 0.4321788706731322, + "learning_rate": 9.998649705270707e-05, + "loss": 3.5757, + "step": 6873 + }, + { + "epoch": 0.3200409712037619, + "grad_norm": 0.5811118463479806, + "learning_rate": 9.998643403138471e-05, + "loss": 3.4906, + 
"step": 6874 + }, + { + "epoch": 0.32008752938985496, + "grad_norm": 0.54669131888869, + "learning_rate": 9.998637086335709e-05, + "loss": 3.5669, + "step": 6875 + }, + { + "epoch": 0.320134087575948, + "grad_norm": 0.501279521807294, + "learning_rate": 9.998630754862439e-05, + "loss": 3.5672, + "step": 6876 + }, + { + "epoch": 0.32018064576204114, + "grad_norm": 0.5316365805249857, + "learning_rate": 9.99862440871868e-05, + "loss": 3.487, + "step": 6877 + }, + { + "epoch": 0.3202272039481342, + "grad_norm": 0.6497720427744184, + "learning_rate": 9.998618047904451e-05, + "loss": 3.514, + "step": 6878 + }, + { + "epoch": 0.32027376213422726, + "grad_norm": 0.738602871393028, + "learning_rate": 9.99861167241977e-05, + "loss": 3.567, + "step": 6879 + }, + { + "epoch": 0.3203203203203203, + "grad_norm": 0.5965385432135591, + "learning_rate": 9.998605282264658e-05, + "loss": 3.4847, + "step": 6880 + }, + { + "epoch": 0.3203668785064134, + "grad_norm": 0.5608145294159412, + "learning_rate": 9.998598877439132e-05, + "loss": 3.5327, + "step": 6881 + }, + { + "epoch": 0.32041343669250644, + "grad_norm": 0.6025983472821418, + "learning_rate": 9.998592457943208e-05, + "loss": 3.4875, + "step": 6882 + }, + { + "epoch": 0.32045999487859955, + "grad_norm": 0.5740824547065448, + "learning_rate": 9.99858602377691e-05, + "loss": 3.5134, + "step": 6883 + }, + { + "epoch": 0.3205065530646926, + "grad_norm": 0.5533541537469969, + "learning_rate": 9.998579574940253e-05, + "loss": 3.44, + "step": 6884 + }, + { + "epoch": 0.32055311125078567, + "grad_norm": 0.6362733573000899, + "learning_rate": 9.998573111433258e-05, + "loss": 3.6007, + "step": 6885 + }, + { + "epoch": 0.32059966943687873, + "grad_norm": 0.6650697697150594, + "learning_rate": 9.998566633255944e-05, + "loss": 3.5175, + "step": 6886 + }, + { + "epoch": 0.3206462276229718, + "grad_norm": 0.5738140638053504, + "learning_rate": 9.998560140408328e-05, + "loss": 3.5252, + "step": 6887 + }, + { + "epoch": 0.3206927858090649, + "grad_norm": 0.4994982381388185, + "learning_rate": 9.998553632890431e-05, + "loss": 3.3459, + "step": 6888 + }, + { + "epoch": 0.32073934399515797, + "grad_norm": 0.4343885980452634, + "learning_rate": 9.998547110702272e-05, + "loss": 3.5155, + "step": 6889 + }, + { + "epoch": 0.320785902181251, + "grad_norm": 0.4869368573585589, + "learning_rate": 9.99854057384387e-05, + "loss": 3.5146, + "step": 6890 + }, + { + "epoch": 0.3208324603673441, + "grad_norm": 0.5093465179074318, + "learning_rate": 9.998534022315242e-05, + "loss": 3.5899, + "step": 6891 + }, + { + "epoch": 0.32087901855343715, + "grad_norm": 0.5961818955312451, + "learning_rate": 9.998527456116411e-05, + "loss": 3.5595, + "step": 6892 + }, + { + "epoch": 0.3209255767395302, + "grad_norm": 0.6370623197932535, + "learning_rate": 9.998520875247395e-05, + "loss": 3.4563, + "step": 6893 + }, + { + "epoch": 0.3209721349256233, + "grad_norm": 0.5640649163693747, + "learning_rate": 9.998514279708211e-05, + "loss": 3.4897, + "step": 6894 + }, + { + "epoch": 0.3210186931117164, + "grad_norm": 0.5033865836285245, + "learning_rate": 9.99850766949888e-05, + "loss": 3.5769, + "step": 6895 + }, + { + "epoch": 0.32106525129780944, + "grad_norm": 0.5254871119929309, + "learning_rate": 9.998501044619422e-05, + "loss": 3.4354, + "step": 6896 + }, + { + "epoch": 0.3211118094839025, + "grad_norm": 0.5195874253107808, + "learning_rate": 9.998494405069855e-05, + "loss": 3.6004, + "step": 6897 + }, + { + "epoch": 0.32115836766999556, + "grad_norm": 0.536327504851575, + "learning_rate": 
9.9984877508502e-05, + "loss": 3.4863, + "step": 6898 + }, + { + "epoch": 0.3212049258560887, + "grad_norm": 0.5651446633688536, + "learning_rate": 9.998481081960476e-05, + "loss": 3.6017, + "step": 6899 + }, + { + "epoch": 0.32125148404218173, + "grad_norm": 0.5403593366754829, + "learning_rate": 9.998474398400702e-05, + "loss": 3.595, + "step": 6900 + }, + { + "epoch": 0.3212980422282748, + "grad_norm": 0.5701855979783367, + "learning_rate": 9.998467700170899e-05, + "loss": 3.5476, + "step": 6901 + }, + { + "epoch": 0.32134460041436785, + "grad_norm": 0.6251971739026654, + "learning_rate": 9.998460987271084e-05, + "loss": 3.5154, + "step": 6902 + }, + { + "epoch": 0.3213911586004609, + "grad_norm": 0.5536604138143767, + "learning_rate": 9.99845425970128e-05, + "loss": 3.4974, + "step": 6903 + }, + { + "epoch": 0.321437716786554, + "grad_norm": 0.4519209839046022, + "learning_rate": 9.998447517461504e-05, + "loss": 3.5488, + "step": 6904 + }, + { + "epoch": 0.3214842749726471, + "grad_norm": 0.5313764372838516, + "learning_rate": 9.998440760551779e-05, + "loss": 3.5357, + "step": 6905 + }, + { + "epoch": 0.32153083315874015, + "grad_norm": 0.5190554222206888, + "learning_rate": 9.998433988972121e-05, + "loss": 3.4452, + "step": 6906 + }, + { + "epoch": 0.3215773913448332, + "grad_norm": 0.4650727951765128, + "learning_rate": 9.998427202722552e-05, + "loss": 3.4921, + "step": 6907 + }, + { + "epoch": 0.32162394953092627, + "grad_norm": 0.43286918436588306, + "learning_rate": 9.998420401803093e-05, + "loss": 3.3853, + "step": 6908 + }, + { + "epoch": 0.3216705077170193, + "grad_norm": 0.4983493870824559, + "learning_rate": 9.998413586213759e-05, + "loss": 3.416, + "step": 6909 + }, + { + "epoch": 0.32171706590311244, + "grad_norm": 0.5040082839318518, + "learning_rate": 9.998406755954576e-05, + "loss": 3.3587, + "step": 6910 + }, + { + "epoch": 0.3217636240892055, + "grad_norm": 0.44310099448954887, + "learning_rate": 9.998399911025562e-05, + "loss": 3.4933, + "step": 6911 + }, + { + "epoch": 0.32181018227529856, + "grad_norm": 0.46568955329148, + "learning_rate": 9.998393051426737e-05, + "loss": 3.4438, + "step": 6912 + }, + { + "epoch": 0.3218567404613916, + "grad_norm": 0.5104426711416398, + "learning_rate": 9.99838617715812e-05, + "loss": 3.5049, + "step": 6913 + }, + { + "epoch": 0.3219032986474847, + "grad_norm": 0.5198471017272671, + "learning_rate": 9.998379288219732e-05, + "loss": 3.4984, + "step": 6914 + }, + { + "epoch": 0.32194985683357774, + "grad_norm": 0.46263487858340585, + "learning_rate": 9.998372384611593e-05, + "loss": 3.4524, + "step": 6915 + }, + { + "epoch": 0.32199641501967086, + "grad_norm": 0.4960675934825502, + "learning_rate": 9.998365466333723e-05, + "loss": 3.616, + "step": 6916 + }, + { + "epoch": 0.3220429732057639, + "grad_norm": 0.5274258826362507, + "learning_rate": 9.998358533386144e-05, + "loss": 3.5064, + "step": 6917 + }, + { + "epoch": 0.322089531391857, + "grad_norm": 0.4502812178035264, + "learning_rate": 9.998351585768875e-05, + "loss": 3.5477, + "step": 6918 + }, + { + "epoch": 0.32213608957795004, + "grad_norm": 0.42201609979474286, + "learning_rate": 9.998344623481935e-05, + "loss": 3.5244, + "step": 6919 + }, + { + "epoch": 0.3221826477640431, + "grad_norm": 0.5088215124973493, + "learning_rate": 9.998337646525347e-05, + "loss": 3.5062, + "step": 6920 + }, + { + "epoch": 0.32222920595013615, + "grad_norm": 0.4337282145578154, + "learning_rate": 9.99833065489913e-05, + "loss": 3.5577, + "step": 6921 + }, + { + "epoch": 0.32227576413622927, + 
"grad_norm": 0.47765275936579427, + "learning_rate": 9.998323648603306e-05, + "loss": 3.5977, + "step": 6922 + }, + { + "epoch": 0.32232232232232233, + "grad_norm": 0.484422764798233, + "learning_rate": 9.998316627637893e-05, + "loss": 3.5342, + "step": 6923 + }, + { + "epoch": 0.3223688805084154, + "grad_norm": 0.4989666973241639, + "learning_rate": 9.998309592002914e-05, + "loss": 3.4776, + "step": 6924 + }, + { + "epoch": 0.32241543869450845, + "grad_norm": 0.5362304768653748, + "learning_rate": 9.99830254169839e-05, + "loss": 3.5734, + "step": 6925 + }, + { + "epoch": 0.3224619968806015, + "grad_norm": 0.5031670195522169, + "learning_rate": 9.99829547672434e-05, + "loss": 3.5119, + "step": 6926 + }, + { + "epoch": 0.3225085550666946, + "grad_norm": 0.4991069039937645, + "learning_rate": 9.998288397080784e-05, + "loss": 3.6313, + "step": 6927 + }, + { + "epoch": 0.3225551132527877, + "grad_norm": 0.4711964672310035, + "learning_rate": 9.998281302767744e-05, + "loss": 3.5979, + "step": 6928 + }, + { + "epoch": 0.32260167143888074, + "grad_norm": 0.5387150774437013, + "learning_rate": 9.99827419378524e-05, + "loss": 3.4848, + "step": 6929 + }, + { + "epoch": 0.3226482296249738, + "grad_norm": 0.5372000856620396, + "learning_rate": 9.998267070133297e-05, + "loss": 3.5405, + "step": 6930 + }, + { + "epoch": 0.32269478781106686, + "grad_norm": 0.508654020215602, + "learning_rate": 9.998259931811928e-05, + "loss": 3.5174, + "step": 6931 + }, + { + "epoch": 0.3227413459971599, + "grad_norm": 0.5106850367755879, + "learning_rate": 9.998252778821162e-05, + "loss": 3.4919, + "step": 6932 + }, + { + "epoch": 0.32278790418325304, + "grad_norm": 0.56211878453247, + "learning_rate": 9.998245611161016e-05, + "loss": 3.6198, + "step": 6933 + }, + { + "epoch": 0.3228344623693461, + "grad_norm": 0.7259293273978241, + "learning_rate": 9.99823842883151e-05, + "loss": 3.3942, + "step": 6934 + }, + { + "epoch": 0.32288102055543916, + "grad_norm": 0.7545742188541998, + "learning_rate": 9.998231231832667e-05, + "loss": 3.5106, + "step": 6935 + }, + { + "epoch": 0.3229275787415322, + "grad_norm": 0.5812516572024085, + "learning_rate": 9.998224020164509e-05, + "loss": 3.3746, + "step": 6936 + }, + { + "epoch": 0.3229741369276253, + "grad_norm": 0.49488824592833247, + "learning_rate": 9.998216793827056e-05, + "loss": 3.4386, + "step": 6937 + }, + { + "epoch": 0.3230206951137184, + "grad_norm": 0.518186198875685, + "learning_rate": 9.998209552820327e-05, + "loss": 3.487, + "step": 6938 + }, + { + "epoch": 0.32306725329981145, + "grad_norm": 0.5209146182518766, + "learning_rate": 9.998202297144348e-05, + "loss": 3.5557, + "step": 6939 + }, + { + "epoch": 0.3231138114859045, + "grad_norm": 0.4300440059067606, + "learning_rate": 9.998195026799135e-05, + "loss": 3.4828, + "step": 6940 + }, + { + "epoch": 0.32316036967199757, + "grad_norm": 0.4562921971983593, + "learning_rate": 9.998187741784715e-05, + "loss": 3.5585, + "step": 6941 + }, + { + "epoch": 0.32320692785809063, + "grad_norm": 0.5026275793686021, + "learning_rate": 9.998180442101103e-05, + "loss": 3.5745, + "step": 6942 + }, + { + "epoch": 0.3232534860441837, + "grad_norm": 0.5376688732194769, + "learning_rate": 9.998173127748327e-05, + "loss": 3.4675, + "step": 6943 + }, + { + "epoch": 0.3233000442302768, + "grad_norm": 0.47295014986967054, + "learning_rate": 9.998165798726403e-05, + "loss": 3.4831, + "step": 6944 + }, + { + "epoch": 0.32334660241636987, + "grad_norm": 0.40605834602316343, + "learning_rate": 9.998158455035356e-05, + "loss": 3.5335, + 
"step": 6945 + }, + { + "epoch": 0.3233931606024629, + "grad_norm": 0.4776284716397377, + "learning_rate": 9.998151096675205e-05, + "loss": 3.5739, + "step": 6946 + }, + { + "epoch": 0.323439718788556, + "grad_norm": 0.5692395916059456, + "learning_rate": 9.998143723645974e-05, + "loss": 3.4839, + "step": 6947 + }, + { + "epoch": 0.32348627697464905, + "grad_norm": 0.6945363624698162, + "learning_rate": 9.998136335947683e-05, + "loss": 3.4146, + "step": 6948 + }, + { + "epoch": 0.32353283516074216, + "grad_norm": 0.5107119477543065, + "learning_rate": 9.998128933580356e-05, + "loss": 3.4409, + "step": 6949 + }, + { + "epoch": 0.3235793933468352, + "grad_norm": 0.4774863494330817, + "learning_rate": 9.998121516544012e-05, + "loss": 3.5776, + "step": 6950 + }, + { + "epoch": 0.3236259515329283, + "grad_norm": 0.5321655415680163, + "learning_rate": 9.998114084838672e-05, + "loss": 3.3712, + "step": 6951 + }, + { + "epoch": 0.32367250971902134, + "grad_norm": 0.5713148937064529, + "learning_rate": 9.998106638464362e-05, + "loss": 3.4226, + "step": 6952 + }, + { + "epoch": 0.3237190679051144, + "grad_norm": 0.5128484838001386, + "learning_rate": 9.9980991774211e-05, + "loss": 3.4728, + "step": 6953 + }, + { + "epoch": 0.32376562609120746, + "grad_norm": 0.4887903060792456, + "learning_rate": 9.99809170170891e-05, + "loss": 3.5033, + "step": 6954 + }, + { + "epoch": 0.3238121842773006, + "grad_norm": 0.49831866858240165, + "learning_rate": 9.998084211327812e-05, + "loss": 3.5342, + "step": 6955 + }, + { + "epoch": 0.32385874246339363, + "grad_norm": 0.4944789671254641, + "learning_rate": 9.99807670627783e-05, + "loss": 3.4889, + "step": 6956 + }, + { + "epoch": 0.3239053006494867, + "grad_norm": 0.5033822276006955, + "learning_rate": 9.998069186558986e-05, + "loss": 3.4812, + "step": 6957 + }, + { + "epoch": 0.32395185883557975, + "grad_norm": 0.48610924107780445, + "learning_rate": 9.9980616521713e-05, + "loss": 3.536, + "step": 6958 + }, + { + "epoch": 0.3239984170216728, + "grad_norm": 0.49033118518641683, + "learning_rate": 9.998054103114797e-05, + "loss": 3.4838, + "step": 6959 + }, + { + "epoch": 0.32404497520776593, + "grad_norm": 0.4959564979246967, + "learning_rate": 9.998046539389496e-05, + "loss": 3.495, + "step": 6960 + }, + { + "epoch": 0.324091533393859, + "grad_norm": 0.44823440875273296, + "learning_rate": 9.998038960995422e-05, + "loss": 3.569, + "step": 6961 + }, + { + "epoch": 0.32413809157995205, + "grad_norm": 0.4461300203310428, + "learning_rate": 9.998031367932596e-05, + "loss": 3.5003, + "step": 6962 + }, + { + "epoch": 0.3241846497660451, + "grad_norm": 0.4636212907598449, + "learning_rate": 9.99802376020104e-05, + "loss": 3.4332, + "step": 6963 + }, + { + "epoch": 0.32423120795213817, + "grad_norm": 0.48062094579597237, + "learning_rate": 9.998016137800776e-05, + "loss": 3.54, + "step": 6964 + }, + { + "epoch": 0.3242777661382312, + "grad_norm": 0.5498244684301525, + "learning_rate": 9.998008500731828e-05, + "loss": 3.4114, + "step": 6965 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 0.48460113749459655, + "learning_rate": 9.998000848994218e-05, + "loss": 3.5463, + "step": 6966 + }, + { + "epoch": 0.3243708825104174, + "grad_norm": 0.45250365670224313, + "learning_rate": 9.997993182587965e-05, + "loss": 3.5546, + "step": 6967 + }, + { + "epoch": 0.32441744069651046, + "grad_norm": 0.5235624445324287, + "learning_rate": 9.997985501513097e-05, + "loss": 3.4846, + "step": 6968 + }, + { + "epoch": 0.3244639988826035, + "grad_norm": 0.5340742510032217, + 
"learning_rate": 9.997977805769634e-05, + "loss": 3.4763, + "step": 6969 + }, + { + "epoch": 0.3245105570686966, + "grad_norm": 0.4806249609893463, + "learning_rate": 9.997970095357597e-05, + "loss": 3.5734, + "step": 6970 + }, + { + "epoch": 0.3245571152547897, + "grad_norm": 0.5345687241558827, + "learning_rate": 9.997962370277012e-05, + "loss": 3.59, + "step": 6971 + }, + { + "epoch": 0.32460367344088276, + "grad_norm": 0.562762512338243, + "learning_rate": 9.997954630527898e-05, + "loss": 3.5023, + "step": 6972 + }, + { + "epoch": 0.3246502316269758, + "grad_norm": 0.501600469579233, + "learning_rate": 9.99794687611028e-05, + "loss": 3.4258, + "step": 6973 + }, + { + "epoch": 0.3246967898130689, + "grad_norm": 0.48168740164774637, + "learning_rate": 9.997939107024181e-05, + "loss": 3.5648, + "step": 6974 + }, + { + "epoch": 0.32474334799916194, + "grad_norm": 0.49620271675905847, + "learning_rate": 9.997931323269622e-05, + "loss": 3.5103, + "step": 6975 + }, + { + "epoch": 0.324789906185255, + "grad_norm": 0.49387502742864053, + "learning_rate": 9.997923524846628e-05, + "loss": 3.5346, + "step": 6976 + }, + { + "epoch": 0.3248364643713481, + "grad_norm": 0.4614691981868225, + "learning_rate": 9.997915711755222e-05, + "loss": 3.4537, + "step": 6977 + }, + { + "epoch": 0.32488302255744117, + "grad_norm": 0.4939891109846872, + "learning_rate": 9.997907883995423e-05, + "loss": 3.4855, + "step": 6978 + }, + { + "epoch": 0.32492958074353423, + "grad_norm": 0.4667770152290122, + "learning_rate": 9.997900041567258e-05, + "loss": 3.5286, + "step": 6979 + }, + { + "epoch": 0.3249761389296273, + "grad_norm": 0.49518751660656407, + "learning_rate": 9.99789218447075e-05, + "loss": 3.4491, + "step": 6980 + }, + { + "epoch": 0.32502269711572035, + "grad_norm": 0.5333749080715428, + "learning_rate": 9.997884312705919e-05, + "loss": 3.4734, + "step": 6981 + }, + { + "epoch": 0.32506925530181346, + "grad_norm": 0.46414520105722107, + "learning_rate": 9.99787642627279e-05, + "loss": 3.4418, + "step": 6982 + }, + { + "epoch": 0.3251158134879065, + "grad_norm": 0.47454639396020887, + "learning_rate": 9.997868525171385e-05, + "loss": 3.5427, + "step": 6983 + }, + { + "epoch": 0.3251623716739996, + "grad_norm": 0.48019499321820197, + "learning_rate": 9.997860609401731e-05, + "loss": 3.3744, + "step": 6984 + }, + { + "epoch": 0.32520892986009264, + "grad_norm": 0.4580557126617517, + "learning_rate": 9.997852678963847e-05, + "loss": 3.5738, + "step": 6985 + }, + { + "epoch": 0.3252554880461857, + "grad_norm": 0.4163920756977613, + "learning_rate": 9.997844733857758e-05, + "loss": 3.4197, + "step": 6986 + }, + { + "epoch": 0.32530204623227876, + "grad_norm": 0.5427278197504595, + "learning_rate": 9.997836774083486e-05, + "loss": 3.5155, + "step": 6987 + }, + { + "epoch": 0.3253486044183719, + "grad_norm": 0.5858179616877542, + "learning_rate": 9.997828799641058e-05, + "loss": 3.537, + "step": 6988 + }, + { + "epoch": 0.32539516260446494, + "grad_norm": 0.5242804664264381, + "learning_rate": 9.997820810530493e-05, + "loss": 3.4304, + "step": 6989 + }, + { + "epoch": 0.325441720790558, + "grad_norm": 0.5147262274440906, + "learning_rate": 9.997812806751816e-05, + "loss": 3.5716, + "step": 6990 + }, + { + "epoch": 0.32548827897665106, + "grad_norm": 0.6008374984927308, + "learning_rate": 9.997804788305052e-05, + "loss": 3.4771, + "step": 6991 + }, + { + "epoch": 0.3255348371627441, + "grad_norm": 0.5346553318010209, + "learning_rate": 9.997796755190223e-05, + "loss": 3.462, + "step": 6992 + }, + { + "epoch": 
0.32558139534883723, + "grad_norm": 0.5217448726553837, + "learning_rate": 9.997788707407353e-05, + "loss": 3.4809, + "step": 6993 + }, + { + "epoch": 0.3256279535349303, + "grad_norm": 0.5237988597511886, + "learning_rate": 9.997780644956465e-05, + "loss": 3.4596, + "step": 6994 + }, + { + "epoch": 0.32567451172102335, + "grad_norm": 0.430645973522838, + "learning_rate": 9.997772567837584e-05, + "loss": 3.4316, + "step": 6995 + }, + { + "epoch": 0.3257210699071164, + "grad_norm": 0.5046634368211128, + "learning_rate": 9.997764476050732e-05, + "loss": 3.5788, + "step": 6996 + }, + { + "epoch": 0.32576762809320947, + "grad_norm": 0.5750621156478093, + "learning_rate": 9.997756369595933e-05, + "loss": 3.5774, + "step": 6997 + }, + { + "epoch": 0.32581418627930253, + "grad_norm": 0.5080574395799704, + "learning_rate": 9.997748248473213e-05, + "loss": 3.5007, + "step": 6998 + }, + { + "epoch": 0.32586074446539565, + "grad_norm": 0.46096018787310356, + "learning_rate": 9.997740112682594e-05, + "loss": 3.4908, + "step": 6999 + }, + { + "epoch": 0.3259073026514887, + "grad_norm": 0.45943388187533035, + "learning_rate": 9.997731962224098e-05, + "loss": 3.3885, + "step": 7000 + }, + { + "epoch": 0.32595386083758177, + "grad_norm": 0.41694219111881486, + "learning_rate": 9.997723797097753e-05, + "loss": 3.5689, + "step": 7001 + }, + { + "epoch": 0.3260004190236748, + "grad_norm": 0.508212758460463, + "learning_rate": 9.99771561730358e-05, + "loss": 3.4394, + "step": 7002 + }, + { + "epoch": 0.3260469772097679, + "grad_norm": 0.49265505868980897, + "learning_rate": 9.997707422841605e-05, + "loss": 3.3169, + "step": 7003 + }, + { + "epoch": 0.326093535395861, + "grad_norm": 0.5057933243794412, + "learning_rate": 9.99769921371185e-05, + "loss": 3.4627, + "step": 7004 + }, + { + "epoch": 0.32614009358195406, + "grad_norm": 0.4761056524254384, + "learning_rate": 9.997690989914342e-05, + "loss": 3.4602, + "step": 7005 + }, + { + "epoch": 0.3261866517680471, + "grad_norm": 0.4861193894070933, + "learning_rate": 9.997682751449102e-05, + "loss": 3.55, + "step": 7006 + }, + { + "epoch": 0.3262332099541402, + "grad_norm": 0.45139993776896076, + "learning_rate": 9.997674498316157e-05, + "loss": 3.5055, + "step": 7007 + }, + { + "epoch": 0.32627976814023324, + "grad_norm": 0.5345683821153868, + "learning_rate": 9.997666230515528e-05, + "loss": 3.5345, + "step": 7008 + }, + { + "epoch": 0.3263263263263263, + "grad_norm": 0.5126988798812929, + "learning_rate": 9.997657948047242e-05, + "loss": 3.5609, + "step": 7009 + }, + { + "epoch": 0.3263728845124194, + "grad_norm": 0.5336857319110238, + "learning_rate": 9.997649650911321e-05, + "loss": 3.5018, + "step": 7010 + }, + { + "epoch": 0.3264194426985125, + "grad_norm": 0.5257589020616489, + "learning_rate": 9.99764133910779e-05, + "loss": 3.4883, + "step": 7011 + }, + { + "epoch": 0.32646600088460553, + "grad_norm": 0.45412938702191574, + "learning_rate": 9.997633012636676e-05, + "loss": 3.4601, + "step": 7012 + }, + { + "epoch": 0.3265125590706986, + "grad_norm": 0.4811128200142284, + "learning_rate": 9.997624671498001e-05, + "loss": 3.5631, + "step": 7013 + }, + { + "epoch": 0.32655911725679165, + "grad_norm": 0.48666160267422875, + "learning_rate": 9.997616315691789e-05, + "loss": 3.5694, + "step": 7014 + }, + { + "epoch": 0.32660567544288477, + "grad_norm": 0.6095373920705282, + "learning_rate": 9.997607945218065e-05, + "loss": 3.5126, + "step": 7015 + }, + { + "epoch": 0.32665223362897783, + "grad_norm": 0.5704870339413978, + "learning_rate": 
9.997599560076857e-05, + "loss": 3.5187, + "step": 7016 + }, + { + "epoch": 0.3266987918150709, + "grad_norm": 0.5254237256447807, + "learning_rate": 9.997591160268185e-05, + "loss": 3.4599, + "step": 7017 + }, + { + "epoch": 0.32674535000116395, + "grad_norm": 0.40999237661334176, + "learning_rate": 9.997582745792075e-05, + "loss": 3.5249, + "step": 7018 + }, + { + "epoch": 0.326791908187257, + "grad_norm": 0.49011553518758544, + "learning_rate": 9.997574316648552e-05, + "loss": 3.4397, + "step": 7019 + }, + { + "epoch": 0.32683846637335007, + "grad_norm": 0.5134886265262745, + "learning_rate": 9.997565872837642e-05, + "loss": 3.5204, + "step": 7020 + }, + { + "epoch": 0.3268850245594432, + "grad_norm": 0.49066802164952583, + "learning_rate": 9.997557414359366e-05, + "loss": 3.5281, + "step": 7021 + }, + { + "epoch": 0.32693158274553624, + "grad_norm": 0.5629482772168523, + "learning_rate": 9.997548941213753e-05, + "loss": 3.5026, + "step": 7022 + }, + { + "epoch": 0.3269781409316293, + "grad_norm": 0.543398215434691, + "learning_rate": 9.997540453400825e-05, + "loss": 3.5043, + "step": 7023 + }, + { + "epoch": 0.32702469911772236, + "grad_norm": 0.4828458701867084, + "learning_rate": 9.997531950920608e-05, + "loss": 3.4612, + "step": 7024 + }, + { + "epoch": 0.3270712573038154, + "grad_norm": 0.4539970014812022, + "learning_rate": 9.99752343377313e-05, + "loss": 3.4241, + "step": 7025 + }, + { + "epoch": 0.32711781548990854, + "grad_norm": 0.5575926359952795, + "learning_rate": 9.99751490195841e-05, + "loss": 3.3758, + "step": 7026 + }, + { + "epoch": 0.3271643736760016, + "grad_norm": 0.5779664403955045, + "learning_rate": 9.997506355476478e-05, + "loss": 3.4446, + "step": 7027 + }, + { + "epoch": 0.32721093186209466, + "grad_norm": 0.4844123977759814, + "learning_rate": 9.997497794327356e-05, + "loss": 3.5676, + "step": 7028 + }, + { + "epoch": 0.3272574900481877, + "grad_norm": 0.49667410112931226, + "learning_rate": 9.997489218511072e-05, + "loss": 3.504, + "step": 7029 + }, + { + "epoch": 0.3273040482342808, + "grad_norm": 0.5937251891377017, + "learning_rate": 9.997480628027647e-05, + "loss": 3.4471, + "step": 7030 + }, + { + "epoch": 0.32735060642037384, + "grad_norm": 0.6225907227208786, + "learning_rate": 9.99747202287711e-05, + "loss": 3.5084, + "step": 7031 + }, + { + "epoch": 0.32739716460646695, + "grad_norm": 0.5434272709766438, + "learning_rate": 9.997463403059483e-05, + "loss": 3.5888, + "step": 7032 + }, + { + "epoch": 0.32744372279256, + "grad_norm": 0.5906881214689254, + "learning_rate": 9.997454768574797e-05, + "loss": 3.5556, + "step": 7033 + }, + { + "epoch": 0.32749028097865307, + "grad_norm": 0.5914077855688622, + "learning_rate": 9.99744611942307e-05, + "loss": 3.5257, + "step": 7034 + }, + { + "epoch": 0.32753683916474613, + "grad_norm": 0.4409873605427901, + "learning_rate": 9.997437455604334e-05, + "loss": 3.4058, + "step": 7035 + }, + { + "epoch": 0.3275833973508392, + "grad_norm": 0.4813787002365264, + "learning_rate": 9.997428777118609e-05, + "loss": 3.5459, + "step": 7036 + }, + { + "epoch": 0.3276299555369323, + "grad_norm": 0.540292763354648, + "learning_rate": 9.997420083965925e-05, + "loss": 3.4488, + "step": 7037 + }, + { + "epoch": 0.32767651372302536, + "grad_norm": 0.5538820692417713, + "learning_rate": 9.997411376146303e-05, + "loss": 3.4387, + "step": 7038 + }, + { + "epoch": 0.3277230719091184, + "grad_norm": 0.448025593016939, + "learning_rate": 9.997402653659774e-05, + "loss": 3.4395, + "step": 7039 + }, + { + "epoch": 0.3277696300952115, + 
"grad_norm": 0.49865876089689465, + "learning_rate": 9.997393916506356e-05, + "loss": 3.5485, + "step": 7040 + }, + { + "epoch": 0.32781618828130454, + "grad_norm": 0.5579427879383022, + "learning_rate": 9.997385164686083e-05, + "loss": 3.5769, + "step": 7041 + }, + { + "epoch": 0.3278627464673976, + "grad_norm": 0.6017986699574412, + "learning_rate": 9.997376398198977e-05, + "loss": 3.5768, + "step": 7042 + }, + { + "epoch": 0.3279093046534907, + "grad_norm": 0.4833591347747144, + "learning_rate": 9.997367617045062e-05, + "loss": 3.5653, + "step": 7043 + }, + { + "epoch": 0.3279558628395838, + "grad_norm": 0.4395268858035349, + "learning_rate": 9.997358821224366e-05, + "loss": 3.4433, + "step": 7044 + }, + { + "epoch": 0.32800242102567684, + "grad_norm": 0.4862919882494605, + "learning_rate": 9.997350010736912e-05, + "loss": 3.6743, + "step": 7045 + }, + { + "epoch": 0.3280489792117699, + "grad_norm": 0.4649187964383116, + "learning_rate": 9.997341185582732e-05, + "loss": 3.5277, + "step": 7046 + }, + { + "epoch": 0.32809553739786296, + "grad_norm": 0.5096415671017375, + "learning_rate": 9.997332345761845e-05, + "loss": 3.5421, + "step": 7047 + }, + { + "epoch": 0.3281420955839561, + "grad_norm": 0.5330091089096798, + "learning_rate": 9.99732349127428e-05, + "loss": 3.5249, + "step": 7048 + }, + { + "epoch": 0.32818865377004913, + "grad_norm": 0.483706507277685, + "learning_rate": 9.997314622120065e-05, + "loss": 3.5589, + "step": 7049 + }, + { + "epoch": 0.3282352119561422, + "grad_norm": 0.5072820135973288, + "learning_rate": 9.997305738299221e-05, + "loss": 3.4187, + "step": 7050 + }, + { + "epoch": 0.32828177014223525, + "grad_norm": 0.504927867841522, + "learning_rate": 9.997296839811778e-05, + "loss": 3.563, + "step": 7051 + }, + { + "epoch": 0.3283283283283283, + "grad_norm": 0.5155274911562376, + "learning_rate": 9.997287926657762e-05, + "loss": 3.554, + "step": 7052 + }, + { + "epoch": 0.32837488651442137, + "grad_norm": 0.4943495312543393, + "learning_rate": 9.997278998837198e-05, + "loss": 3.4316, + "step": 7053 + }, + { + "epoch": 0.3284214447005145, + "grad_norm": 0.5133578249116248, + "learning_rate": 9.997270056350112e-05, + "loss": 3.6555, + "step": 7054 + }, + { + "epoch": 0.32846800288660755, + "grad_norm": 0.4589674665477495, + "learning_rate": 9.997261099196531e-05, + "loss": 3.4845, + "step": 7055 + }, + { + "epoch": 0.3285145610727006, + "grad_norm": 0.4909111610003436, + "learning_rate": 9.997252127376482e-05, + "loss": 3.5109, + "step": 7056 + }, + { + "epoch": 0.32856111925879367, + "grad_norm": 0.5269182694075244, + "learning_rate": 9.99724314088999e-05, + "loss": 3.4034, + "step": 7057 + }, + { + "epoch": 0.3286076774448867, + "grad_norm": 0.5455764469108972, + "learning_rate": 9.997234139737079e-05, + "loss": 3.4689, + "step": 7058 + }, + { + "epoch": 0.32865423563097984, + "grad_norm": 0.6692686741664137, + "learning_rate": 9.997225123917782e-05, + "loss": 3.4554, + "step": 7059 + }, + { + "epoch": 0.3287007938170729, + "grad_norm": 0.585670723726187, + "learning_rate": 9.997216093432118e-05, + "loss": 3.5312, + "step": 7060 + }, + { + "epoch": 0.32874735200316596, + "grad_norm": 0.4986864344475655, + "learning_rate": 9.997207048280119e-05, + "loss": 3.4618, + "step": 7061 + }, + { + "epoch": 0.328793910189259, + "grad_norm": 0.54633169179706, + "learning_rate": 9.99719798846181e-05, + "loss": 3.4356, + "step": 7062 + }, + { + "epoch": 0.3288404683753521, + "grad_norm": 0.5352564268670318, + "learning_rate": 9.997188913977215e-05, + "loss": 3.4606, + "step": 
7063 + }, + { + "epoch": 0.32888702656144514, + "grad_norm": 0.6822140810922738, + "learning_rate": 9.997179824826365e-05, + "loss": 3.5752, + "step": 7064 + }, + { + "epoch": 0.32893358474753825, + "grad_norm": 0.63870069104669, + "learning_rate": 9.997170721009285e-05, + "loss": 3.3808, + "step": 7065 + }, + { + "epoch": 0.3289801429336313, + "grad_norm": 0.5122683321652082, + "learning_rate": 9.997161602526e-05, + "loss": 3.5153, + "step": 7066 + }, + { + "epoch": 0.3290267011197244, + "grad_norm": 0.6306777481752034, + "learning_rate": 9.997152469376539e-05, + "loss": 3.4707, + "step": 7067 + }, + { + "epoch": 0.32907325930581743, + "grad_norm": 0.5801304688810944, + "learning_rate": 9.997143321560927e-05, + "loss": 3.4818, + "step": 7068 + }, + { + "epoch": 0.3291198174919105, + "grad_norm": 0.5702394955215006, + "learning_rate": 9.997134159079192e-05, + "loss": 3.5577, + "step": 7069 + }, + { + "epoch": 0.3291663756780036, + "grad_norm": 0.6893110303879608, + "learning_rate": 9.997124981931359e-05, + "loss": 3.5538, + "step": 7070 + }, + { + "epoch": 0.32921293386409667, + "grad_norm": 0.6250631729358451, + "learning_rate": 9.997115790117457e-05, + "loss": 3.4389, + "step": 7071 + }, + { + "epoch": 0.32925949205018973, + "grad_norm": 0.5726374385759231, + "learning_rate": 9.997106583637514e-05, + "loss": 3.5241, + "step": 7072 + }, + { + "epoch": 0.3293060502362828, + "grad_norm": 0.5210231197565451, + "learning_rate": 9.997097362491555e-05, + "loss": 3.4336, + "step": 7073 + }, + { + "epoch": 0.32935260842237585, + "grad_norm": 0.5532772848554185, + "learning_rate": 9.997088126679606e-05, + "loss": 3.5531, + "step": 7074 + }, + { + "epoch": 0.3293991666084689, + "grad_norm": 0.5586210954050679, + "learning_rate": 9.997078876201697e-05, + "loss": 3.5041, + "step": 7075 + }, + { + "epoch": 0.329445724794562, + "grad_norm": 0.5423529814704109, + "learning_rate": 9.997069611057853e-05, + "loss": 3.3595, + "step": 7076 + }, + { + "epoch": 0.3294922829806551, + "grad_norm": 0.5225907938404084, + "learning_rate": 9.997060331248102e-05, + "loss": 3.5276, + "step": 7077 + }, + { + "epoch": 0.32953884116674814, + "grad_norm": 0.49591192885430735, + "learning_rate": 9.997051036772472e-05, + "loss": 3.4444, + "step": 7078 + }, + { + "epoch": 0.3295853993528412, + "grad_norm": 0.49639946929136824, + "learning_rate": 9.99704172763099e-05, + "loss": 3.5897, + "step": 7079 + }, + { + "epoch": 0.32963195753893426, + "grad_norm": 0.4892236820659086, + "learning_rate": 9.997032403823682e-05, + "loss": 3.4295, + "step": 7080 + }, + { + "epoch": 0.3296785157250274, + "grad_norm": 0.4915903474466773, + "learning_rate": 9.997023065350576e-05, + "loss": 3.5449, + "step": 7081 + }, + { + "epoch": 0.32972507391112044, + "grad_norm": 0.5014923673534131, + "learning_rate": 9.9970137122117e-05, + "loss": 3.5607, + "step": 7082 + }, + { + "epoch": 0.3297716320972135, + "grad_norm": 0.5075597075619639, + "learning_rate": 9.997004344407082e-05, + "loss": 3.5014, + "step": 7083 + }, + { + "epoch": 0.32981819028330656, + "grad_norm": 0.6038530179016416, + "learning_rate": 9.996994961936747e-05, + "loss": 3.4882, + "step": 7084 + }, + { + "epoch": 0.3298647484693996, + "grad_norm": 0.5604228278242477, + "learning_rate": 9.996985564800723e-05, + "loss": 3.5261, + "step": 7085 + }, + { + "epoch": 0.3299113066554927, + "grad_norm": 0.5284329754688398, + "learning_rate": 9.99697615299904e-05, + "loss": 3.4673, + "step": 7086 + }, + { + "epoch": 0.3299578648415858, + "grad_norm": 0.48250356099144026, + "learning_rate": 
9.996966726531725e-05, + "loss": 3.4766, + "step": 7087 + }, + { + "epoch": 0.33000442302767885, + "grad_norm": 0.5214841051035596, + "learning_rate": 9.996957285398805e-05, + "loss": 3.517, + "step": 7088 + }, + { + "epoch": 0.3300509812137719, + "grad_norm": 0.4963799739016782, + "learning_rate": 9.996947829600307e-05, + "loss": 3.6222, + "step": 7089 + }, + { + "epoch": 0.33009753939986497, + "grad_norm": 0.4914524128932991, + "learning_rate": 9.996938359136259e-05, + "loss": 3.3377, + "step": 7090 + }, + { + "epoch": 0.33014409758595803, + "grad_norm": 0.44605986843073847, + "learning_rate": 9.996928874006691e-05, + "loss": 3.4831, + "step": 7091 + }, + { + "epoch": 0.33019065577205114, + "grad_norm": 0.44546819968957163, + "learning_rate": 9.996919374211627e-05, + "loss": 3.4926, + "step": 7092 + }, + { + "epoch": 0.3302372139581442, + "grad_norm": 0.4161437336172765, + "learning_rate": 9.996909859751097e-05, + "loss": 3.5989, + "step": 7093 + }, + { + "epoch": 0.33028377214423726, + "grad_norm": 0.5263994924452843, + "learning_rate": 9.99690033062513e-05, + "loss": 3.6338, + "step": 7094 + }, + { + "epoch": 0.3303303303303303, + "grad_norm": 0.5498229731780929, + "learning_rate": 9.996890786833752e-05, + "loss": 3.6445, + "step": 7095 + }, + { + "epoch": 0.3303768885164234, + "grad_norm": 0.5349002801457163, + "learning_rate": 9.996881228376991e-05, + "loss": 3.5097, + "step": 7096 + }, + { + "epoch": 0.33042344670251644, + "grad_norm": 0.5017497133317321, + "learning_rate": 9.996871655254878e-05, + "loss": 3.5924, + "step": 7097 + }, + { + "epoch": 0.33047000488860956, + "grad_norm": 0.477124069041165, + "learning_rate": 9.996862067467436e-05, + "loss": 3.4566, + "step": 7098 + }, + { + "epoch": 0.3305165630747026, + "grad_norm": 0.47311378406982196, + "learning_rate": 9.996852465014699e-05, + "loss": 3.4976, + "step": 7099 + }, + { + "epoch": 0.3305631212607957, + "grad_norm": 0.46071616772990154, + "learning_rate": 9.996842847896691e-05, + "loss": 3.4803, + "step": 7100 + }, + { + "epoch": 0.33060967944688874, + "grad_norm": 0.545490728163644, + "learning_rate": 9.996833216113442e-05, + "loss": 3.4751, + "step": 7101 + }, + { + "epoch": 0.3306562376329818, + "grad_norm": 0.5138897398508442, + "learning_rate": 9.99682356966498e-05, + "loss": 3.4584, + "step": 7102 + }, + { + "epoch": 0.3307027958190749, + "grad_norm": 0.4364411477851802, + "learning_rate": 9.996813908551332e-05, + "loss": 3.4874, + "step": 7103 + }, + { + "epoch": 0.330749354005168, + "grad_norm": 0.419375566168255, + "learning_rate": 9.996804232772527e-05, + "loss": 3.4268, + "step": 7104 + }, + { + "epoch": 0.33079591219126103, + "grad_norm": 0.45243262091305686, + "learning_rate": 9.996794542328596e-05, + "loss": 3.5295, + "step": 7105 + }, + { + "epoch": 0.3308424703773541, + "grad_norm": 0.5330091261070483, + "learning_rate": 9.996784837219564e-05, + "loss": 3.4933, + "step": 7106 + }, + { + "epoch": 0.33088902856344715, + "grad_norm": 0.45918266093010096, + "learning_rate": 9.99677511744546e-05, + "loss": 3.4595, + "step": 7107 + }, + { + "epoch": 0.3309355867495402, + "grad_norm": 0.47650684536914556, + "learning_rate": 9.996765383006316e-05, + "loss": 3.4209, + "step": 7108 + }, + { + "epoch": 0.3309821449356333, + "grad_norm": 0.5591341511573947, + "learning_rate": 9.996755633902155e-05, + "loss": 3.5401, + "step": 7109 + }, + { + "epoch": 0.3310287031217264, + "grad_norm": 0.6723673223728601, + "learning_rate": 9.996745870133011e-05, + "loss": 3.4247, + "step": 7110 + }, + { + "epoch": 
0.33107526130781945, + "grad_norm": 0.6306304747511777, + "learning_rate": 9.996736091698908e-05, + "loss": 3.3989, + "step": 7111 + }, + { + "epoch": 0.3311218194939125, + "grad_norm": 0.5220403827287952, + "learning_rate": 9.996726298599878e-05, + "loss": 3.5321, + "step": 7112 + }, + { + "epoch": 0.33116837768000557, + "grad_norm": 0.4917475768758855, + "learning_rate": 9.996716490835948e-05, + "loss": 3.4756, + "step": 7113 + }, + { + "epoch": 0.3312149358660987, + "grad_norm": 0.5416625410816892, + "learning_rate": 9.996706668407147e-05, + "loss": 3.4383, + "step": 7114 + }, + { + "epoch": 0.33126149405219174, + "grad_norm": 0.5436594596154918, + "learning_rate": 9.996696831313505e-05, + "loss": 3.517, + "step": 7115 + }, + { + "epoch": 0.3313080522382848, + "grad_norm": 0.4719289395661391, + "learning_rate": 9.99668697955505e-05, + "loss": 3.5172, + "step": 7116 + }, + { + "epoch": 0.33135461042437786, + "grad_norm": 0.4855696280141304, + "learning_rate": 9.99667711313181e-05, + "loss": 3.3629, + "step": 7117 + }, + { + "epoch": 0.3314011686104709, + "grad_norm": 0.5619680726906291, + "learning_rate": 9.996667232043816e-05, + "loss": 3.4779, + "step": 7118 + }, + { + "epoch": 0.331447726796564, + "grad_norm": 0.4626411323152495, + "learning_rate": 9.996657336291095e-05, + "loss": 3.5221, + "step": 7119 + }, + { + "epoch": 0.3314942849826571, + "grad_norm": 0.4838237029948878, + "learning_rate": 9.996647425873677e-05, + "loss": 3.4867, + "step": 7120 + }, + { + "epoch": 0.33154084316875015, + "grad_norm": 0.5689317850742998, + "learning_rate": 9.996637500791593e-05, + "loss": 3.5881, + "step": 7121 + }, + { + "epoch": 0.3315874013548432, + "grad_norm": 0.576267639734517, + "learning_rate": 9.996627561044869e-05, + "loss": 3.5006, + "step": 7122 + }, + { + "epoch": 0.3316339595409363, + "grad_norm": 0.5385348223946996, + "learning_rate": 9.996617606633534e-05, + "loss": 3.4263, + "step": 7123 + }, + { + "epoch": 0.33168051772702933, + "grad_norm": 0.47474455281978, + "learning_rate": 9.99660763755762e-05, + "loss": 3.5918, + "step": 7124 + }, + { + "epoch": 0.33172707591312245, + "grad_norm": 0.49214533951639605, + "learning_rate": 9.996597653817154e-05, + "loss": 3.3634, + "step": 7125 + }, + { + "epoch": 0.3317736340992155, + "grad_norm": 0.504920474535927, + "learning_rate": 9.996587655412167e-05, + "loss": 3.6363, + "step": 7126 + }, + { + "epoch": 0.33182019228530857, + "grad_norm": 0.5000404730935557, + "learning_rate": 9.996577642342688e-05, + "loss": 3.5483, + "step": 7127 + }, + { + "epoch": 0.33186675047140163, + "grad_norm": 0.5336311844825685, + "learning_rate": 9.996567614608745e-05, + "loss": 3.5304, + "step": 7128 + }, + { + "epoch": 0.3319133086574947, + "grad_norm": 0.5720840988417909, + "learning_rate": 9.996557572210368e-05, + "loss": 3.4016, + "step": 7129 + }, + { + "epoch": 0.33195986684358775, + "grad_norm": 0.4958209589646813, + "learning_rate": 9.996547515147587e-05, + "loss": 3.4612, + "step": 7130 + }, + { + "epoch": 0.33200642502968086, + "grad_norm": 0.4491005840877765, + "learning_rate": 9.99653744342043e-05, + "loss": 3.4778, + "step": 7131 + }, + { + "epoch": 0.3320529832157739, + "grad_norm": 0.4849483349761708, + "learning_rate": 9.99652735702893e-05, + "loss": 3.4408, + "step": 7132 + }, + { + "epoch": 0.332099541401867, + "grad_norm": 0.47137920749390305, + "learning_rate": 9.996517255973115e-05, + "loss": 3.5115, + "step": 7133 + }, + { + "epoch": 0.33214609958796004, + "grad_norm": 0.4903736773381256, + "learning_rate": 9.996507140253012e-05, + 
"loss": 3.4661, + "step": 7134 + }, + { + "epoch": 0.3321926577740531, + "grad_norm": 0.6161877984569512, + "learning_rate": 9.996497009868653e-05, + "loss": 3.5128, + "step": 7135 + }, + { + "epoch": 0.3322392159601462, + "grad_norm": 0.5911742225976568, + "learning_rate": 9.996486864820069e-05, + "loss": 3.4664, + "step": 7136 + }, + { + "epoch": 0.3322857741462393, + "grad_norm": 0.40343584793662857, + "learning_rate": 9.996476705107287e-05, + "loss": 3.3377, + "step": 7137 + }, + { + "epoch": 0.33233233233233234, + "grad_norm": 0.5114165516869352, + "learning_rate": 9.996466530730337e-05, + "loss": 3.5197, + "step": 7138 + }, + { + "epoch": 0.3323788905184254, + "grad_norm": 0.5024739654802164, + "learning_rate": 9.996456341689253e-05, + "loss": 3.3764, + "step": 7139 + }, + { + "epoch": 0.33242544870451846, + "grad_norm": 0.46008876947679395, + "learning_rate": 9.996446137984059e-05, + "loss": 3.5219, + "step": 7140 + }, + { + "epoch": 0.3324720068906115, + "grad_norm": 0.5153206223661093, + "learning_rate": 9.99643591961479e-05, + "loss": 3.5872, + "step": 7141 + }, + { + "epoch": 0.33251856507670463, + "grad_norm": 0.5529070895262328, + "learning_rate": 9.996425686581472e-05, + "loss": 3.545, + "step": 7142 + }, + { + "epoch": 0.3325651232627977, + "grad_norm": 0.5393549705747777, + "learning_rate": 9.996415438884139e-05, + "loss": 3.439, + "step": 7143 + }, + { + "epoch": 0.33261168144889075, + "grad_norm": 0.5411143707430958, + "learning_rate": 9.996405176522818e-05, + "loss": 3.4511, + "step": 7144 + }, + { + "epoch": 0.3326582396349838, + "grad_norm": 0.4734875314133335, + "learning_rate": 9.996394899497538e-05, + "loss": 3.4078, + "step": 7145 + }, + { + "epoch": 0.33270479782107687, + "grad_norm": 0.4770732825562671, + "learning_rate": 9.996384607808333e-05, + "loss": 3.4563, + "step": 7146 + }, + { + "epoch": 0.33275135600717, + "grad_norm": 0.5233692470799394, + "learning_rate": 9.996374301455231e-05, + "loss": 3.3931, + "step": 7147 + }, + { + "epoch": 0.33279791419326304, + "grad_norm": 0.5304814039375327, + "learning_rate": 9.996363980438264e-05, + "loss": 3.4817, + "step": 7148 + }, + { + "epoch": 0.3328444723793561, + "grad_norm": 0.4117118704707266, + "learning_rate": 9.996353644757459e-05, + "loss": 3.3422, + "step": 7149 + }, + { + "epoch": 0.33289103056544916, + "grad_norm": 0.4520132702199342, + "learning_rate": 9.99634329441285e-05, + "loss": 3.4411, + "step": 7150 + }, + { + "epoch": 0.3329375887515422, + "grad_norm": 0.5136488312034859, + "learning_rate": 9.996332929404463e-05, + "loss": 3.544, + "step": 7151 + }, + { + "epoch": 0.3329841469376353, + "grad_norm": 0.4560837348079674, + "learning_rate": 9.996322549732335e-05, + "loss": 3.4746, + "step": 7152 + }, + { + "epoch": 0.3330307051237284, + "grad_norm": 0.5089117317115499, + "learning_rate": 9.99631215539649e-05, + "loss": 3.5532, + "step": 7153 + }, + { + "epoch": 0.33307726330982146, + "grad_norm": 0.48489437815418657, + "learning_rate": 9.99630174639696e-05, + "loss": 3.4914, + "step": 7154 + }, + { + "epoch": 0.3331238214959145, + "grad_norm": 0.4267562153335884, + "learning_rate": 9.99629132273378e-05, + "loss": 3.3748, + "step": 7155 + }, + { + "epoch": 0.3331703796820076, + "grad_norm": 0.5576310621158007, + "learning_rate": 9.996280884406974e-05, + "loss": 3.4099, + "step": 7156 + }, + { + "epoch": 0.33321693786810064, + "grad_norm": 0.5990698451407807, + "learning_rate": 9.996270431416578e-05, + "loss": 3.5271, + "step": 7157 + }, + { + "epoch": 0.33326349605419375, + "grad_norm": 
0.4905553303488597, + "learning_rate": 9.99625996376262e-05, + "loss": 3.4985, + "step": 7158 + }, + { + "epoch": 0.3333100542402868, + "grad_norm": 0.46643573489096146, + "learning_rate": 9.99624948144513e-05, + "loss": 3.4192, + "step": 7159 + }, + { + "epoch": 0.3333566124263799, + "grad_norm": 0.5290854607146886, + "learning_rate": 9.996238984464141e-05, + "loss": 3.3139, + "step": 7160 + }, + { + "epoch": 0.33340317061247293, + "grad_norm": 0.4852326109331819, + "learning_rate": 9.996228472819682e-05, + "loss": 3.5209, + "step": 7161 + }, + { + "epoch": 0.333449728798566, + "grad_norm": 0.48510322950465234, + "learning_rate": 9.996217946511786e-05, + "loss": 3.5806, + "step": 7162 + }, + { + "epoch": 0.33349628698465905, + "grad_norm": 0.4490811543858503, + "learning_rate": 9.996207405540481e-05, + "loss": 3.5078, + "step": 7163 + }, + { + "epoch": 0.33354284517075217, + "grad_norm": 0.471846431452608, + "learning_rate": 9.9961968499058e-05, + "loss": 3.4219, + "step": 7164 + }, + { + "epoch": 0.3335894033568452, + "grad_norm": 0.43181742792809513, + "learning_rate": 9.996186279607773e-05, + "loss": 3.5829, + "step": 7165 + }, + { + "epoch": 0.3336359615429383, + "grad_norm": 0.42313105117015065, + "learning_rate": 9.996175694646433e-05, + "loss": 3.4683, + "step": 7166 + }, + { + "epoch": 0.33368251972903135, + "grad_norm": 0.4854137089067856, + "learning_rate": 9.996165095021809e-05, + "loss": 3.5362, + "step": 7167 + }, + { + "epoch": 0.3337290779151244, + "grad_norm": 0.4232406539996337, + "learning_rate": 9.996154480733933e-05, + "loss": 3.4891, + "step": 7168 + }, + { + "epoch": 0.3337756361012175, + "grad_norm": 0.439559850808726, + "learning_rate": 9.996143851782835e-05, + "loss": 3.4923, + "step": 7169 + }, + { + "epoch": 0.3338221942873106, + "grad_norm": 0.4420035265939886, + "learning_rate": 9.996133208168546e-05, + "loss": 3.4136, + "step": 7170 + }, + { + "epoch": 0.33386875247340364, + "grad_norm": 0.5587679245379589, + "learning_rate": 9.9961225498911e-05, + "loss": 3.5442, + "step": 7171 + }, + { + "epoch": 0.3339153106594967, + "grad_norm": 0.5017986938791953, + "learning_rate": 9.996111876950524e-05, + "loss": 3.5126, + "step": 7172 + }, + { + "epoch": 0.33396186884558976, + "grad_norm": 0.5113578108385297, + "learning_rate": 9.996101189346855e-05, + "loss": 3.508, + "step": 7173 + }, + { + "epoch": 0.3340084270316828, + "grad_norm": 0.47479963533678005, + "learning_rate": 9.99609048708012e-05, + "loss": 3.4643, + "step": 7174 + }, + { + "epoch": 0.33405498521777593, + "grad_norm": 0.47332251524354874, + "learning_rate": 9.996079770150351e-05, + "loss": 3.4395, + "step": 7175 + }, + { + "epoch": 0.334101543403869, + "grad_norm": 0.4431497823538958, + "learning_rate": 9.99606903855758e-05, + "loss": 3.5168, + "step": 7176 + }, + { + "epoch": 0.33414810158996205, + "grad_norm": 0.4443947482550754, + "learning_rate": 9.996058292301839e-05, + "loss": 3.4096, + "step": 7177 + }, + { + "epoch": 0.3341946597760551, + "grad_norm": 0.5003671804403972, + "learning_rate": 9.99604753138316e-05, + "loss": 3.4905, + "step": 7178 + }, + { + "epoch": 0.3342412179621482, + "grad_norm": 0.4417375043923431, + "learning_rate": 9.996036755801573e-05, + "loss": 3.4814, + "step": 7179 + }, + { + "epoch": 0.3342877761482413, + "grad_norm": 0.4255134650461167, + "learning_rate": 9.99602596555711e-05, + "loss": 3.4743, + "step": 7180 + }, + { + "epoch": 0.33433433433433435, + "grad_norm": 0.5423308008267962, + "learning_rate": 9.996015160649804e-05, + "loss": 3.409, + "step": 7181 + }, + { + 
"epoch": 0.3343808925204274, + "grad_norm": 0.5474681794064651, + "learning_rate": 9.996004341079686e-05, + "loss": 3.4601, + "step": 7182 + }, + { + "epoch": 0.33442745070652047, + "grad_norm": 0.4646591092303276, + "learning_rate": 9.995993506846786e-05, + "loss": 3.4873, + "step": 7183 + }, + { + "epoch": 0.33447400889261353, + "grad_norm": 0.46859013074290407, + "learning_rate": 9.995982657951138e-05, + "loss": 3.4171, + "step": 7184 + }, + { + "epoch": 0.3345205670787066, + "grad_norm": 0.3880522622477487, + "learning_rate": 9.99597179439277e-05, + "loss": 3.4742, + "step": 7185 + }, + { + "epoch": 0.3345671252647997, + "grad_norm": 0.4830325642371253, + "learning_rate": 9.995960916171721e-05, + "loss": 3.4521, + "step": 7186 + }, + { + "epoch": 0.33461368345089276, + "grad_norm": 0.45072318935489664, + "learning_rate": 9.995950023288018e-05, + "loss": 3.4315, + "step": 7187 + }, + { + "epoch": 0.3346602416369858, + "grad_norm": 0.4115678660467887, + "learning_rate": 9.995939115741693e-05, + "loss": 3.4914, + "step": 7188 + }, + { + "epoch": 0.3347067998230789, + "grad_norm": 0.46553076707465596, + "learning_rate": 9.99592819353278e-05, + "loss": 3.4358, + "step": 7189 + }, + { + "epoch": 0.33475335800917194, + "grad_norm": 0.4203859335415759, + "learning_rate": 9.99591725666131e-05, + "loss": 3.4507, + "step": 7190 + }, + { + "epoch": 0.33479991619526506, + "grad_norm": 0.48153412702847814, + "learning_rate": 9.995906305127314e-05, + "loss": 3.4253, + "step": 7191 + }, + { + "epoch": 0.3348464743813581, + "grad_norm": 0.4285464362388712, + "learning_rate": 9.995895338930825e-05, + "loss": 3.5144, + "step": 7192 + }, + { + "epoch": 0.3348930325674512, + "grad_norm": 0.4968823552443742, + "learning_rate": 9.995884358071876e-05, + "loss": 3.4669, + "step": 7193 + }, + { + "epoch": 0.33493959075354424, + "grad_norm": 0.53555373016167, + "learning_rate": 9.995873362550498e-05, + "loss": 3.6164, + "step": 7194 + }, + { + "epoch": 0.3349861489396373, + "grad_norm": 0.44338379961263225, + "learning_rate": 9.995862352366724e-05, + "loss": 3.5481, + "step": 7195 + }, + { + "epoch": 0.33503270712573036, + "grad_norm": 0.5183874735824145, + "learning_rate": 9.995851327520586e-05, + "loss": 3.5441, + "step": 7196 + }, + { + "epoch": 0.33507926531182347, + "grad_norm": 0.5505324627767153, + "learning_rate": 9.995840288012117e-05, + "loss": 3.4379, + "step": 7197 + }, + { + "epoch": 0.33512582349791653, + "grad_norm": 0.572130625056027, + "learning_rate": 9.99582923384135e-05, + "loss": 3.4604, + "step": 7198 + }, + { + "epoch": 0.3351723816840096, + "grad_norm": 0.5162999631121462, + "learning_rate": 9.995818165008315e-05, + "loss": 3.5255, + "step": 7199 + }, + { + "epoch": 0.33521893987010265, + "grad_norm": 0.45437868324174113, + "learning_rate": 9.995807081513046e-05, + "loss": 3.3614, + "step": 7200 + }, + { + "epoch": 0.3352654980561957, + "grad_norm": 0.48625505164829497, + "learning_rate": 9.995795983355575e-05, + "loss": 3.4712, + "step": 7201 + }, + { + "epoch": 0.3353120562422888, + "grad_norm": 0.5226403773787802, + "learning_rate": 9.995784870535936e-05, + "loss": 3.332, + "step": 7202 + }, + { + "epoch": 0.3353586144283819, + "grad_norm": 0.5056140807189216, + "learning_rate": 9.99577374305416e-05, + "loss": 3.5158, + "step": 7203 + }, + { + "epoch": 0.33540517261447494, + "grad_norm": 0.5088924144122164, + "learning_rate": 9.99576260091028e-05, + "loss": 3.56, + "step": 7204 + }, + { + "epoch": 0.335451730800568, + "grad_norm": 0.5327228258809881, + "learning_rate": 
9.995751444104329e-05, + "loss": 3.4228, + "step": 7205 + }, + { + "epoch": 0.33549828898666106, + "grad_norm": 0.5238722394290033, + "learning_rate": 9.99574027263634e-05, + "loss": 3.4895, + "step": 7206 + }, + { + "epoch": 0.3355448471727541, + "grad_norm": 0.6053576208391116, + "learning_rate": 9.995729086506344e-05, + "loss": 3.492, + "step": 7207 + }, + { + "epoch": 0.33559140535884724, + "grad_norm": 0.4766374134773582, + "learning_rate": 9.995717885714377e-05, + "loss": 3.3534, + "step": 7208 + }, + { + "epoch": 0.3356379635449403, + "grad_norm": 0.45623491187163784, + "learning_rate": 9.995706670260471e-05, + "loss": 3.524, + "step": 7209 + }, + { + "epoch": 0.33568452173103336, + "grad_norm": 0.5518013455447295, + "learning_rate": 9.995695440144656e-05, + "loss": 3.5009, + "step": 7210 + }, + { + "epoch": 0.3357310799171264, + "grad_norm": 0.5396553237949648, + "learning_rate": 9.995684195366968e-05, + "loss": 3.616, + "step": 7211 + }, + { + "epoch": 0.3357776381032195, + "grad_norm": 0.5503937147657266, + "learning_rate": 9.99567293592744e-05, + "loss": 3.5036, + "step": 7212 + }, + { + "epoch": 0.3358241962893126, + "grad_norm": 0.5089883726662633, + "learning_rate": 9.995661661826103e-05, + "loss": 3.4879, + "step": 7213 + }, + { + "epoch": 0.33587075447540565, + "grad_norm": 0.44714872476521444, + "learning_rate": 9.99565037306299e-05, + "loss": 3.5444, + "step": 7214 + }, + { + "epoch": 0.3359173126614987, + "grad_norm": 0.4961698320151293, + "learning_rate": 9.995639069638136e-05, + "loss": 3.4272, + "step": 7215 + }, + { + "epoch": 0.3359638708475918, + "grad_norm": 0.4341765188493296, + "learning_rate": 9.995627751551575e-05, + "loss": 3.3926, + "step": 7216 + }, + { + "epoch": 0.33601042903368483, + "grad_norm": 0.4576664624924556, + "learning_rate": 9.995616418803337e-05, + "loss": 3.5406, + "step": 7217 + }, + { + "epoch": 0.3360569872197779, + "grad_norm": 0.4646313249782505, + "learning_rate": 9.995605071393459e-05, + "loss": 3.5131, + "step": 7218 + }, + { + "epoch": 0.336103545405871, + "grad_norm": 0.378892871616803, + "learning_rate": 9.995593709321971e-05, + "loss": 3.4382, + "step": 7219 + }, + { + "epoch": 0.33615010359196407, + "grad_norm": 0.35777948589332886, + "learning_rate": 9.995582332588906e-05, + "loss": 3.3262, + "step": 7220 + }, + { + "epoch": 0.3361966617780571, + "grad_norm": 0.43623756802491176, + "learning_rate": 9.995570941194301e-05, + "loss": 3.4044, + "step": 7221 + }, + { + "epoch": 0.3362432199641502, + "grad_norm": 0.47641501359789457, + "learning_rate": 9.995559535138188e-05, + "loss": 3.4916, + "step": 7222 + }, + { + "epoch": 0.33628977815024325, + "grad_norm": 0.4559299607363162, + "learning_rate": 9.995548114420598e-05, + "loss": 3.5635, + "step": 7223 + }, + { + "epoch": 0.33633633633633636, + "grad_norm": 0.5006729517462863, + "learning_rate": 9.995536679041568e-05, + "loss": 3.5016, + "step": 7224 + }, + { + "epoch": 0.3363828945224294, + "grad_norm": 0.4721047995290486, + "learning_rate": 9.995525229001128e-05, + "loss": 3.4524, + "step": 7225 + }, + { + "epoch": 0.3364294527085225, + "grad_norm": 0.4890786240308772, + "learning_rate": 9.995513764299315e-05, + "loss": 3.5826, + "step": 7226 + }, + { + "epoch": 0.33647601089461554, + "grad_norm": 0.5039505587888905, + "learning_rate": 9.995502284936162e-05, + "loss": 3.3222, + "step": 7227 + }, + { + "epoch": 0.3365225690807086, + "grad_norm": 0.48603342421219914, + "learning_rate": 9.995490790911701e-05, + "loss": 3.4969, + "step": 7228 + }, + { + "epoch": 0.33656912726680166, 
+ "grad_norm": 0.5402433395059895, + "learning_rate": 9.995479282225966e-05, + "loss": 3.5167, + "step": 7229 + }, + { + "epoch": 0.3366156854528948, + "grad_norm": 0.554097210347677, + "learning_rate": 9.995467758878994e-05, + "loss": 3.3961, + "step": 7230 + }, + { + "epoch": 0.33666224363898783, + "grad_norm": 0.46736339898108487, + "learning_rate": 9.995456220870814e-05, + "loss": 3.4627, + "step": 7231 + }, + { + "epoch": 0.3367088018250809, + "grad_norm": 0.4539500871865871, + "learning_rate": 9.995444668201462e-05, + "loss": 3.515, + "step": 7232 + }, + { + "epoch": 0.33675536001117395, + "grad_norm": 0.4479374849985329, + "learning_rate": 9.995433100870973e-05, + "loss": 3.439, + "step": 7233 + }, + { + "epoch": 0.336801918197267, + "grad_norm": 0.43158153232195184, + "learning_rate": 9.99542151887938e-05, + "loss": 3.4311, + "step": 7234 + }, + { + "epoch": 0.33684847638336013, + "grad_norm": 0.43148779102155554, + "learning_rate": 9.995409922226715e-05, + "loss": 3.5782, + "step": 7235 + }, + { + "epoch": 0.3368950345694532, + "grad_norm": 0.46895652887127, + "learning_rate": 9.995398310913015e-05, + "loss": 3.4886, + "step": 7236 + }, + { + "epoch": 0.33694159275554625, + "grad_norm": 0.5148678540628949, + "learning_rate": 9.995386684938315e-05, + "loss": 3.4619, + "step": 7237 + }, + { + "epoch": 0.3369881509416393, + "grad_norm": 0.42226828891944107, + "learning_rate": 9.995375044302646e-05, + "loss": 3.56, + "step": 7238 + }, + { + "epoch": 0.33703470912773237, + "grad_norm": 0.45288712185625696, + "learning_rate": 9.995363389006043e-05, + "loss": 3.4795, + "step": 7239 + }, + { + "epoch": 0.33708126731382543, + "grad_norm": 0.490028807802144, + "learning_rate": 9.995351719048541e-05, + "loss": 3.4228, + "step": 7240 + }, + { + "epoch": 0.33712782549991854, + "grad_norm": 0.4633097726289624, + "learning_rate": 9.995340034430173e-05, + "loss": 3.5562, + "step": 7241 + }, + { + "epoch": 0.3371743836860116, + "grad_norm": 0.5314229520202611, + "learning_rate": 9.995328335150975e-05, + "loss": 3.4451, + "step": 7242 + }, + { + "epoch": 0.33722094187210466, + "grad_norm": 0.638219245106692, + "learning_rate": 9.99531662121098e-05, + "loss": 3.5036, + "step": 7243 + }, + { + "epoch": 0.3372675000581977, + "grad_norm": 0.6575859136312461, + "learning_rate": 9.995304892610223e-05, + "loss": 3.5247, + "step": 7244 + }, + { + "epoch": 0.3373140582442908, + "grad_norm": 0.5336079865120226, + "learning_rate": 9.995293149348739e-05, + "loss": 3.4701, + "step": 7245 + }, + { + "epoch": 0.3373606164303839, + "grad_norm": 0.544668038905353, + "learning_rate": 9.995281391426561e-05, + "loss": 3.51, + "step": 7246 + }, + { + "epoch": 0.33740717461647696, + "grad_norm": 0.5205066177184711, + "learning_rate": 9.995269618843723e-05, + "loss": 3.4553, + "step": 7247 + }, + { + "epoch": 0.33745373280257, + "grad_norm": 0.4598888707442177, + "learning_rate": 9.995257831600262e-05, + "loss": 3.5242, + "step": 7248 + }, + { + "epoch": 0.3375002909886631, + "grad_norm": 0.5670663474754372, + "learning_rate": 9.995246029696212e-05, + "loss": 3.4065, + "step": 7249 + }, + { + "epoch": 0.33754684917475614, + "grad_norm": 0.6087262469989186, + "learning_rate": 9.995234213131607e-05, + "loss": 3.5677, + "step": 7250 + }, + { + "epoch": 0.3375934073608492, + "grad_norm": 0.5197794895734544, + "learning_rate": 9.995222381906481e-05, + "loss": 3.4484, + "step": 7251 + }, + { + "epoch": 0.3376399655469423, + "grad_norm": 0.7707184615885856, + "learning_rate": 9.99521053602087e-05, + "loss": 3.3996, + "step": 
7252 + }, + { + "epoch": 0.33768652373303537, + "grad_norm": 0.7827817005183965, + "learning_rate": 9.995198675474807e-05, + "loss": 3.5796, + "step": 7253 + }, + { + "epoch": 0.33773308191912843, + "grad_norm": 0.5226627551624078, + "learning_rate": 9.995186800268329e-05, + "loss": 3.3933, + "step": 7254 + }, + { + "epoch": 0.3377796401052215, + "grad_norm": 0.7444460088911096, + "learning_rate": 9.99517491040147e-05, + "loss": 3.5419, + "step": 7255 + }, + { + "epoch": 0.33782619829131455, + "grad_norm": 0.7599004808239254, + "learning_rate": 9.995163005874264e-05, + "loss": 3.4265, + "step": 7256 + }, + { + "epoch": 0.33787275647740767, + "grad_norm": 0.5547466996940517, + "learning_rate": 9.995151086686746e-05, + "loss": 3.3331, + "step": 7257 + }, + { + "epoch": 0.3379193146635007, + "grad_norm": 0.5743683879952107, + "learning_rate": 9.995139152838954e-05, + "loss": 3.4497, + "step": 7258 + }, + { + "epoch": 0.3379658728495938, + "grad_norm": 0.5014192492070321, + "learning_rate": 9.995127204330919e-05, + "loss": 3.3781, + "step": 7259 + }, + { + "epoch": 0.33801243103568684, + "grad_norm": 0.5091183733865143, + "learning_rate": 9.995115241162678e-05, + "loss": 3.5358, + "step": 7260 + }, + { + "epoch": 0.3380589892217799, + "grad_norm": 0.550829757397929, + "learning_rate": 9.995103263334266e-05, + "loss": 3.4579, + "step": 7261 + }, + { + "epoch": 0.33810554740787296, + "grad_norm": 0.4867183988824487, + "learning_rate": 9.995091270845718e-05, + "loss": 3.5806, + "step": 7262 + }, + { + "epoch": 0.3381521055939661, + "grad_norm": 0.5355561395430882, + "learning_rate": 9.99507926369707e-05, + "loss": 3.3808, + "step": 7263 + }, + { + "epoch": 0.33819866378005914, + "grad_norm": 0.4699063575095486, + "learning_rate": 9.995067241888356e-05, + "loss": 3.4669, + "step": 7264 + }, + { + "epoch": 0.3382452219661522, + "grad_norm": 0.5474796602431095, + "learning_rate": 9.99505520541961e-05, + "loss": 3.3865, + "step": 7265 + }, + { + "epoch": 0.33829178015224526, + "grad_norm": 0.6044524452695929, + "learning_rate": 9.995043154290871e-05, + "loss": 3.4112, + "step": 7266 + }, + { + "epoch": 0.3383383383383383, + "grad_norm": 0.4952029104730582, + "learning_rate": 9.995031088502171e-05, + "loss": 3.4134, + "step": 7267 + }, + { + "epoch": 0.33838489652443143, + "grad_norm": 0.47293039669085507, + "learning_rate": 9.995019008053549e-05, + "loss": 3.3736, + "step": 7268 + }, + { + "epoch": 0.3384314547105245, + "grad_norm": 0.45696329578953726, + "learning_rate": 9.995006912945038e-05, + "loss": 3.3798, + "step": 7269 + }, + { + "epoch": 0.33847801289661755, + "grad_norm": 0.5048402213013462, + "learning_rate": 9.994994803176671e-05, + "loss": 3.4169, + "step": 7270 + }, + { + "epoch": 0.3385245710827106, + "grad_norm": 0.5155779787644827, + "learning_rate": 9.994982678748489e-05, + "loss": 3.5466, + "step": 7271 + }, + { + "epoch": 0.33857112926880367, + "grad_norm": 0.5046428047101306, + "learning_rate": 9.994970539660524e-05, + "loss": 3.3825, + "step": 7272 + }, + { + "epoch": 0.33861768745489673, + "grad_norm": 0.4516084607453516, + "learning_rate": 9.994958385912812e-05, + "loss": 3.482, + "step": 7273 + }, + { + "epoch": 0.33866424564098985, + "grad_norm": 0.45785163650164584, + "learning_rate": 9.99494621750539e-05, + "loss": 3.4638, + "step": 7274 + }, + { + "epoch": 0.3387108038270829, + "grad_norm": 0.4992926062179097, + "learning_rate": 9.994934034438294e-05, + "loss": 3.4322, + "step": 7275 + }, + { + "epoch": 0.33875736201317597, + "grad_norm": 0.44674253233157785, + 
"learning_rate": 9.994921836711557e-05, + "loss": 3.4114, + "step": 7276 + }, + { + "epoch": 0.338803920199269, + "grad_norm": 0.4555731210435502, + "learning_rate": 9.994909624325217e-05, + "loss": 3.5543, + "step": 7277 + }, + { + "epoch": 0.3388504783853621, + "grad_norm": 0.4864915253859488, + "learning_rate": 9.99489739727931e-05, + "loss": 3.4639, + "step": 7278 + }, + { + "epoch": 0.3388970365714552, + "grad_norm": 0.446388562112898, + "learning_rate": 9.99488515557387e-05, + "loss": 3.48, + "step": 7279 + }, + { + "epoch": 0.33894359475754826, + "grad_norm": 0.40351161701927835, + "learning_rate": 9.994872899208935e-05, + "loss": 3.5186, + "step": 7280 + }, + { + "epoch": 0.3389901529436413, + "grad_norm": 0.4406073655729518, + "learning_rate": 9.994860628184539e-05, + "loss": 3.6798, + "step": 7281 + }, + { + "epoch": 0.3390367111297344, + "grad_norm": 0.49862781565288267, + "learning_rate": 9.99484834250072e-05, + "loss": 3.4815, + "step": 7282 + }, + { + "epoch": 0.33908326931582744, + "grad_norm": 0.44320462903399077, + "learning_rate": 9.994836042157514e-05, + "loss": 3.5385, + "step": 7283 + }, + { + "epoch": 0.3391298275019205, + "grad_norm": 0.4173197760884729, + "learning_rate": 9.994823727154956e-05, + "loss": 3.3851, + "step": 7284 + }, + { + "epoch": 0.3391763856880136, + "grad_norm": 0.42847308899204634, + "learning_rate": 9.994811397493081e-05, + "loss": 3.5241, + "step": 7285 + }, + { + "epoch": 0.3392229438741067, + "grad_norm": 0.4304537196792193, + "learning_rate": 9.994799053171928e-05, + "loss": 3.4789, + "step": 7286 + }, + { + "epoch": 0.33926950206019973, + "grad_norm": 0.4380808993466883, + "learning_rate": 9.994786694191531e-05, + "loss": 3.2926, + "step": 7287 + }, + { + "epoch": 0.3393160602462928, + "grad_norm": 0.4399969201053961, + "learning_rate": 9.994774320551927e-05, + "loss": 3.5068, + "step": 7288 + }, + { + "epoch": 0.33936261843238585, + "grad_norm": 0.4756422610664259, + "learning_rate": 9.994761932253154e-05, + "loss": 3.4082, + "step": 7289 + }, + { + "epoch": 0.33940917661847897, + "grad_norm": 0.4738992701580259, + "learning_rate": 9.994749529295245e-05, + "loss": 3.4734, + "step": 7290 + }, + { + "epoch": 0.33945573480457203, + "grad_norm": 0.5268946894076776, + "learning_rate": 9.994737111678239e-05, + "loss": 3.4876, + "step": 7291 + }, + { + "epoch": 0.3395022929906651, + "grad_norm": 0.5817644207597532, + "learning_rate": 9.994724679402173e-05, + "loss": 3.5024, + "step": 7292 + }, + { + "epoch": 0.33954885117675815, + "grad_norm": 0.5487794193130536, + "learning_rate": 9.994712232467079e-05, + "loss": 3.5257, + "step": 7293 + }, + { + "epoch": 0.3395954093628512, + "grad_norm": 0.47746760637895186, + "learning_rate": 9.994699770873e-05, + "loss": 3.503, + "step": 7294 + }, + { + "epoch": 0.33964196754894427, + "grad_norm": 0.4210361718595897, + "learning_rate": 9.994687294619967e-05, + "loss": 3.4474, + "step": 7295 + }, + { + "epoch": 0.3396885257350374, + "grad_norm": 0.42532440671854793, + "learning_rate": 9.994674803708019e-05, + "loss": 3.2807, + "step": 7296 + }, + { + "epoch": 0.33973508392113044, + "grad_norm": 0.4765337922789837, + "learning_rate": 9.994662298137193e-05, + "loss": 3.4206, + "step": 7297 + }, + { + "epoch": 0.3397816421072235, + "grad_norm": 0.4734654766619816, + "learning_rate": 9.994649777907525e-05, + "loss": 3.531, + "step": 7298 + }, + { + "epoch": 0.33982820029331656, + "grad_norm": 0.5175261988366182, + "learning_rate": 9.994637243019051e-05, + "loss": 3.4564, + "step": 7299 + }, + { + "epoch": 
0.3398747584794096, + "grad_norm": 0.5280817445491175, + "learning_rate": 9.99462469347181e-05, + "loss": 3.4302, + "step": 7300 + }, + { + "epoch": 0.33992131666550274, + "grad_norm": 0.45883245807418577, + "learning_rate": 9.99461212926584e-05, + "loss": 3.4187, + "step": 7301 + }, + { + "epoch": 0.3399678748515958, + "grad_norm": 0.5063031525571151, + "learning_rate": 9.994599550401171e-05, + "loss": 3.3514, + "step": 7302 + }, + { + "epoch": 0.34001443303768886, + "grad_norm": 0.4896791810754011, + "learning_rate": 9.994586956877847e-05, + "loss": 3.4243, + "step": 7303 + }, + { + "epoch": 0.3400609912237819, + "grad_norm": 0.42896314837968236, + "learning_rate": 9.9945743486959e-05, + "loss": 3.4889, + "step": 7304 + }, + { + "epoch": 0.340107549409875, + "grad_norm": 0.4662111617590383, + "learning_rate": 9.994561725855371e-05, + "loss": 3.4483, + "step": 7305 + }, + { + "epoch": 0.34015410759596804, + "grad_norm": 0.4228993158844977, + "learning_rate": 9.994549088356296e-05, + "loss": 3.4327, + "step": 7306 + }, + { + "epoch": 0.34020066578206115, + "grad_norm": 0.45109356464050065, + "learning_rate": 9.994536436198712e-05, + "loss": 3.4958, + "step": 7307 + }, + { + "epoch": 0.3402472239681542, + "grad_norm": 0.4568656251812018, + "learning_rate": 9.994523769382653e-05, + "loss": 3.5591, + "step": 7308 + }, + { + "epoch": 0.34029378215424727, + "grad_norm": 0.4474177243448991, + "learning_rate": 9.994511087908161e-05, + "loss": 3.4723, + "step": 7309 + }, + { + "epoch": 0.34034034034034033, + "grad_norm": 0.40798113360465493, + "learning_rate": 9.994498391775268e-05, + "loss": 3.4864, + "step": 7310 + }, + { + "epoch": 0.3403868985264334, + "grad_norm": 0.45580129172798034, + "learning_rate": 9.994485680984018e-05, + "loss": 3.4894, + "step": 7311 + }, + { + "epoch": 0.3404334567125265, + "grad_norm": 0.40783931306586785, + "learning_rate": 9.994472955534443e-05, + "loss": 3.3719, + "step": 7312 + }, + { + "epoch": 0.34048001489861957, + "grad_norm": 0.4494283475557479, + "learning_rate": 9.994460215426583e-05, + "loss": 3.4922, + "step": 7313 + }, + { + "epoch": 0.3405265730847126, + "grad_norm": 0.46434571475825404, + "learning_rate": 9.994447460660473e-05, + "loss": 3.5101, + "step": 7314 + }, + { + "epoch": 0.3405731312708057, + "grad_norm": 0.48168199403681505, + "learning_rate": 9.994434691236151e-05, + "loss": 3.4424, + "step": 7315 + }, + { + "epoch": 0.34061968945689874, + "grad_norm": 0.46380849201564944, + "learning_rate": 9.994421907153656e-05, + "loss": 3.4436, + "step": 7316 + }, + { + "epoch": 0.3406662476429918, + "grad_norm": 0.40490011096787215, + "learning_rate": 9.994409108413026e-05, + "loss": 3.3797, + "step": 7317 + }, + { + "epoch": 0.3407128058290849, + "grad_norm": 0.47821904305795615, + "learning_rate": 9.994396295014294e-05, + "loss": 3.4592, + "step": 7318 + }, + { + "epoch": 0.340759364015178, + "grad_norm": 0.5150174026549719, + "learning_rate": 9.994383466957505e-05, + "loss": 3.4775, + "step": 7319 + }, + { + "epoch": 0.34080592220127104, + "grad_norm": 0.4338777696897454, + "learning_rate": 9.99437062424269e-05, + "loss": 3.3961, + "step": 7320 + }, + { + "epoch": 0.3408524803873641, + "grad_norm": 0.5049134454738639, + "learning_rate": 9.99435776686989e-05, + "loss": 3.5301, + "step": 7321 + }, + { + "epoch": 0.34089903857345716, + "grad_norm": 0.5269450232079527, + "learning_rate": 9.994344894839142e-05, + "loss": 3.4347, + "step": 7322 + }, + { + "epoch": 0.3409455967595503, + "grad_norm": 0.4737806389217303, + "learning_rate": 
9.994332008150483e-05, + "loss": 3.6122, + "step": 7323 + }, + { + "epoch": 0.34099215494564333, + "grad_norm": 0.5560066016555578, + "learning_rate": 9.994319106803952e-05, + "loss": 3.4259, + "step": 7324 + }, + { + "epoch": 0.3410387131317364, + "grad_norm": 0.5312891199337905, + "learning_rate": 9.994306190799588e-05, + "loss": 3.4296, + "step": 7325 + }, + { + "epoch": 0.34108527131782945, + "grad_norm": 0.435537202631665, + "learning_rate": 9.994293260137425e-05, + "loss": 3.5562, + "step": 7326 + }, + { + "epoch": 0.3411318295039225, + "grad_norm": 0.5257722830808096, + "learning_rate": 9.994280314817504e-05, + "loss": 3.3187, + "step": 7327 + }, + { + "epoch": 0.34117838769001557, + "grad_norm": 0.5466972659483961, + "learning_rate": 9.994267354839864e-05, + "loss": 3.4366, + "step": 7328 + }, + { + "epoch": 0.3412249458761087, + "grad_norm": 0.4402962249999283, + "learning_rate": 9.994254380204539e-05, + "loss": 3.3576, + "step": 7329 + }, + { + "epoch": 0.34127150406220175, + "grad_norm": 0.5538129155961762, + "learning_rate": 9.99424139091157e-05, + "loss": 3.6033, + "step": 7330 + }, + { + "epoch": 0.3413180622482948, + "grad_norm": 0.5484109939718551, + "learning_rate": 9.994228386960996e-05, + "loss": 3.4468, + "step": 7331 + }, + { + "epoch": 0.34136462043438787, + "grad_norm": 0.49493025330076196, + "learning_rate": 9.994215368352852e-05, + "loss": 3.4735, + "step": 7332 + }, + { + "epoch": 0.3414111786204809, + "grad_norm": 0.532252022596582, + "learning_rate": 9.994202335087177e-05, + "loss": 3.6275, + "step": 7333 + }, + { + "epoch": 0.34145773680657404, + "grad_norm": 0.6318435546424626, + "learning_rate": 9.994189287164013e-05, + "loss": 3.5174, + "step": 7334 + }, + { + "epoch": 0.3415042949926671, + "grad_norm": 0.5392895077644448, + "learning_rate": 9.994176224583394e-05, + "loss": 3.4432, + "step": 7335 + }, + { + "epoch": 0.34155085317876016, + "grad_norm": 0.4722096909508896, + "learning_rate": 9.994163147345359e-05, + "loss": 3.4042, + "step": 7336 + }, + { + "epoch": 0.3415974113648532, + "grad_norm": 0.610807357253142, + "learning_rate": 9.994150055449949e-05, + "loss": 3.4836, + "step": 7337 + }, + { + "epoch": 0.3416439695509463, + "grad_norm": 0.5915656921589232, + "learning_rate": 9.9941369488972e-05, + "loss": 3.506, + "step": 7338 + }, + { + "epoch": 0.34169052773703934, + "grad_norm": 0.593680899343963, + "learning_rate": 9.99412382768715e-05, + "loss": 3.4307, + "step": 7339 + }, + { + "epoch": 0.34173708592313246, + "grad_norm": 0.5292325888691944, + "learning_rate": 9.994110691819839e-05, + "loss": 3.4189, + "step": 7340 + }, + { + "epoch": 0.3417836441092255, + "grad_norm": 0.5954415921525783, + "learning_rate": 9.994097541295305e-05, + "loss": 3.4903, + "step": 7341 + }, + { + "epoch": 0.3418302022953186, + "grad_norm": 0.559733736964651, + "learning_rate": 9.994084376113588e-05, + "loss": 3.5305, + "step": 7342 + }, + { + "epoch": 0.34187676048141163, + "grad_norm": 0.5654750390450758, + "learning_rate": 9.994071196274723e-05, + "loss": 3.4418, + "step": 7343 + }, + { + "epoch": 0.3419233186675047, + "grad_norm": 0.5268061756094462, + "learning_rate": 9.994058001778754e-05, + "loss": 3.4316, + "step": 7344 + }, + { + "epoch": 0.3419698768535978, + "grad_norm": 0.5942557389892873, + "learning_rate": 9.994044792625714e-05, + "loss": 3.4296, + "step": 7345 + }, + { + "epoch": 0.34201643503969087, + "grad_norm": 0.4803692599271819, + "learning_rate": 9.994031568815647e-05, + "loss": 3.5222, + "step": 7346 + }, + { + "epoch": 0.34206299322578393, + 
"grad_norm": 0.4788970953082618, + "learning_rate": 9.994018330348587e-05, + "loss": 3.5251, + "step": 7347 + }, + { + "epoch": 0.342109551411877, + "grad_norm": 0.5419305133911453, + "learning_rate": 9.994005077224578e-05, + "loss": 3.4929, + "step": 7348 + }, + { + "epoch": 0.34215610959797005, + "grad_norm": 0.4931174337807413, + "learning_rate": 9.993991809443653e-05, + "loss": 3.4649, + "step": 7349 + }, + { + "epoch": 0.3422026677840631, + "grad_norm": 0.5180622795403239, + "learning_rate": 9.993978527005857e-05, + "loss": 3.4466, + "step": 7350 + }, + { + "epoch": 0.3422492259701562, + "grad_norm": 0.5528753821560441, + "learning_rate": 9.993965229911223e-05, + "loss": 3.391, + "step": 7351 + }, + { + "epoch": 0.3422957841562493, + "grad_norm": 0.5190520500884448, + "learning_rate": 9.993951918159793e-05, + "loss": 3.4917, + "step": 7352 + }, + { + "epoch": 0.34234234234234234, + "grad_norm": 0.45051719085312875, + "learning_rate": 9.993938591751609e-05, + "loss": 3.485, + "step": 7353 + }, + { + "epoch": 0.3423889005284354, + "grad_norm": 0.48581274492581705, + "learning_rate": 9.993925250686707e-05, + "loss": 3.5419, + "step": 7354 + }, + { + "epoch": 0.34243545871452846, + "grad_norm": 0.4594796020618867, + "learning_rate": 9.993911894965124e-05, + "loss": 3.4366, + "step": 7355 + }, + { + "epoch": 0.3424820169006216, + "grad_norm": 0.3923563047954371, + "learning_rate": 9.993898524586902e-05, + "loss": 3.4717, + "step": 7356 + }, + { + "epoch": 0.34252857508671464, + "grad_norm": 0.4516946612379364, + "learning_rate": 9.993885139552082e-05, + "loss": 3.3402, + "step": 7357 + }, + { + "epoch": 0.3425751332728077, + "grad_norm": 0.4310049774311886, + "learning_rate": 9.993871739860698e-05, + "loss": 3.5114, + "step": 7358 + }, + { + "epoch": 0.34262169145890076, + "grad_norm": 0.5781133863078028, + "learning_rate": 9.993858325512794e-05, + "loss": 3.54, + "step": 7359 + }, + { + "epoch": 0.3426682496449938, + "grad_norm": 0.5109871830061837, + "learning_rate": 9.993844896508408e-05, + "loss": 3.4123, + "step": 7360 + }, + { + "epoch": 0.3427148078310869, + "grad_norm": 0.4652521579145569, + "learning_rate": 9.993831452847579e-05, + "loss": 3.5811, + "step": 7361 + }, + { + "epoch": 0.34276136601718, + "grad_norm": 0.5997375961217475, + "learning_rate": 9.993817994530348e-05, + "loss": 3.5203, + "step": 7362 + }, + { + "epoch": 0.34280792420327305, + "grad_norm": 0.5302646621644984, + "learning_rate": 9.993804521556751e-05, + "loss": 3.3972, + "step": 7363 + }, + { + "epoch": 0.3428544823893661, + "grad_norm": 0.44368529763035014, + "learning_rate": 9.993791033926831e-05, + "loss": 3.4493, + "step": 7364 + }, + { + "epoch": 0.34290104057545917, + "grad_norm": 0.5417403736720083, + "learning_rate": 9.993777531640627e-05, + "loss": 3.398, + "step": 7365 + }, + { + "epoch": 0.34294759876155223, + "grad_norm": 0.48136661242438633, + "learning_rate": 9.993764014698176e-05, + "loss": 3.398, + "step": 7366 + }, + { + "epoch": 0.34299415694764535, + "grad_norm": 0.4788873724576207, + "learning_rate": 9.993750483099519e-05, + "loss": 3.4707, + "step": 7367 + }, + { + "epoch": 0.3430407151337384, + "grad_norm": 0.507796895858746, + "learning_rate": 9.993736936844699e-05, + "loss": 3.5372, + "step": 7368 + }, + { + "epoch": 0.34308727331983147, + "grad_norm": 0.5794879861229538, + "learning_rate": 9.993723375933752e-05, + "loss": 3.5154, + "step": 7369 + }, + { + "epoch": 0.3431338315059245, + "grad_norm": 0.4756906529483361, + "learning_rate": 9.993709800366717e-05, + "loss": 3.4717, + 
"step": 7370 + }, + { + "epoch": 0.3431803896920176, + "grad_norm": 0.47408095038613396, + "learning_rate": 9.993696210143639e-05, + "loss": 3.4792, + "step": 7371 + }, + { + "epoch": 0.34322694787811064, + "grad_norm": 0.39279264738092656, + "learning_rate": 9.993682605264552e-05, + "loss": 3.4695, + "step": 7372 + }, + { + "epoch": 0.34327350606420376, + "grad_norm": 0.46735150997247255, + "learning_rate": 9.9936689857295e-05, + "loss": 3.5297, + "step": 7373 + }, + { + "epoch": 0.3433200642502968, + "grad_norm": 0.4731146232888381, + "learning_rate": 9.99365535153852e-05, + "loss": 3.3828, + "step": 7374 + }, + { + "epoch": 0.3433666224363899, + "grad_norm": 0.4531156820691186, + "learning_rate": 9.993641702691654e-05, + "loss": 3.5609, + "step": 7375 + }, + { + "epoch": 0.34341318062248294, + "grad_norm": 0.4274203165100354, + "learning_rate": 9.993628039188942e-05, + "loss": 3.5037, + "step": 7376 + }, + { + "epoch": 0.343459738808576, + "grad_norm": 0.43614923719511806, + "learning_rate": 9.993614361030424e-05, + "loss": 3.4734, + "step": 7377 + }, + { + "epoch": 0.3435062969946691, + "grad_norm": 0.41606170527159503, + "learning_rate": 9.99360066821614e-05, + "loss": 3.3739, + "step": 7378 + }, + { + "epoch": 0.3435528551807622, + "grad_norm": 0.4058140699667426, + "learning_rate": 9.993586960746129e-05, + "loss": 3.449, + "step": 7379 + }, + { + "epoch": 0.34359941336685523, + "grad_norm": 0.39781521722293145, + "learning_rate": 9.993573238620433e-05, + "loss": 3.395, + "step": 7380 + }, + { + "epoch": 0.3436459715529483, + "grad_norm": 1.0430559955663392, + "learning_rate": 9.993559501839091e-05, + "loss": 3.3174, + "step": 7381 + }, + { + "epoch": 0.34369252973904135, + "grad_norm": 1.1079215556287108, + "learning_rate": 9.993545750402144e-05, + "loss": 3.4235, + "step": 7382 + }, + { + "epoch": 0.3437390879251344, + "grad_norm": 0.8926471659367393, + "learning_rate": 9.993531984309631e-05, + "loss": 3.5173, + "step": 7383 + }, + { + "epoch": 0.3437856461112275, + "grad_norm": 0.7370435306824482, + "learning_rate": 9.993518203561596e-05, + "loss": 3.4715, + "step": 7384 + }, + { + "epoch": 0.3438322042973206, + "grad_norm": 0.5484622270066206, + "learning_rate": 9.993504408158074e-05, + "loss": 3.5887, + "step": 7385 + }, + { + "epoch": 0.34387876248341365, + "grad_norm": 0.6520974653596096, + "learning_rate": 9.993490598099111e-05, + "loss": 3.4629, + "step": 7386 + }, + { + "epoch": 0.3439253206695067, + "grad_norm": 0.6621378826277267, + "learning_rate": 9.993476773384745e-05, + "loss": 3.4175, + "step": 7387 + }, + { + "epoch": 0.34397187885559977, + "grad_norm": 0.5504516133548489, + "learning_rate": 9.993462934015014e-05, + "loss": 3.4221, + "step": 7388 + }, + { + "epoch": 0.3440184370416929, + "grad_norm": 0.6108286145569867, + "learning_rate": 9.993449079989963e-05, + "loss": 3.4362, + "step": 7389 + }, + { + "epoch": 0.34406499522778594, + "grad_norm": 0.6129099342489766, + "learning_rate": 9.993435211309633e-05, + "loss": 3.6104, + "step": 7390 + }, + { + "epoch": 0.344111553413879, + "grad_norm": 0.5346179202639872, + "learning_rate": 9.99342132797406e-05, + "loss": 3.4553, + "step": 7391 + }, + { + "epoch": 0.34415811159997206, + "grad_norm": 0.4462280119929173, + "learning_rate": 9.993407429983287e-05, + "loss": 3.4629, + "step": 7392 + }, + { + "epoch": 0.3442046697860651, + "grad_norm": 0.5648247650064325, + "learning_rate": 9.993393517337356e-05, + "loss": 3.5027, + "step": 7393 + }, + { + "epoch": 0.3442512279721582, + "grad_norm": 0.506834082030684, + 
"learning_rate": 9.993379590036306e-05, + "loss": 3.5599, + "step": 7394 + }, + { + "epoch": 0.3442977861582513, + "grad_norm": 0.46643098296782554, + "learning_rate": 9.99336564808018e-05, + "loss": 3.424, + "step": 7395 + }, + { + "epoch": 0.34434434434434436, + "grad_norm": 0.48202216649404656, + "learning_rate": 9.993351691469019e-05, + "loss": 3.4325, + "step": 7396 + }, + { + "epoch": 0.3443909025304374, + "grad_norm": 0.4520099280359664, + "learning_rate": 9.99333772020286e-05, + "loss": 3.51, + "step": 7397 + }, + { + "epoch": 0.3444374607165305, + "grad_norm": 0.4535025368309767, + "learning_rate": 9.993323734281747e-05, + "loss": 3.4509, + "step": 7398 + }, + { + "epoch": 0.34448401890262353, + "grad_norm": 0.517718307020941, + "learning_rate": 9.993309733705723e-05, + "loss": 3.4822, + "step": 7399 + }, + { + "epoch": 0.34453057708871665, + "grad_norm": 0.4571589194482569, + "learning_rate": 9.993295718474826e-05, + "loss": 3.593, + "step": 7400 + }, + { + "epoch": 0.3445771352748097, + "grad_norm": 0.4310496631138253, + "learning_rate": 9.993281688589098e-05, + "loss": 3.6465, + "step": 7401 + }, + { + "epoch": 0.34462369346090277, + "grad_norm": 0.4858380254545526, + "learning_rate": 9.993267644048578e-05, + "loss": 3.3961, + "step": 7402 + }, + { + "epoch": 0.34467025164699583, + "grad_norm": 0.4377568664409572, + "learning_rate": 9.993253584853313e-05, + "loss": 3.5554, + "step": 7403 + }, + { + "epoch": 0.3447168098330889, + "grad_norm": 0.47058434343102773, + "learning_rate": 9.993239511003339e-05, + "loss": 3.5922, + "step": 7404 + }, + { + "epoch": 0.34476336801918195, + "grad_norm": 0.497618869455566, + "learning_rate": 9.993225422498697e-05, + "loss": 3.5836, + "step": 7405 + }, + { + "epoch": 0.34480992620527506, + "grad_norm": 0.46496887256153335, + "learning_rate": 9.993211319339433e-05, + "loss": 3.5923, + "step": 7406 + }, + { + "epoch": 0.3448564843913681, + "grad_norm": 0.4703150339483085, + "learning_rate": 9.993197201525583e-05, + "loss": 3.4054, + "step": 7407 + }, + { + "epoch": 0.3449030425774612, + "grad_norm": 0.4458060792674135, + "learning_rate": 9.993183069057194e-05, + "loss": 3.5351, + "step": 7408 + }, + { + "epoch": 0.34494960076355424, + "grad_norm": 0.44582101647181704, + "learning_rate": 9.993168921934304e-05, + "loss": 3.4594, + "step": 7409 + }, + { + "epoch": 0.3449961589496473, + "grad_norm": 0.45066287508991915, + "learning_rate": 9.993154760156953e-05, + "loss": 3.5266, + "step": 7410 + }, + { + "epoch": 0.3450427171357404, + "grad_norm": 0.48490081946959507, + "learning_rate": 9.993140583725187e-05, + "loss": 3.3904, + "step": 7411 + }, + { + "epoch": 0.3450892753218335, + "grad_norm": 0.4856984464899171, + "learning_rate": 9.993126392639046e-05, + "loss": 3.5455, + "step": 7412 + }, + { + "epoch": 0.34513583350792654, + "grad_norm": 0.5342769993768511, + "learning_rate": 9.993112186898568e-05, + "loss": 3.4261, + "step": 7413 + }, + { + "epoch": 0.3451823916940196, + "grad_norm": 0.5350538822574867, + "learning_rate": 9.993097966503799e-05, + "loss": 3.4371, + "step": 7414 + }, + { + "epoch": 0.34522894988011266, + "grad_norm": 0.4956717764095313, + "learning_rate": 9.99308373145478e-05, + "loss": 3.4572, + "step": 7415 + }, + { + "epoch": 0.3452755080662057, + "grad_norm": 0.5524401403201966, + "learning_rate": 9.993069481751551e-05, + "loss": 3.4167, + "step": 7416 + }, + { + "epoch": 0.34532206625229883, + "grad_norm": 0.5265042296750513, + "learning_rate": 9.993055217394157e-05, + "loss": 3.4061, + "step": 7417 + }, + { + "epoch": 
0.3453686244383919, + "grad_norm": 0.5388808044557739, + "learning_rate": 9.993040938382635e-05, + "loss": 3.3512, + "step": 7418 + }, + { + "epoch": 0.34541518262448495, + "grad_norm": 0.46753110513393975, + "learning_rate": 9.993026644717032e-05, + "loss": 3.4413, + "step": 7419 + }, + { + "epoch": 0.345461740810578, + "grad_norm": 0.538023565236133, + "learning_rate": 9.993012336397386e-05, + "loss": 3.526, + "step": 7420 + }, + { + "epoch": 0.34550829899667107, + "grad_norm": 0.5444783117887138, + "learning_rate": 9.992998013423741e-05, + "loss": 3.5215, + "step": 7421 + }, + { + "epoch": 0.3455548571827642, + "grad_norm": 0.48814250229944295, + "learning_rate": 9.99298367579614e-05, + "loss": 3.4671, + "step": 7422 + }, + { + "epoch": 0.34560141536885725, + "grad_norm": 0.5000807478441108, + "learning_rate": 9.992969323514623e-05, + "loss": 3.5087, + "step": 7423 + }, + { + "epoch": 0.3456479735549503, + "grad_norm": 0.49645360886206635, + "learning_rate": 9.992954956579233e-05, + "loss": 3.4282, + "step": 7424 + }, + { + "epoch": 0.34569453174104336, + "grad_norm": 0.4775419331241344, + "learning_rate": 9.992940574990012e-05, + "loss": 3.6046, + "step": 7425 + }, + { + "epoch": 0.3457410899271364, + "grad_norm": 0.4907711449146166, + "learning_rate": 9.992926178747002e-05, + "loss": 3.3696, + "step": 7426 + }, + { + "epoch": 0.3457876481132295, + "grad_norm": 0.45918086423695803, + "learning_rate": 9.992911767850247e-05, + "loss": 3.4498, + "step": 7427 + }, + { + "epoch": 0.3458342062993226, + "grad_norm": 0.4851695346157916, + "learning_rate": 9.992897342299786e-05, + "loss": 3.522, + "step": 7428 + }, + { + "epoch": 0.34588076448541566, + "grad_norm": 0.41878159106119767, + "learning_rate": 9.992882902095664e-05, + "loss": 3.3684, + "step": 7429 + }, + { + "epoch": 0.3459273226715087, + "grad_norm": 0.4165465411415452, + "learning_rate": 9.992868447237922e-05, + "loss": 3.4833, + "step": 7430 + }, + { + "epoch": 0.3459738808576018, + "grad_norm": 0.5081525946698505, + "learning_rate": 9.992853977726604e-05, + "loss": 3.4869, + "step": 7431 + }, + { + "epoch": 0.34602043904369484, + "grad_norm": 0.4699593490016458, + "learning_rate": 9.99283949356175e-05, + "loss": 3.5079, + "step": 7432 + }, + { + "epoch": 0.34606699722978795, + "grad_norm": 0.4231304569949861, + "learning_rate": 9.992824994743406e-05, + "loss": 3.3855, + "step": 7433 + }, + { + "epoch": 0.346113555415881, + "grad_norm": 0.4740528834580584, + "learning_rate": 9.99281048127161e-05, + "loss": 3.4701, + "step": 7434 + }, + { + "epoch": 0.3461601136019741, + "grad_norm": 0.5183548053841408, + "learning_rate": 9.99279595314641e-05, + "loss": 3.4591, + "step": 7435 + }, + { + "epoch": 0.34620667178806713, + "grad_norm": 0.5011685181011303, + "learning_rate": 9.992781410367844e-05, + "loss": 3.4843, + "step": 7436 + }, + { + "epoch": 0.3462532299741602, + "grad_norm": 0.5482709823630014, + "learning_rate": 9.992766852935956e-05, + "loss": 3.3679, + "step": 7437 + }, + { + "epoch": 0.34629978816025325, + "grad_norm": 0.4652002737085493, + "learning_rate": 9.99275228085079e-05, + "loss": 3.4441, + "step": 7438 + }, + { + "epoch": 0.34634634634634637, + "grad_norm": 0.48549013276762315, + "learning_rate": 9.992737694112388e-05, + "loss": 3.4747, + "step": 7439 + }, + { + "epoch": 0.3463929045324394, + "grad_norm": 0.5499815472020024, + "learning_rate": 9.992723092720793e-05, + "loss": 3.4181, + "step": 7440 + }, + { + "epoch": 0.3464394627185325, + "grad_norm": 0.4830400544446056, + "learning_rate": 9.992708476676047e-05, + 
"loss": 3.4447, + "step": 7441 + }, + { + "epoch": 0.34648602090462555, + "grad_norm": 0.469806606670936, + "learning_rate": 9.992693845978194e-05, + "loss": 3.4252, + "step": 7442 + }, + { + "epoch": 0.3465325790907186, + "grad_norm": 0.5070638620200045, + "learning_rate": 9.992679200627274e-05, + "loss": 3.4519, + "step": 7443 + }, + { + "epoch": 0.3465791372768117, + "grad_norm": 0.4882173873401045, + "learning_rate": 9.992664540623336e-05, + "loss": 3.4573, + "step": 7444 + }, + { + "epoch": 0.3466256954629048, + "grad_norm": 0.4556962110841786, + "learning_rate": 9.992649865966417e-05, + "loss": 3.468, + "step": 7445 + }, + { + "epoch": 0.34667225364899784, + "grad_norm": 0.5159365638440101, + "learning_rate": 9.992635176656564e-05, + "loss": 3.491, + "step": 7446 + }, + { + "epoch": 0.3467188118350909, + "grad_norm": 0.4945524463240747, + "learning_rate": 9.992620472693818e-05, + "loss": 3.3693, + "step": 7447 + }, + { + "epoch": 0.34676537002118396, + "grad_norm": 0.4794097229731299, + "learning_rate": 9.992605754078223e-05, + "loss": 3.4799, + "step": 7448 + }, + { + "epoch": 0.346811928207277, + "grad_norm": 0.5318855305322746, + "learning_rate": 9.99259102080982e-05, + "loss": 3.3664, + "step": 7449 + }, + { + "epoch": 0.34685848639337014, + "grad_norm": 0.5898636993789786, + "learning_rate": 9.992576272888658e-05, + "loss": 3.4814, + "step": 7450 + }, + { + "epoch": 0.3469050445794632, + "grad_norm": 0.5028608410059537, + "learning_rate": 9.992561510314773e-05, + "loss": 3.4035, + "step": 7451 + }, + { + "epoch": 0.34695160276555626, + "grad_norm": 0.46095351370826376, + "learning_rate": 9.992546733088214e-05, + "loss": 3.4885, + "step": 7452 + }, + { + "epoch": 0.3469981609516493, + "grad_norm": 0.48352665709574294, + "learning_rate": 9.992531941209022e-05, + "loss": 3.4266, + "step": 7453 + }, + { + "epoch": 0.3470447191377424, + "grad_norm": 0.4368820216624425, + "learning_rate": 9.99251713467724e-05, + "loss": 3.4858, + "step": 7454 + }, + { + "epoch": 0.3470912773238355, + "grad_norm": 0.4339202503965736, + "learning_rate": 9.99250231349291e-05, + "loss": 3.4097, + "step": 7455 + }, + { + "epoch": 0.34713783550992855, + "grad_norm": 0.4570070251835177, + "learning_rate": 9.99248747765608e-05, + "loss": 3.5512, + "step": 7456 + }, + { + "epoch": 0.3471843936960216, + "grad_norm": 0.4089501411274993, + "learning_rate": 9.992472627166792e-05, + "loss": 3.493, + "step": 7457 + }, + { + "epoch": 0.34723095188211467, + "grad_norm": 0.4729234072248096, + "learning_rate": 9.992457762025087e-05, + "loss": 3.4321, + "step": 7458 + }, + { + "epoch": 0.34727751006820773, + "grad_norm": 0.5070743050642453, + "learning_rate": 9.99244288223101e-05, + "loss": 3.4171, + "step": 7459 + }, + { + "epoch": 0.3473240682543008, + "grad_norm": 0.42045613928121395, + "learning_rate": 9.992427987784607e-05, + "loss": 3.4902, + "step": 7460 + }, + { + "epoch": 0.3473706264403939, + "grad_norm": 0.421223681162034, + "learning_rate": 9.992413078685918e-05, + "loss": 3.3985, + "step": 7461 + }, + { + "epoch": 0.34741718462648696, + "grad_norm": 0.4064294510080748, + "learning_rate": 9.992398154934988e-05, + "loss": 3.4632, + "step": 7462 + }, + { + "epoch": 0.34746374281258, + "grad_norm": 0.41819747505964916, + "learning_rate": 9.992383216531863e-05, + "loss": 3.3963, + "step": 7463 + }, + { + "epoch": 0.3475103009986731, + "grad_norm": 0.4614767925139998, + "learning_rate": 9.992368263476584e-05, + "loss": 3.372, + "step": 7464 + }, + { + "epoch": 0.34755685918476614, + "grad_norm": 
0.4952408346446196, + "learning_rate": 9.992353295769196e-05, + "loss": 3.3947, + "step": 7465 + }, + { + "epoch": 0.34760341737085926, + "grad_norm": 0.4585544765152171, + "learning_rate": 9.992338313409743e-05, + "loss": 3.2655, + "step": 7466 + }, + { + "epoch": 0.3476499755569523, + "grad_norm": 0.45377555635289674, + "learning_rate": 9.99232331639827e-05, + "loss": 3.4114, + "step": 7467 + }, + { + "epoch": 0.3476965337430454, + "grad_norm": 0.46592424999349746, + "learning_rate": 9.992308304734817e-05, + "loss": 3.3778, + "step": 7468 + }, + { + "epoch": 0.34774309192913844, + "grad_norm": 0.49038166955400314, + "learning_rate": 9.992293278419434e-05, + "loss": 3.3887, + "step": 7469 + }, + { + "epoch": 0.3477896501152315, + "grad_norm": 0.41234553752311437, + "learning_rate": 9.99227823745216e-05, + "loss": 3.4795, + "step": 7470 + }, + { + "epoch": 0.34783620830132456, + "grad_norm": 0.5286469400712158, + "learning_rate": 9.992263181833041e-05, + "loss": 3.4424, + "step": 7471 + }, + { + "epoch": 0.34788276648741767, + "grad_norm": 0.4796839908524919, + "learning_rate": 9.992248111562123e-05, + "loss": 3.4219, + "step": 7472 + }, + { + "epoch": 0.34792932467351073, + "grad_norm": 0.455879206982139, + "learning_rate": 9.992233026639448e-05, + "loss": 3.4246, + "step": 7473 + }, + { + "epoch": 0.3479758828596038, + "grad_norm": 0.5051874935798554, + "learning_rate": 9.992217927065062e-05, + "loss": 3.5, + "step": 7474 + }, + { + "epoch": 0.34802244104569685, + "grad_norm": 0.5347124553622404, + "learning_rate": 9.992202812839006e-05, + "loss": 3.4106, + "step": 7475 + }, + { + "epoch": 0.3480689992317899, + "grad_norm": 0.4505361730220707, + "learning_rate": 9.992187683961328e-05, + "loss": 3.4284, + "step": 7476 + }, + { + "epoch": 0.348115557417883, + "grad_norm": 0.45563931526496576, + "learning_rate": 9.99217254043207e-05, + "loss": 3.5199, + "step": 7477 + }, + { + "epoch": 0.3481621156039761, + "grad_norm": 0.5412681579868177, + "learning_rate": 9.992157382251279e-05, + "loss": 3.4889, + "step": 7478 + }, + { + "epoch": 0.34820867379006915, + "grad_norm": 0.4764895991034028, + "learning_rate": 9.992142209418997e-05, + "loss": 3.4142, + "step": 7479 + }, + { + "epoch": 0.3482552319761622, + "grad_norm": 0.43536094591866453, + "learning_rate": 9.992127021935269e-05, + "loss": 3.5194, + "step": 7480 + }, + { + "epoch": 0.34830179016225526, + "grad_norm": 0.5177642298383566, + "learning_rate": 9.99211181980014e-05, + "loss": 3.5098, + "step": 7481 + }, + { + "epoch": 0.3483483483483483, + "grad_norm": 0.5187487046387448, + "learning_rate": 9.992096603013655e-05, + "loss": 3.5146, + "step": 7482 + }, + { + "epoch": 0.34839490653444144, + "grad_norm": 0.4892410290111195, + "learning_rate": 9.992081371575858e-05, + "loss": 3.5027, + "step": 7483 + }, + { + "epoch": 0.3484414647205345, + "grad_norm": 0.4693899350592047, + "learning_rate": 9.992066125486794e-05, + "loss": 3.4419, + "step": 7484 + }, + { + "epoch": 0.34848802290662756, + "grad_norm": 0.4928497311446939, + "learning_rate": 9.992050864746508e-05, + "loss": 3.5207, + "step": 7485 + }, + { + "epoch": 0.3485345810927206, + "grad_norm": 0.44644504212636615, + "learning_rate": 9.992035589355045e-05, + "loss": 3.3626, + "step": 7486 + }, + { + "epoch": 0.3485811392788137, + "grad_norm": 0.4798708033617233, + "learning_rate": 9.992020299312449e-05, + "loss": 3.3576, + "step": 7487 + }, + { + "epoch": 0.3486276974649068, + "grad_norm": 0.43575525885000305, + "learning_rate": 9.992004994618764e-05, + "loss": 3.4888, + "step": 7488 
+ }, + { + "epoch": 0.34867425565099985, + "grad_norm": 0.4912079552547252, + "learning_rate": 9.991989675274037e-05, + "loss": 3.4812, + "step": 7489 + }, + { + "epoch": 0.3487208138370929, + "grad_norm": 0.4782497911054012, + "learning_rate": 9.991974341278312e-05, + "loss": 3.4458, + "step": 7490 + }, + { + "epoch": 0.348767372023186, + "grad_norm": 0.4051639004433723, + "learning_rate": 9.991958992631635e-05, + "loss": 3.4711, + "step": 7491 + }, + { + "epoch": 0.34881393020927903, + "grad_norm": 0.470931026987332, + "learning_rate": 9.991943629334049e-05, + "loss": 3.5084, + "step": 7492 + }, + { + "epoch": 0.3488604883953721, + "grad_norm": 0.455935787875864, + "learning_rate": 9.991928251385601e-05, + "loss": 3.4697, + "step": 7493 + }, + { + "epoch": 0.3489070465814652, + "grad_norm": 0.438816915249467, + "learning_rate": 9.991912858786334e-05, + "loss": 3.4999, + "step": 7494 + }, + { + "epoch": 0.34895360476755827, + "grad_norm": 0.38129678964653063, + "learning_rate": 9.991897451536297e-05, + "loss": 3.3273, + "step": 7495 + }, + { + "epoch": 0.3490001629536513, + "grad_norm": 0.4436529445883258, + "learning_rate": 9.991882029635531e-05, + "loss": 3.5579, + "step": 7496 + }, + { + "epoch": 0.3490467211397444, + "grad_norm": 0.4683967577223073, + "learning_rate": 9.991866593084083e-05, + "loss": 3.4444, + "step": 7497 + }, + { + "epoch": 0.34909327932583745, + "grad_norm": 0.38578301514055396, + "learning_rate": 9.991851141881999e-05, + "loss": 3.5103, + "step": 7498 + }, + { + "epoch": 0.34913983751193056, + "grad_norm": 0.4005854451374931, + "learning_rate": 9.991835676029324e-05, + "loss": 3.3826, + "step": 7499 + }, + { + "epoch": 0.3491863956980236, + "grad_norm": 0.3453225124437245, + "learning_rate": 9.991820195526103e-05, + "loss": 3.4488, + "step": 7500 + }, + { + "epoch": 0.3492329538841167, + "grad_norm": 0.3862782184314333, + "learning_rate": 9.99180470037238e-05, + "loss": 3.4691, + "step": 7501 + }, + { + "epoch": 0.34927951207020974, + "grad_norm": 0.3946380189414324, + "learning_rate": 9.991789190568203e-05, + "loss": 3.5049, + "step": 7502 + }, + { + "epoch": 0.3493260702563028, + "grad_norm": 0.37971010093081364, + "learning_rate": 9.991773666113616e-05, + "loss": 3.426, + "step": 7503 + }, + { + "epoch": 0.34937262844239586, + "grad_norm": 0.38643040330569456, + "learning_rate": 9.991758127008666e-05, + "loss": 3.472, + "step": 7504 + }, + { + "epoch": 0.349419186628489, + "grad_norm": 0.46591253217265804, + "learning_rate": 9.991742573253397e-05, + "loss": 3.3962, + "step": 7505 + }, + { + "epoch": 0.34946574481458204, + "grad_norm": 0.45667393228805464, + "learning_rate": 9.991727004847855e-05, + "loss": 3.4307, + "step": 7506 + }, + { + "epoch": 0.3495123030006751, + "grad_norm": 0.4120871676443876, + "learning_rate": 9.991711421792087e-05, + "loss": 3.48, + "step": 7507 + }, + { + "epoch": 0.34955886118676815, + "grad_norm": 0.4831214171420471, + "learning_rate": 9.991695824086137e-05, + "loss": 3.3921, + "step": 7508 + }, + { + "epoch": 0.3496054193728612, + "grad_norm": 0.43725683607942367, + "learning_rate": 9.991680211730053e-05, + "loss": 3.4863, + "step": 7509 + }, + { + "epoch": 0.34965197755895433, + "grad_norm": 0.3852058314366577, + "learning_rate": 9.991664584723876e-05, + "loss": 3.4339, + "step": 7510 + }, + { + "epoch": 0.3496985357450474, + "grad_norm": 0.44448049011657126, + "learning_rate": 9.991648943067658e-05, + "loss": 3.3915, + "step": 7511 + }, + { + "epoch": 0.34974509393114045, + "grad_norm": 0.4477965619978481, + "learning_rate": 
9.991633286761441e-05, + "loss": 3.5288, + "step": 7512 + }, + { + "epoch": 0.3497916521172335, + "grad_norm": 0.407975248523766, + "learning_rate": 9.991617615805273e-05, + "loss": 3.4723, + "step": 7513 + }, + { + "epoch": 0.34983821030332657, + "grad_norm": 0.4835605062738611, + "learning_rate": 9.9916019301992e-05, + "loss": 3.3857, + "step": 7514 + }, + { + "epoch": 0.34988476848941963, + "grad_norm": 0.48366181568496214, + "learning_rate": 9.991586229943264e-05, + "loss": 3.5248, + "step": 7515 + }, + { + "epoch": 0.34993132667551274, + "grad_norm": 0.4715911124704833, + "learning_rate": 9.991570515037517e-05, + "loss": 3.475, + "step": 7516 + }, + { + "epoch": 0.3499778848616058, + "grad_norm": 0.525338809655857, + "learning_rate": 9.991554785482e-05, + "loss": 3.5257, + "step": 7517 + }, + { + "epoch": 0.35002444304769886, + "grad_norm": 0.4805817722749713, + "learning_rate": 9.991539041276763e-05, + "loss": 3.4852, + "step": 7518 + }, + { + "epoch": 0.3500710012337919, + "grad_norm": 0.4679688586730614, + "learning_rate": 9.991523282421849e-05, + "loss": 3.4407, + "step": 7519 + }, + { + "epoch": 0.350117559419885, + "grad_norm": 0.4249438004025822, + "learning_rate": 9.991507508917309e-05, + "loss": 3.4336, + "step": 7520 + }, + { + "epoch": 0.3501641176059781, + "grad_norm": 0.44247050319905884, + "learning_rate": 9.991491720763183e-05, + "loss": 3.5687, + "step": 7521 + }, + { + "epoch": 0.35021067579207116, + "grad_norm": 0.5377611276042243, + "learning_rate": 9.991475917959521e-05, + "loss": 3.471, + "step": 7522 + }, + { + "epoch": 0.3502572339781642, + "grad_norm": 0.4811593157397403, + "learning_rate": 9.991460100506372e-05, + "loss": 3.4345, + "step": 7523 + }, + { + "epoch": 0.3503037921642573, + "grad_norm": 0.4689711923881089, + "learning_rate": 9.991444268403776e-05, + "loss": 3.4181, + "step": 7524 + }, + { + "epoch": 0.35035035035035034, + "grad_norm": 0.43551361979156017, + "learning_rate": 9.991428421651783e-05, + "loss": 3.4658, + "step": 7525 + }, + { + "epoch": 0.3503969085364434, + "grad_norm": 0.4118632436657589, + "learning_rate": 9.991412560250441e-05, + "loss": 3.3788, + "step": 7526 + }, + { + "epoch": 0.3504434667225365, + "grad_norm": 0.45570762344941185, + "learning_rate": 9.991396684199795e-05, + "loss": 3.4377, + "step": 7527 + }, + { + "epoch": 0.35049002490862957, + "grad_norm": 0.4226187766825202, + "learning_rate": 9.99138079349989e-05, + "loss": 3.4117, + "step": 7528 + }, + { + "epoch": 0.35053658309472263, + "grad_norm": 0.45599946698201516, + "learning_rate": 9.991364888150773e-05, + "loss": 3.4659, + "step": 7529 + }, + { + "epoch": 0.3505831412808157, + "grad_norm": 0.41875140832394014, + "learning_rate": 9.991348968152495e-05, + "loss": 3.4364, + "step": 7530 + }, + { + "epoch": 0.35062969946690875, + "grad_norm": 0.4182594981253992, + "learning_rate": 9.991333033505097e-05, + "loss": 3.4884, + "step": 7531 + }, + { + "epoch": 0.35067625765300187, + "grad_norm": 0.43341103525373303, + "learning_rate": 9.99131708420863e-05, + "loss": 3.4907, + "step": 7532 + }, + { + "epoch": 0.3507228158390949, + "grad_norm": 0.4108621006266734, + "learning_rate": 9.99130112026314e-05, + "loss": 3.4691, + "step": 7533 + }, + { + "epoch": 0.350769374025188, + "grad_norm": 0.411943180559505, + "learning_rate": 9.99128514166867e-05, + "loss": 3.4684, + "step": 7534 + }, + { + "epoch": 0.35081593221128105, + "grad_norm": 0.43601045258347026, + "learning_rate": 9.991269148425271e-05, + "loss": 3.4115, + "step": 7535 + }, + { + "epoch": 0.3508624903973741, + 
"grad_norm": 0.43898660402036627, + "learning_rate": 9.991253140532988e-05, + "loss": 3.3837, + "step": 7536 + }, + { + "epoch": 0.35090904858346716, + "grad_norm": 0.40786093603568424, + "learning_rate": 9.99123711799187e-05, + "loss": 3.4706, + "step": 7537 + }, + { + "epoch": 0.3509556067695603, + "grad_norm": 0.4662240397701045, + "learning_rate": 9.991221080801963e-05, + "loss": 3.4381, + "step": 7538 + }, + { + "epoch": 0.35100216495565334, + "grad_norm": 0.45883137554434816, + "learning_rate": 9.991205028963314e-05, + "loss": 3.4333, + "step": 7539 + }, + { + "epoch": 0.3510487231417464, + "grad_norm": 0.4133567868059798, + "learning_rate": 9.991188962475968e-05, + "loss": 3.3535, + "step": 7540 + }, + { + "epoch": 0.35109528132783946, + "grad_norm": 0.5289619492153589, + "learning_rate": 9.991172881339975e-05, + "loss": 3.4414, + "step": 7541 + }, + { + "epoch": 0.3511418395139325, + "grad_norm": 0.6300300398691433, + "learning_rate": 9.991156785555382e-05, + "loss": 3.591, + "step": 7542 + }, + { + "epoch": 0.35118839770002563, + "grad_norm": 0.5671079485938583, + "learning_rate": 9.991140675122234e-05, + "loss": 3.476, + "step": 7543 + }, + { + "epoch": 0.3512349558861187, + "grad_norm": 0.4881306096552307, + "learning_rate": 9.991124550040582e-05, + "loss": 3.3416, + "step": 7544 + }, + { + "epoch": 0.35128151407221175, + "grad_norm": 0.5270536847780224, + "learning_rate": 9.99110841031047e-05, + "loss": 3.4765, + "step": 7545 + }, + { + "epoch": 0.3513280722583048, + "grad_norm": 0.5715678401554711, + "learning_rate": 9.991092255931946e-05, + "loss": 3.4303, + "step": 7546 + }, + { + "epoch": 0.3513746304443979, + "grad_norm": 0.6100875504339317, + "learning_rate": 9.99107608690506e-05, + "loss": 3.4147, + "step": 7547 + }, + { + "epoch": 0.35142118863049093, + "grad_norm": 0.6252004031449785, + "learning_rate": 9.991059903229856e-05, + "loss": 3.5466, + "step": 7548 + }, + { + "epoch": 0.35146774681658405, + "grad_norm": 0.4878462875011289, + "learning_rate": 9.991043704906382e-05, + "loss": 3.3829, + "step": 7549 + }, + { + "epoch": 0.3515143050026771, + "grad_norm": 0.47144439389949994, + "learning_rate": 9.991027491934687e-05, + "loss": 3.4543, + "step": 7550 + }, + { + "epoch": 0.35156086318877017, + "grad_norm": 0.6056693737093531, + "learning_rate": 9.99101126431482e-05, + "loss": 3.4364, + "step": 7551 + }, + { + "epoch": 0.3516074213748632, + "grad_norm": 0.6189444758044176, + "learning_rate": 9.990995022046824e-05, + "loss": 3.482, + "step": 7552 + }, + { + "epoch": 0.3516539795609563, + "grad_norm": 0.5605039710008409, + "learning_rate": 9.99097876513075e-05, + "loss": 3.5302, + "step": 7553 + }, + { + "epoch": 0.3517005377470494, + "grad_norm": 0.5283160232603332, + "learning_rate": 9.990962493566644e-05, + "loss": 3.4208, + "step": 7554 + }, + { + "epoch": 0.35174709593314246, + "grad_norm": 0.4754977029487563, + "learning_rate": 9.990946207354557e-05, + "loss": 3.3119, + "step": 7555 + }, + { + "epoch": 0.3517936541192355, + "grad_norm": 0.46881808770067906, + "learning_rate": 9.990929906494534e-05, + "loss": 3.403, + "step": 7556 + }, + { + "epoch": 0.3518402123053286, + "grad_norm": 0.49998912996280104, + "learning_rate": 9.990913590986624e-05, + "loss": 3.4237, + "step": 7557 + }, + { + "epoch": 0.35188677049142164, + "grad_norm": 0.5124083395089152, + "learning_rate": 9.990897260830873e-05, + "loss": 3.5073, + "step": 7558 + }, + { + "epoch": 0.3519333286775147, + "grad_norm": 0.4840975741580288, + "learning_rate": 9.99088091602733e-05, + "loss": 3.3767, + 
"step": 7559 + }, + { + "epoch": 0.3519798868636078, + "grad_norm": 0.44469642714247604, + "learning_rate": 9.990864556576044e-05, + "loss": 3.4307, + "step": 7560 + }, + { + "epoch": 0.3520264450497009, + "grad_norm": 0.44435179798532876, + "learning_rate": 9.990848182477061e-05, + "loss": 3.3594, + "step": 7561 + }, + { + "epoch": 0.35207300323579394, + "grad_norm": 0.47746136613429535, + "learning_rate": 9.990831793730432e-05, + "loss": 3.4782, + "step": 7562 + }, + { + "epoch": 0.352119561421887, + "grad_norm": 0.47252818743370906, + "learning_rate": 9.990815390336203e-05, + "loss": 3.4383, + "step": 7563 + }, + { + "epoch": 0.35216611960798005, + "grad_norm": 0.4806440822146852, + "learning_rate": 9.990798972294422e-05, + "loss": 3.4634, + "step": 7564 + }, + { + "epoch": 0.35221267779407317, + "grad_norm": 0.4759816680741179, + "learning_rate": 9.990782539605137e-05, + "loss": 3.478, + "step": 7565 + }, + { + "epoch": 0.35225923598016623, + "grad_norm": 0.47013265773909846, + "learning_rate": 9.990766092268398e-05, + "loss": 3.3662, + "step": 7566 + }, + { + "epoch": 0.3523057941662593, + "grad_norm": 0.4318256921299041, + "learning_rate": 9.990749630284252e-05, + "loss": 3.5292, + "step": 7567 + }, + { + "epoch": 0.35235235235235235, + "grad_norm": 0.4105492467546388, + "learning_rate": 9.990733153652747e-05, + "loss": 3.4004, + "step": 7568 + }, + { + "epoch": 0.3523989105384454, + "grad_norm": 0.49620765987784066, + "learning_rate": 9.990716662373932e-05, + "loss": 3.4582, + "step": 7569 + }, + { + "epoch": 0.35244546872453847, + "grad_norm": 0.5136859148824968, + "learning_rate": 9.990700156447856e-05, + "loss": 3.4628, + "step": 7570 + }, + { + "epoch": 0.3524920269106316, + "grad_norm": 0.43522353067466496, + "learning_rate": 9.990683635874568e-05, + "loss": 3.3804, + "step": 7571 + }, + { + "epoch": 0.35253858509672464, + "grad_norm": 0.4999338176686607, + "learning_rate": 9.990667100654112e-05, + "loss": 3.4655, + "step": 7572 + }, + { + "epoch": 0.3525851432828177, + "grad_norm": 0.5611411551105323, + "learning_rate": 9.99065055078654e-05, + "loss": 3.3983, + "step": 7573 + }, + { + "epoch": 0.35263170146891076, + "grad_norm": 0.4470458164462133, + "learning_rate": 9.990633986271902e-05, + "loss": 3.4688, + "step": 7574 + }, + { + "epoch": 0.3526782596550038, + "grad_norm": 0.43913429353792083, + "learning_rate": 9.990617407110245e-05, + "loss": 3.38, + "step": 7575 + }, + { + "epoch": 0.35272481784109694, + "grad_norm": 0.42592942372792475, + "learning_rate": 9.990600813301615e-05, + "loss": 3.5023, + "step": 7576 + }, + { + "epoch": 0.35277137602719, + "grad_norm": 0.4580866150927702, + "learning_rate": 9.990584204846068e-05, + "loss": 3.4612, + "step": 7577 + }, + { + "epoch": 0.35281793421328306, + "grad_norm": 0.4370882791756764, + "learning_rate": 9.990567581743644e-05, + "loss": 3.3135, + "step": 7578 + }, + { + "epoch": 0.3528644923993761, + "grad_norm": 0.4559601328960056, + "learning_rate": 9.990550943994396e-05, + "loss": 3.3813, + "step": 7579 + }, + { + "epoch": 0.3529110505854692, + "grad_norm": 0.41581910353206, + "learning_rate": 9.990534291598375e-05, + "loss": 3.4358, + "step": 7580 + }, + { + "epoch": 0.35295760877156224, + "grad_norm": 0.4447415172190342, + "learning_rate": 9.990517624555625e-05, + "loss": 3.4431, + "step": 7581 + }, + { + "epoch": 0.35300416695765535, + "grad_norm": 0.4147284023018046, + "learning_rate": 9.990500942866199e-05, + "loss": 3.3483, + "step": 7582 + }, + { + "epoch": 0.3530507251437484, + "grad_norm": 0.4561174427758631, + 
"learning_rate": 9.990484246530144e-05, + "loss": 3.3998, + "step": 7583 + }, + { + "epoch": 0.35309728332984147, + "grad_norm": 0.4239637778735582, + "learning_rate": 9.99046753554751e-05, + "loss": 3.4356, + "step": 7584 + }, + { + "epoch": 0.35314384151593453, + "grad_norm": 0.4427975148471932, + "learning_rate": 9.990450809918344e-05, + "loss": 3.3439, + "step": 7585 + }, + { + "epoch": 0.3531903997020276, + "grad_norm": 0.4332765223676898, + "learning_rate": 9.990434069642697e-05, + "loss": 3.4106, + "step": 7586 + }, + { + "epoch": 0.3532369578881207, + "grad_norm": 0.4016362556820517, + "learning_rate": 9.990417314720618e-05, + "loss": 3.4337, + "step": 7587 + }, + { + "epoch": 0.35328351607421377, + "grad_norm": 0.40413994476482185, + "learning_rate": 9.990400545152157e-05, + "loss": 3.3494, + "step": 7588 + }, + { + "epoch": 0.3533300742603068, + "grad_norm": 0.5152552103518717, + "learning_rate": 9.99038376093736e-05, + "loss": 3.3655, + "step": 7589 + }, + { + "epoch": 0.3533766324463999, + "grad_norm": 0.578022541900094, + "learning_rate": 9.990366962076279e-05, + "loss": 3.5256, + "step": 7590 + }, + { + "epoch": 0.35342319063249295, + "grad_norm": 0.6133896350651955, + "learning_rate": 9.990350148568963e-05, + "loss": 3.4705, + "step": 7591 + }, + { + "epoch": 0.353469748818586, + "grad_norm": 0.5444666089677253, + "learning_rate": 9.99033332041546e-05, + "loss": 3.4159, + "step": 7592 + }, + { + "epoch": 0.3535163070046791, + "grad_norm": 0.513781560123175, + "learning_rate": 9.990316477615821e-05, + "loss": 3.4828, + "step": 7593 + }, + { + "epoch": 0.3535628651907722, + "grad_norm": 0.4812805814878925, + "learning_rate": 9.990299620170093e-05, + "loss": 3.5142, + "step": 7594 + }, + { + "epoch": 0.35360942337686524, + "grad_norm": 0.4201785583553799, + "learning_rate": 9.99028274807833e-05, + "loss": 3.317, + "step": 7595 + }, + { + "epoch": 0.3536559815629583, + "grad_norm": 0.4262919928556352, + "learning_rate": 9.990265861340578e-05, + "loss": 3.4465, + "step": 7596 + }, + { + "epoch": 0.35370253974905136, + "grad_norm": 0.41788484834900974, + "learning_rate": 9.990248959956887e-05, + "loss": 3.3643, + "step": 7597 + }, + { + "epoch": 0.3537490979351445, + "grad_norm": 0.43034959793689986, + "learning_rate": 9.990232043927308e-05, + "loss": 3.4713, + "step": 7598 + }, + { + "epoch": 0.35379565612123753, + "grad_norm": 0.42729504616125463, + "learning_rate": 9.990215113251889e-05, + "loss": 3.5036, + "step": 7599 + }, + { + "epoch": 0.3538422143073306, + "grad_norm": 0.4700062436088594, + "learning_rate": 9.990198167930678e-05, + "loss": 3.4321, + "step": 7600 + }, + { + "epoch": 0.35388877249342365, + "grad_norm": 0.4730721251602537, + "learning_rate": 9.990181207963729e-05, + "loss": 3.4294, + "step": 7601 + }, + { + "epoch": 0.3539353306795167, + "grad_norm": 0.45454107382979114, + "learning_rate": 9.990164233351089e-05, + "loss": 3.4937, + "step": 7602 + }, + { + "epoch": 0.3539818888656098, + "grad_norm": 0.4676163526463461, + "learning_rate": 9.990147244092809e-05, + "loss": 3.5077, + "step": 7603 + }, + { + "epoch": 0.3540284470517029, + "grad_norm": 0.43077477507830053, + "learning_rate": 9.99013024018894e-05, + "loss": 3.4175, + "step": 7604 + }, + { + "epoch": 0.35407500523779595, + "grad_norm": 0.4198919509013736, + "learning_rate": 9.990113221639528e-05, + "loss": 3.4633, + "step": 7605 + }, + { + "epoch": 0.354121563423889, + "grad_norm": 0.4353606689282414, + "learning_rate": 9.990096188444627e-05, + "loss": 3.4068, + "step": 7606 + }, + { + "epoch": 
0.35416812160998207, + "grad_norm": 0.45416562032082386, + "learning_rate": 9.990079140604283e-05, + "loss": 3.4034, + "step": 7607 + }, + { + "epoch": 0.3542146797960751, + "grad_norm": 0.41817376666499023, + "learning_rate": 9.990062078118552e-05, + "loss": 3.5207, + "step": 7608 + }, + { + "epoch": 0.35426123798216824, + "grad_norm": 0.44988182145775596, + "learning_rate": 9.990045000987477e-05, + "loss": 3.3717, + "step": 7609 + }, + { + "epoch": 0.3543077961682613, + "grad_norm": 0.44069665615554204, + "learning_rate": 9.990027909211113e-05, + "loss": 3.3609, + "step": 7610 + }, + { + "epoch": 0.35435435435435436, + "grad_norm": 0.43452152210471257, + "learning_rate": 9.990010802789509e-05, + "loss": 3.4812, + "step": 7611 + }, + { + "epoch": 0.3544009125404474, + "grad_norm": 0.4933801899854551, + "learning_rate": 9.989993681722713e-05, + "loss": 3.454, + "step": 7612 + }, + { + "epoch": 0.3544474707265405, + "grad_norm": 0.449266982896988, + "learning_rate": 9.98997654601078e-05, + "loss": 3.3454, + "step": 7613 + }, + { + "epoch": 0.35449402891263354, + "grad_norm": 0.4476400144628461, + "learning_rate": 9.989959395653755e-05, + "loss": 3.4546, + "step": 7614 + }, + { + "epoch": 0.35454058709872666, + "grad_norm": 0.42972598762816894, + "learning_rate": 9.989942230651692e-05, + "loss": 3.4741, + "step": 7615 + }, + { + "epoch": 0.3545871452848197, + "grad_norm": 0.44099901730267677, + "learning_rate": 9.98992505100464e-05, + "loss": 3.3617, + "step": 7616 + }, + { + "epoch": 0.3546337034709128, + "grad_norm": 0.487177629933555, + "learning_rate": 9.989907856712649e-05, + "loss": 3.4442, + "step": 7617 + }, + { + "epoch": 0.35468026165700584, + "grad_norm": 0.47998249432590806, + "learning_rate": 9.989890647775771e-05, + "loss": 3.5113, + "step": 7618 + }, + { + "epoch": 0.3547268198430989, + "grad_norm": 0.4451117695502231, + "learning_rate": 9.989873424194053e-05, + "loss": 3.4217, + "step": 7619 + }, + { + "epoch": 0.354773378029192, + "grad_norm": 0.5768479325876855, + "learning_rate": 9.989856185967552e-05, + "loss": 3.4159, + "step": 7620 + }, + { + "epoch": 0.35481993621528507, + "grad_norm": 0.539424195992226, + "learning_rate": 9.989838933096311e-05, + "loss": 3.4656, + "step": 7621 + }, + { + "epoch": 0.35486649440137813, + "grad_norm": 0.4439118220165481, + "learning_rate": 9.989821665580388e-05, + "loss": 3.55, + "step": 7622 + }, + { + "epoch": 0.3549130525874712, + "grad_norm": 0.46536756788364225, + "learning_rate": 9.989804383419827e-05, + "loss": 3.5033, + "step": 7623 + }, + { + "epoch": 0.35495961077356425, + "grad_norm": 0.4991068225890251, + "learning_rate": 9.989787086614684e-05, + "loss": 3.4558, + "step": 7624 + }, + { + "epoch": 0.3550061689596573, + "grad_norm": 0.5152699269803922, + "learning_rate": 9.989769775165006e-05, + "loss": 3.5155, + "step": 7625 + }, + { + "epoch": 0.3550527271457504, + "grad_norm": 0.48228683989531024, + "learning_rate": 9.989752449070844e-05, + "loss": 3.4935, + "step": 7626 + }, + { + "epoch": 0.3550992853318435, + "grad_norm": 0.4686494958057004, + "learning_rate": 9.989735108332253e-05, + "loss": 3.5326, + "step": 7627 + }, + { + "epoch": 0.35514584351793654, + "grad_norm": 0.4784110325132191, + "learning_rate": 9.98971775294928e-05, + "loss": 3.4479, + "step": 7628 + }, + { + "epoch": 0.3551924017040296, + "grad_norm": 0.5043443722697947, + "learning_rate": 9.989700382921976e-05, + "loss": 3.4566, + "step": 7629 + }, + { + "epoch": 0.35523895989012266, + "grad_norm": 0.4884895216534559, + "learning_rate": 
9.989682998250394e-05, + "loss": 3.3413, + "step": 7630 + }, + { + "epoch": 0.3552855180762158, + "grad_norm": 0.4432435956979018, + "learning_rate": 9.989665598934583e-05, + "loss": 3.3637, + "step": 7631 + }, + { + "epoch": 0.35533207626230884, + "grad_norm": 0.5137116033763547, + "learning_rate": 9.989648184974594e-05, + "loss": 3.4491, + "step": 7632 + }, + { + "epoch": 0.3553786344484019, + "grad_norm": 0.5035214819958961, + "learning_rate": 9.989630756370482e-05, + "loss": 3.4788, + "step": 7633 + }, + { + "epoch": 0.35542519263449496, + "grad_norm": 0.4536492443167841, + "learning_rate": 9.989613313122293e-05, + "loss": 3.438, + "step": 7634 + }, + { + "epoch": 0.355471750820588, + "grad_norm": 0.4342015873947089, + "learning_rate": 9.989595855230083e-05, + "loss": 3.48, + "step": 7635 + }, + { + "epoch": 0.3555183090066811, + "grad_norm": 0.43555696875096506, + "learning_rate": 9.989578382693899e-05, + "loss": 3.4143, + "step": 7636 + }, + { + "epoch": 0.3555648671927742, + "grad_norm": 0.4900605719870941, + "learning_rate": 9.989560895513795e-05, + "loss": 3.4349, + "step": 7637 + }, + { + "epoch": 0.35561142537886725, + "grad_norm": 0.4266302552908264, + "learning_rate": 9.989543393689819e-05, + "loss": 3.3865, + "step": 7638 + }, + { + "epoch": 0.3556579835649603, + "grad_norm": 0.44795890067825733, + "learning_rate": 9.989525877222026e-05, + "loss": 3.4188, + "step": 7639 + }, + { + "epoch": 0.35570454175105337, + "grad_norm": 0.4609922258092133, + "learning_rate": 9.989508346110466e-05, + "loss": 3.372, + "step": 7640 + }, + { + "epoch": 0.35575109993714643, + "grad_norm": 0.4019062294845843, + "learning_rate": 9.989490800355191e-05, + "loss": 3.3425, + "step": 7641 + }, + { + "epoch": 0.35579765812323955, + "grad_norm": 0.5137049139252979, + "learning_rate": 9.989473239956254e-05, + "loss": 3.4081, + "step": 7642 + }, + { + "epoch": 0.3558442163093326, + "grad_norm": 0.5174122471417125, + "learning_rate": 9.989455664913701e-05, + "loss": 3.6196, + "step": 7643 + }, + { + "epoch": 0.35589077449542567, + "grad_norm": 0.49084330135181575, + "learning_rate": 9.989438075227589e-05, + "loss": 3.3239, + "step": 7644 + }, + { + "epoch": 0.3559373326815187, + "grad_norm": 0.47174630375334337, + "learning_rate": 9.989420470897968e-05, + "loss": 3.4796, + "step": 7645 + }, + { + "epoch": 0.3559838908676118, + "grad_norm": 0.4522102025361618, + "learning_rate": 9.989402851924887e-05, + "loss": 3.4122, + "step": 7646 + }, + { + "epoch": 0.35603044905370484, + "grad_norm": 0.5157405356823233, + "learning_rate": 9.989385218308403e-05, + "loss": 3.4872, + "step": 7647 + }, + { + "epoch": 0.35607700723979796, + "grad_norm": 0.439977453161824, + "learning_rate": 9.989367570048565e-05, + "loss": 3.4686, + "step": 7648 + }, + { + "epoch": 0.356123565425891, + "grad_norm": 0.4524364427879116, + "learning_rate": 9.989349907145422e-05, + "loss": 3.4713, + "step": 7649 + }, + { + "epoch": 0.3561701236119841, + "grad_norm": 0.47118799536234973, + "learning_rate": 9.989332229599031e-05, + "loss": 3.3544, + "step": 7650 + }, + { + "epoch": 0.35621668179807714, + "grad_norm": 0.46574587170725834, + "learning_rate": 9.98931453740944e-05, + "loss": 3.398, + "step": 7651 + }, + { + "epoch": 0.3562632399841702, + "grad_norm": 0.44444591609691436, + "learning_rate": 9.989296830576704e-05, + "loss": 3.3847, + "step": 7652 + }, + { + "epoch": 0.35630979817026326, + "grad_norm": 0.43120870124209976, + "learning_rate": 9.989279109100873e-05, + "loss": 3.3997, + "step": 7653 + }, + { + "epoch": 
0.3563563563563564, + "grad_norm": 0.49611698553696876, + "learning_rate": 9.989261372981997e-05, + "loss": 3.4082, + "step": 7654 + }, + { + "epoch": 0.35640291454244943, + "grad_norm": 0.44610447367849315, + "learning_rate": 9.989243622220133e-05, + "loss": 3.5101, + "step": 7655 + }, + { + "epoch": 0.3564494727285425, + "grad_norm": 0.49940321871958987, + "learning_rate": 9.98922585681533e-05, + "loss": 3.4639, + "step": 7656 + }, + { + "epoch": 0.35649603091463555, + "grad_norm": 0.5478632916215985, + "learning_rate": 9.98920807676764e-05, + "loss": 3.3594, + "step": 7657 + }, + { + "epoch": 0.3565425891007286, + "grad_norm": 0.5284948754489895, + "learning_rate": 9.989190282077117e-05, + "loss": 3.4831, + "step": 7658 + }, + { + "epoch": 0.35658914728682173, + "grad_norm": 0.6239589773802614, + "learning_rate": 9.98917247274381e-05, + "loss": 3.3702, + "step": 7659 + }, + { + "epoch": 0.3566357054729148, + "grad_norm": 0.5701594962856521, + "learning_rate": 9.989154648767774e-05, + "loss": 3.4022, + "step": 7660 + }, + { + "epoch": 0.35668226365900785, + "grad_norm": 0.5189777700905034, + "learning_rate": 9.989136810149062e-05, + "loss": 3.5174, + "step": 7661 + }, + { + "epoch": 0.3567288218451009, + "grad_norm": 0.48084518663527737, + "learning_rate": 9.989118956887725e-05, + "loss": 3.3343, + "step": 7662 + }, + { + "epoch": 0.35677538003119397, + "grad_norm": 0.49023666328102883, + "learning_rate": 9.989101088983813e-05, + "loss": 3.4622, + "step": 7663 + }, + { + "epoch": 0.356821938217287, + "grad_norm": 0.4133895531096925, + "learning_rate": 9.989083206437382e-05, + "loss": 3.4413, + "step": 7664 + }, + { + "epoch": 0.35686849640338014, + "grad_norm": 0.46129366224855783, + "learning_rate": 9.989065309248484e-05, + "loss": 3.4149, + "step": 7665 + }, + { + "epoch": 0.3569150545894732, + "grad_norm": 0.42151483663932443, + "learning_rate": 9.98904739741717e-05, + "loss": 3.2595, + "step": 7666 + }, + { + "epoch": 0.35696161277556626, + "grad_norm": 0.4328037976781641, + "learning_rate": 9.989029470943494e-05, + "loss": 3.3862, + "step": 7667 + }, + { + "epoch": 0.3570081709616593, + "grad_norm": 0.44204587765885006, + "learning_rate": 9.989011529827507e-05, + "loss": 3.385, + "step": 7668 + }, + { + "epoch": 0.3570547291477524, + "grad_norm": 0.46621148553253516, + "learning_rate": 9.988993574069263e-05, + "loss": 3.4344, + "step": 7669 + }, + { + "epoch": 0.3571012873338455, + "grad_norm": 0.45064834807665005, + "learning_rate": 9.988975603668815e-05, + "loss": 3.4481, + "step": 7670 + }, + { + "epoch": 0.35714784551993856, + "grad_norm": 0.4218154014789155, + "learning_rate": 9.988957618626216e-05, + "loss": 3.3566, + "step": 7671 + }, + { + "epoch": 0.3571944037060316, + "grad_norm": 0.4196865812427401, + "learning_rate": 9.988939618941516e-05, + "loss": 3.4685, + "step": 7672 + }, + { + "epoch": 0.3572409618921247, + "grad_norm": 0.4497086706473259, + "learning_rate": 9.98892160461477e-05, + "loss": 3.3395, + "step": 7673 + }, + { + "epoch": 0.35728752007821774, + "grad_norm": 0.442592418272852, + "learning_rate": 9.988903575646031e-05, + "loss": 3.476, + "step": 7674 + }, + { + "epoch": 0.3573340782643108, + "grad_norm": 0.40497456176781194, + "learning_rate": 9.988885532035353e-05, + "loss": 3.4329, + "step": 7675 + }, + { + "epoch": 0.3573806364504039, + "grad_norm": 0.4460491830881733, + "learning_rate": 9.988867473782784e-05, + "loss": 3.3997, + "step": 7676 + }, + { + "epoch": 0.35742719463649697, + "grad_norm": 0.42724224364102803, + "learning_rate": 
9.988849400888382e-05, + "loss": 3.3835, + "step": 7677 + }, + { + "epoch": 0.35747375282259003, + "grad_norm": 0.4707507189742629, + "learning_rate": 9.9888313133522e-05, + "loss": 3.3766, + "step": 7678 + }, + { + "epoch": 0.3575203110086831, + "grad_norm": 0.44600150606699723, + "learning_rate": 9.988813211174288e-05, + "loss": 3.4353, + "step": 7679 + }, + { + "epoch": 0.35756686919477615, + "grad_norm": 0.47834314262805394, + "learning_rate": 9.988795094354701e-05, + "loss": 3.4412, + "step": 7680 + }, + { + "epoch": 0.35761342738086926, + "grad_norm": 0.4083975342385705, + "learning_rate": 9.988776962893494e-05, + "loss": 3.3562, + "step": 7681 + }, + { + "epoch": 0.3576599855669623, + "grad_norm": 0.4692065755230704, + "learning_rate": 9.988758816790715e-05, + "loss": 3.4228, + "step": 7682 + }, + { + "epoch": 0.3577065437530554, + "grad_norm": 0.44729589622914206, + "learning_rate": 9.988740656046421e-05, + "loss": 3.3975, + "step": 7683 + }, + { + "epoch": 0.35775310193914844, + "grad_norm": 0.4301199053201449, + "learning_rate": 9.988722480660667e-05, + "loss": 3.4477, + "step": 7684 + }, + { + "epoch": 0.3577996601252415, + "grad_norm": 0.4298838240529587, + "learning_rate": 9.988704290633501e-05, + "loss": 3.4851, + "step": 7685 + }, + { + "epoch": 0.35784621831133456, + "grad_norm": 0.44144801229266173, + "learning_rate": 9.98868608596498e-05, + "loss": 3.3835, + "step": 7686 + }, + { + "epoch": 0.3578927764974277, + "grad_norm": 0.4418378118495568, + "learning_rate": 9.988667866655157e-05, + "loss": 3.4572, + "step": 7687 + }, + { + "epoch": 0.35793933468352074, + "grad_norm": 0.4863201931721789, + "learning_rate": 9.988649632704085e-05, + "loss": 3.3173, + "step": 7688 + }, + { + "epoch": 0.3579858928696138, + "grad_norm": 0.5264723628245305, + "learning_rate": 9.98863138411182e-05, + "loss": 3.3756, + "step": 7689 + }, + { + "epoch": 0.35803245105570686, + "grad_norm": 0.4991276927826103, + "learning_rate": 9.988613120878411e-05, + "loss": 3.481, + "step": 7690 + }, + { + "epoch": 0.3580790092417999, + "grad_norm": 0.4235882832968806, + "learning_rate": 9.988594843003915e-05, + "loss": 3.5004, + "step": 7691 + }, + { + "epoch": 0.35812556742789303, + "grad_norm": 0.4655983747409204, + "learning_rate": 9.988576550488384e-05, + "loss": 3.384, + "step": 7692 + }, + { + "epoch": 0.3581721256139861, + "grad_norm": 0.5026836857156883, + "learning_rate": 9.988558243331872e-05, + "loss": 3.3872, + "step": 7693 + }, + { + "epoch": 0.35821868380007915, + "grad_norm": 0.4836161946534488, + "learning_rate": 9.988539921534435e-05, + "loss": 3.4832, + "step": 7694 + }, + { + "epoch": 0.3582652419861722, + "grad_norm": 0.5374042206108116, + "learning_rate": 9.988521585096122e-05, + "loss": 3.5349, + "step": 7695 + }, + { + "epoch": 0.35831180017226527, + "grad_norm": 0.5360602974884985, + "learning_rate": 9.988503234016991e-05, + "loss": 3.4305, + "step": 7696 + }, + { + "epoch": 0.35835835835835833, + "grad_norm": 0.47737044688581837, + "learning_rate": 9.988484868297094e-05, + "loss": 3.367, + "step": 7697 + }, + { + "epoch": 0.35840491654445145, + "grad_norm": 0.6512988794651803, + "learning_rate": 9.988466487936487e-05, + "loss": 3.4217, + "step": 7698 + }, + { + "epoch": 0.3584514747305445, + "grad_norm": 0.6443779998480741, + "learning_rate": 9.98844809293522e-05, + "loss": 3.4064, + "step": 7699 + }, + { + "epoch": 0.35849803291663757, + "grad_norm": 0.5057334856289879, + "learning_rate": 9.988429683293351e-05, + "loss": 3.5391, + "step": 7700 + }, + { + "epoch": 
0.3585445911027306, + "grad_norm": 0.5048473063077639, + "learning_rate": 9.988411259010931e-05, + "loss": 3.3709, + "step": 7701 + }, + { + "epoch": 0.3585911492888237, + "grad_norm": 0.49139609921885175, + "learning_rate": 9.988392820088018e-05, + "loss": 3.3751, + "step": 7702 + }, + { + "epoch": 0.3586377074749168, + "grad_norm": 0.5444193692822061, + "learning_rate": 9.98837436652466e-05, + "loss": 3.528, + "step": 7703 + }, + { + "epoch": 0.35868426566100986, + "grad_norm": 0.47420932610726096, + "learning_rate": 9.988355898320916e-05, + "loss": 3.5508, + "step": 7704 + }, + { + "epoch": 0.3587308238471029, + "grad_norm": 0.528538127964013, + "learning_rate": 9.98833741547684e-05, + "loss": 3.4119, + "step": 7705 + }, + { + "epoch": 0.358777382033196, + "grad_norm": 0.534553965499579, + "learning_rate": 9.988318917992482e-05, + "loss": 3.3744, + "step": 7706 + }, + { + "epoch": 0.35882394021928904, + "grad_norm": 0.46805000850379297, + "learning_rate": 9.988300405867903e-05, + "loss": 3.253, + "step": 7707 + }, + { + "epoch": 0.3588704984053821, + "grad_norm": 0.5130067813035972, + "learning_rate": 9.988281879103153e-05, + "loss": 3.3462, + "step": 7708 + }, + { + "epoch": 0.3589170565914752, + "grad_norm": 0.5003318027374586, + "learning_rate": 9.988263337698286e-05, + "loss": 3.4303, + "step": 7709 + }, + { + "epoch": 0.3589636147775683, + "grad_norm": 0.44755115119042027, + "learning_rate": 9.988244781653357e-05, + "loss": 3.4902, + "step": 7710 + }, + { + "epoch": 0.35901017296366133, + "grad_norm": 0.4456113728262875, + "learning_rate": 9.988226210968423e-05, + "loss": 3.4337, + "step": 7711 + }, + { + "epoch": 0.3590567311497544, + "grad_norm": 0.41452836332364407, + "learning_rate": 9.988207625643535e-05, + "loss": 3.4291, + "step": 7712 + }, + { + "epoch": 0.35910328933584745, + "grad_norm": 0.44537938675426003, + "learning_rate": 9.988189025678751e-05, + "loss": 3.3663, + "step": 7713 + }, + { + "epoch": 0.35914984752194057, + "grad_norm": 0.4928713512082583, + "learning_rate": 9.988170411074122e-05, + "loss": 3.3873, + "step": 7714 + }, + { + "epoch": 0.35919640570803363, + "grad_norm": 0.42405401386853325, + "learning_rate": 9.988151781829705e-05, + "loss": 3.3586, + "step": 7715 + }, + { + "epoch": 0.3592429638941267, + "grad_norm": 0.45825409469163164, + "learning_rate": 9.988133137945552e-05, + "loss": 3.4247, + "step": 7716 + }, + { + "epoch": 0.35928952208021975, + "grad_norm": 0.5101041057809721, + "learning_rate": 9.988114479421721e-05, + "loss": 3.3938, + "step": 7717 + }, + { + "epoch": 0.3593360802663128, + "grad_norm": 0.4430430860885314, + "learning_rate": 9.988095806258266e-05, + "loss": 3.4381, + "step": 7718 + }, + { + "epoch": 0.35938263845240587, + "grad_norm": 0.4836955254405247, + "learning_rate": 9.98807711845524e-05, + "loss": 3.3949, + "step": 7719 + }, + { + "epoch": 0.359429196638499, + "grad_norm": 0.4759292708899398, + "learning_rate": 9.9880584160127e-05, + "loss": 3.404, + "step": 7720 + }, + { + "epoch": 0.35947575482459204, + "grad_norm": 0.47679998458098377, + "learning_rate": 9.9880396989307e-05, + "loss": 3.3767, + "step": 7721 + }, + { + "epoch": 0.3595223130106851, + "grad_norm": 0.5344252934843737, + "learning_rate": 9.988020967209295e-05, + "loss": 3.4399, + "step": 7722 + }, + { + "epoch": 0.35956887119677816, + "grad_norm": 0.4832175527604553, + "learning_rate": 9.988002220848538e-05, + "loss": 3.4634, + "step": 7723 + }, + { + "epoch": 0.3596154293828712, + "grad_norm": 0.4575729944149557, + "learning_rate": 9.987983459848488e-05, + 
"loss": 3.5011, + "step": 7724 + }, + { + "epoch": 0.35966198756896434, + "grad_norm": 0.45494403414271906, + "learning_rate": 9.987964684209197e-05, + "loss": 3.351, + "step": 7725 + }, + { + "epoch": 0.3597085457550574, + "grad_norm": 0.4067201116563724, + "learning_rate": 9.987945893930721e-05, + "loss": 3.4959, + "step": 7726 + }, + { + "epoch": 0.35975510394115046, + "grad_norm": 0.4180461501421041, + "learning_rate": 9.987927089013116e-05, + "loss": 3.5295, + "step": 7727 + }, + { + "epoch": 0.3598016621272435, + "grad_norm": 0.4827572580622743, + "learning_rate": 9.987908269456437e-05, + "loss": 3.4529, + "step": 7728 + }, + { + "epoch": 0.3598482203133366, + "grad_norm": 0.4469762484221062, + "learning_rate": 9.987889435260736e-05, + "loss": 3.4814, + "step": 7729 + }, + { + "epoch": 0.35989477849942964, + "grad_norm": 0.41971386942369937, + "learning_rate": 9.987870586426074e-05, + "loss": 3.4667, + "step": 7730 + }, + { + "epoch": 0.35994133668552275, + "grad_norm": 0.4306630456391312, + "learning_rate": 9.9878517229525e-05, + "loss": 3.3942, + "step": 7731 + }, + { + "epoch": 0.3599878948716158, + "grad_norm": 0.40883582291622156, + "learning_rate": 9.987832844840076e-05, + "loss": 3.2936, + "step": 7732 + }, + { + "epoch": 0.36003445305770887, + "grad_norm": 0.43841395098025254, + "learning_rate": 9.987813952088852e-05, + "loss": 3.4274, + "step": 7733 + }, + { + "epoch": 0.36008101124380193, + "grad_norm": 0.5052874434844135, + "learning_rate": 9.987795044698885e-05, + "loss": 3.4348, + "step": 7734 + }, + { + "epoch": 0.360127569429895, + "grad_norm": 0.4389792513449661, + "learning_rate": 9.987776122670231e-05, + "loss": 3.3008, + "step": 7735 + }, + { + "epoch": 0.3601741276159881, + "grad_norm": 0.41956331978678635, + "learning_rate": 9.987757186002946e-05, + "loss": 3.4383, + "step": 7736 + }, + { + "epoch": 0.36022068580208116, + "grad_norm": 0.47711835184582513, + "learning_rate": 9.987738234697085e-05, + "loss": 3.4307, + "step": 7737 + }, + { + "epoch": 0.3602672439881742, + "grad_norm": 0.4587474816505096, + "learning_rate": 9.987719268752705e-05, + "loss": 3.46, + "step": 7738 + }, + { + "epoch": 0.3603138021742673, + "grad_norm": 0.3992969497217398, + "learning_rate": 9.987700288169856e-05, + "loss": 3.2885, + "step": 7739 + }, + { + "epoch": 0.36036036036036034, + "grad_norm": 0.4511821482181114, + "learning_rate": 9.987681292948603e-05, + "loss": 3.4562, + "step": 7740 + }, + { + "epoch": 0.3604069185464534, + "grad_norm": 0.5002978362043746, + "learning_rate": 9.987662283088995e-05, + "loss": 3.4369, + "step": 7741 + }, + { + "epoch": 0.3604534767325465, + "grad_norm": 0.44810266029553636, + "learning_rate": 9.987643258591089e-05, + "loss": 3.41, + "step": 7742 + }, + { + "epoch": 0.3605000349186396, + "grad_norm": 0.44238892094161475, + "learning_rate": 9.987624219454941e-05, + "loss": 3.3359, + "step": 7743 + }, + { + "epoch": 0.36054659310473264, + "grad_norm": 0.5199343085717668, + "learning_rate": 9.987605165680609e-05, + "loss": 3.4013, + "step": 7744 + }, + { + "epoch": 0.3605931512908257, + "grad_norm": 0.49434701658893343, + "learning_rate": 9.987586097268146e-05, + "loss": 3.39, + "step": 7745 + }, + { + "epoch": 0.36063970947691876, + "grad_norm": 0.41864048334811704, + "learning_rate": 9.987567014217609e-05, + "loss": 3.3337, + "step": 7746 + }, + { + "epoch": 0.3606862676630119, + "grad_norm": 0.5559760398134088, + "learning_rate": 9.987547916529054e-05, + "loss": 3.5434, + "step": 7747 + }, + { + "epoch": 0.36073282584910493, + "grad_norm": 
0.591398269734777, + "learning_rate": 9.987528804202539e-05, + "loss": 3.4755, + "step": 7748 + }, + { + "epoch": 0.360779384035198, + "grad_norm": 0.5494666660338059, + "learning_rate": 9.987509677238117e-05, + "loss": 3.388, + "step": 7749 + }, + { + "epoch": 0.36082594222129105, + "grad_norm": 0.4985908022785705, + "learning_rate": 9.987490535635845e-05, + "loss": 3.4045, + "step": 7750 + }, + { + "epoch": 0.3608725004073841, + "grad_norm": 0.4719288671994143, + "learning_rate": 9.98747137939578e-05, + "loss": 3.4995, + "step": 7751 + }, + { + "epoch": 0.36091905859347717, + "grad_norm": 0.4489745187068014, + "learning_rate": 9.987452208517978e-05, + "loss": 3.3342, + "step": 7752 + }, + { + "epoch": 0.3609656167795703, + "grad_norm": 0.5664558411649989, + "learning_rate": 9.987433023002495e-05, + "loss": 3.4827, + "step": 7753 + }, + { + "epoch": 0.36101217496566335, + "grad_norm": 0.5665354800737386, + "learning_rate": 9.987413822849388e-05, + "loss": 3.275, + "step": 7754 + }, + { + "epoch": 0.3610587331517564, + "grad_norm": 0.4215319118658775, + "learning_rate": 9.987394608058712e-05, + "loss": 3.4346, + "step": 7755 + }, + { + "epoch": 0.36110529133784947, + "grad_norm": 0.44632912275446257, + "learning_rate": 9.987375378630525e-05, + "loss": 3.4401, + "step": 7756 + }, + { + "epoch": 0.3611518495239425, + "grad_norm": 0.4456933377982035, + "learning_rate": 9.987356134564882e-05, + "loss": 3.403, + "step": 7757 + }, + { + "epoch": 0.36119840771003564, + "grad_norm": 0.39674930476694403, + "learning_rate": 9.98733687586184e-05, + "loss": 3.2645, + "step": 7758 + }, + { + "epoch": 0.3612449658961287, + "grad_norm": 0.4568236719041023, + "learning_rate": 9.987317602521456e-05, + "loss": 3.4544, + "step": 7759 + }, + { + "epoch": 0.36129152408222176, + "grad_norm": 0.42121421796108766, + "learning_rate": 9.987298314543786e-05, + "loss": 3.4507, + "step": 7760 + }, + { + "epoch": 0.3613380822683148, + "grad_norm": 0.42519362262899835, + "learning_rate": 9.987279011928887e-05, + "loss": 3.4383, + "step": 7761 + }, + { + "epoch": 0.3613846404544079, + "grad_norm": 0.42466801771860335, + "learning_rate": 9.987259694676815e-05, + "loss": 3.4937, + "step": 7762 + }, + { + "epoch": 0.36143119864050094, + "grad_norm": 0.42957371306761827, + "learning_rate": 9.987240362787626e-05, + "loss": 3.3428, + "step": 7763 + }, + { + "epoch": 0.36147775682659405, + "grad_norm": 0.42545526213712714, + "learning_rate": 9.98722101626138e-05, + "loss": 3.4006, + "step": 7764 + }, + { + "epoch": 0.3615243150126871, + "grad_norm": 0.4893612327945778, + "learning_rate": 9.987201655098131e-05, + "loss": 3.4915, + "step": 7765 + }, + { + "epoch": 0.3615708731987802, + "grad_norm": 0.5159305540845995, + "learning_rate": 9.987182279297936e-05, + "loss": 3.4459, + "step": 7766 + }, + { + "epoch": 0.36161743138487323, + "grad_norm": 0.408431329216724, + "learning_rate": 9.987162888860853e-05, + "loss": 3.3307, + "step": 7767 + }, + { + "epoch": 0.3616639895709663, + "grad_norm": 0.49628244928127857, + "learning_rate": 9.987143483786939e-05, + "loss": 3.3542, + "step": 7768 + }, + { + "epoch": 0.3617105477570594, + "grad_norm": 0.5400289877481782, + "learning_rate": 9.987124064076248e-05, + "loss": 3.3563, + "step": 7769 + }, + { + "epoch": 0.36175710594315247, + "grad_norm": 0.37950181874108696, + "learning_rate": 9.98710462972884e-05, + "loss": 3.3576, + "step": 7770 + }, + { + "epoch": 0.36180366412924553, + "grad_norm": 0.4717078933992653, + "learning_rate": 9.98708518074477e-05, + "loss": 3.4139, + "step": 7771 
+ }, + { + "epoch": 0.3618502223153386, + "grad_norm": 0.5358048139886321, + "learning_rate": 9.987065717124099e-05, + "loss": 3.3869, + "step": 7772 + }, + { + "epoch": 0.36189678050143165, + "grad_norm": 0.5560259311216494, + "learning_rate": 9.98704623886688e-05, + "loss": 3.5113, + "step": 7773 + }, + { + "epoch": 0.3619433386875247, + "grad_norm": 0.5087295450720859, + "learning_rate": 9.987026745973172e-05, + "loss": 3.376, + "step": 7774 + }, + { + "epoch": 0.3619898968736178, + "grad_norm": 0.4800406269966815, + "learning_rate": 9.987007238443031e-05, + "loss": 3.4048, + "step": 7775 + }, + { + "epoch": 0.3620364550597109, + "grad_norm": 0.4413563189867641, + "learning_rate": 9.986987716276515e-05, + "loss": 3.4089, + "step": 7776 + }, + { + "epoch": 0.36208301324580394, + "grad_norm": 0.4686915198135059, + "learning_rate": 9.986968179473684e-05, + "loss": 3.375, + "step": 7777 + }, + { + "epoch": 0.362129571431897, + "grad_norm": 0.48930876805830875, + "learning_rate": 9.98694862803459e-05, + "loss": 3.3816, + "step": 7778 + }, + { + "epoch": 0.36217612961799006, + "grad_norm": 0.4961809508810353, + "learning_rate": 9.986929061959294e-05, + "loss": 3.3958, + "step": 7779 + }, + { + "epoch": 0.3622226878040832, + "grad_norm": 0.49629166485117215, + "learning_rate": 9.986909481247851e-05, + "loss": 3.4239, + "step": 7780 + }, + { + "epoch": 0.36226924599017624, + "grad_norm": 0.43756330931858717, + "learning_rate": 9.98688988590032e-05, + "loss": 3.4483, + "step": 7781 + }, + { + "epoch": 0.3623158041762693, + "grad_norm": 0.43803985633442016, + "learning_rate": 9.98687027591676e-05, + "loss": 3.4314, + "step": 7782 + }, + { + "epoch": 0.36236236236236236, + "grad_norm": 0.5270312070700928, + "learning_rate": 9.986850651297228e-05, + "loss": 3.4804, + "step": 7783 + }, + { + "epoch": 0.3624089205484554, + "grad_norm": 0.5138243888194796, + "learning_rate": 9.986831012041779e-05, + "loss": 3.3665, + "step": 7784 + }, + { + "epoch": 0.3624554787345485, + "grad_norm": 0.4732160669106265, + "learning_rate": 9.986811358150473e-05, + "loss": 3.4013, + "step": 7785 + }, + { + "epoch": 0.3625020369206416, + "grad_norm": 0.4754338637352536, + "learning_rate": 9.986791689623366e-05, + "loss": 3.5378, + "step": 7786 + }, + { + "epoch": 0.36254859510673465, + "grad_norm": 0.46848601367364, + "learning_rate": 9.986772006460518e-05, + "loss": 3.416, + "step": 7787 + }, + { + "epoch": 0.3625951532928277, + "grad_norm": 0.5117855310214062, + "learning_rate": 9.986752308661985e-05, + "loss": 3.394, + "step": 7788 + }, + { + "epoch": 0.36264171147892077, + "grad_norm": 0.4754440416771346, + "learning_rate": 9.986732596227823e-05, + "loss": 3.4204, + "step": 7789 + }, + { + "epoch": 0.36268826966501383, + "grad_norm": 0.41918898022271317, + "learning_rate": 9.986712869158095e-05, + "loss": 3.4619, + "step": 7790 + }, + { + "epoch": 0.36273482785110694, + "grad_norm": 0.4465522183108274, + "learning_rate": 9.986693127452856e-05, + "loss": 3.3335, + "step": 7791 + }, + { + "epoch": 0.3627813860372, + "grad_norm": 0.4170880320734021, + "learning_rate": 9.986673371112161e-05, + "loss": 3.3883, + "step": 7792 + }, + { + "epoch": 0.36282794422329306, + "grad_norm": 0.43716689188447155, + "learning_rate": 9.986653600136074e-05, + "loss": 3.5905, + "step": 7793 + }, + { + "epoch": 0.3628745024093861, + "grad_norm": 0.44317903223300803, + "learning_rate": 9.986633814524649e-05, + "loss": 3.4188, + "step": 7794 + }, + { + "epoch": 0.3629210605954792, + "grad_norm": 0.42324763617158323, + "learning_rate": 
9.986614014277944e-05, + "loss": 3.4095, + "step": 7795 + }, + { + "epoch": 0.36296761878157224, + "grad_norm": 0.4450281535911912, + "learning_rate": 9.98659419939602e-05, + "loss": 3.5362, + "step": 7796 + }, + { + "epoch": 0.36301417696766536, + "grad_norm": 0.4335771361358979, + "learning_rate": 9.986574369878931e-05, + "loss": 3.392, + "step": 7797 + }, + { + "epoch": 0.3630607351537584, + "grad_norm": 0.43433108900729944, + "learning_rate": 9.986554525726739e-05, + "loss": 3.3509, + "step": 7798 + }, + { + "epoch": 0.3631072933398515, + "grad_norm": 0.44652313918630016, + "learning_rate": 9.9865346669395e-05, + "loss": 3.3757, + "step": 7799 + }, + { + "epoch": 0.36315385152594454, + "grad_norm": 0.44875235027276, + "learning_rate": 9.986514793517275e-05, + "loss": 3.4109, + "step": 7800 + }, + { + "epoch": 0.3632004097120376, + "grad_norm": 0.46976351869421146, + "learning_rate": 9.986494905460117e-05, + "loss": 3.2969, + "step": 7801 + }, + { + "epoch": 0.3632469678981307, + "grad_norm": 0.4627668177819452, + "learning_rate": 9.98647500276809e-05, + "loss": 3.3152, + "step": 7802 + }, + { + "epoch": 0.3632935260842238, + "grad_norm": 0.47306382976907646, + "learning_rate": 9.98645508544125e-05, + "loss": 3.5384, + "step": 7803 + }, + { + "epoch": 0.36334008427031683, + "grad_norm": 0.47796645123525644, + "learning_rate": 9.986435153479656e-05, + "loss": 3.436, + "step": 7804 + }, + { + "epoch": 0.3633866424564099, + "grad_norm": 0.45338118268900446, + "learning_rate": 9.986415206883365e-05, + "loss": 3.3486, + "step": 7805 + }, + { + "epoch": 0.36343320064250295, + "grad_norm": 0.5003490413754914, + "learning_rate": 9.986395245652435e-05, + "loss": 3.3332, + "step": 7806 + }, + { + "epoch": 0.363479758828596, + "grad_norm": 0.46975343280222226, + "learning_rate": 9.98637526978693e-05, + "loss": 3.3657, + "step": 7807 + }, + { + "epoch": 0.3635263170146891, + "grad_norm": 0.4682898542143278, + "learning_rate": 9.986355279286902e-05, + "loss": 3.2682, + "step": 7808 + }, + { + "epoch": 0.3635728752007822, + "grad_norm": 0.514360081024939, + "learning_rate": 9.986335274152414e-05, + "loss": 3.4621, + "step": 7809 + }, + { + "epoch": 0.36361943338687525, + "grad_norm": 0.5057950245710469, + "learning_rate": 9.986315254383523e-05, + "loss": 3.451, + "step": 7810 + }, + { + "epoch": 0.3636659915729683, + "grad_norm": 0.48531065085170855, + "learning_rate": 9.986295219980287e-05, + "loss": 3.3891, + "step": 7811 + }, + { + "epoch": 0.36371254975906137, + "grad_norm": 0.5028429953822161, + "learning_rate": 9.986275170942766e-05, + "loss": 3.4646, + "step": 7812 + }, + { + "epoch": 0.3637591079451545, + "grad_norm": 0.524394002732974, + "learning_rate": 9.986255107271018e-05, + "loss": 3.548, + "step": 7813 + }, + { + "epoch": 0.36380566613124754, + "grad_norm": 0.5228110189870376, + "learning_rate": 9.986235028965105e-05, + "loss": 3.3865, + "step": 7814 + }, + { + "epoch": 0.3638522243173406, + "grad_norm": 0.4846103398917053, + "learning_rate": 9.986214936025081e-05, + "loss": 3.4754, + "step": 7815 + }, + { + "epoch": 0.36389878250343366, + "grad_norm": 0.47501506397227317, + "learning_rate": 9.986194828451008e-05, + "loss": 3.463, + "step": 7816 + }, + { + "epoch": 0.3639453406895267, + "grad_norm": 0.5110658636693682, + "learning_rate": 9.986174706242945e-05, + "loss": 3.4824, + "step": 7817 + }, + { + "epoch": 0.3639918988756198, + "grad_norm": 0.4270519077982392, + "learning_rate": 9.98615456940095e-05, + "loss": 3.3752, + "step": 7818 + }, + { + "epoch": 0.3640384570617129, + 
"grad_norm": 0.49612809963972365, + "learning_rate": 9.986134417925083e-05, + "loss": 3.3901, + "step": 7819 + }, + { + "epoch": 0.36408501524780595, + "grad_norm": 0.4592044563788407, + "learning_rate": 9.986114251815401e-05, + "loss": 3.3911, + "step": 7820 + }, + { + "epoch": 0.364131573433899, + "grad_norm": 0.4691398324681879, + "learning_rate": 9.986094071071967e-05, + "loss": 3.5069, + "step": 7821 + }, + { + "epoch": 0.3641781316199921, + "grad_norm": 0.447578446844384, + "learning_rate": 9.986073875694837e-05, + "loss": 3.364, + "step": 7822 + }, + { + "epoch": 0.36422468980608513, + "grad_norm": 0.4068364782620846, + "learning_rate": 9.986053665684072e-05, + "loss": 3.4947, + "step": 7823 + }, + { + "epoch": 0.36427124799217825, + "grad_norm": 0.4565192589380727, + "learning_rate": 9.98603344103973e-05, + "loss": 3.5085, + "step": 7824 + }, + { + "epoch": 0.3643178061782713, + "grad_norm": 0.5078085184266402, + "learning_rate": 9.986013201761873e-05, + "loss": 3.3656, + "step": 7825 + }, + { + "epoch": 0.36436436436436437, + "grad_norm": 0.4365192308231825, + "learning_rate": 9.985992947850555e-05, + "loss": 3.3259, + "step": 7826 + }, + { + "epoch": 0.36441092255045743, + "grad_norm": 0.47900432760519823, + "learning_rate": 9.985972679305842e-05, + "loss": 3.4715, + "step": 7827 + }, + { + "epoch": 0.3644574807365505, + "grad_norm": 0.567505577120082, + "learning_rate": 9.98595239612779e-05, + "loss": 3.4027, + "step": 7828 + }, + { + "epoch": 0.36450403892264355, + "grad_norm": 0.5433141739612741, + "learning_rate": 9.985932098316458e-05, + "loss": 3.4117, + "step": 7829 + }, + { + "epoch": 0.36455059710873666, + "grad_norm": 0.45913252697151474, + "learning_rate": 9.985911785871905e-05, + "loss": 3.432, + "step": 7830 + }, + { + "epoch": 0.3645971552948297, + "grad_norm": 0.5500757280461803, + "learning_rate": 9.985891458794194e-05, + "loss": 3.5029, + "step": 7831 + }, + { + "epoch": 0.3646437134809228, + "grad_norm": 0.5417899350810034, + "learning_rate": 9.985871117083383e-05, + "loss": 3.3374, + "step": 7832 + }, + { + "epoch": 0.36469027166701584, + "grad_norm": 0.4820608564889112, + "learning_rate": 9.98585076073953e-05, + "loss": 3.3395, + "step": 7833 + }, + { + "epoch": 0.3647368298531089, + "grad_norm": 0.5823762205929544, + "learning_rate": 9.985830389762698e-05, + "loss": 3.4011, + "step": 7834 + }, + { + "epoch": 0.364783388039202, + "grad_norm": 0.4741323658796327, + "learning_rate": 9.985810004152944e-05, + "loss": 3.4504, + "step": 7835 + }, + { + "epoch": 0.3648299462252951, + "grad_norm": 0.48068942522799185, + "learning_rate": 9.98578960391033e-05, + "loss": 3.3684, + "step": 7836 + }, + { + "epoch": 0.36487650441138814, + "grad_norm": 0.44885535761234935, + "learning_rate": 9.985769189034912e-05, + "loss": 3.3693, + "step": 7837 + }, + { + "epoch": 0.3649230625974812, + "grad_norm": 0.47796354558334175, + "learning_rate": 9.985748759526754e-05, + "loss": 3.4083, + "step": 7838 + }, + { + "epoch": 0.36496962078357426, + "grad_norm": 0.49866342577948425, + "learning_rate": 9.985728315385915e-05, + "loss": 3.4791, + "step": 7839 + }, + { + "epoch": 0.3650161789696673, + "grad_norm": 0.5021065864534342, + "learning_rate": 9.985707856612456e-05, + "loss": 3.4284, + "step": 7840 + }, + { + "epoch": 0.36506273715576043, + "grad_norm": 0.4972507151344959, + "learning_rate": 9.985687383206433e-05, + "loss": 3.441, + "step": 7841 + }, + { + "epoch": 0.3651092953418535, + "grad_norm": 0.46366399645060247, + "learning_rate": 9.985666895167909e-05, + "loss": 3.332, + 
"step": 7842 + }, + { + "epoch": 0.36515585352794655, + "grad_norm": 0.47974723426786564, + "learning_rate": 9.985646392496946e-05, + "loss": 3.362, + "step": 7843 + }, + { + "epoch": 0.3652024117140396, + "grad_norm": 0.520665101779806, + "learning_rate": 9.985625875193599e-05, + "loss": 3.4365, + "step": 7844 + }, + { + "epoch": 0.36524896990013267, + "grad_norm": 0.4651779430590965, + "learning_rate": 9.985605343257934e-05, + "loss": 3.4028, + "step": 7845 + }, + { + "epoch": 0.3652955280862258, + "grad_norm": 0.4532983793735146, + "learning_rate": 9.985584796690006e-05, + "loss": 3.4975, + "step": 7846 + }, + { + "epoch": 0.36534208627231884, + "grad_norm": 0.4306971717665782, + "learning_rate": 9.985564235489879e-05, + "loss": 3.4049, + "step": 7847 + }, + { + "epoch": 0.3653886444584119, + "grad_norm": 0.4453132409179393, + "learning_rate": 9.985543659657612e-05, + "loss": 3.5126, + "step": 7848 + }, + { + "epoch": 0.36543520264450496, + "grad_norm": 0.39829611111409824, + "learning_rate": 9.985523069193266e-05, + "loss": 3.4843, + "step": 7849 + }, + { + "epoch": 0.365481760830598, + "grad_norm": 0.43622422871556327, + "learning_rate": 9.9855024640969e-05, + "loss": 3.4544, + "step": 7850 + }, + { + "epoch": 0.3655283190166911, + "grad_norm": 0.4267666409265531, + "learning_rate": 9.985481844368577e-05, + "loss": 3.3663, + "step": 7851 + }, + { + "epoch": 0.3655748772027842, + "grad_norm": 0.4648467641222077, + "learning_rate": 9.985461210008356e-05, + "loss": 3.4534, + "step": 7852 + }, + { + "epoch": 0.36562143538887726, + "grad_norm": 0.42986485145113495, + "learning_rate": 9.985440561016295e-05, + "loss": 3.409, + "step": 7853 + }, + { + "epoch": 0.3656679935749703, + "grad_norm": 0.4136526514217192, + "learning_rate": 9.985419897392459e-05, + "loss": 3.4151, + "step": 7854 + }, + { + "epoch": 0.3657145517610634, + "grad_norm": 0.3892295809742594, + "learning_rate": 9.985399219136906e-05, + "loss": 3.4336, + "step": 7855 + }, + { + "epoch": 0.36576110994715644, + "grad_norm": 0.5101101817674351, + "learning_rate": 9.985378526249698e-05, + "loss": 3.3795, + "step": 7856 + }, + { + "epoch": 0.36580766813324955, + "grad_norm": 0.5030020181175185, + "learning_rate": 9.985357818730895e-05, + "loss": 3.4598, + "step": 7857 + }, + { + "epoch": 0.3658542263193426, + "grad_norm": 0.47980046654674074, + "learning_rate": 9.985337096580558e-05, + "loss": 3.309, + "step": 7858 + }, + { + "epoch": 0.36590078450543567, + "grad_norm": 0.46983036210370416, + "learning_rate": 9.985316359798747e-05, + "loss": 3.4841, + "step": 7859 + }, + { + "epoch": 0.36594734269152873, + "grad_norm": 0.541395763799952, + "learning_rate": 9.985295608385525e-05, + "loss": 3.3838, + "step": 7860 + }, + { + "epoch": 0.3659939008776218, + "grad_norm": 0.5288742093683666, + "learning_rate": 9.98527484234095e-05, + "loss": 3.3534, + "step": 7861 + }, + { + "epoch": 0.36604045906371485, + "grad_norm": 0.48061403751899523, + "learning_rate": 9.985254061665086e-05, + "loss": 3.4119, + "step": 7862 + }, + { + "epoch": 0.36608701724980797, + "grad_norm": 0.37329741712667724, + "learning_rate": 9.985233266357992e-05, + "loss": 3.4526, + "step": 7863 + }, + { + "epoch": 0.366133575435901, + "grad_norm": 0.40460254823070657, + "learning_rate": 9.985212456419727e-05, + "loss": 3.4478, + "step": 7864 + }, + { + "epoch": 0.3661801336219941, + "grad_norm": 0.4697154801135291, + "learning_rate": 9.98519163185036e-05, + "loss": 3.4333, + "step": 7865 + }, + { + "epoch": 0.36622669180808715, + "grad_norm": 0.4896189405074112, + 
"learning_rate": 9.985170792649942e-05, + "loss": 3.3448, + "step": 7866 + }, + { + "epoch": 0.3662732499941802, + "grad_norm": 0.38123104714505396, + "learning_rate": 9.985149938818542e-05, + "loss": 3.4974, + "step": 7867 + }, + { + "epoch": 0.3663198081802733, + "grad_norm": 0.5050705377813077, + "learning_rate": 9.985129070356214e-05, + "loss": 3.3936, + "step": 7868 + }, + { + "epoch": 0.3663663663663664, + "grad_norm": 0.4412769638767788, + "learning_rate": 9.985108187263026e-05, + "loss": 3.4033, + "step": 7869 + }, + { + "epoch": 0.36641292455245944, + "grad_norm": 0.4360788260291911, + "learning_rate": 9.985087289539037e-05, + "loss": 3.3689, + "step": 7870 + }, + { + "epoch": 0.3664594827385525, + "grad_norm": 0.43571653137077115, + "learning_rate": 9.985066377184306e-05, + "loss": 3.4072, + "step": 7871 + }, + { + "epoch": 0.36650604092464556, + "grad_norm": 0.5147060721973445, + "learning_rate": 9.985045450198898e-05, + "loss": 3.4627, + "step": 7872 + }, + { + "epoch": 0.3665525991107386, + "grad_norm": 0.4581125059533713, + "learning_rate": 9.985024508582872e-05, + "loss": 3.2788, + "step": 7873 + }, + { + "epoch": 0.36659915729683173, + "grad_norm": 0.4579842757907356, + "learning_rate": 9.985003552336291e-05, + "loss": 3.3876, + "step": 7874 + }, + { + "epoch": 0.3666457154829248, + "grad_norm": 0.5152976934126878, + "learning_rate": 9.984982581459215e-05, + "loss": 3.367, + "step": 7875 + }, + { + "epoch": 0.36669227366901785, + "grad_norm": 0.4680266543036226, + "learning_rate": 9.984961595951705e-05, + "loss": 3.4047, + "step": 7876 + }, + { + "epoch": 0.3667388318551109, + "grad_norm": 0.45475140910496104, + "learning_rate": 9.984940595813825e-05, + "loss": 3.3019, + "step": 7877 + }, + { + "epoch": 0.366785390041204, + "grad_norm": 0.510036965692996, + "learning_rate": 9.984919581045635e-05, + "loss": 3.3918, + "step": 7878 + }, + { + "epoch": 0.3668319482272971, + "grad_norm": 0.4707092324053203, + "learning_rate": 9.984898551647199e-05, + "loss": 3.4205, + "step": 7879 + }, + { + "epoch": 0.36687850641339015, + "grad_norm": 0.4459301933961235, + "learning_rate": 9.984877507618575e-05, + "loss": 3.3197, + "step": 7880 + }, + { + "epoch": 0.3669250645994832, + "grad_norm": 0.47319516892926855, + "learning_rate": 9.984856448959828e-05, + "loss": 3.2877, + "step": 7881 + }, + { + "epoch": 0.36697162278557627, + "grad_norm": 0.47957297102785107, + "learning_rate": 9.984835375671016e-05, + "loss": 3.4505, + "step": 7882 + }, + { + "epoch": 0.3670181809716693, + "grad_norm": 0.5205770827715681, + "learning_rate": 9.984814287752205e-05, + "loss": 3.4552, + "step": 7883 + }, + { + "epoch": 0.3670647391577624, + "grad_norm": 0.49767985244175367, + "learning_rate": 9.984793185203456e-05, + "loss": 3.4015, + "step": 7884 + }, + { + "epoch": 0.3671112973438555, + "grad_norm": 0.4368941535136218, + "learning_rate": 9.984772068024827e-05, + "loss": 3.4588, + "step": 7885 + }, + { + "epoch": 0.36715785552994856, + "grad_norm": 0.4259104085069764, + "learning_rate": 9.984750936216387e-05, + "loss": 3.4395, + "step": 7886 + }, + { + "epoch": 0.3672044137160416, + "grad_norm": 0.49381218170364105, + "learning_rate": 9.984729789778192e-05, + "loss": 3.4105, + "step": 7887 + }, + { + "epoch": 0.3672509719021347, + "grad_norm": 0.4503316181265441, + "learning_rate": 9.984708628710305e-05, + "loss": 3.4375, + "step": 7888 + }, + { + "epoch": 0.36729753008822774, + "grad_norm": 0.43969166729278925, + "learning_rate": 9.984687453012791e-05, + "loss": 3.3828, + "step": 7889 + }, + { + 
"epoch": 0.36734408827432086, + "grad_norm": 0.4639175174976507, + "learning_rate": 9.98466626268571e-05, + "loss": 3.4341, + "step": 7890 + }, + { + "epoch": 0.3673906464604139, + "grad_norm": 0.40231316966601305, + "learning_rate": 9.984645057729125e-05, + "loss": 3.3478, + "step": 7891 + }, + { + "epoch": 0.367437204646507, + "grad_norm": 0.386999821986291, + "learning_rate": 9.984623838143097e-05, + "loss": 3.3103, + "step": 7892 + }, + { + "epoch": 0.36748376283260004, + "grad_norm": 0.4217873932434255, + "learning_rate": 9.984602603927689e-05, + "loss": 3.3746, + "step": 7893 + }, + { + "epoch": 0.3675303210186931, + "grad_norm": 0.3622820482063746, + "learning_rate": 9.984581355082965e-05, + "loss": 3.3752, + "step": 7894 + }, + { + "epoch": 0.36757687920478616, + "grad_norm": 0.5458414445195743, + "learning_rate": 9.984560091608983e-05, + "loss": 3.4647, + "step": 7895 + }, + { + "epoch": 0.36762343739087927, + "grad_norm": 0.5281046757680551, + "learning_rate": 9.98453881350581e-05, + "loss": 3.4175, + "step": 7896 + }, + { + "epoch": 0.36766999557697233, + "grad_norm": 0.49116942385137974, + "learning_rate": 9.984517520773507e-05, + "loss": 3.3695, + "step": 7897 + }, + { + "epoch": 0.3677165537630654, + "grad_norm": 0.42333129182515833, + "learning_rate": 9.984496213412136e-05, + "loss": 3.2904, + "step": 7898 + }, + { + "epoch": 0.36776311194915845, + "grad_norm": 0.4218437579380728, + "learning_rate": 9.984474891421759e-05, + "loss": 3.4569, + "step": 7899 + }, + { + "epoch": 0.3678096701352515, + "grad_norm": 0.4667978302836461, + "learning_rate": 9.98445355480244e-05, + "loss": 3.4278, + "step": 7900 + }, + { + "epoch": 0.3678562283213446, + "grad_norm": 0.5115351677586353, + "learning_rate": 9.984432203554239e-05, + "loss": 3.488, + "step": 7901 + }, + { + "epoch": 0.3679027865074377, + "grad_norm": 0.4920567661602586, + "learning_rate": 9.984410837677223e-05, + "loss": 3.4588, + "step": 7902 + }, + { + "epoch": 0.36794934469353074, + "grad_norm": 0.5069644736557644, + "learning_rate": 9.98438945717145e-05, + "loss": 3.3474, + "step": 7903 + }, + { + "epoch": 0.3679959028796238, + "grad_norm": 0.48749166858884535, + "learning_rate": 9.984368062036986e-05, + "loss": 3.4185, + "step": 7904 + }, + { + "epoch": 0.36804246106571686, + "grad_norm": 0.5204367601545581, + "learning_rate": 9.984346652273894e-05, + "loss": 3.3934, + "step": 7905 + }, + { + "epoch": 0.3680890192518099, + "grad_norm": 0.555999198617575, + "learning_rate": 9.984325227882234e-05, + "loss": 3.5717, + "step": 7906 + }, + { + "epoch": 0.36813557743790304, + "grad_norm": 0.5452510775266921, + "learning_rate": 9.984303788862071e-05, + "loss": 3.4529, + "step": 7907 + }, + { + "epoch": 0.3681821356239961, + "grad_norm": 0.4651280711112067, + "learning_rate": 9.984282335213468e-05, + "loss": 3.4167, + "step": 7908 + }, + { + "epoch": 0.36822869381008916, + "grad_norm": 0.489167177319776, + "learning_rate": 9.984260866936485e-05, + "loss": 3.4606, + "step": 7909 + }, + { + "epoch": 0.3682752519961822, + "grad_norm": 0.4659252101136228, + "learning_rate": 9.984239384031191e-05, + "loss": 3.3759, + "step": 7910 + }, + { + "epoch": 0.3683218101822753, + "grad_norm": 0.4988317737675647, + "learning_rate": 9.984217886497643e-05, + "loss": 3.4603, + "step": 7911 + }, + { + "epoch": 0.3683683683683684, + "grad_norm": 0.5193618250853635, + "learning_rate": 9.984196374335908e-05, + "loss": 3.3508, + "step": 7912 + }, + { + "epoch": 0.36841492655446145, + "grad_norm": 0.5282566912009531, + "learning_rate": 
9.984174847546047e-05, + "loss": 3.5967, + "step": 7913 + }, + { + "epoch": 0.3684614847405545, + "grad_norm": 0.5177224454813745, + "learning_rate": 9.984153306128123e-05, + "loss": 3.3585, + "step": 7914 + }, + { + "epoch": 0.36850804292664757, + "grad_norm": 0.478540083940367, + "learning_rate": 9.9841317500822e-05, + "loss": 3.3241, + "step": 7915 + }, + { + "epoch": 0.36855460111274063, + "grad_norm": 0.4766451148409297, + "learning_rate": 9.984110179408342e-05, + "loss": 3.4301, + "step": 7916 + }, + { + "epoch": 0.3686011592988337, + "grad_norm": 0.43882297973358825, + "learning_rate": 9.984088594106613e-05, + "loss": 3.3278, + "step": 7917 + }, + { + "epoch": 0.3686477174849268, + "grad_norm": 0.4320428016004001, + "learning_rate": 9.984066994177073e-05, + "loss": 3.4376, + "step": 7918 + }, + { + "epoch": 0.36869427567101987, + "grad_norm": 0.4772050194864994, + "learning_rate": 9.984045379619789e-05, + "loss": 3.4305, + "step": 7919 + }, + { + "epoch": 0.3687408338571129, + "grad_norm": 0.537570827885887, + "learning_rate": 9.984023750434821e-05, + "loss": 3.4837, + "step": 7920 + }, + { + "epoch": 0.368787392043206, + "grad_norm": 0.48689641982757975, + "learning_rate": 9.984002106622235e-05, + "loss": 3.3785, + "step": 7921 + }, + { + "epoch": 0.36883395022929905, + "grad_norm": 0.4445679587586481, + "learning_rate": 9.983980448182094e-05, + "loss": 3.4029, + "step": 7922 + }, + { + "epoch": 0.36888050841539216, + "grad_norm": 0.51923424479172, + "learning_rate": 9.983958775114462e-05, + "loss": 3.4345, + "step": 7923 + }, + { + "epoch": 0.3689270666014852, + "grad_norm": 0.5179019911186984, + "learning_rate": 9.983937087419402e-05, + "loss": 3.4398, + "step": 7924 + }, + { + "epoch": 0.3689736247875783, + "grad_norm": 0.42406177370500864, + "learning_rate": 9.983915385096976e-05, + "loss": 3.4723, + "step": 7925 + }, + { + "epoch": 0.36902018297367134, + "grad_norm": 0.4753912764922919, + "learning_rate": 9.98389366814725e-05, + "loss": 3.3859, + "step": 7926 + }, + { + "epoch": 0.3690667411597644, + "grad_norm": 0.5434305099875307, + "learning_rate": 9.983871936570289e-05, + "loss": 3.3318, + "step": 7927 + }, + { + "epoch": 0.36911329934585746, + "grad_norm": 0.5573070189072169, + "learning_rate": 9.983850190366152e-05, + "loss": 3.3765, + "step": 7928 + }, + { + "epoch": 0.3691598575319506, + "grad_norm": 0.4814926147044692, + "learning_rate": 9.983828429534908e-05, + "loss": 3.437, + "step": 7929 + }, + { + "epoch": 0.36920641571804363, + "grad_norm": 0.46486545162573806, + "learning_rate": 9.983806654076617e-05, + "loss": 3.5377, + "step": 7930 + }, + { + "epoch": 0.3692529739041367, + "grad_norm": 0.4323489368642138, + "learning_rate": 9.983784863991346e-05, + "loss": 3.4464, + "step": 7931 + }, + { + "epoch": 0.36929953209022975, + "grad_norm": 0.5071508447742582, + "learning_rate": 9.983763059279157e-05, + "loss": 3.3148, + "step": 7932 + }, + { + "epoch": 0.3693460902763228, + "grad_norm": 0.4649570892190936, + "learning_rate": 9.983741239940111e-05, + "loss": 3.4901, + "step": 7933 + }, + { + "epoch": 0.36939264846241593, + "grad_norm": 0.4950783078131254, + "learning_rate": 9.98371940597428e-05, + "loss": 3.1901, + "step": 7934 + }, + { + "epoch": 0.369439206648509, + "grad_norm": 0.5464075702717812, + "learning_rate": 9.983697557381722e-05, + "loss": 3.3869, + "step": 7935 + }, + { + "epoch": 0.36948576483460205, + "grad_norm": 0.6031044469606056, + "learning_rate": 9.983675694162502e-05, + "loss": 3.4067, + "step": 7936 + }, + { + "epoch": 0.3695323230206951, + 
"grad_norm": 0.5035437982354419, + "learning_rate": 9.983653816316685e-05, + "loss": 3.3438, + "step": 7937 + }, + { + "epoch": 0.36957888120678817, + "grad_norm": 0.49066282518176446, + "learning_rate": 9.983631923844335e-05, + "loss": 3.4418, + "step": 7938 + }, + { + "epoch": 0.3696254393928812, + "grad_norm": 0.5164918009465249, + "learning_rate": 9.983610016745517e-05, + "loss": 3.4173, + "step": 7939 + }, + { + "epoch": 0.36967199757897434, + "grad_norm": 0.4861284189123535, + "learning_rate": 9.983588095020295e-05, + "loss": 3.451, + "step": 7940 + }, + { + "epoch": 0.3697185557650674, + "grad_norm": 0.44944810634519694, + "learning_rate": 9.983566158668731e-05, + "loss": 3.3652, + "step": 7941 + }, + { + "epoch": 0.36976511395116046, + "grad_norm": 0.4571346411824523, + "learning_rate": 9.983544207690893e-05, + "loss": 3.3671, + "step": 7942 + }, + { + "epoch": 0.3698116721372535, + "grad_norm": 0.5002139947480516, + "learning_rate": 9.983522242086842e-05, + "loss": 3.3493, + "step": 7943 + }, + { + "epoch": 0.3698582303233466, + "grad_norm": 0.48335847179317515, + "learning_rate": 9.983500261856646e-05, + "loss": 3.3667, + "step": 7944 + }, + { + "epoch": 0.3699047885094397, + "grad_norm": 0.42665075282903836, + "learning_rate": 9.983478267000367e-05, + "loss": 3.3958, + "step": 7945 + }, + { + "epoch": 0.36995134669553276, + "grad_norm": 0.4506027091200855, + "learning_rate": 9.98345625751807e-05, + "loss": 3.3523, + "step": 7946 + }, + { + "epoch": 0.3699979048816258, + "grad_norm": 0.41723335890215063, + "learning_rate": 9.98343423340982e-05, + "loss": 3.4897, + "step": 7947 + }, + { + "epoch": 0.3700444630677189, + "grad_norm": 0.43112336888745645, + "learning_rate": 9.98341219467568e-05, + "loss": 3.4027, + "step": 7948 + }, + { + "epoch": 0.37009102125381194, + "grad_norm": 0.47261700155638126, + "learning_rate": 9.983390141315719e-05, + "loss": 3.2905, + "step": 7949 + }, + { + "epoch": 0.370137579439905, + "grad_norm": 0.41285590889080326, + "learning_rate": 9.983368073329997e-05, + "loss": 3.3797, + "step": 7950 + }, + { + "epoch": 0.3701841376259981, + "grad_norm": 0.4322943419411539, + "learning_rate": 9.983345990718583e-05, + "loss": 3.4576, + "step": 7951 + }, + { + "epoch": 0.37023069581209117, + "grad_norm": 0.44972592094221564, + "learning_rate": 9.983323893481535e-05, + "loss": 3.4042, + "step": 7952 + }, + { + "epoch": 0.37027725399818423, + "grad_norm": 0.38195633495378456, + "learning_rate": 9.983301781618926e-05, + "loss": 3.4041, + "step": 7953 + }, + { + "epoch": 0.3703238121842773, + "grad_norm": 0.42398130761017044, + "learning_rate": 9.983279655130816e-05, + "loss": 3.3909, + "step": 7954 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 0.4047954245023595, + "learning_rate": 9.983257514017271e-05, + "loss": 3.3391, + "step": 7955 + }, + { + "epoch": 0.37041692855646347, + "grad_norm": 0.38555577354232984, + "learning_rate": 9.983235358278356e-05, + "loss": 3.3286, + "step": 7956 + }, + { + "epoch": 0.3704634867425565, + "grad_norm": 0.41045898552317056, + "learning_rate": 9.983213187914137e-05, + "loss": 3.427, + "step": 7957 + }, + { + "epoch": 0.3705100449286496, + "grad_norm": 0.44765013852089436, + "learning_rate": 9.983191002924678e-05, + "loss": 3.3639, + "step": 7958 + }, + { + "epoch": 0.37055660311474264, + "grad_norm": 0.3702577681994594, + "learning_rate": 9.983168803310044e-05, + "loss": 3.4402, + "step": 7959 + }, + { + "epoch": 0.3706031613008357, + "grad_norm": 0.41270342601438054, + "learning_rate": 9.9831465890703e-05, + "loss": 
3.4217, + "step": 7960 + }, + { + "epoch": 0.37064971948692876, + "grad_norm": 0.4086366928472405, + "learning_rate": 9.983124360205514e-05, + "loss": 3.3356, + "step": 7961 + }, + { + "epoch": 0.3706962776730219, + "grad_norm": 0.4048397457406348, + "learning_rate": 9.983102116715745e-05, + "loss": 3.4662, + "step": 7962 + }, + { + "epoch": 0.37074283585911494, + "grad_norm": 0.3925596491583229, + "learning_rate": 9.983079858601066e-05, + "loss": 3.3641, + "step": 7963 + }, + { + "epoch": 0.370789394045208, + "grad_norm": 0.38588124785673344, + "learning_rate": 9.983057585861537e-05, + "loss": 3.37, + "step": 7964 + }, + { + "epoch": 0.37083595223130106, + "grad_norm": 0.40537600934881085, + "learning_rate": 9.983035298497224e-05, + "loss": 3.3945, + "step": 7965 + }, + { + "epoch": 0.3708825104173941, + "grad_norm": 0.4794830589999777, + "learning_rate": 9.983012996508195e-05, + "loss": 3.5049, + "step": 7966 + }, + { + "epoch": 0.37092906860348723, + "grad_norm": 0.48729601731127925, + "learning_rate": 9.982990679894513e-05, + "loss": 3.3713, + "step": 7967 + }, + { + "epoch": 0.3709756267895803, + "grad_norm": 0.4413599131679562, + "learning_rate": 9.982968348656243e-05, + "loss": 3.4176, + "step": 7968 + }, + { + "epoch": 0.37102218497567335, + "grad_norm": 0.4585484496600623, + "learning_rate": 9.982946002793454e-05, + "loss": 3.4794, + "step": 7969 + }, + { + "epoch": 0.3710687431617664, + "grad_norm": 0.5749897662391428, + "learning_rate": 9.982923642306207e-05, + "loss": 3.4259, + "step": 7970 + }, + { + "epoch": 0.37111530134785947, + "grad_norm": 0.57013161181603, + "learning_rate": 9.982901267194572e-05, + "loss": 3.3597, + "step": 7971 + }, + { + "epoch": 0.37116185953395253, + "grad_norm": 0.5583437214660799, + "learning_rate": 9.982878877458613e-05, + "loss": 3.4534, + "step": 7972 + }, + { + "epoch": 0.37120841772004565, + "grad_norm": 0.4960129983131131, + "learning_rate": 9.982856473098395e-05, + "loss": 3.4018, + "step": 7973 + }, + { + "epoch": 0.3712549759061387, + "grad_norm": 0.4188677979699001, + "learning_rate": 9.982834054113982e-05, + "loss": 3.4369, + "step": 7974 + }, + { + "epoch": 0.37130153409223177, + "grad_norm": 0.4862528569302521, + "learning_rate": 9.982811620505443e-05, + "loss": 3.3418, + "step": 7975 + }, + { + "epoch": 0.3713480922783248, + "grad_norm": 0.46000804241726173, + "learning_rate": 9.982789172272845e-05, + "loss": 3.3245, + "step": 7976 + }, + { + "epoch": 0.3713946504644179, + "grad_norm": 0.43656674989588873, + "learning_rate": 9.98276670941625e-05, + "loss": 3.36, + "step": 7977 + }, + { + "epoch": 0.371441208650511, + "grad_norm": 0.48027368840766793, + "learning_rate": 9.982744231935724e-05, + "loss": 3.3315, + "step": 7978 + }, + { + "epoch": 0.37148776683660406, + "grad_norm": 0.43595544873881314, + "learning_rate": 9.982721739831337e-05, + "loss": 3.3345, + "step": 7979 + }, + { + "epoch": 0.3715343250226971, + "grad_norm": 0.4325370037183413, + "learning_rate": 9.98269923310315e-05, + "loss": 3.2989, + "step": 7980 + }, + { + "epoch": 0.3715808832087902, + "grad_norm": 0.5041882444240988, + "learning_rate": 9.982676711751234e-05, + "loss": 3.3471, + "step": 7981 + }, + { + "epoch": 0.37162744139488324, + "grad_norm": 0.4489994328031652, + "learning_rate": 9.982654175775652e-05, + "loss": 3.4119, + "step": 7982 + }, + { + "epoch": 0.3716739995809763, + "grad_norm": 0.4219014089219267, + "learning_rate": 9.98263162517647e-05, + "loss": 3.4097, + "step": 7983 + }, + { + "epoch": 0.3717205577670694, + "grad_norm": 0.4603952261596678, 
+ "learning_rate": 9.982609059953759e-05, + "loss": 3.4735, + "step": 7984 + }, + { + "epoch": 0.3717671159531625, + "grad_norm": 0.45450137557595044, + "learning_rate": 9.982586480107576e-05, + "loss": 3.3952, + "step": 7985 + }, + { + "epoch": 0.37181367413925553, + "grad_norm": 0.49023919586214565, + "learning_rate": 9.982563885637996e-05, + "loss": 3.3586, + "step": 7986 + }, + { + "epoch": 0.3718602323253486, + "grad_norm": 0.4936164395959368, + "learning_rate": 9.98254127654508e-05, + "loss": 3.4638, + "step": 7987 + }, + { + "epoch": 0.37190679051144165, + "grad_norm": 0.47581384906672297, + "learning_rate": 9.982518652828896e-05, + "loss": 3.4245, + "step": 7988 + }, + { + "epoch": 0.37195334869753477, + "grad_norm": 0.4072743316233488, + "learning_rate": 9.982496014489513e-05, + "loss": 3.4422, + "step": 7989 + }, + { + "epoch": 0.37199990688362783, + "grad_norm": 0.4597742584329251, + "learning_rate": 9.982473361526993e-05, + "loss": 3.4338, + "step": 7990 + }, + { + "epoch": 0.3720464650697209, + "grad_norm": 0.40408272260278966, + "learning_rate": 9.982450693941406e-05, + "loss": 3.4083, + "step": 7991 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.3975321678831782, + "learning_rate": 9.982428011732815e-05, + "loss": 3.357, + "step": 7992 + }, + { + "epoch": 0.372139581441907, + "grad_norm": 0.4617765268764642, + "learning_rate": 9.98240531490129e-05, + "loss": 3.3966, + "step": 7993 + }, + { + "epoch": 0.37218613962800007, + "grad_norm": 0.4225755850189732, + "learning_rate": 9.982382603446896e-05, + "loss": 3.2653, + "step": 7994 + }, + { + "epoch": 0.3722326978140932, + "grad_norm": 0.41779233241285946, + "learning_rate": 9.9823598773697e-05, + "loss": 3.3339, + "step": 7995 + }, + { + "epoch": 0.37227925600018624, + "grad_norm": 0.4416116859198915, + "learning_rate": 9.982337136669768e-05, + "loss": 3.373, + "step": 7996 + }, + { + "epoch": 0.3723258141862793, + "grad_norm": 0.47298670176899565, + "learning_rate": 9.98231438134717e-05, + "loss": 3.4998, + "step": 7997 + }, + { + "epoch": 0.37237237237237236, + "grad_norm": 0.3770295605168403, + "learning_rate": 9.982291611401965e-05, + "loss": 3.3231, + "step": 7998 + }, + { + "epoch": 0.3724189305584654, + "grad_norm": 0.4404135198977067, + "learning_rate": 9.98226882683423e-05, + "loss": 3.2974, + "step": 7999 + }, + { + "epoch": 0.37246548874455854, + "grad_norm": 0.5307857347257205, + "learning_rate": 9.982246027644025e-05, + "loss": 3.3535, + "step": 8000 + }, + { + "epoch": 0.3725120469306516, + "grad_norm": 0.45454034084686135, + "learning_rate": 9.982223213831418e-05, + "loss": 3.2473, + "step": 8001 + }, + { + "epoch": 0.37255860511674466, + "grad_norm": 0.4074749063122811, + "learning_rate": 9.982200385396476e-05, + "loss": 3.3032, + "step": 8002 + }, + { + "epoch": 0.3726051633028377, + "grad_norm": 0.5287149844898378, + "learning_rate": 9.982177542339268e-05, + "loss": 3.4249, + "step": 8003 + }, + { + "epoch": 0.3726517214889308, + "grad_norm": 0.490501318118284, + "learning_rate": 9.982154684659861e-05, + "loss": 3.2814, + "step": 8004 + }, + { + "epoch": 0.37269827967502384, + "grad_norm": 0.38962591844199135, + "learning_rate": 9.982131812358319e-05, + "loss": 3.3947, + "step": 8005 + }, + { + "epoch": 0.37274483786111695, + "grad_norm": 0.5196181634651894, + "learning_rate": 9.982108925434711e-05, + "loss": 3.4132, + "step": 8006 + }, + { + "epoch": 0.37279139604721, + "grad_norm": 0.5133040778013368, + "learning_rate": 9.982086023889104e-05, + "loss": 3.3105, + "step": 8007 + }, + { + "epoch": 
0.37283795423330307, + "grad_norm": 0.40701870496066167, + "learning_rate": 9.982063107721566e-05, + "loss": 3.5657, + "step": 8008 + }, + { + "epoch": 0.37288451241939613, + "grad_norm": 0.5165940885793407, + "learning_rate": 9.982040176932163e-05, + "loss": 3.4113, + "step": 8009 + }, + { + "epoch": 0.3729310706054892, + "grad_norm": 0.4888659734651894, + "learning_rate": 9.982017231520963e-05, + "loss": 3.4327, + "step": 8010 + }, + { + "epoch": 0.3729776287915823, + "grad_norm": 0.47976845504759935, + "learning_rate": 9.981994271488035e-05, + "loss": 3.3153, + "step": 8011 + }, + { + "epoch": 0.37302418697767536, + "grad_norm": 0.5205518508323954, + "learning_rate": 9.981971296833444e-05, + "loss": 3.3184, + "step": 8012 + }, + { + "epoch": 0.3730707451637684, + "grad_norm": 0.574458866592295, + "learning_rate": 9.981948307557256e-05, + "loss": 3.3414, + "step": 8013 + }, + { + "epoch": 0.3731173033498615, + "grad_norm": 0.590609735396122, + "learning_rate": 9.981925303659543e-05, + "loss": 3.3611, + "step": 8014 + }, + { + "epoch": 0.37316386153595454, + "grad_norm": 0.44820018639969234, + "learning_rate": 9.981902285140367e-05, + "loss": 3.3895, + "step": 8015 + }, + { + "epoch": 0.3732104197220476, + "grad_norm": 0.47515676040512994, + "learning_rate": 9.981879251999802e-05, + "loss": 3.5009, + "step": 8016 + }, + { + "epoch": 0.3732569779081407, + "grad_norm": 0.5705559263683895, + "learning_rate": 9.981856204237911e-05, + "loss": 3.4922, + "step": 8017 + }, + { + "epoch": 0.3733035360942338, + "grad_norm": 0.4721811565366867, + "learning_rate": 9.981833141854762e-05, + "loss": 3.4031, + "step": 8018 + }, + { + "epoch": 0.37335009428032684, + "grad_norm": 0.47727608416319506, + "learning_rate": 9.981810064850423e-05, + "loss": 3.4465, + "step": 8019 + }, + { + "epoch": 0.3733966524664199, + "grad_norm": 0.49070149805072716, + "learning_rate": 9.981786973224964e-05, + "loss": 3.4107, + "step": 8020 + }, + { + "epoch": 0.37344321065251296, + "grad_norm": 0.5011316975416281, + "learning_rate": 9.981763866978451e-05, + "loss": 3.4916, + "step": 8021 + }, + { + "epoch": 0.3734897688386061, + "grad_norm": 0.5166383983703239, + "learning_rate": 9.98174074611095e-05, + "loss": 3.3528, + "step": 8022 + }, + { + "epoch": 0.37353632702469913, + "grad_norm": 0.5120148651080956, + "learning_rate": 9.981717610622532e-05, + "loss": 3.214, + "step": 8023 + }, + { + "epoch": 0.3735828852107922, + "grad_norm": 0.4650244959055296, + "learning_rate": 9.981694460513263e-05, + "loss": 3.4223, + "step": 8024 + }, + { + "epoch": 0.37362944339688525, + "grad_norm": 0.4934077674891122, + "learning_rate": 9.981671295783213e-05, + "loss": 3.3246, + "step": 8025 + }, + { + "epoch": 0.3736760015829783, + "grad_norm": 0.5336588987893278, + "learning_rate": 9.981648116432447e-05, + "loss": 3.5077, + "step": 8026 + }, + { + "epoch": 0.37372255976907137, + "grad_norm": 0.4654009117430244, + "learning_rate": 9.981624922461034e-05, + "loss": 3.3007, + "step": 8027 + }, + { + "epoch": 0.3737691179551645, + "grad_norm": 0.3895811927523467, + "learning_rate": 9.981601713869044e-05, + "loss": 3.302, + "step": 8028 + }, + { + "epoch": 0.37381567614125755, + "grad_norm": 0.48486115399962937, + "learning_rate": 9.981578490656544e-05, + "loss": 3.411, + "step": 8029 + }, + { + "epoch": 0.3738622343273506, + "grad_norm": 0.5085278671850253, + "learning_rate": 9.9815552528236e-05, + "loss": 3.2901, + "step": 8030 + }, + { + "epoch": 0.37390879251344367, + "grad_norm": 0.4627677206747766, + "learning_rate": 
9.981532000370284e-05, + "loss": 3.4833, + "step": 8031 + }, + { + "epoch": 0.3739553506995367, + "grad_norm": 0.5092091454814208, + "learning_rate": 9.981508733296662e-05, + "loss": 3.4522, + "step": 8032 + }, + { + "epoch": 0.37400190888562984, + "grad_norm": 0.5072593447553648, + "learning_rate": 9.981485451602802e-05, + "loss": 3.3748, + "step": 8033 + }, + { + "epoch": 0.3740484670717229, + "grad_norm": 0.4190319834939965, + "learning_rate": 9.981462155288773e-05, + "loss": 3.2642, + "step": 8034 + }, + { + "epoch": 0.37409502525781596, + "grad_norm": 0.46035292727595045, + "learning_rate": 9.981438844354644e-05, + "loss": 3.5097, + "step": 8035 + }, + { + "epoch": 0.374141583443909, + "grad_norm": 0.5137277727937224, + "learning_rate": 9.981415518800484e-05, + "loss": 3.3201, + "step": 8036 + }, + { + "epoch": 0.3741881416300021, + "grad_norm": 0.4944671546065648, + "learning_rate": 9.981392178626356e-05, + "loss": 3.357, + "step": 8037 + }, + { + "epoch": 0.37423469981609514, + "grad_norm": 0.5142383706624393, + "learning_rate": 9.981368823832336e-05, + "loss": 3.3406, + "step": 8038 + }, + { + "epoch": 0.37428125800218826, + "grad_norm": 0.5028782684734316, + "learning_rate": 9.98134545441849e-05, + "loss": 3.3462, + "step": 8039 + }, + { + "epoch": 0.3743278161882813, + "grad_norm": 0.5322877448909464, + "learning_rate": 9.981322070384883e-05, + "loss": 3.5365, + "step": 8040 + }, + { + "epoch": 0.3743743743743744, + "grad_norm": 0.4687297025785532, + "learning_rate": 9.981298671731589e-05, + "loss": 3.4158, + "step": 8041 + }, + { + "epoch": 0.37442093256046743, + "grad_norm": 0.5279656608399498, + "learning_rate": 9.981275258458672e-05, + "loss": 3.3922, + "step": 8042 + }, + { + "epoch": 0.3744674907465605, + "grad_norm": 0.4959847859417606, + "learning_rate": 9.981251830566203e-05, + "loss": 3.3419, + "step": 8043 + }, + { + "epoch": 0.3745140489326536, + "grad_norm": 0.45903876120937026, + "learning_rate": 9.981228388054253e-05, + "loss": 3.3416, + "step": 8044 + }, + { + "epoch": 0.37456060711874667, + "grad_norm": 0.43552507484682845, + "learning_rate": 9.981204930922886e-05, + "loss": 3.3148, + "step": 8045 + }, + { + "epoch": 0.37460716530483973, + "grad_norm": 0.4031427908251664, + "learning_rate": 9.981181459172174e-05, + "loss": 3.3258, + "step": 8046 + }, + { + "epoch": 0.3746537234909328, + "grad_norm": 0.44616016896350225, + "learning_rate": 9.981157972802185e-05, + "loss": 3.3882, + "step": 8047 + }, + { + "epoch": 0.37470028167702585, + "grad_norm": 0.4434186427841035, + "learning_rate": 9.981134471812988e-05, + "loss": 3.2584, + "step": 8048 + }, + { + "epoch": 0.3747468398631189, + "grad_norm": 0.436436214000013, + "learning_rate": 9.981110956204654e-05, + "loss": 3.3595, + "step": 8049 + }, + { + "epoch": 0.374793398049212, + "grad_norm": 0.406279394620096, + "learning_rate": 9.981087425977247e-05, + "loss": 3.3911, + "step": 8050 + }, + { + "epoch": 0.3748399562353051, + "grad_norm": 0.46158694471281086, + "learning_rate": 9.981063881130841e-05, + "loss": 3.4094, + "step": 8051 + }, + { + "epoch": 0.37488651442139814, + "grad_norm": 0.4306781462578813, + "learning_rate": 9.981040321665502e-05, + "loss": 3.3596, + "step": 8052 + }, + { + "epoch": 0.3749330726074912, + "grad_norm": 0.43960724099144655, + "learning_rate": 9.981016747581302e-05, + "loss": 3.3686, + "step": 8053 + }, + { + "epoch": 0.37497963079358426, + "grad_norm": 0.4925288658920686, + "learning_rate": 9.980993158878306e-05, + "loss": 3.4065, + "step": 8054 + }, + { + "epoch": 
0.3750261889796774, + "grad_norm": 0.43532218133671513, + "learning_rate": 9.98096955555659e-05, + "loss": 3.4445, + "step": 8055 + }, + { + "epoch": 0.37507274716577044, + "grad_norm": 0.4475255730911559, + "learning_rate": 9.980945937616216e-05, + "loss": 3.3954, + "step": 8056 + }, + { + "epoch": 0.3751193053518635, + "grad_norm": 0.49565213524902674, + "learning_rate": 9.980922305057257e-05, + "loss": 3.3868, + "step": 8057 + }, + { + "epoch": 0.37516586353795656, + "grad_norm": 0.4798141487214056, + "learning_rate": 9.98089865787978e-05, + "loss": 3.4231, + "step": 8058 + }, + { + "epoch": 0.3752124217240496, + "grad_norm": 0.43504129599610186, + "learning_rate": 9.980874996083859e-05, + "loss": 3.4501, + "step": 8059 + }, + { + "epoch": 0.3752589799101427, + "grad_norm": 0.42166924512713694, + "learning_rate": 9.980851319669559e-05, + "loss": 3.3477, + "step": 8060 + }, + { + "epoch": 0.3753055380962358, + "grad_norm": 0.4981281213837146, + "learning_rate": 9.980827628636952e-05, + "loss": 3.3738, + "step": 8061 + }, + { + "epoch": 0.37535209628232885, + "grad_norm": 0.5052435920853088, + "learning_rate": 9.980803922986106e-05, + "loss": 3.4086, + "step": 8062 + }, + { + "epoch": 0.3753986544684219, + "grad_norm": 0.5240871102672496, + "learning_rate": 9.98078020271709e-05, + "loss": 3.449, + "step": 8063 + }, + { + "epoch": 0.37544521265451497, + "grad_norm": 0.4698564219660281, + "learning_rate": 9.980756467829977e-05, + "loss": 3.4208, + "step": 8064 + }, + { + "epoch": 0.37549177084060803, + "grad_norm": 0.4593252296452997, + "learning_rate": 9.980732718324834e-05, + "loss": 3.4091, + "step": 8065 + }, + { + "epoch": 0.37553832902670115, + "grad_norm": 0.47257114293739305, + "learning_rate": 9.980708954201729e-05, + "loss": 3.4277, + "step": 8066 + }, + { + "epoch": 0.3755848872127942, + "grad_norm": 0.5670003323666758, + "learning_rate": 9.980685175460737e-05, + "loss": 3.46, + "step": 8067 + }, + { + "epoch": 0.37563144539888726, + "grad_norm": 0.5364249878508845, + "learning_rate": 9.980661382101924e-05, + "loss": 3.3548, + "step": 8068 + }, + { + "epoch": 0.3756780035849803, + "grad_norm": 0.42627912048667366, + "learning_rate": 9.980637574125358e-05, + "loss": 3.3157, + "step": 8069 + }, + { + "epoch": 0.3757245617710734, + "grad_norm": 0.44734191728092265, + "learning_rate": 9.980613751531114e-05, + "loss": 3.4472, + "step": 8070 + }, + { + "epoch": 0.37577111995716644, + "grad_norm": 0.4521355611650669, + "learning_rate": 9.980589914319258e-05, + "loss": 3.3047, + "step": 8071 + }, + { + "epoch": 0.37581767814325956, + "grad_norm": 0.422973942711834, + "learning_rate": 9.980566062489863e-05, + "loss": 3.5148, + "step": 8072 + }, + { + "epoch": 0.3758642363293526, + "grad_norm": 0.4404276153057386, + "learning_rate": 9.980542196042995e-05, + "loss": 3.3673, + "step": 8073 + }, + { + "epoch": 0.3759107945154457, + "grad_norm": 0.5098015719547468, + "learning_rate": 9.980518314978726e-05, + "loss": 3.4927, + "step": 8074 + }, + { + "epoch": 0.37595735270153874, + "grad_norm": 0.4914825905918648, + "learning_rate": 9.980494419297128e-05, + "loss": 3.2617, + "step": 8075 + }, + { + "epoch": 0.3760039108876318, + "grad_norm": 0.5301109471237124, + "learning_rate": 9.980470508998269e-05, + "loss": 3.4861, + "step": 8076 + }, + { + "epoch": 0.3760504690737249, + "grad_norm": 0.5065864163165195, + "learning_rate": 9.980446584082221e-05, + "loss": 3.4334, + "step": 8077 + }, + { + "epoch": 0.376097027259818, + "grad_norm": 0.49754341953792325, + "learning_rate": 
9.980422644549053e-05, + "loss": 3.3225, + "step": 8078 + }, + { + "epoch": 0.37614358544591103, + "grad_norm": 0.5285210544872564, + "learning_rate": 9.980398690398833e-05, + "loss": 3.4518, + "step": 8079 + }, + { + "epoch": 0.3761901436320041, + "grad_norm": 0.5378970497039107, + "learning_rate": 9.980374721631635e-05, + "loss": 3.383, + "step": 8080 + }, + { + "epoch": 0.37623670181809715, + "grad_norm": 0.5765578194451709, + "learning_rate": 9.980350738247527e-05, + "loss": 3.3632, + "step": 8081 + }, + { + "epoch": 0.3762832600041902, + "grad_norm": 0.5250051515378282, + "learning_rate": 9.98032674024658e-05, + "loss": 3.3728, + "step": 8082 + }, + { + "epoch": 0.3763298181902833, + "grad_norm": 0.5586960719745616, + "learning_rate": 9.980302727628866e-05, + "loss": 3.3475, + "step": 8083 + }, + { + "epoch": 0.3763763763763764, + "grad_norm": 0.5694038507608408, + "learning_rate": 9.980278700394455e-05, + "loss": 3.3581, + "step": 8084 + }, + { + "epoch": 0.37642293456246945, + "grad_norm": 0.4741120696751015, + "learning_rate": 9.980254658543413e-05, + "loss": 3.3493, + "step": 8085 + }, + { + "epoch": 0.3764694927485625, + "grad_norm": 0.47827523358368546, + "learning_rate": 9.980230602075818e-05, + "loss": 3.3607, + "step": 8086 + }, + { + "epoch": 0.37651605093465557, + "grad_norm": 0.4864038763032028, + "learning_rate": 9.980206530991734e-05, + "loss": 3.3896, + "step": 8087 + }, + { + "epoch": 0.3765626091207487, + "grad_norm": 0.4914226359379937, + "learning_rate": 9.980182445291236e-05, + "loss": 3.2942, + "step": 8088 + }, + { + "epoch": 0.37660916730684174, + "grad_norm": 0.42432134320686915, + "learning_rate": 9.980158344974392e-05, + "loss": 3.3671, + "step": 8089 + }, + { + "epoch": 0.3766557254929348, + "grad_norm": 0.4786844808261259, + "learning_rate": 9.980134230041275e-05, + "loss": 3.4095, + "step": 8090 + }, + { + "epoch": 0.37670228367902786, + "grad_norm": 0.4648757089741418, + "learning_rate": 9.980110100491955e-05, + "loss": 3.3141, + "step": 8091 + }, + { + "epoch": 0.3767488418651209, + "grad_norm": 0.4518798258161405, + "learning_rate": 9.980085956326502e-05, + "loss": 3.4521, + "step": 8092 + }, + { + "epoch": 0.376795400051214, + "grad_norm": 0.430893139923428, + "learning_rate": 9.980061797544987e-05, + "loss": 3.3624, + "step": 8093 + }, + { + "epoch": 0.3768419582373071, + "grad_norm": 0.4663357304968742, + "learning_rate": 9.98003762414748e-05, + "loss": 3.4473, + "step": 8094 + }, + { + "epoch": 0.37688851642340016, + "grad_norm": 0.49557113227031907, + "learning_rate": 9.980013436134055e-05, + "loss": 3.3349, + "step": 8095 + }, + { + "epoch": 0.3769350746094932, + "grad_norm": 0.5195962396487445, + "learning_rate": 9.979989233504781e-05, + "loss": 3.3164, + "step": 8096 + }, + { + "epoch": 0.3769816327955863, + "grad_norm": 0.420764771805895, + "learning_rate": 9.979965016259729e-05, + "loss": 3.4216, + "step": 8097 + }, + { + "epoch": 0.37702819098167933, + "grad_norm": 0.4635505460544169, + "learning_rate": 9.97994078439897e-05, + "loss": 3.362, + "step": 8098 + }, + { + "epoch": 0.37707474916777245, + "grad_norm": 0.5467870317031209, + "learning_rate": 9.979916537922576e-05, + "loss": 3.4076, + "step": 8099 + }, + { + "epoch": 0.3771213073538655, + "grad_norm": 0.4286336404860123, + "learning_rate": 9.979892276830617e-05, + "loss": 3.3901, + "step": 8100 + }, + { + "epoch": 0.37716786553995857, + "grad_norm": 0.5022674064893203, + "learning_rate": 9.979868001123165e-05, + "loss": 3.4344, + "step": 8101 + }, + { + "epoch": 0.37721442372605163, + 
"grad_norm": 0.49944523155915627, + "learning_rate": 9.97984371080029e-05, + "loss": 3.3762, + "step": 8102 + }, + { + "epoch": 0.3772609819121447, + "grad_norm": 0.4682477266853373, + "learning_rate": 9.979819405862065e-05, + "loss": 3.4785, + "step": 8103 + }, + { + "epoch": 0.37730754009823775, + "grad_norm": 0.46703620270079577, + "learning_rate": 9.979795086308562e-05, + "loss": 3.4076, + "step": 8104 + }, + { + "epoch": 0.37735409828433086, + "grad_norm": 0.39876666052901066, + "learning_rate": 9.97977075213985e-05, + "loss": 3.3683, + "step": 8105 + }, + { + "epoch": 0.3774006564704239, + "grad_norm": 0.4825785509698149, + "learning_rate": 9.979746403356001e-05, + "loss": 3.4149, + "step": 8106 + }, + { + "epoch": 0.377447214656517, + "grad_norm": 0.4303010575653195, + "learning_rate": 9.979722039957086e-05, + "loss": 3.431, + "step": 8107 + }, + { + "epoch": 0.37749377284261004, + "grad_norm": 0.46961463084732363, + "learning_rate": 9.97969766194318e-05, + "loss": 3.4666, + "step": 8108 + }, + { + "epoch": 0.3775403310287031, + "grad_norm": 0.5130842114660069, + "learning_rate": 9.979673269314351e-05, + "loss": 3.5149, + "step": 8109 + }, + { + "epoch": 0.3775868892147962, + "grad_norm": 0.47818793623800987, + "learning_rate": 9.979648862070671e-05, + "loss": 3.3326, + "step": 8110 + }, + { + "epoch": 0.3776334474008893, + "grad_norm": 0.4338507494063344, + "learning_rate": 9.979624440212212e-05, + "loss": 3.4729, + "step": 8111 + }, + { + "epoch": 0.37768000558698234, + "grad_norm": 0.4343791692777879, + "learning_rate": 9.979600003739046e-05, + "loss": 3.2535, + "step": 8112 + }, + { + "epoch": 0.3777265637730754, + "grad_norm": 0.47209811567980714, + "learning_rate": 9.979575552651246e-05, + "loss": 3.4141, + "step": 8113 + }, + { + "epoch": 0.37777312195916846, + "grad_norm": 0.44552037125458804, + "learning_rate": 9.979551086948879e-05, + "loss": 3.3621, + "step": 8114 + }, + { + "epoch": 0.3778196801452615, + "grad_norm": 0.45129353858275767, + "learning_rate": 9.979526606632024e-05, + "loss": 3.4462, + "step": 8115 + }, + { + "epoch": 0.37786623833135463, + "grad_norm": 0.4591893721109368, + "learning_rate": 9.979502111700746e-05, + "loss": 3.4164, + "step": 8116 + }, + { + "epoch": 0.3779127965174477, + "grad_norm": 0.4929100183808137, + "learning_rate": 9.97947760215512e-05, + "loss": 3.3508, + "step": 8117 + }, + { + "epoch": 0.37795935470354075, + "grad_norm": 0.5401657997622226, + "learning_rate": 9.97945307799522e-05, + "loss": 3.3923, + "step": 8118 + }, + { + "epoch": 0.3780059128896338, + "grad_norm": 0.4764917133220802, + "learning_rate": 9.979428539221114e-05, + "loss": 3.3312, + "step": 8119 + }, + { + "epoch": 0.37805247107572687, + "grad_norm": 0.4800879239824236, + "learning_rate": 9.979403985832877e-05, + "loss": 3.3986, + "step": 8120 + }, + { + "epoch": 0.37809902926182, + "grad_norm": 0.4163706648600143, + "learning_rate": 9.979379417830579e-05, + "loss": 3.3279, + "step": 8121 + }, + { + "epoch": 0.37814558744791305, + "grad_norm": 0.46658879464485425, + "learning_rate": 9.979354835214295e-05, + "loss": 3.4761, + "step": 8122 + }, + { + "epoch": 0.3781921456340061, + "grad_norm": 0.49849907481457384, + "learning_rate": 9.979330237984092e-05, + "loss": 3.3438, + "step": 8123 + }, + { + "epoch": 0.37823870382009916, + "grad_norm": 0.44955011836384934, + "learning_rate": 9.979305626140046e-05, + "loss": 3.2994, + "step": 8124 + }, + { + "epoch": 0.3782852620061922, + "grad_norm": 0.41294337749087373, + "learning_rate": 9.97928099968223e-05, + "loss": 3.341, 
+ "step": 8125 + }, + { + "epoch": 0.3783318201922853, + "grad_norm": 0.5022469588767303, + "learning_rate": 9.979256358610714e-05, + "loss": 3.4961, + "step": 8126 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 0.4698108666298687, + "learning_rate": 9.97923170292557e-05, + "loss": 3.4763, + "step": 8127 + }, + { + "epoch": 0.37842493656447146, + "grad_norm": 0.45193313919157935, + "learning_rate": 9.979207032626873e-05, + "loss": 3.2851, + "step": 8128 + }, + { + "epoch": 0.3784714947505645, + "grad_norm": 0.44681374230196325, + "learning_rate": 9.979182347714693e-05, + "loss": 3.3276, + "step": 8129 + }, + { + "epoch": 0.3785180529366576, + "grad_norm": 0.42045158827539336, + "learning_rate": 9.979157648189104e-05, + "loss": 3.219, + "step": 8130 + }, + { + "epoch": 0.37856461112275064, + "grad_norm": 0.45723389251053653, + "learning_rate": 9.979132934050178e-05, + "loss": 3.3857, + "step": 8131 + }, + { + "epoch": 0.37861116930884375, + "grad_norm": 0.46905462420913185, + "learning_rate": 9.979108205297987e-05, + "loss": 3.4939, + "step": 8132 + }, + { + "epoch": 0.3786577274949368, + "grad_norm": 0.47164808524640095, + "learning_rate": 9.979083461932604e-05, + "loss": 3.437, + "step": 8133 + }, + { + "epoch": 0.3787042856810299, + "grad_norm": 0.44396819937565085, + "learning_rate": 9.9790587039541e-05, + "loss": 3.2838, + "step": 8134 + }, + { + "epoch": 0.37875084386712293, + "grad_norm": 0.41892849806429217, + "learning_rate": 9.979033931362551e-05, + "loss": 3.4606, + "step": 8135 + }, + { + "epoch": 0.378797402053216, + "grad_norm": 0.46076752481540734, + "learning_rate": 9.979009144158028e-05, + "loss": 3.3526, + "step": 8136 + }, + { + "epoch": 0.37884396023930905, + "grad_norm": 0.45767967442914104, + "learning_rate": 9.978984342340603e-05, + "loss": 3.358, + "step": 8137 + }, + { + "epoch": 0.37889051842540217, + "grad_norm": 0.4733238871763479, + "learning_rate": 9.97895952591035e-05, + "loss": 3.4914, + "step": 8138 + }, + { + "epoch": 0.3789370766114952, + "grad_norm": 0.44192044000802444, + "learning_rate": 9.978934694867341e-05, + "loss": 3.3741, + "step": 8139 + }, + { + "epoch": 0.3789836347975883, + "grad_norm": 0.4635666785602867, + "learning_rate": 9.97890984921165e-05, + "loss": 3.3651, + "step": 8140 + }, + { + "epoch": 0.37903019298368135, + "grad_norm": 0.4224535594542161, + "learning_rate": 9.978884988943348e-05, + "loss": 3.3505, + "step": 8141 + }, + { + "epoch": 0.3790767511697744, + "grad_norm": 0.49319044186907157, + "learning_rate": 9.97886011406251e-05, + "loss": 3.309, + "step": 8142 + }, + { + "epoch": 0.3791233093558675, + "grad_norm": 0.4304180112388974, + "learning_rate": 9.978835224569206e-05, + "loss": 3.4597, + "step": 8143 + }, + { + "epoch": 0.3791698675419606, + "grad_norm": 0.4289596422421342, + "learning_rate": 9.978810320463513e-05, + "loss": 3.3901, + "step": 8144 + }, + { + "epoch": 0.37921642572805364, + "grad_norm": 0.5010437345356461, + "learning_rate": 9.978785401745502e-05, + "loss": 3.3938, + "step": 8145 + }, + { + "epoch": 0.3792629839141467, + "grad_norm": 0.49868735608223996, + "learning_rate": 9.978760468415245e-05, + "loss": 3.3235, + "step": 8146 + }, + { + "epoch": 0.37930954210023976, + "grad_norm": 0.4599145752054486, + "learning_rate": 9.978735520472818e-05, + "loss": 3.337, + "step": 8147 + }, + { + "epoch": 0.3793561002863328, + "grad_norm": 0.4361598567791671, + "learning_rate": 9.978710557918293e-05, + "loss": 3.4345, + "step": 8148 + }, + { + "epoch": 0.37940265847242594, + "grad_norm": 0.4373147850161267, + 
"learning_rate": 9.978685580751742e-05, + "loss": 3.3938, + "step": 8149 + }, + { + "epoch": 0.379449216658519, + "grad_norm": 0.43923524392116464, + "learning_rate": 9.97866058897324e-05, + "loss": 3.5095, + "step": 8150 + }, + { + "epoch": 0.37949577484461205, + "grad_norm": 0.4412447299794545, + "learning_rate": 9.97863558258286e-05, + "loss": 3.3791, + "step": 8151 + }, + { + "epoch": 0.3795423330307051, + "grad_norm": 0.42382144754941087, + "learning_rate": 9.978610561580673e-05, + "loss": 3.3435, + "step": 8152 + }, + { + "epoch": 0.3795888912167982, + "grad_norm": 0.4730269827680577, + "learning_rate": 9.978585525966757e-05, + "loss": 3.4184, + "step": 8153 + }, + { + "epoch": 0.3796354494028913, + "grad_norm": 0.42111348741523835, + "learning_rate": 9.978560475741181e-05, + "loss": 3.3992, + "step": 8154 + }, + { + "epoch": 0.37968200758898435, + "grad_norm": 0.4180463501022471, + "learning_rate": 9.978535410904022e-05, + "loss": 3.4746, + "step": 8155 + }, + { + "epoch": 0.3797285657750774, + "grad_norm": 0.46147591768790386, + "learning_rate": 9.978510331455351e-05, + "loss": 3.4112, + "step": 8156 + }, + { + "epoch": 0.37977512396117047, + "grad_norm": 0.4468635317286903, + "learning_rate": 9.978485237395243e-05, + "loss": 3.4636, + "step": 8157 + }, + { + "epoch": 0.37982168214726353, + "grad_norm": 0.4436641985092481, + "learning_rate": 9.97846012872377e-05, + "loss": 3.3724, + "step": 8158 + }, + { + "epoch": 0.3798682403333566, + "grad_norm": 0.4664241765946259, + "learning_rate": 9.978435005441009e-05, + "loss": 3.4047, + "step": 8159 + }, + { + "epoch": 0.3799147985194497, + "grad_norm": 0.4605712181563387, + "learning_rate": 9.97840986754703e-05, + "loss": 3.3747, + "step": 8160 + }, + { + "epoch": 0.37996135670554276, + "grad_norm": 0.4558418857123746, + "learning_rate": 9.978384715041911e-05, + "loss": 3.4443, + "step": 8161 + }, + { + "epoch": 0.3800079148916358, + "grad_norm": 0.4967976467244129, + "learning_rate": 9.97835954792572e-05, + "loss": 3.4208, + "step": 8162 + }, + { + "epoch": 0.3800544730777289, + "grad_norm": 0.4895237861873897, + "learning_rate": 9.978334366198536e-05, + "loss": 3.5048, + "step": 8163 + }, + { + "epoch": 0.38010103126382194, + "grad_norm": 0.4866699550372483, + "learning_rate": 9.978309169860432e-05, + "loss": 3.4332, + "step": 8164 + }, + { + "epoch": 0.38014758944991506, + "grad_norm": 0.42882620500706903, + "learning_rate": 9.97828395891148e-05, + "loss": 3.4974, + "step": 8165 + }, + { + "epoch": 0.3801941476360081, + "grad_norm": 0.4091097430827501, + "learning_rate": 9.978258733351756e-05, + "loss": 3.4276, + "step": 8166 + }, + { + "epoch": 0.3802407058221012, + "grad_norm": 0.4493608422219998, + "learning_rate": 9.978233493181331e-05, + "loss": 3.3678, + "step": 8167 + }, + { + "epoch": 0.38028726400819424, + "grad_norm": 0.4173201971913836, + "learning_rate": 9.978208238400282e-05, + "loss": 3.3654, + "step": 8168 + }, + { + "epoch": 0.3803338221942873, + "grad_norm": 0.43691308535378565, + "learning_rate": 9.978182969008683e-05, + "loss": 3.403, + "step": 8169 + }, + { + "epoch": 0.38038038038038036, + "grad_norm": 0.4639840459141553, + "learning_rate": 9.978157685006606e-05, + "loss": 3.4161, + "step": 8170 + }, + { + "epoch": 0.38042693856647347, + "grad_norm": 0.457291049205813, + "learning_rate": 9.978132386394128e-05, + "loss": 3.2651, + "step": 8171 + }, + { + "epoch": 0.38047349675256653, + "grad_norm": 0.46608177999571515, + "learning_rate": 9.978107073171321e-05, + "loss": 3.361, + "step": 8172 + }, + { + "epoch": 
0.3805200549386596, + "grad_norm": 0.417287411458697, + "learning_rate": 9.978081745338261e-05, + "loss": 3.1927, + "step": 8173 + }, + { + "epoch": 0.38056661312475265, + "grad_norm": 0.4039878213737477, + "learning_rate": 9.978056402895021e-05, + "loss": 3.3956, + "step": 8174 + }, + { + "epoch": 0.3806131713108457, + "grad_norm": 0.4293041349829115, + "learning_rate": 9.978031045841676e-05, + "loss": 3.4125, + "step": 8175 + }, + { + "epoch": 0.3806597294969388, + "grad_norm": 0.4920587221097095, + "learning_rate": 9.9780056741783e-05, + "loss": 3.5452, + "step": 8176 + }, + { + "epoch": 0.3807062876830319, + "grad_norm": 0.4424712437104194, + "learning_rate": 9.977980287904968e-05, + "loss": 3.3706, + "step": 8177 + }, + { + "epoch": 0.38075284586912495, + "grad_norm": 0.40696381717184527, + "learning_rate": 9.977954887021754e-05, + "loss": 3.3457, + "step": 8178 + }, + { + "epoch": 0.380799404055218, + "grad_norm": 0.48669741941744804, + "learning_rate": 9.977929471528733e-05, + "loss": 3.3072, + "step": 8179 + }, + { + "epoch": 0.38084596224131106, + "grad_norm": 0.506399396868313, + "learning_rate": 9.977904041425981e-05, + "loss": 3.3531, + "step": 8180 + }, + { + "epoch": 0.3808925204274041, + "grad_norm": 0.40477035845751763, + "learning_rate": 9.977878596713567e-05, + "loss": 3.3551, + "step": 8181 + }, + { + "epoch": 0.38093907861349724, + "grad_norm": 0.4967833458712645, + "learning_rate": 9.977853137391573e-05, + "loss": 3.325, + "step": 8182 + }, + { + "epoch": 0.3809856367995903, + "grad_norm": 0.5982688642847256, + "learning_rate": 9.97782766346007e-05, + "loss": 3.4138, + "step": 8183 + }, + { + "epoch": 0.38103219498568336, + "grad_norm": 0.5119263260011764, + "learning_rate": 9.977802174919133e-05, + "loss": 3.3747, + "step": 8184 + }, + { + "epoch": 0.3810787531717764, + "grad_norm": 0.45446038232160396, + "learning_rate": 9.977776671768837e-05, + "loss": 3.332, + "step": 8185 + }, + { + "epoch": 0.3811253113578695, + "grad_norm": 0.4175389482577835, + "learning_rate": 9.977751154009257e-05, + "loss": 3.4236, + "step": 8186 + }, + { + "epoch": 0.3811718695439626, + "grad_norm": 0.4252706438287943, + "learning_rate": 9.977725621640468e-05, + "loss": 3.3063, + "step": 8187 + }, + { + "epoch": 0.38121842773005565, + "grad_norm": 0.4994616822508929, + "learning_rate": 9.977700074662544e-05, + "loss": 3.3261, + "step": 8188 + }, + { + "epoch": 0.3812649859161487, + "grad_norm": 0.5083011746059053, + "learning_rate": 9.977674513075561e-05, + "loss": 3.3434, + "step": 8189 + }, + { + "epoch": 0.3813115441022418, + "grad_norm": 0.4374918088918759, + "learning_rate": 9.977648936879594e-05, + "loss": 3.403, + "step": 8190 + }, + { + "epoch": 0.38135810228833483, + "grad_norm": 0.4475403418949862, + "learning_rate": 9.977623346074716e-05, + "loss": 3.3435, + "step": 8191 + }, + { + "epoch": 0.3814046604744279, + "grad_norm": 0.48214380586713185, + "learning_rate": 9.977597740661006e-05, + "loss": 3.2818, + "step": 8192 + }, + { + "epoch": 0.381451218660521, + "grad_norm": 0.5160650913587495, + "learning_rate": 9.977572120638537e-05, + "loss": 3.2934, + "step": 8193 + }, + { + "epoch": 0.38149777684661407, + "grad_norm": 0.448134992210527, + "learning_rate": 9.977546486007381e-05, + "loss": 3.2196, + "step": 8194 + }, + { + "epoch": 0.3815443350327071, + "grad_norm": 0.479575916539114, + "learning_rate": 9.97752083676762e-05, + "loss": 3.3792, + "step": 8195 + }, + { + "epoch": 0.3815908932188002, + "grad_norm": 0.4755672743493593, + "learning_rate": 9.977495172919324e-05, + 
"loss": 3.3703, + "step": 8196 + }, + { + "epoch": 0.38163745140489325, + "grad_norm": 0.49956504811009433, + "learning_rate": 9.977469494462571e-05, + "loss": 3.3729, + "step": 8197 + }, + { + "epoch": 0.38168400959098636, + "grad_norm": 0.5016037681038654, + "learning_rate": 9.977443801397434e-05, + "loss": 3.4789, + "step": 8198 + }, + { + "epoch": 0.3817305677770794, + "grad_norm": 0.39675358115990744, + "learning_rate": 9.977418093723992e-05, + "loss": 3.4015, + "step": 8199 + }, + { + "epoch": 0.3817771259631725, + "grad_norm": 0.4729796305254134, + "learning_rate": 9.977392371442317e-05, + "loss": 3.3886, + "step": 8200 + }, + { + "epoch": 0.38182368414926554, + "grad_norm": 0.5272955992776203, + "learning_rate": 9.977366634552485e-05, + "loss": 3.4032, + "step": 8201 + }, + { + "epoch": 0.3818702423353586, + "grad_norm": 0.4845161711885219, + "learning_rate": 9.977340883054573e-05, + "loss": 3.4369, + "step": 8202 + }, + { + "epoch": 0.38191680052145166, + "grad_norm": 0.4414144199939191, + "learning_rate": 9.977315116948656e-05, + "loss": 3.3581, + "step": 8203 + }, + { + "epoch": 0.3819633587075448, + "grad_norm": 0.4563931427849311, + "learning_rate": 9.97728933623481e-05, + "loss": 3.3655, + "step": 8204 + }, + { + "epoch": 0.38200991689363784, + "grad_norm": 0.44213694597940123, + "learning_rate": 9.977263540913109e-05, + "loss": 3.4342, + "step": 8205 + }, + { + "epoch": 0.3820564750797309, + "grad_norm": 0.39501538808416803, + "learning_rate": 9.977237730983631e-05, + "loss": 3.4296, + "step": 8206 + }, + { + "epoch": 0.38210303326582395, + "grad_norm": 0.5241802558764813, + "learning_rate": 9.97721190644645e-05, + "loss": 3.4567, + "step": 8207 + }, + { + "epoch": 0.382149591451917, + "grad_norm": 0.49878930948124683, + "learning_rate": 9.977186067301642e-05, + "loss": 3.3363, + "step": 8208 + }, + { + "epoch": 0.38219614963801013, + "grad_norm": 0.4718334946122524, + "learning_rate": 9.977160213549284e-05, + "loss": 3.2652, + "step": 8209 + }, + { + "epoch": 0.3822427078241032, + "grad_norm": 0.4299406200315247, + "learning_rate": 9.97713434518945e-05, + "loss": 3.396, + "step": 8210 + }, + { + "epoch": 0.38228926601019625, + "grad_norm": 0.3874433860714183, + "learning_rate": 9.977108462222218e-05, + "loss": 3.3926, + "step": 8211 + }, + { + "epoch": 0.3823358241962893, + "grad_norm": 0.46509439522476576, + "learning_rate": 9.977082564647663e-05, + "loss": 3.3476, + "step": 8212 + }, + { + "epoch": 0.38238238238238237, + "grad_norm": 0.48593779837227125, + "learning_rate": 9.977056652465861e-05, + "loss": 3.3815, + "step": 8213 + }, + { + "epoch": 0.38242894056847543, + "grad_norm": 0.4797783819489583, + "learning_rate": 9.977030725676887e-05, + "loss": 3.3299, + "step": 8214 + }, + { + "epoch": 0.38247549875456854, + "grad_norm": 0.40199500196927634, + "learning_rate": 9.97700478428082e-05, + "loss": 3.3743, + "step": 8215 + }, + { + "epoch": 0.3825220569406616, + "grad_norm": 0.402366262936029, + "learning_rate": 9.976978828277732e-05, + "loss": 3.3299, + "step": 8216 + }, + { + "epoch": 0.38256861512675466, + "grad_norm": 0.3935062516642039, + "learning_rate": 9.976952857667703e-05, + "loss": 3.3503, + "step": 8217 + }, + { + "epoch": 0.3826151733128477, + "grad_norm": 0.41447313793736246, + "learning_rate": 9.976926872450807e-05, + "loss": 3.3142, + "step": 8218 + }, + { + "epoch": 0.3826617314989408, + "grad_norm": 0.42855748677892014, + "learning_rate": 9.976900872627122e-05, + "loss": 3.3404, + "step": 8219 + }, + { + "epoch": 0.3827082896850339, + "grad_norm": 
0.4101954661234242, + "learning_rate": 9.976874858196722e-05, + "loss": 3.3915, + "step": 8220 + }, + { + "epoch": 0.38275484787112696, + "grad_norm": 0.47958977752088416, + "learning_rate": 9.976848829159684e-05, + "loss": 3.3178, + "step": 8221 + }, + { + "epoch": 0.38280140605722, + "grad_norm": 0.4807259997804668, + "learning_rate": 9.976822785516087e-05, + "loss": 3.367, + "step": 8222 + }, + { + "epoch": 0.3828479642433131, + "grad_norm": 0.472411772687689, + "learning_rate": 9.976796727266005e-05, + "loss": 3.2667, + "step": 8223 + }, + { + "epoch": 0.38289452242940614, + "grad_norm": 0.4373589286610717, + "learning_rate": 9.976770654409513e-05, + "loss": 3.4151, + "step": 8224 + }, + { + "epoch": 0.3829410806154992, + "grad_norm": 0.4540645863776264, + "learning_rate": 9.976744566946692e-05, + "loss": 3.4066, + "step": 8225 + }, + { + "epoch": 0.3829876388015923, + "grad_norm": 0.5289579623185858, + "learning_rate": 9.976718464877615e-05, + "loss": 3.4525, + "step": 8226 + }, + { + "epoch": 0.38303419698768537, + "grad_norm": 0.5172842753925667, + "learning_rate": 9.976692348202359e-05, + "loss": 3.2863, + "step": 8227 + }, + { + "epoch": 0.38308075517377843, + "grad_norm": 0.45798813049320725, + "learning_rate": 9.976666216921001e-05, + "loss": 3.3839, + "step": 8228 + }, + { + "epoch": 0.3831273133598715, + "grad_norm": 0.5157220125368082, + "learning_rate": 9.976640071033619e-05, + "loss": 3.5236, + "step": 8229 + }, + { + "epoch": 0.38317387154596455, + "grad_norm": 0.5054890960108951, + "learning_rate": 9.976613910540288e-05, + "loss": 3.3613, + "step": 8230 + }, + { + "epoch": 0.38322042973205767, + "grad_norm": 0.47894290602433104, + "learning_rate": 9.976587735441085e-05, + "loss": 3.362, + "step": 8231 + }, + { + "epoch": 0.3832669879181507, + "grad_norm": 0.5108191744128625, + "learning_rate": 9.976561545736089e-05, + "loss": 3.436, + "step": 8232 + }, + { + "epoch": 0.3833135461042438, + "grad_norm": 0.4982578339062224, + "learning_rate": 9.976535341425373e-05, + "loss": 3.2906, + "step": 8233 + }, + { + "epoch": 0.38336010429033684, + "grad_norm": 0.43249659797136264, + "learning_rate": 9.976509122509018e-05, + "loss": 3.3003, + "step": 8234 + }, + { + "epoch": 0.3834066624764299, + "grad_norm": 0.4687891031415445, + "learning_rate": 9.976482888987098e-05, + "loss": 3.2086, + "step": 8235 + }, + { + "epoch": 0.38345322066252296, + "grad_norm": 0.4617869595784003, + "learning_rate": 9.976456640859691e-05, + "loss": 3.4136, + "step": 8236 + }, + { + "epoch": 0.3834997788486161, + "grad_norm": 0.4759718407202879, + "learning_rate": 9.976430378126874e-05, + "loss": 3.3605, + "step": 8237 + }, + { + "epoch": 0.38354633703470914, + "grad_norm": 0.46345823097760425, + "learning_rate": 9.976404100788725e-05, + "loss": 3.5005, + "step": 8238 + }, + { + "epoch": 0.3835928952208022, + "grad_norm": 0.5425592908450091, + "learning_rate": 9.976377808845319e-05, + "loss": 3.3792, + "step": 8239 + }, + { + "epoch": 0.38363945340689526, + "grad_norm": 0.5828220253352872, + "learning_rate": 9.976351502296735e-05, + "loss": 3.2976, + "step": 8240 + }, + { + "epoch": 0.3836860115929883, + "grad_norm": 0.5072366508370126, + "learning_rate": 9.976325181143049e-05, + "loss": 3.2783, + "step": 8241 + }, + { + "epoch": 0.38373256977908143, + "grad_norm": 0.5126647435569354, + "learning_rate": 9.976298845384339e-05, + "loss": 3.3735, + "step": 8242 + }, + { + "epoch": 0.3837791279651745, + "grad_norm": 0.5170587411909632, + "learning_rate": 9.976272495020682e-05, + "loss": 3.2978, + "step": 8243 
+ }, + { + "epoch": 0.38382568615126755, + "grad_norm": 0.495862395507141, + "learning_rate": 9.976246130052156e-05, + "loss": 3.281, + "step": 8244 + }, + { + "epoch": 0.3838722443373606, + "grad_norm": 0.4344830267186774, + "learning_rate": 9.976219750478839e-05, + "loss": 3.3682, + "step": 8245 + }, + { + "epoch": 0.3839188025234537, + "grad_norm": 0.4584158092435885, + "learning_rate": 9.976193356300805e-05, + "loss": 3.2851, + "step": 8246 + }, + { + "epoch": 0.38396536070954673, + "grad_norm": 0.4305059041091041, + "learning_rate": 9.976166947518136e-05, + "loss": 3.2102, + "step": 8247 + }, + { + "epoch": 0.38401191889563985, + "grad_norm": 0.43570389201770005, + "learning_rate": 9.976140524130905e-05, + "loss": 3.4666, + "step": 8248 + }, + { + "epoch": 0.3840584770817329, + "grad_norm": 0.45184343250932835, + "learning_rate": 9.976114086139192e-05, + "loss": 3.4044, + "step": 8249 + }, + { + "epoch": 0.38410503526782597, + "grad_norm": 0.4300598668716647, + "learning_rate": 9.976087633543077e-05, + "loss": 3.2822, + "step": 8250 + }, + { + "epoch": 0.384151593453919, + "grad_norm": 0.49424613360632547, + "learning_rate": 9.976061166342632e-05, + "loss": 3.2447, + "step": 8251 + }, + { + "epoch": 0.3841981516400121, + "grad_norm": 0.3907914240718665, + "learning_rate": 9.97603468453794e-05, + "loss": 3.298, + "step": 8252 + }, + { + "epoch": 0.3842447098261052, + "grad_norm": 0.4741277825464603, + "learning_rate": 9.976008188129075e-05, + "loss": 3.417, + "step": 8253 + }, + { + "epoch": 0.38429126801219826, + "grad_norm": 0.4358191598506758, + "learning_rate": 9.975981677116115e-05, + "loss": 3.291, + "step": 8254 + }, + { + "epoch": 0.3843378261982913, + "grad_norm": 0.4521473876187275, + "learning_rate": 9.975955151499141e-05, + "loss": 3.3174, + "step": 8255 + }, + { + "epoch": 0.3843843843843844, + "grad_norm": 0.44929919581625033, + "learning_rate": 9.975928611278229e-05, + "loss": 3.3984, + "step": 8256 + }, + { + "epoch": 0.38443094257047744, + "grad_norm": 0.4693510283358173, + "learning_rate": 9.975902056453456e-05, + "loss": 3.354, + "step": 8257 + }, + { + "epoch": 0.3844775007565705, + "grad_norm": 0.542407459293892, + "learning_rate": 9.975875487024901e-05, + "loss": 3.3731, + "step": 8258 + }, + { + "epoch": 0.3845240589426636, + "grad_norm": 0.4593208283895434, + "learning_rate": 9.975848902992642e-05, + "loss": 3.3429, + "step": 8259 + }, + { + "epoch": 0.3845706171287567, + "grad_norm": 0.4400927127465638, + "learning_rate": 9.975822304356755e-05, + "loss": 3.3542, + "step": 8260 + }, + { + "epoch": 0.38461717531484974, + "grad_norm": 0.4930827642555521, + "learning_rate": 9.97579569111732e-05, + "loss": 3.3903, + "step": 8261 + }, + { + "epoch": 0.3846637335009428, + "grad_norm": 0.47967324531975425, + "learning_rate": 9.975769063274416e-05, + "loss": 3.4709, + "step": 8262 + }, + { + "epoch": 0.38471029168703585, + "grad_norm": 0.4774556175104014, + "learning_rate": 9.97574242082812e-05, + "loss": 3.3625, + "step": 8263 + }, + { + "epoch": 0.38475684987312897, + "grad_norm": 0.42136244045565174, + "learning_rate": 9.975715763778511e-05, + "loss": 3.5253, + "step": 8264 + }, + { + "epoch": 0.38480340805922203, + "grad_norm": 0.49062568462013384, + "learning_rate": 9.975689092125663e-05, + "loss": 3.4454, + "step": 8265 + }, + { + "epoch": 0.3848499662453151, + "grad_norm": 0.44615442077685835, + "learning_rate": 9.975662405869662e-05, + "loss": 3.3552, + "step": 8266 + }, + { + "epoch": 0.38489652443140815, + "grad_norm": 0.4368705502095842, + "learning_rate": 
9.97563570501058e-05, + "loss": 3.3131, + "step": 8267 + }, + { + "epoch": 0.3849430826175012, + "grad_norm": 0.44303859837436643, + "learning_rate": 9.975608989548498e-05, + "loss": 3.2655, + "step": 8268 + }, + { + "epoch": 0.38498964080359427, + "grad_norm": 0.43595175091709415, + "learning_rate": 9.975582259483491e-05, + "loss": 3.2144, + "step": 8269 + }, + { + "epoch": 0.3850361989896874, + "grad_norm": 0.44358611778907175, + "learning_rate": 9.975555514815642e-05, + "loss": 3.4427, + "step": 8270 + }, + { + "epoch": 0.38508275717578044, + "grad_norm": 0.5385909141362053, + "learning_rate": 9.97552875554503e-05, + "loss": 3.4614, + "step": 8271 + }, + { + "epoch": 0.3851293153618735, + "grad_norm": 0.5209734875544519, + "learning_rate": 9.975501981671729e-05, + "loss": 3.3551, + "step": 8272 + }, + { + "epoch": 0.38517587354796656, + "grad_norm": 0.5071337323090405, + "learning_rate": 9.975475193195821e-05, + "loss": 3.392, + "step": 8273 + }, + { + "epoch": 0.3852224317340596, + "grad_norm": 0.4908029953684769, + "learning_rate": 9.975448390117382e-05, + "loss": 3.3603, + "step": 8274 + }, + { + "epoch": 0.38526898992015274, + "grad_norm": 0.47491360372002356, + "learning_rate": 9.975421572436492e-05, + "loss": 3.318, + "step": 8275 + }, + { + "epoch": 0.3853155481062458, + "grad_norm": 0.44589244398702504, + "learning_rate": 9.975394740153231e-05, + "loss": 3.2672, + "step": 8276 + }, + { + "epoch": 0.38536210629233886, + "grad_norm": 0.4915842045734582, + "learning_rate": 9.975367893267677e-05, + "loss": 3.3412, + "step": 8277 + }, + { + "epoch": 0.3854086644784319, + "grad_norm": 0.48269557006635094, + "learning_rate": 9.975341031779908e-05, + "loss": 3.3998, + "step": 8278 + }, + { + "epoch": 0.385455222664525, + "grad_norm": 0.4918875031580278, + "learning_rate": 9.975314155690002e-05, + "loss": 3.2743, + "step": 8279 + }, + { + "epoch": 0.38550178085061804, + "grad_norm": 0.4455731292087654, + "learning_rate": 9.97528726499804e-05, + "loss": 3.4204, + "step": 8280 + }, + { + "epoch": 0.38554833903671115, + "grad_norm": 0.45565954526332186, + "learning_rate": 9.975260359704099e-05, + "loss": 3.3505, + "step": 8281 + }, + { + "epoch": 0.3855948972228042, + "grad_norm": 0.4479115823561694, + "learning_rate": 9.97523343980826e-05, + "loss": 3.3523, + "step": 8282 + }, + { + "epoch": 0.38564145540889727, + "grad_norm": 0.4579455706165083, + "learning_rate": 9.975206505310601e-05, + "loss": 3.3673, + "step": 8283 + }, + { + "epoch": 0.38568801359499033, + "grad_norm": 0.4024034338283996, + "learning_rate": 9.975179556211203e-05, + "loss": 3.374, + "step": 8284 + }, + { + "epoch": 0.3857345717810834, + "grad_norm": 0.4574880941397124, + "learning_rate": 9.97515259251014e-05, + "loss": 3.3312, + "step": 8285 + }, + { + "epoch": 0.3857811299671765, + "grad_norm": 0.4184969291589231, + "learning_rate": 9.975125614207497e-05, + "loss": 3.3438, + "step": 8286 + }, + { + "epoch": 0.38582768815326957, + "grad_norm": 0.4175582443103675, + "learning_rate": 9.975098621303346e-05, + "loss": 3.3446, + "step": 8287 + }, + { + "epoch": 0.3858742463393626, + "grad_norm": 0.45704706312439597, + "learning_rate": 9.975071613797775e-05, + "loss": 3.2751, + "step": 8288 + }, + { + "epoch": 0.3859208045254557, + "grad_norm": 0.47011747395630127, + "learning_rate": 9.975044591690858e-05, + "loss": 3.2913, + "step": 8289 + }, + { + "epoch": 0.38596736271154874, + "grad_norm": 0.4425333390111278, + "learning_rate": 9.975017554982675e-05, + "loss": 3.3468, + "step": 8290 + }, + { + "epoch": 
0.3860139208976418, + "grad_norm": 0.44392000057999265, + "learning_rate": 9.974990503673305e-05, + "loss": 3.4343, + "step": 8291 + }, + { + "epoch": 0.3860604790837349, + "grad_norm": 0.4645545467349502, + "learning_rate": 9.974963437762829e-05, + "loss": 3.3831, + "step": 8292 + }, + { + "epoch": 0.386107037269828, + "grad_norm": 0.4848313348278381, + "learning_rate": 9.974936357251324e-05, + "loss": 3.3027, + "step": 8293 + }, + { + "epoch": 0.38615359545592104, + "grad_norm": 0.45222665453495736, + "learning_rate": 9.974909262138872e-05, + "loss": 3.336, + "step": 8294 + }, + { + "epoch": 0.3862001536420141, + "grad_norm": 0.5187620365013604, + "learning_rate": 9.97488215242555e-05, + "loss": 3.3318, + "step": 8295 + }, + { + "epoch": 0.38624671182810716, + "grad_norm": 0.44230242188117663, + "learning_rate": 9.974855028111441e-05, + "loss": 3.3781, + "step": 8296 + }, + { + "epoch": 0.3862932700142003, + "grad_norm": 0.49285665152888947, + "learning_rate": 9.974827889196622e-05, + "loss": 3.3423, + "step": 8297 + }, + { + "epoch": 0.38633982820029333, + "grad_norm": 0.4789819789832454, + "learning_rate": 9.974800735681173e-05, + "loss": 3.3315, + "step": 8298 + }, + { + "epoch": 0.3863863863863864, + "grad_norm": 0.5095682324212938, + "learning_rate": 9.974773567565174e-05, + "loss": 3.3415, + "step": 8299 + }, + { + "epoch": 0.38643294457247945, + "grad_norm": 0.5172739961943498, + "learning_rate": 9.974746384848705e-05, + "loss": 3.3654, + "step": 8300 + }, + { + "epoch": 0.3864795027585725, + "grad_norm": 0.5002892841493732, + "learning_rate": 9.974719187531844e-05, + "loss": 3.3685, + "step": 8301 + }, + { + "epoch": 0.3865260609446656, + "grad_norm": 0.49318265210921663, + "learning_rate": 9.974691975614674e-05, + "loss": 3.3339, + "step": 8302 + }, + { + "epoch": 0.3865726191307587, + "grad_norm": 0.4486045543023789, + "learning_rate": 9.974664749097273e-05, + "loss": 3.3323, + "step": 8303 + }, + { + "epoch": 0.38661917731685175, + "grad_norm": 0.4890516729385486, + "learning_rate": 9.974637507979721e-05, + "loss": 3.3252, + "step": 8304 + }, + { + "epoch": 0.3866657355029448, + "grad_norm": 0.4544443810935982, + "learning_rate": 9.974610252262097e-05, + "loss": 3.2927, + "step": 8305 + }, + { + "epoch": 0.38671229368903787, + "grad_norm": 0.464090530817193, + "learning_rate": 9.974582981944482e-05, + "loss": 3.3222, + "step": 8306 + }, + { + "epoch": 0.3867588518751309, + "grad_norm": 0.4451402113521435, + "learning_rate": 9.974555697026958e-05, + "loss": 3.403, + "step": 8307 + }, + { + "epoch": 0.38680541006122404, + "grad_norm": 0.468575714135573, + "learning_rate": 9.9745283975096e-05, + "loss": 3.3536, + "step": 8308 + }, + { + "epoch": 0.3868519682473171, + "grad_norm": 0.44760105938085176, + "learning_rate": 9.974501083392493e-05, + "loss": 3.4584, + "step": 8309 + }, + { + "epoch": 0.38689852643341016, + "grad_norm": 0.4802410448362094, + "learning_rate": 9.974473754675715e-05, + "loss": 3.3736, + "step": 8310 + }, + { + "epoch": 0.3869450846195032, + "grad_norm": 0.44489547003097196, + "learning_rate": 9.974446411359348e-05, + "loss": 3.3884, + "step": 8311 + }, + { + "epoch": 0.3869916428055963, + "grad_norm": 0.4866394681346502, + "learning_rate": 9.974419053443469e-05, + "loss": 3.4898, + "step": 8312 + }, + { + "epoch": 0.38703820099168934, + "grad_norm": 0.4597750139743263, + "learning_rate": 9.97439168092816e-05, + "loss": 3.3475, + "step": 8313 + }, + { + "epoch": 0.38708475917778246, + "grad_norm": 0.46675700628845634, + "learning_rate": 
9.974364293813503e-05, + "loss": 3.2921, + "step": 8314 + }, + { + "epoch": 0.3871313173638755, + "grad_norm": 0.40493810521192425, + "learning_rate": 9.974336892099576e-05, + "loss": 3.4303, + "step": 8315 + }, + { + "epoch": 0.3871778755499686, + "grad_norm": 0.47004208760519695, + "learning_rate": 9.974309475786461e-05, + "loss": 3.4133, + "step": 8316 + }, + { + "epoch": 0.38722443373606164, + "grad_norm": 0.508154483044765, + "learning_rate": 9.974282044874237e-05, + "loss": 3.4121, + "step": 8317 + }, + { + "epoch": 0.3872709919221547, + "grad_norm": 0.45526845009751654, + "learning_rate": 9.974254599362986e-05, + "loss": 3.3455, + "step": 8318 + }, + { + "epoch": 0.3873175501082478, + "grad_norm": 0.41994964982497063, + "learning_rate": 9.974227139252786e-05, + "loss": 3.3755, + "step": 8319 + }, + { + "epoch": 0.38736410829434087, + "grad_norm": 0.3954987127970616, + "learning_rate": 9.974199664543722e-05, + "loss": 3.3084, + "step": 8320 + }, + { + "epoch": 0.38741066648043393, + "grad_norm": 0.42234553843424943, + "learning_rate": 9.974172175235869e-05, + "loss": 3.3643, + "step": 8321 + }, + { + "epoch": 0.387457224666527, + "grad_norm": 0.42206426505937733, + "learning_rate": 9.974144671329313e-05, + "loss": 3.1668, + "step": 8322 + }, + { + "epoch": 0.38750378285262005, + "grad_norm": 0.4210354460625096, + "learning_rate": 9.974117152824132e-05, + "loss": 3.3135, + "step": 8323 + }, + { + "epoch": 0.3875503410387131, + "grad_norm": 0.39556982063131707, + "learning_rate": 9.974089619720407e-05, + "loss": 3.3695, + "step": 8324 + }, + { + "epoch": 0.3875968992248062, + "grad_norm": 0.44169377901473844, + "learning_rate": 9.97406207201822e-05, + "loss": 3.4256, + "step": 8325 + }, + { + "epoch": 0.3876434574108993, + "grad_norm": 0.39519795804044083, + "learning_rate": 9.974034509717649e-05, + "loss": 3.2719, + "step": 8326 + }, + { + "epoch": 0.38769001559699234, + "grad_norm": 0.41687965094780705, + "learning_rate": 9.974006932818778e-05, + "loss": 3.4287, + "step": 8327 + }, + { + "epoch": 0.3877365737830854, + "grad_norm": 0.43164882513799624, + "learning_rate": 9.973979341321686e-05, + "loss": 3.387, + "step": 8328 + }, + { + "epoch": 0.38778313196917846, + "grad_norm": 0.40553637386052005, + "learning_rate": 9.973951735226455e-05, + "loss": 3.176, + "step": 8329 + }, + { + "epoch": 0.3878296901552716, + "grad_norm": 0.4969761573879587, + "learning_rate": 9.973924114533164e-05, + "loss": 3.3016, + "step": 8330 + }, + { + "epoch": 0.38787624834136464, + "grad_norm": 0.43585092980466317, + "learning_rate": 9.973896479241897e-05, + "loss": 3.4534, + "step": 8331 + }, + { + "epoch": 0.3879228065274577, + "grad_norm": 0.44255509805260146, + "learning_rate": 9.973868829352734e-05, + "loss": 3.497, + "step": 8332 + }, + { + "epoch": 0.38796936471355076, + "grad_norm": 0.47467650449779314, + "learning_rate": 9.973841164865757e-05, + "loss": 3.4555, + "step": 8333 + }, + { + "epoch": 0.3880159228996438, + "grad_norm": 0.46820725645812034, + "learning_rate": 9.973813485781045e-05, + "loss": 3.3621, + "step": 8334 + }, + { + "epoch": 0.3880624810857369, + "grad_norm": 0.3807708715223199, + "learning_rate": 9.973785792098681e-05, + "loss": 3.2879, + "step": 8335 + }, + { + "epoch": 0.38810903927183, + "grad_norm": 0.4668484632931044, + "learning_rate": 9.973758083818745e-05, + "loss": 3.3699, + "step": 8336 + }, + { + "epoch": 0.38815559745792305, + "grad_norm": 0.488867715087149, + "learning_rate": 9.97373036094132e-05, + "loss": 3.2945, + "step": 8337 + }, + { + "epoch": 
0.3882021556440161, + "grad_norm": 0.4915112989473049, + "learning_rate": 9.973702623466486e-05, + "loss": 3.4472, + "step": 8338 + }, + { + "epoch": 0.38824871383010917, + "grad_norm": 0.46315246828285195, + "learning_rate": 9.973674871394326e-05, + "loss": 3.2117, + "step": 8339 + }, + { + "epoch": 0.38829527201620223, + "grad_norm": 0.4719225261130536, + "learning_rate": 9.973647104724918e-05, + "loss": 3.3734, + "step": 8340 + }, + { + "epoch": 0.38834183020229535, + "grad_norm": 0.4905603895011212, + "learning_rate": 9.973619323458345e-05, + "loss": 3.3558, + "step": 8341 + }, + { + "epoch": 0.3883883883883884, + "grad_norm": 0.4808174175475862, + "learning_rate": 9.973591527594692e-05, + "loss": 3.3828, + "step": 8342 + }, + { + "epoch": 0.38843494657448147, + "grad_norm": 0.4580346003260982, + "learning_rate": 9.973563717134035e-05, + "loss": 3.3468, + "step": 8343 + }, + { + "epoch": 0.3884815047605745, + "grad_norm": 0.4812814729915031, + "learning_rate": 9.973535892076462e-05, + "loss": 3.3052, + "step": 8344 + }, + { + "epoch": 0.3885280629466676, + "grad_norm": 0.6030239832182441, + "learning_rate": 9.973508052422049e-05, + "loss": 3.419, + "step": 8345 + }, + { + "epoch": 0.38857462113276064, + "grad_norm": 0.590909063579097, + "learning_rate": 9.97348019817088e-05, + "loss": 3.3904, + "step": 8346 + }, + { + "epoch": 0.38862117931885376, + "grad_norm": 0.4161392184938416, + "learning_rate": 9.973452329323037e-05, + "loss": 3.3135, + "step": 8347 + }, + { + "epoch": 0.3886677375049468, + "grad_norm": 0.4560182106298237, + "learning_rate": 9.973424445878601e-05, + "loss": 3.3521, + "step": 8348 + }, + { + "epoch": 0.3887142956910399, + "grad_norm": 0.4258318509877113, + "learning_rate": 9.973396547837654e-05, + "loss": 3.3541, + "step": 8349 + }, + { + "epoch": 0.38876085387713294, + "grad_norm": 0.4209216681319469, + "learning_rate": 9.973368635200278e-05, + "loss": 3.3355, + "step": 8350 + }, + { + "epoch": 0.388807412063226, + "grad_norm": 0.4666173399894803, + "learning_rate": 9.973340707966556e-05, + "loss": 3.3606, + "step": 8351 + }, + { + "epoch": 0.3888539702493191, + "grad_norm": 0.4920901875210294, + "learning_rate": 9.973312766136569e-05, + "loss": 3.4142, + "step": 8352 + }, + { + "epoch": 0.3889005284354122, + "grad_norm": 0.35771448129563427, + "learning_rate": 9.973284809710398e-05, + "loss": 3.3314, + "step": 8353 + }, + { + "epoch": 0.38894708662150523, + "grad_norm": 0.46313698623694766, + "learning_rate": 9.973256838688126e-05, + "loss": 3.2751, + "step": 8354 + }, + { + "epoch": 0.3889936448075983, + "grad_norm": 0.4550824905618591, + "learning_rate": 9.973228853069836e-05, + "loss": 3.4909, + "step": 8355 + }, + { + "epoch": 0.38904020299369135, + "grad_norm": 0.3954263360966633, + "learning_rate": 9.973200852855609e-05, + "loss": 3.4271, + "step": 8356 + }, + { + "epoch": 0.3890867611797844, + "grad_norm": 0.47819727326043004, + "learning_rate": 9.973172838045527e-05, + "loss": 3.3754, + "step": 8357 + }, + { + "epoch": 0.38913331936587753, + "grad_norm": 0.4302677326260871, + "learning_rate": 9.973144808639673e-05, + "loss": 3.3661, + "step": 8358 + }, + { + "epoch": 0.3891798775519706, + "grad_norm": 0.4058186239090094, + "learning_rate": 9.973116764638129e-05, + "loss": 3.3022, + "step": 8359 + }, + { + "epoch": 0.38922643573806365, + "grad_norm": 0.44741058790956306, + "learning_rate": 9.973088706040975e-05, + "loss": 3.3203, + "step": 8360 + }, + { + "epoch": 0.3892729939241567, + "grad_norm": 0.4394676795788077, + "learning_rate": 
9.9730606328483e-05, + "loss": 3.2467, + "step": 8361 + }, + { + "epoch": 0.38931955211024977, + "grad_norm": 0.4650379616078152, + "learning_rate": 9.97303254506018e-05, + "loss": 3.3482, + "step": 8362 + }, + { + "epoch": 0.3893661102963429, + "grad_norm": 0.4838761273317508, + "learning_rate": 9.973004442676698e-05, + "loss": 3.3397, + "step": 8363 + }, + { + "epoch": 0.38941266848243594, + "grad_norm": 0.38864642153817563, + "learning_rate": 9.972976325697938e-05, + "loss": 3.3215, + "step": 8364 + }, + { + "epoch": 0.389459226668529, + "grad_norm": 0.4537789838995928, + "learning_rate": 9.972948194123984e-05, + "loss": 3.3403, + "step": 8365 + }, + { + "epoch": 0.38950578485462206, + "grad_norm": 0.48903526807618086, + "learning_rate": 9.972920047954917e-05, + "loss": 3.2744, + "step": 8366 + }, + { + "epoch": 0.3895523430407151, + "grad_norm": 0.44577760602447103, + "learning_rate": 9.972891887190819e-05, + "loss": 3.3547, + "step": 8367 + }, + { + "epoch": 0.3895989012268082, + "grad_norm": 0.42953752808859796, + "learning_rate": 9.972863711831773e-05, + "loss": 3.2961, + "step": 8368 + }, + { + "epoch": 0.3896454594129013, + "grad_norm": 0.5035230009516871, + "learning_rate": 9.972835521877862e-05, + "loss": 3.42, + "step": 8369 + }, + { + "epoch": 0.38969201759899436, + "grad_norm": 0.460214771766809, + "learning_rate": 9.972807317329168e-05, + "loss": 3.3226, + "step": 8370 + }, + { + "epoch": 0.3897385757850874, + "grad_norm": 0.49151654054517546, + "learning_rate": 9.972779098185775e-05, + "loss": 3.4303, + "step": 8371 + }, + { + "epoch": 0.3897851339711805, + "grad_norm": 0.48305104535529586, + "learning_rate": 9.972750864447765e-05, + "loss": 3.3517, + "step": 8372 + }, + { + "epoch": 0.38983169215727353, + "grad_norm": 0.466385786565923, + "learning_rate": 9.972722616115223e-05, + "loss": 3.3923, + "step": 8373 + }, + { + "epoch": 0.3898782503433666, + "grad_norm": 0.4854015599962929, + "learning_rate": 9.972694353188228e-05, + "loss": 3.2907, + "step": 8374 + }, + { + "epoch": 0.3899248085294597, + "grad_norm": 0.49112238217897464, + "learning_rate": 9.972666075666865e-05, + "loss": 3.331, + "step": 8375 + }, + { + "epoch": 0.38997136671555277, + "grad_norm": 0.44828227684573074, + "learning_rate": 9.972637783551217e-05, + "loss": 3.3671, + "step": 8376 + }, + { + "epoch": 0.39001792490164583, + "grad_norm": 0.465165334129465, + "learning_rate": 9.972609476841367e-05, + "loss": 3.2877, + "step": 8377 + }, + { + "epoch": 0.3900644830877389, + "grad_norm": 0.5146322151057352, + "learning_rate": 9.9725811555374e-05, + "loss": 3.4205, + "step": 8378 + }, + { + "epoch": 0.39011104127383195, + "grad_norm": 0.5096028839069912, + "learning_rate": 9.972552819639394e-05, + "loss": 3.3835, + "step": 8379 + }, + { + "epoch": 0.39015759945992506, + "grad_norm": 0.4011318384287408, + "learning_rate": 9.972524469147437e-05, + "loss": 3.3567, + "step": 8380 + }, + { + "epoch": 0.3902041576460181, + "grad_norm": 0.5051375232093317, + "learning_rate": 9.97249610406161e-05, + "loss": 3.2888, + "step": 8381 + }, + { + "epoch": 0.3902507158321112, + "grad_norm": 0.45178971738451157, + "learning_rate": 9.972467724381998e-05, + "loss": 3.3587, + "step": 8382 + }, + { + "epoch": 0.39029727401820424, + "grad_norm": 0.4389265954657364, + "learning_rate": 9.972439330108683e-05, + "loss": 3.4734, + "step": 8383 + }, + { + "epoch": 0.3903438322042973, + "grad_norm": 0.4612007886550887, + "learning_rate": 9.972410921241747e-05, + "loss": 3.4297, + "step": 8384 + }, + { + "epoch": 0.39039039039039036, + 
"grad_norm": 0.5174674705893385, + "learning_rate": 9.972382497781277e-05, + "loss": 3.3255, + "step": 8385 + }, + { + "epoch": 0.3904369485764835, + "grad_norm": 0.4669868891741716, + "learning_rate": 9.972354059727351e-05, + "loss": 3.3932, + "step": 8386 + }, + { + "epoch": 0.39048350676257654, + "grad_norm": 0.4275219362354381, + "learning_rate": 9.972325607080058e-05, + "loss": 3.3184, + "step": 8387 + }, + { + "epoch": 0.3905300649486696, + "grad_norm": 0.48487610245334517, + "learning_rate": 9.972297139839479e-05, + "loss": 3.3742, + "step": 8388 + }, + { + "epoch": 0.39057662313476266, + "grad_norm": 0.49807966515636554, + "learning_rate": 9.972268658005697e-05, + "loss": 3.3738, + "step": 8389 + }, + { + "epoch": 0.3906231813208557, + "grad_norm": 0.40130170358126455, + "learning_rate": 9.972240161578798e-05, + "loss": 3.4835, + "step": 8390 + }, + { + "epoch": 0.39066973950694883, + "grad_norm": 0.4893407217878895, + "learning_rate": 9.972211650558863e-05, + "loss": 3.3548, + "step": 8391 + }, + { + "epoch": 0.3907162976930419, + "grad_norm": 0.5244680985217114, + "learning_rate": 9.972183124945976e-05, + "loss": 3.2774, + "step": 8392 + }, + { + "epoch": 0.39076285587913495, + "grad_norm": 0.4280000790072301, + "learning_rate": 9.97215458474022e-05, + "loss": 3.3801, + "step": 8393 + }, + { + "epoch": 0.390809414065228, + "grad_norm": 0.4157480651505337, + "learning_rate": 9.972126029941684e-05, + "loss": 3.4744, + "step": 8394 + }, + { + "epoch": 0.39085597225132107, + "grad_norm": 0.4816190367314356, + "learning_rate": 9.972097460550446e-05, + "loss": 3.3527, + "step": 8395 + }, + { + "epoch": 0.39090253043741413, + "grad_norm": 0.4432248609346595, + "learning_rate": 9.972068876566592e-05, + "loss": 3.3656, + "step": 8396 + }, + { + "epoch": 0.39094908862350725, + "grad_norm": 0.4200043442782707, + "learning_rate": 9.972040277990204e-05, + "loss": 3.3709, + "step": 8397 + }, + { + "epoch": 0.3909956468096003, + "grad_norm": 0.4508810929307033, + "learning_rate": 9.97201166482137e-05, + "loss": 3.3795, + "step": 8398 + }, + { + "epoch": 0.39104220499569337, + "grad_norm": 0.44291401331664454, + "learning_rate": 9.97198303706017e-05, + "loss": 3.2371, + "step": 8399 + }, + { + "epoch": 0.3910887631817864, + "grad_norm": 0.47913591546996426, + "learning_rate": 9.971954394706691e-05, + "loss": 3.3868, + "step": 8400 + }, + { + "epoch": 0.3911353213678795, + "grad_norm": 0.49995174865911923, + "learning_rate": 9.971925737761015e-05, + "loss": 3.4191, + "step": 8401 + }, + { + "epoch": 0.3911818795539726, + "grad_norm": 0.5068763229016623, + "learning_rate": 9.971897066223226e-05, + "loss": 3.3304, + "step": 8402 + }, + { + "epoch": 0.39122843774006566, + "grad_norm": 0.4186160473727785, + "learning_rate": 9.97186838009341e-05, + "loss": 3.3709, + "step": 8403 + }, + { + "epoch": 0.3912749959261587, + "grad_norm": 0.43806887891179264, + "learning_rate": 9.971839679371648e-05, + "loss": 3.269, + "step": 8404 + }, + { + "epoch": 0.3913215541122518, + "grad_norm": 0.4739723481976532, + "learning_rate": 9.971810964058029e-05, + "loss": 3.3447, + "step": 8405 + }, + { + "epoch": 0.39136811229834484, + "grad_norm": 0.4597941986519748, + "learning_rate": 9.971782234152633e-05, + "loss": 3.3438, + "step": 8406 + }, + { + "epoch": 0.3914146704844379, + "grad_norm": 0.39581638939434505, + "learning_rate": 9.971753489655548e-05, + "loss": 3.3235, + "step": 8407 + }, + { + "epoch": 0.391461228670531, + "grad_norm": 0.485198448975855, + "learning_rate": 9.971724730566853e-05, + "loss": 3.2792, + 
"step": 8408 + }, + { + "epoch": 0.3915077868566241, + "grad_norm": 0.4685958594805213, + "learning_rate": 9.97169595688664e-05, + "loss": 3.3809, + "step": 8409 + }, + { + "epoch": 0.39155434504271713, + "grad_norm": 0.44341149911241157, + "learning_rate": 9.971667168614986e-05, + "loss": 3.371, + "step": 8410 + }, + { + "epoch": 0.3916009032288102, + "grad_norm": 0.4344690891278048, + "learning_rate": 9.971638365751978e-05, + "loss": 3.2544, + "step": 8411 + }, + { + "epoch": 0.39164746141490325, + "grad_norm": 0.40343389407973057, + "learning_rate": 9.971609548297704e-05, + "loss": 3.2264, + "step": 8412 + }, + { + "epoch": 0.39169401960099637, + "grad_norm": 0.43068947782342104, + "learning_rate": 9.971580716252245e-05, + "loss": 3.3938, + "step": 8413 + }, + { + "epoch": 0.39174057778708943, + "grad_norm": 0.3926140417187613, + "learning_rate": 9.971551869615686e-05, + "loss": 3.3315, + "step": 8414 + }, + { + "epoch": 0.3917871359731825, + "grad_norm": 0.3917542833164027, + "learning_rate": 9.971523008388111e-05, + "loss": 3.3325, + "step": 8415 + }, + { + "epoch": 0.39183369415927555, + "grad_norm": 0.384325534265658, + "learning_rate": 9.971494132569606e-05, + "loss": 3.3739, + "step": 8416 + }, + { + "epoch": 0.3918802523453686, + "grad_norm": 0.3505112441640412, + "learning_rate": 9.971465242160256e-05, + "loss": 3.2843, + "step": 8417 + }, + { + "epoch": 0.39192681053146167, + "grad_norm": 0.3693480693606663, + "learning_rate": 9.971436337160145e-05, + "loss": 3.3882, + "step": 8418 + }, + { + "epoch": 0.3919733687175548, + "grad_norm": 0.4026201380945478, + "learning_rate": 9.971407417569358e-05, + "loss": 3.3548, + "step": 8419 + }, + { + "epoch": 0.39201992690364784, + "grad_norm": 0.34130033907551344, + "learning_rate": 9.971378483387981e-05, + "loss": 3.204, + "step": 8420 + }, + { + "epoch": 0.3920664850897409, + "grad_norm": 0.41578678835119876, + "learning_rate": 9.971349534616098e-05, + "loss": 3.5203, + "step": 8421 + }, + { + "epoch": 0.39211304327583396, + "grad_norm": 0.37378444108024245, + "learning_rate": 9.971320571253793e-05, + "loss": 3.3282, + "step": 8422 + }, + { + "epoch": 0.392159601461927, + "grad_norm": 0.3794393805482, + "learning_rate": 9.971291593301153e-05, + "loss": 3.4507, + "step": 8423 + }, + { + "epoch": 0.39220615964802014, + "grad_norm": 0.42974434106230863, + "learning_rate": 9.97126260075826e-05, + "loss": 3.2966, + "step": 8424 + }, + { + "epoch": 0.3922527178341132, + "grad_norm": 0.4428136956696409, + "learning_rate": 9.971233593625203e-05, + "loss": 3.3859, + "step": 8425 + }, + { + "epoch": 0.39229927602020626, + "grad_norm": 0.4131895694110276, + "learning_rate": 9.971204571902065e-05, + "loss": 3.4188, + "step": 8426 + }, + { + "epoch": 0.3923458342062993, + "grad_norm": 0.41377038869980687, + "learning_rate": 9.97117553558893e-05, + "loss": 3.3184, + "step": 8427 + }, + { + "epoch": 0.3923923923923924, + "grad_norm": 0.43086158179166656, + "learning_rate": 9.971146484685885e-05, + "loss": 3.3448, + "step": 8428 + }, + { + "epoch": 0.39243895057848543, + "grad_norm": 0.46004060882269887, + "learning_rate": 9.971117419193015e-05, + "loss": 3.3319, + "step": 8429 + }, + { + "epoch": 0.39248550876457855, + "grad_norm": 0.37625215373823695, + "learning_rate": 9.971088339110403e-05, + "loss": 3.2712, + "step": 8430 + }, + { + "epoch": 0.3925320669506716, + "grad_norm": 0.41234528420045563, + "learning_rate": 9.971059244438139e-05, + "loss": 3.3688, + "step": 8431 + }, + { + "epoch": 0.39257862513676467, + "grad_norm": 0.4506513390736797, + 
"learning_rate": 9.971030135176306e-05, + "loss": 3.3116, + "step": 8432 + }, + { + "epoch": 0.39262518332285773, + "grad_norm": 0.4228737029156782, + "learning_rate": 9.971001011324988e-05, + "loss": 3.5008, + "step": 8433 + }, + { + "epoch": 0.3926717415089508, + "grad_norm": 0.4352679510856581, + "learning_rate": 9.970971872884272e-05, + "loss": 3.3933, + "step": 8434 + }, + { + "epoch": 0.3927182996950439, + "grad_norm": 0.42066945464986205, + "learning_rate": 9.970942719854244e-05, + "loss": 3.3003, + "step": 8435 + }, + { + "epoch": 0.39276485788113696, + "grad_norm": 0.48290534952513847, + "learning_rate": 9.970913552234988e-05, + "loss": 3.3366, + "step": 8436 + }, + { + "epoch": 0.39281141606723, + "grad_norm": 0.456218823029028, + "learning_rate": 9.970884370026592e-05, + "loss": 3.3494, + "step": 8437 + }, + { + "epoch": 0.3928579742533231, + "grad_norm": 0.4085641275019762, + "learning_rate": 9.970855173229139e-05, + "loss": 3.2396, + "step": 8438 + }, + { + "epoch": 0.39290453243941614, + "grad_norm": 0.4815633412786467, + "learning_rate": 9.970825961842715e-05, + "loss": 3.334, + "step": 8439 + }, + { + "epoch": 0.3929510906255092, + "grad_norm": 0.45782885396395695, + "learning_rate": 9.970796735867407e-05, + "loss": 3.2963, + "step": 8440 + }, + { + "epoch": 0.3929976488116023, + "grad_norm": 0.4359875004419261, + "learning_rate": 9.970767495303301e-05, + "loss": 3.359, + "step": 8441 + }, + { + "epoch": 0.3930442069976954, + "grad_norm": 0.4274305471142468, + "learning_rate": 9.970738240150482e-05, + "loss": 3.2663, + "step": 8442 + }, + { + "epoch": 0.39309076518378844, + "grad_norm": 0.43453851939652594, + "learning_rate": 9.970708970409035e-05, + "loss": 3.3273, + "step": 8443 + }, + { + "epoch": 0.3931373233698815, + "grad_norm": 0.39953080530318036, + "learning_rate": 9.970679686079047e-05, + "loss": 3.3903, + "step": 8444 + }, + { + "epoch": 0.39318388155597456, + "grad_norm": 0.42060351010395086, + "learning_rate": 9.970650387160604e-05, + "loss": 3.2766, + "step": 8445 + }, + { + "epoch": 0.3932304397420677, + "grad_norm": 0.4233620613977547, + "learning_rate": 9.970621073653793e-05, + "loss": 3.4846, + "step": 8446 + }, + { + "epoch": 0.39327699792816073, + "grad_norm": 0.480691940116541, + "learning_rate": 9.970591745558699e-05, + "loss": 3.3304, + "step": 8447 + }, + { + "epoch": 0.3933235561142538, + "grad_norm": 0.4391652147285116, + "learning_rate": 9.970562402875407e-05, + "loss": 3.2884, + "step": 8448 + }, + { + "epoch": 0.39337011430034685, + "grad_norm": 0.4636104898364459, + "learning_rate": 9.970533045604002e-05, + "loss": 3.4336, + "step": 8449 + }, + { + "epoch": 0.3934166724864399, + "grad_norm": 0.4227124414391733, + "learning_rate": 9.970503673744576e-05, + "loss": 3.1925, + "step": 8450 + }, + { + "epoch": 0.39346323067253297, + "grad_norm": 0.4210909633674569, + "learning_rate": 9.970474287297209e-05, + "loss": 3.2237, + "step": 8451 + }, + { + "epoch": 0.3935097888586261, + "grad_norm": 0.4832923178581319, + "learning_rate": 9.970444886261989e-05, + "loss": 3.2793, + "step": 8452 + }, + { + "epoch": 0.39355634704471915, + "grad_norm": 0.4419278461694883, + "learning_rate": 9.970415470639005e-05, + "loss": 3.3407, + "step": 8453 + }, + { + "epoch": 0.3936029052308122, + "grad_norm": 0.4335893151636564, + "learning_rate": 9.97038604042834e-05, + "loss": 3.4269, + "step": 8454 + }, + { + "epoch": 0.39364946341690527, + "grad_norm": 0.4355978334500499, + "learning_rate": 9.970356595630082e-05, + "loss": 3.1634, + "step": 8455 + }, + { + "epoch": 
0.3936960216029983, + "grad_norm": 0.4232560110649132, + "learning_rate": 9.970327136244317e-05, + "loss": 3.3474, + "step": 8456 + }, + { + "epoch": 0.39374257978909144, + "grad_norm": 0.5420817509928517, + "learning_rate": 9.97029766227113e-05, + "loss": 3.3253, + "step": 8457 + }, + { + "epoch": 0.3937891379751845, + "grad_norm": 0.4505138219952777, + "learning_rate": 9.970268173710611e-05, + "loss": 3.2772, + "step": 8458 + }, + { + "epoch": 0.39383569616127756, + "grad_norm": 0.5123209954724744, + "learning_rate": 9.970238670562844e-05, + "loss": 3.2369, + "step": 8459 + }, + { + "epoch": 0.3938822543473706, + "grad_norm": 0.5080759103536517, + "learning_rate": 9.970209152827918e-05, + "loss": 3.298, + "step": 8460 + }, + { + "epoch": 0.3939288125334637, + "grad_norm": 0.43447277317609956, + "learning_rate": 9.970179620505915e-05, + "loss": 3.3676, + "step": 8461 + }, + { + "epoch": 0.39397537071955674, + "grad_norm": 0.4713473152958697, + "learning_rate": 9.970150073596925e-05, + "loss": 3.2178, + "step": 8462 + }, + { + "epoch": 0.39402192890564985, + "grad_norm": 0.45332650776396044, + "learning_rate": 9.970120512101036e-05, + "loss": 3.2558, + "step": 8463 + }, + { + "epoch": 0.3940684870917429, + "grad_norm": 0.4431913688786282, + "learning_rate": 9.97009093601833e-05, + "loss": 3.3783, + "step": 8464 + }, + { + "epoch": 0.394115045277836, + "grad_norm": 0.4819614223319294, + "learning_rate": 9.970061345348899e-05, + "loss": 3.3645, + "step": 8465 + }, + { + "epoch": 0.39416160346392903, + "grad_norm": 0.4306499036131012, + "learning_rate": 9.970031740092826e-05, + "loss": 3.5239, + "step": 8466 + }, + { + "epoch": 0.3942081616500221, + "grad_norm": 0.5070004337605586, + "learning_rate": 9.970002120250202e-05, + "loss": 3.3545, + "step": 8467 + }, + { + "epoch": 0.3942547198361152, + "grad_norm": 0.4598720326636599, + "learning_rate": 9.969972485821109e-05, + "loss": 3.2892, + "step": 8468 + }, + { + "epoch": 0.39430127802220827, + "grad_norm": 0.4747170425730174, + "learning_rate": 9.969942836805637e-05, + "loss": 3.3614, + "step": 8469 + }, + { + "epoch": 0.39434783620830133, + "grad_norm": 0.4802909014929153, + "learning_rate": 9.969913173203872e-05, + "loss": 3.2996, + "step": 8470 + }, + { + "epoch": 0.3943943943943944, + "grad_norm": 0.47474669592859325, + "learning_rate": 9.969883495015904e-05, + "loss": 3.429, + "step": 8471 + }, + { + "epoch": 0.39444095258048745, + "grad_norm": 0.4569996629125054, + "learning_rate": 9.969853802241814e-05, + "loss": 3.2974, + "step": 8472 + }, + { + "epoch": 0.3944875107665805, + "grad_norm": 0.4415658551886969, + "learning_rate": 9.969824094881695e-05, + "loss": 3.3163, + "step": 8473 + }, + { + "epoch": 0.3945340689526736, + "grad_norm": 0.47827508164942745, + "learning_rate": 9.96979437293563e-05, + "loss": 3.3355, + "step": 8474 + }, + { + "epoch": 0.3945806271387667, + "grad_norm": 0.4419593976179282, + "learning_rate": 9.96976463640371e-05, + "loss": 3.3442, + "step": 8475 + }, + { + "epoch": 0.39462718532485974, + "grad_norm": 0.49489763365187966, + "learning_rate": 9.969734885286019e-05, + "loss": 3.3202, + "step": 8476 + }, + { + "epoch": 0.3946737435109528, + "grad_norm": 0.438349597552627, + "learning_rate": 9.969705119582647e-05, + "loss": 3.4088, + "step": 8477 + }, + { + "epoch": 0.39472030169704586, + "grad_norm": 0.4806600901040097, + "learning_rate": 9.96967533929368e-05, + "loss": 3.3752, + "step": 8478 + }, + { + "epoch": 0.394766859883139, + "grad_norm": 0.4339241219560377, + "learning_rate": 9.969645544419203e-05, + 
"loss": 3.3558, + "step": 8479 + }, + { + "epoch": 0.39481341806923204, + "grad_norm": 0.48992440512667274, + "learning_rate": 9.96961573495931e-05, + "loss": 3.3701, + "step": 8480 + }, + { + "epoch": 0.3948599762553251, + "grad_norm": 0.43141328474457313, + "learning_rate": 9.969585910914082e-05, + "loss": 3.2684, + "step": 8481 + }, + { + "epoch": 0.39490653444141816, + "grad_norm": 0.4430182670869069, + "learning_rate": 9.96955607228361e-05, + "loss": 3.2537, + "step": 8482 + }, + { + "epoch": 0.3949530926275112, + "grad_norm": 0.4053425084603323, + "learning_rate": 9.969526219067978e-05, + "loss": 3.3216, + "step": 8483 + }, + { + "epoch": 0.3949996508136043, + "grad_norm": 0.42125764948571104, + "learning_rate": 9.969496351267278e-05, + "loss": 3.3531, + "step": 8484 + }, + { + "epoch": 0.3950462089996974, + "grad_norm": 0.421020481360844, + "learning_rate": 9.969466468881596e-05, + "loss": 3.394, + "step": 8485 + }, + { + "epoch": 0.39509276718579045, + "grad_norm": 0.42178945271425, + "learning_rate": 9.96943657191102e-05, + "loss": 3.4504, + "step": 8486 + }, + { + "epoch": 0.3951393253718835, + "grad_norm": 0.39558159700845485, + "learning_rate": 9.969406660355635e-05, + "loss": 3.466, + "step": 8487 + }, + { + "epoch": 0.39518588355797657, + "grad_norm": 0.398249821427323, + "learning_rate": 9.969376734215533e-05, + "loss": 3.3311, + "step": 8488 + }, + { + "epoch": 0.39523244174406963, + "grad_norm": 0.48794777932646755, + "learning_rate": 9.9693467934908e-05, + "loss": 3.4343, + "step": 8489 + }, + { + "epoch": 0.39527899993016274, + "grad_norm": 0.4413837960630916, + "learning_rate": 9.969316838181521e-05, + "loss": 3.2712, + "step": 8490 + }, + { + "epoch": 0.3953255581162558, + "grad_norm": 0.38829468282705637, + "learning_rate": 9.969286868287789e-05, + "loss": 3.44, + "step": 8491 + }, + { + "epoch": 0.39537211630234886, + "grad_norm": 0.48952457294999496, + "learning_rate": 9.96925688380969e-05, + "loss": 3.218, + "step": 8492 + }, + { + "epoch": 0.3954186744884419, + "grad_norm": 0.44210415760678845, + "learning_rate": 9.96922688474731e-05, + "loss": 3.368, + "step": 8493 + }, + { + "epoch": 0.395465232674535, + "grad_norm": 0.4440786508025994, + "learning_rate": 9.96919687110074e-05, + "loss": 3.4298, + "step": 8494 + }, + { + "epoch": 0.39551179086062804, + "grad_norm": 0.4435299148477909, + "learning_rate": 9.969166842870066e-05, + "loss": 3.3566, + "step": 8495 + }, + { + "epoch": 0.39555834904672116, + "grad_norm": 0.4282176587234169, + "learning_rate": 9.969136800055377e-05, + "loss": 3.4164, + "step": 8496 + }, + { + "epoch": 0.3956049072328142, + "grad_norm": 0.46316433205654384, + "learning_rate": 9.96910674265676e-05, + "loss": 3.267, + "step": 8497 + }, + { + "epoch": 0.3956514654189073, + "grad_norm": 0.44857255779315963, + "learning_rate": 9.969076670674307e-05, + "loss": 3.3718, + "step": 8498 + }, + { + "epoch": 0.39569802360500034, + "grad_norm": 0.4247096027041811, + "learning_rate": 9.969046584108102e-05, + "loss": 3.4309, + "step": 8499 + }, + { + "epoch": 0.3957445817910934, + "grad_norm": 0.5004278067085874, + "learning_rate": 9.969016482958233e-05, + "loss": 3.2813, + "step": 8500 + }, + { + "epoch": 0.3957911399771865, + "grad_norm": 0.5761601042655906, + "learning_rate": 9.968986367224792e-05, + "loss": 3.4309, + "step": 8501 + }, + { + "epoch": 0.39583769816327957, + "grad_norm": 0.5040845132427259, + "learning_rate": 9.968956236907866e-05, + "loss": 3.2877, + "step": 8502 + }, + { + "epoch": 0.39588425634937263, + "grad_norm": 
0.45030601226196504, + "learning_rate": 9.968926092007542e-05, + "loss": 3.3917, + "step": 8503 + }, + { + "epoch": 0.3959308145354657, + "grad_norm": 0.41574624082141076, + "learning_rate": 9.968895932523909e-05, + "loss": 3.3116, + "step": 8504 + }, + { + "epoch": 0.39597737272155875, + "grad_norm": 0.4662534807036686, + "learning_rate": 9.968865758457058e-05, + "loss": 3.3289, + "step": 8505 + }, + { + "epoch": 0.3960239309076518, + "grad_norm": 0.4648624267593225, + "learning_rate": 9.968835569807074e-05, + "loss": 3.4158, + "step": 8506 + }, + { + "epoch": 0.3960704890937449, + "grad_norm": 0.4121464431271334, + "learning_rate": 9.968805366574047e-05, + "loss": 3.3198, + "step": 8507 + }, + { + "epoch": 0.396117047279838, + "grad_norm": 0.4313287921040508, + "learning_rate": 9.968775148758066e-05, + "loss": 3.4502, + "step": 8508 + }, + { + "epoch": 0.39616360546593105, + "grad_norm": 0.42877622916236874, + "learning_rate": 9.96874491635922e-05, + "loss": 3.4182, + "step": 8509 + }, + { + "epoch": 0.3962101636520241, + "grad_norm": 0.3959422008711481, + "learning_rate": 9.968714669377595e-05, + "loss": 3.3288, + "step": 8510 + }, + { + "epoch": 0.39625672183811717, + "grad_norm": 0.4912237432108079, + "learning_rate": 9.968684407813285e-05, + "loss": 3.2989, + "step": 8511 + }, + { + "epoch": 0.3963032800242103, + "grad_norm": 0.4814640270126941, + "learning_rate": 9.968654131666374e-05, + "loss": 3.3356, + "step": 8512 + }, + { + "epoch": 0.39634983821030334, + "grad_norm": 0.46235032971314177, + "learning_rate": 9.968623840936952e-05, + "loss": 3.3769, + "step": 8513 + }, + { + "epoch": 0.3963963963963964, + "grad_norm": 0.46508904026723097, + "learning_rate": 9.96859353562511e-05, + "loss": 3.2857, + "step": 8514 + }, + { + "epoch": 0.39644295458248946, + "grad_norm": 0.4957308274952294, + "learning_rate": 9.968563215730935e-05, + "loss": 3.2919, + "step": 8515 + }, + { + "epoch": 0.3964895127685825, + "grad_norm": 0.43575684144232957, + "learning_rate": 9.968532881254516e-05, + "loss": 3.4126, + "step": 8516 + }, + { + "epoch": 0.3965360709546756, + "grad_norm": 0.515089471183749, + "learning_rate": 9.968502532195943e-05, + "loss": 3.3072, + "step": 8517 + }, + { + "epoch": 0.3965826291407687, + "grad_norm": 0.45567054195945234, + "learning_rate": 9.968472168555305e-05, + "loss": 3.3314, + "step": 8518 + }, + { + "epoch": 0.39662918732686175, + "grad_norm": 0.524188271581571, + "learning_rate": 9.96844179033269e-05, + "loss": 3.3531, + "step": 8519 + }, + { + "epoch": 0.3966757455129548, + "grad_norm": 0.4365543574093159, + "learning_rate": 9.968411397528188e-05, + "loss": 3.1708, + "step": 8520 + }, + { + "epoch": 0.3967223036990479, + "grad_norm": 0.48182210849369445, + "learning_rate": 9.968380990141888e-05, + "loss": 3.3413, + "step": 8521 + }, + { + "epoch": 0.39676886188514093, + "grad_norm": 0.4629686731900824, + "learning_rate": 9.968350568173879e-05, + "loss": 3.4021, + "step": 8522 + }, + { + "epoch": 0.39681542007123405, + "grad_norm": 0.4769520995821674, + "learning_rate": 9.968320131624251e-05, + "loss": 3.4348, + "step": 8523 + }, + { + "epoch": 0.3968619782573271, + "grad_norm": 0.45088144232439303, + "learning_rate": 9.968289680493092e-05, + "loss": 3.4179, + "step": 8524 + }, + { + "epoch": 0.39690853644342017, + "grad_norm": 0.46995892801705036, + "learning_rate": 9.968259214780494e-05, + "loss": 3.371, + "step": 8525 + }, + { + "epoch": 0.3969550946295132, + "grad_norm": 0.45472616525121645, + "learning_rate": 9.968228734486542e-05, + "loss": 3.2978, + "step": 
8526 + }, + { + "epoch": 0.3970016528156063, + "grad_norm": 0.4462856523242753, + "learning_rate": 9.968198239611331e-05, + "loss": 3.3284, + "step": 8527 + }, + { + "epoch": 0.39704821100169935, + "grad_norm": 0.40399364751595995, + "learning_rate": 9.968167730154945e-05, + "loss": 3.3498, + "step": 8528 + }, + { + "epoch": 0.39709476918779246, + "grad_norm": 0.43912979324367046, + "learning_rate": 9.968137206117478e-05, + "loss": 3.3092, + "step": 8529 + }, + { + "epoch": 0.3971413273738855, + "grad_norm": 0.41427417674004613, + "learning_rate": 9.968106667499018e-05, + "loss": 3.2987, + "step": 8530 + }, + { + "epoch": 0.3971878855599786, + "grad_norm": 0.3979421422477916, + "learning_rate": 9.968076114299653e-05, + "loss": 3.3774, + "step": 8531 + }, + { + "epoch": 0.39723444374607164, + "grad_norm": 0.3708425181308862, + "learning_rate": 9.968045546519474e-05, + "loss": 3.399, + "step": 8532 + }, + { + "epoch": 0.3972810019321647, + "grad_norm": 0.4155126534760121, + "learning_rate": 9.968014964158572e-05, + "loss": 3.3141, + "step": 8533 + }, + { + "epoch": 0.3973275601182578, + "grad_norm": 0.4205048082743075, + "learning_rate": 9.967984367217034e-05, + "loss": 3.3154, + "step": 8534 + }, + { + "epoch": 0.3973741183043509, + "grad_norm": 0.4450551945360283, + "learning_rate": 9.967953755694952e-05, + "loss": 3.2615, + "step": 8535 + }, + { + "epoch": 0.39742067649044394, + "grad_norm": 0.4214656843596643, + "learning_rate": 9.967923129592416e-05, + "loss": 3.3652, + "step": 8536 + }, + { + "epoch": 0.397467234676537, + "grad_norm": 0.40583435728811174, + "learning_rate": 9.967892488909514e-05, + "loss": 3.2538, + "step": 8537 + }, + { + "epoch": 0.39751379286263006, + "grad_norm": 0.4088552561159768, + "learning_rate": 9.967861833646337e-05, + "loss": 3.3059, + "step": 8538 + }, + { + "epoch": 0.3975603510487231, + "grad_norm": 0.43099270190132616, + "learning_rate": 9.967831163802976e-05, + "loss": 3.3322, + "step": 8539 + }, + { + "epoch": 0.39760690923481623, + "grad_norm": 0.430172170859464, + "learning_rate": 9.967800479379519e-05, + "loss": 3.268, + "step": 8540 + }, + { + "epoch": 0.3976534674209093, + "grad_norm": 0.4237540667590982, + "learning_rate": 9.967769780376058e-05, + "loss": 3.3076, + "step": 8541 + }, + { + "epoch": 0.39770002560700235, + "grad_norm": 0.461549895162867, + "learning_rate": 9.96773906679268e-05, + "loss": 3.4072, + "step": 8542 + }, + { + "epoch": 0.3977465837930954, + "grad_norm": 0.5737135225456791, + "learning_rate": 9.967708338629478e-05, + "loss": 3.3553, + "step": 8543 + }, + { + "epoch": 0.39779314197918847, + "grad_norm": 0.5334601830958686, + "learning_rate": 9.967677595886543e-05, + "loss": 3.3335, + "step": 8544 + }, + { + "epoch": 0.3978397001652816, + "grad_norm": 0.4872788023205388, + "learning_rate": 9.967646838563963e-05, + "loss": 3.3689, + "step": 8545 + }, + { + "epoch": 0.39788625835137464, + "grad_norm": 0.48752665255786015, + "learning_rate": 9.967616066661827e-05, + "loss": 3.4209, + "step": 8546 + }, + { + "epoch": 0.3979328165374677, + "grad_norm": 0.5390057158645672, + "learning_rate": 9.967585280180229e-05, + "loss": 3.3963, + "step": 8547 + }, + { + "epoch": 0.39797937472356076, + "grad_norm": 0.4850049178977143, + "learning_rate": 9.967554479119258e-05, + "loss": 3.384, + "step": 8548 + }, + { + "epoch": 0.3980259329096538, + "grad_norm": 0.44927128298969177, + "learning_rate": 9.967523663479003e-05, + "loss": 3.27, + "step": 8549 + }, + { + "epoch": 0.3980724910957469, + "grad_norm": 0.5276965068069869, + 
"learning_rate": 9.967492833259555e-05, + "loss": 3.4219, + "step": 8550 + }, + { + "epoch": 0.39811904928184, + "grad_norm": 0.42818372758946216, + "learning_rate": 9.967461988461007e-05, + "loss": 3.437, + "step": 8551 + }, + { + "epoch": 0.39816560746793306, + "grad_norm": 0.5343336018505008, + "learning_rate": 9.967431129083445e-05, + "loss": 3.3477, + "step": 8552 + }, + { + "epoch": 0.3982121656540261, + "grad_norm": 0.5036445302404886, + "learning_rate": 9.967400255126964e-05, + "loss": 3.3964, + "step": 8553 + }, + { + "epoch": 0.3982587238401192, + "grad_norm": 0.512468333736315, + "learning_rate": 9.967369366591652e-05, + "loss": 3.4134, + "step": 8554 + }, + { + "epoch": 0.39830528202621224, + "grad_norm": 0.5236527735645393, + "learning_rate": 9.967338463477601e-05, + "loss": 3.2822, + "step": 8555 + }, + { + "epoch": 0.39835184021230535, + "grad_norm": 0.499348262136517, + "learning_rate": 9.967307545784899e-05, + "loss": 3.3633, + "step": 8556 + }, + { + "epoch": 0.3983983983983984, + "grad_norm": 0.4491507479868839, + "learning_rate": 9.967276613513639e-05, + "loss": 3.2672, + "step": 8557 + }, + { + "epoch": 0.39844495658449147, + "grad_norm": 0.4308417240547544, + "learning_rate": 9.967245666663912e-05, + "loss": 3.3017, + "step": 8558 + }, + { + "epoch": 0.39849151477058453, + "grad_norm": 0.42731844508395156, + "learning_rate": 9.967214705235809e-05, + "loss": 3.2133, + "step": 8559 + }, + { + "epoch": 0.3985380729566776, + "grad_norm": 0.4111512273760166, + "learning_rate": 9.967183729229419e-05, + "loss": 3.3416, + "step": 8560 + }, + { + "epoch": 0.39858463114277065, + "grad_norm": 0.45070817238726874, + "learning_rate": 9.967152738644836e-05, + "loss": 3.2997, + "step": 8561 + }, + { + "epoch": 0.39863118932886377, + "grad_norm": 0.44033475874470696, + "learning_rate": 9.967121733482147e-05, + "loss": 3.3074, + "step": 8562 + }, + { + "epoch": 0.3986777475149568, + "grad_norm": 0.48183579975933716, + "learning_rate": 9.967090713741446e-05, + "loss": 3.4661, + "step": 8563 + }, + { + "epoch": 0.3987243057010499, + "grad_norm": 0.48959929936959495, + "learning_rate": 9.967059679422824e-05, + "loss": 3.3757, + "step": 8564 + }, + { + "epoch": 0.39877086388714295, + "grad_norm": 0.430409177401479, + "learning_rate": 9.967028630526369e-05, + "loss": 3.252, + "step": 8565 + }, + { + "epoch": 0.398817422073236, + "grad_norm": 0.37613152062913136, + "learning_rate": 9.966997567052176e-05, + "loss": 3.2896, + "step": 8566 + }, + { + "epoch": 0.3988639802593291, + "grad_norm": 0.41701381315610336, + "learning_rate": 9.966966489000334e-05, + "loss": 3.3246, + "step": 8567 + }, + { + "epoch": 0.3989105384454222, + "grad_norm": 0.43154770851115914, + "learning_rate": 9.966935396370935e-05, + "loss": 3.2761, + "step": 8568 + }, + { + "epoch": 0.39895709663151524, + "grad_norm": 0.40917440981123765, + "learning_rate": 9.966904289164069e-05, + "loss": 3.4299, + "step": 8569 + }, + { + "epoch": 0.3990036548176083, + "grad_norm": 0.39671869216016364, + "learning_rate": 9.966873167379829e-05, + "loss": 3.4288, + "step": 8570 + }, + { + "epoch": 0.39905021300370136, + "grad_norm": 0.40173850134356104, + "learning_rate": 9.966842031018306e-05, + "loss": 3.3577, + "step": 8571 + }, + { + "epoch": 0.3990967711897944, + "grad_norm": 0.4216112820564448, + "learning_rate": 9.96681088007959e-05, + "loss": 3.3378, + "step": 8572 + }, + { + "epoch": 0.39914332937588753, + "grad_norm": 0.40762815926204327, + "learning_rate": 9.966779714563774e-05, + "loss": 3.3607, + "step": 8573 + }, + { + 
"epoch": 0.3991898875619806, + "grad_norm": 0.4614781287248337, + "learning_rate": 9.966748534470948e-05, + "loss": 3.2621, + "step": 8574 + }, + { + "epoch": 0.39923644574807365, + "grad_norm": 0.41920996749842915, + "learning_rate": 9.966717339801206e-05, + "loss": 3.2754, + "step": 8575 + }, + { + "epoch": 0.3992830039341667, + "grad_norm": 0.4208884507497401, + "learning_rate": 9.966686130554637e-05, + "loss": 3.3622, + "step": 8576 + }, + { + "epoch": 0.3993295621202598, + "grad_norm": 0.4903443073734668, + "learning_rate": 9.966654906731332e-05, + "loss": 3.3564, + "step": 8577 + }, + { + "epoch": 0.3993761203063529, + "grad_norm": 0.4613706743025196, + "learning_rate": 9.966623668331385e-05, + "loss": 3.4178, + "step": 8578 + }, + { + "epoch": 0.39942267849244595, + "grad_norm": 0.4122128666096355, + "learning_rate": 9.966592415354888e-05, + "loss": 3.342, + "step": 8579 + }, + { + "epoch": 0.399469236678539, + "grad_norm": 0.3966843580861166, + "learning_rate": 9.966561147801932e-05, + "loss": 3.3233, + "step": 8580 + }, + { + "epoch": 0.39951579486463207, + "grad_norm": 0.44709258534226504, + "learning_rate": 9.966529865672607e-05, + "loss": 3.5177, + "step": 8581 + }, + { + "epoch": 0.3995623530507251, + "grad_norm": 0.47186085802972577, + "learning_rate": 9.966498568967005e-05, + "loss": 3.2994, + "step": 8582 + }, + { + "epoch": 0.3996089112368182, + "grad_norm": 0.38244023811837957, + "learning_rate": 9.966467257685222e-05, + "loss": 3.2596, + "step": 8583 + }, + { + "epoch": 0.3996554694229113, + "grad_norm": 0.42672701981567984, + "learning_rate": 9.966435931827345e-05, + "loss": 3.331, + "step": 8584 + }, + { + "epoch": 0.39970202760900436, + "grad_norm": 0.4357718951239302, + "learning_rate": 9.966404591393468e-05, + "loss": 3.2989, + "step": 8585 + }, + { + "epoch": 0.3997485857950974, + "grad_norm": 0.44898188786487375, + "learning_rate": 9.966373236383681e-05, + "loss": 3.4439, + "step": 8586 + }, + { + "epoch": 0.3997951439811905, + "grad_norm": 0.45422579016640235, + "learning_rate": 9.966341866798082e-05, + "loss": 3.4329, + "step": 8587 + }, + { + "epoch": 0.39984170216728354, + "grad_norm": 0.4254637268998534, + "learning_rate": 9.966310482636755e-05, + "loss": 3.2733, + "step": 8588 + }, + { + "epoch": 0.39988826035337666, + "grad_norm": 0.40952870018799914, + "learning_rate": 9.966279083899798e-05, + "loss": 3.3822, + "step": 8589 + }, + { + "epoch": 0.3999348185394697, + "grad_norm": 0.44717412336077494, + "learning_rate": 9.966247670587301e-05, + "loss": 3.3931, + "step": 8590 + }, + { + "epoch": 0.3999813767255628, + "grad_norm": 0.4239997444378877, + "learning_rate": 9.966216242699355e-05, + "loss": 3.3495, + "step": 8591 + }, + { + "epoch": 0.40002793491165584, + "grad_norm": 0.39236540746235105, + "learning_rate": 9.966184800236055e-05, + "loss": 3.3289, + "step": 8592 + }, + { + "epoch": 0.4000744930977489, + "grad_norm": 0.46032645493356444, + "learning_rate": 9.966153343197491e-05, + "loss": 3.355, + "step": 8593 + }, + { + "epoch": 0.40012105128384196, + "grad_norm": 0.4516505860676737, + "learning_rate": 9.966121871583756e-05, + "loss": 3.4069, + "step": 8594 + }, + { + "epoch": 0.40016760946993507, + "grad_norm": 0.46280176667016726, + "learning_rate": 9.966090385394944e-05, + "loss": 3.4099, + "step": 8595 + }, + { + "epoch": 0.40021416765602813, + "grad_norm": 0.4484075221096517, + "learning_rate": 9.966058884631145e-05, + "loss": 3.3784, + "step": 8596 + }, + { + "epoch": 0.4002607258421212, + "grad_norm": 0.40268066937770997, + "learning_rate": 
9.966027369292452e-05, + "loss": 3.2839, + "step": 8597 + }, + { + "epoch": 0.40030728402821425, + "grad_norm": 0.3714092607604111, + "learning_rate": 9.965995839378958e-05, + "loss": 3.2964, + "step": 8598 + }, + { + "epoch": 0.4003538422143073, + "grad_norm": 0.45573805156892305, + "learning_rate": 9.965964294890758e-05, + "loss": 3.2246, + "step": 8599 + }, + { + "epoch": 0.4004004004004004, + "grad_norm": 0.39071160871271854, + "learning_rate": 9.965932735827939e-05, + "loss": 3.2701, + "step": 8600 + }, + { + "epoch": 0.4004469585864935, + "grad_norm": 0.4167322909751075, + "learning_rate": 9.965901162190597e-05, + "loss": 3.3146, + "step": 8601 + }, + { + "epoch": 0.40049351677258654, + "grad_norm": 0.4633805929795507, + "learning_rate": 9.965869573978825e-05, + "loss": 3.3503, + "step": 8602 + }, + { + "epoch": 0.4005400749586796, + "grad_norm": 0.44739175104810797, + "learning_rate": 9.965837971192716e-05, + "loss": 3.3644, + "step": 8603 + }, + { + "epoch": 0.40058663314477266, + "grad_norm": 0.49441137834459514, + "learning_rate": 9.96580635383236e-05, + "loss": 3.2959, + "step": 8604 + }, + { + "epoch": 0.4006331913308657, + "grad_norm": 0.4972962028836905, + "learning_rate": 9.965774721897852e-05, + "loss": 3.349, + "step": 8605 + }, + { + "epoch": 0.40067974951695884, + "grad_norm": 0.4698163272347432, + "learning_rate": 9.965743075389284e-05, + "loss": 3.2307, + "step": 8606 + }, + { + "epoch": 0.4007263077030519, + "grad_norm": 0.5129329611449678, + "learning_rate": 9.965711414306751e-05, + "loss": 3.4268, + "step": 8607 + }, + { + "epoch": 0.40077286588914496, + "grad_norm": 0.5018115602038532, + "learning_rate": 9.965679738650343e-05, + "loss": 3.3539, + "step": 8608 + }, + { + "epoch": 0.400819424075238, + "grad_norm": 0.5051373022894303, + "learning_rate": 9.965648048420155e-05, + "loss": 3.3524, + "step": 8609 + }, + { + "epoch": 0.4008659822613311, + "grad_norm": 0.5448812491374692, + "learning_rate": 9.965616343616278e-05, + "loss": 3.2345, + "step": 8610 + }, + { + "epoch": 0.4009125404474242, + "grad_norm": 0.546952652668676, + "learning_rate": 9.965584624238806e-05, + "loss": 3.3759, + "step": 8611 + }, + { + "epoch": 0.40095909863351725, + "grad_norm": 0.46514357835901843, + "learning_rate": 9.965552890287833e-05, + "loss": 3.2648, + "step": 8612 + }, + { + "epoch": 0.4010056568196103, + "grad_norm": 0.48467448962758974, + "learning_rate": 9.965521141763452e-05, + "loss": 3.369, + "step": 8613 + }, + { + "epoch": 0.40105221500570337, + "grad_norm": 0.5482783095785753, + "learning_rate": 9.965489378665755e-05, + "loss": 3.4346, + "step": 8614 + }, + { + "epoch": 0.40109877319179643, + "grad_norm": 0.526256594014125, + "learning_rate": 9.965457600994836e-05, + "loss": 3.3134, + "step": 8615 + }, + { + "epoch": 0.4011453313778895, + "grad_norm": 0.43910160426545136, + "learning_rate": 9.965425808750787e-05, + "loss": 3.3484, + "step": 8616 + }, + { + "epoch": 0.4011918895639826, + "grad_norm": 0.5097489158060704, + "learning_rate": 9.965394001933704e-05, + "loss": 3.2716, + "step": 8617 + }, + { + "epoch": 0.40123844775007567, + "grad_norm": 0.5429653966617493, + "learning_rate": 9.965362180543679e-05, + "loss": 3.3092, + "step": 8618 + }, + { + "epoch": 0.4012850059361687, + "grad_norm": 0.45264801187590664, + "learning_rate": 9.965330344580805e-05, + "loss": 3.4063, + "step": 8619 + }, + { + "epoch": 0.4013315641222618, + "grad_norm": 0.5577737161110359, + "learning_rate": 9.965298494045175e-05, + "loss": 3.363, + "step": 8620 + }, + { + "epoch": 
0.40137812230835485, + "grad_norm": 0.4875059646299498, + "learning_rate": 9.965266628936883e-05, + "loss": 3.3843, + "step": 8621 + }, + { + "epoch": 0.40142468049444796, + "grad_norm": 0.42421942345973074, + "learning_rate": 9.965234749256023e-05, + "loss": 3.3245, + "step": 8622 + }, + { + "epoch": 0.401471238680541, + "grad_norm": 0.5170287850422204, + "learning_rate": 9.965202855002687e-05, + "loss": 3.2957, + "step": 8623 + }, + { + "epoch": 0.4015177968666341, + "grad_norm": 0.5138496431896543, + "learning_rate": 9.965170946176971e-05, + "loss": 3.3409, + "step": 8624 + }, + { + "epoch": 0.40156435505272714, + "grad_norm": 0.44018661623196964, + "learning_rate": 9.965139022778967e-05, + "loss": 3.314, + "step": 8625 + }, + { + "epoch": 0.4016109132388202, + "grad_norm": 0.456528011217723, + "learning_rate": 9.965107084808769e-05, + "loss": 3.4114, + "step": 8626 + }, + { + "epoch": 0.40165747142491326, + "grad_norm": 0.44858370916342627, + "learning_rate": 9.965075132266471e-05, + "loss": 3.3275, + "step": 8627 + }, + { + "epoch": 0.4017040296110064, + "grad_norm": 0.4234055057680394, + "learning_rate": 9.965043165152166e-05, + "loss": 3.1785, + "step": 8628 + }, + { + "epoch": 0.40175058779709943, + "grad_norm": 0.4695494572288627, + "learning_rate": 9.96501118346595e-05, + "loss": 3.3444, + "step": 8629 + }, + { + "epoch": 0.4017971459831925, + "grad_norm": 0.46763931869109127, + "learning_rate": 9.964979187207915e-05, + "loss": 3.3833, + "step": 8630 + }, + { + "epoch": 0.40184370416928555, + "grad_norm": 0.43583304783511795, + "learning_rate": 9.964947176378154e-05, + "loss": 3.3998, + "step": 8631 + }, + { + "epoch": 0.4018902623553786, + "grad_norm": 0.4595529450135011, + "learning_rate": 9.964915150976763e-05, + "loss": 3.3637, + "step": 8632 + }, + { + "epoch": 0.40193682054147173, + "grad_norm": 0.5009306440090607, + "learning_rate": 9.964883111003835e-05, + "loss": 3.2803, + "step": 8633 + }, + { + "epoch": 0.4019833787275648, + "grad_norm": 0.43793629732182204, + "learning_rate": 9.964851056459465e-05, + "loss": 3.38, + "step": 8634 + }, + { + "epoch": 0.40202993691365785, + "grad_norm": 0.4411861703488384, + "learning_rate": 9.964818987343745e-05, + "loss": 3.4092, + "step": 8635 + }, + { + "epoch": 0.4020764950997509, + "grad_norm": 0.4718690755418445, + "learning_rate": 9.964786903656771e-05, + "loss": 3.4102, + "step": 8636 + }, + { + "epoch": 0.40212305328584397, + "grad_norm": 0.4677232195307654, + "learning_rate": 9.964754805398637e-05, + "loss": 3.2959, + "step": 8637 + }, + { + "epoch": 0.402169611471937, + "grad_norm": 0.43126537991272923, + "learning_rate": 9.964722692569434e-05, + "loss": 3.201, + "step": 8638 + }, + { + "epoch": 0.40221616965803014, + "grad_norm": 0.4150403997751233, + "learning_rate": 9.964690565169262e-05, + "loss": 3.4036, + "step": 8639 + }, + { + "epoch": 0.4022627278441232, + "grad_norm": 0.4674152482068893, + "learning_rate": 9.964658423198212e-05, + "loss": 3.4069, + "step": 8640 + }, + { + "epoch": 0.40230928603021626, + "grad_norm": 0.42273831939298284, + "learning_rate": 9.964626266656378e-05, + "loss": 3.434, + "step": 8641 + }, + { + "epoch": 0.4023558442163093, + "grad_norm": 0.37023765371120115, + "learning_rate": 9.964594095543855e-05, + "loss": 3.3013, + "step": 8642 + }, + { + "epoch": 0.4024024024024024, + "grad_norm": 0.4469772559555663, + "learning_rate": 9.964561909860737e-05, + "loss": 3.3256, + "step": 8643 + }, + { + "epoch": 0.4024489605884955, + "grad_norm": 0.3856700310055636, + "learning_rate": 
9.96452970960712e-05, + "loss": 3.3827, + "step": 8644 + }, + { + "epoch": 0.40249551877458856, + "grad_norm": 0.39706775315954307, + "learning_rate": 9.964497494783096e-05, + "loss": 3.2867, + "step": 8645 + }, + { + "epoch": 0.4025420769606816, + "grad_norm": 0.3831585997650902, + "learning_rate": 9.964465265388762e-05, + "loss": 3.3418, + "step": 8646 + }, + { + "epoch": 0.4025886351467747, + "grad_norm": 0.38548917715661585, + "learning_rate": 9.964433021424211e-05, + "loss": 3.3229, + "step": 8647 + }, + { + "epoch": 0.40263519333286774, + "grad_norm": 0.4171107694176683, + "learning_rate": 9.964400762889538e-05, + "loss": 3.1773, + "step": 8648 + }, + { + "epoch": 0.4026817515189608, + "grad_norm": 0.3820531127634182, + "learning_rate": 9.964368489784839e-05, + "loss": 3.3713, + "step": 8649 + }, + { + "epoch": 0.4027283097050539, + "grad_norm": 0.37438428691498066, + "learning_rate": 9.964336202110207e-05, + "loss": 3.2812, + "step": 8650 + }, + { + "epoch": 0.40277486789114697, + "grad_norm": 0.43198150488495834, + "learning_rate": 9.964303899865737e-05, + "loss": 3.2694, + "step": 8651 + }, + { + "epoch": 0.40282142607724003, + "grad_norm": 0.37042436105303467, + "learning_rate": 9.964271583051524e-05, + "loss": 3.3775, + "step": 8652 + }, + { + "epoch": 0.4028679842633331, + "grad_norm": 0.41513465480609246, + "learning_rate": 9.964239251667664e-05, + "loss": 3.3299, + "step": 8653 + }, + { + "epoch": 0.40291454244942615, + "grad_norm": 0.3737312320598258, + "learning_rate": 9.96420690571425e-05, + "loss": 3.4335, + "step": 8654 + }, + { + "epoch": 0.40296110063551926, + "grad_norm": 0.3918191536695356, + "learning_rate": 9.964174545191378e-05, + "loss": 3.2254, + "step": 8655 + }, + { + "epoch": 0.4030076588216123, + "grad_norm": 0.35691718373845294, + "learning_rate": 9.964142170099142e-05, + "loss": 3.4231, + "step": 8656 + }, + { + "epoch": 0.4030542170077054, + "grad_norm": 0.4406096995146783, + "learning_rate": 9.964109780437638e-05, + "loss": 3.3169, + "step": 8657 + }, + { + "epoch": 0.40310077519379844, + "grad_norm": 0.42620728448499157, + "learning_rate": 9.964077376206962e-05, + "loss": 3.376, + "step": 8658 + }, + { + "epoch": 0.4031473333798915, + "grad_norm": 0.4057492531396419, + "learning_rate": 9.964044957407206e-05, + "loss": 3.2904, + "step": 8659 + }, + { + "epoch": 0.40319389156598456, + "grad_norm": 0.3975539729927362, + "learning_rate": 9.964012524038469e-05, + "loss": 3.2532, + "step": 8660 + }, + { + "epoch": 0.4032404497520777, + "grad_norm": 0.4842056611330027, + "learning_rate": 9.963980076100843e-05, + "loss": 3.3006, + "step": 8661 + }, + { + "epoch": 0.40328700793817074, + "grad_norm": 0.41700775356254915, + "learning_rate": 9.963947613594426e-05, + "loss": 3.3086, + "step": 8662 + }, + { + "epoch": 0.4033335661242638, + "grad_norm": 0.3705910750045201, + "learning_rate": 9.963915136519311e-05, + "loss": 3.4133, + "step": 8663 + }, + { + "epoch": 0.40338012431035686, + "grad_norm": 0.43822942763887895, + "learning_rate": 9.963882644875593e-05, + "loss": 3.3965, + "step": 8664 + }, + { + "epoch": 0.4034266824964499, + "grad_norm": 0.4697278967099909, + "learning_rate": 9.963850138663369e-05, + "loss": 3.2725, + "step": 8665 + }, + { + "epoch": 0.40347324068254303, + "grad_norm": 0.4081741699032153, + "learning_rate": 9.963817617882736e-05, + "loss": 3.3651, + "step": 8666 + }, + { + "epoch": 0.4035197988686361, + "grad_norm": 0.40464785335024894, + "learning_rate": 9.963785082533786e-05, + "loss": 3.3319, + "step": 8667 + }, + { + "epoch": 
0.40356635705472915, + "grad_norm": 0.4459806693503869, + "learning_rate": 9.963752532616613e-05, + "loss": 3.4212, + "step": 8668 + }, + { + "epoch": 0.4036129152408222, + "grad_norm": 0.42894059318077843, + "learning_rate": 9.963719968131319e-05, + "loss": 3.3229, + "step": 8669 + }, + { + "epoch": 0.40365947342691527, + "grad_norm": 0.4532190030699505, + "learning_rate": 9.963687389077994e-05, + "loss": 3.3136, + "step": 8670 + }, + { + "epoch": 0.40370603161300833, + "grad_norm": 0.4414260837761993, + "learning_rate": 9.963654795456736e-05, + "loss": 3.4008, + "step": 8671 + }, + { + "epoch": 0.40375258979910145, + "grad_norm": 0.4152553047745786, + "learning_rate": 9.96362218726764e-05, + "loss": 3.418, + "step": 8672 + }, + { + "epoch": 0.4037991479851945, + "grad_norm": 0.5152910695774146, + "learning_rate": 9.963589564510802e-05, + "loss": 3.2776, + "step": 8673 + }, + { + "epoch": 0.40384570617128757, + "grad_norm": 0.4709947957654786, + "learning_rate": 9.96355692718632e-05, + "loss": 3.3066, + "step": 8674 + }, + { + "epoch": 0.4038922643573806, + "grad_norm": 0.4255484142048042, + "learning_rate": 9.963524275294283e-05, + "loss": 3.4201, + "step": 8675 + }, + { + "epoch": 0.4039388225434737, + "grad_norm": 0.4721698136308832, + "learning_rate": 9.963491608834794e-05, + "loss": 3.3777, + "step": 8676 + }, + { + "epoch": 0.4039853807295668, + "grad_norm": 0.43326796622285907, + "learning_rate": 9.963458927807946e-05, + "loss": 3.3105, + "step": 8677 + }, + { + "epoch": 0.40403193891565986, + "grad_norm": 0.5099111850034801, + "learning_rate": 9.963426232213835e-05, + "loss": 3.2937, + "step": 8678 + }, + { + "epoch": 0.4040784971017529, + "grad_norm": 0.503675010225604, + "learning_rate": 9.963393522052556e-05, + "loss": 3.4016, + "step": 8679 + }, + { + "epoch": 0.404125055287846, + "grad_norm": 0.41063644691179274, + "learning_rate": 9.963360797324206e-05, + "loss": 3.2187, + "step": 8680 + }, + { + "epoch": 0.40417161347393904, + "grad_norm": 0.5237879514374109, + "learning_rate": 9.963328058028882e-05, + "loss": 3.2633, + "step": 8681 + }, + { + "epoch": 0.4042181716600321, + "grad_norm": 0.5729056383841084, + "learning_rate": 9.963295304166678e-05, + "loss": 3.2776, + "step": 8682 + }, + { + "epoch": 0.4042647298461252, + "grad_norm": 0.4737674200174514, + "learning_rate": 9.963262535737694e-05, + "loss": 3.2946, + "step": 8683 + }, + { + "epoch": 0.4043112880322183, + "grad_norm": 0.47138518634349047, + "learning_rate": 9.96322975274202e-05, + "loss": 3.368, + "step": 8684 + }, + { + "epoch": 0.40435784621831133, + "grad_norm": 0.5036810156917919, + "learning_rate": 9.963196955179757e-05, + "loss": 3.4179, + "step": 8685 + }, + { + "epoch": 0.4044044044044044, + "grad_norm": 0.44467924919882235, + "learning_rate": 9.963164143051e-05, + "loss": 3.3104, + "step": 8686 + }, + { + "epoch": 0.40445096259049745, + "grad_norm": 0.4559970407966485, + "learning_rate": 9.963131316355845e-05, + "loss": 3.4224, + "step": 8687 + }, + { + "epoch": 0.40449752077659057, + "grad_norm": 0.4453644394179485, + "learning_rate": 9.96309847509439e-05, + "loss": 3.257, + "step": 8688 + }, + { + "epoch": 0.40454407896268363, + "grad_norm": 0.4243467945539518, + "learning_rate": 9.963065619266727e-05, + "loss": 3.2768, + "step": 8689 + }, + { + "epoch": 0.4045906371487767, + "grad_norm": 0.45200354689870514, + "learning_rate": 9.963032748872958e-05, + "loss": 3.366, + "step": 8690 + }, + { + "epoch": 0.40463719533486975, + "grad_norm": 0.4340653678454758, + "learning_rate": 9.962999863913175e-05, + 
"loss": 3.2531, + "step": 8691 + }, + { + "epoch": 0.4046837535209628, + "grad_norm": 0.39111404778840875, + "learning_rate": 9.962966964387477e-05, + "loss": 3.3447, + "step": 8692 + }, + { + "epoch": 0.40473031170705587, + "grad_norm": 0.4127306341424654, + "learning_rate": 9.96293405029596e-05, + "loss": 3.3188, + "step": 8693 + }, + { + "epoch": 0.404776869893149, + "grad_norm": 0.47980254927375865, + "learning_rate": 9.96290112163872e-05, + "loss": 3.3188, + "step": 8694 + }, + { + "epoch": 0.40482342807924204, + "grad_norm": 0.39842511571368355, + "learning_rate": 9.962868178415855e-05, + "loss": 3.3008, + "step": 8695 + }, + { + "epoch": 0.4048699862653351, + "grad_norm": 0.36614514247179886, + "learning_rate": 9.96283522062746e-05, + "loss": 3.2799, + "step": 8696 + }, + { + "epoch": 0.40491654445142816, + "grad_norm": 0.3967281058644646, + "learning_rate": 9.962802248273632e-05, + "loss": 3.2911, + "step": 8697 + }, + { + "epoch": 0.4049631026375212, + "grad_norm": 0.4291169430935477, + "learning_rate": 9.96276926135447e-05, + "loss": 3.311, + "step": 8698 + }, + { + "epoch": 0.40500966082361434, + "grad_norm": 0.44141710755038016, + "learning_rate": 9.962736259870069e-05, + "loss": 3.3622, + "step": 8699 + }, + { + "epoch": 0.4050562190097074, + "grad_norm": 0.4365112528431902, + "learning_rate": 9.962703243820525e-05, + "loss": 3.2977, + "step": 8700 + }, + { + "epoch": 0.40510277719580046, + "grad_norm": 0.39594914986587876, + "learning_rate": 9.962670213205937e-05, + "loss": 3.2932, + "step": 8701 + }, + { + "epoch": 0.4051493353818935, + "grad_norm": 0.39301375341123435, + "learning_rate": 9.9626371680264e-05, + "loss": 3.3346, + "step": 8702 + }, + { + "epoch": 0.4051958935679866, + "grad_norm": 0.4055617259777456, + "learning_rate": 9.962604108282013e-05, + "loss": 3.2722, + "step": 8703 + }, + { + "epoch": 0.40524245175407964, + "grad_norm": 0.42466207555456503, + "learning_rate": 9.96257103397287e-05, + "loss": 3.4726, + "step": 8704 + }, + { + "epoch": 0.40528900994017275, + "grad_norm": 0.41117254133258824, + "learning_rate": 9.962537945099071e-05, + "loss": 3.2906, + "step": 8705 + }, + { + "epoch": 0.4053355681262658, + "grad_norm": 0.406797262758545, + "learning_rate": 9.962504841660712e-05, + "loss": 3.3959, + "step": 8706 + }, + { + "epoch": 0.40538212631235887, + "grad_norm": 0.4262861670342066, + "learning_rate": 9.962471723657889e-05, + "loss": 3.4088, + "step": 8707 + }, + { + "epoch": 0.40542868449845193, + "grad_norm": 0.3829574988136765, + "learning_rate": 9.962438591090701e-05, + "loss": 3.4455, + "step": 8708 + }, + { + "epoch": 0.405475242684545, + "grad_norm": 0.43633235175763285, + "learning_rate": 9.962405443959244e-05, + "loss": 3.3935, + "step": 8709 + }, + { + "epoch": 0.4055218008706381, + "grad_norm": 0.4291597052960054, + "learning_rate": 9.962372282263617e-05, + "loss": 3.2368, + "step": 8710 + }, + { + "epoch": 0.40556835905673116, + "grad_norm": 0.46595211100920797, + "learning_rate": 9.962339106003917e-05, + "loss": 3.3575, + "step": 8711 + }, + { + "epoch": 0.4056149172428242, + "grad_norm": 0.42235203628088064, + "learning_rate": 9.962305915180238e-05, + "loss": 3.2568, + "step": 8712 + }, + { + "epoch": 0.4056614754289173, + "grad_norm": 0.4612554605051871, + "learning_rate": 9.962272709792683e-05, + "loss": 3.2472, + "step": 8713 + }, + { + "epoch": 0.40570803361501034, + "grad_norm": 0.5721663429354588, + "learning_rate": 9.962239489841345e-05, + "loss": 3.3423, + "step": 8714 + }, + { + "epoch": 0.4057545918011034, + "grad_norm": 
0.445789720372622, + "learning_rate": 9.962206255326323e-05, + "loss": 3.3182, + "step": 8715 + }, + { + "epoch": 0.4058011499871965, + "grad_norm": 0.44640416454513665, + "learning_rate": 9.962173006247715e-05, + "loss": 3.4055, + "step": 8716 + }, + { + "epoch": 0.4058477081732896, + "grad_norm": 0.4858142956632304, + "learning_rate": 9.962139742605616e-05, + "loss": 3.3448, + "step": 8717 + }, + { + "epoch": 0.40589426635938264, + "grad_norm": 0.4867388842086327, + "learning_rate": 9.962106464400128e-05, + "loss": 3.4175, + "step": 8718 + }, + { + "epoch": 0.4059408245454757, + "grad_norm": 0.45408965937965867, + "learning_rate": 9.962073171631345e-05, + "loss": 3.1583, + "step": 8719 + }, + { + "epoch": 0.40598738273156876, + "grad_norm": 0.3843906156015199, + "learning_rate": 9.962039864299368e-05, + "loss": 3.2324, + "step": 8720 + }, + { + "epoch": 0.4060339409176619, + "grad_norm": 0.464551918217109, + "learning_rate": 9.962006542404292e-05, + "loss": 3.3231, + "step": 8721 + }, + { + "epoch": 0.40608049910375493, + "grad_norm": 0.42313905774444993, + "learning_rate": 9.961973205946215e-05, + "loss": 3.3486, + "step": 8722 + }, + { + "epoch": 0.406127057289848, + "grad_norm": 0.3863859288777957, + "learning_rate": 9.961939854925235e-05, + "loss": 3.3582, + "step": 8723 + }, + { + "epoch": 0.40617361547594105, + "grad_norm": 0.36057588003436536, + "learning_rate": 9.961906489341452e-05, + "loss": 3.2807, + "step": 8724 + }, + { + "epoch": 0.4062201736620341, + "grad_norm": 0.3793932908493149, + "learning_rate": 9.961873109194961e-05, + "loss": 3.2844, + "step": 8725 + }, + { + "epoch": 0.40626673184812717, + "grad_norm": 0.3979658575071167, + "learning_rate": 9.96183971448586e-05, + "loss": 3.3955, + "step": 8726 + }, + { + "epoch": 0.4063132900342203, + "grad_norm": 0.3838675464699707, + "learning_rate": 9.961806305214252e-05, + "loss": 3.418, + "step": 8727 + }, + { + "epoch": 0.40635984822031335, + "grad_norm": 0.3844660186905768, + "learning_rate": 9.96177288138023e-05, + "loss": 3.3406, + "step": 8728 + }, + { + "epoch": 0.4064064064064064, + "grad_norm": 0.3998378594832958, + "learning_rate": 9.96173944298389e-05, + "loss": 3.3318, + "step": 8729 + }, + { + "epoch": 0.40645296459249947, + "grad_norm": 0.43958007004162375, + "learning_rate": 9.961705990025337e-05, + "loss": 3.3046, + "step": 8730 + }, + { + "epoch": 0.4064995227785925, + "grad_norm": 0.4430830174548086, + "learning_rate": 9.961672522504665e-05, + "loss": 3.347, + "step": 8731 + }, + { + "epoch": 0.40654608096468564, + "grad_norm": 0.4471029472520066, + "learning_rate": 9.961639040421972e-05, + "loss": 3.2652, + "step": 8732 + }, + { + "epoch": 0.4065926391507787, + "grad_norm": 0.4076791901914356, + "learning_rate": 9.96160554377736e-05, + "loss": 3.3674, + "step": 8733 + }, + { + "epoch": 0.40663919733687176, + "grad_norm": 0.43603330005410174, + "learning_rate": 9.961572032570921e-05, + "loss": 3.4033, + "step": 8734 + }, + { + "epoch": 0.4066857555229648, + "grad_norm": 0.4253550237096424, + "learning_rate": 9.96153850680276e-05, + "loss": 3.2772, + "step": 8735 + }, + { + "epoch": 0.4067323137090579, + "grad_norm": 0.4532203942542015, + "learning_rate": 9.96150496647297e-05, + "loss": 3.2997, + "step": 8736 + }, + { + "epoch": 0.40677887189515094, + "grad_norm": 0.4742760701339291, + "learning_rate": 9.961471411581651e-05, + "loss": 3.4057, + "step": 8737 + }, + { + "epoch": 0.40682543008124405, + "grad_norm": 0.4289123196099702, + "learning_rate": 9.961437842128904e-05, + "loss": 3.3719, + "step": 8738 + 
}, + { + "epoch": 0.4068719882673371, + "grad_norm": 0.4399549080071717, + "learning_rate": 9.961404258114826e-05, + "loss": 3.2593, + "step": 8739 + }, + { + "epoch": 0.4069185464534302, + "grad_norm": 0.47749372721946043, + "learning_rate": 9.961370659539515e-05, + "loss": 3.2429, + "step": 8740 + }, + { + "epoch": 0.40696510463952323, + "grad_norm": 0.4371017751633655, + "learning_rate": 9.961337046403069e-05, + "loss": 3.4032, + "step": 8741 + }, + { + "epoch": 0.4070116628256163, + "grad_norm": 0.5155344676385679, + "learning_rate": 9.961303418705588e-05, + "loss": 3.3404, + "step": 8742 + }, + { + "epoch": 0.4070582210117094, + "grad_norm": 0.5122468279054914, + "learning_rate": 9.961269776447171e-05, + "loss": 3.2929, + "step": 8743 + }, + { + "epoch": 0.40710477919780247, + "grad_norm": 0.5104927358682175, + "learning_rate": 9.961236119627915e-05, + "loss": 3.377, + "step": 8744 + }, + { + "epoch": 0.40715133738389553, + "grad_norm": 0.452850416027841, + "learning_rate": 9.961202448247919e-05, + "loss": 3.2951, + "step": 8745 + }, + { + "epoch": 0.4071978955699886, + "grad_norm": 0.38158861301371594, + "learning_rate": 9.961168762307285e-05, + "loss": 3.2715, + "step": 8746 + }, + { + "epoch": 0.40724445375608165, + "grad_norm": 0.47571899647047305, + "learning_rate": 9.961135061806107e-05, + "loss": 3.3233, + "step": 8747 + }, + { + "epoch": 0.4072910119421747, + "grad_norm": 0.46434270450030707, + "learning_rate": 9.961101346744488e-05, + "loss": 3.1562, + "step": 8748 + }, + { + "epoch": 0.4073375701282678, + "grad_norm": 0.5082758502713666, + "learning_rate": 9.961067617122524e-05, + "loss": 3.402, + "step": 8749 + }, + { + "epoch": 0.4073841283143609, + "grad_norm": 0.4631143942553854, + "learning_rate": 9.961033872940316e-05, + "loss": 3.3426, + "step": 8750 + }, + { + "epoch": 0.40743068650045394, + "grad_norm": 0.39894817319342013, + "learning_rate": 9.961000114197962e-05, + "loss": 3.3471, + "step": 8751 + }, + { + "epoch": 0.407477244686547, + "grad_norm": 0.4565376558751419, + "learning_rate": 9.960966340895562e-05, + "loss": 3.2969, + "step": 8752 + }, + { + "epoch": 0.40752380287264006, + "grad_norm": 0.4297678642481119, + "learning_rate": 9.960932553033216e-05, + "loss": 3.2627, + "step": 8753 + }, + { + "epoch": 0.4075703610587332, + "grad_norm": 0.40051173008063135, + "learning_rate": 9.960898750611018e-05, + "loss": 3.3859, + "step": 8754 + }, + { + "epoch": 0.40761691924482624, + "grad_norm": 0.48383927220866, + "learning_rate": 9.960864933629072e-05, + "loss": 3.3575, + "step": 8755 + }, + { + "epoch": 0.4076634774309193, + "grad_norm": 0.4856185424982187, + "learning_rate": 9.960831102087478e-05, + "loss": 3.4481, + "step": 8756 + }, + { + "epoch": 0.40771003561701236, + "grad_norm": 0.45648553638224115, + "learning_rate": 9.960797255986333e-05, + "loss": 3.3691, + "step": 8757 + }, + { + "epoch": 0.4077565938031054, + "grad_norm": 0.4880705562581557, + "learning_rate": 9.960763395325736e-05, + "loss": 3.2299, + "step": 8758 + }, + { + "epoch": 0.4078031519891985, + "grad_norm": 0.43714930593811646, + "learning_rate": 9.960729520105788e-05, + "loss": 3.2236, + "step": 8759 + }, + { + "epoch": 0.4078497101752916, + "grad_norm": 0.4484637134674577, + "learning_rate": 9.960695630326586e-05, + "loss": 3.3337, + "step": 8760 + }, + { + "epoch": 0.40789626836138465, + "grad_norm": 0.49670990836926043, + "learning_rate": 9.960661725988232e-05, + "loss": 3.2958, + "step": 8761 + }, + { + "epoch": 0.4079428265474777, + "grad_norm": 0.500014816405234, + "learning_rate": 
9.960627807090824e-05, + "loss": 3.2817, + "step": 8762 + }, + { + "epoch": 0.40798938473357077, + "grad_norm": 0.4789584944366052, + "learning_rate": 9.960593873634463e-05, + "loss": 3.3903, + "step": 8763 + }, + { + "epoch": 0.40803594291966383, + "grad_norm": 0.4419970989851253, + "learning_rate": 9.960559925619247e-05, + "loss": 3.4003, + "step": 8764 + }, + { + "epoch": 0.40808250110575695, + "grad_norm": 0.4562663184464875, + "learning_rate": 9.960525963045278e-05, + "loss": 3.2238, + "step": 8765 + }, + { + "epoch": 0.40812905929185, + "grad_norm": 0.42011961960612393, + "learning_rate": 9.960491985912651e-05, + "loss": 3.2486, + "step": 8766 + }, + { + "epoch": 0.40817561747794306, + "grad_norm": 0.38054839657278144, + "learning_rate": 9.96045799422147e-05, + "loss": 3.2688, + "step": 8767 + }, + { + "epoch": 0.4082221756640361, + "grad_norm": 0.48589746118394556, + "learning_rate": 9.960423987971835e-05, + "loss": 3.3374, + "step": 8768 + }, + { + "epoch": 0.4082687338501292, + "grad_norm": 0.4233243355733994, + "learning_rate": 9.960389967163842e-05, + "loss": 3.3552, + "step": 8769 + }, + { + "epoch": 0.40831529203622224, + "grad_norm": 0.4410713532749796, + "learning_rate": 9.960355931797594e-05, + "loss": 3.3048, + "step": 8770 + }, + { + "epoch": 0.40836185022231536, + "grad_norm": 0.4318542101897238, + "learning_rate": 9.960321881873191e-05, + "loss": 3.2925, + "step": 8771 + }, + { + "epoch": 0.4084084084084084, + "grad_norm": 0.41657003197922937, + "learning_rate": 9.96028781739073e-05, + "loss": 3.3496, + "step": 8772 + }, + { + "epoch": 0.4084549665945015, + "grad_norm": 0.42777058563297926, + "learning_rate": 9.960253738350315e-05, + "loss": 3.3225, + "step": 8773 + }, + { + "epoch": 0.40850152478059454, + "grad_norm": 0.4213797573711051, + "learning_rate": 9.960219644752043e-05, + "loss": 3.4041, + "step": 8774 + }, + { + "epoch": 0.4085480829666876, + "grad_norm": 0.3905421671935037, + "learning_rate": 9.960185536596013e-05, + "loss": 3.1683, + "step": 8775 + }, + { + "epoch": 0.4085946411527807, + "grad_norm": 0.40157120711172994, + "learning_rate": 9.960151413882331e-05, + "loss": 3.3549, + "step": 8776 + }, + { + "epoch": 0.4086411993388738, + "grad_norm": 0.43571535100655234, + "learning_rate": 9.960117276611092e-05, + "loss": 3.3259, + "step": 8777 + }, + { + "epoch": 0.40868775752496683, + "grad_norm": 0.4366351985594952, + "learning_rate": 9.960083124782396e-05, + "loss": 3.3094, + "step": 8778 + }, + { + "epoch": 0.4087343157110599, + "grad_norm": 0.42139635842063683, + "learning_rate": 9.960048958396345e-05, + "loss": 3.3466, + "step": 8779 + }, + { + "epoch": 0.40878087389715295, + "grad_norm": 0.4047333473572423, + "learning_rate": 9.96001477745304e-05, + "loss": 3.4043, + "step": 8780 + }, + { + "epoch": 0.408827432083246, + "grad_norm": 0.4196180726205951, + "learning_rate": 9.959980581952579e-05, + "loss": 3.3855, + "step": 8781 + }, + { + "epoch": 0.4088739902693391, + "grad_norm": 0.3945597658809591, + "learning_rate": 9.959946371895064e-05, + "loss": 3.2666, + "step": 8782 + }, + { + "epoch": 0.4089205484554322, + "grad_norm": 0.37917989566293664, + "learning_rate": 9.959912147280597e-05, + "loss": 3.2974, + "step": 8783 + }, + { + "epoch": 0.40896710664152525, + "grad_norm": 0.471314599837252, + "learning_rate": 9.959877908109275e-05, + "loss": 3.2997, + "step": 8784 + }, + { + "epoch": 0.4090136648276183, + "grad_norm": 0.45172384456414416, + "learning_rate": 9.959843654381198e-05, + "loss": 3.2582, + "step": 8785 + }, + { + "epoch": 
0.40906022301371137, + "grad_norm": 0.46196642155920914, + "learning_rate": 9.95980938609647e-05, + "loss": 3.3258, + "step": 8786 + }, + { + "epoch": 0.4091067811998045, + "grad_norm": 0.540965782066533, + "learning_rate": 9.95977510325519e-05, + "loss": 3.3126, + "step": 8787 + }, + { + "epoch": 0.40915333938589754, + "grad_norm": 0.5238239760523893, + "learning_rate": 9.95974080585746e-05, + "loss": 3.2608, + "step": 8788 + }, + { + "epoch": 0.4091998975719906, + "grad_norm": 0.42284221711674885, + "learning_rate": 9.95970649390338e-05, + "loss": 3.2753, + "step": 8789 + }, + { + "epoch": 0.40924645575808366, + "grad_norm": 0.4991902260382104, + "learning_rate": 9.959672167393047e-05, + "loss": 3.3481, + "step": 8790 + }, + { + "epoch": 0.4092930139441767, + "grad_norm": 0.6273834967248643, + "learning_rate": 9.959637826326567e-05, + "loss": 3.151, + "step": 8791 + }, + { + "epoch": 0.4093395721302698, + "grad_norm": 0.5957492997736334, + "learning_rate": 9.959603470704037e-05, + "loss": 3.351, + "step": 8792 + }, + { + "epoch": 0.4093861303163629, + "grad_norm": 0.6306558697356434, + "learning_rate": 9.95956910052556e-05, + "loss": 3.3176, + "step": 8793 + }, + { + "epoch": 0.40943268850245595, + "grad_norm": 0.48487245296137466, + "learning_rate": 9.959534715791237e-05, + "loss": 3.3437, + "step": 8794 + }, + { + "epoch": 0.409479246688549, + "grad_norm": 0.5222482760852704, + "learning_rate": 9.959500316501168e-05, + "loss": 3.2516, + "step": 8795 + }, + { + "epoch": 0.4095258048746421, + "grad_norm": 0.4639117267716946, + "learning_rate": 9.959465902655454e-05, + "loss": 3.3156, + "step": 8796 + }, + { + "epoch": 0.40957236306073513, + "grad_norm": 0.4573301215053848, + "learning_rate": 9.959431474254196e-05, + "loss": 3.3108, + "step": 8797 + }, + { + "epoch": 0.40961892124682825, + "grad_norm": 0.4829483939162874, + "learning_rate": 9.959397031297495e-05, + "loss": 3.2954, + "step": 8798 + }, + { + "epoch": 0.4096654794329213, + "grad_norm": 0.4183617976614113, + "learning_rate": 9.959362573785454e-05, + "loss": 3.2717, + "step": 8799 + }, + { + "epoch": 0.40971203761901437, + "grad_norm": 0.4009536018295579, + "learning_rate": 9.95932810171817e-05, + "loss": 3.0406, + "step": 8800 + }, + { + "epoch": 0.40975859580510743, + "grad_norm": 0.4848598181634362, + "learning_rate": 9.959293615095749e-05, + "loss": 3.3129, + "step": 8801 + }, + { + "epoch": 0.4098051539912005, + "grad_norm": 0.5498501441996012, + "learning_rate": 9.959259113918287e-05, + "loss": 3.2549, + "step": 8802 + }, + { + "epoch": 0.40985171217729355, + "grad_norm": 0.49140547161610937, + "learning_rate": 9.95922459818589e-05, + "loss": 3.1877, + "step": 8803 + }, + { + "epoch": 0.40989827036338666, + "grad_norm": 0.46299484810488806, + "learning_rate": 9.959190067898655e-05, + "loss": 3.3262, + "step": 8804 + }, + { + "epoch": 0.4099448285494797, + "grad_norm": 0.5026443804835242, + "learning_rate": 9.959155523056689e-05, + "loss": 3.2539, + "step": 8805 + }, + { + "epoch": 0.4099913867355728, + "grad_norm": 0.44775175330314365, + "learning_rate": 9.959120963660089e-05, + "loss": 3.3511, + "step": 8806 + }, + { + "epoch": 0.41003794492166584, + "grad_norm": 0.45227966555440613, + "learning_rate": 9.959086389708957e-05, + "loss": 3.3473, + "step": 8807 + }, + { + "epoch": 0.4100845031077589, + "grad_norm": 0.4933947573175847, + "learning_rate": 9.959051801203396e-05, + "loss": 3.3075, + "step": 8808 + }, + { + "epoch": 0.410131061293852, + "grad_norm": 0.5261072893534774, + "learning_rate": 9.959017198143505e-05, + 
"loss": 3.3649, + "step": 8809 + }, + { + "epoch": 0.4101776194799451, + "grad_norm": 0.40726535679895864, + "learning_rate": 9.958982580529388e-05, + "loss": 3.3235, + "step": 8810 + }, + { + "epoch": 0.41022417766603814, + "grad_norm": 0.4704704741237943, + "learning_rate": 9.958947948361145e-05, + "loss": 3.3303, + "step": 8811 + }, + { + "epoch": 0.4102707358521312, + "grad_norm": 0.5144274573790212, + "learning_rate": 9.95891330163888e-05, + "loss": 3.3269, + "step": 8812 + }, + { + "epoch": 0.41031729403822426, + "grad_norm": 0.4577615292307446, + "learning_rate": 9.958878640362689e-05, + "loss": 3.2853, + "step": 8813 + }, + { + "epoch": 0.4103638522243173, + "grad_norm": 0.45305712479926596, + "learning_rate": 9.958843964532683e-05, + "loss": 3.2621, + "step": 8814 + }, + { + "epoch": 0.41041041041041043, + "grad_norm": 0.5004962346558398, + "learning_rate": 9.958809274148955e-05, + "loss": 3.3487, + "step": 8815 + }, + { + "epoch": 0.4104569685965035, + "grad_norm": 0.46006947255733266, + "learning_rate": 9.958774569211611e-05, + "loss": 3.2694, + "step": 8816 + }, + { + "epoch": 0.41050352678259655, + "grad_norm": 0.47802355606056307, + "learning_rate": 9.958739849720753e-05, + "loss": 3.3373, + "step": 8817 + }, + { + "epoch": 0.4105500849686896, + "grad_norm": 0.5482647639725596, + "learning_rate": 9.958705115676481e-05, + "loss": 3.2533, + "step": 8818 + }, + { + "epoch": 0.41059664315478267, + "grad_norm": 0.45010739797183785, + "learning_rate": 9.958670367078899e-05, + "loss": 3.3434, + "step": 8819 + }, + { + "epoch": 0.4106432013408758, + "grad_norm": 0.5162033504043126, + "learning_rate": 9.958635603928107e-05, + "loss": 3.3764, + "step": 8820 + }, + { + "epoch": 0.41068975952696885, + "grad_norm": 0.45930061849765363, + "learning_rate": 9.958600826224209e-05, + "loss": 3.3079, + "step": 8821 + }, + { + "epoch": 0.4107363177130619, + "grad_norm": 0.4692153736290222, + "learning_rate": 9.958566033967305e-05, + "loss": 3.2345, + "step": 8822 + }, + { + "epoch": 0.41078287589915496, + "grad_norm": 0.47017957108968234, + "learning_rate": 9.958531227157499e-05, + "loss": 3.2758, + "step": 8823 + }, + { + "epoch": 0.410829434085248, + "grad_norm": 0.47353839564369415, + "learning_rate": 9.958496405794891e-05, + "loss": 3.2587, + "step": 8824 + }, + { + "epoch": 0.4108759922713411, + "grad_norm": 0.5021171345845155, + "learning_rate": 9.958461569879586e-05, + "loss": 3.3666, + "step": 8825 + }, + { + "epoch": 0.4109225504574342, + "grad_norm": 0.4545437585568683, + "learning_rate": 9.958426719411685e-05, + "loss": 3.3714, + "step": 8826 + }, + { + "epoch": 0.41096910864352726, + "grad_norm": 0.49286484913743006, + "learning_rate": 9.958391854391289e-05, + "loss": 3.2998, + "step": 8827 + }, + { + "epoch": 0.4110156668296203, + "grad_norm": 0.4363183696295247, + "learning_rate": 9.958356974818502e-05, + "loss": 3.3147, + "step": 8828 + }, + { + "epoch": 0.4110622250157134, + "grad_norm": 0.3992306957879863, + "learning_rate": 9.958322080693426e-05, + "loss": 3.3127, + "step": 8829 + }, + { + "epoch": 0.41110878320180644, + "grad_norm": 0.4426313797127788, + "learning_rate": 9.958287172016163e-05, + "loss": 3.2537, + "step": 8830 + }, + { + "epoch": 0.41115534138789955, + "grad_norm": 0.42745514337587703, + "learning_rate": 9.958252248786815e-05, + "loss": 3.2771, + "step": 8831 + }, + { + "epoch": 0.4112018995739926, + "grad_norm": 0.45556124655887476, + "learning_rate": 9.958217311005485e-05, + "loss": 3.4794, + "step": 8832 + }, + { + "epoch": 0.4112484577600857, + "grad_norm": 
0.4621019091573079, + "learning_rate": 9.958182358672277e-05, + "loss": 3.2554, + "step": 8833 + }, + { + "epoch": 0.41129501594617873, + "grad_norm": 0.400508647079935, + "learning_rate": 9.958147391787292e-05, + "loss": 3.2837, + "step": 8834 + }, + { + "epoch": 0.4113415741322718, + "grad_norm": 0.4429006851670194, + "learning_rate": 9.95811241035063e-05, + "loss": 3.3757, + "step": 8835 + }, + { + "epoch": 0.41138813231836485, + "grad_norm": 0.44776789706520975, + "learning_rate": 9.958077414362401e-05, + "loss": 3.3337, + "step": 8836 + }, + { + "epoch": 0.41143469050445797, + "grad_norm": 0.40224899985075213, + "learning_rate": 9.958042403822701e-05, + "loss": 3.326, + "step": 8837 + }, + { + "epoch": 0.411481248690551, + "grad_norm": 0.44695248562426293, + "learning_rate": 9.958007378731634e-05, + "loss": 3.3338, + "step": 8838 + }, + { + "epoch": 0.4115278068766441, + "grad_norm": 0.4798452101917216, + "learning_rate": 9.957972339089305e-05, + "loss": 3.3816, + "step": 8839 + }, + { + "epoch": 0.41157436506273715, + "grad_norm": 0.4141929983604472, + "learning_rate": 9.957937284895816e-05, + "loss": 3.2692, + "step": 8840 + }, + { + "epoch": 0.4116209232488302, + "grad_norm": 0.4347635681450812, + "learning_rate": 9.957902216151269e-05, + "loss": 3.4117, + "step": 8841 + }, + { + "epoch": 0.4116674814349233, + "grad_norm": 0.4256721901100479, + "learning_rate": 9.957867132855767e-05, + "loss": 3.3609, + "step": 8842 + }, + { + "epoch": 0.4117140396210164, + "grad_norm": 0.3762413838593566, + "learning_rate": 9.957832035009414e-05, + "loss": 3.3047, + "step": 8843 + }, + { + "epoch": 0.41176059780710944, + "grad_norm": 0.4393028799627131, + "learning_rate": 9.957796922612312e-05, + "loss": 3.2964, + "step": 8844 + }, + { + "epoch": 0.4118071559932025, + "grad_norm": 0.39532420886882147, + "learning_rate": 9.957761795664564e-05, + "loss": 3.3396, + "step": 8845 + }, + { + "epoch": 0.41185371417929556, + "grad_norm": 0.4029979272074728, + "learning_rate": 9.957726654166275e-05, + "loss": 3.3195, + "step": 8846 + }, + { + "epoch": 0.4119002723653886, + "grad_norm": 0.4048741115723648, + "learning_rate": 9.957691498117546e-05, + "loss": 3.3136, + "step": 8847 + }, + { + "epoch": 0.41194683055148174, + "grad_norm": 0.418460033613133, + "learning_rate": 9.95765632751848e-05, + "loss": 3.4179, + "step": 8848 + }, + { + "epoch": 0.4119933887375748, + "grad_norm": 0.368128215480879, + "learning_rate": 9.957621142369182e-05, + "loss": 3.3412, + "step": 8849 + }, + { + "epoch": 0.41203994692366785, + "grad_norm": 0.41102730022865197, + "learning_rate": 9.957585942669755e-05, + "loss": 3.2476, + "step": 8850 + }, + { + "epoch": 0.4120865051097609, + "grad_norm": 0.36920800268589965, + "learning_rate": 9.9575507284203e-05, + "loss": 3.3591, + "step": 8851 + }, + { + "epoch": 0.412133063295854, + "grad_norm": 0.3656995636810849, + "learning_rate": 9.957515499620923e-05, + "loss": 3.3567, + "step": 8852 + }, + { + "epoch": 0.4121796214819471, + "grad_norm": 0.383978774817604, + "learning_rate": 9.957480256271727e-05, + "loss": 3.3239, + "step": 8853 + }, + { + "epoch": 0.41222617966804015, + "grad_norm": 0.36738962654384283, + "learning_rate": 9.957444998372814e-05, + "loss": 3.3003, + "step": 8854 + }, + { + "epoch": 0.4122727378541332, + "grad_norm": 0.36744172203417086, + "learning_rate": 9.957409725924289e-05, + "loss": 3.3563, + "step": 8855 + }, + { + "epoch": 0.41231929604022627, + "grad_norm": 0.37862512703718076, + "learning_rate": 9.957374438926254e-05, + "loss": 3.3075, + "step": 8856 + 
}, + { + "epoch": 0.41236585422631933, + "grad_norm": 0.3801097773744257, + "learning_rate": 9.957339137378814e-05, + "loss": 3.3694, + "step": 8857 + }, + { + "epoch": 0.4124124124124124, + "grad_norm": 0.3748506916753026, + "learning_rate": 9.957303821282072e-05, + "loss": 3.3936, + "step": 8858 + }, + { + "epoch": 0.4124589705985055, + "grad_norm": 0.3971184703074463, + "learning_rate": 9.957268490636131e-05, + "loss": 3.3166, + "step": 8859 + }, + { + "epoch": 0.41250552878459856, + "grad_norm": 0.4187744914304128, + "learning_rate": 9.957233145441094e-05, + "loss": 3.4294, + "step": 8860 + }, + { + "epoch": 0.4125520869706916, + "grad_norm": 0.42457444613740425, + "learning_rate": 9.957197785697069e-05, + "loss": 3.2158, + "step": 8861 + }, + { + "epoch": 0.4125986451567847, + "grad_norm": 0.3650695975505022, + "learning_rate": 9.957162411404155e-05, + "loss": 3.3239, + "step": 8862 + }, + { + "epoch": 0.41264520334287774, + "grad_norm": 0.40899136551589554, + "learning_rate": 9.957127022562459e-05, + "loss": 3.4196, + "step": 8863 + }, + { + "epoch": 0.41269176152897086, + "grad_norm": 0.3962094821555665, + "learning_rate": 9.957091619172084e-05, + "loss": 3.3166, + "step": 8864 + }, + { + "epoch": 0.4127383197150639, + "grad_norm": 0.3623424775973357, + "learning_rate": 9.957056201233131e-05, + "loss": 3.3468, + "step": 8865 + }, + { + "epoch": 0.412784877901157, + "grad_norm": 0.4287209768022595, + "learning_rate": 9.957020768745709e-05, + "loss": 3.3422, + "step": 8866 + }, + { + "epoch": 0.41283143608725004, + "grad_norm": 0.38780929776096446, + "learning_rate": 9.956985321709917e-05, + "loss": 3.2358, + "step": 8867 + }, + { + "epoch": 0.4128779942733431, + "grad_norm": 0.43830325440698076, + "learning_rate": 9.956949860125862e-05, + "loss": 3.2196, + "step": 8868 + }, + { + "epoch": 0.41292455245943616, + "grad_norm": 0.3609153149008036, + "learning_rate": 9.95691438399365e-05, + "loss": 3.3392, + "step": 8869 + }, + { + "epoch": 0.41297111064552927, + "grad_norm": 0.4046005605335421, + "learning_rate": 9.95687889331338e-05, + "loss": 3.188, + "step": 8870 + }, + { + "epoch": 0.41301766883162233, + "grad_norm": 0.408678100729437, + "learning_rate": 9.95684338808516e-05, + "loss": 3.34, + "step": 8871 + }, + { + "epoch": 0.4130642270177154, + "grad_norm": 0.3995186224407149, + "learning_rate": 9.956807868309092e-05, + "loss": 3.3394, + "step": 8872 + }, + { + "epoch": 0.41311078520380845, + "grad_norm": 0.4265672036651288, + "learning_rate": 9.956772333985281e-05, + "loss": 3.2728, + "step": 8873 + }, + { + "epoch": 0.4131573433899015, + "grad_norm": 0.4240334710462787, + "learning_rate": 9.956736785113832e-05, + "loss": 3.3359, + "step": 8874 + }, + { + "epoch": 0.4132039015759946, + "grad_norm": 0.46967824577613276, + "learning_rate": 9.95670122169485e-05, + "loss": 3.3894, + "step": 8875 + }, + { + "epoch": 0.4132504597620877, + "grad_norm": 0.4405023800154161, + "learning_rate": 9.956665643728436e-05, + "loss": 3.3853, + "step": 8876 + }, + { + "epoch": 0.41329701794818074, + "grad_norm": 0.4206179110284288, + "learning_rate": 9.956630051214699e-05, + "loss": 3.3382, + "step": 8877 + }, + { + "epoch": 0.4133435761342738, + "grad_norm": 0.45732183270255194, + "learning_rate": 9.95659444415374e-05, + "loss": 3.3409, + "step": 8878 + }, + { + "epoch": 0.41339013432036686, + "grad_norm": 0.45705795772602315, + "learning_rate": 9.956558822545665e-05, + "loss": 3.3092, + "step": 8879 + }, + { + "epoch": 0.4134366925064599, + "grad_norm": 0.39685779641062274, + "learning_rate": 
9.956523186390577e-05, + "loss": 3.1998, + "step": 8880 + }, + { + "epoch": 0.41348325069255304, + "grad_norm": 0.43997986728720595, + "learning_rate": 9.956487535688584e-05, + "loss": 3.3017, + "step": 8881 + }, + { + "epoch": 0.4135298088786461, + "grad_norm": 0.45259155425285225, + "learning_rate": 9.956451870439786e-05, + "loss": 3.2357, + "step": 8882 + }, + { + "epoch": 0.41357636706473916, + "grad_norm": 0.3698011505922157, + "learning_rate": 9.956416190644291e-05, + "loss": 3.3534, + "step": 8883 + }, + { + "epoch": 0.4136229252508322, + "grad_norm": 0.44591102036080693, + "learning_rate": 9.956380496302203e-05, + "loss": 3.2056, + "step": 8884 + }, + { + "epoch": 0.4136694834369253, + "grad_norm": 0.4218836418585144, + "learning_rate": 9.956344787413626e-05, + "loss": 3.2582, + "step": 8885 + }, + { + "epoch": 0.4137160416230184, + "grad_norm": 0.46503159673945393, + "learning_rate": 9.956309063978666e-05, + "loss": 3.2681, + "step": 8886 + }, + { + "epoch": 0.41376259980911145, + "grad_norm": 0.43722389031084663, + "learning_rate": 9.956273325997427e-05, + "loss": 3.3327, + "step": 8887 + }, + { + "epoch": 0.4138091579952045, + "grad_norm": 0.4464293919027531, + "learning_rate": 9.956237573470013e-05, + "loss": 3.333, + "step": 8888 + }, + { + "epoch": 0.4138557161812976, + "grad_norm": 0.4487714117968968, + "learning_rate": 9.956201806396531e-05, + "loss": 3.3424, + "step": 8889 + }, + { + "epoch": 0.41390227436739063, + "grad_norm": 0.42599317184119284, + "learning_rate": 9.956166024777085e-05, + "loss": 3.2275, + "step": 8890 + }, + { + "epoch": 0.4139488325534837, + "grad_norm": 0.44350065782175535, + "learning_rate": 9.95613022861178e-05, + "loss": 3.2643, + "step": 8891 + }, + { + "epoch": 0.4139953907395768, + "grad_norm": 0.48058850664249503, + "learning_rate": 9.956094417900721e-05, + "loss": 3.3052, + "step": 8892 + }, + { + "epoch": 0.41404194892566987, + "grad_norm": 0.38726717415471384, + "learning_rate": 9.956058592644013e-05, + "loss": 3.323, + "step": 8893 + }, + { + "epoch": 0.4140885071117629, + "grad_norm": 0.5111208322355227, + "learning_rate": 9.95602275284176e-05, + "loss": 3.2748, + "step": 8894 + }, + { + "epoch": 0.414135065297856, + "grad_norm": 0.45553337345895073, + "learning_rate": 9.955986898494071e-05, + "loss": 3.3501, + "step": 8895 + }, + { + "epoch": 0.41418162348394905, + "grad_norm": 0.43155185536066937, + "learning_rate": 9.955951029601047e-05, + "loss": 3.3293, + "step": 8896 + }, + { + "epoch": 0.41422818167004216, + "grad_norm": 0.44923287817121244, + "learning_rate": 9.955915146162795e-05, + "loss": 3.289, + "step": 8897 + }, + { + "epoch": 0.4142747398561352, + "grad_norm": 0.5033636005887842, + "learning_rate": 9.95587924817942e-05, + "loss": 3.2757, + "step": 8898 + }, + { + "epoch": 0.4143212980422283, + "grad_norm": 0.5445159451554565, + "learning_rate": 9.955843335651029e-05, + "loss": 3.3141, + "step": 8899 + }, + { + "epoch": 0.41436785622832134, + "grad_norm": 0.42039642102119656, + "learning_rate": 9.955807408577727e-05, + "loss": 3.313, + "step": 8900 + }, + { + "epoch": 0.4144144144144144, + "grad_norm": 0.5064670938271391, + "learning_rate": 9.955771466959616e-05, + "loss": 3.2655, + "step": 8901 + }, + { + "epoch": 0.41446097260050746, + "grad_norm": 0.41311444000672864, + "learning_rate": 9.955735510796804e-05, + "loss": 3.3686, + "step": 8902 + }, + { + "epoch": 0.4145075307866006, + "grad_norm": 0.450233028681168, + "learning_rate": 9.955699540089399e-05, + "loss": 3.3511, + "step": 8903 + }, + { + "epoch": 
0.41455408897269364, + "grad_norm": 0.42083918551791705, + "learning_rate": 9.955663554837502e-05, + "loss": 3.3688, + "step": 8904 + }, + { + "epoch": 0.4146006471587867, + "grad_norm": 0.4615848855361098, + "learning_rate": 9.95562755504122e-05, + "loss": 3.3357, + "step": 8905 + }, + { + "epoch": 0.41464720534487975, + "grad_norm": 0.41758518385191057, + "learning_rate": 9.955591540700662e-05, + "loss": 3.4133, + "step": 8906 + }, + { + "epoch": 0.4146937635309728, + "grad_norm": 0.41744760430527894, + "learning_rate": 9.955555511815928e-05, + "loss": 3.2962, + "step": 8907 + }, + { + "epoch": 0.41474032171706593, + "grad_norm": 0.42871509793549206, + "learning_rate": 9.955519468387128e-05, + "loss": 3.3268, + "step": 8908 + }, + { + "epoch": 0.414786879903159, + "grad_norm": 0.39198790890280505, + "learning_rate": 9.955483410414368e-05, + "loss": 3.3139, + "step": 8909 + }, + { + "epoch": 0.41483343808925205, + "grad_norm": 0.3828485499732821, + "learning_rate": 9.95544733789775e-05, + "loss": 3.2068, + "step": 8910 + }, + { + "epoch": 0.4148799962753451, + "grad_norm": 0.4123894023830836, + "learning_rate": 9.955411250837384e-05, + "loss": 3.3068, + "step": 8911 + }, + { + "epoch": 0.41492655446143817, + "grad_norm": 0.4338157799179043, + "learning_rate": 9.955375149233373e-05, + "loss": 3.2836, + "step": 8912 + }, + { + "epoch": 0.41497311264753123, + "grad_norm": 0.41200103110219627, + "learning_rate": 9.955339033085825e-05, + "loss": 3.2453, + "step": 8913 + }, + { + "epoch": 0.41501967083362434, + "grad_norm": 0.4737792621857798, + "learning_rate": 9.955302902394845e-05, + "loss": 3.2781, + "step": 8914 + }, + { + "epoch": 0.4150662290197174, + "grad_norm": 0.4722370085576929, + "learning_rate": 9.95526675716054e-05, + "loss": 3.2665, + "step": 8915 + }, + { + "epoch": 0.41511278720581046, + "grad_norm": 0.5315769828496498, + "learning_rate": 9.955230597383015e-05, + "loss": 3.3864, + "step": 8916 + }, + { + "epoch": 0.4151593453919035, + "grad_norm": 0.4872524662900798, + "learning_rate": 9.955194423062374e-05, + "loss": 3.3306, + "step": 8917 + }, + { + "epoch": 0.4152059035779966, + "grad_norm": 0.4793453953776911, + "learning_rate": 9.955158234198728e-05, + "loss": 3.3411, + "step": 8918 + }, + { + "epoch": 0.4152524617640897, + "grad_norm": 0.41853419815802073, + "learning_rate": 9.955122030792179e-05, + "loss": 3.2772, + "step": 8919 + }, + { + "epoch": 0.41529901995018276, + "grad_norm": 0.4803947786642301, + "learning_rate": 9.955085812842836e-05, + "loss": 3.3796, + "step": 8920 + }, + { + "epoch": 0.4153455781362758, + "grad_norm": 0.4339190816378217, + "learning_rate": 9.955049580350804e-05, + "loss": 3.2847, + "step": 8921 + }, + { + "epoch": 0.4153921363223689, + "grad_norm": 0.44538390640751857, + "learning_rate": 9.95501333331619e-05, + "loss": 3.391, + "step": 8922 + }, + { + "epoch": 0.41543869450846194, + "grad_norm": 0.4173776411779718, + "learning_rate": 9.9549770717391e-05, + "loss": 3.2701, + "step": 8923 + }, + { + "epoch": 0.415485252694555, + "grad_norm": 0.38009519092968463, + "learning_rate": 9.95494079561964e-05, + "loss": 3.3524, + "step": 8924 + }, + { + "epoch": 0.4155318108806481, + "grad_norm": 0.382729703735439, + "learning_rate": 9.954904504957918e-05, + "loss": 3.2878, + "step": 8925 + }, + { + "epoch": 0.41557836906674117, + "grad_norm": 0.4065188520692583, + "learning_rate": 9.954868199754037e-05, + "loss": 3.2117, + "step": 8926 + }, + { + "epoch": 0.41562492725283423, + "grad_norm": 0.4031262302605594, + "learning_rate": 
9.954831880008109e-05, + "loss": 3.4837, + "step": 8927 + }, + { + "epoch": 0.4156714854389273, + "grad_norm": 0.4001621226226124, + "learning_rate": 9.954795545720236e-05, + "loss": 3.2279, + "step": 8928 + }, + { + "epoch": 0.41571804362502035, + "grad_norm": 0.3860760160086522, + "learning_rate": 9.954759196890526e-05, + "loss": 3.2727, + "step": 8929 + }, + { + "epoch": 0.41576460181111347, + "grad_norm": 0.3830941577625341, + "learning_rate": 9.954722833519086e-05, + "loss": 3.3586, + "step": 8930 + }, + { + "epoch": 0.4158111599972065, + "grad_norm": 0.38626082305020487, + "learning_rate": 9.954686455606023e-05, + "loss": 3.2296, + "step": 8931 + }, + { + "epoch": 0.4158577181832996, + "grad_norm": 0.41866139343308745, + "learning_rate": 9.954650063151443e-05, + "loss": 3.4177, + "step": 8932 + }, + { + "epoch": 0.41590427636939264, + "grad_norm": 0.3959132546009377, + "learning_rate": 9.954613656155454e-05, + "loss": 3.2964, + "step": 8933 + }, + { + "epoch": 0.4159508345554857, + "grad_norm": 0.43196058872233395, + "learning_rate": 9.954577234618161e-05, + "loss": 3.2852, + "step": 8934 + }, + { + "epoch": 0.41599739274157876, + "grad_norm": 0.44310949516804654, + "learning_rate": 9.954540798539672e-05, + "loss": 3.3661, + "step": 8935 + }, + { + "epoch": 0.4160439509276719, + "grad_norm": 0.4222030311104344, + "learning_rate": 9.954504347920095e-05, + "loss": 3.3237, + "step": 8936 + }, + { + "epoch": 0.41609050911376494, + "grad_norm": 0.42573611899304464, + "learning_rate": 9.954467882759535e-05, + "loss": 3.3483, + "step": 8937 + }, + { + "epoch": 0.416137067299858, + "grad_norm": 0.43151921878430854, + "learning_rate": 9.954431403058099e-05, + "loss": 3.3148, + "step": 8938 + }, + { + "epoch": 0.41618362548595106, + "grad_norm": 0.47293446033605524, + "learning_rate": 9.954394908815897e-05, + "loss": 3.3231, + "step": 8939 + }, + { + "epoch": 0.4162301836720441, + "grad_norm": 0.4366469286445768, + "learning_rate": 9.954358400033032e-05, + "loss": 3.243, + "step": 8940 + }, + { + "epoch": 0.41627674185813723, + "grad_norm": 0.4220878851857453, + "learning_rate": 9.954321876709613e-05, + "loss": 3.2738, + "step": 8941 + }, + { + "epoch": 0.4163233000442303, + "grad_norm": 0.3992593120873721, + "learning_rate": 9.95428533884575e-05, + "loss": 3.2889, + "step": 8942 + }, + { + "epoch": 0.41636985823032335, + "grad_norm": 0.44034667859508936, + "learning_rate": 9.954248786441545e-05, + "loss": 3.2518, + "step": 8943 + }, + { + "epoch": 0.4164164164164164, + "grad_norm": 0.4495703017496968, + "learning_rate": 9.95421221949711e-05, + "loss": 3.2863, + "step": 8944 + }, + { + "epoch": 0.4164629746025095, + "grad_norm": 0.4122618994321814, + "learning_rate": 9.954175638012549e-05, + "loss": 3.236, + "step": 8945 + }, + { + "epoch": 0.41650953278860253, + "grad_norm": 0.3847975438329812, + "learning_rate": 9.954139041987971e-05, + "loss": 3.3421, + "step": 8946 + }, + { + "epoch": 0.41655609097469565, + "grad_norm": 0.50566589264982, + "learning_rate": 9.954102431423482e-05, + "loss": 3.2877, + "step": 8947 + }, + { + "epoch": 0.4166026491607887, + "grad_norm": 0.48255296641017614, + "learning_rate": 9.954065806319191e-05, + "loss": 3.2462, + "step": 8948 + }, + { + "epoch": 0.41664920734688177, + "grad_norm": 0.42099862113072684, + "learning_rate": 9.954029166675206e-05, + "loss": 3.2774, + "step": 8949 + }, + { + "epoch": 0.4166957655329748, + "grad_norm": 0.4174610775357525, + "learning_rate": 9.953992512491633e-05, + "loss": 3.3384, + "step": 8950 + }, + { + "epoch": 
0.4167423237190679, + "grad_norm": 0.49909671293723623, + "learning_rate": 9.953955843768579e-05, + "loss": 3.3263, + "step": 8951 + }, + { + "epoch": 0.416788881905161, + "grad_norm": 0.496597359053561, + "learning_rate": 9.953919160506153e-05, + "loss": 3.3379, + "step": 8952 + }, + { + "epoch": 0.41683544009125406, + "grad_norm": 0.4677787008325723, + "learning_rate": 9.953882462704461e-05, + "loss": 3.3409, + "step": 8953 + }, + { + "epoch": 0.4168819982773471, + "grad_norm": 0.407860986683629, + "learning_rate": 9.953845750363614e-05, + "loss": 3.2986, + "step": 8954 + }, + { + "epoch": 0.4169285564634402, + "grad_norm": 0.3949562862493241, + "learning_rate": 9.953809023483718e-05, + "loss": 3.3599, + "step": 8955 + }, + { + "epoch": 0.41697511464953324, + "grad_norm": 0.4490179802903791, + "learning_rate": 9.953772282064879e-05, + "loss": 3.2456, + "step": 8956 + }, + { + "epoch": 0.4170216728356263, + "grad_norm": 0.44697912813903373, + "learning_rate": 9.953735526107206e-05, + "loss": 3.2987, + "step": 8957 + }, + { + "epoch": 0.4170682310217194, + "grad_norm": 0.43080594846051046, + "learning_rate": 9.95369875561081e-05, + "loss": 3.2531, + "step": 8958 + }, + { + "epoch": 0.4171147892078125, + "grad_norm": 0.41149554042381525, + "learning_rate": 9.953661970575795e-05, + "loss": 3.1775, + "step": 8959 + }, + { + "epoch": 0.41716134739390553, + "grad_norm": 0.4103366708946434, + "learning_rate": 9.953625171002269e-05, + "loss": 3.3476, + "step": 8960 + }, + { + "epoch": 0.4172079055799986, + "grad_norm": 0.48655905367572727, + "learning_rate": 9.953588356890341e-05, + "loss": 3.2985, + "step": 8961 + }, + { + "epoch": 0.41725446376609165, + "grad_norm": 0.5225691996371136, + "learning_rate": 9.953551528240121e-05, + "loss": 3.2762, + "step": 8962 + }, + { + "epoch": 0.41730102195218477, + "grad_norm": 0.4648605055824835, + "learning_rate": 9.953514685051714e-05, + "loss": 3.244, + "step": 8963 + }, + { + "epoch": 0.41734758013827783, + "grad_norm": 0.4291057898363382, + "learning_rate": 9.95347782732523e-05, + "loss": 3.3929, + "step": 8964 + }, + { + "epoch": 0.4173941383243709, + "grad_norm": 0.557794924841455, + "learning_rate": 9.953440955060776e-05, + "loss": 3.482, + "step": 8965 + }, + { + "epoch": 0.41744069651046395, + "grad_norm": 0.5166422001621496, + "learning_rate": 9.95340406825846e-05, + "loss": 3.2459, + "step": 8966 + }, + { + "epoch": 0.417487254696557, + "grad_norm": 0.394591000924742, + "learning_rate": 9.953367166918392e-05, + "loss": 3.3081, + "step": 8967 + }, + { + "epoch": 0.41753381288265007, + "grad_norm": 0.4746259964370741, + "learning_rate": 9.95333025104068e-05, + "loss": 3.3543, + "step": 8968 + }, + { + "epoch": 0.4175803710687432, + "grad_norm": 0.4784511239224872, + "learning_rate": 9.95329332062543e-05, + "loss": 3.3323, + "step": 8969 + }, + { + "epoch": 0.41762692925483624, + "grad_norm": 0.4128302152969331, + "learning_rate": 9.953256375672755e-05, + "loss": 3.2248, + "step": 8970 + }, + { + "epoch": 0.4176734874409293, + "grad_norm": 0.4157877184188869, + "learning_rate": 9.953219416182757e-05, + "loss": 3.3692, + "step": 8971 + }, + { + "epoch": 0.41772004562702236, + "grad_norm": 0.4244790311224861, + "learning_rate": 9.953182442155551e-05, + "loss": 3.2061, + "step": 8972 + }, + { + "epoch": 0.4177666038131154, + "grad_norm": 0.4036016899477491, + "learning_rate": 9.953145453591241e-05, + "loss": 3.2524, + "step": 8973 + }, + { + "epoch": 0.41781316199920854, + "grad_norm": 0.3631017906489841, + "learning_rate": 9.953108450489937e-05, + 
"loss": 3.344, + "step": 8974 + }, + { + "epoch": 0.4178597201853016, + "grad_norm": 0.4366126733587478, + "learning_rate": 9.953071432851748e-05, + "loss": 3.3428, + "step": 8975 + }, + { + "epoch": 0.41790627837139466, + "grad_norm": 0.3672913072276227, + "learning_rate": 9.953034400676782e-05, + "loss": 3.2815, + "step": 8976 + }, + { + "epoch": 0.4179528365574877, + "grad_norm": 0.3955715975895078, + "learning_rate": 9.952997353965148e-05, + "loss": 3.1943, + "step": 8977 + }, + { + "epoch": 0.4179993947435808, + "grad_norm": 0.39318440123357323, + "learning_rate": 9.952960292716955e-05, + "loss": 3.3607, + "step": 8978 + }, + { + "epoch": 0.41804595292967384, + "grad_norm": 0.3954963413487057, + "learning_rate": 9.952923216932313e-05, + "loss": 3.2386, + "step": 8979 + }, + { + "epoch": 0.41809251111576695, + "grad_norm": 0.42170799136005704, + "learning_rate": 9.952886126611328e-05, + "loss": 3.3472, + "step": 8980 + }, + { + "epoch": 0.41813906930186, + "grad_norm": 0.42452281203436765, + "learning_rate": 9.95284902175411e-05, + "loss": 3.3337, + "step": 8981 + }, + { + "epoch": 0.41818562748795307, + "grad_norm": 0.42300687505223056, + "learning_rate": 9.952811902360767e-05, + "loss": 3.3234, + "step": 8982 + }, + { + "epoch": 0.41823218567404613, + "grad_norm": 0.45338017030226446, + "learning_rate": 9.95277476843141e-05, + "loss": 3.3221, + "step": 8983 + }, + { + "epoch": 0.4182787438601392, + "grad_norm": 0.41655549609426107, + "learning_rate": 9.952737619966145e-05, + "loss": 3.2789, + "step": 8984 + }, + { + "epoch": 0.4183253020462323, + "grad_norm": 0.38088647722986957, + "learning_rate": 9.952700456965088e-05, + "loss": 3.3093, + "step": 8985 + }, + { + "epoch": 0.41837186023232537, + "grad_norm": 0.4220894340782174, + "learning_rate": 9.952663279428339e-05, + "loss": 3.3025, + "step": 8986 + }, + { + "epoch": 0.4184184184184184, + "grad_norm": 0.3795360510836694, + "learning_rate": 9.952626087356012e-05, + "loss": 3.3001, + "step": 8987 + }, + { + "epoch": 0.4184649766045115, + "grad_norm": 0.44194405810962595, + "learning_rate": 9.952588880748214e-05, + "loss": 3.1898, + "step": 8988 + }, + { + "epoch": 0.41851153479060454, + "grad_norm": 0.43491533310037617, + "learning_rate": 9.952551659605058e-05, + "loss": 3.4179, + "step": 8989 + }, + { + "epoch": 0.4185580929766976, + "grad_norm": 0.4398642977408217, + "learning_rate": 9.95251442392665e-05, + "loss": 3.3617, + "step": 8990 + }, + { + "epoch": 0.4186046511627907, + "grad_norm": 0.40399573460862304, + "learning_rate": 9.9524771737131e-05, + "loss": 3.2436, + "step": 8991 + }, + { + "epoch": 0.4186512093488838, + "grad_norm": 0.4771143714385258, + "learning_rate": 9.952439908964517e-05, + "loss": 3.1886, + "step": 8992 + }, + { + "epoch": 0.41869776753497684, + "grad_norm": 0.43733026812450376, + "learning_rate": 9.952402629681013e-05, + "loss": 3.2609, + "step": 8993 + }, + { + "epoch": 0.4187443257210699, + "grad_norm": 0.4445556045000359, + "learning_rate": 9.952365335862693e-05, + "loss": 3.2628, + "step": 8994 + }, + { + "epoch": 0.41879088390716296, + "grad_norm": 0.452696927169121, + "learning_rate": 9.952328027509667e-05, + "loss": 3.3259, + "step": 8995 + }, + { + "epoch": 0.4188374420932561, + "grad_norm": 0.35844756069029754, + "learning_rate": 9.95229070462205e-05, + "loss": 3.342, + "step": 8996 + }, + { + "epoch": 0.41888400027934913, + "grad_norm": 0.49953460043243775, + "learning_rate": 9.952253367199946e-05, + "loss": 3.3352, + "step": 8997 + }, + { + "epoch": 0.4189305584654422, + "grad_norm": 
0.4096687880341782, + "learning_rate": 9.952216015243465e-05, + "loss": 3.3024, + "step": 8998 + }, + { + "epoch": 0.41897711665153525, + "grad_norm": 0.41164577226246934, + "learning_rate": 9.952178648752719e-05, + "loss": 3.3907, + "step": 8999 + }, + { + "epoch": 0.4190236748376283, + "grad_norm": 0.42869569756766457, + "learning_rate": 9.952141267727816e-05, + "loss": 3.3446, + "step": 9000 + }, + { + "epoch": 0.4190702330237214, + "grad_norm": 0.4248578015299064, + "learning_rate": 9.952103872168868e-05, + "loss": 3.3652, + "step": 9001 + }, + { + "epoch": 0.4191167912098145, + "grad_norm": 0.40350229885649186, + "learning_rate": 9.952066462075979e-05, + "loss": 3.3417, + "step": 9002 + }, + { + "epoch": 0.41916334939590755, + "grad_norm": 0.4347814086648697, + "learning_rate": 9.952029037449266e-05, + "loss": 3.2394, + "step": 9003 + }, + { + "epoch": 0.4192099075820006, + "grad_norm": 0.42807887949129414, + "learning_rate": 9.951991598288836e-05, + "loss": 3.3099, + "step": 9004 + }, + { + "epoch": 0.41925646576809367, + "grad_norm": 0.4600113196020832, + "learning_rate": 9.951954144594797e-05, + "loss": 3.3207, + "step": 9005 + }, + { + "epoch": 0.4193030239541867, + "grad_norm": 0.41943262853646907, + "learning_rate": 9.951916676367261e-05, + "loss": 3.3068, + "step": 9006 + }, + { + "epoch": 0.41934958214027984, + "grad_norm": 0.4028515230882631, + "learning_rate": 9.951879193606338e-05, + "loss": 3.2834, + "step": 9007 + }, + { + "epoch": 0.4193961403263729, + "grad_norm": 0.4214274414826127, + "learning_rate": 9.951841696312137e-05, + "loss": 3.3014, + "step": 9008 + }, + { + "epoch": 0.41944269851246596, + "grad_norm": 0.42617738264222566, + "learning_rate": 9.951804184484767e-05, + "loss": 3.2394, + "step": 9009 + }, + { + "epoch": 0.419489256698559, + "grad_norm": 0.3620190457341021, + "learning_rate": 9.951766658124341e-05, + "loss": 3.3963, + "step": 9010 + }, + { + "epoch": 0.4195358148846521, + "grad_norm": 0.38457515753676796, + "learning_rate": 9.951729117230968e-05, + "loss": 3.1973, + "step": 9011 + }, + { + "epoch": 0.41958237307074514, + "grad_norm": 0.4241463650035485, + "learning_rate": 9.951691561804757e-05, + "loss": 3.2674, + "step": 9012 + }, + { + "epoch": 0.41962893125683826, + "grad_norm": 0.38745407623113076, + "learning_rate": 9.95165399184582e-05, + "loss": 3.1609, + "step": 9013 + }, + { + "epoch": 0.4196754894429313, + "grad_norm": 0.38472187158724, + "learning_rate": 9.951616407354265e-05, + "loss": 3.3862, + "step": 9014 + }, + { + "epoch": 0.4197220476290244, + "grad_norm": 0.3770974053067524, + "learning_rate": 9.951578808330205e-05, + "loss": 3.2055, + "step": 9015 + }, + { + "epoch": 0.41976860581511743, + "grad_norm": 0.45069976364209974, + "learning_rate": 9.951541194773748e-05, + "loss": 3.3627, + "step": 9016 + }, + { + "epoch": 0.4198151640012105, + "grad_norm": 0.4106862636638431, + "learning_rate": 9.951503566685006e-05, + "loss": 3.3309, + "step": 9017 + }, + { + "epoch": 0.4198617221873036, + "grad_norm": 0.44297313097826563, + "learning_rate": 9.95146592406409e-05, + "loss": 3.3992, + "step": 9018 + }, + { + "epoch": 0.41990828037339667, + "grad_norm": 0.46025473980408843, + "learning_rate": 9.951428266911107e-05, + "loss": 3.3081, + "step": 9019 + }, + { + "epoch": 0.41995483855948973, + "grad_norm": 0.44856759428141885, + "learning_rate": 9.95139059522617e-05, + "loss": 3.4262, + "step": 9020 + }, + { + "epoch": 0.4200013967455828, + "grad_norm": 0.4631915130653155, + "learning_rate": 9.951352909009392e-05, + "loss": 3.2885, + 
"step": 9021 + }, + { + "epoch": 0.42004795493167585, + "grad_norm": 0.4926095437669082, + "learning_rate": 9.951315208260879e-05, + "loss": 3.436, + "step": 9022 + }, + { + "epoch": 0.4200945131177689, + "grad_norm": 0.47310783771994075, + "learning_rate": 9.951277492980744e-05, + "loss": 3.3331, + "step": 9023 + }, + { + "epoch": 0.420141071303862, + "grad_norm": 0.43865699092506844, + "learning_rate": 9.951239763169097e-05, + "loss": 3.2133, + "step": 9024 + }, + { + "epoch": 0.4201876294899551, + "grad_norm": 0.39201928634048444, + "learning_rate": 9.951202018826049e-05, + "loss": 3.263, + "step": 9025 + }, + { + "epoch": 0.42023418767604814, + "grad_norm": 0.4315589599912146, + "learning_rate": 9.951164259951711e-05, + "loss": 3.3721, + "step": 9026 + }, + { + "epoch": 0.4202807458621412, + "grad_norm": 0.4400457998663592, + "learning_rate": 9.951126486546194e-05, + "loss": 3.4077, + "step": 9027 + }, + { + "epoch": 0.42032730404823426, + "grad_norm": 0.4846614334992523, + "learning_rate": 9.951088698609609e-05, + "loss": 3.265, + "step": 9028 + }, + { + "epoch": 0.4203738622343274, + "grad_norm": 0.3607108840096909, + "learning_rate": 9.951050896142064e-05, + "loss": 3.2237, + "step": 9029 + }, + { + "epoch": 0.42042042042042044, + "grad_norm": 0.3837721397787809, + "learning_rate": 9.951013079143676e-05, + "loss": 3.343, + "step": 9030 + }, + { + "epoch": 0.4204669786065135, + "grad_norm": 0.4217806771149098, + "learning_rate": 9.95097524761455e-05, + "loss": 3.3101, + "step": 9031 + }, + { + "epoch": 0.42051353679260656, + "grad_norm": 0.4089661087447016, + "learning_rate": 9.950937401554801e-05, + "loss": 3.3635, + "step": 9032 + }, + { + "epoch": 0.4205600949786996, + "grad_norm": 0.41393748177292433, + "learning_rate": 9.950899540964537e-05, + "loss": 3.2753, + "step": 9033 + }, + { + "epoch": 0.4206066531647927, + "grad_norm": 0.4434210157843748, + "learning_rate": 9.95086166584387e-05, + "loss": 3.3167, + "step": 9034 + }, + { + "epoch": 0.4206532113508858, + "grad_norm": 0.38854584190892555, + "learning_rate": 9.950823776192913e-05, + "loss": 3.2528, + "step": 9035 + }, + { + "epoch": 0.42069976953697885, + "grad_norm": 0.3819132908575218, + "learning_rate": 9.950785872011777e-05, + "loss": 3.3434, + "step": 9036 + }, + { + "epoch": 0.4207463277230719, + "grad_norm": 0.43937384120424394, + "learning_rate": 9.950747953300569e-05, + "loss": 3.253, + "step": 9037 + }, + { + "epoch": 0.42079288590916497, + "grad_norm": 0.4300544964833025, + "learning_rate": 9.950710020059406e-05, + "loss": 3.2401, + "step": 9038 + }, + { + "epoch": 0.42083944409525803, + "grad_norm": 0.4411273428511787, + "learning_rate": 9.950672072288397e-05, + "loss": 3.394, + "step": 9039 + }, + { + "epoch": 0.42088600228135115, + "grad_norm": 0.44492447357919856, + "learning_rate": 9.950634109987653e-05, + "loss": 3.3018, + "step": 9040 + }, + { + "epoch": 0.4209325604674442, + "grad_norm": 0.3805067098489647, + "learning_rate": 9.950596133157284e-05, + "loss": 3.296, + "step": 9041 + }, + { + "epoch": 0.42097911865353727, + "grad_norm": 0.46211512678132605, + "learning_rate": 9.950558141797403e-05, + "loss": 3.2573, + "step": 9042 + }, + { + "epoch": 0.4210256768396303, + "grad_norm": 0.3674149542575045, + "learning_rate": 9.950520135908124e-05, + "loss": 3.2202, + "step": 9043 + }, + { + "epoch": 0.4210722350257234, + "grad_norm": 0.3860230570507241, + "learning_rate": 9.950482115489555e-05, + "loss": 3.3578, + "step": 9044 + }, + { + "epoch": 0.42111879321181644, + "grad_norm": 0.3992496094870673, + 
"learning_rate": 9.950444080541807e-05, + "loss": 3.2674, + "step": 9045 + }, + { + "epoch": 0.42116535139790956, + "grad_norm": 0.3915419448389112, + "learning_rate": 9.950406031064996e-05, + "loss": 3.2647, + "step": 9046 + }, + { + "epoch": 0.4212119095840026, + "grad_norm": 0.44581191158095906, + "learning_rate": 9.950367967059228e-05, + "loss": 3.301, + "step": 9047 + }, + { + "epoch": 0.4212584677700957, + "grad_norm": 0.4517598212522332, + "learning_rate": 9.950329888524621e-05, + "loss": 3.3833, + "step": 9048 + }, + { + "epoch": 0.42130502595618874, + "grad_norm": 0.4825261418948296, + "learning_rate": 9.950291795461282e-05, + "loss": 3.3982, + "step": 9049 + }, + { + "epoch": 0.4213515841422818, + "grad_norm": 0.4288474981537706, + "learning_rate": 9.950253687869325e-05, + "loss": 3.25, + "step": 9050 + }, + { + "epoch": 0.4213981423283749, + "grad_norm": 0.4834936040352552, + "learning_rate": 9.950215565748859e-05, + "loss": 3.2423, + "step": 9051 + }, + { + "epoch": 0.421444700514468, + "grad_norm": 0.36806624111930886, + "learning_rate": 9.9501774291e-05, + "loss": 3.2645, + "step": 9052 + }, + { + "epoch": 0.42149125870056103, + "grad_norm": 0.4473687349230447, + "learning_rate": 9.950139277922857e-05, + "loss": 3.3156, + "step": 9053 + }, + { + "epoch": 0.4215378168866541, + "grad_norm": 0.42421786325699135, + "learning_rate": 9.950101112217543e-05, + "loss": 3.3536, + "step": 9054 + }, + { + "epoch": 0.42158437507274715, + "grad_norm": 0.39556740651437067, + "learning_rate": 9.95006293198417e-05, + "loss": 3.3662, + "step": 9055 + }, + { + "epoch": 0.4216309332588402, + "grad_norm": 0.4106458809183252, + "learning_rate": 9.950024737222849e-05, + "loss": 3.3129, + "step": 9056 + }, + { + "epoch": 0.42167749144493333, + "grad_norm": 0.4058321273204839, + "learning_rate": 9.949986527933694e-05, + "loss": 3.2667, + "step": 9057 + }, + { + "epoch": 0.4217240496310264, + "grad_norm": 0.3861463195674383, + "learning_rate": 9.949948304116817e-05, + "loss": 3.193, + "step": 9058 + }, + { + "epoch": 0.42177060781711945, + "grad_norm": 0.4562027997815731, + "learning_rate": 9.949910065772329e-05, + "loss": 3.2075, + "step": 9059 + }, + { + "epoch": 0.4218171660032125, + "grad_norm": 0.3977392084318497, + "learning_rate": 9.949871812900342e-05, + "loss": 3.2478, + "step": 9060 + }, + { + "epoch": 0.42186372418930557, + "grad_norm": 0.40303493404638274, + "learning_rate": 9.949833545500969e-05, + "loss": 3.3107, + "step": 9061 + }, + { + "epoch": 0.4219102823753987, + "grad_norm": 0.41644118324722545, + "learning_rate": 9.949795263574323e-05, + "loss": 3.313, + "step": 9062 + }, + { + "epoch": 0.42195684056149174, + "grad_norm": 0.41725343485527044, + "learning_rate": 9.949756967120513e-05, + "loss": 3.5081, + "step": 9063 + }, + { + "epoch": 0.4220033987475848, + "grad_norm": 0.4804766659199397, + "learning_rate": 9.949718656139657e-05, + "loss": 3.3617, + "step": 9064 + }, + { + "epoch": 0.42204995693367786, + "grad_norm": 0.3944702224761561, + "learning_rate": 9.949680330631864e-05, + "loss": 3.2695, + "step": 9065 + }, + { + "epoch": 0.4220965151197709, + "grad_norm": 0.44088209216484653, + "learning_rate": 9.949641990597244e-05, + "loss": 3.3522, + "step": 9066 + }, + { + "epoch": 0.422143073305864, + "grad_norm": 0.3953633384814084, + "learning_rate": 9.949603636035915e-05, + "loss": 3.3553, + "step": 9067 + }, + { + "epoch": 0.4221896314919571, + "grad_norm": 0.4648133998403384, + "learning_rate": 9.949565266947987e-05, + "loss": 3.3081, + "step": 9068 + }, + { + "epoch": 
0.42223618967805016, + "grad_norm": 0.38440730944800766, + "learning_rate": 9.949526883333571e-05, + "loss": 3.2485, + "step": 9069 + }, + { + "epoch": 0.4222827478641432, + "grad_norm": 0.46922539084844944, + "learning_rate": 9.949488485192783e-05, + "loss": 3.2936, + "step": 9070 + }, + { + "epoch": 0.4223293060502363, + "grad_norm": 0.4558693500893809, + "learning_rate": 9.949450072525733e-05, + "loss": 3.378, + "step": 9071 + }, + { + "epoch": 0.42237586423632933, + "grad_norm": 0.39825709237223667, + "learning_rate": 9.949411645332534e-05, + "loss": 3.1581, + "step": 9072 + }, + { + "epoch": 0.42242242242242245, + "grad_norm": 0.4386636395839763, + "learning_rate": 9.949373203613301e-05, + "loss": 3.2567, + "step": 9073 + }, + { + "epoch": 0.4224689806085155, + "grad_norm": 0.4919645972440808, + "learning_rate": 9.949334747368144e-05, + "loss": 3.3386, + "step": 9074 + }, + { + "epoch": 0.42251553879460857, + "grad_norm": 0.47277540393809725, + "learning_rate": 9.949296276597178e-05, + "loss": 3.277, + "step": 9075 + }, + { + "epoch": 0.42256209698070163, + "grad_norm": 0.4938433553735442, + "learning_rate": 9.949257791300512e-05, + "loss": 3.2624, + "step": 9076 + }, + { + "epoch": 0.4226086551667947, + "grad_norm": 0.45254376666756235, + "learning_rate": 9.949219291478266e-05, + "loss": 3.3148, + "step": 9077 + }, + { + "epoch": 0.42265521335288775, + "grad_norm": 0.4130536833571178, + "learning_rate": 9.949180777130547e-05, + "loss": 3.316, + "step": 9078 + }, + { + "epoch": 0.42270177153898086, + "grad_norm": 0.49883993930373877, + "learning_rate": 9.949142248257471e-05, + "loss": 3.3246, + "step": 9079 + }, + { + "epoch": 0.4227483297250739, + "grad_norm": 0.5281098986009303, + "learning_rate": 9.949103704859149e-05, + "loss": 3.3521, + "step": 9080 + }, + { + "epoch": 0.422794887911167, + "grad_norm": 0.4742108303132936, + "learning_rate": 9.949065146935695e-05, + "loss": 3.3833, + "step": 9081 + }, + { + "epoch": 0.42284144609726004, + "grad_norm": 0.4610604196913347, + "learning_rate": 9.949026574487223e-05, + "loss": 3.2137, + "step": 9082 + }, + { + "epoch": 0.4228880042833531, + "grad_norm": 0.38015466281511456, + "learning_rate": 9.948987987513844e-05, + "loss": 3.2578, + "step": 9083 + }, + { + "epoch": 0.4229345624694462, + "grad_norm": 0.43754684063799326, + "learning_rate": 9.948949386015676e-05, + "loss": 3.3146, + "step": 9084 + }, + { + "epoch": 0.4229811206555393, + "grad_norm": 0.5280566810792469, + "learning_rate": 9.948910769992828e-05, + "loss": 3.3285, + "step": 9085 + }, + { + "epoch": 0.42302767884163234, + "grad_norm": 0.5028281383712068, + "learning_rate": 9.948872139445414e-05, + "loss": 3.263, + "step": 9086 + }, + { + "epoch": 0.4230742370277254, + "grad_norm": 0.4737341214330425, + "learning_rate": 9.948833494373546e-05, + "loss": 3.3566, + "step": 9087 + }, + { + "epoch": 0.42312079521381846, + "grad_norm": 0.4138974410570347, + "learning_rate": 9.948794834777342e-05, + "loss": 3.3149, + "step": 9088 + }, + { + "epoch": 0.4231673533999115, + "grad_norm": 0.5502907944099175, + "learning_rate": 9.948756160656912e-05, + "loss": 3.261, + "step": 9089 + }, + { + "epoch": 0.42321391158600463, + "grad_norm": 0.5747040093831423, + "learning_rate": 9.948717472012368e-05, + "loss": 3.4425, + "step": 9090 + }, + { + "epoch": 0.4232604697720977, + "grad_norm": 0.5058340782139535, + "learning_rate": 9.948678768843828e-05, + "loss": 3.2298, + "step": 9091 + }, + { + "epoch": 0.42330702795819075, + "grad_norm": 0.48459693745177906, + "learning_rate": 
9.948640051151404e-05, + "loss": 3.2168, + "step": 9092 + }, + { + "epoch": 0.4233535861442838, + "grad_norm": 0.48546873469504837, + "learning_rate": 9.948601318935207e-05, + "loss": 3.3176, + "step": 9093 + }, + { + "epoch": 0.42340014433037687, + "grad_norm": 0.49778386289809684, + "learning_rate": 9.948562572195354e-05, + "loss": 3.3842, + "step": 9094 + }, + { + "epoch": 0.42344670251647, + "grad_norm": 0.49307310570925467, + "learning_rate": 9.948523810931956e-05, + "loss": 3.3182, + "step": 9095 + }, + { + "epoch": 0.42349326070256305, + "grad_norm": 0.4822924452225427, + "learning_rate": 9.94848503514513e-05, + "loss": 3.3137, + "step": 9096 + }, + { + "epoch": 0.4235398188886561, + "grad_norm": 0.4878704661121483, + "learning_rate": 9.948446244834986e-05, + "loss": 3.3561, + "step": 9097 + }, + { + "epoch": 0.42358637707474917, + "grad_norm": 0.36716305015810397, + "learning_rate": 9.948407440001642e-05, + "loss": 3.2393, + "step": 9098 + }, + { + "epoch": 0.4236329352608422, + "grad_norm": 0.4857879998322665, + "learning_rate": 9.948368620645208e-05, + "loss": 3.3948, + "step": 9099 + }, + { + "epoch": 0.4236794934469353, + "grad_norm": 0.47713150026731144, + "learning_rate": 9.9483297867658e-05, + "loss": 3.3577, + "step": 9100 + }, + { + "epoch": 0.4237260516330284, + "grad_norm": 0.42911939680341166, + "learning_rate": 9.948290938363531e-05, + "loss": 3.3446, + "step": 9101 + }, + { + "epoch": 0.42377260981912146, + "grad_norm": 0.4462516881382703, + "learning_rate": 9.948252075438515e-05, + "loss": 3.3487, + "step": 9102 + }, + { + "epoch": 0.4238191680052145, + "grad_norm": 0.4441521677806081, + "learning_rate": 9.948213197990869e-05, + "loss": 3.1933, + "step": 9103 + }, + { + "epoch": 0.4238657261913076, + "grad_norm": 0.3877483103773329, + "learning_rate": 9.948174306020704e-05, + "loss": 3.2352, + "step": 9104 + }, + { + "epoch": 0.42391228437740064, + "grad_norm": 0.4282879482164717, + "learning_rate": 9.948135399528134e-05, + "loss": 3.28, + "step": 9105 + }, + { + "epoch": 0.4239588425634937, + "grad_norm": 0.436153968188471, + "learning_rate": 9.948096478513274e-05, + "loss": 3.3765, + "step": 9106 + }, + { + "epoch": 0.4240054007495868, + "grad_norm": 0.38985333845295095, + "learning_rate": 9.94805754297624e-05, + "loss": 3.2911, + "step": 9107 + }, + { + "epoch": 0.4240519589356799, + "grad_norm": 0.39619853729461313, + "learning_rate": 9.948018592917142e-05, + "loss": 3.3195, + "step": 9108 + }, + { + "epoch": 0.42409851712177293, + "grad_norm": 0.3629009004852506, + "learning_rate": 9.947979628336099e-05, + "loss": 3.1744, + "step": 9109 + }, + { + "epoch": 0.424145075307866, + "grad_norm": 0.40683854282172044, + "learning_rate": 9.947940649233221e-05, + "loss": 3.2263, + "step": 9110 + }, + { + "epoch": 0.42419163349395905, + "grad_norm": 0.38273063810276525, + "learning_rate": 9.947901655608627e-05, + "loss": 3.3341, + "step": 9111 + }, + { + "epoch": 0.42423819168005217, + "grad_norm": 0.42585860472565795, + "learning_rate": 9.947862647462428e-05, + "loss": 3.244, + "step": 9112 + }, + { + "epoch": 0.4242847498661452, + "grad_norm": 0.45344087563455704, + "learning_rate": 9.947823624794739e-05, + "loss": 3.3364, + "step": 9113 + }, + { + "epoch": 0.4243313080522383, + "grad_norm": 0.45150861148066657, + "learning_rate": 9.947784587605678e-05, + "loss": 3.2884, + "step": 9114 + }, + { + "epoch": 0.42437786623833135, + "grad_norm": 0.4426744272888856, + "learning_rate": 9.947745535895353e-05, + "loss": 3.2936, + "step": 9115 + }, + { + "epoch": 
0.4244244244244244, + "grad_norm": 0.42227501798594613, + "learning_rate": 9.947706469663884e-05, + "loss": 3.1908, + "step": 9116 + }, + { + "epoch": 0.42447098261051747, + "grad_norm": 0.4243022385858319, + "learning_rate": 9.947667388911383e-05, + "loss": 3.3292, + "step": 9117 + }, + { + "epoch": 0.4245175407966106, + "grad_norm": 0.4224628640812956, + "learning_rate": 9.947628293637967e-05, + "loss": 3.2525, + "step": 9118 + }, + { + "epoch": 0.42456409898270364, + "grad_norm": 0.40372861415177025, + "learning_rate": 9.947589183843748e-05, + "loss": 3.2499, + "step": 9119 + }, + { + "epoch": 0.4246106571687967, + "grad_norm": 0.42080072447873884, + "learning_rate": 9.947550059528844e-05, + "loss": 3.2306, + "step": 9120 + }, + { + "epoch": 0.42465721535488976, + "grad_norm": 0.37689258228450906, + "learning_rate": 9.947510920693368e-05, + "loss": 3.2298, + "step": 9121 + }, + { + "epoch": 0.4247037735409828, + "grad_norm": 0.35747063191136597, + "learning_rate": 9.947471767337433e-05, + "loss": 3.2516, + "step": 9122 + }, + { + "epoch": 0.42475033172707594, + "grad_norm": 0.42262132775595423, + "learning_rate": 9.947432599461156e-05, + "loss": 3.339, + "step": 9123 + }, + { + "epoch": 0.424796889913169, + "grad_norm": 0.4044488519754231, + "learning_rate": 9.947393417064652e-05, + "loss": 3.3698, + "step": 9124 + }, + { + "epoch": 0.42484344809926206, + "grad_norm": 0.3963246330882981, + "learning_rate": 9.947354220148037e-05, + "loss": 3.2337, + "step": 9125 + }, + { + "epoch": 0.4248900062853551, + "grad_norm": 0.3961763928342246, + "learning_rate": 9.947315008711425e-05, + "loss": 3.273, + "step": 9126 + }, + { + "epoch": 0.4249365644714482, + "grad_norm": 0.3736830687359398, + "learning_rate": 9.947275782754927e-05, + "loss": 3.3626, + "step": 9127 + }, + { + "epoch": 0.42498312265754123, + "grad_norm": 0.41595828631372167, + "learning_rate": 9.947236542278666e-05, + "loss": 3.1692, + "step": 9128 + }, + { + "epoch": 0.42502968084363435, + "grad_norm": 0.42214622040340727, + "learning_rate": 9.947197287282752e-05, + "loss": 3.3029, + "step": 9129 + }, + { + "epoch": 0.4250762390297274, + "grad_norm": 0.42979436823820727, + "learning_rate": 9.9471580177673e-05, + "loss": 3.2947, + "step": 9130 + }, + { + "epoch": 0.42512279721582047, + "grad_norm": 0.4862989494920069, + "learning_rate": 9.947118733732428e-05, + "loss": 3.2421, + "step": 9131 + }, + { + "epoch": 0.42516935540191353, + "grad_norm": 0.4569466665074101, + "learning_rate": 9.947079435178251e-05, + "loss": 3.2495, + "step": 9132 + }, + { + "epoch": 0.4252159135880066, + "grad_norm": 0.428996112678793, + "learning_rate": 9.94704012210488e-05, + "loss": 3.1777, + "step": 9133 + }, + { + "epoch": 0.4252624717740997, + "grad_norm": 0.4826948621894264, + "learning_rate": 9.947000794512436e-05, + "loss": 3.3291, + "step": 9134 + }, + { + "epoch": 0.42530902996019276, + "grad_norm": 0.4118462416102113, + "learning_rate": 9.946961452401031e-05, + "loss": 3.1979, + "step": 9135 + }, + { + "epoch": 0.4253555881462858, + "grad_norm": 0.4352571305887746, + "learning_rate": 9.946922095770782e-05, + "loss": 3.2552, + "step": 9136 + }, + { + "epoch": 0.4254021463323789, + "grad_norm": 0.4421640793731751, + "learning_rate": 9.946882724621805e-05, + "loss": 3.2312, + "step": 9137 + }, + { + "epoch": 0.42544870451847194, + "grad_norm": 0.42574455643460074, + "learning_rate": 9.946843338954213e-05, + "loss": 3.2437, + "step": 9138 + }, + { + "epoch": 0.425495262704565, + "grad_norm": 0.4047059662298148, + "learning_rate": 
9.946803938768123e-05, + "loss": 3.2948, + "step": 9139 + }, + { + "epoch": 0.4255418208906581, + "grad_norm": 0.38955483939851254, + "learning_rate": 9.946764524063652e-05, + "loss": 3.2805, + "step": 9140 + }, + { + "epoch": 0.4255883790767512, + "grad_norm": 0.38616041370255894, + "learning_rate": 9.946725094840914e-05, + "loss": 3.3191, + "step": 9141 + }, + { + "epoch": 0.42563493726284424, + "grad_norm": 0.39129175458403564, + "learning_rate": 9.946685651100025e-05, + "loss": 3.2638, + "step": 9142 + }, + { + "epoch": 0.4256814954489373, + "grad_norm": 0.4190489205542186, + "learning_rate": 9.946646192841101e-05, + "loss": 3.3178, + "step": 9143 + }, + { + "epoch": 0.42572805363503036, + "grad_norm": 0.3618529993410408, + "learning_rate": 9.946606720064258e-05, + "loss": 3.3149, + "step": 9144 + }, + { + "epoch": 0.42577461182112347, + "grad_norm": 0.487993761830738, + "learning_rate": 9.94656723276961e-05, + "loss": 3.2715, + "step": 9145 + }, + { + "epoch": 0.42582117000721653, + "grad_norm": 0.4532051189209958, + "learning_rate": 9.946527730957276e-05, + "loss": 3.2544, + "step": 9146 + }, + { + "epoch": 0.4258677281933096, + "grad_norm": 0.439876873486223, + "learning_rate": 9.94648821462737e-05, + "loss": 3.3665, + "step": 9147 + }, + { + "epoch": 0.42591428637940265, + "grad_norm": 0.40824510667324576, + "learning_rate": 9.94644868378001e-05, + "loss": 3.277, + "step": 9148 + }, + { + "epoch": 0.4259608445654957, + "grad_norm": 0.4043527034376072, + "learning_rate": 9.946409138415307e-05, + "loss": 3.3292, + "step": 9149 + }, + { + "epoch": 0.42600740275158877, + "grad_norm": 0.4263339539445135, + "learning_rate": 9.946369578533383e-05, + "loss": 3.3124, + "step": 9150 + }, + { + "epoch": 0.4260539609376819, + "grad_norm": 0.43193630625922613, + "learning_rate": 9.946330004134352e-05, + "loss": 3.2706, + "step": 9151 + }, + { + "epoch": 0.42610051912377495, + "grad_norm": 0.3600561088137057, + "learning_rate": 9.946290415218329e-05, + "loss": 3.3111, + "step": 9152 + }, + { + "epoch": 0.426147077309868, + "grad_norm": 0.39973029476918653, + "learning_rate": 9.94625081178543e-05, + "loss": 3.1325, + "step": 9153 + }, + { + "epoch": 0.42619363549596107, + "grad_norm": 0.4067190676545818, + "learning_rate": 9.946211193835773e-05, + "loss": 3.3419, + "step": 9154 + }, + { + "epoch": 0.4262401936820541, + "grad_norm": 0.4304541229931323, + "learning_rate": 9.946171561369473e-05, + "loss": 3.2843, + "step": 9155 + }, + { + "epoch": 0.42628675186814724, + "grad_norm": 0.4170237939189353, + "learning_rate": 9.946131914386647e-05, + "loss": 3.3244, + "step": 9156 + }, + { + "epoch": 0.4263333100542403, + "grad_norm": 0.3807203795207999, + "learning_rate": 9.94609225288741e-05, + "loss": 3.3678, + "step": 9157 + }, + { + "epoch": 0.42637986824033336, + "grad_norm": 0.4776561992364183, + "learning_rate": 9.946052576871883e-05, + "loss": 3.3898, + "step": 9158 + }, + { + "epoch": 0.4264264264264264, + "grad_norm": 0.46555733913984537, + "learning_rate": 9.946012886340177e-05, + "loss": 3.353, + "step": 9159 + }, + { + "epoch": 0.4264729846125195, + "grad_norm": 0.4350883162247817, + "learning_rate": 9.94597318129241e-05, + "loss": 3.2815, + "step": 9160 + }, + { + "epoch": 0.42651954279861254, + "grad_norm": 0.4120798144170573, + "learning_rate": 9.945933461728697e-05, + "loss": 3.321, + "step": 9161 + }, + { + "epoch": 0.42656610098470565, + "grad_norm": 0.4260086718326035, + "learning_rate": 9.94589372764916e-05, + "loss": 3.2788, + "step": 9162 + }, + { + "epoch": 0.4266126591707987, + 
"grad_norm": 0.4437589722749297, + "learning_rate": 9.945853979053911e-05, + "loss": 3.2821, + "step": 9163 + }, + { + "epoch": 0.4266592173568918, + "grad_norm": 0.3868219972847483, + "learning_rate": 9.945814215943068e-05, + "loss": 3.2667, + "step": 9164 + }, + { + "epoch": 0.42670577554298483, + "grad_norm": 0.3882023428731373, + "learning_rate": 9.945774438316746e-05, + "loss": 3.2152, + "step": 9165 + }, + { + "epoch": 0.4267523337290779, + "grad_norm": 0.4340012853952927, + "learning_rate": 9.945734646175064e-05, + "loss": 3.3976, + "step": 9166 + }, + { + "epoch": 0.426798891915171, + "grad_norm": 0.4315379497077593, + "learning_rate": 9.945694839518139e-05, + "loss": 3.2088, + "step": 9167 + }, + { + "epoch": 0.42684545010126407, + "grad_norm": 0.4587681151842201, + "learning_rate": 9.945655018346087e-05, + "loss": 3.2501, + "step": 9168 + }, + { + "epoch": 0.4268920082873571, + "grad_norm": 0.4829564188318787, + "learning_rate": 9.945615182659023e-05, + "loss": 3.3127, + "step": 9169 + }, + { + "epoch": 0.4269385664734502, + "grad_norm": 0.4509671627390204, + "learning_rate": 9.945575332457068e-05, + "loss": 3.2025, + "step": 9170 + }, + { + "epoch": 0.42698512465954325, + "grad_norm": 0.4039513926549687, + "learning_rate": 9.945535467740335e-05, + "loss": 3.301, + "step": 9171 + }, + { + "epoch": 0.4270316828456363, + "grad_norm": 0.4200539259969495, + "learning_rate": 9.945495588508943e-05, + "loss": 3.2926, + "step": 9172 + }, + { + "epoch": 0.4270782410317294, + "grad_norm": 0.4580186253653767, + "learning_rate": 9.945455694763008e-05, + "loss": 3.3121, + "step": 9173 + }, + { + "epoch": 0.4271247992178225, + "grad_norm": 0.4567717406878884, + "learning_rate": 9.945415786502648e-05, + "loss": 3.4192, + "step": 9174 + }, + { + "epoch": 0.42717135740391554, + "grad_norm": 0.432231016555816, + "learning_rate": 9.945375863727981e-05, + "loss": 3.4083, + "step": 9175 + }, + { + "epoch": 0.4272179155900086, + "grad_norm": 0.43025613311392064, + "learning_rate": 9.945335926439122e-05, + "loss": 3.1852, + "step": 9176 + }, + { + "epoch": 0.42726447377610166, + "grad_norm": 0.39953069591296436, + "learning_rate": 9.94529597463619e-05, + "loss": 3.2725, + "step": 9177 + }, + { + "epoch": 0.4273110319621948, + "grad_norm": 0.46246169947941096, + "learning_rate": 9.945256008319301e-05, + "loss": 3.254, + "step": 9178 + }, + { + "epoch": 0.42735759014828784, + "grad_norm": 0.3857178796481066, + "learning_rate": 9.945216027488573e-05, + "loss": 3.3224, + "step": 9179 + }, + { + "epoch": 0.4274041483343809, + "grad_norm": 0.42811226987924694, + "learning_rate": 9.945176032144123e-05, + "loss": 3.2619, + "step": 9180 + }, + { + "epoch": 0.42745070652047396, + "grad_norm": 0.46852993350577005, + "learning_rate": 9.945136022286068e-05, + "loss": 3.285, + "step": 9181 + }, + { + "epoch": 0.427497264706567, + "grad_norm": 0.37844148343047246, + "learning_rate": 9.945095997914527e-05, + "loss": 3.3207, + "step": 9182 + }, + { + "epoch": 0.4275438228926601, + "grad_norm": 0.44744613724519144, + "learning_rate": 9.945055959029616e-05, + "loss": 3.1833, + "step": 9183 + }, + { + "epoch": 0.4275903810787532, + "grad_norm": 0.43970673363266827, + "learning_rate": 9.945015905631452e-05, + "loss": 3.2762, + "step": 9184 + }, + { + "epoch": 0.42763693926484625, + "grad_norm": 0.4625480640688416, + "learning_rate": 9.944975837720153e-05, + "loss": 3.2824, + "step": 9185 + }, + { + "epoch": 0.4276834974509393, + "grad_norm": 0.4855889958145661, + "learning_rate": 9.94493575529584e-05, + "loss": 3.3719, + 
"step": 9186 + }, + { + "epoch": 0.42773005563703237, + "grad_norm": 0.4519335920371702, + "learning_rate": 9.944895658358626e-05, + "loss": 3.2524, + "step": 9187 + }, + { + "epoch": 0.42777661382312543, + "grad_norm": 0.45233682309181034, + "learning_rate": 9.944855546908631e-05, + "loss": 3.2341, + "step": 9188 + }, + { + "epoch": 0.42782317200921854, + "grad_norm": 0.45658737249277814, + "learning_rate": 9.944815420945972e-05, + "loss": 3.1145, + "step": 9189 + }, + { + "epoch": 0.4278697301953116, + "grad_norm": 0.5500483823501543, + "learning_rate": 9.944775280470766e-05, + "loss": 3.232, + "step": 9190 + }, + { + "epoch": 0.42791628838140466, + "grad_norm": 0.5383770805324265, + "learning_rate": 9.944735125483132e-05, + "loss": 3.2574, + "step": 9191 + }, + { + "epoch": 0.4279628465674977, + "grad_norm": 0.4713339704095898, + "learning_rate": 9.944694955983187e-05, + "loss": 3.3685, + "step": 9192 + }, + { + "epoch": 0.4280094047535908, + "grad_norm": 0.5681661131215434, + "learning_rate": 9.944654771971051e-05, + "loss": 3.3576, + "step": 9193 + }, + { + "epoch": 0.42805596293968384, + "grad_norm": 0.4967763502426959, + "learning_rate": 9.94461457344684e-05, + "loss": 3.3214, + "step": 9194 + }, + { + "epoch": 0.42810252112577696, + "grad_norm": 0.4822599705637564, + "learning_rate": 9.944574360410671e-05, + "loss": 3.3824, + "step": 9195 + }, + { + "epoch": 0.42814907931187, + "grad_norm": 0.44958150255822804, + "learning_rate": 9.944534132862664e-05, + "loss": 3.2162, + "step": 9196 + }, + { + "epoch": 0.4281956374979631, + "grad_norm": 0.5111851427610457, + "learning_rate": 9.944493890802937e-05, + "loss": 3.3949, + "step": 9197 + }, + { + "epoch": 0.42824219568405614, + "grad_norm": 0.4385418488593536, + "learning_rate": 9.944453634231608e-05, + "loss": 3.2602, + "step": 9198 + }, + { + "epoch": 0.4282887538701492, + "grad_norm": 0.3850080066706626, + "learning_rate": 9.944413363148793e-05, + "loss": 3.2715, + "step": 9199 + }, + { + "epoch": 0.4283353120562423, + "grad_norm": 0.40159796525246566, + "learning_rate": 9.944373077554614e-05, + "loss": 3.1249, + "step": 9200 + }, + { + "epoch": 0.42838187024233537, + "grad_norm": 0.39002539677060005, + "learning_rate": 9.944332777449185e-05, + "loss": 3.2773, + "step": 9201 + }, + { + "epoch": 0.42842842842842843, + "grad_norm": 0.46276146707099464, + "learning_rate": 9.944292462832627e-05, + "loss": 3.3196, + "step": 9202 + }, + { + "epoch": 0.4284749866145215, + "grad_norm": 0.3865246756315343, + "learning_rate": 9.944252133705058e-05, + "loss": 3.2732, + "step": 9203 + }, + { + "epoch": 0.42852154480061455, + "grad_norm": 0.4460097001665688, + "learning_rate": 9.944211790066597e-05, + "loss": 3.2092, + "step": 9204 + }, + { + "epoch": 0.4285681029867076, + "grad_norm": 0.45840071848036323, + "learning_rate": 9.944171431917361e-05, + "loss": 3.2412, + "step": 9205 + }, + { + "epoch": 0.4286146611728007, + "grad_norm": 0.4706857001880168, + "learning_rate": 9.944131059257468e-05, + "loss": 3.1898, + "step": 9206 + }, + { + "epoch": 0.4286612193588938, + "grad_norm": 0.4788587086740159, + "learning_rate": 9.944090672087038e-05, + "loss": 3.3113, + "step": 9207 + }, + { + "epoch": 0.42870777754498685, + "grad_norm": 0.4542774441639408, + "learning_rate": 9.944050270406189e-05, + "loss": 3.2026, + "step": 9208 + }, + { + "epoch": 0.4287543357310799, + "grad_norm": 0.4552526969155604, + "learning_rate": 9.944009854215039e-05, + "loss": 3.3037, + "step": 9209 + }, + { + "epoch": 0.42880089391717296, + "grad_norm": 0.4920242172721757, + 
"learning_rate": 9.943969423513708e-05, + "loss": 3.2518, + "step": 9210 + }, + { + "epoch": 0.4288474521032661, + "grad_norm": 0.44947904760061713, + "learning_rate": 9.943928978302312e-05, + "loss": 3.2347, + "step": 9211 + }, + { + "epoch": 0.42889401028935914, + "grad_norm": 0.4051265950876226, + "learning_rate": 9.943888518580975e-05, + "loss": 3.2136, + "step": 9212 + }, + { + "epoch": 0.4289405684754522, + "grad_norm": 0.49182607967656855, + "learning_rate": 9.943848044349809e-05, + "loss": 3.2354, + "step": 9213 + }, + { + "epoch": 0.42898712666154526, + "grad_norm": 0.502127887501509, + "learning_rate": 9.943807555608937e-05, + "loss": 3.272, + "step": 9214 + }, + { + "epoch": 0.4290336848476383, + "grad_norm": 0.3999309437857272, + "learning_rate": 9.943767052358477e-05, + "loss": 3.3334, + "step": 9215 + }, + { + "epoch": 0.4290802430337314, + "grad_norm": 0.41306940050602126, + "learning_rate": 9.943726534598549e-05, + "loss": 3.2628, + "step": 9216 + }, + { + "epoch": 0.4291268012198245, + "grad_norm": 0.42121690672246104, + "learning_rate": 9.94368600232927e-05, + "loss": 3.1778, + "step": 9217 + }, + { + "epoch": 0.42917335940591755, + "grad_norm": 0.4104662886124835, + "learning_rate": 9.943645455550759e-05, + "loss": 3.3627, + "step": 9218 + }, + { + "epoch": 0.4292199175920106, + "grad_norm": 0.43811287101551605, + "learning_rate": 9.943604894263135e-05, + "loss": 3.2859, + "step": 9219 + }, + { + "epoch": 0.4292664757781037, + "grad_norm": 0.3810851879236946, + "learning_rate": 9.943564318466518e-05, + "loss": 3.3489, + "step": 9220 + }, + { + "epoch": 0.42931303396419673, + "grad_norm": 0.45361259985338176, + "learning_rate": 9.943523728161027e-05, + "loss": 3.3072, + "step": 9221 + }, + { + "epoch": 0.42935959215028985, + "grad_norm": 0.393021051826748, + "learning_rate": 9.943483123346782e-05, + "loss": 3.4476, + "step": 9222 + }, + { + "epoch": 0.4294061503363829, + "grad_norm": 0.44793489017316745, + "learning_rate": 9.943442504023899e-05, + "loss": 3.2583, + "step": 9223 + }, + { + "epoch": 0.42945270852247597, + "grad_norm": 0.43382651610395395, + "learning_rate": 9.9434018701925e-05, + "loss": 3.3024, + "step": 9224 + }, + { + "epoch": 0.429499266708569, + "grad_norm": 0.4341122321274274, + "learning_rate": 9.943361221852704e-05, + "loss": 3.3874, + "step": 9225 + }, + { + "epoch": 0.4295458248946621, + "grad_norm": 0.4441323315570082, + "learning_rate": 9.94332055900463e-05, + "loss": 3.308, + "step": 9226 + }, + { + "epoch": 0.42959238308075515, + "grad_norm": 0.4284384102268873, + "learning_rate": 9.943279881648397e-05, + "loss": 3.3506, + "step": 9227 + }, + { + "epoch": 0.42963894126684826, + "grad_norm": 0.44704906790035576, + "learning_rate": 9.943239189784124e-05, + "loss": 3.2964, + "step": 9228 + }, + { + "epoch": 0.4296854994529413, + "grad_norm": 0.43472908620979867, + "learning_rate": 9.94319848341193e-05, + "loss": 3.1886, + "step": 9229 + }, + { + "epoch": 0.4297320576390344, + "grad_norm": 0.4221421943121579, + "learning_rate": 9.943157762531938e-05, + "loss": 3.2136, + "step": 9230 + }, + { + "epoch": 0.42977861582512744, + "grad_norm": 0.4016063750624314, + "learning_rate": 9.943117027144265e-05, + "loss": 3.2588, + "step": 9231 + }, + { + "epoch": 0.4298251740112205, + "grad_norm": 0.4132816091155017, + "learning_rate": 9.943076277249029e-05, + "loss": 3.2471, + "step": 9232 + }, + { + "epoch": 0.4298717321973136, + "grad_norm": 0.39091837505605054, + "learning_rate": 9.94303551284635e-05, + "loss": 3.345, + "step": 9233 + }, + { + "epoch": 
0.4299182903834067, + "grad_norm": 0.41787879443244746, + "learning_rate": 9.94299473393635e-05, + "loss": 3.2805, + "step": 9234 + }, + { + "epoch": 0.42996484856949974, + "grad_norm": 0.44235361336674456, + "learning_rate": 9.942953940519148e-05, + "loss": 3.2959, + "step": 9235 + }, + { + "epoch": 0.4300114067555928, + "grad_norm": 0.4706205566318644, + "learning_rate": 9.942913132594862e-05, + "loss": 3.3451, + "step": 9236 + }, + { + "epoch": 0.43005796494168586, + "grad_norm": 0.3570552932757947, + "learning_rate": 9.942872310163613e-05, + "loss": 3.1461, + "step": 9237 + }, + { + "epoch": 0.4301045231277789, + "grad_norm": 0.4064655931938328, + "learning_rate": 9.942831473225523e-05, + "loss": 3.3559, + "step": 9238 + }, + { + "epoch": 0.43015108131387203, + "grad_norm": 0.4361817605971175, + "learning_rate": 9.942790621780706e-05, + "loss": 3.3978, + "step": 9239 + }, + { + "epoch": 0.4301976394999651, + "grad_norm": 0.4285726873430434, + "learning_rate": 9.942749755829287e-05, + "loss": 3.2028, + "step": 9240 + }, + { + "epoch": 0.43024419768605815, + "grad_norm": 0.41668189213917384, + "learning_rate": 9.942708875371385e-05, + "loss": 3.3971, + "step": 9241 + }, + { + "epoch": 0.4302907558721512, + "grad_norm": 0.44703256692847665, + "learning_rate": 9.942667980407118e-05, + "loss": 3.1492, + "step": 9242 + }, + { + "epoch": 0.43033731405824427, + "grad_norm": 0.3977560134351953, + "learning_rate": 9.94262707093661e-05, + "loss": 3.1526, + "step": 9243 + }, + { + "epoch": 0.4303838722443374, + "grad_norm": 0.4605765125285399, + "learning_rate": 9.942586146959976e-05, + "loss": 3.1978, + "step": 9244 + }, + { + "epoch": 0.43043043043043044, + "grad_norm": 0.4241796812521752, + "learning_rate": 9.94254520847734e-05, + "loss": 3.251, + "step": 9245 + }, + { + "epoch": 0.4304769886165235, + "grad_norm": 0.43524109153666607, + "learning_rate": 9.94250425548882e-05, + "loss": 3.3201, + "step": 9246 + }, + { + "epoch": 0.43052354680261656, + "grad_norm": 0.4469264012762789, + "learning_rate": 9.942463287994538e-05, + "loss": 3.2034, + "step": 9247 + }, + { + "epoch": 0.4305701049887096, + "grad_norm": 0.44742210469137755, + "learning_rate": 9.942422305994613e-05, + "loss": 3.1841, + "step": 9248 + }, + { + "epoch": 0.4306166631748027, + "grad_norm": 0.4561711695464699, + "learning_rate": 9.942381309489164e-05, + "loss": 3.3307, + "step": 9249 + }, + { + "epoch": 0.4306632213608958, + "grad_norm": 0.5060574110737532, + "learning_rate": 9.942340298478314e-05, + "loss": 3.3061, + "step": 9250 + }, + { + "epoch": 0.43070977954698886, + "grad_norm": 0.5407146282180334, + "learning_rate": 9.942299272962181e-05, + "loss": 3.2317, + "step": 9251 + }, + { + "epoch": 0.4307563377330819, + "grad_norm": 0.4747519429979895, + "learning_rate": 9.942258232940888e-05, + "loss": 3.3971, + "step": 9252 + }, + { + "epoch": 0.430802895919175, + "grad_norm": 0.43286875052515883, + "learning_rate": 9.942217178414553e-05, + "loss": 3.2609, + "step": 9253 + }, + { + "epoch": 0.43084945410526804, + "grad_norm": 0.4545749986651147, + "learning_rate": 9.942176109383297e-05, + "loss": 3.1111, + "step": 9254 + }, + { + "epoch": 0.43089601229136115, + "grad_norm": 0.4308209778509754, + "learning_rate": 9.942135025847242e-05, + "loss": 3.2481, + "step": 9255 + }, + { + "epoch": 0.4309425704774542, + "grad_norm": 0.4777916365408067, + "learning_rate": 9.942093927806507e-05, + "loss": 3.2649, + "step": 9256 + }, + { + "epoch": 0.43098912866354727, + "grad_norm": 0.4584152659791746, + "learning_rate": 
9.942052815261213e-05, + "loss": 3.2957, + "step": 9257 + }, + { + "epoch": 0.43103568684964033, + "grad_norm": 0.4196741801985604, + "learning_rate": 9.942011688211482e-05, + "loss": 3.4363, + "step": 9258 + }, + { + "epoch": 0.4310822450357334, + "grad_norm": 0.5404829660183639, + "learning_rate": 9.941970546657434e-05, + "loss": 3.3822, + "step": 9259 + }, + { + "epoch": 0.43112880322182645, + "grad_norm": 0.5550415942950908, + "learning_rate": 9.941929390599187e-05, + "loss": 3.3102, + "step": 9260 + }, + { + "epoch": 0.43117536140791957, + "grad_norm": 0.47424921958511096, + "learning_rate": 9.941888220036865e-05, + "loss": 3.3154, + "step": 9261 + }, + { + "epoch": 0.4312219195940126, + "grad_norm": 0.41910758414720584, + "learning_rate": 9.941847034970588e-05, + "loss": 3.3382, + "step": 9262 + }, + { + "epoch": 0.4312684777801057, + "grad_norm": 0.42826195831284175, + "learning_rate": 9.941805835400478e-05, + "loss": 3.245, + "step": 9263 + }, + { + "epoch": 0.43131503596619875, + "grad_norm": 0.43189300513101936, + "learning_rate": 9.941764621326653e-05, + "loss": 3.3113, + "step": 9264 + }, + { + "epoch": 0.4313615941522918, + "grad_norm": 0.3998946019616889, + "learning_rate": 9.941723392749237e-05, + "loss": 3.2597, + "step": 9265 + }, + { + "epoch": 0.4314081523383849, + "grad_norm": 0.355871258614531, + "learning_rate": 9.94168214966835e-05, + "loss": 3.2332, + "step": 9266 + }, + { + "epoch": 0.431454710524478, + "grad_norm": 0.384102341017979, + "learning_rate": 9.94164089208411e-05, + "loss": 3.2688, + "step": 9267 + }, + { + "epoch": 0.43150126871057104, + "grad_norm": 0.39441671974325104, + "learning_rate": 9.941599619996644e-05, + "loss": 3.2502, + "step": 9268 + }, + { + "epoch": 0.4315478268966641, + "grad_norm": 0.35594217294980957, + "learning_rate": 9.941558333406068e-05, + "loss": 3.2452, + "step": 9269 + }, + { + "epoch": 0.43159438508275716, + "grad_norm": 0.3657273020966623, + "learning_rate": 9.941517032312507e-05, + "loss": 3.2866, + "step": 9270 + }, + { + "epoch": 0.4316409432688502, + "grad_norm": 0.4027152633557879, + "learning_rate": 9.941475716716078e-05, + "loss": 3.2825, + "step": 9271 + }, + { + "epoch": 0.43168750145494333, + "grad_norm": 0.3621527903501821, + "learning_rate": 9.941434386616905e-05, + "loss": 3.2633, + "step": 9272 + }, + { + "epoch": 0.4317340596410364, + "grad_norm": 0.3789048301242197, + "learning_rate": 9.94139304201511e-05, + "loss": 3.3104, + "step": 9273 + }, + { + "epoch": 0.43178061782712945, + "grad_norm": 0.39888745018115074, + "learning_rate": 9.941351682910813e-05, + "loss": 3.2349, + "step": 9274 + }, + { + "epoch": 0.4318271760132225, + "grad_norm": 0.37591290728455545, + "learning_rate": 9.941310309304135e-05, + "loss": 3.2461, + "step": 9275 + }, + { + "epoch": 0.4318737341993156, + "grad_norm": 0.41104828048526354, + "learning_rate": 9.9412689211952e-05, + "loss": 3.1876, + "step": 9276 + }, + { + "epoch": 0.4319202923854087, + "grad_norm": 0.38637993722437264, + "learning_rate": 9.941227518584124e-05, + "loss": 3.3617, + "step": 9277 + }, + { + "epoch": 0.43196685057150175, + "grad_norm": 0.3885825113404867, + "learning_rate": 9.941186101471034e-05, + "loss": 3.2145, + "step": 9278 + }, + { + "epoch": 0.4320134087575948, + "grad_norm": 0.45167353794786236, + "learning_rate": 9.941144669856051e-05, + "loss": 3.351, + "step": 9279 + }, + { + "epoch": 0.43205996694368787, + "grad_norm": 0.4089013389693527, + "learning_rate": 9.941103223739292e-05, + "loss": 3.3541, + "step": 9280 + }, + { + "epoch": 
0.4321065251297809, + "grad_norm": 0.38838972149457196, + "learning_rate": 9.941061763120884e-05, + "loss": 3.2578, + "step": 9281 + }, + { + "epoch": 0.432153083315874, + "grad_norm": 0.3647196349776253, + "learning_rate": 9.941020288000946e-05, + "loss": 3.2533, + "step": 9282 + }, + { + "epoch": 0.4321996415019671, + "grad_norm": 0.4092694319659562, + "learning_rate": 9.940978798379601e-05, + "loss": 3.2948, + "step": 9283 + }, + { + "epoch": 0.43224619968806016, + "grad_norm": 0.4470992565980489, + "learning_rate": 9.940937294256969e-05, + "loss": 3.268, + "step": 9284 + }, + { + "epoch": 0.4322927578741532, + "grad_norm": 0.42921784766721816, + "learning_rate": 9.940895775633172e-05, + "loss": 3.2784, + "step": 9285 + }, + { + "epoch": 0.4323393160602463, + "grad_norm": 0.39162165544112804, + "learning_rate": 9.940854242508335e-05, + "loss": 3.3654, + "step": 9286 + }, + { + "epoch": 0.43238587424633934, + "grad_norm": 0.48312304576821724, + "learning_rate": 9.940812694882575e-05, + "loss": 3.2705, + "step": 9287 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.45575569406767197, + "learning_rate": 9.940771132756019e-05, + "loss": 3.2756, + "step": 9288 + }, + { + "epoch": 0.4324789906185255, + "grad_norm": 0.40706883678928507, + "learning_rate": 9.940729556128785e-05, + "loss": 3.287, + "step": 9289 + }, + { + "epoch": 0.4325255488046186, + "grad_norm": 0.41277680488976864, + "learning_rate": 9.940687965000997e-05, + "loss": 3.2761, + "step": 9290 + }, + { + "epoch": 0.43257210699071164, + "grad_norm": 0.4186934294444754, + "learning_rate": 9.940646359372775e-05, + "loss": 3.2406, + "step": 9291 + }, + { + "epoch": 0.4326186651768047, + "grad_norm": 0.3962785302449132, + "learning_rate": 9.940604739244245e-05, + "loss": 3.2235, + "step": 9292 + }, + { + "epoch": 0.43266522336289776, + "grad_norm": 0.4307945206512357, + "learning_rate": 9.940563104615526e-05, + "loss": 3.3114, + "step": 9293 + }, + { + "epoch": 0.43271178154899087, + "grad_norm": 0.4671958322327937, + "learning_rate": 9.94052145548674e-05, + "loss": 3.2722, + "step": 9294 + }, + { + "epoch": 0.43275833973508393, + "grad_norm": 0.5181821364849276, + "learning_rate": 9.940479791858011e-05, + "loss": 3.266, + "step": 9295 + }, + { + "epoch": 0.432804897921177, + "grad_norm": 0.5011635974381597, + "learning_rate": 9.940438113729462e-05, + "loss": 3.2171, + "step": 9296 + }, + { + "epoch": 0.43285145610727005, + "grad_norm": 0.4296278873764788, + "learning_rate": 9.940396421101211e-05, + "loss": 3.2469, + "step": 9297 + }, + { + "epoch": 0.4328980142933631, + "grad_norm": 0.437837206946297, + "learning_rate": 9.940354713973385e-05, + "loss": 3.2653, + "step": 9298 + }, + { + "epoch": 0.4329445724794562, + "grad_norm": 0.43093719523661744, + "learning_rate": 9.940312992346104e-05, + "loss": 3.3268, + "step": 9299 + }, + { + "epoch": 0.4329911306655493, + "grad_norm": 0.41294084933079983, + "learning_rate": 9.940271256219491e-05, + "loss": 3.248, + "step": 9300 + }, + { + "epoch": 0.43303768885164234, + "grad_norm": 0.4189843260765348, + "learning_rate": 9.94022950559367e-05, + "loss": 3.2523, + "step": 9301 + }, + { + "epoch": 0.4330842470377354, + "grad_norm": 0.46812500113155375, + "learning_rate": 9.940187740468761e-05, + "loss": 3.2397, + "step": 9302 + }, + { + "epoch": 0.43313080522382846, + "grad_norm": 0.44132315562168034, + "learning_rate": 9.940145960844887e-05, + "loss": 3.332, + "step": 9303 + }, + { + "epoch": 0.4331773634099215, + "grad_norm": 0.45547462412594686, + "learning_rate": 
9.940104166722172e-05, + "loss": 3.273, + "step": 9304 + }, + { + "epoch": 0.43322392159601464, + "grad_norm": 0.41986380902800646, + "learning_rate": 9.940062358100738e-05, + "loss": 3.204, + "step": 9305 + }, + { + "epoch": 0.4332704797821077, + "grad_norm": 0.44122588322719997, + "learning_rate": 9.940020534980707e-05, + "loss": 3.3053, + "step": 9306 + }, + { + "epoch": 0.43331703796820076, + "grad_norm": 0.4455003996247484, + "learning_rate": 9.939978697362204e-05, + "loss": 3.3899, + "step": 9307 + }, + { + "epoch": 0.4333635961542938, + "grad_norm": 0.37725348272725373, + "learning_rate": 9.939936845245347e-05, + "loss": 3.2418, + "step": 9308 + }, + { + "epoch": 0.4334101543403869, + "grad_norm": 0.40576462300666644, + "learning_rate": 9.939894978630267e-05, + "loss": 3.1397, + "step": 9309 + }, + { + "epoch": 0.43345671252648, + "grad_norm": 0.4048976998854858, + "learning_rate": 9.939853097517077e-05, + "loss": 3.2843, + "step": 9310 + }, + { + "epoch": 0.43350327071257305, + "grad_norm": 0.40980665648748604, + "learning_rate": 9.939811201905908e-05, + "loss": 3.3083, + "step": 9311 + }, + { + "epoch": 0.4335498288986661, + "grad_norm": 0.4413713704087463, + "learning_rate": 9.939769291796878e-05, + "loss": 3.3397, + "step": 9312 + }, + { + "epoch": 0.43359638708475917, + "grad_norm": 0.4437128147853618, + "learning_rate": 9.939727367190112e-05, + "loss": 3.229, + "step": 9313 + }, + { + "epoch": 0.43364294527085223, + "grad_norm": 0.4035441994856457, + "learning_rate": 9.939685428085734e-05, + "loss": 3.3074, + "step": 9314 + }, + { + "epoch": 0.4336895034569453, + "grad_norm": 0.4450579483251811, + "learning_rate": 9.939643474483865e-05, + "loss": 3.2244, + "step": 9315 + }, + { + "epoch": 0.4337360616430384, + "grad_norm": 0.4409008371105244, + "learning_rate": 9.93960150638463e-05, + "loss": 3.3488, + "step": 9316 + }, + { + "epoch": 0.43378261982913147, + "grad_norm": 0.3871260495650272, + "learning_rate": 9.939559523788149e-05, + "loss": 3.2325, + "step": 9317 + }, + { + "epoch": 0.4338291780152245, + "grad_norm": 0.4429887058144301, + "learning_rate": 9.93951752669455e-05, + "loss": 3.2825, + "step": 9318 + }, + { + "epoch": 0.4338757362013176, + "grad_norm": 0.42600462652205157, + "learning_rate": 9.939475515103952e-05, + "loss": 3.2593, + "step": 9319 + }, + { + "epoch": 0.43392229438741065, + "grad_norm": 0.4074254771259172, + "learning_rate": 9.939433489016481e-05, + "loss": 3.2334, + "step": 9320 + }, + { + "epoch": 0.43396885257350376, + "grad_norm": 0.372317743532765, + "learning_rate": 9.939391448432258e-05, + "loss": 3.299, + "step": 9321 + }, + { + "epoch": 0.4340154107595968, + "grad_norm": 0.43546362744598666, + "learning_rate": 9.939349393351409e-05, + "loss": 3.2445, + "step": 9322 + }, + { + "epoch": 0.4340619689456899, + "grad_norm": 0.42329192114393066, + "learning_rate": 9.939307323774055e-05, + "loss": 3.268, + "step": 9323 + }, + { + "epoch": 0.43410852713178294, + "grad_norm": 0.43512847961227513, + "learning_rate": 9.939265239700321e-05, + "loss": 3.2126, + "step": 9324 + }, + { + "epoch": 0.434155085317876, + "grad_norm": 0.4189061189982484, + "learning_rate": 9.939223141130331e-05, + "loss": 3.3288, + "step": 9325 + }, + { + "epoch": 0.43420164350396906, + "grad_norm": 0.41039257021474557, + "learning_rate": 9.939181028064207e-05, + "loss": 3.2466, + "step": 9326 + }, + { + "epoch": 0.4342482016900622, + "grad_norm": 0.4147253251555731, + "learning_rate": 9.939138900502074e-05, + "loss": 3.1793, + "step": 9327 + }, + { + "epoch": 
0.43429475987615523, + "grad_norm": 0.4125793321414123, + "learning_rate": 9.939096758444053e-05, + "loss": 3.1702, + "step": 9328 + }, + { + "epoch": 0.4343413180622483, + "grad_norm": 0.44104635170441725, + "learning_rate": 9.939054601890272e-05, + "loss": 3.2658, + "step": 9329 + }, + { + "epoch": 0.43438787624834135, + "grad_norm": 0.4454191951700112, + "learning_rate": 9.93901243084085e-05, + "loss": 3.3357, + "step": 9330 + }, + { + "epoch": 0.4344344344344344, + "grad_norm": 0.4294568598395848, + "learning_rate": 9.938970245295916e-05, + "loss": 3.2516, + "step": 9331 + }, + { + "epoch": 0.43448099262052753, + "grad_norm": 0.3626832296605055, + "learning_rate": 9.938928045255589e-05, + "loss": 3.3081, + "step": 9332 + }, + { + "epoch": 0.4345275508066206, + "grad_norm": 0.4340251402206282, + "learning_rate": 9.938885830719995e-05, + "loss": 3.1973, + "step": 9333 + }, + { + "epoch": 0.43457410899271365, + "grad_norm": 0.4091342917179092, + "learning_rate": 9.938843601689256e-05, + "loss": 3.3622, + "step": 9334 + }, + { + "epoch": 0.4346206671788067, + "grad_norm": 0.3910114204647216, + "learning_rate": 9.9388013581635e-05, + "loss": 3.1826, + "step": 9335 + }, + { + "epoch": 0.43466722536489977, + "grad_norm": 0.38781385907327376, + "learning_rate": 9.938759100142848e-05, + "loss": 3.2144, + "step": 9336 + }, + { + "epoch": 0.4347137835509928, + "grad_norm": 0.436435694226734, + "learning_rate": 9.938716827627424e-05, + "loss": 3.2954, + "step": 9337 + }, + { + "epoch": 0.43476034173708594, + "grad_norm": 0.4775890577992681, + "learning_rate": 9.938674540617353e-05, + "loss": 3.3821, + "step": 9338 + }, + { + "epoch": 0.434806899923179, + "grad_norm": 0.4258565832802745, + "learning_rate": 9.938632239112759e-05, + "loss": 3.2723, + "step": 9339 + }, + { + "epoch": 0.43485345810927206, + "grad_norm": 0.3901236839073547, + "learning_rate": 9.938589923113766e-05, + "loss": 3.2396, + "step": 9340 + }, + { + "epoch": 0.4349000162953651, + "grad_norm": 0.42675648421015544, + "learning_rate": 9.938547592620498e-05, + "loss": 3.2646, + "step": 9341 + }, + { + "epoch": 0.4349465744814582, + "grad_norm": 0.44874692923220155, + "learning_rate": 9.938505247633079e-05, + "loss": 3.2592, + "step": 9342 + }, + { + "epoch": 0.4349931326675513, + "grad_norm": 0.38187353261740214, + "learning_rate": 9.938462888151633e-05, + "loss": 3.2332, + "step": 9343 + }, + { + "epoch": 0.43503969085364436, + "grad_norm": 0.4562799674052654, + "learning_rate": 9.938420514176285e-05, + "loss": 3.371, + "step": 9344 + }, + { + "epoch": 0.4350862490397374, + "grad_norm": 0.44560878795283987, + "learning_rate": 9.93837812570716e-05, + "loss": 3.3103, + "step": 9345 + }, + { + "epoch": 0.4351328072258305, + "grad_norm": 0.4117848715345765, + "learning_rate": 9.938335722744381e-05, + "loss": 3.337, + "step": 9346 + }, + { + "epoch": 0.43517936541192354, + "grad_norm": 0.433194417942036, + "learning_rate": 9.938293305288076e-05, + "loss": 3.3206, + "step": 9347 + }, + { + "epoch": 0.4352259235980166, + "grad_norm": 0.43375813738381763, + "learning_rate": 9.938250873338364e-05, + "loss": 3.2232, + "step": 9348 + }, + { + "epoch": 0.4352724817841097, + "grad_norm": 0.3673827565477838, + "learning_rate": 9.938208426895372e-05, + "loss": 3.3333, + "step": 9349 + }, + { + "epoch": 0.43531903997020277, + "grad_norm": 0.41517242415130967, + "learning_rate": 9.938165965959225e-05, + "loss": 3.334, + "step": 9350 + }, + { + "epoch": 0.43536559815629583, + "grad_norm": 0.4686105426302374, + "learning_rate": 9.93812349053005e-05, 
+ "loss": 3.2545, + "step": 9351 + }, + { + "epoch": 0.4354121563423889, + "grad_norm": 0.4352993081032315, + "learning_rate": 9.938081000607965e-05, + "loss": 3.3245, + "step": 9352 + }, + { + "epoch": 0.43545871452848195, + "grad_norm": 0.3977647181878002, + "learning_rate": 9.9380384961931e-05, + "loss": 3.2843, + "step": 9353 + }, + { + "epoch": 0.43550527271457506, + "grad_norm": 0.4263064873275325, + "learning_rate": 9.93799597728558e-05, + "loss": 3.1861, + "step": 9354 + }, + { + "epoch": 0.4355518309006681, + "grad_norm": 0.4052475190824365, + "learning_rate": 9.937953443885527e-05, + "loss": 3.2532, + "step": 9355 + }, + { + "epoch": 0.4355983890867612, + "grad_norm": 0.43033670723527073, + "learning_rate": 9.937910895993068e-05, + "loss": 3.2447, + "step": 9356 + }, + { + "epoch": 0.43564494727285424, + "grad_norm": 0.44603285017361577, + "learning_rate": 9.937868333608327e-05, + "loss": 3.2959, + "step": 9357 + }, + { + "epoch": 0.4356915054589473, + "grad_norm": 0.40598798983166196, + "learning_rate": 9.937825756731427e-05, + "loss": 3.2961, + "step": 9358 + }, + { + "epoch": 0.43573806364504036, + "grad_norm": 0.4064049861638336, + "learning_rate": 9.937783165362498e-05, + "loss": 3.3028, + "step": 9359 + }, + { + "epoch": 0.4357846218311335, + "grad_norm": 0.3866033147820444, + "learning_rate": 9.937740559501658e-05, + "loss": 3.2946, + "step": 9360 + }, + { + "epoch": 0.43583118001722654, + "grad_norm": 0.40086541520402946, + "learning_rate": 9.937697939149038e-05, + "loss": 3.3233, + "step": 9361 + }, + { + "epoch": 0.4358777382033196, + "grad_norm": 0.4872039143770773, + "learning_rate": 9.937655304304762e-05, + "loss": 3.257, + "step": 9362 + }, + { + "epoch": 0.43592429638941266, + "grad_norm": 0.43541429941009335, + "learning_rate": 9.937612654968953e-05, + "loss": 3.2219, + "step": 9363 + }, + { + "epoch": 0.4359708545755057, + "grad_norm": 0.40707890931776175, + "learning_rate": 9.937569991141737e-05, + "loss": 3.3865, + "step": 9364 + }, + { + "epoch": 0.43601741276159883, + "grad_norm": 0.4040238243236779, + "learning_rate": 9.937527312823241e-05, + "loss": 3.3321, + "step": 9365 + }, + { + "epoch": 0.4360639709476919, + "grad_norm": 0.47180168488839064, + "learning_rate": 9.937484620013587e-05, + "loss": 3.2207, + "step": 9366 + }, + { + "epoch": 0.43611052913378495, + "grad_norm": 0.42085226135057624, + "learning_rate": 9.937441912712903e-05, + "loss": 3.3363, + "step": 9367 + }, + { + "epoch": 0.436157087319878, + "grad_norm": 0.4104134530080861, + "learning_rate": 9.937399190921313e-05, + "loss": 3.2851, + "step": 9368 + }, + { + "epoch": 0.43620364550597107, + "grad_norm": 0.3778624461613312, + "learning_rate": 9.937356454638943e-05, + "loss": 3.2958, + "step": 9369 + }, + { + "epoch": 0.43625020369206413, + "grad_norm": 0.39324270120380034, + "learning_rate": 9.937313703865919e-05, + "loss": 3.3337, + "step": 9370 + }, + { + "epoch": 0.43629676187815725, + "grad_norm": 0.3792558808713083, + "learning_rate": 9.937270938602363e-05, + "loss": 3.2454, + "step": 9371 + }, + { + "epoch": 0.4363433200642503, + "grad_norm": 0.3751158253854011, + "learning_rate": 9.937228158848407e-05, + "loss": 3.2743, + "step": 9372 + }, + { + "epoch": 0.43638987825034337, + "grad_norm": 0.4352800961484924, + "learning_rate": 9.937185364604169e-05, + "loss": 3.1943, + "step": 9373 + }, + { + "epoch": 0.4364364364364364, + "grad_norm": 0.4525506332612778, + "learning_rate": 9.93714255586978e-05, + "loss": 3.241, + "step": 9374 + }, + { + "epoch": 0.4364829946225295, + "grad_norm": 
0.41280697315336357, + "learning_rate": 9.937099732645365e-05, + "loss": 3.2405, + "step": 9375 + }, + { + "epoch": 0.4365295528086226, + "grad_norm": 0.40137348428511677, + "learning_rate": 9.937056894931047e-05, + "loss": 3.2663, + "step": 9376 + }, + { + "epoch": 0.43657611099471566, + "grad_norm": 0.44693248917540457, + "learning_rate": 9.937014042726955e-05, + "loss": 3.2319, + "step": 9377 + }, + { + "epoch": 0.4366226691808087, + "grad_norm": 0.37904671070462603, + "learning_rate": 9.936971176033211e-05, + "loss": 3.2153, + "step": 9378 + }, + { + "epoch": 0.4366692273669018, + "grad_norm": 0.4544591547645278, + "learning_rate": 9.936928294849946e-05, + "loss": 3.3124, + "step": 9379 + }, + { + "epoch": 0.43671578555299484, + "grad_norm": 0.4020189892968949, + "learning_rate": 9.936885399177281e-05, + "loss": 3.299, + "step": 9380 + }, + { + "epoch": 0.4367623437390879, + "grad_norm": 0.3804246209022742, + "learning_rate": 9.936842489015343e-05, + "loss": 3.4076, + "step": 9381 + }, + { + "epoch": 0.436808901925181, + "grad_norm": 0.499017085032662, + "learning_rate": 9.93679956436426e-05, + "loss": 3.1729, + "step": 9382 + }, + { + "epoch": 0.4368554601112741, + "grad_norm": 0.46195620701247786, + "learning_rate": 9.936756625224156e-05, + "loss": 3.3275, + "step": 9383 + }, + { + "epoch": 0.43690201829736713, + "grad_norm": 0.4057256091476665, + "learning_rate": 9.936713671595157e-05, + "loss": 3.2159, + "step": 9384 + }, + { + "epoch": 0.4369485764834602, + "grad_norm": 0.44615475480402195, + "learning_rate": 9.93667070347739e-05, + "loss": 3.3269, + "step": 9385 + }, + { + "epoch": 0.43699513466955325, + "grad_norm": 0.43104556938088784, + "learning_rate": 9.936627720870984e-05, + "loss": 3.2575, + "step": 9386 + }, + { + "epoch": 0.43704169285564637, + "grad_norm": 0.40979749235357193, + "learning_rate": 9.936584723776057e-05, + "loss": 3.2198, + "step": 9387 + }, + { + "epoch": 0.43708825104173943, + "grad_norm": 0.43246815111712034, + "learning_rate": 9.936541712192743e-05, + "loss": 3.3256, + "step": 9388 + }, + { + "epoch": 0.4371348092278325, + "grad_norm": 0.43645865964305597, + "learning_rate": 9.936498686121165e-05, + "loss": 3.1173, + "step": 9389 + }, + { + "epoch": 0.43718136741392555, + "grad_norm": 0.4141658442411111, + "learning_rate": 9.93645564556145e-05, + "loss": 3.3075, + "step": 9390 + }, + { + "epoch": 0.4372279256000186, + "grad_norm": 0.5275504134450026, + "learning_rate": 9.936412590513723e-05, + "loss": 3.318, + "step": 9391 + }, + { + "epoch": 0.43727448378611167, + "grad_norm": 0.4378690806275258, + "learning_rate": 9.936369520978113e-05, + "loss": 3.2443, + "step": 9392 + }, + { + "epoch": 0.4373210419722048, + "grad_norm": 0.40368479904224847, + "learning_rate": 9.936326436954744e-05, + "loss": 3.2172, + "step": 9393 + }, + { + "epoch": 0.43736760015829784, + "grad_norm": 0.38301829474674676, + "learning_rate": 9.936283338443742e-05, + "loss": 3.2631, + "step": 9394 + }, + { + "epoch": 0.4374141583443909, + "grad_norm": 0.43323676384911564, + "learning_rate": 9.936240225445236e-05, + "loss": 3.3732, + "step": 9395 + }, + { + "epoch": 0.43746071653048396, + "grad_norm": 0.4312157254400382, + "learning_rate": 9.936197097959351e-05, + "loss": 3.2053, + "step": 9396 + }, + { + "epoch": 0.437507274716577, + "grad_norm": 0.42367599691018426, + "learning_rate": 9.936153955986213e-05, + "loss": 3.1953, + "step": 9397 + }, + { + "epoch": 0.43755383290267014, + "grad_norm": 0.40984671714926413, + "learning_rate": 9.93611079952595e-05, + "loss": 3.1969, + 
"step": 9398 + }, + { + "epoch": 0.4376003910887632, + "grad_norm": 0.433070210548196, + "learning_rate": 9.936067628578689e-05, + "loss": 3.1768, + "step": 9399 + }, + { + "epoch": 0.43764694927485626, + "grad_norm": 0.3930674499721621, + "learning_rate": 9.936024443144555e-05, + "loss": 3.2802, + "step": 9400 + }, + { + "epoch": 0.4376935074609493, + "grad_norm": 0.4318436893261842, + "learning_rate": 9.935981243223676e-05, + "loss": 3.3379, + "step": 9401 + }, + { + "epoch": 0.4377400656470424, + "grad_norm": 0.46050669093172025, + "learning_rate": 9.935938028816177e-05, + "loss": 3.1079, + "step": 9402 + }, + { + "epoch": 0.43778662383313544, + "grad_norm": 0.45944561778622267, + "learning_rate": 9.935894799922187e-05, + "loss": 3.2402, + "step": 9403 + }, + { + "epoch": 0.43783318201922855, + "grad_norm": 0.39041704105695535, + "learning_rate": 9.935851556541833e-05, + "loss": 3.3946, + "step": 9404 + }, + { + "epoch": 0.4378797402053216, + "grad_norm": 0.4193162173642464, + "learning_rate": 9.93580829867524e-05, + "loss": 3.2058, + "step": 9405 + }, + { + "epoch": 0.43792629839141467, + "grad_norm": 0.44096714176022606, + "learning_rate": 9.935765026322535e-05, + "loss": 3.384, + "step": 9406 + }, + { + "epoch": 0.43797285657750773, + "grad_norm": 0.40163842213256806, + "learning_rate": 9.935721739483848e-05, + "loss": 3.1682, + "step": 9407 + }, + { + "epoch": 0.4380194147636008, + "grad_norm": 0.42022660559439284, + "learning_rate": 9.935678438159302e-05, + "loss": 3.2679, + "step": 9408 + }, + { + "epoch": 0.4380659729496939, + "grad_norm": 0.39509396947443803, + "learning_rate": 9.935635122349028e-05, + "loss": 3.2655, + "step": 9409 + }, + { + "epoch": 0.43811253113578696, + "grad_norm": 0.4276930212243855, + "learning_rate": 9.935591792053148e-05, + "loss": 3.1883, + "step": 9410 + }, + { + "epoch": 0.43815908932188, + "grad_norm": 0.40500865730250557, + "learning_rate": 9.935548447271794e-05, + "loss": 3.3953, + "step": 9411 + }, + { + "epoch": 0.4382056475079731, + "grad_norm": 0.39403654885610595, + "learning_rate": 9.935505088005093e-05, + "loss": 3.3815, + "step": 9412 + }, + { + "epoch": 0.43825220569406614, + "grad_norm": 0.43395315837880555, + "learning_rate": 9.93546171425317e-05, + "loss": 3.3264, + "step": 9413 + }, + { + "epoch": 0.4382987638801592, + "grad_norm": 0.3911199533880752, + "learning_rate": 9.935418326016152e-05, + "loss": 3.2114, + "step": 9414 + }, + { + "epoch": 0.4383453220662523, + "grad_norm": 0.40939220425985096, + "learning_rate": 9.935374923294167e-05, + "loss": 3.0912, + "step": 9415 + }, + { + "epoch": 0.4383918802523454, + "grad_norm": 0.36543508845721034, + "learning_rate": 9.935331506087344e-05, + "loss": 3.1637, + "step": 9416 + }, + { + "epoch": 0.43843843843843844, + "grad_norm": 0.461114725612583, + "learning_rate": 9.93528807439581e-05, + "loss": 3.2456, + "step": 9417 + }, + { + "epoch": 0.4384849966245315, + "grad_norm": 0.5315009173693847, + "learning_rate": 9.935244628219691e-05, + "loss": 3.2277, + "step": 9418 + }, + { + "epoch": 0.43853155481062456, + "grad_norm": 0.5040611543149639, + "learning_rate": 9.935201167559115e-05, + "loss": 3.2112, + "step": 9419 + }, + { + "epoch": 0.4385781129967177, + "grad_norm": 0.4695216681530379, + "learning_rate": 9.93515769241421e-05, + "loss": 3.2538, + "step": 9420 + }, + { + "epoch": 0.43862467118281073, + "grad_norm": 0.4631329786279166, + "learning_rate": 9.935114202785103e-05, + "loss": 3.2852, + "step": 9421 + }, + { + "epoch": 0.4386712293689038, + "grad_norm": 0.42615228453097026, + 
"learning_rate": 9.935070698671922e-05, + "loss": 3.1893, + "step": 9422 + }, + { + "epoch": 0.43871778755499685, + "grad_norm": 0.4247728044088477, + "learning_rate": 9.935027180074797e-05, + "loss": 3.24, + "step": 9423 + }, + { + "epoch": 0.4387643457410899, + "grad_norm": 0.48043052366402234, + "learning_rate": 9.93498364699385e-05, + "loss": 3.1945, + "step": 9424 + }, + { + "epoch": 0.43881090392718297, + "grad_norm": 0.47003977312796363, + "learning_rate": 9.934940099429213e-05, + "loss": 3.3284, + "step": 9425 + }, + { + "epoch": 0.4388574621132761, + "grad_norm": 0.449937945570888, + "learning_rate": 9.934896537381013e-05, + "loss": 3.303, + "step": 9426 + }, + { + "epoch": 0.43890402029936915, + "grad_norm": 0.46277802794438166, + "learning_rate": 9.934852960849379e-05, + "loss": 3.293, + "step": 9427 + }, + { + "epoch": 0.4389505784854622, + "grad_norm": 0.47422606122503996, + "learning_rate": 9.934809369834437e-05, + "loss": 3.2861, + "step": 9428 + }, + { + "epoch": 0.43899713667155527, + "grad_norm": 0.45962133434662333, + "learning_rate": 9.934765764336316e-05, + "loss": 3.2673, + "step": 9429 + }, + { + "epoch": 0.4390436948576483, + "grad_norm": 0.4545461625311712, + "learning_rate": 9.934722144355143e-05, + "loss": 3.2829, + "step": 9430 + }, + { + "epoch": 0.43909025304374144, + "grad_norm": 0.40559314027970556, + "learning_rate": 9.934678509891046e-05, + "loss": 3.2857, + "step": 9431 + }, + { + "epoch": 0.4391368112298345, + "grad_norm": 0.5332186076872076, + "learning_rate": 9.934634860944156e-05, + "loss": 3.2699, + "step": 9432 + }, + { + "epoch": 0.43918336941592756, + "grad_norm": 0.40560306087220627, + "learning_rate": 9.934591197514597e-05, + "loss": 3.3428, + "step": 9433 + }, + { + "epoch": 0.4392299276020206, + "grad_norm": 0.42604730396250756, + "learning_rate": 9.934547519602499e-05, + "loss": 3.2915, + "step": 9434 + }, + { + "epoch": 0.4392764857881137, + "grad_norm": 0.5087298960836317, + "learning_rate": 9.934503827207991e-05, + "loss": 3.1846, + "step": 9435 + }, + { + "epoch": 0.43932304397420674, + "grad_norm": 0.3999974767714436, + "learning_rate": 9.934460120331202e-05, + "loss": 3.2451, + "step": 9436 + }, + { + "epoch": 0.43936960216029985, + "grad_norm": 0.49512163458058583, + "learning_rate": 9.934416398972255e-05, + "loss": 3.1396, + "step": 9437 + }, + { + "epoch": 0.4394161603463929, + "grad_norm": 0.40413271093917685, + "learning_rate": 9.934372663131283e-05, + "loss": 3.3233, + "step": 9438 + }, + { + "epoch": 0.439462718532486, + "grad_norm": 0.4450480401210957, + "learning_rate": 9.934328912808415e-05, + "loss": 3.2496, + "step": 9439 + }, + { + "epoch": 0.43950927671857903, + "grad_norm": 0.4483035924108621, + "learning_rate": 9.934285148003776e-05, + "loss": 3.2181, + "step": 9440 + }, + { + "epoch": 0.4395558349046721, + "grad_norm": 0.4124974176664263, + "learning_rate": 9.934241368717498e-05, + "loss": 3.1908, + "step": 9441 + }, + { + "epoch": 0.4396023930907652, + "grad_norm": 0.43589842826125574, + "learning_rate": 9.934197574949707e-05, + "loss": 3.2988, + "step": 9442 + }, + { + "epoch": 0.43964895127685827, + "grad_norm": 0.4487711437524832, + "learning_rate": 9.934153766700532e-05, + "loss": 3.4168, + "step": 9443 + }, + { + "epoch": 0.43969550946295133, + "grad_norm": 0.41785872852512795, + "learning_rate": 9.934109943970103e-05, + "loss": 3.313, + "step": 9444 + }, + { + "epoch": 0.4397420676490444, + "grad_norm": 0.4307634902023051, + "learning_rate": 9.934066106758545e-05, + "loss": 3.3219, + "step": 9445 + }, + { + 
"epoch": 0.43978862583513745, + "grad_norm": 0.45300647837842806, + "learning_rate": 9.93402225506599e-05, + "loss": 3.1662, + "step": 9446 + }, + { + "epoch": 0.4398351840212305, + "grad_norm": 0.4360480404881542, + "learning_rate": 9.933978388892566e-05, + "loss": 3.1363, + "step": 9447 + }, + { + "epoch": 0.4398817422073236, + "grad_norm": 0.3645548418606906, + "learning_rate": 9.933934508238402e-05, + "loss": 3.2616, + "step": 9448 + }, + { + "epoch": 0.4399283003934167, + "grad_norm": 0.4326286310690877, + "learning_rate": 9.933890613103624e-05, + "loss": 3.2594, + "step": 9449 + }, + { + "epoch": 0.43997485857950974, + "grad_norm": 0.4218088308816746, + "learning_rate": 9.933846703488367e-05, + "loss": 3.2772, + "step": 9450 + }, + { + "epoch": 0.4400214167656028, + "grad_norm": 0.426488543606951, + "learning_rate": 9.933802779392754e-05, + "loss": 3.2214, + "step": 9451 + }, + { + "epoch": 0.44006797495169586, + "grad_norm": 0.3884493936522934, + "learning_rate": 9.933758840816914e-05, + "loss": 3.1831, + "step": 9452 + }, + { + "epoch": 0.440114533137789, + "grad_norm": 0.40954028039680046, + "learning_rate": 9.933714887760981e-05, + "loss": 3.1922, + "step": 9453 + }, + { + "epoch": 0.44016109132388204, + "grad_norm": 0.41247433098711583, + "learning_rate": 9.933670920225079e-05, + "loss": 3.2303, + "step": 9454 + }, + { + "epoch": 0.4402076495099751, + "grad_norm": 0.4000088958663889, + "learning_rate": 9.93362693820934e-05, + "loss": 3.19, + "step": 9455 + }, + { + "epoch": 0.44025420769606816, + "grad_norm": 0.4196564048385156, + "learning_rate": 9.93358294171389e-05, + "loss": 3.3236, + "step": 9456 + }, + { + "epoch": 0.4403007658821612, + "grad_norm": 0.38878384263427124, + "learning_rate": 9.933538930738861e-05, + "loss": 3.328, + "step": 9457 + }, + { + "epoch": 0.4403473240682543, + "grad_norm": 0.35985890515859015, + "learning_rate": 9.933494905284383e-05, + "loss": 3.2806, + "step": 9458 + }, + { + "epoch": 0.4403938822543474, + "grad_norm": 0.4227699239986006, + "learning_rate": 9.933450865350583e-05, + "loss": 3.2366, + "step": 9459 + }, + { + "epoch": 0.44044044044044045, + "grad_norm": 0.41812124274914014, + "learning_rate": 9.93340681093759e-05, + "loss": 3.346, + "step": 9460 + }, + { + "epoch": 0.4404869986265335, + "grad_norm": 0.3663761108508988, + "learning_rate": 9.933362742045533e-05, + "loss": 3.3018, + "step": 9461 + }, + { + "epoch": 0.44053355681262657, + "grad_norm": 0.377734909098114, + "learning_rate": 9.933318658674544e-05, + "loss": 3.1508, + "step": 9462 + }, + { + "epoch": 0.44058011499871963, + "grad_norm": 0.3874845135009742, + "learning_rate": 9.93327456082475e-05, + "loss": 3.2534, + "step": 9463 + }, + { + "epoch": 0.44062667318481274, + "grad_norm": 0.35972743385902944, + "learning_rate": 9.933230448496281e-05, + "loss": 3.2919, + "step": 9464 + }, + { + "epoch": 0.4406732313709058, + "grad_norm": 0.42651314351203135, + "learning_rate": 9.933186321689267e-05, + "loss": 3.3079, + "step": 9465 + }, + { + "epoch": 0.44071978955699886, + "grad_norm": 0.45227424680447814, + "learning_rate": 9.933142180403837e-05, + "loss": 3.2235, + "step": 9466 + }, + { + "epoch": 0.4407663477430919, + "grad_norm": 0.36907066162300983, + "learning_rate": 9.933098024640121e-05, + "loss": 3.2381, + "step": 9467 + }, + { + "epoch": 0.440812905929185, + "grad_norm": 0.39540466171888916, + "learning_rate": 9.933053854398248e-05, + "loss": 3.2333, + "step": 9468 + }, + { + "epoch": 0.44085946411527804, + "grad_norm": 0.3883292526195298, + "learning_rate": 
9.933009669678349e-05, + "loss": 3.3431, + "step": 9469 + }, + { + "epoch": 0.44090602230137116, + "grad_norm": 0.42999799753173673, + "learning_rate": 9.93296547048055e-05, + "loss": 3.3503, + "step": 9470 + }, + { + "epoch": 0.4409525804874642, + "grad_norm": 0.4577937120395693, + "learning_rate": 9.932921256804986e-05, + "loss": 3.1721, + "step": 9471 + }, + { + "epoch": 0.4409991386735573, + "grad_norm": 0.38169783516918554, + "learning_rate": 9.932877028651782e-05, + "loss": 3.2876, + "step": 9472 + }, + { + "epoch": 0.44104569685965034, + "grad_norm": 0.3921983763976913, + "learning_rate": 9.932832786021071e-05, + "loss": 3.3587, + "step": 9473 + }, + { + "epoch": 0.4410922550457434, + "grad_norm": 0.38859843027623814, + "learning_rate": 9.932788528912982e-05, + "loss": 3.1888, + "step": 9474 + }, + { + "epoch": 0.4411388132318365, + "grad_norm": 0.3945568972671402, + "learning_rate": 9.932744257327644e-05, + "loss": 3.2938, + "step": 9475 + }, + { + "epoch": 0.4411853714179296, + "grad_norm": 0.4049344489473076, + "learning_rate": 9.932699971265189e-05, + "loss": 3.1447, + "step": 9476 + }, + { + "epoch": 0.44123192960402263, + "grad_norm": 0.38130954280299517, + "learning_rate": 9.932655670725743e-05, + "loss": 3.2887, + "step": 9477 + }, + { + "epoch": 0.4412784877901157, + "grad_norm": 0.3385560157533356, + "learning_rate": 9.93261135570944e-05, + "loss": 3.2455, + "step": 9478 + }, + { + "epoch": 0.44132504597620875, + "grad_norm": 0.42651028071545044, + "learning_rate": 9.932567026216408e-05, + "loss": 3.1763, + "step": 9479 + }, + { + "epoch": 0.4413716041623018, + "grad_norm": 0.37256984111409025, + "learning_rate": 9.932522682246779e-05, + "loss": 3.2966, + "step": 9480 + }, + { + "epoch": 0.4414181623483949, + "grad_norm": 0.3903971427030716, + "learning_rate": 9.93247832380068e-05, + "loss": 3.295, + "step": 9481 + }, + { + "epoch": 0.441464720534488, + "grad_norm": 0.39953729321156795, + "learning_rate": 9.932433950878244e-05, + "loss": 3.3697, + "step": 9482 + }, + { + "epoch": 0.44151127872058105, + "grad_norm": 0.4261961937757278, + "learning_rate": 9.932389563479602e-05, + "loss": 3.1959, + "step": 9483 + }, + { + "epoch": 0.4415578369066741, + "grad_norm": 0.4341255517715366, + "learning_rate": 9.932345161604881e-05, + "loss": 3.1512, + "step": 9484 + }, + { + "epoch": 0.44160439509276717, + "grad_norm": 0.3700322670444154, + "learning_rate": 9.932300745254212e-05, + "loss": 3.0888, + "step": 9485 + }, + { + "epoch": 0.4416509532788603, + "grad_norm": 0.39995353069142037, + "learning_rate": 9.932256314427729e-05, + "loss": 3.1698, + "step": 9486 + }, + { + "epoch": 0.44169751146495334, + "grad_norm": 0.3723394445128246, + "learning_rate": 9.932211869125556e-05, + "loss": 3.3127, + "step": 9487 + }, + { + "epoch": 0.4417440696510464, + "grad_norm": 0.4009170142830134, + "learning_rate": 9.93216740934783e-05, + "loss": 3.281, + "step": 9488 + }, + { + "epoch": 0.44179062783713946, + "grad_norm": 0.37274609520900354, + "learning_rate": 9.932122935094677e-05, + "loss": 3.2693, + "step": 9489 + }, + { + "epoch": 0.4418371860232325, + "grad_norm": 0.41629088019207383, + "learning_rate": 9.932078446366229e-05, + "loss": 3.2931, + "step": 9490 + }, + { + "epoch": 0.4418837442093256, + "grad_norm": 0.3976839589591436, + "learning_rate": 9.932033943162616e-05, + "loss": 3.305, + "step": 9491 + }, + { + "epoch": 0.4419303023954187, + "grad_norm": 0.41797945705250716, + "learning_rate": 9.931989425483971e-05, + "loss": 3.3224, + "step": 9492 + }, + { + "epoch": 
0.44197686058151175, + "grad_norm": 0.4395435474207734, + "learning_rate": 9.931944893330423e-05, + "loss": 3.1774, + "step": 9493 + }, + { + "epoch": 0.4420234187676048, + "grad_norm": 0.4496522182250004, + "learning_rate": 9.931900346702101e-05, + "loss": 3.1982, + "step": 9494 + }, + { + "epoch": 0.4420699769536979, + "grad_norm": 0.4085112677355731, + "learning_rate": 9.931855785599138e-05, + "loss": 3.2519, + "step": 9495 + }, + { + "epoch": 0.44211653513979093, + "grad_norm": 0.39871155375539064, + "learning_rate": 9.931811210021664e-05, + "loss": 3.2662, + "step": 9496 + }, + { + "epoch": 0.44216309332588405, + "grad_norm": 0.39080793612833403, + "learning_rate": 9.931766619969811e-05, + "loss": 3.3185, + "step": 9497 + }, + { + "epoch": 0.4422096515119771, + "grad_norm": 0.3934636810074864, + "learning_rate": 9.931722015443708e-05, + "loss": 3.3698, + "step": 9498 + }, + { + "epoch": 0.44225620969807017, + "grad_norm": 0.43078527042098136, + "learning_rate": 9.931677396443485e-05, + "loss": 3.3211, + "step": 9499 + }, + { + "epoch": 0.44230276788416323, + "grad_norm": 0.381807561151839, + "learning_rate": 9.931632762969278e-05, + "loss": 3.2245, + "step": 9500 + }, + { + "epoch": 0.4423493260702563, + "grad_norm": 0.4390130136031024, + "learning_rate": 9.931588115021213e-05, + "loss": 3.1203, + "step": 9501 + }, + { + "epoch": 0.44239588425634935, + "grad_norm": 0.43626314251802384, + "learning_rate": 9.931543452599423e-05, + "loss": 3.3697, + "step": 9502 + }, + { + "epoch": 0.44244244244244246, + "grad_norm": 0.4098957516195662, + "learning_rate": 9.93149877570404e-05, + "loss": 3.2534, + "step": 9503 + }, + { + "epoch": 0.4424890006285355, + "grad_norm": 0.3899828817049292, + "learning_rate": 9.931454084335192e-05, + "loss": 3.2821, + "step": 9504 + }, + { + "epoch": 0.4425355588146286, + "grad_norm": 0.46691608340834667, + "learning_rate": 9.931409378493013e-05, + "loss": 3.1783, + "step": 9505 + }, + { + "epoch": 0.44258211700072164, + "grad_norm": 0.4568944146697679, + "learning_rate": 9.931364658177634e-05, + "loss": 3.2765, + "step": 9506 + }, + { + "epoch": 0.4426286751868147, + "grad_norm": 0.38655029281335185, + "learning_rate": 9.931319923389184e-05, + "loss": 3.1878, + "step": 9507 + }, + { + "epoch": 0.4426752333729078, + "grad_norm": 0.40329850503009196, + "learning_rate": 9.931275174127796e-05, + "loss": 3.2959, + "step": 9508 + }, + { + "epoch": 0.4427217915590009, + "grad_norm": 0.42810075411163073, + "learning_rate": 9.931230410393602e-05, + "loss": 3.3097, + "step": 9509 + }, + { + "epoch": 0.44276834974509394, + "grad_norm": 0.38250083515706956, + "learning_rate": 9.931185632186732e-05, + "loss": 3.3368, + "step": 9510 + }, + { + "epoch": 0.442814907931187, + "grad_norm": 0.36874991656255823, + "learning_rate": 9.931140839507318e-05, + "loss": 3.3309, + "step": 9511 + }, + { + "epoch": 0.44286146611728006, + "grad_norm": 0.3739134407522754, + "learning_rate": 9.931096032355491e-05, + "loss": 3.2264, + "step": 9512 + }, + { + "epoch": 0.4429080243033731, + "grad_norm": 0.40277453095255644, + "learning_rate": 9.931051210731384e-05, + "loss": 3.2362, + "step": 9513 + }, + { + "epoch": 0.44295458248946623, + "grad_norm": 0.48096180563243, + "learning_rate": 9.931006374635128e-05, + "loss": 3.2535, + "step": 9514 + }, + { + "epoch": 0.4430011406755593, + "grad_norm": 0.42807516693269454, + "learning_rate": 9.930961524066853e-05, + "loss": 3.1247, + "step": 9515 + }, + { + "epoch": 0.44304769886165235, + "grad_norm": 0.37817381804778216, + "learning_rate": 
9.930916659026691e-05, + "loss": 3.2194, + "step": 9516 + }, + { + "epoch": 0.4430942570477454, + "grad_norm": 0.4166033291957374, + "learning_rate": 9.930871779514776e-05, + "loss": 3.2332, + "step": 9517 + }, + { + "epoch": 0.44314081523383847, + "grad_norm": 0.43614111462470956, + "learning_rate": 9.930826885531237e-05, + "loss": 3.2165, + "step": 9518 + }, + { + "epoch": 0.4431873734199316, + "grad_norm": 0.4106849570500026, + "learning_rate": 9.930781977076207e-05, + "loss": 3.1814, + "step": 9519 + }, + { + "epoch": 0.44323393160602464, + "grad_norm": 0.46528246546906565, + "learning_rate": 9.930737054149816e-05, + "loss": 3.352, + "step": 9520 + }, + { + "epoch": 0.4432804897921177, + "grad_norm": 0.45381895190746974, + "learning_rate": 9.930692116752201e-05, + "loss": 3.1588, + "step": 9521 + }, + { + "epoch": 0.44332704797821076, + "grad_norm": 0.42560895828873463, + "learning_rate": 9.930647164883488e-05, + "loss": 3.3496, + "step": 9522 + }, + { + "epoch": 0.4433736061643038, + "grad_norm": 0.43787208643006303, + "learning_rate": 9.930602198543813e-05, + "loss": 3.177, + "step": 9523 + }, + { + "epoch": 0.4434201643503969, + "grad_norm": 0.42265279804517664, + "learning_rate": 9.930557217733305e-05, + "loss": 3.2498, + "step": 9524 + }, + { + "epoch": 0.44346672253649, + "grad_norm": 0.47286975653915225, + "learning_rate": 9.930512222452098e-05, + "loss": 3.3242, + "step": 9525 + }, + { + "epoch": 0.44351328072258306, + "grad_norm": 0.42041507386955723, + "learning_rate": 9.930467212700323e-05, + "loss": 3.2084, + "step": 9526 + }, + { + "epoch": 0.4435598389086761, + "grad_norm": 0.43102455701287684, + "learning_rate": 9.930422188478112e-05, + "loss": 3.3537, + "step": 9527 + }, + { + "epoch": 0.4436063970947692, + "grad_norm": 0.40720657357718565, + "learning_rate": 9.930377149785598e-05, + "loss": 3.32, + "step": 9528 + }, + { + "epoch": 0.44365295528086224, + "grad_norm": 0.4009069244997484, + "learning_rate": 9.930332096622913e-05, + "loss": 3.3615, + "step": 9529 + }, + { + "epoch": 0.44369951346695535, + "grad_norm": 0.457443858831759, + "learning_rate": 9.930287028990189e-05, + "loss": 3.2521, + "step": 9530 + }, + { + "epoch": 0.4437460716530484, + "grad_norm": 0.4179050949668171, + "learning_rate": 9.930241946887557e-05, + "loss": 3.2801, + "step": 9531 + }, + { + "epoch": 0.4437926298391415, + "grad_norm": 0.38847586482196766, + "learning_rate": 9.930196850315153e-05, + "loss": 3.2979, + "step": 9532 + }, + { + "epoch": 0.44383918802523453, + "grad_norm": 0.4040110810591771, + "learning_rate": 9.930151739273105e-05, + "loss": 3.0889, + "step": 9533 + }, + { + "epoch": 0.4438857462113276, + "grad_norm": 0.46097814976290075, + "learning_rate": 9.930106613761549e-05, + "loss": 3.3239, + "step": 9534 + }, + { + "epoch": 0.44393230439742065, + "grad_norm": 0.44192282301987096, + "learning_rate": 9.930061473780614e-05, + "loss": 3.3602, + "step": 9535 + }, + { + "epoch": 0.44397886258351377, + "grad_norm": 0.3983462935456399, + "learning_rate": 9.930016319330436e-05, + "loss": 3.1513, + "step": 9536 + }, + { + "epoch": 0.4440254207696068, + "grad_norm": 0.39325159892965517, + "learning_rate": 9.929971150411143e-05, + "loss": 3.1367, + "step": 9537 + }, + { + "epoch": 0.4440719789556999, + "grad_norm": 0.4092716490421481, + "learning_rate": 9.929925967022873e-05, + "loss": 3.2118, + "step": 9538 + }, + { + "epoch": 0.44411853714179295, + "grad_norm": 0.3628041643526646, + "learning_rate": 9.929880769165756e-05, + "loss": 3.2605, + "step": 9539 + }, + { + "epoch": 
0.444165095327886, + "grad_norm": 0.4962734964119384, + "learning_rate": 9.929835556839924e-05, + "loss": 3.3364, + "step": 9540 + }, + { + "epoch": 0.4442116535139791, + "grad_norm": 0.46311076852354577, + "learning_rate": 9.92979033004551e-05, + "loss": 3.1777, + "step": 9541 + }, + { + "epoch": 0.4442582117000722, + "grad_norm": 0.4238665960605454, + "learning_rate": 9.929745088782647e-05, + "loss": 3.3308, + "step": 9542 + }, + { + "epoch": 0.44430476988616524, + "grad_norm": 0.41672726668949317, + "learning_rate": 9.929699833051468e-05, + "loss": 3.3098, + "step": 9543 + }, + { + "epoch": 0.4443513280722583, + "grad_norm": 0.49813255172130694, + "learning_rate": 9.929654562852105e-05, + "loss": 3.4011, + "step": 9544 + }, + { + "epoch": 0.44439788625835136, + "grad_norm": 0.418572276069235, + "learning_rate": 9.929609278184692e-05, + "loss": 3.1701, + "step": 9545 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.4439772536011403, + "learning_rate": 9.929563979049362e-05, + "loss": 3.339, + "step": 9546 + }, + { + "epoch": 0.44449100263053754, + "grad_norm": 0.46830521972680306, + "learning_rate": 9.929518665446247e-05, + "loss": 3.2049, + "step": 9547 + }, + { + "epoch": 0.4445375608166306, + "grad_norm": 0.44253220191425713, + "learning_rate": 9.92947333737548e-05, + "loss": 3.2506, + "step": 9548 + }, + { + "epoch": 0.44458411900272365, + "grad_norm": 0.44211925383669537, + "learning_rate": 9.929427994837195e-05, + "loss": 3.2825, + "step": 9549 + }, + { + "epoch": 0.4446306771888167, + "grad_norm": 0.5184715847039115, + "learning_rate": 9.929382637831524e-05, + "loss": 3.2625, + "step": 9550 + }, + { + "epoch": 0.4446772353749098, + "grad_norm": 0.46870070498730826, + "learning_rate": 9.9293372663586e-05, + "loss": 3.3351, + "step": 9551 + }, + { + "epoch": 0.4447237935610029, + "grad_norm": 0.5370853265596371, + "learning_rate": 9.929291880418559e-05, + "loss": 3.2083, + "step": 9552 + }, + { + "epoch": 0.44477035174709595, + "grad_norm": 0.4730434672298445, + "learning_rate": 9.929246480011529e-05, + "loss": 3.3277, + "step": 9553 + }, + { + "epoch": 0.444816909933189, + "grad_norm": 0.45318268724413596, + "learning_rate": 9.929201065137649e-05, + "loss": 3.2028, + "step": 9554 + }, + { + "epoch": 0.44486346811928207, + "grad_norm": 0.5066555300360237, + "learning_rate": 9.929155635797047e-05, + "loss": 3.3416, + "step": 9555 + }, + { + "epoch": 0.44491002630537513, + "grad_norm": 0.4467982059163886, + "learning_rate": 9.929110191989861e-05, + "loss": 3.1349, + "step": 9556 + }, + { + "epoch": 0.4449565844914682, + "grad_norm": 0.4626399580275592, + "learning_rate": 9.92906473371622e-05, + "loss": 3.1584, + "step": 9557 + }, + { + "epoch": 0.4450031426775613, + "grad_norm": 0.43615632036501123, + "learning_rate": 9.92901926097626e-05, + "loss": 3.2198, + "step": 9558 + }, + { + "epoch": 0.44504970086365436, + "grad_norm": 0.4586138088216399, + "learning_rate": 9.928973773770116e-05, + "loss": 3.3423, + "step": 9559 + }, + { + "epoch": 0.4450962590497474, + "grad_norm": 0.40340130664589235, + "learning_rate": 9.928928272097919e-05, + "loss": 3.2292, + "step": 9560 + }, + { + "epoch": 0.4451428172358405, + "grad_norm": 0.4979694067578085, + "learning_rate": 9.928882755959801e-05, + "loss": 3.3325, + "step": 9561 + }, + { + "epoch": 0.44518937542193354, + "grad_norm": 0.5141697365406728, + "learning_rate": 9.928837225355901e-05, + "loss": 3.3374, + "step": 9562 + }, + { + "epoch": 0.44523593360802666, + "grad_norm": 0.37799086101084134, + "learning_rate": 
9.928791680286347e-05, + "loss": 3.2568, + "step": 9563 + }, + { + "epoch": 0.4452824917941197, + "grad_norm": 0.4337224039760649, + "learning_rate": 9.928746120751275e-05, + "loss": 3.2778, + "step": 9564 + }, + { + "epoch": 0.4453290499802128, + "grad_norm": 0.4884061967419503, + "learning_rate": 9.928700546750819e-05, + "loss": 3.1036, + "step": 9565 + }, + { + "epoch": 0.44537560816630584, + "grad_norm": 0.3997925550090305, + "learning_rate": 9.928654958285114e-05, + "loss": 3.224, + "step": 9566 + }, + { + "epoch": 0.4454221663523989, + "grad_norm": 0.4240640359085046, + "learning_rate": 9.92860935535429e-05, + "loss": 3.239, + "step": 9567 + }, + { + "epoch": 0.44546872453849196, + "grad_norm": 0.44098161616368553, + "learning_rate": 9.928563737958483e-05, + "loss": 3.2579, + "step": 9568 + }, + { + "epoch": 0.44551528272458507, + "grad_norm": 0.3752953202939375, + "learning_rate": 9.928518106097829e-05, + "loss": 3.3088, + "step": 9569 + }, + { + "epoch": 0.44556184091067813, + "grad_norm": 0.44466425221722, + "learning_rate": 9.928472459772458e-05, + "loss": 3.1881, + "step": 9570 + }, + { + "epoch": 0.4456083990967712, + "grad_norm": 0.4427143943901058, + "learning_rate": 9.928426798982507e-05, + "loss": 3.3313, + "step": 9571 + }, + { + "epoch": 0.44565495728286425, + "grad_norm": 0.45494386644877993, + "learning_rate": 9.928381123728108e-05, + "loss": 3.2081, + "step": 9572 + }, + { + "epoch": 0.4457015154689573, + "grad_norm": 0.4224203151740217, + "learning_rate": 9.928335434009396e-05, + "loss": 3.2816, + "step": 9573 + }, + { + "epoch": 0.4457480736550504, + "grad_norm": 0.43841532819927376, + "learning_rate": 9.928289729826506e-05, + "loss": 3.3472, + "step": 9574 + }, + { + "epoch": 0.4457946318411435, + "grad_norm": 0.4094679627935324, + "learning_rate": 9.928244011179571e-05, + "loss": 3.2551, + "step": 9575 + }, + { + "epoch": 0.44584119002723654, + "grad_norm": 0.3962073730115091, + "learning_rate": 9.928198278068725e-05, + "loss": 3.2258, + "step": 9576 + }, + { + "epoch": 0.4458877482133296, + "grad_norm": 0.4262868694704869, + "learning_rate": 9.928152530494101e-05, + "loss": 3.2619, + "step": 9577 + }, + { + "epoch": 0.44593430639942266, + "grad_norm": 0.3791965218753327, + "learning_rate": 9.928106768455837e-05, + "loss": 3.2025, + "step": 9578 + }, + { + "epoch": 0.4459808645855157, + "grad_norm": 0.39984977366872954, + "learning_rate": 9.928060991954065e-05, + "loss": 3.1248, + "step": 9579 + }, + { + "epoch": 0.44602742277160884, + "grad_norm": 0.4471038300596178, + "learning_rate": 9.928015200988918e-05, + "loss": 3.3359, + "step": 9580 + }, + { + "epoch": 0.4460739809577019, + "grad_norm": 0.49924297487087305, + "learning_rate": 9.927969395560534e-05, + "loss": 3.1915, + "step": 9581 + }, + { + "epoch": 0.44612053914379496, + "grad_norm": 0.44553535328994265, + "learning_rate": 9.927923575669044e-05, + "loss": 3.1371, + "step": 9582 + }, + { + "epoch": 0.446167097329888, + "grad_norm": 0.43412503158680504, + "learning_rate": 9.927877741314584e-05, + "loss": 3.2663, + "step": 9583 + }, + { + "epoch": 0.4462136555159811, + "grad_norm": 0.47421174873077915, + "learning_rate": 9.927831892497288e-05, + "loss": 3.2408, + "step": 9584 + }, + { + "epoch": 0.4462602137020742, + "grad_norm": 0.41763845870019906, + "learning_rate": 9.927786029217291e-05, + "loss": 3.2366, + "step": 9585 + }, + { + "epoch": 0.44630677188816725, + "grad_norm": 0.40261677920887284, + "learning_rate": 9.927740151474726e-05, + "loss": 3.1444, + "step": 9586 + }, + { + "epoch": 
0.4463533300742603, + "grad_norm": 0.41283500377139704, + "learning_rate": 9.927694259269733e-05, + "loss": 3.2879, + "step": 9587 + }, + { + "epoch": 0.4463998882603534, + "grad_norm": 0.4313326589377955, + "learning_rate": 9.927648352602438e-05, + "loss": 3.2274, + "step": 9588 + }, + { + "epoch": 0.44644644644644643, + "grad_norm": 0.36160385224236824, + "learning_rate": 9.927602431472984e-05, + "loss": 3.1859, + "step": 9589 + }, + { + "epoch": 0.4464930046325395, + "grad_norm": 0.4320586852939689, + "learning_rate": 9.927556495881503e-05, + "loss": 3.2117, + "step": 9590 + }, + { + "epoch": 0.4465395628186326, + "grad_norm": 0.4366207355836341, + "learning_rate": 9.927510545828127e-05, + "loss": 3.2799, + "step": 9591 + }, + { + "epoch": 0.44658612100472567, + "grad_norm": 0.44930259909066045, + "learning_rate": 9.927464581312993e-05, + "loss": 3.295, + "step": 9592 + }, + { + "epoch": 0.4466326791908187, + "grad_norm": 0.42954283505051544, + "learning_rate": 9.927418602336236e-05, + "loss": 3.3124, + "step": 9593 + }, + { + "epoch": 0.4466792373769118, + "grad_norm": 0.47440830953687163, + "learning_rate": 9.927372608897991e-05, + "loss": 3.1581, + "step": 9594 + }, + { + "epoch": 0.44672579556300485, + "grad_norm": 0.4406293394921645, + "learning_rate": 9.927326600998393e-05, + "loss": 3.2285, + "step": 9595 + }, + { + "epoch": 0.44677235374909796, + "grad_norm": 0.44706342919184383, + "learning_rate": 9.927280578637577e-05, + "loss": 3.2232, + "step": 9596 + }, + { + "epoch": 0.446818911935191, + "grad_norm": 0.4882988201484236, + "learning_rate": 9.927234541815677e-05, + "loss": 3.2548, + "step": 9597 + }, + { + "epoch": 0.4468654701212841, + "grad_norm": 0.42780496485827896, + "learning_rate": 9.92718849053283e-05, + "loss": 3.2828, + "step": 9598 + }, + { + "epoch": 0.44691202830737714, + "grad_norm": 0.4164872767493489, + "learning_rate": 9.92714242478917e-05, + "loss": 3.2906, + "step": 9599 + }, + { + "epoch": 0.4469585864934702, + "grad_norm": 0.5267749636747295, + "learning_rate": 9.927096344584833e-05, + "loss": 3.2727, + "step": 9600 + }, + { + "epoch": 0.44700514467956326, + "grad_norm": 0.3945941047286713, + "learning_rate": 9.927050249919953e-05, + "loss": 3.1902, + "step": 9601 + }, + { + "epoch": 0.4470517028656564, + "grad_norm": 0.39816069587836234, + "learning_rate": 9.927004140794667e-05, + "loss": 3.2542, + "step": 9602 + }, + { + "epoch": 0.44709826105174943, + "grad_norm": 0.4235785305538238, + "learning_rate": 9.926958017209108e-05, + "loss": 3.2278, + "step": 9603 + }, + { + "epoch": 0.4471448192378425, + "grad_norm": 0.456049012649488, + "learning_rate": 9.926911879163412e-05, + "loss": 3.2522, + "step": 9604 + }, + { + "epoch": 0.44719137742393555, + "grad_norm": 0.439925290703304, + "learning_rate": 9.926865726657715e-05, + "loss": 3.3277, + "step": 9605 + }, + { + "epoch": 0.4472379356100286, + "grad_norm": 0.42637552750952906, + "learning_rate": 9.926819559692154e-05, + "loss": 3.3339, + "step": 9606 + }, + { + "epoch": 0.44728449379612173, + "grad_norm": 0.4033301039949456, + "learning_rate": 9.926773378266863e-05, + "loss": 3.249, + "step": 9607 + }, + { + "epoch": 0.4473310519822148, + "grad_norm": 0.48704137557952243, + "learning_rate": 9.926727182381976e-05, + "loss": 3.363, + "step": 9608 + }, + { + "epoch": 0.44737761016830785, + "grad_norm": 0.46756778752444556, + "learning_rate": 9.92668097203763e-05, + "loss": 3.2431, + "step": 9609 + }, + { + "epoch": 0.4474241683544009, + "grad_norm": 0.45219542564453463, + "learning_rate": 
9.926634747233963e-05, + "loss": 3.2119, + "step": 9610 + }, + { + "epoch": 0.44747072654049397, + "grad_norm": 0.48413990838326765, + "learning_rate": 9.926588507971107e-05, + "loss": 3.2526, + "step": 9611 + }, + { + "epoch": 0.44751728472658703, + "grad_norm": 0.4663912608260257, + "learning_rate": 9.926542254249198e-05, + "loss": 3.2517, + "step": 9612 + }, + { + "epoch": 0.44756384291268014, + "grad_norm": 0.3945019757350919, + "learning_rate": 9.926495986068374e-05, + "loss": 3.1821, + "step": 9613 + }, + { + "epoch": 0.4476104010987732, + "grad_norm": 0.44731101361541675, + "learning_rate": 9.92644970342877e-05, + "loss": 3.2423, + "step": 9614 + }, + { + "epoch": 0.44765695928486626, + "grad_norm": 0.5269607404094392, + "learning_rate": 9.92640340633052e-05, + "loss": 3.2833, + "step": 9615 + }, + { + "epoch": 0.4477035174709593, + "grad_norm": 0.4370887264746633, + "learning_rate": 9.926357094773763e-05, + "loss": 3.3427, + "step": 9616 + }, + { + "epoch": 0.4477500756570524, + "grad_norm": 0.497463022570099, + "learning_rate": 9.926310768758632e-05, + "loss": 3.3367, + "step": 9617 + }, + { + "epoch": 0.4477966338431455, + "grad_norm": 0.4472285586095542, + "learning_rate": 9.926264428285264e-05, + "loss": 3.1738, + "step": 9618 + }, + { + "epoch": 0.44784319202923856, + "grad_norm": 0.417573290968415, + "learning_rate": 9.926218073353797e-05, + "loss": 3.1527, + "step": 9619 + }, + { + "epoch": 0.4478897502153316, + "grad_norm": 0.4701324026084087, + "learning_rate": 9.926171703964364e-05, + "loss": 3.294, + "step": 9620 + }, + { + "epoch": 0.4479363084014247, + "grad_norm": 0.5041810387393256, + "learning_rate": 9.926125320117102e-05, + "loss": 3.3354, + "step": 9621 + }, + { + "epoch": 0.44798286658751774, + "grad_norm": 0.4092862011924225, + "learning_rate": 9.926078921812149e-05, + "loss": 3.207, + "step": 9622 + }, + { + "epoch": 0.4480294247736108, + "grad_norm": 0.4945358426878678, + "learning_rate": 9.926032509049639e-05, + "loss": 3.3204, + "step": 9623 + }, + { + "epoch": 0.4480759829597039, + "grad_norm": 0.514447920095055, + "learning_rate": 9.925986081829708e-05, + "loss": 3.2758, + "step": 9624 + }, + { + "epoch": 0.44812254114579697, + "grad_norm": 0.49354576338782913, + "learning_rate": 9.925939640152493e-05, + "loss": 3.3095, + "step": 9625 + }, + { + "epoch": 0.44816909933189003, + "grad_norm": 0.42138841886608613, + "learning_rate": 9.925893184018132e-05, + "loss": 3.3166, + "step": 9626 + }, + { + "epoch": 0.4482156575179831, + "grad_norm": 0.5143766512038366, + "learning_rate": 9.925846713426759e-05, + "loss": 3.1407, + "step": 9627 + }, + { + "epoch": 0.44826221570407615, + "grad_norm": 0.48361360139176124, + "learning_rate": 9.92580022837851e-05, + "loss": 3.2757, + "step": 9628 + }, + { + "epoch": 0.44830877389016927, + "grad_norm": 0.47018957207846035, + "learning_rate": 9.925753728873523e-05, + "loss": 3.349, + "step": 9629 + }, + { + "epoch": 0.4483553320762623, + "grad_norm": 0.45433965271866494, + "learning_rate": 9.925707214911936e-05, + "loss": 3.1397, + "step": 9630 + }, + { + "epoch": 0.4484018902623554, + "grad_norm": 0.4222382090567566, + "learning_rate": 9.925660686493882e-05, + "loss": 3.2909, + "step": 9631 + }, + { + "epoch": 0.44844844844844844, + "grad_norm": 0.4350834854183084, + "learning_rate": 9.925614143619499e-05, + "loss": 3.2338, + "step": 9632 + }, + { + "epoch": 0.4484950066345415, + "grad_norm": 0.4399869083137938, + "learning_rate": 9.925567586288925e-05, + "loss": 3.4046, + "step": 9633 + }, + { + "epoch": 
0.44854156482063456, + "grad_norm": 0.4124664030588599, + "learning_rate": 9.925521014502295e-05, + "loss": 3.3125, + "step": 9634 + }, + { + "epoch": 0.4485881230067277, + "grad_norm": 0.39144620816711223, + "learning_rate": 9.925474428259745e-05, + "loss": 3.2085, + "step": 9635 + }, + { + "epoch": 0.44863468119282074, + "grad_norm": 0.3745037834449804, + "learning_rate": 9.925427827561414e-05, + "loss": 3.2955, + "step": 9636 + }, + { + "epoch": 0.4486812393789138, + "grad_norm": 0.4232517086392856, + "learning_rate": 9.925381212407436e-05, + "loss": 3.3481, + "step": 9637 + }, + { + "epoch": 0.44872779756500686, + "grad_norm": 0.43723125818899844, + "learning_rate": 9.925334582797951e-05, + "loss": 3.257, + "step": 9638 + }, + { + "epoch": 0.4487743557510999, + "grad_norm": 0.4677591721849521, + "learning_rate": 9.925287938733094e-05, + "loss": 3.1267, + "step": 9639 + }, + { + "epoch": 0.44882091393719303, + "grad_norm": 0.4136744423969454, + "learning_rate": 9.925241280213002e-05, + "loss": 3.1135, + "step": 9640 + }, + { + "epoch": 0.4488674721232861, + "grad_norm": 0.4584161489773946, + "learning_rate": 9.925194607237811e-05, + "loss": 3.2561, + "step": 9641 + }, + { + "epoch": 0.44891403030937915, + "grad_norm": 0.48499775647736804, + "learning_rate": 9.925147919807661e-05, + "loss": 3.2969, + "step": 9642 + }, + { + "epoch": 0.4489605884954722, + "grad_norm": 0.4587628034204384, + "learning_rate": 9.925101217922685e-05, + "loss": 3.3232, + "step": 9643 + }, + { + "epoch": 0.4490071466815653, + "grad_norm": 0.4042430565004223, + "learning_rate": 9.925054501583024e-05, + "loss": 3.2635, + "step": 9644 + }, + { + "epoch": 0.44905370486765833, + "grad_norm": 0.407469079221476, + "learning_rate": 9.925007770788813e-05, + "loss": 3.3253, + "step": 9645 + }, + { + "epoch": 0.44910026305375145, + "grad_norm": 0.4865085541733622, + "learning_rate": 9.924961025540187e-05, + "loss": 3.2647, + "step": 9646 + }, + { + "epoch": 0.4491468212398445, + "grad_norm": 0.4612032436049239, + "learning_rate": 9.924914265837288e-05, + "loss": 3.3375, + "step": 9647 + }, + { + "epoch": 0.44919337942593757, + "grad_norm": 0.44553886318591784, + "learning_rate": 9.92486749168025e-05, + "loss": 3.3111, + "step": 9648 + }, + { + "epoch": 0.4492399376120306, + "grad_norm": 0.4284559874279479, + "learning_rate": 9.92482070306921e-05, + "loss": 3.2465, + "step": 9649 + }, + { + "epoch": 0.4492864957981237, + "grad_norm": 0.4441652413280844, + "learning_rate": 9.924773900004308e-05, + "loss": 3.2648, + "step": 9650 + }, + { + "epoch": 0.4493330539842168, + "grad_norm": 0.39877947900178967, + "learning_rate": 9.92472708248568e-05, + "loss": 3.2449, + "step": 9651 + }, + { + "epoch": 0.44937961217030986, + "grad_norm": 0.42368002262219445, + "learning_rate": 9.924680250513463e-05, + "loss": 3.1383, + "step": 9652 + }, + { + "epoch": 0.4494261703564029, + "grad_norm": 0.4205389519111894, + "learning_rate": 9.924633404087793e-05, + "loss": 3.2896, + "step": 9653 + }, + { + "epoch": 0.449472728542496, + "grad_norm": 0.3672680746490663, + "learning_rate": 9.924586543208811e-05, + "loss": 3.2386, + "step": 9654 + }, + { + "epoch": 0.44951928672858904, + "grad_norm": 0.3610916453235676, + "learning_rate": 9.924539667876651e-05, + "loss": 3.2312, + "step": 9655 + }, + { + "epoch": 0.4495658449146821, + "grad_norm": 0.3997453886413843, + "learning_rate": 9.924492778091454e-05, + "loss": 3.3348, + "step": 9656 + }, + { + "epoch": 0.4496124031007752, + "grad_norm": 0.41029080985640043, + "learning_rate": 
9.924445873853354e-05, + "loss": 3.3579, + "step": 9657 + }, + { + "epoch": 0.4496589612868683, + "grad_norm": 0.41222212060906843, + "learning_rate": 9.924398955162493e-05, + "loss": 3.2388, + "step": 9658 + }, + { + "epoch": 0.44970551947296133, + "grad_norm": 0.36334296794658155, + "learning_rate": 9.924352022019003e-05, + "loss": 3.2176, + "step": 9659 + }, + { + "epoch": 0.4497520776590544, + "grad_norm": 0.39929186292178004, + "learning_rate": 9.924305074423028e-05, + "loss": 3.4103, + "step": 9660 + }, + { + "epoch": 0.44979863584514745, + "grad_norm": 0.4029871031333854, + "learning_rate": 9.9242581123747e-05, + "loss": 3.2782, + "step": 9661 + }, + { + "epoch": 0.44984519403124057, + "grad_norm": 0.4051362596985078, + "learning_rate": 9.924211135874161e-05, + "loss": 3.3269, + "step": 9662 + }, + { + "epoch": 0.44989175221733363, + "grad_norm": 0.39270493826559344, + "learning_rate": 9.924164144921549e-05, + "loss": 3.2836, + "step": 9663 + }, + { + "epoch": 0.4499383104034267, + "grad_norm": 0.4203326487066958, + "learning_rate": 9.924117139516997e-05, + "loss": 3.2203, + "step": 9664 + }, + { + "epoch": 0.44998486858951975, + "grad_norm": 0.4679896781886957, + "learning_rate": 9.92407011966065e-05, + "loss": 3.2349, + "step": 9665 + }, + { + "epoch": 0.4500314267756128, + "grad_norm": 0.4386274866237812, + "learning_rate": 9.924023085352639e-05, + "loss": 3.2422, + "step": 9666 + }, + { + "epoch": 0.45007798496170587, + "grad_norm": 0.4614053218039683, + "learning_rate": 9.923976036593107e-05, + "loss": 3.2261, + "step": 9667 + }, + { + "epoch": 0.450124543147799, + "grad_norm": 0.44110546340117596, + "learning_rate": 9.923928973382191e-05, + "loss": 3.2418, + "step": 9668 + }, + { + "epoch": 0.45017110133389204, + "grad_norm": 0.474236138490133, + "learning_rate": 9.923881895720026e-05, + "loss": 3.1821, + "step": 9669 + }, + { + "epoch": 0.4502176595199851, + "grad_norm": 0.3505209455983327, + "learning_rate": 9.923834803606755e-05, + "loss": 3.1561, + "step": 9670 + }, + { + "epoch": 0.45026421770607816, + "grad_norm": 0.4694239797349111, + "learning_rate": 9.923787697042512e-05, + "loss": 3.2564, + "step": 9671 + }, + { + "epoch": 0.4503107758921712, + "grad_norm": 0.4555705204811445, + "learning_rate": 9.92374057602744e-05, + "loss": 3.302, + "step": 9672 + }, + { + "epoch": 0.45035733407826434, + "grad_norm": 0.44174471258131626, + "learning_rate": 9.923693440561672e-05, + "loss": 3.3578, + "step": 9673 + }, + { + "epoch": 0.4504038922643574, + "grad_norm": 0.44777492505093835, + "learning_rate": 9.92364629064535e-05, + "loss": 3.2824, + "step": 9674 + }, + { + "epoch": 0.45045045045045046, + "grad_norm": 0.41098237139249216, + "learning_rate": 9.923599126278611e-05, + "loss": 3.2189, + "step": 9675 + }, + { + "epoch": 0.4504970086365435, + "grad_norm": 0.40251450997955496, + "learning_rate": 9.923551947461593e-05, + "loss": 3.26, + "step": 9676 + }, + { + "epoch": 0.4505435668226366, + "grad_norm": 0.3829157085668652, + "learning_rate": 9.923504754194437e-05, + "loss": 3.2192, + "step": 9677 + }, + { + "epoch": 0.45059012500872964, + "grad_norm": 0.40409524057047774, + "learning_rate": 9.923457546477278e-05, + "loss": 3.4334, + "step": 9678 + }, + { + "epoch": 0.45063668319482275, + "grad_norm": 0.4076410895945949, + "learning_rate": 9.923410324310256e-05, + "loss": 3.3588, + "step": 9679 + }, + { + "epoch": 0.4506832413809158, + "grad_norm": 0.33964104615578444, + "learning_rate": 9.92336308769351e-05, + "loss": 3.1594, + "step": 9680 + }, + { + "epoch": 
0.45072979956700887, + "grad_norm": 0.4437783586367439, + "learning_rate": 9.923315836627179e-05, + "loss": 3.2383, + "step": 9681 + }, + { + "epoch": 0.45077635775310193, + "grad_norm": 0.40260349993072514, + "learning_rate": 9.9232685711114e-05, + "loss": 3.2854, + "step": 9682 + }, + { + "epoch": 0.450822915939195, + "grad_norm": 0.39090262510934126, + "learning_rate": 9.923221291146315e-05, + "loss": 3.2903, + "step": 9683 + }, + { + "epoch": 0.4508694741252881, + "grad_norm": 0.4081009760387286, + "learning_rate": 9.923173996732058e-05, + "loss": 3.2722, + "step": 9684 + }, + { + "epoch": 0.45091603231138117, + "grad_norm": 0.3721064791377329, + "learning_rate": 9.923126687868771e-05, + "loss": 3.33, + "step": 9685 + }, + { + "epoch": 0.4509625904974742, + "grad_norm": 0.3769502472960612, + "learning_rate": 9.923079364556592e-05, + "loss": 3.259, + "step": 9686 + }, + { + "epoch": 0.4510091486835673, + "grad_norm": 0.3582569945867342, + "learning_rate": 9.92303202679566e-05, + "loss": 3.2857, + "step": 9687 + }, + { + "epoch": 0.45105570686966034, + "grad_norm": 0.40223107130043406, + "learning_rate": 9.922984674586115e-05, + "loss": 3.2214, + "step": 9688 + }, + { + "epoch": 0.4511022650557534, + "grad_norm": 0.4345142413682543, + "learning_rate": 9.922937307928094e-05, + "loss": 3.3067, + "step": 9689 + }, + { + "epoch": 0.4511488232418465, + "grad_norm": 0.42350778250308835, + "learning_rate": 9.922889926821737e-05, + "loss": 3.2766, + "step": 9690 + }, + { + "epoch": 0.4511953814279396, + "grad_norm": 0.3395748408723717, + "learning_rate": 9.922842531267185e-05, + "loss": 3.2768, + "step": 9691 + }, + { + "epoch": 0.45124193961403264, + "grad_norm": 0.40222015423634644, + "learning_rate": 9.922795121264572e-05, + "loss": 3.1786, + "step": 9692 + }, + { + "epoch": 0.4512884978001257, + "grad_norm": 0.45363751150116954, + "learning_rate": 9.922747696814043e-05, + "loss": 3.27, + "step": 9693 + }, + { + "epoch": 0.45133505598621876, + "grad_norm": 0.35239160035311656, + "learning_rate": 9.922700257915734e-05, + "loss": 3.1939, + "step": 9694 + }, + { + "epoch": 0.4513816141723119, + "grad_norm": 0.3979151434877222, + "learning_rate": 9.922652804569783e-05, + "loss": 3.298, + "step": 9695 + }, + { + "epoch": 0.45142817235840493, + "grad_norm": 0.4663333403305003, + "learning_rate": 9.922605336776332e-05, + "loss": 3.2705, + "step": 9696 + }, + { + "epoch": 0.451474730544498, + "grad_norm": 0.4160382005043734, + "learning_rate": 9.922557854535519e-05, + "loss": 3.3245, + "step": 9697 + }, + { + "epoch": 0.45152128873059105, + "grad_norm": 0.4006946655910071, + "learning_rate": 9.922510357847484e-05, + "loss": 3.1331, + "step": 9698 + }, + { + "epoch": 0.4515678469166841, + "grad_norm": 0.3874596410564579, + "learning_rate": 9.922462846712366e-05, + "loss": 3.2073, + "step": 9699 + }, + { + "epoch": 0.45161440510277717, + "grad_norm": 0.393278831033473, + "learning_rate": 9.922415321130304e-05, + "loss": 3.285, + "step": 9700 + }, + { + "epoch": 0.4516609632888703, + "grad_norm": 0.42327632382438424, + "learning_rate": 9.92236778110144e-05, + "loss": 3.3112, + "step": 9701 + }, + { + "epoch": 0.45170752147496335, + "grad_norm": 0.4176956753765761, + "learning_rate": 9.92232022662591e-05, + "loss": 3.2523, + "step": 9702 + }, + { + "epoch": 0.4517540796610564, + "grad_norm": 0.379834927468311, + "learning_rate": 9.922272657703854e-05, + "loss": 3.2076, + "step": 9703 + }, + { + "epoch": 0.45180063784714947, + "grad_norm": 0.39365579829324165, + "learning_rate": 9.922225074335413e-05, + 
"loss": 3.164, + "step": 9704 + }, + { + "epoch": 0.4518471960332425, + "grad_norm": 0.3626198452997995, + "learning_rate": 9.922177476520729e-05, + "loss": 3.2621, + "step": 9705 + }, + { + "epoch": 0.45189375421933564, + "grad_norm": 0.40641915030397247, + "learning_rate": 9.922129864259936e-05, + "loss": 3.2586, + "step": 9706 + }, + { + "epoch": 0.4519403124054287, + "grad_norm": 0.42902805257603727, + "learning_rate": 9.922082237553178e-05, + "loss": 3.276, + "step": 9707 + }, + { + "epoch": 0.45198687059152176, + "grad_norm": 0.4701076859219675, + "learning_rate": 9.922034596400594e-05, + "loss": 3.3458, + "step": 9708 + }, + { + "epoch": 0.4520334287776148, + "grad_norm": 0.4493813856780311, + "learning_rate": 9.921986940802321e-05, + "loss": 3.3491, + "step": 9709 + }, + { + "epoch": 0.4520799869637079, + "grad_norm": 0.3982971179478535, + "learning_rate": 9.921939270758505e-05, + "loss": 3.3304, + "step": 9710 + }, + { + "epoch": 0.45212654514980094, + "grad_norm": 0.37706480791050334, + "learning_rate": 9.921891586269278e-05, + "loss": 3.322, + "step": 9711 + }, + { + "epoch": 0.45217310333589406, + "grad_norm": 0.4219080291051773, + "learning_rate": 9.921843887334787e-05, + "loss": 3.2526, + "step": 9712 + }, + { + "epoch": 0.4522196615219871, + "grad_norm": 0.4649729865557696, + "learning_rate": 9.92179617395517e-05, + "loss": 3.1768, + "step": 9713 + }, + { + "epoch": 0.4522662197080802, + "grad_norm": 0.46277663680892184, + "learning_rate": 9.921748446130564e-05, + "loss": 3.2882, + "step": 9714 + }, + { + "epoch": 0.45231277789417323, + "grad_norm": 0.4495196648085895, + "learning_rate": 9.92170070386111e-05, + "loss": 3.3404, + "step": 9715 + }, + { + "epoch": 0.4523593360802663, + "grad_norm": 0.450621951457388, + "learning_rate": 9.921652947146951e-05, + "loss": 3.2951, + "step": 9716 + }, + { + "epoch": 0.4524058942663594, + "grad_norm": 0.5635982768307372, + "learning_rate": 9.921605175988226e-05, + "loss": 3.2678, + "step": 9717 + }, + { + "epoch": 0.45245245245245247, + "grad_norm": 0.45054243362832447, + "learning_rate": 9.921557390385073e-05, + "loss": 3.266, + "step": 9718 + }, + { + "epoch": 0.45249901063854553, + "grad_norm": 0.4109451912469457, + "learning_rate": 9.921509590337635e-05, + "loss": 3.2627, + "step": 9719 + }, + { + "epoch": 0.4525455688246386, + "grad_norm": 0.6018346502783883, + "learning_rate": 9.921461775846052e-05, + "loss": 3.2297, + "step": 9720 + }, + { + "epoch": 0.45259212701073165, + "grad_norm": 0.45927351220745577, + "learning_rate": 9.921413946910463e-05, + "loss": 3.2516, + "step": 9721 + }, + { + "epoch": 0.4526386851968247, + "grad_norm": 0.4224263458391815, + "learning_rate": 9.921366103531007e-05, + "loss": 3.1966, + "step": 9722 + }, + { + "epoch": 0.4526852433829178, + "grad_norm": 0.46330744505170024, + "learning_rate": 9.92131824570783e-05, + "loss": 3.1884, + "step": 9723 + }, + { + "epoch": 0.4527318015690109, + "grad_norm": 0.4491951289032183, + "learning_rate": 9.921270373441065e-05, + "loss": 3.2078, + "step": 9724 + }, + { + "epoch": 0.45277835975510394, + "grad_norm": 0.41644519621912834, + "learning_rate": 9.921222486730858e-05, + "loss": 3.2918, + "step": 9725 + }, + { + "epoch": 0.452824917941197, + "grad_norm": 0.47628817165561743, + "learning_rate": 9.921174585577347e-05, + "loss": 3.1807, + "step": 9726 + }, + { + "epoch": 0.45287147612729006, + "grad_norm": 0.4389798090224334, + "learning_rate": 9.921126669980675e-05, + "loss": 3.247, + "step": 9727 + }, + { + "epoch": 0.4529180343133832, + "grad_norm": 
0.41886915378986883, + "learning_rate": 9.92107873994098e-05, + "loss": 3.1969, + "step": 9728 + }, + { + "epoch": 0.45296459249947624, + "grad_norm": 0.4847197537645203, + "learning_rate": 9.921030795458404e-05, + "loss": 3.1541, + "step": 9729 + }, + { + "epoch": 0.4530111506855693, + "grad_norm": 0.485520079006624, + "learning_rate": 9.920982836533087e-05, + "loss": 3.2776, + "step": 9730 + }, + { + "epoch": 0.45305770887166236, + "grad_norm": 0.43679636526384, + "learning_rate": 9.92093486316517e-05, + "loss": 3.3811, + "step": 9731 + }, + { + "epoch": 0.4531042670577554, + "grad_norm": 0.4113821502760806, + "learning_rate": 9.920886875354796e-05, + "loss": 3.1217, + "step": 9732 + }, + { + "epoch": 0.4531508252438485, + "grad_norm": 0.44095506570504406, + "learning_rate": 9.920838873102101e-05, + "loss": 3.2168, + "step": 9733 + }, + { + "epoch": 0.4531973834299416, + "grad_norm": 0.3750868296700707, + "learning_rate": 9.92079085640723e-05, + "loss": 3.2295, + "step": 9734 + }, + { + "epoch": 0.45324394161603465, + "grad_norm": 0.36453115778944783, + "learning_rate": 9.920742825270322e-05, + "loss": 3.1869, + "step": 9735 + }, + { + "epoch": 0.4532904998021277, + "grad_norm": 0.41167534097803826, + "learning_rate": 9.92069477969152e-05, + "loss": 3.1882, + "step": 9736 + }, + { + "epoch": 0.45333705798822077, + "grad_norm": 0.40202347294705765, + "learning_rate": 9.920646719670964e-05, + "loss": 3.2542, + "step": 9737 + }, + { + "epoch": 0.45338361617431383, + "grad_norm": 0.37379087607556183, + "learning_rate": 9.920598645208791e-05, + "loss": 3.382, + "step": 9738 + }, + { + "epoch": 0.45343017436040695, + "grad_norm": 0.4137523181970968, + "learning_rate": 9.920550556305149e-05, + "loss": 3.167, + "step": 9739 + }, + { + "epoch": 0.4534767325465, + "grad_norm": 0.44673576400042914, + "learning_rate": 9.920502452960175e-05, + "loss": 3.1594, + "step": 9740 + }, + { + "epoch": 0.45352329073259307, + "grad_norm": 0.4374130443105802, + "learning_rate": 9.920454335174011e-05, + "loss": 3.2349, + "step": 9741 + }, + { + "epoch": 0.4535698489186861, + "grad_norm": 0.39718816205531593, + "learning_rate": 9.920406202946798e-05, + "loss": 3.0723, + "step": 9742 + }, + { + "epoch": 0.4536164071047792, + "grad_norm": 0.4292732501839517, + "learning_rate": 9.920358056278677e-05, + "loss": 3.1952, + "step": 9743 + }, + { + "epoch": 0.45366296529087224, + "grad_norm": 0.45742856823628447, + "learning_rate": 9.920309895169793e-05, + "loss": 3.3124, + "step": 9744 + }, + { + "epoch": 0.45370952347696536, + "grad_norm": 0.4017877935235722, + "learning_rate": 9.920261719620281e-05, + "loss": 3.2395, + "step": 9745 + }, + { + "epoch": 0.4537560816630584, + "grad_norm": 0.5378972267255661, + "learning_rate": 9.920213529630286e-05, + "loss": 3.2747, + "step": 9746 + }, + { + "epoch": 0.4538026398491515, + "grad_norm": 0.4121611421137737, + "learning_rate": 9.920165325199949e-05, + "loss": 3.1182, + "step": 9747 + }, + { + "epoch": 0.45384919803524454, + "grad_norm": 0.45422705756151355, + "learning_rate": 9.920117106329412e-05, + "loss": 3.1681, + "step": 9748 + }, + { + "epoch": 0.4538957562213376, + "grad_norm": 0.48229985132828646, + "learning_rate": 9.920068873018817e-05, + "loss": 3.2316, + "step": 9749 + }, + { + "epoch": 0.4539423144074307, + "grad_norm": 0.40231019851425565, + "learning_rate": 9.920020625268302e-05, + "loss": 3.3015, + "step": 9750 + }, + { + "epoch": 0.4539888725935238, + "grad_norm": 0.42034577923908384, + "learning_rate": 9.919972363078014e-05, + "loss": 3.2284, + "step": 
9751 + }, + { + "epoch": 0.45403543077961683, + "grad_norm": 0.45023213982529464, + "learning_rate": 9.91992408644809e-05, + "loss": 3.2772, + "step": 9752 + }, + { + "epoch": 0.4540819889657099, + "grad_norm": 0.4154476820859789, + "learning_rate": 9.919875795378675e-05, + "loss": 3.2351, + "step": 9753 + }, + { + "epoch": 0.45412854715180295, + "grad_norm": 0.43865928861469716, + "learning_rate": 9.919827489869907e-05, + "loss": 3.2879, + "step": 9754 + }, + { + "epoch": 0.454175105337896, + "grad_norm": 0.39535814835259236, + "learning_rate": 9.919779169921932e-05, + "loss": 3.3187, + "step": 9755 + }, + { + "epoch": 0.4542216635239891, + "grad_norm": 0.40401846035830535, + "learning_rate": 9.919730835534888e-05, + "loss": 3.2794, + "step": 9756 + }, + { + "epoch": 0.4542682217100822, + "grad_norm": 0.49622270874597196, + "learning_rate": 9.91968248670892e-05, + "loss": 3.336, + "step": 9757 + }, + { + "epoch": 0.45431477989617525, + "grad_norm": 0.445767647337907, + "learning_rate": 9.919634123444168e-05, + "loss": 3.3281, + "step": 9758 + }, + { + "epoch": 0.4543613380822683, + "grad_norm": 0.4798498594098091, + "learning_rate": 9.919585745740776e-05, + "loss": 3.2182, + "step": 9759 + }, + { + "epoch": 0.45440789626836137, + "grad_norm": 0.4707578251353999, + "learning_rate": 9.919537353598882e-05, + "loss": 3.2342, + "step": 9760 + }, + { + "epoch": 0.4544544544544545, + "grad_norm": 0.4598423227148676, + "learning_rate": 9.919488947018632e-05, + "loss": 3.3723, + "step": 9761 + }, + { + "epoch": 0.45450101264054754, + "grad_norm": 0.4039948207535217, + "learning_rate": 9.919440526000166e-05, + "loss": 3.2774, + "step": 9762 + }, + { + "epoch": 0.4545475708266406, + "grad_norm": 0.45592135050236965, + "learning_rate": 9.919392090543628e-05, + "loss": 3.2953, + "step": 9763 + }, + { + "epoch": 0.45459412901273366, + "grad_norm": 0.42477786337382745, + "learning_rate": 9.919343640649158e-05, + "loss": 3.3472, + "step": 9764 + }, + { + "epoch": 0.4546406871988267, + "grad_norm": 0.5271286492683059, + "learning_rate": 9.919295176316899e-05, + "loss": 3.2392, + "step": 9765 + }, + { + "epoch": 0.4546872453849198, + "grad_norm": 0.4346590142149405, + "learning_rate": 9.919246697546994e-05, + "loss": 3.2105, + "step": 9766 + }, + { + "epoch": 0.4547338035710129, + "grad_norm": 0.4047699296278798, + "learning_rate": 9.919198204339584e-05, + "loss": 3.216, + "step": 9767 + }, + { + "epoch": 0.45478036175710596, + "grad_norm": 0.45670510319732016, + "learning_rate": 9.919149696694811e-05, + "loss": 3.259, + "step": 9768 + }, + { + "epoch": 0.454826919943199, + "grad_norm": 0.3907842828246364, + "learning_rate": 9.919101174612819e-05, + "loss": 3.219, + "step": 9769 + }, + { + "epoch": 0.4548734781292921, + "grad_norm": 0.4140065072037273, + "learning_rate": 9.919052638093749e-05, + "loss": 3.2764, + "step": 9770 + }, + { + "epoch": 0.45492003631538513, + "grad_norm": 0.41335822176905557, + "learning_rate": 9.919004087137744e-05, + "loss": 3.193, + "step": 9771 + }, + { + "epoch": 0.45496659450147825, + "grad_norm": 0.3740681831407782, + "learning_rate": 9.918955521744948e-05, + "loss": 3.2388, + "step": 9772 + }, + { + "epoch": 0.4550131526875713, + "grad_norm": 0.46238100026385237, + "learning_rate": 9.918906941915502e-05, + "loss": 3.2689, + "step": 9773 + }, + { + "epoch": 0.45505971087366437, + "grad_norm": 0.41341401070658534, + "learning_rate": 9.918858347649549e-05, + "loss": 3.1227, + "step": 9774 + }, + { + "epoch": 0.45510626905975743, + "grad_norm": 0.38032347294173896, + 
"learning_rate": 9.91880973894723e-05, + "loss": 3.2509, + "step": 9775 + }, + { + "epoch": 0.4551528272458505, + "grad_norm": 0.4756444725753514, + "learning_rate": 9.91876111580869e-05, + "loss": 3.2206, + "step": 9776 + }, + { + "epoch": 0.45519938543194355, + "grad_norm": 0.44080527123104096, + "learning_rate": 9.91871247823407e-05, + "loss": 3.3232, + "step": 9777 + }, + { + "epoch": 0.45524594361803666, + "grad_norm": 0.4126985030638465, + "learning_rate": 9.918663826223513e-05, + "loss": 3.1862, + "step": 9778 + }, + { + "epoch": 0.4552925018041297, + "grad_norm": 0.4387142635452009, + "learning_rate": 9.918615159777163e-05, + "loss": 3.265, + "step": 9779 + }, + { + "epoch": 0.4553390599902228, + "grad_norm": 0.4497499219652861, + "learning_rate": 9.918566478895162e-05, + "loss": 3.3726, + "step": 9780 + }, + { + "epoch": 0.45538561817631584, + "grad_norm": 0.4374170099558037, + "learning_rate": 9.918517783577653e-05, + "loss": 3.3107, + "step": 9781 + }, + { + "epoch": 0.4554321763624089, + "grad_norm": 0.39866629576528073, + "learning_rate": 9.918469073824779e-05, + "loss": 3.2924, + "step": 9782 + }, + { + "epoch": 0.455478734548502, + "grad_norm": 0.45962194083232316, + "learning_rate": 9.918420349636683e-05, + "loss": 3.2792, + "step": 9783 + }, + { + "epoch": 0.4555252927345951, + "grad_norm": 0.5297311074767616, + "learning_rate": 9.918371611013507e-05, + "loss": 3.2852, + "step": 9784 + }, + { + "epoch": 0.45557185092068814, + "grad_norm": 0.376353461488584, + "learning_rate": 9.918322857955396e-05, + "loss": 3.2205, + "step": 9785 + }, + { + "epoch": 0.4556184091067812, + "grad_norm": 0.4431119316173635, + "learning_rate": 9.918274090462491e-05, + "loss": 3.1934, + "step": 9786 + }, + { + "epoch": 0.45566496729287426, + "grad_norm": 0.46477589846976847, + "learning_rate": 9.918225308534936e-05, + "loss": 3.3412, + "step": 9787 + }, + { + "epoch": 0.4557115254789673, + "grad_norm": 0.43336634812102026, + "learning_rate": 9.918176512172875e-05, + "loss": 3.1428, + "step": 9788 + }, + { + "epoch": 0.45575808366506043, + "grad_norm": 0.4740251790147592, + "learning_rate": 9.918127701376449e-05, + "loss": 3.3011, + "step": 9789 + }, + { + "epoch": 0.4558046418511535, + "grad_norm": 0.5046480687943582, + "learning_rate": 9.918078876145804e-05, + "loss": 3.271, + "step": 9790 + }, + { + "epoch": 0.45585120003724655, + "grad_norm": 0.4848479590773497, + "learning_rate": 9.918030036481083e-05, + "loss": 3.2175, + "step": 9791 + }, + { + "epoch": 0.4558977582233396, + "grad_norm": 0.4055659582534747, + "learning_rate": 9.917981182382428e-05, + "loss": 3.2431, + "step": 9792 + }, + { + "epoch": 0.45594431640943267, + "grad_norm": 0.4369488521611689, + "learning_rate": 9.91793231384998e-05, + "loss": 3.2799, + "step": 9793 + }, + { + "epoch": 0.4559908745955258, + "grad_norm": 0.4161227679118026, + "learning_rate": 9.917883430883887e-05, + "loss": 3.2027, + "step": 9794 + }, + { + "epoch": 0.45603743278161885, + "grad_norm": 0.4049063027848725, + "learning_rate": 9.917834533484292e-05, + "loss": 3.2362, + "step": 9795 + }, + { + "epoch": 0.4560839909677119, + "grad_norm": 0.42588438624963676, + "learning_rate": 9.917785621651336e-05, + "loss": 3.2427, + "step": 9796 + }, + { + "epoch": 0.45613054915380497, + "grad_norm": 0.4456511175942642, + "learning_rate": 9.917736695385163e-05, + "loss": 3.2068, + "step": 9797 + }, + { + "epoch": 0.456177107339898, + "grad_norm": 0.3857681279682556, + "learning_rate": 9.917687754685918e-05, + "loss": 3.3106, + "step": 9798 + }, + { + "epoch": 
0.4562236655259911, + "grad_norm": 0.48935641330494967, + "learning_rate": 9.917638799553743e-05, + "loss": 3.1965, + "step": 9799 + }, + { + "epoch": 0.4562702237120842, + "grad_norm": 0.5033009033613075, + "learning_rate": 9.917589829988783e-05, + "loss": 3.3079, + "step": 9800 + }, + { + "epoch": 0.45631678189817726, + "grad_norm": 0.42290498649486413, + "learning_rate": 9.917540845991182e-05, + "loss": 3.3507, + "step": 9801 + }, + { + "epoch": 0.4563633400842703, + "grad_norm": 0.38708163551752317, + "learning_rate": 9.917491847561082e-05, + "loss": 3.2256, + "step": 9802 + }, + { + "epoch": 0.4564098982703634, + "grad_norm": 0.4322275951813997, + "learning_rate": 9.91744283469863e-05, + "loss": 3.275, + "step": 9803 + }, + { + "epoch": 0.45645645645645644, + "grad_norm": 0.405210031028577, + "learning_rate": 9.917393807403965e-05, + "loss": 3.2763, + "step": 9804 + }, + { + "epoch": 0.45650301464254955, + "grad_norm": 0.37749226854189155, + "learning_rate": 9.917344765677235e-05, + "loss": 3.2131, + "step": 9805 + }, + { + "epoch": 0.4565495728286426, + "grad_norm": 0.357655250367895, + "learning_rate": 9.917295709518583e-05, + "loss": 3.2294, + "step": 9806 + }, + { + "epoch": 0.4565961310147357, + "grad_norm": 0.39014840871589634, + "learning_rate": 9.917246638928152e-05, + "loss": 3.241, + "step": 9807 + }, + { + "epoch": 0.45664268920082873, + "grad_norm": 0.4252497041749519, + "learning_rate": 9.917197553906086e-05, + "loss": 3.2733, + "step": 9808 + }, + { + "epoch": 0.4566892473869218, + "grad_norm": 0.38954015595941854, + "learning_rate": 9.91714845445253e-05, + "loss": 3.1877, + "step": 9809 + }, + { + "epoch": 0.45673580557301485, + "grad_norm": 0.44415563891712817, + "learning_rate": 9.917099340567628e-05, + "loss": 3.229, + "step": 9810 + }, + { + "epoch": 0.45678236375910797, + "grad_norm": 0.4888470146576194, + "learning_rate": 9.917050212251524e-05, + "loss": 3.3104, + "step": 9811 + }, + { + "epoch": 0.456828921945201, + "grad_norm": 0.4522629218409933, + "learning_rate": 9.917001069504362e-05, + "loss": 3.3557, + "step": 9812 + }, + { + "epoch": 0.4568754801312941, + "grad_norm": 0.47283553641520043, + "learning_rate": 9.916951912326285e-05, + "loss": 3.2635, + "step": 9813 + }, + { + "epoch": 0.45692203831738715, + "grad_norm": 0.44857014449638427, + "learning_rate": 9.91690274071744e-05, + "loss": 3.2749, + "step": 9814 + }, + { + "epoch": 0.4569685965034802, + "grad_norm": 0.3977946232729615, + "learning_rate": 9.916853554677969e-05, + "loss": 3.2278, + "step": 9815 + }, + { + "epoch": 0.4570151546895733, + "grad_norm": 0.5094047035482915, + "learning_rate": 9.916804354208018e-05, + "loss": 3.2409, + "step": 9816 + }, + { + "epoch": 0.4570617128756664, + "grad_norm": 0.5692224943460005, + "learning_rate": 9.91675513930773e-05, + "loss": 3.2526, + "step": 9817 + }, + { + "epoch": 0.45710827106175944, + "grad_norm": 0.43942349605718556, + "learning_rate": 9.91670590997725e-05, + "loss": 3.2001, + "step": 9818 + }, + { + "epoch": 0.4571548292478525, + "grad_norm": 0.4241622265197091, + "learning_rate": 9.916656666216723e-05, + "loss": 3.2347, + "step": 9819 + }, + { + "epoch": 0.45720138743394556, + "grad_norm": 0.45756225633956316, + "learning_rate": 9.916607408026293e-05, + "loss": 3.3274, + "step": 9820 + }, + { + "epoch": 0.4572479456200386, + "grad_norm": 0.4172932133265733, + "learning_rate": 9.916558135406105e-05, + "loss": 3.2183, + "step": 9821 + }, + { + "epoch": 0.45729450380613174, + "grad_norm": 0.3952925343017569, + "learning_rate": 
9.916508848356302e-05, + "loss": 3.2138, + "step": 9822 + }, + { + "epoch": 0.4573410619922248, + "grad_norm": 0.42721552540411445, + "learning_rate": 9.916459546877031e-05, + "loss": 3.3423, + "step": 9823 + }, + { + "epoch": 0.45738762017831786, + "grad_norm": 0.4191328143394462, + "learning_rate": 9.916410230968434e-05, + "loss": 3.1773, + "step": 9824 + }, + { + "epoch": 0.4574341783644109, + "grad_norm": 0.36644848633580746, + "learning_rate": 9.916360900630659e-05, + "loss": 3.163, + "step": 9825 + }, + { + "epoch": 0.457480736550504, + "grad_norm": 0.37363697758316394, + "learning_rate": 9.916311555863849e-05, + "loss": 3.2478, + "step": 9826 + }, + { + "epoch": 0.4575272947365971, + "grad_norm": 0.3876931743637642, + "learning_rate": 9.916262196668148e-05, + "loss": 3.2442, + "step": 9827 + }, + { + "epoch": 0.45757385292269015, + "grad_norm": 0.4035077285328692, + "learning_rate": 9.916212823043702e-05, + "loss": 3.2534, + "step": 9828 + }, + { + "epoch": 0.4576204111087832, + "grad_norm": 0.39181648497823773, + "learning_rate": 9.916163434990656e-05, + "loss": 3.2667, + "step": 9829 + }, + { + "epoch": 0.45766696929487627, + "grad_norm": 0.4163050887868461, + "learning_rate": 9.916114032509153e-05, + "loss": 3.2468, + "step": 9830 + }, + { + "epoch": 0.45771352748096933, + "grad_norm": 0.43609361852182904, + "learning_rate": 9.916064615599344e-05, + "loss": 3.1341, + "step": 9831 + }, + { + "epoch": 0.4577600856670624, + "grad_norm": 0.3849774266188017, + "learning_rate": 9.916015184261366e-05, + "loss": 3.1902, + "step": 9832 + }, + { + "epoch": 0.4578066438531555, + "grad_norm": 0.4063866682429776, + "learning_rate": 9.915965738495367e-05, + "loss": 3.2641, + "step": 9833 + }, + { + "epoch": 0.45785320203924856, + "grad_norm": 0.40489230092004785, + "learning_rate": 9.915916278301496e-05, + "loss": 3.2678, + "step": 9834 + }, + { + "epoch": 0.4578997602253416, + "grad_norm": 0.4358023368793571, + "learning_rate": 9.915866803679894e-05, + "loss": 3.2026, + "step": 9835 + }, + { + "epoch": 0.4579463184114347, + "grad_norm": 0.43147964627112656, + "learning_rate": 9.915817314630707e-05, + "loss": 3.2594, + "step": 9836 + }, + { + "epoch": 0.45799287659752774, + "grad_norm": 0.4153670915890073, + "learning_rate": 9.91576781115408e-05, + "loss": 3.3758, + "step": 9837 + }, + { + "epoch": 0.4580394347836208, + "grad_norm": 0.4186583529897093, + "learning_rate": 9.915718293250158e-05, + "loss": 3.2771, + "step": 9838 + }, + { + "epoch": 0.4580859929697139, + "grad_norm": 0.4227991019718183, + "learning_rate": 9.915668760919088e-05, + "loss": 3.167, + "step": 9839 + }, + { + "epoch": 0.458132551155807, + "grad_norm": 0.39837445287162665, + "learning_rate": 9.915619214161014e-05, + "loss": 3.1709, + "step": 9840 + }, + { + "epoch": 0.45817910934190004, + "grad_norm": 0.3650617082032213, + "learning_rate": 9.915569652976082e-05, + "loss": 3.1102, + "step": 9841 + }, + { + "epoch": 0.4582256675279931, + "grad_norm": 0.40333023918485716, + "learning_rate": 9.915520077364438e-05, + "loss": 3.272, + "step": 9842 + }, + { + "epoch": 0.45827222571408616, + "grad_norm": 0.4129445789853234, + "learning_rate": 9.915470487326228e-05, + "loss": 3.2331, + "step": 9843 + }, + { + "epoch": 0.45831878390017927, + "grad_norm": 0.3759147678586043, + "learning_rate": 9.915420882861595e-05, + "loss": 3.3356, + "step": 9844 + }, + { + "epoch": 0.45836534208627233, + "grad_norm": 0.4415740281379698, + "learning_rate": 9.915371263970685e-05, + "loss": 3.1366, + "step": 9845 + }, + { + "epoch": 
0.4584119002723654, + "grad_norm": 0.39346598007289885, + "learning_rate": 9.915321630653646e-05, + "loss": 3.2627, + "step": 9846 + }, + { + "epoch": 0.45845845845845845, + "grad_norm": 0.36925592430795834, + "learning_rate": 9.915271982910621e-05, + "loss": 3.1928, + "step": 9847 + }, + { + "epoch": 0.4585050166445515, + "grad_norm": 0.37468022648176563, + "learning_rate": 9.915222320741757e-05, + "loss": 3.2133, + "step": 9848 + }, + { + "epoch": 0.45855157483064457, + "grad_norm": 0.40689847122722983, + "learning_rate": 9.915172644147201e-05, + "loss": 3.2379, + "step": 9849 + }, + { + "epoch": 0.4585981330167377, + "grad_norm": 0.40855313430248547, + "learning_rate": 9.915122953127097e-05, + "loss": 3.1448, + "step": 9850 + }, + { + "epoch": 0.45864469120283075, + "grad_norm": 0.40578405032137044, + "learning_rate": 9.91507324768159e-05, + "loss": 3.2464, + "step": 9851 + }, + { + "epoch": 0.4586912493889238, + "grad_norm": 0.3938342966967832, + "learning_rate": 9.91502352781083e-05, + "loss": 3.2615, + "step": 9852 + }, + { + "epoch": 0.45873780757501686, + "grad_norm": 0.392700467908643, + "learning_rate": 9.914973793514959e-05, + "loss": 3.2276, + "step": 9853 + }, + { + "epoch": 0.4587843657611099, + "grad_norm": 0.4084019592796449, + "learning_rate": 9.914924044794123e-05, + "loss": 3.3299, + "step": 9854 + }, + { + "epoch": 0.45883092394720304, + "grad_norm": 0.3972174775574824, + "learning_rate": 9.91487428164847e-05, + "loss": 3.2942, + "step": 9855 + }, + { + "epoch": 0.4588774821332961, + "grad_norm": 0.45648889543825516, + "learning_rate": 9.914824504078146e-05, + "loss": 3.2634, + "step": 9856 + }, + { + "epoch": 0.45892404031938916, + "grad_norm": 0.4161855622433861, + "learning_rate": 9.914774712083295e-05, + "loss": 3.2687, + "step": 9857 + }, + { + "epoch": 0.4589705985054822, + "grad_norm": 0.37238689995083474, + "learning_rate": 9.914724905664064e-05, + "loss": 3.2847, + "step": 9858 + }, + { + "epoch": 0.4590171566915753, + "grad_norm": 0.4032193667251723, + "learning_rate": 9.9146750848206e-05, + "loss": 3.26, + "step": 9859 + }, + { + "epoch": 0.45906371487766834, + "grad_norm": 0.35466047299302583, + "learning_rate": 9.91462524955305e-05, + "loss": 3.3058, + "step": 9860 + }, + { + "epoch": 0.45911027306376145, + "grad_norm": 0.40246407661448125, + "learning_rate": 9.914575399861559e-05, + "loss": 3.271, + "step": 9861 + }, + { + "epoch": 0.4591568312498545, + "grad_norm": 0.3808991938722638, + "learning_rate": 9.914525535746271e-05, + "loss": 3.1855, + "step": 9862 + }, + { + "epoch": 0.4592033894359476, + "grad_norm": 0.3801018908434686, + "learning_rate": 9.914475657207337e-05, + "loss": 3.2021, + "step": 9863 + }, + { + "epoch": 0.45924994762204063, + "grad_norm": 0.41816229922436793, + "learning_rate": 9.9144257642449e-05, + "loss": 3.3626, + "step": 9864 + }, + { + "epoch": 0.4592965058081337, + "grad_norm": 0.4297368544629693, + "learning_rate": 9.914375856859109e-05, + "loss": 3.3064, + "step": 9865 + }, + { + "epoch": 0.4593430639942268, + "grad_norm": 0.4212547145332701, + "learning_rate": 9.914325935050107e-05, + "loss": 3.269, + "step": 9866 + }, + { + "epoch": 0.45938962218031987, + "grad_norm": 0.43100000671119243, + "learning_rate": 9.914275998818043e-05, + "loss": 3.1976, + "step": 9867 + }, + { + "epoch": 0.4594361803664129, + "grad_norm": 0.3664435348104484, + "learning_rate": 9.914226048163064e-05, + "loss": 3.2435, + "step": 9868 + }, + { + "epoch": 0.459482738552506, + "grad_norm": 0.4084916131965387, + "learning_rate": 9.914176083085316e-05, 
+ "loss": 3.1526, + "step": 9869 + }, + { + "epoch": 0.45952929673859905, + "grad_norm": 0.3981165220545747, + "learning_rate": 9.914126103584945e-05, + "loss": 3.2115, + "step": 9870 + }, + { + "epoch": 0.4595758549246921, + "grad_norm": 0.42349813520987556, + "learning_rate": 9.914076109662096e-05, + "loss": 3.3411, + "step": 9871 + }, + { + "epoch": 0.4596224131107852, + "grad_norm": 0.42368175126079155, + "learning_rate": 9.91402610131692e-05, + "loss": 3.3045, + "step": 9872 + }, + { + "epoch": 0.4596689712968783, + "grad_norm": 0.382056069894787, + "learning_rate": 9.913976078549561e-05, + "loss": 3.3001, + "step": 9873 + }, + { + "epoch": 0.45971552948297134, + "grad_norm": 0.3852719043928955, + "learning_rate": 9.913926041360165e-05, + "loss": 3.2842, + "step": 9874 + }, + { + "epoch": 0.4597620876690644, + "grad_norm": 0.3819548277904577, + "learning_rate": 9.913875989748882e-05, + "loss": 3.1772, + "step": 9875 + }, + { + "epoch": 0.45980864585515746, + "grad_norm": 0.40679604879661074, + "learning_rate": 9.913825923715855e-05, + "loss": 3.2307, + "step": 9876 + }, + { + "epoch": 0.4598552040412506, + "grad_norm": 0.4101939483932758, + "learning_rate": 9.913775843261235e-05, + "loss": 3.2315, + "step": 9877 + }, + { + "epoch": 0.45990176222734364, + "grad_norm": 0.4139857594849595, + "learning_rate": 9.913725748385167e-05, + "loss": 3.1337, + "step": 9878 + }, + { + "epoch": 0.4599483204134367, + "grad_norm": 0.4509922118783192, + "learning_rate": 9.913675639087797e-05, + "loss": 3.2821, + "step": 9879 + }, + { + "epoch": 0.45999487859952976, + "grad_norm": 0.46126375827415195, + "learning_rate": 9.913625515369274e-05, + "loss": 3.2489, + "step": 9880 + }, + { + "epoch": 0.4600414367856228, + "grad_norm": 0.39765866980026005, + "learning_rate": 9.913575377229742e-05, + "loss": 3.2835, + "step": 9881 + }, + { + "epoch": 0.4600879949717159, + "grad_norm": 0.37361032314825776, + "learning_rate": 9.913525224669354e-05, + "loss": 3.3012, + "step": 9882 + }, + { + "epoch": 0.460134553157809, + "grad_norm": 0.3895362627523262, + "learning_rate": 9.913475057688251e-05, + "loss": 3.1192, + "step": 9883 + }, + { + "epoch": 0.46018111134390205, + "grad_norm": 0.3758782168206963, + "learning_rate": 9.913424876286583e-05, + "loss": 3.2527, + "step": 9884 + }, + { + "epoch": 0.4602276695299951, + "grad_norm": 0.4190014696369538, + "learning_rate": 9.913374680464498e-05, + "loss": 3.255, + "step": 9885 + }, + { + "epoch": 0.46027422771608817, + "grad_norm": 0.4100897662751138, + "learning_rate": 9.91332447022214e-05, + "loss": 3.241, + "step": 9886 + }, + { + "epoch": 0.46032078590218123, + "grad_norm": 0.35831086168957993, + "learning_rate": 9.913274245559661e-05, + "loss": 3.2128, + "step": 9887 + }, + { + "epoch": 0.46036734408827434, + "grad_norm": 0.4582805969515345, + "learning_rate": 9.913224006477206e-05, + "loss": 3.2901, + "step": 9888 + }, + { + "epoch": 0.4604139022743674, + "grad_norm": 0.40926706708737154, + "learning_rate": 9.913173752974921e-05, + "loss": 3.2607, + "step": 9889 + }, + { + "epoch": 0.46046046046046046, + "grad_norm": 0.39494262263049845, + "learning_rate": 9.913123485052958e-05, + "loss": 3.2792, + "step": 9890 + }, + { + "epoch": 0.4605070186465535, + "grad_norm": 0.4281926243435296, + "learning_rate": 9.913073202711458e-05, + "loss": 3.3147, + "step": 9891 + }, + { + "epoch": 0.4605535768326466, + "grad_norm": 0.36053335146026183, + "learning_rate": 9.913022905950575e-05, + "loss": 3.2114, + "step": 9892 + }, + { + "epoch": 0.46060013501873964, + "grad_norm": 
0.4044068281093284, + "learning_rate": 9.912972594770452e-05, + "loss": 3.1741, + "step": 9893 + }, + { + "epoch": 0.46064669320483276, + "grad_norm": 0.4727132909657365, + "learning_rate": 9.91292226917124e-05, + "loss": 3.0709, + "step": 9894 + }, + { + "epoch": 0.4606932513909258, + "grad_norm": 0.43841343703863134, + "learning_rate": 9.912871929153083e-05, + "loss": 3.1776, + "step": 9895 + }, + { + "epoch": 0.4607398095770189, + "grad_norm": 0.3856148077839776, + "learning_rate": 9.912821574716132e-05, + "loss": 3.1908, + "step": 9896 + }, + { + "epoch": 0.46078636776311194, + "grad_norm": 0.4786643117320648, + "learning_rate": 9.912771205860533e-05, + "loss": 3.289, + "step": 9897 + }, + { + "epoch": 0.460832925949205, + "grad_norm": 0.4883159746513597, + "learning_rate": 9.912720822586437e-05, + "loss": 3.2355, + "step": 9898 + }, + { + "epoch": 0.4608794841352981, + "grad_norm": 0.37298004080365704, + "learning_rate": 9.912670424893987e-05, + "loss": 3.1306, + "step": 9899 + }, + { + "epoch": 0.46092604232139117, + "grad_norm": 0.3730430120946278, + "learning_rate": 9.912620012783333e-05, + "loss": 3.1189, + "step": 9900 + }, + { + "epoch": 0.46097260050748423, + "grad_norm": 0.42303507029784154, + "learning_rate": 9.912569586254625e-05, + "loss": 3.2599, + "step": 9901 + }, + { + "epoch": 0.4610191586935773, + "grad_norm": 0.34912796176843736, + "learning_rate": 9.912519145308008e-05, + "loss": 3.204, + "step": 9902 + }, + { + "epoch": 0.46106571687967035, + "grad_norm": 0.37827257408164217, + "learning_rate": 9.912468689943632e-05, + "loss": 3.2635, + "step": 9903 + }, + { + "epoch": 0.4611122750657634, + "grad_norm": 0.43280236741438394, + "learning_rate": 9.912418220161645e-05, + "loss": 3.2414, + "step": 9904 + }, + { + "epoch": 0.4611588332518565, + "grad_norm": 0.4204085942319899, + "learning_rate": 9.912367735962193e-05, + "loss": 3.1499, + "step": 9905 + }, + { + "epoch": 0.4612053914379496, + "grad_norm": 0.35939815536334824, + "learning_rate": 9.912317237345426e-05, + "loss": 3.274, + "step": 9906 + }, + { + "epoch": 0.46125194962404265, + "grad_norm": 0.40412710846960265, + "learning_rate": 9.912266724311492e-05, + "loss": 3.1929, + "step": 9907 + }, + { + "epoch": 0.4612985078101357, + "grad_norm": 0.39628947748178023, + "learning_rate": 9.91221619686054e-05, + "loss": 3.0399, + "step": 9908 + }, + { + "epoch": 0.46134506599622876, + "grad_norm": 0.35904136596379516, + "learning_rate": 9.912165654992715e-05, + "loss": 3.3249, + "step": 9909 + }, + { + "epoch": 0.4613916241823219, + "grad_norm": 0.43526115013658273, + "learning_rate": 9.91211509870817e-05, + "loss": 3.1947, + "step": 9910 + }, + { + "epoch": 0.46143818236841494, + "grad_norm": 0.3955019226139231, + "learning_rate": 9.91206452800705e-05, + "loss": 3.235, + "step": 9911 + }, + { + "epoch": 0.461484740554508, + "grad_norm": 0.3623997913532994, + "learning_rate": 9.912013942889505e-05, + "loss": 3.3315, + "step": 9912 + }, + { + "epoch": 0.46153129874060106, + "grad_norm": 0.4191972081016359, + "learning_rate": 9.911963343355684e-05, + "loss": 3.1991, + "step": 9913 + }, + { + "epoch": 0.4615778569266941, + "grad_norm": 0.3987334674644562, + "learning_rate": 9.911912729405732e-05, + "loss": 3.3158, + "step": 9914 + }, + { + "epoch": 0.4616244151127872, + "grad_norm": 0.3864912534287985, + "learning_rate": 9.911862101039802e-05, + "loss": 3.3027, + "step": 9915 + }, + { + "epoch": 0.4616709732988803, + "grad_norm": 0.39521908232200886, + "learning_rate": 9.91181145825804e-05, + "loss": 3.4129, + "step": 9916 
+ }, + { + "epoch": 0.46171753148497335, + "grad_norm": 0.4555511964605999, + "learning_rate": 9.911760801060595e-05, + "loss": 3.2086, + "step": 9917 + }, + { + "epoch": 0.4617640896710664, + "grad_norm": 0.41610881620344403, + "learning_rate": 9.911710129447616e-05, + "loss": 3.3012, + "step": 9918 + }, + { + "epoch": 0.4618106478571595, + "grad_norm": 0.41203616867384146, + "learning_rate": 9.911659443419252e-05, + "loss": 3.1864, + "step": 9919 + }, + { + "epoch": 0.46185720604325253, + "grad_norm": 0.4410967405357968, + "learning_rate": 9.911608742975653e-05, + "loss": 3.2401, + "step": 9920 + }, + { + "epoch": 0.46190376422934565, + "grad_norm": 0.4435495657200655, + "learning_rate": 9.911558028116963e-05, + "loss": 3.2507, + "step": 9921 + }, + { + "epoch": 0.4619503224154387, + "grad_norm": 0.3883882089971686, + "learning_rate": 9.911507298843336e-05, + "loss": 3.219, + "step": 9922 + }, + { + "epoch": 0.46199688060153177, + "grad_norm": 0.4439667258107227, + "learning_rate": 9.911456555154918e-05, + "loss": 3.3437, + "step": 9923 + }, + { + "epoch": 0.4620434387876248, + "grad_norm": 0.39270028549951036, + "learning_rate": 9.91140579705186e-05, + "loss": 3.2383, + "step": 9924 + }, + { + "epoch": 0.4620899969737179, + "grad_norm": 0.3646785617856819, + "learning_rate": 9.91135502453431e-05, + "loss": 3.1989, + "step": 9925 + }, + { + "epoch": 0.46213655515981095, + "grad_norm": 0.3670522877339052, + "learning_rate": 9.911304237602416e-05, + "loss": 3.1962, + "step": 9926 + }, + { + "epoch": 0.46218311334590406, + "grad_norm": 0.39478328121119327, + "learning_rate": 9.911253436256327e-05, + "loss": 3.226, + "step": 9927 + }, + { + "epoch": 0.4622296715319971, + "grad_norm": 0.35458792574515013, + "learning_rate": 9.911202620496194e-05, + "loss": 3.2604, + "step": 9928 + }, + { + "epoch": 0.4622762297180902, + "grad_norm": 0.41989128339995957, + "learning_rate": 9.911151790322166e-05, + "loss": 3.2437, + "step": 9929 + }, + { + "epoch": 0.46232278790418324, + "grad_norm": 0.4425377872269777, + "learning_rate": 9.91110094573439e-05, + "loss": 3.3174, + "step": 9930 + }, + { + "epoch": 0.4623693460902763, + "grad_norm": 0.3972394230127539, + "learning_rate": 9.911050086733017e-05, + "loss": 3.1927, + "step": 9931 + }, + { + "epoch": 0.4624159042763694, + "grad_norm": 0.38015220671504574, + "learning_rate": 9.910999213318197e-05, + "loss": 3.3114, + "step": 9932 + }, + { + "epoch": 0.4624624624624625, + "grad_norm": 0.35926215110157167, + "learning_rate": 9.910948325490075e-05, + "loss": 3.278, + "step": 9933 + }, + { + "epoch": 0.46250902064855554, + "grad_norm": 0.3843608336799212, + "learning_rate": 9.910897423248806e-05, + "loss": 3.1039, + "step": 9934 + }, + { + "epoch": 0.4625555788346486, + "grad_norm": 0.37182843703441476, + "learning_rate": 9.910846506594536e-05, + "loss": 3.1986, + "step": 9935 + }, + { + "epoch": 0.46260213702074165, + "grad_norm": 0.4067147836302649, + "learning_rate": 9.910795575527416e-05, + "loss": 3.2287, + "step": 9936 + }, + { + "epoch": 0.4626486952068347, + "grad_norm": 0.4350647908632462, + "learning_rate": 9.910744630047594e-05, + "loss": 3.1424, + "step": 9937 + }, + { + "epoch": 0.46269525339292783, + "grad_norm": 0.43356325678252977, + "learning_rate": 9.910693670155222e-05, + "loss": 3.2004, + "step": 9938 + }, + { + "epoch": 0.4627418115790209, + "grad_norm": 0.362281684062496, + "learning_rate": 9.910642695850446e-05, + "loss": 3.2167, + "step": 9939 + }, + { + "epoch": 0.46278836976511395, + "grad_norm": 0.4074467454776598, + 
"learning_rate": 9.910591707133419e-05, + "loss": 3.34, + "step": 9940 + }, + { + "epoch": 0.462834927951207, + "grad_norm": 0.3947872063463895, + "learning_rate": 9.910540704004287e-05, + "loss": 3.2898, + "step": 9941 + }, + { + "epoch": 0.46288148613730007, + "grad_norm": 0.40424622446887104, + "learning_rate": 9.910489686463202e-05, + "loss": 3.221, + "step": 9942 + }, + { + "epoch": 0.4629280443233932, + "grad_norm": 0.3743357985436858, + "learning_rate": 9.910438654510316e-05, + "loss": 3.2799, + "step": 9943 + }, + { + "epoch": 0.46297460250948624, + "grad_norm": 0.41118576961492165, + "learning_rate": 9.910387608145775e-05, + "loss": 3.2312, + "step": 9944 + }, + { + "epoch": 0.4630211606955793, + "grad_norm": 0.3847063421789034, + "learning_rate": 9.91033654736973e-05, + "loss": 3.2683, + "step": 9945 + }, + { + "epoch": 0.46306771888167236, + "grad_norm": 0.3480023555023236, + "learning_rate": 9.910285472182331e-05, + "loss": 3.3561, + "step": 9946 + }, + { + "epoch": 0.4631142770677654, + "grad_norm": 0.3894761596868412, + "learning_rate": 9.910234382583727e-05, + "loss": 3.2564, + "step": 9947 + }, + { + "epoch": 0.4631608352538585, + "grad_norm": 0.4229386319173077, + "learning_rate": 9.91018327857407e-05, + "loss": 3.2931, + "step": 9948 + }, + { + "epoch": 0.4632073934399516, + "grad_norm": 0.44656861374219364, + "learning_rate": 9.910132160153507e-05, + "loss": 3.2308, + "step": 9949 + }, + { + "epoch": 0.46325395162604466, + "grad_norm": 0.40347311780742146, + "learning_rate": 9.910081027322191e-05, + "loss": 3.1522, + "step": 9950 + }, + { + "epoch": 0.4633005098121377, + "grad_norm": 0.36419587253100716, + "learning_rate": 9.910029880080271e-05, + "loss": 3.1992, + "step": 9951 + }, + { + "epoch": 0.4633470679982308, + "grad_norm": 0.37963406524461796, + "learning_rate": 9.909978718427898e-05, + "loss": 3.2855, + "step": 9952 + }, + { + "epoch": 0.46339362618432384, + "grad_norm": 0.3860502213837442, + "learning_rate": 9.909927542365221e-05, + "loss": 3.221, + "step": 9953 + }, + { + "epoch": 0.46344018437041695, + "grad_norm": 0.4115071195664317, + "learning_rate": 9.909876351892389e-05, + "loss": 3.2046, + "step": 9954 + }, + { + "epoch": 0.46348674255651, + "grad_norm": 0.3645193913121722, + "learning_rate": 9.909825147009555e-05, + "loss": 3.2117, + "step": 9955 + }, + { + "epoch": 0.46353330074260307, + "grad_norm": 0.4042989442198444, + "learning_rate": 9.909773927716866e-05, + "loss": 3.2235, + "step": 9956 + }, + { + "epoch": 0.46357985892869613, + "grad_norm": 0.391043737132137, + "learning_rate": 9.909722694014477e-05, + "loss": 3.1628, + "step": 9957 + }, + { + "epoch": 0.4636264171147892, + "grad_norm": 0.3655451350235586, + "learning_rate": 9.909671445902534e-05, + "loss": 3.2457, + "step": 9958 + }, + { + "epoch": 0.46367297530088225, + "grad_norm": 0.396875558386768, + "learning_rate": 9.909620183381187e-05, + "loss": 3.3478, + "step": 9959 + }, + { + "epoch": 0.46371953348697537, + "grad_norm": 0.3830869816566971, + "learning_rate": 9.909568906450592e-05, + "loss": 3.1674, + "step": 9960 + }, + { + "epoch": 0.4637660916730684, + "grad_norm": 0.40172273412669246, + "learning_rate": 9.909517615110894e-05, + "loss": 3.2855, + "step": 9961 + }, + { + "epoch": 0.4638126498591615, + "grad_norm": 0.4110687790545321, + "learning_rate": 9.909466309362245e-05, + "loss": 3.2525, + "step": 9962 + }, + { + "epoch": 0.46385920804525455, + "grad_norm": 0.4284297343463296, + "learning_rate": 9.909414989204798e-05, + "loss": 3.1825, + "step": 9963 + }, + { + "epoch": 
0.4639057662313476, + "grad_norm": 0.41082577296961814, + "learning_rate": 9.9093636546387e-05, + "loss": 3.152, + "step": 9964 + }, + { + "epoch": 0.4639523244174407, + "grad_norm": 0.3765367762672575, + "learning_rate": 9.909312305664104e-05, + "loss": 3.2826, + "step": 9965 + }, + { + "epoch": 0.4639988826035338, + "grad_norm": 0.43649624701670864, + "learning_rate": 9.90926094228116e-05, + "loss": 3.1865, + "step": 9966 + }, + { + "epoch": 0.46404544078962684, + "grad_norm": 0.3497813339204264, + "learning_rate": 9.909209564490018e-05, + "loss": 3.1893, + "step": 9967 + }, + { + "epoch": 0.4640919989757199, + "grad_norm": 0.41594050961357437, + "learning_rate": 9.90915817229083e-05, + "loss": 3.2782, + "step": 9968 + }, + { + "epoch": 0.46413855716181296, + "grad_norm": 0.41463620543396124, + "learning_rate": 9.909106765683748e-05, + "loss": 3.1984, + "step": 9969 + }, + { + "epoch": 0.464185115347906, + "grad_norm": 0.37589278596608283, + "learning_rate": 9.90905534466892e-05, + "loss": 3.1508, + "step": 9970 + }, + { + "epoch": 0.46423167353399913, + "grad_norm": 0.3928369318087263, + "learning_rate": 9.909003909246497e-05, + "loss": 3.3103, + "step": 9971 + }, + { + "epoch": 0.4642782317200922, + "grad_norm": 0.4224439496372195, + "learning_rate": 9.908952459416632e-05, + "loss": 3.1311, + "step": 9972 + }, + { + "epoch": 0.46432478990618525, + "grad_norm": 0.33585809437080244, + "learning_rate": 9.908900995179476e-05, + "loss": 3.3686, + "step": 9973 + }, + { + "epoch": 0.4643713480922783, + "grad_norm": 0.4149694726110101, + "learning_rate": 9.90884951653518e-05, + "loss": 3.2512, + "step": 9974 + }, + { + "epoch": 0.4644179062783714, + "grad_norm": 0.4435809831663201, + "learning_rate": 9.908798023483892e-05, + "loss": 3.2076, + "step": 9975 + }, + { + "epoch": 0.4644644644644645, + "grad_norm": 0.42544654011911165, + "learning_rate": 9.908746516025767e-05, + "loss": 3.174, + "step": 9976 + }, + { + "epoch": 0.46451102265055755, + "grad_norm": 0.3686611243381104, + "learning_rate": 9.908694994160952e-05, + "loss": 3.1549, + "step": 9977 + }, + { + "epoch": 0.4645575808366506, + "grad_norm": 0.4149323300580994, + "learning_rate": 9.908643457889604e-05, + "loss": 3.2575, + "step": 9978 + }, + { + "epoch": 0.46460413902274367, + "grad_norm": 0.3933414762745185, + "learning_rate": 9.908591907211868e-05, + "loss": 3.1044, + "step": 9979 + }, + { + "epoch": 0.4646506972088367, + "grad_norm": 0.34902129290866474, + "learning_rate": 9.908540342127901e-05, + "loss": 3.2817, + "step": 9980 + }, + { + "epoch": 0.4646972553949298, + "grad_norm": 0.43783435796878445, + "learning_rate": 9.908488762637851e-05, + "loss": 3.1975, + "step": 9981 + }, + { + "epoch": 0.4647438135810229, + "grad_norm": 0.42037627997665966, + "learning_rate": 9.908437168741869e-05, + "loss": 3.3536, + "step": 9982 + }, + { + "epoch": 0.46479037176711596, + "grad_norm": 0.4075127848245467, + "learning_rate": 9.908385560440108e-05, + "loss": 3.2893, + "step": 9983 + }, + { + "epoch": 0.464836929953209, + "grad_norm": 0.3805009867425962, + "learning_rate": 9.908333937732718e-05, + "loss": 3.3014, + "step": 9984 + }, + { + "epoch": 0.4648834881393021, + "grad_norm": 0.4076601813692967, + "learning_rate": 9.908282300619854e-05, + "loss": 3.174, + "step": 9985 + }, + { + "epoch": 0.46493004632539514, + "grad_norm": 0.3998661941707749, + "learning_rate": 9.908230649101663e-05, + "loss": 3.2493, + "step": 9986 + }, + { + "epoch": 0.46497660451148826, + "grad_norm": 0.43644743932193, + "learning_rate": 9.908178983178301e-05, + 
"loss": 3.2238, + "step": 9987 + }, + { + "epoch": 0.4650231626975813, + "grad_norm": 0.48434239280098457, + "learning_rate": 9.908127302849915e-05, + "loss": 3.2259, + "step": 9988 + }, + { + "epoch": 0.4650697208836744, + "grad_norm": 0.44655133562653776, + "learning_rate": 9.908075608116658e-05, + "loss": 3.27, + "step": 9989 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.39898315422012925, + "learning_rate": 9.908023898978684e-05, + "loss": 3.1949, + "step": 9990 + }, + { + "epoch": 0.4651628372558605, + "grad_norm": 0.43439845551275313, + "learning_rate": 9.907972175436143e-05, + "loss": 3.2579, + "step": 9991 + }, + { + "epoch": 0.46520939544195355, + "grad_norm": 0.40298619309774214, + "learning_rate": 9.907920437489187e-05, + "loss": 3.172, + "step": 9992 + }, + { + "epoch": 0.46525595362804667, + "grad_norm": 0.4336147436120527, + "learning_rate": 9.907868685137969e-05, + "loss": 3.2219, + "step": 9993 + }, + { + "epoch": 0.46530251181413973, + "grad_norm": 0.5118787688667926, + "learning_rate": 9.90781691838264e-05, + "loss": 3.1812, + "step": 9994 + }, + { + "epoch": 0.4653490700002328, + "grad_norm": 0.45079282000196935, + "learning_rate": 9.907765137223351e-05, + "loss": 3.2009, + "step": 9995 + }, + { + "epoch": 0.46539562818632585, + "grad_norm": 0.4038265374827733, + "learning_rate": 9.907713341660254e-05, + "loss": 3.3523, + "step": 9996 + }, + { + "epoch": 0.4654421863724189, + "grad_norm": 0.44299590462430083, + "learning_rate": 9.907661531693503e-05, + "loss": 3.1534, + "step": 9997 + }, + { + "epoch": 0.465488744558512, + "grad_norm": 0.44786465335722025, + "learning_rate": 9.90760970732325e-05, + "loss": 3.1635, + "step": 9998 + }, + { + "epoch": 0.4655353027446051, + "grad_norm": 0.41378305654765696, + "learning_rate": 9.907557868549644e-05, + "loss": 3.2175, + "step": 9999 + }, + { + "epoch": 0.46558186093069814, + "grad_norm": 0.46133654889270936, + "learning_rate": 9.907506015372839e-05, + "loss": 3.3725, + "step": 10000 + }, + { + "epoch": 0.4656284191167912, + "grad_norm": 0.46184550833386273, + "learning_rate": 9.907454147792988e-05, + "loss": 3.1867, + "step": 10001 + }, + { + "epoch": 0.46567497730288426, + "grad_norm": 0.4250798942357528, + "learning_rate": 9.907402265810244e-05, + "loss": 3.2657, + "step": 10002 + }, + { + "epoch": 0.4657215354889773, + "grad_norm": 0.463284331721197, + "learning_rate": 9.907350369424754e-05, + "loss": 3.3551, + "step": 10003 + }, + { + "epoch": 0.46576809367507044, + "grad_norm": 0.39784443238968215, + "learning_rate": 9.907298458636679e-05, + "loss": 3.1578, + "step": 10004 + }, + { + "epoch": 0.4658146518611635, + "grad_norm": 0.46955224458396055, + "learning_rate": 9.907246533446164e-05, + "loss": 3.2101, + "step": 10005 + }, + { + "epoch": 0.46586121004725656, + "grad_norm": 0.43321748780157243, + "learning_rate": 9.907194593853364e-05, + "loss": 3.0947, + "step": 10006 + }, + { + "epoch": 0.4659077682333496, + "grad_norm": 0.45379302604252925, + "learning_rate": 9.907142639858432e-05, + "loss": 3.2931, + "step": 10007 + }, + { + "epoch": 0.4659543264194427, + "grad_norm": 0.4207593300943141, + "learning_rate": 9.907090671461517e-05, + "loss": 3.223, + "step": 10008 + }, + { + "epoch": 0.4660008846055358, + "grad_norm": 0.465174416055888, + "learning_rate": 9.907038688662778e-05, + "loss": 3.2167, + "step": 10009 + }, + { + "epoch": 0.46604744279162885, + "grad_norm": 0.4634332828585014, + "learning_rate": 9.906986691462361e-05, + "loss": 3.2892, + "step": 10010 + }, + { + "epoch": 0.4660940009777219, + 
"grad_norm": 0.3747457695478588, + "learning_rate": 9.906934679860422e-05, + "loss": 3.2349, + "step": 10011 + }, + { + "epoch": 0.46614055916381497, + "grad_norm": 0.4445342534058926, + "learning_rate": 9.906882653857114e-05, + "loss": 3.161, + "step": 10012 + }, + { + "epoch": 0.46618711734990803, + "grad_norm": 0.420052431800046, + "learning_rate": 9.906830613452589e-05, + "loss": 3.3256, + "step": 10013 + }, + { + "epoch": 0.4662336755360011, + "grad_norm": 0.41756092132004546, + "learning_rate": 9.906778558647e-05, + "loss": 3.3629, + "step": 10014 + }, + { + "epoch": 0.4662802337220942, + "grad_norm": 0.4255438163952269, + "learning_rate": 9.906726489440499e-05, + "loss": 3.244, + "step": 10015 + }, + { + "epoch": 0.46632679190818727, + "grad_norm": 0.43716503521258665, + "learning_rate": 9.906674405833238e-05, + "loss": 3.2497, + "step": 10016 + }, + { + "epoch": 0.4663733500942803, + "grad_norm": 0.7340558174955163, + "learning_rate": 9.906622307825372e-05, + "loss": 3.2478, + "step": 10017 + }, + { + "epoch": 0.4664199082803734, + "grad_norm": 0.4002058917041204, + "learning_rate": 9.906570195417052e-05, + "loss": 3.2883, + "step": 10018 + }, + { + "epoch": 0.46646646646646645, + "grad_norm": 1.8583056898819186, + "learning_rate": 9.906518068608433e-05, + "loss": 3.2825, + "step": 10019 + }, + { + "epoch": 0.46651302465255956, + "grad_norm": 2.4908548806984565, + "learning_rate": 9.906465927399667e-05, + "loss": 3.2528, + "step": 10020 + }, + { + "epoch": 0.4665595828386526, + "grad_norm": 0.9245209654262255, + "learning_rate": 9.906413771790906e-05, + "loss": 3.1846, + "step": 10021 + }, + { + "epoch": 0.4666061410247457, + "grad_norm": 0.7090002317413484, + "learning_rate": 9.906361601782304e-05, + "loss": 3.2317, + "step": 10022 + }, + { + "epoch": 0.46665269921083874, + "grad_norm": 0.5180581883178983, + "learning_rate": 9.906309417374013e-05, + "loss": 3.1931, + "step": 10023 + }, + { + "epoch": 0.4666992573969318, + "grad_norm": 0.5534915003969079, + "learning_rate": 9.906257218566188e-05, + "loss": 3.2961, + "step": 10024 + }, + { + "epoch": 0.46674581558302486, + "grad_norm": 0.5388567383739922, + "learning_rate": 9.906205005358982e-05, + "loss": 3.1623, + "step": 10025 + }, + { + "epoch": 0.466792373769118, + "grad_norm": 0.489657277587599, + "learning_rate": 9.906152777752548e-05, + "loss": 3.2904, + "step": 10026 + }, + { + "epoch": 0.46683893195521103, + "grad_norm": 0.47898378871850317, + "learning_rate": 9.906100535747039e-05, + "loss": 3.17, + "step": 10027 + }, + { + "epoch": 0.4668854901413041, + "grad_norm": 0.43289722400639213, + "learning_rate": 9.906048279342606e-05, + "loss": 3.3094, + "step": 10028 + }, + { + "epoch": 0.46693204832739715, + "grad_norm": 0.47903673863452817, + "learning_rate": 9.905996008539407e-05, + "loss": 3.2079, + "step": 10029 + }, + { + "epoch": 0.4669786065134902, + "grad_norm": 0.3798732272261301, + "learning_rate": 9.905943723337592e-05, + "loss": 3.2329, + "step": 10030 + }, + { + "epoch": 0.46702516469958333, + "grad_norm": 0.4195972915697526, + "learning_rate": 9.905891423737317e-05, + "loss": 3.1662, + "step": 10031 + }, + { + "epoch": 0.4670717228856764, + "grad_norm": 0.40100024260126393, + "learning_rate": 9.905839109738733e-05, + "loss": 3.3299, + "step": 10032 + }, + { + "epoch": 0.46711828107176945, + "grad_norm": 0.46203405160929806, + "learning_rate": 9.905786781341993e-05, + "loss": 3.2577, + "step": 10033 + }, + { + "epoch": 0.4671648392578625, + "grad_norm": 0.3685705413913494, + "learning_rate": 
9.905734438547254e-05, + "loss": 3.162, + "step": 10034 + }, + { + "epoch": 0.46721139744395557, + "grad_norm": 0.43137131779277177, + "learning_rate": 9.905682081354666e-05, + "loss": 3.2528, + "step": 10035 + }, + { + "epoch": 0.4672579556300486, + "grad_norm": 0.3584973444012974, + "learning_rate": 9.905629709764386e-05, + "loss": 3.155, + "step": 10036 + }, + { + "epoch": 0.46730451381614174, + "grad_norm": 0.3865830159018833, + "learning_rate": 9.905577323776565e-05, + "loss": 3.2078, + "step": 10037 + }, + { + "epoch": 0.4673510720022348, + "grad_norm": 0.39124845537637404, + "learning_rate": 9.905524923391358e-05, + "loss": 3.2372, + "step": 10038 + }, + { + "epoch": 0.46739763018832786, + "grad_norm": 0.3910584000790865, + "learning_rate": 9.90547250860892e-05, + "loss": 3.2546, + "step": 10039 + }, + { + "epoch": 0.4674441883744209, + "grad_norm": 0.3920777615497515, + "learning_rate": 9.905420079429402e-05, + "loss": 3.3887, + "step": 10040 + }, + { + "epoch": 0.467490746560514, + "grad_norm": 0.3948663653894621, + "learning_rate": 9.90536763585296e-05, + "loss": 3.2325, + "step": 10041 + }, + { + "epoch": 0.4675373047466071, + "grad_norm": 0.3888156724585329, + "learning_rate": 9.905315177879749e-05, + "loss": 3.3065, + "step": 10042 + }, + { + "epoch": 0.46758386293270016, + "grad_norm": 0.4085182869710675, + "learning_rate": 9.905262705509918e-05, + "loss": 3.3266, + "step": 10043 + }, + { + "epoch": 0.4676304211187932, + "grad_norm": 0.37012738476459905, + "learning_rate": 9.905210218743626e-05, + "loss": 3.258, + "step": 10044 + }, + { + "epoch": 0.4676769793048863, + "grad_norm": 0.3578533617187415, + "learning_rate": 9.905157717581024e-05, + "loss": 3.25, + "step": 10045 + }, + { + "epoch": 0.46772353749097934, + "grad_norm": 0.41702451635814614, + "learning_rate": 9.905105202022268e-05, + "loss": 3.3595, + "step": 10046 + }, + { + "epoch": 0.4677700956770724, + "grad_norm": 0.3970760646161015, + "learning_rate": 9.905052672067511e-05, + "loss": 3.2527, + "step": 10047 + }, + { + "epoch": 0.4678166538631655, + "grad_norm": 0.4124269835264864, + "learning_rate": 9.905000127716908e-05, + "loss": 3.0787, + "step": 10048 + }, + { + "epoch": 0.46786321204925857, + "grad_norm": 0.45091647069665647, + "learning_rate": 9.904947568970612e-05, + "loss": 3.1818, + "step": 10049 + }, + { + "epoch": 0.46790977023535163, + "grad_norm": 0.4090369190421149, + "learning_rate": 9.90489499582878e-05, + "loss": 3.2246, + "step": 10050 + }, + { + "epoch": 0.4679563284214447, + "grad_norm": 0.3846124119479134, + "learning_rate": 9.904842408291564e-05, + "loss": 3.3418, + "step": 10051 + }, + { + "epoch": 0.46800288660753775, + "grad_norm": 0.4415350979434025, + "learning_rate": 9.904789806359119e-05, + "loss": 3.2272, + "step": 10052 + }, + { + "epoch": 0.46804944479363086, + "grad_norm": 0.3764091875208173, + "learning_rate": 9.904737190031598e-05, + "loss": 3.2, + "step": 10053 + }, + { + "epoch": 0.4680960029797239, + "grad_norm": 0.4099577487517025, + "learning_rate": 9.904684559309156e-05, + "loss": 3.2863, + "step": 10054 + }, + { + "epoch": 0.468142561165817, + "grad_norm": 0.42137727341812553, + "learning_rate": 9.90463191419195e-05, + "loss": 3.106, + "step": 10055 + }, + { + "epoch": 0.46818911935191004, + "grad_norm": 0.37530489938269435, + "learning_rate": 9.904579254680131e-05, + "loss": 3.1546, + "step": 10056 + }, + { + "epoch": 0.4682356775380031, + "grad_norm": 0.4202157062467255, + "learning_rate": 9.904526580773857e-05, + "loss": 3.261, + "step": 10057 + }, + { + "epoch": 
0.46828223572409616, + "grad_norm": 0.4570770575502102, + "learning_rate": 9.904473892473279e-05, + "loss": 3.2714, + "step": 10058 + }, + { + "epoch": 0.4683287939101893, + "grad_norm": 0.3996723427477667, + "learning_rate": 9.904421189778553e-05, + "loss": 3.1299, + "step": 10059 + }, + { + "epoch": 0.46837535209628234, + "grad_norm": 0.391413846271161, + "learning_rate": 9.904368472689833e-05, + "loss": 3.2589, + "step": 10060 + }, + { + "epoch": 0.4684219102823754, + "grad_norm": 0.4528287767382215, + "learning_rate": 9.904315741207278e-05, + "loss": 3.2284, + "step": 10061 + }, + { + "epoch": 0.46846846846846846, + "grad_norm": 0.35377486588203744, + "learning_rate": 9.904262995331036e-05, + "loss": 3.3038, + "step": 10062 + }, + { + "epoch": 0.4685150266545615, + "grad_norm": 0.46826041917570627, + "learning_rate": 9.904210235061266e-05, + "loss": 3.1569, + "step": 10063 + }, + { + "epoch": 0.46856158484065463, + "grad_norm": 0.3998882435654217, + "learning_rate": 9.904157460398125e-05, + "loss": 3.208, + "step": 10064 + }, + { + "epoch": 0.4686081430267477, + "grad_norm": 0.4052235430814798, + "learning_rate": 9.904104671341763e-05, + "loss": 3.1802, + "step": 10065 + }, + { + "epoch": 0.46865470121284075, + "grad_norm": 0.4917337288054075, + "learning_rate": 9.904051867892337e-05, + "loss": 3.2746, + "step": 10066 + }, + { + "epoch": 0.4687012593989338, + "grad_norm": 0.48200213664329983, + "learning_rate": 9.903999050050002e-05, + "loss": 3.3513, + "step": 10067 + }, + { + "epoch": 0.46874781758502687, + "grad_norm": 0.4232054243260905, + "learning_rate": 9.903946217814912e-05, + "loss": 3.2249, + "step": 10068 + }, + { + "epoch": 0.46879437577111993, + "grad_norm": 0.4707037849470847, + "learning_rate": 9.903893371187223e-05, + "loss": 3.1471, + "step": 10069 + }, + { + "epoch": 0.46884093395721305, + "grad_norm": 0.42708331891634055, + "learning_rate": 9.90384051016709e-05, + "loss": 3.3205, + "step": 10070 + }, + { + "epoch": 0.4688874921433061, + "grad_norm": 0.46377625187732435, + "learning_rate": 9.90378763475467e-05, + "loss": 3.2684, + "step": 10071 + }, + { + "epoch": 0.46893405032939917, + "grad_norm": 0.4659983730933378, + "learning_rate": 9.903734744950114e-05, + "loss": 3.2979, + "step": 10072 + }, + { + "epoch": 0.4689806085154922, + "grad_norm": 0.3781386227559319, + "learning_rate": 9.903681840753582e-05, + "loss": 3.2488, + "step": 10073 + }, + { + "epoch": 0.4690271667015853, + "grad_norm": 0.3940744370723837, + "learning_rate": 9.903628922165226e-05, + "loss": 3.1303, + "step": 10074 + }, + { + "epoch": 0.4690737248876784, + "grad_norm": 0.41951544729051354, + "learning_rate": 9.903575989185201e-05, + "loss": 3.1843, + "step": 10075 + }, + { + "epoch": 0.46912028307377146, + "grad_norm": 0.35875656252017574, + "learning_rate": 9.903523041813665e-05, + "loss": 3.2503, + "step": 10076 + }, + { + "epoch": 0.4691668412598645, + "grad_norm": 0.42468413008894357, + "learning_rate": 9.903470080050771e-05, + "loss": 3.3101, + "step": 10077 + }, + { + "epoch": 0.4692133994459576, + "grad_norm": 0.3863659746755122, + "learning_rate": 9.903417103896678e-05, + "loss": 3.3125, + "step": 10078 + }, + { + "epoch": 0.46925995763205064, + "grad_norm": 0.41538858981546767, + "learning_rate": 9.903364113351536e-05, + "loss": 3.2203, + "step": 10079 + }, + { + "epoch": 0.4693065158181437, + "grad_norm": 0.4314934672246193, + "learning_rate": 9.903311108415504e-05, + "loss": 3.2934, + "step": 10080 + }, + { + "epoch": 0.4693530740042368, + "grad_norm": 0.440674981515934, + 
"learning_rate": 9.903258089088738e-05, + "loss": 3.2065, + "step": 10081 + }, + { + "epoch": 0.4693996321903299, + "grad_norm": 0.3660783472213236, + "learning_rate": 9.903205055371391e-05, + "loss": 3.1664, + "step": 10082 + }, + { + "epoch": 0.46944619037642293, + "grad_norm": 0.4170548463005344, + "learning_rate": 9.903152007263621e-05, + "loss": 3.2662, + "step": 10083 + }, + { + "epoch": 0.469492748562516, + "grad_norm": 0.3584477765527125, + "learning_rate": 9.903098944765583e-05, + "loss": 3.1622, + "step": 10084 + }, + { + "epoch": 0.46953930674860905, + "grad_norm": 0.3780970577459923, + "learning_rate": 9.903045867877432e-05, + "loss": 3.0956, + "step": 10085 + }, + { + "epoch": 0.46958586493470217, + "grad_norm": 0.4191487711275198, + "learning_rate": 9.902992776599325e-05, + "loss": 3.207, + "step": 10086 + }, + { + "epoch": 0.46963242312079523, + "grad_norm": 0.40618882795340966, + "learning_rate": 9.902939670931417e-05, + "loss": 3.1742, + "step": 10087 + }, + { + "epoch": 0.4696789813068883, + "grad_norm": 0.4220372325289262, + "learning_rate": 9.902886550873864e-05, + "loss": 3.2522, + "step": 10088 + }, + { + "epoch": 0.46972553949298135, + "grad_norm": 0.41565407055325254, + "learning_rate": 9.902833416426822e-05, + "loss": 3.2759, + "step": 10089 + }, + { + "epoch": 0.4697720976790744, + "grad_norm": 0.373736391610541, + "learning_rate": 9.902780267590447e-05, + "loss": 3.245, + "step": 10090 + }, + { + "epoch": 0.46981865586516747, + "grad_norm": 0.4507221897873291, + "learning_rate": 9.902727104364895e-05, + "loss": 3.2362, + "step": 10091 + }, + { + "epoch": 0.4698652140512606, + "grad_norm": 0.4203309437924298, + "learning_rate": 9.902673926750322e-05, + "loss": 3.2518, + "step": 10092 + }, + { + "epoch": 0.46991177223735364, + "grad_norm": 0.41428964065818236, + "learning_rate": 9.902620734746884e-05, + "loss": 3.2647, + "step": 10093 + }, + { + "epoch": 0.4699583304234467, + "grad_norm": 0.3683166379418432, + "learning_rate": 9.902567528354738e-05, + "loss": 3.1866, + "step": 10094 + }, + { + "epoch": 0.47000488860953976, + "grad_norm": 0.4614034825900483, + "learning_rate": 9.902514307574038e-05, + "loss": 3.2851, + "step": 10095 + }, + { + "epoch": 0.4700514467956328, + "grad_norm": 0.38004845888088207, + "learning_rate": 9.902461072404941e-05, + "loss": 3.287, + "step": 10096 + }, + { + "epoch": 0.47009800498172594, + "grad_norm": 0.39380958933815585, + "learning_rate": 9.902407822847606e-05, + "loss": 3.2061, + "step": 10097 + }, + { + "epoch": 0.470144563167819, + "grad_norm": 0.39726513764188287, + "learning_rate": 9.902354558902185e-05, + "loss": 3.2122, + "step": 10098 + }, + { + "epoch": 0.47019112135391206, + "grad_norm": 0.395131124033526, + "learning_rate": 9.902301280568837e-05, + "loss": 3.2437, + "step": 10099 + }, + { + "epoch": 0.4702376795400051, + "grad_norm": 0.38119347666904585, + "learning_rate": 9.902247987847717e-05, + "loss": 3.0959, + "step": 10100 + }, + { + "epoch": 0.4702842377260982, + "grad_norm": 0.381531332458288, + "learning_rate": 9.902194680738983e-05, + "loss": 3.2307, + "step": 10101 + }, + { + "epoch": 0.47033079591219124, + "grad_norm": 0.4088550716321821, + "learning_rate": 9.90214135924279e-05, + "loss": 3.2707, + "step": 10102 + }, + { + "epoch": 0.47037735409828435, + "grad_norm": 0.39693681989110796, + "learning_rate": 9.902088023359295e-05, + "loss": 3.2027, + "step": 10103 + }, + { + "epoch": 0.4704239122843774, + "grad_norm": 0.35439380880038995, + "learning_rate": 9.902034673088655e-05, + "loss": 3.1442, + "step": 
10104 + }, + { + "epoch": 0.47047047047047047, + "grad_norm": 0.37154541181313927, + "learning_rate": 9.901981308431027e-05, + "loss": 3.1916, + "step": 10105 + }, + { + "epoch": 0.47051702865656353, + "grad_norm": 0.3650138076336354, + "learning_rate": 9.901927929386566e-05, + "loss": 3.2195, + "step": 10106 + }, + { + "epoch": 0.4705635868426566, + "grad_norm": 0.3610596706854128, + "learning_rate": 9.901874535955429e-05, + "loss": 3.2598, + "step": 10107 + }, + { + "epoch": 0.4706101450287497, + "grad_norm": 0.40904533426815737, + "learning_rate": 9.901821128137773e-05, + "loss": 3.2199, + "step": 10108 + }, + { + "epoch": 0.47065670321484276, + "grad_norm": 0.38043510869598923, + "learning_rate": 9.901767705933755e-05, + "loss": 3.1981, + "step": 10109 + }, + { + "epoch": 0.4707032614009358, + "grad_norm": 0.37679588414560444, + "learning_rate": 9.901714269343533e-05, + "loss": 3.235, + "step": 10110 + }, + { + "epoch": 0.4707498195870289, + "grad_norm": 0.40374467114730384, + "learning_rate": 9.901660818367261e-05, + "loss": 3.2504, + "step": 10111 + }, + { + "epoch": 0.47079637777312194, + "grad_norm": 0.3564559066375413, + "learning_rate": 9.901607353005097e-05, + "loss": 3.2587, + "step": 10112 + }, + { + "epoch": 0.470842935959215, + "grad_norm": 0.4148431924664489, + "learning_rate": 9.901553873257201e-05, + "loss": 3.1388, + "step": 10113 + }, + { + "epoch": 0.4708894941453081, + "grad_norm": 0.4083234077866481, + "learning_rate": 9.901500379123725e-05, + "loss": 3.2377, + "step": 10114 + }, + { + "epoch": 0.4709360523314012, + "grad_norm": 0.4101102587268791, + "learning_rate": 9.90144687060483e-05, + "loss": 3.3542, + "step": 10115 + }, + { + "epoch": 0.47098261051749424, + "grad_norm": 0.3940630313573499, + "learning_rate": 9.90139334770067e-05, + "loss": 3.2308, + "step": 10116 + }, + { + "epoch": 0.4710291687035873, + "grad_norm": 0.4076124271820626, + "learning_rate": 9.901339810411403e-05, + "loss": 3.2984, + "step": 10117 + }, + { + "epoch": 0.47107572688968036, + "grad_norm": 0.3794667007059312, + "learning_rate": 9.901286258737187e-05, + "loss": 3.1729, + "step": 10118 + }, + { + "epoch": 0.4711222850757735, + "grad_norm": 0.3810238634755365, + "learning_rate": 9.901232692678178e-05, + "loss": 3.1919, + "step": 10119 + }, + { + "epoch": 0.47116884326186653, + "grad_norm": 0.42270396862744325, + "learning_rate": 9.901179112234536e-05, + "loss": 3.3317, + "step": 10120 + }, + { + "epoch": 0.4712154014479596, + "grad_norm": 0.41468852642002635, + "learning_rate": 9.901125517406413e-05, + "loss": 3.1702, + "step": 10121 + }, + { + "epoch": 0.47126195963405265, + "grad_norm": 0.37606216692513095, + "learning_rate": 9.901071908193973e-05, + "loss": 3.2918, + "step": 10122 + }, + { + "epoch": 0.4713085178201457, + "grad_norm": 0.4291107129710802, + "learning_rate": 9.901018284597368e-05, + "loss": 3.2196, + "step": 10123 + }, + { + "epoch": 0.47135507600623877, + "grad_norm": 0.4549573331359496, + "learning_rate": 9.900964646616757e-05, + "loss": 3.1553, + "step": 10124 + }, + { + "epoch": 0.4714016341923319, + "grad_norm": 0.45753276095235995, + "learning_rate": 9.900910994252297e-05, + "loss": 3.2146, + "step": 10125 + }, + { + "epoch": 0.47144819237842495, + "grad_norm": 0.433711582340463, + "learning_rate": 9.900857327504148e-05, + "loss": 3.3449, + "step": 10126 + }, + { + "epoch": 0.471494750564518, + "grad_norm": 0.46885979567514197, + "learning_rate": 9.900803646372463e-05, + "loss": 3.1564, + "step": 10127 + }, + { + "epoch": 0.47154130875061107, + "grad_norm": 
0.3858832439051668, + "learning_rate": 9.900749950857404e-05, + "loss": 3.1356, + "step": 10128 + }, + { + "epoch": 0.4715878669367041, + "grad_norm": 0.43748033753488186, + "learning_rate": 9.900696240959126e-05, + "loss": 3.1644, + "step": 10129 + }, + { + "epoch": 0.47163442512279724, + "grad_norm": 0.392943646570248, + "learning_rate": 9.900642516677788e-05, + "loss": 3.1962, + "step": 10130 + }, + { + "epoch": 0.4716809833088903, + "grad_norm": 0.45605597588834346, + "learning_rate": 9.900588778013546e-05, + "loss": 3.2548, + "step": 10131 + }, + { + "epoch": 0.47172754149498336, + "grad_norm": 0.37888624155367395, + "learning_rate": 9.900535024966559e-05, + "loss": 3.2598, + "step": 10132 + }, + { + "epoch": 0.4717740996810764, + "grad_norm": 0.40738644405617225, + "learning_rate": 9.900481257536984e-05, + "loss": 3.0757, + "step": 10133 + }, + { + "epoch": 0.4718206578671695, + "grad_norm": 0.3784692557311683, + "learning_rate": 9.90042747572498e-05, + "loss": 3.234, + "step": 10134 + }, + { + "epoch": 0.47186721605326254, + "grad_norm": 0.4412240784142135, + "learning_rate": 9.900373679530706e-05, + "loss": 3.2135, + "step": 10135 + }, + { + "epoch": 0.47191377423935565, + "grad_norm": 0.4357794521226776, + "learning_rate": 9.900319868954316e-05, + "loss": 3.171, + "step": 10136 + }, + { + "epoch": 0.4719603324254487, + "grad_norm": 0.3926081610980638, + "learning_rate": 9.900266043995969e-05, + "loss": 3.226, + "step": 10137 + }, + { + "epoch": 0.4720068906115418, + "grad_norm": 0.40582287531003824, + "learning_rate": 9.900212204655826e-05, + "loss": 3.2248, + "step": 10138 + }, + { + "epoch": 0.47205344879763483, + "grad_norm": 0.42388768700202, + "learning_rate": 9.900158350934041e-05, + "loss": 3.2944, + "step": 10139 + }, + { + "epoch": 0.4721000069837279, + "grad_norm": 0.4436986651076505, + "learning_rate": 9.900104482830776e-05, + "loss": 3.3431, + "step": 10140 + }, + { + "epoch": 0.472146565169821, + "grad_norm": 0.4186522978291408, + "learning_rate": 9.900050600346186e-05, + "loss": 3.2115, + "step": 10141 + }, + { + "epoch": 0.47219312335591407, + "grad_norm": 0.4416848091576645, + "learning_rate": 9.89999670348043e-05, + "loss": 3.2473, + "step": 10142 + }, + { + "epoch": 0.47223968154200713, + "grad_norm": 0.43913900114987214, + "learning_rate": 9.899942792233666e-05, + "loss": 3.2355, + "step": 10143 + }, + { + "epoch": 0.4722862397281002, + "grad_norm": 0.4471632334426819, + "learning_rate": 9.899888866606053e-05, + "loss": 3.1486, + "step": 10144 + }, + { + "epoch": 0.47233279791419325, + "grad_norm": 0.41461837130807794, + "learning_rate": 9.89983492659775e-05, + "loss": 3.2418, + "step": 10145 + }, + { + "epoch": 0.4723793561002863, + "grad_norm": 0.47920098781786996, + "learning_rate": 9.899780972208913e-05, + "loss": 3.1944, + "step": 10146 + }, + { + "epoch": 0.4724259142863794, + "grad_norm": 0.3970431699628999, + "learning_rate": 9.899727003439703e-05, + "loss": 3.2431, + "step": 10147 + }, + { + "epoch": 0.4724724724724725, + "grad_norm": 0.43659524552646345, + "learning_rate": 9.899673020290274e-05, + "loss": 3.2037, + "step": 10148 + }, + { + "epoch": 0.47251903065856554, + "grad_norm": 0.3993201491844032, + "learning_rate": 9.899619022760792e-05, + "loss": 3.221, + "step": 10149 + }, + { + "epoch": 0.4725655888446586, + "grad_norm": 0.41793931028407266, + "learning_rate": 9.899565010851406e-05, + "loss": 3.286, + "step": 10150 + }, + { + "epoch": 0.47261214703075166, + "grad_norm": 0.39947080455692585, + "learning_rate": 9.899510984562282e-05, + "loss": 
3.2031, + "step": 10151 + }, + { + "epoch": 0.4726587052168448, + "grad_norm": 0.38483704490486076, + "learning_rate": 9.899456943893575e-05, + "loss": 3.1844, + "step": 10152 + }, + { + "epoch": 0.47270526340293784, + "grad_norm": 0.36203225357950836, + "learning_rate": 9.899402888845446e-05, + "loss": 3.241, + "step": 10153 + }, + { + "epoch": 0.4727518215890309, + "grad_norm": 0.42837986885983936, + "learning_rate": 9.899348819418051e-05, + "loss": 3.2611, + "step": 10154 + }, + { + "epoch": 0.47279837977512396, + "grad_norm": 0.46617208377970776, + "learning_rate": 9.89929473561155e-05, + "loss": 3.293, + "step": 10155 + }, + { + "epoch": 0.472844937961217, + "grad_norm": 0.41222840851719417, + "learning_rate": 9.899240637426101e-05, + "loss": 3.2326, + "step": 10156 + }, + { + "epoch": 0.4728914961473101, + "grad_norm": 0.4223373498118147, + "learning_rate": 9.899186524861865e-05, + "loss": 3.1318, + "step": 10157 + }, + { + "epoch": 0.4729380543334032, + "grad_norm": 0.4224387519641885, + "learning_rate": 9.899132397918998e-05, + "loss": 3.2285, + "step": 10158 + }, + { + "epoch": 0.47298461251949625, + "grad_norm": 0.36969320339123335, + "learning_rate": 9.89907825659766e-05, + "loss": 3.2785, + "step": 10159 + }, + { + "epoch": 0.4730311707055893, + "grad_norm": 0.4685032818659747, + "learning_rate": 9.89902410089801e-05, + "loss": 3.2996, + "step": 10160 + }, + { + "epoch": 0.47307772889168237, + "grad_norm": 0.5529865686766084, + "learning_rate": 9.898969930820208e-05, + "loss": 3.0961, + "step": 10161 + }, + { + "epoch": 0.47312428707777543, + "grad_norm": 0.4140480256396695, + "learning_rate": 9.89891574636441e-05, + "loss": 3.2453, + "step": 10162 + }, + { + "epoch": 0.47317084526386854, + "grad_norm": 0.4786051369841238, + "learning_rate": 9.898861547530779e-05, + "loss": 3.3022, + "step": 10163 + }, + { + "epoch": 0.4732174034499616, + "grad_norm": 0.4655783118150692, + "learning_rate": 9.898807334319471e-05, + "loss": 3.26, + "step": 10164 + }, + { + "epoch": 0.47326396163605466, + "grad_norm": 0.3958648814543042, + "learning_rate": 9.898753106730644e-05, + "loss": 3.3207, + "step": 10165 + }, + { + "epoch": 0.4733105198221477, + "grad_norm": 0.43671101100283943, + "learning_rate": 9.898698864764463e-05, + "loss": 3.2412, + "step": 10166 + }, + { + "epoch": 0.4733570780082408, + "grad_norm": 0.4650583688801811, + "learning_rate": 9.898644608421082e-05, + "loss": 3.2255, + "step": 10167 + }, + { + "epoch": 0.47340363619433384, + "grad_norm": 0.44827210216834407, + "learning_rate": 9.898590337700661e-05, + "loss": 3.4137, + "step": 10168 + }, + { + "epoch": 0.47345019438042696, + "grad_norm": 0.38601009922347423, + "learning_rate": 9.89853605260336e-05, + "loss": 3.2791, + "step": 10169 + }, + { + "epoch": 0.47349675256652, + "grad_norm": 0.41112273618719986, + "learning_rate": 9.898481753129338e-05, + "loss": 3.2206, + "step": 10170 + }, + { + "epoch": 0.4735433107526131, + "grad_norm": 0.43592030843635265, + "learning_rate": 9.898427439278757e-05, + "loss": 3.3505, + "step": 10171 + }, + { + "epoch": 0.47358986893870614, + "grad_norm": 0.4127691685055358, + "learning_rate": 9.89837311105177e-05, + "loss": 3.1231, + "step": 10172 + }, + { + "epoch": 0.4736364271247992, + "grad_norm": 0.4454919459568015, + "learning_rate": 9.898318768448545e-05, + "loss": 3.2242, + "step": 10173 + }, + { + "epoch": 0.4736829853108923, + "grad_norm": 0.39380448303062954, + "learning_rate": 9.898264411469234e-05, + "loss": 3.2871, + "step": 10174 + }, + { + "epoch": 0.4737295434969854, + 
"grad_norm": 0.38142138805385656, + "learning_rate": 9.898210040113999e-05, + "loss": 3.1071, + "step": 10175 + }, + { + "epoch": 0.47377610168307843, + "grad_norm": 0.42451853606847983, + "learning_rate": 9.898155654383002e-05, + "loss": 3.097, + "step": 10176 + }, + { + "epoch": 0.4738226598691715, + "grad_norm": 0.3421851935903349, + "learning_rate": 9.898101254276401e-05, + "loss": 3.2826, + "step": 10177 + }, + { + "epoch": 0.47386921805526455, + "grad_norm": 0.43166560241063484, + "learning_rate": 9.898046839794354e-05, + "loss": 3.2398, + "step": 10178 + }, + { + "epoch": 0.4739157762413576, + "grad_norm": 0.41231026066532633, + "learning_rate": 9.897992410937023e-05, + "loss": 3.1904, + "step": 10179 + }, + { + "epoch": 0.4739623344274507, + "grad_norm": 0.3747232261264039, + "learning_rate": 9.897937967704566e-05, + "loss": 3.1406, + "step": 10180 + }, + { + "epoch": 0.4740088926135438, + "grad_norm": 0.40884362635644883, + "learning_rate": 9.897883510097143e-05, + "loss": 3.2077, + "step": 10181 + }, + { + "epoch": 0.47405545079963685, + "grad_norm": 0.43386137184498097, + "learning_rate": 9.897829038114917e-05, + "loss": 3.2336, + "step": 10182 + }, + { + "epoch": 0.4741020089857299, + "grad_norm": 0.38572094004271773, + "learning_rate": 9.897774551758042e-05, + "loss": 3.2798, + "step": 10183 + }, + { + "epoch": 0.47414856717182297, + "grad_norm": 0.371447038003538, + "learning_rate": 9.897720051026684e-05, + "loss": 3.2794, + "step": 10184 + }, + { + "epoch": 0.4741951253579161, + "grad_norm": 0.37908564911271153, + "learning_rate": 9.897665535920998e-05, + "loss": 3.1898, + "step": 10185 + }, + { + "epoch": 0.47424168354400914, + "grad_norm": 0.44513143366010016, + "learning_rate": 9.897611006441148e-05, + "loss": 3.3324, + "step": 10186 + }, + { + "epoch": 0.4742882417301022, + "grad_norm": 0.4118293218069476, + "learning_rate": 9.89755646258729e-05, + "loss": 3.1983, + "step": 10187 + }, + { + "epoch": 0.47433479991619526, + "grad_norm": 0.4298921908102085, + "learning_rate": 9.897501904359588e-05, + "loss": 3.2514, + "step": 10188 + }, + { + "epoch": 0.4743813581022883, + "grad_norm": 0.408494540674427, + "learning_rate": 9.8974473317582e-05, + "loss": 3.3249, + "step": 10189 + }, + { + "epoch": 0.4744279162883814, + "grad_norm": 0.42014399212870757, + "learning_rate": 9.897392744783288e-05, + "loss": 3.2134, + "step": 10190 + }, + { + "epoch": 0.4744744744744745, + "grad_norm": 0.4233771662204764, + "learning_rate": 9.897338143435008e-05, + "loss": 3.193, + "step": 10191 + }, + { + "epoch": 0.47452103266056755, + "grad_norm": 0.3884074492653109, + "learning_rate": 9.897283527713523e-05, + "loss": 3.335, + "step": 10192 + }, + { + "epoch": 0.4745675908466606, + "grad_norm": 0.40638488455933824, + "learning_rate": 9.897228897618996e-05, + "loss": 3.2542, + "step": 10193 + }, + { + "epoch": 0.4746141490327537, + "grad_norm": 0.44227477013122507, + "learning_rate": 9.897174253151583e-05, + "loss": 3.2205, + "step": 10194 + }, + { + "epoch": 0.47466070721884673, + "grad_norm": 0.4165910208784493, + "learning_rate": 9.897119594311446e-05, + "loss": 3.3577, + "step": 10195 + }, + { + "epoch": 0.47470726540493985, + "grad_norm": 0.4321926494273374, + "learning_rate": 9.897064921098747e-05, + "loss": 3.2391, + "step": 10196 + }, + { + "epoch": 0.4747538235910329, + "grad_norm": 0.4638628144610435, + "learning_rate": 9.897010233513643e-05, + "loss": 3.2418, + "step": 10197 + }, + { + "epoch": 0.47480038177712597, + "grad_norm": 0.4229508748600982, + "learning_rate": 
9.896955531556296e-05, + "loss": 3.3053, + "step": 10198 + }, + { + "epoch": 0.47484693996321903, + "grad_norm": 0.40594850216010603, + "learning_rate": 9.896900815226869e-05, + "loss": 3.2717, + "step": 10199 + }, + { + "epoch": 0.4748934981493121, + "grad_norm": 0.430675425617065, + "learning_rate": 9.896846084525519e-05, + "loss": 3.2814, + "step": 10200 + }, + { + "epoch": 0.47494005633540515, + "grad_norm": 0.417737644028806, + "learning_rate": 9.89679133945241e-05, + "loss": 3.24, + "step": 10201 + }, + { + "epoch": 0.47498661452149826, + "grad_norm": 0.40501160697067345, + "learning_rate": 9.896736580007699e-05, + "loss": 3.1757, + "step": 10202 + }, + { + "epoch": 0.4750331727075913, + "grad_norm": 0.46433087539441403, + "learning_rate": 9.896681806191548e-05, + "loss": 3.1485, + "step": 10203 + }, + { + "epoch": 0.4750797308936844, + "grad_norm": 0.39749717535430706, + "learning_rate": 9.896627018004118e-05, + "loss": 3.3056, + "step": 10204 + }, + { + "epoch": 0.47512628907977744, + "grad_norm": 0.3965731188672466, + "learning_rate": 9.896572215445573e-05, + "loss": 3.2469, + "step": 10205 + }, + { + "epoch": 0.4751728472658705, + "grad_norm": 0.46896558240642316, + "learning_rate": 9.89651739851607e-05, + "loss": 3.2616, + "step": 10206 + }, + { + "epoch": 0.4752194054519636, + "grad_norm": 0.3824667723630606, + "learning_rate": 9.89646256721577e-05, + "loss": 3.2186, + "step": 10207 + }, + { + "epoch": 0.4752659636380567, + "grad_norm": 0.47782148072275493, + "learning_rate": 9.896407721544833e-05, + "loss": 3.242, + "step": 10208 + }, + { + "epoch": 0.47531252182414974, + "grad_norm": 0.4028669852079394, + "learning_rate": 9.896352861503425e-05, + "loss": 3.2514, + "step": 10209 + }, + { + "epoch": 0.4753590800102428, + "grad_norm": 0.43191170207272267, + "learning_rate": 9.896297987091702e-05, + "loss": 3.149, + "step": 10210 + }, + { + "epoch": 0.47540563819633586, + "grad_norm": 0.44140332979172714, + "learning_rate": 9.896243098309828e-05, + "loss": 3.3461, + "step": 10211 + }, + { + "epoch": 0.4754521963824289, + "grad_norm": 0.4539110571893802, + "learning_rate": 9.896188195157962e-05, + "loss": 3.2589, + "step": 10212 + }, + { + "epoch": 0.47549875456852203, + "grad_norm": 0.3408032331932217, + "learning_rate": 9.896133277636266e-05, + "loss": 3.1918, + "step": 10213 + }, + { + "epoch": 0.4755453127546151, + "grad_norm": 0.39154291351276727, + "learning_rate": 9.896078345744901e-05, + "loss": 3.2618, + "step": 10214 + }, + { + "epoch": 0.47559187094070815, + "grad_norm": 0.3502535020271717, + "learning_rate": 9.89602339948403e-05, + "loss": 3.1232, + "step": 10215 + }, + { + "epoch": 0.4756384291268012, + "grad_norm": 0.41999368474759835, + "learning_rate": 9.895968438853808e-05, + "loss": 3.1787, + "step": 10216 + }, + { + "epoch": 0.47568498731289427, + "grad_norm": 0.4065296450188102, + "learning_rate": 9.895913463854406e-05, + "loss": 3.2621, + "step": 10217 + }, + { + "epoch": 0.4757315454989874, + "grad_norm": 0.4007083895958772, + "learning_rate": 9.895858474485979e-05, + "loss": 3.2217, + "step": 10218 + }, + { + "epoch": 0.47577810368508044, + "grad_norm": 0.42974157969315674, + "learning_rate": 9.895803470748688e-05, + "loss": 3.1563, + "step": 10219 + }, + { + "epoch": 0.4758246618711735, + "grad_norm": 0.36491991999898055, + "learning_rate": 9.895748452642698e-05, + "loss": 3.1816, + "step": 10220 + }, + { + "epoch": 0.47587122005726656, + "grad_norm": 0.4137659876149811, + "learning_rate": 9.895693420168169e-05, + "loss": 3.2017, + "step": 10221 + }, + { + 
"epoch": 0.4759177782433596, + "grad_norm": 0.3823766636754369, + "learning_rate": 9.89563837332526e-05, + "loss": 3.1629, + "step": 10222 + }, + { + "epoch": 0.4759643364294527, + "grad_norm": 0.3999329432758259, + "learning_rate": 9.895583312114136e-05, + "loss": 3.2598, + "step": 10223 + }, + { + "epoch": 0.4760108946155458, + "grad_norm": 0.42679840076012626, + "learning_rate": 9.895528236534957e-05, + "loss": 3.1573, + "step": 10224 + }, + { + "epoch": 0.47605745280163886, + "grad_norm": 0.4223364293576512, + "learning_rate": 9.895473146587885e-05, + "loss": 3.3287, + "step": 10225 + }, + { + "epoch": 0.4761040109877319, + "grad_norm": 0.39903379880855033, + "learning_rate": 9.895418042273081e-05, + "loss": 3.1566, + "step": 10226 + }, + { + "epoch": 0.476150569173825, + "grad_norm": 0.3961020914873514, + "learning_rate": 9.895362923590707e-05, + "loss": 3.1663, + "step": 10227 + }, + { + "epoch": 0.47619712735991804, + "grad_norm": 0.39388266571552455, + "learning_rate": 9.895307790540927e-05, + "loss": 3.2138, + "step": 10228 + }, + { + "epoch": 0.47624368554601115, + "grad_norm": 0.38261480991953567, + "learning_rate": 9.8952526431239e-05, + "loss": 3.2683, + "step": 10229 + }, + { + "epoch": 0.4762902437321042, + "grad_norm": 0.4645680627141623, + "learning_rate": 9.895197481339789e-05, + "loss": 3.2306, + "step": 10230 + }, + { + "epoch": 0.4763368019181973, + "grad_norm": 0.4752553669947222, + "learning_rate": 9.895142305188754e-05, + "loss": 3.2721, + "step": 10231 + }, + { + "epoch": 0.47638336010429033, + "grad_norm": 0.4149119691418024, + "learning_rate": 9.89508711467096e-05, + "loss": 3.2295, + "step": 10232 + }, + { + "epoch": 0.4764299182903834, + "grad_norm": 0.44036857711512856, + "learning_rate": 9.895031909786568e-05, + "loss": 3.3514, + "step": 10233 + }, + { + "epoch": 0.47647647647647645, + "grad_norm": 0.3908566171940763, + "learning_rate": 9.894976690535739e-05, + "loss": 3.2136, + "step": 10234 + }, + { + "epoch": 0.47652303466256957, + "grad_norm": 0.39072751441811776, + "learning_rate": 9.894921456918636e-05, + "loss": 3.2162, + "step": 10235 + }, + { + "epoch": 0.4765695928486626, + "grad_norm": 0.4267257208633291, + "learning_rate": 9.894866208935421e-05, + "loss": 3.2145, + "step": 10236 + }, + { + "epoch": 0.4766161510347557, + "grad_norm": 0.4115693382303302, + "learning_rate": 9.894810946586255e-05, + "loss": 3.1761, + "step": 10237 + }, + { + "epoch": 0.47666270922084875, + "grad_norm": 0.40184454837472716, + "learning_rate": 9.8947556698713e-05, + "loss": 3.2944, + "step": 10238 + }, + { + "epoch": 0.4767092674069418, + "grad_norm": 0.37198834496702793, + "learning_rate": 9.894700378790722e-05, + "loss": 3.2509, + "step": 10239 + }, + { + "epoch": 0.4767558255930349, + "grad_norm": 0.35714139836989073, + "learning_rate": 9.89464507334468e-05, + "loss": 3.2485, + "step": 10240 + }, + { + "epoch": 0.476802383779128, + "grad_norm": 0.38126924747104296, + "learning_rate": 9.894589753533334e-05, + "loss": 3.1887, + "step": 10241 + }, + { + "epoch": 0.47684894196522104, + "grad_norm": 0.3832011331333408, + "learning_rate": 9.894534419356853e-05, + "loss": 3.211, + "step": 10242 + }, + { + "epoch": 0.4768955001513141, + "grad_norm": 0.42751585424531685, + "learning_rate": 9.894479070815393e-05, + "loss": 3.3025, + "step": 10243 + }, + { + "epoch": 0.47694205833740716, + "grad_norm": 0.3947572997507521, + "learning_rate": 9.89442370790912e-05, + "loss": 3.1669, + "step": 10244 + }, + { + "epoch": 0.4769886165235002, + "grad_norm": 0.3514888845670702, + 
"learning_rate": 9.894368330638197e-05, + "loss": 3.2075, + "step": 10245 + }, + { + "epoch": 0.47703517470959333, + "grad_norm": 0.341922434169332, + "learning_rate": 9.894312939002783e-05, + "loss": 3.2615, + "step": 10246 + }, + { + "epoch": 0.4770817328956864, + "grad_norm": 0.3440724646382473, + "learning_rate": 9.894257533003045e-05, + "loss": 3.1818, + "step": 10247 + }, + { + "epoch": 0.47712829108177945, + "grad_norm": 0.38903747225956814, + "learning_rate": 9.894202112639141e-05, + "loss": 3.2251, + "step": 10248 + }, + { + "epoch": 0.4771748492678725, + "grad_norm": 0.40353291770194655, + "learning_rate": 9.894146677911238e-05, + "loss": 3.2722, + "step": 10249 + }, + { + "epoch": 0.4772214074539656, + "grad_norm": 0.3531499491342512, + "learning_rate": 9.894091228819495e-05, + "loss": 3.3278, + "step": 10250 + }, + { + "epoch": 0.4772679656400587, + "grad_norm": 0.4157193039636028, + "learning_rate": 9.894035765364076e-05, + "loss": 3.2053, + "step": 10251 + }, + { + "epoch": 0.47731452382615175, + "grad_norm": 0.39811676370383353, + "learning_rate": 9.893980287545146e-05, + "loss": 3.2247, + "step": 10252 + }, + { + "epoch": 0.4773610820122448, + "grad_norm": 0.3643126821253059, + "learning_rate": 9.893924795362865e-05, + "loss": 3.189, + "step": 10253 + }, + { + "epoch": 0.47740764019833787, + "grad_norm": 0.43037377115885744, + "learning_rate": 9.893869288817397e-05, + "loss": 3.2096, + "step": 10254 + }, + { + "epoch": 0.47745419838443093, + "grad_norm": 0.3759015586968261, + "learning_rate": 9.893813767908904e-05, + "loss": 3.1734, + "step": 10255 + }, + { + "epoch": 0.477500756570524, + "grad_norm": 0.3893125056778352, + "learning_rate": 9.89375823263755e-05, + "loss": 3.2603, + "step": 10256 + }, + { + "epoch": 0.4775473147566171, + "grad_norm": 0.3473489900897055, + "learning_rate": 9.8937026830035e-05, + "loss": 3.2584, + "step": 10257 + }, + { + "epoch": 0.47759387294271016, + "grad_norm": 0.37249217827425385, + "learning_rate": 9.893647119006913e-05, + "loss": 3.1487, + "step": 10258 + }, + { + "epoch": 0.4776404311288032, + "grad_norm": 0.3746984759016671, + "learning_rate": 9.893591540647953e-05, + "loss": 3.2602, + "step": 10259 + }, + { + "epoch": 0.4776869893148963, + "grad_norm": 0.4101291075835673, + "learning_rate": 9.893535947926786e-05, + "loss": 3.1838, + "step": 10260 + }, + { + "epoch": 0.47773354750098934, + "grad_norm": 0.3927020388600034, + "learning_rate": 9.893480340843571e-05, + "loss": 3.1285, + "step": 10261 + }, + { + "epoch": 0.47778010568708246, + "grad_norm": 0.38538179432045133, + "learning_rate": 9.893424719398474e-05, + "loss": 3.2242, + "step": 10262 + }, + { + "epoch": 0.4778266638731755, + "grad_norm": 0.3760563217097666, + "learning_rate": 9.893369083591657e-05, + "loss": 3.1553, + "step": 10263 + }, + { + "epoch": 0.4778732220592686, + "grad_norm": 0.36790503637676325, + "learning_rate": 9.893313433423286e-05, + "loss": 3.2693, + "step": 10264 + }, + { + "epoch": 0.47791978024536164, + "grad_norm": 0.38545040473371084, + "learning_rate": 9.89325776889352e-05, + "loss": 3.212, + "step": 10265 + }, + { + "epoch": 0.4779663384314547, + "grad_norm": 0.353142818650653, + "learning_rate": 9.893202090002525e-05, + "loss": 3.161, + "step": 10266 + }, + { + "epoch": 0.47801289661754776, + "grad_norm": 0.3676430387028859, + "learning_rate": 9.893146396750464e-05, + "loss": 3.1556, + "step": 10267 + }, + { + "epoch": 0.47805945480364087, + "grad_norm": 0.40899729831009163, + "learning_rate": 9.8930906891375e-05, + "loss": 3.2643, + "step": 10268 
+ }, + { + "epoch": 0.47810601298973393, + "grad_norm": 0.44404694173763, + "learning_rate": 9.893034967163796e-05, + "loss": 3.2572, + "step": 10269 + }, + { + "epoch": 0.478152571175827, + "grad_norm": 0.3920484242830656, + "learning_rate": 9.892979230829519e-05, + "loss": 3.1242, + "step": 10270 + }, + { + "epoch": 0.47819912936192005, + "grad_norm": 0.3809380330249327, + "learning_rate": 9.892923480134829e-05, + "loss": 3.1824, + "step": 10271 + }, + { + "epoch": 0.4782456875480131, + "grad_norm": 0.4216699667497956, + "learning_rate": 9.892867715079889e-05, + "loss": 3.3274, + "step": 10272 + }, + { + "epoch": 0.4782922457341062, + "grad_norm": 0.41876294987742846, + "learning_rate": 9.892811935664867e-05, + "loss": 3.244, + "step": 10273 + }, + { + "epoch": 0.4783388039201993, + "grad_norm": 0.42090653799130184, + "learning_rate": 9.892756141889921e-05, + "loss": 3.2543, + "step": 10274 + }, + { + "epoch": 0.47838536210629234, + "grad_norm": 0.37602115357237625, + "learning_rate": 9.892700333755221e-05, + "loss": 3.2775, + "step": 10275 + }, + { + "epoch": 0.4784319202923854, + "grad_norm": 0.4103480762115658, + "learning_rate": 9.892644511260924e-05, + "loss": 3.2008, + "step": 10276 + }, + { + "epoch": 0.47847847847847846, + "grad_norm": 0.4542242986422285, + "learning_rate": 9.892588674407199e-05, + "loss": 3.1646, + "step": 10277 + }, + { + "epoch": 0.4785250366645715, + "grad_norm": 0.4419554383845503, + "learning_rate": 9.892532823194209e-05, + "loss": 3.2709, + "step": 10278 + }, + { + "epoch": 0.47857159485066464, + "grad_norm": 0.3879769642644732, + "learning_rate": 9.892476957622117e-05, + "loss": 3.2203, + "step": 10279 + }, + { + "epoch": 0.4786181530367577, + "grad_norm": 0.44498991189419174, + "learning_rate": 9.892421077691088e-05, + "loss": 3.1696, + "step": 10280 + }, + { + "epoch": 0.47866471122285076, + "grad_norm": 0.4662336212802949, + "learning_rate": 9.892365183401282e-05, + "loss": 3.3566, + "step": 10281 + }, + { + "epoch": 0.4787112694089438, + "grad_norm": 0.38264254177575135, + "learning_rate": 9.89230927475287e-05, + "loss": 3.1675, + "step": 10282 + }, + { + "epoch": 0.4787578275950369, + "grad_norm": 0.43566386102767374, + "learning_rate": 9.892253351746008e-05, + "loss": 3.1962, + "step": 10283 + }, + { + "epoch": 0.47880438578113, + "grad_norm": 0.4155215682524464, + "learning_rate": 9.892197414380868e-05, + "loss": 3.2515, + "step": 10284 + }, + { + "epoch": 0.47885094396722305, + "grad_norm": 0.3846737021479184, + "learning_rate": 9.89214146265761e-05, + "loss": 3.1792, + "step": 10285 + }, + { + "epoch": 0.4788975021533161, + "grad_norm": 0.3948332867842575, + "learning_rate": 9.892085496576397e-05, + "loss": 3.1468, + "step": 10286 + }, + { + "epoch": 0.47894406033940917, + "grad_norm": 0.3969552161882065, + "learning_rate": 9.892029516137396e-05, + "loss": 3.2791, + "step": 10287 + }, + { + "epoch": 0.47899061852550223, + "grad_norm": 0.33370597624915216, + "learning_rate": 9.89197352134077e-05, + "loss": 3.1216, + "step": 10288 + }, + { + "epoch": 0.4790371767115953, + "grad_norm": 0.41945233426934647, + "learning_rate": 9.891917512186683e-05, + "loss": 3.3032, + "step": 10289 + }, + { + "epoch": 0.4790837348976884, + "grad_norm": 0.4571794311452689, + "learning_rate": 9.891861488675302e-05, + "loss": 3.2407, + "step": 10290 + }, + { + "epoch": 0.47913029308378147, + "grad_norm": 0.3736263250567876, + "learning_rate": 9.891805450806788e-05, + "loss": 3.2781, + "step": 10291 + }, + { + "epoch": 0.4791768512698745, + "grad_norm": 
0.4214036453655797, + "learning_rate": 9.891749398581307e-05, + "loss": 3.2798, + "step": 10292 + }, + { + "epoch": 0.4792234094559676, + "grad_norm": 0.4092608555685708, + "learning_rate": 9.891693331999024e-05, + "loss": 3.2107, + "step": 10293 + }, + { + "epoch": 0.47926996764206065, + "grad_norm": 0.41136276497778107, + "learning_rate": 9.891637251060103e-05, + "loss": 3.0859, + "step": 10294 + }, + { + "epoch": 0.47931652582815376, + "grad_norm": 0.5012847300069259, + "learning_rate": 9.891581155764709e-05, + "loss": 3.1991, + "step": 10295 + }, + { + "epoch": 0.4793630840142468, + "grad_norm": 0.5008930157996413, + "learning_rate": 9.891525046113005e-05, + "loss": 3.2236, + "step": 10296 + }, + { + "epoch": 0.4794096422003399, + "grad_norm": 0.41367819810122064, + "learning_rate": 9.891468922105158e-05, + "loss": 3.2048, + "step": 10297 + }, + { + "epoch": 0.47945620038643294, + "grad_norm": 0.4192224804969298, + "learning_rate": 9.89141278374133e-05, + "loss": 3.1946, + "step": 10298 + }, + { + "epoch": 0.479502758572526, + "grad_norm": 0.40768557274481415, + "learning_rate": 9.89135663102169e-05, + "loss": 3.2227, + "step": 10299 + }, + { + "epoch": 0.47954931675861906, + "grad_norm": 0.39891800647084763, + "learning_rate": 9.891300463946398e-05, + "loss": 3.23, + "step": 10300 + }, + { + "epoch": 0.4795958749447122, + "grad_norm": 0.39259898696971424, + "learning_rate": 9.891244282515621e-05, + "loss": 3.113, + "step": 10301 + }, + { + "epoch": 0.47964243313080523, + "grad_norm": 0.40700857894846154, + "learning_rate": 9.891188086729523e-05, + "loss": 3.2062, + "step": 10302 + }, + { + "epoch": 0.4796889913168983, + "grad_norm": 0.37509351051769513, + "learning_rate": 9.891131876588274e-05, + "loss": 3.1001, + "step": 10303 + }, + { + "epoch": 0.47973554950299135, + "grad_norm": 0.37021996179149025, + "learning_rate": 9.89107565209203e-05, + "loss": 3.2459, + "step": 10304 + }, + { + "epoch": 0.4797821076890844, + "grad_norm": 0.38835093581453267, + "learning_rate": 9.891019413240962e-05, + "loss": 3.1744, + "step": 10305 + }, + { + "epoch": 0.47982866587517753, + "grad_norm": 0.37745984505274005, + "learning_rate": 9.890963160035237e-05, + "loss": 3.1062, + "step": 10306 + }, + { + "epoch": 0.4798752240612706, + "grad_norm": 0.40919759634515657, + "learning_rate": 9.890906892475014e-05, + "loss": 3.1925, + "step": 10307 + }, + { + "epoch": 0.47992178224736365, + "grad_norm": 0.4078661766951978, + "learning_rate": 9.890850610560463e-05, + "loss": 3.1555, + "step": 10308 + }, + { + "epoch": 0.4799683404334567, + "grad_norm": 0.3415742830717618, + "learning_rate": 9.890794314291745e-05, + "loss": 3.2821, + "step": 10309 + }, + { + "epoch": 0.48001489861954977, + "grad_norm": 0.5357105056208151, + "learning_rate": 9.890738003669029e-05, + "loss": 3.1967, + "step": 10310 + }, + { + "epoch": 0.4800614568056428, + "grad_norm": 0.4769311183964055, + "learning_rate": 9.890681678692479e-05, + "loss": 3.2641, + "step": 10311 + }, + { + "epoch": 0.48010801499173594, + "grad_norm": 0.433315451126998, + "learning_rate": 9.890625339362258e-05, + "loss": 3.2668, + "step": 10312 + }, + { + "epoch": 0.480154573177829, + "grad_norm": 0.39089869492762863, + "learning_rate": 9.890568985678536e-05, + "loss": 3.1991, + "step": 10313 + }, + { + "epoch": 0.48020113136392206, + "grad_norm": 0.43165810546380895, + "learning_rate": 9.890512617641474e-05, + "loss": 3.1913, + "step": 10314 + }, + { + "epoch": 0.4802476895500151, + "grad_norm": 0.4601844019126026, + "learning_rate": 9.89045623525124e-05, + 
"loss": 3.1433, + "step": 10315 + }, + { + "epoch": 0.4802942477361082, + "grad_norm": 0.44131655257674174, + "learning_rate": 9.890399838508e-05, + "loss": 3.2273, + "step": 10316 + }, + { + "epoch": 0.4803408059222013, + "grad_norm": 0.40703463271242046, + "learning_rate": 9.890343427411917e-05, + "loss": 3.1689, + "step": 10317 + }, + { + "epoch": 0.48038736410829436, + "grad_norm": 0.4316547605725937, + "learning_rate": 9.890287001963157e-05, + "loss": 3.1968, + "step": 10318 + }, + { + "epoch": 0.4804339222943874, + "grad_norm": 0.40707856553178595, + "learning_rate": 9.890230562161887e-05, + "loss": 3.2085, + "step": 10319 + }, + { + "epoch": 0.4804804804804805, + "grad_norm": 0.4565478924082091, + "learning_rate": 9.890174108008272e-05, + "loss": 3.0842, + "step": 10320 + }, + { + "epoch": 0.48052703866657354, + "grad_norm": 0.4288901673876613, + "learning_rate": 9.890117639502479e-05, + "loss": 3.1133, + "step": 10321 + }, + { + "epoch": 0.4805735968526666, + "grad_norm": 0.4320764135772556, + "learning_rate": 9.890061156644671e-05, + "loss": 3.1714, + "step": 10322 + }, + { + "epoch": 0.4806201550387597, + "grad_norm": 0.4303952620565619, + "learning_rate": 9.890004659435016e-05, + "loss": 3.3265, + "step": 10323 + }, + { + "epoch": 0.48066671322485277, + "grad_norm": 0.47369156897230624, + "learning_rate": 9.889948147873679e-05, + "loss": 3.194, + "step": 10324 + }, + { + "epoch": 0.48071327141094583, + "grad_norm": 0.39864456734969966, + "learning_rate": 9.889891621960825e-05, + "loss": 3.2812, + "step": 10325 + }, + { + "epoch": 0.4807598295970389, + "grad_norm": 0.3894165642858902, + "learning_rate": 9.889835081696622e-05, + "loss": 3.3103, + "step": 10326 + }, + { + "epoch": 0.48080638778313195, + "grad_norm": 0.4303992766964519, + "learning_rate": 9.889778527081235e-05, + "loss": 3.1961, + "step": 10327 + }, + { + "epoch": 0.48085294596922507, + "grad_norm": 0.4003795811800079, + "learning_rate": 9.889721958114829e-05, + "loss": 3.1271, + "step": 10328 + }, + { + "epoch": 0.4808995041553181, + "grad_norm": 0.36409366656540043, + "learning_rate": 9.889665374797571e-05, + "loss": 3.1965, + "step": 10329 + }, + { + "epoch": 0.4809460623414112, + "grad_norm": 0.39433004652921005, + "learning_rate": 9.889608777129626e-05, + "loss": 3.1328, + "step": 10330 + }, + { + "epoch": 0.48099262052750424, + "grad_norm": 0.3766027640197866, + "learning_rate": 9.889552165111162e-05, + "loss": 3.193, + "step": 10331 + }, + { + "epoch": 0.4810391787135973, + "grad_norm": 0.41156481097099157, + "learning_rate": 9.889495538742344e-05, + "loss": 3.3026, + "step": 10332 + }, + { + "epoch": 0.48108573689969036, + "grad_norm": 0.43820085186029645, + "learning_rate": 9.889438898023338e-05, + "loss": 3.2307, + "step": 10333 + }, + { + "epoch": 0.4811322950857835, + "grad_norm": 0.38852360989795764, + "learning_rate": 9.88938224295431e-05, + "loss": 3.2356, + "step": 10334 + }, + { + "epoch": 0.48117885327187654, + "grad_norm": 0.4046426134367175, + "learning_rate": 9.889325573535428e-05, + "loss": 3.2668, + "step": 10335 + }, + { + "epoch": 0.4812254114579696, + "grad_norm": 0.3867497760081628, + "learning_rate": 9.889268889766857e-05, + "loss": 3.2003, + "step": 10336 + }, + { + "epoch": 0.48127196964406266, + "grad_norm": 0.36825464827773563, + "learning_rate": 9.889212191648762e-05, + "loss": 3.104, + "step": 10337 + }, + { + "epoch": 0.4813185278301557, + "grad_norm": 0.40870862999469915, + "learning_rate": 9.889155479181313e-05, + "loss": 3.1701, + "step": 10338 + }, + { + "epoch": 
0.48136508601624883, + "grad_norm": 0.34071799175601725, + "learning_rate": 9.889098752364674e-05, + "loss": 3.2045, + "step": 10339 + }, + { + "epoch": 0.4814116442023419, + "grad_norm": 0.3968646617862045, + "learning_rate": 9.88904201119901e-05, + "loss": 3.1576, + "step": 10340 + }, + { + "epoch": 0.48145820238843495, + "grad_norm": 0.3722507765125643, + "learning_rate": 9.888985255684491e-05, + "loss": 3.3003, + "step": 10341 + }, + { + "epoch": 0.481504760574528, + "grad_norm": 0.42002757299802507, + "learning_rate": 9.88892848582128e-05, + "loss": 3.216, + "step": 10342 + }, + { + "epoch": 0.48155131876062107, + "grad_norm": 0.4429054486357073, + "learning_rate": 9.888871701609548e-05, + "loss": 3.2226, + "step": 10343 + }, + { + "epoch": 0.48159787694671413, + "grad_norm": 0.43147574857839577, + "learning_rate": 9.888814903049458e-05, + "loss": 3.162, + "step": 10344 + }, + { + "epoch": 0.48164443513280725, + "grad_norm": 0.41543911190139826, + "learning_rate": 9.888758090141177e-05, + "loss": 3.2002, + "step": 10345 + }, + { + "epoch": 0.4816909933189003, + "grad_norm": 0.3476833701838013, + "learning_rate": 9.888701262884874e-05, + "loss": 3.146, + "step": 10346 + }, + { + "epoch": 0.48173755150499337, + "grad_norm": 0.4400315290414596, + "learning_rate": 9.888644421280715e-05, + "loss": 3.1191, + "step": 10347 + }, + { + "epoch": 0.4817841096910864, + "grad_norm": 0.40183260881863236, + "learning_rate": 9.888587565328863e-05, + "loss": 3.2417, + "step": 10348 + }, + { + "epoch": 0.4818306678771795, + "grad_norm": 0.365519810524522, + "learning_rate": 9.888530695029492e-05, + "loss": 3.2719, + "step": 10349 + }, + { + "epoch": 0.4818772260632726, + "grad_norm": 0.3664117671693989, + "learning_rate": 9.888473810382763e-05, + "loss": 3.1096, + "step": 10350 + }, + { + "epoch": 0.48192378424936566, + "grad_norm": 0.3727124461190506, + "learning_rate": 9.888416911388844e-05, + "loss": 3.1406, + "step": 10351 + }, + { + "epoch": 0.4819703424354587, + "grad_norm": 0.3956695602210911, + "learning_rate": 9.888359998047906e-05, + "loss": 3.1641, + "step": 10352 + }, + { + "epoch": 0.4820169006215518, + "grad_norm": 0.3924380387058798, + "learning_rate": 9.888303070360111e-05, + "loss": 3.2411, + "step": 10353 + }, + { + "epoch": 0.48206345880764484, + "grad_norm": 0.4139582456370101, + "learning_rate": 9.88824612832563e-05, + "loss": 3.1818, + "step": 10354 + }, + { + "epoch": 0.4821100169937379, + "grad_norm": 0.44019735185694847, + "learning_rate": 9.888189171944624e-05, + "loss": 3.253, + "step": 10355 + }, + { + "epoch": 0.482156575179831, + "grad_norm": 0.4143192823563663, + "learning_rate": 9.888132201217268e-05, + "loss": 3.0686, + "step": 10356 + }, + { + "epoch": 0.4822031333659241, + "grad_norm": 0.43204764155586894, + "learning_rate": 9.888075216143723e-05, + "loss": 3.2679, + "step": 10357 + }, + { + "epoch": 0.48224969155201713, + "grad_norm": 0.4108105670766935, + "learning_rate": 9.888018216724162e-05, + "loss": 3.2114, + "step": 10358 + }, + { + "epoch": 0.4822962497381102, + "grad_norm": 0.40001501003590756, + "learning_rate": 9.887961202958746e-05, + "loss": 3.0999, + "step": 10359 + }, + { + "epoch": 0.48234280792420325, + "grad_norm": 0.4620430983604079, + "learning_rate": 9.887904174847646e-05, + "loss": 3.1449, + "step": 10360 + }, + { + "epoch": 0.48238936611029637, + "grad_norm": 0.4150747023220217, + "learning_rate": 9.88784713239103e-05, + "loss": 3.108, + "step": 10361 + }, + { + "epoch": 0.48243592429638943, + "grad_norm": 0.4193425544581079, + "learning_rate": 
9.887790075589063e-05, + "loss": 3.1536, + "step": 10362 + }, + { + "epoch": 0.4824824824824825, + "grad_norm": 0.503307811513644, + "learning_rate": 9.887733004441914e-05, + "loss": 3.1994, + "step": 10363 + }, + { + "epoch": 0.48252904066857555, + "grad_norm": 0.49972828723873514, + "learning_rate": 9.887675918949753e-05, + "loss": 3.2316, + "step": 10364 + }, + { + "epoch": 0.4825755988546686, + "grad_norm": 0.3930211747572092, + "learning_rate": 9.88761881911274e-05, + "loss": 3.2595, + "step": 10365 + }, + { + "epoch": 0.48262215704076167, + "grad_norm": 0.4127385725197127, + "learning_rate": 9.88756170493105e-05, + "loss": 3.2528, + "step": 10366 + }, + { + "epoch": 0.4826687152268548, + "grad_norm": 0.3913886560938764, + "learning_rate": 9.887504576404848e-05, + "loss": 3.2394, + "step": 10367 + }, + { + "epoch": 0.48271527341294784, + "grad_norm": 0.3812869587472048, + "learning_rate": 9.8874474335343e-05, + "loss": 3.2301, + "step": 10368 + }, + { + "epoch": 0.4827618315990409, + "grad_norm": 0.3820767884182132, + "learning_rate": 9.887390276319577e-05, + "loss": 3.1429, + "step": 10369 + }, + { + "epoch": 0.48280838978513396, + "grad_norm": 0.3768556873791153, + "learning_rate": 9.887333104760845e-05, + "loss": 3.28, + "step": 10370 + }, + { + "epoch": 0.482854947971227, + "grad_norm": 0.4117475065383622, + "learning_rate": 9.88727591885827e-05, + "loss": 3.2735, + "step": 10371 + }, + { + "epoch": 0.48290150615732014, + "grad_norm": 0.39983193298525127, + "learning_rate": 9.887218718612024e-05, + "loss": 3.2247, + "step": 10372 + }, + { + "epoch": 0.4829480643434132, + "grad_norm": 0.4070150490554891, + "learning_rate": 9.887161504022271e-05, + "loss": 3.0962, + "step": 10373 + }, + { + "epoch": 0.48299462252950626, + "grad_norm": 0.37385219209528353, + "learning_rate": 9.887104275089181e-05, + "loss": 3.1943, + "step": 10374 + }, + { + "epoch": 0.4830411807155993, + "grad_norm": 0.4085555057944084, + "learning_rate": 9.887047031812919e-05, + "loss": 3.2114, + "step": 10375 + }, + { + "epoch": 0.4830877389016924, + "grad_norm": 0.38508018424906126, + "learning_rate": 9.886989774193658e-05, + "loss": 3.2292, + "step": 10376 + }, + { + "epoch": 0.48313429708778544, + "grad_norm": 0.36696367429948057, + "learning_rate": 9.886932502231563e-05, + "loss": 3.2651, + "step": 10377 + }, + { + "epoch": 0.48318085527387855, + "grad_norm": 0.4006517213085457, + "learning_rate": 9.886875215926803e-05, + "loss": 3.0891, + "step": 10378 + }, + { + "epoch": 0.4832274134599716, + "grad_norm": 0.4111075717701812, + "learning_rate": 9.886817915279545e-05, + "loss": 3.1788, + "step": 10379 + }, + { + "epoch": 0.48327397164606467, + "grad_norm": 0.3804611728124809, + "learning_rate": 9.886760600289958e-05, + "loss": 3.2401, + "step": 10380 + }, + { + "epoch": 0.48332052983215773, + "grad_norm": 0.4291350960875753, + "learning_rate": 9.886703270958211e-05, + "loss": 3.2541, + "step": 10381 + }, + { + "epoch": 0.4833670880182508, + "grad_norm": 0.3747336114468116, + "learning_rate": 9.88664592728447e-05, + "loss": 3.1766, + "step": 10382 + }, + { + "epoch": 0.4834136462043439, + "grad_norm": 0.42532559765228706, + "learning_rate": 9.886588569268904e-05, + "loss": 3.3086, + "step": 10383 + }, + { + "epoch": 0.48346020439043697, + "grad_norm": 0.43154335039315556, + "learning_rate": 9.886531196911684e-05, + "loss": 3.2858, + "step": 10384 + }, + { + "epoch": 0.48350676257653, + "grad_norm": 0.4175452761313252, + "learning_rate": 9.886473810212974e-05, + "loss": 3.274, + "step": 10385 + }, + { + "epoch": 
0.4835533207626231, + "grad_norm": 0.45650332722253256, + "learning_rate": 9.886416409172947e-05, + "loss": 3.2127, + "step": 10386 + }, + { + "epoch": 0.48359987894871614, + "grad_norm": 0.46942838195197056, + "learning_rate": 9.88635899379177e-05, + "loss": 3.1743, + "step": 10387 + }, + { + "epoch": 0.4836464371348092, + "grad_norm": 0.4789620855829812, + "learning_rate": 9.886301564069607e-05, + "loss": 3.2055, + "step": 10388 + }, + { + "epoch": 0.4836929953209023, + "grad_norm": 0.4056263494945694, + "learning_rate": 9.886244120006634e-05, + "loss": 3.1996, + "step": 10389 + }, + { + "epoch": 0.4837395535069954, + "grad_norm": 0.38863259055076865, + "learning_rate": 9.886186661603013e-05, + "loss": 3.1818, + "step": 10390 + }, + { + "epoch": 0.48378611169308844, + "grad_norm": 0.4523645443872085, + "learning_rate": 9.886129188858917e-05, + "loss": 3.2795, + "step": 10391 + }, + { + "epoch": 0.4838326698791815, + "grad_norm": 0.47330354723024043, + "learning_rate": 9.886071701774513e-05, + "loss": 3.1875, + "step": 10392 + }, + { + "epoch": 0.48387922806527456, + "grad_norm": 0.376448176056743, + "learning_rate": 9.886014200349968e-05, + "loss": 3.2134, + "step": 10393 + }, + { + "epoch": 0.4839257862513677, + "grad_norm": 0.3913097430001639, + "learning_rate": 9.885956684585455e-05, + "loss": 3.2603, + "step": 10394 + }, + { + "epoch": 0.48397234443746073, + "grad_norm": 0.41256757725453136, + "learning_rate": 9.885899154481142e-05, + "loss": 3.1281, + "step": 10395 + }, + { + "epoch": 0.4840189026235538, + "grad_norm": 0.3604945275177084, + "learning_rate": 9.885841610037194e-05, + "loss": 3.2357, + "step": 10396 + }, + { + "epoch": 0.48406546080964685, + "grad_norm": 0.421865153532898, + "learning_rate": 9.885784051253783e-05, + "loss": 3.2746, + "step": 10397 + }, + { + "epoch": 0.4841120189957399, + "grad_norm": 0.4213231426730724, + "learning_rate": 9.885726478131076e-05, + "loss": 3.1319, + "step": 10398 + }, + { + "epoch": 0.48415857718183297, + "grad_norm": 0.3801905696834721, + "learning_rate": 9.885668890669244e-05, + "loss": 3.2099, + "step": 10399 + }, + { + "epoch": 0.4842051353679261, + "grad_norm": 0.48392268735781974, + "learning_rate": 9.885611288868456e-05, + "loss": 3.2528, + "step": 10400 + }, + { + "epoch": 0.48425169355401915, + "grad_norm": 0.40828495366420114, + "learning_rate": 9.885553672728878e-05, + "loss": 3.175, + "step": 10401 + }, + { + "epoch": 0.4842982517401122, + "grad_norm": 0.40787214672092775, + "learning_rate": 9.885496042250681e-05, + "loss": 3.212, + "step": 10402 + }, + { + "epoch": 0.48434480992620527, + "grad_norm": 0.4109943414213801, + "learning_rate": 9.885438397434037e-05, + "loss": 3.2666, + "step": 10403 + }, + { + "epoch": 0.4843913681122983, + "grad_norm": 0.4032662146627754, + "learning_rate": 9.885380738279111e-05, + "loss": 3.1584, + "step": 10404 + }, + { + "epoch": 0.48443792629839144, + "grad_norm": 0.38419485245558055, + "learning_rate": 9.885323064786074e-05, + "loss": 3.0881, + "step": 10405 + }, + { + "epoch": 0.4844844844844845, + "grad_norm": 0.3862562155143761, + "learning_rate": 9.885265376955096e-05, + "loss": 3.103, + "step": 10406 + }, + { + "epoch": 0.48453104267057756, + "grad_norm": 0.37838485982678793, + "learning_rate": 9.885207674786346e-05, + "loss": 3.2062, + "step": 10407 + }, + { + "epoch": 0.4845776008566706, + "grad_norm": 0.41806845578582574, + "learning_rate": 9.88514995827999e-05, + "loss": 3.2279, + "step": 10408 + }, + { + "epoch": 0.4846241590427637, + "grad_norm": 0.39229861815127304, + 
"learning_rate": 9.885092227436201e-05, + "loss": 3.1576, + "step": 10409 + }, + { + "epoch": 0.48467071722885674, + "grad_norm": 0.4338339261013301, + "learning_rate": 9.885034482255148e-05, + "loss": 3.2529, + "step": 10410 + }, + { + "epoch": 0.48471727541494986, + "grad_norm": 0.4739784853361764, + "learning_rate": 9.884976722737e-05, + "loss": 3.2387, + "step": 10411 + }, + { + "epoch": 0.4847638336010429, + "grad_norm": 0.41045467069621283, + "learning_rate": 9.884918948881928e-05, + "loss": 3.237, + "step": 10412 + }, + { + "epoch": 0.484810391787136, + "grad_norm": 0.4496580731860897, + "learning_rate": 9.884861160690099e-05, + "loss": 3.183, + "step": 10413 + }, + { + "epoch": 0.48485694997322903, + "grad_norm": 0.41138875885062115, + "learning_rate": 9.884803358161683e-05, + "loss": 3.1742, + "step": 10414 + }, + { + "epoch": 0.4849035081593221, + "grad_norm": 0.39006247402419675, + "learning_rate": 9.884745541296851e-05, + "loss": 3.2827, + "step": 10415 + }, + { + "epoch": 0.4849500663454152, + "grad_norm": 0.4609803637614663, + "learning_rate": 9.884687710095772e-05, + "loss": 3.1911, + "step": 10416 + }, + { + "epoch": 0.48499662453150827, + "grad_norm": 0.40703958688285324, + "learning_rate": 9.884629864558616e-05, + "loss": 3.2162, + "step": 10417 + }, + { + "epoch": 0.48504318271760133, + "grad_norm": 0.4238596206101609, + "learning_rate": 9.884572004685553e-05, + "loss": 3.1944, + "step": 10418 + }, + { + "epoch": 0.4850897409036944, + "grad_norm": 0.5037150716426702, + "learning_rate": 9.884514130476752e-05, + "loss": 3.2067, + "step": 10419 + }, + { + "epoch": 0.48513629908978745, + "grad_norm": 0.4353251366202995, + "learning_rate": 9.884456241932383e-05, + "loss": 3.2571, + "step": 10420 + }, + { + "epoch": 0.4851828572758805, + "grad_norm": 0.4133869475015328, + "learning_rate": 9.884398339052616e-05, + "loss": 3.1243, + "step": 10421 + }, + { + "epoch": 0.4852294154619736, + "grad_norm": 0.39926716337845986, + "learning_rate": 9.884340421837621e-05, + "loss": 3.2384, + "step": 10422 + }, + { + "epoch": 0.4852759736480667, + "grad_norm": 0.42942201453560025, + "learning_rate": 9.884282490287569e-05, + "loss": 3.267, + "step": 10423 + }, + { + "epoch": 0.48532253183415974, + "grad_norm": 0.369316737581564, + "learning_rate": 9.884224544402627e-05, + "loss": 3.1555, + "step": 10424 + }, + { + "epoch": 0.4853690900202528, + "grad_norm": 0.46388516021651605, + "learning_rate": 9.88416658418297e-05, + "loss": 3.2137, + "step": 10425 + }, + { + "epoch": 0.48541564820634586, + "grad_norm": 0.4104719311427798, + "learning_rate": 9.884108609628764e-05, + "loss": 3.1625, + "step": 10426 + }, + { + "epoch": 0.485462206392439, + "grad_norm": 0.421438964020223, + "learning_rate": 9.88405062074018e-05, + "loss": 3.1578, + "step": 10427 + }, + { + "epoch": 0.48550876457853204, + "grad_norm": 0.4714136150883279, + "learning_rate": 9.883992617517387e-05, + "loss": 3.1176, + "step": 10428 + }, + { + "epoch": 0.4855553227646251, + "grad_norm": 0.4345065089269439, + "learning_rate": 9.88393459996056e-05, + "loss": 3.2445, + "step": 10429 + }, + { + "epoch": 0.48560188095071816, + "grad_norm": 0.4118592961074114, + "learning_rate": 9.883876568069865e-05, + "loss": 3.1153, + "step": 10430 + }, + { + "epoch": 0.4856484391368112, + "grad_norm": 0.3995604617037415, + "learning_rate": 9.883818521845473e-05, + "loss": 3.2322, + "step": 10431 + }, + { + "epoch": 0.4856949973229043, + "grad_norm": 0.39224230267365695, + "learning_rate": 9.883760461287554e-05, + "loss": 3.1417, + "step": 10432 + 
}, + { + "epoch": 0.4857415555089974, + "grad_norm": 0.45447691823452885, + "learning_rate": 9.883702386396281e-05, + "loss": 3.2211, + "step": 10433 + }, + { + "epoch": 0.48578811369509045, + "grad_norm": 0.44603510927873335, + "learning_rate": 9.883644297171821e-05, + "loss": 3.227, + "step": 10434 + }, + { + "epoch": 0.4858346718811835, + "grad_norm": 0.39323021902479083, + "learning_rate": 9.883586193614347e-05, + "loss": 3.114, + "step": 10435 + }, + { + "epoch": 0.48588123006727657, + "grad_norm": 0.3889321577369602, + "learning_rate": 9.883528075724029e-05, + "loss": 3.1568, + "step": 10436 + }, + { + "epoch": 0.48592778825336963, + "grad_norm": 0.40093619319049306, + "learning_rate": 9.883469943501035e-05, + "loss": 3.1904, + "step": 10437 + }, + { + "epoch": 0.48597434643946275, + "grad_norm": 0.3991161822446571, + "learning_rate": 9.883411796945541e-05, + "loss": 3.2188, + "step": 10438 + }, + { + "epoch": 0.4860209046255558, + "grad_norm": 0.38500691453872277, + "learning_rate": 9.883353636057712e-05, + "loss": 3.1728, + "step": 10439 + }, + { + "epoch": 0.48606746281164886, + "grad_norm": 0.37947479068929146, + "learning_rate": 9.883295460837723e-05, + "loss": 3.2706, + "step": 10440 + }, + { + "epoch": 0.4861140209977419, + "grad_norm": 0.39196683762919915, + "learning_rate": 9.883237271285741e-05, + "loss": 3.2796, + "step": 10441 + }, + { + "epoch": 0.486160579183835, + "grad_norm": 0.34450464202500963, + "learning_rate": 9.88317906740194e-05, + "loss": 3.2197, + "step": 10442 + }, + { + "epoch": 0.48620713736992804, + "grad_norm": 0.3592153782193545, + "learning_rate": 9.883120849186489e-05, + "loss": 3.2576, + "step": 10443 + }, + { + "epoch": 0.48625369555602116, + "grad_norm": 0.41013024455797764, + "learning_rate": 9.88306261663956e-05, + "loss": 3.2333, + "step": 10444 + }, + { + "epoch": 0.4863002537421142, + "grad_norm": 0.3430400009377042, + "learning_rate": 9.883004369761322e-05, + "loss": 3.0914, + "step": 10445 + }, + { + "epoch": 0.4863468119282073, + "grad_norm": 0.40607119617706633, + "learning_rate": 9.882946108551948e-05, + "loss": 3.1946, + "step": 10446 + }, + { + "epoch": 0.48639337011430034, + "grad_norm": 0.3610185542191481, + "learning_rate": 9.882887833011609e-05, + "loss": 3.0874, + "step": 10447 + }, + { + "epoch": 0.4864399283003934, + "grad_norm": 0.37450949638267417, + "learning_rate": 9.882829543140473e-05, + "loss": 3.1659, + "step": 10448 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 0.36385102778100686, + "learning_rate": 9.882771238938717e-05, + "loss": 3.1528, + "step": 10449 + }, + { + "epoch": 0.4865330446725796, + "grad_norm": 0.355354055786998, + "learning_rate": 9.882712920406504e-05, + "loss": 3.2797, + "step": 10450 + }, + { + "epoch": 0.48657960285867263, + "grad_norm": 0.3569760786233637, + "learning_rate": 9.882654587544014e-05, + "loss": 3.1466, + "step": 10451 + }, + { + "epoch": 0.4866261610447657, + "grad_norm": 0.43125047349452916, + "learning_rate": 9.882596240351412e-05, + "loss": 3.3249, + "step": 10452 + }, + { + "epoch": 0.48667271923085875, + "grad_norm": 0.4187056681365375, + "learning_rate": 9.88253787882887e-05, + "loss": 3.1771, + "step": 10453 + }, + { + "epoch": 0.4867192774169518, + "grad_norm": 0.3981866113903335, + "learning_rate": 9.88247950297656e-05, + "loss": 3.2458, + "step": 10454 + }, + { + "epoch": 0.4867658356030449, + "grad_norm": 0.39868904695834556, + "learning_rate": 9.882421112794655e-05, + "loss": 3.1292, + "step": 10455 + }, + { + "epoch": 0.486812393789138, + "grad_norm": 
0.37829923931698306, + "learning_rate": 9.882362708283326e-05, + "loss": 3.1363, + "step": 10456 + }, + { + "epoch": 0.48685895197523105, + "grad_norm": 0.3726001643927644, + "learning_rate": 9.88230428944274e-05, + "loss": 3.1786, + "step": 10457 + }, + { + "epoch": 0.4869055101613241, + "grad_norm": 0.376931390948406, + "learning_rate": 9.882245856273075e-05, + "loss": 3.1861, + "step": 10458 + }, + { + "epoch": 0.48695206834741717, + "grad_norm": 0.48658731423399204, + "learning_rate": 9.8821874087745e-05, + "loss": 3.1863, + "step": 10459 + }, + { + "epoch": 0.4869986265335103, + "grad_norm": 0.42403061220772675, + "learning_rate": 9.882128946947184e-05, + "loss": 3.1141, + "step": 10460 + }, + { + "epoch": 0.48704518471960334, + "grad_norm": 0.38388393014021555, + "learning_rate": 9.8820704707913e-05, + "loss": 3.162, + "step": 10461 + }, + { + "epoch": 0.4870917429056964, + "grad_norm": 0.3842362354360879, + "learning_rate": 9.882011980307022e-05, + "loss": 3.151, + "step": 10462 + }, + { + "epoch": 0.48713830109178946, + "grad_norm": 0.4031189626478562, + "learning_rate": 9.881953475494519e-05, + "loss": 3.2292, + "step": 10463 + }, + { + "epoch": 0.4871848592778825, + "grad_norm": 0.3636350550604819, + "learning_rate": 9.881894956353963e-05, + "loss": 3.1693, + "step": 10464 + }, + { + "epoch": 0.4872314174639756, + "grad_norm": 0.36942754217461327, + "learning_rate": 9.881836422885526e-05, + "loss": 3.2158, + "step": 10465 + }, + { + "epoch": 0.4872779756500687, + "grad_norm": 0.36591462296915445, + "learning_rate": 9.881777875089382e-05, + "loss": 3.1807, + "step": 10466 + }, + { + "epoch": 0.48732453383616176, + "grad_norm": 0.4149796159785871, + "learning_rate": 9.881719312965699e-05, + "loss": 3.2516, + "step": 10467 + }, + { + "epoch": 0.4873710920222548, + "grad_norm": 0.3499611073868981, + "learning_rate": 9.881660736514651e-05, + "loss": 3.1467, + "step": 10468 + }, + { + "epoch": 0.4874176502083479, + "grad_norm": 0.39440329541545543, + "learning_rate": 9.881602145736411e-05, + "loss": 3.1616, + "step": 10469 + }, + { + "epoch": 0.48746420839444093, + "grad_norm": 0.42624874068996066, + "learning_rate": 9.881543540631149e-05, + "loss": 3.2271, + "step": 10470 + }, + { + "epoch": 0.48751076658053405, + "grad_norm": 0.34789985058442097, + "learning_rate": 9.881484921199035e-05, + "loss": 3.2783, + "step": 10471 + }, + { + "epoch": 0.4875573247666271, + "grad_norm": 0.42767545852377303, + "learning_rate": 9.881426287440247e-05, + "loss": 3.2726, + "step": 10472 + }, + { + "epoch": 0.48760388295272017, + "grad_norm": 0.38706027737106535, + "learning_rate": 9.881367639354953e-05, + "loss": 3.2231, + "step": 10473 + }, + { + "epoch": 0.48765044113881323, + "grad_norm": 0.32497851453014087, + "learning_rate": 9.881308976943324e-05, + "loss": 3.2473, + "step": 10474 + }, + { + "epoch": 0.4876969993249063, + "grad_norm": 0.39702491635808645, + "learning_rate": 9.881250300205536e-05, + "loss": 3.309, + "step": 10475 + }, + { + "epoch": 0.48774355751099935, + "grad_norm": 0.35185089796355407, + "learning_rate": 9.88119160914176e-05, + "loss": 3.085, + "step": 10476 + }, + { + "epoch": 0.48779011569709246, + "grad_norm": 0.40147082019715064, + "learning_rate": 9.881132903752166e-05, + "loss": 3.2737, + "step": 10477 + }, + { + "epoch": 0.4878366738831855, + "grad_norm": 0.3913428137436373, + "learning_rate": 9.881074184036927e-05, + "loss": 3.1852, + "step": 10478 + }, + { + "epoch": 0.4878832320692786, + "grad_norm": 0.3457476894831935, + "learning_rate": 9.881015449996218e-05, + 
"loss": 3.2227, + "step": 10479 + }, + { + "epoch": 0.48792979025537164, + "grad_norm": 0.40058403384306523, + "learning_rate": 9.880956701630208e-05, + "loss": 3.2874, + "step": 10480 + }, + { + "epoch": 0.4879763484414647, + "grad_norm": 0.4066369009820125, + "learning_rate": 9.880897938939072e-05, + "loss": 3.2026, + "step": 10481 + }, + { + "epoch": 0.4880229066275578, + "grad_norm": 0.3342076856770245, + "learning_rate": 9.880839161922981e-05, + "loss": 3.1446, + "step": 10482 + }, + { + "epoch": 0.4880694648136509, + "grad_norm": 0.4253776124614509, + "learning_rate": 9.880780370582108e-05, + "loss": 3.2684, + "step": 10483 + }, + { + "epoch": 0.48811602299974394, + "grad_norm": 0.46605573322241034, + "learning_rate": 9.880721564916627e-05, + "loss": 3.1746, + "step": 10484 + }, + { + "epoch": 0.488162581185837, + "grad_norm": 0.43111290603736124, + "learning_rate": 9.880662744926707e-05, + "loss": 3.2614, + "step": 10485 + }, + { + "epoch": 0.48820913937193006, + "grad_norm": 0.4231371822971523, + "learning_rate": 9.880603910612523e-05, + "loss": 3.1688, + "step": 10486 + }, + { + "epoch": 0.4882556975580231, + "grad_norm": 0.35559377143644183, + "learning_rate": 9.880545061974246e-05, + "loss": 3.1409, + "step": 10487 + }, + { + "epoch": 0.48830225574411623, + "grad_norm": 0.42481519911509513, + "learning_rate": 9.880486199012053e-05, + "loss": 3.0964, + "step": 10488 + }, + { + "epoch": 0.4883488139302093, + "grad_norm": 0.3971953819304541, + "learning_rate": 9.880427321726111e-05, + "loss": 3.2207, + "step": 10489 + }, + { + "epoch": 0.48839537211630235, + "grad_norm": 0.4239016009810297, + "learning_rate": 9.880368430116598e-05, + "loss": 3.2352, + "step": 10490 + }, + { + "epoch": 0.4884419303023954, + "grad_norm": 0.3960770382934245, + "learning_rate": 9.880309524183683e-05, + "loss": 3.2136, + "step": 10491 + }, + { + "epoch": 0.48848848848848847, + "grad_norm": 0.4188441912517359, + "learning_rate": 9.88025060392754e-05, + "loss": 3.2266, + "step": 10492 + }, + { + "epoch": 0.4885350466745816, + "grad_norm": 0.35858241456361484, + "learning_rate": 9.880191669348344e-05, + "loss": 3.1826, + "step": 10493 + }, + { + "epoch": 0.48858160486067465, + "grad_norm": 0.35298751208034856, + "learning_rate": 9.880132720446264e-05, + "loss": 3.3341, + "step": 10494 + }, + { + "epoch": 0.4886281630467677, + "grad_norm": 0.41068717102182467, + "learning_rate": 9.880073757221477e-05, + "loss": 3.3224, + "step": 10495 + }, + { + "epoch": 0.48867472123286076, + "grad_norm": 0.4245528986316708, + "learning_rate": 9.880014779674154e-05, + "loss": 3.1711, + "step": 10496 + }, + { + "epoch": 0.4887212794189538, + "grad_norm": 0.3548809623544759, + "learning_rate": 9.879955787804469e-05, + "loss": 3.1853, + "step": 10497 + }, + { + "epoch": 0.4887678376050469, + "grad_norm": 0.3660226111810917, + "learning_rate": 9.879896781612593e-05, + "loss": 3.1673, + "step": 10498 + }, + { + "epoch": 0.48881439579114, + "grad_norm": 0.3798839808303191, + "learning_rate": 9.879837761098702e-05, + "loss": 3.2262, + "step": 10499 + }, + { + "epoch": 0.48886095397723306, + "grad_norm": 0.45904874185163014, + "learning_rate": 9.879778726262968e-05, + "loss": 3.3659, + "step": 10500 + }, + { + "epoch": 0.4889075121633261, + "grad_norm": 0.42532757361028123, + "learning_rate": 9.879719677105562e-05, + "loss": 3.2243, + "step": 10501 + }, + { + "epoch": 0.4889540703494192, + "grad_norm": 0.35965272052110686, + "learning_rate": 9.87966061362666e-05, + "loss": 3.1253, + "step": 10502 + }, + { + "epoch": 
0.48900062853551224, + "grad_norm": 0.42608435348787715, + "learning_rate": 9.879601535826438e-05, + "loss": 3.2113, + "step": 10503 + }, + { + "epoch": 0.48904718672160535, + "grad_norm": 0.3581198875214101, + "learning_rate": 9.879542443705063e-05, + "loss": 3.0248, + "step": 10504 + }, + { + "epoch": 0.4890937449076984, + "grad_norm": 0.4134171172452102, + "learning_rate": 9.879483337262712e-05, + "loss": 3.2491, + "step": 10505 + }, + { + "epoch": 0.4891403030937915, + "grad_norm": 0.39480856016799953, + "learning_rate": 9.879424216499559e-05, + "loss": 3.2378, + "step": 10506 + }, + { + "epoch": 0.48918686127988453, + "grad_norm": 0.3879010240644967, + "learning_rate": 9.879365081415776e-05, + "loss": 3.2624, + "step": 10507 + }, + { + "epoch": 0.4892334194659776, + "grad_norm": 0.38107980024586563, + "learning_rate": 9.879305932011538e-05, + "loss": 3.1684, + "step": 10508 + }, + { + "epoch": 0.48927997765207065, + "grad_norm": 0.35492527657931816, + "learning_rate": 9.879246768287019e-05, + "loss": 3.1818, + "step": 10509 + }, + { + "epoch": 0.48932653583816377, + "grad_norm": 0.35089886159885714, + "learning_rate": 9.879187590242388e-05, + "loss": 3.2048, + "step": 10510 + }, + { + "epoch": 0.4893730940242568, + "grad_norm": 0.3895642450585224, + "learning_rate": 9.879128397877823e-05, + "loss": 3.1563, + "step": 10511 + }, + { + "epoch": 0.4894196522103499, + "grad_norm": 0.4049162887442905, + "learning_rate": 9.879069191193498e-05, + "loss": 3.1509, + "step": 10512 + }, + { + "epoch": 0.48946621039644295, + "grad_norm": 0.4092168957970338, + "learning_rate": 9.879009970189585e-05, + "loss": 3.1989, + "step": 10513 + }, + { + "epoch": 0.489512768582536, + "grad_norm": 0.3481865336385563, + "learning_rate": 9.878950734866259e-05, + "loss": 3.1652, + "step": 10514 + }, + { + "epoch": 0.4895593267686291, + "grad_norm": 0.3768017665780301, + "learning_rate": 9.878891485223694e-05, + "loss": 3.2257, + "step": 10515 + }, + { + "epoch": 0.4896058849547222, + "grad_norm": 0.380286058640857, + "learning_rate": 9.878832221262062e-05, + "loss": 3.2835, + "step": 10516 + }, + { + "epoch": 0.48965244314081524, + "grad_norm": 0.37502646337425444, + "learning_rate": 9.878772942981538e-05, + "loss": 3.3014, + "step": 10517 + }, + { + "epoch": 0.4896990013269083, + "grad_norm": 0.3625807122703622, + "learning_rate": 9.878713650382296e-05, + "loss": 3.0644, + "step": 10518 + }, + { + "epoch": 0.48974555951300136, + "grad_norm": 0.3653868207519494, + "learning_rate": 9.87865434346451e-05, + "loss": 3.0977, + "step": 10519 + }, + { + "epoch": 0.4897921176990944, + "grad_norm": 0.37955023802917476, + "learning_rate": 9.878595022228355e-05, + "loss": 3.2012, + "step": 10520 + }, + { + "epoch": 0.48983867588518754, + "grad_norm": 0.3905188803274872, + "learning_rate": 9.878535686674003e-05, + "loss": 3.1308, + "step": 10521 + }, + { + "epoch": 0.4898852340712806, + "grad_norm": 0.35935284458676664, + "learning_rate": 9.87847633680163e-05, + "loss": 3.0831, + "step": 10522 + }, + { + "epoch": 0.48993179225737366, + "grad_norm": 0.38104164341692015, + "learning_rate": 9.87841697261141e-05, + "loss": 3.1652, + "step": 10523 + }, + { + "epoch": 0.4899783504434667, + "grad_norm": 0.3700344761706303, + "learning_rate": 9.878357594103516e-05, + "loss": 3.2152, + "step": 10524 + }, + { + "epoch": 0.4900249086295598, + "grad_norm": 0.44805138602145994, + "learning_rate": 9.878298201278124e-05, + "loss": 3.154, + "step": 10525 + }, + { + "epoch": 0.4900714668156529, + "grad_norm": 0.38902954180599336, + 
"learning_rate": 9.878238794135407e-05, + "loss": 3.1563, + "step": 10526 + }, + { + "epoch": 0.49011802500174595, + "grad_norm": 0.37479722873310084, + "learning_rate": 9.878179372675538e-05, + "loss": 3.2218, + "step": 10527 + }, + { + "epoch": 0.490164583187839, + "grad_norm": 0.4508260472936623, + "learning_rate": 9.878119936898695e-05, + "loss": 3.1919, + "step": 10528 + }, + { + "epoch": 0.49021114137393207, + "grad_norm": 0.3913626433944916, + "learning_rate": 9.87806048680505e-05, + "loss": 3.2599, + "step": 10529 + }, + { + "epoch": 0.49025769956002513, + "grad_norm": 0.40360549799855333, + "learning_rate": 9.878001022394779e-05, + "loss": 3.1555, + "step": 10530 + }, + { + "epoch": 0.4903042577461182, + "grad_norm": 0.45684569230307814, + "learning_rate": 9.877941543668055e-05, + "loss": 3.274, + "step": 10531 + }, + { + "epoch": 0.4903508159322113, + "grad_norm": 0.4305604039412616, + "learning_rate": 9.877882050625053e-05, + "loss": 3.2299, + "step": 10532 + }, + { + "epoch": 0.49039737411830436, + "grad_norm": 0.3896584641700199, + "learning_rate": 9.877822543265946e-05, + "loss": 3.2145, + "step": 10533 + }, + { + "epoch": 0.4904439323043974, + "grad_norm": 0.4285252557941259, + "learning_rate": 9.877763021590913e-05, + "loss": 3.1929, + "step": 10534 + }, + { + "epoch": 0.4904904904904905, + "grad_norm": 0.35952349210429824, + "learning_rate": 9.877703485600124e-05, + "loss": 3.1571, + "step": 10535 + }, + { + "epoch": 0.49053704867658354, + "grad_norm": 0.3960058382326044, + "learning_rate": 9.877643935293758e-05, + "loss": 3.1618, + "step": 10536 + }, + { + "epoch": 0.49058360686267666, + "grad_norm": 0.358116190462423, + "learning_rate": 9.877584370671986e-05, + "loss": 3.2073, + "step": 10537 + }, + { + "epoch": 0.4906301650487697, + "grad_norm": 0.37163011584976385, + "learning_rate": 9.877524791734985e-05, + "loss": 3.1389, + "step": 10538 + }, + { + "epoch": 0.4906767232348628, + "grad_norm": 0.3684791641036014, + "learning_rate": 9.87746519848293e-05, + "loss": 3.1614, + "step": 10539 + }, + { + "epoch": 0.49072328142095584, + "grad_norm": 0.3831614289003712, + "learning_rate": 9.877405590915994e-05, + "loss": 3.0755, + "step": 10540 + }, + { + "epoch": 0.4907698396070489, + "grad_norm": 0.425944629243461, + "learning_rate": 9.877345969034353e-05, + "loss": 3.2357, + "step": 10541 + }, + { + "epoch": 0.49081639779314196, + "grad_norm": 0.4784329092915174, + "learning_rate": 9.877286332838183e-05, + "loss": 3.1335, + "step": 10542 + }, + { + "epoch": 0.49086295597923507, + "grad_norm": 0.44146139475358576, + "learning_rate": 9.877226682327658e-05, + "loss": 3.1413, + "step": 10543 + }, + { + "epoch": 0.49090951416532813, + "grad_norm": 0.40675531111182817, + "learning_rate": 9.877167017502952e-05, + "loss": 3.1253, + "step": 10544 + }, + { + "epoch": 0.4909560723514212, + "grad_norm": 0.3668962699964148, + "learning_rate": 9.877107338364242e-05, + "loss": 3.2433, + "step": 10545 + }, + { + "epoch": 0.49100263053751425, + "grad_norm": 0.3873237750566218, + "learning_rate": 9.877047644911702e-05, + "loss": 3.2242, + "step": 10546 + }, + { + "epoch": 0.4910491887236073, + "grad_norm": 0.40368202612203546, + "learning_rate": 9.876987937145508e-05, + "loss": 3.2059, + "step": 10547 + }, + { + "epoch": 0.4910957469097004, + "grad_norm": 0.34315320639595065, + "learning_rate": 9.876928215065835e-05, + "loss": 3.2152, + "step": 10548 + }, + { + "epoch": 0.4911423050957935, + "grad_norm": 0.3768811510096098, + "learning_rate": 9.876868478672858e-05, + "loss": 3.3162, + "step": 
10549 + }, + { + "epoch": 0.49118886328188655, + "grad_norm": 0.44198124451606974, + "learning_rate": 9.876808727966753e-05, + "loss": 3.2873, + "step": 10550 + }, + { + "epoch": 0.4912354214679796, + "grad_norm": 0.4039616601101556, + "learning_rate": 9.876748962947694e-05, + "loss": 3.3872, + "step": 10551 + }, + { + "epoch": 0.49128197965407266, + "grad_norm": 0.36328894633129577, + "learning_rate": 9.876689183615858e-05, + "loss": 3.1436, + "step": 10552 + }, + { + "epoch": 0.4913285378401657, + "grad_norm": 0.4285577739092862, + "learning_rate": 9.876629389971419e-05, + "loss": 3.2413, + "step": 10553 + }, + { + "epoch": 0.49137509602625884, + "grad_norm": 0.3917308273504184, + "learning_rate": 9.876569582014554e-05, + "loss": 3.1583, + "step": 10554 + }, + { + "epoch": 0.4914216542123519, + "grad_norm": 0.4116689975448089, + "learning_rate": 9.876509759745436e-05, + "loss": 3.2331, + "step": 10555 + }, + { + "epoch": 0.49146821239844496, + "grad_norm": 0.4287603147963912, + "learning_rate": 9.876449923164244e-05, + "loss": 3.2731, + "step": 10556 + }, + { + "epoch": 0.491514770584538, + "grad_norm": 0.3886992533927911, + "learning_rate": 9.876390072271151e-05, + "loss": 3.2574, + "step": 10557 + }, + { + "epoch": 0.4915613287706311, + "grad_norm": 0.4175042610883531, + "learning_rate": 9.876330207066334e-05, + "loss": 3.0843, + "step": 10558 + }, + { + "epoch": 0.49160788695672414, + "grad_norm": 0.4605608300094148, + "learning_rate": 9.876270327549969e-05, + "loss": 3.251, + "step": 10559 + }, + { + "epoch": 0.49165444514281725, + "grad_norm": 0.4139089422299737, + "learning_rate": 9.87621043372223e-05, + "loss": 3.1755, + "step": 10560 + }, + { + "epoch": 0.4917010033289103, + "grad_norm": 0.35903113102152845, + "learning_rate": 9.876150525583293e-05, + "loss": 3.2304, + "step": 10561 + }, + { + "epoch": 0.4917475615150034, + "grad_norm": 0.4136497448670257, + "learning_rate": 9.876090603133337e-05, + "loss": 3.2659, + "step": 10562 + }, + { + "epoch": 0.49179411970109643, + "grad_norm": 0.45423494991498153, + "learning_rate": 9.876030666372533e-05, + "loss": 3.2256, + "step": 10563 + }, + { + "epoch": 0.4918406778871895, + "grad_norm": 0.38608758863801373, + "learning_rate": 9.87597071530106e-05, + "loss": 3.2101, + "step": 10564 + }, + { + "epoch": 0.4918872360732826, + "grad_norm": 0.38332395908572503, + "learning_rate": 9.875910749919093e-05, + "loss": 3.1578, + "step": 10565 + }, + { + "epoch": 0.49193379425937567, + "grad_norm": 0.43142669340843165, + "learning_rate": 9.875850770226809e-05, + "loss": 3.1878, + "step": 10566 + }, + { + "epoch": 0.4919803524454687, + "grad_norm": 0.37522648536403613, + "learning_rate": 9.875790776224382e-05, + "loss": 3.1721, + "step": 10567 + }, + { + "epoch": 0.4920269106315618, + "grad_norm": 0.40158004138499476, + "learning_rate": 9.875730767911992e-05, + "loss": 3.1669, + "step": 10568 + }, + { + "epoch": 0.49207346881765485, + "grad_norm": 0.39678046467651773, + "learning_rate": 9.875670745289812e-05, + "loss": 3.1526, + "step": 10569 + }, + { + "epoch": 0.4921200270037479, + "grad_norm": 0.36704505844043916, + "learning_rate": 9.875610708358018e-05, + "loss": 3.1983, + "step": 10570 + }, + { + "epoch": 0.492166585189841, + "grad_norm": 0.379003636100902, + "learning_rate": 9.875550657116785e-05, + "loss": 3.1291, + "step": 10571 + }, + { + "epoch": 0.4922131433759341, + "grad_norm": 0.41270937426756005, + "learning_rate": 9.875490591566294e-05, + "loss": 3.0945, + "step": 10572 + }, + { + "epoch": 0.49225970156202714, + "grad_norm": 
0.3626617048456486, + "learning_rate": 9.875430511706717e-05, + "loss": 3.2926, + "step": 10573 + }, + { + "epoch": 0.4923062597481202, + "grad_norm": 0.43777808625388503, + "learning_rate": 9.875370417538232e-05, + "loss": 3.0891, + "step": 10574 + }, + { + "epoch": 0.49235281793421326, + "grad_norm": 0.44805625230888746, + "learning_rate": 9.875310309061015e-05, + "loss": 3.0748, + "step": 10575 + }, + { + "epoch": 0.4923993761203064, + "grad_norm": 0.42016096911509737, + "learning_rate": 9.875250186275244e-05, + "loss": 3.3339, + "step": 10576 + }, + { + "epoch": 0.49244593430639944, + "grad_norm": 0.435320138483614, + "learning_rate": 9.875190049181092e-05, + "loss": 3.1964, + "step": 10577 + }, + { + "epoch": 0.4924924924924925, + "grad_norm": 0.37837541175250033, + "learning_rate": 9.875129897778738e-05, + "loss": 3.2879, + "step": 10578 + }, + { + "epoch": 0.49253905067858555, + "grad_norm": 0.40448231075353297, + "learning_rate": 9.875069732068358e-05, + "loss": 3.23, + "step": 10579 + }, + { + "epoch": 0.4925856088646786, + "grad_norm": 0.3609222546504487, + "learning_rate": 9.87500955205013e-05, + "loss": 3.2368, + "step": 10580 + }, + { + "epoch": 0.4926321670507717, + "grad_norm": 0.38956351731825845, + "learning_rate": 9.874949357724228e-05, + "loss": 3.0814, + "step": 10581 + }, + { + "epoch": 0.4926787252368648, + "grad_norm": 0.37017826693878536, + "learning_rate": 9.874889149090829e-05, + "loss": 3.0929, + "step": 10582 + }, + { + "epoch": 0.49272528342295785, + "grad_norm": 0.3776737686000824, + "learning_rate": 9.874828926150113e-05, + "loss": 3.3019, + "step": 10583 + }, + { + "epoch": 0.4927718416090509, + "grad_norm": 0.38560937896154307, + "learning_rate": 9.874768688902252e-05, + "loss": 3.1556, + "step": 10584 + }, + { + "epoch": 0.49281839979514397, + "grad_norm": 0.37319249049916586, + "learning_rate": 9.874708437347427e-05, + "loss": 3.0569, + "step": 10585 + }, + { + "epoch": 0.49286495798123703, + "grad_norm": 0.362719446344441, + "learning_rate": 9.874648171485811e-05, + "loss": 3.1583, + "step": 10586 + }, + { + "epoch": 0.49291151616733014, + "grad_norm": 0.4209508841049557, + "learning_rate": 9.874587891317583e-05, + "loss": 3.2464, + "step": 10587 + }, + { + "epoch": 0.4929580743534232, + "grad_norm": 0.33477017154722777, + "learning_rate": 9.874527596842922e-05, + "loss": 3.2105, + "step": 10588 + }, + { + "epoch": 0.49300463253951626, + "grad_norm": 0.3949090203396853, + "learning_rate": 9.874467288062001e-05, + "loss": 3.2042, + "step": 10589 + }, + { + "epoch": 0.4930511907256093, + "grad_norm": 0.37563481355567996, + "learning_rate": 9.874406964975e-05, + "loss": 3.2505, + "step": 10590 + }, + { + "epoch": 0.4930977489117024, + "grad_norm": 0.38961426936546006, + "learning_rate": 9.874346627582095e-05, + "loss": 3.1335, + "step": 10591 + }, + { + "epoch": 0.49314430709779544, + "grad_norm": 0.4320218777425284, + "learning_rate": 9.87428627588346e-05, + "loss": 3.1509, + "step": 10592 + }, + { + "epoch": 0.49319086528388856, + "grad_norm": 0.38205684322322414, + "learning_rate": 9.874225909879278e-05, + "loss": 3.2339, + "step": 10593 + }, + { + "epoch": 0.4932374234699816, + "grad_norm": 0.4107957739385294, + "learning_rate": 9.874165529569722e-05, + "loss": 3.2038, + "step": 10594 + }, + { + "epoch": 0.4932839816560747, + "grad_norm": 0.4484234137181278, + "learning_rate": 9.87410513495497e-05, + "loss": 3.1073, + "step": 10595 + }, + { + "epoch": 0.49333053984216774, + "grad_norm": 0.4666064187110231, + "learning_rate": 9.874044726035198e-05, + 
"loss": 3.1921, + "step": 10596 + }, + { + "epoch": 0.4933770980282608, + "grad_norm": 0.45691424602496156, + "learning_rate": 9.873984302810588e-05, + "loss": 3.1508, + "step": 10597 + }, + { + "epoch": 0.4934236562143539, + "grad_norm": 0.4189717475693692, + "learning_rate": 9.873923865281313e-05, + "loss": 3.2551, + "step": 10598 + }, + { + "epoch": 0.49347021440044697, + "grad_norm": 0.4480620528306564, + "learning_rate": 9.873863413447551e-05, + "loss": 3.0819, + "step": 10599 + }, + { + "epoch": 0.49351677258654003, + "grad_norm": 0.39673243070403685, + "learning_rate": 9.873802947309481e-05, + "loss": 3.2462, + "step": 10600 + }, + { + "epoch": 0.4935633307726331, + "grad_norm": 0.4574763684546103, + "learning_rate": 9.87374246686728e-05, + "loss": 3.0478, + "step": 10601 + }, + { + "epoch": 0.49360988895872615, + "grad_norm": 0.46020017325040075, + "learning_rate": 9.873681972121124e-05, + "loss": 3.1376, + "step": 10602 + }, + { + "epoch": 0.4936564471448192, + "grad_norm": 0.4626060436973318, + "learning_rate": 9.873621463071192e-05, + "loss": 3.1448, + "step": 10603 + }, + { + "epoch": 0.4937030053309123, + "grad_norm": 0.36904498931102075, + "learning_rate": 9.87356093971766e-05, + "loss": 3.1094, + "step": 10604 + }, + { + "epoch": 0.4937495635170054, + "grad_norm": 0.4166441655875738, + "learning_rate": 9.873500402060707e-05, + "loss": 3.1506, + "step": 10605 + }, + { + "epoch": 0.49379612170309845, + "grad_norm": 0.3975188301377343, + "learning_rate": 9.873439850100512e-05, + "loss": 3.1167, + "step": 10606 + }, + { + "epoch": 0.4938426798891915, + "grad_norm": 0.42365052234686973, + "learning_rate": 9.87337928383725e-05, + "loss": 3.167, + "step": 10607 + }, + { + "epoch": 0.49388923807528456, + "grad_norm": 0.46052710365052363, + "learning_rate": 9.873318703271099e-05, + "loss": 3.2356, + "step": 10608 + }, + { + "epoch": 0.4939357962613777, + "grad_norm": 0.36583336060724775, + "learning_rate": 9.873258108402239e-05, + "loss": 3.1561, + "step": 10609 + }, + { + "epoch": 0.49398235444747074, + "grad_norm": 0.4314431451330029, + "learning_rate": 9.873197499230847e-05, + "loss": 3.1553, + "step": 10610 + }, + { + "epoch": 0.4940289126335638, + "grad_norm": 0.3841705228371111, + "learning_rate": 9.8731368757571e-05, + "loss": 3.2004, + "step": 10611 + }, + { + "epoch": 0.49407547081965686, + "grad_norm": 0.3856047760297062, + "learning_rate": 9.873076237981176e-05, + "loss": 3.086, + "step": 10612 + }, + { + "epoch": 0.4941220290057499, + "grad_norm": 0.4090022562339582, + "learning_rate": 9.873015585903253e-05, + "loss": 3.2826, + "step": 10613 + }, + { + "epoch": 0.494168587191843, + "grad_norm": 0.40210347353064235, + "learning_rate": 9.87295491952351e-05, + "loss": 3.1661, + "step": 10614 + }, + { + "epoch": 0.4942151453779361, + "grad_norm": 0.5009709467730324, + "learning_rate": 9.872894238842125e-05, + "loss": 3.3355, + "step": 10615 + }, + { + "epoch": 0.49426170356402915, + "grad_norm": 0.4268103238500063, + "learning_rate": 9.872833543859275e-05, + "loss": 3.1976, + "step": 10616 + }, + { + "epoch": 0.4943082617501222, + "grad_norm": 0.3829048934715483, + "learning_rate": 9.872772834575136e-05, + "loss": 3.2159, + "step": 10617 + }, + { + "epoch": 0.4943548199362153, + "grad_norm": 0.49488587786766597, + "learning_rate": 9.872712110989893e-05, + "loss": 3.2093, + "step": 10618 + }, + { + "epoch": 0.49440137812230833, + "grad_norm": 0.4698267091684205, + "learning_rate": 9.872651373103718e-05, + "loss": 3.1112, + "step": 10619 + }, + { + "epoch": 0.49444793630840145, 
+ "grad_norm": 0.4338096697646974, + "learning_rate": 9.87259062091679e-05, + "loss": 3.2461, + "step": 10620 + }, + { + "epoch": 0.4944944944944945, + "grad_norm": 0.40119899450408575, + "learning_rate": 9.872529854429291e-05, + "loss": 3.141, + "step": 10621 + }, + { + "epoch": 0.49454105268058757, + "grad_norm": 0.3632869429924093, + "learning_rate": 9.872469073641395e-05, + "loss": 3.1543, + "step": 10622 + }, + { + "epoch": 0.4945876108666806, + "grad_norm": 0.4074558831860262, + "learning_rate": 9.872408278553284e-05, + "loss": 3.1906, + "step": 10623 + }, + { + "epoch": 0.4946341690527737, + "grad_norm": 0.3770043280884287, + "learning_rate": 9.872347469165133e-05, + "loss": 3.16, + "step": 10624 + }, + { + "epoch": 0.49468072723886675, + "grad_norm": 0.4024370273513671, + "learning_rate": 9.872286645477122e-05, + "loss": 3.0697, + "step": 10625 + }, + { + "epoch": 0.49472728542495986, + "grad_norm": 0.3393521275898079, + "learning_rate": 9.872225807489432e-05, + "loss": 3.2369, + "step": 10626 + }, + { + "epoch": 0.4947738436110529, + "grad_norm": 0.37787581322664104, + "learning_rate": 9.872164955202238e-05, + "loss": 3.1434, + "step": 10627 + }, + { + "epoch": 0.494820401797146, + "grad_norm": 0.35868395123650065, + "learning_rate": 9.872104088615719e-05, + "loss": 3.2204, + "step": 10628 + }, + { + "epoch": 0.49486695998323904, + "grad_norm": 0.34943098449453736, + "learning_rate": 9.872043207730055e-05, + "loss": 3.2452, + "step": 10629 + }, + { + "epoch": 0.4949135181693321, + "grad_norm": 0.4018708484047773, + "learning_rate": 9.871982312545425e-05, + "loss": 3.2033, + "step": 10630 + }, + { + "epoch": 0.4949600763554252, + "grad_norm": 0.3971175196023875, + "learning_rate": 9.871921403062005e-05, + "loss": 3.1425, + "step": 10631 + }, + { + "epoch": 0.4950066345415183, + "grad_norm": 0.39966497066486234, + "learning_rate": 9.871860479279977e-05, + "loss": 3.126, + "step": 10632 + }, + { + "epoch": 0.49505319272761134, + "grad_norm": 0.39731765078014103, + "learning_rate": 9.87179954119952e-05, + "loss": 3.2188, + "step": 10633 + }, + { + "epoch": 0.4950997509137044, + "grad_norm": 0.4119580805866471, + "learning_rate": 9.871738588820809e-05, + "loss": 3.1298, + "step": 10634 + }, + { + "epoch": 0.49514630909979745, + "grad_norm": 0.36205641025484164, + "learning_rate": 9.871677622144027e-05, + "loss": 3.3341, + "step": 10635 + }, + { + "epoch": 0.4951928672858905, + "grad_norm": 0.39351267460311107, + "learning_rate": 9.871616641169349e-05, + "loss": 3.1561, + "step": 10636 + }, + { + "epoch": 0.49523942547198363, + "grad_norm": 0.42249480140919893, + "learning_rate": 9.871555645896957e-05, + "loss": 3.294, + "step": 10637 + }, + { + "epoch": 0.4952859836580767, + "grad_norm": 0.3867651982622768, + "learning_rate": 9.871494636327029e-05, + "loss": 3.1361, + "step": 10638 + }, + { + "epoch": 0.49533254184416975, + "grad_norm": 0.3735896028488212, + "learning_rate": 9.871433612459744e-05, + "loss": 3.1324, + "step": 10639 + }, + { + "epoch": 0.4953791000302628, + "grad_norm": 0.3638539548967873, + "learning_rate": 9.871372574295281e-05, + "loss": 3.159, + "step": 10640 + }, + { + "epoch": 0.49542565821635587, + "grad_norm": 0.4176749319395332, + "learning_rate": 9.871311521833821e-05, + "loss": 3.1807, + "step": 10641 + }, + { + "epoch": 0.495472216402449, + "grad_norm": 0.3972087076634933, + "learning_rate": 9.87125045507554e-05, + "loss": 3.1607, + "step": 10642 + }, + { + "epoch": 0.49551877458854204, + "grad_norm": 0.3941576250621481, + "learning_rate": 
9.87118937402062e-05, + "loss": 3.23, + "step": 10643 + }, + { + "epoch": 0.4955653327746351, + "grad_norm": 0.37180880829380847, + "learning_rate": 9.871128278669239e-05, + "loss": 3.205, + "step": 10644 + }, + { + "epoch": 0.49561189096072816, + "grad_norm": 0.4033971201432477, + "learning_rate": 9.871067169021574e-05, + "loss": 3.0546, + "step": 10645 + }, + { + "epoch": 0.4956584491468212, + "grad_norm": 0.41787959352989223, + "learning_rate": 9.87100604507781e-05, + "loss": 3.2017, + "step": 10646 + }, + { + "epoch": 0.4957050073329143, + "grad_norm": 0.39339734203696225, + "learning_rate": 9.870944906838122e-05, + "loss": 3.2239, + "step": 10647 + }, + { + "epoch": 0.4957515655190074, + "grad_norm": 0.43187571011098613, + "learning_rate": 9.87088375430269e-05, + "loss": 3.1358, + "step": 10648 + }, + { + "epoch": 0.49579812370510046, + "grad_norm": 0.40629336051272136, + "learning_rate": 9.870822587471692e-05, + "loss": 3.1824, + "step": 10649 + }, + { + "epoch": 0.4958446818911935, + "grad_norm": 0.4022488252988561, + "learning_rate": 9.870761406345314e-05, + "loss": 3.1986, + "step": 10650 + }, + { + "epoch": 0.4958912400772866, + "grad_norm": 0.40840852311303044, + "learning_rate": 9.870700210923729e-05, + "loss": 3.1981, + "step": 10651 + }, + { + "epoch": 0.49593779826337964, + "grad_norm": 0.3924779456126989, + "learning_rate": 9.870639001207118e-05, + "loss": 3.1766, + "step": 10652 + }, + { + "epoch": 0.49598435644947275, + "grad_norm": 0.36539091686404485, + "learning_rate": 9.870577777195662e-05, + "loss": 3.2051, + "step": 10653 + }, + { + "epoch": 0.4960309146355658, + "grad_norm": 0.378774630562038, + "learning_rate": 9.870516538889541e-05, + "loss": 3.2057, + "step": 10654 + }, + { + "epoch": 0.49607747282165887, + "grad_norm": 0.3629375770500403, + "learning_rate": 9.870455286288934e-05, + "loss": 3.1349, + "step": 10655 + }, + { + "epoch": 0.49612403100775193, + "grad_norm": 0.3974696906495257, + "learning_rate": 9.870394019394018e-05, + "loss": 3.1724, + "step": 10656 + }, + { + "epoch": 0.496170589193845, + "grad_norm": 0.4076066805224503, + "learning_rate": 9.870332738204977e-05, + "loss": 3.337, + "step": 10657 + }, + { + "epoch": 0.49621714737993805, + "grad_norm": 0.3557387989159008, + "learning_rate": 9.870271442721989e-05, + "loss": 3.2249, + "step": 10658 + }, + { + "epoch": 0.49626370556603117, + "grad_norm": 0.4069192949231365, + "learning_rate": 9.870210132945235e-05, + "loss": 3.1154, + "step": 10659 + }, + { + "epoch": 0.4963102637521242, + "grad_norm": 0.41217006714130616, + "learning_rate": 9.870148808874893e-05, + "loss": 3.2242, + "step": 10660 + }, + { + "epoch": 0.4963568219382173, + "grad_norm": 0.4422342386379205, + "learning_rate": 9.870087470511144e-05, + "loss": 3.2165, + "step": 10661 + }, + { + "epoch": 0.49640338012431034, + "grad_norm": 0.3782602564073404, + "learning_rate": 9.870026117854167e-05, + "loss": 3.1497, + "step": 10662 + }, + { + "epoch": 0.4964499383104034, + "grad_norm": 0.4031747758991462, + "learning_rate": 9.869964750904146e-05, + "loss": 3.131, + "step": 10663 + }, + { + "epoch": 0.4964964964964965, + "grad_norm": 0.4134488482999995, + "learning_rate": 9.869903369661257e-05, + "loss": 3.3305, + "step": 10664 + }, + { + "epoch": 0.4965430546825896, + "grad_norm": 0.36223291370468796, + "learning_rate": 9.86984197412568e-05, + "loss": 3.2251, + "step": 10665 + }, + { + "epoch": 0.49658961286868264, + "grad_norm": 0.37242975972998843, + "learning_rate": 9.869780564297598e-05, + "loss": 3.0508, + "step": 10666 + }, + { + 
"epoch": 0.4966361710547757, + "grad_norm": 0.3871184935215783, + "learning_rate": 9.86971914017719e-05, + "loss": 3.2593, + "step": 10667 + }, + { + "epoch": 0.49668272924086876, + "grad_norm": 0.42372767634473646, + "learning_rate": 9.869657701764636e-05, + "loss": 3.1342, + "step": 10668 + }, + { + "epoch": 0.4967292874269618, + "grad_norm": 0.36756838413188897, + "learning_rate": 9.869596249060115e-05, + "loss": 3.1494, + "step": 10669 + }, + { + "epoch": 0.49677584561305493, + "grad_norm": 0.4086108387915879, + "learning_rate": 9.86953478206381e-05, + "loss": 3.0991, + "step": 10670 + }, + { + "epoch": 0.496822403799148, + "grad_norm": 0.4477699429117738, + "learning_rate": 9.8694733007759e-05, + "loss": 3.2093, + "step": 10671 + }, + { + "epoch": 0.49686896198524105, + "grad_norm": 0.33217059348232203, + "learning_rate": 9.869411805196565e-05, + "loss": 3.1005, + "step": 10672 + }, + { + "epoch": 0.4969155201713341, + "grad_norm": 0.4686086714065465, + "learning_rate": 9.869350295325985e-05, + "loss": 3.355, + "step": 10673 + }, + { + "epoch": 0.4969620783574272, + "grad_norm": 0.46702161617606386, + "learning_rate": 9.869288771164345e-05, + "loss": 3.1331, + "step": 10674 + }, + { + "epoch": 0.4970086365435203, + "grad_norm": 0.386256176245458, + "learning_rate": 9.86922723271182e-05, + "loss": 3.1746, + "step": 10675 + }, + { + "epoch": 0.49705519472961335, + "grad_norm": 0.46577026458210563, + "learning_rate": 9.869165679968592e-05, + "loss": 3.3519, + "step": 10676 + }, + { + "epoch": 0.4971017529157064, + "grad_norm": 0.4488053773671858, + "learning_rate": 9.869104112934844e-05, + "loss": 3.2303, + "step": 10677 + }, + { + "epoch": 0.49714831110179947, + "grad_norm": 0.41234067225648124, + "learning_rate": 9.869042531610755e-05, + "loss": 3.2241, + "step": 10678 + }, + { + "epoch": 0.4971948692878925, + "grad_norm": 0.4039609300805892, + "learning_rate": 9.868980935996505e-05, + "loss": 3.1664, + "step": 10679 + }, + { + "epoch": 0.4972414274739856, + "grad_norm": 0.4182976854186412, + "learning_rate": 9.868919326092276e-05, + "loss": 3.2226, + "step": 10680 + }, + { + "epoch": 0.4972879856600787, + "grad_norm": 0.4779969811778533, + "learning_rate": 9.868857701898248e-05, + "loss": 3.146, + "step": 10681 + }, + { + "epoch": 0.49733454384617176, + "grad_norm": 0.40563638159631243, + "learning_rate": 9.868796063414603e-05, + "loss": 3.2379, + "step": 10682 + }, + { + "epoch": 0.4973811020322648, + "grad_norm": 0.40469600310185294, + "learning_rate": 9.868734410641521e-05, + "loss": 3.152, + "step": 10683 + }, + { + "epoch": 0.4974276602183579, + "grad_norm": 0.48106828712450816, + "learning_rate": 9.868672743579183e-05, + "loss": 3.19, + "step": 10684 + }, + { + "epoch": 0.49747421840445094, + "grad_norm": 0.4111229330941062, + "learning_rate": 9.86861106222777e-05, + "loss": 3.1375, + "step": 10685 + }, + { + "epoch": 0.49752077659054406, + "grad_norm": 0.39472020756246895, + "learning_rate": 9.868549366587463e-05, + "loss": 3.1915, + "step": 10686 + }, + { + "epoch": 0.4975673347766371, + "grad_norm": 0.4108721939437341, + "learning_rate": 9.868487656658443e-05, + "loss": 3.1287, + "step": 10687 + }, + { + "epoch": 0.4976138929627302, + "grad_norm": 0.3949797746812704, + "learning_rate": 9.868425932440892e-05, + "loss": 3.2587, + "step": 10688 + }, + { + "epoch": 0.49766045114882324, + "grad_norm": 0.3939255503063094, + "learning_rate": 9.868364193934992e-05, + "loss": 3.0841, + "step": 10689 + }, + { + "epoch": 0.4977070093349163, + "grad_norm": 0.3997171668277466, + 
"learning_rate": 9.868302441140919e-05, + "loss": 3.2719, + "step": 10690 + }, + { + "epoch": 0.49775356752100935, + "grad_norm": 0.3845340067541915, + "learning_rate": 9.868240674058861e-05, + "loss": 3.1435, + "step": 10691 + }, + { + "epoch": 0.49780012570710247, + "grad_norm": 0.4170607149317637, + "learning_rate": 9.868178892688996e-05, + "loss": 3.2088, + "step": 10692 + }, + { + "epoch": 0.49784668389319553, + "grad_norm": 0.41249647782706805, + "learning_rate": 9.868117097031505e-05, + "loss": 3.2119, + "step": 10693 + }, + { + "epoch": 0.4978932420792886, + "grad_norm": 0.42718583307296065, + "learning_rate": 9.86805528708657e-05, + "loss": 3.0918, + "step": 10694 + }, + { + "epoch": 0.49793980026538165, + "grad_norm": 0.40390559968170603, + "learning_rate": 9.867993462854373e-05, + "loss": 3.1898, + "step": 10695 + }, + { + "epoch": 0.4979863584514747, + "grad_norm": 0.380935559141056, + "learning_rate": 9.867931624335093e-05, + "loss": 3.171, + "step": 10696 + }, + { + "epoch": 0.4980329166375678, + "grad_norm": 0.40277061505277173, + "learning_rate": 9.867869771528914e-05, + "loss": 3.1676, + "step": 10697 + }, + { + "epoch": 0.4980794748236609, + "grad_norm": 0.356440908770564, + "learning_rate": 9.867807904436017e-05, + "loss": 3.1931, + "step": 10698 + }, + { + "epoch": 0.49812603300975394, + "grad_norm": 0.41072650780006137, + "learning_rate": 9.867746023056583e-05, + "loss": 3.2491, + "step": 10699 + }, + { + "epoch": 0.498172591195847, + "grad_norm": 0.3533567324450499, + "learning_rate": 9.867684127390795e-05, + "loss": 3.1873, + "step": 10700 + }, + { + "epoch": 0.49821914938194006, + "grad_norm": 0.4002543813993354, + "learning_rate": 9.867622217438832e-05, + "loss": 3.0848, + "step": 10701 + }, + { + "epoch": 0.4982657075680331, + "grad_norm": 0.40278126017956845, + "learning_rate": 9.867560293200879e-05, + "loss": 3.2505, + "step": 10702 + }, + { + "epoch": 0.49831226575412624, + "grad_norm": 0.3972043622160113, + "learning_rate": 9.867498354677115e-05, + "loss": 3.0572, + "step": 10703 + }, + { + "epoch": 0.4983588239402193, + "grad_norm": 0.3475048992149292, + "learning_rate": 9.867436401867722e-05, + "loss": 3.0965, + "step": 10704 + }, + { + "epoch": 0.49840538212631236, + "grad_norm": 0.396998626218964, + "learning_rate": 9.867374434772885e-05, + "loss": 3.2639, + "step": 10705 + }, + { + "epoch": 0.4984519403124054, + "grad_norm": 0.44199080849391525, + "learning_rate": 9.867312453392783e-05, + "loss": 3.1249, + "step": 10706 + }, + { + "epoch": 0.4984984984984985, + "grad_norm": 0.36117830379582344, + "learning_rate": 9.867250457727596e-05, + "loss": 3.2329, + "step": 10707 + }, + { + "epoch": 0.4985450566845916, + "grad_norm": 0.40421639433413514, + "learning_rate": 9.867188447777511e-05, + "loss": 3.1256, + "step": 10708 + }, + { + "epoch": 0.49859161487068465, + "grad_norm": 0.35415323133920285, + "learning_rate": 9.867126423542707e-05, + "loss": 3.1558, + "step": 10709 + }, + { + "epoch": 0.4986381730567777, + "grad_norm": 0.3828284181838322, + "learning_rate": 9.867064385023366e-05, + "loss": 3.1451, + "step": 10710 + }, + { + "epoch": 0.49868473124287077, + "grad_norm": 0.4258384425413005, + "learning_rate": 9.86700233221967e-05, + "loss": 3.1374, + "step": 10711 + }, + { + "epoch": 0.49873128942896383, + "grad_norm": 0.3757053541475898, + "learning_rate": 9.866940265131804e-05, + "loss": 3.2084, + "step": 10712 + }, + { + "epoch": 0.4987778476150569, + "grad_norm": 0.3977763490612459, + "learning_rate": 9.866878183759946e-05, + "loss": 3.3062, + "step": 
10713 + }, + { + "epoch": 0.49882440580115, + "grad_norm": 0.4217560216785892, + "learning_rate": 9.866816088104279e-05, + "loss": 3.2121, + "step": 10714 + }, + { + "epoch": 0.49887096398724307, + "grad_norm": 0.4148101832607633, + "learning_rate": 9.866753978164987e-05, + "loss": 3.1287, + "step": 10715 + }, + { + "epoch": 0.4989175221733361, + "grad_norm": 0.385441197905142, + "learning_rate": 9.866691853942252e-05, + "loss": 3.095, + "step": 10716 + }, + { + "epoch": 0.4989640803594292, + "grad_norm": 0.37506532880972104, + "learning_rate": 9.866629715436257e-05, + "loss": 3.1518, + "step": 10717 + }, + { + "epoch": 0.49901063854552224, + "grad_norm": 0.43871798682939794, + "learning_rate": 9.86656756264718e-05, + "loss": 3.2374, + "step": 10718 + }, + { + "epoch": 0.49905719673161536, + "grad_norm": 0.44694960694485913, + "learning_rate": 9.86650539557521e-05, + "loss": 3.2129, + "step": 10719 + }, + { + "epoch": 0.4991037549177084, + "grad_norm": 0.4705082301517469, + "learning_rate": 9.866443214220524e-05, + "loss": 3.3352, + "step": 10720 + }, + { + "epoch": 0.4991503131038015, + "grad_norm": 0.36319384586816567, + "learning_rate": 9.866381018583308e-05, + "loss": 3.1631, + "step": 10721 + }, + { + "epoch": 0.49919687128989454, + "grad_norm": 0.43993582335887577, + "learning_rate": 9.866318808663741e-05, + "loss": 3.2645, + "step": 10722 + }, + { + "epoch": 0.4992434294759876, + "grad_norm": 0.38520456759166666, + "learning_rate": 9.86625658446201e-05, + "loss": 3.1142, + "step": 10723 + }, + { + "epoch": 0.49928998766208066, + "grad_norm": 0.44038425405224235, + "learning_rate": 9.866194345978295e-05, + "loss": 3.1867, + "step": 10724 + }, + { + "epoch": 0.4993365458481738, + "grad_norm": 0.3985701568486585, + "learning_rate": 9.866132093212778e-05, + "loss": 3.2048, + "step": 10725 + }, + { + "epoch": 0.49938310403426683, + "grad_norm": 0.43121749924051767, + "learning_rate": 9.866069826165643e-05, + "loss": 3.2709, + "step": 10726 + }, + { + "epoch": 0.4994296622203599, + "grad_norm": 0.4171571746471692, + "learning_rate": 9.866007544837073e-05, + "loss": 3.1367, + "step": 10727 + }, + { + "epoch": 0.49947622040645295, + "grad_norm": 0.3393018184591227, + "learning_rate": 9.865945249227248e-05, + "loss": 3.1236, + "step": 10728 + }, + { + "epoch": 0.499522778592546, + "grad_norm": 0.40060368335742125, + "learning_rate": 9.865882939336356e-05, + "loss": 3.2242, + "step": 10729 + }, + { + "epoch": 0.49956933677863913, + "grad_norm": 0.40169910135401077, + "learning_rate": 9.865820615164577e-05, + "loss": 3.1225, + "step": 10730 + }, + { + "epoch": 0.4996158949647322, + "grad_norm": 0.3672242153156314, + "learning_rate": 9.865758276712091e-05, + "loss": 3.1362, + "step": 10731 + }, + { + "epoch": 0.49966245315082525, + "grad_norm": 0.39138514697052473, + "learning_rate": 9.865695923979086e-05, + "loss": 3.1044, + "step": 10732 + }, + { + "epoch": 0.4997090113369183, + "grad_norm": 0.43827712264938434, + "learning_rate": 9.865633556965741e-05, + "loss": 3.2582, + "step": 10733 + }, + { + "epoch": 0.49975556952301137, + "grad_norm": 0.40345889882981933, + "learning_rate": 9.865571175672243e-05, + "loss": 3.1186, + "step": 10734 + }, + { + "epoch": 0.4998021277091044, + "grad_norm": 0.39439139287307695, + "learning_rate": 9.865508780098774e-05, + "loss": 3.2648, + "step": 10735 + }, + { + "epoch": 0.49984868589519754, + "grad_norm": 0.49913980427137034, + "learning_rate": 9.865446370245513e-05, + "loss": 3.1734, + "step": 10736 + }, + { + "epoch": 0.4998952440812906, + "grad_norm": 
0.4461701040926774, + "learning_rate": 9.865383946112647e-05, + "loss": 3.1055, + "step": 10737 + }, + { + "epoch": 0.49994180226738366, + "grad_norm": 0.3875083808621537, + "learning_rate": 9.865321507700359e-05, + "loss": 3.1977, + "step": 10738 + }, + { + "epoch": 0.4999883604534767, + "grad_norm": 0.4467439664045657, + "learning_rate": 9.865259055008831e-05, + "loss": 3.2141, + "step": 10739 + }, + { + "epoch": 0.5000349186395698, + "grad_norm": 0.5155462067159088, + "learning_rate": 9.865196588038249e-05, + "loss": 3.2586, + "step": 10740 + }, + { + "epoch": 0.5000814768256628, + "grad_norm": 0.39467699802492645, + "learning_rate": 9.865134106788791e-05, + "loss": 3.2055, + "step": 10741 + }, + { + "epoch": 0.500128035011756, + "grad_norm": 0.4041064684148214, + "learning_rate": 9.865071611260646e-05, + "loss": 3.2281, + "step": 10742 + }, + { + "epoch": 0.500174593197849, + "grad_norm": 0.4362327714640645, + "learning_rate": 9.865009101453995e-05, + "loss": 3.1806, + "step": 10743 + }, + { + "epoch": 0.5002211513839421, + "grad_norm": 0.355196968159719, + "learning_rate": 9.86494657736902e-05, + "loss": 3.3205, + "step": 10744 + }, + { + "epoch": 0.5002677095700352, + "grad_norm": 0.4203556468692222, + "learning_rate": 9.864884039005908e-05, + "loss": 3.2175, + "step": 10745 + }, + { + "epoch": 0.5003142677561282, + "grad_norm": 0.40869226577939155, + "learning_rate": 9.86482148636484e-05, + "loss": 3.2105, + "step": 10746 + }, + { + "epoch": 0.5003608259422213, + "grad_norm": 0.4502830183106791, + "learning_rate": 9.864758919446e-05, + "loss": 3.2544, + "step": 10747 + }, + { + "epoch": 0.5004073841283143, + "grad_norm": 0.4216437042570721, + "learning_rate": 9.864696338249573e-05, + "loss": 3.1472, + "step": 10748 + }, + { + "epoch": 0.5004539423144074, + "grad_norm": 0.37836246829736414, + "learning_rate": 9.86463374277574e-05, + "loss": 3.1886, + "step": 10749 + }, + { + "epoch": 0.5005005005005005, + "grad_norm": 0.43269616644248055, + "learning_rate": 9.864571133024686e-05, + "loss": 3.1764, + "step": 10750 + }, + { + "epoch": 0.5005470586865935, + "grad_norm": 0.4115670347668568, + "learning_rate": 9.864508508996595e-05, + "loss": 3.1677, + "step": 10751 + }, + { + "epoch": 0.5005936168726867, + "grad_norm": 0.39803452770277553, + "learning_rate": 9.864445870691653e-05, + "loss": 3.2392, + "step": 10752 + }, + { + "epoch": 0.5006401750587797, + "grad_norm": 0.42317823961383544, + "learning_rate": 9.864383218110038e-05, + "loss": 3.2144, + "step": 10753 + }, + { + "epoch": 0.5006867332448728, + "grad_norm": 0.3843114098586527, + "learning_rate": 9.86432055125194e-05, + "loss": 3.065, + "step": 10754 + }, + { + "epoch": 0.5007332914309659, + "grad_norm": 0.40481123291039744, + "learning_rate": 9.86425787011754e-05, + "loss": 3.1668, + "step": 10755 + }, + { + "epoch": 0.5007798496170589, + "grad_norm": 0.4263994849575341, + "learning_rate": 9.864195174707023e-05, + "loss": 3.0593, + "step": 10756 + }, + { + "epoch": 0.500826407803152, + "grad_norm": 0.3684553485497107, + "learning_rate": 9.86413246502057e-05, + "loss": 3.1016, + "step": 10757 + }, + { + "epoch": 0.500872965989245, + "grad_norm": 0.47695160162264644, + "learning_rate": 9.864069741058369e-05, + "loss": 3.2091, + "step": 10758 + }, + { + "epoch": 0.5009195241753381, + "grad_norm": 0.45818073342552884, + "learning_rate": 9.864007002820603e-05, + "loss": 3.1687, + "step": 10759 + }, + { + "epoch": 0.5009660823614313, + "grad_norm": 0.39257302704797525, + "learning_rate": 9.863944250307455e-05, + "loss": 3.192, + 
"step": 10760 + }, + { + "epoch": 0.5010126405475243, + "grad_norm": 0.35089410721312736, + "learning_rate": 9.86388148351911e-05, + "loss": 3.1256, + "step": 10761 + }, + { + "epoch": 0.5010591987336174, + "grad_norm": 0.4016596607080973, + "learning_rate": 9.863818702455752e-05, + "loss": 3.1501, + "step": 10762 + }, + { + "epoch": 0.5011057569197104, + "grad_norm": 0.38153679366136034, + "learning_rate": 9.863755907117565e-05, + "loss": 2.9953, + "step": 10763 + }, + { + "epoch": 0.5011523151058035, + "grad_norm": 0.3878059231603726, + "learning_rate": 9.863693097504733e-05, + "loss": 3.2212, + "step": 10764 + }, + { + "epoch": 0.5011988732918965, + "grad_norm": 0.38992326101091745, + "learning_rate": 9.863630273617442e-05, + "loss": 3.1854, + "step": 10765 + }, + { + "epoch": 0.5012454314779896, + "grad_norm": 0.3882844677025632, + "learning_rate": 9.863567435455876e-05, + "loss": 3.1314, + "step": 10766 + }, + { + "epoch": 0.5012919896640827, + "grad_norm": 0.3518901771522584, + "learning_rate": 9.863504583020218e-05, + "loss": 3.2127, + "step": 10767 + }, + { + "epoch": 0.5013385478501757, + "grad_norm": 0.37099575012815506, + "learning_rate": 9.863441716310651e-05, + "loss": 3.0841, + "step": 10768 + }, + { + "epoch": 0.5013851060362688, + "grad_norm": 0.39095106724887374, + "learning_rate": 9.863378835327364e-05, + "loss": 3.2462, + "step": 10769 + }, + { + "epoch": 0.5014316642223619, + "grad_norm": 0.37456241423375297, + "learning_rate": 9.863315940070539e-05, + "loss": 3.1324, + "step": 10770 + }, + { + "epoch": 0.501478222408455, + "grad_norm": 0.4250949138830732, + "learning_rate": 9.863253030540361e-05, + "loss": 3.1246, + "step": 10771 + }, + { + "epoch": 0.5015247805945481, + "grad_norm": 0.38717995048325055, + "learning_rate": 9.863190106737013e-05, + "loss": 3.1413, + "step": 10772 + }, + { + "epoch": 0.5015713387806411, + "grad_norm": 0.40252198004219436, + "learning_rate": 9.863127168660683e-05, + "loss": 3.0949, + "step": 10773 + }, + { + "epoch": 0.5016178969667342, + "grad_norm": 0.37963809693526296, + "learning_rate": 9.863064216311553e-05, + "loss": 3.2002, + "step": 10774 + }, + { + "epoch": 0.5016644551528272, + "grad_norm": 0.4204477472593467, + "learning_rate": 9.86300124968981e-05, + "loss": 3.1656, + "step": 10775 + }, + { + "epoch": 0.5017110133389203, + "grad_norm": 0.4136445347826309, + "learning_rate": 9.862938268795635e-05, + "loss": 3.1817, + "step": 10776 + }, + { + "epoch": 0.5017575715250134, + "grad_norm": 0.44892273982964315, + "learning_rate": 9.862875273629217e-05, + "loss": 3.2544, + "step": 10777 + }, + { + "epoch": 0.5018041297111064, + "grad_norm": 0.39540861019752616, + "learning_rate": 9.862812264190738e-05, + "loss": 3.1154, + "step": 10778 + }, + { + "epoch": 0.5018506878971996, + "grad_norm": 0.3624795898214495, + "learning_rate": 9.862749240480385e-05, + "loss": 3.1216, + "step": 10779 + }, + { + "epoch": 0.5018972460832926, + "grad_norm": 0.3886749748567495, + "learning_rate": 9.862686202498342e-05, + "loss": 3.2089, + "step": 10780 + }, + { + "epoch": 0.5019438042693857, + "grad_norm": 0.4293391912052408, + "learning_rate": 9.862623150244792e-05, + "loss": 3.1736, + "step": 10781 + }, + { + "epoch": 0.5019903624554788, + "grad_norm": 0.38347414755538534, + "learning_rate": 9.862560083719925e-05, + "loss": 3.151, + "step": 10782 + }, + { + "epoch": 0.5020369206415718, + "grad_norm": 0.4526163651398505, + "learning_rate": 9.862497002923921e-05, + "loss": 3.1134, + "step": 10783 + }, + { + "epoch": 0.5020834788276649, + "grad_norm": 
0.5228008928798897, + "learning_rate": 9.862433907856969e-05, + "loss": 3.2622, + "step": 10784 + }, + { + "epoch": 0.5021300370137579, + "grad_norm": 0.40028370787108986, + "learning_rate": 9.862370798519252e-05, + "loss": 3.1972, + "step": 10785 + }, + { + "epoch": 0.502176595199851, + "grad_norm": 0.41481128797631817, + "learning_rate": 9.862307674910956e-05, + "loss": 3.2765, + "step": 10786 + }, + { + "epoch": 0.502223153385944, + "grad_norm": 0.4929571716785761, + "learning_rate": 9.862244537032264e-05, + "loss": 3.2169, + "step": 10787 + }, + { + "epoch": 0.5022697115720371, + "grad_norm": 0.4309073064099923, + "learning_rate": 9.862181384883365e-05, + "loss": 3.221, + "step": 10788 + }, + { + "epoch": 0.5023162697581303, + "grad_norm": 0.39367451921655905, + "learning_rate": 9.862118218464442e-05, + "loss": 3.2342, + "step": 10789 + }, + { + "epoch": 0.5023628279442233, + "grad_norm": 0.4711878505090341, + "learning_rate": 9.862055037775681e-05, + "loss": 3.2011, + "step": 10790 + }, + { + "epoch": 0.5024093861303164, + "grad_norm": 0.3606534944660577, + "learning_rate": 9.861991842817269e-05, + "loss": 3.3185, + "step": 10791 + }, + { + "epoch": 0.5024559443164094, + "grad_norm": 0.4429606906251044, + "learning_rate": 9.861928633589389e-05, + "loss": 3.1135, + "step": 10792 + }, + { + "epoch": 0.5025025025025025, + "grad_norm": 0.4195066796474705, + "learning_rate": 9.861865410092228e-05, + "loss": 3.2491, + "step": 10793 + }, + { + "epoch": 0.5025490606885956, + "grad_norm": 0.3935402068292259, + "learning_rate": 9.861802172325969e-05, + "loss": 3.1101, + "step": 10794 + }, + { + "epoch": 0.5025956188746886, + "grad_norm": 0.4365825339131148, + "learning_rate": 9.861738920290802e-05, + "loss": 3.1617, + "step": 10795 + }, + { + "epoch": 0.5026421770607817, + "grad_norm": 0.3586573184239674, + "learning_rate": 9.861675653986909e-05, + "loss": 3.0961, + "step": 10796 + }, + { + "epoch": 0.5026887352468747, + "grad_norm": 0.39373913460255155, + "learning_rate": 9.861612373414478e-05, + "loss": 3.1552, + "step": 10797 + }, + { + "epoch": 0.5027352934329679, + "grad_norm": 0.41156732615035635, + "learning_rate": 9.861549078573691e-05, + "loss": 3.0806, + "step": 10798 + }, + { + "epoch": 0.502781851619061, + "grad_norm": 0.37428700899250333, + "learning_rate": 9.861485769464738e-05, + "loss": 3.2688, + "step": 10799 + }, + { + "epoch": 0.502828409805154, + "grad_norm": 0.4153498465026704, + "learning_rate": 9.861422446087804e-05, + "loss": 3.2243, + "step": 10800 + }, + { + "epoch": 0.5028749679912471, + "grad_norm": 0.3497830602579211, + "learning_rate": 9.861359108443074e-05, + "loss": 3.1113, + "step": 10801 + }, + { + "epoch": 0.5029215261773401, + "grad_norm": 0.37850770396626837, + "learning_rate": 9.861295756530733e-05, + "loss": 3.2162, + "step": 10802 + }, + { + "epoch": 0.5029680843634332, + "grad_norm": 0.3318350449106237, + "learning_rate": 9.861232390350967e-05, + "loss": 3.2472, + "step": 10803 + }, + { + "epoch": 0.5030146425495263, + "grad_norm": 0.4015997162754451, + "learning_rate": 9.861169009903965e-05, + "loss": 3.1745, + "step": 10804 + }, + { + "epoch": 0.5030612007356193, + "grad_norm": 0.4116465063192571, + "learning_rate": 9.86110561518991e-05, + "loss": 3.1497, + "step": 10805 + }, + { + "epoch": 0.5031077589217124, + "grad_norm": 0.3699971028639368, + "learning_rate": 9.861042206208987e-05, + "loss": 3.145, + "step": 10806 + }, + { + "epoch": 0.5031543171078054, + "grad_norm": 0.39998341098692813, + "learning_rate": 9.860978782961387e-05, + "loss": 3.1711, 
+ "step": 10807 + }, + { + "epoch": 0.5032008752938986, + "grad_norm": 0.3760799839412387, + "learning_rate": 9.860915345447293e-05, + "loss": 3.237, + "step": 10808 + }, + { + "epoch": 0.5032474334799916, + "grad_norm": 0.335998671112423, + "learning_rate": 9.860851893666889e-05, + "loss": 3.1528, + "step": 10809 + }, + { + "epoch": 0.5032939916660847, + "grad_norm": 0.40562780447721086, + "learning_rate": 9.860788427620364e-05, + "loss": 3.2437, + "step": 10810 + }, + { + "epoch": 0.5033405498521778, + "grad_norm": 0.3354874004468357, + "learning_rate": 9.860724947307905e-05, + "loss": 3.1033, + "step": 10811 + }, + { + "epoch": 0.5033871080382708, + "grad_norm": 0.3456746092354075, + "learning_rate": 9.860661452729698e-05, + "loss": 3.1702, + "step": 10812 + }, + { + "epoch": 0.5034336662243639, + "grad_norm": 0.37589041540126683, + "learning_rate": 9.860597943885928e-05, + "loss": 3.1377, + "step": 10813 + }, + { + "epoch": 0.5034802244104569, + "grad_norm": 0.3486336901452999, + "learning_rate": 9.860534420776779e-05, + "loss": 3.2634, + "step": 10814 + }, + { + "epoch": 0.50352678259655, + "grad_norm": 0.3509924180827087, + "learning_rate": 9.860470883402444e-05, + "loss": 3.1499, + "step": 10815 + }, + { + "epoch": 0.5035733407826432, + "grad_norm": 0.3657349605757805, + "learning_rate": 9.860407331763104e-05, + "loss": 3.1686, + "step": 10816 + }, + { + "epoch": 0.5036198989687362, + "grad_norm": 0.33878998367067925, + "learning_rate": 9.860343765858947e-05, + "loss": 2.9945, + "step": 10817 + }, + { + "epoch": 0.5036664571548293, + "grad_norm": 0.36170374070929134, + "learning_rate": 9.86028018569016e-05, + "loss": 3.2064, + "step": 10818 + }, + { + "epoch": 0.5037130153409223, + "grad_norm": 0.34914259034571404, + "learning_rate": 9.86021659125693e-05, + "loss": 3.2752, + "step": 10819 + }, + { + "epoch": 0.5037595735270154, + "grad_norm": 0.36324972534708416, + "learning_rate": 9.860152982559444e-05, + "loss": 3.2406, + "step": 10820 + }, + { + "epoch": 0.5038061317131085, + "grad_norm": 0.39240277063325485, + "learning_rate": 9.860089359597885e-05, + "loss": 3.0751, + "step": 10821 + }, + { + "epoch": 0.5038526898992015, + "grad_norm": 0.3900940399119962, + "learning_rate": 9.860025722372446e-05, + "loss": 3.1862, + "step": 10822 + }, + { + "epoch": 0.5038992480852946, + "grad_norm": 0.37401229544085707, + "learning_rate": 9.859962070883308e-05, + "loss": 3.0545, + "step": 10823 + }, + { + "epoch": 0.5039458062713876, + "grad_norm": 0.4023117447528997, + "learning_rate": 9.859898405130661e-05, + "loss": 3.2448, + "step": 10824 + }, + { + "epoch": 0.5039923644574807, + "grad_norm": 0.42281815727317157, + "learning_rate": 9.859834725114691e-05, + "loss": 3.185, + "step": 10825 + }, + { + "epoch": 0.5040389226435739, + "grad_norm": 0.37876021405426796, + "learning_rate": 9.859771030835585e-05, + "loss": 3.1907, + "step": 10826 + }, + { + "epoch": 0.5040854808296669, + "grad_norm": 0.3906745857400912, + "learning_rate": 9.85970732229353e-05, + "loss": 3.1588, + "step": 10827 + }, + { + "epoch": 0.50413203901576, + "grad_norm": 0.43154776124859523, + "learning_rate": 9.859643599488711e-05, + "loss": 3.1089, + "step": 10828 + }, + { + "epoch": 0.504178597201853, + "grad_norm": 0.4033133825672514, + "learning_rate": 9.859579862421319e-05, + "loss": 3.209, + "step": 10829 + }, + { + "epoch": 0.5042251553879461, + "grad_norm": 0.4488551693836298, + "learning_rate": 9.859516111091538e-05, + "loss": 3.2347, + "step": 10830 + }, + { + "epoch": 0.5042717135740391, + "grad_norm": 
0.4106600826251309, + "learning_rate": 9.859452345499556e-05, + "loss": 3.1955, + "step": 10831 + }, + { + "epoch": 0.5043182717601322, + "grad_norm": 0.40454746702078614, + "learning_rate": 9.859388565645559e-05, + "loss": 3.1171, + "step": 10832 + }, + { + "epoch": 0.5043648299462253, + "grad_norm": 0.44528001760855357, + "learning_rate": 9.859324771529737e-05, + "loss": 3.2778, + "step": 10833 + }, + { + "epoch": 0.5044113881323183, + "grad_norm": 0.42245317412335154, + "learning_rate": 9.859260963152273e-05, + "loss": 3.1607, + "step": 10834 + }, + { + "epoch": 0.5044579463184115, + "grad_norm": 0.4127396862724406, + "learning_rate": 9.859197140513358e-05, + "loss": 3.1545, + "step": 10835 + }, + { + "epoch": 0.5045045045045045, + "grad_norm": 0.4689889907346345, + "learning_rate": 9.85913330361318e-05, + "loss": 3.2453, + "step": 10836 + }, + { + "epoch": 0.5045510626905976, + "grad_norm": 0.38443299871046055, + "learning_rate": 9.859069452451924e-05, + "loss": 3.189, + "step": 10837 + }, + { + "epoch": 0.5045976208766907, + "grad_norm": 0.3883599131727485, + "learning_rate": 9.859005587029776e-05, + "loss": 3.1653, + "step": 10838 + }, + { + "epoch": 0.5046441790627837, + "grad_norm": 0.3674045296107221, + "learning_rate": 9.858941707346925e-05, + "loss": 3.1888, + "step": 10839 + }, + { + "epoch": 0.5046907372488768, + "grad_norm": 0.3934088443268788, + "learning_rate": 9.858877813403561e-05, + "loss": 3.2678, + "step": 10840 + }, + { + "epoch": 0.5047372954349698, + "grad_norm": 0.3964877206968598, + "learning_rate": 9.858813905199869e-05, + "loss": 3.2076, + "step": 10841 + }, + { + "epoch": 0.5047838536210629, + "grad_norm": 0.39925653540837164, + "learning_rate": 9.858749982736035e-05, + "loss": 3.1396, + "step": 10842 + }, + { + "epoch": 0.504830411807156, + "grad_norm": 0.4293761935009559, + "learning_rate": 9.85868604601225e-05, + "loss": 3.1719, + "step": 10843 + }, + { + "epoch": 0.504876969993249, + "grad_norm": 0.36758261858860153, + "learning_rate": 9.8586220950287e-05, + "loss": 3.1571, + "step": 10844 + }, + { + "epoch": 0.5049235281793422, + "grad_norm": 0.37774125720298546, + "learning_rate": 9.858558129785572e-05, + "loss": 3.2287, + "step": 10845 + }, + { + "epoch": 0.5049700863654352, + "grad_norm": 0.43076505002644877, + "learning_rate": 9.858494150283055e-05, + "loss": 3.2261, + "step": 10846 + }, + { + "epoch": 0.5050166445515283, + "grad_norm": 0.3858577142821729, + "learning_rate": 9.858430156521337e-05, + "loss": 3.1753, + "step": 10847 + }, + { + "epoch": 0.5050632027376214, + "grad_norm": 0.4142608349646766, + "learning_rate": 9.858366148500604e-05, + "loss": 3.2556, + "step": 10848 + }, + { + "epoch": 0.5051097609237144, + "grad_norm": 0.42128848520862455, + "learning_rate": 9.858302126221046e-05, + "loss": 3.2118, + "step": 10849 + }, + { + "epoch": 0.5051563191098075, + "grad_norm": 0.3743450047777036, + "learning_rate": 9.858238089682848e-05, + "loss": 3.1541, + "step": 10850 + }, + { + "epoch": 0.5052028772959005, + "grad_norm": 0.3833440640713583, + "learning_rate": 9.858174038886202e-05, + "loss": 3.1694, + "step": 10851 + }, + { + "epoch": 0.5052494354819936, + "grad_norm": 0.3885801472553849, + "learning_rate": 9.858109973831293e-05, + "loss": 3.2373, + "step": 10852 + }, + { + "epoch": 0.5052959936680866, + "grad_norm": 0.356794025514296, + "learning_rate": 9.85804589451831e-05, + "loss": 3.136, + "step": 10853 + }, + { + "epoch": 0.5053425518541798, + "grad_norm": 0.3661406205943272, + "learning_rate": 9.85798180094744e-05, + "loss": 3.2407, + 
"step": 10854 + }, + { + "epoch": 0.5053891100402729, + "grad_norm": 0.3538384507070992, + "learning_rate": 9.857917693118873e-05, + "loss": 3.2015, + "step": 10855 + }, + { + "epoch": 0.5054356682263659, + "grad_norm": 0.37108456705159754, + "learning_rate": 9.857853571032796e-05, + "loss": 3.2117, + "step": 10856 + }, + { + "epoch": 0.505482226412459, + "grad_norm": 0.42787224756288433, + "learning_rate": 9.857789434689398e-05, + "loss": 3.2496, + "step": 10857 + }, + { + "epoch": 0.505528784598552, + "grad_norm": 0.4382697476771295, + "learning_rate": 9.857725284088866e-05, + "loss": 3.2646, + "step": 10858 + }, + { + "epoch": 0.5055753427846451, + "grad_norm": 0.3673620784939825, + "learning_rate": 9.857661119231387e-05, + "loss": 3.1596, + "step": 10859 + }, + { + "epoch": 0.5056219009707382, + "grad_norm": 0.41612062255049187, + "learning_rate": 9.857596940117153e-05, + "loss": 3.2686, + "step": 10860 + }, + { + "epoch": 0.5056684591568312, + "grad_norm": 0.38693625843965224, + "learning_rate": 9.85753274674635e-05, + "loss": 3.2233, + "step": 10861 + }, + { + "epoch": 0.5057150173429243, + "grad_norm": 0.39930043617292454, + "learning_rate": 9.857468539119167e-05, + "loss": 3.2375, + "step": 10862 + }, + { + "epoch": 0.5057615755290173, + "grad_norm": 0.3808634229476399, + "learning_rate": 9.857404317235795e-05, + "loss": 3.1505, + "step": 10863 + }, + { + "epoch": 0.5058081337151105, + "grad_norm": 0.4140708385790141, + "learning_rate": 9.857340081096415e-05, + "loss": 3.2433, + "step": 10864 + }, + { + "epoch": 0.5058546919012036, + "grad_norm": 0.3841184586778099, + "learning_rate": 9.857275830701225e-05, + "loss": 3.1631, + "step": 10865 + }, + { + "epoch": 0.5059012500872966, + "grad_norm": 0.41883523247314014, + "learning_rate": 9.857211566050406e-05, + "loss": 3.2178, + "step": 10866 + }, + { + "epoch": 0.5059478082733897, + "grad_norm": 0.4053402221351868, + "learning_rate": 9.857147287144151e-05, + "loss": 3.2455, + "step": 10867 + }, + { + "epoch": 0.5059943664594827, + "grad_norm": 0.4313558958020869, + "learning_rate": 9.857082993982646e-05, + "loss": 3.0833, + "step": 10868 + }, + { + "epoch": 0.5060409246455758, + "grad_norm": 0.4474663983091114, + "learning_rate": 9.857018686566082e-05, + "loss": 3.1569, + "step": 10869 + }, + { + "epoch": 0.5060874828316689, + "grad_norm": 0.43146029941038916, + "learning_rate": 9.856954364894646e-05, + "loss": 3.1679, + "step": 10870 + }, + { + "epoch": 0.5061340410177619, + "grad_norm": 0.43324350921504284, + "learning_rate": 9.856890028968527e-05, + "loss": 3.293, + "step": 10871 + }, + { + "epoch": 0.506180599203855, + "grad_norm": 0.4027344928536314, + "learning_rate": 9.856825678787915e-05, + "loss": 3.1373, + "step": 10872 + }, + { + "epoch": 0.5062271573899481, + "grad_norm": 0.4352582828144199, + "learning_rate": 9.856761314352997e-05, + "loss": 3.149, + "step": 10873 + }, + { + "epoch": 0.5062737155760412, + "grad_norm": 0.40866808151594514, + "learning_rate": 9.856696935663965e-05, + "loss": 3.0565, + "step": 10874 + }, + { + "epoch": 0.5063202737621342, + "grad_norm": 0.38647801216485556, + "learning_rate": 9.856632542721005e-05, + "loss": 3.2064, + "step": 10875 + }, + { + "epoch": 0.5063668319482273, + "grad_norm": 0.4567056225933155, + "learning_rate": 9.856568135524306e-05, + "loss": 3.2396, + "step": 10876 + }, + { + "epoch": 0.5064133901343204, + "grad_norm": 0.4180118555056864, + "learning_rate": 9.85650371407406e-05, + "loss": 3.2076, + "step": 10877 + }, + { + "epoch": 0.5064599483204134, + "grad_norm": 
0.37663410304610206, + "learning_rate": 9.856439278370453e-05, + "loss": 3.2266, + "step": 10878 + }, + { + "epoch": 0.5065065065065065, + "grad_norm": 0.36512578897082315, + "learning_rate": 9.856374828413675e-05, + "loss": 3.1191, + "step": 10879 + }, + { + "epoch": 0.5065530646925995, + "grad_norm": 0.4111497674445326, + "learning_rate": 9.856310364203916e-05, + "loss": 3.2275, + "step": 10880 + }, + { + "epoch": 0.5065996228786926, + "grad_norm": 0.38083024249363634, + "learning_rate": 9.856245885741364e-05, + "loss": 3.0859, + "step": 10881 + }, + { + "epoch": 0.5066461810647858, + "grad_norm": 0.4257332923958318, + "learning_rate": 9.856181393026208e-05, + "loss": 3.1828, + "step": 10882 + }, + { + "epoch": 0.5066927392508788, + "grad_norm": 0.38424031705524025, + "learning_rate": 9.85611688605864e-05, + "loss": 3.1425, + "step": 10883 + }, + { + "epoch": 0.5067392974369719, + "grad_norm": 0.39685238380178967, + "learning_rate": 9.856052364838846e-05, + "loss": 3.1869, + "step": 10884 + }, + { + "epoch": 0.5067858556230649, + "grad_norm": 0.4136220975757592, + "learning_rate": 9.855987829367016e-05, + "loss": 3.1817, + "step": 10885 + }, + { + "epoch": 0.506832413809158, + "grad_norm": 0.38950107949875656, + "learning_rate": 9.85592327964334e-05, + "loss": 3.1676, + "step": 10886 + }, + { + "epoch": 0.5068789719952511, + "grad_norm": 0.35877492217583, + "learning_rate": 9.855858715668009e-05, + "loss": 3.0604, + "step": 10887 + }, + { + "epoch": 0.5069255301813441, + "grad_norm": 0.41975409084299764, + "learning_rate": 9.85579413744121e-05, + "loss": 3.2206, + "step": 10888 + }, + { + "epoch": 0.5069720883674372, + "grad_norm": 0.3710252121554976, + "learning_rate": 9.855729544963135e-05, + "loss": 3.2331, + "step": 10889 + }, + { + "epoch": 0.5070186465535302, + "grad_norm": 0.3869937501823856, + "learning_rate": 9.85566493823397e-05, + "loss": 3.2327, + "step": 10890 + }, + { + "epoch": 0.5070652047396234, + "grad_norm": 0.37303450312146047, + "learning_rate": 9.855600317253909e-05, + "loss": 3.0661, + "step": 10891 + }, + { + "epoch": 0.5071117629257165, + "grad_norm": 0.3923919882270158, + "learning_rate": 9.855535682023138e-05, + "loss": 3.2239, + "step": 10892 + }, + { + "epoch": 0.5071583211118095, + "grad_norm": 0.41953642505510447, + "learning_rate": 9.855471032541849e-05, + "loss": 3.2234, + "step": 10893 + }, + { + "epoch": 0.5072048792979026, + "grad_norm": 0.36839818809768454, + "learning_rate": 9.85540636881023e-05, + "loss": 3.1096, + "step": 10894 + }, + { + "epoch": 0.5072514374839956, + "grad_norm": 0.3748072755078017, + "learning_rate": 9.855341690828473e-05, + "loss": 3.2752, + "step": 10895 + }, + { + "epoch": 0.5072979956700887, + "grad_norm": 0.4244364914477781, + "learning_rate": 9.855276998596765e-05, + "loss": 3.1209, + "step": 10896 + }, + { + "epoch": 0.5073445538561817, + "grad_norm": 0.3676763989118235, + "learning_rate": 9.855212292115297e-05, + "loss": 3.2139, + "step": 10897 + }, + { + "epoch": 0.5073911120422748, + "grad_norm": 0.34271790905626043, + "learning_rate": 9.85514757138426e-05, + "loss": 3.1079, + "step": 10898 + }, + { + "epoch": 0.5074376702283679, + "grad_norm": 0.39630927306504066, + "learning_rate": 9.855082836403844e-05, + "loss": 3.1791, + "step": 10899 + }, + { + "epoch": 0.507484228414461, + "grad_norm": 0.38999606584809576, + "learning_rate": 9.855018087174237e-05, + "loss": 3.1137, + "step": 10900 + }, + { + "epoch": 0.5075307866005541, + "grad_norm": 0.3783599971586428, + "learning_rate": 9.854953323695631e-05, + "loss": 
3.0578, + "step": 10901 + }, + { + "epoch": 0.5075773447866471, + "grad_norm": 0.3703422290015854, + "learning_rate": 9.854888545968214e-05, + "loss": 3.177, + "step": 10902 + }, + { + "epoch": 0.5076239029727402, + "grad_norm": 0.38730863728949083, + "learning_rate": 9.854823753992179e-05, + "loss": 3.1203, + "step": 10903 + }, + { + "epoch": 0.5076704611588333, + "grad_norm": 0.3673052434648144, + "learning_rate": 9.854758947767715e-05, + "loss": 3.3126, + "step": 10904 + }, + { + "epoch": 0.5077170193449263, + "grad_norm": 0.3737922726803694, + "learning_rate": 9.85469412729501e-05, + "loss": 3.1962, + "step": 10905 + }, + { + "epoch": 0.5077635775310194, + "grad_norm": 0.39040499237050214, + "learning_rate": 9.854629292574257e-05, + "loss": 3.2041, + "step": 10906 + }, + { + "epoch": 0.5078101357171124, + "grad_norm": 0.31885470816295214, + "learning_rate": 9.854564443605645e-05, + "loss": 3.2232, + "step": 10907 + }, + { + "epoch": 0.5078566939032055, + "grad_norm": 0.3784799613282434, + "learning_rate": 9.854499580389365e-05, + "loss": 3.0985, + "step": 10908 + }, + { + "epoch": 0.5079032520892987, + "grad_norm": 0.37794609280914343, + "learning_rate": 9.854434702925605e-05, + "loss": 3.0413, + "step": 10909 + }, + { + "epoch": 0.5079498102753917, + "grad_norm": 0.38789964767085866, + "learning_rate": 9.854369811214559e-05, + "loss": 3.1883, + "step": 10910 + }, + { + "epoch": 0.5079963684614848, + "grad_norm": 0.5115325617951939, + "learning_rate": 9.854304905256416e-05, + "loss": 3.2617, + "step": 10911 + }, + { + "epoch": 0.5080429266475778, + "grad_norm": 0.45463816071374413, + "learning_rate": 9.854239985051367e-05, + "loss": 3.1828, + "step": 10912 + }, + { + "epoch": 0.5080894848336709, + "grad_norm": 0.3781767302812372, + "learning_rate": 9.854175050599602e-05, + "loss": 3.1146, + "step": 10913 + }, + { + "epoch": 0.508136043019764, + "grad_norm": 0.4750934021708985, + "learning_rate": 9.854110101901308e-05, + "loss": 3.1888, + "step": 10914 + }, + { + "epoch": 0.508182601205857, + "grad_norm": 0.4569467954833721, + "learning_rate": 9.854045138956683e-05, + "loss": 3.2249, + "step": 10915 + }, + { + "epoch": 0.5082291593919501, + "grad_norm": 0.3723284063233659, + "learning_rate": 9.853980161765912e-05, + "loss": 3.1953, + "step": 10916 + }, + { + "epoch": 0.5082757175780431, + "grad_norm": 0.4759521994751182, + "learning_rate": 9.853915170329187e-05, + "loss": 3.2613, + "step": 10917 + }, + { + "epoch": 0.5083222757641362, + "grad_norm": 0.3976189157846709, + "learning_rate": 9.853850164646701e-05, + "loss": 3.1706, + "step": 10918 + }, + { + "epoch": 0.5083688339502292, + "grad_norm": 0.4374043930136713, + "learning_rate": 9.853785144718642e-05, + "loss": 3.1604, + "step": 10919 + }, + { + "epoch": 0.5084153921363224, + "grad_norm": 0.4926905782207435, + "learning_rate": 9.8537201105452e-05, + "loss": 3.2236, + "step": 10920 + }, + { + "epoch": 0.5084619503224155, + "grad_norm": 0.4760499392519015, + "learning_rate": 9.85365506212657e-05, + "loss": 3.1577, + "step": 10921 + }, + { + "epoch": 0.5085085085085085, + "grad_norm": 0.4067477844882955, + "learning_rate": 9.85358999946294e-05, + "loss": 3.2001, + "step": 10922 + }, + { + "epoch": 0.5085550666946016, + "grad_norm": 0.4609491701227913, + "learning_rate": 9.8535249225545e-05, + "loss": 3.1071, + "step": 10923 + }, + { + "epoch": 0.5086016248806946, + "grad_norm": 0.38942780593993215, + "learning_rate": 9.853459831401444e-05, + "loss": 3.1458, + "step": 10924 + }, + { + "epoch": 0.5086481830667877, + "grad_norm": 
0.3534756302780555, + "learning_rate": 9.853394726003961e-05, + "loss": 3.2237, + "step": 10925 + }, + { + "epoch": 0.5086947412528808, + "grad_norm": 0.39155701622639166, + "learning_rate": 9.853329606362243e-05, + "loss": 3.234, + "step": 10926 + }, + { + "epoch": 0.5087412994389738, + "grad_norm": 0.35790477535677934, + "learning_rate": 9.853264472476481e-05, + "loss": 3.1254, + "step": 10927 + }, + { + "epoch": 0.508787857625067, + "grad_norm": 0.3737847408956126, + "learning_rate": 9.853199324346866e-05, + "loss": 3.156, + "step": 10928 + }, + { + "epoch": 0.50883441581116, + "grad_norm": 0.3912711438070563, + "learning_rate": 9.853134161973587e-05, + "loss": 3.092, + "step": 10929 + }, + { + "epoch": 0.5088809739972531, + "grad_norm": 0.34044774125276017, + "learning_rate": 9.85306898535684e-05, + "loss": 3.1515, + "step": 10930 + }, + { + "epoch": 0.5089275321833462, + "grad_norm": 0.37783905222821107, + "learning_rate": 9.853003794496814e-05, + "loss": 3.1903, + "step": 10931 + }, + { + "epoch": 0.5089740903694392, + "grad_norm": 0.3731648540528701, + "learning_rate": 9.852938589393697e-05, + "loss": 3.2079, + "step": 10932 + }, + { + "epoch": 0.5090206485555323, + "grad_norm": 0.3590629545492049, + "learning_rate": 9.852873370047685e-05, + "loss": 3.1394, + "step": 10933 + }, + { + "epoch": 0.5090672067416253, + "grad_norm": 0.4398183452552804, + "learning_rate": 9.852808136458968e-05, + "loss": 3.2659, + "step": 10934 + }, + { + "epoch": 0.5091137649277184, + "grad_norm": 0.3826325432671383, + "learning_rate": 9.852742888627738e-05, + "loss": 3.1234, + "step": 10935 + }, + { + "epoch": 0.5091603231138114, + "grad_norm": 0.3435243334050885, + "learning_rate": 9.852677626554183e-05, + "loss": 3.2872, + "step": 10936 + }, + { + "epoch": 0.5092068812999045, + "grad_norm": 0.4183786175679308, + "learning_rate": 9.8526123502385e-05, + "loss": 3.1653, + "step": 10937 + }, + { + "epoch": 0.5092534394859977, + "grad_norm": 0.39616615193714444, + "learning_rate": 9.852547059680875e-05, + "loss": 3.1938, + "step": 10938 + }, + { + "epoch": 0.5092999976720907, + "grad_norm": 0.3904930791264087, + "learning_rate": 9.852481754881504e-05, + "loss": 3.1975, + "step": 10939 + }, + { + "epoch": 0.5093465558581838, + "grad_norm": 0.43075372818327784, + "learning_rate": 9.852416435840579e-05, + "loss": 3.1933, + "step": 10940 + }, + { + "epoch": 0.5093931140442768, + "grad_norm": 0.40364954720189433, + "learning_rate": 9.852351102558287e-05, + "loss": 3.1527, + "step": 10941 + }, + { + "epoch": 0.5094396722303699, + "grad_norm": 0.3830385879917592, + "learning_rate": 9.852285755034824e-05, + "loss": 3.2165, + "step": 10942 + }, + { + "epoch": 0.509486230416463, + "grad_norm": 0.4600997329659304, + "learning_rate": 9.852220393270379e-05, + "loss": 3.147, + "step": 10943 + }, + { + "epoch": 0.509532788602556, + "grad_norm": 0.4168392797538461, + "learning_rate": 9.852155017265146e-05, + "loss": 3.1127, + "step": 10944 + }, + { + "epoch": 0.5095793467886491, + "grad_norm": 0.41780534625489424, + "learning_rate": 9.852089627019317e-05, + "loss": 3.1835, + "step": 10945 + }, + { + "epoch": 0.5096259049747421, + "grad_norm": 0.37874688113344196, + "learning_rate": 9.85202422253308e-05, + "loss": 3.3234, + "step": 10946 + }, + { + "epoch": 0.5096724631608353, + "grad_norm": 0.4113520255557655, + "learning_rate": 9.851958803806633e-05, + "loss": 3.2039, + "step": 10947 + }, + { + "epoch": 0.5097190213469284, + "grad_norm": 0.423354679481619, + "learning_rate": 9.851893370840163e-05, + "loss": 3.1667, + 
"step": 10948 + }, + { + "epoch": 0.5097655795330214, + "grad_norm": 0.42738485602879467, + "learning_rate": 9.851827923633865e-05, + "loss": 3.1538, + "step": 10949 + }, + { + "epoch": 0.5098121377191145, + "grad_norm": 0.37655441013204405, + "learning_rate": 9.85176246218793e-05, + "loss": 3.0658, + "step": 10950 + }, + { + "epoch": 0.5098586959052075, + "grad_norm": 0.41763236535112447, + "learning_rate": 9.851696986502549e-05, + "loss": 3.221, + "step": 10951 + }, + { + "epoch": 0.5099052540913006, + "grad_norm": 0.4052968432514637, + "learning_rate": 9.851631496577918e-05, + "loss": 3.2454, + "step": 10952 + }, + { + "epoch": 0.5099518122773937, + "grad_norm": 0.43674536089204413, + "learning_rate": 9.851565992414224e-05, + "loss": 3.2404, + "step": 10953 + }, + { + "epoch": 0.5099983704634867, + "grad_norm": 0.3810617865880451, + "learning_rate": 9.851500474011664e-05, + "loss": 3.1601, + "step": 10954 + }, + { + "epoch": 0.5100449286495798, + "grad_norm": 0.3988096694623183, + "learning_rate": 9.851434941370425e-05, + "loss": 3.1637, + "step": 10955 + }, + { + "epoch": 0.5100914868356728, + "grad_norm": 0.3847723755293097, + "learning_rate": 9.851369394490705e-05, + "loss": 3.1977, + "step": 10956 + }, + { + "epoch": 0.510138045021766, + "grad_norm": 0.3864496091090013, + "learning_rate": 9.851303833372692e-05, + "loss": 3.1483, + "step": 10957 + }, + { + "epoch": 0.510184603207859, + "grad_norm": 0.3968156799185929, + "learning_rate": 9.851238258016583e-05, + "loss": 3.1869, + "step": 10958 + }, + { + "epoch": 0.5102311613939521, + "grad_norm": 0.3607014947475352, + "learning_rate": 9.851172668422563e-05, + "loss": 3.1389, + "step": 10959 + }, + { + "epoch": 0.5102777195800452, + "grad_norm": 0.39386656981072343, + "learning_rate": 9.851107064590832e-05, + "loss": 3.0977, + "step": 10960 + }, + { + "epoch": 0.5103242777661382, + "grad_norm": 0.3899780298262399, + "learning_rate": 9.85104144652158e-05, + "loss": 3.1451, + "step": 10961 + }, + { + "epoch": 0.5103708359522313, + "grad_norm": 0.391250694318516, + "learning_rate": 9.850975814214999e-05, + "loss": 3.2037, + "step": 10962 + }, + { + "epoch": 0.5104173941383243, + "grad_norm": 0.44489635920891896, + "learning_rate": 9.850910167671281e-05, + "loss": 3.1962, + "step": 10963 + }, + { + "epoch": 0.5104639523244174, + "grad_norm": 0.43302164608615273, + "learning_rate": 9.85084450689062e-05, + "loss": 3.2708, + "step": 10964 + }, + { + "epoch": 0.5105105105105106, + "grad_norm": 0.4061545982396699, + "learning_rate": 9.850778831873208e-05, + "loss": 3.209, + "step": 10965 + }, + { + "epoch": 0.5105570686966036, + "grad_norm": 0.36358022280915586, + "learning_rate": 9.85071314261924e-05, + "loss": 3.2769, + "step": 10966 + }, + { + "epoch": 0.5106036268826967, + "grad_norm": 0.4021246928136716, + "learning_rate": 9.850647439128904e-05, + "loss": 3.1495, + "step": 10967 + }, + { + "epoch": 0.5106501850687897, + "grad_norm": 0.4020835284073294, + "learning_rate": 9.850581721402397e-05, + "loss": 3.2544, + "step": 10968 + }, + { + "epoch": 0.5106967432548828, + "grad_norm": 0.4295607774940075, + "learning_rate": 9.85051598943991e-05, + "loss": 3.2469, + "step": 10969 + }, + { + "epoch": 0.5107433014409759, + "grad_norm": 0.36006744831714826, + "learning_rate": 9.850450243241638e-05, + "loss": 3.0812, + "step": 10970 + }, + { + "epoch": 0.5107898596270689, + "grad_norm": 0.4204413261134262, + "learning_rate": 9.850384482807771e-05, + "loss": 3.3254, + "step": 10971 + }, + { + "epoch": 0.510836417813162, + "grad_norm": 
0.4902390896716007, + "learning_rate": 9.850318708138503e-05, + "loss": 3.161, + "step": 10972 + }, + { + "epoch": 0.510882975999255, + "grad_norm": 0.393226934518439, + "learning_rate": 9.850252919234028e-05, + "loss": 3.1173, + "step": 10973 + }, + { + "epoch": 0.5109295341853481, + "grad_norm": 0.4004272967095715, + "learning_rate": 9.850187116094538e-05, + "loss": 3.0782, + "step": 10974 + }, + { + "epoch": 0.5109760923714413, + "grad_norm": 0.45328431165241057, + "learning_rate": 9.850121298720227e-05, + "loss": 3.2069, + "step": 10975 + }, + { + "epoch": 0.5110226505575343, + "grad_norm": 0.4071271566621807, + "learning_rate": 9.850055467111288e-05, + "loss": 3.2508, + "step": 10976 + }, + { + "epoch": 0.5110692087436274, + "grad_norm": 0.45320697260731596, + "learning_rate": 9.849989621267915e-05, + "loss": 3.1637, + "step": 10977 + }, + { + "epoch": 0.5111157669297204, + "grad_norm": 0.4263503203563298, + "learning_rate": 9.849923761190298e-05, + "loss": 3.1318, + "step": 10978 + }, + { + "epoch": 0.5111623251158135, + "grad_norm": 0.4053109800519432, + "learning_rate": 9.849857886878634e-05, + "loss": 3.2239, + "step": 10979 + }, + { + "epoch": 0.5112088833019065, + "grad_norm": 0.4649155860256622, + "learning_rate": 9.849791998333115e-05, + "loss": 3.2339, + "step": 10980 + }, + { + "epoch": 0.5112554414879996, + "grad_norm": 0.39245817408479333, + "learning_rate": 9.849726095553934e-05, + "loss": 3.0824, + "step": 10981 + }, + { + "epoch": 0.5113019996740927, + "grad_norm": 0.3593386044433066, + "learning_rate": 9.849660178541284e-05, + "loss": 3.1258, + "step": 10982 + }, + { + "epoch": 0.5113485578601857, + "grad_norm": 0.3947969430693574, + "learning_rate": 9.84959424729536e-05, + "loss": 3.1281, + "step": 10983 + }, + { + "epoch": 0.5113951160462789, + "grad_norm": 0.3542649354217607, + "learning_rate": 9.849528301816354e-05, + "loss": 3.2984, + "step": 10984 + }, + { + "epoch": 0.5114416742323719, + "grad_norm": 0.37755237248057477, + "learning_rate": 9.84946234210446e-05, + "loss": 3.0996, + "step": 10985 + }, + { + "epoch": 0.511488232418465, + "grad_norm": 0.4190058638470797, + "learning_rate": 9.849396368159873e-05, + "loss": 3.0829, + "step": 10986 + }, + { + "epoch": 0.5115347906045581, + "grad_norm": 0.3992463976851071, + "learning_rate": 9.849330379982786e-05, + "loss": 3.1846, + "step": 10987 + }, + { + "epoch": 0.5115813487906511, + "grad_norm": 0.3642843792462895, + "learning_rate": 9.849264377573391e-05, + "loss": 3.0746, + "step": 10988 + }, + { + "epoch": 0.5116279069767442, + "grad_norm": 0.46001062154991956, + "learning_rate": 9.849198360931882e-05, + "loss": 3.2387, + "step": 10989 + }, + { + "epoch": 0.5116744651628372, + "grad_norm": 0.4700610231066916, + "learning_rate": 9.849132330058454e-05, + "loss": 3.2339, + "step": 10990 + }, + { + "epoch": 0.5117210233489303, + "grad_norm": 0.3952112724553324, + "learning_rate": 9.8490662849533e-05, + "loss": 3.1918, + "step": 10991 + }, + { + "epoch": 0.5117675815350234, + "grad_norm": 0.38899544321001195, + "learning_rate": 9.849000225616615e-05, + "loss": 3.151, + "step": 10992 + }, + { + "epoch": 0.5118141397211164, + "grad_norm": 0.3645263510582939, + "learning_rate": 9.848934152048592e-05, + "loss": 3.255, + "step": 10993 + }, + { + "epoch": 0.5118606979072096, + "grad_norm": 0.39649838151456746, + "learning_rate": 9.848868064249424e-05, + "loss": 3.209, + "step": 10994 + }, + { + "epoch": 0.5119072560933026, + "grad_norm": 0.36013550783821957, + "learning_rate": 9.848801962219307e-05, + "loss": 3.1846, + 
"step": 10995 + }, + { + "epoch": 0.5119538142793957, + "grad_norm": 0.37322420080163743, + "learning_rate": 9.848735845958433e-05, + "loss": 3.1911, + "step": 10996 + }, + { + "epoch": 0.5120003724654888, + "grad_norm": 0.350799827979403, + "learning_rate": 9.848669715466998e-05, + "loss": 3.1328, + "step": 10997 + }, + { + "epoch": 0.5120469306515818, + "grad_norm": 0.35414348816419416, + "learning_rate": 9.848603570745193e-05, + "loss": 3.2432, + "step": 10998 + }, + { + "epoch": 0.5120934888376749, + "grad_norm": 0.41432135740920156, + "learning_rate": 9.848537411793217e-05, + "loss": 3.1996, + "step": 10999 + }, + { + "epoch": 0.5121400470237679, + "grad_norm": 0.4192565195267719, + "learning_rate": 9.84847123861126e-05, + "loss": 3.2294, + "step": 11000 + }, + { + "epoch": 0.512186605209861, + "grad_norm": 0.36664343286309475, + "learning_rate": 9.848405051199518e-05, + "loss": 3.1866, + "step": 11001 + }, + { + "epoch": 0.512233163395954, + "grad_norm": 0.41104181713241894, + "learning_rate": 9.848338849558184e-05, + "loss": 3.2076, + "step": 11002 + }, + { + "epoch": 0.5122797215820472, + "grad_norm": 0.41246546963510444, + "learning_rate": 9.848272633687454e-05, + "loss": 3.1983, + "step": 11003 + }, + { + "epoch": 0.5123262797681403, + "grad_norm": 0.36859513898150875, + "learning_rate": 9.84820640358752e-05, + "loss": 3.1295, + "step": 11004 + }, + { + "epoch": 0.5123728379542333, + "grad_norm": 0.4132365249095376, + "learning_rate": 9.84814015925858e-05, + "loss": 3.2551, + "step": 11005 + }, + { + "epoch": 0.5124193961403264, + "grad_norm": 0.3686412019004929, + "learning_rate": 9.848073900700825e-05, + "loss": 3.0927, + "step": 11006 + }, + { + "epoch": 0.5124659543264194, + "grad_norm": 0.4363626332308898, + "learning_rate": 9.84800762791445e-05, + "loss": 3.1822, + "step": 11007 + }, + { + "epoch": 0.5125125125125125, + "grad_norm": 0.3907073563100011, + "learning_rate": 9.84794134089965e-05, + "loss": 3.0133, + "step": 11008 + }, + { + "epoch": 0.5125590706986056, + "grad_norm": 0.36214125697550326, + "learning_rate": 9.847875039656622e-05, + "loss": 3.1013, + "step": 11009 + }, + { + "epoch": 0.5126056288846986, + "grad_norm": 0.4445257419368235, + "learning_rate": 9.847808724185557e-05, + "loss": 3.2469, + "step": 11010 + }, + { + "epoch": 0.5126521870707917, + "grad_norm": 0.3901464361299814, + "learning_rate": 9.847742394486652e-05, + "loss": 3.1127, + "step": 11011 + }, + { + "epoch": 0.5126987452568847, + "grad_norm": 0.4128457954960126, + "learning_rate": 9.8476760505601e-05, + "loss": 3.0425, + "step": 11012 + }, + { + "epoch": 0.5127453034429779, + "grad_norm": 0.3925971320061664, + "learning_rate": 9.847609692406096e-05, + "loss": 3.1778, + "step": 11013 + }, + { + "epoch": 0.512791861629071, + "grad_norm": 0.40157579300457047, + "learning_rate": 9.847543320024835e-05, + "loss": 3.2152, + "step": 11014 + }, + { + "epoch": 0.512838419815164, + "grad_norm": 0.42753681025522344, + "learning_rate": 9.847476933416513e-05, + "loss": 3.2292, + "step": 11015 + }, + { + "epoch": 0.5128849780012571, + "grad_norm": 0.45896555144798507, + "learning_rate": 9.847410532581322e-05, + "loss": 3.2723, + "step": 11016 + }, + { + "epoch": 0.5129315361873501, + "grad_norm": 0.3702476404368082, + "learning_rate": 9.84734411751946e-05, + "loss": 3.2182, + "step": 11017 + }, + { + "epoch": 0.5129780943734432, + "grad_norm": 0.4018792370257177, + "learning_rate": 9.84727768823112e-05, + "loss": 3.2298, + "step": 11018 + }, + { + "epoch": 0.5130246525595363, + "grad_norm": 
0.40969139591391945, + "learning_rate": 9.847211244716498e-05, + "loss": 3.2582, + "step": 11019 + }, + { + "epoch": 0.5130712107456293, + "grad_norm": 0.3941017017220858, + "learning_rate": 9.847144786975788e-05, + "loss": 3.1798, + "step": 11020 + }, + { + "epoch": 0.5131177689317225, + "grad_norm": 0.4012411598626069, + "learning_rate": 9.847078315009187e-05, + "loss": 3.0891, + "step": 11021 + }, + { + "epoch": 0.5131643271178155, + "grad_norm": 0.4275983597542102, + "learning_rate": 9.847011828816886e-05, + "loss": 3.2135, + "step": 11022 + }, + { + "epoch": 0.5132108853039086, + "grad_norm": 0.3770356023680822, + "learning_rate": 9.846945328399084e-05, + "loss": 3.1933, + "step": 11023 + }, + { + "epoch": 0.5132574434900016, + "grad_norm": 0.40798122368673806, + "learning_rate": 9.846878813755975e-05, + "loss": 3.2117, + "step": 11024 + }, + { + "epoch": 0.5133040016760947, + "grad_norm": 0.40073944275414475, + "learning_rate": 9.846812284887755e-05, + "loss": 3.1641, + "step": 11025 + }, + { + "epoch": 0.5133505598621878, + "grad_norm": 0.39139747088232785, + "learning_rate": 9.846745741794619e-05, + "loss": 3.1588, + "step": 11026 + }, + { + "epoch": 0.5133971180482808, + "grad_norm": 0.38403199823588435, + "learning_rate": 9.846679184476759e-05, + "loss": 3.2178, + "step": 11027 + }, + { + "epoch": 0.5134436762343739, + "grad_norm": 0.3574995929051506, + "learning_rate": 9.846612612934376e-05, + "loss": 3.0207, + "step": 11028 + }, + { + "epoch": 0.5134902344204669, + "grad_norm": 0.3339282530427272, + "learning_rate": 9.846546027167661e-05, + "loss": 3.1275, + "step": 11029 + }, + { + "epoch": 0.51353679260656, + "grad_norm": 0.40486870461116947, + "learning_rate": 9.846479427176812e-05, + "loss": 3.1839, + "step": 11030 + }, + { + "epoch": 0.5135833507926532, + "grad_norm": 0.37055565321743467, + "learning_rate": 9.84641281296202e-05, + "loss": 3.1746, + "step": 11031 + }, + { + "epoch": 0.5136299089787462, + "grad_norm": 0.38553358977489843, + "learning_rate": 9.846346184523488e-05, + "loss": 3.1279, + "step": 11032 + }, + { + "epoch": 0.5136764671648393, + "grad_norm": 0.371076580659639, + "learning_rate": 9.846279541861405e-05, + "loss": 3.1892, + "step": 11033 + }, + { + "epoch": 0.5137230253509323, + "grad_norm": 0.3802421039193521, + "learning_rate": 9.84621288497597e-05, + "loss": 3.1834, + "step": 11034 + }, + { + "epoch": 0.5137695835370254, + "grad_norm": 0.34289455890243487, + "learning_rate": 9.846146213867376e-05, + "loss": 3.162, + "step": 11035 + }, + { + "epoch": 0.5138161417231185, + "grad_norm": 0.39509723028949484, + "learning_rate": 9.846079528535821e-05, + "loss": 3.2249, + "step": 11036 + }, + { + "epoch": 0.5138626999092115, + "grad_norm": 0.4017742108940531, + "learning_rate": 9.846012828981501e-05, + "loss": 3.1763, + "step": 11037 + }, + { + "epoch": 0.5139092580953046, + "grad_norm": 0.46636658040585693, + "learning_rate": 9.84594611520461e-05, + "loss": 3.2866, + "step": 11038 + }, + { + "epoch": 0.5139558162813976, + "grad_norm": 0.42102351305173474, + "learning_rate": 9.845879387205346e-05, + "loss": 3.2054, + "step": 11039 + }, + { + "epoch": 0.5140023744674908, + "grad_norm": 0.39690610775789215, + "learning_rate": 9.8458126449839e-05, + "loss": 3.1676, + "step": 11040 + }, + { + "epoch": 0.5140489326535839, + "grad_norm": 0.4336485423922037, + "learning_rate": 9.845745888540474e-05, + "loss": 3.1683, + "step": 11041 + }, + { + "epoch": 0.5140954908396769, + "grad_norm": 0.3780899339831193, + "learning_rate": 9.845679117875262e-05, + "loss": 
3.1549, + "step": 11042 + }, + { + "epoch": 0.51414204902577, + "grad_norm": 0.3904891711258343, + "learning_rate": 9.845612332988456e-05, + "loss": 3.2288, + "step": 11043 + }, + { + "epoch": 0.514188607211863, + "grad_norm": 0.39792978227438386, + "learning_rate": 9.845545533880258e-05, + "loss": 3.0831, + "step": 11044 + }, + { + "epoch": 0.5142351653979561, + "grad_norm": 0.4105738504431479, + "learning_rate": 9.84547872055086e-05, + "loss": 3.2112, + "step": 11045 + }, + { + "epoch": 0.5142817235840491, + "grad_norm": 0.38930417166056247, + "learning_rate": 9.84541189300046e-05, + "loss": 3.1573, + "step": 11046 + }, + { + "epoch": 0.5143282817701422, + "grad_norm": 0.398620394136607, + "learning_rate": 9.845345051229252e-05, + "loss": 3.2022, + "step": 11047 + }, + { + "epoch": 0.5143748399562353, + "grad_norm": 0.40688440172517837, + "learning_rate": 9.845278195237435e-05, + "loss": 3.2069, + "step": 11048 + }, + { + "epoch": 0.5144213981423283, + "grad_norm": 0.34235703538216117, + "learning_rate": 9.845211325025203e-05, + "loss": 3.1041, + "step": 11049 + }, + { + "epoch": 0.5144679563284215, + "grad_norm": 0.37619711743577916, + "learning_rate": 9.845144440592754e-05, + "loss": 3.1494, + "step": 11050 + }, + { + "epoch": 0.5145145145145145, + "grad_norm": 0.369088780228048, + "learning_rate": 9.845077541940284e-05, + "loss": 3.2068, + "step": 11051 + }, + { + "epoch": 0.5145610727006076, + "grad_norm": 0.36497443831888776, + "learning_rate": 9.845010629067985e-05, + "loss": 3.1751, + "step": 11052 + }, + { + "epoch": 0.5146076308867007, + "grad_norm": 0.3606883739250875, + "learning_rate": 9.844943701976061e-05, + "loss": 3.1503, + "step": 11053 + }, + { + "epoch": 0.5146541890727937, + "grad_norm": 0.3801660675785268, + "learning_rate": 9.844876760664703e-05, + "loss": 3.2472, + "step": 11054 + }, + { + "epoch": 0.5147007472588868, + "grad_norm": 0.37245849566009154, + "learning_rate": 9.84480980513411e-05, + "loss": 3.2603, + "step": 11055 + }, + { + "epoch": 0.5147473054449798, + "grad_norm": 0.3866230231381722, + "learning_rate": 9.844742835384477e-05, + "loss": 3.08, + "step": 11056 + }, + { + "epoch": 0.5147938636310729, + "grad_norm": 0.35493100262350896, + "learning_rate": 9.844675851416001e-05, + "loss": 3.0206, + "step": 11057 + }, + { + "epoch": 0.514840421817166, + "grad_norm": 0.3906361805953509, + "learning_rate": 9.844608853228879e-05, + "loss": 3.2206, + "step": 11058 + }, + { + "epoch": 0.514886980003259, + "grad_norm": 0.35781587896691275, + "learning_rate": 9.844541840823307e-05, + "loss": 3.2162, + "step": 11059 + }, + { + "epoch": 0.5149335381893522, + "grad_norm": 0.3714653059895805, + "learning_rate": 9.84447481419948e-05, + "loss": 3.1692, + "step": 11060 + }, + { + "epoch": 0.5149800963754452, + "grad_norm": 0.3861138388592855, + "learning_rate": 9.844407773357599e-05, + "loss": 3.1258, + "step": 11061 + }, + { + "epoch": 0.5150266545615383, + "grad_norm": 0.3764003065138879, + "learning_rate": 9.844340718297857e-05, + "loss": 3.1112, + "step": 11062 + }, + { + "epoch": 0.5150732127476314, + "grad_norm": 0.4712777213802182, + "learning_rate": 9.844273649020455e-05, + "loss": 3.1764, + "step": 11063 + }, + { + "epoch": 0.5151197709337244, + "grad_norm": 0.4559429899597162, + "learning_rate": 9.844206565525584e-05, + "loss": 3.0944, + "step": 11064 + }, + { + "epoch": 0.5151663291198175, + "grad_norm": 0.3867352977997331, + "learning_rate": 9.844139467813444e-05, + "loss": 3.1645, + "step": 11065 + }, + { + "epoch": 0.5152128873059105, + "grad_norm": 
0.3702514036969871, + "learning_rate": 9.844072355884236e-05, + "loss": 3.1299, + "step": 11066 + }, + { + "epoch": 0.5152594454920036, + "grad_norm": 0.3937982122744094, + "learning_rate": 9.844005229738149e-05, + "loss": 3.2219, + "step": 11067 + }, + { + "epoch": 0.5153060036780966, + "grad_norm": 0.4276582596844193, + "learning_rate": 9.843938089375384e-05, + "loss": 3.2279, + "step": 11068 + }, + { + "epoch": 0.5153525618641898, + "grad_norm": 0.3952653521251412, + "learning_rate": 9.843870934796139e-05, + "loss": 3.2642, + "step": 11069 + }, + { + "epoch": 0.5153991200502829, + "grad_norm": 0.38191249426811935, + "learning_rate": 9.84380376600061e-05, + "loss": 3.1489, + "step": 11070 + }, + { + "epoch": 0.5154456782363759, + "grad_norm": 0.4041453662006075, + "learning_rate": 9.843736582988993e-05, + "loss": 3.126, + "step": 11071 + }, + { + "epoch": 0.515492236422469, + "grad_norm": 0.40065678478909755, + "learning_rate": 9.843669385761487e-05, + "loss": 3.1704, + "step": 11072 + }, + { + "epoch": 0.515538794608562, + "grad_norm": 0.36623604321629993, + "learning_rate": 9.843602174318289e-05, + "loss": 3.1403, + "step": 11073 + }, + { + "epoch": 0.5155853527946551, + "grad_norm": 0.4130520811252801, + "learning_rate": 9.843534948659596e-05, + "loss": 3.1658, + "step": 11074 + }, + { + "epoch": 0.5156319109807482, + "grad_norm": 0.43535084944529184, + "learning_rate": 9.843467708785604e-05, + "loss": 3.1872, + "step": 11075 + }, + { + "epoch": 0.5156784691668412, + "grad_norm": 0.4283580339494268, + "learning_rate": 9.843400454696513e-05, + "loss": 3.018, + "step": 11076 + }, + { + "epoch": 0.5157250273529344, + "grad_norm": 0.43688045466265835, + "learning_rate": 9.843333186392517e-05, + "loss": 3.2641, + "step": 11077 + }, + { + "epoch": 0.5157715855390274, + "grad_norm": 0.42427018355494717, + "learning_rate": 9.843265903873816e-05, + "loss": 3.1226, + "step": 11078 + }, + { + "epoch": 0.5158181437251205, + "grad_norm": 0.4745771947571634, + "learning_rate": 9.843198607140607e-05, + "loss": 3.2123, + "step": 11079 + }, + { + "epoch": 0.5158647019112136, + "grad_norm": 0.42690315833379294, + "learning_rate": 9.843131296193087e-05, + "loss": 3.1614, + "step": 11080 + }, + { + "epoch": 0.5159112600973066, + "grad_norm": 0.38548665173581503, + "learning_rate": 9.843063971031454e-05, + "loss": 3.1378, + "step": 11081 + }, + { + "epoch": 0.5159578182833997, + "grad_norm": 0.38186516560137707, + "learning_rate": 9.842996631655905e-05, + "loss": 3.1766, + "step": 11082 + }, + { + "epoch": 0.5160043764694927, + "grad_norm": 0.38118774438298436, + "learning_rate": 9.842929278066639e-05, + "loss": 3.1155, + "step": 11083 + }, + { + "epoch": 0.5160509346555858, + "grad_norm": 0.39669307413235905, + "learning_rate": 9.842861910263851e-05, + "loss": 3.1733, + "step": 11084 + }, + { + "epoch": 0.5160974928416789, + "grad_norm": 0.3872354751138355, + "learning_rate": 9.842794528247741e-05, + "loss": 3.1182, + "step": 11085 + }, + { + "epoch": 0.5161440510277719, + "grad_norm": 0.4331632067407214, + "learning_rate": 9.842727132018507e-05, + "loss": 3.2528, + "step": 11086 + }, + { + "epoch": 0.5161906092138651, + "grad_norm": 0.41971868279776386, + "learning_rate": 9.842659721576345e-05, + "loss": 3.1542, + "step": 11087 + }, + { + "epoch": 0.5162371673999581, + "grad_norm": 0.43819825078108376, + "learning_rate": 9.842592296921453e-05, + "loss": 3.1065, + "step": 11088 + }, + { + "epoch": 0.5162837255860512, + "grad_norm": 0.41739734632114667, + "learning_rate": 9.842524858054031e-05, + "loss": 
3.0994, + "step": 11089 + }, + { + "epoch": 0.5163302837721442, + "grad_norm": 0.3581241871139848, + "learning_rate": 9.842457404974276e-05, + "loss": 3.117, + "step": 11090 + }, + { + "epoch": 0.5163768419582373, + "grad_norm": 0.37603994667689117, + "learning_rate": 9.842389937682386e-05, + "loss": 3.0812, + "step": 11091 + }, + { + "epoch": 0.5164234001443304, + "grad_norm": 0.35235149781656794, + "learning_rate": 9.842322456178556e-05, + "loss": 3.06, + "step": 11092 + }, + { + "epoch": 0.5164699583304234, + "grad_norm": 0.3819479833130655, + "learning_rate": 9.842254960462989e-05, + "loss": 3.15, + "step": 11093 + }, + { + "epoch": 0.5165165165165165, + "grad_norm": 0.3894609048670598, + "learning_rate": 9.84218745053588e-05, + "loss": 3.0006, + "step": 11094 + }, + { + "epoch": 0.5165630747026095, + "grad_norm": 0.35454178261495695, + "learning_rate": 9.842119926397428e-05, + "loss": 3.1282, + "step": 11095 + }, + { + "epoch": 0.5166096328887027, + "grad_norm": 0.4195250223779788, + "learning_rate": 9.842052388047831e-05, + "loss": 3.0917, + "step": 11096 + }, + { + "epoch": 0.5166561910747958, + "grad_norm": 0.3959074929298418, + "learning_rate": 9.841984835487288e-05, + "loss": 3.1076, + "step": 11097 + }, + { + "epoch": 0.5167027492608888, + "grad_norm": 0.3900551147131855, + "learning_rate": 9.841917268715995e-05, + "loss": 3.1824, + "step": 11098 + }, + { + "epoch": 0.5167493074469819, + "grad_norm": 0.36926712954550567, + "learning_rate": 9.841849687734152e-05, + "loss": 3.1388, + "step": 11099 + }, + { + "epoch": 0.5167958656330749, + "grad_norm": 0.3882216390648275, + "learning_rate": 9.841782092541958e-05, + "loss": 3.0405, + "step": 11100 + }, + { + "epoch": 0.516842423819168, + "grad_norm": 0.41762524281144814, + "learning_rate": 9.84171448313961e-05, + "loss": 3.1727, + "step": 11101 + }, + { + "epoch": 0.5168889820052611, + "grad_norm": 0.3721632533523949, + "learning_rate": 9.841646859527307e-05, + "loss": 3.1144, + "step": 11102 + }, + { + "epoch": 0.5169355401913541, + "grad_norm": 0.3655432420814797, + "learning_rate": 9.841579221705248e-05, + "loss": 3.1283, + "step": 11103 + }, + { + "epoch": 0.5169820983774472, + "grad_norm": 0.33011806311694786, + "learning_rate": 9.84151156967363e-05, + "loss": 3.1146, + "step": 11104 + }, + { + "epoch": 0.5170286565635402, + "grad_norm": 0.37972500636532713, + "learning_rate": 9.841443903432655e-05, + "loss": 3.0621, + "step": 11105 + }, + { + "epoch": 0.5170752147496334, + "grad_norm": 0.3470237336541176, + "learning_rate": 9.841376222982516e-05, + "loss": 3.2266, + "step": 11106 + }, + { + "epoch": 0.5171217729357265, + "grad_norm": 0.4041998912993029, + "learning_rate": 9.841308528323416e-05, + "loss": 3.0751, + "step": 11107 + }, + { + "epoch": 0.5171683311218195, + "grad_norm": 0.4193335264111284, + "learning_rate": 9.841240819455553e-05, + "loss": 3.0972, + "step": 11108 + }, + { + "epoch": 0.5172148893079126, + "grad_norm": 0.38161244223064333, + "learning_rate": 9.841173096379124e-05, + "loss": 3.1356, + "step": 11109 + }, + { + "epoch": 0.5172614474940056, + "grad_norm": 0.3918964371479297, + "learning_rate": 9.841105359094329e-05, + "loss": 3.2297, + "step": 11110 + }, + { + "epoch": 0.5173080056800987, + "grad_norm": 0.37026543969454273, + "learning_rate": 9.841037607601366e-05, + "loss": 3.077, + "step": 11111 + }, + { + "epoch": 0.5173545638661917, + "grad_norm": 0.36343658469328793, + "learning_rate": 9.840969841900436e-05, + "loss": 3.2975, + "step": 11112 + }, + { + "epoch": 0.5174011220522848, + "grad_norm": 
0.3773281746258802, + "learning_rate": 9.840902061991737e-05, + "loss": 3.1697, + "step": 11113 + }, + { + "epoch": 0.517447680238378, + "grad_norm": 0.45531051060369054, + "learning_rate": 9.840834267875466e-05, + "loss": 3.2261, + "step": 11114 + }, + { + "epoch": 0.517494238424471, + "grad_norm": 0.39549543600948245, + "learning_rate": 9.840766459551825e-05, + "loss": 3.1808, + "step": 11115 + }, + { + "epoch": 0.5175407966105641, + "grad_norm": 0.3679030375308253, + "learning_rate": 9.840698637021008e-05, + "loss": 3.0997, + "step": 11116 + }, + { + "epoch": 0.5175873547966571, + "grad_norm": 0.3995470504034687, + "learning_rate": 9.840630800283222e-05, + "loss": 3.1463, + "step": 11117 + }, + { + "epoch": 0.5176339129827502, + "grad_norm": 0.364828904447752, + "learning_rate": 9.840562949338658e-05, + "loss": 3.3115, + "step": 11118 + }, + { + "epoch": 0.5176804711688433, + "grad_norm": 0.4131043598912191, + "learning_rate": 9.84049508418752e-05, + "loss": 3.1374, + "step": 11119 + }, + { + "epoch": 0.5177270293549363, + "grad_norm": 0.3753705368370561, + "learning_rate": 9.840427204830005e-05, + "loss": 3.2635, + "step": 11120 + }, + { + "epoch": 0.5177735875410294, + "grad_norm": 0.366108522042396, + "learning_rate": 9.840359311266314e-05, + "loss": 3.1232, + "step": 11121 + }, + { + "epoch": 0.5178201457271224, + "grad_norm": 0.3242764745360887, + "learning_rate": 9.840291403496645e-05, + "loss": 3.0888, + "step": 11122 + }, + { + "epoch": 0.5178667039132155, + "grad_norm": 0.33181300661507446, + "learning_rate": 9.840223481521196e-05, + "loss": 3.18, + "step": 11123 + }, + { + "epoch": 0.5179132620993087, + "grad_norm": 0.3469803505445786, + "learning_rate": 9.840155545340168e-05, + "loss": 3.1092, + "step": 11124 + }, + { + "epoch": 0.5179598202854017, + "grad_norm": 0.3654835817660682, + "learning_rate": 9.840087594953762e-05, + "loss": 3.1883, + "step": 11125 + }, + { + "epoch": 0.5180063784714948, + "grad_norm": 0.35267927064385457, + "learning_rate": 9.840019630362176e-05, + "loss": 3.1868, + "step": 11126 + }, + { + "epoch": 0.5180529366575878, + "grad_norm": 0.39648947659844214, + "learning_rate": 9.839951651565608e-05, + "loss": 3.1019, + "step": 11127 + }, + { + "epoch": 0.5180994948436809, + "grad_norm": 0.38665110073146636, + "learning_rate": 9.83988365856426e-05, + "loss": 3.1542, + "step": 11128 + }, + { + "epoch": 0.518146053029774, + "grad_norm": 0.3904323719978154, + "learning_rate": 9.839815651358328e-05, + "loss": 3.1592, + "step": 11129 + }, + { + "epoch": 0.518192611215867, + "grad_norm": 0.3510206616545039, + "learning_rate": 9.839747629948016e-05, + "loss": 3.1399, + "step": 11130 + }, + { + "epoch": 0.5182391694019601, + "grad_norm": 0.37397538906776373, + "learning_rate": 9.839679594333522e-05, + "loss": 3.141, + "step": 11131 + }, + { + "epoch": 0.5182857275880531, + "grad_norm": 0.3592506450365157, + "learning_rate": 9.839611544515043e-05, + "loss": 3.2025, + "step": 11132 + }, + { + "epoch": 0.5183322857741463, + "grad_norm": 0.39377810380422335, + "learning_rate": 9.839543480492782e-05, + "loss": 3.2597, + "step": 11133 + }, + { + "epoch": 0.5183788439602393, + "grad_norm": 0.4325600596609602, + "learning_rate": 9.839475402266938e-05, + "loss": 3.1496, + "step": 11134 + }, + { + "epoch": 0.5184254021463324, + "grad_norm": 0.4168423171817392, + "learning_rate": 9.839407309837709e-05, + "loss": 3.1068, + "step": 11135 + }, + { + "epoch": 0.5184719603324255, + "grad_norm": 0.4045596480897611, + "learning_rate": 9.839339203205298e-05, + "loss": 3.1565, + 
"step": 11136 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 0.42298073923569746, + "learning_rate": 9.839271082369902e-05, + "loss": 3.2336, + "step": 11137 + }, + { + "epoch": 0.5185650767046116, + "grad_norm": 0.36932188702811264, + "learning_rate": 9.839202947331722e-05, + "loss": 3.2356, + "step": 11138 + }, + { + "epoch": 0.5186116348907046, + "grad_norm": 0.3679425090781842, + "learning_rate": 9.839134798090959e-05, + "loss": 3.1799, + "step": 11139 + }, + { + "epoch": 0.5186581930767977, + "grad_norm": 0.3893289747162535, + "learning_rate": 9.839066634647812e-05, + "loss": 3.1215, + "step": 11140 + }, + { + "epoch": 0.5187047512628908, + "grad_norm": 0.3554508396493234, + "learning_rate": 9.838998457002482e-05, + "loss": 3.1834, + "step": 11141 + }, + { + "epoch": 0.5187513094489838, + "grad_norm": 0.39928419363963535, + "learning_rate": 9.838930265155166e-05, + "loss": 3.1619, + "step": 11142 + }, + { + "epoch": 0.518797867635077, + "grad_norm": 0.356045800386262, + "learning_rate": 9.838862059106067e-05, + "loss": 3.1518, + "step": 11143 + }, + { + "epoch": 0.51884442582117, + "grad_norm": 0.3890489261859624, + "learning_rate": 9.838793838855385e-05, + "loss": 3.1372, + "step": 11144 + }, + { + "epoch": 0.5188909840072631, + "grad_norm": 0.3754042963422427, + "learning_rate": 9.83872560440332e-05, + "loss": 3.0856, + "step": 11145 + }, + { + "epoch": 0.5189375421933562, + "grad_norm": 0.3416588185413794, + "learning_rate": 9.838657355750073e-05, + "loss": 3.1871, + "step": 11146 + }, + { + "epoch": 0.5189841003794492, + "grad_norm": 0.3626520865854299, + "learning_rate": 9.838589092895843e-05, + "loss": 3.2392, + "step": 11147 + }, + { + "epoch": 0.5190306585655423, + "grad_norm": 0.34965789499922934, + "learning_rate": 9.838520815840829e-05, + "loss": 3.117, + "step": 11148 + }, + { + "epoch": 0.5190772167516353, + "grad_norm": 0.3970742312447113, + "learning_rate": 9.838452524585234e-05, + "loss": 3.1835, + "step": 11149 + }, + { + "epoch": 0.5191237749377284, + "grad_norm": 0.4038980514868307, + "learning_rate": 9.838384219129257e-05, + "loss": 3.0935, + "step": 11150 + }, + { + "epoch": 0.5191703331238215, + "grad_norm": 0.34052307846051577, + "learning_rate": 9.8383158994731e-05, + "loss": 3.0726, + "step": 11151 + }, + { + "epoch": 0.5192168913099146, + "grad_norm": 0.3773093350669679, + "learning_rate": 9.838247565616962e-05, + "loss": 3.2125, + "step": 11152 + }, + { + "epoch": 0.5192634494960077, + "grad_norm": 0.3947375153215702, + "learning_rate": 9.838179217561046e-05, + "loss": 3.2065, + "step": 11153 + }, + { + "epoch": 0.5193100076821007, + "grad_norm": 0.3699136441662772, + "learning_rate": 9.838110855305547e-05, + "loss": 3.2501, + "step": 11154 + }, + { + "epoch": 0.5193565658681938, + "grad_norm": 0.3751589403807965, + "learning_rate": 9.838042478850671e-05, + "loss": 3.2354, + "step": 11155 + }, + { + "epoch": 0.5194031240542868, + "grad_norm": 0.4092525773636655, + "learning_rate": 9.837974088196617e-05, + "loss": 3.1906, + "step": 11156 + }, + { + "epoch": 0.5194496822403799, + "grad_norm": 0.41643115485563514, + "learning_rate": 9.837905683343587e-05, + "loss": 3.191, + "step": 11157 + }, + { + "epoch": 0.519496240426473, + "grad_norm": 0.40525393922933606, + "learning_rate": 9.837837264291778e-05, + "loss": 3.2767, + "step": 11158 + }, + { + "epoch": 0.519542798612566, + "grad_norm": 0.39379937457493286, + "learning_rate": 9.837768831041394e-05, + "loss": 3.1031, + "step": 11159 + }, + { + "epoch": 0.5195893567986591, + "grad_norm": 
0.36940380999197353, + "learning_rate": 9.837700383592636e-05, + "loss": 3.0905, + "step": 11160 + }, + { + "epoch": 0.5196359149847521, + "grad_norm": 0.3702487206401796, + "learning_rate": 9.837631921945703e-05, + "loss": 3.1527, + "step": 11161 + }, + { + "epoch": 0.5196824731708453, + "grad_norm": 0.4751425420579111, + "learning_rate": 9.837563446100797e-05, + "loss": 3.2915, + "step": 11162 + }, + { + "epoch": 0.5197290313569384, + "grad_norm": 0.4191668434546743, + "learning_rate": 9.83749495605812e-05, + "loss": 3.1244, + "step": 11163 + }, + { + "epoch": 0.5197755895430314, + "grad_norm": 0.377233732433446, + "learning_rate": 9.83742645181787e-05, + "loss": 3.238, + "step": 11164 + }, + { + "epoch": 0.5198221477291245, + "grad_norm": 0.45348322250226536, + "learning_rate": 9.837357933380252e-05, + "loss": 3.1649, + "step": 11165 + }, + { + "epoch": 0.5198687059152175, + "grad_norm": 0.4898871287073209, + "learning_rate": 9.837289400745464e-05, + "loss": 3.223, + "step": 11166 + }, + { + "epoch": 0.5199152641013106, + "grad_norm": 0.3857225551413653, + "learning_rate": 9.837220853913706e-05, + "loss": 3.1611, + "step": 11167 + }, + { + "epoch": 0.5199618222874037, + "grad_norm": 0.43658634888179554, + "learning_rate": 9.837152292885186e-05, + "loss": 3.0638, + "step": 11168 + }, + { + "epoch": 0.5200083804734967, + "grad_norm": 0.45140260733781334, + "learning_rate": 9.837083717660096e-05, + "loss": 3.1835, + "step": 11169 + }, + { + "epoch": 0.5200549386595898, + "grad_norm": 0.48719420416070947, + "learning_rate": 9.837015128238645e-05, + "loss": 3.1145, + "step": 11170 + }, + { + "epoch": 0.5201014968456829, + "grad_norm": 0.42407784085985417, + "learning_rate": 9.83694652462103e-05, + "loss": 3.1101, + "step": 11171 + }, + { + "epoch": 0.520148055031776, + "grad_norm": 0.4187633796165388, + "learning_rate": 9.836877906807453e-05, + "loss": 3.2178, + "step": 11172 + }, + { + "epoch": 0.5201946132178691, + "grad_norm": 0.3968002800413646, + "learning_rate": 9.836809274798116e-05, + "loss": 3.1818, + "step": 11173 + }, + { + "epoch": 0.5202411714039621, + "grad_norm": 0.40745340687675563, + "learning_rate": 9.836740628593221e-05, + "loss": 3.175, + "step": 11174 + }, + { + "epoch": 0.5202877295900552, + "grad_norm": 0.37611975534964587, + "learning_rate": 9.836671968192968e-05, + "loss": 3.0369, + "step": 11175 + }, + { + "epoch": 0.5203342877761482, + "grad_norm": 0.3865077955052295, + "learning_rate": 9.836603293597559e-05, + "loss": 3.1478, + "step": 11176 + }, + { + "epoch": 0.5203808459622413, + "grad_norm": 0.3528155595050601, + "learning_rate": 9.836534604807198e-05, + "loss": 3.235, + "step": 11177 + }, + { + "epoch": 0.5204274041483343, + "grad_norm": 0.3763481729283357, + "learning_rate": 9.836465901822082e-05, + "loss": 3.1798, + "step": 11178 + }, + { + "epoch": 0.5204739623344274, + "grad_norm": 0.3674294437371154, + "learning_rate": 9.836397184642416e-05, + "loss": 3.2079, + "step": 11179 + }, + { + "epoch": 0.5205205205205206, + "grad_norm": 0.4312306228688857, + "learning_rate": 9.8363284532684e-05, + "loss": 3.2385, + "step": 11180 + }, + { + "epoch": 0.5205670787066136, + "grad_norm": 0.42196612392642563, + "learning_rate": 9.836259707700237e-05, + "loss": 3.0199, + "step": 11181 + }, + { + "epoch": 0.5206136368927067, + "grad_norm": 0.4240896303332426, + "learning_rate": 9.836190947938128e-05, + "loss": 3.151, + "step": 11182 + }, + { + "epoch": 0.5206601950787997, + "grad_norm": 0.4180041776701868, + "learning_rate": 9.836122173982277e-05, + "loss": 3.1963, + 
"step": 11183 + }, + { + "epoch": 0.5207067532648928, + "grad_norm": 0.46456646387363953, + "learning_rate": 9.83605338583288e-05, + "loss": 3.1009, + "step": 11184 + }, + { + "epoch": 0.5207533114509859, + "grad_norm": 0.4272577377311306, + "learning_rate": 9.835984583490145e-05, + "loss": 3.1368, + "step": 11185 + }, + { + "epoch": 0.5207998696370789, + "grad_norm": 0.41354679951241513, + "learning_rate": 9.835915766954272e-05, + "loss": 3.2412, + "step": 11186 + }, + { + "epoch": 0.520846427823172, + "grad_norm": 0.4741733395471054, + "learning_rate": 9.835846936225462e-05, + "loss": 3.2014, + "step": 11187 + }, + { + "epoch": 0.520892986009265, + "grad_norm": 0.448937228121498, + "learning_rate": 9.835778091303918e-05, + "loss": 3.1398, + "step": 11188 + }, + { + "epoch": 0.5209395441953582, + "grad_norm": 0.4391525532365366, + "learning_rate": 9.835709232189842e-05, + "loss": 3.1395, + "step": 11189 + }, + { + "epoch": 0.5209861023814513, + "grad_norm": 0.40401064342391807, + "learning_rate": 9.835640358883435e-05, + "loss": 3.1206, + "step": 11190 + }, + { + "epoch": 0.5210326605675443, + "grad_norm": 0.42106125819598905, + "learning_rate": 9.8355714713849e-05, + "loss": 3.2109, + "step": 11191 + }, + { + "epoch": 0.5210792187536374, + "grad_norm": 0.4414820750033992, + "learning_rate": 9.835502569694439e-05, + "loss": 3.2513, + "step": 11192 + }, + { + "epoch": 0.5211257769397304, + "grad_norm": 0.4135506008832714, + "learning_rate": 9.835433653812257e-05, + "loss": 3.0944, + "step": 11193 + }, + { + "epoch": 0.5211723351258235, + "grad_norm": 0.37541781617867936, + "learning_rate": 9.83536472373855e-05, + "loss": 3.1314, + "step": 11194 + }, + { + "epoch": 0.5212188933119166, + "grad_norm": 0.40497994433212936, + "learning_rate": 9.835295779473526e-05, + "loss": 3.0378, + "step": 11195 + }, + { + "epoch": 0.5212654514980096, + "grad_norm": 0.4029614859488647, + "learning_rate": 9.835226821017382e-05, + "loss": 3.2095, + "step": 11196 + }, + { + "epoch": 0.5213120096841027, + "grad_norm": 0.36196831356599746, + "learning_rate": 9.835157848370326e-05, + "loss": 3.1171, + "step": 11197 + }, + { + "epoch": 0.5213585678701957, + "grad_norm": 0.46176120668824744, + "learning_rate": 9.83508886153256e-05, + "loss": 3.132, + "step": 11198 + }, + { + "epoch": 0.5214051260562889, + "grad_norm": 0.4285934237559687, + "learning_rate": 9.835019860504282e-05, + "loss": 3.2052, + "step": 11199 + }, + { + "epoch": 0.5214516842423819, + "grad_norm": 0.38889607314418806, + "learning_rate": 9.834950845285697e-05, + "loss": 3.1955, + "step": 11200 + }, + { + "epoch": 0.521498242428475, + "grad_norm": 0.4154083104206728, + "learning_rate": 9.834881815877009e-05, + "loss": 3.1997, + "step": 11201 + }, + { + "epoch": 0.5215448006145681, + "grad_norm": 0.4176874356033679, + "learning_rate": 9.834812772278419e-05, + "loss": 3.2228, + "step": 11202 + }, + { + "epoch": 0.5215913588006611, + "grad_norm": 0.3899575499193462, + "learning_rate": 9.834743714490129e-05, + "loss": 3.2033, + "step": 11203 + }, + { + "epoch": 0.5216379169867542, + "grad_norm": 0.39841198353455387, + "learning_rate": 9.834674642512342e-05, + "loss": 3.2018, + "step": 11204 + }, + { + "epoch": 0.5216844751728472, + "grad_norm": 0.34968313591012423, + "learning_rate": 9.834605556345263e-05, + "loss": 3.101, + "step": 11205 + }, + { + "epoch": 0.5217310333589403, + "grad_norm": 0.37788531946956133, + "learning_rate": 9.834536455989092e-05, + "loss": 3.2583, + "step": 11206 + }, + { + "epoch": 0.5217775915450334, + "grad_norm": 
0.4152674768048255, + "learning_rate": 9.834467341444032e-05, + "loss": 3.194, + "step": 11207 + }, + { + "epoch": 0.5218241497311265, + "grad_norm": 0.33492007008698754, + "learning_rate": 9.834398212710287e-05, + "loss": 3.1736, + "step": 11208 + }, + { + "epoch": 0.5218707079172196, + "grad_norm": 0.3735419079258309, + "learning_rate": 9.83432906978806e-05, + "loss": 3.1368, + "step": 11209 + }, + { + "epoch": 0.5219172661033126, + "grad_norm": 0.347893195101974, + "learning_rate": 9.834259912677555e-05, + "loss": 3.1349, + "step": 11210 + }, + { + "epoch": 0.5219638242894057, + "grad_norm": 0.35486885150225267, + "learning_rate": 9.83419074137897e-05, + "loss": 3.0727, + "step": 11211 + }, + { + "epoch": 0.5220103824754988, + "grad_norm": 0.36034908348700856, + "learning_rate": 9.834121555892513e-05, + "loss": 3.1431, + "step": 11212 + }, + { + "epoch": 0.5220569406615918, + "grad_norm": 0.3622076491055402, + "learning_rate": 9.834052356218387e-05, + "loss": 3.0627, + "step": 11213 + }, + { + "epoch": 0.5221034988476849, + "grad_norm": 0.34388763405654793, + "learning_rate": 9.833983142356791e-05, + "loss": 3.1638, + "step": 11214 + }, + { + "epoch": 0.5221500570337779, + "grad_norm": 0.41705718060143465, + "learning_rate": 9.833913914307932e-05, + "loss": 3.2263, + "step": 11215 + }, + { + "epoch": 0.522196615219871, + "grad_norm": 0.4756874302881365, + "learning_rate": 9.833844672072013e-05, + "loss": 3.1157, + "step": 11216 + }, + { + "epoch": 0.5222431734059642, + "grad_norm": 0.4756643297823539, + "learning_rate": 9.833775415649235e-05, + "loss": 3.1565, + "step": 11217 + }, + { + "epoch": 0.5222897315920572, + "grad_norm": 0.4006977774967952, + "learning_rate": 9.833706145039801e-05, + "loss": 3.0741, + "step": 11218 + }, + { + "epoch": 0.5223362897781503, + "grad_norm": 0.42111035623287885, + "learning_rate": 9.833636860243918e-05, + "loss": 3.1719, + "step": 11219 + }, + { + "epoch": 0.5223828479642433, + "grad_norm": 0.451975253661049, + "learning_rate": 9.833567561261785e-05, + "loss": 3.2134, + "step": 11220 + }, + { + "epoch": 0.5224294061503364, + "grad_norm": 0.4303956862587267, + "learning_rate": 9.833498248093608e-05, + "loss": 3.1385, + "step": 11221 + }, + { + "epoch": 0.5224759643364294, + "grad_norm": 0.40489463188503944, + "learning_rate": 9.833428920739592e-05, + "loss": 3.1356, + "step": 11222 + }, + { + "epoch": 0.5225225225225225, + "grad_norm": 0.49083850256419276, + "learning_rate": 9.833359579199935e-05, + "loss": 3.1149, + "step": 11223 + }, + { + "epoch": 0.5225690807086156, + "grad_norm": 0.43164096195862656, + "learning_rate": 9.833290223474845e-05, + "loss": 3.1424, + "step": 11224 + }, + { + "epoch": 0.5226156388947086, + "grad_norm": 0.3651445207709954, + "learning_rate": 9.833220853564525e-05, + "loss": 3.1253, + "step": 11225 + }, + { + "epoch": 0.5226621970808017, + "grad_norm": 0.40854903626139494, + "learning_rate": 9.833151469469178e-05, + "loss": 3.2193, + "step": 11226 + }, + { + "epoch": 0.5227087552668948, + "grad_norm": 0.38212322639477236, + "learning_rate": 9.833082071189007e-05, + "loss": 3.1862, + "step": 11227 + }, + { + "epoch": 0.5227553134529879, + "grad_norm": 0.4157944481948782, + "learning_rate": 9.833012658724217e-05, + "loss": 3.3275, + "step": 11228 + }, + { + "epoch": 0.522801871639081, + "grad_norm": 0.3888191385917221, + "learning_rate": 9.83294323207501e-05, + "loss": 3.1525, + "step": 11229 + }, + { + "epoch": 0.522848429825174, + "grad_norm": 0.36888438017303105, + "learning_rate": 9.83287379124159e-05, + "loss": 3.0972, 
+ "step": 11230 + }, + { + "epoch": 0.5228949880112671, + "grad_norm": 0.38435955897013435, + "learning_rate": 9.832804336224164e-05, + "loss": 3.1358, + "step": 11231 + }, + { + "epoch": 0.5229415461973601, + "grad_norm": 0.3356541428296597, + "learning_rate": 9.832734867022931e-05, + "loss": 3.0399, + "step": 11232 + }, + { + "epoch": 0.5229881043834532, + "grad_norm": 0.44105495116214055, + "learning_rate": 9.832665383638099e-05, + "loss": 3.1141, + "step": 11233 + }, + { + "epoch": 0.5230346625695463, + "grad_norm": 0.449348193988684, + "learning_rate": 9.832595886069869e-05, + "loss": 3.0551, + "step": 11234 + }, + { + "epoch": 0.5230812207556393, + "grad_norm": 0.3707276549758709, + "learning_rate": 9.832526374318447e-05, + "loss": 3.1326, + "step": 11235 + }, + { + "epoch": 0.5231277789417325, + "grad_norm": 0.39605637866353227, + "learning_rate": 9.832456848384035e-05, + "loss": 3.0207, + "step": 11236 + }, + { + "epoch": 0.5231743371278255, + "grad_norm": 0.42513780837174847, + "learning_rate": 9.832387308266839e-05, + "loss": 3.1651, + "step": 11237 + }, + { + "epoch": 0.5232208953139186, + "grad_norm": 0.37932342402096314, + "learning_rate": 9.832317753967063e-05, + "loss": 3.2136, + "step": 11238 + }, + { + "epoch": 0.5232674535000117, + "grad_norm": 0.4439413247272265, + "learning_rate": 9.83224818548491e-05, + "loss": 3.0796, + "step": 11239 + }, + { + "epoch": 0.5233140116861047, + "grad_norm": 0.40070185912188877, + "learning_rate": 9.832178602820584e-05, + "loss": 3.1378, + "step": 11240 + }, + { + "epoch": 0.5233605698721978, + "grad_norm": 0.40342237294662364, + "learning_rate": 9.83210900597429e-05, + "loss": 3.1327, + "step": 11241 + }, + { + "epoch": 0.5234071280582908, + "grad_norm": 0.41615623759522735, + "learning_rate": 9.832039394946233e-05, + "loss": 3.203, + "step": 11242 + }, + { + "epoch": 0.5234536862443839, + "grad_norm": 0.41722390264085796, + "learning_rate": 9.831969769736615e-05, + "loss": 3.0671, + "step": 11243 + }, + { + "epoch": 0.5235002444304769, + "grad_norm": 0.36040046273277393, + "learning_rate": 9.831900130345644e-05, + "loss": 3.1356, + "step": 11244 + }, + { + "epoch": 0.52354680261657, + "grad_norm": 0.4209045571134716, + "learning_rate": 9.831830476773521e-05, + "loss": 3.2035, + "step": 11245 + }, + { + "epoch": 0.5235933608026632, + "grad_norm": 0.41192690576594, + "learning_rate": 9.83176080902045e-05, + "loss": 3.215, + "step": 11246 + }, + { + "epoch": 0.5236399189887562, + "grad_norm": 0.3987228987137792, + "learning_rate": 9.83169112708664e-05, + "loss": 3.1664, + "step": 11247 + }, + { + "epoch": 0.5236864771748493, + "grad_norm": 0.39098972857493414, + "learning_rate": 9.83162143097229e-05, + "loss": 3.1021, + "step": 11248 + }, + { + "epoch": 0.5237330353609423, + "grad_norm": 0.39492498961313044, + "learning_rate": 9.831551720677607e-05, + "loss": 3.1543, + "step": 11249 + }, + { + "epoch": 0.5237795935470354, + "grad_norm": 0.3821715895709871, + "learning_rate": 9.831481996202798e-05, + "loss": 3.1896, + "step": 11250 + }, + { + "epoch": 0.5238261517331285, + "grad_norm": 0.42018293515108074, + "learning_rate": 9.831412257548063e-05, + "loss": 3.1931, + "step": 11251 + }, + { + "epoch": 0.5238727099192215, + "grad_norm": 0.45274445823088294, + "learning_rate": 9.83134250471361e-05, + "loss": 3.2258, + "step": 11252 + }, + { + "epoch": 0.5239192681053146, + "grad_norm": 0.3971587267508695, + "learning_rate": 9.831272737699643e-05, + "loss": 3.2543, + "step": 11253 + }, + { + "epoch": 0.5239658262914076, + "grad_norm": 
0.48643559999567987, + "learning_rate": 9.831202956506367e-05, + "loss": 3.1921, + "step": 11254 + }, + { + "epoch": 0.5240123844775008, + "grad_norm": 0.45643466094035706, + "learning_rate": 9.831133161133986e-05, + "loss": 3.1118, + "step": 11255 + }, + { + "epoch": 0.5240589426635939, + "grad_norm": 0.42271194459734185, + "learning_rate": 9.831063351582704e-05, + "loss": 3.1279, + "step": 11256 + }, + { + "epoch": 0.5241055008496869, + "grad_norm": 0.46120970130354083, + "learning_rate": 9.830993527852727e-05, + "loss": 3.1905, + "step": 11257 + }, + { + "epoch": 0.52415205903578, + "grad_norm": 0.4489831790376545, + "learning_rate": 9.83092368994426e-05, + "loss": 3.2098, + "step": 11258 + }, + { + "epoch": 0.524198617221873, + "grad_norm": 0.4134546892454505, + "learning_rate": 9.830853837857507e-05, + "loss": 3.2476, + "step": 11259 + }, + { + "epoch": 0.5242451754079661, + "grad_norm": 0.41332508079986363, + "learning_rate": 9.830783971592676e-05, + "loss": 3.1044, + "step": 11260 + }, + { + "epoch": 0.5242917335940592, + "grad_norm": 0.36330773291493945, + "learning_rate": 9.83071409114997e-05, + "loss": 3.2084, + "step": 11261 + }, + { + "epoch": 0.5243382917801522, + "grad_norm": 0.4104204087406211, + "learning_rate": 9.830644196529592e-05, + "loss": 3.1512, + "step": 11262 + }, + { + "epoch": 0.5243848499662453, + "grad_norm": 0.3551337070949696, + "learning_rate": 9.830574287731752e-05, + "loss": 3.11, + "step": 11263 + }, + { + "epoch": 0.5244314081523384, + "grad_norm": 0.3971280414732962, + "learning_rate": 9.830504364756649e-05, + "loss": 3.149, + "step": 11264 + }, + { + "epoch": 0.5244779663384315, + "grad_norm": 0.36103872654346186, + "learning_rate": 9.830434427604494e-05, + "loss": 3.1462, + "step": 11265 + }, + { + "epoch": 0.5245245245245245, + "grad_norm": 0.3853356048323397, + "learning_rate": 9.83036447627549e-05, + "loss": 3.0127, + "step": 11266 + }, + { + "epoch": 0.5245710827106176, + "grad_norm": 0.38759474012839606, + "learning_rate": 9.830294510769842e-05, + "loss": 3.1936, + "step": 11267 + }, + { + "epoch": 0.5246176408967107, + "grad_norm": 0.3710915041910341, + "learning_rate": 9.830224531087754e-05, + "loss": 3.1425, + "step": 11268 + }, + { + "epoch": 0.5246641990828037, + "grad_norm": 0.4173788035773042, + "learning_rate": 9.830154537229434e-05, + "loss": 3.1193, + "step": 11269 + }, + { + "epoch": 0.5247107572688968, + "grad_norm": 0.4041228166037257, + "learning_rate": 9.830084529195086e-05, + "loss": 3.1537, + "step": 11270 + }, + { + "epoch": 0.5247573154549898, + "grad_norm": 0.38221737166036684, + "learning_rate": 9.830014506984915e-05, + "loss": 3.0401, + "step": 11271 + }, + { + "epoch": 0.5248038736410829, + "grad_norm": 0.4419531147397887, + "learning_rate": 9.829944470599129e-05, + "loss": 3.1009, + "step": 11272 + }, + { + "epoch": 0.5248504318271761, + "grad_norm": 0.412797504548022, + "learning_rate": 9.82987442003793e-05, + "loss": 3.1501, + "step": 11273 + }, + { + "epoch": 0.5248969900132691, + "grad_norm": 0.4017155436764297, + "learning_rate": 9.829804355301527e-05, + "loss": 3.1278, + "step": 11274 + }, + { + "epoch": 0.5249435481993622, + "grad_norm": 0.43539409014136193, + "learning_rate": 9.829734276390123e-05, + "loss": 3.14, + "step": 11275 + }, + { + "epoch": 0.5249901063854552, + "grad_norm": 0.35813286836235725, + "learning_rate": 9.829664183303924e-05, + "loss": 3.0794, + "step": 11276 + }, + { + "epoch": 0.5250366645715483, + "grad_norm": 0.39069557149184636, + "learning_rate": 9.829594076043138e-05, + "loss": 3.1626, + 
"step": 11277 + }, + { + "epoch": 0.5250832227576414, + "grad_norm": 0.4012223474952043, + "learning_rate": 9.829523954607969e-05, + "loss": 3.1697, + "step": 11278 + }, + { + "epoch": 0.5251297809437344, + "grad_norm": 0.41194184462280886, + "learning_rate": 9.829453818998622e-05, + "loss": 3.2136, + "step": 11279 + }, + { + "epoch": 0.5251763391298275, + "grad_norm": 0.40259537786634103, + "learning_rate": 9.829383669215304e-05, + "loss": 3.161, + "step": 11280 + }, + { + "epoch": 0.5252228973159205, + "grad_norm": 0.41725028676097, + "learning_rate": 9.829313505258222e-05, + "loss": 3.0938, + "step": 11281 + }, + { + "epoch": 0.5252694555020136, + "grad_norm": 0.3892931356595985, + "learning_rate": 9.829243327127579e-05, + "loss": 3.0629, + "step": 11282 + }, + { + "epoch": 0.5253160136881068, + "grad_norm": 0.34561295084953014, + "learning_rate": 9.829173134823584e-05, + "loss": 3.068, + "step": 11283 + }, + { + "epoch": 0.5253625718741998, + "grad_norm": 0.4028010584609366, + "learning_rate": 9.829102928346439e-05, + "loss": 3.1029, + "step": 11284 + }, + { + "epoch": 0.5254091300602929, + "grad_norm": 0.37536973933475665, + "learning_rate": 9.829032707696355e-05, + "loss": 3.1258, + "step": 11285 + }, + { + "epoch": 0.5254556882463859, + "grad_norm": 0.3692958523348891, + "learning_rate": 9.828962472873536e-05, + "loss": 3.0906, + "step": 11286 + }, + { + "epoch": 0.525502246432479, + "grad_norm": 0.4413793477518283, + "learning_rate": 9.828892223878186e-05, + "loss": 3.1563, + "step": 11287 + }, + { + "epoch": 0.525548804618572, + "grad_norm": 0.44792872889310603, + "learning_rate": 9.828821960710514e-05, + "loss": 3.1083, + "step": 11288 + }, + { + "epoch": 0.5255953628046651, + "grad_norm": 0.3503208708960175, + "learning_rate": 9.828751683370728e-05, + "loss": 3.2155, + "step": 11289 + }, + { + "epoch": 0.5256419209907582, + "grad_norm": 0.41867763534262925, + "learning_rate": 9.828681391859027e-05, + "loss": 3.1385, + "step": 11290 + }, + { + "epoch": 0.5256884791768512, + "grad_norm": 0.4917933766602357, + "learning_rate": 9.828611086175624e-05, + "loss": 3.2644, + "step": 11291 + }, + { + "epoch": 0.5257350373629444, + "grad_norm": 0.40831329926294313, + "learning_rate": 9.828540766320723e-05, + "loss": 3.0983, + "step": 11292 + }, + { + "epoch": 0.5257815955490374, + "grad_norm": 0.37492547743807947, + "learning_rate": 9.82847043229453e-05, + "loss": 3.0508, + "step": 11293 + }, + { + "epoch": 0.5258281537351305, + "grad_norm": 0.4098321084297871, + "learning_rate": 9.828400084097254e-05, + "loss": 3.2004, + "step": 11294 + }, + { + "epoch": 0.5258747119212236, + "grad_norm": 0.3548027735490749, + "learning_rate": 9.828329721729097e-05, + "loss": 3.0753, + "step": 11295 + }, + { + "epoch": 0.5259212701073166, + "grad_norm": 0.3811386059903542, + "learning_rate": 9.828259345190268e-05, + "loss": 3.2999, + "step": 11296 + }, + { + "epoch": 0.5259678282934097, + "grad_norm": 0.39490472900354906, + "learning_rate": 9.828188954480975e-05, + "loss": 3.1883, + "step": 11297 + }, + { + "epoch": 0.5260143864795027, + "grad_norm": 0.45810812677378, + "learning_rate": 9.828118549601422e-05, + "loss": 3.1301, + "step": 11298 + }, + { + "epoch": 0.5260609446655958, + "grad_norm": 0.4067442662132645, + "learning_rate": 9.828048130551817e-05, + "loss": 3.1832, + "step": 11299 + }, + { + "epoch": 0.526107502851689, + "grad_norm": 0.36431584418879764, + "learning_rate": 9.827977697332366e-05, + "loss": 3.1048, + "step": 11300 + }, + { + "epoch": 0.526154061037782, + "grad_norm": 
0.35574880069724046, + "learning_rate": 9.827907249943278e-05, + "loss": 3.2062, + "step": 11301 + }, + { + "epoch": 0.5262006192238751, + "grad_norm": 0.3844924246410667, + "learning_rate": 9.827836788384755e-05, + "loss": 3.2183, + "step": 11302 + }, + { + "epoch": 0.5262471774099681, + "grad_norm": 0.3795479695448226, + "learning_rate": 9.827766312657009e-05, + "loss": 3.2723, + "step": 11303 + }, + { + "epoch": 0.5262937355960612, + "grad_norm": 0.37413171816931984, + "learning_rate": 9.827695822760243e-05, + "loss": 3.1541, + "step": 11304 + }, + { + "epoch": 0.5263402937821543, + "grad_norm": 0.3741226149376701, + "learning_rate": 9.827625318694666e-05, + "loss": 3.2341, + "step": 11305 + }, + { + "epoch": 0.5263868519682473, + "grad_norm": 0.44009421486825717, + "learning_rate": 9.827554800460483e-05, + "loss": 3.2286, + "step": 11306 + }, + { + "epoch": 0.5264334101543404, + "grad_norm": 0.3906530619419189, + "learning_rate": 9.827484268057904e-05, + "loss": 3.2308, + "step": 11307 + }, + { + "epoch": 0.5264799683404334, + "grad_norm": 0.3765858453138885, + "learning_rate": 9.827413721487134e-05, + "loss": 3.2438, + "step": 11308 + }, + { + "epoch": 0.5265265265265265, + "grad_norm": 0.39134901461908034, + "learning_rate": 9.82734316074838e-05, + "loss": 3.1676, + "step": 11309 + }, + { + "epoch": 0.5265730847126195, + "grad_norm": 0.38409648128337437, + "learning_rate": 9.82727258584185e-05, + "loss": 3.1172, + "step": 11310 + }, + { + "epoch": 0.5266196428987127, + "grad_norm": 0.39143682462505885, + "learning_rate": 9.827201996767749e-05, + "loss": 3.1249, + "step": 11311 + }, + { + "epoch": 0.5266662010848058, + "grad_norm": 0.3904948519226965, + "learning_rate": 9.827131393526287e-05, + "loss": 3.1847, + "step": 11312 + }, + { + "epoch": 0.5267127592708988, + "grad_norm": 0.46028955864109594, + "learning_rate": 9.827060776117669e-05, + "loss": 3.2761, + "step": 11313 + }, + { + "epoch": 0.5267593174569919, + "grad_norm": 0.42677367192557775, + "learning_rate": 9.826990144542104e-05, + "loss": 3.2048, + "step": 11314 + }, + { + "epoch": 0.5268058756430849, + "grad_norm": 0.42301411052532606, + "learning_rate": 9.826919498799799e-05, + "loss": 3.169, + "step": 11315 + }, + { + "epoch": 0.526852433829178, + "grad_norm": 0.43376376319644344, + "learning_rate": 9.82684883889096e-05, + "loss": 3.114, + "step": 11316 + }, + { + "epoch": 0.5268989920152711, + "grad_norm": 0.40567431750780014, + "learning_rate": 9.826778164815796e-05, + "loss": 3.1061, + "step": 11317 + }, + { + "epoch": 0.5269455502013641, + "grad_norm": 0.3908892893547952, + "learning_rate": 9.826707476574512e-05, + "loss": 3.2864, + "step": 11318 + }, + { + "epoch": 0.5269921083874572, + "grad_norm": 0.43831279784609367, + "learning_rate": 9.826636774167317e-05, + "loss": 3.1632, + "step": 11319 + }, + { + "epoch": 0.5270386665735503, + "grad_norm": 0.36051395348714926, + "learning_rate": 9.82656605759442e-05, + "loss": 3.0337, + "step": 11320 + }, + { + "epoch": 0.5270852247596434, + "grad_norm": 0.43674320329414995, + "learning_rate": 9.826495326856027e-05, + "loss": 3.1034, + "step": 11321 + }, + { + "epoch": 0.5271317829457365, + "grad_norm": 0.44987950030141305, + "learning_rate": 9.826424581952345e-05, + "loss": 3.1372, + "step": 11322 + }, + { + "epoch": 0.5271783411318295, + "grad_norm": 0.3961238233972282, + "learning_rate": 9.826353822883581e-05, + "loss": 3.1746, + "step": 11323 + }, + { + "epoch": 0.5272248993179226, + "grad_norm": 0.36770229291758877, + "learning_rate": 9.826283049649947e-05, + "loss": 
3.2005, + "step": 11324 + }, + { + "epoch": 0.5272714575040156, + "grad_norm": 0.40972034161305615, + "learning_rate": 9.826212262251646e-05, + "loss": 3.1582, + "step": 11325 + }, + { + "epoch": 0.5273180156901087, + "grad_norm": 0.3834181538280492, + "learning_rate": 9.826141460688887e-05, + "loss": 3.1832, + "step": 11326 + }, + { + "epoch": 0.5273645738762018, + "grad_norm": 0.3561506783645935, + "learning_rate": 9.82607064496188e-05, + "loss": 3.0702, + "step": 11327 + }, + { + "epoch": 0.5274111320622948, + "grad_norm": 0.39803607860322315, + "learning_rate": 9.82599981507083e-05, + "loss": 3.1602, + "step": 11328 + }, + { + "epoch": 0.527457690248388, + "grad_norm": 0.40261735631696577, + "learning_rate": 9.825928971015945e-05, + "loss": 3.0178, + "step": 11329 + }, + { + "epoch": 0.527504248434481, + "grad_norm": 0.3813440520957598, + "learning_rate": 9.825858112797435e-05, + "loss": 3.0661, + "step": 11330 + }, + { + "epoch": 0.5275508066205741, + "grad_norm": 0.4585598983309403, + "learning_rate": 9.825787240415507e-05, + "loss": 3.2303, + "step": 11331 + }, + { + "epoch": 0.5275973648066671, + "grad_norm": 0.3947211039901671, + "learning_rate": 9.825716353870369e-05, + "loss": 3.1488, + "step": 11332 + }, + { + "epoch": 0.5276439229927602, + "grad_norm": 0.3911752516018894, + "learning_rate": 9.825645453162229e-05, + "loss": 3.1258, + "step": 11333 + }, + { + "epoch": 0.5276904811788533, + "grad_norm": 0.47268696378243713, + "learning_rate": 9.825574538291293e-05, + "loss": 3.1212, + "step": 11334 + }, + { + "epoch": 0.5277370393649463, + "grad_norm": 0.4195120550038428, + "learning_rate": 9.825503609257774e-05, + "loss": 3.1786, + "step": 11335 + }, + { + "epoch": 0.5277835975510394, + "grad_norm": 0.44625159813467097, + "learning_rate": 9.825432666061876e-05, + "loss": 3.2129, + "step": 11336 + }, + { + "epoch": 0.5278301557371324, + "grad_norm": 0.40373264611437926, + "learning_rate": 9.825361708703808e-05, + "loss": 3.1717, + "step": 11337 + }, + { + "epoch": 0.5278767139232255, + "grad_norm": 0.3858699539506796, + "learning_rate": 9.82529073718378e-05, + "loss": 3.1545, + "step": 11338 + }, + { + "epoch": 0.5279232721093187, + "grad_norm": 0.4014865547095074, + "learning_rate": 9.825219751501999e-05, + "loss": 3.1519, + "step": 11339 + }, + { + "epoch": 0.5279698302954117, + "grad_norm": 0.36415982394239615, + "learning_rate": 9.825148751658673e-05, + "loss": 3.0777, + "step": 11340 + }, + { + "epoch": 0.5280163884815048, + "grad_norm": 0.430760006397937, + "learning_rate": 9.82507773765401e-05, + "loss": 3.1381, + "step": 11341 + }, + { + "epoch": 0.5280629466675978, + "grad_norm": 0.4802978669426074, + "learning_rate": 9.82500670948822e-05, + "loss": 3.1084, + "step": 11342 + }, + { + "epoch": 0.5281095048536909, + "grad_norm": 0.3537923153614449, + "learning_rate": 9.82493566716151e-05, + "loss": 3.1801, + "step": 11343 + }, + { + "epoch": 0.528156063039784, + "grad_norm": 0.4085777451412462, + "learning_rate": 9.824864610674089e-05, + "loss": 3.1303, + "step": 11344 + }, + { + "epoch": 0.528202621225877, + "grad_norm": 0.39550637165807967, + "learning_rate": 9.824793540026167e-05, + "loss": 3.3462, + "step": 11345 + }, + { + "epoch": 0.5282491794119701, + "grad_norm": 0.37996220016483995, + "learning_rate": 9.82472245521795e-05, + "loss": 3.0584, + "step": 11346 + }, + { + "epoch": 0.5282957375980631, + "grad_norm": 0.4056079000805409, + "learning_rate": 9.824651356249648e-05, + "loss": 3.1244, + "step": 11347 + }, + { + "epoch": 0.5283422957841563, + "grad_norm": 
0.38782547652077176, + "learning_rate": 9.82458024312147e-05, + "loss": 3.0532, + "step": 11348 + }, + { + "epoch": 0.5283888539702494, + "grad_norm": 0.3959825210931301, + "learning_rate": 9.824509115833624e-05, + "loss": 3.0975, + "step": 11349 + }, + { + "epoch": 0.5284354121563424, + "grad_norm": 0.4244441855905788, + "learning_rate": 9.82443797438632e-05, + "loss": 3.1148, + "step": 11350 + }, + { + "epoch": 0.5284819703424355, + "grad_norm": 0.4704464184867464, + "learning_rate": 9.824366818779764e-05, + "loss": 3.1806, + "step": 11351 + }, + { + "epoch": 0.5285285285285285, + "grad_norm": 0.3967374321691603, + "learning_rate": 9.824295649014169e-05, + "loss": 3.174, + "step": 11352 + }, + { + "epoch": 0.5285750867146216, + "grad_norm": 0.4045303419979962, + "learning_rate": 9.82422446508974e-05, + "loss": 3.2124, + "step": 11353 + }, + { + "epoch": 0.5286216449007146, + "grad_norm": 0.6956122984422155, + "learning_rate": 9.824153267006688e-05, + "loss": 3.1592, + "step": 11354 + }, + { + "epoch": 0.5286682030868077, + "grad_norm": 0.4832196721355474, + "learning_rate": 9.824082054765221e-05, + "loss": 3.1538, + "step": 11355 + }, + { + "epoch": 0.5287147612729008, + "grad_norm": 0.49881569885804733, + "learning_rate": 9.824010828365549e-05, + "loss": 3.2215, + "step": 11356 + }, + { + "epoch": 0.5287613194589939, + "grad_norm": 0.47469346583732147, + "learning_rate": 9.82393958780788e-05, + "loss": 3.0382, + "step": 11357 + }, + { + "epoch": 0.528807877645087, + "grad_norm": 0.43630162442049136, + "learning_rate": 9.823868333092422e-05, + "loss": 3.1273, + "step": 11358 + }, + { + "epoch": 0.52885443583118, + "grad_norm": 0.4899530978141642, + "learning_rate": 9.823797064219389e-05, + "loss": 3.2561, + "step": 11359 + }, + { + "epoch": 0.5289009940172731, + "grad_norm": 0.5016002121425982, + "learning_rate": 9.823725781188987e-05, + "loss": 3.2757, + "step": 11360 + }, + { + "epoch": 0.5289475522033662, + "grad_norm": 0.4257909755778078, + "learning_rate": 9.823654484001421e-05, + "loss": 3.0552, + "step": 11361 + }, + { + "epoch": 0.5289941103894592, + "grad_norm": 0.3993895890463475, + "learning_rate": 9.823583172656907e-05, + "loss": 3.2449, + "step": 11362 + }, + { + "epoch": 0.5290406685755523, + "grad_norm": 0.44772821115465367, + "learning_rate": 9.823511847155651e-05, + "loss": 3.2152, + "step": 11363 + }, + { + "epoch": 0.5290872267616453, + "grad_norm": 0.4252756105321058, + "learning_rate": 9.823440507497863e-05, + "loss": 3.2259, + "step": 11364 + }, + { + "epoch": 0.5291337849477384, + "grad_norm": 0.42220010189294443, + "learning_rate": 9.823369153683752e-05, + "loss": 3.1453, + "step": 11365 + }, + { + "epoch": 0.5291803431338316, + "grad_norm": 0.43637399130827564, + "learning_rate": 9.823297785713528e-05, + "loss": 3.1848, + "step": 11366 + }, + { + "epoch": 0.5292269013199246, + "grad_norm": 0.4040043801990596, + "learning_rate": 9.8232264035874e-05, + "loss": 3.0285, + "step": 11367 + }, + { + "epoch": 0.5292734595060177, + "grad_norm": 0.4352373077004174, + "learning_rate": 9.823155007305579e-05, + "loss": 3.0838, + "step": 11368 + }, + { + "epoch": 0.5293200176921107, + "grad_norm": 0.4214702833400858, + "learning_rate": 9.823083596868272e-05, + "loss": 3.1246, + "step": 11369 + }, + { + "epoch": 0.5293665758782038, + "grad_norm": 0.38483805368612595, + "learning_rate": 9.82301217227569e-05, + "loss": 3.2291, + "step": 11370 + }, + { + "epoch": 0.5294131340642969, + "grad_norm": 0.41469020979885557, + "learning_rate": 9.822940733528043e-05, + "loss": 3.1246, + 
"step": 11371 + }, + { + "epoch": 0.5294596922503899, + "grad_norm": 0.36076271961380235, + "learning_rate": 9.822869280625538e-05, + "loss": 3.1005, + "step": 11372 + }, + { + "epoch": 0.529506250436483, + "grad_norm": 0.3679411220850883, + "learning_rate": 9.822797813568388e-05, + "loss": 3.0974, + "step": 11373 + }, + { + "epoch": 0.529552808622576, + "grad_norm": 0.43749641262094324, + "learning_rate": 9.822726332356802e-05, + "loss": 3.1483, + "step": 11374 + }, + { + "epoch": 0.5295993668086691, + "grad_norm": 0.3373664588802259, + "learning_rate": 9.82265483699099e-05, + "loss": 3.1942, + "step": 11375 + }, + { + "epoch": 0.5296459249947622, + "grad_norm": 0.4317220924521164, + "learning_rate": 9.82258332747116e-05, + "loss": 3.2091, + "step": 11376 + }, + { + "epoch": 0.5296924831808553, + "grad_norm": 0.426345118927404, + "learning_rate": 9.822511803797523e-05, + "loss": 3.2055, + "step": 11377 + }, + { + "epoch": 0.5297390413669484, + "grad_norm": 0.38497320641872146, + "learning_rate": 9.82244026597029e-05, + "loss": 3.1436, + "step": 11378 + }, + { + "epoch": 0.5297855995530414, + "grad_norm": 0.4479442562209067, + "learning_rate": 9.82236871398967e-05, + "loss": 3.1511, + "step": 11379 + }, + { + "epoch": 0.5298321577391345, + "grad_norm": 0.38409758752462364, + "learning_rate": 9.82229714785587e-05, + "loss": 3.2628, + "step": 11380 + }, + { + "epoch": 0.5298787159252275, + "grad_norm": 0.4052869126929921, + "learning_rate": 9.822225567569105e-05, + "loss": 3.2191, + "step": 11381 + }, + { + "epoch": 0.5299252741113206, + "grad_norm": 0.4212484931734968, + "learning_rate": 9.822153973129584e-05, + "loss": 3.1814, + "step": 11382 + }, + { + "epoch": 0.5299718322974137, + "grad_norm": 0.3693229430989966, + "learning_rate": 9.822082364537515e-05, + "loss": 3.2092, + "step": 11383 + }, + { + "epoch": 0.5300183904835067, + "grad_norm": 0.3821802340887768, + "learning_rate": 9.822010741793108e-05, + "loss": 3.048, + "step": 11384 + }, + { + "epoch": 0.5300649486695999, + "grad_norm": 0.37126419138776495, + "learning_rate": 9.821939104896577e-05, + "loss": 3.0836, + "step": 11385 + }, + { + "epoch": 0.5301115068556929, + "grad_norm": 0.36329168372576665, + "learning_rate": 9.821867453848128e-05, + "loss": 3.1268, + "step": 11386 + }, + { + "epoch": 0.530158065041786, + "grad_norm": 0.39217630160001704, + "learning_rate": 9.821795788647976e-05, + "loss": 3.201, + "step": 11387 + }, + { + "epoch": 0.5302046232278791, + "grad_norm": 0.3644793182919721, + "learning_rate": 9.821724109296325e-05, + "loss": 3.1809, + "step": 11388 + }, + { + "epoch": 0.5302511814139721, + "grad_norm": 0.35490858391533503, + "learning_rate": 9.82165241579339e-05, + "loss": 3.1098, + "step": 11389 + }, + { + "epoch": 0.5302977396000652, + "grad_norm": 0.37763982681639124, + "learning_rate": 9.821580708139379e-05, + "loss": 3.1335, + "step": 11390 + }, + { + "epoch": 0.5303442977861582, + "grad_norm": 0.3740311523175476, + "learning_rate": 9.821508986334507e-05, + "loss": 3.1862, + "step": 11391 + }, + { + "epoch": 0.5303908559722513, + "grad_norm": 0.39317390532333424, + "learning_rate": 9.821437250378978e-05, + "loss": 3.2126, + "step": 11392 + }, + { + "epoch": 0.5304374141583444, + "grad_norm": 0.3408204040363772, + "learning_rate": 9.821365500273006e-05, + "loss": 3.1589, + "step": 11393 + }, + { + "epoch": 0.5304839723444374, + "grad_norm": 0.4563144725986431, + "learning_rate": 9.821293736016803e-05, + "loss": 3.1942, + "step": 11394 + }, + { + "epoch": 0.5305305305305306, + "grad_norm": 
0.3903845583885374, + "learning_rate": 9.821221957610575e-05, + "loss": 3.22, + "step": 11395 + }, + { + "epoch": 0.5305770887166236, + "grad_norm": 0.3594498423765609, + "learning_rate": 9.821150165054538e-05, + "loss": 3.2563, + "step": 11396 + }, + { + "epoch": 0.5306236469027167, + "grad_norm": 0.4250193751389847, + "learning_rate": 9.8210783583489e-05, + "loss": 3.1908, + "step": 11397 + }, + { + "epoch": 0.5306702050888097, + "grad_norm": 0.38580594708901217, + "learning_rate": 9.821006537493871e-05, + "loss": 3.0885, + "step": 11398 + }, + { + "epoch": 0.5307167632749028, + "grad_norm": 0.3185086467602854, + "learning_rate": 9.820934702489663e-05, + "loss": 2.9815, + "step": 11399 + }, + { + "epoch": 0.5307633214609959, + "grad_norm": 0.401156543776037, + "learning_rate": 9.820862853336486e-05, + "loss": 3.1444, + "step": 11400 + }, + { + "epoch": 0.5308098796470889, + "grad_norm": 0.31832644034254737, + "learning_rate": 9.820790990034553e-05, + "loss": 3.0999, + "step": 11401 + }, + { + "epoch": 0.530856437833182, + "grad_norm": 0.3815437511830658, + "learning_rate": 9.820719112584073e-05, + "loss": 3.1429, + "step": 11402 + }, + { + "epoch": 0.530902996019275, + "grad_norm": 0.3938163725679311, + "learning_rate": 9.820647220985256e-05, + "loss": 3.2441, + "step": 11403 + }, + { + "epoch": 0.5309495542053682, + "grad_norm": 0.3834198767373707, + "learning_rate": 9.820575315238316e-05, + "loss": 3.0651, + "step": 11404 + }, + { + "epoch": 0.5309961123914613, + "grad_norm": 0.35812612650902337, + "learning_rate": 9.820503395343461e-05, + "loss": 3.1791, + "step": 11405 + }, + { + "epoch": 0.5310426705775543, + "grad_norm": 0.3710329730546975, + "learning_rate": 9.820431461300904e-05, + "loss": 3.0946, + "step": 11406 + }, + { + "epoch": 0.5310892287636474, + "grad_norm": 0.40421350626698094, + "learning_rate": 9.820359513110855e-05, + "loss": 3.1015, + "step": 11407 + }, + { + "epoch": 0.5311357869497404, + "grad_norm": 0.33911130482943774, + "learning_rate": 9.820287550773527e-05, + "loss": 2.9581, + "step": 11408 + }, + { + "epoch": 0.5311823451358335, + "grad_norm": 0.3632560396437523, + "learning_rate": 9.820215574289129e-05, + "loss": 3.033, + "step": 11409 + }, + { + "epoch": 0.5312289033219266, + "grad_norm": 0.37651240381136286, + "learning_rate": 9.820143583657873e-05, + "loss": 3.1906, + "step": 11410 + }, + { + "epoch": 0.5312754615080196, + "grad_norm": 0.3316209158494384, + "learning_rate": 9.820071578879971e-05, + "loss": 3.154, + "step": 11411 + }, + { + "epoch": 0.5313220196941127, + "grad_norm": 0.3699962521564283, + "learning_rate": 9.819999559955634e-05, + "loss": 3.1575, + "step": 11412 + }, + { + "epoch": 0.5313685778802058, + "grad_norm": 0.37972001525414906, + "learning_rate": 9.819927526885074e-05, + "loss": 3.1023, + "step": 11413 + }, + { + "epoch": 0.5314151360662989, + "grad_norm": 0.36174215620851935, + "learning_rate": 9.819855479668498e-05, + "loss": 3.2266, + "step": 11414 + }, + { + "epoch": 0.531461694252392, + "grad_norm": 0.4087617951092848, + "learning_rate": 9.819783418306123e-05, + "loss": 3.1338, + "step": 11415 + }, + { + "epoch": 0.531508252438485, + "grad_norm": 0.4182223771322768, + "learning_rate": 9.819711342798159e-05, + "loss": 3.2289, + "step": 11416 + }, + { + "epoch": 0.5315548106245781, + "grad_norm": 0.3972116849209552, + "learning_rate": 9.819639253144816e-05, + "loss": 3.1374, + "step": 11417 + }, + { + "epoch": 0.5316013688106711, + "grad_norm": 0.3983003761395412, + "learning_rate": 9.819567149346309e-05, + "loss": 3.1675, + 
"step": 11418 + }, + { + "epoch": 0.5316479269967642, + "grad_norm": 0.452739996926616, + "learning_rate": 9.819495031402845e-05, + "loss": 3.2446, + "step": 11419 + }, + { + "epoch": 0.5316944851828572, + "grad_norm": 0.4309533751182387, + "learning_rate": 9.819422899314639e-05, + "loss": 3.2127, + "step": 11420 + }, + { + "epoch": 0.5317410433689503, + "grad_norm": 0.4259997307471966, + "learning_rate": 9.819350753081901e-05, + "loss": 3.18, + "step": 11421 + }, + { + "epoch": 0.5317876015550435, + "grad_norm": 0.45742225495963373, + "learning_rate": 9.819278592704844e-05, + "loss": 3.1314, + "step": 11422 + }, + { + "epoch": 0.5318341597411365, + "grad_norm": 0.40470233122666693, + "learning_rate": 9.819206418183678e-05, + "loss": 3.2406, + "step": 11423 + }, + { + "epoch": 0.5318807179272296, + "grad_norm": 0.4147424176229711, + "learning_rate": 9.819134229518616e-05, + "loss": 3.2688, + "step": 11424 + }, + { + "epoch": 0.5319272761133226, + "grad_norm": 0.44398072910250524, + "learning_rate": 9.81906202670987e-05, + "loss": 3.1827, + "step": 11425 + }, + { + "epoch": 0.5319738342994157, + "grad_norm": 0.36234525631030473, + "learning_rate": 9.818989809757652e-05, + "loss": 3.0725, + "step": 11426 + }, + { + "epoch": 0.5320203924855088, + "grad_norm": 0.4053032629111198, + "learning_rate": 9.818917578662174e-05, + "loss": 3.1229, + "step": 11427 + }, + { + "epoch": 0.5320669506716018, + "grad_norm": 0.40802556562927583, + "learning_rate": 9.818845333423647e-05, + "loss": 3.2551, + "step": 11428 + }, + { + "epoch": 0.5321135088576949, + "grad_norm": 0.3506823897415717, + "learning_rate": 9.818773074042284e-05, + "loss": 3.0617, + "step": 11429 + }, + { + "epoch": 0.5321600670437879, + "grad_norm": 0.3837392910574324, + "learning_rate": 9.818700800518297e-05, + "loss": 3.2237, + "step": 11430 + }, + { + "epoch": 0.532206625229881, + "grad_norm": 0.4045234913088581, + "learning_rate": 9.818628512851898e-05, + "loss": 3.1617, + "step": 11431 + }, + { + "epoch": 0.5322531834159742, + "grad_norm": 0.36087627230167385, + "learning_rate": 9.818556211043297e-05, + "loss": 3.3024, + "step": 11432 + }, + { + "epoch": 0.5322997416020672, + "grad_norm": 0.3813600575678245, + "learning_rate": 9.81848389509271e-05, + "loss": 3.2464, + "step": 11433 + }, + { + "epoch": 0.5323462997881603, + "grad_norm": 0.3961586962563536, + "learning_rate": 9.818411565000347e-05, + "loss": 3.1097, + "step": 11434 + }, + { + "epoch": 0.5323928579742533, + "grad_norm": 0.38221369448954146, + "learning_rate": 9.818339220766422e-05, + "loss": 3.1919, + "step": 11435 + }, + { + "epoch": 0.5324394161603464, + "grad_norm": 0.3778467957651414, + "learning_rate": 9.818266862391146e-05, + "loss": 3.2683, + "step": 11436 + }, + { + "epoch": 0.5324859743464395, + "grad_norm": 0.3587188812382875, + "learning_rate": 9.81819448987473e-05, + "loss": 3.0733, + "step": 11437 + }, + { + "epoch": 0.5325325325325325, + "grad_norm": 0.3336357422677257, + "learning_rate": 9.818122103217389e-05, + "loss": 3.1707, + "step": 11438 + }, + { + "epoch": 0.5325790907186256, + "grad_norm": 0.4074196622588514, + "learning_rate": 9.818049702419333e-05, + "loss": 3.2284, + "step": 11439 + }, + { + "epoch": 0.5326256489047186, + "grad_norm": 0.4701504001635554, + "learning_rate": 9.817977287480777e-05, + "loss": 3.1592, + "step": 11440 + }, + { + "epoch": 0.5326722070908118, + "grad_norm": 0.40097965243516803, + "learning_rate": 9.817904858401932e-05, + "loss": 3.1396, + "step": 11441 + }, + { + "epoch": 0.5327187652769048, + "grad_norm": 
0.40269672735305795, + "learning_rate": 9.81783241518301e-05, + "loss": 3.1355, + "step": 11442 + }, + { + "epoch": 0.5327653234629979, + "grad_norm": 0.38035072142547866, + "learning_rate": 9.817759957824224e-05, + "loss": 3.243, + "step": 11443 + }, + { + "epoch": 0.532811881649091, + "grad_norm": 0.3538665192084566, + "learning_rate": 9.817687486325788e-05, + "loss": 3.0533, + "step": 11444 + }, + { + "epoch": 0.532858439835184, + "grad_norm": 0.36195997986521156, + "learning_rate": 9.817615000687915e-05, + "loss": 3.1976, + "step": 11445 + }, + { + "epoch": 0.5329049980212771, + "grad_norm": 0.414654043310713, + "learning_rate": 9.817542500910816e-05, + "loss": 3.101, + "step": 11446 + }, + { + "epoch": 0.5329515562073701, + "grad_norm": 0.3731255573222157, + "learning_rate": 9.817469986994704e-05, + "loss": 3.2127, + "step": 11447 + }, + { + "epoch": 0.5329981143934632, + "grad_norm": 0.4082354925222888, + "learning_rate": 9.817397458939792e-05, + "loss": 3.2013, + "step": 11448 + }, + { + "epoch": 0.5330446725795563, + "grad_norm": 0.3800849551485233, + "learning_rate": 9.817324916746293e-05, + "loss": 3.0173, + "step": 11449 + }, + { + "epoch": 0.5330912307656493, + "grad_norm": 0.38111108826098467, + "learning_rate": 9.817252360414421e-05, + "loss": 3.1008, + "step": 11450 + }, + { + "epoch": 0.5331377889517425, + "grad_norm": 0.378607803178384, + "learning_rate": 9.817179789944387e-05, + "loss": 3.153, + "step": 11451 + }, + { + "epoch": 0.5331843471378355, + "grad_norm": 0.3449324292352106, + "learning_rate": 9.817107205336404e-05, + "loss": 3.1313, + "step": 11452 + }, + { + "epoch": 0.5332309053239286, + "grad_norm": 0.3681075799592454, + "learning_rate": 9.817034606590688e-05, + "loss": 3.0238, + "step": 11453 + }, + { + "epoch": 0.5332774635100217, + "grad_norm": 0.3668162546853104, + "learning_rate": 9.81696199370745e-05, + "loss": 3.0079, + "step": 11454 + }, + { + "epoch": 0.5333240216961147, + "grad_norm": 0.3988452405481302, + "learning_rate": 9.816889366686901e-05, + "loss": 3.0282, + "step": 11455 + }, + { + "epoch": 0.5333705798822078, + "grad_norm": 0.397912601306496, + "learning_rate": 9.816816725529258e-05, + "loss": 3.0984, + "step": 11456 + }, + { + "epoch": 0.5334171380683008, + "grad_norm": 0.3795114836418947, + "learning_rate": 9.81674407023473e-05, + "loss": 3.207, + "step": 11457 + }, + { + "epoch": 0.5334636962543939, + "grad_norm": 0.4444593245610684, + "learning_rate": 9.816671400803536e-05, + "loss": 3.2037, + "step": 11458 + }, + { + "epoch": 0.533510254440487, + "grad_norm": 0.4176432157544677, + "learning_rate": 9.816598717235884e-05, + "loss": 3.2129, + "step": 11459 + }, + { + "epoch": 0.5335568126265801, + "grad_norm": 0.4409495375364926, + "learning_rate": 9.81652601953199e-05, + "loss": 3.1724, + "step": 11460 + }, + { + "epoch": 0.5336033708126732, + "grad_norm": 0.4330089041960479, + "learning_rate": 9.816453307692066e-05, + "loss": 3.2221, + "step": 11461 + }, + { + "epoch": 0.5336499289987662, + "grad_norm": 0.4200050854683063, + "learning_rate": 9.816380581716327e-05, + "loss": 3.1657, + "step": 11462 + }, + { + "epoch": 0.5336964871848593, + "grad_norm": 0.3898515733210021, + "learning_rate": 9.816307841604983e-05, + "loss": 3.1645, + "step": 11463 + }, + { + "epoch": 0.5337430453709523, + "grad_norm": 0.34550300291542063, + "learning_rate": 9.816235087358253e-05, + "loss": 2.9936, + "step": 11464 + }, + { + "epoch": 0.5337896035570454, + "grad_norm": 0.42536023269031276, + "learning_rate": 9.816162318976346e-05, + "loss": 3.1612, + "step": 
11465 + }, + { + "epoch": 0.5338361617431385, + "grad_norm": 0.3810492843799439, + "learning_rate": 9.816089536459479e-05, + "loss": 3.155, + "step": 11466 + }, + { + "epoch": 0.5338827199292315, + "grad_norm": 0.38335122238374075, + "learning_rate": 9.81601673980786e-05, + "loss": 3.0953, + "step": 11467 + }, + { + "epoch": 0.5339292781153246, + "grad_norm": 0.41407113912232457, + "learning_rate": 9.81594392902171e-05, + "loss": 3.0911, + "step": 11468 + }, + { + "epoch": 0.5339758363014177, + "grad_norm": 0.3415701886020871, + "learning_rate": 9.815871104101237e-05, + "loss": 3.0488, + "step": 11469 + }, + { + "epoch": 0.5340223944875108, + "grad_norm": 0.4231178002610944, + "learning_rate": 9.815798265046656e-05, + "loss": 3.2695, + "step": 11470 + }, + { + "epoch": 0.5340689526736039, + "grad_norm": 0.4605175026083961, + "learning_rate": 9.815725411858184e-05, + "loss": 3.1287, + "step": 11471 + }, + { + "epoch": 0.5341155108596969, + "grad_norm": 0.40601943235035115, + "learning_rate": 9.815652544536032e-05, + "loss": 3.1727, + "step": 11472 + }, + { + "epoch": 0.53416206904579, + "grad_norm": 0.41730165842728895, + "learning_rate": 9.815579663080411e-05, + "loss": 3.1608, + "step": 11473 + }, + { + "epoch": 0.534208627231883, + "grad_norm": 0.37842723658156097, + "learning_rate": 9.81550676749154e-05, + "loss": 3.1824, + "step": 11474 + }, + { + "epoch": 0.5342551854179761, + "grad_norm": 0.415679957216725, + "learning_rate": 9.815433857769632e-05, + "loss": 3.1382, + "step": 11475 + }, + { + "epoch": 0.5343017436040692, + "grad_norm": 0.3667127050499853, + "learning_rate": 9.8153609339149e-05, + "loss": 3.1711, + "step": 11476 + }, + { + "epoch": 0.5343483017901622, + "grad_norm": 0.3874665569028122, + "learning_rate": 9.815287995927558e-05, + "loss": 3.138, + "step": 11477 + }, + { + "epoch": 0.5343948599762554, + "grad_norm": 0.35735329557758094, + "learning_rate": 9.815215043807819e-05, + "loss": 3.1125, + "step": 11478 + }, + { + "epoch": 0.5344414181623484, + "grad_norm": 0.37979564620913264, + "learning_rate": 9.815142077555898e-05, + "loss": 3.0593, + "step": 11479 + }, + { + "epoch": 0.5344879763484415, + "grad_norm": 0.3631539794366733, + "learning_rate": 9.815069097172012e-05, + "loss": 3.2447, + "step": 11480 + }, + { + "epoch": 0.5345345345345346, + "grad_norm": 0.3836383767912352, + "learning_rate": 9.814996102656369e-05, + "loss": 3.1491, + "step": 11481 + }, + { + "epoch": 0.5345810927206276, + "grad_norm": 0.3768063830480777, + "learning_rate": 9.814923094009189e-05, + "loss": 3.2413, + "step": 11482 + }, + { + "epoch": 0.5346276509067207, + "grad_norm": 0.4188390880187332, + "learning_rate": 9.814850071230683e-05, + "loss": 3.1174, + "step": 11483 + }, + { + "epoch": 0.5346742090928137, + "grad_norm": 0.41187452352904974, + "learning_rate": 9.814777034321068e-05, + "loss": 3.1927, + "step": 11484 + }, + { + "epoch": 0.5347207672789068, + "grad_norm": 0.3493894179387715, + "learning_rate": 9.814703983280556e-05, + "loss": 3.229, + "step": 11485 + }, + { + "epoch": 0.5347673254649998, + "grad_norm": 0.36998433855742546, + "learning_rate": 9.814630918109362e-05, + "loss": 3.2203, + "step": 11486 + }, + { + "epoch": 0.534813883651093, + "grad_norm": 0.3491721684275618, + "learning_rate": 9.814557838807699e-05, + "loss": 3.158, + "step": 11487 + }, + { + "epoch": 0.5348604418371861, + "grad_norm": 0.37737053074679316, + "learning_rate": 9.814484745375784e-05, + "loss": 3.1102, + "step": 11488 + }, + { + "epoch": 0.5349070000232791, + "grad_norm": 0.4129878025478497, + 
"learning_rate": 9.814411637813831e-05, + "loss": 3.1162, + "step": 11489 + }, + { + "epoch": 0.5349535582093722, + "grad_norm": 0.39793811550398733, + "learning_rate": 9.814338516122056e-05, + "loss": 3.189, + "step": 11490 + }, + { + "epoch": 0.5350001163954652, + "grad_norm": 0.39138165806066183, + "learning_rate": 9.814265380300668e-05, + "loss": 3.2324, + "step": 11491 + }, + { + "epoch": 0.5350466745815583, + "grad_norm": 0.44259245629953947, + "learning_rate": 9.81419223034989e-05, + "loss": 3.1528, + "step": 11492 + }, + { + "epoch": 0.5350932327676514, + "grad_norm": 0.36362301263407926, + "learning_rate": 9.814119066269929e-05, + "loss": 3.1941, + "step": 11493 + }, + { + "epoch": 0.5351397909537444, + "grad_norm": 0.4555448632468082, + "learning_rate": 9.814045888061003e-05, + "loss": 3.1142, + "step": 11494 + }, + { + "epoch": 0.5351863491398375, + "grad_norm": 0.5046715513211448, + "learning_rate": 9.813972695723326e-05, + "loss": 3.2071, + "step": 11495 + }, + { + "epoch": 0.5352329073259305, + "grad_norm": 0.39341026556246345, + "learning_rate": 9.813899489257115e-05, + "loss": 3.0195, + "step": 11496 + }, + { + "epoch": 0.5352794655120237, + "grad_norm": 0.4251351004801256, + "learning_rate": 9.813826268662583e-05, + "loss": 3.1835, + "step": 11497 + }, + { + "epoch": 0.5353260236981168, + "grad_norm": 0.5453115161511995, + "learning_rate": 9.813753033939943e-05, + "loss": 3.0953, + "step": 11498 + }, + { + "epoch": 0.5353725818842098, + "grad_norm": 0.4596270395694821, + "learning_rate": 9.813679785089414e-05, + "loss": 3.1258, + "step": 11499 + }, + { + "epoch": 0.5354191400703029, + "grad_norm": 0.4158605456053234, + "learning_rate": 9.813606522111209e-05, + "loss": 3.1008, + "step": 11500 + }, + { + "epoch": 0.5354656982563959, + "grad_norm": 0.45960884095727494, + "learning_rate": 9.813533245005542e-05, + "loss": 3.1431, + "step": 11501 + }, + { + "epoch": 0.535512256442489, + "grad_norm": 0.3614127827618141, + "learning_rate": 9.81345995377263e-05, + "loss": 3.1784, + "step": 11502 + }, + { + "epoch": 0.5355588146285821, + "grad_norm": 0.42415984775889265, + "learning_rate": 9.813386648412688e-05, + "loss": 3.152, + "step": 11503 + }, + { + "epoch": 0.5356053728146751, + "grad_norm": 0.39033284869534335, + "learning_rate": 9.81331332892593e-05, + "loss": 3.1651, + "step": 11504 + }, + { + "epoch": 0.5356519310007682, + "grad_norm": 0.35124015487866933, + "learning_rate": 9.813239995312571e-05, + "loss": 3.1761, + "step": 11505 + }, + { + "epoch": 0.5356984891868612, + "grad_norm": 0.3944828914077622, + "learning_rate": 9.813166647572827e-05, + "loss": 3.1338, + "step": 11506 + }, + { + "epoch": 0.5357450473729544, + "grad_norm": 0.3690716412930036, + "learning_rate": 9.813093285706912e-05, + "loss": 3.214, + "step": 11507 + }, + { + "epoch": 0.5357916055590474, + "grad_norm": 0.3818814180646222, + "learning_rate": 9.813019909715044e-05, + "loss": 3.0245, + "step": 11508 + }, + { + "epoch": 0.5358381637451405, + "grad_norm": 0.4202947109693106, + "learning_rate": 9.812946519597437e-05, + "loss": 3.1583, + "step": 11509 + }, + { + "epoch": 0.5358847219312336, + "grad_norm": 0.4009014438114406, + "learning_rate": 9.812873115354304e-05, + "loss": 3.1338, + "step": 11510 + }, + { + "epoch": 0.5359312801173266, + "grad_norm": 0.408559427147145, + "learning_rate": 9.812799696985864e-05, + "loss": 3.2134, + "step": 11511 + }, + { + "epoch": 0.5359778383034197, + "grad_norm": 0.40208339436963403, + "learning_rate": 9.812726264492331e-05, + "loss": 3.0667, + "step": 11512 + }, 
+ { + "epoch": 0.5360243964895127, + "grad_norm": 0.35763869816859895, + "learning_rate": 9.812652817873919e-05, + "loss": 3.155, + "step": 11513 + }, + { + "epoch": 0.5360709546756058, + "grad_norm": 0.40851501813102076, + "learning_rate": 9.812579357130848e-05, + "loss": 3.1339, + "step": 11514 + }, + { + "epoch": 0.536117512861699, + "grad_norm": 0.3576585859722658, + "learning_rate": 9.812505882263329e-05, + "loss": 3.0876, + "step": 11515 + }, + { + "epoch": 0.536164071047792, + "grad_norm": 0.3869222686750184, + "learning_rate": 9.81243239327158e-05, + "loss": 3.1774, + "step": 11516 + }, + { + "epoch": 0.5362106292338851, + "grad_norm": 0.386125845187112, + "learning_rate": 9.812358890155816e-05, + "loss": 3.1603, + "step": 11517 + }, + { + "epoch": 0.5362571874199781, + "grad_norm": 0.3615736295183126, + "learning_rate": 9.812285372916251e-05, + "loss": 3.1762, + "step": 11518 + }, + { + "epoch": 0.5363037456060712, + "grad_norm": 0.3818095690394241, + "learning_rate": 9.812211841553104e-05, + "loss": 3.2039, + "step": 11519 + }, + { + "epoch": 0.5363503037921643, + "grad_norm": 0.32476934088090187, + "learning_rate": 9.812138296066588e-05, + "loss": 3.1579, + "step": 11520 + }, + { + "epoch": 0.5363968619782573, + "grad_norm": 0.39121110076991317, + "learning_rate": 9.812064736456923e-05, + "loss": 3.1186, + "step": 11521 + }, + { + "epoch": 0.5364434201643504, + "grad_norm": 0.4566268300946259, + "learning_rate": 9.81199116272432e-05, + "loss": 3.0321, + "step": 11522 + }, + { + "epoch": 0.5364899783504434, + "grad_norm": 0.35661948302053076, + "learning_rate": 9.811917574868997e-05, + "loss": 3.0879, + "step": 11523 + }, + { + "epoch": 0.5365365365365365, + "grad_norm": 0.39699554197131853, + "learning_rate": 9.81184397289117e-05, + "loss": 3.2576, + "step": 11524 + }, + { + "epoch": 0.5365830947226297, + "grad_norm": 0.405271283812723, + "learning_rate": 9.811770356791055e-05, + "loss": 3.1971, + "step": 11525 + }, + { + "epoch": 0.5366296529087227, + "grad_norm": 0.36281275492043263, + "learning_rate": 9.811696726568868e-05, + "loss": 3.1101, + "step": 11526 + }, + { + "epoch": 0.5366762110948158, + "grad_norm": 0.40903392013450085, + "learning_rate": 9.811623082224827e-05, + "loss": 3.2117, + "step": 11527 + }, + { + "epoch": 0.5367227692809088, + "grad_norm": 0.3777825128165235, + "learning_rate": 9.811549423759145e-05, + "loss": 3.257, + "step": 11528 + }, + { + "epoch": 0.5367693274670019, + "grad_norm": 0.3723955148076649, + "learning_rate": 9.811475751172039e-05, + "loss": 3.1794, + "step": 11529 + }, + { + "epoch": 0.5368158856530949, + "grad_norm": 0.40056312695919577, + "learning_rate": 9.811402064463726e-05, + "loss": 3.1522, + "step": 11530 + }, + { + "epoch": 0.536862443839188, + "grad_norm": 0.38662698246413457, + "learning_rate": 9.811328363634422e-05, + "loss": 3.1342, + "step": 11531 + }, + { + "epoch": 0.5369090020252811, + "grad_norm": 0.43768775914631475, + "learning_rate": 9.811254648684342e-05, + "loss": 3.1554, + "step": 11532 + }, + { + "epoch": 0.5369555602113741, + "grad_norm": 0.3882993284206093, + "learning_rate": 9.811180919613705e-05, + "loss": 3.1955, + "step": 11533 + }, + { + "epoch": 0.5370021183974673, + "grad_norm": 0.4187004274339593, + "learning_rate": 9.811107176422727e-05, + "loss": 3.1233, + "step": 11534 + }, + { + "epoch": 0.5370486765835603, + "grad_norm": 0.41579189105023007, + "learning_rate": 9.811033419111622e-05, + "loss": 3.1275, + "step": 11535 + }, + { + "epoch": 0.5370952347696534, + "grad_norm": 0.4071953020579748, + 
"learning_rate": 9.810959647680606e-05, + "loss": 3.0844, + "step": 11536 + }, + { + "epoch": 0.5371417929557465, + "grad_norm": 0.3786415206246717, + "learning_rate": 9.810885862129901e-05, + "loss": 3.2404, + "step": 11537 + }, + { + "epoch": 0.5371883511418395, + "grad_norm": 0.40123757829778045, + "learning_rate": 9.810812062459717e-05, + "loss": 3.1936, + "step": 11538 + }, + { + "epoch": 0.5372349093279326, + "grad_norm": 0.37801964799578996, + "learning_rate": 9.810738248670275e-05, + "loss": 3.0928, + "step": 11539 + }, + { + "epoch": 0.5372814675140256, + "grad_norm": 0.4065980768379594, + "learning_rate": 9.81066442076179e-05, + "loss": 3.2088, + "step": 11540 + }, + { + "epoch": 0.5373280257001187, + "grad_norm": 0.47620505478870684, + "learning_rate": 9.81059057873448e-05, + "loss": 3.1831, + "step": 11541 + }, + { + "epoch": 0.5373745838862118, + "grad_norm": 0.40926553866710147, + "learning_rate": 9.810516722588559e-05, + "loss": 3.0771, + "step": 11542 + }, + { + "epoch": 0.5374211420723048, + "grad_norm": 0.3767659989614617, + "learning_rate": 9.810442852324247e-05, + "loss": 3.1122, + "step": 11543 + }, + { + "epoch": 0.537467700258398, + "grad_norm": 0.3806592824704361, + "learning_rate": 9.810368967941756e-05, + "loss": 3.1263, + "step": 11544 + }, + { + "epoch": 0.537514258444491, + "grad_norm": 0.4022163504958009, + "learning_rate": 9.810295069441309e-05, + "loss": 3.115, + "step": 11545 + }, + { + "epoch": 0.5375608166305841, + "grad_norm": 0.39000116510410193, + "learning_rate": 9.810221156823119e-05, + "loss": 3.1369, + "step": 11546 + }, + { + "epoch": 0.5376073748166772, + "grad_norm": 0.3508972992136708, + "learning_rate": 9.810147230087403e-05, + "loss": 3.0001, + "step": 11547 + }, + { + "epoch": 0.5376539330027702, + "grad_norm": 0.39020283444244824, + "learning_rate": 9.810073289234379e-05, + "loss": 3.0753, + "step": 11548 + }, + { + "epoch": 0.5377004911888633, + "grad_norm": 0.4178017854385608, + "learning_rate": 9.809999334264264e-05, + "loss": 3.1617, + "step": 11549 + }, + { + "epoch": 0.5377470493749563, + "grad_norm": 0.4159770431293474, + "learning_rate": 9.809925365177275e-05, + "loss": 3.1082, + "step": 11550 + }, + { + "epoch": 0.5377936075610494, + "grad_norm": 0.4053981671507685, + "learning_rate": 9.809851381973629e-05, + "loss": 2.9604, + "step": 11551 + }, + { + "epoch": 0.5378401657471424, + "grad_norm": 0.39020513106379684, + "learning_rate": 9.809777384653543e-05, + "loss": 3.175, + "step": 11552 + }, + { + "epoch": 0.5378867239332356, + "grad_norm": 0.38814429137529183, + "learning_rate": 9.809703373217234e-05, + "loss": 3.103, + "step": 11553 + }, + { + "epoch": 0.5379332821193287, + "grad_norm": 0.3644994584147921, + "learning_rate": 9.80962934766492e-05, + "loss": 3.1154, + "step": 11554 + }, + { + "epoch": 0.5379798403054217, + "grad_norm": 0.45758444595077535, + "learning_rate": 9.809555307996818e-05, + "loss": 3.0351, + "step": 11555 + }, + { + "epoch": 0.5380263984915148, + "grad_norm": 0.4104919817147625, + "learning_rate": 9.809481254213142e-05, + "loss": 3.1772, + "step": 11556 + }, + { + "epoch": 0.5380729566776078, + "grad_norm": 0.377766293629399, + "learning_rate": 9.809407186314116e-05, + "loss": 3.1424, + "step": 11557 + }, + { + "epoch": 0.5381195148637009, + "grad_norm": 0.4254001371580179, + "learning_rate": 9.809333104299952e-05, + "loss": 3.1044, + "step": 11558 + }, + { + "epoch": 0.538166073049794, + "grad_norm": 0.42969256421289614, + "learning_rate": 9.80925900817087e-05, + "loss": 3.2042, + "step": 11559 + }, + { 
+ "epoch": 0.538212631235887, + "grad_norm": 0.4176006262629195, + "learning_rate": 9.809184897927086e-05, + "loss": 3.1177, + "step": 11560 + }, + { + "epoch": 0.5382591894219801, + "grad_norm": 0.38290179838072413, + "learning_rate": 9.809110773568818e-05, + "loss": 3.2113, + "step": 11561 + }, + { + "epoch": 0.5383057476080731, + "grad_norm": 0.3813001255115419, + "learning_rate": 9.809036635096282e-05, + "loss": 3.1355, + "step": 11562 + }, + { + "epoch": 0.5383523057941663, + "grad_norm": 0.3708909128569511, + "learning_rate": 9.8089624825097e-05, + "loss": 3.1562, + "step": 11563 + }, + { + "epoch": 0.5383988639802594, + "grad_norm": 0.3858995480484267, + "learning_rate": 9.808888315809285e-05, + "loss": 3.1543, + "step": 11564 + }, + { + "epoch": 0.5384454221663524, + "grad_norm": 0.3772896369364363, + "learning_rate": 9.808814134995259e-05, + "loss": 3.1761, + "step": 11565 + }, + { + "epoch": 0.5384919803524455, + "grad_norm": 0.371884919885837, + "learning_rate": 9.808739940067835e-05, + "loss": 3.1698, + "step": 11566 + }, + { + "epoch": 0.5385385385385385, + "grad_norm": 0.38629905243408036, + "learning_rate": 9.808665731027233e-05, + "loss": 3.2363, + "step": 11567 + }, + { + "epoch": 0.5385850967246316, + "grad_norm": 0.38569843080667526, + "learning_rate": 9.808591507873672e-05, + "loss": 3.1689, + "step": 11568 + }, + { + "epoch": 0.5386316549107247, + "grad_norm": 0.42160971255629437, + "learning_rate": 9.808517270607369e-05, + "loss": 3.1109, + "step": 11569 + }, + { + "epoch": 0.5386782130968177, + "grad_norm": 0.32022869953688604, + "learning_rate": 9.808443019228539e-05, + "loss": 3.0528, + "step": 11570 + }, + { + "epoch": 0.5387247712829109, + "grad_norm": 0.4054069648447172, + "learning_rate": 9.808368753737404e-05, + "loss": 3.037, + "step": 11571 + }, + { + "epoch": 0.5387713294690039, + "grad_norm": 0.39548041096866576, + "learning_rate": 9.808294474134183e-05, + "loss": 3.0358, + "step": 11572 + }, + { + "epoch": 0.538817887655097, + "grad_norm": 0.3849276731192837, + "learning_rate": 9.808220180419087e-05, + "loss": 3.0844, + "step": 11573 + }, + { + "epoch": 0.53886444584119, + "grad_norm": 0.4202600322801972, + "learning_rate": 9.80814587259234e-05, + "loss": 3.1084, + "step": 11574 + }, + { + "epoch": 0.5389110040272831, + "grad_norm": 0.42551126846632803, + "learning_rate": 9.80807155065416e-05, + "loss": 3.0672, + "step": 11575 + }, + { + "epoch": 0.5389575622133762, + "grad_norm": 0.3632691896007223, + "learning_rate": 9.807997214604763e-05, + "loss": 3.149, + "step": 11576 + }, + { + "epoch": 0.5390041203994692, + "grad_norm": 0.4409722688897547, + "learning_rate": 9.807922864444368e-05, + "loss": 3.1553, + "step": 11577 + }, + { + "epoch": 0.5390506785855623, + "grad_norm": 0.41837405135090683, + "learning_rate": 9.807848500173191e-05, + "loss": 3.218, + "step": 11578 + }, + { + "epoch": 0.5390972367716553, + "grad_norm": 0.42974118319062676, + "learning_rate": 9.807774121791454e-05, + "loss": 3.0769, + "step": 11579 + }, + { + "epoch": 0.5391437949577484, + "grad_norm": 0.401928244798061, + "learning_rate": 9.807699729299374e-05, + "loss": 3.0523, + "step": 11580 + }, + { + "epoch": 0.5391903531438416, + "grad_norm": 0.41657977015122777, + "learning_rate": 9.807625322697169e-05, + "loss": 3.1325, + "step": 11581 + }, + { + "epoch": 0.5392369113299346, + "grad_norm": 0.40142282856968975, + "learning_rate": 9.807550901985056e-05, + "loss": 3.1533, + "step": 11582 + }, + { + "epoch": 0.5392834695160277, + "grad_norm": 0.42909978863603027, + 
"learning_rate": 9.807476467163254e-05, + "loss": 3.1831, + "step": 11583 + }, + { + "epoch": 0.5393300277021207, + "grad_norm": 0.3679047293080683, + "learning_rate": 9.807402018231984e-05, + "loss": 3.082, + "step": 11584 + }, + { + "epoch": 0.5393765858882138, + "grad_norm": 0.42017531835059957, + "learning_rate": 9.807327555191464e-05, + "loss": 3.2265, + "step": 11585 + }, + { + "epoch": 0.5394231440743069, + "grad_norm": 0.40390451717673237, + "learning_rate": 9.80725307804191e-05, + "loss": 3.0627, + "step": 11586 + }, + { + "epoch": 0.5394697022603999, + "grad_norm": 0.37606582847506875, + "learning_rate": 9.80717858678354e-05, + "loss": 3.1205, + "step": 11587 + }, + { + "epoch": 0.539516260446493, + "grad_norm": 0.3900152036806551, + "learning_rate": 9.807104081416576e-05, + "loss": 3.1261, + "step": 11588 + }, + { + "epoch": 0.539562818632586, + "grad_norm": 0.3532339477929682, + "learning_rate": 9.807029561941235e-05, + "loss": 3.1877, + "step": 11589 + }, + { + "epoch": 0.5396093768186792, + "grad_norm": 0.3842677454763846, + "learning_rate": 9.806955028357734e-05, + "loss": 3.172, + "step": 11590 + }, + { + "epoch": 0.5396559350047723, + "grad_norm": 0.3551336492360729, + "learning_rate": 9.806880480666296e-05, + "loss": 3.317, + "step": 11591 + }, + { + "epoch": 0.5397024931908653, + "grad_norm": 0.3344581297881585, + "learning_rate": 9.806805918867136e-05, + "loss": 3.145, + "step": 11592 + }, + { + "epoch": 0.5397490513769584, + "grad_norm": 0.4036972547890114, + "learning_rate": 9.806731342960476e-05, + "loss": 3.2171, + "step": 11593 + }, + { + "epoch": 0.5397956095630514, + "grad_norm": 0.43436901594842164, + "learning_rate": 9.806656752946531e-05, + "loss": 3.1491, + "step": 11594 + }, + { + "epoch": 0.5398421677491445, + "grad_norm": 0.3409168325056977, + "learning_rate": 9.806582148825523e-05, + "loss": 3.1972, + "step": 11595 + }, + { + "epoch": 0.5398887259352375, + "grad_norm": 0.4055856153813149, + "learning_rate": 9.80650753059767e-05, + "loss": 3.0208, + "step": 11596 + }, + { + "epoch": 0.5399352841213306, + "grad_norm": 0.3663467018627552, + "learning_rate": 9.806432898263191e-05, + "loss": 3.1618, + "step": 11597 + }, + { + "epoch": 0.5399818423074237, + "grad_norm": 0.39302667562889454, + "learning_rate": 9.806358251822303e-05, + "loss": 3.1163, + "step": 11598 + }, + { + "epoch": 0.5400284004935167, + "grad_norm": 0.40370475331597305, + "learning_rate": 9.806283591275229e-05, + "loss": 3.1469, + "step": 11599 + }, + { + "epoch": 0.5400749586796099, + "grad_norm": 0.39335051636984436, + "learning_rate": 9.806208916622186e-05, + "loss": 3.0029, + "step": 11600 + }, + { + "epoch": 0.5401215168657029, + "grad_norm": 0.3917944065922821, + "learning_rate": 9.806134227863392e-05, + "loss": 3.0125, + "step": 11601 + }, + { + "epoch": 0.540168075051796, + "grad_norm": 0.43032095832216805, + "learning_rate": 9.806059524999069e-05, + "loss": 3.1173, + "step": 11602 + }, + { + "epoch": 0.5402146332378891, + "grad_norm": 0.35954508595847695, + "learning_rate": 9.805984808029435e-05, + "loss": 3.1311, + "step": 11603 + }, + { + "epoch": 0.5402611914239821, + "grad_norm": 0.41515226174317527, + "learning_rate": 9.805910076954709e-05, + "loss": 3.1376, + "step": 11604 + }, + { + "epoch": 0.5403077496100752, + "grad_norm": 0.3649873034047971, + "learning_rate": 9.805835331775109e-05, + "loss": 3.2284, + "step": 11605 + }, + { + "epoch": 0.5403543077961682, + "grad_norm": 0.3571775405004511, + "learning_rate": 9.805760572490856e-05, + "loss": 3.1644, + "step": 11606 + }, + 
{ + "epoch": 0.5404008659822613, + "grad_norm": 0.39201407776773295, + "learning_rate": 9.80568579910217e-05, + "loss": 3.1435, + "step": 11607 + }, + { + "epoch": 0.5404474241683545, + "grad_norm": 0.3789681463800797, + "learning_rate": 9.80561101160927e-05, + "loss": 3.1732, + "step": 11608 + }, + { + "epoch": 0.5404939823544475, + "grad_norm": 0.3670155861763701, + "learning_rate": 9.805536210012374e-05, + "loss": 3.232, + "step": 11609 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.36589924360763965, + "learning_rate": 9.805461394311704e-05, + "loss": 3.0583, + "step": 11610 + }, + { + "epoch": 0.5405870987266336, + "grad_norm": 0.35565118471933005, + "learning_rate": 9.805386564507478e-05, + "loss": 3.1704, + "step": 11611 + }, + { + "epoch": 0.5406336569127267, + "grad_norm": 0.43812718564844544, + "learning_rate": 9.805311720599913e-05, + "loss": 3.1828, + "step": 11612 + }, + { + "epoch": 0.5406802150988198, + "grad_norm": 0.3956428022301705, + "learning_rate": 9.805236862589235e-05, + "loss": 3.1363, + "step": 11613 + }, + { + "epoch": 0.5407267732849128, + "grad_norm": 0.3763406543551393, + "learning_rate": 9.805161990475657e-05, + "loss": 3.0434, + "step": 11614 + }, + { + "epoch": 0.5407733314710059, + "grad_norm": 0.4144341348536821, + "learning_rate": 9.805087104259405e-05, + "loss": 3.0788, + "step": 11615 + }, + { + "epoch": 0.5408198896570989, + "grad_norm": 0.3521146219969872, + "learning_rate": 9.805012203940694e-05, + "loss": 3.1324, + "step": 11616 + }, + { + "epoch": 0.540866447843192, + "grad_norm": 0.3632064945129765, + "learning_rate": 9.804937289519746e-05, + "loss": 3.0753, + "step": 11617 + }, + { + "epoch": 0.540913006029285, + "grad_norm": 0.3455083107741963, + "learning_rate": 9.80486236099678e-05, + "loss": 3.092, + "step": 11618 + }, + { + "epoch": 0.5409595642153782, + "grad_norm": 0.4086838376059283, + "learning_rate": 9.804787418372018e-05, + "loss": 3.1315, + "step": 11619 + }, + { + "epoch": 0.5410061224014713, + "grad_norm": 0.3360465668522402, + "learning_rate": 9.804712461645677e-05, + "loss": 2.9952, + "step": 11620 + }, + { + "epoch": 0.5410526805875643, + "grad_norm": 0.40031614316835334, + "learning_rate": 9.804637490817978e-05, + "loss": 3.1376, + "step": 11621 + }, + { + "epoch": 0.5410992387736574, + "grad_norm": 0.37816343665594687, + "learning_rate": 9.80456250588914e-05, + "loss": 3.1341, + "step": 11622 + }, + { + "epoch": 0.5411457969597504, + "grad_norm": 0.38695579710404, + "learning_rate": 9.804487506859388e-05, + "loss": 3.1482, + "step": 11623 + }, + { + "epoch": 0.5411923551458435, + "grad_norm": 0.4285720630814303, + "learning_rate": 9.804412493728937e-05, + "loss": 3.1218, + "step": 11624 + }, + { + "epoch": 0.5412389133319366, + "grad_norm": 0.3627355972373202, + "learning_rate": 9.804337466498008e-05, + "loss": 3.1894, + "step": 11625 + }, + { + "epoch": 0.5412854715180296, + "grad_norm": 0.39967644780968203, + "learning_rate": 9.804262425166821e-05, + "loss": 3.1071, + "step": 11626 + }, + { + "epoch": 0.5413320297041228, + "grad_norm": 0.36071036325510797, + "learning_rate": 9.804187369735598e-05, + "loss": 3.1386, + "step": 11627 + }, + { + "epoch": 0.5413785878902158, + "grad_norm": 0.34465892448826696, + "learning_rate": 9.804112300204559e-05, + "loss": 3.2297, + "step": 11628 + }, + { + "epoch": 0.5414251460763089, + "grad_norm": 0.33441780010467315, + "learning_rate": 9.804037216573923e-05, + "loss": 3.1311, + "step": 11629 + }, + { + "epoch": 0.541471704262402, + "grad_norm": 0.37094051638033015, + 
"learning_rate": 9.803962118843911e-05, + "loss": 3.1297, + "step": 11630 + }, + { + "epoch": 0.541518262448495, + "grad_norm": 0.3335909835154337, + "learning_rate": 9.803887007014744e-05, + "loss": 3.1212, + "step": 11631 + }, + { + "epoch": 0.5415648206345881, + "grad_norm": 0.42212669169954437, + "learning_rate": 9.803811881086642e-05, + "loss": 3.1214, + "step": 11632 + }, + { + "epoch": 0.5416113788206811, + "grad_norm": 0.35139571028101113, + "learning_rate": 9.803736741059824e-05, + "loss": 3.1753, + "step": 11633 + }, + { + "epoch": 0.5416579370067742, + "grad_norm": 0.39305049642626494, + "learning_rate": 9.803661586934514e-05, + "loss": 3.2155, + "step": 11634 + }, + { + "epoch": 0.5417044951928673, + "grad_norm": 0.39637537519301114, + "learning_rate": 9.803586418710929e-05, + "loss": 3.1192, + "step": 11635 + }, + { + "epoch": 0.5417510533789603, + "grad_norm": 0.33150891376099745, + "learning_rate": 9.803511236389291e-05, + "loss": 3.0645, + "step": 11636 + }, + { + "epoch": 0.5417976115650535, + "grad_norm": 0.4278642267678431, + "learning_rate": 9.803436039969821e-05, + "loss": 3.2189, + "step": 11637 + }, + { + "epoch": 0.5418441697511465, + "grad_norm": 0.3743696568105476, + "learning_rate": 9.80336082945274e-05, + "loss": 3.2296, + "step": 11638 + }, + { + "epoch": 0.5418907279372396, + "grad_norm": 0.3902176272531579, + "learning_rate": 9.803285604838267e-05, + "loss": 3.0605, + "step": 11639 + }, + { + "epoch": 0.5419372861233326, + "grad_norm": 0.3414514373531976, + "learning_rate": 9.803210366126626e-05, + "loss": 3.165, + "step": 11640 + }, + { + "epoch": 0.5419838443094257, + "grad_norm": 0.3870795434858923, + "learning_rate": 9.803135113318034e-05, + "loss": 3.2076, + "step": 11641 + }, + { + "epoch": 0.5420304024955188, + "grad_norm": 0.36082577195454, + "learning_rate": 9.803059846412714e-05, + "loss": 3.1215, + "step": 11642 + }, + { + "epoch": 0.5420769606816118, + "grad_norm": 0.372982823553962, + "learning_rate": 9.802984565410887e-05, + "loss": 3.1271, + "step": 11643 + }, + { + "epoch": 0.5421235188677049, + "grad_norm": 0.36646191610364565, + "learning_rate": 9.802909270312772e-05, + "loss": 3.141, + "step": 11644 + }, + { + "epoch": 0.5421700770537979, + "grad_norm": 0.42099462221188644, + "learning_rate": 9.802833961118592e-05, + "loss": 3.1419, + "step": 11645 + }, + { + "epoch": 0.542216635239891, + "grad_norm": 0.3644267813093811, + "learning_rate": 9.802758637828569e-05, + "loss": 3.1805, + "step": 11646 + }, + { + "epoch": 0.5422631934259842, + "grad_norm": 0.3781900549997282, + "learning_rate": 9.802683300442921e-05, + "loss": 3.2337, + "step": 11647 + }, + { + "epoch": 0.5423097516120772, + "grad_norm": 0.4052559547934524, + "learning_rate": 9.802607948961872e-05, + "loss": 3.2439, + "step": 11648 + }, + { + "epoch": 0.5423563097981703, + "grad_norm": 0.3820448858190945, + "learning_rate": 9.802532583385641e-05, + "loss": 3.0943, + "step": 11649 + }, + { + "epoch": 0.5424028679842633, + "grad_norm": 0.33745613460719465, + "learning_rate": 9.80245720371445e-05, + "loss": 3.14, + "step": 11650 + }, + { + "epoch": 0.5424494261703564, + "grad_norm": 0.40911513118086695, + "learning_rate": 9.80238180994852e-05, + "loss": 3.1113, + "step": 11651 + }, + { + "epoch": 0.5424959843564495, + "grad_norm": 0.3281513055033858, + "learning_rate": 9.802306402088073e-05, + "loss": 3.1594, + "step": 11652 + }, + { + "epoch": 0.5425425425425425, + "grad_norm": 0.385249360806139, + "learning_rate": 9.802230980133329e-05, + "loss": 3.0814, + "step": 11653 + }, + { + 
"epoch": 0.5425891007286356, + "grad_norm": 0.365123054494381, + "learning_rate": 9.802155544084511e-05, + "loss": 3.1848, + "step": 11654 + }, + { + "epoch": 0.5426356589147286, + "grad_norm": 0.36144164142394036, + "learning_rate": 9.802080093941839e-05, + "loss": 3.1194, + "step": 11655 + }, + { + "epoch": 0.5426822171008218, + "grad_norm": 0.3704687116182649, + "learning_rate": 9.802004629705536e-05, + "loss": 3.1496, + "step": 11656 + }, + { + "epoch": 0.5427287752869148, + "grad_norm": 0.4060077333250648, + "learning_rate": 9.801929151375821e-05, + "loss": 3.1598, + "step": 11657 + }, + { + "epoch": 0.5427753334730079, + "grad_norm": 0.380014621241597, + "learning_rate": 9.801853658952918e-05, + "loss": 3.1073, + "step": 11658 + }, + { + "epoch": 0.542821891659101, + "grad_norm": 0.36875488480210783, + "learning_rate": 9.801778152437048e-05, + "loss": 3.1684, + "step": 11659 + }, + { + "epoch": 0.542868449845194, + "grad_norm": 0.36489713025459247, + "learning_rate": 9.801702631828431e-05, + "loss": 3.1073, + "step": 11660 + }, + { + "epoch": 0.5429150080312871, + "grad_norm": 0.3855989127607105, + "learning_rate": 9.80162709712729e-05, + "loss": 3.122, + "step": 11661 + }, + { + "epoch": 0.5429615662173801, + "grad_norm": 0.4614626707704951, + "learning_rate": 9.801551548333849e-05, + "loss": 3.1781, + "step": 11662 + }, + { + "epoch": 0.5430081244034732, + "grad_norm": 0.3875623576499196, + "learning_rate": 9.801475985448324e-05, + "loss": 3.1752, + "step": 11663 + }, + { + "epoch": 0.5430546825895664, + "grad_norm": 0.37985562366598613, + "learning_rate": 9.801400408470942e-05, + "loss": 3.1406, + "step": 11664 + }, + { + "epoch": 0.5431012407756594, + "grad_norm": 0.37710446051353985, + "learning_rate": 9.801324817401922e-05, + "loss": 3.0962, + "step": 11665 + }, + { + "epoch": 0.5431477989617525, + "grad_norm": 0.3462636777365871, + "learning_rate": 9.801249212241485e-05, + "loss": 3.0645, + "step": 11666 + }, + { + "epoch": 0.5431943571478455, + "grad_norm": 0.4071001572250672, + "learning_rate": 9.801173592989857e-05, + "loss": 3.07, + "step": 11667 + }, + { + "epoch": 0.5432409153339386, + "grad_norm": 0.3908486415529599, + "learning_rate": 9.801097959647256e-05, + "loss": 3.1677, + "step": 11668 + }, + { + "epoch": 0.5432874735200317, + "grad_norm": 0.368903210825823, + "learning_rate": 9.801022312213908e-05, + "loss": 3.1825, + "step": 11669 + }, + { + "epoch": 0.5433340317061247, + "grad_norm": 0.38867702980650576, + "learning_rate": 9.80094665069003e-05, + "loss": 3.1099, + "step": 11670 + }, + { + "epoch": 0.5433805898922178, + "grad_norm": 0.36529455353072315, + "learning_rate": 9.800870975075846e-05, + "loss": 3.1406, + "step": 11671 + }, + { + "epoch": 0.5434271480783108, + "grad_norm": 0.35814540481923735, + "learning_rate": 9.80079528537158e-05, + "loss": 3.1388, + "step": 11672 + }, + { + "epoch": 0.5434737062644039, + "grad_norm": 0.4276961311634838, + "learning_rate": 9.800719581577454e-05, + "loss": 3.1938, + "step": 11673 + }, + { + "epoch": 0.5435202644504971, + "grad_norm": 0.3488785285763001, + "learning_rate": 9.800643863693687e-05, + "loss": 3.149, + "step": 11674 + }, + { + "epoch": 0.5435668226365901, + "grad_norm": 0.3908923996747102, + "learning_rate": 9.800568131720504e-05, + "loss": 3.1339, + "step": 11675 + }, + { + "epoch": 0.5436133808226832, + "grad_norm": 0.3992897587791774, + "learning_rate": 9.800492385658126e-05, + "loss": 3.1501, + "step": 11676 + }, + { + "epoch": 0.5436599390087762, + "grad_norm": 0.3787718282130665, + "learning_rate": 
9.800416625506777e-05, + "loss": 3.0795, + "step": 11677 + }, + { + "epoch": 0.5437064971948693, + "grad_norm": 0.4418372031268775, + "learning_rate": 9.800340851266676e-05, + "loss": 3.1456, + "step": 11678 + }, + { + "epoch": 0.5437530553809623, + "grad_norm": 0.3955092021439276, + "learning_rate": 9.80026506293805e-05, + "loss": 3.1001, + "step": 11679 + }, + { + "epoch": 0.5437996135670554, + "grad_norm": 0.37134421238211446, + "learning_rate": 9.800189260521118e-05, + "loss": 3.1201, + "step": 11680 + }, + { + "epoch": 0.5438461717531485, + "grad_norm": 0.45383304042545125, + "learning_rate": 9.800113444016103e-05, + "loss": 3.1395, + "step": 11681 + }, + { + "epoch": 0.5438927299392415, + "grad_norm": 0.4093746862416435, + "learning_rate": 9.800037613423228e-05, + "loss": 3.161, + "step": 11682 + }, + { + "epoch": 0.5439392881253347, + "grad_norm": 0.41113301498688076, + "learning_rate": 9.799961768742715e-05, + "loss": 3.1117, + "step": 11683 + }, + { + "epoch": 0.5439858463114277, + "grad_norm": 0.49671332641440774, + "learning_rate": 9.799885909974788e-05, + "loss": 3.2213, + "step": 11684 + }, + { + "epoch": 0.5440324044975208, + "grad_norm": 0.46984717342477356, + "learning_rate": 9.799810037119667e-05, + "loss": 3.0445, + "step": 11685 + }, + { + "epoch": 0.5440789626836139, + "grad_norm": 0.3612861291044888, + "learning_rate": 9.799734150177578e-05, + "loss": 3.0296, + "step": 11686 + }, + { + "epoch": 0.5441255208697069, + "grad_norm": 0.4156805433130793, + "learning_rate": 9.799658249148742e-05, + "loss": 3.0603, + "step": 11687 + }, + { + "epoch": 0.5441720790558, + "grad_norm": 0.4009254017422998, + "learning_rate": 9.799582334033382e-05, + "loss": 3.0749, + "step": 11688 + }, + { + "epoch": 0.544218637241893, + "grad_norm": 0.40870255950032564, + "learning_rate": 9.79950640483172e-05, + "loss": 3.1499, + "step": 11689 + }, + { + "epoch": 0.5442651954279861, + "grad_norm": 0.4407547778570246, + "learning_rate": 9.79943046154398e-05, + "loss": 3.1119, + "step": 11690 + }, + { + "epoch": 0.5443117536140792, + "grad_norm": 0.3819817908795679, + "learning_rate": 9.799354504170385e-05, + "loss": 3.0729, + "step": 11691 + }, + { + "epoch": 0.5443583118001722, + "grad_norm": 0.4179908345705376, + "learning_rate": 9.799278532711156e-05, + "loss": 3.2139, + "step": 11692 + }, + { + "epoch": 0.5444048699862654, + "grad_norm": 0.3813242853954428, + "learning_rate": 9.799202547166519e-05, + "loss": 3.0663, + "step": 11693 + }, + { + "epoch": 0.5444514281723584, + "grad_norm": 0.42321565668639244, + "learning_rate": 9.799126547536694e-05, + "loss": 3.1469, + "step": 11694 + }, + { + "epoch": 0.5444979863584515, + "grad_norm": 0.4431182061789189, + "learning_rate": 9.799050533821905e-05, + "loss": 3.2176, + "step": 11695 + }, + { + "epoch": 0.5445445445445446, + "grad_norm": 0.46785082078423634, + "learning_rate": 9.798974506022378e-05, + "loss": 3.2525, + "step": 11696 + }, + { + "epoch": 0.5445911027306376, + "grad_norm": 0.40720090704319517, + "learning_rate": 9.79889846413833e-05, + "loss": 3.1254, + "step": 11697 + }, + { + "epoch": 0.5446376609167307, + "grad_norm": 0.4668216732091799, + "learning_rate": 9.79882240816999e-05, + "loss": 3.0019, + "step": 11698 + }, + { + "epoch": 0.5446842191028237, + "grad_norm": 0.4017037141416014, + "learning_rate": 9.798746338117579e-05, + "loss": 3.1796, + "step": 11699 + }, + { + "epoch": 0.5447307772889168, + "grad_norm": 0.42788820579766673, + "learning_rate": 9.79867025398132e-05, + "loss": 3.1866, + "step": 11700 + }, + { + "epoch": 
0.5447773354750098, + "grad_norm": 0.43652572851663896, + "learning_rate": 9.798594155761436e-05, + "loss": 3.0562, + "step": 11701 + }, + { + "epoch": 0.544823893661103, + "grad_norm": 0.4489051446746279, + "learning_rate": 9.798518043458152e-05, + "loss": 3.1403, + "step": 11702 + }, + { + "epoch": 0.5448704518471961, + "grad_norm": 0.42234603325320863, + "learning_rate": 9.798441917071688e-05, + "loss": 3.1553, + "step": 11703 + }, + { + "epoch": 0.5449170100332891, + "grad_norm": 0.48590723210864173, + "learning_rate": 9.798365776602272e-05, + "loss": 3.0754, + "step": 11704 + }, + { + "epoch": 0.5449635682193822, + "grad_norm": 0.4123758158617388, + "learning_rate": 9.798289622050125e-05, + "loss": 3.1632, + "step": 11705 + }, + { + "epoch": 0.5450101264054752, + "grad_norm": 0.3878096436688302, + "learning_rate": 9.79821345341547e-05, + "loss": 3.0803, + "step": 11706 + }, + { + "epoch": 0.5450566845915683, + "grad_norm": 0.39130520597085094, + "learning_rate": 9.79813727069853e-05, + "loss": 3.1351, + "step": 11707 + }, + { + "epoch": 0.5451032427776614, + "grad_norm": 0.42140813679160743, + "learning_rate": 9.798061073899531e-05, + "loss": 3.0635, + "step": 11708 + }, + { + "epoch": 0.5451498009637544, + "grad_norm": 0.38197783510518096, + "learning_rate": 9.797984863018696e-05, + "loss": 3.1992, + "step": 11709 + }, + { + "epoch": 0.5451963591498475, + "grad_norm": 0.3836116946336823, + "learning_rate": 9.797908638056248e-05, + "loss": 3.1459, + "step": 11710 + }, + { + "epoch": 0.5452429173359405, + "grad_norm": 0.4127703752446022, + "learning_rate": 9.797832399012409e-05, + "loss": 3.0956, + "step": 11711 + }, + { + "epoch": 0.5452894755220337, + "grad_norm": 0.3920367161822907, + "learning_rate": 9.797756145887408e-05, + "loss": 3.1142, + "step": 11712 + }, + { + "epoch": 0.5453360337081268, + "grad_norm": 0.3937342980046534, + "learning_rate": 9.797679878681463e-05, + "loss": 3.1953, + "step": 11713 + }, + { + "epoch": 0.5453825918942198, + "grad_norm": 0.3934863085301259, + "learning_rate": 9.7976035973948e-05, + "loss": 3.1373, + "step": 11714 + }, + { + "epoch": 0.5454291500803129, + "grad_norm": 0.3980642596722514, + "learning_rate": 9.797527302027645e-05, + "loss": 3.1019, + "step": 11715 + }, + { + "epoch": 0.5454757082664059, + "grad_norm": 0.36457517600747097, + "learning_rate": 9.797450992580218e-05, + "loss": 3.0957, + "step": 11716 + }, + { + "epoch": 0.545522266452499, + "grad_norm": 0.37289102343739855, + "learning_rate": 9.797374669052746e-05, + "loss": 3.0976, + "step": 11717 + }, + { + "epoch": 0.5455688246385921, + "grad_norm": 0.3617932913245918, + "learning_rate": 9.797298331445451e-05, + "loss": 3.0329, + "step": 11718 + }, + { + "epoch": 0.5456153828246851, + "grad_norm": 0.3698903070414623, + "learning_rate": 9.79722197975856e-05, + "loss": 3.0857, + "step": 11719 + }, + { + "epoch": 0.5456619410107783, + "grad_norm": 0.34524388274739864, + "learning_rate": 9.797145613992294e-05, + "loss": 3.1971, + "step": 11720 + }, + { + "epoch": 0.5457084991968713, + "grad_norm": 0.35968433880726547, + "learning_rate": 9.797069234146877e-05, + "loss": 3.0926, + "step": 11721 + }, + { + "epoch": 0.5457550573829644, + "grad_norm": 0.3806313283306614, + "learning_rate": 9.796992840222535e-05, + "loss": 3.1386, + "step": 11722 + }, + { + "epoch": 0.5458016155690574, + "grad_norm": 0.3616568740195854, + "learning_rate": 9.796916432219491e-05, + "loss": 3.1168, + "step": 11723 + }, + { + "epoch": 0.5458481737551505, + "grad_norm": 0.3634684897694568, + "learning_rate": 
9.796840010137972e-05, + "loss": 3.2182, + "step": 11724 + }, + { + "epoch": 0.5458947319412436, + "grad_norm": 0.36043716008506455, + "learning_rate": 9.796763573978198e-05, + "loss": 3.2174, + "step": 11725 + }, + { + "epoch": 0.5459412901273366, + "grad_norm": 0.346829742852236, + "learning_rate": 9.796687123740397e-05, + "loss": 3.1131, + "step": 11726 + }, + { + "epoch": 0.5459878483134297, + "grad_norm": 0.3774574005479834, + "learning_rate": 9.796610659424792e-05, + "loss": 2.9988, + "step": 11727 + }, + { + "epoch": 0.5460344064995227, + "grad_norm": 0.409287283891124, + "learning_rate": 9.796534181031606e-05, + "loss": 3.1121, + "step": 11728 + }, + { + "epoch": 0.5460809646856158, + "grad_norm": 0.38957072993379305, + "learning_rate": 9.796457688561065e-05, + "loss": 3.0856, + "step": 11729 + }, + { + "epoch": 0.546127522871709, + "grad_norm": 0.35306175112060456, + "learning_rate": 9.796381182013393e-05, + "loss": 3.1687, + "step": 11730 + }, + { + "epoch": 0.546174081057802, + "grad_norm": 0.3811994852685204, + "learning_rate": 9.796304661388815e-05, + "loss": 3.2255, + "step": 11731 + }, + { + "epoch": 0.5462206392438951, + "grad_norm": 0.4056908688183903, + "learning_rate": 9.796228126687556e-05, + "loss": 3.0945, + "step": 11732 + }, + { + "epoch": 0.5462671974299881, + "grad_norm": 0.3209186595655906, + "learning_rate": 9.79615157790984e-05, + "loss": 3.0909, + "step": 11733 + }, + { + "epoch": 0.5463137556160812, + "grad_norm": 0.36484078015170185, + "learning_rate": 9.796075015055892e-05, + "loss": 3.121, + "step": 11734 + }, + { + "epoch": 0.5463603138021743, + "grad_norm": 0.3324285515216057, + "learning_rate": 9.795998438125934e-05, + "loss": 3.1508, + "step": 11735 + }, + { + "epoch": 0.5464068719882673, + "grad_norm": 0.36070554696899143, + "learning_rate": 9.795921847120195e-05, + "loss": 3.0346, + "step": 11736 + }, + { + "epoch": 0.5464534301743604, + "grad_norm": 0.3460589416417704, + "learning_rate": 9.795845242038897e-05, + "loss": 3.1843, + "step": 11737 + }, + { + "epoch": 0.5464999883604534, + "grad_norm": 0.34594735057229503, + "learning_rate": 9.795768622882266e-05, + "loss": 3.1193, + "step": 11738 + }, + { + "epoch": 0.5465465465465466, + "grad_norm": 0.29521739000965846, + "learning_rate": 9.795691989650526e-05, + "loss": 3.0905, + "step": 11739 + }, + { + "epoch": 0.5465931047326397, + "grad_norm": 0.33757158742418825, + "learning_rate": 9.795615342343903e-05, + "loss": 3.0833, + "step": 11740 + }, + { + "epoch": 0.5466396629187327, + "grad_norm": 0.36637236744278967, + "learning_rate": 9.795538680962622e-05, + "loss": 3.1358, + "step": 11741 + }, + { + "epoch": 0.5466862211048258, + "grad_norm": 0.33139989798453634, + "learning_rate": 9.795462005506907e-05, + "loss": 3.1105, + "step": 11742 + }, + { + "epoch": 0.5467327792909188, + "grad_norm": 0.3775981545696805, + "learning_rate": 9.795385315976984e-05, + "loss": 2.9915, + "step": 11743 + }, + { + "epoch": 0.5467793374770119, + "grad_norm": 0.35351687693446926, + "learning_rate": 9.795308612373075e-05, + "loss": 3.199, + "step": 11744 + }, + { + "epoch": 0.5468258956631049, + "grad_norm": 0.4180286248998447, + "learning_rate": 9.795231894695411e-05, + "loss": 3.1514, + "step": 11745 + }, + { + "epoch": 0.546872453849198, + "grad_norm": 0.4335799671719078, + "learning_rate": 9.795155162944211e-05, + "loss": 3.165, + "step": 11746 + }, + { + "epoch": 0.5469190120352911, + "grad_norm": 0.3679751549911898, + "learning_rate": 9.795078417119705e-05, + "loss": 3.1263, + "step": 11747 + }, + { + "epoch": 
0.5469655702213841, + "grad_norm": 0.4127526970286633, + "learning_rate": 9.795001657222116e-05, + "loss": 3.1488, + "step": 11748 + }, + { + "epoch": 0.5470121284074773, + "grad_norm": 0.3875136806756787, + "learning_rate": 9.794924883251668e-05, + "loss": 3.0578, + "step": 11749 + }, + { + "epoch": 0.5470586865935703, + "grad_norm": 0.39714753205100356, + "learning_rate": 9.79484809520859e-05, + "loss": 3.1394, + "step": 11750 + }, + { + "epoch": 0.5471052447796634, + "grad_norm": 0.3653675156240993, + "learning_rate": 9.794771293093104e-05, + "loss": 3.0973, + "step": 11751 + }, + { + "epoch": 0.5471518029657565, + "grad_norm": 0.4337394651071609, + "learning_rate": 9.794694476905436e-05, + "loss": 3.2393, + "step": 11752 + }, + { + "epoch": 0.5471983611518495, + "grad_norm": 0.41627048683346807, + "learning_rate": 9.794617646645813e-05, + "loss": 3.2489, + "step": 11753 + }, + { + "epoch": 0.5472449193379426, + "grad_norm": 0.36237509462520573, + "learning_rate": 9.79454080231446e-05, + "loss": 3.0945, + "step": 11754 + }, + { + "epoch": 0.5472914775240356, + "grad_norm": 0.3891263261881894, + "learning_rate": 9.7944639439116e-05, + "loss": 3.1534, + "step": 11755 + }, + { + "epoch": 0.5473380357101287, + "grad_norm": 0.38811034295876506, + "learning_rate": 9.794387071437463e-05, + "loss": 3.1607, + "step": 11756 + }, + { + "epoch": 0.5473845938962218, + "grad_norm": 0.3929157650117627, + "learning_rate": 9.794310184892271e-05, + "loss": 3.0588, + "step": 11757 + }, + { + "epoch": 0.5474311520823149, + "grad_norm": 0.40012069026216707, + "learning_rate": 9.794233284276251e-05, + "loss": 3.1191, + "step": 11758 + }, + { + "epoch": 0.547477710268408, + "grad_norm": 0.3677025594048793, + "learning_rate": 9.794156369589629e-05, + "loss": 3.0627, + "step": 11759 + }, + { + "epoch": 0.547524268454501, + "grad_norm": 0.3736707243436241, + "learning_rate": 9.794079440832631e-05, + "loss": 3.1889, + "step": 11760 + }, + { + "epoch": 0.5475708266405941, + "grad_norm": 0.3868435413141412, + "learning_rate": 9.794002498005481e-05, + "loss": 3.0937, + "step": 11761 + }, + { + "epoch": 0.5476173848266872, + "grad_norm": 0.3770468985515926, + "learning_rate": 9.793925541108406e-05, + "loss": 3.1302, + "step": 11762 + }, + { + "epoch": 0.5476639430127802, + "grad_norm": 0.3539082342586771, + "learning_rate": 9.793848570141632e-05, + "loss": 3.0748, + "step": 11763 + }, + { + "epoch": 0.5477105011988733, + "grad_norm": 0.37439516013469754, + "learning_rate": 9.793771585105384e-05, + "loss": 3.0453, + "step": 11764 + }, + { + "epoch": 0.5477570593849663, + "grad_norm": 0.37925026944015977, + "learning_rate": 9.79369458599989e-05, + "loss": 3.1107, + "step": 11765 + }, + { + "epoch": 0.5478036175710594, + "grad_norm": 0.37231016283635876, + "learning_rate": 9.793617572825376e-05, + "loss": 3.2272, + "step": 11766 + }, + { + "epoch": 0.5478501757571524, + "grad_norm": 0.4053718827460726, + "learning_rate": 9.793540545582064e-05, + "loss": 3.0831, + "step": 11767 + }, + { + "epoch": 0.5478967339432456, + "grad_norm": 0.3731590818379223, + "learning_rate": 9.793463504270184e-05, + "loss": 3.1203, + "step": 11768 + }, + { + "epoch": 0.5479432921293387, + "grad_norm": 0.408758287089392, + "learning_rate": 9.793386448889961e-05, + "loss": 3.1984, + "step": 11769 + }, + { + "epoch": 0.5479898503154317, + "grad_norm": 0.32826889480021715, + "learning_rate": 9.79330937944162e-05, + "loss": 3.0513, + "step": 11770 + }, + { + "epoch": 0.5480364085015248, + "grad_norm": 0.3616452759712749, + "learning_rate": 
9.793232295925387e-05, + "loss": 3.0816, + "step": 11771 + }, + { + "epoch": 0.5480829666876178, + "grad_norm": 0.4261557582083075, + "learning_rate": 9.793155198341493e-05, + "loss": 3.1952, + "step": 11772 + }, + { + "epoch": 0.5481295248737109, + "grad_norm": 0.4058223961162778, + "learning_rate": 9.793078086690157e-05, + "loss": 3.2215, + "step": 11773 + }, + { + "epoch": 0.548176083059804, + "grad_norm": 0.3737292365794528, + "learning_rate": 9.793000960971612e-05, + "loss": 3.113, + "step": 11774 + }, + { + "epoch": 0.548222641245897, + "grad_norm": 0.4151871436029004, + "learning_rate": 9.792923821186079e-05, + "loss": 3.099, + "step": 11775 + }, + { + "epoch": 0.5482691994319902, + "grad_norm": 0.37715008322893395, + "learning_rate": 9.792846667333789e-05, + "loss": 3.2081, + "step": 11776 + }, + { + "epoch": 0.5483157576180832, + "grad_norm": 0.4131089718105919, + "learning_rate": 9.792769499414965e-05, + "loss": 3.0483, + "step": 11777 + }, + { + "epoch": 0.5483623158041763, + "grad_norm": 0.4355701275639783, + "learning_rate": 9.792692317429834e-05, + "loss": 3.1055, + "step": 11778 + }, + { + "epoch": 0.5484088739902694, + "grad_norm": 0.3506449705780952, + "learning_rate": 9.792615121378624e-05, + "loss": 3.248, + "step": 11779 + }, + { + "epoch": 0.5484554321763624, + "grad_norm": 0.46519616122213253, + "learning_rate": 9.792537911261562e-05, + "loss": 3.2312, + "step": 11780 + }, + { + "epoch": 0.5485019903624555, + "grad_norm": 0.483182682263039, + "learning_rate": 9.792460687078872e-05, + "loss": 3.152, + "step": 11781 + }, + { + "epoch": 0.5485485485485485, + "grad_norm": 0.40151211007010545, + "learning_rate": 9.792383448830782e-05, + "loss": 2.9686, + "step": 11782 + }, + { + "epoch": 0.5485951067346416, + "grad_norm": 0.40989065945088415, + "learning_rate": 9.792306196517518e-05, + "loss": 3.1434, + "step": 11783 + }, + { + "epoch": 0.5486416649207347, + "grad_norm": 0.41078138740134523, + "learning_rate": 9.792228930139309e-05, + "loss": 3.0987, + "step": 11784 + }, + { + "epoch": 0.5486882231068277, + "grad_norm": 0.391961576520099, + "learning_rate": 9.79215164969638e-05, + "loss": 3.2024, + "step": 11785 + }, + { + "epoch": 0.5487347812929209, + "grad_norm": 0.40014013906869245, + "learning_rate": 9.792074355188957e-05, + "loss": 3.1904, + "step": 11786 + }, + { + "epoch": 0.5487813394790139, + "grad_norm": 0.47535344950552316, + "learning_rate": 9.791997046617268e-05, + "loss": 3.0898, + "step": 11787 + }, + { + "epoch": 0.548827897665107, + "grad_norm": 0.39633905797087665, + "learning_rate": 9.791919723981539e-05, + "loss": 3.1131, + "step": 11788 + }, + { + "epoch": 0.5488744558512, + "grad_norm": 0.3752562343977559, + "learning_rate": 9.791842387282e-05, + "loss": 3.1175, + "step": 11789 + }, + { + "epoch": 0.5489210140372931, + "grad_norm": 0.413157108676119, + "learning_rate": 9.791765036518874e-05, + "loss": 3.0331, + "step": 11790 + }, + { + "epoch": 0.5489675722233862, + "grad_norm": 0.3736633641534037, + "learning_rate": 9.791687671692391e-05, + "loss": 3.2096, + "step": 11791 + }, + { + "epoch": 0.5490141304094792, + "grad_norm": 0.44743626170216194, + "learning_rate": 9.791610292802776e-05, + "loss": 3.2349, + "step": 11792 + }, + { + "epoch": 0.5490606885955723, + "grad_norm": 0.4255688000579481, + "learning_rate": 9.791532899850257e-05, + "loss": 3.1465, + "step": 11793 + }, + { + "epoch": 0.5491072467816653, + "grad_norm": 0.3803395473084388, + "learning_rate": 9.79145549283506e-05, + "loss": 3.1656, + "step": 11794 + }, + { + "epoch": 
0.5491538049677585, + "grad_norm": 0.45891121982408767, + "learning_rate": 9.791378071757415e-05, + "loss": 3.1679, + "step": 11795 + }, + { + "epoch": 0.5492003631538516, + "grad_norm": 0.40940192267154657, + "learning_rate": 9.791300636617546e-05, + "loss": 3.0488, + "step": 11796 + }, + { + "epoch": 0.5492469213399446, + "grad_norm": 0.3692974553666964, + "learning_rate": 9.791223187415682e-05, + "loss": 3.0985, + "step": 11797 + }, + { + "epoch": 0.5492934795260377, + "grad_norm": 0.4722931221898123, + "learning_rate": 9.791145724152049e-05, + "loss": 3.1661, + "step": 11798 + }, + { + "epoch": 0.5493400377121307, + "grad_norm": 0.40046131940987534, + "learning_rate": 9.791068246826876e-05, + "loss": 3.1545, + "step": 11799 + }, + { + "epoch": 0.5493865958982238, + "grad_norm": 0.3631906574586461, + "learning_rate": 9.79099075544039e-05, + "loss": 3.1076, + "step": 11800 + }, + { + "epoch": 0.5494331540843169, + "grad_norm": 0.43894561414954714, + "learning_rate": 9.790913249992818e-05, + "loss": 3.1261, + "step": 11801 + }, + { + "epoch": 0.5494797122704099, + "grad_norm": 0.36481044503469295, + "learning_rate": 9.790835730484389e-05, + "loss": 3.2159, + "step": 11802 + }, + { + "epoch": 0.549526270456503, + "grad_norm": 0.39225743702206634, + "learning_rate": 9.790758196915327e-05, + "loss": 3.1607, + "step": 11803 + }, + { + "epoch": 0.549572828642596, + "grad_norm": 0.4535110279608371, + "learning_rate": 9.790680649285863e-05, + "loss": 3.2718, + "step": 11804 + }, + { + "epoch": 0.5496193868286892, + "grad_norm": 0.3940869700970952, + "learning_rate": 9.790603087596222e-05, + "loss": 3.23, + "step": 11805 + }, + { + "epoch": 0.5496659450147823, + "grad_norm": 0.35543180002618574, + "learning_rate": 9.790525511846633e-05, + "loss": 3.1207, + "step": 11806 + }, + { + "epoch": 0.5497125032008753, + "grad_norm": 0.38314123865062794, + "learning_rate": 9.790447922037324e-05, + "loss": 3.1945, + "step": 11807 + }, + { + "epoch": 0.5497590613869684, + "grad_norm": 0.3767372813498037, + "learning_rate": 9.790370318168523e-05, + "loss": 3.0685, + "step": 11808 + }, + { + "epoch": 0.5498056195730614, + "grad_norm": 0.3404897293421972, + "learning_rate": 9.790292700240458e-05, + "loss": 3.1167, + "step": 11809 + }, + { + "epoch": 0.5498521777591545, + "grad_norm": 0.37155502068444984, + "learning_rate": 9.790215068253353e-05, + "loss": 3.025, + "step": 11810 + }, + { + "epoch": 0.5498987359452475, + "grad_norm": 0.3618231313426978, + "learning_rate": 9.79013742220744e-05, + "loss": 3.0574, + "step": 11811 + }, + { + "epoch": 0.5499452941313406, + "grad_norm": 0.3830619148730422, + "learning_rate": 9.790059762102945e-05, + "loss": 3.0707, + "step": 11812 + }, + { + "epoch": 0.5499918523174337, + "grad_norm": 0.36858281894551204, + "learning_rate": 9.789982087940097e-05, + "loss": 3.2005, + "step": 11813 + }, + { + "epoch": 0.5500384105035268, + "grad_norm": 0.38155377592352663, + "learning_rate": 9.789904399719124e-05, + "loss": 3.0641, + "step": 11814 + }, + { + "epoch": 0.5500849686896199, + "grad_norm": 0.3814122256334624, + "learning_rate": 9.789826697440254e-05, + "loss": 3.1804, + "step": 11815 + }, + { + "epoch": 0.5501315268757129, + "grad_norm": 0.43420821691711625, + "learning_rate": 9.789748981103712e-05, + "loss": 3.2274, + "step": 11816 + }, + { + "epoch": 0.550178085061806, + "grad_norm": 0.340283916121416, + "learning_rate": 9.789671250709731e-05, + "loss": 3.1183, + "step": 11817 + }, + { + "epoch": 0.5502246432478991, + "grad_norm": 0.35013654086735446, + "learning_rate": 
9.789593506258537e-05, + "loss": 2.9812, + "step": 11818 + }, + { + "epoch": 0.5502712014339921, + "grad_norm": 0.33723100615716767, + "learning_rate": 9.789515747750355e-05, + "loss": 3.0912, + "step": 11819 + }, + { + "epoch": 0.5503177596200852, + "grad_norm": 0.3599324607342053, + "learning_rate": 9.789437975185418e-05, + "loss": 3.1158, + "step": 11820 + }, + { + "epoch": 0.5503643178061782, + "grad_norm": 0.36134676268658705, + "learning_rate": 9.789360188563953e-05, + "loss": 3.1422, + "step": 11821 + }, + { + "epoch": 0.5504108759922713, + "grad_norm": 0.38556871641197016, + "learning_rate": 9.789282387886187e-05, + "loss": 3.2027, + "step": 11822 + }, + { + "epoch": 0.5504574341783645, + "grad_norm": 0.3704907208219241, + "learning_rate": 9.789204573152348e-05, + "loss": 3.2232, + "step": 11823 + }, + { + "epoch": 0.5505039923644575, + "grad_norm": 0.4068846736631967, + "learning_rate": 9.789126744362667e-05, + "loss": 3.0689, + "step": 11824 + }, + { + "epoch": 0.5505505505505506, + "grad_norm": 0.4359750736278424, + "learning_rate": 9.78904890151737e-05, + "loss": 3.0058, + "step": 11825 + }, + { + "epoch": 0.5505971087366436, + "grad_norm": 0.3469769021501873, + "learning_rate": 9.788971044616686e-05, + "loss": 3.1094, + "step": 11826 + }, + { + "epoch": 0.5506436669227367, + "grad_norm": 0.4032857598725923, + "learning_rate": 9.788893173660845e-05, + "loss": 3.1856, + "step": 11827 + }, + { + "epoch": 0.5506902251088298, + "grad_norm": 0.3720823152638695, + "learning_rate": 9.788815288650072e-05, + "loss": 3.156, + "step": 11828 + }, + { + "epoch": 0.5507367832949228, + "grad_norm": 0.33212738217393073, + "learning_rate": 9.788737389584601e-05, + "loss": 3.1209, + "step": 11829 + }, + { + "epoch": 0.5507833414810159, + "grad_norm": 0.36026304235966106, + "learning_rate": 9.788659476464657e-05, + "loss": 3.0888, + "step": 11830 + }, + { + "epoch": 0.5508298996671089, + "grad_norm": 0.3823920603800249, + "learning_rate": 9.788581549290468e-05, + "loss": 3.2321, + "step": 11831 + }, + { + "epoch": 0.550876457853202, + "grad_norm": 0.3615695404825658, + "learning_rate": 9.788503608062262e-05, + "loss": 3.0889, + "step": 11832 + }, + { + "epoch": 0.550923016039295, + "grad_norm": 0.4001187337425696, + "learning_rate": 9.788425652780273e-05, + "loss": 3.0511, + "step": 11833 + }, + { + "epoch": 0.5509695742253882, + "grad_norm": 0.41372221238483115, + "learning_rate": 9.788347683444725e-05, + "loss": 3.1592, + "step": 11834 + }, + { + "epoch": 0.5510161324114813, + "grad_norm": 0.3803390705063405, + "learning_rate": 9.788269700055849e-05, + "loss": 3.1129, + "step": 11835 + }, + { + "epoch": 0.5510626905975743, + "grad_norm": 0.4129710723622753, + "learning_rate": 9.788191702613872e-05, + "loss": 3.1624, + "step": 11836 + }, + { + "epoch": 0.5511092487836674, + "grad_norm": 0.37152463587990553, + "learning_rate": 9.788113691119025e-05, + "loss": 3.0868, + "step": 11837 + }, + { + "epoch": 0.5511558069697604, + "grad_norm": 0.4000582375810982, + "learning_rate": 9.788035665571536e-05, + "loss": 3.1836, + "step": 11838 + }, + { + "epoch": 0.5512023651558535, + "grad_norm": 0.44013599798164244, + "learning_rate": 9.787957625971633e-05, + "loss": 3.1515, + "step": 11839 + }, + { + "epoch": 0.5512489233419466, + "grad_norm": 0.354815837371652, + "learning_rate": 9.787879572319546e-05, + "loss": 3.1515, + "step": 11840 + }, + { + "epoch": 0.5512954815280396, + "grad_norm": 0.40829897253468983, + "learning_rate": 9.787801504615506e-05, + "loss": 3.0478, + "step": 11841 + }, + { + "epoch": 
0.5513420397141328, + "grad_norm": 0.3860596150459064, + "learning_rate": 9.78772342285974e-05, + "loss": 3.1441, + "step": 11842 + }, + { + "epoch": 0.5513885979002258, + "grad_norm": 0.3516680446145993, + "learning_rate": 9.787645327052476e-05, + "loss": 3.2069, + "step": 11843 + }, + { + "epoch": 0.5514351560863189, + "grad_norm": 0.3846991747621164, + "learning_rate": 9.787567217193945e-05, + "loss": 3.0723, + "step": 11844 + }, + { + "epoch": 0.551481714272412, + "grad_norm": 0.36057647260276304, + "learning_rate": 9.787489093284377e-05, + "loss": 3.1634, + "step": 11845 + }, + { + "epoch": 0.551528272458505, + "grad_norm": 0.34125913056233953, + "learning_rate": 9.787410955323999e-05, + "loss": 3.1737, + "step": 11846 + }, + { + "epoch": 0.5515748306445981, + "grad_norm": 0.3720479834478818, + "learning_rate": 9.787332803313042e-05, + "loss": 3.1211, + "step": 11847 + }, + { + "epoch": 0.5516213888306911, + "grad_norm": 0.3566818924739574, + "learning_rate": 9.787254637251735e-05, + "loss": 3.1077, + "step": 11848 + }, + { + "epoch": 0.5516679470167842, + "grad_norm": 0.3812426061748647, + "learning_rate": 9.787176457140305e-05, + "loss": 3.1435, + "step": 11849 + }, + { + "epoch": 0.5517145052028773, + "grad_norm": 0.38022516191396943, + "learning_rate": 9.787098262978987e-05, + "loss": 3.0706, + "step": 11850 + }, + { + "epoch": 0.5517610633889704, + "grad_norm": 0.3629907817766069, + "learning_rate": 9.787020054768006e-05, + "loss": 3.2036, + "step": 11851 + }, + { + "epoch": 0.5518076215750635, + "grad_norm": 0.40381513840534045, + "learning_rate": 9.786941832507592e-05, + "loss": 3.0545, + "step": 11852 + }, + { + "epoch": 0.5518541797611565, + "grad_norm": 0.43747610275221377, + "learning_rate": 9.786863596197977e-05, + "loss": 2.9309, + "step": 11853 + }, + { + "epoch": 0.5519007379472496, + "grad_norm": 0.3603084811334871, + "learning_rate": 9.786785345839387e-05, + "loss": 3.1336, + "step": 11854 + }, + { + "epoch": 0.5519472961333426, + "grad_norm": 0.4165726843828829, + "learning_rate": 9.786707081432054e-05, + "loss": 3.153, + "step": 11855 + }, + { + "epoch": 0.5519938543194357, + "grad_norm": 0.39618666964328253, + "learning_rate": 9.786628802976207e-05, + "loss": 3.2242, + "step": 11856 + }, + { + "epoch": 0.5520404125055288, + "grad_norm": 0.4202872265399501, + "learning_rate": 9.786550510472077e-05, + "loss": 3.2506, + "step": 11857 + }, + { + "epoch": 0.5520869706916218, + "grad_norm": 0.42349196796104266, + "learning_rate": 9.786472203919894e-05, + "loss": 3.1295, + "step": 11858 + }, + { + "epoch": 0.5521335288777149, + "grad_norm": 0.35150012165931155, + "learning_rate": 9.786393883319884e-05, + "loss": 3.1563, + "step": 11859 + }, + { + "epoch": 0.552180087063808, + "grad_norm": 0.37988097016217276, + "learning_rate": 9.78631554867228e-05, + "loss": 3.1327, + "step": 11860 + }, + { + "epoch": 0.5522266452499011, + "grad_norm": 0.40529169296746553, + "learning_rate": 9.786237199977313e-05, + "loss": 3.1475, + "step": 11861 + }, + { + "epoch": 0.5522732034359942, + "grad_norm": 0.38353229596223204, + "learning_rate": 9.786158837235209e-05, + "loss": 3.2264, + "step": 11862 + }, + { + "epoch": 0.5523197616220872, + "grad_norm": 0.3564790512060953, + "learning_rate": 9.786080460446202e-05, + "loss": 3.1541, + "step": 11863 + }, + { + "epoch": 0.5523663198081803, + "grad_norm": 0.39287607245618994, + "learning_rate": 9.78600206961052e-05, + "loss": 3.0752, + "step": 11864 + }, + { + "epoch": 0.5524128779942733, + "grad_norm": 0.3441044551341766, + "learning_rate": 
9.785923664728394e-05, + "loss": 3.0772, + "step": 11865 + }, + { + "epoch": 0.5524594361803664, + "grad_norm": 0.3934978954798279, + "learning_rate": 9.785845245800053e-05, + "loss": 3.1158, + "step": 11866 + }, + { + "epoch": 0.5525059943664595, + "grad_norm": 0.37757271542586834, + "learning_rate": 9.785766812825728e-05, + "loss": 3.1497, + "step": 11867 + }, + { + "epoch": 0.5525525525525525, + "grad_norm": 0.39223703301271995, + "learning_rate": 9.785688365805648e-05, + "loss": 3.1226, + "step": 11868 + }, + { + "epoch": 0.5525991107386456, + "grad_norm": 0.3722267139059118, + "learning_rate": 9.785609904740046e-05, + "loss": 3.1471, + "step": 11869 + }, + { + "epoch": 0.5526456689247387, + "grad_norm": 0.3996528207161021, + "learning_rate": 9.785531429629148e-05, + "loss": 3.0903, + "step": 11870 + }, + { + "epoch": 0.5526922271108318, + "grad_norm": 0.3377929990693183, + "learning_rate": 9.785452940473187e-05, + "loss": 3.1115, + "step": 11871 + }, + { + "epoch": 0.5527387852969249, + "grad_norm": 0.40134904832417484, + "learning_rate": 9.785374437272394e-05, + "loss": 3.1069, + "step": 11872 + }, + { + "epoch": 0.5527853434830179, + "grad_norm": 0.4076935684875043, + "learning_rate": 9.785295920026999e-05, + "loss": 3.1266, + "step": 11873 + }, + { + "epoch": 0.552831901669111, + "grad_norm": 0.3446389137262781, + "learning_rate": 9.785217388737231e-05, + "loss": 3.0578, + "step": 11874 + }, + { + "epoch": 0.552878459855204, + "grad_norm": 0.3877384103426323, + "learning_rate": 9.785138843403322e-05, + "loss": 3.0278, + "step": 11875 + }, + { + "epoch": 0.5529250180412971, + "grad_norm": 0.36887258172723375, + "learning_rate": 9.785060284025502e-05, + "loss": 3.0767, + "step": 11876 + }, + { + "epoch": 0.5529715762273901, + "grad_norm": 0.3622551170947625, + "learning_rate": 9.784981710603999e-05, + "loss": 3.0519, + "step": 11877 + }, + { + "epoch": 0.5530181344134832, + "grad_norm": 0.3763988096309777, + "learning_rate": 9.784903123139048e-05, + "loss": 3.2091, + "step": 11878 + }, + { + "epoch": 0.5530646925995764, + "grad_norm": 0.3618349669703875, + "learning_rate": 9.78482452163088e-05, + "loss": 3.1011, + "step": 11879 + }, + { + "epoch": 0.5531112507856694, + "grad_norm": 0.35060394920262206, + "learning_rate": 9.78474590607972e-05, + "loss": 3.1376, + "step": 11880 + }, + { + "epoch": 0.5531578089717625, + "grad_norm": 0.35339159646296847, + "learning_rate": 9.784667276485804e-05, + "loss": 3.1182, + "step": 11881 + }, + { + "epoch": 0.5532043671578555, + "grad_norm": 0.3802892911010809, + "learning_rate": 9.78458863284936e-05, + "loss": 3.1039, + "step": 11882 + }, + { + "epoch": 0.5532509253439486, + "grad_norm": 0.386352727014652, + "learning_rate": 9.78450997517062e-05, + "loss": 3.1846, + "step": 11883 + }, + { + "epoch": 0.5532974835300417, + "grad_norm": 0.40416946745844057, + "learning_rate": 9.784431303449815e-05, + "loss": 3.2067, + "step": 11884 + }, + { + "epoch": 0.5533440417161347, + "grad_norm": 0.3680680979799521, + "learning_rate": 9.784352617687173e-05, + "loss": 3.1612, + "step": 11885 + }, + { + "epoch": 0.5533905999022278, + "grad_norm": 0.36637400063204817, + "learning_rate": 9.784273917882931e-05, + "loss": 3.23, + "step": 11886 + }, + { + "epoch": 0.5534371580883208, + "grad_norm": 0.36232920220408965, + "learning_rate": 9.784195204037315e-05, + "loss": 3.1988, + "step": 11887 + }, + { + "epoch": 0.553483716274414, + "grad_norm": 0.3555176248577343, + "learning_rate": 9.784116476150556e-05, + "loss": 3.0659, + "step": 11888 + }, + { + "epoch": 
0.5535302744605071, + "grad_norm": 0.4063343641872631, + "learning_rate": 9.784037734222889e-05, + "loss": 3.1993, + "step": 11889 + }, + { + "epoch": 0.5535768326466001, + "grad_norm": 0.3855453091681149, + "learning_rate": 9.78395897825454e-05, + "loss": 3.0871, + "step": 11890 + }, + { + "epoch": 0.5536233908326932, + "grad_norm": 0.37810853743209544, + "learning_rate": 9.783880208245744e-05, + "loss": 3.1292, + "step": 11891 + }, + { + "epoch": 0.5536699490187862, + "grad_norm": 0.42740573321622805, + "learning_rate": 9.783801424196731e-05, + "loss": 3.1426, + "step": 11892 + }, + { + "epoch": 0.5537165072048793, + "grad_norm": 0.4040587949407977, + "learning_rate": 9.783722626107732e-05, + "loss": 3.1761, + "step": 11893 + }, + { + "epoch": 0.5537630653909724, + "grad_norm": 0.37090658369264595, + "learning_rate": 9.783643813978978e-05, + "loss": 3.0784, + "step": 11894 + }, + { + "epoch": 0.5538096235770654, + "grad_norm": 0.3940311763770341, + "learning_rate": 9.7835649878107e-05, + "loss": 3.05, + "step": 11895 + }, + { + "epoch": 0.5538561817631585, + "grad_norm": 0.3874822766498167, + "learning_rate": 9.783486147603132e-05, + "loss": 3.0139, + "step": 11896 + }, + { + "epoch": 0.5539027399492515, + "grad_norm": 0.42637213579616595, + "learning_rate": 9.783407293356503e-05, + "loss": 3.0849, + "step": 11897 + }, + { + "epoch": 0.5539492981353447, + "grad_norm": 0.37956570876530654, + "learning_rate": 9.783328425071044e-05, + "loss": 3.0796, + "step": 11898 + }, + { + "epoch": 0.5539958563214377, + "grad_norm": 0.3609109312124248, + "learning_rate": 9.783249542746988e-05, + "loss": 3.0669, + "step": 11899 + }, + { + "epoch": 0.5540424145075308, + "grad_norm": 0.42146963150525957, + "learning_rate": 9.783170646384565e-05, + "loss": 3.1464, + "step": 11900 + }, + { + "epoch": 0.5540889726936239, + "grad_norm": 0.4103279164949503, + "learning_rate": 9.783091735984007e-05, + "loss": 3.1639, + "step": 11901 + }, + { + "epoch": 0.5541355308797169, + "grad_norm": 0.3927457084761409, + "learning_rate": 9.783012811545546e-05, + "loss": 3.093, + "step": 11902 + }, + { + "epoch": 0.55418208906581, + "grad_norm": 0.4481706683055278, + "learning_rate": 9.782933873069416e-05, + "loss": 3.1233, + "step": 11903 + }, + { + "epoch": 0.554228647251903, + "grad_norm": 0.4445285163365548, + "learning_rate": 9.782854920555844e-05, + "loss": 3.1996, + "step": 11904 + }, + { + "epoch": 0.5542752054379961, + "grad_norm": 0.36308739568451515, + "learning_rate": 9.782775954005064e-05, + "loss": 3.1034, + "step": 11905 + }, + { + "epoch": 0.5543217636240892, + "grad_norm": 0.4271869973348987, + "learning_rate": 9.78269697341731e-05, + "loss": 3.1785, + "step": 11906 + }, + { + "epoch": 0.5543683218101823, + "grad_norm": 0.4429965330782112, + "learning_rate": 9.782617978792808e-05, + "loss": 3.1593, + "step": 11907 + }, + { + "epoch": 0.5544148799962754, + "grad_norm": 0.34633389067095544, + "learning_rate": 9.782538970131796e-05, + "loss": 3.1397, + "step": 11908 + }, + { + "epoch": 0.5544614381823684, + "grad_norm": 0.44415423819821737, + "learning_rate": 9.782459947434502e-05, + "loss": 3.1437, + "step": 11909 + }, + { + "epoch": 0.5545079963684615, + "grad_norm": 0.35136906550683294, + "learning_rate": 9.78238091070116e-05, + "loss": 3.0927, + "step": 11910 + }, + { + "epoch": 0.5545545545545546, + "grad_norm": 0.3461318946748751, + "learning_rate": 9.782301859932002e-05, + "loss": 3.1106, + "step": 11911 + }, + { + "epoch": 0.5546011127406476, + "grad_norm": 0.37906124778629163, + "learning_rate": 
9.782222795127258e-05, + "loss": 3.1424, + "step": 11912 + }, + { + "epoch": 0.5546476709267407, + "grad_norm": 0.38951858628093056, + "learning_rate": 9.78214371628716e-05, + "loss": 3.0778, + "step": 11913 + }, + { + "epoch": 0.5546942291128337, + "grad_norm": 0.38975498856655383, + "learning_rate": 9.782064623411943e-05, + "loss": 3.1028, + "step": 11914 + }, + { + "epoch": 0.5547407872989268, + "grad_norm": 0.356526242844491, + "learning_rate": 9.781985516501837e-05, + "loss": 3.0383, + "step": 11915 + }, + { + "epoch": 0.55478734548502, + "grad_norm": 0.40712865574113305, + "learning_rate": 9.781906395557074e-05, + "loss": 3.2247, + "step": 11916 + }, + { + "epoch": 0.554833903671113, + "grad_norm": 0.3583504046455017, + "learning_rate": 9.781827260577888e-05, + "loss": 3.1972, + "step": 11917 + }, + { + "epoch": 0.5548804618572061, + "grad_norm": 0.38480104095807066, + "learning_rate": 9.78174811156451e-05, + "loss": 3.1662, + "step": 11918 + }, + { + "epoch": 0.5549270200432991, + "grad_norm": 0.3753881279965764, + "learning_rate": 9.781668948517172e-05, + "loss": 3.1039, + "step": 11919 + }, + { + "epoch": 0.5549735782293922, + "grad_norm": 0.39745877043013306, + "learning_rate": 9.781589771436106e-05, + "loss": 3.1008, + "step": 11920 + }, + { + "epoch": 0.5550201364154852, + "grad_norm": 0.3900356507494929, + "learning_rate": 9.781510580321547e-05, + "loss": 3.1355, + "step": 11921 + }, + { + "epoch": 0.5550666946015783, + "grad_norm": 0.4217360057441784, + "learning_rate": 9.781431375173725e-05, + "loss": 3.1508, + "step": 11922 + }, + { + "epoch": 0.5551132527876714, + "grad_norm": 0.4609392463705846, + "learning_rate": 9.781352155992872e-05, + "loss": 3.1445, + "step": 11923 + }, + { + "epoch": 0.5551598109737644, + "grad_norm": 0.42753112796729736, + "learning_rate": 9.781272922779221e-05, + "loss": 3.0917, + "step": 11924 + }, + { + "epoch": 0.5552063691598575, + "grad_norm": 0.37379177848227985, + "learning_rate": 9.781193675533005e-05, + "loss": 3.1121, + "step": 11925 + }, + { + "epoch": 0.5552529273459506, + "grad_norm": 0.41501152434790217, + "learning_rate": 9.781114414254457e-05, + "loss": 3.1372, + "step": 11926 + }, + { + "epoch": 0.5552994855320437, + "grad_norm": 0.35620708938183504, + "learning_rate": 9.781035138943811e-05, + "loss": 3.1134, + "step": 11927 + }, + { + "epoch": 0.5553460437181368, + "grad_norm": 0.40158020964285074, + "learning_rate": 9.780955849601294e-05, + "loss": 3.073, + "step": 11928 + }, + { + "epoch": 0.5553926019042298, + "grad_norm": 0.4022034553533369, + "learning_rate": 9.780876546227147e-05, + "loss": 3.0918, + "step": 11929 + }, + { + "epoch": 0.5554391600903229, + "grad_norm": 0.4232555454530406, + "learning_rate": 9.780797228821596e-05, + "loss": 3.143, + "step": 11930 + }, + { + "epoch": 0.5554857182764159, + "grad_norm": 0.3753641560357447, + "learning_rate": 9.780717897384876e-05, + "loss": 3.0527, + "step": 11931 + }, + { + "epoch": 0.555532276462509, + "grad_norm": 0.40046593430608834, + "learning_rate": 9.78063855191722e-05, + "loss": 3.1177, + "step": 11932 + }, + { + "epoch": 0.5555788346486021, + "grad_norm": 0.42729767374018596, + "learning_rate": 9.780559192418861e-05, + "loss": 3.1572, + "step": 11933 + }, + { + "epoch": 0.5556253928346951, + "grad_norm": 0.363494219652663, + "learning_rate": 9.780479818890031e-05, + "loss": 3.1465, + "step": 11934 + }, + { + "epoch": 0.5556719510207883, + "grad_norm": 0.4045008512804942, + "learning_rate": 9.780400431330965e-05, + "loss": 3.13, + "step": 11935 + }, + { + "epoch": 
0.5557185092068813, + "grad_norm": 0.4014191161390235, + "learning_rate": 9.780321029741895e-05, + "loss": 3.071, + "step": 11936 + }, + { + "epoch": 0.5557650673929744, + "grad_norm": 0.37843387642919124, + "learning_rate": 9.780241614123053e-05, + "loss": 3.1408, + "step": 11937 + }, + { + "epoch": 0.5558116255790675, + "grad_norm": 0.3816247034563709, + "learning_rate": 9.780162184474672e-05, + "loss": 3.0351, + "step": 11938 + }, + { + "epoch": 0.5558581837651605, + "grad_norm": 0.37551218260015784, + "learning_rate": 9.780082740796988e-05, + "loss": 3.1825, + "step": 11939 + }, + { + "epoch": 0.5559047419512536, + "grad_norm": 0.4225917271800567, + "learning_rate": 9.78000328309023e-05, + "loss": 3.1818, + "step": 11940 + }, + { + "epoch": 0.5559513001373466, + "grad_norm": 0.3928871217894137, + "learning_rate": 9.779923811354634e-05, + "loss": 3.1178, + "step": 11941 + }, + { + "epoch": 0.5559978583234397, + "grad_norm": 0.4232731294045023, + "learning_rate": 9.779844325590433e-05, + "loss": 3.1978, + "step": 11942 + }, + { + "epoch": 0.5560444165095327, + "grad_norm": 0.3983393991076115, + "learning_rate": 9.779764825797859e-05, + "loss": 3.1754, + "step": 11943 + }, + { + "epoch": 0.5560909746956259, + "grad_norm": 0.3569872815582588, + "learning_rate": 9.779685311977147e-05, + "loss": 3.1072, + "step": 11944 + }, + { + "epoch": 0.556137532881719, + "grad_norm": 0.4093263889453943, + "learning_rate": 9.779605784128529e-05, + "loss": 3.0346, + "step": 11945 + }, + { + "epoch": 0.556184091067812, + "grad_norm": 0.38789372665553823, + "learning_rate": 9.779526242252239e-05, + "loss": 3.1063, + "step": 11946 + }, + { + "epoch": 0.5562306492539051, + "grad_norm": 0.3622384845199041, + "learning_rate": 9.77944668634851e-05, + "loss": 3.165, + "step": 11947 + }, + { + "epoch": 0.5562772074399981, + "grad_norm": 0.38306710369810554, + "learning_rate": 9.779367116417575e-05, + "loss": 3.1057, + "step": 11948 + }, + { + "epoch": 0.5563237656260912, + "grad_norm": 0.3350626594356682, + "learning_rate": 9.77928753245967e-05, + "loss": 3.0868, + "step": 11949 + }, + { + "epoch": 0.5563703238121843, + "grad_norm": 0.34913985530707925, + "learning_rate": 9.779207934475027e-05, + "loss": 3.1376, + "step": 11950 + }, + { + "epoch": 0.5564168819982773, + "grad_norm": 0.3572771905077648, + "learning_rate": 9.779128322463877e-05, + "loss": 3.101, + "step": 11951 + }, + { + "epoch": 0.5564634401843704, + "grad_norm": 0.3385136987769663, + "learning_rate": 9.779048696426459e-05, + "loss": 3.1418, + "step": 11952 + }, + { + "epoch": 0.5565099983704634, + "grad_norm": 0.34586638719155016, + "learning_rate": 9.778969056363004e-05, + "loss": 3.081, + "step": 11953 + }, + { + "epoch": 0.5565565565565566, + "grad_norm": 0.3758354734177974, + "learning_rate": 9.778889402273743e-05, + "loss": 3.1066, + "step": 11954 + }, + { + "epoch": 0.5566031147426497, + "grad_norm": 0.40986456320653203, + "learning_rate": 9.778809734158914e-05, + "loss": 3.1259, + "step": 11955 + }, + { + "epoch": 0.5566496729287427, + "grad_norm": 0.39839133972441965, + "learning_rate": 9.77873005201875e-05, + "loss": 3.1372, + "step": 11956 + }, + { + "epoch": 0.5566962311148358, + "grad_norm": 0.32884887997178675, + "learning_rate": 9.778650355853482e-05, + "loss": 3.0544, + "step": 11957 + }, + { + "epoch": 0.5567427893009288, + "grad_norm": 0.363319939522027, + "learning_rate": 9.778570645663348e-05, + "loss": 3.2059, + "step": 11958 + }, + { + "epoch": 0.5567893474870219, + "grad_norm": 0.35185725589368483, + "learning_rate": 
9.77849092144858e-05, + "loss": 3.1194, + "step": 11959 + }, + { + "epoch": 0.556835905673115, + "grad_norm": 0.39340540408641594, + "learning_rate": 9.77841118320941e-05, + "loss": 3.1792, + "step": 11960 + }, + { + "epoch": 0.556882463859208, + "grad_norm": 0.3570666160547083, + "learning_rate": 9.778331430946075e-05, + "loss": 3.1419, + "step": 11961 + }, + { + "epoch": 0.5569290220453011, + "grad_norm": 0.4086295765829168, + "learning_rate": 9.778251664658808e-05, + "loss": 3.1061, + "step": 11962 + }, + { + "epoch": 0.5569755802313942, + "grad_norm": 0.4188217638549596, + "learning_rate": 9.778171884347844e-05, + "loss": 3.1263, + "step": 11963 + }, + { + "epoch": 0.5570221384174873, + "grad_norm": 0.35813720719637054, + "learning_rate": 9.778092090013416e-05, + "loss": 3.1406, + "step": 11964 + }, + { + "epoch": 0.5570686966035803, + "grad_norm": 0.4630627048560631, + "learning_rate": 9.778012281655757e-05, + "loss": 3.1811, + "step": 11965 + }, + { + "epoch": 0.5571152547896734, + "grad_norm": 0.4054223818605791, + "learning_rate": 9.777932459275103e-05, + "loss": 3.0986, + "step": 11966 + }, + { + "epoch": 0.5571618129757665, + "grad_norm": 0.40081790839653963, + "learning_rate": 9.777852622871688e-05, + "loss": 3.031, + "step": 11967 + }, + { + "epoch": 0.5572083711618595, + "grad_norm": 0.4210776010433098, + "learning_rate": 9.777772772445749e-05, + "loss": 3.1014, + "step": 11968 + }, + { + "epoch": 0.5572549293479526, + "grad_norm": 0.4182366215278864, + "learning_rate": 9.777692907997514e-05, + "loss": 3.2223, + "step": 11969 + }, + { + "epoch": 0.5573014875340456, + "grad_norm": 0.3938380386104299, + "learning_rate": 9.777613029527222e-05, + "loss": 3.0797, + "step": 11970 + }, + { + "epoch": 0.5573480457201387, + "grad_norm": 0.4247402671652444, + "learning_rate": 9.777533137035106e-05, + "loss": 3.1747, + "step": 11971 + }, + { + "epoch": 0.5573946039062319, + "grad_norm": 0.3823898864644548, + "learning_rate": 9.777453230521403e-05, + "loss": 3.1499, + "step": 11972 + }, + { + "epoch": 0.5574411620923249, + "grad_norm": 0.425461943093518, + "learning_rate": 9.777373309986344e-05, + "loss": 2.9245, + "step": 11973 + }, + { + "epoch": 0.557487720278418, + "grad_norm": 0.3813842286436821, + "learning_rate": 9.777293375430165e-05, + "loss": 3.2618, + "step": 11974 + }, + { + "epoch": 0.557534278464511, + "grad_norm": 0.42477556890071605, + "learning_rate": 9.7772134268531e-05, + "loss": 3.064, + "step": 11975 + }, + { + "epoch": 0.5575808366506041, + "grad_norm": 0.39235664447218854, + "learning_rate": 9.777133464255384e-05, + "loss": 3.1821, + "step": 11976 + }, + { + "epoch": 0.5576273948366972, + "grad_norm": 0.4000354528742027, + "learning_rate": 9.777053487637253e-05, + "loss": 3.0993, + "step": 11977 + }, + { + "epoch": 0.5576739530227902, + "grad_norm": 0.42337811240504586, + "learning_rate": 9.77697349699894e-05, + "loss": 3.1846, + "step": 11978 + }, + { + "epoch": 0.5577205112088833, + "grad_norm": 0.3741359858446069, + "learning_rate": 9.776893492340682e-05, + "loss": 3.1267, + "step": 11979 + }, + { + "epoch": 0.5577670693949763, + "grad_norm": 0.4183963776405537, + "learning_rate": 9.77681347366271e-05, + "loss": 3.1846, + "step": 11980 + }, + { + "epoch": 0.5578136275810694, + "grad_norm": 0.4418829930373978, + "learning_rate": 9.776733440965263e-05, + "loss": 3.1827, + "step": 11981 + }, + { + "epoch": 0.5578601857671626, + "grad_norm": 0.42034416029582455, + "learning_rate": 9.776653394248571e-05, + "loss": 3.1407, + "step": 11982 + }, + { + "epoch": 
0.5579067439532556, + "grad_norm": 0.3996705745853478, + "learning_rate": 9.776573333512876e-05, + "loss": 3.1312, + "step": 11983 + }, + { + "epoch": 0.5579533021393487, + "grad_norm": 0.47035189421702384, + "learning_rate": 9.776493258758406e-05, + "loss": 3.1484, + "step": 11984 + }, + { + "epoch": 0.5579998603254417, + "grad_norm": 0.3813966160939622, + "learning_rate": 9.776413169985399e-05, + "loss": 3.1439, + "step": 11985 + }, + { + "epoch": 0.5580464185115348, + "grad_norm": 0.4929825474137421, + "learning_rate": 9.77633306719409e-05, + "loss": 3.1556, + "step": 11986 + }, + { + "epoch": 0.5580929766976278, + "grad_norm": 0.3937937457519806, + "learning_rate": 9.776252950384714e-05, + "loss": 3.1194, + "step": 11987 + }, + { + "epoch": 0.5581395348837209, + "grad_norm": 0.43166381940171744, + "learning_rate": 9.776172819557508e-05, + "loss": 2.9543, + "step": 11988 + }, + { + "epoch": 0.558186093069814, + "grad_norm": 0.4011561045364247, + "learning_rate": 9.776092674712702e-05, + "loss": 3.1588, + "step": 11989 + }, + { + "epoch": 0.558232651255907, + "grad_norm": 0.405043378187136, + "learning_rate": 9.776012515850537e-05, + "loss": 3.1217, + "step": 11990 + }, + { + "epoch": 0.5582792094420002, + "grad_norm": 0.4136509997981661, + "learning_rate": 9.775932342971245e-05, + "loss": 3.1312, + "step": 11991 + }, + { + "epoch": 0.5583257676280932, + "grad_norm": 0.4225545064342442, + "learning_rate": 9.775852156075062e-05, + "loss": 3.0955, + "step": 11992 + }, + { + "epoch": 0.5583723258141863, + "grad_norm": 0.4480733117834285, + "learning_rate": 9.775771955162224e-05, + "loss": 3.1682, + "step": 11993 + }, + { + "epoch": 0.5584188840002794, + "grad_norm": 0.42502197478459025, + "learning_rate": 9.775691740232966e-05, + "loss": 3.0954, + "step": 11994 + }, + { + "epoch": 0.5584654421863724, + "grad_norm": 0.37456503216282855, + "learning_rate": 9.775611511287522e-05, + "loss": 3.2194, + "step": 11995 + }, + { + "epoch": 0.5585120003724655, + "grad_norm": 0.3778040959955464, + "learning_rate": 9.77553126832613e-05, + "loss": 3.2026, + "step": 11996 + }, + { + "epoch": 0.5585585585585585, + "grad_norm": 0.35060816725148874, + "learning_rate": 9.775451011349024e-05, + "loss": 3.0871, + "step": 11997 + }, + { + "epoch": 0.5586051167446516, + "grad_norm": 0.44899903414545167, + "learning_rate": 9.77537074035644e-05, + "loss": 3.1472, + "step": 11998 + }, + { + "epoch": 0.5586516749307447, + "grad_norm": 0.36066713551208146, + "learning_rate": 9.775290455348612e-05, + "loss": 3.1785, + "step": 11999 + }, + { + "epoch": 0.5586982331168378, + "grad_norm": 0.3937593783251821, + "learning_rate": 9.775210156325777e-05, + "loss": 3.228, + "step": 12000 + }, + { + "epoch": 0.5587447913029309, + "grad_norm": 0.44909346238553843, + "learning_rate": 9.775129843288171e-05, + "loss": 3.2462, + "step": 12001 + }, + { + "epoch": 0.5587913494890239, + "grad_norm": 0.4109950999259573, + "learning_rate": 9.77504951623603e-05, + "loss": 3.0059, + "step": 12002 + }, + { + "epoch": 0.558837907675117, + "grad_norm": 0.3903340040854843, + "learning_rate": 9.774969175169588e-05, + "loss": 3.0705, + "step": 12003 + }, + { + "epoch": 0.5588844658612101, + "grad_norm": 0.40003355583552785, + "learning_rate": 9.774888820089083e-05, + "loss": 3.0544, + "step": 12004 + }, + { + "epoch": 0.5589310240473031, + "grad_norm": 0.43190781466716704, + "learning_rate": 9.774808450994749e-05, + "loss": 3.164, + "step": 12005 + }, + { + "epoch": 0.5589775822333962, + "grad_norm": 0.43172765038131744, + "learning_rate": 
9.774728067886822e-05, + "loss": 3.0572, + "step": 12006 + }, + { + "epoch": 0.5590241404194892, + "grad_norm": 0.37447368825212113, + "learning_rate": 9.77464767076554e-05, + "loss": 3.0636, + "step": 12007 + }, + { + "epoch": 0.5590706986055823, + "grad_norm": 0.5056916142950719, + "learning_rate": 9.774567259631136e-05, + "loss": 3.1302, + "step": 12008 + }, + { + "epoch": 0.5591172567916753, + "grad_norm": 0.4193487518759144, + "learning_rate": 9.774486834483849e-05, + "loss": 3.0622, + "step": 12009 + }, + { + "epoch": 0.5591638149777685, + "grad_norm": 0.41141878375391633, + "learning_rate": 9.774406395323913e-05, + "loss": 3.042, + "step": 12010 + }, + { + "epoch": 0.5592103731638616, + "grad_norm": 0.4524136204925162, + "learning_rate": 9.774325942151564e-05, + "loss": 3.0955, + "step": 12011 + }, + { + "epoch": 0.5592569313499546, + "grad_norm": 0.3883276671899435, + "learning_rate": 9.774245474967039e-05, + "loss": 3.1255, + "step": 12012 + }, + { + "epoch": 0.5593034895360477, + "grad_norm": 0.4077860913776892, + "learning_rate": 9.774164993770573e-05, + "loss": 3.1691, + "step": 12013 + }, + { + "epoch": 0.5593500477221407, + "grad_norm": 0.4401372666014064, + "learning_rate": 9.774084498562406e-05, + "loss": 3.2088, + "step": 12014 + }, + { + "epoch": 0.5593966059082338, + "grad_norm": 0.3891569781742258, + "learning_rate": 9.774003989342768e-05, + "loss": 3.0715, + "step": 12015 + }, + { + "epoch": 0.5594431640943269, + "grad_norm": 0.3939616310002057, + "learning_rate": 9.773923466111901e-05, + "loss": 3.0729, + "step": 12016 + }, + { + "epoch": 0.5594897222804199, + "grad_norm": 0.4156982358484625, + "learning_rate": 9.773842928870038e-05, + "loss": 3.0358, + "step": 12017 + }, + { + "epoch": 0.559536280466513, + "grad_norm": 0.3411416325760896, + "learning_rate": 9.773762377617415e-05, + "loss": 3.0761, + "step": 12018 + }, + { + "epoch": 0.559582838652606, + "grad_norm": 0.42012674741243916, + "learning_rate": 9.773681812354272e-05, + "loss": 2.9897, + "step": 12019 + }, + { + "epoch": 0.5596293968386992, + "grad_norm": 0.3852978110620123, + "learning_rate": 9.773601233080841e-05, + "loss": 3.0794, + "step": 12020 + }, + { + "epoch": 0.5596759550247923, + "grad_norm": 0.3641251995560067, + "learning_rate": 9.773520639797362e-05, + "loss": 3.1814, + "step": 12021 + }, + { + "epoch": 0.5597225132108853, + "grad_norm": 0.34605995453369154, + "learning_rate": 9.773440032504069e-05, + "loss": 3.1426, + "step": 12022 + }, + { + "epoch": 0.5597690713969784, + "grad_norm": 0.4162630184691308, + "learning_rate": 9.773359411201202e-05, + "loss": 3.2811, + "step": 12023 + }, + { + "epoch": 0.5598156295830714, + "grad_norm": 0.4071194052115934, + "learning_rate": 9.773278775888994e-05, + "loss": 3.0847, + "step": 12024 + }, + { + "epoch": 0.5598621877691645, + "grad_norm": 0.40583532323770594, + "learning_rate": 9.773198126567682e-05, + "loss": 3.0922, + "step": 12025 + }, + { + "epoch": 0.5599087459552576, + "grad_norm": 0.379718311708257, + "learning_rate": 9.773117463237505e-05, + "loss": 3.1411, + "step": 12026 + }, + { + "epoch": 0.5599553041413506, + "grad_norm": 0.423108883957493, + "learning_rate": 9.773036785898699e-05, + "loss": 3.0895, + "step": 12027 + }, + { + "epoch": 0.5600018623274438, + "grad_norm": 0.34957486534612153, + "learning_rate": 9.7729560945515e-05, + "loss": 3.1064, + "step": 12028 + }, + { + "epoch": 0.5600484205135368, + "grad_norm": 0.4165904111594054, + "learning_rate": 9.772875389196143e-05, + "loss": 3.1153, + "step": 12029 + }, + { + "epoch": 
0.5600949786996299, + "grad_norm": 0.374138679243429, + "learning_rate": 9.772794669832869e-05, + "loss": 3.1071, + "step": 12030 + }, + { + "epoch": 0.5601415368857229, + "grad_norm": 0.3748936293922233, + "learning_rate": 9.772713936461912e-05, + "loss": 3.0608, + "step": 12031 + }, + { + "epoch": 0.560188095071816, + "grad_norm": 0.33530566055025696, + "learning_rate": 9.77263318908351e-05, + "loss": 3.106, + "step": 12032 + }, + { + "epoch": 0.5602346532579091, + "grad_norm": 0.44025470425973023, + "learning_rate": 9.772552427697901e-05, + "loss": 3.2075, + "step": 12033 + }, + { + "epoch": 0.5602812114440021, + "grad_norm": 0.4202650468600813, + "learning_rate": 9.77247165230532e-05, + "loss": 3.1347, + "step": 12034 + }, + { + "epoch": 0.5603277696300952, + "grad_norm": 0.35817820208550066, + "learning_rate": 9.772390862906004e-05, + "loss": 3.0727, + "step": 12035 + }, + { + "epoch": 0.5603743278161882, + "grad_norm": 0.4300941171563099, + "learning_rate": 9.772310059500193e-05, + "loss": 3.0971, + "step": 12036 + }, + { + "epoch": 0.5604208860022813, + "grad_norm": 0.40251790344828964, + "learning_rate": 9.77222924208812e-05, + "loss": 3.1909, + "step": 12037 + }, + { + "epoch": 0.5604674441883745, + "grad_norm": 0.37893352970796557, + "learning_rate": 9.772148410670025e-05, + "loss": 3.1125, + "step": 12038 + }, + { + "epoch": 0.5605140023744675, + "grad_norm": 0.3631430402724988, + "learning_rate": 9.772067565246145e-05, + "loss": 3.0785, + "step": 12039 + }, + { + "epoch": 0.5605605605605606, + "grad_norm": 0.3829899048944014, + "learning_rate": 9.771986705816716e-05, + "loss": 3.0575, + "step": 12040 + }, + { + "epoch": 0.5606071187466536, + "grad_norm": 0.382697564449398, + "learning_rate": 9.771905832381976e-05, + "loss": 3.1395, + "step": 12041 + }, + { + "epoch": 0.5606536769327467, + "grad_norm": 0.3480763382176417, + "learning_rate": 9.771824944942164e-05, + "loss": 3.0795, + "step": 12042 + }, + { + "epoch": 0.5607002351188398, + "grad_norm": 0.3976615497236303, + "learning_rate": 9.771744043497515e-05, + "loss": 3.0021, + "step": 12043 + }, + { + "epoch": 0.5607467933049328, + "grad_norm": 0.32938216829957756, + "learning_rate": 9.771663128048268e-05, + "loss": 3.041, + "step": 12044 + }, + { + "epoch": 0.5607933514910259, + "grad_norm": 0.39612905918465485, + "learning_rate": 9.771582198594658e-05, + "loss": 3.053, + "step": 12045 + }, + { + "epoch": 0.5608399096771189, + "grad_norm": 0.37846678684228396, + "learning_rate": 9.771501255136925e-05, + "loss": 3.1623, + "step": 12046 + }, + { + "epoch": 0.5608864678632121, + "grad_norm": 0.37540674271652125, + "learning_rate": 9.771420297675306e-05, + "loss": 3.1995, + "step": 12047 + }, + { + "epoch": 0.5609330260493052, + "grad_norm": 0.35082741545687063, + "learning_rate": 9.771339326210039e-05, + "loss": 3.036, + "step": 12048 + }, + { + "epoch": 0.5609795842353982, + "grad_norm": 0.3985398723207513, + "learning_rate": 9.771258340741362e-05, + "loss": 3.1204, + "step": 12049 + }, + { + "epoch": 0.5610261424214913, + "grad_norm": 0.3703939004350584, + "learning_rate": 9.771177341269511e-05, + "loss": 3.1787, + "step": 12050 + }, + { + "epoch": 0.5610727006075843, + "grad_norm": 0.3481905670953057, + "learning_rate": 9.771096327794725e-05, + "loss": 3.099, + "step": 12051 + }, + { + "epoch": 0.5611192587936774, + "grad_norm": 0.3416559688660215, + "learning_rate": 9.771015300317239e-05, + "loss": 3.148, + "step": 12052 + }, + { + "epoch": 0.5611658169797704, + "grad_norm": 0.37282008888217893, + "learning_rate": 
9.770934258837295e-05, + "loss": 3.0106, + "step": 12053 + }, + { + "epoch": 0.5612123751658635, + "grad_norm": 0.32770003067463827, + "learning_rate": 9.770853203355128e-05, + "loss": 3.0605, + "step": 12054 + }, + { + "epoch": 0.5612589333519566, + "grad_norm": 0.37923087069764333, + "learning_rate": 9.770772133870978e-05, + "loss": 3.1298, + "step": 12055 + }, + { + "epoch": 0.5613054915380497, + "grad_norm": 0.3387623149087181, + "learning_rate": 9.770691050385081e-05, + "loss": 3.0815, + "step": 12056 + }, + { + "epoch": 0.5613520497241428, + "grad_norm": 0.3245986284293057, + "learning_rate": 9.770609952897676e-05, + "loss": 3.1162, + "step": 12057 + }, + { + "epoch": 0.5613986079102358, + "grad_norm": 0.35656954128972423, + "learning_rate": 9.770528841409002e-05, + "loss": 3.1662, + "step": 12058 + }, + { + "epoch": 0.5614451660963289, + "grad_norm": 0.3711790371371668, + "learning_rate": 9.770447715919295e-05, + "loss": 3.1148, + "step": 12059 + }, + { + "epoch": 0.561491724282422, + "grad_norm": 0.34007993691751154, + "learning_rate": 9.770366576428793e-05, + "loss": 3.2013, + "step": 12060 + }, + { + "epoch": 0.561538282468515, + "grad_norm": 0.38922297552274143, + "learning_rate": 9.770285422937735e-05, + "loss": 3.1553, + "step": 12061 + }, + { + "epoch": 0.5615848406546081, + "grad_norm": 0.35552049751806036, + "learning_rate": 9.77020425544636e-05, + "loss": 2.9661, + "step": 12062 + }, + { + "epoch": 0.5616313988407011, + "grad_norm": 0.3706051963727661, + "learning_rate": 9.770123073954905e-05, + "loss": 3.1054, + "step": 12063 + }, + { + "epoch": 0.5616779570267942, + "grad_norm": 0.35425440852394685, + "learning_rate": 9.77004187846361e-05, + "loss": 3.1172, + "step": 12064 + }, + { + "epoch": 0.5617245152128874, + "grad_norm": 0.3785862117954871, + "learning_rate": 9.76996066897271e-05, + "loss": 2.9983, + "step": 12065 + }, + { + "epoch": 0.5617710733989804, + "grad_norm": 0.36291654041733556, + "learning_rate": 9.769879445482448e-05, + "loss": 3.0989, + "step": 12066 + }, + { + "epoch": 0.5618176315850735, + "grad_norm": 0.39255292609072917, + "learning_rate": 9.769798207993057e-05, + "loss": 3.0925, + "step": 12067 + }, + { + "epoch": 0.5618641897711665, + "grad_norm": 0.3687132128959612, + "learning_rate": 9.769716956504779e-05, + "loss": 3.1999, + "step": 12068 + }, + { + "epoch": 0.5619107479572596, + "grad_norm": 0.38956082810816156, + "learning_rate": 9.769635691017853e-05, + "loss": 3.1793, + "step": 12069 + }, + { + "epoch": 0.5619573061433527, + "grad_norm": 0.36593360838922023, + "learning_rate": 9.769554411532515e-05, + "loss": 3.1296, + "step": 12070 + }, + { + "epoch": 0.5620038643294457, + "grad_norm": 0.3509703010431074, + "learning_rate": 9.769473118049005e-05, + "loss": 3.1756, + "step": 12071 + }, + { + "epoch": 0.5620504225155388, + "grad_norm": 0.35766513613164164, + "learning_rate": 9.76939181056756e-05, + "loss": 3.0657, + "step": 12072 + }, + { + "epoch": 0.5620969807016318, + "grad_norm": 0.36777441398421695, + "learning_rate": 9.769310489088421e-05, + "loss": 3.1898, + "step": 12073 + }, + { + "epoch": 0.562143538887725, + "grad_norm": 0.3392912397613842, + "learning_rate": 9.769229153611826e-05, + "loss": 3.1532, + "step": 12074 + }, + { + "epoch": 0.562190097073818, + "grad_norm": 0.3519829522227483, + "learning_rate": 9.769147804138012e-05, + "loss": 3.1525, + "step": 12075 + }, + { + "epoch": 0.5622366552599111, + "grad_norm": 0.3213804071876685, + "learning_rate": 9.769066440667221e-05, + "loss": 3.0554, + "step": 12076 + }, + { + "epoch": 
0.5622832134460042, + "grad_norm": 0.34304405034280167, + "learning_rate": 9.768985063199688e-05, + "loss": 3.1659, + "step": 12077 + }, + { + "epoch": 0.5623297716320972, + "grad_norm": 0.3415716098792204, + "learning_rate": 9.768903671735654e-05, + "loss": 3.1966, + "step": 12078 + }, + { + "epoch": 0.5623763298181903, + "grad_norm": 0.383473136772659, + "learning_rate": 9.76882226627536e-05, + "loss": 3.1579, + "step": 12079 + }, + { + "epoch": 0.5624228880042833, + "grad_norm": 0.3386427407939862, + "learning_rate": 9.768740846819038e-05, + "loss": 2.9442, + "step": 12080 + }, + { + "epoch": 0.5624694461903764, + "grad_norm": 0.3749772398662697, + "learning_rate": 9.768659413366935e-05, + "loss": 3.1134, + "step": 12081 + }, + { + "epoch": 0.5625160043764695, + "grad_norm": 0.37115360288348775, + "learning_rate": 9.768577965919285e-05, + "loss": 3.0528, + "step": 12082 + }, + { + "epoch": 0.5625625625625625, + "grad_norm": 0.40223961340022646, + "learning_rate": 9.768496504476328e-05, + "loss": 3.2123, + "step": 12083 + }, + { + "epoch": 0.5626091207486557, + "grad_norm": 0.390619261992695, + "learning_rate": 9.768415029038305e-05, + "loss": 3.1638, + "step": 12084 + }, + { + "epoch": 0.5626556789347487, + "grad_norm": 0.3705621973458564, + "learning_rate": 9.768333539605452e-05, + "loss": 3.0716, + "step": 12085 + }, + { + "epoch": 0.5627022371208418, + "grad_norm": 0.39652792701146683, + "learning_rate": 9.76825203617801e-05, + "loss": 3.1232, + "step": 12086 + }, + { + "epoch": 0.5627487953069349, + "grad_norm": 0.36631396008166917, + "learning_rate": 9.76817051875622e-05, + "loss": 3.1141, + "step": 12087 + }, + { + "epoch": 0.5627953534930279, + "grad_norm": 0.38734361284996827, + "learning_rate": 9.768088987340318e-05, + "loss": 3.1333, + "step": 12088 + }, + { + "epoch": 0.562841911679121, + "grad_norm": 0.37085626294779933, + "learning_rate": 9.768007441930545e-05, + "loss": 3.1817, + "step": 12089 + }, + { + "epoch": 0.562888469865214, + "grad_norm": 0.36922941382358143, + "learning_rate": 9.767925882527138e-05, + "loss": 3.0809, + "step": 12090 + }, + { + "epoch": 0.5629350280513071, + "grad_norm": 0.40779954026555876, + "learning_rate": 9.767844309130339e-05, + "loss": 3.0925, + "step": 12091 + }, + { + "epoch": 0.5629815862374002, + "grad_norm": 0.3953271828012359, + "learning_rate": 9.767762721740388e-05, + "loss": 3.0457, + "step": 12092 + }, + { + "epoch": 0.5630281444234932, + "grad_norm": 0.3970237189122569, + "learning_rate": 9.767681120357522e-05, + "loss": 3.0136, + "step": 12093 + }, + { + "epoch": 0.5630747026095864, + "grad_norm": 0.38960469895530303, + "learning_rate": 9.767599504981982e-05, + "loss": 3.0506, + "step": 12094 + }, + { + "epoch": 0.5631212607956794, + "grad_norm": 0.4352475729296302, + "learning_rate": 9.767517875614007e-05, + "loss": 3.1454, + "step": 12095 + }, + { + "epoch": 0.5631678189817725, + "grad_norm": 0.5124894757204476, + "learning_rate": 9.767436232253837e-05, + "loss": 3.1982, + "step": 12096 + }, + { + "epoch": 0.5632143771678655, + "grad_norm": 0.42394247230000764, + "learning_rate": 9.767354574901712e-05, + "loss": 3.1318, + "step": 12097 + }, + { + "epoch": 0.5632609353539586, + "grad_norm": 0.37790733795453774, + "learning_rate": 9.767272903557869e-05, + "loss": 3.1141, + "step": 12098 + }, + { + "epoch": 0.5633074935400517, + "grad_norm": 0.4195182777430557, + "learning_rate": 9.767191218222551e-05, + "loss": 3.0185, + "step": 12099 + }, + { + "epoch": 0.5633540517261447, + "grad_norm": 0.4525812441234316, + "learning_rate": 
9.767109518895997e-05, + "loss": 3.1015, + "step": 12100 + }, + { + "epoch": 0.5634006099122378, + "grad_norm": 0.4321324352993404, + "learning_rate": 9.767027805578444e-05, + "loss": 3.1207, + "step": 12101 + }, + { + "epoch": 0.5634471680983308, + "grad_norm": 0.4147025255044536, + "learning_rate": 9.766946078270136e-05, + "loss": 3.2243, + "step": 12102 + }, + { + "epoch": 0.563493726284424, + "grad_norm": 0.4037926791039533, + "learning_rate": 9.766864336971311e-05, + "loss": 3.0798, + "step": 12103 + }, + { + "epoch": 0.5635402844705171, + "grad_norm": 0.40343254593068983, + "learning_rate": 9.766782581682208e-05, + "loss": 3.1652, + "step": 12104 + }, + { + "epoch": 0.5635868426566101, + "grad_norm": 0.41972150363208977, + "learning_rate": 9.766700812403068e-05, + "loss": 3.1217, + "step": 12105 + }, + { + "epoch": 0.5636334008427032, + "grad_norm": 0.3963281434573563, + "learning_rate": 9.76661902913413e-05, + "loss": 3.1128, + "step": 12106 + }, + { + "epoch": 0.5636799590287962, + "grad_norm": 0.4078295078885187, + "learning_rate": 9.766537231875636e-05, + "loss": 3.0971, + "step": 12107 + }, + { + "epoch": 0.5637265172148893, + "grad_norm": 0.3827334763233681, + "learning_rate": 9.766455420627825e-05, + "loss": 3.198, + "step": 12108 + }, + { + "epoch": 0.5637730754009824, + "grad_norm": 0.40427893309170754, + "learning_rate": 9.766373595390935e-05, + "loss": 3.0306, + "step": 12109 + }, + { + "epoch": 0.5638196335870754, + "grad_norm": 0.41046923971542865, + "learning_rate": 9.76629175616521e-05, + "loss": 3.1491, + "step": 12110 + }, + { + "epoch": 0.5638661917731685, + "grad_norm": 0.4085888608677034, + "learning_rate": 9.766209902950886e-05, + "loss": 3.2043, + "step": 12111 + }, + { + "epoch": 0.5639127499592616, + "grad_norm": 0.3923487737499771, + "learning_rate": 9.766128035748207e-05, + "loss": 3.1672, + "step": 12112 + }, + { + "epoch": 0.5639593081453547, + "grad_norm": 0.420705916601118, + "learning_rate": 9.766046154557411e-05, + "loss": 3.0865, + "step": 12113 + }, + { + "epoch": 0.5640058663314478, + "grad_norm": 0.42632538708852974, + "learning_rate": 9.765964259378741e-05, + "loss": 3.1198, + "step": 12114 + }, + { + "epoch": 0.5640524245175408, + "grad_norm": 0.38114447438079163, + "learning_rate": 9.765882350212434e-05, + "loss": 3.1257, + "step": 12115 + }, + { + "epoch": 0.5640989827036339, + "grad_norm": 0.38648366027250275, + "learning_rate": 9.765800427058731e-05, + "loss": 3.0481, + "step": 12116 + }, + { + "epoch": 0.5641455408897269, + "grad_norm": 0.40419837847229567, + "learning_rate": 9.765718489917875e-05, + "loss": 3.1567, + "step": 12117 + }, + { + "epoch": 0.56419209907582, + "grad_norm": 0.3626628221758107, + "learning_rate": 9.765636538790104e-05, + "loss": 3.1491, + "step": 12118 + }, + { + "epoch": 0.564238657261913, + "grad_norm": 0.4112939758718125, + "learning_rate": 9.765554573675659e-05, + "loss": 3.2104, + "step": 12119 + }, + { + "epoch": 0.5642852154480061, + "grad_norm": 0.4110237350400569, + "learning_rate": 9.765472594574781e-05, + "loss": 3.1727, + "step": 12120 + }, + { + "epoch": 0.5643317736340993, + "grad_norm": 0.404758866219191, + "learning_rate": 9.765390601487709e-05, + "loss": 3.2499, + "step": 12121 + }, + { + "epoch": 0.5643783318201923, + "grad_norm": 0.40608558356175756, + "learning_rate": 9.765308594414688e-05, + "loss": 3.2041, + "step": 12122 + }, + { + "epoch": 0.5644248900062854, + "grad_norm": 0.4016974523472391, + "learning_rate": 9.765226573355954e-05, + "loss": 3.1567, + "step": 12123 + }, + { + "epoch": 
0.5644714481923784, + "grad_norm": 0.4148417192688738, + "learning_rate": 9.765144538311749e-05, + "loss": 3.1058, + "step": 12124 + }, + { + "epoch": 0.5645180063784715, + "grad_norm": 0.3883308858377819, + "learning_rate": 9.765062489282314e-05, + "loss": 3.2044, + "step": 12125 + }, + { + "epoch": 0.5645645645645646, + "grad_norm": 0.4136971789354885, + "learning_rate": 9.764980426267893e-05, + "loss": 3.1024, + "step": 12126 + }, + { + "epoch": 0.5646111227506576, + "grad_norm": 0.4759748998573743, + "learning_rate": 9.76489834926872e-05, + "loss": 3.0935, + "step": 12127 + }, + { + "epoch": 0.5646576809367507, + "grad_norm": 0.3597067989772291, + "learning_rate": 9.764816258285042e-05, + "loss": 3.0291, + "step": 12128 + }, + { + "epoch": 0.5647042391228437, + "grad_norm": 0.46285610016945716, + "learning_rate": 9.764734153317098e-05, + "loss": 3.0912, + "step": 12129 + }, + { + "epoch": 0.5647507973089368, + "grad_norm": 0.39852359068987286, + "learning_rate": 9.764652034365127e-05, + "loss": 3.0702, + "step": 12130 + }, + { + "epoch": 0.56479735549503, + "grad_norm": 0.3982601658951562, + "learning_rate": 9.764569901429372e-05, + "loss": 3.2356, + "step": 12131 + }, + { + "epoch": 0.564843913681123, + "grad_norm": 0.40538560327258866, + "learning_rate": 9.764487754510076e-05, + "loss": 3.0854, + "step": 12132 + }, + { + "epoch": 0.5648904718672161, + "grad_norm": 0.4320313740492047, + "learning_rate": 9.764405593607476e-05, + "loss": 3.1726, + "step": 12133 + }, + { + "epoch": 0.5649370300533091, + "grad_norm": 0.370755743828141, + "learning_rate": 9.764323418721815e-05, + "loss": 3.1591, + "step": 12134 + }, + { + "epoch": 0.5649835882394022, + "grad_norm": 0.4316148329754842, + "learning_rate": 9.764241229853334e-05, + "loss": 3.1096, + "step": 12135 + }, + { + "epoch": 0.5650301464254953, + "grad_norm": 0.4048019789643869, + "learning_rate": 9.764159027002274e-05, + "loss": 3.031, + "step": 12136 + }, + { + "epoch": 0.5650767046115883, + "grad_norm": 0.4222542024251145, + "learning_rate": 9.764076810168877e-05, + "loss": 3.2357, + "step": 12137 + }, + { + "epoch": 0.5651232627976814, + "grad_norm": 0.41937147642318845, + "learning_rate": 9.763994579353384e-05, + "loss": 3.1358, + "step": 12138 + }, + { + "epoch": 0.5651698209837744, + "grad_norm": 0.4357406183797617, + "learning_rate": 9.763912334556037e-05, + "loss": 3.1265, + "step": 12139 + }, + { + "epoch": 0.5652163791698676, + "grad_norm": 0.39168579710222745, + "learning_rate": 9.763830075777077e-05, + "loss": 3.123, + "step": 12140 + }, + { + "epoch": 0.5652629373559606, + "grad_norm": 0.4424874218680947, + "learning_rate": 9.763747803016743e-05, + "loss": 3.0738, + "step": 12141 + }, + { + "epoch": 0.5653094955420537, + "grad_norm": 0.40101676359361554, + "learning_rate": 9.763665516275278e-05, + "loss": 3.0452, + "step": 12142 + }, + { + "epoch": 0.5653560537281468, + "grad_norm": 0.40700158855495394, + "learning_rate": 9.763583215552927e-05, + "loss": 3.1348, + "step": 12143 + }, + { + "epoch": 0.5654026119142398, + "grad_norm": 0.4665802666436636, + "learning_rate": 9.763500900849926e-05, + "loss": 3.1874, + "step": 12144 + }, + { + "epoch": 0.5654491701003329, + "grad_norm": 0.36029062122806166, + "learning_rate": 9.76341857216652e-05, + "loss": 3.2008, + "step": 12145 + }, + { + "epoch": 0.5654957282864259, + "grad_norm": 0.428323473083555, + "learning_rate": 9.763336229502951e-05, + "loss": 3.0867, + "step": 12146 + }, + { + "epoch": 0.565542286472519, + "grad_norm": 0.3866651508919874, + "learning_rate": 
9.763253872859457e-05, + "loss": 3.0951, + "step": 12147 + }, + { + "epoch": 0.5655888446586121, + "grad_norm": 0.37617508455280846, + "learning_rate": 9.763171502236284e-05, + "loss": 3.0971, + "step": 12148 + }, + { + "epoch": 0.5656354028447051, + "grad_norm": 0.3806784460686215, + "learning_rate": 9.763089117633673e-05, + "loss": 3.0272, + "step": 12149 + }, + { + "epoch": 0.5656819610307983, + "grad_norm": 0.3733513403248877, + "learning_rate": 9.763006719051861e-05, + "loss": 3.0509, + "step": 12150 + }, + { + "epoch": 0.5657285192168913, + "grad_norm": 0.38177814407021704, + "learning_rate": 9.762924306491097e-05, + "loss": 3.1118, + "step": 12151 + }, + { + "epoch": 0.5657750774029844, + "grad_norm": 0.34991787549597997, + "learning_rate": 9.762841879951618e-05, + "loss": 3.1648, + "step": 12152 + }, + { + "epoch": 0.5658216355890775, + "grad_norm": 0.361819190489609, + "learning_rate": 9.762759439433667e-05, + "loss": 3.053, + "step": 12153 + }, + { + "epoch": 0.5658681937751705, + "grad_norm": 0.34991925111422045, + "learning_rate": 9.762676984937487e-05, + "loss": 3.1137, + "step": 12154 + }, + { + "epoch": 0.5659147519612636, + "grad_norm": 0.3557542175079512, + "learning_rate": 9.762594516463318e-05, + "loss": 3.0703, + "step": 12155 + }, + { + "epoch": 0.5659613101473566, + "grad_norm": 0.3381055542871566, + "learning_rate": 9.762512034011404e-05, + "loss": 3.263, + "step": 12156 + }, + { + "epoch": 0.5660078683334497, + "grad_norm": 0.38977835144848005, + "learning_rate": 9.762429537581986e-05, + "loss": 3.2147, + "step": 12157 + }, + { + "epoch": 0.5660544265195429, + "grad_norm": 0.38418918941085645, + "learning_rate": 9.762347027175307e-05, + "loss": 3.1219, + "step": 12158 + }, + { + "epoch": 0.5661009847056359, + "grad_norm": 0.37364191561198823, + "learning_rate": 9.76226450279161e-05, + "loss": 3.1021, + "step": 12159 + }, + { + "epoch": 0.566147542891729, + "grad_norm": 0.40690549787111013, + "learning_rate": 9.762181964431133e-05, + "loss": 3.1194, + "step": 12160 + }, + { + "epoch": 0.566194101077822, + "grad_norm": 0.35163227908561434, + "learning_rate": 9.762099412094124e-05, + "loss": 3.1476, + "step": 12161 + }, + { + "epoch": 0.5662406592639151, + "grad_norm": 0.41514723125596514, + "learning_rate": 9.762016845780819e-05, + "loss": 3.084, + "step": 12162 + }, + { + "epoch": 0.5662872174500081, + "grad_norm": 0.39508155065089334, + "learning_rate": 9.761934265491466e-05, + "loss": 3.1092, + "step": 12163 + }, + { + "epoch": 0.5663337756361012, + "grad_norm": 0.3614445428987082, + "learning_rate": 9.761851671226305e-05, + "loss": 3.0333, + "step": 12164 + }, + { + "epoch": 0.5663803338221943, + "grad_norm": 0.41843407804861416, + "learning_rate": 9.761769062985578e-05, + "loss": 3.1673, + "step": 12165 + }, + { + "epoch": 0.5664268920082873, + "grad_norm": 0.3629253853438893, + "learning_rate": 9.761686440769528e-05, + "loss": 3.0877, + "step": 12166 + }, + { + "epoch": 0.5664734501943804, + "grad_norm": 0.4050966270452427, + "learning_rate": 9.761603804578398e-05, + "loss": 3.231, + "step": 12167 + }, + { + "epoch": 0.5665200083804734, + "grad_norm": 0.3705719444548756, + "learning_rate": 9.761521154412428e-05, + "loss": 3.069, + "step": 12168 + }, + { + "epoch": 0.5665665665665666, + "grad_norm": 0.37096577663942654, + "learning_rate": 9.761438490271864e-05, + "loss": 3.0544, + "step": 12169 + }, + { + "epoch": 0.5666131247526597, + "grad_norm": 0.3776845402539608, + "learning_rate": 9.761355812156947e-05, + "loss": 3.0589, + "step": 12170 + }, + { + "epoch": 
0.5666596829387527, + "grad_norm": 0.34949540339021573, + "learning_rate": 9.76127312006792e-05, + "loss": 3.14, + "step": 12171 + }, + { + "epoch": 0.5667062411248458, + "grad_norm": 0.3920730893908253, + "learning_rate": 9.761190414005026e-05, + "loss": 3.0906, + "step": 12172 + }, + { + "epoch": 0.5667527993109388, + "grad_norm": 0.392482981086411, + "learning_rate": 9.761107693968505e-05, + "loss": 3.1596, + "step": 12173 + }, + { + "epoch": 0.5667993574970319, + "grad_norm": 0.39522663983708956, + "learning_rate": 9.761024959958604e-05, + "loss": 3.0584, + "step": 12174 + }, + { + "epoch": 0.566845915683125, + "grad_norm": 0.42371982084775317, + "learning_rate": 9.760942211975563e-05, + "loss": 3.1678, + "step": 12175 + }, + { + "epoch": 0.566892473869218, + "grad_norm": 0.346013796602167, + "learning_rate": 9.760859450019625e-05, + "loss": 3.0882, + "step": 12176 + }, + { + "epoch": 0.5669390320553112, + "grad_norm": 0.4121854312117624, + "learning_rate": 9.760776674091035e-05, + "loss": 3.0747, + "step": 12177 + }, + { + "epoch": 0.5669855902414042, + "grad_norm": 0.39001510685708124, + "learning_rate": 9.760693884190033e-05, + "loss": 3.1015, + "step": 12178 + }, + { + "epoch": 0.5670321484274973, + "grad_norm": 0.4281590877984734, + "learning_rate": 9.760611080316864e-05, + "loss": 3.1279, + "step": 12179 + }, + { + "epoch": 0.5670787066135904, + "grad_norm": 0.36914804421020747, + "learning_rate": 9.760528262471771e-05, + "loss": 3.0538, + "step": 12180 + }, + { + "epoch": 0.5671252647996834, + "grad_norm": 0.39798956291028875, + "learning_rate": 9.760445430654996e-05, + "loss": 3.1913, + "step": 12181 + }, + { + "epoch": 0.5671718229857765, + "grad_norm": 0.3682511628024469, + "learning_rate": 9.760362584866783e-05, + "loss": 3.0795, + "step": 12182 + }, + { + "epoch": 0.5672183811718695, + "grad_norm": 0.4002064062290272, + "learning_rate": 9.760279725107376e-05, + "loss": 3.1507, + "step": 12183 + }, + { + "epoch": 0.5672649393579626, + "grad_norm": 0.39896749941322746, + "learning_rate": 9.760196851377015e-05, + "loss": 3.1113, + "step": 12184 + }, + { + "epoch": 0.5673114975440556, + "grad_norm": 0.41315545032904094, + "learning_rate": 9.760113963675945e-05, + "loss": 3.166, + "step": 12185 + }, + { + "epoch": 0.5673580557301487, + "grad_norm": 0.39955472863547453, + "learning_rate": 9.76003106200441e-05, + "loss": 3.072, + "step": 12186 + }, + { + "epoch": 0.5674046139162419, + "grad_norm": 0.40595524984239306, + "learning_rate": 9.759948146362655e-05, + "loss": 3.0424, + "step": 12187 + }, + { + "epoch": 0.5674511721023349, + "grad_norm": 0.39583079208646627, + "learning_rate": 9.759865216750919e-05, + "loss": 3.081, + "step": 12188 + }, + { + "epoch": 0.567497730288428, + "grad_norm": 0.4120669552034934, + "learning_rate": 9.759782273169447e-05, + "loss": 3.134, + "step": 12189 + }, + { + "epoch": 0.567544288474521, + "grad_norm": 0.41132016908693786, + "learning_rate": 9.759699315618483e-05, + "loss": 3.1359, + "step": 12190 + }, + { + "epoch": 0.5675908466606141, + "grad_norm": 0.41963219941323177, + "learning_rate": 9.759616344098272e-05, + "loss": 3.1698, + "step": 12191 + }, + { + "epoch": 0.5676374048467072, + "grad_norm": 0.43966474465781324, + "learning_rate": 9.759533358609054e-05, + "loss": 3.1548, + "step": 12192 + }, + { + "epoch": 0.5676839630328002, + "grad_norm": 0.4091155817411281, + "learning_rate": 9.759450359151076e-05, + "loss": 3.1256, + "step": 12193 + }, + { + "epoch": 0.5677305212188933, + "grad_norm": 0.39565356855804357, + "learning_rate": 
9.759367345724579e-05, + "loss": 3.0904, + "step": 12194 + }, + { + "epoch": 0.5677770794049863, + "grad_norm": 0.47174645795713216, + "learning_rate": 9.759284318329808e-05, + "loss": 3.0953, + "step": 12195 + }, + { + "epoch": 0.5678236375910795, + "grad_norm": 0.3613284889543051, + "learning_rate": 9.759201276967007e-05, + "loss": 3.1144, + "step": 12196 + }, + { + "epoch": 0.5678701957771726, + "grad_norm": 0.3908522972053139, + "learning_rate": 9.759118221636418e-05, + "loss": 3.1316, + "step": 12197 + }, + { + "epoch": 0.5679167539632656, + "grad_norm": 0.3968419137939651, + "learning_rate": 9.759035152338288e-05, + "loss": 3.0905, + "step": 12198 + }, + { + "epoch": 0.5679633121493587, + "grad_norm": 0.38074360681258934, + "learning_rate": 9.758952069072857e-05, + "loss": 3.0584, + "step": 12199 + }, + { + "epoch": 0.5680098703354517, + "grad_norm": 0.33799657320630494, + "learning_rate": 9.758868971840372e-05, + "loss": 3.0968, + "step": 12200 + }, + { + "epoch": 0.5680564285215448, + "grad_norm": 0.3724768031863361, + "learning_rate": 9.758785860641073e-05, + "loss": 3.0904, + "step": 12201 + }, + { + "epoch": 0.5681029867076379, + "grad_norm": 0.37704997678447594, + "learning_rate": 9.758702735475208e-05, + "loss": 3.052, + "step": 12202 + }, + { + "epoch": 0.5681495448937309, + "grad_norm": 0.35839487176086565, + "learning_rate": 9.758619596343019e-05, + "loss": 3.0847, + "step": 12203 + }, + { + "epoch": 0.568196103079824, + "grad_norm": 0.3708252868275835, + "learning_rate": 9.75853644324475e-05, + "loss": 3.0884, + "step": 12204 + }, + { + "epoch": 0.568242661265917, + "grad_norm": 0.32552908179987233, + "learning_rate": 9.758453276180645e-05, + "loss": 3.0119, + "step": 12205 + }, + { + "epoch": 0.5682892194520102, + "grad_norm": 0.329867683675975, + "learning_rate": 9.758370095150949e-05, + "loss": 2.9724, + "step": 12206 + }, + { + "epoch": 0.5683357776381032, + "grad_norm": 0.33588187547291887, + "learning_rate": 9.758286900155905e-05, + "loss": 3.1315, + "step": 12207 + }, + { + "epoch": 0.5683823358241963, + "grad_norm": 0.33833736222742594, + "learning_rate": 9.758203691195758e-05, + "loss": 3.219, + "step": 12208 + }, + { + "epoch": 0.5684288940102894, + "grad_norm": 0.3464034341768651, + "learning_rate": 9.758120468270752e-05, + "loss": 3.2526, + "step": 12209 + }, + { + "epoch": 0.5684754521963824, + "grad_norm": 0.3359983392202982, + "learning_rate": 9.758037231381132e-05, + "loss": 3.1473, + "step": 12210 + }, + { + "epoch": 0.5685220103824755, + "grad_norm": 0.3386419321348561, + "learning_rate": 9.75795398052714e-05, + "loss": 3.1042, + "step": 12211 + }, + { + "epoch": 0.5685685685685685, + "grad_norm": 0.3762977295116272, + "learning_rate": 9.757870715709023e-05, + "loss": 3.1062, + "step": 12212 + }, + { + "epoch": 0.5686151267546616, + "grad_norm": 0.3692103868957484, + "learning_rate": 9.757787436927024e-05, + "loss": 3.1037, + "step": 12213 + }, + { + "epoch": 0.5686616849407548, + "grad_norm": 0.35953391833139575, + "learning_rate": 9.757704144181387e-05, + "loss": 3.0026, + "step": 12214 + }, + { + "epoch": 0.5687082431268478, + "grad_norm": 0.3753111021689496, + "learning_rate": 9.757620837472357e-05, + "loss": 3.1689, + "step": 12215 + }, + { + "epoch": 0.5687548013129409, + "grad_norm": 0.36907778620614606, + "learning_rate": 9.75753751680018e-05, + "loss": 3.07, + "step": 12216 + }, + { + "epoch": 0.5688013594990339, + "grad_norm": 0.3876825225889835, + "learning_rate": 9.757454182165097e-05, + "loss": 3.0856, + "step": 12217 + }, + { + "epoch": 
0.568847917685127, + "grad_norm": 0.3374660299884765, + "learning_rate": 9.757370833567356e-05, + "loss": 3.0312, + "step": 12218 + }, + { + "epoch": 0.5688944758712201, + "grad_norm": 0.36493375704704484, + "learning_rate": 9.757287471007199e-05, + "loss": 2.993, + "step": 12219 + }, + { + "epoch": 0.5689410340573131, + "grad_norm": 0.3506354637975605, + "learning_rate": 9.757204094484873e-05, + "loss": 3.0402, + "step": 12220 + }, + { + "epoch": 0.5689875922434062, + "grad_norm": 0.38749045008151245, + "learning_rate": 9.757120704000622e-05, + "loss": 3.0341, + "step": 12221 + }, + { + "epoch": 0.5690341504294992, + "grad_norm": 0.3845651468394179, + "learning_rate": 9.75703729955469e-05, + "loss": 3.1768, + "step": 12222 + }, + { + "epoch": 0.5690807086155923, + "grad_norm": 0.372521098149137, + "learning_rate": 9.756953881147322e-05, + "loss": 2.9988, + "step": 12223 + }, + { + "epoch": 0.5691272668016855, + "grad_norm": 0.3603257489239104, + "learning_rate": 9.756870448778763e-05, + "loss": 3.0826, + "step": 12224 + }, + { + "epoch": 0.5691738249877785, + "grad_norm": 0.41907374962662375, + "learning_rate": 9.756787002449256e-05, + "loss": 3.1857, + "step": 12225 + }, + { + "epoch": 0.5692203831738716, + "grad_norm": 0.3793469277900127, + "learning_rate": 9.756703542159052e-05, + "loss": 3.1425, + "step": 12226 + }, + { + "epoch": 0.5692669413599646, + "grad_norm": 0.33467766324041276, + "learning_rate": 9.756620067908389e-05, + "loss": 3.0915, + "step": 12227 + }, + { + "epoch": 0.5693134995460577, + "grad_norm": 0.33675653928694604, + "learning_rate": 9.756536579697515e-05, + "loss": 3.1915, + "step": 12228 + }, + { + "epoch": 0.5693600577321507, + "grad_norm": 0.3682527293207386, + "learning_rate": 9.756453077526675e-05, + "loss": 3.1833, + "step": 12229 + }, + { + "epoch": 0.5694066159182438, + "grad_norm": 0.30315555409188283, + "learning_rate": 9.756369561396113e-05, + "loss": 2.9941, + "step": 12230 + }, + { + "epoch": 0.5694531741043369, + "grad_norm": 0.34361658195718103, + "learning_rate": 9.756286031306075e-05, + "loss": 3.0942, + "step": 12231 + }, + { + "epoch": 0.5694997322904299, + "grad_norm": 0.3527651851803317, + "learning_rate": 9.756202487256807e-05, + "loss": 3.0477, + "step": 12232 + }, + { + "epoch": 0.569546290476523, + "grad_norm": 0.3272922259933105, + "learning_rate": 9.756118929248553e-05, + "loss": 3.1292, + "step": 12233 + }, + { + "epoch": 0.5695928486626161, + "grad_norm": 0.3550450453772726, + "learning_rate": 9.756035357281559e-05, + "loss": 3.1008, + "step": 12234 + }, + { + "epoch": 0.5696394068487092, + "grad_norm": 0.32207761112378674, + "learning_rate": 9.75595177135607e-05, + "loss": 3.0661, + "step": 12235 + }, + { + "epoch": 0.5696859650348023, + "grad_norm": 0.3405655147744967, + "learning_rate": 9.755868171472329e-05, + "loss": 3.0066, + "step": 12236 + }, + { + "epoch": 0.5697325232208953, + "grad_norm": 0.34069660851934486, + "learning_rate": 9.755784557630585e-05, + "loss": 3.1965, + "step": 12237 + }, + { + "epoch": 0.5697790814069884, + "grad_norm": 0.37966980182750765, + "learning_rate": 9.755700929831082e-05, + "loss": 3.108, + "step": 12238 + }, + { + "epoch": 0.5698256395930814, + "grad_norm": 0.34977276312310246, + "learning_rate": 9.755617288074063e-05, + "loss": 3.1812, + "step": 12239 + }, + { + "epoch": 0.5698721977791745, + "grad_norm": 0.3506560909868185, + "learning_rate": 9.75553363235978e-05, + "loss": 3.2063, + "step": 12240 + }, + { + "epoch": 0.5699187559652676, + "grad_norm": 0.39003552341473224, + "learning_rate": 
9.75544996268847e-05, + "loss": 2.9115, + "step": 12241 + }, + { + "epoch": 0.5699653141513606, + "grad_norm": 0.3673258985727232, + "learning_rate": 9.755366279060385e-05, + "loss": 3.0816, + "step": 12242 + }, + { + "epoch": 0.5700118723374538, + "grad_norm": 0.37113867097012476, + "learning_rate": 9.755282581475769e-05, + "loss": 3.1146, + "step": 12243 + }, + { + "epoch": 0.5700584305235468, + "grad_norm": 0.3502995855258525, + "learning_rate": 9.755198869934866e-05, + "loss": 3.1039, + "step": 12244 + }, + { + "epoch": 0.5701049887096399, + "grad_norm": 0.3556954258795264, + "learning_rate": 9.755115144437923e-05, + "loss": 3.2306, + "step": 12245 + }, + { + "epoch": 0.570151546895733, + "grad_norm": 0.34652107585818026, + "learning_rate": 9.755031404985185e-05, + "loss": 3.0414, + "step": 12246 + }, + { + "epoch": 0.570198105081826, + "grad_norm": 0.3656344536698709, + "learning_rate": 9.754947651576899e-05, + "loss": 3.2082, + "step": 12247 + }, + { + "epoch": 0.5702446632679191, + "grad_norm": 0.41415299821319584, + "learning_rate": 9.75486388421331e-05, + "loss": 3.1976, + "step": 12248 + }, + { + "epoch": 0.5702912214540121, + "grad_norm": 0.32499570893997626, + "learning_rate": 9.754780102894664e-05, + "loss": 3.2237, + "step": 12249 + }, + { + "epoch": 0.5703377796401052, + "grad_norm": 0.357064752282648, + "learning_rate": 9.754696307621207e-05, + "loss": 3.1597, + "step": 12250 + }, + { + "epoch": 0.5703843378261982, + "grad_norm": 0.3784914767410281, + "learning_rate": 9.754612498393185e-05, + "loss": 3.0571, + "step": 12251 + }, + { + "epoch": 0.5704308960122914, + "grad_norm": 0.39297093584064136, + "learning_rate": 9.754528675210843e-05, + "loss": 3.1923, + "step": 12252 + }, + { + "epoch": 0.5704774541983845, + "grad_norm": 0.3532989300255618, + "learning_rate": 9.754444838074426e-05, + "loss": 3.1528, + "step": 12253 + }, + { + "epoch": 0.5705240123844775, + "grad_norm": 0.42359256268659856, + "learning_rate": 9.754360986984184e-05, + "loss": 3.1731, + "step": 12254 + }, + { + "epoch": 0.5705705705705706, + "grad_norm": 0.3934576167738401, + "learning_rate": 9.75427712194036e-05, + "loss": 3.129, + "step": 12255 + }, + { + "epoch": 0.5706171287566636, + "grad_norm": 0.3669367008231158, + "learning_rate": 9.754193242943203e-05, + "loss": 3.0647, + "step": 12256 + }, + { + "epoch": 0.5706636869427567, + "grad_norm": 0.37088655818219624, + "learning_rate": 9.754109349992955e-05, + "loss": 3.1794, + "step": 12257 + }, + { + "epoch": 0.5707102451288498, + "grad_norm": 0.40831682956203613, + "learning_rate": 9.754025443089866e-05, + "loss": 3.1515, + "step": 12258 + }, + { + "epoch": 0.5707568033149428, + "grad_norm": 0.3681796556163572, + "learning_rate": 9.75394152223418e-05, + "loss": 3.1556, + "step": 12259 + }, + { + "epoch": 0.5708033615010359, + "grad_norm": 0.41328988589582605, + "learning_rate": 9.753857587426143e-05, + "loss": 3.0605, + "step": 12260 + }, + { + "epoch": 0.570849919687129, + "grad_norm": 0.40273585283196894, + "learning_rate": 9.753773638666004e-05, + "loss": 3.1317, + "step": 12261 + }, + { + "epoch": 0.5708964778732221, + "grad_norm": 0.44231566052404253, + "learning_rate": 9.753689675954006e-05, + "loss": 3.0901, + "step": 12262 + }, + { + "epoch": 0.5709430360593152, + "grad_norm": 0.3873713483183478, + "learning_rate": 9.753605699290398e-05, + "loss": 3.0482, + "step": 12263 + }, + { + "epoch": 0.5709895942454082, + "grad_norm": 0.38765603884908373, + "learning_rate": 9.753521708675426e-05, + "loss": 3.1899, + "step": 12264 + }, + { + "epoch": 
0.5710361524315013, + "grad_norm": 0.41166777481689615, + "learning_rate": 9.753437704109337e-05, + "loss": 3.0956, + "step": 12265 + }, + { + "epoch": 0.5710827106175943, + "grad_norm": 0.4470079262207784, + "learning_rate": 9.753353685592375e-05, + "loss": 3.0673, + "step": 12266 + }, + { + "epoch": 0.5711292688036874, + "grad_norm": 0.4391105775753388, + "learning_rate": 9.753269653124788e-05, + "loss": 3.1517, + "step": 12267 + }, + { + "epoch": 0.5711758269897805, + "grad_norm": 0.41710127862759505, + "learning_rate": 9.753185606706822e-05, + "loss": 3.1553, + "step": 12268 + }, + { + "epoch": 0.5712223851758735, + "grad_norm": 0.41760395113081095, + "learning_rate": 9.753101546338727e-05, + "loss": 3.1154, + "step": 12269 + }, + { + "epoch": 0.5712689433619667, + "grad_norm": 0.44756558867912183, + "learning_rate": 9.753017472020746e-05, + "loss": 3.1613, + "step": 12270 + }, + { + "epoch": 0.5713155015480597, + "grad_norm": 0.3704144490161778, + "learning_rate": 9.752933383753126e-05, + "loss": 3.0502, + "step": 12271 + }, + { + "epoch": 0.5713620597341528, + "grad_norm": 0.4339227006938671, + "learning_rate": 9.752849281536116e-05, + "loss": 3.1381, + "step": 12272 + }, + { + "epoch": 0.5714086179202458, + "grad_norm": 0.36882825165155875, + "learning_rate": 9.752765165369962e-05, + "loss": 3.1344, + "step": 12273 + }, + { + "epoch": 0.5714551761063389, + "grad_norm": 0.39474526137910304, + "learning_rate": 9.752681035254909e-05, + "loss": 3.1433, + "step": 12274 + }, + { + "epoch": 0.571501734292432, + "grad_norm": 0.3783483619853854, + "learning_rate": 9.752596891191207e-05, + "loss": 3.0428, + "step": 12275 + }, + { + "epoch": 0.571548292478525, + "grad_norm": 0.42388807492575953, + "learning_rate": 9.7525127331791e-05, + "loss": 2.975, + "step": 12276 + }, + { + "epoch": 0.5715948506646181, + "grad_norm": 0.4261701175005446, + "learning_rate": 9.752428561218838e-05, + "loss": 3.1344, + "step": 12277 + }, + { + "epoch": 0.5716414088507111, + "grad_norm": 0.3825580627901477, + "learning_rate": 9.752344375310664e-05, + "loss": 3.1479, + "step": 12278 + }, + { + "epoch": 0.5716879670368042, + "grad_norm": 0.3753411625344951, + "learning_rate": 9.75226017545483e-05, + "loss": 3.0247, + "step": 12279 + }, + { + "epoch": 0.5717345252228974, + "grad_norm": 0.375075061839273, + "learning_rate": 9.752175961651578e-05, + "loss": 3.1431, + "step": 12280 + }, + { + "epoch": 0.5717810834089904, + "grad_norm": 0.36230170717799254, + "learning_rate": 9.75209173390116e-05, + "loss": 3.0845, + "step": 12281 + }, + { + "epoch": 0.5718276415950835, + "grad_norm": 0.37507352755861856, + "learning_rate": 9.752007492203819e-05, + "loss": 3.0221, + "step": 12282 + }, + { + "epoch": 0.5718741997811765, + "grad_norm": 0.35635723566896654, + "learning_rate": 9.751923236559806e-05, + "loss": 3.0236, + "step": 12283 + }, + { + "epoch": 0.5719207579672696, + "grad_norm": 0.36661169819438844, + "learning_rate": 9.751838966969364e-05, + "loss": 3.1019, + "step": 12284 + }, + { + "epoch": 0.5719673161533627, + "grad_norm": 0.3911866407068733, + "learning_rate": 9.751754683432746e-05, + "loss": 3.179, + "step": 12285 + }, + { + "epoch": 0.5720138743394557, + "grad_norm": 0.3788225597839738, + "learning_rate": 9.751670385950194e-05, + "loss": 3.1596, + "step": 12286 + }, + { + "epoch": 0.5720604325255488, + "grad_norm": 0.38185280497864865, + "learning_rate": 9.751586074521958e-05, + "loss": 3.0852, + "step": 12287 + }, + { + "epoch": 0.5721069907116418, + "grad_norm": 0.45483702158652906, + "learning_rate": 
9.751501749148283e-05, + "loss": 3.1522, + "step": 12288 + }, + { + "epoch": 0.572153548897735, + "grad_norm": 0.44820690224016435, + "learning_rate": 9.751417409829421e-05, + "loss": 3.2335, + "step": 12289 + }, + { + "epoch": 0.5722001070838281, + "grad_norm": 0.325099347500606, + "learning_rate": 9.751333056565617e-05, + "loss": 3.0882, + "step": 12290 + }, + { + "epoch": 0.5722466652699211, + "grad_norm": 0.36385279785388785, + "learning_rate": 9.751248689357118e-05, + "loss": 3.0718, + "step": 12291 + }, + { + "epoch": 0.5722932234560142, + "grad_norm": 0.3590360059350176, + "learning_rate": 9.751164308204173e-05, + "loss": 3.0461, + "step": 12292 + }, + { + "epoch": 0.5723397816421072, + "grad_norm": 0.3945564544901794, + "learning_rate": 9.751079913107027e-05, + "loss": 3.0883, + "step": 12293 + }, + { + "epoch": 0.5723863398282003, + "grad_norm": 0.3559467443958551, + "learning_rate": 9.750995504065931e-05, + "loss": 3.0689, + "step": 12294 + }, + { + "epoch": 0.5724328980142933, + "grad_norm": 0.37382653952497047, + "learning_rate": 9.750911081081131e-05, + "loss": 3.0384, + "step": 12295 + }, + { + "epoch": 0.5724794562003864, + "grad_norm": 0.37782173702828786, + "learning_rate": 9.750826644152874e-05, + "loss": 3.1301, + "step": 12296 + }, + { + "epoch": 0.5725260143864795, + "grad_norm": 0.357723800520132, + "learning_rate": 9.75074219328141e-05, + "loss": 3.1061, + "step": 12297 + }, + { + "epoch": 0.5725725725725725, + "grad_norm": 0.40710429803755843, + "learning_rate": 9.750657728466985e-05, + "loss": 3.0601, + "step": 12298 + }, + { + "epoch": 0.5726191307586657, + "grad_norm": 0.3795440323727616, + "learning_rate": 9.750573249709848e-05, + "loss": 3.0573, + "step": 12299 + }, + { + "epoch": 0.5726656889447587, + "grad_norm": 0.35928239261324835, + "learning_rate": 9.750488757010245e-05, + "loss": 3.1099, + "step": 12300 + }, + { + "epoch": 0.5727122471308518, + "grad_norm": 0.3575921561006557, + "learning_rate": 9.750404250368428e-05, + "loss": 3.0819, + "step": 12301 + }, + { + "epoch": 0.5727588053169449, + "grad_norm": 0.37321678023989396, + "learning_rate": 9.75031972978464e-05, + "loss": 3.1502, + "step": 12302 + }, + { + "epoch": 0.5728053635030379, + "grad_norm": 0.3553220944262455, + "learning_rate": 9.750235195259134e-05, + "loss": 3.2014, + "step": 12303 + }, + { + "epoch": 0.572851921689131, + "grad_norm": 0.3422333627028093, + "learning_rate": 9.750150646792156e-05, + "loss": 3.099, + "step": 12304 + }, + { + "epoch": 0.572898479875224, + "grad_norm": 0.3610088013417951, + "learning_rate": 9.750066084383951e-05, + "loss": 3.0273, + "step": 12305 + }, + { + "epoch": 0.5729450380613171, + "grad_norm": 0.3452061902507573, + "learning_rate": 9.749981508034771e-05, + "loss": 3.1306, + "step": 12306 + }, + { + "epoch": 0.5729915962474103, + "grad_norm": 0.39603676389657416, + "learning_rate": 9.749896917744864e-05, + "loss": 3.1186, + "step": 12307 + }, + { + "epoch": 0.5730381544335033, + "grad_norm": 0.42418448507613754, + "learning_rate": 9.749812313514477e-05, + "loss": 3.1869, + "step": 12308 + }, + { + "epoch": 0.5730847126195964, + "grad_norm": 0.44045002075833645, + "learning_rate": 9.74972769534386e-05, + "loss": 3.2096, + "step": 12309 + }, + { + "epoch": 0.5731312708056894, + "grad_norm": 0.4105456531741394, + "learning_rate": 9.749643063233258e-05, + "loss": 3.1683, + "step": 12310 + }, + { + "epoch": 0.5731778289917825, + "grad_norm": 0.3785167819068121, + "learning_rate": 9.749558417182922e-05, + "loss": 3.0877, + "step": 12311 + }, + { + "epoch": 
0.5732243871778756, + "grad_norm": 0.4227710857893554, + "learning_rate": 9.749473757193102e-05, + "loss": 3.1182, + "step": 12312 + }, + { + "epoch": 0.5732709453639686, + "grad_norm": 0.34677147057496543, + "learning_rate": 9.749389083264043e-05, + "loss": 2.9509, + "step": 12313 + }, + { + "epoch": 0.5733175035500617, + "grad_norm": 0.39666257525779713, + "learning_rate": 9.749304395395995e-05, + "loss": 3.0924, + "step": 12314 + }, + { + "epoch": 0.5733640617361547, + "grad_norm": 0.3525853596106996, + "learning_rate": 9.749219693589205e-05, + "loss": 3.1543, + "step": 12315 + }, + { + "epoch": 0.5734106199222478, + "grad_norm": 0.3865717382529169, + "learning_rate": 9.749134977843925e-05, + "loss": 3.1795, + "step": 12316 + }, + { + "epoch": 0.5734571781083408, + "grad_norm": 0.34585077947773, + "learning_rate": 9.7490502481604e-05, + "loss": 3.074, + "step": 12317 + }, + { + "epoch": 0.573503736294434, + "grad_norm": 0.3630644761906359, + "learning_rate": 9.748965504538882e-05, + "loss": 3.116, + "step": 12318 + }, + { + "epoch": 0.5735502944805271, + "grad_norm": 0.37353590074299003, + "learning_rate": 9.748880746979619e-05, + "loss": 3.151, + "step": 12319 + }, + { + "epoch": 0.5735968526666201, + "grad_norm": 0.37399851733119666, + "learning_rate": 9.748795975482857e-05, + "loss": 3.156, + "step": 12320 + }, + { + "epoch": 0.5736434108527132, + "grad_norm": 0.3722128142825858, + "learning_rate": 9.748711190048847e-05, + "loss": 2.9679, + "step": 12321 + }, + { + "epoch": 0.5736899690388062, + "grad_norm": 0.3833991102934722, + "learning_rate": 9.748626390677837e-05, + "loss": 3.1115, + "step": 12322 + }, + { + "epoch": 0.5737365272248993, + "grad_norm": 0.3761112105364059, + "learning_rate": 9.748541577370078e-05, + "loss": 3.1174, + "step": 12323 + }, + { + "epoch": 0.5737830854109924, + "grad_norm": 0.3497687384457187, + "learning_rate": 9.748456750125816e-05, + "loss": 3.0202, + "step": 12324 + }, + { + "epoch": 0.5738296435970854, + "grad_norm": 0.36618465601119804, + "learning_rate": 9.748371908945302e-05, + "loss": 3.1206, + "step": 12325 + }, + { + "epoch": 0.5738762017831786, + "grad_norm": 0.41219818820265663, + "learning_rate": 9.748287053828784e-05, + "loss": 3.1305, + "step": 12326 + }, + { + "epoch": 0.5739227599692716, + "grad_norm": 0.3952320120349933, + "learning_rate": 9.74820218477651e-05, + "loss": 2.9918, + "step": 12327 + }, + { + "epoch": 0.5739693181553647, + "grad_norm": 0.38242725586018667, + "learning_rate": 9.748117301788731e-05, + "loss": 3.1021, + "step": 12328 + }, + { + "epoch": 0.5740158763414578, + "grad_norm": 0.4152432674870371, + "learning_rate": 9.748032404865696e-05, + "loss": 3.1583, + "step": 12329 + }, + { + "epoch": 0.5740624345275508, + "grad_norm": 0.4017074347286361, + "learning_rate": 9.747947494007654e-05, + "loss": 3.1314, + "step": 12330 + }, + { + "epoch": 0.5741089927136439, + "grad_norm": 0.4151073911352179, + "learning_rate": 9.747862569214854e-05, + "loss": 2.94, + "step": 12331 + }, + { + "epoch": 0.5741555508997369, + "grad_norm": 0.4337327333288811, + "learning_rate": 9.747777630487545e-05, + "loss": 3.2153, + "step": 12332 + }, + { + "epoch": 0.57420210908583, + "grad_norm": 0.3257524398322035, + "learning_rate": 9.747692677825975e-05, + "loss": 3.1156, + "step": 12333 + }, + { + "epoch": 0.5742486672719231, + "grad_norm": 0.39184420599984604, + "learning_rate": 9.747607711230396e-05, + "loss": 3.1503, + "step": 12334 + }, + { + "epoch": 0.5742952254580161, + "grad_norm": 0.3973684356349362, + "learning_rate": 
9.747522730701055e-05, + "loss": 3.1283, + "step": 12335 + }, + { + "epoch": 0.5743417836441093, + "grad_norm": 0.36974383196089844, + "learning_rate": 9.747437736238203e-05, + "loss": 3.2328, + "step": 12336 + }, + { + "epoch": 0.5743883418302023, + "grad_norm": 0.3838830903497951, + "learning_rate": 9.74735272784209e-05, + "loss": 3.1358, + "step": 12337 + }, + { + "epoch": 0.5744349000162954, + "grad_norm": 0.3635299525639182, + "learning_rate": 9.747267705512963e-05, + "loss": 3.1493, + "step": 12338 + }, + { + "epoch": 0.5744814582023884, + "grad_norm": 0.4317636186138806, + "learning_rate": 9.747182669251074e-05, + "loss": 3.1009, + "step": 12339 + }, + { + "epoch": 0.5745280163884815, + "grad_norm": 0.44219203867236256, + "learning_rate": 9.747097619056671e-05, + "loss": 3.0866, + "step": 12340 + }, + { + "epoch": 0.5745745745745746, + "grad_norm": 0.3655882871292167, + "learning_rate": 9.747012554930004e-05, + "loss": 3.0372, + "step": 12341 + }, + { + "epoch": 0.5746211327606676, + "grad_norm": 0.4337307203195387, + "learning_rate": 9.746927476871323e-05, + "loss": 3.1652, + "step": 12342 + }, + { + "epoch": 0.5746676909467607, + "grad_norm": 0.4398679493090579, + "learning_rate": 9.746842384880878e-05, + "loss": 3.069, + "step": 12343 + }, + { + "epoch": 0.5747142491328537, + "grad_norm": 0.4123381573731739, + "learning_rate": 9.746757278958918e-05, + "loss": 3.0562, + "step": 12344 + }, + { + "epoch": 0.5747608073189469, + "grad_norm": 0.4295390072236817, + "learning_rate": 9.746672159105692e-05, + "loss": 3.1076, + "step": 12345 + }, + { + "epoch": 0.57480736550504, + "grad_norm": 0.4237221818692835, + "learning_rate": 9.746587025321452e-05, + "loss": 3.1701, + "step": 12346 + }, + { + "epoch": 0.574853923691133, + "grad_norm": 0.3926141412284063, + "learning_rate": 9.746501877606446e-05, + "loss": 3.13, + "step": 12347 + }, + { + "epoch": 0.5749004818772261, + "grad_norm": 0.436697852467503, + "learning_rate": 9.746416715960925e-05, + "loss": 3.1791, + "step": 12348 + }, + { + "epoch": 0.5749470400633191, + "grad_norm": 0.39244705009292874, + "learning_rate": 9.746331540385139e-05, + "loss": 3.049, + "step": 12349 + }, + { + "epoch": 0.5749935982494122, + "grad_norm": 0.4000491790569353, + "learning_rate": 9.746246350879337e-05, + "loss": 2.9977, + "step": 12350 + }, + { + "epoch": 0.5750401564355053, + "grad_norm": 0.48432252988454566, + "learning_rate": 9.746161147443769e-05, + "loss": 2.9806, + "step": 12351 + }, + { + "epoch": 0.5750867146215983, + "grad_norm": 0.3367156181137181, + "learning_rate": 9.746075930078687e-05, + "loss": 3.1173, + "step": 12352 + }, + { + "epoch": 0.5751332728076914, + "grad_norm": 0.42829351753836686, + "learning_rate": 9.745990698784338e-05, + "loss": 3.2003, + "step": 12353 + }, + { + "epoch": 0.5751798309937844, + "grad_norm": 0.40167142420891633, + "learning_rate": 9.745905453560976e-05, + "loss": 3.119, + "step": 12354 + }, + { + "epoch": 0.5752263891798776, + "grad_norm": 0.3526710689372088, + "learning_rate": 9.745820194408847e-05, + "loss": 3.137, + "step": 12355 + }, + { + "epoch": 0.5752729473659707, + "grad_norm": 0.39483787811817317, + "learning_rate": 9.745734921328203e-05, + "loss": 3.0088, + "step": 12356 + }, + { + "epoch": 0.5753195055520637, + "grad_norm": 0.39304420427331266, + "learning_rate": 9.745649634319296e-05, + "loss": 3.1208, + "step": 12357 + }, + { + "epoch": 0.5753660637381568, + "grad_norm": 0.3756612810485336, + "learning_rate": 9.745564333382375e-05, + "loss": 3.0748, + "step": 12358 + }, + { + "epoch": 
0.5754126219242498, + "grad_norm": 0.3898238562929818, + "learning_rate": 9.745479018517689e-05, + "loss": 3.122, + "step": 12359 + }, + { + "epoch": 0.5754591801103429, + "grad_norm": 0.40675859930438335, + "learning_rate": 9.74539368972549e-05, + "loss": 3.1462, + "step": 12360 + }, + { + "epoch": 0.5755057382964359, + "grad_norm": 0.5203437713773923, + "learning_rate": 9.745308347006028e-05, + "loss": 2.9387, + "step": 12361 + }, + { + "epoch": 0.575552296482529, + "grad_norm": 0.4197054851649797, + "learning_rate": 9.745222990359555e-05, + "loss": 3.2146, + "step": 12362 + }, + { + "epoch": 0.5755988546686222, + "grad_norm": 0.4245880470305485, + "learning_rate": 9.745137619786317e-05, + "loss": 3.0444, + "step": 12363 + }, + { + "epoch": 0.5756454128547152, + "grad_norm": 0.4700728009985354, + "learning_rate": 9.74505223528657e-05, + "loss": 3.0755, + "step": 12364 + }, + { + "epoch": 0.5756919710408083, + "grad_norm": 0.36581670493074453, + "learning_rate": 9.744966836860561e-05, + "loss": 3.2142, + "step": 12365 + }, + { + "epoch": 0.5757385292269013, + "grad_norm": 0.4507473145938807, + "learning_rate": 9.744881424508543e-05, + "loss": 3.1561, + "step": 12366 + }, + { + "epoch": 0.5757850874129944, + "grad_norm": 0.4401728243090108, + "learning_rate": 9.744795998230764e-05, + "loss": 3.1904, + "step": 12367 + }, + { + "epoch": 0.5758316455990875, + "grad_norm": 0.41605537040709784, + "learning_rate": 9.744710558027478e-05, + "loss": 3.16, + "step": 12368 + }, + { + "epoch": 0.5758782037851805, + "grad_norm": 0.42164109092218693, + "learning_rate": 9.744625103898932e-05, + "loss": 3.1967, + "step": 12369 + }, + { + "epoch": 0.5759247619712736, + "grad_norm": 0.38121318791245135, + "learning_rate": 9.74453963584538e-05, + "loss": 3.2198, + "step": 12370 + }, + { + "epoch": 0.5759713201573666, + "grad_norm": 0.3887862410754773, + "learning_rate": 9.74445415386707e-05, + "loss": 3.1387, + "step": 12371 + }, + { + "epoch": 0.5760178783434597, + "grad_norm": 0.36809102532950944, + "learning_rate": 9.744368657964257e-05, + "loss": 3.0973, + "step": 12372 + }, + { + "epoch": 0.5760644365295529, + "grad_norm": 0.3539329841059785, + "learning_rate": 9.744283148137188e-05, + "loss": 3.0927, + "step": 12373 + }, + { + "epoch": 0.5761109947156459, + "grad_norm": 0.40230479521226115, + "learning_rate": 9.744197624386116e-05, + "loss": 3.0343, + "step": 12374 + }, + { + "epoch": 0.576157552901739, + "grad_norm": 0.3743254312734769, + "learning_rate": 9.74411208671129e-05, + "loss": 3.2126, + "step": 12375 + }, + { + "epoch": 0.576204111087832, + "grad_norm": 0.4112253834046336, + "learning_rate": 9.744026535112962e-05, + "loss": 3.0904, + "step": 12376 + }, + { + "epoch": 0.5762506692739251, + "grad_norm": 0.39693470306455286, + "learning_rate": 9.743940969591386e-05, + "loss": 2.9538, + "step": 12377 + }, + { + "epoch": 0.5762972274600181, + "grad_norm": 0.38693507178430275, + "learning_rate": 9.743855390146808e-05, + "loss": 3.1812, + "step": 12378 + }, + { + "epoch": 0.5763437856461112, + "grad_norm": 0.43578346894836734, + "learning_rate": 9.743769796779483e-05, + "loss": 3.1324, + "step": 12379 + }, + { + "epoch": 0.5763903438322043, + "grad_norm": 0.3831275466010125, + "learning_rate": 9.743684189489662e-05, + "loss": 3.132, + "step": 12380 + }, + { + "epoch": 0.5764369020182973, + "grad_norm": 0.4112810043199705, + "learning_rate": 9.743598568277594e-05, + "loss": 3.1851, + "step": 12381 + }, + { + "epoch": 0.5764834602043905, + "grad_norm": 0.4095317342087458, + "learning_rate": 
9.743512933143531e-05, + "loss": 3.1894, + "step": 12382 + }, + { + "epoch": 0.5765300183904835, + "grad_norm": 0.4066668852814374, + "learning_rate": 9.743427284087727e-05, + "loss": 3.0568, + "step": 12383 + }, + { + "epoch": 0.5765765765765766, + "grad_norm": 0.4404045065875138, + "learning_rate": 9.743341621110429e-05, + "loss": 3.0537, + "step": 12384 + }, + { + "epoch": 0.5766231347626697, + "grad_norm": 0.3980092548825941, + "learning_rate": 9.74325594421189e-05, + "loss": 3.0457, + "step": 12385 + }, + { + "epoch": 0.5766696929487627, + "grad_norm": 0.3894935876602573, + "learning_rate": 9.743170253392364e-05, + "loss": 3.143, + "step": 12386 + }, + { + "epoch": 0.5767162511348558, + "grad_norm": 0.35350738669621434, + "learning_rate": 9.7430845486521e-05, + "loss": 3.0375, + "step": 12387 + }, + { + "epoch": 0.5767628093209488, + "grad_norm": 0.47604032903821797, + "learning_rate": 9.742998829991351e-05, + "loss": 3.1128, + "step": 12388 + }, + { + "epoch": 0.5768093675070419, + "grad_norm": 0.3818283786141527, + "learning_rate": 9.742913097410367e-05, + "loss": 3.06, + "step": 12389 + }, + { + "epoch": 0.576855925693135, + "grad_norm": 0.40729796197292345, + "learning_rate": 9.7428273509094e-05, + "loss": 3.0829, + "step": 12390 + }, + { + "epoch": 0.576902483879228, + "grad_norm": 0.4075502383356714, + "learning_rate": 9.742741590488701e-05, + "loss": 3.1987, + "step": 12391 + }, + { + "epoch": 0.5769490420653212, + "grad_norm": 0.3797598710656877, + "learning_rate": 9.742655816148525e-05, + "loss": 3.1445, + "step": 12392 + }, + { + "epoch": 0.5769956002514142, + "grad_norm": 0.40465352713944663, + "learning_rate": 9.742570027889121e-05, + "loss": 3.0708, + "step": 12393 + }, + { + "epoch": 0.5770421584375073, + "grad_norm": 0.36517925748993524, + "learning_rate": 9.74248422571074e-05, + "loss": 3.2022, + "step": 12394 + }, + { + "epoch": 0.5770887166236004, + "grad_norm": 0.4116769125912916, + "learning_rate": 9.742398409613636e-05, + "loss": 3.0174, + "step": 12395 + }, + { + "epoch": 0.5771352748096934, + "grad_norm": 0.38442061678244727, + "learning_rate": 9.742312579598059e-05, + "loss": 3.1465, + "step": 12396 + }, + { + "epoch": 0.5771818329957865, + "grad_norm": 0.35970816147824886, + "learning_rate": 9.742226735664262e-05, + "loss": 2.9931, + "step": 12397 + }, + { + "epoch": 0.5772283911818795, + "grad_norm": 0.39388584261025533, + "learning_rate": 9.742140877812497e-05, + "loss": 3.1416, + "step": 12398 + }, + { + "epoch": 0.5772749493679726, + "grad_norm": 0.3736433412046174, + "learning_rate": 9.742055006043016e-05, + "loss": 2.9866, + "step": 12399 + }, + { + "epoch": 0.5773215075540656, + "grad_norm": 0.3647075168182218, + "learning_rate": 9.741969120356069e-05, + "loss": 3.0741, + "step": 12400 + }, + { + "epoch": 0.5773680657401588, + "grad_norm": 0.3969217147926877, + "learning_rate": 9.741883220751912e-05, + "loss": 3.0867, + "step": 12401 + }, + { + "epoch": 0.5774146239262519, + "grad_norm": 0.37431045392803014, + "learning_rate": 9.741797307230793e-05, + "loss": 3.1313, + "step": 12402 + }, + { + "epoch": 0.5774611821123449, + "grad_norm": 0.4173633197913745, + "learning_rate": 9.741711379792968e-05, + "loss": 3.1674, + "step": 12403 + }, + { + "epoch": 0.577507740298438, + "grad_norm": 0.3920041989656567, + "learning_rate": 9.741625438438686e-05, + "loss": 3.1269, + "step": 12404 + }, + { + "epoch": 0.577554298484531, + "grad_norm": 0.404264305507307, + "learning_rate": 9.7415394831682e-05, + "loss": 3.2136, + "step": 12405 + }, + { + "epoch": 
0.5776008566706241, + "grad_norm": 0.4008522102640349, + "learning_rate": 9.741453513981764e-05, + "loss": 3.1418, + "step": 12406 + }, + { + "epoch": 0.5776474148567172, + "grad_norm": 0.4202051520416033, + "learning_rate": 9.741367530879628e-05, + "loss": 3.1404, + "step": 12407 + }, + { + "epoch": 0.5776939730428102, + "grad_norm": 0.4494614952040497, + "learning_rate": 9.741281533862047e-05, + "loss": 3.2484, + "step": 12408 + }, + { + "epoch": 0.5777405312289033, + "grad_norm": 0.3853708889348752, + "learning_rate": 9.741195522929271e-05, + "loss": 3.2034, + "step": 12409 + }, + { + "epoch": 0.5777870894149963, + "grad_norm": 0.39767801268032893, + "learning_rate": 9.741109498081552e-05, + "loss": 3.0398, + "step": 12410 + }, + { + "epoch": 0.5778336476010895, + "grad_norm": 0.3851304045602147, + "learning_rate": 9.741023459319145e-05, + "loss": 3.0988, + "step": 12411 + }, + { + "epoch": 0.5778802057871826, + "grad_norm": 0.39106938539291697, + "learning_rate": 9.740937406642301e-05, + "loss": 3.1446, + "step": 12412 + }, + { + "epoch": 0.5779267639732756, + "grad_norm": 0.3855143057731911, + "learning_rate": 9.740851340051272e-05, + "loss": 3.0811, + "step": 12413 + }, + { + "epoch": 0.5779733221593687, + "grad_norm": 0.38388397070239066, + "learning_rate": 9.740765259546311e-05, + "loss": 3.0509, + "step": 12414 + }, + { + "epoch": 0.5780198803454617, + "grad_norm": 0.3601917308504511, + "learning_rate": 9.740679165127673e-05, + "loss": 3.1183, + "step": 12415 + }, + { + "epoch": 0.5780664385315548, + "grad_norm": 0.4122421022639386, + "learning_rate": 9.740593056795609e-05, + "loss": 3.0412, + "step": 12416 + }, + { + "epoch": 0.5781129967176479, + "grad_norm": 0.3871232933293319, + "learning_rate": 9.740506934550368e-05, + "loss": 3.0411, + "step": 12417 + }, + { + "epoch": 0.5781595549037409, + "grad_norm": 0.3609739964259105, + "learning_rate": 9.74042079839221e-05, + "loss": 3.1906, + "step": 12418 + }, + { + "epoch": 0.578206113089834, + "grad_norm": 0.4180551725904358, + "learning_rate": 9.740334648321381e-05, + "loss": 3.2186, + "step": 12419 + }, + { + "epoch": 0.578252671275927, + "grad_norm": 0.3588744337909685, + "learning_rate": 9.740248484338139e-05, + "loss": 3.0832, + "step": 12420 + }, + { + "epoch": 0.5782992294620202, + "grad_norm": 0.4018794221372898, + "learning_rate": 9.740162306442733e-05, + "loss": 3.1386, + "step": 12421 + }, + { + "epoch": 0.5783457876481132, + "grad_norm": 0.41005237010198264, + "learning_rate": 9.740076114635419e-05, + "loss": 2.9705, + "step": 12422 + }, + { + "epoch": 0.5783923458342063, + "grad_norm": 0.4278226390632847, + "learning_rate": 9.739989908916447e-05, + "loss": 3.2675, + "step": 12423 + }, + { + "epoch": 0.5784389040202994, + "grad_norm": 0.3637622113578155, + "learning_rate": 9.739903689286074e-05, + "loss": 3.1553, + "step": 12424 + }, + { + "epoch": 0.5784854622063924, + "grad_norm": 0.4129415980256248, + "learning_rate": 9.739817455744549e-05, + "loss": 3.0639, + "step": 12425 + }, + { + "epoch": 0.5785320203924855, + "grad_norm": 0.41764806786210446, + "learning_rate": 9.739731208292126e-05, + "loss": 3.1049, + "step": 12426 + }, + { + "epoch": 0.5785785785785785, + "grad_norm": 0.40591799548147534, + "learning_rate": 9.739644946929059e-05, + "loss": 3.0729, + "step": 12427 + }, + { + "epoch": 0.5786251367646716, + "grad_norm": 0.39380888827987864, + "learning_rate": 9.739558671655602e-05, + "loss": 3.2045, + "step": 12428 + }, + { + "epoch": 0.5786716949507648, + "grad_norm": 0.42473575627854443, + "learning_rate": 
9.739472382472007e-05, + "loss": 3.0347, + "step": 12429 + }, + { + "epoch": 0.5787182531368578, + "grad_norm": 0.34392614535996585, + "learning_rate": 9.739386079378527e-05, + "loss": 3.1513, + "step": 12430 + }, + { + "epoch": 0.5787648113229509, + "grad_norm": 0.4056761101736757, + "learning_rate": 9.739299762375417e-05, + "loss": 3.092, + "step": 12431 + }, + { + "epoch": 0.5788113695090439, + "grad_norm": 0.4089644539568757, + "learning_rate": 9.739213431462928e-05, + "loss": 3.1075, + "step": 12432 + }, + { + "epoch": 0.578857927695137, + "grad_norm": 0.4220911798517395, + "learning_rate": 9.739127086641315e-05, + "loss": 3.1608, + "step": 12433 + }, + { + "epoch": 0.5789044858812301, + "grad_norm": 0.4375263391381537, + "learning_rate": 9.739040727910831e-05, + "loss": 3.2579, + "step": 12434 + }, + { + "epoch": 0.5789510440673231, + "grad_norm": 0.4068392155355241, + "learning_rate": 9.738954355271728e-05, + "loss": 3.0234, + "step": 12435 + }, + { + "epoch": 0.5789976022534162, + "grad_norm": 0.40971612796661216, + "learning_rate": 9.738867968724262e-05, + "loss": 3.0616, + "step": 12436 + }, + { + "epoch": 0.5790441604395092, + "grad_norm": 0.3990830368936396, + "learning_rate": 9.738781568268685e-05, + "loss": 3.1287, + "step": 12437 + }, + { + "epoch": 0.5790907186256024, + "grad_norm": 0.3653916807776639, + "learning_rate": 9.73869515390525e-05, + "loss": 3.0477, + "step": 12438 + }, + { + "epoch": 0.5791372768116955, + "grad_norm": 0.3892582944743543, + "learning_rate": 9.738608725634214e-05, + "loss": 3.0494, + "step": 12439 + }, + { + "epoch": 0.5791838349977885, + "grad_norm": 0.374977723296084, + "learning_rate": 9.738522283455826e-05, + "loss": 3.1497, + "step": 12440 + }, + { + "epoch": 0.5792303931838816, + "grad_norm": 0.36218283431826265, + "learning_rate": 9.738435827370343e-05, + "loss": 3.0058, + "step": 12441 + }, + { + "epoch": 0.5792769513699746, + "grad_norm": 0.37612959445726907, + "learning_rate": 9.738349357378018e-05, + "loss": 2.9308, + "step": 12442 + }, + { + "epoch": 0.5793235095560677, + "grad_norm": 0.35955114580410436, + "learning_rate": 9.738262873479103e-05, + "loss": 3.0564, + "step": 12443 + }, + { + "epoch": 0.5793700677421607, + "grad_norm": 0.3521776621942993, + "learning_rate": 9.738176375673855e-05, + "loss": 3.0852, + "step": 12444 + }, + { + "epoch": 0.5794166259282538, + "grad_norm": 0.3996956023024111, + "learning_rate": 9.738089863962526e-05, + "loss": 3.1625, + "step": 12445 + }, + { + "epoch": 0.5794631841143469, + "grad_norm": 0.3479984517455529, + "learning_rate": 9.738003338345368e-05, + "loss": 3.1436, + "step": 12446 + }, + { + "epoch": 0.57950974230044, + "grad_norm": 0.39234642481661486, + "learning_rate": 9.737916798822639e-05, + "loss": 3.1157, + "step": 12447 + }, + { + "epoch": 0.5795563004865331, + "grad_norm": 0.34917981936703424, + "learning_rate": 9.73783024539459e-05, + "loss": 3.1159, + "step": 12448 + }, + { + "epoch": 0.5796028586726261, + "grad_norm": 0.3793799058036768, + "learning_rate": 9.737743678061475e-05, + "loss": 3.0364, + "step": 12449 + }, + { + "epoch": 0.5796494168587192, + "grad_norm": 0.35148072215583137, + "learning_rate": 9.737657096823551e-05, + "loss": 3.1816, + "step": 12450 + }, + { + "epoch": 0.5796959750448123, + "grad_norm": 0.38309191153872146, + "learning_rate": 9.73757050168107e-05, + "loss": 3.1352, + "step": 12451 + }, + { + "epoch": 0.5797425332309053, + "grad_norm": 0.3814695420285262, + "learning_rate": 9.737483892634284e-05, + "loss": 3.0502, + "step": 12452 + }, + { + "epoch": 
0.5797890914169984, + "grad_norm": 0.37538133433391074, + "learning_rate": 9.737397269683453e-05, + "loss": 3.0797, + "step": 12453 + }, + { + "epoch": 0.5798356496030914, + "grad_norm": 0.3436091430852089, + "learning_rate": 9.737310632828826e-05, + "loss": 3.1569, + "step": 12454 + }, + { + "epoch": 0.5798822077891845, + "grad_norm": 0.3784355270055101, + "learning_rate": 9.737223982070659e-05, + "loss": 2.9956, + "step": 12455 + }, + { + "epoch": 0.5799287659752776, + "grad_norm": 0.37680493853949537, + "learning_rate": 9.737137317409206e-05, + "loss": 3.0676, + "step": 12456 + }, + { + "epoch": 0.5799753241613707, + "grad_norm": 0.3462894253659032, + "learning_rate": 9.737050638844723e-05, + "loss": 3.1344, + "step": 12457 + }, + { + "epoch": 0.5800218823474638, + "grad_norm": 0.39619385004245894, + "learning_rate": 9.736963946377461e-05, + "loss": 3.0978, + "step": 12458 + }, + { + "epoch": 0.5800684405335568, + "grad_norm": 0.39794434251871447, + "learning_rate": 9.736877240007678e-05, + "loss": 3.1931, + "step": 12459 + }, + { + "epoch": 0.5801149987196499, + "grad_norm": 0.4043747161497197, + "learning_rate": 9.736790519735626e-05, + "loss": 3.0789, + "step": 12460 + }, + { + "epoch": 0.580161556905743, + "grad_norm": 0.39284162609306456, + "learning_rate": 9.73670378556156e-05, + "loss": 3.052, + "step": 12461 + }, + { + "epoch": 0.580208115091836, + "grad_norm": 0.35768973225136524, + "learning_rate": 9.736617037485736e-05, + "loss": 3.1304, + "step": 12462 + }, + { + "epoch": 0.5802546732779291, + "grad_norm": 0.38033142446402834, + "learning_rate": 9.736530275508409e-05, + "loss": 3.0462, + "step": 12463 + }, + { + "epoch": 0.5803012314640221, + "grad_norm": 0.4171674003274346, + "learning_rate": 9.73644349962983e-05, + "loss": 3.1148, + "step": 12464 + }, + { + "epoch": 0.5803477896501152, + "grad_norm": 0.35613012915783404, + "learning_rate": 9.736356709850257e-05, + "loss": 3.0736, + "step": 12465 + }, + { + "epoch": 0.5803943478362082, + "grad_norm": 0.4237511495199879, + "learning_rate": 9.736269906169943e-05, + "loss": 3.1892, + "step": 12466 + }, + { + "epoch": 0.5804409060223014, + "grad_norm": 0.4538934645793843, + "learning_rate": 9.736183088589144e-05, + "loss": 3.1294, + "step": 12467 + }, + { + "epoch": 0.5804874642083945, + "grad_norm": 0.335966226519919, + "learning_rate": 9.736096257108114e-05, + "loss": 3.104, + "step": 12468 + }, + { + "epoch": 0.5805340223944875, + "grad_norm": 0.40173163023668407, + "learning_rate": 9.736009411727108e-05, + "loss": 3.0741, + "step": 12469 + }, + { + "epoch": 0.5805805805805806, + "grad_norm": 0.3742538836990236, + "learning_rate": 9.735922552446381e-05, + "loss": 3.1268, + "step": 12470 + }, + { + "epoch": 0.5806271387666736, + "grad_norm": 0.39352630504804365, + "learning_rate": 9.735835679266187e-05, + "loss": 3.1379, + "step": 12471 + }, + { + "epoch": 0.5806736969527667, + "grad_norm": 0.3681138872669959, + "learning_rate": 9.735748792186784e-05, + "loss": 3.1831, + "step": 12472 + }, + { + "epoch": 0.5807202551388598, + "grad_norm": 0.4026551162779417, + "learning_rate": 9.735661891208423e-05, + "loss": 3.2093, + "step": 12473 + }, + { + "epoch": 0.5807668133249528, + "grad_norm": 0.3720622147141103, + "learning_rate": 9.735574976331361e-05, + "loss": 3.0803, + "step": 12474 + }, + { + "epoch": 0.580813371511046, + "grad_norm": 0.3574811553999143, + "learning_rate": 9.735488047555854e-05, + "loss": 3.1054, + "step": 12475 + }, + { + "epoch": 0.580859929697139, + "grad_norm": 0.37245379782815974, + "learning_rate": 
9.735401104882156e-05, + "loss": 3.1427, + "step": 12476 + }, + { + "epoch": 0.5809064878832321, + "grad_norm": 0.3303499762016091, + "learning_rate": 9.735314148310522e-05, + "loss": 3.1362, + "step": 12477 + }, + { + "epoch": 0.5809530460693252, + "grad_norm": 0.37034263683637936, + "learning_rate": 9.735227177841207e-05, + "loss": 3.164, + "step": 12478 + }, + { + "epoch": 0.5809996042554182, + "grad_norm": 0.3404314725843577, + "learning_rate": 9.735140193474468e-05, + "loss": 3.0492, + "step": 12479 + }, + { + "epoch": 0.5810461624415113, + "grad_norm": 0.38406402858032257, + "learning_rate": 9.735053195210557e-05, + "loss": 3.174, + "step": 12480 + }, + { + "epoch": 0.5810927206276043, + "grad_norm": 0.31502472641232143, + "learning_rate": 9.734966183049733e-05, + "loss": 3.1854, + "step": 12481 + }, + { + "epoch": 0.5811392788136974, + "grad_norm": 0.3601929991700481, + "learning_rate": 9.734879156992249e-05, + "loss": 3.0228, + "step": 12482 + }, + { + "epoch": 0.5811858369997905, + "grad_norm": 0.33950698772523114, + "learning_rate": 9.734792117038361e-05, + "loss": 3.1116, + "step": 12483 + }, + { + "epoch": 0.5812323951858835, + "grad_norm": 0.34137167946451474, + "learning_rate": 9.734705063188325e-05, + "loss": 3.1986, + "step": 12484 + }, + { + "epoch": 0.5812789533719767, + "grad_norm": 0.35678871778801813, + "learning_rate": 9.734617995442396e-05, + "loss": 3.144, + "step": 12485 + }, + { + "epoch": 0.5813255115580697, + "grad_norm": 0.3481340165797501, + "learning_rate": 9.734530913800831e-05, + "loss": 3.1008, + "step": 12486 + }, + { + "epoch": 0.5813720697441628, + "grad_norm": 0.36526757600498766, + "learning_rate": 9.734443818263882e-05, + "loss": 3.2029, + "step": 12487 + }, + { + "epoch": 0.5814186279302558, + "grad_norm": 0.3390159199387638, + "learning_rate": 9.734356708831808e-05, + "loss": 3.2052, + "step": 12488 + }, + { + "epoch": 0.5814651861163489, + "grad_norm": 0.34843359949598884, + "learning_rate": 9.734269585504862e-05, + "loss": 3.1061, + "step": 12489 + }, + { + "epoch": 0.581511744302442, + "grad_norm": 0.3978307782487431, + "learning_rate": 9.734182448283303e-05, + "loss": 3.0811, + "step": 12490 + }, + { + "epoch": 0.581558302488535, + "grad_norm": 0.3775569683451937, + "learning_rate": 9.734095297167385e-05, + "loss": 3.1214, + "step": 12491 + }, + { + "epoch": 0.5816048606746281, + "grad_norm": 0.37125482731637327, + "learning_rate": 9.734008132157362e-05, + "loss": 3.1141, + "step": 12492 + }, + { + "epoch": 0.5816514188607211, + "grad_norm": 0.4284781671719733, + "learning_rate": 9.733920953253493e-05, + "loss": 3.2368, + "step": 12493 + }, + { + "epoch": 0.5816979770468143, + "grad_norm": 0.46613619734570344, + "learning_rate": 9.733833760456031e-05, + "loss": 3.0846, + "step": 12494 + }, + { + "epoch": 0.5817445352329074, + "grad_norm": 0.39669127474029814, + "learning_rate": 9.733746553765235e-05, + "loss": 3.0507, + "step": 12495 + }, + { + "epoch": 0.5817910934190004, + "grad_norm": 0.4159494734607951, + "learning_rate": 9.733659333181358e-05, + "loss": 3.1403, + "step": 12496 + }, + { + "epoch": 0.5818376516050935, + "grad_norm": 0.4446361556818614, + "learning_rate": 9.733572098704657e-05, + "loss": 3.2668, + "step": 12497 + }, + { + "epoch": 0.5818842097911865, + "grad_norm": 0.38968606257969435, + "learning_rate": 9.733484850335388e-05, + "loss": 3.148, + "step": 12498 + }, + { + "epoch": 0.5819307679772796, + "grad_norm": 0.3730646806770716, + "learning_rate": 9.733397588073809e-05, + "loss": 3.1251, + "step": 12499 + }, + { + 
"epoch": 0.5819773261633727, + "grad_norm": 0.36723069686381865, + "learning_rate": 9.733310311920173e-05, + "loss": 3.0841, + "step": 12500 + }, + { + "epoch": 0.5820238843494657, + "grad_norm": 0.40974203104283063, + "learning_rate": 9.733223021874738e-05, + "loss": 3.0859, + "step": 12501 + }, + { + "epoch": 0.5820704425355588, + "grad_norm": 0.35513410780743127, + "learning_rate": 9.733135717937761e-05, + "loss": 3.1655, + "step": 12502 + }, + { + "epoch": 0.5821170007216518, + "grad_norm": 0.384280705829586, + "learning_rate": 9.733048400109494e-05, + "loss": 3.0584, + "step": 12503 + }, + { + "epoch": 0.582163558907745, + "grad_norm": 0.35763271719547846, + "learning_rate": 9.732961068390198e-05, + "loss": 3.1111, + "step": 12504 + }, + { + "epoch": 0.5822101170938381, + "grad_norm": 0.3184561035976412, + "learning_rate": 9.732873722780127e-05, + "loss": 3.0685, + "step": 12505 + }, + { + "epoch": 0.5822566752799311, + "grad_norm": 0.34265798962937993, + "learning_rate": 9.732786363279539e-05, + "loss": 3.0609, + "step": 12506 + }, + { + "epoch": 0.5823032334660242, + "grad_norm": 0.3353274372604325, + "learning_rate": 9.732698989888688e-05, + "loss": 3.207, + "step": 12507 + }, + { + "epoch": 0.5823497916521172, + "grad_norm": 0.32660222432503805, + "learning_rate": 9.732611602607834e-05, + "loss": 3.108, + "step": 12508 + }, + { + "epoch": 0.5823963498382103, + "grad_norm": 0.30500519249525887, + "learning_rate": 9.732524201437228e-05, + "loss": 3.0522, + "step": 12509 + }, + { + "epoch": 0.5824429080243033, + "grad_norm": 0.40095591357106597, + "learning_rate": 9.732436786377131e-05, + "loss": 3.1647, + "step": 12510 + }, + { + "epoch": 0.5824894662103964, + "grad_norm": 0.39174425239521476, + "learning_rate": 9.7323493574278e-05, + "loss": 3.0615, + "step": 12511 + }, + { + "epoch": 0.5825360243964895, + "grad_norm": 0.3836828709361803, + "learning_rate": 9.732261914589486e-05, + "loss": 3.0903, + "step": 12512 + }, + { + "epoch": 0.5825825825825826, + "grad_norm": 0.4128840917286214, + "learning_rate": 9.732174457862453e-05, + "loss": 3.0058, + "step": 12513 + }, + { + "epoch": 0.5826291407686757, + "grad_norm": 0.3634024083517229, + "learning_rate": 9.732086987246952e-05, + "loss": 3.0985, + "step": 12514 + }, + { + "epoch": 0.5826756989547687, + "grad_norm": 0.41464721921689035, + "learning_rate": 9.731999502743243e-05, + "loss": 3.0845, + "step": 12515 + }, + { + "epoch": 0.5827222571408618, + "grad_norm": 0.45678345642461193, + "learning_rate": 9.731912004351583e-05, + "loss": 3.1084, + "step": 12516 + }, + { + "epoch": 0.5827688153269549, + "grad_norm": 0.4557071498061985, + "learning_rate": 9.731824492072225e-05, + "loss": 3.1334, + "step": 12517 + }, + { + "epoch": 0.5828153735130479, + "grad_norm": 0.42200626662928103, + "learning_rate": 9.731736965905429e-05, + "loss": 3.1064, + "step": 12518 + }, + { + "epoch": 0.582861931699141, + "grad_norm": 0.43031817823483814, + "learning_rate": 9.73164942585145e-05, + "loss": 3.0425, + "step": 12519 + }, + { + "epoch": 0.582908489885234, + "grad_norm": 0.4149278094695122, + "learning_rate": 9.731561871910549e-05, + "loss": 3.1521, + "step": 12520 + }, + { + "epoch": 0.5829550480713271, + "grad_norm": 0.37091036561418067, + "learning_rate": 9.731474304082978e-05, + "loss": 3.121, + "step": 12521 + }, + { + "epoch": 0.5830016062574203, + "grad_norm": 0.4276759872549898, + "learning_rate": 9.731386722368996e-05, + "loss": 3.0784, + "step": 12522 + }, + { + "epoch": 0.5830481644435133, + "grad_norm": 0.36940407961028054, + 
"learning_rate": 9.73129912676886e-05, + "loss": 3.1482, + "step": 12523 + }, + { + "epoch": 0.5830947226296064, + "grad_norm": 0.37383958164177394, + "learning_rate": 9.731211517282827e-05, + "loss": 3.1625, + "step": 12524 + }, + { + "epoch": 0.5831412808156994, + "grad_norm": 0.397809099478674, + "learning_rate": 9.731123893911156e-05, + "loss": 3.1101, + "step": 12525 + }, + { + "epoch": 0.5831878390017925, + "grad_norm": 0.36473440984533295, + "learning_rate": 9.731036256654103e-05, + "loss": 3.143, + "step": 12526 + }, + { + "epoch": 0.5832343971878856, + "grad_norm": 0.35233701448842975, + "learning_rate": 9.730948605511922e-05, + "loss": 3.1425, + "step": 12527 + }, + { + "epoch": 0.5832809553739786, + "grad_norm": 0.38343114769735903, + "learning_rate": 9.730860940484874e-05, + "loss": 3.0912, + "step": 12528 + }, + { + "epoch": 0.5833275135600717, + "grad_norm": 0.4013099154900542, + "learning_rate": 9.730773261573215e-05, + "loss": 3.0587, + "step": 12529 + }, + { + "epoch": 0.5833740717461647, + "grad_norm": 0.34975207100692307, + "learning_rate": 9.730685568777203e-05, + "loss": 3.2355, + "step": 12530 + }, + { + "epoch": 0.5834206299322579, + "grad_norm": 0.37644774160198646, + "learning_rate": 9.730597862097095e-05, + "loss": 3.1212, + "step": 12531 + }, + { + "epoch": 0.5834671881183509, + "grad_norm": 0.4127195342210053, + "learning_rate": 9.730510141533147e-05, + "loss": 3.1858, + "step": 12532 + }, + { + "epoch": 0.583513746304444, + "grad_norm": 0.3416636640665904, + "learning_rate": 9.730422407085619e-05, + "loss": 3.0247, + "step": 12533 + }, + { + "epoch": 0.5835603044905371, + "grad_norm": 0.394901110505845, + "learning_rate": 9.730334658754767e-05, + "loss": 3.1683, + "step": 12534 + }, + { + "epoch": 0.5836068626766301, + "grad_norm": 0.3583416484697198, + "learning_rate": 9.730246896540848e-05, + "loss": 3.0601, + "step": 12535 + }, + { + "epoch": 0.5836534208627232, + "grad_norm": 0.3695341582193235, + "learning_rate": 9.730159120444122e-05, + "loss": 3.0765, + "step": 12536 + }, + { + "epoch": 0.5836999790488162, + "grad_norm": 0.34106892994502097, + "learning_rate": 9.730071330464845e-05, + "loss": 3.0551, + "step": 12537 + }, + { + "epoch": 0.5837465372349093, + "grad_norm": 0.4283860928575669, + "learning_rate": 9.729983526603274e-05, + "loss": 3.2013, + "step": 12538 + }, + { + "epoch": 0.5837930954210024, + "grad_norm": 0.3617806978669317, + "learning_rate": 9.729895708859667e-05, + "loss": 3.06, + "step": 12539 + }, + { + "epoch": 0.5838396536070954, + "grad_norm": 0.396368597377949, + "learning_rate": 9.729807877234283e-05, + "loss": 3.0585, + "step": 12540 + }, + { + "epoch": 0.5838862117931886, + "grad_norm": 0.3927574569096961, + "learning_rate": 9.729720031727377e-05, + "loss": 3.1513, + "step": 12541 + }, + { + "epoch": 0.5839327699792816, + "grad_norm": 0.36528355619079, + "learning_rate": 9.729632172339209e-05, + "loss": 3.0721, + "step": 12542 + }, + { + "epoch": 0.5839793281653747, + "grad_norm": 0.43628908847183556, + "learning_rate": 9.729544299070038e-05, + "loss": 3.1164, + "step": 12543 + }, + { + "epoch": 0.5840258863514678, + "grad_norm": 0.33163795423522563, + "learning_rate": 9.729456411920119e-05, + "loss": 3.028, + "step": 12544 + }, + { + "epoch": 0.5840724445375608, + "grad_norm": 0.40062316051254176, + "learning_rate": 9.729368510889714e-05, + "loss": 3.0983, + "step": 12545 + }, + { + "epoch": 0.5841190027236539, + "grad_norm": 0.38348638437252025, + "learning_rate": 9.729280595979076e-05, + "loss": 3.0249, + "step": 12546 + }, + 
{ + "epoch": 0.5841655609097469, + "grad_norm": 0.36623832261141503, + "learning_rate": 9.729192667188465e-05, + "loss": 2.9773, + "step": 12547 + }, + { + "epoch": 0.58421211909584, + "grad_norm": 0.38867640018122107, + "learning_rate": 9.72910472451814e-05, + "loss": 3.1134, + "step": 12548 + }, + { + "epoch": 0.5842586772819331, + "grad_norm": 0.3423783773665444, + "learning_rate": 9.729016767968359e-05, + "loss": 3.1871, + "step": 12549 + }, + { + "epoch": 0.5843052354680262, + "grad_norm": 0.37519593343287655, + "learning_rate": 9.728928797539379e-05, + "loss": 3.2018, + "step": 12550 + }, + { + "epoch": 0.5843517936541193, + "grad_norm": 0.3612611666160971, + "learning_rate": 9.728840813231459e-05, + "loss": 3.0627, + "step": 12551 + }, + { + "epoch": 0.5843983518402123, + "grad_norm": 0.3627751843862738, + "learning_rate": 9.728752815044857e-05, + "loss": 3.1306, + "step": 12552 + }, + { + "epoch": 0.5844449100263054, + "grad_norm": 0.3870742138520021, + "learning_rate": 9.728664802979832e-05, + "loss": 3.1564, + "step": 12553 + }, + { + "epoch": 0.5844914682123984, + "grad_norm": 0.355904734897506, + "learning_rate": 9.728576777036642e-05, + "loss": 3.0603, + "step": 12554 + }, + { + "epoch": 0.5845380263984915, + "grad_norm": 0.3475254348534349, + "learning_rate": 9.728488737215544e-05, + "loss": 3.1508, + "step": 12555 + }, + { + "epoch": 0.5845845845845846, + "grad_norm": 0.3309735238765811, + "learning_rate": 9.728400683516797e-05, + "loss": 3.0549, + "step": 12556 + }, + { + "epoch": 0.5846311427706776, + "grad_norm": 0.37589276068786925, + "learning_rate": 9.728312615940661e-05, + "loss": 3.0713, + "step": 12557 + }, + { + "epoch": 0.5846777009567707, + "grad_norm": 0.33751242559564665, + "learning_rate": 9.728224534487393e-05, + "loss": 3.1747, + "step": 12558 + }, + { + "epoch": 0.5847242591428637, + "grad_norm": 0.36690380162390046, + "learning_rate": 9.728136439157252e-05, + "loss": 3.0073, + "step": 12559 + }, + { + "epoch": 0.5847708173289569, + "grad_norm": 0.3700158120686193, + "learning_rate": 9.728048329950495e-05, + "loss": 3.057, + "step": 12560 + }, + { + "epoch": 0.58481737551505, + "grad_norm": 0.3606918146704421, + "learning_rate": 9.727960206867383e-05, + "loss": 3.0489, + "step": 12561 + }, + { + "epoch": 0.584863933701143, + "grad_norm": 0.3962180143529851, + "learning_rate": 9.727872069908174e-05, + "loss": 3.2381, + "step": 12562 + }, + { + "epoch": 0.5849104918872361, + "grad_norm": 0.34969696563605374, + "learning_rate": 9.727783919073125e-05, + "loss": 3.0749, + "step": 12563 + }, + { + "epoch": 0.5849570500733291, + "grad_norm": 0.334930856577945, + "learning_rate": 9.727695754362497e-05, + "loss": 3.0971, + "step": 12564 + }, + { + "epoch": 0.5850036082594222, + "grad_norm": 0.37029532434829954, + "learning_rate": 9.727607575776547e-05, + "loss": 3.1265, + "step": 12565 + }, + { + "epoch": 0.5850501664455153, + "grad_norm": 0.3475660831375791, + "learning_rate": 9.727519383315536e-05, + "loss": 3.0821, + "step": 12566 + }, + { + "epoch": 0.5850967246316083, + "grad_norm": 0.36471672852101983, + "learning_rate": 9.72743117697972e-05, + "loss": 3.0718, + "step": 12567 + }, + { + "epoch": 0.5851432828177014, + "grad_norm": 0.3417910530499092, + "learning_rate": 9.727342956769359e-05, + "loss": 3.0576, + "step": 12568 + }, + { + "epoch": 0.5851898410037945, + "grad_norm": 0.39186536144059114, + "learning_rate": 9.727254722684713e-05, + "loss": 3.0881, + "step": 12569 + }, + { + "epoch": 0.5852363991898876, + "grad_norm": 0.3596807065487065, + 
"learning_rate": 9.72716647472604e-05, + "loss": 3.165, + "step": 12570 + }, + { + "epoch": 0.5852829573759807, + "grad_norm": 0.4071382671959792, + "learning_rate": 9.7270782128936e-05, + "loss": 3.0546, + "step": 12571 + }, + { + "epoch": 0.5853295155620737, + "grad_norm": 0.31322345907464166, + "learning_rate": 9.726989937187649e-05, + "loss": 2.972, + "step": 12572 + }, + { + "epoch": 0.5853760737481668, + "grad_norm": 0.36830655811630797, + "learning_rate": 9.726901647608449e-05, + "loss": 2.913, + "step": 12573 + }, + { + "epoch": 0.5854226319342598, + "grad_norm": 0.34247294127826433, + "learning_rate": 9.726813344156259e-05, + "loss": 3.1447, + "step": 12574 + }, + { + "epoch": 0.5854691901203529, + "grad_norm": 0.44030095557453425, + "learning_rate": 9.726725026831336e-05, + "loss": 3.0974, + "step": 12575 + }, + { + "epoch": 0.5855157483064459, + "grad_norm": 0.3868793461969006, + "learning_rate": 9.726636695633942e-05, + "loss": 3.1636, + "step": 12576 + }, + { + "epoch": 0.585562306492539, + "grad_norm": 0.3824566366290496, + "learning_rate": 9.726548350564336e-05, + "loss": 3.0743, + "step": 12577 + }, + { + "epoch": 0.5856088646786322, + "grad_norm": 0.37970370002011644, + "learning_rate": 9.726459991622773e-05, + "loss": 3.1509, + "step": 12578 + }, + { + "epoch": 0.5856554228647252, + "grad_norm": 0.3911667735734511, + "learning_rate": 9.726371618809518e-05, + "loss": 3.0237, + "step": 12579 + }, + { + "epoch": 0.5857019810508183, + "grad_norm": 0.3695596848293949, + "learning_rate": 9.726283232124826e-05, + "loss": 3.0946, + "step": 12580 + }, + { + "epoch": 0.5857485392369113, + "grad_norm": 0.38504758735962874, + "learning_rate": 9.726194831568961e-05, + "loss": 3.1202, + "step": 12581 + }, + { + "epoch": 0.5857950974230044, + "grad_norm": 0.4037972079073439, + "learning_rate": 9.726106417142177e-05, + "loss": 3.0919, + "step": 12582 + }, + { + "epoch": 0.5858416556090975, + "grad_norm": 0.3721003481892859, + "learning_rate": 9.726017988844739e-05, + "loss": 3.1003, + "step": 12583 + }, + { + "epoch": 0.5858882137951905, + "grad_norm": 0.37619568606614906, + "learning_rate": 9.725929546676902e-05, + "loss": 3.169, + "step": 12584 + }, + { + "epoch": 0.5859347719812836, + "grad_norm": 0.369730257278561, + "learning_rate": 9.725841090638929e-05, + "loss": 3.0421, + "step": 12585 + }, + { + "epoch": 0.5859813301673766, + "grad_norm": 0.36256838760055804, + "learning_rate": 9.725752620731076e-05, + "loss": 3.0754, + "step": 12586 + }, + { + "epoch": 0.5860278883534698, + "grad_norm": 0.37177144449104765, + "learning_rate": 9.725664136953606e-05, + "loss": 3.1269, + "step": 12587 + }, + { + "epoch": 0.5860744465395629, + "grad_norm": 0.3463693058724815, + "learning_rate": 9.725575639306777e-05, + "loss": 3.1767, + "step": 12588 + }, + { + "epoch": 0.5861210047256559, + "grad_norm": 0.3629574319945421, + "learning_rate": 9.725487127790849e-05, + "loss": 3.2206, + "step": 12589 + }, + { + "epoch": 0.586167562911749, + "grad_norm": 0.3734627861145697, + "learning_rate": 9.725398602406082e-05, + "loss": 3.1141, + "step": 12590 + }, + { + "epoch": 0.586214121097842, + "grad_norm": 0.34521476969839276, + "learning_rate": 9.725310063152735e-05, + "loss": 3.0594, + "step": 12591 + }, + { + "epoch": 0.5862606792839351, + "grad_norm": 0.3742966717271272, + "learning_rate": 9.725221510031069e-05, + "loss": 3.2279, + "step": 12592 + }, + { + "epoch": 0.5863072374700282, + "grad_norm": 0.37000297626877043, + "learning_rate": 9.725132943041343e-05, + "loss": 3.0792, + "step": 12593 + }, + 
{ + "epoch": 0.5863537956561212, + "grad_norm": 0.40733991480729137, + "learning_rate": 9.725044362183818e-05, + "loss": 3.0473, + "step": 12594 + }, + { + "epoch": 0.5864003538422143, + "grad_norm": 0.42994650588577604, + "learning_rate": 9.724955767458752e-05, + "loss": 3.1218, + "step": 12595 + }, + { + "epoch": 0.5864469120283073, + "grad_norm": 0.38347687965234717, + "learning_rate": 9.724867158866406e-05, + "loss": 3.108, + "step": 12596 + }, + { + "epoch": 0.5864934702144005, + "grad_norm": 0.39074478791564565, + "learning_rate": 9.724778536407042e-05, + "loss": 3.0304, + "step": 12597 + }, + { + "epoch": 0.5865400284004935, + "grad_norm": 0.36812978816216013, + "learning_rate": 9.724689900080919e-05, + "loss": 3.1559, + "step": 12598 + }, + { + "epoch": 0.5865865865865866, + "grad_norm": 0.3522861068345599, + "learning_rate": 9.724601249888295e-05, + "loss": 3.0485, + "step": 12599 + }, + { + "epoch": 0.5866331447726797, + "grad_norm": 0.3797012668272555, + "learning_rate": 9.724512585829432e-05, + "loss": 3.1483, + "step": 12600 + }, + { + "epoch": 0.5866797029587727, + "grad_norm": 0.3956472008907745, + "learning_rate": 9.72442390790459e-05, + "loss": 3.1582, + "step": 12601 + }, + { + "epoch": 0.5867262611448658, + "grad_norm": 0.4334276187845287, + "learning_rate": 9.72433521611403e-05, + "loss": 3.187, + "step": 12602 + }, + { + "epoch": 0.5867728193309588, + "grad_norm": 0.3790685520512045, + "learning_rate": 9.724246510458012e-05, + "loss": 3.0952, + "step": 12603 + }, + { + "epoch": 0.5868193775170519, + "grad_norm": 0.3488780340105341, + "learning_rate": 9.724157790936794e-05, + "loss": 3.0143, + "step": 12604 + }, + { + "epoch": 0.586865935703145, + "grad_norm": 0.3878073839094547, + "learning_rate": 9.72406905755064e-05, + "loss": 3.0528, + "step": 12605 + }, + { + "epoch": 0.586912493889238, + "grad_norm": 0.3978442262070021, + "learning_rate": 9.723980310299807e-05, + "loss": 3.1289, + "step": 12606 + }, + { + "epoch": 0.5869590520753312, + "grad_norm": 0.39135917383752006, + "learning_rate": 9.723891549184559e-05, + "loss": 3.0283, + "step": 12607 + }, + { + "epoch": 0.5870056102614242, + "grad_norm": 0.3735523012581592, + "learning_rate": 9.723802774205154e-05, + "loss": 3.1788, + "step": 12608 + }, + { + "epoch": 0.5870521684475173, + "grad_norm": 0.5101938166051861, + "learning_rate": 9.723713985361853e-05, + "loss": 3.1074, + "step": 12609 + }, + { + "epoch": 0.5870987266336104, + "grad_norm": 0.4033931913354115, + "learning_rate": 9.723625182654916e-05, + "loss": 3.0187, + "step": 12610 + }, + { + "epoch": 0.5871452848197034, + "grad_norm": 0.4032489438762377, + "learning_rate": 9.723536366084607e-05, + "loss": 3.0651, + "step": 12611 + }, + { + "epoch": 0.5871918430057965, + "grad_norm": 0.4495702716371077, + "learning_rate": 9.723447535651183e-05, + "loss": 3.0154, + "step": 12612 + }, + { + "epoch": 0.5872384011918895, + "grad_norm": 0.4282477497876802, + "learning_rate": 9.723358691354905e-05, + "loss": 3.0452, + "step": 12613 + }, + { + "epoch": 0.5872849593779826, + "grad_norm": 0.4212568016030562, + "learning_rate": 9.723269833196034e-05, + "loss": 3.1347, + "step": 12614 + }, + { + "epoch": 0.5873315175640758, + "grad_norm": 0.4666438695196942, + "learning_rate": 9.723180961174832e-05, + "loss": 3.053, + "step": 12615 + }, + { + "epoch": 0.5873780757501688, + "grad_norm": 0.3804596613979218, + "learning_rate": 9.72309207529156e-05, + "loss": 3.1672, + "step": 12616 + }, + { + "epoch": 0.5874246339362619, + "grad_norm": 0.3990194762375462, + 
"learning_rate": 9.723003175546477e-05, + "loss": 3.0754, + "step": 12617 + }, + { + "epoch": 0.5874711921223549, + "grad_norm": 0.3975672821425874, + "learning_rate": 9.722914261939846e-05, + "loss": 3.0659, + "step": 12618 + }, + { + "epoch": 0.587517750308448, + "grad_norm": 0.42082002922582185, + "learning_rate": 9.722825334471925e-05, + "loss": 3.0065, + "step": 12619 + }, + { + "epoch": 0.587564308494541, + "grad_norm": 0.38402624922315337, + "learning_rate": 9.722736393142979e-05, + "loss": 3.0784, + "step": 12620 + }, + { + "epoch": 0.5876108666806341, + "grad_norm": 0.3917347592633264, + "learning_rate": 9.722647437953268e-05, + "loss": 3.0027, + "step": 12621 + }, + { + "epoch": 0.5876574248667272, + "grad_norm": 0.4032488023303399, + "learning_rate": 9.722558468903049e-05, + "loss": 3.0825, + "step": 12622 + }, + { + "epoch": 0.5877039830528202, + "grad_norm": 0.4227069597746339, + "learning_rate": 9.722469485992588e-05, + "loss": 3.2352, + "step": 12623 + }, + { + "epoch": 0.5877505412389133, + "grad_norm": 0.3843680681837016, + "learning_rate": 9.722380489222144e-05, + "loss": 2.9688, + "step": 12624 + }, + { + "epoch": 0.5877970994250064, + "grad_norm": 0.39463973610314956, + "learning_rate": 9.722291478591978e-05, + "loss": 3.1045, + "step": 12625 + }, + { + "epoch": 0.5878436576110995, + "grad_norm": 0.43403488449071786, + "learning_rate": 9.722202454102351e-05, + "loss": 3.1047, + "step": 12626 + }, + { + "epoch": 0.5878902157971926, + "grad_norm": 0.4218486437717066, + "learning_rate": 9.722113415753527e-05, + "loss": 3.0768, + "step": 12627 + }, + { + "epoch": 0.5879367739832856, + "grad_norm": 0.4050869216338237, + "learning_rate": 9.722024363545764e-05, + "loss": 3.1385, + "step": 12628 + }, + { + "epoch": 0.5879833321693787, + "grad_norm": 0.36899855540275267, + "learning_rate": 9.721935297479325e-05, + "loss": 3.0711, + "step": 12629 + }, + { + "epoch": 0.5880298903554717, + "grad_norm": 0.4172691927603058, + "learning_rate": 9.72184621755447e-05, + "loss": 3.0903, + "step": 12630 + }, + { + "epoch": 0.5880764485415648, + "grad_norm": 0.4134198117058803, + "learning_rate": 9.721757123771464e-05, + "loss": 3.0776, + "step": 12631 + }, + { + "epoch": 0.5881230067276579, + "grad_norm": 0.3922808346581841, + "learning_rate": 9.721668016130565e-05, + "loss": 3.0867, + "step": 12632 + }, + { + "epoch": 0.5881695649137509, + "grad_norm": 0.40190852018558393, + "learning_rate": 9.721578894632035e-05, + "loss": 3.0738, + "step": 12633 + }, + { + "epoch": 0.5882161230998441, + "grad_norm": 0.4200392419587285, + "learning_rate": 9.721489759276137e-05, + "loss": 3.0454, + "step": 12634 + }, + { + "epoch": 0.5882626812859371, + "grad_norm": 0.3725217021656036, + "learning_rate": 9.721400610063129e-05, + "loss": 3.1487, + "step": 12635 + }, + { + "epoch": 0.5883092394720302, + "grad_norm": 0.39708095166320634, + "learning_rate": 9.721311446993278e-05, + "loss": 3.1338, + "step": 12636 + }, + { + "epoch": 0.5883557976581233, + "grad_norm": 0.42765075756772636, + "learning_rate": 9.721222270066843e-05, + "loss": 3.1018, + "step": 12637 + }, + { + "epoch": 0.5884023558442163, + "grad_norm": 0.3671608017304079, + "learning_rate": 9.721133079284085e-05, + "loss": 3.153, + "step": 12638 + }, + { + "epoch": 0.5884489140303094, + "grad_norm": 0.4466436926810445, + "learning_rate": 9.721043874645266e-05, + "loss": 3.1278, + "step": 12639 + }, + { + "epoch": 0.5884954722164024, + "grad_norm": 0.4155057178263887, + "learning_rate": 9.720954656150649e-05, + "loss": 2.9372, + "step": 12640 + 
}, + { + "epoch": 0.5885420304024955, + "grad_norm": 0.38678318975157816, + "learning_rate": 9.720865423800495e-05, + "loss": 3.0644, + "step": 12641 + }, + { + "epoch": 0.5885885885885885, + "grad_norm": 0.5610130237967084, + "learning_rate": 9.720776177595066e-05, + "loss": 3.1711, + "step": 12642 + }, + { + "epoch": 0.5886351467746817, + "grad_norm": 0.41944667660473045, + "learning_rate": 9.720686917534624e-05, + "loss": 3.101, + "step": 12643 + }, + { + "epoch": 0.5886817049607748, + "grad_norm": 0.4053929536725176, + "learning_rate": 9.720597643619431e-05, + "loss": 3.1328, + "step": 12644 + }, + { + "epoch": 0.5887282631468678, + "grad_norm": 0.3962658349863418, + "learning_rate": 9.72050835584975e-05, + "loss": 3.1815, + "step": 12645 + }, + { + "epoch": 0.5887748213329609, + "grad_norm": 0.43362880639226925, + "learning_rate": 9.720419054225842e-05, + "loss": 3.16, + "step": 12646 + }, + { + "epoch": 0.5888213795190539, + "grad_norm": 0.3995305432066323, + "learning_rate": 9.720329738747967e-05, + "loss": 3.1412, + "step": 12647 + }, + { + "epoch": 0.588867937705147, + "grad_norm": 0.35889801495548906, + "learning_rate": 9.720240409416391e-05, + "loss": 2.9631, + "step": 12648 + }, + { + "epoch": 0.5889144958912401, + "grad_norm": 0.4193986542588867, + "learning_rate": 9.720151066231374e-05, + "loss": 3.1296, + "step": 12649 + }, + { + "epoch": 0.5889610540773331, + "grad_norm": 0.3982415822395903, + "learning_rate": 9.720061709193178e-05, + "loss": 2.9804, + "step": 12650 + }, + { + "epoch": 0.5890076122634262, + "grad_norm": 0.39893747785721084, + "learning_rate": 9.719972338302067e-05, + "loss": 3.0329, + "step": 12651 + }, + { + "epoch": 0.5890541704495192, + "grad_norm": 0.39565225108051616, + "learning_rate": 9.719882953558302e-05, + "loss": 2.9732, + "step": 12652 + }, + { + "epoch": 0.5891007286356124, + "grad_norm": 0.3843329073442394, + "learning_rate": 9.719793554962144e-05, + "loss": 3.0542, + "step": 12653 + }, + { + "epoch": 0.5891472868217055, + "grad_norm": 0.40958768653316857, + "learning_rate": 9.719704142513859e-05, + "loss": 3.0866, + "step": 12654 + }, + { + "epoch": 0.5891938450077985, + "grad_norm": 0.43122451752865026, + "learning_rate": 9.719614716213706e-05, + "loss": 2.9942, + "step": 12655 + }, + { + "epoch": 0.5892404031938916, + "grad_norm": 0.4039439575672422, + "learning_rate": 9.719525276061949e-05, + "loss": 3.0754, + "step": 12656 + }, + { + "epoch": 0.5892869613799846, + "grad_norm": 0.35801831379248233, + "learning_rate": 9.71943582205885e-05, + "loss": 3.037, + "step": 12657 + }, + { + "epoch": 0.5893335195660777, + "grad_norm": 0.41695268165371635, + "learning_rate": 9.719346354204672e-05, + "loss": 3.143, + "step": 12658 + }, + { + "epoch": 0.5893800777521708, + "grad_norm": 0.4299221091443624, + "learning_rate": 9.719256872499678e-05, + "loss": 3.2282, + "step": 12659 + }, + { + "epoch": 0.5894266359382638, + "grad_norm": 0.38701737407081505, + "learning_rate": 9.719167376944129e-05, + "loss": 3.2299, + "step": 12660 + }, + { + "epoch": 0.589473194124357, + "grad_norm": 0.37837401945736915, + "learning_rate": 9.71907786753829e-05, + "loss": 2.973, + "step": 12661 + }, + { + "epoch": 0.58951975231045, + "grad_norm": 0.36279649186393664, + "learning_rate": 9.718988344282422e-05, + "loss": 3.0575, + "step": 12662 + }, + { + "epoch": 0.5895663104965431, + "grad_norm": 0.3691274757412211, + "learning_rate": 9.718898807176788e-05, + "loss": 3.0108, + "step": 12663 + }, + { + "epoch": 0.5896128686826361, + "grad_norm": 0.38224850543801675, + 
"learning_rate": 9.718809256221651e-05, + "loss": 3.1175, + "step": 12664 + }, + { + "epoch": 0.5896594268687292, + "grad_norm": 0.37428289540634213, + "learning_rate": 9.718719691417273e-05, + "loss": 3.211, + "step": 12665 + }, + { + "epoch": 0.5897059850548223, + "grad_norm": 0.39352988075789974, + "learning_rate": 9.718630112763918e-05, + "loss": 3.0873, + "step": 12666 + }, + { + "epoch": 0.5897525432409153, + "grad_norm": 0.393576910777775, + "learning_rate": 9.718540520261847e-05, + "loss": 3.0229, + "step": 12667 + }, + { + "epoch": 0.5897991014270084, + "grad_norm": 0.38754481964496545, + "learning_rate": 9.718450913911327e-05, + "loss": 3.0811, + "step": 12668 + }, + { + "epoch": 0.5898456596131014, + "grad_norm": 0.4045148992224508, + "learning_rate": 9.718361293712617e-05, + "loss": 3.0146, + "step": 12669 + }, + { + "epoch": 0.5898922177991945, + "grad_norm": 0.40983646299569415, + "learning_rate": 9.718271659665981e-05, + "loss": 3.1673, + "step": 12670 + }, + { + "epoch": 0.5899387759852877, + "grad_norm": 0.34929119055226315, + "learning_rate": 9.718182011771684e-05, + "loss": 3.0968, + "step": 12671 + }, + { + "epoch": 0.5899853341713807, + "grad_norm": 0.41136352679460636, + "learning_rate": 9.718092350029986e-05, + "loss": 3.1137, + "step": 12672 + }, + { + "epoch": 0.5900318923574738, + "grad_norm": 0.35173445897676414, + "learning_rate": 9.718002674441154e-05, + "loss": 3.051, + "step": 12673 + }, + { + "epoch": 0.5900784505435668, + "grad_norm": 0.38902195051541894, + "learning_rate": 9.717912985005447e-05, + "loss": 3.042, + "step": 12674 + }, + { + "epoch": 0.5901250087296599, + "grad_norm": 0.368535211261376, + "learning_rate": 9.717823281723131e-05, + "loss": 3.0709, + "step": 12675 + }, + { + "epoch": 0.590171566915753, + "grad_norm": 0.3729125803915606, + "learning_rate": 9.717733564594467e-05, + "loss": 3.155, + "step": 12676 + }, + { + "epoch": 0.590218125101846, + "grad_norm": 0.3564550476751633, + "learning_rate": 9.71764383361972e-05, + "loss": 3.0714, + "step": 12677 + }, + { + "epoch": 0.5902646832879391, + "grad_norm": 0.38096935376744645, + "learning_rate": 9.717554088799154e-05, + "loss": 2.9837, + "step": 12678 + }, + { + "epoch": 0.5903112414740321, + "grad_norm": 0.37662325752789205, + "learning_rate": 9.717464330133032e-05, + "loss": 3.054, + "step": 12679 + }, + { + "epoch": 0.5903577996601252, + "grad_norm": 0.41599675318404844, + "learning_rate": 9.717374557621616e-05, + "loss": 3.1712, + "step": 12680 + }, + { + "epoch": 0.5904043578462184, + "grad_norm": 0.40122963812629964, + "learning_rate": 9.717284771265171e-05, + "loss": 3.0252, + "step": 12681 + }, + { + "epoch": 0.5904509160323114, + "grad_norm": 0.35203678050830495, + "learning_rate": 9.717194971063961e-05, + "loss": 3.1072, + "step": 12682 + }, + { + "epoch": 0.5904974742184045, + "grad_norm": 0.3734340481187579, + "learning_rate": 9.717105157018247e-05, + "loss": 3.0517, + "step": 12683 + }, + { + "epoch": 0.5905440324044975, + "grad_norm": 0.3523044431327765, + "learning_rate": 9.717015329128294e-05, + "loss": 3.1148, + "step": 12684 + }, + { + "epoch": 0.5905905905905906, + "grad_norm": 0.37783756075550834, + "learning_rate": 9.716925487394366e-05, + "loss": 3.0452, + "step": 12685 + }, + { + "epoch": 0.5906371487766836, + "grad_norm": 0.3261705896452867, + "learning_rate": 9.716835631816726e-05, + "loss": 3.2147, + "step": 12686 + }, + { + "epoch": 0.5906837069627767, + "grad_norm": 0.3981734094942039, + "learning_rate": 9.716745762395639e-05, + "loss": 3.1027, + "step": 12687 + 
}, + { + "epoch": 0.5907302651488698, + "grad_norm": 0.32840172828546055, + "learning_rate": 9.716655879131366e-05, + "loss": 3.0822, + "step": 12688 + }, + { + "epoch": 0.5907768233349628, + "grad_norm": 0.35951964123005264, + "learning_rate": 9.716565982024175e-05, + "loss": 3.0955, + "step": 12689 + }, + { + "epoch": 0.590823381521056, + "grad_norm": 0.39995231101537015, + "learning_rate": 9.716476071074328e-05, + "loss": 3.0567, + "step": 12690 + }, + { + "epoch": 0.590869939707149, + "grad_norm": 0.44722556167062955, + "learning_rate": 9.716386146282086e-05, + "loss": 3.0503, + "step": 12691 + }, + { + "epoch": 0.5909164978932421, + "grad_norm": 0.3609230913565451, + "learning_rate": 9.716296207647718e-05, + "loss": 3.0937, + "step": 12692 + }, + { + "epoch": 0.5909630560793352, + "grad_norm": 0.3739666361341295, + "learning_rate": 9.716206255171481e-05, + "loss": 3.1104, + "step": 12693 + }, + { + "epoch": 0.5910096142654282, + "grad_norm": 0.41806226563954385, + "learning_rate": 9.716116288853647e-05, + "loss": 3.0195, + "step": 12694 + }, + { + "epoch": 0.5910561724515213, + "grad_norm": 0.35414285448975097, + "learning_rate": 9.716026308694475e-05, + "loss": 3.1068, + "step": 12695 + }, + { + "epoch": 0.5911027306376143, + "grad_norm": 0.44584666094482844, + "learning_rate": 9.71593631469423e-05, + "loss": 3.0946, + "step": 12696 + }, + { + "epoch": 0.5911492888237074, + "grad_norm": 0.36923515301575693, + "learning_rate": 9.715846306853177e-05, + "loss": 3.0724, + "step": 12697 + }, + { + "epoch": 0.5911958470098005, + "grad_norm": 0.36211459495948195, + "learning_rate": 9.71575628517158e-05, + "loss": 2.9757, + "step": 12698 + }, + { + "epoch": 0.5912424051958936, + "grad_norm": 0.3897188883919508, + "learning_rate": 9.715666249649702e-05, + "loss": 2.9592, + "step": 12699 + }, + { + "epoch": 0.5912889633819867, + "grad_norm": 0.34969850360608706, + "learning_rate": 9.715576200287809e-05, + "loss": 2.9619, + "step": 12700 + }, + { + "epoch": 0.5913355215680797, + "grad_norm": 0.3597063346136639, + "learning_rate": 9.715486137086164e-05, + "loss": 3.1901, + "step": 12701 + }, + { + "epoch": 0.5913820797541728, + "grad_norm": 0.4415967773639636, + "learning_rate": 9.715396060045032e-05, + "loss": 3.2325, + "step": 12702 + }, + { + "epoch": 0.5914286379402659, + "grad_norm": 0.41469180620356444, + "learning_rate": 9.715305969164676e-05, + "loss": 3.1714, + "step": 12703 + }, + { + "epoch": 0.5914751961263589, + "grad_norm": 0.3852253995466544, + "learning_rate": 9.715215864445363e-05, + "loss": 3.1722, + "step": 12704 + }, + { + "epoch": 0.591521754312452, + "grad_norm": 0.403917536885728, + "learning_rate": 9.715125745887356e-05, + "loss": 3.0255, + "step": 12705 + }, + { + "epoch": 0.591568312498545, + "grad_norm": 0.3525738688252663, + "learning_rate": 9.715035613490919e-05, + "loss": 3.0648, + "step": 12706 + }, + { + "epoch": 0.5916148706846381, + "grad_norm": 0.3994986990684364, + "learning_rate": 9.714945467256315e-05, + "loss": 3.1312, + "step": 12707 + }, + { + "epoch": 0.5916614288707311, + "grad_norm": 0.4099004092532343, + "learning_rate": 9.714855307183815e-05, + "loss": 3.0644, + "step": 12708 + }, + { + "epoch": 0.5917079870568243, + "grad_norm": 0.36346983961135737, + "learning_rate": 9.714765133273675e-05, + "loss": 3.1412, + "step": 12709 + }, + { + "epoch": 0.5917545452429174, + "grad_norm": 0.39943183543185634, + "learning_rate": 9.714674945526165e-05, + "loss": 3.0928, + "step": 12710 + }, + { + "epoch": 0.5918011034290104, + "grad_norm": 0.36670088172281523, + 
"learning_rate": 9.714584743941551e-05, + "loss": 3.1189, + "step": 12711 + }, + { + "epoch": 0.5918476616151035, + "grad_norm": 0.40271642125565194, + "learning_rate": 9.714494528520092e-05, + "loss": 3.0476, + "step": 12712 + }, + { + "epoch": 0.5918942198011965, + "grad_norm": 0.36775277629651204, + "learning_rate": 9.714404299262056e-05, + "loss": 3.093, + "step": 12713 + }, + { + "epoch": 0.5919407779872896, + "grad_norm": 0.38103568361598344, + "learning_rate": 9.71431405616771e-05, + "loss": 3.199, + "step": 12714 + }, + { + "epoch": 0.5919873361733827, + "grad_norm": 0.3917316646008866, + "learning_rate": 9.714223799237315e-05, + "loss": 3.1379, + "step": 12715 + }, + { + "epoch": 0.5920338943594757, + "grad_norm": 0.36412441463289696, + "learning_rate": 9.714133528471138e-05, + "loss": 3.021, + "step": 12716 + }, + { + "epoch": 0.5920804525455688, + "grad_norm": 0.367946339198251, + "learning_rate": 9.714043243869444e-05, + "loss": 3.0546, + "step": 12717 + }, + { + "epoch": 0.5921270107316619, + "grad_norm": 0.3524262677331694, + "learning_rate": 9.713952945432498e-05, + "loss": 3.0399, + "step": 12718 + }, + { + "epoch": 0.592173568917755, + "grad_norm": 0.3777079633903172, + "learning_rate": 9.713862633160564e-05, + "loss": 3.0858, + "step": 12719 + }, + { + "epoch": 0.5922201271038481, + "grad_norm": 0.37951394827686247, + "learning_rate": 9.713772307053906e-05, + "loss": 3.0729, + "step": 12720 + }, + { + "epoch": 0.5922666852899411, + "grad_norm": 0.38008679356213576, + "learning_rate": 9.713681967112793e-05, + "loss": 3.0597, + "step": 12721 + }, + { + "epoch": 0.5923132434760342, + "grad_norm": 0.37242057629093817, + "learning_rate": 9.713591613337488e-05, + "loss": 3.0809, + "step": 12722 + }, + { + "epoch": 0.5923598016621272, + "grad_norm": 0.34677851020883904, + "learning_rate": 9.713501245728253e-05, + "loss": 3.032, + "step": 12723 + }, + { + "epoch": 0.5924063598482203, + "grad_norm": 0.3673312694227575, + "learning_rate": 9.713410864285358e-05, + "loss": 3.0898, + "step": 12724 + }, + { + "epoch": 0.5924529180343134, + "grad_norm": 0.4146922570303087, + "learning_rate": 9.713320469009066e-05, + "loss": 3.0967, + "step": 12725 + }, + { + "epoch": 0.5924994762204064, + "grad_norm": 0.3887359037853016, + "learning_rate": 9.713230059899644e-05, + "loss": 3.1544, + "step": 12726 + }, + { + "epoch": 0.5925460344064996, + "grad_norm": 0.3864721992718161, + "learning_rate": 9.713139636957356e-05, + "loss": 3.0896, + "step": 12727 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.39444912863641374, + "learning_rate": 9.713049200182466e-05, + "loss": 3.1681, + "step": 12728 + }, + { + "epoch": 0.5926391507786857, + "grad_norm": 0.39335982892744353, + "learning_rate": 9.712958749575242e-05, + "loss": 3.0896, + "step": 12729 + }, + { + "epoch": 0.5926857089647787, + "grad_norm": 0.3984775319259566, + "learning_rate": 9.712868285135948e-05, + "loss": 3.1635, + "step": 12730 + }, + { + "epoch": 0.5927322671508718, + "grad_norm": 0.37350571965837254, + "learning_rate": 9.71277780686485e-05, + "loss": 3.054, + "step": 12731 + }, + { + "epoch": 0.5927788253369649, + "grad_norm": 0.39446473039545243, + "learning_rate": 9.712687314762212e-05, + "loss": 3.1499, + "step": 12732 + }, + { + "epoch": 0.5928253835230579, + "grad_norm": 0.3693720313776949, + "learning_rate": 9.712596808828303e-05, + "loss": 3.1711, + "step": 12733 + }, + { + "epoch": 0.592871941709151, + "grad_norm": 0.33385082196215926, + "learning_rate": 9.712506289063385e-05, + "loss": 3.045, + "step": 12734 + }, 
+ { + "epoch": 0.592918499895244, + "grad_norm": 0.39505263928336576, + "learning_rate": 9.712415755467727e-05, + "loss": 3.1354, + "step": 12735 + }, + { + "epoch": 0.5929650580813371, + "grad_norm": 0.3423804844579429, + "learning_rate": 9.712325208041593e-05, + "loss": 3.1209, + "step": 12736 + }, + { + "epoch": 0.5930116162674303, + "grad_norm": 0.397159168665946, + "learning_rate": 9.712234646785248e-05, + "loss": 3.0917, + "step": 12737 + }, + { + "epoch": 0.5930581744535233, + "grad_norm": 0.36023117656251513, + "learning_rate": 9.712144071698959e-05, + "loss": 3.0565, + "step": 12738 + }, + { + "epoch": 0.5931047326396164, + "grad_norm": 0.360874020400223, + "learning_rate": 9.71205348278299e-05, + "loss": 3.0552, + "step": 12739 + }, + { + "epoch": 0.5931512908257094, + "grad_norm": 0.3482406353801339, + "learning_rate": 9.71196288003761e-05, + "loss": 3.0099, + "step": 12740 + }, + { + "epoch": 0.5931978490118025, + "grad_norm": 0.4121888849955981, + "learning_rate": 9.711872263463082e-05, + "loss": 3.2229, + "step": 12741 + }, + { + "epoch": 0.5932444071978956, + "grad_norm": 0.4082466160481063, + "learning_rate": 9.711781633059673e-05, + "loss": 3.0854, + "step": 12742 + }, + { + "epoch": 0.5932909653839886, + "grad_norm": 0.3414594443063246, + "learning_rate": 9.711690988827651e-05, + "loss": 3.0295, + "step": 12743 + }, + { + "epoch": 0.5933375235700817, + "grad_norm": 0.38777304324778883, + "learning_rate": 9.711600330767278e-05, + "loss": 3.1398, + "step": 12744 + }, + { + "epoch": 0.5933840817561747, + "grad_norm": 0.3496091217487005, + "learning_rate": 9.711509658878823e-05, + "loss": 3.0779, + "step": 12745 + }, + { + "epoch": 0.5934306399422679, + "grad_norm": 0.3914169658744512, + "learning_rate": 9.711418973162552e-05, + "loss": 3.1536, + "step": 12746 + }, + { + "epoch": 0.593477198128361, + "grad_norm": 0.35258511683193944, + "learning_rate": 9.711328273618729e-05, + "loss": 3.0544, + "step": 12747 + }, + { + "epoch": 0.593523756314454, + "grad_norm": 0.36320021485000636, + "learning_rate": 9.711237560247622e-05, + "loss": 3.1183, + "step": 12748 + }, + { + "epoch": 0.5935703145005471, + "grad_norm": 0.3942567552365183, + "learning_rate": 9.711146833049498e-05, + "loss": 3.1372, + "step": 12749 + }, + { + "epoch": 0.5936168726866401, + "grad_norm": 0.40379203984051265, + "learning_rate": 9.71105609202462e-05, + "loss": 3.1265, + "step": 12750 + }, + { + "epoch": 0.5936634308727332, + "grad_norm": 0.36378975172468214, + "learning_rate": 9.71096533717326e-05, + "loss": 3.1602, + "step": 12751 + }, + { + "epoch": 0.5937099890588262, + "grad_norm": 0.3571609830691691, + "learning_rate": 9.710874568495677e-05, + "loss": 2.9917, + "step": 12752 + }, + { + "epoch": 0.5937565472449193, + "grad_norm": 0.3316311232973363, + "learning_rate": 9.710783785992143e-05, + "loss": 2.9597, + "step": 12753 + }, + { + "epoch": 0.5938031054310124, + "grad_norm": 0.3158572758127861, + "learning_rate": 9.710692989662922e-05, + "loss": 3.0041, + "step": 12754 + }, + { + "epoch": 0.5938496636171054, + "grad_norm": 0.3410662243223876, + "learning_rate": 9.710602179508283e-05, + "loss": 3.0148, + "step": 12755 + }, + { + "epoch": 0.5938962218031986, + "grad_norm": 0.3389207318028688, + "learning_rate": 9.710511355528488e-05, + "loss": 3.1026, + "step": 12756 + }, + { + "epoch": 0.5939427799892916, + "grad_norm": 0.3378324642590058, + "learning_rate": 9.710420517723808e-05, + "loss": 3.1121, + "step": 12757 + }, + { + "epoch": 0.5939893381753847, + "grad_norm": 0.3380152049324821, + 
"learning_rate": 9.710329666094509e-05, + "loss": 3.1117, + "step": 12758 + }, + { + "epoch": 0.5940358963614778, + "grad_norm": 0.3393588403451347, + "learning_rate": 9.710238800640854e-05, + "loss": 3.1037, + "step": 12759 + }, + { + "epoch": 0.5940824545475708, + "grad_norm": 0.3859625688028082, + "learning_rate": 9.710147921363113e-05, + "loss": 3.0831, + "step": 12760 + }, + { + "epoch": 0.5941290127336639, + "grad_norm": 0.38492139617295074, + "learning_rate": 9.710057028261553e-05, + "loss": 3.1075, + "step": 12761 + }, + { + "epoch": 0.5941755709197569, + "grad_norm": 0.3766456988176237, + "learning_rate": 9.709966121336438e-05, + "loss": 3.122, + "step": 12762 + }, + { + "epoch": 0.59422212910585, + "grad_norm": 0.37069814062477247, + "learning_rate": 9.709875200588036e-05, + "loss": 3.0046, + "step": 12763 + }, + { + "epoch": 0.5942686872919432, + "grad_norm": 0.37660490751251124, + "learning_rate": 9.709784266016617e-05, + "loss": 3.117, + "step": 12764 + }, + { + "epoch": 0.5943152454780362, + "grad_norm": 0.381525212173998, + "learning_rate": 9.709693317622444e-05, + "loss": 3.0598, + "step": 12765 + }, + { + "epoch": 0.5943618036641293, + "grad_norm": 0.37678508480064077, + "learning_rate": 9.709602355405784e-05, + "loss": 3.0922, + "step": 12766 + }, + { + "epoch": 0.5944083618502223, + "grad_norm": 0.3930803796571073, + "learning_rate": 9.709511379366906e-05, + "loss": 3.079, + "step": 12767 + }, + { + "epoch": 0.5944549200363154, + "grad_norm": 0.4194721462954404, + "learning_rate": 9.709420389506075e-05, + "loss": 3.154, + "step": 12768 + }, + { + "epoch": 0.5945014782224085, + "grad_norm": 0.3638110257956459, + "learning_rate": 9.70932938582356e-05, + "loss": 3.1391, + "step": 12769 + }, + { + "epoch": 0.5945480364085015, + "grad_norm": 0.41314765964940703, + "learning_rate": 9.709238368319627e-05, + "loss": 3.0067, + "step": 12770 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 0.40518385105650573, + "learning_rate": 9.709147336994544e-05, + "loss": 3.0983, + "step": 12771 + }, + { + "epoch": 0.5946411527806876, + "grad_norm": 0.3742114149199133, + "learning_rate": 9.709056291848577e-05, + "loss": 3.1631, + "step": 12772 + }, + { + "epoch": 0.5946877109667807, + "grad_norm": 0.35946398233584526, + "learning_rate": 9.708965232881994e-05, + "loss": 3.1134, + "step": 12773 + }, + { + "epoch": 0.5947342691528738, + "grad_norm": 0.350184279207229, + "learning_rate": 9.70887416009506e-05, + "loss": 2.9643, + "step": 12774 + }, + { + "epoch": 0.5947808273389669, + "grad_norm": 0.4074592253266464, + "learning_rate": 9.708783073488046e-05, + "loss": 3.1217, + "step": 12775 + }, + { + "epoch": 0.59482738552506, + "grad_norm": 0.362993296508549, + "learning_rate": 9.70869197306122e-05, + "loss": 3.0981, + "step": 12776 + }, + { + "epoch": 0.594873943711153, + "grad_norm": 0.4130785266889285, + "learning_rate": 9.708600858814842e-05, + "loss": 3.187, + "step": 12777 + }, + { + "epoch": 0.5949205018972461, + "grad_norm": 0.43514865699731364, + "learning_rate": 9.708509730749187e-05, + "loss": 3.0033, + "step": 12778 + }, + { + "epoch": 0.5949670600833391, + "grad_norm": 0.3765155727458574, + "learning_rate": 9.708418588864521e-05, + "loss": 3.0969, + "step": 12779 + }, + { + "epoch": 0.5950136182694322, + "grad_norm": 0.38296967757765493, + "learning_rate": 9.708327433161109e-05, + "loss": 3.1853, + "step": 12780 + }, + { + "epoch": 0.5950601764555253, + "grad_norm": 0.40795630926948256, + "learning_rate": 9.708236263639219e-05, + "loss": 3.0656, + "step": 12781 + }, + { + 
"epoch": 0.5951067346416183, + "grad_norm": 0.3875731219504485, + "learning_rate": 9.70814508029912e-05, + "loss": 3.0346, + "step": 12782 + }, + { + "epoch": 0.5951532928277115, + "grad_norm": 0.370524119648456, + "learning_rate": 9.708053883141078e-05, + "loss": 3.0086, + "step": 12783 + }, + { + "epoch": 0.5951998510138045, + "grad_norm": 0.39053122099728294, + "learning_rate": 9.707962672165363e-05, + "loss": 3.1339, + "step": 12784 + }, + { + "epoch": 0.5952464091998976, + "grad_norm": 0.39399537174284016, + "learning_rate": 9.707871447372241e-05, + "loss": 3.062, + "step": 12785 + }, + { + "epoch": 0.5952929673859907, + "grad_norm": 0.35627172004147456, + "learning_rate": 9.707780208761981e-05, + "loss": 3.0599, + "step": 12786 + }, + { + "epoch": 0.5953395255720837, + "grad_norm": 0.3893679941395775, + "learning_rate": 9.707688956334848e-05, + "loss": 3.1381, + "step": 12787 + }, + { + "epoch": 0.5953860837581768, + "grad_norm": 0.3841515425059184, + "learning_rate": 9.707597690091113e-05, + "loss": 3.1947, + "step": 12788 + }, + { + "epoch": 0.5954326419442698, + "grad_norm": 0.36772123745900365, + "learning_rate": 9.707506410031042e-05, + "loss": 3.009, + "step": 12789 + }, + { + "epoch": 0.5954792001303629, + "grad_norm": 0.3823586628555932, + "learning_rate": 9.707415116154904e-05, + "loss": 3.0376, + "step": 12790 + }, + { + "epoch": 0.595525758316456, + "grad_norm": 0.36011654703197754, + "learning_rate": 9.707323808462967e-05, + "loss": 3.0782, + "step": 12791 + }, + { + "epoch": 0.595572316502549, + "grad_norm": 0.37473347030069565, + "learning_rate": 9.707232486955497e-05, + "loss": 3.1403, + "step": 12792 + }, + { + "epoch": 0.5956188746886422, + "grad_norm": 0.405904578213843, + "learning_rate": 9.707141151632764e-05, + "loss": 3.057, + "step": 12793 + }, + { + "epoch": 0.5956654328747352, + "grad_norm": 0.35538294273600607, + "learning_rate": 9.707049802495036e-05, + "loss": 3.1222, + "step": 12794 + }, + { + "epoch": 0.5957119910608283, + "grad_norm": 0.40645373730975065, + "learning_rate": 9.70695843954258e-05, + "loss": 3.1578, + "step": 12795 + }, + { + "epoch": 0.5957585492469213, + "grad_norm": 0.3759790600034577, + "learning_rate": 9.706867062775664e-05, + "loss": 3.1118, + "step": 12796 + }, + { + "epoch": 0.5958051074330144, + "grad_norm": 0.414089916793526, + "learning_rate": 9.706775672194557e-05, + "loss": 3.153, + "step": 12797 + }, + { + "epoch": 0.5958516656191075, + "grad_norm": 0.3967800036275929, + "learning_rate": 9.706684267799529e-05, + "loss": 3.1196, + "step": 12798 + }, + { + "epoch": 0.5958982238052005, + "grad_norm": 0.4218371290737703, + "learning_rate": 9.706592849590845e-05, + "loss": 3.0646, + "step": 12799 + }, + { + "epoch": 0.5959447819912936, + "grad_norm": 0.3939225707926769, + "learning_rate": 9.706501417568774e-05, + "loss": 3.12, + "step": 12800 + }, + { + "epoch": 0.5959913401773866, + "grad_norm": 0.4534687905014817, + "learning_rate": 9.706409971733586e-05, + "loss": 3.0478, + "step": 12801 + }, + { + "epoch": 0.5960378983634798, + "grad_norm": 0.4406476499407208, + "learning_rate": 9.706318512085548e-05, + "loss": 3.0724, + "step": 12802 + }, + { + "epoch": 0.5960844565495729, + "grad_norm": 0.4198265140847054, + "learning_rate": 9.706227038624929e-05, + "loss": 3.1503, + "step": 12803 + }, + { + "epoch": 0.5961310147356659, + "grad_norm": 0.40253939010068984, + "learning_rate": 9.706135551351996e-05, + "loss": 3.0344, + "step": 12804 + }, + { + "epoch": 0.596177572921759, + "grad_norm": 0.37993733652492073, + "learning_rate": 
9.70604405026702e-05, + "loss": 3.074, + "step": 12805 + }, + { + "epoch": 0.596224131107852, + "grad_norm": 0.39969056753868126, + "learning_rate": 9.705952535370269e-05, + "loss": 2.9545, + "step": 12806 + }, + { + "epoch": 0.5962706892939451, + "grad_norm": 0.34486775017440346, + "learning_rate": 9.70586100666201e-05, + "loss": 3.0267, + "step": 12807 + }, + { + "epoch": 0.5963172474800382, + "grad_norm": 0.39866853724740975, + "learning_rate": 9.705769464142513e-05, + "loss": 3.0273, + "step": 12808 + }, + { + "epoch": 0.5963638056661312, + "grad_norm": 0.3947920954921535, + "learning_rate": 9.705677907812046e-05, + "loss": 3.0732, + "step": 12809 + }, + { + "epoch": 0.5964103638522243, + "grad_norm": 0.34838518015999004, + "learning_rate": 9.705586337670877e-05, + "loss": 3.1178, + "step": 12810 + }, + { + "epoch": 0.5964569220383173, + "grad_norm": 0.4042905825264193, + "learning_rate": 9.705494753719277e-05, + "loss": 3.167, + "step": 12811 + }, + { + "epoch": 0.5965034802244105, + "grad_norm": 0.3923783031516465, + "learning_rate": 9.705403155957513e-05, + "loss": 3.0514, + "step": 12812 + }, + { + "epoch": 0.5965500384105036, + "grad_norm": 0.35421886961411353, + "learning_rate": 9.705311544385854e-05, + "loss": 3.0648, + "step": 12813 + }, + { + "epoch": 0.5965965965965966, + "grad_norm": 0.36982652312881265, + "learning_rate": 9.705219919004569e-05, + "loss": 3.1174, + "step": 12814 + }, + { + "epoch": 0.5966431547826897, + "grad_norm": 0.3926303155791352, + "learning_rate": 9.705128279813929e-05, + "loss": 3.0597, + "step": 12815 + }, + { + "epoch": 0.5966897129687827, + "grad_norm": 0.31653259410744133, + "learning_rate": 9.7050366268142e-05, + "loss": 3.0202, + "step": 12816 + }, + { + "epoch": 0.5967362711548758, + "grad_norm": 0.3726280429862336, + "learning_rate": 9.704944960005651e-05, + "loss": 3.1285, + "step": 12817 + }, + { + "epoch": 0.5967828293409688, + "grad_norm": 0.3749127097085671, + "learning_rate": 9.704853279388553e-05, + "loss": 3.2412, + "step": 12818 + }, + { + "epoch": 0.5968293875270619, + "grad_norm": 0.3726915619508758, + "learning_rate": 9.704761584963173e-05, + "loss": 3.0966, + "step": 12819 + }, + { + "epoch": 0.596875945713155, + "grad_norm": 0.3838522115053122, + "learning_rate": 9.704669876729781e-05, + "loss": 3.19, + "step": 12820 + }, + { + "epoch": 0.5969225038992481, + "grad_norm": 0.3557859715966046, + "learning_rate": 9.704578154688648e-05, + "loss": 3.085, + "step": 12821 + }, + { + "epoch": 0.5969690620853412, + "grad_norm": 0.3778301079723853, + "learning_rate": 9.704486418840042e-05, + "loss": 3.1405, + "step": 12822 + }, + { + "epoch": 0.5970156202714342, + "grad_norm": 0.3723773868752687, + "learning_rate": 9.704394669184231e-05, + "loss": 2.9165, + "step": 12823 + }, + { + "epoch": 0.5970621784575273, + "grad_norm": 0.33486369482640627, + "learning_rate": 9.704302905721484e-05, + "loss": 2.9334, + "step": 12824 + }, + { + "epoch": 0.5971087366436204, + "grad_norm": 0.3727081469152646, + "learning_rate": 9.704211128452073e-05, + "loss": 3.0997, + "step": 12825 + }, + { + "epoch": 0.5971552948297134, + "grad_norm": 0.3394124860233261, + "learning_rate": 9.704119337376265e-05, + "loss": 3.0549, + "step": 12826 + }, + { + "epoch": 0.5972018530158065, + "grad_norm": 0.36277805256549817, + "learning_rate": 9.704027532494331e-05, + "loss": 3.1838, + "step": 12827 + }, + { + "epoch": 0.5972484112018995, + "grad_norm": 0.3504169674033841, + "learning_rate": 9.703935713806538e-05, + "loss": 3.0953, + "step": 12828 + }, + { + "epoch": 
0.5972949693879926, + "grad_norm": 0.3383705336224213, + "learning_rate": 9.703843881313158e-05, + "loss": 3.1655, + "step": 12829 + }, + { + "epoch": 0.5973415275740858, + "grad_norm": 0.33359140137173904, + "learning_rate": 9.703752035014459e-05, + "loss": 2.958, + "step": 12830 + }, + { + "epoch": 0.5973880857601788, + "grad_norm": 0.33405247032985735, + "learning_rate": 9.703660174910712e-05, + "loss": 3.0562, + "step": 12831 + }, + { + "epoch": 0.5974346439462719, + "grad_norm": 0.35744773628112236, + "learning_rate": 9.703568301002187e-05, + "loss": 3.0683, + "step": 12832 + }, + { + "epoch": 0.5974812021323649, + "grad_norm": 0.374385396271221, + "learning_rate": 9.70347641328915e-05, + "loss": 3.1463, + "step": 12833 + }, + { + "epoch": 0.597527760318458, + "grad_norm": 0.39565618261852015, + "learning_rate": 9.703384511771873e-05, + "loss": 3.0679, + "step": 12834 + }, + { + "epoch": 0.5975743185045511, + "grad_norm": 0.3701355366473389, + "learning_rate": 9.703292596450629e-05, + "loss": 3.2159, + "step": 12835 + }, + { + "epoch": 0.5976208766906441, + "grad_norm": 0.380856039566976, + "learning_rate": 9.703200667325682e-05, + "loss": 3.0869, + "step": 12836 + }, + { + "epoch": 0.5976674348767372, + "grad_norm": 0.38145109296481133, + "learning_rate": 9.703108724397305e-05, + "loss": 3.1907, + "step": 12837 + }, + { + "epoch": 0.5977139930628302, + "grad_norm": 0.4296489531464699, + "learning_rate": 9.703016767665767e-05, + "loss": 3.0863, + "step": 12838 + }, + { + "epoch": 0.5977605512489234, + "grad_norm": 0.354430182481738, + "learning_rate": 9.702924797131339e-05, + "loss": 3.1741, + "step": 12839 + }, + { + "epoch": 0.5978071094350164, + "grad_norm": 0.4138076076457401, + "learning_rate": 9.702832812794289e-05, + "loss": 3.1199, + "step": 12840 + }, + { + "epoch": 0.5978536676211095, + "grad_norm": 0.3927760501794985, + "learning_rate": 9.702740814654887e-05, + "loss": 3.1411, + "step": 12841 + }, + { + "epoch": 0.5979002258072026, + "grad_norm": 0.35792193964200986, + "learning_rate": 9.702648802713406e-05, + "loss": 3.0452, + "step": 12842 + }, + { + "epoch": 0.5979467839932956, + "grad_norm": 0.3925478906992445, + "learning_rate": 9.702556776970113e-05, + "loss": 3.0962, + "step": 12843 + }, + { + "epoch": 0.5979933421793887, + "grad_norm": 0.42366461780936365, + "learning_rate": 9.70246473742528e-05, + "loss": 2.9688, + "step": 12844 + }, + { + "epoch": 0.5980399003654817, + "grad_norm": 0.42925655121062334, + "learning_rate": 9.702372684079175e-05, + "loss": 3.1515, + "step": 12845 + }, + { + "epoch": 0.5980864585515748, + "grad_norm": 0.3743279756687892, + "learning_rate": 9.702280616932072e-05, + "loss": 2.967, + "step": 12846 + }, + { + "epoch": 0.5981330167376679, + "grad_norm": 0.3586331059172648, + "learning_rate": 9.702188535984236e-05, + "loss": 3.0788, + "step": 12847 + }, + { + "epoch": 0.598179574923761, + "grad_norm": 0.40951542501910987, + "learning_rate": 9.702096441235941e-05, + "loss": 2.9998, + "step": 12848 + }, + { + "epoch": 0.5982261331098541, + "grad_norm": 0.33616714867178604, + "learning_rate": 9.702004332687455e-05, + "loss": 3.1282, + "step": 12849 + }, + { + "epoch": 0.5982726912959471, + "grad_norm": 0.3582828494463901, + "learning_rate": 9.701912210339051e-05, + "loss": 3.143, + "step": 12850 + }, + { + "epoch": 0.5983192494820402, + "grad_norm": 0.38107474452569806, + "learning_rate": 9.701820074190996e-05, + "loss": 2.9879, + "step": 12851 + }, + { + "epoch": 0.5983658076681333, + "grad_norm": 0.36151991587234483, + "learning_rate": 
9.701727924243565e-05, + "loss": 3.0435, + "step": 12852 + }, + { + "epoch": 0.5984123658542263, + "grad_norm": 0.41750905046260584, + "learning_rate": 9.701635760497023e-05, + "loss": 3.1227, + "step": 12853 + }, + { + "epoch": 0.5984589240403194, + "grad_norm": 0.3830777082959691, + "learning_rate": 9.701543582951646e-05, + "loss": 3.0573, + "step": 12854 + }, + { + "epoch": 0.5985054822264124, + "grad_norm": 0.36480136300007465, + "learning_rate": 9.7014513916077e-05, + "loss": 3.1205, + "step": 12855 + }, + { + "epoch": 0.5985520404125055, + "grad_norm": 0.4023485648579418, + "learning_rate": 9.701359186465458e-05, + "loss": 3.0261, + "step": 12856 + }, + { + "epoch": 0.5985985985985987, + "grad_norm": 0.39898374302482986, + "learning_rate": 9.701266967525188e-05, + "loss": 2.9887, + "step": 12857 + }, + { + "epoch": 0.5986451567846917, + "grad_norm": 0.39916602555150016, + "learning_rate": 9.701174734787165e-05, + "loss": 3.1241, + "step": 12858 + }, + { + "epoch": 0.5986917149707848, + "grad_norm": 0.44823935690805805, + "learning_rate": 9.701082488251656e-05, + "loss": 3.0425, + "step": 12859 + }, + { + "epoch": 0.5987382731568778, + "grad_norm": 0.37031515551756716, + "learning_rate": 9.700990227918933e-05, + "loss": 3.049, + "step": 12860 + }, + { + "epoch": 0.5987848313429709, + "grad_norm": 0.374408744544731, + "learning_rate": 9.700897953789268e-05, + "loss": 2.9791, + "step": 12861 + }, + { + "epoch": 0.5988313895290639, + "grad_norm": 0.39232231407252277, + "learning_rate": 9.70080566586293e-05, + "loss": 3.0585, + "step": 12862 + }, + { + "epoch": 0.598877947715157, + "grad_norm": 0.34044166850967084, + "learning_rate": 9.70071336414019e-05, + "loss": 2.9619, + "step": 12863 + }, + { + "epoch": 0.5989245059012501, + "grad_norm": 0.38154262835712605, + "learning_rate": 9.700621048621322e-05, + "loss": 3.0191, + "step": 12864 + }, + { + "epoch": 0.5989710640873431, + "grad_norm": 0.3615023660756201, + "learning_rate": 9.70052871930659e-05, + "loss": 3.11, + "step": 12865 + }, + { + "epoch": 0.5990176222734362, + "grad_norm": 0.34770541524850856, + "learning_rate": 9.700436376196271e-05, + "loss": 3.1348, + "step": 12866 + }, + { + "epoch": 0.5990641804595292, + "grad_norm": 0.3640602548344233, + "learning_rate": 9.700344019290636e-05, + "loss": 3.0483, + "step": 12867 + }, + { + "epoch": 0.5991107386456224, + "grad_norm": 0.36362226471487613, + "learning_rate": 9.700251648589953e-05, + "loss": 3.0522, + "step": 12868 + }, + { + "epoch": 0.5991572968317155, + "grad_norm": 0.3954136034151789, + "learning_rate": 9.700159264094495e-05, + "loss": 3.1766, + "step": 12869 + }, + { + "epoch": 0.5992038550178085, + "grad_norm": 0.39775497806167337, + "learning_rate": 9.700066865804531e-05, + "loss": 3.0642, + "step": 12870 + }, + { + "epoch": 0.5992504132039016, + "grad_norm": 0.33551223250804324, + "learning_rate": 9.699974453720336e-05, + "loss": 3.1408, + "step": 12871 + }, + { + "epoch": 0.5992969713899946, + "grad_norm": 0.41936716381327405, + "learning_rate": 9.699882027842177e-05, + "loss": 3.1082, + "step": 12872 + }, + { + "epoch": 0.5993435295760877, + "grad_norm": 0.3842132758625923, + "learning_rate": 9.699789588170329e-05, + "loss": 3.0726, + "step": 12873 + }, + { + "epoch": 0.5993900877621808, + "grad_norm": 0.4049798292164079, + "learning_rate": 9.699697134705061e-05, + "loss": 2.9508, + "step": 12874 + }, + { + "epoch": 0.5994366459482738, + "grad_norm": 0.3941628434702796, + "learning_rate": 9.699604667446643e-05, + "loss": 3.0483, + "step": 12875 + }, + { + "epoch": 
0.599483204134367, + "grad_norm": 0.4230429026412108, + "learning_rate": 9.699512186395351e-05, + "loss": 3.0326, + "step": 12876 + }, + { + "epoch": 0.59952976232046, + "grad_norm": 0.40076421756556985, + "learning_rate": 9.699419691551452e-05, + "loss": 3.0768, + "step": 12877 + }, + { + "epoch": 0.5995763205065531, + "grad_norm": 0.3818125754283508, + "learning_rate": 9.69932718291522e-05, + "loss": 3.1306, + "step": 12878 + }, + { + "epoch": 0.5996228786926462, + "grad_norm": 0.39123259764348367, + "learning_rate": 9.699234660486926e-05, + "loss": 3.1331, + "step": 12879 + }, + { + "epoch": 0.5996694368787392, + "grad_norm": 0.3615132599258781, + "learning_rate": 9.69914212426684e-05, + "loss": 3.0576, + "step": 12880 + }, + { + "epoch": 0.5997159950648323, + "grad_norm": 0.39224850612437623, + "learning_rate": 9.699049574255236e-05, + "loss": 3.0704, + "step": 12881 + }, + { + "epoch": 0.5997625532509253, + "grad_norm": 0.35525199050738443, + "learning_rate": 9.698957010452383e-05, + "loss": 3.1387, + "step": 12882 + }, + { + "epoch": 0.5998091114370184, + "grad_norm": 0.34364861390522955, + "learning_rate": 9.698864432858556e-05, + "loss": 3.0864, + "step": 12883 + }, + { + "epoch": 0.5998556696231114, + "grad_norm": 0.3719729152043101, + "learning_rate": 9.698771841474024e-05, + "loss": 3.175, + "step": 12884 + }, + { + "epoch": 0.5999022278092045, + "grad_norm": 0.3780064794471781, + "learning_rate": 9.698679236299059e-05, + "loss": 3.0872, + "step": 12885 + }, + { + "epoch": 0.5999487859952977, + "grad_norm": 0.3621003497040247, + "learning_rate": 9.698586617333933e-05, + "loss": 3.047, + "step": 12886 + }, + { + "epoch": 0.5999953441813907, + "grad_norm": 0.39274607198609507, + "learning_rate": 9.69849398457892e-05, + "loss": 3.1301, + "step": 12887 + }, + { + "epoch": 0.6000419023674838, + "grad_norm": 0.4075826323039503, + "learning_rate": 9.698401338034289e-05, + "loss": 3.0897, + "step": 12888 + }, + { + "epoch": 0.6000884605535768, + "grad_norm": 0.3560651687587407, + "learning_rate": 9.698308677700313e-05, + "loss": 3.0675, + "step": 12889 + }, + { + "epoch": 0.6001350187396699, + "grad_norm": 0.35973048409798003, + "learning_rate": 9.698216003577263e-05, + "loss": 3.0145, + "step": 12890 + }, + { + "epoch": 0.600181576925763, + "grad_norm": 0.3494742120967148, + "learning_rate": 9.698123315665412e-05, + "loss": 3.1047, + "step": 12891 + }, + { + "epoch": 0.600228135111856, + "grad_norm": 0.34869112476762204, + "learning_rate": 9.698030613965034e-05, + "loss": 3.0896, + "step": 12892 + }, + { + "epoch": 0.6002746932979491, + "grad_norm": 0.3344502922657257, + "learning_rate": 9.697937898476398e-05, + "loss": 3.0857, + "step": 12893 + }, + { + "epoch": 0.6003212514840421, + "grad_norm": 0.33744787037696067, + "learning_rate": 9.697845169199775e-05, + "loss": 3.0795, + "step": 12894 + }, + { + "epoch": 0.6003678096701353, + "grad_norm": 0.34183494072623655, + "learning_rate": 9.697752426135442e-05, + "loss": 3.0755, + "step": 12895 + }, + { + "epoch": 0.6004143678562284, + "grad_norm": 0.40884295993112985, + "learning_rate": 9.697659669283668e-05, + "loss": 3.1295, + "step": 12896 + }, + { + "epoch": 0.6004609260423214, + "grad_norm": 0.3828863292558405, + "learning_rate": 9.697566898644724e-05, + "loss": 3.0365, + "step": 12897 + }, + { + "epoch": 0.6005074842284145, + "grad_norm": 0.33101934726511684, + "learning_rate": 9.697474114218885e-05, + "loss": 3.0072, + "step": 12898 + }, + { + "epoch": 0.6005540424145075, + "grad_norm": 0.3770886988175385, + "learning_rate": 
9.697381316006422e-05, + "loss": 3.0167, + "step": 12899 + }, + { + "epoch": 0.6006006006006006, + "grad_norm": 0.3310406085075769, + "learning_rate": 9.697288504007608e-05, + "loss": 2.9733, + "step": 12900 + }, + { + "epoch": 0.6006471587866937, + "grad_norm": 0.396967828300432, + "learning_rate": 9.697195678222715e-05, + "loss": 3.055, + "step": 12901 + }, + { + "epoch": 0.6006937169727867, + "grad_norm": 0.39433748281007924, + "learning_rate": 9.697102838652015e-05, + "loss": 3.05, + "step": 12902 + }, + { + "epoch": 0.6007402751588798, + "grad_norm": 0.3457643363386323, + "learning_rate": 9.697009985295781e-05, + "loss": 3.0778, + "step": 12903 + }, + { + "epoch": 0.6007868333449728, + "grad_norm": 0.39996689313047207, + "learning_rate": 9.696917118154284e-05, + "loss": 3.2027, + "step": 12904 + }, + { + "epoch": 0.600833391531066, + "grad_norm": 0.36921279389530287, + "learning_rate": 9.696824237227802e-05, + "loss": 3.1401, + "step": 12905 + }, + { + "epoch": 0.600879949717159, + "grad_norm": 0.33876959954604474, + "learning_rate": 9.696731342516601e-05, + "loss": 3.0868, + "step": 12906 + }, + { + "epoch": 0.6009265079032521, + "grad_norm": 0.3801979900476027, + "learning_rate": 9.696638434020955e-05, + "loss": 3.1689, + "step": 12907 + }, + { + "epoch": 0.6009730660893452, + "grad_norm": 0.3522867225615954, + "learning_rate": 9.69654551174114e-05, + "loss": 3.1136, + "step": 12908 + }, + { + "epoch": 0.6010196242754382, + "grad_norm": 0.3361771621011469, + "learning_rate": 9.696452575677426e-05, + "loss": 3.0776, + "step": 12909 + }, + { + "epoch": 0.6010661824615313, + "grad_norm": 0.36915247471419105, + "learning_rate": 9.696359625830086e-05, + "loss": 3.1311, + "step": 12910 + }, + { + "epoch": 0.6011127406476243, + "grad_norm": 0.35609253605398555, + "learning_rate": 9.696266662199393e-05, + "loss": 3.0452, + "step": 12911 + }, + { + "epoch": 0.6011592988337174, + "grad_norm": 0.3816849513326999, + "learning_rate": 9.69617368478562e-05, + "loss": 3.0688, + "step": 12912 + }, + { + "epoch": 0.6012058570198106, + "grad_norm": 0.3730101372813995, + "learning_rate": 9.696080693589041e-05, + "loss": 3.0553, + "step": 12913 + }, + { + "epoch": 0.6012524152059036, + "grad_norm": 0.3441112727677013, + "learning_rate": 9.695987688609926e-05, + "loss": 3.1553, + "step": 12914 + }, + { + "epoch": 0.6012989733919967, + "grad_norm": 0.339236712374553, + "learning_rate": 9.695894669848552e-05, + "loss": 3.0379, + "step": 12915 + }, + { + "epoch": 0.6013455315780897, + "grad_norm": 0.35565388903812056, + "learning_rate": 9.695801637305188e-05, + "loss": 3.0834, + "step": 12916 + }, + { + "epoch": 0.6013920897641828, + "grad_norm": 0.3632035529448341, + "learning_rate": 9.695708590980111e-05, + "loss": 3.0732, + "step": 12917 + }, + { + "epoch": 0.6014386479502759, + "grad_norm": 0.3337174906291357, + "learning_rate": 9.69561553087359e-05, + "loss": 3.0797, + "step": 12918 + }, + { + "epoch": 0.6014852061363689, + "grad_norm": 0.3857268232047335, + "learning_rate": 9.695522456985899e-05, + "loss": 3.0696, + "step": 12919 + }, + { + "epoch": 0.601531764322462, + "grad_norm": 0.35069315116441907, + "learning_rate": 9.695429369317315e-05, + "loss": 3.0926, + "step": 12920 + }, + { + "epoch": 0.601578322508555, + "grad_norm": 0.39956827446827625, + "learning_rate": 9.695336267868106e-05, + "loss": 3.0567, + "step": 12921 + }, + { + "epoch": 0.6016248806946481, + "grad_norm": 0.35445036241545014, + "learning_rate": 9.69524315263855e-05, + "loss": 3.0961, + "step": 12922 + }, + { + "epoch": 
0.6016714388807413, + "grad_norm": 0.40433670279389555, + "learning_rate": 9.695150023628915e-05, + "loss": 3.0866, + "step": 12923 + }, + { + "epoch": 0.6017179970668343, + "grad_norm": 0.3789135310204089, + "learning_rate": 9.695056880839479e-05, + "loss": 3.2244, + "step": 12924 + }, + { + "epoch": 0.6017645552529274, + "grad_norm": 0.3942385395634497, + "learning_rate": 9.694963724270513e-05, + "loss": 3.0488, + "step": 12925 + }, + { + "epoch": 0.6018111134390204, + "grad_norm": 0.36046674953571073, + "learning_rate": 9.69487055392229e-05, + "loss": 3.1933, + "step": 12926 + }, + { + "epoch": 0.6018576716251135, + "grad_norm": 0.36539090759922965, + "learning_rate": 9.694777369795087e-05, + "loss": 3.147, + "step": 12927 + }, + { + "epoch": 0.6019042298112065, + "grad_norm": 0.38376096103758, + "learning_rate": 9.694684171889173e-05, + "loss": 3.1155, + "step": 12928 + }, + { + "epoch": 0.6019507879972996, + "grad_norm": 0.3559211654813856, + "learning_rate": 9.694590960204825e-05, + "loss": 3.1344, + "step": 12929 + }, + { + "epoch": 0.6019973461833927, + "grad_norm": 0.3364798236974006, + "learning_rate": 9.694497734742314e-05, + "loss": 3.0512, + "step": 12930 + }, + { + "epoch": 0.6020439043694857, + "grad_norm": 0.3862207225423991, + "learning_rate": 9.694404495501914e-05, + "loss": 3.1304, + "step": 12931 + }, + { + "epoch": 0.6020904625555789, + "grad_norm": 0.37793398748637835, + "learning_rate": 9.6943112424839e-05, + "loss": 3.1586, + "step": 12932 + }, + { + "epoch": 0.6021370207416719, + "grad_norm": 0.4029582534077132, + "learning_rate": 9.694217975688545e-05, + "loss": 3.0895, + "step": 12933 + }, + { + "epoch": 0.602183578927765, + "grad_norm": 0.38437098805150693, + "learning_rate": 9.694124695116123e-05, + "loss": 3.0378, + "step": 12934 + }, + { + "epoch": 0.6022301371138581, + "grad_norm": 0.357538949895721, + "learning_rate": 9.694031400766907e-05, + "loss": 3.059, + "step": 12935 + }, + { + "epoch": 0.6022766952999511, + "grad_norm": 0.42364097458257177, + "learning_rate": 9.69393809264117e-05, + "loss": 3.129, + "step": 12936 + }, + { + "epoch": 0.6023232534860442, + "grad_norm": 0.3908983193767291, + "learning_rate": 9.69384477073919e-05, + "loss": 3.0876, + "step": 12937 + }, + { + "epoch": 0.6023698116721372, + "grad_norm": 0.3301456851490655, + "learning_rate": 9.693751435061235e-05, + "loss": 3.0214, + "step": 12938 + }, + { + "epoch": 0.6024163698582303, + "grad_norm": 0.39597881285712233, + "learning_rate": 9.693658085607584e-05, + "loss": 3.1299, + "step": 12939 + }, + { + "epoch": 0.6024629280443234, + "grad_norm": 0.3432434356949626, + "learning_rate": 9.69356472237851e-05, + "loss": 2.9525, + "step": 12940 + }, + { + "epoch": 0.6025094862304164, + "grad_norm": 0.4107367997541809, + "learning_rate": 9.693471345374283e-05, + "loss": 3.0334, + "step": 12941 + }, + { + "epoch": 0.6025560444165096, + "grad_norm": 0.45053986049058153, + "learning_rate": 9.693377954595183e-05, + "loss": 3.1371, + "step": 12942 + }, + { + "epoch": 0.6026026026026026, + "grad_norm": 0.36173804879874005, + "learning_rate": 9.69328455004148e-05, + "loss": 3.0308, + "step": 12943 + }, + { + "epoch": 0.6026491607886957, + "grad_norm": 0.3892562295134537, + "learning_rate": 9.693191131713448e-05, + "loss": 3.0997, + "step": 12944 + }, + { + "epoch": 0.6026957189747888, + "grad_norm": 0.38132904029755593, + "learning_rate": 9.693097699611362e-05, + "loss": 3.1118, + "step": 12945 + }, + { + "epoch": 0.6027422771608818, + "grad_norm": 0.373235321150679, + "learning_rate": 
9.693004253735498e-05, + "loss": 3.0965, + "step": 12946 + }, + { + "epoch": 0.6027888353469749, + "grad_norm": 0.4082010777786693, + "learning_rate": 9.692910794086128e-05, + "loss": 3.1396, + "step": 12947 + }, + { + "epoch": 0.6028353935330679, + "grad_norm": 0.3759263283376634, + "learning_rate": 9.692817320663528e-05, + "loss": 3.1412, + "step": 12948 + }, + { + "epoch": 0.602881951719161, + "grad_norm": 0.4337959334020814, + "learning_rate": 9.692723833467971e-05, + "loss": 3.1906, + "step": 12949 + }, + { + "epoch": 0.602928509905254, + "grad_norm": 0.37073522568898043, + "learning_rate": 9.692630332499732e-05, + "loss": 3.0257, + "step": 12950 + }, + { + "epoch": 0.6029750680913472, + "grad_norm": 0.37751766277578724, + "learning_rate": 9.692536817759086e-05, + "loss": 2.9779, + "step": 12951 + }, + { + "epoch": 0.6030216262774403, + "grad_norm": 0.3947926054997699, + "learning_rate": 9.692443289246304e-05, + "loss": 3.128, + "step": 12952 + }, + { + "epoch": 0.6030681844635333, + "grad_norm": 0.3417023506371607, + "learning_rate": 9.692349746961667e-05, + "loss": 3.0271, + "step": 12953 + }, + { + "epoch": 0.6031147426496264, + "grad_norm": 0.36766227544895724, + "learning_rate": 9.692256190905443e-05, + "loss": 3.0865, + "step": 12954 + }, + { + "epoch": 0.6031613008357194, + "grad_norm": 0.39387468390595487, + "learning_rate": 9.69216262107791e-05, + "loss": 3.103, + "step": 12955 + }, + { + "epoch": 0.6032078590218125, + "grad_norm": 0.3602521195459057, + "learning_rate": 9.692069037479344e-05, + "loss": 3.0168, + "step": 12956 + }, + { + "epoch": 0.6032544172079056, + "grad_norm": 0.3375180479761552, + "learning_rate": 9.691975440110015e-05, + "loss": 3.0813, + "step": 12957 + }, + { + "epoch": 0.6033009753939986, + "grad_norm": 0.39709734816555115, + "learning_rate": 9.691881828970201e-05, + "loss": 3.1172, + "step": 12958 + }, + { + "epoch": 0.6033475335800917, + "grad_norm": 0.3610220904918238, + "learning_rate": 9.691788204060177e-05, + "loss": 3.0111, + "step": 12959 + }, + { + "epoch": 0.6033940917661847, + "grad_norm": 0.39858960458253906, + "learning_rate": 9.691694565380216e-05, + "loss": 2.9899, + "step": 12960 + }, + { + "epoch": 0.6034406499522779, + "grad_norm": 0.3632994290327145, + "learning_rate": 9.691600912930593e-05, + "loss": 2.9894, + "step": 12961 + }, + { + "epoch": 0.603487208138371, + "grad_norm": 0.36313194081189126, + "learning_rate": 9.691507246711585e-05, + "loss": 3.0083, + "step": 12962 + }, + { + "epoch": 0.603533766324464, + "grad_norm": 0.38373066029904324, + "learning_rate": 9.691413566723464e-05, + "loss": 3.0806, + "step": 12963 + }, + { + "epoch": 0.6035803245105571, + "grad_norm": 0.4271339244019861, + "learning_rate": 9.691319872966506e-05, + "loss": 3.2077, + "step": 12964 + }, + { + "epoch": 0.6036268826966501, + "grad_norm": 0.3576294336724169, + "learning_rate": 9.691226165440989e-05, + "loss": 3.1175, + "step": 12965 + }, + { + "epoch": 0.6036734408827432, + "grad_norm": 0.39147926294063806, + "learning_rate": 9.691132444147184e-05, + "loss": 3.1639, + "step": 12966 + }, + { + "epoch": 0.6037199990688363, + "grad_norm": 0.34325793972035, + "learning_rate": 9.691038709085367e-05, + "loss": 3.0653, + "step": 12967 + }, + { + "epoch": 0.6037665572549293, + "grad_norm": 0.35485613241610453, + "learning_rate": 9.690944960255814e-05, + "loss": 3.1275, + "step": 12968 + }, + { + "epoch": 0.6038131154410225, + "grad_norm": 0.3655692404340576, + "learning_rate": 9.690851197658798e-05, + "loss": 3.1231, + "step": 12969 + }, + { + "epoch": 
0.6038596736271155, + "grad_norm": 0.3826729815805137, + "learning_rate": 9.690757421294599e-05, + "loss": 3.1008, + "step": 12970 + }, + { + "epoch": 0.6039062318132086, + "grad_norm": 0.31175143370573766, + "learning_rate": 9.690663631163486e-05, + "loss": 3.0487, + "step": 12971 + }, + { + "epoch": 0.6039527899993016, + "grad_norm": 0.3804865435484985, + "learning_rate": 9.690569827265738e-05, + "loss": 3.0001, + "step": 12972 + }, + { + "epoch": 0.6039993481853947, + "grad_norm": 0.3377914165914097, + "learning_rate": 9.690476009601633e-05, + "loss": 3.1199, + "step": 12973 + }, + { + "epoch": 0.6040459063714878, + "grad_norm": 0.37124923992565523, + "learning_rate": 9.690382178171439e-05, + "loss": 3.1797, + "step": 12974 + }, + { + "epoch": 0.6040924645575808, + "grad_norm": 0.3331874959454248, + "learning_rate": 9.690288332975437e-05, + "loss": 3.1287, + "step": 12975 + }, + { + "epoch": 0.6041390227436739, + "grad_norm": 0.3539292235536336, + "learning_rate": 9.690194474013901e-05, + "loss": 3.0135, + "step": 12976 + }, + { + "epoch": 0.6041855809297669, + "grad_norm": 0.3493535351475146, + "learning_rate": 9.690100601287104e-05, + "loss": 3.0488, + "step": 12977 + }, + { + "epoch": 0.60423213911586, + "grad_norm": 0.3553481953640156, + "learning_rate": 9.690006714795326e-05, + "loss": 3.1604, + "step": 12978 + }, + { + "epoch": 0.6042786973019532, + "grad_norm": 0.38961655857736965, + "learning_rate": 9.68991281453884e-05, + "loss": 3.1125, + "step": 12979 + }, + { + "epoch": 0.6043252554880462, + "grad_norm": 0.3722228392002546, + "learning_rate": 9.689818900517922e-05, + "loss": 3.0884, + "step": 12980 + }, + { + "epoch": 0.6043718136741393, + "grad_norm": 0.3700515854858977, + "learning_rate": 9.689724972732847e-05, + "loss": 3.0807, + "step": 12981 + }, + { + "epoch": 0.6044183718602323, + "grad_norm": 0.39983136118421436, + "learning_rate": 9.689631031183892e-05, + "loss": 3.0355, + "step": 12982 + }, + { + "epoch": 0.6044649300463254, + "grad_norm": 0.3541277203274127, + "learning_rate": 9.689537075871332e-05, + "loss": 3.0747, + "step": 12983 + }, + { + "epoch": 0.6045114882324185, + "grad_norm": 0.4155265021827233, + "learning_rate": 9.689443106795441e-05, + "loss": 3.0538, + "step": 12984 + }, + { + "epoch": 0.6045580464185115, + "grad_norm": 0.3840364294011109, + "learning_rate": 9.6893491239565e-05, + "loss": 3.0947, + "step": 12985 + }, + { + "epoch": 0.6046046046046046, + "grad_norm": 0.35032174015682793, + "learning_rate": 9.689255127354779e-05, + "loss": 3.0108, + "step": 12986 + }, + { + "epoch": 0.6046511627906976, + "grad_norm": 0.37601422425550907, + "learning_rate": 9.689161116990556e-05, + "loss": 3.0943, + "step": 12987 + }, + { + "epoch": 0.6046977209767908, + "grad_norm": 0.38634462566312067, + "learning_rate": 9.689067092864107e-05, + "loss": 3.0127, + "step": 12988 + }, + { + "epoch": 0.6047442791628839, + "grad_norm": 0.39999828584240865, + "learning_rate": 9.688973054975707e-05, + "loss": 3.0596, + "step": 12989 + }, + { + "epoch": 0.6047908373489769, + "grad_norm": 0.4231035679007962, + "learning_rate": 9.688879003325636e-05, + "loss": 3.1196, + "step": 12990 + }, + { + "epoch": 0.60483739553507, + "grad_norm": 0.40848631677314623, + "learning_rate": 9.688784937914164e-05, + "loss": 3.031, + "step": 12991 + }, + { + "epoch": 0.604883953721163, + "grad_norm": 0.40788098956524554, + "learning_rate": 9.688690858741572e-05, + "loss": 2.9867, + "step": 12992 + }, + { + "epoch": 0.6049305119072561, + "grad_norm": 0.3661410537773122, + "learning_rate": 
9.688596765808134e-05, + "loss": 2.9973, + "step": 12993 + }, + { + "epoch": 0.6049770700933491, + "grad_norm": 0.3805821926895507, + "learning_rate": 9.688502659114127e-05, + "loss": 3.096, + "step": 12994 + }, + { + "epoch": 0.6050236282794422, + "grad_norm": 0.36695071376109956, + "learning_rate": 9.688408538659825e-05, + "loss": 3.0887, + "step": 12995 + }, + { + "epoch": 0.6050701864655353, + "grad_norm": 0.3891889812888367, + "learning_rate": 9.688314404445506e-05, + "loss": 3.0443, + "step": 12996 + }, + { + "epoch": 0.6051167446516283, + "grad_norm": 0.36876319973330357, + "learning_rate": 9.688220256471447e-05, + "loss": 3.1911, + "step": 12997 + }, + { + "epoch": 0.6051633028377215, + "grad_norm": 0.3734368476101671, + "learning_rate": 9.688126094737923e-05, + "loss": 3.1221, + "step": 12998 + }, + { + "epoch": 0.6052098610238145, + "grad_norm": 0.34785992029694546, + "learning_rate": 9.688031919245211e-05, + "loss": 3.0586, + "step": 12999 + }, + { + "epoch": 0.6052564192099076, + "grad_norm": 0.3249795647511201, + "learning_rate": 9.687937729993587e-05, + "loss": 3.132, + "step": 13000 + }, + { + "epoch": 0.6053029773960007, + "grad_norm": 0.361004019080155, + "learning_rate": 9.687843526983328e-05, + "loss": 3.0623, + "step": 13001 + }, + { + "epoch": 0.6053495355820937, + "grad_norm": 0.34694160207222, + "learning_rate": 9.68774931021471e-05, + "loss": 3.0427, + "step": 13002 + }, + { + "epoch": 0.6053960937681868, + "grad_norm": 0.36085532668704173, + "learning_rate": 9.68765507968801e-05, + "loss": 3.0324, + "step": 13003 + }, + { + "epoch": 0.6054426519542798, + "grad_norm": 0.364970840141367, + "learning_rate": 9.687560835403502e-05, + "loss": 3.1137, + "step": 13004 + }, + { + "epoch": 0.6054892101403729, + "grad_norm": 0.3558819969753635, + "learning_rate": 9.687466577361467e-05, + "loss": 3.1583, + "step": 13005 + }, + { + "epoch": 0.605535768326466, + "grad_norm": 0.357978141687436, + "learning_rate": 9.687372305562178e-05, + "loss": 3.0387, + "step": 13006 + }, + { + "epoch": 0.605582326512559, + "grad_norm": 0.371323070122986, + "learning_rate": 9.687278020005914e-05, + "loss": 3.1269, + "step": 13007 + }, + { + "epoch": 0.6056288846986522, + "grad_norm": 0.36058648664370135, + "learning_rate": 9.68718372069295e-05, + "loss": 3.064, + "step": 13008 + }, + { + "epoch": 0.6056754428847452, + "grad_norm": 0.355802307237298, + "learning_rate": 9.687089407623564e-05, + "loss": 3.0258, + "step": 13009 + }, + { + "epoch": 0.6057220010708383, + "grad_norm": 0.3405244655754191, + "learning_rate": 9.686995080798032e-05, + "loss": 3.0192, + "step": 13010 + }, + { + "epoch": 0.6057685592569314, + "grad_norm": 0.347482541070458, + "learning_rate": 9.686900740216632e-05, + "loss": 3.023, + "step": 13011 + }, + { + "epoch": 0.6058151174430244, + "grad_norm": 0.3490148930072832, + "learning_rate": 9.686806385879639e-05, + "loss": 3.0353, + "step": 13012 + }, + { + "epoch": 0.6058616756291175, + "grad_norm": 0.373278601085222, + "learning_rate": 9.686712017787331e-05, + "loss": 3.0924, + "step": 13013 + }, + { + "epoch": 0.6059082338152105, + "grad_norm": 0.4138642726383235, + "learning_rate": 9.686617635939988e-05, + "loss": 3.0063, + "step": 13014 + }, + { + "epoch": 0.6059547920013036, + "grad_norm": 0.3699986525683928, + "learning_rate": 9.68652324033788e-05, + "loss": 3.0111, + "step": 13015 + }, + { + "epoch": 0.6060013501873966, + "grad_norm": 0.3996219572549381, + "learning_rate": 9.68642883098129e-05, + "loss": 3.0662, + "step": 13016 + }, + { + "epoch": 0.6060479083734898, 
+ "grad_norm": 0.36631648746765066, + "learning_rate": 9.686334407870492e-05, + "loss": 3.1082, + "step": 13017 + }, + { + "epoch": 0.6060944665595829, + "grad_norm": 0.36285183934833454, + "learning_rate": 9.686239971005765e-05, + "loss": 3.1749, + "step": 13018 + }, + { + "epoch": 0.6061410247456759, + "grad_norm": 0.37083214088534266, + "learning_rate": 9.686145520387386e-05, + "loss": 3.0254, + "step": 13019 + }, + { + "epoch": 0.606187582931769, + "grad_norm": 0.3374427034186386, + "learning_rate": 9.68605105601563e-05, + "loss": 3.0419, + "step": 13020 + }, + { + "epoch": 0.606234141117862, + "grad_norm": 0.3897444172730016, + "learning_rate": 9.685956577890776e-05, + "loss": 3.0439, + "step": 13021 + }, + { + "epoch": 0.6062806993039551, + "grad_norm": 0.30815612779984153, + "learning_rate": 9.685862086013102e-05, + "loss": 2.9551, + "step": 13022 + }, + { + "epoch": 0.6063272574900482, + "grad_norm": 0.37047422946650027, + "learning_rate": 9.685767580382883e-05, + "loss": 3.2195, + "step": 13023 + }, + { + "epoch": 0.6063738156761412, + "grad_norm": 0.36207220337561, + "learning_rate": 9.6856730610004e-05, + "loss": 3.0929, + "step": 13024 + }, + { + "epoch": 0.6064203738622344, + "grad_norm": 0.35783629129183564, + "learning_rate": 9.685578527865926e-05, + "loss": 3.0903, + "step": 13025 + }, + { + "epoch": 0.6064669320483274, + "grad_norm": 0.39640967754144907, + "learning_rate": 9.685483980979742e-05, + "loss": 3.0707, + "step": 13026 + }, + { + "epoch": 0.6065134902344205, + "grad_norm": 0.3297362252778364, + "learning_rate": 9.685389420342123e-05, + "loss": 2.9848, + "step": 13027 + }, + { + "epoch": 0.6065600484205136, + "grad_norm": 0.4268141050255523, + "learning_rate": 9.685294845953349e-05, + "loss": 3.1101, + "step": 13028 + }, + { + "epoch": 0.6066066066066066, + "grad_norm": 0.3806779825339615, + "learning_rate": 9.685200257813696e-05, + "loss": 3.0162, + "step": 13029 + }, + { + "epoch": 0.6066531647926997, + "grad_norm": 0.3616661614616731, + "learning_rate": 9.685105655923441e-05, + "loss": 2.9901, + "step": 13030 + }, + { + "epoch": 0.6066997229787927, + "grad_norm": 0.4173151664914263, + "learning_rate": 9.685011040282863e-05, + "loss": 3.0626, + "step": 13031 + }, + { + "epoch": 0.6067462811648858, + "grad_norm": 0.4202556572159233, + "learning_rate": 9.684916410892237e-05, + "loss": 3.1926, + "step": 13032 + }, + { + "epoch": 0.6067928393509789, + "grad_norm": 0.374208084968071, + "learning_rate": 9.684821767751846e-05, + "loss": 2.9634, + "step": 13033 + }, + { + "epoch": 0.606839397537072, + "grad_norm": 0.35978269833731163, + "learning_rate": 9.684727110861963e-05, + "loss": 3.1236, + "step": 13034 + }, + { + "epoch": 0.6068859557231651, + "grad_norm": 0.3619714266552767, + "learning_rate": 9.684632440222867e-05, + "loss": 3.1678, + "step": 13035 + }, + { + "epoch": 0.6069325139092581, + "grad_norm": 0.34977204647647914, + "learning_rate": 9.684537755834838e-05, + "loss": 3.0826, + "step": 13036 + }, + { + "epoch": 0.6069790720953512, + "grad_norm": 0.3961311186283099, + "learning_rate": 9.684443057698151e-05, + "loss": 3.1826, + "step": 13037 + }, + { + "epoch": 0.6070256302814442, + "grad_norm": 0.38136691686191715, + "learning_rate": 9.684348345813086e-05, + "loss": 3.1152, + "step": 13038 + }, + { + "epoch": 0.6070721884675373, + "grad_norm": 0.3792188522539234, + "learning_rate": 9.68425362017992e-05, + "loss": 3.1199, + "step": 13039 + }, + { + "epoch": 0.6071187466536304, + "grad_norm": 0.3363217323179091, + "learning_rate": 9.684158880798931e-05, + 
"loss": 2.9537, + "step": 13040 + }, + { + "epoch": 0.6071653048397234, + "grad_norm": 0.3712787808311357, + "learning_rate": 9.684064127670395e-05, + "loss": 3.068, + "step": 13041 + }, + { + "epoch": 0.6072118630258165, + "grad_norm": 0.40034304539142684, + "learning_rate": 9.683969360794595e-05, + "loss": 3.111, + "step": 13042 + }, + { + "epoch": 0.6072584212119095, + "grad_norm": 0.3668792895454619, + "learning_rate": 9.683874580171805e-05, + "loss": 3.0788, + "step": 13043 + }, + { + "epoch": 0.6073049793980027, + "grad_norm": 0.38332090403841806, + "learning_rate": 9.683779785802305e-05, + "loss": 3.0664, + "step": 13044 + }, + { + "epoch": 0.6073515375840958, + "grad_norm": 0.3930984883092854, + "learning_rate": 9.683684977686374e-05, + "loss": 3.1498, + "step": 13045 + }, + { + "epoch": 0.6073980957701888, + "grad_norm": 0.37555643379208137, + "learning_rate": 9.683590155824287e-05, + "loss": 3.0275, + "step": 13046 + }, + { + "epoch": 0.6074446539562819, + "grad_norm": 0.37360945235162835, + "learning_rate": 9.683495320216325e-05, + "loss": 3.0181, + "step": 13047 + }, + { + "epoch": 0.6074912121423749, + "grad_norm": 0.4416117932608834, + "learning_rate": 9.683400470862765e-05, + "loss": 3.0467, + "step": 13048 + }, + { + "epoch": 0.607537770328468, + "grad_norm": 0.3925163668788114, + "learning_rate": 9.683305607763887e-05, + "loss": 3.1409, + "step": 13049 + }, + { + "epoch": 0.6075843285145611, + "grad_norm": 0.36294786232364357, + "learning_rate": 9.683210730919969e-05, + "loss": 3.0962, + "step": 13050 + }, + { + "epoch": 0.6076308867006541, + "grad_norm": 0.4429143551200719, + "learning_rate": 9.683115840331287e-05, + "loss": 3.0991, + "step": 13051 + }, + { + "epoch": 0.6076774448867472, + "grad_norm": 0.39073935105126695, + "learning_rate": 9.683020935998122e-05, + "loss": 3.1353, + "step": 13052 + }, + { + "epoch": 0.6077240030728402, + "grad_norm": 0.38882496549325196, + "learning_rate": 9.682926017920753e-05, + "loss": 3.148, + "step": 13053 + }, + { + "epoch": 0.6077705612589334, + "grad_norm": 0.4174448422856896, + "learning_rate": 9.682831086099458e-05, + "loss": 3.0994, + "step": 13054 + }, + { + "epoch": 0.6078171194450265, + "grad_norm": 0.37364011778507433, + "learning_rate": 9.682736140534513e-05, + "loss": 3.1016, + "step": 13055 + }, + { + "epoch": 0.6078636776311195, + "grad_norm": 0.38310704584937005, + "learning_rate": 9.6826411812262e-05, + "loss": 3.0326, + "step": 13056 + }, + { + "epoch": 0.6079102358172126, + "grad_norm": 0.36933946444727467, + "learning_rate": 9.682546208174796e-05, + "loss": 3.0596, + "step": 13057 + }, + { + "epoch": 0.6079567940033056, + "grad_norm": 0.40168241499225865, + "learning_rate": 9.68245122138058e-05, + "loss": 3.1675, + "step": 13058 + }, + { + "epoch": 0.6080033521893987, + "grad_norm": 0.4019149992544447, + "learning_rate": 9.682356220843831e-05, + "loss": 3.0214, + "step": 13059 + }, + { + "epoch": 0.6080499103754917, + "grad_norm": 0.3776599550658756, + "learning_rate": 9.682261206564828e-05, + "loss": 3.1025, + "step": 13060 + }, + { + "epoch": 0.6080964685615848, + "grad_norm": 0.3886298917084096, + "learning_rate": 9.68216617854385e-05, + "loss": 2.986, + "step": 13061 + }, + { + "epoch": 0.608143026747678, + "grad_norm": 0.4213774878823361, + "learning_rate": 9.682071136781174e-05, + "loss": 3.0914, + "step": 13062 + }, + { + "epoch": 0.608189584933771, + "grad_norm": 0.3793415688585398, + "learning_rate": 9.681976081277083e-05, + "loss": 3.1712, + "step": 13063 + }, + { + "epoch": 0.6082361431198641, + 
"grad_norm": 0.3738037038483655, + "learning_rate": 9.681881012031852e-05, + "loss": 2.9935, + "step": 13064 + }, + { + "epoch": 0.6082827013059571, + "grad_norm": 0.38315936990015803, + "learning_rate": 9.681785929045761e-05, + "loss": 3.0678, + "step": 13065 + }, + { + "epoch": 0.6083292594920502, + "grad_norm": 0.3718683789663261, + "learning_rate": 9.68169083231909e-05, + "loss": 3.0539, + "step": 13066 + }, + { + "epoch": 0.6083758176781433, + "grad_norm": 0.3821404507970689, + "learning_rate": 9.681595721852118e-05, + "loss": 3.0422, + "step": 13067 + }, + { + "epoch": 0.6084223758642363, + "grad_norm": 0.39395484766103106, + "learning_rate": 9.681500597645123e-05, + "loss": 3.1426, + "step": 13068 + }, + { + "epoch": 0.6084689340503294, + "grad_norm": 0.37896762104010484, + "learning_rate": 9.681405459698385e-05, + "loss": 3.0931, + "step": 13069 + }, + { + "epoch": 0.6085154922364224, + "grad_norm": 0.3685736567067473, + "learning_rate": 9.681310308012184e-05, + "loss": 3.0469, + "step": 13070 + }, + { + "epoch": 0.6085620504225155, + "grad_norm": 0.3210696124443211, + "learning_rate": 9.681215142586799e-05, + "loss": 3.0303, + "step": 13071 + }, + { + "epoch": 0.6086086086086087, + "grad_norm": 0.3690877902338033, + "learning_rate": 9.681119963422506e-05, + "loss": 3.0703, + "step": 13072 + }, + { + "epoch": 0.6086551667947017, + "grad_norm": 0.3250120830706328, + "learning_rate": 9.68102477051959e-05, + "loss": 3.1216, + "step": 13073 + }, + { + "epoch": 0.6087017249807948, + "grad_norm": 0.35030892077023834, + "learning_rate": 9.680929563878327e-05, + "loss": 2.9954, + "step": 13074 + }, + { + "epoch": 0.6087482831668878, + "grad_norm": 0.3404640253613998, + "learning_rate": 9.680834343498996e-05, + "loss": 3.1616, + "step": 13075 + }, + { + "epoch": 0.6087948413529809, + "grad_norm": 0.3451282250268336, + "learning_rate": 9.680739109381877e-05, + "loss": 2.9286, + "step": 13076 + }, + { + "epoch": 0.608841399539074, + "grad_norm": 0.372526005110744, + "learning_rate": 9.68064386152725e-05, + "loss": 3.0871, + "step": 13077 + }, + { + "epoch": 0.608887957725167, + "grad_norm": 0.3931118818279997, + "learning_rate": 9.680548599935395e-05, + "loss": 3.0888, + "step": 13078 + }, + { + "epoch": 0.6089345159112601, + "grad_norm": 0.38263760963793647, + "learning_rate": 9.68045332460659e-05, + "loss": 3.1915, + "step": 13079 + }, + { + "epoch": 0.6089810740973531, + "grad_norm": 0.378256517635304, + "learning_rate": 9.680358035541118e-05, + "loss": 3.0177, + "step": 13080 + }, + { + "epoch": 0.6090276322834463, + "grad_norm": 0.3913167477202608, + "learning_rate": 9.680262732739253e-05, + "loss": 3.1239, + "step": 13081 + }, + { + "epoch": 0.6090741904695393, + "grad_norm": 0.36156138481697503, + "learning_rate": 9.68016741620128e-05, + "loss": 2.9436, + "step": 13082 + }, + { + "epoch": 0.6091207486556324, + "grad_norm": 0.4045399684461806, + "learning_rate": 9.680072085927476e-05, + "loss": 3.063, + "step": 13083 + }, + { + "epoch": 0.6091673068417255, + "grad_norm": 0.39927330264121064, + "learning_rate": 9.679976741918121e-05, + "loss": 3.1332, + "step": 13084 + }, + { + "epoch": 0.6092138650278185, + "grad_norm": 0.38970831188098237, + "learning_rate": 9.679881384173497e-05, + "loss": 3.1516, + "step": 13085 + }, + { + "epoch": 0.6092604232139116, + "grad_norm": 0.3567580695223178, + "learning_rate": 9.67978601269388e-05, + "loss": 3.0738, + "step": 13086 + }, + { + "epoch": 0.6093069814000046, + "grad_norm": 0.35929234788312, + "learning_rate": 9.679690627479555e-05, + "loss": 
2.9618, + "step": 13087 + }, + { + "epoch": 0.6093535395860977, + "grad_norm": 0.37024111661356046, + "learning_rate": 9.679595228530798e-05, + "loss": 3.2194, + "step": 13088 + }, + { + "epoch": 0.6094000977721908, + "grad_norm": 0.3981276868663828, + "learning_rate": 9.67949981584789e-05, + "loss": 3.1813, + "step": 13089 + }, + { + "epoch": 0.6094466559582838, + "grad_norm": 0.3572676973643759, + "learning_rate": 9.679404389431109e-05, + "loss": 3.0174, + "step": 13090 + }, + { + "epoch": 0.609493214144377, + "grad_norm": 0.3748921285977281, + "learning_rate": 9.679308949280739e-05, + "loss": 2.9661, + "step": 13091 + }, + { + "epoch": 0.60953977233047, + "grad_norm": 0.38730566737715016, + "learning_rate": 9.679213495397057e-05, + "loss": 3.1347, + "step": 13092 + }, + { + "epoch": 0.6095863305165631, + "grad_norm": 0.3868079959872026, + "learning_rate": 9.679118027780346e-05, + "loss": 3.0667, + "step": 13093 + }, + { + "epoch": 0.6096328887026562, + "grad_norm": 0.3537651788055807, + "learning_rate": 9.679022546430883e-05, + "loss": 3.0918, + "step": 13094 + }, + { + "epoch": 0.6096794468887492, + "grad_norm": 0.3872139902305206, + "learning_rate": 9.678927051348952e-05, + "loss": 3.1258, + "step": 13095 + }, + { + "epoch": 0.6097260050748423, + "grad_norm": 0.3627648967671337, + "learning_rate": 9.678831542534827e-05, + "loss": 3.0553, + "step": 13096 + }, + { + "epoch": 0.6097725632609353, + "grad_norm": 0.36812032060011274, + "learning_rate": 9.678736019988796e-05, + "loss": 3.1375, + "step": 13097 + }, + { + "epoch": 0.6098191214470284, + "grad_norm": 0.3803102031436578, + "learning_rate": 9.678640483711135e-05, + "loss": 3.0652, + "step": 13098 + }, + { + "epoch": 0.6098656796331215, + "grad_norm": 0.3462519618887966, + "learning_rate": 9.678544933702125e-05, + "loss": 3.0913, + "step": 13099 + }, + { + "epoch": 0.6099122378192146, + "grad_norm": 0.3396891096838713, + "learning_rate": 9.678449369962047e-05, + "loss": 3.0935, + "step": 13100 + }, + { + "epoch": 0.6099587960053077, + "grad_norm": 0.34628837374050814, + "learning_rate": 9.678353792491179e-05, + "loss": 3.0026, + "step": 13101 + }, + { + "epoch": 0.6100053541914007, + "grad_norm": 0.3669288911685792, + "learning_rate": 9.678258201289805e-05, + "loss": 2.9958, + "step": 13102 + }, + { + "epoch": 0.6100519123774938, + "grad_norm": 0.3408218053928857, + "learning_rate": 9.678162596358203e-05, + "loss": 3.156, + "step": 13103 + }, + { + "epoch": 0.6100984705635868, + "grad_norm": 0.342385115393443, + "learning_rate": 9.678066977696656e-05, + "loss": 3.0081, + "step": 13104 + }, + { + "epoch": 0.6101450287496799, + "grad_norm": 0.3385087312584647, + "learning_rate": 9.677971345305444e-05, + "loss": 3.1383, + "step": 13105 + }, + { + "epoch": 0.610191586935773, + "grad_norm": 0.3412657617045377, + "learning_rate": 9.677875699184844e-05, + "loss": 3.0653, + "step": 13106 + }, + { + "epoch": 0.610238145121866, + "grad_norm": 0.3783556626841289, + "learning_rate": 9.677780039335142e-05, + "loss": 3.1018, + "step": 13107 + }, + { + "epoch": 0.6102847033079591, + "grad_norm": 0.34627015451198206, + "learning_rate": 9.677684365756617e-05, + "loss": 3.1435, + "step": 13108 + }, + { + "epoch": 0.6103312614940521, + "grad_norm": 0.3670421371393459, + "learning_rate": 9.677588678449548e-05, + "loss": 3.0494, + "step": 13109 + }, + { + "epoch": 0.6103778196801453, + "grad_norm": 0.3430378894891514, + "learning_rate": 9.677492977414215e-05, + "loss": 3.0979, + "step": 13110 + }, + { + "epoch": 0.6104243778662384, + "grad_norm": 
0.363590561639372, + "learning_rate": 9.677397262650904e-05, + "loss": 3.1413, + "step": 13111 + }, + { + "epoch": 0.6104709360523314, + "grad_norm": 0.38678002682683577, + "learning_rate": 9.677301534159891e-05, + "loss": 2.9483, + "step": 13112 + }, + { + "epoch": 0.6105174942384245, + "grad_norm": 0.39969601852018816, + "learning_rate": 9.67720579194146e-05, + "loss": 3.0533, + "step": 13113 + }, + { + "epoch": 0.6105640524245175, + "grad_norm": 0.36162798303590826, + "learning_rate": 9.677110035995891e-05, + "loss": 3.0532, + "step": 13114 + }, + { + "epoch": 0.6106106106106106, + "grad_norm": 0.4108075047730359, + "learning_rate": 9.677014266323464e-05, + "loss": 3.0154, + "step": 13115 + }, + { + "epoch": 0.6106571687967037, + "grad_norm": 0.38596457089061165, + "learning_rate": 9.676918482924462e-05, + "loss": 3.0091, + "step": 13116 + }, + { + "epoch": 0.6107037269827967, + "grad_norm": 0.33133206397240167, + "learning_rate": 9.676822685799163e-05, + "loss": 3.0762, + "step": 13117 + }, + { + "epoch": 0.6107502851688899, + "grad_norm": 0.3357644933967957, + "learning_rate": 9.676726874947853e-05, + "loss": 2.9984, + "step": 13118 + }, + { + "epoch": 0.6107968433549829, + "grad_norm": 0.33704379260127004, + "learning_rate": 9.676631050370807e-05, + "loss": 3.0425, + "step": 13119 + }, + { + "epoch": 0.610843401541076, + "grad_norm": 0.3212528788524202, + "learning_rate": 9.676535212068312e-05, + "loss": 3.0037, + "step": 13120 + }, + { + "epoch": 0.610889959727169, + "grad_norm": 0.3586622913439069, + "learning_rate": 9.676439360040646e-05, + "loss": 2.9635, + "step": 13121 + }, + { + "epoch": 0.6109365179132621, + "grad_norm": 0.3388923450430251, + "learning_rate": 9.676343494288091e-05, + "loss": 2.9982, + "step": 13122 + }, + { + "epoch": 0.6109830760993552, + "grad_norm": 0.3687878399291292, + "learning_rate": 9.67624761481093e-05, + "loss": 3.0051, + "step": 13123 + }, + { + "epoch": 0.6110296342854482, + "grad_norm": 0.3338813667978361, + "learning_rate": 9.676151721609441e-05, + "loss": 3.1454, + "step": 13124 + }, + { + "epoch": 0.6110761924715413, + "grad_norm": 0.38257473825583704, + "learning_rate": 9.676055814683908e-05, + "loss": 3.0973, + "step": 13125 + }, + { + "epoch": 0.6111227506576343, + "grad_norm": 0.3731815497691472, + "learning_rate": 9.675959894034612e-05, + "loss": 3.1201, + "step": 13126 + }, + { + "epoch": 0.6111693088437274, + "grad_norm": 0.33648169062735117, + "learning_rate": 9.675863959661834e-05, + "loss": 3.0532, + "step": 13127 + }, + { + "epoch": 0.6112158670298206, + "grad_norm": 0.34110870685475214, + "learning_rate": 9.675768011565856e-05, + "loss": 3.0336, + "step": 13128 + }, + { + "epoch": 0.6112624252159136, + "grad_norm": 0.35540250076650115, + "learning_rate": 9.67567204974696e-05, + "loss": 3.2063, + "step": 13129 + }, + { + "epoch": 0.6113089834020067, + "grad_norm": 0.3491426847182538, + "learning_rate": 9.675576074205426e-05, + "loss": 3.0761, + "step": 13130 + }, + { + "epoch": 0.6113555415880997, + "grad_norm": 0.35360611689888727, + "learning_rate": 9.675480084941539e-05, + "loss": 3.103, + "step": 13131 + }, + { + "epoch": 0.6114020997741928, + "grad_norm": 0.3860504987037235, + "learning_rate": 9.675384081955577e-05, + "loss": 3.0003, + "step": 13132 + }, + { + "epoch": 0.6114486579602859, + "grad_norm": 0.3772008982271297, + "learning_rate": 9.675288065247824e-05, + "loss": 3.0123, + "step": 13133 + }, + { + "epoch": 0.6114952161463789, + "grad_norm": 0.3245876461742422, + "learning_rate": 9.675192034818561e-05, + "loss": 
2.9099, + "step": 13134 + }, + { + "epoch": 0.611541774332472, + "grad_norm": 0.3676828590164854, + "learning_rate": 9.675095990668069e-05, + "loss": 3.0928, + "step": 13135 + }, + { + "epoch": 0.611588332518565, + "grad_norm": 0.3781044695649201, + "learning_rate": 9.674999932796632e-05, + "loss": 3.0269, + "step": 13136 + }, + { + "epoch": 0.6116348907046582, + "grad_norm": 0.3402043296252076, + "learning_rate": 9.67490386120453e-05, + "loss": 2.9471, + "step": 13137 + }, + { + "epoch": 0.6116814488907513, + "grad_norm": 0.41787867298586334, + "learning_rate": 9.674807775892048e-05, + "loss": 3.0768, + "step": 13138 + }, + { + "epoch": 0.6117280070768443, + "grad_norm": 0.44001983177759235, + "learning_rate": 9.674711676859463e-05, + "loss": 3.0716, + "step": 13139 + }, + { + "epoch": 0.6117745652629374, + "grad_norm": 0.3675315006149561, + "learning_rate": 9.67461556410706e-05, + "loss": 3.0705, + "step": 13140 + }, + { + "epoch": 0.6118211234490304, + "grad_norm": 0.4011781213161222, + "learning_rate": 9.67451943763512e-05, + "loss": 3.1131, + "step": 13141 + }, + { + "epoch": 0.6118676816351235, + "grad_norm": 0.43954822403668986, + "learning_rate": 9.674423297443928e-05, + "loss": 3.0533, + "step": 13142 + }, + { + "epoch": 0.6119142398212165, + "grad_norm": 0.37359050324171206, + "learning_rate": 9.674327143533764e-05, + "loss": 3.0367, + "step": 13143 + }, + { + "epoch": 0.6119607980073096, + "grad_norm": 0.43119582353712765, + "learning_rate": 9.67423097590491e-05, + "loss": 3.0421, + "step": 13144 + }, + { + "epoch": 0.6120073561934027, + "grad_norm": 0.470604216158118, + "learning_rate": 9.674134794557649e-05, + "loss": 3.0644, + "step": 13145 + }, + { + "epoch": 0.6120539143794957, + "grad_norm": 0.4075083772677315, + "learning_rate": 9.674038599492263e-05, + "loss": 3.0414, + "step": 13146 + }, + { + "epoch": 0.6121004725655889, + "grad_norm": 0.4495699604957458, + "learning_rate": 9.673942390709032e-05, + "loss": 3.0625, + "step": 13147 + }, + { + "epoch": 0.6121470307516819, + "grad_norm": 0.39204535427544757, + "learning_rate": 9.673846168208242e-05, + "loss": 3.1105, + "step": 13148 + }, + { + "epoch": 0.612193588937775, + "grad_norm": 0.4021379087957451, + "learning_rate": 9.673749931990174e-05, + "loss": 3.0598, + "step": 13149 + }, + { + "epoch": 0.6122401471238681, + "grad_norm": 0.41325461169186783, + "learning_rate": 9.673653682055111e-05, + "loss": 3.0919, + "step": 13150 + }, + { + "epoch": 0.6122867053099611, + "grad_norm": 0.37496151907123454, + "learning_rate": 9.673557418403334e-05, + "loss": 3.1237, + "step": 13151 + }, + { + "epoch": 0.6123332634960542, + "grad_norm": 0.4174496532573499, + "learning_rate": 9.673461141035127e-05, + "loss": 3.0733, + "step": 13152 + }, + { + "epoch": 0.6123798216821472, + "grad_norm": 0.4319919495994622, + "learning_rate": 9.673364849950771e-05, + "loss": 3.0634, + "step": 13153 + }, + { + "epoch": 0.6124263798682403, + "grad_norm": 0.39239293255038116, + "learning_rate": 9.673268545150551e-05, + "loss": 2.9635, + "step": 13154 + }, + { + "epoch": 0.6124729380543334, + "grad_norm": 0.4172489506631324, + "learning_rate": 9.673172226634747e-05, + "loss": 2.9173, + "step": 13155 + }, + { + "epoch": 0.6125194962404265, + "grad_norm": 0.3745511753074216, + "learning_rate": 9.673075894403643e-05, + "loss": 3.1161, + "step": 13156 + }, + { + "epoch": 0.6125660544265196, + "grad_norm": 0.4344480829701172, + "learning_rate": 9.672979548457523e-05, + "loss": 3.1128, + "step": 13157 + }, + { + "epoch": 0.6126126126126126, + "grad_norm": 
0.40695649030061487, + "learning_rate": 9.672883188796666e-05, + "loss": 3.0984, + "step": 13158 + }, + { + "epoch": 0.6126591707987057, + "grad_norm": 0.3844234274020874, + "learning_rate": 9.67278681542136e-05, + "loss": 3.1468, + "step": 13159 + }, + { + "epoch": 0.6127057289847988, + "grad_norm": 0.44153294515360003, + "learning_rate": 9.672690428331883e-05, + "loss": 3.1089, + "step": 13160 + }, + { + "epoch": 0.6127522871708918, + "grad_norm": 0.38640545895576855, + "learning_rate": 9.67259402752852e-05, + "loss": 3.1276, + "step": 13161 + }, + { + "epoch": 0.6127988453569849, + "grad_norm": 0.3648195320151067, + "learning_rate": 9.672497613011554e-05, + "loss": 2.9134, + "step": 13162 + }, + { + "epoch": 0.6128454035430779, + "grad_norm": 0.38193587505940213, + "learning_rate": 9.672401184781268e-05, + "loss": 2.982, + "step": 13163 + }, + { + "epoch": 0.612891961729171, + "grad_norm": 0.3870384622731617, + "learning_rate": 9.672304742837945e-05, + "loss": 2.9986, + "step": 13164 + }, + { + "epoch": 0.612938519915264, + "grad_norm": 0.3876704626612589, + "learning_rate": 9.672208287181867e-05, + "loss": 3.0261, + "step": 13165 + }, + { + "epoch": 0.6129850781013572, + "grad_norm": 0.3803123977852829, + "learning_rate": 9.672111817813321e-05, + "loss": 3.0839, + "step": 13166 + }, + { + "epoch": 0.6130316362874503, + "grad_norm": 0.3649063019913036, + "learning_rate": 9.672015334732585e-05, + "loss": 3.0748, + "step": 13167 + }, + { + "epoch": 0.6130781944735433, + "grad_norm": 0.3614921596627827, + "learning_rate": 9.671918837939943e-05, + "loss": 3.0782, + "step": 13168 + }, + { + "epoch": 0.6131247526596364, + "grad_norm": 0.3510659701863426, + "learning_rate": 9.671822327435681e-05, + "loss": 3.0628, + "step": 13169 + }, + { + "epoch": 0.6131713108457294, + "grad_norm": 0.4215314889622883, + "learning_rate": 9.671725803220079e-05, + "loss": 3.0329, + "step": 13170 + }, + { + "epoch": 0.6132178690318225, + "grad_norm": 0.34037735471747427, + "learning_rate": 9.671629265293425e-05, + "loss": 3.1187, + "step": 13171 + }, + { + "epoch": 0.6132644272179156, + "grad_norm": 0.4151245342021311, + "learning_rate": 9.671532713655997e-05, + "loss": 3.056, + "step": 13172 + }, + { + "epoch": 0.6133109854040086, + "grad_norm": 0.41976550307306965, + "learning_rate": 9.671436148308082e-05, + "loss": 3.1131, + "step": 13173 + }, + { + "epoch": 0.6133575435901018, + "grad_norm": 0.3653406568531864, + "learning_rate": 9.671339569249962e-05, + "loss": 3.1625, + "step": 13174 + }, + { + "epoch": 0.6134041017761948, + "grad_norm": 0.4055802966440189, + "learning_rate": 9.67124297648192e-05, + "loss": 3.0992, + "step": 13175 + }, + { + "epoch": 0.6134506599622879, + "grad_norm": 0.3822551510995598, + "learning_rate": 9.67114637000424e-05, + "loss": 3.1111, + "step": 13176 + }, + { + "epoch": 0.613497218148381, + "grad_norm": 0.37445607578830176, + "learning_rate": 9.671049749817205e-05, + "loss": 3.094, + "step": 13177 + }, + { + "epoch": 0.613543776334474, + "grad_norm": 0.39803181066187454, + "learning_rate": 9.6709531159211e-05, + "loss": 3.1422, + "step": 13178 + }, + { + "epoch": 0.6135903345205671, + "grad_norm": 0.4036087251691977, + "learning_rate": 9.670856468316207e-05, + "loss": 3.0361, + "step": 13179 + }, + { + "epoch": 0.6136368927066601, + "grad_norm": 0.37527489884449045, + "learning_rate": 9.670759807002812e-05, + "loss": 3.0769, + "step": 13180 + }, + { + "epoch": 0.6136834508927532, + "grad_norm": 0.34374926230080144, + "learning_rate": 9.670663131981197e-05, + "loss": 3.052, + 
"step": 13181 + }, + { + "epoch": 0.6137300090788463, + "grad_norm": 0.36613251866783314, + "learning_rate": 9.670566443251646e-05, + "loss": 3.0208, + "step": 13182 + }, + { + "epoch": 0.6137765672649393, + "grad_norm": 0.3941061305454689, + "learning_rate": 9.67046974081444e-05, + "loss": 3.1469, + "step": 13183 + }, + { + "epoch": 0.6138231254510325, + "grad_norm": 0.33985818737641815, + "learning_rate": 9.670373024669868e-05, + "loss": 3.0496, + "step": 13184 + }, + { + "epoch": 0.6138696836371255, + "grad_norm": 0.3638623922521997, + "learning_rate": 9.67027629481821e-05, + "loss": 3.0037, + "step": 13185 + }, + { + "epoch": 0.6139162418232186, + "grad_norm": 0.4014167955980396, + "learning_rate": 9.670179551259751e-05, + "loss": 3.1684, + "step": 13186 + }, + { + "epoch": 0.6139628000093116, + "grad_norm": 0.40691016054412765, + "learning_rate": 9.670082793994775e-05, + "loss": 3.0602, + "step": 13187 + }, + { + "epoch": 0.6140093581954047, + "grad_norm": 0.3786380711156947, + "learning_rate": 9.669986023023567e-05, + "loss": 3.0746, + "step": 13188 + }, + { + "epoch": 0.6140559163814978, + "grad_norm": 0.3772503628367985, + "learning_rate": 9.66988923834641e-05, + "loss": 3.1392, + "step": 13189 + }, + { + "epoch": 0.6141024745675908, + "grad_norm": 0.3754326548233808, + "learning_rate": 9.669792439963587e-05, + "loss": 3.0435, + "step": 13190 + }, + { + "epoch": 0.6141490327536839, + "grad_norm": 0.36428167059572225, + "learning_rate": 9.669695627875385e-05, + "loss": 3.1011, + "step": 13191 + }, + { + "epoch": 0.6141955909397769, + "grad_norm": 0.3472883124791964, + "learning_rate": 9.669598802082086e-05, + "loss": 3.0794, + "step": 13192 + }, + { + "epoch": 0.61424214912587, + "grad_norm": 0.3509697466545328, + "learning_rate": 9.669501962583974e-05, + "loss": 3.0622, + "step": 13193 + }, + { + "epoch": 0.6142887073119632, + "grad_norm": 0.41472484222952344, + "learning_rate": 9.669405109381334e-05, + "loss": 3.0101, + "step": 13194 + }, + { + "epoch": 0.6143352654980562, + "grad_norm": 0.362422381254069, + "learning_rate": 9.669308242474451e-05, + "loss": 3.1033, + "step": 13195 + }, + { + "epoch": 0.6143818236841493, + "grad_norm": 0.38769897896845157, + "learning_rate": 9.669211361863605e-05, + "loss": 3.0716, + "step": 13196 + }, + { + "epoch": 0.6144283818702423, + "grad_norm": 0.41145522747666013, + "learning_rate": 9.669114467549088e-05, + "loss": 3.0663, + "step": 13197 + }, + { + "epoch": 0.6144749400563354, + "grad_norm": 0.42231722627661106, + "learning_rate": 9.669017559531177e-05, + "loss": 2.9659, + "step": 13198 + }, + { + "epoch": 0.6145214982424285, + "grad_norm": 0.40021973146508383, + "learning_rate": 9.66892063781016e-05, + "loss": 3.0174, + "step": 13199 + }, + { + "epoch": 0.6145680564285215, + "grad_norm": 0.3676959953327994, + "learning_rate": 9.668823702386322e-05, + "loss": 3.0132, + "step": 13200 + }, + { + "epoch": 0.6146146146146146, + "grad_norm": 0.3504168591275417, + "learning_rate": 9.668726753259944e-05, + "loss": 3.1083, + "step": 13201 + }, + { + "epoch": 0.6146611728007076, + "grad_norm": 0.39341768384484416, + "learning_rate": 9.668629790431316e-05, + "loss": 3.1011, + "step": 13202 + }, + { + "epoch": 0.6147077309868008, + "grad_norm": 0.3102490707514421, + "learning_rate": 9.668532813900718e-05, + "loss": 2.9702, + "step": 13203 + }, + { + "epoch": 0.6147542891728939, + "grad_norm": 0.4163159125023161, + "learning_rate": 9.668435823668436e-05, + "loss": 3.198, + "step": 13204 + }, + { + "epoch": 0.6148008473589869, + "grad_norm": 
0.3567408114580099, + "learning_rate": 9.668338819734754e-05, + "loss": 3.0018, + "step": 13205 + }, + { + "epoch": 0.61484740554508, + "grad_norm": 0.39014033973305745, + "learning_rate": 9.66824180209996e-05, + "loss": 3.1193, + "step": 13206 + }, + { + "epoch": 0.614893963731173, + "grad_norm": 0.37092761457781215, + "learning_rate": 9.668144770764333e-05, + "loss": 3.0962, + "step": 13207 + }, + { + "epoch": 0.6149405219172661, + "grad_norm": 0.33487237679886067, + "learning_rate": 9.668047725728164e-05, + "loss": 3.0361, + "step": 13208 + }, + { + "epoch": 0.6149870801033591, + "grad_norm": 0.4188699234960022, + "learning_rate": 9.667950666991733e-05, + "loss": 3.1057, + "step": 13209 + }, + { + "epoch": 0.6150336382894522, + "grad_norm": 0.3645216673856775, + "learning_rate": 9.667853594555328e-05, + "loss": 3.1155, + "step": 13210 + }, + { + "epoch": 0.6150801964755453, + "grad_norm": 0.3610579418683578, + "learning_rate": 9.66775650841923e-05, + "loss": 3.0628, + "step": 13211 + }, + { + "epoch": 0.6151267546616384, + "grad_norm": 0.3762987880470335, + "learning_rate": 9.66765940858373e-05, + "loss": 3.0985, + "step": 13212 + }, + { + "epoch": 0.6151733128477315, + "grad_norm": 0.36809558385721997, + "learning_rate": 9.667562295049107e-05, + "loss": 2.8921, + "step": 13213 + }, + { + "epoch": 0.6152198710338245, + "grad_norm": 0.3645540051498524, + "learning_rate": 9.667465167815648e-05, + "loss": 3.1813, + "step": 13214 + }, + { + "epoch": 0.6152664292199176, + "grad_norm": 0.3679639844748271, + "learning_rate": 9.667368026883641e-05, + "loss": 3.123, + "step": 13215 + }, + { + "epoch": 0.6153129874060107, + "grad_norm": 0.3380124602531789, + "learning_rate": 9.667270872253366e-05, + "loss": 3.1215, + "step": 13216 + }, + { + "epoch": 0.6153595455921037, + "grad_norm": 0.38599406776681316, + "learning_rate": 9.667173703925112e-05, + "loss": 3.1383, + "step": 13217 + }, + { + "epoch": 0.6154061037781968, + "grad_norm": 0.36443609916661796, + "learning_rate": 9.667076521899163e-05, + "loss": 3.069, + "step": 13218 + }, + { + "epoch": 0.6154526619642898, + "grad_norm": 0.3831560956750044, + "learning_rate": 9.666979326175804e-05, + "loss": 3.0488, + "step": 13219 + }, + { + "epoch": 0.6154992201503829, + "grad_norm": 0.3899027282035113, + "learning_rate": 9.666882116755323e-05, + "loss": 3.0279, + "step": 13220 + }, + { + "epoch": 0.6155457783364761, + "grad_norm": 0.3813509612320967, + "learning_rate": 9.666784893637999e-05, + "loss": 3.1731, + "step": 13221 + }, + { + "epoch": 0.6155923365225691, + "grad_norm": 0.3601287887302129, + "learning_rate": 9.666687656824122e-05, + "loss": 3.1129, + "step": 13222 + }, + { + "epoch": 0.6156388947086622, + "grad_norm": 0.3839283624216257, + "learning_rate": 9.666590406313978e-05, + "loss": 3.0407, + "step": 13223 + }, + { + "epoch": 0.6156854528947552, + "grad_norm": 0.3788839723234238, + "learning_rate": 9.66649314210785e-05, + "loss": 3.0231, + "step": 13224 + }, + { + "epoch": 0.6157320110808483, + "grad_norm": 0.34423772671570824, + "learning_rate": 9.666395864206024e-05, + "loss": 3.0094, + "step": 13225 + }, + { + "epoch": 0.6157785692669414, + "grad_norm": 0.3735788067945198, + "learning_rate": 9.666298572608786e-05, + "loss": 3.1662, + "step": 13226 + }, + { + "epoch": 0.6158251274530344, + "grad_norm": 0.35087885169328864, + "learning_rate": 9.666201267316421e-05, + "loss": 2.9966, + "step": 13227 + }, + { + "epoch": 0.6158716856391275, + "grad_norm": 0.4029897529141556, + "learning_rate": 9.666103948329217e-05, + "loss": 3.0615, + 
"step": 13228 + }, + { + "epoch": 0.6159182438252205, + "grad_norm": 0.3401959836295626, + "learning_rate": 9.666006615647456e-05, + "loss": 3.0351, + "step": 13229 + }, + { + "epoch": 0.6159648020113137, + "grad_norm": 0.48262160866367143, + "learning_rate": 9.665909269271425e-05, + "loss": 3.0219, + "step": 13230 + }, + { + "epoch": 0.6160113601974067, + "grad_norm": 0.40262325055302195, + "learning_rate": 9.66581190920141e-05, + "loss": 3.0713, + "step": 13231 + }, + { + "epoch": 0.6160579183834998, + "grad_norm": 0.36672781808320953, + "learning_rate": 9.665714535437698e-05, + "loss": 2.9795, + "step": 13232 + }, + { + "epoch": 0.6161044765695929, + "grad_norm": 0.3712359249341709, + "learning_rate": 9.665617147980573e-05, + "loss": 3.083, + "step": 13233 + }, + { + "epoch": 0.6161510347556859, + "grad_norm": 0.3737379723226244, + "learning_rate": 9.66551974683032e-05, + "loss": 3.1304, + "step": 13234 + }, + { + "epoch": 0.616197592941779, + "grad_norm": 0.38215155087267677, + "learning_rate": 9.665422331987227e-05, + "loss": 3.1519, + "step": 13235 + }, + { + "epoch": 0.616244151127872, + "grad_norm": 0.35925424161067926, + "learning_rate": 9.665324903451579e-05, + "loss": 3.128, + "step": 13236 + }, + { + "epoch": 0.6162907093139651, + "grad_norm": 0.36279771952293627, + "learning_rate": 9.665227461223663e-05, + "loss": 3.0551, + "step": 13237 + }, + { + "epoch": 0.6163372675000582, + "grad_norm": 0.36098142825622204, + "learning_rate": 9.665130005303761e-05, + "loss": 3.1056, + "step": 13238 + }, + { + "epoch": 0.6163838256861512, + "grad_norm": 0.4056355644062424, + "learning_rate": 9.665032535692164e-05, + "loss": 3.0378, + "step": 13239 + }, + { + "epoch": 0.6164303838722444, + "grad_norm": 0.3359072122465405, + "learning_rate": 9.664935052389157e-05, + "loss": 3.0612, + "step": 13240 + }, + { + "epoch": 0.6164769420583374, + "grad_norm": 0.433011478672131, + "learning_rate": 9.664837555395026e-05, + "loss": 3.1426, + "step": 13241 + }, + { + "epoch": 0.6165235002444305, + "grad_norm": 0.3587152553198608, + "learning_rate": 9.664740044710054e-05, + "loss": 3.1, + "step": 13242 + }, + { + "epoch": 0.6165700584305236, + "grad_norm": 0.3865986710233997, + "learning_rate": 9.664642520334531e-05, + "loss": 2.9702, + "step": 13243 + }, + { + "epoch": 0.6166166166166166, + "grad_norm": 0.40824665707882046, + "learning_rate": 9.664544982268739e-05, + "loss": 3.0363, + "step": 13244 + }, + { + "epoch": 0.6166631748027097, + "grad_norm": 0.3899958488604067, + "learning_rate": 9.664447430512971e-05, + "loss": 3.0572, + "step": 13245 + }, + { + "epoch": 0.6167097329888027, + "grad_norm": 0.40027440728736036, + "learning_rate": 9.664349865067506e-05, + "loss": 3.1189, + "step": 13246 + }, + { + "epoch": 0.6167562911748958, + "grad_norm": 0.3963526678677605, + "learning_rate": 9.664252285932635e-05, + "loss": 3.088, + "step": 13247 + }, + { + "epoch": 0.616802849360989, + "grad_norm": 0.400056805932948, + "learning_rate": 9.664154693108642e-05, + "loss": 3.2087, + "step": 13248 + }, + { + "epoch": 0.616849407547082, + "grad_norm": 0.4152456708658674, + "learning_rate": 9.664057086595817e-05, + "loss": 3.079, + "step": 13249 + }, + { + "epoch": 0.6168959657331751, + "grad_norm": 0.39079011207048625, + "learning_rate": 9.663959466394442e-05, + "loss": 3.0824, + "step": 13250 + }, + { + "epoch": 0.6169425239192681, + "grad_norm": 0.4048412580802828, + "learning_rate": 9.663861832504806e-05, + "loss": 3.129, + "step": 13251 + }, + { + "epoch": 0.6169890821053612, + "grad_norm": 
0.3929416488075604, + "learning_rate": 9.663764184927195e-05, + "loss": 2.9965, + "step": 13252 + }, + { + "epoch": 0.6170356402914542, + "grad_norm": 0.3857691412159852, + "learning_rate": 9.663666523661897e-05, + "loss": 3.0645, + "step": 13253 + }, + { + "epoch": 0.6170821984775473, + "grad_norm": 0.3623680226872924, + "learning_rate": 9.663568848709195e-05, + "loss": 3.1061, + "step": 13254 + }, + { + "epoch": 0.6171287566636404, + "grad_norm": 0.3997175589021442, + "learning_rate": 9.663471160069377e-05, + "loss": 3.0944, + "step": 13255 + }, + { + "epoch": 0.6171753148497334, + "grad_norm": 0.3807540367942776, + "learning_rate": 9.663373457742732e-05, + "loss": 3.07, + "step": 13256 + }, + { + "epoch": 0.6172218730358265, + "grad_norm": 0.3836663892997815, + "learning_rate": 9.663275741729546e-05, + "loss": 3.0445, + "step": 13257 + }, + { + "epoch": 0.6172684312219195, + "grad_norm": 0.37674096567645493, + "learning_rate": 9.663178012030106e-05, + "loss": 3.0143, + "step": 13258 + }, + { + "epoch": 0.6173149894080127, + "grad_norm": 0.35011493400418325, + "learning_rate": 9.663080268644695e-05, + "loss": 3.118, + "step": 13259 + }, + { + "epoch": 0.6173615475941058, + "grad_norm": 0.4212575411195848, + "learning_rate": 9.662982511573604e-05, + "loss": 3.0653, + "step": 13260 + }, + { + "epoch": 0.6174081057801988, + "grad_norm": 0.359724766502871, + "learning_rate": 9.662884740817119e-05, + "loss": 3.0464, + "step": 13261 + }, + { + "epoch": 0.6174546639662919, + "grad_norm": 0.4046241492604945, + "learning_rate": 9.662786956375527e-05, + "loss": 2.9802, + "step": 13262 + }, + { + "epoch": 0.6175012221523849, + "grad_norm": 0.3878674771055061, + "learning_rate": 9.662689158249114e-05, + "loss": 2.9971, + "step": 13263 + }, + { + "epoch": 0.617547780338478, + "grad_norm": 0.33633435468162926, + "learning_rate": 9.662591346438168e-05, + "loss": 2.9103, + "step": 13264 + }, + { + "epoch": 0.6175943385245711, + "grad_norm": 0.3828087342875118, + "learning_rate": 9.662493520942977e-05, + "loss": 3.0953, + "step": 13265 + }, + { + "epoch": 0.6176408967106641, + "grad_norm": 0.4006362236124497, + "learning_rate": 9.662395681763825e-05, + "loss": 3.0667, + "step": 13266 + }, + { + "epoch": 0.6176874548967572, + "grad_norm": 0.34195616696186637, + "learning_rate": 9.662297828901002e-05, + "loss": 3.084, + "step": 13267 + }, + { + "epoch": 0.6177340130828503, + "grad_norm": 0.39933114256664404, + "learning_rate": 9.662199962354793e-05, + "loss": 3.1622, + "step": 13268 + }, + { + "epoch": 0.6177805712689434, + "grad_norm": 0.39167562305827114, + "learning_rate": 9.662102082125487e-05, + "loss": 3.1077, + "step": 13269 + }, + { + "epoch": 0.6178271294550365, + "grad_norm": 0.35058400103426113, + "learning_rate": 9.662004188213371e-05, + "loss": 3.022, + "step": 13270 + }, + { + "epoch": 0.6178736876411295, + "grad_norm": 0.38480810909304175, + "learning_rate": 9.661906280618731e-05, + "loss": 3.1069, + "step": 13271 + }, + { + "epoch": 0.6179202458272226, + "grad_norm": 0.37660990239782094, + "learning_rate": 9.661808359341858e-05, + "loss": 2.9638, + "step": 13272 + }, + { + "epoch": 0.6179668040133156, + "grad_norm": 0.3265341934285554, + "learning_rate": 9.661710424383034e-05, + "loss": 3.0264, + "step": 13273 + }, + { + "epoch": 0.6180133621994087, + "grad_norm": 0.3619083192239539, + "learning_rate": 9.66161247574255e-05, + "loss": 3.0573, + "step": 13274 + }, + { + "epoch": 0.6180599203855017, + "grad_norm": 0.3330384278750101, + "learning_rate": 9.661514513420693e-05, + "loss": 3.0496, 
+ "step": 13275 + }, + { + "epoch": 0.6181064785715948, + "grad_norm": 0.3262878101036961, + "learning_rate": 9.66141653741775e-05, + "loss": 3.085, + "step": 13276 + }, + { + "epoch": 0.618153036757688, + "grad_norm": 0.3521497109397845, + "learning_rate": 9.661318547734009e-05, + "loss": 3.1192, + "step": 13277 + }, + { + "epoch": 0.618199594943781, + "grad_norm": 0.34699183990212024, + "learning_rate": 9.661220544369755e-05, + "loss": 3.0456, + "step": 13278 + }, + { + "epoch": 0.6182461531298741, + "grad_norm": 0.38234205963365364, + "learning_rate": 9.66112252732528e-05, + "loss": 2.9532, + "step": 13279 + }, + { + "epoch": 0.6182927113159671, + "grad_norm": 0.34490191432170747, + "learning_rate": 9.66102449660087e-05, + "loss": 2.9952, + "step": 13280 + }, + { + "epoch": 0.6183392695020602, + "grad_norm": 0.34196450973770676, + "learning_rate": 9.660926452196811e-05, + "loss": 3.0374, + "step": 13281 + }, + { + "epoch": 0.6183858276881533, + "grad_norm": 0.34797746892828996, + "learning_rate": 9.660828394113393e-05, + "loss": 2.9936, + "step": 13282 + }, + { + "epoch": 0.6184323858742463, + "grad_norm": 0.3774861841362995, + "learning_rate": 9.660730322350901e-05, + "loss": 3.1221, + "step": 13283 + }, + { + "epoch": 0.6184789440603394, + "grad_norm": 0.362364078805441, + "learning_rate": 9.660632236909628e-05, + "loss": 2.9642, + "step": 13284 + }, + { + "epoch": 0.6185255022464324, + "grad_norm": 0.37581498009896874, + "learning_rate": 9.660534137789857e-05, + "loss": 3.1804, + "step": 13285 + }, + { + "epoch": 0.6185720604325256, + "grad_norm": 0.37755089025171723, + "learning_rate": 9.660436024991876e-05, + "loss": 3.0738, + "step": 13286 + }, + { + "epoch": 0.6186186186186187, + "grad_norm": 0.35959869227158914, + "learning_rate": 9.660337898515975e-05, + "loss": 2.9652, + "step": 13287 + }, + { + "epoch": 0.6186651768047117, + "grad_norm": 0.3551979108685943, + "learning_rate": 9.660239758362442e-05, + "loss": 3.0179, + "step": 13288 + }, + { + "epoch": 0.6187117349908048, + "grad_norm": 0.356731712539478, + "learning_rate": 9.660141604531565e-05, + "loss": 3.0688, + "step": 13289 + }, + { + "epoch": 0.6187582931768978, + "grad_norm": 0.3537186694936825, + "learning_rate": 9.66004343702363e-05, + "loss": 3.0833, + "step": 13290 + }, + { + "epoch": 0.6188048513629909, + "grad_norm": 0.396069894085962, + "learning_rate": 9.659945255838928e-05, + "loss": 3.1235, + "step": 13291 + }, + { + "epoch": 0.618851409549084, + "grad_norm": 0.38015962274652354, + "learning_rate": 9.659847060977745e-05, + "loss": 2.9832, + "step": 13292 + }, + { + "epoch": 0.618897967735177, + "grad_norm": 0.3893895268056426, + "learning_rate": 9.659748852440371e-05, + "loss": 3.1275, + "step": 13293 + }, + { + "epoch": 0.6189445259212701, + "grad_norm": 0.39237104020679126, + "learning_rate": 9.659650630227091e-05, + "loss": 3.0983, + "step": 13294 + }, + { + "epoch": 0.6189910841073631, + "grad_norm": 0.41824435050169667, + "learning_rate": 9.659552394338197e-05, + "loss": 2.9643, + "step": 13295 + }, + { + "epoch": 0.6190376422934563, + "grad_norm": 0.3899090018921025, + "learning_rate": 9.659454144773976e-05, + "loss": 3.074, + "step": 13296 + }, + { + "epoch": 0.6190842004795493, + "grad_norm": 0.4297325293123055, + "learning_rate": 9.659355881534715e-05, + "loss": 3.0252, + "step": 13297 + }, + { + "epoch": 0.6191307586656424, + "grad_norm": 0.42847421750274334, + "learning_rate": 9.659257604620705e-05, + "loss": 3.0939, + "step": 13298 + }, + { + "epoch": 0.6191773168517355, + "grad_norm": 
0.3856655626444434, + "learning_rate": 9.659159314032231e-05, + "loss": 2.8955, + "step": 13299 + }, + { + "epoch": 0.6192238750378285, + "grad_norm": 0.42441626167117175, + "learning_rate": 9.659061009769584e-05, + "loss": 2.9627, + "step": 13300 + }, + { + "epoch": 0.6192704332239216, + "grad_norm": 0.40177407558125483, + "learning_rate": 9.658962691833053e-05, + "loss": 2.9662, + "step": 13301 + }, + { + "epoch": 0.6193169914100146, + "grad_norm": 0.45630526931453164, + "learning_rate": 9.658864360222925e-05, + "loss": 3.0691, + "step": 13302 + }, + { + "epoch": 0.6193635495961077, + "grad_norm": 0.4534752093721069, + "learning_rate": 9.658766014939488e-05, + "loss": 3.1597, + "step": 13303 + }, + { + "epoch": 0.6194101077822008, + "grad_norm": 0.39665166938796814, + "learning_rate": 9.658667655983034e-05, + "loss": 3.1709, + "step": 13304 + }, + { + "epoch": 0.6194566659682939, + "grad_norm": 0.4237563726598895, + "learning_rate": 9.658569283353848e-05, + "loss": 3.1412, + "step": 13305 + }, + { + "epoch": 0.619503224154387, + "grad_norm": 0.4275549150949197, + "learning_rate": 9.658470897052219e-05, + "loss": 3.0852, + "step": 13306 + }, + { + "epoch": 0.61954978234048, + "grad_norm": 0.4080312405968976, + "learning_rate": 9.658372497078438e-05, + "loss": 3.0984, + "step": 13307 + }, + { + "epoch": 0.6195963405265731, + "grad_norm": 0.4286713180831884, + "learning_rate": 9.658274083432793e-05, + "loss": 3.1294, + "step": 13308 + }, + { + "epoch": 0.6196428987126662, + "grad_norm": 0.401645152383626, + "learning_rate": 9.658175656115573e-05, + "loss": 3.1077, + "step": 13309 + }, + { + "epoch": 0.6196894568987592, + "grad_norm": 0.4315526220199511, + "learning_rate": 9.658077215127064e-05, + "loss": 2.9661, + "step": 13310 + }, + { + "epoch": 0.6197360150848523, + "grad_norm": 0.37539808906433897, + "learning_rate": 9.65797876046756e-05, + "loss": 3.1126, + "step": 13311 + }, + { + "epoch": 0.6197825732709453, + "grad_norm": 0.40811661270771626, + "learning_rate": 9.657880292137345e-05, + "loss": 3.1124, + "step": 13312 + }, + { + "epoch": 0.6198291314570384, + "grad_norm": 0.3957448897125675, + "learning_rate": 9.657781810136712e-05, + "loss": 3.1112, + "step": 13313 + }, + { + "epoch": 0.6198756896431316, + "grad_norm": 0.39579032682625215, + "learning_rate": 9.657683314465947e-05, + "loss": 3.1214, + "step": 13314 + }, + { + "epoch": 0.6199222478292246, + "grad_norm": 0.3894872888501806, + "learning_rate": 9.657584805125341e-05, + "loss": 3.042, + "step": 13315 + }, + { + "epoch": 0.6199688060153177, + "grad_norm": 0.4082669087691218, + "learning_rate": 9.657486282115182e-05, + "loss": 3.1297, + "step": 13316 + }, + { + "epoch": 0.6200153642014107, + "grad_norm": 0.4057966341260068, + "learning_rate": 9.65738774543576e-05, + "loss": 3.045, + "step": 13317 + }, + { + "epoch": 0.6200619223875038, + "grad_norm": 0.3465311092829059, + "learning_rate": 9.657289195087363e-05, + "loss": 2.9803, + "step": 13318 + }, + { + "epoch": 0.6201084805735968, + "grad_norm": 0.4109411129563341, + "learning_rate": 9.657190631070283e-05, + "loss": 3.0572, + "step": 13319 + }, + { + "epoch": 0.6201550387596899, + "grad_norm": 0.37418771285131586, + "learning_rate": 9.657092053384805e-05, + "loss": 3.0881, + "step": 13320 + }, + { + "epoch": 0.620201596945783, + "grad_norm": 0.3993463903480614, + "learning_rate": 9.656993462031222e-05, + "loss": 3.0994, + "step": 13321 + }, + { + "epoch": 0.620248155131876, + "grad_norm": 0.3457697702906405, + "learning_rate": 9.656894857009822e-05, + "loss": 3.0914, + 
"step": 13322 + }, + { + "epoch": 0.6202947133179691, + "grad_norm": 0.4076383045811993, + "learning_rate": 9.656796238320893e-05, + "loss": 3.159, + "step": 13323 + }, + { + "epoch": 0.6203412715040622, + "grad_norm": 0.3983357198415561, + "learning_rate": 9.656697605964727e-05, + "loss": 3.0322, + "step": 13324 + }, + { + "epoch": 0.6203878296901553, + "grad_norm": 0.3363370817576534, + "learning_rate": 9.656598959941612e-05, + "loss": 3.063, + "step": 13325 + }, + { + "epoch": 0.6204343878762484, + "grad_norm": 0.40869203946653165, + "learning_rate": 9.656500300251838e-05, + "loss": 2.9759, + "step": 13326 + }, + { + "epoch": 0.6204809460623414, + "grad_norm": 0.33390272907597013, + "learning_rate": 9.656401626895694e-05, + "loss": 3.0591, + "step": 13327 + }, + { + "epoch": 0.6205275042484345, + "grad_norm": 0.35665375445241204, + "learning_rate": 9.656302939873471e-05, + "loss": 2.9722, + "step": 13328 + }, + { + "epoch": 0.6205740624345275, + "grad_norm": 0.36305964518120615, + "learning_rate": 9.656204239185455e-05, + "loss": 3.138, + "step": 13329 + }, + { + "epoch": 0.6206206206206206, + "grad_norm": 0.3670221082575142, + "learning_rate": 9.656105524831941e-05, + "loss": 3.0957, + "step": 13330 + }, + { + "epoch": 0.6206671788067137, + "grad_norm": 0.3637552765759049, + "learning_rate": 9.656006796813215e-05, + "loss": 3.0628, + "step": 13331 + }, + { + "epoch": 0.6207137369928067, + "grad_norm": 0.35896383923197306, + "learning_rate": 9.655908055129568e-05, + "loss": 3.0066, + "step": 13332 + }, + { + "epoch": 0.6207602951788999, + "grad_norm": 0.35727523747100387, + "learning_rate": 9.65580929978129e-05, + "loss": 3.1025, + "step": 13333 + }, + { + "epoch": 0.6208068533649929, + "grad_norm": 0.37127012759754885, + "learning_rate": 9.655710530768669e-05, + "loss": 3.0875, + "step": 13334 + }, + { + "epoch": 0.620853411551086, + "grad_norm": 0.3621034082872145, + "learning_rate": 9.655611748091997e-05, + "loss": 3.0715, + "step": 13335 + }, + { + "epoch": 0.6208999697371791, + "grad_norm": 0.36120233878972235, + "learning_rate": 9.655512951751564e-05, + "loss": 2.9676, + "step": 13336 + }, + { + "epoch": 0.6209465279232721, + "grad_norm": 0.3873332737103801, + "learning_rate": 9.655414141747658e-05, + "loss": 3.0737, + "step": 13337 + }, + { + "epoch": 0.6209930861093652, + "grad_norm": 0.3334530234526091, + "learning_rate": 9.65531531808057e-05, + "loss": 3.0198, + "step": 13338 + }, + { + "epoch": 0.6210396442954582, + "grad_norm": 0.34591941755844086, + "learning_rate": 9.65521648075059e-05, + "loss": 3.1324, + "step": 13339 + }, + { + "epoch": 0.6210862024815513, + "grad_norm": 0.35413755691489207, + "learning_rate": 9.655117629758008e-05, + "loss": 3.0646, + "step": 13340 + }, + { + "epoch": 0.6211327606676443, + "grad_norm": 0.33576465660663135, + "learning_rate": 9.655018765103116e-05, + "loss": 3.0453, + "step": 13341 + }, + { + "epoch": 0.6211793188537374, + "grad_norm": 0.43879547035715105, + "learning_rate": 9.654919886786202e-05, + "loss": 3.1905, + "step": 13342 + }, + { + "epoch": 0.6212258770398306, + "grad_norm": 0.3826910479285082, + "learning_rate": 9.654820994807556e-05, + "loss": 3.1139, + "step": 13343 + }, + { + "epoch": 0.6212724352259236, + "grad_norm": 0.3340025622790019, + "learning_rate": 9.65472208916747e-05, + "loss": 3.0685, + "step": 13344 + }, + { + "epoch": 0.6213189934120167, + "grad_norm": 0.4548164638828488, + "learning_rate": 9.654623169866232e-05, + "loss": 3.0927, + "step": 13345 + }, + { + "epoch": 0.6213655515981097, + "grad_norm": 
0.37220335927135323, + "learning_rate": 9.654524236904135e-05, + "loss": 3.0583, + "step": 13346 + }, + { + "epoch": 0.6214121097842028, + "grad_norm": 0.34705101938092797, + "learning_rate": 9.654425290281466e-05, + "loss": 2.9938, + "step": 13347 + }, + { + "epoch": 0.6214586679702959, + "grad_norm": 0.3475477571034804, + "learning_rate": 9.654326329998518e-05, + "loss": 3.0102, + "step": 13348 + }, + { + "epoch": 0.6215052261563889, + "grad_norm": 0.35300727235151635, + "learning_rate": 9.654227356055581e-05, + "loss": 2.8938, + "step": 13349 + }, + { + "epoch": 0.621551784342482, + "grad_norm": 0.3536077440351524, + "learning_rate": 9.654128368452946e-05, + "loss": 3.0647, + "step": 13350 + }, + { + "epoch": 0.621598342528575, + "grad_norm": 0.3413565328989788, + "learning_rate": 9.654029367190902e-05, + "loss": 3.084, + "step": 13351 + }, + { + "epoch": 0.6216449007146682, + "grad_norm": 0.3844604167887222, + "learning_rate": 9.65393035226974e-05, + "loss": 3.1529, + "step": 13352 + }, + { + "epoch": 0.6216914589007613, + "grad_norm": 0.3646776774532274, + "learning_rate": 9.653831323689751e-05, + "loss": 3.1684, + "step": 13353 + }, + { + "epoch": 0.6217380170868543, + "grad_norm": 0.39610644615285145, + "learning_rate": 9.653732281451226e-05, + "loss": 3.0539, + "step": 13354 + }, + { + "epoch": 0.6217845752729474, + "grad_norm": 0.4197505143659648, + "learning_rate": 9.653633225554455e-05, + "loss": 3.1632, + "step": 13355 + }, + { + "epoch": 0.6218311334590404, + "grad_norm": 0.37847942349973285, + "learning_rate": 9.65353415599973e-05, + "loss": 3.0109, + "step": 13356 + }, + { + "epoch": 0.6218776916451335, + "grad_norm": 0.4440094263423513, + "learning_rate": 9.653435072787339e-05, + "loss": 3.1207, + "step": 13357 + }, + { + "epoch": 0.6219242498312266, + "grad_norm": 0.3746621987835567, + "learning_rate": 9.653335975917574e-05, + "loss": 3.0456, + "step": 13358 + }, + { + "epoch": 0.6219708080173196, + "grad_norm": 0.41324145354305875, + "learning_rate": 9.653236865390729e-05, + "loss": 3.0326, + "step": 13359 + }, + { + "epoch": 0.6220173662034127, + "grad_norm": 0.4217518209372363, + "learning_rate": 9.65313774120709e-05, + "loss": 3.0572, + "step": 13360 + }, + { + "epoch": 0.6220639243895058, + "grad_norm": 0.38598624523655545, + "learning_rate": 9.653038603366951e-05, + "loss": 3.1247, + "step": 13361 + }, + { + "epoch": 0.6221104825755989, + "grad_norm": 0.4031582393971311, + "learning_rate": 9.652939451870602e-05, + "loss": 3.0994, + "step": 13362 + }, + { + "epoch": 0.6221570407616919, + "grad_norm": 0.3744107430364071, + "learning_rate": 9.652840286718335e-05, + "loss": 2.9805, + "step": 13363 + }, + { + "epoch": 0.622203598947785, + "grad_norm": 0.36684934084264814, + "learning_rate": 9.65274110791044e-05, + "loss": 2.9544, + "step": 13364 + }, + { + "epoch": 0.6222501571338781, + "grad_norm": 0.39704931333739635, + "learning_rate": 9.652641915447205e-05, + "loss": 3.1453, + "step": 13365 + }, + { + "epoch": 0.6222967153199711, + "grad_norm": 0.3791758466912364, + "learning_rate": 9.652542709328928e-05, + "loss": 3.1165, + "step": 13366 + }, + { + "epoch": 0.6223432735060642, + "grad_norm": 0.33761760571418664, + "learning_rate": 9.652443489555896e-05, + "loss": 2.9933, + "step": 13367 + }, + { + "epoch": 0.6223898316921572, + "grad_norm": 0.3603505364858376, + "learning_rate": 9.652344256128398e-05, + "loss": 3.0288, + "step": 13368 + }, + { + "epoch": 0.6224363898782503, + "grad_norm": 0.35212377210085316, + "learning_rate": 9.65224500904673e-05, + "loss": 3.0567, 
+ "step": 13369 + }, + { + "epoch": 0.6224829480643435, + "grad_norm": 0.3710896603319757, + "learning_rate": 9.652145748311182e-05, + "loss": 2.9865, + "step": 13370 + }, + { + "epoch": 0.6225295062504365, + "grad_norm": 0.3446828768244864, + "learning_rate": 9.652046473922045e-05, + "loss": 3.0156, + "step": 13371 + }, + { + "epoch": 0.6225760644365296, + "grad_norm": 0.3632018559326556, + "learning_rate": 9.651947185879607e-05, + "loss": 3.1319, + "step": 13372 + }, + { + "epoch": 0.6226226226226226, + "grad_norm": 0.3894545165818004, + "learning_rate": 9.651847884184164e-05, + "loss": 3.119, + "step": 13373 + }, + { + "epoch": 0.6226691808087157, + "grad_norm": 0.3421819499878193, + "learning_rate": 9.651748568836005e-05, + "loss": 3.084, + "step": 13374 + }, + { + "epoch": 0.6227157389948088, + "grad_norm": 0.3948216522728145, + "learning_rate": 9.651649239835424e-05, + "loss": 3.0639, + "step": 13375 + }, + { + "epoch": 0.6227622971809018, + "grad_norm": 0.31704829572580856, + "learning_rate": 9.65154989718271e-05, + "loss": 3.0582, + "step": 13376 + }, + { + "epoch": 0.6228088553669949, + "grad_norm": 0.39084873555148614, + "learning_rate": 9.651450540878154e-05, + "loss": 3.0693, + "step": 13377 + }, + { + "epoch": 0.6228554135530879, + "grad_norm": 0.3584584189280475, + "learning_rate": 9.651351170922049e-05, + "loss": 3.1139, + "step": 13378 + }, + { + "epoch": 0.622901971739181, + "grad_norm": 0.3969175582129446, + "learning_rate": 9.651251787314687e-05, + "loss": 3.0934, + "step": 13379 + }, + { + "epoch": 0.6229485299252742, + "grad_norm": 0.3632668659998336, + "learning_rate": 9.651152390056362e-05, + "loss": 3.0344, + "step": 13380 + }, + { + "epoch": 0.6229950881113672, + "grad_norm": 0.3849508973889691, + "learning_rate": 9.651052979147359e-05, + "loss": 3.0545, + "step": 13381 + }, + { + "epoch": 0.6230416462974603, + "grad_norm": 0.3566918725683879, + "learning_rate": 9.650953554587976e-05, + "loss": 3.0512, + "step": 13382 + }, + { + "epoch": 0.6230882044835533, + "grad_norm": 0.36506822281735607, + "learning_rate": 9.650854116378501e-05, + "loss": 3.0344, + "step": 13383 + }, + { + "epoch": 0.6231347626696464, + "grad_norm": 0.39766968714425643, + "learning_rate": 9.650754664519228e-05, + "loss": 3.1934, + "step": 13384 + }, + { + "epoch": 0.6231813208557394, + "grad_norm": 0.3671848051215637, + "learning_rate": 9.650655199010449e-05, + "loss": 3.1114, + "step": 13385 + }, + { + "epoch": 0.6232278790418325, + "grad_norm": 0.370068735073015, + "learning_rate": 9.650555719852454e-05, + "loss": 3.0419, + "step": 13386 + }, + { + "epoch": 0.6232744372279256, + "grad_norm": 0.3241206028480874, + "learning_rate": 9.650456227045537e-05, + "loss": 3.034, + "step": 13387 + }, + { + "epoch": 0.6233209954140186, + "grad_norm": 0.351011010870778, + "learning_rate": 9.650356720589989e-05, + "loss": 3.0732, + "step": 13388 + }, + { + "epoch": 0.6233675536001118, + "grad_norm": 0.38015029155900015, + "learning_rate": 9.650257200486102e-05, + "loss": 3.1338, + "step": 13389 + }, + { + "epoch": 0.6234141117862048, + "grad_norm": 0.3473067059762225, + "learning_rate": 9.650157666734168e-05, + "loss": 3.1334, + "step": 13390 + }, + { + "epoch": 0.6234606699722979, + "grad_norm": 0.3348248847357759, + "learning_rate": 9.65005811933448e-05, + "loss": 3.0181, + "step": 13391 + }, + { + "epoch": 0.623507228158391, + "grad_norm": 0.3485815165357671, + "learning_rate": 9.649958558287329e-05, + "loss": 3.0883, + "step": 13392 + }, + { + "epoch": 0.623553786344484, + "grad_norm": 
0.3636780428931929, + "learning_rate": 9.649858983593008e-05, + "loss": 3.1381, + "step": 13393 + }, + { + "epoch": 0.6236003445305771, + "grad_norm": 0.3593988071108702, + "learning_rate": 9.649759395251808e-05, + "loss": 3.0186, + "step": 13394 + }, + { + "epoch": 0.6236469027166701, + "grad_norm": 0.34384684490881917, + "learning_rate": 9.649659793264023e-05, + "loss": 3.0378, + "step": 13395 + }, + { + "epoch": 0.6236934609027632, + "grad_norm": 0.3918594454557209, + "learning_rate": 9.649560177629944e-05, + "loss": 2.9754, + "step": 13396 + }, + { + "epoch": 0.6237400190888563, + "grad_norm": 0.38953058074321956, + "learning_rate": 9.649460548349864e-05, + "loss": 3.058, + "step": 13397 + }, + { + "epoch": 0.6237865772749493, + "grad_norm": 0.35478791064664955, + "learning_rate": 9.649360905424077e-05, + "loss": 2.9884, + "step": 13398 + }, + { + "epoch": 0.6238331354610425, + "grad_norm": 0.3649886794299721, + "learning_rate": 9.649261248852872e-05, + "loss": 3.0134, + "step": 13399 + }, + { + "epoch": 0.6238796936471355, + "grad_norm": 0.4276303503406376, + "learning_rate": 9.649161578636544e-05, + "loss": 3.0611, + "step": 13400 + }, + { + "epoch": 0.6239262518332286, + "grad_norm": 0.3493435703493961, + "learning_rate": 9.649061894775386e-05, + "loss": 3.0642, + "step": 13401 + }, + { + "epoch": 0.6239728100193217, + "grad_norm": 0.38840404859627053, + "learning_rate": 9.648962197269686e-05, + "loss": 3.0667, + "step": 13402 + }, + { + "epoch": 0.6240193682054147, + "grad_norm": 0.3820762909042841, + "learning_rate": 9.648862486119743e-05, + "loss": 3.0807, + "step": 13403 + }, + { + "epoch": 0.6240659263915078, + "grad_norm": 0.372306112854117, + "learning_rate": 9.648762761325846e-05, + "loss": 3.1281, + "step": 13404 + }, + { + "epoch": 0.6241124845776008, + "grad_norm": 0.4242237688788664, + "learning_rate": 9.648663022888287e-05, + "loss": 3.0985, + "step": 13405 + }, + { + "epoch": 0.6241590427636939, + "grad_norm": 0.3918300393319074, + "learning_rate": 9.648563270807362e-05, + "loss": 3.0914, + "step": 13406 + }, + { + "epoch": 0.6242056009497869, + "grad_norm": 0.3956845026217853, + "learning_rate": 9.648463505083361e-05, + "loss": 3.1579, + "step": 13407 + }, + { + "epoch": 0.6242521591358801, + "grad_norm": 0.41536920159148155, + "learning_rate": 9.648363725716576e-05, + "loss": 3.1179, + "step": 13408 + }, + { + "epoch": 0.6242987173219732, + "grad_norm": 0.3920864018541894, + "learning_rate": 9.648263932707301e-05, + "loss": 3.0371, + "step": 13409 + }, + { + "epoch": 0.6243452755080662, + "grad_norm": 0.44166678239426554, + "learning_rate": 9.648164126055833e-05, + "loss": 2.9752, + "step": 13410 + }, + { + "epoch": 0.6243918336941593, + "grad_norm": 0.3657829375903991, + "learning_rate": 9.648064305762459e-05, + "loss": 3.0233, + "step": 13411 + }, + { + "epoch": 0.6244383918802523, + "grad_norm": 0.4026777304928969, + "learning_rate": 9.647964471827473e-05, + "loss": 3.0454, + "step": 13412 + }, + { + "epoch": 0.6244849500663454, + "grad_norm": 0.3584775739564112, + "learning_rate": 9.64786462425117e-05, + "loss": 3.0592, + "step": 13413 + }, + { + "epoch": 0.6245315082524385, + "grad_norm": 0.3519674972397969, + "learning_rate": 9.647764763033844e-05, + "loss": 3.222, + "step": 13414 + }, + { + "epoch": 0.6245780664385315, + "grad_norm": 0.39574312441653975, + "learning_rate": 9.647664888175784e-05, + "loss": 3.0007, + "step": 13415 + }, + { + "epoch": 0.6246246246246246, + "grad_norm": 0.36954391463687664, + "learning_rate": 9.647564999677288e-05, + "loss": 
3.0919, + "step": 13416 + }, + { + "epoch": 0.6246711828107177, + "grad_norm": 0.37444203699139456, + "learning_rate": 9.647465097538644e-05, + "loss": 2.9455, + "step": 13417 + }, + { + "epoch": 0.6247177409968108, + "grad_norm": 0.3846631279830023, + "learning_rate": 9.647365181760149e-05, + "loss": 3.002, + "step": 13418 + }, + { + "epoch": 0.6247642991829039, + "grad_norm": 0.405912376512883, + "learning_rate": 9.647265252342095e-05, + "loss": 3.2242, + "step": 13419 + }, + { + "epoch": 0.6248108573689969, + "grad_norm": 0.3795815263933174, + "learning_rate": 9.647165309284775e-05, + "loss": 3.0511, + "step": 13420 + }, + { + "epoch": 0.62485741555509, + "grad_norm": 0.33598091401212804, + "learning_rate": 9.647065352588482e-05, + "loss": 2.951, + "step": 13421 + }, + { + "epoch": 0.624903973741183, + "grad_norm": 0.3945907651558309, + "learning_rate": 9.646965382253511e-05, + "loss": 3.0968, + "step": 13422 + }, + { + "epoch": 0.6249505319272761, + "grad_norm": 0.36666050079805745, + "learning_rate": 9.646865398280153e-05, + "loss": 3.0053, + "step": 13423 + }, + { + "epoch": 0.6249970901133692, + "grad_norm": 0.36472280392265977, + "learning_rate": 9.646765400668705e-05, + "loss": 3.0637, + "step": 13424 + }, + { + "epoch": 0.6250436482994622, + "grad_norm": 0.35761697084691246, + "learning_rate": 9.646665389419458e-05, + "loss": 3.0779, + "step": 13425 + }, + { + "epoch": 0.6250902064855554, + "grad_norm": 0.39768150149411624, + "learning_rate": 9.646565364532705e-05, + "loss": 3.1098, + "step": 13426 + }, + { + "epoch": 0.6251367646716484, + "grad_norm": 0.34991382747874866, + "learning_rate": 9.64646532600874e-05, + "loss": 2.9197, + "step": 13427 + }, + { + "epoch": 0.6251833228577415, + "grad_norm": 0.3756921772882535, + "learning_rate": 9.646365273847857e-05, + "loss": 3.0848, + "step": 13428 + }, + { + "epoch": 0.6252298810438345, + "grad_norm": 0.37274146697663524, + "learning_rate": 9.646265208050351e-05, + "loss": 3.1472, + "step": 13429 + }, + { + "epoch": 0.6252764392299276, + "grad_norm": 0.33422752894695035, + "learning_rate": 9.646165128616513e-05, + "loss": 3.0532, + "step": 13430 + }, + { + "epoch": 0.6253229974160207, + "grad_norm": 0.3743670634973493, + "learning_rate": 9.646065035546639e-05, + "loss": 2.9991, + "step": 13431 + }, + { + "epoch": 0.6253695556021137, + "grad_norm": 0.33474778491236623, + "learning_rate": 9.645964928841022e-05, + "loss": 3.0286, + "step": 13432 + }, + { + "epoch": 0.6254161137882068, + "grad_norm": 0.3909859550474914, + "learning_rate": 9.645864808499954e-05, + "loss": 3.076, + "step": 13433 + }, + { + "epoch": 0.6254626719742998, + "grad_norm": 0.41079526149496715, + "learning_rate": 9.645764674523732e-05, + "loss": 3.0032, + "step": 13434 + }, + { + "epoch": 0.625509230160393, + "grad_norm": 0.37989567944882396, + "learning_rate": 9.645664526912647e-05, + "loss": 3.1548, + "step": 13435 + }, + { + "epoch": 0.6255557883464861, + "grad_norm": 0.3974455659905464, + "learning_rate": 9.645564365666997e-05, + "loss": 3.0369, + "step": 13436 + }, + { + "epoch": 0.6256023465325791, + "grad_norm": 0.3718574833613476, + "learning_rate": 9.645464190787071e-05, + "loss": 3.0772, + "step": 13437 + }, + { + "epoch": 0.6256489047186722, + "grad_norm": 0.3703296027424101, + "learning_rate": 9.645364002273166e-05, + "loss": 3.0694, + "step": 13438 + }, + { + "epoch": 0.6256954629047652, + "grad_norm": 0.3616344925019293, + "learning_rate": 9.645263800125575e-05, + "loss": 2.9487, + "step": 13439 + }, + { + "epoch": 0.6257420210908583, + "grad_norm": 
0.42562532731701014, + "learning_rate": 9.645163584344592e-05, + "loss": 3.0243, + "step": 13440 + }, + { + "epoch": 0.6257885792769514, + "grad_norm": 0.35858343793998126, + "learning_rate": 9.645063354930513e-05, + "loss": 3.0627, + "step": 13441 + }, + { + "epoch": 0.6258351374630444, + "grad_norm": 0.3668532017661364, + "learning_rate": 9.64496311188363e-05, + "loss": 3.0469, + "step": 13442 + }, + { + "epoch": 0.6258816956491375, + "grad_norm": 0.37787117061969505, + "learning_rate": 9.644862855204237e-05, + "loss": 3.0022, + "step": 13443 + }, + { + "epoch": 0.6259282538352305, + "grad_norm": 0.3411217131326817, + "learning_rate": 9.644762584892629e-05, + "loss": 3.031, + "step": 13444 + }, + { + "epoch": 0.6259748120213237, + "grad_norm": 0.396941421477725, + "learning_rate": 9.6446623009491e-05, + "loss": 3.1239, + "step": 13445 + }, + { + "epoch": 0.6260213702074168, + "grad_norm": 0.37644276282013284, + "learning_rate": 9.644562003373947e-05, + "loss": 2.9591, + "step": 13446 + }, + { + "epoch": 0.6260679283935098, + "grad_norm": 0.33191172384072676, + "learning_rate": 9.644461692167461e-05, + "loss": 3.0853, + "step": 13447 + }, + { + "epoch": 0.6261144865796029, + "grad_norm": 0.3745192469547433, + "learning_rate": 9.644361367329938e-05, + "loss": 3.0724, + "step": 13448 + }, + { + "epoch": 0.6261610447656959, + "grad_norm": 0.37458996292079455, + "learning_rate": 9.64426102886167e-05, + "loss": 3.1406, + "step": 13449 + }, + { + "epoch": 0.626207602951789, + "grad_norm": 0.39964544256172635, + "learning_rate": 9.644160676762956e-05, + "loss": 3.0635, + "step": 13450 + }, + { + "epoch": 0.626254161137882, + "grad_norm": 0.37708148599657476, + "learning_rate": 9.644060311034086e-05, + "loss": 2.9969, + "step": 13451 + }, + { + "epoch": 0.6263007193239751, + "grad_norm": 0.41516744109819115, + "learning_rate": 9.643959931675358e-05, + "loss": 3.1042, + "step": 13452 + }, + { + "epoch": 0.6263472775100682, + "grad_norm": 0.38946480785581933, + "learning_rate": 9.643859538687064e-05, + "loss": 3.1833, + "step": 13453 + }, + { + "epoch": 0.6263938356961612, + "grad_norm": 0.38910917691022157, + "learning_rate": 9.643759132069499e-05, + "loss": 3.0571, + "step": 13454 + }, + { + "epoch": 0.6264403938822544, + "grad_norm": 0.37335963186077564, + "learning_rate": 9.64365871182296e-05, + "loss": 3.033, + "step": 13455 + }, + { + "epoch": 0.6264869520683474, + "grad_norm": 0.36852590105752886, + "learning_rate": 9.64355827794774e-05, + "loss": 3.0024, + "step": 13456 + }, + { + "epoch": 0.6265335102544405, + "grad_norm": 0.44458666583368117, + "learning_rate": 9.643457830444133e-05, + "loss": 3.117, + "step": 13457 + }, + { + "epoch": 0.6265800684405336, + "grad_norm": 0.4008839425368648, + "learning_rate": 9.643357369312434e-05, + "loss": 3.1173, + "step": 13458 + }, + { + "epoch": 0.6266266266266266, + "grad_norm": 0.46367209653237523, + "learning_rate": 9.643256894552942e-05, + "loss": 3.0643, + "step": 13459 + }, + { + "epoch": 0.6266731848127197, + "grad_norm": 0.36737199815994537, + "learning_rate": 9.643156406165945e-05, + "loss": 3.0578, + "step": 13460 + }, + { + "epoch": 0.6267197429988127, + "grad_norm": 0.39898285135016837, + "learning_rate": 9.643055904151742e-05, + "loss": 2.9306, + "step": 13461 + }, + { + "epoch": 0.6267663011849058, + "grad_norm": 0.38586208595251487, + "learning_rate": 9.642955388510629e-05, + "loss": 3.0513, + "step": 13462 + }, + { + "epoch": 0.626812859370999, + "grad_norm": 0.4082992813851875, + "learning_rate": 9.642854859242898e-05, + "loss": 
3.109, + "step": 13463 + }, + { + "epoch": 0.626859417557092, + "grad_norm": 0.3943459632824351, + "learning_rate": 9.642754316348846e-05, + "loss": 3.1347, + "step": 13464 + }, + { + "epoch": 0.6269059757431851, + "grad_norm": 0.3504658001823984, + "learning_rate": 9.642653759828765e-05, + "loss": 3.0487, + "step": 13465 + }, + { + "epoch": 0.6269525339292781, + "grad_norm": 0.3399018419833433, + "learning_rate": 9.642553189682957e-05, + "loss": 3.0513, + "step": 13466 + }, + { + "epoch": 0.6269990921153712, + "grad_norm": 0.3298405170628875, + "learning_rate": 9.64245260591171e-05, + "loss": 2.8784, + "step": 13467 + }, + { + "epoch": 0.6270456503014643, + "grad_norm": 0.34103419096459653, + "learning_rate": 9.642352008515322e-05, + "loss": 2.9146, + "step": 13468 + }, + { + "epoch": 0.6270922084875573, + "grad_norm": 0.34605054355867876, + "learning_rate": 9.642251397494089e-05, + "loss": 2.9714, + "step": 13469 + }, + { + "epoch": 0.6271387666736504, + "grad_norm": 0.31148150250165857, + "learning_rate": 9.642150772848304e-05, + "loss": 3.0859, + "step": 13470 + }, + { + "epoch": 0.6271853248597434, + "grad_norm": 0.36119505294734255, + "learning_rate": 9.642050134578267e-05, + "loss": 3.093, + "step": 13471 + }, + { + "epoch": 0.6272318830458365, + "grad_norm": 0.35409803696424863, + "learning_rate": 9.641949482684267e-05, + "loss": 3.1127, + "step": 13472 + }, + { + "epoch": 0.6272784412319296, + "grad_norm": 0.37370991584087543, + "learning_rate": 9.641848817166604e-05, + "loss": 2.9823, + "step": 13473 + }, + { + "epoch": 0.6273249994180227, + "grad_norm": 0.39149828561633016, + "learning_rate": 9.641748138025572e-05, + "loss": 3.012, + "step": 13474 + }, + { + "epoch": 0.6273715576041158, + "grad_norm": 0.43634641512241734, + "learning_rate": 9.641647445261465e-05, + "loss": 3.043, + "step": 13475 + }, + { + "epoch": 0.6274181157902088, + "grad_norm": 0.34665172511824205, + "learning_rate": 9.641546738874582e-05, + "loss": 2.9996, + "step": 13476 + }, + { + "epoch": 0.6274646739763019, + "grad_norm": 0.35921265973033767, + "learning_rate": 9.641446018865216e-05, + "loss": 3.0586, + "step": 13477 + }, + { + "epoch": 0.6275112321623949, + "grad_norm": 0.42716480224421327, + "learning_rate": 9.641345285233663e-05, + "loss": 3.0552, + "step": 13478 + }, + { + "epoch": 0.627557790348488, + "grad_norm": 0.3431777562080412, + "learning_rate": 9.641244537980218e-05, + "loss": 2.9435, + "step": 13479 + }, + { + "epoch": 0.6276043485345811, + "grad_norm": 0.4367911637524245, + "learning_rate": 9.64114377710518e-05, + "loss": 3.1241, + "step": 13480 + }, + { + "epoch": 0.6276509067206741, + "grad_norm": 0.4183043333247721, + "learning_rate": 9.64104300260884e-05, + "loss": 3.1246, + "step": 13481 + }, + { + "epoch": 0.6276974649067673, + "grad_norm": 0.3940261985365163, + "learning_rate": 9.640942214491496e-05, + "loss": 3.1371, + "step": 13482 + }, + { + "epoch": 0.6277440230928603, + "grad_norm": 0.3931227284989623, + "learning_rate": 9.640841412753445e-05, + "loss": 3.0144, + "step": 13483 + }, + { + "epoch": 0.6277905812789534, + "grad_norm": 0.35641225009246064, + "learning_rate": 9.640740597394981e-05, + "loss": 3.0553, + "step": 13484 + }, + { + "epoch": 0.6278371394650465, + "grad_norm": 0.4010259773804566, + "learning_rate": 9.6406397684164e-05, + "loss": 3.1195, + "step": 13485 + }, + { + "epoch": 0.6278836976511395, + "grad_norm": 0.38323532015270706, + "learning_rate": 9.640538925817999e-05, + "loss": 3.0328, + "step": 13486 + }, + { + "epoch": 0.6279302558372326, + "grad_norm": 
0.3518684931107578, + "learning_rate": 9.640438069600075e-05, + "loss": 3.1458, + "step": 13487 + }, + { + "epoch": 0.6279768140233256, + "grad_norm": 0.3596118518707185, + "learning_rate": 9.64033719976292e-05, + "loss": 3.079, + "step": 13488 + }, + { + "epoch": 0.6280233722094187, + "grad_norm": 0.3479223181422914, + "learning_rate": 9.640236316306833e-05, + "loss": 2.9368, + "step": 13489 + }, + { + "epoch": 0.6280699303955118, + "grad_norm": 0.36431920123367506, + "learning_rate": 9.64013541923211e-05, + "loss": 3.033, + "step": 13490 + }, + { + "epoch": 0.6281164885816048, + "grad_norm": 0.3627849783239644, + "learning_rate": 9.640034508539046e-05, + "loss": 2.9869, + "step": 13491 + }, + { + "epoch": 0.628163046767698, + "grad_norm": 0.3292204848163671, + "learning_rate": 9.63993358422794e-05, + "loss": 2.9423, + "step": 13492 + }, + { + "epoch": 0.628209604953791, + "grad_norm": 0.3711899635024875, + "learning_rate": 9.639832646299083e-05, + "loss": 3.0052, + "step": 13493 + }, + { + "epoch": 0.6282561631398841, + "grad_norm": 0.38828611371547767, + "learning_rate": 9.639731694752776e-05, + "loss": 2.9933, + "step": 13494 + }, + { + "epoch": 0.6283027213259771, + "grad_norm": 0.3513867395457554, + "learning_rate": 9.639630729589313e-05, + "loss": 3.1175, + "step": 13495 + }, + { + "epoch": 0.6283492795120702, + "grad_norm": 0.40849458124643434, + "learning_rate": 9.639529750808992e-05, + "loss": 3.0965, + "step": 13496 + }, + { + "epoch": 0.6283958376981633, + "grad_norm": 0.3691494175786416, + "learning_rate": 9.639428758412108e-05, + "loss": 3.0774, + "step": 13497 + }, + { + "epoch": 0.6284423958842563, + "grad_norm": 0.4268633310307957, + "learning_rate": 9.639327752398957e-05, + "loss": 2.9972, + "step": 13498 + }, + { + "epoch": 0.6284889540703494, + "grad_norm": 0.37664419230995033, + "learning_rate": 9.639226732769836e-05, + "loss": 2.9804, + "step": 13499 + }, + { + "epoch": 0.6285355122564424, + "grad_norm": 0.37938146400860695, + "learning_rate": 9.639125699525042e-05, + "loss": 3.0602, + "step": 13500 + }, + { + "epoch": 0.6285820704425356, + "grad_norm": 0.33721046643750924, + "learning_rate": 9.639024652664871e-05, + "loss": 3.004, + "step": 13501 + }, + { + "epoch": 0.6286286286286287, + "grad_norm": 0.3476908917163302, + "learning_rate": 9.63892359218962e-05, + "loss": 3.0207, + "step": 13502 + }, + { + "epoch": 0.6286751868147217, + "grad_norm": 0.34120261849385897, + "learning_rate": 9.638822518099584e-05, + "loss": 3.0441, + "step": 13503 + }, + { + "epoch": 0.6287217450008148, + "grad_norm": 0.3889399232737739, + "learning_rate": 9.638721430395061e-05, + "loss": 3.0675, + "step": 13504 + }, + { + "epoch": 0.6287683031869078, + "grad_norm": 0.38333095172358267, + "learning_rate": 9.638620329076349e-05, + "loss": 3.022, + "step": 13505 + }, + { + "epoch": 0.6288148613730009, + "grad_norm": 0.3581480178302515, + "learning_rate": 9.638519214143742e-05, + "loss": 3.0787, + "step": 13506 + }, + { + "epoch": 0.628861419559094, + "grad_norm": 0.37315855398733994, + "learning_rate": 9.638418085597541e-05, + "loss": 2.9132, + "step": 13507 + }, + { + "epoch": 0.628907977745187, + "grad_norm": 0.38629820216456545, + "learning_rate": 9.638316943438036e-05, + "loss": 3.1447, + "step": 13508 + }, + { + "epoch": 0.6289545359312801, + "grad_norm": 0.3911396446318052, + "learning_rate": 9.63821578766553e-05, + "loss": 2.9475, + "step": 13509 + }, + { + "epoch": 0.6290010941173731, + "grad_norm": 0.3374328308457377, + "learning_rate": 9.638114618280316e-05, + "loss": 3.0239, + 
"step": 13510 + }, + { + "epoch": 0.6290476523034663, + "grad_norm": 0.35574457587018166, + "learning_rate": 9.638013435282693e-05, + "loss": 3.0443, + "step": 13511 + }, + { + "epoch": 0.6290942104895594, + "grad_norm": 0.3632455749859365, + "learning_rate": 9.63791223867296e-05, + "loss": 3.1575, + "step": 13512 + }, + { + "epoch": 0.6291407686756524, + "grad_norm": 0.3674941243124678, + "learning_rate": 9.637811028451408e-05, + "loss": 3.0848, + "step": 13513 + }, + { + "epoch": 0.6291873268617455, + "grad_norm": 0.3659881547632155, + "learning_rate": 9.63770980461834e-05, + "loss": 3.0413, + "step": 13514 + }, + { + "epoch": 0.6292338850478385, + "grad_norm": 0.3663889640856766, + "learning_rate": 9.63760856717405e-05, + "loss": 3.0131, + "step": 13515 + }, + { + "epoch": 0.6292804432339316, + "grad_norm": 0.3747764485826748, + "learning_rate": 9.637507316118834e-05, + "loss": 3.0754, + "step": 13516 + }, + { + "epoch": 0.6293270014200246, + "grad_norm": 0.3518459366972796, + "learning_rate": 9.637406051452993e-05, + "loss": 3.0416, + "step": 13517 + }, + { + "epoch": 0.6293735596061177, + "grad_norm": 0.3470026065122274, + "learning_rate": 9.637304773176821e-05, + "loss": 3.0908, + "step": 13518 + }, + { + "epoch": 0.6294201177922109, + "grad_norm": 0.3377951342882317, + "learning_rate": 9.637203481290617e-05, + "loss": 2.9679, + "step": 13519 + }, + { + "epoch": 0.6294666759783039, + "grad_norm": 0.42081367494397304, + "learning_rate": 9.637102175794677e-05, + "loss": 3.0003, + "step": 13520 + }, + { + "epoch": 0.629513234164397, + "grad_norm": 0.3763697216048282, + "learning_rate": 9.637000856689299e-05, + "loss": 3.0476, + "step": 13521 + }, + { + "epoch": 0.62955979235049, + "grad_norm": 0.4049917770105007, + "learning_rate": 9.63689952397478e-05, + "loss": 3.0278, + "step": 13522 + }, + { + "epoch": 0.6296063505365831, + "grad_norm": 0.3624893914301564, + "learning_rate": 9.636798177651418e-05, + "loss": 3.0843, + "step": 13523 + }, + { + "epoch": 0.6296529087226762, + "grad_norm": 0.392694963845943, + "learning_rate": 9.63669681771951e-05, + "loss": 3.0342, + "step": 13524 + }, + { + "epoch": 0.6296994669087692, + "grad_norm": 0.4029400173226154, + "learning_rate": 9.636595444179355e-05, + "loss": 3.0744, + "step": 13525 + }, + { + "epoch": 0.6297460250948623, + "grad_norm": 0.37800741283646183, + "learning_rate": 9.636494057031249e-05, + "loss": 3.0894, + "step": 13526 + }, + { + "epoch": 0.6297925832809553, + "grad_norm": 0.37213499489293395, + "learning_rate": 9.636392656275488e-05, + "loss": 3.1682, + "step": 13527 + }, + { + "epoch": 0.6298391414670484, + "grad_norm": 0.3316210326605614, + "learning_rate": 9.636291241912372e-05, + "loss": 3.0326, + "step": 13528 + }, + { + "epoch": 0.6298856996531416, + "grad_norm": 0.33116648453917785, + "learning_rate": 9.636189813942199e-05, + "loss": 3.0825, + "step": 13529 + }, + { + "epoch": 0.6299322578392346, + "grad_norm": 0.3409452270996751, + "learning_rate": 9.636088372365264e-05, + "loss": 2.9007, + "step": 13530 + }, + { + "epoch": 0.6299788160253277, + "grad_norm": 0.3653269905708446, + "learning_rate": 9.635986917181867e-05, + "loss": 2.9885, + "step": 13531 + }, + { + "epoch": 0.6300253742114207, + "grad_norm": 0.34187697417251195, + "learning_rate": 9.635885448392306e-05, + "loss": 3.0349, + "step": 13532 + }, + { + "epoch": 0.6300719323975138, + "grad_norm": 0.3204082623473477, + "learning_rate": 9.635783965996876e-05, + "loss": 2.9581, + "step": 13533 + }, + { + "epoch": 0.6301184905836069, + "grad_norm": 
0.34630267922083646, + "learning_rate": 9.635682469995879e-05, + "loss": 2.9352, + "step": 13534 + }, + { + "epoch": 0.6301650487696999, + "grad_norm": 0.36613106627652103, + "learning_rate": 9.63558096038961e-05, + "loss": 3.048, + "step": 13535 + }, + { + "epoch": 0.630211606955793, + "grad_norm": 0.34051042544334836, + "learning_rate": 9.635479437178368e-05, + "loss": 3.1129, + "step": 13536 + }, + { + "epoch": 0.630258165141886, + "grad_norm": 0.38321899616625515, + "learning_rate": 9.635377900362448e-05, + "loss": 3.0457, + "step": 13537 + }, + { + "epoch": 0.6303047233279792, + "grad_norm": 0.3776300594871421, + "learning_rate": 9.635276349942152e-05, + "loss": 3.0705, + "step": 13538 + }, + { + "epoch": 0.6303512815140722, + "grad_norm": 0.32300376060811203, + "learning_rate": 9.635174785917778e-05, + "loss": 3.0938, + "step": 13539 + }, + { + "epoch": 0.6303978397001653, + "grad_norm": 0.4010026622042746, + "learning_rate": 9.63507320828962e-05, + "loss": 3.0435, + "step": 13540 + }, + { + "epoch": 0.6304443978862584, + "grad_norm": 0.4340378901248949, + "learning_rate": 9.634971617057981e-05, + "loss": 3.1069, + "step": 13541 + }, + { + "epoch": 0.6304909560723514, + "grad_norm": 0.37855585607484066, + "learning_rate": 9.634870012223156e-05, + "loss": 3.1162, + "step": 13542 + }, + { + "epoch": 0.6305375142584445, + "grad_norm": 0.45400659573697816, + "learning_rate": 9.634768393785445e-05, + "loss": 3.0107, + "step": 13543 + }, + { + "epoch": 0.6305840724445375, + "grad_norm": 0.4014069019875813, + "learning_rate": 9.634666761745144e-05, + "loss": 3.0159, + "step": 13544 + }, + { + "epoch": 0.6306306306306306, + "grad_norm": 0.3584883776959401, + "learning_rate": 9.634565116102553e-05, + "loss": 3.1021, + "step": 13545 + }, + { + "epoch": 0.6306771888167237, + "grad_norm": 0.4949894545828079, + "learning_rate": 9.63446345685797e-05, + "loss": 3.1249, + "step": 13546 + }, + { + "epoch": 0.6307237470028167, + "grad_norm": 0.3748361760743344, + "learning_rate": 9.634361784011694e-05, + "loss": 2.9695, + "step": 13547 + }, + { + "epoch": 0.6307703051889099, + "grad_norm": 0.37679367838725175, + "learning_rate": 9.634260097564023e-05, + "loss": 3.0318, + "step": 13548 + }, + { + "epoch": 0.6308168633750029, + "grad_norm": 0.36063421103325916, + "learning_rate": 9.634158397515254e-05, + "loss": 2.9889, + "step": 13549 + }, + { + "epoch": 0.630863421561096, + "grad_norm": 0.3972888667927385, + "learning_rate": 9.634056683865687e-05, + "loss": 3.049, + "step": 13550 + }, + { + "epoch": 0.6309099797471891, + "grad_norm": 0.3561171760236354, + "learning_rate": 9.63395495661562e-05, + "loss": 3.0624, + "step": 13551 + }, + { + "epoch": 0.6309565379332821, + "grad_norm": 0.45425610946127554, + "learning_rate": 9.633853215765351e-05, + "loss": 3.1602, + "step": 13552 + }, + { + "epoch": 0.6310030961193752, + "grad_norm": 0.36562153619369026, + "learning_rate": 9.633751461315181e-05, + "loss": 3.0332, + "step": 13553 + }, + { + "epoch": 0.6310496543054682, + "grad_norm": 0.3562416534163883, + "learning_rate": 9.633649693265406e-05, + "loss": 3.0256, + "step": 13554 + }, + { + "epoch": 0.6310962124915613, + "grad_norm": 0.3888527462948634, + "learning_rate": 9.633547911616327e-05, + "loss": 2.9465, + "step": 13555 + }, + { + "epoch": 0.6311427706776545, + "grad_norm": 0.3795149283025149, + "learning_rate": 9.633446116368239e-05, + "loss": 3.0758, + "step": 13556 + }, + { + "epoch": 0.6311893288637475, + "grad_norm": 0.37089928903628966, + "learning_rate": 9.633344307521444e-05, + "loss": 
3.0091, + "step": 13557 + }, + { + "epoch": 0.6312358870498406, + "grad_norm": 0.3918416141456934, + "learning_rate": 9.633242485076241e-05, + "loss": 3.0521, + "step": 13558 + }, + { + "epoch": 0.6312824452359336, + "grad_norm": 0.36021727477054105, + "learning_rate": 9.633140649032927e-05, + "loss": 3.0524, + "step": 13559 + }, + { + "epoch": 0.6313290034220267, + "grad_norm": 0.39578335507624646, + "learning_rate": 9.633038799391801e-05, + "loss": 3.0805, + "step": 13560 + }, + { + "epoch": 0.6313755616081197, + "grad_norm": 0.36016515416897676, + "learning_rate": 9.632936936153164e-05, + "loss": 2.9851, + "step": 13561 + }, + { + "epoch": 0.6314221197942128, + "grad_norm": 0.3704004089334484, + "learning_rate": 9.632835059317314e-05, + "loss": 2.978, + "step": 13562 + }, + { + "epoch": 0.6314686779803059, + "grad_norm": 0.36966621551956647, + "learning_rate": 9.632733168884547e-05, + "loss": 3.0513, + "step": 13563 + }, + { + "epoch": 0.6315152361663989, + "grad_norm": 0.39451228836598523, + "learning_rate": 9.632631264855166e-05, + "loss": 2.9774, + "step": 13564 + }, + { + "epoch": 0.631561794352492, + "grad_norm": 0.4326610080597083, + "learning_rate": 9.632529347229468e-05, + "loss": 3.0337, + "step": 13565 + }, + { + "epoch": 0.631608352538585, + "grad_norm": 0.45385005336645406, + "learning_rate": 9.632427416007753e-05, + "loss": 3.0521, + "step": 13566 + }, + { + "epoch": 0.6316549107246782, + "grad_norm": 0.39192780593913923, + "learning_rate": 9.63232547119032e-05, + "loss": 3.0724, + "step": 13567 + }, + { + "epoch": 0.6317014689107713, + "grad_norm": 0.37701983786799226, + "learning_rate": 9.632223512777469e-05, + "loss": 3.0099, + "step": 13568 + }, + { + "epoch": 0.6317480270968643, + "grad_norm": 0.4047075479187413, + "learning_rate": 9.632121540769497e-05, + "loss": 2.9922, + "step": 13569 + }, + { + "epoch": 0.6317945852829574, + "grad_norm": 0.39592365817698155, + "learning_rate": 9.632019555166705e-05, + "loss": 3.041, + "step": 13570 + }, + { + "epoch": 0.6318411434690504, + "grad_norm": 0.3616933549619661, + "learning_rate": 9.631917555969392e-05, + "loss": 3.025, + "step": 13571 + }, + { + "epoch": 0.6318877016551435, + "grad_norm": 0.42751388621792186, + "learning_rate": 9.631815543177858e-05, + "loss": 3.1064, + "step": 13572 + }, + { + "epoch": 0.6319342598412366, + "grad_norm": 0.3789278073157759, + "learning_rate": 9.6317135167924e-05, + "loss": 2.9364, + "step": 13573 + }, + { + "epoch": 0.6319808180273296, + "grad_norm": 0.38940555932081466, + "learning_rate": 9.63161147681332e-05, + "loss": 2.9916, + "step": 13574 + }, + { + "epoch": 0.6320273762134228, + "grad_norm": 0.37665909130167974, + "learning_rate": 9.631509423240918e-05, + "loss": 3.1454, + "step": 13575 + }, + { + "epoch": 0.6320739343995158, + "grad_norm": 0.4053130164580416, + "learning_rate": 9.63140735607549e-05, + "loss": 3.0458, + "step": 13576 + }, + { + "epoch": 0.6321204925856089, + "grad_norm": 0.3874734737741031, + "learning_rate": 9.63130527531734e-05, + "loss": 3.0071, + "step": 13577 + }, + { + "epoch": 0.632167050771702, + "grad_norm": 0.4289499153829629, + "learning_rate": 9.631203180966762e-05, + "loss": 2.9913, + "step": 13578 + }, + { + "epoch": 0.632213608957795, + "grad_norm": 0.41619769579591487, + "learning_rate": 9.631101073024061e-05, + "loss": 3.0162, + "step": 13579 + }, + { + "epoch": 0.6322601671438881, + "grad_norm": 0.3605115633026288, + "learning_rate": 9.630998951489534e-05, + "loss": 3.0641, + "step": 13580 + }, + { + "epoch": 0.6323067253299811, + "grad_norm": 
0.39690207383721576, + "learning_rate": 9.630896816363483e-05, + "loss": 3.0337, + "step": 13581 + }, + { + "epoch": 0.6323532835160742, + "grad_norm": 0.3750840457066831, + "learning_rate": 9.630794667646203e-05, + "loss": 3.0922, + "step": 13582 + }, + { + "epoch": 0.6323998417021672, + "grad_norm": 0.4000659367323474, + "learning_rate": 9.630692505337999e-05, + "loss": 3.0274, + "step": 13583 + }, + { + "epoch": 0.6324463998882603, + "grad_norm": 0.37605367929381917, + "learning_rate": 9.630590329439169e-05, + "loss": 3.0531, + "step": 13584 + }, + { + "epoch": 0.6324929580743535, + "grad_norm": 0.3907714874452961, + "learning_rate": 9.630488139950012e-05, + "loss": 3.1006, + "step": 13585 + }, + { + "epoch": 0.6325395162604465, + "grad_norm": 0.37984188399394636, + "learning_rate": 9.630385936870828e-05, + "loss": 3.0801, + "step": 13586 + }, + { + "epoch": 0.6325860744465396, + "grad_norm": 0.35882857772237203, + "learning_rate": 9.630283720201918e-05, + "loss": 3.0447, + "step": 13587 + }, + { + "epoch": 0.6326326326326326, + "grad_norm": 0.4119485230511792, + "learning_rate": 9.630181489943581e-05, + "loss": 3.0485, + "step": 13588 + }, + { + "epoch": 0.6326791908187257, + "grad_norm": 0.4148837694091245, + "learning_rate": 9.630079246096117e-05, + "loss": 3.0411, + "step": 13589 + }, + { + "epoch": 0.6327257490048188, + "grad_norm": 0.3692779150434751, + "learning_rate": 9.629976988659827e-05, + "loss": 3.0485, + "step": 13590 + }, + { + "epoch": 0.6327723071909118, + "grad_norm": 0.38885860568095754, + "learning_rate": 9.629874717635012e-05, + "loss": 3.0801, + "step": 13591 + }, + { + "epoch": 0.6328188653770049, + "grad_norm": 0.4217396215637363, + "learning_rate": 9.62977243302197e-05, + "loss": 3.0102, + "step": 13592 + }, + { + "epoch": 0.6328654235630979, + "grad_norm": 0.37210742966083443, + "learning_rate": 9.629670134821002e-05, + "loss": 3.0923, + "step": 13593 + }, + { + "epoch": 0.632911981749191, + "grad_norm": 0.4059035108242084, + "learning_rate": 9.629567823032409e-05, + "loss": 3.0244, + "step": 13594 + }, + { + "epoch": 0.6329585399352842, + "grad_norm": 0.3633570749927692, + "learning_rate": 9.629465497656488e-05, + "loss": 3.1347, + "step": 13595 + }, + { + "epoch": 0.6330050981213772, + "grad_norm": 0.38223954077434047, + "learning_rate": 9.629363158693543e-05, + "loss": 3.0137, + "step": 13596 + }, + { + "epoch": 0.6330516563074703, + "grad_norm": 0.3779448938087802, + "learning_rate": 9.629260806143874e-05, + "loss": 3.0501, + "step": 13597 + }, + { + "epoch": 0.6330982144935633, + "grad_norm": 0.34763937545567, + "learning_rate": 9.62915844000778e-05, + "loss": 3.2027, + "step": 13598 + }, + { + "epoch": 0.6331447726796564, + "grad_norm": 0.3999723648982738, + "learning_rate": 9.629056060285562e-05, + "loss": 3.098, + "step": 13599 + }, + { + "epoch": 0.6331913308657495, + "grad_norm": 0.36837453068027903, + "learning_rate": 9.62895366697752e-05, + "loss": 3.0802, + "step": 13600 + }, + { + "epoch": 0.6332378890518425, + "grad_norm": 0.3635479692264641, + "learning_rate": 9.628851260083958e-05, + "loss": 3.0662, + "step": 13601 + }, + { + "epoch": 0.6332844472379356, + "grad_norm": 0.36818297226009516, + "learning_rate": 9.62874883960517e-05, + "loss": 3.1048, + "step": 13602 + }, + { + "epoch": 0.6333310054240286, + "grad_norm": 0.3669641280886608, + "learning_rate": 9.628646405541461e-05, + "loss": 3.0709, + "step": 13603 + }, + { + "epoch": 0.6333775636101218, + "grad_norm": 0.39805119449062054, + "learning_rate": 9.628543957893132e-05, + "loss": 3.0586, 
+ "step": 13604 + }, + { + "epoch": 0.6334241217962148, + "grad_norm": 0.37877772280754707, + "learning_rate": 9.628441496660481e-05, + "loss": 3.0276, + "step": 13605 + }, + { + "epoch": 0.6334706799823079, + "grad_norm": 0.37537544540298284, + "learning_rate": 9.62833902184381e-05, + "loss": 3.0802, + "step": 13606 + }, + { + "epoch": 0.633517238168401, + "grad_norm": 0.41319186950807374, + "learning_rate": 9.628236533443422e-05, + "loss": 3.0764, + "step": 13607 + }, + { + "epoch": 0.633563796354494, + "grad_norm": 0.3901524647581896, + "learning_rate": 9.628134031459615e-05, + "loss": 3.0703, + "step": 13608 + }, + { + "epoch": 0.6336103545405871, + "grad_norm": 0.3847989113174412, + "learning_rate": 9.62803151589269e-05, + "loss": 2.9646, + "step": 13609 + }, + { + "epoch": 0.6336569127266801, + "grad_norm": 0.3390359791146052, + "learning_rate": 9.627928986742949e-05, + "loss": 3.0256, + "step": 13610 + }, + { + "epoch": 0.6337034709127732, + "grad_norm": 0.36940249193150043, + "learning_rate": 9.627826444010691e-05, + "loss": 2.9897, + "step": 13611 + }, + { + "epoch": 0.6337500290988664, + "grad_norm": 0.36625802927898277, + "learning_rate": 9.62772388769622e-05, + "loss": 3.0286, + "step": 13612 + }, + { + "epoch": 0.6337965872849594, + "grad_norm": 0.36346475036798076, + "learning_rate": 9.627621317799834e-05, + "loss": 3.1009, + "step": 13613 + }, + { + "epoch": 0.6338431454710525, + "grad_norm": 0.33708481654561223, + "learning_rate": 9.627518734321837e-05, + "loss": 3.0657, + "step": 13614 + }, + { + "epoch": 0.6338897036571455, + "grad_norm": 0.38173068849714403, + "learning_rate": 9.627416137262526e-05, + "loss": 3.1219, + "step": 13615 + }, + { + "epoch": 0.6339362618432386, + "grad_norm": 0.3248697046134943, + "learning_rate": 9.627313526622206e-05, + "loss": 3.046, + "step": 13616 + }, + { + "epoch": 0.6339828200293317, + "grad_norm": 0.35070250968886607, + "learning_rate": 9.627210902401176e-05, + "loss": 3.0735, + "step": 13617 + }, + { + "epoch": 0.6340293782154247, + "grad_norm": 0.31219720875525536, + "learning_rate": 9.627108264599739e-05, + "loss": 3.0666, + "step": 13618 + }, + { + "epoch": 0.6340759364015178, + "grad_norm": 0.3665167000139436, + "learning_rate": 9.627005613218194e-05, + "loss": 2.9224, + "step": 13619 + }, + { + "epoch": 0.6341224945876108, + "grad_norm": 0.39986398106468896, + "learning_rate": 9.626902948256844e-05, + "loss": 3.0813, + "step": 13620 + }, + { + "epoch": 0.634169052773704, + "grad_norm": 0.33764813355211404, + "learning_rate": 9.62680026971599e-05, + "loss": 2.9761, + "step": 13621 + }, + { + "epoch": 0.6342156109597971, + "grad_norm": 0.4009520291065384, + "learning_rate": 9.626697577595932e-05, + "loss": 3.0332, + "step": 13622 + }, + { + "epoch": 0.6342621691458901, + "grad_norm": 0.37440602004854395, + "learning_rate": 9.626594871896972e-05, + "loss": 3.0766, + "step": 13623 + }, + { + "epoch": 0.6343087273319832, + "grad_norm": 0.3688977160862764, + "learning_rate": 9.626492152619412e-05, + "loss": 3.0593, + "step": 13624 + }, + { + "epoch": 0.6343552855180762, + "grad_norm": 0.3769480172229773, + "learning_rate": 9.626389419763556e-05, + "loss": 3.01, + "step": 13625 + }, + { + "epoch": 0.6344018437041693, + "grad_norm": 0.34952450101128524, + "learning_rate": 9.626286673329701e-05, + "loss": 3.0476, + "step": 13626 + }, + { + "epoch": 0.6344484018902623, + "grad_norm": 0.33474616705513244, + "learning_rate": 9.62618391331815e-05, + "loss": 3.0645, + "step": 13627 + }, + { + "epoch": 0.6344949600763554, + "grad_norm": 
0.3867769946203212, + "learning_rate": 9.626081139729204e-05, + "loss": 3.0086, + "step": 13628 + }, + { + "epoch": 0.6345415182624485, + "grad_norm": 0.3594400792041565, + "learning_rate": 9.625978352563168e-05, + "loss": 3.0463, + "step": 13629 + }, + { + "epoch": 0.6345880764485415, + "grad_norm": 0.36592867864051865, + "learning_rate": 9.625875551820339e-05, + "loss": 3.0346, + "step": 13630 + }, + { + "epoch": 0.6346346346346347, + "grad_norm": 0.34463820203847345, + "learning_rate": 9.625772737501022e-05, + "loss": 3.0609, + "step": 13631 + }, + { + "epoch": 0.6346811928207277, + "grad_norm": 0.36554678660381523, + "learning_rate": 9.625669909605518e-05, + "loss": 2.9781, + "step": 13632 + }, + { + "epoch": 0.6347277510068208, + "grad_norm": 0.3481727438365473, + "learning_rate": 9.62556706813413e-05, + "loss": 3.1083, + "step": 13633 + }, + { + "epoch": 0.6347743091929139, + "grad_norm": 0.3734873332528906, + "learning_rate": 9.625464213087155e-05, + "loss": 3.0337, + "step": 13634 + }, + { + "epoch": 0.6348208673790069, + "grad_norm": 0.3366451336447661, + "learning_rate": 9.6253613444649e-05, + "loss": 2.9958, + "step": 13635 + }, + { + "epoch": 0.6348674255651, + "grad_norm": 0.3897943535177279, + "learning_rate": 9.625258462267666e-05, + "loss": 3.1197, + "step": 13636 + }, + { + "epoch": 0.634913983751193, + "grad_norm": 0.36365407966093494, + "learning_rate": 9.625155566495751e-05, + "loss": 3.1126, + "step": 13637 + }, + { + "epoch": 0.6349605419372861, + "grad_norm": 0.3714211439371374, + "learning_rate": 9.625052657149463e-05, + "loss": 3.1004, + "step": 13638 + }, + { + "epoch": 0.6350071001233792, + "grad_norm": 0.4150334603337239, + "learning_rate": 9.6249497342291e-05, + "loss": 3.0393, + "step": 13639 + }, + { + "epoch": 0.6350536583094722, + "grad_norm": 0.3535437185012075, + "learning_rate": 9.624846797734965e-05, + "loss": 3.187, + "step": 13640 + }, + { + "epoch": 0.6351002164955654, + "grad_norm": 0.36116309408221897, + "learning_rate": 9.624743847667362e-05, + "loss": 3.1002, + "step": 13641 + }, + { + "epoch": 0.6351467746816584, + "grad_norm": 0.4175442645377864, + "learning_rate": 9.62464088402659e-05, + "loss": 3.0161, + "step": 13642 + }, + { + "epoch": 0.6351933328677515, + "grad_norm": 0.3639750437123545, + "learning_rate": 9.624537906812952e-05, + "loss": 3.0331, + "step": 13643 + }, + { + "epoch": 0.6352398910538446, + "grad_norm": 0.4104560574914832, + "learning_rate": 9.624434916026752e-05, + "loss": 3.1171, + "step": 13644 + }, + { + "epoch": 0.6352864492399376, + "grad_norm": 0.3932994940371994, + "learning_rate": 9.62433191166829e-05, + "loss": 3.103, + "step": 13645 + }, + { + "epoch": 0.6353330074260307, + "grad_norm": 0.3605832076483838, + "learning_rate": 9.62422889373787e-05, + "loss": 3.0346, + "step": 13646 + }, + { + "epoch": 0.6353795656121237, + "grad_norm": 0.37845738413979185, + "learning_rate": 9.624125862235793e-05, + "loss": 3.0818, + "step": 13647 + }, + { + "epoch": 0.6354261237982168, + "grad_norm": 0.4255514347034639, + "learning_rate": 9.624022817162363e-05, + "loss": 3.1027, + "step": 13648 + }, + { + "epoch": 0.6354726819843098, + "grad_norm": 0.339226106102167, + "learning_rate": 9.623919758517883e-05, + "loss": 3.0271, + "step": 13649 + }, + { + "epoch": 0.635519240170403, + "grad_norm": 0.4004310173717735, + "learning_rate": 9.623816686302651e-05, + "loss": 3.0903, + "step": 13650 + }, + { + "epoch": 0.6355657983564961, + "grad_norm": 0.4192501988659228, + "learning_rate": 9.623713600516975e-05, + "loss": 2.9961, + "step": 
13651 + }, + { + "epoch": 0.6356123565425891, + "grad_norm": 0.33160861575683803, + "learning_rate": 9.623610501161154e-05, + "loss": 3.0071, + "step": 13652 + }, + { + "epoch": 0.6356589147286822, + "grad_norm": 0.448313704980529, + "learning_rate": 9.623507388235492e-05, + "loss": 3.0129, + "step": 13653 + }, + { + "epoch": 0.6357054729147752, + "grad_norm": 0.43457612002918444, + "learning_rate": 9.623404261740292e-05, + "loss": 3.05, + "step": 13654 + }, + { + "epoch": 0.6357520311008683, + "grad_norm": 0.3999973257103731, + "learning_rate": 9.623301121675855e-05, + "loss": 3.0027, + "step": 13655 + }, + { + "epoch": 0.6357985892869614, + "grad_norm": 0.4425525998919484, + "learning_rate": 9.623197968042484e-05, + "loss": 3.0047, + "step": 13656 + }, + { + "epoch": 0.6358451474730544, + "grad_norm": 0.38877440023770543, + "learning_rate": 9.623094800840484e-05, + "loss": 3.0085, + "step": 13657 + }, + { + "epoch": 0.6358917056591475, + "grad_norm": 0.367966909413778, + "learning_rate": 9.622991620070156e-05, + "loss": 3.05, + "step": 13658 + }, + { + "epoch": 0.6359382638452405, + "grad_norm": 0.37418568646465494, + "learning_rate": 9.622888425731802e-05, + "loss": 3.1495, + "step": 13659 + }, + { + "epoch": 0.6359848220313337, + "grad_norm": 0.40611348684547505, + "learning_rate": 9.622785217825727e-05, + "loss": 3.0197, + "step": 13660 + }, + { + "epoch": 0.6360313802174268, + "grad_norm": 0.4136236755923676, + "learning_rate": 9.622681996352232e-05, + "loss": 3.0177, + "step": 13661 + }, + { + "epoch": 0.6360779384035198, + "grad_norm": 0.3947302230436515, + "learning_rate": 9.62257876131162e-05, + "loss": 3.1072, + "step": 13662 + }, + { + "epoch": 0.6361244965896129, + "grad_norm": 0.4013575309879797, + "learning_rate": 9.622475512704197e-05, + "loss": 3.1775, + "step": 13663 + }, + { + "epoch": 0.6361710547757059, + "grad_norm": 0.3730107473347986, + "learning_rate": 9.622372250530262e-05, + "loss": 2.9754, + "step": 13664 + }, + { + "epoch": 0.636217612961799, + "grad_norm": 0.3536614824953663, + "learning_rate": 9.62226897479012e-05, + "loss": 3.0357, + "step": 13665 + }, + { + "epoch": 0.6362641711478921, + "grad_norm": 0.40045915611860133, + "learning_rate": 9.622165685484074e-05, + "loss": 3.0727, + "step": 13666 + }, + { + "epoch": 0.6363107293339851, + "grad_norm": 0.38449421503484554, + "learning_rate": 9.622062382612427e-05, + "loss": 3.0515, + "step": 13667 + }, + { + "epoch": 0.6363572875200783, + "grad_norm": 0.370278408997351, + "learning_rate": 9.621959066175483e-05, + "loss": 2.994, + "step": 13668 + }, + { + "epoch": 0.6364038457061713, + "grad_norm": 0.39030940599354624, + "learning_rate": 9.621855736173544e-05, + "loss": 3.142, + "step": 13669 + }, + { + "epoch": 0.6364504038922644, + "grad_norm": 0.3932722479510497, + "learning_rate": 9.621752392606913e-05, + "loss": 2.8948, + "step": 13670 + }, + { + "epoch": 0.6364969620783574, + "grad_norm": 0.38238637613736687, + "learning_rate": 9.621649035475894e-05, + "loss": 3.0746, + "step": 13671 + }, + { + "epoch": 0.6365435202644505, + "grad_norm": 0.37666145486912644, + "learning_rate": 9.621545664780792e-05, + "loss": 3.067, + "step": 13672 + }, + { + "epoch": 0.6365900784505436, + "grad_norm": 0.3922786566837007, + "learning_rate": 9.621442280521908e-05, + "loss": 3.0678, + "step": 13673 + }, + { + "epoch": 0.6366366366366366, + "grad_norm": 0.4273446293708598, + "learning_rate": 9.621338882699547e-05, + "loss": 3.094, + "step": 13674 + }, + { + "epoch": 0.6366831948227297, + "grad_norm": 0.3836429549050974, + 
"learning_rate": 9.621235471314009e-05, + "loss": 3.029, + "step": 13675 + }, + { + "epoch": 0.6367297530088227, + "grad_norm": 0.40313690308921585, + "learning_rate": 9.621132046365604e-05, + "loss": 2.9131, + "step": 13676 + }, + { + "epoch": 0.6367763111949158, + "grad_norm": 0.36268567068007745, + "learning_rate": 9.62102860785463e-05, + "loss": 3.0251, + "step": 13677 + }, + { + "epoch": 0.636822869381009, + "grad_norm": 0.3878319549088594, + "learning_rate": 9.620925155781391e-05, + "loss": 3.0536, + "step": 13678 + }, + { + "epoch": 0.636869427567102, + "grad_norm": 0.3916948160357159, + "learning_rate": 9.620821690146193e-05, + "loss": 3.0705, + "step": 13679 + }, + { + "epoch": 0.6369159857531951, + "grad_norm": 0.35606987305614135, + "learning_rate": 9.620718210949337e-05, + "loss": 2.9688, + "step": 13680 + }, + { + "epoch": 0.6369625439392881, + "grad_norm": 0.3621317039809419, + "learning_rate": 9.62061471819113e-05, + "loss": 2.9817, + "step": 13681 + }, + { + "epoch": 0.6370091021253812, + "grad_norm": 0.34606353912063026, + "learning_rate": 9.620511211871875e-05, + "loss": 3.1053, + "step": 13682 + }, + { + "epoch": 0.6370556603114743, + "grad_norm": 0.3723997691409764, + "learning_rate": 9.620407691991873e-05, + "loss": 2.9946, + "step": 13683 + }, + { + "epoch": 0.6371022184975673, + "grad_norm": 0.3451093008767156, + "learning_rate": 9.62030415855143e-05, + "loss": 3.1841, + "step": 13684 + }, + { + "epoch": 0.6371487766836604, + "grad_norm": 0.37291143778779967, + "learning_rate": 9.620200611550849e-05, + "loss": 2.9766, + "step": 13685 + }, + { + "epoch": 0.6371953348697534, + "grad_norm": 0.3639009893593395, + "learning_rate": 9.620097050990435e-05, + "loss": 3.095, + "step": 13686 + }, + { + "epoch": 0.6372418930558466, + "grad_norm": 0.35241407473020303, + "learning_rate": 9.619993476870492e-05, + "loss": 3.0145, + "step": 13687 + }, + { + "epoch": 0.6372884512419397, + "grad_norm": 0.3224871325015867, + "learning_rate": 9.619889889191323e-05, + "loss": 3.0401, + "step": 13688 + }, + { + "epoch": 0.6373350094280327, + "grad_norm": 0.3643575639417564, + "learning_rate": 9.619786287953231e-05, + "loss": 3.085, + "step": 13689 + }, + { + "epoch": 0.6373815676141258, + "grad_norm": 0.3268386793619403, + "learning_rate": 9.619682673156523e-05, + "loss": 3.0794, + "step": 13690 + }, + { + "epoch": 0.6374281258002188, + "grad_norm": 0.37023328328189037, + "learning_rate": 9.619579044801501e-05, + "loss": 2.9225, + "step": 13691 + }, + { + "epoch": 0.6374746839863119, + "grad_norm": 0.3687965375296011, + "learning_rate": 9.61947540288847e-05, + "loss": 3.0567, + "step": 13692 + }, + { + "epoch": 0.6375212421724049, + "grad_norm": 0.40112087337698854, + "learning_rate": 9.619371747417731e-05, + "loss": 3.1154, + "step": 13693 + }, + { + "epoch": 0.637567800358498, + "grad_norm": 0.3691858659995519, + "learning_rate": 9.619268078389593e-05, + "loss": 3.0909, + "step": 13694 + }, + { + "epoch": 0.6376143585445911, + "grad_norm": 0.39850091762391704, + "learning_rate": 9.619164395804359e-05, + "loss": 3.0228, + "step": 13695 + }, + { + "epoch": 0.6376609167306841, + "grad_norm": 0.367279049989617, + "learning_rate": 9.619060699662332e-05, + "loss": 2.9445, + "step": 13696 + }, + { + "epoch": 0.6377074749167773, + "grad_norm": 0.37151040654399115, + "learning_rate": 9.618956989963817e-05, + "loss": 3.0504, + "step": 13697 + }, + { + "epoch": 0.6377540331028703, + "grad_norm": 0.3445591088709852, + "learning_rate": 9.618853266709118e-05, + "loss": 3.1202, + "step": 13698 + }, + { 
+ "epoch": 0.6378005912889634, + "grad_norm": 0.3509173066215327, + "learning_rate": 9.61874952989854e-05, + "loss": 3.0424, + "step": 13699 + }, + { + "epoch": 0.6378471494750565, + "grad_norm": 0.3780107118384235, + "learning_rate": 9.618645779532387e-05, + "loss": 3.074, + "step": 13700 + }, + { + "epoch": 0.6378937076611495, + "grad_norm": 0.3526515716526818, + "learning_rate": 9.618542015610963e-05, + "loss": 2.9534, + "step": 13701 + }, + { + "epoch": 0.6379402658472426, + "grad_norm": 0.3735786480558447, + "learning_rate": 9.618438238134576e-05, + "loss": 3.0758, + "step": 13702 + }, + { + "epoch": 0.6379868240333356, + "grad_norm": 0.35598604386982946, + "learning_rate": 9.618334447103523e-05, + "loss": 3.1347, + "step": 13703 + }, + { + "epoch": 0.6380333822194287, + "grad_norm": 0.34208323978367805, + "learning_rate": 9.618230642518116e-05, + "loss": 3.0452, + "step": 13704 + }, + { + "epoch": 0.6380799404055219, + "grad_norm": 0.3510495716152438, + "learning_rate": 9.618126824378656e-05, + "loss": 3.1258, + "step": 13705 + }, + { + "epoch": 0.6381264985916149, + "grad_norm": 0.34804643957106696, + "learning_rate": 9.618022992685451e-05, + "loss": 3.0042, + "step": 13706 + }, + { + "epoch": 0.638173056777708, + "grad_norm": 0.3389575826871991, + "learning_rate": 9.617919147438804e-05, + "loss": 2.9291, + "step": 13707 + }, + { + "epoch": 0.638219614963801, + "grad_norm": 0.3671295597794861, + "learning_rate": 9.617815288639017e-05, + "loss": 2.9906, + "step": 13708 + }, + { + "epoch": 0.6382661731498941, + "grad_norm": 0.34576634378572696, + "learning_rate": 9.617711416286397e-05, + "loss": 3.05, + "step": 13709 + }, + { + "epoch": 0.6383127313359872, + "grad_norm": 0.41690870728453355, + "learning_rate": 9.617607530381248e-05, + "loss": 3.1096, + "step": 13710 + }, + { + "epoch": 0.6383592895220802, + "grad_norm": 0.38537905551885676, + "learning_rate": 9.617503630923878e-05, + "loss": 3.0713, + "step": 13711 + }, + { + "epoch": 0.6384058477081733, + "grad_norm": 0.37576628823608127, + "learning_rate": 9.617399717914589e-05, + "loss": 3.1484, + "step": 13712 + }, + { + "epoch": 0.6384524058942663, + "grad_norm": 0.38043368592740007, + "learning_rate": 9.617295791353686e-05, + "loss": 3.1724, + "step": 13713 + }, + { + "epoch": 0.6384989640803594, + "grad_norm": 0.3402665114644761, + "learning_rate": 9.617191851241477e-05, + "loss": 3.0698, + "step": 13714 + }, + { + "epoch": 0.6385455222664524, + "grad_norm": 0.3577275356154522, + "learning_rate": 9.617087897578262e-05, + "loss": 2.9314, + "step": 13715 + }, + { + "epoch": 0.6385920804525456, + "grad_norm": 0.3186381152291129, + "learning_rate": 9.616983930364349e-05, + "loss": 3.0354, + "step": 13716 + }, + { + "epoch": 0.6386386386386387, + "grad_norm": 0.36931685725724284, + "learning_rate": 9.616879949600045e-05, + "loss": 3.0033, + "step": 13717 + }, + { + "epoch": 0.6386851968247317, + "grad_norm": 0.32760782950803363, + "learning_rate": 9.616775955285653e-05, + "loss": 2.8998, + "step": 13718 + }, + { + "epoch": 0.6387317550108248, + "grad_norm": 0.4098312741740837, + "learning_rate": 9.616671947421476e-05, + "loss": 3.0494, + "step": 13719 + }, + { + "epoch": 0.6387783131969178, + "grad_norm": 0.3943580027937434, + "learning_rate": 9.616567926007825e-05, + "loss": 3.1374, + "step": 13720 + }, + { + "epoch": 0.6388248713830109, + "grad_norm": 0.34954821453811574, + "learning_rate": 9.616463891045e-05, + "loss": 3.0693, + "step": 13721 + }, + { + "epoch": 0.638871429569104, + "grad_norm": 0.39480657380805256, + 
"learning_rate": 9.616359842533308e-05, + "loss": 2.9217, + "step": 13722 + }, + { + "epoch": 0.638917987755197, + "grad_norm": 0.3651868284588991, + "learning_rate": 9.616255780473055e-05, + "loss": 2.9795, + "step": 13723 + }, + { + "epoch": 0.6389645459412902, + "grad_norm": 0.37925826547664, + "learning_rate": 9.616151704864547e-05, + "loss": 2.9808, + "step": 13724 + }, + { + "epoch": 0.6390111041273832, + "grad_norm": 0.3855020767179593, + "learning_rate": 9.616047615708087e-05, + "loss": 3.0425, + "step": 13725 + }, + { + "epoch": 0.6390576623134763, + "grad_norm": 0.3812850824838963, + "learning_rate": 9.615943513003983e-05, + "loss": 3.0757, + "step": 13726 + }, + { + "epoch": 0.6391042204995694, + "grad_norm": 0.3634414068349731, + "learning_rate": 9.61583939675254e-05, + "loss": 3.1421, + "step": 13727 + }, + { + "epoch": 0.6391507786856624, + "grad_norm": 0.42600160784104224, + "learning_rate": 9.61573526695406e-05, + "loss": 3.1405, + "step": 13728 + }, + { + "epoch": 0.6391973368717555, + "grad_norm": 0.33661607842966423, + "learning_rate": 9.615631123608854e-05, + "loss": 3.1558, + "step": 13729 + }, + { + "epoch": 0.6392438950578485, + "grad_norm": 0.39491804535805075, + "learning_rate": 9.615526966717228e-05, + "loss": 3.1264, + "step": 13730 + }, + { + "epoch": 0.6392904532439416, + "grad_norm": 0.3812695758476337, + "learning_rate": 9.61542279627948e-05, + "loss": 3.0055, + "step": 13731 + }, + { + "epoch": 0.6393370114300347, + "grad_norm": 0.3930910251032412, + "learning_rate": 9.615318612295925e-05, + "loss": 3.1007, + "step": 13732 + }, + { + "epoch": 0.6393835696161277, + "grad_norm": 0.36472457043169726, + "learning_rate": 9.615214414766862e-05, + "loss": 3.06, + "step": 13733 + }, + { + "epoch": 0.6394301278022209, + "grad_norm": 0.3581709676105013, + "learning_rate": 9.615110203692601e-05, + "loss": 2.9754, + "step": 13734 + }, + { + "epoch": 0.6394766859883139, + "grad_norm": 0.3627084122114685, + "learning_rate": 9.615005979073446e-05, + "loss": 3.0412, + "step": 13735 + }, + { + "epoch": 0.639523244174407, + "grad_norm": 0.3637193333058483, + "learning_rate": 9.614901740909702e-05, + "loss": 3.02, + "step": 13736 + }, + { + "epoch": 0.6395698023605, + "grad_norm": 0.34453215486877836, + "learning_rate": 9.614797489201676e-05, + "loss": 3.13, + "step": 13737 + }, + { + "epoch": 0.6396163605465931, + "grad_norm": 0.38054399842244374, + "learning_rate": 9.614693223949675e-05, + "loss": 3.2017, + "step": 13738 + }, + { + "epoch": 0.6396629187326862, + "grad_norm": 0.34677438765387464, + "learning_rate": 9.614588945154004e-05, + "loss": 3.0268, + "step": 13739 + }, + { + "epoch": 0.6397094769187792, + "grad_norm": 0.37563728511757793, + "learning_rate": 9.614484652814968e-05, + "loss": 2.9674, + "step": 13740 + }, + { + "epoch": 0.6397560351048723, + "grad_norm": 0.3516744027155795, + "learning_rate": 9.614380346932875e-05, + "loss": 3.051, + "step": 13741 + }, + { + "epoch": 0.6398025932909653, + "grad_norm": 0.37347089700278063, + "learning_rate": 9.614276027508031e-05, + "loss": 3.1324, + "step": 13742 + }, + { + "epoch": 0.6398491514770585, + "grad_norm": 0.3312379150120569, + "learning_rate": 9.61417169454074e-05, + "loss": 3.1207, + "step": 13743 + }, + { + "epoch": 0.6398957096631516, + "grad_norm": 0.35237729559250497, + "learning_rate": 9.61406734803131e-05, + "loss": 3.0396, + "step": 13744 + }, + { + "epoch": 0.6399422678492446, + "grad_norm": 0.33924732465665475, + "learning_rate": 9.613962987980048e-05, + "loss": 3.041, + "step": 13745 + }, + { + 
"epoch": 0.6399888260353377, + "grad_norm": 0.35333743237317905, + "learning_rate": 9.613858614387257e-05, + "loss": 3.1024, + "step": 13746 + }, + { + "epoch": 0.6400353842214307, + "grad_norm": 0.3664944918216688, + "learning_rate": 9.613754227253247e-05, + "loss": 2.9895, + "step": 13747 + }, + { + "epoch": 0.6400819424075238, + "grad_norm": 0.4157207153013495, + "learning_rate": 9.613649826578322e-05, + "loss": 3.0339, + "step": 13748 + }, + { + "epoch": 0.6401285005936169, + "grad_norm": 0.33418856582234996, + "learning_rate": 9.61354541236279e-05, + "loss": 2.9499, + "step": 13749 + }, + { + "epoch": 0.6401750587797099, + "grad_norm": 0.3556697144467833, + "learning_rate": 9.613440984606957e-05, + "loss": 3.0229, + "step": 13750 + }, + { + "epoch": 0.640221616965803, + "grad_norm": 0.3605898384048029, + "learning_rate": 9.613336543311128e-05, + "loss": 3.0785, + "step": 13751 + }, + { + "epoch": 0.640268175151896, + "grad_norm": 0.34154443529286427, + "learning_rate": 9.613232088475612e-05, + "loss": 3.1011, + "step": 13752 + }, + { + "epoch": 0.6403147333379892, + "grad_norm": 0.3644188597421734, + "learning_rate": 9.613127620100715e-05, + "loss": 3.058, + "step": 13753 + }, + { + "epoch": 0.6403612915240823, + "grad_norm": 0.37529483016858606, + "learning_rate": 9.61302313818674e-05, + "loss": 3.0411, + "step": 13754 + }, + { + "epoch": 0.6404078497101753, + "grad_norm": 0.3821413881153705, + "learning_rate": 9.612918642734e-05, + "loss": 3.1426, + "step": 13755 + }, + { + "epoch": 0.6404544078962684, + "grad_norm": 0.35319095428746616, + "learning_rate": 9.612814133742795e-05, + "loss": 3.0835, + "step": 13756 + }, + { + "epoch": 0.6405009660823614, + "grad_norm": 0.3562912801409972, + "learning_rate": 9.612709611213437e-05, + "loss": 2.9752, + "step": 13757 + }, + { + "epoch": 0.6405475242684545, + "grad_norm": 0.34082618366708295, + "learning_rate": 9.61260507514623e-05, + "loss": 2.9875, + "step": 13758 + }, + { + "epoch": 0.6405940824545475, + "grad_norm": 0.3588854776090243, + "learning_rate": 9.612500525541483e-05, + "loss": 3.0081, + "step": 13759 + }, + { + "epoch": 0.6406406406406406, + "grad_norm": 0.3332063011381492, + "learning_rate": 9.6123959623995e-05, + "loss": 3.0104, + "step": 13760 + }, + { + "epoch": 0.6406871988267338, + "grad_norm": 0.3929023679939398, + "learning_rate": 9.612291385720589e-05, + "loss": 3.0708, + "step": 13761 + }, + { + "epoch": 0.6407337570128268, + "grad_norm": 0.3319722049918703, + "learning_rate": 9.612186795505059e-05, + "loss": 3.0362, + "step": 13762 + }, + { + "epoch": 0.6407803151989199, + "grad_norm": 0.37025766281323297, + "learning_rate": 9.612082191753212e-05, + "loss": 3.1358, + "step": 13763 + }, + { + "epoch": 0.6408268733850129, + "grad_norm": 0.34889621078582206, + "learning_rate": 9.61197757446536e-05, + "loss": 3.0383, + "step": 13764 + }, + { + "epoch": 0.640873431571106, + "grad_norm": 0.33545711225969765, + "learning_rate": 9.611872943641809e-05, + "loss": 3.1332, + "step": 13765 + }, + { + "epoch": 0.6409199897571991, + "grad_norm": 0.36789903894476017, + "learning_rate": 9.611768299282864e-05, + "loss": 3.0996, + "step": 13766 + }, + { + "epoch": 0.6409665479432921, + "grad_norm": 0.31334755801417663, + "learning_rate": 9.611663641388833e-05, + "loss": 3.1537, + "step": 13767 + }, + { + "epoch": 0.6410131061293852, + "grad_norm": 0.3942954685349094, + "learning_rate": 9.611558969960024e-05, + "loss": 3.0851, + "step": 13768 + }, + { + "epoch": 0.6410596643154782, + "grad_norm": 0.35925700786209497, + "learning_rate": 
9.611454284996743e-05, + "loss": 3.1219, + "step": 13769 + }, + { + "epoch": 0.6411062225015713, + "grad_norm": 0.3510443461469753, + "learning_rate": 9.6113495864993e-05, + "loss": 3.0953, + "step": 13770 + }, + { + "epoch": 0.6411527806876645, + "grad_norm": 0.31777131905571376, + "learning_rate": 9.611244874468e-05, + "loss": 3.0609, + "step": 13771 + }, + { + "epoch": 0.6411993388737575, + "grad_norm": 0.35811170283429467, + "learning_rate": 9.611140148903149e-05, + "loss": 3.0376, + "step": 13772 + }, + { + "epoch": 0.6412458970598506, + "grad_norm": 0.34626596321302383, + "learning_rate": 9.611035409805056e-05, + "loss": 3.0396, + "step": 13773 + }, + { + "epoch": 0.6412924552459436, + "grad_norm": 0.3243633536672693, + "learning_rate": 9.610930657174028e-05, + "loss": 3.0411, + "step": 13774 + }, + { + "epoch": 0.6413390134320367, + "grad_norm": 0.34550476320526347, + "learning_rate": 9.610825891010376e-05, + "loss": 2.9716, + "step": 13775 + }, + { + "epoch": 0.6413855716181298, + "grad_norm": 0.30207770234842884, + "learning_rate": 9.610721111314401e-05, + "loss": 2.9856, + "step": 13776 + }, + { + "epoch": 0.6414321298042228, + "grad_norm": 0.33938032072059543, + "learning_rate": 9.610616318086414e-05, + "loss": 3.0151, + "step": 13777 + }, + { + "epoch": 0.6414786879903159, + "grad_norm": 0.3364842973842556, + "learning_rate": 9.610511511326723e-05, + "loss": 3.0569, + "step": 13778 + }, + { + "epoch": 0.6415252461764089, + "grad_norm": 0.3461090689094549, + "learning_rate": 9.610406691035637e-05, + "loss": 2.9684, + "step": 13779 + }, + { + "epoch": 0.641571804362502, + "grad_norm": 0.35627745743230105, + "learning_rate": 9.610301857213458e-05, + "loss": 3.0636, + "step": 13780 + }, + { + "epoch": 0.6416183625485951, + "grad_norm": 0.3352160446500294, + "learning_rate": 9.610197009860498e-05, + "loss": 3.0517, + "step": 13781 + }, + { + "epoch": 0.6416649207346882, + "grad_norm": 0.349142350868103, + "learning_rate": 9.610092148977066e-05, + "loss": 3.0813, + "step": 13782 + }, + { + "epoch": 0.6417114789207813, + "grad_norm": 0.4037468496486796, + "learning_rate": 9.609987274563466e-05, + "loss": 3.0976, + "step": 13783 + }, + { + "epoch": 0.6417580371068743, + "grad_norm": 0.36611934239608396, + "learning_rate": 9.609882386620009e-05, + "loss": 3.1407, + "step": 13784 + }, + { + "epoch": 0.6418045952929674, + "grad_norm": 0.3730164059135682, + "learning_rate": 9.609777485147e-05, + "loss": 3.128, + "step": 13785 + }, + { + "epoch": 0.6418511534790604, + "grad_norm": 0.3303201367779772, + "learning_rate": 9.609672570144747e-05, + "loss": 3.0522, + "step": 13786 + }, + { + "epoch": 0.6418977116651535, + "grad_norm": 0.3909001559705178, + "learning_rate": 9.609567641613562e-05, + "loss": 3.1155, + "step": 13787 + }, + { + "epoch": 0.6419442698512466, + "grad_norm": 0.36091200437338905, + "learning_rate": 9.609462699553749e-05, + "loss": 3.0613, + "step": 13788 + }, + { + "epoch": 0.6419908280373396, + "grad_norm": 0.34496917894584833, + "learning_rate": 9.609357743965617e-05, + "loss": 3.0606, + "step": 13789 + }, + { + "epoch": 0.6420373862234328, + "grad_norm": 0.3935090488496603, + "learning_rate": 9.609252774849473e-05, + "loss": 2.9466, + "step": 13790 + }, + { + "epoch": 0.6420839444095258, + "grad_norm": 0.39661726059041874, + "learning_rate": 9.609147792205627e-05, + "loss": 2.9879, + "step": 13791 + }, + { + "epoch": 0.6421305025956189, + "grad_norm": 0.39594950108255855, + "learning_rate": 9.609042796034386e-05, + "loss": 3.1379, + "step": 13792 + }, + { + "epoch": 
0.642177060781712, + "grad_norm": 0.4600736645995758, + "learning_rate": 9.608937786336059e-05, + "loss": 3.1621, + "step": 13793 + }, + { + "epoch": 0.642223618967805, + "grad_norm": 0.4145102471601302, + "learning_rate": 9.608832763110954e-05, + "loss": 3.0485, + "step": 13794 + }, + { + "epoch": 0.6422701771538981, + "grad_norm": 0.3645010066321044, + "learning_rate": 9.608727726359379e-05, + "loss": 3.0215, + "step": 13795 + }, + { + "epoch": 0.6423167353399911, + "grad_norm": 0.39092586879560676, + "learning_rate": 9.608622676081642e-05, + "loss": 3.0733, + "step": 13796 + }, + { + "epoch": 0.6423632935260842, + "grad_norm": 0.3681219022262395, + "learning_rate": 9.60851761227805e-05, + "loss": 3.0738, + "step": 13797 + }, + { + "epoch": 0.6424098517121773, + "grad_norm": 0.33963559205443106, + "learning_rate": 9.608412534948914e-05, + "loss": 2.9767, + "step": 13798 + }, + { + "epoch": 0.6424564098982704, + "grad_norm": 0.4188529591270649, + "learning_rate": 9.608307444094541e-05, + "loss": 2.9857, + "step": 13799 + }, + { + "epoch": 0.6425029680843635, + "grad_norm": 0.3700940322708051, + "learning_rate": 9.60820233971524e-05, + "loss": 2.9466, + "step": 13800 + }, + { + "epoch": 0.6425495262704565, + "grad_norm": 0.3739490828278512, + "learning_rate": 9.60809722181132e-05, + "loss": 3.0372, + "step": 13801 + }, + { + "epoch": 0.6425960844565496, + "grad_norm": 0.4287807181089634, + "learning_rate": 9.607992090383087e-05, + "loss": 3.1149, + "step": 13802 + }, + { + "epoch": 0.6426426426426426, + "grad_norm": 0.3511054485226234, + "learning_rate": 9.607886945430851e-05, + "loss": 3.0302, + "step": 13803 + }, + { + "epoch": 0.6426892008287357, + "grad_norm": 0.3710653034842304, + "learning_rate": 9.607781786954921e-05, + "loss": 2.9963, + "step": 13804 + }, + { + "epoch": 0.6427357590148288, + "grad_norm": 0.4173041724222703, + "learning_rate": 9.607676614955608e-05, + "loss": 3.0472, + "step": 13805 + }, + { + "epoch": 0.6427823172009218, + "grad_norm": 0.3555141373395867, + "learning_rate": 9.607571429433216e-05, + "loss": 2.9566, + "step": 13806 + }, + { + "epoch": 0.6428288753870149, + "grad_norm": 0.3827938088216519, + "learning_rate": 9.607466230388055e-05, + "loss": 3.0215, + "step": 13807 + }, + { + "epoch": 0.642875433573108, + "grad_norm": 0.36601115631128883, + "learning_rate": 9.607361017820435e-05, + "loss": 3.0835, + "step": 13808 + }, + { + "epoch": 0.6429219917592011, + "grad_norm": 0.36349537992499376, + "learning_rate": 9.607255791730664e-05, + "loss": 3.0996, + "step": 13809 + }, + { + "epoch": 0.6429685499452942, + "grad_norm": 0.35400046715970873, + "learning_rate": 9.607150552119052e-05, + "loss": 2.9319, + "step": 13810 + }, + { + "epoch": 0.6430151081313872, + "grad_norm": 0.3698610227553313, + "learning_rate": 9.607045298985906e-05, + "loss": 3.1038, + "step": 13811 + }, + { + "epoch": 0.6430616663174803, + "grad_norm": 0.3602090063655871, + "learning_rate": 9.606940032331537e-05, + "loss": 3.0668, + "step": 13812 + }, + { + "epoch": 0.6431082245035733, + "grad_norm": 0.38442151391617707, + "learning_rate": 9.606834752156253e-05, + "loss": 3.0635, + "step": 13813 + }, + { + "epoch": 0.6431547826896664, + "grad_norm": 0.37119010316275625, + "learning_rate": 9.606729458460362e-05, + "loss": 2.8927, + "step": 13814 + }, + { + "epoch": 0.6432013408757595, + "grad_norm": 0.3455868288304982, + "learning_rate": 9.606624151244174e-05, + "loss": 2.9809, + "step": 13815 + }, + { + "epoch": 0.6432478990618525, + "grad_norm": 0.377402755787043, + "learning_rate": 
9.606518830507998e-05, + "loss": 2.9226, + "step": 13816 + }, + { + "epoch": 0.6432944572479457, + "grad_norm": 0.35745887545466765, + "learning_rate": 9.606413496252144e-05, + "loss": 3.0229, + "step": 13817 + }, + { + "epoch": 0.6433410154340387, + "grad_norm": 0.4072509938067502, + "learning_rate": 9.606308148476919e-05, + "loss": 3.1016, + "step": 13818 + }, + { + "epoch": 0.6433875736201318, + "grad_norm": 0.43857335395678615, + "learning_rate": 9.606202787182634e-05, + "loss": 3.0113, + "step": 13819 + }, + { + "epoch": 0.6434341318062249, + "grad_norm": 0.38953482890959806, + "learning_rate": 9.606097412369597e-05, + "loss": 3.0585, + "step": 13820 + }, + { + "epoch": 0.6434806899923179, + "grad_norm": 0.3841521386206034, + "learning_rate": 9.605992024038118e-05, + "loss": 2.9252, + "step": 13821 + }, + { + "epoch": 0.643527248178411, + "grad_norm": 0.39232145806806346, + "learning_rate": 9.605886622188507e-05, + "loss": 3.0238, + "step": 13822 + }, + { + "epoch": 0.643573806364504, + "grad_norm": 0.42033948605264, + "learning_rate": 9.605781206821072e-05, + "loss": 3.0388, + "step": 13823 + }, + { + "epoch": 0.6436203645505971, + "grad_norm": 0.3673202622283755, + "learning_rate": 9.605675777936122e-05, + "loss": 3.0982, + "step": 13824 + }, + { + "epoch": 0.6436669227366901, + "grad_norm": 0.4255149182945009, + "learning_rate": 9.605570335533968e-05, + "loss": 2.9567, + "step": 13825 + }, + { + "epoch": 0.6437134809227832, + "grad_norm": 0.3347433095517248, + "learning_rate": 9.605464879614921e-05, + "loss": 3.0539, + "step": 13826 + }, + { + "epoch": 0.6437600391088764, + "grad_norm": 0.42262941054985514, + "learning_rate": 9.605359410179284e-05, + "loss": 3.0827, + "step": 13827 + }, + { + "epoch": 0.6438065972949694, + "grad_norm": 0.41606596905764043, + "learning_rate": 9.605253927227375e-05, + "loss": 3.0741, + "step": 13828 + }, + { + "epoch": 0.6438531554810625, + "grad_norm": 0.38451013285407454, + "learning_rate": 9.605148430759497e-05, + "loss": 3.0043, + "step": 13829 + }, + { + "epoch": 0.6438997136671555, + "grad_norm": 0.37563305442298334, + "learning_rate": 9.605042920775961e-05, + "loss": 2.962, + "step": 13830 + }, + { + "epoch": 0.6439462718532486, + "grad_norm": 0.3780545075326643, + "learning_rate": 9.60493739727708e-05, + "loss": 3.0713, + "step": 13831 + }, + { + "epoch": 0.6439928300393417, + "grad_norm": 0.4022013894222988, + "learning_rate": 9.604831860263162e-05, + "loss": 3.0447, + "step": 13832 + }, + { + "epoch": 0.6440393882254347, + "grad_norm": 0.34986129915027536, + "learning_rate": 9.604726309734514e-05, + "loss": 3.0024, + "step": 13833 + }, + { + "epoch": 0.6440859464115278, + "grad_norm": 0.41216646573760884, + "learning_rate": 9.604620745691449e-05, + "loss": 3.0756, + "step": 13834 + }, + { + "epoch": 0.6441325045976208, + "grad_norm": 0.35164928270082235, + "learning_rate": 9.604515168134276e-05, + "loss": 3.1278, + "step": 13835 + }, + { + "epoch": 0.644179062783714, + "grad_norm": 0.39024725795197995, + "learning_rate": 9.604409577063302e-05, + "loss": 3.1104, + "step": 13836 + }, + { + "epoch": 0.6442256209698071, + "grad_norm": 0.3985281267627247, + "learning_rate": 9.604303972478843e-05, + "loss": 3.0945, + "step": 13837 + }, + { + "epoch": 0.6442721791559001, + "grad_norm": 0.3250749452958431, + "learning_rate": 9.604198354381204e-05, + "loss": 2.9689, + "step": 13838 + }, + { + "epoch": 0.6443187373419932, + "grad_norm": 0.3915599313341342, + "learning_rate": 9.604092722770696e-05, + "loss": 3.1231, + "step": 13839 + }, + { + "epoch": 
0.6443652955280862, + "grad_norm": 0.33042696532244276, + "learning_rate": 9.60398707764763e-05, + "loss": 3.0589, + "step": 13840 + }, + { + "epoch": 0.6444118537141793, + "grad_norm": 0.35787660067172344, + "learning_rate": 9.603881419012315e-05, + "loss": 3.0122, + "step": 13841 + }, + { + "epoch": 0.6444584119002723, + "grad_norm": 0.37432373902643684, + "learning_rate": 9.603775746865063e-05, + "loss": 3.0606, + "step": 13842 + }, + { + "epoch": 0.6445049700863654, + "grad_norm": 0.3455004506220734, + "learning_rate": 9.60367006120618e-05, + "loss": 3.0625, + "step": 13843 + }, + { + "epoch": 0.6445515282724585, + "grad_norm": 0.3953636801971265, + "learning_rate": 9.603564362035982e-05, + "loss": 3.1135, + "step": 13844 + }, + { + "epoch": 0.6445980864585515, + "grad_norm": 0.367196984290969, + "learning_rate": 9.603458649354775e-05, + "loss": 3.0678, + "step": 13845 + }, + { + "epoch": 0.6446446446446447, + "grad_norm": 0.37325956752810424, + "learning_rate": 9.603352923162869e-05, + "loss": 3.0119, + "step": 13846 + }, + { + "epoch": 0.6446912028307377, + "grad_norm": 0.4097466563808948, + "learning_rate": 9.603247183460577e-05, + "loss": 2.9468, + "step": 13847 + }, + { + "epoch": 0.6447377610168308, + "grad_norm": 0.3668095899211892, + "learning_rate": 9.603141430248208e-05, + "loss": 3.1429, + "step": 13848 + }, + { + "epoch": 0.6447843192029239, + "grad_norm": 0.41015150949376566, + "learning_rate": 9.603035663526073e-05, + "loss": 3.038, + "step": 13849 + }, + { + "epoch": 0.6448308773890169, + "grad_norm": 0.3989371017447099, + "learning_rate": 9.60292988329448e-05, + "loss": 3.0279, + "step": 13850 + }, + { + "epoch": 0.64487743557511, + "grad_norm": 0.5786308080187564, + "learning_rate": 9.602824089553743e-05, + "loss": 3.0492, + "step": 13851 + }, + { + "epoch": 0.644923993761203, + "grad_norm": 0.4359215618008529, + "learning_rate": 9.602718282304168e-05, + "loss": 3.0782, + "step": 13852 + }, + { + "epoch": 0.6449705519472961, + "grad_norm": 0.39597885284563167, + "learning_rate": 9.602612461546072e-05, + "loss": 3.0196, + "step": 13853 + }, + { + "epoch": 0.6450171101333892, + "grad_norm": 0.38352652987164826, + "learning_rate": 9.60250662727976e-05, + "loss": 3.0114, + "step": 13854 + }, + { + "epoch": 0.6450636683194823, + "grad_norm": 0.3756997940919526, + "learning_rate": 9.602400779505545e-05, + "loss": 3.0568, + "step": 13855 + }, + { + "epoch": 0.6451102265055754, + "grad_norm": 0.39422051759357585, + "learning_rate": 9.602294918223735e-05, + "loss": 3.0785, + "step": 13856 + }, + { + "epoch": 0.6451567846916684, + "grad_norm": 0.4139304788084287, + "learning_rate": 9.602189043434645e-05, + "loss": 3.0251, + "step": 13857 + }, + { + "epoch": 0.6452033428777615, + "grad_norm": 0.3547135986855586, + "learning_rate": 9.602083155138583e-05, + "loss": 3.1177, + "step": 13858 + }, + { + "epoch": 0.6452499010638546, + "grad_norm": 0.39268879847224397, + "learning_rate": 9.60197725333586e-05, + "loss": 2.9442, + "step": 13859 + }, + { + "epoch": 0.6452964592499476, + "grad_norm": 0.33895220495810663, + "learning_rate": 9.601871338026789e-05, + "loss": 3.0864, + "step": 13860 + }, + { + "epoch": 0.6453430174360407, + "grad_norm": 0.3793797383801255, + "learning_rate": 9.601765409211677e-05, + "loss": 3.0424, + "step": 13861 + }, + { + "epoch": 0.6453895756221337, + "grad_norm": 0.38160486534143956, + "learning_rate": 9.601659466890838e-05, + "loss": 2.8978, + "step": 13862 + }, + { + "epoch": 0.6454361338082268, + "grad_norm": 0.37094080704311555, + "learning_rate": 
9.60155351106458e-05, + "loss": 2.9561, + "step": 13863 + }, + { + "epoch": 0.6454826919943198, + "grad_norm": 0.4171809025453907, + "learning_rate": 9.601447541733218e-05, + "loss": 3.1131, + "step": 13864 + }, + { + "epoch": 0.645529250180413, + "grad_norm": 0.3814957287210803, + "learning_rate": 9.601341558897059e-05, + "loss": 3.1127, + "step": 13865 + }, + { + "epoch": 0.6455758083665061, + "grad_norm": 0.3724432945290391, + "learning_rate": 9.601235562556417e-05, + "loss": 3.0716, + "step": 13866 + }, + { + "epoch": 0.6456223665525991, + "grad_norm": 0.34054603435312303, + "learning_rate": 9.6011295527116e-05, + "loss": 3.015, + "step": 13867 + }, + { + "epoch": 0.6456689247386922, + "grad_norm": 0.4188837401745727, + "learning_rate": 9.601023529362924e-05, + "loss": 2.9561, + "step": 13868 + }, + { + "epoch": 0.6457154829247852, + "grad_norm": 0.32623022248094924, + "learning_rate": 9.600917492510695e-05, + "loss": 3.0102, + "step": 13869 + }, + { + "epoch": 0.6457620411108783, + "grad_norm": 0.3834457392855829, + "learning_rate": 9.600811442155228e-05, + "loss": 3.0238, + "step": 13870 + }, + { + "epoch": 0.6458085992969714, + "grad_norm": 0.42214943993884485, + "learning_rate": 9.600705378296832e-05, + "loss": 2.9693, + "step": 13871 + }, + { + "epoch": 0.6458551574830644, + "grad_norm": 0.3484998835836475, + "learning_rate": 9.600599300935817e-05, + "loss": 3.0398, + "step": 13872 + }, + { + "epoch": 0.6459017156691576, + "grad_norm": 0.37893822802197824, + "learning_rate": 9.600493210072498e-05, + "loss": 3.1163, + "step": 13873 + }, + { + "epoch": 0.6459482738552506, + "grad_norm": 0.44161618961559473, + "learning_rate": 9.600387105707187e-05, + "loss": 3.0126, + "step": 13874 + }, + { + "epoch": 0.6459948320413437, + "grad_norm": 0.38239191572863224, + "learning_rate": 9.60028098784019e-05, + "loss": 3.0995, + "step": 13875 + }, + { + "epoch": 0.6460413902274368, + "grad_norm": 0.38858463394667786, + "learning_rate": 9.600174856471822e-05, + "loss": 3.0436, + "step": 13876 + }, + { + "epoch": 0.6460879484135298, + "grad_norm": 0.3560334212810289, + "learning_rate": 9.600068711602395e-05, + "loss": 3.0337, + "step": 13877 + }, + { + "epoch": 0.6461345065996229, + "grad_norm": 0.390859916700294, + "learning_rate": 9.599962553232218e-05, + "loss": 2.9945, + "step": 13878 + }, + { + "epoch": 0.6461810647857159, + "grad_norm": 0.3611967410270141, + "learning_rate": 9.599856381361605e-05, + "loss": 3.0331, + "step": 13879 + }, + { + "epoch": 0.646227622971809, + "grad_norm": 0.3910368256154467, + "learning_rate": 9.599750195990867e-05, + "loss": 3.0279, + "step": 13880 + }, + { + "epoch": 0.6462741811579021, + "grad_norm": 0.31794393907496177, + "learning_rate": 9.599643997120315e-05, + "loss": 3.0003, + "step": 13881 + }, + { + "epoch": 0.6463207393439951, + "grad_norm": 0.3771895597345637, + "learning_rate": 9.599537784750261e-05, + "loss": 3.1945, + "step": 13882 + }, + { + "epoch": 0.6463672975300883, + "grad_norm": 0.37162690835895723, + "learning_rate": 9.599431558881016e-05, + "loss": 3.1074, + "step": 13883 + }, + { + "epoch": 0.6464138557161813, + "grad_norm": 0.3724340472915544, + "learning_rate": 9.599325319512893e-05, + "loss": 3.0196, + "step": 13884 + }, + { + "epoch": 0.6464604139022744, + "grad_norm": 0.36963363306730057, + "learning_rate": 9.599219066646203e-05, + "loss": 3.0922, + "step": 13885 + }, + { + "epoch": 0.6465069720883674, + "grad_norm": 0.3436747264502671, + "learning_rate": 9.59911280028126e-05, + "loss": 2.9088, + "step": 13886 + }, + { + "epoch": 
0.6465535302744605, + "grad_norm": 0.37705049328778917, + "learning_rate": 9.599006520418372e-05, + "loss": 3.091, + "step": 13887 + }, + { + "epoch": 0.6466000884605536, + "grad_norm": 0.39935970393306214, + "learning_rate": 9.598900227057855e-05, + "loss": 3.0386, + "step": 13888 + }, + { + "epoch": 0.6466466466466466, + "grad_norm": 0.354363793805196, + "learning_rate": 9.598793920200015e-05, + "loss": 2.9866, + "step": 13889 + }, + { + "epoch": 0.6466932048327397, + "grad_norm": 0.36034621905506714, + "learning_rate": 9.598687599845171e-05, + "loss": 3.0489, + "step": 13890 + }, + { + "epoch": 0.6467397630188327, + "grad_norm": 0.3694148201575206, + "learning_rate": 9.598581265993632e-05, + "loss": 3.1306, + "step": 13891 + }, + { + "epoch": 0.6467863212049259, + "grad_norm": 0.36030332239781815, + "learning_rate": 9.598474918645709e-05, + "loss": 3.132, + "step": 13892 + }, + { + "epoch": 0.646832879391019, + "grad_norm": 0.3556870663943157, + "learning_rate": 9.598368557801716e-05, + "loss": 3.0588, + "step": 13893 + }, + { + "epoch": 0.646879437577112, + "grad_norm": 0.38199191032361673, + "learning_rate": 9.598262183461963e-05, + "loss": 3.0873, + "step": 13894 + }, + { + "epoch": 0.6469259957632051, + "grad_norm": 0.3489461561447616, + "learning_rate": 9.598155795626764e-05, + "loss": 3.0863, + "step": 13895 + }, + { + "epoch": 0.6469725539492981, + "grad_norm": 0.36795405965071387, + "learning_rate": 9.598049394296432e-05, + "loss": 3.1508, + "step": 13896 + }, + { + "epoch": 0.6470191121353912, + "grad_norm": 0.3440745316542783, + "learning_rate": 9.597942979471276e-05, + "loss": 3.0929, + "step": 13897 + }, + { + "epoch": 0.6470656703214843, + "grad_norm": 0.3413153252822182, + "learning_rate": 9.597836551151611e-05, + "loss": 3.0631, + "step": 13898 + }, + { + "epoch": 0.6471122285075773, + "grad_norm": 0.2987701096348223, + "learning_rate": 9.59773010933775e-05, + "loss": 3.0062, + "step": 13899 + }, + { + "epoch": 0.6471587866936704, + "grad_norm": 0.3618391440111714, + "learning_rate": 9.597623654030002e-05, + "loss": 3.0386, + "step": 13900 + }, + { + "epoch": 0.6472053448797634, + "grad_norm": 0.3786850893495207, + "learning_rate": 9.597517185228684e-05, + "loss": 3.0741, + "step": 13901 + }, + { + "epoch": 0.6472519030658566, + "grad_norm": 0.34442983545479466, + "learning_rate": 9.597410702934104e-05, + "loss": 3.0721, + "step": 13902 + }, + { + "epoch": 0.6472984612519497, + "grad_norm": 0.3697594434881219, + "learning_rate": 9.597304207146577e-05, + "loss": 3.1095, + "step": 13903 + }, + { + "epoch": 0.6473450194380427, + "grad_norm": 0.3877442525434675, + "learning_rate": 9.597197697866414e-05, + "loss": 3.1407, + "step": 13904 + }, + { + "epoch": 0.6473915776241358, + "grad_norm": 0.3714953584772458, + "learning_rate": 9.59709117509393e-05, + "loss": 3.024, + "step": 13905 + }, + { + "epoch": 0.6474381358102288, + "grad_norm": 0.3188804711973359, + "learning_rate": 9.596984638829436e-05, + "loss": 3.0201, + "step": 13906 + }, + { + "epoch": 0.6474846939963219, + "grad_norm": 0.345440565693314, + "learning_rate": 9.596878089073246e-05, + "loss": 2.9977, + "step": 13907 + }, + { + "epoch": 0.6475312521824149, + "grad_norm": 0.3588053608436, + "learning_rate": 9.596771525825671e-05, + "loss": 2.9221, + "step": 13908 + }, + { + "epoch": 0.647577810368508, + "grad_norm": 0.3570521068963387, + "learning_rate": 9.596664949087025e-05, + "loss": 2.9987, + "step": 13909 + }, + { + "epoch": 0.6476243685546011, + "grad_norm": 0.3249228305308951, + "learning_rate": 
9.596558358857618e-05, + "loss": 2.9656, + "step": 13910 + }, + { + "epoch": 0.6476709267406942, + "grad_norm": 0.4082476093168734, + "learning_rate": 9.596451755137767e-05, + "loss": 3.1264, + "step": 13911 + }, + { + "epoch": 0.6477174849267873, + "grad_norm": 0.4336812984151078, + "learning_rate": 9.596345137927783e-05, + "loss": 3.0372, + "step": 13912 + }, + { + "epoch": 0.6477640431128803, + "grad_norm": 0.36046520188008746, + "learning_rate": 9.596238507227977e-05, + "loss": 3.0584, + "step": 13913 + }, + { + "epoch": 0.6478106012989734, + "grad_norm": 0.34726671517873875, + "learning_rate": 9.596131863038664e-05, + "loss": 3.051, + "step": 13914 + }, + { + "epoch": 0.6478571594850665, + "grad_norm": 0.38532894502984627, + "learning_rate": 9.596025205360159e-05, + "loss": 3.1013, + "step": 13915 + }, + { + "epoch": 0.6479037176711595, + "grad_norm": 0.33502213057902885, + "learning_rate": 9.595918534192771e-05, + "loss": 3.091, + "step": 13916 + }, + { + "epoch": 0.6479502758572526, + "grad_norm": 0.3787512595905543, + "learning_rate": 9.595811849536816e-05, + "loss": 3.0283, + "step": 13917 + }, + { + "epoch": 0.6479968340433456, + "grad_norm": 0.42060288562344705, + "learning_rate": 9.595705151392605e-05, + "loss": 2.9551, + "step": 13918 + }, + { + "epoch": 0.6480433922294387, + "grad_norm": 0.36444082955433693, + "learning_rate": 9.595598439760453e-05, + "loss": 2.9036, + "step": 13919 + }, + { + "epoch": 0.6480899504155319, + "grad_norm": 0.4447666449732717, + "learning_rate": 9.595491714640672e-05, + "loss": 3.0224, + "step": 13920 + }, + { + "epoch": 0.6481365086016249, + "grad_norm": 0.376287880283693, + "learning_rate": 9.595384976033574e-05, + "loss": 3.0331, + "step": 13921 + }, + { + "epoch": 0.648183066787718, + "grad_norm": 0.36994354370248206, + "learning_rate": 9.595278223939476e-05, + "loss": 2.9903, + "step": 13922 + }, + { + "epoch": 0.648229624973811, + "grad_norm": 0.4099462033454083, + "learning_rate": 9.59517145835869e-05, + "loss": 3.0555, + "step": 13923 + }, + { + "epoch": 0.6482761831599041, + "grad_norm": 0.3874144009617404, + "learning_rate": 9.595064679291526e-05, + "loss": 3.0056, + "step": 13924 + }, + { + "epoch": 0.6483227413459972, + "grad_norm": 0.37178817649538753, + "learning_rate": 9.594957886738301e-05, + "loss": 3.0752, + "step": 13925 + }, + { + "epoch": 0.6483692995320902, + "grad_norm": 0.3663391682060872, + "learning_rate": 9.594851080699326e-05, + "loss": 3.0917, + "step": 13926 + }, + { + "epoch": 0.6484158577181833, + "grad_norm": 0.3525506136464662, + "learning_rate": 9.594744261174917e-05, + "loss": 2.9906, + "step": 13927 + }, + { + "epoch": 0.6484624159042763, + "grad_norm": 0.382938639952541, + "learning_rate": 9.594637428165387e-05, + "loss": 3.1138, + "step": 13928 + }, + { + "epoch": 0.6485089740903694, + "grad_norm": 0.3832335603734359, + "learning_rate": 9.594530581671046e-05, + "loss": 2.9911, + "step": 13929 + }, + { + "epoch": 0.6485555322764625, + "grad_norm": 0.3299549319086674, + "learning_rate": 9.594423721692213e-05, + "loss": 3.0466, + "step": 13930 + }, + { + "epoch": 0.6486020904625556, + "grad_norm": 0.3763379898313056, + "learning_rate": 9.594316848229197e-05, + "loss": 3.1232, + "step": 13931 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.41396270291968057, + "learning_rate": 9.594209961282315e-05, + "loss": 3.0568, + "step": 13932 + }, + { + "epoch": 0.6486952068347417, + "grad_norm": 0.383797455843102, + "learning_rate": 9.594103060851881e-05, + "loss": 3.0644, + "step": 13933 + }, + { + "epoch": 
0.6487417650208348, + "grad_norm": 0.37239566169614907, + "learning_rate": 9.593996146938205e-05, + "loss": 3.0741, + "step": 13934 + }, + { + "epoch": 0.6487883232069278, + "grad_norm": 0.3478871659529496, + "learning_rate": 9.593889219541603e-05, + "loss": 2.9708, + "step": 13935 + }, + { + "epoch": 0.6488348813930209, + "grad_norm": 0.3707081044981345, + "learning_rate": 9.59378227866239e-05, + "loss": 3.151, + "step": 13936 + }, + { + "epoch": 0.648881439579114, + "grad_norm": 0.37984110974326796, + "learning_rate": 9.593675324300876e-05, + "loss": 3.0208, + "step": 13937 + }, + { + "epoch": 0.648927997765207, + "grad_norm": 0.32176937617377405, + "learning_rate": 9.593568356457378e-05, + "loss": 3.0986, + "step": 13938 + }, + { + "epoch": 0.6489745559513002, + "grad_norm": 0.35318684781145054, + "learning_rate": 9.593461375132211e-05, + "loss": 2.9923, + "step": 13939 + }, + { + "epoch": 0.6490211141373932, + "grad_norm": 0.35189504159113966, + "learning_rate": 9.593354380325686e-05, + "loss": 3.0116, + "step": 13940 + }, + { + "epoch": 0.6490676723234863, + "grad_norm": 0.3860280559579407, + "learning_rate": 9.59324737203812e-05, + "loss": 2.9426, + "step": 13941 + }, + { + "epoch": 0.6491142305095794, + "grad_norm": 0.3453597816670076, + "learning_rate": 9.593140350269824e-05, + "loss": 2.9993, + "step": 13942 + }, + { + "epoch": 0.6491607886956724, + "grad_norm": 0.4611951259856086, + "learning_rate": 9.593033315021115e-05, + "loss": 3.0566, + "step": 13943 + }, + { + "epoch": 0.6492073468817655, + "grad_norm": 0.4068166169304656, + "learning_rate": 9.592926266292305e-05, + "loss": 3.0975, + "step": 13944 + }, + { + "epoch": 0.6492539050678585, + "grad_norm": 0.3770464643801561, + "learning_rate": 9.592819204083708e-05, + "loss": 3.164, + "step": 13945 + }, + { + "epoch": 0.6493004632539516, + "grad_norm": 0.4018046301640362, + "learning_rate": 9.59271212839564e-05, + "loss": 2.9874, + "step": 13946 + }, + { + "epoch": 0.6493470214400447, + "grad_norm": 0.3366280983209337, + "learning_rate": 9.592605039228415e-05, + "loss": 3.0236, + "step": 13947 + }, + { + "epoch": 0.6493935796261378, + "grad_norm": 0.36183392097447437, + "learning_rate": 9.592497936582345e-05, + "loss": 3.0538, + "step": 13948 + }, + { + "epoch": 0.6494401378122309, + "grad_norm": 0.39523420099636497, + "learning_rate": 9.592390820457747e-05, + "loss": 3.164, + "step": 13949 + }, + { + "epoch": 0.6494866959983239, + "grad_norm": 0.38166861596934815, + "learning_rate": 9.592283690854935e-05, + "loss": 2.9685, + "step": 13950 + }, + { + "epoch": 0.649533254184417, + "grad_norm": 0.36287943112767884, + "learning_rate": 9.592176547774222e-05, + "loss": 3.0043, + "step": 13951 + }, + { + "epoch": 0.64957981237051, + "grad_norm": 0.39207923115681304, + "learning_rate": 9.592069391215924e-05, + "loss": 3.0794, + "step": 13952 + }, + { + "epoch": 0.6496263705566031, + "grad_norm": 0.3544543412054868, + "learning_rate": 9.591962221180353e-05, + "loss": 2.8761, + "step": 13953 + }, + { + "epoch": 0.6496729287426962, + "grad_norm": 0.3589961540985598, + "learning_rate": 9.591855037667826e-05, + "loss": 2.9008, + "step": 13954 + }, + { + "epoch": 0.6497194869287892, + "grad_norm": 0.4002466908937853, + "learning_rate": 9.591747840678658e-05, + "loss": 3.019, + "step": 13955 + }, + { + "epoch": 0.6497660451148823, + "grad_norm": 0.366709886484404, + "learning_rate": 9.591640630213161e-05, + "loss": 3.0077, + "step": 13956 + }, + { + "epoch": 0.6498126033009753, + "grad_norm": 0.3816586181666946, + "learning_rate": 
9.591533406271653e-05, + "loss": 3.0803, + "step": 13957 + }, + { + "epoch": 0.6498591614870685, + "grad_norm": 0.39878087763768777, + "learning_rate": 9.591426168854445e-05, + "loss": 3.0706, + "step": 13958 + }, + { + "epoch": 0.6499057196731616, + "grad_norm": 0.3727853546620882, + "learning_rate": 9.591318917961853e-05, + "loss": 3.15, + "step": 13959 + }, + { + "epoch": 0.6499522778592546, + "grad_norm": 0.35830916267823226, + "learning_rate": 9.591211653594195e-05, + "loss": 3.0353, + "step": 13960 + }, + { + "epoch": 0.6499988360453477, + "grad_norm": 0.40525817463231023, + "learning_rate": 9.59110437575178e-05, + "loss": 3.007, + "step": 13961 + }, + { + "epoch": 0.6500453942314407, + "grad_norm": 0.4424421082300599, + "learning_rate": 9.590997084434927e-05, + "loss": 3.0431, + "step": 13962 + }, + { + "epoch": 0.6500919524175338, + "grad_norm": 0.3343427730976508, + "learning_rate": 9.59088977964395e-05, + "loss": 3.0142, + "step": 13963 + }, + { + "epoch": 0.6501385106036269, + "grad_norm": 0.37631540196521945, + "learning_rate": 9.590782461379164e-05, + "loss": 3.0635, + "step": 13964 + }, + { + "epoch": 0.6501850687897199, + "grad_norm": 0.3684706854568871, + "learning_rate": 9.590675129640885e-05, + "loss": 2.9789, + "step": 13965 + }, + { + "epoch": 0.650231626975813, + "grad_norm": 0.34888578047198926, + "learning_rate": 9.590567784429426e-05, + "loss": 3.0337, + "step": 13966 + }, + { + "epoch": 0.650278185161906, + "grad_norm": 0.40993194222793194, + "learning_rate": 9.5904604257451e-05, + "loss": 3.0005, + "step": 13967 + }, + { + "epoch": 0.6503247433479992, + "grad_norm": 0.35978578986983656, + "learning_rate": 9.590353053588228e-05, + "loss": 3.1264, + "step": 13968 + }, + { + "epoch": 0.6503713015340923, + "grad_norm": 0.36965800241468905, + "learning_rate": 9.590245667959122e-05, + "loss": 3.0826, + "step": 13969 + }, + { + "epoch": 0.6504178597201853, + "grad_norm": 0.3899052217242201, + "learning_rate": 9.590138268858095e-05, + "loss": 3.0263, + "step": 13970 + }, + { + "epoch": 0.6504644179062784, + "grad_norm": 0.3401092530457864, + "learning_rate": 9.590030856285467e-05, + "loss": 2.942, + "step": 13971 + }, + { + "epoch": 0.6505109760923714, + "grad_norm": 0.38754999423702474, + "learning_rate": 9.58992343024155e-05, + "loss": 3.1083, + "step": 13972 + }, + { + "epoch": 0.6505575342784645, + "grad_norm": 0.40930647920154634, + "learning_rate": 9.589815990726658e-05, + "loss": 3.133, + "step": 13973 + }, + { + "epoch": 0.6506040924645575, + "grad_norm": 0.37345107622286844, + "learning_rate": 9.58970853774111e-05, + "loss": 3.0238, + "step": 13974 + }, + { + "epoch": 0.6506506506506506, + "grad_norm": 0.39433558447155564, + "learning_rate": 9.589601071285219e-05, + "loss": 3.0081, + "step": 13975 + }, + { + "epoch": 0.6506972088367438, + "grad_norm": 0.4220194644628874, + "learning_rate": 9.5894935913593e-05, + "loss": 3.0934, + "step": 13976 + }, + { + "epoch": 0.6507437670228368, + "grad_norm": 0.3399318351127683, + "learning_rate": 9.589386097963672e-05, + "loss": 3.0389, + "step": 13977 + }, + { + "epoch": 0.6507903252089299, + "grad_norm": 0.3814672974226289, + "learning_rate": 9.589278591098646e-05, + "loss": 3.0116, + "step": 13978 + }, + { + "epoch": 0.6508368833950229, + "grad_norm": 0.3754910882981123, + "learning_rate": 9.58917107076454e-05, + "loss": 3.0242, + "step": 13979 + }, + { + "epoch": 0.650883441581116, + "grad_norm": 0.43180469292085544, + "learning_rate": 9.589063536961669e-05, + "loss": 3.1057, + "step": 13980 + }, + { + "epoch": 
0.6509299997672091, + "grad_norm": 0.39507976308102416, + "learning_rate": 9.588955989690349e-05, + "loss": 2.9757, + "step": 13981 + }, + { + "epoch": 0.6509765579533021, + "grad_norm": 0.4008995297710753, + "learning_rate": 9.588848428950894e-05, + "loss": 3.0589, + "step": 13982 + }, + { + "epoch": 0.6510231161393952, + "grad_norm": 0.49061650515578625, + "learning_rate": 9.588740854743624e-05, + "loss": 3.0747, + "step": 13983 + }, + { + "epoch": 0.6510696743254882, + "grad_norm": 0.37787488683702014, + "learning_rate": 9.588633267068848e-05, + "loss": 3.1298, + "step": 13984 + }, + { + "epoch": 0.6511162325115813, + "grad_norm": 0.3990870725147106, + "learning_rate": 9.588525665926886e-05, + "loss": 2.9538, + "step": 13985 + }, + { + "epoch": 0.6511627906976745, + "grad_norm": 0.4001827569872378, + "learning_rate": 9.588418051318055e-05, + "loss": 3.0134, + "step": 13986 + }, + { + "epoch": 0.6512093488837675, + "grad_norm": 0.40718307143956195, + "learning_rate": 9.588310423242668e-05, + "loss": 3.0827, + "step": 13987 + }, + { + "epoch": 0.6512559070698606, + "grad_norm": 0.3850581358467542, + "learning_rate": 9.588202781701043e-05, + "loss": 3.014, + "step": 13988 + }, + { + "epoch": 0.6513024652559536, + "grad_norm": 0.3765553699577112, + "learning_rate": 9.588095126693493e-05, + "loss": 3.0194, + "step": 13989 + }, + { + "epoch": 0.6513490234420467, + "grad_norm": 0.39275683189212374, + "learning_rate": 9.587987458220337e-05, + "loss": 3.0382, + "step": 13990 + }, + { + "epoch": 0.6513955816281398, + "grad_norm": 0.375805029576428, + "learning_rate": 9.587879776281889e-05, + "loss": 3.0429, + "step": 13991 + }, + { + "epoch": 0.6514421398142328, + "grad_norm": 0.3449923782097681, + "learning_rate": 9.587772080878467e-05, + "loss": 3.0694, + "step": 13992 + }, + { + "epoch": 0.6514886980003259, + "grad_norm": 0.37976365334794787, + "learning_rate": 9.587664372010385e-05, + "loss": 3.0611, + "step": 13993 + }, + { + "epoch": 0.6515352561864189, + "grad_norm": 0.4405551976385078, + "learning_rate": 9.587556649677961e-05, + "loss": 3.08, + "step": 13994 + }, + { + "epoch": 0.6515818143725121, + "grad_norm": 0.3414319184432464, + "learning_rate": 9.587448913881509e-05, + "loss": 3.041, + "step": 13995 + }, + { + "epoch": 0.6516283725586051, + "grad_norm": 0.37731545681289425, + "learning_rate": 9.587341164621346e-05, + "loss": 3.0912, + "step": 13996 + }, + { + "epoch": 0.6516749307446982, + "grad_norm": 0.34895645047228235, + "learning_rate": 9.58723340189779e-05, + "loss": 2.9794, + "step": 13997 + }, + { + "epoch": 0.6517214889307913, + "grad_norm": 0.34964277780661507, + "learning_rate": 9.587125625711155e-05, + "loss": 3.1811, + "step": 13998 + }, + { + "epoch": 0.6517680471168843, + "grad_norm": 0.35782791960631205, + "learning_rate": 9.587017836061759e-05, + "loss": 3.0916, + "step": 13999 + }, + { + "epoch": 0.6518146053029774, + "grad_norm": 0.31188737830125884, + "learning_rate": 9.586910032949917e-05, + "loss": 2.9857, + "step": 14000 + }, + { + "epoch": 0.6518611634890704, + "grad_norm": 0.38788445795790294, + "learning_rate": 9.586802216375946e-05, + "loss": 3.0113, + "step": 14001 + }, + { + "epoch": 0.6519077216751635, + "grad_norm": 0.33981871965390853, + "learning_rate": 9.586694386340163e-05, + "loss": 2.8907, + "step": 14002 + }, + { + "epoch": 0.6519542798612566, + "grad_norm": 0.3842319617490241, + "learning_rate": 9.586586542842884e-05, + "loss": 3.0835, + "step": 14003 + }, + { + "epoch": 0.6520008380473497, + "grad_norm": 0.35593708083846237, + "learning_rate": 
9.586478685884424e-05, + "loss": 3.1169, + "step": 14004 + }, + { + "epoch": 0.6520473962334428, + "grad_norm": 0.34806476447330753, + "learning_rate": 9.586370815465101e-05, + "loss": 3.0594, + "step": 14005 + }, + { + "epoch": 0.6520939544195358, + "grad_norm": 0.37378427990336954, + "learning_rate": 9.586262931585233e-05, + "loss": 3.0174, + "step": 14006 + }, + { + "epoch": 0.6521405126056289, + "grad_norm": 0.3803964520642478, + "learning_rate": 9.586155034245133e-05, + "loss": 2.9913, + "step": 14007 + }, + { + "epoch": 0.652187070791722, + "grad_norm": 0.3799659477607793, + "learning_rate": 9.586047123445121e-05, + "loss": 3.06, + "step": 14008 + }, + { + "epoch": 0.652233628977815, + "grad_norm": 0.3454718564915181, + "learning_rate": 9.585939199185512e-05, + "loss": 3.077, + "step": 14009 + }, + { + "epoch": 0.6522801871639081, + "grad_norm": 0.39888448474254595, + "learning_rate": 9.585831261466624e-05, + "loss": 3.1184, + "step": 14010 + }, + { + "epoch": 0.6523267453500011, + "grad_norm": 0.40712755646462123, + "learning_rate": 9.585723310288773e-05, + "loss": 3.0367, + "step": 14011 + }, + { + "epoch": 0.6523733035360942, + "grad_norm": 0.3502997146827617, + "learning_rate": 9.585615345652274e-05, + "loss": 3.1173, + "step": 14012 + }, + { + "epoch": 0.6524198617221874, + "grad_norm": 0.3900274120286835, + "learning_rate": 9.58550736755745e-05, + "loss": 3.0465, + "step": 14013 + }, + { + "epoch": 0.6524664199082804, + "grad_norm": 0.39918913324512895, + "learning_rate": 9.585399376004609e-05, + "loss": 2.9955, + "step": 14014 + }, + { + "epoch": 0.6525129780943735, + "grad_norm": 0.33999376107861684, + "learning_rate": 9.585291370994073e-05, + "loss": 3.0763, + "step": 14015 + }, + { + "epoch": 0.6525595362804665, + "grad_norm": 0.42879811420453434, + "learning_rate": 9.585183352526162e-05, + "loss": 3.0267, + "step": 14016 + }, + { + "epoch": 0.6526060944665596, + "grad_norm": 0.38070428509351356, + "learning_rate": 9.585075320601187e-05, + "loss": 3.0102, + "step": 14017 + }, + { + "epoch": 0.6526526526526526, + "grad_norm": 0.3884961003523169, + "learning_rate": 9.584967275219469e-05, + "loss": 3.0613, + "step": 14018 + }, + { + "epoch": 0.6526992108387457, + "grad_norm": 0.45255522745376575, + "learning_rate": 9.584859216381322e-05, + "loss": 2.9882, + "step": 14019 + }, + { + "epoch": 0.6527457690248388, + "grad_norm": 0.35683988372980285, + "learning_rate": 9.584751144087066e-05, + "loss": 3.0855, + "step": 14020 + }, + { + "epoch": 0.6527923272109318, + "grad_norm": 0.394175522134968, + "learning_rate": 9.584643058337018e-05, + "loss": 3.1205, + "step": 14021 + }, + { + "epoch": 0.652838885397025, + "grad_norm": 0.4712726389482621, + "learning_rate": 9.584534959131492e-05, + "loss": 3.0889, + "step": 14022 + }, + { + "epoch": 0.652885443583118, + "grad_norm": 0.4037191742341827, + "learning_rate": 9.584426846470809e-05, + "loss": 3.0813, + "step": 14023 + }, + { + "epoch": 0.6529320017692111, + "grad_norm": 0.3683548067682219, + "learning_rate": 9.584318720355284e-05, + "loss": 3.1057, + "step": 14024 + }, + { + "epoch": 0.6529785599553042, + "grad_norm": 0.3830150523395478, + "learning_rate": 9.584210580785236e-05, + "loss": 2.977, + "step": 14025 + }, + { + "epoch": 0.6530251181413972, + "grad_norm": 0.44143648973030525, + "learning_rate": 9.584102427760981e-05, + "loss": 3.0741, + "step": 14026 + }, + { + "epoch": 0.6530716763274903, + "grad_norm": 0.38672583308966363, + "learning_rate": 9.583994261282838e-05, + "loss": 2.9969, + "step": 14027 + }, + { + "epoch": 
0.6531182345135833, + "grad_norm": 0.386409006208884, + "learning_rate": 9.583886081351124e-05, + "loss": 3.1041, + "step": 14028 + }, + { + "epoch": 0.6531647926996764, + "grad_norm": 0.3843100000635498, + "learning_rate": 9.583777887966153e-05, + "loss": 3.1008, + "step": 14029 + }, + { + "epoch": 0.6532113508857695, + "grad_norm": 0.36857202925956756, + "learning_rate": 9.583669681128247e-05, + "loss": 3.0431, + "step": 14030 + }, + { + "epoch": 0.6532579090718625, + "grad_norm": 0.3930567899598498, + "learning_rate": 9.583561460837722e-05, + "loss": 3.1133, + "step": 14031 + }, + { + "epoch": 0.6533044672579557, + "grad_norm": 0.3518089460429832, + "learning_rate": 9.583453227094896e-05, + "loss": 3.0697, + "step": 14032 + }, + { + "epoch": 0.6533510254440487, + "grad_norm": 0.3935042176420032, + "learning_rate": 9.583344979900086e-05, + "loss": 3.0877, + "step": 14033 + }, + { + "epoch": 0.6533975836301418, + "grad_norm": 0.38038953884808246, + "learning_rate": 9.583236719253612e-05, + "loss": 3.0581, + "step": 14034 + }, + { + "epoch": 0.6534441418162349, + "grad_norm": 0.3396461583947909, + "learning_rate": 9.583128445155787e-05, + "loss": 2.9429, + "step": 14035 + }, + { + "epoch": 0.6534907000023279, + "grad_norm": 0.34098105102978454, + "learning_rate": 9.583020157606932e-05, + "loss": 2.9891, + "step": 14036 + }, + { + "epoch": 0.653537258188421, + "grad_norm": 0.3546185648923095, + "learning_rate": 9.582911856607365e-05, + "loss": 3.0207, + "step": 14037 + }, + { + "epoch": 0.653583816374514, + "grad_norm": 0.37502582429517917, + "learning_rate": 9.582803542157403e-05, + "loss": 3.0664, + "step": 14038 + }, + { + "epoch": 0.6536303745606071, + "grad_norm": 0.3661212832406706, + "learning_rate": 9.582695214257365e-05, + "loss": 2.9647, + "step": 14039 + }, + { + "epoch": 0.6536769327467001, + "grad_norm": 0.36545872243562355, + "learning_rate": 9.582586872907567e-05, + "loss": 3.0447, + "step": 14040 + }, + { + "epoch": 0.6537234909327932, + "grad_norm": 0.4110389311833555, + "learning_rate": 9.582478518108327e-05, + "loss": 3.0706, + "step": 14041 + }, + { + "epoch": 0.6537700491188864, + "grad_norm": 0.3723151854777774, + "learning_rate": 9.582370149859965e-05, + "loss": 3.038, + "step": 14042 + }, + { + "epoch": 0.6538166073049794, + "grad_norm": 0.3735469115414918, + "learning_rate": 9.582261768162799e-05, + "loss": 3.014, + "step": 14043 + }, + { + "epoch": 0.6538631654910725, + "grad_norm": 0.3507747082870283, + "learning_rate": 9.582153373017146e-05, + "loss": 2.9849, + "step": 14044 + }, + { + "epoch": 0.6539097236771655, + "grad_norm": 0.3240884236527961, + "learning_rate": 9.582044964423322e-05, + "loss": 3.0932, + "step": 14045 + }, + { + "epoch": 0.6539562818632586, + "grad_norm": 0.34799761261198847, + "learning_rate": 9.58193654238165e-05, + "loss": 3.0049, + "step": 14046 + }, + { + "epoch": 0.6540028400493517, + "grad_norm": 0.35029023703242357, + "learning_rate": 9.581828106892445e-05, + "loss": 3.071, + "step": 14047 + }, + { + "epoch": 0.6540493982354447, + "grad_norm": 0.3576976318890466, + "learning_rate": 9.581719657956025e-05, + "loss": 3.0706, + "step": 14048 + }, + { + "epoch": 0.6540959564215378, + "grad_norm": 0.35392916726145746, + "learning_rate": 9.581611195572711e-05, + "loss": 3.021, + "step": 14049 + }, + { + "epoch": 0.6541425146076308, + "grad_norm": 0.34022382217995806, + "learning_rate": 9.581502719742816e-05, + "loss": 2.9271, + "step": 14050 + }, + { + "epoch": 0.654189072793724, + "grad_norm": 0.35919848046213027, + "learning_rate": 
9.581394230466665e-05, + "loss": 3.051, + "step": 14051 + }, + { + "epoch": 0.6542356309798171, + "grad_norm": 0.37473549398933975, + "learning_rate": 9.581285727744574e-05, + "loss": 3.154, + "step": 14052 + }, + { + "epoch": 0.6542821891659101, + "grad_norm": 0.36233501192040274, + "learning_rate": 9.581177211576857e-05, + "loss": 3.0065, + "step": 14053 + }, + { + "epoch": 0.6543287473520032, + "grad_norm": 0.38617630651337104, + "learning_rate": 9.581068681963839e-05, + "loss": 3.0184, + "step": 14054 + }, + { + "epoch": 0.6543753055380962, + "grad_norm": 0.3537640865896703, + "learning_rate": 9.580960138905835e-05, + "loss": 3.0564, + "step": 14055 + }, + { + "epoch": 0.6544218637241893, + "grad_norm": 0.3917078483918917, + "learning_rate": 9.580851582403164e-05, + "loss": 2.9732, + "step": 14056 + }, + { + "epoch": 0.6544684219102824, + "grad_norm": 0.37950862407511454, + "learning_rate": 9.580743012456146e-05, + "loss": 3.0051, + "step": 14057 + }, + { + "epoch": 0.6545149800963754, + "grad_norm": 0.3658045353712008, + "learning_rate": 9.580634429065097e-05, + "loss": 2.9677, + "step": 14058 + }, + { + "epoch": 0.6545615382824685, + "grad_norm": 0.39563097267871034, + "learning_rate": 9.580525832230338e-05, + "loss": 2.99, + "step": 14059 + }, + { + "epoch": 0.6546080964685616, + "grad_norm": 0.38460689081231486, + "learning_rate": 9.580417221952187e-05, + "loss": 3.0449, + "step": 14060 + }, + { + "epoch": 0.6546546546546547, + "grad_norm": 0.3781290349753173, + "learning_rate": 9.580308598230963e-05, + "loss": 3.0138, + "step": 14061 + }, + { + "epoch": 0.6547012128407477, + "grad_norm": 0.3734620975562425, + "learning_rate": 9.580199961066983e-05, + "loss": 2.9771, + "step": 14062 + }, + { + "epoch": 0.6547477710268408, + "grad_norm": 0.34186500478123716, + "learning_rate": 9.580091310460569e-05, + "loss": 3.0279, + "step": 14063 + }, + { + "epoch": 0.6547943292129339, + "grad_norm": 0.3679543476783228, + "learning_rate": 9.579982646412038e-05, + "loss": 3.0895, + "step": 14064 + }, + { + "epoch": 0.6548408873990269, + "grad_norm": 0.36927699960344523, + "learning_rate": 9.579873968921708e-05, + "loss": 2.9582, + "step": 14065 + }, + { + "epoch": 0.65488744558512, + "grad_norm": 0.34845197829382724, + "learning_rate": 9.5797652779899e-05, + "loss": 2.9969, + "step": 14066 + }, + { + "epoch": 0.654934003771213, + "grad_norm": 0.3937336248410457, + "learning_rate": 9.579656573616933e-05, + "loss": 3.0025, + "step": 14067 + }, + { + "epoch": 0.6549805619573061, + "grad_norm": 0.3608007332911798, + "learning_rate": 9.579547855803124e-05, + "loss": 3.0446, + "step": 14068 + }, + { + "epoch": 0.6550271201433993, + "grad_norm": 0.43421914543966794, + "learning_rate": 9.579439124548792e-05, + "loss": 3.0051, + "step": 14069 + }, + { + "epoch": 0.6550736783294923, + "grad_norm": 0.3811973346091435, + "learning_rate": 9.579330379854259e-05, + "loss": 3.0244, + "step": 14070 + }, + { + "epoch": 0.6551202365155854, + "grad_norm": 0.40026999073036745, + "learning_rate": 9.579221621719842e-05, + "loss": 3.0569, + "step": 14071 + }, + { + "epoch": 0.6551667947016784, + "grad_norm": 0.3589284793764535, + "learning_rate": 9.579112850145861e-05, + "loss": 3.1288, + "step": 14072 + }, + { + "epoch": 0.6552133528877715, + "grad_norm": 0.36695497627705803, + "learning_rate": 9.579004065132636e-05, + "loss": 2.9946, + "step": 14073 + }, + { + "epoch": 0.6552599110738646, + "grad_norm": 0.38940747447694835, + "learning_rate": 9.578895266680483e-05, + "loss": 3.0555, + "step": 14074 + }, + { + "epoch": 
0.6553064692599576, + "grad_norm": 0.34275897657782906, + "learning_rate": 9.578786454789724e-05, + "loss": 3.0643, + "step": 14075 + }, + { + "epoch": 0.6553530274460507, + "grad_norm": 0.38122442186235594, + "learning_rate": 9.57867762946068e-05, + "loss": 3.0608, + "step": 14076 + }, + { + "epoch": 0.6553995856321437, + "grad_norm": 0.3532582735022955, + "learning_rate": 9.578568790693665e-05, + "loss": 2.972, + "step": 14077 + }, + { + "epoch": 0.6554461438182368, + "grad_norm": 0.3368202440295787, + "learning_rate": 9.578459938489003e-05, + "loss": 2.933, + "step": 14078 + }, + { + "epoch": 0.65549270200433, + "grad_norm": 0.3588169844020343, + "learning_rate": 9.578351072847011e-05, + "loss": 3.1562, + "step": 14079 + }, + { + "epoch": 0.655539260190423, + "grad_norm": 0.3486239295171572, + "learning_rate": 9.578242193768012e-05, + "loss": 3.1242, + "step": 14080 + }, + { + "epoch": 0.6555858183765161, + "grad_norm": 0.391849548638123, + "learning_rate": 9.578133301252322e-05, + "loss": 3.0231, + "step": 14081 + }, + { + "epoch": 0.6556323765626091, + "grad_norm": 0.39055877475336037, + "learning_rate": 9.578024395300262e-05, + "loss": 3.1004, + "step": 14082 + }, + { + "epoch": 0.6556789347487022, + "grad_norm": 0.3507635376964259, + "learning_rate": 9.577915475912151e-05, + "loss": 2.9631, + "step": 14083 + }, + { + "epoch": 0.6557254929347952, + "grad_norm": 0.3787528430663433, + "learning_rate": 9.57780654308831e-05, + "loss": 3.2321, + "step": 14084 + }, + { + "epoch": 0.6557720511208883, + "grad_norm": 0.37585851702378004, + "learning_rate": 9.577697596829056e-05, + "loss": 3.0112, + "step": 14085 + }, + { + "epoch": 0.6558186093069814, + "grad_norm": 0.3486231508955688, + "learning_rate": 9.577588637134713e-05, + "loss": 3.0489, + "step": 14086 + }, + { + "epoch": 0.6558651674930744, + "grad_norm": 0.39967739628908594, + "learning_rate": 9.577479664005596e-05, + "loss": 3.1111, + "step": 14087 + }, + { + "epoch": 0.6559117256791676, + "grad_norm": 0.36379004039639523, + "learning_rate": 9.577370677442029e-05, + "loss": 3.0175, + "step": 14088 + }, + { + "epoch": 0.6559582838652606, + "grad_norm": 0.41551703049415745, + "learning_rate": 9.577261677444329e-05, + "loss": 3.108, + "step": 14089 + }, + { + "epoch": 0.6560048420513537, + "grad_norm": 0.42288684700855855, + "learning_rate": 9.577152664012816e-05, + "loss": 3.0998, + "step": 14090 + }, + { + "epoch": 0.6560514002374468, + "grad_norm": 0.3702312035746144, + "learning_rate": 9.577043637147812e-05, + "loss": 3.0336, + "step": 14091 + }, + { + "epoch": 0.6560979584235398, + "grad_norm": 0.4074569942125144, + "learning_rate": 9.576934596849636e-05, + "loss": 3.0607, + "step": 14092 + }, + { + "epoch": 0.6561445166096329, + "grad_norm": 0.40470875557093255, + "learning_rate": 9.576825543118609e-05, + "loss": 2.8945, + "step": 14093 + }, + { + "epoch": 0.6561910747957259, + "grad_norm": 0.3636954364808646, + "learning_rate": 9.576716475955048e-05, + "loss": 3.0203, + "step": 14094 + }, + { + "epoch": 0.656237632981819, + "grad_norm": 0.481308981997654, + "learning_rate": 9.576607395359275e-05, + "loss": 3.061, + "step": 14095 + }, + { + "epoch": 0.6562841911679121, + "grad_norm": 0.3791190513936368, + "learning_rate": 9.57649830133161e-05, + "loss": 3.02, + "step": 14096 + }, + { + "epoch": 0.6563307493540051, + "grad_norm": 0.3940378300068253, + "learning_rate": 9.576389193872375e-05, + "loss": 2.9751, + "step": 14097 + }, + { + "epoch": 0.6563773075400983, + "grad_norm": 0.4432796386423991, + "learning_rate": 
9.576280072981887e-05, + "loss": 3.0921, + "step": 14098 + }, + { + "epoch": 0.6564238657261913, + "grad_norm": 0.3697381070501019, + "learning_rate": 9.57617093866047e-05, + "loss": 2.9972, + "step": 14099 + }, + { + "epoch": 0.6564704239122844, + "grad_norm": 0.4439523944965778, + "learning_rate": 9.576061790908439e-05, + "loss": 3.1424, + "step": 14100 + }, + { + "epoch": 0.6565169820983775, + "grad_norm": 0.4192238359533706, + "learning_rate": 9.575952629726119e-05, + "loss": 3.1266, + "step": 14101 + }, + { + "epoch": 0.6565635402844705, + "grad_norm": 0.388388479497199, + "learning_rate": 9.575843455113829e-05, + "loss": 3.0172, + "step": 14102 + }, + { + "epoch": 0.6566100984705636, + "grad_norm": 0.4127611716976776, + "learning_rate": 9.575734267071889e-05, + "loss": 3.0108, + "step": 14103 + }, + { + "epoch": 0.6566566566566566, + "grad_norm": 0.3922008705730615, + "learning_rate": 9.57562506560062e-05, + "loss": 3.0736, + "step": 14104 + }, + { + "epoch": 0.6567032148427497, + "grad_norm": 0.3772559667263856, + "learning_rate": 9.575515850700341e-05, + "loss": 3.1142, + "step": 14105 + }, + { + "epoch": 0.6567497730288427, + "grad_norm": 0.37067715777749716, + "learning_rate": 9.575406622371376e-05, + "loss": 3.0667, + "step": 14106 + }, + { + "epoch": 0.6567963312149359, + "grad_norm": 0.3820436786693524, + "learning_rate": 9.575297380614041e-05, + "loss": 2.9924, + "step": 14107 + }, + { + "epoch": 0.656842889401029, + "grad_norm": 0.36685629422658755, + "learning_rate": 9.57518812542866e-05, + "loss": 2.9957, + "step": 14108 + }, + { + "epoch": 0.656889447587122, + "grad_norm": 0.39207206676332484, + "learning_rate": 9.575078856815554e-05, + "loss": 2.9234, + "step": 14109 + }, + { + "epoch": 0.6569360057732151, + "grad_norm": 0.3378930317027315, + "learning_rate": 9.57496957477504e-05, + "loss": 3.1343, + "step": 14110 + }, + { + "epoch": 0.6569825639593081, + "grad_norm": 0.3657006002657818, + "learning_rate": 9.574860279307443e-05, + "loss": 3.008, + "step": 14111 + }, + { + "epoch": 0.6570291221454012, + "grad_norm": 0.39392810736953054, + "learning_rate": 9.57475097041308e-05, + "loss": 3.1204, + "step": 14112 + }, + { + "epoch": 0.6570756803314943, + "grad_norm": 0.35698631477004567, + "learning_rate": 9.574641648092273e-05, + "loss": 3.0163, + "step": 14113 + }, + { + "epoch": 0.6571222385175873, + "grad_norm": 0.37600572104471747, + "learning_rate": 9.574532312345346e-05, + "loss": 3.0777, + "step": 14114 + }, + { + "epoch": 0.6571687967036804, + "grad_norm": 0.36523657849257796, + "learning_rate": 9.574422963172616e-05, + "loss": 3.0156, + "step": 14115 + }, + { + "epoch": 0.6572153548897735, + "grad_norm": 0.37785131704239827, + "learning_rate": 9.574313600574406e-05, + "loss": 3.0339, + "step": 14116 + }, + { + "epoch": 0.6572619130758666, + "grad_norm": 0.3829828491755855, + "learning_rate": 9.574204224551035e-05, + "loss": 2.9787, + "step": 14117 + }, + { + "epoch": 0.6573084712619597, + "grad_norm": 0.3887762648367216, + "learning_rate": 9.574094835102824e-05, + "loss": 2.9647, + "step": 14118 + }, + { + "epoch": 0.6573550294480527, + "grad_norm": 0.4023156447313497, + "learning_rate": 9.573985432230099e-05, + "loss": 3.1522, + "step": 14119 + }, + { + "epoch": 0.6574015876341458, + "grad_norm": 0.4091651997500688, + "learning_rate": 9.573876015933175e-05, + "loss": 3.1336, + "step": 14120 + }, + { + "epoch": 0.6574481458202388, + "grad_norm": 0.3969002900175532, + "learning_rate": 9.573766586212376e-05, + "loss": 3.0415, + "step": 14121 + }, + { + "epoch": 
0.6574947040063319, + "grad_norm": 0.4299636266589368, + "learning_rate": 9.573657143068022e-05, + "loss": 3.03, + "step": 14122 + }, + { + "epoch": 0.657541262192425, + "grad_norm": 0.37630548993828855, + "learning_rate": 9.573547686500434e-05, + "loss": 3.0699, + "step": 14123 + }, + { + "epoch": 0.657587820378518, + "grad_norm": 0.38080020436568246, + "learning_rate": 9.573438216509937e-05, + "loss": 2.9168, + "step": 14124 + }, + { + "epoch": 0.6576343785646112, + "grad_norm": 0.4134840027281591, + "learning_rate": 9.573328733096848e-05, + "loss": 3.076, + "step": 14125 + }, + { + "epoch": 0.6576809367507042, + "grad_norm": 0.37670632445019725, + "learning_rate": 9.573219236261489e-05, + "loss": 2.9506, + "step": 14126 + }, + { + "epoch": 0.6577274949367973, + "grad_norm": 0.3554436626237824, + "learning_rate": 9.573109726004184e-05, + "loss": 3.037, + "step": 14127 + }, + { + "epoch": 0.6577740531228903, + "grad_norm": 0.36617278105120327, + "learning_rate": 9.57300020232525e-05, + "loss": 3.0368, + "step": 14128 + }, + { + "epoch": 0.6578206113089834, + "grad_norm": 0.37003825714643146, + "learning_rate": 9.572890665225012e-05, + "loss": 2.9307, + "step": 14129 + }, + { + "epoch": 0.6578671694950765, + "grad_norm": 0.36136140785271015, + "learning_rate": 9.57278111470379e-05, + "loss": 3.0594, + "step": 14130 + }, + { + "epoch": 0.6579137276811695, + "grad_norm": 0.3602025853055493, + "learning_rate": 9.572671550761907e-05, + "loss": 3.0199, + "step": 14131 + }, + { + "epoch": 0.6579602858672626, + "grad_norm": 0.34725497275919154, + "learning_rate": 9.572561973399682e-05, + "loss": 2.9683, + "step": 14132 + }, + { + "epoch": 0.6580068440533556, + "grad_norm": 0.3508632865587385, + "learning_rate": 9.57245238261744e-05, + "loss": 2.9229, + "step": 14133 + }, + { + "epoch": 0.6580534022394487, + "grad_norm": 0.35035236236283873, + "learning_rate": 9.5723427784155e-05, + "loss": 3.0183, + "step": 14134 + }, + { + "epoch": 0.6580999604255419, + "grad_norm": 0.3729870338350996, + "learning_rate": 9.572233160794185e-05, + "loss": 2.9953, + "step": 14135 + }, + { + "epoch": 0.6581465186116349, + "grad_norm": 0.2923405545437241, + "learning_rate": 9.572123529753814e-05, + "loss": 2.9991, + "step": 14136 + }, + { + "epoch": 0.658193076797728, + "grad_norm": 0.3472026820731823, + "learning_rate": 9.572013885294712e-05, + "loss": 3.0701, + "step": 14137 + }, + { + "epoch": 0.658239634983821, + "grad_norm": 0.35811329781762774, + "learning_rate": 9.571904227417199e-05, + "loss": 3.0479, + "step": 14138 + }, + { + "epoch": 0.6582861931699141, + "grad_norm": 0.3259698852390162, + "learning_rate": 9.571794556121599e-05, + "loss": 3.1467, + "step": 14139 + }, + { + "epoch": 0.6583327513560072, + "grad_norm": 0.3654403839452139, + "learning_rate": 9.571684871408232e-05, + "loss": 3.0975, + "step": 14140 + }, + { + "epoch": 0.6583793095421002, + "grad_norm": 0.3072606729326477, + "learning_rate": 9.571575173277419e-05, + "loss": 2.9637, + "step": 14141 + }, + { + "epoch": 0.6584258677281933, + "grad_norm": 0.33803423132391947, + "learning_rate": 9.571465461729486e-05, + "loss": 3.0508, + "step": 14142 + }, + { + "epoch": 0.6584724259142863, + "grad_norm": 0.3409402249794051, + "learning_rate": 9.571355736764749e-05, + "loss": 3.064, + "step": 14143 + }, + { + "epoch": 0.6585189841003795, + "grad_norm": 0.39046173058327954, + "learning_rate": 9.571245998383536e-05, + "loss": 3.0817, + "step": 14144 + }, + { + "epoch": 0.6585655422864726, + "grad_norm": 0.337629748640362, + "learning_rate": 
9.571136246586163e-05, + "loss": 2.9824, + "step": 14145 + }, + { + "epoch": 0.6586121004725656, + "grad_norm": 0.3683266210953953, + "learning_rate": 9.571026481372958e-05, + "loss": 3.1122, + "step": 14146 + }, + { + "epoch": 0.6586586586586587, + "grad_norm": 0.40991254524147896, + "learning_rate": 9.57091670274424e-05, + "loss": 3.0418, + "step": 14147 + }, + { + "epoch": 0.6587052168447517, + "grad_norm": 0.44889665150305313, + "learning_rate": 9.570806910700332e-05, + "loss": 3.0549, + "step": 14148 + }, + { + "epoch": 0.6587517750308448, + "grad_norm": 0.3834744900919166, + "learning_rate": 9.570697105241556e-05, + "loss": 3.0573, + "step": 14149 + }, + { + "epoch": 0.6587983332169378, + "grad_norm": 0.428713314418986, + "learning_rate": 9.570587286368234e-05, + "loss": 3.0098, + "step": 14150 + }, + { + "epoch": 0.6588448914030309, + "grad_norm": 0.4039985737281917, + "learning_rate": 9.570477454080689e-05, + "loss": 2.9414, + "step": 14151 + }, + { + "epoch": 0.658891449589124, + "grad_norm": 0.3718394924997146, + "learning_rate": 9.570367608379243e-05, + "loss": 3.1008, + "step": 14152 + }, + { + "epoch": 0.658938007775217, + "grad_norm": 0.3892658609549586, + "learning_rate": 9.570257749264218e-05, + "loss": 3.0659, + "step": 14153 + }, + { + "epoch": 0.6589845659613102, + "grad_norm": 0.3942476930887572, + "learning_rate": 9.570147876735936e-05, + "loss": 3.079, + "step": 14154 + }, + { + "epoch": 0.6590311241474032, + "grad_norm": 0.382316125149679, + "learning_rate": 9.570037990794722e-05, + "loss": 2.9921, + "step": 14155 + }, + { + "epoch": 0.6590776823334963, + "grad_norm": 0.4414238506818733, + "learning_rate": 9.569928091440896e-05, + "loss": 3.1045, + "step": 14156 + }, + { + "epoch": 0.6591242405195894, + "grad_norm": 0.3289274708721267, + "learning_rate": 9.56981817867478e-05, + "loss": 3.1308, + "step": 14157 + }, + { + "epoch": 0.6591707987056824, + "grad_norm": 0.37366317211306344, + "learning_rate": 9.569708252496699e-05, + "loss": 2.9604, + "step": 14158 + }, + { + "epoch": 0.6592173568917755, + "grad_norm": 0.384118349797013, + "learning_rate": 9.569598312906973e-05, + "loss": 2.9242, + "step": 14159 + }, + { + "epoch": 0.6592639150778685, + "grad_norm": 0.3277539970699876, + "learning_rate": 9.569488359905929e-05, + "loss": 3.0411, + "step": 14160 + }, + { + "epoch": 0.6593104732639616, + "grad_norm": 0.4146415754885131, + "learning_rate": 9.569378393493883e-05, + "loss": 3.0547, + "step": 14161 + }, + { + "epoch": 0.6593570314500548, + "grad_norm": 0.36853239566704765, + "learning_rate": 9.569268413671166e-05, + "loss": 3.0372, + "step": 14162 + }, + { + "epoch": 0.6594035896361478, + "grad_norm": 0.35972118811336296, + "learning_rate": 9.569158420438093e-05, + "loss": 2.9988, + "step": 14163 + }, + { + "epoch": 0.6594501478222409, + "grad_norm": 0.38427354474364445, + "learning_rate": 9.569048413794992e-05, + "loss": 3.011, + "step": 14164 + }, + { + "epoch": 0.6594967060083339, + "grad_norm": 0.36701293791245804, + "learning_rate": 9.568938393742182e-05, + "loss": 3.04, + "step": 14165 + }, + { + "epoch": 0.659543264194427, + "grad_norm": 0.4049022442944795, + "learning_rate": 9.56882836027999e-05, + "loss": 2.9116, + "step": 14166 + }, + { + "epoch": 0.6595898223805201, + "grad_norm": 0.3934081998600171, + "learning_rate": 9.568718313408737e-05, + "loss": 3.0693, + "step": 14167 + }, + { + "epoch": 0.6596363805666131, + "grad_norm": 0.36422573030091193, + "learning_rate": 9.568608253128745e-05, + "loss": 3.023, + "step": 14168 + }, + { + "epoch": 
0.6596829387527062, + "grad_norm": 0.369566674965183, + "learning_rate": 9.568498179440337e-05, + "loss": 2.901, + "step": 14169 + }, + { + "epoch": 0.6597294969387992, + "grad_norm": 0.37387320531930257, + "learning_rate": 9.568388092343839e-05, + "loss": 3.0333, + "step": 14170 + }, + { + "epoch": 0.6597760551248923, + "grad_norm": 0.36211275110441205, + "learning_rate": 9.568277991839569e-05, + "loss": 2.9896, + "step": 14171 + }, + { + "epoch": 0.6598226133109854, + "grad_norm": 0.33194554754086386, + "learning_rate": 9.568167877927856e-05, + "loss": 2.9606, + "step": 14172 + }, + { + "epoch": 0.6598691714970785, + "grad_norm": 0.3503918447202362, + "learning_rate": 9.568057750609018e-05, + "loss": 3.0426, + "step": 14173 + }, + { + "epoch": 0.6599157296831716, + "grad_norm": 0.3824951646001061, + "learning_rate": 9.567947609883384e-05, + "loss": 2.9453, + "step": 14174 + }, + { + "epoch": 0.6599622878692646, + "grad_norm": 0.4161519773966898, + "learning_rate": 9.567837455751269e-05, + "loss": 3.0848, + "step": 14175 + }, + { + "epoch": 0.6600088460553577, + "grad_norm": 0.34394887006943836, + "learning_rate": 9.567727288213005e-05, + "loss": 2.9858, + "step": 14176 + }, + { + "epoch": 0.6600554042414507, + "grad_norm": 0.4060472904298656, + "learning_rate": 9.567617107268909e-05, + "loss": 2.9602, + "step": 14177 + }, + { + "epoch": 0.6601019624275438, + "grad_norm": 0.45206379506947486, + "learning_rate": 9.567506912919309e-05, + "loss": 3.0527, + "step": 14178 + }, + { + "epoch": 0.6601485206136369, + "grad_norm": 0.3718587493998167, + "learning_rate": 9.567396705164524e-05, + "loss": 2.9838, + "step": 14179 + }, + { + "epoch": 0.6601950787997299, + "grad_norm": 0.3847051236726545, + "learning_rate": 9.567286484004882e-05, + "loss": 3.1183, + "step": 14180 + }, + { + "epoch": 0.660241636985823, + "grad_norm": 0.4219806963605666, + "learning_rate": 9.567176249440701e-05, + "loss": 3.0674, + "step": 14181 + }, + { + "epoch": 0.6602881951719161, + "grad_norm": 0.38877919079099826, + "learning_rate": 9.567066001472312e-05, + "loss": 2.9541, + "step": 14182 + }, + { + "epoch": 0.6603347533580092, + "grad_norm": 0.35077177301702145, + "learning_rate": 9.56695574010003e-05, + "loss": 2.9101, + "step": 14183 + }, + { + "epoch": 0.6603813115441023, + "grad_norm": 0.3852580713562808, + "learning_rate": 9.566845465324185e-05, + "loss": 3.1472, + "step": 14184 + }, + { + "epoch": 0.6604278697301953, + "grad_norm": 0.42387172218093827, + "learning_rate": 9.566735177145098e-05, + "loss": 3.0335, + "step": 14185 + }, + { + "epoch": 0.6604744279162884, + "grad_norm": 0.35022342209623114, + "learning_rate": 9.566624875563092e-05, + "loss": 2.9996, + "step": 14186 + }, + { + "epoch": 0.6605209861023814, + "grad_norm": 0.4272560721835822, + "learning_rate": 9.566514560578493e-05, + "loss": 2.9493, + "step": 14187 + }, + { + "epoch": 0.6605675442884745, + "grad_norm": 0.4072467656310474, + "learning_rate": 9.566404232191623e-05, + "loss": 2.9346, + "step": 14188 + }, + { + "epoch": 0.6606141024745676, + "grad_norm": 0.3524570176864871, + "learning_rate": 9.566293890402809e-05, + "loss": 3.0757, + "step": 14189 + }, + { + "epoch": 0.6606606606606606, + "grad_norm": 0.4029145668941487, + "learning_rate": 9.56618353521237e-05, + "loss": 3.0053, + "step": 14190 + }, + { + "epoch": 0.6607072188467538, + "grad_norm": 0.3520040501085859, + "learning_rate": 9.566073166620633e-05, + "loss": 2.9855, + "step": 14191 + }, + { + "epoch": 0.6607537770328468, + "grad_norm": 0.37864569141086785, + "learning_rate": 
9.56596278462792e-05, + "loss": 3.1275, + "step": 14192 + }, + { + "epoch": 0.6608003352189399, + "grad_norm": 0.38880552515929745, + "learning_rate": 9.565852389234556e-05, + "loss": 3.0042, + "step": 14193 + }, + { + "epoch": 0.6608468934050329, + "grad_norm": 0.36807578984197675, + "learning_rate": 9.565741980440865e-05, + "loss": 2.968, + "step": 14194 + }, + { + "epoch": 0.660893451591126, + "grad_norm": 0.39090434606882124, + "learning_rate": 9.565631558247173e-05, + "loss": 3.0289, + "step": 14195 + }, + { + "epoch": 0.6609400097772191, + "grad_norm": 0.33301009475182414, + "learning_rate": 9.5655211226538e-05, + "loss": 2.9796, + "step": 14196 + }, + { + "epoch": 0.6609865679633121, + "grad_norm": 0.40766054182359246, + "learning_rate": 9.565410673661074e-05, + "loss": 3.0988, + "step": 14197 + }, + { + "epoch": 0.6610331261494052, + "grad_norm": 0.34669318872834115, + "learning_rate": 9.565300211269317e-05, + "loss": 2.96, + "step": 14198 + }, + { + "epoch": 0.6610796843354982, + "grad_norm": 0.38959648384062096, + "learning_rate": 9.565189735478852e-05, + "loss": 2.9908, + "step": 14199 + }, + { + "epoch": 0.6611262425215914, + "grad_norm": 0.35843078551010876, + "learning_rate": 9.565079246290008e-05, + "loss": 2.9278, + "step": 14200 + }, + { + "epoch": 0.6611728007076845, + "grad_norm": 0.3971552199364072, + "learning_rate": 9.564968743703102e-05, + "loss": 3.1434, + "step": 14201 + }, + { + "epoch": 0.6612193588937775, + "grad_norm": 0.37149651585994076, + "learning_rate": 9.564858227718466e-05, + "loss": 2.9434, + "step": 14202 + }, + { + "epoch": 0.6612659170798706, + "grad_norm": 0.33475450653632044, + "learning_rate": 9.56474769833642e-05, + "loss": 2.9817, + "step": 14203 + }, + { + "epoch": 0.6613124752659636, + "grad_norm": 0.3236062504912998, + "learning_rate": 9.564637155557287e-05, + "loss": 3.0652, + "step": 14204 + }, + { + "epoch": 0.6613590334520567, + "grad_norm": 0.3421722936202189, + "learning_rate": 9.564526599381395e-05, + "loss": 2.9618, + "step": 14205 + }, + { + "epoch": 0.6614055916381498, + "grad_norm": 0.3087569048905764, + "learning_rate": 9.564416029809069e-05, + "loss": 3.0965, + "step": 14206 + }, + { + "epoch": 0.6614521498242428, + "grad_norm": 0.38496318490014436, + "learning_rate": 9.56430544684063e-05, + "loss": 2.9492, + "step": 14207 + }, + { + "epoch": 0.661498708010336, + "grad_norm": 0.3198040242980734, + "learning_rate": 9.564194850476404e-05, + "loss": 2.9486, + "step": 14208 + }, + { + "epoch": 0.661545266196429, + "grad_norm": 0.32048804633351236, + "learning_rate": 9.564084240716716e-05, + "loss": 3.009, + "step": 14209 + }, + { + "epoch": 0.6615918243825221, + "grad_norm": 0.3609096880291414, + "learning_rate": 9.563973617561892e-05, + "loss": 3.0444, + "step": 14210 + }, + { + "epoch": 0.6616383825686152, + "grad_norm": 0.3677602220441599, + "learning_rate": 9.563862981012252e-05, + "loss": 3.1111, + "step": 14211 + }, + { + "epoch": 0.6616849407547082, + "grad_norm": 0.3456128763127256, + "learning_rate": 9.563752331068125e-05, + "loss": 2.9747, + "step": 14212 + }, + { + "epoch": 0.6617314989408013, + "grad_norm": 0.3323164336055769, + "learning_rate": 9.563641667729835e-05, + "loss": 2.9872, + "step": 14213 + }, + { + "epoch": 0.6617780571268943, + "grad_norm": 0.3774768022171236, + "learning_rate": 9.563530990997706e-05, + "loss": 3.0196, + "step": 14214 + }, + { + "epoch": 0.6618246153129874, + "grad_norm": 0.3389477602104018, + "learning_rate": 9.563420300872063e-05, + "loss": 2.9701, + "step": 14215 + }, + { + "epoch": 
0.6618711734990804, + "grad_norm": 0.3662997305051268, + "learning_rate": 9.563309597353231e-05, + "loss": 3.0114, + "step": 14216 + }, + { + "epoch": 0.6619177316851735, + "grad_norm": 0.3920012741253788, + "learning_rate": 9.563198880441536e-05, + "loss": 3.0237, + "step": 14217 + }, + { + "epoch": 0.6619642898712667, + "grad_norm": 0.32971352604731025, + "learning_rate": 9.563088150137301e-05, + "loss": 3.0423, + "step": 14218 + }, + { + "epoch": 0.6620108480573597, + "grad_norm": 0.39724011864994097, + "learning_rate": 9.562977406440852e-05, + "loss": 2.9998, + "step": 14219 + }, + { + "epoch": 0.6620574062434528, + "grad_norm": 0.39092943448980244, + "learning_rate": 9.562866649352514e-05, + "loss": 2.946, + "step": 14220 + }, + { + "epoch": 0.6621039644295458, + "grad_norm": 0.36409245242864396, + "learning_rate": 9.562755878872612e-05, + "loss": 2.8994, + "step": 14221 + }, + { + "epoch": 0.6621505226156389, + "grad_norm": 0.38977282069765934, + "learning_rate": 9.56264509500147e-05, + "loss": 2.9886, + "step": 14222 + }, + { + "epoch": 0.662197080801732, + "grad_norm": 0.3790497538076107, + "learning_rate": 9.562534297739413e-05, + "loss": 3.1176, + "step": 14223 + }, + { + "epoch": 0.662243638987825, + "grad_norm": 0.3416852887191983, + "learning_rate": 9.56242348708677e-05, + "loss": 3.0149, + "step": 14224 + }, + { + "epoch": 0.6622901971739181, + "grad_norm": 0.35230386799214886, + "learning_rate": 9.562312663043861e-05, + "loss": 3.0962, + "step": 14225 + }, + { + "epoch": 0.6623367553600111, + "grad_norm": 0.36334438431305494, + "learning_rate": 9.562201825611017e-05, + "loss": 3.1494, + "step": 14226 + }, + { + "epoch": 0.6623833135461042, + "grad_norm": 0.3668364784687434, + "learning_rate": 9.562090974788558e-05, + "loss": 3.1286, + "step": 14227 + }, + { + "epoch": 0.6624298717321974, + "grad_norm": 0.3974152579089706, + "learning_rate": 9.561980110576813e-05, + "loss": 2.9855, + "step": 14228 + }, + { + "epoch": 0.6624764299182904, + "grad_norm": 0.3134502002656954, + "learning_rate": 9.561869232976104e-05, + "loss": 3.0048, + "step": 14229 + }, + { + "epoch": 0.6625229881043835, + "grad_norm": 0.36029727262044814, + "learning_rate": 9.56175834198676e-05, + "loss": 3.0574, + "step": 14230 + }, + { + "epoch": 0.6625695462904765, + "grad_norm": 0.3398009985571134, + "learning_rate": 9.561647437609102e-05, + "loss": 3.1099, + "step": 14231 + }, + { + "epoch": 0.6626161044765696, + "grad_norm": 0.3536626203398155, + "learning_rate": 9.561536519843459e-05, + "loss": 2.9832, + "step": 14232 + }, + { + "epoch": 0.6626626626626627, + "grad_norm": 0.3620338325558326, + "learning_rate": 9.561425588690158e-05, + "loss": 2.9996, + "step": 14233 + }, + { + "epoch": 0.6627092208487557, + "grad_norm": 0.39786961449962815, + "learning_rate": 9.56131464414952e-05, + "loss": 3.1243, + "step": 14234 + }, + { + "epoch": 0.6627557790348488, + "grad_norm": 0.35853438757422446, + "learning_rate": 9.561203686221874e-05, + "loss": 3.0797, + "step": 14235 + }, + { + "epoch": 0.6628023372209418, + "grad_norm": 0.3679529122040526, + "learning_rate": 9.561092714907544e-05, + "loss": 3.0314, + "step": 14236 + }, + { + "epoch": 0.662848895407035, + "grad_norm": 0.3991067672379705, + "learning_rate": 9.560981730206856e-05, + "loss": 3.0344, + "step": 14237 + }, + { + "epoch": 0.662895453593128, + "grad_norm": 0.40406783929982476, + "learning_rate": 9.560870732120136e-05, + "loss": 2.9614, + "step": 14238 + }, + { + "epoch": 0.6629420117792211, + "grad_norm": 0.3585751218920946, + "learning_rate": 
9.560759720647711e-05, + "loss": 2.9792, + "step": 14239 + }, + { + "epoch": 0.6629885699653142, + "grad_norm": 0.4477298403165441, + "learning_rate": 9.560648695789905e-05, + "loss": 3.1336, + "step": 14240 + }, + { + "epoch": 0.6630351281514072, + "grad_norm": 0.3873065324091817, + "learning_rate": 9.560537657547042e-05, + "loss": 2.9745, + "step": 14241 + }, + { + "epoch": 0.6630816863375003, + "grad_norm": 0.3981307292800794, + "learning_rate": 9.560426605919452e-05, + "loss": 3.0793, + "step": 14242 + }, + { + "epoch": 0.6631282445235933, + "grad_norm": 0.40960432847379147, + "learning_rate": 9.56031554090746e-05, + "loss": 3.0824, + "step": 14243 + }, + { + "epoch": 0.6631748027096864, + "grad_norm": 0.3642960162425003, + "learning_rate": 9.56020446251139e-05, + "loss": 2.9592, + "step": 14244 + }, + { + "epoch": 0.6632213608957795, + "grad_norm": 0.38721358688095053, + "learning_rate": 9.560093370731571e-05, + "loss": 3.1583, + "step": 14245 + }, + { + "epoch": 0.6632679190818725, + "grad_norm": 0.38050759194787726, + "learning_rate": 9.559982265568326e-05, + "loss": 3.1033, + "step": 14246 + }, + { + "epoch": 0.6633144772679657, + "grad_norm": 0.3543259404464907, + "learning_rate": 9.559871147021982e-05, + "loss": 3.097, + "step": 14247 + }, + { + "epoch": 0.6633610354540587, + "grad_norm": 0.42284859869689084, + "learning_rate": 9.559760015092864e-05, + "loss": 3.0756, + "step": 14248 + }, + { + "epoch": 0.6634075936401518, + "grad_norm": 0.3797916223663076, + "learning_rate": 9.559648869781301e-05, + "loss": 2.9621, + "step": 14249 + }, + { + "epoch": 0.6634541518262449, + "grad_norm": 0.36228904998866246, + "learning_rate": 9.559537711087619e-05, + "loss": 3.0105, + "step": 14250 + }, + { + "epoch": 0.6635007100123379, + "grad_norm": 0.38883687082878543, + "learning_rate": 9.559426539012141e-05, + "loss": 3.0276, + "step": 14251 + }, + { + "epoch": 0.663547268198431, + "grad_norm": 0.3463744693818689, + "learning_rate": 9.559315353555197e-05, + "loss": 2.974, + "step": 14252 + }, + { + "epoch": 0.663593826384524, + "grad_norm": 0.4018465243254565, + "learning_rate": 9.55920415471711e-05, + "loss": 3.1306, + "step": 14253 + }, + { + "epoch": 0.6636403845706171, + "grad_norm": 0.35448300134974375, + "learning_rate": 9.559092942498208e-05, + "loss": 3.0597, + "step": 14254 + }, + { + "epoch": 0.6636869427567103, + "grad_norm": 0.36206014950692056, + "learning_rate": 9.558981716898819e-05, + "loss": 2.985, + "step": 14255 + }, + { + "epoch": 0.6637335009428033, + "grad_norm": 0.36933955426137843, + "learning_rate": 9.558870477919267e-05, + "loss": 2.9879, + "step": 14256 + }, + { + "epoch": 0.6637800591288964, + "grad_norm": 0.35797268069195093, + "learning_rate": 9.558759225559879e-05, + "loss": 3.0853, + "step": 14257 + }, + { + "epoch": 0.6638266173149894, + "grad_norm": 0.3857930305457578, + "learning_rate": 9.558647959820984e-05, + "loss": 2.9695, + "step": 14258 + }, + { + "epoch": 0.6638731755010825, + "grad_norm": 0.34540942331269014, + "learning_rate": 9.558536680702903e-05, + "loss": 3.0023, + "step": 14259 + }, + { + "epoch": 0.6639197336871755, + "grad_norm": 0.409728515303978, + "learning_rate": 9.558425388205967e-05, + "loss": 3.0638, + "step": 14260 + }, + { + "epoch": 0.6639662918732686, + "grad_norm": 0.36424094625960185, + "learning_rate": 9.558314082330501e-05, + "loss": 2.9768, + "step": 14261 + }, + { + "epoch": 0.6640128500593617, + "grad_norm": 0.42315395688438046, + "learning_rate": 9.558202763076834e-05, + "loss": 3.1103, + "step": 14262 + }, + { + "epoch": 
0.6640594082454547, + "grad_norm": 0.3835933798431025, + "learning_rate": 9.558091430445291e-05, + "loss": 3.0696, + "step": 14263 + }, + { + "epoch": 0.6641059664315478, + "grad_norm": 0.3939047755408699, + "learning_rate": 9.557980084436196e-05, + "loss": 3.1425, + "step": 14264 + }, + { + "epoch": 0.6641525246176408, + "grad_norm": 0.3777762849969552, + "learning_rate": 9.55786872504988e-05, + "loss": 2.9942, + "step": 14265 + }, + { + "epoch": 0.664199082803734, + "grad_norm": 0.348963461716794, + "learning_rate": 9.557757352286669e-05, + "loss": 2.9809, + "step": 14266 + }, + { + "epoch": 0.6642456409898271, + "grad_norm": 0.38645923402473303, + "learning_rate": 9.557645966146889e-05, + "loss": 3.0366, + "step": 14267 + }, + { + "epoch": 0.6642921991759201, + "grad_norm": 0.35608321800946097, + "learning_rate": 9.557534566630867e-05, + "loss": 3.0189, + "step": 14268 + }, + { + "epoch": 0.6643387573620132, + "grad_norm": 0.34415871660359804, + "learning_rate": 9.55742315373893e-05, + "loss": 3.0481, + "step": 14269 + }, + { + "epoch": 0.6643853155481062, + "grad_norm": 0.38147826923331096, + "learning_rate": 9.557311727471404e-05, + "loss": 3.0501, + "step": 14270 + }, + { + "epoch": 0.6644318737341993, + "grad_norm": 0.3253651968875384, + "learning_rate": 9.557200287828618e-05, + "loss": 2.9539, + "step": 14271 + }, + { + "epoch": 0.6644784319202924, + "grad_norm": 0.3259666595668692, + "learning_rate": 9.557088834810899e-05, + "loss": 3.0717, + "step": 14272 + }, + { + "epoch": 0.6645249901063854, + "grad_norm": 0.32811268179534847, + "learning_rate": 9.556977368418572e-05, + "loss": 3.0572, + "step": 14273 + }, + { + "epoch": 0.6645715482924786, + "grad_norm": 0.3451093274557486, + "learning_rate": 9.556865888651965e-05, + "loss": 2.9578, + "step": 14274 + }, + { + "epoch": 0.6646181064785716, + "grad_norm": 0.3566697613237503, + "learning_rate": 9.556754395511405e-05, + "loss": 3.2138, + "step": 14275 + }, + { + "epoch": 0.6646646646646647, + "grad_norm": 0.37964077447941885, + "learning_rate": 9.556642888997222e-05, + "loss": 2.9785, + "step": 14276 + }, + { + "epoch": 0.6647112228507578, + "grad_norm": 0.359204125664773, + "learning_rate": 9.55653136910974e-05, + "loss": 3.1015, + "step": 14277 + }, + { + "epoch": 0.6647577810368508, + "grad_norm": 0.3794807494624155, + "learning_rate": 9.556419835849286e-05, + "loss": 2.9949, + "step": 14278 + }, + { + "epoch": 0.6648043392229439, + "grad_norm": 0.38058443927894303, + "learning_rate": 9.556308289216192e-05, + "loss": 3.0845, + "step": 14279 + }, + { + "epoch": 0.6648508974090369, + "grad_norm": 0.3658751891922358, + "learning_rate": 9.55619672921078e-05, + "loss": 3.0944, + "step": 14280 + }, + { + "epoch": 0.66489745559513, + "grad_norm": 0.3705382977757158, + "learning_rate": 9.556085155833379e-05, + "loss": 3.0457, + "step": 14281 + }, + { + "epoch": 0.664944013781223, + "grad_norm": 0.3501556635660517, + "learning_rate": 9.555973569084316e-05, + "loss": 2.9865, + "step": 14282 + }, + { + "epoch": 0.6649905719673161, + "grad_norm": 0.3615053453467726, + "learning_rate": 9.555861968963921e-05, + "loss": 2.9933, + "step": 14283 + }, + { + "epoch": 0.6650371301534093, + "grad_norm": 0.35938647624536935, + "learning_rate": 9.555750355472519e-05, + "loss": 3.1087, + "step": 14284 + }, + { + "epoch": 0.6650836883395023, + "grad_norm": 0.3938584931626981, + "learning_rate": 9.55563872861044e-05, + "loss": 3.0949, + "step": 14285 + }, + { + "epoch": 0.6651302465255954, + "grad_norm": 0.3278232553365728, + "learning_rate": 
9.555527088378009e-05, + "loss": 3.0032, + "step": 14286 + }, + { + "epoch": 0.6651768047116884, + "grad_norm": 0.395382476812066, + "learning_rate": 9.555415434775555e-05, + "loss": 3.0321, + "step": 14287 + }, + { + "epoch": 0.6652233628977815, + "grad_norm": 0.3977618868517066, + "learning_rate": 9.555303767803406e-05, + "loss": 2.9821, + "step": 14288 + }, + { + "epoch": 0.6652699210838746, + "grad_norm": 0.33255385786643404, + "learning_rate": 9.555192087461889e-05, + "loss": 2.9512, + "step": 14289 + }, + { + "epoch": 0.6653164792699676, + "grad_norm": 0.4142816179401086, + "learning_rate": 9.555080393751333e-05, + "loss": 3.0605, + "step": 14290 + }, + { + "epoch": 0.6653630374560607, + "grad_norm": 0.37994021642174197, + "learning_rate": 9.554968686672062e-05, + "loss": 3.1497, + "step": 14291 + }, + { + "epoch": 0.6654095956421537, + "grad_norm": 0.39023112672140436, + "learning_rate": 9.554856966224408e-05, + "loss": 3.0574, + "step": 14292 + }, + { + "epoch": 0.6654561538282469, + "grad_norm": 0.35269160577244674, + "learning_rate": 9.554745232408698e-05, + "loss": 2.9548, + "step": 14293 + }, + { + "epoch": 0.66550271201434, + "grad_norm": 0.38233364072180026, + "learning_rate": 9.55463348522526e-05, + "loss": 3.019, + "step": 14294 + }, + { + "epoch": 0.665549270200433, + "grad_norm": 0.37384276035093034, + "learning_rate": 9.55452172467442e-05, + "loss": 3.1379, + "step": 14295 + }, + { + "epoch": 0.6655958283865261, + "grad_norm": 0.41559488579108106, + "learning_rate": 9.554409950756509e-05, + "loss": 3.1017, + "step": 14296 + }, + { + "epoch": 0.6656423865726191, + "grad_norm": 0.3855269589188236, + "learning_rate": 9.554298163471851e-05, + "loss": 2.9903, + "step": 14297 + }, + { + "epoch": 0.6656889447587122, + "grad_norm": 0.36044236461737067, + "learning_rate": 9.554186362820778e-05, + "loss": 3.0455, + "step": 14298 + }, + { + "epoch": 0.6657355029448053, + "grad_norm": 0.4027083555021115, + "learning_rate": 9.554074548803618e-05, + "loss": 3.0671, + "step": 14299 + }, + { + "epoch": 0.6657820611308983, + "grad_norm": 0.3585032543523123, + "learning_rate": 9.553962721420696e-05, + "loss": 3.0906, + "step": 14300 + }, + { + "epoch": 0.6658286193169914, + "grad_norm": 0.41768800962835106, + "learning_rate": 9.553850880672343e-05, + "loss": 3.0373, + "step": 14301 + }, + { + "epoch": 0.6658751775030844, + "grad_norm": 0.37288727620667905, + "learning_rate": 9.553739026558885e-05, + "loss": 3.0427, + "step": 14302 + }, + { + "epoch": 0.6659217356891776, + "grad_norm": 0.42950312647279437, + "learning_rate": 9.553627159080654e-05, + "loss": 3.1479, + "step": 14303 + }, + { + "epoch": 0.6659682938752706, + "grad_norm": 0.3708196257410949, + "learning_rate": 9.553515278237975e-05, + "loss": 3.0305, + "step": 14304 + }, + { + "epoch": 0.6660148520613637, + "grad_norm": 0.3750795458986324, + "learning_rate": 9.553403384031175e-05, + "loss": 2.9647, + "step": 14305 + }, + { + "epoch": 0.6660614102474568, + "grad_norm": 0.38875791758262185, + "learning_rate": 9.553291476460587e-05, + "loss": 3.0479, + "step": 14306 + }, + { + "epoch": 0.6661079684335498, + "grad_norm": 0.37572142089858274, + "learning_rate": 9.553179555526537e-05, + "loss": 3.1012, + "step": 14307 + }, + { + "epoch": 0.6661545266196429, + "grad_norm": 0.36756413236473634, + "learning_rate": 9.553067621229352e-05, + "loss": 2.9921, + "step": 14308 + }, + { + "epoch": 0.6662010848057359, + "grad_norm": 0.3873094513554039, + "learning_rate": 9.552955673569364e-05, + "loss": 2.9074, + "step": 14309 + }, + { + 
"epoch": 0.666247642991829, + "grad_norm": 0.3575716671133309, + "learning_rate": 9.552843712546898e-05, + "loss": 3.0191, + "step": 14310 + }, + { + "epoch": 0.6662942011779222, + "grad_norm": 0.42407815356985007, + "learning_rate": 9.552731738162286e-05, + "loss": 3.0979, + "step": 14311 + }, + { + "epoch": 0.6663407593640152, + "grad_norm": 0.34970387862828417, + "learning_rate": 9.552619750415853e-05, + "loss": 3.0179, + "step": 14312 + }, + { + "epoch": 0.6663873175501083, + "grad_norm": 0.3484059329444766, + "learning_rate": 9.552507749307931e-05, + "loss": 3.039, + "step": 14313 + }, + { + "epoch": 0.6664338757362013, + "grad_norm": 0.38477331533601766, + "learning_rate": 9.552395734838846e-05, + "loss": 3.0142, + "step": 14314 + }, + { + "epoch": 0.6664804339222944, + "grad_norm": 0.3115535803284183, + "learning_rate": 9.55228370700893e-05, + "loss": 3.0939, + "step": 14315 + }, + { + "epoch": 0.6665269921083875, + "grad_norm": 0.37652317591613077, + "learning_rate": 9.552171665818508e-05, + "loss": 2.9714, + "step": 14316 + }, + { + "epoch": 0.6665735502944805, + "grad_norm": 0.3543072313230353, + "learning_rate": 9.552059611267911e-05, + "loss": 2.914, + "step": 14317 + }, + { + "epoch": 0.6666201084805736, + "grad_norm": 0.3804046010157876, + "learning_rate": 9.551947543357466e-05, + "loss": 3.0064, + "step": 14318 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.3632790690658776, + "learning_rate": 9.551835462087507e-05, + "loss": 2.9588, + "step": 14319 + }, + { + "epoch": 0.6667132248527597, + "grad_norm": 0.3538493929892248, + "learning_rate": 9.551723367458356e-05, + "loss": 3.0134, + "step": 14320 + }, + { + "epoch": 0.6667597830388529, + "grad_norm": 0.36735439924100644, + "learning_rate": 9.551611259470348e-05, + "loss": 3.0418, + "step": 14321 + }, + { + "epoch": 0.6668063412249459, + "grad_norm": 0.3653841326885966, + "learning_rate": 9.551499138123807e-05, + "loss": 3.0238, + "step": 14322 + }, + { + "epoch": 0.666852899411039, + "grad_norm": 0.4218745632956101, + "learning_rate": 9.551387003419065e-05, + "loss": 3.0637, + "step": 14323 + }, + { + "epoch": 0.666899457597132, + "grad_norm": 0.3625056596094759, + "learning_rate": 9.551274855356451e-05, + "loss": 3.0724, + "step": 14324 + }, + { + "epoch": 0.6669460157832251, + "grad_norm": 0.42862782362041746, + "learning_rate": 9.551162693936295e-05, + "loss": 2.9561, + "step": 14325 + }, + { + "epoch": 0.6669925739693181, + "grad_norm": 0.39881901597204034, + "learning_rate": 9.551050519158922e-05, + "loss": 2.9768, + "step": 14326 + }, + { + "epoch": 0.6670391321554112, + "grad_norm": 0.37227800600867617, + "learning_rate": 9.550938331024666e-05, + "loss": 2.9841, + "step": 14327 + }, + { + "epoch": 0.6670856903415043, + "grad_norm": 0.41936635279771395, + "learning_rate": 9.550826129533854e-05, + "loss": 2.9446, + "step": 14328 + }, + { + "epoch": 0.6671322485275973, + "grad_norm": 0.3381732297459533, + "learning_rate": 9.550713914686815e-05, + "loss": 2.972, + "step": 14329 + }, + { + "epoch": 0.6671788067136905, + "grad_norm": 0.391579617621128, + "learning_rate": 9.55060168648388e-05, + "loss": 3.0215, + "step": 14330 + }, + { + "epoch": 0.6672253648997835, + "grad_norm": 0.42372120172394645, + "learning_rate": 9.550489444925377e-05, + "loss": 3.1104, + "step": 14331 + }, + { + "epoch": 0.6672719230858766, + "grad_norm": 0.381489631276496, + "learning_rate": 9.550377190011636e-05, + "loss": 3.0493, + "step": 14332 + }, + { + "epoch": 0.6673184812719697, + "grad_norm": 0.37494680532136887, + 
"learning_rate": 9.550264921742986e-05, + "loss": 3.0371, + "step": 14333 + }, + { + "epoch": 0.6673650394580627, + "grad_norm": 0.3703926129077526, + "learning_rate": 9.550152640119757e-05, + "loss": 2.9187, + "step": 14334 + }, + { + "epoch": 0.6674115976441558, + "grad_norm": 0.37796931929037275, + "learning_rate": 9.550040345142277e-05, + "loss": 3.0947, + "step": 14335 + }, + { + "epoch": 0.6674581558302488, + "grad_norm": 0.42267242314138515, + "learning_rate": 9.549928036810878e-05, + "loss": 3.0147, + "step": 14336 + }, + { + "epoch": 0.6675047140163419, + "grad_norm": 0.33936892926072, + "learning_rate": 9.549815715125888e-05, + "loss": 3.0586, + "step": 14337 + }, + { + "epoch": 0.667551272202435, + "grad_norm": 0.38437319094453243, + "learning_rate": 9.549703380087637e-05, + "loss": 3.0414, + "step": 14338 + }, + { + "epoch": 0.667597830388528, + "grad_norm": 0.38270881888732056, + "learning_rate": 9.549591031696456e-05, + "loss": 2.9303, + "step": 14339 + }, + { + "epoch": 0.6676443885746212, + "grad_norm": 0.3854780534512143, + "learning_rate": 9.549478669952672e-05, + "loss": 2.9926, + "step": 14340 + }, + { + "epoch": 0.6676909467607142, + "grad_norm": 0.37414158266454867, + "learning_rate": 9.549366294856616e-05, + "loss": 3.097, + "step": 14341 + }, + { + "epoch": 0.6677375049468073, + "grad_norm": 0.3567665451192644, + "learning_rate": 9.54925390640862e-05, + "loss": 3.0016, + "step": 14342 + }, + { + "epoch": 0.6677840631329004, + "grad_norm": 0.36643943545365504, + "learning_rate": 9.549141504609012e-05, + "loss": 3.0657, + "step": 14343 + }, + { + "epoch": 0.6678306213189934, + "grad_norm": 0.3770763003332967, + "learning_rate": 9.54902908945812e-05, + "loss": 3.1728, + "step": 14344 + }, + { + "epoch": 0.6678771795050865, + "grad_norm": 0.35361481769092407, + "learning_rate": 9.548916660956277e-05, + "loss": 2.96, + "step": 14345 + }, + { + "epoch": 0.6679237376911795, + "grad_norm": 0.3453067697680026, + "learning_rate": 9.54880421910381e-05, + "loss": 2.9382, + "step": 14346 + }, + { + "epoch": 0.6679702958772726, + "grad_norm": 0.36217797602319657, + "learning_rate": 9.548691763901053e-05, + "loss": 2.969, + "step": 14347 + }, + { + "epoch": 0.6680168540633656, + "grad_norm": 0.358335069185593, + "learning_rate": 9.548579295348334e-05, + "loss": 3.0502, + "step": 14348 + }, + { + "epoch": 0.6680634122494588, + "grad_norm": 0.33928613070324737, + "learning_rate": 9.548466813445981e-05, + "loss": 3.1488, + "step": 14349 + }, + { + "epoch": 0.6681099704355519, + "grad_norm": 0.3681530045588781, + "learning_rate": 9.548354318194327e-05, + "loss": 3.0221, + "step": 14350 + }, + { + "epoch": 0.6681565286216449, + "grad_norm": 0.3374649517210079, + "learning_rate": 9.548241809593701e-05, + "loss": 3.0801, + "step": 14351 + }, + { + "epoch": 0.668203086807738, + "grad_norm": 0.3720104180894616, + "learning_rate": 9.548129287644435e-05, + "loss": 3.0735, + "step": 14352 + }, + { + "epoch": 0.668249644993831, + "grad_norm": 0.3722364650027333, + "learning_rate": 9.548016752346853e-05, + "loss": 3.0974, + "step": 14353 + }, + { + "epoch": 0.6682962031799241, + "grad_norm": 0.3500571563052796, + "learning_rate": 9.547904203701295e-05, + "loss": 2.9908, + "step": 14354 + }, + { + "epoch": 0.6683427613660172, + "grad_norm": 0.37732940557234934, + "learning_rate": 9.547791641708083e-05, + "loss": 2.9611, + "step": 14355 + }, + { + "epoch": 0.6683893195521102, + "grad_norm": 0.37953926751962064, + "learning_rate": 9.547679066367551e-05, + "loss": 3.0631, + "step": 14356 + }, + { + 
"epoch": 0.6684358777382033, + "grad_norm": 0.3562911325125418, + "learning_rate": 9.54756647768003e-05, + "loss": 2.939, + "step": 14357 + }, + { + "epoch": 0.6684824359242963, + "grad_norm": 0.3501257157427092, + "learning_rate": 9.547453875645849e-05, + "loss": 3.0425, + "step": 14358 + }, + { + "epoch": 0.6685289941103895, + "grad_norm": 0.39107596647291065, + "learning_rate": 9.547341260265339e-05, + "loss": 3.1126, + "step": 14359 + }, + { + "epoch": 0.6685755522964826, + "grad_norm": 0.33483712919975256, + "learning_rate": 9.54722863153883e-05, + "loss": 2.9388, + "step": 14360 + }, + { + "epoch": 0.6686221104825756, + "grad_norm": 0.36681233146403747, + "learning_rate": 9.547115989466653e-05, + "loss": 3.0204, + "step": 14361 + }, + { + "epoch": 0.6686686686686687, + "grad_norm": 0.3916534487622908, + "learning_rate": 9.547003334049138e-05, + "loss": 3.0322, + "step": 14362 + }, + { + "epoch": 0.6687152268547617, + "grad_norm": 0.3207193320618963, + "learning_rate": 9.546890665286619e-05, + "loss": 2.9609, + "step": 14363 + }, + { + "epoch": 0.6687617850408548, + "grad_norm": 0.3668734502331273, + "learning_rate": 9.54677798317942e-05, + "loss": 2.9763, + "step": 14364 + }, + { + "epoch": 0.6688083432269479, + "grad_norm": 0.34424112511996735, + "learning_rate": 9.546665287727878e-05, + "loss": 3.0179, + "step": 14365 + }, + { + "epoch": 0.6688549014130409, + "grad_norm": 0.3621979475225392, + "learning_rate": 9.546552578932321e-05, + "loss": 3.0294, + "step": 14366 + }, + { + "epoch": 0.668901459599134, + "grad_norm": 0.36473733816458226, + "learning_rate": 9.54643985679308e-05, + "loss": 3.0293, + "step": 14367 + }, + { + "epoch": 0.6689480177852271, + "grad_norm": 0.33769418844310106, + "learning_rate": 9.546327121310485e-05, + "loss": 3.0193, + "step": 14368 + }, + { + "epoch": 0.6689945759713202, + "grad_norm": 0.37717927522035105, + "learning_rate": 9.546214372484867e-05, + "loss": 3.0488, + "step": 14369 + }, + { + "epoch": 0.6690411341574132, + "grad_norm": 0.35581643503266247, + "learning_rate": 9.546101610316561e-05, + "loss": 3.0588, + "step": 14370 + }, + { + "epoch": 0.6690876923435063, + "grad_norm": 0.34766580515984224, + "learning_rate": 9.545988834805893e-05, + "loss": 3.0062, + "step": 14371 + }, + { + "epoch": 0.6691342505295994, + "grad_norm": 0.38858713989395993, + "learning_rate": 9.545876045953195e-05, + "loss": 2.9869, + "step": 14372 + }, + { + "epoch": 0.6691808087156924, + "grad_norm": 0.31377566537044704, + "learning_rate": 9.545763243758798e-05, + "loss": 2.9847, + "step": 14373 + }, + { + "epoch": 0.6692273669017855, + "grad_norm": 0.3640045882099475, + "learning_rate": 9.545650428223034e-05, + "loss": 3.0706, + "step": 14374 + }, + { + "epoch": 0.6692739250878785, + "grad_norm": 0.32278978410514, + "learning_rate": 9.545537599346234e-05, + "loss": 2.9201, + "step": 14375 + }, + { + "epoch": 0.6693204832739716, + "grad_norm": 0.34291168571174707, + "learning_rate": 9.545424757128731e-05, + "loss": 3.0547, + "step": 14376 + }, + { + "epoch": 0.6693670414600648, + "grad_norm": 0.36847198627344385, + "learning_rate": 9.545311901570853e-05, + "loss": 3.0477, + "step": 14377 + }, + { + "epoch": 0.6694135996461578, + "grad_norm": 0.32722535978448863, + "learning_rate": 9.545199032672932e-05, + "loss": 2.9925, + "step": 14378 + }, + { + "epoch": 0.6694601578322509, + "grad_norm": 0.3630716970104594, + "learning_rate": 9.545086150435298e-05, + "loss": 3.0578, + "step": 14379 + }, + { + "epoch": 0.6695067160183439, + "grad_norm": 0.3182629925293795, + 
"learning_rate": 9.544973254858286e-05, + "loss": 3.0007, + "step": 14380 + }, + { + "epoch": 0.669553274204437, + "grad_norm": 0.31988021974538505, + "learning_rate": 9.544860345942224e-05, + "loss": 2.8759, + "step": 14381 + }, + { + "epoch": 0.6695998323905301, + "grad_norm": 0.3296436601826319, + "learning_rate": 9.544747423687446e-05, + "loss": 3.0295, + "step": 14382 + }, + { + "epoch": 0.6696463905766231, + "grad_norm": 0.3281514962920747, + "learning_rate": 9.544634488094282e-05, + "loss": 3.157, + "step": 14383 + }, + { + "epoch": 0.6696929487627162, + "grad_norm": 0.3347769739026463, + "learning_rate": 9.544521539163063e-05, + "loss": 3.0376, + "step": 14384 + }, + { + "epoch": 0.6697395069488092, + "grad_norm": 0.33684262211080634, + "learning_rate": 9.54440857689412e-05, + "loss": 2.9996, + "step": 14385 + }, + { + "epoch": 0.6697860651349024, + "grad_norm": 0.37128623540230116, + "learning_rate": 9.544295601287787e-05, + "loss": 2.9905, + "step": 14386 + }, + { + "epoch": 0.6698326233209955, + "grad_norm": 0.3477169673253992, + "learning_rate": 9.544182612344393e-05, + "loss": 2.9215, + "step": 14387 + }, + { + "epoch": 0.6698791815070885, + "grad_norm": 0.39902045435018574, + "learning_rate": 9.544069610064271e-05, + "loss": 3.0391, + "step": 14388 + }, + { + "epoch": 0.6699257396931816, + "grad_norm": 0.36545677643014557, + "learning_rate": 9.543956594447754e-05, + "loss": 2.9983, + "step": 14389 + }, + { + "epoch": 0.6699722978792746, + "grad_norm": 0.35475844507514154, + "learning_rate": 9.543843565495169e-05, + "loss": 2.947, + "step": 14390 + }, + { + "epoch": 0.6700188560653677, + "grad_norm": 0.3498583499161125, + "learning_rate": 9.543730523206853e-05, + "loss": 3.0781, + "step": 14391 + }, + { + "epoch": 0.6700654142514607, + "grad_norm": 0.32321893152160885, + "learning_rate": 9.543617467583135e-05, + "loss": 3.0104, + "step": 14392 + }, + { + "epoch": 0.6701119724375538, + "grad_norm": 0.38643803333829024, + "learning_rate": 9.543504398624348e-05, + "loss": 3.0124, + "step": 14393 + }, + { + "epoch": 0.6701585306236469, + "grad_norm": 0.337875071074052, + "learning_rate": 9.543391316330821e-05, + "loss": 3.1611, + "step": 14394 + }, + { + "epoch": 0.67020508880974, + "grad_norm": 0.3830573589551064, + "learning_rate": 9.54327822070289e-05, + "loss": 3.109, + "step": 14395 + }, + { + "epoch": 0.6702516469958331, + "grad_norm": 0.3387090525522306, + "learning_rate": 9.543165111740887e-05, + "loss": 2.9913, + "step": 14396 + }, + { + "epoch": 0.6702982051819261, + "grad_norm": 0.36788033081194865, + "learning_rate": 9.543051989445138e-05, + "loss": 2.9415, + "step": 14397 + }, + { + "epoch": 0.6703447633680192, + "grad_norm": 0.38319922283386953, + "learning_rate": 9.54293885381598e-05, + "loss": 3.055, + "step": 14398 + }, + { + "epoch": 0.6703913215541123, + "grad_norm": 0.3982588194558733, + "learning_rate": 9.542825704853745e-05, + "loss": 3.1278, + "step": 14399 + }, + { + "epoch": 0.6704378797402053, + "grad_norm": 0.3475987301248705, + "learning_rate": 9.542712542558763e-05, + "loss": 2.8854, + "step": 14400 + }, + { + "epoch": 0.6704844379262984, + "grad_norm": 0.46115315894670333, + "learning_rate": 9.542599366931369e-05, + "loss": 3.1293, + "step": 14401 + }, + { + "epoch": 0.6705309961123914, + "grad_norm": 0.4229706306853926, + "learning_rate": 9.542486177971892e-05, + "loss": 3.034, + "step": 14402 + }, + { + "epoch": 0.6705775542984845, + "grad_norm": 0.3599611066553341, + "learning_rate": 9.542372975680666e-05, + "loss": 3.0747, + "step": 14403 + }, + { 
+ "epoch": 0.6706241124845777, + "grad_norm": 0.4143499410484804, + "learning_rate": 9.542259760058022e-05, + "loss": 3.0844, + "step": 14404 + }, + { + "epoch": 0.6706706706706707, + "grad_norm": 0.3657880211087971, + "learning_rate": 9.542146531104294e-05, + "loss": 2.93, + "step": 14405 + }, + { + "epoch": 0.6707172288567638, + "grad_norm": 0.34723319338670117, + "learning_rate": 9.542033288819813e-05, + "loss": 3.0757, + "step": 14406 + }, + { + "epoch": 0.6707637870428568, + "grad_norm": 0.38927251667623813, + "learning_rate": 9.541920033204912e-05, + "loss": 3.0644, + "step": 14407 + }, + { + "epoch": 0.6708103452289499, + "grad_norm": 0.357183840941922, + "learning_rate": 9.541806764259923e-05, + "loss": 3.0231, + "step": 14408 + }, + { + "epoch": 0.670856903415043, + "grad_norm": 0.42400966268481133, + "learning_rate": 9.541693481985179e-05, + "loss": 3.0022, + "step": 14409 + }, + { + "epoch": 0.670903461601136, + "grad_norm": 0.41336126072192036, + "learning_rate": 9.541580186381011e-05, + "loss": 3.1616, + "step": 14410 + }, + { + "epoch": 0.6709500197872291, + "grad_norm": 0.41240423788890257, + "learning_rate": 9.541466877447753e-05, + "loss": 2.9998, + "step": 14411 + }, + { + "epoch": 0.6709965779733221, + "grad_norm": 0.3952772304659271, + "learning_rate": 9.541353555185738e-05, + "loss": 3.0913, + "step": 14412 + }, + { + "epoch": 0.6710431361594152, + "grad_norm": 0.40722748361184397, + "learning_rate": 9.541240219595297e-05, + "loss": 3.0649, + "step": 14413 + }, + { + "epoch": 0.6710896943455082, + "grad_norm": 0.3571773055858249, + "learning_rate": 9.541126870676764e-05, + "loss": 3.0391, + "step": 14414 + }, + { + "epoch": 0.6711362525316014, + "grad_norm": 0.38203112780954657, + "learning_rate": 9.54101350843047e-05, + "loss": 2.9489, + "step": 14415 + }, + { + "epoch": 0.6711828107176945, + "grad_norm": 0.37984374822579225, + "learning_rate": 9.54090013285675e-05, + "loss": 3.0944, + "step": 14416 + }, + { + "epoch": 0.6712293689037875, + "grad_norm": 0.3989159843613536, + "learning_rate": 9.540786743955934e-05, + "loss": 2.9788, + "step": 14417 + }, + { + "epoch": 0.6712759270898806, + "grad_norm": 0.37568600148174586, + "learning_rate": 9.540673341728357e-05, + "loss": 3.0985, + "step": 14418 + }, + { + "epoch": 0.6713224852759736, + "grad_norm": 0.421457102677575, + "learning_rate": 9.540559926174351e-05, + "loss": 3.1124, + "step": 14419 + }, + { + "epoch": 0.6713690434620667, + "grad_norm": 0.33745735661469156, + "learning_rate": 9.540446497294248e-05, + "loss": 3.0088, + "step": 14420 + }, + { + "epoch": 0.6714156016481598, + "grad_norm": 0.4177759383870581, + "learning_rate": 9.540333055088385e-05, + "loss": 3.092, + "step": 14421 + }, + { + "epoch": 0.6714621598342528, + "grad_norm": 0.3676089632395086, + "learning_rate": 9.540219599557088e-05, + "loss": 2.9766, + "step": 14422 + }, + { + "epoch": 0.671508718020346, + "grad_norm": 0.369684049897486, + "learning_rate": 9.540106130700696e-05, + "loss": 3.0235, + "step": 14423 + }, + { + "epoch": 0.671555276206439, + "grad_norm": 0.35816313057564575, + "learning_rate": 9.539992648519538e-05, + "loss": 2.9998, + "step": 14424 + }, + { + "epoch": 0.6716018343925321, + "grad_norm": 0.3942624971693115, + "learning_rate": 9.539879153013951e-05, + "loss": 2.9962, + "step": 14425 + }, + { + "epoch": 0.6716483925786252, + "grad_norm": 0.3633443993218226, + "learning_rate": 9.539765644184264e-05, + "loss": 3.1222, + "step": 14426 + }, + { + "epoch": 0.6716949507647182, + "grad_norm": 0.3576187814974664, + 
"learning_rate": 9.539652122030813e-05, + "loss": 2.9713, + "step": 14427 + }, + { + "epoch": 0.6717415089508113, + "grad_norm": 0.3500132530356824, + "learning_rate": 9.539538586553931e-05, + "loss": 2.9563, + "step": 14428 + }, + { + "epoch": 0.6717880671369043, + "grad_norm": 0.3650751992502892, + "learning_rate": 9.539425037753949e-05, + "loss": 2.906, + "step": 14429 + }, + { + "epoch": 0.6718346253229974, + "grad_norm": 0.36364098081920276, + "learning_rate": 9.539311475631202e-05, + "loss": 3.0485, + "step": 14430 + }, + { + "epoch": 0.6718811835090905, + "grad_norm": 0.3673423164949239, + "learning_rate": 9.539197900186023e-05, + "loss": 3.1016, + "step": 14431 + }, + { + "epoch": 0.6719277416951835, + "grad_norm": 0.3598112537249969, + "learning_rate": 9.539084311418747e-05, + "loss": 2.9693, + "step": 14432 + }, + { + "epoch": 0.6719742998812767, + "grad_norm": 0.3951850594607062, + "learning_rate": 9.538970709329705e-05, + "loss": 2.964, + "step": 14433 + }, + { + "epoch": 0.6720208580673697, + "grad_norm": 0.35602218275990555, + "learning_rate": 9.538857093919229e-05, + "loss": 3.0684, + "step": 14434 + }, + { + "epoch": 0.6720674162534628, + "grad_norm": 0.36945447009355803, + "learning_rate": 9.538743465187656e-05, + "loss": 3.0685, + "step": 14435 + }, + { + "epoch": 0.6721139744395558, + "grad_norm": 0.3430351505187379, + "learning_rate": 9.538629823135318e-05, + "loss": 3.0856, + "step": 14436 + }, + { + "epoch": 0.6721605326256489, + "grad_norm": 0.3822862120596, + "learning_rate": 9.538516167762551e-05, + "loss": 3.0685, + "step": 14437 + }, + { + "epoch": 0.672207090811742, + "grad_norm": 0.3670425339833812, + "learning_rate": 9.538402499069683e-05, + "loss": 3.0667, + "step": 14438 + }, + { + "epoch": 0.672253648997835, + "grad_norm": 0.32430332789397576, + "learning_rate": 9.538288817057051e-05, + "loss": 2.9437, + "step": 14439 + }, + { + "epoch": 0.6723002071839281, + "grad_norm": 0.36767238462562685, + "learning_rate": 9.53817512172499e-05, + "loss": 3.1567, + "step": 14440 + }, + { + "epoch": 0.6723467653700211, + "grad_norm": 0.38373142263253707, + "learning_rate": 9.538061413073831e-05, + "loss": 3.0501, + "step": 14441 + }, + { + "epoch": 0.6723933235561143, + "grad_norm": 0.38876743097767424, + "learning_rate": 9.537947691103908e-05, + "loss": 2.9976, + "step": 14442 + }, + { + "epoch": 0.6724398817422074, + "grad_norm": 0.3808433252760406, + "learning_rate": 9.537833955815558e-05, + "loss": 3.0791, + "step": 14443 + }, + { + "epoch": 0.6724864399283004, + "grad_norm": 0.4133577241759314, + "learning_rate": 9.537720207209111e-05, + "loss": 2.984, + "step": 14444 + }, + { + "epoch": 0.6725329981143935, + "grad_norm": 0.3593862937518306, + "learning_rate": 9.537606445284903e-05, + "loss": 3.066, + "step": 14445 + }, + { + "epoch": 0.6725795563004865, + "grad_norm": 0.41877631663548637, + "learning_rate": 9.537492670043267e-05, + "loss": 2.9956, + "step": 14446 + }, + { + "epoch": 0.6726261144865796, + "grad_norm": 0.4037621813783657, + "learning_rate": 9.537378881484535e-05, + "loss": 2.9338, + "step": 14447 + }, + { + "epoch": 0.6726726726726727, + "grad_norm": 0.34779933992799705, + "learning_rate": 9.537265079609045e-05, + "loss": 3.0222, + "step": 14448 + }, + { + "epoch": 0.6727192308587657, + "grad_norm": 0.36273945027228927, + "learning_rate": 9.537151264417127e-05, + "loss": 3.0622, + "step": 14449 + }, + { + "epoch": 0.6727657890448588, + "grad_norm": 0.3583201218266297, + "learning_rate": 9.53703743590912e-05, + "loss": 3.0251, + "step": 14450 + }, + { 
+ "epoch": 0.6728123472309518, + "grad_norm": 0.34695080218108243, + "learning_rate": 9.536923594085353e-05, + "loss": 3.033, + "step": 14451 + }, + { + "epoch": 0.672858905417045, + "grad_norm": 0.9454451506346022, + "learning_rate": 9.536809738946163e-05, + "loss": 3.0263, + "step": 14452 + }, + { + "epoch": 0.6729054636031381, + "grad_norm": 0.5621235878514057, + "learning_rate": 9.536695870491883e-05, + "loss": 3.0151, + "step": 14453 + }, + { + "epoch": 0.6729520217892311, + "grad_norm": 0.43294303263918543, + "learning_rate": 9.536581988722848e-05, + "loss": 3.1318, + "step": 14454 + }, + { + "epoch": 0.6729985799753242, + "grad_norm": 0.43421898308605544, + "learning_rate": 9.536468093639392e-05, + "loss": 3.0364, + "step": 14455 + }, + { + "epoch": 0.6730451381614172, + "grad_norm": 0.39809953931835823, + "learning_rate": 9.536354185241847e-05, + "loss": 3.0537, + "step": 14456 + }, + { + "epoch": 0.6730916963475103, + "grad_norm": 0.4266594866659262, + "learning_rate": 9.536240263530552e-05, + "loss": 3.0483, + "step": 14457 + }, + { + "epoch": 0.6731382545336033, + "grad_norm": 0.3766472721270523, + "learning_rate": 9.536126328505837e-05, + "loss": 2.9845, + "step": 14458 + }, + { + "epoch": 0.6731848127196964, + "grad_norm": 0.36373230965741815, + "learning_rate": 9.53601238016804e-05, + "loss": 3.0239, + "step": 14459 + }, + { + "epoch": 0.6732313709057896, + "grad_norm": 0.3576458911812897, + "learning_rate": 9.535898418517492e-05, + "loss": 3.1014, + "step": 14460 + }, + { + "epoch": 0.6732779290918826, + "grad_norm": 0.3668854329208527, + "learning_rate": 9.535784443554528e-05, + "loss": 3.0154, + "step": 14461 + }, + { + "epoch": 0.6733244872779757, + "grad_norm": 0.34474962400367437, + "learning_rate": 9.535670455279486e-05, + "loss": 2.9465, + "step": 14462 + }, + { + "epoch": 0.6733710454640687, + "grad_norm": 0.37846869055828203, + "learning_rate": 9.535556453692696e-05, + "loss": 3.048, + "step": 14463 + }, + { + "epoch": 0.6734176036501618, + "grad_norm": 0.3676819347658566, + "learning_rate": 9.535442438794495e-05, + "loss": 3.0023, + "step": 14464 + }, + { + "epoch": 0.6734641618362549, + "grad_norm": 0.3177956752519597, + "learning_rate": 9.535328410585219e-05, + "loss": 3.0965, + "step": 14465 + }, + { + "epoch": 0.6735107200223479, + "grad_norm": 0.4030716151833875, + "learning_rate": 9.535214369065199e-05, + "loss": 3.091, + "step": 14466 + }, + { + "epoch": 0.673557278208441, + "grad_norm": 0.32263399607315424, + "learning_rate": 9.535100314234773e-05, + "loss": 2.9579, + "step": 14467 + }, + { + "epoch": 0.673603836394534, + "grad_norm": 0.3670534010640478, + "learning_rate": 9.534986246094274e-05, + "loss": 3.0038, + "step": 14468 + }, + { + "epoch": 0.6736503945806271, + "grad_norm": 0.3385846505378693, + "learning_rate": 9.534872164644035e-05, + "loss": 2.9968, + "step": 14469 + }, + { + "epoch": 0.6736969527667203, + "grad_norm": 0.3324295682645731, + "learning_rate": 9.534758069884396e-05, + "loss": 2.9413, + "step": 14470 + }, + { + "epoch": 0.6737435109528133, + "grad_norm": 0.3794923461858314, + "learning_rate": 9.534643961815688e-05, + "loss": 3.0579, + "step": 14471 + }, + { + "epoch": 0.6737900691389064, + "grad_norm": 0.3754107807514251, + "learning_rate": 9.534529840438245e-05, + "loss": 3.0492, + "step": 14472 + }, + { + "epoch": 0.6738366273249994, + "grad_norm": 0.332259897830851, + "learning_rate": 9.534415705752406e-05, + "loss": 3.0621, + "step": 14473 + }, + { + "epoch": 0.6738831855110925, + "grad_norm": 0.3992197018145991, + 
"learning_rate": 9.534301557758502e-05, + "loss": 3.0085, + "step": 14474 + }, + { + "epoch": 0.6739297436971856, + "grad_norm": 0.40810795745424727, + "learning_rate": 9.53418739645687e-05, + "loss": 3.1141, + "step": 14475 + }, + { + "epoch": 0.6739763018832786, + "grad_norm": 0.3410078918168745, + "learning_rate": 9.534073221847845e-05, + "loss": 3.0545, + "step": 14476 + }, + { + "epoch": 0.6740228600693717, + "grad_norm": 0.3990478205610789, + "learning_rate": 9.533959033931761e-05, + "loss": 3.0409, + "step": 14477 + }, + { + "epoch": 0.6740694182554647, + "grad_norm": 0.38735437326039573, + "learning_rate": 9.533844832708955e-05, + "loss": 2.9267, + "step": 14478 + }, + { + "epoch": 0.6741159764415579, + "grad_norm": 0.5606356863337476, + "learning_rate": 9.533730618179762e-05, + "loss": 3.0085, + "step": 14479 + }, + { + "epoch": 0.6741625346276509, + "grad_norm": 0.5450335525020583, + "learning_rate": 9.533616390344513e-05, + "loss": 2.9169, + "step": 14480 + }, + { + "epoch": 0.674209092813744, + "grad_norm": 0.40091917144839273, + "learning_rate": 9.53350214920355e-05, + "loss": 3.0186, + "step": 14481 + }, + { + "epoch": 0.6742556509998371, + "grad_norm": 0.4627908958773775, + "learning_rate": 9.533387894757202e-05, + "loss": 2.941, + "step": 14482 + }, + { + "epoch": 0.6743022091859301, + "grad_norm": 0.36453534381174496, + "learning_rate": 9.533273627005809e-05, + "loss": 3.0332, + "step": 14483 + }, + { + "epoch": 0.6743487673720232, + "grad_norm": 0.4900120432740112, + "learning_rate": 9.533159345949704e-05, + "loss": 3.0125, + "step": 14484 + }, + { + "epoch": 0.6743953255581162, + "grad_norm": 0.40972928093467614, + "learning_rate": 9.533045051589222e-05, + "loss": 3.0521, + "step": 14485 + }, + { + "epoch": 0.6744418837442093, + "grad_norm": 0.6355968006515407, + "learning_rate": 9.5329307439247e-05, + "loss": 3.0505, + "step": 14486 + }, + { + "epoch": 0.6744884419303024, + "grad_norm": 0.5315441048739659, + "learning_rate": 9.532816422956473e-05, + "loss": 3.0474, + "step": 14487 + }, + { + "epoch": 0.6745350001163954, + "grad_norm": 0.4710620784489195, + "learning_rate": 9.532702088684877e-05, + "loss": 3.1465, + "step": 14488 + }, + { + "epoch": 0.6745815583024886, + "grad_norm": 0.4275389010948693, + "learning_rate": 9.532587741110246e-05, + "loss": 3.0526, + "step": 14489 + }, + { + "epoch": 0.6746281164885816, + "grad_norm": 0.41442838711318025, + "learning_rate": 9.532473380232917e-05, + "loss": 3.0509, + "step": 14490 + }, + { + "epoch": 0.6746746746746747, + "grad_norm": 0.416533136063542, + "learning_rate": 9.532359006053224e-05, + "loss": 3.0281, + "step": 14491 + }, + { + "epoch": 0.6747212328607678, + "grad_norm": 0.4206541688671564, + "learning_rate": 9.532244618571505e-05, + "loss": 3.1109, + "step": 14492 + }, + { + "epoch": 0.6747677910468608, + "grad_norm": 0.4992990772297698, + "learning_rate": 9.532130217788095e-05, + "loss": 3.0596, + "step": 14493 + }, + { + "epoch": 0.6748143492329539, + "grad_norm": 0.42889352526845637, + "learning_rate": 9.532015803703328e-05, + "loss": 3.1336, + "step": 14494 + }, + { + "epoch": 0.6748609074190469, + "grad_norm": 0.41318533914820077, + "learning_rate": 9.531901376317543e-05, + "loss": 3.1079, + "step": 14495 + }, + { + "epoch": 0.67490746560514, + "grad_norm": 0.4163267086690418, + "learning_rate": 9.53178693563107e-05, + "loss": 3.0598, + "step": 14496 + }, + { + "epoch": 0.6749540237912331, + "grad_norm": 0.3878183699876167, + "learning_rate": 9.531672481644252e-05, + "loss": 3.0665, + "step": 14497 + }, + { 
+ "epoch": 0.6750005819773262, + "grad_norm": 0.4860416392524713, + "learning_rate": 9.531558014357422e-05, + "loss": 3.0253, + "step": 14498 + }, + { + "epoch": 0.6750471401634193, + "grad_norm": 0.3909740016132407, + "learning_rate": 9.531443533770913e-05, + "loss": 2.9799, + "step": 14499 + }, + { + "epoch": 0.6750936983495123, + "grad_norm": 0.3628058087696343, + "learning_rate": 9.531329039885066e-05, + "loss": 2.9451, + "step": 14500 + }, + { + "epoch": 0.6751402565356054, + "grad_norm": 0.3960463475280966, + "learning_rate": 9.531214532700214e-05, + "loss": 3.003, + "step": 14501 + }, + { + "epoch": 0.6751868147216984, + "grad_norm": 0.3635694509058993, + "learning_rate": 9.531100012216695e-05, + "loss": 3.1448, + "step": 14502 + }, + { + "epoch": 0.6752333729077915, + "grad_norm": 0.38976078994140784, + "learning_rate": 9.530985478434842e-05, + "loss": 3.0161, + "step": 14503 + }, + { + "epoch": 0.6752799310938846, + "grad_norm": 0.43091651259311353, + "learning_rate": 9.530870931354994e-05, + "loss": 3.0696, + "step": 14504 + }, + { + "epoch": 0.6753264892799776, + "grad_norm": 0.36994917954615575, + "learning_rate": 9.530756370977486e-05, + "loss": 3.0704, + "step": 14505 + }, + { + "epoch": 0.6753730474660707, + "grad_norm": 0.4596980042258268, + "learning_rate": 9.530641797302655e-05, + "loss": 2.9786, + "step": 14506 + }, + { + "epoch": 0.6754196056521637, + "grad_norm": 0.4727620508643889, + "learning_rate": 9.530527210330835e-05, + "loss": 3.0258, + "step": 14507 + }, + { + "epoch": 0.6754661638382569, + "grad_norm": 0.40283263375561246, + "learning_rate": 9.530412610062364e-05, + "loss": 3.0665, + "step": 14508 + }, + { + "epoch": 0.67551272202435, + "grad_norm": 0.45643974756534716, + "learning_rate": 9.530297996497582e-05, + "loss": 3.0872, + "step": 14509 + }, + { + "epoch": 0.675559280210443, + "grad_norm": 0.3656098999470736, + "learning_rate": 9.530183369636818e-05, + "loss": 3.0609, + "step": 14510 + }, + { + "epoch": 0.6756058383965361, + "grad_norm": 0.42249548060775066, + "learning_rate": 9.530068729480413e-05, + "loss": 3.055, + "step": 14511 + }, + { + "epoch": 0.6756523965826291, + "grad_norm": 0.4452371637012971, + "learning_rate": 9.529954076028702e-05, + "loss": 3.0215, + "step": 14512 + }, + { + "epoch": 0.6756989547687222, + "grad_norm": 0.3878309485590523, + "learning_rate": 9.529839409282022e-05, + "loss": 2.9526, + "step": 14513 + }, + { + "epoch": 0.6757455129548153, + "grad_norm": 0.43086485054711865, + "learning_rate": 9.529724729240711e-05, + "loss": 3.008, + "step": 14514 + }, + { + "epoch": 0.6757920711409083, + "grad_norm": 0.37282911801656593, + "learning_rate": 9.529610035905103e-05, + "loss": 3.0648, + "step": 14515 + }, + { + "epoch": 0.6758386293270014, + "grad_norm": 0.42701824829899415, + "learning_rate": 9.529495329275538e-05, + "loss": 3.0797, + "step": 14516 + }, + { + "epoch": 0.6758851875130945, + "grad_norm": 0.397135258988442, + "learning_rate": 9.529380609352347e-05, + "loss": 2.9314, + "step": 14517 + }, + { + "epoch": 0.6759317456991876, + "grad_norm": 0.36099136661195874, + "learning_rate": 9.529265876135872e-05, + "loss": 2.963, + "step": 14518 + }, + { + "epoch": 0.6759783038852807, + "grad_norm": 0.4254666392477945, + "learning_rate": 9.529151129626448e-05, + "loss": 2.9493, + "step": 14519 + }, + { + "epoch": 0.6760248620713737, + "grad_norm": 0.3757095730958241, + "learning_rate": 9.52903636982441e-05, + "loss": 3.0714, + "step": 14520 + }, + { + "epoch": 0.6760714202574668, + "grad_norm": 0.3539823427401734, + 
"learning_rate": 9.528921596730098e-05, + "loss": 2.8718, + "step": 14521 + }, + { + "epoch": 0.6761179784435598, + "grad_norm": 0.3771687086332379, + "learning_rate": 9.528806810343848e-05, + "loss": 2.9967, + "step": 14522 + }, + { + "epoch": 0.6761645366296529, + "grad_norm": 0.33395979188142144, + "learning_rate": 9.528692010665995e-05, + "loss": 3.0239, + "step": 14523 + }, + { + "epoch": 0.6762110948157459, + "grad_norm": 0.35057437606124897, + "learning_rate": 9.528577197696878e-05, + "loss": 2.9707, + "step": 14524 + }, + { + "epoch": 0.676257653001839, + "grad_norm": 0.3450885565249069, + "learning_rate": 9.528462371436831e-05, + "loss": 3.1445, + "step": 14525 + }, + { + "epoch": 0.6763042111879322, + "grad_norm": 0.37924398393677894, + "learning_rate": 9.528347531886195e-05, + "loss": 3.029, + "step": 14526 + }, + { + "epoch": 0.6763507693740252, + "grad_norm": 0.36022548809473565, + "learning_rate": 9.528232679045304e-05, + "loss": 3.0659, + "step": 14527 + }, + { + "epoch": 0.6763973275601183, + "grad_norm": 0.37957853515065193, + "learning_rate": 9.528117812914496e-05, + "loss": 2.9734, + "step": 14528 + }, + { + "epoch": 0.6764438857462113, + "grad_norm": 0.35527037673444284, + "learning_rate": 9.52800293349411e-05, + "loss": 3.0373, + "step": 14529 + }, + { + "epoch": 0.6764904439323044, + "grad_norm": 0.38659296298330065, + "learning_rate": 9.52788804078448e-05, + "loss": 3.11, + "step": 14530 + }, + { + "epoch": 0.6765370021183975, + "grad_norm": 0.35701118364902784, + "learning_rate": 9.527773134785946e-05, + "loss": 3.1158, + "step": 14531 + }, + { + "epoch": 0.6765835603044905, + "grad_norm": 0.35439123190300953, + "learning_rate": 9.527658215498843e-05, + "loss": 2.9248, + "step": 14532 + }, + { + "epoch": 0.6766301184905836, + "grad_norm": 0.3907021994056887, + "learning_rate": 9.527543282923508e-05, + "loss": 2.925, + "step": 14533 + }, + { + "epoch": 0.6766766766766766, + "grad_norm": 0.3395897567719041, + "learning_rate": 9.527428337060282e-05, + "loss": 2.8708, + "step": 14534 + }, + { + "epoch": 0.6767232348627698, + "grad_norm": 0.38643836920639174, + "learning_rate": 9.527313377909499e-05, + "loss": 3.0099, + "step": 14535 + }, + { + "epoch": 0.6767697930488629, + "grad_norm": 0.3419439494402256, + "learning_rate": 9.527198405471496e-05, + "loss": 2.9809, + "step": 14536 + }, + { + "epoch": 0.6768163512349559, + "grad_norm": 0.37287633184445845, + "learning_rate": 9.527083419746612e-05, + "loss": 2.9646, + "step": 14537 + }, + { + "epoch": 0.676862909421049, + "grad_norm": 0.36861003790461433, + "learning_rate": 9.526968420735185e-05, + "loss": 3.0306, + "step": 14538 + }, + { + "epoch": 0.676909467607142, + "grad_norm": 0.37029447463777354, + "learning_rate": 9.526853408437552e-05, + "loss": 2.9727, + "step": 14539 + }, + { + "epoch": 0.6769560257932351, + "grad_norm": 0.4106411144386445, + "learning_rate": 9.526738382854049e-05, + "loss": 3.0794, + "step": 14540 + }, + { + "epoch": 0.6770025839793282, + "grad_norm": 0.35976231883086585, + "learning_rate": 9.526623343985015e-05, + "loss": 2.8341, + "step": 14541 + }, + { + "epoch": 0.6770491421654212, + "grad_norm": 0.36585574356430123, + "learning_rate": 9.526508291830788e-05, + "loss": 3.0553, + "step": 14542 + }, + { + "epoch": 0.6770957003515143, + "grad_norm": 0.360874157812027, + "learning_rate": 9.526393226391705e-05, + "loss": 3.0009, + "step": 14543 + }, + { + "epoch": 0.6771422585376073, + "grad_norm": 0.37344377971598647, + "learning_rate": 9.526278147668104e-05, + "loss": 3.0953, + "step": 14544 + 
}, + { + "epoch": 0.6771888167237005, + "grad_norm": 0.35639287039605544, + "learning_rate": 9.526163055660322e-05, + "loss": 3.094, + "step": 14545 + }, + { + "epoch": 0.6772353749097935, + "grad_norm": 0.3833923541931708, + "learning_rate": 9.526047950368697e-05, + "loss": 3.1623, + "step": 14546 + }, + { + "epoch": 0.6772819330958866, + "grad_norm": 0.3767696056590085, + "learning_rate": 9.52593283179357e-05, + "loss": 2.9349, + "step": 14547 + }, + { + "epoch": 0.6773284912819797, + "grad_norm": 0.38613701792461647, + "learning_rate": 9.525817699935272e-05, + "loss": 2.9438, + "step": 14548 + }, + { + "epoch": 0.6773750494680727, + "grad_norm": 0.369848185599674, + "learning_rate": 9.525702554794147e-05, + "loss": 2.9883, + "step": 14549 + }, + { + "epoch": 0.6774216076541658, + "grad_norm": 0.3423712648152579, + "learning_rate": 9.52558739637053e-05, + "loss": 3.0391, + "step": 14550 + }, + { + "epoch": 0.6774681658402588, + "grad_norm": 0.3729458243030114, + "learning_rate": 9.525472224664761e-05, + "loss": 2.964, + "step": 14551 + }, + { + "epoch": 0.6775147240263519, + "grad_norm": 0.3772734576838693, + "learning_rate": 9.525357039677176e-05, + "loss": 3.0147, + "step": 14552 + }, + { + "epoch": 0.677561282212445, + "grad_norm": 0.4044389838112006, + "learning_rate": 9.525241841408115e-05, + "loss": 2.9191, + "step": 14553 + }, + { + "epoch": 0.677607840398538, + "grad_norm": 0.4005948724025079, + "learning_rate": 9.525126629857914e-05, + "loss": 3.0938, + "step": 14554 + }, + { + "epoch": 0.6776543985846312, + "grad_norm": 0.4301028093097214, + "learning_rate": 9.525011405026912e-05, + "loss": 3.1181, + "step": 14555 + }, + { + "epoch": 0.6777009567707242, + "grad_norm": 0.36938406808307067, + "learning_rate": 9.524896166915447e-05, + "loss": 3.1269, + "step": 14556 + }, + { + "epoch": 0.6777475149568173, + "grad_norm": 0.396494860785198, + "learning_rate": 9.524780915523859e-05, + "loss": 2.9769, + "step": 14557 + }, + { + "epoch": 0.6777940731429104, + "grad_norm": 0.3081749598742197, + "learning_rate": 9.524665650852483e-05, + "loss": 2.9634, + "step": 14558 + }, + { + "epoch": 0.6778406313290034, + "grad_norm": 0.3798675687211483, + "learning_rate": 9.52455037290166e-05, + "loss": 3.0293, + "step": 14559 + }, + { + "epoch": 0.6778871895150965, + "grad_norm": 0.37847724562764534, + "learning_rate": 9.524435081671728e-05, + "loss": 2.9763, + "step": 14560 + }, + { + "epoch": 0.6779337477011895, + "grad_norm": 0.3554719868372501, + "learning_rate": 9.524319777163025e-05, + "loss": 3.027, + "step": 14561 + }, + { + "epoch": 0.6779803058872826, + "grad_norm": 0.3592762277395613, + "learning_rate": 9.524204459375887e-05, + "loss": 3.0248, + "step": 14562 + }, + { + "epoch": 0.6780268640733756, + "grad_norm": 0.3354603649646488, + "learning_rate": 9.524089128310655e-05, + "loss": 2.9902, + "step": 14563 + }, + { + "epoch": 0.6780734222594688, + "grad_norm": 0.3577476722309101, + "learning_rate": 9.523973783967669e-05, + "loss": 3.0113, + "step": 14564 + }, + { + "epoch": 0.6781199804455619, + "grad_norm": 0.39144689039756436, + "learning_rate": 9.523858426347266e-05, + "loss": 3.0788, + "step": 14565 + }, + { + "epoch": 0.6781665386316549, + "grad_norm": 0.3296710252805901, + "learning_rate": 9.523743055449781e-05, + "loss": 2.9013, + "step": 14566 + }, + { + "epoch": 0.678213096817748, + "grad_norm": 0.35541513933876157, + "learning_rate": 9.52362767127556e-05, + "loss": 3.0587, + "step": 14567 + }, + { + "epoch": 0.678259655003841, + "grad_norm": 0.4064816048857576, + 
"learning_rate": 9.523512273824932e-05, + "loss": 3.0441, + "step": 14568 + }, + { + "epoch": 0.6783062131899341, + "grad_norm": 0.3604060195326468, + "learning_rate": 9.523396863098246e-05, + "loss": 3.024, + "step": 14569 + }, + { + "epoch": 0.6783527713760272, + "grad_norm": 0.36773779814876534, + "learning_rate": 9.523281439095834e-05, + "loss": 2.9347, + "step": 14570 + }, + { + "epoch": 0.6783993295621202, + "grad_norm": 0.342668366810171, + "learning_rate": 9.523166001818035e-05, + "loss": 2.9553, + "step": 14571 + }, + { + "epoch": 0.6784458877482133, + "grad_norm": 0.38624765071858985, + "learning_rate": 9.523050551265193e-05, + "loss": 3.0366, + "step": 14572 + }, + { + "epoch": 0.6784924459343064, + "grad_norm": 0.34257044238853285, + "learning_rate": 9.522935087437641e-05, + "loss": 3.082, + "step": 14573 + }, + { + "epoch": 0.6785390041203995, + "grad_norm": 0.36697689356651025, + "learning_rate": 9.52281961033572e-05, + "loss": 3.0545, + "step": 14574 + }, + { + "epoch": 0.6785855623064926, + "grad_norm": 0.38473095273737873, + "learning_rate": 9.522704119959769e-05, + "loss": 3.1669, + "step": 14575 + }, + { + "epoch": 0.6786321204925856, + "grad_norm": 0.34356821438715823, + "learning_rate": 9.522588616310127e-05, + "loss": 2.9648, + "step": 14576 + }, + { + "epoch": 0.6786786786786787, + "grad_norm": 0.3772993233795579, + "learning_rate": 9.522473099387133e-05, + "loss": 3.0622, + "step": 14577 + }, + { + "epoch": 0.6787252368647717, + "grad_norm": 0.3705753165350878, + "learning_rate": 9.522357569191128e-05, + "loss": 3.0757, + "step": 14578 + }, + { + "epoch": 0.6787717950508648, + "grad_norm": 0.3584044799515854, + "learning_rate": 9.522242025722446e-05, + "loss": 3.1089, + "step": 14579 + }, + { + "epoch": 0.6788183532369579, + "grad_norm": 0.3121105272161164, + "learning_rate": 9.522126468981431e-05, + "loss": 2.9802, + "step": 14580 + }, + { + "epoch": 0.6788649114230509, + "grad_norm": 0.34996135156901076, + "learning_rate": 9.52201089896842e-05, + "loss": 3.0317, + "step": 14581 + }, + { + "epoch": 0.6789114696091441, + "grad_norm": 0.32962528577816896, + "learning_rate": 9.521895315683752e-05, + "loss": 2.897, + "step": 14582 + }, + { + "epoch": 0.6789580277952371, + "grad_norm": 0.3424387522578013, + "learning_rate": 9.521779719127768e-05, + "loss": 2.9743, + "step": 14583 + }, + { + "epoch": 0.6790045859813302, + "grad_norm": 0.3402807022076957, + "learning_rate": 9.521664109300804e-05, + "loss": 2.9879, + "step": 14584 + }, + { + "epoch": 0.6790511441674232, + "grad_norm": 0.32399374202520836, + "learning_rate": 9.521548486203202e-05, + "loss": 2.9652, + "step": 14585 + }, + { + "epoch": 0.6790977023535163, + "grad_norm": 0.32972381807209283, + "learning_rate": 9.521432849835302e-05, + "loss": 3.0622, + "step": 14586 + }, + { + "epoch": 0.6791442605396094, + "grad_norm": 0.33681081721615036, + "learning_rate": 9.52131720019744e-05, + "loss": 3.0605, + "step": 14587 + }, + { + "epoch": 0.6791908187257024, + "grad_norm": 0.36050309811234255, + "learning_rate": 9.521201537289959e-05, + "loss": 2.9138, + "step": 14588 + }, + { + "epoch": 0.6792373769117955, + "grad_norm": 0.35527660356152596, + "learning_rate": 9.521085861113197e-05, + "loss": 3.104, + "step": 14589 + }, + { + "epoch": 0.6792839350978885, + "grad_norm": 0.34084518450694595, + "learning_rate": 9.520970171667491e-05, + "loss": 2.971, + "step": 14590 + }, + { + "epoch": 0.6793304932839817, + "grad_norm": 0.34832210853863693, + "learning_rate": 9.520854468953187e-05, + "loss": 2.9322, + "step": 14591 + 
}, + { + "epoch": 0.6793770514700748, + "grad_norm": 0.3778784616765615, + "learning_rate": 9.520738752970617e-05, + "loss": 3.0078, + "step": 14592 + }, + { + "epoch": 0.6794236096561678, + "grad_norm": 0.3716409348656393, + "learning_rate": 9.520623023720125e-05, + "loss": 3.087, + "step": 14593 + }, + { + "epoch": 0.6794701678422609, + "grad_norm": 0.3842289818245161, + "learning_rate": 9.520507281202051e-05, + "loss": 3.0011, + "step": 14594 + }, + { + "epoch": 0.6795167260283539, + "grad_norm": 0.36540310134774384, + "learning_rate": 9.520391525416733e-05, + "loss": 3.0847, + "step": 14595 + }, + { + "epoch": 0.679563284214447, + "grad_norm": 0.3835240412763036, + "learning_rate": 9.520275756364509e-05, + "loss": 3.0738, + "step": 14596 + }, + { + "epoch": 0.6796098424005401, + "grad_norm": 0.37339013779198127, + "learning_rate": 9.520159974045723e-05, + "loss": 3.1443, + "step": 14597 + }, + { + "epoch": 0.6796564005866331, + "grad_norm": 0.3325185389183247, + "learning_rate": 9.520044178460713e-05, + "loss": 3.0911, + "step": 14598 + }, + { + "epoch": 0.6797029587727262, + "grad_norm": 0.364989132663532, + "learning_rate": 9.519928369609818e-05, + "loss": 2.9835, + "step": 14599 + }, + { + "epoch": 0.6797495169588192, + "grad_norm": 0.3870917443292548, + "learning_rate": 9.519812547493378e-05, + "loss": 3.1452, + "step": 14600 + }, + { + "epoch": 0.6797960751449124, + "grad_norm": 0.3813227233788057, + "learning_rate": 9.519696712111735e-05, + "loss": 3.1105, + "step": 14601 + }, + { + "epoch": 0.6798426333310055, + "grad_norm": 0.38007760910291805, + "learning_rate": 9.519580863465227e-05, + "loss": 3.0647, + "step": 14602 + }, + { + "epoch": 0.6798891915170985, + "grad_norm": 0.35137624144684415, + "learning_rate": 9.519465001554193e-05, + "loss": 3.0074, + "step": 14603 + }, + { + "epoch": 0.6799357497031916, + "grad_norm": 0.36368336936314705, + "learning_rate": 9.519349126378976e-05, + "loss": 2.9898, + "step": 14604 + }, + { + "epoch": 0.6799823078892846, + "grad_norm": 0.4039590487895467, + "learning_rate": 9.519233237939914e-05, + "loss": 3.0766, + "step": 14605 + }, + { + "epoch": 0.6800288660753777, + "grad_norm": 0.357890604583778, + "learning_rate": 9.519117336237349e-05, + "loss": 3.058, + "step": 14606 + }, + { + "epoch": 0.6800754242614707, + "grad_norm": 0.38971564506083906, + "learning_rate": 9.519001421271617e-05, + "loss": 3.1205, + "step": 14607 + }, + { + "epoch": 0.6801219824475638, + "grad_norm": 0.3250869641245239, + "learning_rate": 9.518885493043063e-05, + "loss": 3.0536, + "step": 14608 + }, + { + "epoch": 0.680168540633657, + "grad_norm": 0.36086346068049985, + "learning_rate": 9.518769551552025e-05, + "loss": 2.9917, + "step": 14609 + }, + { + "epoch": 0.68021509881975, + "grad_norm": 0.3474967092426691, + "learning_rate": 9.518653596798844e-05, + "loss": 3.0202, + "step": 14610 + }, + { + "epoch": 0.6802616570058431, + "grad_norm": 0.33538353476354876, + "learning_rate": 9.51853762878386e-05, + "loss": 2.9876, + "step": 14611 + }, + { + "epoch": 0.6803082151919361, + "grad_norm": 0.37549553796807067, + "learning_rate": 9.51842164750741e-05, + "loss": 2.9579, + "step": 14612 + }, + { + "epoch": 0.6803547733780292, + "grad_norm": 0.34567807783968696, + "learning_rate": 9.518305652969842e-05, + "loss": 2.9727, + "step": 14613 + }, + { + "epoch": 0.6804013315641223, + "grad_norm": 0.3642470121335656, + "learning_rate": 9.518189645171488e-05, + "loss": 2.9674, + "step": 14614 + }, + { + "epoch": 0.6804478897502153, + "grad_norm": 0.368570888348468, + 
"learning_rate": 9.518073624112695e-05, + "loss": 3.0872, + "step": 14615 + }, + { + "epoch": 0.6804944479363084, + "grad_norm": 0.3509381653108775, + "learning_rate": 9.5179575897938e-05, + "loss": 3.0628, + "step": 14616 + }, + { + "epoch": 0.6805410061224014, + "grad_norm": 0.3726885561860087, + "learning_rate": 9.517841542215146e-05, + "loss": 3.0471, + "step": 14617 + }, + { + "epoch": 0.6805875643084945, + "grad_norm": 0.3759253201309656, + "learning_rate": 9.517725481377071e-05, + "loss": 3.0169, + "step": 14618 + }, + { + "epoch": 0.6806341224945877, + "grad_norm": 0.37550729714771913, + "learning_rate": 9.517609407279918e-05, + "loss": 3.0051, + "step": 14619 + }, + { + "epoch": 0.6806806806806807, + "grad_norm": 0.3741707076163693, + "learning_rate": 9.517493319924025e-05, + "loss": 3.0194, + "step": 14620 + }, + { + "epoch": 0.6807272388667738, + "grad_norm": 0.394808752353815, + "learning_rate": 9.517377219309734e-05, + "loss": 3.0535, + "step": 14621 + }, + { + "epoch": 0.6807737970528668, + "grad_norm": 0.3876386989685168, + "learning_rate": 9.517261105437388e-05, + "loss": 3.0108, + "step": 14622 + }, + { + "epoch": 0.6808203552389599, + "grad_norm": 0.36580090453165104, + "learning_rate": 9.517144978307323e-05, + "loss": 3.0627, + "step": 14623 + }, + { + "epoch": 0.680866913425053, + "grad_norm": 0.36192598280983435, + "learning_rate": 9.517028837919883e-05, + "loss": 3.0469, + "step": 14624 + }, + { + "epoch": 0.680913471611146, + "grad_norm": 0.4205302755702069, + "learning_rate": 9.51691268427541e-05, + "loss": 3.0266, + "step": 14625 + }, + { + "epoch": 0.6809600297972391, + "grad_norm": 0.39668903528322197, + "learning_rate": 9.516796517374242e-05, + "loss": 3.0664, + "step": 14626 + }, + { + "epoch": 0.6810065879833321, + "grad_norm": 0.3610840473790844, + "learning_rate": 9.516680337216722e-05, + "loss": 2.9907, + "step": 14627 + }, + { + "epoch": 0.6810531461694252, + "grad_norm": 0.3794065847813194, + "learning_rate": 9.51656414380319e-05, + "loss": 3.0397, + "step": 14628 + }, + { + "epoch": 0.6810997043555183, + "grad_norm": 0.3451723262405777, + "learning_rate": 9.516447937133986e-05, + "loss": 3.0751, + "step": 14629 + }, + { + "epoch": 0.6811462625416114, + "grad_norm": 0.39580914034646825, + "learning_rate": 9.516331717209453e-05, + "loss": 3.0364, + "step": 14630 + }, + { + "epoch": 0.6811928207277045, + "grad_norm": 0.35893760894176174, + "learning_rate": 9.516215484029932e-05, + "loss": 2.8445, + "step": 14631 + }, + { + "epoch": 0.6812393789137975, + "grad_norm": 0.39400093141698755, + "learning_rate": 9.516099237595763e-05, + "loss": 3.0074, + "step": 14632 + }, + { + "epoch": 0.6812859370998906, + "grad_norm": 0.34083086666628126, + "learning_rate": 9.515982977907287e-05, + "loss": 2.9651, + "step": 14633 + }, + { + "epoch": 0.6813324952859836, + "grad_norm": 0.40430467080944216, + "learning_rate": 9.515866704964847e-05, + "loss": 2.9837, + "step": 14634 + }, + { + "epoch": 0.6813790534720767, + "grad_norm": 0.348558744647239, + "learning_rate": 9.515750418768783e-05, + "loss": 2.9409, + "step": 14635 + }, + { + "epoch": 0.6814256116581698, + "grad_norm": 0.3716401794354455, + "learning_rate": 9.515634119319437e-05, + "loss": 3.0154, + "step": 14636 + }, + { + "epoch": 0.6814721698442628, + "grad_norm": 0.40333733919955655, + "learning_rate": 9.515517806617147e-05, + "loss": 2.9605, + "step": 14637 + }, + { + "epoch": 0.681518728030356, + "grad_norm": 0.35729328031543717, + "learning_rate": 9.51540148066226e-05, + "loss": 2.9645, + "step": 14638 + }, + 
{ + "epoch": 0.681565286216449, + "grad_norm": 0.3740155066578124, + "learning_rate": 9.515285141455113e-05, + "loss": 3.0703, + "step": 14639 + }, + { + "epoch": 0.6816118444025421, + "grad_norm": 0.37617631570404736, + "learning_rate": 9.51516878899605e-05, + "loss": 3.0487, + "step": 14640 + }, + { + "epoch": 0.6816584025886352, + "grad_norm": 0.3304744498300262, + "learning_rate": 9.515052423285409e-05, + "loss": 2.9799, + "step": 14641 + }, + { + "epoch": 0.6817049607747282, + "grad_norm": 0.3850988124807631, + "learning_rate": 9.514936044323536e-05, + "loss": 2.994, + "step": 14642 + }, + { + "epoch": 0.6817515189608213, + "grad_norm": 0.33366383115432, + "learning_rate": 9.51481965211077e-05, + "loss": 3.0281, + "step": 14643 + }, + { + "epoch": 0.6817980771469143, + "grad_norm": 0.3682037897927408, + "learning_rate": 9.514703246647454e-05, + "loss": 3.0113, + "step": 14644 + }, + { + "epoch": 0.6818446353330074, + "grad_norm": 0.35663495956003893, + "learning_rate": 9.514586827933927e-05, + "loss": 3.0783, + "step": 14645 + }, + { + "epoch": 0.6818911935191005, + "grad_norm": 0.37512483418527265, + "learning_rate": 9.514470395970534e-05, + "loss": 3.0152, + "step": 14646 + }, + { + "epoch": 0.6819377517051936, + "grad_norm": 0.41441402901752505, + "learning_rate": 9.514353950757615e-05, + "loss": 2.9852, + "step": 14647 + }, + { + "epoch": 0.6819843098912867, + "grad_norm": 0.34307161034292194, + "learning_rate": 9.51423749229551e-05, + "loss": 2.9734, + "step": 14648 + }, + { + "epoch": 0.6820308680773797, + "grad_norm": 0.412192710648986, + "learning_rate": 9.514121020584565e-05, + "loss": 3.0086, + "step": 14649 + }, + { + "epoch": 0.6820774262634728, + "grad_norm": 0.3358071059322835, + "learning_rate": 9.514004535625119e-05, + "loss": 3.1529, + "step": 14650 + }, + { + "epoch": 0.6821239844495658, + "grad_norm": 0.38024513442746294, + "learning_rate": 9.513888037417513e-05, + "loss": 3.0355, + "step": 14651 + }, + { + "epoch": 0.6821705426356589, + "grad_norm": 0.3419319999366554, + "learning_rate": 9.513771525962092e-05, + "loss": 2.926, + "step": 14652 + }, + { + "epoch": 0.682217100821752, + "grad_norm": 0.36650362915464896, + "learning_rate": 9.513655001259195e-05, + "loss": 2.9874, + "step": 14653 + }, + { + "epoch": 0.682263659007845, + "grad_norm": 0.3499674763859589, + "learning_rate": 9.513538463309166e-05, + "loss": 2.9439, + "step": 14654 + }, + { + "epoch": 0.6823102171939381, + "grad_norm": 0.358547546001024, + "learning_rate": 9.513421912112345e-05, + "loss": 3.0565, + "step": 14655 + }, + { + "epoch": 0.6823567753800311, + "grad_norm": 0.35571109470780127, + "learning_rate": 9.513305347669077e-05, + "loss": 3.0423, + "step": 14656 + }, + { + "epoch": 0.6824033335661243, + "grad_norm": 0.3806468900338201, + "learning_rate": 9.513188769979702e-05, + "loss": 2.9908, + "step": 14657 + }, + { + "epoch": 0.6824498917522174, + "grad_norm": 0.3855059521019687, + "learning_rate": 9.513072179044563e-05, + "loss": 2.946, + "step": 14658 + }, + { + "epoch": 0.6824964499383104, + "grad_norm": 0.3692721791988127, + "learning_rate": 9.512955574864001e-05, + "loss": 3.1281, + "step": 14659 + }, + { + "epoch": 0.6825430081244035, + "grad_norm": 0.37919628801267996, + "learning_rate": 9.512838957438359e-05, + "loss": 3.0997, + "step": 14660 + }, + { + "epoch": 0.6825895663104965, + "grad_norm": 0.31248219974471986, + "learning_rate": 9.51272232676798e-05, + "loss": 3.0079, + "step": 14661 + }, + { + "epoch": 0.6826361244965896, + "grad_norm": 0.36062596300055155, + 
"learning_rate": 9.512605682853204e-05, + "loss": 3.0262, + "step": 14662 + }, + { + "epoch": 0.6826826826826827, + "grad_norm": 0.29274528893321417, + "learning_rate": 9.512489025694377e-05, + "loss": 3.0009, + "step": 14663 + }, + { + "epoch": 0.6827292408687757, + "grad_norm": 0.3564529289410679, + "learning_rate": 9.512372355291838e-05, + "loss": 3.1491, + "step": 14664 + }, + { + "epoch": 0.6827757990548688, + "grad_norm": 0.37147927347498816, + "learning_rate": 9.512255671645931e-05, + "loss": 3.0064, + "step": 14665 + }, + { + "epoch": 0.6828223572409619, + "grad_norm": 0.30098398977631863, + "learning_rate": 9.512138974756996e-05, + "loss": 2.973, + "step": 14666 + }, + { + "epoch": 0.682868915427055, + "grad_norm": 0.3838118553056206, + "learning_rate": 9.512022264625381e-05, + "loss": 3.0742, + "step": 14667 + }, + { + "epoch": 0.6829154736131481, + "grad_norm": 0.396471383708373, + "learning_rate": 9.511905541251424e-05, + "loss": 3.067, + "step": 14668 + }, + { + "epoch": 0.6829620317992411, + "grad_norm": 0.36186071366092576, + "learning_rate": 9.511788804635469e-05, + "loss": 3.1469, + "step": 14669 + }, + { + "epoch": 0.6830085899853342, + "grad_norm": 0.3717971617810816, + "learning_rate": 9.511672054777858e-05, + "loss": 2.9288, + "step": 14670 + }, + { + "epoch": 0.6830551481714272, + "grad_norm": 0.39152163660434697, + "learning_rate": 9.511555291678933e-05, + "loss": 2.9328, + "step": 14671 + }, + { + "epoch": 0.6831017063575203, + "grad_norm": 0.3561405043004303, + "learning_rate": 9.511438515339038e-05, + "loss": 2.9413, + "step": 14672 + }, + { + "epoch": 0.6831482645436133, + "grad_norm": 0.3682437848197161, + "learning_rate": 9.511321725758517e-05, + "loss": 3.0212, + "step": 14673 + }, + { + "epoch": 0.6831948227297064, + "grad_norm": 0.3379626874399936, + "learning_rate": 9.511204922937709e-05, + "loss": 3.0301, + "step": 14674 + }, + { + "epoch": 0.6832413809157996, + "grad_norm": 0.36324962887500806, + "learning_rate": 9.51108810687696e-05, + "loss": 3.0693, + "step": 14675 + }, + { + "epoch": 0.6832879391018926, + "grad_norm": 0.3744371861462907, + "learning_rate": 9.510971277576613e-05, + "loss": 3.1237, + "step": 14676 + }, + { + "epoch": 0.6833344972879857, + "grad_norm": 0.3334422294422635, + "learning_rate": 9.510854435037008e-05, + "loss": 3.0227, + "step": 14677 + }, + { + "epoch": 0.6833810554740787, + "grad_norm": 0.34692382313787407, + "learning_rate": 9.510737579258491e-05, + "loss": 3.0258, + "step": 14678 + }, + { + "epoch": 0.6834276136601718, + "grad_norm": 0.3282823444108931, + "learning_rate": 9.510620710241403e-05, + "loss": 3.0213, + "step": 14679 + }, + { + "epoch": 0.6834741718462649, + "grad_norm": 0.3568136133106188, + "learning_rate": 9.510503827986085e-05, + "loss": 2.9966, + "step": 14680 + }, + { + "epoch": 0.6835207300323579, + "grad_norm": 0.32765164013318293, + "learning_rate": 9.510386932492886e-05, + "loss": 3.0498, + "step": 14681 + }, + { + "epoch": 0.683567288218451, + "grad_norm": 0.3452472615294382, + "learning_rate": 9.510270023762144e-05, + "loss": 3.0187, + "step": 14682 + }, + { + "epoch": 0.683613846404544, + "grad_norm": 0.3426575554867593, + "learning_rate": 9.510153101794206e-05, + "loss": 3.0332, + "step": 14683 + }, + { + "epoch": 0.6836604045906371, + "grad_norm": 0.33236998030431364, + "learning_rate": 9.510036166589411e-05, + "loss": 3.1106, + "step": 14684 + }, + { + "epoch": 0.6837069627767303, + "grad_norm": 0.366225460395106, + "learning_rate": 9.509919218148105e-05, + "loss": 3.0188, + "step": 14685 + }, + 
{ + "epoch": 0.6837535209628233, + "grad_norm": 0.34129085371887363, + "learning_rate": 9.50980225647063e-05, + "loss": 3.0644, + "step": 14686 + }, + { + "epoch": 0.6838000791489164, + "grad_norm": 0.3564946562029493, + "learning_rate": 9.509685281557329e-05, + "loss": 3.0005, + "step": 14687 + }, + { + "epoch": 0.6838466373350094, + "grad_norm": 0.34718782138967846, + "learning_rate": 9.509568293408546e-05, + "loss": 3.0218, + "step": 14688 + }, + { + "epoch": 0.6838931955211025, + "grad_norm": 0.3413373205186584, + "learning_rate": 9.509451292024626e-05, + "loss": 2.9896, + "step": 14689 + }, + { + "epoch": 0.6839397537071956, + "grad_norm": 0.3614446539435548, + "learning_rate": 9.50933427740591e-05, + "loss": 3.0361, + "step": 14690 + }, + { + "epoch": 0.6839863118932886, + "grad_norm": 0.3464752996695067, + "learning_rate": 9.509217249552741e-05, + "loss": 3.0936, + "step": 14691 + }, + { + "epoch": 0.6840328700793817, + "grad_norm": 0.352734158337656, + "learning_rate": 9.509100208465465e-05, + "loss": 3.027, + "step": 14692 + }, + { + "epoch": 0.6840794282654747, + "grad_norm": 0.3218981306827866, + "learning_rate": 9.508983154144425e-05, + "loss": 3.0145, + "step": 14693 + }, + { + "epoch": 0.6841259864515679, + "grad_norm": 0.4539036633396155, + "learning_rate": 9.50886608658996e-05, + "loss": 3.1098, + "step": 14694 + }, + { + "epoch": 0.6841725446376609, + "grad_norm": 0.43214376406585064, + "learning_rate": 9.50874900580242e-05, + "loss": 3.0749, + "step": 14695 + }, + { + "epoch": 0.684219102823754, + "grad_norm": 0.33991467733876884, + "learning_rate": 9.508631911782146e-05, + "loss": 3.0108, + "step": 14696 + }, + { + "epoch": 0.6842656610098471, + "grad_norm": 0.4124167383800242, + "learning_rate": 9.50851480452948e-05, + "loss": 3.003, + "step": 14697 + }, + { + "epoch": 0.6843122191959401, + "grad_norm": 0.41334230056158305, + "learning_rate": 9.508397684044768e-05, + "loss": 3.0816, + "step": 14698 + }, + { + "epoch": 0.6843587773820332, + "grad_norm": 0.34384576541323353, + "learning_rate": 9.508280550328354e-05, + "loss": 3.1261, + "step": 14699 + }, + { + "epoch": 0.6844053355681262, + "grad_norm": 0.4693727950180143, + "learning_rate": 9.508163403380578e-05, + "loss": 2.9589, + "step": 14700 + }, + { + "epoch": 0.6844518937542193, + "grad_norm": 0.36910797057312394, + "learning_rate": 9.50804624320179e-05, + "loss": 3.0064, + "step": 14701 + }, + { + "epoch": 0.6844984519403124, + "grad_norm": 0.3739162116197133, + "learning_rate": 9.507929069792327e-05, + "loss": 3.0388, + "step": 14702 + }, + { + "epoch": 0.6845450101264055, + "grad_norm": 0.3961450140804995, + "learning_rate": 9.507811883152539e-05, + "loss": 2.9705, + "step": 14703 + }, + { + "epoch": 0.6845915683124986, + "grad_norm": 0.37804733776698807, + "learning_rate": 9.507694683282766e-05, + "loss": 3.0907, + "step": 14704 + }, + { + "epoch": 0.6846381264985916, + "grad_norm": 0.3824878491627865, + "learning_rate": 9.507577470183353e-05, + "loss": 3.0185, + "step": 14705 + }, + { + "epoch": 0.6846846846846847, + "grad_norm": 0.35333110416347613, + "learning_rate": 9.507460243854644e-05, + "loss": 3.0225, + "step": 14706 + }, + { + "epoch": 0.6847312428707778, + "grad_norm": 0.430437683357448, + "learning_rate": 9.507343004296985e-05, + "loss": 3.0205, + "step": 14707 + }, + { + "epoch": 0.6847778010568708, + "grad_norm": 0.38427897677079537, + "learning_rate": 9.507225751510717e-05, + "loss": 3.0282, + "step": 14708 + }, + { + "epoch": 0.6848243592429639, + "grad_norm": 0.362765930686864, + 
"learning_rate": 9.507108485496185e-05, + "loss": 2.9232, + "step": 14709 + }, + { + "epoch": 0.6848709174290569, + "grad_norm": 0.3727881437955827, + "learning_rate": 9.506991206253735e-05, + "loss": 3.0075, + "step": 14710 + }, + { + "epoch": 0.68491747561515, + "grad_norm": 0.37981659433222836, + "learning_rate": 9.506873913783708e-05, + "loss": 3.0641, + "step": 14711 + }, + { + "epoch": 0.6849640338012432, + "grad_norm": 0.35214942223475115, + "learning_rate": 9.50675660808645e-05, + "loss": 3.0401, + "step": 14712 + }, + { + "epoch": 0.6850105919873362, + "grad_norm": 0.37319124655052055, + "learning_rate": 9.506639289162309e-05, + "loss": 3.0263, + "step": 14713 + }, + { + "epoch": 0.6850571501734293, + "grad_norm": 0.390607114632229, + "learning_rate": 9.506521957011622e-05, + "loss": 3.0366, + "step": 14714 + }, + { + "epoch": 0.6851037083595223, + "grad_norm": 0.37418097540828726, + "learning_rate": 9.506404611634738e-05, + "loss": 2.964, + "step": 14715 + }, + { + "epoch": 0.6851502665456154, + "grad_norm": 0.3827319554020168, + "learning_rate": 9.506287253032e-05, + "loss": 3.0244, + "step": 14716 + }, + { + "epoch": 0.6851968247317084, + "grad_norm": 0.39356596133496086, + "learning_rate": 9.506169881203754e-05, + "loss": 3.013, + "step": 14717 + }, + { + "epoch": 0.6852433829178015, + "grad_norm": 0.3587946952496884, + "learning_rate": 9.506052496150341e-05, + "loss": 3.1193, + "step": 14718 + }, + { + "epoch": 0.6852899411038946, + "grad_norm": 0.41242589677408514, + "learning_rate": 9.505935097872112e-05, + "loss": 2.8839, + "step": 14719 + }, + { + "epoch": 0.6853364992899876, + "grad_norm": 0.3788050213271814, + "learning_rate": 9.505817686369404e-05, + "loss": 3.1082, + "step": 14720 + }, + { + "epoch": 0.6853830574760807, + "grad_norm": 0.33091673724042614, + "learning_rate": 9.505700261642567e-05, + "loss": 3.0343, + "step": 14721 + }, + { + "epoch": 0.6854296156621738, + "grad_norm": 0.3529628203430213, + "learning_rate": 9.505582823691941e-05, + "loss": 3.0249, + "step": 14722 + }, + { + "epoch": 0.6854761738482669, + "grad_norm": 0.3342508452114319, + "learning_rate": 9.505465372517875e-05, + "loss": 2.9333, + "step": 14723 + }, + { + "epoch": 0.68552273203436, + "grad_norm": 0.35733355745394074, + "learning_rate": 9.505347908120712e-05, + "loss": 3.0392, + "step": 14724 + }, + { + "epoch": 0.685569290220453, + "grad_norm": 0.37722322318610274, + "learning_rate": 9.505230430500796e-05, + "loss": 3.0494, + "step": 14725 + }, + { + "epoch": 0.6856158484065461, + "grad_norm": 0.3638990268117165, + "learning_rate": 9.505112939658474e-05, + "loss": 3.039, + "step": 14726 + }, + { + "epoch": 0.6856624065926391, + "grad_norm": 0.3489611666107676, + "learning_rate": 9.504995435594089e-05, + "loss": 2.8403, + "step": 14727 + }, + { + "epoch": 0.6857089647787322, + "grad_norm": 0.3255184372807715, + "learning_rate": 9.504877918307985e-05, + "loss": 3.0605, + "step": 14728 + }, + { + "epoch": 0.6857555229648253, + "grad_norm": 0.38539580570248666, + "learning_rate": 9.504760387800508e-05, + "loss": 3.0648, + "step": 14729 + }, + { + "epoch": 0.6858020811509183, + "grad_norm": 0.3299183068098466, + "learning_rate": 9.504642844072005e-05, + "loss": 3.132, + "step": 14730 + }, + { + "epoch": 0.6858486393370115, + "grad_norm": 0.36593026183929533, + "learning_rate": 9.504525287122816e-05, + "loss": 3.0158, + "step": 14731 + }, + { + "epoch": 0.6858951975231045, + "grad_norm": 0.35430359646370185, + "learning_rate": 9.504407716953292e-05, + "loss": 2.983, + "step": 14732 + }, + { + 
"epoch": 0.6859417557091976, + "grad_norm": 0.3730285147707343, + "learning_rate": 9.504290133563773e-05, + "loss": 2.9667, + "step": 14733 + }, + { + "epoch": 0.6859883138952907, + "grad_norm": 0.36937749798346275, + "learning_rate": 9.504172536954609e-05, + "loss": 3.0689, + "step": 14734 + }, + { + "epoch": 0.6860348720813837, + "grad_norm": 0.35093468221406854, + "learning_rate": 9.50405492712614e-05, + "loss": 3.0116, + "step": 14735 + }, + { + "epoch": 0.6860814302674768, + "grad_norm": 0.3735925867911329, + "learning_rate": 9.503937304078714e-05, + "loss": 3.0266, + "step": 14736 + }, + { + "epoch": 0.6861279884535698, + "grad_norm": 0.33984159129931657, + "learning_rate": 9.503819667812675e-05, + "loss": 3.002, + "step": 14737 + }, + { + "epoch": 0.6861745466396629, + "grad_norm": 0.36416606056803336, + "learning_rate": 9.50370201832837e-05, + "loss": 2.9692, + "step": 14738 + }, + { + "epoch": 0.6862211048257559, + "grad_norm": 0.3366023566482218, + "learning_rate": 9.503584355626142e-05, + "loss": 3.0699, + "step": 14739 + }, + { + "epoch": 0.686267663011849, + "grad_norm": 0.3663972371169394, + "learning_rate": 9.50346667970634e-05, + "loss": 2.9946, + "step": 14740 + }, + { + "epoch": 0.6863142211979422, + "grad_norm": 0.39530499191977053, + "learning_rate": 9.503348990569305e-05, + "loss": 2.9195, + "step": 14741 + }, + { + "epoch": 0.6863607793840352, + "grad_norm": 0.40733627022533214, + "learning_rate": 9.503231288215387e-05, + "loss": 3.002, + "step": 14742 + }, + { + "epoch": 0.6864073375701283, + "grad_norm": 0.35418746879582175, + "learning_rate": 9.503113572644926e-05, + "loss": 2.9941, + "step": 14743 + }, + { + "epoch": 0.6864538957562213, + "grad_norm": 0.3792509213828204, + "learning_rate": 9.50299584385827e-05, + "loss": 2.9848, + "step": 14744 + }, + { + "epoch": 0.6865004539423144, + "grad_norm": 0.4117188303973107, + "learning_rate": 9.502878101855766e-05, + "loss": 3.0811, + "step": 14745 + }, + { + "epoch": 0.6865470121284075, + "grad_norm": 0.3781896568123376, + "learning_rate": 9.502760346637757e-05, + "loss": 3.0691, + "step": 14746 + }, + { + "epoch": 0.6865935703145005, + "grad_norm": 0.3567236780427419, + "learning_rate": 9.502642578204593e-05, + "loss": 2.905, + "step": 14747 + }, + { + "epoch": 0.6866401285005936, + "grad_norm": 0.3964986325513267, + "learning_rate": 9.502524796556615e-05, + "loss": 3.0035, + "step": 14748 + }, + { + "epoch": 0.6866866866866866, + "grad_norm": 0.3842652329627098, + "learning_rate": 9.50240700169417e-05, + "loss": 2.9756, + "step": 14749 + }, + { + "epoch": 0.6867332448727798, + "grad_norm": 0.39034893930429837, + "learning_rate": 9.502289193617604e-05, + "loss": 3.1278, + "step": 14750 + }, + { + "epoch": 0.6867798030588729, + "grad_norm": 0.3777436049846758, + "learning_rate": 9.502171372327262e-05, + "loss": 3.0693, + "step": 14751 + }, + { + "epoch": 0.6868263612449659, + "grad_norm": 0.34886937803842033, + "learning_rate": 9.502053537823492e-05, + "loss": 2.9038, + "step": 14752 + }, + { + "epoch": 0.686872919431059, + "grad_norm": 0.36664840212404687, + "learning_rate": 9.501935690106638e-05, + "loss": 3.0254, + "step": 14753 + }, + { + "epoch": 0.686919477617152, + "grad_norm": 0.34023503553360396, + "learning_rate": 9.501817829177047e-05, + "loss": 3.0201, + "step": 14754 + }, + { + "epoch": 0.6869660358032451, + "grad_norm": 0.41483240650642106, + "learning_rate": 9.501699955035062e-05, + "loss": 3.0237, + "step": 14755 + }, + { + "epoch": 0.6870125939893382, + "grad_norm": 0.3647684098573311, + 
"learning_rate": 9.501582067681033e-05, + "loss": 3.0288, + "step": 14756 + }, + { + "epoch": 0.6870591521754312, + "grad_norm": 0.37380964514566034, + "learning_rate": 9.501464167115304e-05, + "loss": 3.0126, + "step": 14757 + }, + { + "epoch": 0.6871057103615243, + "grad_norm": 0.3671144264065898, + "learning_rate": 9.501346253338221e-05, + "loss": 3.0413, + "step": 14758 + }, + { + "epoch": 0.6871522685476174, + "grad_norm": 0.4056594734978148, + "learning_rate": 9.50122832635013e-05, + "loss": 3.0277, + "step": 14759 + }, + { + "epoch": 0.6871988267337105, + "grad_norm": 0.37362903816369314, + "learning_rate": 9.501110386151377e-05, + "loss": 2.9847, + "step": 14760 + }, + { + "epoch": 0.6872453849198035, + "grad_norm": 0.33374116564157513, + "learning_rate": 9.500992432742309e-05, + "loss": 2.9901, + "step": 14761 + }, + { + "epoch": 0.6872919431058966, + "grad_norm": 0.38572206622055705, + "learning_rate": 9.500874466123271e-05, + "loss": 2.9594, + "step": 14762 + }, + { + "epoch": 0.6873385012919897, + "grad_norm": 0.35639606772646903, + "learning_rate": 9.500756486294608e-05, + "loss": 2.9621, + "step": 14763 + }, + { + "epoch": 0.6873850594780827, + "grad_norm": 0.36942196992906856, + "learning_rate": 9.50063849325667e-05, + "loss": 2.9994, + "step": 14764 + }, + { + "epoch": 0.6874316176641758, + "grad_norm": 0.39341261292743884, + "learning_rate": 9.500520487009803e-05, + "loss": 3.0232, + "step": 14765 + }, + { + "epoch": 0.6874781758502688, + "grad_norm": 0.4296313113946305, + "learning_rate": 9.50040246755435e-05, + "loss": 2.9773, + "step": 14766 + }, + { + "epoch": 0.6875247340363619, + "grad_norm": 0.37329350420149177, + "learning_rate": 9.500284434890661e-05, + "loss": 2.9469, + "step": 14767 + }, + { + "epoch": 0.687571292222455, + "grad_norm": 0.38343892243608624, + "learning_rate": 9.500166389019078e-05, + "loss": 3.0762, + "step": 14768 + }, + { + "epoch": 0.6876178504085481, + "grad_norm": 0.3966531164181454, + "learning_rate": 9.500048329939952e-05, + "loss": 3.0009, + "step": 14769 + }, + { + "epoch": 0.6876644085946412, + "grad_norm": 0.346294092641954, + "learning_rate": 9.499930257653626e-05, + "loss": 2.976, + "step": 14770 + }, + { + "epoch": 0.6877109667807342, + "grad_norm": 0.40485810053850685, + "learning_rate": 9.499812172160448e-05, + "loss": 3.0962, + "step": 14771 + }, + { + "epoch": 0.6877575249668273, + "grad_norm": 0.37473322655915314, + "learning_rate": 9.499694073460765e-05, + "loss": 3.07, + "step": 14772 + }, + { + "epoch": 0.6878040831529204, + "grad_norm": 0.37943250084013386, + "learning_rate": 9.499575961554923e-05, + "loss": 3.0575, + "step": 14773 + }, + { + "epoch": 0.6878506413390134, + "grad_norm": 0.42534541033701223, + "learning_rate": 9.49945783644327e-05, + "loss": 3.007, + "step": 14774 + }, + { + "epoch": 0.6878971995251065, + "grad_norm": 0.35076332397057414, + "learning_rate": 9.49933969812615e-05, + "loss": 2.9686, + "step": 14775 + }, + { + "epoch": 0.6879437577111995, + "grad_norm": 0.39336787127664535, + "learning_rate": 9.499221546603911e-05, + "loss": 2.8972, + "step": 14776 + }, + { + "epoch": 0.6879903158972926, + "grad_norm": 0.38376619140030704, + "learning_rate": 9.499103381876902e-05, + "loss": 3.0069, + "step": 14777 + }, + { + "epoch": 0.6880368740833858, + "grad_norm": 0.36128459434710675, + "learning_rate": 9.498985203945468e-05, + "loss": 3.0416, + "step": 14778 + }, + { + "epoch": 0.6880834322694788, + "grad_norm": 0.3921920410131599, + "learning_rate": 9.498867012809953e-05, + "loss": 2.9578, + "step": 14779 + 
}, + { + "epoch": 0.6881299904555719, + "grad_norm": 0.3569698381922445, + "learning_rate": 9.498748808470708e-05, + "loss": 2.9676, + "step": 14780 + }, + { + "epoch": 0.6881765486416649, + "grad_norm": 0.3863441837509538, + "learning_rate": 9.498630590928078e-05, + "loss": 3.078, + "step": 14781 + }, + { + "epoch": 0.688223106827758, + "grad_norm": 0.34783447214489344, + "learning_rate": 9.498512360182411e-05, + "loss": 2.9591, + "step": 14782 + }, + { + "epoch": 0.688269665013851, + "grad_norm": 0.37718869402210653, + "learning_rate": 9.498394116234053e-05, + "loss": 3.0165, + "step": 14783 + }, + { + "epoch": 0.6883162231999441, + "grad_norm": 0.40447857356027217, + "learning_rate": 9.498275859083352e-05, + "loss": 2.9919, + "step": 14784 + }, + { + "epoch": 0.6883627813860372, + "grad_norm": 0.3904412625938676, + "learning_rate": 9.498157588730654e-05, + "loss": 3.033, + "step": 14785 + }, + { + "epoch": 0.6884093395721302, + "grad_norm": 0.35042757553708676, + "learning_rate": 9.498039305176307e-05, + "loss": 2.95, + "step": 14786 + }, + { + "epoch": 0.6884558977582234, + "grad_norm": 0.3967269066398949, + "learning_rate": 9.497921008420655e-05, + "loss": 3.0535, + "step": 14787 + }, + { + "epoch": 0.6885024559443164, + "grad_norm": 0.3844889357212738, + "learning_rate": 9.497802698464053e-05, + "loss": 3.0292, + "step": 14788 + }, + { + "epoch": 0.6885490141304095, + "grad_norm": 0.4087088285317428, + "learning_rate": 9.49768437530684e-05, + "loss": 3.0694, + "step": 14789 + }, + { + "epoch": 0.6885955723165026, + "grad_norm": 0.3867134229934471, + "learning_rate": 9.497566038949366e-05, + "loss": 2.9729, + "step": 14790 + }, + { + "epoch": 0.6886421305025956, + "grad_norm": 0.34728319863447715, + "learning_rate": 9.497447689391981e-05, + "loss": 2.9596, + "step": 14791 + }, + { + "epoch": 0.6886886886886887, + "grad_norm": 0.4124005913388301, + "learning_rate": 9.497329326635029e-05, + "loss": 2.9803, + "step": 14792 + }, + { + "epoch": 0.6887352468747817, + "grad_norm": 0.3698112806989415, + "learning_rate": 9.497210950678859e-05, + "loss": 3.0967, + "step": 14793 + }, + { + "epoch": 0.6887818050608748, + "grad_norm": 0.35134784446832446, + "learning_rate": 9.497092561523818e-05, + "loss": 3.1147, + "step": 14794 + }, + { + "epoch": 0.688828363246968, + "grad_norm": 0.4351870148655538, + "learning_rate": 9.496974159170251e-05, + "loss": 3.0198, + "step": 14795 + }, + { + "epoch": 0.688874921433061, + "grad_norm": 0.3430727879375682, + "learning_rate": 9.496855743618509e-05, + "loss": 3.0764, + "step": 14796 + }, + { + "epoch": 0.6889214796191541, + "grad_norm": 0.36972594936318876, + "learning_rate": 9.49673731486894e-05, + "loss": 2.9331, + "step": 14797 + }, + { + "epoch": 0.6889680378052471, + "grad_norm": 0.333791322260626, + "learning_rate": 9.496618872921888e-05, + "loss": 3.1072, + "step": 14798 + }, + { + "epoch": 0.6890145959913402, + "grad_norm": 0.3867515334733856, + "learning_rate": 9.496500417777704e-05, + "loss": 3.0688, + "step": 14799 + }, + { + "epoch": 0.6890611541774333, + "grad_norm": 0.36598034494136683, + "learning_rate": 9.496381949436735e-05, + "loss": 2.9942, + "step": 14800 + }, + { + "epoch": 0.6891077123635263, + "grad_norm": 0.3670547177777265, + "learning_rate": 9.496263467899326e-05, + "loss": 2.9973, + "step": 14801 + }, + { + "epoch": 0.6891542705496194, + "grad_norm": 0.3676308603590175, + "learning_rate": 9.496144973165827e-05, + "loss": 3.0328, + "step": 14802 + }, + { + "epoch": 0.6892008287357124, + "grad_norm": 0.3558157918444709, + 
"learning_rate": 9.496026465236586e-05, + "loss": 3.1, + "step": 14803 + }, + { + "epoch": 0.6892473869218055, + "grad_norm": 0.3609929850941282, + "learning_rate": 9.49590794411195e-05, + "loss": 2.9903, + "step": 14804 + }, + { + "epoch": 0.6892939451078985, + "grad_norm": 0.3725056770406046, + "learning_rate": 9.495789409792267e-05, + "loss": 3.0491, + "step": 14805 + }, + { + "epoch": 0.6893405032939917, + "grad_norm": 0.39414078193384, + "learning_rate": 9.495670862277885e-05, + "loss": 2.9981, + "step": 14806 + }, + { + "epoch": 0.6893870614800848, + "grad_norm": 0.4269334306150711, + "learning_rate": 9.495552301569152e-05, + "loss": 3.0559, + "step": 14807 + }, + { + "epoch": 0.6894336196661778, + "grad_norm": 0.37124590212098457, + "learning_rate": 9.495433727666416e-05, + "loss": 3.0407, + "step": 14808 + }, + { + "epoch": 0.6894801778522709, + "grad_norm": 0.3724257626319985, + "learning_rate": 9.495315140570026e-05, + "loss": 2.958, + "step": 14809 + }, + { + "epoch": 0.6895267360383639, + "grad_norm": 0.3629810602026796, + "learning_rate": 9.495196540280326e-05, + "loss": 3.0572, + "step": 14810 + }, + { + "epoch": 0.689573294224457, + "grad_norm": 0.32994249183994645, + "learning_rate": 9.49507792679767e-05, + "loss": 3.0359, + "step": 14811 + }, + { + "epoch": 0.6896198524105501, + "grad_norm": 0.33324438005298257, + "learning_rate": 9.494959300122403e-05, + "loss": 3.052, + "step": 14812 + }, + { + "epoch": 0.6896664105966431, + "grad_norm": 0.35120555561473443, + "learning_rate": 9.494840660254871e-05, + "loss": 2.9613, + "step": 14813 + }, + { + "epoch": 0.6897129687827362, + "grad_norm": 0.3532113905725939, + "learning_rate": 9.494722007195427e-05, + "loss": 2.9963, + "step": 14814 + }, + { + "epoch": 0.6897595269688293, + "grad_norm": 0.3402596081030185, + "learning_rate": 9.494603340944415e-05, + "loss": 3.0266, + "step": 14815 + }, + { + "epoch": 0.6898060851549224, + "grad_norm": 0.3374928622763346, + "learning_rate": 9.494484661502185e-05, + "loss": 2.9893, + "step": 14816 + }, + { + "epoch": 0.6898526433410155, + "grad_norm": 0.34889403448993844, + "learning_rate": 9.494365968869086e-05, + "loss": 2.9854, + "step": 14817 + }, + { + "epoch": 0.6898992015271085, + "grad_norm": 0.3195770971636189, + "learning_rate": 9.494247263045464e-05, + "loss": 3.0356, + "step": 14818 + }, + { + "epoch": 0.6899457597132016, + "grad_norm": 0.3716884289715018, + "learning_rate": 9.494128544031671e-05, + "loss": 2.9264, + "step": 14819 + }, + { + "epoch": 0.6899923178992946, + "grad_norm": 0.3388141291622428, + "learning_rate": 9.494009811828053e-05, + "loss": 3.1082, + "step": 14820 + }, + { + "epoch": 0.6900388760853877, + "grad_norm": 0.40843121752258904, + "learning_rate": 9.493891066434958e-05, + "loss": 3.0421, + "step": 14821 + }, + { + "epoch": 0.6900854342714808, + "grad_norm": 0.34982411042273587, + "learning_rate": 9.493772307852737e-05, + "loss": 3.1011, + "step": 14822 + }, + { + "epoch": 0.6901319924575738, + "grad_norm": 0.3630789650423041, + "learning_rate": 9.493653536081736e-05, + "loss": 2.9175, + "step": 14823 + }, + { + "epoch": 0.690178550643667, + "grad_norm": 0.3786975362220679, + "learning_rate": 9.493534751122306e-05, + "loss": 3.029, + "step": 14824 + }, + { + "epoch": 0.69022510882976, + "grad_norm": 0.3545844236357902, + "learning_rate": 9.493415952974793e-05, + "loss": 3.1034, + "step": 14825 + }, + { + "epoch": 0.6902716670158531, + "grad_norm": 0.3919066432743311, + "learning_rate": 9.493297141639545e-05, + "loss": 3.0132, + "step": 14826 + }, + { + 
"epoch": 0.6903182252019461, + "grad_norm": 0.3541917895255736, + "learning_rate": 9.493178317116915e-05, + "loss": 3.0262, + "step": 14827 + }, + { + "epoch": 0.6903647833880392, + "grad_norm": 0.3440515502330811, + "learning_rate": 9.493059479407249e-05, + "loss": 3.1213, + "step": 14828 + }, + { + "epoch": 0.6904113415741323, + "grad_norm": 0.35294093312367175, + "learning_rate": 9.492940628510896e-05, + "loss": 3.0727, + "step": 14829 + }, + { + "epoch": 0.6904578997602253, + "grad_norm": 0.33490413477975395, + "learning_rate": 9.492821764428204e-05, + "loss": 2.9273, + "step": 14830 + }, + { + "epoch": 0.6905044579463184, + "grad_norm": 0.3446471296360377, + "learning_rate": 9.492702887159524e-05, + "loss": 3.0548, + "step": 14831 + }, + { + "epoch": 0.6905510161324114, + "grad_norm": 0.37431047734199613, + "learning_rate": 9.492583996705204e-05, + "loss": 3.0276, + "step": 14832 + }, + { + "epoch": 0.6905975743185045, + "grad_norm": 0.35955206043556553, + "learning_rate": 9.49246509306559e-05, + "loss": 2.9855, + "step": 14833 + }, + { + "epoch": 0.6906441325045977, + "grad_norm": 0.33871068455244807, + "learning_rate": 9.492346176241036e-05, + "loss": 3.0321, + "step": 14834 + }, + { + "epoch": 0.6906906906906907, + "grad_norm": 0.32957438914586407, + "learning_rate": 9.492227246231887e-05, + "loss": 3.0438, + "step": 14835 + }, + { + "epoch": 0.6907372488767838, + "grad_norm": 0.3491493683679623, + "learning_rate": 9.492108303038494e-05, + "loss": 3.0027, + "step": 14836 + }, + { + "epoch": 0.6907838070628768, + "grad_norm": 0.3072131203082866, + "learning_rate": 9.491989346661207e-05, + "loss": 2.9913, + "step": 14837 + }, + { + "epoch": 0.6908303652489699, + "grad_norm": 0.3396060027231449, + "learning_rate": 9.491870377100373e-05, + "loss": 3.0576, + "step": 14838 + }, + { + "epoch": 0.690876923435063, + "grad_norm": 0.3284285564294515, + "learning_rate": 9.49175139435634e-05, + "loss": 2.9735, + "step": 14839 + }, + { + "epoch": 0.690923481621156, + "grad_norm": 0.3397987029388619, + "learning_rate": 9.491632398429462e-05, + "loss": 3.0003, + "step": 14840 + }, + { + "epoch": 0.6909700398072491, + "grad_norm": 0.38613955394465266, + "learning_rate": 9.491513389320083e-05, + "loss": 3.0434, + "step": 14841 + }, + { + "epoch": 0.6910165979933421, + "grad_norm": 0.33406720065658596, + "learning_rate": 9.491394367028557e-05, + "loss": 2.9566, + "step": 14842 + }, + { + "epoch": 0.6910631561794353, + "grad_norm": 0.3509648426618125, + "learning_rate": 9.49127533155523e-05, + "loss": 2.9022, + "step": 14843 + }, + { + "epoch": 0.6911097143655284, + "grad_norm": 0.36085992942386724, + "learning_rate": 9.491156282900453e-05, + "loss": 3.0606, + "step": 14844 + }, + { + "epoch": 0.6911562725516214, + "grad_norm": 0.389945908278197, + "learning_rate": 9.491037221064574e-05, + "loss": 3.0823, + "step": 14845 + }, + { + "epoch": 0.6912028307377145, + "grad_norm": 0.35842190302601673, + "learning_rate": 9.490918146047942e-05, + "loss": 3.0089, + "step": 14846 + }, + { + "epoch": 0.6912493889238075, + "grad_norm": 0.3420929694332793, + "learning_rate": 9.49079905785091e-05, + "loss": 2.9893, + "step": 14847 + }, + { + "epoch": 0.6912959471099006, + "grad_norm": 0.34623287672180125, + "learning_rate": 9.490679956473825e-05, + "loss": 3.0045, + "step": 14848 + }, + { + "epoch": 0.6913425052959936, + "grad_norm": 0.32078716879073504, + "learning_rate": 9.490560841917035e-05, + "loss": 2.8954, + "step": 14849 + }, + { + "epoch": 0.6913890634820867, + "grad_norm": 0.3651362219084322, + 
"learning_rate": 9.490441714180893e-05, + "loss": 2.9969, + "step": 14850 + }, + { + "epoch": 0.6914356216681798, + "grad_norm": 0.4044694062303745, + "learning_rate": 9.490322573265746e-05, + "loss": 3.0085, + "step": 14851 + }, + { + "epoch": 0.6914821798542728, + "grad_norm": 0.31069372059523087, + "learning_rate": 9.490203419171944e-05, + "loss": 2.9678, + "step": 14852 + }, + { + "epoch": 0.691528738040366, + "grad_norm": 0.39771689313151987, + "learning_rate": 9.490084251899838e-05, + "loss": 2.9848, + "step": 14853 + }, + { + "epoch": 0.691575296226459, + "grad_norm": 0.3646416074424488, + "learning_rate": 9.489965071449778e-05, + "loss": 2.9608, + "step": 14854 + }, + { + "epoch": 0.6916218544125521, + "grad_norm": 0.36875529008486096, + "learning_rate": 9.489845877822111e-05, + "loss": 3.0518, + "step": 14855 + }, + { + "epoch": 0.6916684125986452, + "grad_norm": 0.37176476199032554, + "learning_rate": 9.489726671017189e-05, + "loss": 2.9847, + "step": 14856 + }, + { + "epoch": 0.6917149707847382, + "grad_norm": 0.3509327819961762, + "learning_rate": 9.489607451035362e-05, + "loss": 3.0372, + "step": 14857 + }, + { + "epoch": 0.6917615289708313, + "grad_norm": 0.384258185842359, + "learning_rate": 9.48948821787698e-05, + "loss": 3.0062, + "step": 14858 + }, + { + "epoch": 0.6918080871569243, + "grad_norm": 0.3512706293577368, + "learning_rate": 9.489368971542392e-05, + "loss": 2.9775, + "step": 14859 + }, + { + "epoch": 0.6918546453430174, + "grad_norm": 0.3711573178236654, + "learning_rate": 9.489249712031949e-05, + "loss": 3.1122, + "step": 14860 + }, + { + "epoch": 0.6919012035291106, + "grad_norm": 0.37825166934147986, + "learning_rate": 9.489130439346e-05, + "loss": 3.0078, + "step": 14861 + }, + { + "epoch": 0.6919477617152036, + "grad_norm": 0.3647399568996421, + "learning_rate": 9.489011153484895e-05, + "loss": 2.9016, + "step": 14862 + }, + { + "epoch": 0.6919943199012967, + "grad_norm": 0.3738437040468552, + "learning_rate": 9.488891854448985e-05, + "loss": 3.0124, + "step": 14863 + }, + { + "epoch": 0.6920408780873897, + "grad_norm": 0.3854728798895469, + "learning_rate": 9.48877254223862e-05, + "loss": 2.9608, + "step": 14864 + }, + { + "epoch": 0.6920874362734828, + "grad_norm": 0.3530251821390655, + "learning_rate": 9.488653216854149e-05, + "loss": 2.9901, + "step": 14865 + }, + { + "epoch": 0.6921339944595759, + "grad_norm": 0.3486967547349474, + "learning_rate": 9.488533878295922e-05, + "loss": 2.9524, + "step": 14866 + }, + { + "epoch": 0.6921805526456689, + "grad_norm": 0.334085488514699, + "learning_rate": 9.488414526564291e-05, + "loss": 2.9906, + "step": 14867 + }, + { + "epoch": 0.692227110831762, + "grad_norm": 0.34367823929821384, + "learning_rate": 9.488295161659606e-05, + "loss": 2.9463, + "step": 14868 + }, + { + "epoch": 0.692273669017855, + "grad_norm": 0.3404679969618395, + "learning_rate": 9.488175783582217e-05, + "loss": 2.9973, + "step": 14869 + }, + { + "epoch": 0.6923202272039481, + "grad_norm": 0.31409305829457596, + "learning_rate": 9.488056392332474e-05, + "loss": 2.896, + "step": 14870 + }, + { + "epoch": 0.6923667853900412, + "grad_norm": 0.3819069897323175, + "learning_rate": 9.487936987910728e-05, + "loss": 2.9624, + "step": 14871 + }, + { + "epoch": 0.6924133435761343, + "grad_norm": 0.3380327840409811, + "learning_rate": 9.487817570317328e-05, + "loss": 2.9882, + "step": 14872 + }, + { + "epoch": 0.6924599017622274, + "grad_norm": 0.3477378428755494, + "learning_rate": 9.487698139552628e-05, + "loss": 2.9333, + "step": 14873 + }, + { + 
"epoch": 0.6925064599483204, + "grad_norm": 0.32750235088555113, + "learning_rate": 9.487578695616973e-05, + "loss": 3.0261, + "step": 14874 + }, + { + "epoch": 0.6925530181344135, + "grad_norm": 0.34281933138462833, + "learning_rate": 9.487459238510718e-05, + "loss": 2.9471, + "step": 14875 + }, + { + "epoch": 0.6925995763205065, + "grad_norm": 0.3171899067227737, + "learning_rate": 9.487339768234212e-05, + "loss": 2.9588, + "step": 14876 + }, + { + "epoch": 0.6926461345065996, + "grad_norm": 0.3532059061037496, + "learning_rate": 9.487220284787807e-05, + "loss": 3.0599, + "step": 14877 + }, + { + "epoch": 0.6926926926926927, + "grad_norm": 0.33778216076409295, + "learning_rate": 9.48710078817185e-05, + "loss": 2.9364, + "step": 14878 + }, + { + "epoch": 0.6927392508787857, + "grad_norm": 0.3630571730950787, + "learning_rate": 9.486981278386697e-05, + "loss": 3.0142, + "step": 14879 + }, + { + "epoch": 0.6927858090648789, + "grad_norm": 0.34633908823093984, + "learning_rate": 9.486861755432694e-05, + "loss": 3.0303, + "step": 14880 + }, + { + "epoch": 0.6928323672509719, + "grad_norm": 0.37320504203421606, + "learning_rate": 9.486742219310196e-05, + "loss": 2.9929, + "step": 14881 + }, + { + "epoch": 0.692878925437065, + "grad_norm": 0.3982145484614047, + "learning_rate": 9.486622670019549e-05, + "loss": 3.0942, + "step": 14882 + }, + { + "epoch": 0.6929254836231581, + "grad_norm": 0.4085793329122331, + "learning_rate": 9.486503107561109e-05, + "loss": 3.1497, + "step": 14883 + }, + { + "epoch": 0.6929720418092511, + "grad_norm": 0.3635397910731723, + "learning_rate": 9.486383531935222e-05, + "loss": 3.0252, + "step": 14884 + }, + { + "epoch": 0.6930185999953442, + "grad_norm": 0.41544171397603863, + "learning_rate": 9.486263943142242e-05, + "loss": 3.056, + "step": 14885 + }, + { + "epoch": 0.6930651581814372, + "grad_norm": 0.4270251466397973, + "learning_rate": 9.48614434118252e-05, + "loss": 3.0209, + "step": 14886 + }, + { + "epoch": 0.6931117163675303, + "grad_norm": 0.3632079908826021, + "learning_rate": 9.486024726056406e-05, + "loss": 3.0257, + "step": 14887 + }, + { + "epoch": 0.6931582745536234, + "grad_norm": 0.37350435436677737, + "learning_rate": 9.485905097764251e-05, + "loss": 2.9331, + "step": 14888 + }, + { + "epoch": 0.6932048327397164, + "grad_norm": 0.3496406174491016, + "learning_rate": 9.485785456306407e-05, + "loss": 3.0326, + "step": 14889 + }, + { + "epoch": 0.6932513909258096, + "grad_norm": 0.3772219897616136, + "learning_rate": 9.485665801683224e-05, + "loss": 2.9834, + "step": 14890 + }, + { + "epoch": 0.6932979491119026, + "grad_norm": 0.31361171610468946, + "learning_rate": 9.485546133895055e-05, + "loss": 2.9885, + "step": 14891 + }, + { + "epoch": 0.6933445072979957, + "grad_norm": 0.4046615101611238, + "learning_rate": 9.485426452942249e-05, + "loss": 2.9083, + "step": 14892 + }, + { + "epoch": 0.6933910654840887, + "grad_norm": 0.3754555338503923, + "learning_rate": 9.485306758825159e-05, + "loss": 3.0395, + "step": 14893 + }, + { + "epoch": 0.6934376236701818, + "grad_norm": 0.36644684984283016, + "learning_rate": 9.485187051544135e-05, + "loss": 3.0244, + "step": 14894 + }, + { + "epoch": 0.6934841818562749, + "grad_norm": 0.34291647389790986, + "learning_rate": 9.485067331099528e-05, + "loss": 3.0011, + "step": 14895 + }, + { + "epoch": 0.6935307400423679, + "grad_norm": 0.3484924128284653, + "learning_rate": 9.484947597491691e-05, + "loss": 3.0423, + "step": 14896 + }, + { + "epoch": 0.693577298228461, + "grad_norm": 0.35435544886901016, + 
"learning_rate": 9.484827850720975e-05, + "loss": 2.9855, + "step": 14897 + }, + { + "epoch": 0.693623856414554, + "grad_norm": 0.35196341593241987, + "learning_rate": 9.48470809078773e-05, + "loss": 3.0875, + "step": 14898 + }, + { + "epoch": 0.6936704146006472, + "grad_norm": 0.32755592517497195, + "learning_rate": 9.48458831769231e-05, + "loss": 2.8655, + "step": 14899 + }, + { + "epoch": 0.6937169727867403, + "grad_norm": 0.35232012112139177, + "learning_rate": 9.484468531435064e-05, + "loss": 3.0497, + "step": 14900 + }, + { + "epoch": 0.6937635309728333, + "grad_norm": 0.331777438581853, + "learning_rate": 9.484348732016346e-05, + "loss": 2.9389, + "step": 14901 + }, + { + "epoch": 0.6938100891589264, + "grad_norm": 0.3484795696992937, + "learning_rate": 9.484228919436505e-05, + "loss": 3.06, + "step": 14902 + }, + { + "epoch": 0.6938566473450194, + "grad_norm": 0.38602720195580387, + "learning_rate": 9.484109093695894e-05, + "loss": 2.9942, + "step": 14903 + }, + { + "epoch": 0.6939032055311125, + "grad_norm": 0.3498506110486525, + "learning_rate": 9.483989254794865e-05, + "loss": 2.9826, + "step": 14904 + }, + { + "epoch": 0.6939497637172056, + "grad_norm": 0.35419810647802985, + "learning_rate": 9.483869402733769e-05, + "loss": 3.0155, + "step": 14905 + }, + { + "epoch": 0.6939963219032986, + "grad_norm": 0.3553800867230313, + "learning_rate": 9.483749537512959e-05, + "loss": 3.0698, + "step": 14906 + }, + { + "epoch": 0.6940428800893917, + "grad_norm": 0.3312685466503138, + "learning_rate": 9.483629659132784e-05, + "loss": 2.9855, + "step": 14907 + }, + { + "epoch": 0.6940894382754847, + "grad_norm": 0.3720448439649935, + "learning_rate": 9.483509767593599e-05, + "loss": 2.9508, + "step": 14908 + }, + { + "epoch": 0.6941359964615779, + "grad_norm": 0.3539390723690785, + "learning_rate": 9.483389862895755e-05, + "loss": 3.016, + "step": 14909 + }, + { + "epoch": 0.694182554647671, + "grad_norm": 0.34640962964912386, + "learning_rate": 9.483269945039603e-05, + "loss": 3.0447, + "step": 14910 + }, + { + "epoch": 0.694229112833764, + "grad_norm": 0.33738420971838345, + "learning_rate": 9.483150014025495e-05, + "loss": 2.9676, + "step": 14911 + }, + { + "epoch": 0.6942756710198571, + "grad_norm": 0.3276292445724652, + "learning_rate": 9.483030069853783e-05, + "loss": 3.0416, + "step": 14912 + }, + { + "epoch": 0.6943222292059501, + "grad_norm": 0.3465618437309314, + "learning_rate": 9.48291011252482e-05, + "loss": 3.0012, + "step": 14913 + }, + { + "epoch": 0.6943687873920432, + "grad_norm": 0.3364317952258156, + "learning_rate": 9.482790142038958e-05, + "loss": 3.0113, + "step": 14914 + }, + { + "epoch": 0.6944153455781362, + "grad_norm": 0.3469985741117729, + "learning_rate": 9.482670158396547e-05, + "loss": 3.0493, + "step": 14915 + }, + { + "epoch": 0.6944619037642293, + "grad_norm": 0.31040174620063743, + "learning_rate": 9.48255016159794e-05, + "loss": 2.9643, + "step": 14916 + }, + { + "epoch": 0.6945084619503225, + "grad_norm": 0.36549168847933894, + "learning_rate": 9.482430151643493e-05, + "loss": 2.9394, + "step": 14917 + }, + { + "epoch": 0.6945550201364155, + "grad_norm": 0.3070860632158043, + "learning_rate": 9.482310128533553e-05, + "loss": 3.0356, + "step": 14918 + }, + { + "epoch": 0.6946015783225086, + "grad_norm": 0.3638499446184162, + "learning_rate": 9.482190092268475e-05, + "loss": 3.0363, + "step": 14919 + }, + { + "epoch": 0.6946481365086016, + "grad_norm": 0.34753720169952457, + "learning_rate": 9.48207004284861e-05, + "loss": 3.015, + "step": 14920 + }, + { + 
"epoch": 0.6946946946946947, + "grad_norm": 0.3447868526872956, + "learning_rate": 9.481949980274311e-05, + "loss": 3.1107, + "step": 14921 + }, + { + "epoch": 0.6947412528807878, + "grad_norm": 0.37167502320194507, + "learning_rate": 9.48182990454593e-05, + "loss": 3.0408, + "step": 14922 + }, + { + "epoch": 0.6947878110668808, + "grad_norm": 0.3580399789788059, + "learning_rate": 9.481709815663822e-05, + "loss": 2.9459, + "step": 14923 + }, + { + "epoch": 0.6948343692529739, + "grad_norm": 0.3483126050649379, + "learning_rate": 9.481589713628336e-05, + "loss": 3.114, + "step": 14924 + }, + { + "epoch": 0.6948809274390669, + "grad_norm": 0.3513070758293593, + "learning_rate": 9.481469598439825e-05, + "loss": 3.0798, + "step": 14925 + }, + { + "epoch": 0.69492748562516, + "grad_norm": 0.3544782718473221, + "learning_rate": 9.481349470098644e-05, + "loss": 3.0295, + "step": 14926 + }, + { + "epoch": 0.6949740438112532, + "grad_norm": 0.36309212408080943, + "learning_rate": 9.481229328605141e-05, + "loss": 3.1236, + "step": 14927 + }, + { + "epoch": 0.6950206019973462, + "grad_norm": 0.33600378317356067, + "learning_rate": 9.481109173959675e-05, + "loss": 3.0802, + "step": 14928 + }, + { + "epoch": 0.6950671601834393, + "grad_norm": 0.32113080685633827, + "learning_rate": 9.480989006162592e-05, + "loss": 3.0037, + "step": 14929 + }, + { + "epoch": 0.6951137183695323, + "grad_norm": 0.3315306787776539, + "learning_rate": 9.480868825214248e-05, + "loss": 3.1109, + "step": 14930 + }, + { + "epoch": 0.6951602765556254, + "grad_norm": 0.35590938760733576, + "learning_rate": 9.480748631114998e-05, + "loss": 3.0405, + "step": 14931 + }, + { + "epoch": 0.6952068347417185, + "grad_norm": 0.34658152728527414, + "learning_rate": 9.480628423865189e-05, + "loss": 2.9928, + "step": 14932 + }, + { + "epoch": 0.6952533929278115, + "grad_norm": 0.3501611143852053, + "learning_rate": 9.480508203465179e-05, + "loss": 2.9396, + "step": 14933 + }, + { + "epoch": 0.6952999511139046, + "grad_norm": 0.3465518224128618, + "learning_rate": 9.480387969915317e-05, + "loss": 3.1266, + "step": 14934 + }, + { + "epoch": 0.6953465092999976, + "grad_norm": 0.4124589429463638, + "learning_rate": 9.480267723215958e-05, + "loss": 3.018, + "step": 14935 + }, + { + "epoch": 0.6953930674860908, + "grad_norm": 0.35760436383358246, + "learning_rate": 9.480147463367457e-05, + "loss": 2.9377, + "step": 14936 + }, + { + "epoch": 0.6954396256721838, + "grad_norm": 0.37271504521433735, + "learning_rate": 9.480027190370162e-05, + "loss": 3.0374, + "step": 14937 + }, + { + "epoch": 0.6954861838582769, + "grad_norm": 0.41637730194553524, + "learning_rate": 9.47990690422443e-05, + "loss": 2.8177, + "step": 14938 + }, + { + "epoch": 0.69553274204437, + "grad_norm": 0.40132832384895256, + "learning_rate": 9.479786604930613e-05, + "loss": 3.2283, + "step": 14939 + }, + { + "epoch": 0.695579300230463, + "grad_norm": 0.4379288136454068, + "learning_rate": 9.479666292489062e-05, + "loss": 2.9727, + "step": 14940 + }, + { + "epoch": 0.6956258584165561, + "grad_norm": 0.404227903419478, + "learning_rate": 9.479545966900132e-05, + "loss": 2.9888, + "step": 14941 + }, + { + "epoch": 0.6956724166026491, + "grad_norm": 0.3830762891857847, + "learning_rate": 9.479425628164178e-05, + "loss": 3.0351, + "step": 14942 + }, + { + "epoch": 0.6957189747887422, + "grad_norm": 0.4241851197044258, + "learning_rate": 9.479305276281549e-05, + "loss": 2.9423, + "step": 14943 + }, + { + "epoch": 0.6957655329748353, + "grad_norm": 0.3858805478102821, + 
"learning_rate": 9.479184911252601e-05, + "loss": 3.0043, + "step": 14944 + }, + { + "epoch": 0.6958120911609283, + "grad_norm": 0.3775252372224102, + "learning_rate": 9.479064533077686e-05, + "loss": 3.0172, + "step": 14945 + }, + { + "epoch": 0.6958586493470215, + "grad_norm": 0.4092361456944633, + "learning_rate": 9.478944141757158e-05, + "loss": 2.9522, + "step": 14946 + }, + { + "epoch": 0.6959052075331145, + "grad_norm": 0.3839659312853231, + "learning_rate": 9.478823737291371e-05, + "loss": 2.9975, + "step": 14947 + }, + { + "epoch": 0.6959517657192076, + "grad_norm": 0.36665823842040085, + "learning_rate": 9.478703319680677e-05, + "loss": 2.9665, + "step": 14948 + }, + { + "epoch": 0.6959983239053007, + "grad_norm": 0.36838824412043164, + "learning_rate": 9.47858288892543e-05, + "loss": 2.9672, + "step": 14949 + }, + { + "epoch": 0.6960448820913937, + "grad_norm": 0.3616946645898757, + "learning_rate": 9.478462445025983e-05, + "loss": 3.1072, + "step": 14950 + }, + { + "epoch": 0.6960914402774868, + "grad_norm": 0.3616973980480597, + "learning_rate": 9.47834198798269e-05, + "loss": 3.1474, + "step": 14951 + }, + { + "epoch": 0.6961379984635798, + "grad_norm": 0.3485244705507217, + "learning_rate": 9.478221517795904e-05, + "loss": 2.9793, + "step": 14952 + }, + { + "epoch": 0.6961845566496729, + "grad_norm": 0.3537805346416735, + "learning_rate": 9.478101034465981e-05, + "loss": 3.0493, + "step": 14953 + }, + { + "epoch": 0.696231114835766, + "grad_norm": 0.355642849913624, + "learning_rate": 9.477980537993272e-05, + "loss": 2.9419, + "step": 14954 + }, + { + "epoch": 0.6962776730218591, + "grad_norm": 0.3240109470461234, + "learning_rate": 9.47786002837813e-05, + "loss": 2.9514, + "step": 14955 + }, + { + "epoch": 0.6963242312079522, + "grad_norm": 0.3811291612038244, + "learning_rate": 9.47773950562091e-05, + "loss": 3.0474, + "step": 14956 + }, + { + "epoch": 0.6963707893940452, + "grad_norm": 0.3439151827960954, + "learning_rate": 9.477618969721966e-05, + "loss": 3.0615, + "step": 14957 + }, + { + "epoch": 0.6964173475801383, + "grad_norm": 0.328659800625249, + "learning_rate": 9.477498420681652e-05, + "loss": 2.9295, + "step": 14958 + }, + { + "epoch": 0.6964639057662313, + "grad_norm": 0.35627519722826245, + "learning_rate": 9.477377858500321e-05, + "loss": 2.998, + "step": 14959 + }, + { + "epoch": 0.6965104639523244, + "grad_norm": 0.3418043456912648, + "learning_rate": 9.477257283178328e-05, + "loss": 2.9564, + "step": 14960 + }, + { + "epoch": 0.6965570221384175, + "grad_norm": 0.3640069941827284, + "learning_rate": 9.477136694716024e-05, + "loss": 3.0228, + "step": 14961 + }, + { + "epoch": 0.6966035803245105, + "grad_norm": 0.3344867392106577, + "learning_rate": 9.477016093113767e-05, + "loss": 2.9478, + "step": 14962 + }, + { + "epoch": 0.6966501385106036, + "grad_norm": 0.3933747726434902, + "learning_rate": 9.476895478371907e-05, + "loss": 3.0165, + "step": 14963 + }, + { + "epoch": 0.6966966966966966, + "grad_norm": 0.38039419719169565, + "learning_rate": 9.476774850490802e-05, + "loss": 2.9813, + "step": 14964 + }, + { + "epoch": 0.6967432548827898, + "grad_norm": 0.3949726508959786, + "learning_rate": 9.476654209470801e-05, + "loss": 3.0288, + "step": 14965 + }, + { + "epoch": 0.6967898130688829, + "grad_norm": 0.3480679068842625, + "learning_rate": 9.476533555312263e-05, + "loss": 3.0008, + "step": 14966 + }, + { + "epoch": 0.6968363712549759, + "grad_norm": 0.3545420547348564, + "learning_rate": 9.47641288801554e-05, + "loss": 3.0825, + "step": 14967 + }, + { + 
"epoch": 0.696882929441069, + "grad_norm": 0.38930942912071803, + "learning_rate": 9.476292207580986e-05, + "loss": 3.0255, + "step": 14968 + }, + { + "epoch": 0.696929487627162, + "grad_norm": 0.3385905558200931, + "learning_rate": 9.476171514008953e-05, + "loss": 3.1733, + "step": 14969 + }, + { + "epoch": 0.6969760458132551, + "grad_norm": 0.46501973960208354, + "learning_rate": 9.476050807299801e-05, + "loss": 2.9375, + "step": 14970 + }, + { + "epoch": 0.6970226039993482, + "grad_norm": 0.35652374658912933, + "learning_rate": 9.475930087453879e-05, + "loss": 2.8975, + "step": 14971 + }, + { + "epoch": 0.6970691621854412, + "grad_norm": 0.386838834519947, + "learning_rate": 9.475809354471543e-05, + "loss": 2.9908, + "step": 14972 + }, + { + "epoch": 0.6971157203715344, + "grad_norm": 0.37507080771389195, + "learning_rate": 9.475688608353148e-05, + "loss": 2.9612, + "step": 14973 + }, + { + "epoch": 0.6971622785576274, + "grad_norm": 0.36007927511186977, + "learning_rate": 9.475567849099048e-05, + "loss": 3.0814, + "step": 14974 + }, + { + "epoch": 0.6972088367437205, + "grad_norm": 0.40624810361230473, + "learning_rate": 9.475447076709597e-05, + "loss": 3.1081, + "step": 14975 + }, + { + "epoch": 0.6972553949298136, + "grad_norm": 0.3406733986332069, + "learning_rate": 9.47532629118515e-05, + "loss": 3.0625, + "step": 14976 + }, + { + "epoch": 0.6973019531159066, + "grad_norm": 0.4103843993369065, + "learning_rate": 9.475205492526062e-05, + "loss": 3.0336, + "step": 14977 + }, + { + "epoch": 0.6973485113019997, + "grad_norm": 0.4109016488722729, + "learning_rate": 9.475084680732684e-05, + "loss": 3.0698, + "step": 14978 + }, + { + "epoch": 0.6973950694880927, + "grad_norm": 0.3456854687854729, + "learning_rate": 9.474963855805376e-05, + "loss": 2.9701, + "step": 14979 + }, + { + "epoch": 0.6974416276741858, + "grad_norm": 0.3987001230619613, + "learning_rate": 9.474843017744489e-05, + "loss": 3.0443, + "step": 14980 + }, + { + "epoch": 0.6974881858602788, + "grad_norm": 0.40443264993358224, + "learning_rate": 9.474722166550379e-05, + "loss": 3.0269, + "step": 14981 + }, + { + "epoch": 0.697534744046372, + "grad_norm": 0.3942778208222763, + "learning_rate": 9.4746013022234e-05, + "loss": 3.053, + "step": 14982 + }, + { + "epoch": 0.6975813022324651, + "grad_norm": 0.4108878836172974, + "learning_rate": 9.474480424763908e-05, + "loss": 2.9626, + "step": 14983 + }, + { + "epoch": 0.6976278604185581, + "grad_norm": 0.37539996285722077, + "learning_rate": 9.474359534172255e-05, + "loss": 3.0414, + "step": 14984 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.43240458938753384, + "learning_rate": 9.474238630448797e-05, + "loss": 3.0108, + "step": 14985 + }, + { + "epoch": 0.6977209767907442, + "grad_norm": 0.39655789742451353, + "learning_rate": 9.474117713593891e-05, + "loss": 2.9923, + "step": 14986 + }, + { + "epoch": 0.6977675349768373, + "grad_norm": 0.36783087203837145, + "learning_rate": 9.47399678360789e-05, + "loss": 3.0653, + "step": 14987 + }, + { + "epoch": 0.6978140931629304, + "grad_norm": 0.4010735947984656, + "learning_rate": 9.47387584049115e-05, + "loss": 3.1151, + "step": 14988 + }, + { + "epoch": 0.6978606513490234, + "grad_norm": 0.4229078955484748, + "learning_rate": 9.473754884244024e-05, + "loss": 2.9081, + "step": 14989 + }, + { + "epoch": 0.6979072095351165, + "grad_norm": 0.34726394995812765, + "learning_rate": 9.473633914866868e-05, + "loss": 3.0914, + "step": 14990 + }, + { + "epoch": 0.6979537677212095, + "grad_norm": 0.4015465767121956, + 
"learning_rate": 9.473512932360038e-05, + "loss": 3.0759, + "step": 14991 + }, + { + "epoch": 0.6980003259073027, + "grad_norm": 0.3661836300248985, + "learning_rate": 9.473391936723888e-05, + "loss": 3.0615, + "step": 14992 + }, + { + "epoch": 0.6980468840933958, + "grad_norm": 0.3586533939637846, + "learning_rate": 9.473270927958774e-05, + "loss": 3.0745, + "step": 14993 + }, + { + "epoch": 0.6980934422794888, + "grad_norm": 0.3587707014082159, + "learning_rate": 9.47314990606505e-05, + "loss": 3.0011, + "step": 14994 + }, + { + "epoch": 0.6981400004655819, + "grad_norm": 0.3579680078222393, + "learning_rate": 9.473028871043071e-05, + "loss": 3.0405, + "step": 14995 + }, + { + "epoch": 0.6981865586516749, + "grad_norm": 0.35876854561717336, + "learning_rate": 9.472907822893194e-05, + "loss": 3.0011, + "step": 14996 + }, + { + "epoch": 0.698233116837768, + "grad_norm": 0.38799114960853603, + "learning_rate": 9.472786761615773e-05, + "loss": 3.0107, + "step": 14997 + }, + { + "epoch": 0.6982796750238611, + "grad_norm": 0.37458671978982383, + "learning_rate": 9.472665687211162e-05, + "loss": 3.0226, + "step": 14998 + }, + { + "epoch": 0.6983262332099541, + "grad_norm": 0.41529427156745896, + "learning_rate": 9.47254459967972e-05, + "loss": 2.9797, + "step": 14999 + }, + { + "epoch": 0.6983727913960472, + "grad_norm": 0.3879322858899296, + "learning_rate": 9.472423499021798e-05, + "loss": 2.9749, + "step": 15000 + }, + { + "epoch": 0.6984193495821402, + "grad_norm": 0.3794987019069919, + "learning_rate": 9.472302385237756e-05, + "loss": 3.0723, + "step": 15001 + }, + { + "epoch": 0.6984659077682334, + "grad_norm": 0.41384903174449444, + "learning_rate": 9.472181258327946e-05, + "loss": 2.9762, + "step": 15002 + }, + { + "epoch": 0.6985124659543264, + "grad_norm": 0.41018033824239175, + "learning_rate": 9.472060118292725e-05, + "loss": 3.0444, + "step": 15003 + }, + { + "epoch": 0.6985590241404195, + "grad_norm": 0.36578088220440325, + "learning_rate": 9.471938965132447e-05, + "loss": 2.8294, + "step": 15004 + }, + { + "epoch": 0.6986055823265126, + "grad_norm": 0.3835571758128151, + "learning_rate": 9.471817798847469e-05, + "loss": 2.9797, + "step": 15005 + }, + { + "epoch": 0.6986521405126056, + "grad_norm": 0.3879319026624815, + "learning_rate": 9.471696619438147e-05, + "loss": 3.0698, + "step": 15006 + }, + { + "epoch": 0.6986986986986987, + "grad_norm": 0.39272907327516504, + "learning_rate": 9.471575426904835e-05, + "loss": 2.988, + "step": 15007 + }, + { + "epoch": 0.6987452568847917, + "grad_norm": 0.4661198326645903, + "learning_rate": 9.471454221247891e-05, + "loss": 2.9965, + "step": 15008 + }, + { + "epoch": 0.6987918150708848, + "grad_norm": 0.42635449748357734, + "learning_rate": 9.471333002467668e-05, + "loss": 2.9718, + "step": 15009 + }, + { + "epoch": 0.698838373256978, + "grad_norm": 0.40815861571235346, + "learning_rate": 9.471211770564524e-05, + "loss": 3.0426, + "step": 15010 + }, + { + "epoch": 0.698884931443071, + "grad_norm": 0.4231587673460088, + "learning_rate": 9.471090525538814e-05, + "loss": 3.0651, + "step": 15011 + }, + { + "epoch": 0.6989314896291641, + "grad_norm": 0.3776556510493172, + "learning_rate": 9.470969267390893e-05, + "loss": 3.0127, + "step": 15012 + }, + { + "epoch": 0.6989780478152571, + "grad_norm": 0.3625412422589047, + "learning_rate": 9.470847996121118e-05, + "loss": 2.9256, + "step": 15013 + }, + { + "epoch": 0.6990246060013502, + "grad_norm": 0.4007874335856372, + "learning_rate": 9.470726711729845e-05, + "loss": 3.1201, + "step": 15014 + 
}, + { + "epoch": 0.6990711641874433, + "grad_norm": 0.33053583427583666, + "learning_rate": 9.470605414217429e-05, + "loss": 2.9097, + "step": 15015 + }, + { + "epoch": 0.6991177223735363, + "grad_norm": 0.39780157763024604, + "learning_rate": 9.470484103584227e-05, + "loss": 3.0552, + "step": 15016 + }, + { + "epoch": 0.6991642805596294, + "grad_norm": 0.3805586409590542, + "learning_rate": 9.470362779830595e-05, + "loss": 3.0385, + "step": 15017 + }, + { + "epoch": 0.6992108387457224, + "grad_norm": 0.33490266632953575, + "learning_rate": 9.470241442956888e-05, + "loss": 3.1145, + "step": 15018 + }, + { + "epoch": 0.6992573969318155, + "grad_norm": 0.38171510161318106, + "learning_rate": 9.470120092963463e-05, + "loss": 3.0891, + "step": 15019 + }, + { + "epoch": 0.6993039551179087, + "grad_norm": 0.35834121498241717, + "learning_rate": 9.469998729850675e-05, + "loss": 3.0867, + "step": 15020 + }, + { + "epoch": 0.6993505133040017, + "grad_norm": 0.39765756196059054, + "learning_rate": 9.469877353618882e-05, + "loss": 3.0403, + "step": 15021 + }, + { + "epoch": 0.6993970714900948, + "grad_norm": 0.3750598507646144, + "learning_rate": 9.469755964268439e-05, + "loss": 2.9891, + "step": 15022 + }, + { + "epoch": 0.6994436296761878, + "grad_norm": 0.33888205451275266, + "learning_rate": 9.469634561799703e-05, + "loss": 3.0288, + "step": 15023 + }, + { + "epoch": 0.6994901878622809, + "grad_norm": 0.34026023001865985, + "learning_rate": 9.46951314621303e-05, + "loss": 3.0589, + "step": 15024 + }, + { + "epoch": 0.6995367460483739, + "grad_norm": 0.4071817268798133, + "learning_rate": 9.469391717508776e-05, + "loss": 3.0159, + "step": 15025 + }, + { + "epoch": 0.699583304234467, + "grad_norm": 0.32340090753821144, + "learning_rate": 9.469270275687297e-05, + "loss": 2.9846, + "step": 15026 + }, + { + "epoch": 0.6996298624205601, + "grad_norm": 0.42470965560479157, + "learning_rate": 9.46914882074895e-05, + "loss": 2.9882, + "step": 15027 + }, + { + "epoch": 0.6996764206066531, + "grad_norm": 0.4221798459762885, + "learning_rate": 9.469027352694093e-05, + "loss": 3.0795, + "step": 15028 + }, + { + "epoch": 0.6997229787927463, + "grad_norm": 0.3644064146014907, + "learning_rate": 9.46890587152308e-05, + "loss": 3.0295, + "step": 15029 + }, + { + "epoch": 0.6997695369788393, + "grad_norm": 0.43697451757839045, + "learning_rate": 9.468784377236269e-05, + "loss": 3.0413, + "step": 15030 + }, + { + "epoch": 0.6998160951649324, + "grad_norm": 0.36426082357963546, + "learning_rate": 9.468662869834015e-05, + "loss": 3.0969, + "step": 15031 + }, + { + "epoch": 0.6998626533510255, + "grad_norm": 0.43413482728228314, + "learning_rate": 9.468541349316675e-05, + "loss": 3.1717, + "step": 15032 + }, + { + "epoch": 0.6999092115371185, + "grad_norm": 0.4227418155898424, + "learning_rate": 9.468419815684608e-05, + "loss": 2.9818, + "step": 15033 + }, + { + "epoch": 0.6999557697232116, + "grad_norm": 0.3496528586558797, + "learning_rate": 9.468298268938169e-05, + "loss": 2.999, + "step": 15034 + }, + { + "epoch": 0.7000023279093046, + "grad_norm": 0.4099116456517183, + "learning_rate": 9.468176709077715e-05, + "loss": 3.0072, + "step": 15035 + }, + { + "epoch": 0.7000488860953977, + "grad_norm": 0.34120769401612827, + "learning_rate": 9.468055136103602e-05, + "loss": 2.9012, + "step": 15036 + }, + { + "epoch": 0.7000954442814908, + "grad_norm": 0.38846419414415173, + "learning_rate": 9.467933550016188e-05, + "loss": 2.9283, + "step": 15037 + }, + { + "epoch": 0.7001420024675838, + "grad_norm": 
0.3729902780930641, + "learning_rate": 9.467811950815828e-05, + "loss": 2.8656, + "step": 15038 + }, + { + "epoch": 0.700188560653677, + "grad_norm": 0.36624262050595546, + "learning_rate": 9.467690338502881e-05, + "loss": 2.8615, + "step": 15039 + }, + { + "epoch": 0.70023511883977, + "grad_norm": 0.34731498845494413, + "learning_rate": 9.467568713077703e-05, + "loss": 3.0199, + "step": 15040 + }, + { + "epoch": 0.7002816770258631, + "grad_norm": 0.3421123044435521, + "learning_rate": 9.467447074540649e-05, + "loss": 3.1298, + "step": 15041 + }, + { + "epoch": 0.7003282352119562, + "grad_norm": 0.34890684682386475, + "learning_rate": 9.467325422892081e-05, + "loss": 2.965, + "step": 15042 + }, + { + "epoch": 0.7003747933980492, + "grad_norm": 0.3455032356179176, + "learning_rate": 9.467203758132351e-05, + "loss": 3.0066, + "step": 15043 + }, + { + "epoch": 0.7004213515841423, + "grad_norm": 0.34595861887811524, + "learning_rate": 9.467082080261819e-05, + "loss": 2.9319, + "step": 15044 + }, + { + "epoch": 0.7004679097702353, + "grad_norm": 0.36582537643584234, + "learning_rate": 9.466960389280841e-05, + "loss": 2.9639, + "step": 15045 + }, + { + "epoch": 0.7005144679563284, + "grad_norm": 0.3322038719446949, + "learning_rate": 9.466838685189773e-05, + "loss": 3.0389, + "step": 15046 + }, + { + "epoch": 0.7005610261424214, + "grad_norm": 0.3905927785265478, + "learning_rate": 9.466716967988976e-05, + "loss": 2.9916, + "step": 15047 + }, + { + "epoch": 0.7006075843285146, + "grad_norm": 0.337276369388026, + "learning_rate": 9.466595237678803e-05, + "loss": 3.0626, + "step": 15048 + }, + { + "epoch": 0.7006541425146077, + "grad_norm": 0.36878758393039224, + "learning_rate": 9.466473494259613e-05, + "loss": 2.958, + "step": 15049 + }, + { + "epoch": 0.7007007007007007, + "grad_norm": 0.35746563270327186, + "learning_rate": 9.466351737731765e-05, + "loss": 3.0375, + "step": 15050 + }, + { + "epoch": 0.7007472588867938, + "grad_norm": 0.3514102619979911, + "learning_rate": 9.466229968095613e-05, + "loss": 3.0464, + "step": 15051 + }, + { + "epoch": 0.7007938170728868, + "grad_norm": 0.38418089322300214, + "learning_rate": 9.466108185351517e-05, + "loss": 3.0917, + "step": 15052 + }, + { + "epoch": 0.7008403752589799, + "grad_norm": 0.3811688308022764, + "learning_rate": 9.465986389499832e-05, + "loss": 2.9738, + "step": 15053 + }, + { + "epoch": 0.700886933445073, + "grad_norm": 0.37671475274565275, + "learning_rate": 9.465864580540918e-05, + "loss": 3.0815, + "step": 15054 + }, + { + "epoch": 0.700933491631166, + "grad_norm": 0.40861272663354375, + "learning_rate": 9.465742758475131e-05, + "loss": 2.866, + "step": 15055 + }, + { + "epoch": 0.7009800498172591, + "grad_norm": 0.3385255490484193, + "learning_rate": 9.465620923302829e-05, + "loss": 2.9638, + "step": 15056 + }, + { + "epoch": 0.7010266080033521, + "grad_norm": 0.3883561631999421, + "learning_rate": 9.46549907502437e-05, + "loss": 3.0328, + "step": 15057 + }, + { + "epoch": 0.7010731661894453, + "grad_norm": 0.3399873110008705, + "learning_rate": 9.465377213640111e-05, + "loss": 3.0058, + "step": 15058 + }, + { + "epoch": 0.7011197243755384, + "grad_norm": 0.3627530100035758, + "learning_rate": 9.465255339150409e-05, + "loss": 2.9118, + "step": 15059 + }, + { + "epoch": 0.7011662825616314, + "grad_norm": 0.37784421996207934, + "learning_rate": 9.465133451555623e-05, + "loss": 3.0191, + "step": 15060 + }, + { + "epoch": 0.7012128407477245, + "grad_norm": 0.35918857195307086, + "learning_rate": 9.465011550856111e-05, + "loss": 2.9653, 
+ "step": 15061 + }, + { + "epoch": 0.7012593989338175, + "grad_norm": 0.3525413815057742, + "learning_rate": 9.46488963705223e-05, + "loss": 2.9585, + "step": 15062 + }, + { + "epoch": 0.7013059571199106, + "grad_norm": 0.3810481970680997, + "learning_rate": 9.464767710144337e-05, + "loss": 3.0534, + "step": 15063 + }, + { + "epoch": 0.7013525153060037, + "grad_norm": 0.34082081254749835, + "learning_rate": 9.464645770132789e-05, + "loss": 2.9188, + "step": 15064 + }, + { + "epoch": 0.7013990734920967, + "grad_norm": 0.35898701092795604, + "learning_rate": 9.464523817017949e-05, + "loss": 2.9599, + "step": 15065 + }, + { + "epoch": 0.7014456316781899, + "grad_norm": 0.39262246761170444, + "learning_rate": 9.464401850800169e-05, + "loss": 2.9326, + "step": 15066 + }, + { + "epoch": 0.7014921898642829, + "grad_norm": 0.3376381865402805, + "learning_rate": 9.464279871479811e-05, + "loss": 3.0719, + "step": 15067 + }, + { + "epoch": 0.701538748050376, + "grad_norm": 0.32099396156866555, + "learning_rate": 9.464157879057229e-05, + "loss": 2.9349, + "step": 15068 + }, + { + "epoch": 0.701585306236469, + "grad_norm": 0.3638582758581464, + "learning_rate": 9.464035873532786e-05, + "loss": 2.9972, + "step": 15069 + }, + { + "epoch": 0.7016318644225621, + "grad_norm": 0.32415104746244733, + "learning_rate": 9.463913854906836e-05, + "loss": 3.0275, + "step": 15070 + }, + { + "epoch": 0.7016784226086552, + "grad_norm": 0.3403501426095813, + "learning_rate": 9.46379182317974e-05, + "loss": 3.0342, + "step": 15071 + }, + { + "epoch": 0.7017249807947482, + "grad_norm": 0.3425360847352098, + "learning_rate": 9.463669778351853e-05, + "loss": 3.0119, + "step": 15072 + }, + { + "epoch": 0.7017715389808413, + "grad_norm": 0.3473351917605487, + "learning_rate": 9.463547720423536e-05, + "loss": 2.8718, + "step": 15073 + }, + { + "epoch": 0.7018180971669343, + "grad_norm": 0.33233944040114843, + "learning_rate": 9.463425649395145e-05, + "loss": 3.04, + "step": 15074 + }, + { + "epoch": 0.7018646553530274, + "grad_norm": 0.3804165003568469, + "learning_rate": 9.46330356526704e-05, + "loss": 3.0049, + "step": 15075 + }, + { + "epoch": 0.7019112135391206, + "grad_norm": 0.36676059917204673, + "learning_rate": 9.463181468039579e-05, + "loss": 3.0177, + "step": 15076 + }, + { + "epoch": 0.7019577717252136, + "grad_norm": 0.4256134219731853, + "learning_rate": 9.46305935771312e-05, + "loss": 3.0255, + "step": 15077 + }, + { + "epoch": 0.7020043299113067, + "grad_norm": 0.406345333141523, + "learning_rate": 9.462937234288022e-05, + "loss": 2.9715, + "step": 15078 + }, + { + "epoch": 0.7020508880973997, + "grad_norm": 0.37806778903264077, + "learning_rate": 9.462815097764643e-05, + "loss": 2.9979, + "step": 15079 + }, + { + "epoch": 0.7020974462834928, + "grad_norm": 0.40005117204474977, + "learning_rate": 9.46269294814334e-05, + "loss": 2.9901, + "step": 15080 + }, + { + "epoch": 0.7021440044695859, + "grad_norm": 0.3664281130890941, + "learning_rate": 9.462570785424474e-05, + "loss": 3.0455, + "step": 15081 + }, + { + "epoch": 0.7021905626556789, + "grad_norm": 0.4368372628581947, + "learning_rate": 9.4624486096084e-05, + "loss": 2.9895, + "step": 15082 + }, + { + "epoch": 0.702237120841772, + "grad_norm": 0.3627517909444529, + "learning_rate": 9.462326420695482e-05, + "loss": 3.0693, + "step": 15083 + }, + { + "epoch": 0.702283679027865, + "grad_norm": 0.4096478524334347, + "learning_rate": 9.462204218686075e-05, + "loss": 3.0772, + "step": 15084 + }, + { + "epoch": 0.7023302372139582, + "grad_norm": 
0.3869827222839659, + "learning_rate": 9.462082003580539e-05, + "loss": 3.1862, + "step": 15085 + }, + { + "epoch": 0.7023767954000513, + "grad_norm": 0.42148405106976794, + "learning_rate": 9.46195977537923e-05, + "loss": 2.9792, + "step": 15086 + }, + { + "epoch": 0.7024233535861443, + "grad_norm": 0.4044101046718946, + "learning_rate": 9.461837534082509e-05, + "loss": 3.0546, + "step": 15087 + }, + { + "epoch": 0.7024699117722374, + "grad_norm": 0.3487694264613756, + "learning_rate": 9.461715279690735e-05, + "loss": 3.0011, + "step": 15088 + }, + { + "epoch": 0.7025164699583304, + "grad_norm": 0.38666085335482875, + "learning_rate": 9.461593012204266e-05, + "loss": 3.027, + "step": 15089 + }, + { + "epoch": 0.7025630281444235, + "grad_norm": 0.3779300812203595, + "learning_rate": 9.461470731623461e-05, + "loss": 3.008, + "step": 15090 + }, + { + "epoch": 0.7026095863305165, + "grad_norm": 0.38221280124367957, + "learning_rate": 9.461348437948678e-05, + "loss": 3.1117, + "step": 15091 + }, + { + "epoch": 0.7026561445166096, + "grad_norm": 0.4045302739417482, + "learning_rate": 9.461226131180278e-05, + "loss": 2.9793, + "step": 15092 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 0.36151689014642174, + "learning_rate": 9.461103811318618e-05, + "loss": 2.9925, + "step": 15093 + }, + { + "epoch": 0.7027492608887957, + "grad_norm": 0.3738911479387267, + "learning_rate": 9.46098147836406e-05, + "loss": 3.0152, + "step": 15094 + }, + { + "epoch": 0.7027958190748889, + "grad_norm": 0.37443480250392536, + "learning_rate": 9.460859132316959e-05, + "loss": 3.0519, + "step": 15095 + }, + { + "epoch": 0.7028423772609819, + "grad_norm": 0.35352101456430096, + "learning_rate": 9.460736773177675e-05, + "loss": 2.9091, + "step": 15096 + }, + { + "epoch": 0.702888935447075, + "grad_norm": 0.3735295449958171, + "learning_rate": 9.46061440094657e-05, + "loss": 2.928, + "step": 15097 + }, + { + "epoch": 0.7029354936331681, + "grad_norm": 0.3537417827886179, + "learning_rate": 9.460492015624e-05, + "loss": 2.9849, + "step": 15098 + }, + { + "epoch": 0.7029820518192611, + "grad_norm": 0.33360416380456165, + "learning_rate": 9.460369617210326e-05, + "loss": 2.934, + "step": 15099 + }, + { + "epoch": 0.7030286100053542, + "grad_norm": 0.3664272848684126, + "learning_rate": 9.460247205705905e-05, + "loss": 2.8982, + "step": 15100 + }, + { + "epoch": 0.7030751681914472, + "grad_norm": 0.339364767293446, + "learning_rate": 9.4601247811111e-05, + "loss": 3.0348, + "step": 15101 + }, + { + "epoch": 0.7031217263775403, + "grad_norm": 0.3909330049258777, + "learning_rate": 9.460002343426266e-05, + "loss": 2.8295, + "step": 15102 + }, + { + "epoch": 0.7031682845636335, + "grad_norm": 0.36073090582129874, + "learning_rate": 9.459879892651767e-05, + "loss": 3.0309, + "step": 15103 + }, + { + "epoch": 0.7032148427497265, + "grad_norm": 0.32027142920795765, + "learning_rate": 9.459757428787958e-05, + "loss": 2.845, + "step": 15104 + }, + { + "epoch": 0.7032614009358196, + "grad_norm": 0.33233248346704297, + "learning_rate": 9.459634951835202e-05, + "loss": 3.0387, + "step": 15105 + }, + { + "epoch": 0.7033079591219126, + "grad_norm": 0.35851157843835835, + "learning_rate": 9.459512461793855e-05, + "loss": 3.0204, + "step": 15106 + }, + { + "epoch": 0.7033545173080057, + "grad_norm": 0.33877362364794733, + "learning_rate": 9.45938995866428e-05, + "loss": 3.009, + "step": 15107 + }, + { + "epoch": 0.7034010754940988, + "grad_norm": 0.3310259717826977, + "learning_rate": 9.459267442446833e-05, + "loss": 3.0467, + 
"step": 15108 + }, + { + "epoch": 0.7034476336801918, + "grad_norm": 0.3612123484849393, + "learning_rate": 9.459144913141876e-05, + "loss": 2.9586, + "step": 15109 + }, + { + "epoch": 0.7034941918662849, + "grad_norm": 0.3368635203266549, + "learning_rate": 9.459022370749768e-05, + "loss": 2.9353, + "step": 15110 + }, + { + "epoch": 0.7035407500523779, + "grad_norm": 0.3543136377032002, + "learning_rate": 9.458899815270869e-05, + "loss": 3.0583, + "step": 15111 + }, + { + "epoch": 0.703587308238471, + "grad_norm": 0.36836809785587066, + "learning_rate": 9.458777246705538e-05, + "loss": 3.0572, + "step": 15112 + }, + { + "epoch": 0.703633866424564, + "grad_norm": 0.3217426795966384, + "learning_rate": 9.458654665054137e-05, + "loss": 2.9818, + "step": 15113 + }, + { + "epoch": 0.7036804246106572, + "grad_norm": 0.3690470149635002, + "learning_rate": 9.458532070317021e-05, + "loss": 2.9555, + "step": 15114 + }, + { + "epoch": 0.7037269827967503, + "grad_norm": 0.3206095645456658, + "learning_rate": 9.458409462494553e-05, + "loss": 3.0644, + "step": 15115 + }, + { + "epoch": 0.7037735409828433, + "grad_norm": 0.36535186878978615, + "learning_rate": 9.458286841587094e-05, + "loss": 2.9784, + "step": 15116 + }, + { + "epoch": 0.7038200991689364, + "grad_norm": 0.3300262118864308, + "learning_rate": 9.458164207595e-05, + "loss": 3.0102, + "step": 15117 + }, + { + "epoch": 0.7038666573550294, + "grad_norm": 0.33440319024699156, + "learning_rate": 9.458041560518635e-05, + "loss": 2.9883, + "step": 15118 + }, + { + "epoch": 0.7039132155411225, + "grad_norm": 0.36748878235463933, + "learning_rate": 9.457918900358357e-05, + "loss": 2.9873, + "step": 15119 + }, + { + "epoch": 0.7039597737272156, + "grad_norm": 0.3514830805649433, + "learning_rate": 9.457796227114527e-05, + "loss": 3.0558, + "step": 15120 + }, + { + "epoch": 0.7040063319133086, + "grad_norm": 0.3479869804056148, + "learning_rate": 9.457673540787502e-05, + "loss": 3.0377, + "step": 15121 + }, + { + "epoch": 0.7040528900994018, + "grad_norm": 0.36839050518083444, + "learning_rate": 9.457550841377646e-05, + "loss": 3.0067, + "step": 15122 + }, + { + "epoch": 0.7040994482854948, + "grad_norm": 0.3774569991024627, + "learning_rate": 9.457428128885318e-05, + "loss": 3.1023, + "step": 15123 + }, + { + "epoch": 0.7041460064715879, + "grad_norm": 0.3456512406711574, + "learning_rate": 9.457305403310875e-05, + "loss": 3.0195, + "step": 15124 + }, + { + "epoch": 0.704192564657681, + "grad_norm": 0.31010076959529503, + "learning_rate": 9.457182664654681e-05, + "loss": 3.12, + "step": 15125 + }, + { + "epoch": 0.704239122843774, + "grad_norm": 0.33814641766939457, + "learning_rate": 9.457059912917096e-05, + "loss": 3.0411, + "step": 15126 + }, + { + "epoch": 0.7042856810298671, + "grad_norm": 0.3587069741851343, + "learning_rate": 9.456937148098478e-05, + "loss": 3.06, + "step": 15127 + }, + { + "epoch": 0.7043322392159601, + "grad_norm": 0.3312951571220656, + "learning_rate": 9.45681437019919e-05, + "loss": 2.9106, + "step": 15128 + }, + { + "epoch": 0.7043787974020532, + "grad_norm": 0.3513791557773505, + "learning_rate": 9.45669157921959e-05, + "loss": 3.1052, + "step": 15129 + }, + { + "epoch": 0.7044253555881463, + "grad_norm": 0.32860319162534946, + "learning_rate": 9.456568775160038e-05, + "loss": 3.0468, + "step": 15130 + }, + { + "epoch": 0.7044719137742393, + "grad_norm": 0.31584781425562564, + "learning_rate": 9.456445958020897e-05, + "loss": 2.8805, + "step": 15131 + }, + { + "epoch": 0.7045184719603325, + "grad_norm": 
0.36688340841316425, + "learning_rate": 9.456323127802526e-05, + "loss": 3.0874, + "step": 15132 + }, + { + "epoch": 0.7045650301464255, + "grad_norm": 0.3506513529384165, + "learning_rate": 9.456200284505287e-05, + "loss": 3.0518, + "step": 15133 + }, + { + "epoch": 0.7046115883325186, + "grad_norm": 0.33679701345996355, + "learning_rate": 9.456077428129537e-05, + "loss": 3.0238, + "step": 15134 + }, + { + "epoch": 0.7046581465186116, + "grad_norm": 0.3658804057003311, + "learning_rate": 9.45595455867564e-05, + "loss": 3.0278, + "step": 15135 + }, + { + "epoch": 0.7047047047047047, + "grad_norm": 0.34034581443413725, + "learning_rate": 9.455831676143956e-05, + "loss": 2.9576, + "step": 15136 + }, + { + "epoch": 0.7047512628907978, + "grad_norm": 0.3813770179871628, + "learning_rate": 9.455708780534844e-05, + "loss": 3.0609, + "step": 15137 + }, + { + "epoch": 0.7047978210768908, + "grad_norm": 0.3929194264102399, + "learning_rate": 9.455585871848665e-05, + "loss": 3.0729, + "step": 15138 + }, + { + "epoch": 0.7048443792629839, + "grad_norm": 0.3746459455116828, + "learning_rate": 9.45546295008578e-05, + "loss": 2.8919, + "step": 15139 + }, + { + "epoch": 0.7048909374490769, + "grad_norm": 0.3594723077015563, + "learning_rate": 9.455340015246553e-05, + "loss": 2.9926, + "step": 15140 + }, + { + "epoch": 0.70493749563517, + "grad_norm": 0.37067569433079584, + "learning_rate": 9.455217067331339e-05, + "loss": 2.9744, + "step": 15141 + }, + { + "epoch": 0.7049840538212632, + "grad_norm": 0.3778298674052789, + "learning_rate": 9.455094106340503e-05, + "loss": 3.0089, + "step": 15142 + }, + { + "epoch": 0.7050306120073562, + "grad_norm": 0.3799847614120705, + "learning_rate": 9.454971132274407e-05, + "loss": 2.9584, + "step": 15143 + }, + { + "epoch": 0.7050771701934493, + "grad_norm": 0.3949667076488844, + "learning_rate": 9.454848145133406e-05, + "loss": 2.9872, + "step": 15144 + }, + { + "epoch": 0.7051237283795423, + "grad_norm": 0.395051855428153, + "learning_rate": 9.454725144917866e-05, + "loss": 3.056, + "step": 15145 + }, + { + "epoch": 0.7051702865656354, + "grad_norm": 0.3286121300340188, + "learning_rate": 9.454602131628146e-05, + "loss": 2.9454, + "step": 15146 + }, + { + "epoch": 0.7052168447517285, + "grad_norm": 0.40007274237801593, + "learning_rate": 9.454479105264608e-05, + "loss": 2.9055, + "step": 15147 + }, + { + "epoch": 0.7052634029378215, + "grad_norm": 0.3473336599765632, + "learning_rate": 9.454356065827613e-05, + "loss": 3.0053, + "step": 15148 + }, + { + "epoch": 0.7053099611239146, + "grad_norm": 0.4280663177111624, + "learning_rate": 9.454233013317522e-05, + "loss": 3.0338, + "step": 15149 + }, + { + "epoch": 0.7053565193100076, + "grad_norm": 0.360159765563964, + "learning_rate": 9.454109947734695e-05, + "loss": 2.9683, + "step": 15150 + }, + { + "epoch": 0.7054030774961008, + "grad_norm": 0.4088965898523535, + "learning_rate": 9.453986869079495e-05, + "loss": 3.0139, + "step": 15151 + }, + { + "epoch": 0.7054496356821939, + "grad_norm": 0.3581850186784474, + "learning_rate": 9.453863777352282e-05, + "loss": 2.889, + "step": 15152 + }, + { + "epoch": 0.7054961938682869, + "grad_norm": 0.3665378368026059, + "learning_rate": 9.453740672553417e-05, + "loss": 2.9626, + "step": 15153 + }, + { + "epoch": 0.70554275205438, + "grad_norm": 0.4162143105741761, + "learning_rate": 9.453617554683262e-05, + "loss": 2.9562, + "step": 15154 + }, + { + "epoch": 0.705589310240473, + "grad_norm": 0.3586888778446355, + "learning_rate": 9.45349442374218e-05, + "loss": 3.0338, + 
"step": 15155 + }, + { + "epoch": 0.7056358684265661, + "grad_norm": 0.39454920364608015, + "learning_rate": 9.45337127973053e-05, + "loss": 3.0783, + "step": 15156 + }, + { + "epoch": 0.7056824266126591, + "grad_norm": 0.36619180847170774, + "learning_rate": 9.453248122648673e-05, + "loss": 2.9241, + "step": 15157 + }, + { + "epoch": 0.7057289847987522, + "grad_norm": 0.4197847151259769, + "learning_rate": 9.453124952496972e-05, + "loss": 3.1613, + "step": 15158 + }, + { + "epoch": 0.7057755429848453, + "grad_norm": 0.3795894945326368, + "learning_rate": 9.453001769275787e-05, + "loss": 3.044, + "step": 15159 + }, + { + "epoch": 0.7058221011709384, + "grad_norm": 0.36752890573447966, + "learning_rate": 9.452878572985482e-05, + "loss": 2.9173, + "step": 15160 + }, + { + "epoch": 0.7058686593570315, + "grad_norm": 0.41840825167955253, + "learning_rate": 9.452755363626415e-05, + "loss": 2.9854, + "step": 15161 + }, + { + "epoch": 0.7059152175431245, + "grad_norm": 0.35665472343650517, + "learning_rate": 9.452632141198952e-05, + "loss": 2.9498, + "step": 15162 + }, + { + "epoch": 0.7059617757292176, + "grad_norm": 0.38799267093770945, + "learning_rate": 9.45250890570345e-05, + "loss": 3.0939, + "step": 15163 + }, + { + "epoch": 0.7060083339153107, + "grad_norm": 0.3638211443503569, + "learning_rate": 9.452385657140275e-05, + "loss": 2.9369, + "step": 15164 + }, + { + "epoch": 0.7060548921014037, + "grad_norm": 0.34802419658984657, + "learning_rate": 9.452262395509785e-05, + "loss": 2.9413, + "step": 15165 + }, + { + "epoch": 0.7061014502874968, + "grad_norm": 0.3941997434149132, + "learning_rate": 9.452139120812346e-05, + "loss": 2.9176, + "step": 15166 + }, + { + "epoch": 0.7061480084735898, + "grad_norm": 0.34263529687849864, + "learning_rate": 9.452015833048316e-05, + "loss": 2.9146, + "step": 15167 + }, + { + "epoch": 0.7061945666596829, + "grad_norm": 0.3986513134821587, + "learning_rate": 9.451892532218056e-05, + "loss": 2.9731, + "step": 15168 + }, + { + "epoch": 0.7062411248457761, + "grad_norm": 0.3468753405352981, + "learning_rate": 9.451769218321933e-05, + "loss": 2.986, + "step": 15169 + }, + { + "epoch": 0.7062876830318691, + "grad_norm": 0.38945280477222016, + "learning_rate": 9.451645891360306e-05, + "loss": 2.9259, + "step": 15170 + }, + { + "epoch": 0.7063342412179622, + "grad_norm": 0.32304520364319794, + "learning_rate": 9.451522551333535e-05, + "loss": 2.9362, + "step": 15171 + }, + { + "epoch": 0.7063807994040552, + "grad_norm": 0.4049063116628564, + "learning_rate": 9.451399198241984e-05, + "loss": 3.0873, + "step": 15172 + }, + { + "epoch": 0.7064273575901483, + "grad_norm": 0.338654539002225, + "learning_rate": 9.451275832086015e-05, + "loss": 3.0188, + "step": 15173 + }, + { + "epoch": 0.7064739157762414, + "grad_norm": 0.37642981120613944, + "learning_rate": 9.451152452865992e-05, + "loss": 3.0046, + "step": 15174 + }, + { + "epoch": 0.7065204739623344, + "grad_norm": 0.3294384039789289, + "learning_rate": 9.451029060582273e-05, + "loss": 3.0902, + "step": 15175 + }, + { + "epoch": 0.7065670321484275, + "grad_norm": 0.40370311944662557, + "learning_rate": 9.450905655235222e-05, + "loss": 3.0047, + "step": 15176 + }, + { + "epoch": 0.7066135903345205, + "grad_norm": 0.35211625167686694, + "learning_rate": 9.450782236825202e-05, + "loss": 2.9807, + "step": 15177 + }, + { + "epoch": 0.7066601485206137, + "grad_norm": 0.35173483167866154, + "learning_rate": 9.450658805352575e-05, + "loss": 2.9569, + "step": 15178 + }, + { + "epoch": 0.7067067067067067, + "grad_norm": 
0.3826371730158995, + "learning_rate": 9.450535360817703e-05, + "loss": 3.0205, + "step": 15179 + }, + { + "epoch": 0.7067532648927998, + "grad_norm": 0.3755972415843987, + "learning_rate": 9.450411903220948e-05, + "loss": 2.8864, + "step": 15180 + }, + { + "epoch": 0.7067998230788929, + "grad_norm": 0.3599828046237246, + "learning_rate": 9.450288432562672e-05, + "loss": 2.9817, + "step": 15181 + }, + { + "epoch": 0.7068463812649859, + "grad_norm": 0.3354458076203718, + "learning_rate": 9.450164948843239e-05, + "loss": 3.0546, + "step": 15182 + }, + { + "epoch": 0.706892939451079, + "grad_norm": 0.38059236599708984, + "learning_rate": 9.450041452063008e-05, + "loss": 2.9829, + "step": 15183 + }, + { + "epoch": 0.706939497637172, + "grad_norm": 0.3486179662490935, + "learning_rate": 9.449917942222346e-05, + "loss": 2.9101, + "step": 15184 + }, + { + "epoch": 0.7069860558232651, + "grad_norm": 0.3433297363204249, + "learning_rate": 9.449794419321611e-05, + "loss": 2.9543, + "step": 15185 + }, + { + "epoch": 0.7070326140093582, + "grad_norm": 0.33628246737870604, + "learning_rate": 9.44967088336117e-05, + "loss": 2.9948, + "step": 15186 + }, + { + "epoch": 0.7070791721954512, + "grad_norm": 0.3176838861790803, + "learning_rate": 9.449547334341382e-05, + "loss": 3.0627, + "step": 15187 + }, + { + "epoch": 0.7071257303815444, + "grad_norm": 0.37969844789500773, + "learning_rate": 9.449423772262611e-05, + "loss": 3.0634, + "step": 15188 + }, + { + "epoch": 0.7071722885676374, + "grad_norm": 0.3336742610618695, + "learning_rate": 9.449300197125221e-05, + "loss": 3.0649, + "step": 15189 + }, + { + "epoch": 0.7072188467537305, + "grad_norm": 0.345132074393698, + "learning_rate": 9.449176608929572e-05, + "loss": 3.0535, + "step": 15190 + }, + { + "epoch": 0.7072654049398236, + "grad_norm": 0.35863049100899314, + "learning_rate": 9.449053007676028e-05, + "loss": 3.025, + "step": 15191 + }, + { + "epoch": 0.7073119631259166, + "grad_norm": 0.34416260668022924, + "learning_rate": 9.448929393364951e-05, + "loss": 2.9149, + "step": 15192 + }, + { + "epoch": 0.7073585213120097, + "grad_norm": 0.387531150489248, + "learning_rate": 9.448805765996707e-05, + "loss": 3.0147, + "step": 15193 + }, + { + "epoch": 0.7074050794981027, + "grad_norm": 0.3487547897962992, + "learning_rate": 9.448682125571655e-05, + "loss": 3.0008, + "step": 15194 + }, + { + "epoch": 0.7074516376841958, + "grad_norm": 0.3544754379574759, + "learning_rate": 9.448558472090158e-05, + "loss": 3.0133, + "step": 15195 + }, + { + "epoch": 0.707498195870289, + "grad_norm": 0.36140970554588975, + "learning_rate": 9.448434805552582e-05, + "loss": 2.9649, + "step": 15196 + }, + { + "epoch": 0.707544754056382, + "grad_norm": 0.3382809323407466, + "learning_rate": 9.448311125959286e-05, + "loss": 2.9707, + "step": 15197 + }, + { + "epoch": 0.7075913122424751, + "grad_norm": 0.3379310271111462, + "learning_rate": 9.448187433310637e-05, + "loss": 2.9833, + "step": 15198 + }, + { + "epoch": 0.7076378704285681, + "grad_norm": 0.3472396297989277, + "learning_rate": 9.448063727606994e-05, + "loss": 2.9827, + "step": 15199 + }, + { + "epoch": 0.7076844286146612, + "grad_norm": 0.3332448462930489, + "learning_rate": 9.447940008848726e-05, + "loss": 3.0484, + "step": 15200 + }, + { + "epoch": 0.7077309868007542, + "grad_norm": 0.32741984010526015, + "learning_rate": 9.447816277036189e-05, + "loss": 2.9978, + "step": 15201 + }, + { + "epoch": 0.7077775449868473, + "grad_norm": 0.3274504388848788, + "learning_rate": 9.44769253216975e-05, + "loss": 3.1729, + 
"step": 15202 + }, + { + "epoch": 0.7078241031729404, + "grad_norm": 0.3444658480729288, + "learning_rate": 9.447568774249772e-05, + "loss": 3.0368, + "step": 15203 + }, + { + "epoch": 0.7078706613590334, + "grad_norm": 0.34010618891906347, + "learning_rate": 9.447445003276618e-05, + "loss": 3.066, + "step": 15204 + }, + { + "epoch": 0.7079172195451265, + "grad_norm": 0.36175423424276415, + "learning_rate": 9.447321219250651e-05, + "loss": 3.0429, + "step": 15205 + }, + { + "epoch": 0.7079637777312195, + "grad_norm": 0.33616775767896245, + "learning_rate": 9.447197422172233e-05, + "loss": 2.94, + "step": 15206 + }, + { + "epoch": 0.7080103359173127, + "grad_norm": 0.33257718040894163, + "learning_rate": 9.44707361204173e-05, + "loss": 2.9634, + "step": 15207 + }, + { + "epoch": 0.7080568941034058, + "grad_norm": 0.3739840950651574, + "learning_rate": 9.446949788859505e-05, + "loss": 3.0007, + "step": 15208 + }, + { + "epoch": 0.7081034522894988, + "grad_norm": 0.37254972779036655, + "learning_rate": 9.446825952625918e-05, + "loss": 2.9062, + "step": 15209 + }, + { + "epoch": 0.7081500104755919, + "grad_norm": 0.3518642492532422, + "learning_rate": 9.446702103341336e-05, + "loss": 2.9191, + "step": 15210 + }, + { + "epoch": 0.7081965686616849, + "grad_norm": 0.32868901122864674, + "learning_rate": 9.446578241006122e-05, + "loss": 2.9384, + "step": 15211 + }, + { + "epoch": 0.708243126847778, + "grad_norm": 0.35914471349692484, + "learning_rate": 9.446454365620638e-05, + "loss": 2.9555, + "step": 15212 + }, + { + "epoch": 0.7082896850338711, + "grad_norm": 0.38171740558604594, + "learning_rate": 9.44633047718525e-05, + "loss": 3.0046, + "step": 15213 + }, + { + "epoch": 0.7083362432199641, + "grad_norm": 0.3514518614527248, + "learning_rate": 9.446206575700319e-05, + "loss": 2.9932, + "step": 15214 + }, + { + "epoch": 0.7083828014060572, + "grad_norm": 0.3859526898221838, + "learning_rate": 9.44608266116621e-05, + "loss": 2.9666, + "step": 15215 + }, + { + "epoch": 0.7084293595921503, + "grad_norm": 0.3503295974782634, + "learning_rate": 9.445958733583287e-05, + "loss": 3.0466, + "step": 15216 + }, + { + "epoch": 0.7084759177782434, + "grad_norm": 0.38912070342805655, + "learning_rate": 9.445834792951911e-05, + "loss": 2.9832, + "step": 15217 + }, + { + "epoch": 0.7085224759643365, + "grad_norm": 0.41641514704900306, + "learning_rate": 9.445710839272449e-05, + "loss": 2.8535, + "step": 15218 + }, + { + "epoch": 0.7085690341504295, + "grad_norm": 0.3600093148783471, + "learning_rate": 9.445586872545265e-05, + "loss": 3.1055, + "step": 15219 + }, + { + "epoch": 0.7086155923365226, + "grad_norm": 0.4278348930440637, + "learning_rate": 9.44546289277072e-05, + "loss": 2.9092, + "step": 15220 + }, + { + "epoch": 0.7086621505226156, + "grad_norm": 0.3586893052797901, + "learning_rate": 9.445338899949178e-05, + "loss": 2.9801, + "step": 15221 + }, + { + "epoch": 0.7087087087087087, + "grad_norm": 0.3648219589361736, + "learning_rate": 9.445214894081007e-05, + "loss": 2.9463, + "step": 15222 + }, + { + "epoch": 0.7087552668948017, + "grad_norm": 0.3997457378292009, + "learning_rate": 9.445090875166568e-05, + "loss": 2.9623, + "step": 15223 + }, + { + "epoch": 0.7088018250808948, + "grad_norm": 0.3783037313841622, + "learning_rate": 9.444966843206224e-05, + "loss": 3.0481, + "step": 15224 + }, + { + "epoch": 0.708848383266988, + "grad_norm": 0.42067911087769755, + "learning_rate": 9.444842798200341e-05, + "loss": 2.9898, + "step": 15225 + }, + { + "epoch": 0.708894941453081, + "grad_norm": 
0.3272603136199862, + "learning_rate": 9.444718740149281e-05, + "loss": 3.0496, + "step": 15226 + }, + { + "epoch": 0.7089414996391741, + "grad_norm": 0.3495086931284255, + "learning_rate": 9.44459466905341e-05, + "loss": 3.0346, + "step": 15227 + }, + { + "epoch": 0.7089880578252671, + "grad_norm": 0.31187119212030423, + "learning_rate": 9.444470584913093e-05, + "loss": 3.0124, + "step": 15228 + }, + { + "epoch": 0.7090346160113602, + "grad_norm": 0.35248391698074716, + "learning_rate": 9.444346487728691e-05, + "loss": 2.9226, + "step": 15229 + }, + { + "epoch": 0.7090811741974533, + "grad_norm": 0.3053593295125898, + "learning_rate": 9.444222377500572e-05, + "loss": 3.0132, + "step": 15230 + }, + { + "epoch": 0.7091277323835463, + "grad_norm": 0.34632323794905484, + "learning_rate": 9.444098254229095e-05, + "loss": 3.0049, + "step": 15231 + }, + { + "epoch": 0.7091742905696394, + "grad_norm": 0.34399797030178375, + "learning_rate": 9.443974117914629e-05, + "loss": 2.9423, + "step": 15232 + }, + { + "epoch": 0.7092208487557324, + "grad_norm": 0.3517810495345403, + "learning_rate": 9.443849968557537e-05, + "loss": 2.9562, + "step": 15233 + }, + { + "epoch": 0.7092674069418256, + "grad_norm": 0.3686341542787184, + "learning_rate": 9.443725806158182e-05, + "loss": 3.0096, + "step": 15234 + }, + { + "epoch": 0.7093139651279187, + "grad_norm": 0.3619446444782412, + "learning_rate": 9.443601630716932e-05, + "loss": 2.8792, + "step": 15235 + }, + { + "epoch": 0.7093605233140117, + "grad_norm": 0.3626788843677392, + "learning_rate": 9.443477442234146e-05, + "loss": 3.0698, + "step": 15236 + }, + { + "epoch": 0.7094070815001048, + "grad_norm": 0.3600492844618293, + "learning_rate": 9.443353240710193e-05, + "loss": 2.9899, + "step": 15237 + }, + { + "epoch": 0.7094536396861978, + "grad_norm": 0.3750591647196632, + "learning_rate": 9.443229026145436e-05, + "loss": 3.0091, + "step": 15238 + }, + { + "epoch": 0.7095001978722909, + "grad_norm": 0.3586202709613999, + "learning_rate": 9.443104798540239e-05, + "loss": 3.0392, + "step": 15239 + }, + { + "epoch": 0.709546756058384, + "grad_norm": 0.3402344126649171, + "learning_rate": 9.442980557894967e-05, + "loss": 3.0252, + "step": 15240 + }, + { + "epoch": 0.709593314244477, + "grad_norm": 0.3738870348542323, + "learning_rate": 9.442856304209987e-05, + "loss": 2.9782, + "step": 15241 + }, + { + "epoch": 0.7096398724305701, + "grad_norm": 0.3687616410266846, + "learning_rate": 9.442732037485659e-05, + "loss": 3.0063, + "step": 15242 + }, + { + "epoch": 0.7096864306166631, + "grad_norm": 0.3253450325425558, + "learning_rate": 9.44260775772235e-05, + "loss": 2.9351, + "step": 15243 + }, + { + "epoch": 0.7097329888027563, + "grad_norm": 0.3648837643854634, + "learning_rate": 9.442483464920426e-05, + "loss": 3.0162, + "step": 15244 + }, + { + "epoch": 0.7097795469888493, + "grad_norm": 0.3687637524069752, + "learning_rate": 9.442359159080251e-05, + "loss": 2.908, + "step": 15245 + }, + { + "epoch": 0.7098261051749424, + "grad_norm": 0.3892452774842284, + "learning_rate": 9.442234840202188e-05, + "loss": 2.955, + "step": 15246 + }, + { + "epoch": 0.7098726633610355, + "grad_norm": 0.3982786350354456, + "learning_rate": 9.442110508286606e-05, + "loss": 2.9951, + "step": 15247 + }, + { + "epoch": 0.7099192215471285, + "grad_norm": 0.40180802101887997, + "learning_rate": 9.441986163333864e-05, + "loss": 2.9397, + "step": 15248 + }, + { + "epoch": 0.7099657797332216, + "grad_norm": 0.3890929612231304, + "learning_rate": 9.441861805344332e-05, + "loss": 2.9275, + 
"step": 15249 + }, + { + "epoch": 0.7100123379193146, + "grad_norm": 0.42004111231044094, + "learning_rate": 9.441737434318374e-05, + "loss": 3.0103, + "step": 15250 + }, + { + "epoch": 0.7100588961054077, + "grad_norm": 0.39228655970932774, + "learning_rate": 9.441613050256352e-05, + "loss": 2.999, + "step": 15251 + }, + { + "epoch": 0.7101054542915008, + "grad_norm": 0.4126802151122403, + "learning_rate": 9.441488653158635e-05, + "loss": 3.0472, + "step": 15252 + }, + { + "epoch": 0.7101520124775939, + "grad_norm": 0.4236568608764982, + "learning_rate": 9.441364243025586e-05, + "loss": 3.0061, + "step": 15253 + }, + { + "epoch": 0.710198570663687, + "grad_norm": 0.3840066469771878, + "learning_rate": 9.44123981985757e-05, + "loss": 2.9323, + "step": 15254 + }, + { + "epoch": 0.71024512884978, + "grad_norm": 0.40830871805159324, + "learning_rate": 9.441115383654954e-05, + "loss": 3.0075, + "step": 15255 + }, + { + "epoch": 0.7102916870358731, + "grad_norm": 0.3739335436844715, + "learning_rate": 9.440990934418102e-05, + "loss": 2.967, + "step": 15256 + }, + { + "epoch": 0.7103382452219662, + "grad_norm": 0.40317225971886034, + "learning_rate": 9.440866472147375e-05, + "loss": 3.0289, + "step": 15257 + }, + { + "epoch": 0.7103848034080592, + "grad_norm": 0.4112768089158373, + "learning_rate": 9.440741996843147e-05, + "loss": 2.9566, + "step": 15258 + }, + { + "epoch": 0.7104313615941523, + "grad_norm": 0.36092158233795557, + "learning_rate": 9.440617508505778e-05, + "loss": 3.1269, + "step": 15259 + }, + { + "epoch": 0.7104779197802453, + "grad_norm": 0.4098469411138882, + "learning_rate": 9.440493007135633e-05, + "loss": 2.8768, + "step": 15260 + }, + { + "epoch": 0.7105244779663384, + "grad_norm": 0.3897900890960023, + "learning_rate": 9.440368492733078e-05, + "loss": 3.0519, + "step": 15261 + }, + { + "epoch": 0.7105710361524316, + "grad_norm": 0.38372271894083315, + "learning_rate": 9.44024396529848e-05, + "loss": 2.9684, + "step": 15262 + }, + { + "epoch": 0.7106175943385246, + "grad_norm": 0.41147157351766817, + "learning_rate": 9.440119424832204e-05, + "loss": 2.9933, + "step": 15263 + }, + { + "epoch": 0.7106641525246177, + "grad_norm": 0.3568642038376943, + "learning_rate": 9.439994871334613e-05, + "loss": 2.9756, + "step": 15264 + }, + { + "epoch": 0.7107107107107107, + "grad_norm": 0.38479393368793924, + "learning_rate": 9.439870304806075e-05, + "loss": 2.8637, + "step": 15265 + }, + { + "epoch": 0.7107572688968038, + "grad_norm": 0.3527903576832344, + "learning_rate": 9.439745725246956e-05, + "loss": 2.8789, + "step": 15266 + }, + { + "epoch": 0.7108038270828968, + "grad_norm": 0.36459042364156385, + "learning_rate": 9.43962113265762e-05, + "loss": 2.9918, + "step": 15267 + }, + { + "epoch": 0.7108503852689899, + "grad_norm": 0.375189974218198, + "learning_rate": 9.439496527038434e-05, + "loss": 2.9359, + "step": 15268 + }, + { + "epoch": 0.710896943455083, + "grad_norm": 0.3451055931611146, + "learning_rate": 9.439371908389763e-05, + "loss": 2.992, + "step": 15269 + }, + { + "epoch": 0.710943501641176, + "grad_norm": 0.3779248455773158, + "learning_rate": 9.439247276711971e-05, + "loss": 2.9328, + "step": 15270 + }, + { + "epoch": 0.7109900598272691, + "grad_norm": 0.3697192935907874, + "learning_rate": 9.439122632005429e-05, + "loss": 3.0069, + "step": 15271 + }, + { + "epoch": 0.7110366180133622, + "grad_norm": 0.35557213658344766, + "learning_rate": 9.438997974270495e-05, + "loss": 3.0236, + "step": 15272 + }, + { + "epoch": 0.7110831761994553, + "grad_norm": 
0.3634967899034014, + "learning_rate": 9.438873303507542e-05, + "loss": 3.0732, + "step": 15273 + }, + { + "epoch": 0.7111297343855484, + "grad_norm": 0.41908357304508154, + "learning_rate": 9.438748619716933e-05, + "loss": 3.0768, + "step": 15274 + }, + { + "epoch": 0.7111762925716414, + "grad_norm": 0.3594453638165098, + "learning_rate": 9.438623922899035e-05, + "loss": 3.0509, + "step": 15275 + }, + { + "epoch": 0.7112228507577345, + "grad_norm": 0.3635253184145237, + "learning_rate": 9.43849921305421e-05, + "loss": 3.0602, + "step": 15276 + }, + { + "epoch": 0.7112694089438275, + "grad_norm": 0.34458063432233305, + "learning_rate": 9.438374490182831e-05, + "loss": 2.9432, + "step": 15277 + }, + { + "epoch": 0.7113159671299206, + "grad_norm": 0.3560253768760888, + "learning_rate": 9.438249754285257e-05, + "loss": 2.9709, + "step": 15278 + }, + { + "epoch": 0.7113625253160137, + "grad_norm": 0.3529850754401031, + "learning_rate": 9.438125005361859e-05, + "loss": 3.115, + "step": 15279 + }, + { + "epoch": 0.7114090835021067, + "grad_norm": 0.36809692055503107, + "learning_rate": 9.438000243413e-05, + "loss": 3.0511, + "step": 15280 + }, + { + "epoch": 0.7114556416881999, + "grad_norm": 0.3849761928354232, + "learning_rate": 9.437875468439048e-05, + "loss": 3.0725, + "step": 15281 + }, + { + "epoch": 0.7115021998742929, + "grad_norm": 0.33637375370093076, + "learning_rate": 9.43775068044037e-05, + "loss": 2.936, + "step": 15282 + }, + { + "epoch": 0.711548758060386, + "grad_norm": 0.4027116727728424, + "learning_rate": 9.437625879417329e-05, + "loss": 3.0269, + "step": 15283 + }, + { + "epoch": 0.7115953162464791, + "grad_norm": 0.34894948283879734, + "learning_rate": 9.437501065370294e-05, + "loss": 2.9734, + "step": 15284 + }, + { + "epoch": 0.7116418744325721, + "grad_norm": 0.34193772451082954, + "learning_rate": 9.43737623829963e-05, + "loss": 2.9389, + "step": 15285 + }, + { + "epoch": 0.7116884326186652, + "grad_norm": 0.34761177980577196, + "learning_rate": 9.437251398205705e-05, + "loss": 2.8669, + "step": 15286 + }, + { + "epoch": 0.7117349908047582, + "grad_norm": 0.38162306430623694, + "learning_rate": 9.437126545088883e-05, + "loss": 3.015, + "step": 15287 + }, + { + "epoch": 0.7117815489908513, + "grad_norm": 0.3979785815547465, + "learning_rate": 9.437001678949532e-05, + "loss": 3.0885, + "step": 15288 + }, + { + "epoch": 0.7118281071769443, + "grad_norm": 0.3791587541916133, + "learning_rate": 9.43687679978802e-05, + "loss": 3.0862, + "step": 15289 + }, + { + "epoch": 0.7118746653630375, + "grad_norm": 0.37915089400056495, + "learning_rate": 9.43675190760471e-05, + "loss": 3.0915, + "step": 15290 + }, + { + "epoch": 0.7119212235491306, + "grad_norm": 0.43086361105884036, + "learning_rate": 9.436627002399971e-05, + "loss": 2.9722, + "step": 15291 + }, + { + "epoch": 0.7119677817352236, + "grad_norm": 0.3688828780465189, + "learning_rate": 9.436502084174168e-05, + "loss": 3.0459, + "step": 15292 + }, + { + "epoch": 0.7120143399213167, + "grad_norm": 0.3696496053138288, + "learning_rate": 9.43637715292767e-05, + "loss": 3.0459, + "step": 15293 + }, + { + "epoch": 0.7120608981074097, + "grad_norm": 0.3617629174035875, + "learning_rate": 9.43625220866084e-05, + "loss": 2.9223, + "step": 15294 + }, + { + "epoch": 0.7121074562935028, + "grad_norm": 0.36640099341926663, + "learning_rate": 9.436127251374048e-05, + "loss": 2.9737, + "step": 15295 + }, + { + "epoch": 0.7121540144795959, + "grad_norm": 0.40711607263794913, + "learning_rate": 9.436002281067659e-05, + "loss": 3.0173, + 
"step": 15296 + }, + { + "epoch": 0.7122005726656889, + "grad_norm": 0.35615457071170437, + "learning_rate": 9.435877297742041e-05, + "loss": 3.0372, + "step": 15297 + }, + { + "epoch": 0.712247130851782, + "grad_norm": 0.37712417225904904, + "learning_rate": 9.43575230139756e-05, + "loss": 3.0241, + "step": 15298 + }, + { + "epoch": 0.712293689037875, + "grad_norm": 0.3619822652841412, + "learning_rate": 9.435627292034583e-05, + "loss": 3.0994, + "step": 15299 + }, + { + "epoch": 0.7123402472239682, + "grad_norm": 0.4067487723442324, + "learning_rate": 9.435502269653477e-05, + "loss": 2.95, + "step": 15300 + }, + { + "epoch": 0.7123868054100613, + "grad_norm": 0.38455886027539404, + "learning_rate": 9.435377234254607e-05, + "loss": 3.013, + "step": 15301 + }, + { + "epoch": 0.7124333635961543, + "grad_norm": 0.3479335140722366, + "learning_rate": 9.435252185838345e-05, + "loss": 3.0513, + "step": 15302 + }, + { + "epoch": 0.7124799217822474, + "grad_norm": 0.372649937876483, + "learning_rate": 9.435127124405053e-05, + "loss": 3.0086, + "step": 15303 + }, + { + "epoch": 0.7125264799683404, + "grad_norm": 0.3491061189068009, + "learning_rate": 9.4350020499551e-05, + "loss": 2.95, + "step": 15304 + }, + { + "epoch": 0.7125730381544335, + "grad_norm": 0.38850307122177313, + "learning_rate": 9.434876962488852e-05, + "loss": 3.0175, + "step": 15305 + }, + { + "epoch": 0.7126195963405265, + "grad_norm": 0.3968201007712841, + "learning_rate": 9.434751862006678e-05, + "loss": 3.11, + "step": 15306 + }, + { + "epoch": 0.7126661545266196, + "grad_norm": 0.35971525369422175, + "learning_rate": 9.434626748508944e-05, + "loss": 2.9801, + "step": 15307 + }, + { + "epoch": 0.7127127127127127, + "grad_norm": 0.36094361685683923, + "learning_rate": 9.434501621996017e-05, + "loss": 2.9874, + "step": 15308 + }, + { + "epoch": 0.7127592708988058, + "grad_norm": 0.3796568378745108, + "learning_rate": 9.434376482468266e-05, + "loss": 2.9707, + "step": 15309 + }, + { + "epoch": 0.7128058290848989, + "grad_norm": 0.35361020818956573, + "learning_rate": 9.434251329926056e-05, + "loss": 3.086, + "step": 15310 + }, + { + "epoch": 0.7128523872709919, + "grad_norm": 0.37348347957166145, + "learning_rate": 9.434126164369755e-05, + "loss": 3.0388, + "step": 15311 + }, + { + "epoch": 0.712898945457085, + "grad_norm": 0.3779491376648238, + "learning_rate": 9.434000985799731e-05, + "loss": 3.0154, + "step": 15312 + }, + { + "epoch": 0.7129455036431781, + "grad_norm": 0.36603728472692754, + "learning_rate": 9.433875794216349e-05, + "loss": 2.9878, + "step": 15313 + }, + { + "epoch": 0.7129920618292711, + "grad_norm": 0.36293339261407664, + "learning_rate": 9.433750589619981e-05, + "loss": 3.0162, + "step": 15314 + }, + { + "epoch": 0.7130386200153642, + "grad_norm": 0.3833958535983663, + "learning_rate": 9.43362537201099e-05, + "loss": 2.9359, + "step": 15315 + }, + { + "epoch": 0.7130851782014572, + "grad_norm": 0.37245945197051006, + "learning_rate": 9.433500141389747e-05, + "loss": 3.0807, + "step": 15316 + }, + { + "epoch": 0.7131317363875503, + "grad_norm": 0.3086600198671955, + "learning_rate": 9.433374897756616e-05, + "loss": 2.9523, + "step": 15317 + }, + { + "epoch": 0.7131782945736435, + "grad_norm": 0.35102895802921724, + "learning_rate": 9.433249641111968e-05, + "loss": 2.9195, + "step": 15318 + }, + { + "epoch": 0.7132248527597365, + "grad_norm": 0.3196852148083515, + "learning_rate": 9.433124371456168e-05, + "loss": 3.0381, + "step": 15319 + }, + { + "epoch": 0.7132714109458296, + "grad_norm": 
0.40135128073695564, + "learning_rate": 9.432999088789585e-05, + "loss": 3.0799, + "step": 15320 + }, + { + "epoch": 0.7133179691319226, + "grad_norm": 0.3902367781655557, + "learning_rate": 9.432873793112587e-05, + "loss": 3.0084, + "step": 15321 + }, + { + "epoch": 0.7133645273180157, + "grad_norm": 0.3823262581755704, + "learning_rate": 9.43274848442554e-05, + "loss": 3.09, + "step": 15322 + }, + { + "epoch": 0.7134110855041088, + "grad_norm": 0.37039759322291693, + "learning_rate": 9.432623162728813e-05, + "loss": 2.9805, + "step": 15323 + }, + { + "epoch": 0.7134576436902018, + "grad_norm": 0.353806414837469, + "learning_rate": 9.432497828022775e-05, + "loss": 3.0514, + "step": 15324 + }, + { + "epoch": 0.7135042018762949, + "grad_norm": 0.3709332349129133, + "learning_rate": 9.432372480307792e-05, + "loss": 2.9701, + "step": 15325 + }, + { + "epoch": 0.7135507600623879, + "grad_norm": 0.36395129670893894, + "learning_rate": 9.432247119584231e-05, + "loss": 3.0449, + "step": 15326 + }, + { + "epoch": 0.713597318248481, + "grad_norm": 0.39886534523744765, + "learning_rate": 9.432121745852462e-05, + "loss": 3.0383, + "step": 15327 + }, + { + "epoch": 0.713643876434574, + "grad_norm": 0.3469294737527477, + "learning_rate": 9.431996359112854e-05, + "loss": 2.9233, + "step": 15328 + }, + { + "epoch": 0.7136904346206672, + "grad_norm": 0.39654653207625307, + "learning_rate": 9.431870959365772e-05, + "loss": 3.0258, + "step": 15329 + }, + { + "epoch": 0.7137369928067603, + "grad_norm": 0.4089275452204806, + "learning_rate": 9.431745546611586e-05, + "loss": 3.1182, + "step": 15330 + }, + { + "epoch": 0.7137835509928533, + "grad_norm": 0.35450731420692133, + "learning_rate": 9.431620120850661e-05, + "loss": 3.1144, + "step": 15331 + }, + { + "epoch": 0.7138301091789464, + "grad_norm": 0.3761339477096773, + "learning_rate": 9.431494682083371e-05, + "loss": 3.0722, + "step": 15332 + }, + { + "epoch": 0.7138766673650394, + "grad_norm": 0.3954531191883039, + "learning_rate": 9.431369230310078e-05, + "loss": 3.0257, + "step": 15333 + }, + { + "epoch": 0.7139232255511325, + "grad_norm": 0.3603114718157781, + "learning_rate": 9.431243765531154e-05, + "loss": 2.9723, + "step": 15334 + }, + { + "epoch": 0.7139697837372256, + "grad_norm": 0.3875670435428484, + "learning_rate": 9.431118287746968e-05, + "loss": 2.9635, + "step": 15335 + }, + { + "epoch": 0.7140163419233186, + "grad_norm": 0.3652249415928407, + "learning_rate": 9.430992796957883e-05, + "loss": 2.9478, + "step": 15336 + }, + { + "epoch": 0.7140629001094118, + "grad_norm": 0.3947174236764558, + "learning_rate": 9.430867293164273e-05, + "loss": 3.0409, + "step": 15337 + }, + { + "epoch": 0.7141094582955048, + "grad_norm": 0.3381525662645388, + "learning_rate": 9.430741776366503e-05, + "loss": 2.9078, + "step": 15338 + }, + { + "epoch": 0.7141560164815979, + "grad_norm": 0.3721603822328242, + "learning_rate": 9.430616246564943e-05, + "loss": 3.0493, + "step": 15339 + }, + { + "epoch": 0.714202574667691, + "grad_norm": 0.3907622656169166, + "learning_rate": 9.43049070375996e-05, + "loss": 3.0147, + "step": 15340 + }, + { + "epoch": 0.714249132853784, + "grad_norm": 0.3337556081447702, + "learning_rate": 9.430365147951922e-05, + "loss": 2.882, + "step": 15341 + }, + { + "epoch": 0.7142956910398771, + "grad_norm": 0.3873825447924169, + "learning_rate": 9.430239579141201e-05, + "loss": 3.1098, + "step": 15342 + }, + { + "epoch": 0.7143422492259701, + "grad_norm": 0.3765105630518291, + "learning_rate": 9.430113997328163e-05, + "loss": 2.9915, + 
"step": 15343 + }, + { + "epoch": 0.7143888074120632, + "grad_norm": 0.31955459101082667, + "learning_rate": 9.429988402513177e-05, + "loss": 2.9792, + "step": 15344 + }, + { + "epoch": 0.7144353655981563, + "grad_norm": 0.37792316986707797, + "learning_rate": 9.429862794696611e-05, + "loss": 3.0531, + "step": 15345 + }, + { + "epoch": 0.7144819237842494, + "grad_norm": 0.32463899248696404, + "learning_rate": 9.429737173878833e-05, + "loss": 3.026, + "step": 15346 + }, + { + "epoch": 0.7145284819703425, + "grad_norm": 0.4300923097854088, + "learning_rate": 9.429611540060215e-05, + "loss": 3.0157, + "step": 15347 + }, + { + "epoch": 0.7145750401564355, + "grad_norm": 0.38036166686468603, + "learning_rate": 9.429485893241121e-05, + "loss": 2.8932, + "step": 15348 + }, + { + "epoch": 0.7146215983425286, + "grad_norm": 0.3808742148997464, + "learning_rate": 9.429360233421924e-05, + "loss": 2.8139, + "step": 15349 + }, + { + "epoch": 0.7146681565286216, + "grad_norm": 0.4095915362833176, + "learning_rate": 9.42923456060299e-05, + "loss": 3.1319, + "step": 15350 + }, + { + "epoch": 0.7147147147147147, + "grad_norm": 0.3414258457527844, + "learning_rate": 9.429108874784692e-05, + "loss": 2.9942, + "step": 15351 + }, + { + "epoch": 0.7147612729008078, + "grad_norm": 0.38014652667428356, + "learning_rate": 9.428983175967392e-05, + "loss": 2.9638, + "step": 15352 + }, + { + "epoch": 0.7148078310869008, + "grad_norm": 0.401732199544121, + "learning_rate": 9.428857464151463e-05, + "loss": 3.0082, + "step": 15353 + }, + { + "epoch": 0.7148543892729939, + "grad_norm": 0.35182646746183094, + "learning_rate": 9.428731739337276e-05, + "loss": 2.9722, + "step": 15354 + }, + { + "epoch": 0.7149009474590869, + "grad_norm": 0.3893649758166812, + "learning_rate": 9.428606001525196e-05, + "loss": 2.9723, + "step": 15355 + }, + { + "epoch": 0.7149475056451801, + "grad_norm": 0.3695912239962611, + "learning_rate": 9.428480250715595e-05, + "loss": 3.0588, + "step": 15356 + }, + { + "epoch": 0.7149940638312732, + "grad_norm": 0.4003677823625404, + "learning_rate": 9.42835448690884e-05, + "loss": 2.9265, + "step": 15357 + }, + { + "epoch": 0.7150406220173662, + "grad_norm": 0.3704745315001369, + "learning_rate": 9.4282287101053e-05, + "loss": 2.9806, + "step": 15358 + }, + { + "epoch": 0.7150871802034593, + "grad_norm": 0.3727955568497658, + "learning_rate": 9.428102920305346e-05, + "loss": 3.0343, + "step": 15359 + }, + { + "epoch": 0.7151337383895523, + "grad_norm": 0.3864141105293068, + "learning_rate": 9.427977117509345e-05, + "loss": 3.0727, + "step": 15360 + }, + { + "epoch": 0.7151802965756454, + "grad_norm": 0.39764545188485295, + "learning_rate": 9.42785130171767e-05, + "loss": 3.0856, + "step": 15361 + }, + { + "epoch": 0.7152268547617385, + "grad_norm": 0.37893802057192905, + "learning_rate": 9.427725472930685e-05, + "loss": 2.9643, + "step": 15362 + }, + { + "epoch": 0.7152734129478315, + "grad_norm": 0.3982899937065241, + "learning_rate": 9.427599631148763e-05, + "loss": 2.9925, + "step": 15363 + }, + { + "epoch": 0.7153199711339246, + "grad_norm": 0.40044890222097707, + "learning_rate": 9.427473776372273e-05, + "loss": 2.9626, + "step": 15364 + }, + { + "epoch": 0.7153665293200177, + "grad_norm": 0.3633671426960754, + "learning_rate": 9.427347908601582e-05, + "loss": 2.9087, + "step": 15365 + }, + { + "epoch": 0.7154130875061108, + "grad_norm": 0.3768659513709309, + "learning_rate": 9.427222027837061e-05, + "loss": 2.9816, + "step": 15366 + }, + { + "epoch": 0.7154596456922039, + "grad_norm": 
0.3616112939802741, + "learning_rate": 9.42709613407908e-05, + "loss": 3.0629, + "step": 15367 + }, + { + "epoch": 0.7155062038782969, + "grad_norm": 0.40051365373804043, + "learning_rate": 9.42697022732801e-05, + "loss": 2.9497, + "step": 15368 + }, + { + "epoch": 0.71555276206439, + "grad_norm": 0.3710813070186041, + "learning_rate": 9.426844307584216e-05, + "loss": 2.8757, + "step": 15369 + }, + { + "epoch": 0.715599320250483, + "grad_norm": 0.4129584671231502, + "learning_rate": 9.426718374848072e-05, + "loss": 3.0609, + "step": 15370 + }, + { + "epoch": 0.7156458784365761, + "grad_norm": 0.386391513016472, + "learning_rate": 9.426592429119944e-05, + "loss": 3.001, + "step": 15371 + }, + { + "epoch": 0.7156924366226691, + "grad_norm": 0.42498148803014396, + "learning_rate": 9.426466470400204e-05, + "loss": 3.0291, + "step": 15372 + }, + { + "epoch": 0.7157389948087622, + "grad_norm": 0.3553919880682353, + "learning_rate": 9.426340498689221e-05, + "loss": 2.9843, + "step": 15373 + }, + { + "epoch": 0.7157855529948554, + "grad_norm": 0.3888707395148137, + "learning_rate": 9.426214513987363e-05, + "loss": 2.9508, + "step": 15374 + }, + { + "epoch": 0.7158321111809484, + "grad_norm": 0.3519541772151056, + "learning_rate": 9.426088516295004e-05, + "loss": 3.0969, + "step": 15375 + }, + { + "epoch": 0.7158786693670415, + "grad_norm": 0.39742466360950196, + "learning_rate": 9.42596250561251e-05, + "loss": 2.962, + "step": 15376 + }, + { + "epoch": 0.7159252275531345, + "grad_norm": 0.38382215736957714, + "learning_rate": 9.425836481940253e-05, + "loss": 2.9503, + "step": 15377 + }, + { + "epoch": 0.7159717857392276, + "grad_norm": 0.35251254251109526, + "learning_rate": 9.425710445278602e-05, + "loss": 3.0107, + "step": 15378 + }, + { + "epoch": 0.7160183439253207, + "grad_norm": 0.3425733666077254, + "learning_rate": 9.425584395627925e-05, + "loss": 3.0497, + "step": 15379 + }, + { + "epoch": 0.7160649021114137, + "grad_norm": 0.3666007763532797, + "learning_rate": 9.425458332988595e-05, + "loss": 3.0465, + "step": 15380 + }, + { + "epoch": 0.7161114602975068, + "grad_norm": 0.3822146615745715, + "learning_rate": 9.425332257360981e-05, + "loss": 3.0167, + "step": 15381 + }, + { + "epoch": 0.7161580184835998, + "grad_norm": 0.3388538417944422, + "learning_rate": 9.425206168745453e-05, + "loss": 3.028, + "step": 15382 + }, + { + "epoch": 0.716204576669693, + "grad_norm": 0.35826633878058584, + "learning_rate": 9.42508006714238e-05, + "loss": 3.0589, + "step": 15383 + }, + { + "epoch": 0.7162511348557861, + "grad_norm": 0.3472045600261534, + "learning_rate": 9.424953952552133e-05, + "loss": 2.8737, + "step": 15384 + }, + { + "epoch": 0.7162976930418791, + "grad_norm": 0.3771500814625603, + "learning_rate": 9.424827824975082e-05, + "loss": 2.8689, + "step": 15385 + }, + { + "epoch": 0.7163442512279722, + "grad_norm": 0.3707610305315182, + "learning_rate": 9.4247016844116e-05, + "loss": 3.0261, + "step": 15386 + }, + { + "epoch": 0.7163908094140652, + "grad_norm": 0.3691854469814657, + "learning_rate": 9.42457553086205e-05, + "loss": 2.9247, + "step": 15387 + }, + { + "epoch": 0.7164373676001583, + "grad_norm": 0.3386279024394228, + "learning_rate": 9.424449364326808e-05, + "loss": 2.8495, + "step": 15388 + }, + { + "epoch": 0.7164839257862514, + "grad_norm": 0.41485417431317356, + "learning_rate": 9.424323184806245e-05, + "loss": 3.0179, + "step": 15389 + }, + { + "epoch": 0.7165304839723444, + "grad_norm": 0.3866250339272227, + "learning_rate": 9.424196992300727e-05, + "loss": 3.0786, + "step": 
15390 + }, + { + "epoch": 0.7165770421584375, + "grad_norm": 0.3538250726420529, + "learning_rate": 9.424070786810629e-05, + "loss": 2.9938, + "step": 15391 + }, + { + "epoch": 0.7166236003445305, + "grad_norm": 0.3680175225790185, + "learning_rate": 9.423944568336318e-05, + "loss": 2.9637, + "step": 15392 + }, + { + "epoch": 0.7166701585306237, + "grad_norm": 0.34956029939012323, + "learning_rate": 9.423818336878165e-05, + "loss": 3.0531, + "step": 15393 + }, + { + "epoch": 0.7167167167167167, + "grad_norm": 0.37820813771366646, + "learning_rate": 9.423692092436541e-05, + "loss": 3.0397, + "step": 15394 + }, + { + "epoch": 0.7167632749028098, + "grad_norm": 0.3775037575014348, + "learning_rate": 9.423565835011818e-05, + "loss": 3.0305, + "step": 15395 + }, + { + "epoch": 0.7168098330889029, + "grad_norm": 0.3480440577774094, + "learning_rate": 9.423439564604364e-05, + "loss": 2.9538, + "step": 15396 + }, + { + "epoch": 0.7168563912749959, + "grad_norm": 0.37994345189030265, + "learning_rate": 9.42331328121455e-05, + "loss": 2.9729, + "step": 15397 + }, + { + "epoch": 0.716902949461089, + "grad_norm": 0.3483309178523399, + "learning_rate": 9.423186984842749e-05, + "loss": 2.9282, + "step": 15398 + }, + { + "epoch": 0.716949507647182, + "grad_norm": 0.3528881514581904, + "learning_rate": 9.423060675489327e-05, + "loss": 3.0008, + "step": 15399 + }, + { + "epoch": 0.7169960658332751, + "grad_norm": 0.35892767533663095, + "learning_rate": 9.42293435315466e-05, + "loss": 3.0793, + "step": 15400 + }, + { + "epoch": 0.7170426240193682, + "grad_norm": 0.3413937777891323, + "learning_rate": 9.422808017839117e-05, + "loss": 2.9767, + "step": 15401 + }, + { + "epoch": 0.7170891822054613, + "grad_norm": 0.35321271208293087, + "learning_rate": 9.422681669543066e-05, + "loss": 3.0129, + "step": 15402 + }, + { + "epoch": 0.7171357403915544, + "grad_norm": 0.38222771335919226, + "learning_rate": 9.422555308266882e-05, + "loss": 2.9786, + "step": 15403 + }, + { + "epoch": 0.7171822985776474, + "grad_norm": 0.34595364011109836, + "learning_rate": 9.422428934010932e-05, + "loss": 2.9208, + "step": 15404 + }, + { + "epoch": 0.7172288567637405, + "grad_norm": 0.335905425654675, + "learning_rate": 9.42230254677559e-05, + "loss": 2.9534, + "step": 15405 + }, + { + "epoch": 0.7172754149498336, + "grad_norm": 0.34312823530943604, + "learning_rate": 9.422176146561226e-05, + "loss": 3.0189, + "step": 15406 + }, + { + "epoch": 0.7173219731359266, + "grad_norm": 0.3233322434486178, + "learning_rate": 9.42204973336821e-05, + "loss": 2.9747, + "step": 15407 + }, + { + "epoch": 0.7173685313220197, + "grad_norm": 0.3868343104985419, + "learning_rate": 9.421923307196914e-05, + "loss": 3.0045, + "step": 15408 + }, + { + "epoch": 0.7174150895081127, + "grad_norm": 0.3641253680705413, + "learning_rate": 9.421796868047708e-05, + "loss": 2.9583, + "step": 15409 + }, + { + "epoch": 0.7174616476942058, + "grad_norm": 0.3434176958234591, + "learning_rate": 9.421670415920964e-05, + "loss": 2.97, + "step": 15410 + }, + { + "epoch": 0.717508205880299, + "grad_norm": 0.35033214420857856, + "learning_rate": 9.421543950817053e-05, + "loss": 2.9584, + "step": 15411 + }, + { + "epoch": 0.717554764066392, + "grad_norm": 0.34775287263765686, + "learning_rate": 9.421417472736347e-05, + "loss": 3.0041, + "step": 15412 + }, + { + "epoch": 0.7176013222524851, + "grad_norm": 0.35160922378633447, + "learning_rate": 9.421290981679216e-05, + "loss": 2.9752, + "step": 15413 + }, + { + "epoch": 0.7176478804385781, + "grad_norm": 0.3489196538468292, 
+ "learning_rate": 9.421164477646031e-05, + "loss": 2.948, + "step": 15414 + }, + { + "epoch": 0.7176944386246712, + "grad_norm": 0.3691677447354943, + "learning_rate": 9.421037960637164e-05, + "loss": 3.0471, + "step": 15415 + }, + { + "epoch": 0.7177409968107642, + "grad_norm": 0.3857232311273743, + "learning_rate": 9.420911430652986e-05, + "loss": 3.0084, + "step": 15416 + }, + { + "epoch": 0.7177875549968573, + "grad_norm": 0.3497568061009151, + "learning_rate": 9.42078488769387e-05, + "loss": 3.0826, + "step": 15417 + }, + { + "epoch": 0.7178341131829504, + "grad_norm": 0.377119932659071, + "learning_rate": 9.420658331760183e-05, + "loss": 3.0934, + "step": 15418 + }, + { + "epoch": 0.7178806713690434, + "grad_norm": 0.37352533995174003, + "learning_rate": 9.420531762852302e-05, + "loss": 2.9826, + "step": 15419 + }, + { + "epoch": 0.7179272295551365, + "grad_norm": 0.39266859678063115, + "learning_rate": 9.420405180970593e-05, + "loss": 3.0078, + "step": 15420 + }, + { + "epoch": 0.7179737877412296, + "grad_norm": 0.36145083324782606, + "learning_rate": 9.420278586115433e-05, + "loss": 3.0518, + "step": 15421 + }, + { + "epoch": 0.7180203459273227, + "grad_norm": 0.40942202921800347, + "learning_rate": 9.42015197828719e-05, + "loss": 3.0613, + "step": 15422 + }, + { + "epoch": 0.7180669041134158, + "grad_norm": 0.387016366353177, + "learning_rate": 9.420025357486235e-05, + "loss": 3.0466, + "step": 15423 + }, + { + "epoch": 0.7181134622995088, + "grad_norm": 0.4039746012532298, + "learning_rate": 9.419898723712943e-05, + "loss": 3.0485, + "step": 15424 + }, + { + "epoch": 0.7181600204856019, + "grad_norm": 0.32628860392427345, + "learning_rate": 9.419772076967683e-05, + "loss": 2.961, + "step": 15425 + }, + { + "epoch": 0.7182065786716949, + "grad_norm": 0.3732291849352855, + "learning_rate": 9.419645417250827e-05, + "loss": 2.8946, + "step": 15426 + }, + { + "epoch": 0.718253136857788, + "grad_norm": 0.3164451447283478, + "learning_rate": 9.419518744562748e-05, + "loss": 2.9969, + "step": 15427 + }, + { + "epoch": 0.7182996950438811, + "grad_norm": 0.37540087059909527, + "learning_rate": 9.419392058903816e-05, + "loss": 3.059, + "step": 15428 + }, + { + "epoch": 0.7183462532299741, + "grad_norm": 0.3337230489651945, + "learning_rate": 9.419265360274403e-05, + "loss": 2.9566, + "step": 15429 + }, + { + "epoch": 0.7183928114160673, + "grad_norm": 0.3602418298240117, + "learning_rate": 9.419138648674882e-05, + "loss": 2.9795, + "step": 15430 + }, + { + "epoch": 0.7184393696021603, + "grad_norm": 0.36912760903162667, + "learning_rate": 9.419011924105626e-05, + "loss": 2.9663, + "step": 15431 + }, + { + "epoch": 0.7184859277882534, + "grad_norm": 0.32832285074561784, + "learning_rate": 9.418885186567003e-05, + "loss": 2.9467, + "step": 15432 + }, + { + "epoch": 0.7185324859743465, + "grad_norm": 0.36229881391398894, + "learning_rate": 9.418758436059389e-05, + "loss": 2.898, + "step": 15433 + }, + { + "epoch": 0.7185790441604395, + "grad_norm": 0.33807233671773296, + "learning_rate": 9.418631672583154e-05, + "loss": 3.0192, + "step": 15434 + }, + { + "epoch": 0.7186256023465326, + "grad_norm": 0.374186834069773, + "learning_rate": 9.41850489613867e-05, + "loss": 3.02, + "step": 15435 + }, + { + "epoch": 0.7186721605326256, + "grad_norm": 0.3595406579253779, + "learning_rate": 9.418378106726309e-05, + "loss": 3.0121, + "step": 15436 + }, + { + "epoch": 0.7187187187187187, + "grad_norm": 0.3207370973791877, + "learning_rate": 9.418251304346444e-05, + "loss": 2.8712, + "step": 15437 + }, + { 
+ "epoch": 0.7187652769048117, + "grad_norm": 0.3569417878120638, + "learning_rate": 9.418124488999447e-05, + "loss": 2.919, + "step": 15438 + }, + { + "epoch": 0.7188118350909048, + "grad_norm": 0.36900696525052457, + "learning_rate": 9.41799766068569e-05, + "loss": 2.9354, + "step": 15439 + }, + { + "epoch": 0.718858393276998, + "grad_norm": 0.31370376638479475, + "learning_rate": 9.417870819405546e-05, + "loss": 3.1149, + "step": 15440 + }, + { + "epoch": 0.718904951463091, + "grad_norm": 0.38558112891910173, + "learning_rate": 9.417743965159385e-05, + "loss": 3.0258, + "step": 15441 + }, + { + "epoch": 0.7189515096491841, + "grad_norm": 0.3371337527875218, + "learning_rate": 9.417617097947579e-05, + "loss": 2.9812, + "step": 15442 + }, + { + "epoch": 0.7189980678352771, + "grad_norm": 0.34687953459028487, + "learning_rate": 9.417490217770504e-05, + "loss": 3.0281, + "step": 15443 + }, + { + "epoch": 0.7190446260213702, + "grad_norm": 0.35263133899147436, + "learning_rate": 9.41736332462853e-05, + "loss": 2.8874, + "step": 15444 + }, + { + "epoch": 0.7190911842074633, + "grad_norm": 0.31977835672357907, + "learning_rate": 9.41723641852203e-05, + "loss": 2.9296, + "step": 15445 + }, + { + "epoch": 0.7191377423935563, + "grad_norm": 0.37586267990268957, + "learning_rate": 9.417109499451378e-05, + "loss": 3.0259, + "step": 15446 + }, + { + "epoch": 0.7191843005796494, + "grad_norm": 0.3378375329818426, + "learning_rate": 9.416982567416943e-05, + "loss": 3.0267, + "step": 15447 + }, + { + "epoch": 0.7192308587657424, + "grad_norm": 0.33929260832077673, + "learning_rate": 9.416855622419098e-05, + "loss": 2.9395, + "step": 15448 + }, + { + "epoch": 0.7192774169518356, + "grad_norm": 0.3146958540097535, + "learning_rate": 9.41672866445822e-05, + "loss": 2.9973, + "step": 15449 + }, + { + "epoch": 0.7193239751379287, + "grad_norm": 0.34486529300526864, + "learning_rate": 9.416601693534677e-05, + "loss": 2.9849, + "step": 15450 + }, + { + "epoch": 0.7193705333240217, + "grad_norm": 0.3487090536361838, + "learning_rate": 9.416474709648841e-05, + "loss": 3.0352, + "step": 15451 + }, + { + "epoch": 0.7194170915101148, + "grad_norm": 0.3821857309522161, + "learning_rate": 9.41634771280109e-05, + "loss": 2.9435, + "step": 15452 + }, + { + "epoch": 0.7194636496962078, + "grad_norm": 0.3574279111716412, + "learning_rate": 9.416220702991793e-05, + "loss": 2.8742, + "step": 15453 + }, + { + "epoch": 0.7195102078823009, + "grad_norm": 0.42123745015784625, + "learning_rate": 9.416093680221322e-05, + "loss": 3.1776, + "step": 15454 + }, + { + "epoch": 0.719556766068394, + "grad_norm": 0.3726315002587874, + "learning_rate": 9.415966644490052e-05, + "loss": 3.0014, + "step": 15455 + }, + { + "epoch": 0.719603324254487, + "grad_norm": 0.4002484753685148, + "learning_rate": 9.415839595798355e-05, + "loss": 2.9464, + "step": 15456 + }, + { + "epoch": 0.7196498824405801, + "grad_norm": 0.39453595518458623, + "learning_rate": 9.415712534146603e-05, + "loss": 3.0809, + "step": 15457 + }, + { + "epoch": 0.7196964406266732, + "grad_norm": 0.3520625976704922, + "learning_rate": 9.41558545953517e-05, + "loss": 3.0004, + "step": 15458 + }, + { + "epoch": 0.7197429988127663, + "grad_norm": 0.4683403664179684, + "learning_rate": 9.41545837196443e-05, + "loss": 3.0773, + "step": 15459 + }, + { + "epoch": 0.7197895569988593, + "grad_norm": 0.3816762324828889, + "learning_rate": 9.415331271434753e-05, + "loss": 2.9899, + "step": 15460 + }, + { + "epoch": 0.7198361151849524, + "grad_norm": 0.36755700104839384, + 
"learning_rate": 9.415204157946513e-05, + "loss": 2.9323, + "step": 15461 + }, + { + "epoch": 0.7198826733710455, + "grad_norm": 0.35551911908724343, + "learning_rate": 9.415077031500086e-05, + "loss": 2.9639, + "step": 15462 + }, + { + "epoch": 0.7199292315571385, + "grad_norm": 0.34031709093821705, + "learning_rate": 9.414949892095842e-05, + "loss": 3.0166, + "step": 15463 + }, + { + "epoch": 0.7199757897432316, + "grad_norm": 0.3382465184946452, + "learning_rate": 9.414822739734155e-05, + "loss": 3.019, + "step": 15464 + }, + { + "epoch": 0.7200223479293246, + "grad_norm": 0.3606276626133006, + "learning_rate": 9.414695574415397e-05, + "loss": 3.0424, + "step": 15465 + }, + { + "epoch": 0.7200689061154177, + "grad_norm": 0.3975202273333387, + "learning_rate": 9.414568396139944e-05, + "loss": 2.978, + "step": 15466 + }, + { + "epoch": 0.7201154643015109, + "grad_norm": 0.37652460475567273, + "learning_rate": 9.414441204908169e-05, + "loss": 3.0086, + "step": 15467 + }, + { + "epoch": 0.7201620224876039, + "grad_norm": 0.36360856951375947, + "learning_rate": 9.414314000720441e-05, + "loss": 2.9404, + "step": 15468 + }, + { + "epoch": 0.720208580673697, + "grad_norm": 0.3796144400275712, + "learning_rate": 9.414186783577136e-05, + "loss": 2.9867, + "step": 15469 + }, + { + "epoch": 0.72025513885979, + "grad_norm": 0.36332211958683724, + "learning_rate": 9.414059553478631e-05, + "loss": 3.0705, + "step": 15470 + }, + { + "epoch": 0.7203016970458831, + "grad_norm": 0.38258898789688184, + "learning_rate": 9.413932310425294e-05, + "loss": 3.0804, + "step": 15471 + }, + { + "epoch": 0.7203482552319762, + "grad_norm": 0.3672621277576524, + "learning_rate": 9.4138050544175e-05, + "loss": 2.9511, + "step": 15472 + }, + { + "epoch": 0.7203948134180692, + "grad_norm": 0.34821524392887615, + "learning_rate": 9.413677785455623e-05, + "loss": 3.0474, + "step": 15473 + }, + { + "epoch": 0.7204413716041623, + "grad_norm": 0.3540352945806399, + "learning_rate": 9.413550503540039e-05, + "loss": 3.0011, + "step": 15474 + }, + { + "epoch": 0.7204879297902553, + "grad_norm": 0.3406309065997213, + "learning_rate": 9.413423208671117e-05, + "loss": 2.9801, + "step": 15475 + }, + { + "epoch": 0.7205344879763484, + "grad_norm": 0.3315259925912799, + "learning_rate": 9.413295900849233e-05, + "loss": 3.0615, + "step": 15476 + }, + { + "epoch": 0.7205810461624416, + "grad_norm": 0.36184752490040795, + "learning_rate": 9.413168580074761e-05, + "loss": 3.0302, + "step": 15477 + }, + { + "epoch": 0.7206276043485346, + "grad_norm": 0.32294747312748284, + "learning_rate": 9.413041246348074e-05, + "loss": 2.9502, + "step": 15478 + }, + { + "epoch": 0.7206741625346277, + "grad_norm": 0.3615740408943263, + "learning_rate": 9.412913899669543e-05, + "loss": 2.8534, + "step": 15479 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 0.35685537952299307, + "learning_rate": 9.412786540039547e-05, + "loss": 3.0934, + "step": 15480 + }, + { + "epoch": 0.7207672789068138, + "grad_norm": 0.3538537042868465, + "learning_rate": 9.412659167458456e-05, + "loss": 3.0401, + "step": 15481 + }, + { + "epoch": 0.7208138370929068, + "grad_norm": 0.3604520578431649, + "learning_rate": 9.412531781926645e-05, + "loss": 3.0608, + "step": 15482 + }, + { + "epoch": 0.7208603952789999, + "grad_norm": 0.33805484247644263, + "learning_rate": 9.41240438344449e-05, + "loss": 2.9802, + "step": 15483 + }, + { + "epoch": 0.720906953465093, + "grad_norm": 0.4074590068691931, + "learning_rate": 9.412276972012362e-05, + "loss": 3.0445, + "step": 15484 + }, 
+ { + "epoch": 0.720953511651186, + "grad_norm": 0.34234868879408553, + "learning_rate": 9.412149547630635e-05, + "loss": 2.8603, + "step": 15485 + }, + { + "epoch": 0.7210000698372792, + "grad_norm": 0.43550542934649317, + "learning_rate": 9.412022110299683e-05, + "loss": 3.0462, + "step": 15486 + }, + { + "epoch": 0.7210466280233722, + "grad_norm": 0.4228765265909121, + "learning_rate": 9.411894660019882e-05, + "loss": 2.9735, + "step": 15487 + }, + { + "epoch": 0.7210931862094653, + "grad_norm": 0.32991078732184637, + "learning_rate": 9.411767196791605e-05, + "loss": 2.8848, + "step": 15488 + }, + { + "epoch": 0.7211397443955584, + "grad_norm": 0.4405173987762454, + "learning_rate": 9.411639720615227e-05, + "loss": 3.0019, + "step": 15489 + }, + { + "epoch": 0.7211863025816514, + "grad_norm": 0.3653048050229754, + "learning_rate": 9.411512231491117e-05, + "loss": 2.949, + "step": 15490 + }, + { + "epoch": 0.7212328607677445, + "grad_norm": 0.38196190012156306, + "learning_rate": 9.411384729419657e-05, + "loss": 3.009, + "step": 15491 + }, + { + "epoch": 0.7212794189538375, + "grad_norm": 0.37756985438258084, + "learning_rate": 9.411257214401216e-05, + "loss": 2.9316, + "step": 15492 + }, + { + "epoch": 0.7213259771399306, + "grad_norm": 0.36228992150890676, + "learning_rate": 9.41112968643617e-05, + "loss": 2.9008, + "step": 15493 + }, + { + "epoch": 0.7213725353260237, + "grad_norm": 0.3741016151845215, + "learning_rate": 9.411002145524892e-05, + "loss": 2.9986, + "step": 15494 + }, + { + "epoch": 0.7214190935121167, + "grad_norm": 0.3793004138234875, + "learning_rate": 9.410874591667757e-05, + "loss": 2.9453, + "step": 15495 + }, + { + "epoch": 0.7214656516982099, + "grad_norm": 0.38120438517079575, + "learning_rate": 9.410747024865142e-05, + "loss": 2.9202, + "step": 15496 + }, + { + "epoch": 0.7215122098843029, + "grad_norm": 0.3377025956536397, + "learning_rate": 9.410619445117418e-05, + "loss": 2.9746, + "step": 15497 + }, + { + "epoch": 0.721558768070396, + "grad_norm": 0.36542118643238475, + "learning_rate": 9.41049185242496e-05, + "loss": 3.0002, + "step": 15498 + }, + { + "epoch": 0.7216053262564891, + "grad_norm": 0.3427029870192819, + "learning_rate": 9.410364246788143e-05, + "loss": 3.0366, + "step": 15499 + }, + { + "epoch": 0.7216518844425821, + "grad_norm": 0.38194319705194946, + "learning_rate": 9.410236628207341e-05, + "loss": 3.049, + "step": 15500 + }, + { + "epoch": 0.7216984426286752, + "grad_norm": 0.3677588561210366, + "learning_rate": 9.410108996682929e-05, + "loss": 3.0614, + "step": 15501 + }, + { + "epoch": 0.7217450008147682, + "grad_norm": 0.35039657094509163, + "learning_rate": 9.409981352215282e-05, + "loss": 2.9567, + "step": 15502 + }, + { + "epoch": 0.7217915590008613, + "grad_norm": 0.3819720050265586, + "learning_rate": 9.409853694804776e-05, + "loss": 2.962, + "step": 15503 + }, + { + "epoch": 0.7218381171869543, + "grad_norm": 0.32049668692703215, + "learning_rate": 9.409726024451781e-05, + "loss": 3.0108, + "step": 15504 + }, + { + "epoch": 0.7218846753730475, + "grad_norm": 0.38506972881009666, + "learning_rate": 9.409598341156675e-05, + "loss": 3.055, + "step": 15505 + }, + { + "epoch": 0.7219312335591406, + "grad_norm": 0.3925376071819922, + "learning_rate": 9.409470644919833e-05, + "loss": 2.9985, + "step": 15506 + }, + { + "epoch": 0.7219777917452336, + "grad_norm": 0.31362356571790445, + "learning_rate": 9.40934293574163e-05, + "loss": 2.8714, + "step": 15507 + }, + { + "epoch": 0.7220243499313267, + "grad_norm": 0.4347749823738905, + 
"learning_rate": 9.409215213622437e-05, + "loss": 3.0832, + "step": 15508 + }, + { + "epoch": 0.7220709081174197, + "grad_norm": 0.33929247134272394, + "learning_rate": 9.409087478562634e-05, + "loss": 3.0303, + "step": 15509 + }, + { + "epoch": 0.7221174663035128, + "grad_norm": 0.3987499092775593, + "learning_rate": 9.408959730562593e-05, + "loss": 3.0791, + "step": 15510 + }, + { + "epoch": 0.7221640244896059, + "grad_norm": 0.3690544689637259, + "learning_rate": 9.40883196962269e-05, + "loss": 3.0233, + "step": 15511 + }, + { + "epoch": 0.7222105826756989, + "grad_norm": 0.36907976796120057, + "learning_rate": 9.408704195743298e-05, + "loss": 3.1603, + "step": 15512 + }, + { + "epoch": 0.722257140861792, + "grad_norm": 0.37867039333060426, + "learning_rate": 9.408576408924796e-05, + "loss": 3.0238, + "step": 15513 + }, + { + "epoch": 0.722303699047885, + "grad_norm": 0.3669006878929301, + "learning_rate": 9.408448609167555e-05, + "loss": 3.0295, + "step": 15514 + }, + { + "epoch": 0.7223502572339782, + "grad_norm": 0.37351716314649247, + "learning_rate": 9.408320796471953e-05, + "loss": 2.9795, + "step": 15515 + }, + { + "epoch": 0.7223968154200713, + "grad_norm": 0.3721739477329159, + "learning_rate": 9.408192970838363e-05, + "loss": 3.0199, + "step": 15516 + }, + { + "epoch": 0.7224433736061643, + "grad_norm": 0.34473464824183125, + "learning_rate": 9.408065132267159e-05, + "loss": 2.9997, + "step": 15517 + }, + { + "epoch": 0.7224899317922574, + "grad_norm": 0.4129301735401586, + "learning_rate": 9.40793728075872e-05, + "loss": 2.9822, + "step": 15518 + }, + { + "epoch": 0.7225364899783504, + "grad_norm": 0.34386355663653617, + "learning_rate": 9.407809416313419e-05, + "loss": 3.0049, + "step": 15519 + }, + { + "epoch": 0.7225830481644435, + "grad_norm": 0.4319838289759534, + "learning_rate": 9.407681538931631e-05, + "loss": 3.1003, + "step": 15520 + }, + { + "epoch": 0.7226296063505366, + "grad_norm": 0.32436446436108335, + "learning_rate": 9.407553648613733e-05, + "loss": 2.9243, + "step": 15521 + }, + { + "epoch": 0.7226761645366296, + "grad_norm": 0.43340701352513794, + "learning_rate": 9.4074257453601e-05, + "loss": 3.0655, + "step": 15522 + }, + { + "epoch": 0.7227227227227228, + "grad_norm": 0.34067730510390737, + "learning_rate": 9.407297829171104e-05, + "loss": 2.9855, + "step": 15523 + }, + { + "epoch": 0.7227692809088158, + "grad_norm": 0.34079307634801964, + "learning_rate": 9.407169900047125e-05, + "loss": 2.9317, + "step": 15524 + }, + { + "epoch": 0.7228158390949089, + "grad_norm": 0.36815118150351556, + "learning_rate": 9.407041957988536e-05, + "loss": 3.0178, + "step": 15525 + }, + { + "epoch": 0.7228623972810019, + "grad_norm": 0.34074594446638845, + "learning_rate": 9.406914002995712e-05, + "loss": 2.8658, + "step": 15526 + }, + { + "epoch": 0.722908955467095, + "grad_norm": 0.34570899525985394, + "learning_rate": 9.40678603506903e-05, + "loss": 3.0548, + "step": 15527 + }, + { + "epoch": 0.7229555136531881, + "grad_norm": 0.37966051200707956, + "learning_rate": 9.406658054208866e-05, + "loss": 2.9993, + "step": 15528 + }, + { + "epoch": 0.7230020718392811, + "grad_norm": 0.34397131586038093, + "learning_rate": 9.406530060415595e-05, + "loss": 3.0327, + "step": 15529 + }, + { + "epoch": 0.7230486300253742, + "grad_norm": 0.341555612088432, + "learning_rate": 9.406402053689591e-05, + "loss": 3.0424, + "step": 15530 + }, + { + "epoch": 0.7230951882114672, + "grad_norm": 0.3808171837031954, + "learning_rate": 9.406274034031232e-05, + "loss": 3.0997, + "step": 15531 + 
}, + { + "epoch": 0.7231417463975603, + "grad_norm": 0.3440726144078883, + "learning_rate": 9.406146001440894e-05, + "loss": 3.069, + "step": 15532 + }, + { + "epoch": 0.7231883045836535, + "grad_norm": 0.3374332561378709, + "learning_rate": 9.40601795591895e-05, + "loss": 2.8941, + "step": 15533 + }, + { + "epoch": 0.7232348627697465, + "grad_norm": 0.34843575905003693, + "learning_rate": 9.405889897465779e-05, + "loss": 3.0882, + "step": 15534 + }, + { + "epoch": 0.7232814209558396, + "grad_norm": 0.37253247405123546, + "learning_rate": 9.405761826081751e-05, + "loss": 3.1069, + "step": 15535 + }, + { + "epoch": 0.7233279791419326, + "grad_norm": 0.3342802611048811, + "learning_rate": 9.40563374176725e-05, + "loss": 3.0116, + "step": 15536 + }, + { + "epoch": 0.7233745373280257, + "grad_norm": 0.38044710646074037, + "learning_rate": 9.405505644522647e-05, + "loss": 2.8433, + "step": 15537 + }, + { + "epoch": 0.7234210955141188, + "grad_norm": 0.37827169053916465, + "learning_rate": 9.405377534348319e-05, + "loss": 2.989, + "step": 15538 + }, + { + "epoch": 0.7234676537002118, + "grad_norm": 0.344946470262661, + "learning_rate": 9.405249411244641e-05, + "loss": 3.0378, + "step": 15539 + }, + { + "epoch": 0.7235142118863049, + "grad_norm": 0.41270408414363263, + "learning_rate": 9.405121275211991e-05, + "loss": 3.0502, + "step": 15540 + }, + { + "epoch": 0.7235607700723979, + "grad_norm": 0.34512602586725494, + "learning_rate": 9.404993126250744e-05, + "loss": 3.1163, + "step": 15541 + }, + { + "epoch": 0.7236073282584911, + "grad_norm": 0.40039874241944096, + "learning_rate": 9.404864964361277e-05, + "loss": 2.9177, + "step": 15542 + }, + { + "epoch": 0.7236538864445842, + "grad_norm": 0.3647452633154412, + "learning_rate": 9.404736789543965e-05, + "loss": 2.9375, + "step": 15543 + }, + { + "epoch": 0.7237004446306772, + "grad_norm": 0.3903873287264345, + "learning_rate": 9.404608601799182e-05, + "loss": 2.9995, + "step": 15544 + }, + { + "epoch": 0.7237470028167703, + "grad_norm": 0.37238781379553176, + "learning_rate": 9.404480401127308e-05, + "loss": 2.9853, + "step": 15545 + }, + { + "epoch": 0.7237935610028633, + "grad_norm": 0.3638057102107115, + "learning_rate": 9.404352187528719e-05, + "loss": 3.0481, + "step": 15546 + }, + { + "epoch": 0.7238401191889564, + "grad_norm": 0.3807010480697613, + "learning_rate": 9.404223961003788e-05, + "loss": 3.067, + "step": 15547 + }, + { + "epoch": 0.7238866773750494, + "grad_norm": 0.3557444097687286, + "learning_rate": 9.404095721552897e-05, + "loss": 3.0533, + "step": 15548 + }, + { + "epoch": 0.7239332355611425, + "grad_norm": 0.3323640341861759, + "learning_rate": 9.403967469176415e-05, + "loss": 2.9345, + "step": 15549 + }, + { + "epoch": 0.7239797937472356, + "grad_norm": 0.34577474775503536, + "learning_rate": 9.403839203874726e-05, + "loss": 2.9515, + "step": 15550 + }, + { + "epoch": 0.7240263519333286, + "grad_norm": 0.33524906744625294, + "learning_rate": 9.403710925648202e-05, + "loss": 2.9764, + "step": 15551 + }, + { + "epoch": 0.7240729101194218, + "grad_norm": 0.34351067371095334, + "learning_rate": 9.403582634497219e-05, + "loss": 2.985, + "step": 15552 + }, + { + "epoch": 0.7241194683055148, + "grad_norm": 0.32601415839251013, + "learning_rate": 9.403454330422155e-05, + "loss": 2.9765, + "step": 15553 + }, + { + "epoch": 0.7241660264916079, + "grad_norm": 0.33198766161169957, + "learning_rate": 9.403326013423385e-05, + "loss": 2.9856, + "step": 15554 + }, + { + "epoch": 0.724212584677701, + "grad_norm": 0.3353065715522379, + 
"learning_rate": 9.40319768350129e-05, + "loss": 3.0241, + "step": 15555 + }, + { + "epoch": 0.724259142863794, + "grad_norm": 0.3611271485665389, + "learning_rate": 9.403069340656241e-05, + "loss": 3.0704, + "step": 15556 + }, + { + "epoch": 0.7243057010498871, + "grad_norm": 0.352931807120751, + "learning_rate": 9.402940984888618e-05, + "loss": 2.9764, + "step": 15557 + }, + { + "epoch": 0.7243522592359801, + "grad_norm": 0.3348650691735931, + "learning_rate": 9.402812616198798e-05, + "loss": 2.9324, + "step": 15558 + }, + { + "epoch": 0.7243988174220732, + "grad_norm": 0.36644052894683526, + "learning_rate": 9.402684234587156e-05, + "loss": 3.0336, + "step": 15559 + }, + { + "epoch": 0.7244453756081664, + "grad_norm": 0.44383451790872397, + "learning_rate": 9.40255584005407e-05, + "loss": 2.9842, + "step": 15560 + }, + { + "epoch": 0.7244919337942594, + "grad_norm": 0.3546805404448196, + "learning_rate": 9.402427432599917e-05, + "loss": 3.0176, + "step": 15561 + }, + { + "epoch": 0.7245384919803525, + "grad_norm": 0.3334923119199164, + "learning_rate": 9.402299012225071e-05, + "loss": 3.0576, + "step": 15562 + }, + { + "epoch": 0.7245850501664455, + "grad_norm": 0.34691837874325354, + "learning_rate": 9.402170578929912e-05, + "loss": 2.9447, + "step": 15563 + }, + { + "epoch": 0.7246316083525386, + "grad_norm": 0.3202826940821219, + "learning_rate": 9.402042132714816e-05, + "loss": 3.0745, + "step": 15564 + }, + { + "epoch": 0.7246781665386317, + "grad_norm": 0.4790200560241719, + "learning_rate": 9.40191367358016e-05, + "loss": 2.8763, + "step": 15565 + }, + { + "epoch": 0.7247247247247247, + "grad_norm": 0.3481162239535444, + "learning_rate": 9.401785201526321e-05, + "loss": 3.0519, + "step": 15566 + }, + { + "epoch": 0.7247712829108178, + "grad_norm": 0.36814534404496657, + "learning_rate": 9.401656716553677e-05, + "loss": 3.0133, + "step": 15567 + }, + { + "epoch": 0.7248178410969108, + "grad_norm": 0.3722497607851517, + "learning_rate": 9.401528218662604e-05, + "loss": 3.1003, + "step": 15568 + }, + { + "epoch": 0.724864399283004, + "grad_norm": 0.3512498081977043, + "learning_rate": 9.401399707853478e-05, + "loss": 3.1581, + "step": 15569 + }, + { + "epoch": 0.724910957469097, + "grad_norm": 0.336684548040607, + "learning_rate": 9.401271184126679e-05, + "loss": 2.9885, + "step": 15570 + }, + { + "epoch": 0.7249575156551901, + "grad_norm": 0.3969162255364958, + "learning_rate": 9.401142647482581e-05, + "loss": 2.9789, + "step": 15571 + }, + { + "epoch": 0.7250040738412832, + "grad_norm": 0.35032989200293047, + "learning_rate": 9.401014097921564e-05, + "loss": 3.0865, + "step": 15572 + }, + { + "epoch": 0.7250506320273762, + "grad_norm": 0.3566446081795631, + "learning_rate": 9.400885535444004e-05, + "loss": 2.9455, + "step": 15573 + }, + { + "epoch": 0.7250971902134693, + "grad_norm": 0.3986669462169757, + "learning_rate": 9.400756960050279e-05, + "loss": 2.9635, + "step": 15574 + }, + { + "epoch": 0.7251437483995623, + "grad_norm": 0.34310314685811777, + "learning_rate": 9.400628371740765e-05, + "loss": 3.1272, + "step": 15575 + }, + { + "epoch": 0.7251903065856554, + "grad_norm": 0.34727475915461203, + "learning_rate": 9.40049977051584e-05, + "loss": 2.9742, + "step": 15576 + }, + { + "epoch": 0.7252368647717485, + "grad_norm": 0.3312971299138665, + "learning_rate": 9.400371156375883e-05, + "loss": 2.897, + "step": 15577 + }, + { + "epoch": 0.7252834229578415, + "grad_norm": 0.3249859879120546, + "learning_rate": 9.40024252932127e-05, + "loss": 2.9892, + "step": 15578 + }, + { + 
"epoch": 0.7253299811439347, + "grad_norm": 0.3421776839136198, + "learning_rate": 9.400113889352378e-05, + "loss": 2.9384, + "step": 15579 + }, + { + "epoch": 0.7253765393300277, + "grad_norm": 0.37183044480094174, + "learning_rate": 9.399985236469587e-05, + "loss": 2.9618, + "step": 15580 + }, + { + "epoch": 0.7254230975161208, + "grad_norm": 0.37827975354004406, + "learning_rate": 9.399856570673268e-05, + "loss": 2.9555, + "step": 15581 + }, + { + "epoch": 0.7254696557022139, + "grad_norm": 0.4454154368106629, + "learning_rate": 9.399727891963808e-05, + "loss": 3.1468, + "step": 15582 + }, + { + "epoch": 0.7255162138883069, + "grad_norm": 0.3964803272652923, + "learning_rate": 9.399599200341578e-05, + "loss": 3.0188, + "step": 15583 + }, + { + "epoch": 0.7255627720744, + "grad_norm": 0.3576316435961992, + "learning_rate": 9.399470495806959e-05, + "loss": 2.9402, + "step": 15584 + }, + { + "epoch": 0.725609330260493, + "grad_norm": 0.38575271026756025, + "learning_rate": 9.399341778360328e-05, + "loss": 2.9625, + "step": 15585 + }, + { + "epoch": 0.7256558884465861, + "grad_norm": 0.3613348515422981, + "learning_rate": 9.399213048002062e-05, + "loss": 3.0274, + "step": 15586 + }, + { + "epoch": 0.7257024466326792, + "grad_norm": 0.38170970686081024, + "learning_rate": 9.399084304732538e-05, + "loss": 2.9104, + "step": 15587 + }, + { + "epoch": 0.7257490048187722, + "grad_norm": 0.35282285637113736, + "learning_rate": 9.398955548552135e-05, + "loss": 2.9892, + "step": 15588 + }, + { + "epoch": 0.7257955630048654, + "grad_norm": 0.38539288107770153, + "learning_rate": 9.398826779461231e-05, + "loss": 2.9982, + "step": 15589 + }, + { + "epoch": 0.7258421211909584, + "grad_norm": 0.3559633416554329, + "learning_rate": 9.398697997460204e-05, + "loss": 3.0227, + "step": 15590 + }, + { + "epoch": 0.7258886793770515, + "grad_norm": 0.3466150026063757, + "learning_rate": 9.398569202549432e-05, + "loss": 3.0106, + "step": 15591 + }, + { + "epoch": 0.7259352375631445, + "grad_norm": 0.3165191137938555, + "learning_rate": 9.398440394729292e-05, + "loss": 2.8558, + "step": 15592 + }, + { + "epoch": 0.7259817957492376, + "grad_norm": 0.35513962353382367, + "learning_rate": 9.398311574000163e-05, + "loss": 2.9959, + "step": 15593 + }, + { + "epoch": 0.7260283539353307, + "grad_norm": 0.3557075172192958, + "learning_rate": 9.398182740362423e-05, + "loss": 3.0305, + "step": 15594 + }, + { + "epoch": 0.7260749121214237, + "grad_norm": 0.35686832513851363, + "learning_rate": 9.39805389381645e-05, + "loss": 2.9286, + "step": 15595 + }, + { + "epoch": 0.7261214703075168, + "grad_norm": 0.34453487009563943, + "learning_rate": 9.397925034362623e-05, + "loss": 2.8882, + "step": 15596 + }, + { + "epoch": 0.7261680284936098, + "grad_norm": 0.36882737775842567, + "learning_rate": 9.397796162001318e-05, + "loss": 2.8776, + "step": 15597 + }, + { + "epoch": 0.726214586679703, + "grad_norm": 0.3455693055241102, + "learning_rate": 9.397667276732916e-05, + "loss": 3.0116, + "step": 15598 + }, + { + "epoch": 0.7262611448657961, + "grad_norm": 0.3710044687273633, + "learning_rate": 9.397538378557791e-05, + "loss": 2.9636, + "step": 15599 + }, + { + "epoch": 0.7263077030518891, + "grad_norm": 0.3830187072308096, + "learning_rate": 9.397409467476326e-05, + "loss": 2.9446, + "step": 15600 + }, + { + "epoch": 0.7263542612379822, + "grad_norm": 0.3551998042392904, + "learning_rate": 9.397280543488898e-05, + "loss": 3.0171, + "step": 15601 + }, + { + "epoch": 0.7264008194240752, + "grad_norm": 0.3756480023075114, + 
"learning_rate": 9.397151606595885e-05, + "loss": 2.9539, + "step": 15602 + }, + { + "epoch": 0.7264473776101683, + "grad_norm": 0.35924320578122526, + "learning_rate": 9.397022656797663e-05, + "loss": 3.2021, + "step": 15603 + }, + { + "epoch": 0.7264939357962614, + "grad_norm": 0.3656348316161747, + "learning_rate": 9.396893694094613e-05, + "loss": 2.8072, + "step": 15604 + }, + { + "epoch": 0.7265404939823544, + "grad_norm": 0.3751383472373483, + "learning_rate": 9.396764718487114e-05, + "loss": 3.0525, + "step": 15605 + }, + { + "epoch": 0.7265870521684475, + "grad_norm": 0.37027886127318244, + "learning_rate": 9.396635729975544e-05, + "loss": 2.9764, + "step": 15606 + }, + { + "epoch": 0.7266336103545405, + "grad_norm": 0.3655152184322606, + "learning_rate": 9.396506728560282e-05, + "loss": 3.0003, + "step": 15607 + }, + { + "epoch": 0.7266801685406337, + "grad_norm": 0.4179769134546289, + "learning_rate": 9.396377714241704e-05, + "loss": 2.958, + "step": 15608 + }, + { + "epoch": 0.7267267267267268, + "grad_norm": 0.3944664855695164, + "learning_rate": 9.396248687020192e-05, + "loss": 2.9861, + "step": 15609 + }, + { + "epoch": 0.7267732849128198, + "grad_norm": 0.35987951802740864, + "learning_rate": 9.396119646896122e-05, + "loss": 3.0004, + "step": 15610 + }, + { + "epoch": 0.7268198430989129, + "grad_norm": 0.40652058892409876, + "learning_rate": 9.395990593869875e-05, + "loss": 2.9639, + "step": 15611 + }, + { + "epoch": 0.7268664012850059, + "grad_norm": 0.3905000516039632, + "learning_rate": 9.395861527941829e-05, + "loss": 2.9487, + "step": 15612 + }, + { + "epoch": 0.726912959471099, + "grad_norm": 0.37826914941258893, + "learning_rate": 9.395732449112361e-05, + "loss": 3.0208, + "step": 15613 + }, + { + "epoch": 0.726959517657192, + "grad_norm": 0.4031568142815815, + "learning_rate": 9.395603357381852e-05, + "loss": 2.9703, + "step": 15614 + }, + { + "epoch": 0.7270060758432851, + "grad_norm": 0.3435672931448075, + "learning_rate": 9.39547425275068e-05, + "loss": 2.9658, + "step": 15615 + }, + { + "epoch": 0.7270526340293783, + "grad_norm": 0.40061209336488135, + "learning_rate": 9.395345135219225e-05, + "loss": 2.9234, + "step": 15616 + }, + { + "epoch": 0.7270991922154713, + "grad_norm": 0.3695128781795212, + "learning_rate": 9.395216004787863e-05, + "loss": 3.0573, + "step": 15617 + }, + { + "epoch": 0.7271457504015644, + "grad_norm": 0.408504170821194, + "learning_rate": 9.395086861456978e-05, + "loss": 2.9938, + "step": 15618 + }, + { + "epoch": 0.7271923085876574, + "grad_norm": 0.3866033941871188, + "learning_rate": 9.394957705226943e-05, + "loss": 3.0351, + "step": 15619 + }, + { + "epoch": 0.7272388667737505, + "grad_norm": 0.39688162847001285, + "learning_rate": 9.394828536098143e-05, + "loss": 3.0189, + "step": 15620 + }, + { + "epoch": 0.7272854249598436, + "grad_norm": 0.3954334512390421, + "learning_rate": 9.394699354070953e-05, + "loss": 2.9692, + "step": 15621 + }, + { + "epoch": 0.7273319831459366, + "grad_norm": 0.3766068734469335, + "learning_rate": 9.394570159145752e-05, + "loss": 2.9254, + "step": 15622 + }, + { + "epoch": 0.7273785413320297, + "grad_norm": 0.4911685991913503, + "learning_rate": 9.394440951322923e-05, + "loss": 3.0636, + "step": 15623 + }, + { + "epoch": 0.7274250995181227, + "grad_norm": 0.3737262782259263, + "learning_rate": 9.394311730602841e-05, + "loss": 2.9349, + "step": 15624 + }, + { + "epoch": 0.7274716577042158, + "grad_norm": 0.41620418356764094, + "learning_rate": 9.394182496985887e-05, + "loss": 3.072, + "step": 15625 + }, 
+ { + "epoch": 0.727518215890309, + "grad_norm": 0.4303147792769396, + "learning_rate": 9.394053250472441e-05, + "loss": 2.9537, + "step": 15626 + }, + { + "epoch": 0.727564774076402, + "grad_norm": 0.3402657174189634, + "learning_rate": 9.393923991062881e-05, + "loss": 2.9501, + "step": 15627 + }, + { + "epoch": 0.7276113322624951, + "grad_norm": 0.3799162198833901, + "learning_rate": 9.393794718757587e-05, + "loss": 3.0081, + "step": 15628 + }, + { + "epoch": 0.7276578904485881, + "grad_norm": 0.35191746501817556, + "learning_rate": 9.393665433556939e-05, + "loss": 3.0882, + "step": 15629 + }, + { + "epoch": 0.7277044486346812, + "grad_norm": 0.3560354734261458, + "learning_rate": 9.393536135461314e-05, + "loss": 3.0278, + "step": 15630 + }, + { + "epoch": 0.7277510068207743, + "grad_norm": 0.3082468280994538, + "learning_rate": 9.393406824471096e-05, + "loss": 2.8641, + "step": 15631 + }, + { + "epoch": 0.7277975650068673, + "grad_norm": 0.37338986814549247, + "learning_rate": 9.393277500586659e-05, + "loss": 2.8884, + "step": 15632 + }, + { + "epoch": 0.7278441231929604, + "grad_norm": 0.36057865619306173, + "learning_rate": 9.393148163808387e-05, + "loss": 2.8231, + "step": 15633 + }, + { + "epoch": 0.7278906813790534, + "grad_norm": 0.3384782713676753, + "learning_rate": 9.393018814136658e-05, + "loss": 3.0752, + "step": 15634 + }, + { + "epoch": 0.7279372395651466, + "grad_norm": 0.37203671602757177, + "learning_rate": 9.392889451571851e-05, + "loss": 2.9401, + "step": 15635 + }, + { + "epoch": 0.7279837977512396, + "grad_norm": 0.34580375026658083, + "learning_rate": 9.392760076114347e-05, + "loss": 3.1231, + "step": 15636 + }, + { + "epoch": 0.7280303559373327, + "grad_norm": 0.3783472867613622, + "learning_rate": 9.392630687764524e-05, + "loss": 2.9369, + "step": 15637 + }, + { + "epoch": 0.7280769141234258, + "grad_norm": 0.32909506123454463, + "learning_rate": 9.392501286522763e-05, + "loss": 3.0139, + "step": 15638 + }, + { + "epoch": 0.7281234723095188, + "grad_norm": 0.3441031368358182, + "learning_rate": 9.392371872389443e-05, + "loss": 2.9391, + "step": 15639 + }, + { + "epoch": 0.7281700304956119, + "grad_norm": 0.33478174162542984, + "learning_rate": 9.392242445364946e-05, + "loss": 3.0568, + "step": 15640 + }, + { + "epoch": 0.7282165886817049, + "grad_norm": 0.33839501366874053, + "learning_rate": 9.392113005449648e-05, + "loss": 2.9461, + "step": 15641 + }, + { + "epoch": 0.728263146867798, + "grad_norm": 0.33778733683346723, + "learning_rate": 9.391983552643933e-05, + "loss": 3.0308, + "step": 15642 + }, + { + "epoch": 0.7283097050538911, + "grad_norm": 0.3225574183734123, + "learning_rate": 9.391854086948177e-05, + "loss": 2.8883, + "step": 15643 + }, + { + "epoch": 0.7283562632399841, + "grad_norm": 0.3330339413075406, + "learning_rate": 9.391724608362764e-05, + "loss": 2.9048, + "step": 15644 + }, + { + "epoch": 0.7284028214260773, + "grad_norm": 0.33181180069302935, + "learning_rate": 9.39159511688807e-05, + "loss": 3.0257, + "step": 15645 + }, + { + "epoch": 0.7284493796121703, + "grad_norm": 0.34254021929587064, + "learning_rate": 9.391465612524479e-05, + "loss": 3.085, + "step": 15646 + }, + { + "epoch": 0.7284959377982634, + "grad_norm": 0.3430858162909103, + "learning_rate": 9.391336095272368e-05, + "loss": 3.0167, + "step": 15647 + }, + { + "epoch": 0.7285424959843565, + "grad_norm": 0.33766205355311607, + "learning_rate": 9.391206565132116e-05, + "loss": 2.9392, + "step": 15648 + }, + { + "epoch": 0.7285890541704495, + "grad_norm": 0.35761640141507267, + 
"learning_rate": 9.39107702210411e-05, + "loss": 3.0671, + "step": 15649 + }, + { + "epoch": 0.7286356123565426, + "grad_norm": 0.3646283073225399, + "learning_rate": 9.390947466188723e-05, + "loss": 3.0336, + "step": 15650 + }, + { + "epoch": 0.7286821705426356, + "grad_norm": 0.35712998795375483, + "learning_rate": 9.390817897386335e-05, + "loss": 3.0282, + "step": 15651 + }, + { + "epoch": 0.7287287287287287, + "grad_norm": 0.3525951011814992, + "learning_rate": 9.390688315697333e-05, + "loss": 2.9365, + "step": 15652 + }, + { + "epoch": 0.7287752869148219, + "grad_norm": 0.36032713081961404, + "learning_rate": 9.390558721122091e-05, + "loss": 2.9069, + "step": 15653 + }, + { + "epoch": 0.7288218451009149, + "grad_norm": 0.3540909228176654, + "learning_rate": 9.390429113660994e-05, + "loss": 3.0632, + "step": 15654 + }, + { + "epoch": 0.728868403287008, + "grad_norm": 0.34905647606397733, + "learning_rate": 9.390299493314418e-05, + "loss": 2.9391, + "step": 15655 + }, + { + "epoch": 0.728914961473101, + "grad_norm": 0.3162206646214758, + "learning_rate": 9.390169860082748e-05, + "loss": 2.8989, + "step": 15656 + }, + { + "epoch": 0.7289615196591941, + "grad_norm": 0.30240088931508924, + "learning_rate": 9.390040213966359e-05, + "loss": 2.9945, + "step": 15657 + }, + { + "epoch": 0.7290080778452871, + "grad_norm": 0.33246110324916356, + "learning_rate": 9.389910554965635e-05, + "loss": 3.0417, + "step": 15658 + }, + { + "epoch": 0.7290546360313802, + "grad_norm": 0.32989617621756934, + "learning_rate": 9.389780883080958e-05, + "loss": 3.0837, + "step": 15659 + }, + { + "epoch": 0.7291011942174733, + "grad_norm": 0.3449491477770119, + "learning_rate": 9.389651198312705e-05, + "loss": 3.0753, + "step": 15660 + }, + { + "epoch": 0.7291477524035663, + "grad_norm": 0.3688800414086186, + "learning_rate": 9.389521500661258e-05, + "loss": 3.129, + "step": 15661 + }, + { + "epoch": 0.7291943105896594, + "grad_norm": 0.34280612570500985, + "learning_rate": 9.389391790126998e-05, + "loss": 2.998, + "step": 15662 + }, + { + "epoch": 0.7292408687757524, + "grad_norm": 0.3163626565209925, + "learning_rate": 9.389262066710307e-05, + "loss": 2.9888, + "step": 15663 + }, + { + "epoch": 0.7292874269618456, + "grad_norm": 0.3535443144740966, + "learning_rate": 9.389132330411563e-05, + "loss": 2.9357, + "step": 15664 + }, + { + "epoch": 0.7293339851479387, + "grad_norm": 0.3497250276325728, + "learning_rate": 9.389002581231147e-05, + "loss": 2.9685, + "step": 15665 + }, + { + "epoch": 0.7293805433340317, + "grad_norm": 0.38536283174826663, + "learning_rate": 9.388872819169444e-05, + "loss": 2.9686, + "step": 15666 + }, + { + "epoch": 0.7294271015201248, + "grad_norm": 0.3570121310476005, + "learning_rate": 9.388743044226829e-05, + "loss": 2.9776, + "step": 15667 + }, + { + "epoch": 0.7294736597062178, + "grad_norm": 0.3873231270191964, + "learning_rate": 9.388613256403686e-05, + "loss": 2.9494, + "step": 15668 + }, + { + "epoch": 0.7295202178923109, + "grad_norm": 0.41944672780292747, + "learning_rate": 9.388483455700395e-05, + "loss": 2.9993, + "step": 15669 + }, + { + "epoch": 0.729566776078404, + "grad_norm": 0.3768524718339173, + "learning_rate": 9.388353642117339e-05, + "loss": 2.9944, + "step": 15670 + }, + { + "epoch": 0.729613334264497, + "grad_norm": 0.3955587060773792, + "learning_rate": 9.388223815654895e-05, + "loss": 3.0293, + "step": 15671 + }, + { + "epoch": 0.7296598924505902, + "grad_norm": 0.3724098883605177, + "learning_rate": 9.388093976313447e-05, + "loss": 2.9441, + "step": 15672 + }, 
+ { + "epoch": 0.7297064506366832, + "grad_norm": 0.3634424834037954, + "learning_rate": 9.387964124093377e-05, + "loss": 2.9506, + "step": 15673 + }, + { + "epoch": 0.7297530088227763, + "grad_norm": 0.3646450067581526, + "learning_rate": 9.387834258995064e-05, + "loss": 3.037, + "step": 15674 + }, + { + "epoch": 0.7297995670088694, + "grad_norm": 0.3832997655673976, + "learning_rate": 9.38770438101889e-05, + "loss": 2.9982, + "step": 15675 + }, + { + "epoch": 0.7298461251949624, + "grad_norm": 0.3377565419262259, + "learning_rate": 9.387574490165236e-05, + "loss": 2.9846, + "step": 15676 + }, + { + "epoch": 0.7298926833810555, + "grad_norm": 0.33744968710309226, + "learning_rate": 9.387444586434482e-05, + "loss": 2.9685, + "step": 15677 + }, + { + "epoch": 0.7299392415671485, + "grad_norm": 0.3672812787315489, + "learning_rate": 9.38731466982701e-05, + "loss": 2.902, + "step": 15678 + }, + { + "epoch": 0.7299857997532416, + "grad_norm": 0.3701275388372472, + "learning_rate": 9.387184740343202e-05, + "loss": 3.0389, + "step": 15679 + }, + { + "epoch": 0.7300323579393346, + "grad_norm": 0.32668904304522156, + "learning_rate": 9.387054797983441e-05, + "loss": 3.0256, + "step": 15680 + }, + { + "epoch": 0.7300789161254277, + "grad_norm": 0.3798766688462739, + "learning_rate": 9.386924842748105e-05, + "loss": 3.0583, + "step": 15681 + }, + { + "epoch": 0.7301254743115209, + "grad_norm": 0.3636452851425697, + "learning_rate": 9.386794874637576e-05, + "loss": 2.9263, + "step": 15682 + }, + { + "epoch": 0.7301720324976139, + "grad_norm": 0.34863287246637115, + "learning_rate": 9.386664893652237e-05, + "loss": 2.9212, + "step": 15683 + }, + { + "epoch": 0.730218590683707, + "grad_norm": 0.41463162890824257, + "learning_rate": 9.38653489979247e-05, + "loss": 3.0196, + "step": 15684 + }, + { + "epoch": 0.7302651488698, + "grad_norm": 0.3626566360018074, + "learning_rate": 9.386404893058653e-05, + "loss": 2.9495, + "step": 15685 + }, + { + "epoch": 0.7303117070558931, + "grad_norm": 0.3839223462063567, + "learning_rate": 9.38627487345117e-05, + "loss": 2.9964, + "step": 15686 + }, + { + "epoch": 0.7303582652419862, + "grad_norm": 0.4266146683792024, + "learning_rate": 9.386144840970404e-05, + "loss": 3.0589, + "step": 15687 + }, + { + "epoch": 0.7304048234280792, + "grad_norm": 0.3167465330506787, + "learning_rate": 9.386014795616734e-05, + "loss": 3.0606, + "step": 15688 + }, + { + "epoch": 0.7304513816141723, + "grad_norm": 0.40178207498050483, + "learning_rate": 9.385884737390543e-05, + "loss": 3.0341, + "step": 15689 + }, + { + "epoch": 0.7304979398002653, + "grad_norm": 0.3641721002525432, + "learning_rate": 9.385754666292212e-05, + "loss": 2.9865, + "step": 15690 + }, + { + "epoch": 0.7305444979863585, + "grad_norm": 0.386061676545463, + "learning_rate": 9.385624582322123e-05, + "loss": 2.9807, + "step": 15691 + }, + { + "epoch": 0.7305910561724516, + "grad_norm": 0.40380782354131833, + "learning_rate": 9.385494485480656e-05, + "loss": 2.9112, + "step": 15692 + }, + { + "epoch": 0.7306376143585446, + "grad_norm": 0.40467048808390055, + "learning_rate": 9.385364375768198e-05, + "loss": 2.9278, + "step": 15693 + }, + { + "epoch": 0.7306841725446377, + "grad_norm": 0.33372287726135486, + "learning_rate": 9.385234253185127e-05, + "loss": 3.0359, + "step": 15694 + }, + { + "epoch": 0.7307307307307307, + "grad_norm": 0.3921696016331426, + "learning_rate": 9.385104117731825e-05, + "loss": 2.9513, + "step": 15695 + }, + { + "epoch": 0.7307772889168238, + "grad_norm": 0.34294983691234737, + 
"learning_rate": 9.384973969408673e-05, + "loss": 2.9399, + "step": 15696 + }, + { + "epoch": 0.7308238471029169, + "grad_norm": 0.36074402120969257, + "learning_rate": 9.384843808216057e-05, + "loss": 2.9897, + "step": 15697 + }, + { + "epoch": 0.7308704052890099, + "grad_norm": 0.3344726988856132, + "learning_rate": 9.384713634154354e-05, + "loss": 2.7931, + "step": 15698 + }, + { + "epoch": 0.730916963475103, + "grad_norm": 0.3551760476552065, + "learning_rate": 9.38458344722395e-05, + "loss": 3.0648, + "step": 15699 + }, + { + "epoch": 0.730963521661196, + "grad_norm": 0.36784498914046426, + "learning_rate": 9.384453247425225e-05, + "loss": 2.9131, + "step": 15700 + }, + { + "epoch": 0.7310100798472892, + "grad_norm": 0.38648861424704695, + "learning_rate": 9.384323034758563e-05, + "loss": 2.9671, + "step": 15701 + }, + { + "epoch": 0.7310566380333822, + "grad_norm": 0.39456263355426074, + "learning_rate": 9.384192809224342e-05, + "loss": 3.0221, + "step": 15702 + }, + { + "epoch": 0.7311031962194753, + "grad_norm": 0.41922735730842686, + "learning_rate": 9.384062570822949e-05, + "loss": 3.0077, + "step": 15703 + }, + { + "epoch": 0.7311497544055684, + "grad_norm": 0.37525970929941804, + "learning_rate": 9.383932319554763e-05, + "loss": 3.0153, + "step": 15704 + }, + { + "epoch": 0.7311963125916614, + "grad_norm": 0.38921473277688956, + "learning_rate": 9.383802055420168e-05, + "loss": 2.9835, + "step": 15705 + }, + { + "epoch": 0.7312428707777545, + "grad_norm": 0.3465758191597658, + "learning_rate": 9.383671778419547e-05, + "loss": 2.9733, + "step": 15706 + }, + { + "epoch": 0.7312894289638475, + "grad_norm": 0.36713913647430335, + "learning_rate": 9.38354148855328e-05, + "loss": 2.9685, + "step": 15707 + }, + { + "epoch": 0.7313359871499406, + "grad_norm": 0.3806625638996117, + "learning_rate": 9.383411185821751e-05, + "loss": 2.9504, + "step": 15708 + }, + { + "epoch": 0.7313825453360338, + "grad_norm": 0.3570964263469347, + "learning_rate": 9.383280870225343e-05, + "loss": 3.0148, + "step": 15709 + }, + { + "epoch": 0.7314291035221268, + "grad_norm": 0.3916560696218347, + "learning_rate": 9.383150541764436e-05, + "loss": 3.0527, + "step": 15710 + }, + { + "epoch": 0.7314756617082199, + "grad_norm": 0.31954645498938333, + "learning_rate": 9.383020200439414e-05, + "loss": 2.998, + "step": 15711 + }, + { + "epoch": 0.7315222198943129, + "grad_norm": 0.4152437236856077, + "learning_rate": 9.38288984625066e-05, + "loss": 3.0737, + "step": 15712 + }, + { + "epoch": 0.731568778080406, + "grad_norm": 0.36655601206241334, + "learning_rate": 9.382759479198557e-05, + "loss": 2.9763, + "step": 15713 + }, + { + "epoch": 0.7316153362664991, + "grad_norm": 0.3612719919688799, + "learning_rate": 9.382629099283486e-05, + "loss": 3.0618, + "step": 15714 + }, + { + "epoch": 0.7316618944525921, + "grad_norm": 0.3638638982175669, + "learning_rate": 9.382498706505828e-05, + "loss": 2.9699, + "step": 15715 + }, + { + "epoch": 0.7317084526386852, + "grad_norm": 0.3609834871068393, + "learning_rate": 9.382368300865971e-05, + "loss": 2.9015, + "step": 15716 + }, + { + "epoch": 0.7317550108247782, + "grad_norm": 0.3611934836704253, + "learning_rate": 9.382237882364294e-05, + "loss": 3.0495, + "step": 15717 + }, + { + "epoch": 0.7318015690108713, + "grad_norm": 0.38763065067693797, + "learning_rate": 9.38210745100118e-05, + "loss": 2.9839, + "step": 15718 + }, + { + "epoch": 0.7318481271969645, + "grad_norm": 0.34822128281982223, + "learning_rate": 9.381977006777013e-05, + "loss": 2.9451, + "step": 15719 + 
}, + { + "epoch": 0.7318946853830575, + "grad_norm": 0.38414245471049807, + "learning_rate": 9.381846549692175e-05, + "loss": 2.9819, + "step": 15720 + }, + { + "epoch": 0.7319412435691506, + "grad_norm": 0.3899707225074121, + "learning_rate": 9.38171607974705e-05, + "loss": 2.9765, + "step": 15721 + }, + { + "epoch": 0.7319878017552436, + "grad_norm": 0.351301200148332, + "learning_rate": 9.381585596942019e-05, + "loss": 2.8932, + "step": 15722 + }, + { + "epoch": 0.7320343599413367, + "grad_norm": 0.3682253764874269, + "learning_rate": 9.381455101277465e-05, + "loss": 2.9528, + "step": 15723 + }, + { + "epoch": 0.7320809181274297, + "grad_norm": 0.4085332949436971, + "learning_rate": 9.381324592753772e-05, + "loss": 2.9332, + "step": 15724 + }, + { + "epoch": 0.7321274763135228, + "grad_norm": 0.36015622587794277, + "learning_rate": 9.381194071371324e-05, + "loss": 3.0253, + "step": 15725 + }, + { + "epoch": 0.7321740344996159, + "grad_norm": 0.42768315322933786, + "learning_rate": 9.381063537130503e-05, + "loss": 2.9896, + "step": 15726 + }, + { + "epoch": 0.7322205926857089, + "grad_norm": 0.3508518714880422, + "learning_rate": 9.380932990031693e-05, + "loss": 2.9864, + "step": 15727 + }, + { + "epoch": 0.732267150871802, + "grad_norm": 0.3992785676992031, + "learning_rate": 9.380802430075276e-05, + "loss": 2.9742, + "step": 15728 + }, + { + "epoch": 0.7323137090578951, + "grad_norm": 0.37562988037893663, + "learning_rate": 9.380671857261634e-05, + "loss": 2.8962, + "step": 15729 + }, + { + "epoch": 0.7323602672439882, + "grad_norm": 0.3944367471612579, + "learning_rate": 9.380541271591152e-05, + "loss": 3.0034, + "step": 15730 + }, + { + "epoch": 0.7324068254300813, + "grad_norm": 0.46292934911361755, + "learning_rate": 9.380410673064213e-05, + "loss": 2.8856, + "step": 15731 + }, + { + "epoch": 0.7324533836161743, + "grad_norm": 0.3566334000790983, + "learning_rate": 9.380280061681202e-05, + "loss": 2.9119, + "step": 15732 + }, + { + "epoch": 0.7324999418022674, + "grad_norm": 0.41942731723893933, + "learning_rate": 9.380149437442497e-05, + "loss": 2.9162, + "step": 15733 + }, + { + "epoch": 0.7325464999883604, + "grad_norm": 0.37656385812552157, + "learning_rate": 9.380018800348487e-05, + "loss": 3.0353, + "step": 15734 + }, + { + "epoch": 0.7325930581744535, + "grad_norm": 0.38101975213551736, + "learning_rate": 9.379888150399554e-05, + "loss": 2.9378, + "step": 15735 + }, + { + "epoch": 0.7326396163605466, + "grad_norm": 0.3862004177129493, + "learning_rate": 9.37975748759608e-05, + "loss": 3.0021, + "step": 15736 + }, + { + "epoch": 0.7326861745466396, + "grad_norm": 0.385937930499593, + "learning_rate": 9.379626811938449e-05, + "loss": 2.9458, + "step": 15737 + }, + { + "epoch": 0.7327327327327328, + "grad_norm": 0.33274802178575735, + "learning_rate": 9.379496123427045e-05, + "loss": 2.9506, + "step": 15738 + }, + { + "epoch": 0.7327792909188258, + "grad_norm": 0.37599714931424727, + "learning_rate": 9.379365422062251e-05, + "loss": 2.9568, + "step": 15739 + }, + { + "epoch": 0.7328258491049189, + "grad_norm": 0.3432551974824658, + "learning_rate": 9.379234707844452e-05, + "loss": 2.9999, + "step": 15740 + }, + { + "epoch": 0.732872407291012, + "grad_norm": 0.3755910659824677, + "learning_rate": 9.379103980774029e-05, + "loss": 3.0445, + "step": 15741 + }, + { + "epoch": 0.732918965477105, + "grad_norm": 0.3701944280562221, + "learning_rate": 9.378973240851369e-05, + "loss": 2.9701, + "step": 15742 + }, + { + "epoch": 0.7329655236631981, + "grad_norm": 0.37616586408564, + 
"learning_rate": 9.378842488076854e-05, + "loss": 3.0817, + "step": 15743 + }, + { + "epoch": 0.7330120818492911, + "grad_norm": 0.32474080238452296, + "learning_rate": 9.378711722450865e-05, + "loss": 2.9751, + "step": 15744 + }, + { + "epoch": 0.7330586400353842, + "grad_norm": 0.39659378674362167, + "learning_rate": 9.378580943973792e-05, + "loss": 3.0095, + "step": 15745 + }, + { + "epoch": 0.7331051982214772, + "grad_norm": 0.3497561573078944, + "learning_rate": 9.378450152646013e-05, + "loss": 3.0403, + "step": 15746 + }, + { + "epoch": 0.7331517564075704, + "grad_norm": 0.3650298140101598, + "learning_rate": 9.378319348467914e-05, + "loss": 2.8381, + "step": 15747 + }, + { + "epoch": 0.7331983145936635, + "grad_norm": 0.3568671748146414, + "learning_rate": 9.378188531439879e-05, + "loss": 2.9936, + "step": 15748 + }, + { + "epoch": 0.7332448727797565, + "grad_norm": 0.36276682016219686, + "learning_rate": 9.378057701562293e-05, + "loss": 3.0074, + "step": 15749 + }, + { + "epoch": 0.7332914309658496, + "grad_norm": 0.3504302204522034, + "learning_rate": 9.37792685883554e-05, + "loss": 2.9438, + "step": 15750 + }, + { + "epoch": 0.7333379891519426, + "grad_norm": 0.35369467892940515, + "learning_rate": 9.37779600326e-05, + "loss": 2.9784, + "step": 15751 + }, + { + "epoch": 0.7333845473380357, + "grad_norm": 0.3423110586223297, + "learning_rate": 9.377665134836063e-05, + "loss": 3.084, + "step": 15752 + }, + { + "epoch": 0.7334311055241288, + "grad_norm": 0.36249758908977936, + "learning_rate": 9.377534253564109e-05, + "loss": 2.9442, + "step": 15753 + }, + { + "epoch": 0.7334776637102218, + "grad_norm": 0.3532922977240614, + "learning_rate": 9.37740335944452e-05, + "loss": 3.03, + "step": 15754 + }, + { + "epoch": 0.7335242218963149, + "grad_norm": 0.3197656761256191, + "learning_rate": 9.377272452477688e-05, + "loss": 2.8581, + "step": 15755 + }, + { + "epoch": 0.733570780082408, + "grad_norm": 0.359141104074791, + "learning_rate": 9.377141532663991e-05, + "loss": 2.8543, + "step": 15756 + }, + { + "epoch": 0.7336173382685011, + "grad_norm": 0.35228402388933105, + "learning_rate": 9.377010600003815e-05, + "loss": 2.9492, + "step": 15757 + }, + { + "epoch": 0.7336638964545942, + "grad_norm": 0.3337915203465227, + "learning_rate": 9.376879654497543e-05, + "loss": 2.9013, + "step": 15758 + }, + { + "epoch": 0.7337104546406872, + "grad_norm": 0.3641981587262348, + "learning_rate": 9.376748696145561e-05, + "loss": 2.8963, + "step": 15759 + }, + { + "epoch": 0.7337570128267803, + "grad_norm": 0.31194462869351247, + "learning_rate": 9.376617724948253e-05, + "loss": 2.9981, + "step": 15760 + }, + { + "epoch": 0.7338035710128733, + "grad_norm": 0.3509279177327033, + "learning_rate": 9.376486740906003e-05, + "loss": 3.0335, + "step": 15761 + }, + { + "epoch": 0.7338501291989664, + "grad_norm": 0.3312356644489457, + "learning_rate": 9.376355744019196e-05, + "loss": 2.9153, + "step": 15762 + }, + { + "epoch": 0.7338966873850595, + "grad_norm": 0.3412970523153392, + "learning_rate": 9.376224734288217e-05, + "loss": 2.9671, + "step": 15763 + }, + { + "epoch": 0.7339432455711525, + "grad_norm": 0.3453242208881259, + "learning_rate": 9.376093711713447e-05, + "loss": 3.0412, + "step": 15764 + }, + { + "epoch": 0.7339898037572457, + "grad_norm": 0.31254969338242006, + "learning_rate": 9.375962676295274e-05, + "loss": 2.9279, + "step": 15765 + }, + { + "epoch": 0.7340363619433387, + "grad_norm": 0.3382385528132516, + "learning_rate": 9.375831628034082e-05, + "loss": 3.0024, + "step": 15766 + }, + { + 
"epoch": 0.7340829201294318, + "grad_norm": 0.3074608119919993, + "learning_rate": 9.375700566930257e-05, + "loss": 3.0445, + "step": 15767 + }, + { + "epoch": 0.7341294783155248, + "grad_norm": 0.353283112780542, + "learning_rate": 9.375569492984179e-05, + "loss": 2.9569, + "step": 15768 + }, + { + "epoch": 0.7341760365016179, + "grad_norm": 0.3211741900451596, + "learning_rate": 9.375438406196238e-05, + "loss": 3.0192, + "step": 15769 + }, + { + "epoch": 0.734222594687711, + "grad_norm": 0.34337477007829914, + "learning_rate": 9.375307306566813e-05, + "loss": 2.9969, + "step": 15770 + }, + { + "epoch": 0.734269152873804, + "grad_norm": 0.30903932202134193, + "learning_rate": 9.375176194096295e-05, + "loss": 2.9898, + "step": 15771 + }, + { + "epoch": 0.7343157110598971, + "grad_norm": 0.32902276127779273, + "learning_rate": 9.375045068785066e-05, + "loss": 3.0272, + "step": 15772 + }, + { + "epoch": 0.7343622692459901, + "grad_norm": 0.3547778106982057, + "learning_rate": 9.374913930633509e-05, + "loss": 2.8959, + "step": 15773 + }, + { + "epoch": 0.7344088274320832, + "grad_norm": 0.3419779886328708, + "learning_rate": 9.374782779642012e-05, + "loss": 2.9567, + "step": 15774 + }, + { + "epoch": 0.7344553856181764, + "grad_norm": 0.325446058142327, + "learning_rate": 9.374651615810957e-05, + "loss": 2.8854, + "step": 15775 + }, + { + "epoch": 0.7345019438042694, + "grad_norm": 0.31926401835951485, + "learning_rate": 9.374520439140733e-05, + "loss": 2.9091, + "step": 15776 + }, + { + "epoch": 0.7345485019903625, + "grad_norm": 0.34915696635861804, + "learning_rate": 9.37438924963172e-05, + "loss": 3.0572, + "step": 15777 + }, + { + "epoch": 0.7345950601764555, + "grad_norm": 0.3399577333207399, + "learning_rate": 9.374258047284306e-05, + "loss": 2.9864, + "step": 15778 + }, + { + "epoch": 0.7346416183625486, + "grad_norm": 0.33181223991799935, + "learning_rate": 9.374126832098877e-05, + "loss": 2.9174, + "step": 15779 + }, + { + "epoch": 0.7346881765486417, + "grad_norm": 0.33848493640432087, + "learning_rate": 9.373995604075816e-05, + "loss": 2.9498, + "step": 15780 + }, + { + "epoch": 0.7347347347347347, + "grad_norm": 0.35725176975153844, + "learning_rate": 9.373864363215508e-05, + "loss": 2.8923, + "step": 15781 + }, + { + "epoch": 0.7347812929208278, + "grad_norm": 0.3431643368265717, + "learning_rate": 9.373733109518339e-05, + "loss": 2.9941, + "step": 15782 + }, + { + "epoch": 0.7348278511069208, + "grad_norm": 0.35926461107663926, + "learning_rate": 9.373601842984695e-05, + "loss": 2.9299, + "step": 15783 + }, + { + "epoch": 0.734874409293014, + "grad_norm": 0.35775019611511355, + "learning_rate": 9.37347056361496e-05, + "loss": 2.9395, + "step": 15784 + }, + { + "epoch": 0.7349209674791071, + "grad_norm": 0.3362004265156428, + "learning_rate": 9.37333927140952e-05, + "loss": 2.9094, + "step": 15785 + }, + { + "epoch": 0.7349675256652001, + "grad_norm": 0.3666465970841984, + "learning_rate": 9.37320796636876e-05, + "loss": 3.0079, + "step": 15786 + }, + { + "epoch": 0.7350140838512932, + "grad_norm": 0.35244950407624737, + "learning_rate": 9.373076648493065e-05, + "loss": 2.9437, + "step": 15787 + }, + { + "epoch": 0.7350606420373862, + "grad_norm": 0.3774980399634337, + "learning_rate": 9.372945317782821e-05, + "loss": 3.1444, + "step": 15788 + }, + { + "epoch": 0.7351072002234793, + "grad_norm": 0.40664136989896493, + "learning_rate": 9.372813974238415e-05, + "loss": 2.8808, + "step": 15789 + }, + { + "epoch": 0.7351537584095723, + "grad_norm": 0.3503552862023593, + 
"learning_rate": 9.372682617860229e-05, + "loss": 3.0758, + "step": 15790 + }, + { + "epoch": 0.7352003165956654, + "grad_norm": 0.41817341576407413, + "learning_rate": 9.37255124864865e-05, + "loss": 3.1148, + "step": 15791 + }, + { + "epoch": 0.7352468747817585, + "grad_norm": 0.37024706315721095, + "learning_rate": 9.372419866604065e-05, + "loss": 2.9344, + "step": 15792 + }, + { + "epoch": 0.7352934329678515, + "grad_norm": 0.4500111974801019, + "learning_rate": 9.372288471726858e-05, + "loss": 2.8836, + "step": 15793 + }, + { + "epoch": 0.7353399911539447, + "grad_norm": 0.38431202559843486, + "learning_rate": 9.372157064017414e-05, + "loss": 2.9756, + "step": 15794 + }, + { + "epoch": 0.7353865493400377, + "grad_norm": 0.38659621325510596, + "learning_rate": 9.37202564347612e-05, + "loss": 2.9839, + "step": 15795 + }, + { + "epoch": 0.7354331075261308, + "grad_norm": 0.3666604756280067, + "learning_rate": 9.371894210103363e-05, + "loss": 2.9999, + "step": 15796 + }, + { + "epoch": 0.7354796657122239, + "grad_norm": 0.34102249397396933, + "learning_rate": 9.371762763899527e-05, + "loss": 3.0403, + "step": 15797 + }, + { + "epoch": 0.7355262238983169, + "grad_norm": 0.36546024417586126, + "learning_rate": 9.371631304864997e-05, + "loss": 3.095, + "step": 15798 + }, + { + "epoch": 0.73557278208441, + "grad_norm": 0.33059385702126187, + "learning_rate": 9.371499833000161e-05, + "loss": 2.9662, + "step": 15799 + }, + { + "epoch": 0.735619340270503, + "grad_norm": 0.3510865746429261, + "learning_rate": 9.371368348305402e-05, + "loss": 3.0735, + "step": 15800 + }, + { + "epoch": 0.7356658984565961, + "grad_norm": 0.3590208176079251, + "learning_rate": 9.371236850781109e-05, + "loss": 2.9087, + "step": 15801 + }, + { + "epoch": 0.7357124566426892, + "grad_norm": 0.36498697660495744, + "learning_rate": 9.371105340427666e-05, + "loss": 2.9543, + "step": 15802 + }, + { + "epoch": 0.7357590148287823, + "grad_norm": 0.3579171980100047, + "learning_rate": 9.370973817245459e-05, + "loss": 2.9802, + "step": 15803 + }, + { + "epoch": 0.7358055730148754, + "grad_norm": 0.3701128953621613, + "learning_rate": 9.370842281234875e-05, + "loss": 2.9657, + "step": 15804 + }, + { + "epoch": 0.7358521312009684, + "grad_norm": 0.33131914685674335, + "learning_rate": 9.370710732396299e-05, + "loss": 2.9056, + "step": 15805 + }, + { + "epoch": 0.7358986893870615, + "grad_norm": 0.34801510901946947, + "learning_rate": 9.370579170730118e-05, + "loss": 2.917, + "step": 15806 + }, + { + "epoch": 0.7359452475731546, + "grad_norm": 0.3462968149075629, + "learning_rate": 9.370447596236717e-05, + "loss": 3.0296, + "step": 15807 + }, + { + "epoch": 0.7359918057592476, + "grad_norm": 0.3609979504904038, + "learning_rate": 9.370316008916485e-05, + "loss": 3.0158, + "step": 15808 + }, + { + "epoch": 0.7360383639453407, + "grad_norm": 0.32903851173306625, + "learning_rate": 9.370184408769804e-05, + "loss": 3.038, + "step": 15809 + }, + { + "epoch": 0.7360849221314337, + "grad_norm": 0.3689144082513029, + "learning_rate": 9.370052795797062e-05, + "loss": 2.9862, + "step": 15810 + }, + { + "epoch": 0.7361314803175268, + "grad_norm": 0.3693752079852545, + "learning_rate": 9.369921169998646e-05, + "loss": 2.9301, + "step": 15811 + }, + { + "epoch": 0.7361780385036198, + "grad_norm": 0.35366895158043943, + "learning_rate": 9.369789531374944e-05, + "loss": 2.9847, + "step": 15812 + }, + { + "epoch": 0.736224596689713, + "grad_norm": 0.39109896854016346, + "learning_rate": 9.369657879926338e-05, + "loss": 2.9242, + "step": 15813 + 
}, + { + "epoch": 0.7362711548758061, + "grad_norm": 0.3458620324980172, + "learning_rate": 9.369526215653218e-05, + "loss": 2.9547, + "step": 15814 + }, + { + "epoch": 0.7363177130618991, + "grad_norm": 0.36762821905595916, + "learning_rate": 9.369394538555967e-05, + "loss": 3.0135, + "step": 15815 + }, + { + "epoch": 0.7363642712479922, + "grad_norm": 0.32731004000506836, + "learning_rate": 9.369262848634974e-05, + "loss": 2.9941, + "step": 15816 + }, + { + "epoch": 0.7364108294340852, + "grad_norm": 0.3475140350203061, + "learning_rate": 9.369131145890627e-05, + "loss": 2.9583, + "step": 15817 + }, + { + "epoch": 0.7364573876201783, + "grad_norm": 0.35424192521078574, + "learning_rate": 9.368999430323308e-05, + "loss": 3.0485, + "step": 15818 + }, + { + "epoch": 0.7365039458062714, + "grad_norm": 0.330866003727722, + "learning_rate": 9.368867701933408e-05, + "loss": 2.936, + "step": 15819 + }, + { + "epoch": 0.7365505039923644, + "grad_norm": 0.3572437753319366, + "learning_rate": 9.36873596072131e-05, + "loss": 2.9115, + "step": 15820 + }, + { + "epoch": 0.7365970621784576, + "grad_norm": 0.3110917848192461, + "learning_rate": 9.368604206687403e-05, + "loss": 3.0223, + "step": 15821 + }, + { + "epoch": 0.7366436203645506, + "grad_norm": 0.3327341454936081, + "learning_rate": 9.368472439832074e-05, + "loss": 2.9238, + "step": 15822 + }, + { + "epoch": 0.7366901785506437, + "grad_norm": 0.31911364859586994, + "learning_rate": 9.368340660155707e-05, + "loss": 2.9146, + "step": 15823 + }, + { + "epoch": 0.7367367367367368, + "grad_norm": 0.353859785660786, + "learning_rate": 9.368208867658692e-05, + "loss": 2.9769, + "step": 15824 + }, + { + "epoch": 0.7367832949228298, + "grad_norm": 0.3226815788186911, + "learning_rate": 9.368077062341413e-05, + "loss": 2.8979, + "step": 15825 + }, + { + "epoch": 0.7368298531089229, + "grad_norm": 0.3526065362496858, + "learning_rate": 9.367945244204261e-05, + "loss": 2.99, + "step": 15826 + }, + { + "epoch": 0.7368764112950159, + "grad_norm": 0.36734573630612305, + "learning_rate": 9.367813413247617e-05, + "loss": 3.0838, + "step": 15827 + }, + { + "epoch": 0.736922969481109, + "grad_norm": 0.31616708290225, + "learning_rate": 9.367681569471872e-05, + "loss": 2.9967, + "step": 15828 + }, + { + "epoch": 0.7369695276672021, + "grad_norm": 0.37830190544411596, + "learning_rate": 9.367549712877413e-05, + "loss": 2.9641, + "step": 15829 + }, + { + "epoch": 0.7370160858532951, + "grad_norm": 0.32035137017128706, + "learning_rate": 9.367417843464623e-05, + "loss": 3.019, + "step": 15830 + }, + { + "epoch": 0.7370626440393883, + "grad_norm": 0.38552177481605393, + "learning_rate": 9.367285961233893e-05, + "loss": 3.013, + "step": 15831 + }, + { + "epoch": 0.7371092022254813, + "grad_norm": 0.3363761598728991, + "learning_rate": 9.367154066185611e-05, + "loss": 3.0071, + "step": 15832 + }, + { + "epoch": 0.7371557604115744, + "grad_norm": 0.36652877543791496, + "learning_rate": 9.36702215832016e-05, + "loss": 2.9682, + "step": 15833 + }, + { + "epoch": 0.7372023185976674, + "grad_norm": 0.3776586158532, + "learning_rate": 9.36689023763793e-05, + "loss": 3.0566, + "step": 15834 + }, + { + "epoch": 0.7372488767837605, + "grad_norm": 0.3363966031776157, + "learning_rate": 9.366758304139307e-05, + "loss": 2.904, + "step": 15835 + }, + { + "epoch": 0.7372954349698536, + "grad_norm": 0.3082504789722463, + "learning_rate": 9.366626357824679e-05, + "loss": 2.9869, + "step": 15836 + }, + { + "epoch": 0.7373419931559466, + "grad_norm": 0.3802980001492576, + 
"learning_rate": 9.366494398694433e-05, + "loss": 2.9645, + "step": 15837 + }, + { + "epoch": 0.7373885513420397, + "grad_norm": 0.32929722466888117, + "learning_rate": 9.366362426748956e-05, + "loss": 3.0853, + "step": 15838 + }, + { + "epoch": 0.7374351095281327, + "grad_norm": 0.3531718844202787, + "learning_rate": 9.366230441988635e-05, + "loss": 2.9953, + "step": 15839 + }, + { + "epoch": 0.7374816677142259, + "grad_norm": 0.3955670386159285, + "learning_rate": 9.366098444413857e-05, + "loss": 3.0004, + "step": 15840 + }, + { + "epoch": 0.737528225900319, + "grad_norm": 0.3586417179923186, + "learning_rate": 9.365966434025012e-05, + "loss": 2.8824, + "step": 15841 + }, + { + "epoch": 0.737574784086412, + "grad_norm": 0.3633481518679402, + "learning_rate": 9.365834410822486e-05, + "loss": 2.9891, + "step": 15842 + }, + { + "epoch": 0.7376213422725051, + "grad_norm": 0.3973007943319282, + "learning_rate": 9.365702374806664e-05, + "loss": 2.9715, + "step": 15843 + }, + { + "epoch": 0.7376679004585981, + "grad_norm": 0.3794825162571338, + "learning_rate": 9.365570325977936e-05, + "loss": 3.0033, + "step": 15844 + }, + { + "epoch": 0.7377144586446912, + "grad_norm": 0.37347473556508715, + "learning_rate": 9.36543826433669e-05, + "loss": 2.8547, + "step": 15845 + }, + { + "epoch": 0.7377610168307843, + "grad_norm": 0.3933551060872598, + "learning_rate": 9.365306189883311e-05, + "loss": 3.0067, + "step": 15846 + }, + { + "epoch": 0.7378075750168773, + "grad_norm": 0.4339313127051681, + "learning_rate": 9.36517410261819e-05, + "loss": 3.0387, + "step": 15847 + }, + { + "epoch": 0.7378541332029704, + "grad_norm": 0.42821508763668664, + "learning_rate": 9.365042002541713e-05, + "loss": 2.929, + "step": 15848 + }, + { + "epoch": 0.7379006913890634, + "grad_norm": 0.3926602442984056, + "learning_rate": 9.364909889654267e-05, + "loss": 3.0885, + "step": 15849 + }, + { + "epoch": 0.7379472495751566, + "grad_norm": 0.42949137788720415, + "learning_rate": 9.364777763956242e-05, + "loss": 2.9094, + "step": 15850 + }, + { + "epoch": 0.7379938077612497, + "grad_norm": 0.3597418660926594, + "learning_rate": 9.364645625448023e-05, + "loss": 3.0138, + "step": 15851 + }, + { + "epoch": 0.7380403659473427, + "grad_norm": 0.37829791234402, + "learning_rate": 9.36451347413e-05, + "loss": 2.9755, + "step": 15852 + }, + { + "epoch": 0.7380869241334358, + "grad_norm": 0.35660458548460633, + "learning_rate": 9.36438131000256e-05, + "loss": 3.0636, + "step": 15853 + }, + { + "epoch": 0.7381334823195288, + "grad_norm": 0.3845958903603702, + "learning_rate": 9.36424913306609e-05, + "loss": 3.0327, + "step": 15854 + }, + { + "epoch": 0.7381800405056219, + "grad_norm": 0.37959405312475863, + "learning_rate": 9.364116943320978e-05, + "loss": 2.9126, + "step": 15855 + }, + { + "epoch": 0.7382265986917149, + "grad_norm": 0.3534978337309107, + "learning_rate": 9.363984740767614e-05, + "loss": 3.0041, + "step": 15856 + }, + { + "epoch": 0.738273156877808, + "grad_norm": 0.40821061042454304, + "learning_rate": 9.363852525406385e-05, + "loss": 3.0628, + "step": 15857 + }, + { + "epoch": 0.7383197150639011, + "grad_norm": 0.3094914431564615, + "learning_rate": 9.363720297237676e-05, + "loss": 2.9749, + "step": 15858 + }, + { + "epoch": 0.7383662732499942, + "grad_norm": 0.40303520008948207, + "learning_rate": 9.363588056261881e-05, + "loss": 2.9872, + "step": 15859 + }, + { + "epoch": 0.7384128314360873, + "grad_norm": 0.31557244820724056, + "learning_rate": 9.363455802479385e-05, + "loss": 2.921, + "step": 15860 + }, + { + 
"epoch": 0.7384593896221803, + "grad_norm": 0.42960683889969986, + "learning_rate": 9.363323535890573e-05, + "loss": 2.9835, + "step": 15861 + }, + { + "epoch": 0.7385059478082734, + "grad_norm": 0.35648344352514105, + "learning_rate": 9.36319125649584e-05, + "loss": 3.0189, + "step": 15862 + }, + { + "epoch": 0.7385525059943665, + "grad_norm": 0.3742901823789378, + "learning_rate": 9.363058964295567e-05, + "loss": 3.0213, + "step": 15863 + }, + { + "epoch": 0.7385990641804595, + "grad_norm": 0.3450583716608285, + "learning_rate": 9.362926659290149e-05, + "loss": 2.9518, + "step": 15864 + }, + { + "epoch": 0.7386456223665526, + "grad_norm": 0.3244081630027506, + "learning_rate": 9.36279434147997e-05, + "loss": 3.0294, + "step": 15865 + }, + { + "epoch": 0.7386921805526456, + "grad_norm": 0.36802639254465863, + "learning_rate": 9.362662010865418e-05, + "loss": 2.9527, + "step": 15866 + }, + { + "epoch": 0.7387387387387387, + "grad_norm": 0.38084996113912917, + "learning_rate": 9.362529667446884e-05, + "loss": 2.9334, + "step": 15867 + }, + { + "epoch": 0.7387852969248319, + "grad_norm": 0.32008060826550605, + "learning_rate": 9.362397311224756e-05, + "loss": 2.9813, + "step": 15868 + }, + { + "epoch": 0.7388318551109249, + "grad_norm": 0.36124346804175117, + "learning_rate": 9.362264942199419e-05, + "loss": 2.9621, + "step": 15869 + }, + { + "epoch": 0.738878413297018, + "grad_norm": 0.3430547090384112, + "learning_rate": 9.362132560371266e-05, + "loss": 2.9488, + "step": 15870 + }, + { + "epoch": 0.738924971483111, + "grad_norm": 0.3614532779219395, + "learning_rate": 9.362000165740684e-05, + "loss": 3.0441, + "step": 15871 + }, + { + "epoch": 0.7389715296692041, + "grad_norm": 0.3387252291229328, + "learning_rate": 9.361867758308061e-05, + "loss": 2.9741, + "step": 15872 + }, + { + "epoch": 0.7390180878552972, + "grad_norm": 0.3469398850787171, + "learning_rate": 9.361735338073785e-05, + "loss": 2.8599, + "step": 15873 + }, + { + "epoch": 0.7390646460413902, + "grad_norm": 0.3633231935725513, + "learning_rate": 9.361602905038247e-05, + "loss": 3.1064, + "step": 15874 + }, + { + "epoch": 0.7391112042274833, + "grad_norm": 0.31474854260625484, + "learning_rate": 9.361470459201833e-05, + "loss": 3.0773, + "step": 15875 + }, + { + "epoch": 0.7391577624135763, + "grad_norm": 0.3529041717811599, + "learning_rate": 9.361338000564932e-05, + "loss": 3.1166, + "step": 15876 + }, + { + "epoch": 0.7392043205996695, + "grad_norm": 0.32776052307379155, + "learning_rate": 9.361205529127935e-05, + "loss": 2.9507, + "step": 15877 + }, + { + "epoch": 0.7392508787857625, + "grad_norm": 0.35468988960956244, + "learning_rate": 9.361073044891229e-05, + "loss": 2.9335, + "step": 15878 + }, + { + "epoch": 0.7392974369718556, + "grad_norm": 0.3454590235629324, + "learning_rate": 9.360940547855204e-05, + "loss": 2.8651, + "step": 15879 + }, + { + "epoch": 0.7393439951579487, + "grad_norm": 0.32106840375972756, + "learning_rate": 9.360808038020248e-05, + "loss": 2.9587, + "step": 15880 + }, + { + "epoch": 0.7393905533440417, + "grad_norm": 0.36474853255801254, + "learning_rate": 9.360675515386748e-05, + "loss": 2.9766, + "step": 15881 + }, + { + "epoch": 0.7394371115301348, + "grad_norm": 0.3246746590819742, + "learning_rate": 9.360542979955098e-05, + "loss": 3.0169, + "step": 15882 + }, + { + "epoch": 0.7394836697162278, + "grad_norm": 0.4009617224152582, + "learning_rate": 9.360410431725681e-05, + "loss": 3.0418, + "step": 15883 + }, + { + "epoch": 0.7395302279023209, + "grad_norm": 0.3031482220027044, + 
"learning_rate": 9.360277870698891e-05, + "loss": 2.926, + "step": 15884 + }, + { + "epoch": 0.739576786088414, + "grad_norm": 0.3829829860755462, + "learning_rate": 9.360145296875114e-05, + "loss": 2.9835, + "step": 15885 + }, + { + "epoch": 0.739623344274507, + "grad_norm": 0.31907656995442263, + "learning_rate": 9.360012710254743e-05, + "loss": 2.9741, + "step": 15886 + }, + { + "epoch": 0.7396699024606002, + "grad_norm": 0.36614031391006846, + "learning_rate": 9.35988011083816e-05, + "loss": 3.0087, + "step": 15887 + }, + { + "epoch": 0.7397164606466932, + "grad_norm": 0.3415123000141957, + "learning_rate": 9.359747498625759e-05, + "loss": 2.9819, + "step": 15888 + }, + { + "epoch": 0.7397630188327863, + "grad_norm": 0.39406321651130266, + "learning_rate": 9.35961487361793e-05, + "loss": 3.0373, + "step": 15889 + }, + { + "epoch": 0.7398095770188794, + "grad_norm": 0.34651148090311634, + "learning_rate": 9.359482235815061e-05, + "loss": 3.0103, + "step": 15890 + }, + { + "epoch": 0.7398561352049724, + "grad_norm": 0.40837790661092765, + "learning_rate": 9.359349585217541e-05, + "loss": 2.8822, + "step": 15891 + }, + { + "epoch": 0.7399026933910655, + "grad_norm": 0.39435412102616985, + "learning_rate": 9.359216921825759e-05, + "loss": 2.9829, + "step": 15892 + }, + { + "epoch": 0.7399492515771585, + "grad_norm": 0.38015697129656045, + "learning_rate": 9.359084245640106e-05, + "loss": 2.9365, + "step": 15893 + }, + { + "epoch": 0.7399958097632516, + "grad_norm": 0.41234968682738915, + "learning_rate": 9.358951556660967e-05, + "loss": 2.9471, + "step": 15894 + }, + { + "epoch": 0.7400423679493447, + "grad_norm": 0.3548311758803799, + "learning_rate": 9.358818854888739e-05, + "loss": 3.0514, + "step": 15895 + }, + { + "epoch": 0.7400889261354378, + "grad_norm": 0.40315701349102756, + "learning_rate": 9.358686140323804e-05, + "loss": 2.9946, + "step": 15896 + }, + { + "epoch": 0.7401354843215309, + "grad_norm": 0.37613193084226193, + "learning_rate": 9.358553412966555e-05, + "loss": 2.9888, + "step": 15897 + }, + { + "epoch": 0.7401820425076239, + "grad_norm": 0.4341698838751012, + "learning_rate": 9.358420672817381e-05, + "loss": 3.0339, + "step": 15898 + }, + { + "epoch": 0.740228600693717, + "grad_norm": 0.42660384988436434, + "learning_rate": 9.358287919876671e-05, + "loss": 2.9969, + "step": 15899 + }, + { + "epoch": 0.74027515887981, + "grad_norm": 0.3870720893834843, + "learning_rate": 9.358155154144817e-05, + "loss": 2.9863, + "step": 15900 + }, + { + "epoch": 0.7403217170659031, + "grad_norm": 0.37827782186562986, + "learning_rate": 9.358022375622207e-05, + "loss": 2.8417, + "step": 15901 + }, + { + "epoch": 0.7403682752519962, + "grad_norm": 0.3694846858576305, + "learning_rate": 9.35788958430923e-05, + "loss": 2.9701, + "step": 15902 + }, + { + "epoch": 0.7404148334380892, + "grad_norm": 0.39872289062874, + "learning_rate": 9.357756780206278e-05, + "loss": 2.8053, + "step": 15903 + }, + { + "epoch": 0.7404613916241823, + "grad_norm": 0.38972253254904804, + "learning_rate": 9.357623963313738e-05, + "loss": 3.0697, + "step": 15904 + }, + { + "epoch": 0.7405079498102753, + "grad_norm": 0.41576359042376365, + "learning_rate": 9.357491133631999e-05, + "loss": 3.1506, + "step": 15905 + }, + { + "epoch": 0.7405545079963685, + "grad_norm": 0.32173205481516576, + "learning_rate": 9.357358291161456e-05, + "loss": 2.9205, + "step": 15906 + }, + { + "epoch": 0.7406010661824616, + "grad_norm": 0.40959256441115427, + "learning_rate": 9.357225435902492e-05, + "loss": 3.0561, + "step": 15907 + 
}, + { + "epoch": 0.7406476243685546, + "grad_norm": 0.3505498087907301, + "learning_rate": 9.357092567855504e-05, + "loss": 2.8416, + "step": 15908 + }, + { + "epoch": 0.7406941825546477, + "grad_norm": 0.4141761091643869, + "learning_rate": 9.356959687020876e-05, + "loss": 2.9915, + "step": 15909 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.41055080870087113, + "learning_rate": 9.356826793399003e-05, + "loss": 2.9585, + "step": 15910 + }, + { + "epoch": 0.7407872989268338, + "grad_norm": 0.38795233803334406, + "learning_rate": 9.356693886990271e-05, + "loss": 3.0412, + "step": 15911 + }, + { + "epoch": 0.7408338571129269, + "grad_norm": 0.39823810776883667, + "learning_rate": 9.356560967795072e-05, + "loss": 2.9401, + "step": 15912 + }, + { + "epoch": 0.7408804152990199, + "grad_norm": 0.35120946133102626, + "learning_rate": 9.356428035813795e-05, + "loss": 3.0267, + "step": 15913 + }, + { + "epoch": 0.740926973485113, + "grad_norm": 0.3755187383461893, + "learning_rate": 9.356295091046832e-05, + "loss": 2.9447, + "step": 15914 + }, + { + "epoch": 0.740973531671206, + "grad_norm": 0.3554840510902656, + "learning_rate": 9.356162133494573e-05, + "loss": 3.0526, + "step": 15915 + }, + { + "epoch": 0.7410200898572992, + "grad_norm": 0.33871018757148796, + "learning_rate": 9.356029163157405e-05, + "loss": 3.0199, + "step": 15916 + }, + { + "epoch": 0.7410666480433923, + "grad_norm": 0.3720188037427267, + "learning_rate": 9.355896180035721e-05, + "loss": 3.1011, + "step": 15917 + }, + { + "epoch": 0.7411132062294853, + "grad_norm": 0.3639095479991235, + "learning_rate": 9.355763184129913e-05, + "loss": 3.002, + "step": 15918 + }, + { + "epoch": 0.7411597644155784, + "grad_norm": 0.40489195439560693, + "learning_rate": 9.355630175440366e-05, + "loss": 2.9693, + "step": 15919 + }, + { + "epoch": 0.7412063226016714, + "grad_norm": 0.34024593713357915, + "learning_rate": 9.355497153967477e-05, + "loss": 2.9801, + "step": 15920 + }, + { + "epoch": 0.7412528807877645, + "grad_norm": 0.36033171769913025, + "learning_rate": 9.355364119711632e-05, + "loss": 2.9965, + "step": 15921 + }, + { + "epoch": 0.7412994389738575, + "grad_norm": 0.33312825485830955, + "learning_rate": 9.35523107267322e-05, + "loss": 2.9338, + "step": 15922 + }, + { + "epoch": 0.7413459971599506, + "grad_norm": 0.3682017929947909, + "learning_rate": 9.355098012852635e-05, + "loss": 2.9678, + "step": 15923 + }, + { + "epoch": 0.7413925553460438, + "grad_norm": 0.41107961493617806, + "learning_rate": 9.354964940250269e-05, + "loss": 3.0504, + "step": 15924 + }, + { + "epoch": 0.7414391135321368, + "grad_norm": 0.35245067401899166, + "learning_rate": 9.354831854866506e-05, + "loss": 2.94, + "step": 15925 + }, + { + "epoch": 0.7414856717182299, + "grad_norm": 0.35257406371211, + "learning_rate": 9.354698756701742e-05, + "loss": 2.8767, + "step": 15926 + }, + { + "epoch": 0.7415322299043229, + "grad_norm": 0.3325029321869429, + "learning_rate": 9.354565645756365e-05, + "loss": 2.9038, + "step": 15927 + }, + { + "epoch": 0.741578788090416, + "grad_norm": 0.35442778856634644, + "learning_rate": 9.35443252203077e-05, + "loss": 2.9502, + "step": 15928 + }, + { + "epoch": 0.7416253462765091, + "grad_norm": 0.3498013221773631, + "learning_rate": 9.354299385525342e-05, + "loss": 3.0253, + "step": 15929 + }, + { + "epoch": 0.7416719044626021, + "grad_norm": 0.33369752651838, + "learning_rate": 9.354166236240476e-05, + "loss": 2.9987, + "step": 15930 + }, + { + "epoch": 0.7417184626486952, + "grad_norm": 0.32829532237092, + 
"learning_rate": 9.35403307417656e-05, + "loss": 2.9137, + "step": 15931 + }, + { + "epoch": 0.7417650208347882, + "grad_norm": 0.3848957191439163, + "learning_rate": 9.353899899333986e-05, + "loss": 2.969, + "step": 15932 + }, + { + "epoch": 0.7418115790208814, + "grad_norm": 0.35466683043898817, + "learning_rate": 9.353766711713143e-05, + "loss": 2.9035, + "step": 15933 + }, + { + "epoch": 0.7418581372069745, + "grad_norm": 0.388420402167347, + "learning_rate": 9.353633511314426e-05, + "loss": 2.9501, + "step": 15934 + }, + { + "epoch": 0.7419046953930675, + "grad_norm": 0.3609060367194015, + "learning_rate": 9.353500298138222e-05, + "loss": 2.9606, + "step": 15935 + }, + { + "epoch": 0.7419512535791606, + "grad_norm": 0.3888209395706149, + "learning_rate": 9.353367072184925e-05, + "loss": 2.9016, + "step": 15936 + }, + { + "epoch": 0.7419978117652536, + "grad_norm": 0.45035538950874987, + "learning_rate": 9.353233833454925e-05, + "loss": 2.9095, + "step": 15937 + }, + { + "epoch": 0.7420443699513467, + "grad_norm": 0.36925897822820264, + "learning_rate": 9.353100581948611e-05, + "loss": 2.9948, + "step": 15938 + }, + { + "epoch": 0.7420909281374398, + "grad_norm": 0.38665460588475453, + "learning_rate": 9.352967317666375e-05, + "loss": 2.9563, + "step": 15939 + }, + { + "epoch": 0.7421374863235328, + "grad_norm": 0.33641309840668737, + "learning_rate": 9.352834040608611e-05, + "loss": 2.8845, + "step": 15940 + }, + { + "epoch": 0.7421840445096259, + "grad_norm": 0.40636887404970085, + "learning_rate": 9.352700750775707e-05, + "loss": 2.9521, + "step": 15941 + }, + { + "epoch": 0.7422306026957189, + "grad_norm": 0.39337144446006406, + "learning_rate": 9.352567448168055e-05, + "loss": 2.8853, + "step": 15942 + }, + { + "epoch": 0.7422771608818121, + "grad_norm": 0.3714427634313926, + "learning_rate": 9.352434132786046e-05, + "loss": 2.8095, + "step": 15943 + }, + { + "epoch": 0.7423237190679051, + "grad_norm": 0.3921053742955597, + "learning_rate": 9.35230080463007e-05, + "loss": 3.0547, + "step": 15944 + }, + { + "epoch": 0.7423702772539982, + "grad_norm": 0.38166130565928197, + "learning_rate": 9.352167463700522e-05, + "loss": 3.0183, + "step": 15945 + }, + { + "epoch": 0.7424168354400913, + "grad_norm": 0.38796853182884067, + "learning_rate": 9.352034109997792e-05, + "loss": 2.9569, + "step": 15946 + }, + { + "epoch": 0.7424633936261843, + "grad_norm": 0.40198611629399705, + "learning_rate": 9.35190074352227e-05, + "loss": 3.0183, + "step": 15947 + }, + { + "epoch": 0.7425099518122774, + "grad_norm": 0.36042745595822573, + "learning_rate": 9.351767364274347e-05, + "loss": 2.9475, + "step": 15948 + }, + { + "epoch": 0.7425565099983704, + "grad_norm": 0.37913853741267295, + "learning_rate": 9.351633972254417e-05, + "loss": 2.9525, + "step": 15949 + }, + { + "epoch": 0.7426030681844635, + "grad_norm": 0.3699613506813079, + "learning_rate": 9.351500567462867e-05, + "loss": 2.9704, + "step": 15950 + }, + { + "epoch": 0.7426496263705566, + "grad_norm": 0.4108872713574934, + "learning_rate": 9.351367149900095e-05, + "loss": 2.956, + "step": 15951 + }, + { + "epoch": 0.7426961845566497, + "grad_norm": 0.32822313657174795, + "learning_rate": 9.351233719566487e-05, + "loss": 3.0424, + "step": 15952 + }, + { + "epoch": 0.7427427427427428, + "grad_norm": 0.4391387017235192, + "learning_rate": 9.351100276462438e-05, + "loss": 3.0242, + "step": 15953 + }, + { + "epoch": 0.7427893009288358, + "grad_norm": 0.3204072352366168, + "learning_rate": 9.350966820588338e-05, + "loss": 2.8866, + "step": 15954 
+ }, + { + "epoch": 0.7428358591149289, + "grad_norm": 0.3916597921144438, + "learning_rate": 9.350833351944578e-05, + "loss": 2.992, + "step": 15955 + }, + { + "epoch": 0.742882417301022, + "grad_norm": 0.3491306388257864, + "learning_rate": 9.350699870531552e-05, + "loss": 2.888, + "step": 15956 + }, + { + "epoch": 0.742928975487115, + "grad_norm": 0.34511358327472863, + "learning_rate": 9.35056637634965e-05, + "loss": 2.8865, + "step": 15957 + }, + { + "epoch": 0.7429755336732081, + "grad_norm": 0.33763634439679285, + "learning_rate": 9.350432869399264e-05, + "loss": 3.0443, + "step": 15958 + }, + { + "epoch": 0.7430220918593011, + "grad_norm": 0.31960130705968953, + "learning_rate": 9.350299349680787e-05, + "loss": 2.8484, + "step": 15959 + }, + { + "epoch": 0.7430686500453942, + "grad_norm": 0.3216464481917525, + "learning_rate": 9.350165817194609e-05, + "loss": 2.8692, + "step": 15960 + }, + { + "epoch": 0.7431152082314874, + "grad_norm": 0.3313979769278703, + "learning_rate": 9.350032271941125e-05, + "loss": 3.0138, + "step": 15961 + }, + { + "epoch": 0.7431617664175804, + "grad_norm": 0.35473769203806077, + "learning_rate": 9.349898713920721e-05, + "loss": 2.9328, + "step": 15962 + }, + { + "epoch": 0.7432083246036735, + "grad_norm": 0.30967150224193235, + "learning_rate": 9.349765143133797e-05, + "loss": 2.995, + "step": 15963 + }, + { + "epoch": 0.7432548827897665, + "grad_norm": 0.3596985175533699, + "learning_rate": 9.349631559580738e-05, + "loss": 3.0031, + "step": 15964 + }, + { + "epoch": 0.7433014409758596, + "grad_norm": 0.3255617403361409, + "learning_rate": 9.349497963261939e-05, + "loss": 2.9817, + "step": 15965 + }, + { + "epoch": 0.7433479991619526, + "grad_norm": 0.39451111168857317, + "learning_rate": 9.349364354177793e-05, + "loss": 3.0101, + "step": 15966 + }, + { + "epoch": 0.7433945573480457, + "grad_norm": 0.34037527132805195, + "learning_rate": 9.349230732328691e-05, + "loss": 2.9923, + "step": 15967 + }, + { + "epoch": 0.7434411155341388, + "grad_norm": 0.34891869714645307, + "learning_rate": 9.349097097715025e-05, + "loss": 2.8851, + "step": 15968 + }, + { + "epoch": 0.7434876737202318, + "grad_norm": 0.412606282338109, + "learning_rate": 9.348963450337187e-05, + "loss": 3.0965, + "step": 15969 + }, + { + "epoch": 0.743534231906325, + "grad_norm": 0.35385072554198327, + "learning_rate": 9.34882979019557e-05, + "loss": 2.9117, + "step": 15970 + }, + { + "epoch": 0.743580790092418, + "grad_norm": 0.387460054711955, + "learning_rate": 9.348696117290568e-05, + "loss": 2.919, + "step": 15971 + }, + { + "epoch": 0.7436273482785111, + "grad_norm": 0.3599430467111584, + "learning_rate": 9.348562431622569e-05, + "loss": 2.8637, + "step": 15972 + }, + { + "epoch": 0.7436739064646042, + "grad_norm": 0.36469749616276304, + "learning_rate": 9.348428733191968e-05, + "loss": 2.9594, + "step": 15973 + }, + { + "epoch": 0.7437204646506972, + "grad_norm": 0.35245120760447823, + "learning_rate": 9.348295021999157e-05, + "loss": 2.9224, + "step": 15974 + }, + { + "epoch": 0.7437670228367903, + "grad_norm": 0.36913562464021393, + "learning_rate": 9.348161298044529e-05, + "loss": 3.0166, + "step": 15975 + }, + { + "epoch": 0.7438135810228833, + "grad_norm": 0.3737555045601651, + "learning_rate": 9.348027561328476e-05, + "loss": 2.9824, + "step": 15976 + }, + { + "epoch": 0.7438601392089764, + "grad_norm": 0.34475365967255467, + "learning_rate": 9.347893811851389e-05, + "loss": 2.8599, + "step": 15977 + }, + { + "epoch": 0.7439066973950695, + "grad_norm": 0.3482362822628938, + 
"learning_rate": 9.347760049613665e-05, + "loss": 2.8564, + "step": 15978 + }, + { + "epoch": 0.7439532555811625, + "grad_norm": 0.3507723825317146, + "learning_rate": 9.34762627461569e-05, + "loss": 2.9799, + "step": 15979 + }, + { + "epoch": 0.7439998137672557, + "grad_norm": 0.36974899021766683, + "learning_rate": 9.347492486857863e-05, + "loss": 3.1621, + "step": 15980 + }, + { + "epoch": 0.7440463719533487, + "grad_norm": 0.36514434843663374, + "learning_rate": 9.347358686340572e-05, + "loss": 3.0033, + "step": 15981 + }, + { + "epoch": 0.7440929301394418, + "grad_norm": 0.3718271518806382, + "learning_rate": 9.347224873064214e-05, + "loss": 3.097, + "step": 15982 + }, + { + "epoch": 0.7441394883255349, + "grad_norm": 0.3567785179395881, + "learning_rate": 9.347091047029177e-05, + "loss": 3.0181, + "step": 15983 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 0.37280717502328703, + "learning_rate": 9.346957208235856e-05, + "loss": 2.8591, + "step": 15984 + }, + { + "epoch": 0.744232604697721, + "grad_norm": 0.33899375384678687, + "learning_rate": 9.346823356684644e-05, + "loss": 3.0089, + "step": 15985 + }, + { + "epoch": 0.744279162883814, + "grad_norm": 0.3888402736423096, + "learning_rate": 9.346689492375935e-05, + "loss": 3.069, + "step": 15986 + }, + { + "epoch": 0.7443257210699071, + "grad_norm": 0.3709384414475167, + "learning_rate": 9.346555615310119e-05, + "loss": 2.9369, + "step": 15987 + }, + { + "epoch": 0.7443722792560001, + "grad_norm": 0.4021918472177714, + "learning_rate": 9.346421725487591e-05, + "loss": 2.91, + "step": 15988 + }, + { + "epoch": 0.7444188374420933, + "grad_norm": 0.36430814767268166, + "learning_rate": 9.346287822908744e-05, + "loss": 2.9852, + "step": 15989 + }, + { + "epoch": 0.7444653956281864, + "grad_norm": 0.3829339571737337, + "learning_rate": 9.346153907573971e-05, + "loss": 2.9648, + "step": 15990 + }, + { + "epoch": 0.7445119538142794, + "grad_norm": 0.33311881010510125, + "learning_rate": 9.346019979483663e-05, + "loss": 2.9255, + "step": 15991 + }, + { + "epoch": 0.7445585120003725, + "grad_norm": 0.3743874607009988, + "learning_rate": 9.345886038638215e-05, + "loss": 3.0109, + "step": 15992 + }, + { + "epoch": 0.7446050701864655, + "grad_norm": 0.3542493295993546, + "learning_rate": 9.34575208503802e-05, + "loss": 3.0059, + "step": 15993 + }, + { + "epoch": 0.7446516283725586, + "grad_norm": 0.3353978750518203, + "learning_rate": 9.34561811868347e-05, + "loss": 3.0322, + "step": 15994 + }, + { + "epoch": 0.7446981865586517, + "grad_norm": 0.33827773915352954, + "learning_rate": 9.34548413957496e-05, + "loss": 2.9701, + "step": 15995 + }, + { + "epoch": 0.7447447447447447, + "grad_norm": 0.3096613576153722, + "learning_rate": 9.345350147712881e-05, + "loss": 2.8587, + "step": 15996 + }, + { + "epoch": 0.7447913029308378, + "grad_norm": 0.3706570496058322, + "learning_rate": 9.345216143097628e-05, + "loss": 3.0121, + "step": 15997 + }, + { + "epoch": 0.7448378611169308, + "grad_norm": 0.3472078553412396, + "learning_rate": 9.345082125729594e-05, + "loss": 3.1109, + "step": 15998 + }, + { + "epoch": 0.744884419303024, + "grad_norm": 0.3404147867920552, + "learning_rate": 9.344948095609172e-05, + "loss": 2.9679, + "step": 15999 + }, + { + "epoch": 0.7449309774891171, + "grad_norm": 0.3431412980629383, + "learning_rate": 9.344814052736755e-05, + "loss": 3.0556, + "step": 16000 + }, + { + "epoch": 0.7449775356752101, + "grad_norm": 0.35070284882085473, + "learning_rate": 9.344679997112736e-05, + "loss": 2.9081, + "step": 16001 + }, + { + 
"epoch": 0.7450240938613032, + "grad_norm": 0.3368661215590704, + "learning_rate": 9.34454592873751e-05, + "loss": 3.0167, + "step": 16002 + }, + { + "epoch": 0.7450706520473962, + "grad_norm": 0.3499734594776518, + "learning_rate": 9.34441184761147e-05, + "loss": 2.9041, + "step": 16003 + }, + { + "epoch": 0.7451172102334893, + "grad_norm": 0.3652027575589835, + "learning_rate": 9.344277753735009e-05, + "loss": 2.9288, + "step": 16004 + }, + { + "epoch": 0.7451637684195824, + "grad_norm": 0.34704345901385225, + "learning_rate": 9.344143647108522e-05, + "loss": 2.9979, + "step": 16005 + }, + { + "epoch": 0.7452103266056754, + "grad_norm": 0.3763222722167547, + "learning_rate": 9.344009527732398e-05, + "loss": 3.0165, + "step": 16006 + }, + { + "epoch": 0.7452568847917685, + "grad_norm": 0.38539901128409, + "learning_rate": 9.343875395607036e-05, + "loss": 3.0764, + "step": 16007 + }, + { + "epoch": 0.7453034429778616, + "grad_norm": 0.33588385263617176, + "learning_rate": 9.343741250732828e-05, + "loss": 3.0368, + "step": 16008 + }, + { + "epoch": 0.7453500011639547, + "grad_norm": 0.367767242537905, + "learning_rate": 9.343607093110167e-05, + "loss": 2.9958, + "step": 16009 + }, + { + "epoch": 0.7453965593500477, + "grad_norm": 0.35966686592152053, + "learning_rate": 9.343472922739447e-05, + "loss": 2.9387, + "step": 16010 + }, + { + "epoch": 0.7454431175361408, + "grad_norm": 0.32608103071530986, + "learning_rate": 9.343338739621062e-05, + "loss": 3.0291, + "step": 16011 + }, + { + "epoch": 0.7454896757222339, + "grad_norm": 0.39078293987882173, + "learning_rate": 9.343204543755407e-05, + "loss": 2.9711, + "step": 16012 + }, + { + "epoch": 0.7455362339083269, + "grad_norm": 0.3027008243198328, + "learning_rate": 9.343070335142873e-05, + "loss": 2.9337, + "step": 16013 + }, + { + "epoch": 0.74558279209442, + "grad_norm": 0.3445890445538821, + "learning_rate": 9.342936113783855e-05, + "loss": 2.9776, + "step": 16014 + }, + { + "epoch": 0.745629350280513, + "grad_norm": 0.33662994643400534, + "learning_rate": 9.342801879678749e-05, + "loss": 3.0375, + "step": 16015 + }, + { + "epoch": 0.7456759084666061, + "grad_norm": 0.3475234816609824, + "learning_rate": 9.342667632827946e-05, + "loss": 2.7998, + "step": 16016 + }, + { + "epoch": 0.7457224666526993, + "grad_norm": 0.3075772944003659, + "learning_rate": 9.342533373231843e-05, + "loss": 2.996, + "step": 16017 + }, + { + "epoch": 0.7457690248387923, + "grad_norm": 0.3475236006393034, + "learning_rate": 9.34239910089083e-05, + "loss": 2.8925, + "step": 16018 + }, + { + "epoch": 0.7458155830248854, + "grad_norm": 0.34252526814868395, + "learning_rate": 9.342264815805304e-05, + "loss": 3.0467, + "step": 16019 + }, + { + "epoch": 0.7458621412109784, + "grad_norm": 0.4086583926949803, + "learning_rate": 9.342130517975659e-05, + "loss": 2.9987, + "step": 16020 + }, + { + "epoch": 0.7459086993970715, + "grad_norm": 0.3636311900300417, + "learning_rate": 9.34199620740229e-05, + "loss": 3.0121, + "step": 16021 + }, + { + "epoch": 0.7459552575831646, + "grad_norm": 0.35101172804705727, + "learning_rate": 9.341861884085587e-05, + "loss": 2.879, + "step": 16022 + }, + { + "epoch": 0.7460018157692576, + "grad_norm": 0.3900097754231086, + "learning_rate": 9.34172754802595e-05, + "loss": 2.9775, + "step": 16023 + }, + { + "epoch": 0.7460483739553507, + "grad_norm": 0.3864755315610604, + "learning_rate": 9.341593199223768e-05, + "loss": 2.9057, + "step": 16024 + }, + { + "epoch": 0.7460949321414437, + "grad_norm": 0.4313914925499304, + "learning_rate": 
9.34145883767944e-05, + "loss": 3.0229, + "step": 16025 + }, + { + "epoch": 0.7461414903275368, + "grad_norm": 0.4054607760679213, + "learning_rate": 9.341324463393358e-05, + "loss": 2.9937, + "step": 16026 + }, + { + "epoch": 0.7461880485136299, + "grad_norm": 0.403232993123886, + "learning_rate": 9.341190076365913e-05, + "loss": 2.9487, + "step": 16027 + }, + { + "epoch": 0.746234606699723, + "grad_norm": 0.37814814513275635, + "learning_rate": 9.341055676597505e-05, + "loss": 3.0664, + "step": 16028 + }, + { + "epoch": 0.7462811648858161, + "grad_norm": 0.43750568433573683, + "learning_rate": 9.340921264088527e-05, + "loss": 2.9958, + "step": 16029 + }, + { + "epoch": 0.7463277230719091, + "grad_norm": 0.34177902274951627, + "learning_rate": 9.340786838839373e-05, + "loss": 2.9796, + "step": 16030 + }, + { + "epoch": 0.7463742812580022, + "grad_norm": 0.4037034884217743, + "learning_rate": 9.340652400850436e-05, + "loss": 2.9708, + "step": 16031 + }, + { + "epoch": 0.7464208394440952, + "grad_norm": 0.34387431037250527, + "learning_rate": 9.340517950122112e-05, + "loss": 3.0199, + "step": 16032 + }, + { + "epoch": 0.7464673976301883, + "grad_norm": 0.38440143119510267, + "learning_rate": 9.340383486654795e-05, + "loss": 2.946, + "step": 16033 + }, + { + "epoch": 0.7465139558162814, + "grad_norm": 0.3480850801804205, + "learning_rate": 9.340249010448881e-05, + "loss": 2.9415, + "step": 16034 + }, + { + "epoch": 0.7465605140023744, + "grad_norm": 0.38964920883358695, + "learning_rate": 9.340114521504764e-05, + "loss": 3.0119, + "step": 16035 + }, + { + "epoch": 0.7466070721884676, + "grad_norm": 0.34081809123746326, + "learning_rate": 9.339980019822838e-05, + "loss": 3.0067, + "step": 16036 + }, + { + "epoch": 0.7466536303745606, + "grad_norm": 0.3902335973164285, + "learning_rate": 9.339845505403497e-05, + "loss": 2.9581, + "step": 16037 + }, + { + "epoch": 0.7467001885606537, + "grad_norm": 0.346742078084181, + "learning_rate": 9.339710978247138e-05, + "loss": 2.96, + "step": 16038 + }, + { + "epoch": 0.7467467467467468, + "grad_norm": 0.37462116784335986, + "learning_rate": 9.339576438354155e-05, + "loss": 2.9896, + "step": 16039 + }, + { + "epoch": 0.7467933049328398, + "grad_norm": 0.3480390293585918, + "learning_rate": 9.339441885724942e-05, + "loss": 2.9487, + "step": 16040 + }, + { + "epoch": 0.7468398631189329, + "grad_norm": 0.3656542727949538, + "learning_rate": 9.339307320359896e-05, + "loss": 2.9346, + "step": 16041 + }, + { + "epoch": 0.7468864213050259, + "grad_norm": 0.34391744471417146, + "learning_rate": 9.339172742259408e-05, + "loss": 2.9553, + "step": 16042 + }, + { + "epoch": 0.746932979491119, + "grad_norm": 0.3615871517953049, + "learning_rate": 9.339038151423878e-05, + "loss": 3.0712, + "step": 16043 + }, + { + "epoch": 0.7469795376772121, + "grad_norm": 0.34935001406931726, + "learning_rate": 9.338903547853697e-05, + "loss": 2.9764, + "step": 16044 + }, + { + "epoch": 0.7470260958633052, + "grad_norm": 0.37816057345569865, + "learning_rate": 9.338768931549262e-05, + "loss": 2.9634, + "step": 16045 + }, + { + "epoch": 0.7470726540493983, + "grad_norm": 0.3312959355682346, + "learning_rate": 9.338634302510969e-05, + "loss": 3.0155, + "step": 16046 + }, + { + "epoch": 0.7471192122354913, + "grad_norm": 0.39130634288552, + "learning_rate": 9.338499660739211e-05, + "loss": 3.0478, + "step": 16047 + }, + { + "epoch": 0.7471657704215844, + "grad_norm": 0.3726379576020319, + "learning_rate": 9.338365006234382e-05, + "loss": 2.9184, + "step": 16048 + }, + { + "epoch": 
0.7472123286076774, + "grad_norm": 0.36111745991458166, + "learning_rate": 9.338230338996881e-05, + "loss": 2.9497, + "step": 16049 + }, + { + "epoch": 0.7472588867937705, + "grad_norm": 0.3766860441004378, + "learning_rate": 9.3380956590271e-05, + "loss": 2.8176, + "step": 16050 + }, + { + "epoch": 0.7473054449798636, + "grad_norm": 0.33683005895696905, + "learning_rate": 9.337960966325436e-05, + "loss": 2.9439, + "step": 16051 + }, + { + "epoch": 0.7473520031659566, + "grad_norm": 0.3798868736046508, + "learning_rate": 9.337826260892285e-05, + "loss": 3.039, + "step": 16052 + }, + { + "epoch": 0.7473985613520497, + "grad_norm": 0.3591085043886695, + "learning_rate": 9.337691542728042e-05, + "loss": 2.9705, + "step": 16053 + }, + { + "epoch": 0.7474451195381427, + "grad_norm": 0.3415231906038174, + "learning_rate": 9.337556811833099e-05, + "loss": 2.9019, + "step": 16054 + }, + { + "epoch": 0.7474916777242359, + "grad_norm": 0.38422350905586183, + "learning_rate": 9.337422068207855e-05, + "loss": 2.8906, + "step": 16055 + }, + { + "epoch": 0.747538235910329, + "grad_norm": 0.33649454218629754, + "learning_rate": 9.337287311852705e-05, + "loss": 2.7658, + "step": 16056 + }, + { + "epoch": 0.747584794096422, + "grad_norm": 0.39296718180335527, + "learning_rate": 9.337152542768043e-05, + "loss": 2.9764, + "step": 16057 + }, + { + "epoch": 0.7476313522825151, + "grad_norm": 0.4062180479493937, + "learning_rate": 9.337017760954267e-05, + "loss": 3.01, + "step": 16058 + }, + { + "epoch": 0.7476779104686081, + "grad_norm": 0.3698826727351855, + "learning_rate": 9.336882966411771e-05, + "loss": 2.8574, + "step": 16059 + }, + { + "epoch": 0.7477244686547012, + "grad_norm": 0.3717502551136717, + "learning_rate": 9.336748159140951e-05, + "loss": 2.8495, + "step": 16060 + }, + { + "epoch": 0.7477710268407943, + "grad_norm": 0.37351106732890516, + "learning_rate": 9.336613339142201e-05, + "loss": 3.0105, + "step": 16061 + }, + { + "epoch": 0.7478175850268873, + "grad_norm": 0.35033434318688267, + "learning_rate": 9.33647850641592e-05, + "loss": 2.9989, + "step": 16062 + }, + { + "epoch": 0.7478641432129804, + "grad_norm": 0.3263030296033364, + "learning_rate": 9.336343660962501e-05, + "loss": 3.0263, + "step": 16063 + }, + { + "epoch": 0.7479107013990735, + "grad_norm": 0.3553266966748707, + "learning_rate": 9.33620880278234e-05, + "loss": 2.8749, + "step": 16064 + }, + { + "epoch": 0.7479572595851666, + "grad_norm": 0.337488140730067, + "learning_rate": 9.336073931875833e-05, + "loss": 3.0418, + "step": 16065 + }, + { + "epoch": 0.7480038177712597, + "grad_norm": 0.3520290576633597, + "learning_rate": 9.335939048243377e-05, + "loss": 2.9756, + "step": 16066 + }, + { + "epoch": 0.7480503759573527, + "grad_norm": 0.3124571087002007, + "learning_rate": 9.335804151885367e-05, + "loss": 2.9755, + "step": 16067 + }, + { + "epoch": 0.7480969341434458, + "grad_norm": 0.34161417744057837, + "learning_rate": 9.335669242802199e-05, + "loss": 2.993, + "step": 16068 + }, + { + "epoch": 0.7481434923295388, + "grad_norm": 0.3508778372839244, + "learning_rate": 9.335534320994267e-05, + "loss": 3.0907, + "step": 16069 + }, + { + "epoch": 0.7481900505156319, + "grad_norm": 0.33173178975173157, + "learning_rate": 9.335399386461971e-05, + "loss": 3.0007, + "step": 16070 + }, + { + "epoch": 0.7482366087017249, + "grad_norm": 0.37034282971273574, + "learning_rate": 9.335264439205706e-05, + "loss": 3.0705, + "step": 16071 + }, + { + "epoch": 0.748283166887818, + "grad_norm": 0.352298890503383, + "learning_rate": 
9.335129479225865e-05, + "loss": 2.9444, + "step": 16072 + }, + { + "epoch": 0.7483297250739112, + "grad_norm": 0.35995560964994644, + "learning_rate": 9.334994506522846e-05, + "loss": 2.9289, + "step": 16073 + }, + { + "epoch": 0.7483762832600042, + "grad_norm": 0.3530312079751935, + "learning_rate": 9.334859521097045e-05, + "loss": 2.9392, + "step": 16074 + }, + { + "epoch": 0.7484228414460973, + "grad_norm": 0.31872484898315356, + "learning_rate": 9.33472452294886e-05, + "loss": 2.9176, + "step": 16075 + }, + { + "epoch": 0.7484693996321903, + "grad_norm": 0.34821415661327565, + "learning_rate": 9.334589512078684e-05, + "loss": 2.9676, + "step": 16076 + }, + { + "epoch": 0.7485159578182834, + "grad_norm": 0.35880031993713357, + "learning_rate": 9.334454488486915e-05, + "loss": 3.0658, + "step": 16077 + }, + { + "epoch": 0.7485625160043765, + "grad_norm": 0.33521580356142805, + "learning_rate": 9.334319452173951e-05, + "loss": 2.8952, + "step": 16078 + }, + { + "epoch": 0.7486090741904695, + "grad_norm": 0.32098801922951853, + "learning_rate": 9.334184403140184e-05, + "loss": 2.9703, + "step": 16079 + }, + { + "epoch": 0.7486556323765626, + "grad_norm": 0.3598632426113358, + "learning_rate": 9.334049341386014e-05, + "loss": 2.908, + "step": 16080 + }, + { + "epoch": 0.7487021905626556, + "grad_norm": 0.32399724077704145, + "learning_rate": 9.333914266911836e-05, + "loss": 2.9208, + "step": 16081 + }, + { + "epoch": 0.7487487487487487, + "grad_norm": 0.36083210608989036, + "learning_rate": 9.333779179718048e-05, + "loss": 2.9813, + "step": 16082 + }, + { + "epoch": 0.7487953069348419, + "grad_norm": 0.35309066410691603, + "learning_rate": 9.333644079805042e-05, + "loss": 3.0479, + "step": 16083 + }, + { + "epoch": 0.7488418651209349, + "grad_norm": 0.33013590258181524, + "learning_rate": 9.333508967173218e-05, + "loss": 2.8991, + "step": 16084 + }, + { + "epoch": 0.748888423307028, + "grad_norm": 0.37745778736139124, + "learning_rate": 9.333373841822974e-05, + "loss": 3.0846, + "step": 16085 + }, + { + "epoch": 0.748934981493121, + "grad_norm": 0.3083132075547518, + "learning_rate": 9.333238703754702e-05, + "loss": 3.0519, + "step": 16086 + }, + { + "epoch": 0.7489815396792141, + "grad_norm": 0.3817347319741416, + "learning_rate": 9.333103552968804e-05, + "loss": 3.1062, + "step": 16087 + }, + { + "epoch": 0.7490280978653072, + "grad_norm": 0.3291841630599155, + "learning_rate": 9.332968389465675e-05, + "loss": 2.8916, + "step": 16088 + }, + { + "epoch": 0.7490746560514002, + "grad_norm": 0.36488124311700315, + "learning_rate": 9.332833213245709e-05, + "loss": 2.9264, + "step": 16089 + }, + { + "epoch": 0.7491212142374933, + "grad_norm": 0.3659211211831772, + "learning_rate": 9.332698024309303e-05, + "loss": 2.9659, + "step": 16090 + }, + { + "epoch": 0.7491677724235863, + "grad_norm": 0.35124968554639235, + "learning_rate": 9.332562822656856e-05, + "loss": 2.9648, + "step": 16091 + }, + { + "epoch": 0.7492143306096795, + "grad_norm": 0.39497542228810506, + "learning_rate": 9.332427608288765e-05, + "loss": 3.0155, + "step": 16092 + }, + { + "epoch": 0.7492608887957725, + "grad_norm": 0.39344872632571337, + "learning_rate": 9.332292381205425e-05, + "loss": 3.009, + "step": 16093 + }, + { + "epoch": 0.7493074469818656, + "grad_norm": 0.39364959417281054, + "learning_rate": 9.332157141407235e-05, + "loss": 2.9419, + "step": 16094 + }, + { + "epoch": 0.7493540051679587, + "grad_norm": 0.34564975427234595, + "learning_rate": 9.332021888894586e-05, + "loss": 3.0223, + "step": 16095 + }, + { + 
"epoch": 0.7494005633540517, + "grad_norm": 0.3647558741899326, + "learning_rate": 9.331886623667885e-05, + "loss": 2.9535, + "step": 16096 + }, + { + "epoch": 0.7494471215401448, + "grad_norm": 0.39500167860174024, + "learning_rate": 9.331751345727519e-05, + "loss": 2.9071, + "step": 16097 + }, + { + "epoch": 0.7494936797262378, + "grad_norm": 0.3193189164820351, + "learning_rate": 9.331616055073894e-05, + "loss": 2.9743, + "step": 16098 + }, + { + "epoch": 0.7495402379123309, + "grad_norm": 0.38001604716333187, + "learning_rate": 9.3314807517074e-05, + "loss": 2.9569, + "step": 16099 + }, + { + "epoch": 0.749586796098424, + "grad_norm": 0.3297372224527295, + "learning_rate": 9.331345435628437e-05, + "loss": 2.9775, + "step": 16100 + }, + { + "epoch": 0.749633354284517, + "grad_norm": 0.36365129405831176, + "learning_rate": 9.331210106837402e-05, + "loss": 2.8713, + "step": 16101 + }, + { + "epoch": 0.7496799124706102, + "grad_norm": 0.34465561273906536, + "learning_rate": 9.331074765334692e-05, + "loss": 2.9653, + "step": 16102 + }, + { + "epoch": 0.7497264706567032, + "grad_norm": 0.35297646954971085, + "learning_rate": 9.330939411120705e-05, + "loss": 3.0133, + "step": 16103 + }, + { + "epoch": 0.7497730288427963, + "grad_norm": 0.3224145634306931, + "learning_rate": 9.330804044195836e-05, + "loss": 2.8991, + "step": 16104 + }, + { + "epoch": 0.7498195870288894, + "grad_norm": 0.34804366883121013, + "learning_rate": 9.330668664560484e-05, + "loss": 2.9301, + "step": 16105 + }, + { + "epoch": 0.7498661452149824, + "grad_norm": 0.35220988725737123, + "learning_rate": 9.330533272215048e-05, + "loss": 2.9915, + "step": 16106 + }, + { + "epoch": 0.7499127034010755, + "grad_norm": 0.34851877699431016, + "learning_rate": 9.330397867159922e-05, + "loss": 2.8583, + "step": 16107 + }, + { + "epoch": 0.7499592615871685, + "grad_norm": 0.3707969166940021, + "learning_rate": 9.330262449395504e-05, + "loss": 3.0173, + "step": 16108 + }, + { + "epoch": 0.7500058197732616, + "grad_norm": 0.350891766567217, + "learning_rate": 9.330127018922194e-05, + "loss": 2.9276, + "step": 16109 + }, + { + "epoch": 0.7500523779593548, + "grad_norm": 0.36249597232856323, + "learning_rate": 9.329991575740387e-05, + "loss": 2.8975, + "step": 16110 + }, + { + "epoch": 0.7500989361454478, + "grad_norm": 0.3512879904207731, + "learning_rate": 9.329856119850482e-05, + "loss": 2.9753, + "step": 16111 + }, + { + "epoch": 0.7501454943315409, + "grad_norm": 0.3495691712968626, + "learning_rate": 9.329720651252874e-05, + "loss": 3.014, + "step": 16112 + }, + { + "epoch": 0.7501920525176339, + "grad_norm": 0.3644122618744209, + "learning_rate": 9.329585169947964e-05, + "loss": 3.0572, + "step": 16113 + }, + { + "epoch": 0.750238610703727, + "grad_norm": 0.32733301859097697, + "learning_rate": 9.329449675936149e-05, + "loss": 2.9716, + "step": 16114 + }, + { + "epoch": 0.75028516888982, + "grad_norm": 0.3351681048276205, + "learning_rate": 9.329314169217824e-05, + "loss": 2.8786, + "step": 16115 + }, + { + "epoch": 0.7503317270759131, + "grad_norm": 0.347258600396337, + "learning_rate": 9.329178649793389e-05, + "loss": 3.0431, + "step": 16116 + }, + { + "epoch": 0.7503782852620062, + "grad_norm": 0.34199037590949805, + "learning_rate": 9.329043117663241e-05, + "loss": 3.0234, + "step": 16117 + }, + { + "epoch": 0.7504248434480992, + "grad_norm": 0.34749632040489986, + "learning_rate": 9.328907572827777e-05, + "loss": 2.8496, + "step": 16118 + }, + { + "epoch": 0.7504714016341923, + "grad_norm": 0.34984962658383006, + 
"learning_rate": 9.328772015287397e-05, + "loss": 3.1378, + "step": 16119 + }, + { + "epoch": 0.7505179598202854, + "grad_norm": 0.32150840060543884, + "learning_rate": 9.328636445042497e-05, + "loss": 2.7838, + "step": 16120 + }, + { + "epoch": 0.7505645180063785, + "grad_norm": 0.3097963861566606, + "learning_rate": 9.328500862093476e-05, + "loss": 2.9264, + "step": 16121 + }, + { + "epoch": 0.7506110761924716, + "grad_norm": 0.32384609242234114, + "learning_rate": 9.328365266440732e-05, + "loss": 2.9491, + "step": 16122 + }, + { + "epoch": 0.7506576343785646, + "grad_norm": 0.35066750345505154, + "learning_rate": 9.328229658084661e-05, + "loss": 2.9721, + "step": 16123 + }, + { + "epoch": 0.7507041925646577, + "grad_norm": 0.31553344824561513, + "learning_rate": 9.328094037025663e-05, + "loss": 2.9898, + "step": 16124 + }, + { + "epoch": 0.7507507507507507, + "grad_norm": 0.345139732883922, + "learning_rate": 9.327958403264135e-05, + "loss": 3.0142, + "step": 16125 + }, + { + "epoch": 0.7507973089368438, + "grad_norm": 0.3561580661535494, + "learning_rate": 9.327822756800476e-05, + "loss": 2.8687, + "step": 16126 + }, + { + "epoch": 0.7508438671229369, + "grad_norm": 0.3294741611473261, + "learning_rate": 9.327687097635084e-05, + "loss": 2.8893, + "step": 16127 + }, + { + "epoch": 0.7508904253090299, + "grad_norm": 0.394208696384307, + "learning_rate": 9.327551425768355e-05, + "loss": 2.9538, + "step": 16128 + }, + { + "epoch": 0.7509369834951231, + "grad_norm": 0.40990106602531734, + "learning_rate": 9.32741574120069e-05, + "loss": 3.0219, + "step": 16129 + }, + { + "epoch": 0.7509835416812161, + "grad_norm": 0.3540325178529774, + "learning_rate": 9.327280043932487e-05, + "loss": 2.9576, + "step": 16130 + }, + { + "epoch": 0.7510300998673092, + "grad_norm": 0.3633303524377212, + "learning_rate": 9.327144333964141e-05, + "loss": 2.8425, + "step": 16131 + }, + { + "epoch": 0.7510766580534023, + "grad_norm": 0.37927171384574393, + "learning_rate": 9.327008611296056e-05, + "loss": 2.9985, + "step": 16132 + }, + { + "epoch": 0.7511232162394953, + "grad_norm": 0.3406224379420291, + "learning_rate": 9.326872875928625e-05, + "loss": 2.8982, + "step": 16133 + }, + { + "epoch": 0.7511697744255884, + "grad_norm": 0.36688768987449827, + "learning_rate": 9.326737127862248e-05, + "loss": 2.9743, + "step": 16134 + }, + { + "epoch": 0.7512163326116814, + "grad_norm": 0.3086932553056972, + "learning_rate": 9.326601367097325e-05, + "loss": 2.9038, + "step": 16135 + }, + { + "epoch": 0.7512628907977745, + "grad_norm": 0.35101857156740396, + "learning_rate": 9.326465593634252e-05, + "loss": 2.9576, + "step": 16136 + }, + { + "epoch": 0.7513094489838675, + "grad_norm": 0.30489241295676095, + "learning_rate": 9.326329807473428e-05, + "loss": 3.0685, + "step": 16137 + }, + { + "epoch": 0.7513560071699606, + "grad_norm": 0.3783275321687328, + "learning_rate": 9.326194008615255e-05, + "loss": 2.8819, + "step": 16138 + }, + { + "epoch": 0.7514025653560538, + "grad_norm": 0.31488896007432016, + "learning_rate": 9.326058197060127e-05, + "loss": 2.9547, + "step": 16139 + }, + { + "epoch": 0.7514491235421468, + "grad_norm": 0.343492085853163, + "learning_rate": 9.325922372808445e-05, + "loss": 2.8515, + "step": 16140 + }, + { + "epoch": 0.7514956817282399, + "grad_norm": 0.32940290451862597, + "learning_rate": 9.325786535860605e-05, + "loss": 3.068, + "step": 16141 + }, + { + "epoch": 0.7515422399143329, + "grad_norm": 0.34613335844308996, + "learning_rate": 9.325650686217012e-05, + "loss": 2.9389, + "step": 16142 
+ }, + { + "epoch": 0.751588798100426, + "grad_norm": 0.34968448390775747, + "learning_rate": 9.325514823878056e-05, + "loss": 2.9757, + "step": 16143 + }, + { + "epoch": 0.7516353562865191, + "grad_norm": 0.3748594996736295, + "learning_rate": 9.325378948844143e-05, + "loss": 3.0731, + "step": 16144 + }, + { + "epoch": 0.7516819144726121, + "grad_norm": 0.3447471369292479, + "learning_rate": 9.325243061115667e-05, + "loss": 2.8991, + "step": 16145 + }, + { + "epoch": 0.7517284726587052, + "grad_norm": 0.3633985985718458, + "learning_rate": 9.32510716069303e-05, + "loss": 2.9826, + "step": 16146 + }, + { + "epoch": 0.7517750308447982, + "grad_norm": 0.3468894303101351, + "learning_rate": 9.324971247576628e-05, + "loss": 3.0718, + "step": 16147 + }, + { + "epoch": 0.7518215890308914, + "grad_norm": 0.33068403575787103, + "learning_rate": 9.324835321766863e-05, + "loss": 2.9311, + "step": 16148 + }, + { + "epoch": 0.7518681472169845, + "grad_norm": 0.3524595419784991, + "learning_rate": 9.324699383264131e-05, + "loss": 2.9918, + "step": 16149 + }, + { + "epoch": 0.7519147054030775, + "grad_norm": 0.3400397978979584, + "learning_rate": 9.324563432068833e-05, + "loss": 3.0756, + "step": 16150 + }, + { + "epoch": 0.7519612635891706, + "grad_norm": 0.31837056404837843, + "learning_rate": 9.324427468181369e-05, + "loss": 2.9402, + "step": 16151 + }, + { + "epoch": 0.7520078217752636, + "grad_norm": 0.34089615617756824, + "learning_rate": 9.324291491602135e-05, + "loss": 2.9331, + "step": 16152 + }, + { + "epoch": 0.7520543799613567, + "grad_norm": 0.31090681016066485, + "learning_rate": 9.324155502331531e-05, + "loss": 2.9549, + "step": 16153 + }, + { + "epoch": 0.7521009381474498, + "grad_norm": 0.352856445105909, + "learning_rate": 9.324019500369958e-05, + "loss": 2.9386, + "step": 16154 + }, + { + "epoch": 0.7521474963335428, + "grad_norm": 0.36089694870319616, + "learning_rate": 9.323883485717813e-05, + "loss": 2.951, + "step": 16155 + }, + { + "epoch": 0.752194054519636, + "grad_norm": 0.3534513662905156, + "learning_rate": 9.323747458375496e-05, + "loss": 3.0318, + "step": 16156 + }, + { + "epoch": 0.752240612705729, + "grad_norm": 0.359139980283891, + "learning_rate": 9.323611418343406e-05, + "loss": 2.816, + "step": 16157 + }, + { + "epoch": 0.7522871708918221, + "grad_norm": 0.3769401426619601, + "learning_rate": 9.32347536562194e-05, + "loss": 2.956, + "step": 16158 + }, + { + "epoch": 0.7523337290779151, + "grad_norm": 0.367345501179977, + "learning_rate": 9.323339300211504e-05, + "loss": 2.8815, + "step": 16159 + }, + { + "epoch": 0.7523802872640082, + "grad_norm": 0.3497376024159724, + "learning_rate": 9.323203222112492e-05, + "loss": 2.8297, + "step": 16160 + }, + { + "epoch": 0.7524268454501013, + "grad_norm": 0.35807648749531473, + "learning_rate": 9.323067131325302e-05, + "loss": 3.0238, + "step": 16161 + }, + { + "epoch": 0.7524734036361943, + "grad_norm": 0.3312462979049998, + "learning_rate": 9.32293102785034e-05, + "loss": 2.9515, + "step": 16162 + }, + { + "epoch": 0.7525199618222874, + "grad_norm": 0.3716513065809379, + "learning_rate": 9.322794911687997e-05, + "loss": 3.019, + "step": 16163 + }, + { + "epoch": 0.7525665200083804, + "grad_norm": 0.34722432903179357, + "learning_rate": 9.322658782838679e-05, + "loss": 3.0161, + "step": 16164 + }, + { + "epoch": 0.7526130781944735, + "grad_norm": 0.3742443658298391, + "learning_rate": 9.322522641302783e-05, + "loss": 2.9002, + "step": 16165 + }, + { + "epoch": 0.7526596363805667, + "grad_norm": 0.33055037658096303, + 
"learning_rate": 9.322386487080709e-05, + "loss": 3.0046, + "step": 16166 + }, + { + "epoch": 0.7527061945666597, + "grad_norm": 0.3613653602799368, + "learning_rate": 9.322250320172856e-05, + "loss": 2.9097, + "step": 16167 + }, + { + "epoch": 0.7527527527527528, + "grad_norm": 0.3361432144712644, + "learning_rate": 9.322114140579626e-05, + "loss": 2.8898, + "step": 16168 + }, + { + "epoch": 0.7527993109388458, + "grad_norm": 0.35303723728654646, + "learning_rate": 9.321977948301414e-05, + "loss": 2.923, + "step": 16169 + }, + { + "epoch": 0.7528458691249389, + "grad_norm": 0.35234579265364907, + "learning_rate": 9.321841743338624e-05, + "loss": 2.9198, + "step": 16170 + }, + { + "epoch": 0.752892427311032, + "grad_norm": 0.44414812316870106, + "learning_rate": 9.321705525691653e-05, + "loss": 2.95, + "step": 16171 + }, + { + "epoch": 0.752938985497125, + "grad_norm": 0.38784134700729733, + "learning_rate": 9.321569295360904e-05, + "loss": 3.0425, + "step": 16172 + }, + { + "epoch": 0.7529855436832181, + "grad_norm": 0.3523779556182826, + "learning_rate": 9.321433052346773e-05, + "loss": 3.007, + "step": 16173 + }, + { + "epoch": 0.7530321018693111, + "grad_norm": 0.39571865879117424, + "learning_rate": 9.321296796649663e-05, + "loss": 2.9943, + "step": 16174 + }, + { + "epoch": 0.7530786600554042, + "grad_norm": 0.358802377445714, + "learning_rate": 9.321160528269972e-05, + "loss": 2.9456, + "step": 16175 + }, + { + "epoch": 0.7531252182414974, + "grad_norm": 0.3696525813092359, + "learning_rate": 9.321024247208101e-05, + "loss": 2.9244, + "step": 16176 + }, + { + "epoch": 0.7531717764275904, + "grad_norm": 0.37528665593181953, + "learning_rate": 9.32088795346445e-05, + "loss": 3.0415, + "step": 16177 + }, + { + "epoch": 0.7532183346136835, + "grad_norm": 0.3250293729102564, + "learning_rate": 9.320751647039418e-05, + "loss": 2.872, + "step": 16178 + }, + { + "epoch": 0.7532648927997765, + "grad_norm": 0.3878964629923488, + "learning_rate": 9.320615327933406e-05, + "loss": 3.0029, + "step": 16179 + }, + { + "epoch": 0.7533114509858696, + "grad_norm": 0.3432882855214082, + "learning_rate": 9.320478996146813e-05, + "loss": 3.1093, + "step": 16180 + }, + { + "epoch": 0.7533580091719626, + "grad_norm": 0.3884475005089288, + "learning_rate": 9.32034265168004e-05, + "loss": 2.9101, + "step": 16181 + }, + { + "epoch": 0.7534045673580557, + "grad_norm": 0.31551432471172275, + "learning_rate": 9.320206294533486e-05, + "loss": 2.9448, + "step": 16182 + }, + { + "epoch": 0.7534511255441488, + "grad_norm": 0.3829325528908079, + "learning_rate": 9.320069924707553e-05, + "loss": 2.9093, + "step": 16183 + }, + { + "epoch": 0.7534976837302418, + "grad_norm": 0.3613563873437009, + "learning_rate": 9.319933542202642e-05, + "loss": 2.8984, + "step": 16184 + }, + { + "epoch": 0.753544241916335, + "grad_norm": 0.3450174474670138, + "learning_rate": 9.319797147019149e-05, + "loss": 2.8147, + "step": 16185 + }, + { + "epoch": 0.753590800102428, + "grad_norm": 0.4094917966701305, + "learning_rate": 9.319660739157479e-05, + "loss": 2.9894, + "step": 16186 + }, + { + "epoch": 0.7536373582885211, + "grad_norm": 0.3606349140098105, + "learning_rate": 9.319524318618028e-05, + "loss": 2.9094, + "step": 16187 + }, + { + "epoch": 0.7536839164746142, + "grad_norm": 0.3442504552369979, + "learning_rate": 9.319387885401201e-05, + "loss": 2.9414, + "step": 16188 + }, + { + "epoch": 0.7537304746607072, + "grad_norm": 0.39848410520541494, + "learning_rate": 9.319251439507395e-05, + "loss": 2.9677, + "step": 16189 + }, + { + 
"epoch": 0.7537770328468003, + "grad_norm": 0.33601021441643597, + "learning_rate": 9.319114980937012e-05, + "loss": 2.931, + "step": 16190 + }, + { + "epoch": 0.7538235910328933, + "grad_norm": 0.38152012579739225, + "learning_rate": 9.31897850969045e-05, + "loss": 3.0197, + "step": 16191 + }, + { + "epoch": 0.7538701492189864, + "grad_norm": 0.38964711316613554, + "learning_rate": 9.318842025768113e-05, + "loss": 2.8804, + "step": 16192 + }, + { + "epoch": 0.7539167074050795, + "grad_norm": 0.3348407655650661, + "learning_rate": 9.318705529170401e-05, + "loss": 2.9734, + "step": 16193 + }, + { + "epoch": 0.7539632655911725, + "grad_norm": 0.3410393549784906, + "learning_rate": 9.318569019897712e-05, + "loss": 2.9776, + "step": 16194 + }, + { + "epoch": 0.7540098237772657, + "grad_norm": 0.34001991891213257, + "learning_rate": 9.31843249795045e-05, + "loss": 2.8981, + "step": 16195 + }, + { + "epoch": 0.7540563819633587, + "grad_norm": 0.33463103754214507, + "learning_rate": 9.318295963329014e-05, + "loss": 2.8884, + "step": 16196 + }, + { + "epoch": 0.7541029401494518, + "grad_norm": 0.33909561935473237, + "learning_rate": 9.318159416033804e-05, + "loss": 3.0011, + "step": 16197 + }, + { + "epoch": 0.7541494983355449, + "grad_norm": 0.3605561324994669, + "learning_rate": 9.318022856065222e-05, + "loss": 2.9147, + "step": 16198 + }, + { + "epoch": 0.7541960565216379, + "grad_norm": 0.30726385204701, + "learning_rate": 9.317886283423667e-05, + "loss": 2.9133, + "step": 16199 + }, + { + "epoch": 0.754242614707731, + "grad_norm": 0.3493787195906619, + "learning_rate": 9.317749698109543e-05, + "loss": 3.0487, + "step": 16200 + }, + { + "epoch": 0.754289172893824, + "grad_norm": 0.3540470992204947, + "learning_rate": 9.317613100123248e-05, + "loss": 2.9389, + "step": 16201 + }, + { + "epoch": 0.7543357310799171, + "grad_norm": 0.3311847205281745, + "learning_rate": 9.317476489465185e-05, + "loss": 2.9218, + "step": 16202 + }, + { + "epoch": 0.7543822892660101, + "grad_norm": 0.37361874604617823, + "learning_rate": 9.317339866135753e-05, + "loss": 2.9899, + "step": 16203 + }, + { + "epoch": 0.7544288474521033, + "grad_norm": 0.3456423627423066, + "learning_rate": 9.317203230135354e-05, + "loss": 3.0046, + "step": 16204 + }, + { + "epoch": 0.7544754056381964, + "grad_norm": 0.3866942825879689, + "learning_rate": 9.317066581464387e-05, + "loss": 2.9953, + "step": 16205 + }, + { + "epoch": 0.7545219638242894, + "grad_norm": 0.366019971273964, + "learning_rate": 9.316929920123256e-05, + "loss": 2.9901, + "step": 16206 + }, + { + "epoch": 0.7545685220103825, + "grad_norm": 0.3567374618520645, + "learning_rate": 9.316793246112363e-05, + "loss": 3.0633, + "step": 16207 + }, + { + "epoch": 0.7546150801964755, + "grad_norm": 0.3823118805136, + "learning_rate": 9.316656559432105e-05, + "loss": 3.0186, + "step": 16208 + }, + { + "epoch": 0.7546616383825686, + "grad_norm": 0.3384546305285264, + "learning_rate": 9.316519860082886e-05, + "loss": 2.9643, + "step": 16209 + }, + { + "epoch": 0.7547081965686617, + "grad_norm": 0.4164151160174309, + "learning_rate": 9.316383148065107e-05, + "loss": 3.091, + "step": 16210 + }, + { + "epoch": 0.7547547547547547, + "grad_norm": 0.3511612570809866, + "learning_rate": 9.316246423379167e-05, + "loss": 2.9175, + "step": 16211 + }, + { + "epoch": 0.7548013129408478, + "grad_norm": 0.36241696195923656, + "learning_rate": 9.316109686025469e-05, + "loss": 3.0081, + "step": 16212 + }, + { + "epoch": 0.7548478711269409, + "grad_norm": 0.399950001632956, + "learning_rate": 
9.315972936004416e-05, + "loss": 3.0516, + "step": 16213 + }, + { + "epoch": 0.754894429313034, + "grad_norm": 0.3357386574516651, + "learning_rate": 9.315836173316408e-05, + "loss": 2.9775, + "step": 16214 + }, + { + "epoch": 0.7549409874991271, + "grad_norm": 0.4047241531524294, + "learning_rate": 9.315699397961844e-05, + "loss": 2.9887, + "step": 16215 + }, + { + "epoch": 0.7549875456852201, + "grad_norm": 0.37204164472354434, + "learning_rate": 9.315562609941129e-05, + "loss": 2.9456, + "step": 16216 + }, + { + "epoch": 0.7550341038713132, + "grad_norm": 0.3552188947732644, + "learning_rate": 9.315425809254661e-05, + "loss": 2.8707, + "step": 16217 + }, + { + "epoch": 0.7550806620574062, + "grad_norm": 0.3702481205275278, + "learning_rate": 9.315288995902845e-05, + "loss": 3.1014, + "step": 16218 + }, + { + "epoch": 0.7551272202434993, + "grad_norm": 0.34855152916379845, + "learning_rate": 9.31515216988608e-05, + "loss": 2.9679, + "step": 16219 + }, + { + "epoch": 0.7551737784295924, + "grad_norm": 0.33191074306297236, + "learning_rate": 9.31501533120477e-05, + "loss": 2.9396, + "step": 16220 + }, + { + "epoch": 0.7552203366156854, + "grad_norm": 0.3505730228158848, + "learning_rate": 9.314878479859313e-05, + "loss": 2.9832, + "step": 16221 + }, + { + "epoch": 0.7552668948017786, + "grad_norm": 0.3400310379963953, + "learning_rate": 9.314741615850115e-05, + "loss": 2.7895, + "step": 16222 + }, + { + "epoch": 0.7553134529878716, + "grad_norm": 0.3888443249070307, + "learning_rate": 9.314604739177573e-05, + "loss": 2.9306, + "step": 16223 + }, + { + "epoch": 0.7553600111739647, + "grad_norm": 0.3893311602254822, + "learning_rate": 9.314467849842093e-05, + "loss": 2.8878, + "step": 16224 + }, + { + "epoch": 0.7554065693600577, + "grad_norm": 0.3558900174542686, + "learning_rate": 9.314330947844074e-05, + "loss": 2.94, + "step": 16225 + }, + { + "epoch": 0.7554531275461508, + "grad_norm": 0.3624898229414018, + "learning_rate": 9.31419403318392e-05, + "loss": 2.8923, + "step": 16226 + }, + { + "epoch": 0.7554996857322439, + "grad_norm": 0.33339871328753273, + "learning_rate": 9.31405710586203e-05, + "loss": 2.9444, + "step": 16227 + }, + { + "epoch": 0.7555462439183369, + "grad_norm": 0.38982534390485885, + "learning_rate": 9.313920165878809e-05, + "loss": 2.8759, + "step": 16228 + }, + { + "epoch": 0.75559280210443, + "grad_norm": 0.34861252442458834, + "learning_rate": 9.313783213234656e-05, + "loss": 2.9021, + "step": 16229 + }, + { + "epoch": 0.755639360290523, + "grad_norm": 0.3898642171318118, + "learning_rate": 9.313646247929974e-05, + "loss": 2.9122, + "step": 16230 + }, + { + "epoch": 0.7556859184766161, + "grad_norm": 0.37071623901350864, + "learning_rate": 9.313509269965166e-05, + "loss": 2.977, + "step": 16231 + }, + { + "epoch": 0.7557324766627093, + "grad_norm": 0.32145870958627937, + "learning_rate": 9.313372279340634e-05, + "loss": 2.8382, + "step": 16232 + }, + { + "epoch": 0.7557790348488023, + "grad_norm": 0.35112330843753525, + "learning_rate": 9.313235276056777e-05, + "loss": 2.9444, + "step": 16233 + }, + { + "epoch": 0.7558255930348954, + "grad_norm": 0.33503289175434015, + "learning_rate": 9.313098260114002e-05, + "loss": 2.983, + "step": 16234 + }, + { + "epoch": 0.7558721512209884, + "grad_norm": 0.3723279766670027, + "learning_rate": 9.312961231512706e-05, + "loss": 2.9299, + "step": 16235 + }, + { + "epoch": 0.7559187094070815, + "grad_norm": 0.3416205956445901, + "learning_rate": 9.312824190253295e-05, + "loss": 2.8287, + "step": 16236 + }, + { + "epoch": 
0.7559652675931746, + "grad_norm": 0.370927040351537, + "learning_rate": 9.312687136336169e-05, + "loss": 3.0131, + "step": 16237 + }, + { + "epoch": 0.7560118257792676, + "grad_norm": 0.3674762067282672, + "learning_rate": 9.312550069761732e-05, + "loss": 2.9674, + "step": 16238 + }, + { + "epoch": 0.7560583839653607, + "grad_norm": 0.35068037012746917, + "learning_rate": 9.312412990530387e-05, + "loss": 3.023, + "step": 16239 + }, + { + "epoch": 0.7561049421514537, + "grad_norm": 0.3493806192509862, + "learning_rate": 9.312275898642532e-05, + "loss": 2.9575, + "step": 16240 + }, + { + "epoch": 0.7561515003375469, + "grad_norm": 0.4331959452664702, + "learning_rate": 9.312138794098572e-05, + "loss": 3.0052, + "step": 16241 + }, + { + "epoch": 0.75619805852364, + "grad_norm": 0.3580419902509911, + "learning_rate": 9.312001676898911e-05, + "loss": 2.9838, + "step": 16242 + }, + { + "epoch": 0.756244616709733, + "grad_norm": 0.40495341524602935, + "learning_rate": 9.311864547043949e-05, + "loss": 3.0844, + "step": 16243 + }, + { + "epoch": 0.7562911748958261, + "grad_norm": 0.4093682061134849, + "learning_rate": 9.311727404534091e-05, + "loss": 3.0267, + "step": 16244 + }, + { + "epoch": 0.7563377330819191, + "grad_norm": 0.3524369715971266, + "learning_rate": 9.311590249369736e-05, + "loss": 2.9655, + "step": 16245 + }, + { + "epoch": 0.7563842912680122, + "grad_norm": 0.37549663856005255, + "learning_rate": 9.311453081551289e-05, + "loss": 2.9416, + "step": 16246 + }, + { + "epoch": 0.7564308494541052, + "grad_norm": 0.34874599616227786, + "learning_rate": 9.311315901079151e-05, + "loss": 3.0909, + "step": 16247 + }, + { + "epoch": 0.7564774076401983, + "grad_norm": 0.38067112356217064, + "learning_rate": 9.311178707953727e-05, + "loss": 3.0227, + "step": 16248 + }, + { + "epoch": 0.7565239658262914, + "grad_norm": 0.35053533080432825, + "learning_rate": 9.311041502175418e-05, + "loss": 3.0333, + "step": 16249 + }, + { + "epoch": 0.7565705240123844, + "grad_norm": 0.37057979091325816, + "learning_rate": 9.310904283744627e-05, + "loss": 2.9148, + "step": 16250 + }, + { + "epoch": 0.7566170821984776, + "grad_norm": 0.3491354951322471, + "learning_rate": 9.310767052661756e-05, + "loss": 2.9029, + "step": 16251 + }, + { + "epoch": 0.7566636403845706, + "grad_norm": 0.39009854716905734, + "learning_rate": 9.31062980892721e-05, + "loss": 2.9735, + "step": 16252 + }, + { + "epoch": 0.7567101985706637, + "grad_norm": 0.36075999465010533, + "learning_rate": 9.310492552541387e-05, + "loss": 2.9144, + "step": 16253 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 0.3606437631785581, + "learning_rate": 9.310355283504695e-05, + "loss": 2.7859, + "step": 16254 + }, + { + "epoch": 0.7568033149428498, + "grad_norm": 0.3789022046330626, + "learning_rate": 9.310218001817534e-05, + "loss": 3.0938, + "step": 16255 + }, + { + "epoch": 0.7568498731289429, + "grad_norm": 0.3356163308197847, + "learning_rate": 9.310080707480309e-05, + "loss": 2.9049, + "step": 16256 + }, + { + "epoch": 0.7568964313150359, + "grad_norm": 0.38817800440164935, + "learning_rate": 9.309943400493422e-05, + "loss": 3.027, + "step": 16257 + }, + { + "epoch": 0.756942989501129, + "grad_norm": 0.3170890017675389, + "learning_rate": 9.309806080857274e-05, + "loss": 2.9224, + "step": 16258 + }, + { + "epoch": 0.7569895476872222, + "grad_norm": 0.3952313962927293, + "learning_rate": 9.30966874857227e-05, + "loss": 3.0002, + "step": 16259 + }, + { + "epoch": 0.7570361058733152, + "grad_norm": 0.3785771701540082, + "learning_rate": 
9.309531403638816e-05, + "loss": 2.9444, + "step": 16260 + }, + { + "epoch": 0.7570826640594083, + "grad_norm": 0.33639133446283453, + "learning_rate": 9.309394046057309e-05, + "loss": 3.0065, + "step": 16261 + }, + { + "epoch": 0.7571292222455013, + "grad_norm": 0.39827120959765644, + "learning_rate": 9.309256675828155e-05, + "loss": 2.9457, + "step": 16262 + }, + { + "epoch": 0.7571757804315944, + "grad_norm": 0.35567476716351176, + "learning_rate": 9.309119292951758e-05, + "loss": 2.9511, + "step": 16263 + }, + { + "epoch": 0.7572223386176875, + "grad_norm": 0.3325722766056881, + "learning_rate": 9.308981897428519e-05, + "loss": 2.9797, + "step": 16264 + }, + { + "epoch": 0.7572688968037805, + "grad_norm": 0.3585596721416391, + "learning_rate": 9.308844489258843e-05, + "loss": 2.9271, + "step": 16265 + }, + { + "epoch": 0.7573154549898736, + "grad_norm": 0.33884170265874825, + "learning_rate": 9.308707068443134e-05, + "loss": 2.9542, + "step": 16266 + }, + { + "epoch": 0.7573620131759666, + "grad_norm": 0.3504090307693163, + "learning_rate": 9.308569634981794e-05, + "loss": 3.0668, + "step": 16267 + }, + { + "epoch": 0.7574085713620597, + "grad_norm": 0.36012736613339075, + "learning_rate": 9.308432188875227e-05, + "loss": 2.9803, + "step": 16268 + }, + { + "epoch": 0.7574551295481528, + "grad_norm": 0.38843700823043936, + "learning_rate": 9.308294730123835e-05, + "loss": 3.0431, + "step": 16269 + }, + { + "epoch": 0.7575016877342459, + "grad_norm": 0.3777976537471312, + "learning_rate": 9.308157258728023e-05, + "loss": 2.9687, + "step": 16270 + }, + { + "epoch": 0.757548245920339, + "grad_norm": 0.4059837331266386, + "learning_rate": 9.308019774688192e-05, + "loss": 2.9949, + "step": 16271 + }, + { + "epoch": 0.757594804106432, + "grad_norm": 0.3633037574349005, + "learning_rate": 9.307882278004751e-05, + "loss": 2.9974, + "step": 16272 + }, + { + "epoch": 0.7576413622925251, + "grad_norm": 0.41152375478506936, + "learning_rate": 9.307744768678097e-05, + "loss": 2.9916, + "step": 16273 + }, + { + "epoch": 0.7576879204786181, + "grad_norm": 0.3219326737333849, + "learning_rate": 9.307607246708638e-05, + "loss": 2.928, + "step": 16274 + }, + { + "epoch": 0.7577344786647112, + "grad_norm": 0.4079069213209352, + "learning_rate": 9.307469712096775e-05, + "loss": 2.9962, + "step": 16275 + }, + { + "epoch": 0.7577810368508043, + "grad_norm": 0.3803901315456914, + "learning_rate": 9.307332164842912e-05, + "loss": 2.9342, + "step": 16276 + }, + { + "epoch": 0.7578275950368973, + "grad_norm": 0.33047414043935547, + "learning_rate": 9.307194604947456e-05, + "loss": 3.0171, + "step": 16277 + }, + { + "epoch": 0.7578741532229905, + "grad_norm": 0.3661363091922712, + "learning_rate": 9.307057032410806e-05, + "loss": 3.02, + "step": 16278 + }, + { + "epoch": 0.7579207114090835, + "grad_norm": 0.3547181822894212, + "learning_rate": 9.306919447233369e-05, + "loss": 2.9357, + "step": 16279 + }, + { + "epoch": 0.7579672695951766, + "grad_norm": 0.34479980555472667, + "learning_rate": 9.306781849415548e-05, + "loss": 2.944, + "step": 16280 + }, + { + "epoch": 0.7580138277812697, + "grad_norm": 0.33697422664478766, + "learning_rate": 9.306644238957745e-05, + "loss": 2.9635, + "step": 16281 + }, + { + "epoch": 0.7580603859673627, + "grad_norm": 0.34735214676684545, + "learning_rate": 9.306506615860366e-05, + "loss": 2.8903, + "step": 16282 + }, + { + "epoch": 0.7581069441534558, + "grad_norm": 0.38255764258164565, + "learning_rate": 9.306368980123816e-05, + "loss": 2.9473, + "step": 16283 + }, + { + 
"epoch": 0.7581535023395488, + "grad_norm": 0.3257299188925306, + "learning_rate": 9.306231331748496e-05, + "loss": 2.9137, + "step": 16284 + }, + { + "epoch": 0.7582000605256419, + "grad_norm": 0.3675163782623269, + "learning_rate": 9.306093670734813e-05, + "loss": 3.0012, + "step": 16285 + }, + { + "epoch": 0.758246618711735, + "grad_norm": 0.3614724098170271, + "learning_rate": 9.305955997083167e-05, + "loss": 3.0741, + "step": 16286 + }, + { + "epoch": 0.758293176897828, + "grad_norm": 0.3721056413262075, + "learning_rate": 9.305818310793965e-05, + "loss": 2.941, + "step": 16287 + }, + { + "epoch": 0.7583397350839212, + "grad_norm": 0.3366574925517575, + "learning_rate": 9.30568061186761e-05, + "loss": 2.9779, + "step": 16288 + }, + { + "epoch": 0.7583862932700142, + "grad_norm": 0.36978010780744186, + "learning_rate": 9.305542900304507e-05, + "loss": 3.047, + "step": 16289 + }, + { + "epoch": 0.7584328514561073, + "grad_norm": 0.3323147924255253, + "learning_rate": 9.30540517610506e-05, + "loss": 2.8888, + "step": 16290 + }, + { + "epoch": 0.7584794096422003, + "grad_norm": 0.350000696183532, + "learning_rate": 9.305267439269673e-05, + "loss": 3.1178, + "step": 16291 + }, + { + "epoch": 0.7585259678282934, + "grad_norm": 0.3107046883067405, + "learning_rate": 9.305129689798749e-05, + "loss": 3.0563, + "step": 16292 + }, + { + "epoch": 0.7585725260143865, + "grad_norm": 0.3732417879503091, + "learning_rate": 9.304991927692694e-05, + "loss": 2.9164, + "step": 16293 + }, + { + "epoch": 0.7586190842004795, + "grad_norm": 0.3471875361188818, + "learning_rate": 9.304854152951914e-05, + "loss": 2.9522, + "step": 16294 + }, + { + "epoch": 0.7586656423865726, + "grad_norm": 0.3585313853599954, + "learning_rate": 9.304716365576809e-05, + "loss": 2.9791, + "step": 16295 + }, + { + "epoch": 0.7587122005726656, + "grad_norm": 0.3571169856319204, + "learning_rate": 9.304578565567786e-05, + "loss": 2.8791, + "step": 16296 + }, + { + "epoch": 0.7587587587587588, + "grad_norm": 0.35087894604907466, + "learning_rate": 9.304440752925249e-05, + "loss": 2.992, + "step": 16297 + }, + { + "epoch": 0.7588053169448519, + "grad_norm": 0.37939722449035895, + "learning_rate": 9.304302927649602e-05, + "loss": 2.9299, + "step": 16298 + }, + { + "epoch": 0.7588518751309449, + "grad_norm": 0.3088336117171583, + "learning_rate": 9.30416508974125e-05, + "loss": 2.9502, + "step": 16299 + }, + { + "epoch": 0.758898433317038, + "grad_norm": 0.35748423713838806, + "learning_rate": 9.3040272392006e-05, + "loss": 3.0035, + "step": 16300 + }, + { + "epoch": 0.758944991503131, + "grad_norm": 0.33336649279530006, + "learning_rate": 9.303889376028049e-05, + "loss": 2.9731, + "step": 16301 + }, + { + "epoch": 0.7589915496892241, + "grad_norm": 0.36751581934942945, + "learning_rate": 9.303751500224011e-05, + "loss": 2.9685, + "step": 16302 + }, + { + "epoch": 0.7590381078753172, + "grad_norm": 0.32789668683772566, + "learning_rate": 9.303613611788883e-05, + "loss": 2.991, + "step": 16303 + }, + { + "epoch": 0.7590846660614102, + "grad_norm": 0.3652697293436802, + "learning_rate": 9.303475710723078e-05, + "loss": 2.9656, + "step": 16304 + }, + { + "epoch": 0.7591312242475033, + "grad_norm": 0.3205065081936454, + "learning_rate": 9.303337797026993e-05, + "loss": 3.0, + "step": 16305 + }, + { + "epoch": 0.7591777824335963, + "grad_norm": 0.34535951565717804, + "learning_rate": 9.303199870701033e-05, + "loss": 2.8266, + "step": 16306 + }, + { + "epoch": 0.7592243406196895, + "grad_norm": 0.33841445918209834, + "learning_rate": 
9.303061931745609e-05, + "loss": 3.0205, + "step": 16307 + }, + { + "epoch": 0.7592708988057826, + "grad_norm": 0.33991114921313725, + "learning_rate": 9.30292398016112e-05, + "loss": 3.0384, + "step": 16308 + }, + { + "epoch": 0.7593174569918756, + "grad_norm": 0.32020456958615956, + "learning_rate": 9.302786015947974e-05, + "loss": 2.9046, + "step": 16309 + }, + { + "epoch": 0.7593640151779687, + "grad_norm": 0.3340246284618576, + "learning_rate": 9.302648039106574e-05, + "loss": 2.9853, + "step": 16310 + }, + { + "epoch": 0.7594105733640617, + "grad_norm": 0.35396464895918445, + "learning_rate": 9.302510049637328e-05, + "loss": 2.9647, + "step": 16311 + }, + { + "epoch": 0.7594571315501548, + "grad_norm": 0.3275545985574269, + "learning_rate": 9.302372047540638e-05, + "loss": 2.9203, + "step": 16312 + }, + { + "epoch": 0.7595036897362478, + "grad_norm": 0.38436033283642557, + "learning_rate": 9.302234032816908e-05, + "loss": 2.9549, + "step": 16313 + }, + { + "epoch": 0.7595502479223409, + "grad_norm": 0.35948964934259714, + "learning_rate": 9.302096005466546e-05, + "loss": 2.9676, + "step": 16314 + }, + { + "epoch": 0.759596806108434, + "grad_norm": 0.36501319565269535, + "learning_rate": 9.301957965489956e-05, + "loss": 3.0069, + "step": 16315 + }, + { + "epoch": 0.7596433642945271, + "grad_norm": 0.36089454005725324, + "learning_rate": 9.301819912887544e-05, + "loss": 3.0661, + "step": 16316 + }, + { + "epoch": 0.7596899224806202, + "grad_norm": 0.3697456113696269, + "learning_rate": 9.301681847659713e-05, + "loss": 2.8864, + "step": 16317 + }, + { + "epoch": 0.7597364806667132, + "grad_norm": 0.40091647611928166, + "learning_rate": 9.301543769806871e-05, + "loss": 3.0207, + "step": 16318 + }, + { + "epoch": 0.7597830388528063, + "grad_norm": 0.34535726248232423, + "learning_rate": 9.301405679329422e-05, + "loss": 2.9262, + "step": 16319 + }, + { + "epoch": 0.7598295970388994, + "grad_norm": 0.34775416502195255, + "learning_rate": 9.30126757622777e-05, + "loss": 2.8979, + "step": 16320 + }, + { + "epoch": 0.7598761552249924, + "grad_norm": 0.3574682127389657, + "learning_rate": 9.301129460502322e-05, + "loss": 2.9267, + "step": 16321 + }, + { + "epoch": 0.7599227134110855, + "grad_norm": 0.3448682834218263, + "learning_rate": 9.300991332153482e-05, + "loss": 2.9627, + "step": 16322 + }, + { + "epoch": 0.7599692715971785, + "grad_norm": 0.3110868763197182, + "learning_rate": 9.300853191181658e-05, + "loss": 2.9526, + "step": 16323 + }, + { + "epoch": 0.7600158297832716, + "grad_norm": 0.3568343946824496, + "learning_rate": 9.30071503758725e-05, + "loss": 2.9418, + "step": 16324 + }, + { + "epoch": 0.7600623879693648, + "grad_norm": 0.34136751774721014, + "learning_rate": 9.30057687137067e-05, + "loss": 2.8628, + "step": 16325 + }, + { + "epoch": 0.7601089461554578, + "grad_norm": 0.35701585890434745, + "learning_rate": 9.300438692532321e-05, + "loss": 2.8883, + "step": 16326 + }, + { + "epoch": 0.7601555043415509, + "grad_norm": 0.3162377379447835, + "learning_rate": 9.300300501072605e-05, + "loss": 2.9116, + "step": 16327 + }, + { + "epoch": 0.7602020625276439, + "grad_norm": 0.36248399058253566, + "learning_rate": 9.300162296991934e-05, + "loss": 2.8839, + "step": 16328 + }, + { + "epoch": 0.760248620713737, + "grad_norm": 0.326947164185665, + "learning_rate": 9.300024080290708e-05, + "loss": 2.969, + "step": 16329 + }, + { + "epoch": 0.7602951788998301, + "grad_norm": 0.34206605898655884, + "learning_rate": 9.299885850969337e-05, + "loss": 2.9336, + "step": 16330 + }, + { + 
"epoch": 0.7603417370859231, + "grad_norm": 0.3398153841896849, + "learning_rate": 9.299747609028223e-05, + "loss": 2.9751, + "step": 16331 + }, + { + "epoch": 0.7603882952720162, + "grad_norm": 0.3532940618242623, + "learning_rate": 9.299609354467774e-05, + "loss": 2.9083, + "step": 16332 + }, + { + "epoch": 0.7604348534581092, + "grad_norm": 0.35593601305701955, + "learning_rate": 9.299471087288395e-05, + "loss": 2.9692, + "step": 16333 + }, + { + "epoch": 0.7604814116442024, + "grad_norm": 0.3491215579061933, + "learning_rate": 9.299332807490493e-05, + "loss": 2.9611, + "step": 16334 + }, + { + "epoch": 0.7605279698302954, + "grad_norm": 0.320472570807179, + "learning_rate": 9.29919451507447e-05, + "loss": 2.9855, + "step": 16335 + }, + { + "epoch": 0.7605745280163885, + "grad_norm": 0.32748647147980897, + "learning_rate": 9.299056210040737e-05, + "loss": 2.982, + "step": 16336 + }, + { + "epoch": 0.7606210862024816, + "grad_norm": 0.3755736794725626, + "learning_rate": 9.298917892389697e-05, + "loss": 2.9509, + "step": 16337 + }, + { + "epoch": 0.7606676443885746, + "grad_norm": 0.340013127033234, + "learning_rate": 9.298779562121756e-05, + "loss": 3.0043, + "step": 16338 + }, + { + "epoch": 0.7607142025746677, + "grad_norm": 0.41184083553256334, + "learning_rate": 9.29864121923732e-05, + "loss": 3.0092, + "step": 16339 + }, + { + "epoch": 0.7607607607607607, + "grad_norm": 0.3869277377296726, + "learning_rate": 9.298502863736798e-05, + "loss": 3.0139, + "step": 16340 + }, + { + "epoch": 0.7608073189468538, + "grad_norm": 0.3583834491907994, + "learning_rate": 9.298364495620591e-05, + "loss": 2.9877, + "step": 16341 + }, + { + "epoch": 0.7608538771329469, + "grad_norm": 0.34128725958058953, + "learning_rate": 9.29822611488911e-05, + "loss": 3.0299, + "step": 16342 + }, + { + "epoch": 0.76090043531904, + "grad_norm": 0.37727761414498184, + "learning_rate": 9.298087721542757e-05, + "loss": 2.9226, + "step": 16343 + }, + { + "epoch": 0.7609469935051331, + "grad_norm": 0.34225752915294505, + "learning_rate": 9.29794931558194e-05, + "loss": 2.9176, + "step": 16344 + }, + { + "epoch": 0.7609935516912261, + "grad_norm": 0.3559994963167454, + "learning_rate": 9.297810897007065e-05, + "loss": 2.9693, + "step": 16345 + }, + { + "epoch": 0.7610401098773192, + "grad_norm": 0.36194563264364615, + "learning_rate": 9.29767246581854e-05, + "loss": 3.0005, + "step": 16346 + }, + { + "epoch": 0.7610866680634123, + "grad_norm": 0.36353075974740046, + "learning_rate": 9.297534022016768e-05, + "loss": 2.9368, + "step": 16347 + }, + { + "epoch": 0.7611332262495053, + "grad_norm": 0.33911891075623124, + "learning_rate": 9.297395565602158e-05, + "loss": 2.9969, + "step": 16348 + }, + { + "epoch": 0.7611797844355984, + "grad_norm": 0.3554506348131895, + "learning_rate": 9.297257096575116e-05, + "loss": 2.8947, + "step": 16349 + }, + { + "epoch": 0.7612263426216914, + "grad_norm": 0.3345979971608472, + "learning_rate": 9.297118614936047e-05, + "loss": 2.9658, + "step": 16350 + }, + { + "epoch": 0.7612729008077845, + "grad_norm": 0.3405828521187299, + "learning_rate": 9.296980120685357e-05, + "loss": 2.9982, + "step": 16351 + }, + { + "epoch": 0.7613194589938777, + "grad_norm": 0.3446197963455965, + "learning_rate": 9.296841613823454e-05, + "loss": 2.9391, + "step": 16352 + }, + { + "epoch": 0.7613660171799707, + "grad_norm": 0.35334449342037866, + "learning_rate": 9.296703094350745e-05, + "loss": 3.0447, + "step": 16353 + }, + { + "epoch": 0.7614125753660638, + "grad_norm": 0.3697216018674882, + 
"learning_rate": 9.296564562267636e-05, + "loss": 2.9071, + "step": 16354 + }, + { + "epoch": 0.7614591335521568, + "grad_norm": 0.35578055366928224, + "learning_rate": 9.296426017574533e-05, + "loss": 2.8659, + "step": 16355 + }, + { + "epoch": 0.7615056917382499, + "grad_norm": 0.35724084330088146, + "learning_rate": 9.296287460271844e-05, + "loss": 2.8663, + "step": 16356 + }, + { + "epoch": 0.7615522499243429, + "grad_norm": 0.3578956173471759, + "learning_rate": 9.296148890359973e-05, + "loss": 3.0543, + "step": 16357 + }, + { + "epoch": 0.761598808110436, + "grad_norm": 0.344078602325565, + "learning_rate": 9.296010307839328e-05, + "loss": 2.9249, + "step": 16358 + }, + { + "epoch": 0.7616453662965291, + "grad_norm": 0.36272025821983217, + "learning_rate": 9.295871712710317e-05, + "loss": 2.9168, + "step": 16359 + }, + { + "epoch": 0.7616919244826221, + "grad_norm": 0.3315685460767657, + "learning_rate": 9.295733104973345e-05, + "loss": 2.8496, + "step": 16360 + }, + { + "epoch": 0.7617384826687152, + "grad_norm": 0.3665047591611103, + "learning_rate": 9.29559448462882e-05, + "loss": 2.9542, + "step": 16361 + }, + { + "epoch": 0.7617850408548082, + "grad_norm": 0.348408642198942, + "learning_rate": 9.295455851677148e-05, + "loss": 2.917, + "step": 16362 + }, + { + "epoch": 0.7618315990409014, + "grad_norm": 0.3067136720957816, + "learning_rate": 9.295317206118736e-05, + "loss": 2.892, + "step": 16363 + }, + { + "epoch": 0.7618781572269945, + "grad_norm": 0.32883022993498995, + "learning_rate": 9.295178547953993e-05, + "loss": 3.0393, + "step": 16364 + }, + { + "epoch": 0.7619247154130875, + "grad_norm": 0.39527768818523584, + "learning_rate": 9.295039877183322e-05, + "loss": 3.0629, + "step": 16365 + }, + { + "epoch": 0.7619712735991806, + "grad_norm": 0.3406446287187008, + "learning_rate": 9.294901193807133e-05, + "loss": 2.9613, + "step": 16366 + }, + { + "epoch": 0.7620178317852736, + "grad_norm": 0.35005507722419843, + "learning_rate": 9.294762497825833e-05, + "loss": 3.0138, + "step": 16367 + }, + { + "epoch": 0.7620643899713667, + "grad_norm": 0.3260549444219422, + "learning_rate": 9.294623789239827e-05, + "loss": 2.8888, + "step": 16368 + }, + { + "epoch": 0.7621109481574598, + "grad_norm": 0.346341815030869, + "learning_rate": 9.294485068049523e-05, + "loss": 2.8874, + "step": 16369 + }, + { + "epoch": 0.7621575063435528, + "grad_norm": 0.3033732216493292, + "learning_rate": 9.294346334255328e-05, + "loss": 2.8796, + "step": 16370 + }, + { + "epoch": 0.762204064529646, + "grad_norm": 0.36883654646909847, + "learning_rate": 9.294207587857651e-05, + "loss": 2.975, + "step": 16371 + }, + { + "epoch": 0.762250622715739, + "grad_norm": 0.31910965199639446, + "learning_rate": 9.294068828856896e-05, + "loss": 3.0264, + "step": 16372 + }, + { + "epoch": 0.7622971809018321, + "grad_norm": 0.3560140093111344, + "learning_rate": 9.293930057253474e-05, + "loss": 2.948, + "step": 16373 + }, + { + "epoch": 0.7623437390879252, + "grad_norm": 0.3615558996243508, + "learning_rate": 9.293791273047789e-05, + "loss": 3.0113, + "step": 16374 + }, + { + "epoch": 0.7623902972740182, + "grad_norm": 0.3655424170841108, + "learning_rate": 9.293652476240251e-05, + "loss": 2.875, + "step": 16375 + }, + { + "epoch": 0.7624368554601113, + "grad_norm": 0.3941829724697214, + "learning_rate": 9.293513666831264e-05, + "loss": 3.0522, + "step": 16376 + }, + { + "epoch": 0.7624834136462043, + "grad_norm": 0.3939325218670166, + "learning_rate": 9.293374844821238e-05, + "loss": 2.9995, + "step": 16377 + }, + { + 
"epoch": 0.7625299718322974, + "grad_norm": 0.4217773695964806, + "learning_rate": 9.293236010210582e-05, + "loss": 2.8655, + "step": 16378 + }, + { + "epoch": 0.7625765300183904, + "grad_norm": 0.34675353547352045, + "learning_rate": 9.293097162999699e-05, + "loss": 2.9455, + "step": 16379 + }, + { + "epoch": 0.7626230882044835, + "grad_norm": 0.427281040757002, + "learning_rate": 9.292958303189e-05, + "loss": 2.9818, + "step": 16380 + }, + { + "epoch": 0.7626696463905767, + "grad_norm": 0.41508363446623847, + "learning_rate": 9.29281943077889e-05, + "loss": 2.9094, + "step": 16381 + }, + { + "epoch": 0.7627162045766697, + "grad_norm": 0.3876887475347839, + "learning_rate": 9.292680545769778e-05, + "loss": 3.0631, + "step": 16382 + }, + { + "epoch": 0.7627627627627628, + "grad_norm": 0.4228168412604911, + "learning_rate": 9.292541648162072e-05, + "loss": 2.9275, + "step": 16383 + }, + { + "epoch": 0.7628093209488558, + "grad_norm": 0.3469242975807317, + "learning_rate": 9.29240273795618e-05, + "loss": 3.0087, + "step": 16384 + }, + { + "epoch": 0.7628558791349489, + "grad_norm": 0.4490017434941957, + "learning_rate": 9.292263815152507e-05, + "loss": 2.9872, + "step": 16385 + }, + { + "epoch": 0.762902437321042, + "grad_norm": 0.4352315131027956, + "learning_rate": 9.292124879751465e-05, + "loss": 3.0395, + "step": 16386 + }, + { + "epoch": 0.762948995507135, + "grad_norm": 0.34916384559436286, + "learning_rate": 9.291985931753458e-05, + "loss": 3.028, + "step": 16387 + }, + { + "epoch": 0.7629955536932281, + "grad_norm": 0.39112143839006724, + "learning_rate": 9.291846971158894e-05, + "loss": 2.9596, + "step": 16388 + }, + { + "epoch": 0.7630421118793211, + "grad_norm": 0.3478749558287809, + "learning_rate": 9.291707997968183e-05, + "loss": 3.0049, + "step": 16389 + }, + { + "epoch": 0.7630886700654143, + "grad_norm": 0.3483879313581422, + "learning_rate": 9.29156901218173e-05, + "loss": 2.9929, + "step": 16390 + }, + { + "epoch": 0.7631352282515074, + "grad_norm": 0.36651594767730333, + "learning_rate": 9.29143001379995e-05, + "loss": 2.9673, + "step": 16391 + }, + { + "epoch": 0.7631817864376004, + "grad_norm": 0.36308678909741493, + "learning_rate": 9.291291002823241e-05, + "loss": 3.0009, + "step": 16392 + }, + { + "epoch": 0.7632283446236935, + "grad_norm": 0.3779795172892843, + "learning_rate": 9.291151979252016e-05, + "loss": 3.0076, + "step": 16393 + }, + { + "epoch": 0.7632749028097865, + "grad_norm": 0.35950245006264425, + "learning_rate": 9.291012943086684e-05, + "loss": 2.8409, + "step": 16394 + }, + { + "epoch": 0.7633214609958796, + "grad_norm": 0.3771987104155483, + "learning_rate": 9.290873894327652e-05, + "loss": 3.0383, + "step": 16395 + }, + { + "epoch": 0.7633680191819727, + "grad_norm": 0.3189353083917647, + "learning_rate": 9.290734832975326e-05, + "loss": 3.0284, + "step": 16396 + }, + { + "epoch": 0.7634145773680657, + "grad_norm": 0.3687501898235011, + "learning_rate": 9.290595759030117e-05, + "loss": 3.0976, + "step": 16397 + }, + { + "epoch": 0.7634611355541588, + "grad_norm": 0.3476819111477595, + "learning_rate": 9.290456672492433e-05, + "loss": 2.9091, + "step": 16398 + }, + { + "epoch": 0.7635076937402518, + "grad_norm": 0.3080466724578752, + "learning_rate": 9.29031757336268e-05, + "loss": 2.9365, + "step": 16399 + }, + { + "epoch": 0.763554251926345, + "grad_norm": 0.34778586415878165, + "learning_rate": 9.290178461641268e-05, + "loss": 2.9423, + "step": 16400 + }, + { + "epoch": 0.763600810112438, + "grad_norm": 0.3417764076076309, + "learning_rate": 
9.290039337328605e-05, + "loss": 2.9048, + "step": 16401 + }, + { + "epoch": 0.7636473682985311, + "grad_norm": 0.32888950553002144, + "learning_rate": 9.2899002004251e-05, + "loss": 2.9708, + "step": 16402 + }, + { + "epoch": 0.7636939264846242, + "grad_norm": 0.37171601898838263, + "learning_rate": 9.289761050931159e-05, + "loss": 2.9713, + "step": 16403 + }, + { + "epoch": 0.7637404846707172, + "grad_norm": 0.29742448448086084, + "learning_rate": 9.289621888847194e-05, + "loss": 2.9679, + "step": 16404 + }, + { + "epoch": 0.7637870428568103, + "grad_norm": 0.34626960395980877, + "learning_rate": 9.28948271417361e-05, + "loss": 3.0262, + "step": 16405 + }, + { + "epoch": 0.7638336010429033, + "grad_norm": 0.29901812378904663, + "learning_rate": 9.289343526910817e-05, + "loss": 2.9684, + "step": 16406 + }, + { + "epoch": 0.7638801592289964, + "grad_norm": 0.3423319163824548, + "learning_rate": 9.289204327059222e-05, + "loss": 2.9405, + "step": 16407 + }, + { + "epoch": 0.7639267174150896, + "grad_norm": 0.3122227135554103, + "learning_rate": 9.289065114619236e-05, + "loss": 2.9558, + "step": 16408 + }, + { + "epoch": 0.7639732756011826, + "grad_norm": 0.3459220817185004, + "learning_rate": 9.288925889591267e-05, + "loss": 2.9647, + "step": 16409 + }, + { + "epoch": 0.7640198337872757, + "grad_norm": 0.3472052181175835, + "learning_rate": 9.288786651975723e-05, + "loss": 2.9583, + "step": 16410 + }, + { + "epoch": 0.7640663919733687, + "grad_norm": 0.33271532538917054, + "learning_rate": 9.288647401773012e-05, + "loss": 2.8884, + "step": 16411 + }, + { + "epoch": 0.7641129501594618, + "grad_norm": 0.3185609745327156, + "learning_rate": 9.288508138983543e-05, + "loss": 3.0056, + "step": 16412 + }, + { + "epoch": 0.7641595083455549, + "grad_norm": 0.34729626984615397, + "learning_rate": 9.288368863607726e-05, + "loss": 3.0004, + "step": 16413 + }, + { + "epoch": 0.7642060665316479, + "grad_norm": 0.3559167818801074, + "learning_rate": 9.288229575645968e-05, + "loss": 3.0673, + "step": 16414 + }, + { + "epoch": 0.764252624717741, + "grad_norm": 0.31502186956827644, + "learning_rate": 9.28809027509868e-05, + "loss": 2.9748, + "step": 16415 + }, + { + "epoch": 0.764299182903834, + "grad_norm": 0.3715042411110514, + "learning_rate": 9.287950961966269e-05, + "loss": 3.0226, + "step": 16416 + }, + { + "epoch": 0.7643457410899271, + "grad_norm": 0.35326172073656437, + "learning_rate": 9.287811636249143e-05, + "loss": 2.9386, + "step": 16417 + }, + { + "epoch": 0.7643922992760203, + "grad_norm": 0.32231176450258586, + "learning_rate": 9.287672297947713e-05, + "loss": 2.9433, + "step": 16418 + }, + { + "epoch": 0.7644388574621133, + "grad_norm": 0.41692518682957985, + "learning_rate": 9.287532947062389e-05, + "loss": 2.9247, + "step": 16419 + }, + { + "epoch": 0.7644854156482064, + "grad_norm": 0.325390933793122, + "learning_rate": 9.287393583593575e-05, + "loss": 2.9883, + "step": 16420 + }, + { + "epoch": 0.7645319738342994, + "grad_norm": 0.35853583846374776, + "learning_rate": 9.287254207541686e-05, + "loss": 2.8779, + "step": 16421 + }, + { + "epoch": 0.7645785320203925, + "grad_norm": 0.3621438741909673, + "learning_rate": 9.287114818907128e-05, + "loss": 3.0228, + "step": 16422 + }, + { + "epoch": 0.7646250902064855, + "grad_norm": 0.3395028062498746, + "learning_rate": 9.286975417690309e-05, + "loss": 2.9692, + "step": 16423 + }, + { + "epoch": 0.7646716483925786, + "grad_norm": 0.37760140679630505, + "learning_rate": 9.28683600389164e-05, + "loss": 2.9618, + "step": 16424 + }, + { + "epoch": 
0.7647182065786717, + "grad_norm": 0.29499603581188294, + "learning_rate": 9.286696577511529e-05, + "loss": 2.9641, + "step": 16425 + }, + { + "epoch": 0.7647647647647647, + "grad_norm": 0.3504729383685185, + "learning_rate": 9.286557138550387e-05, + "loss": 3.0063, + "step": 16426 + }, + { + "epoch": 0.7648113229508579, + "grad_norm": 0.3368336378384986, + "learning_rate": 9.286417687008621e-05, + "loss": 3.0165, + "step": 16427 + }, + { + "epoch": 0.7648578811369509, + "grad_norm": 0.3432631875812394, + "learning_rate": 9.286278222886643e-05, + "loss": 2.9956, + "step": 16428 + }, + { + "epoch": 0.764904439323044, + "grad_norm": 0.3440735870007723, + "learning_rate": 9.286138746184861e-05, + "loss": 3.027, + "step": 16429 + }, + { + "epoch": 0.7649509975091371, + "grad_norm": 0.37358437646646997, + "learning_rate": 9.285999256903681e-05, + "loss": 3.0267, + "step": 16430 + }, + { + "epoch": 0.7649975556952301, + "grad_norm": 0.33689400044399215, + "learning_rate": 9.285859755043518e-05, + "loss": 3.0391, + "step": 16431 + }, + { + "epoch": 0.7650441138813232, + "grad_norm": 0.32779359549898696, + "learning_rate": 9.285720240604778e-05, + "loss": 2.8932, + "step": 16432 + }, + { + "epoch": 0.7650906720674162, + "grad_norm": 0.37343748791377745, + "learning_rate": 9.285580713587871e-05, + "loss": 2.9427, + "step": 16433 + }, + { + "epoch": 0.7651372302535093, + "grad_norm": 0.3371908754371947, + "learning_rate": 9.285441173993207e-05, + "loss": 2.9979, + "step": 16434 + }, + { + "epoch": 0.7651837884396024, + "grad_norm": 0.41816000766682426, + "learning_rate": 9.285301621821195e-05, + "loss": 2.9407, + "step": 16435 + }, + { + "epoch": 0.7652303466256954, + "grad_norm": 0.35179313396013634, + "learning_rate": 9.285162057072245e-05, + "loss": 2.8917, + "step": 16436 + }, + { + "epoch": 0.7652769048117886, + "grad_norm": 0.3750958593324837, + "learning_rate": 9.285022479746768e-05, + "loss": 2.9195, + "step": 16437 + }, + { + "epoch": 0.7653234629978816, + "grad_norm": 0.3607826177091083, + "learning_rate": 9.28488288984517e-05, + "loss": 2.9321, + "step": 16438 + }, + { + "epoch": 0.7653700211839747, + "grad_norm": 0.3374002448080036, + "learning_rate": 9.284743287367865e-05, + "loss": 3.0574, + "step": 16439 + }, + { + "epoch": 0.7654165793700678, + "grad_norm": 0.40764265826971263, + "learning_rate": 9.28460367231526e-05, + "loss": 2.9448, + "step": 16440 + }, + { + "epoch": 0.7654631375561608, + "grad_norm": 0.3508641775855912, + "learning_rate": 9.284464044687764e-05, + "loss": 2.904, + "step": 16441 + }, + { + "epoch": 0.7655096957422539, + "grad_norm": 0.40150195040658404, + "learning_rate": 9.28432440448579e-05, + "loss": 3.0543, + "step": 16442 + }, + { + "epoch": 0.7655562539283469, + "grad_norm": 0.37955537630469616, + "learning_rate": 9.284184751709744e-05, + "loss": 2.9697, + "step": 16443 + }, + { + "epoch": 0.76560281211444, + "grad_norm": 0.3533098421796933, + "learning_rate": 9.284045086360039e-05, + "loss": 2.9771, + "step": 16444 + }, + { + "epoch": 0.765649370300533, + "grad_norm": 0.3997256613422332, + "learning_rate": 9.283905408437084e-05, + "loss": 2.9952, + "step": 16445 + }, + { + "epoch": 0.7656959284866262, + "grad_norm": 0.3585944217379682, + "learning_rate": 9.283765717941286e-05, + "loss": 2.9139, + "step": 16446 + }, + { + "epoch": 0.7657424866727193, + "grad_norm": 0.3502470291853546, + "learning_rate": 9.283626014873062e-05, + "loss": 3.0029, + "step": 16447 + }, + { + "epoch": 0.7657890448588123, + "grad_norm": 0.37793601308159874, + "learning_rate": 
9.283486299232812e-05, + "loss": 2.9521, + "step": 16448 + }, + { + "epoch": 0.7658356030449054, + "grad_norm": 0.362173777540129, + "learning_rate": 9.283346571020957e-05, + "loss": 3.0638, + "step": 16449 + }, + { + "epoch": 0.7658821612309984, + "grad_norm": 0.3343024064968906, + "learning_rate": 9.2832068302379e-05, + "loss": 2.9257, + "step": 16450 + }, + { + "epoch": 0.7659287194170915, + "grad_norm": 0.35477476393935536, + "learning_rate": 9.283067076884053e-05, + "loss": 2.9471, + "step": 16451 + }, + { + "epoch": 0.7659752776031846, + "grad_norm": 0.3473795586899854, + "learning_rate": 9.282927310959825e-05, + "loss": 2.9281, + "step": 16452 + }, + { + "epoch": 0.7660218357892776, + "grad_norm": 0.36934229136089564, + "learning_rate": 9.282787532465629e-05, + "loss": 2.8694, + "step": 16453 + }, + { + "epoch": 0.7660683939753707, + "grad_norm": 0.3315565058890619, + "learning_rate": 9.282647741401873e-05, + "loss": 2.9231, + "step": 16454 + }, + { + "epoch": 0.7661149521614637, + "grad_norm": 0.3656343178084471, + "learning_rate": 9.282507937768968e-05, + "loss": 2.8285, + "step": 16455 + }, + { + "epoch": 0.7661615103475569, + "grad_norm": 0.33652818762646486, + "learning_rate": 9.282368121567321e-05, + "loss": 2.9565, + "step": 16456 + }, + { + "epoch": 0.76620806853365, + "grad_norm": 0.3713319437242387, + "learning_rate": 9.28222829279735e-05, + "loss": 3.0192, + "step": 16457 + }, + { + "epoch": 0.766254626719743, + "grad_norm": 0.31797992109376955, + "learning_rate": 9.282088451459458e-05, + "loss": 2.8212, + "step": 16458 + }, + { + "epoch": 0.7663011849058361, + "grad_norm": 0.3528089316726134, + "learning_rate": 9.28194859755406e-05, + "loss": 2.9951, + "step": 16459 + }, + { + "epoch": 0.7663477430919291, + "grad_norm": 0.33055658350375877, + "learning_rate": 9.281808731081563e-05, + "loss": 2.9531, + "step": 16460 + }, + { + "epoch": 0.7663943012780222, + "grad_norm": 0.36184659950123294, + "learning_rate": 9.28166885204238e-05, + "loss": 3.0047, + "step": 16461 + }, + { + "epoch": 0.7664408594641153, + "grad_norm": 0.3347799898376446, + "learning_rate": 9.28152896043692e-05, + "loss": 2.9717, + "step": 16462 + }, + { + "epoch": 0.7664874176502083, + "grad_norm": 0.35325147098910614, + "learning_rate": 9.281389056265596e-05, + "loss": 2.968, + "step": 16463 + }, + { + "epoch": 0.7665339758363015, + "grad_norm": 0.3184273585804822, + "learning_rate": 9.281249139528815e-05, + "loss": 2.9763, + "step": 16464 + }, + { + "epoch": 0.7665805340223945, + "grad_norm": 0.3673612251618859, + "learning_rate": 9.281109210226992e-05, + "loss": 3.0688, + "step": 16465 + }, + { + "epoch": 0.7666270922084876, + "grad_norm": 0.3474880865991085, + "learning_rate": 9.280969268360534e-05, + "loss": 2.9198, + "step": 16466 + }, + { + "epoch": 0.7666736503945806, + "grad_norm": 0.33594611970191124, + "learning_rate": 9.280829313929852e-05, + "loss": 2.8714, + "step": 16467 + }, + { + "epoch": 0.7667202085806737, + "grad_norm": 0.33670816701365386, + "learning_rate": 9.280689346935359e-05, + "loss": 2.9495, + "step": 16468 + }, + { + "epoch": 0.7667667667667668, + "grad_norm": 0.3559222643134277, + "learning_rate": 9.280549367377462e-05, + "loss": 3.1276, + "step": 16469 + }, + { + "epoch": 0.7668133249528598, + "grad_norm": 0.330981618187142, + "learning_rate": 9.280409375256578e-05, + "loss": 3.1038, + "step": 16470 + }, + { + "epoch": 0.7668598831389529, + "grad_norm": 0.3458833506782487, + "learning_rate": 9.280269370573112e-05, + "loss": 2.9053, + "step": 16471 + }, + { + "epoch": 
0.7669064413250459, + "grad_norm": 0.34285909954920374, + "learning_rate": 9.280129353327478e-05, + "loss": 3.0309, + "step": 16472 + }, + { + "epoch": 0.766952999511139, + "grad_norm": 0.33131839021644083, + "learning_rate": 9.279989323520084e-05, + "loss": 2.904, + "step": 16473 + }, + { + "epoch": 0.7669995576972322, + "grad_norm": 0.33466374469510546, + "learning_rate": 9.279849281151345e-05, + "loss": 2.8961, + "step": 16474 + }, + { + "epoch": 0.7670461158833252, + "grad_norm": 0.3059187296908191, + "learning_rate": 9.279709226221669e-05, + "loss": 3.0366, + "step": 16475 + }, + { + "epoch": 0.7670926740694183, + "grad_norm": 0.316062350016745, + "learning_rate": 9.279569158731469e-05, + "loss": 2.958, + "step": 16476 + }, + { + "epoch": 0.7671392322555113, + "grad_norm": 0.3275441427259383, + "learning_rate": 9.279429078681155e-05, + "loss": 2.9701, + "step": 16477 + }, + { + "epoch": 0.7671857904416044, + "grad_norm": 0.3169565545497315, + "learning_rate": 9.279288986071138e-05, + "loss": 2.9535, + "step": 16478 + }, + { + "epoch": 0.7672323486276975, + "grad_norm": 0.35030730779744884, + "learning_rate": 9.279148880901827e-05, + "loss": 3.0067, + "step": 16479 + }, + { + "epoch": 0.7672789068137905, + "grad_norm": 0.3364840914021815, + "learning_rate": 9.279008763173638e-05, + "loss": 2.8976, + "step": 16480 + }, + { + "epoch": 0.7673254649998836, + "grad_norm": 0.34698056309721603, + "learning_rate": 9.27886863288698e-05, + "loss": 2.9239, + "step": 16481 + }, + { + "epoch": 0.7673720231859766, + "grad_norm": 0.3157158615599022, + "learning_rate": 9.278728490042265e-05, + "loss": 2.72, + "step": 16482 + }, + { + "epoch": 0.7674185813720698, + "grad_norm": 0.3583554731041365, + "learning_rate": 9.278588334639901e-05, + "loss": 2.9442, + "step": 16483 + }, + { + "epoch": 0.7674651395581629, + "grad_norm": 0.33597035317700474, + "learning_rate": 9.278448166680303e-05, + "loss": 3.0427, + "step": 16484 + }, + { + "epoch": 0.7675116977442559, + "grad_norm": 0.4071452166627429, + "learning_rate": 9.278307986163881e-05, + "loss": 3.0903, + "step": 16485 + }, + { + "epoch": 0.767558255930349, + "grad_norm": 0.3562840547477867, + "learning_rate": 9.278167793091046e-05, + "loss": 3.1463, + "step": 16486 + }, + { + "epoch": 0.767604814116442, + "grad_norm": 0.3574532092598289, + "learning_rate": 9.278027587462211e-05, + "loss": 3.0143, + "step": 16487 + }, + { + "epoch": 0.7676513723025351, + "grad_norm": 0.35560402465244306, + "learning_rate": 9.277887369277787e-05, + "loss": 2.9079, + "step": 16488 + }, + { + "epoch": 0.7676979304886281, + "grad_norm": 0.3441453049042007, + "learning_rate": 9.277747138538184e-05, + "loss": 2.787, + "step": 16489 + }, + { + "epoch": 0.7677444886747212, + "grad_norm": 0.3717041951605816, + "learning_rate": 9.277606895243814e-05, + "loss": 2.9916, + "step": 16490 + }, + { + "epoch": 0.7677910468608143, + "grad_norm": 0.3511160330958362, + "learning_rate": 9.277466639395089e-05, + "loss": 2.9846, + "step": 16491 + }, + { + "epoch": 0.7678376050469073, + "grad_norm": 0.3558274842105156, + "learning_rate": 9.277326370992423e-05, + "loss": 3.0196, + "step": 16492 + }, + { + "epoch": 0.7678841632330005, + "grad_norm": 0.36132520028635823, + "learning_rate": 9.277186090036223e-05, + "loss": 2.9631, + "step": 16493 + }, + { + "epoch": 0.7679307214190935, + "grad_norm": 0.3202544256681189, + "learning_rate": 9.277045796526904e-05, + "loss": 2.9769, + "step": 16494 + }, + { + "epoch": 0.7679772796051866, + "grad_norm": 0.38256412060988015, + "learning_rate": 
9.276905490464877e-05, + "loss": 2.9708, + "step": 16495 + }, + { + "epoch": 0.7680238377912797, + "grad_norm": 0.3650732634553231, + "learning_rate": 9.276765171850554e-05, + "loss": 2.9598, + "step": 16496 + }, + { + "epoch": 0.7680703959773727, + "grad_norm": 0.3617395212673861, + "learning_rate": 9.276624840684347e-05, + "loss": 3.0121, + "step": 16497 + }, + { + "epoch": 0.7681169541634658, + "grad_norm": 0.33956966523429366, + "learning_rate": 9.276484496966667e-05, + "loss": 3.0069, + "step": 16498 + }, + { + "epoch": 0.7681635123495588, + "grad_norm": 0.3277296964385421, + "learning_rate": 9.276344140697927e-05, + "loss": 2.9705, + "step": 16499 + }, + { + "epoch": 0.7682100705356519, + "grad_norm": 0.38208401925700086, + "learning_rate": 9.276203771878537e-05, + "loss": 2.9114, + "step": 16500 + }, + { + "epoch": 0.768256628721745, + "grad_norm": 0.34547819282401965, + "learning_rate": 9.27606339050891e-05, + "loss": 2.8943, + "step": 16501 + }, + { + "epoch": 0.768303186907838, + "grad_norm": 0.34831814298160213, + "learning_rate": 9.27592299658946e-05, + "loss": 2.9396, + "step": 16502 + }, + { + "epoch": 0.7683497450939312, + "grad_norm": 0.3529678903830917, + "learning_rate": 9.275782590120597e-05, + "loss": 2.962, + "step": 16503 + }, + { + "epoch": 0.7683963032800242, + "grad_norm": 0.32090252490624654, + "learning_rate": 9.275642171102732e-05, + "loss": 2.902, + "step": 16504 + }, + { + "epoch": 0.7684428614661173, + "grad_norm": 0.3591572625941468, + "learning_rate": 9.275501739536278e-05, + "loss": 3.0167, + "step": 16505 + }, + { + "epoch": 0.7684894196522104, + "grad_norm": 0.3665449332489717, + "learning_rate": 9.27536129542165e-05, + "loss": 2.9704, + "step": 16506 + }, + { + "epoch": 0.7685359778383034, + "grad_norm": 0.34564309663251286, + "learning_rate": 9.275220838759255e-05, + "loss": 2.986, + "step": 16507 + }, + { + "epoch": 0.7685825360243965, + "grad_norm": 0.3817466918681811, + "learning_rate": 9.27508036954951e-05, + "loss": 2.9908, + "step": 16508 + }, + { + "epoch": 0.7686290942104895, + "grad_norm": 0.3490326674719537, + "learning_rate": 9.274939887792823e-05, + "loss": 3.0056, + "step": 16509 + }, + { + "epoch": 0.7686756523965826, + "grad_norm": 0.38017436102824137, + "learning_rate": 9.274799393489612e-05, + "loss": 3.0228, + "step": 16510 + }, + { + "epoch": 0.7687222105826756, + "grad_norm": 0.3593423129659387, + "learning_rate": 9.274658886640283e-05, + "loss": 2.8791, + "step": 16511 + }, + { + "epoch": 0.7687687687687688, + "grad_norm": 0.3688179812735974, + "learning_rate": 9.274518367245253e-05, + "loss": 2.9325, + "step": 16512 + }, + { + "epoch": 0.7688153269548619, + "grad_norm": 0.3479455788504782, + "learning_rate": 9.274377835304932e-05, + "loss": 2.9079, + "step": 16513 + }, + { + "epoch": 0.7688618851409549, + "grad_norm": 0.3647273142335069, + "learning_rate": 9.274237290819734e-05, + "loss": 3.0359, + "step": 16514 + }, + { + "epoch": 0.768908443327048, + "grad_norm": 0.34955076952701053, + "learning_rate": 9.27409673379007e-05, + "loss": 3.013, + "step": 16515 + }, + { + "epoch": 0.768955001513141, + "grad_norm": 0.34491929995631165, + "learning_rate": 9.273956164216353e-05, + "loss": 3.0137, + "step": 16516 + }, + { + "epoch": 0.7690015596992341, + "grad_norm": 0.31973984467929417, + "learning_rate": 9.273815582098996e-05, + "loss": 3.044, + "step": 16517 + }, + { + "epoch": 0.7690481178853272, + "grad_norm": 0.37788453093980356, + "learning_rate": 9.27367498743841e-05, + "loss": 2.9312, + "step": 16518 + }, + { + "epoch": 
0.7690946760714202, + "grad_norm": 0.3305131574642362, + "learning_rate": 9.273534380235011e-05, + "loss": 2.8456, + "step": 16519 + }, + { + "epoch": 0.7691412342575134, + "grad_norm": 0.3193342090305785, + "learning_rate": 9.273393760489209e-05, + "loss": 2.8852, + "step": 16520 + }, + { + "epoch": 0.7691877924436064, + "grad_norm": 0.3692195874954325, + "learning_rate": 9.273253128201416e-05, + "loss": 2.9783, + "step": 16521 + }, + { + "epoch": 0.7692343506296995, + "grad_norm": 0.3403025819202621, + "learning_rate": 9.273112483372047e-05, + "loss": 2.9506, + "step": 16522 + }, + { + "epoch": 0.7692809088157926, + "grad_norm": 0.32526761795537645, + "learning_rate": 9.272971826001514e-05, + "loss": 2.9684, + "step": 16523 + }, + { + "epoch": 0.7693274670018856, + "grad_norm": 0.3686684547418085, + "learning_rate": 9.272831156090229e-05, + "loss": 2.9106, + "step": 16524 + }, + { + "epoch": 0.7693740251879787, + "grad_norm": 0.3394559184310961, + "learning_rate": 9.272690473638606e-05, + "loss": 3.0205, + "step": 16525 + }, + { + "epoch": 0.7694205833740717, + "grad_norm": 0.33582740192016075, + "learning_rate": 9.272549778647059e-05, + "loss": 3.0114, + "step": 16526 + }, + { + "epoch": 0.7694671415601648, + "grad_norm": 0.34489766787292897, + "learning_rate": 9.272409071115997e-05, + "loss": 2.9707, + "step": 16527 + }, + { + "epoch": 0.7695136997462579, + "grad_norm": 0.3223695986573847, + "learning_rate": 9.272268351045837e-05, + "loss": 3.0351, + "step": 16528 + }, + { + "epoch": 0.7695602579323509, + "grad_norm": 0.33634672378405744, + "learning_rate": 9.272127618436987e-05, + "loss": 2.8964, + "step": 16529 + }, + { + "epoch": 0.7696068161184441, + "grad_norm": 0.3665203773181166, + "learning_rate": 9.271986873289866e-05, + "loss": 3.0238, + "step": 16530 + }, + { + "epoch": 0.7696533743045371, + "grad_norm": 0.3574228286648334, + "learning_rate": 9.271846115604886e-05, + "loss": 2.9081, + "step": 16531 + }, + { + "epoch": 0.7696999324906302, + "grad_norm": 0.32838928194922506, + "learning_rate": 9.271705345382455e-05, + "loss": 2.9171, + "step": 16532 + }, + { + "epoch": 0.7697464906767232, + "grad_norm": 0.35543295561447663, + "learning_rate": 9.271564562622992e-05, + "loss": 2.8838, + "step": 16533 + }, + { + "epoch": 0.7697930488628163, + "grad_norm": 0.34278540985948364, + "learning_rate": 9.271423767326905e-05, + "loss": 2.9625, + "step": 16534 + }, + { + "epoch": 0.7698396070489094, + "grad_norm": 0.36091281459224783, + "learning_rate": 9.271282959494614e-05, + "loss": 2.9314, + "step": 16535 + }, + { + "epoch": 0.7698861652350024, + "grad_norm": 0.30671667336283265, + "learning_rate": 9.271142139126525e-05, + "loss": 2.883, + "step": 16536 + }, + { + "epoch": 0.7699327234210955, + "grad_norm": 0.3404544076227231, + "learning_rate": 9.271001306223056e-05, + "loss": 2.9049, + "step": 16537 + }, + { + "epoch": 0.7699792816071885, + "grad_norm": 0.32619037267151124, + "learning_rate": 9.270860460784618e-05, + "loss": 2.9382, + "step": 16538 + }, + { + "epoch": 0.7700258397932817, + "grad_norm": 0.3652046505392272, + "learning_rate": 9.270719602811627e-05, + "loss": 2.9992, + "step": 16539 + }, + { + "epoch": 0.7700723979793748, + "grad_norm": 0.3306436451723151, + "learning_rate": 9.270578732304494e-05, + "loss": 2.9919, + "step": 16540 + }, + { + "epoch": 0.7701189561654678, + "grad_norm": 0.3169678749448657, + "learning_rate": 9.270437849263633e-05, + "loss": 2.9848, + "step": 16541 + }, + { + "epoch": 0.7701655143515609, + "grad_norm": 0.337542076407013, + "learning_rate": 
9.270296953689457e-05, + "loss": 2.9114, + "step": 16542 + }, + { + "epoch": 0.7702120725376539, + "grad_norm": 0.35960004951967567, + "learning_rate": 9.27015604558238e-05, + "loss": 2.9449, + "step": 16543 + }, + { + "epoch": 0.770258630723747, + "grad_norm": 0.34091096719956404, + "learning_rate": 9.270015124942817e-05, + "loss": 2.9698, + "step": 16544 + }, + { + "epoch": 0.7703051889098401, + "grad_norm": 0.3357060196780244, + "learning_rate": 9.26987419177118e-05, + "loss": 2.9558, + "step": 16545 + }, + { + "epoch": 0.7703517470959331, + "grad_norm": 0.35574175482411985, + "learning_rate": 9.269733246067883e-05, + "loss": 2.9829, + "step": 16546 + }, + { + "epoch": 0.7703983052820262, + "grad_norm": 0.31091716739210956, + "learning_rate": 9.26959228783334e-05, + "loss": 2.9132, + "step": 16547 + }, + { + "epoch": 0.7704448634681192, + "grad_norm": 0.340709864321107, + "learning_rate": 9.269451317067962e-05, + "loss": 2.8763, + "step": 16548 + }, + { + "epoch": 0.7704914216542124, + "grad_norm": 0.35149240347326904, + "learning_rate": 9.269310333772167e-05, + "loss": 2.8512, + "step": 16549 + }, + { + "epoch": 0.7705379798403055, + "grad_norm": 0.3341688689526857, + "learning_rate": 9.269169337946367e-05, + "loss": 2.8362, + "step": 16550 + }, + { + "epoch": 0.7705845380263985, + "grad_norm": 0.35507416991431723, + "learning_rate": 9.269028329590974e-05, + "loss": 2.9305, + "step": 16551 + }, + { + "epoch": 0.7706310962124916, + "grad_norm": 0.3442128452536277, + "learning_rate": 9.268887308706405e-05, + "loss": 2.9057, + "step": 16552 + }, + { + "epoch": 0.7706776543985846, + "grad_norm": 0.3439147518854585, + "learning_rate": 9.26874627529307e-05, + "loss": 2.9395, + "step": 16553 + }, + { + "epoch": 0.7707242125846777, + "grad_norm": 0.32971385326363734, + "learning_rate": 9.268605229351387e-05, + "loss": 2.9188, + "step": 16554 + }, + { + "epoch": 0.7707707707707707, + "grad_norm": 0.3350383893353679, + "learning_rate": 9.268464170881767e-05, + "loss": 3.0073, + "step": 16555 + }, + { + "epoch": 0.7708173289568638, + "grad_norm": 0.3342575828783466, + "learning_rate": 9.268323099884627e-05, + "loss": 2.9548, + "step": 16556 + }, + { + "epoch": 0.770863887142957, + "grad_norm": 0.32540795006877193, + "learning_rate": 9.268182016360378e-05, + "loss": 2.8792, + "step": 16557 + }, + { + "epoch": 0.77091044532905, + "grad_norm": 0.33361957928556274, + "learning_rate": 9.268040920309436e-05, + "loss": 3.0653, + "step": 16558 + }, + { + "epoch": 0.7709570035151431, + "grad_norm": 0.3673139189960916, + "learning_rate": 9.267899811732212e-05, + "loss": 3.0583, + "step": 16559 + }, + { + "epoch": 0.7710035617012361, + "grad_norm": 0.330352665101088, + "learning_rate": 9.267758690629126e-05, + "loss": 3.0369, + "step": 16560 + }, + { + "epoch": 0.7710501198873292, + "grad_norm": 0.3312504932666607, + "learning_rate": 9.267617557000586e-05, + "loss": 2.927, + "step": 16561 + }, + { + "epoch": 0.7710966780734223, + "grad_norm": 0.32144932536927034, + "learning_rate": 9.267476410847011e-05, + "loss": 2.9338, + "step": 16562 + }, + { + "epoch": 0.7711432362595153, + "grad_norm": 0.33636058410526026, + "learning_rate": 9.267335252168812e-05, + "loss": 3.018, + "step": 16563 + }, + { + "epoch": 0.7711897944456084, + "grad_norm": 0.3385181969435764, + "learning_rate": 9.267194080966404e-05, + "loss": 3.0359, + "step": 16564 + }, + { + "epoch": 0.7712363526317014, + "grad_norm": 0.32159111608944546, + "learning_rate": 9.267052897240203e-05, + "loss": 2.926, + "step": 16565 + }, + { + "epoch": 
0.7712829108177945, + "grad_norm": 0.33018891320842797, + "learning_rate": 9.266911700990622e-05, + "loss": 2.9616, + "step": 16566 + }, + { + "epoch": 0.7713294690038877, + "grad_norm": 0.34734701570565146, + "learning_rate": 9.266770492218074e-05, + "loss": 2.9717, + "step": 16567 + }, + { + "epoch": 0.7713760271899807, + "grad_norm": 0.32819417356952835, + "learning_rate": 9.266629270922975e-05, + "loss": 2.7543, + "step": 16568 + }, + { + "epoch": 0.7714225853760738, + "grad_norm": 0.35287974081519224, + "learning_rate": 9.26648803710574e-05, + "loss": 2.9443, + "step": 16569 + }, + { + "epoch": 0.7714691435621668, + "grad_norm": 0.34504352904674485, + "learning_rate": 9.266346790766784e-05, + "loss": 3.0507, + "step": 16570 + }, + { + "epoch": 0.7715157017482599, + "grad_norm": 0.3655916826440001, + "learning_rate": 9.26620553190652e-05, + "loss": 2.998, + "step": 16571 + }, + { + "epoch": 0.771562259934353, + "grad_norm": 0.3390735235790045, + "learning_rate": 9.266064260525362e-05, + "loss": 2.9466, + "step": 16572 + }, + { + "epoch": 0.771608818120446, + "grad_norm": 0.3588694065839461, + "learning_rate": 9.265922976623728e-05, + "loss": 2.961, + "step": 16573 + }, + { + "epoch": 0.7716553763065391, + "grad_norm": 0.3282530512473439, + "learning_rate": 9.265781680202028e-05, + "loss": 3.0374, + "step": 16574 + }, + { + "epoch": 0.7717019344926321, + "grad_norm": 0.3360486029105525, + "learning_rate": 9.26564037126068e-05, + "loss": 3.0067, + "step": 16575 + }, + { + "epoch": 0.7717484926787253, + "grad_norm": 0.3346486217465285, + "learning_rate": 9.265499049800097e-05, + "loss": 2.9139, + "step": 16576 + }, + { + "epoch": 0.7717950508648183, + "grad_norm": 0.3596268242557491, + "learning_rate": 9.265357715820695e-05, + "loss": 2.9007, + "step": 16577 + }, + { + "epoch": 0.7718416090509114, + "grad_norm": 0.3046437406772534, + "learning_rate": 9.265216369322889e-05, + "loss": 2.8516, + "step": 16578 + }, + { + "epoch": 0.7718881672370045, + "grad_norm": 0.3234186740050151, + "learning_rate": 9.265075010307093e-05, + "loss": 2.9111, + "step": 16579 + }, + { + "epoch": 0.7719347254230975, + "grad_norm": 0.32541091573191283, + "learning_rate": 9.26493363877372e-05, + "loss": 2.9271, + "step": 16580 + }, + { + "epoch": 0.7719812836091906, + "grad_norm": 0.3549197040278396, + "learning_rate": 9.26479225472319e-05, + "loss": 2.9764, + "step": 16581 + }, + { + "epoch": 0.7720278417952836, + "grad_norm": 0.32346483827914485, + "learning_rate": 9.264650858155912e-05, + "loss": 3.0907, + "step": 16582 + }, + { + "epoch": 0.7720743999813767, + "grad_norm": 0.38033200096426484, + "learning_rate": 9.264509449072306e-05, + "loss": 2.8516, + "step": 16583 + }, + { + "epoch": 0.7721209581674698, + "grad_norm": 0.3413064419862951, + "learning_rate": 9.264368027472784e-05, + "loss": 3.037, + "step": 16584 + }, + { + "epoch": 0.7721675163535628, + "grad_norm": 0.35972139791467705, + "learning_rate": 9.264226593357763e-05, + "loss": 3.0108, + "step": 16585 + }, + { + "epoch": 0.772214074539656, + "grad_norm": 0.36362642629067843, + "learning_rate": 9.264085146727655e-05, + "loss": 3.0642, + "step": 16586 + }, + { + "epoch": 0.772260632725749, + "grad_norm": 0.30972965062250124, + "learning_rate": 9.263943687582879e-05, + "loss": 3.0367, + "step": 16587 + }, + { + "epoch": 0.7723071909118421, + "grad_norm": 0.40298973497065793, + "learning_rate": 9.263802215923846e-05, + "loss": 3.0388, + "step": 16588 + }, + { + "epoch": 0.7723537490979352, + "grad_norm": 0.3328543361375686, + "learning_rate": 
9.263660731750976e-05, + "loss": 3.0107, + "step": 16589 + }, + { + "epoch": 0.7724003072840282, + "grad_norm": 0.37567187200620594, + "learning_rate": 9.263519235064681e-05, + "loss": 2.969, + "step": 16590 + }, + { + "epoch": 0.7724468654701213, + "grad_norm": 0.355859062326917, + "learning_rate": 9.263377725865378e-05, + "loss": 2.939, + "step": 16591 + }, + { + "epoch": 0.7724934236562143, + "grad_norm": 0.3475872046322031, + "learning_rate": 9.26323620415348e-05, + "loss": 2.8834, + "step": 16592 + }, + { + "epoch": 0.7725399818423074, + "grad_norm": 0.3392901210890434, + "learning_rate": 9.263094669929404e-05, + "loss": 3.037, + "step": 16593 + }, + { + "epoch": 0.7725865400284005, + "grad_norm": 0.342990972476412, + "learning_rate": 9.262953123193565e-05, + "loss": 2.9619, + "step": 16594 + }, + { + "epoch": 0.7726330982144936, + "grad_norm": 0.3352240984609057, + "learning_rate": 9.262811563946379e-05, + "loss": 3.0236, + "step": 16595 + }, + { + "epoch": 0.7726796564005867, + "grad_norm": 0.35205180990051016, + "learning_rate": 9.262669992188262e-05, + "loss": 2.8779, + "step": 16596 + }, + { + "epoch": 0.7727262145866797, + "grad_norm": 0.3273669651350718, + "learning_rate": 9.262528407919626e-05, + "loss": 2.9203, + "step": 16597 + }, + { + "epoch": 0.7727727727727728, + "grad_norm": 0.3396909257727377, + "learning_rate": 9.26238681114089e-05, + "loss": 2.943, + "step": 16598 + }, + { + "epoch": 0.7728193309588658, + "grad_norm": 0.3325586586995472, + "learning_rate": 9.262245201852471e-05, + "loss": 2.8818, + "step": 16599 + }, + { + "epoch": 0.7728658891449589, + "grad_norm": 0.3671369982051985, + "learning_rate": 9.26210358005478e-05, + "loss": 2.9553, + "step": 16600 + }, + { + "epoch": 0.772912447331052, + "grad_norm": 0.3228752351394344, + "learning_rate": 9.261961945748234e-05, + "loss": 2.9496, + "step": 16601 + }, + { + "epoch": 0.772959005517145, + "grad_norm": 0.3701700962522586, + "learning_rate": 9.261820298933251e-05, + "loss": 2.9947, + "step": 16602 + }, + { + "epoch": 0.7730055637032381, + "grad_norm": 0.33984734135600997, + "learning_rate": 9.261678639610245e-05, + "loss": 2.9418, + "step": 16603 + }, + { + "epoch": 0.7730521218893311, + "grad_norm": 0.3383799224244701, + "learning_rate": 9.261536967779633e-05, + "loss": 2.8656, + "step": 16604 + }, + { + "epoch": 0.7730986800754243, + "grad_norm": 0.3493014394141645, + "learning_rate": 9.261395283441829e-05, + "loss": 3.031, + "step": 16605 + }, + { + "epoch": 0.7731452382615174, + "grad_norm": 0.3368564836854499, + "learning_rate": 9.26125358659725e-05, + "loss": 3.0281, + "step": 16606 + }, + { + "epoch": 0.7731917964476104, + "grad_norm": 0.3584058772659171, + "learning_rate": 9.26111187724631e-05, + "loss": 2.9993, + "step": 16607 + }, + { + "epoch": 0.7732383546337035, + "grad_norm": 0.35865337555935145, + "learning_rate": 9.260970155389428e-05, + "loss": 2.9454, + "step": 16608 + }, + { + "epoch": 0.7732849128197965, + "grad_norm": 0.3865664408538483, + "learning_rate": 9.260828421027016e-05, + "loss": 2.9115, + "step": 16609 + }, + { + "epoch": 0.7733314710058896, + "grad_norm": 0.3520046984365759, + "learning_rate": 9.260686674159496e-05, + "loss": 3.0091, + "step": 16610 + }, + { + "epoch": 0.7733780291919827, + "grad_norm": 0.39090046943297213, + "learning_rate": 9.260544914787278e-05, + "loss": 2.9773, + "step": 16611 + }, + { + "epoch": 0.7734245873780757, + "grad_norm": 0.36737835185834045, + "learning_rate": 9.260403142910781e-05, + "loss": 3.0576, + "step": 16612 + }, + { + "epoch": 
0.7734711455641688, + "grad_norm": 0.3253102938838348, + "learning_rate": 9.26026135853042e-05, + "loss": 2.9107, + "step": 16613 + }, + { + "epoch": 0.7735177037502619, + "grad_norm": 0.3381033603411436, + "learning_rate": 9.260119561646613e-05, + "loss": 2.9614, + "step": 16614 + }, + { + "epoch": 0.773564261936355, + "grad_norm": 0.32897069042698757, + "learning_rate": 9.259977752259774e-05, + "loss": 3.0186, + "step": 16615 + }, + { + "epoch": 0.7736108201224481, + "grad_norm": 0.32338540174992614, + "learning_rate": 9.25983593037032e-05, + "loss": 3.0214, + "step": 16616 + }, + { + "epoch": 0.7736573783085411, + "grad_norm": 0.347112330572007, + "learning_rate": 9.259694095978666e-05, + "loss": 2.8886, + "step": 16617 + }, + { + "epoch": 0.7737039364946342, + "grad_norm": 0.33285511808215407, + "learning_rate": 9.259552249085232e-05, + "loss": 2.991, + "step": 16618 + }, + { + "epoch": 0.7737504946807272, + "grad_norm": 0.34662092491305424, + "learning_rate": 9.259410389690429e-05, + "loss": 2.8727, + "step": 16619 + }, + { + "epoch": 0.7737970528668203, + "grad_norm": 0.31797273722512487, + "learning_rate": 9.259268517794678e-05, + "loss": 2.9768, + "step": 16620 + }, + { + "epoch": 0.7738436110529133, + "grad_norm": 0.3514853916472063, + "learning_rate": 9.259126633398393e-05, + "loss": 3.0867, + "step": 16621 + }, + { + "epoch": 0.7738901692390064, + "grad_norm": 0.3495499674412927, + "learning_rate": 9.258984736501991e-05, + "loss": 2.9243, + "step": 16622 + }, + { + "epoch": 0.7739367274250996, + "grad_norm": 0.3351833196373934, + "learning_rate": 9.258842827105889e-05, + "loss": 2.9018, + "step": 16623 + }, + { + "epoch": 0.7739832856111926, + "grad_norm": 0.37995329166990877, + "learning_rate": 9.258700905210501e-05, + "loss": 3.0365, + "step": 16624 + }, + { + "epoch": 0.7740298437972857, + "grad_norm": 0.3150078664141015, + "learning_rate": 9.258558970816246e-05, + "loss": 2.9427, + "step": 16625 + }, + { + "epoch": 0.7740764019833787, + "grad_norm": 0.3830601144142557, + "learning_rate": 9.258417023923541e-05, + "loss": 3.036, + "step": 16626 + }, + { + "epoch": 0.7741229601694718, + "grad_norm": 0.388358398057574, + "learning_rate": 9.258275064532802e-05, + "loss": 2.9526, + "step": 16627 + }, + { + "epoch": 0.7741695183555649, + "grad_norm": 0.3415176899821767, + "learning_rate": 9.258133092644444e-05, + "loss": 2.9324, + "step": 16628 + }, + { + "epoch": 0.7742160765416579, + "grad_norm": 0.3700843807669324, + "learning_rate": 9.257991108258886e-05, + "loss": 2.9183, + "step": 16629 + }, + { + "epoch": 0.774262634727751, + "grad_norm": 0.3216555410049042, + "learning_rate": 9.257849111376543e-05, + "loss": 2.8407, + "step": 16630 + }, + { + "epoch": 0.774309192913844, + "grad_norm": 0.304364215546688, + "learning_rate": 9.25770710199783e-05, + "loss": 2.9244, + "step": 16631 + }, + { + "epoch": 0.7743557510999372, + "grad_norm": 0.36301818098089367, + "learning_rate": 9.25756508012317e-05, + "loss": 2.9211, + "step": 16632 + }, + { + "epoch": 0.7744023092860303, + "grad_norm": 0.3346982708443703, + "learning_rate": 9.257423045752973e-05, + "loss": 3.0384, + "step": 16633 + }, + { + "epoch": 0.7744488674721233, + "grad_norm": 0.38787948054484833, + "learning_rate": 9.25728099888766e-05, + "loss": 2.984, + "step": 16634 + }, + { + "epoch": 0.7744954256582164, + "grad_norm": 0.33193517473104417, + "learning_rate": 9.257138939527646e-05, + "loss": 3.0196, + "step": 16635 + }, + { + "epoch": 0.7745419838443094, + "grad_norm": 0.3462755843320415, + "learning_rate": 
9.256996867673349e-05, + "loss": 2.8488, + "step": 16636 + }, + { + "epoch": 0.7745885420304025, + "grad_norm": 0.34356071068059796, + "learning_rate": 9.256854783325187e-05, + "loss": 2.8402, + "step": 16637 + }, + { + "epoch": 0.7746351002164956, + "grad_norm": 0.3810120710976946, + "learning_rate": 9.256712686483573e-05, + "loss": 2.9866, + "step": 16638 + }, + { + "epoch": 0.7746816584025886, + "grad_norm": 0.3411763548594042, + "learning_rate": 9.256570577148928e-05, + "loss": 2.8874, + "step": 16639 + }, + { + "epoch": 0.7747282165886817, + "grad_norm": 0.3962056525880894, + "learning_rate": 9.256428455321667e-05, + "loss": 2.9129, + "step": 16640 + }, + { + "epoch": 0.7747747747747747, + "grad_norm": 0.35040458990826956, + "learning_rate": 9.25628632100221e-05, + "loss": 2.9653, + "step": 16641 + }, + { + "epoch": 0.7748213329608679, + "grad_norm": 0.36919481086852696, + "learning_rate": 9.25614417419097e-05, + "loss": 2.8853, + "step": 16642 + }, + { + "epoch": 0.7748678911469609, + "grad_norm": 0.33375810404336675, + "learning_rate": 9.256002014888367e-05, + "loss": 2.9305, + "step": 16643 + }, + { + "epoch": 0.774914449333054, + "grad_norm": 0.34424500050205453, + "learning_rate": 9.255859843094816e-05, + "loss": 2.9329, + "step": 16644 + }, + { + "epoch": 0.7749610075191471, + "grad_norm": 0.3475594796462438, + "learning_rate": 9.255717658810737e-05, + "loss": 2.944, + "step": 16645 + }, + { + "epoch": 0.7750075657052401, + "grad_norm": 0.34066467719215104, + "learning_rate": 9.255575462036546e-05, + "loss": 2.9329, + "step": 16646 + }, + { + "epoch": 0.7750541238913332, + "grad_norm": 0.3751442747985587, + "learning_rate": 9.255433252772658e-05, + "loss": 2.9583, + "step": 16647 + }, + { + "epoch": 0.7751006820774262, + "grad_norm": 0.3371082228612921, + "learning_rate": 9.255291031019496e-05, + "loss": 2.8018, + "step": 16648 + }, + { + "epoch": 0.7751472402635193, + "grad_norm": 0.3868007159029034, + "learning_rate": 9.255148796777471e-05, + "loss": 2.9157, + "step": 16649 + }, + { + "epoch": 0.7751937984496124, + "grad_norm": 0.3479051763919095, + "learning_rate": 9.255006550047005e-05, + "loss": 2.9599, + "step": 16650 + }, + { + "epoch": 0.7752403566357055, + "grad_norm": 0.36621849547724206, + "learning_rate": 9.254864290828514e-05, + "loss": 3.0093, + "step": 16651 + }, + { + "epoch": 0.7752869148217986, + "grad_norm": 0.3735310970219369, + "learning_rate": 9.254722019122416e-05, + "loss": 3.056, + "step": 16652 + }, + { + "epoch": 0.7753334730078916, + "grad_norm": 0.3546422606832491, + "learning_rate": 9.254579734929128e-05, + "loss": 2.9313, + "step": 16653 + }, + { + "epoch": 0.7753800311939847, + "grad_norm": 0.37140636401453636, + "learning_rate": 9.254437438249067e-05, + "loss": 2.9727, + "step": 16654 + }, + { + "epoch": 0.7754265893800778, + "grad_norm": 0.4037782362966348, + "learning_rate": 9.25429512908265e-05, + "loss": 2.9332, + "step": 16655 + }, + { + "epoch": 0.7754731475661708, + "grad_norm": 0.371138528126495, + "learning_rate": 9.254152807430298e-05, + "loss": 2.9478, + "step": 16656 + }, + { + "epoch": 0.7755197057522639, + "grad_norm": 0.40005668818500945, + "learning_rate": 9.254010473292424e-05, + "loss": 2.9257, + "step": 16657 + }, + { + "epoch": 0.7755662639383569, + "grad_norm": 0.37312197599772423, + "learning_rate": 9.253868126669452e-05, + "loss": 2.9866, + "step": 16658 + }, + { + "epoch": 0.77561282212445, + "grad_norm": 0.4211135376773445, + "learning_rate": 9.253725767561794e-05, + "loss": 2.9419, + "step": 16659 + }, + { + "epoch": 
0.7756593803105432, + "grad_norm": 0.4279577575552699, + "learning_rate": 9.25358339596987e-05, + "loss": 2.9846, + "step": 16660 + }, + { + "epoch": 0.7757059384966362, + "grad_norm": 0.4143215940830736, + "learning_rate": 9.253441011894097e-05, + "loss": 2.8882, + "step": 16661 + }, + { + "epoch": 0.7757524966827293, + "grad_norm": 0.36054102707200936, + "learning_rate": 9.253298615334895e-05, + "loss": 3.0506, + "step": 16662 + }, + { + "epoch": 0.7757990548688223, + "grad_norm": 0.3902413810356527, + "learning_rate": 9.25315620629268e-05, + "loss": 3.0204, + "step": 16663 + }, + { + "epoch": 0.7758456130549154, + "grad_norm": 0.3753359887032266, + "learning_rate": 9.253013784767872e-05, + "loss": 3.0301, + "step": 16664 + }, + { + "epoch": 0.7758921712410084, + "grad_norm": 0.3458093727352815, + "learning_rate": 9.252871350760885e-05, + "loss": 2.8335, + "step": 16665 + }, + { + "epoch": 0.7759387294271015, + "grad_norm": 0.41313115675228695, + "learning_rate": 9.252728904272142e-05, + "loss": 2.9783, + "step": 16666 + }, + { + "epoch": 0.7759852876131946, + "grad_norm": 0.3359594014709008, + "learning_rate": 9.252586445302057e-05, + "loss": 3.006, + "step": 16667 + }, + { + "epoch": 0.7760318457992876, + "grad_norm": 0.430367616550003, + "learning_rate": 9.252443973851051e-05, + "loss": 2.9307, + "step": 16668 + }, + { + "epoch": 0.7760784039853807, + "grad_norm": 0.3854144308197849, + "learning_rate": 9.25230148991954e-05, + "loss": 2.8775, + "step": 16669 + }, + { + "epoch": 0.7761249621714738, + "grad_norm": 0.33874070667181666, + "learning_rate": 9.252158993507943e-05, + "loss": 2.9395, + "step": 16670 + }, + { + "epoch": 0.7761715203575669, + "grad_norm": 0.41800867292587773, + "learning_rate": 9.252016484616676e-05, + "loss": 2.9903, + "step": 16671 + }, + { + "epoch": 0.77621807854366, + "grad_norm": 0.33473691727758365, + "learning_rate": 9.251873963246164e-05, + "loss": 3.001, + "step": 16672 + }, + { + "epoch": 0.776264636729753, + "grad_norm": 0.3710605870084749, + "learning_rate": 9.251731429396818e-05, + "loss": 2.9096, + "step": 16673 + }, + { + "epoch": 0.7763111949158461, + "grad_norm": 0.34774008415275615, + "learning_rate": 9.25158888306906e-05, + "loss": 2.9044, + "step": 16674 + }, + { + "epoch": 0.7763577531019391, + "grad_norm": 0.33818242287415007, + "learning_rate": 9.251446324263307e-05, + "loss": 3.152, + "step": 16675 + }, + { + "epoch": 0.7764043112880322, + "grad_norm": 0.3752269777001469, + "learning_rate": 9.251303752979978e-05, + "loss": 3.0023, + "step": 16676 + }, + { + "epoch": 0.7764508694741253, + "grad_norm": 0.3896038406181716, + "learning_rate": 9.251161169219493e-05, + "loss": 2.9167, + "step": 16677 + }, + { + "epoch": 0.7764974276602183, + "grad_norm": 0.3484322938116935, + "learning_rate": 9.251018572982267e-05, + "loss": 2.9247, + "step": 16678 + }, + { + "epoch": 0.7765439858463115, + "grad_norm": 0.38155803119477477, + "learning_rate": 9.250875964268719e-05, + "loss": 2.9756, + "step": 16679 + }, + { + "epoch": 0.7765905440324045, + "grad_norm": 0.4095504946818265, + "learning_rate": 9.250733343079272e-05, + "loss": 2.9622, + "step": 16680 + }, + { + "epoch": 0.7766371022184976, + "grad_norm": 0.35805072688127243, + "learning_rate": 9.250590709414339e-05, + "loss": 2.9423, + "step": 16681 + }, + { + "epoch": 0.7766836604045907, + "grad_norm": 0.4290750976885359, + "learning_rate": 9.250448063274342e-05, + "loss": 2.9314, + "step": 16682 + }, + { + "epoch": 0.7767302185906837, + "grad_norm": 0.3974647523195082, + "learning_rate": 
9.250305404659698e-05, + "loss": 3.0344, + "step": 16683 + }, + { + "epoch": 0.7767767767767768, + "grad_norm": 0.41128482758945883, + "learning_rate": 9.250162733570826e-05, + "loss": 2.9185, + "step": 16684 + }, + { + "epoch": 0.7768233349628698, + "grad_norm": 0.37706808674980796, + "learning_rate": 9.250020050008147e-05, + "loss": 2.9391, + "step": 16685 + }, + { + "epoch": 0.7768698931489629, + "grad_norm": 0.3941263087519581, + "learning_rate": 9.249877353972076e-05, + "loss": 2.9626, + "step": 16686 + }, + { + "epoch": 0.7769164513350559, + "grad_norm": 0.3610302344942572, + "learning_rate": 9.249734645463036e-05, + "loss": 3.0994, + "step": 16687 + }, + { + "epoch": 0.776963009521149, + "grad_norm": 0.38508889486294723, + "learning_rate": 9.249591924481441e-05, + "loss": 2.9608, + "step": 16688 + }, + { + "epoch": 0.7770095677072422, + "grad_norm": 0.36233460472163453, + "learning_rate": 9.249449191027713e-05, + "loss": 3.0258, + "step": 16689 + }, + { + "epoch": 0.7770561258933352, + "grad_norm": 0.3995845441539694, + "learning_rate": 9.249306445102271e-05, + "loss": 2.9918, + "step": 16690 + }, + { + "epoch": 0.7771026840794283, + "grad_norm": 0.3665566656856684, + "learning_rate": 9.249163686705533e-05, + "loss": 3.0488, + "step": 16691 + }, + { + "epoch": 0.7771492422655213, + "grad_norm": 0.35648573958548796, + "learning_rate": 9.249020915837918e-05, + "loss": 3.0149, + "step": 16692 + }, + { + "epoch": 0.7771958004516144, + "grad_norm": 0.36825357832929695, + "learning_rate": 9.248878132499845e-05, + "loss": 2.8931, + "step": 16693 + }, + { + "epoch": 0.7772423586377075, + "grad_norm": 0.3449124746395253, + "learning_rate": 9.248735336691734e-05, + "loss": 2.9028, + "step": 16694 + }, + { + "epoch": 0.7772889168238005, + "grad_norm": 0.3884382213784079, + "learning_rate": 9.248592528414002e-05, + "loss": 2.9429, + "step": 16695 + }, + { + "epoch": 0.7773354750098936, + "grad_norm": 0.371965646288778, + "learning_rate": 9.248449707667071e-05, + "loss": 2.9261, + "step": 16696 + }, + { + "epoch": 0.7773820331959866, + "grad_norm": 0.3633606220995254, + "learning_rate": 9.248306874451357e-05, + "loss": 2.9869, + "step": 16697 + }, + { + "epoch": 0.7774285913820798, + "grad_norm": 0.3575475076122257, + "learning_rate": 9.248164028767281e-05, + "loss": 3.0365, + "step": 16698 + }, + { + "epoch": 0.7774751495681729, + "grad_norm": 0.3622493464897545, + "learning_rate": 9.248021170615263e-05, + "loss": 2.9012, + "step": 16699 + }, + { + "epoch": 0.7775217077542659, + "grad_norm": 0.3649215354780561, + "learning_rate": 9.247878299995721e-05, + "loss": 2.894, + "step": 16700 + }, + { + "epoch": 0.777568265940359, + "grad_norm": 0.3407850670807384, + "learning_rate": 9.247735416909076e-05, + "loss": 2.9004, + "step": 16701 + }, + { + "epoch": 0.777614824126452, + "grad_norm": 0.367745563923536, + "learning_rate": 9.247592521355745e-05, + "loss": 3.0138, + "step": 16702 + }, + { + "epoch": 0.7776613823125451, + "grad_norm": 0.3355672299991151, + "learning_rate": 9.247449613336148e-05, + "loss": 2.9794, + "step": 16703 + }, + { + "epoch": 0.7777079404986382, + "grad_norm": 0.3698286730142545, + "learning_rate": 9.247306692850704e-05, + "loss": 2.944, + "step": 16704 + }, + { + "epoch": 0.7777544986847312, + "grad_norm": 0.3666884873849029, + "learning_rate": 9.247163759899835e-05, + "loss": 2.9548, + "step": 16705 + }, + { + "epoch": 0.7778010568708243, + "grad_norm": 0.3157551751175235, + "learning_rate": 9.247020814483958e-05, + "loss": 2.914, + "step": 16706 + }, + { + "epoch": 
0.7778476150569174, + "grad_norm": 0.36412427879126613, + "learning_rate": 9.246877856603494e-05, + "loss": 2.9847, + "step": 16707 + }, + { + "epoch": 0.7778941732430105, + "grad_norm": 0.3417121790927981, + "learning_rate": 9.24673488625886e-05, + "loss": 2.9459, + "step": 16708 + }, + { + "epoch": 0.7779407314291035, + "grad_norm": 0.35086895041297456, + "learning_rate": 9.24659190345048e-05, + "loss": 2.9425, + "step": 16709 + }, + { + "epoch": 0.7779872896151966, + "grad_norm": 0.3456841101387702, + "learning_rate": 9.24644890817877e-05, + "loss": 3.004, + "step": 16710 + }, + { + "epoch": 0.7780338478012897, + "grad_norm": 0.3410615765635684, + "learning_rate": 9.246305900444151e-05, + "loss": 2.9482, + "step": 16711 + }, + { + "epoch": 0.7780804059873827, + "grad_norm": 0.3454553919063952, + "learning_rate": 9.246162880247043e-05, + "loss": 2.8349, + "step": 16712 + }, + { + "epoch": 0.7781269641734758, + "grad_norm": 0.3561238352141133, + "learning_rate": 9.246019847587863e-05, + "loss": 2.8011, + "step": 16713 + }, + { + "epoch": 0.7781735223595688, + "grad_norm": 0.3546205611596499, + "learning_rate": 9.245876802467035e-05, + "loss": 2.9179, + "step": 16714 + }, + { + "epoch": 0.7782200805456619, + "grad_norm": 0.34900667380442796, + "learning_rate": 9.245733744884977e-05, + "loss": 2.9206, + "step": 16715 + }, + { + "epoch": 0.7782666387317551, + "grad_norm": 0.4055280452045058, + "learning_rate": 9.245590674842108e-05, + "loss": 2.9875, + "step": 16716 + }, + { + "epoch": 0.7783131969178481, + "grad_norm": 0.3299870278250635, + "learning_rate": 9.24544759233885e-05, + "loss": 3.0327, + "step": 16717 + }, + { + "epoch": 0.7783597551039412, + "grad_norm": 0.38804616746792125, + "learning_rate": 9.24530449737562e-05, + "loss": 2.8254, + "step": 16718 + }, + { + "epoch": 0.7784063132900342, + "grad_norm": 0.3330762825009973, + "learning_rate": 9.24516138995284e-05, + "loss": 2.9359, + "step": 16719 + }, + { + "epoch": 0.7784528714761273, + "grad_norm": 0.3703796832883819, + "learning_rate": 9.245018270070928e-05, + "loss": 2.9919, + "step": 16720 + }, + { + "epoch": 0.7784994296622204, + "grad_norm": 0.37155499621131305, + "learning_rate": 9.244875137730308e-05, + "loss": 2.8655, + "step": 16721 + }, + { + "epoch": 0.7785459878483134, + "grad_norm": 0.34367362291082476, + "learning_rate": 9.244731992931398e-05, + "loss": 2.927, + "step": 16722 + }, + { + "epoch": 0.7785925460344065, + "grad_norm": 0.38807111941904066, + "learning_rate": 9.244588835674614e-05, + "loss": 3.0006, + "step": 16723 + }, + { + "epoch": 0.7786391042204995, + "grad_norm": 0.3649583881486943, + "learning_rate": 9.244445665960383e-05, + "loss": 2.8896, + "step": 16724 + }, + { + "epoch": 0.7786856624065926, + "grad_norm": 0.33919027665919926, + "learning_rate": 9.244302483789122e-05, + "loss": 3.0368, + "step": 16725 + }, + { + "epoch": 0.7787322205926858, + "grad_norm": 0.37610598342503393, + "learning_rate": 9.24415928916125e-05, + "loss": 3.0671, + "step": 16726 + }, + { + "epoch": 0.7787787787787788, + "grad_norm": 0.30021167083936584, + "learning_rate": 9.24401608207719e-05, + "loss": 2.9218, + "step": 16727 + }, + { + "epoch": 0.7788253369648719, + "grad_norm": 0.36983272531830336, + "learning_rate": 9.24387286253736e-05, + "loss": 2.9683, + "step": 16728 + }, + { + "epoch": 0.7788718951509649, + "grad_norm": 0.32923722428556, + "learning_rate": 9.243729630542182e-05, + "loss": 2.9966, + "step": 16729 + }, + { + "epoch": 0.778918453337058, + "grad_norm": 0.3510858611897263, + "learning_rate": 
9.243586386092076e-05, + "loss": 3.0244, + "step": 16730 + }, + { + "epoch": 0.778965011523151, + "grad_norm": 0.3313704048116817, + "learning_rate": 9.243443129187461e-05, + "loss": 2.9214, + "step": 16731 + }, + { + "epoch": 0.7790115697092441, + "grad_norm": 0.31652096626010656, + "learning_rate": 9.243299859828758e-05, + "loss": 2.9512, + "step": 16732 + }, + { + "epoch": 0.7790581278953372, + "grad_norm": 0.3309337112913729, + "learning_rate": 9.24315657801639e-05, + "loss": 3.0194, + "step": 16733 + }, + { + "epoch": 0.7791046860814302, + "grad_norm": 0.31448140458318685, + "learning_rate": 9.243013283750774e-05, + "loss": 2.9607, + "step": 16734 + }, + { + "epoch": 0.7791512442675234, + "grad_norm": 0.35406803706430034, + "learning_rate": 9.242869977032331e-05, + "loss": 2.8457, + "step": 16735 + }, + { + "epoch": 0.7791978024536164, + "grad_norm": 0.34186941773697577, + "learning_rate": 9.242726657861484e-05, + "loss": 2.9151, + "step": 16736 + }, + { + "epoch": 0.7792443606397095, + "grad_norm": 0.3670262303409953, + "learning_rate": 9.242583326238652e-05, + "loss": 2.9375, + "step": 16737 + }, + { + "epoch": 0.7792909188258026, + "grad_norm": 0.35987757656672353, + "learning_rate": 9.242439982164256e-05, + "loss": 2.9749, + "step": 16738 + }, + { + "epoch": 0.7793374770118956, + "grad_norm": 0.32720935981116706, + "learning_rate": 9.242296625638716e-05, + "loss": 2.9267, + "step": 16739 + }, + { + "epoch": 0.7793840351979887, + "grad_norm": 0.38351027167083485, + "learning_rate": 9.242153256662455e-05, + "loss": 2.8514, + "step": 16740 + }, + { + "epoch": 0.7794305933840817, + "grad_norm": 0.34168601367466367, + "learning_rate": 9.242009875235891e-05, + "loss": 2.9035, + "step": 16741 + }, + { + "epoch": 0.7794771515701748, + "grad_norm": 0.35793695488055577, + "learning_rate": 9.241866481359444e-05, + "loss": 2.9528, + "step": 16742 + }, + { + "epoch": 0.779523709756268, + "grad_norm": 0.3337346090105155, + "learning_rate": 9.241723075033539e-05, + "loss": 2.8879, + "step": 16743 + }, + { + "epoch": 0.779570267942361, + "grad_norm": 0.3555319716708561, + "learning_rate": 9.241579656258595e-05, + "loss": 2.8427, + "step": 16744 + }, + { + "epoch": 0.7796168261284541, + "grad_norm": 0.3441332140974513, + "learning_rate": 9.241436225035032e-05, + "loss": 2.8936, + "step": 16745 + }, + { + "epoch": 0.7796633843145471, + "grad_norm": 0.3420564828488016, + "learning_rate": 9.241292781363271e-05, + "loss": 2.9731, + "step": 16746 + }, + { + "epoch": 0.7797099425006402, + "grad_norm": 0.3292113448147163, + "learning_rate": 9.241149325243734e-05, + "loss": 2.8938, + "step": 16747 + }, + { + "epoch": 0.7797565006867332, + "grad_norm": 0.35849151410929764, + "learning_rate": 9.241005856676842e-05, + "loss": 2.9956, + "step": 16748 + }, + { + "epoch": 0.7798030588728263, + "grad_norm": 0.35209486635015924, + "learning_rate": 9.240862375663016e-05, + "loss": 2.9059, + "step": 16749 + }, + { + "epoch": 0.7798496170589194, + "grad_norm": 0.332756605026197, + "learning_rate": 9.240718882202675e-05, + "loss": 2.8451, + "step": 16750 + }, + { + "epoch": 0.7798961752450124, + "grad_norm": 0.33147945275419316, + "learning_rate": 9.240575376296243e-05, + "loss": 3.0144, + "step": 16751 + }, + { + "epoch": 0.7799427334311055, + "grad_norm": 0.36657870260377057, + "learning_rate": 9.24043185794414e-05, + "loss": 3.0299, + "step": 16752 + }, + { + "epoch": 0.7799892916171985, + "grad_norm": 0.3461498169174898, + "learning_rate": 9.240288327146786e-05, + "loss": 2.9007, + "step": 16753 + }, + { + 
"epoch": 0.7800358498032917, + "grad_norm": 0.38719170228211286, + "learning_rate": 9.240144783904605e-05, + "loss": 3.0172, + "step": 16754 + }, + { + "epoch": 0.7800824079893848, + "grad_norm": 0.4209926552615169, + "learning_rate": 9.240001228218017e-05, + "loss": 2.9188, + "step": 16755 + }, + { + "epoch": 0.7801289661754778, + "grad_norm": 0.3482367176447885, + "learning_rate": 9.23985766008744e-05, + "loss": 3.0746, + "step": 16756 + }, + { + "epoch": 0.7801755243615709, + "grad_norm": 0.36361861360951614, + "learning_rate": 9.239714079513302e-05, + "loss": 2.9282, + "step": 16757 + }, + { + "epoch": 0.7802220825476639, + "grad_norm": 0.3855700121913445, + "learning_rate": 9.23957048649602e-05, + "loss": 2.9895, + "step": 16758 + }, + { + "epoch": 0.780268640733757, + "grad_norm": 0.3808900770743598, + "learning_rate": 9.239426881036016e-05, + "loss": 3.0136, + "step": 16759 + }, + { + "epoch": 0.7803151989198501, + "grad_norm": 0.40210519111539106, + "learning_rate": 9.23928326313371e-05, + "loss": 3.0589, + "step": 16760 + }, + { + "epoch": 0.7803617571059431, + "grad_norm": 0.3586967128787604, + "learning_rate": 9.239139632789528e-05, + "loss": 2.9486, + "step": 16761 + }, + { + "epoch": 0.7804083152920362, + "grad_norm": 0.3667458572705351, + "learning_rate": 9.238995990003889e-05, + "loss": 2.8834, + "step": 16762 + }, + { + "epoch": 0.7804548734781293, + "grad_norm": 0.3815649464165771, + "learning_rate": 9.238852334777213e-05, + "loss": 2.995, + "step": 16763 + }, + { + "epoch": 0.7805014316642224, + "grad_norm": 0.33574526944370986, + "learning_rate": 9.238708667109923e-05, + "loss": 3.0166, + "step": 16764 + }, + { + "epoch": 0.7805479898503155, + "grad_norm": 0.36028884837207703, + "learning_rate": 9.23856498700244e-05, + "loss": 2.9497, + "step": 16765 + }, + { + "epoch": 0.7805945480364085, + "grad_norm": 0.3623528111361641, + "learning_rate": 9.238421294455187e-05, + "loss": 2.98, + "step": 16766 + }, + { + "epoch": 0.7806411062225016, + "grad_norm": 0.3749105943963664, + "learning_rate": 9.238277589468586e-05, + "loss": 3.0163, + "step": 16767 + }, + { + "epoch": 0.7806876644085946, + "grad_norm": 0.38163224788276756, + "learning_rate": 9.238133872043058e-05, + "loss": 3.0425, + "step": 16768 + }, + { + "epoch": 0.7807342225946877, + "grad_norm": 0.31857196939450344, + "learning_rate": 9.237990142179023e-05, + "loss": 2.9416, + "step": 16769 + }, + { + "epoch": 0.7807807807807807, + "grad_norm": 0.3864403695462162, + "learning_rate": 9.237846399876904e-05, + "loss": 3.0834, + "step": 16770 + }, + { + "epoch": 0.7808273389668738, + "grad_norm": 0.3383559453059468, + "learning_rate": 9.237702645137124e-05, + "loss": 2.9083, + "step": 16771 + }, + { + "epoch": 0.780873897152967, + "grad_norm": 0.35940470950692316, + "learning_rate": 9.237558877960107e-05, + "loss": 3.0027, + "step": 16772 + }, + { + "epoch": 0.78092045533906, + "grad_norm": 0.34975813925520155, + "learning_rate": 9.237415098346268e-05, + "loss": 2.8453, + "step": 16773 + }, + { + "epoch": 0.7809670135251531, + "grad_norm": 0.3508211511294578, + "learning_rate": 9.237271306296037e-05, + "loss": 3.0361, + "step": 16774 + }, + { + "epoch": 0.7810135717112461, + "grad_norm": 0.3103481831109118, + "learning_rate": 9.237127501809827e-05, + "loss": 2.967, + "step": 16775 + }, + { + "epoch": 0.7810601298973392, + "grad_norm": 0.3564296170267246, + "learning_rate": 9.23698368488807e-05, + "loss": 3.02, + "step": 16776 + }, + { + "epoch": 0.7811066880834323, + "grad_norm": 0.30052878117908827, + "learning_rate": 
9.236839855531181e-05, + "loss": 2.9178, + "step": 16777 + }, + { + "epoch": 0.7811532462695253, + "grad_norm": 0.359331003749513, + "learning_rate": 9.236696013739585e-05, + "loss": 2.9763, + "step": 16778 + }, + { + "epoch": 0.7811998044556184, + "grad_norm": 0.2939415145112324, + "learning_rate": 9.236552159513702e-05, + "loss": 2.8419, + "step": 16779 + }, + { + "epoch": 0.7812463626417114, + "grad_norm": 0.34516178891059845, + "learning_rate": 9.236408292853959e-05, + "loss": 2.8818, + "step": 16780 + }, + { + "epoch": 0.7812929208278045, + "grad_norm": 0.355940400739319, + "learning_rate": 9.236264413760774e-05, + "loss": 3.0027, + "step": 16781 + }, + { + "epoch": 0.7813394790138977, + "grad_norm": 0.3364631618673726, + "learning_rate": 9.236120522234569e-05, + "loss": 2.9422, + "step": 16782 + }, + { + "epoch": 0.7813860371999907, + "grad_norm": 0.3533455325588321, + "learning_rate": 9.235976618275768e-05, + "loss": 2.9591, + "step": 16783 + }, + { + "epoch": 0.7814325953860838, + "grad_norm": 0.35223146765897095, + "learning_rate": 9.235832701884794e-05, + "loss": 2.8122, + "step": 16784 + }, + { + "epoch": 0.7814791535721768, + "grad_norm": 0.36745202030554835, + "learning_rate": 9.235688773062068e-05, + "loss": 2.9994, + "step": 16785 + }, + { + "epoch": 0.7815257117582699, + "grad_norm": 0.35374843628706887, + "learning_rate": 9.235544831808012e-05, + "loss": 2.9936, + "step": 16786 + }, + { + "epoch": 0.781572269944363, + "grad_norm": 0.36286918803401946, + "learning_rate": 9.235400878123048e-05, + "loss": 2.9612, + "step": 16787 + }, + { + "epoch": 0.781618828130456, + "grad_norm": 0.383921227664389, + "learning_rate": 9.235256912007602e-05, + "loss": 3.0353, + "step": 16788 + }, + { + "epoch": 0.7816653863165491, + "grad_norm": 0.3359923615108909, + "learning_rate": 9.235112933462093e-05, + "loss": 2.9231, + "step": 16789 + }, + { + "epoch": 0.7817119445026421, + "grad_norm": 0.3723526242774394, + "learning_rate": 9.234968942486945e-05, + "loss": 2.8184, + "step": 16790 + }, + { + "epoch": 0.7817585026887353, + "grad_norm": 0.3756084901239515, + "learning_rate": 9.23482493908258e-05, + "loss": 3.0239, + "step": 16791 + }, + { + "epoch": 0.7818050608748283, + "grad_norm": 0.37274434596751, + "learning_rate": 9.234680923249422e-05, + "loss": 2.9584, + "step": 16792 + }, + { + "epoch": 0.7818516190609214, + "grad_norm": 0.3697476322617309, + "learning_rate": 9.234536894987892e-05, + "loss": 3.0408, + "step": 16793 + }, + { + "epoch": 0.7818981772470145, + "grad_norm": 0.3408020732066769, + "learning_rate": 9.234392854298414e-05, + "loss": 2.9915, + "step": 16794 + }, + { + "epoch": 0.7819447354331075, + "grad_norm": 0.37184554783525714, + "learning_rate": 9.234248801181408e-05, + "loss": 2.952, + "step": 16795 + }, + { + "epoch": 0.7819912936192006, + "grad_norm": 0.3562470301253389, + "learning_rate": 9.234104735637302e-05, + "loss": 2.9759, + "step": 16796 + }, + { + "epoch": 0.7820378518052936, + "grad_norm": 0.32962326858442853, + "learning_rate": 9.233960657666514e-05, + "loss": 2.9569, + "step": 16797 + }, + { + "epoch": 0.7820844099913867, + "grad_norm": 0.34851721566721416, + "learning_rate": 9.233816567269469e-05, + "loss": 3.022, + "step": 16798 + }, + { + "epoch": 0.7821309681774798, + "grad_norm": 0.33593770481329566, + "learning_rate": 9.23367246444659e-05, + "loss": 2.9791, + "step": 16799 + }, + { + "epoch": 0.7821775263635729, + "grad_norm": 0.34955600415202254, + "learning_rate": 9.233528349198298e-05, + "loss": 2.9143, + "step": 16800 + }, + { + "epoch": 
0.782224084549666, + "grad_norm": 0.3771181362418849, + "learning_rate": 9.233384221525019e-05, + "loss": 2.9919, + "step": 16801 + }, + { + "epoch": 0.782270642735759, + "grad_norm": 0.3412008552344815, + "learning_rate": 9.233240081427173e-05, + "loss": 2.8511, + "step": 16802 + }, + { + "epoch": 0.7823172009218521, + "grad_norm": 0.3539629992840061, + "learning_rate": 9.233095928905185e-05, + "loss": 3.0434, + "step": 16803 + }, + { + "epoch": 0.7823637591079452, + "grad_norm": 0.37680189722363866, + "learning_rate": 9.232951763959477e-05, + "loss": 3.0015, + "step": 16804 + }, + { + "epoch": 0.7824103172940382, + "grad_norm": 0.3581669430114493, + "learning_rate": 9.232807586590473e-05, + "loss": 2.9726, + "step": 16805 + }, + { + "epoch": 0.7824568754801313, + "grad_norm": 0.3411128656079906, + "learning_rate": 9.232663396798596e-05, + "loss": 2.8745, + "step": 16806 + }, + { + "epoch": 0.7825034336662243, + "grad_norm": 0.33996142115008915, + "learning_rate": 9.232519194584268e-05, + "loss": 3.0185, + "step": 16807 + }, + { + "epoch": 0.7825499918523174, + "grad_norm": 0.3467641806943622, + "learning_rate": 9.232374979947914e-05, + "loss": 2.9722, + "step": 16808 + }, + { + "epoch": 0.7825965500384106, + "grad_norm": 0.3761085653472771, + "learning_rate": 9.232230752889956e-05, + "loss": 3.1058, + "step": 16809 + }, + { + "epoch": 0.7826431082245036, + "grad_norm": 0.34191631297046843, + "learning_rate": 9.232086513410818e-05, + "loss": 2.9092, + "step": 16810 + }, + { + "epoch": 0.7826896664105967, + "grad_norm": 0.3483313494171868, + "learning_rate": 9.231942261510922e-05, + "loss": 2.8299, + "step": 16811 + }, + { + "epoch": 0.7827362245966897, + "grad_norm": 0.300727411088955, + "learning_rate": 9.231797997190694e-05, + "loss": 2.923, + "step": 16812 + }, + { + "epoch": 0.7827827827827828, + "grad_norm": 0.33495016611523926, + "learning_rate": 9.231653720450555e-05, + "loss": 2.9754, + "step": 16813 + }, + { + "epoch": 0.7828293409688758, + "grad_norm": 0.31172828496130683, + "learning_rate": 9.231509431290929e-05, + "loss": 2.8648, + "step": 16814 + }, + { + "epoch": 0.7828758991549689, + "grad_norm": 0.3681724024746093, + "learning_rate": 9.231365129712239e-05, + "loss": 2.9356, + "step": 16815 + }, + { + "epoch": 0.782922457341062, + "grad_norm": 0.33079510864157885, + "learning_rate": 9.231220815714912e-05, + "loss": 3.0035, + "step": 16816 + }, + { + "epoch": 0.782969015527155, + "grad_norm": 0.3480318438328807, + "learning_rate": 9.231076489299366e-05, + "loss": 2.9696, + "step": 16817 + }, + { + "epoch": 0.7830155737132481, + "grad_norm": 0.3427700034163151, + "learning_rate": 9.230932150466027e-05, + "loss": 2.9124, + "step": 16818 + }, + { + "epoch": 0.7830621318993412, + "grad_norm": 0.3407354338843171, + "learning_rate": 9.230787799215322e-05, + "loss": 2.885, + "step": 16819 + }, + { + "epoch": 0.7831086900854343, + "grad_norm": 0.33417852827017375, + "learning_rate": 9.23064343554767e-05, + "loss": 2.8843, + "step": 16820 + }, + { + "epoch": 0.7831552482715274, + "grad_norm": 0.3805705455213173, + "learning_rate": 9.230499059463495e-05, + "loss": 2.9368, + "step": 16821 + }, + { + "epoch": 0.7832018064576204, + "grad_norm": 0.3418283165417828, + "learning_rate": 9.230354670963224e-05, + "loss": 2.9621, + "step": 16822 + }, + { + "epoch": 0.7832483646437135, + "grad_norm": 0.3791119431000276, + "learning_rate": 9.230210270047277e-05, + "loss": 2.9263, + "step": 16823 + }, + { + "epoch": 0.7832949228298065, + "grad_norm": 0.371179214485825, + "learning_rate": 
9.23006585671608e-05, + "loss": 2.978, + "step": 16824 + }, + { + "epoch": 0.7833414810158996, + "grad_norm": 0.3517107964090744, + "learning_rate": 9.229921430970058e-05, + "loss": 2.9945, + "step": 16825 + }, + { + "epoch": 0.7833880392019927, + "grad_norm": 0.33971575189862374, + "learning_rate": 9.229776992809631e-05, + "loss": 2.8107, + "step": 16826 + }, + { + "epoch": 0.7834345973880857, + "grad_norm": 0.32572557726496487, + "learning_rate": 9.229632542235227e-05, + "loss": 2.9535, + "step": 16827 + }, + { + "epoch": 0.7834811555741789, + "grad_norm": 0.3323923575664761, + "learning_rate": 9.229488079247268e-05, + "loss": 3.0728, + "step": 16828 + }, + { + "epoch": 0.7835277137602719, + "grad_norm": 0.3625731553272446, + "learning_rate": 9.229343603846177e-05, + "loss": 3.0248, + "step": 16829 + }, + { + "epoch": 0.783574271946365, + "grad_norm": 0.3002660264100055, + "learning_rate": 9.22919911603238e-05, + "loss": 2.7896, + "step": 16830 + }, + { + "epoch": 0.7836208301324581, + "grad_norm": 0.3297906951712952, + "learning_rate": 9.2290546158063e-05, + "loss": 2.9973, + "step": 16831 + }, + { + "epoch": 0.7836673883185511, + "grad_norm": 0.39071744462852975, + "learning_rate": 9.228910103168363e-05, + "loss": 3.0171, + "step": 16832 + }, + { + "epoch": 0.7837139465046442, + "grad_norm": 0.31098371009735876, + "learning_rate": 9.22876557811899e-05, + "loss": 2.9436, + "step": 16833 + }, + { + "epoch": 0.7837605046907372, + "grad_norm": 0.35530380665280137, + "learning_rate": 9.228621040658607e-05, + "loss": 2.8889, + "step": 16834 + }, + { + "epoch": 0.7838070628768303, + "grad_norm": 0.3594320104564303, + "learning_rate": 9.228476490787636e-05, + "loss": 2.9445, + "step": 16835 + }, + { + "epoch": 0.7838536210629233, + "grad_norm": 0.37950329489939394, + "learning_rate": 9.228331928506505e-05, + "loss": 2.8888, + "step": 16836 + }, + { + "epoch": 0.7839001792490164, + "grad_norm": 0.3602507009527078, + "learning_rate": 9.228187353815637e-05, + "loss": 2.9734, + "step": 16837 + }, + { + "epoch": 0.7839467374351096, + "grad_norm": 0.37367332876919845, + "learning_rate": 9.228042766715454e-05, + "loss": 3.0463, + "step": 16838 + }, + { + "epoch": 0.7839932956212026, + "grad_norm": 0.3577142461098708, + "learning_rate": 9.227898167206383e-05, + "loss": 2.9624, + "step": 16839 + }, + { + "epoch": 0.7840398538072957, + "grad_norm": 0.3348543923367352, + "learning_rate": 9.227753555288848e-05, + "loss": 3.0521, + "step": 16840 + }, + { + "epoch": 0.7840864119933887, + "grad_norm": 0.3780882212637792, + "learning_rate": 9.22760893096327e-05, + "loss": 2.9379, + "step": 16841 + }, + { + "epoch": 0.7841329701794818, + "grad_norm": 0.3531376550465337, + "learning_rate": 9.227464294230078e-05, + "loss": 3.0218, + "step": 16842 + }, + { + "epoch": 0.7841795283655749, + "grad_norm": 0.3753474132542772, + "learning_rate": 9.227319645089695e-05, + "loss": 2.9365, + "step": 16843 + }, + { + "epoch": 0.7842260865516679, + "grad_norm": 0.3596694373893211, + "learning_rate": 9.227174983542545e-05, + "loss": 2.9167, + "step": 16844 + }, + { + "epoch": 0.784272644737761, + "grad_norm": 0.3379968568742629, + "learning_rate": 9.227030309589053e-05, + "loss": 3.0084, + "step": 16845 + }, + { + "epoch": 0.784319202923854, + "grad_norm": 0.40552883822139835, + "learning_rate": 9.226885623229644e-05, + "loss": 2.9465, + "step": 16846 + }, + { + "epoch": 0.7843657611099472, + "grad_norm": 0.3686022574006584, + "learning_rate": 9.226740924464741e-05, + "loss": 2.955, + "step": 16847 + }, + { + "epoch": 
0.7844123192960403, + "grad_norm": 0.34874216245928297, + "learning_rate": 9.226596213294772e-05, + "loss": 2.9202, + "step": 16848 + }, + { + "epoch": 0.7844588774821333, + "grad_norm": 0.4052194995663283, + "learning_rate": 9.226451489720156e-05, + "loss": 2.8956, + "step": 16849 + }, + { + "epoch": 0.7845054356682264, + "grad_norm": 0.39150763683582657, + "learning_rate": 9.226306753741324e-05, + "loss": 3.019, + "step": 16850 + }, + { + "epoch": 0.7845519938543194, + "grad_norm": 0.35486709119366877, + "learning_rate": 9.226162005358697e-05, + "loss": 3.0108, + "step": 16851 + }, + { + "epoch": 0.7845985520404125, + "grad_norm": 0.3881021003410887, + "learning_rate": 9.226017244572701e-05, + "loss": 3.0172, + "step": 16852 + }, + { + "epoch": 0.7846451102265056, + "grad_norm": 0.34609251770534116, + "learning_rate": 9.22587247138376e-05, + "loss": 2.9694, + "step": 16853 + }, + { + "epoch": 0.7846916684125986, + "grad_norm": 0.3571827853587008, + "learning_rate": 9.225727685792302e-05, + "loss": 2.8238, + "step": 16854 + }, + { + "epoch": 0.7847382265986917, + "grad_norm": 0.3277054247659413, + "learning_rate": 9.225582887798748e-05, + "loss": 2.9451, + "step": 16855 + }, + { + "epoch": 0.7847847847847848, + "grad_norm": 0.371041139357876, + "learning_rate": 9.225438077403523e-05, + "loss": 3.0137, + "step": 16856 + }, + { + "epoch": 0.7848313429708779, + "grad_norm": 0.3938023032485401, + "learning_rate": 9.225293254607055e-05, + "loss": 3.0001, + "step": 16857 + }, + { + "epoch": 0.7848779011569709, + "grad_norm": 0.33341383897395, + "learning_rate": 9.225148419409768e-05, + "loss": 2.9246, + "step": 16858 + }, + { + "epoch": 0.784924459343064, + "grad_norm": 0.4069896411333613, + "learning_rate": 9.225003571812086e-05, + "loss": 3.0066, + "step": 16859 + }, + { + "epoch": 0.7849710175291571, + "grad_norm": 0.4087031739958078, + "learning_rate": 9.224858711814435e-05, + "loss": 3.0015, + "step": 16860 + }, + { + "epoch": 0.7850175757152501, + "grad_norm": 0.34952434317271197, + "learning_rate": 9.224713839417241e-05, + "loss": 2.9959, + "step": 16861 + }, + { + "epoch": 0.7850641339013432, + "grad_norm": 0.3956637705884676, + "learning_rate": 9.224568954620927e-05, + "loss": 2.9479, + "step": 16862 + }, + { + "epoch": 0.7851106920874362, + "grad_norm": 0.33956966990027504, + "learning_rate": 9.224424057425918e-05, + "loss": 2.9111, + "step": 16863 + }, + { + "epoch": 0.7851572502735293, + "grad_norm": 0.37142680566491704, + "learning_rate": 9.224279147832642e-05, + "loss": 2.8936, + "step": 16864 + }, + { + "epoch": 0.7852038084596225, + "grad_norm": 0.37448072028646556, + "learning_rate": 9.224134225841523e-05, + "loss": 2.9361, + "step": 16865 + }, + { + "epoch": 0.7852503666457155, + "grad_norm": 0.37899887136462657, + "learning_rate": 9.223989291452985e-05, + "loss": 2.9692, + "step": 16866 + }, + { + "epoch": 0.7852969248318086, + "grad_norm": 0.37744248226380717, + "learning_rate": 9.223844344667456e-05, + "loss": 3.0503, + "step": 16867 + }, + { + "epoch": 0.7853434830179016, + "grad_norm": 0.3693953511182112, + "learning_rate": 9.223699385485359e-05, + "loss": 2.9806, + "step": 16868 + }, + { + "epoch": 0.7853900412039947, + "grad_norm": 0.36373002551236355, + "learning_rate": 9.22355441390712e-05, + "loss": 2.9155, + "step": 16869 + }, + { + "epoch": 0.7854365993900878, + "grad_norm": 0.39435249277720047, + "learning_rate": 9.223409429933167e-05, + "loss": 3.0367, + "step": 16870 + }, + { + "epoch": 0.7854831575761808, + "grad_norm": 0.33309937666687717, + "learning_rate": 
9.223264433563921e-05, + "loss": 2.9712, + "step": 16871 + }, + { + "epoch": 0.7855297157622739, + "grad_norm": 0.34203385296646266, + "learning_rate": 9.22311942479981e-05, + "loss": 2.9935, + "step": 16872 + }, + { + "epoch": 0.7855762739483669, + "grad_norm": 0.4238736631888933, + "learning_rate": 9.222974403641263e-05, + "loss": 2.834, + "step": 16873 + }, + { + "epoch": 0.78562283213446, + "grad_norm": 0.405147014975672, + "learning_rate": 9.222829370088699e-05, + "loss": 3.0429, + "step": 16874 + }, + { + "epoch": 0.7856693903205532, + "grad_norm": 0.3822331107135635, + "learning_rate": 9.222684324142548e-05, + "loss": 2.9062, + "step": 16875 + }, + { + "epoch": 0.7857159485066462, + "grad_norm": 0.4111970880755779, + "learning_rate": 9.222539265803234e-05, + "loss": 2.9491, + "step": 16876 + }, + { + "epoch": 0.7857625066927393, + "grad_norm": 0.3457096120595929, + "learning_rate": 9.222394195071182e-05, + "loss": 2.8799, + "step": 16877 + }, + { + "epoch": 0.7858090648788323, + "grad_norm": 0.423859919930489, + "learning_rate": 9.222249111946821e-05, + "loss": 2.9767, + "step": 16878 + }, + { + "epoch": 0.7858556230649254, + "grad_norm": 0.38593949791925297, + "learning_rate": 9.222104016430574e-05, + "loss": 2.962, + "step": 16879 + }, + { + "epoch": 0.7859021812510184, + "grad_norm": 0.4046867736099087, + "learning_rate": 9.221958908522867e-05, + "loss": 2.9374, + "step": 16880 + }, + { + "epoch": 0.7859487394371115, + "grad_norm": 0.4019192445245889, + "learning_rate": 9.221813788224127e-05, + "loss": 2.8446, + "step": 16881 + }, + { + "epoch": 0.7859952976232046, + "grad_norm": 0.33994023182540684, + "learning_rate": 9.221668655534781e-05, + "loss": 2.9794, + "step": 16882 + }, + { + "epoch": 0.7860418558092976, + "grad_norm": 0.3986370833828237, + "learning_rate": 9.221523510455252e-05, + "loss": 2.9964, + "step": 16883 + }, + { + "epoch": 0.7860884139953908, + "grad_norm": 0.33208287490449623, + "learning_rate": 9.221378352985968e-05, + "loss": 2.9942, + "step": 16884 + }, + { + "epoch": 0.7861349721814838, + "grad_norm": 0.3671145014864132, + "learning_rate": 9.221233183127354e-05, + "loss": 2.8958, + "step": 16885 + }, + { + "epoch": 0.7861815303675769, + "grad_norm": 0.34155144041594526, + "learning_rate": 9.221088000879836e-05, + "loss": 2.9659, + "step": 16886 + }, + { + "epoch": 0.78622808855367, + "grad_norm": 0.3679662715203456, + "learning_rate": 9.220942806243841e-05, + "loss": 2.9542, + "step": 16887 + }, + { + "epoch": 0.786274646739763, + "grad_norm": 0.3422209018399057, + "learning_rate": 9.220797599219794e-05, + "loss": 2.8281, + "step": 16888 + }, + { + "epoch": 0.7863212049258561, + "grad_norm": 0.3411603385779653, + "learning_rate": 9.220652379808123e-05, + "loss": 2.9079, + "step": 16889 + }, + { + "epoch": 0.7863677631119491, + "grad_norm": 0.34796078213725207, + "learning_rate": 9.220507148009252e-05, + "loss": 3.0123, + "step": 16890 + }, + { + "epoch": 0.7864143212980422, + "grad_norm": 0.36138102352379664, + "learning_rate": 9.220361903823609e-05, + "loss": 2.9424, + "step": 16891 + }, + { + "epoch": 0.7864608794841353, + "grad_norm": 0.32921987382853707, + "learning_rate": 9.22021664725162e-05, + "loss": 2.9955, + "step": 16892 + }, + { + "epoch": 0.7865074376702283, + "grad_norm": 0.36874373893087925, + "learning_rate": 9.220071378293711e-05, + "loss": 2.9739, + "step": 16893 + }, + { + "epoch": 0.7865539958563215, + "grad_norm": 0.3418605146712332, + "learning_rate": 9.219926096950306e-05, + "loss": 2.8938, + "step": 16894 + }, + { + "epoch": 
0.7866005540424145, + "grad_norm": 0.3830938214056547, + "learning_rate": 9.219780803221836e-05, + "loss": 2.9418, + "step": 16895 + }, + { + "epoch": 0.7866471122285076, + "grad_norm": 0.37240323726560215, + "learning_rate": 9.219635497108726e-05, + "loss": 3.018, + "step": 16896 + }, + { + "epoch": 0.7866936704146007, + "grad_norm": 0.3507202242980925, + "learning_rate": 9.219490178611398e-05, + "loss": 2.9618, + "step": 16897 + }, + { + "epoch": 0.7867402286006937, + "grad_norm": 0.38133031795746863, + "learning_rate": 9.219344847730285e-05, + "loss": 2.9322, + "step": 16898 + }, + { + "epoch": 0.7867867867867868, + "grad_norm": 0.37354650333902967, + "learning_rate": 9.219199504465809e-05, + "loss": 3.1341, + "step": 16899 + }, + { + "epoch": 0.7868333449728798, + "grad_norm": 0.33464989865630934, + "learning_rate": 9.219054148818397e-05, + "loss": 2.8969, + "step": 16900 + }, + { + "epoch": 0.7868799031589729, + "grad_norm": 0.38181250557351315, + "learning_rate": 9.21890878078848e-05, + "loss": 3.0211, + "step": 16901 + }, + { + "epoch": 0.7869264613450659, + "grad_norm": 0.3679488369514663, + "learning_rate": 9.218763400376479e-05, + "loss": 3.0269, + "step": 16902 + }, + { + "epoch": 0.7869730195311591, + "grad_norm": 0.3760947925905511, + "learning_rate": 9.218618007582825e-05, + "loss": 3.0201, + "step": 16903 + }, + { + "epoch": 0.7870195777172522, + "grad_norm": 0.37670591669163284, + "learning_rate": 9.21847260240794e-05, + "loss": 3.0003, + "step": 16904 + }, + { + "epoch": 0.7870661359033452, + "grad_norm": 0.36365319029481213, + "learning_rate": 9.218327184852256e-05, + "loss": 2.9046, + "step": 16905 + }, + { + "epoch": 0.7871126940894383, + "grad_norm": 0.34038697678833857, + "learning_rate": 9.218181754916196e-05, + "loss": 2.9165, + "step": 16906 + }, + { + "epoch": 0.7871592522755313, + "grad_norm": 0.360583537662233, + "learning_rate": 9.218036312600187e-05, + "loss": 2.9131, + "step": 16907 + }, + { + "epoch": 0.7872058104616244, + "grad_norm": 0.3598956084399984, + "learning_rate": 9.217890857904659e-05, + "loss": 2.9128, + "step": 16908 + }, + { + "epoch": 0.7872523686477175, + "grad_norm": 0.3584397617935736, + "learning_rate": 9.217745390830036e-05, + "loss": 2.9166, + "step": 16909 + }, + { + "epoch": 0.7872989268338105, + "grad_norm": 0.37671533474826097, + "learning_rate": 9.217599911376746e-05, + "loss": 3.045, + "step": 16910 + }, + { + "epoch": 0.7873454850199036, + "grad_norm": 0.35870317863100754, + "learning_rate": 9.217454419545214e-05, + "loss": 2.9995, + "step": 16911 + }, + { + "epoch": 0.7873920432059967, + "grad_norm": 0.3187394400872924, + "learning_rate": 9.217308915335871e-05, + "loss": 2.8577, + "step": 16912 + }, + { + "epoch": 0.7874386013920898, + "grad_norm": 0.3944921787576989, + "learning_rate": 9.217163398749142e-05, + "loss": 2.9676, + "step": 16913 + }, + { + "epoch": 0.7874851595781829, + "grad_norm": 0.3137572312638268, + "learning_rate": 9.217017869785453e-05, + "loss": 2.9787, + "step": 16914 + }, + { + "epoch": 0.7875317177642759, + "grad_norm": 0.3868336909744262, + "learning_rate": 9.216872328445231e-05, + "loss": 2.9532, + "step": 16915 + }, + { + "epoch": 0.787578275950369, + "grad_norm": 0.3304918108297257, + "learning_rate": 9.216726774728906e-05, + "loss": 2.864, + "step": 16916 + }, + { + "epoch": 0.787624834136462, + "grad_norm": 0.36888527788164444, + "learning_rate": 9.216581208636903e-05, + "loss": 2.8609, + "step": 16917 + }, + { + "epoch": 0.7876713923225551, + "grad_norm": 0.33438513623194965, + "learning_rate": 
9.216435630169649e-05, + "loss": 2.8959, + "step": 16918 + }, + { + "epoch": 0.7877179505086482, + "grad_norm": 0.37733112045062056, + "learning_rate": 9.216290039327572e-05, + "loss": 3.0168, + "step": 16919 + }, + { + "epoch": 0.7877645086947412, + "grad_norm": 0.343412578112596, + "learning_rate": 9.216144436111099e-05, + "loss": 2.879, + "step": 16920 + }, + { + "epoch": 0.7878110668808344, + "grad_norm": 0.3624099702298145, + "learning_rate": 9.215998820520659e-05, + "loss": 3.002, + "step": 16921 + }, + { + "epoch": 0.7878576250669274, + "grad_norm": 0.3611761301563029, + "learning_rate": 9.215853192556677e-05, + "loss": 3.0567, + "step": 16922 + }, + { + "epoch": 0.7879041832530205, + "grad_norm": 0.37523556907247935, + "learning_rate": 9.215707552219579e-05, + "loss": 2.9704, + "step": 16923 + }, + { + "epoch": 0.7879507414391135, + "grad_norm": 0.3525287820115548, + "learning_rate": 9.215561899509796e-05, + "loss": 2.9627, + "step": 16924 + }, + { + "epoch": 0.7879972996252066, + "grad_norm": 0.3664775284516363, + "learning_rate": 9.215416234427753e-05, + "loss": 3.0187, + "step": 16925 + }, + { + "epoch": 0.7880438578112997, + "grad_norm": 0.33994191407134033, + "learning_rate": 9.21527055697388e-05, + "loss": 2.9378, + "step": 16926 + }, + { + "epoch": 0.7880904159973927, + "grad_norm": 0.3533636348506008, + "learning_rate": 9.215124867148605e-05, + "loss": 3.01, + "step": 16927 + }, + { + "epoch": 0.7881369741834858, + "grad_norm": 0.36982870644654897, + "learning_rate": 9.214979164952351e-05, + "loss": 3.0563, + "step": 16928 + }, + { + "epoch": 0.7881835323695788, + "grad_norm": 0.3023413937187773, + "learning_rate": 9.21483345038555e-05, + "loss": 2.948, + "step": 16929 + }, + { + "epoch": 0.788230090555672, + "grad_norm": 0.37455740855910535, + "learning_rate": 9.214687723448627e-05, + "loss": 2.9138, + "step": 16930 + }, + { + "epoch": 0.7882766487417651, + "grad_norm": 0.3624901684065598, + "learning_rate": 9.214541984142011e-05, + "loss": 2.8769, + "step": 16931 + }, + { + "epoch": 0.7883232069278581, + "grad_norm": 0.3575787168424344, + "learning_rate": 9.21439623246613e-05, + "loss": 2.9695, + "step": 16932 + }, + { + "epoch": 0.7883697651139512, + "grad_norm": 0.3281763027488316, + "learning_rate": 9.21425046842141e-05, + "loss": 2.9063, + "step": 16933 + }, + { + "epoch": 0.7884163233000442, + "grad_norm": 0.37273140137766114, + "learning_rate": 9.214104692008282e-05, + "loss": 2.9786, + "step": 16934 + }, + { + "epoch": 0.7884628814861373, + "grad_norm": 0.3547110353245161, + "learning_rate": 9.213958903227171e-05, + "loss": 2.9292, + "step": 16935 + }, + { + "epoch": 0.7885094396722304, + "grad_norm": 0.3782269162450421, + "learning_rate": 9.213813102078506e-05, + "loss": 2.9598, + "step": 16936 + }, + { + "epoch": 0.7885559978583234, + "grad_norm": 0.35099527664577945, + "learning_rate": 9.213667288562713e-05, + "loss": 3.002, + "step": 16937 + }, + { + "epoch": 0.7886025560444165, + "grad_norm": 0.32364216646412985, + "learning_rate": 9.213521462680224e-05, + "loss": 2.9256, + "step": 16938 + }, + { + "epoch": 0.7886491142305095, + "grad_norm": 0.3444372468514789, + "learning_rate": 9.213375624431461e-05, + "loss": 3.051, + "step": 16939 + }, + { + "epoch": 0.7886956724166027, + "grad_norm": 0.3241887318384106, + "learning_rate": 9.21322977381686e-05, + "loss": 3.0594, + "step": 16940 + }, + { + "epoch": 0.7887422306026958, + "grad_norm": 0.3427139615725431, + "learning_rate": 9.213083910836842e-05, + "loss": 3.0372, + "step": 16941 + }, + { + "epoch": 
0.7887887887887888, + "grad_norm": 0.32681580775434504, + "learning_rate": 9.212938035491838e-05, + "loss": 2.889, + "step": 16942 + }, + { + "epoch": 0.7888353469748819, + "grad_norm": 0.3369615993995517, + "learning_rate": 9.212792147782276e-05, + "loss": 2.9881, + "step": 16943 + }, + { + "epoch": 0.7888819051609749, + "grad_norm": 0.35321570588089474, + "learning_rate": 9.212646247708585e-05, + "loss": 3.057, + "step": 16944 + }, + { + "epoch": 0.788928463347068, + "grad_norm": 0.36534532152304666, + "learning_rate": 9.212500335271192e-05, + "loss": 2.9844, + "step": 16945 + }, + { + "epoch": 0.788975021533161, + "grad_norm": 0.36671714407072203, + "learning_rate": 9.212354410470525e-05, + "loss": 2.9699, + "step": 16946 + }, + { + "epoch": 0.7890215797192541, + "grad_norm": 0.3671549220875902, + "learning_rate": 9.212208473307014e-05, + "loss": 3.0626, + "step": 16947 + }, + { + "epoch": 0.7890681379053472, + "grad_norm": 0.34666222431447347, + "learning_rate": 9.212062523781086e-05, + "loss": 2.8993, + "step": 16948 + }, + { + "epoch": 0.7891146960914402, + "grad_norm": 0.3225810385566303, + "learning_rate": 9.211916561893169e-05, + "loss": 2.9173, + "step": 16949 + }, + { + "epoch": 0.7891612542775334, + "grad_norm": 0.36841180260946327, + "learning_rate": 9.21177058764369e-05, + "loss": 2.9429, + "step": 16950 + }, + { + "epoch": 0.7892078124636264, + "grad_norm": 0.32042272663482313, + "learning_rate": 9.211624601033081e-05, + "loss": 2.9146, + "step": 16951 + }, + { + "epoch": 0.7892543706497195, + "grad_norm": 0.3794133459584282, + "learning_rate": 9.211478602061769e-05, + "loss": 3.0109, + "step": 16952 + }, + { + "epoch": 0.7893009288358126, + "grad_norm": 0.3160111340312022, + "learning_rate": 9.211332590730181e-05, + "loss": 2.9948, + "step": 16953 + }, + { + "epoch": 0.7893474870219056, + "grad_norm": 0.37579629332219827, + "learning_rate": 9.211186567038748e-05, + "loss": 3.0052, + "step": 16954 + }, + { + "epoch": 0.7893940452079987, + "grad_norm": 0.34466459463830107, + "learning_rate": 9.211040530987896e-05, + "loss": 2.9171, + "step": 16955 + }, + { + "epoch": 0.7894406033940917, + "grad_norm": 0.3491703555017938, + "learning_rate": 9.210894482578057e-05, + "loss": 2.9609, + "step": 16956 + }, + { + "epoch": 0.7894871615801848, + "grad_norm": 0.33433712262150594, + "learning_rate": 9.210748421809655e-05, + "loss": 2.9488, + "step": 16957 + }, + { + "epoch": 0.789533719766278, + "grad_norm": 0.3415451355193726, + "learning_rate": 9.210602348683122e-05, + "loss": 2.983, + "step": 16958 + }, + { + "epoch": 0.789580277952371, + "grad_norm": 0.35772775127022116, + "learning_rate": 9.210456263198886e-05, + "loss": 3.0029, + "step": 16959 + }, + { + "epoch": 0.7896268361384641, + "grad_norm": 0.384711136323848, + "learning_rate": 9.210310165357376e-05, + "loss": 2.887, + "step": 16960 + }, + { + "epoch": 0.7896733943245571, + "grad_norm": 0.36092454147690395, + "learning_rate": 9.210164055159021e-05, + "loss": 2.9893, + "step": 16961 + }, + { + "epoch": 0.7897199525106502, + "grad_norm": 0.3810225524945762, + "learning_rate": 9.210017932604249e-05, + "loss": 2.9524, + "step": 16962 + }, + { + "epoch": 0.7897665106967433, + "grad_norm": 0.3559509055931574, + "learning_rate": 9.209871797693487e-05, + "loss": 2.882, + "step": 16963 + }, + { + "epoch": 0.7898130688828363, + "grad_norm": 0.3301047611556468, + "learning_rate": 9.209725650427169e-05, + "loss": 3.0474, + "step": 16964 + }, + { + "epoch": 0.7898596270689294, + "grad_norm": 0.3398045776522925, + "learning_rate": 
9.20957949080572e-05, + "loss": 2.9559, + "step": 16965 + }, + { + "epoch": 0.7899061852550224, + "grad_norm": 0.332185420723007, + "learning_rate": 9.20943331882957e-05, + "loss": 2.9364, + "step": 16966 + }, + { + "epoch": 0.7899527434411155, + "grad_norm": 0.34138715856844515, + "learning_rate": 9.209287134499147e-05, + "loss": 2.9256, + "step": 16967 + }, + { + "epoch": 0.7899993016272085, + "grad_norm": 0.37780024345576074, + "learning_rate": 9.209140937814882e-05, + "loss": 2.9383, + "step": 16968 + }, + { + "epoch": 0.7900458598133017, + "grad_norm": 0.35080638544204845, + "learning_rate": 9.208994728777202e-05, + "loss": 2.9059, + "step": 16969 + }, + { + "epoch": 0.7900924179993948, + "grad_norm": 0.3703757948908034, + "learning_rate": 9.208848507386538e-05, + "loss": 2.8915, + "step": 16970 + }, + { + "epoch": 0.7901389761854878, + "grad_norm": 0.38650299365073526, + "learning_rate": 9.208702273643317e-05, + "loss": 2.8608, + "step": 16971 + }, + { + "epoch": 0.7901855343715809, + "grad_norm": 0.37290703934676317, + "learning_rate": 9.208556027547971e-05, + "loss": 3.0072, + "step": 16972 + }, + { + "epoch": 0.7902320925576739, + "grad_norm": 0.3595649373740225, + "learning_rate": 9.208409769100928e-05, + "loss": 2.9289, + "step": 16973 + }, + { + "epoch": 0.790278650743767, + "grad_norm": 0.36981508527482093, + "learning_rate": 9.208263498302614e-05, + "loss": 2.9997, + "step": 16974 + }, + { + "epoch": 0.7903252089298601, + "grad_norm": 0.36458892185056335, + "learning_rate": 9.208117215153465e-05, + "loss": 2.9451, + "step": 16975 + }, + { + "epoch": 0.7903717671159531, + "grad_norm": 0.407032224621676, + "learning_rate": 9.207970919653903e-05, + "loss": 2.992, + "step": 16976 + }, + { + "epoch": 0.7904183253020463, + "grad_norm": 0.332009994273827, + "learning_rate": 9.207824611804364e-05, + "loss": 2.9096, + "step": 16977 + }, + { + "epoch": 0.7904648834881393, + "grad_norm": 0.4044601276504722, + "learning_rate": 9.207678291605273e-05, + "loss": 3.0183, + "step": 16978 + }, + { + "epoch": 0.7905114416742324, + "grad_norm": 0.3567607033866071, + "learning_rate": 9.20753195905706e-05, + "loss": 3.0435, + "step": 16979 + }, + { + "epoch": 0.7905579998603255, + "grad_norm": 0.3615780291708891, + "learning_rate": 9.207385614160155e-05, + "loss": 3.0358, + "step": 16980 + }, + { + "epoch": 0.7906045580464185, + "grad_norm": 0.3464226725803721, + "learning_rate": 9.20723925691499e-05, + "loss": 2.8882, + "step": 16981 + }, + { + "epoch": 0.7906511162325116, + "grad_norm": 0.3243142981972423, + "learning_rate": 9.207092887321991e-05, + "loss": 2.9069, + "step": 16982 + }, + { + "epoch": 0.7906976744186046, + "grad_norm": 0.3455793997555106, + "learning_rate": 9.206946505381587e-05, + "loss": 3.0763, + "step": 16983 + }, + { + "epoch": 0.7907442326046977, + "grad_norm": 0.2927379910121721, + "learning_rate": 9.206800111094211e-05, + "loss": 3.0634, + "step": 16984 + }, + { + "epoch": 0.7907907907907908, + "grad_norm": 0.3614148795752591, + "learning_rate": 9.206653704460293e-05, + "loss": 2.9299, + "step": 16985 + }, + { + "epoch": 0.7908373489768838, + "grad_norm": 0.31567492160774885, + "learning_rate": 9.206507285480257e-05, + "loss": 2.9235, + "step": 16986 + }, + { + "epoch": 0.790883907162977, + "grad_norm": 0.32719973267889296, + "learning_rate": 9.20636085415454e-05, + "loss": 2.9135, + "step": 16987 + }, + { + "epoch": 0.79093046534907, + "grad_norm": 0.35004469791992526, + "learning_rate": 9.206214410483566e-05, + "loss": 3.0029, + "step": 16988 + }, + { + "epoch": 
0.7909770235351631, + "grad_norm": 0.3561163765446393, + "learning_rate": 9.206067954467769e-05, + "loss": 2.9144, + "step": 16989 + }, + { + "epoch": 0.7910235817212561, + "grad_norm": 0.33991371489900574, + "learning_rate": 9.205921486107576e-05, + "loss": 2.9635, + "step": 16990 + }, + { + "epoch": 0.7910701399073492, + "grad_norm": 0.35401451919340987, + "learning_rate": 9.205775005403417e-05, + "loss": 2.9604, + "step": 16991 + }, + { + "epoch": 0.7911166980934423, + "grad_norm": 0.37029545363110644, + "learning_rate": 9.205628512355724e-05, + "loss": 2.9427, + "step": 16992 + }, + { + "epoch": 0.7911632562795353, + "grad_norm": 0.3451328627296031, + "learning_rate": 9.205482006964926e-05, + "loss": 2.9076, + "step": 16993 + }, + { + "epoch": 0.7912098144656284, + "grad_norm": 0.37797760164466887, + "learning_rate": 9.20533548923145e-05, + "loss": 2.9678, + "step": 16994 + }, + { + "epoch": 0.7912563726517214, + "grad_norm": 0.35542941377812676, + "learning_rate": 9.205188959155731e-05, + "loss": 2.9544, + "step": 16995 + }, + { + "epoch": 0.7913029308378146, + "grad_norm": 0.38440747620054994, + "learning_rate": 9.205042416738195e-05, + "loss": 3.0638, + "step": 16996 + }, + { + "epoch": 0.7913494890239077, + "grad_norm": 0.33653729578891944, + "learning_rate": 9.204895861979274e-05, + "loss": 3.0107, + "step": 16997 + }, + { + "epoch": 0.7913960472100007, + "grad_norm": 0.3377160918063823, + "learning_rate": 9.2047492948794e-05, + "loss": 2.9241, + "step": 16998 + }, + { + "epoch": 0.7914426053960938, + "grad_norm": 0.40017190841300887, + "learning_rate": 9.204602715438999e-05, + "loss": 3.0461, + "step": 16999 + }, + { + "epoch": 0.7914891635821868, + "grad_norm": 0.32510352484172866, + "learning_rate": 9.204456123658505e-05, + "loss": 2.9275, + "step": 17000 + }, + { + "epoch": 0.7915357217682799, + "grad_norm": 0.38461742911003177, + "learning_rate": 9.204309519538345e-05, + "loss": 3.0384, + "step": 17001 + }, + { + "epoch": 0.791582279954373, + "grad_norm": 0.3598537751119252, + "learning_rate": 9.20416290307895e-05, + "loss": 2.8559, + "step": 17002 + }, + { + "epoch": 0.791628838140466, + "grad_norm": 0.3379919646885507, + "learning_rate": 9.204016274280753e-05, + "loss": 2.9448, + "step": 17003 + }, + { + "epoch": 0.7916753963265591, + "grad_norm": 0.3404310917343979, + "learning_rate": 9.203869633144182e-05, + "loss": 2.886, + "step": 17004 + }, + { + "epoch": 0.7917219545126521, + "grad_norm": 0.35481254809878326, + "learning_rate": 9.203722979669667e-05, + "loss": 2.9493, + "step": 17005 + }, + { + "epoch": 0.7917685126987453, + "grad_norm": 0.351757346192715, + "learning_rate": 9.20357631385764e-05, + "loss": 2.9223, + "step": 17006 + }, + { + "epoch": 0.7918150708848384, + "grad_norm": 0.3424298019965008, + "learning_rate": 9.20342963570853e-05, + "loss": 2.9597, + "step": 17007 + }, + { + "epoch": 0.7918616290709314, + "grad_norm": 0.34119765483372405, + "learning_rate": 9.203282945222769e-05, + "loss": 3.009, + "step": 17008 + }, + { + "epoch": 0.7919081872570245, + "grad_norm": 0.342409875024772, + "learning_rate": 9.203136242400785e-05, + "loss": 2.956, + "step": 17009 + }, + { + "epoch": 0.7919547454431175, + "grad_norm": 0.33118841430365464, + "learning_rate": 9.20298952724301e-05, + "loss": 2.9162, + "step": 17010 + }, + { + "epoch": 0.7920013036292106, + "grad_norm": 0.3448156372412934, + "learning_rate": 9.202842799749878e-05, + "loss": 2.9456, + "step": 17011 + }, + { + "epoch": 0.7920478618153036, + "grad_norm": 0.3248386047896591, + "learning_rate": 
9.202696059921813e-05, + "loss": 2.9333, + "step": 17012 + }, + { + "epoch": 0.7920944200013967, + "grad_norm": 0.32885243655800045, + "learning_rate": 9.202549307759252e-05, + "loss": 2.9115, + "step": 17013 + }, + { + "epoch": 0.7921409781874899, + "grad_norm": 0.31359909699470334, + "learning_rate": 9.20240254326262e-05, + "loss": 2.9914, + "step": 17014 + }, + { + "epoch": 0.7921875363735829, + "grad_norm": 0.34685667462524106, + "learning_rate": 9.202255766432353e-05, + "loss": 2.802, + "step": 17015 + }, + { + "epoch": 0.792234094559676, + "grad_norm": 0.35513362403418497, + "learning_rate": 9.202108977268877e-05, + "loss": 2.9116, + "step": 17016 + }, + { + "epoch": 0.792280652745769, + "grad_norm": 0.34417224886153613, + "learning_rate": 9.201962175772627e-05, + "loss": 3.0016, + "step": 17017 + }, + { + "epoch": 0.7923272109318621, + "grad_norm": 0.35044485492880717, + "learning_rate": 9.201815361944031e-05, + "loss": 2.9485, + "step": 17018 + }, + { + "epoch": 0.7923737691179552, + "grad_norm": 0.41032485128592305, + "learning_rate": 9.20166853578352e-05, + "loss": 3.0289, + "step": 17019 + }, + { + "epoch": 0.7924203273040482, + "grad_norm": 0.3550319558964126, + "learning_rate": 9.201521697291528e-05, + "loss": 2.9713, + "step": 17020 + }, + { + "epoch": 0.7924668854901413, + "grad_norm": 0.40323959733489667, + "learning_rate": 9.201374846468482e-05, + "loss": 2.9614, + "step": 17021 + }, + { + "epoch": 0.7925134436762343, + "grad_norm": 0.3435462472256235, + "learning_rate": 9.201227983314818e-05, + "loss": 2.8771, + "step": 17022 + }, + { + "epoch": 0.7925600018623274, + "grad_norm": 0.3765507374814863, + "learning_rate": 9.20108110783096e-05, + "loss": 2.9814, + "step": 17023 + }, + { + "epoch": 0.7926065600484206, + "grad_norm": 0.3816376623158065, + "learning_rate": 9.200934220017344e-05, + "loss": 2.9596, + "step": 17024 + }, + { + "epoch": 0.7926531182345136, + "grad_norm": 0.3487509150002247, + "learning_rate": 9.2007873198744e-05, + "loss": 2.9593, + "step": 17025 + }, + { + "epoch": 0.7926996764206067, + "grad_norm": 0.3968396503370767, + "learning_rate": 9.200640407402558e-05, + "loss": 3.0023, + "step": 17026 + }, + { + "epoch": 0.7927462346066997, + "grad_norm": 0.35596704558662745, + "learning_rate": 9.200493482602252e-05, + "loss": 2.9368, + "step": 17027 + }, + { + "epoch": 0.7927927927927928, + "grad_norm": 0.36226471658108034, + "learning_rate": 9.20034654547391e-05, + "loss": 2.9387, + "step": 17028 + }, + { + "epoch": 0.7928393509788859, + "grad_norm": 0.35125308327573257, + "learning_rate": 9.200199596017966e-05, + "loss": 2.9774, + "step": 17029 + }, + { + "epoch": 0.7928859091649789, + "grad_norm": 0.3684905310820727, + "learning_rate": 9.200052634234849e-05, + "loss": 3.0261, + "step": 17030 + }, + { + "epoch": 0.792932467351072, + "grad_norm": 0.3572137974370763, + "learning_rate": 9.199905660124992e-05, + "loss": 2.9725, + "step": 17031 + }, + { + "epoch": 0.792979025537165, + "grad_norm": 0.36563774488694323, + "learning_rate": 9.199758673688825e-05, + "loss": 2.9447, + "step": 17032 + }, + { + "epoch": 0.7930255837232582, + "grad_norm": 0.3231119306996576, + "learning_rate": 9.19961167492678e-05, + "loss": 2.9646, + "step": 17033 + }, + { + "epoch": 0.7930721419093512, + "grad_norm": 0.3776650266771477, + "learning_rate": 9.199464663839288e-05, + "loss": 2.991, + "step": 17034 + }, + { + "epoch": 0.7931187000954443, + "grad_norm": 0.31881389472550387, + "learning_rate": 9.199317640426783e-05, + "loss": 2.9377, + "step": 17035 + }, + { + "epoch": 
0.7931652582815374, + "grad_norm": 0.3527779954193689, + "learning_rate": 9.199170604689693e-05, + "loss": 2.8914, + "step": 17036 + }, + { + "epoch": 0.7932118164676304, + "grad_norm": 0.31740772459083094, + "learning_rate": 9.199023556628452e-05, + "loss": 2.9565, + "step": 17037 + }, + { + "epoch": 0.7932583746537235, + "grad_norm": 0.34275977953489883, + "learning_rate": 9.19887649624349e-05, + "loss": 2.8816, + "step": 17038 + }, + { + "epoch": 0.7933049328398165, + "grad_norm": 0.34661616715820726, + "learning_rate": 9.198729423535238e-05, + "loss": 2.9928, + "step": 17039 + }, + { + "epoch": 0.7933514910259096, + "grad_norm": 0.3332989269400083, + "learning_rate": 9.19858233850413e-05, + "loss": 2.9166, + "step": 17040 + }, + { + "epoch": 0.7933980492120027, + "grad_norm": 0.3897016499039983, + "learning_rate": 9.198435241150595e-05, + "loss": 2.9527, + "step": 17041 + }, + { + "epoch": 0.7934446073980957, + "grad_norm": 0.33671014596354937, + "learning_rate": 9.198288131475069e-05, + "loss": 2.8959, + "step": 17042 + }, + { + "epoch": 0.7934911655841889, + "grad_norm": 0.3855774714349201, + "learning_rate": 9.198141009477978e-05, + "loss": 2.9128, + "step": 17043 + }, + { + "epoch": 0.7935377237702819, + "grad_norm": 0.3662173497328488, + "learning_rate": 9.197993875159757e-05, + "loss": 2.9453, + "step": 17044 + }, + { + "epoch": 0.793584281956375, + "grad_norm": 0.34894934272398587, + "learning_rate": 9.197846728520838e-05, + "loss": 2.9055, + "step": 17045 + }, + { + "epoch": 0.7936308401424681, + "grad_norm": 0.338742853810047, + "learning_rate": 9.197699569561653e-05, + "loss": 2.9337, + "step": 17046 + }, + { + "epoch": 0.7936773983285611, + "grad_norm": 0.36180220629490867, + "learning_rate": 9.197552398282634e-05, + "loss": 2.9671, + "step": 17047 + }, + { + "epoch": 0.7937239565146542, + "grad_norm": 0.41538858401917483, + "learning_rate": 9.197405214684211e-05, + "loss": 3.0347, + "step": 17048 + }, + { + "epoch": 0.7937705147007472, + "grad_norm": 0.3515616708932946, + "learning_rate": 9.197258018766818e-05, + "loss": 2.9088, + "step": 17049 + }, + { + "epoch": 0.7938170728868403, + "grad_norm": 0.34248128620009294, + "learning_rate": 9.197110810530885e-05, + "loss": 2.9893, + "step": 17050 + }, + { + "epoch": 0.7938636310729335, + "grad_norm": 0.34829935952870966, + "learning_rate": 9.196963589976846e-05, + "loss": 2.8589, + "step": 17051 + }, + { + "epoch": 0.7939101892590265, + "grad_norm": 0.34119335505213993, + "learning_rate": 9.196816357105133e-05, + "loss": 2.9641, + "step": 17052 + }, + { + "epoch": 0.7939567474451196, + "grad_norm": 0.36670991425562804, + "learning_rate": 9.196669111916176e-05, + "loss": 3.0496, + "step": 17053 + }, + { + "epoch": 0.7940033056312126, + "grad_norm": 0.36011467491601157, + "learning_rate": 9.19652185441041e-05, + "loss": 2.9285, + "step": 17054 + }, + { + "epoch": 0.7940498638173057, + "grad_norm": 0.38721559619817425, + "learning_rate": 9.196374584588265e-05, + "loss": 2.9649, + "step": 17055 + }, + { + "epoch": 0.7940964220033987, + "grad_norm": 0.3755230381889403, + "learning_rate": 9.196227302450174e-05, + "loss": 2.8751, + "step": 17056 + }, + { + "epoch": 0.7941429801894918, + "grad_norm": 0.37614236180482674, + "learning_rate": 9.19608000799657e-05, + "loss": 2.9208, + "step": 17057 + }, + { + "epoch": 0.7941895383755849, + "grad_norm": 0.4153737794979712, + "learning_rate": 9.195932701227883e-05, + "loss": 3.0631, + "step": 17058 + }, + { + "epoch": 0.7942360965616779, + "grad_norm": 0.3531790052139635, + "learning_rate": 
9.195785382144549e-05, + "loss": 2.9459, + "step": 17059 + }, + { + "epoch": 0.794282654747771, + "grad_norm": 0.3767231284911907, + "learning_rate": 9.195638050746997e-05, + "loss": 3.0186, + "step": 17060 + }, + { + "epoch": 0.794329212933864, + "grad_norm": 0.4150894111174865, + "learning_rate": 9.19549070703566e-05, + "loss": 2.9208, + "step": 17061 + }, + { + "epoch": 0.7943757711199572, + "grad_norm": 0.35628527108590763, + "learning_rate": 9.195343351010972e-05, + "loss": 2.8479, + "step": 17062 + }, + { + "epoch": 0.7944223293060503, + "grad_norm": 0.3539929677952295, + "learning_rate": 9.195195982673365e-05, + "loss": 2.8585, + "step": 17063 + }, + { + "epoch": 0.7944688874921433, + "grad_norm": 0.36190998400124197, + "learning_rate": 9.195048602023271e-05, + "loss": 2.9829, + "step": 17064 + }, + { + "epoch": 0.7945154456782364, + "grad_norm": 0.3556383039080723, + "learning_rate": 9.194901209061122e-05, + "loss": 3.003, + "step": 17065 + }, + { + "epoch": 0.7945620038643294, + "grad_norm": 0.3492905836464809, + "learning_rate": 9.194753803787351e-05, + "loss": 2.9618, + "step": 17066 + }, + { + "epoch": 0.7946085620504225, + "grad_norm": 0.3540103703722519, + "learning_rate": 9.194606386202391e-05, + "loss": 2.9446, + "step": 17067 + }, + { + "epoch": 0.7946551202365156, + "grad_norm": 0.3524945715351994, + "learning_rate": 9.194458956306675e-05, + "loss": 3.0667, + "step": 17068 + }, + { + "epoch": 0.7947016784226086, + "grad_norm": 0.32750461726873253, + "learning_rate": 9.194311514100636e-05, + "loss": 2.9399, + "step": 17069 + }, + { + "epoch": 0.7947482366087018, + "grad_norm": 0.33166801192661455, + "learning_rate": 9.194164059584704e-05, + "loss": 2.9555, + "step": 17070 + }, + { + "epoch": 0.7947947947947948, + "grad_norm": 0.3242370996188148, + "learning_rate": 9.194016592759315e-05, + "loss": 2.9422, + "step": 17071 + }, + { + "epoch": 0.7948413529808879, + "grad_norm": 0.3921210460295057, + "learning_rate": 9.1938691136249e-05, + "loss": 2.9394, + "step": 17072 + }, + { + "epoch": 0.794887911166981, + "grad_norm": 0.33299605372229396, + "learning_rate": 9.193721622181892e-05, + "loss": 2.8932, + "step": 17073 + }, + { + "epoch": 0.794934469353074, + "grad_norm": 0.3579181350043245, + "learning_rate": 9.193574118430724e-05, + "loss": 2.9536, + "step": 17074 + }, + { + "epoch": 0.7949810275391671, + "grad_norm": 0.34630241470137746, + "learning_rate": 9.19342660237183e-05, + "loss": 3.0641, + "step": 17075 + }, + { + "epoch": 0.7950275857252601, + "grad_norm": 0.34091416382421597, + "learning_rate": 9.193279074005641e-05, + "loss": 2.9873, + "step": 17076 + }, + { + "epoch": 0.7950741439113532, + "grad_norm": 0.361812924924746, + "learning_rate": 9.193131533332592e-05, + "loss": 3.0185, + "step": 17077 + }, + { + "epoch": 0.7951207020974462, + "grad_norm": 0.3542000877902473, + "learning_rate": 9.192983980353115e-05, + "loss": 2.9208, + "step": 17078 + }, + { + "epoch": 0.7951672602835393, + "grad_norm": 0.3088883659692688, + "learning_rate": 9.192836415067643e-05, + "loss": 2.9124, + "step": 17079 + }, + { + "epoch": 0.7952138184696325, + "grad_norm": 0.3734716699172936, + "learning_rate": 9.192688837476608e-05, + "loss": 2.9419, + "step": 17080 + }, + { + "epoch": 0.7952603766557255, + "grad_norm": 0.3263639869977278, + "learning_rate": 9.192541247580446e-05, + "loss": 2.9891, + "step": 17081 + }, + { + "epoch": 0.7953069348418186, + "grad_norm": 0.3727528867180131, + "learning_rate": 9.192393645379589e-05, + "loss": 2.9036, + "step": 17082 + }, + { + "epoch": 
0.7953534930279116, + "grad_norm": 0.3740315835923071, + "learning_rate": 9.192246030874467e-05, + "loss": 2.8862, + "step": 17083 + }, + { + "epoch": 0.7954000512140047, + "grad_norm": 0.3328385120268321, + "learning_rate": 9.192098404065518e-05, + "loss": 2.891, + "step": 17084 + }, + { + "epoch": 0.7954466094000978, + "grad_norm": 0.39277322269270837, + "learning_rate": 9.191950764953172e-05, + "loss": 3.0168, + "step": 17085 + }, + { + "epoch": 0.7954931675861908, + "grad_norm": 0.34422761043932326, + "learning_rate": 9.191803113537864e-05, + "loss": 2.9874, + "step": 17086 + }, + { + "epoch": 0.7955397257722839, + "grad_norm": 0.35054409581366874, + "learning_rate": 9.191655449820028e-05, + "loss": 2.9957, + "step": 17087 + }, + { + "epoch": 0.7955862839583769, + "grad_norm": 0.36329239562961047, + "learning_rate": 9.191507773800095e-05, + "loss": 3.0334, + "step": 17088 + }, + { + "epoch": 0.79563284214447, + "grad_norm": 0.366140575070601, + "learning_rate": 9.191360085478499e-05, + "loss": 2.9394, + "step": 17089 + }, + { + "epoch": 0.7956794003305632, + "grad_norm": 0.361124925947669, + "learning_rate": 9.191212384855675e-05, + "loss": 2.9294, + "step": 17090 + }, + { + "epoch": 0.7957259585166562, + "grad_norm": 0.359154109452593, + "learning_rate": 9.191064671932054e-05, + "loss": 2.9174, + "step": 17091 + }, + { + "epoch": 0.7957725167027493, + "grad_norm": 0.34162386900838687, + "learning_rate": 9.190916946708074e-05, + "loss": 2.8996, + "step": 17092 + }, + { + "epoch": 0.7958190748888423, + "grad_norm": 0.4341730558548874, + "learning_rate": 9.190769209184164e-05, + "loss": 3.1635, + "step": 17093 + }, + { + "epoch": 0.7958656330749354, + "grad_norm": 0.3532009617692015, + "learning_rate": 9.19062145936076e-05, + "loss": 2.8888, + "step": 17094 + }, + { + "epoch": 0.7959121912610285, + "grad_norm": 0.38266886238168074, + "learning_rate": 9.190473697238293e-05, + "loss": 2.886, + "step": 17095 + }, + { + "epoch": 0.7959587494471215, + "grad_norm": 0.3536242362217138, + "learning_rate": 9.1903259228172e-05, + "loss": 2.9925, + "step": 17096 + }, + { + "epoch": 0.7960053076332146, + "grad_norm": 0.38103881602055667, + "learning_rate": 9.190178136097912e-05, + "loss": 3.0019, + "step": 17097 + }, + { + "epoch": 0.7960518658193076, + "grad_norm": 0.38000561252104315, + "learning_rate": 9.190030337080866e-05, + "loss": 3.0369, + "step": 17098 + }, + { + "epoch": 0.7960984240054008, + "grad_norm": 0.3702571049591049, + "learning_rate": 9.18988252576649e-05, + "loss": 2.9449, + "step": 17099 + }, + { + "epoch": 0.7961449821914938, + "grad_norm": 0.4211036564855575, + "learning_rate": 9.189734702155226e-05, + "loss": 3.0233, + "step": 17100 + }, + { + "epoch": 0.7961915403775869, + "grad_norm": 0.3711357791953224, + "learning_rate": 9.189586866247501e-05, + "loss": 3.0326, + "step": 17101 + }, + { + "epoch": 0.79623809856368, + "grad_norm": 0.38919940486860843, + "learning_rate": 9.189439018043753e-05, + "loss": 2.8535, + "step": 17102 + }, + { + "epoch": 0.796284656749773, + "grad_norm": 0.3821131223861108, + "learning_rate": 9.189291157544412e-05, + "loss": 3.0459, + "step": 17103 + }, + { + "epoch": 0.7963312149358661, + "grad_norm": 0.34233476768129634, + "learning_rate": 9.189143284749915e-05, + "loss": 2.929, + "step": 17104 + }, + { + "epoch": 0.7963777731219591, + "grad_norm": 0.39249280007794685, + "learning_rate": 9.188995399660696e-05, + "loss": 2.9836, + "step": 17105 + }, + { + "epoch": 0.7964243313080522, + "grad_norm": 0.38085924567088386, + "learning_rate": 
9.188847502277188e-05, + "loss": 2.9533, + "step": 17106 + }, + { + "epoch": 0.7964708894941454, + "grad_norm": 0.3729334384234763, + "learning_rate": 9.188699592599825e-05, + "loss": 2.979, + "step": 17107 + }, + { + "epoch": 0.7965174476802384, + "grad_norm": 0.3621705384062135, + "learning_rate": 9.18855167062904e-05, + "loss": 2.9811, + "step": 17108 + }, + { + "epoch": 0.7965640058663315, + "grad_norm": 0.3492690614185133, + "learning_rate": 9.18840373636527e-05, + "loss": 2.985, + "step": 17109 + }, + { + "epoch": 0.7966105640524245, + "grad_norm": 0.3261621819275892, + "learning_rate": 9.188255789808949e-05, + "loss": 2.8426, + "step": 17110 + }, + { + "epoch": 0.7966571222385176, + "grad_norm": 0.35822979625365436, + "learning_rate": 9.188107830960507e-05, + "loss": 3.0202, + "step": 17111 + }, + { + "epoch": 0.7967036804246107, + "grad_norm": 0.32305970325051375, + "learning_rate": 9.187959859820382e-05, + "loss": 2.8205, + "step": 17112 + }, + { + "epoch": 0.7967502386107037, + "grad_norm": 0.3547053971074017, + "learning_rate": 9.187811876389007e-05, + "loss": 2.9766, + "step": 17113 + }, + { + "epoch": 0.7967967967967968, + "grad_norm": 0.3209364195686748, + "learning_rate": 9.187663880666818e-05, + "loss": 2.9825, + "step": 17114 + }, + { + "epoch": 0.7968433549828898, + "grad_norm": 0.33660445520933924, + "learning_rate": 9.187515872654247e-05, + "loss": 2.8914, + "step": 17115 + }, + { + "epoch": 0.7968899131689829, + "grad_norm": 0.3255042913149292, + "learning_rate": 9.18736785235173e-05, + "loss": 3.0341, + "step": 17116 + }, + { + "epoch": 0.7969364713550761, + "grad_norm": 0.326198660942148, + "learning_rate": 9.187219819759701e-05, + "loss": 2.863, + "step": 17117 + }, + { + "epoch": 0.7969830295411691, + "grad_norm": 0.29963954818545363, + "learning_rate": 9.187071774878596e-05, + "loss": 2.9141, + "step": 17118 + }, + { + "epoch": 0.7970295877272622, + "grad_norm": 0.33003104982789266, + "learning_rate": 9.186923717708845e-05, + "loss": 3.0762, + "step": 17119 + }, + { + "epoch": 0.7970761459133552, + "grad_norm": 0.3334189057858108, + "learning_rate": 9.186775648250884e-05, + "loss": 2.8569, + "step": 17120 + }, + { + "epoch": 0.7971227040994483, + "grad_norm": 0.322605254410121, + "learning_rate": 9.186627566505153e-05, + "loss": 2.9432, + "step": 17121 + }, + { + "epoch": 0.7971692622855413, + "grad_norm": 0.3169288464104275, + "learning_rate": 9.18647947247208e-05, + "loss": 2.9591, + "step": 17122 + }, + { + "epoch": 0.7972158204716344, + "grad_norm": 0.3211470924234391, + "learning_rate": 9.186331366152103e-05, + "loss": 3.0428, + "step": 17123 + }, + { + "epoch": 0.7972623786577275, + "grad_norm": 0.31579436192914806, + "learning_rate": 9.186183247545656e-05, + "loss": 2.9555, + "step": 17124 + }, + { + "epoch": 0.7973089368438205, + "grad_norm": 0.3328544818174197, + "learning_rate": 9.186035116653173e-05, + "loss": 2.9372, + "step": 17125 + }, + { + "epoch": 0.7973554950299137, + "grad_norm": 0.3225680754895832, + "learning_rate": 9.185886973475091e-05, + "loss": 3.0992, + "step": 17126 + }, + { + "epoch": 0.7974020532160067, + "grad_norm": 0.35714043229159326, + "learning_rate": 9.185738818011842e-05, + "loss": 2.9414, + "step": 17127 + }, + { + "epoch": 0.7974486114020998, + "grad_norm": 0.3397013865573558, + "learning_rate": 9.185590650263863e-05, + "loss": 2.937, + "step": 17128 + }, + { + "epoch": 0.7974951695881929, + "grad_norm": 0.3640292710722367, + "learning_rate": 9.185442470231584e-05, + "loss": 2.8379, + "step": 17129 + }, + { + "epoch": 
0.7975417277742859, + "grad_norm": 0.32726184984378454, + "learning_rate": 9.185294277915447e-05, + "loss": 3.0252, + "step": 17130 + }, + { + "epoch": 0.797588285960379, + "grad_norm": 0.34046176817079, + "learning_rate": 9.185146073315883e-05, + "loss": 2.8786, + "step": 17131 + }, + { + "epoch": 0.797634844146472, + "grad_norm": 0.3183374608097564, + "learning_rate": 9.184997856433328e-05, + "loss": 2.9556, + "step": 17132 + }, + { + "epoch": 0.7976814023325651, + "grad_norm": 0.3449413546298494, + "learning_rate": 9.184849627268215e-05, + "loss": 2.9669, + "step": 17133 + }, + { + "epoch": 0.7977279605186582, + "grad_norm": 0.3415398054807927, + "learning_rate": 9.184701385820982e-05, + "loss": 3.0224, + "step": 17134 + }, + { + "epoch": 0.7977745187047512, + "grad_norm": 0.3325114496377197, + "learning_rate": 9.184553132092062e-05, + "loss": 2.9887, + "step": 17135 + }, + { + "epoch": 0.7978210768908444, + "grad_norm": 0.35354778884513627, + "learning_rate": 9.184404866081891e-05, + "loss": 3.0077, + "step": 17136 + }, + { + "epoch": 0.7978676350769374, + "grad_norm": 0.2997747400124745, + "learning_rate": 9.184256587790904e-05, + "loss": 2.8664, + "step": 17137 + }, + { + "epoch": 0.7979141932630305, + "grad_norm": 0.33116739621393926, + "learning_rate": 9.184108297219538e-05, + "loss": 2.9617, + "step": 17138 + }, + { + "epoch": 0.7979607514491236, + "grad_norm": 0.33962214219337444, + "learning_rate": 9.183959994368224e-05, + "loss": 2.9193, + "step": 17139 + }, + { + "epoch": 0.7980073096352166, + "grad_norm": 0.3347260671701798, + "learning_rate": 9.1838116792374e-05, + "loss": 2.9336, + "step": 17140 + }, + { + "epoch": 0.7980538678213097, + "grad_norm": 0.3847366009821097, + "learning_rate": 9.183663351827501e-05, + "loss": 2.9974, + "step": 17141 + }, + { + "epoch": 0.7981004260074027, + "grad_norm": 0.3228994810333322, + "learning_rate": 9.183515012138962e-05, + "loss": 2.9012, + "step": 17142 + }, + { + "epoch": 0.7981469841934958, + "grad_norm": 0.35227780567298633, + "learning_rate": 9.183366660172219e-05, + "loss": 2.8275, + "step": 17143 + }, + { + "epoch": 0.7981935423795888, + "grad_norm": 0.3393033450843228, + "learning_rate": 9.183218295927707e-05, + "loss": 2.8534, + "step": 17144 + }, + { + "epoch": 0.798240100565682, + "grad_norm": 0.3231176463931382, + "learning_rate": 9.183069919405863e-05, + "loss": 2.9901, + "step": 17145 + }, + { + "epoch": 0.7982866587517751, + "grad_norm": 0.34084765655956406, + "learning_rate": 9.182921530607118e-05, + "loss": 2.8708, + "step": 17146 + }, + { + "epoch": 0.7983332169378681, + "grad_norm": 0.3680578018012506, + "learning_rate": 9.182773129531911e-05, + "loss": 2.8705, + "step": 17147 + }, + { + "epoch": 0.7983797751239612, + "grad_norm": 0.36708738896708426, + "learning_rate": 9.18262471618068e-05, + "loss": 3.058, + "step": 17148 + }, + { + "epoch": 0.7984263333100542, + "grad_norm": 0.3821112532229697, + "learning_rate": 9.182476290553853e-05, + "loss": 2.9069, + "step": 17149 + }, + { + "epoch": 0.7984728914961473, + "grad_norm": 0.39382199868645035, + "learning_rate": 9.182327852651873e-05, + "loss": 3.0279, + "step": 17150 + }, + { + "epoch": 0.7985194496822404, + "grad_norm": 0.3493870454552962, + "learning_rate": 9.182179402475171e-05, + "loss": 3.0483, + "step": 17151 + }, + { + "epoch": 0.7985660078683334, + "grad_norm": 0.41347386759801563, + "learning_rate": 9.182030940024185e-05, + "loss": 2.9968, + "step": 17152 + }, + { + "epoch": 0.7986125660544265, + "grad_norm": 0.338453978569396, + "learning_rate": 
9.181882465299352e-05, + "loss": 2.8903, + "step": 17153 + }, + { + "epoch": 0.7986591242405195, + "grad_norm": 0.358649290172176, + "learning_rate": 9.181733978301103e-05, + "loss": 2.9744, + "step": 17154 + }, + { + "epoch": 0.7987056824266127, + "grad_norm": 0.3662409381685029, + "learning_rate": 9.181585479029879e-05, + "loss": 2.9699, + "step": 17155 + }, + { + "epoch": 0.7987522406127058, + "grad_norm": 0.3360786242780256, + "learning_rate": 9.181436967486113e-05, + "loss": 2.8792, + "step": 17156 + }, + { + "epoch": 0.7987987987987988, + "grad_norm": 0.3636500459103078, + "learning_rate": 9.181288443670242e-05, + "loss": 2.8056, + "step": 17157 + }, + { + "epoch": 0.7988453569848919, + "grad_norm": 0.37501893168241157, + "learning_rate": 9.1811399075827e-05, + "loss": 2.9577, + "step": 17158 + }, + { + "epoch": 0.7988919151709849, + "grad_norm": 0.3388372088229171, + "learning_rate": 9.180991359223926e-05, + "loss": 2.9858, + "step": 17159 + }, + { + "epoch": 0.798938473357078, + "grad_norm": 0.414703034364997, + "learning_rate": 9.180842798594352e-05, + "loss": 3.056, + "step": 17160 + }, + { + "epoch": 0.7989850315431711, + "grad_norm": 0.38029946104662277, + "learning_rate": 9.180694225694418e-05, + "loss": 2.9003, + "step": 17161 + }, + { + "epoch": 0.7990315897292641, + "grad_norm": 0.37288382732163045, + "learning_rate": 9.180545640524557e-05, + "loss": 2.9888, + "step": 17162 + }, + { + "epoch": 0.7990781479153573, + "grad_norm": 0.3543566200339616, + "learning_rate": 9.180397043085207e-05, + "loss": 2.9617, + "step": 17163 + }, + { + "epoch": 0.7991247061014503, + "grad_norm": 0.36780975935790955, + "learning_rate": 9.180248433376805e-05, + "loss": 2.8461, + "step": 17164 + }, + { + "epoch": 0.7991712642875434, + "grad_norm": 0.34233462108859014, + "learning_rate": 9.180099811399783e-05, + "loss": 2.8551, + "step": 17165 + }, + { + "epoch": 0.7992178224736364, + "grad_norm": 0.4048621276998495, + "learning_rate": 9.179951177154582e-05, + "loss": 2.9002, + "step": 17166 + }, + { + "epoch": 0.7992643806597295, + "grad_norm": 0.37963716257495117, + "learning_rate": 9.179802530641635e-05, + "loss": 3.0207, + "step": 17167 + }, + { + "epoch": 0.7993109388458226, + "grad_norm": 0.3382041880737643, + "learning_rate": 9.17965387186138e-05, + "loss": 2.9509, + "step": 17168 + }, + { + "epoch": 0.7993574970319156, + "grad_norm": 0.3910429713488418, + "learning_rate": 9.179505200814251e-05, + "loss": 3.0651, + "step": 17169 + }, + { + "epoch": 0.7994040552180087, + "grad_norm": 0.3295763914332584, + "learning_rate": 9.179356517500689e-05, + "loss": 2.9336, + "step": 17170 + }, + { + "epoch": 0.7994506134041017, + "grad_norm": 0.34506468816288705, + "learning_rate": 9.179207821921125e-05, + "loss": 2.9829, + "step": 17171 + }, + { + "epoch": 0.7994971715901948, + "grad_norm": 0.3225829389671942, + "learning_rate": 9.179059114075999e-05, + "loss": 2.8892, + "step": 17172 + }, + { + "epoch": 0.799543729776288, + "grad_norm": 0.33487338144830453, + "learning_rate": 9.178910393965746e-05, + "loss": 2.9302, + "step": 17173 + }, + { + "epoch": 0.799590287962381, + "grad_norm": 0.3374636001205272, + "learning_rate": 9.178761661590801e-05, + "loss": 2.8854, + "step": 17174 + }, + { + "epoch": 0.7996368461484741, + "grad_norm": 0.33271184104517104, + "learning_rate": 9.178612916951603e-05, + "loss": 2.9518, + "step": 17175 + }, + { + "epoch": 0.7996834043345671, + "grad_norm": 0.35659060803007553, + "learning_rate": 9.17846416004859e-05, + "loss": 2.9919, + "step": 17176 + }, + { + "epoch": 
0.7997299625206602, + "grad_norm": 0.3698740136447947, + "learning_rate": 9.178315390882194e-05, + "loss": 2.9901, + "step": 17177 + }, + { + "epoch": 0.7997765207067533, + "grad_norm": 0.32719392016384724, + "learning_rate": 9.178166609452854e-05, + "loss": 2.8518, + "step": 17178 + }, + { + "epoch": 0.7998230788928463, + "grad_norm": 0.3495515902791112, + "learning_rate": 9.178017815761008e-05, + "loss": 3.0225, + "step": 17179 + }, + { + "epoch": 0.7998696370789394, + "grad_norm": 0.3234992410181881, + "learning_rate": 9.177869009807089e-05, + "loss": 2.9775, + "step": 17180 + }, + { + "epoch": 0.7999161952650324, + "grad_norm": 0.33948956315531964, + "learning_rate": 9.177720191591538e-05, + "loss": 2.9365, + "step": 17181 + }, + { + "epoch": 0.7999627534511256, + "grad_norm": 0.3313463641596537, + "learning_rate": 9.17757136111479e-05, + "loss": 2.8641, + "step": 17182 + }, + { + "epoch": 0.8000093116372187, + "grad_norm": 0.3239938345446258, + "learning_rate": 9.177422518377281e-05, + "loss": 2.8809, + "step": 17183 + }, + { + "epoch": 0.8000558698233117, + "grad_norm": 0.3718617722398449, + "learning_rate": 9.177273663379449e-05, + "loss": 2.8703, + "step": 17184 + }, + { + "epoch": 0.8001024280094048, + "grad_norm": 0.33600158958299886, + "learning_rate": 9.17712479612173e-05, + "loss": 2.8975, + "step": 17185 + }, + { + "epoch": 0.8001489861954978, + "grad_norm": 0.33909167934625917, + "learning_rate": 9.17697591660456e-05, + "loss": 3.059, + "step": 17186 + }, + { + "epoch": 0.8001955443815909, + "grad_norm": 0.3406375522764302, + "learning_rate": 9.176827024828379e-05, + "loss": 3.0059, + "step": 17187 + }, + { + "epoch": 0.8002421025676839, + "grad_norm": 0.32173952036991155, + "learning_rate": 9.17667812079362e-05, + "loss": 2.8341, + "step": 17188 + }, + { + "epoch": 0.800288660753777, + "grad_norm": 0.3682872364708083, + "learning_rate": 9.176529204500725e-05, + "loss": 2.9672, + "step": 17189 + }, + { + "epoch": 0.8003352189398701, + "grad_norm": 0.373730221979909, + "learning_rate": 9.176380275950126e-05, + "loss": 3.021, + "step": 17190 + }, + { + "epoch": 0.8003817771259631, + "grad_norm": 0.3034657125012343, + "learning_rate": 9.176231335142263e-05, + "loss": 2.9062, + "step": 17191 + }, + { + "epoch": 0.8004283353120563, + "grad_norm": 0.3775843603454443, + "learning_rate": 9.176082382077573e-05, + "loss": 2.9521, + "step": 17192 + }, + { + "epoch": 0.8004748934981493, + "grad_norm": 0.36556644622588896, + "learning_rate": 9.175933416756493e-05, + "loss": 3.0018, + "step": 17193 + }, + { + "epoch": 0.8005214516842424, + "grad_norm": 0.34290140939661523, + "learning_rate": 9.175784439179457e-05, + "loss": 2.9425, + "step": 17194 + }, + { + "epoch": 0.8005680098703355, + "grad_norm": 0.3691450821195036, + "learning_rate": 9.175635449346908e-05, + "loss": 2.9944, + "step": 17195 + }, + { + "epoch": 0.8006145680564285, + "grad_norm": 0.310701504679894, + "learning_rate": 9.17548644725928e-05, + "loss": 3.0197, + "step": 17196 + }, + { + "epoch": 0.8006611262425216, + "grad_norm": 0.36443397896535656, + "learning_rate": 9.175337432917009e-05, + "loss": 2.9674, + "step": 17197 + }, + { + "epoch": 0.8007076844286146, + "grad_norm": 0.3189632839707523, + "learning_rate": 9.175188406320536e-05, + "loss": 2.9207, + "step": 17198 + }, + { + "epoch": 0.8007542426147077, + "grad_norm": 0.33114952144809917, + "learning_rate": 9.175039367470295e-05, + "loss": 2.8816, + "step": 17199 + }, + { + "epoch": 0.8008008008008008, + "grad_norm": 0.3334871746870459, + "learning_rate": 
9.174890316366724e-05, + "loss": 3.0115, + "step": 17200 + }, + { + "epoch": 0.8008473589868939, + "grad_norm": 0.35217412776720325, + "learning_rate": 9.174741253010264e-05, + "loss": 2.8162, + "step": 17201 + }, + { + "epoch": 0.800893917172987, + "grad_norm": 0.3430583167157708, + "learning_rate": 9.174592177401347e-05, + "loss": 3.0059, + "step": 17202 + }, + { + "epoch": 0.80094047535908, + "grad_norm": 0.35704867902910686, + "learning_rate": 9.174443089540414e-05, + "loss": 3.1497, + "step": 17203 + }, + { + "epoch": 0.8009870335451731, + "grad_norm": 0.357666351441568, + "learning_rate": 9.174293989427903e-05, + "loss": 2.8576, + "step": 17204 + }, + { + "epoch": 0.8010335917312662, + "grad_norm": 0.3256188503348413, + "learning_rate": 9.174144877064247e-05, + "loss": 2.9486, + "step": 17205 + }, + { + "epoch": 0.8010801499173592, + "grad_norm": 0.3345780007406465, + "learning_rate": 9.173995752449891e-05, + "loss": 2.9703, + "step": 17206 + }, + { + "epoch": 0.8011267081034523, + "grad_norm": 0.34789179717663216, + "learning_rate": 9.173846615585267e-05, + "loss": 2.9925, + "step": 17207 + }, + { + "epoch": 0.8011732662895453, + "grad_norm": 0.3503647750600605, + "learning_rate": 9.173697466470814e-05, + "loss": 2.9011, + "step": 17208 + }, + { + "epoch": 0.8012198244756384, + "grad_norm": 0.32242019527080795, + "learning_rate": 9.173548305106971e-05, + "loss": 2.9047, + "step": 17209 + }, + { + "epoch": 0.8012663826617314, + "grad_norm": 0.3692890823649684, + "learning_rate": 9.173399131494175e-05, + "loss": 2.8749, + "step": 17210 + }, + { + "epoch": 0.8013129408478246, + "grad_norm": 0.32734735055828434, + "learning_rate": 9.173249945632863e-05, + "loss": 3.0418, + "step": 17211 + }, + { + "epoch": 0.8013594990339177, + "grad_norm": 0.3450162974610677, + "learning_rate": 9.173100747523473e-05, + "loss": 2.8669, + "step": 17212 + }, + { + "epoch": 0.8014060572200107, + "grad_norm": 0.3464615461599868, + "learning_rate": 9.172951537166445e-05, + "loss": 2.8532, + "step": 17213 + }, + { + "epoch": 0.8014526154061038, + "grad_norm": 0.3211972240033322, + "learning_rate": 9.172802314562214e-05, + "loss": 3.0, + "step": 17214 + }, + { + "epoch": 0.8014991735921968, + "grad_norm": 0.3593167923262499, + "learning_rate": 9.172653079711219e-05, + "loss": 2.8719, + "step": 17215 + }, + { + "epoch": 0.8015457317782899, + "grad_norm": 0.3381772997897464, + "learning_rate": 9.1725038326139e-05, + "loss": 3.0065, + "step": 17216 + }, + { + "epoch": 0.801592289964383, + "grad_norm": 0.3826485459495159, + "learning_rate": 9.172354573270691e-05, + "loss": 2.9904, + "step": 17217 + }, + { + "epoch": 0.801638848150476, + "grad_norm": 0.30881395948159024, + "learning_rate": 9.172205301682036e-05, + "loss": 2.8923, + "step": 17218 + }, + { + "epoch": 0.8016854063365692, + "grad_norm": 0.34198778201694285, + "learning_rate": 9.172056017848366e-05, + "loss": 2.9987, + "step": 17219 + }, + { + "epoch": 0.8017319645226622, + "grad_norm": 0.32902460173013065, + "learning_rate": 9.171906721770125e-05, + "loss": 2.9665, + "step": 17220 + }, + { + "epoch": 0.8017785227087553, + "grad_norm": 0.35699008091480594, + "learning_rate": 9.171757413447747e-05, + "loss": 3.0066, + "step": 17221 + }, + { + "epoch": 0.8018250808948484, + "grad_norm": 0.3121631578053893, + "learning_rate": 9.171608092881672e-05, + "loss": 2.8968, + "step": 17222 + }, + { + "epoch": 0.8018716390809414, + "grad_norm": 0.3513734055502875, + "learning_rate": 9.171458760072339e-05, + "loss": 2.9648, + "step": 17223 + }, + { + "epoch": 
0.8019181972670345, + "grad_norm": 0.34438378350015975, + "learning_rate": 9.171309415020186e-05, + "loss": 2.9715, + "step": 17224 + }, + { + "epoch": 0.8019647554531275, + "grad_norm": 0.35846111331668523, + "learning_rate": 9.17116005772565e-05, + "loss": 2.9072, + "step": 17225 + }, + { + "epoch": 0.8020113136392206, + "grad_norm": 0.33512001162727745, + "learning_rate": 9.17101068818917e-05, + "loss": 2.8752, + "step": 17226 + }, + { + "epoch": 0.8020578718253137, + "grad_norm": 0.3649159466052778, + "learning_rate": 9.170861306411184e-05, + "loss": 2.8359, + "step": 17227 + }, + { + "epoch": 0.8021044300114067, + "grad_norm": 0.34070628790720114, + "learning_rate": 9.170711912392132e-05, + "loss": 3.0404, + "step": 17228 + }, + { + "epoch": 0.8021509881974999, + "grad_norm": 0.3529344291979294, + "learning_rate": 9.170562506132452e-05, + "loss": 2.9979, + "step": 17229 + }, + { + "epoch": 0.8021975463835929, + "grad_norm": 0.3863494846004315, + "learning_rate": 9.170413087632581e-05, + "loss": 3.0425, + "step": 17230 + }, + { + "epoch": 0.802244104569686, + "grad_norm": 0.36325893965632466, + "learning_rate": 9.170263656892959e-05, + "loss": 2.9501, + "step": 17231 + }, + { + "epoch": 0.802290662755779, + "grad_norm": 0.3840187443818173, + "learning_rate": 9.170114213914023e-05, + "loss": 3.0325, + "step": 17232 + }, + { + "epoch": 0.8023372209418721, + "grad_norm": 0.367791599454513, + "learning_rate": 9.169964758696213e-05, + "loss": 2.9, + "step": 17233 + }, + { + "epoch": 0.8023837791279652, + "grad_norm": 0.35868412994684246, + "learning_rate": 9.169815291239968e-05, + "loss": 2.9103, + "step": 17234 + }, + { + "epoch": 0.8024303373140582, + "grad_norm": 0.4068024434638825, + "learning_rate": 9.169665811545724e-05, + "loss": 2.9446, + "step": 17235 + }, + { + "epoch": 0.8024768955001513, + "grad_norm": 0.38796767426763307, + "learning_rate": 9.169516319613923e-05, + "loss": 3.0029, + "step": 17236 + }, + { + "epoch": 0.8025234536862443, + "grad_norm": 0.34477528588066597, + "learning_rate": 9.169366815445001e-05, + "loss": 2.9658, + "step": 17237 + }, + { + "epoch": 0.8025700118723375, + "grad_norm": 0.3642126574875984, + "learning_rate": 9.169217299039401e-05, + "loss": 3.0002, + "step": 17238 + }, + { + "epoch": 0.8026165700584306, + "grad_norm": 0.3541145848201922, + "learning_rate": 9.169067770397556e-05, + "loss": 2.9777, + "step": 17239 + }, + { + "epoch": 0.8026631282445236, + "grad_norm": 0.37329743875384147, + "learning_rate": 9.168918229519911e-05, + "loss": 2.9836, + "step": 17240 + }, + { + "epoch": 0.8027096864306167, + "grad_norm": 0.41334815425286703, + "learning_rate": 9.1687686764069e-05, + "loss": 3.0042, + "step": 17241 + }, + { + "epoch": 0.8027562446167097, + "grad_norm": 0.33164909301167783, + "learning_rate": 9.168619111058962e-05, + "loss": 2.8947, + "step": 17242 + }, + { + "epoch": 0.8028028028028028, + "grad_norm": 0.38701158621638504, + "learning_rate": 9.16846953347654e-05, + "loss": 2.9282, + "step": 17243 + }, + { + "epoch": 0.8028493609888959, + "grad_norm": 0.35889421756013695, + "learning_rate": 9.16831994366007e-05, + "loss": 3.0381, + "step": 17244 + }, + { + "epoch": 0.8028959191749889, + "grad_norm": 0.32718639969112456, + "learning_rate": 9.168170341609992e-05, + "loss": 2.9739, + "step": 17245 + }, + { + "epoch": 0.802942477361082, + "grad_norm": 0.36288714008691275, + "learning_rate": 9.168020727326743e-05, + "loss": 2.9729, + "step": 17246 + }, + { + "epoch": 0.802989035547175, + "grad_norm": 0.35428916686354583, + "learning_rate": 
9.167871100810765e-05, + "loss": 2.9454, + "step": 17247 + }, + { + "epoch": 0.8030355937332682, + "grad_norm": 0.3631276005663947, + "learning_rate": 9.167721462062495e-05, + "loss": 2.8874, + "step": 17248 + }, + { + "epoch": 0.8030821519193613, + "grad_norm": 0.35096677633768547, + "learning_rate": 9.167571811082375e-05, + "loss": 2.9002, + "step": 17249 + }, + { + "epoch": 0.8031287101054543, + "grad_norm": 0.36164515769516764, + "learning_rate": 9.167422147870841e-05, + "loss": 2.9683, + "step": 17250 + }, + { + "epoch": 0.8031752682915474, + "grad_norm": 0.3442086098760198, + "learning_rate": 9.167272472428334e-05, + "loss": 3.0119, + "step": 17251 + }, + { + "epoch": 0.8032218264776404, + "grad_norm": 0.36358444396091233, + "learning_rate": 9.167122784755291e-05, + "loss": 2.9178, + "step": 17252 + }, + { + "epoch": 0.8032683846637335, + "grad_norm": 0.3507628227827076, + "learning_rate": 9.166973084852155e-05, + "loss": 2.9905, + "step": 17253 + }, + { + "epoch": 0.8033149428498265, + "grad_norm": 0.3566931052308628, + "learning_rate": 9.166823372719364e-05, + "loss": 2.8056, + "step": 17254 + }, + { + "epoch": 0.8033615010359196, + "grad_norm": 0.3594494816391094, + "learning_rate": 9.166673648357356e-05, + "loss": 3.0467, + "step": 17255 + }, + { + "epoch": 0.8034080592220127, + "grad_norm": 0.34953322831906286, + "learning_rate": 9.166523911766572e-05, + "loss": 2.8165, + "step": 17256 + }, + { + "epoch": 0.8034546174081058, + "grad_norm": 0.3370256995855343, + "learning_rate": 9.166374162947449e-05, + "loss": 2.8846, + "step": 17257 + }, + { + "epoch": 0.8035011755941989, + "grad_norm": 0.34245448912475496, + "learning_rate": 9.16622440190043e-05, + "loss": 2.827, + "step": 17258 + }, + { + "epoch": 0.8035477337802919, + "grad_norm": 0.32460787756491166, + "learning_rate": 9.166074628625953e-05, + "loss": 2.9303, + "step": 17259 + }, + { + "epoch": 0.803594291966385, + "grad_norm": 0.38239080368247946, + "learning_rate": 9.165924843124455e-05, + "loss": 2.9401, + "step": 17260 + }, + { + "epoch": 0.8036408501524781, + "grad_norm": 0.33401089967048897, + "learning_rate": 9.16577504539638e-05, + "loss": 2.8753, + "step": 17261 + }, + { + "epoch": 0.8036874083385711, + "grad_norm": 0.36531517862235335, + "learning_rate": 9.165625235442165e-05, + "loss": 2.9048, + "step": 17262 + }, + { + "epoch": 0.8037339665246642, + "grad_norm": 0.38757004319408433, + "learning_rate": 9.165475413262252e-05, + "loss": 2.981, + "step": 17263 + }, + { + "epoch": 0.8037805247107572, + "grad_norm": 0.3657702825364015, + "learning_rate": 9.165325578857078e-05, + "loss": 2.9516, + "step": 17264 + }, + { + "epoch": 0.8038270828968503, + "grad_norm": 0.4430107338577149, + "learning_rate": 9.165175732227082e-05, + "loss": 3.0086, + "step": 17265 + }, + { + "epoch": 0.8038736410829435, + "grad_norm": 0.3704754807506996, + "learning_rate": 9.165025873372708e-05, + "loss": 2.8506, + "step": 17266 + }, + { + "epoch": 0.8039201992690365, + "grad_norm": 0.38010252894675556, + "learning_rate": 9.164876002294393e-05, + "loss": 3.0521, + "step": 17267 + }, + { + "epoch": 0.8039667574551296, + "grad_norm": 0.40073302995234744, + "learning_rate": 9.164726118992577e-05, + "loss": 2.8688, + "step": 17268 + }, + { + "epoch": 0.8040133156412226, + "grad_norm": 0.3330537151959538, + "learning_rate": 9.164576223467698e-05, + "loss": 2.9854, + "step": 17269 + }, + { + "epoch": 0.8040598738273157, + "grad_norm": 0.3619892153876928, + "learning_rate": 9.164426315720201e-05, + "loss": 2.8069, + "step": 17270 + }, + { + 
"epoch": 0.8041064320134088, + "grad_norm": 0.34153448892914084, + "learning_rate": 9.164276395750522e-05, + "loss": 2.9779, + "step": 17271 + }, + { + "epoch": 0.8041529901995018, + "grad_norm": 0.34083422494885535, + "learning_rate": 9.164126463559101e-05, + "loss": 2.9681, + "step": 17272 + }, + { + "epoch": 0.8041995483855949, + "grad_norm": 0.37646927868974206, + "learning_rate": 9.16397651914638e-05, + "loss": 2.9003, + "step": 17273 + }, + { + "epoch": 0.8042461065716879, + "grad_norm": 0.35664020055541035, + "learning_rate": 9.163826562512798e-05, + "loss": 2.9109, + "step": 17274 + }, + { + "epoch": 0.804292664757781, + "grad_norm": 0.3463413571795945, + "learning_rate": 9.163676593658796e-05, + "loss": 2.9322, + "step": 17275 + }, + { + "epoch": 0.804339222943874, + "grad_norm": 0.4035173401667521, + "learning_rate": 9.163526612584812e-05, + "loss": 2.9775, + "step": 17276 + }, + { + "epoch": 0.8043857811299672, + "grad_norm": 0.3356755026106581, + "learning_rate": 9.163376619291289e-05, + "loss": 2.8995, + "step": 17277 + }, + { + "epoch": 0.8044323393160603, + "grad_norm": 0.3795011232424892, + "learning_rate": 9.163226613778665e-05, + "loss": 2.8734, + "step": 17278 + }, + { + "epoch": 0.8044788975021533, + "grad_norm": 0.3535683501593456, + "learning_rate": 9.163076596047381e-05, + "loss": 2.9054, + "step": 17279 + }, + { + "epoch": 0.8045254556882464, + "grad_norm": 0.33644505802373276, + "learning_rate": 9.162926566097878e-05, + "loss": 2.8865, + "step": 17280 + }, + { + "epoch": 0.8045720138743394, + "grad_norm": 0.37952568275401777, + "learning_rate": 9.162776523930595e-05, + "loss": 2.9703, + "step": 17281 + }, + { + "epoch": 0.8046185720604325, + "grad_norm": 0.36794419128301686, + "learning_rate": 9.162626469545972e-05, + "loss": 2.8629, + "step": 17282 + }, + { + "epoch": 0.8046651302465256, + "grad_norm": 0.3945754765656029, + "learning_rate": 9.162476402944452e-05, + "loss": 3.077, + "step": 17283 + }, + { + "epoch": 0.8047116884326186, + "grad_norm": 0.34684134394913557, + "learning_rate": 9.162326324126474e-05, + "loss": 2.9991, + "step": 17284 + }, + { + "epoch": 0.8047582466187118, + "grad_norm": 0.35641550257061716, + "learning_rate": 9.162176233092477e-05, + "loss": 2.9449, + "step": 17285 + }, + { + "epoch": 0.8048048048048048, + "grad_norm": 0.3534255825467768, + "learning_rate": 9.162026129842903e-05, + "loss": 2.9359, + "step": 17286 + }, + { + "epoch": 0.8048513629908979, + "grad_norm": 0.35116697590660145, + "learning_rate": 9.161876014378193e-05, + "loss": 2.9397, + "step": 17287 + }, + { + "epoch": 0.804897921176991, + "grad_norm": 0.3387554179901901, + "learning_rate": 9.161725886698786e-05, + "loss": 2.9885, + "step": 17288 + }, + { + "epoch": 0.804944479363084, + "grad_norm": 0.3278359365214819, + "learning_rate": 9.161575746805125e-05, + "loss": 2.9697, + "step": 17289 + }, + { + "epoch": 0.8049910375491771, + "grad_norm": 0.38073865542966295, + "learning_rate": 9.161425594697648e-05, + "loss": 2.9591, + "step": 17290 + }, + { + "epoch": 0.8050375957352701, + "grad_norm": 0.3519214637087496, + "learning_rate": 9.161275430376798e-05, + "loss": 2.986, + "step": 17291 + }, + { + "epoch": 0.8050841539213632, + "grad_norm": 0.35708848742724075, + "learning_rate": 9.161125253843014e-05, + "loss": 2.9217, + "step": 17292 + }, + { + "epoch": 0.8051307121074563, + "grad_norm": 0.342770880512152, + "learning_rate": 9.160975065096737e-05, + "loss": 2.9121, + "step": 17293 + }, + { + "epoch": 0.8051772702935494, + "grad_norm": 0.33217687228184994, + 
"learning_rate": 9.160824864138407e-05, + "loss": 2.8898, + "step": 17294 + }, + { + "epoch": 0.8052238284796425, + "grad_norm": 0.3500942895437037, + "learning_rate": 9.160674650968468e-05, + "loss": 2.9661, + "step": 17295 + }, + { + "epoch": 0.8052703866657355, + "grad_norm": 0.36896300655069086, + "learning_rate": 9.160524425587359e-05, + "loss": 2.9532, + "step": 17296 + }, + { + "epoch": 0.8053169448518286, + "grad_norm": 0.33530577996729594, + "learning_rate": 9.160374187995518e-05, + "loss": 2.8685, + "step": 17297 + }, + { + "epoch": 0.8053635030379216, + "grad_norm": 0.3649861291797373, + "learning_rate": 9.160223938193391e-05, + "loss": 2.9303, + "step": 17298 + }, + { + "epoch": 0.8054100612240147, + "grad_norm": 0.3429179504115978, + "learning_rate": 9.160073676181416e-05, + "loss": 2.9711, + "step": 17299 + }, + { + "epoch": 0.8054566194101078, + "grad_norm": 0.34018193219593407, + "learning_rate": 9.159923401960033e-05, + "loss": 2.8723, + "step": 17300 + }, + { + "epoch": 0.8055031775962008, + "grad_norm": 0.3139461816491856, + "learning_rate": 9.159773115529687e-05, + "loss": 2.9776, + "step": 17301 + }, + { + "epoch": 0.8055497357822939, + "grad_norm": 0.36077406393813305, + "learning_rate": 9.159622816890816e-05, + "loss": 3.0132, + "step": 17302 + }, + { + "epoch": 0.805596293968387, + "grad_norm": 0.33597379433492686, + "learning_rate": 9.159472506043861e-05, + "loss": 2.8945, + "step": 17303 + }, + { + "epoch": 0.8056428521544801, + "grad_norm": 0.33718241305743046, + "learning_rate": 9.159322182989265e-05, + "loss": 2.8719, + "step": 17304 + }, + { + "epoch": 0.8056894103405732, + "grad_norm": 0.3582335309084625, + "learning_rate": 9.159171847727468e-05, + "loss": 2.9043, + "step": 17305 + }, + { + "epoch": 0.8057359685266662, + "grad_norm": 0.3317999659644173, + "learning_rate": 9.15902150025891e-05, + "loss": 2.8972, + "step": 17306 + }, + { + "epoch": 0.8057825267127593, + "grad_norm": 0.38137798193262823, + "learning_rate": 9.158871140584036e-05, + "loss": 3.0145, + "step": 17307 + }, + { + "epoch": 0.8058290848988523, + "grad_norm": 0.3465975261713229, + "learning_rate": 9.158720768703283e-05, + "loss": 3.0096, + "step": 17308 + }, + { + "epoch": 0.8058756430849454, + "grad_norm": 0.3563591510795904, + "learning_rate": 9.158570384617096e-05, + "loss": 2.9083, + "step": 17309 + }, + { + "epoch": 0.8059222012710385, + "grad_norm": 0.3356479379710436, + "learning_rate": 9.158419988325912e-05, + "loss": 2.8576, + "step": 17310 + }, + { + "epoch": 0.8059687594571315, + "grad_norm": 0.3648583860270936, + "learning_rate": 9.158269579830177e-05, + "loss": 2.9391, + "step": 17311 + }, + { + "epoch": 0.8060153176432246, + "grad_norm": 0.3588885287964511, + "learning_rate": 9.158119159130329e-05, + "loss": 2.9381, + "step": 17312 + }, + { + "epoch": 0.8060618758293177, + "grad_norm": 0.36648472884238403, + "learning_rate": 9.157968726226813e-05, + "loss": 3.055, + "step": 17313 + }, + { + "epoch": 0.8061084340154108, + "grad_norm": 0.37395204645046304, + "learning_rate": 9.157818281120068e-05, + "loss": 2.9522, + "step": 17314 + }, + { + "epoch": 0.8061549922015039, + "grad_norm": 0.3497527653885649, + "learning_rate": 9.157667823810535e-05, + "loss": 2.9747, + "step": 17315 + }, + { + "epoch": 0.8062015503875969, + "grad_norm": 0.3968754923952481, + "learning_rate": 9.157517354298656e-05, + "loss": 3.0577, + "step": 17316 + }, + { + "epoch": 0.80624810857369, + "grad_norm": 0.31365744026624115, + "learning_rate": 9.157366872584874e-05, + "loss": 2.85, + "step": 17317 + }, 
+ { + "epoch": 0.806294666759783, + "grad_norm": 0.35302505434885173, + "learning_rate": 9.157216378669629e-05, + "loss": 2.9484, + "step": 17318 + }, + { + "epoch": 0.8063412249458761, + "grad_norm": 0.3292459062856784, + "learning_rate": 9.157065872553365e-05, + "loss": 2.88, + "step": 17319 + }, + { + "epoch": 0.8063877831319691, + "grad_norm": 0.3571971276325165, + "learning_rate": 9.156915354236521e-05, + "loss": 3.0, + "step": 17320 + }, + { + "epoch": 0.8064343413180622, + "grad_norm": 0.36406085711872627, + "learning_rate": 9.156764823719541e-05, + "loss": 3.0137, + "step": 17321 + }, + { + "epoch": 0.8064808995041554, + "grad_norm": 0.38235750385018474, + "learning_rate": 9.156614281002864e-05, + "loss": 2.9499, + "step": 17322 + }, + { + "epoch": 0.8065274576902484, + "grad_norm": 0.36756233547876244, + "learning_rate": 9.156463726086935e-05, + "loss": 2.865, + "step": 17323 + }, + { + "epoch": 0.8065740158763415, + "grad_norm": 0.34749282432432593, + "learning_rate": 9.156313158972194e-05, + "loss": 2.9944, + "step": 17324 + }, + { + "epoch": 0.8066205740624345, + "grad_norm": 0.3891122820934401, + "learning_rate": 9.156162579659084e-05, + "loss": 2.9172, + "step": 17325 + }, + { + "epoch": 0.8066671322485276, + "grad_norm": 0.33992912408793036, + "learning_rate": 9.156011988148046e-05, + "loss": 2.9931, + "step": 17326 + }, + { + "epoch": 0.8067136904346207, + "grad_norm": 0.3805539407743195, + "learning_rate": 9.155861384439522e-05, + "loss": 2.9598, + "step": 17327 + }, + { + "epoch": 0.8067602486207137, + "grad_norm": 0.3146785441263403, + "learning_rate": 9.155710768533953e-05, + "loss": 2.9291, + "step": 17328 + }, + { + "epoch": 0.8068068068068068, + "grad_norm": 0.36663475037619997, + "learning_rate": 9.155560140431785e-05, + "loss": 2.8659, + "step": 17329 + }, + { + "epoch": 0.8068533649928998, + "grad_norm": 0.3373152264689945, + "learning_rate": 9.155409500133456e-05, + "loss": 2.8662, + "step": 17330 + }, + { + "epoch": 0.806899923178993, + "grad_norm": 0.3320842563775585, + "learning_rate": 9.155258847639409e-05, + "loss": 2.9687, + "step": 17331 + }, + { + "epoch": 0.8069464813650861, + "grad_norm": 0.34549264376959166, + "learning_rate": 9.155108182950088e-05, + "loss": 2.9823, + "step": 17332 + }, + { + "epoch": 0.8069930395511791, + "grad_norm": 0.3363446965314868, + "learning_rate": 9.154957506065932e-05, + "loss": 2.8418, + "step": 17333 + }, + { + "epoch": 0.8070395977372722, + "grad_norm": 0.3411723751655312, + "learning_rate": 9.154806816987385e-05, + "loss": 3.0829, + "step": 17334 + }, + { + "epoch": 0.8070861559233652, + "grad_norm": 0.35172388634259777, + "learning_rate": 9.15465611571489e-05, + "loss": 2.9186, + "step": 17335 + }, + { + "epoch": 0.8071327141094583, + "grad_norm": 0.35850841483396917, + "learning_rate": 9.15450540224889e-05, + "loss": 2.8961, + "step": 17336 + }, + { + "epoch": 0.8071792722955514, + "grad_norm": 0.3718781970888526, + "learning_rate": 9.154354676589825e-05, + "loss": 3.0146, + "step": 17337 + }, + { + "epoch": 0.8072258304816444, + "grad_norm": 0.3382378693188946, + "learning_rate": 9.154203938738138e-05, + "loss": 2.9279, + "step": 17338 + }, + { + "epoch": 0.8072723886677375, + "grad_norm": 0.3558197606868723, + "learning_rate": 9.154053188694271e-05, + "loss": 2.9791, + "step": 17339 + }, + { + "epoch": 0.8073189468538305, + "grad_norm": 0.3652555280325243, + "learning_rate": 9.153902426458668e-05, + "loss": 2.9152, + "step": 17340 + }, + { + "epoch": 0.8073655050399237, + "grad_norm": 0.33960248921129604, + 
"learning_rate": 9.153751652031772e-05, + "loss": 2.9555, + "step": 17341 + }, + { + "epoch": 0.8074120632260167, + "grad_norm": 0.3522745249017907, + "learning_rate": 9.153600865414022e-05, + "loss": 2.943, + "step": 17342 + }, + { + "epoch": 0.8074586214121098, + "grad_norm": 0.3326420879558269, + "learning_rate": 9.153450066605863e-05, + "loss": 2.9862, + "step": 17343 + }, + { + "epoch": 0.8075051795982029, + "grad_norm": 0.3102480477369304, + "learning_rate": 9.153299255607737e-05, + "loss": 2.8715, + "step": 17344 + }, + { + "epoch": 0.8075517377842959, + "grad_norm": 0.35822965210369245, + "learning_rate": 9.153148432420088e-05, + "loss": 3.0087, + "step": 17345 + }, + { + "epoch": 0.807598295970389, + "grad_norm": 0.33419982442829577, + "learning_rate": 9.152997597043356e-05, + "loss": 2.9102, + "step": 17346 + }, + { + "epoch": 0.807644854156482, + "grad_norm": 0.3444210840502593, + "learning_rate": 9.152846749477986e-05, + "loss": 2.9161, + "step": 17347 + }, + { + "epoch": 0.8076914123425751, + "grad_norm": 0.3740006238422478, + "learning_rate": 9.15269588972442e-05, + "loss": 2.9512, + "step": 17348 + }, + { + "epoch": 0.8077379705286682, + "grad_norm": 0.40400971058834173, + "learning_rate": 9.152545017783102e-05, + "loss": 3.0773, + "step": 17349 + }, + { + "epoch": 0.8077845287147613, + "grad_norm": 0.3634314340077202, + "learning_rate": 9.152394133654472e-05, + "loss": 2.9426, + "step": 17350 + }, + { + "epoch": 0.8078310869008544, + "grad_norm": 0.41093017977570645, + "learning_rate": 9.152243237338973e-05, + "loss": 3.0458, + "step": 17351 + }, + { + "epoch": 0.8078776450869474, + "grad_norm": 0.3815810917059091, + "learning_rate": 9.152092328837051e-05, + "loss": 2.9358, + "step": 17352 + }, + { + "epoch": 0.8079242032730405, + "grad_norm": 0.38584863138575315, + "learning_rate": 9.151941408149147e-05, + "loss": 3.0173, + "step": 17353 + }, + { + "epoch": 0.8079707614591336, + "grad_norm": 0.336089065304431, + "learning_rate": 9.151790475275703e-05, + "loss": 2.8028, + "step": 17354 + }, + { + "epoch": 0.8080173196452266, + "grad_norm": 0.3909261105776114, + "learning_rate": 9.151639530217165e-05, + "loss": 2.9599, + "step": 17355 + }, + { + "epoch": 0.8080638778313197, + "grad_norm": 0.3441290856584126, + "learning_rate": 9.151488572973972e-05, + "loss": 3.0042, + "step": 17356 + }, + { + "epoch": 0.8081104360174127, + "grad_norm": 0.3460548232000588, + "learning_rate": 9.151337603546571e-05, + "loss": 2.9657, + "step": 17357 + }, + { + "epoch": 0.8081569942035058, + "grad_norm": 0.3897371257794634, + "learning_rate": 9.1511866219354e-05, + "loss": 2.9361, + "step": 17358 + }, + { + "epoch": 0.808203552389599, + "grad_norm": 0.35945162181410906, + "learning_rate": 9.151035628140908e-05, + "loss": 2.9337, + "step": 17359 + }, + { + "epoch": 0.808250110575692, + "grad_norm": 0.3271396637611223, + "learning_rate": 9.150884622163533e-05, + "loss": 2.9385, + "step": 17360 + }, + { + "epoch": 0.8082966687617851, + "grad_norm": 0.45041202248480267, + "learning_rate": 9.150733604003723e-05, + "loss": 3.0343, + "step": 17361 + }, + { + "epoch": 0.8083432269478781, + "grad_norm": 0.33518260930327204, + "learning_rate": 9.150582573661918e-05, + "loss": 2.9545, + "step": 17362 + }, + { + "epoch": 0.8083897851339712, + "grad_norm": 0.3350108945178347, + "learning_rate": 9.150431531138561e-05, + "loss": 2.9604, + "step": 17363 + }, + { + "epoch": 0.8084363433200642, + "grad_norm": 0.36379197097776483, + "learning_rate": 9.150280476434097e-05, + "loss": 2.9246, + "step": 17364 + }, + 
{ + "epoch": 0.8084829015061573, + "grad_norm": 0.30904542244755157, + "learning_rate": 9.150129409548969e-05, + "loss": 2.9475, + "step": 17365 + }, + { + "epoch": 0.8085294596922504, + "grad_norm": 0.37154613561135225, + "learning_rate": 9.14997833048362e-05, + "loss": 2.8471, + "step": 17366 + }, + { + "epoch": 0.8085760178783434, + "grad_norm": 0.3688514292516693, + "learning_rate": 9.149827239238493e-05, + "loss": 2.8829, + "step": 17367 + }, + { + "epoch": 0.8086225760644365, + "grad_norm": 0.39928164402992333, + "learning_rate": 9.149676135814032e-05, + "loss": 3.0011, + "step": 17368 + }, + { + "epoch": 0.8086691342505296, + "grad_norm": 0.3853062106898155, + "learning_rate": 9.149525020210681e-05, + "loss": 2.9192, + "step": 17369 + }, + { + "epoch": 0.8087156924366227, + "grad_norm": 0.3735433610216214, + "learning_rate": 9.149373892428882e-05, + "loss": 2.889, + "step": 17370 + }, + { + "epoch": 0.8087622506227158, + "grad_norm": 0.3958935574781755, + "learning_rate": 9.14922275246908e-05, + "loss": 2.9614, + "step": 17371 + }, + { + "epoch": 0.8088088088088088, + "grad_norm": 0.3874707361317305, + "learning_rate": 9.149071600331718e-05, + "loss": 2.8685, + "step": 17372 + }, + { + "epoch": 0.8088553669949019, + "grad_norm": 0.351082770576951, + "learning_rate": 9.148920436017238e-05, + "loss": 2.9022, + "step": 17373 + }, + { + "epoch": 0.8089019251809949, + "grad_norm": 0.3694071697290173, + "learning_rate": 9.148769259526087e-05, + "loss": 3.0079, + "step": 17374 + }, + { + "epoch": 0.808948483367088, + "grad_norm": 0.3745011939577078, + "learning_rate": 9.148618070858708e-05, + "loss": 3.0219, + "step": 17375 + }, + { + "epoch": 0.8089950415531811, + "grad_norm": 0.31845655162722875, + "learning_rate": 9.148466870015542e-05, + "loss": 2.8702, + "step": 17376 + }, + { + "epoch": 0.8090415997392741, + "grad_norm": 0.3633571476571874, + "learning_rate": 9.148315656997033e-05, + "loss": 2.9872, + "step": 17377 + }, + { + "epoch": 0.8090881579253673, + "grad_norm": 0.3172029150550551, + "learning_rate": 9.148164431803629e-05, + "loss": 2.9847, + "step": 17378 + }, + { + "epoch": 0.8091347161114603, + "grad_norm": 0.36167763377199363, + "learning_rate": 9.14801319443577e-05, + "loss": 2.9975, + "step": 17379 + }, + { + "epoch": 0.8091812742975534, + "grad_norm": 0.35936669408004973, + "learning_rate": 9.147861944893901e-05, + "loss": 2.9559, + "step": 17380 + }, + { + "epoch": 0.8092278324836465, + "grad_norm": 0.3333668254905151, + "learning_rate": 9.147710683178465e-05, + "loss": 2.8389, + "step": 17381 + }, + { + "epoch": 0.8092743906697395, + "grad_norm": 0.42019253289974795, + "learning_rate": 9.147559409289908e-05, + "loss": 2.9364, + "step": 17382 + }, + { + "epoch": 0.8093209488558326, + "grad_norm": 0.35709901375684955, + "learning_rate": 9.147408123228671e-05, + "loss": 2.9215, + "step": 17383 + }, + { + "epoch": 0.8093675070419256, + "grad_norm": 0.3912224776729001, + "learning_rate": 9.147256824995201e-05, + "loss": 2.9726, + "step": 17384 + }, + { + "epoch": 0.8094140652280187, + "grad_norm": 0.402440670090722, + "learning_rate": 9.147105514589942e-05, + "loss": 3.0181, + "step": 17385 + }, + { + "epoch": 0.8094606234141117, + "grad_norm": 0.34028940102805744, + "learning_rate": 9.146954192013335e-05, + "loss": 2.9612, + "step": 17386 + }, + { + "epoch": 0.8095071816002049, + "grad_norm": 0.38840233521389605, + "learning_rate": 9.146802857265826e-05, + "loss": 2.8998, + "step": 17387 + }, + { + "epoch": 0.809553739786298, + "grad_norm": 0.3366998790401094, + 
"learning_rate": 9.14665151034786e-05, + "loss": 2.938, + "step": 17388 + }, + { + "epoch": 0.809600297972391, + "grad_norm": 0.3562448238689194, + "learning_rate": 9.146500151259881e-05, + "loss": 2.8684, + "step": 17389 + }, + { + "epoch": 0.8096468561584841, + "grad_norm": 0.32734424313559707, + "learning_rate": 9.14634878000233e-05, + "loss": 3.0355, + "step": 17390 + }, + { + "epoch": 0.8096934143445771, + "grad_norm": 0.3760737789696928, + "learning_rate": 9.146197396575657e-05, + "loss": 2.978, + "step": 17391 + }, + { + "epoch": 0.8097399725306702, + "grad_norm": 0.36616259389727995, + "learning_rate": 9.146046000980302e-05, + "loss": 2.8461, + "step": 17392 + }, + { + "epoch": 0.8097865307167633, + "grad_norm": 0.3456133095785067, + "learning_rate": 9.145894593216709e-05, + "loss": 2.8142, + "step": 17393 + }, + { + "epoch": 0.8098330889028563, + "grad_norm": 0.37835912685605716, + "learning_rate": 9.145743173285323e-05, + "loss": 2.8266, + "step": 17394 + }, + { + "epoch": 0.8098796470889494, + "grad_norm": 0.3556230201961031, + "learning_rate": 9.145591741186593e-05, + "loss": 2.9016, + "step": 17395 + }, + { + "epoch": 0.8099262052750424, + "grad_norm": 0.3586722044332689, + "learning_rate": 9.145440296920958e-05, + "loss": 2.9355, + "step": 17396 + }, + { + "epoch": 0.8099727634611356, + "grad_norm": 0.34796547354957114, + "learning_rate": 9.145288840488863e-05, + "loss": 2.9149, + "step": 17397 + }, + { + "epoch": 0.8100193216472287, + "grad_norm": 0.3691956341289891, + "learning_rate": 9.145137371890753e-05, + "loss": 2.9784, + "step": 17398 + }, + { + "epoch": 0.8100658798333217, + "grad_norm": 0.34283483951688287, + "learning_rate": 9.144985891127076e-05, + "loss": 2.8169, + "step": 17399 + }, + { + "epoch": 0.8101124380194148, + "grad_norm": 0.3338742134921837, + "learning_rate": 9.144834398198272e-05, + "loss": 2.8886, + "step": 17400 + }, + { + "epoch": 0.8101589962055078, + "grad_norm": 0.3524438951637904, + "learning_rate": 9.144682893104789e-05, + "loss": 2.9976, + "step": 17401 + }, + { + "epoch": 0.8102055543916009, + "grad_norm": 0.3751902672241762, + "learning_rate": 9.144531375847068e-05, + "loss": 2.9103, + "step": 17402 + }, + { + "epoch": 0.810252112577694, + "grad_norm": 0.3433516858092486, + "learning_rate": 9.144379846425555e-05, + "loss": 2.9882, + "step": 17403 + }, + { + "epoch": 0.810298670763787, + "grad_norm": 0.3602924594657878, + "learning_rate": 9.144228304840697e-05, + "loss": 2.9761, + "step": 17404 + }, + { + "epoch": 0.8103452289498801, + "grad_norm": 0.4321148783429729, + "learning_rate": 9.144076751092938e-05, + "loss": 2.954, + "step": 17405 + }, + { + "epoch": 0.8103917871359732, + "grad_norm": 0.3656153386439559, + "learning_rate": 9.14392518518272e-05, + "loss": 2.9897, + "step": 17406 + }, + { + "epoch": 0.8104383453220663, + "grad_norm": 0.37873015803161675, + "learning_rate": 9.143773607110492e-05, + "loss": 2.8867, + "step": 17407 + }, + { + "epoch": 0.8104849035081593, + "grad_norm": 0.3763768294835012, + "learning_rate": 9.143622016876695e-05, + "loss": 2.9502, + "step": 17408 + }, + { + "epoch": 0.8105314616942524, + "grad_norm": 0.3698972467304192, + "learning_rate": 9.143470414481776e-05, + "loss": 2.8747, + "step": 17409 + }, + { + "epoch": 0.8105780198803455, + "grad_norm": 0.32993688039257796, + "learning_rate": 9.143318799926179e-05, + "loss": 2.9826, + "step": 17410 + }, + { + "epoch": 0.8106245780664385, + "grad_norm": 0.3557508747101468, + "learning_rate": 9.143167173210352e-05, + "loss": 2.8927, + "step": 17411 + }, + { 
+ "epoch": 0.8106711362525316, + "grad_norm": 0.3399727507645589, + "learning_rate": 9.143015534334735e-05, + "loss": 2.9717, + "step": 17412 + }, + { + "epoch": 0.8107176944386246, + "grad_norm": 0.37607628950094496, + "learning_rate": 9.142863883299777e-05, + "loss": 2.9506, + "step": 17413 + }, + { + "epoch": 0.8107642526247177, + "grad_norm": 0.3671509027811862, + "learning_rate": 9.14271222010592e-05, + "loss": 2.9464, + "step": 17414 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.3690935404585999, + "learning_rate": 9.142560544753611e-05, + "loss": 2.9853, + "step": 17415 + }, + { + "epoch": 0.8108573689969039, + "grad_norm": 0.33345252998364044, + "learning_rate": 9.142408857243297e-05, + "loss": 2.9088, + "step": 17416 + }, + { + "epoch": 0.810903927182997, + "grad_norm": 0.3416222476238149, + "learning_rate": 9.14225715757542e-05, + "loss": 2.9059, + "step": 17417 + }, + { + "epoch": 0.81095048536909, + "grad_norm": 0.36030950473252044, + "learning_rate": 9.142105445750425e-05, + "loss": 2.992, + "step": 17418 + }, + { + "epoch": 0.8109970435551831, + "grad_norm": 0.3337866084650118, + "learning_rate": 9.14195372176876e-05, + "loss": 2.9146, + "step": 17419 + }, + { + "epoch": 0.8110436017412762, + "grad_norm": 0.3399113724785366, + "learning_rate": 9.141801985630869e-05, + "loss": 2.985, + "step": 17420 + }, + { + "epoch": 0.8110901599273692, + "grad_norm": 0.33971467744849115, + "learning_rate": 9.141650237337195e-05, + "loss": 2.9214, + "step": 17421 + }, + { + "epoch": 0.8111367181134623, + "grad_norm": 0.3450721712813239, + "learning_rate": 9.141498476888188e-05, + "loss": 2.9477, + "step": 17422 + }, + { + "epoch": 0.8111832762995553, + "grad_norm": 0.34390483434754787, + "learning_rate": 9.14134670428429e-05, + "loss": 2.9433, + "step": 17423 + }, + { + "epoch": 0.8112298344856484, + "grad_norm": 0.3640548376339318, + "learning_rate": 9.141194919525948e-05, + "loss": 2.9005, + "step": 17424 + }, + { + "epoch": 0.8112763926717416, + "grad_norm": 0.33739950980178307, + "learning_rate": 9.141043122613608e-05, + "loss": 2.9473, + "step": 17425 + }, + { + "epoch": 0.8113229508578346, + "grad_norm": 0.3619763646353777, + "learning_rate": 9.140891313547712e-05, + "loss": 2.9319, + "step": 17426 + }, + { + "epoch": 0.8113695090439277, + "grad_norm": 0.30972798325909434, + "learning_rate": 9.14073949232871e-05, + "loss": 2.9917, + "step": 17427 + }, + { + "epoch": 0.8114160672300207, + "grad_norm": 0.38380209790357145, + "learning_rate": 9.140587658957043e-05, + "loss": 2.9529, + "step": 17428 + }, + { + "epoch": 0.8114626254161138, + "grad_norm": 0.3107044338384774, + "learning_rate": 9.14043581343316e-05, + "loss": 2.8948, + "step": 17429 + }, + { + "epoch": 0.8115091836022068, + "grad_norm": 0.3551526371012613, + "learning_rate": 9.140283955757508e-05, + "loss": 2.9769, + "step": 17430 + }, + { + "epoch": 0.8115557417882999, + "grad_norm": 0.3360468617873014, + "learning_rate": 9.140132085930529e-05, + "loss": 2.872, + "step": 17431 + }, + { + "epoch": 0.811602299974393, + "grad_norm": 0.3512150033204062, + "learning_rate": 9.13998020395267e-05, + "loss": 2.8428, + "step": 17432 + }, + { + "epoch": 0.811648858160486, + "grad_norm": 0.35694769727120523, + "learning_rate": 9.139828309824376e-05, + "loss": 2.8303, + "step": 17433 + }, + { + "epoch": 0.8116954163465792, + "grad_norm": 0.34779657926691393, + "learning_rate": 9.139676403546095e-05, + "loss": 2.975, + "step": 17434 + }, + { + "epoch": 0.8117419745326722, + "grad_norm": 0.3490560247711502, + "learning_rate": 
9.13952448511827e-05, + "loss": 2.9426, + "step": 17435 + }, + { + "epoch": 0.8117885327187653, + "grad_norm": 0.3457330992708872, + "learning_rate": 9.13937255454135e-05, + "loss": 2.9686, + "step": 17436 + }, + { + "epoch": 0.8118350909048584, + "grad_norm": 0.3645842589490127, + "learning_rate": 9.13922061181578e-05, + "loss": 2.911, + "step": 17437 + }, + { + "epoch": 0.8118816490909514, + "grad_norm": 0.4050691556987407, + "learning_rate": 9.139068656942004e-05, + "loss": 2.9474, + "step": 17438 + }, + { + "epoch": 0.8119282072770445, + "grad_norm": 0.3601902337934794, + "learning_rate": 9.138916689920469e-05, + "loss": 2.8578, + "step": 17439 + }, + { + "epoch": 0.8119747654631375, + "grad_norm": 0.4231495422618275, + "learning_rate": 9.138764710751621e-05, + "loss": 2.867, + "step": 17440 + }, + { + "epoch": 0.8120213236492306, + "grad_norm": 0.3638645987129319, + "learning_rate": 9.138612719435909e-05, + "loss": 2.9553, + "step": 17441 + }, + { + "epoch": 0.8120678818353237, + "grad_norm": 0.41561716401570403, + "learning_rate": 9.138460715973773e-05, + "loss": 3.0349, + "step": 17442 + }, + { + "epoch": 0.8121144400214168, + "grad_norm": 0.34391924079040276, + "learning_rate": 9.138308700365666e-05, + "loss": 2.9153, + "step": 17443 + }, + { + "epoch": 0.8121609982075099, + "grad_norm": 0.41454578576089995, + "learning_rate": 9.138156672612028e-05, + "loss": 2.8789, + "step": 17444 + }, + { + "epoch": 0.8122075563936029, + "grad_norm": 0.3671914771509564, + "learning_rate": 9.138004632713308e-05, + "loss": 2.9257, + "step": 17445 + }, + { + "epoch": 0.812254114579696, + "grad_norm": 0.3707483932371427, + "learning_rate": 9.137852580669954e-05, + "loss": 2.8842, + "step": 17446 + }, + { + "epoch": 0.8123006727657891, + "grad_norm": 0.4066583846116324, + "learning_rate": 9.137700516482408e-05, + "loss": 3.0007, + "step": 17447 + }, + { + "epoch": 0.8123472309518821, + "grad_norm": 0.3858911843085486, + "learning_rate": 9.13754844015112e-05, + "loss": 2.949, + "step": 17448 + }, + { + "epoch": 0.8123937891379752, + "grad_norm": 0.3929624633604761, + "learning_rate": 9.137396351676535e-05, + "loss": 2.8335, + "step": 17449 + }, + { + "epoch": 0.8124403473240682, + "grad_norm": 0.3339769362899963, + "learning_rate": 9.137244251059099e-05, + "loss": 2.9002, + "step": 17450 + }, + { + "epoch": 0.8124869055101613, + "grad_norm": 0.3800750847303625, + "learning_rate": 9.137092138299258e-05, + "loss": 2.8299, + "step": 17451 + }, + { + "epoch": 0.8125334636962543, + "grad_norm": 0.32307263268460745, + "learning_rate": 9.136940013397459e-05, + "loss": 3.0026, + "step": 17452 + }, + { + "epoch": 0.8125800218823475, + "grad_norm": 0.3614460743800062, + "learning_rate": 9.136787876354149e-05, + "loss": 2.9021, + "step": 17453 + }, + { + "epoch": 0.8126265800684406, + "grad_norm": 0.32185866020605247, + "learning_rate": 9.136635727169776e-05, + "loss": 2.9985, + "step": 17454 + }, + { + "epoch": 0.8126731382545336, + "grad_norm": 0.4085671641447449, + "learning_rate": 9.136483565844782e-05, + "loss": 2.9462, + "step": 17455 + }, + { + "epoch": 0.8127196964406267, + "grad_norm": 0.3554485248767992, + "learning_rate": 9.136331392379618e-05, + "loss": 2.8566, + "step": 17456 + }, + { + "epoch": 0.8127662546267197, + "grad_norm": 0.38613168203592874, + "learning_rate": 9.136179206774728e-05, + "loss": 3.0056, + "step": 17457 + }, + { + "epoch": 0.8128128128128128, + "grad_norm": 0.3656064888312518, + "learning_rate": 9.136027009030558e-05, + "loss": 2.9332, + "step": 17458 + }, + { + "epoch": 
0.8128593709989059, + "grad_norm": 0.3514443414477461, + "learning_rate": 9.135874799147559e-05, + "loss": 2.9153, + "step": 17459 + }, + { + "epoch": 0.8129059291849989, + "grad_norm": 0.33599379594758133, + "learning_rate": 9.135722577126174e-05, + "loss": 2.9347, + "step": 17460 + }, + { + "epoch": 0.812952487371092, + "grad_norm": 0.3333572419232276, + "learning_rate": 9.13557034296685e-05, + "loss": 2.9894, + "step": 17461 + }, + { + "epoch": 0.812999045557185, + "grad_norm": 0.363345513851507, + "learning_rate": 9.135418096670036e-05, + "loss": 2.9098, + "step": 17462 + }, + { + "epoch": 0.8130456037432782, + "grad_norm": 0.3648491949199148, + "learning_rate": 9.135265838236176e-05, + "loss": 2.942, + "step": 17463 + }, + { + "epoch": 0.8130921619293713, + "grad_norm": 0.35743269815035916, + "learning_rate": 9.135113567665718e-05, + "loss": 2.8759, + "step": 17464 + }, + { + "epoch": 0.8131387201154643, + "grad_norm": 0.3488813985962371, + "learning_rate": 9.13496128495911e-05, + "loss": 2.9148, + "step": 17465 + }, + { + "epoch": 0.8131852783015574, + "grad_norm": 0.3532357331576047, + "learning_rate": 9.134808990116798e-05, + "loss": 2.9215, + "step": 17466 + }, + { + "epoch": 0.8132318364876504, + "grad_norm": 0.35770923119085074, + "learning_rate": 9.134656683139228e-05, + "loss": 2.8986, + "step": 17467 + }, + { + "epoch": 0.8132783946737435, + "grad_norm": 0.340384135284816, + "learning_rate": 9.134504364026848e-05, + "loss": 3.0217, + "step": 17468 + }, + { + "epoch": 0.8133249528598366, + "grad_norm": 0.39077338570691744, + "learning_rate": 9.134352032780105e-05, + "loss": 2.9377, + "step": 17469 + }, + { + "epoch": 0.8133715110459296, + "grad_norm": 0.36415301327512517, + "learning_rate": 9.134199689399447e-05, + "loss": 2.9358, + "step": 17470 + }, + { + "epoch": 0.8134180692320228, + "grad_norm": 0.3562611047530839, + "learning_rate": 9.134047333885321e-05, + "loss": 2.8103, + "step": 17471 + }, + { + "epoch": 0.8134646274181158, + "grad_norm": 0.4122801233107707, + "learning_rate": 9.13389496623817e-05, + "loss": 2.9633, + "step": 17472 + }, + { + "epoch": 0.8135111856042089, + "grad_norm": 0.37773699133059213, + "learning_rate": 9.133742586458447e-05, + "loss": 2.8786, + "step": 17473 + }, + { + "epoch": 0.8135577437903019, + "grad_norm": 0.3625619010850551, + "learning_rate": 9.133590194546598e-05, + "loss": 2.9151, + "step": 17474 + }, + { + "epoch": 0.813604301976395, + "grad_norm": 0.3647238180542158, + "learning_rate": 9.133437790503066e-05, + "loss": 2.9393, + "step": 17475 + }, + { + "epoch": 0.8136508601624881, + "grad_norm": 0.3450280687953069, + "learning_rate": 9.133285374328303e-05, + "loss": 2.8756, + "step": 17476 + }, + { + "epoch": 0.8136974183485811, + "grad_norm": 0.34412404104447486, + "learning_rate": 9.133132946022754e-05, + "loss": 3.0178, + "step": 17477 + }, + { + "epoch": 0.8137439765346742, + "grad_norm": 0.3749596275847965, + "learning_rate": 9.132980505586868e-05, + "loss": 2.8913, + "step": 17478 + }, + { + "epoch": 0.8137905347207672, + "grad_norm": 0.3499858395548727, + "learning_rate": 9.13282805302109e-05, + "loss": 2.9399, + "step": 17479 + }, + { + "epoch": 0.8138370929068603, + "grad_norm": 0.3653862433796773, + "learning_rate": 9.13267558832587e-05, + "loss": 2.9546, + "step": 17480 + }, + { + "epoch": 0.8138836510929535, + "grad_norm": 0.3667352778767619, + "learning_rate": 9.132523111501653e-05, + "loss": 3.0265, + "step": 17481 + }, + { + "epoch": 0.8139302092790465, + "grad_norm": 0.3387043781099144, + "learning_rate": 
9.132370622548889e-05, + "loss": 2.9298, + "step": 17482 + }, + { + "epoch": 0.8139767674651396, + "grad_norm": 0.3711955263985487, + "learning_rate": 9.132218121468024e-05, + "loss": 2.9426, + "step": 17483 + }, + { + "epoch": 0.8140233256512326, + "grad_norm": 0.3450103365743195, + "learning_rate": 9.132065608259506e-05, + "loss": 2.9215, + "step": 17484 + }, + { + "epoch": 0.8140698838373257, + "grad_norm": 0.37431057342972784, + "learning_rate": 9.131913082923782e-05, + "loss": 2.992, + "step": 17485 + }, + { + "epoch": 0.8141164420234188, + "grad_norm": 0.37431154715998555, + "learning_rate": 9.131760545461299e-05, + "loss": 3.0541, + "step": 17486 + }, + { + "epoch": 0.8141630002095118, + "grad_norm": 0.33234801925546115, + "learning_rate": 9.131607995872508e-05, + "loss": 2.8589, + "step": 17487 + }, + { + "epoch": 0.8142095583956049, + "grad_norm": 0.3041880153160741, + "learning_rate": 9.131455434157854e-05, + "loss": 2.8665, + "step": 17488 + }, + { + "epoch": 0.8142561165816979, + "grad_norm": 0.3031501372953136, + "learning_rate": 9.131302860317785e-05, + "loss": 2.7993, + "step": 17489 + }, + { + "epoch": 0.8143026747677911, + "grad_norm": 0.31619593241134925, + "learning_rate": 9.13115027435275e-05, + "loss": 2.8431, + "step": 17490 + }, + { + "epoch": 0.8143492329538841, + "grad_norm": 0.31123225066201854, + "learning_rate": 9.130997676263195e-05, + "loss": 3.0267, + "step": 17491 + }, + { + "epoch": 0.8143957911399772, + "grad_norm": 0.3419622811587792, + "learning_rate": 9.130845066049568e-05, + "loss": 2.972, + "step": 17492 + }, + { + "epoch": 0.8144423493260703, + "grad_norm": 0.3115369284042464, + "learning_rate": 9.130692443712319e-05, + "loss": 2.8913, + "step": 17493 + }, + { + "epoch": 0.8144889075121633, + "grad_norm": 0.37066285238776403, + "learning_rate": 9.130539809251895e-05, + "loss": 2.8422, + "step": 17494 + }, + { + "epoch": 0.8145354656982564, + "grad_norm": 0.3664975807405926, + "learning_rate": 9.130387162668743e-05, + "loss": 2.9345, + "step": 17495 + }, + { + "epoch": 0.8145820238843494, + "grad_norm": 0.3234675687184563, + "learning_rate": 9.130234503963311e-05, + "loss": 3.0305, + "step": 17496 + }, + { + "epoch": 0.8146285820704425, + "grad_norm": 0.33793578885957215, + "learning_rate": 9.130081833136049e-05, + "loss": 2.9808, + "step": 17497 + }, + { + "epoch": 0.8146751402565356, + "grad_norm": 0.33107306624678395, + "learning_rate": 9.129929150187401e-05, + "loss": 2.9247, + "step": 17498 + }, + { + "epoch": 0.8147216984426287, + "grad_norm": 0.33516738795099565, + "learning_rate": 9.12977645511782e-05, + "loss": 2.9169, + "step": 17499 + }, + { + "epoch": 0.8147682566287218, + "grad_norm": 0.33036492583309274, + "learning_rate": 9.129623747927751e-05, + "loss": 2.8941, + "step": 17500 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 0.30412893318328227, + "learning_rate": 9.129471028617644e-05, + "loss": 3.0087, + "step": 17501 + }, + { + "epoch": 0.8148613730009079, + "grad_norm": 0.3126363516875288, + "learning_rate": 9.129318297187946e-05, + "loss": 3.0271, + "step": 17502 + }, + { + "epoch": 0.814907931187001, + "grad_norm": 0.3272280485733211, + "learning_rate": 9.129165553639106e-05, + "loss": 2.9411, + "step": 17503 + }, + { + "epoch": 0.814954489373094, + "grad_norm": 0.33187425344975474, + "learning_rate": 9.129012797971571e-05, + "loss": 2.904, + "step": 17504 + }, + { + "epoch": 0.8150010475591871, + "grad_norm": 0.33343579157598474, + "learning_rate": 9.12886003018579e-05, + "loss": 2.8764, + "step": 17505 + }, + { + "epoch": 
0.8150476057452801, + "grad_norm": 0.31395522360013073, + "learning_rate": 9.128707250282214e-05, + "loss": 2.9902, + "step": 17506 + }, + { + "epoch": 0.8150941639313732, + "grad_norm": 0.36877844760862544, + "learning_rate": 9.128554458261287e-05, + "loss": 2.9831, + "step": 17507 + }, + { + "epoch": 0.8151407221174664, + "grad_norm": 0.31550777941978936, + "learning_rate": 9.12840165412346e-05, + "loss": 2.8817, + "step": 17508 + }, + { + "epoch": 0.8151872803035594, + "grad_norm": 0.3653640544753963, + "learning_rate": 9.12824883786918e-05, + "loss": 3.0, + "step": 17509 + }, + { + "epoch": 0.8152338384896525, + "grad_norm": 0.3550849229528378, + "learning_rate": 9.128096009498896e-05, + "loss": 3.0164, + "step": 17510 + }, + { + "epoch": 0.8152803966757455, + "grad_norm": 0.3084050438217754, + "learning_rate": 9.127943169013057e-05, + "loss": 2.9568, + "step": 17511 + }, + { + "epoch": 0.8153269548618386, + "grad_norm": 0.37594984533159587, + "learning_rate": 9.127790316412114e-05, + "loss": 2.9446, + "step": 17512 + }, + { + "epoch": 0.8153735130479316, + "grad_norm": 0.3510909320419332, + "learning_rate": 9.127637451696511e-05, + "loss": 2.9272, + "step": 17513 + }, + { + "epoch": 0.8154200712340247, + "grad_norm": 0.3728975042939496, + "learning_rate": 9.127484574866699e-05, + "loss": 2.9828, + "step": 17514 + }, + { + "epoch": 0.8154666294201178, + "grad_norm": 0.40675205104240736, + "learning_rate": 9.127331685923127e-05, + "loss": 2.8992, + "step": 17515 + }, + { + "epoch": 0.8155131876062108, + "grad_norm": 0.3371364399961062, + "learning_rate": 9.127178784866242e-05, + "loss": 3.0073, + "step": 17516 + }, + { + "epoch": 0.815559745792304, + "grad_norm": 0.3993231930872489, + "learning_rate": 9.127025871696495e-05, + "loss": 2.9817, + "step": 17517 + }, + { + "epoch": 0.815606303978397, + "grad_norm": 0.34945297024291444, + "learning_rate": 9.126872946414333e-05, + "loss": 2.9875, + "step": 17518 + }, + { + "epoch": 0.8156528621644901, + "grad_norm": 0.43359046345224556, + "learning_rate": 9.126720009020205e-05, + "loss": 2.9882, + "step": 17519 + }, + { + "epoch": 0.8156994203505832, + "grad_norm": 0.37108637246223364, + "learning_rate": 9.126567059514562e-05, + "loss": 2.9256, + "step": 17520 + }, + { + "epoch": 0.8157459785366762, + "grad_norm": 0.3934506280859894, + "learning_rate": 9.126414097897849e-05, + "loss": 2.9715, + "step": 17521 + }, + { + "epoch": 0.8157925367227693, + "grad_norm": 0.42077632253961506, + "learning_rate": 9.126261124170518e-05, + "loss": 2.9374, + "step": 17522 + }, + { + "epoch": 0.8158390949088623, + "grad_norm": 0.30920546204666616, + "learning_rate": 9.126108138333019e-05, + "loss": 2.991, + "step": 17523 + }, + { + "epoch": 0.8158856530949554, + "grad_norm": 0.3848461298155644, + "learning_rate": 9.125955140385796e-05, + "loss": 2.9219, + "step": 17524 + }, + { + "epoch": 0.8159322112810485, + "grad_norm": 0.3396354005868409, + "learning_rate": 9.125802130329302e-05, + "loss": 2.8866, + "step": 17525 + }, + { + "epoch": 0.8159787694671415, + "grad_norm": 0.3930590693732462, + "learning_rate": 9.125649108163987e-05, + "loss": 3.0023, + "step": 17526 + }, + { + "epoch": 0.8160253276532347, + "grad_norm": 0.3541339751804622, + "learning_rate": 9.125496073890296e-05, + "loss": 2.8963, + "step": 17527 + }, + { + "epoch": 0.8160718858393277, + "grad_norm": 0.37670639255107485, + "learning_rate": 9.125343027508682e-05, + "loss": 2.9088, + "step": 17528 + }, + { + "epoch": 0.8161184440254208, + "grad_norm": 0.349592155794061, + "learning_rate": 
9.125189969019592e-05, + "loss": 2.8845, + "step": 17529 + }, + { + "epoch": 0.8161650022115139, + "grad_norm": 0.3759189735869836, + "learning_rate": 9.125036898423476e-05, + "loss": 2.9824, + "step": 17530 + }, + { + "epoch": 0.8162115603976069, + "grad_norm": 0.35693260517717235, + "learning_rate": 9.124883815720783e-05, + "loss": 2.8739, + "step": 17531 + }, + { + "epoch": 0.8162581185837, + "grad_norm": 0.3524415651045116, + "learning_rate": 9.124730720911962e-05, + "loss": 2.927, + "step": 17532 + }, + { + "epoch": 0.816304676769793, + "grad_norm": 0.39564850159010656, + "learning_rate": 9.124577613997463e-05, + "loss": 2.9339, + "step": 17533 + }, + { + "epoch": 0.8163512349558861, + "grad_norm": 0.3473171712532397, + "learning_rate": 9.124424494977736e-05, + "loss": 2.9361, + "step": 17534 + }, + { + "epoch": 0.8163977931419791, + "grad_norm": 0.3943317721977693, + "learning_rate": 9.124271363853228e-05, + "loss": 2.9877, + "step": 17535 + }, + { + "epoch": 0.8164443513280722, + "grad_norm": 0.33854175734731884, + "learning_rate": 9.124118220624391e-05, + "loss": 3.0184, + "step": 17536 + }, + { + "epoch": 0.8164909095141654, + "grad_norm": 0.3774094991764441, + "learning_rate": 9.123965065291673e-05, + "loss": 2.9312, + "step": 17537 + }, + { + "epoch": 0.8165374677002584, + "grad_norm": 0.3408008361009795, + "learning_rate": 9.123811897855523e-05, + "loss": 2.8963, + "step": 17538 + }, + { + "epoch": 0.8165840258863515, + "grad_norm": 0.37371849795722434, + "learning_rate": 9.123658718316393e-05, + "loss": 3.0206, + "step": 17539 + }, + { + "epoch": 0.8166305840724445, + "grad_norm": 0.3604457647704325, + "learning_rate": 9.123505526674729e-05, + "loss": 3.0116, + "step": 17540 + }, + { + "epoch": 0.8166771422585376, + "grad_norm": 0.3591467304280421, + "learning_rate": 9.123352322930983e-05, + "loss": 2.9222, + "step": 17541 + }, + { + "epoch": 0.8167237004446307, + "grad_norm": 0.327436590083997, + "learning_rate": 9.123199107085605e-05, + "loss": 2.9572, + "step": 17542 + }, + { + "epoch": 0.8167702586307237, + "grad_norm": 0.3599851198175121, + "learning_rate": 9.123045879139042e-05, + "loss": 2.8507, + "step": 17543 + }, + { + "epoch": 0.8168168168168168, + "grad_norm": 0.3591989561048305, + "learning_rate": 9.122892639091748e-05, + "loss": 3.0233, + "step": 17544 + }, + { + "epoch": 0.8168633750029098, + "grad_norm": 0.3671747945423963, + "learning_rate": 9.122739386944169e-05, + "loss": 2.8362, + "step": 17545 + }, + { + "epoch": 0.816909933189003, + "grad_norm": 0.3254765210614006, + "learning_rate": 9.122586122696757e-05, + "loss": 2.8682, + "step": 17546 + }, + { + "epoch": 0.8169564913750961, + "grad_norm": 0.36545623876127636, + "learning_rate": 9.12243284634996e-05, + "loss": 2.862, + "step": 17547 + }, + { + "epoch": 0.8170030495611891, + "grad_norm": 0.3282581666443781, + "learning_rate": 9.122279557904228e-05, + "loss": 2.8989, + "step": 17548 + }, + { + "epoch": 0.8170496077472822, + "grad_norm": 0.3388806311575792, + "learning_rate": 9.122126257360012e-05, + "loss": 2.9815, + "step": 17549 + }, + { + "epoch": 0.8170961659333752, + "grad_norm": 0.33512196230501234, + "learning_rate": 9.121972944717762e-05, + "loss": 2.9656, + "step": 17550 + }, + { + "epoch": 0.8171427241194683, + "grad_norm": 0.32842869653437434, + "learning_rate": 9.121819619977928e-05, + "loss": 2.8517, + "step": 17551 + }, + { + "epoch": 0.8171892823055614, + "grad_norm": 0.3343573104158073, + "learning_rate": 9.12166628314096e-05, + "loss": 3.0001, + "step": 17552 + }, + { + "epoch": 
0.8172358404916544, + "grad_norm": 0.3260293160281882, + "learning_rate": 9.121512934207306e-05, + "loss": 2.8537, + "step": 17553 + }, + { + "epoch": 0.8172823986777475, + "grad_norm": 0.32206273486868026, + "learning_rate": 9.121359573177417e-05, + "loss": 2.9708, + "step": 17554 + }, + { + "epoch": 0.8173289568638406, + "grad_norm": 0.3527873679310721, + "learning_rate": 9.121206200051744e-05, + "loss": 2.9517, + "step": 17555 + }, + { + "epoch": 0.8173755150499337, + "grad_norm": 0.33675830426454856, + "learning_rate": 9.121052814830739e-05, + "loss": 2.8567, + "step": 17556 + }, + { + "epoch": 0.8174220732360267, + "grad_norm": 0.3331619817982727, + "learning_rate": 9.120899417514848e-05, + "loss": 2.9205, + "step": 17557 + }, + { + "epoch": 0.8174686314221198, + "grad_norm": 0.38570893500253695, + "learning_rate": 9.120746008104524e-05, + "loss": 2.9298, + "step": 17558 + }, + { + "epoch": 0.8175151896082129, + "grad_norm": 0.36935196674863024, + "learning_rate": 9.120592586600216e-05, + "loss": 2.9403, + "step": 17559 + }, + { + "epoch": 0.8175617477943059, + "grad_norm": 0.40945536269643523, + "learning_rate": 9.120439153002374e-05, + "loss": 3.0409, + "step": 17560 + }, + { + "epoch": 0.817608305980399, + "grad_norm": 0.31678167456026424, + "learning_rate": 9.12028570731145e-05, + "loss": 3.0268, + "step": 17561 + }, + { + "epoch": 0.817654864166492, + "grad_norm": 0.3995070011611105, + "learning_rate": 9.120132249527893e-05, + "loss": 2.9342, + "step": 17562 + }, + { + "epoch": 0.8177014223525851, + "grad_norm": 0.35292390482909747, + "learning_rate": 9.119978779652155e-05, + "loss": 2.9331, + "step": 17563 + }, + { + "epoch": 0.8177479805386783, + "grad_norm": 0.34106588987652275, + "learning_rate": 9.119825297684683e-05, + "loss": 2.8816, + "step": 17564 + }, + { + "epoch": 0.8177945387247713, + "grad_norm": 0.3507755903526113, + "learning_rate": 9.119671803625932e-05, + "loss": 2.9612, + "step": 17565 + }, + { + "epoch": 0.8178410969108644, + "grad_norm": 0.33179640548137623, + "learning_rate": 9.119518297476348e-05, + "loss": 2.7238, + "step": 17566 + }, + { + "epoch": 0.8178876550969574, + "grad_norm": 0.3345012170287292, + "learning_rate": 9.119364779236384e-05, + "loss": 2.9525, + "step": 17567 + }, + { + "epoch": 0.8179342132830505, + "grad_norm": 0.35879065650111325, + "learning_rate": 9.11921124890649e-05, + "loss": 2.8682, + "step": 17568 + }, + { + "epoch": 0.8179807714691436, + "grad_norm": 0.38763036189244926, + "learning_rate": 9.119057706487117e-05, + "loss": 3.0367, + "step": 17569 + }, + { + "epoch": 0.8180273296552366, + "grad_norm": 0.34137297806159606, + "learning_rate": 9.118904151978717e-05, + "loss": 3.0127, + "step": 17570 + }, + { + "epoch": 0.8180738878413297, + "grad_norm": 0.36255544262386685, + "learning_rate": 9.118750585381738e-05, + "loss": 2.9367, + "step": 17571 + }, + { + "epoch": 0.8181204460274227, + "grad_norm": 0.3562245811754041, + "learning_rate": 9.118597006696632e-05, + "loss": 2.9245, + "step": 17572 + }, + { + "epoch": 0.8181670042135158, + "grad_norm": 0.32538178148867214, + "learning_rate": 9.118443415923848e-05, + "loss": 2.9477, + "step": 17573 + }, + { + "epoch": 0.818213562399609, + "grad_norm": 0.39632909328261834, + "learning_rate": 9.11828981306384e-05, + "loss": 2.9935, + "step": 17574 + }, + { + "epoch": 0.818260120585702, + "grad_norm": 0.34168284041428476, + "learning_rate": 9.118136198117058e-05, + "loss": 2.8879, + "step": 17575 + }, + { + "epoch": 0.8183066787717951, + "grad_norm": 0.3653065607663675, + 
"learning_rate": 9.11798257108395e-05, + "loss": 2.9666, + "step": 17576 + }, + { + "epoch": 0.8183532369578881, + "grad_norm": 0.33938123524395125, + "learning_rate": 9.117828931964969e-05, + "loss": 2.8989, + "step": 17577 + }, + { + "epoch": 0.8183997951439812, + "grad_norm": 0.34187596800881803, + "learning_rate": 9.117675280760566e-05, + "loss": 3.0185, + "step": 17578 + }, + { + "epoch": 0.8184463533300742, + "grad_norm": 0.32894335719980106, + "learning_rate": 9.117521617471191e-05, + "loss": 3.0112, + "step": 17579 + }, + { + "epoch": 0.8184929115161673, + "grad_norm": 0.34683166704314433, + "learning_rate": 9.117367942097298e-05, + "loss": 2.8712, + "step": 17580 + }, + { + "epoch": 0.8185394697022604, + "grad_norm": 0.3415570824230168, + "learning_rate": 9.117214254639335e-05, + "loss": 2.9186, + "step": 17581 + }, + { + "epoch": 0.8185860278883534, + "grad_norm": 0.33611759896088833, + "learning_rate": 9.117060555097752e-05, + "loss": 2.8469, + "step": 17582 + }, + { + "epoch": 0.8186325860744466, + "grad_norm": 0.3390276571290703, + "learning_rate": 9.116906843473003e-05, + "loss": 2.9298, + "step": 17583 + }, + { + "epoch": 0.8186791442605396, + "grad_norm": 0.3822332364668328, + "learning_rate": 9.116753119765539e-05, + "loss": 2.9747, + "step": 17584 + }, + { + "epoch": 0.8187257024466327, + "grad_norm": 0.37145174104353984, + "learning_rate": 9.116599383975809e-05, + "loss": 2.9461, + "step": 17585 + }, + { + "epoch": 0.8187722606327258, + "grad_norm": 0.3444612026832428, + "learning_rate": 9.116445636104265e-05, + "loss": 2.8984, + "step": 17586 + }, + { + "epoch": 0.8188188188188188, + "grad_norm": 0.3817545345767605, + "learning_rate": 9.11629187615136e-05, + "loss": 2.8682, + "step": 17587 + }, + { + "epoch": 0.8188653770049119, + "grad_norm": 0.38059209993141363, + "learning_rate": 9.116138104117542e-05, + "loss": 3.042, + "step": 17588 + }, + { + "epoch": 0.8189119351910049, + "grad_norm": 0.36871337456633674, + "learning_rate": 9.115984320003266e-05, + "loss": 2.9101, + "step": 17589 + }, + { + "epoch": 0.818958493377098, + "grad_norm": 0.4165687042491185, + "learning_rate": 9.115830523808982e-05, + "loss": 2.8981, + "step": 17590 + }, + { + "epoch": 0.8190050515631911, + "grad_norm": 0.34455619148668626, + "learning_rate": 9.115676715535138e-05, + "loss": 2.9217, + "step": 17591 + }, + { + "epoch": 0.8190516097492841, + "grad_norm": 0.34266292677206467, + "learning_rate": 9.11552289518219e-05, + "loss": 3.0018, + "step": 17592 + }, + { + "epoch": 0.8190981679353773, + "grad_norm": 0.3959467829430263, + "learning_rate": 9.115369062750587e-05, + "loss": 2.9609, + "step": 17593 + }, + { + "epoch": 0.8191447261214703, + "grad_norm": 0.3381573716825485, + "learning_rate": 9.115215218240783e-05, + "loss": 2.9769, + "step": 17594 + }, + { + "epoch": 0.8191912843075634, + "grad_norm": 0.43482258815775854, + "learning_rate": 9.115061361653225e-05, + "loss": 3.0517, + "step": 17595 + }, + { + "epoch": 0.8192378424936565, + "grad_norm": 0.40454876255223066, + "learning_rate": 9.11490749298837e-05, + "loss": 2.9177, + "step": 17596 + }, + { + "epoch": 0.8192844006797495, + "grad_norm": 0.44719964562178866, + "learning_rate": 9.114753612246667e-05, + "loss": 2.871, + "step": 17597 + }, + { + "epoch": 0.8193309588658426, + "grad_norm": 0.4047060166030958, + "learning_rate": 9.114599719428565e-05, + "loss": 2.9212, + "step": 17598 + }, + { + "epoch": 0.8193775170519356, + "grad_norm": 0.38746045815832947, + "learning_rate": 9.11444581453452e-05, + "loss": 3.0201, + "step": 17599 + 
}, + { + "epoch": 0.8194240752380287, + "grad_norm": 0.3775795012053722, + "learning_rate": 9.114291897564981e-05, + "loss": 2.9818, + "step": 17600 + }, + { + "epoch": 0.8194706334241217, + "grad_norm": 0.3443014451129098, + "learning_rate": 9.114137968520401e-05, + "loss": 2.8685, + "step": 17601 + }, + { + "epoch": 0.8195171916102149, + "grad_norm": 0.39392018043412386, + "learning_rate": 9.113984027401233e-05, + "loss": 2.8085, + "step": 17602 + }, + { + "epoch": 0.819563749796308, + "grad_norm": 0.3546823529157206, + "learning_rate": 9.113830074207924e-05, + "loss": 2.8556, + "step": 17603 + }, + { + "epoch": 0.819610307982401, + "grad_norm": 0.40276885881090463, + "learning_rate": 9.11367610894093e-05, + "loss": 2.9012, + "step": 17604 + }, + { + "epoch": 0.8196568661684941, + "grad_norm": 0.3534310923582913, + "learning_rate": 9.113522131600703e-05, + "loss": 3.0715, + "step": 17605 + }, + { + "epoch": 0.8197034243545871, + "grad_norm": 0.3592354554585181, + "learning_rate": 9.113368142187693e-05, + "loss": 2.819, + "step": 17606 + }, + { + "epoch": 0.8197499825406802, + "grad_norm": 0.35747658306355695, + "learning_rate": 9.113214140702354e-05, + "loss": 2.7789, + "step": 17607 + }, + { + "epoch": 0.8197965407267733, + "grad_norm": 0.38047531872587004, + "learning_rate": 9.113060127145135e-05, + "loss": 2.8554, + "step": 17608 + }, + { + "epoch": 0.8198430989128663, + "grad_norm": 0.32997947276568707, + "learning_rate": 9.11290610151649e-05, + "loss": 2.9829, + "step": 17609 + }, + { + "epoch": 0.8198896570989594, + "grad_norm": 0.38673347824420384, + "learning_rate": 9.112752063816872e-05, + "loss": 3.0658, + "step": 17610 + }, + { + "epoch": 0.8199362152850524, + "grad_norm": 0.3475704821152422, + "learning_rate": 9.11259801404673e-05, + "loss": 2.8944, + "step": 17611 + }, + { + "epoch": 0.8199827734711456, + "grad_norm": 0.33932579858910333, + "learning_rate": 9.112443952206519e-05, + "loss": 2.8808, + "step": 17612 + }, + { + "epoch": 0.8200293316572387, + "grad_norm": 0.4038084603533337, + "learning_rate": 9.112289878296691e-05, + "loss": 2.9208, + "step": 17613 + }, + { + "epoch": 0.8200758898433317, + "grad_norm": 0.37235273226970705, + "learning_rate": 9.112135792317695e-05, + "loss": 2.9385, + "step": 17614 + }, + { + "epoch": 0.8201224480294248, + "grad_norm": 0.3791335478498045, + "learning_rate": 9.111981694269988e-05, + "loss": 2.9607, + "step": 17615 + }, + { + "epoch": 0.8201690062155178, + "grad_norm": 0.40687834649586346, + "learning_rate": 9.111827584154019e-05, + "loss": 2.9305, + "step": 17616 + }, + { + "epoch": 0.8202155644016109, + "grad_norm": 0.3510663592690428, + "learning_rate": 9.111673461970242e-05, + "loss": 2.848, + "step": 17617 + }, + { + "epoch": 0.820262122587704, + "grad_norm": 0.3925333580527856, + "learning_rate": 9.111519327719107e-05, + "loss": 2.9512, + "step": 17618 + }, + { + "epoch": 0.820308680773797, + "grad_norm": 0.396163135775934, + "learning_rate": 9.111365181401068e-05, + "loss": 2.9227, + "step": 17619 + }, + { + "epoch": 0.8203552389598902, + "grad_norm": 0.3307678274469439, + "learning_rate": 9.111211023016577e-05, + "loss": 2.964, + "step": 17620 + }, + { + "epoch": 0.8204017971459832, + "grad_norm": 0.40034613895974414, + "learning_rate": 9.111056852566088e-05, + "loss": 3.0401, + "step": 17621 + }, + { + "epoch": 0.8204483553320763, + "grad_norm": 0.3333385115498391, + "learning_rate": 9.110902670050052e-05, + "loss": 2.9186, + "step": 17622 + }, + { + "epoch": 0.8204949135181693, + "grad_norm": 0.36576752093788867, + 
"learning_rate": 9.11074847546892e-05, + "loss": 2.9867, + "step": 17623 + }, + { + "epoch": 0.8205414717042624, + "grad_norm": 0.3711552323417087, + "learning_rate": 9.110594268823147e-05, + "loss": 2.9943, + "step": 17624 + }, + { + "epoch": 0.8205880298903555, + "grad_norm": 0.33794748218942766, + "learning_rate": 9.110440050113185e-05, + "loss": 2.9229, + "step": 17625 + }, + { + "epoch": 0.8206345880764485, + "grad_norm": 0.348582035852407, + "learning_rate": 9.110285819339487e-05, + "loss": 2.9291, + "step": 17626 + }, + { + "epoch": 0.8206811462625416, + "grad_norm": 0.3495824307110165, + "learning_rate": 9.110131576502504e-05, + "loss": 2.8445, + "step": 17627 + }, + { + "epoch": 0.8207277044486346, + "grad_norm": 0.35002918684510875, + "learning_rate": 9.109977321602691e-05, + "loss": 2.8998, + "step": 17628 + }, + { + "epoch": 0.8207742626347277, + "grad_norm": 0.34394312959229517, + "learning_rate": 9.109823054640498e-05, + "loss": 2.9646, + "step": 17629 + }, + { + "epoch": 0.8208208208208209, + "grad_norm": 0.3859692998090605, + "learning_rate": 9.10966877561638e-05, + "loss": 2.9721, + "step": 17630 + }, + { + "epoch": 0.8208673790069139, + "grad_norm": 0.34625632122367117, + "learning_rate": 9.109514484530789e-05, + "loss": 2.9276, + "step": 17631 + }, + { + "epoch": 0.820913937193007, + "grad_norm": 0.3994210361003402, + "learning_rate": 9.109360181384177e-05, + "loss": 2.8525, + "step": 17632 + }, + { + "epoch": 0.8209604953791, + "grad_norm": 0.3729332992964673, + "learning_rate": 9.109205866177e-05, + "loss": 2.9969, + "step": 17633 + }, + { + "epoch": 0.8210070535651931, + "grad_norm": 0.3642688803162643, + "learning_rate": 9.109051538909707e-05, + "loss": 2.9181, + "step": 17634 + }, + { + "epoch": 0.8210536117512862, + "grad_norm": 0.36310315077490846, + "learning_rate": 9.108897199582753e-05, + "loss": 2.9777, + "step": 17635 + }, + { + "epoch": 0.8211001699373792, + "grad_norm": 0.36799921625857007, + "learning_rate": 9.108742848196588e-05, + "loss": 2.9007, + "step": 17636 + }, + { + "epoch": 0.8211467281234723, + "grad_norm": 0.360140884021085, + "learning_rate": 9.10858848475167e-05, + "loss": 2.9145, + "step": 17637 + }, + { + "epoch": 0.8211932863095653, + "grad_norm": 0.32964265793633174, + "learning_rate": 9.10843410924845e-05, + "loss": 2.9355, + "step": 17638 + }, + { + "epoch": 0.8212398444956585, + "grad_norm": 0.3819838574695393, + "learning_rate": 9.10827972168738e-05, + "loss": 2.9121, + "step": 17639 + }, + { + "epoch": 0.8212864026817516, + "grad_norm": 0.3418682977603321, + "learning_rate": 9.108125322068912e-05, + "loss": 2.9411, + "step": 17640 + }, + { + "epoch": 0.8213329608678446, + "grad_norm": 0.39944488074567736, + "learning_rate": 9.107970910393503e-05, + "loss": 3.0233, + "step": 17641 + }, + { + "epoch": 0.8213795190539377, + "grad_norm": 0.374617596948994, + "learning_rate": 9.107816486661604e-05, + "loss": 2.8878, + "step": 17642 + }, + { + "epoch": 0.8214260772400307, + "grad_norm": 0.3466023284850201, + "learning_rate": 9.107662050873667e-05, + "loss": 2.8745, + "step": 17643 + }, + { + "epoch": 0.8214726354261238, + "grad_norm": 0.4172198315161939, + "learning_rate": 9.107507603030147e-05, + "loss": 2.9488, + "step": 17644 + }, + { + "epoch": 0.8215191936122168, + "grad_norm": 0.349080752989872, + "learning_rate": 9.107353143131498e-05, + "loss": 2.9452, + "step": 17645 + }, + { + "epoch": 0.8215657517983099, + "grad_norm": 0.36745522812684805, + "learning_rate": 9.107198671178172e-05, + "loss": 3.0039, + "step": 17646 + }, + { + 
"epoch": 0.821612309984403, + "grad_norm": 0.36879104054137707, + "learning_rate": 9.107044187170621e-05, + "loss": 2.9452, + "step": 17647 + }, + { + "epoch": 0.821658868170496, + "grad_norm": 0.34456541872672564, + "learning_rate": 9.106889691109302e-05, + "loss": 2.9525, + "step": 17648 + }, + { + "epoch": 0.8217054263565892, + "grad_norm": 0.4437339218273278, + "learning_rate": 9.106735182994665e-05, + "loss": 2.9211, + "step": 17649 + }, + { + "epoch": 0.8217519845426822, + "grad_norm": 0.36045594779930107, + "learning_rate": 9.106580662827165e-05, + "loss": 2.9703, + "step": 17650 + }, + { + "epoch": 0.8217985427287753, + "grad_norm": 0.4042810749770579, + "learning_rate": 9.106426130607254e-05, + "loss": 3.0844, + "step": 17651 + }, + { + "epoch": 0.8218451009148684, + "grad_norm": 0.4100553789490916, + "learning_rate": 9.106271586335388e-05, + "loss": 2.8557, + "step": 17652 + }, + { + "epoch": 0.8218916591009614, + "grad_norm": 0.3974771112298596, + "learning_rate": 9.10611703001202e-05, + "loss": 2.9803, + "step": 17653 + }, + { + "epoch": 0.8219382172870545, + "grad_norm": 0.36641554318225966, + "learning_rate": 9.105962461637603e-05, + "loss": 2.8248, + "step": 17654 + }, + { + "epoch": 0.8219847754731475, + "grad_norm": 0.3866567767529517, + "learning_rate": 9.105807881212591e-05, + "loss": 2.8967, + "step": 17655 + }, + { + "epoch": 0.8220313336592406, + "grad_norm": 0.3780452710634526, + "learning_rate": 9.105653288737437e-05, + "loss": 3.0265, + "step": 17656 + }, + { + "epoch": 0.8220778918453338, + "grad_norm": 0.3724692147173662, + "learning_rate": 9.105498684212595e-05, + "loss": 3.0096, + "step": 17657 + }, + { + "epoch": 0.8221244500314268, + "grad_norm": 0.3460263916217021, + "learning_rate": 9.105344067638518e-05, + "loss": 2.939, + "step": 17658 + }, + { + "epoch": 0.8221710082175199, + "grad_norm": 0.37688523016056236, + "learning_rate": 9.105189439015661e-05, + "loss": 2.9999, + "step": 17659 + }, + { + "epoch": 0.8222175664036129, + "grad_norm": 0.35763263803063067, + "learning_rate": 9.105034798344478e-05, + "loss": 3.0617, + "step": 17660 + }, + { + "epoch": 0.822264124589706, + "grad_norm": 0.3626790286600427, + "learning_rate": 9.104880145625421e-05, + "loss": 2.9999, + "step": 17661 + }, + { + "epoch": 0.8223106827757991, + "grad_norm": 0.36087160002628993, + "learning_rate": 9.104725480858947e-05, + "loss": 2.9602, + "step": 17662 + }, + { + "epoch": 0.8223572409618921, + "grad_norm": 0.3674820749585694, + "learning_rate": 9.104570804045506e-05, + "loss": 2.9711, + "step": 17663 + }, + { + "epoch": 0.8224037991479852, + "grad_norm": 0.39413422523770186, + "learning_rate": 9.104416115185556e-05, + "loss": 2.9616, + "step": 17664 + }, + { + "epoch": 0.8224503573340782, + "grad_norm": 0.404476975175679, + "learning_rate": 9.104261414279548e-05, + "loss": 2.9009, + "step": 17665 + }, + { + "epoch": 0.8224969155201713, + "grad_norm": 0.3870651785573555, + "learning_rate": 9.104106701327936e-05, + "loss": 3.0163, + "step": 17666 + }, + { + "epoch": 0.8225434737062643, + "grad_norm": 0.40130268337641445, + "learning_rate": 9.103951976331177e-05, + "loss": 2.8751, + "step": 17667 + }, + { + "epoch": 0.8225900318923575, + "grad_norm": 0.3666861040283247, + "learning_rate": 9.103797239289724e-05, + "loss": 2.9927, + "step": 17668 + }, + { + "epoch": 0.8226365900784506, + "grad_norm": 0.39996143233913617, + "learning_rate": 9.10364249020403e-05, + "loss": 3.0056, + "step": 17669 + }, + { + "epoch": 0.8226831482645436, + "grad_norm": 0.46621686995713274, + 
"learning_rate": 9.103487729074547e-05, + "loss": 2.8754, + "step": 17670 + }, + { + "epoch": 0.8227297064506367, + "grad_norm": 0.389957316112865, + "learning_rate": 9.103332955901734e-05, + "loss": 2.7878, + "step": 17671 + }, + { + "epoch": 0.8227762646367297, + "grad_norm": 0.44246646702642906, + "learning_rate": 9.103178170686043e-05, + "loss": 2.9696, + "step": 17672 + }, + { + "epoch": 0.8228228228228228, + "grad_norm": 0.3450438391024675, + "learning_rate": 9.103023373427927e-05, + "loss": 2.8144, + "step": 17673 + }, + { + "epoch": 0.8228693810089159, + "grad_norm": 0.3822223882243087, + "learning_rate": 9.102868564127844e-05, + "loss": 2.8601, + "step": 17674 + }, + { + "epoch": 0.8229159391950089, + "grad_norm": 0.380298809410141, + "learning_rate": 9.102713742786245e-05, + "loss": 2.9427, + "step": 17675 + }, + { + "epoch": 0.822962497381102, + "grad_norm": 0.360369413775753, + "learning_rate": 9.102558909403583e-05, + "loss": 3.0022, + "step": 17676 + }, + { + "epoch": 0.8230090555671951, + "grad_norm": 0.4053119059615079, + "learning_rate": 9.102404063980317e-05, + "loss": 2.9043, + "step": 17677 + }, + { + "epoch": 0.8230556137532882, + "grad_norm": 0.37911962836640745, + "learning_rate": 9.102249206516899e-05, + "loss": 2.9753, + "step": 17678 + }, + { + "epoch": 0.8231021719393813, + "grad_norm": 0.3954851471611559, + "learning_rate": 9.102094337013782e-05, + "loss": 2.8888, + "step": 17679 + }, + { + "epoch": 0.8231487301254743, + "grad_norm": 0.3466812914138444, + "learning_rate": 9.101939455471425e-05, + "loss": 2.9022, + "step": 17680 + }, + { + "epoch": 0.8231952883115674, + "grad_norm": 0.4157764585812427, + "learning_rate": 9.101784561890278e-05, + "loss": 2.991, + "step": 17681 + }, + { + "epoch": 0.8232418464976604, + "grad_norm": 0.3763293263671609, + "learning_rate": 9.101629656270798e-05, + "loss": 2.8492, + "step": 17682 + }, + { + "epoch": 0.8232884046837535, + "grad_norm": 0.38275266246899603, + "learning_rate": 9.101474738613439e-05, + "loss": 2.9978, + "step": 17683 + }, + { + "epoch": 0.8233349628698466, + "grad_norm": 0.4050044669718651, + "learning_rate": 9.101319808918655e-05, + "loss": 2.9092, + "step": 17684 + }, + { + "epoch": 0.8233815210559396, + "grad_norm": 0.3929907724333609, + "learning_rate": 9.101164867186903e-05, + "loss": 3.0553, + "step": 17685 + }, + { + "epoch": 0.8234280792420328, + "grad_norm": 0.36184816643342277, + "learning_rate": 9.101009913418634e-05, + "loss": 2.98, + "step": 17686 + }, + { + "epoch": 0.8234746374281258, + "grad_norm": 0.3662076000311796, + "learning_rate": 9.100854947614307e-05, + "loss": 2.882, + "step": 17687 + }, + { + "epoch": 0.8235211956142189, + "grad_norm": 0.3823655588313953, + "learning_rate": 9.100699969774374e-05, + "loss": 3.0129, + "step": 17688 + }, + { + "epoch": 0.8235677538003119, + "grad_norm": 0.351453868096226, + "learning_rate": 9.10054497989929e-05, + "loss": 2.8764, + "step": 17689 + }, + { + "epoch": 0.823614311986405, + "grad_norm": 0.3921769417379337, + "learning_rate": 9.100389977989512e-05, + "loss": 2.9758, + "step": 17690 + }, + { + "epoch": 0.8236608701724981, + "grad_norm": 0.3437761460061545, + "learning_rate": 9.100234964045492e-05, + "loss": 2.8836, + "step": 17691 + }, + { + "epoch": 0.8237074283585911, + "grad_norm": 0.3516746037708343, + "learning_rate": 9.100079938067688e-05, + "loss": 2.9804, + "step": 17692 + }, + { + "epoch": 0.8237539865446842, + "grad_norm": 0.343300993191297, + "learning_rate": 9.099924900056551e-05, + "loss": 2.9775, + "step": 17693 + }, + { + 
"epoch": 0.8238005447307772, + "grad_norm": 0.3436997431796975, + "learning_rate": 9.099769850012539e-05, + "loss": 2.9695, + "step": 17694 + }, + { + "epoch": 0.8238471029168704, + "grad_norm": 0.3884212186686749, + "learning_rate": 9.099614787936105e-05, + "loss": 2.8705, + "step": 17695 + }, + { + "epoch": 0.8238936611029635, + "grad_norm": 0.31682702229574505, + "learning_rate": 9.099459713827709e-05, + "loss": 2.8139, + "step": 17696 + }, + { + "epoch": 0.8239402192890565, + "grad_norm": 0.358455536644202, + "learning_rate": 9.099304627687799e-05, + "loss": 3.0175, + "step": 17697 + }, + { + "epoch": 0.8239867774751496, + "grad_norm": 0.336957802803647, + "learning_rate": 9.099149529516837e-05, + "loss": 2.9337, + "step": 17698 + }, + { + "epoch": 0.8240333356612426, + "grad_norm": 0.385816584779915, + "learning_rate": 9.098994419315272e-05, + "loss": 3.0015, + "step": 17699 + }, + { + "epoch": 0.8240798938473357, + "grad_norm": 0.36088814756040066, + "learning_rate": 9.098839297083564e-05, + "loss": 2.9674, + "step": 17700 + }, + { + "epoch": 0.8241264520334288, + "grad_norm": 0.34701677378242507, + "learning_rate": 9.098684162822166e-05, + "loss": 2.9186, + "step": 17701 + }, + { + "epoch": 0.8241730102195218, + "grad_norm": 0.3610055217441791, + "learning_rate": 9.098529016531534e-05, + "loss": 2.8765, + "step": 17702 + }, + { + "epoch": 0.8242195684056149, + "grad_norm": 0.3500626271417989, + "learning_rate": 9.098373858212124e-05, + "loss": 2.9918, + "step": 17703 + }, + { + "epoch": 0.824266126591708, + "grad_norm": 0.3628794726254118, + "learning_rate": 9.098218687864388e-05, + "loss": 2.9561, + "step": 17704 + }, + { + "epoch": 0.8243126847778011, + "grad_norm": 0.34559797542414, + "learning_rate": 9.098063505488785e-05, + "loss": 2.9105, + "step": 17705 + }, + { + "epoch": 0.8243592429638942, + "grad_norm": 0.3979814470737532, + "learning_rate": 9.09790831108577e-05, + "loss": 2.8923, + "step": 17706 + }, + { + "epoch": 0.8244058011499872, + "grad_norm": 0.3589539092748312, + "learning_rate": 9.097753104655797e-05, + "loss": 2.9439, + "step": 17707 + }, + { + "epoch": 0.8244523593360803, + "grad_norm": 0.40719192021383394, + "learning_rate": 9.097597886199324e-05, + "loss": 2.8738, + "step": 17708 + }, + { + "epoch": 0.8244989175221733, + "grad_norm": 0.33293737868459294, + "learning_rate": 9.097442655716803e-05, + "loss": 2.9173, + "step": 17709 + }, + { + "epoch": 0.8245454757082664, + "grad_norm": 0.42175291183277835, + "learning_rate": 9.097287413208693e-05, + "loss": 2.92, + "step": 17710 + }, + { + "epoch": 0.8245920338943594, + "grad_norm": 0.335667608033243, + "learning_rate": 9.097132158675448e-05, + "loss": 2.8807, + "step": 17711 + }, + { + "epoch": 0.8246385920804525, + "grad_norm": 0.40190187776858377, + "learning_rate": 9.096976892117522e-05, + "loss": 2.8415, + "step": 17712 + }, + { + "epoch": 0.8246851502665457, + "grad_norm": 0.37985424298336756, + "learning_rate": 9.096821613535373e-05, + "loss": 2.9305, + "step": 17713 + }, + { + "epoch": 0.8247317084526387, + "grad_norm": 0.34744970850533685, + "learning_rate": 9.096666322929457e-05, + "loss": 2.8028, + "step": 17714 + }, + { + "epoch": 0.8247782666387318, + "grad_norm": 0.3601235140174462, + "learning_rate": 9.096511020300229e-05, + "loss": 2.9587, + "step": 17715 + }, + { + "epoch": 0.8248248248248248, + "grad_norm": 0.3654617785220051, + "learning_rate": 9.096355705648144e-05, + "loss": 2.9385, + "step": 17716 + }, + { + "epoch": 0.8248713830109179, + "grad_norm": 0.36878858620862887, + 
"learning_rate": 9.096200378973658e-05, + "loss": 2.9741, + "step": 17717 + }, + { + "epoch": 0.824917941197011, + "grad_norm": 0.3351055428131786, + "learning_rate": 9.09604504027723e-05, + "loss": 3.0033, + "step": 17718 + }, + { + "epoch": 0.824964499383104, + "grad_norm": 0.3714070829828265, + "learning_rate": 9.095889689559312e-05, + "loss": 2.9206, + "step": 17719 + }, + { + "epoch": 0.8250110575691971, + "grad_norm": 0.3387193957406303, + "learning_rate": 9.095734326820359e-05, + "loss": 2.9591, + "step": 17720 + }, + { + "epoch": 0.8250576157552901, + "grad_norm": 0.3264941865751752, + "learning_rate": 9.095578952060833e-05, + "loss": 3.0436, + "step": 17721 + }, + { + "epoch": 0.8251041739413832, + "grad_norm": 0.35939642658612486, + "learning_rate": 9.095423565281182e-05, + "loss": 2.9551, + "step": 17722 + }, + { + "epoch": 0.8251507321274764, + "grad_norm": 0.34525225320772224, + "learning_rate": 9.09526816648187e-05, + "loss": 3.0036, + "step": 17723 + }, + { + "epoch": 0.8251972903135694, + "grad_norm": 0.33135823577715284, + "learning_rate": 9.095112755663349e-05, + "loss": 2.9717, + "step": 17724 + }, + { + "epoch": 0.8252438484996625, + "grad_norm": 0.36198289039777126, + "learning_rate": 9.094957332826073e-05, + "loss": 2.8875, + "step": 17725 + }, + { + "epoch": 0.8252904066857555, + "grad_norm": 0.35373770787564424, + "learning_rate": 9.094801897970503e-05, + "loss": 2.8514, + "step": 17726 + }, + { + "epoch": 0.8253369648718486, + "grad_norm": 0.33583765337565613, + "learning_rate": 9.094646451097091e-05, + "loss": 2.9209, + "step": 17727 + }, + { + "epoch": 0.8253835230579417, + "grad_norm": 0.34617124389081894, + "learning_rate": 9.094490992206297e-05, + "loss": 2.9351, + "step": 17728 + }, + { + "epoch": 0.8254300812440347, + "grad_norm": 0.3492626918213219, + "learning_rate": 9.094335521298573e-05, + "loss": 2.9381, + "step": 17729 + }, + { + "epoch": 0.8254766394301278, + "grad_norm": 0.3454945724835941, + "learning_rate": 9.09418003837438e-05, + "loss": 2.9793, + "step": 17730 + }, + { + "epoch": 0.8255231976162208, + "grad_norm": 0.3411775845210622, + "learning_rate": 9.094024543434172e-05, + "loss": 2.8958, + "step": 17731 + }, + { + "epoch": 0.825569755802314, + "grad_norm": 0.33417928252867257, + "learning_rate": 9.093869036478405e-05, + "loss": 2.9774, + "step": 17732 + }, + { + "epoch": 0.825616313988407, + "grad_norm": 0.3569995022343192, + "learning_rate": 9.093713517507534e-05, + "loss": 2.9228, + "step": 17733 + }, + { + "epoch": 0.8256628721745001, + "grad_norm": 0.3215196496616186, + "learning_rate": 9.093557986522018e-05, + "loss": 2.8462, + "step": 17734 + }, + { + "epoch": 0.8257094303605932, + "grad_norm": 0.3833454997417122, + "learning_rate": 9.093402443522313e-05, + "loss": 2.9461, + "step": 17735 + }, + { + "epoch": 0.8257559885466862, + "grad_norm": 0.34104915644893496, + "learning_rate": 9.093246888508874e-05, + "loss": 2.8815, + "step": 17736 + }, + { + "epoch": 0.8258025467327793, + "grad_norm": 0.35386804398464616, + "learning_rate": 9.093091321482159e-05, + "loss": 2.9321, + "step": 17737 + }, + { + "epoch": 0.8258491049188723, + "grad_norm": 0.4112756361922857, + "learning_rate": 9.092935742442625e-05, + "loss": 2.9893, + "step": 17738 + }, + { + "epoch": 0.8258956631049654, + "grad_norm": 0.3112643883161365, + "learning_rate": 9.092780151390728e-05, + "loss": 2.9438, + "step": 17739 + }, + { + "epoch": 0.8259422212910585, + "grad_norm": 0.34833837166761356, + "learning_rate": 9.092624548326924e-05, + "loss": 2.8879, + "step": 17740 + 
}, + { + "epoch": 0.8259887794771515, + "grad_norm": 0.31073844757667585, + "learning_rate": 9.09246893325167e-05, + "loss": 2.984, + "step": 17741 + }, + { + "epoch": 0.8260353376632447, + "grad_norm": 0.34222084337459, + "learning_rate": 9.092313306165423e-05, + "loss": 2.9803, + "step": 17742 + }, + { + "epoch": 0.8260818958493377, + "grad_norm": 0.31636141498428916, + "learning_rate": 9.092157667068639e-05, + "loss": 2.9556, + "step": 17743 + }, + { + "epoch": 0.8261284540354308, + "grad_norm": 0.3353097470222855, + "learning_rate": 9.092002015961775e-05, + "loss": 2.9417, + "step": 17744 + }, + { + "epoch": 0.8261750122215239, + "grad_norm": 0.33467306501115823, + "learning_rate": 9.09184635284529e-05, + "loss": 2.9253, + "step": 17745 + }, + { + "epoch": 0.8262215704076169, + "grad_norm": 0.3204259135848021, + "learning_rate": 9.091690677719637e-05, + "loss": 3.0229, + "step": 17746 + }, + { + "epoch": 0.82626812859371, + "grad_norm": 0.34430571778855656, + "learning_rate": 9.091534990585278e-05, + "loss": 2.9683, + "step": 17747 + }, + { + "epoch": 0.826314686779803, + "grad_norm": 0.3115250948046728, + "learning_rate": 9.091379291442664e-05, + "loss": 2.8888, + "step": 17748 + }, + { + "epoch": 0.8263612449658961, + "grad_norm": 0.3706247181639285, + "learning_rate": 9.091223580292256e-05, + "loss": 2.969, + "step": 17749 + }, + { + "epoch": 0.8264078031519893, + "grad_norm": 0.36565496955475274, + "learning_rate": 9.091067857134509e-05, + "loss": 2.9614, + "step": 17750 + }, + { + "epoch": 0.8264543613380823, + "grad_norm": 0.32478151714457004, + "learning_rate": 9.090912121969882e-05, + "loss": 2.9543, + "step": 17751 + }, + { + "epoch": 0.8265009195241754, + "grad_norm": 0.362992642163613, + "learning_rate": 9.09075637479883e-05, + "loss": 3.0582, + "step": 17752 + }, + { + "epoch": 0.8265474777102684, + "grad_norm": 0.33812167962483186, + "learning_rate": 9.090600615621811e-05, + "loss": 2.9627, + "step": 17753 + }, + { + "epoch": 0.8265940358963615, + "grad_norm": 0.3341379639776257, + "learning_rate": 9.090444844439285e-05, + "loss": 2.961, + "step": 17754 + }, + { + "epoch": 0.8266405940824545, + "grad_norm": 0.34341302227693676, + "learning_rate": 9.090289061251704e-05, + "loss": 2.8977, + "step": 17755 + }, + { + "epoch": 0.8266871522685476, + "grad_norm": 0.3466441290865802, + "learning_rate": 9.090133266059527e-05, + "loss": 3.0438, + "step": 17756 + }, + { + "epoch": 0.8267337104546407, + "grad_norm": 0.36406520705303963, + "learning_rate": 9.089977458863213e-05, + "loss": 2.9669, + "step": 17757 + }, + { + "epoch": 0.8267802686407337, + "grad_norm": 0.3276625920956478, + "learning_rate": 9.089821639663216e-05, + "loss": 2.95, + "step": 17758 + }, + { + "epoch": 0.8268268268268268, + "grad_norm": 0.3602813873428493, + "learning_rate": 9.089665808459998e-05, + "loss": 3.005, + "step": 17759 + }, + { + "epoch": 0.8268733850129198, + "grad_norm": 0.3435570113692897, + "learning_rate": 9.089509965254013e-05, + "loss": 2.8624, + "step": 17760 + }, + { + "epoch": 0.826919943199013, + "grad_norm": 0.3774073726352274, + "learning_rate": 9.089354110045718e-05, + "loss": 2.8869, + "step": 17761 + }, + { + "epoch": 0.8269665013851061, + "grad_norm": 0.2945771490950603, + "learning_rate": 9.089198242835573e-05, + "loss": 2.794, + "step": 17762 + }, + { + "epoch": 0.8270130595711991, + "grad_norm": 0.39787312782944034, + "learning_rate": 9.089042363624034e-05, + "loss": 2.9231, + "step": 17763 + }, + { + "epoch": 0.8270596177572922, + "grad_norm": 0.3080858707369825, + 
"learning_rate": 9.088886472411557e-05, + "loss": 2.853, + "step": 17764 + }, + { + "epoch": 0.8271061759433852, + "grad_norm": 0.3892132636334878, + "learning_rate": 9.088730569198602e-05, + "loss": 2.9373, + "step": 17765 + }, + { + "epoch": 0.8271527341294783, + "grad_norm": 0.3066404384331689, + "learning_rate": 9.088574653985625e-05, + "loss": 2.9081, + "step": 17766 + }, + { + "epoch": 0.8271992923155714, + "grad_norm": 0.38661926943458225, + "learning_rate": 9.088418726773084e-05, + "loss": 2.9279, + "step": 17767 + }, + { + "epoch": 0.8272458505016644, + "grad_norm": 0.3487032737363298, + "learning_rate": 9.088262787561439e-05, + "loss": 2.9542, + "step": 17768 + }, + { + "epoch": 0.8272924086877576, + "grad_norm": 0.3528402185694796, + "learning_rate": 9.088106836351145e-05, + "loss": 2.8711, + "step": 17769 + }, + { + "epoch": 0.8273389668738506, + "grad_norm": 0.3801785768481393, + "learning_rate": 9.087950873142658e-05, + "loss": 2.9629, + "step": 17770 + }, + { + "epoch": 0.8273855250599437, + "grad_norm": 0.3777522638094413, + "learning_rate": 9.087794897936439e-05, + "loss": 2.9346, + "step": 17771 + }, + { + "epoch": 0.8274320832460368, + "grad_norm": 0.35951328200641464, + "learning_rate": 9.087638910732945e-05, + "loss": 3.0575, + "step": 17772 + }, + { + "epoch": 0.8274786414321298, + "grad_norm": 0.38220934462036127, + "learning_rate": 9.087482911532633e-05, + "loss": 2.9286, + "step": 17773 + }, + { + "epoch": 0.8275251996182229, + "grad_norm": 0.3644314305316419, + "learning_rate": 9.087326900335962e-05, + "loss": 3.0051, + "step": 17774 + }, + { + "epoch": 0.8275717578043159, + "grad_norm": 0.3399753610379195, + "learning_rate": 9.08717087714339e-05, + "loss": 2.9075, + "step": 17775 + }, + { + "epoch": 0.827618315990409, + "grad_norm": 0.37601037420205924, + "learning_rate": 9.087014841955371e-05, + "loss": 2.8943, + "step": 17776 + }, + { + "epoch": 0.827664874176502, + "grad_norm": 0.3346765329560205, + "learning_rate": 9.086858794772368e-05, + "loss": 2.9701, + "step": 17777 + }, + { + "epoch": 0.8277114323625951, + "grad_norm": 0.3645970499537042, + "learning_rate": 9.086702735594838e-05, + "loss": 2.9403, + "step": 17778 + }, + { + "epoch": 0.8277579905486883, + "grad_norm": 0.32441203360647075, + "learning_rate": 9.086546664423236e-05, + "loss": 3.049, + "step": 17779 + }, + { + "epoch": 0.8278045487347813, + "grad_norm": 0.3271126487376236, + "learning_rate": 9.086390581258024e-05, + "loss": 2.9189, + "step": 17780 + }, + { + "epoch": 0.8278511069208744, + "grad_norm": 0.36213653422643344, + "learning_rate": 9.086234486099658e-05, + "loss": 2.9109, + "step": 17781 + }, + { + "epoch": 0.8278976651069674, + "grad_norm": 0.30008579078909864, + "learning_rate": 9.086078378948596e-05, + "loss": 2.9589, + "step": 17782 + }, + { + "epoch": 0.8279442232930605, + "grad_norm": 0.33627911272813, + "learning_rate": 9.085922259805296e-05, + "loss": 2.9765, + "step": 17783 + }, + { + "epoch": 0.8279907814791536, + "grad_norm": 0.35745009430915575, + "learning_rate": 9.085766128670217e-05, + "loss": 2.9185, + "step": 17784 + }, + { + "epoch": 0.8280373396652466, + "grad_norm": 0.3646467706896591, + "learning_rate": 9.085609985543817e-05, + "loss": 2.85, + "step": 17785 + }, + { + "epoch": 0.8280838978513397, + "grad_norm": 0.34259869002924614, + "learning_rate": 9.085453830426555e-05, + "loss": 2.9117, + "step": 17786 + }, + { + "epoch": 0.8281304560374327, + "grad_norm": 0.34446547722227916, + "learning_rate": 9.085297663318887e-05, + "loss": 2.9637, + "step": 17787 + }, + 
{ + "epoch": 0.8281770142235259, + "grad_norm": 0.3670131916395916, + "learning_rate": 9.085141484221273e-05, + "loss": 2.9214, + "step": 17788 + }, + { + "epoch": 0.828223572409619, + "grad_norm": 0.35845941923828695, + "learning_rate": 9.084985293134172e-05, + "loss": 2.9311, + "step": 17789 + }, + { + "epoch": 0.828270130595712, + "grad_norm": 0.3454487433398754, + "learning_rate": 9.084829090058042e-05, + "loss": 2.9818, + "step": 17790 + }, + { + "epoch": 0.8283166887818051, + "grad_norm": 0.3547185084338155, + "learning_rate": 9.084672874993341e-05, + "loss": 3.0015, + "step": 17791 + }, + { + "epoch": 0.8283632469678981, + "grad_norm": 0.3070796662117036, + "learning_rate": 9.084516647940527e-05, + "loss": 2.9362, + "step": 17792 + }, + { + "epoch": 0.8284098051539912, + "grad_norm": 0.34882700549921897, + "learning_rate": 9.084360408900058e-05, + "loss": 2.9505, + "step": 17793 + }, + { + "epoch": 0.8284563633400843, + "grad_norm": 0.3290182575191537, + "learning_rate": 9.084204157872395e-05, + "loss": 2.9247, + "step": 17794 + }, + { + "epoch": 0.8285029215261773, + "grad_norm": 0.34814831130085544, + "learning_rate": 9.084047894857994e-05, + "loss": 2.9524, + "step": 17795 + }, + { + "epoch": 0.8285494797122704, + "grad_norm": 0.3578343212289737, + "learning_rate": 9.083891619857316e-05, + "loss": 3.0052, + "step": 17796 + }, + { + "epoch": 0.8285960378983634, + "grad_norm": 0.3503890541460049, + "learning_rate": 9.083735332870818e-05, + "loss": 2.9012, + "step": 17797 + }, + { + "epoch": 0.8286425960844566, + "grad_norm": 0.3649346607224524, + "learning_rate": 9.083579033898957e-05, + "loss": 2.986, + "step": 17798 + }, + { + "epoch": 0.8286891542705496, + "grad_norm": 0.3645213600586169, + "learning_rate": 9.083422722942197e-05, + "loss": 2.9186, + "step": 17799 + }, + { + "epoch": 0.8287357124566427, + "grad_norm": 0.34510115637444105, + "learning_rate": 9.083266400000992e-05, + "loss": 2.9487, + "step": 17800 + }, + { + "epoch": 0.8287822706427358, + "grad_norm": 0.35774522921613433, + "learning_rate": 9.083110065075804e-05, + "loss": 2.8704, + "step": 17801 + }, + { + "epoch": 0.8288288288288288, + "grad_norm": 0.3637670548838561, + "learning_rate": 9.082953718167088e-05, + "loss": 2.9009, + "step": 17802 + }, + { + "epoch": 0.8288753870149219, + "grad_norm": 0.3557741272746586, + "learning_rate": 9.082797359275305e-05, + "loss": 2.9543, + "step": 17803 + }, + { + "epoch": 0.8289219452010149, + "grad_norm": 0.37074404858519155, + "learning_rate": 9.082640988400917e-05, + "loss": 2.9584, + "step": 17804 + }, + { + "epoch": 0.828968503387108, + "grad_norm": 0.3531612304832107, + "learning_rate": 9.082484605544377e-05, + "loss": 2.9377, + "step": 17805 + }, + { + "epoch": 0.8290150615732012, + "grad_norm": 0.343337222558878, + "learning_rate": 9.082328210706147e-05, + "loss": 2.8717, + "step": 17806 + }, + { + "epoch": 0.8290616197592942, + "grad_norm": 0.3446911492513116, + "learning_rate": 9.082171803886688e-05, + "loss": 2.912, + "step": 17807 + }, + { + "epoch": 0.8291081779453873, + "grad_norm": 0.39018990547018706, + "learning_rate": 9.082015385086456e-05, + "loss": 2.9271, + "step": 17808 + }, + { + "epoch": 0.8291547361314803, + "grad_norm": 0.3415498022494512, + "learning_rate": 9.08185895430591e-05, + "loss": 2.924, + "step": 17809 + }, + { + "epoch": 0.8292012943175734, + "grad_norm": 0.34400539358287, + "learning_rate": 9.081702511545511e-05, + "loss": 2.8945, + "step": 17810 + }, + { + "epoch": 0.8292478525036665, + "grad_norm": 0.3452448794137601, + 
"learning_rate": 9.081546056805717e-05, + "loss": 2.8872, + "step": 17811 + }, + { + "epoch": 0.8292944106897595, + "grad_norm": 0.341599412895248, + "learning_rate": 9.081389590086987e-05, + "loss": 3.0436, + "step": 17812 + }, + { + "epoch": 0.8293409688758526, + "grad_norm": 0.3918189183340889, + "learning_rate": 9.081233111389783e-05, + "loss": 2.9746, + "step": 17813 + }, + { + "epoch": 0.8293875270619456, + "grad_norm": 0.36018705432630976, + "learning_rate": 9.081076620714557e-05, + "loss": 2.9742, + "step": 17814 + }, + { + "epoch": 0.8294340852480387, + "grad_norm": 0.38522598786267837, + "learning_rate": 9.080920118061777e-05, + "loss": 2.8853, + "step": 17815 + }, + { + "epoch": 0.8294806434341319, + "grad_norm": 0.33921479688668804, + "learning_rate": 9.080763603431898e-05, + "loss": 2.9944, + "step": 17816 + }, + { + "epoch": 0.8295272016202249, + "grad_norm": 0.3558589080402032, + "learning_rate": 9.080607076825378e-05, + "loss": 2.9613, + "step": 17817 + }, + { + "epoch": 0.829573759806318, + "grad_norm": 0.351836327473566, + "learning_rate": 9.080450538242679e-05, + "loss": 2.9333, + "step": 17818 + }, + { + "epoch": 0.829620317992411, + "grad_norm": 0.3182932261000349, + "learning_rate": 9.080293987684262e-05, + "loss": 2.8996, + "step": 17819 + }, + { + "epoch": 0.8296668761785041, + "grad_norm": 0.36228973397536385, + "learning_rate": 9.080137425150582e-05, + "loss": 2.9527, + "step": 17820 + }, + { + "epoch": 0.8297134343645971, + "grad_norm": 0.3370511318004826, + "learning_rate": 9.079980850642102e-05, + "loss": 2.879, + "step": 17821 + }, + { + "epoch": 0.8297599925506902, + "grad_norm": 0.360364608957062, + "learning_rate": 9.079824264159278e-05, + "loss": 2.9161, + "step": 17822 + }, + { + "epoch": 0.8298065507367833, + "grad_norm": 0.33593738774193166, + "learning_rate": 9.079667665702574e-05, + "loss": 3.0414, + "step": 17823 + }, + { + "epoch": 0.8298531089228763, + "grad_norm": 0.3539722940840384, + "learning_rate": 9.079511055272445e-05, + "loss": 2.953, + "step": 17824 + }, + { + "epoch": 0.8298996671089695, + "grad_norm": 0.3795355174948672, + "learning_rate": 9.079354432869355e-05, + "loss": 2.9712, + "step": 17825 + }, + { + "epoch": 0.8299462252950625, + "grad_norm": 0.3376431063486536, + "learning_rate": 9.079197798493761e-05, + "loss": 2.9145, + "step": 17826 + }, + { + "epoch": 0.8299927834811556, + "grad_norm": 0.35123836323491453, + "learning_rate": 9.079041152146123e-05, + "loss": 2.9781, + "step": 17827 + }, + { + "epoch": 0.8300393416672487, + "grad_norm": 0.33603963709341844, + "learning_rate": 9.078884493826901e-05, + "loss": 2.8986, + "step": 17828 + }, + { + "epoch": 0.8300858998533417, + "grad_norm": 0.3818378038092777, + "learning_rate": 9.078727823536554e-05, + "loss": 3.0023, + "step": 17829 + }, + { + "epoch": 0.8301324580394348, + "grad_norm": 0.3399935097042549, + "learning_rate": 9.078571141275545e-05, + "loss": 2.9193, + "step": 17830 + }, + { + "epoch": 0.8301790162255278, + "grad_norm": 0.35455037365635234, + "learning_rate": 9.07841444704433e-05, + "loss": 2.9053, + "step": 17831 + }, + { + "epoch": 0.8302255744116209, + "grad_norm": 0.3834140090189135, + "learning_rate": 9.07825774084337e-05, + "loss": 2.9403, + "step": 17832 + }, + { + "epoch": 0.830272132597714, + "grad_norm": 0.3415887764832618, + "learning_rate": 9.078101022673127e-05, + "loss": 2.8876, + "step": 17833 + }, + { + "epoch": 0.830318690783807, + "grad_norm": 0.38162914276863535, + "learning_rate": 9.077944292534058e-05, + "loss": 2.9748, + "step": 17834 + }, + { 
+ "epoch": 0.8303652489699002, + "grad_norm": 0.3273232707232882, + "learning_rate": 9.077787550426624e-05, + "loss": 2.9183, + "step": 17835 + }, + { + "epoch": 0.8304118071559932, + "grad_norm": 0.4091674559775957, + "learning_rate": 9.077630796351286e-05, + "loss": 3.0163, + "step": 17836 + }, + { + "epoch": 0.8304583653420863, + "grad_norm": 0.4052373670230475, + "learning_rate": 9.077474030308501e-05, + "loss": 2.953, + "step": 17837 + }, + { + "epoch": 0.8305049235281794, + "grad_norm": 0.3946031584148746, + "learning_rate": 9.077317252298734e-05, + "loss": 2.8316, + "step": 17838 + }, + { + "epoch": 0.8305514817142724, + "grad_norm": 0.38389308594760313, + "learning_rate": 9.077160462322443e-05, + "loss": 2.9167, + "step": 17839 + }, + { + "epoch": 0.8305980399003655, + "grad_norm": 0.34451612478180366, + "learning_rate": 9.077003660380087e-05, + "loss": 2.9625, + "step": 17840 + }, + { + "epoch": 0.8306445980864585, + "grad_norm": 0.3560454467788136, + "learning_rate": 9.076846846472126e-05, + "loss": 3.0206, + "step": 17841 + }, + { + "epoch": 0.8306911562725516, + "grad_norm": 0.350226856855, + "learning_rate": 9.076690020599021e-05, + "loss": 2.9275, + "step": 17842 + }, + { + "epoch": 0.8307377144586446, + "grad_norm": 0.3572058711737655, + "learning_rate": 9.076533182761233e-05, + "loss": 2.8699, + "step": 17843 + }, + { + "epoch": 0.8307842726447378, + "grad_norm": 0.3674235477482083, + "learning_rate": 9.076376332959223e-05, + "loss": 2.902, + "step": 17844 + }, + { + "epoch": 0.8308308308308309, + "grad_norm": 0.3483841278335764, + "learning_rate": 9.076219471193449e-05, + "loss": 3.0298, + "step": 17845 + }, + { + "epoch": 0.8308773890169239, + "grad_norm": 0.3217603928480938, + "learning_rate": 9.076062597464372e-05, + "loss": 2.8905, + "step": 17846 + }, + { + "epoch": 0.830923947203017, + "grad_norm": 0.37471252337876393, + "learning_rate": 9.075905711772453e-05, + "loss": 3.0655, + "step": 17847 + }, + { + "epoch": 0.83097050538911, + "grad_norm": 0.3319309632263177, + "learning_rate": 9.075748814118153e-05, + "loss": 2.8933, + "step": 17848 + }, + { + "epoch": 0.8310170635752031, + "grad_norm": 0.33557767667782323, + "learning_rate": 9.07559190450193e-05, + "loss": 2.8673, + "step": 17849 + }, + { + "epoch": 0.8310636217612962, + "grad_norm": 0.39512109904098114, + "learning_rate": 9.075434982924248e-05, + "loss": 2.9036, + "step": 17850 + }, + { + "epoch": 0.8311101799473892, + "grad_norm": 0.32688705571320603, + "learning_rate": 9.075278049385566e-05, + "loss": 2.9082, + "step": 17851 + }, + { + "epoch": 0.8311567381334823, + "grad_norm": 0.3116463936033277, + "learning_rate": 9.075121103886344e-05, + "loss": 2.8701, + "step": 17852 + }, + { + "epoch": 0.8312032963195753, + "grad_norm": 0.3406381964764722, + "learning_rate": 9.074964146427042e-05, + "loss": 2.8583, + "step": 17853 + }, + { + "epoch": 0.8312498545056685, + "grad_norm": 0.33753387029817483, + "learning_rate": 9.074807177008122e-05, + "loss": 2.8976, + "step": 17854 + }, + { + "epoch": 0.8312964126917616, + "grad_norm": 0.33578538520059664, + "learning_rate": 9.074650195630046e-05, + "loss": 3.0611, + "step": 17855 + }, + { + "epoch": 0.8313429708778546, + "grad_norm": 0.36548428966121027, + "learning_rate": 9.07449320229327e-05, + "loss": 2.9526, + "step": 17856 + }, + { + "epoch": 0.8313895290639477, + "grad_norm": 0.33772523093135054, + "learning_rate": 9.074336196998261e-05, + "loss": 2.9214, + "step": 17857 + }, + { + "epoch": 0.8314360872500407, + "grad_norm": 0.37240900303682234, + 
"learning_rate": 9.074179179745474e-05, + "loss": 3.0172, + "step": 17858 + }, + { + "epoch": 0.8314826454361338, + "grad_norm": 0.345555182019151, + "learning_rate": 9.074022150535375e-05, + "loss": 2.8718, + "step": 17859 + }, + { + "epoch": 0.8315292036222269, + "grad_norm": 0.37751556854682167, + "learning_rate": 9.073865109368421e-05, + "loss": 2.9166, + "step": 17860 + }, + { + "epoch": 0.8315757618083199, + "grad_norm": 0.3116914802612439, + "learning_rate": 9.073708056245074e-05, + "loss": 2.8946, + "step": 17861 + }, + { + "epoch": 0.831622319994413, + "grad_norm": 0.3286341840009505, + "learning_rate": 9.073550991165796e-05, + "loss": 2.9729, + "step": 17862 + }, + { + "epoch": 0.831668878180506, + "grad_norm": 0.3542471524953582, + "learning_rate": 9.073393914131048e-05, + "loss": 3.0485, + "step": 17863 + }, + { + "epoch": 0.8317154363665992, + "grad_norm": 0.32897610197669175, + "learning_rate": 9.073236825141288e-05, + "loss": 2.9756, + "step": 17864 + }, + { + "epoch": 0.8317619945526922, + "grad_norm": 0.34492287078296885, + "learning_rate": 9.07307972419698e-05, + "loss": 2.9523, + "step": 17865 + }, + { + "epoch": 0.8318085527387853, + "grad_norm": 0.325154449364048, + "learning_rate": 9.072922611298584e-05, + "loss": 2.9009, + "step": 17866 + }, + { + "epoch": 0.8318551109248784, + "grad_norm": 0.31916656554492207, + "learning_rate": 9.072765486446562e-05, + "loss": 2.9064, + "step": 17867 + }, + { + "epoch": 0.8319016691109714, + "grad_norm": 0.34409125754495956, + "learning_rate": 9.072608349641373e-05, + "loss": 2.9171, + "step": 17868 + }, + { + "epoch": 0.8319482272970645, + "grad_norm": 0.36428436126379327, + "learning_rate": 9.072451200883483e-05, + "loss": 2.9212, + "step": 17869 + }, + { + "epoch": 0.8319947854831575, + "grad_norm": 0.327304975308665, + "learning_rate": 9.072294040173348e-05, + "loss": 2.9589, + "step": 17870 + }, + { + "epoch": 0.8320413436692506, + "grad_norm": 0.3166596979811181, + "learning_rate": 9.072136867511431e-05, + "loss": 2.9921, + "step": 17871 + }, + { + "epoch": 0.8320879018553438, + "grad_norm": 0.3116278711096682, + "learning_rate": 9.071979682898193e-05, + "loss": 2.8901, + "step": 17872 + }, + { + "epoch": 0.8321344600414368, + "grad_norm": 0.3072101757279584, + "learning_rate": 9.071822486334097e-05, + "loss": 2.8775, + "step": 17873 + }, + { + "epoch": 0.8321810182275299, + "grad_norm": 0.3339068520581096, + "learning_rate": 9.071665277819603e-05, + "loss": 2.9716, + "step": 17874 + }, + { + "epoch": 0.8322275764136229, + "grad_norm": 0.32212122654097947, + "learning_rate": 9.071508057355171e-05, + "loss": 2.8966, + "step": 17875 + }, + { + "epoch": 0.832274134599716, + "grad_norm": 0.34388406195542737, + "learning_rate": 9.071350824941265e-05, + "loss": 2.9885, + "step": 17876 + }, + { + "epoch": 0.8323206927858091, + "grad_norm": 0.32192254896144135, + "learning_rate": 9.071193580578346e-05, + "loss": 2.8677, + "step": 17877 + }, + { + "epoch": 0.8323672509719021, + "grad_norm": 0.3331704962513359, + "learning_rate": 9.071036324266875e-05, + "loss": 2.9319, + "step": 17878 + }, + { + "epoch": 0.8324138091579952, + "grad_norm": 0.35822464215569444, + "learning_rate": 9.070879056007312e-05, + "loss": 3.0112, + "step": 17879 + }, + { + "epoch": 0.8324603673440882, + "grad_norm": 0.3134096011868369, + "learning_rate": 9.070721775800122e-05, + "loss": 3.0076, + "step": 17880 + }, + { + "epoch": 0.8325069255301814, + "grad_norm": 0.3615180779108778, + "learning_rate": 9.070564483645762e-05, + "loss": 2.957, + "step": 17881 + }, 
+ { + "epoch": 0.8325534837162745, + "grad_norm": 0.3327971143552079, + "learning_rate": 9.070407179544698e-05, + "loss": 2.9272, + "step": 17882 + }, + { + "epoch": 0.8326000419023675, + "grad_norm": 0.3299079136996531, + "learning_rate": 9.070249863497391e-05, + "loss": 3.0988, + "step": 17883 + }, + { + "epoch": 0.8326466000884606, + "grad_norm": 0.34356588611133904, + "learning_rate": 9.0700925355043e-05, + "loss": 2.8161, + "step": 17884 + }, + { + "epoch": 0.8326931582745536, + "grad_norm": 0.37044125398675126, + "learning_rate": 9.069935195565888e-05, + "loss": 2.9555, + "step": 17885 + }, + { + "epoch": 0.8327397164606467, + "grad_norm": 0.36326954914579646, + "learning_rate": 9.069777843682618e-05, + "loss": 2.8386, + "step": 17886 + }, + { + "epoch": 0.8327862746467397, + "grad_norm": 0.3572668635216954, + "learning_rate": 9.06962047985495e-05, + "loss": 2.9926, + "step": 17887 + }, + { + "epoch": 0.8328328328328328, + "grad_norm": 0.4053822743259828, + "learning_rate": 9.069463104083349e-05, + "loss": 2.9173, + "step": 17888 + }, + { + "epoch": 0.8328793910189259, + "grad_norm": 0.34869711499276485, + "learning_rate": 9.069305716368273e-05, + "loss": 2.9226, + "step": 17889 + }, + { + "epoch": 0.832925949205019, + "grad_norm": 0.3432515499619321, + "learning_rate": 9.069148316710186e-05, + "loss": 2.829, + "step": 17890 + }, + { + "epoch": 0.8329725073911121, + "grad_norm": 0.33360724008466724, + "learning_rate": 9.068990905109549e-05, + "loss": 2.7531, + "step": 17891 + }, + { + "epoch": 0.8330190655772051, + "grad_norm": 0.32286052937669213, + "learning_rate": 9.068833481566826e-05, + "loss": 3.0288, + "step": 17892 + }, + { + "epoch": 0.8330656237632982, + "grad_norm": 0.3593847407654896, + "learning_rate": 9.068676046082476e-05, + "loss": 2.971, + "step": 17893 + }, + { + "epoch": 0.8331121819493913, + "grad_norm": 0.3373786686321831, + "learning_rate": 9.068518598656963e-05, + "loss": 3.0313, + "step": 17894 + }, + { + "epoch": 0.8331587401354843, + "grad_norm": 0.34137982770469427, + "learning_rate": 9.068361139290749e-05, + "loss": 2.8867, + "step": 17895 + }, + { + "epoch": 0.8332052983215774, + "grad_norm": 0.323957936391566, + "learning_rate": 9.068203667984296e-05, + "loss": 2.9623, + "step": 17896 + }, + { + "epoch": 0.8332518565076704, + "grad_norm": 0.38095369520177014, + "learning_rate": 9.068046184738066e-05, + "loss": 2.9081, + "step": 17897 + }, + { + "epoch": 0.8332984146937635, + "grad_norm": 0.33316733171544893, + "learning_rate": 9.067888689552522e-05, + "loss": 2.9571, + "step": 17898 + }, + { + "epoch": 0.8333449728798566, + "grad_norm": 0.35195340193589997, + "learning_rate": 9.067731182428123e-05, + "loss": 2.8637, + "step": 17899 + }, + { + "epoch": 0.8333915310659497, + "grad_norm": 0.3457475737006071, + "learning_rate": 9.067573663365335e-05, + "loss": 2.9858, + "step": 17900 + }, + { + "epoch": 0.8334380892520428, + "grad_norm": 0.34337547477376906, + "learning_rate": 9.067416132364621e-05, + "loss": 3.0304, + "step": 17901 + }, + { + "epoch": 0.8334846474381358, + "grad_norm": 0.32689557696220034, + "learning_rate": 9.06725858942644e-05, + "loss": 2.9903, + "step": 17902 + }, + { + "epoch": 0.8335312056242289, + "grad_norm": 0.3368992335352803, + "learning_rate": 9.067101034551255e-05, + "loss": 2.9422, + "step": 17903 + }, + { + "epoch": 0.833577763810322, + "grad_norm": 0.3724369436975191, + "learning_rate": 9.066943467739528e-05, + "loss": 2.9229, + "step": 17904 + }, + { + "epoch": 0.833624321996415, + "grad_norm": 0.31023349312186194, + 
"learning_rate": 9.066785888991724e-05, + "loss": 3.0089, + "step": 17905 + }, + { + "epoch": 0.8336708801825081, + "grad_norm": 0.36795688531676285, + "learning_rate": 9.066628298308304e-05, + "loss": 2.9467, + "step": 17906 + }, + { + "epoch": 0.8337174383686011, + "grad_norm": 0.3149190792759346, + "learning_rate": 9.066470695689732e-05, + "loss": 2.9102, + "step": 17907 + }, + { + "epoch": 0.8337639965546942, + "grad_norm": 0.3247518613165964, + "learning_rate": 9.066313081136468e-05, + "loss": 2.8419, + "step": 17908 + }, + { + "epoch": 0.8338105547407872, + "grad_norm": 0.35293258569693886, + "learning_rate": 9.066155454648975e-05, + "loss": 3.0114, + "step": 17909 + }, + { + "epoch": 0.8338571129268804, + "grad_norm": 0.3647391277070358, + "learning_rate": 9.065997816227717e-05, + "loss": 2.9797, + "step": 17910 + }, + { + "epoch": 0.8339036711129735, + "grad_norm": 0.32235931048519123, + "learning_rate": 9.065840165873157e-05, + "loss": 2.82, + "step": 17911 + }, + { + "epoch": 0.8339502292990665, + "grad_norm": 0.34883730074865005, + "learning_rate": 9.065682503585754e-05, + "loss": 2.885, + "step": 17912 + }, + { + "epoch": 0.8339967874851596, + "grad_norm": 0.33659172216685623, + "learning_rate": 9.065524829365974e-05, + "loss": 2.9331, + "step": 17913 + }, + { + "epoch": 0.8340433456712526, + "grad_norm": 0.3080846789043819, + "learning_rate": 9.065367143214281e-05, + "loss": 2.8541, + "step": 17914 + }, + { + "epoch": 0.8340899038573457, + "grad_norm": 0.36321434571548905, + "learning_rate": 9.065209445131135e-05, + "loss": 2.9607, + "step": 17915 + }, + { + "epoch": 0.8341364620434388, + "grad_norm": 0.34855124644356117, + "learning_rate": 9.065051735117001e-05, + "loss": 2.913, + "step": 17916 + }, + { + "epoch": 0.8341830202295318, + "grad_norm": 0.3572125025582148, + "learning_rate": 9.064894013172338e-05, + "loss": 2.8427, + "step": 17917 + }, + { + "epoch": 0.834229578415625, + "grad_norm": 0.38352164464926297, + "learning_rate": 9.064736279297614e-05, + "loss": 2.9605, + "step": 17918 + }, + { + "epoch": 0.834276136601718, + "grad_norm": 0.3552264449560475, + "learning_rate": 9.064578533493288e-05, + "loss": 2.8319, + "step": 17919 + }, + { + "epoch": 0.8343226947878111, + "grad_norm": 0.4050313525159179, + "learning_rate": 9.064420775759824e-05, + "loss": 2.9837, + "step": 17920 + }, + { + "epoch": 0.8343692529739042, + "grad_norm": 0.3740070942721323, + "learning_rate": 9.064263006097687e-05, + "loss": 2.9853, + "step": 17921 + }, + { + "epoch": 0.8344158111599972, + "grad_norm": 0.35488039438086355, + "learning_rate": 9.064105224507338e-05, + "loss": 2.9545, + "step": 17922 + }, + { + "epoch": 0.8344623693460903, + "grad_norm": 0.37051457930477755, + "learning_rate": 9.063947430989238e-05, + "loss": 2.9703, + "step": 17923 + }, + { + "epoch": 0.8345089275321833, + "grad_norm": 0.34130590776891573, + "learning_rate": 9.063789625543856e-05, + "loss": 2.9709, + "step": 17924 + }, + { + "epoch": 0.8345554857182764, + "grad_norm": 0.3288021188249562, + "learning_rate": 9.063631808171652e-05, + "loss": 2.9786, + "step": 17925 + }, + { + "epoch": 0.8346020439043695, + "grad_norm": 0.3691623645890095, + "learning_rate": 9.063473978873087e-05, + "loss": 2.8914, + "step": 17926 + }, + { + "epoch": 0.8346486020904625, + "grad_norm": 0.33406670687578666, + "learning_rate": 9.063316137648628e-05, + "loss": 2.9164, + "step": 17927 + }, + { + "epoch": 0.8346951602765557, + "grad_norm": 0.34160734989604735, + "learning_rate": 9.063158284498734e-05, + "loss": 2.8695, + "step": 17928 
+ }, + { + "epoch": 0.8347417184626487, + "grad_norm": 0.31627061837551373, + "learning_rate": 9.063000419423872e-05, + "loss": 2.7743, + "step": 17929 + }, + { + "epoch": 0.8347882766487418, + "grad_norm": 0.37676846996398095, + "learning_rate": 9.062842542424504e-05, + "loss": 2.945, + "step": 17930 + }, + { + "epoch": 0.8348348348348348, + "grad_norm": 0.35984648373466, + "learning_rate": 9.062684653501095e-05, + "loss": 2.923, + "step": 17931 + }, + { + "epoch": 0.8348813930209279, + "grad_norm": 0.33698714700650667, + "learning_rate": 9.062526752654105e-05, + "loss": 2.9321, + "step": 17932 + }, + { + "epoch": 0.834927951207021, + "grad_norm": 0.3541326087193022, + "learning_rate": 9.062368839884e-05, + "loss": 2.8868, + "step": 17933 + }, + { + "epoch": 0.834974509393114, + "grad_norm": 0.3154660977711707, + "learning_rate": 9.06221091519124e-05, + "loss": 2.933, + "step": 17934 + }, + { + "epoch": 0.8350210675792071, + "grad_norm": 0.34671937396955027, + "learning_rate": 9.062052978576294e-05, + "loss": 2.7713, + "step": 17935 + }, + { + "epoch": 0.8350676257653001, + "grad_norm": 0.3318425726460116, + "learning_rate": 9.061895030039623e-05, + "loss": 2.8844, + "step": 17936 + }, + { + "epoch": 0.8351141839513933, + "grad_norm": 0.32705766140845915, + "learning_rate": 9.061737069581689e-05, + "loss": 2.9593, + "step": 17937 + }, + { + "epoch": 0.8351607421374864, + "grad_norm": 0.34328952517941896, + "learning_rate": 9.061579097202957e-05, + "loss": 2.8922, + "step": 17938 + }, + { + "epoch": 0.8352073003235794, + "grad_norm": 0.3209888331310987, + "learning_rate": 9.061421112903889e-05, + "loss": 2.8593, + "step": 17939 + }, + { + "epoch": 0.8352538585096725, + "grad_norm": 0.3348577302832602, + "learning_rate": 9.061263116684953e-05, + "loss": 2.9179, + "step": 17940 + }, + { + "epoch": 0.8353004166957655, + "grad_norm": 0.31076532900881954, + "learning_rate": 9.061105108546607e-05, + "loss": 2.9039, + "step": 17941 + }, + { + "epoch": 0.8353469748818586, + "grad_norm": 0.3276371029937438, + "learning_rate": 9.060947088489321e-05, + "loss": 2.8175, + "step": 17942 + }, + { + "epoch": 0.8353935330679517, + "grad_norm": 0.3189205725378996, + "learning_rate": 9.060789056513553e-05, + "loss": 3.0321, + "step": 17943 + }, + { + "epoch": 0.8354400912540447, + "grad_norm": 0.3955516372354715, + "learning_rate": 9.06063101261977e-05, + "loss": 2.9818, + "step": 17944 + }, + { + "epoch": 0.8354866494401378, + "grad_norm": 0.334823391751194, + "learning_rate": 9.060472956808435e-05, + "loss": 2.8914, + "step": 17945 + }, + { + "epoch": 0.8355332076262308, + "grad_norm": 0.35475345736188096, + "learning_rate": 9.060314889080012e-05, + "loss": 2.8644, + "step": 17946 + }, + { + "epoch": 0.835579765812324, + "grad_norm": 0.35462423306819335, + "learning_rate": 9.060156809434963e-05, + "loss": 2.9493, + "step": 17947 + }, + { + "epoch": 0.8356263239984171, + "grad_norm": 0.3669750631387423, + "learning_rate": 9.059998717873756e-05, + "loss": 2.9437, + "step": 17948 + }, + { + "epoch": 0.8356728821845101, + "grad_norm": 0.3525299214661175, + "learning_rate": 9.059840614396852e-05, + "loss": 2.9039, + "step": 17949 + }, + { + "epoch": 0.8357194403706032, + "grad_norm": 0.3296382004916963, + "learning_rate": 9.059682499004715e-05, + "loss": 2.8784, + "step": 17950 + }, + { + "epoch": 0.8357659985566962, + "grad_norm": 0.36459516647717083, + "learning_rate": 9.059524371697811e-05, + "loss": 2.9895, + "step": 17951 + }, + { + "epoch": 0.8358125567427893, + "grad_norm": 0.343714259482946, + 
"learning_rate": 9.059366232476603e-05, + "loss": 3.0797, + "step": 17952 + }, + { + "epoch": 0.8358591149288823, + "grad_norm": 0.3678481981721411, + "learning_rate": 9.059208081341555e-05, + "loss": 3.005, + "step": 17953 + }, + { + "epoch": 0.8359056731149754, + "grad_norm": 0.3390240481425575, + "learning_rate": 9.05904991829313e-05, + "loss": 2.9484, + "step": 17954 + }, + { + "epoch": 0.8359522313010685, + "grad_norm": 0.3769171929067259, + "learning_rate": 9.058891743331795e-05, + "loss": 2.883, + "step": 17955 + }, + { + "epoch": 0.8359987894871616, + "grad_norm": 0.32915021708708264, + "learning_rate": 9.05873355645801e-05, + "loss": 3.0552, + "step": 17956 + }, + { + "epoch": 0.8360453476732547, + "grad_norm": 0.41107646917423385, + "learning_rate": 9.058575357672243e-05, + "loss": 2.9053, + "step": 17957 + }, + { + "epoch": 0.8360919058593477, + "grad_norm": 0.32928716304017797, + "learning_rate": 9.058417146974958e-05, + "loss": 2.8757, + "step": 17958 + }, + { + "epoch": 0.8361384640454408, + "grad_norm": 0.36280624912483034, + "learning_rate": 9.058258924366618e-05, + "loss": 3.0019, + "step": 17959 + }, + { + "epoch": 0.8361850222315339, + "grad_norm": 0.3468730834949878, + "learning_rate": 9.058100689847689e-05, + "loss": 2.9247, + "step": 17960 + }, + { + "epoch": 0.8362315804176269, + "grad_norm": 0.34803948771475873, + "learning_rate": 9.057942443418633e-05, + "loss": 2.9743, + "step": 17961 + }, + { + "epoch": 0.83627813860372, + "grad_norm": 0.35631699262410715, + "learning_rate": 9.057784185079916e-05, + "loss": 2.9485, + "step": 17962 + }, + { + "epoch": 0.836324696789813, + "grad_norm": 0.31456730332251065, + "learning_rate": 9.057625914832003e-05, + "loss": 2.9129, + "step": 17963 + }, + { + "epoch": 0.8363712549759061, + "grad_norm": 0.3562789999840126, + "learning_rate": 9.057467632675356e-05, + "loss": 2.8694, + "step": 17964 + }, + { + "epoch": 0.8364178131619993, + "grad_norm": 0.3296901028765506, + "learning_rate": 9.05730933861044e-05, + "loss": 2.8385, + "step": 17965 + }, + { + "epoch": 0.8364643713480923, + "grad_norm": 0.3317320106815373, + "learning_rate": 9.057151032637725e-05, + "loss": 2.8396, + "step": 17966 + }, + { + "epoch": 0.8365109295341854, + "grad_norm": 0.3181923648857502, + "learning_rate": 9.056992714757668e-05, + "loss": 2.9393, + "step": 17967 + }, + { + "epoch": 0.8365574877202784, + "grad_norm": 0.342831764139186, + "learning_rate": 9.056834384970738e-05, + "loss": 2.8722, + "step": 17968 + }, + { + "epoch": 0.8366040459063715, + "grad_norm": 0.29754387809491745, + "learning_rate": 9.056676043277399e-05, + "loss": 2.987, + "step": 17969 + }, + { + "epoch": 0.8366506040924646, + "grad_norm": 0.33203295335741845, + "learning_rate": 9.056517689678116e-05, + "loss": 2.8601, + "step": 17970 + }, + { + "epoch": 0.8366971622785576, + "grad_norm": 0.3291171606507392, + "learning_rate": 9.056359324173354e-05, + "loss": 2.8467, + "step": 17971 + }, + { + "epoch": 0.8367437204646507, + "grad_norm": 0.33044790844543204, + "learning_rate": 9.056200946763575e-05, + "loss": 2.727, + "step": 17972 + }, + { + "epoch": 0.8367902786507437, + "grad_norm": 0.3796022246041977, + "learning_rate": 9.056042557449246e-05, + "loss": 2.8918, + "step": 17973 + }, + { + "epoch": 0.8368368368368369, + "grad_norm": 0.3942473629911452, + "learning_rate": 9.055884156230832e-05, + "loss": 2.898, + "step": 17974 + }, + { + "epoch": 0.8368833950229299, + "grad_norm": 0.3828072621416822, + "learning_rate": 9.055725743108798e-05, + "loss": 2.9003, + "step": 17975 + }, + { 
+ "epoch": 0.836929953209023, + "grad_norm": 0.3444958671742249, + "learning_rate": 9.055567318083608e-05, + "loss": 2.9812, + "step": 17976 + }, + { + "epoch": 0.8369765113951161, + "grad_norm": 0.3623734546181019, + "learning_rate": 9.055408881155728e-05, + "loss": 2.9759, + "step": 17977 + }, + { + "epoch": 0.8370230695812091, + "grad_norm": 0.360435132799618, + "learning_rate": 9.055250432325622e-05, + "loss": 2.9802, + "step": 17978 + }, + { + "epoch": 0.8370696277673022, + "grad_norm": 0.31967261856691126, + "learning_rate": 9.055091971593757e-05, + "loss": 2.8445, + "step": 17979 + }, + { + "epoch": 0.8371161859533952, + "grad_norm": 0.4000569227541948, + "learning_rate": 9.054933498960594e-05, + "loss": 2.9509, + "step": 17980 + }, + { + "epoch": 0.8371627441394883, + "grad_norm": 0.3091100797927701, + "learning_rate": 9.054775014426603e-05, + "loss": 2.9354, + "step": 17981 + }, + { + "epoch": 0.8372093023255814, + "grad_norm": 0.40150020116641355, + "learning_rate": 9.054616517992244e-05, + "loss": 2.9673, + "step": 17982 + }, + { + "epoch": 0.8372558605116744, + "grad_norm": 0.37209287260953905, + "learning_rate": 9.054458009657987e-05, + "loss": 2.8701, + "step": 17983 + }, + { + "epoch": 0.8373024186977676, + "grad_norm": 0.3836882349466262, + "learning_rate": 9.054299489424296e-05, + "loss": 2.9415, + "step": 17984 + }, + { + "epoch": 0.8373489768838606, + "grad_norm": 0.3591391094101308, + "learning_rate": 9.054140957291634e-05, + "loss": 2.9592, + "step": 17985 + }, + { + "epoch": 0.8373955350699537, + "grad_norm": 0.3508179450890698, + "learning_rate": 9.053982413260468e-05, + "loss": 2.8999, + "step": 17986 + }, + { + "epoch": 0.8374420932560468, + "grad_norm": 0.3682507858165746, + "learning_rate": 9.053823857331263e-05, + "loss": 3.0245, + "step": 17987 + }, + { + "epoch": 0.8374886514421398, + "grad_norm": 0.3271954646633217, + "learning_rate": 9.053665289504483e-05, + "loss": 2.8144, + "step": 17988 + }, + { + "epoch": 0.8375352096282329, + "grad_norm": 0.38968438312095177, + "learning_rate": 9.053506709780598e-05, + "loss": 2.8843, + "step": 17989 + }, + { + "epoch": 0.8375817678143259, + "grad_norm": 0.32078466437068537, + "learning_rate": 9.053348118160068e-05, + "loss": 2.8849, + "step": 17990 + }, + { + "epoch": 0.837628326000419, + "grad_norm": 0.38348413351373767, + "learning_rate": 9.05318951464336e-05, + "loss": 2.9023, + "step": 17991 + }, + { + "epoch": 0.8376748841865121, + "grad_norm": 0.36414000452730805, + "learning_rate": 9.053030899230942e-05, + "loss": 2.918, + "step": 17992 + }, + { + "epoch": 0.8377214423726052, + "grad_norm": 0.384256823730727, + "learning_rate": 9.052872271923275e-05, + "loss": 3.0701, + "step": 17993 + }, + { + "epoch": 0.8377680005586983, + "grad_norm": 0.39830444438479823, + "learning_rate": 9.052713632720828e-05, + "loss": 2.986, + "step": 17994 + }, + { + "epoch": 0.8378145587447913, + "grad_norm": 0.4101457596461047, + "learning_rate": 9.052554981624066e-05, + "loss": 2.9142, + "step": 17995 + }, + { + "epoch": 0.8378611169308844, + "grad_norm": 0.4073181727912378, + "learning_rate": 9.052396318633456e-05, + "loss": 2.9318, + "step": 17996 + }, + { + "epoch": 0.8379076751169774, + "grad_norm": 0.401710475654877, + "learning_rate": 9.05223764374946e-05, + "loss": 3.0352, + "step": 17997 + }, + { + "epoch": 0.8379542333030705, + "grad_norm": 0.35987555120774684, + "learning_rate": 9.052078956972547e-05, + "loss": 2.8915, + "step": 17998 + }, + { + "epoch": 0.8380007914891636, + "grad_norm": 0.41928117545452986, + 
"learning_rate": 9.051920258303182e-05, + "loss": 2.8965, + "step": 17999 + }, + { + "epoch": 0.8380473496752566, + "grad_norm": 0.3464481211028829, + "learning_rate": 9.051761547741829e-05, + "loss": 2.8283, + "step": 18000 + }, + { + "epoch": 0.8380939078613497, + "grad_norm": 0.4475629928049638, + "learning_rate": 9.051602825288954e-05, + "loss": 2.9126, + "step": 18001 + }, + { + "epoch": 0.8381404660474427, + "grad_norm": 0.3629878237264695, + "learning_rate": 9.051444090945025e-05, + "loss": 2.906, + "step": 18002 + }, + { + "epoch": 0.8381870242335359, + "grad_norm": 0.3849395683682759, + "learning_rate": 9.051285344710508e-05, + "loss": 2.885, + "step": 18003 + }, + { + "epoch": 0.838233582419629, + "grad_norm": 0.36686115199260666, + "learning_rate": 9.051126586585865e-05, + "loss": 2.906, + "step": 18004 + }, + { + "epoch": 0.838280140605722, + "grad_norm": 0.34129435226483185, + "learning_rate": 9.050967816571567e-05, + "loss": 2.8739, + "step": 18005 + }, + { + "epoch": 0.8383266987918151, + "grad_norm": 0.4021556737224909, + "learning_rate": 9.050809034668075e-05, + "loss": 2.9931, + "step": 18006 + }, + { + "epoch": 0.8383732569779081, + "grad_norm": 0.33092451890173835, + "learning_rate": 9.05065024087586e-05, + "loss": 2.9716, + "step": 18007 + }, + { + "epoch": 0.8384198151640012, + "grad_norm": 0.3773488089326414, + "learning_rate": 9.050491435195384e-05, + "loss": 2.932, + "step": 18008 + }, + { + "epoch": 0.8384663733500943, + "grad_norm": 0.4359625156245415, + "learning_rate": 9.050332617627116e-05, + "loss": 2.9744, + "step": 18009 + }, + { + "epoch": 0.8385129315361873, + "grad_norm": 0.4116164888061433, + "learning_rate": 9.05017378817152e-05, + "loss": 2.9201, + "step": 18010 + }, + { + "epoch": 0.8385594897222804, + "grad_norm": 0.37016229328932215, + "learning_rate": 9.050014946829063e-05, + "loss": 2.9549, + "step": 18011 + }, + { + "epoch": 0.8386060479083735, + "grad_norm": 0.4310421496535301, + "learning_rate": 9.04985609360021e-05, + "loss": 3.0393, + "step": 18012 + }, + { + "epoch": 0.8386526060944666, + "grad_norm": 0.4016173511237777, + "learning_rate": 9.049697228485431e-05, + "loss": 2.9714, + "step": 18013 + }, + { + "epoch": 0.8386991642805597, + "grad_norm": 0.3903758946432274, + "learning_rate": 9.049538351485187e-05, + "loss": 3.0619, + "step": 18014 + }, + { + "epoch": 0.8387457224666527, + "grad_norm": 0.42137028884461813, + "learning_rate": 9.049379462599948e-05, + "loss": 2.8937, + "step": 18015 + }, + { + "epoch": 0.8387922806527458, + "grad_norm": 0.34968048048565237, + "learning_rate": 9.04922056183018e-05, + "loss": 2.8378, + "step": 18016 + }, + { + "epoch": 0.8388388388388388, + "grad_norm": 0.3972140882826457, + "learning_rate": 9.049061649176348e-05, + "loss": 2.9719, + "step": 18017 + }, + { + "epoch": 0.8388853970249319, + "grad_norm": 0.4027924749314677, + "learning_rate": 9.04890272463892e-05, + "loss": 3.0237, + "step": 18018 + }, + { + "epoch": 0.8389319552110249, + "grad_norm": 0.3853747116761885, + "learning_rate": 9.048743788218359e-05, + "loss": 2.9821, + "step": 18019 + }, + { + "epoch": 0.838978513397118, + "grad_norm": 0.4064844101120361, + "learning_rate": 9.048584839915135e-05, + "loss": 2.9651, + "step": 18020 + }, + { + "epoch": 0.8390250715832112, + "grad_norm": 0.37913070997201365, + "learning_rate": 9.048425879729713e-05, + "loss": 2.8435, + "step": 18021 + }, + { + "epoch": 0.8390716297693042, + "grad_norm": 0.38199189463016825, + "learning_rate": 9.04826690766256e-05, + "loss": 3.042, + "step": 18022 + }, + { + 
"epoch": 0.8391181879553973, + "grad_norm": 0.37612676056518624, + "learning_rate": 9.048107923714143e-05, + "loss": 2.9426, + "step": 18023 + }, + { + "epoch": 0.8391647461414903, + "grad_norm": 0.3678124462854616, + "learning_rate": 9.047948927884927e-05, + "loss": 2.9084, + "step": 18024 + }, + { + "epoch": 0.8392113043275834, + "grad_norm": 0.3737130475130803, + "learning_rate": 9.04778992017538e-05, + "loss": 3.0437, + "step": 18025 + }, + { + "epoch": 0.8392578625136765, + "grad_norm": 0.35338451835209117, + "learning_rate": 9.04763090058597e-05, + "loss": 2.8686, + "step": 18026 + }, + { + "epoch": 0.8393044206997695, + "grad_norm": 0.3468712718561904, + "learning_rate": 9.047471869117161e-05, + "loss": 2.8875, + "step": 18027 + }, + { + "epoch": 0.8393509788858626, + "grad_norm": 0.38917456885396845, + "learning_rate": 9.04731282576942e-05, + "loss": 2.7966, + "step": 18028 + }, + { + "epoch": 0.8393975370719556, + "grad_norm": 0.36864157427079247, + "learning_rate": 9.047153770543215e-05, + "loss": 3.0211, + "step": 18029 + }, + { + "epoch": 0.8394440952580488, + "grad_norm": 0.4172328852304096, + "learning_rate": 9.046994703439014e-05, + "loss": 3.0049, + "step": 18030 + }, + { + "epoch": 0.8394906534441419, + "grad_norm": 0.35571726246294555, + "learning_rate": 9.04683562445728e-05, + "loss": 2.9631, + "step": 18031 + }, + { + "epoch": 0.8395372116302349, + "grad_norm": 0.39047012481383375, + "learning_rate": 9.046676533598482e-05, + "loss": 2.9617, + "step": 18032 + }, + { + "epoch": 0.839583769816328, + "grad_norm": 0.3376551028175652, + "learning_rate": 9.046517430863088e-05, + "loss": 2.8151, + "step": 18033 + }, + { + "epoch": 0.839630328002421, + "grad_norm": 0.3228942885095457, + "learning_rate": 9.046358316251564e-05, + "loss": 2.9157, + "step": 18034 + }, + { + "epoch": 0.8396768861885141, + "grad_norm": 0.3517643967377197, + "learning_rate": 9.046199189764377e-05, + "loss": 3.0196, + "step": 18035 + }, + { + "epoch": 0.8397234443746072, + "grad_norm": 0.33892273060384404, + "learning_rate": 9.046040051401995e-05, + "loss": 2.9104, + "step": 18036 + }, + { + "epoch": 0.8397700025607002, + "grad_norm": 0.3713183063297818, + "learning_rate": 9.045880901164883e-05, + "loss": 2.998, + "step": 18037 + }, + { + "epoch": 0.8398165607467933, + "grad_norm": 0.32448281656747585, + "learning_rate": 9.045721739053509e-05, + "loss": 2.9423, + "step": 18038 + }, + { + "epoch": 0.8398631189328863, + "grad_norm": 0.35926067314544236, + "learning_rate": 9.04556256506834e-05, + "loss": 2.9007, + "step": 18039 + }, + { + "epoch": 0.8399096771189795, + "grad_norm": 0.3393596218194373, + "learning_rate": 9.045403379209844e-05, + "loss": 2.9457, + "step": 18040 + }, + { + "epoch": 0.8399562353050725, + "grad_norm": 0.338642520344886, + "learning_rate": 9.045244181478487e-05, + "loss": 2.9447, + "step": 18041 + }, + { + "epoch": 0.8400027934911656, + "grad_norm": 0.3540743570683154, + "learning_rate": 9.045084971874738e-05, + "loss": 2.878, + "step": 18042 + }, + { + "epoch": 0.8400493516772587, + "grad_norm": 0.3632526129455145, + "learning_rate": 9.044925750399062e-05, + "loss": 3.059, + "step": 18043 + }, + { + "epoch": 0.8400959098633517, + "grad_norm": 0.3314593535999482, + "learning_rate": 9.044766517051927e-05, + "loss": 3.043, + "step": 18044 + }, + { + "epoch": 0.8401424680494448, + "grad_norm": 0.3643265154104831, + "learning_rate": 9.044607271833803e-05, + "loss": 2.9307, + "step": 18045 + }, + { + "epoch": 0.8401890262355378, + "grad_norm": 0.3391097157121855, + "learning_rate": 
9.044448014745153e-05, + "loss": 2.9601, + "step": 18046 + }, + { + "epoch": 0.8402355844216309, + "grad_norm": 0.3320816169018093, + "learning_rate": 9.044288745786446e-05, + "loss": 2.8669, + "step": 18047 + }, + { + "epoch": 0.840282142607724, + "grad_norm": 0.3277854638326494, + "learning_rate": 9.044129464958153e-05, + "loss": 2.9293, + "step": 18048 + }, + { + "epoch": 0.840328700793817, + "grad_norm": 0.36877560987601415, + "learning_rate": 9.043970172260737e-05, + "loss": 2.9515, + "step": 18049 + }, + { + "epoch": 0.8403752589799102, + "grad_norm": 0.35860729424904936, + "learning_rate": 9.043810867694666e-05, + "loss": 2.9343, + "step": 18050 + }, + { + "epoch": 0.8404218171660032, + "grad_norm": 0.3575664736860052, + "learning_rate": 9.043651551260408e-05, + "loss": 3.0084, + "step": 18051 + }, + { + "epoch": 0.8404683753520963, + "grad_norm": 0.36563684831554666, + "learning_rate": 9.043492222958433e-05, + "loss": 2.9882, + "step": 18052 + }, + { + "epoch": 0.8405149335381894, + "grad_norm": 0.42909067042793797, + "learning_rate": 9.043332882789206e-05, + "loss": 2.9529, + "step": 18053 + }, + { + "epoch": 0.8405614917242824, + "grad_norm": 0.35029284479524614, + "learning_rate": 9.043173530753195e-05, + "loss": 2.968, + "step": 18054 + }, + { + "epoch": 0.8406080499103755, + "grad_norm": 0.3832296107341125, + "learning_rate": 9.043014166850867e-05, + "loss": 2.9499, + "step": 18055 + }, + { + "epoch": 0.8406546080964685, + "grad_norm": 0.35261278761467685, + "learning_rate": 9.042854791082693e-05, + "loss": 2.8537, + "step": 18056 + }, + { + "epoch": 0.8407011662825616, + "grad_norm": 0.3723326321954094, + "learning_rate": 9.042695403449137e-05, + "loss": 2.9777, + "step": 18057 + }, + { + "epoch": 0.8407477244686548, + "grad_norm": 0.37218859013904165, + "learning_rate": 9.042536003950667e-05, + "loss": 3.0527, + "step": 18058 + }, + { + "epoch": 0.8407942826547478, + "grad_norm": 0.3603650743717589, + "learning_rate": 9.042376592587756e-05, + "loss": 2.9442, + "step": 18059 + }, + { + "epoch": 0.8408408408408409, + "grad_norm": 0.33112478624691943, + "learning_rate": 9.042217169360865e-05, + "loss": 2.808, + "step": 18060 + }, + { + "epoch": 0.8408873990269339, + "grad_norm": 0.3826355658805846, + "learning_rate": 9.042057734270465e-05, + "loss": 2.9324, + "step": 18061 + }, + { + "epoch": 0.840933957213027, + "grad_norm": 0.31760902834274635, + "learning_rate": 9.041898287317025e-05, + "loss": 2.8196, + "step": 18062 + }, + { + "epoch": 0.84098051539912, + "grad_norm": 0.38204106100366203, + "learning_rate": 9.04173882850101e-05, + "loss": 2.9316, + "step": 18063 + }, + { + "epoch": 0.8410270735852131, + "grad_norm": 0.37751686050670286, + "learning_rate": 9.041579357822891e-05, + "loss": 2.8856, + "step": 18064 + }, + { + "epoch": 0.8410736317713062, + "grad_norm": 0.37209775294922254, + "learning_rate": 9.041419875283135e-05, + "loss": 2.9527, + "step": 18065 + }, + { + "epoch": 0.8411201899573992, + "grad_norm": 0.35976796150418666, + "learning_rate": 9.041260380882209e-05, + "loss": 2.8959, + "step": 18066 + }, + { + "epoch": 0.8411667481434923, + "grad_norm": 0.3634637429535358, + "learning_rate": 9.041100874620581e-05, + "loss": 2.9182, + "step": 18067 + }, + { + "epoch": 0.8412133063295854, + "grad_norm": 0.37628153512012086, + "learning_rate": 9.040941356498722e-05, + "loss": 2.9217, + "step": 18068 + }, + { + "epoch": 0.8412598645156785, + "grad_norm": 0.3657924146833979, + "learning_rate": 9.040781826517097e-05, + "loss": 2.8569, + "step": 18069 + }, + { + 
"epoch": 0.8413064227017716, + "grad_norm": 0.4229026318545703, + "learning_rate": 9.040622284676176e-05, + "loss": 3.008, + "step": 18070 + }, + { + "epoch": 0.8413529808878646, + "grad_norm": 0.3735357059545708, + "learning_rate": 9.040462730976427e-05, + "loss": 2.7383, + "step": 18071 + }, + { + "epoch": 0.8413995390739577, + "grad_norm": 0.39817015443920256, + "learning_rate": 9.040303165418316e-05, + "loss": 2.8567, + "step": 18072 + }, + { + "epoch": 0.8414460972600507, + "grad_norm": 0.40545792607218895, + "learning_rate": 9.040143588002316e-05, + "loss": 2.9153, + "step": 18073 + }, + { + "epoch": 0.8414926554461438, + "grad_norm": 0.36018421643489223, + "learning_rate": 9.039983998728891e-05, + "loss": 2.9752, + "step": 18074 + }, + { + "epoch": 0.8415392136322369, + "grad_norm": 0.3873799601599888, + "learning_rate": 9.039824397598511e-05, + "loss": 3.0105, + "step": 18075 + }, + { + "epoch": 0.8415857718183299, + "grad_norm": 0.37599189327006854, + "learning_rate": 9.039664784611643e-05, + "loss": 2.8671, + "step": 18076 + }, + { + "epoch": 0.8416323300044231, + "grad_norm": 0.378229210973643, + "learning_rate": 9.03950515976876e-05, + "loss": 2.9457, + "step": 18077 + }, + { + "epoch": 0.8416788881905161, + "grad_norm": 0.3650354251350116, + "learning_rate": 9.039345523070324e-05, + "loss": 2.82, + "step": 18078 + }, + { + "epoch": 0.8417254463766092, + "grad_norm": 0.35565190672100416, + "learning_rate": 9.039185874516808e-05, + "loss": 3.0027, + "step": 18079 + }, + { + "epoch": 0.8417720045627023, + "grad_norm": 0.35781902988301584, + "learning_rate": 9.039026214108681e-05, + "loss": 3.0209, + "step": 18080 + }, + { + "epoch": 0.8418185627487953, + "grad_norm": 0.33161995420075846, + "learning_rate": 9.038866541846409e-05, + "loss": 2.8163, + "step": 18081 + }, + { + "epoch": 0.8418651209348884, + "grad_norm": 0.36274332893657146, + "learning_rate": 9.038706857730463e-05, + "loss": 2.9101, + "step": 18082 + }, + { + "epoch": 0.8419116791209814, + "grad_norm": 0.335726015465025, + "learning_rate": 9.038547161761308e-05, + "loss": 2.9344, + "step": 18083 + }, + { + "epoch": 0.8419582373070745, + "grad_norm": 0.390135524002679, + "learning_rate": 9.038387453939415e-05, + "loss": 2.8453, + "step": 18084 + }, + { + "epoch": 0.8420047954931675, + "grad_norm": 0.3199155150781332, + "learning_rate": 9.038227734265255e-05, + "loss": 2.8765, + "step": 18085 + }, + { + "epoch": 0.8420513536792607, + "grad_norm": 0.36195527253523524, + "learning_rate": 9.038068002739292e-05, + "loss": 2.8299, + "step": 18086 + }, + { + "epoch": 0.8420979118653538, + "grad_norm": 0.32540102876691385, + "learning_rate": 9.037908259361998e-05, + "loss": 2.8453, + "step": 18087 + }, + { + "epoch": 0.8421444700514468, + "grad_norm": 0.36457744631385003, + "learning_rate": 9.037748504133841e-05, + "loss": 3.0087, + "step": 18088 + }, + { + "epoch": 0.8421910282375399, + "grad_norm": 0.35982144941778527, + "learning_rate": 9.03758873705529e-05, + "loss": 3.0111, + "step": 18089 + }, + { + "epoch": 0.8422375864236329, + "grad_norm": 0.35303255122841903, + "learning_rate": 9.037428958126813e-05, + "loss": 2.9358, + "step": 18090 + }, + { + "epoch": 0.842284144609726, + "grad_norm": 0.3607687282100919, + "learning_rate": 9.037269167348881e-05, + "loss": 2.9657, + "step": 18091 + }, + { + "epoch": 0.8423307027958191, + "grad_norm": 0.622045591757042, + "learning_rate": 9.037109364721961e-05, + "loss": 2.8448, + "step": 18092 + }, + { + "epoch": 0.8423772609819121, + "grad_norm": 0.39146775612770796, + 
"learning_rate": 9.036949550246523e-05, + "loss": 3.0112, + "step": 18093 + }, + { + "epoch": 0.8424238191680052, + "grad_norm": 0.36015566568086926, + "learning_rate": 9.036789723923037e-05, + "loss": 2.8942, + "step": 18094 + }, + { + "epoch": 0.8424703773540982, + "grad_norm": 0.3708720127724316, + "learning_rate": 9.03662988575197e-05, + "loss": 2.9119, + "step": 18095 + }, + { + "epoch": 0.8425169355401914, + "grad_norm": 0.37180026481934086, + "learning_rate": 9.036470035733791e-05, + "loss": 2.9571, + "step": 18096 + }, + { + "epoch": 0.8425634937262845, + "grad_norm": 0.3562400136389439, + "learning_rate": 9.03631017386897e-05, + "loss": 2.864, + "step": 18097 + }, + { + "epoch": 0.8426100519123775, + "grad_norm": 0.3445746440190251, + "learning_rate": 9.036150300157977e-05, + "loss": 2.9094, + "step": 18098 + }, + { + "epoch": 0.8426566100984706, + "grad_norm": 0.35751941773110285, + "learning_rate": 9.035990414601281e-05, + "loss": 2.9439, + "step": 18099 + }, + { + "epoch": 0.8427031682845636, + "grad_norm": 0.36259126823779525, + "learning_rate": 9.03583051719935e-05, + "loss": 2.8718, + "step": 18100 + }, + { + "epoch": 0.8427497264706567, + "grad_norm": 0.3694058202790951, + "learning_rate": 9.035670607952655e-05, + "loss": 3.0287, + "step": 18101 + }, + { + "epoch": 0.8427962846567498, + "grad_norm": 0.3550820891560087, + "learning_rate": 9.035510686861663e-05, + "loss": 2.8783, + "step": 18102 + }, + { + "epoch": 0.8428428428428428, + "grad_norm": 0.33363148364432593, + "learning_rate": 9.035350753926846e-05, + "loss": 2.8938, + "step": 18103 + }, + { + "epoch": 0.842889401028936, + "grad_norm": 0.3730344194402114, + "learning_rate": 9.035190809148671e-05, + "loss": 2.9078, + "step": 18104 + }, + { + "epoch": 0.842935959215029, + "grad_norm": 0.330558856895669, + "learning_rate": 9.035030852527609e-05, + "loss": 2.937, + "step": 18105 + }, + { + "epoch": 0.8429825174011221, + "grad_norm": 0.3737856841023627, + "learning_rate": 9.034870884064128e-05, + "loss": 2.9213, + "step": 18106 + }, + { + "epoch": 0.8430290755872151, + "grad_norm": 0.34044382416424934, + "learning_rate": 9.0347109037587e-05, + "loss": 2.97, + "step": 18107 + }, + { + "epoch": 0.8430756337733082, + "grad_norm": 0.4440835084776613, + "learning_rate": 9.034550911611792e-05, + "loss": 3.0129, + "step": 18108 + }, + { + "epoch": 0.8431221919594013, + "grad_norm": 0.38619899147056025, + "learning_rate": 9.034390907623875e-05, + "loss": 2.9499, + "step": 18109 + }, + { + "epoch": 0.8431687501454943, + "grad_norm": 0.32871912665080777, + "learning_rate": 9.034230891795419e-05, + "loss": 2.9233, + "step": 18110 + }, + { + "epoch": 0.8432153083315874, + "grad_norm": 0.3824050516778787, + "learning_rate": 9.034070864126891e-05, + "loss": 2.9511, + "step": 18111 + }, + { + "epoch": 0.8432618665176804, + "grad_norm": 0.3227620457689167, + "learning_rate": 9.033910824618764e-05, + "loss": 2.9086, + "step": 18112 + }, + { + "epoch": 0.8433084247037735, + "grad_norm": 0.3467891069342665, + "learning_rate": 9.033750773271506e-05, + "loss": 2.8678, + "step": 18113 + }, + { + "epoch": 0.8433549828898667, + "grad_norm": 0.35348463652932505, + "learning_rate": 9.033590710085585e-05, + "loss": 2.9117, + "step": 18114 + }, + { + "epoch": 0.8434015410759597, + "grad_norm": 0.3149925316078286, + "learning_rate": 9.033430635061474e-05, + "loss": 2.9775, + "step": 18115 + }, + { + "epoch": 0.8434480992620528, + "grad_norm": 0.3819374553989262, + "learning_rate": 9.033270548199642e-05, + "loss": 2.954, + "step": 18116 + }, + { + 
"epoch": 0.8434946574481458, + "grad_norm": 0.3353709347905928, + "learning_rate": 9.033110449500557e-05, + "loss": 2.967, + "step": 18117 + }, + { + "epoch": 0.8435412156342389, + "grad_norm": 0.3847696953102422, + "learning_rate": 9.03295033896469e-05, + "loss": 2.9997, + "step": 18118 + }, + { + "epoch": 0.843587773820332, + "grad_norm": 0.3350239877086397, + "learning_rate": 9.032790216592513e-05, + "loss": 2.8461, + "step": 18119 + }, + { + "epoch": 0.843634332006425, + "grad_norm": 0.33695138440538186, + "learning_rate": 9.032630082384492e-05, + "loss": 2.8808, + "step": 18120 + }, + { + "epoch": 0.8436808901925181, + "grad_norm": 0.3511701781298031, + "learning_rate": 9.032469936341099e-05, + "loss": 2.9904, + "step": 18121 + }, + { + "epoch": 0.8437274483786111, + "grad_norm": 0.36781973727555695, + "learning_rate": 9.032309778462806e-05, + "loss": 2.8826, + "step": 18122 + }, + { + "epoch": 0.8437740065647042, + "grad_norm": 0.3446901750321595, + "learning_rate": 9.03214960875008e-05, + "loss": 2.8623, + "step": 18123 + }, + { + "epoch": 0.8438205647507974, + "grad_norm": 0.3594484439501566, + "learning_rate": 9.031989427203391e-05, + "loss": 2.9334, + "step": 18124 + }, + { + "epoch": 0.8438671229368904, + "grad_norm": 0.3345357306608101, + "learning_rate": 9.031829233823212e-05, + "loss": 2.9062, + "step": 18125 + }, + { + "epoch": 0.8439136811229835, + "grad_norm": 0.3973806275235809, + "learning_rate": 9.03166902861001e-05, + "loss": 2.8972, + "step": 18126 + }, + { + "epoch": 0.8439602393090765, + "grad_norm": 0.3433287287138544, + "learning_rate": 9.031508811564258e-05, + "loss": 2.9589, + "step": 18127 + }, + { + "epoch": 0.8440067974951696, + "grad_norm": 0.3687872662156059, + "learning_rate": 9.031348582686423e-05, + "loss": 2.9274, + "step": 18128 + }, + { + "epoch": 0.8440533556812626, + "grad_norm": 0.33836834006978994, + "learning_rate": 9.031188341976979e-05, + "loss": 2.9282, + "step": 18129 + }, + { + "epoch": 0.8440999138673557, + "grad_norm": 0.37258882237297436, + "learning_rate": 9.031028089436393e-05, + "loss": 2.9605, + "step": 18130 + }, + { + "epoch": 0.8441464720534488, + "grad_norm": 0.33447921194771607, + "learning_rate": 9.030867825065137e-05, + "loss": 2.9287, + "step": 18131 + }, + { + "epoch": 0.8441930302395418, + "grad_norm": 0.38364559665978737, + "learning_rate": 9.030707548863681e-05, + "loss": 2.9223, + "step": 18132 + }, + { + "epoch": 0.844239588425635, + "grad_norm": 0.3228778245141337, + "learning_rate": 9.030547260832496e-05, + "loss": 2.9049, + "step": 18133 + }, + { + "epoch": 0.844286146611728, + "grad_norm": 0.3792450364701971, + "learning_rate": 9.030386960972052e-05, + "loss": 2.9676, + "step": 18134 + }, + { + "epoch": 0.8443327047978211, + "grad_norm": 0.3342818057528186, + "learning_rate": 9.030226649282817e-05, + "loss": 2.8838, + "step": 18135 + }, + { + "epoch": 0.8443792629839142, + "grad_norm": 0.386750216438377, + "learning_rate": 9.030066325765267e-05, + "loss": 2.8757, + "step": 18136 + }, + { + "epoch": 0.8444258211700072, + "grad_norm": 0.3272082276481675, + "learning_rate": 9.029905990419868e-05, + "loss": 2.8829, + "step": 18137 + }, + { + "epoch": 0.8444723793561003, + "grad_norm": 0.36238168889851907, + "learning_rate": 9.029745643247091e-05, + "loss": 2.9981, + "step": 18138 + }, + { + "epoch": 0.8445189375421933, + "grad_norm": 0.42645609813135327, + "learning_rate": 9.029585284247409e-05, + "loss": 2.8634, + "step": 18139 + }, + { + "epoch": 0.8445654957282864, + "grad_norm": 0.3451239344201154, + "learning_rate": 
9.029424913421292e-05, + "loss": 2.9255, + "step": 18140 + }, + { + "epoch": 0.8446120539143795, + "grad_norm": 0.38672109962194, + "learning_rate": 9.029264530769208e-05, + "loss": 2.94, + "step": 18141 + }, + { + "epoch": 0.8446586121004726, + "grad_norm": 0.3705416423139799, + "learning_rate": 9.02910413629163e-05, + "loss": 2.8969, + "step": 18142 + }, + { + "epoch": 0.8447051702865657, + "grad_norm": 0.418675184118661, + "learning_rate": 9.028943729989028e-05, + "loss": 2.9157, + "step": 18143 + }, + { + "epoch": 0.8447517284726587, + "grad_norm": 0.34320681045213763, + "learning_rate": 9.028783311861874e-05, + "loss": 2.9609, + "step": 18144 + }, + { + "epoch": 0.8447982866587518, + "grad_norm": 0.34453002600246635, + "learning_rate": 9.028622881910637e-05, + "loss": 2.8619, + "step": 18145 + }, + { + "epoch": 0.8448448448448449, + "grad_norm": 0.3355357791329725, + "learning_rate": 9.028462440135788e-05, + "loss": 2.8654, + "step": 18146 + }, + { + "epoch": 0.8448914030309379, + "grad_norm": 0.33133141617188394, + "learning_rate": 9.0283019865378e-05, + "loss": 2.9561, + "step": 18147 + }, + { + "epoch": 0.844937961217031, + "grad_norm": 0.36690029585725586, + "learning_rate": 9.028141521117142e-05, + "loss": 2.9686, + "step": 18148 + }, + { + "epoch": 0.844984519403124, + "grad_norm": 0.32275373955542147, + "learning_rate": 9.027981043874286e-05, + "loss": 2.8487, + "step": 18149 + }, + { + "epoch": 0.8450310775892171, + "grad_norm": 0.3408092891120023, + "learning_rate": 9.027820554809702e-05, + "loss": 2.9268, + "step": 18150 + }, + { + "epoch": 0.8450776357753101, + "grad_norm": 0.3138230959761916, + "learning_rate": 9.02766005392386e-05, + "loss": 2.9356, + "step": 18151 + }, + { + "epoch": 0.8451241939614033, + "grad_norm": 0.3342902387330918, + "learning_rate": 9.027499541217235e-05, + "loss": 2.8957, + "step": 18152 + }, + { + "epoch": 0.8451707521474964, + "grad_norm": 0.3266577798972419, + "learning_rate": 9.027339016690294e-05, + "loss": 2.8894, + "step": 18153 + }, + { + "epoch": 0.8452173103335894, + "grad_norm": 0.4106606876863189, + "learning_rate": 9.027178480343512e-05, + "loss": 2.9571, + "step": 18154 + }, + { + "epoch": 0.8452638685196825, + "grad_norm": 0.3229082028409225, + "learning_rate": 9.027017932177355e-05, + "loss": 2.9635, + "step": 18155 + }, + { + "epoch": 0.8453104267057755, + "grad_norm": 0.34243380451544486, + "learning_rate": 9.0268573721923e-05, + "loss": 2.8377, + "step": 18156 + }, + { + "epoch": 0.8453569848918686, + "grad_norm": 0.3979918982891177, + "learning_rate": 9.026696800388813e-05, + "loss": 2.8683, + "step": 18157 + }, + { + "epoch": 0.8454035430779617, + "grad_norm": 0.3941563444500301, + "learning_rate": 9.026536216767369e-05, + "loss": 2.9376, + "step": 18158 + }, + { + "epoch": 0.8454501012640547, + "grad_norm": 0.33182541615600825, + "learning_rate": 9.026375621328436e-05, + "loss": 2.89, + "step": 18159 + }, + { + "epoch": 0.8454966594501478, + "grad_norm": 0.3694046864014465, + "learning_rate": 9.02621501407249e-05, + "loss": 2.9648, + "step": 18160 + }, + { + "epoch": 0.8455432176362409, + "grad_norm": 0.2981981425895955, + "learning_rate": 9.026054394999997e-05, + "loss": 2.8716, + "step": 18161 + }, + { + "epoch": 0.845589775822334, + "grad_norm": 0.38237837295294164, + "learning_rate": 9.025893764111433e-05, + "loss": 2.9263, + "step": 18162 + }, + { + "epoch": 0.8456363340084271, + "grad_norm": 0.3217573372147531, + "learning_rate": 9.025733121407265e-05, + "loss": 2.9805, + "step": 18163 + }, + { + "epoch": 
0.8456828921945201, + "grad_norm": 0.3608637205316, + "learning_rate": 9.02557246688797e-05, + "loss": 2.9058, + "step": 18164 + }, + { + "epoch": 0.8457294503806132, + "grad_norm": 0.36736432070681113, + "learning_rate": 9.025411800554013e-05, + "loss": 2.9095, + "step": 18165 + }, + { + "epoch": 0.8457760085667062, + "grad_norm": 0.3503396122419831, + "learning_rate": 9.025251122405871e-05, + "loss": 2.9263, + "step": 18166 + }, + { + "epoch": 0.8458225667527993, + "grad_norm": 0.39134844413987135, + "learning_rate": 9.025090432444013e-05, + "loss": 2.9576, + "step": 18167 + }, + { + "epoch": 0.8458691249388924, + "grad_norm": 0.3431988497612487, + "learning_rate": 9.024929730668912e-05, + "loss": 2.9833, + "step": 18168 + }, + { + "epoch": 0.8459156831249854, + "grad_norm": 0.3649797170205078, + "learning_rate": 9.024769017081037e-05, + "loss": 2.9124, + "step": 18169 + }, + { + "epoch": 0.8459622413110786, + "grad_norm": 0.3088261196752205, + "learning_rate": 9.024608291680862e-05, + "loss": 2.9889, + "step": 18170 + }, + { + "epoch": 0.8460087994971716, + "grad_norm": 0.3867354207389415, + "learning_rate": 9.02444755446886e-05, + "loss": 2.9346, + "step": 18171 + }, + { + "epoch": 0.8460553576832647, + "grad_norm": 0.32785335741914456, + "learning_rate": 9.024286805445498e-05, + "loss": 2.8981, + "step": 18172 + }, + { + "epoch": 0.8461019158693577, + "grad_norm": 0.3727975819549842, + "learning_rate": 9.024126044611254e-05, + "loss": 2.9764, + "step": 18173 + }, + { + "epoch": 0.8461484740554508, + "grad_norm": 0.3438645679875695, + "learning_rate": 9.023965271966594e-05, + "loss": 2.9927, + "step": 18174 + }, + { + "epoch": 0.8461950322415439, + "grad_norm": 0.35514704065282693, + "learning_rate": 9.023804487511994e-05, + "loss": 3.0275, + "step": 18175 + }, + { + "epoch": 0.8462415904276369, + "grad_norm": 0.36405168207306615, + "learning_rate": 9.023643691247923e-05, + "loss": 2.9193, + "step": 18176 + }, + { + "epoch": 0.84628814861373, + "grad_norm": 0.3814326174067972, + "learning_rate": 9.023482883174854e-05, + "loss": 2.9673, + "step": 18177 + }, + { + "epoch": 0.846334706799823, + "grad_norm": 0.32433135103291744, + "learning_rate": 9.02332206329326e-05, + "loss": 2.8992, + "step": 18178 + }, + { + "epoch": 0.8463812649859161, + "grad_norm": 0.3551017451252014, + "learning_rate": 9.023161231603612e-05, + "loss": 2.8514, + "step": 18179 + }, + { + "epoch": 0.8464278231720093, + "grad_norm": 0.3067149725161581, + "learning_rate": 9.023000388106381e-05, + "loss": 2.8782, + "step": 18180 + }, + { + "epoch": 0.8464743813581023, + "grad_norm": 0.3545214108579382, + "learning_rate": 9.02283953280204e-05, + "loss": 2.9666, + "step": 18181 + }, + { + "epoch": 0.8465209395441954, + "grad_norm": 0.3267001247615826, + "learning_rate": 9.022678665691063e-05, + "loss": 2.8831, + "step": 18182 + }, + { + "epoch": 0.8465674977302884, + "grad_norm": 0.37979468461850524, + "learning_rate": 9.022517786773921e-05, + "loss": 2.9177, + "step": 18183 + }, + { + "epoch": 0.8466140559163815, + "grad_norm": 0.4545340977584082, + "learning_rate": 9.022356896051083e-05, + "loss": 3.0111, + "step": 18184 + }, + { + "epoch": 0.8466606141024746, + "grad_norm": 0.364999569824744, + "learning_rate": 9.022195993523027e-05, + "loss": 2.8964, + "step": 18185 + }, + { + "epoch": 0.8467071722885676, + "grad_norm": 0.4019362089651236, + "learning_rate": 9.022035079190219e-05, + "loss": 2.8875, + "step": 18186 + }, + { + "epoch": 0.8467537304746607, + "grad_norm": 0.3561648651898302, + "learning_rate": 
9.021874153053135e-05, + "loss": 2.8552, + "step": 18187 + }, + { + "epoch": 0.8468002886607537, + "grad_norm": 0.4206498562020498, + "learning_rate": 9.021713215112246e-05, + "loss": 2.922, + "step": 18188 + }, + { + "epoch": 0.8468468468468469, + "grad_norm": 0.30243113747502154, + "learning_rate": 9.021552265368028e-05, + "loss": 2.9949, + "step": 18189 + }, + { + "epoch": 0.84689340503294, + "grad_norm": 0.42262086581960906, + "learning_rate": 9.021391303820948e-05, + "loss": 2.9458, + "step": 18190 + }, + { + "epoch": 0.846939963219033, + "grad_norm": 0.3517442131123969, + "learning_rate": 9.02123033047148e-05, + "loss": 2.8745, + "step": 18191 + }, + { + "epoch": 0.8469865214051261, + "grad_norm": 0.43066093711625686, + "learning_rate": 9.021069345320098e-05, + "loss": 2.9213, + "step": 18192 + }, + { + "epoch": 0.8470330795912191, + "grad_norm": 0.36472616741297126, + "learning_rate": 9.020908348367272e-05, + "loss": 2.8801, + "step": 18193 + }, + { + "epoch": 0.8470796377773122, + "grad_norm": 0.4069015905632751, + "learning_rate": 9.020747339613479e-05, + "loss": 2.9888, + "step": 18194 + }, + { + "epoch": 0.8471261959634052, + "grad_norm": 0.37413215756129475, + "learning_rate": 9.020586319059187e-05, + "loss": 2.8957, + "step": 18195 + }, + { + "epoch": 0.8471727541494983, + "grad_norm": 0.3451853182194196, + "learning_rate": 9.020425286704872e-05, + "loss": 2.8294, + "step": 18196 + }, + { + "epoch": 0.8472193123355914, + "grad_norm": 0.35929590449878746, + "learning_rate": 9.020264242551002e-05, + "loss": 2.9212, + "step": 18197 + }, + { + "epoch": 0.8472658705216844, + "grad_norm": 0.348273259650246, + "learning_rate": 9.020103186598054e-05, + "loss": 2.9099, + "step": 18198 + }, + { + "epoch": 0.8473124287077776, + "grad_norm": 0.36301670008000403, + "learning_rate": 9.0199421188465e-05, + "loss": 2.913, + "step": 18199 + }, + { + "epoch": 0.8473589868938706, + "grad_norm": 0.3637324968827068, + "learning_rate": 9.019781039296811e-05, + "loss": 2.95, + "step": 18200 + }, + { + "epoch": 0.8474055450799637, + "grad_norm": 0.3489007882249433, + "learning_rate": 9.01961994794946e-05, + "loss": 2.9602, + "step": 18201 + }, + { + "epoch": 0.8474521032660568, + "grad_norm": 0.3859857329740419, + "learning_rate": 9.019458844804922e-05, + "loss": 2.9431, + "step": 18202 + }, + { + "epoch": 0.8474986614521498, + "grad_norm": 0.3507456748839829, + "learning_rate": 9.019297729863668e-05, + "loss": 2.8481, + "step": 18203 + }, + { + "epoch": 0.8475452196382429, + "grad_norm": 0.3359127528258448, + "learning_rate": 9.01913660312617e-05, + "loss": 2.9251, + "step": 18204 + }, + { + "epoch": 0.8475917778243359, + "grad_norm": 0.3752820052541005, + "learning_rate": 9.018975464592903e-05, + "loss": 2.9646, + "step": 18205 + }, + { + "epoch": 0.847638336010429, + "grad_norm": 0.346811235043411, + "learning_rate": 9.018814314264339e-05, + "loss": 3.0503, + "step": 18206 + }, + { + "epoch": 0.8476848941965222, + "grad_norm": 0.34063340491406824, + "learning_rate": 9.018653152140951e-05, + "loss": 2.9382, + "step": 18207 + }, + { + "epoch": 0.8477314523826152, + "grad_norm": 0.3629758089990455, + "learning_rate": 9.01849197822321e-05, + "loss": 2.9462, + "step": 18208 + }, + { + "epoch": 0.8477780105687083, + "grad_norm": 0.371336845062001, + "learning_rate": 9.018330792511593e-05, + "loss": 2.9814, + "step": 18209 + }, + { + "epoch": 0.8478245687548013, + "grad_norm": 0.3210782639966069, + "learning_rate": 9.01816959500657e-05, + "loss": 2.8894, + "step": 18210 + }, + { + "epoch": 
0.8478711269408944, + "grad_norm": 0.3440075134469198, + "learning_rate": 9.018008385708615e-05, + "loss": 2.8582, + "step": 18211 + }, + { + "epoch": 0.8479176851269874, + "grad_norm": 0.3296133775571454, + "learning_rate": 9.017847164618203e-05, + "loss": 2.9757, + "step": 18212 + }, + { + "epoch": 0.8479642433130805, + "grad_norm": 0.29949281458490395, + "learning_rate": 9.017685931735803e-05, + "loss": 2.9203, + "step": 18213 + }, + { + "epoch": 0.8480108014991736, + "grad_norm": 0.34615333014916266, + "learning_rate": 9.017524687061891e-05, + "loss": 2.9185, + "step": 18214 + }, + { + "epoch": 0.8480573596852666, + "grad_norm": 0.3360623366840416, + "learning_rate": 9.017363430596941e-05, + "loss": 3.0039, + "step": 18215 + }, + { + "epoch": 0.8481039178713597, + "grad_norm": 0.3799208137923537, + "learning_rate": 9.017202162341424e-05, + "loss": 3.0102, + "step": 18216 + }, + { + "epoch": 0.8481504760574528, + "grad_norm": 0.34920377272319997, + "learning_rate": 9.017040882295815e-05, + "loss": 2.9029, + "step": 18217 + }, + { + "epoch": 0.8481970342435459, + "grad_norm": 0.3505269802515853, + "learning_rate": 9.016879590460587e-05, + "loss": 2.9852, + "step": 18218 + }, + { + "epoch": 0.848243592429639, + "grad_norm": 0.3183579717364399, + "learning_rate": 9.016718286836213e-05, + "loss": 2.8392, + "step": 18219 + }, + { + "epoch": 0.848290150615732, + "grad_norm": 0.3614130427025798, + "learning_rate": 9.016556971423167e-05, + "loss": 2.8532, + "step": 18220 + }, + { + "epoch": 0.8483367088018251, + "grad_norm": 0.3684782871361742, + "learning_rate": 9.016395644221922e-05, + "loss": 2.8926, + "step": 18221 + }, + { + "epoch": 0.8483832669879181, + "grad_norm": 0.3424705248524892, + "learning_rate": 9.01623430523295e-05, + "loss": 2.9937, + "step": 18222 + }, + { + "epoch": 0.8484298251740112, + "grad_norm": 0.37969241696625555, + "learning_rate": 9.016072954456727e-05, + "loss": 2.9683, + "step": 18223 + }, + { + "epoch": 0.8484763833601043, + "grad_norm": 0.35000778498258733, + "learning_rate": 9.015911591893725e-05, + "loss": 3.0667, + "step": 18224 + }, + { + "epoch": 0.8485229415461973, + "grad_norm": 0.3071641195167383, + "learning_rate": 9.015750217544418e-05, + "loss": 3.0012, + "step": 18225 + }, + { + "epoch": 0.8485694997322905, + "grad_norm": 0.3262813142007471, + "learning_rate": 9.015588831409281e-05, + "loss": 2.8871, + "step": 18226 + }, + { + "epoch": 0.8486160579183835, + "grad_norm": 0.2990422924761776, + "learning_rate": 9.015427433488786e-05, + "loss": 3.038, + "step": 18227 + }, + { + "epoch": 0.8486626161044766, + "grad_norm": 0.31725972474705944, + "learning_rate": 9.015266023783406e-05, + "loss": 2.8296, + "step": 18228 + }, + { + "epoch": 0.8487091742905697, + "grad_norm": 0.32493451771552617, + "learning_rate": 9.015104602293617e-05, + "loss": 3.0066, + "step": 18229 + }, + { + "epoch": 0.8487557324766627, + "grad_norm": 0.31740691062578424, + "learning_rate": 9.014943169019892e-05, + "loss": 3.0177, + "step": 18230 + }, + { + "epoch": 0.8488022906627558, + "grad_norm": 0.3444169661323378, + "learning_rate": 9.014781723962702e-05, + "loss": 2.923, + "step": 18231 + }, + { + "epoch": 0.8488488488488488, + "grad_norm": 0.33415012430482804, + "learning_rate": 9.014620267122525e-05, + "loss": 2.9571, + "step": 18232 + }, + { + "epoch": 0.8488954070349419, + "grad_norm": 0.33036686241249447, + "learning_rate": 9.014458798499832e-05, + "loss": 2.9671, + "step": 18233 + }, + { + "epoch": 0.8489419652210349, + "grad_norm": 0.3584178288832922, + "learning_rate": 
9.014297318095099e-05, + "loss": 2.9002, + "step": 18234 + }, + { + "epoch": 0.848988523407128, + "grad_norm": 0.3506532642446707, + "learning_rate": 9.014135825908798e-05, + "loss": 2.9125, + "step": 18235 + }, + { + "epoch": 0.8490350815932212, + "grad_norm": 0.3147036896870176, + "learning_rate": 9.013974321941404e-05, + "loss": 2.9051, + "step": 18236 + }, + { + "epoch": 0.8490816397793142, + "grad_norm": 0.3128055493472021, + "learning_rate": 9.013812806193391e-05, + "loss": 2.8998, + "step": 18237 + }, + { + "epoch": 0.8491281979654073, + "grad_norm": 0.31890813116737227, + "learning_rate": 9.013651278665233e-05, + "loss": 2.843, + "step": 18238 + }, + { + "epoch": 0.8491747561515003, + "grad_norm": 0.3338876515036056, + "learning_rate": 9.013489739357403e-05, + "loss": 2.9551, + "step": 18239 + }, + { + "epoch": 0.8492213143375934, + "grad_norm": 0.310252786629355, + "learning_rate": 9.013328188270378e-05, + "loss": 2.9297, + "step": 18240 + }, + { + "epoch": 0.8492678725236865, + "grad_norm": 0.33667895627404226, + "learning_rate": 9.013166625404627e-05, + "loss": 2.9069, + "step": 18241 + }, + { + "epoch": 0.8493144307097795, + "grad_norm": 0.3415014731631834, + "learning_rate": 9.01300505076063e-05, + "loss": 2.7803, + "step": 18242 + }, + { + "epoch": 0.8493609888958726, + "grad_norm": 0.3141823324945243, + "learning_rate": 9.012843464338858e-05, + "loss": 2.9733, + "step": 18243 + }, + { + "epoch": 0.8494075470819656, + "grad_norm": 0.35023395513313904, + "learning_rate": 9.012681866139785e-05, + "loss": 2.9858, + "step": 18244 + }, + { + "epoch": 0.8494541052680588, + "grad_norm": 0.32484286220193237, + "learning_rate": 9.012520256163887e-05, + "loss": 2.9296, + "step": 18245 + }, + { + "epoch": 0.8495006634541519, + "grad_norm": 0.3639370255746988, + "learning_rate": 9.012358634411637e-05, + "loss": 2.8685, + "step": 18246 + }, + { + "epoch": 0.8495472216402449, + "grad_norm": 0.3020255832786504, + "learning_rate": 9.012197000883508e-05, + "loss": 2.9281, + "step": 18247 + }, + { + "epoch": 0.849593779826338, + "grad_norm": 0.387005794543261, + "learning_rate": 9.012035355579979e-05, + "loss": 2.9795, + "step": 18248 + }, + { + "epoch": 0.849640338012431, + "grad_norm": 0.3167528911963803, + "learning_rate": 9.011873698501519e-05, + "loss": 3.0263, + "step": 18249 + }, + { + "epoch": 0.8496868961985241, + "grad_norm": 0.3172858558402532, + "learning_rate": 9.011712029648607e-05, + "loss": 2.9403, + "step": 18250 + }, + { + "epoch": 0.8497334543846172, + "grad_norm": 0.31935889836095965, + "learning_rate": 9.011550349021713e-05, + "loss": 2.9196, + "step": 18251 + }, + { + "epoch": 0.8497800125707102, + "grad_norm": 0.35049466476672414, + "learning_rate": 9.011388656621315e-05, + "loss": 2.9137, + "step": 18252 + }, + { + "epoch": 0.8498265707568033, + "grad_norm": 0.31359776389228716, + "learning_rate": 9.011226952447889e-05, + "loss": 2.9415, + "step": 18253 + }, + { + "epoch": 0.8498731289428963, + "grad_norm": 0.3558051983231005, + "learning_rate": 9.011065236501904e-05, + "loss": 2.9082, + "step": 18254 + }, + { + "epoch": 0.8499196871289895, + "grad_norm": 0.31736117818004367, + "learning_rate": 9.010903508783838e-05, + "loss": 2.9542, + "step": 18255 + }, + { + "epoch": 0.8499662453150825, + "grad_norm": 0.34971215187954174, + "learning_rate": 9.010741769294166e-05, + "loss": 3.0405, + "step": 18256 + }, + { + "epoch": 0.8500128035011756, + "grad_norm": 0.33879602802759357, + "learning_rate": 9.010580018033362e-05, + "loss": 2.845, + "step": 18257 + }, + { + "epoch": 
0.8500593616872687, + "grad_norm": 0.3544227387235665, + "learning_rate": 9.010418255001901e-05, + "loss": 2.918, + "step": 18258 + }, + { + "epoch": 0.8501059198733617, + "grad_norm": 0.34721643499018845, + "learning_rate": 9.010256480200257e-05, + "loss": 3.0567, + "step": 18259 + }, + { + "epoch": 0.8501524780594548, + "grad_norm": 0.34878314180108067, + "learning_rate": 9.010094693628906e-05, + "loss": 2.9537, + "step": 18260 + }, + { + "epoch": 0.8501990362455478, + "grad_norm": 0.35900535481663276, + "learning_rate": 9.009932895288322e-05, + "loss": 2.8914, + "step": 18261 + }, + { + "epoch": 0.8502455944316409, + "grad_norm": 0.3520807540502296, + "learning_rate": 9.00977108517898e-05, + "loss": 2.821, + "step": 18262 + }, + { + "epoch": 0.850292152617734, + "grad_norm": 0.34036943185365376, + "learning_rate": 9.009609263301352e-05, + "loss": 2.961, + "step": 18263 + }, + { + "epoch": 0.8503387108038271, + "grad_norm": 0.3698715205032627, + "learning_rate": 9.009447429655919e-05, + "loss": 2.9064, + "step": 18264 + }, + { + "epoch": 0.8503852689899202, + "grad_norm": 0.2985952647613832, + "learning_rate": 9.009285584243153e-05, + "loss": 2.8372, + "step": 18265 + }, + { + "epoch": 0.8504318271760132, + "grad_norm": 0.38936020688206296, + "learning_rate": 9.009123727063528e-05, + "loss": 3.0734, + "step": 18266 + }, + { + "epoch": 0.8504783853621063, + "grad_norm": 0.32344573024589574, + "learning_rate": 9.00896185811752e-05, + "loss": 2.9701, + "step": 18267 + }, + { + "epoch": 0.8505249435481994, + "grad_norm": 0.35867649647953775, + "learning_rate": 9.008799977405603e-05, + "loss": 2.9196, + "step": 18268 + }, + { + "epoch": 0.8505715017342924, + "grad_norm": 0.3342517060129866, + "learning_rate": 9.008638084928254e-05, + "loss": 2.9003, + "step": 18269 + }, + { + "epoch": 0.8506180599203855, + "grad_norm": 0.35963000073531987, + "learning_rate": 9.008476180685947e-05, + "loss": 2.9189, + "step": 18270 + }, + { + "epoch": 0.8506646181064785, + "grad_norm": 0.33845098063939144, + "learning_rate": 9.008314264679157e-05, + "loss": 2.9402, + "step": 18271 + }, + { + "epoch": 0.8507111762925716, + "grad_norm": 0.33544488234834646, + "learning_rate": 9.008152336908359e-05, + "loss": 2.9251, + "step": 18272 + }, + { + "epoch": 0.8507577344786648, + "grad_norm": 0.34372287298615184, + "learning_rate": 9.00799039737403e-05, + "loss": 2.8685, + "step": 18273 + }, + { + "epoch": 0.8508042926647578, + "grad_norm": 0.3225371113177748, + "learning_rate": 9.007828446076643e-05, + "loss": 2.9755, + "step": 18274 + }, + { + "epoch": 0.8508508508508509, + "grad_norm": 0.32451930214943747, + "learning_rate": 9.007666483016675e-05, + "loss": 2.7162, + "step": 18275 + }, + { + "epoch": 0.8508974090369439, + "grad_norm": 0.33993093736991475, + "learning_rate": 9.0075045081946e-05, + "loss": 2.971, + "step": 18276 + }, + { + "epoch": 0.850943967223037, + "grad_norm": 0.34877894822398225, + "learning_rate": 9.007342521610895e-05, + "loss": 2.9087, + "step": 18277 + }, + { + "epoch": 0.85099052540913, + "grad_norm": 0.34155889271766265, + "learning_rate": 9.007180523266035e-05, + "loss": 3.0011, + "step": 18278 + }, + { + "epoch": 0.8510370835952231, + "grad_norm": 0.3224451901906934, + "learning_rate": 9.007018513160492e-05, + "loss": 2.8444, + "step": 18279 + }, + { + "epoch": 0.8510836417813162, + "grad_norm": 0.35003856835246583, + "learning_rate": 9.006856491294747e-05, + "loss": 2.8385, + "step": 18280 + }, + { + "epoch": 0.8511301999674092, + "grad_norm": 0.3216634736005414, + "learning_rate": 
9.006694457669273e-05, + "loss": 2.8793, + "step": 18281 + }, + { + "epoch": 0.8511767581535024, + "grad_norm": 0.3574256000455739, + "learning_rate": 9.006532412284544e-05, + "loss": 2.9944, + "step": 18282 + }, + { + "epoch": 0.8512233163395954, + "grad_norm": 0.3363115377294538, + "learning_rate": 9.006370355141039e-05, + "loss": 2.9298, + "step": 18283 + }, + { + "epoch": 0.8512698745256885, + "grad_norm": 0.33753994866533066, + "learning_rate": 9.00620828623923e-05, + "loss": 2.9011, + "step": 18284 + }, + { + "epoch": 0.8513164327117816, + "grad_norm": 0.3474116362881622, + "learning_rate": 9.006046205579594e-05, + "loss": 2.9123, + "step": 18285 + }, + { + "epoch": 0.8513629908978746, + "grad_norm": 0.3704819768533425, + "learning_rate": 9.005884113162609e-05, + "loss": 2.9599, + "step": 18286 + }, + { + "epoch": 0.8514095490839677, + "grad_norm": 0.3609608953771123, + "learning_rate": 9.005722008988747e-05, + "loss": 2.8869, + "step": 18287 + }, + { + "epoch": 0.8514561072700607, + "grad_norm": 0.36494142741644997, + "learning_rate": 9.005559893058486e-05, + "loss": 2.9105, + "step": 18288 + }, + { + "epoch": 0.8515026654561538, + "grad_norm": 0.3520396028284471, + "learning_rate": 9.005397765372302e-05, + "loss": 2.8802, + "step": 18289 + }, + { + "epoch": 0.8515492236422469, + "grad_norm": 0.38164974287474995, + "learning_rate": 9.005235625930671e-05, + "loss": 2.9166, + "step": 18290 + }, + { + "epoch": 0.85159578182834, + "grad_norm": 0.38933055934850314, + "learning_rate": 9.005073474734065e-05, + "loss": 2.9303, + "step": 18291 + }, + { + "epoch": 0.8516423400144331, + "grad_norm": 0.34580082199493234, + "learning_rate": 9.004911311782966e-05, + "loss": 3.0072, + "step": 18292 + }, + { + "epoch": 0.8516888982005261, + "grad_norm": 0.39878495202649905, + "learning_rate": 9.004749137077846e-05, + "loss": 2.9824, + "step": 18293 + }, + { + "epoch": 0.8517354563866192, + "grad_norm": 0.35818497436507135, + "learning_rate": 9.00458695061918e-05, + "loss": 2.9375, + "step": 18294 + }, + { + "epoch": 0.8517820145727123, + "grad_norm": 0.40624011586050707, + "learning_rate": 9.004424752407447e-05, + "loss": 2.923, + "step": 18295 + }, + { + "epoch": 0.8518285727588053, + "grad_norm": 0.4474073831243236, + "learning_rate": 9.004262542443123e-05, + "loss": 3.0114, + "step": 18296 + }, + { + "epoch": 0.8518751309448984, + "grad_norm": 0.3444047545392098, + "learning_rate": 9.004100320726682e-05, + "loss": 3.0375, + "step": 18297 + }, + { + "epoch": 0.8519216891309914, + "grad_norm": 0.3847829144554242, + "learning_rate": 9.003938087258602e-05, + "loss": 2.9297, + "step": 18298 + }, + { + "epoch": 0.8519682473170845, + "grad_norm": 0.3689478623183743, + "learning_rate": 9.003775842039356e-05, + "loss": 2.991, + "step": 18299 + }, + { + "epoch": 0.8520148055031775, + "grad_norm": 0.3708389296512033, + "learning_rate": 9.003613585069424e-05, + "loss": 2.8654, + "step": 18300 + }, + { + "epoch": 0.8520613636892707, + "grad_norm": 0.34380475425500784, + "learning_rate": 9.003451316349279e-05, + "loss": 2.815, + "step": 18301 + }, + { + "epoch": 0.8521079218753638, + "grad_norm": 0.3514922175186565, + "learning_rate": 9.003289035879401e-05, + "loss": 2.9752, + "step": 18302 + }, + { + "epoch": 0.8521544800614568, + "grad_norm": 0.3897803277120922, + "learning_rate": 9.003126743660261e-05, + "loss": 2.8759, + "step": 18303 + }, + { + "epoch": 0.8522010382475499, + "grad_norm": 0.32079196862992504, + "learning_rate": 9.002964439692342e-05, + "loss": 2.8889, + "step": 18304 + }, + { + "epoch": 
0.8522475964336429, + "grad_norm": 0.3520441703124837, + "learning_rate": 9.002802123976116e-05, + "loss": 2.8631, + "step": 18305 + }, + { + "epoch": 0.852294154619736, + "grad_norm": 0.34963912035564826, + "learning_rate": 9.002639796512058e-05, + "loss": 2.9061, + "step": 18306 + }, + { + "epoch": 0.8523407128058291, + "grad_norm": 0.3513369258129817, + "learning_rate": 9.002477457300646e-05, + "loss": 2.7914, + "step": 18307 + }, + { + "epoch": 0.8523872709919221, + "grad_norm": 0.3308891822214276, + "learning_rate": 9.00231510634236e-05, + "loss": 2.8164, + "step": 18308 + }, + { + "epoch": 0.8524338291780152, + "grad_norm": 0.324289944320352, + "learning_rate": 9.002152743637673e-05, + "loss": 2.9157, + "step": 18309 + }, + { + "epoch": 0.8524803873641082, + "grad_norm": 0.3408032988602625, + "learning_rate": 9.00199036918706e-05, + "loss": 2.8875, + "step": 18310 + }, + { + "epoch": 0.8525269455502014, + "grad_norm": 0.35916321907216603, + "learning_rate": 9.001827982991001e-05, + "loss": 2.9071, + "step": 18311 + }, + { + "epoch": 0.8525735037362945, + "grad_norm": 0.3222592724738762, + "learning_rate": 9.00166558504997e-05, + "loss": 2.9004, + "step": 18312 + }, + { + "epoch": 0.8526200619223875, + "grad_norm": 0.34713790623896024, + "learning_rate": 9.001503175364446e-05, + "loss": 2.9915, + "step": 18313 + }, + { + "epoch": 0.8526666201084806, + "grad_norm": 0.299734170794354, + "learning_rate": 9.001340753934904e-05, + "loss": 2.8884, + "step": 18314 + }, + { + "epoch": 0.8527131782945736, + "grad_norm": 0.33219340133293923, + "learning_rate": 9.00117832076182e-05, + "loss": 2.9568, + "step": 18315 + }, + { + "epoch": 0.8527597364806667, + "grad_norm": 0.33089437349736806, + "learning_rate": 9.001015875845673e-05, + "loss": 2.7929, + "step": 18316 + }, + { + "epoch": 0.8528062946667598, + "grad_norm": 0.3288268813494238, + "learning_rate": 9.000853419186939e-05, + "loss": 2.9614, + "step": 18317 + }, + { + "epoch": 0.8528528528528528, + "grad_norm": 0.3714021291281448, + "learning_rate": 9.000690950786093e-05, + "loss": 2.9406, + "step": 18318 + }, + { + "epoch": 0.852899411038946, + "grad_norm": 0.2985227102857886, + "learning_rate": 9.000528470643614e-05, + "loss": 2.9465, + "step": 18319 + }, + { + "epoch": 0.852945969225039, + "grad_norm": 0.3550048524804364, + "learning_rate": 9.000365978759978e-05, + "loss": 2.8636, + "step": 18320 + }, + { + "epoch": 0.8529925274111321, + "grad_norm": 0.317589183984466, + "learning_rate": 9.000203475135663e-05, + "loss": 2.9631, + "step": 18321 + }, + { + "epoch": 0.8530390855972251, + "grad_norm": 0.36479842567821236, + "learning_rate": 9.000040959771143e-05, + "loss": 2.8001, + "step": 18322 + }, + { + "epoch": 0.8530856437833182, + "grad_norm": 0.32878596663546844, + "learning_rate": 8.999878432666898e-05, + "loss": 2.8159, + "step": 18323 + }, + { + "epoch": 0.8531322019694113, + "grad_norm": 0.32146748730692004, + "learning_rate": 8.999715893823403e-05, + "loss": 2.9329, + "step": 18324 + }, + { + "epoch": 0.8531787601555043, + "grad_norm": 0.37347408668656396, + "learning_rate": 8.999553343241136e-05, + "loss": 2.8524, + "step": 18325 + }, + { + "epoch": 0.8532253183415974, + "grad_norm": 0.32138190925421384, + "learning_rate": 8.999390780920574e-05, + "loss": 2.9833, + "step": 18326 + }, + { + "epoch": 0.8532718765276904, + "grad_norm": 0.3768424665343359, + "learning_rate": 8.999228206862195e-05, + "loss": 2.9373, + "step": 18327 + }, + { + "epoch": 0.8533184347137835, + "grad_norm": 0.29457393418936045, + "learning_rate": 
8.999065621066474e-05, + "loss": 2.9843, + "step": 18328 + }, + { + "epoch": 0.8533649928998767, + "grad_norm": 0.35233806835290754, + "learning_rate": 8.998903023533891e-05, + "loss": 2.9652, + "step": 18329 + }, + { + "epoch": 0.8534115510859697, + "grad_norm": 0.29206601359173184, + "learning_rate": 8.998740414264919e-05, + "loss": 2.89, + "step": 18330 + }, + { + "epoch": 0.8534581092720628, + "grad_norm": 0.3204297551966009, + "learning_rate": 8.99857779326004e-05, + "loss": 2.9753, + "step": 18331 + }, + { + "epoch": 0.8535046674581558, + "grad_norm": 0.3400777988725769, + "learning_rate": 8.998415160519729e-05, + "loss": 2.9809, + "step": 18332 + }, + { + "epoch": 0.8535512256442489, + "grad_norm": 0.3448185835559183, + "learning_rate": 8.998252516044462e-05, + "loss": 2.8548, + "step": 18333 + }, + { + "epoch": 0.853597783830342, + "grad_norm": 0.36764998786603986, + "learning_rate": 8.99808985983472e-05, + "loss": 2.9781, + "step": 18334 + }, + { + "epoch": 0.853644342016435, + "grad_norm": 0.3335491534699804, + "learning_rate": 8.997927191890975e-05, + "loss": 2.8944, + "step": 18335 + }, + { + "epoch": 0.8536909002025281, + "grad_norm": 0.380257932870319, + "learning_rate": 8.99776451221371e-05, + "loss": 2.9401, + "step": 18336 + }, + { + "epoch": 0.8537374583886211, + "grad_norm": 0.3288860244012082, + "learning_rate": 8.997601820803399e-05, + "loss": 2.9809, + "step": 18337 + }, + { + "epoch": 0.8537840165747143, + "grad_norm": 0.3846726467772733, + "learning_rate": 8.997439117660522e-05, + "loss": 2.9618, + "step": 18338 + }, + { + "epoch": 0.8538305747608074, + "grad_norm": 0.3432416819599695, + "learning_rate": 8.997276402785552e-05, + "loss": 2.9663, + "step": 18339 + }, + { + "epoch": 0.8538771329469004, + "grad_norm": 0.3754307294974221, + "learning_rate": 8.997113676178971e-05, + "loss": 2.8796, + "step": 18340 + }, + { + "epoch": 0.8539236911329935, + "grad_norm": 0.33109262876033013, + "learning_rate": 8.996950937841257e-05, + "loss": 2.8924, + "step": 18341 + }, + { + "epoch": 0.8539702493190865, + "grad_norm": 0.3522543309166716, + "learning_rate": 8.996788187772884e-05, + "loss": 2.8724, + "step": 18342 + }, + { + "epoch": 0.8540168075051796, + "grad_norm": 0.378939280835721, + "learning_rate": 8.996625425974332e-05, + "loss": 2.9331, + "step": 18343 + }, + { + "epoch": 0.8540633656912726, + "grad_norm": 0.34075546390914946, + "learning_rate": 8.996462652446078e-05, + "loss": 2.8793, + "step": 18344 + }, + { + "epoch": 0.8541099238773657, + "grad_norm": 0.33782374009074834, + "learning_rate": 8.9962998671886e-05, + "loss": 2.9782, + "step": 18345 + }, + { + "epoch": 0.8541564820634588, + "grad_norm": 0.3329047131950988, + "learning_rate": 8.996137070202375e-05, + "loss": 2.918, + "step": 18346 + }, + { + "epoch": 0.8542030402495518, + "grad_norm": 0.40147031205293177, + "learning_rate": 8.995974261487881e-05, + "loss": 2.9441, + "step": 18347 + }, + { + "epoch": 0.854249598435645, + "grad_norm": 0.31441959635696004, + "learning_rate": 8.995811441045596e-05, + "loss": 2.9407, + "step": 18348 + }, + { + "epoch": 0.854296156621738, + "grad_norm": 0.3595617597956019, + "learning_rate": 8.995648608876e-05, + "loss": 2.9108, + "step": 18349 + }, + { + "epoch": 0.8543427148078311, + "grad_norm": 0.31942116647811847, + "learning_rate": 8.995485764979568e-05, + "loss": 2.8729, + "step": 18350 + }, + { + "epoch": 0.8543892729939242, + "grad_norm": 0.33317279954599993, + "learning_rate": 8.995322909356778e-05, + "loss": 2.9215, + "step": 18351 + }, + { + "epoch": 
0.8544358311800172, + "grad_norm": 0.3543446110472703, + "learning_rate": 8.99516004200811e-05, + "loss": 2.9384, + "step": 18352 + }, + { + "epoch": 0.8544823893661103, + "grad_norm": 0.3159885425176962, + "learning_rate": 8.99499716293404e-05, + "loss": 2.8271, + "step": 18353 + }, + { + "epoch": 0.8545289475522033, + "grad_norm": 0.3855508054423656, + "learning_rate": 8.994834272135049e-05, + "loss": 2.9157, + "step": 18354 + }, + { + "epoch": 0.8545755057382964, + "grad_norm": 0.3431457953665573, + "learning_rate": 8.99467136961161e-05, + "loss": 2.9167, + "step": 18355 + }, + { + "epoch": 0.8546220639243896, + "grad_norm": 0.36280750324570904, + "learning_rate": 8.994508455364206e-05, + "loss": 2.7769, + "step": 18356 + }, + { + "epoch": 0.8546686221104826, + "grad_norm": 0.3429136116543241, + "learning_rate": 8.994345529393312e-05, + "loss": 2.9421, + "step": 18357 + }, + { + "epoch": 0.8547151802965757, + "grad_norm": 0.34165877072963996, + "learning_rate": 8.994182591699408e-05, + "loss": 2.884, + "step": 18358 + }, + { + "epoch": 0.8547617384826687, + "grad_norm": 0.32891808524043903, + "learning_rate": 8.994019642282971e-05, + "loss": 2.9573, + "step": 18359 + }, + { + "epoch": 0.8548082966687618, + "grad_norm": 0.32447573088740805, + "learning_rate": 8.993856681144482e-05, + "loss": 2.9484, + "step": 18360 + }, + { + "epoch": 0.8548548548548549, + "grad_norm": 0.3462114656229551, + "learning_rate": 8.993693708284414e-05, + "loss": 2.8966, + "step": 18361 + }, + { + "epoch": 0.8549014130409479, + "grad_norm": 0.32719076348925075, + "learning_rate": 8.993530723703251e-05, + "loss": 2.9786, + "step": 18362 + }, + { + "epoch": 0.854947971227041, + "grad_norm": 0.34196555451552785, + "learning_rate": 8.993367727401467e-05, + "loss": 2.9668, + "step": 18363 + }, + { + "epoch": 0.854994529413134, + "grad_norm": 0.3044758717816712, + "learning_rate": 8.993204719379543e-05, + "loss": 2.8819, + "step": 18364 + }, + { + "epoch": 0.8550410875992271, + "grad_norm": 0.3574044197823544, + "learning_rate": 8.993041699637957e-05, + "loss": 2.9031, + "step": 18365 + }, + { + "epoch": 0.8550876457853201, + "grad_norm": 0.3734292784589405, + "learning_rate": 8.992878668177186e-05, + "loss": 2.9846, + "step": 18366 + }, + { + "epoch": 0.8551342039714133, + "grad_norm": 0.35571347107364437, + "learning_rate": 8.992715624997709e-05, + "loss": 2.9132, + "step": 18367 + }, + { + "epoch": 0.8551807621575064, + "grad_norm": 0.36525855397354423, + "learning_rate": 8.992552570100007e-05, + "loss": 2.9887, + "step": 18368 + }, + { + "epoch": 0.8552273203435994, + "grad_norm": 0.3607283824532416, + "learning_rate": 8.992389503484555e-05, + "loss": 2.9124, + "step": 18369 + }, + { + "epoch": 0.8552738785296925, + "grad_norm": 0.35035159117867626, + "learning_rate": 8.992226425151833e-05, + "loss": 2.9149, + "step": 18370 + }, + { + "epoch": 0.8553204367157855, + "grad_norm": 0.3448150028160705, + "learning_rate": 8.99206333510232e-05, + "loss": 3.0224, + "step": 18371 + }, + { + "epoch": 0.8553669949018786, + "grad_norm": 0.36696789903692906, + "learning_rate": 8.991900233336495e-05, + "loss": 2.9469, + "step": 18372 + }, + { + "epoch": 0.8554135530879717, + "grad_norm": 0.32846333679962764, + "learning_rate": 8.991737119854837e-05, + "loss": 2.96, + "step": 18373 + }, + { + "epoch": 0.8554601112740647, + "grad_norm": 0.3583727232065614, + "learning_rate": 8.991573994657822e-05, + "loss": 2.9647, + "step": 18374 + }, + { + "epoch": 0.8555066694601579, + "grad_norm": 0.3372091645176304, + "learning_rate": 
8.99141085774593e-05, + "loss": 2.9404, + "step": 18375 + }, + { + "epoch": 0.8555532276462509, + "grad_norm": 0.3378975860561178, + "learning_rate": 8.991247709119643e-05, + "loss": 2.9131, + "step": 18376 + }, + { + "epoch": 0.855599785832344, + "grad_norm": 0.37375888262993, + "learning_rate": 8.991084548779434e-05, + "loss": 2.9852, + "step": 18377 + }, + { + "epoch": 0.8556463440184371, + "grad_norm": 0.32391256666546164, + "learning_rate": 8.990921376725786e-05, + "loss": 2.8848, + "step": 18378 + }, + { + "epoch": 0.8556929022045301, + "grad_norm": 0.34223087558182835, + "learning_rate": 8.990758192959179e-05, + "loss": 2.957, + "step": 18379 + }, + { + "epoch": 0.8557394603906232, + "grad_norm": 0.350103673476524, + "learning_rate": 8.990594997480089e-05, + "loss": 3.0341, + "step": 18380 + }, + { + "epoch": 0.8557860185767162, + "grad_norm": 0.33835754939200324, + "learning_rate": 8.990431790288995e-05, + "loss": 2.845, + "step": 18381 + }, + { + "epoch": 0.8558325767628093, + "grad_norm": 0.32450054168865433, + "learning_rate": 8.990268571386376e-05, + "loss": 2.9444, + "step": 18382 + }, + { + "epoch": 0.8558791349489024, + "grad_norm": 0.32829689164030146, + "learning_rate": 8.990105340772714e-05, + "loss": 2.917, + "step": 18383 + }, + { + "epoch": 0.8559256931349954, + "grad_norm": 0.361896103908076, + "learning_rate": 8.989942098448485e-05, + "loss": 2.9529, + "step": 18384 + }, + { + "epoch": 0.8559722513210886, + "grad_norm": 0.32414815652792783, + "learning_rate": 8.989778844414168e-05, + "loss": 2.9101, + "step": 18385 + }, + { + "epoch": 0.8560188095071816, + "grad_norm": 0.3604648106776289, + "learning_rate": 8.989615578670243e-05, + "loss": 2.8417, + "step": 18386 + }, + { + "epoch": 0.8560653676932747, + "grad_norm": 0.40683938721290847, + "learning_rate": 8.989452301217192e-05, + "loss": 2.909, + "step": 18387 + }, + { + "epoch": 0.8561119258793677, + "grad_norm": 0.3515155059135926, + "learning_rate": 8.98928901205549e-05, + "loss": 2.8841, + "step": 18388 + }, + { + "epoch": 0.8561584840654608, + "grad_norm": 0.362961497778179, + "learning_rate": 8.989125711185618e-05, + "loss": 3.0448, + "step": 18389 + }, + { + "epoch": 0.8562050422515539, + "grad_norm": 0.3435607951349229, + "learning_rate": 8.988962398608054e-05, + "loss": 2.8569, + "step": 18390 + }, + { + "epoch": 0.8562516004376469, + "grad_norm": 0.3186337863046207, + "learning_rate": 8.988799074323278e-05, + "loss": 2.9288, + "step": 18391 + }, + { + "epoch": 0.85629815862374, + "grad_norm": 0.32636686728019615, + "learning_rate": 8.988635738331772e-05, + "loss": 2.8962, + "step": 18392 + }, + { + "epoch": 0.856344716809833, + "grad_norm": 0.3223675305352209, + "learning_rate": 8.988472390634011e-05, + "loss": 2.8552, + "step": 18393 + }, + { + "epoch": 0.8563912749959262, + "grad_norm": 0.3814691857270534, + "learning_rate": 8.988309031230477e-05, + "loss": 2.9137, + "step": 18394 + }, + { + "epoch": 0.8564378331820193, + "grad_norm": 0.3060547143878417, + "learning_rate": 8.98814566012165e-05, + "loss": 2.8469, + "step": 18395 + }, + { + "epoch": 0.8564843913681123, + "grad_norm": 0.35504458325862137, + "learning_rate": 8.987982277308007e-05, + "loss": 2.9429, + "step": 18396 + }, + { + "epoch": 0.8565309495542054, + "grad_norm": 0.3131687551987398, + "learning_rate": 8.987818882790029e-05, + "loss": 2.9701, + "step": 18397 + }, + { + "epoch": 0.8565775077402984, + "grad_norm": 0.3286683988333798, + "learning_rate": 8.987655476568195e-05, + "loss": 2.928, + "step": 18398 + }, + { + "epoch": 
0.8566240659263915, + "grad_norm": 0.2978428969420335, + "learning_rate": 8.987492058642987e-05, + "loss": 2.7883, + "step": 18399 + }, + { + "epoch": 0.8566706241124846, + "grad_norm": 0.32156658888934603, + "learning_rate": 8.987328629014881e-05, + "loss": 2.8719, + "step": 18400 + }, + { + "epoch": 0.8567171822985776, + "grad_norm": 0.323043606548519, + "learning_rate": 8.987165187684358e-05, + "loss": 3.0399, + "step": 18401 + }, + { + "epoch": 0.8567637404846707, + "grad_norm": 0.3230109116082108, + "learning_rate": 8.987001734651899e-05, + "loss": 2.8501, + "step": 18402 + }, + { + "epoch": 0.8568102986707637, + "grad_norm": 0.33844204238839337, + "learning_rate": 8.986838269917983e-05, + "loss": 2.8253, + "step": 18403 + }, + { + "epoch": 0.8568568568568569, + "grad_norm": 0.3347324783695076, + "learning_rate": 8.986674793483089e-05, + "loss": 2.9523, + "step": 18404 + }, + { + "epoch": 0.85690341504295, + "grad_norm": 0.40546879971230626, + "learning_rate": 8.986511305347697e-05, + "loss": 2.7868, + "step": 18405 + }, + { + "epoch": 0.856949973229043, + "grad_norm": 0.322185867269771, + "learning_rate": 8.986347805512288e-05, + "loss": 2.9791, + "step": 18406 + }, + { + "epoch": 0.8569965314151361, + "grad_norm": 0.3671236770963455, + "learning_rate": 8.986184293977339e-05, + "loss": 2.8971, + "step": 18407 + }, + { + "epoch": 0.8570430896012291, + "grad_norm": 0.2966849446849853, + "learning_rate": 8.986020770743334e-05, + "loss": 2.9562, + "step": 18408 + }, + { + "epoch": 0.8570896477873222, + "grad_norm": 0.3948903832855221, + "learning_rate": 8.985857235810749e-05, + "loss": 2.8324, + "step": 18409 + }, + { + "epoch": 0.8571362059734152, + "grad_norm": 0.322168598115094, + "learning_rate": 8.985693689180066e-05, + "loss": 2.9866, + "step": 18410 + }, + { + "epoch": 0.8571827641595083, + "grad_norm": 0.40356664703489253, + "learning_rate": 8.985530130851765e-05, + "loss": 2.9948, + "step": 18411 + }, + { + "epoch": 0.8572293223456015, + "grad_norm": 0.3218866003395999, + "learning_rate": 8.985366560826324e-05, + "loss": 2.9384, + "step": 18412 + }, + { + "epoch": 0.8572758805316945, + "grad_norm": 0.4210762885505783, + "learning_rate": 8.985202979104228e-05, + "loss": 2.8267, + "step": 18413 + }, + { + "epoch": 0.8573224387177876, + "grad_norm": 0.3429307834799482, + "learning_rate": 8.985039385685952e-05, + "loss": 2.974, + "step": 18414 + }, + { + "epoch": 0.8573689969038806, + "grad_norm": 0.4035593686971687, + "learning_rate": 8.984875780571978e-05, + "loss": 2.9966, + "step": 18415 + }, + { + "epoch": 0.8574155550899737, + "grad_norm": 0.3582569185222458, + "learning_rate": 8.984712163762788e-05, + "loss": 2.8612, + "step": 18416 + }, + { + "epoch": 0.8574621132760668, + "grad_norm": 0.4323701060693347, + "learning_rate": 8.984548535258858e-05, + "loss": 3.0331, + "step": 18417 + }, + { + "epoch": 0.8575086714621598, + "grad_norm": 0.3892857555655189, + "learning_rate": 8.984384895060671e-05, + "loss": 3.0215, + "step": 18418 + }, + { + "epoch": 0.8575552296482529, + "grad_norm": 0.3712122997762089, + "learning_rate": 8.984221243168708e-05, + "loss": 2.9565, + "step": 18419 + }, + { + "epoch": 0.8576017878343459, + "grad_norm": 0.39813014303566874, + "learning_rate": 8.984057579583447e-05, + "loss": 2.8515, + "step": 18420 + }, + { + "epoch": 0.857648346020439, + "grad_norm": 0.32573305784823237, + "learning_rate": 8.98389390430537e-05, + "loss": 3.0372, + "step": 18421 + }, + { + "epoch": 0.8576949042065322, + "grad_norm": 0.3845767291021454, + "learning_rate": 
8.983730217334956e-05, + "loss": 2.8784, + "step": 18422 + }, + { + "epoch": 0.8577414623926252, + "grad_norm": 0.3447550731877963, + "learning_rate": 8.983566518672688e-05, + "loss": 2.9153, + "step": 18423 + }, + { + "epoch": 0.8577880205787183, + "grad_norm": 0.35853206736798515, + "learning_rate": 8.983402808319043e-05, + "loss": 2.8276, + "step": 18424 + }, + { + "epoch": 0.8578345787648113, + "grad_norm": 0.357574702007101, + "learning_rate": 8.983239086274505e-05, + "loss": 2.8005, + "step": 18425 + }, + { + "epoch": 0.8578811369509044, + "grad_norm": 0.3371220916536699, + "learning_rate": 8.983075352539552e-05, + "loss": 2.9265, + "step": 18426 + }, + { + "epoch": 0.8579276951369975, + "grad_norm": 0.34309205278368976, + "learning_rate": 8.982911607114664e-05, + "loss": 2.8522, + "step": 18427 + }, + { + "epoch": 0.8579742533230905, + "grad_norm": 0.3330694885179968, + "learning_rate": 8.982747850000324e-05, + "loss": 3.041, + "step": 18428 + }, + { + "epoch": 0.8580208115091836, + "grad_norm": 0.3633651282236478, + "learning_rate": 8.982584081197011e-05, + "loss": 3.0223, + "step": 18429 + }, + { + "epoch": 0.8580673696952766, + "grad_norm": 0.3407931453019004, + "learning_rate": 8.982420300705206e-05, + "loss": 2.9226, + "step": 18430 + }, + { + "epoch": 0.8581139278813698, + "grad_norm": 0.3161425727297444, + "learning_rate": 8.982256508525391e-05, + "loss": 2.9946, + "step": 18431 + }, + { + "epoch": 0.8581604860674628, + "grad_norm": 0.3683789301578218, + "learning_rate": 8.982092704658043e-05, + "loss": 2.9597, + "step": 18432 + }, + { + "epoch": 0.8582070442535559, + "grad_norm": 0.3157848200200527, + "learning_rate": 8.981928889103647e-05, + "loss": 2.9587, + "step": 18433 + }, + { + "epoch": 0.858253602439649, + "grad_norm": 0.33673918454694673, + "learning_rate": 8.981765061862682e-05, + "loss": 2.9661, + "step": 18434 + }, + { + "epoch": 0.858300160625742, + "grad_norm": 0.33892970967982655, + "learning_rate": 8.981601222935629e-05, + "loss": 2.9144, + "step": 18435 + }, + { + "epoch": 0.8583467188118351, + "grad_norm": 0.3118123076729622, + "learning_rate": 8.981437372322967e-05, + "loss": 2.9443, + "step": 18436 + }, + { + "epoch": 0.8583932769979281, + "grad_norm": 0.3289829112863725, + "learning_rate": 8.981273510025181e-05, + "loss": 2.993, + "step": 18437 + }, + { + "epoch": 0.8584398351840212, + "grad_norm": 0.3128565782641464, + "learning_rate": 8.981109636042747e-05, + "loss": 3.0667, + "step": 18438 + }, + { + "epoch": 0.8584863933701143, + "grad_norm": 0.2939611591025802, + "learning_rate": 8.98094575037615e-05, + "loss": 2.8773, + "step": 18439 + }, + { + "epoch": 0.8585329515562073, + "grad_norm": 0.30863873543383913, + "learning_rate": 8.980781853025868e-05, + "loss": 2.8141, + "step": 18440 + }, + { + "epoch": 0.8585795097423005, + "grad_norm": 0.30165482657771164, + "learning_rate": 8.980617943992384e-05, + "loss": 2.8567, + "step": 18441 + }, + { + "epoch": 0.8586260679283935, + "grad_norm": 0.33997766265714213, + "learning_rate": 8.980454023276179e-05, + "loss": 2.9731, + "step": 18442 + }, + { + "epoch": 0.8586726261144866, + "grad_norm": 0.3150219528529136, + "learning_rate": 8.980290090877734e-05, + "loss": 2.9224, + "step": 18443 + }, + { + "epoch": 0.8587191843005797, + "grad_norm": 0.30791454791689565, + "learning_rate": 8.98012614679753e-05, + "loss": 2.9184, + "step": 18444 + }, + { + "epoch": 0.8587657424866727, + "grad_norm": 0.34016024381320037, + "learning_rate": 8.979962191036046e-05, + "loss": 2.8282, + "step": 18445 + }, + { + "epoch": 
0.8588123006727658, + "grad_norm": 0.3264420503256077, + "learning_rate": 8.979798223593767e-05, + "loss": 2.9189, + "step": 18446 + }, + { + "epoch": 0.8588588588588588, + "grad_norm": 0.31605306560205215, + "learning_rate": 8.979634244471171e-05, + "loss": 2.8831, + "step": 18447 + }, + { + "epoch": 0.8589054170449519, + "grad_norm": 0.33045570459870716, + "learning_rate": 8.979470253668741e-05, + "loss": 2.9268, + "step": 18448 + }, + { + "epoch": 0.858951975231045, + "grad_norm": 0.33021626768608636, + "learning_rate": 8.979306251186958e-05, + "loss": 3.0042, + "step": 18449 + }, + { + "epoch": 0.858998533417138, + "grad_norm": 0.324967732152434, + "learning_rate": 8.979142237026304e-05, + "loss": 2.9245, + "step": 18450 + }, + { + "epoch": 0.8590450916032312, + "grad_norm": 0.3123309322086106, + "learning_rate": 8.978978211187258e-05, + "loss": 2.9679, + "step": 18451 + }, + { + "epoch": 0.8590916497893242, + "grad_norm": 0.3083923974015077, + "learning_rate": 8.978814173670302e-05, + "loss": 2.9615, + "step": 18452 + }, + { + "epoch": 0.8591382079754173, + "grad_norm": 0.3114490672842625, + "learning_rate": 8.97865012447592e-05, + "loss": 2.9452, + "step": 18453 + }, + { + "epoch": 0.8591847661615103, + "grad_norm": 0.3089411371306541, + "learning_rate": 8.978486063604592e-05, + "loss": 2.8311, + "step": 18454 + }, + { + "epoch": 0.8592313243476034, + "grad_norm": 0.33391227142866076, + "learning_rate": 8.978321991056799e-05, + "loss": 2.9732, + "step": 18455 + }, + { + "epoch": 0.8592778825336965, + "grad_norm": 0.34987746745548426, + "learning_rate": 8.978157906833023e-05, + "loss": 2.9126, + "step": 18456 + }, + { + "epoch": 0.8593244407197895, + "grad_norm": 0.3012933471104891, + "learning_rate": 8.977993810933745e-05, + "loss": 2.8997, + "step": 18457 + }, + { + "epoch": 0.8593709989058826, + "grad_norm": 0.35026855756974207, + "learning_rate": 8.977829703359448e-05, + "loss": 2.9843, + "step": 18458 + }, + { + "epoch": 0.8594175570919756, + "grad_norm": 0.3297350009273637, + "learning_rate": 8.977665584110613e-05, + "loss": 2.8513, + "step": 18459 + }, + { + "epoch": 0.8594641152780688, + "grad_norm": 0.3318023384750472, + "learning_rate": 8.97750145318772e-05, + "loss": 2.9199, + "step": 18460 + }, + { + "epoch": 0.8595106734641619, + "grad_norm": 0.32157422435655314, + "learning_rate": 8.977337310591252e-05, + "loss": 2.897, + "step": 18461 + }, + { + "epoch": 0.8595572316502549, + "grad_norm": 0.33472648099696495, + "learning_rate": 8.977173156321692e-05, + "loss": 2.9887, + "step": 18462 + }, + { + "epoch": 0.859603789836348, + "grad_norm": 0.3127946125530478, + "learning_rate": 8.977008990379521e-05, + "loss": 2.9052, + "step": 18463 + }, + { + "epoch": 0.859650348022441, + "grad_norm": 0.3169171505641734, + "learning_rate": 8.97684481276522e-05, + "loss": 2.995, + "step": 18464 + }, + { + "epoch": 0.8596969062085341, + "grad_norm": 0.3669896722264452, + "learning_rate": 8.97668062347927e-05, + "loss": 3.0789, + "step": 18465 + }, + { + "epoch": 0.8597434643946272, + "grad_norm": 0.3073892105268075, + "learning_rate": 8.976516422522155e-05, + "loss": 2.9283, + "step": 18466 + }, + { + "epoch": 0.8597900225807202, + "grad_norm": 0.3483193311635878, + "learning_rate": 8.976352209894355e-05, + "loss": 2.9282, + "step": 18467 + }, + { + "epoch": 0.8598365807668134, + "grad_norm": 0.32296843430937877, + "learning_rate": 8.976187985596354e-05, + "loss": 2.8882, + "step": 18468 + }, + { + "epoch": 0.8598831389529064, + "grad_norm": 0.31847657804865986, + "learning_rate": 
8.976023749628633e-05, + "loss": 2.8858, + "step": 18469 + }, + { + "epoch": 0.8599296971389995, + "grad_norm": 0.35676410659251095, + "learning_rate": 8.975859501991674e-05, + "loss": 2.942, + "step": 18470 + }, + { + "epoch": 0.8599762553250926, + "grad_norm": 0.32738565130556757, + "learning_rate": 8.975695242685959e-05, + "loss": 2.9781, + "step": 18471 + }, + { + "epoch": 0.8600228135111856, + "grad_norm": 0.39367992753852804, + "learning_rate": 8.975530971711969e-05, + "loss": 2.8954, + "step": 18472 + }, + { + "epoch": 0.8600693716972787, + "grad_norm": 0.3578195595430425, + "learning_rate": 8.975366689070189e-05, + "loss": 2.8111, + "step": 18473 + }, + { + "epoch": 0.8601159298833717, + "grad_norm": 0.311412250656821, + "learning_rate": 8.975202394761099e-05, + "loss": 2.8243, + "step": 18474 + }, + { + "epoch": 0.8601624880694648, + "grad_norm": 0.32611852821111953, + "learning_rate": 8.975038088785182e-05, + "loss": 2.8163, + "step": 18475 + }, + { + "epoch": 0.8602090462555578, + "grad_norm": 0.3179026919506469, + "learning_rate": 8.974873771142917e-05, + "loss": 2.9441, + "step": 18476 + }, + { + "epoch": 0.860255604441651, + "grad_norm": 0.3240986006345032, + "learning_rate": 8.974709441834792e-05, + "loss": 3.0262, + "step": 18477 + }, + { + "epoch": 0.8603021626277441, + "grad_norm": 0.2997409399801571, + "learning_rate": 8.974545100861286e-05, + "loss": 2.89, + "step": 18478 + }, + { + "epoch": 0.8603487208138371, + "grad_norm": 0.30654083085958467, + "learning_rate": 8.974380748222881e-05, + "loss": 2.9292, + "step": 18479 + }, + { + "epoch": 0.8603952789999302, + "grad_norm": 0.35597374696121975, + "learning_rate": 8.97421638392006e-05, + "loss": 2.8407, + "step": 18480 + }, + { + "epoch": 0.8604418371860232, + "grad_norm": 0.3224395968323069, + "learning_rate": 8.974052007953306e-05, + "loss": 2.8046, + "step": 18481 + }, + { + "epoch": 0.8604883953721163, + "grad_norm": 0.32011202790615845, + "learning_rate": 8.973887620323101e-05, + "loss": 2.833, + "step": 18482 + }, + { + "epoch": 0.8605349535582094, + "grad_norm": 0.36372616931198554, + "learning_rate": 8.973723221029926e-05, + "loss": 2.8545, + "step": 18483 + }, + { + "epoch": 0.8605815117443024, + "grad_norm": 0.34952784850786034, + "learning_rate": 8.973558810074267e-05, + "loss": 2.9586, + "step": 18484 + }, + { + "epoch": 0.8606280699303955, + "grad_norm": 0.3511431392241937, + "learning_rate": 8.973394387456603e-05, + "loss": 2.9863, + "step": 18485 + }, + { + "epoch": 0.8606746281164885, + "grad_norm": 0.32060161525733094, + "learning_rate": 8.973229953177418e-05, + "loss": 2.9, + "step": 18486 + }, + { + "epoch": 0.8607211863025817, + "grad_norm": 0.31789186442645573, + "learning_rate": 8.973065507237194e-05, + "loss": 2.7914, + "step": 18487 + }, + { + "epoch": 0.8607677444886748, + "grad_norm": 0.32574068922212185, + "learning_rate": 8.972901049636415e-05, + "loss": 2.8539, + "step": 18488 + }, + { + "epoch": 0.8608143026747678, + "grad_norm": 0.5644266916583782, + "learning_rate": 8.972736580375563e-05, + "loss": 2.9242, + "step": 18489 + }, + { + "epoch": 0.8608608608608609, + "grad_norm": 0.425714232633183, + "learning_rate": 8.972572099455118e-05, + "loss": 2.9694, + "step": 18490 + }, + { + "epoch": 0.8609074190469539, + "grad_norm": 0.3906228684467366, + "learning_rate": 8.972407606875567e-05, + "loss": 2.8821, + "step": 18491 + }, + { + "epoch": 0.860953977233047, + "grad_norm": 0.37404577285820595, + "learning_rate": 8.972243102637393e-05, + "loss": 2.952, + "step": 18492 + }, + { + "epoch": 
0.8610005354191401, + "grad_norm": 0.4261572219544081, + "learning_rate": 8.972078586741075e-05, + "loss": 2.961, + "step": 18493 + }, + { + "epoch": 0.8610470936052331, + "grad_norm": 0.34973350210633075, + "learning_rate": 8.971914059187099e-05, + "loss": 3.0315, + "step": 18494 + }, + { + "epoch": 0.8610936517913262, + "grad_norm": 0.3890685412111167, + "learning_rate": 8.971749519975945e-05, + "loss": 2.9888, + "step": 18495 + }, + { + "epoch": 0.8611402099774192, + "grad_norm": 0.3747351537770281, + "learning_rate": 8.971584969108098e-05, + "loss": 2.912, + "step": 18496 + }, + { + "epoch": 0.8611867681635124, + "grad_norm": 0.36927841477345, + "learning_rate": 8.971420406584039e-05, + "loss": 2.8668, + "step": 18497 + }, + { + "epoch": 0.8612333263496054, + "grad_norm": 0.38908521964002796, + "learning_rate": 8.971255832404254e-05, + "loss": 2.905, + "step": 18498 + }, + { + "epoch": 0.8612798845356985, + "grad_norm": 0.35972939761915707, + "learning_rate": 8.971091246569223e-05, + "loss": 2.9549, + "step": 18499 + }, + { + "epoch": 0.8613264427217916, + "grad_norm": 0.3678474363241, + "learning_rate": 8.970926649079432e-05, + "loss": 3.0515, + "step": 18500 + }, + { + "epoch": 0.8613730009078846, + "grad_norm": 0.3539086554456545, + "learning_rate": 8.970762039935361e-05, + "loss": 2.8593, + "step": 18501 + }, + { + "epoch": 0.8614195590939777, + "grad_norm": 0.3504319115590387, + "learning_rate": 8.970597419137496e-05, + "loss": 2.9786, + "step": 18502 + }, + { + "epoch": 0.8614661172800707, + "grad_norm": 0.37671127926424053, + "learning_rate": 8.970432786686318e-05, + "loss": 2.877, + "step": 18503 + }, + { + "epoch": 0.8615126754661638, + "grad_norm": 0.33465026008859383, + "learning_rate": 8.97026814258231e-05, + "loss": 2.9625, + "step": 18504 + }, + { + "epoch": 0.861559233652257, + "grad_norm": 0.39228075959378556, + "learning_rate": 8.970103486825958e-05, + "loss": 2.93, + "step": 18505 + }, + { + "epoch": 0.86160579183835, + "grad_norm": 0.3108209063563264, + "learning_rate": 8.969938819417743e-05, + "loss": 2.8468, + "step": 18506 + }, + { + "epoch": 0.8616523500244431, + "grad_norm": 0.4316611071321586, + "learning_rate": 8.969774140358148e-05, + "loss": 2.9992, + "step": 18507 + }, + { + "epoch": 0.8616989082105361, + "grad_norm": 0.32896250320133136, + "learning_rate": 8.969609449647658e-05, + "loss": 2.8561, + "step": 18508 + }, + { + "epoch": 0.8617454663966292, + "grad_norm": 0.3951885784236018, + "learning_rate": 8.969444747286754e-05, + "loss": 2.9424, + "step": 18509 + }, + { + "epoch": 0.8617920245827223, + "grad_norm": 0.38032988264188894, + "learning_rate": 8.969280033275921e-05, + "loss": 2.8879, + "step": 18510 + }, + { + "epoch": 0.8618385827688153, + "grad_norm": 0.33715870560892625, + "learning_rate": 8.969115307615642e-05, + "loss": 2.8798, + "step": 18511 + }, + { + "epoch": 0.8618851409549084, + "grad_norm": 0.35164087927184084, + "learning_rate": 8.968950570306402e-05, + "loss": 2.8913, + "step": 18512 + }, + { + "epoch": 0.8619316991410014, + "grad_norm": 0.3415531959929755, + "learning_rate": 8.968785821348682e-05, + "loss": 2.9116, + "step": 18513 + }, + { + "epoch": 0.8619782573270945, + "grad_norm": 0.3945310979705991, + "learning_rate": 8.968621060742966e-05, + "loss": 2.9316, + "step": 18514 + }, + { + "epoch": 0.8620248155131877, + "grad_norm": 0.3639698229845898, + "learning_rate": 8.968456288489739e-05, + "loss": 2.9253, + "step": 18515 + }, + { + "epoch": 0.8620713736992807, + "grad_norm": 0.42396707988087645, + "learning_rate": 
8.968291504589484e-05, + "loss": 2.9499, + "step": 18516 + }, + { + "epoch": 0.8621179318853738, + "grad_norm": 0.35478947673720335, + "learning_rate": 8.968126709042684e-05, + "loss": 3.0297, + "step": 18517 + }, + { + "epoch": 0.8621644900714668, + "grad_norm": 0.3546757564579201, + "learning_rate": 8.967961901849822e-05, + "loss": 2.9348, + "step": 18518 + }, + { + "epoch": 0.8622110482575599, + "grad_norm": 0.3682838947971121, + "learning_rate": 8.967797083011384e-05, + "loss": 2.9816, + "step": 18519 + }, + { + "epoch": 0.8622576064436529, + "grad_norm": 0.35461607734973677, + "learning_rate": 8.967632252527854e-05, + "loss": 2.9749, + "step": 18520 + }, + { + "epoch": 0.862304164629746, + "grad_norm": 0.31610697508712615, + "learning_rate": 8.96746741039971e-05, + "loss": 2.8511, + "step": 18521 + }, + { + "epoch": 0.8623507228158391, + "grad_norm": 0.37818069284207945, + "learning_rate": 8.967302556627443e-05, + "loss": 2.9266, + "step": 18522 + }, + { + "epoch": 0.8623972810019321, + "grad_norm": 0.31504814869029996, + "learning_rate": 8.967137691211533e-05, + "loss": 2.8905, + "step": 18523 + }, + { + "epoch": 0.8624438391880253, + "grad_norm": 0.36522357803088373, + "learning_rate": 8.966972814152465e-05, + "loss": 3.044, + "step": 18524 + }, + { + "epoch": 0.8624903973741183, + "grad_norm": 0.35155014841336446, + "learning_rate": 8.966807925450724e-05, + "loss": 2.9125, + "step": 18525 + }, + { + "epoch": 0.8625369555602114, + "grad_norm": 0.3664538254188207, + "learning_rate": 8.96664302510679e-05, + "loss": 3.0663, + "step": 18526 + }, + { + "epoch": 0.8625835137463045, + "grad_norm": 0.39225618644303933, + "learning_rate": 8.96647811312115e-05, + "loss": 2.9016, + "step": 18527 + }, + { + "epoch": 0.8626300719323975, + "grad_norm": 0.3242119701351025, + "learning_rate": 8.966313189494288e-05, + "loss": 2.8383, + "step": 18528 + }, + { + "epoch": 0.8626766301184906, + "grad_norm": 0.39874853461835563, + "learning_rate": 8.966148254226688e-05, + "loss": 2.8167, + "step": 18529 + }, + { + "epoch": 0.8627231883045836, + "grad_norm": 0.3683377064258173, + "learning_rate": 8.965983307318833e-05, + "loss": 2.9723, + "step": 18530 + }, + { + "epoch": 0.8627697464906767, + "grad_norm": 0.40861778065283333, + "learning_rate": 8.965818348771209e-05, + "loss": 2.8853, + "step": 18531 + }, + { + "epoch": 0.8628163046767698, + "grad_norm": 0.3687475287612424, + "learning_rate": 8.965653378584298e-05, + "loss": 2.9543, + "step": 18532 + }, + { + "epoch": 0.8628628628628628, + "grad_norm": 0.370991866118045, + "learning_rate": 8.965488396758585e-05, + "loss": 3.0052, + "step": 18533 + }, + { + "epoch": 0.862909421048956, + "grad_norm": 0.3415904570890771, + "learning_rate": 8.965323403294553e-05, + "loss": 2.8006, + "step": 18534 + }, + { + "epoch": 0.862955979235049, + "grad_norm": 0.35160165688706224, + "learning_rate": 8.965158398192689e-05, + "loss": 3.0298, + "step": 18535 + }, + { + "epoch": 0.8630025374211421, + "grad_norm": 0.39140355721076214, + "learning_rate": 8.964993381453475e-05, + "loss": 2.8826, + "step": 18536 + }, + { + "epoch": 0.8630490956072352, + "grad_norm": 0.3463775039406854, + "learning_rate": 8.964828353077397e-05, + "loss": 2.771, + "step": 18537 + }, + { + "epoch": 0.8630956537933282, + "grad_norm": 0.370302251714861, + "learning_rate": 8.964663313064938e-05, + "loss": 2.8768, + "step": 18538 + }, + { + "epoch": 0.8631422119794213, + "grad_norm": 0.3784040737159971, + "learning_rate": 8.964498261416581e-05, + "loss": 2.9404, + "step": 18539 + }, + { + "epoch": 
0.8631887701655143, + "grad_norm": 0.3597738541542105, + "learning_rate": 8.964333198132814e-05, + "loss": 2.9829, + "step": 18540 + }, + { + "epoch": 0.8632353283516074, + "grad_norm": 0.3576388899120742, + "learning_rate": 8.964168123214119e-05, + "loss": 2.9427, + "step": 18541 + }, + { + "epoch": 0.8632818865377004, + "grad_norm": 0.33558912012434644, + "learning_rate": 8.964003036660982e-05, + "loss": 2.8808, + "step": 18542 + }, + { + "epoch": 0.8633284447237936, + "grad_norm": 0.38156209067774766, + "learning_rate": 8.963837938473885e-05, + "loss": 2.8112, + "step": 18543 + }, + { + "epoch": 0.8633750029098867, + "grad_norm": 0.35636460324762165, + "learning_rate": 8.963672828653315e-05, + "loss": 2.9188, + "step": 18544 + }, + { + "epoch": 0.8634215610959797, + "grad_norm": 0.3668577155962532, + "learning_rate": 8.963507707199757e-05, + "loss": 2.9223, + "step": 18545 + }, + { + "epoch": 0.8634681192820728, + "grad_norm": 0.3798158867191399, + "learning_rate": 8.963342574113693e-05, + "loss": 2.913, + "step": 18546 + }, + { + "epoch": 0.8635146774681658, + "grad_norm": 0.35536032285144303, + "learning_rate": 8.963177429395608e-05, + "loss": 2.874, + "step": 18547 + }, + { + "epoch": 0.8635612356542589, + "grad_norm": 0.38336856244080725, + "learning_rate": 8.96301227304599e-05, + "loss": 2.9528, + "step": 18548 + }, + { + "epoch": 0.863607793840352, + "grad_norm": 0.3731877929285899, + "learning_rate": 8.96284710506532e-05, + "loss": 2.9159, + "step": 18549 + }, + { + "epoch": 0.863654352026445, + "grad_norm": 0.3368747585018297, + "learning_rate": 8.962681925454083e-05, + "loss": 2.8395, + "step": 18550 + }, + { + "epoch": 0.8637009102125381, + "grad_norm": 0.34568674695968454, + "learning_rate": 8.962516734212767e-05, + "loss": 2.8418, + "step": 18551 + }, + { + "epoch": 0.8637474683986311, + "grad_norm": 0.3187952040480072, + "learning_rate": 8.962351531341854e-05, + "loss": 2.9141, + "step": 18552 + }, + { + "epoch": 0.8637940265847243, + "grad_norm": 0.3177044142710347, + "learning_rate": 8.96218631684183e-05, + "loss": 2.9621, + "step": 18553 + }, + { + "epoch": 0.8638405847708174, + "grad_norm": 0.32144994020470785, + "learning_rate": 8.962021090713179e-05, + "loss": 2.9425, + "step": 18554 + }, + { + "epoch": 0.8638871429569104, + "grad_norm": 0.342714661684267, + "learning_rate": 8.961855852956386e-05, + "loss": 2.8451, + "step": 18555 + }, + { + "epoch": 0.8639337011430035, + "grad_norm": 0.3275133814536432, + "learning_rate": 8.961690603571937e-05, + "loss": 2.9147, + "step": 18556 + }, + { + "epoch": 0.8639802593290965, + "grad_norm": 0.3587182991775934, + "learning_rate": 8.961525342560318e-05, + "loss": 2.939, + "step": 18557 + }, + { + "epoch": 0.8640268175151896, + "grad_norm": 0.33636968079275353, + "learning_rate": 8.961360069922009e-05, + "loss": 2.8923, + "step": 18558 + }, + { + "epoch": 0.8640733757012827, + "grad_norm": 0.3511624045203218, + "learning_rate": 8.9611947856575e-05, + "loss": 3.025, + "step": 18559 + }, + { + "epoch": 0.8641199338873757, + "grad_norm": 0.3600918469093948, + "learning_rate": 8.961029489767276e-05, + "loss": 2.9874, + "step": 18560 + }, + { + "epoch": 0.8641664920734689, + "grad_norm": 0.35134960920223124, + "learning_rate": 8.960864182251819e-05, + "loss": 2.8556, + "step": 18561 + }, + { + "epoch": 0.8642130502595619, + "grad_norm": 0.3585755087537419, + "learning_rate": 8.960698863111616e-05, + "loss": 2.9081, + "step": 18562 + }, + { + "epoch": 0.864259608445655, + "grad_norm": 0.3348192497702477, + "learning_rate": 
8.960533532347152e-05, + "loss": 2.8156, + "step": 18563 + }, + { + "epoch": 0.864306166631748, + "grad_norm": 0.37608034145200003, + "learning_rate": 8.960368189958914e-05, + "loss": 3.0297, + "step": 18564 + }, + { + "epoch": 0.8643527248178411, + "grad_norm": 0.36758452590346846, + "learning_rate": 8.960202835947383e-05, + "loss": 2.9654, + "step": 18565 + }, + { + "epoch": 0.8643992830039342, + "grad_norm": 0.33579549307382406, + "learning_rate": 8.960037470313047e-05, + "loss": 2.8709, + "step": 18566 + }, + { + "epoch": 0.8644458411900272, + "grad_norm": 0.3815963254805284, + "learning_rate": 8.959872093056392e-05, + "loss": 2.9554, + "step": 18567 + }, + { + "epoch": 0.8644923993761203, + "grad_norm": 0.34912228342227275, + "learning_rate": 8.959706704177903e-05, + "loss": 2.8383, + "step": 18568 + }, + { + "epoch": 0.8645389575622133, + "grad_norm": 0.3690203764515234, + "learning_rate": 8.959541303678065e-05, + "loss": 2.934, + "step": 18569 + }, + { + "epoch": 0.8645855157483064, + "grad_norm": 0.3759279643845334, + "learning_rate": 8.959375891557362e-05, + "loss": 2.995, + "step": 18570 + }, + { + "epoch": 0.8646320739343996, + "grad_norm": 0.35192040880454734, + "learning_rate": 8.959210467816283e-05, + "loss": 2.8659, + "step": 18571 + }, + { + "epoch": 0.8646786321204926, + "grad_norm": 0.3778516268911305, + "learning_rate": 8.95904503245531e-05, + "loss": 2.9783, + "step": 18572 + }, + { + "epoch": 0.8647251903065857, + "grad_norm": 0.34113966452974376, + "learning_rate": 8.95887958547493e-05, + "loss": 2.8316, + "step": 18573 + }, + { + "epoch": 0.8647717484926787, + "grad_norm": 0.4021916294850911, + "learning_rate": 8.958714126875626e-05, + "loss": 2.8955, + "step": 18574 + }, + { + "epoch": 0.8648183066787718, + "grad_norm": 0.3418339070437841, + "learning_rate": 8.95854865665789e-05, + "loss": 2.9393, + "step": 18575 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.39422814035152093, + "learning_rate": 8.958383174822203e-05, + "loss": 2.9159, + "step": 18576 + }, + { + "epoch": 0.8649114230509579, + "grad_norm": 0.3648590485332312, + "learning_rate": 8.95821768136905e-05, + "loss": 3.0499, + "step": 18577 + }, + { + "epoch": 0.864957981237051, + "grad_norm": 0.31926501596663504, + "learning_rate": 8.958052176298918e-05, + "loss": 2.8727, + "step": 18578 + }, + { + "epoch": 0.865004539423144, + "grad_norm": 0.3328999633150425, + "learning_rate": 8.957886659612293e-05, + "loss": 2.9465, + "step": 18579 + }, + { + "epoch": 0.8650510976092372, + "grad_norm": 0.3659705423391428, + "learning_rate": 8.95772113130966e-05, + "loss": 3.0403, + "step": 18580 + }, + { + "epoch": 0.8650976557953303, + "grad_norm": 0.3240024350565999, + "learning_rate": 8.957555591391506e-05, + "loss": 2.958, + "step": 18581 + }, + { + "epoch": 0.8651442139814233, + "grad_norm": 0.34147002259008685, + "learning_rate": 8.957390039858317e-05, + "loss": 2.9646, + "step": 18582 + }, + { + "epoch": 0.8651907721675164, + "grad_norm": 0.336546342288567, + "learning_rate": 8.957224476710577e-05, + "loss": 3.0066, + "step": 18583 + }, + { + "epoch": 0.8652373303536094, + "grad_norm": 0.3265703212300091, + "learning_rate": 8.957058901948774e-05, + "loss": 2.8425, + "step": 18584 + }, + { + "epoch": 0.8652838885397025, + "grad_norm": 0.36772352616824044, + "learning_rate": 8.956893315573393e-05, + "loss": 2.9105, + "step": 18585 + }, + { + "epoch": 0.8653304467257955, + "grad_norm": 0.34973429322530447, + "learning_rate": 8.956727717584919e-05, + "loss": 2.8692, + "step": 18586 + }, + { + "epoch": 
0.8653770049118886, + "grad_norm": 0.3390822667268769, + "learning_rate": 8.95656210798384e-05, + "loss": 2.8641, + "step": 18587 + }, + { + "epoch": 0.8654235630979817, + "grad_norm": 0.3630062348434627, + "learning_rate": 8.95639648677064e-05, + "loss": 2.8955, + "step": 18588 + }, + { + "epoch": 0.8654701212840747, + "grad_norm": 0.3415602526296339, + "learning_rate": 8.956230853945806e-05, + "loss": 2.9759, + "step": 18589 + }, + { + "epoch": 0.8655166794701679, + "grad_norm": 0.33933144847379754, + "learning_rate": 8.956065209509825e-05, + "loss": 2.8468, + "step": 18590 + }, + { + "epoch": 0.8655632376562609, + "grad_norm": 0.3484539911314542, + "learning_rate": 8.955899553463181e-05, + "loss": 2.8938, + "step": 18591 + }, + { + "epoch": 0.865609795842354, + "grad_norm": 0.34982208847833907, + "learning_rate": 8.955733885806363e-05, + "loss": 2.9104, + "step": 18592 + }, + { + "epoch": 0.8656563540284471, + "grad_norm": 0.37302661690434263, + "learning_rate": 8.955568206539855e-05, + "loss": 2.9312, + "step": 18593 + }, + { + "epoch": 0.8657029122145401, + "grad_norm": 0.34586047804469683, + "learning_rate": 8.955402515664144e-05, + "loss": 2.9383, + "step": 18594 + }, + { + "epoch": 0.8657494704006332, + "grad_norm": 0.37268442936258633, + "learning_rate": 8.955236813179717e-05, + "loss": 2.9599, + "step": 18595 + }, + { + "epoch": 0.8657960285867262, + "grad_norm": 0.3696817598057393, + "learning_rate": 8.955071099087058e-05, + "loss": 2.9202, + "step": 18596 + }, + { + "epoch": 0.8658425867728193, + "grad_norm": 0.3350141820109128, + "learning_rate": 8.954905373386658e-05, + "loss": 2.832, + "step": 18597 + }, + { + "epoch": 0.8658891449589124, + "grad_norm": 0.36310228091857094, + "learning_rate": 8.954739636078998e-05, + "loss": 2.8792, + "step": 18598 + }, + { + "epoch": 0.8659357031450055, + "grad_norm": 0.34420923424039795, + "learning_rate": 8.954573887164567e-05, + "loss": 2.9085, + "step": 18599 + }, + { + "epoch": 0.8659822613310986, + "grad_norm": 0.36479765298729694, + "learning_rate": 8.954408126643852e-05, + "loss": 2.9789, + "step": 18600 + }, + { + "epoch": 0.8660288195171916, + "grad_norm": 0.33380623147398625, + "learning_rate": 8.954242354517337e-05, + "loss": 2.9374, + "step": 18601 + }, + { + "epoch": 0.8660753777032847, + "grad_norm": 0.36621738141981364, + "learning_rate": 8.954076570785513e-05, + "loss": 2.9609, + "step": 18602 + }, + { + "epoch": 0.8661219358893778, + "grad_norm": 0.3322013471127033, + "learning_rate": 8.953910775448861e-05, + "loss": 2.9571, + "step": 18603 + }, + { + "epoch": 0.8661684940754708, + "grad_norm": 0.3896693904975659, + "learning_rate": 8.953744968507872e-05, + "loss": 2.9407, + "step": 18604 + }, + { + "epoch": 0.8662150522615639, + "grad_norm": 0.28141366803253, + "learning_rate": 8.95357914996303e-05, + "loss": 2.7939, + "step": 18605 + }, + { + "epoch": 0.8662616104476569, + "grad_norm": 0.3374319700075871, + "learning_rate": 8.953413319814823e-05, + "loss": 2.8785, + "step": 18606 + }, + { + "epoch": 0.86630816863375, + "grad_norm": 0.309636157449398, + "learning_rate": 8.953247478063738e-05, + "loss": 2.9066, + "step": 18607 + }, + { + "epoch": 0.866354726819843, + "grad_norm": 0.3069543230582372, + "learning_rate": 8.953081624710261e-05, + "loss": 2.8243, + "step": 18608 + }, + { + "epoch": 0.8664012850059362, + "grad_norm": 0.34681401764305286, + "learning_rate": 8.952915759754878e-05, + "loss": 2.956, + "step": 18609 + }, + { + "epoch": 0.8664478431920293, + "grad_norm": 0.3182176822509498, + "learning_rate": 
8.952749883198077e-05, + "loss": 2.9194, + "step": 18610 + }, + { + "epoch": 0.8664944013781223, + "grad_norm": 0.30012078164467615, + "learning_rate": 8.952583995040347e-05, + "loss": 2.8972, + "step": 18611 + }, + { + "epoch": 0.8665409595642154, + "grad_norm": 0.3201106259017818, + "learning_rate": 8.952418095282169e-05, + "loss": 2.9042, + "step": 18612 + }, + { + "epoch": 0.8665875177503084, + "grad_norm": 0.3217889425272343, + "learning_rate": 8.952252183924035e-05, + "loss": 3.0335, + "step": 18613 + }, + { + "epoch": 0.8666340759364015, + "grad_norm": 0.3352765542731507, + "learning_rate": 8.952086260966429e-05, + "loss": 2.9751, + "step": 18614 + }, + { + "epoch": 0.8666806341224946, + "grad_norm": 0.3604697552047894, + "learning_rate": 8.95192032640984e-05, + "loss": 2.9751, + "step": 18615 + }, + { + "epoch": 0.8667271923085876, + "grad_norm": 0.36980027138917393, + "learning_rate": 8.951754380254754e-05, + "loss": 2.9642, + "step": 18616 + }, + { + "epoch": 0.8667737504946808, + "grad_norm": 0.34230590838643304, + "learning_rate": 8.951588422501658e-05, + "loss": 2.896, + "step": 18617 + }, + { + "epoch": 0.8668203086807738, + "grad_norm": 0.3547691681294904, + "learning_rate": 8.951422453151039e-05, + "loss": 2.8715, + "step": 18618 + }, + { + "epoch": 0.8668668668668669, + "grad_norm": 0.3600948276366542, + "learning_rate": 8.951256472203386e-05, + "loss": 2.9676, + "step": 18619 + }, + { + "epoch": 0.86691342505296, + "grad_norm": 0.329276887729555, + "learning_rate": 8.951090479659183e-05, + "loss": 2.9126, + "step": 18620 + }, + { + "epoch": 0.866959983239053, + "grad_norm": 0.3641215370823921, + "learning_rate": 8.95092447551892e-05, + "loss": 2.8808, + "step": 18621 + }, + { + "epoch": 0.8670065414251461, + "grad_norm": 0.3266880710740614, + "learning_rate": 8.950758459783082e-05, + "loss": 2.8394, + "step": 18622 + }, + { + "epoch": 0.8670530996112391, + "grad_norm": 0.34503620351865255, + "learning_rate": 8.950592432452156e-05, + "loss": 2.8929, + "step": 18623 + }, + { + "epoch": 0.8670996577973322, + "grad_norm": 0.3713576914199678, + "learning_rate": 8.950426393526632e-05, + "loss": 2.9351, + "step": 18624 + }, + { + "epoch": 0.8671462159834253, + "grad_norm": 0.32098622801623083, + "learning_rate": 8.950260343006995e-05, + "loss": 2.8467, + "step": 18625 + }, + { + "epoch": 0.8671927741695183, + "grad_norm": 0.37238292275207824, + "learning_rate": 8.950094280893733e-05, + "loss": 2.9322, + "step": 18626 + }, + { + "epoch": 0.8672393323556115, + "grad_norm": 0.37669807806137595, + "learning_rate": 8.949928207187335e-05, + "loss": 2.8514, + "step": 18627 + }, + { + "epoch": 0.8672858905417045, + "grad_norm": 0.38874467488371256, + "learning_rate": 8.949762121888285e-05, + "loss": 2.9694, + "step": 18628 + }, + { + "epoch": 0.8673324487277976, + "grad_norm": 0.33748059888109044, + "learning_rate": 8.949596024997073e-05, + "loss": 2.888, + "step": 18629 + }, + { + "epoch": 0.8673790069138906, + "grad_norm": 0.365690019923124, + "learning_rate": 8.949429916514186e-05, + "loss": 3.032, + "step": 18630 + }, + { + "epoch": 0.8674255650999837, + "grad_norm": 0.3446297431183859, + "learning_rate": 8.94926379644011e-05, + "loss": 2.7979, + "step": 18631 + }, + { + "epoch": 0.8674721232860768, + "grad_norm": 0.3746443488316157, + "learning_rate": 8.949097664775335e-05, + "loss": 2.8922, + "step": 18632 + }, + { + "epoch": 0.8675186814721698, + "grad_norm": 0.3733171084399338, + "learning_rate": 8.948931521520348e-05, + "loss": 2.9606, + "step": 18633 + }, + { + "epoch": 
0.8675652396582629, + "grad_norm": 0.35938400093549555, + "learning_rate": 8.948765366675635e-05, + "loss": 2.9486, + "step": 18634 + }, + { + "epoch": 0.8676117978443559, + "grad_norm": 0.3603198055258143, + "learning_rate": 8.948599200241686e-05, + "loss": 2.8722, + "step": 18635 + }, + { + "epoch": 0.867658356030449, + "grad_norm": 0.3558040593615599, + "learning_rate": 8.948433022218986e-05, + "loss": 2.8632, + "step": 18636 + }, + { + "epoch": 0.8677049142165422, + "grad_norm": 0.33407906412531513, + "learning_rate": 8.948266832608024e-05, + "loss": 2.9099, + "step": 18637 + }, + { + "epoch": 0.8677514724026352, + "grad_norm": 0.40243820311126255, + "learning_rate": 8.948100631409288e-05, + "loss": 2.8988, + "step": 18638 + }, + { + "epoch": 0.8677980305887283, + "grad_norm": 0.3272020845041116, + "learning_rate": 8.947934418623264e-05, + "loss": 2.8641, + "step": 18639 + }, + { + "epoch": 0.8678445887748213, + "grad_norm": 0.3720125904226842, + "learning_rate": 8.947768194250444e-05, + "loss": 2.8708, + "step": 18640 + }, + { + "epoch": 0.8678911469609144, + "grad_norm": 0.3399179815587964, + "learning_rate": 8.947601958291311e-05, + "loss": 2.9712, + "step": 18641 + }, + { + "epoch": 0.8679377051470075, + "grad_norm": 0.34546842364475644, + "learning_rate": 8.947435710746359e-05, + "loss": 2.8935, + "step": 18642 + }, + { + "epoch": 0.8679842633331005, + "grad_norm": 0.34953232078884355, + "learning_rate": 8.947269451616069e-05, + "loss": 2.9817, + "step": 18643 + }, + { + "epoch": 0.8680308215191936, + "grad_norm": 0.3503098040828322, + "learning_rate": 8.947103180900934e-05, + "loss": 2.9516, + "step": 18644 + }, + { + "epoch": 0.8680773797052866, + "grad_norm": 0.3456676475757272, + "learning_rate": 8.946936898601437e-05, + "loss": 2.8577, + "step": 18645 + }, + { + "epoch": 0.8681239378913798, + "grad_norm": 0.35097278369585316, + "learning_rate": 8.946770604718071e-05, + "loss": 2.9268, + "step": 18646 + }, + { + "epoch": 0.8681704960774729, + "grad_norm": 0.3871143879097735, + "learning_rate": 8.946604299251322e-05, + "loss": 2.9135, + "step": 18647 + }, + { + "epoch": 0.8682170542635659, + "grad_norm": 0.32699394487617844, + "learning_rate": 8.946437982201677e-05, + "loss": 2.9667, + "step": 18648 + }, + { + "epoch": 0.868263612449659, + "grad_norm": 0.36618379233981874, + "learning_rate": 8.946271653569626e-05, + "loss": 2.8945, + "step": 18649 + }, + { + "epoch": 0.868310170635752, + "grad_norm": 0.3200952791142245, + "learning_rate": 8.946105313355657e-05, + "loss": 2.8918, + "step": 18650 + }, + { + "epoch": 0.8683567288218451, + "grad_norm": 0.3609530023677387, + "learning_rate": 8.945938961560258e-05, + "loss": 2.8831, + "step": 18651 + }, + { + "epoch": 0.8684032870079381, + "grad_norm": 0.3253657064831159, + "learning_rate": 8.945772598183917e-05, + "loss": 2.8344, + "step": 18652 + }, + { + "epoch": 0.8684498451940312, + "grad_norm": 0.3411275472751544, + "learning_rate": 8.945606223227121e-05, + "loss": 2.9056, + "step": 18653 + }, + { + "epoch": 0.8684964033801243, + "grad_norm": 0.34077899416996044, + "learning_rate": 8.945439836690359e-05, + "loss": 2.9525, + "step": 18654 + }, + { + "epoch": 0.8685429615662174, + "grad_norm": 0.3438990203051985, + "learning_rate": 8.945273438574121e-05, + "loss": 2.9182, + "step": 18655 + }, + { + "epoch": 0.8685895197523105, + "grad_norm": 0.3168470267884168, + "learning_rate": 8.945107028878894e-05, + "loss": 2.8672, + "step": 18656 + }, + { + "epoch": 0.8686360779384035, + "grad_norm": 0.3401365184548247, + "learning_rate": 
8.944940607605165e-05, + "loss": 2.9714, + "step": 18657 + }, + { + "epoch": 0.8686826361244966, + "grad_norm": 0.3424547811429882, + "learning_rate": 8.944774174753427e-05, + "loss": 2.896, + "step": 18658 + }, + { + "epoch": 0.8687291943105897, + "grad_norm": 0.32032868501393186, + "learning_rate": 8.944607730324163e-05, + "loss": 2.9849, + "step": 18659 + }, + { + "epoch": 0.8687757524966827, + "grad_norm": 0.3471895967298629, + "learning_rate": 8.944441274317866e-05, + "loss": 3.0108, + "step": 18660 + }, + { + "epoch": 0.8688223106827758, + "grad_norm": 0.3312057308636991, + "learning_rate": 8.94427480673502e-05, + "loss": 2.8765, + "step": 18661 + }, + { + "epoch": 0.8688688688688688, + "grad_norm": 0.34552224702256057, + "learning_rate": 8.944108327576118e-05, + "loss": 2.8941, + "step": 18662 + }, + { + "epoch": 0.8689154270549619, + "grad_norm": 0.332831025173277, + "learning_rate": 8.943941836841644e-05, + "loss": 2.9284, + "step": 18663 + }, + { + "epoch": 0.8689619852410551, + "grad_norm": 0.3370311063919227, + "learning_rate": 8.943775334532092e-05, + "loss": 2.7983, + "step": 18664 + }, + { + "epoch": 0.8690085434271481, + "grad_norm": 0.3767193301378962, + "learning_rate": 8.943608820647946e-05, + "loss": 2.8245, + "step": 18665 + }, + { + "epoch": 0.8690551016132412, + "grad_norm": 0.3738644203631935, + "learning_rate": 8.943442295189699e-05, + "loss": 2.9913, + "step": 18666 + }, + { + "epoch": 0.8691016597993342, + "grad_norm": 0.39895306168036554, + "learning_rate": 8.943275758157836e-05, + "loss": 2.9713, + "step": 18667 + }, + { + "epoch": 0.8691482179854273, + "grad_norm": 0.3181733127211963, + "learning_rate": 8.943109209552847e-05, + "loss": 2.8884, + "step": 18668 + }, + { + "epoch": 0.8691947761715204, + "grad_norm": 0.40190568646705527, + "learning_rate": 8.94294264937522e-05, + "loss": 2.9736, + "step": 18669 + }, + { + "epoch": 0.8692413343576134, + "grad_norm": 0.3459676054149059, + "learning_rate": 8.942776077625445e-05, + "loss": 2.8389, + "step": 18670 + }, + { + "epoch": 0.8692878925437065, + "grad_norm": 0.4066356012756813, + "learning_rate": 8.942609494304013e-05, + "loss": 2.9408, + "step": 18671 + }, + { + "epoch": 0.8693344507297995, + "grad_norm": 0.35227295280725335, + "learning_rate": 8.942442899411408e-05, + "loss": 2.949, + "step": 18672 + }, + { + "epoch": 0.8693810089158927, + "grad_norm": 0.4027159454754829, + "learning_rate": 8.942276292948121e-05, + "loss": 2.8992, + "step": 18673 + }, + { + "epoch": 0.8694275671019857, + "grad_norm": 0.3839722192852595, + "learning_rate": 8.942109674914645e-05, + "loss": 2.8594, + "step": 18674 + }, + { + "epoch": 0.8694741252880788, + "grad_norm": 0.39205987223820277, + "learning_rate": 8.941943045311463e-05, + "loss": 2.9519, + "step": 18675 + }, + { + "epoch": 0.8695206834741719, + "grad_norm": 0.3926032910262, + "learning_rate": 8.941776404139066e-05, + "loss": 2.8705, + "step": 18676 + }, + { + "epoch": 0.8695672416602649, + "grad_norm": 0.38624019393758463, + "learning_rate": 8.941609751397945e-05, + "loss": 2.9388, + "step": 18677 + }, + { + "epoch": 0.869613799846358, + "grad_norm": 0.3802768006364884, + "learning_rate": 8.941443087088587e-05, + "loss": 2.969, + "step": 18678 + }, + { + "epoch": 0.869660358032451, + "grad_norm": 0.3543589092607515, + "learning_rate": 8.941276411211481e-05, + "loss": 3.0332, + "step": 18679 + }, + { + "epoch": 0.8697069162185441, + "grad_norm": 0.37963464657558116, + "learning_rate": 8.941109723767118e-05, + "loss": 2.9512, + "step": 18680 + }, + { + "epoch": 
0.8697534744046372, + "grad_norm": 0.33809431018214225, + "learning_rate": 8.940943024755987e-05, + "loss": 2.8453, + "step": 18681 + }, + { + "epoch": 0.8698000325907302, + "grad_norm": 0.37099281187764127, + "learning_rate": 8.940776314178575e-05, + "loss": 2.9489, + "step": 18682 + }, + { + "epoch": 0.8698465907768234, + "grad_norm": 0.31627910242447205, + "learning_rate": 8.940609592035374e-05, + "loss": 2.7871, + "step": 18683 + }, + { + "epoch": 0.8698931489629164, + "grad_norm": 0.3601374598282722, + "learning_rate": 8.94044285832687e-05, + "loss": 2.8344, + "step": 18684 + }, + { + "epoch": 0.8699397071490095, + "grad_norm": 0.3426478435310021, + "learning_rate": 8.940276113053557e-05, + "loss": 2.992, + "step": 18685 + }, + { + "epoch": 0.8699862653351026, + "grad_norm": 0.37858125518812785, + "learning_rate": 8.94010935621592e-05, + "loss": 2.9019, + "step": 18686 + }, + { + "epoch": 0.8700328235211956, + "grad_norm": 0.3123921180316169, + "learning_rate": 8.939942587814452e-05, + "loss": 2.8959, + "step": 18687 + }, + { + "epoch": 0.8700793817072887, + "grad_norm": 0.37045948114728167, + "learning_rate": 8.939775807849639e-05, + "loss": 2.9105, + "step": 18688 + }, + { + "epoch": 0.8701259398933817, + "grad_norm": 0.33169961338065124, + "learning_rate": 8.939609016321973e-05, + "loss": 2.8194, + "step": 18689 + }, + { + "epoch": 0.8701724980794748, + "grad_norm": 0.3672417967898999, + "learning_rate": 8.939442213231941e-05, + "loss": 2.965, + "step": 18690 + }, + { + "epoch": 0.870219056265568, + "grad_norm": 0.35196394184135693, + "learning_rate": 8.939275398580036e-05, + "loss": 2.9401, + "step": 18691 + }, + { + "epoch": 0.870265614451661, + "grad_norm": 0.355190402762787, + "learning_rate": 8.939108572366746e-05, + "loss": 2.9353, + "step": 18692 + }, + { + "epoch": 0.8703121726377541, + "grad_norm": 0.34160388350286264, + "learning_rate": 8.938941734592558e-05, + "loss": 2.9993, + "step": 18693 + }, + { + "epoch": 0.8703587308238471, + "grad_norm": 0.32059408315890864, + "learning_rate": 8.938774885257968e-05, + "loss": 2.7986, + "step": 18694 + }, + { + "epoch": 0.8704052890099402, + "grad_norm": 0.32789806181279124, + "learning_rate": 8.938608024363459e-05, + "loss": 2.9256, + "step": 18695 + }, + { + "epoch": 0.8704518471960332, + "grad_norm": 0.32069468377465704, + "learning_rate": 8.938441151909523e-05, + "loss": 2.8707, + "step": 18696 + }, + { + "epoch": 0.8704984053821263, + "grad_norm": 0.32655866781587406, + "learning_rate": 8.938274267896651e-05, + "loss": 2.9467, + "step": 18697 + }, + { + "epoch": 0.8705449635682194, + "grad_norm": 0.3156476963590087, + "learning_rate": 8.938107372325332e-05, + "loss": 2.9179, + "step": 18698 + }, + { + "epoch": 0.8705915217543124, + "grad_norm": 0.33970595000877724, + "learning_rate": 8.937940465196058e-05, + "loss": 2.9317, + "step": 18699 + }, + { + "epoch": 0.8706380799404055, + "grad_norm": 0.33844306711572664, + "learning_rate": 8.937773546509313e-05, + "loss": 2.9778, + "step": 18700 + }, + { + "epoch": 0.8706846381264985, + "grad_norm": 0.31256032938772044, + "learning_rate": 8.937606616265593e-05, + "loss": 3.0341, + "step": 18701 + }, + { + "epoch": 0.8707311963125917, + "grad_norm": 0.33438410109006406, + "learning_rate": 8.937439674465384e-05, + "loss": 2.8791, + "step": 18702 + }, + { + "epoch": 0.8707777544986848, + "grad_norm": 0.3200682223297226, + "learning_rate": 8.937272721109178e-05, + "loss": 2.9536, + "step": 18703 + }, + { + "epoch": 0.8708243126847778, + "grad_norm": 0.2871614371953533, + 
"learning_rate": 8.937105756197466e-05, + "loss": 2.8939, + "step": 18704 + }, + { + "epoch": 0.8708708708708709, + "grad_norm": 0.3643909777161134, + "learning_rate": 8.936938779730735e-05, + "loss": 2.9254, + "step": 18705 + }, + { + "epoch": 0.8709174290569639, + "grad_norm": 0.29847904684487825, + "learning_rate": 8.936771791709476e-05, + "loss": 2.9429, + "step": 18706 + }, + { + "epoch": 0.870963987243057, + "grad_norm": 0.3500211553957535, + "learning_rate": 8.93660479213418e-05, + "loss": 2.8714, + "step": 18707 + }, + { + "epoch": 0.8710105454291501, + "grad_norm": 0.3089432364752845, + "learning_rate": 8.936437781005338e-05, + "loss": 2.9019, + "step": 18708 + }, + { + "epoch": 0.8710571036152431, + "grad_norm": 0.3309523056419986, + "learning_rate": 8.936270758323437e-05, + "loss": 2.9532, + "step": 18709 + }, + { + "epoch": 0.8711036618013362, + "grad_norm": 0.3623228206361893, + "learning_rate": 8.936103724088971e-05, + "loss": 2.939, + "step": 18710 + }, + { + "epoch": 0.8711502199874293, + "grad_norm": 0.32727349161597813, + "learning_rate": 8.935936678302426e-05, + "loss": 2.9095, + "step": 18711 + }, + { + "epoch": 0.8711967781735224, + "grad_norm": 0.3474416168390374, + "learning_rate": 8.935769620964297e-05, + "loss": 2.883, + "step": 18712 + }, + { + "epoch": 0.8712433363596155, + "grad_norm": 0.33533134809092485, + "learning_rate": 8.93560255207507e-05, + "loss": 2.9434, + "step": 18713 + }, + { + "epoch": 0.8712898945457085, + "grad_norm": 0.33519993113894236, + "learning_rate": 8.935435471635239e-05, + "loss": 2.8777, + "step": 18714 + }, + { + "epoch": 0.8713364527318016, + "grad_norm": 0.352367834287122, + "learning_rate": 8.935268379645291e-05, + "loss": 2.9753, + "step": 18715 + }, + { + "epoch": 0.8713830109178946, + "grad_norm": 0.3473963855009142, + "learning_rate": 8.935101276105719e-05, + "loss": 2.858, + "step": 18716 + }, + { + "epoch": 0.8714295691039877, + "grad_norm": 0.34463651205740853, + "learning_rate": 8.934934161017013e-05, + "loss": 2.8669, + "step": 18717 + }, + { + "epoch": 0.8714761272900807, + "grad_norm": 0.36578767770725484, + "learning_rate": 8.934767034379662e-05, + "loss": 2.9796, + "step": 18718 + }, + { + "epoch": 0.8715226854761738, + "grad_norm": 0.3604091293124941, + "learning_rate": 8.934599896194158e-05, + "loss": 2.8496, + "step": 18719 + }, + { + "epoch": 0.871569243662267, + "grad_norm": 0.3719217910665621, + "learning_rate": 8.93443274646099e-05, + "loss": 2.8822, + "step": 18720 + }, + { + "epoch": 0.87161580184836, + "grad_norm": 0.3417798103208449, + "learning_rate": 8.93426558518065e-05, + "loss": 3.0062, + "step": 18721 + }, + { + "epoch": 0.8716623600344531, + "grad_norm": 0.3499312439763645, + "learning_rate": 8.934098412353628e-05, + "loss": 2.8222, + "step": 18722 + }, + { + "epoch": 0.8717089182205461, + "grad_norm": 0.3576484821291065, + "learning_rate": 8.933931227980414e-05, + "loss": 3.0127, + "step": 18723 + }, + { + "epoch": 0.8717554764066392, + "grad_norm": 0.3703156668245423, + "learning_rate": 8.9337640320615e-05, + "loss": 2.9088, + "step": 18724 + }, + { + "epoch": 0.8718020345927323, + "grad_norm": 0.3432102662357722, + "learning_rate": 8.933596824597377e-05, + "loss": 2.8571, + "step": 18725 + }, + { + "epoch": 0.8718485927788253, + "grad_norm": 0.3584903540833547, + "learning_rate": 8.933429605588534e-05, + "loss": 2.9198, + "step": 18726 + }, + { + "epoch": 0.8718951509649184, + "grad_norm": 0.3412457594013964, + "learning_rate": 8.933262375035462e-05, + "loss": 2.822, + "step": 18727 + }, + { + 
"epoch": 0.8719417091510114, + "grad_norm": 0.3616399625366513, + "learning_rate": 8.933095132938654e-05, + "loss": 2.9181, + "step": 18728 + }, + { + "epoch": 0.8719882673371046, + "grad_norm": 0.340919563303726, + "learning_rate": 8.932927879298598e-05, + "loss": 2.9103, + "step": 18729 + }, + { + "epoch": 0.8720348255231977, + "grad_norm": 0.38454474622620166, + "learning_rate": 8.932760614115788e-05, + "loss": 3.0282, + "step": 18730 + }, + { + "epoch": 0.8720813837092907, + "grad_norm": 0.34603143084854754, + "learning_rate": 8.93259333739071e-05, + "loss": 2.9006, + "step": 18731 + }, + { + "epoch": 0.8721279418953838, + "grad_norm": 0.35549842030529716, + "learning_rate": 8.932426049123859e-05, + "loss": 2.8759, + "step": 18732 + }, + { + "epoch": 0.8721745000814768, + "grad_norm": 0.3735887753716176, + "learning_rate": 8.932258749315727e-05, + "loss": 2.8362, + "step": 18733 + }, + { + "epoch": 0.8722210582675699, + "grad_norm": 0.3784339274909297, + "learning_rate": 8.9320914379668e-05, + "loss": 2.8477, + "step": 18734 + }, + { + "epoch": 0.872267616453663, + "grad_norm": 0.39158762288633314, + "learning_rate": 8.931924115077574e-05, + "loss": 3.0085, + "step": 18735 + }, + { + "epoch": 0.872314174639756, + "grad_norm": 0.349809273720717, + "learning_rate": 8.931756780648537e-05, + "loss": 2.9715, + "step": 18736 + }, + { + "epoch": 0.8723607328258491, + "grad_norm": 0.32924164096818054, + "learning_rate": 8.931589434680182e-05, + "loss": 2.8965, + "step": 18737 + }, + { + "epoch": 0.8724072910119421, + "grad_norm": 0.3038362304531702, + "learning_rate": 8.931422077172999e-05, + "loss": 3.0541, + "step": 18738 + }, + { + "epoch": 0.8724538491980353, + "grad_norm": 0.36043666920046175, + "learning_rate": 8.93125470812748e-05, + "loss": 2.9616, + "step": 18739 + }, + { + "epoch": 0.8725004073841283, + "grad_norm": 0.2895982388244531, + "learning_rate": 8.931087327544115e-05, + "loss": 2.8524, + "step": 18740 + }, + { + "epoch": 0.8725469655702214, + "grad_norm": 0.35368086489194484, + "learning_rate": 8.930919935423396e-05, + "loss": 2.9916, + "step": 18741 + }, + { + "epoch": 0.8725935237563145, + "grad_norm": 0.3088644236452346, + "learning_rate": 8.930752531765815e-05, + "loss": 2.921, + "step": 18742 + }, + { + "epoch": 0.8726400819424075, + "grad_norm": 0.36093291229192953, + "learning_rate": 8.930585116571861e-05, + "loss": 2.968, + "step": 18743 + }, + { + "epoch": 0.8726866401285006, + "grad_norm": 0.3305082849935968, + "learning_rate": 8.93041768984203e-05, + "loss": 2.7985, + "step": 18744 + }, + { + "epoch": 0.8727331983145936, + "grad_norm": 0.3584526314305665, + "learning_rate": 8.930250251576807e-05, + "loss": 2.9406, + "step": 18745 + }, + { + "epoch": 0.8727797565006867, + "grad_norm": 0.35904374759458796, + "learning_rate": 8.930082801776688e-05, + "loss": 2.9375, + "step": 18746 + }, + { + "epoch": 0.8728263146867798, + "grad_norm": 0.34097143629047855, + "learning_rate": 8.929915340442163e-05, + "loss": 2.9529, + "step": 18747 + }, + { + "epoch": 0.8728728728728729, + "grad_norm": 0.37447318376398675, + "learning_rate": 8.929747867573724e-05, + "loss": 3.0156, + "step": 18748 + }, + { + "epoch": 0.872919431058966, + "grad_norm": 0.335710226990605, + "learning_rate": 8.929580383171863e-05, + "loss": 2.8819, + "step": 18749 + }, + { + "epoch": 0.872965989245059, + "grad_norm": 0.3671928302686364, + "learning_rate": 8.929412887237069e-05, + "loss": 2.9615, + "step": 18750 + }, + { + "epoch": 0.8730125474311521, + "grad_norm": 0.308640715734387, + "learning_rate": 
8.929245379769835e-05, + "loss": 2.9393, + "step": 18751 + }, + { + "epoch": 0.8730591056172452, + "grad_norm": 0.3607934434239299, + "learning_rate": 8.929077860770654e-05, + "loss": 2.8549, + "step": 18752 + }, + { + "epoch": 0.8731056638033382, + "grad_norm": 0.3025179309566662, + "learning_rate": 8.928910330240017e-05, + "loss": 2.8422, + "step": 18753 + }, + { + "epoch": 0.8731522219894313, + "grad_norm": 0.3910992940451909, + "learning_rate": 8.928742788178415e-05, + "loss": 2.8885, + "step": 18754 + }, + { + "epoch": 0.8731987801755243, + "grad_norm": 0.3526525036360417, + "learning_rate": 8.928575234586338e-05, + "loss": 2.8543, + "step": 18755 + }, + { + "epoch": 0.8732453383616174, + "grad_norm": 0.33600673261878206, + "learning_rate": 8.928407669464283e-05, + "loss": 2.9149, + "step": 18756 + }, + { + "epoch": 0.8732918965477106, + "grad_norm": 0.36919571418467756, + "learning_rate": 8.928240092812736e-05, + "loss": 2.9261, + "step": 18757 + }, + { + "epoch": 0.8733384547338036, + "grad_norm": 0.31412692981380264, + "learning_rate": 8.928072504632192e-05, + "loss": 2.9409, + "step": 18758 + }, + { + "epoch": 0.8733850129198967, + "grad_norm": 0.34786072484883523, + "learning_rate": 8.927904904923142e-05, + "loss": 3.0193, + "step": 18759 + }, + { + "epoch": 0.8734315711059897, + "grad_norm": 0.3467374444824863, + "learning_rate": 8.927737293686079e-05, + "loss": 2.9217, + "step": 18760 + }, + { + "epoch": 0.8734781292920828, + "grad_norm": 0.3428254287983215, + "learning_rate": 8.927569670921493e-05, + "loss": 2.9113, + "step": 18761 + }, + { + "epoch": 0.8735246874781758, + "grad_norm": 0.3451303663072273, + "learning_rate": 8.927402036629878e-05, + "loss": 2.8891, + "step": 18762 + }, + { + "epoch": 0.8735712456642689, + "grad_norm": 0.3394592822144814, + "learning_rate": 8.927234390811723e-05, + "loss": 2.9253, + "step": 18763 + }, + { + "epoch": 0.873617803850362, + "grad_norm": 0.3492339668510887, + "learning_rate": 8.927066733467524e-05, + "loss": 2.871, + "step": 18764 + }, + { + "epoch": 0.873664362036455, + "grad_norm": 0.31498100272006446, + "learning_rate": 8.92689906459777e-05, + "loss": 2.8134, + "step": 18765 + }, + { + "epoch": 0.8737109202225481, + "grad_norm": 0.3361514711518226, + "learning_rate": 8.926731384202954e-05, + "loss": 2.8438, + "step": 18766 + }, + { + "epoch": 0.8737574784086412, + "grad_norm": 0.331115053783889, + "learning_rate": 8.926563692283569e-05, + "loss": 2.9751, + "step": 18767 + }, + { + "epoch": 0.8738040365947343, + "grad_norm": 0.3537774898311018, + "learning_rate": 8.926395988840107e-05, + "loss": 2.9201, + "step": 18768 + }, + { + "epoch": 0.8738505947808274, + "grad_norm": 0.34126127093191283, + "learning_rate": 8.926228273873057e-05, + "loss": 2.8671, + "step": 18769 + }, + { + "epoch": 0.8738971529669204, + "grad_norm": 0.3398326694562513, + "learning_rate": 8.926060547382915e-05, + "loss": 2.9404, + "step": 18770 + }, + { + "epoch": 0.8739437111530135, + "grad_norm": 0.32304729687726086, + "learning_rate": 8.925892809370174e-05, + "loss": 2.8993, + "step": 18771 + }, + { + "epoch": 0.8739902693391065, + "grad_norm": 0.3177039479938029, + "learning_rate": 8.925725059835322e-05, + "loss": 2.8959, + "step": 18772 + }, + { + "epoch": 0.8740368275251996, + "grad_norm": 0.3426811100822362, + "learning_rate": 8.925557298778855e-05, + "loss": 2.9389, + "step": 18773 + }, + { + "epoch": 0.8740833857112927, + "grad_norm": 0.3078149739542859, + "learning_rate": 8.925389526201263e-05, + "loss": 2.8501, + "step": 18774 + }, + { + "epoch": 
0.8741299438973857, + "grad_norm": 0.3362618068083031, + "learning_rate": 8.92522174210304e-05, + "loss": 2.9129, + "step": 18775 + }, + { + "epoch": 0.8741765020834789, + "grad_norm": 0.3154498163113841, + "learning_rate": 8.925053946484678e-05, + "loss": 2.9761, + "step": 18776 + }, + { + "epoch": 0.8742230602695719, + "grad_norm": 0.3339326275060841, + "learning_rate": 8.924886139346668e-05, + "loss": 3.0038, + "step": 18777 + }, + { + "epoch": 0.874269618455665, + "grad_norm": 0.316954880156792, + "learning_rate": 8.924718320689507e-05, + "loss": 2.8921, + "step": 18778 + }, + { + "epoch": 0.8743161766417581, + "grad_norm": 0.320296895417881, + "learning_rate": 8.924550490513681e-05, + "loss": 2.8408, + "step": 18779 + }, + { + "epoch": 0.8743627348278511, + "grad_norm": 0.3480580600091226, + "learning_rate": 8.924382648819688e-05, + "loss": 2.9528, + "step": 18780 + }, + { + "epoch": 0.8744092930139442, + "grad_norm": 0.33509842427539094, + "learning_rate": 8.924214795608018e-05, + "loss": 2.9153, + "step": 18781 + }, + { + "epoch": 0.8744558512000372, + "grad_norm": 0.33619796244198696, + "learning_rate": 8.924046930879164e-05, + "loss": 2.8852, + "step": 18782 + }, + { + "epoch": 0.8745024093861303, + "grad_norm": 0.3209092963342415, + "learning_rate": 8.92387905463362e-05, + "loss": 2.9245, + "step": 18783 + }, + { + "epoch": 0.8745489675722233, + "grad_norm": 0.31421647826441185, + "learning_rate": 8.923711166871876e-05, + "loss": 2.9194, + "step": 18784 + }, + { + "epoch": 0.8745955257583164, + "grad_norm": 0.3195039303425096, + "learning_rate": 8.923543267594426e-05, + "loss": 2.8222, + "step": 18785 + }, + { + "epoch": 0.8746420839444096, + "grad_norm": 0.33627501769483675, + "learning_rate": 8.923375356801764e-05, + "loss": 2.9105, + "step": 18786 + }, + { + "epoch": 0.8746886421305026, + "grad_norm": 0.31658869466185735, + "learning_rate": 8.923207434494383e-05, + "loss": 2.8329, + "step": 18787 + }, + { + "epoch": 0.8747352003165957, + "grad_norm": 0.30745928534008327, + "learning_rate": 8.923039500672772e-05, + "loss": 2.9132, + "step": 18788 + }, + { + "epoch": 0.8747817585026887, + "grad_norm": 0.34815971124392925, + "learning_rate": 8.922871555337429e-05, + "loss": 2.9484, + "step": 18789 + }, + { + "epoch": 0.8748283166887818, + "grad_norm": 0.33845115136195836, + "learning_rate": 8.922703598488841e-05, + "loss": 2.8406, + "step": 18790 + }, + { + "epoch": 0.8748748748748749, + "grad_norm": 0.326195308015697, + "learning_rate": 8.922535630127509e-05, + "loss": 2.9281, + "step": 18791 + }, + { + "epoch": 0.8749214330609679, + "grad_norm": 0.35840259045979267, + "learning_rate": 8.922367650253918e-05, + "loss": 2.9231, + "step": 18792 + }, + { + "epoch": 0.874967991247061, + "grad_norm": 0.3441956411555035, + "learning_rate": 8.922199658868566e-05, + "loss": 2.9604, + "step": 18793 + }, + { + "epoch": 0.875014549433154, + "grad_norm": 0.35501266695610206, + "learning_rate": 8.922031655971943e-05, + "loss": 2.9162, + "step": 18794 + }, + { + "epoch": 0.8750611076192472, + "grad_norm": 0.3578353291779386, + "learning_rate": 8.921863641564546e-05, + "loss": 2.9248, + "step": 18795 + }, + { + "epoch": 0.8751076658053403, + "grad_norm": 0.3410871354708283, + "learning_rate": 8.921695615646864e-05, + "loss": 2.9582, + "step": 18796 + }, + { + "epoch": 0.8751542239914333, + "grad_norm": 0.3648630671292465, + "learning_rate": 8.921527578219391e-05, + "loss": 2.8451, + "step": 18797 + }, + { + "epoch": 0.8752007821775264, + "grad_norm": 0.36115011352992343, + "learning_rate": 
8.921359529282621e-05, + "loss": 2.9518, + "step": 18798 + }, + { + "epoch": 0.8752473403636194, + "grad_norm": 0.32383014009747546, + "learning_rate": 8.921191468837048e-05, + "loss": 2.8275, + "step": 18799 + }, + { + "epoch": 0.8752938985497125, + "grad_norm": 0.3361527448210693, + "learning_rate": 8.921023396883164e-05, + "loss": 2.863, + "step": 18800 + }, + { + "epoch": 0.8753404567358056, + "grad_norm": 0.33047467516444695, + "learning_rate": 8.920855313421462e-05, + "loss": 2.8445, + "step": 18801 + }, + { + "epoch": 0.8753870149218986, + "grad_norm": 0.33659129758559625, + "learning_rate": 8.920687218452436e-05, + "loss": 2.92, + "step": 18802 + }, + { + "epoch": 0.8754335731079917, + "grad_norm": 0.3180010354083184, + "learning_rate": 8.92051911197658e-05, + "loss": 2.826, + "step": 18803 + }, + { + "epoch": 0.8754801312940848, + "grad_norm": 0.35841539451692955, + "learning_rate": 8.920350993994386e-05, + "loss": 2.8479, + "step": 18804 + }, + { + "epoch": 0.8755266894801779, + "grad_norm": 0.3077384030480937, + "learning_rate": 8.92018286450635e-05, + "loss": 2.9927, + "step": 18805 + }, + { + "epoch": 0.8755732476662709, + "grad_norm": 0.37938666516203123, + "learning_rate": 8.920014723512962e-05, + "loss": 2.9509, + "step": 18806 + }, + { + "epoch": 0.875619805852364, + "grad_norm": 0.3751846887394173, + "learning_rate": 8.919846571014716e-05, + "loss": 2.9111, + "step": 18807 + }, + { + "epoch": 0.8756663640384571, + "grad_norm": 0.3421759569682941, + "learning_rate": 8.919678407012108e-05, + "loss": 2.8943, + "step": 18808 + }, + { + "epoch": 0.8757129222245501, + "grad_norm": 0.38786612708806373, + "learning_rate": 8.91951023150563e-05, + "loss": 2.9258, + "step": 18809 + }, + { + "epoch": 0.8757594804106432, + "grad_norm": 0.3438740383232595, + "learning_rate": 8.919342044495774e-05, + "loss": 2.971, + "step": 18810 + }, + { + "epoch": 0.8758060385967362, + "grad_norm": 0.3640284603652811, + "learning_rate": 8.919173845983036e-05, + "loss": 2.89, + "step": 18811 + }, + { + "epoch": 0.8758525967828293, + "grad_norm": 0.33933057088368535, + "learning_rate": 8.91900563596791e-05, + "loss": 2.949, + "step": 18812 + }, + { + "epoch": 0.8758991549689225, + "grad_norm": 0.35241139990263054, + "learning_rate": 8.918837414450887e-05, + "loss": 2.9359, + "step": 18813 + }, + { + "epoch": 0.8759457131550155, + "grad_norm": 0.33900492461644816, + "learning_rate": 8.918669181432464e-05, + "loss": 2.9807, + "step": 18814 + }, + { + "epoch": 0.8759922713411086, + "grad_norm": 0.3157908124493911, + "learning_rate": 8.91850093691313e-05, + "loss": 3.0063, + "step": 18815 + }, + { + "epoch": 0.8760388295272016, + "grad_norm": 0.3645031367070968, + "learning_rate": 8.918332680893385e-05, + "loss": 2.8706, + "step": 18816 + }, + { + "epoch": 0.8760853877132947, + "grad_norm": 0.31522084071869705, + "learning_rate": 8.918164413373718e-05, + "loss": 2.9255, + "step": 18817 + }, + { + "epoch": 0.8761319458993878, + "grad_norm": 0.36040527822548735, + "learning_rate": 8.917996134354625e-05, + "loss": 2.9639, + "step": 18818 + }, + { + "epoch": 0.8761785040854808, + "grad_norm": 0.3465668687849543, + "learning_rate": 8.9178278438366e-05, + "loss": 2.8892, + "step": 18819 + }, + { + "epoch": 0.8762250622715739, + "grad_norm": 0.35435009866931105, + "learning_rate": 8.917659541820134e-05, + "loss": 2.8384, + "step": 18820 + }, + { + "epoch": 0.8762716204576669, + "grad_norm": 0.3793717441983465, + "learning_rate": 8.917491228305725e-05, + "loss": 2.9075, + "step": 18821 + }, + { + "epoch": 
0.87631817864376, + "grad_norm": 0.36024675660984, + "learning_rate": 8.917322903293866e-05, + "loss": 3.0179, + "step": 18822 + }, + { + "epoch": 0.8763647368298532, + "grad_norm": 0.4145450409704954, + "learning_rate": 8.917154566785048e-05, + "loss": 2.9083, + "step": 18823 + }, + { + "epoch": 0.8764112950159462, + "grad_norm": 0.3815232103390604, + "learning_rate": 8.916986218779768e-05, + "loss": 2.8947, + "step": 18824 + }, + { + "epoch": 0.8764578532020393, + "grad_norm": 0.3204000052701375, + "learning_rate": 8.91681785927852e-05, + "loss": 2.8564, + "step": 18825 + }, + { + "epoch": 0.8765044113881323, + "grad_norm": 0.3710743651115681, + "learning_rate": 8.916649488281796e-05, + "loss": 3.0048, + "step": 18826 + }, + { + "epoch": 0.8765509695742254, + "grad_norm": 0.3648592222908354, + "learning_rate": 8.916481105790094e-05, + "loss": 2.9436, + "step": 18827 + }, + { + "epoch": 0.8765975277603184, + "grad_norm": 0.3342170023370134, + "learning_rate": 8.916312711803904e-05, + "loss": 2.9005, + "step": 18828 + }, + { + "epoch": 0.8766440859464115, + "grad_norm": 0.35200976563494957, + "learning_rate": 8.916144306323722e-05, + "loss": 2.9598, + "step": 18829 + }, + { + "epoch": 0.8766906441325046, + "grad_norm": 0.33276166509951904, + "learning_rate": 8.915975889350043e-05, + "loss": 2.9456, + "step": 18830 + }, + { + "epoch": 0.8767372023185976, + "grad_norm": 0.34799972066708584, + "learning_rate": 8.91580746088336e-05, + "loss": 2.9374, + "step": 18831 + }, + { + "epoch": 0.8767837605046908, + "grad_norm": 0.35244388046716646, + "learning_rate": 8.915639020924168e-05, + "loss": 2.9127, + "step": 18832 + }, + { + "epoch": 0.8768303186907838, + "grad_norm": 0.35113263916522197, + "learning_rate": 8.915470569472962e-05, + "loss": 2.9646, + "step": 18833 + }, + { + "epoch": 0.8768768768768769, + "grad_norm": 0.3413391122413581, + "learning_rate": 8.915302106530234e-05, + "loss": 2.8456, + "step": 18834 + }, + { + "epoch": 0.87692343506297, + "grad_norm": 0.3265980209494115, + "learning_rate": 8.915133632096481e-05, + "loss": 3.003, + "step": 18835 + }, + { + "epoch": 0.876969993249063, + "grad_norm": 0.3733753418734885, + "learning_rate": 8.914965146172197e-05, + "loss": 2.9092, + "step": 18836 + }, + { + "epoch": 0.8770165514351561, + "grad_norm": 0.30487678299147225, + "learning_rate": 8.914796648757875e-05, + "loss": 2.8832, + "step": 18837 + }, + { + "epoch": 0.8770631096212491, + "grad_norm": 0.3449833246901264, + "learning_rate": 8.91462813985401e-05, + "loss": 2.9861, + "step": 18838 + }, + { + "epoch": 0.8771096678073422, + "grad_norm": 0.33743662202839864, + "learning_rate": 8.9144596194611e-05, + "loss": 2.8451, + "step": 18839 + }, + { + "epoch": 0.8771562259934353, + "grad_norm": 0.3331603195520651, + "learning_rate": 8.914291087579633e-05, + "loss": 2.9135, + "step": 18840 + }, + { + "epoch": 0.8772027841795283, + "grad_norm": 0.33925698735795334, + "learning_rate": 8.91412254421011e-05, + "loss": 2.8371, + "step": 18841 + }, + { + "epoch": 0.8772493423656215, + "grad_norm": 0.32576548393161214, + "learning_rate": 8.913953989353021e-05, + "loss": 2.909, + "step": 18842 + }, + { + "epoch": 0.8772959005517145, + "grad_norm": 0.3166388787895651, + "learning_rate": 8.913785423008865e-05, + "loss": 2.8904, + "step": 18843 + }, + { + "epoch": 0.8773424587378076, + "grad_norm": 0.32763523392353405, + "learning_rate": 8.913616845178133e-05, + "loss": 2.7473, + "step": 18844 + }, + { + "epoch": 0.8773890169239007, + "grad_norm": 0.3468283915875148, + "learning_rate": 
8.91344825586132e-05, + "loss": 2.8767, + "step": 18845 + }, + { + "epoch": 0.8774355751099937, + "grad_norm": 0.3465788347558851, + "learning_rate": 8.913279655058924e-05, + "loss": 2.985, + "step": 18846 + }, + { + "epoch": 0.8774821332960868, + "grad_norm": 0.3641785790656026, + "learning_rate": 8.913111042771436e-05, + "loss": 2.8881, + "step": 18847 + }, + { + "epoch": 0.8775286914821798, + "grad_norm": 0.34547422020332375, + "learning_rate": 8.912942418999352e-05, + "loss": 2.9005, + "step": 18848 + }, + { + "epoch": 0.8775752496682729, + "grad_norm": 0.4023731717635223, + "learning_rate": 8.912773783743168e-05, + "loss": 2.94, + "step": 18849 + }, + { + "epoch": 0.8776218078543659, + "grad_norm": 0.35938860737810335, + "learning_rate": 8.91260513700338e-05, + "loss": 2.8208, + "step": 18850 + }, + { + "epoch": 0.8776683660404591, + "grad_norm": 0.3657898176071199, + "learning_rate": 8.912436478780482e-05, + "loss": 2.8818, + "step": 18851 + }, + { + "epoch": 0.8777149242265522, + "grad_norm": 0.43029029864040036, + "learning_rate": 8.912267809074965e-05, + "loss": 3.0041, + "step": 18852 + }, + { + "epoch": 0.8777614824126452, + "grad_norm": 0.38619977782909115, + "learning_rate": 8.91209912788733e-05, + "loss": 2.9731, + "step": 18853 + }, + { + "epoch": 0.8778080405987383, + "grad_norm": 0.3806016858399166, + "learning_rate": 8.91193043521807e-05, + "loss": 2.9607, + "step": 18854 + }, + { + "epoch": 0.8778545987848313, + "grad_norm": 0.37843337517337045, + "learning_rate": 8.911761731067677e-05, + "loss": 2.9883, + "step": 18855 + }, + { + "epoch": 0.8779011569709244, + "grad_norm": 0.3192447633725378, + "learning_rate": 8.91159301543665e-05, + "loss": 2.9007, + "step": 18856 + }, + { + "epoch": 0.8779477151570175, + "grad_norm": 0.37253995537604406, + "learning_rate": 8.911424288325482e-05, + "loss": 2.9341, + "step": 18857 + }, + { + "epoch": 0.8779942733431105, + "grad_norm": 0.31295412953395957, + "learning_rate": 8.911255549734671e-05, + "loss": 2.9411, + "step": 18858 + }, + { + "epoch": 0.8780408315292036, + "grad_norm": 0.35515349717669753, + "learning_rate": 8.91108679966471e-05, + "loss": 2.7905, + "step": 18859 + }, + { + "epoch": 0.8780873897152967, + "grad_norm": 0.5320906272795929, + "learning_rate": 8.910918038116093e-05, + "loss": 2.8128, + "step": 18860 + }, + { + "epoch": 0.8781339479013898, + "grad_norm": 0.3291118720530354, + "learning_rate": 8.910749265089317e-05, + "loss": 2.9047, + "step": 18861 + }, + { + "epoch": 0.8781805060874829, + "grad_norm": 0.36948677143602765, + "learning_rate": 8.910580480584878e-05, + "loss": 3.0086, + "step": 18862 + }, + { + "epoch": 0.8782270642735759, + "grad_norm": 0.3044971578700626, + "learning_rate": 8.91041168460327e-05, + "loss": 2.9247, + "step": 18863 + }, + { + "epoch": 0.878273622459669, + "grad_norm": 0.3460857691451286, + "learning_rate": 8.910242877144989e-05, + "loss": 2.9813, + "step": 18864 + }, + { + "epoch": 0.878320180645762, + "grad_norm": 0.33828851003215654, + "learning_rate": 8.910074058210531e-05, + "loss": 2.8872, + "step": 18865 + }, + { + "epoch": 0.8783667388318551, + "grad_norm": 0.3253018293872173, + "learning_rate": 8.909905227800392e-05, + "loss": 2.9167, + "step": 18866 + }, + { + "epoch": 0.8784132970179482, + "grad_norm": 0.33222188039513123, + "learning_rate": 8.909736385915065e-05, + "loss": 2.9745, + "step": 18867 + }, + { + "epoch": 0.8784598552040412, + "grad_norm": 0.33029992366868616, + "learning_rate": 8.909567532555046e-05, + "loss": 2.8615, + "step": 18868 + }, + { + "epoch": 
0.8785064133901344, + "grad_norm": 0.3443936493636901, + "learning_rate": 8.909398667720832e-05, + "loss": 2.866, + "step": 18869 + }, + { + "epoch": 0.8785529715762274, + "grad_norm": 0.32421512868165064, + "learning_rate": 8.909229791412919e-05, + "loss": 2.9282, + "step": 18870 + }, + { + "epoch": 0.8785995297623205, + "grad_norm": 0.3167583470465175, + "learning_rate": 8.9090609036318e-05, + "loss": 2.9976, + "step": 18871 + }, + { + "epoch": 0.8786460879484135, + "grad_norm": 0.33971861080285504, + "learning_rate": 8.908892004377974e-05, + "loss": 2.9022, + "step": 18872 + }, + { + "epoch": 0.8786926461345066, + "grad_norm": 0.33370964478194937, + "learning_rate": 8.908723093651933e-05, + "loss": 3.001, + "step": 18873 + }, + { + "epoch": 0.8787392043205997, + "grad_norm": 0.34020516875960083, + "learning_rate": 8.908554171454177e-05, + "loss": 2.9628, + "step": 18874 + }, + { + "epoch": 0.8787857625066927, + "grad_norm": 0.3265954356986128, + "learning_rate": 8.908385237785199e-05, + "loss": 2.7565, + "step": 18875 + }, + { + "epoch": 0.8788323206927858, + "grad_norm": 0.37744674340281836, + "learning_rate": 8.908216292645495e-05, + "loss": 2.98, + "step": 18876 + }, + { + "epoch": 0.8788788788788788, + "grad_norm": 0.36653313917348107, + "learning_rate": 8.908047336035561e-05, + "loss": 2.8898, + "step": 18877 + }, + { + "epoch": 0.878925437064972, + "grad_norm": 0.371739623774892, + "learning_rate": 8.907878367955893e-05, + "loss": 2.9144, + "step": 18878 + }, + { + "epoch": 0.8789719952510651, + "grad_norm": 0.3704771720911158, + "learning_rate": 8.907709388406987e-05, + "loss": 2.987, + "step": 18879 + }, + { + "epoch": 0.8790185534371581, + "grad_norm": 0.3604806431753785, + "learning_rate": 8.90754039738934e-05, + "loss": 2.8237, + "step": 18880 + }, + { + "epoch": 0.8790651116232512, + "grad_norm": 0.3719502948729409, + "learning_rate": 8.907371394903446e-05, + "loss": 2.9094, + "step": 18881 + }, + { + "epoch": 0.8791116698093442, + "grad_norm": 0.3270010984239108, + "learning_rate": 8.907202380949802e-05, + "loss": 2.8537, + "step": 18882 + }, + { + "epoch": 0.8791582279954373, + "grad_norm": 0.3783960612335421, + "learning_rate": 8.907033355528905e-05, + "loss": 2.9263, + "step": 18883 + }, + { + "epoch": 0.8792047861815304, + "grad_norm": 0.3676380017217656, + "learning_rate": 8.906864318641248e-05, + "loss": 3.0372, + "step": 18884 + }, + { + "epoch": 0.8792513443676234, + "grad_norm": 0.36029832079313706, + "learning_rate": 8.90669527028733e-05, + "loss": 2.8481, + "step": 18885 + }, + { + "epoch": 0.8792979025537165, + "grad_norm": 0.38231383053924634, + "learning_rate": 8.906526210467647e-05, + "loss": 2.8699, + "step": 18886 + }, + { + "epoch": 0.8793444607398095, + "grad_norm": 0.37140101765545136, + "learning_rate": 8.906357139182693e-05, + "loss": 3.0125, + "step": 18887 + }, + { + "epoch": 0.8793910189259027, + "grad_norm": 0.38538024063132537, + "learning_rate": 8.906188056432967e-05, + "loss": 2.8883, + "step": 18888 + }, + { + "epoch": 0.8794375771119958, + "grad_norm": 0.33023037869409316, + "learning_rate": 8.906018962218963e-05, + "loss": 2.9277, + "step": 18889 + }, + { + "epoch": 0.8794841352980888, + "grad_norm": 0.32761692406364373, + "learning_rate": 8.905849856541177e-05, + "loss": 2.8751, + "step": 18890 + }, + { + "epoch": 0.8795306934841819, + "grad_norm": 0.3426400077731136, + "learning_rate": 8.905680739400109e-05, + "loss": 2.9517, + "step": 18891 + }, + { + "epoch": 0.8795772516702749, + "grad_norm": 0.328986007109133, + "learning_rate": 
8.905511610796251e-05, + "loss": 2.9703, + "step": 18892 + }, + { + "epoch": 0.879623809856368, + "grad_norm": 0.34566629185793035, + "learning_rate": 8.905342470730103e-05, + "loss": 2.8725, + "step": 18893 + }, + { + "epoch": 0.879670368042461, + "grad_norm": 0.36849948541593774, + "learning_rate": 8.905173319202158e-05, + "loss": 2.8878, + "step": 18894 + }, + { + "epoch": 0.8797169262285541, + "grad_norm": 0.3411809512522712, + "learning_rate": 8.905004156212915e-05, + "loss": 2.8858, + "step": 18895 + }, + { + "epoch": 0.8797634844146472, + "grad_norm": 0.3872558224177567, + "learning_rate": 8.904834981762869e-05, + "loss": 2.928, + "step": 18896 + }, + { + "epoch": 0.8798100426007402, + "grad_norm": 0.37754200642440405, + "learning_rate": 8.904665795852516e-05, + "loss": 2.9042, + "step": 18897 + }, + { + "epoch": 0.8798566007868334, + "grad_norm": 0.39747144564822284, + "learning_rate": 8.904496598482355e-05, + "loss": 2.9987, + "step": 18898 + }, + { + "epoch": 0.8799031589729264, + "grad_norm": 0.4132932628569907, + "learning_rate": 8.90432738965288e-05, + "loss": 3.0016, + "step": 18899 + }, + { + "epoch": 0.8799497171590195, + "grad_norm": 0.3524538603308161, + "learning_rate": 8.904158169364589e-05, + "loss": 2.8715, + "step": 18900 + }, + { + "epoch": 0.8799962753451126, + "grad_norm": 0.3860713540340227, + "learning_rate": 8.90398893761798e-05, + "loss": 2.8252, + "step": 18901 + }, + { + "epoch": 0.8800428335312056, + "grad_norm": 0.33804462130210333, + "learning_rate": 8.903819694413546e-05, + "loss": 2.7883, + "step": 18902 + }, + { + "epoch": 0.8800893917172987, + "grad_norm": 0.3707750454634537, + "learning_rate": 8.903650439751787e-05, + "loss": 2.9312, + "step": 18903 + }, + { + "epoch": 0.8801359499033917, + "grad_norm": 0.3424909725472264, + "learning_rate": 8.903481173633199e-05, + "loss": 2.8502, + "step": 18904 + }, + { + "epoch": 0.8801825080894848, + "grad_norm": 0.32825294825659773, + "learning_rate": 8.903311896058277e-05, + "loss": 2.8374, + "step": 18905 + }, + { + "epoch": 0.880229066275578, + "grad_norm": 0.36339130541857134, + "learning_rate": 8.90314260702752e-05, + "loss": 2.8596, + "step": 18906 + }, + { + "epoch": 0.880275624461671, + "grad_norm": 0.3399937752020289, + "learning_rate": 8.902973306541424e-05, + "loss": 2.9567, + "step": 18907 + }, + { + "epoch": 0.8803221826477641, + "grad_norm": 0.3450340345541127, + "learning_rate": 8.902803994600487e-05, + "loss": 2.8776, + "step": 18908 + }, + { + "epoch": 0.8803687408338571, + "grad_norm": 0.33758836356951566, + "learning_rate": 8.902634671205201e-05, + "loss": 2.9783, + "step": 18909 + }, + { + "epoch": 0.8804152990199502, + "grad_norm": 0.3068108198286319, + "learning_rate": 8.902465336356069e-05, + "loss": 2.9157, + "step": 18910 + }, + { + "epoch": 0.8804618572060433, + "grad_norm": 0.3286081826285498, + "learning_rate": 8.902295990053586e-05, + "loss": 2.8544, + "step": 18911 + }, + { + "epoch": 0.8805084153921363, + "grad_norm": 0.3080600109732663, + "learning_rate": 8.90212663229825e-05, + "loss": 2.8818, + "step": 18912 + }, + { + "epoch": 0.8805549735782294, + "grad_norm": 0.31853002650805906, + "learning_rate": 8.901957263090554e-05, + "loss": 2.8545, + "step": 18913 + }, + { + "epoch": 0.8806015317643224, + "grad_norm": 0.3204345770250305, + "learning_rate": 8.901787882430999e-05, + "loss": 2.8961, + "step": 18914 + }, + { + "epoch": 0.8806480899504155, + "grad_norm": 0.31320642759394285, + "learning_rate": 8.90161849032008e-05, + "loss": 3.0056, + "step": 18915 + }, + { + "epoch": 
0.8806946481365086, + "grad_norm": 0.33499546386088336, + "learning_rate": 8.901449086758296e-05, + "loss": 2.8877, + "step": 18916 + }, + { + "epoch": 0.8807412063226017, + "grad_norm": 0.3215213041603471, + "learning_rate": 8.901279671746145e-05, + "loss": 2.9118, + "step": 18917 + }, + { + "epoch": 0.8807877645086948, + "grad_norm": 0.32649268200744613, + "learning_rate": 8.90111024528412e-05, + "loss": 2.891, + "step": 18918 + }, + { + "epoch": 0.8808343226947878, + "grad_norm": 0.31060648195462176, + "learning_rate": 8.900940807372722e-05, + "loss": 2.949, + "step": 18919 + }, + { + "epoch": 0.8808808808808809, + "grad_norm": 0.3150288475204292, + "learning_rate": 8.900771358012446e-05, + "loss": 2.85, + "step": 18920 + }, + { + "epoch": 0.8809274390669739, + "grad_norm": 0.3619708741470249, + "learning_rate": 8.900601897203791e-05, + "loss": 2.9764, + "step": 18921 + }, + { + "epoch": 0.880973997253067, + "grad_norm": 0.35448592226378434, + "learning_rate": 8.900432424947253e-05, + "loss": 2.8808, + "step": 18922 + }, + { + "epoch": 0.8810205554391601, + "grad_norm": 0.36519733744318894, + "learning_rate": 8.900262941243332e-05, + "loss": 2.9062, + "step": 18923 + }, + { + "epoch": 0.8810671136252531, + "grad_norm": 0.3450273983286561, + "learning_rate": 8.900093446092522e-05, + "loss": 2.9306, + "step": 18924 + }, + { + "epoch": 0.8811136718113463, + "grad_norm": 0.36309574215743107, + "learning_rate": 8.899923939495322e-05, + "loss": 2.8828, + "step": 18925 + }, + { + "epoch": 0.8811602299974393, + "grad_norm": 0.3647720102120761, + "learning_rate": 8.89975442145223e-05, + "loss": 2.9288, + "step": 18926 + }, + { + "epoch": 0.8812067881835324, + "grad_norm": 0.31689002989402615, + "learning_rate": 8.899584891963742e-05, + "loss": 2.8456, + "step": 18927 + }, + { + "epoch": 0.8812533463696255, + "grad_norm": 0.36409748842577716, + "learning_rate": 8.899415351030359e-05, + "loss": 2.9066, + "step": 18928 + }, + { + "epoch": 0.8812999045557185, + "grad_norm": 0.34301365456044863, + "learning_rate": 8.899245798652573e-05, + "loss": 2.9254, + "step": 18929 + }, + { + "epoch": 0.8813464627418116, + "grad_norm": 0.32520559196184046, + "learning_rate": 8.899076234830886e-05, + "loss": 2.8502, + "step": 18930 + }, + { + "epoch": 0.8813930209279046, + "grad_norm": 0.3216076094153156, + "learning_rate": 8.898906659565794e-05, + "loss": 2.8858, + "step": 18931 + }, + { + "epoch": 0.8814395791139977, + "grad_norm": 0.3585090823318023, + "learning_rate": 8.898737072857796e-05, + "loss": 2.9025, + "step": 18932 + }, + { + "epoch": 0.8814861373000907, + "grad_norm": 0.340738171639014, + "learning_rate": 8.898567474707388e-05, + "loss": 2.9214, + "step": 18933 + }, + { + "epoch": 0.8815326954861838, + "grad_norm": 0.32829085088658494, + "learning_rate": 8.898397865115069e-05, + "loss": 2.8296, + "step": 18934 + }, + { + "epoch": 0.881579253672277, + "grad_norm": 0.30332673720773423, + "learning_rate": 8.898228244081335e-05, + "loss": 2.957, + "step": 18935 + }, + { + "epoch": 0.88162581185837, + "grad_norm": 0.35294710326301737, + "learning_rate": 8.898058611606687e-05, + "loss": 2.9224, + "step": 18936 + }, + { + "epoch": 0.8816723700444631, + "grad_norm": 0.31985957849593216, + "learning_rate": 8.89788896769162e-05, + "loss": 2.8605, + "step": 18937 + }, + { + "epoch": 0.8817189282305561, + "grad_norm": 0.3441194312638035, + "learning_rate": 8.897719312336634e-05, + "loss": 2.8974, + "step": 18938 + }, + { + "epoch": 0.8817654864166492, + "grad_norm": 0.35021662409465965, + "learning_rate": 
8.897549645542224e-05, + "loss": 2.9041, + "step": 18939 + }, + { + "epoch": 0.8818120446027423, + "grad_norm": 0.32715534078633973, + "learning_rate": 8.897379967308891e-05, + "loss": 2.8271, + "step": 18940 + }, + { + "epoch": 0.8818586027888353, + "grad_norm": 0.34112324670670513, + "learning_rate": 8.897210277637132e-05, + "loss": 2.863, + "step": 18941 + }, + { + "epoch": 0.8819051609749284, + "grad_norm": 0.3011906806874099, + "learning_rate": 8.897040576527443e-05, + "loss": 2.8832, + "step": 18942 + }, + { + "epoch": 0.8819517191610214, + "grad_norm": 0.3144971850845054, + "learning_rate": 8.896870863980326e-05, + "loss": 2.9662, + "step": 18943 + }, + { + "epoch": 0.8819982773471146, + "grad_norm": 0.3297441743682976, + "learning_rate": 8.896701139996275e-05, + "loss": 2.9453, + "step": 18944 + }, + { + "epoch": 0.8820448355332077, + "grad_norm": 0.3478422262746245, + "learning_rate": 8.896531404575792e-05, + "loss": 2.9712, + "step": 18945 + }, + { + "epoch": 0.8820913937193007, + "grad_norm": 0.32875743509385524, + "learning_rate": 8.896361657719372e-05, + "loss": 2.9797, + "step": 18946 + }, + { + "epoch": 0.8821379519053938, + "grad_norm": 0.3137921274209256, + "learning_rate": 8.896191899427515e-05, + "loss": 2.9183, + "step": 18947 + }, + { + "epoch": 0.8821845100914868, + "grad_norm": 0.3564986608812648, + "learning_rate": 8.896022129700717e-05, + "loss": 2.9293, + "step": 18948 + }, + { + "epoch": 0.8822310682775799, + "grad_norm": 0.31187085017942784, + "learning_rate": 8.89585234853948e-05, + "loss": 2.8259, + "step": 18949 + }, + { + "epoch": 0.882277626463673, + "grad_norm": 0.32497893064164, + "learning_rate": 8.895682555944298e-05, + "loss": 2.9181, + "step": 18950 + }, + { + "epoch": 0.882324184649766, + "grad_norm": 0.3320347783670484, + "learning_rate": 8.895512751915674e-05, + "loss": 2.9433, + "step": 18951 + }, + { + "epoch": 0.8823707428358591, + "grad_norm": 0.3861589761962868, + "learning_rate": 8.895342936454102e-05, + "loss": 2.9551, + "step": 18952 + }, + { + "epoch": 0.8824173010219521, + "grad_norm": 0.3104967392447592, + "learning_rate": 8.895173109560082e-05, + "loss": 2.8395, + "step": 18953 + }, + { + "epoch": 0.8824638592080453, + "grad_norm": 0.37593719993592123, + "learning_rate": 8.895003271234114e-05, + "loss": 2.865, + "step": 18954 + }, + { + "epoch": 0.8825104173941383, + "grad_norm": 0.3462891283684086, + "learning_rate": 8.894833421476695e-05, + "loss": 2.9812, + "step": 18955 + }, + { + "epoch": 0.8825569755802314, + "grad_norm": 0.3243777678815722, + "learning_rate": 8.894663560288323e-05, + "loss": 2.9627, + "step": 18956 + }, + { + "epoch": 0.8826035337663245, + "grad_norm": 0.34866619493950474, + "learning_rate": 8.894493687669496e-05, + "loss": 2.8734, + "step": 18957 + }, + { + "epoch": 0.8826500919524175, + "grad_norm": 0.348299883757811, + "learning_rate": 8.894323803620717e-05, + "loss": 2.962, + "step": 18958 + }, + { + "epoch": 0.8826966501385106, + "grad_norm": 0.32436389855971137, + "learning_rate": 8.894153908142479e-05, + "loss": 2.8888, + "step": 18959 + }, + { + "epoch": 0.8827432083246036, + "grad_norm": 0.3438193276887788, + "learning_rate": 8.893984001235283e-05, + "loss": 2.9065, + "step": 18960 + }, + { + "epoch": 0.8827897665106967, + "grad_norm": 0.3366051165363533, + "learning_rate": 8.893814082899627e-05, + "loss": 2.8566, + "step": 18961 + }, + { + "epoch": 0.8828363246967899, + "grad_norm": 0.34010397955374166, + "learning_rate": 8.893644153136012e-05, + "loss": 2.9637, + "step": 18962 + }, + { + "epoch": 
0.8828828828828829, + "grad_norm": 0.3558593060692551, + "learning_rate": 8.893474211944935e-05, + "loss": 2.9071, + "step": 18963 + }, + { + "epoch": 0.882929441068976, + "grad_norm": 0.34593532244429126, + "learning_rate": 8.893304259326893e-05, + "loss": 2.9283, + "step": 18964 + }, + { + "epoch": 0.882975999255069, + "grad_norm": 0.3532589579320197, + "learning_rate": 8.893134295282387e-05, + "loss": 2.9369, + "step": 18965 + }, + { + "epoch": 0.8830225574411621, + "grad_norm": 0.30370208434899254, + "learning_rate": 8.892964319811916e-05, + "loss": 3.0085, + "step": 18966 + }, + { + "epoch": 0.8830691156272552, + "grad_norm": 0.34978229812561834, + "learning_rate": 8.892794332915978e-05, + "loss": 2.97, + "step": 18967 + }, + { + "epoch": 0.8831156738133482, + "grad_norm": 0.2916966429996343, + "learning_rate": 8.892624334595073e-05, + "loss": 2.9318, + "step": 18968 + }, + { + "epoch": 0.8831622319994413, + "grad_norm": 0.3444422628469899, + "learning_rate": 8.892454324849697e-05, + "loss": 2.872, + "step": 18969 + }, + { + "epoch": 0.8832087901855343, + "grad_norm": 0.3051174838158541, + "learning_rate": 8.892284303680352e-05, + "loss": 2.8811, + "step": 18970 + }, + { + "epoch": 0.8832553483716274, + "grad_norm": 0.32636281773799947, + "learning_rate": 8.892114271087537e-05, + "loss": 2.8483, + "step": 18971 + }, + { + "epoch": 0.8833019065577206, + "grad_norm": 0.3017882323055744, + "learning_rate": 8.891944227071749e-05, + "loss": 2.9457, + "step": 18972 + }, + { + "epoch": 0.8833484647438136, + "grad_norm": 0.3068336064188851, + "learning_rate": 8.891774171633489e-05, + "loss": 2.9541, + "step": 18973 + }, + { + "epoch": 0.8833950229299067, + "grad_norm": 0.3078458793979064, + "learning_rate": 8.891604104773254e-05, + "loss": 2.8801, + "step": 18974 + }, + { + "epoch": 0.8834415811159997, + "grad_norm": 0.31082786568868787, + "learning_rate": 8.891434026491546e-05, + "loss": 2.8671, + "step": 18975 + }, + { + "epoch": 0.8834881393020928, + "grad_norm": 0.31496321166286845, + "learning_rate": 8.891263936788861e-05, + "loss": 2.8606, + "step": 18976 + }, + { + "epoch": 0.8835346974881858, + "grad_norm": 0.33841086381181024, + "learning_rate": 8.8910938356657e-05, + "loss": 2.9035, + "step": 18977 + }, + { + "epoch": 0.8835812556742789, + "grad_norm": 0.3089391841968844, + "learning_rate": 8.890923723122562e-05, + "loss": 2.9784, + "step": 18978 + }, + { + "epoch": 0.883627813860372, + "grad_norm": 0.34253509259651743, + "learning_rate": 8.890753599159946e-05, + "loss": 2.8634, + "step": 18979 + }, + { + "epoch": 0.883674372046465, + "grad_norm": 0.309893717267656, + "learning_rate": 8.890583463778352e-05, + "loss": 2.9039, + "step": 18980 + }, + { + "epoch": 0.8837209302325582, + "grad_norm": 0.34171972701555875, + "learning_rate": 8.890413316978279e-05, + "loss": 2.971, + "step": 18981 + }, + { + "epoch": 0.8837674884186512, + "grad_norm": 0.3041798259806945, + "learning_rate": 8.890243158760226e-05, + "loss": 2.8408, + "step": 18982 + }, + { + "epoch": 0.8838140466047443, + "grad_norm": 0.32451363657105814, + "learning_rate": 8.890072989124691e-05, + "loss": 2.9976, + "step": 18983 + }, + { + "epoch": 0.8838606047908374, + "grad_norm": 0.30429185386309704, + "learning_rate": 8.889902808072177e-05, + "loss": 2.8891, + "step": 18984 + }, + { + "epoch": 0.8839071629769304, + "grad_norm": 0.31361830998396306, + "learning_rate": 8.889732615603182e-05, + "loss": 2.9185, + "step": 18985 + }, + { + "epoch": 0.8839537211630235, + "grad_norm": 0.33595264575686457, + "learning_rate": 
8.889562411718202e-05, + "loss": 2.9846, + "step": 18986 + }, + { + "epoch": 0.8840002793491165, + "grad_norm": 0.2826603619952797, + "learning_rate": 8.889392196417742e-05, + "loss": 3.0168, + "step": 18987 + }, + { + "epoch": 0.8840468375352096, + "grad_norm": 0.3095056147570953, + "learning_rate": 8.889221969702299e-05, + "loss": 2.8809, + "step": 18988 + }, + { + "epoch": 0.8840933957213027, + "grad_norm": 0.29429869151698707, + "learning_rate": 8.88905173157237e-05, + "loss": 3.0016, + "step": 18989 + }, + { + "epoch": 0.8841399539073957, + "grad_norm": 0.34786427266101483, + "learning_rate": 8.888881482028462e-05, + "loss": 2.9568, + "step": 18990 + }, + { + "epoch": 0.8841865120934889, + "grad_norm": 0.32860441588547157, + "learning_rate": 8.888711221071067e-05, + "loss": 2.8642, + "step": 18991 + }, + { + "epoch": 0.8842330702795819, + "grad_norm": 0.31176512327871975, + "learning_rate": 8.888540948700687e-05, + "loss": 2.8499, + "step": 18992 + }, + { + "epoch": 0.884279628465675, + "grad_norm": 0.2992651796448732, + "learning_rate": 8.888370664917823e-05, + "loss": 2.9543, + "step": 18993 + }, + { + "epoch": 0.8843261866517681, + "grad_norm": 0.32797551606788683, + "learning_rate": 8.888200369722976e-05, + "loss": 2.9221, + "step": 18994 + }, + { + "epoch": 0.8843727448378611, + "grad_norm": 0.3399483082418666, + "learning_rate": 8.888030063116641e-05, + "loss": 2.8826, + "step": 18995 + }, + { + "epoch": 0.8844193030239542, + "grad_norm": 0.31131054347644, + "learning_rate": 8.887859745099323e-05, + "loss": 2.8636, + "step": 18996 + }, + { + "epoch": 0.8844658612100472, + "grad_norm": 0.314210708474364, + "learning_rate": 8.887689415671519e-05, + "loss": 2.9866, + "step": 18997 + }, + { + "epoch": 0.8845124193961403, + "grad_norm": 0.3615947821290906, + "learning_rate": 8.88751907483373e-05, + "loss": 2.8565, + "step": 18998 + }, + { + "epoch": 0.8845589775822333, + "grad_norm": 0.3475757928725353, + "learning_rate": 8.887348722586455e-05, + "loss": 2.8362, + "step": 18999 + }, + { + "epoch": 0.8846055357683265, + "grad_norm": 0.3367918472409373, + "learning_rate": 8.887178358930195e-05, + "loss": 2.9529, + "step": 19000 + }, + { + "epoch": 0.8846520939544196, + "grad_norm": 0.35914667029504815, + "learning_rate": 8.887007983865448e-05, + "loss": 2.8786, + "step": 19001 + }, + { + "epoch": 0.8846986521405126, + "grad_norm": 0.3693053083865305, + "learning_rate": 8.886837597392717e-05, + "loss": 2.8998, + "step": 19002 + }, + { + "epoch": 0.8847452103266057, + "grad_norm": 0.3566294837972525, + "learning_rate": 8.886667199512499e-05, + "loss": 2.9709, + "step": 19003 + }, + { + "epoch": 0.8847917685126987, + "grad_norm": 0.364710200305146, + "learning_rate": 8.886496790225298e-05, + "loss": 2.9612, + "step": 19004 + }, + { + "epoch": 0.8848383266987918, + "grad_norm": 0.3578797640211954, + "learning_rate": 8.88632636953161e-05, + "loss": 3.0121, + "step": 19005 + }, + { + "epoch": 0.8848848848848849, + "grad_norm": 0.3456669054601245, + "learning_rate": 8.886155937431938e-05, + "loss": 2.9299, + "step": 19006 + }, + { + "epoch": 0.8849314430709779, + "grad_norm": 0.3523113576135361, + "learning_rate": 8.88598549392678e-05, + "loss": 2.7905, + "step": 19007 + }, + { + "epoch": 0.884978001257071, + "grad_norm": 0.35435605027015876, + "learning_rate": 8.885815039016638e-05, + "loss": 2.8507, + "step": 19008 + }, + { + "epoch": 0.885024559443164, + "grad_norm": 0.34324425151080146, + "learning_rate": 8.885644572702012e-05, + "loss": 2.9145, + "step": 19009 + }, + { + "epoch": 
0.8850711176292572, + "grad_norm": 0.3485405993537595, + "learning_rate": 8.885474094983403e-05, + "loss": 2.9167, + "step": 19010 + }, + { + "epoch": 0.8851176758153503, + "grad_norm": 0.3165703539965064, + "learning_rate": 8.885303605861308e-05, + "loss": 2.8562, + "step": 19011 + }, + { + "epoch": 0.8851642340014433, + "grad_norm": 0.35819349750857643, + "learning_rate": 8.885133105336232e-05, + "loss": 2.8627, + "step": 19012 + }, + { + "epoch": 0.8852107921875364, + "grad_norm": 0.3020693643938421, + "learning_rate": 8.884962593408671e-05, + "loss": 2.8663, + "step": 19013 + }, + { + "epoch": 0.8852573503736294, + "grad_norm": 0.3607075391628542, + "learning_rate": 8.884792070079128e-05, + "loss": 2.9293, + "step": 19014 + }, + { + "epoch": 0.8853039085597225, + "grad_norm": 0.3259931236253437, + "learning_rate": 8.884621535348103e-05, + "loss": 2.8816, + "step": 19015 + }, + { + "epoch": 0.8853504667458156, + "grad_norm": 0.34273707410535403, + "learning_rate": 8.884450989216098e-05, + "loss": 2.823, + "step": 19016 + }, + { + "epoch": 0.8853970249319086, + "grad_norm": 0.3376633425470465, + "learning_rate": 8.884280431683612e-05, + "loss": 2.8009, + "step": 19017 + }, + { + "epoch": 0.8854435831180018, + "grad_norm": 0.38127624466890975, + "learning_rate": 8.884109862751142e-05, + "loss": 2.9982, + "step": 19018 + }, + { + "epoch": 0.8854901413040948, + "grad_norm": 0.3388171269766522, + "learning_rate": 8.883939282419195e-05, + "loss": 2.8203, + "step": 19019 + }, + { + "epoch": 0.8855366994901879, + "grad_norm": 0.3590660803506023, + "learning_rate": 8.883768690688268e-05, + "loss": 2.9038, + "step": 19020 + }, + { + "epoch": 0.8855832576762809, + "grad_norm": 0.3742356115269094, + "learning_rate": 8.883598087558864e-05, + "loss": 2.9807, + "step": 19021 + }, + { + "epoch": 0.885629815862374, + "grad_norm": 0.33456621596429803, + "learning_rate": 8.883427473031482e-05, + "loss": 2.9134, + "step": 19022 + }, + { + "epoch": 0.8856763740484671, + "grad_norm": 0.3460703645394613, + "learning_rate": 8.883256847106622e-05, + "loss": 2.905, + "step": 19023 + }, + { + "epoch": 0.8857229322345601, + "grad_norm": 0.3332161231844268, + "learning_rate": 8.883086209784785e-05, + "loss": 2.9158, + "step": 19024 + }, + { + "epoch": 0.8857694904206532, + "grad_norm": 0.3567743607831471, + "learning_rate": 8.882915561066473e-05, + "loss": 2.8299, + "step": 19025 + }, + { + "epoch": 0.8858160486067462, + "grad_norm": 0.34383998717472014, + "learning_rate": 8.882744900952187e-05, + "loss": 2.9011, + "step": 19026 + }, + { + "epoch": 0.8858626067928393, + "grad_norm": 0.3624817088583445, + "learning_rate": 8.882574229442427e-05, + "loss": 2.8776, + "step": 19027 + }, + { + "epoch": 0.8859091649789325, + "grad_norm": 0.37292518301672883, + "learning_rate": 8.882403546537695e-05, + "loss": 2.9016, + "step": 19028 + }, + { + "epoch": 0.8859557231650255, + "grad_norm": 0.3639257169412624, + "learning_rate": 8.88223285223849e-05, + "loss": 2.9495, + "step": 19029 + }, + { + "epoch": 0.8860022813511186, + "grad_norm": 0.35224418223084153, + "learning_rate": 8.882062146545315e-05, + "loss": 3.0188, + "step": 19030 + }, + { + "epoch": 0.8860488395372116, + "grad_norm": 0.3259558539734221, + "learning_rate": 8.881891429458668e-05, + "loss": 2.7325, + "step": 19031 + }, + { + "epoch": 0.8860953977233047, + "grad_norm": 0.3476719894420038, + "learning_rate": 8.881720700979055e-05, + "loss": 2.9107, + "step": 19032 + }, + { + "epoch": 0.8861419559093978, + "grad_norm": 0.31983786255338514, + "learning_rate": 
8.881549961106971e-05, + "loss": 2.8825, + "step": 19033 + }, + { + "epoch": 0.8861885140954908, + "grad_norm": 0.32696282913756797, + "learning_rate": 8.881379209842922e-05, + "loss": 2.8965, + "step": 19034 + }, + { + "epoch": 0.8862350722815839, + "grad_norm": 0.3284909685856209, + "learning_rate": 8.881208447187408e-05, + "loss": 2.8155, + "step": 19035 + }, + { + "epoch": 0.8862816304676769, + "grad_norm": 0.3290378584791138, + "learning_rate": 8.881037673140928e-05, + "loss": 2.8058, + "step": 19036 + }, + { + "epoch": 0.88632818865377, + "grad_norm": 0.32365976516880113, + "learning_rate": 8.880866887703986e-05, + "loss": 2.8915, + "step": 19037 + }, + { + "epoch": 0.8863747468398632, + "grad_norm": 0.3539592661511632, + "learning_rate": 8.880696090877082e-05, + "loss": 2.9543, + "step": 19038 + }, + { + "epoch": 0.8864213050259562, + "grad_norm": 0.3518914771670924, + "learning_rate": 8.880525282660717e-05, + "loss": 2.9253, + "step": 19039 + }, + { + "epoch": 0.8864678632120493, + "grad_norm": 0.34921796879319966, + "learning_rate": 8.880354463055391e-05, + "loss": 2.9747, + "step": 19040 + }, + { + "epoch": 0.8865144213981423, + "grad_norm": 0.32593269360812355, + "learning_rate": 8.880183632061609e-05, + "loss": 2.8895, + "step": 19041 + }, + { + "epoch": 0.8865609795842354, + "grad_norm": 0.40085497687621663, + "learning_rate": 8.88001278967987e-05, + "loss": 3.0269, + "step": 19042 + }, + { + "epoch": 0.8866075377703284, + "grad_norm": 0.32411091869910663, + "learning_rate": 8.879841935910675e-05, + "loss": 2.8773, + "step": 19043 + }, + { + "epoch": 0.8866540959564215, + "grad_norm": 0.39237947245073834, + "learning_rate": 8.879671070754527e-05, + "loss": 2.9255, + "step": 19044 + }, + { + "epoch": 0.8867006541425146, + "grad_norm": 0.3384436861314633, + "learning_rate": 8.879500194211926e-05, + "loss": 2.9305, + "step": 19045 + }, + { + "epoch": 0.8867472123286076, + "grad_norm": 0.4049754413203986, + "learning_rate": 8.879329306283375e-05, + "loss": 2.8984, + "step": 19046 + }, + { + "epoch": 0.8867937705147008, + "grad_norm": 0.34727795361453095, + "learning_rate": 8.879158406969373e-05, + "loss": 2.9639, + "step": 19047 + }, + { + "epoch": 0.8868403287007938, + "grad_norm": 0.3613047155538009, + "learning_rate": 8.878987496270425e-05, + "loss": 2.9264, + "step": 19048 + }, + { + "epoch": 0.8868868868868869, + "grad_norm": 0.3445371778761661, + "learning_rate": 8.87881657418703e-05, + "loss": 2.9999, + "step": 19049 + }, + { + "epoch": 0.88693344507298, + "grad_norm": 0.320353711844661, + "learning_rate": 8.878645640719691e-05, + "loss": 2.8411, + "step": 19050 + }, + { + "epoch": 0.886980003259073, + "grad_norm": 0.32510689046126645, + "learning_rate": 8.878474695868908e-05, + "loss": 2.9201, + "step": 19051 + }, + { + "epoch": 0.8870265614451661, + "grad_norm": 0.34148718440301384, + "learning_rate": 8.878303739635184e-05, + "loss": 2.9482, + "step": 19052 + }, + { + "epoch": 0.8870731196312591, + "grad_norm": 0.34490606958990316, + "learning_rate": 8.878132772019021e-05, + "loss": 2.8985, + "step": 19053 + }, + { + "epoch": 0.8871196778173522, + "grad_norm": 0.3568869686599522, + "learning_rate": 8.877961793020921e-05, + "loss": 2.9557, + "step": 19054 + }, + { + "epoch": 0.8871662360034454, + "grad_norm": 0.3193624097110016, + "learning_rate": 8.877790802641385e-05, + "loss": 2.8467, + "step": 19055 + }, + { + "epoch": 0.8872127941895384, + "grad_norm": 0.3546558105931589, + "learning_rate": 8.877619800880914e-05, + "loss": 2.9928, + "step": 19056 + }, + { + "epoch": 
0.8872593523756315, + "grad_norm": 0.3484272004991258, + "learning_rate": 8.877448787740013e-05, + "loss": 2.9818, + "step": 19057 + }, + { + "epoch": 0.8873059105617245, + "grad_norm": 0.34139213808900676, + "learning_rate": 8.877277763219179e-05, + "loss": 2.9194, + "step": 19058 + }, + { + "epoch": 0.8873524687478176, + "grad_norm": 0.3423990360259735, + "learning_rate": 8.877106727318918e-05, + "loss": 2.9049, + "step": 19059 + }, + { + "epoch": 0.8873990269339107, + "grad_norm": 0.3366020095975055, + "learning_rate": 8.876935680039731e-05, + "loss": 2.8995, + "step": 19060 + }, + { + "epoch": 0.8874455851200037, + "grad_norm": 0.35520588741337167, + "learning_rate": 8.876764621382121e-05, + "loss": 2.8856, + "step": 19061 + }, + { + "epoch": 0.8874921433060968, + "grad_norm": 0.3491105494462168, + "learning_rate": 8.876593551346586e-05, + "loss": 2.9544, + "step": 19062 + }, + { + "epoch": 0.8875387014921898, + "grad_norm": 0.3614611288198602, + "learning_rate": 8.876422469933632e-05, + "loss": 2.9649, + "step": 19063 + }, + { + "epoch": 0.887585259678283, + "grad_norm": 0.35512933702498684, + "learning_rate": 8.87625137714376e-05, + "loss": 2.9916, + "step": 19064 + }, + { + "epoch": 0.887631817864376, + "grad_norm": 0.3373712629169577, + "learning_rate": 8.876080272977471e-05, + "loss": 2.9574, + "step": 19065 + }, + { + "epoch": 0.8876783760504691, + "grad_norm": 0.40073944342368617, + "learning_rate": 8.875909157435271e-05, + "loss": 2.9209, + "step": 19066 + }, + { + "epoch": 0.8877249342365622, + "grad_norm": 0.3467315885568534, + "learning_rate": 8.875738030517657e-05, + "loss": 2.8746, + "step": 19067 + }, + { + "epoch": 0.8877714924226552, + "grad_norm": 0.33777793775337855, + "learning_rate": 8.875566892225136e-05, + "loss": 2.9195, + "step": 19068 + }, + { + "epoch": 0.8878180506087483, + "grad_norm": 0.3537624945975245, + "learning_rate": 8.875395742558206e-05, + "loss": 2.9235, + "step": 19069 + }, + { + "epoch": 0.8878646087948413, + "grad_norm": 0.33564061672122975, + "learning_rate": 8.875224581517372e-05, + "loss": 3.0458, + "step": 19070 + }, + { + "epoch": 0.8879111669809344, + "grad_norm": 0.33587691785557455, + "learning_rate": 8.875053409103136e-05, + "loss": 2.9086, + "step": 19071 + }, + { + "epoch": 0.8879577251670275, + "grad_norm": 0.3697435745744374, + "learning_rate": 8.874882225316e-05, + "loss": 2.9649, + "step": 19072 + }, + { + "epoch": 0.8880042833531205, + "grad_norm": 0.38928453900446064, + "learning_rate": 8.874711030156464e-05, + "loss": 2.8742, + "step": 19073 + }, + { + "epoch": 0.8880508415392137, + "grad_norm": 0.3360616046364694, + "learning_rate": 8.874539823625036e-05, + "loss": 2.853, + "step": 19074 + }, + { + "epoch": 0.8880973997253067, + "grad_norm": 0.3753670332017623, + "learning_rate": 8.874368605722214e-05, + "loss": 2.8263, + "step": 19075 + }, + { + "epoch": 0.8881439579113998, + "grad_norm": 0.34326997539618187, + "learning_rate": 8.8741973764485e-05, + "loss": 2.9216, + "step": 19076 + }, + { + "epoch": 0.8881905160974929, + "grad_norm": 0.33339903337720045, + "learning_rate": 8.874026135804402e-05, + "loss": 2.9515, + "step": 19077 + }, + { + "epoch": 0.8882370742835859, + "grad_norm": 0.33575298142202586, + "learning_rate": 8.873854883790418e-05, + "loss": 2.8575, + "step": 19078 + }, + { + "epoch": 0.888283632469679, + "grad_norm": 0.3388375593178758, + "learning_rate": 8.873683620407049e-05, + "loss": 2.8548, + "step": 19079 + }, + { + "epoch": 0.888330190655772, + "grad_norm": 0.33438568814822994, + "learning_rate": 
8.873512345654803e-05, + "loss": 2.8371, + "step": 19080 + }, + { + "epoch": 0.8883767488418651, + "grad_norm": 0.3084310748117484, + "learning_rate": 8.873341059534179e-05, + "loss": 2.844, + "step": 19081 + }, + { + "epoch": 0.8884233070279582, + "grad_norm": 0.35095642200878696, + "learning_rate": 8.87316976204568e-05, + "loss": 2.9467, + "step": 19082 + }, + { + "epoch": 0.8884698652140512, + "grad_norm": 0.31749571231442864, + "learning_rate": 8.872998453189809e-05, + "loss": 2.9159, + "step": 19083 + }, + { + "epoch": 0.8885164234001444, + "grad_norm": 0.3554437838483695, + "learning_rate": 8.872827132967071e-05, + "loss": 2.9558, + "step": 19084 + }, + { + "epoch": 0.8885629815862374, + "grad_norm": 0.3460866836064329, + "learning_rate": 8.872655801377965e-05, + "loss": 2.7901, + "step": 19085 + }, + { + "epoch": 0.8886095397723305, + "grad_norm": 0.32569655038025586, + "learning_rate": 8.872484458422997e-05, + "loss": 2.932, + "step": 19086 + }, + { + "epoch": 0.8886560979584235, + "grad_norm": 0.32168801186899615, + "learning_rate": 8.872313104102667e-05, + "loss": 2.8272, + "step": 19087 + }, + { + "epoch": 0.8887026561445166, + "grad_norm": 0.3364718054388076, + "learning_rate": 8.872141738417482e-05, + "loss": 2.7607, + "step": 19088 + }, + { + "epoch": 0.8887492143306097, + "grad_norm": 0.3246673754604961, + "learning_rate": 8.871970361367941e-05, + "loss": 2.9188, + "step": 19089 + }, + { + "epoch": 0.8887957725167027, + "grad_norm": 0.31925562870692276, + "learning_rate": 8.871798972954548e-05, + "loss": 2.7746, + "step": 19090 + }, + { + "epoch": 0.8888423307027958, + "grad_norm": 0.33505196750275984, + "learning_rate": 8.871627573177808e-05, + "loss": 2.884, + "step": 19091 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3213742929651804, + "learning_rate": 8.87145616203822e-05, + "loss": 3.0473, + "step": 19092 + }, + { + "epoch": 0.888935447074982, + "grad_norm": 0.3069871647003835, + "learning_rate": 8.871284739536292e-05, + "loss": 2.9846, + "step": 19093 + }, + { + "epoch": 0.8889820052610751, + "grad_norm": 0.30564237156545715, + "learning_rate": 8.871113305672524e-05, + "loss": 2.979, + "step": 19094 + }, + { + "epoch": 0.8890285634471681, + "grad_norm": 0.39777276127687844, + "learning_rate": 8.87094186044742e-05, + "loss": 2.9443, + "step": 19095 + }, + { + "epoch": 0.8890751216332612, + "grad_norm": 0.32921901733870773, + "learning_rate": 8.870770403861482e-05, + "loss": 2.898, + "step": 19096 + }, + { + "epoch": 0.8891216798193542, + "grad_norm": 0.3468358861390082, + "learning_rate": 8.870598935915215e-05, + "loss": 2.8367, + "step": 19097 + }, + { + "epoch": 0.8891682380054473, + "grad_norm": 0.3154079172349521, + "learning_rate": 8.87042745660912e-05, + "loss": 2.9025, + "step": 19098 + }, + { + "epoch": 0.8892147961915404, + "grad_norm": 0.3340225068298924, + "learning_rate": 8.870255965943703e-05, + "loss": 2.8422, + "step": 19099 + }, + { + "epoch": 0.8892613543776334, + "grad_norm": 0.3328864157623076, + "learning_rate": 8.870084463919466e-05, + "loss": 2.8028, + "step": 19100 + }, + { + "epoch": 0.8893079125637265, + "grad_norm": 0.347132426270702, + "learning_rate": 8.869912950536913e-05, + "loss": 2.8309, + "step": 19101 + }, + { + "epoch": 0.8893544707498195, + "grad_norm": 0.35315471859522357, + "learning_rate": 8.869741425796545e-05, + "loss": 2.8879, + "step": 19102 + }, + { + "epoch": 0.8894010289359127, + "grad_norm": 0.33802319672490483, + "learning_rate": 8.86956988969887e-05, + "loss": 2.9182, + "step": 19103 + }, + { + "epoch": 
0.8894475871220058, + "grad_norm": 0.3418883606496874, + "learning_rate": 8.869398342244386e-05, + "loss": 2.8798, + "step": 19104 + }, + { + "epoch": 0.8894941453080988, + "grad_norm": 0.32941046805798097, + "learning_rate": 8.869226783433599e-05, + "loss": 2.892, + "step": 19105 + }, + { + "epoch": 0.8895407034941919, + "grad_norm": 0.37265244115734125, + "learning_rate": 8.869055213267014e-05, + "loss": 2.9456, + "step": 19106 + }, + { + "epoch": 0.8895872616802849, + "grad_norm": 0.3318577465675525, + "learning_rate": 8.868883631745133e-05, + "loss": 2.9196, + "step": 19107 + }, + { + "epoch": 0.889633819866378, + "grad_norm": 0.3959024788445336, + "learning_rate": 8.868712038868459e-05, + "loss": 2.8852, + "step": 19108 + }, + { + "epoch": 0.889680378052471, + "grad_norm": 0.3860057673046691, + "learning_rate": 8.868540434637495e-05, + "loss": 3.0143, + "step": 19109 + }, + { + "epoch": 0.8897269362385641, + "grad_norm": 0.36160296246107165, + "learning_rate": 8.868368819052748e-05, + "loss": 2.8806, + "step": 19110 + }, + { + "epoch": 0.8897734944246573, + "grad_norm": 0.32467518283036956, + "learning_rate": 8.86819719211472e-05, + "loss": 2.8826, + "step": 19111 + }, + { + "epoch": 0.8898200526107503, + "grad_norm": 0.359320357652271, + "learning_rate": 8.868025553823913e-05, + "loss": 2.8593, + "step": 19112 + }, + { + "epoch": 0.8898666107968434, + "grad_norm": 0.34376108296332236, + "learning_rate": 8.867853904180832e-05, + "loss": 2.9662, + "step": 19113 + }, + { + "epoch": 0.8899131689829364, + "grad_norm": 0.37741525507737833, + "learning_rate": 8.867682243185981e-05, + "loss": 2.8973, + "step": 19114 + }, + { + "epoch": 0.8899597271690295, + "grad_norm": 0.38532714531782025, + "learning_rate": 8.867510570839865e-05, + "loss": 2.8877, + "step": 19115 + }, + { + "epoch": 0.8900062853551226, + "grad_norm": 0.3551487955023964, + "learning_rate": 8.867338887142985e-05, + "loss": 2.9298, + "step": 19116 + }, + { + "epoch": 0.8900528435412156, + "grad_norm": 0.37935726292542843, + "learning_rate": 8.867167192095847e-05, + "loss": 2.9521, + "step": 19117 + }, + { + "epoch": 0.8900994017273087, + "grad_norm": 0.37355330427863137, + "learning_rate": 8.866995485698954e-05, + "loss": 2.9509, + "step": 19118 + }, + { + "epoch": 0.8901459599134017, + "grad_norm": 0.40570305074160923, + "learning_rate": 8.86682376795281e-05, + "loss": 2.8897, + "step": 19119 + }, + { + "epoch": 0.8901925180994948, + "grad_norm": 0.37761318222694945, + "learning_rate": 8.866652038857919e-05, + "loss": 2.9715, + "step": 19120 + }, + { + "epoch": 0.890239076285588, + "grad_norm": 0.37386207814831907, + "learning_rate": 8.866480298414786e-05, + "loss": 2.8578, + "step": 19121 + }, + { + "epoch": 0.890285634471681, + "grad_norm": 0.3517520310318682, + "learning_rate": 8.866308546623914e-05, + "loss": 2.7943, + "step": 19122 + }, + { + "epoch": 0.8903321926577741, + "grad_norm": 0.3586205442665751, + "learning_rate": 8.866136783485807e-05, + "loss": 3.0449, + "step": 19123 + }, + { + "epoch": 0.8903787508438671, + "grad_norm": 0.3883465054430955, + "learning_rate": 8.865965009000969e-05, + "loss": 2.922, + "step": 19124 + }, + { + "epoch": 0.8904253090299602, + "grad_norm": 0.34932062347994547, + "learning_rate": 8.865793223169906e-05, + "loss": 2.9373, + "step": 19125 + }, + { + "epoch": 0.8904718672160533, + "grad_norm": 0.3889741040946394, + "learning_rate": 8.86562142599312e-05, + "loss": 3.0318, + "step": 19126 + }, + { + "epoch": 0.8905184254021463, + "grad_norm": 0.34695368042977764, + "learning_rate": 
8.865449617471115e-05, + "loss": 2.8402, + "step": 19127 + }, + { + "epoch": 0.8905649835882394, + "grad_norm": 0.37238060877777657, + "learning_rate": 8.865277797604397e-05, + "loss": 2.8646, + "step": 19128 + }, + { + "epoch": 0.8906115417743324, + "grad_norm": 0.3405941838806375, + "learning_rate": 8.86510596639347e-05, + "loss": 2.8746, + "step": 19129 + }, + { + "epoch": 0.8906580999604256, + "grad_norm": 0.3664117622800073, + "learning_rate": 8.864934123838837e-05, + "loss": 2.88, + "step": 19130 + }, + { + "epoch": 0.8907046581465186, + "grad_norm": 0.3472439586495635, + "learning_rate": 8.864762269941003e-05, + "loss": 2.8657, + "step": 19131 + }, + { + "epoch": 0.8907512163326117, + "grad_norm": 0.3645754379488323, + "learning_rate": 8.864590404700473e-05, + "loss": 3.0181, + "step": 19132 + }, + { + "epoch": 0.8907977745187048, + "grad_norm": 0.3385702970865729, + "learning_rate": 8.86441852811775e-05, + "loss": 2.89, + "step": 19133 + }, + { + "epoch": 0.8908443327047978, + "grad_norm": 0.3538275437940094, + "learning_rate": 8.864246640193341e-05, + "loss": 2.9603, + "step": 19134 + }, + { + "epoch": 0.8908908908908909, + "grad_norm": 0.37040961998406124, + "learning_rate": 8.864074740927749e-05, + "loss": 2.8667, + "step": 19135 + }, + { + "epoch": 0.8909374490769839, + "grad_norm": 0.358955038399377, + "learning_rate": 8.863902830321476e-05, + "loss": 2.9182, + "step": 19136 + }, + { + "epoch": 0.890984007263077, + "grad_norm": 0.3407432705662474, + "learning_rate": 8.863730908375029e-05, + "loss": 2.8555, + "step": 19137 + }, + { + "epoch": 0.8910305654491701, + "grad_norm": 0.371333558306756, + "learning_rate": 8.863558975088915e-05, + "loss": 2.9099, + "step": 19138 + }, + { + "epoch": 0.8910771236352631, + "grad_norm": 0.33276923620208526, + "learning_rate": 8.863387030463633e-05, + "loss": 2.837, + "step": 19139 + }, + { + "epoch": 0.8911236818213563, + "grad_norm": 0.37992321964752157, + "learning_rate": 8.863215074499693e-05, + "loss": 2.9098, + "step": 19140 + }, + { + "epoch": 0.8911702400074493, + "grad_norm": 0.35335473205405143, + "learning_rate": 8.863043107197596e-05, + "loss": 2.9054, + "step": 19141 + }, + { + "epoch": 0.8912167981935424, + "grad_norm": 0.3852054144515645, + "learning_rate": 8.862871128557848e-05, + "loss": 2.9666, + "step": 19142 + }, + { + "epoch": 0.8912633563796355, + "grad_norm": 0.38572774665277715, + "learning_rate": 8.862699138580955e-05, + "loss": 2.7581, + "step": 19143 + }, + { + "epoch": 0.8913099145657285, + "grad_norm": 0.3623141952181626, + "learning_rate": 8.86252713726742e-05, + "loss": 2.8213, + "step": 19144 + }, + { + "epoch": 0.8913564727518216, + "grad_norm": 0.321063818566063, + "learning_rate": 8.862355124617748e-05, + "loss": 2.8623, + "step": 19145 + }, + { + "epoch": 0.8914030309379146, + "grad_norm": 0.3390375955233072, + "learning_rate": 8.862183100632444e-05, + "loss": 2.8122, + "step": 19146 + }, + { + "epoch": 0.8914495891240077, + "grad_norm": 0.3409033316421713, + "learning_rate": 8.862011065312015e-05, + "loss": 2.9254, + "step": 19147 + }, + { + "epoch": 0.8914961473101009, + "grad_norm": 0.3331425442668488, + "learning_rate": 8.861839018656962e-05, + "loss": 2.8955, + "step": 19148 + }, + { + "epoch": 0.8915427054961939, + "grad_norm": 0.34219389501426006, + "learning_rate": 8.861666960667792e-05, + "loss": 3.0284, + "step": 19149 + }, + { + "epoch": 0.891589263682287, + "grad_norm": 0.3265271494878126, + "learning_rate": 8.86149489134501e-05, + "loss": 2.8589, + "step": 19150 + }, + { + "epoch": 
0.89163582186838, + "grad_norm": 0.36600448259224005, + "learning_rate": 8.86132281068912e-05, + "loss": 2.8966, + "step": 19151 + }, + { + "epoch": 0.8916823800544731, + "grad_norm": 0.3449164824661889, + "learning_rate": 8.861150718700631e-05, + "loss": 2.8984, + "step": 19152 + }, + { + "epoch": 0.8917289382405661, + "grad_norm": 0.3355256062504572, + "learning_rate": 8.860978615380043e-05, + "loss": 2.9414, + "step": 19153 + }, + { + "epoch": 0.8917754964266592, + "grad_norm": 0.3538295251981946, + "learning_rate": 8.860806500727864e-05, + "loss": 2.8393, + "step": 19154 + }, + { + "epoch": 0.8918220546127523, + "grad_norm": 0.32827711980705737, + "learning_rate": 8.860634374744598e-05, + "loss": 2.972, + "step": 19155 + }, + { + "epoch": 0.8918686127988453, + "grad_norm": 0.34356129540190977, + "learning_rate": 8.86046223743075e-05, + "loss": 2.882, + "step": 19156 + }, + { + "epoch": 0.8919151709849384, + "grad_norm": 0.3338011855189023, + "learning_rate": 8.860290088786825e-05, + "loss": 2.8022, + "step": 19157 + }, + { + "epoch": 0.8919617291710314, + "grad_norm": 0.332234764302069, + "learning_rate": 8.860117928813329e-05, + "loss": 2.8778, + "step": 19158 + }, + { + "epoch": 0.8920082873571246, + "grad_norm": 0.3512636067048892, + "learning_rate": 8.85994575751077e-05, + "loss": 2.9387, + "step": 19159 + }, + { + "epoch": 0.8920548455432177, + "grad_norm": 0.3311978334925851, + "learning_rate": 8.859773574879647e-05, + "loss": 2.8564, + "step": 19160 + }, + { + "epoch": 0.8921014037293107, + "grad_norm": 0.3413784090201571, + "learning_rate": 8.85960138092047e-05, + "loss": 2.9001, + "step": 19161 + }, + { + "epoch": 0.8921479619154038, + "grad_norm": 0.3429041617995062, + "learning_rate": 8.859429175633745e-05, + "loss": 2.894, + "step": 19162 + }, + { + "epoch": 0.8921945201014968, + "grad_norm": 0.32149462420602926, + "learning_rate": 8.859256959019972e-05, + "loss": 2.8562, + "step": 19163 + }, + { + "epoch": 0.8922410782875899, + "grad_norm": 0.355819556426931, + "learning_rate": 8.859084731079663e-05, + "loss": 2.8554, + "step": 19164 + }, + { + "epoch": 0.892287636473683, + "grad_norm": 0.3300753981358665, + "learning_rate": 8.85891249181332e-05, + "loss": 2.8866, + "step": 19165 + }, + { + "epoch": 0.892334194659776, + "grad_norm": 0.3172740200162245, + "learning_rate": 8.858740241221447e-05, + "loss": 2.9314, + "step": 19166 + }, + { + "epoch": 0.8923807528458692, + "grad_norm": 0.40146440594549165, + "learning_rate": 8.858567979304552e-05, + "loss": 2.9377, + "step": 19167 + }, + { + "epoch": 0.8924273110319622, + "grad_norm": 0.31773363682744127, + "learning_rate": 8.858395706063143e-05, + "loss": 2.9453, + "step": 19168 + }, + { + "epoch": 0.8924738692180553, + "grad_norm": 0.3951011161026436, + "learning_rate": 8.85822342149772e-05, + "loss": 2.8188, + "step": 19169 + }, + { + "epoch": 0.8925204274041484, + "grad_norm": 0.34714076401478555, + "learning_rate": 8.858051125608793e-05, + "loss": 2.9659, + "step": 19170 + }, + { + "epoch": 0.8925669855902414, + "grad_norm": 0.36280610830128995, + "learning_rate": 8.857878818396864e-05, + "loss": 2.8952, + "step": 19171 + }, + { + "epoch": 0.8926135437763345, + "grad_norm": 0.30216626378803974, + "learning_rate": 8.857706499862443e-05, + "loss": 2.7774, + "step": 19172 + }, + { + "epoch": 0.8926601019624275, + "grad_norm": 0.35216643040969314, + "learning_rate": 8.857534170006032e-05, + "loss": 2.917, + "step": 19173 + }, + { + "epoch": 0.8927066601485206, + "grad_norm": 0.30385277604879624, + "learning_rate": 
8.85736182882814e-05, + "loss": 2.8183, + "step": 19174 + }, + { + "epoch": 0.8927532183346136, + "grad_norm": 0.3376506799304887, + "learning_rate": 8.857189476329269e-05, + "loss": 2.8595, + "step": 19175 + }, + { + "epoch": 0.8927997765207067, + "grad_norm": 0.3476423468508039, + "learning_rate": 8.857017112509927e-05, + "loss": 2.9074, + "step": 19176 + }, + { + "epoch": 0.8928463347067999, + "grad_norm": 0.31489748393916894, + "learning_rate": 8.856844737370619e-05, + "loss": 2.8279, + "step": 19177 + }, + { + "epoch": 0.8928928928928929, + "grad_norm": 0.35813477309262925, + "learning_rate": 8.856672350911855e-05, + "loss": 2.9997, + "step": 19178 + }, + { + "epoch": 0.892939451078986, + "grad_norm": 0.30194651886343804, + "learning_rate": 8.856499953134134e-05, + "loss": 2.8903, + "step": 19179 + }, + { + "epoch": 0.892986009265079, + "grad_norm": 0.3512710471000933, + "learning_rate": 8.856327544037968e-05, + "loss": 2.7919, + "step": 19180 + }, + { + "epoch": 0.8930325674511721, + "grad_norm": 0.30000828876765334, + "learning_rate": 8.856155123623859e-05, + "loss": 2.8923, + "step": 19181 + }, + { + "epoch": 0.8930791256372652, + "grad_norm": 0.3378157459438951, + "learning_rate": 8.855982691892315e-05, + "loss": 2.8088, + "step": 19182 + }, + { + "epoch": 0.8931256838233582, + "grad_norm": 0.319889724851575, + "learning_rate": 8.855810248843843e-05, + "loss": 2.8796, + "step": 19183 + }, + { + "epoch": 0.8931722420094513, + "grad_norm": 0.3564036435082912, + "learning_rate": 8.855637794478946e-05, + "loss": 2.9855, + "step": 19184 + }, + { + "epoch": 0.8932188001955443, + "grad_norm": 0.3433994386237725, + "learning_rate": 8.855465328798133e-05, + "loss": 2.8912, + "step": 19185 + }, + { + "epoch": 0.8932653583816375, + "grad_norm": 0.3508762292391675, + "learning_rate": 8.855292851801908e-05, + "loss": 2.9991, + "step": 19186 + }, + { + "epoch": 0.8933119165677306, + "grad_norm": 0.33005481099777506, + "learning_rate": 8.855120363490779e-05, + "loss": 2.997, + "step": 19187 + }, + { + "epoch": 0.8933584747538236, + "grad_norm": 0.36153343215488415, + "learning_rate": 8.854947863865253e-05, + "loss": 2.9816, + "step": 19188 + }, + { + "epoch": 0.8934050329399167, + "grad_norm": 0.35957397248527995, + "learning_rate": 8.854775352925832e-05, + "loss": 2.8006, + "step": 19189 + }, + { + "epoch": 0.8934515911260097, + "grad_norm": 0.35782077901764203, + "learning_rate": 8.854602830673027e-05, + "loss": 2.8862, + "step": 19190 + }, + { + "epoch": 0.8934981493121028, + "grad_norm": 0.32895342332966676, + "learning_rate": 8.854430297107341e-05, + "loss": 2.9864, + "step": 19191 + }, + { + "epoch": 0.8935447074981959, + "grad_norm": 0.3486567602669168, + "learning_rate": 8.854257752229284e-05, + "loss": 2.7069, + "step": 19192 + }, + { + "epoch": 0.8935912656842889, + "grad_norm": 0.3513587490439379, + "learning_rate": 8.854085196039359e-05, + "loss": 2.9696, + "step": 19193 + }, + { + "epoch": 0.893637823870382, + "grad_norm": 0.3514139042768173, + "learning_rate": 8.853912628538073e-05, + "loss": 2.8884, + "step": 19194 + }, + { + "epoch": 0.893684382056475, + "grad_norm": 0.3731878817463803, + "learning_rate": 8.853740049725934e-05, + "loss": 2.9, + "step": 19195 + }, + { + "epoch": 0.8937309402425682, + "grad_norm": 0.39415355190609663, + "learning_rate": 8.853567459603446e-05, + "loss": 2.8749, + "step": 19196 + }, + { + "epoch": 0.8937774984286612, + "grad_norm": 0.347538338519077, + "learning_rate": 8.85339485817112e-05, + "loss": 2.8585, + "step": 19197 + }, + { + "epoch": 
0.8938240566147543, + "grad_norm": 0.3765140854737554, + "learning_rate": 8.853222245429458e-05, + "loss": 2.9324, + "step": 19198 + }, + { + "epoch": 0.8938706148008474, + "grad_norm": 0.35736926347507547, + "learning_rate": 8.853049621378968e-05, + "loss": 2.9253, + "step": 19199 + }, + { + "epoch": 0.8939171729869404, + "grad_norm": 0.3511001946861919, + "learning_rate": 8.852876986020158e-05, + "loss": 2.903, + "step": 19200 + }, + { + "epoch": 0.8939637311730335, + "grad_norm": 0.3656540837592454, + "learning_rate": 8.852704339353532e-05, + "loss": 2.9484, + "step": 19201 + }, + { + "epoch": 0.8940102893591265, + "grad_norm": 0.3541386645029371, + "learning_rate": 8.8525316813796e-05, + "loss": 2.9007, + "step": 19202 + }, + { + "epoch": 0.8940568475452196, + "grad_norm": 0.37236039200034826, + "learning_rate": 8.852359012098866e-05, + "loss": 3.0207, + "step": 19203 + }, + { + "epoch": 0.8941034057313128, + "grad_norm": 0.337408814678287, + "learning_rate": 8.852186331511838e-05, + "loss": 2.8324, + "step": 19204 + }, + { + "epoch": 0.8941499639174058, + "grad_norm": 0.3224878816364101, + "learning_rate": 8.852013639619024e-05, + "loss": 2.8134, + "step": 19205 + }, + { + "epoch": 0.8941965221034989, + "grad_norm": 0.32000277348384626, + "learning_rate": 8.851840936420927e-05, + "loss": 2.8285, + "step": 19206 + }, + { + "epoch": 0.8942430802895919, + "grad_norm": 0.3467641079300422, + "learning_rate": 8.851668221918058e-05, + "loss": 2.821, + "step": 19207 + }, + { + "epoch": 0.894289638475685, + "grad_norm": 0.3375000256777118, + "learning_rate": 8.851495496110923e-05, + "loss": 2.927, + "step": 19208 + }, + { + "epoch": 0.8943361966617781, + "grad_norm": 0.3429191955371634, + "learning_rate": 8.851322759000026e-05, + "loss": 2.9175, + "step": 19209 + }, + { + "epoch": 0.8943827548478711, + "grad_norm": 0.3184641226956002, + "learning_rate": 8.851150010585876e-05, + "loss": 2.8358, + "step": 19210 + }, + { + "epoch": 0.8944293130339642, + "grad_norm": 0.3711232788091098, + "learning_rate": 8.850977250868982e-05, + "loss": 2.9046, + "step": 19211 + }, + { + "epoch": 0.8944758712200572, + "grad_norm": 0.34013575296565757, + "learning_rate": 8.850804479849848e-05, + "loss": 2.9386, + "step": 19212 + }, + { + "epoch": 0.8945224294061503, + "grad_norm": 0.33337735946633584, + "learning_rate": 8.850631697528984e-05, + "loss": 2.8926, + "step": 19213 + }, + { + "epoch": 0.8945689875922435, + "grad_norm": 0.3492547576902486, + "learning_rate": 8.850458903906894e-05, + "loss": 2.9601, + "step": 19214 + }, + { + "epoch": 0.8946155457783365, + "grad_norm": 0.33663276698176525, + "learning_rate": 8.850286098984085e-05, + "loss": 2.8984, + "step": 19215 + }, + { + "epoch": 0.8946621039644296, + "grad_norm": 0.3644802829779717, + "learning_rate": 8.850113282761067e-05, + "loss": 2.9111, + "step": 19216 + }, + { + "epoch": 0.8947086621505226, + "grad_norm": 0.3244681888866652, + "learning_rate": 8.849940455238346e-05, + "loss": 2.9447, + "step": 19217 + }, + { + "epoch": 0.8947552203366157, + "grad_norm": 0.34368821081648043, + "learning_rate": 8.84976761641643e-05, + "loss": 2.8094, + "step": 19218 + }, + { + "epoch": 0.8948017785227087, + "grad_norm": 0.36483677375148493, + "learning_rate": 8.849594766295824e-05, + "loss": 2.9432, + "step": 19219 + }, + { + "epoch": 0.8948483367088018, + "grad_norm": 0.3165841210355419, + "learning_rate": 8.849421904877036e-05, + "loss": 2.8656, + "step": 19220 + }, + { + "epoch": 0.8948948948948949, + "grad_norm": 0.3624176728751042, + "learning_rate": 
8.849249032160575e-05, + "loss": 2.955, + "step": 19221 + }, + { + "epoch": 0.8949414530809879, + "grad_norm": 0.3223013799622812, + "learning_rate": 8.849076148146946e-05, + "loss": 2.8735, + "step": 19222 + }, + { + "epoch": 0.894988011267081, + "grad_norm": 0.32499223189033066, + "learning_rate": 8.848903252836658e-05, + "loss": 2.9762, + "step": 19223 + }, + { + "epoch": 0.8950345694531741, + "grad_norm": 0.35634807436125104, + "learning_rate": 8.848730346230221e-05, + "loss": 2.802, + "step": 19224 + }, + { + "epoch": 0.8950811276392672, + "grad_norm": 0.3383311197009033, + "learning_rate": 8.848557428328137e-05, + "loss": 2.8887, + "step": 19225 + }, + { + "epoch": 0.8951276858253603, + "grad_norm": 0.34805699394788453, + "learning_rate": 8.848384499130916e-05, + "loss": 2.918, + "step": 19226 + }, + { + "epoch": 0.8951742440114533, + "grad_norm": 0.33491391054539965, + "learning_rate": 8.848211558639066e-05, + "loss": 2.8866, + "step": 19227 + }, + { + "epoch": 0.8952208021975464, + "grad_norm": 0.3408960520863193, + "learning_rate": 8.848038606853095e-05, + "loss": 2.9562, + "step": 19228 + }, + { + "epoch": 0.8952673603836394, + "grad_norm": 0.3368368636109006, + "learning_rate": 8.847865643773509e-05, + "loss": 2.88, + "step": 19229 + }, + { + "epoch": 0.8953139185697325, + "grad_norm": 0.30519365632978074, + "learning_rate": 8.847692669400815e-05, + "loss": 2.8381, + "step": 19230 + }, + { + "epoch": 0.8953604767558256, + "grad_norm": 0.36191986695905154, + "learning_rate": 8.847519683735525e-05, + "loss": 2.9614, + "step": 19231 + }, + { + "epoch": 0.8954070349419186, + "grad_norm": 0.3342093518278964, + "learning_rate": 8.847346686778142e-05, + "loss": 2.9295, + "step": 19232 + }, + { + "epoch": 0.8954535931280118, + "grad_norm": 0.3389367766910033, + "learning_rate": 8.847173678529175e-05, + "loss": 2.8674, + "step": 19233 + }, + { + "epoch": 0.8955001513141048, + "grad_norm": 0.3360616303324677, + "learning_rate": 8.847000658989134e-05, + "loss": 2.9188, + "step": 19234 + }, + { + "epoch": 0.8955467095001979, + "grad_norm": 0.36878023629003415, + "learning_rate": 8.846827628158525e-05, + "loss": 2.8917, + "step": 19235 + }, + { + "epoch": 0.895593267686291, + "grad_norm": 0.3393327856415971, + "learning_rate": 8.846654586037855e-05, + "loss": 2.925, + "step": 19236 + }, + { + "epoch": 0.895639825872384, + "grad_norm": 0.40102730395095326, + "learning_rate": 8.846481532627633e-05, + "loss": 2.832, + "step": 19237 + }, + { + "epoch": 0.8956863840584771, + "grad_norm": 0.3453817827277874, + "learning_rate": 8.846308467928366e-05, + "loss": 2.8812, + "step": 19238 + }, + { + "epoch": 0.8957329422445701, + "grad_norm": 0.3689683417052058, + "learning_rate": 8.846135391940563e-05, + "loss": 2.775, + "step": 19239 + }, + { + "epoch": 0.8957795004306632, + "grad_norm": 0.35634358701872904, + "learning_rate": 8.845962304664732e-05, + "loss": 2.8732, + "step": 19240 + }, + { + "epoch": 0.8958260586167562, + "grad_norm": 0.38063706019744586, + "learning_rate": 8.845789206101379e-05, + "loss": 2.7735, + "step": 19241 + }, + { + "epoch": 0.8958726168028494, + "grad_norm": 0.33994030787428914, + "learning_rate": 8.845616096251015e-05, + "loss": 2.9624, + "step": 19242 + }, + { + "epoch": 0.8959191749889425, + "grad_norm": 0.39474712441761256, + "learning_rate": 8.845442975114147e-05, + "loss": 2.8747, + "step": 19243 + }, + { + "epoch": 0.8959657331750355, + "grad_norm": 0.3621037577740723, + "learning_rate": 8.845269842691282e-05, + "loss": 2.9358, + "step": 19244 + }, + { + "epoch": 
0.8960122913611286, + "grad_norm": 0.37432329660650765, + "learning_rate": 8.845096698982928e-05, + "loss": 2.8923, + "step": 19245 + }, + { + "epoch": 0.8960588495472216, + "grad_norm": 0.331886757551912, + "learning_rate": 8.844923543989595e-05, + "loss": 2.9176, + "step": 19246 + }, + { + "epoch": 0.8961054077333147, + "grad_norm": 0.3621525030352021, + "learning_rate": 8.84475037771179e-05, + "loss": 2.9328, + "step": 19247 + }, + { + "epoch": 0.8961519659194078, + "grad_norm": 0.34397553385425833, + "learning_rate": 8.844577200150021e-05, + "loss": 2.9195, + "step": 19248 + }, + { + "epoch": 0.8961985241055008, + "grad_norm": 0.34842273589588574, + "learning_rate": 8.844404011304797e-05, + "loss": 2.8811, + "step": 19249 + }, + { + "epoch": 0.8962450822915939, + "grad_norm": 0.35556290027178605, + "learning_rate": 8.844230811176626e-05, + "loss": 2.8582, + "step": 19250 + }, + { + "epoch": 0.896291640477687, + "grad_norm": 0.3417168327971388, + "learning_rate": 8.844057599766016e-05, + "loss": 2.9112, + "step": 19251 + }, + { + "epoch": 0.8963381986637801, + "grad_norm": 0.37849301599625623, + "learning_rate": 8.843884377073477e-05, + "loss": 2.9095, + "step": 19252 + }, + { + "epoch": 0.8963847568498732, + "grad_norm": 0.3444146602579007, + "learning_rate": 8.843711143099516e-05, + "loss": 2.8707, + "step": 19253 + }, + { + "epoch": 0.8964313150359662, + "grad_norm": 0.38536610030665136, + "learning_rate": 8.843537897844639e-05, + "loss": 2.863, + "step": 19254 + }, + { + "epoch": 0.8964778732220593, + "grad_norm": 0.3508351306547497, + "learning_rate": 8.843364641309358e-05, + "loss": 2.9488, + "step": 19255 + }, + { + "epoch": 0.8965244314081523, + "grad_norm": 0.35288732467703204, + "learning_rate": 8.843191373494182e-05, + "loss": 2.879, + "step": 19256 + }, + { + "epoch": 0.8965709895942454, + "grad_norm": 0.34421825842198345, + "learning_rate": 8.843018094399616e-05, + "loss": 2.8104, + "step": 19257 + }, + { + "epoch": 0.8966175477803385, + "grad_norm": 0.35989834247820235, + "learning_rate": 8.842844804026171e-05, + "loss": 2.871, + "step": 19258 + }, + { + "epoch": 0.8966641059664315, + "grad_norm": 0.33929153492774217, + "learning_rate": 8.842671502374355e-05, + "loss": 2.9917, + "step": 19259 + }, + { + "epoch": 0.8967106641525247, + "grad_norm": 0.3423505745378286, + "learning_rate": 8.842498189444678e-05, + "loss": 2.9649, + "step": 19260 + }, + { + "epoch": 0.8967572223386177, + "grad_norm": 0.3760249506923268, + "learning_rate": 8.842324865237646e-05, + "loss": 3.017, + "step": 19261 + }, + { + "epoch": 0.8968037805247108, + "grad_norm": 0.3284486566870383, + "learning_rate": 8.84215152975377e-05, + "loss": 2.7984, + "step": 19262 + }, + { + "epoch": 0.8968503387108038, + "grad_norm": 0.3378556839325647, + "learning_rate": 8.841978182993557e-05, + "loss": 2.8255, + "step": 19263 + }, + { + "epoch": 0.8968968968968969, + "grad_norm": 0.3214954597730066, + "learning_rate": 8.841804824957516e-05, + "loss": 2.8122, + "step": 19264 + }, + { + "epoch": 0.89694345508299, + "grad_norm": 0.3333585483628486, + "learning_rate": 8.841631455646158e-05, + "loss": 3.0169, + "step": 19265 + }, + { + "epoch": 0.896990013269083, + "grad_norm": 0.32232277057768816, + "learning_rate": 8.84145807505999e-05, + "loss": 2.8321, + "step": 19266 + }, + { + "epoch": 0.8970365714551761, + "grad_norm": 0.31117136535872725, + "learning_rate": 8.841284683199519e-05, + "loss": 2.8942, + "step": 19267 + }, + { + "epoch": 0.8970831296412691, + "grad_norm": 0.3572855347142179, + "learning_rate": 
8.841111280065257e-05, + "loss": 2.9115, + "step": 19268 + }, + { + "epoch": 0.8971296878273622, + "grad_norm": 0.3155644389068092, + "learning_rate": 8.840937865657712e-05, + "loss": 2.8886, + "step": 19269 + }, + { + "epoch": 0.8971762460134554, + "grad_norm": 0.34481962177360964, + "learning_rate": 8.840764439977392e-05, + "loss": 2.8982, + "step": 19270 + }, + { + "epoch": 0.8972228041995484, + "grad_norm": 0.35054364562759033, + "learning_rate": 8.840591003024809e-05, + "loss": 2.9552, + "step": 19271 + }, + { + "epoch": 0.8972693623856415, + "grad_norm": 0.3615020043681109, + "learning_rate": 8.840417554800467e-05, + "loss": 2.885, + "step": 19272 + }, + { + "epoch": 0.8973159205717345, + "grad_norm": 0.33975341674292836, + "learning_rate": 8.84024409530488e-05, + "loss": 2.8889, + "step": 19273 + }, + { + "epoch": 0.8973624787578276, + "grad_norm": 0.3506436662401819, + "learning_rate": 8.840070624538552e-05, + "loss": 3.0066, + "step": 19274 + }, + { + "epoch": 0.8974090369439207, + "grad_norm": 0.36965844891839206, + "learning_rate": 8.839897142501998e-05, + "loss": 2.9003, + "step": 19275 + }, + { + "epoch": 0.8974555951300137, + "grad_norm": 0.3944594612387782, + "learning_rate": 8.839723649195723e-05, + "loss": 2.8716, + "step": 19276 + }, + { + "epoch": 0.8975021533161068, + "grad_norm": 0.3536106164359907, + "learning_rate": 8.839550144620237e-05, + "loss": 2.9721, + "step": 19277 + }, + { + "epoch": 0.8975487115021998, + "grad_norm": 0.36900831474771884, + "learning_rate": 8.83937662877605e-05, + "loss": 2.9202, + "step": 19278 + }, + { + "epoch": 0.897595269688293, + "grad_norm": 0.37629995700514063, + "learning_rate": 8.839203101663672e-05, + "loss": 2.8284, + "step": 19279 + }, + { + "epoch": 0.8976418278743861, + "grad_norm": 0.3442245484365564, + "learning_rate": 8.83902956328361e-05, + "loss": 2.8567, + "step": 19280 + }, + { + "epoch": 0.8976883860604791, + "grad_norm": 0.35506204163108956, + "learning_rate": 8.838856013636375e-05, + "loss": 2.8821, + "step": 19281 + }, + { + "epoch": 0.8977349442465722, + "grad_norm": 0.3549613830880936, + "learning_rate": 8.838682452722474e-05, + "loss": 2.9406, + "step": 19282 + }, + { + "epoch": 0.8977815024326652, + "grad_norm": 0.3292069671265573, + "learning_rate": 8.83850888054242e-05, + "loss": 2.8531, + "step": 19283 + }, + { + "epoch": 0.8978280606187583, + "grad_norm": 0.37189936719252203, + "learning_rate": 8.83833529709672e-05, + "loss": 2.8872, + "step": 19284 + }, + { + "epoch": 0.8978746188048513, + "grad_norm": 0.3397121972128588, + "learning_rate": 8.838161702385883e-05, + "loss": 2.7897, + "step": 19285 + }, + { + "epoch": 0.8979211769909444, + "grad_norm": 0.35022362108696253, + "learning_rate": 8.837988096410421e-05, + "loss": 2.8211, + "step": 19286 + }, + { + "epoch": 0.8979677351770375, + "grad_norm": 0.38331446421386584, + "learning_rate": 8.837814479170842e-05, + "loss": 2.9628, + "step": 19287 + }, + { + "epoch": 0.8980142933631305, + "grad_norm": 0.321151687070094, + "learning_rate": 8.837640850667655e-05, + "loss": 2.9021, + "step": 19288 + }, + { + "epoch": 0.8980608515492237, + "grad_norm": 0.38281147790548664, + "learning_rate": 8.837467210901371e-05, + "loss": 2.8101, + "step": 19289 + }, + { + "epoch": 0.8981074097353167, + "grad_norm": 0.3511208884404717, + "learning_rate": 8.837293559872497e-05, + "loss": 2.8923, + "step": 19290 + }, + { + "epoch": 0.8981539679214098, + "grad_norm": 0.3557033075024829, + "learning_rate": 8.837119897581548e-05, + "loss": 2.9018, + "step": 19291 + }, + { + "epoch": 
0.8982005261075029, + "grad_norm": 0.3144118658353238, + "learning_rate": 8.836946224029025e-05, + "loss": 2.9795, + "step": 19292 + }, + { + "epoch": 0.8982470842935959, + "grad_norm": 0.3482316187428851, + "learning_rate": 8.836772539215447e-05, + "loss": 2.8846, + "step": 19293 + }, + { + "epoch": 0.898293642479689, + "grad_norm": 0.32541604927146833, + "learning_rate": 8.836598843141317e-05, + "loss": 2.81, + "step": 19294 + }, + { + "epoch": 0.898340200665782, + "grad_norm": 0.34775348619212515, + "learning_rate": 8.836425135807149e-05, + "loss": 2.9192, + "step": 19295 + }, + { + "epoch": 0.8983867588518751, + "grad_norm": 0.3433953683422937, + "learning_rate": 8.836251417213451e-05, + "loss": 2.8357, + "step": 19296 + }, + { + "epoch": 0.8984333170379682, + "grad_norm": 0.31693133791109995, + "learning_rate": 8.836077687360733e-05, + "loss": 2.9818, + "step": 19297 + }, + { + "epoch": 0.8984798752240613, + "grad_norm": 0.3376442056564221, + "learning_rate": 8.835903946249504e-05, + "loss": 2.7831, + "step": 19298 + }, + { + "epoch": 0.8985264334101544, + "grad_norm": 0.28974239170095545, + "learning_rate": 8.835730193880276e-05, + "loss": 2.9078, + "step": 19299 + }, + { + "epoch": 0.8985729915962474, + "grad_norm": 0.3578592853717518, + "learning_rate": 8.835556430253557e-05, + "loss": 2.8785, + "step": 19300 + }, + { + "epoch": 0.8986195497823405, + "grad_norm": 0.31197307805126323, + "learning_rate": 8.835382655369857e-05, + "loss": 2.7405, + "step": 19301 + }, + { + "epoch": 0.8986661079684336, + "grad_norm": 0.3644358267027441, + "learning_rate": 8.835208869229688e-05, + "loss": 2.9538, + "step": 19302 + }, + { + "epoch": 0.8987126661545266, + "grad_norm": 0.3763420071058276, + "learning_rate": 8.835035071833559e-05, + "loss": 2.9563, + "step": 19303 + }, + { + "epoch": 0.8987592243406197, + "grad_norm": 0.33328267155446256, + "learning_rate": 8.834861263181979e-05, + "loss": 2.8653, + "step": 19304 + }, + { + "epoch": 0.8988057825267127, + "grad_norm": 0.37791726979370227, + "learning_rate": 8.83468744327546e-05, + "loss": 2.8878, + "step": 19305 + }, + { + "epoch": 0.8988523407128058, + "grad_norm": 0.3391594571836896, + "learning_rate": 8.83451361211451e-05, + "loss": 2.9013, + "step": 19306 + }, + { + "epoch": 0.8988988988988988, + "grad_norm": 0.35450026794978484, + "learning_rate": 8.83433976969964e-05, + "loss": 2.9691, + "step": 19307 + }, + { + "epoch": 0.898945457084992, + "grad_norm": 0.36612673462834733, + "learning_rate": 8.834165916031363e-05, + "loss": 2.9377, + "step": 19308 + }, + { + "epoch": 0.8989920152710851, + "grad_norm": 0.34996223521194575, + "learning_rate": 8.833992051110185e-05, + "loss": 2.9334, + "step": 19309 + }, + { + "epoch": 0.8990385734571781, + "grad_norm": 0.38250036363876755, + "learning_rate": 8.833818174936617e-05, + "loss": 2.9292, + "step": 19310 + }, + { + "epoch": 0.8990851316432712, + "grad_norm": 0.3666761834444959, + "learning_rate": 8.833644287511171e-05, + "loss": 2.9207, + "step": 19311 + }, + { + "epoch": 0.8991316898293642, + "grad_norm": 0.3594963368400236, + "learning_rate": 8.833470388834358e-05, + "loss": 2.9208, + "step": 19312 + }, + { + "epoch": 0.8991782480154573, + "grad_norm": 0.37028449663275576, + "learning_rate": 8.833296478906687e-05, + "loss": 2.7943, + "step": 19313 + }, + { + "epoch": 0.8992248062015504, + "grad_norm": 0.33345581905327526, + "learning_rate": 8.833122557728667e-05, + "loss": 2.9332, + "step": 19314 + }, + { + "epoch": 0.8992713643876434, + "grad_norm": 0.35139374159442927, + "learning_rate": 
8.832948625300811e-05, + "loss": 2.8277, + "step": 19315 + }, + { + "epoch": 0.8993179225737366, + "grad_norm": 0.3212618531025608, + "learning_rate": 8.832774681623628e-05, + "loss": 2.9267, + "step": 19316 + }, + { + "epoch": 0.8993644807598296, + "grad_norm": 0.38309452908563274, + "learning_rate": 8.832600726697628e-05, + "loss": 2.9405, + "step": 19317 + }, + { + "epoch": 0.8994110389459227, + "grad_norm": 0.3156480565465537, + "learning_rate": 8.832426760523325e-05, + "loss": 2.8979, + "step": 19318 + }, + { + "epoch": 0.8994575971320158, + "grad_norm": 0.374636844041613, + "learning_rate": 8.832252783101224e-05, + "loss": 2.9666, + "step": 19319 + }, + { + "epoch": 0.8995041553181088, + "grad_norm": 0.3449022136018054, + "learning_rate": 8.832078794431839e-05, + "loss": 2.8318, + "step": 19320 + }, + { + "epoch": 0.8995507135042019, + "grad_norm": 0.38133657650898883, + "learning_rate": 8.831904794515682e-05, + "loss": 2.8883, + "step": 19321 + }, + { + "epoch": 0.8995972716902949, + "grad_norm": 0.45367544110781116, + "learning_rate": 8.831730783353262e-05, + "loss": 2.9548, + "step": 19322 + }, + { + "epoch": 0.899643829876388, + "grad_norm": 0.3392744585077005, + "learning_rate": 8.83155676094509e-05, + "loss": 2.8464, + "step": 19323 + }, + { + "epoch": 0.8996903880624811, + "grad_norm": 0.4190841041079882, + "learning_rate": 8.831382727291673e-05, + "loss": 2.9761, + "step": 19324 + }, + { + "epoch": 0.8997369462485741, + "grad_norm": 0.3236525898311736, + "learning_rate": 8.831208682393528e-05, + "loss": 2.8482, + "step": 19325 + }, + { + "epoch": 0.8997835044346673, + "grad_norm": 0.4167384209607554, + "learning_rate": 8.831034626251162e-05, + "loss": 2.8598, + "step": 19326 + }, + { + "epoch": 0.8998300626207603, + "grad_norm": 0.36377293687846357, + "learning_rate": 8.830860558865088e-05, + "loss": 2.8915, + "step": 19327 + }, + { + "epoch": 0.8998766208068534, + "grad_norm": 0.3594021823367682, + "learning_rate": 8.830686480235815e-05, + "loss": 2.8546, + "step": 19328 + }, + { + "epoch": 0.8999231789929464, + "grad_norm": 0.38727396434342043, + "learning_rate": 8.830512390363855e-05, + "loss": 2.9304, + "step": 19329 + }, + { + "epoch": 0.8999697371790395, + "grad_norm": 0.342348231320696, + "learning_rate": 8.830338289249717e-05, + "loss": 2.7865, + "step": 19330 + }, + { + "epoch": 0.9000162953651326, + "grad_norm": 0.345566984717867, + "learning_rate": 8.830164176893913e-05, + "loss": 2.9517, + "step": 19331 + }, + { + "epoch": 0.9000628535512256, + "grad_norm": 0.3438089104908233, + "learning_rate": 8.829990053296957e-05, + "loss": 2.901, + "step": 19332 + }, + { + "epoch": 0.9001094117373187, + "grad_norm": 0.34390498810640957, + "learning_rate": 8.829815918459357e-05, + "loss": 2.8501, + "step": 19333 + }, + { + "epoch": 0.9001559699234117, + "grad_norm": 0.37217135622842235, + "learning_rate": 8.829641772381623e-05, + "loss": 2.8917, + "step": 19334 + }, + { + "epoch": 0.9002025281095049, + "grad_norm": 0.3422382231464916, + "learning_rate": 8.829467615064268e-05, + "loss": 2.9393, + "step": 19335 + }, + { + "epoch": 0.900249086295598, + "grad_norm": 0.31762538506833443, + "learning_rate": 8.829293446507805e-05, + "loss": 2.8488, + "step": 19336 + }, + { + "epoch": 0.900295644481691, + "grad_norm": 0.32370894132478994, + "learning_rate": 8.82911926671274e-05, + "loss": 2.9031, + "step": 19337 + }, + { + "epoch": 0.9003422026677841, + "grad_norm": 0.3709052950755295, + "learning_rate": 8.828945075679588e-05, + "loss": 2.8415, + "step": 19338 + }, + { + "epoch": 
0.9003887608538771, + "grad_norm": 0.3401509765822634, + "learning_rate": 8.82877087340886e-05, + "loss": 2.7674, + "step": 19339 + }, + { + "epoch": 0.9004353190399702, + "grad_norm": 0.3602028718211042, + "learning_rate": 8.828596659901067e-05, + "loss": 2.8618, + "step": 19340 + }, + { + "epoch": 0.9004818772260633, + "grad_norm": 0.34647626155710726, + "learning_rate": 8.82842243515672e-05, + "loss": 2.876, + "step": 19341 + }, + { + "epoch": 0.9005284354121563, + "grad_norm": 0.32294396820175175, + "learning_rate": 8.82824819917633e-05, + "loss": 2.8292, + "step": 19342 + }, + { + "epoch": 0.9005749935982494, + "grad_norm": 0.3385596499348949, + "learning_rate": 8.82807395196041e-05, + "loss": 2.7962, + "step": 19343 + }, + { + "epoch": 0.9006215517843424, + "grad_norm": 0.33079046937537476, + "learning_rate": 8.827899693509467e-05, + "loss": 2.9122, + "step": 19344 + }, + { + "epoch": 0.9006681099704356, + "grad_norm": 0.33198674280828594, + "learning_rate": 8.827725423824018e-05, + "loss": 2.9924, + "step": 19345 + }, + { + "epoch": 0.9007146681565287, + "grad_norm": 0.33097625328439995, + "learning_rate": 8.827551142904571e-05, + "loss": 2.7779, + "step": 19346 + }, + { + "epoch": 0.9007612263426217, + "grad_norm": 0.3463122260296432, + "learning_rate": 8.827376850751639e-05, + "loss": 2.7418, + "step": 19347 + }, + { + "epoch": 0.9008077845287148, + "grad_norm": 0.35279546727058947, + "learning_rate": 8.827202547365732e-05, + "loss": 2.9901, + "step": 19348 + }, + { + "epoch": 0.9008543427148078, + "grad_norm": 0.3475748521658531, + "learning_rate": 8.827028232747364e-05, + "loss": 2.9407, + "step": 19349 + }, + { + "epoch": 0.9009009009009009, + "grad_norm": 0.32704641284943686, + "learning_rate": 8.826853906897044e-05, + "loss": 2.8819, + "step": 19350 + }, + { + "epoch": 0.9009474590869939, + "grad_norm": 0.3387393793871133, + "learning_rate": 8.826679569815285e-05, + "loss": 2.8032, + "step": 19351 + }, + { + "epoch": 0.900994017273087, + "grad_norm": 0.3524408943257958, + "learning_rate": 8.826505221502599e-05, + "loss": 2.7898, + "step": 19352 + }, + { + "epoch": 0.9010405754591801, + "grad_norm": 0.3182534347363333, + "learning_rate": 8.826330861959496e-05, + "loss": 2.9608, + "step": 19353 + }, + { + "epoch": 0.9010871336452732, + "grad_norm": 0.36748546516028624, + "learning_rate": 8.82615649118649e-05, + "loss": 2.9154, + "step": 19354 + }, + { + "epoch": 0.9011336918313663, + "grad_norm": 0.3370745327927597, + "learning_rate": 8.82598210918409e-05, + "loss": 2.9445, + "step": 19355 + }, + { + "epoch": 0.9011802500174593, + "grad_norm": 0.3862133269653071, + "learning_rate": 8.825807715952812e-05, + "loss": 2.899, + "step": 19356 + }, + { + "epoch": 0.9012268082035524, + "grad_norm": 0.3361309683157767, + "learning_rate": 8.825633311493164e-05, + "loss": 2.8067, + "step": 19357 + }, + { + "epoch": 0.9012733663896455, + "grad_norm": 0.3349377569712641, + "learning_rate": 8.825458895805659e-05, + "loss": 2.971, + "step": 19358 + }, + { + "epoch": 0.9013199245757385, + "grad_norm": 0.3388840818884525, + "learning_rate": 8.825284468890809e-05, + "loss": 2.7492, + "step": 19359 + }, + { + "epoch": 0.9013664827618316, + "grad_norm": 0.33676754712791757, + "learning_rate": 8.825110030749127e-05, + "loss": 2.9301, + "step": 19360 + }, + { + "epoch": 0.9014130409479246, + "grad_norm": 0.3860032995873598, + "learning_rate": 8.824935581381122e-05, + "loss": 2.8875, + "step": 19361 + }, + { + "epoch": 0.9014595991340177, + "grad_norm": 0.3475800908538409, + "learning_rate": 
8.824761120787309e-05, + "loss": 2.9583, + "step": 19362 + }, + { + "epoch": 0.9015061573201109, + "grad_norm": 0.3789983004137188, + "learning_rate": 8.824586648968199e-05, + "loss": 2.8453, + "step": 19363 + }, + { + "epoch": 0.9015527155062039, + "grad_norm": 0.3392013934752183, + "learning_rate": 8.824412165924303e-05, + "loss": 2.8419, + "step": 19364 + }, + { + "epoch": 0.901599273692297, + "grad_norm": 0.34367519236981514, + "learning_rate": 8.824237671656135e-05, + "loss": 2.9206, + "step": 19365 + }, + { + "epoch": 0.90164583187839, + "grad_norm": 0.32097730516240336, + "learning_rate": 8.824063166164207e-05, + "loss": 2.9409, + "step": 19366 + }, + { + "epoch": 0.9016923900644831, + "grad_norm": 0.3724620820483481, + "learning_rate": 8.823888649449027e-05, + "loss": 2.911, + "step": 19367 + }, + { + "epoch": 0.9017389482505762, + "grad_norm": 0.32941539339965115, + "learning_rate": 8.823714121511114e-05, + "loss": 2.8644, + "step": 19368 + }, + { + "epoch": 0.9017855064366692, + "grad_norm": 0.37559471034557473, + "learning_rate": 8.823539582350976e-05, + "loss": 2.8895, + "step": 19369 + }, + { + "epoch": 0.9018320646227623, + "grad_norm": 0.3603441729495425, + "learning_rate": 8.823365031969125e-05, + "loss": 2.8367, + "step": 19370 + }, + { + "epoch": 0.9018786228088553, + "grad_norm": 0.3966343005132397, + "learning_rate": 8.823190470366074e-05, + "loss": 2.9877, + "step": 19371 + }, + { + "epoch": 0.9019251809949484, + "grad_norm": 0.3944572470093549, + "learning_rate": 8.823015897542338e-05, + "loss": 2.8677, + "step": 19372 + }, + { + "epoch": 0.9019717391810415, + "grad_norm": 0.37050321819829246, + "learning_rate": 8.822841313498424e-05, + "loss": 2.9243, + "step": 19373 + }, + { + "epoch": 0.9020182973671346, + "grad_norm": 0.36920405833341563, + "learning_rate": 8.82266671823485e-05, + "loss": 2.9573, + "step": 19374 + }, + { + "epoch": 0.9020648555532277, + "grad_norm": 0.35221648618699736, + "learning_rate": 8.822492111752125e-05, + "loss": 2.9008, + "step": 19375 + }, + { + "epoch": 0.9021114137393207, + "grad_norm": 0.3792726749771813, + "learning_rate": 8.822317494050761e-05, + "loss": 2.982, + "step": 19376 + }, + { + "epoch": 0.9021579719254138, + "grad_norm": 0.3482189911299757, + "learning_rate": 8.822142865131274e-05, + "loss": 2.9509, + "step": 19377 + }, + { + "epoch": 0.9022045301115068, + "grad_norm": 0.37271987092107933, + "learning_rate": 8.821968224994171e-05, + "loss": 2.9773, + "step": 19378 + }, + { + "epoch": 0.9022510882975999, + "grad_norm": 0.3246942156325204, + "learning_rate": 8.82179357363997e-05, + "loss": 2.8618, + "step": 19379 + }, + { + "epoch": 0.902297646483693, + "grad_norm": 0.35959433840216765, + "learning_rate": 8.821618911069181e-05, + "loss": 2.9135, + "step": 19380 + }, + { + "epoch": 0.902344204669786, + "grad_norm": 0.30818037966094164, + "learning_rate": 8.821444237282317e-05, + "loss": 2.9139, + "step": 19381 + }, + { + "epoch": 0.9023907628558792, + "grad_norm": 0.3467176176170426, + "learning_rate": 8.82126955227989e-05, + "loss": 2.9238, + "step": 19382 + }, + { + "epoch": 0.9024373210419722, + "grad_norm": 0.33084944950413564, + "learning_rate": 8.821094856062416e-05, + "loss": 2.896, + "step": 19383 + }, + { + "epoch": 0.9024838792280653, + "grad_norm": 0.3432578265698153, + "learning_rate": 8.820920148630402e-05, + "loss": 2.9119, + "step": 19384 + }, + { + "epoch": 0.9025304374141584, + "grad_norm": 0.31201602902236353, + "learning_rate": 8.820745429984365e-05, + "loss": 2.9151, + "step": 19385 + }, + { + "epoch": 
0.9025769956002514, + "grad_norm": 0.36656332480338666, + "learning_rate": 8.820570700124815e-05, + "loss": 2.9477, + "step": 19386 + }, + { + "epoch": 0.9026235537863445, + "grad_norm": 0.3153193395800635, + "learning_rate": 8.820395959052268e-05, + "loss": 2.9445, + "step": 19387 + }, + { + "epoch": 0.9026701119724375, + "grad_norm": 0.32730651914953063, + "learning_rate": 8.820221206767235e-05, + "loss": 2.924, + "step": 19388 + }, + { + "epoch": 0.9027166701585306, + "grad_norm": 0.35566246134916474, + "learning_rate": 8.820046443270229e-05, + "loss": 2.9917, + "step": 19389 + }, + { + "epoch": 0.9027632283446237, + "grad_norm": 0.35231096664995293, + "learning_rate": 8.819871668561763e-05, + "loss": 3.0509, + "step": 19390 + }, + { + "epoch": 0.9028097865307168, + "grad_norm": 0.3763007304516556, + "learning_rate": 8.81969688264235e-05, + "loss": 2.9618, + "step": 19391 + }, + { + "epoch": 0.9028563447168099, + "grad_norm": 0.322456666492655, + "learning_rate": 8.819522085512503e-05, + "loss": 2.8793, + "step": 19392 + }, + { + "epoch": 0.9029029029029029, + "grad_norm": 0.36659801148697885, + "learning_rate": 8.819347277172735e-05, + "loss": 2.9759, + "step": 19393 + }, + { + "epoch": 0.902949461088996, + "grad_norm": 0.3007566849124925, + "learning_rate": 8.81917245762356e-05, + "loss": 2.8203, + "step": 19394 + }, + { + "epoch": 0.902996019275089, + "grad_norm": 0.3497950724208521, + "learning_rate": 8.818997626865489e-05, + "loss": 2.8842, + "step": 19395 + }, + { + "epoch": 0.9030425774611821, + "grad_norm": 0.34341176997045186, + "learning_rate": 8.818822784899034e-05, + "loss": 2.833, + "step": 19396 + }, + { + "epoch": 0.9030891356472752, + "grad_norm": 0.3370428987289233, + "learning_rate": 8.818647931724713e-05, + "loss": 2.9355, + "step": 19397 + }, + { + "epoch": 0.9031356938333682, + "grad_norm": 0.353097305084828, + "learning_rate": 8.818473067343036e-05, + "loss": 2.9835, + "step": 19398 + }, + { + "epoch": 0.9031822520194613, + "grad_norm": 0.3534165270288091, + "learning_rate": 8.818298191754517e-05, + "loss": 2.8516, + "step": 19399 + }, + { + "epoch": 0.9032288102055543, + "grad_norm": 0.3370886914310738, + "learning_rate": 8.818123304959668e-05, + "loss": 2.9026, + "step": 19400 + }, + { + "epoch": 0.9032753683916475, + "grad_norm": 0.3283155580169899, + "learning_rate": 8.817948406959005e-05, + "loss": 2.9503, + "step": 19401 + }, + { + "epoch": 0.9033219265777406, + "grad_norm": 0.3410093960339277, + "learning_rate": 8.81777349775304e-05, + "loss": 2.8755, + "step": 19402 + }, + { + "epoch": 0.9033684847638336, + "grad_norm": 0.3466642671186508, + "learning_rate": 8.817598577342283e-05, + "loss": 2.8835, + "step": 19403 + }, + { + "epoch": 0.9034150429499267, + "grad_norm": 0.35091243324664684, + "learning_rate": 8.817423645727252e-05, + "loss": 2.9102, + "step": 19404 + }, + { + "epoch": 0.9034616011360197, + "grad_norm": 0.34550224948456787, + "learning_rate": 8.817248702908458e-05, + "loss": 2.8824, + "step": 19405 + }, + { + "epoch": 0.9035081593221128, + "grad_norm": 0.37364554227640256, + "learning_rate": 8.817073748886416e-05, + "loss": 2.9555, + "step": 19406 + }, + { + "epoch": 0.9035547175082059, + "grad_norm": 0.3353166864557525, + "learning_rate": 8.816898783661638e-05, + "loss": 2.8289, + "step": 19407 + }, + { + "epoch": 0.9036012756942989, + "grad_norm": 0.377078701290151, + "learning_rate": 8.81672380723464e-05, + "loss": 2.9314, + "step": 19408 + }, + { + "epoch": 0.903647833880392, + "grad_norm": 0.3656085867818683, + "learning_rate": 
8.81654881960593e-05, + "loss": 2.9219, + "step": 19409 + }, + { + "epoch": 0.903694392066485, + "grad_norm": 0.35824017319464246, + "learning_rate": 8.816373820776028e-05, + "loss": 2.8921, + "step": 19410 + }, + { + "epoch": 0.9037409502525782, + "grad_norm": 0.3727622216643754, + "learning_rate": 8.816198810745444e-05, + "loss": 2.8399, + "step": 19411 + }, + { + "epoch": 0.9037875084386713, + "grad_norm": 0.371133747405526, + "learning_rate": 8.816023789514691e-05, + "loss": 2.9371, + "step": 19412 + }, + { + "epoch": 0.9038340666247643, + "grad_norm": 0.3784438271529272, + "learning_rate": 8.815848757084286e-05, + "loss": 2.901, + "step": 19413 + }, + { + "epoch": 0.9038806248108574, + "grad_norm": 0.36567007368299836, + "learning_rate": 8.815673713454741e-05, + "loss": 2.8324, + "step": 19414 + }, + { + "epoch": 0.9039271829969504, + "grad_norm": 0.33218471516584874, + "learning_rate": 8.815498658626568e-05, + "loss": 2.8404, + "step": 19415 + }, + { + "epoch": 0.9039737411830435, + "grad_norm": 0.3455990339846776, + "learning_rate": 8.815323592600284e-05, + "loss": 2.9005, + "step": 19416 + }, + { + "epoch": 0.9040202993691365, + "grad_norm": 0.34336092718625266, + "learning_rate": 8.8151485153764e-05, + "loss": 2.9599, + "step": 19417 + }, + { + "epoch": 0.9040668575552296, + "grad_norm": 0.32967209955938104, + "learning_rate": 8.814973426955431e-05, + "loss": 2.8819, + "step": 19418 + }, + { + "epoch": 0.9041134157413228, + "grad_norm": 0.3550534796778332, + "learning_rate": 8.81479832733789e-05, + "loss": 2.8822, + "step": 19419 + }, + { + "epoch": 0.9041599739274158, + "grad_norm": 0.35383568701298246, + "learning_rate": 8.814623216524294e-05, + "loss": 2.9027, + "step": 19420 + }, + { + "epoch": 0.9042065321135089, + "grad_norm": 0.34741789100320386, + "learning_rate": 8.814448094515153e-05, + "loss": 2.9018, + "step": 19421 + }, + { + "epoch": 0.9042530902996019, + "grad_norm": 0.3568549689655787, + "learning_rate": 8.814272961310984e-05, + "loss": 2.9382, + "step": 19422 + }, + { + "epoch": 0.904299648485695, + "grad_norm": 0.3382695019398026, + "learning_rate": 8.814097816912299e-05, + "loss": 3.0019, + "step": 19423 + }, + { + "epoch": 0.9043462066717881, + "grad_norm": 0.39117532352447887, + "learning_rate": 8.813922661319611e-05, + "loss": 2.8337, + "step": 19424 + }, + { + "epoch": 0.9043927648578811, + "grad_norm": 0.36895906773778886, + "learning_rate": 8.813747494533438e-05, + "loss": 2.9199, + "step": 19425 + }, + { + "epoch": 0.9044393230439742, + "grad_norm": 0.3923273310624679, + "learning_rate": 8.81357231655429e-05, + "loss": 2.9608, + "step": 19426 + }, + { + "epoch": 0.9044858812300672, + "grad_norm": 0.3820398488913447, + "learning_rate": 8.813397127382686e-05, + "loss": 2.8369, + "step": 19427 + }, + { + "epoch": 0.9045324394161603, + "grad_norm": 0.380284515737697, + "learning_rate": 8.813221927019133e-05, + "loss": 2.9242, + "step": 19428 + }, + { + "epoch": 0.9045789976022535, + "grad_norm": 0.34363292789252015, + "learning_rate": 8.813046715464153e-05, + "loss": 2.8557, + "step": 19429 + }, + { + "epoch": 0.9046255557883465, + "grad_norm": 0.38427207078233333, + "learning_rate": 8.812871492718253e-05, + "loss": 2.887, + "step": 19430 + }, + { + "epoch": 0.9046721139744396, + "grad_norm": 0.3651807295840763, + "learning_rate": 8.812696258781954e-05, + "loss": 2.78, + "step": 19431 + }, + { + "epoch": 0.9047186721605326, + "grad_norm": 0.34345335148493716, + "learning_rate": 8.812521013655765e-05, + "loss": 2.9293, + "step": 19432 + }, + { + "epoch": 
0.9047652303466257, + "grad_norm": 0.418717752517555, + "learning_rate": 8.812345757340203e-05, + "loss": 2.9805, + "step": 19433 + }, + { + "epoch": 0.9048117885327188, + "grad_norm": 0.35638696858411967, + "learning_rate": 8.812170489835783e-05, + "loss": 2.8965, + "step": 19434 + }, + { + "epoch": 0.9048583467188118, + "grad_norm": 0.3926890703101135, + "learning_rate": 8.811995211143016e-05, + "loss": 2.9139, + "step": 19435 + }, + { + "epoch": 0.9049049049049049, + "grad_norm": 0.38458642046036073, + "learning_rate": 8.81181992126242e-05, + "loss": 2.7907, + "step": 19436 + }, + { + "epoch": 0.9049514630909979, + "grad_norm": 0.3458197496789293, + "learning_rate": 8.811644620194508e-05, + "loss": 2.8678, + "step": 19437 + }, + { + "epoch": 0.9049980212770911, + "grad_norm": 0.3578791114528043, + "learning_rate": 8.811469307939795e-05, + "loss": 2.8674, + "step": 19438 + }, + { + "epoch": 0.9050445794631841, + "grad_norm": 0.3500731220175883, + "learning_rate": 8.811293984498794e-05, + "loss": 2.9096, + "step": 19439 + }, + { + "epoch": 0.9050911376492772, + "grad_norm": 0.41248558540626323, + "learning_rate": 8.81111864987202e-05, + "loss": 3.0159, + "step": 19440 + }, + { + "epoch": 0.9051376958353703, + "grad_norm": 0.3337445197112958, + "learning_rate": 8.81094330405999e-05, + "loss": 2.8669, + "step": 19441 + }, + { + "epoch": 0.9051842540214633, + "grad_norm": 0.36670697488107146, + "learning_rate": 8.810767947063216e-05, + "loss": 2.949, + "step": 19442 + }, + { + "epoch": 0.9052308122075564, + "grad_norm": 0.35465494586303403, + "learning_rate": 8.810592578882213e-05, + "loss": 2.9603, + "step": 19443 + }, + { + "epoch": 0.9052773703936494, + "grad_norm": 0.32294942993419495, + "learning_rate": 8.810417199517497e-05, + "loss": 2.8968, + "step": 19444 + }, + { + "epoch": 0.9053239285797425, + "grad_norm": 0.37975370022948146, + "learning_rate": 8.810241808969582e-05, + "loss": 2.9462, + "step": 19445 + }, + { + "epoch": 0.9053704867658356, + "grad_norm": 0.3078750043162044, + "learning_rate": 8.810066407238981e-05, + "loss": 2.8981, + "step": 19446 + }, + { + "epoch": 0.9054170449519287, + "grad_norm": 0.3329037427726586, + "learning_rate": 8.809890994326213e-05, + "loss": 2.782, + "step": 19447 + }, + { + "epoch": 0.9054636031380218, + "grad_norm": 0.3033697606632688, + "learning_rate": 8.80971557023179e-05, + "loss": 2.8095, + "step": 19448 + }, + { + "epoch": 0.9055101613241148, + "grad_norm": 0.3058415386564064, + "learning_rate": 8.809540134956226e-05, + "loss": 2.9709, + "step": 19449 + }, + { + "epoch": 0.9055567195102079, + "grad_norm": 0.3403931505899644, + "learning_rate": 8.809364688500037e-05, + "loss": 2.8775, + "step": 19450 + }, + { + "epoch": 0.905603277696301, + "grad_norm": 0.32944239431253103, + "learning_rate": 8.809189230863739e-05, + "loss": 2.9353, + "step": 19451 + }, + { + "epoch": 0.905649835882394, + "grad_norm": 0.3054375311802124, + "learning_rate": 8.809013762047844e-05, + "loss": 2.8113, + "step": 19452 + }, + { + "epoch": 0.9056963940684871, + "grad_norm": 0.3141502717717921, + "learning_rate": 8.80883828205287e-05, + "loss": 2.936, + "step": 19453 + }, + { + "epoch": 0.9057429522545801, + "grad_norm": 0.35044943138377677, + "learning_rate": 8.808662790879329e-05, + "loss": 2.8959, + "step": 19454 + }, + { + "epoch": 0.9057895104406732, + "grad_norm": 0.34106061495580137, + "learning_rate": 8.80848728852774e-05, + "loss": 2.953, + "step": 19455 + }, + { + "epoch": 0.9058360686267664, + "grad_norm": 0.338198219426658, + "learning_rate": 
8.808311774998617e-05, + "loss": 2.9853, + "step": 19456 + }, + { + "epoch": 0.9058826268128594, + "grad_norm": 0.3452623548459772, + "learning_rate": 8.808136250292473e-05, + "loss": 2.916, + "step": 19457 + }, + { + "epoch": 0.9059291849989525, + "grad_norm": 0.34140297681586673, + "learning_rate": 8.807960714409824e-05, + "loss": 2.8652, + "step": 19458 + }, + { + "epoch": 0.9059757431850455, + "grad_norm": 0.34884940994234803, + "learning_rate": 8.807785167351184e-05, + "loss": 2.8644, + "step": 19459 + }, + { + "epoch": 0.9060223013711386, + "grad_norm": 0.32446078806609147, + "learning_rate": 8.807609609117071e-05, + "loss": 2.8601, + "step": 19460 + }, + { + "epoch": 0.9060688595572316, + "grad_norm": 0.36821286973692025, + "learning_rate": 8.807434039708e-05, + "loss": 2.8966, + "step": 19461 + }, + { + "epoch": 0.9061154177433247, + "grad_norm": 0.34078837563629516, + "learning_rate": 8.807258459124484e-05, + "loss": 2.8192, + "step": 19462 + }, + { + "epoch": 0.9061619759294178, + "grad_norm": 0.3596541530398145, + "learning_rate": 8.80708286736704e-05, + "loss": 2.9351, + "step": 19463 + }, + { + "epoch": 0.9062085341155108, + "grad_norm": 0.35220371730108235, + "learning_rate": 8.806907264436183e-05, + "loss": 2.8617, + "step": 19464 + }, + { + "epoch": 0.906255092301604, + "grad_norm": 0.3493809282208435, + "learning_rate": 8.806731650332428e-05, + "loss": 2.8463, + "step": 19465 + }, + { + "epoch": 0.906301650487697, + "grad_norm": 0.3364439218108697, + "learning_rate": 8.806556025056289e-05, + "loss": 2.8394, + "step": 19466 + }, + { + "epoch": 0.9063482086737901, + "grad_norm": 0.34879964494319465, + "learning_rate": 8.806380388608285e-05, + "loss": 2.8823, + "step": 19467 + }, + { + "epoch": 0.9063947668598832, + "grad_norm": 0.3773321513292491, + "learning_rate": 8.80620474098893e-05, + "loss": 2.8673, + "step": 19468 + }, + { + "epoch": 0.9064413250459762, + "grad_norm": 0.3475972230339106, + "learning_rate": 8.806029082198738e-05, + "loss": 2.9497, + "step": 19469 + }, + { + "epoch": 0.9064878832320693, + "grad_norm": 0.37403154321197857, + "learning_rate": 8.805853412238226e-05, + "loss": 2.8117, + "step": 19470 + }, + { + "epoch": 0.9065344414181623, + "grad_norm": 0.3536848236534618, + "learning_rate": 8.805677731107908e-05, + "loss": 2.9056, + "step": 19471 + }, + { + "epoch": 0.9065809996042554, + "grad_norm": 0.3801690916950052, + "learning_rate": 8.805502038808302e-05, + "loss": 2.9297, + "step": 19472 + }, + { + "epoch": 0.9066275577903485, + "grad_norm": 0.39028271766181577, + "learning_rate": 8.805326335339922e-05, + "loss": 2.9317, + "step": 19473 + }, + { + "epoch": 0.9066741159764415, + "grad_norm": 0.3740023752484219, + "learning_rate": 8.805150620703284e-05, + "loss": 2.8871, + "step": 19474 + }, + { + "epoch": 0.9067206741625347, + "grad_norm": 0.387569871012183, + "learning_rate": 8.804974894898905e-05, + "loss": 2.9418, + "step": 19475 + }, + { + "epoch": 0.9067672323486277, + "grad_norm": 0.42450817478658404, + "learning_rate": 8.804799157927299e-05, + "loss": 2.9226, + "step": 19476 + }, + { + "epoch": 0.9068137905347208, + "grad_norm": 0.3528476708706272, + "learning_rate": 8.804623409788981e-05, + "loss": 2.9481, + "step": 19477 + }, + { + "epoch": 0.9068603487208139, + "grad_norm": 0.4293076371356073, + "learning_rate": 8.804447650484468e-05, + "loss": 2.952, + "step": 19478 + }, + { + "epoch": 0.9069069069069069, + "grad_norm": 0.3821865899885579, + "learning_rate": 8.804271880014276e-05, + "loss": 2.8502, + "step": 19479 + }, + { + "epoch": 
0.906953465093, + "grad_norm": 0.3669677913128856, + "learning_rate": 8.804096098378924e-05, + "loss": 2.847, + "step": 19480 + }, + { + "epoch": 0.907000023279093, + "grad_norm": 0.37285939835142423, + "learning_rate": 8.803920305578921e-05, + "loss": 2.8946, + "step": 19481 + }, + { + "epoch": 0.9070465814651861, + "grad_norm": 0.37477295992699877, + "learning_rate": 8.803744501614789e-05, + "loss": 2.9365, + "step": 19482 + }, + { + "epoch": 0.9070931396512791, + "grad_norm": 0.36883742873487596, + "learning_rate": 8.803568686487041e-05, + "loss": 2.9394, + "step": 19483 + }, + { + "epoch": 0.9071396978373722, + "grad_norm": 0.3590077023801557, + "learning_rate": 8.803392860196193e-05, + "loss": 2.9158, + "step": 19484 + }, + { + "epoch": 0.9071862560234654, + "grad_norm": 0.3742075033175657, + "learning_rate": 8.803217022742763e-05, + "loss": 2.8915, + "step": 19485 + }, + { + "epoch": 0.9072328142095584, + "grad_norm": 0.3962019999074592, + "learning_rate": 8.803041174127264e-05, + "loss": 2.9347, + "step": 19486 + }, + { + "epoch": 0.9072793723956515, + "grad_norm": 0.3417191482721927, + "learning_rate": 8.802865314350214e-05, + "loss": 2.8155, + "step": 19487 + }, + { + "epoch": 0.9073259305817445, + "grad_norm": 0.3621372910569754, + "learning_rate": 8.802689443412129e-05, + "loss": 2.842, + "step": 19488 + }, + { + "epoch": 0.9073724887678376, + "grad_norm": 0.5755294814436634, + "learning_rate": 8.802513561313526e-05, + "loss": 2.715, + "step": 19489 + }, + { + "epoch": 0.9074190469539307, + "grad_norm": 0.421643087152665, + "learning_rate": 8.80233766805492e-05, + "loss": 2.9315, + "step": 19490 + }, + { + "epoch": 0.9074656051400237, + "grad_norm": 0.32732490046136886, + "learning_rate": 8.802161763636827e-05, + "loss": 2.9058, + "step": 19491 + }, + { + "epoch": 0.9075121633261168, + "grad_norm": 0.4031982534757696, + "learning_rate": 8.801985848059764e-05, + "loss": 2.9821, + "step": 19492 + }, + { + "epoch": 0.9075587215122098, + "grad_norm": 0.2998629576747444, + "learning_rate": 8.801809921324248e-05, + "loss": 2.9014, + "step": 19493 + }, + { + "epoch": 0.907605279698303, + "grad_norm": 0.37875718615858933, + "learning_rate": 8.801633983430794e-05, + "loss": 2.8846, + "step": 19494 + }, + { + "epoch": 0.9076518378843961, + "grad_norm": 0.30601330845241215, + "learning_rate": 8.801458034379918e-05, + "loss": 2.973, + "step": 19495 + }, + { + "epoch": 0.9076983960704891, + "grad_norm": 0.3521444744931526, + "learning_rate": 8.801282074172137e-05, + "loss": 2.9768, + "step": 19496 + }, + { + "epoch": 0.9077449542565822, + "grad_norm": 0.30774899897553437, + "learning_rate": 8.801106102807969e-05, + "loss": 3.0121, + "step": 19497 + }, + { + "epoch": 0.9077915124426752, + "grad_norm": 0.34436261311318017, + "learning_rate": 8.800930120287928e-05, + "loss": 2.9049, + "step": 19498 + }, + { + "epoch": 0.9078380706287683, + "grad_norm": 0.3077242621517242, + "learning_rate": 8.800754126612532e-05, + "loss": 2.9621, + "step": 19499 + }, + { + "epoch": 0.9078846288148614, + "grad_norm": 0.36983352478503895, + "learning_rate": 8.800578121782297e-05, + "loss": 2.9455, + "step": 19500 + }, + { + "epoch": 0.9079311870009544, + "grad_norm": 0.3169611812042079, + "learning_rate": 8.80040210579774e-05, + "loss": 2.8804, + "step": 19501 + }, + { + "epoch": 0.9079777451870475, + "grad_norm": 0.3671910822225431, + "learning_rate": 8.800226078659377e-05, + "loss": 2.868, + "step": 19502 + }, + { + "epoch": 0.9080243033731406, + "grad_norm": 0.3361292045414289, + "learning_rate": 
8.800050040367725e-05, + "loss": 2.8293, + "step": 19503 + }, + { + "epoch": 0.9080708615592337, + "grad_norm": 0.34171959941142604, + "learning_rate": 8.7998739909233e-05, + "loss": 2.8098, + "step": 19504 + }, + { + "epoch": 0.9081174197453267, + "grad_norm": 0.34425299178476665, + "learning_rate": 8.799697930326618e-05, + "loss": 2.9824, + "step": 19505 + }, + { + "epoch": 0.9081639779314198, + "grad_norm": 0.3521913839090785, + "learning_rate": 8.799521858578199e-05, + "loss": 3.0298, + "step": 19506 + }, + { + "epoch": 0.9082105361175129, + "grad_norm": 0.3571310135323226, + "learning_rate": 8.799345775678556e-05, + "loss": 2.7945, + "step": 19507 + }, + { + "epoch": 0.9082570943036059, + "grad_norm": 0.3220060710886364, + "learning_rate": 8.79916968162821e-05, + "loss": 2.833, + "step": 19508 + }, + { + "epoch": 0.908303652489699, + "grad_norm": 0.35101958467157823, + "learning_rate": 8.798993576427672e-05, + "loss": 2.8751, + "step": 19509 + }, + { + "epoch": 0.908350210675792, + "grad_norm": 0.3553050392648749, + "learning_rate": 8.798817460077465e-05, + "loss": 2.9074, + "step": 19510 + }, + { + "epoch": 0.9083967688618851, + "grad_norm": 0.3323246348622319, + "learning_rate": 8.798641332578101e-05, + "loss": 3.0261, + "step": 19511 + }, + { + "epoch": 0.9084433270479783, + "grad_norm": 0.41946115231170344, + "learning_rate": 8.7984651939301e-05, + "loss": 2.9524, + "step": 19512 + }, + { + "epoch": 0.9084898852340713, + "grad_norm": 0.3075120040353711, + "learning_rate": 8.798289044133977e-05, + "loss": 2.8332, + "step": 19513 + }, + { + "epoch": 0.9085364434201644, + "grad_norm": 0.3396512674399641, + "learning_rate": 8.79811288319025e-05, + "loss": 2.8427, + "step": 19514 + }, + { + "epoch": 0.9085830016062574, + "grad_norm": 0.3311860979746703, + "learning_rate": 8.797936711099437e-05, + "loss": 2.911, + "step": 19515 + }, + { + "epoch": 0.9086295597923505, + "grad_norm": 0.32265486289958495, + "learning_rate": 8.797760527862053e-05, + "loss": 2.9402, + "step": 19516 + }, + { + "epoch": 0.9086761179784436, + "grad_norm": 0.32748739944112804, + "learning_rate": 8.797584333478617e-05, + "loss": 2.963, + "step": 19517 + }, + { + "epoch": 0.9087226761645366, + "grad_norm": 0.32027968013720415, + "learning_rate": 8.797408127949642e-05, + "loss": 2.8998, + "step": 19518 + }, + { + "epoch": 0.9087692343506297, + "grad_norm": 0.329680763038411, + "learning_rate": 8.797231911275651e-05, + "loss": 2.8489, + "step": 19519 + }, + { + "epoch": 0.9088157925367227, + "grad_norm": 0.3262332936926547, + "learning_rate": 8.797055683457159e-05, + "loss": 2.9116, + "step": 19520 + }, + { + "epoch": 0.9088623507228158, + "grad_norm": 0.33371741704131314, + "learning_rate": 8.796879444494681e-05, + "loss": 2.8593, + "step": 19521 + }, + { + "epoch": 0.908908908908909, + "grad_norm": 0.32082942182629104, + "learning_rate": 8.796703194388736e-05, + "loss": 2.9834, + "step": 19522 + }, + { + "epoch": 0.908955467095002, + "grad_norm": 0.33236738975490376, + "learning_rate": 8.796526933139843e-05, + "loss": 2.9654, + "step": 19523 + }, + { + "epoch": 0.9090020252810951, + "grad_norm": 0.346368098411845, + "learning_rate": 8.796350660748515e-05, + "loss": 2.9211, + "step": 19524 + }, + { + "epoch": 0.9090485834671881, + "grad_norm": 0.35296562300564543, + "learning_rate": 8.796174377215273e-05, + "loss": 2.866, + "step": 19525 + }, + { + "epoch": 0.9090951416532812, + "grad_norm": 0.3186734098595333, + "learning_rate": 8.795998082540633e-05, + "loss": 2.8334, + "step": 19526 + }, + { + "epoch": 
0.9091416998393742, + "grad_norm": 0.3759398554171704, + "learning_rate": 8.795821776725113e-05, + "loss": 2.9032, + "step": 19527 + }, + { + "epoch": 0.9091882580254673, + "grad_norm": 0.32497676130572334, + "learning_rate": 8.79564545976923e-05, + "loss": 2.8921, + "step": 19528 + }, + { + "epoch": 0.9092348162115604, + "grad_norm": 0.35325577815278686, + "learning_rate": 8.795469131673501e-05, + "loss": 2.9196, + "step": 19529 + }, + { + "epoch": 0.9092813743976534, + "grad_norm": 0.3579815899426888, + "learning_rate": 8.795292792438444e-05, + "loss": 2.9238, + "step": 19530 + }, + { + "epoch": 0.9093279325837466, + "grad_norm": 0.37343663698149054, + "learning_rate": 8.795116442064577e-05, + "loss": 2.8531, + "step": 19531 + }, + { + "epoch": 0.9093744907698396, + "grad_norm": 0.3595600326377204, + "learning_rate": 8.794940080552417e-05, + "loss": 2.9407, + "step": 19532 + }, + { + "epoch": 0.9094210489559327, + "grad_norm": 0.36128981924540615, + "learning_rate": 8.794763707902482e-05, + "loss": 2.835, + "step": 19533 + }, + { + "epoch": 0.9094676071420258, + "grad_norm": 0.3453940791048857, + "learning_rate": 8.794587324115289e-05, + "loss": 2.8731, + "step": 19534 + }, + { + "epoch": 0.9095141653281188, + "grad_norm": 0.3457139183382412, + "learning_rate": 8.794410929191356e-05, + "loss": 2.8866, + "step": 19535 + }, + { + "epoch": 0.9095607235142119, + "grad_norm": 0.40133883573603185, + "learning_rate": 8.7942345231312e-05, + "loss": 2.9236, + "step": 19536 + }, + { + "epoch": 0.9096072817003049, + "grad_norm": 0.3288727132495593, + "learning_rate": 8.794058105935341e-05, + "loss": 2.8272, + "step": 19537 + }, + { + "epoch": 0.909653839886398, + "grad_norm": 0.373406008420064, + "learning_rate": 8.793881677604294e-05, + "loss": 2.954, + "step": 19538 + }, + { + "epoch": 0.9097003980724911, + "grad_norm": 0.3512581167609094, + "learning_rate": 8.793705238138579e-05, + "loss": 2.9209, + "step": 19539 + }, + { + "epoch": 0.9097469562585841, + "grad_norm": 0.3429170551979033, + "learning_rate": 8.793528787538712e-05, + "loss": 2.825, + "step": 19540 + }, + { + "epoch": 0.9097935144446773, + "grad_norm": 0.3777675109946951, + "learning_rate": 8.793352325805214e-05, + "loss": 2.8244, + "step": 19541 + }, + { + "epoch": 0.9098400726307703, + "grad_norm": 0.3373762705660925, + "learning_rate": 8.793175852938597e-05, + "loss": 2.8556, + "step": 19542 + }, + { + "epoch": 0.9098866308168634, + "grad_norm": 0.3851693501753352, + "learning_rate": 8.792999368939384e-05, + "loss": 2.8697, + "step": 19543 + }, + { + "epoch": 0.9099331890029565, + "grad_norm": 0.33497989563973246, + "learning_rate": 8.792822873808091e-05, + "loss": 2.9028, + "step": 19544 + }, + { + "epoch": 0.9099797471890495, + "grad_norm": 0.34667535255255283, + "learning_rate": 8.792646367545239e-05, + "loss": 2.8253, + "step": 19545 + }, + { + "epoch": 0.9100263053751426, + "grad_norm": 0.3390995460813727, + "learning_rate": 8.792469850151342e-05, + "loss": 2.9732, + "step": 19546 + }, + { + "epoch": 0.9100728635612356, + "grad_norm": 0.378373205738279, + "learning_rate": 8.792293321626918e-05, + "loss": 2.854, + "step": 19547 + }, + { + "epoch": 0.9101194217473287, + "grad_norm": 0.3469289588478493, + "learning_rate": 8.792116781972489e-05, + "loss": 2.8677, + "step": 19548 + }, + { + "epoch": 0.9101659799334217, + "grad_norm": 0.3428805469391301, + "learning_rate": 8.79194023118857e-05, + "loss": 2.8871, + "step": 19549 + }, + { + "epoch": 0.9102125381195149, + "grad_norm": 0.34961044913299016, + "learning_rate": 
8.79176366927568e-05, + "loss": 2.9068, + "step": 19550 + }, + { + "epoch": 0.910259096305608, + "grad_norm": 0.3415546591296902, + "learning_rate": 8.791587096234337e-05, + "loss": 2.9066, + "step": 19551 + }, + { + "epoch": 0.910305654491701, + "grad_norm": 0.35945744124680734, + "learning_rate": 8.79141051206506e-05, + "loss": 2.7714, + "step": 19552 + }, + { + "epoch": 0.9103522126777941, + "grad_norm": 0.35493255891710196, + "learning_rate": 8.791233916768367e-05, + "loss": 2.8956, + "step": 19553 + }, + { + "epoch": 0.9103987708638871, + "grad_norm": 0.3976636046722774, + "learning_rate": 8.791057310344775e-05, + "loss": 2.8762, + "step": 19554 + }, + { + "epoch": 0.9104453290499802, + "grad_norm": 0.31552815737937556, + "learning_rate": 8.790880692794803e-05, + "loss": 2.8945, + "step": 19555 + }, + { + "epoch": 0.9104918872360733, + "grad_norm": 0.370215845760202, + "learning_rate": 8.790704064118971e-05, + "loss": 2.8405, + "step": 19556 + }, + { + "epoch": 0.9105384454221663, + "grad_norm": 0.33346794360339654, + "learning_rate": 8.790527424317794e-05, + "loss": 2.8968, + "step": 19557 + }, + { + "epoch": 0.9105850036082594, + "grad_norm": 0.35828479158956433, + "learning_rate": 8.790350773391794e-05, + "loss": 3.0369, + "step": 19558 + }, + { + "epoch": 0.9106315617943525, + "grad_norm": 0.33472910535202904, + "learning_rate": 8.790174111341487e-05, + "loss": 2.7862, + "step": 19559 + }, + { + "epoch": 0.9106781199804456, + "grad_norm": 0.3594378575644664, + "learning_rate": 8.789997438167393e-05, + "loss": 2.9241, + "step": 19560 + }, + { + "epoch": 0.9107246781665387, + "grad_norm": 0.34804362634221564, + "learning_rate": 8.78982075387003e-05, + "loss": 2.8364, + "step": 19561 + }, + { + "epoch": 0.9107712363526317, + "grad_norm": 0.3603927038202662, + "learning_rate": 8.789644058449917e-05, + "loss": 2.8884, + "step": 19562 + }, + { + "epoch": 0.9108177945387248, + "grad_norm": 0.3652273528724997, + "learning_rate": 8.789467351907571e-05, + "loss": 2.8313, + "step": 19563 + }, + { + "epoch": 0.9108643527248178, + "grad_norm": 0.332079076637042, + "learning_rate": 8.789290634243512e-05, + "loss": 2.8492, + "step": 19564 + }, + { + "epoch": 0.9109109109109109, + "grad_norm": 0.4353254635260076, + "learning_rate": 8.789113905458258e-05, + "loss": 2.9047, + "step": 19565 + }, + { + "epoch": 0.910957469097004, + "grad_norm": 0.3659948589868187, + "learning_rate": 8.788937165552328e-05, + "loss": 2.7954, + "step": 19566 + }, + { + "epoch": 0.911004027283097, + "grad_norm": 0.3880606823483677, + "learning_rate": 8.78876041452624e-05, + "loss": 2.8871, + "step": 19567 + }, + { + "epoch": 0.9110505854691902, + "grad_norm": 0.3492377381983223, + "learning_rate": 8.788583652380516e-05, + "loss": 2.921, + "step": 19568 + }, + { + "epoch": 0.9110971436552832, + "grad_norm": 0.3584872310061145, + "learning_rate": 8.788406879115669e-05, + "loss": 2.9113, + "step": 19569 + }, + { + "epoch": 0.9111437018413763, + "grad_norm": 0.39827340169462166, + "learning_rate": 8.788230094732223e-05, + "loss": 2.8372, + "step": 19570 + }, + { + "epoch": 0.9111902600274693, + "grad_norm": 0.39451753742331985, + "learning_rate": 8.788053299230694e-05, + "loss": 2.8024, + "step": 19571 + }, + { + "epoch": 0.9112368182135624, + "grad_norm": 0.3894892859646229, + "learning_rate": 8.787876492611602e-05, + "loss": 2.8583, + "step": 19572 + }, + { + "epoch": 0.9112833763996555, + "grad_norm": 0.42558282272171055, + "learning_rate": 8.787699674875467e-05, + "loss": 2.8798, + "step": 19573 + }, + { + "epoch": 
0.9113299345857485, + "grad_norm": 0.39576708306848896, + "learning_rate": 8.787522846022804e-05, + "loss": 2.9867, + "step": 19574 + }, + { + "epoch": 0.9113764927718416, + "grad_norm": 0.41966561600875524, + "learning_rate": 8.787346006054137e-05, + "loss": 2.7811, + "step": 19575 + }, + { + "epoch": 0.9114230509579346, + "grad_norm": 0.37519271261600795, + "learning_rate": 8.787169154969982e-05, + "loss": 2.9587, + "step": 19576 + }, + { + "epoch": 0.9114696091440277, + "grad_norm": 0.46064514274262225, + "learning_rate": 8.786992292770858e-05, + "loss": 2.8623, + "step": 19577 + }, + { + "epoch": 0.9115161673301209, + "grad_norm": 0.3531405407103425, + "learning_rate": 8.786815419457285e-05, + "loss": 2.9049, + "step": 19578 + }, + { + "epoch": 0.9115627255162139, + "grad_norm": 0.412526397628379, + "learning_rate": 8.78663853502978e-05, + "loss": 2.8918, + "step": 19579 + }, + { + "epoch": 0.911609283702307, + "grad_norm": 0.34997892054078733, + "learning_rate": 8.786461639488867e-05, + "loss": 2.9589, + "step": 19580 + }, + { + "epoch": 0.9116558418884, + "grad_norm": 0.36436298232705694, + "learning_rate": 8.786284732835061e-05, + "loss": 2.9417, + "step": 19581 + }, + { + "epoch": 0.9117024000744931, + "grad_norm": 0.31971499932102576, + "learning_rate": 8.786107815068882e-05, + "loss": 2.8693, + "step": 19582 + }, + { + "epoch": 0.9117489582605862, + "grad_norm": 0.3847740214010992, + "learning_rate": 8.78593088619085e-05, + "loss": 2.9986, + "step": 19583 + }, + { + "epoch": 0.9117955164466792, + "grad_norm": 0.3167753652142457, + "learning_rate": 8.785753946201484e-05, + "loss": 2.8251, + "step": 19584 + }, + { + "epoch": 0.9118420746327723, + "grad_norm": 0.38633978226852445, + "learning_rate": 8.785576995101301e-05, + "loss": 2.8215, + "step": 19585 + }, + { + "epoch": 0.9118886328188653, + "grad_norm": 0.3570374945186243, + "learning_rate": 8.785400032890825e-05, + "loss": 2.9541, + "step": 19586 + }, + { + "epoch": 0.9119351910049585, + "grad_norm": 0.3791328824310264, + "learning_rate": 8.785223059570572e-05, + "loss": 2.9317, + "step": 19587 + }, + { + "epoch": 0.9119817491910516, + "grad_norm": 0.3414528899664806, + "learning_rate": 8.785046075141062e-05, + "loss": 2.8142, + "step": 19588 + }, + { + "epoch": 0.9120283073771446, + "grad_norm": 0.3742909268765202, + "learning_rate": 8.784869079602816e-05, + "loss": 2.8567, + "step": 19589 + }, + { + "epoch": 0.9120748655632377, + "grad_norm": 0.33274269345557994, + "learning_rate": 8.78469207295635e-05, + "loss": 2.8697, + "step": 19590 + }, + { + "epoch": 0.9121214237493307, + "grad_norm": 0.3668521831206913, + "learning_rate": 8.784515055202187e-05, + "loss": 2.9361, + "step": 19591 + }, + { + "epoch": 0.9121679819354238, + "grad_norm": 0.3312683933668303, + "learning_rate": 8.784338026340844e-05, + "loss": 2.8735, + "step": 19592 + }, + { + "epoch": 0.9122145401215168, + "grad_norm": 0.35856427854814726, + "learning_rate": 8.784160986372842e-05, + "loss": 2.8809, + "step": 19593 + }, + { + "epoch": 0.9122610983076099, + "grad_norm": 0.33454704989458894, + "learning_rate": 8.783983935298701e-05, + "loss": 2.8562, + "step": 19594 + }, + { + "epoch": 0.912307656493703, + "grad_norm": 0.37164350883596664, + "learning_rate": 8.783806873118941e-05, + "loss": 2.9355, + "step": 19595 + }, + { + "epoch": 0.912354214679796, + "grad_norm": 0.384951748939384, + "learning_rate": 8.783629799834078e-05, + "loss": 2.9029, + "step": 19596 + }, + { + "epoch": 0.9124007728658892, + "grad_norm": 0.3625377267825257, + "learning_rate": 
8.783452715444636e-05, + "loss": 2.9548, + "step": 19597 + }, + { + "epoch": 0.9124473310519822, + "grad_norm": 0.38651728485182074, + "learning_rate": 8.783275619951133e-05, + "loss": 2.9411, + "step": 19598 + }, + { + "epoch": 0.9124938892380753, + "grad_norm": 0.3157715189779542, + "learning_rate": 8.783098513354087e-05, + "loss": 3.0182, + "step": 19599 + }, + { + "epoch": 0.9125404474241684, + "grad_norm": 0.4167569931187015, + "learning_rate": 8.782921395654022e-05, + "loss": 2.911, + "step": 19600 + }, + { + "epoch": 0.9125870056102614, + "grad_norm": 0.32749669028675066, + "learning_rate": 8.782744266851454e-05, + "loss": 2.9203, + "step": 19601 + }, + { + "epoch": 0.9126335637963545, + "grad_norm": 0.32779264788200724, + "learning_rate": 8.782567126946904e-05, + "loss": 2.9256, + "step": 19602 + }, + { + "epoch": 0.9126801219824475, + "grad_norm": 0.32381472003686024, + "learning_rate": 8.782389975940892e-05, + "loss": 2.8364, + "step": 19603 + }, + { + "epoch": 0.9127266801685406, + "grad_norm": 0.33129834300971883, + "learning_rate": 8.78221281383394e-05, + "loss": 2.9387, + "step": 19604 + }, + { + "epoch": 0.9127732383546338, + "grad_norm": 0.33587766197589786, + "learning_rate": 8.782035640626565e-05, + "loss": 2.926, + "step": 19605 + }, + { + "epoch": 0.9128197965407268, + "grad_norm": 0.3472936496627369, + "learning_rate": 8.781858456319287e-05, + "loss": 2.8799, + "step": 19606 + }, + { + "epoch": 0.9128663547268199, + "grad_norm": 0.32061599762808235, + "learning_rate": 8.781681260912627e-05, + "loss": 2.894, + "step": 19607 + }, + { + "epoch": 0.9129129129129129, + "grad_norm": 0.3689652803458499, + "learning_rate": 8.781504054407106e-05, + "loss": 2.9774, + "step": 19608 + }, + { + "epoch": 0.912959471099006, + "grad_norm": 0.37340738413081287, + "learning_rate": 8.781326836803244e-05, + "loss": 2.9678, + "step": 19609 + }, + { + "epoch": 0.9130060292850991, + "grad_norm": 0.3265235287650436, + "learning_rate": 8.781149608101557e-05, + "loss": 2.7732, + "step": 19610 + }, + { + "epoch": 0.9130525874711921, + "grad_norm": 0.3853026667739662, + "learning_rate": 8.780972368302571e-05, + "loss": 2.8488, + "step": 19611 + }, + { + "epoch": 0.9130991456572852, + "grad_norm": 0.332840619098193, + "learning_rate": 8.780795117406804e-05, + "loss": 2.8897, + "step": 19612 + }, + { + "epoch": 0.9131457038433782, + "grad_norm": 0.4012184603284688, + "learning_rate": 8.780617855414775e-05, + "loss": 2.8801, + "step": 19613 + }, + { + "epoch": 0.9131922620294713, + "grad_norm": 0.30676317136038145, + "learning_rate": 8.780440582327005e-05, + "loss": 2.9197, + "step": 19614 + }, + { + "epoch": 0.9132388202155644, + "grad_norm": 0.366824581771714, + "learning_rate": 8.780263298144014e-05, + "loss": 2.9786, + "step": 19615 + }, + { + "epoch": 0.9132853784016575, + "grad_norm": 0.31289419570647686, + "learning_rate": 8.780086002866323e-05, + "loss": 2.8697, + "step": 19616 + }, + { + "epoch": 0.9133319365877506, + "grad_norm": 0.33254139261334265, + "learning_rate": 8.779908696494453e-05, + "loss": 2.8903, + "step": 19617 + }, + { + "epoch": 0.9133784947738436, + "grad_norm": 0.346800837892771, + "learning_rate": 8.779731379028922e-05, + "loss": 2.932, + "step": 19618 + }, + { + "epoch": 0.9134250529599367, + "grad_norm": 0.35068899436153417, + "learning_rate": 8.779554050470251e-05, + "loss": 2.9265, + "step": 19619 + }, + { + "epoch": 0.9134716111460297, + "grad_norm": 0.36557133722385887, + "learning_rate": 8.779376710818962e-05, + "loss": 2.9142, + "step": 19620 + }, + { + "epoch": 
0.9135181693321228, + "grad_norm": 0.3332026274245774, + "learning_rate": 8.779199360075576e-05, + "loss": 2.9072, + "step": 19621 + }, + { + "epoch": 0.9135647275182159, + "grad_norm": 0.3357381864474263, + "learning_rate": 8.779021998240611e-05, + "loss": 2.9126, + "step": 19622 + }, + { + "epoch": 0.9136112857043089, + "grad_norm": 0.33752038903576587, + "learning_rate": 8.778844625314588e-05, + "loss": 2.9031, + "step": 19623 + }, + { + "epoch": 0.913657843890402, + "grad_norm": 0.33316148420576863, + "learning_rate": 8.778667241298029e-05, + "loss": 2.7785, + "step": 19624 + }, + { + "epoch": 0.9137044020764951, + "grad_norm": 0.3219986732816779, + "learning_rate": 8.778489846191454e-05, + "loss": 2.8689, + "step": 19625 + }, + { + "epoch": 0.9137509602625882, + "grad_norm": 0.3066147832357123, + "learning_rate": 8.778312439995385e-05, + "loss": 2.8259, + "step": 19626 + }, + { + "epoch": 0.9137975184486813, + "grad_norm": 0.337348476555367, + "learning_rate": 8.778135022710339e-05, + "loss": 2.8142, + "step": 19627 + }, + { + "epoch": 0.9138440766347743, + "grad_norm": 0.3258728181267562, + "learning_rate": 8.77795759433684e-05, + "loss": 2.9007, + "step": 19628 + }, + { + "epoch": 0.9138906348208674, + "grad_norm": 0.348790099938174, + "learning_rate": 8.777780154875408e-05, + "loss": 2.8379, + "step": 19629 + }, + { + "epoch": 0.9139371930069604, + "grad_norm": 0.3248494017018329, + "learning_rate": 8.777602704326562e-05, + "loss": 2.8615, + "step": 19630 + }, + { + "epoch": 0.9139837511930535, + "grad_norm": 0.3571441590535093, + "learning_rate": 8.777425242690824e-05, + "loss": 2.8321, + "step": 19631 + }, + { + "epoch": 0.9140303093791466, + "grad_norm": 0.31492735946976835, + "learning_rate": 8.777247769968717e-05, + "loss": 2.946, + "step": 19632 + }, + { + "epoch": 0.9140768675652396, + "grad_norm": 0.3272681207635455, + "learning_rate": 8.777070286160759e-05, + "loss": 2.9625, + "step": 19633 + }, + { + "epoch": 0.9141234257513328, + "grad_norm": 0.3420544761232804, + "learning_rate": 8.776892791267472e-05, + "loss": 2.8438, + "step": 19634 + }, + { + "epoch": 0.9141699839374258, + "grad_norm": 0.31888017121072076, + "learning_rate": 8.776715285289377e-05, + "loss": 2.8192, + "step": 19635 + }, + { + "epoch": 0.9142165421235189, + "grad_norm": 0.35408181251921206, + "learning_rate": 8.776537768226994e-05, + "loss": 2.9071, + "step": 19636 + }, + { + "epoch": 0.9142631003096119, + "grad_norm": 0.33393254440217146, + "learning_rate": 8.776360240080844e-05, + "loss": 3.0697, + "step": 19637 + }, + { + "epoch": 0.914309658495705, + "grad_norm": 0.3726038741485415, + "learning_rate": 8.77618270085145e-05, + "loss": 2.8081, + "step": 19638 + }, + { + "epoch": 0.9143562166817981, + "grad_norm": 0.3511089283298, + "learning_rate": 8.776005150539332e-05, + "loss": 2.9901, + "step": 19639 + }, + { + "epoch": 0.9144027748678911, + "grad_norm": 0.43357784981166214, + "learning_rate": 8.775827589145009e-05, + "loss": 2.9688, + "step": 19640 + }, + { + "epoch": 0.9144493330539842, + "grad_norm": 0.3369020573881668, + "learning_rate": 8.775650016669005e-05, + "loss": 2.9019, + "step": 19641 + }, + { + "epoch": 0.9144958912400772, + "grad_norm": 0.40838743756432494, + "learning_rate": 8.775472433111843e-05, + "loss": 2.9715, + "step": 19642 + }, + { + "epoch": 0.9145424494261704, + "grad_norm": 0.32391210576794177, + "learning_rate": 8.775294838474038e-05, + "loss": 2.8645, + "step": 19643 + }, + { + "epoch": 0.9145890076122635, + "grad_norm": 0.4297073886564717, + "learning_rate": 
8.775117232756115e-05, + "loss": 2.9563, + "step": 19644 + }, + { + "epoch": 0.9146355657983565, + "grad_norm": 0.3463594704180932, + "learning_rate": 8.774939615958595e-05, + "loss": 2.9413, + "step": 19645 + }, + { + "epoch": 0.9146821239844496, + "grad_norm": 0.40167215775435183, + "learning_rate": 8.774761988082e-05, + "loss": 2.9413, + "step": 19646 + }, + { + "epoch": 0.9147286821705426, + "grad_norm": 0.338533281136993, + "learning_rate": 8.77458434912685e-05, + "loss": 2.9442, + "step": 19647 + }, + { + "epoch": 0.9147752403566357, + "grad_norm": 0.3897331244222211, + "learning_rate": 8.774406699093668e-05, + "loss": 2.9189, + "step": 19648 + }, + { + "epoch": 0.9148217985427288, + "grad_norm": 0.36757995060672, + "learning_rate": 8.774229037982971e-05, + "loss": 2.8126, + "step": 19649 + }, + { + "epoch": 0.9148683567288218, + "grad_norm": 0.3818967393678631, + "learning_rate": 8.774051365795287e-05, + "loss": 2.9369, + "step": 19650 + }, + { + "epoch": 0.914914914914915, + "grad_norm": 0.3706860342621151, + "learning_rate": 8.77387368253113e-05, + "loss": 2.94, + "step": 19651 + }, + { + "epoch": 0.914961473101008, + "grad_norm": 0.3494351663048918, + "learning_rate": 8.773695988191029e-05, + "loss": 2.8811, + "step": 19652 + }, + { + "epoch": 0.9150080312871011, + "grad_norm": 0.3572756311414374, + "learning_rate": 8.773518282775501e-05, + "loss": 2.8968, + "step": 19653 + }, + { + "epoch": 0.9150545894731942, + "grad_norm": 0.3350738491564264, + "learning_rate": 8.773340566285069e-05, + "loss": 2.8625, + "step": 19654 + }, + { + "epoch": 0.9151011476592872, + "grad_norm": 0.3698960717790305, + "learning_rate": 8.773162838720253e-05, + "loss": 2.9661, + "step": 19655 + }, + { + "epoch": 0.9151477058453803, + "grad_norm": 0.3449230618706265, + "learning_rate": 8.772985100081577e-05, + "loss": 2.9291, + "step": 19656 + }, + { + "epoch": 0.9151942640314733, + "grad_norm": 0.3503246209301846, + "learning_rate": 8.772807350369561e-05, + "loss": 2.9456, + "step": 19657 + }, + { + "epoch": 0.9152408222175664, + "grad_norm": 0.3478933156490333, + "learning_rate": 8.772629589584727e-05, + "loss": 2.8514, + "step": 19658 + }, + { + "epoch": 0.9152873804036594, + "grad_norm": 0.33346297359041877, + "learning_rate": 8.772451817727598e-05, + "loss": 3.0028, + "step": 19659 + }, + { + "epoch": 0.9153339385897525, + "grad_norm": 0.4141860569682541, + "learning_rate": 8.772274034798693e-05, + "loss": 2.9561, + "step": 19660 + }, + { + "epoch": 0.9153804967758457, + "grad_norm": 0.3206542945518453, + "learning_rate": 8.772096240798536e-05, + "loss": 2.9363, + "step": 19661 + }, + { + "epoch": 0.9154270549619387, + "grad_norm": 0.4040443414519824, + "learning_rate": 8.771918435727648e-05, + "loss": 2.85, + "step": 19662 + }, + { + "epoch": 0.9154736131480318, + "grad_norm": 0.37678514263323176, + "learning_rate": 8.77174061958655e-05, + "loss": 2.9493, + "step": 19663 + }, + { + "epoch": 0.9155201713341248, + "grad_norm": 0.34209852779878747, + "learning_rate": 8.771562792375766e-05, + "loss": 2.9308, + "step": 19664 + }, + { + "epoch": 0.9155667295202179, + "grad_norm": 0.380434327343679, + "learning_rate": 8.771384954095818e-05, + "loss": 2.869, + "step": 19665 + }, + { + "epoch": 0.915613287706311, + "grad_norm": 0.33659320467703563, + "learning_rate": 8.771207104747224e-05, + "loss": 2.9529, + "step": 19666 + }, + { + "epoch": 0.915659845892404, + "grad_norm": 0.38039100460996583, + "learning_rate": 8.771029244330511e-05, + "loss": 2.9222, + "step": 19667 + }, + { + "epoch": 
0.9157064040784971, + "grad_norm": 0.3483606102539524, + "learning_rate": 8.770851372846197e-05, + "loss": 2.9115, + "step": 19668 + }, + { + "epoch": 0.9157529622645901, + "grad_norm": 0.3941769183638778, + "learning_rate": 8.770673490294806e-05, + "loss": 2.9358, + "step": 19669 + }, + { + "epoch": 0.9157995204506832, + "grad_norm": 0.335787379148947, + "learning_rate": 8.770495596676861e-05, + "loss": 2.874, + "step": 19670 + }, + { + "epoch": 0.9158460786367764, + "grad_norm": 0.3722807517922217, + "learning_rate": 8.770317691992882e-05, + "loss": 2.9206, + "step": 19671 + }, + { + "epoch": 0.9158926368228694, + "grad_norm": 0.4205343759798838, + "learning_rate": 8.770139776243392e-05, + "loss": 2.9282, + "step": 19672 + }, + { + "epoch": 0.9159391950089625, + "grad_norm": 0.3533269281238482, + "learning_rate": 8.769961849428914e-05, + "loss": 2.8612, + "step": 19673 + }, + { + "epoch": 0.9159857531950555, + "grad_norm": 0.35946566379607847, + "learning_rate": 8.769783911549968e-05, + "loss": 3.0014, + "step": 19674 + }, + { + "epoch": 0.9160323113811486, + "grad_norm": 0.3702610829466029, + "learning_rate": 8.769605962607079e-05, + "loss": 2.9766, + "step": 19675 + }, + { + "epoch": 0.9160788695672416, + "grad_norm": 0.35799828645500664, + "learning_rate": 8.769428002600766e-05, + "loss": 2.8116, + "step": 19676 + }, + { + "epoch": 0.9161254277533347, + "grad_norm": 0.33587073037869, + "learning_rate": 8.769250031531554e-05, + "loss": 2.9227, + "step": 19677 + }, + { + "epoch": 0.9161719859394278, + "grad_norm": 0.3278889914078076, + "learning_rate": 8.769072049399965e-05, + "loss": 2.8322, + "step": 19678 + }, + { + "epoch": 0.9162185441255208, + "grad_norm": 0.3200372587723251, + "learning_rate": 8.768894056206521e-05, + "loss": 2.9499, + "step": 19679 + }, + { + "epoch": 0.916265102311614, + "grad_norm": 0.3367320348048641, + "learning_rate": 8.768716051951743e-05, + "loss": 2.8517, + "step": 19680 + }, + { + "epoch": 0.916311660497707, + "grad_norm": 0.33442403754079647, + "learning_rate": 8.768538036636156e-05, + "loss": 2.9366, + "step": 19681 + }, + { + "epoch": 0.9163582186838001, + "grad_norm": 0.38046128200611795, + "learning_rate": 8.76836001026028e-05, + "loss": 2.9051, + "step": 19682 + }, + { + "epoch": 0.9164047768698932, + "grad_norm": 0.33919509386620733, + "learning_rate": 8.76818197282464e-05, + "loss": 2.7813, + "step": 19683 + }, + { + "epoch": 0.9164513350559862, + "grad_norm": 0.3382523815573481, + "learning_rate": 8.768003924329755e-05, + "loss": 2.8572, + "step": 19684 + }, + { + "epoch": 0.9164978932420793, + "grad_norm": 0.33992156546890545, + "learning_rate": 8.767825864776152e-05, + "loss": 2.9188, + "step": 19685 + }, + { + "epoch": 0.9165444514281723, + "grad_norm": 0.3553580518282621, + "learning_rate": 8.76764779416435e-05, + "loss": 3.0156, + "step": 19686 + }, + { + "epoch": 0.9165910096142654, + "grad_norm": 0.40721991121684, + "learning_rate": 8.767469712494874e-05, + "loss": 2.8749, + "step": 19687 + }, + { + "epoch": 0.9166375678003585, + "grad_norm": 0.379234481492569, + "learning_rate": 8.767291619768246e-05, + "loss": 2.8781, + "step": 19688 + }, + { + "epoch": 0.9166841259864515, + "grad_norm": 0.36455532135481206, + "learning_rate": 8.767113515984986e-05, + "loss": 2.9119, + "step": 19689 + }, + { + "epoch": 0.9167306841725447, + "grad_norm": 0.3880086020499628, + "learning_rate": 8.76693540114562e-05, + "loss": 2.8604, + "step": 19690 + }, + { + "epoch": 0.9167772423586377, + "grad_norm": 0.3496687365853337, + "learning_rate": 
8.76675727525067e-05, + "loss": 2.8714, + "step": 19691 + }, + { + "epoch": 0.9168238005447308, + "grad_norm": 0.43327377850153864, + "learning_rate": 8.766579138300658e-05, + "loss": 2.8325, + "step": 19692 + }, + { + "epoch": 0.9168703587308239, + "grad_norm": 0.32777820579950506, + "learning_rate": 8.766400990296107e-05, + "loss": 2.8791, + "step": 19693 + }, + { + "epoch": 0.9169169169169169, + "grad_norm": 0.39407137455096736, + "learning_rate": 8.766222831237541e-05, + "loss": 2.8553, + "step": 19694 + }, + { + "epoch": 0.91696347510301, + "grad_norm": 0.3536468141762671, + "learning_rate": 8.766044661125481e-05, + "loss": 2.8872, + "step": 19695 + }, + { + "epoch": 0.917010033289103, + "grad_norm": 0.3470469656692871, + "learning_rate": 8.765866479960452e-05, + "loss": 2.9552, + "step": 19696 + }, + { + "epoch": 0.9170565914751961, + "grad_norm": 0.3860690594957931, + "learning_rate": 8.765688287742975e-05, + "loss": 2.9456, + "step": 19697 + }, + { + "epoch": 0.9171031496612891, + "grad_norm": 0.3541488914937058, + "learning_rate": 8.765510084473573e-05, + "loss": 2.8858, + "step": 19698 + }, + { + "epoch": 0.9171497078473823, + "grad_norm": 0.39704366938628555, + "learning_rate": 8.765331870152773e-05, + "loss": 2.8266, + "step": 19699 + }, + { + "epoch": 0.9171962660334754, + "grad_norm": 0.3485150896484875, + "learning_rate": 8.765153644781093e-05, + "loss": 2.8456, + "step": 19700 + }, + { + "epoch": 0.9172428242195684, + "grad_norm": 0.3573733695395185, + "learning_rate": 8.764975408359057e-05, + "loss": 2.9165, + "step": 19701 + }, + { + "epoch": 0.9172893824056615, + "grad_norm": 0.34095225811810875, + "learning_rate": 8.76479716088719e-05, + "loss": 2.8918, + "step": 19702 + }, + { + "epoch": 0.9173359405917545, + "grad_norm": 0.36234451055099676, + "learning_rate": 8.764618902366015e-05, + "loss": 2.7267, + "step": 19703 + }, + { + "epoch": 0.9173824987778476, + "grad_norm": 0.3449432593104461, + "learning_rate": 8.764440632796054e-05, + "loss": 2.8542, + "step": 19704 + }, + { + "epoch": 0.9174290569639407, + "grad_norm": 0.3553360129416965, + "learning_rate": 8.76426235217783e-05, + "loss": 2.8596, + "step": 19705 + }, + { + "epoch": 0.9174756151500337, + "grad_norm": 0.37804594920212337, + "learning_rate": 8.764084060511868e-05, + "loss": 2.9812, + "step": 19706 + }, + { + "epoch": 0.9175221733361268, + "grad_norm": 0.33450835538753915, + "learning_rate": 8.76390575779869e-05, + "loss": 2.9127, + "step": 19707 + }, + { + "epoch": 0.9175687315222198, + "grad_norm": 0.3491162606781341, + "learning_rate": 8.763727444038818e-05, + "loss": 2.9356, + "step": 19708 + }, + { + "epoch": 0.917615289708313, + "grad_norm": 0.33884275659785645, + "learning_rate": 8.763549119232776e-05, + "loss": 2.8328, + "step": 19709 + }, + { + "epoch": 0.9176618478944061, + "grad_norm": 0.32668714398452436, + "learning_rate": 8.763370783381091e-05, + "loss": 2.8073, + "step": 19710 + }, + { + "epoch": 0.9177084060804991, + "grad_norm": 0.3585799003024075, + "learning_rate": 8.763192436484282e-05, + "loss": 2.8956, + "step": 19711 + }, + { + "epoch": 0.9177549642665922, + "grad_norm": 0.34284212650602375, + "learning_rate": 8.763014078542874e-05, + "loss": 2.877, + "step": 19712 + }, + { + "epoch": 0.9178015224526852, + "grad_norm": 0.3816643718297743, + "learning_rate": 8.76283570955739e-05, + "loss": 2.9989, + "step": 19713 + }, + { + "epoch": 0.9178480806387783, + "grad_norm": 0.37260113982092696, + "learning_rate": 8.762657329528354e-05, + "loss": 2.9491, + "step": 19714 + }, + { + "epoch": 
0.9178946388248714, + "grad_norm": 0.3216950513504673, + "learning_rate": 8.762478938456291e-05, + "loss": 2.8551, + "step": 19715 + }, + { + "epoch": 0.9179411970109644, + "grad_norm": 0.3537251632312025, + "learning_rate": 8.762300536341721e-05, + "loss": 2.9258, + "step": 19716 + }, + { + "epoch": 0.9179877551970576, + "grad_norm": 0.32771875348960877, + "learning_rate": 8.76212212318517e-05, + "loss": 2.8821, + "step": 19717 + }, + { + "epoch": 0.9180343133831506, + "grad_norm": 0.3371546722957396, + "learning_rate": 8.761943698987162e-05, + "loss": 2.8648, + "step": 19718 + }, + { + "epoch": 0.9180808715692437, + "grad_norm": 0.34326772806529093, + "learning_rate": 8.76176526374822e-05, + "loss": 2.8157, + "step": 19719 + }, + { + "epoch": 0.9181274297553367, + "grad_norm": 0.38721650727881635, + "learning_rate": 8.761586817468865e-05, + "loss": 2.889, + "step": 19720 + }, + { + "epoch": 0.9181739879414298, + "grad_norm": 0.34393297505032905, + "learning_rate": 8.761408360149625e-05, + "loss": 2.9285, + "step": 19721 + }, + { + "epoch": 0.9182205461275229, + "grad_norm": 0.3855820722873036, + "learning_rate": 8.761229891791023e-05, + "loss": 2.8052, + "step": 19722 + }, + { + "epoch": 0.9182671043136159, + "grad_norm": 0.35063014661263336, + "learning_rate": 8.761051412393582e-05, + "loss": 2.871, + "step": 19723 + }, + { + "epoch": 0.918313662499709, + "grad_norm": 0.34309800945605157, + "learning_rate": 8.760872921957824e-05, + "loss": 2.8811, + "step": 19724 + }, + { + "epoch": 0.918360220685802, + "grad_norm": 0.3637983892973795, + "learning_rate": 8.760694420484274e-05, + "loss": 2.7601, + "step": 19725 + }, + { + "epoch": 0.9184067788718951, + "grad_norm": 0.33324786519174276, + "learning_rate": 8.760515907973459e-05, + "loss": 2.8731, + "step": 19726 + }, + { + "epoch": 0.9184533370579883, + "grad_norm": 0.36285262265631046, + "learning_rate": 8.760337384425898e-05, + "loss": 2.8829, + "step": 19727 + }, + { + "epoch": 0.9184998952440813, + "grad_norm": 0.35030625431539486, + "learning_rate": 8.760158849842118e-05, + "loss": 2.755, + "step": 19728 + }, + { + "epoch": 0.9185464534301744, + "grad_norm": 0.4153387717033077, + "learning_rate": 8.759980304222644e-05, + "loss": 3.054, + "step": 19729 + }, + { + "epoch": 0.9185930116162674, + "grad_norm": 0.35586488273747924, + "learning_rate": 8.759801747567994e-05, + "loss": 2.8763, + "step": 19730 + }, + { + "epoch": 0.9186395698023605, + "grad_norm": 0.3818193017083589, + "learning_rate": 8.7596231798787e-05, + "loss": 2.8027, + "step": 19731 + }, + { + "epoch": 0.9186861279884536, + "grad_norm": 0.3729381931671581, + "learning_rate": 8.75944460115528e-05, + "loss": 2.9159, + "step": 19732 + }, + { + "epoch": 0.9187326861745466, + "grad_norm": 0.38086592476338543, + "learning_rate": 8.759266011398261e-05, + "loss": 2.8573, + "step": 19733 + }, + { + "epoch": 0.9187792443606397, + "grad_norm": 0.35763017616543274, + "learning_rate": 8.759087410608169e-05, + "loss": 2.8383, + "step": 19734 + }, + { + "epoch": 0.9188258025467327, + "grad_norm": 0.35303710827013357, + "learning_rate": 8.758908798785523e-05, + "loss": 2.8557, + "step": 19735 + }, + { + "epoch": 0.9188723607328259, + "grad_norm": 0.339790347054558, + "learning_rate": 8.758730175930851e-05, + "loss": 2.858, + "step": 19736 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 0.34297530635628387, + "learning_rate": 8.758551542044677e-05, + "loss": 2.9342, + "step": 19737 + }, + { + "epoch": 0.918965477105012, + "grad_norm": 0.365656139216942, + "learning_rate": 
8.758372897127522e-05, + "loss": 3.0074, + "step": 19738 + }, + { + "epoch": 0.9190120352911051, + "grad_norm": 0.3436910343199266, + "learning_rate": 8.758194241179915e-05, + "loss": 3.0379, + "step": 19739 + }, + { + "epoch": 0.9190585934771981, + "grad_norm": 0.6034170360755037, + "learning_rate": 8.758015574202378e-05, + "loss": 2.9179, + "step": 19740 + }, + { + "epoch": 0.9191051516632912, + "grad_norm": 0.35336280342808296, + "learning_rate": 8.757836896195435e-05, + "loss": 2.8576, + "step": 19741 + }, + { + "epoch": 0.9191517098493842, + "grad_norm": 0.37657061087804616, + "learning_rate": 8.757658207159611e-05, + "loss": 2.9424, + "step": 19742 + }, + { + "epoch": 0.9191982680354773, + "grad_norm": 0.34818509861203095, + "learning_rate": 8.75747950709543e-05, + "loss": 2.8949, + "step": 19743 + }, + { + "epoch": 0.9192448262215704, + "grad_norm": 0.3763740352482326, + "learning_rate": 8.757300796003419e-05, + "loss": 2.9391, + "step": 19744 + }, + { + "epoch": 0.9192913844076634, + "grad_norm": 0.35033001212876375, + "learning_rate": 8.757122073884096e-05, + "loss": 2.8581, + "step": 19745 + }, + { + "epoch": 0.9193379425937566, + "grad_norm": 0.39613505104781654, + "learning_rate": 8.756943340737993e-05, + "loss": 2.9517, + "step": 19746 + }, + { + "epoch": 0.9193845007798496, + "grad_norm": 0.3647739244251373, + "learning_rate": 8.756764596565631e-05, + "loss": 2.8787, + "step": 19747 + }, + { + "epoch": 0.9194310589659427, + "grad_norm": 0.3826359757918581, + "learning_rate": 8.756585841367535e-05, + "loss": 2.8657, + "step": 19748 + }, + { + "epoch": 0.9194776171520358, + "grad_norm": 0.35610734324990256, + "learning_rate": 8.756407075144229e-05, + "loss": 2.8032, + "step": 19749 + }, + { + "epoch": 0.9195241753381288, + "grad_norm": 0.38615886171868763, + "learning_rate": 8.756228297896238e-05, + "loss": 2.9072, + "step": 19750 + }, + { + "epoch": 0.9195707335242219, + "grad_norm": 0.36309675653176177, + "learning_rate": 8.75604950962409e-05, + "loss": 2.8866, + "step": 19751 + }, + { + "epoch": 0.9196172917103149, + "grad_norm": 0.33747245054409325, + "learning_rate": 8.755870710328304e-05, + "loss": 2.8609, + "step": 19752 + }, + { + "epoch": 0.919663849896408, + "grad_norm": 0.38395026350017686, + "learning_rate": 8.755691900009407e-05, + "loss": 2.9044, + "step": 19753 + }, + { + "epoch": 0.9197104080825012, + "grad_norm": 0.3087926376826967, + "learning_rate": 8.755513078667925e-05, + "loss": 2.8693, + "step": 19754 + }, + { + "epoch": 0.9197569662685942, + "grad_norm": 0.33708164815004776, + "learning_rate": 8.755334246304382e-05, + "loss": 2.8634, + "step": 19755 + }, + { + "epoch": 0.9198035244546873, + "grad_norm": 0.3132633531313281, + "learning_rate": 8.755155402919304e-05, + "loss": 2.814, + "step": 19756 + }, + { + "epoch": 0.9198500826407803, + "grad_norm": 0.346941831616698, + "learning_rate": 8.754976548513213e-05, + "loss": 2.8453, + "step": 19757 + }, + { + "epoch": 0.9198966408268734, + "grad_norm": 0.3261257132630487, + "learning_rate": 8.754797683086638e-05, + "loss": 2.881, + "step": 19758 + }, + { + "epoch": 0.9199431990129665, + "grad_norm": 0.34928211988828023, + "learning_rate": 8.7546188066401e-05, + "loss": 2.8975, + "step": 19759 + }, + { + "epoch": 0.9199897571990595, + "grad_norm": 0.3380065510561616, + "learning_rate": 8.754439919174127e-05, + "loss": 2.8861, + "step": 19760 + }, + { + "epoch": 0.9200363153851526, + "grad_norm": 0.34686281328169194, + "learning_rate": 8.754261020689242e-05, + "loss": 2.9947, + "step": 19761 + }, + { + "epoch": 
0.9200828735712456, + "grad_norm": 0.2969122457076739, + "learning_rate": 8.75408211118597e-05, + "loss": 2.7552, + "step": 19762 + }, + { + "epoch": 0.9201294317573387, + "grad_norm": 0.36749003344242454, + "learning_rate": 8.753903190664839e-05, + "loss": 2.8298, + "step": 19763 + }, + { + "epoch": 0.9201759899434317, + "grad_norm": 0.3303871562846786, + "learning_rate": 8.75372425912637e-05, + "loss": 2.8695, + "step": 19764 + }, + { + "epoch": 0.9202225481295249, + "grad_norm": 0.3447680313501446, + "learning_rate": 8.75354531657109e-05, + "loss": 2.8491, + "step": 19765 + }, + { + "epoch": 0.920269106315618, + "grad_norm": 0.3276865445240301, + "learning_rate": 8.753366362999524e-05, + "loss": 2.9622, + "step": 19766 + }, + { + "epoch": 0.920315664501711, + "grad_norm": 0.35035577569543636, + "learning_rate": 8.7531873984122e-05, + "loss": 2.9157, + "step": 19767 + }, + { + "epoch": 0.9203622226878041, + "grad_norm": 0.35740595832353383, + "learning_rate": 8.753008422809639e-05, + "loss": 2.8758, + "step": 19768 + }, + { + "epoch": 0.9204087808738971, + "grad_norm": 0.3432028211545593, + "learning_rate": 8.752829436192366e-05, + "loss": 2.9647, + "step": 19769 + }, + { + "epoch": 0.9204553390599902, + "grad_norm": 0.40465037478974564, + "learning_rate": 8.752650438560911e-05, + "loss": 2.8507, + "step": 19770 + }, + { + "epoch": 0.9205018972460833, + "grad_norm": 0.3513810046250105, + "learning_rate": 8.752471429915797e-05, + "loss": 2.841, + "step": 19771 + }, + { + "epoch": 0.9205484554321763, + "grad_norm": 0.38864641167452224, + "learning_rate": 8.752292410257547e-05, + "loss": 2.9926, + "step": 19772 + }, + { + "epoch": 0.9205950136182695, + "grad_norm": 0.3837672077589757, + "learning_rate": 8.75211337958669e-05, + "loss": 2.9305, + "step": 19773 + }, + { + "epoch": 0.9206415718043625, + "grad_norm": 0.35221481415745975, + "learning_rate": 8.751934337903751e-05, + "loss": 2.8627, + "step": 19774 + }, + { + "epoch": 0.9206881299904556, + "grad_norm": 0.3559215860509168, + "learning_rate": 8.751755285209252e-05, + "loss": 2.9459, + "step": 19775 + }, + { + "epoch": 0.9207346881765487, + "grad_norm": 0.3198275309723119, + "learning_rate": 8.751576221503721e-05, + "loss": 2.8583, + "step": 19776 + }, + { + "epoch": 0.9207812463626417, + "grad_norm": 0.3556512720972805, + "learning_rate": 8.751397146787685e-05, + "loss": 2.983, + "step": 19777 + }, + { + "epoch": 0.9208278045487348, + "grad_norm": 0.3340688190215309, + "learning_rate": 8.751218061061667e-05, + "loss": 2.8706, + "step": 19778 + }, + { + "epoch": 0.9208743627348278, + "grad_norm": 0.32002779824845917, + "learning_rate": 8.751038964326193e-05, + "loss": 2.9173, + "step": 19779 + }, + { + "epoch": 0.9209209209209209, + "grad_norm": 0.33843311814298005, + "learning_rate": 8.75085985658179e-05, + "loss": 2.8828, + "step": 19780 + }, + { + "epoch": 0.920967479107014, + "grad_norm": 0.3194139107150948, + "learning_rate": 8.750680737828982e-05, + "loss": 2.9027, + "step": 19781 + }, + { + "epoch": 0.921014037293107, + "grad_norm": 0.31826423252678226, + "learning_rate": 8.750501608068295e-05, + "loss": 2.887, + "step": 19782 + }, + { + "epoch": 0.9210605954792002, + "grad_norm": 0.29536422672666934, + "learning_rate": 8.750322467300258e-05, + "loss": 2.8062, + "step": 19783 + }, + { + "epoch": 0.9211071536652932, + "grad_norm": 0.3119795220113545, + "learning_rate": 8.750143315525392e-05, + "loss": 2.9067, + "step": 19784 + }, + { + "epoch": 0.9211537118513863, + "grad_norm": 0.3284648200679793, + "learning_rate": 
8.749964152744225e-05, + "loss": 2.9739, + "step": 19785 + }, + { + "epoch": 0.9212002700374793, + "grad_norm": 0.32163495458795127, + "learning_rate": 8.749784978957282e-05, + "loss": 2.8814, + "step": 19786 + }, + { + "epoch": 0.9212468282235724, + "grad_norm": 0.31870879995347007, + "learning_rate": 8.749605794165093e-05, + "loss": 2.9274, + "step": 19787 + }, + { + "epoch": 0.9212933864096655, + "grad_norm": 0.32944429657071056, + "learning_rate": 8.749426598368176e-05, + "loss": 2.9297, + "step": 19788 + }, + { + "epoch": 0.9213399445957585, + "grad_norm": 0.3552042658366569, + "learning_rate": 8.749247391567063e-05, + "loss": 2.9607, + "step": 19789 + }, + { + "epoch": 0.9213865027818516, + "grad_norm": 0.3339602420377077, + "learning_rate": 8.749068173762278e-05, + "loss": 2.9569, + "step": 19790 + }, + { + "epoch": 0.9214330609679446, + "grad_norm": 0.3470770191915479, + "learning_rate": 8.748888944954348e-05, + "loss": 2.8949, + "step": 19791 + }, + { + "epoch": 0.9214796191540378, + "grad_norm": 0.346344839957327, + "learning_rate": 8.748709705143798e-05, + "loss": 2.9132, + "step": 19792 + }, + { + "epoch": 0.9215261773401309, + "grad_norm": 0.34569084544896067, + "learning_rate": 8.748530454331154e-05, + "loss": 2.9775, + "step": 19793 + }, + { + "epoch": 0.9215727355262239, + "grad_norm": 0.3584300486628257, + "learning_rate": 8.748351192516944e-05, + "loss": 2.9645, + "step": 19794 + }, + { + "epoch": 0.921619293712317, + "grad_norm": 0.3319053219941363, + "learning_rate": 8.74817191970169e-05, + "loss": 2.8832, + "step": 19795 + }, + { + "epoch": 0.92166585189841, + "grad_norm": 0.3421298928764844, + "learning_rate": 8.747992635885922e-05, + "loss": 2.8552, + "step": 19796 + }, + { + "epoch": 0.9217124100845031, + "grad_norm": 0.33532048874884895, + "learning_rate": 8.747813341070164e-05, + "loss": 2.9167, + "step": 19797 + }, + { + "epoch": 0.9217589682705962, + "grad_norm": 0.36279974519712865, + "learning_rate": 8.747634035254944e-05, + "loss": 2.8726, + "step": 19798 + }, + { + "epoch": 0.9218055264566892, + "grad_norm": 0.3468898150655161, + "learning_rate": 8.747454718440787e-05, + "loss": 2.8785, + "step": 19799 + }, + { + "epoch": 0.9218520846427823, + "grad_norm": 0.3782398984991754, + "learning_rate": 8.74727539062822e-05, + "loss": 2.9737, + "step": 19800 + }, + { + "epoch": 0.9218986428288753, + "grad_norm": 0.326323088578412, + "learning_rate": 8.747096051817768e-05, + "loss": 2.8275, + "step": 19801 + }, + { + "epoch": 0.9219452010149685, + "grad_norm": 0.39906662825790046, + "learning_rate": 8.746916702009959e-05, + "loss": 2.9215, + "step": 19802 + }, + { + "epoch": 0.9219917592010616, + "grad_norm": 0.3216458684690879, + "learning_rate": 8.746737341205317e-05, + "loss": 2.9019, + "step": 19803 + }, + { + "epoch": 0.9220383173871546, + "grad_norm": 0.36719600643965183, + "learning_rate": 8.746557969404372e-05, + "loss": 2.9521, + "step": 19804 + }, + { + "epoch": 0.9220848755732477, + "grad_norm": 0.3462059823460692, + "learning_rate": 8.746378586607649e-05, + "loss": 2.9105, + "step": 19805 + }, + { + "epoch": 0.9221314337593407, + "grad_norm": 0.35665360462641205, + "learning_rate": 8.746199192815671e-05, + "loss": 2.97, + "step": 19806 + }, + { + "epoch": 0.9221779919454338, + "grad_norm": 0.32853011516896247, + "learning_rate": 8.74601978802897e-05, + "loss": 2.8521, + "step": 19807 + }, + { + "epoch": 0.9222245501315268, + "grad_norm": 0.365501008846899, + "learning_rate": 8.745840372248069e-05, + "loss": 2.9423, + "step": 19808 + }, + { + "epoch": 
0.9222711083176199, + "grad_norm": 0.3250916694515744, + "learning_rate": 8.745660945473495e-05, + "loss": 2.8973, + "step": 19809 + }, + { + "epoch": 0.922317666503713, + "grad_norm": 0.3718884121400129, + "learning_rate": 8.745481507705777e-05, + "loss": 2.7938, + "step": 19810 + }, + { + "epoch": 0.9223642246898061, + "grad_norm": 0.3502731295004018, + "learning_rate": 8.745302058945438e-05, + "loss": 2.8999, + "step": 19811 + }, + { + "epoch": 0.9224107828758992, + "grad_norm": 0.3947357535036707, + "learning_rate": 8.745122599193007e-05, + "loss": 2.8823, + "step": 19812 + }, + { + "epoch": 0.9224573410619922, + "grad_norm": 0.38977436843638336, + "learning_rate": 8.74494312844901e-05, + "loss": 3.0081, + "step": 19813 + }, + { + "epoch": 0.9225038992480853, + "grad_norm": 0.4243940888946928, + "learning_rate": 8.744763646713975e-05, + "loss": 2.8781, + "step": 19814 + }, + { + "epoch": 0.9225504574341784, + "grad_norm": 0.3527737820734378, + "learning_rate": 8.744584153988426e-05, + "loss": 2.7956, + "step": 19815 + }, + { + "epoch": 0.9225970156202714, + "grad_norm": 0.38776804388437136, + "learning_rate": 8.744404650272893e-05, + "loss": 3.0338, + "step": 19816 + }, + { + "epoch": 0.9226435738063645, + "grad_norm": 0.35029483050359195, + "learning_rate": 8.744225135567898e-05, + "loss": 2.8777, + "step": 19817 + }, + { + "epoch": 0.9226901319924575, + "grad_norm": 0.38331865447025154, + "learning_rate": 8.744045609873974e-05, + "loss": 2.9271, + "step": 19818 + }, + { + "epoch": 0.9227366901785506, + "grad_norm": 0.382302828076587, + "learning_rate": 8.743866073191644e-05, + "loss": 2.8243, + "step": 19819 + }, + { + "epoch": 0.9227832483646438, + "grad_norm": 0.36417402490213396, + "learning_rate": 8.743686525521437e-05, + "loss": 2.9689, + "step": 19820 + }, + { + "epoch": 0.9228298065507368, + "grad_norm": 0.4139380338430553, + "learning_rate": 8.74350696686388e-05, + "loss": 3.0036, + "step": 19821 + }, + { + "epoch": 0.9228763647368299, + "grad_norm": 0.34324832226902185, + "learning_rate": 8.743327397219495e-05, + "loss": 2.8915, + "step": 19822 + }, + { + "epoch": 0.9229229229229229, + "grad_norm": 0.37642605187205297, + "learning_rate": 8.743147816588815e-05, + "loss": 2.7855, + "step": 19823 + }, + { + "epoch": 0.922969481109016, + "grad_norm": 0.3487280472933075, + "learning_rate": 8.742968224972365e-05, + "loss": 2.8917, + "step": 19824 + }, + { + "epoch": 0.9230160392951091, + "grad_norm": 0.41255450557728185, + "learning_rate": 8.742788622370673e-05, + "loss": 3.0275, + "step": 19825 + }, + { + "epoch": 0.9230625974812021, + "grad_norm": 0.3583148944694828, + "learning_rate": 8.742609008784262e-05, + "loss": 2.9093, + "step": 19826 + }, + { + "epoch": 0.9231091556672952, + "grad_norm": 0.35754458784345766, + "learning_rate": 8.742429384213664e-05, + "loss": 2.8603, + "step": 19827 + }, + { + "epoch": 0.9231557138533882, + "grad_norm": 0.3419844681990484, + "learning_rate": 8.742249748659405e-05, + "loss": 2.9336, + "step": 19828 + }, + { + "epoch": 0.9232022720394814, + "grad_norm": 0.33834425296231707, + "learning_rate": 8.74207010212201e-05, + "loss": 2.9095, + "step": 19829 + }, + { + "epoch": 0.9232488302255744, + "grad_norm": 0.33539607436759306, + "learning_rate": 8.74189044460201e-05, + "loss": 2.9224, + "step": 19830 + }, + { + "epoch": 0.9232953884116675, + "grad_norm": 0.36395188543196383, + "learning_rate": 8.74171077609993e-05, + "loss": 2.976, + "step": 19831 + }, + { + "epoch": 0.9233419465977606, + "grad_norm": 0.36443228664860383, + "learning_rate": 
8.741531096616296e-05, + "loss": 2.9234, + "step": 19832 + }, + { + "epoch": 0.9233885047838536, + "grad_norm": 0.32586994970034705, + "learning_rate": 8.741351406151639e-05, + "loss": 2.8613, + "step": 19833 + }, + { + "epoch": 0.9234350629699467, + "grad_norm": 0.3594133564966571, + "learning_rate": 8.741171704706483e-05, + "loss": 2.8718, + "step": 19834 + }, + { + "epoch": 0.9234816211560397, + "grad_norm": 0.34243497544426726, + "learning_rate": 8.740991992281356e-05, + "loss": 2.8876, + "step": 19835 + }, + { + "epoch": 0.9235281793421328, + "grad_norm": 0.37304157703324514, + "learning_rate": 8.740812268876785e-05, + "loss": 2.8933, + "step": 19836 + }, + { + "epoch": 0.9235747375282259, + "grad_norm": 0.31580278519220173, + "learning_rate": 8.740632534493301e-05, + "loss": 2.924, + "step": 19837 + }, + { + "epoch": 0.923621295714319, + "grad_norm": 0.3259706404642785, + "learning_rate": 8.740452789131429e-05, + "loss": 2.8682, + "step": 19838 + }, + { + "epoch": 0.9236678539004121, + "grad_norm": 0.3100665786812331, + "learning_rate": 8.740273032791694e-05, + "loss": 2.887, + "step": 19839 + }, + { + "epoch": 0.9237144120865051, + "grad_norm": 0.3347623532461365, + "learning_rate": 8.740093265474629e-05, + "loss": 2.9121, + "step": 19840 + }, + { + "epoch": 0.9237609702725982, + "grad_norm": 0.33225047791126705, + "learning_rate": 8.739913487180758e-05, + "loss": 2.9487, + "step": 19841 + }, + { + "epoch": 0.9238075284586913, + "grad_norm": 0.30742545185985787, + "learning_rate": 8.739733697910608e-05, + "loss": 2.8451, + "step": 19842 + }, + { + "epoch": 0.9238540866447843, + "grad_norm": 0.3360861722927857, + "learning_rate": 8.739553897664709e-05, + "loss": 2.9486, + "step": 19843 + }, + { + "epoch": 0.9239006448308774, + "grad_norm": 0.30886612025599125, + "learning_rate": 8.739374086443588e-05, + "loss": 2.8688, + "step": 19844 + }, + { + "epoch": 0.9239472030169704, + "grad_norm": 0.3347220308329401, + "learning_rate": 8.739194264247772e-05, + "loss": 2.8821, + "step": 19845 + }, + { + "epoch": 0.9239937612030635, + "grad_norm": 0.34352058623603177, + "learning_rate": 8.739014431077789e-05, + "loss": 2.9325, + "step": 19846 + }, + { + "epoch": 0.9240403193891567, + "grad_norm": 0.33431729907714397, + "learning_rate": 8.738834586934166e-05, + "loss": 2.8057, + "step": 19847 + }, + { + "epoch": 0.9240868775752497, + "grad_norm": 0.31237619473272676, + "learning_rate": 8.738654731817434e-05, + "loss": 2.8224, + "step": 19848 + }, + { + "epoch": 0.9241334357613428, + "grad_norm": 0.32288163193062414, + "learning_rate": 8.738474865728116e-05, + "loss": 2.9458, + "step": 19849 + }, + { + "epoch": 0.9241799939474358, + "grad_norm": 0.33323162236999615, + "learning_rate": 8.738294988666746e-05, + "loss": 2.9809, + "step": 19850 + }, + { + "epoch": 0.9242265521335289, + "grad_norm": 0.32223205238227715, + "learning_rate": 8.738115100633846e-05, + "loss": 2.8391, + "step": 19851 + }, + { + "epoch": 0.9242731103196219, + "grad_norm": 0.3043087797427197, + "learning_rate": 8.737935201629947e-05, + "loss": 2.7718, + "step": 19852 + }, + { + "epoch": 0.924319668505715, + "grad_norm": 0.32469368127981807, + "learning_rate": 8.737755291655579e-05, + "loss": 2.8585, + "step": 19853 + }, + { + "epoch": 0.9243662266918081, + "grad_norm": 0.33702418633400305, + "learning_rate": 8.737575370711264e-05, + "loss": 2.8982, + "step": 19854 + }, + { + "epoch": 0.9244127848779011, + "grad_norm": 0.34165026377816016, + "learning_rate": 8.737395438797534e-05, + "loss": 2.8784, + "step": 19855 + }, + { + 
"epoch": 0.9244593430639942, + "grad_norm": 0.3368433112821744, + "learning_rate": 8.737215495914918e-05, + "loss": 2.9145, + "step": 19856 + }, + { + "epoch": 0.9245059012500872, + "grad_norm": 0.37956201157966185, + "learning_rate": 8.737035542063941e-05, + "loss": 2.964, + "step": 19857 + }, + { + "epoch": 0.9245524594361804, + "grad_norm": 0.34036415109994356, + "learning_rate": 8.736855577245133e-05, + "loss": 2.9269, + "step": 19858 + }, + { + "epoch": 0.9245990176222735, + "grad_norm": 0.3677025491732408, + "learning_rate": 8.736675601459024e-05, + "loss": 2.9056, + "step": 19859 + }, + { + "epoch": 0.9246455758083665, + "grad_norm": 0.39070206699405313, + "learning_rate": 8.73649561470614e-05, + "loss": 2.8305, + "step": 19860 + }, + { + "epoch": 0.9246921339944596, + "grad_norm": 0.34187907037940374, + "learning_rate": 8.736315616987007e-05, + "loss": 2.9337, + "step": 19861 + }, + { + "epoch": 0.9247386921805526, + "grad_norm": 0.3655351424756477, + "learning_rate": 8.736135608302157e-05, + "loss": 2.8438, + "step": 19862 + }, + { + "epoch": 0.9247852503666457, + "grad_norm": 0.31244880393699753, + "learning_rate": 8.735955588652118e-05, + "loss": 2.8563, + "step": 19863 + }, + { + "epoch": 0.9248318085527388, + "grad_norm": 0.3444887365451807, + "learning_rate": 8.735775558037417e-05, + "loss": 2.9022, + "step": 19864 + }, + { + "epoch": 0.9248783667388318, + "grad_norm": 0.3337301871036466, + "learning_rate": 8.735595516458582e-05, + "loss": 2.8377, + "step": 19865 + }, + { + "epoch": 0.924924924924925, + "grad_norm": 0.3778523963403632, + "learning_rate": 8.735415463916144e-05, + "loss": 2.9599, + "step": 19866 + }, + { + "epoch": 0.924971483111018, + "grad_norm": 0.324851581552077, + "learning_rate": 8.735235400410626e-05, + "loss": 2.886, + "step": 19867 + }, + { + "epoch": 0.9250180412971111, + "grad_norm": 0.3612885417208605, + "learning_rate": 8.735055325942563e-05, + "loss": 2.914, + "step": 19868 + }, + { + "epoch": 0.9250645994832042, + "grad_norm": 0.3558536439334163, + "learning_rate": 8.73487524051248e-05, + "loss": 2.958, + "step": 19869 + }, + { + "epoch": 0.9251111576692972, + "grad_norm": 0.31846551910921467, + "learning_rate": 8.734695144120906e-05, + "loss": 2.8253, + "step": 19870 + }, + { + "epoch": 0.9251577158553903, + "grad_norm": 0.3582542331856008, + "learning_rate": 8.734515036768369e-05, + "loss": 2.9068, + "step": 19871 + }, + { + "epoch": 0.9252042740414833, + "grad_norm": 0.3842433769520491, + "learning_rate": 8.7343349184554e-05, + "loss": 2.8716, + "step": 19872 + }, + { + "epoch": 0.9252508322275764, + "grad_norm": 0.33181606404126796, + "learning_rate": 8.734154789182525e-05, + "loss": 2.8314, + "step": 19873 + }, + { + "epoch": 0.9252973904136694, + "grad_norm": 0.3564998739361829, + "learning_rate": 8.733974648950272e-05, + "loss": 2.8855, + "step": 19874 + }, + { + "epoch": 0.9253439485997625, + "grad_norm": 0.4000035679536482, + "learning_rate": 8.733794497759174e-05, + "loss": 2.9522, + "step": 19875 + }, + { + "epoch": 0.9253905067858557, + "grad_norm": 0.3608456379854846, + "learning_rate": 8.733614335609756e-05, + "loss": 2.932, + "step": 19876 + }, + { + "epoch": 0.9254370649719487, + "grad_norm": 0.3891882209339116, + "learning_rate": 8.733434162502547e-05, + "loss": 2.9347, + "step": 19877 + }, + { + "epoch": 0.9254836231580418, + "grad_norm": 0.380082091439634, + "learning_rate": 8.733253978438076e-05, + "loss": 2.7461, + "step": 19878 + }, + { + "epoch": 0.9255301813441348, + "grad_norm": 0.3251161458714399, + "learning_rate": 
8.733073783416874e-05, + "loss": 2.8692, + "step": 19879 + }, + { + "epoch": 0.9255767395302279, + "grad_norm": 0.3584952087212676, + "learning_rate": 8.732893577439467e-05, + "loss": 2.9085, + "step": 19880 + }, + { + "epoch": 0.925623297716321, + "grad_norm": 0.34256604610743363, + "learning_rate": 8.732713360506386e-05, + "loss": 2.964, + "step": 19881 + }, + { + "epoch": 0.925669855902414, + "grad_norm": 0.3803977880264625, + "learning_rate": 8.732533132618159e-05, + "loss": 2.8599, + "step": 19882 + }, + { + "epoch": 0.9257164140885071, + "grad_norm": 0.3518117414218958, + "learning_rate": 8.732352893775315e-05, + "loss": 2.9073, + "step": 19883 + }, + { + "epoch": 0.9257629722746001, + "grad_norm": 0.3234197670485856, + "learning_rate": 8.732172643978383e-05, + "loss": 2.8074, + "step": 19884 + }, + { + "epoch": 0.9258095304606933, + "grad_norm": 0.3167785862888167, + "learning_rate": 8.731992383227891e-05, + "loss": 2.9549, + "step": 19885 + }, + { + "epoch": 0.9258560886467864, + "grad_norm": 0.2923472502792617, + "learning_rate": 8.73181211152437e-05, + "loss": 2.7942, + "step": 19886 + }, + { + "epoch": 0.9259026468328794, + "grad_norm": 0.3346833529486563, + "learning_rate": 8.731631828868347e-05, + "loss": 2.8423, + "step": 19887 + }, + { + "epoch": 0.9259492050189725, + "grad_norm": 0.3335761679869998, + "learning_rate": 8.731451535260353e-05, + "loss": 2.9455, + "step": 19888 + }, + { + "epoch": 0.9259957632050655, + "grad_norm": 0.32261852444423045, + "learning_rate": 8.731271230700917e-05, + "loss": 2.7921, + "step": 19889 + }, + { + "epoch": 0.9260423213911586, + "grad_norm": 0.3394064439042484, + "learning_rate": 8.731090915190566e-05, + "loss": 2.8505, + "step": 19890 + }, + { + "epoch": 0.9260888795772517, + "grad_norm": 0.3116370007519157, + "learning_rate": 8.730910588729832e-05, + "loss": 2.9123, + "step": 19891 + }, + { + "epoch": 0.9261354377633447, + "grad_norm": 0.34383881871879407, + "learning_rate": 8.730730251319243e-05, + "loss": 2.8497, + "step": 19892 + }, + { + "epoch": 0.9261819959494378, + "grad_norm": 0.3163163530433326, + "learning_rate": 8.730549902959327e-05, + "loss": 2.8467, + "step": 19893 + }, + { + "epoch": 0.9262285541355308, + "grad_norm": 0.3536923827463133, + "learning_rate": 8.730369543650617e-05, + "loss": 2.9556, + "step": 19894 + }, + { + "epoch": 0.926275112321624, + "grad_norm": 0.29994552927988166, + "learning_rate": 8.730189173393636e-05, + "loss": 2.8277, + "step": 19895 + }, + { + "epoch": 0.926321670507717, + "grad_norm": 0.3490794174155039, + "learning_rate": 8.730008792188919e-05, + "loss": 2.9154, + "step": 19896 + }, + { + "epoch": 0.9263682286938101, + "grad_norm": 0.31526208858564564, + "learning_rate": 8.729828400036995e-05, + "loss": 2.919, + "step": 19897 + }, + { + "epoch": 0.9264147868799032, + "grad_norm": 0.3267456658072667, + "learning_rate": 8.729647996938391e-05, + "loss": 2.9215, + "step": 19898 + }, + { + "epoch": 0.9264613450659962, + "grad_norm": 0.33099266644311803, + "learning_rate": 8.729467582893637e-05, + "loss": 2.9433, + "step": 19899 + }, + { + "epoch": 0.9265079032520893, + "grad_norm": 0.3153621677181947, + "learning_rate": 8.729287157903265e-05, + "loss": 2.9857, + "step": 19900 + }, + { + "epoch": 0.9265544614381823, + "grad_norm": 0.3502942360640074, + "learning_rate": 8.729106721967801e-05, + "loss": 2.9387, + "step": 19901 + }, + { + "epoch": 0.9266010196242754, + "grad_norm": 0.31418003442846826, + "learning_rate": 8.728926275087777e-05, + "loss": 2.9357, + "step": 19902 + }, + { + "epoch": 
0.9266475778103686, + "grad_norm": 0.3450281704646486, + "learning_rate": 8.728745817263722e-05, + "loss": 2.9498, + "step": 19903 + }, + { + "epoch": 0.9266941359964616, + "grad_norm": 0.3262774899995041, + "learning_rate": 8.728565348496165e-05, + "loss": 2.8837, + "step": 19904 + }, + { + "epoch": 0.9267406941825547, + "grad_norm": 0.3743904585672651, + "learning_rate": 8.728384868785634e-05, + "loss": 2.8232, + "step": 19905 + }, + { + "epoch": 0.9267872523686477, + "grad_norm": 0.29958650561166783, + "learning_rate": 8.728204378132662e-05, + "loss": 2.8365, + "step": 19906 + }, + { + "epoch": 0.9268338105547408, + "grad_norm": 0.35703107339325113, + "learning_rate": 8.728023876537778e-05, + "loss": 2.935, + "step": 19907 + }, + { + "epoch": 0.9268803687408339, + "grad_norm": 0.3104447547003485, + "learning_rate": 8.727843364001512e-05, + "loss": 2.8413, + "step": 19908 + }, + { + "epoch": 0.9269269269269269, + "grad_norm": 0.3550810732242213, + "learning_rate": 8.727662840524393e-05, + "loss": 2.8097, + "step": 19909 + }, + { + "epoch": 0.92697348511302, + "grad_norm": 0.28172035524351774, + "learning_rate": 8.727482306106949e-05, + "loss": 2.8149, + "step": 19910 + }, + { + "epoch": 0.927020043299113, + "grad_norm": 0.3682478125961661, + "learning_rate": 8.727301760749713e-05, + "loss": 2.8144, + "step": 19911 + }, + { + "epoch": 0.9270666014852061, + "grad_norm": 0.3189001287449222, + "learning_rate": 8.727121204453212e-05, + "loss": 2.8303, + "step": 19912 + }, + { + "epoch": 0.9271131596712993, + "grad_norm": 0.4146810778742443, + "learning_rate": 8.72694063721798e-05, + "loss": 3.0238, + "step": 19913 + }, + { + "epoch": 0.9271597178573923, + "grad_norm": 0.30153381030865006, + "learning_rate": 8.726760059044542e-05, + "loss": 2.8387, + "step": 19914 + }, + { + "epoch": 0.9272062760434854, + "grad_norm": 0.3875229810646753, + "learning_rate": 8.726579469933432e-05, + "loss": 2.8674, + "step": 19915 + }, + { + "epoch": 0.9272528342295784, + "grad_norm": 0.34261027118728266, + "learning_rate": 8.726398869885177e-05, + "loss": 2.8172, + "step": 19916 + }, + { + "epoch": 0.9272993924156715, + "grad_norm": 0.37884185446234, + "learning_rate": 8.72621825890031e-05, + "loss": 2.9598, + "step": 19917 + }, + { + "epoch": 0.9273459506017645, + "grad_norm": 0.34987130505590064, + "learning_rate": 8.726037636979358e-05, + "loss": 2.8254, + "step": 19918 + }, + { + "epoch": 0.9273925087878576, + "grad_norm": 0.397909498502011, + "learning_rate": 8.725857004122853e-05, + "loss": 2.908, + "step": 19919 + }, + { + "epoch": 0.9274390669739507, + "grad_norm": 0.37785114844053014, + "learning_rate": 8.725676360331325e-05, + "loss": 2.8447, + "step": 19920 + }, + { + "epoch": 0.9274856251600437, + "grad_norm": 0.38346062304655565, + "learning_rate": 8.725495705605305e-05, + "loss": 2.8443, + "step": 19921 + }, + { + "epoch": 0.9275321833461369, + "grad_norm": 0.3402342226277638, + "learning_rate": 8.72531503994532e-05, + "loss": 2.8468, + "step": 19922 + }, + { + "epoch": 0.9275787415322299, + "grad_norm": 0.4258356704447602, + "learning_rate": 8.725134363351903e-05, + "loss": 2.886, + "step": 19923 + }, + { + "epoch": 0.927625299718323, + "grad_norm": 0.3391598596498011, + "learning_rate": 8.724953675825585e-05, + "loss": 2.9462, + "step": 19924 + }, + { + "epoch": 0.9276718579044161, + "grad_norm": 0.39085609436961255, + "learning_rate": 8.724772977366893e-05, + "loss": 2.9213, + "step": 19925 + }, + { + "epoch": 0.9277184160905091, + "grad_norm": 0.3226716981772889, + "learning_rate": 
8.724592267976359e-05, + "loss": 2.8972, + "step": 19926 + }, + { + "epoch": 0.9277649742766022, + "grad_norm": 0.3860206587507008, + "learning_rate": 8.724411547654516e-05, + "loss": 2.8715, + "step": 19927 + }, + { + "epoch": 0.9278115324626952, + "grad_norm": 0.37643384619445913, + "learning_rate": 8.724230816401891e-05, + "loss": 2.8739, + "step": 19928 + }, + { + "epoch": 0.9278580906487883, + "grad_norm": 0.3786307531556199, + "learning_rate": 8.724050074219014e-05, + "loss": 2.8514, + "step": 19929 + }, + { + "epoch": 0.9279046488348814, + "grad_norm": 0.373733548889489, + "learning_rate": 8.723869321106419e-05, + "loss": 2.906, + "step": 19930 + }, + { + "epoch": 0.9279512070209744, + "grad_norm": 0.3408661148905472, + "learning_rate": 8.723688557064633e-05, + "loss": 2.9601, + "step": 19931 + }, + { + "epoch": 0.9279977652070676, + "grad_norm": 0.35002396013164144, + "learning_rate": 8.723507782094188e-05, + "loss": 2.7379, + "step": 19932 + }, + { + "epoch": 0.9280443233931606, + "grad_norm": 0.37497352572118753, + "learning_rate": 8.723326996195616e-05, + "loss": 2.98, + "step": 19933 + }, + { + "epoch": 0.9280908815792537, + "grad_norm": 0.3194470714300806, + "learning_rate": 8.723146199369445e-05, + "loss": 2.8416, + "step": 19934 + }, + { + "epoch": 0.9281374397653468, + "grad_norm": 0.36961458896084715, + "learning_rate": 8.722965391616206e-05, + "loss": 2.8165, + "step": 19935 + }, + { + "epoch": 0.9281839979514398, + "grad_norm": 0.2971811497416803, + "learning_rate": 8.722784572936432e-05, + "loss": 2.7299, + "step": 19936 + }, + { + "epoch": 0.9282305561375329, + "grad_norm": 0.3677226505445917, + "learning_rate": 8.72260374333065e-05, + "loss": 2.7769, + "step": 19937 + }, + { + "epoch": 0.9282771143236259, + "grad_norm": 0.31092518823620174, + "learning_rate": 8.722422902799395e-05, + "loss": 2.8016, + "step": 19938 + }, + { + "epoch": 0.928323672509719, + "grad_norm": 0.33909126006286866, + "learning_rate": 8.722242051343195e-05, + "loss": 2.8956, + "step": 19939 + }, + { + "epoch": 0.928370230695812, + "grad_norm": 0.33553893485225283, + "learning_rate": 8.722061188962581e-05, + "loss": 2.8903, + "step": 19940 + }, + { + "epoch": 0.9284167888819052, + "grad_norm": 0.34163175162270476, + "learning_rate": 8.721880315658082e-05, + "loss": 2.9087, + "step": 19941 + }, + { + "epoch": 0.9284633470679983, + "grad_norm": 0.3013477296181801, + "learning_rate": 8.721699431430233e-05, + "loss": 2.7318, + "step": 19942 + }, + { + "epoch": 0.9285099052540913, + "grad_norm": 0.3501191130689604, + "learning_rate": 8.721518536279563e-05, + "loss": 2.9415, + "step": 19943 + }, + { + "epoch": 0.9285564634401844, + "grad_norm": 0.3534356356256747, + "learning_rate": 8.721337630206603e-05, + "loss": 2.7942, + "step": 19944 + }, + { + "epoch": 0.9286030216262774, + "grad_norm": 0.32124936927104786, + "learning_rate": 8.721156713211882e-05, + "loss": 2.9067, + "step": 19945 + }, + { + "epoch": 0.9286495798123705, + "grad_norm": 0.35624296035249764, + "learning_rate": 8.720975785295934e-05, + "loss": 2.9271, + "step": 19946 + }, + { + "epoch": 0.9286961379984636, + "grad_norm": 0.3510207388586152, + "learning_rate": 8.720794846459288e-05, + "loss": 2.9267, + "step": 19947 + }, + { + "epoch": 0.9287426961845566, + "grad_norm": 0.33125662868819356, + "learning_rate": 8.720613896702477e-05, + "loss": 2.8507, + "step": 19948 + }, + { + "epoch": 0.9287892543706497, + "grad_norm": 0.32079770706270366, + "learning_rate": 8.720432936026028e-05, + "loss": 2.8222, + "step": 19949 + }, + { + "epoch": 
0.9288358125567427, + "grad_norm": 0.30938067286289395, + "learning_rate": 8.720251964430477e-05, + "loss": 2.939, + "step": 19950 + }, + { + "epoch": 0.9288823707428359, + "grad_norm": 0.3405394880202336, + "learning_rate": 8.720070981916354e-05, + "loss": 2.8635, + "step": 19951 + }, + { + "epoch": 0.928928928928929, + "grad_norm": 0.32256505554348036, + "learning_rate": 8.719889988484187e-05, + "loss": 2.8925, + "step": 19952 + }, + { + "epoch": 0.928975487115022, + "grad_norm": 0.33914589254377875, + "learning_rate": 8.71970898413451e-05, + "loss": 2.8761, + "step": 19953 + }, + { + "epoch": 0.9290220453011151, + "grad_norm": 0.3377394767501338, + "learning_rate": 8.719527968867853e-05, + "loss": 2.9864, + "step": 19954 + }, + { + "epoch": 0.9290686034872081, + "grad_norm": 0.3442768499906313, + "learning_rate": 8.719346942684747e-05, + "loss": 2.9357, + "step": 19955 + }, + { + "epoch": 0.9291151616733012, + "grad_norm": 0.3458167221289353, + "learning_rate": 8.719165905585726e-05, + "loss": 2.8784, + "step": 19956 + }, + { + "epoch": 0.9291617198593943, + "grad_norm": 0.33726750868537464, + "learning_rate": 8.718984857571319e-05, + "loss": 2.9581, + "step": 19957 + }, + { + "epoch": 0.9292082780454873, + "grad_norm": 0.3227078504992754, + "learning_rate": 8.718803798642059e-05, + "loss": 2.8596, + "step": 19958 + }, + { + "epoch": 0.9292548362315804, + "grad_norm": 0.3700486652516699, + "learning_rate": 8.718622728798475e-05, + "loss": 2.9416, + "step": 19959 + }, + { + "epoch": 0.9293013944176735, + "grad_norm": 0.33257412139555337, + "learning_rate": 8.7184416480411e-05, + "loss": 2.7712, + "step": 19960 + }, + { + "epoch": 0.9293479526037666, + "grad_norm": 0.36196942274192556, + "learning_rate": 8.718260556370464e-05, + "loss": 2.9159, + "step": 19961 + }, + { + "epoch": 0.9293945107898596, + "grad_norm": 0.3741367356087119, + "learning_rate": 8.7180794537871e-05, + "loss": 2.8654, + "step": 19962 + }, + { + "epoch": 0.9294410689759527, + "grad_norm": 0.3259363004193534, + "learning_rate": 8.71789834029154e-05, + "loss": 2.8978, + "step": 19963 + }, + { + "epoch": 0.9294876271620458, + "grad_norm": 0.3699992105041467, + "learning_rate": 8.717717215884314e-05, + "loss": 2.9262, + "step": 19964 + }, + { + "epoch": 0.9295341853481388, + "grad_norm": 0.33400663527392827, + "learning_rate": 8.717536080565955e-05, + "loss": 2.8483, + "step": 19965 + }, + { + "epoch": 0.9295807435342319, + "grad_norm": 0.37974584122871397, + "learning_rate": 8.717354934336993e-05, + "loss": 2.9236, + "step": 19966 + }, + { + "epoch": 0.9296273017203249, + "grad_norm": 0.4155932096474554, + "learning_rate": 8.71717377719796e-05, + "loss": 2.8761, + "step": 19967 + }, + { + "epoch": 0.929673859906418, + "grad_norm": 0.3522559212483985, + "learning_rate": 8.716992609149389e-05, + "loss": 2.8341, + "step": 19968 + }, + { + "epoch": 0.9297204180925112, + "grad_norm": 0.4041931475540988, + "learning_rate": 8.716811430191811e-05, + "loss": 2.9328, + "step": 19969 + }, + { + "epoch": 0.9297669762786042, + "grad_norm": 0.35618343281290554, + "learning_rate": 8.716630240325757e-05, + "loss": 2.927, + "step": 19970 + }, + { + "epoch": 0.9298135344646973, + "grad_norm": 0.39233750639374537, + "learning_rate": 8.716449039551759e-05, + "loss": 2.854, + "step": 19971 + }, + { + "epoch": 0.9298600926507903, + "grad_norm": 0.338283360747868, + "learning_rate": 8.716267827870351e-05, + "loss": 2.7796, + "step": 19972 + }, + { + "epoch": 0.9299066508368834, + "grad_norm": 0.3802688648620278, + "learning_rate": 
8.716086605282062e-05, + "loss": 2.8564, + "step": 19973 + }, + { + "epoch": 0.9299532090229765, + "grad_norm": 0.359722122523184, + "learning_rate": 8.715905371787425e-05, + "loss": 2.8669, + "step": 19974 + }, + { + "epoch": 0.9299997672090695, + "grad_norm": 0.3785644424591901, + "learning_rate": 8.715724127386972e-05, + "loss": 2.8427, + "step": 19975 + }, + { + "epoch": 0.9300463253951626, + "grad_norm": 0.41310196047719727, + "learning_rate": 8.715542872081235e-05, + "loss": 2.8658, + "step": 19976 + }, + { + "epoch": 0.9300928835812556, + "grad_norm": 0.36612820738886426, + "learning_rate": 8.715361605870745e-05, + "loss": 2.7975, + "step": 19977 + }, + { + "epoch": 0.9301394417673488, + "grad_norm": 0.34308536223249253, + "learning_rate": 8.715180328756034e-05, + "loss": 2.9231, + "step": 19978 + }, + { + "epoch": 0.9301859999534419, + "grad_norm": 0.3732732750182066, + "learning_rate": 8.714999040737635e-05, + "loss": 2.8513, + "step": 19979 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.33025070285704167, + "learning_rate": 8.714817741816081e-05, + "loss": 2.8079, + "step": 19980 + }, + { + "epoch": 0.930279116325628, + "grad_norm": 0.3644046828959765, + "learning_rate": 8.714636431991902e-05, + "loss": 2.8385, + "step": 19981 + }, + { + "epoch": 0.930325674511721, + "grad_norm": 0.3613781406581077, + "learning_rate": 8.714455111265632e-05, + "loss": 2.8794, + "step": 19982 + }, + { + "epoch": 0.9303722326978141, + "grad_norm": 0.3672681145388402, + "learning_rate": 8.7142737796378e-05, + "loss": 2.9179, + "step": 19983 + }, + { + "epoch": 0.9304187908839071, + "grad_norm": 0.37462831572483557, + "learning_rate": 8.714092437108943e-05, + "loss": 2.9158, + "step": 19984 + }, + { + "epoch": 0.9304653490700002, + "grad_norm": 0.35492336060762103, + "learning_rate": 8.71391108367959e-05, + "loss": 2.9235, + "step": 19985 + }, + { + "epoch": 0.9305119072560933, + "grad_norm": 0.4036885499802152, + "learning_rate": 8.713729719350273e-05, + "loss": 2.9555, + "step": 19986 + }, + { + "epoch": 0.9305584654421863, + "grad_norm": 0.3619696000098053, + "learning_rate": 8.713548344121525e-05, + "loss": 2.8862, + "step": 19987 + }, + { + "epoch": 0.9306050236282795, + "grad_norm": 0.34730578312433047, + "learning_rate": 8.71336695799388e-05, + "loss": 2.8378, + "step": 19988 + }, + { + "epoch": 0.9306515818143725, + "grad_norm": 0.38690122766368584, + "learning_rate": 8.713185560967866e-05, + "loss": 2.9637, + "step": 19989 + }, + { + "epoch": 0.9306981400004656, + "grad_norm": 0.3361200129099777, + "learning_rate": 8.713004153044022e-05, + "loss": 2.8882, + "step": 19990 + }, + { + "epoch": 0.9307446981865587, + "grad_norm": 0.3484353697606406, + "learning_rate": 8.712822734222874e-05, + "loss": 2.8131, + "step": 19991 + }, + { + "epoch": 0.9307912563726517, + "grad_norm": 0.35873732172035866, + "learning_rate": 8.712641304504957e-05, + "loss": 2.965, + "step": 19992 + }, + { + "epoch": 0.9308378145587448, + "grad_norm": 0.39068190332692865, + "learning_rate": 8.712459863890805e-05, + "loss": 2.9151, + "step": 19993 + }, + { + "epoch": 0.9308843727448378, + "grad_norm": 0.3221249463741238, + "learning_rate": 8.712278412380947e-05, + "loss": 2.9098, + "step": 19994 + }, + { + "epoch": 0.9309309309309309, + "grad_norm": 0.38995030602522346, + "learning_rate": 8.712096949975918e-05, + "loss": 3.0724, + "step": 19995 + }, + { + "epoch": 0.930977489117024, + "grad_norm": 0.329883174593127, + "learning_rate": 8.711915476676252e-05, + "loss": 2.8591, + "step": 19996 + }, + { + "epoch": 
0.931024047303117, + "grad_norm": 0.3843715402012034, + "learning_rate": 8.711733992482478e-05, + "loss": 2.8194, + "step": 19997 + }, + { + "epoch": 0.9310706054892102, + "grad_norm": 0.3071625930866441, + "learning_rate": 8.711552497395132e-05, + "loss": 2.8467, + "step": 19998 + }, + { + "epoch": 0.9311171636753032, + "grad_norm": 0.4074275959820046, + "learning_rate": 8.711370991414745e-05, + "loss": 2.8295, + "step": 19999 + }, + { + "epoch": 0.9311637218613963, + "grad_norm": 0.3887329963797546, + "learning_rate": 8.711189474541848e-05, + "loss": 2.8802, + "step": 20000 + }, + { + "epoch": 0.9312102800474894, + "grad_norm": 0.3486702912928154, + "learning_rate": 8.711007946776976e-05, + "loss": 2.8899, + "step": 20001 + }, + { + "epoch": 0.9312568382335824, + "grad_norm": 0.3539281910841805, + "learning_rate": 8.710826408120662e-05, + "loss": 2.8243, + "step": 20002 + }, + { + "epoch": 0.9313033964196755, + "grad_norm": 0.3294885325261975, + "learning_rate": 8.710644858573438e-05, + "loss": 2.8745, + "step": 20003 + }, + { + "epoch": 0.9313499546057685, + "grad_norm": 0.34226877655794946, + "learning_rate": 8.710463298135835e-05, + "loss": 2.7116, + "step": 20004 + }, + { + "epoch": 0.9313965127918616, + "grad_norm": 0.34252395106234995, + "learning_rate": 8.71028172680839e-05, + "loss": 2.9647, + "step": 20005 + }, + { + "epoch": 0.9314430709779546, + "grad_norm": 0.36332865000329717, + "learning_rate": 8.710100144591634e-05, + "loss": 3.0007, + "step": 20006 + }, + { + "epoch": 0.9314896291640478, + "grad_norm": 0.3682640029845658, + "learning_rate": 8.709918551486099e-05, + "loss": 2.8625, + "step": 20007 + }, + { + "epoch": 0.9315361873501409, + "grad_norm": 0.3149409411297151, + "learning_rate": 8.709736947492317e-05, + "loss": 2.827, + "step": 20008 + }, + { + "epoch": 0.9315827455362339, + "grad_norm": 0.33791652268501765, + "learning_rate": 8.709555332610823e-05, + "loss": 2.8074, + "step": 20009 + }, + { + "epoch": 0.931629303722327, + "grad_norm": 0.3262097383327392, + "learning_rate": 8.709373706842152e-05, + "loss": 2.8819, + "step": 20010 + }, + { + "epoch": 0.93167586190842, + "grad_norm": 0.3649867508957349, + "learning_rate": 8.709192070186831e-05, + "loss": 2.8721, + "step": 20011 + }, + { + "epoch": 0.9317224200945131, + "grad_norm": 0.33942127896019797, + "learning_rate": 8.7090104226454e-05, + "loss": 3.0163, + "step": 20012 + }, + { + "epoch": 0.9317689782806062, + "grad_norm": 0.3661961079666804, + "learning_rate": 8.708828764218387e-05, + "loss": 2.9143, + "step": 20013 + }, + { + "epoch": 0.9318155364666992, + "grad_norm": 0.35425022564462416, + "learning_rate": 8.708647094906328e-05, + "loss": 2.9039, + "step": 20014 + }, + { + "epoch": 0.9318620946527923, + "grad_norm": 0.3336386994975359, + "learning_rate": 8.708465414709753e-05, + "loss": 2.9597, + "step": 20015 + }, + { + "epoch": 0.9319086528388854, + "grad_norm": 0.3383998029853594, + "learning_rate": 8.708283723629199e-05, + "loss": 2.9382, + "step": 20016 + }, + { + "epoch": 0.9319552110249785, + "grad_norm": 0.3244159020203637, + "learning_rate": 8.708102021665198e-05, + "loss": 2.8507, + "step": 20017 + }, + { + "epoch": 0.9320017692110716, + "grad_norm": 0.3441684634133062, + "learning_rate": 8.707920308818282e-05, + "loss": 2.9181, + "step": 20018 + }, + { + "epoch": 0.9320483273971646, + "grad_norm": 0.31240787670357745, + "learning_rate": 8.707738585088985e-05, + "loss": 2.9676, + "step": 20019 + }, + { + "epoch": 0.9320948855832577, + "grad_norm": 0.3483354571867629, + "learning_rate": 
8.70755685047784e-05, + "loss": 2.8615, + "step": 20020 + }, + { + "epoch": 0.9321414437693507, + "grad_norm": 0.3510698866619721, + "learning_rate": 8.707375104985383e-05, + "loss": 3.0882, + "step": 20021 + }, + { + "epoch": 0.9321880019554438, + "grad_norm": 0.3424822925391649, + "learning_rate": 8.707193348612144e-05, + "loss": 2.8989, + "step": 20022 + }, + { + "epoch": 0.9322345601415369, + "grad_norm": 0.3573369723611084, + "learning_rate": 8.707011581358658e-05, + "loss": 2.9404, + "step": 20023 + }, + { + "epoch": 0.9322811183276299, + "grad_norm": 0.3371306746921816, + "learning_rate": 8.706829803225458e-05, + "loss": 2.9165, + "step": 20024 + }, + { + "epoch": 0.9323276765137231, + "grad_norm": 0.34430662381849203, + "learning_rate": 8.706648014213078e-05, + "loss": 2.9226, + "step": 20025 + }, + { + "epoch": 0.9323742346998161, + "grad_norm": 0.36215401552658216, + "learning_rate": 8.706466214322051e-05, + "loss": 2.8566, + "step": 20026 + }, + { + "epoch": 0.9324207928859092, + "grad_norm": 0.3501034301388943, + "learning_rate": 8.706284403552912e-05, + "loss": 2.9283, + "step": 20027 + }, + { + "epoch": 0.9324673510720022, + "grad_norm": 0.3552166300735434, + "learning_rate": 8.706102581906192e-05, + "loss": 2.8691, + "step": 20028 + }, + { + "epoch": 0.9325139092580953, + "grad_norm": 0.3485781720070784, + "learning_rate": 8.705920749382425e-05, + "loss": 2.8622, + "step": 20029 + }, + { + "epoch": 0.9325604674441884, + "grad_norm": 0.3588211108200624, + "learning_rate": 8.705738905982149e-05, + "loss": 2.7695, + "step": 20030 + }, + { + "epoch": 0.9326070256302814, + "grad_norm": 0.37464633053049823, + "learning_rate": 8.70555705170589e-05, + "loss": 2.913, + "step": 20031 + }, + { + "epoch": 0.9326535838163745, + "grad_norm": 0.3692820527821508, + "learning_rate": 8.70537518655419e-05, + "loss": 2.8633, + "step": 20032 + }, + { + "epoch": 0.9327001420024675, + "grad_norm": 0.3530770420840044, + "learning_rate": 8.705193310527578e-05, + "loss": 2.8883, + "step": 20033 + }, + { + "epoch": 0.9327467001885607, + "grad_norm": 0.3547812553733484, + "learning_rate": 8.705011423626587e-05, + "loss": 2.96, + "step": 20034 + }, + { + "epoch": 0.9327932583746538, + "grad_norm": 0.33815975924303066, + "learning_rate": 8.704829525851754e-05, + "loss": 2.8102, + "step": 20035 + }, + { + "epoch": 0.9328398165607468, + "grad_norm": 0.33009794850520013, + "learning_rate": 8.704647617203611e-05, + "loss": 2.8544, + "step": 20036 + }, + { + "epoch": 0.9328863747468399, + "grad_norm": 0.38558889163925075, + "learning_rate": 8.70446569768269e-05, + "loss": 2.8336, + "step": 20037 + }, + { + "epoch": 0.9329329329329329, + "grad_norm": 0.33043880956778376, + "learning_rate": 8.704283767289531e-05, + "loss": 2.9355, + "step": 20038 + }, + { + "epoch": 0.932979491119026, + "grad_norm": 0.3993969024822473, + "learning_rate": 8.704101826024662e-05, + "loss": 2.9621, + "step": 20039 + }, + { + "epoch": 0.9330260493051191, + "grad_norm": 0.33936737797544547, + "learning_rate": 8.703919873888619e-05, + "loss": 2.7538, + "step": 20040 + }, + { + "epoch": 0.9330726074912121, + "grad_norm": 0.34712184141588853, + "learning_rate": 8.703737910881937e-05, + "loss": 2.8194, + "step": 20041 + }, + { + "epoch": 0.9331191656773052, + "grad_norm": 0.34192801179857946, + "learning_rate": 8.703555937005148e-05, + "loss": 2.8941, + "step": 20042 + }, + { + "epoch": 0.9331657238633982, + "grad_norm": 0.3440922216531969, + "learning_rate": 8.703373952258788e-05, + "loss": 2.852, + "step": 20043 + }, + { + "epoch": 
0.9332122820494914, + "grad_norm": 0.32118856057310946, + "learning_rate": 8.70319195664339e-05, + "loss": 2.7903, + "step": 20044 + }, + { + "epoch": 0.9332588402355845, + "grad_norm": 0.3638133095710278, + "learning_rate": 8.703009950159488e-05, + "loss": 2.9082, + "step": 20045 + }, + { + "epoch": 0.9333053984216775, + "grad_norm": 0.3391851636623026, + "learning_rate": 8.702827932807618e-05, + "loss": 2.9844, + "step": 20046 + }, + { + "epoch": 0.9333519566077706, + "grad_norm": 0.3454916173596162, + "learning_rate": 8.702645904588311e-05, + "loss": 2.7878, + "step": 20047 + }, + { + "epoch": 0.9333985147938636, + "grad_norm": 0.3175849939156026, + "learning_rate": 8.702463865502104e-05, + "loss": 2.8382, + "step": 20048 + }, + { + "epoch": 0.9334450729799567, + "grad_norm": 0.34720329403078654, + "learning_rate": 8.70228181554953e-05, + "loss": 2.9508, + "step": 20049 + }, + { + "epoch": 0.9334916311660497, + "grad_norm": 0.29421935627920426, + "learning_rate": 8.702099754731124e-05, + "loss": 2.7754, + "step": 20050 + }, + { + "epoch": 0.9335381893521428, + "grad_norm": 0.3114440648119904, + "learning_rate": 8.70191768304742e-05, + "loss": 2.8854, + "step": 20051 + }, + { + "epoch": 0.933584747538236, + "grad_norm": 0.3302305163205585, + "learning_rate": 8.701735600498951e-05, + "loss": 3.0083, + "step": 20052 + }, + { + "epoch": 0.933631305724329, + "grad_norm": 0.31505731130330833, + "learning_rate": 8.701553507086254e-05, + "loss": 2.854, + "step": 20053 + }, + { + "epoch": 0.9336778639104221, + "grad_norm": 0.32533633734833906, + "learning_rate": 8.701371402809861e-05, + "loss": 3.0383, + "step": 20054 + }, + { + "epoch": 0.9337244220965151, + "grad_norm": 0.3203735656056499, + "learning_rate": 8.70118928767031e-05, + "loss": 2.8653, + "step": 20055 + }, + { + "epoch": 0.9337709802826082, + "grad_norm": 0.3369409033112031, + "learning_rate": 8.70100716166813e-05, + "loss": 2.9518, + "step": 20056 + }, + { + "epoch": 0.9338175384687013, + "grad_norm": 0.315460656962546, + "learning_rate": 8.700825024803862e-05, + "loss": 2.7335, + "step": 20057 + }, + { + "epoch": 0.9338640966547943, + "grad_norm": 0.34741304444045357, + "learning_rate": 8.700642877078033e-05, + "loss": 2.9248, + "step": 20058 + }, + { + "epoch": 0.9339106548408874, + "grad_norm": 0.3113082244441036, + "learning_rate": 8.700460718491184e-05, + "loss": 2.8848, + "step": 20059 + }, + { + "epoch": 0.9339572130269804, + "grad_norm": 0.34397379704271086, + "learning_rate": 8.700278549043849e-05, + "loss": 2.887, + "step": 20060 + }, + { + "epoch": 0.9340037712130735, + "grad_norm": 0.370907898975822, + "learning_rate": 8.700096368736558e-05, + "loss": 2.9734, + "step": 20061 + }, + { + "epoch": 0.9340503293991667, + "grad_norm": 0.3268831360134918, + "learning_rate": 8.69991417756985e-05, + "loss": 2.9053, + "step": 20062 + }, + { + "epoch": 0.9340968875852597, + "grad_norm": 0.3527809932054862, + "learning_rate": 8.699731975544259e-05, + "loss": 2.8603, + "step": 20063 + }, + { + "epoch": 0.9341434457713528, + "grad_norm": 0.34976268387227005, + "learning_rate": 8.699549762660319e-05, + "loss": 2.9416, + "step": 20064 + }, + { + "epoch": 0.9341900039574458, + "grad_norm": 0.3172398539032775, + "learning_rate": 8.699367538918563e-05, + "loss": 2.8556, + "step": 20065 + }, + { + "epoch": 0.9342365621435389, + "grad_norm": 0.3400456415853378, + "learning_rate": 8.699185304319531e-05, + "loss": 2.8481, + "step": 20066 + }, + { + "epoch": 0.934283120329632, + "grad_norm": 0.3102267504272192, + "learning_rate": 
8.699003058863752e-05, + "loss": 2.9177, + "step": 20067 + }, + { + "epoch": 0.934329678515725, + "grad_norm": 0.40074904034343717, + "learning_rate": 8.698820802551763e-05, + "loss": 2.9948, + "step": 20068 + }, + { + "epoch": 0.9343762367018181, + "grad_norm": 0.3568004045053589, + "learning_rate": 8.698638535384101e-05, + "loss": 2.9873, + "step": 20069 + }, + { + "epoch": 0.9344227948879111, + "grad_norm": 0.359002086455337, + "learning_rate": 8.698456257361299e-05, + "loss": 3.002, + "step": 20070 + }, + { + "epoch": 0.9344693530740042, + "grad_norm": 0.3757519108364942, + "learning_rate": 8.698273968483892e-05, + "loss": 2.9338, + "step": 20071 + }, + { + "epoch": 0.9345159112600973, + "grad_norm": 0.367736982648498, + "learning_rate": 8.698091668752414e-05, + "loss": 2.835, + "step": 20072 + }, + { + "epoch": 0.9345624694461904, + "grad_norm": 0.35711808067699136, + "learning_rate": 8.697909358167405e-05, + "loss": 2.9071, + "step": 20073 + }, + { + "epoch": 0.9346090276322835, + "grad_norm": 0.3527290845805826, + "learning_rate": 8.697727036729393e-05, + "loss": 2.8341, + "step": 20074 + }, + { + "epoch": 0.9346555858183765, + "grad_norm": 0.32655268099589346, + "learning_rate": 8.697544704438916e-05, + "loss": 2.8655, + "step": 20075 + }, + { + "epoch": 0.9347021440044696, + "grad_norm": 0.32991750013283005, + "learning_rate": 8.697362361296512e-05, + "loss": 2.9544, + "step": 20076 + }, + { + "epoch": 0.9347487021905626, + "grad_norm": 0.3274926908888543, + "learning_rate": 8.697180007302712e-05, + "loss": 2.8242, + "step": 20077 + }, + { + "epoch": 0.9347952603766557, + "grad_norm": 0.34053888807140303, + "learning_rate": 8.696997642458054e-05, + "loss": 2.7806, + "step": 20078 + }, + { + "epoch": 0.9348418185627488, + "grad_norm": 0.31845812463479256, + "learning_rate": 8.696815266763072e-05, + "loss": 2.8229, + "step": 20079 + }, + { + "epoch": 0.9348883767488418, + "grad_norm": 0.33686672499056547, + "learning_rate": 8.696632880218301e-05, + "loss": 2.936, + "step": 20080 + }, + { + "epoch": 0.934934934934935, + "grad_norm": 0.3396451873257728, + "learning_rate": 8.696450482824277e-05, + "loss": 2.8543, + "step": 20081 + }, + { + "epoch": 0.934981493121028, + "grad_norm": 0.31347361509889365, + "learning_rate": 8.696268074581533e-05, + "loss": 2.9798, + "step": 20082 + }, + { + "epoch": 0.9350280513071211, + "grad_norm": 0.32753157812675243, + "learning_rate": 8.696085655490609e-05, + "loss": 3.0125, + "step": 20083 + }, + { + "epoch": 0.9350746094932142, + "grad_norm": 0.32229873801558634, + "learning_rate": 8.695903225552036e-05, + "loss": 2.9439, + "step": 20084 + }, + { + "epoch": 0.9351211676793072, + "grad_norm": 0.317428787157445, + "learning_rate": 8.695720784766352e-05, + "loss": 2.8706, + "step": 20085 + }, + { + "epoch": 0.9351677258654003, + "grad_norm": 0.3550463028244514, + "learning_rate": 8.695538333134091e-05, + "loss": 2.9544, + "step": 20086 + }, + { + "epoch": 0.9352142840514933, + "grad_norm": 0.3107638840974363, + "learning_rate": 8.69535587065579e-05, + "loss": 2.8718, + "step": 20087 + }, + { + "epoch": 0.9352608422375864, + "grad_norm": 0.3311222904884475, + "learning_rate": 8.695173397331983e-05, + "loss": 2.8156, + "step": 20088 + }, + { + "epoch": 0.9353074004236795, + "grad_norm": 0.32598723165070725, + "learning_rate": 8.694990913163206e-05, + "loss": 2.7945, + "step": 20089 + }, + { + "epoch": 0.9353539586097726, + "grad_norm": 0.3119416381932124, + "learning_rate": 8.694808418149995e-05, + "loss": 2.902, + "step": 20090 + }, + { + "epoch": 
0.9354005167958657, + "grad_norm": 0.35523775011451797, + "learning_rate": 8.694625912292885e-05, + "loss": 2.9321, + "step": 20091 + }, + { + "epoch": 0.9354470749819587, + "grad_norm": 0.31776802496770507, + "learning_rate": 8.694443395592413e-05, + "loss": 2.8318, + "step": 20092 + }, + { + "epoch": 0.9354936331680518, + "grad_norm": 0.38547177384147896, + "learning_rate": 8.694260868049111e-05, + "loss": 2.9489, + "step": 20093 + }, + { + "epoch": 0.9355401913541448, + "grad_norm": 0.3511895704379465, + "learning_rate": 8.69407832966352e-05, + "loss": 3.0339, + "step": 20094 + }, + { + "epoch": 0.9355867495402379, + "grad_norm": 0.364216361482898, + "learning_rate": 8.693895780436172e-05, + "loss": 2.9943, + "step": 20095 + }, + { + "epoch": 0.935633307726331, + "grad_norm": 0.33774285302055124, + "learning_rate": 8.693713220367604e-05, + "loss": 2.8907, + "step": 20096 + }, + { + "epoch": 0.935679865912424, + "grad_norm": 0.3903627641054387, + "learning_rate": 8.693530649458352e-05, + "loss": 2.8743, + "step": 20097 + }, + { + "epoch": 0.9357264240985171, + "grad_norm": 0.3119425607600048, + "learning_rate": 8.693348067708951e-05, + "loss": 2.8356, + "step": 20098 + }, + { + "epoch": 0.9357729822846101, + "grad_norm": 0.3574981136775035, + "learning_rate": 8.693165475119936e-05, + "loss": 2.805, + "step": 20099 + }, + { + "epoch": 0.9358195404707033, + "grad_norm": 0.35305276936166663, + "learning_rate": 8.692982871691846e-05, + "loss": 2.8518, + "step": 20100 + }, + { + "epoch": 0.9358660986567964, + "grad_norm": 0.376344928443062, + "learning_rate": 8.692800257425214e-05, + "loss": 2.753, + "step": 20101 + }, + { + "epoch": 0.9359126568428894, + "grad_norm": 0.34685433344042815, + "learning_rate": 8.692617632320577e-05, + "loss": 2.8476, + "step": 20102 + }, + { + "epoch": 0.9359592150289825, + "grad_norm": 0.36316077303505206, + "learning_rate": 8.692434996378471e-05, + "loss": 2.9334, + "step": 20103 + }, + { + "epoch": 0.9360057732150755, + "grad_norm": 0.3449201697087356, + "learning_rate": 8.692252349599432e-05, + "loss": 2.8389, + "step": 20104 + }, + { + "epoch": 0.9360523314011686, + "grad_norm": 0.32233131895846906, + "learning_rate": 8.692069691983997e-05, + "loss": 2.8071, + "step": 20105 + }, + { + "epoch": 0.9360988895872617, + "grad_norm": 0.3803865178543732, + "learning_rate": 8.6918870235327e-05, + "loss": 2.9504, + "step": 20106 + }, + { + "epoch": 0.9361454477733547, + "grad_norm": 0.3215462440340051, + "learning_rate": 8.69170434424608e-05, + "loss": 2.9265, + "step": 20107 + }, + { + "epoch": 0.9361920059594478, + "grad_norm": 0.3171113776369758, + "learning_rate": 8.69152165412467e-05, + "loss": 2.9333, + "step": 20108 + }, + { + "epoch": 0.9362385641455409, + "grad_norm": 0.3458387812839573, + "learning_rate": 8.691338953169009e-05, + "loss": 2.8541, + "step": 20109 + }, + { + "epoch": 0.936285122331634, + "grad_norm": 0.34972766991268134, + "learning_rate": 8.691156241379632e-05, + "loss": 2.9144, + "step": 20110 + }, + { + "epoch": 0.9363316805177271, + "grad_norm": 0.3747519704476367, + "learning_rate": 8.690973518757072e-05, + "loss": 2.9789, + "step": 20111 + }, + { + "epoch": 0.9363782387038201, + "grad_norm": 0.3705018032044838, + "learning_rate": 8.690790785301873e-05, + "loss": 2.867, + "step": 20112 + }, + { + "epoch": 0.9364247968899132, + "grad_norm": 0.37132789911493597, + "learning_rate": 8.690608041014563e-05, + "loss": 2.8648, + "step": 20113 + }, + { + "epoch": 0.9364713550760062, + "grad_norm": 0.366774125691635, + "learning_rate": 
8.690425285895683e-05, + "loss": 2.8334, + "step": 20114 + }, + { + "epoch": 0.9365179132620993, + "grad_norm": 0.3395860110212461, + "learning_rate": 8.690242519945768e-05, + "loss": 2.7897, + "step": 20115 + }, + { + "epoch": 0.9365644714481923, + "grad_norm": 0.3454978272154178, + "learning_rate": 8.690059743165356e-05, + "loss": 2.8781, + "step": 20116 + }, + { + "epoch": 0.9366110296342854, + "grad_norm": 0.37819981622503684, + "learning_rate": 8.689876955554981e-05, + "loss": 2.9326, + "step": 20117 + }, + { + "epoch": 0.9366575878203786, + "grad_norm": 0.3475994266157326, + "learning_rate": 8.68969415711518e-05, + "loss": 2.9925, + "step": 20118 + }, + { + "epoch": 0.9367041460064716, + "grad_norm": 0.3767417779246377, + "learning_rate": 8.68951134784649e-05, + "loss": 2.9931, + "step": 20119 + }, + { + "epoch": 0.9367507041925647, + "grad_norm": 0.3386189010016108, + "learning_rate": 8.68932852774945e-05, + "loss": 2.8571, + "step": 20120 + }, + { + "epoch": 0.9367972623786577, + "grad_norm": 0.35249469327183214, + "learning_rate": 8.689145696824594e-05, + "loss": 2.8649, + "step": 20121 + }, + { + "epoch": 0.9368438205647508, + "grad_norm": 0.331035854993247, + "learning_rate": 8.688962855072457e-05, + "loss": 2.7408, + "step": 20122 + }, + { + "epoch": 0.9368903787508439, + "grad_norm": 0.31864379336210386, + "learning_rate": 8.688780002493578e-05, + "loss": 2.8422, + "step": 20123 + }, + { + "epoch": 0.9369369369369369, + "grad_norm": 0.3315145828012823, + "learning_rate": 8.688597139088493e-05, + "loss": 2.839, + "step": 20124 + }, + { + "epoch": 0.93698349512303, + "grad_norm": 0.36115138550555875, + "learning_rate": 8.68841426485774e-05, + "loss": 2.8783, + "step": 20125 + }, + { + "epoch": 0.937030053309123, + "grad_norm": 0.3543895177350154, + "learning_rate": 8.688231379801853e-05, + "loss": 2.7944, + "step": 20126 + }, + { + "epoch": 0.9370766114952161, + "grad_norm": 0.37921734037375626, + "learning_rate": 8.688048483921372e-05, + "loss": 2.8138, + "step": 20127 + }, + { + "epoch": 0.9371231696813093, + "grad_norm": 0.37694843939774986, + "learning_rate": 8.687865577216829e-05, + "loss": 2.8972, + "step": 20128 + }, + { + "epoch": 0.9371697278674023, + "grad_norm": 0.3430009152044954, + "learning_rate": 8.687682659688767e-05, + "loss": 2.8916, + "step": 20129 + }, + { + "epoch": 0.9372162860534954, + "grad_norm": 0.37168135288675597, + "learning_rate": 8.687499731337718e-05, + "loss": 2.9211, + "step": 20130 + }, + { + "epoch": 0.9372628442395884, + "grad_norm": 0.34081510541244026, + "learning_rate": 8.687316792164221e-05, + "loss": 2.893, + "step": 20131 + }, + { + "epoch": 0.9373094024256815, + "grad_norm": 0.3543520032648792, + "learning_rate": 8.687133842168814e-05, + "loss": 2.9123, + "step": 20132 + }, + { + "epoch": 0.9373559606117746, + "grad_norm": 0.3898558954046355, + "learning_rate": 8.68695088135203e-05, + "loss": 2.9451, + "step": 20133 + }, + { + "epoch": 0.9374025187978676, + "grad_norm": 0.3215181594272123, + "learning_rate": 8.68676790971441e-05, + "loss": 2.8775, + "step": 20134 + }, + { + "epoch": 0.9374490769839607, + "grad_norm": 0.37414545354613743, + "learning_rate": 8.686584927256487e-05, + "loss": 2.9187, + "step": 20135 + }, + { + "epoch": 0.9374956351700537, + "grad_norm": 0.35983253735087506, + "learning_rate": 8.686401933978803e-05, + "loss": 2.8394, + "step": 20136 + }, + { + "epoch": 0.9375421933561469, + "grad_norm": 0.34022692265658294, + "learning_rate": 8.686218929881893e-05, + "loss": 2.9851, + "step": 20137 + }, + { + "epoch": 
0.9375887515422399, + "grad_norm": 0.3198040791419867, + "learning_rate": 8.686035914966291e-05, + "loss": 2.8752, + "step": 20138 + }, + { + "epoch": 0.937635309728333, + "grad_norm": 0.32337448703033134, + "learning_rate": 8.685852889232538e-05, + "loss": 2.9004, + "step": 20139 + }, + { + "epoch": 0.9376818679144261, + "grad_norm": 0.297092047287314, + "learning_rate": 8.68566985268117e-05, + "loss": 2.9098, + "step": 20140 + }, + { + "epoch": 0.9377284261005191, + "grad_norm": 0.34542961332525396, + "learning_rate": 8.685486805312724e-05, + "loss": 2.8756, + "step": 20141 + }, + { + "epoch": 0.9377749842866122, + "grad_norm": 0.33183619855544044, + "learning_rate": 8.685303747127736e-05, + "loss": 2.9136, + "step": 20142 + }, + { + "epoch": 0.9378215424727052, + "grad_norm": 0.3138948517092438, + "learning_rate": 8.685120678126747e-05, + "loss": 2.8398, + "step": 20143 + }, + { + "epoch": 0.9378681006587983, + "grad_norm": 0.311066935173311, + "learning_rate": 8.68493759831029e-05, + "loss": 2.8533, + "step": 20144 + }, + { + "epoch": 0.9379146588448914, + "grad_norm": 0.3293312689004181, + "learning_rate": 8.684754507678905e-05, + "loss": 2.9293, + "step": 20145 + }, + { + "epoch": 0.9379612170309845, + "grad_norm": 0.3212903251307173, + "learning_rate": 8.684571406233129e-05, + "loss": 2.8192, + "step": 20146 + }, + { + "epoch": 0.9380077752170776, + "grad_norm": 0.31277765005164576, + "learning_rate": 8.684388293973498e-05, + "loss": 2.884, + "step": 20147 + }, + { + "epoch": 0.9380543334031706, + "grad_norm": 0.3042457792093164, + "learning_rate": 8.68420517090055e-05, + "loss": 2.7794, + "step": 20148 + }, + { + "epoch": 0.9381008915892637, + "grad_norm": 0.31089275749001666, + "learning_rate": 8.684022037014822e-05, + "loss": 2.9107, + "step": 20149 + }, + { + "epoch": 0.9381474497753568, + "grad_norm": 0.3085517218819414, + "learning_rate": 8.683838892316856e-05, + "loss": 2.9522, + "step": 20150 + }, + { + "epoch": 0.9381940079614498, + "grad_norm": 0.31777780380366627, + "learning_rate": 8.683655736807183e-05, + "loss": 2.9111, + "step": 20151 + }, + { + "epoch": 0.9382405661475429, + "grad_norm": 0.30992698772070715, + "learning_rate": 8.683472570486345e-05, + "loss": 2.8658, + "step": 20152 + }, + { + "epoch": 0.9382871243336359, + "grad_norm": 0.32769438889525804, + "learning_rate": 8.683289393354874e-05, + "loss": 2.8165, + "step": 20153 + }, + { + "epoch": 0.938333682519729, + "grad_norm": 0.3190243355130294, + "learning_rate": 8.683106205413315e-05, + "loss": 2.8094, + "step": 20154 + }, + { + "epoch": 0.9383802407058222, + "grad_norm": 0.3599266810636752, + "learning_rate": 8.682923006662201e-05, + "loss": 3.01, + "step": 20155 + }, + { + "epoch": 0.9384267988919152, + "grad_norm": 0.3721298523580133, + "learning_rate": 8.682739797102071e-05, + "loss": 2.8371, + "step": 20156 + }, + { + "epoch": 0.9384733570780083, + "grad_norm": 0.35238390885704324, + "learning_rate": 8.682556576733463e-05, + "loss": 2.8627, + "step": 20157 + }, + { + "epoch": 0.9385199152641013, + "grad_norm": 0.343443063093354, + "learning_rate": 8.682373345556914e-05, + "loss": 2.828, + "step": 20158 + }, + { + "epoch": 0.9385664734501944, + "grad_norm": 0.3875963395396379, + "learning_rate": 8.682190103572962e-05, + "loss": 2.8662, + "step": 20159 + }, + { + "epoch": 0.9386130316362874, + "grad_norm": 0.3177990448341476, + "learning_rate": 8.682006850782144e-05, + "loss": 2.8671, + "step": 20160 + }, + { + "epoch": 0.9386595898223805, + "grad_norm": 0.4251062171827367, + "learning_rate": 
8.681823587184999e-05, + "loss": 2.8793, + "step": 20161 + }, + { + "epoch": 0.9387061480084736, + "grad_norm": 0.33272855382213634, + "learning_rate": 8.681640312782064e-05, + "loss": 2.9124, + "step": 20162 + }, + { + "epoch": 0.9387527061945666, + "grad_norm": 0.41291674387091, + "learning_rate": 8.68145702757388e-05, + "loss": 2.9572, + "step": 20163 + }, + { + "epoch": 0.9387992643806597, + "grad_norm": 0.32652369996227726, + "learning_rate": 8.681273731560979e-05, + "loss": 2.8918, + "step": 20164 + }, + { + "epoch": 0.9388458225667528, + "grad_norm": 0.3468647616185914, + "learning_rate": 8.681090424743904e-05, + "loss": 2.8701, + "step": 20165 + }, + { + "epoch": 0.9388923807528459, + "grad_norm": 0.32137357048592274, + "learning_rate": 8.680907107123191e-05, + "loss": 2.9099, + "step": 20166 + }, + { + "epoch": 0.938938938938939, + "grad_norm": 0.3529765070285872, + "learning_rate": 8.680723778699377e-05, + "loss": 2.9123, + "step": 20167 + }, + { + "epoch": 0.938985497125032, + "grad_norm": 0.3286184610624457, + "learning_rate": 8.680540439473004e-05, + "loss": 2.8815, + "step": 20168 + }, + { + "epoch": 0.9390320553111251, + "grad_norm": 0.3565659138345476, + "learning_rate": 8.680357089444606e-05, + "loss": 2.998, + "step": 20169 + }, + { + "epoch": 0.9390786134972181, + "grad_norm": 0.32959196861848716, + "learning_rate": 8.680173728614723e-05, + "loss": 2.9407, + "step": 20170 + }, + { + "epoch": 0.9391251716833112, + "grad_norm": 0.37343135754104795, + "learning_rate": 8.679990356983891e-05, + "loss": 2.8407, + "step": 20171 + }, + { + "epoch": 0.9391717298694043, + "grad_norm": 0.3152347524842838, + "learning_rate": 8.679806974552651e-05, + "loss": 2.8302, + "step": 20172 + }, + { + "epoch": 0.9392182880554973, + "grad_norm": 0.36211883379730625, + "learning_rate": 8.67962358132154e-05, + "loss": 2.9136, + "step": 20173 + }, + { + "epoch": 0.9392648462415905, + "grad_norm": 0.32396787816857914, + "learning_rate": 8.679440177291096e-05, + "loss": 2.9274, + "step": 20174 + }, + { + "epoch": 0.9393114044276835, + "grad_norm": 0.38397625882141806, + "learning_rate": 8.679256762461858e-05, + "loss": 2.9068, + "step": 20175 + }, + { + "epoch": 0.9393579626137766, + "grad_norm": 0.3476110165247001, + "learning_rate": 8.679073336834363e-05, + "loss": 2.9517, + "step": 20176 + }, + { + "epoch": 0.9394045207998697, + "grad_norm": 0.3475683402993469, + "learning_rate": 8.678889900409152e-05, + "loss": 2.8731, + "step": 20177 + }, + { + "epoch": 0.9394510789859627, + "grad_norm": 0.3581855336078241, + "learning_rate": 8.67870645318676e-05, + "loss": 2.9394, + "step": 20178 + }, + { + "epoch": 0.9394976371720558, + "grad_norm": 0.34624724680113345, + "learning_rate": 8.678522995167728e-05, + "loss": 2.8852, + "step": 20179 + }, + { + "epoch": 0.9395441953581488, + "grad_norm": 0.3821679277605568, + "learning_rate": 8.678339526352593e-05, + "loss": 2.8549, + "step": 20180 + }, + { + "epoch": 0.9395907535442419, + "grad_norm": 0.30422871352312847, + "learning_rate": 8.678156046741893e-05, + "loss": 2.9396, + "step": 20181 + }, + { + "epoch": 0.9396373117303349, + "grad_norm": 0.3489679226579369, + "learning_rate": 8.677972556336169e-05, + "loss": 2.8781, + "step": 20182 + }, + { + "epoch": 0.939683869916428, + "grad_norm": 0.3073016563156017, + "learning_rate": 8.677789055135957e-05, + "loss": 2.9217, + "step": 20183 + }, + { + "epoch": 0.9397304281025212, + "grad_norm": 0.34803394980795166, + "learning_rate": 8.677605543141796e-05, + "loss": 2.8812, + "step": 20184 + }, + { + "epoch": 
0.9397769862886142, + "grad_norm": 0.38944522132718956, + "learning_rate": 8.677422020354224e-05, + "loss": 3.0606, + "step": 20185 + }, + { + "epoch": 0.9398235444747073, + "grad_norm": 0.31633457497115514, + "learning_rate": 8.677238486773783e-05, + "loss": 2.8751, + "step": 20186 + }, + { + "epoch": 0.9398701026608003, + "grad_norm": 0.3883968900563819, + "learning_rate": 8.677054942401008e-05, + "loss": 2.8644, + "step": 20187 + }, + { + "epoch": 0.9399166608468934, + "grad_norm": 0.32392008106755177, + "learning_rate": 8.67687138723644e-05, + "loss": 2.968, + "step": 20188 + }, + { + "epoch": 0.9399632190329865, + "grad_norm": 0.3648139189942515, + "learning_rate": 8.676687821280616e-05, + "loss": 2.8452, + "step": 20189 + }, + { + "epoch": 0.9400097772190795, + "grad_norm": 0.32360647787048985, + "learning_rate": 8.676504244534077e-05, + "loss": 2.8441, + "step": 20190 + }, + { + "epoch": 0.9400563354051726, + "grad_norm": 0.3291320596263752, + "learning_rate": 8.676320656997358e-05, + "loss": 2.8682, + "step": 20191 + }, + { + "epoch": 0.9401028935912656, + "grad_norm": 0.3413699464095119, + "learning_rate": 8.676137058671e-05, + "loss": 2.7928, + "step": 20192 + }, + { + "epoch": 0.9401494517773588, + "grad_norm": 0.3162854824344575, + "learning_rate": 8.675953449555543e-05, + "loss": 2.8395, + "step": 20193 + }, + { + "epoch": 0.9401960099634519, + "grad_norm": 0.33332850035595085, + "learning_rate": 8.675769829651525e-05, + "loss": 2.9291, + "step": 20194 + }, + { + "epoch": 0.9402425681495449, + "grad_norm": 0.3050572284171962, + "learning_rate": 8.675586198959485e-05, + "loss": 2.8792, + "step": 20195 + }, + { + "epoch": 0.940289126335638, + "grad_norm": 0.34770442660986456, + "learning_rate": 8.675402557479959e-05, + "loss": 2.9306, + "step": 20196 + }, + { + "epoch": 0.940335684521731, + "grad_norm": 0.308971019368144, + "learning_rate": 8.675218905213491e-05, + "loss": 2.8903, + "step": 20197 + }, + { + "epoch": 0.9403822427078241, + "grad_norm": 0.34899763194349404, + "learning_rate": 8.675035242160616e-05, + "loss": 2.928, + "step": 20198 + }, + { + "epoch": 0.9404288008939172, + "grad_norm": 0.31407880691248286, + "learning_rate": 8.674851568321877e-05, + "loss": 2.9633, + "step": 20199 + }, + { + "epoch": 0.9404753590800102, + "grad_norm": 0.35245252646021474, + "learning_rate": 8.674667883697808e-05, + "loss": 2.732, + "step": 20200 + }, + { + "epoch": 0.9405219172661033, + "grad_norm": 0.3155567899584545, + "learning_rate": 8.674484188288953e-05, + "loss": 2.9035, + "step": 20201 + }, + { + "epoch": 0.9405684754521964, + "grad_norm": 0.31100493676366364, + "learning_rate": 8.674300482095847e-05, + "loss": 2.8414, + "step": 20202 + }, + { + "epoch": 0.9406150336382895, + "grad_norm": 0.3326673737885565, + "learning_rate": 8.674116765119034e-05, + "loss": 2.9079, + "step": 20203 + }, + { + "epoch": 0.9406615918243825, + "grad_norm": 0.34159586686872523, + "learning_rate": 8.673933037359047e-05, + "loss": 2.7687, + "step": 20204 + }, + { + "epoch": 0.9407081500104756, + "grad_norm": 0.3161089690379441, + "learning_rate": 8.67374929881643e-05, + "loss": 2.7896, + "step": 20205 + }, + { + "epoch": 0.9407547081965687, + "grad_norm": 0.3355985536150548, + "learning_rate": 8.67356554949172e-05, + "loss": 2.8249, + "step": 20206 + }, + { + "epoch": 0.9408012663826617, + "grad_norm": 0.33259533591235, + "learning_rate": 8.673381789385457e-05, + "loss": 2.9087, + "step": 20207 + }, + { + "epoch": 0.9408478245687548, + "grad_norm": 0.3139080979866641, + "learning_rate": 
8.673198018498183e-05, + "loss": 2.8608, + "step": 20208 + }, + { + "epoch": 0.9408943827548478, + "grad_norm": 0.3242205313857756, + "learning_rate": 8.673014236830431e-05, + "loss": 2.9482, + "step": 20209 + }, + { + "epoch": 0.9409409409409409, + "grad_norm": 0.3558393433544276, + "learning_rate": 8.672830444382746e-05, + "loss": 2.9546, + "step": 20210 + }, + { + "epoch": 0.940987499127034, + "grad_norm": 0.31562809793451174, + "learning_rate": 8.672646641155666e-05, + "loss": 2.8397, + "step": 20211 + }, + { + "epoch": 0.9410340573131271, + "grad_norm": 0.3284009120595451, + "learning_rate": 8.672462827149728e-05, + "loss": 2.848, + "step": 20212 + }, + { + "epoch": 0.9410806154992202, + "grad_norm": 0.31449921852203205, + "learning_rate": 8.672279002365476e-05, + "loss": 2.8914, + "step": 20213 + }, + { + "epoch": 0.9411271736853132, + "grad_norm": 0.32583220814950964, + "learning_rate": 8.672095166803444e-05, + "loss": 2.7558, + "step": 20214 + }, + { + "epoch": 0.9411737318714063, + "grad_norm": 0.35455403097449023, + "learning_rate": 8.671911320464175e-05, + "loss": 2.8825, + "step": 20215 + }, + { + "epoch": 0.9412202900574994, + "grad_norm": 0.3867036900398624, + "learning_rate": 8.67172746334821e-05, + "loss": 2.8697, + "step": 20216 + }, + { + "epoch": 0.9412668482435924, + "grad_norm": 0.36320013814343416, + "learning_rate": 8.671543595456085e-05, + "loss": 2.826, + "step": 20217 + }, + { + "epoch": 0.9413134064296855, + "grad_norm": 0.4350189159539409, + "learning_rate": 8.671359716788342e-05, + "loss": 2.8344, + "step": 20218 + }, + { + "epoch": 0.9413599646157785, + "grad_norm": 0.357221021977985, + "learning_rate": 8.671175827345519e-05, + "loss": 2.855, + "step": 20219 + }, + { + "epoch": 0.9414065228018716, + "grad_norm": 0.40165945907142153, + "learning_rate": 8.670991927128157e-05, + "loss": 2.896, + "step": 20220 + }, + { + "epoch": 0.9414530809879648, + "grad_norm": 0.3977582803929582, + "learning_rate": 8.670808016136795e-05, + "loss": 2.9342, + "step": 20221 + }, + { + "epoch": 0.9414996391740578, + "grad_norm": 0.39388488722378845, + "learning_rate": 8.670624094371973e-05, + "loss": 2.7891, + "step": 20222 + }, + { + "epoch": 0.9415461973601509, + "grad_norm": 0.40255405221125, + "learning_rate": 8.670440161834232e-05, + "loss": 2.8222, + "step": 20223 + }, + { + "epoch": 0.9415927555462439, + "grad_norm": 0.34624639195197543, + "learning_rate": 8.67025621852411e-05, + "loss": 2.9597, + "step": 20224 + }, + { + "epoch": 0.941639313732337, + "grad_norm": 0.40580045903139206, + "learning_rate": 8.670072264442148e-05, + "loss": 3.0511, + "step": 20225 + }, + { + "epoch": 0.94168587191843, + "grad_norm": 0.3272293925709813, + "learning_rate": 8.669888299588887e-05, + "loss": 2.9532, + "step": 20226 + }, + { + "epoch": 0.9417324301045231, + "grad_norm": 0.34651700332076985, + "learning_rate": 8.669704323964862e-05, + "loss": 2.9028, + "step": 20227 + }, + { + "epoch": 0.9417789882906162, + "grad_norm": 0.3167941944770339, + "learning_rate": 8.669520337570618e-05, + "loss": 2.9504, + "step": 20228 + }, + { + "epoch": 0.9418255464767092, + "grad_norm": 0.3575104179812513, + "learning_rate": 8.669336340406693e-05, + "loss": 2.86, + "step": 20229 + }, + { + "epoch": 0.9418721046628024, + "grad_norm": 0.3063772512839356, + "learning_rate": 8.669152332473629e-05, + "loss": 2.9664, + "step": 20230 + }, + { + "epoch": 0.9419186628488954, + "grad_norm": 0.3417498958990275, + "learning_rate": 8.668968313771963e-05, + "loss": 2.8964, + "step": 20231 + }, + { + "epoch": 
0.9419652210349885, + "grad_norm": 0.34391388416587676, + "learning_rate": 8.668784284302236e-05, + "loss": 2.8149, + "step": 20232 + }, + { + "epoch": 0.9420117792210816, + "grad_norm": 0.33663704506144404, + "learning_rate": 8.66860024406499e-05, + "loss": 2.9467, + "step": 20233 + }, + { + "epoch": 0.9420583374071746, + "grad_norm": 0.3247726269160549, + "learning_rate": 8.668416193060762e-05, + "loss": 2.8371, + "step": 20234 + }, + { + "epoch": 0.9421048955932677, + "grad_norm": 0.37387154872244277, + "learning_rate": 8.668232131290095e-05, + "loss": 2.9277, + "step": 20235 + }, + { + "epoch": 0.9421514537793607, + "grad_norm": 0.34659317029007136, + "learning_rate": 8.668048058753528e-05, + "loss": 2.7694, + "step": 20236 + }, + { + "epoch": 0.9421980119654538, + "grad_norm": 0.35412297002287596, + "learning_rate": 8.6678639754516e-05, + "loss": 2.826, + "step": 20237 + }, + { + "epoch": 0.942244570151547, + "grad_norm": 0.3177792642810221, + "learning_rate": 8.667679881384853e-05, + "loss": 2.8818, + "step": 20238 + }, + { + "epoch": 0.94229112833764, + "grad_norm": 0.33617463121740454, + "learning_rate": 8.667495776553828e-05, + "loss": 2.8875, + "step": 20239 + }, + { + "epoch": 0.9423376865237331, + "grad_norm": 0.3433663948020943, + "learning_rate": 8.667311660959064e-05, + "loss": 2.8408, + "step": 20240 + }, + { + "epoch": 0.9423842447098261, + "grad_norm": 0.33509189269643386, + "learning_rate": 8.667127534601102e-05, + "loss": 2.8751, + "step": 20241 + }, + { + "epoch": 0.9424308028959192, + "grad_norm": 0.34513098000544246, + "learning_rate": 8.66694339748048e-05, + "loss": 2.8211, + "step": 20242 + }, + { + "epoch": 0.9424773610820123, + "grad_norm": 0.30836411686125703, + "learning_rate": 8.666759249597743e-05, + "loss": 2.8895, + "step": 20243 + }, + { + "epoch": 0.9425239192681053, + "grad_norm": 0.3310100745137337, + "learning_rate": 8.666575090953426e-05, + "loss": 2.8065, + "step": 20244 + }, + { + "epoch": 0.9425704774541984, + "grad_norm": 0.311505455261875, + "learning_rate": 8.666390921548076e-05, + "loss": 2.9523, + "step": 20245 + }, + { + "epoch": 0.9426170356402914, + "grad_norm": 0.352619206979504, + "learning_rate": 8.666206741382225e-05, + "loss": 2.9061, + "step": 20246 + }, + { + "epoch": 0.9426635938263845, + "grad_norm": 0.32043971809868427, + "learning_rate": 8.666022550456423e-05, + "loss": 2.9581, + "step": 20247 + }, + { + "epoch": 0.9427101520124775, + "grad_norm": 0.35198464776726823, + "learning_rate": 8.665838348771203e-05, + "loss": 2.9321, + "step": 20248 + }, + { + "epoch": 0.9427567101985707, + "grad_norm": 0.32857850231082025, + "learning_rate": 8.66565413632711e-05, + "loss": 2.9544, + "step": 20249 + }, + { + "epoch": 0.9428032683846638, + "grad_norm": 0.31912722236860797, + "learning_rate": 8.665469913124682e-05, + "loss": 2.9117, + "step": 20250 + }, + { + "epoch": 0.9428498265707568, + "grad_norm": 0.313293677315752, + "learning_rate": 8.665285679164463e-05, + "loss": 2.9409, + "step": 20251 + }, + { + "epoch": 0.9428963847568499, + "grad_norm": 0.3211416795184152, + "learning_rate": 8.665101434446989e-05, + "loss": 2.8836, + "step": 20252 + }, + { + "epoch": 0.9429429429429429, + "grad_norm": 0.3194116564297267, + "learning_rate": 8.664917178972805e-05, + "loss": 2.9432, + "step": 20253 + }, + { + "epoch": 0.942989501129036, + "grad_norm": 0.3165268047118167, + "learning_rate": 8.66473291274245e-05, + "loss": 2.8444, + "step": 20254 + }, + { + "epoch": 0.9430360593151291, + "grad_norm": 0.3147079023689929, + "learning_rate": 
8.664548635756464e-05, + "loss": 2.9392, + "step": 20255 + }, + { + "epoch": 0.9430826175012221, + "grad_norm": 0.35521044393891515, + "learning_rate": 8.66436434801539e-05, + "loss": 2.9985, + "step": 20256 + }, + { + "epoch": 0.9431291756873152, + "grad_norm": 0.3083954567406595, + "learning_rate": 8.664180049519767e-05, + "loss": 2.8275, + "step": 20257 + }, + { + "epoch": 0.9431757338734083, + "grad_norm": 0.35807406251235746, + "learning_rate": 8.663995740270137e-05, + "loss": 2.9252, + "step": 20258 + }, + { + "epoch": 0.9432222920595014, + "grad_norm": 0.32879652586897484, + "learning_rate": 8.663811420267041e-05, + "loss": 2.8851, + "step": 20259 + }, + { + "epoch": 0.9432688502455945, + "grad_norm": 0.3367197858849987, + "learning_rate": 8.663627089511018e-05, + "loss": 2.8998, + "step": 20260 + }, + { + "epoch": 0.9433154084316875, + "grad_norm": 0.3161804876077013, + "learning_rate": 8.66344274800261e-05, + "loss": 2.8494, + "step": 20261 + }, + { + "epoch": 0.9433619666177806, + "grad_norm": 0.33791425883954856, + "learning_rate": 8.66325839574236e-05, + "loss": 2.8726, + "step": 20262 + }, + { + "epoch": 0.9434085248038736, + "grad_norm": 0.32464569772143625, + "learning_rate": 8.663074032730807e-05, + "loss": 2.8826, + "step": 20263 + }, + { + "epoch": 0.9434550829899667, + "grad_norm": 0.3694235973880044, + "learning_rate": 8.662889658968493e-05, + "loss": 2.8314, + "step": 20264 + }, + { + "epoch": 0.9435016411760598, + "grad_norm": 0.35272778228748536, + "learning_rate": 8.662705274455958e-05, + "loss": 2.934, + "step": 20265 + }, + { + "epoch": 0.9435481993621528, + "grad_norm": 0.3624055254951853, + "learning_rate": 8.662520879193744e-05, + "loss": 2.829, + "step": 20266 + }, + { + "epoch": 0.943594757548246, + "grad_norm": 0.3615871504290034, + "learning_rate": 8.662336473182393e-05, + "loss": 2.8779, + "step": 20267 + }, + { + "epoch": 0.943641315734339, + "grad_norm": 0.3290653603463936, + "learning_rate": 8.662152056422444e-05, + "loss": 2.9397, + "step": 20268 + }, + { + "epoch": 0.9436878739204321, + "grad_norm": 0.3890976237349072, + "learning_rate": 8.66196762891444e-05, + "loss": 2.8671, + "step": 20269 + }, + { + "epoch": 0.9437344321065251, + "grad_norm": 0.33951827161862175, + "learning_rate": 8.66178319065892e-05, + "loss": 2.9353, + "step": 20270 + }, + { + "epoch": 0.9437809902926182, + "grad_norm": 0.37209733304646275, + "learning_rate": 8.661598741656431e-05, + "loss": 2.9153, + "step": 20271 + }, + { + "epoch": 0.9438275484787113, + "grad_norm": 0.3456997992903625, + "learning_rate": 8.661414281907507e-05, + "loss": 2.7822, + "step": 20272 + }, + { + "epoch": 0.9438741066648043, + "grad_norm": 0.35928645017672806, + "learning_rate": 8.661229811412695e-05, + "loss": 2.901, + "step": 20273 + }, + { + "epoch": 0.9439206648508974, + "grad_norm": 0.32801664028429595, + "learning_rate": 8.661045330172532e-05, + "loss": 2.8908, + "step": 20274 + }, + { + "epoch": 0.9439672230369904, + "grad_norm": 0.37259613043777806, + "learning_rate": 8.660860838187563e-05, + "loss": 2.8882, + "step": 20275 + }, + { + "epoch": 0.9440137812230835, + "grad_norm": 0.30904945728455935, + "learning_rate": 8.660676335458328e-05, + "loss": 2.8588, + "step": 20276 + }, + { + "epoch": 0.9440603394091767, + "grad_norm": 0.36270663490406574, + "learning_rate": 8.660491821985368e-05, + "loss": 2.8884, + "step": 20277 + }, + { + "epoch": 0.9441068975952697, + "grad_norm": 0.3558747587744571, + "learning_rate": 8.660307297769226e-05, + "loss": 2.8812, + "step": 20278 + }, + { + "epoch": 
0.9441534557813628, + "grad_norm": 0.36019811957440756, + "learning_rate": 8.660122762810443e-05, + "loss": 2.9021, + "step": 20279 + }, + { + "epoch": 0.9442000139674558, + "grad_norm": 0.32069920209317043, + "learning_rate": 8.65993821710956e-05, + "loss": 2.914, + "step": 20280 + }, + { + "epoch": 0.9442465721535489, + "grad_norm": 0.33388688641940995, + "learning_rate": 8.659753660667118e-05, + "loss": 2.9051, + "step": 20281 + }, + { + "epoch": 0.944293130339642, + "grad_norm": 0.3036175392683218, + "learning_rate": 8.659569093483659e-05, + "loss": 2.832, + "step": 20282 + }, + { + "epoch": 0.944339688525735, + "grad_norm": 0.3192736438082126, + "learning_rate": 8.659384515559727e-05, + "loss": 2.8275, + "step": 20283 + }, + { + "epoch": 0.9443862467118281, + "grad_norm": 0.3172902417928898, + "learning_rate": 8.65919992689586e-05, + "loss": 2.8156, + "step": 20284 + }, + { + "epoch": 0.9444328048979211, + "grad_norm": 0.3308516807469689, + "learning_rate": 8.659015327492603e-05, + "loss": 2.9374, + "step": 20285 + }, + { + "epoch": 0.9444793630840143, + "grad_norm": 0.2961462267902758, + "learning_rate": 8.658830717350497e-05, + "loss": 2.8152, + "step": 20286 + }, + { + "epoch": 0.9445259212701074, + "grad_norm": 0.3273107562816921, + "learning_rate": 8.658646096470082e-05, + "loss": 2.9168, + "step": 20287 + }, + { + "epoch": 0.9445724794562004, + "grad_norm": 0.3415712023592362, + "learning_rate": 8.658461464851902e-05, + "loss": 2.9637, + "step": 20288 + }, + { + "epoch": 0.9446190376422935, + "grad_norm": 0.30711446371359286, + "learning_rate": 8.658276822496498e-05, + "loss": 2.9204, + "step": 20289 + }, + { + "epoch": 0.9446655958283865, + "grad_norm": 0.358955801367083, + "learning_rate": 8.658092169404413e-05, + "loss": 2.8075, + "step": 20290 + }, + { + "epoch": 0.9447121540144796, + "grad_norm": 0.3329730785588172, + "learning_rate": 8.657907505576186e-05, + "loss": 2.8277, + "step": 20291 + }, + { + "epoch": 0.9447587122005726, + "grad_norm": 0.3333418294935756, + "learning_rate": 8.657722831012361e-05, + "loss": 2.8593, + "step": 20292 + }, + { + "epoch": 0.9448052703866657, + "grad_norm": 0.3471277140582787, + "learning_rate": 8.65753814571348e-05, + "loss": 2.8856, + "step": 20293 + }, + { + "epoch": 0.9448518285727588, + "grad_norm": 0.3287760420946097, + "learning_rate": 8.657353449680086e-05, + "loss": 2.9192, + "step": 20294 + }, + { + "epoch": 0.9448983867588518, + "grad_norm": 0.3471826766784874, + "learning_rate": 8.65716874291272e-05, + "loss": 2.8284, + "step": 20295 + }, + { + "epoch": 0.944944944944945, + "grad_norm": 0.37984658768974844, + "learning_rate": 8.656984025411923e-05, + "loss": 2.948, + "step": 20296 + }, + { + "epoch": 0.944991503131038, + "grad_norm": 0.3525130686391535, + "learning_rate": 8.656799297178238e-05, + "loss": 2.9291, + "step": 20297 + }, + { + "epoch": 0.9450380613171311, + "grad_norm": 0.3628005922406484, + "learning_rate": 8.656614558212207e-05, + "loss": 2.9122, + "step": 20298 + }, + { + "epoch": 0.9450846195032242, + "grad_norm": 0.2962240632525539, + "learning_rate": 8.656429808514372e-05, + "loss": 2.8263, + "step": 20299 + }, + { + "epoch": 0.9451311776893172, + "grad_norm": 0.349201759366419, + "learning_rate": 8.656245048085279e-05, + "loss": 2.8363, + "step": 20300 + }, + { + "epoch": 0.9451777358754103, + "grad_norm": 0.32567963110246345, + "learning_rate": 8.656060276925463e-05, + "loss": 2.7914, + "step": 20301 + }, + { + "epoch": 0.9452242940615033, + "grad_norm": 0.35193572260787215, + "learning_rate": 
8.655875495035472e-05, + "loss": 2.8725, + "step": 20302 + }, + { + "epoch": 0.9452708522475964, + "grad_norm": 0.3280772849274452, + "learning_rate": 8.655690702415849e-05, + "loss": 2.791, + "step": 20303 + }, + { + "epoch": 0.9453174104336896, + "grad_norm": 0.32159170934213344, + "learning_rate": 8.655505899067132e-05, + "loss": 2.9039, + "step": 20304 + }, + { + "epoch": 0.9453639686197826, + "grad_norm": 0.33551394134150375, + "learning_rate": 8.655321084989864e-05, + "loss": 3.0115, + "step": 20305 + }, + { + "epoch": 0.9454105268058757, + "grad_norm": 0.3424290432418537, + "learning_rate": 8.65513626018459e-05, + "loss": 2.9195, + "step": 20306 + }, + { + "epoch": 0.9454570849919687, + "grad_norm": 0.32247013839865446, + "learning_rate": 8.65495142465185e-05, + "loss": 2.8441, + "step": 20307 + }, + { + "epoch": 0.9455036431780618, + "grad_norm": 0.360899959780792, + "learning_rate": 8.654766578392189e-05, + "loss": 2.8982, + "step": 20308 + }, + { + "epoch": 0.9455502013641549, + "grad_norm": 0.3198954251852149, + "learning_rate": 8.654581721406147e-05, + "loss": 2.8143, + "step": 20309 + }, + { + "epoch": 0.9455967595502479, + "grad_norm": 0.3961440124464356, + "learning_rate": 8.654396853694269e-05, + "loss": 2.9884, + "step": 20310 + }, + { + "epoch": 0.945643317736341, + "grad_norm": 0.33683928710029243, + "learning_rate": 8.654211975257095e-05, + "loss": 2.903, + "step": 20311 + }, + { + "epoch": 0.945689875922434, + "grad_norm": 0.33571020616315067, + "learning_rate": 8.654027086095169e-05, + "loss": 2.8626, + "step": 20312 + }, + { + "epoch": 0.9457364341085271, + "grad_norm": 0.3646308725129704, + "learning_rate": 8.653842186209034e-05, + "loss": 2.8729, + "step": 20313 + }, + { + "epoch": 0.9457829922946202, + "grad_norm": 0.3339209472256695, + "learning_rate": 8.653657275599232e-05, + "loss": 2.8122, + "step": 20314 + }, + { + "epoch": 0.9458295504807133, + "grad_norm": 0.3351909592695439, + "learning_rate": 8.653472354266305e-05, + "loss": 2.9182, + "step": 20315 + }, + { + "epoch": 0.9458761086668064, + "grad_norm": 0.32619770517713337, + "learning_rate": 8.653287422210799e-05, + "loss": 2.8744, + "step": 20316 + }, + { + "epoch": 0.9459226668528994, + "grad_norm": 0.3714002843825717, + "learning_rate": 8.653102479433252e-05, + "loss": 2.8531, + "step": 20317 + }, + { + "epoch": 0.9459692250389925, + "grad_norm": 0.31480072747179577, + "learning_rate": 8.65291752593421e-05, + "loss": 2.9359, + "step": 20318 + }, + { + "epoch": 0.9460157832250855, + "grad_norm": 0.3830939947292037, + "learning_rate": 8.652732561714216e-05, + "loss": 2.9368, + "step": 20319 + }, + { + "epoch": 0.9460623414111786, + "grad_norm": 0.32071921404929077, + "learning_rate": 8.65254758677381e-05, + "loss": 2.7612, + "step": 20320 + }, + { + "epoch": 0.9461088995972717, + "grad_norm": 0.370553082720949, + "learning_rate": 8.652362601113537e-05, + "loss": 2.8546, + "step": 20321 + }, + { + "epoch": 0.9461554577833647, + "grad_norm": 0.32682871272091324, + "learning_rate": 8.65217760473394e-05, + "loss": 2.8705, + "step": 20322 + }, + { + "epoch": 0.9462020159694579, + "grad_norm": 0.3374852495712763, + "learning_rate": 8.651992597635561e-05, + "loss": 2.9649, + "step": 20323 + }, + { + "epoch": 0.9462485741555509, + "grad_norm": 0.35952647433980045, + "learning_rate": 8.651807579818945e-05, + "loss": 2.8269, + "step": 20324 + }, + { + "epoch": 0.946295132341644, + "grad_norm": 0.32707946145500005, + "learning_rate": 8.651622551284631e-05, + "loss": 2.8217, + "step": 20325 + }, + { + "epoch": 
0.9463416905277371, + "grad_norm": 0.3382479316094344, + "learning_rate": 8.651437512033167e-05, + "loss": 2.9192, + "step": 20326 + }, + { + "epoch": 0.9463882487138301, + "grad_norm": 0.38255702114945206, + "learning_rate": 8.651252462065092e-05, + "loss": 2.8563, + "step": 20327 + }, + { + "epoch": 0.9464348068999232, + "grad_norm": 0.3278620478582493, + "learning_rate": 8.651067401380952e-05, + "loss": 2.9072, + "step": 20328 + }, + { + "epoch": 0.9464813650860162, + "grad_norm": 0.3361128615179421, + "learning_rate": 8.650882329981287e-05, + "loss": 2.8995, + "step": 20329 + }, + { + "epoch": 0.9465279232721093, + "grad_norm": 0.3525817779564385, + "learning_rate": 8.650697247866644e-05, + "loss": 2.9485, + "step": 20330 + }, + { + "epoch": 0.9465744814582024, + "grad_norm": 0.33396958432066687, + "learning_rate": 8.650512155037564e-05, + "loss": 2.9162, + "step": 20331 + }, + { + "epoch": 0.9466210396442954, + "grad_norm": 0.30790974642923097, + "learning_rate": 8.650327051494589e-05, + "loss": 2.7252, + "step": 20332 + }, + { + "epoch": 0.9466675978303886, + "grad_norm": 0.35156979404111316, + "learning_rate": 8.650141937238265e-05, + "loss": 2.8042, + "step": 20333 + }, + { + "epoch": 0.9467141560164816, + "grad_norm": 0.34301442019980893, + "learning_rate": 8.649956812269133e-05, + "loss": 2.9018, + "step": 20334 + }, + { + "epoch": 0.9467607142025747, + "grad_norm": 0.29783899714650547, + "learning_rate": 8.649771676587739e-05, + "loss": 2.8307, + "step": 20335 + }, + { + "epoch": 0.9468072723886677, + "grad_norm": 0.3629839547399186, + "learning_rate": 8.649586530194623e-05, + "loss": 2.8476, + "step": 20336 + }, + { + "epoch": 0.9468538305747608, + "grad_norm": 0.29643388885808497, + "learning_rate": 8.649401373090331e-05, + "loss": 2.8202, + "step": 20337 + }, + { + "epoch": 0.9469003887608539, + "grad_norm": 0.3201682219578519, + "learning_rate": 8.649216205275405e-05, + "loss": 2.91, + "step": 20338 + }, + { + "epoch": 0.9469469469469469, + "grad_norm": 0.3257433259186058, + "learning_rate": 8.649031026750389e-05, + "loss": 2.9536, + "step": 20339 + }, + { + "epoch": 0.94699350513304, + "grad_norm": 0.306598597214474, + "learning_rate": 8.648845837515827e-05, + "loss": 2.9474, + "step": 20340 + }, + { + "epoch": 0.947040063319133, + "grad_norm": 0.31597586022195817, + "learning_rate": 8.648660637572261e-05, + "loss": 2.8157, + "step": 20341 + }, + { + "epoch": 0.9470866215052262, + "grad_norm": 0.3256466102255213, + "learning_rate": 8.648475426920237e-05, + "loss": 2.8474, + "step": 20342 + }, + { + "epoch": 0.9471331796913193, + "grad_norm": 0.3361238406621835, + "learning_rate": 8.648290205560296e-05, + "loss": 2.784, + "step": 20343 + }, + { + "epoch": 0.9471797378774123, + "grad_norm": 0.31459977338925965, + "learning_rate": 8.648104973492983e-05, + "loss": 2.8549, + "step": 20344 + }, + { + "epoch": 0.9472262960635054, + "grad_norm": 0.3312145969499273, + "learning_rate": 8.64791973071884e-05, + "loss": 2.8411, + "step": 20345 + }, + { + "epoch": 0.9472728542495984, + "grad_norm": 0.3084397154410071, + "learning_rate": 8.647734477238414e-05, + "loss": 2.9411, + "step": 20346 + }, + { + "epoch": 0.9473194124356915, + "grad_norm": 0.334372581936022, + "learning_rate": 8.647549213052245e-05, + "loss": 2.7947, + "step": 20347 + }, + { + "epoch": 0.9473659706217846, + "grad_norm": 0.3635910103991604, + "learning_rate": 8.64736393816088e-05, + "loss": 2.9695, + "step": 20348 + }, + { + "epoch": 0.9474125288078776, + "grad_norm": 0.3380612673390451, + "learning_rate": 
8.64717865256486e-05, + "loss": 2.8842, + "step": 20349 + }, + { + "epoch": 0.9474590869939707, + "grad_norm": 0.36795517781523485, + "learning_rate": 8.64699335626473e-05, + "loss": 2.9515, + "step": 20350 + }, + { + "epoch": 0.9475056451800637, + "grad_norm": 0.32382682353678455, + "learning_rate": 8.646808049261034e-05, + "loss": 2.6929, + "step": 20351 + }, + { + "epoch": 0.9475522033661569, + "grad_norm": 0.3375657107948393, + "learning_rate": 8.646622731554316e-05, + "loss": 2.9248, + "step": 20352 + }, + { + "epoch": 0.94759876155225, + "grad_norm": 0.33834955101107245, + "learning_rate": 8.646437403145119e-05, + "loss": 2.8675, + "step": 20353 + }, + { + "epoch": 0.947645319738343, + "grad_norm": 0.3188393798919577, + "learning_rate": 8.646252064033987e-05, + "loss": 2.8934, + "step": 20354 + }, + { + "epoch": 0.9476918779244361, + "grad_norm": 0.3826509202889653, + "learning_rate": 8.646066714221466e-05, + "loss": 2.7774, + "step": 20355 + }, + { + "epoch": 0.9477384361105291, + "grad_norm": 0.3747838429150091, + "learning_rate": 8.645881353708097e-05, + "loss": 2.9109, + "step": 20356 + }, + { + "epoch": 0.9477849942966222, + "grad_norm": 0.3415762226552465, + "learning_rate": 8.645695982494426e-05, + "loss": 2.9596, + "step": 20357 + }, + { + "epoch": 0.9478315524827152, + "grad_norm": 0.37719381136659913, + "learning_rate": 8.645510600580996e-05, + "loss": 2.8565, + "step": 20358 + }, + { + "epoch": 0.9478781106688083, + "grad_norm": 0.36766130398361735, + "learning_rate": 8.645325207968352e-05, + "loss": 2.9282, + "step": 20359 + }, + { + "epoch": 0.9479246688549015, + "grad_norm": 0.35090789281219364, + "learning_rate": 8.645139804657037e-05, + "loss": 2.9427, + "step": 20360 + }, + { + "epoch": 0.9479712270409945, + "grad_norm": 0.3599603762595727, + "learning_rate": 8.644954390647597e-05, + "loss": 2.8632, + "step": 20361 + }, + { + "epoch": 0.9480177852270876, + "grad_norm": 0.36049464223234734, + "learning_rate": 8.644768965940574e-05, + "loss": 2.9086, + "step": 20362 + }, + { + "epoch": 0.9480643434131806, + "grad_norm": 0.3755208977501536, + "learning_rate": 8.644583530536514e-05, + "loss": 2.9338, + "step": 20363 + }, + { + "epoch": 0.9481109015992737, + "grad_norm": 0.34476152969531626, + "learning_rate": 8.644398084435959e-05, + "loss": 2.8625, + "step": 20364 + }, + { + "epoch": 0.9481574597853668, + "grad_norm": 0.36836759715747813, + "learning_rate": 8.644212627639456e-05, + "loss": 2.8722, + "step": 20365 + }, + { + "epoch": 0.9482040179714598, + "grad_norm": 0.3418636882495387, + "learning_rate": 8.644027160147547e-05, + "loss": 2.9663, + "step": 20366 + }, + { + "epoch": 0.9482505761575529, + "grad_norm": 0.34698381642670756, + "learning_rate": 8.643841681960777e-05, + "loss": 2.8763, + "step": 20367 + }, + { + "epoch": 0.9482971343436459, + "grad_norm": 0.3625570576920368, + "learning_rate": 8.643656193079692e-05, + "loss": 2.8631, + "step": 20368 + }, + { + "epoch": 0.948343692529739, + "grad_norm": 0.3325984869503853, + "learning_rate": 8.643470693504833e-05, + "loss": 2.7519, + "step": 20369 + }, + { + "epoch": 0.9483902507158322, + "grad_norm": 0.3644070686939187, + "learning_rate": 8.643285183236749e-05, + "loss": 2.8459, + "step": 20370 + }, + { + "epoch": 0.9484368089019252, + "grad_norm": 0.35113174538511616, + "learning_rate": 8.643099662275981e-05, + "loss": 2.9616, + "step": 20371 + }, + { + "epoch": 0.9484833670880183, + "grad_norm": 0.42421013276595787, + "learning_rate": 8.642914130623075e-05, + "loss": 2.9525, + "step": 20372 + }, + { + 
"epoch": 0.9485299252741113, + "grad_norm": 0.31985494934620834, + "learning_rate": 8.642728588278572e-05, + "loss": 2.8365, + "step": 20373 + }, + { + "epoch": 0.9485764834602044, + "grad_norm": 0.4078625707538418, + "learning_rate": 8.642543035243022e-05, + "loss": 2.8783, + "step": 20374 + }, + { + "epoch": 0.9486230416462975, + "grad_norm": 0.3362065116912404, + "learning_rate": 8.642357471516967e-05, + "loss": 2.8021, + "step": 20375 + }, + { + "epoch": 0.9486695998323905, + "grad_norm": 0.3735404659276533, + "learning_rate": 8.64217189710095e-05, + "loss": 2.8566, + "step": 20376 + }, + { + "epoch": 0.9487161580184836, + "grad_norm": 0.35409092601262804, + "learning_rate": 8.64198631199552e-05, + "loss": 2.8181, + "step": 20377 + }, + { + "epoch": 0.9487627162045766, + "grad_norm": 0.3182605477223161, + "learning_rate": 8.641800716201217e-05, + "loss": 2.8881, + "step": 20378 + }, + { + "epoch": 0.9488092743906698, + "grad_norm": 0.33523359740225595, + "learning_rate": 8.641615109718588e-05, + "loss": 2.8463, + "step": 20379 + }, + { + "epoch": 0.9488558325767628, + "grad_norm": 0.3518835809776858, + "learning_rate": 8.641429492548177e-05, + "loss": 2.8644, + "step": 20380 + }, + { + "epoch": 0.9489023907628559, + "grad_norm": 0.3311145384342289, + "learning_rate": 8.64124386469053e-05, + "loss": 2.9291, + "step": 20381 + }, + { + "epoch": 0.948948948948949, + "grad_norm": 0.3430502974396447, + "learning_rate": 8.64105822614619e-05, + "loss": 2.8975, + "step": 20382 + }, + { + "epoch": 0.948995507135042, + "grad_norm": 0.3001235336931503, + "learning_rate": 8.640872576915704e-05, + "loss": 2.7566, + "step": 20383 + }, + { + "epoch": 0.9490420653211351, + "grad_norm": 0.35622652339075744, + "learning_rate": 8.640686916999615e-05, + "loss": 2.877, + "step": 20384 + }, + { + "epoch": 0.9490886235072281, + "grad_norm": 0.3083715191442855, + "learning_rate": 8.640501246398469e-05, + "loss": 2.8212, + "step": 20385 + }, + { + "epoch": 0.9491351816933212, + "grad_norm": 0.3646873636497043, + "learning_rate": 8.640315565112811e-05, + "loss": 2.884, + "step": 20386 + }, + { + "epoch": 0.9491817398794143, + "grad_norm": 0.3648077260495933, + "learning_rate": 8.640129873143183e-05, + "loss": 2.9668, + "step": 20387 + }, + { + "epoch": 0.9492282980655073, + "grad_norm": 0.3505308826461514, + "learning_rate": 8.639944170490136e-05, + "loss": 2.9056, + "step": 20388 + }, + { + "epoch": 0.9492748562516005, + "grad_norm": 0.37693794054154484, + "learning_rate": 8.639758457154209e-05, + "loss": 2.9161, + "step": 20389 + }, + { + "epoch": 0.9493214144376935, + "grad_norm": 0.37266364471509494, + "learning_rate": 8.63957273313595e-05, + "loss": 2.8493, + "step": 20390 + }, + { + "epoch": 0.9493679726237866, + "grad_norm": 0.352415058864221, + "learning_rate": 8.639386998435903e-05, + "loss": 2.969, + "step": 20391 + }, + { + "epoch": 0.9494145308098797, + "grad_norm": 0.3398250552617373, + "learning_rate": 8.639201253054614e-05, + "loss": 2.9765, + "step": 20392 + }, + { + "epoch": 0.9494610889959727, + "grad_norm": 0.3369676081889841, + "learning_rate": 8.639015496992629e-05, + "loss": 2.9522, + "step": 20393 + }, + { + "epoch": 0.9495076471820658, + "grad_norm": 0.33539512084877027, + "learning_rate": 8.638829730250491e-05, + "loss": 2.8209, + "step": 20394 + }, + { + "epoch": 0.9495542053681588, + "grad_norm": 0.3198993031462198, + "learning_rate": 8.638643952828746e-05, + "loss": 2.827, + "step": 20395 + }, + { + "epoch": 0.9496007635542519, + "grad_norm": 0.3161109800988754, + "learning_rate": 
8.63845816472794e-05, + "loss": 2.7544, + "step": 20396 + }, + { + "epoch": 0.9496473217403449, + "grad_norm": 0.32385560823976, + "learning_rate": 8.638272365948619e-05, + "loss": 2.9397, + "step": 20397 + }, + { + "epoch": 0.9496938799264381, + "grad_norm": 0.3550935565062888, + "learning_rate": 8.638086556491325e-05, + "loss": 2.9031, + "step": 20398 + }, + { + "epoch": 0.9497404381125312, + "grad_norm": 0.3266778662334854, + "learning_rate": 8.637900736356605e-05, + "loss": 2.8832, + "step": 20399 + }, + { + "epoch": 0.9497869962986242, + "grad_norm": 0.331683307085347, + "learning_rate": 8.637714905545007e-05, + "loss": 2.8239, + "step": 20400 + }, + { + "epoch": 0.9498335544847173, + "grad_norm": 0.3252140182573646, + "learning_rate": 8.637529064057073e-05, + "loss": 2.8702, + "step": 20401 + }, + { + "epoch": 0.9498801126708103, + "grad_norm": 0.3440587464629893, + "learning_rate": 8.63734321189335e-05, + "loss": 2.8512, + "step": 20402 + }, + { + "epoch": 0.9499266708569034, + "grad_norm": 0.3239422847801465, + "learning_rate": 8.637157349054381e-05, + "loss": 2.7966, + "step": 20403 + }, + { + "epoch": 0.9499732290429965, + "grad_norm": 0.31893484029980856, + "learning_rate": 8.636971475540715e-05, + "loss": 2.7943, + "step": 20404 + }, + { + "epoch": 0.9500197872290895, + "grad_norm": 0.30809851977197694, + "learning_rate": 8.636785591352896e-05, + "loss": 2.7957, + "step": 20405 + }, + { + "epoch": 0.9500663454151826, + "grad_norm": 0.3223755751303022, + "learning_rate": 8.63659969649147e-05, + "loss": 2.905, + "step": 20406 + }, + { + "epoch": 0.9501129036012756, + "grad_norm": 0.3212829382225509, + "learning_rate": 8.63641379095698e-05, + "loss": 2.8546, + "step": 20407 + }, + { + "epoch": 0.9501594617873688, + "grad_norm": 0.33922097893092346, + "learning_rate": 8.636227874749976e-05, + "loss": 2.9946, + "step": 20408 + }, + { + "epoch": 0.9502060199734619, + "grad_norm": 0.3290143172409256, + "learning_rate": 8.636041947871001e-05, + "loss": 2.8018, + "step": 20409 + }, + { + "epoch": 0.9502525781595549, + "grad_norm": 0.34259624758593527, + "learning_rate": 8.635856010320602e-05, + "loss": 2.8492, + "step": 20410 + }, + { + "epoch": 0.950299136345648, + "grad_norm": 0.31979290338002586, + "learning_rate": 8.635670062099323e-05, + "loss": 2.9094, + "step": 20411 + }, + { + "epoch": 0.950345694531741, + "grad_norm": 0.35895224468308873, + "learning_rate": 8.635484103207709e-05, + "loss": 2.9327, + "step": 20412 + }, + { + "epoch": 0.9503922527178341, + "grad_norm": 0.31931876695643485, + "learning_rate": 8.635298133646309e-05, + "loss": 2.8532, + "step": 20413 + }, + { + "epoch": 0.9504388109039272, + "grad_norm": 0.3396433461630658, + "learning_rate": 8.635112153415668e-05, + "loss": 2.9236, + "step": 20414 + }, + { + "epoch": 0.9504853690900202, + "grad_norm": 0.35651067759544125, + "learning_rate": 8.634926162516328e-05, + "loss": 2.8994, + "step": 20415 + }, + { + "epoch": 0.9505319272761134, + "grad_norm": 0.33622584504338754, + "learning_rate": 8.63474016094884e-05, + "loss": 2.8665, + "step": 20416 + }, + { + "epoch": 0.9505784854622064, + "grad_norm": 0.3406501976164438, + "learning_rate": 8.634554148713746e-05, + "loss": 2.8111, + "step": 20417 + }, + { + "epoch": 0.9506250436482995, + "grad_norm": 0.35317254871111725, + "learning_rate": 8.634368125811596e-05, + "loss": 2.951, + "step": 20418 + }, + { + "epoch": 0.9506716018343925, + "grad_norm": 0.3397747363401631, + "learning_rate": 8.634182092242932e-05, + "loss": 2.8479, + "step": 20419 + }, + { + "epoch": 
0.9507181600204856, + "grad_norm": 0.3565063936936815, + "learning_rate": 8.633996048008303e-05, + "loss": 2.882, + "step": 20420 + }, + { + "epoch": 0.9507647182065787, + "grad_norm": 0.3218300150269152, + "learning_rate": 8.633809993108253e-05, + "loss": 2.817, + "step": 20421 + }, + { + "epoch": 0.9508112763926717, + "grad_norm": 0.35457723177018224, + "learning_rate": 8.633623927543328e-05, + "loss": 2.9056, + "step": 20422 + }, + { + "epoch": 0.9508578345787648, + "grad_norm": 0.35279789066884176, + "learning_rate": 8.633437851314074e-05, + "loss": 2.9808, + "step": 20423 + }, + { + "epoch": 0.9509043927648578, + "grad_norm": 0.33167166162043377, + "learning_rate": 8.63325176442104e-05, + "loss": 2.9228, + "step": 20424 + }, + { + "epoch": 0.950950950950951, + "grad_norm": 0.32867388458125346, + "learning_rate": 8.633065666864768e-05, + "loss": 2.7921, + "step": 20425 + }, + { + "epoch": 0.9509975091370441, + "grad_norm": 0.3314040758331101, + "learning_rate": 8.632879558645807e-05, + "loss": 2.8136, + "step": 20426 + }, + { + "epoch": 0.9510440673231371, + "grad_norm": 0.31693512250667855, + "learning_rate": 8.632693439764703e-05, + "loss": 2.8915, + "step": 20427 + }, + { + "epoch": 0.9510906255092302, + "grad_norm": 0.3055994143647478, + "learning_rate": 8.632507310222001e-05, + "loss": 2.8936, + "step": 20428 + }, + { + "epoch": 0.9511371836953232, + "grad_norm": 0.35060959855897034, + "learning_rate": 8.632321170018248e-05, + "loss": 2.9168, + "step": 20429 + }, + { + "epoch": 0.9511837418814163, + "grad_norm": 0.3248998899281186, + "learning_rate": 8.632135019153991e-05, + "loss": 2.8233, + "step": 20430 + }, + { + "epoch": 0.9512303000675094, + "grad_norm": 0.34712598676356365, + "learning_rate": 8.631948857629774e-05, + "loss": 2.8883, + "step": 20431 + }, + { + "epoch": 0.9512768582536024, + "grad_norm": 0.32040082213979654, + "learning_rate": 8.631762685446145e-05, + "loss": 2.9263, + "step": 20432 + }, + { + "epoch": 0.9513234164396955, + "grad_norm": 0.3312805477948846, + "learning_rate": 8.631576502603651e-05, + "loss": 2.9177, + "step": 20433 + }, + { + "epoch": 0.9513699746257885, + "grad_norm": 0.33209590834584607, + "learning_rate": 8.631390309102838e-05, + "loss": 2.9154, + "step": 20434 + }, + { + "epoch": 0.9514165328118817, + "grad_norm": 0.3425738749456833, + "learning_rate": 8.631204104944253e-05, + "loss": 2.879, + "step": 20435 + }, + { + "epoch": 0.9514630909979748, + "grad_norm": 0.36126485781376577, + "learning_rate": 8.631017890128442e-05, + "loss": 2.9327, + "step": 20436 + }, + { + "epoch": 0.9515096491840678, + "grad_norm": 0.32107180372773464, + "learning_rate": 8.630831664655948e-05, + "loss": 2.9115, + "step": 20437 + }, + { + "epoch": 0.9515562073701609, + "grad_norm": 0.3845459310235748, + "learning_rate": 8.630645428527321e-05, + "loss": 2.8473, + "step": 20438 + }, + { + "epoch": 0.9516027655562539, + "grad_norm": 0.35429750494370876, + "learning_rate": 8.63045918174311e-05, + "loss": 2.7463, + "step": 20439 + }, + { + "epoch": 0.951649323742347, + "grad_norm": 0.3496802226316464, + "learning_rate": 8.630272924303858e-05, + "loss": 2.8612, + "step": 20440 + }, + { + "epoch": 0.95169588192844, + "grad_norm": 0.4045938329960538, + "learning_rate": 8.630086656210113e-05, + "loss": 2.8665, + "step": 20441 + }, + { + "epoch": 0.9517424401145331, + "grad_norm": 0.31906762566248514, + "learning_rate": 8.62990037746242e-05, + "loss": 2.8981, + "step": 20442 + }, + { + "epoch": 0.9517889983006262, + "grad_norm": 0.3723625801016284, + "learning_rate": 
8.629714088061329e-05, + "loss": 2.8448, + "step": 20443 + }, + { + "epoch": 0.9518355564867192, + "grad_norm": 0.31381535641564845, + "learning_rate": 8.629527788007384e-05, + "loss": 2.8607, + "step": 20444 + }, + { + "epoch": 0.9518821146728124, + "grad_norm": 0.32529321769477476, + "learning_rate": 8.629341477301132e-05, + "loss": 2.8933, + "step": 20445 + }, + { + "epoch": 0.9519286728589054, + "grad_norm": 0.3439420983715762, + "learning_rate": 8.629155155943119e-05, + "loss": 2.9017, + "step": 20446 + }, + { + "epoch": 0.9519752310449985, + "grad_norm": 0.3498538565398456, + "learning_rate": 8.628968823933893e-05, + "loss": 2.92, + "step": 20447 + }, + { + "epoch": 0.9520217892310916, + "grad_norm": 0.3564765846260129, + "learning_rate": 8.628782481274005e-05, + "loss": 2.9381, + "step": 20448 + }, + { + "epoch": 0.9520683474171846, + "grad_norm": 0.3557963659071119, + "learning_rate": 8.628596127963995e-05, + "loss": 2.9175, + "step": 20449 + }, + { + "epoch": 0.9521149056032777, + "grad_norm": 0.3009888257575652, + "learning_rate": 8.628409764004412e-05, + "loss": 2.9198, + "step": 20450 + }, + { + "epoch": 0.9521614637893707, + "grad_norm": 0.37482656567463934, + "learning_rate": 8.628223389395806e-05, + "loss": 2.9071, + "step": 20451 + }, + { + "epoch": 0.9522080219754638, + "grad_norm": 0.32949032653351645, + "learning_rate": 8.62803700413872e-05, + "loss": 2.9763, + "step": 20452 + }, + { + "epoch": 0.952254580161557, + "grad_norm": 0.3634214148303802, + "learning_rate": 8.627850608233704e-05, + "loss": 2.9035, + "step": 20453 + }, + { + "epoch": 0.95230113834765, + "grad_norm": 0.3978334469889131, + "learning_rate": 8.627664201681303e-05, + "loss": 2.8595, + "step": 20454 + }, + { + "epoch": 0.9523476965337431, + "grad_norm": 0.34071500377330277, + "learning_rate": 8.627477784482067e-05, + "loss": 2.9137, + "step": 20455 + }, + { + "epoch": 0.9523942547198361, + "grad_norm": 0.3932947345966541, + "learning_rate": 8.62729135663654e-05, + "loss": 2.9558, + "step": 20456 + }, + { + "epoch": 0.9524408129059292, + "grad_norm": 0.3594246469197742, + "learning_rate": 8.62710491814527e-05, + "loss": 2.9126, + "step": 20457 + }, + { + "epoch": 0.9524873710920223, + "grad_norm": 0.37826789459442745, + "learning_rate": 8.626918469008802e-05, + "loss": 2.8713, + "step": 20458 + }, + { + "epoch": 0.9525339292781153, + "grad_norm": 0.33954322328597075, + "learning_rate": 8.62673200922769e-05, + "loss": 2.9085, + "step": 20459 + }, + { + "epoch": 0.9525804874642084, + "grad_norm": 0.352459312079259, + "learning_rate": 8.626545538802474e-05, + "loss": 2.928, + "step": 20460 + }, + { + "epoch": 0.9526270456503014, + "grad_norm": 0.36869406930542264, + "learning_rate": 8.626359057733704e-05, + "loss": 2.9405, + "step": 20461 + }, + { + "epoch": 0.9526736038363945, + "grad_norm": 0.3442743760489167, + "learning_rate": 8.626172566021928e-05, + "loss": 2.8673, + "step": 20462 + }, + { + "epoch": 0.9527201620224875, + "grad_norm": 0.3904782366217317, + "learning_rate": 8.625986063667693e-05, + "loss": 2.8787, + "step": 20463 + }, + { + "epoch": 0.9527667202085807, + "grad_norm": 0.351041982220116, + "learning_rate": 8.625799550671546e-05, + "loss": 2.9911, + "step": 20464 + }, + { + "epoch": 0.9528132783946738, + "grad_norm": 0.37367716220316766, + "learning_rate": 8.625613027034034e-05, + "loss": 2.9387, + "step": 20465 + }, + { + "epoch": 0.9528598365807668, + "grad_norm": 0.36456227183029144, + "learning_rate": 8.625426492755706e-05, + "loss": 2.8782, + "step": 20466 + }, + { + "epoch": 
0.9529063947668599, + "grad_norm": 0.32621743877234044, + "learning_rate": 8.625239947837108e-05, + "loss": 2.7754, + "step": 20467 + }, + { + "epoch": 0.9529529529529529, + "grad_norm": 0.33522650923387737, + "learning_rate": 8.625053392278788e-05, + "loss": 2.8041, + "step": 20468 + }, + { + "epoch": 0.952999511139046, + "grad_norm": 0.3373871918928853, + "learning_rate": 8.624866826081293e-05, + "loss": 2.9842, + "step": 20469 + }, + { + "epoch": 0.9530460693251391, + "grad_norm": 0.3573802153476562, + "learning_rate": 8.624680249245172e-05, + "loss": 2.8538, + "step": 20470 + }, + { + "epoch": 0.9530926275112321, + "grad_norm": 0.33267779615190834, + "learning_rate": 8.62449366177097e-05, + "loss": 2.9464, + "step": 20471 + }, + { + "epoch": 0.9531391856973253, + "grad_norm": 0.3650403553737444, + "learning_rate": 8.624307063659235e-05, + "loss": 2.9907, + "step": 20472 + }, + { + "epoch": 0.9531857438834183, + "grad_norm": 0.3655004229375536, + "learning_rate": 8.62412045491052e-05, + "loss": 2.8242, + "step": 20473 + }, + { + "epoch": 0.9532323020695114, + "grad_norm": 0.36707779211488745, + "learning_rate": 8.623933835525366e-05, + "loss": 2.9667, + "step": 20474 + }, + { + "epoch": 0.9532788602556045, + "grad_norm": 0.3210263807649453, + "learning_rate": 8.623747205504323e-05, + "loss": 2.7009, + "step": 20475 + }, + { + "epoch": 0.9533254184416975, + "grad_norm": 0.35979383871433385, + "learning_rate": 8.623560564847938e-05, + "loss": 2.8599, + "step": 20476 + }, + { + "epoch": 0.9533719766277906, + "grad_norm": 0.35673118735175835, + "learning_rate": 8.623373913556762e-05, + "loss": 2.8754, + "step": 20477 + }, + { + "epoch": 0.9534185348138836, + "grad_norm": 0.3342393729645298, + "learning_rate": 8.62318725163134e-05, + "loss": 2.8404, + "step": 20478 + }, + { + "epoch": 0.9534650929999767, + "grad_norm": 0.33407868921229494, + "learning_rate": 8.62300057907222e-05, + "loss": 2.8398, + "step": 20479 + }, + { + "epoch": 0.9535116511860698, + "grad_norm": 0.3205354184571844, + "learning_rate": 8.62281389587995e-05, + "loss": 2.8396, + "step": 20480 + }, + { + "epoch": 0.9535582093721628, + "grad_norm": 0.34836851160816973, + "learning_rate": 8.622627202055079e-05, + "loss": 2.9511, + "step": 20481 + }, + { + "epoch": 0.953604767558256, + "grad_norm": 0.34954513847147023, + "learning_rate": 8.622440497598153e-05, + "loss": 2.8812, + "step": 20482 + }, + { + "epoch": 0.953651325744349, + "grad_norm": 0.3309873400420664, + "learning_rate": 8.622253782509722e-05, + "loss": 2.8494, + "step": 20483 + }, + { + "epoch": 0.9536978839304421, + "grad_norm": 0.33661707973903193, + "learning_rate": 8.622067056790333e-05, + "loss": 2.8865, + "step": 20484 + }, + { + "epoch": 0.9537444421165351, + "grad_norm": 0.32111150983896725, + "learning_rate": 8.621880320440532e-05, + "loss": 2.8427, + "step": 20485 + }, + { + "epoch": 0.9537910003026282, + "grad_norm": 0.3466579168148019, + "learning_rate": 8.621693573460871e-05, + "loss": 2.8646, + "step": 20486 + }, + { + "epoch": 0.9538375584887213, + "grad_norm": 0.3327361081310343, + "learning_rate": 8.621506815851896e-05, + "loss": 2.8347, + "step": 20487 + }, + { + "epoch": 0.9538841166748143, + "grad_norm": 0.34480122073621017, + "learning_rate": 8.621320047614154e-05, + "loss": 2.8208, + "step": 20488 + }, + { + "epoch": 0.9539306748609074, + "grad_norm": 0.31981578724516413, + "learning_rate": 8.621133268748197e-05, + "loss": 2.9958, + "step": 20489 + }, + { + "epoch": 0.9539772330470004, + "grad_norm": 0.3534521806628165, + "learning_rate": 
8.620946479254568e-05, + "loss": 2.8822, + "step": 20490 + }, + { + "epoch": 0.9540237912330936, + "grad_norm": 0.32779104918100826, + "learning_rate": 8.620759679133819e-05, + "loss": 2.8151, + "step": 20491 + }, + { + "epoch": 0.9540703494191867, + "grad_norm": 0.34412506102689056, + "learning_rate": 8.620572868386496e-05, + "loss": 2.8402, + "step": 20492 + }, + { + "epoch": 0.9541169076052797, + "grad_norm": 0.29784723419192477, + "learning_rate": 8.62038604701315e-05, + "loss": 2.808, + "step": 20493 + }, + { + "epoch": 0.9541634657913728, + "grad_norm": 0.32502834678625064, + "learning_rate": 8.620199215014326e-05, + "loss": 2.8994, + "step": 20494 + }, + { + "epoch": 0.9542100239774658, + "grad_norm": 0.3176921114904776, + "learning_rate": 8.620012372390576e-05, + "loss": 2.8728, + "step": 20495 + }, + { + "epoch": 0.9542565821635589, + "grad_norm": 0.3513036030375956, + "learning_rate": 8.619825519142443e-05, + "loss": 2.9485, + "step": 20496 + }, + { + "epoch": 0.954303140349652, + "grad_norm": 0.318361505794016, + "learning_rate": 8.619638655270482e-05, + "loss": 2.7991, + "step": 20497 + }, + { + "epoch": 0.954349698535745, + "grad_norm": 0.32223837973313624, + "learning_rate": 8.619451780775236e-05, + "loss": 2.849, + "step": 20498 + }, + { + "epoch": 0.9543962567218381, + "grad_norm": 0.33565085283213847, + "learning_rate": 8.619264895657258e-05, + "loss": 2.8736, + "step": 20499 + }, + { + "epoch": 0.9544428149079311, + "grad_norm": 0.37651894647404743, + "learning_rate": 8.619077999917092e-05, + "loss": 2.8824, + "step": 20500 + }, + { + "epoch": 0.9544893730940243, + "grad_norm": 0.3349851970569807, + "learning_rate": 8.61889109355529e-05, + "loss": 2.9478, + "step": 20501 + }, + { + "epoch": 0.9545359312801174, + "grad_norm": 0.36757043382788374, + "learning_rate": 8.618704176572397e-05, + "loss": 2.8009, + "step": 20502 + }, + { + "epoch": 0.9545824894662104, + "grad_norm": 0.3326439193315624, + "learning_rate": 8.618517248968965e-05, + "loss": 2.931, + "step": 20503 + }, + { + "epoch": 0.9546290476523035, + "grad_norm": 0.3095667139811424, + "learning_rate": 8.618330310745542e-05, + "loss": 2.8689, + "step": 20504 + }, + { + "epoch": 0.9546756058383965, + "grad_norm": 0.3309605901153047, + "learning_rate": 8.618143361902675e-05, + "loss": 2.876, + "step": 20505 + }, + { + "epoch": 0.9547221640244896, + "grad_norm": 0.3403447867126776, + "learning_rate": 8.617956402440914e-05, + "loss": 2.8531, + "step": 20506 + }, + { + "epoch": 0.9547687222105826, + "grad_norm": 0.338493601601713, + "learning_rate": 8.617769432360809e-05, + "loss": 2.8067, + "step": 20507 + }, + { + "epoch": 0.9548152803966757, + "grad_norm": 0.34993382508508997, + "learning_rate": 8.617582451662904e-05, + "loss": 2.7919, + "step": 20508 + }, + { + "epoch": 0.9548618385827689, + "grad_norm": 0.34267415982284455, + "learning_rate": 8.617395460347753e-05, + "loss": 2.9373, + "step": 20509 + }, + { + "epoch": 0.9549083967688619, + "grad_norm": 0.3614261802356422, + "learning_rate": 8.617208458415901e-05, + "loss": 2.9744, + "step": 20510 + }, + { + "epoch": 0.954954954954955, + "grad_norm": 0.35680412271956713, + "learning_rate": 8.617021445867902e-05, + "loss": 2.9818, + "step": 20511 + }, + { + "epoch": 0.955001513141048, + "grad_norm": 0.3429467354544277, + "learning_rate": 8.616834422704298e-05, + "loss": 2.9445, + "step": 20512 + }, + { + "epoch": 0.9550480713271411, + "grad_norm": 0.3192025489938653, + "learning_rate": 8.616647388925642e-05, + "loss": 2.9348, + "step": 20513 + }, + { + "epoch": 
0.9550946295132342, + "grad_norm": 0.35724236618099897, + "learning_rate": 8.616460344532483e-05, + "loss": 2.9373, + "step": 20514 + }, + { + "epoch": 0.9551411876993272, + "grad_norm": 0.3485582262362988, + "learning_rate": 8.61627328952537e-05, + "loss": 2.9051, + "step": 20515 + }, + { + "epoch": 0.9551877458854203, + "grad_norm": 0.3325326219880627, + "learning_rate": 8.616086223904849e-05, + "loss": 2.8165, + "step": 20516 + }, + { + "epoch": 0.9552343040715133, + "grad_norm": 0.3736981843222467, + "learning_rate": 8.615899147671472e-05, + "loss": 2.9384, + "step": 20517 + }, + { + "epoch": 0.9552808622576064, + "grad_norm": 0.3340568395601266, + "learning_rate": 8.615712060825787e-05, + "loss": 2.8983, + "step": 20518 + }, + { + "epoch": 0.9553274204436996, + "grad_norm": 0.3383720604452426, + "learning_rate": 8.615524963368344e-05, + "loss": 2.9139, + "step": 20519 + }, + { + "epoch": 0.9553739786297926, + "grad_norm": 0.33771303034615596, + "learning_rate": 8.615337855299692e-05, + "loss": 2.8458, + "step": 20520 + }, + { + "epoch": 0.9554205368158857, + "grad_norm": 0.3481472490378577, + "learning_rate": 8.615150736620377e-05, + "loss": 2.9533, + "step": 20521 + }, + { + "epoch": 0.9554670950019787, + "grad_norm": 0.34696018027395015, + "learning_rate": 8.614963607330953e-05, + "loss": 2.788, + "step": 20522 + }, + { + "epoch": 0.9555136531880718, + "grad_norm": 0.38156317971123865, + "learning_rate": 8.614776467431966e-05, + "loss": 3.0114, + "step": 20523 + }, + { + "epoch": 0.9555602113741649, + "grad_norm": 0.34119906096718766, + "learning_rate": 8.614589316923967e-05, + "loss": 2.9346, + "step": 20524 + }, + { + "epoch": 0.9556067695602579, + "grad_norm": 0.35401271717961275, + "learning_rate": 8.614402155807503e-05, + "loss": 2.6617, + "step": 20525 + }, + { + "epoch": 0.955653327746351, + "grad_norm": 0.37384990190379297, + "learning_rate": 8.614214984083126e-05, + "loss": 2.8722, + "step": 20526 + }, + { + "epoch": 0.955699885932444, + "grad_norm": 0.3621938649716862, + "learning_rate": 8.614027801751385e-05, + "loss": 2.8856, + "step": 20527 + }, + { + "epoch": 0.9557464441185372, + "grad_norm": 0.3711164508472857, + "learning_rate": 8.613840608812826e-05, + "loss": 2.8921, + "step": 20528 + }, + { + "epoch": 0.9557930023046302, + "grad_norm": 0.3808379551696536, + "learning_rate": 8.613653405268003e-05, + "loss": 2.9124, + "step": 20529 + }, + { + "epoch": 0.9558395604907233, + "grad_norm": 0.3399278143326221, + "learning_rate": 8.613466191117463e-05, + "loss": 2.8584, + "step": 20530 + }, + { + "epoch": 0.9558861186768164, + "grad_norm": 0.36622577151594277, + "learning_rate": 8.613278966361756e-05, + "loss": 2.9113, + "step": 20531 + }, + { + "epoch": 0.9559326768629094, + "grad_norm": 0.36667969413459534, + "learning_rate": 8.613091731001427e-05, + "loss": 2.9142, + "step": 20532 + }, + { + "epoch": 0.9559792350490025, + "grad_norm": 0.337756280677412, + "learning_rate": 8.612904485037034e-05, + "loss": 2.939, + "step": 20533 + }, + { + "epoch": 0.9560257932350955, + "grad_norm": 0.36890955347412174, + "learning_rate": 8.612717228469123e-05, + "loss": 2.9055, + "step": 20534 + }, + { + "epoch": 0.9560723514211886, + "grad_norm": 0.35906987422511816, + "learning_rate": 8.61252996129824e-05, + "loss": 2.8968, + "step": 20535 + }, + { + "epoch": 0.9561189096072817, + "grad_norm": 0.35392264813496643, + "learning_rate": 8.612342683524938e-05, + "loss": 2.9701, + "step": 20536 + }, + { + "epoch": 0.9561654677933747, + "grad_norm": 0.36270680923927934, + "learning_rate": 
8.612155395149768e-05, + "loss": 2.9102, + "step": 20537 + }, + { + "epoch": 0.9562120259794679, + "grad_norm": 0.3394796566964699, + "learning_rate": 8.611968096173277e-05, + "loss": 2.8537, + "step": 20538 + }, + { + "epoch": 0.9562585841655609, + "grad_norm": 0.36615312793518817, + "learning_rate": 8.611780786596013e-05, + "loss": 2.8607, + "step": 20539 + }, + { + "epoch": 0.956305142351654, + "grad_norm": 0.3255395883590139, + "learning_rate": 8.61159346641853e-05, + "loss": 2.8988, + "step": 20540 + }, + { + "epoch": 0.9563517005377471, + "grad_norm": 0.3907010602867014, + "learning_rate": 8.611406135641376e-05, + "loss": 2.8724, + "step": 20541 + }, + { + "epoch": 0.9563982587238401, + "grad_norm": 0.3246807324630406, + "learning_rate": 8.611218794265102e-05, + "loss": 2.9426, + "step": 20542 + }, + { + "epoch": 0.9564448169099332, + "grad_norm": 0.4013213282455762, + "learning_rate": 8.611031442290255e-05, + "loss": 2.7996, + "step": 20543 + }, + { + "epoch": 0.9564913750960262, + "grad_norm": 0.3586581005855751, + "learning_rate": 8.610844079717388e-05, + "loss": 2.7321, + "step": 20544 + }, + { + "epoch": 0.9565379332821193, + "grad_norm": 0.3863879359031027, + "learning_rate": 8.610656706547047e-05, + "loss": 2.8355, + "step": 20545 + }, + { + "epoch": 0.9565844914682125, + "grad_norm": 0.35271268052950494, + "learning_rate": 8.610469322779787e-05, + "loss": 2.6797, + "step": 20546 + }, + { + "epoch": 0.9566310496543055, + "grad_norm": 0.37496533487153627, + "learning_rate": 8.610281928416153e-05, + "loss": 2.9832, + "step": 20547 + }, + { + "epoch": 0.9566776078403986, + "grad_norm": 0.3550286540581577, + "learning_rate": 8.610094523456697e-05, + "loss": 2.9135, + "step": 20548 + }, + { + "epoch": 0.9567241660264916, + "grad_norm": 0.38504123131991, + "learning_rate": 8.609907107901971e-05, + "loss": 3.0421, + "step": 20549 + }, + { + "epoch": 0.9567707242125847, + "grad_norm": 0.4021178253791735, + "learning_rate": 8.609719681752522e-05, + "loss": 2.8963, + "step": 20550 + }, + { + "epoch": 0.9568172823986777, + "grad_norm": 0.3248004960664502, + "learning_rate": 8.609532245008902e-05, + "loss": 2.857, + "step": 20551 + }, + { + "epoch": 0.9568638405847708, + "grad_norm": 0.3947366706831375, + "learning_rate": 8.609344797671661e-05, + "loss": 2.9361, + "step": 20552 + }, + { + "epoch": 0.9569103987708639, + "grad_norm": 0.3447690264257268, + "learning_rate": 8.609157339741347e-05, + "loss": 2.7933, + "step": 20553 + }, + { + "epoch": 0.9569569569569569, + "grad_norm": 0.386726093171282, + "learning_rate": 8.608969871218512e-05, + "loss": 2.8313, + "step": 20554 + }, + { + "epoch": 0.95700351514305, + "grad_norm": 0.3582967662772224, + "learning_rate": 8.608782392103707e-05, + "loss": 2.9567, + "step": 20555 + }, + { + "epoch": 0.957050073329143, + "grad_norm": 0.3854561001679126, + "learning_rate": 8.608594902397482e-05, + "loss": 2.908, + "step": 20556 + }, + { + "epoch": 0.9570966315152362, + "grad_norm": 0.42493274247803925, + "learning_rate": 8.608407402100384e-05, + "loss": 2.8865, + "step": 20557 + }, + { + "epoch": 0.9571431897013293, + "grad_norm": 0.42728626577998363, + "learning_rate": 8.608219891212968e-05, + "loss": 2.9584, + "step": 20558 + }, + { + "epoch": 0.9571897478874223, + "grad_norm": 0.4074037413500287, + "learning_rate": 8.60803236973578e-05, + "loss": 2.8106, + "step": 20559 + }, + { + "epoch": 0.9572363060735154, + "grad_norm": 0.43970253664282377, + "learning_rate": 8.607844837669375e-05, + "loss": 2.9209, + "step": 20560 + }, + { + "epoch": 
0.9572828642596084, + "grad_norm": 0.40710133070572724, + "learning_rate": 8.6076572950143e-05, + "loss": 2.8417, + "step": 20561 + }, + { + "epoch": 0.9573294224457015, + "grad_norm": 0.4227726437369205, + "learning_rate": 8.607469741771106e-05, + "loss": 2.8727, + "step": 20562 + }, + { + "epoch": 0.9573759806317946, + "grad_norm": 0.35578156279978423, + "learning_rate": 8.607282177940344e-05, + "loss": 2.918, + "step": 20563 + }, + { + "epoch": 0.9574225388178876, + "grad_norm": 0.3818256899116254, + "learning_rate": 8.607094603522562e-05, + "loss": 2.9632, + "step": 20564 + }, + { + "epoch": 0.9574690970039808, + "grad_norm": 0.35088482030864054, + "learning_rate": 8.606907018518314e-05, + "loss": 2.9069, + "step": 20565 + }, + { + "epoch": 0.9575156551900738, + "grad_norm": 0.3492135908518613, + "learning_rate": 8.606719422928149e-05, + "loss": 2.9234, + "step": 20566 + }, + { + "epoch": 0.9575622133761669, + "grad_norm": 0.3761300082567323, + "learning_rate": 8.606531816752619e-05, + "loss": 2.9071, + "step": 20567 + }, + { + "epoch": 0.95760877156226, + "grad_norm": 0.3278569000060264, + "learning_rate": 8.606344199992274e-05, + "loss": 2.7994, + "step": 20568 + }, + { + "epoch": 0.957655329748353, + "grad_norm": 0.398280879314806, + "learning_rate": 8.606156572647661e-05, + "loss": 2.7793, + "step": 20569 + }, + { + "epoch": 0.9577018879344461, + "grad_norm": 0.3809840064618247, + "learning_rate": 8.605968934719337e-05, + "loss": 2.9326, + "step": 20570 + }, + { + "epoch": 0.9577484461205391, + "grad_norm": 0.37638560652045894, + "learning_rate": 8.605781286207848e-05, + "loss": 2.9215, + "step": 20571 + }, + { + "epoch": 0.9577950043066322, + "grad_norm": 0.3779885549270864, + "learning_rate": 8.605593627113746e-05, + "loss": 2.8324, + "step": 20572 + }, + { + "epoch": 0.9578415624927252, + "grad_norm": 0.34240689742444924, + "learning_rate": 8.605405957437581e-05, + "loss": 2.9024, + "step": 20573 + }, + { + "epoch": 0.9578881206788183, + "grad_norm": 0.3711144722196609, + "learning_rate": 8.605218277179906e-05, + "loss": 2.8424, + "step": 20574 + }, + { + "epoch": 0.9579346788649115, + "grad_norm": 0.3821277656254059, + "learning_rate": 8.60503058634127e-05, + "loss": 3.0097, + "step": 20575 + }, + { + "epoch": 0.9579812370510045, + "grad_norm": 0.3444124602842716, + "learning_rate": 8.604842884922225e-05, + "loss": 2.7915, + "step": 20576 + }, + { + "epoch": 0.9580277952370976, + "grad_norm": 0.35625810865278007, + "learning_rate": 8.604655172923322e-05, + "loss": 2.8928, + "step": 20577 + }, + { + "epoch": 0.9580743534231906, + "grad_norm": 0.3530037428464216, + "learning_rate": 8.604467450345109e-05, + "loss": 2.8999, + "step": 20578 + }, + { + "epoch": 0.9581209116092837, + "grad_norm": 0.34565798830755545, + "learning_rate": 8.604279717188139e-05, + "loss": 2.9574, + "step": 20579 + }, + { + "epoch": 0.9581674697953768, + "grad_norm": 0.3344013422073473, + "learning_rate": 8.604091973452966e-05, + "loss": 2.8173, + "step": 20580 + }, + { + "epoch": 0.9582140279814698, + "grad_norm": 0.9034834179921176, + "learning_rate": 8.603904219140135e-05, + "loss": 2.864, + "step": 20581 + }, + { + "epoch": 0.9582605861675629, + "grad_norm": 0.4114457438437955, + "learning_rate": 8.603716454250202e-05, + "loss": 2.7424, + "step": 20582 + }, + { + "epoch": 0.9583071443536559, + "grad_norm": 0.4056852570814326, + "learning_rate": 8.603528678783717e-05, + "loss": 2.7718, + "step": 20583 + }, + { + "epoch": 0.958353702539749, + "grad_norm": 0.402584777837939, + "learning_rate": 
8.603340892741229e-05, + "loss": 2.9371, + "step": 20584 + }, + { + "epoch": 0.9584002607258422, + "grad_norm": 0.403877970582408, + "learning_rate": 8.603153096123291e-05, + "loss": 2.9417, + "step": 20585 + }, + { + "epoch": 0.9584468189119352, + "grad_norm": 0.3968422984513714, + "learning_rate": 8.602965288930452e-05, + "loss": 2.8306, + "step": 20586 + }, + { + "epoch": 0.9584933770980283, + "grad_norm": 0.38891519954911496, + "learning_rate": 8.602777471163266e-05, + "loss": 2.8498, + "step": 20587 + }, + { + "epoch": 0.9585399352841213, + "grad_norm": 0.3678614510788118, + "learning_rate": 8.602589642822283e-05, + "loss": 2.8611, + "step": 20588 + }, + { + "epoch": 0.9585864934702144, + "grad_norm": 0.35859672846901297, + "learning_rate": 8.602401803908055e-05, + "loss": 2.8391, + "step": 20589 + }, + { + "epoch": 0.9586330516563075, + "grad_norm": 0.38011801712234516, + "learning_rate": 8.602213954421132e-05, + "loss": 2.8578, + "step": 20590 + }, + { + "epoch": 0.9586796098424005, + "grad_norm": 0.3720287377944362, + "learning_rate": 8.602026094362065e-05, + "loss": 2.8978, + "step": 20591 + }, + { + "epoch": 0.9587261680284936, + "grad_norm": 0.37873714034913725, + "learning_rate": 8.601838223731408e-05, + "loss": 2.8745, + "step": 20592 + }, + { + "epoch": 0.9587727262145866, + "grad_norm": 0.36967727706684683, + "learning_rate": 8.60165034252971e-05, + "loss": 2.8656, + "step": 20593 + }, + { + "epoch": 0.9588192844006798, + "grad_norm": 0.3952312866042336, + "learning_rate": 8.601462450757523e-05, + "loss": 2.8697, + "step": 20594 + }, + { + "epoch": 0.9588658425867728, + "grad_norm": 0.38247742774503757, + "learning_rate": 8.601274548415398e-05, + "loss": 2.9475, + "step": 20595 + }, + { + "epoch": 0.9589124007728659, + "grad_norm": 0.3909984293885468, + "learning_rate": 8.601086635503888e-05, + "loss": 2.9604, + "step": 20596 + }, + { + "epoch": 0.958958958958959, + "grad_norm": 0.3625762495491953, + "learning_rate": 8.600898712023544e-05, + "loss": 2.9101, + "step": 20597 + }, + { + "epoch": 0.959005517145052, + "grad_norm": 0.3416397520091026, + "learning_rate": 8.600710777974916e-05, + "loss": 2.8158, + "step": 20598 + }, + { + "epoch": 0.9590520753311451, + "grad_norm": 0.35624290750119775, + "learning_rate": 8.600522833358556e-05, + "loss": 2.8787, + "step": 20599 + }, + { + "epoch": 0.9590986335172381, + "grad_norm": 0.34309570021704966, + "learning_rate": 8.600334878175017e-05, + "loss": 2.9449, + "step": 20600 + }, + { + "epoch": 0.9591451917033312, + "grad_norm": 0.3611298806234044, + "learning_rate": 8.60014691242485e-05, + "loss": 2.8498, + "step": 20601 + }, + { + "epoch": 0.9591917498894243, + "grad_norm": 0.358316082113324, + "learning_rate": 8.599958936108606e-05, + "loss": 2.9432, + "step": 20602 + }, + { + "epoch": 0.9592383080755174, + "grad_norm": 0.339890461805316, + "learning_rate": 8.599770949226837e-05, + "loss": 2.8636, + "step": 20603 + }, + { + "epoch": 0.9592848662616105, + "grad_norm": 0.3548793138376712, + "learning_rate": 8.599582951780096e-05, + "loss": 2.8922, + "step": 20604 + }, + { + "epoch": 0.9593314244477035, + "grad_norm": 0.3574820038374433, + "learning_rate": 8.599394943768933e-05, + "loss": 2.8867, + "step": 20605 + }, + { + "epoch": 0.9593779826337966, + "grad_norm": 0.34279623947052934, + "learning_rate": 8.5992069251939e-05, + "loss": 2.7511, + "step": 20606 + }, + { + "epoch": 0.9594245408198897, + "grad_norm": 0.3505662404878177, + "learning_rate": 8.599018896055549e-05, + "loss": 2.819, + "step": 20607 + }, + { + "epoch": 
0.9594710990059827, + "grad_norm": 0.3402384107956714, + "learning_rate": 8.598830856354434e-05, + "loss": 2.8699, + "step": 20608 + }, + { + "epoch": 0.9595176571920758, + "grad_norm": 0.37230064101631477, + "learning_rate": 8.598642806091102e-05, + "loss": 2.8417, + "step": 20609 + }, + { + "epoch": 0.9595642153781688, + "grad_norm": 0.33368834985258766, + "learning_rate": 8.598454745266111e-05, + "loss": 2.9779, + "step": 20610 + }, + { + "epoch": 0.9596107735642619, + "grad_norm": 0.3486304555714782, + "learning_rate": 8.598266673880007e-05, + "loss": 2.8471, + "step": 20611 + }, + { + "epoch": 0.9596573317503551, + "grad_norm": 0.3590417726071896, + "learning_rate": 8.598078591933346e-05, + "loss": 2.8738, + "step": 20612 + }, + { + "epoch": 0.9597038899364481, + "grad_norm": 0.34169818966253784, + "learning_rate": 8.597890499426679e-05, + "loss": 2.871, + "step": 20613 + }, + { + "epoch": 0.9597504481225412, + "grad_norm": 0.35676546737378345, + "learning_rate": 8.597702396360559e-05, + "loss": 2.9688, + "step": 20614 + }, + { + "epoch": 0.9597970063086342, + "grad_norm": 0.3361628369002188, + "learning_rate": 8.597514282735535e-05, + "loss": 2.9304, + "step": 20615 + }, + { + "epoch": 0.9598435644947273, + "grad_norm": 0.3480086877194223, + "learning_rate": 8.597326158552162e-05, + "loss": 2.8562, + "step": 20616 + }, + { + "epoch": 0.9598901226808203, + "grad_norm": 0.33739313040375996, + "learning_rate": 8.59713802381099e-05, + "loss": 2.9439, + "step": 20617 + }, + { + "epoch": 0.9599366808669134, + "grad_norm": 0.3670394120219608, + "learning_rate": 8.596949878512573e-05, + "loss": 2.8178, + "step": 20618 + }, + { + "epoch": 0.9599832390530065, + "grad_norm": 0.348251771853059, + "learning_rate": 8.596761722657463e-05, + "loss": 2.9062, + "step": 20619 + }, + { + "epoch": 0.9600297972390995, + "grad_norm": 0.3528887811415968, + "learning_rate": 8.596573556246212e-05, + "loss": 2.8633, + "step": 20620 + }, + { + "epoch": 0.9600763554251927, + "grad_norm": 0.3476124061395944, + "learning_rate": 8.59638537927937e-05, + "loss": 2.8319, + "step": 20621 + }, + { + "epoch": 0.9601229136112857, + "grad_norm": 0.3343372679140657, + "learning_rate": 8.596197191757493e-05, + "loss": 2.8153, + "step": 20622 + }, + { + "epoch": 0.9601694717973788, + "grad_norm": 0.367135524379244, + "learning_rate": 8.59600899368113e-05, + "loss": 2.9635, + "step": 20623 + }, + { + "epoch": 0.9602160299834719, + "grad_norm": 0.3226014959266325, + "learning_rate": 8.595820785050836e-05, + "loss": 2.9429, + "step": 20624 + }, + { + "epoch": 0.9602625881695649, + "grad_norm": 0.35606696533667925, + "learning_rate": 8.595632565867162e-05, + "loss": 2.9027, + "step": 20625 + }, + { + "epoch": 0.960309146355658, + "grad_norm": 0.3331904690357013, + "learning_rate": 8.595444336130661e-05, + "loss": 2.8776, + "step": 20626 + }, + { + "epoch": 0.960355704541751, + "grad_norm": 0.3458336399870147, + "learning_rate": 8.595256095841884e-05, + "loss": 2.8244, + "step": 20627 + }, + { + "epoch": 0.9604022627278441, + "grad_norm": 0.33984847414284647, + "learning_rate": 8.595067845001385e-05, + "loss": 2.8974, + "step": 20628 + }, + { + "epoch": 0.9604488209139372, + "grad_norm": 0.31135098646661497, + "learning_rate": 8.594879583609718e-05, + "loss": 2.8115, + "step": 20629 + }, + { + "epoch": 0.9604953791000302, + "grad_norm": 0.3260333547306818, + "learning_rate": 8.594691311667431e-05, + "loss": 2.818, + "step": 20630 + }, + { + "epoch": 0.9605419372861234, + "grad_norm": 0.34731278960128154, + "learning_rate": 
8.59450302917508e-05, + "loss": 2.8628, + "step": 20631 + }, + { + "epoch": 0.9605884954722164, + "grad_norm": 0.3241642665735676, + "learning_rate": 8.594314736133218e-05, + "loss": 2.8838, + "step": 20632 + }, + { + "epoch": 0.9606350536583095, + "grad_norm": 0.34609656378650233, + "learning_rate": 8.594126432542394e-05, + "loss": 2.8701, + "step": 20633 + }, + { + "epoch": 0.9606816118444026, + "grad_norm": 0.344890733053368, + "learning_rate": 8.593938118403164e-05, + "loss": 2.8682, + "step": 20634 + }, + { + "epoch": 0.9607281700304956, + "grad_norm": 0.3655177737339585, + "learning_rate": 8.59374979371608e-05, + "loss": 2.9553, + "step": 20635 + }, + { + "epoch": 0.9607747282165887, + "grad_norm": 0.3397511341016424, + "learning_rate": 8.593561458481694e-05, + "loss": 2.8887, + "step": 20636 + }, + { + "epoch": 0.9608212864026817, + "grad_norm": 0.3549934481347771, + "learning_rate": 8.59337311270056e-05, + "loss": 2.9804, + "step": 20637 + }, + { + "epoch": 0.9608678445887748, + "grad_norm": 0.3261107931736302, + "learning_rate": 8.593184756373229e-05, + "loss": 2.9794, + "step": 20638 + }, + { + "epoch": 0.9609144027748678, + "grad_norm": 0.34885655112652103, + "learning_rate": 8.592996389500253e-05, + "loss": 2.8839, + "step": 20639 + }, + { + "epoch": 0.960960960960961, + "grad_norm": 0.3488424246411696, + "learning_rate": 8.592808012082189e-05, + "loss": 2.8007, + "step": 20640 + }, + { + "epoch": 0.9610075191470541, + "grad_norm": 0.32179131240736414, + "learning_rate": 8.592619624119589e-05, + "loss": 2.8494, + "step": 20641 + }, + { + "epoch": 0.9610540773331471, + "grad_norm": 0.33729985695688924, + "learning_rate": 8.592431225613002e-05, + "loss": 2.8975, + "step": 20642 + }, + { + "epoch": 0.9611006355192402, + "grad_norm": 0.3297302949392975, + "learning_rate": 8.592242816562983e-05, + "loss": 2.8553, + "step": 20643 + }, + { + "epoch": 0.9611471937053332, + "grad_norm": 0.31714504499072776, + "learning_rate": 8.592054396970086e-05, + "loss": 2.9412, + "step": 20644 + }, + { + "epoch": 0.9611937518914263, + "grad_norm": 0.34218139112093543, + "learning_rate": 8.591865966834863e-05, + "loss": 2.9109, + "step": 20645 + }, + { + "epoch": 0.9612403100775194, + "grad_norm": 0.32030515777037205, + "learning_rate": 8.591677526157868e-05, + "loss": 2.9546, + "step": 20646 + }, + { + "epoch": 0.9612868682636124, + "grad_norm": 0.3364472689559572, + "learning_rate": 8.591489074939652e-05, + "loss": 2.822, + "step": 20647 + }, + { + "epoch": 0.9613334264497055, + "grad_norm": 0.3126557671171186, + "learning_rate": 8.591300613180771e-05, + "loss": 2.8501, + "step": 20648 + }, + { + "epoch": 0.9613799846357985, + "grad_norm": 0.3422035322915415, + "learning_rate": 8.591112140881774e-05, + "loss": 2.8519, + "step": 20649 + }, + { + "epoch": 0.9614265428218917, + "grad_norm": 0.34621699642914355, + "learning_rate": 8.59092365804322e-05, + "loss": 2.9308, + "step": 20650 + }, + { + "epoch": 0.9614731010079848, + "grad_norm": 0.32785485829625544, + "learning_rate": 8.590735164665655e-05, + "loss": 2.9316, + "step": 20651 + }, + { + "epoch": 0.9615196591940778, + "grad_norm": 0.315685112111834, + "learning_rate": 8.590546660749639e-05, + "loss": 2.7821, + "step": 20652 + }, + { + "epoch": 0.9615662173801709, + "grad_norm": 0.3469394916646917, + "learning_rate": 8.59035814629572e-05, + "loss": 2.8739, + "step": 20653 + }, + { + "epoch": 0.9616127755662639, + "grad_norm": 0.3272429986042582, + "learning_rate": 8.590169621304456e-05, + "loss": 2.9625, + "step": 20654 + }, + { + "epoch": 
0.961659333752357, + "grad_norm": 0.36900687298101315, + "learning_rate": 8.589981085776397e-05, + "loss": 2.8752, + "step": 20655 + }, + { + "epoch": 0.9617058919384501, + "grad_norm": 0.35882518435101646, + "learning_rate": 8.589792539712096e-05, + "loss": 2.8624, + "step": 20656 + }, + { + "epoch": 0.9617524501245431, + "grad_norm": 0.38420882399388284, + "learning_rate": 8.589603983112109e-05, + "loss": 2.8754, + "step": 20657 + }, + { + "epoch": 0.9617990083106362, + "grad_norm": 0.36649331815642966, + "learning_rate": 8.589415415976987e-05, + "loss": 2.7483, + "step": 20658 + }, + { + "epoch": 0.9618455664967293, + "grad_norm": 0.3833292682294997, + "learning_rate": 8.589226838307284e-05, + "loss": 2.904, + "step": 20659 + }, + { + "epoch": 0.9618921246828224, + "grad_norm": 0.3876735258815274, + "learning_rate": 8.589038250103556e-05, + "loss": 2.9276, + "step": 20660 + }, + { + "epoch": 0.9619386828689154, + "grad_norm": 0.3275934590236773, + "learning_rate": 8.588849651366353e-05, + "loss": 2.9102, + "step": 20661 + }, + { + "epoch": 0.9619852410550085, + "grad_norm": 0.35524838344680776, + "learning_rate": 8.588661042096228e-05, + "loss": 2.9166, + "step": 20662 + }, + { + "epoch": 0.9620317992411016, + "grad_norm": 0.353780841204063, + "learning_rate": 8.588472422293739e-05, + "loss": 2.8489, + "step": 20663 + }, + { + "epoch": 0.9620783574271946, + "grad_norm": 0.3900557225865839, + "learning_rate": 8.588283791959436e-05, + "loss": 2.8514, + "step": 20664 + }, + { + "epoch": 0.9621249156132877, + "grad_norm": 0.37460903728192885, + "learning_rate": 8.588095151093874e-05, + "loss": 2.9159, + "step": 20665 + }, + { + "epoch": 0.9621714737993807, + "grad_norm": 0.3947313878505141, + "learning_rate": 8.587906499697607e-05, + "loss": 2.9136, + "step": 20666 + }, + { + "epoch": 0.9622180319854738, + "grad_norm": 0.33861999425699146, + "learning_rate": 8.587717837771186e-05, + "loss": 2.7708, + "step": 20667 + }, + { + "epoch": 0.962264590171567, + "grad_norm": 0.3570042481724972, + "learning_rate": 8.587529165315166e-05, + "loss": 2.8104, + "step": 20668 + }, + { + "epoch": 0.96231114835766, + "grad_norm": 0.34944797763589724, + "learning_rate": 8.587340482330104e-05, + "loss": 2.9043, + "step": 20669 + }, + { + "epoch": 0.9623577065437531, + "grad_norm": 0.31812804806243516, + "learning_rate": 8.58715178881655e-05, + "loss": 2.8644, + "step": 20670 + }, + { + "epoch": 0.9624042647298461, + "grad_norm": 0.3413105896713689, + "learning_rate": 8.586963084775058e-05, + "loss": 2.8692, + "step": 20671 + }, + { + "epoch": 0.9624508229159392, + "grad_norm": 0.323476287909194, + "learning_rate": 8.586774370206183e-05, + "loss": 2.8673, + "step": 20672 + }, + { + "epoch": 0.9624973811020323, + "grad_norm": 0.3381077599804382, + "learning_rate": 8.586585645110478e-05, + "loss": 2.9382, + "step": 20673 + }, + { + "epoch": 0.9625439392881253, + "grad_norm": 0.29985805758364736, + "learning_rate": 8.586396909488499e-05, + "loss": 2.8647, + "step": 20674 + }, + { + "epoch": 0.9625904974742184, + "grad_norm": 0.34103375954918214, + "learning_rate": 8.586208163340799e-05, + "loss": 2.9113, + "step": 20675 + }, + { + "epoch": 0.9626370556603114, + "grad_norm": 0.3113778157033792, + "learning_rate": 8.586019406667929e-05, + "loss": 2.914, + "step": 20676 + }, + { + "epoch": 0.9626836138464046, + "grad_norm": 0.35896402354370827, + "learning_rate": 8.585830639470447e-05, + "loss": 2.8661, + "step": 20677 + }, + { + "epoch": 0.9627301720324977, + "grad_norm": 0.3172118297763864, + "learning_rate": 
8.585641861748905e-05, + "loss": 2.9186, + "step": 20678 + }, + { + "epoch": 0.9627767302185907, + "grad_norm": 0.35137980972760885, + "learning_rate": 8.585453073503857e-05, + "loss": 2.798, + "step": 20679 + }, + { + "epoch": 0.9628232884046838, + "grad_norm": 0.31334444947252144, + "learning_rate": 8.585264274735858e-05, + "loss": 2.9234, + "step": 20680 + }, + { + "epoch": 0.9628698465907768, + "grad_norm": 0.3145055716655645, + "learning_rate": 8.585075465445461e-05, + "loss": 2.816, + "step": 20681 + }, + { + "epoch": 0.9629164047768699, + "grad_norm": 0.2973538081916066, + "learning_rate": 8.584886645633221e-05, + "loss": 2.7724, + "step": 20682 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 0.31966224269055893, + "learning_rate": 8.584697815299692e-05, + "loss": 2.8917, + "step": 20683 + }, + { + "epoch": 0.963009521149056, + "grad_norm": 0.3010251514509209, + "learning_rate": 8.584508974445427e-05, + "loss": 2.8451, + "step": 20684 + }, + { + "epoch": 0.9630560793351491, + "grad_norm": 0.32593407081343867, + "learning_rate": 8.584320123070983e-05, + "loss": 2.911, + "step": 20685 + }, + { + "epoch": 0.9631026375212421, + "grad_norm": 0.3025602126881206, + "learning_rate": 8.58413126117691e-05, + "loss": 2.8803, + "step": 20686 + }, + { + "epoch": 0.9631491957073353, + "grad_norm": 0.3252900107886796, + "learning_rate": 8.583942388763768e-05, + "loss": 2.9019, + "step": 20687 + }, + { + "epoch": 0.9631957538934283, + "grad_norm": 0.3135268367905459, + "learning_rate": 8.583753505832107e-05, + "loss": 2.8081, + "step": 20688 + }, + { + "epoch": 0.9632423120795214, + "grad_norm": 0.31904115070971123, + "learning_rate": 8.583564612382482e-05, + "loss": 2.9594, + "step": 20689 + }, + { + "epoch": 0.9632888702656145, + "grad_norm": 0.33772957159244, + "learning_rate": 8.583375708415447e-05, + "loss": 2.8537, + "step": 20690 + }, + { + "epoch": 0.9633354284517075, + "grad_norm": 0.34148781920695964, + "learning_rate": 8.583186793931559e-05, + "loss": 2.9528, + "step": 20691 + }, + { + "epoch": 0.9633819866378006, + "grad_norm": 0.3589060334851423, + "learning_rate": 8.582997868931371e-05, + "loss": 2.8851, + "step": 20692 + }, + { + "epoch": 0.9634285448238936, + "grad_norm": 0.36005984249300316, + "learning_rate": 8.582808933415436e-05, + "loss": 2.8702, + "step": 20693 + }, + { + "epoch": 0.9634751030099867, + "grad_norm": 0.3414378364122793, + "learning_rate": 8.58261998738431e-05, + "loss": 2.9312, + "step": 20694 + }, + { + "epoch": 0.9635216611960798, + "grad_norm": 0.36300985905868843, + "learning_rate": 8.582431030838547e-05, + "loss": 2.9219, + "step": 20695 + }, + { + "epoch": 0.9635682193821729, + "grad_norm": 0.31880721666913897, + "learning_rate": 8.582242063778703e-05, + "loss": 2.7268, + "step": 20696 + }, + { + "epoch": 0.963614777568266, + "grad_norm": 0.3332919923320486, + "learning_rate": 8.58205308620533e-05, + "loss": 2.8534, + "step": 20697 + }, + { + "epoch": 0.963661335754359, + "grad_norm": 0.3370500339698254, + "learning_rate": 8.581864098118986e-05, + "loss": 2.8307, + "step": 20698 + }, + { + "epoch": 0.9637078939404521, + "grad_norm": 0.327092872519398, + "learning_rate": 8.581675099520222e-05, + "loss": 2.9351, + "step": 20699 + }, + { + "epoch": 0.9637544521265452, + "grad_norm": 0.35891639228879474, + "learning_rate": 8.581486090409595e-05, + "loss": 2.8929, + "step": 20700 + }, + { + "epoch": 0.9638010103126382, + "grad_norm": 0.30567849702828515, + "learning_rate": 8.58129707078766e-05, + "loss": 2.9559, + "step": 20701 + }, + { + "epoch": 
0.9638475684987313, + "grad_norm": 0.3432836544308037, + "learning_rate": 8.581108040654969e-05, + "loss": 2.9739, + "step": 20702 + }, + { + "epoch": 0.9638941266848243, + "grad_norm": 0.30567932566668615, + "learning_rate": 8.58091900001208e-05, + "loss": 2.9702, + "step": 20703 + }, + { + "epoch": 0.9639406848709174, + "grad_norm": 0.35814979780270934, + "learning_rate": 8.580729948859548e-05, + "loss": 2.9991, + "step": 20704 + }, + { + "epoch": 0.9639872430570104, + "grad_norm": 0.30830125583625295, + "learning_rate": 8.580540887197922e-05, + "loss": 2.8439, + "step": 20705 + }, + { + "epoch": 0.9640338012431036, + "grad_norm": 0.32541480858316, + "learning_rate": 8.580351815027765e-05, + "loss": 2.8355, + "step": 20706 + }, + { + "epoch": 0.9640803594291967, + "grad_norm": 0.32552361270685004, + "learning_rate": 8.580162732349626e-05, + "loss": 2.9049, + "step": 20707 + }, + { + "epoch": 0.9641269176152897, + "grad_norm": 0.31786713198621996, + "learning_rate": 8.579973639164064e-05, + "loss": 2.9432, + "step": 20708 + }, + { + "epoch": 0.9641734758013828, + "grad_norm": 0.36150551368150546, + "learning_rate": 8.579784535471631e-05, + "loss": 2.8875, + "step": 20709 + }, + { + "epoch": 0.9642200339874758, + "grad_norm": 0.3091011222682979, + "learning_rate": 8.579595421272884e-05, + "loss": 2.7233, + "step": 20710 + }, + { + "epoch": 0.9642665921735689, + "grad_norm": 0.37135447717976056, + "learning_rate": 8.579406296568376e-05, + "loss": 2.8083, + "step": 20711 + }, + { + "epoch": 0.964313150359662, + "grad_norm": 0.3245368851201448, + "learning_rate": 8.579217161358663e-05, + "loss": 2.9829, + "step": 20712 + }, + { + "epoch": 0.964359708545755, + "grad_norm": 0.38040281685183247, + "learning_rate": 8.579028015644301e-05, + "loss": 2.938, + "step": 20713 + }, + { + "epoch": 0.9644062667318481, + "grad_norm": 0.34551309486649234, + "learning_rate": 8.578838859425844e-05, + "loss": 2.9503, + "step": 20714 + }, + { + "epoch": 0.9644528249179412, + "grad_norm": 0.349043507666252, + "learning_rate": 8.578649692703848e-05, + "loss": 2.8067, + "step": 20715 + }, + { + "epoch": 0.9644993831040343, + "grad_norm": 0.3455579121300185, + "learning_rate": 8.578460515478868e-05, + "loss": 2.7872, + "step": 20716 + }, + { + "epoch": 0.9645459412901274, + "grad_norm": 0.3140834234924703, + "learning_rate": 8.578271327751457e-05, + "loss": 2.8837, + "step": 20717 + }, + { + "epoch": 0.9645924994762204, + "grad_norm": 0.38549913463664104, + "learning_rate": 8.578082129522174e-05, + "loss": 2.8731, + "step": 20718 + }, + { + "epoch": 0.9646390576623135, + "grad_norm": 0.3612971213174608, + "learning_rate": 8.577892920791571e-05, + "loss": 2.8989, + "step": 20719 + }, + { + "epoch": 0.9646856158484065, + "grad_norm": 0.3692448394188595, + "learning_rate": 8.577703701560207e-05, + "loss": 2.8735, + "step": 20720 + }, + { + "epoch": 0.9647321740344996, + "grad_norm": 0.34893394078244383, + "learning_rate": 8.577514471828633e-05, + "loss": 2.9369, + "step": 20721 + }, + { + "epoch": 0.9647787322205927, + "grad_norm": 0.363138260379555, + "learning_rate": 8.577325231597407e-05, + "loss": 2.87, + "step": 20722 + }, + { + "epoch": 0.9648252904066857, + "grad_norm": 0.3386848852053313, + "learning_rate": 8.577135980867084e-05, + "loss": 2.7778, + "step": 20723 + }, + { + "epoch": 0.9648718485927789, + "grad_norm": 0.3400606569402675, + "learning_rate": 8.576946719638219e-05, + "loss": 2.8137, + "step": 20724 + }, + { + "epoch": 0.9649184067788719, + "grad_norm": 0.37231120205558565, + "learning_rate": 
8.576757447911369e-05, + "loss": 2.8112, + "step": 20725 + }, + { + "epoch": 0.964964964964965, + "grad_norm": 0.37739286382148757, + "learning_rate": 8.576568165687087e-05, + "loss": 2.8159, + "step": 20726 + }, + { + "epoch": 0.965011523151058, + "grad_norm": 0.3284849867293239, + "learning_rate": 8.57637887296593e-05, + "loss": 2.8411, + "step": 20727 + }, + { + "epoch": 0.9650580813371511, + "grad_norm": 0.3418888514202213, + "learning_rate": 8.576189569748455e-05, + "loss": 2.8182, + "step": 20728 + }, + { + "epoch": 0.9651046395232442, + "grad_norm": 0.31795157029772575, + "learning_rate": 8.576000256035214e-05, + "loss": 2.758, + "step": 20729 + }, + { + "epoch": 0.9651511977093372, + "grad_norm": 0.3457468931660656, + "learning_rate": 8.575810931826764e-05, + "loss": 2.8416, + "step": 20730 + }, + { + "epoch": 0.9651977558954303, + "grad_norm": 0.32927405478815813, + "learning_rate": 8.575621597123662e-05, + "loss": 2.9073, + "step": 20731 + }, + { + "epoch": 0.9652443140815233, + "grad_norm": 0.3690290797954039, + "learning_rate": 8.575432251926462e-05, + "loss": 2.7782, + "step": 20732 + }, + { + "epoch": 0.9652908722676165, + "grad_norm": 0.3254882320513208, + "learning_rate": 8.575242896235723e-05, + "loss": 2.8489, + "step": 20733 + }, + { + "epoch": 0.9653374304537096, + "grad_norm": 0.37355351324554836, + "learning_rate": 8.575053530051997e-05, + "loss": 2.8248, + "step": 20734 + }, + { + "epoch": 0.9653839886398026, + "grad_norm": 0.3109369479945637, + "learning_rate": 8.574864153375841e-05, + "loss": 2.9204, + "step": 20735 + }, + { + "epoch": 0.9654305468258957, + "grad_norm": 0.34064024251691005, + "learning_rate": 8.574674766207811e-05, + "loss": 2.835, + "step": 20736 + }, + { + "epoch": 0.9654771050119887, + "grad_norm": 0.34512006400376244, + "learning_rate": 8.574485368548463e-05, + "loss": 2.8854, + "step": 20737 + }, + { + "epoch": 0.9655236631980818, + "grad_norm": 0.3786016103761794, + "learning_rate": 8.574295960398352e-05, + "loss": 2.8832, + "step": 20738 + }, + { + "epoch": 0.9655702213841749, + "grad_norm": 0.333241405163458, + "learning_rate": 8.574106541758035e-05, + "loss": 2.8836, + "step": 20739 + }, + { + "epoch": 0.9656167795702679, + "grad_norm": 0.37237979726793335, + "learning_rate": 8.573917112628067e-05, + "loss": 2.8887, + "step": 20740 + }, + { + "epoch": 0.965663337756361, + "grad_norm": 0.33221283119824124, + "learning_rate": 8.573727673009004e-05, + "loss": 2.8008, + "step": 20741 + }, + { + "epoch": 0.965709895942454, + "grad_norm": 0.35255163174456355, + "learning_rate": 8.573538222901403e-05, + "loss": 2.7707, + "step": 20742 + }, + { + "epoch": 0.9657564541285472, + "grad_norm": 0.33448650568292176, + "learning_rate": 8.57334876230582e-05, + "loss": 2.8158, + "step": 20743 + }, + { + "epoch": 0.9658030123146403, + "grad_norm": 0.3249644564975812, + "learning_rate": 8.573159291222809e-05, + "loss": 2.7954, + "step": 20744 + }, + { + "epoch": 0.9658495705007333, + "grad_norm": 0.3324863052995632, + "learning_rate": 8.572969809652929e-05, + "loss": 2.8231, + "step": 20745 + }, + { + "epoch": 0.9658961286868264, + "grad_norm": 0.3312989843552103, + "learning_rate": 8.572780317596733e-05, + "loss": 2.8101, + "step": 20746 + }, + { + "epoch": 0.9659426868729194, + "grad_norm": 0.3343978102624465, + "learning_rate": 8.57259081505478e-05, + "loss": 2.9091, + "step": 20747 + }, + { + "epoch": 0.9659892450590125, + "grad_norm": 0.34225166414119956, + "learning_rate": 8.572401302027624e-05, + "loss": 2.8855, + "step": 20748 + }, + { + "epoch": 
0.9660358032451055, + "grad_norm": 0.3339364124575095, + "learning_rate": 8.572211778515822e-05, + "loss": 2.9491, + "step": 20749 + }, + { + "epoch": 0.9660823614311986, + "grad_norm": 0.3623233624053084, + "learning_rate": 8.57202224451993e-05, + "loss": 2.9627, + "step": 20750 + }, + { + "epoch": 0.9661289196172917, + "grad_norm": 0.34145265598105806, + "learning_rate": 8.571832700040506e-05, + "loss": 2.9312, + "step": 20751 + }, + { + "epoch": 0.9661754778033848, + "grad_norm": 0.31489704419354253, + "learning_rate": 8.571643145078106e-05, + "loss": 2.8645, + "step": 20752 + }, + { + "epoch": 0.9662220359894779, + "grad_norm": 0.36063580524667327, + "learning_rate": 8.571453579633282e-05, + "loss": 2.901, + "step": 20753 + }, + { + "epoch": 0.9662685941755709, + "grad_norm": 0.33210104659294204, + "learning_rate": 8.571264003706596e-05, + "loss": 2.8468, + "step": 20754 + }, + { + "epoch": 0.966315152361664, + "grad_norm": 0.36657236988243347, + "learning_rate": 8.571074417298601e-05, + "loss": 2.9127, + "step": 20755 + }, + { + "epoch": 0.9663617105477571, + "grad_norm": 0.3080699672395656, + "learning_rate": 8.570884820409855e-05, + "loss": 2.7372, + "step": 20756 + }, + { + "epoch": 0.9664082687338501, + "grad_norm": 0.35088412821446413, + "learning_rate": 8.570695213040914e-05, + "loss": 2.8037, + "step": 20757 + }, + { + "epoch": 0.9664548269199432, + "grad_norm": 0.3134226107596506, + "learning_rate": 8.570505595192333e-05, + "loss": 2.8055, + "step": 20758 + }, + { + "epoch": 0.9665013851060362, + "grad_norm": 0.35468885552974916, + "learning_rate": 8.57031596686467e-05, + "loss": 2.9795, + "step": 20759 + }, + { + "epoch": 0.9665479432921293, + "grad_norm": 0.3224036704111797, + "learning_rate": 8.570126328058483e-05, + "loss": 2.8732, + "step": 20760 + }, + { + "epoch": 0.9665945014782225, + "grad_norm": 0.3146941395765095, + "learning_rate": 8.569936678774327e-05, + "loss": 2.8658, + "step": 20761 + }, + { + "epoch": 0.9666410596643155, + "grad_norm": 0.34572704567398677, + "learning_rate": 8.569747019012754e-05, + "loss": 2.8226, + "step": 20762 + }, + { + "epoch": 0.9666876178504086, + "grad_norm": 0.31789272021475795, + "learning_rate": 8.569557348774331e-05, + "loss": 2.8862, + "step": 20763 + }, + { + "epoch": 0.9667341760365016, + "grad_norm": 0.369962314383866, + "learning_rate": 8.569367668059606e-05, + "loss": 2.801, + "step": 20764 + }, + { + "epoch": 0.9667807342225947, + "grad_norm": 0.3161818494281817, + "learning_rate": 8.569177976869138e-05, + "loss": 2.8299, + "step": 20765 + }, + { + "epoch": 0.9668272924086878, + "grad_norm": 0.33052308263581565, + "learning_rate": 8.568988275203487e-05, + "loss": 2.7015, + "step": 20766 + }, + { + "epoch": 0.9668738505947808, + "grad_norm": 0.3180230561758356, + "learning_rate": 8.568798563063205e-05, + "loss": 2.8774, + "step": 20767 + }, + { + "epoch": 0.9669204087808739, + "grad_norm": 0.34748711836002427, + "learning_rate": 8.56860884044885e-05, + "loss": 2.8357, + "step": 20768 + }, + { + "epoch": 0.9669669669669669, + "grad_norm": 0.3363538199008223, + "learning_rate": 8.56841910736098e-05, + "loss": 2.8162, + "step": 20769 + }, + { + "epoch": 0.96701352515306, + "grad_norm": 0.3271177780969272, + "learning_rate": 8.568229363800153e-05, + "loss": 2.7876, + "step": 20770 + }, + { + "epoch": 0.967060083339153, + "grad_norm": 0.3813429424161357, + "learning_rate": 8.568039609766924e-05, + "loss": 2.887, + "step": 20771 + }, + { + "epoch": 0.9671066415252462, + "grad_norm": 0.3422195919138774, + "learning_rate": 
8.56784984526185e-05, + "loss": 2.8472, + "step": 20772 + }, + { + "epoch": 0.9671531997113393, + "grad_norm": 0.363623820785602, + "learning_rate": 8.567660070285489e-05, + "loss": 2.8597, + "step": 20773 + }, + { + "epoch": 0.9671997578974323, + "grad_norm": 0.3360254657809843, + "learning_rate": 8.567470284838395e-05, + "loss": 2.9327, + "step": 20774 + }, + { + "epoch": 0.9672463160835254, + "grad_norm": 0.37158076377321975, + "learning_rate": 8.567280488921127e-05, + "loss": 2.8758, + "step": 20775 + }, + { + "epoch": 0.9672928742696184, + "grad_norm": 0.35993934614168815, + "learning_rate": 8.567090682534245e-05, + "loss": 2.9218, + "step": 20776 + }, + { + "epoch": 0.9673394324557115, + "grad_norm": 0.3295441361752, + "learning_rate": 8.566900865678301e-05, + "loss": 2.8814, + "step": 20777 + }, + { + "epoch": 0.9673859906418046, + "grad_norm": 0.33655773217785195, + "learning_rate": 8.566711038353855e-05, + "loss": 2.8598, + "step": 20778 + }, + { + "epoch": 0.9674325488278976, + "grad_norm": 0.34483331196182077, + "learning_rate": 8.566521200561464e-05, + "loss": 2.8961, + "step": 20779 + }, + { + "epoch": 0.9674791070139908, + "grad_norm": 0.31579242892312526, + "learning_rate": 8.566331352301684e-05, + "loss": 2.7524, + "step": 20780 + }, + { + "epoch": 0.9675256652000838, + "grad_norm": 0.38413627675687323, + "learning_rate": 8.566141493575072e-05, + "loss": 2.9305, + "step": 20781 + }, + { + "epoch": 0.9675722233861769, + "grad_norm": 0.3261634734489758, + "learning_rate": 8.565951624382188e-05, + "loss": 2.8458, + "step": 20782 + }, + { + "epoch": 0.96761878157227, + "grad_norm": 0.3567026095789885, + "learning_rate": 8.565761744723586e-05, + "loss": 2.8669, + "step": 20783 + }, + { + "epoch": 0.967665339758363, + "grad_norm": 0.3432755305522961, + "learning_rate": 8.565571854599825e-05, + "loss": 2.9289, + "step": 20784 + }, + { + "epoch": 0.9677118979444561, + "grad_norm": 0.33259554402734065, + "learning_rate": 8.565381954011462e-05, + "loss": 2.842, + "step": 20785 + }, + { + "epoch": 0.9677584561305491, + "grad_norm": 0.33633203391798744, + "learning_rate": 8.565192042959055e-05, + "loss": 2.8191, + "step": 20786 + }, + { + "epoch": 0.9678050143166422, + "grad_norm": 0.33624918633148376, + "learning_rate": 8.56500212144316e-05, + "loss": 2.9279, + "step": 20787 + }, + { + "epoch": 0.9678515725027353, + "grad_norm": 0.3090051933948791, + "learning_rate": 8.564812189464334e-05, + "loss": 2.8898, + "step": 20788 + }, + { + "epoch": 0.9678981306888284, + "grad_norm": 0.35236920276122274, + "learning_rate": 8.564622247023135e-05, + "loss": 2.9137, + "step": 20789 + }, + { + "epoch": 0.9679446888749215, + "grad_norm": 0.30538890535855306, + "learning_rate": 8.564432294120123e-05, + "loss": 2.8427, + "step": 20790 + }, + { + "epoch": 0.9679912470610145, + "grad_norm": 0.33605988151088956, + "learning_rate": 8.564242330755851e-05, + "loss": 3.0134, + "step": 20791 + }, + { + "epoch": 0.9680378052471076, + "grad_norm": 0.3136505324644103, + "learning_rate": 8.564052356930881e-05, + "loss": 2.988, + "step": 20792 + }, + { + "epoch": 0.9680843634332006, + "grad_norm": 0.30533681550079433, + "learning_rate": 8.563862372645769e-05, + "loss": 2.9658, + "step": 20793 + }, + { + "epoch": 0.9681309216192937, + "grad_norm": 0.32958265398830694, + "learning_rate": 8.56367237790107e-05, + "loss": 2.9741, + "step": 20794 + }, + { + "epoch": 0.9681774798053868, + "grad_norm": 0.3186483978076514, + "learning_rate": 8.563482372697345e-05, + "loss": 2.8677, + "step": 20795 + }, + { + "epoch": 
0.9682240379914798, + "grad_norm": 0.3392312596590509, + "learning_rate": 8.563292357035149e-05, + "loss": 2.8876, + "step": 20796 + }, + { + "epoch": 0.9682705961775729, + "grad_norm": 0.3105490099889369, + "learning_rate": 8.563102330915041e-05, + "loss": 2.8017, + "step": 20797 + }, + { + "epoch": 0.9683171543636659, + "grad_norm": 0.3278636869402463, + "learning_rate": 8.562912294337579e-05, + "loss": 2.8367, + "step": 20798 + }, + { + "epoch": 0.9683637125497591, + "grad_norm": 0.3249684411699693, + "learning_rate": 8.562722247303321e-05, + "loss": 2.8172, + "step": 20799 + }, + { + "epoch": 0.9684102707358522, + "grad_norm": 0.3089830925314542, + "learning_rate": 8.562532189812824e-05, + "loss": 2.825, + "step": 20800 + }, + { + "epoch": 0.9684568289219452, + "grad_norm": 0.3050031196033937, + "learning_rate": 8.562342121866645e-05, + "loss": 2.8864, + "step": 20801 + }, + { + "epoch": 0.9685033871080383, + "grad_norm": 0.31682779714275305, + "learning_rate": 8.562152043465343e-05, + "loss": 2.8092, + "step": 20802 + }, + { + "epoch": 0.9685499452941313, + "grad_norm": 0.29115564734924276, + "learning_rate": 8.561961954609477e-05, + "loss": 2.8811, + "step": 20803 + }, + { + "epoch": 0.9685965034802244, + "grad_norm": 0.32864764638899435, + "learning_rate": 8.561771855299602e-05, + "loss": 2.8376, + "step": 20804 + }, + { + "epoch": 0.9686430616663175, + "grad_norm": 0.32322289760035494, + "learning_rate": 8.56158174553628e-05, + "loss": 2.8338, + "step": 20805 + }, + { + "epoch": 0.9686896198524105, + "grad_norm": 0.3473994442460977, + "learning_rate": 8.561391625320065e-05, + "loss": 2.8257, + "step": 20806 + }, + { + "epoch": 0.9687361780385036, + "grad_norm": 0.35138915842710106, + "learning_rate": 8.561201494651516e-05, + "loss": 2.9037, + "step": 20807 + }, + { + "epoch": 0.9687827362245967, + "grad_norm": 0.3570039099644014, + "learning_rate": 8.561011353531192e-05, + "loss": 2.9388, + "step": 20808 + }, + { + "epoch": 0.9688292944106898, + "grad_norm": 0.37618559495726417, + "learning_rate": 8.56082120195965e-05, + "loss": 2.774, + "step": 20809 + }, + { + "epoch": 0.9688758525967829, + "grad_norm": 0.34700156164643536, + "learning_rate": 8.560631039937449e-05, + "loss": 2.9381, + "step": 20810 + }, + { + "epoch": 0.9689224107828759, + "grad_norm": 0.38031767776550957, + "learning_rate": 8.560440867465147e-05, + "loss": 2.8043, + "step": 20811 + }, + { + "epoch": 0.968968968968969, + "grad_norm": 0.33798856521030324, + "learning_rate": 8.560250684543301e-05, + "loss": 2.868, + "step": 20812 + }, + { + "epoch": 0.969015527155062, + "grad_norm": 0.35236585105307083, + "learning_rate": 8.560060491172471e-05, + "loss": 2.8754, + "step": 20813 + }, + { + "epoch": 0.9690620853411551, + "grad_norm": 0.33798864206042284, + "learning_rate": 8.559870287353214e-05, + "loss": 2.9149, + "step": 20814 + }, + { + "epoch": 0.9691086435272481, + "grad_norm": 0.32460548613501455, + "learning_rate": 8.559680073086087e-05, + "loss": 2.9315, + "step": 20815 + }, + { + "epoch": 0.9691552017133412, + "grad_norm": 0.3468468532567043, + "learning_rate": 8.55948984837165e-05, + "loss": 3.0131, + "step": 20816 + }, + { + "epoch": 0.9692017598994344, + "grad_norm": 0.32351354528960546, + "learning_rate": 8.559299613210462e-05, + "loss": 2.8334, + "step": 20817 + }, + { + "epoch": 0.9692483180855274, + "grad_norm": 0.3411493852817333, + "learning_rate": 8.55910936760308e-05, + "loss": 3.015, + "step": 20818 + }, + { + "epoch": 0.9692948762716205, + "grad_norm": 0.40011979531144304, + "learning_rate": 
8.558919111550062e-05, + "loss": 2.8768, + "step": 20819 + }, + { + "epoch": 0.9693414344577135, + "grad_norm": 0.35061917936714887, + "learning_rate": 8.558728845051967e-05, + "loss": 2.906, + "step": 20820 + }, + { + "epoch": 0.9693879926438066, + "grad_norm": 0.36116481976621667, + "learning_rate": 8.558538568109355e-05, + "loss": 2.9276, + "step": 20821 + }, + { + "epoch": 0.9694345508298997, + "grad_norm": 0.3892114722271572, + "learning_rate": 8.558348280722782e-05, + "loss": 2.8817, + "step": 20822 + }, + { + "epoch": 0.9694811090159927, + "grad_norm": 0.3668306878587333, + "learning_rate": 8.558157982892807e-05, + "loss": 2.8761, + "step": 20823 + }, + { + "epoch": 0.9695276672020858, + "grad_norm": 0.37072357764213176, + "learning_rate": 8.557967674619989e-05, + "loss": 2.8742, + "step": 20824 + }, + { + "epoch": 0.9695742253881788, + "grad_norm": 0.3404189410288083, + "learning_rate": 8.557777355904886e-05, + "loss": 2.702, + "step": 20825 + }, + { + "epoch": 0.969620783574272, + "grad_norm": 0.3421809978080372, + "learning_rate": 8.557587026748058e-05, + "loss": 2.8673, + "step": 20826 + }, + { + "epoch": 0.9696673417603651, + "grad_norm": 0.3329906972693783, + "learning_rate": 8.557396687150062e-05, + "loss": 2.9611, + "step": 20827 + }, + { + "epoch": 0.9697138999464581, + "grad_norm": 0.35489112148682517, + "learning_rate": 8.557206337111456e-05, + "loss": 2.9641, + "step": 20828 + }, + { + "epoch": 0.9697604581325512, + "grad_norm": 0.3331561462647982, + "learning_rate": 8.557015976632801e-05, + "loss": 2.8564, + "step": 20829 + }, + { + "epoch": 0.9698070163186442, + "grad_norm": 0.3687430072965838, + "learning_rate": 8.556825605714656e-05, + "loss": 2.7722, + "step": 20830 + }, + { + "epoch": 0.9698535745047373, + "grad_norm": 0.3518172574840536, + "learning_rate": 8.556635224357577e-05, + "loss": 2.8158, + "step": 20831 + }, + { + "epoch": 0.9699001326908304, + "grad_norm": 0.3352194733485221, + "learning_rate": 8.556444832562123e-05, + "loss": 2.7512, + "step": 20832 + }, + { + "epoch": 0.9699466908769234, + "grad_norm": 0.3588794566596978, + "learning_rate": 8.556254430328854e-05, + "loss": 2.7892, + "step": 20833 + }, + { + "epoch": 0.9699932490630165, + "grad_norm": 0.3693866484483682, + "learning_rate": 8.55606401765833e-05, + "loss": 2.8251, + "step": 20834 + }, + { + "epoch": 0.9700398072491095, + "grad_norm": 0.3376502457849711, + "learning_rate": 8.555873594551107e-05, + "loss": 2.8187, + "step": 20835 + }, + { + "epoch": 0.9700863654352027, + "grad_norm": 0.3660463028937372, + "learning_rate": 8.555683161007746e-05, + "loss": 2.9719, + "step": 20836 + }, + { + "epoch": 0.9701329236212957, + "grad_norm": 0.3247636772215601, + "learning_rate": 8.555492717028804e-05, + "loss": 2.8414, + "step": 20837 + }, + { + "epoch": 0.9701794818073888, + "grad_norm": 0.3668907508210498, + "learning_rate": 8.555302262614843e-05, + "loss": 2.8655, + "step": 20838 + }, + { + "epoch": 0.9702260399934819, + "grad_norm": 0.35075288559178397, + "learning_rate": 8.55511179776642e-05, + "loss": 2.9479, + "step": 20839 + }, + { + "epoch": 0.9702725981795749, + "grad_norm": 0.34036918624837, + "learning_rate": 8.554921322484093e-05, + "loss": 2.9036, + "step": 20840 + }, + { + "epoch": 0.970319156365668, + "grad_norm": 0.3664696092269676, + "learning_rate": 8.554730836768422e-05, + "loss": 2.7939, + "step": 20841 + }, + { + "epoch": 0.970365714551761, + "grad_norm": 0.32102406137253264, + "learning_rate": 8.554540340619968e-05, + "loss": 2.8234, + "step": 20842 + }, + { + "epoch": 
0.9704122727378541, + "grad_norm": 0.3813606796588641, + "learning_rate": 8.554349834039285e-05, + "loss": 2.8656, + "step": 20843 + }, + { + "epoch": 0.9704588309239472, + "grad_norm": 0.31751886066203433, + "learning_rate": 8.554159317026939e-05, + "loss": 2.7644, + "step": 20844 + }, + { + "epoch": 0.9705053891100403, + "grad_norm": 0.33719546960566127, + "learning_rate": 8.553968789583484e-05, + "loss": 2.791, + "step": 20845 + }, + { + "epoch": 0.9705519472961334, + "grad_norm": 0.35142116971317, + "learning_rate": 8.553778251709481e-05, + "loss": 2.8818, + "step": 20846 + }, + { + "epoch": 0.9705985054822264, + "grad_norm": 0.32604583728574243, + "learning_rate": 8.553587703405488e-05, + "loss": 2.8833, + "step": 20847 + }, + { + "epoch": 0.9706450636683195, + "grad_norm": 0.3396554098292343, + "learning_rate": 8.553397144672067e-05, + "loss": 2.8529, + "step": 20848 + }, + { + "epoch": 0.9706916218544126, + "grad_norm": 0.3369886209586169, + "learning_rate": 8.553206575509773e-05, + "loss": 2.8939, + "step": 20849 + }, + { + "epoch": 0.9707381800405056, + "grad_norm": 0.3284174592790125, + "learning_rate": 8.553015995919169e-05, + "loss": 2.9029, + "step": 20850 + }, + { + "epoch": 0.9707847382265987, + "grad_norm": 0.33882439002858394, + "learning_rate": 8.552825405900812e-05, + "loss": 2.8829, + "step": 20851 + }, + { + "epoch": 0.9708312964126917, + "grad_norm": 0.3209841314313076, + "learning_rate": 8.552634805455263e-05, + "loss": 2.8054, + "step": 20852 + }, + { + "epoch": 0.9708778545987848, + "grad_norm": 0.3204717659780003, + "learning_rate": 8.552444194583083e-05, + "loss": 2.8561, + "step": 20853 + }, + { + "epoch": 0.970924412784878, + "grad_norm": 0.32438831607926155, + "learning_rate": 8.552253573284827e-05, + "loss": 2.9137, + "step": 20854 + }, + { + "epoch": 0.970970970970971, + "grad_norm": 0.33179170253364043, + "learning_rate": 8.552062941561058e-05, + "loss": 2.7891, + "step": 20855 + }, + { + "epoch": 0.9710175291570641, + "grad_norm": 0.3315584450510875, + "learning_rate": 8.551872299412334e-05, + "loss": 2.7774, + "step": 20856 + }, + { + "epoch": 0.9710640873431571, + "grad_norm": 0.32216805077284455, + "learning_rate": 8.551681646839215e-05, + "loss": 2.8744, + "step": 20857 + }, + { + "epoch": 0.9711106455292502, + "grad_norm": 0.34360677097016523, + "learning_rate": 8.551490983842259e-05, + "loss": 2.9683, + "step": 20858 + }, + { + "epoch": 0.9711572037153432, + "grad_norm": 0.32063386581125003, + "learning_rate": 8.551300310422028e-05, + "loss": 2.9396, + "step": 20859 + }, + { + "epoch": 0.9712037619014363, + "grad_norm": 0.34849555226005624, + "learning_rate": 8.551109626579079e-05, + "loss": 2.9085, + "step": 20860 + }, + { + "epoch": 0.9712503200875294, + "grad_norm": 0.32428788567214506, + "learning_rate": 8.550918932313974e-05, + "loss": 2.94, + "step": 20861 + }, + { + "epoch": 0.9712968782736224, + "grad_norm": 0.34123281207931866, + "learning_rate": 8.550728227627272e-05, + "loss": 2.9289, + "step": 20862 + }, + { + "epoch": 0.9713434364597155, + "grad_norm": 0.31074920415017954, + "learning_rate": 8.550537512519531e-05, + "loss": 2.9434, + "step": 20863 + }, + { + "epoch": 0.9713899946458086, + "grad_norm": 0.3198444348922377, + "learning_rate": 8.550346786991314e-05, + "loss": 2.8523, + "step": 20864 + }, + { + "epoch": 0.9714365528319017, + "grad_norm": 0.3005400863903499, + "learning_rate": 8.550156051043179e-05, + "loss": 2.8117, + "step": 20865 + }, + { + "epoch": 0.9714831110179948, + "grad_norm": 0.33011214801121963, + "learning_rate": 
8.549965304675685e-05, + "loss": 2.7372, + "step": 20866 + }, + { + "epoch": 0.9715296692040878, + "grad_norm": 0.29997149293843756, + "learning_rate": 8.549774547889393e-05, + "loss": 2.7205, + "step": 20867 + }, + { + "epoch": 0.9715762273901809, + "grad_norm": 0.34091832386763604, + "learning_rate": 8.549583780684862e-05, + "loss": 2.8031, + "step": 20868 + }, + { + "epoch": 0.9716227855762739, + "grad_norm": 0.3547915973835059, + "learning_rate": 8.549393003062652e-05, + "loss": 2.9657, + "step": 20869 + }, + { + "epoch": 0.971669343762367, + "grad_norm": 0.31098578042592484, + "learning_rate": 8.549202215023323e-05, + "loss": 2.9934, + "step": 20870 + }, + { + "epoch": 0.9717159019484601, + "grad_norm": 0.3004697955057603, + "learning_rate": 8.549011416567436e-05, + "loss": 2.8477, + "step": 20871 + }, + { + "epoch": 0.9717624601345531, + "grad_norm": 0.33096014968692355, + "learning_rate": 8.548820607695549e-05, + "loss": 2.8935, + "step": 20872 + }, + { + "epoch": 0.9718090183206463, + "grad_norm": 0.32996169404041964, + "learning_rate": 8.548629788408224e-05, + "loss": 2.8716, + "step": 20873 + }, + { + "epoch": 0.9718555765067393, + "grad_norm": 0.3283999558382887, + "learning_rate": 8.548438958706021e-05, + "loss": 2.9443, + "step": 20874 + }, + { + "epoch": 0.9719021346928324, + "grad_norm": 0.3307119592330863, + "learning_rate": 8.548248118589498e-05, + "loss": 2.9178, + "step": 20875 + }, + { + "epoch": 0.9719486928789255, + "grad_norm": 0.32094826241832797, + "learning_rate": 8.548057268059217e-05, + "loss": 2.878, + "step": 20876 + }, + { + "epoch": 0.9719952510650185, + "grad_norm": 0.3152874046465982, + "learning_rate": 8.547866407115737e-05, + "loss": 2.8992, + "step": 20877 + }, + { + "epoch": 0.9720418092511116, + "grad_norm": 0.34643363818931994, + "learning_rate": 8.547675535759618e-05, + "loss": 2.8059, + "step": 20878 + }, + { + "epoch": 0.9720883674372046, + "grad_norm": 0.3456274718902542, + "learning_rate": 8.547484653991422e-05, + "loss": 2.902, + "step": 20879 + }, + { + "epoch": 0.9721349256232977, + "grad_norm": 0.37075949938983127, + "learning_rate": 8.547293761811708e-05, + "loss": 2.9366, + "step": 20880 + }, + { + "epoch": 0.9721814838093907, + "grad_norm": 0.3398811977218863, + "learning_rate": 8.547102859221037e-05, + "loss": 2.846, + "step": 20881 + }, + { + "epoch": 0.9722280419954838, + "grad_norm": 0.32733352257019815, + "learning_rate": 8.546911946219966e-05, + "loss": 2.8704, + "step": 20882 + }, + { + "epoch": 0.972274600181577, + "grad_norm": 0.3183229798675899, + "learning_rate": 8.54672102280906e-05, + "loss": 2.8617, + "step": 20883 + }, + { + "epoch": 0.97232115836767, + "grad_norm": 0.3284712561875822, + "learning_rate": 8.546530088988875e-05, + "loss": 2.8385, + "step": 20884 + }, + { + "epoch": 0.9723677165537631, + "grad_norm": 0.3502224254486904, + "learning_rate": 8.546339144759976e-05, + "loss": 2.9297, + "step": 20885 + }, + { + "epoch": 0.9724142747398561, + "grad_norm": 0.33647814019524247, + "learning_rate": 8.54614819012292e-05, + "loss": 2.9433, + "step": 20886 + }, + { + "epoch": 0.9724608329259492, + "grad_norm": 0.3408503034697656, + "learning_rate": 8.54595722507827e-05, + "loss": 2.8003, + "step": 20887 + }, + { + "epoch": 0.9725073911120423, + "grad_norm": 0.3156105498743426, + "learning_rate": 8.545766249626582e-05, + "loss": 2.8364, + "step": 20888 + }, + { + "epoch": 0.9725539492981353, + "grad_norm": 0.3665426862369777, + "learning_rate": 8.545575263768422e-05, + "loss": 2.9388, + "step": 20889 + }, + { + "epoch": 
0.9726005074842284, + "grad_norm": 0.3193936871897858, + "learning_rate": 8.545384267504346e-05, + "loss": 2.8907, + "step": 20890 + }, + { + "epoch": 0.9726470656703214, + "grad_norm": 0.33457084723351577, + "learning_rate": 8.545193260834917e-05, + "loss": 2.8768, + "step": 20891 + }, + { + "epoch": 0.9726936238564146, + "grad_norm": 0.3374405346782143, + "learning_rate": 8.545002243760695e-05, + "loss": 2.8598, + "step": 20892 + }, + { + "epoch": 0.9727401820425077, + "grad_norm": 0.32706522369357177, + "learning_rate": 8.54481121628224e-05, + "loss": 2.834, + "step": 20893 + }, + { + "epoch": 0.9727867402286007, + "grad_norm": 0.33281112712911864, + "learning_rate": 8.544620178400115e-05, + "loss": 2.857, + "step": 20894 + }, + { + "epoch": 0.9728332984146938, + "grad_norm": 0.3221043770603296, + "learning_rate": 8.544429130114877e-05, + "loss": 2.9501, + "step": 20895 + }, + { + "epoch": 0.9728798566007868, + "grad_norm": 0.40224701019201387, + "learning_rate": 8.544238071427091e-05, + "loss": 2.847, + "step": 20896 + }, + { + "epoch": 0.9729264147868799, + "grad_norm": 0.3828474462907982, + "learning_rate": 8.544047002337314e-05, + "loss": 2.9284, + "step": 20897 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.39051106950682046, + "learning_rate": 8.543855922846108e-05, + "loss": 2.9247, + "step": 20898 + }, + { + "epoch": 0.973019531159066, + "grad_norm": 0.41357722923174717, + "learning_rate": 8.543664832954035e-05, + "loss": 2.8368, + "step": 20899 + }, + { + "epoch": 0.9730660893451591, + "grad_norm": 0.3869567804823794, + "learning_rate": 8.543473732661653e-05, + "loss": 2.979, + "step": 20900 + }, + { + "epoch": 0.9731126475312522, + "grad_norm": 0.3846623560213731, + "learning_rate": 8.543282621969526e-05, + "loss": 2.82, + "step": 20901 + }, + { + "epoch": 0.9731592057173453, + "grad_norm": 0.3431541581259698, + "learning_rate": 8.543091500878214e-05, + "loss": 2.8074, + "step": 20902 + }, + { + "epoch": 0.9732057639034383, + "grad_norm": 0.38118476365233195, + "learning_rate": 8.542900369388278e-05, + "loss": 2.8534, + "step": 20903 + }, + { + "epoch": 0.9732523220895314, + "grad_norm": 0.3354070166054271, + "learning_rate": 8.542709227500276e-05, + "loss": 2.9285, + "step": 20904 + }, + { + "epoch": 0.9732988802756245, + "grad_norm": 0.32686829186000993, + "learning_rate": 8.542518075214774e-05, + "loss": 2.7494, + "step": 20905 + }, + { + "epoch": 0.9733454384617175, + "grad_norm": 0.3288995675595601, + "learning_rate": 8.542326912532328e-05, + "loss": 2.9094, + "step": 20906 + }, + { + "epoch": 0.9733919966478106, + "grad_norm": 0.330509135093098, + "learning_rate": 8.542135739453504e-05, + "loss": 2.9402, + "step": 20907 + }, + { + "epoch": 0.9734385548339036, + "grad_norm": 0.32839694311346435, + "learning_rate": 8.541944555978858e-05, + "loss": 2.9317, + "step": 20908 + }, + { + "epoch": 0.9734851130199967, + "grad_norm": 0.29541169402538087, + "learning_rate": 8.541753362108955e-05, + "loss": 2.8234, + "step": 20909 + }, + { + "epoch": 0.9735316712060899, + "grad_norm": 0.3648602227393769, + "learning_rate": 8.541562157844355e-05, + "loss": 2.7701, + "step": 20910 + }, + { + "epoch": 0.9735782293921829, + "grad_norm": 0.32442515489312834, + "learning_rate": 8.541370943185619e-05, + "loss": 2.8036, + "step": 20911 + }, + { + "epoch": 0.973624787578276, + "grad_norm": 0.35715160050644035, + "learning_rate": 8.541179718133307e-05, + "loss": 2.8747, + "step": 20912 + }, + { + "epoch": 0.973671345764369, + "grad_norm": 0.3489339719194269, + "learning_rate": 
8.540988482687984e-05, + "loss": 2.88, + "step": 20913 + }, + { + "epoch": 0.9737179039504621, + "grad_norm": 0.34119434932019477, + "learning_rate": 8.540797236850206e-05, + "loss": 2.9225, + "step": 20914 + }, + { + "epoch": 0.9737644621365552, + "grad_norm": 0.3484224909537027, + "learning_rate": 8.540605980620537e-05, + "loss": 2.832, + "step": 20915 + }, + { + "epoch": 0.9738110203226482, + "grad_norm": 0.3615902363374259, + "learning_rate": 8.54041471399954e-05, + "loss": 2.8192, + "step": 20916 + }, + { + "epoch": 0.9738575785087413, + "grad_norm": 0.3240790659659338, + "learning_rate": 8.540223436987773e-05, + "loss": 2.8106, + "step": 20917 + }, + { + "epoch": 0.9739041366948343, + "grad_norm": 0.3754809747867899, + "learning_rate": 8.540032149585801e-05, + "loss": 2.8938, + "step": 20918 + }, + { + "epoch": 0.9739506948809274, + "grad_norm": 0.3695084943508574, + "learning_rate": 8.539840851794182e-05, + "loss": 2.9322, + "step": 20919 + }, + { + "epoch": 0.9739972530670206, + "grad_norm": 0.35762304232474834, + "learning_rate": 8.539649543613479e-05, + "loss": 2.8841, + "step": 20920 + }, + { + "epoch": 0.9740438112531136, + "grad_norm": 0.36285369324176125, + "learning_rate": 8.539458225044254e-05, + "loss": 2.8174, + "step": 20921 + }, + { + "epoch": 0.9740903694392067, + "grad_norm": 0.3845508137621112, + "learning_rate": 8.539266896087066e-05, + "loss": 2.8013, + "step": 20922 + }, + { + "epoch": 0.9741369276252997, + "grad_norm": 0.3706746847547092, + "learning_rate": 8.53907555674248e-05, + "loss": 2.8549, + "step": 20923 + }, + { + "epoch": 0.9741834858113928, + "grad_norm": 0.34696659895102194, + "learning_rate": 8.538884207011056e-05, + "loss": 2.9007, + "step": 20924 + }, + { + "epoch": 0.9742300439974858, + "grad_norm": 0.3971016311052463, + "learning_rate": 8.538692846893353e-05, + "loss": 2.8651, + "step": 20925 + }, + { + "epoch": 0.9742766021835789, + "grad_norm": 0.3677927987600632, + "learning_rate": 8.538501476389937e-05, + "loss": 2.9275, + "step": 20926 + }, + { + "epoch": 0.974323160369672, + "grad_norm": 0.34797370669021266, + "learning_rate": 8.538310095501368e-05, + "loss": 2.8968, + "step": 20927 + }, + { + "epoch": 0.974369718555765, + "grad_norm": 0.32329044545807495, + "learning_rate": 8.538118704228207e-05, + "loss": 2.9425, + "step": 20928 + }, + { + "epoch": 0.9744162767418582, + "grad_norm": 0.3300212763514148, + "learning_rate": 8.537927302571016e-05, + "loss": 2.9745, + "step": 20929 + }, + { + "epoch": 0.9744628349279512, + "grad_norm": 0.34756280572341053, + "learning_rate": 8.537735890530358e-05, + "loss": 2.9098, + "step": 20930 + }, + { + "epoch": 0.9745093931140443, + "grad_norm": 0.31551475498452364, + "learning_rate": 8.537544468106793e-05, + "loss": 2.9821, + "step": 20931 + }, + { + "epoch": 0.9745559513001374, + "grad_norm": 0.34185591233455465, + "learning_rate": 8.537353035300883e-05, + "loss": 2.7776, + "step": 20932 + }, + { + "epoch": 0.9746025094862304, + "grad_norm": 0.35494648882669844, + "learning_rate": 8.53716159211319e-05, + "loss": 2.9212, + "step": 20933 + }, + { + "epoch": 0.9746490676723235, + "grad_norm": 0.338233862454306, + "learning_rate": 8.536970138544278e-05, + "loss": 2.7727, + "step": 20934 + }, + { + "epoch": 0.9746956258584165, + "grad_norm": 0.3395435583653016, + "learning_rate": 8.536778674594705e-05, + "loss": 2.8833, + "step": 20935 + }, + { + "epoch": 0.9747421840445096, + "grad_norm": 0.3409853274619098, + "learning_rate": 8.536587200265036e-05, + "loss": 2.8311, + "step": 20936 + }, + { + "epoch": 
0.9747887422306027, + "grad_norm": 0.3053545962018944, + "learning_rate": 8.536395715555833e-05, + "loss": 3.0242, + "step": 20937 + }, + { + "epoch": 0.9748353004166957, + "grad_norm": 0.3349413830612044, + "learning_rate": 8.536204220467656e-05, + "loss": 2.8583, + "step": 20938 + }, + { + "epoch": 0.9748818586027889, + "grad_norm": 0.2923843454858605, + "learning_rate": 8.536012715001068e-05, + "loss": 2.8731, + "step": 20939 + }, + { + "epoch": 0.9749284167888819, + "grad_norm": 0.32522847412114025, + "learning_rate": 8.53582119915663e-05, + "loss": 2.769, + "step": 20940 + }, + { + "epoch": 0.974974974974975, + "grad_norm": 0.323175435488749, + "learning_rate": 8.535629672934907e-05, + "loss": 2.8304, + "step": 20941 + }, + { + "epoch": 0.9750215331610681, + "grad_norm": 0.31109902573793863, + "learning_rate": 8.535438136336459e-05, + "loss": 2.8585, + "step": 20942 + }, + { + "epoch": 0.9750680913471611, + "grad_norm": 0.3350597273138438, + "learning_rate": 8.535246589361848e-05, + "loss": 2.8573, + "step": 20943 + }, + { + "epoch": 0.9751146495332542, + "grad_norm": 0.34185106588753367, + "learning_rate": 8.535055032011636e-05, + "loss": 2.8841, + "step": 20944 + }, + { + "epoch": 0.9751612077193472, + "grad_norm": 0.320380794268065, + "learning_rate": 8.534863464286384e-05, + "loss": 3.0253, + "step": 20945 + }, + { + "epoch": 0.9752077659054403, + "grad_norm": 0.33691371813876314, + "learning_rate": 8.534671886186659e-05, + "loss": 2.8821, + "step": 20946 + }, + { + "epoch": 0.9752543240915333, + "grad_norm": 0.3293213595724989, + "learning_rate": 8.534480297713018e-05, + "loss": 2.8844, + "step": 20947 + }, + { + "epoch": 0.9753008822776265, + "grad_norm": 0.36036682087555794, + "learning_rate": 8.534288698866029e-05, + "loss": 2.861, + "step": 20948 + }, + { + "epoch": 0.9753474404637196, + "grad_norm": 0.35382810916870383, + "learning_rate": 8.534097089646246e-05, + "loss": 2.9076, + "step": 20949 + }, + { + "epoch": 0.9753939986498126, + "grad_norm": 0.3356957027346903, + "learning_rate": 8.53390547005424e-05, + "loss": 2.856, + "step": 20950 + }, + { + "epoch": 0.9754405568359057, + "grad_norm": 0.3449091314755754, + "learning_rate": 8.533713840090569e-05, + "loss": 2.8522, + "step": 20951 + }, + { + "epoch": 0.9754871150219987, + "grad_norm": 0.34557502709205046, + "learning_rate": 8.533522199755796e-05, + "loss": 2.9514, + "step": 20952 + }, + { + "epoch": 0.9755336732080918, + "grad_norm": 0.37171607360562986, + "learning_rate": 8.533330549050482e-05, + "loss": 2.7969, + "step": 20953 + }, + { + "epoch": 0.9755802313941849, + "grad_norm": 0.3449877274116053, + "learning_rate": 8.533138887975191e-05, + "loss": 2.9001, + "step": 20954 + }, + { + "epoch": 0.9756267895802779, + "grad_norm": 0.38231397614167817, + "learning_rate": 8.532947216530487e-05, + "loss": 2.8207, + "step": 20955 + }, + { + "epoch": 0.975673347766371, + "grad_norm": 0.32143898373737906, + "learning_rate": 8.532755534716931e-05, + "loss": 2.9516, + "step": 20956 + }, + { + "epoch": 0.975719905952464, + "grad_norm": 0.31463572511368304, + "learning_rate": 8.532563842535083e-05, + "loss": 2.8105, + "step": 20957 + }, + { + "epoch": 0.9757664641385572, + "grad_norm": 0.337803927408742, + "learning_rate": 8.532372139985512e-05, + "loss": 2.8172, + "step": 20958 + }, + { + "epoch": 0.9758130223246503, + "grad_norm": 0.34334754637401854, + "learning_rate": 8.532180427068773e-05, + "loss": 2.9859, + "step": 20959 + }, + { + "epoch": 0.9758595805107433, + "grad_norm": 0.3161259432336596, + "learning_rate": 
8.531988703785436e-05, + "loss": 2.8351, + "step": 20960 + }, + { + "epoch": 0.9759061386968364, + "grad_norm": 0.30756430359095316, + "learning_rate": 8.531796970136057e-05, + "loss": 2.8014, + "step": 20961 + }, + { + "epoch": 0.9759526968829294, + "grad_norm": 0.31139212411283124, + "learning_rate": 8.531605226121202e-05, + "loss": 2.8768, + "step": 20962 + }, + { + "epoch": 0.9759992550690225, + "grad_norm": 0.345178692909235, + "learning_rate": 8.531413471741435e-05, + "loss": 2.8473, + "step": 20963 + }, + { + "epoch": 0.9760458132551156, + "grad_norm": 0.3312399217747519, + "learning_rate": 8.531221706997316e-05, + "loss": 2.8968, + "step": 20964 + }, + { + "epoch": 0.9760923714412086, + "grad_norm": 0.32280752474576246, + "learning_rate": 8.531029931889411e-05, + "loss": 2.8803, + "step": 20965 + }, + { + "epoch": 0.9761389296273018, + "grad_norm": 0.3297146475495531, + "learning_rate": 8.530838146418281e-05, + "loss": 2.9511, + "step": 20966 + }, + { + "epoch": 0.9761854878133948, + "grad_norm": 0.33063442093166123, + "learning_rate": 8.530646350584488e-05, + "loss": 2.8498, + "step": 20967 + }, + { + "epoch": 0.9762320459994879, + "grad_norm": 0.3470600850864621, + "learning_rate": 8.530454544388595e-05, + "loss": 2.9156, + "step": 20968 + }, + { + "epoch": 0.9762786041855809, + "grad_norm": 0.33307565636697156, + "learning_rate": 8.530262727831167e-05, + "loss": 2.9023, + "step": 20969 + }, + { + "epoch": 0.976325162371674, + "grad_norm": 0.3045412390000974, + "learning_rate": 8.530070900912765e-05, + "loss": 2.7784, + "step": 20970 + }, + { + "epoch": 0.9763717205577671, + "grad_norm": 0.33147691373701715, + "learning_rate": 8.529879063633953e-05, + "loss": 2.8049, + "step": 20971 + }, + { + "epoch": 0.9764182787438601, + "grad_norm": 0.3177615298439474, + "learning_rate": 8.529687215995293e-05, + "loss": 2.9533, + "step": 20972 + }, + { + "epoch": 0.9764648369299532, + "grad_norm": 0.35344660743503786, + "learning_rate": 8.52949535799735e-05, + "loss": 2.9578, + "step": 20973 + }, + { + "epoch": 0.9765113951160462, + "grad_norm": 0.32673364766604984, + "learning_rate": 8.529303489640685e-05, + "loss": 2.8231, + "step": 20974 + }, + { + "epoch": 0.9765579533021393, + "grad_norm": 0.3773905032559562, + "learning_rate": 8.529111610925862e-05, + "loss": 2.9818, + "step": 20975 + }, + { + "epoch": 0.9766045114882325, + "grad_norm": 0.3306299922757189, + "learning_rate": 8.528919721853445e-05, + "loss": 2.8521, + "step": 20976 + }, + { + "epoch": 0.9766510696743255, + "grad_norm": 0.33838212419863967, + "learning_rate": 8.528727822423994e-05, + "loss": 2.9559, + "step": 20977 + }, + { + "epoch": 0.9766976278604186, + "grad_norm": 0.3302688275283416, + "learning_rate": 8.528535912638078e-05, + "loss": 2.9319, + "step": 20978 + }, + { + "epoch": 0.9767441860465116, + "grad_norm": 0.3253678564202682, + "learning_rate": 8.528343992496254e-05, + "loss": 2.8383, + "step": 20979 + }, + { + "epoch": 0.9767907442326047, + "grad_norm": 0.3513968992671408, + "learning_rate": 8.528152061999088e-05, + "loss": 2.899, + "step": 20980 + }, + { + "epoch": 0.9768373024186978, + "grad_norm": 0.33883435895966774, + "learning_rate": 8.527960121147144e-05, + "loss": 2.841, + "step": 20981 + }, + { + "epoch": 0.9768838606047908, + "grad_norm": 0.3197526087867352, + "learning_rate": 8.527768169940986e-05, + "loss": 2.8619, + "step": 20982 + }, + { + "epoch": 0.9769304187908839, + "grad_norm": 0.34471291605250565, + "learning_rate": 8.527576208381175e-05, + "loss": 2.8213, + "step": 20983 + }, + { + 
"epoch": 0.9769769769769769, + "grad_norm": 0.33267495096895355, + "learning_rate": 8.527384236468274e-05, + "loss": 2.869, + "step": 20984 + }, + { + "epoch": 0.9770235351630701, + "grad_norm": 0.3314514139834699, + "learning_rate": 8.527192254202848e-05, + "loss": 2.8592, + "step": 20985 + }, + { + "epoch": 0.9770700933491632, + "grad_norm": 0.363882626049153, + "learning_rate": 8.527000261585461e-05, + "loss": 2.8825, + "step": 20986 + }, + { + "epoch": 0.9771166515352562, + "grad_norm": 0.35622744367495174, + "learning_rate": 8.526808258616677e-05, + "loss": 2.9659, + "step": 20987 + }, + { + "epoch": 0.9771632097213493, + "grad_norm": 0.3968868113066558, + "learning_rate": 8.526616245297054e-05, + "loss": 2.8493, + "step": 20988 + }, + { + "epoch": 0.9772097679074423, + "grad_norm": 0.3759077604000844, + "learning_rate": 8.526424221627163e-05, + "loss": 2.8336, + "step": 20989 + }, + { + "epoch": 0.9772563260935354, + "grad_norm": 0.3565256443909057, + "learning_rate": 8.526232187607565e-05, + "loss": 2.8079, + "step": 20990 + }, + { + "epoch": 0.9773028842796284, + "grad_norm": 0.3662868600068923, + "learning_rate": 8.52604014323882e-05, + "loss": 3.001, + "step": 20991 + }, + { + "epoch": 0.9773494424657215, + "grad_norm": 0.35128714371383646, + "learning_rate": 8.525848088521498e-05, + "loss": 2.882, + "step": 20992 + }, + { + "epoch": 0.9773960006518146, + "grad_norm": 0.37694292574412963, + "learning_rate": 8.525656023456157e-05, + "loss": 2.8425, + "step": 20993 + }, + { + "epoch": 0.9774425588379076, + "grad_norm": 0.3615626499475001, + "learning_rate": 8.525463948043363e-05, + "loss": 2.9236, + "step": 20994 + }, + { + "epoch": 0.9774891170240008, + "grad_norm": 0.34862996967095655, + "learning_rate": 8.525271862283682e-05, + "loss": 2.8606, + "step": 20995 + }, + { + "epoch": 0.9775356752100938, + "grad_norm": 0.32591175837461667, + "learning_rate": 8.525079766177673e-05, + "loss": 2.8356, + "step": 20996 + }, + { + "epoch": 0.9775822333961869, + "grad_norm": 0.37808383532876844, + "learning_rate": 8.524887659725902e-05, + "loss": 2.9691, + "step": 20997 + }, + { + "epoch": 0.97762879158228, + "grad_norm": 0.32649706085788255, + "learning_rate": 8.524695542928933e-05, + "loss": 2.9149, + "step": 20998 + }, + { + "epoch": 0.977675349768373, + "grad_norm": 0.3443917062145594, + "learning_rate": 8.524503415787331e-05, + "loss": 2.8889, + "step": 20999 + }, + { + "epoch": 0.9777219079544661, + "grad_norm": 0.33349247189046544, + "learning_rate": 8.524311278301658e-05, + "loss": 2.8988, + "step": 21000 + }, + { + "epoch": 0.9777684661405591, + "grad_norm": 0.3375671745925971, + "learning_rate": 8.52411913047248e-05, + "loss": 2.9152, + "step": 21001 + }, + { + "epoch": 0.9778150243266522, + "grad_norm": 0.3110982636018472, + "learning_rate": 8.523926972300356e-05, + "loss": 2.8101, + "step": 21002 + }, + { + "epoch": 0.9778615825127454, + "grad_norm": 0.3497483591557612, + "learning_rate": 8.523734803785857e-05, + "loss": 2.8528, + "step": 21003 + }, + { + "epoch": 0.9779081406988384, + "grad_norm": 0.32601960549197057, + "learning_rate": 8.523542624929541e-05, + "loss": 2.8483, + "step": 21004 + }, + { + "epoch": 0.9779546988849315, + "grad_norm": 0.3227139674530439, + "learning_rate": 8.523350435731977e-05, + "loss": 2.9603, + "step": 21005 + }, + { + "epoch": 0.9780012570710245, + "grad_norm": 0.3270160365115205, + "learning_rate": 8.523158236193725e-05, + "loss": 2.7846, + "step": 21006 + }, + { + "epoch": 0.9780478152571176, + "grad_norm": 0.3285426989608249, + 
"learning_rate": 8.522966026315351e-05, + "loss": 2.8354, + "step": 21007 + }, + { + "epoch": 0.9780943734432107, + "grad_norm": 0.36886539014733166, + "learning_rate": 8.52277380609742e-05, + "loss": 2.8678, + "step": 21008 + }, + { + "epoch": 0.9781409316293037, + "grad_norm": 0.3305075520033273, + "learning_rate": 8.522581575540492e-05, + "loss": 2.8493, + "step": 21009 + }, + { + "epoch": 0.9781874898153968, + "grad_norm": 0.34276780889915487, + "learning_rate": 8.522389334645137e-05, + "loss": 2.8511, + "step": 21010 + }, + { + "epoch": 0.9782340480014898, + "grad_norm": 0.3535843480404525, + "learning_rate": 8.522197083411915e-05, + "loss": 2.864, + "step": 21011 + }, + { + "epoch": 0.978280606187583, + "grad_norm": 0.36259195553958173, + "learning_rate": 8.522004821841392e-05, + "loss": 2.8259, + "step": 21012 + }, + { + "epoch": 0.978327164373676, + "grad_norm": 0.36817276193103493, + "learning_rate": 8.521812549934129e-05, + "loss": 2.949, + "step": 21013 + }, + { + "epoch": 0.9783737225597691, + "grad_norm": 0.32578573399393407, + "learning_rate": 8.521620267690697e-05, + "loss": 2.8945, + "step": 21014 + }, + { + "epoch": 0.9784202807458622, + "grad_norm": 0.3281292833255664, + "learning_rate": 8.521427975111656e-05, + "loss": 2.7911, + "step": 21015 + }, + { + "epoch": 0.9784668389319552, + "grad_norm": 0.34288290499272234, + "learning_rate": 8.52123567219757e-05, + "loss": 2.8831, + "step": 21016 + }, + { + "epoch": 0.9785133971180483, + "grad_norm": 0.3386680679588817, + "learning_rate": 8.521043358949003e-05, + "loss": 2.8184, + "step": 21017 + }, + { + "epoch": 0.9785599553041413, + "grad_norm": 0.3475136545260296, + "learning_rate": 8.520851035366521e-05, + "loss": 2.9187, + "step": 21018 + }, + { + "epoch": 0.9786065134902344, + "grad_norm": 0.30788698367593736, + "learning_rate": 8.52065870145069e-05, + "loss": 2.8197, + "step": 21019 + }, + { + "epoch": 0.9786530716763275, + "grad_norm": 0.3641958087708565, + "learning_rate": 8.52046635720207e-05, + "loss": 2.8014, + "step": 21020 + }, + { + "epoch": 0.9786996298624205, + "grad_norm": 0.31593213689117217, + "learning_rate": 8.520274002621231e-05, + "loss": 2.8242, + "step": 21021 + }, + { + "epoch": 0.9787461880485137, + "grad_norm": 0.3457771398196614, + "learning_rate": 8.520081637708732e-05, + "loss": 2.8577, + "step": 21022 + }, + { + "epoch": 0.9787927462346067, + "grad_norm": 0.32851770342533737, + "learning_rate": 8.519889262465142e-05, + "loss": 2.8261, + "step": 21023 + }, + { + "epoch": 0.9788393044206998, + "grad_norm": 0.36791655914666566, + "learning_rate": 8.519696876891024e-05, + "loss": 2.9402, + "step": 21024 + }, + { + "epoch": 0.9788858626067929, + "grad_norm": 0.33885634016228655, + "learning_rate": 8.51950448098694e-05, + "loss": 2.7995, + "step": 21025 + }, + { + "epoch": 0.9789324207928859, + "grad_norm": 0.34691846210760335, + "learning_rate": 8.519312074753458e-05, + "loss": 2.8119, + "step": 21026 + }, + { + "epoch": 0.978978978978979, + "grad_norm": 0.35927408342282047, + "learning_rate": 8.519119658191143e-05, + "loss": 2.8865, + "step": 21027 + }, + { + "epoch": 0.979025537165072, + "grad_norm": 0.33952529524779745, + "learning_rate": 8.518927231300557e-05, + "loss": 2.8105, + "step": 21028 + }, + { + "epoch": 0.9790720953511651, + "grad_norm": 0.31190700839963076, + "learning_rate": 8.518734794082269e-05, + "loss": 2.8521, + "step": 21029 + }, + { + "epoch": 0.9791186535372582, + "grad_norm": 0.33349412644246407, + "learning_rate": 8.518542346536838e-05, + "loss": 2.9207, + "step": 21030 + 
}, + { + "epoch": 0.9791652117233512, + "grad_norm": 0.3398563985546506, + "learning_rate": 8.518349888664832e-05, + "loss": 2.8847, + "step": 21031 + }, + { + "epoch": 0.9792117699094444, + "grad_norm": 0.34201766265972605, + "learning_rate": 8.518157420466817e-05, + "loss": 2.8978, + "step": 21032 + }, + { + "epoch": 0.9792583280955374, + "grad_norm": 0.3357210329884306, + "learning_rate": 8.517964941943356e-05, + "loss": 2.9359, + "step": 21033 + }, + { + "epoch": 0.9793048862816305, + "grad_norm": 0.3483372579541114, + "learning_rate": 8.517772453095014e-05, + "loss": 2.6615, + "step": 21034 + }, + { + "epoch": 0.9793514444677235, + "grad_norm": 0.3190507815686368, + "learning_rate": 8.517579953922358e-05, + "loss": 2.8583, + "step": 21035 + }, + { + "epoch": 0.9793980026538166, + "grad_norm": 0.3580500366267174, + "learning_rate": 8.517387444425951e-05, + "loss": 2.8417, + "step": 21036 + }, + { + "epoch": 0.9794445608399097, + "grad_norm": 0.31955648026175215, + "learning_rate": 8.517194924606358e-05, + "loss": 2.9243, + "step": 21037 + }, + { + "epoch": 0.9794911190260027, + "grad_norm": 0.34332834414006685, + "learning_rate": 8.517002394464144e-05, + "loss": 2.9029, + "step": 21038 + }, + { + "epoch": 0.9795376772120958, + "grad_norm": 0.3312661726443547, + "learning_rate": 8.516809853999875e-05, + "loss": 2.7795, + "step": 21039 + }, + { + "epoch": 0.9795842353981888, + "grad_norm": 0.34076504231782434, + "learning_rate": 8.516617303214114e-05, + "loss": 2.9373, + "step": 21040 + }, + { + "epoch": 0.979630793584282, + "grad_norm": 0.3480700048667091, + "learning_rate": 8.51642474210743e-05, + "loss": 2.7995, + "step": 21041 + }, + { + "epoch": 0.9796773517703751, + "grad_norm": 0.32521711029651257, + "learning_rate": 8.516232170680384e-05, + "loss": 2.875, + "step": 21042 + }, + { + "epoch": 0.9797239099564681, + "grad_norm": 0.31693699762565164, + "learning_rate": 8.516039588933544e-05, + "loss": 2.868, + "step": 21043 + }, + { + "epoch": 0.9797704681425612, + "grad_norm": 0.33361480622004247, + "learning_rate": 8.515846996867472e-05, + "loss": 2.8994, + "step": 21044 + }, + { + "epoch": 0.9798170263286542, + "grad_norm": 0.3267732974658637, + "learning_rate": 8.515654394482737e-05, + "loss": 2.9039, + "step": 21045 + }, + { + "epoch": 0.9798635845147473, + "grad_norm": 0.3454706504537454, + "learning_rate": 8.515461781779904e-05, + "loss": 3.0452, + "step": 21046 + }, + { + "epoch": 0.9799101427008404, + "grad_norm": 0.34685502915891747, + "learning_rate": 8.515269158759535e-05, + "loss": 2.9236, + "step": 21047 + }, + { + "epoch": 0.9799567008869334, + "grad_norm": 0.3387613487034044, + "learning_rate": 8.515076525422199e-05, + "loss": 2.8719, + "step": 21048 + }, + { + "epoch": 0.9800032590730265, + "grad_norm": 0.3355818434883206, + "learning_rate": 8.514883881768459e-05, + "loss": 2.8437, + "step": 21049 + }, + { + "epoch": 0.9800498172591195, + "grad_norm": 0.3275003209601568, + "learning_rate": 8.514691227798882e-05, + "loss": 2.9423, + "step": 21050 + }, + { + "epoch": 0.9800963754452127, + "grad_norm": 0.3593014928018557, + "learning_rate": 8.51449856351403e-05, + "loss": 2.8846, + "step": 21051 + }, + { + "epoch": 0.9801429336313058, + "grad_norm": 0.31319872587754916, + "learning_rate": 8.514305888914474e-05, + "loss": 2.8866, + "step": 21052 + }, + { + "epoch": 0.9801894918173988, + "grad_norm": 0.37855542265536224, + "learning_rate": 8.514113204000774e-05, + "loss": 2.9193, + "step": 21053 + }, + { + "epoch": 0.9802360500034919, + "grad_norm": 0.33806865102709555, + 
"learning_rate": 8.513920508773499e-05, + "loss": 2.8859, + "step": 21054 + }, + { + "epoch": 0.9802826081895849, + "grad_norm": 0.33079447457079214, + "learning_rate": 8.513727803233214e-05, + "loss": 2.8486, + "step": 21055 + }, + { + "epoch": 0.980329166375678, + "grad_norm": 0.34385029124406263, + "learning_rate": 8.513535087380481e-05, + "loss": 2.811, + "step": 21056 + }, + { + "epoch": 0.980375724561771, + "grad_norm": 0.32776657953297156, + "learning_rate": 8.513342361215872e-05, + "loss": 2.8834, + "step": 21057 + }, + { + "epoch": 0.9804222827478641, + "grad_norm": 0.3594134550508526, + "learning_rate": 8.513149624739949e-05, + "loss": 2.8612, + "step": 21058 + }, + { + "epoch": 0.9804688409339573, + "grad_norm": 0.3563980368566652, + "learning_rate": 8.512956877953278e-05, + "loss": 2.9828, + "step": 21059 + }, + { + "epoch": 0.9805153991200503, + "grad_norm": 0.3651604670133055, + "learning_rate": 8.512764120856423e-05, + "loss": 2.8781, + "step": 21060 + }, + { + "epoch": 0.9805619573061434, + "grad_norm": 0.3737460145208342, + "learning_rate": 8.512571353449953e-05, + "loss": 2.8179, + "step": 21061 + }, + { + "epoch": 0.9806085154922364, + "grad_norm": 0.3426917004913793, + "learning_rate": 8.512378575734431e-05, + "loss": 2.8375, + "step": 21062 + }, + { + "epoch": 0.9806550736783295, + "grad_norm": 0.3661657743712947, + "learning_rate": 8.512185787710423e-05, + "loss": 2.8691, + "step": 21063 + }, + { + "epoch": 0.9807016318644226, + "grad_norm": 0.34832303568538353, + "learning_rate": 8.511992989378499e-05, + "loss": 2.8654, + "step": 21064 + }, + { + "epoch": 0.9807481900505156, + "grad_norm": 0.3545307266048215, + "learning_rate": 8.511800180739219e-05, + "loss": 2.8686, + "step": 21065 + }, + { + "epoch": 0.9807947482366087, + "grad_norm": 0.3585523614222491, + "learning_rate": 8.511607361793152e-05, + "loss": 2.9178, + "step": 21066 + }, + { + "epoch": 0.9808413064227017, + "grad_norm": 0.32348698652166036, + "learning_rate": 8.511414532540863e-05, + "loss": 2.8797, + "step": 21067 + }, + { + "epoch": 0.9808878646087948, + "grad_norm": 0.34870935318127305, + "learning_rate": 8.51122169298292e-05, + "loss": 2.8545, + "step": 21068 + }, + { + "epoch": 0.980934422794888, + "grad_norm": 0.3218433204429858, + "learning_rate": 8.511028843119886e-05, + "loss": 2.766, + "step": 21069 + }, + { + "epoch": 0.980980980980981, + "grad_norm": 0.34956607088101416, + "learning_rate": 8.510835982952327e-05, + "loss": 2.9063, + "step": 21070 + }, + { + "epoch": 0.9810275391670741, + "grad_norm": 0.32498413234706414, + "learning_rate": 8.510643112480812e-05, + "loss": 2.848, + "step": 21071 + }, + { + "epoch": 0.9810740973531671, + "grad_norm": 0.35353007273364684, + "learning_rate": 8.510450231705905e-05, + "loss": 2.8363, + "step": 21072 + }, + { + "epoch": 0.9811206555392602, + "grad_norm": 0.33951281918004783, + "learning_rate": 8.510257340628171e-05, + "loss": 2.8958, + "step": 21073 + }, + { + "epoch": 0.9811672137253533, + "grad_norm": 0.3588592710353857, + "learning_rate": 8.510064439248179e-05, + "loss": 2.86, + "step": 21074 + }, + { + "epoch": 0.9812137719114463, + "grad_norm": 0.35970525560696864, + "learning_rate": 8.509871527566494e-05, + "loss": 2.8551, + "step": 21075 + }, + { + "epoch": 0.9812603300975394, + "grad_norm": 0.3500277689868486, + "learning_rate": 8.509678605583681e-05, + "loss": 2.9099, + "step": 21076 + }, + { + "epoch": 0.9813068882836324, + "grad_norm": 0.3469109299940233, + "learning_rate": 8.509485673300309e-05, + "loss": 2.856, + "step": 21077 + }, + 
{ + "epoch": 0.9813534464697256, + "grad_norm": 0.3347207154672857, + "learning_rate": 8.509292730716938e-05, + "loss": 2.7594, + "step": 21078 + }, + { + "epoch": 0.9814000046558186, + "grad_norm": 0.35786494317633566, + "learning_rate": 8.509099777834142e-05, + "loss": 2.9452, + "step": 21079 + }, + { + "epoch": 0.9814465628419117, + "grad_norm": 0.3393665591241679, + "learning_rate": 8.508906814652483e-05, + "loss": 2.9299, + "step": 21080 + }, + { + "epoch": 0.9814931210280048, + "grad_norm": 0.38939697571997556, + "learning_rate": 8.508713841172528e-05, + "loss": 2.845, + "step": 21081 + }, + { + "epoch": 0.9815396792140978, + "grad_norm": 0.3466185195103489, + "learning_rate": 8.508520857394844e-05, + "loss": 2.8998, + "step": 21082 + }, + { + "epoch": 0.9815862374001909, + "grad_norm": 0.3887717670193783, + "learning_rate": 8.508327863319997e-05, + "loss": 2.7654, + "step": 21083 + }, + { + "epoch": 0.9816327955862839, + "grad_norm": 0.35199876291558047, + "learning_rate": 8.508134858948553e-05, + "loss": 2.8136, + "step": 21084 + }, + { + "epoch": 0.981679353772377, + "grad_norm": 0.33618499592213996, + "learning_rate": 8.507941844281077e-05, + "loss": 2.8904, + "step": 21085 + }, + { + "epoch": 0.9817259119584701, + "grad_norm": 0.356926474782349, + "learning_rate": 8.507748819318141e-05, + "loss": 2.8412, + "step": 21086 + }, + { + "epoch": 0.9817724701445631, + "grad_norm": 0.33383627770336377, + "learning_rate": 8.507555784060306e-05, + "loss": 2.8611, + "step": 21087 + }, + { + "epoch": 0.9818190283306563, + "grad_norm": 0.32607682482324374, + "learning_rate": 8.50736273850814e-05, + "loss": 2.8478, + "step": 21088 + }, + { + "epoch": 0.9818655865167493, + "grad_norm": 0.371058508848419, + "learning_rate": 8.507169682662209e-05, + "loss": 2.897, + "step": 21089 + }, + { + "epoch": 0.9819121447028424, + "grad_norm": 0.3233365143905344, + "learning_rate": 8.506976616523082e-05, + "loss": 2.863, + "step": 21090 + }, + { + "epoch": 0.9819587028889355, + "grad_norm": 0.3491725450806743, + "learning_rate": 8.506783540091323e-05, + "loss": 2.8642, + "step": 21091 + }, + { + "epoch": 0.9820052610750285, + "grad_norm": 0.3032261814909148, + "learning_rate": 8.506590453367503e-05, + "loss": 2.7679, + "step": 21092 + }, + { + "epoch": 0.9820518192611216, + "grad_norm": 0.34129333767384984, + "learning_rate": 8.506397356352181e-05, + "loss": 2.8564, + "step": 21093 + }, + { + "epoch": 0.9820983774472146, + "grad_norm": 0.31744257040221396, + "learning_rate": 8.50620424904593e-05, + "loss": 2.8804, + "step": 21094 + }, + { + "epoch": 0.9821449356333077, + "grad_norm": 0.35319428446597473, + "learning_rate": 8.506011131449316e-05, + "loss": 2.8992, + "step": 21095 + }, + { + "epoch": 0.9821914938194009, + "grad_norm": 0.33229351322458195, + "learning_rate": 8.505818003562905e-05, + "loss": 2.8522, + "step": 21096 + }, + { + "epoch": 0.9822380520054939, + "grad_norm": 0.3550739231176679, + "learning_rate": 8.50562486538726e-05, + "loss": 2.9084, + "step": 21097 + }, + { + "epoch": 0.982284610191587, + "grad_norm": 0.3323383978185456, + "learning_rate": 8.505431716922955e-05, + "loss": 2.7439, + "step": 21098 + }, + { + "epoch": 0.98233116837768, + "grad_norm": 0.33957589191439946, + "learning_rate": 8.505238558170552e-05, + "loss": 2.9427, + "step": 21099 + }, + { + "epoch": 0.9823777265637731, + "grad_norm": 0.3207515419291923, + "learning_rate": 8.50504538913062e-05, + "loss": 2.8866, + "step": 21100 + }, + { + "epoch": 0.9824242847498661, + "grad_norm": 0.317504374061908, + 
"learning_rate": 8.504852209803724e-05, + "loss": 2.776, + "step": 21101 + }, + { + "epoch": 0.9824708429359592, + "grad_norm": 0.34152992763960965, + "learning_rate": 8.50465902019043e-05, + "loss": 2.703, + "step": 21102 + }, + { + "epoch": 0.9825174011220523, + "grad_norm": 0.3082022292906301, + "learning_rate": 8.504465820291309e-05, + "loss": 2.8659, + "step": 21103 + }, + { + "epoch": 0.9825639593081453, + "grad_norm": 0.3559458360744084, + "learning_rate": 8.504272610106927e-05, + "loss": 2.8472, + "step": 21104 + }, + { + "epoch": 0.9826105174942384, + "grad_norm": 0.3479890074000884, + "learning_rate": 8.504079389637851e-05, + "loss": 2.9502, + "step": 21105 + }, + { + "epoch": 0.9826570756803314, + "grad_norm": 0.3585744884616678, + "learning_rate": 8.503886158884645e-05, + "loss": 2.8874, + "step": 21106 + }, + { + "epoch": 0.9827036338664246, + "grad_norm": 0.3557185750486104, + "learning_rate": 8.503692917847879e-05, + "loss": 2.7778, + "step": 21107 + }, + { + "epoch": 0.9827501920525177, + "grad_norm": 0.3400538086371697, + "learning_rate": 8.503499666528119e-05, + "loss": 2.847, + "step": 21108 + }, + { + "epoch": 0.9827967502386107, + "grad_norm": 0.3845203975362908, + "learning_rate": 8.503306404925934e-05, + "loss": 2.9023, + "step": 21109 + }, + { + "epoch": 0.9828433084247038, + "grad_norm": 0.3208022566635632, + "learning_rate": 8.503113133041889e-05, + "loss": 2.7786, + "step": 21110 + }, + { + "epoch": 0.9828898666107968, + "grad_norm": 0.3688609966279121, + "learning_rate": 8.502919850876551e-05, + "loss": 2.778, + "step": 21111 + }, + { + "epoch": 0.9829364247968899, + "grad_norm": 0.33065427638300204, + "learning_rate": 8.50272655843049e-05, + "loss": 2.7712, + "step": 21112 + }, + { + "epoch": 0.982982982982983, + "grad_norm": 0.34964702929281005, + "learning_rate": 8.502533255704272e-05, + "loss": 2.8833, + "step": 21113 + }, + { + "epoch": 0.983029541169076, + "grad_norm": 0.34186569486724616, + "learning_rate": 8.502339942698463e-05, + "loss": 2.8629, + "step": 21114 + }, + { + "epoch": 0.9830760993551692, + "grad_norm": 0.3319377453422614, + "learning_rate": 8.50214661941363e-05, + "loss": 2.842, + "step": 21115 + }, + { + "epoch": 0.9831226575412622, + "grad_norm": 0.35146583461692527, + "learning_rate": 8.501953285850342e-05, + "loss": 2.8651, + "step": 21116 + }, + { + "epoch": 0.9831692157273553, + "grad_norm": 0.3453632425516814, + "learning_rate": 8.501759942009169e-05, + "loss": 2.8458, + "step": 21117 + }, + { + "epoch": 0.9832157739134483, + "grad_norm": 0.3399784061172975, + "learning_rate": 8.501566587890673e-05, + "loss": 2.8746, + "step": 21118 + }, + { + "epoch": 0.9832623320995414, + "grad_norm": 0.3674244253195179, + "learning_rate": 8.501373223495425e-05, + "loss": 2.8047, + "step": 21119 + }, + { + "epoch": 0.9833088902856345, + "grad_norm": 0.3417714741058102, + "learning_rate": 8.50117984882399e-05, + "loss": 2.9979, + "step": 21120 + }, + { + "epoch": 0.9833554484717275, + "grad_norm": 0.329664919506015, + "learning_rate": 8.500986463876938e-05, + "loss": 2.7956, + "step": 21121 + }, + { + "epoch": 0.9834020066578206, + "grad_norm": 0.32279201966435606, + "learning_rate": 8.500793068654837e-05, + "loss": 2.8343, + "step": 21122 + }, + { + "epoch": 0.9834485648439136, + "grad_norm": 0.3508456455174438, + "learning_rate": 8.500599663158252e-05, + "loss": 2.83, + "step": 21123 + }, + { + "epoch": 0.9834951230300067, + "grad_norm": 0.3743723909595823, + "learning_rate": 8.500406247387751e-05, + "loss": 2.9015, + "step": 21124 + }, + { + 
"epoch": 0.9835416812160999, + "grad_norm": 0.31112805427388934, + "learning_rate": 8.500212821343904e-05, + "loss": 2.8872, + "step": 21125 + }, + { + "epoch": 0.9835882394021929, + "grad_norm": 0.3573738290399228, + "learning_rate": 8.500019385027275e-05, + "loss": 2.8796, + "step": 21126 + }, + { + "epoch": 0.983634797588286, + "grad_norm": 0.36022684666206845, + "learning_rate": 8.499825938438437e-05, + "loss": 3.0117, + "step": 21127 + }, + { + "epoch": 0.983681355774379, + "grad_norm": 0.3914037494734152, + "learning_rate": 8.499632481577952e-05, + "loss": 2.8206, + "step": 21128 + }, + { + "epoch": 0.9837279139604721, + "grad_norm": 0.3307261580439873, + "learning_rate": 8.499439014446393e-05, + "loss": 2.8518, + "step": 21129 + }, + { + "epoch": 0.9837744721465652, + "grad_norm": 0.3940898556877137, + "learning_rate": 8.499245537044323e-05, + "loss": 2.9219, + "step": 21130 + }, + { + "epoch": 0.9838210303326582, + "grad_norm": 0.32893918254262494, + "learning_rate": 8.499052049372314e-05, + "loss": 2.8093, + "step": 21131 + }, + { + "epoch": 0.9838675885187513, + "grad_norm": 0.34397962549756506, + "learning_rate": 8.49885855143093e-05, + "loss": 2.8756, + "step": 21132 + }, + { + "epoch": 0.9839141467048443, + "grad_norm": 0.3067444706770738, + "learning_rate": 8.498665043220743e-05, + "loss": 2.7399, + "step": 21133 + }, + { + "epoch": 0.9839607048909375, + "grad_norm": 0.3660096523384475, + "learning_rate": 8.498471524742318e-05, + "loss": 2.8616, + "step": 21134 + }, + { + "epoch": 0.9840072630770306, + "grad_norm": 0.3257325368520591, + "learning_rate": 8.498277995996222e-05, + "loss": 2.848, + "step": 21135 + }, + { + "epoch": 0.9840538212631236, + "grad_norm": 0.3453263249990365, + "learning_rate": 8.498084456983027e-05, + "loss": 2.776, + "step": 21136 + }, + { + "epoch": 0.9841003794492167, + "grad_norm": 0.3509066618002551, + "learning_rate": 8.497890907703297e-05, + "loss": 2.8435, + "step": 21137 + }, + { + "epoch": 0.9841469376353097, + "grad_norm": 0.33132429766015836, + "learning_rate": 8.497697348157602e-05, + "loss": 2.755, + "step": 21138 + }, + { + "epoch": 0.9841934958214028, + "grad_norm": 0.33561814968202425, + "learning_rate": 8.497503778346511e-05, + "loss": 2.8549, + "step": 21139 + }, + { + "epoch": 0.9842400540074958, + "grad_norm": 0.3651608167862898, + "learning_rate": 8.49731019827059e-05, + "loss": 2.8907, + "step": 21140 + }, + { + "epoch": 0.9842866121935889, + "grad_norm": 0.3441673743840087, + "learning_rate": 8.497116607930409e-05, + "loss": 2.8522, + "step": 21141 + }, + { + "epoch": 0.984333170379682, + "grad_norm": 0.38203700937130286, + "learning_rate": 8.496923007326534e-05, + "loss": 2.7708, + "step": 21142 + }, + { + "epoch": 0.984379728565775, + "grad_norm": 0.34781012277602624, + "learning_rate": 8.496729396459536e-05, + "loss": 2.921, + "step": 21143 + }, + { + "epoch": 0.9844262867518682, + "grad_norm": 0.34300413245816536, + "learning_rate": 8.496535775329981e-05, + "loss": 2.9629, + "step": 21144 + }, + { + "epoch": 0.9844728449379612, + "grad_norm": 0.35366342139894896, + "learning_rate": 8.496342143938439e-05, + "loss": 2.8886, + "step": 21145 + }, + { + "epoch": 0.9845194031240543, + "grad_norm": 0.3435451282788493, + "learning_rate": 8.496148502285475e-05, + "loss": 2.7685, + "step": 21146 + }, + { + "epoch": 0.9845659613101474, + "grad_norm": 0.3251570152684714, + "learning_rate": 8.49595485037166e-05, + "loss": 2.8817, + "step": 21147 + }, + { + "epoch": 0.9846125194962404, + "grad_norm": 0.3411502097500448, + "learning_rate": 
8.495761188197564e-05, + "loss": 2.8688, + "step": 21148 + }, + { + "epoch": 0.9846590776823335, + "grad_norm": 0.336414333854079, + "learning_rate": 8.495567515763751e-05, + "loss": 2.9779, + "step": 21149 + }, + { + "epoch": 0.9847056358684265, + "grad_norm": 0.3844808257938747, + "learning_rate": 8.495373833070792e-05, + "loss": 2.8446, + "step": 21150 + }, + { + "epoch": 0.9847521940545196, + "grad_norm": 0.3173763969983386, + "learning_rate": 8.495180140119257e-05, + "loss": 2.8293, + "step": 21151 + }, + { + "epoch": 0.9847987522406128, + "grad_norm": 0.3825234343570775, + "learning_rate": 8.49498643690971e-05, + "loss": 2.8715, + "step": 21152 + }, + { + "epoch": 0.9848453104267058, + "grad_norm": 0.3469416794006747, + "learning_rate": 8.494792723442724e-05, + "loss": 2.9131, + "step": 21153 + }, + { + "epoch": 0.9848918686127989, + "grad_norm": 0.3611517570476681, + "learning_rate": 8.494598999718866e-05, + "loss": 2.8908, + "step": 21154 + }, + { + "epoch": 0.9849384267988919, + "grad_norm": 0.3222672506659244, + "learning_rate": 8.494405265738703e-05, + "loss": 2.7163, + "step": 21155 + }, + { + "epoch": 0.984984984984985, + "grad_norm": 0.3598127541004324, + "learning_rate": 8.494211521502806e-05, + "loss": 2.8053, + "step": 21156 + }, + { + "epoch": 0.9850315431710781, + "grad_norm": 0.3313855721955274, + "learning_rate": 8.49401776701174e-05, + "loss": 2.8199, + "step": 21157 + }, + { + "epoch": 0.9850781013571711, + "grad_norm": 0.33750798484042205, + "learning_rate": 8.49382400226608e-05, + "loss": 2.9186, + "step": 21158 + }, + { + "epoch": 0.9851246595432642, + "grad_norm": 0.3508043350198949, + "learning_rate": 8.493630227266386e-05, + "loss": 2.821, + "step": 21159 + }, + { + "epoch": 0.9851712177293572, + "grad_norm": 0.3377423189814696, + "learning_rate": 8.493436442013235e-05, + "loss": 2.9482, + "step": 21160 + }, + { + "epoch": 0.9852177759154503, + "grad_norm": 0.3359702082905517, + "learning_rate": 8.493242646507191e-05, + "loss": 2.8194, + "step": 21161 + }, + { + "epoch": 0.9852643341015433, + "grad_norm": 0.34358444513514597, + "learning_rate": 8.493048840748824e-05, + "loss": 2.9769, + "step": 21162 + }, + { + "epoch": 0.9853108922876365, + "grad_norm": 0.33086031109247116, + "learning_rate": 8.492855024738704e-05, + "loss": 2.9431, + "step": 21163 + }, + { + "epoch": 0.9853574504737296, + "grad_norm": 0.3237771057973859, + "learning_rate": 8.492661198477397e-05, + "loss": 2.8854, + "step": 21164 + }, + { + "epoch": 0.9854040086598226, + "grad_norm": 0.3221201254285345, + "learning_rate": 8.492467361965473e-05, + "loss": 2.8775, + "step": 21165 + }, + { + "epoch": 0.9854505668459157, + "grad_norm": 0.3283376516782756, + "learning_rate": 8.492273515203503e-05, + "loss": 2.9271, + "step": 21166 + }, + { + "epoch": 0.9854971250320087, + "grad_norm": 0.3111572149458774, + "learning_rate": 8.492079658192054e-05, + "loss": 2.952, + "step": 21167 + }, + { + "epoch": 0.9855436832181018, + "grad_norm": 0.3660500580869841, + "learning_rate": 8.491885790931694e-05, + "loss": 2.868, + "step": 21168 + }, + { + "epoch": 0.9855902414041949, + "grad_norm": 0.3344453099968884, + "learning_rate": 8.491691913422995e-05, + "loss": 2.9035, + "step": 21169 + }, + { + "epoch": 0.9856367995902879, + "grad_norm": 0.3667131858626228, + "learning_rate": 8.491498025666522e-05, + "loss": 2.9024, + "step": 21170 + }, + { + "epoch": 0.985683357776381, + "grad_norm": 0.3161739120563479, + "learning_rate": 8.491304127662847e-05, + "loss": 2.8984, + "step": 21171 + }, + { + "epoch": 
0.9857299159624741, + "grad_norm": 0.33102083858972736, + "learning_rate": 8.491110219412541e-05, + "loss": 2.8858, + "step": 21172 + }, + { + "epoch": 0.9857764741485672, + "grad_norm": 0.35249311555611174, + "learning_rate": 8.490916300916168e-05, + "loss": 2.7934, + "step": 21173 + }, + { + "epoch": 0.9858230323346603, + "grad_norm": 0.39087050602381057, + "learning_rate": 8.490722372174299e-05, + "loss": 2.8084, + "step": 21174 + }, + { + "epoch": 0.9858695905207533, + "grad_norm": 0.3204893785400953, + "learning_rate": 8.490528433187505e-05, + "loss": 2.817, + "step": 21175 + }, + { + "epoch": 0.9859161487068464, + "grad_norm": 0.3687332311331866, + "learning_rate": 8.490334483956353e-05, + "loss": 2.9051, + "step": 21176 + }, + { + "epoch": 0.9859627068929394, + "grad_norm": 0.3220238965540217, + "learning_rate": 8.490140524481414e-05, + "loss": 2.8956, + "step": 21177 + }, + { + "epoch": 0.9860092650790325, + "grad_norm": 0.36295510850240215, + "learning_rate": 8.489946554763255e-05, + "loss": 2.8237, + "step": 21178 + }, + { + "epoch": 0.9860558232651256, + "grad_norm": 0.3174097648642103, + "learning_rate": 8.489752574802447e-05, + "loss": 2.8238, + "step": 21179 + }, + { + "epoch": 0.9861023814512186, + "grad_norm": 0.3578767695333134, + "learning_rate": 8.489558584599562e-05, + "loss": 2.8347, + "step": 21180 + }, + { + "epoch": 0.9861489396373118, + "grad_norm": 0.31096342467923366, + "learning_rate": 8.489364584155164e-05, + "loss": 2.8485, + "step": 21181 + }, + { + "epoch": 0.9861954978234048, + "grad_norm": 0.32924116137690923, + "learning_rate": 8.489170573469824e-05, + "loss": 2.836, + "step": 21182 + }, + { + "epoch": 0.9862420560094979, + "grad_norm": 0.35281324853203216, + "learning_rate": 8.488976552544113e-05, + "loss": 3.0017, + "step": 21183 + }, + { + "epoch": 0.9862886141955909, + "grad_norm": 0.34549190859894147, + "learning_rate": 8.488782521378599e-05, + "loss": 2.8654, + "step": 21184 + }, + { + "epoch": 0.986335172381684, + "grad_norm": 0.3418581236356893, + "learning_rate": 8.488588479973853e-05, + "loss": 2.7212, + "step": 21185 + }, + { + "epoch": 0.9863817305677771, + "grad_norm": 0.3391584291694546, + "learning_rate": 8.488394428330443e-05, + "loss": 2.9835, + "step": 21186 + }, + { + "epoch": 0.9864282887538701, + "grad_norm": 0.33646388322389637, + "learning_rate": 8.488200366448938e-05, + "loss": 2.9402, + "step": 21187 + }, + { + "epoch": 0.9864748469399632, + "grad_norm": 0.3082438405589935, + "learning_rate": 8.48800629432991e-05, + "loss": 2.7938, + "step": 21188 + }, + { + "epoch": 0.9865214051260562, + "grad_norm": 0.34467691405972295, + "learning_rate": 8.487812211973927e-05, + "loss": 2.9228, + "step": 21189 + }, + { + "epoch": 0.9865679633121494, + "grad_norm": 0.33079999973100727, + "learning_rate": 8.487618119381559e-05, + "loss": 2.8441, + "step": 21190 + }, + { + "epoch": 0.9866145214982425, + "grad_norm": 0.3687767353528282, + "learning_rate": 8.487424016553374e-05, + "loss": 2.8034, + "step": 21191 + }, + { + "epoch": 0.9866610796843355, + "grad_norm": 0.34143956715035695, + "learning_rate": 8.487229903489943e-05, + "loss": 2.8904, + "step": 21192 + }, + { + "epoch": 0.9867076378704286, + "grad_norm": 0.3425163527711072, + "learning_rate": 8.487035780191838e-05, + "loss": 2.93, + "step": 21193 + }, + { + "epoch": 0.9867541960565216, + "grad_norm": 0.3358905759945113, + "learning_rate": 8.486841646659625e-05, + "loss": 2.8067, + "step": 21194 + }, + { + "epoch": 0.9868007542426147, + "grad_norm": 0.31130713008804883, + "learning_rate": 
8.486647502893875e-05, + "loss": 2.9443, + "step": 21195 + }, + { + "epoch": 0.9868473124287078, + "grad_norm": 0.3770827153996412, + "learning_rate": 8.486453348895158e-05, + "loss": 2.9447, + "step": 21196 + }, + { + "epoch": 0.9868938706148008, + "grad_norm": 0.3425553834878586, + "learning_rate": 8.486259184664045e-05, + "loss": 2.8924, + "step": 21197 + }, + { + "epoch": 0.9869404288008939, + "grad_norm": 0.378044312417173, + "learning_rate": 8.486065010201103e-05, + "loss": 2.8139, + "step": 21198 + }, + { + "epoch": 0.986986986986987, + "grad_norm": 0.3327552820905079, + "learning_rate": 8.485870825506906e-05, + "loss": 2.8676, + "step": 21199 + }, + { + "epoch": 0.9870335451730801, + "grad_norm": 0.3633235303199268, + "learning_rate": 8.48567663058202e-05, + "loss": 2.7716, + "step": 21200 + }, + { + "epoch": 0.9870801033591732, + "grad_norm": 0.32442603140819676, + "learning_rate": 8.485482425427017e-05, + "loss": 2.8632, + "step": 21201 + }, + { + "epoch": 0.9871266615452662, + "grad_norm": 0.3706168442377821, + "learning_rate": 8.485288210042465e-05, + "loss": 2.8602, + "step": 21202 + }, + { + "epoch": 0.9871732197313593, + "grad_norm": 0.35359157789164347, + "learning_rate": 8.485093984428938e-05, + "loss": 2.915, + "step": 21203 + }, + { + "epoch": 0.9872197779174523, + "grad_norm": 0.3529749949356117, + "learning_rate": 8.484899748587002e-05, + "loss": 2.8842, + "step": 21204 + }, + { + "epoch": 0.9872663361035454, + "grad_norm": 0.3354323031048126, + "learning_rate": 8.48470550251723e-05, + "loss": 2.8719, + "step": 21205 + }, + { + "epoch": 0.9873128942896384, + "grad_norm": 0.3951446676908714, + "learning_rate": 8.484511246220188e-05, + "loss": 2.959, + "step": 21206 + }, + { + "epoch": 0.9873594524757315, + "grad_norm": 0.35021336521042107, + "learning_rate": 8.484316979696451e-05, + "loss": 2.8243, + "step": 21207 + }, + { + "epoch": 0.9874060106618247, + "grad_norm": 0.3454659482725439, + "learning_rate": 8.484122702946586e-05, + "loss": 2.7839, + "step": 21208 + }, + { + "epoch": 0.9874525688479177, + "grad_norm": 0.3506773949110234, + "learning_rate": 8.483928415971165e-05, + "loss": 2.8429, + "step": 21209 + }, + { + "epoch": 0.9874991270340108, + "grad_norm": 0.34944667551877245, + "learning_rate": 8.483734118770756e-05, + "loss": 2.7826, + "step": 21210 + }, + { + "epoch": 0.9875456852201038, + "grad_norm": 0.36932756956938045, + "learning_rate": 8.483539811345931e-05, + "loss": 2.8749, + "step": 21211 + }, + { + "epoch": 0.9875922434061969, + "grad_norm": 0.3485908254167663, + "learning_rate": 8.483345493697261e-05, + "loss": 2.8334, + "step": 21212 + }, + { + "epoch": 0.98763880159229, + "grad_norm": 0.3835464434203103, + "learning_rate": 8.483151165825313e-05, + "loss": 2.8711, + "step": 21213 + }, + { + "epoch": 0.987685359778383, + "grad_norm": 0.36821065299169625, + "learning_rate": 8.482956827730661e-05, + "loss": 2.8615, + "step": 21214 + }, + { + "epoch": 0.9877319179644761, + "grad_norm": 0.38145114126885443, + "learning_rate": 8.482762479413873e-05, + "loss": 2.8378, + "step": 21215 + }, + { + "epoch": 0.9877784761505691, + "grad_norm": 0.36886675187062873, + "learning_rate": 8.48256812087552e-05, + "loss": 2.9505, + "step": 21216 + }, + { + "epoch": 0.9878250343366622, + "grad_norm": 0.35603545806062636, + "learning_rate": 8.482373752116175e-05, + "loss": 2.8745, + "step": 21217 + }, + { + "epoch": 0.9878715925227554, + "grad_norm": 0.3816978017766786, + "learning_rate": 8.482179373136403e-05, + "loss": 2.9101, + "step": 21218 + }, + { + "epoch": 
0.9879181507088484, + "grad_norm": 0.3285183216470809, + "learning_rate": 8.48198498393678e-05, + "loss": 2.8836, + "step": 21219 + }, + { + "epoch": 0.9879647088949415, + "grad_norm": 0.3545226482189042, + "learning_rate": 8.481790584517872e-05, + "loss": 2.6848, + "step": 21220 + }, + { + "epoch": 0.9880112670810345, + "grad_norm": 0.35246188145036844, + "learning_rate": 8.481596174880254e-05, + "loss": 2.8371, + "step": 21221 + }, + { + "epoch": 0.9880578252671276, + "grad_norm": 0.3450493496177003, + "learning_rate": 8.481401755024493e-05, + "loss": 2.8644, + "step": 21222 + }, + { + "epoch": 0.9881043834532207, + "grad_norm": 0.3583846283130751, + "learning_rate": 8.481207324951161e-05, + "loss": 2.8924, + "step": 21223 + }, + { + "epoch": 0.9881509416393137, + "grad_norm": 0.31854247701750804, + "learning_rate": 8.481012884660828e-05, + "loss": 2.9553, + "step": 21224 + }, + { + "epoch": 0.9881974998254068, + "grad_norm": 0.3523656751603993, + "learning_rate": 8.480818434154065e-05, + "loss": 2.8877, + "step": 21225 + }, + { + "epoch": 0.9882440580114998, + "grad_norm": 0.31862407705545553, + "learning_rate": 8.480623973431445e-05, + "loss": 2.8795, + "step": 21226 + }, + { + "epoch": 0.988290616197593, + "grad_norm": 0.3499718872372191, + "learning_rate": 8.480429502493535e-05, + "loss": 2.828, + "step": 21227 + }, + { + "epoch": 0.988337174383686, + "grad_norm": 0.2933790456543189, + "learning_rate": 8.480235021340906e-05, + "loss": 2.8075, + "step": 21228 + }, + { + "epoch": 0.9883837325697791, + "grad_norm": 0.3174929326280398, + "learning_rate": 8.48004052997413e-05, + "loss": 2.7858, + "step": 21229 + }, + { + "epoch": 0.9884302907558722, + "grad_norm": 0.3056691305328175, + "learning_rate": 8.479846028393779e-05, + "loss": 2.9349, + "step": 21230 + }, + { + "epoch": 0.9884768489419652, + "grad_norm": 0.3087050805941276, + "learning_rate": 8.479651516600424e-05, + "loss": 2.9656, + "step": 21231 + }, + { + "epoch": 0.9885234071280583, + "grad_norm": 0.3308204769557062, + "learning_rate": 8.479456994594633e-05, + "loss": 2.8082, + "step": 21232 + }, + { + "epoch": 0.9885699653141513, + "grad_norm": 0.3326843192342559, + "learning_rate": 8.479262462376978e-05, + "loss": 2.8329, + "step": 21233 + }, + { + "epoch": 0.9886165235002444, + "grad_norm": 0.30000753265505753, + "learning_rate": 8.479067919948032e-05, + "loss": 2.8341, + "step": 21234 + }, + { + "epoch": 0.9886630816863375, + "grad_norm": 0.32152366749073913, + "learning_rate": 8.478873367308365e-05, + "loss": 2.9408, + "step": 21235 + }, + { + "epoch": 0.9887096398724305, + "grad_norm": 0.3151432114174724, + "learning_rate": 8.478678804458544e-05, + "loss": 2.9073, + "step": 21236 + }, + { + "epoch": 0.9887561980585237, + "grad_norm": 0.33873833894898364, + "learning_rate": 8.478484231399146e-05, + "loss": 2.84, + "step": 21237 + }, + { + "epoch": 0.9888027562446167, + "grad_norm": 0.30545816001160553, + "learning_rate": 8.478289648130739e-05, + "loss": 2.9148, + "step": 21238 + }, + { + "epoch": 0.9888493144307098, + "grad_norm": 0.35792500908877595, + "learning_rate": 8.478095054653895e-05, + "loss": 2.9883, + "step": 21239 + }, + { + "epoch": 0.9888958726168029, + "grad_norm": 0.35459254019592057, + "learning_rate": 8.477900450969184e-05, + "loss": 2.908, + "step": 21240 + }, + { + "epoch": 0.9889424308028959, + "grad_norm": 0.32894835420973517, + "learning_rate": 8.477705837077178e-05, + "loss": 2.8343, + "step": 21241 + }, + { + "epoch": 0.988988988988989, + "grad_norm": 0.3380869065722742, + "learning_rate": 
8.477511212978448e-05, + "loss": 2.8898, + "step": 21242 + }, + { + "epoch": 0.989035547175082, + "grad_norm": 0.3452832686051419, + "learning_rate": 8.477316578673564e-05, + "loss": 2.8476, + "step": 21243 + }, + { + "epoch": 0.9890821053611751, + "grad_norm": 0.29350554208914637, + "learning_rate": 8.4771219341631e-05, + "loss": 2.8217, + "step": 21244 + }, + { + "epoch": 0.9891286635472682, + "grad_norm": 0.3542943417500988, + "learning_rate": 8.476927279447626e-05, + "loss": 2.9459, + "step": 21245 + }, + { + "epoch": 0.9891752217333613, + "grad_norm": 0.3500117241177797, + "learning_rate": 8.476732614527711e-05, + "loss": 2.895, + "step": 21246 + }, + { + "epoch": 0.9892217799194544, + "grad_norm": 0.3263095484769448, + "learning_rate": 8.47653793940393e-05, + "loss": 2.8051, + "step": 21247 + }, + { + "epoch": 0.9892683381055474, + "grad_norm": 0.36997430459566494, + "learning_rate": 8.476343254076853e-05, + "loss": 2.7783, + "step": 21248 + }, + { + "epoch": 0.9893148962916405, + "grad_norm": 0.35064705564537185, + "learning_rate": 8.47614855854705e-05, + "loss": 2.825, + "step": 21249 + }, + { + "epoch": 0.9893614544777335, + "grad_norm": 0.35969967577589834, + "learning_rate": 8.475953852815093e-05, + "loss": 2.9178, + "step": 21250 + }, + { + "epoch": 0.9894080126638266, + "grad_norm": 0.34463132858901147, + "learning_rate": 8.475759136881555e-05, + "loss": 2.8456, + "step": 21251 + }, + { + "epoch": 0.9894545708499197, + "grad_norm": 0.3272890510287046, + "learning_rate": 8.475564410747006e-05, + "loss": 2.8722, + "step": 21252 + }, + { + "epoch": 0.9895011290360127, + "grad_norm": 0.3681818533582145, + "learning_rate": 8.475369674412017e-05, + "loss": 2.7941, + "step": 21253 + }, + { + "epoch": 0.9895476872221058, + "grad_norm": 0.33874854257150244, + "learning_rate": 8.475174927877161e-05, + "loss": 2.8191, + "step": 21254 + }, + { + "epoch": 0.9895942454081988, + "grad_norm": 0.3668530666376919, + "learning_rate": 8.47498017114301e-05, + "loss": 2.8617, + "step": 21255 + }, + { + "epoch": 0.989640803594292, + "grad_norm": 0.3270896157069983, + "learning_rate": 8.474785404210133e-05, + "loss": 2.879, + "step": 21256 + }, + { + "epoch": 0.9896873617803851, + "grad_norm": 0.35957309345866334, + "learning_rate": 8.474590627079104e-05, + "loss": 2.8436, + "step": 21257 + }, + { + "epoch": 0.9897339199664781, + "grad_norm": 0.32605352325067943, + "learning_rate": 8.474395839750492e-05, + "loss": 2.7962, + "step": 21258 + }, + { + "epoch": 0.9897804781525712, + "grad_norm": 0.38774350415664044, + "learning_rate": 8.474201042224874e-05, + "loss": 2.9527, + "step": 21259 + }, + { + "epoch": 0.9898270363386642, + "grad_norm": 0.3419882647859876, + "learning_rate": 8.474006234502817e-05, + "loss": 2.8011, + "step": 21260 + }, + { + "epoch": 0.9898735945247573, + "grad_norm": 0.33945001752554665, + "learning_rate": 8.473811416584891e-05, + "loss": 2.8981, + "step": 21261 + }, + { + "epoch": 0.9899201527108504, + "grad_norm": 0.34332521228050816, + "learning_rate": 8.473616588471674e-05, + "loss": 2.8614, + "step": 21262 + }, + { + "epoch": 0.9899667108969434, + "grad_norm": 0.30625558785172247, + "learning_rate": 8.473421750163733e-05, + "loss": 2.8561, + "step": 21263 + }, + { + "epoch": 0.9900132690830366, + "grad_norm": 0.30365952168780425, + "learning_rate": 8.473226901661641e-05, + "loss": 2.9554, + "step": 21264 + }, + { + "epoch": 0.9900598272691296, + "grad_norm": 0.3429037816242401, + "learning_rate": 8.473032042965972e-05, + "loss": 2.8557, + "step": 21265 + }, + { + "epoch": 
0.9901063854552227, + "grad_norm": 0.32672081523679775, + "learning_rate": 8.472837174077295e-05, + "loss": 2.7122, + "step": 21266 + }, + { + "epoch": 0.9901529436413158, + "grad_norm": 0.3356990258552488, + "learning_rate": 8.472642294996183e-05, + "loss": 2.8782, + "step": 21267 + }, + { + "epoch": 0.9901995018274088, + "grad_norm": 0.3485199171301308, + "learning_rate": 8.472447405723208e-05, + "loss": 2.8841, + "step": 21268 + }, + { + "epoch": 0.9902460600135019, + "grad_norm": 0.3503660115988375, + "learning_rate": 8.472252506258942e-05, + "loss": 2.9287, + "step": 21269 + }, + { + "epoch": 0.9902926181995949, + "grad_norm": 0.3237142845570269, + "learning_rate": 8.472057596603957e-05, + "loss": 2.948, + "step": 21270 + }, + { + "epoch": 0.990339176385688, + "grad_norm": 0.353561067620124, + "learning_rate": 8.471862676758827e-05, + "loss": 2.8635, + "step": 21271 + }, + { + "epoch": 0.990385734571781, + "grad_norm": 0.31035464860839196, + "learning_rate": 8.47166774672412e-05, + "loss": 2.8497, + "step": 21272 + }, + { + "epoch": 0.9904322927578741, + "grad_norm": 0.3728248572271348, + "learning_rate": 8.47147280650041e-05, + "loss": 2.8256, + "step": 21273 + }, + { + "epoch": 0.9904788509439673, + "grad_norm": 0.33176352338010595, + "learning_rate": 8.471277856088267e-05, + "loss": 2.8166, + "step": 21274 + }, + { + "epoch": 0.9905254091300603, + "grad_norm": 0.34661083422246247, + "learning_rate": 8.471082895488269e-05, + "loss": 2.9789, + "step": 21275 + }, + { + "epoch": 0.9905719673161534, + "grad_norm": 0.3512576530855812, + "learning_rate": 8.470887924700984e-05, + "loss": 2.8756, + "step": 21276 + }, + { + "epoch": 0.9906185255022464, + "grad_norm": 0.34762649252377753, + "learning_rate": 8.470692943726985e-05, + "loss": 2.7982, + "step": 21277 + }, + { + "epoch": 0.9906650836883395, + "grad_norm": 0.3808626721952027, + "learning_rate": 8.470497952566843e-05, + "loss": 2.9146, + "step": 21278 + }, + { + "epoch": 0.9907116418744326, + "grad_norm": 0.3194290344244716, + "learning_rate": 8.470302951221132e-05, + "loss": 2.8499, + "step": 21279 + }, + { + "epoch": 0.9907582000605256, + "grad_norm": 0.36508323572585777, + "learning_rate": 8.470107939690425e-05, + "loss": 2.8974, + "step": 21280 + }, + { + "epoch": 0.9908047582466187, + "grad_norm": 0.3581196827490571, + "learning_rate": 8.469912917975291e-05, + "loss": 2.843, + "step": 21281 + }, + { + "epoch": 0.9908513164327117, + "grad_norm": 0.33146000454758395, + "learning_rate": 8.469717886076305e-05, + "loss": 2.834, + "step": 21282 + }, + { + "epoch": 0.9908978746188049, + "grad_norm": 0.3676517790057729, + "learning_rate": 8.46952284399404e-05, + "loss": 2.9307, + "step": 21283 + }, + { + "epoch": 0.990944432804898, + "grad_norm": 0.33280410531456905, + "learning_rate": 8.469327791729066e-05, + "loss": 2.931, + "step": 21284 + }, + { + "epoch": 0.990990990990991, + "grad_norm": 0.31367932842767227, + "learning_rate": 8.469132729281958e-05, + "loss": 2.7855, + "step": 21285 + }, + { + "epoch": 0.9910375491770841, + "grad_norm": 0.34334911766324033, + "learning_rate": 8.468937656653287e-05, + "loss": 2.8156, + "step": 21286 + }, + { + "epoch": 0.9910841073631771, + "grad_norm": 0.3295263429113112, + "learning_rate": 8.468742573843624e-05, + "loss": 2.9302, + "step": 21287 + }, + { + "epoch": 0.9911306655492702, + "grad_norm": 0.3273817110582466, + "learning_rate": 8.468547480853543e-05, + "loss": 2.8965, + "step": 21288 + }, + { + "epoch": 0.9911772237353633, + "grad_norm": 0.31448818514988325, + "learning_rate": 
8.46835237768362e-05, + "loss": 2.8496, + "step": 21289 + }, + { + "epoch": 0.9912237819214563, + "grad_norm": 0.33493662507181116, + "learning_rate": 8.468157264334423e-05, + "loss": 2.8757, + "step": 21290 + }, + { + "epoch": 0.9912703401075494, + "grad_norm": 0.32729382624068437, + "learning_rate": 8.467962140806524e-05, + "loss": 2.9069, + "step": 21291 + }, + { + "epoch": 0.9913168982936424, + "grad_norm": 0.3397939730494299, + "learning_rate": 8.4677670071005e-05, + "loss": 2.9693, + "step": 21292 + }, + { + "epoch": 0.9913634564797356, + "grad_norm": 0.329003452265013, + "learning_rate": 8.46757186321692e-05, + "loss": 2.815, + "step": 21293 + }, + { + "epoch": 0.9914100146658286, + "grad_norm": 0.3264157240306648, + "learning_rate": 8.46737670915636e-05, + "loss": 2.7198, + "step": 21294 + }, + { + "epoch": 0.9914565728519217, + "grad_norm": 0.3235336757086774, + "learning_rate": 8.467181544919392e-05, + "loss": 2.9688, + "step": 21295 + }, + { + "epoch": 0.9915031310380148, + "grad_norm": 0.3359127992885286, + "learning_rate": 8.466986370506584e-05, + "loss": 2.7551, + "step": 21296 + }, + { + "epoch": 0.9915496892241078, + "grad_norm": 0.34302017414704356, + "learning_rate": 8.466791185918515e-05, + "loss": 2.8157, + "step": 21297 + }, + { + "epoch": 0.9915962474102009, + "grad_norm": 0.34121429278191245, + "learning_rate": 8.466595991155755e-05, + "loss": 2.9505, + "step": 21298 + }, + { + "epoch": 0.9916428055962939, + "grad_norm": 0.34170613911759856, + "learning_rate": 8.466400786218878e-05, + "loss": 2.983, + "step": 21299 + }, + { + "epoch": 0.991689363782387, + "grad_norm": 0.35060404012676144, + "learning_rate": 8.466205571108455e-05, + "loss": 2.8572, + "step": 21300 + }, + { + "epoch": 0.9917359219684801, + "grad_norm": 0.30012440448969985, + "learning_rate": 8.46601034582506e-05, + "loss": 2.8538, + "step": 21301 + }, + { + "epoch": 0.9917824801545732, + "grad_norm": 0.33908919791603404, + "learning_rate": 8.465815110369266e-05, + "loss": 3.0248, + "step": 21302 + }, + { + "epoch": 0.9918290383406663, + "grad_norm": 0.28792664695738485, + "learning_rate": 8.465619864741648e-05, + "loss": 2.8249, + "step": 21303 + }, + { + "epoch": 0.9918755965267593, + "grad_norm": 0.30837146310663166, + "learning_rate": 8.465424608942776e-05, + "loss": 2.8029, + "step": 21304 + }, + { + "epoch": 0.9919221547128524, + "grad_norm": 0.30901348069898044, + "learning_rate": 8.465229342973223e-05, + "loss": 2.9126, + "step": 21305 + }, + { + "epoch": 0.9919687128989455, + "grad_norm": 0.32319758611899946, + "learning_rate": 8.465034066833563e-05, + "loss": 2.8486, + "step": 21306 + }, + { + "epoch": 0.9920152710850385, + "grad_norm": 0.3380874152086274, + "learning_rate": 8.464838780524372e-05, + "loss": 2.8835, + "step": 21307 + }, + { + "epoch": 0.9920618292711316, + "grad_norm": 0.37874970267382857, + "learning_rate": 8.46464348404622e-05, + "loss": 2.9338, + "step": 21308 + }, + { + "epoch": 0.9921083874572246, + "grad_norm": 0.3229929997138171, + "learning_rate": 8.46444817739968e-05, + "loss": 2.8946, + "step": 21309 + }, + { + "epoch": 0.9921549456433177, + "grad_norm": 0.37548590641405893, + "learning_rate": 8.464252860585324e-05, + "loss": 2.8957, + "step": 21310 + }, + { + "epoch": 0.9922015038294109, + "grad_norm": 0.3443193124644857, + "learning_rate": 8.464057533603729e-05, + "loss": 2.944, + "step": 21311 + }, + { + "epoch": 0.9922480620155039, + "grad_norm": 0.3581349616273749, + "learning_rate": 8.463862196455467e-05, + "loss": 2.9123, + "step": 21312 + }, + { + "epoch": 
0.992294620201597, + "grad_norm": 0.3283678578452395, + "learning_rate": 8.463666849141109e-05, + "loss": 2.8677, + "step": 21313 + }, + { + "epoch": 0.99234117838769, + "grad_norm": 0.3394734536382485, + "learning_rate": 8.463471491661232e-05, + "loss": 2.9145, + "step": 21314 + }, + { + "epoch": 0.9923877365737831, + "grad_norm": 0.34792213497285746, + "learning_rate": 8.463276124016406e-05, + "loss": 2.8058, + "step": 21315 + }, + { + "epoch": 0.9924342947598761, + "grad_norm": 0.3114132488606602, + "learning_rate": 8.463080746207205e-05, + "loss": 2.9036, + "step": 21316 + }, + { + "epoch": 0.9924808529459692, + "grad_norm": 0.32408339888132986, + "learning_rate": 8.462885358234205e-05, + "loss": 2.9564, + "step": 21317 + }, + { + "epoch": 0.9925274111320623, + "grad_norm": 0.31588222980070474, + "learning_rate": 8.462689960097977e-05, + "loss": 2.8115, + "step": 21318 + }, + { + "epoch": 0.9925739693181553, + "grad_norm": 0.29615414087683223, + "learning_rate": 8.462494551799095e-05, + "loss": 2.8677, + "step": 21319 + }, + { + "epoch": 0.9926205275042485, + "grad_norm": 0.3400309752641285, + "learning_rate": 8.462299133338133e-05, + "loss": 2.9365, + "step": 21320 + }, + { + "epoch": 0.9926670856903415, + "grad_norm": 0.32339242952221975, + "learning_rate": 8.462103704715663e-05, + "loss": 2.9996, + "step": 21321 + }, + { + "epoch": 0.9927136438764346, + "grad_norm": 0.305374702302765, + "learning_rate": 8.461908265932261e-05, + "loss": 2.9154, + "step": 21322 + }, + { + "epoch": 0.9927602020625277, + "grad_norm": 0.3110202676995576, + "learning_rate": 8.461712816988499e-05, + "loss": 2.8372, + "step": 21323 + }, + { + "epoch": 0.9928067602486207, + "grad_norm": 0.30164621569475314, + "learning_rate": 8.461517357884951e-05, + "loss": 2.7742, + "step": 21324 + }, + { + "epoch": 0.9928533184347138, + "grad_norm": 0.34141410901810804, + "learning_rate": 8.46132188862219e-05, + "loss": 2.8781, + "step": 21325 + }, + { + "epoch": 0.9928998766208068, + "grad_norm": 0.31550746968318844, + "learning_rate": 8.46112640920079e-05, + "loss": 2.8472, + "step": 21326 + }, + { + "epoch": 0.9929464348068999, + "grad_norm": 0.3350515617733548, + "learning_rate": 8.460930919621326e-05, + "loss": 2.9246, + "step": 21327 + }, + { + "epoch": 0.992992992992993, + "grad_norm": 0.3269006271716984, + "learning_rate": 8.46073541988437e-05, + "loss": 2.8301, + "step": 21328 + }, + { + "epoch": 0.993039551179086, + "grad_norm": 0.30691376576331103, + "learning_rate": 8.460539909990497e-05, + "loss": 2.9259, + "step": 21329 + }, + { + "epoch": 0.9930861093651792, + "grad_norm": 0.3502363159872248, + "learning_rate": 8.460344389940281e-05, + "loss": 2.8382, + "step": 21330 + }, + { + "epoch": 0.9931326675512722, + "grad_norm": 0.29591963720177944, + "learning_rate": 8.460148859734292e-05, + "loss": 2.8424, + "step": 21331 + }, + { + "epoch": 0.9931792257373653, + "grad_norm": 0.3355847179188052, + "learning_rate": 8.459953319373109e-05, + "loss": 2.9203, + "step": 21332 + }, + { + "epoch": 0.9932257839234584, + "grad_norm": 0.298895353010402, + "learning_rate": 8.459757768857303e-05, + "loss": 2.8985, + "step": 21333 + }, + { + "epoch": 0.9932723421095514, + "grad_norm": 0.3189035827378353, + "learning_rate": 8.459562208187451e-05, + "loss": 2.8447, + "step": 21334 + }, + { + "epoch": 0.9933189002956445, + "grad_norm": 0.31628008904212657, + "learning_rate": 8.459366637364124e-05, + "loss": 2.8723, + "step": 21335 + }, + { + "epoch": 0.9933654584817375, + "grad_norm": 0.32627966536154795, + "learning_rate": 
8.459171056387895e-05, + "loss": 2.7789, + "step": 21336 + }, + { + "epoch": 0.9934120166678306, + "grad_norm": 0.3204722952679227, + "learning_rate": 8.458975465259343e-05, + "loss": 2.8272, + "step": 21337 + }, + { + "epoch": 0.9934585748539236, + "grad_norm": 0.32073369545950614, + "learning_rate": 8.458779863979035e-05, + "loss": 2.7825, + "step": 21338 + }, + { + "epoch": 0.9935051330400168, + "grad_norm": 0.321835435517539, + "learning_rate": 8.458584252547551e-05, + "loss": 2.8948, + "step": 21339 + }, + { + "epoch": 0.9935516912261099, + "grad_norm": 0.30202156526939666, + "learning_rate": 8.458388630965463e-05, + "loss": 2.8484, + "step": 21340 + }, + { + "epoch": 0.9935982494122029, + "grad_norm": 0.3326795901274059, + "learning_rate": 8.458192999233344e-05, + "loss": 2.9477, + "step": 21341 + }, + { + "epoch": 0.993644807598296, + "grad_norm": 0.33519531625265453, + "learning_rate": 8.457997357351771e-05, + "loss": 2.7485, + "step": 21342 + }, + { + "epoch": 0.993691365784389, + "grad_norm": 0.3218410766209728, + "learning_rate": 8.457801705321315e-05, + "loss": 2.8602, + "step": 21343 + }, + { + "epoch": 0.9937379239704821, + "grad_norm": 0.367436285184502, + "learning_rate": 8.457606043142551e-05, + "loss": 2.909, + "step": 21344 + }, + { + "epoch": 0.9937844821565752, + "grad_norm": 0.3118993411476585, + "learning_rate": 8.457410370816054e-05, + "loss": 2.8919, + "step": 21345 + }, + { + "epoch": 0.9938310403426682, + "grad_norm": 0.3234542114064295, + "learning_rate": 8.457214688342399e-05, + "loss": 2.9142, + "step": 21346 + }, + { + "epoch": 0.9938775985287613, + "grad_norm": 0.3022476035011769, + "learning_rate": 8.45701899572216e-05, + "loss": 2.8053, + "step": 21347 + }, + { + "epoch": 0.9939241567148543, + "grad_norm": 0.3329854309629609, + "learning_rate": 8.456823292955911e-05, + "loss": 2.9116, + "step": 21348 + }, + { + "epoch": 0.9939707149009475, + "grad_norm": 0.3412297383162532, + "learning_rate": 8.456627580044225e-05, + "loss": 2.8934, + "step": 21349 + }, + { + "epoch": 0.9940172730870406, + "grad_norm": 0.3235640122493767, + "learning_rate": 8.456431856987676e-05, + "loss": 2.8022, + "step": 21350 + }, + { + "epoch": 0.9940638312731336, + "grad_norm": 0.319228758745723, + "learning_rate": 8.456236123786843e-05, + "loss": 2.8479, + "step": 21351 + }, + { + "epoch": 0.9941103894592267, + "grad_norm": 0.3211755623758524, + "learning_rate": 8.456040380442297e-05, + "loss": 2.9232, + "step": 21352 + }, + { + "epoch": 0.9941569476453197, + "grad_norm": 0.35422902786618143, + "learning_rate": 8.455844626954612e-05, + "loss": 2.8662, + "step": 21353 + }, + { + "epoch": 0.9942035058314128, + "grad_norm": 0.3353359792486122, + "learning_rate": 8.455648863324364e-05, + "loss": 2.8419, + "step": 21354 + }, + { + "epoch": 0.9942500640175059, + "grad_norm": 0.3495625328966532, + "learning_rate": 8.455453089552127e-05, + "loss": 2.857, + "step": 21355 + }, + { + "epoch": 0.9942966222035989, + "grad_norm": 0.3347432130489937, + "learning_rate": 8.455257305638475e-05, + "loss": 2.9536, + "step": 21356 + }, + { + "epoch": 0.994343180389692, + "grad_norm": 0.33850488363399395, + "learning_rate": 8.455061511583985e-05, + "loss": 2.8738, + "step": 21357 + }, + { + "epoch": 0.994389738575785, + "grad_norm": 0.3376582819382959, + "learning_rate": 8.454865707389228e-05, + "loss": 2.8009, + "step": 21358 + }, + { + "epoch": 0.9944362967618782, + "grad_norm": 0.32145840330409686, + "learning_rate": 8.454669893054781e-05, + "loss": 2.7793, + "step": 21359 + }, + { + "epoch": 
0.9944828549479712, + "grad_norm": 0.3338130698909899, + "learning_rate": 8.454474068581219e-05, + "loss": 2.8199, + "step": 21360 + }, + { + "epoch": 0.9945294131340643, + "grad_norm": 0.3295403760405379, + "learning_rate": 8.454278233969115e-05, + "loss": 2.8561, + "step": 21361 + }, + { + "epoch": 0.9945759713201574, + "grad_norm": 0.32348730759638666, + "learning_rate": 8.454082389219045e-05, + "loss": 2.959, + "step": 21362 + }, + { + "epoch": 0.9946225295062504, + "grad_norm": 0.33595110086446106, + "learning_rate": 8.453886534331583e-05, + "loss": 2.811, + "step": 21363 + }, + { + "epoch": 0.9946690876923435, + "grad_norm": 0.317083857874271, + "learning_rate": 8.453690669307304e-05, + "loss": 2.7829, + "step": 21364 + }, + { + "epoch": 0.9947156458784365, + "grad_norm": 0.3323629702655257, + "learning_rate": 8.453494794146785e-05, + "loss": 2.9153, + "step": 21365 + }, + { + "epoch": 0.9947622040645296, + "grad_norm": 0.3632691105396604, + "learning_rate": 8.453298908850597e-05, + "loss": 2.9065, + "step": 21366 + }, + { + "epoch": 0.9948087622506228, + "grad_norm": 0.30704931881067227, + "learning_rate": 8.453103013419319e-05, + "loss": 2.7859, + "step": 21367 + }, + { + "epoch": 0.9948553204367158, + "grad_norm": 0.3398730389952619, + "learning_rate": 8.452907107853522e-05, + "loss": 2.8563, + "step": 21368 + }, + { + "epoch": 0.9949018786228089, + "grad_norm": 0.3838661758197647, + "learning_rate": 8.452711192153783e-05, + "loss": 2.9406, + "step": 21369 + }, + { + "epoch": 0.9949484368089019, + "grad_norm": 0.30627838396481394, + "learning_rate": 8.452515266320678e-05, + "loss": 2.798, + "step": 21370 + }, + { + "epoch": 0.994994994994995, + "grad_norm": 0.39596424201231295, + "learning_rate": 8.452319330354779e-05, + "loss": 2.8839, + "step": 21371 + }, + { + "epoch": 0.9950415531810881, + "grad_norm": 0.3312313929194931, + "learning_rate": 8.452123384256665e-05, + "loss": 2.8377, + "step": 21372 + }, + { + "epoch": 0.9950881113671811, + "grad_norm": 0.40629087283295034, + "learning_rate": 8.451927428026906e-05, + "loss": 2.8794, + "step": 21373 + }, + { + "epoch": 0.9951346695532742, + "grad_norm": 0.3236337437221426, + "learning_rate": 8.451731461666083e-05, + "loss": 2.9492, + "step": 21374 + }, + { + "epoch": 0.9951812277393672, + "grad_norm": 0.37248192255897683, + "learning_rate": 8.451535485174768e-05, + "loss": 2.876, + "step": 21375 + }, + { + "epoch": 0.9952277859254604, + "grad_norm": 0.320804933729381, + "learning_rate": 8.451339498553535e-05, + "loss": 2.8032, + "step": 21376 + }, + { + "epoch": 0.9952743441115535, + "grad_norm": 0.39113939008331183, + "learning_rate": 8.45114350180296e-05, + "loss": 2.8352, + "step": 21377 + }, + { + "epoch": 0.9953209022976465, + "grad_norm": 0.30784133547439263, + "learning_rate": 8.45094749492362e-05, + "loss": 2.9101, + "step": 21378 + }, + { + "epoch": 0.9953674604837396, + "grad_norm": 0.3458933724604414, + "learning_rate": 8.45075147791609e-05, + "loss": 2.862, + "step": 21379 + }, + { + "epoch": 0.9954140186698326, + "grad_norm": 0.31234288049000797, + "learning_rate": 8.450555450780944e-05, + "loss": 2.7435, + "step": 21380 + }, + { + "epoch": 0.9954605768559257, + "grad_norm": 0.3124710106549771, + "learning_rate": 8.450359413518755e-05, + "loss": 2.7987, + "step": 21381 + }, + { + "epoch": 0.9955071350420187, + "grad_norm": 0.3428316364821386, + "learning_rate": 8.450163366130104e-05, + "loss": 2.7678, + "step": 21382 + }, + { + "epoch": 0.9955536932281118, + "grad_norm": 0.3151034436606998, + "learning_rate": 
8.449967308615562e-05, + "loss": 2.7767, + "step": 21383 + }, + { + "epoch": 0.9956002514142049, + "grad_norm": 0.33382154473034814, + "learning_rate": 8.449771240975707e-05, + "loss": 2.7831, + "step": 21384 + }, + { + "epoch": 0.9956468096002979, + "grad_norm": 0.2958783836760349, + "learning_rate": 8.449575163211112e-05, + "loss": 2.8106, + "step": 21385 + }, + { + "epoch": 0.9956933677863911, + "grad_norm": 0.3230999788424372, + "learning_rate": 8.449379075322354e-05, + "loss": 2.8367, + "step": 21386 + }, + { + "epoch": 0.9957399259724841, + "grad_norm": 0.3208112159558816, + "learning_rate": 8.449182977310009e-05, + "loss": 2.9029, + "step": 21387 + }, + { + "epoch": 0.9957864841585772, + "grad_norm": 0.3301698875996787, + "learning_rate": 8.448986869174652e-05, + "loss": 2.7854, + "step": 21388 + }, + { + "epoch": 0.9958330423446703, + "grad_norm": 0.335102339400982, + "learning_rate": 8.448790750916857e-05, + "loss": 2.9525, + "step": 21389 + }, + { + "epoch": 0.9958796005307633, + "grad_norm": 0.3057774940693192, + "learning_rate": 8.448594622537201e-05, + "loss": 2.8314, + "step": 21390 + }, + { + "epoch": 0.9959261587168564, + "grad_norm": 0.3437297035459384, + "learning_rate": 8.44839848403626e-05, + "loss": 2.9204, + "step": 21391 + }, + { + "epoch": 0.9959727169029494, + "grad_norm": 0.3102649170583415, + "learning_rate": 8.44820233541461e-05, + "loss": 2.8832, + "step": 21392 + }, + { + "epoch": 0.9960192750890425, + "grad_norm": 0.32707273957022226, + "learning_rate": 8.448006176672825e-05, + "loss": 2.7672, + "step": 21393 + }, + { + "epoch": 0.9960658332751356, + "grad_norm": 0.3358639916007216, + "learning_rate": 8.447810007811482e-05, + "loss": 2.8211, + "step": 21394 + }, + { + "epoch": 0.9961123914612287, + "grad_norm": 0.3148563693187852, + "learning_rate": 8.447613828831156e-05, + "loss": 2.8877, + "step": 21395 + }, + { + "epoch": 0.9961589496473218, + "grad_norm": 0.31205564436006944, + "learning_rate": 8.447417639732423e-05, + "loss": 2.7232, + "step": 21396 + }, + { + "epoch": 0.9962055078334148, + "grad_norm": 0.3280426731067577, + "learning_rate": 8.447221440515859e-05, + "loss": 2.8166, + "step": 21397 + }, + { + "epoch": 0.9962520660195079, + "grad_norm": 0.33457081239736663, + "learning_rate": 8.44702523118204e-05, + "loss": 2.8647, + "step": 21398 + }, + { + "epoch": 0.996298624205601, + "grad_norm": 0.35854416083073887, + "learning_rate": 8.44682901173154e-05, + "loss": 3.0368, + "step": 21399 + }, + { + "epoch": 0.996345182391694, + "grad_norm": 0.32351188876864695, + "learning_rate": 8.446632782164938e-05, + "loss": 2.8734, + "step": 21400 + }, + { + "epoch": 0.9963917405777871, + "grad_norm": 0.3563941470065113, + "learning_rate": 8.446436542482808e-05, + "loss": 2.856, + "step": 21401 + }, + { + "epoch": 0.9964382987638801, + "grad_norm": 0.33270572141017885, + "learning_rate": 8.446240292685725e-05, + "loss": 2.8684, + "step": 21402 + }, + { + "epoch": 0.9964848569499732, + "grad_norm": 0.3478306373800298, + "learning_rate": 8.446044032774268e-05, + "loss": 2.8431, + "step": 21403 + }, + { + "epoch": 0.9965314151360662, + "grad_norm": 0.3257918016576067, + "learning_rate": 8.445847762749011e-05, + "loss": 2.8813, + "step": 21404 + }, + { + "epoch": 0.9965779733221594, + "grad_norm": 0.3454479357278452, + "learning_rate": 8.44565148261053e-05, + "loss": 2.8724, + "step": 21405 + }, + { + "epoch": 0.9966245315082525, + "grad_norm": 0.3350352421574858, + "learning_rate": 8.445455192359401e-05, + "loss": 2.8763, + "step": 21406 + }, + { + "epoch": 
0.9966710896943455, + "grad_norm": 0.35880259841641776, + "learning_rate": 8.4452588919962e-05, + "loss": 2.8569, + "step": 21407 + }, + { + "epoch": 0.9967176478804386, + "grad_norm": 0.3412230084041266, + "learning_rate": 8.445062581521504e-05, + "loss": 2.8652, + "step": 21408 + }, + { + "epoch": 0.9967642060665316, + "grad_norm": 0.32941151400041085, + "learning_rate": 8.44486626093589e-05, + "loss": 2.7001, + "step": 21409 + }, + { + "epoch": 0.9968107642526247, + "grad_norm": 0.34814634786555027, + "learning_rate": 8.444669930239932e-05, + "loss": 2.8041, + "step": 21410 + }, + { + "epoch": 0.9968573224387178, + "grad_norm": 0.33905116303905225, + "learning_rate": 8.444473589434208e-05, + "loss": 2.9191, + "step": 21411 + }, + { + "epoch": 0.9969038806248108, + "grad_norm": 0.32081915871529615, + "learning_rate": 8.444277238519292e-05, + "loss": 2.8539, + "step": 21412 + }, + { + "epoch": 0.996950438810904, + "grad_norm": 0.3821325208329643, + "learning_rate": 8.444080877495761e-05, + "loss": 2.805, + "step": 21413 + }, + { + "epoch": 0.996996996996997, + "grad_norm": 0.36252324541893927, + "learning_rate": 8.443884506364193e-05, + "loss": 2.8048, + "step": 21414 + }, + { + "epoch": 0.9970435551830901, + "grad_norm": 0.3401357228598553, + "learning_rate": 8.443688125125164e-05, + "loss": 2.8852, + "step": 21415 + }, + { + "epoch": 0.9970901133691832, + "grad_norm": 0.3432862287090094, + "learning_rate": 8.443491733779248e-05, + "loss": 2.8671, + "step": 21416 + }, + { + "epoch": 0.9971366715552762, + "grad_norm": 0.3305456485861089, + "learning_rate": 8.443295332327024e-05, + "loss": 2.781, + "step": 21417 + }, + { + "epoch": 0.9971832297413693, + "grad_norm": 0.3306715330839746, + "learning_rate": 8.443098920769067e-05, + "loss": 2.7696, + "step": 21418 + }, + { + "epoch": 0.9972297879274623, + "grad_norm": 0.33918772431136907, + "learning_rate": 8.442902499105953e-05, + "loss": 3.0141, + "step": 21419 + }, + { + "epoch": 0.9972763461135554, + "grad_norm": 0.3507561339106354, + "learning_rate": 8.442706067338262e-05, + "loss": 2.9004, + "step": 21420 + }, + { + "epoch": 0.9973229042996485, + "grad_norm": 0.36501861825416143, + "learning_rate": 8.442509625466566e-05, + "loss": 2.8831, + "step": 21421 + }, + { + "epoch": 0.9973694624857415, + "grad_norm": 0.3409652560763656, + "learning_rate": 8.442313173491443e-05, + "loss": 2.7844, + "step": 21422 + }, + { + "epoch": 0.9974160206718347, + "grad_norm": 0.3805037188707477, + "learning_rate": 8.44211671141347e-05, + "loss": 2.8432, + "step": 21423 + }, + { + "epoch": 0.9974625788579277, + "grad_norm": 0.37336769299225053, + "learning_rate": 8.441920239233226e-05, + "loss": 2.8747, + "step": 21424 + }, + { + "epoch": 0.9975091370440208, + "grad_norm": 0.34002053093913065, + "learning_rate": 8.441723756951282e-05, + "loss": 2.8208, + "step": 21425 + }, + { + "epoch": 0.9975556952301138, + "grad_norm": 0.37503480481505486, + "learning_rate": 8.441527264568219e-05, + "loss": 2.8514, + "step": 21426 + }, + { + "epoch": 0.9976022534162069, + "grad_norm": 0.3521484963686413, + "learning_rate": 8.441330762084613e-05, + "loss": 2.8994, + "step": 21427 + }, + { + "epoch": 0.9976488116023, + "grad_norm": 0.4048410714480568, + "learning_rate": 8.44113424950104e-05, + "loss": 2.9894, + "step": 21428 + }, + { + "epoch": 0.997695369788393, + "grad_norm": 0.3611244850663939, + "learning_rate": 8.440937726818076e-05, + "loss": 2.7918, + "step": 21429 + }, + { + "epoch": 0.9977419279744861, + "grad_norm": 0.3433619834140489, + "learning_rate": 
8.440741194036301e-05, + "loss": 2.8717, + "step": 21430 + }, + { + "epoch": 0.9977884861605791, + "grad_norm": 0.3593463071807294, + "learning_rate": 8.440544651156287e-05, + "loss": 2.8511, + "step": 21431 + }, + { + "epoch": 0.9978350443466723, + "grad_norm": 0.3397071207068734, + "learning_rate": 8.440348098178615e-05, + "loss": 2.8102, + "step": 21432 + }, + { + "epoch": 0.9978816025327654, + "grad_norm": 0.3427234573020433, + "learning_rate": 8.440151535103861e-05, + "loss": 2.9414, + "step": 21433 + }, + { + "epoch": 0.9979281607188584, + "grad_norm": 0.3494508047545362, + "learning_rate": 8.439954961932599e-05, + "loss": 2.9308, + "step": 21434 + }, + { + "epoch": 0.9979747189049515, + "grad_norm": 0.3411220453392997, + "learning_rate": 8.43975837866541e-05, + "loss": 2.7624, + "step": 21435 + }, + { + "epoch": 0.9980212770910445, + "grad_norm": 0.34401326795554105, + "learning_rate": 8.439561785302868e-05, + "loss": 2.73, + "step": 21436 + }, + { + "epoch": 0.9980678352771376, + "grad_norm": 0.34220035769567647, + "learning_rate": 8.439365181845551e-05, + "loss": 2.9603, + "step": 21437 + }, + { + "epoch": 0.9981143934632307, + "grad_norm": 0.3395427728590107, + "learning_rate": 8.439168568294037e-05, + "loss": 2.7943, + "step": 21438 + }, + { + "epoch": 0.9981609516493237, + "grad_norm": 0.37219124377601687, + "learning_rate": 8.438971944648902e-05, + "loss": 2.9102, + "step": 21439 + }, + { + "epoch": 0.9982075098354168, + "grad_norm": 0.34592654135029977, + "learning_rate": 8.43877531091072e-05, + "loss": 2.7092, + "step": 21440 + }, + { + "epoch": 0.9982540680215098, + "grad_norm": 0.3523580417295698, + "learning_rate": 8.438578667080075e-05, + "loss": 2.8911, + "step": 21441 + }, + { + "epoch": 0.998300626207603, + "grad_norm": 0.3560730761062527, + "learning_rate": 8.43838201315754e-05, + "loss": 2.9366, + "step": 21442 + }, + { + "epoch": 0.9983471843936961, + "grad_norm": 0.3373373635903106, + "learning_rate": 8.43818534914369e-05, + "loss": 2.8102, + "step": 21443 + }, + { + "epoch": 0.9983937425797891, + "grad_norm": 0.3449563151476025, + "learning_rate": 8.437988675039107e-05, + "loss": 2.8745, + "step": 21444 + }, + { + "epoch": 0.9984403007658822, + "grad_norm": 0.33646762159122456, + "learning_rate": 8.437791990844364e-05, + "loss": 2.9393, + "step": 21445 + }, + { + "epoch": 0.9984868589519752, + "grad_norm": 0.35466536293399686, + "learning_rate": 8.437595296560041e-05, + "loss": 3.0245, + "step": 21446 + }, + { + "epoch": 0.9985334171380683, + "grad_norm": 0.35944678681769465, + "learning_rate": 8.437398592186716e-05, + "loss": 2.8685, + "step": 21447 + }, + { + "epoch": 0.9985799753241613, + "grad_norm": 0.3222455197861889, + "learning_rate": 8.437201877724964e-05, + "loss": 2.9704, + "step": 21448 + }, + { + "epoch": 0.9986265335102544, + "grad_norm": 0.3237699109235801, + "learning_rate": 8.437005153175363e-05, + "loss": 2.9423, + "step": 21449 + }, + { + "epoch": 0.9986730916963475, + "grad_norm": 0.32835309854632067, + "learning_rate": 8.436808418538489e-05, + "loss": 2.8204, + "step": 21450 + }, + { + "epoch": 0.9987196498824406, + "grad_norm": 0.33342998090939224, + "learning_rate": 8.436611673814923e-05, + "loss": 2.9029, + "step": 21451 + }, + { + "epoch": 0.9987662080685337, + "grad_norm": 0.3257011829192343, + "learning_rate": 8.43641491900524e-05, + "loss": 2.851, + "step": 21452 + }, + { + "epoch": 0.9988127662546267, + "grad_norm": 0.31507493548509535, + "learning_rate": 8.436218154110016e-05, + "loss": 2.7535, + "step": 21453 + }, + { + "epoch": 
0.9988593244407198, + "grad_norm": 0.36550311956328385, + "learning_rate": 8.436021379129833e-05, + "loss": 2.8594, + "step": 21454 + }, + { + "epoch": 0.9989058826268129, + "grad_norm": 0.34962891781861916, + "learning_rate": 8.435824594065263e-05, + "loss": 2.8437, + "step": 21455 + }, + { + "epoch": 0.9989524408129059, + "grad_norm": 0.3684286922894403, + "learning_rate": 8.435627798916888e-05, + "loss": 2.8253, + "step": 21456 + }, + { + "epoch": 0.998998998998999, + "grad_norm": 0.3344979244221456, + "learning_rate": 8.435430993685282e-05, + "loss": 2.7748, + "step": 21457 + }, + { + "epoch": 0.999045557185092, + "grad_norm": 0.3566556990515384, + "learning_rate": 8.435234178371027e-05, + "loss": 2.9404, + "step": 21458 + }, + { + "epoch": 0.9990921153711851, + "grad_norm": 0.3316625305731707, + "learning_rate": 8.435037352974696e-05, + "loss": 2.8461, + "step": 21459 + }, + { + "epoch": 0.9991386735572783, + "grad_norm": 0.33187655841157854, + "learning_rate": 8.43484051749687e-05, + "loss": 2.8726, + "step": 21460 + }, + { + "epoch": 0.9991852317433713, + "grad_norm": 0.3154713122635066, + "learning_rate": 8.434643671938125e-05, + "loss": 2.9179, + "step": 21461 + }, + { + "epoch": 0.9992317899294644, + "grad_norm": 0.3371598667499557, + "learning_rate": 8.43444681629904e-05, + "loss": 2.8457, + "step": 21462 + }, + { + "epoch": 0.9992783481155574, + "grad_norm": 0.36042833305592054, + "learning_rate": 8.434249950580191e-05, + "loss": 2.8505, + "step": 21463 + }, + { + "epoch": 0.9993249063016505, + "grad_norm": 0.31753743650253596, + "learning_rate": 8.434053074782157e-05, + "loss": 2.8334, + "step": 21464 + }, + { + "epoch": 0.9993714644877436, + "grad_norm": 0.355372117195268, + "learning_rate": 8.433856188905516e-05, + "loss": 2.9001, + "step": 21465 + }, + { + "epoch": 0.9994180226738366, + "grad_norm": 0.3398984725114164, + "learning_rate": 8.433659292950844e-05, + "loss": 2.8323, + "step": 21466 + }, + { + "epoch": 0.9994645808599297, + "grad_norm": 0.3598840710730407, + "learning_rate": 8.433462386918722e-05, + "loss": 2.9071, + "step": 21467 + }, + { + "epoch": 0.9995111390460227, + "grad_norm": 0.3300881404081763, + "learning_rate": 8.433265470809724e-05, + "loss": 2.9313, + "step": 21468 + }, + { + "epoch": 0.9995576972321158, + "grad_norm": 0.37718998026193273, + "learning_rate": 8.433068544624433e-05, + "loss": 2.9087, + "step": 21469 + }, + { + "epoch": 0.9996042554182089, + "grad_norm": 0.41242355628982164, + "learning_rate": 8.432871608363421e-05, + "loss": 3.0197, + "step": 21470 + }, + { + "epoch": 0.999650813604302, + "grad_norm": 0.3114643489478214, + "learning_rate": 8.43267466202727e-05, + "loss": 2.8436, + "step": 21471 + }, + { + "epoch": 0.9996973717903951, + "grad_norm": 0.3763242516407874, + "learning_rate": 8.432477705616559e-05, + "loss": 2.8617, + "step": 21472 + }, + { + "epoch": 0.9997439299764881, + "grad_norm": 0.35329042303705843, + "learning_rate": 8.432280739131862e-05, + "loss": 2.9272, + "step": 21473 + }, + { + "epoch": 0.9997904881625812, + "grad_norm": 0.3218957309950244, + "learning_rate": 8.43208376257376e-05, + "loss": 2.7989, + "step": 21474 + }, + { + "epoch": 0.9998370463486742, + "grad_norm": 0.3562676342341345, + "learning_rate": 8.43188677594283e-05, + "loss": 2.767, + "step": 21475 + }, + { + "epoch": 0.9998836045347673, + "grad_norm": 0.315215070365387, + "learning_rate": 8.431689779239651e-05, + "loss": 2.8986, + "step": 21476 + }, + { + "epoch": 0.9999301627208604, + "grad_norm": 0.314886411352715, + "learning_rate": 
8.431492772464799e-05, + "loss": 2.8326, + "step": 21477 + }, + { + "epoch": 0.9999767209069534, + "grad_norm": 0.3599934577282777, + "learning_rate": 8.431295755618856e-05, + "loss": 2.9558, + "step": 21478 + }, + { + "epoch": 1.0, + "grad_norm": 0.3599934577282777, + "learning_rate": 8.431098728702396e-05, + "loss": 2.8162, + "step": 21479 + }, + { + "epoch": 1.0000465581860931, + "grad_norm": 0.4422223637006491, + "learning_rate": 8.430901691716e-05, + "loss": 2.9129, + "step": 21480 + }, + { + "epoch": 1.0000931163721862, + "grad_norm": 0.3405458886774884, + "learning_rate": 8.430704644660246e-05, + "loss": 2.7986, + "step": 21481 + }, + { + "epoch": 1.0001396745582791, + "grad_norm": 0.3463003321703549, + "learning_rate": 8.430507587535712e-05, + "loss": 2.9426, + "step": 21482 + }, + { + "epoch": 1.0001862327443722, + "grad_norm": 0.3330048642830492, + "learning_rate": 8.430310520342976e-05, + "loss": 2.8387, + "step": 21483 + }, + { + "epoch": 1.0002327909304654, + "grad_norm": 0.33375281818722746, + "learning_rate": 8.430113443082616e-05, + "loss": 2.9001, + "step": 21484 + }, + { + "epoch": 1.0002793491165585, + "grad_norm": 0.339411157949759, + "learning_rate": 8.429916355755209e-05, + "loss": 2.8091, + "step": 21485 + }, + { + "epoch": 1.0003259073026516, + "grad_norm": 0.32794004805016164, + "learning_rate": 8.42971925836134e-05, + "loss": 2.8447, + "step": 21486 + }, + { + "epoch": 1.0003724654887445, + "grad_norm": 0.3316392042699535, + "learning_rate": 8.429522150901579e-05, + "loss": 2.9108, + "step": 21487 + }, + { + "epoch": 1.0004190236748376, + "grad_norm": 0.32223438597421755, + "learning_rate": 8.429325033376511e-05, + "loss": 2.9114, + "step": 21488 + }, + { + "epoch": 1.0004655818609307, + "grad_norm": 0.3493384501685294, + "learning_rate": 8.42912790578671e-05, + "loss": 2.8933, + "step": 21489 + }, + { + "epoch": 1.0005121400470238, + "grad_norm": 0.3338823900285061, + "learning_rate": 8.428930768132757e-05, + "loss": 2.8502, + "step": 21490 + }, + { + "epoch": 1.000558698233117, + "grad_norm": 0.3171274298953507, + "learning_rate": 8.42873362041523e-05, + "loss": 2.9331, + "step": 21491 + }, + { + "epoch": 1.0006052564192098, + "grad_norm": 0.3320531124593092, + "learning_rate": 8.428536462634708e-05, + "loss": 2.9267, + "step": 21492 + }, + { + "epoch": 1.000651814605303, + "grad_norm": 0.3260719681423933, + "learning_rate": 8.42833929479177e-05, + "loss": 2.9642, + "step": 21493 + }, + { + "epoch": 1.000698372791396, + "grad_norm": 0.3193257588512547, + "learning_rate": 8.428142116886991e-05, + "loss": 2.8151, + "step": 21494 + }, + { + "epoch": 1.0007449309774892, + "grad_norm": 0.30282301407697315, + "learning_rate": 8.427944928920954e-05, + "loss": 2.8135, + "step": 21495 + }, + { + "epoch": 1.0007914891635823, + "grad_norm": 0.33165838237064615, + "learning_rate": 8.427747730894238e-05, + "loss": 2.9446, + "step": 21496 + }, + { + "epoch": 1.0008380473496752, + "grad_norm": 0.3418802846961454, + "learning_rate": 8.42755052280742e-05, + "loss": 2.8215, + "step": 21497 + }, + { + "epoch": 1.0008846055357683, + "grad_norm": 0.31063810415978615, + "learning_rate": 8.427353304661077e-05, + "loss": 2.8652, + "step": 21498 + }, + { + "epoch": 1.0009311637218614, + "grad_norm": 0.3616158804922221, + "learning_rate": 8.42715607645579e-05, + "loss": 2.8665, + "step": 21499 + }, + { + "epoch": 1.0009777219079545, + "grad_norm": 0.3348871325874429, + "learning_rate": 8.426958838192138e-05, + "loss": 2.841, + "step": 21500 + }, + { + "epoch": 1.0010242800940474, + 
"grad_norm": 0.34160701184833187, + "learning_rate": 8.426761589870699e-05, + "loss": 2.8329, + "step": 21501 + }, + { + "epoch": 1.0010708382801405, + "grad_norm": 0.31785859187982934, + "learning_rate": 8.426564331492053e-05, + "loss": 2.873, + "step": 21502 + }, + { + "epoch": 1.0011173964662337, + "grad_norm": 0.3539241450458252, + "learning_rate": 8.426367063056778e-05, + "loss": 2.8609, + "step": 21503 + }, + { + "epoch": 1.0011639546523268, + "grad_norm": 0.3432016716117409, + "learning_rate": 8.426169784565452e-05, + "loss": 2.904, + "step": 21504 + }, + { + "epoch": 1.0012105128384199, + "grad_norm": 0.33800165796515247, + "learning_rate": 8.425972496018658e-05, + "loss": 2.7087, + "step": 21505 + }, + { + "epoch": 1.0012570710245128, + "grad_norm": 0.3573049129476521, + "learning_rate": 8.42577519741697e-05, + "loss": 2.7381, + "step": 21506 + }, + { + "epoch": 1.001303629210606, + "grad_norm": 0.36322252020887563, + "learning_rate": 8.425577888760971e-05, + "loss": 2.8721, + "step": 21507 + }, + { + "epoch": 1.001350187396699, + "grad_norm": 0.36662436373347895, + "learning_rate": 8.425380570051236e-05, + "loss": 2.8205, + "step": 21508 + }, + { + "epoch": 1.0013967455827921, + "grad_norm": 0.3677540189241769, + "learning_rate": 8.425183241288349e-05, + "loss": 2.8573, + "step": 21509 + }, + { + "epoch": 1.0014433037688852, + "grad_norm": 0.3689585639640332, + "learning_rate": 8.424985902472886e-05, + "loss": 2.8492, + "step": 21510 + }, + { + "epoch": 1.0014898619549781, + "grad_norm": 0.35281140673809513, + "learning_rate": 8.424788553605426e-05, + "loss": 2.7475, + "step": 21511 + }, + { + "epoch": 1.0015364201410712, + "grad_norm": 0.36015960942118397, + "learning_rate": 8.424591194686548e-05, + "loss": 2.8634, + "step": 21512 + }, + { + "epoch": 1.0015829783271644, + "grad_norm": 0.35763807557390825, + "learning_rate": 8.424393825716834e-05, + "loss": 2.8803, + "step": 21513 + }, + { + "epoch": 1.0016295365132575, + "grad_norm": 0.36738677526135477, + "learning_rate": 8.424196446696862e-05, + "loss": 2.896, + "step": 21514 + }, + { + "epoch": 1.0016760946993506, + "grad_norm": 0.3322995920184061, + "learning_rate": 8.42399905762721e-05, + "loss": 2.8014, + "step": 21515 + }, + { + "epoch": 1.0017226528854435, + "grad_norm": 0.35812164035356137, + "learning_rate": 8.423801658508457e-05, + "loss": 2.7732, + "step": 21516 + }, + { + "epoch": 1.0017692110715366, + "grad_norm": 0.30178288429687594, + "learning_rate": 8.423604249341184e-05, + "loss": 2.7824, + "step": 21517 + }, + { + "epoch": 1.0018157692576297, + "grad_norm": 0.3386928676780543, + "learning_rate": 8.42340683012597e-05, + "loss": 2.9044, + "step": 21518 + }, + { + "epoch": 1.0018623274437228, + "grad_norm": 0.32802649494922764, + "learning_rate": 8.423209400863394e-05, + "loss": 2.8337, + "step": 21519 + }, + { + "epoch": 1.001908885629816, + "grad_norm": 0.3271843571357147, + "learning_rate": 8.423011961554037e-05, + "loss": 2.7395, + "step": 21520 + }, + { + "epoch": 1.0019554438159088, + "grad_norm": 0.39318178566509393, + "learning_rate": 8.422814512198476e-05, + "loss": 2.7942, + "step": 21521 + }, + { + "epoch": 1.002002002002002, + "grad_norm": 0.33549367300305, + "learning_rate": 8.42261705279729e-05, + "loss": 2.849, + "step": 21522 + }, + { + "epoch": 1.002048560188095, + "grad_norm": 0.3238235383530095, + "learning_rate": 8.422419583351062e-05, + "loss": 2.7819, + "step": 21523 + }, + { + "epoch": 1.0020951183741882, + "grad_norm": 0.35417128478752785, + "learning_rate": 8.422222103860372e-05, + 
"loss": 2.865, + "step": 21524 + }, + { + "epoch": 1.0021416765602813, + "grad_norm": 0.3542779588585835, + "learning_rate": 8.422024614325793e-05, + "loss": 2.8461, + "step": 21525 + }, + { + "epoch": 1.0021882347463742, + "grad_norm": 0.3366848293632048, + "learning_rate": 8.421827114747912e-05, + "loss": 2.9123, + "step": 21526 + }, + { + "epoch": 1.0022347929324673, + "grad_norm": 0.34738489092878294, + "learning_rate": 8.421629605127302e-05, + "loss": 2.85, + "step": 21527 + }, + { + "epoch": 1.0022813511185604, + "grad_norm": 0.32974715116340014, + "learning_rate": 8.42143208546455e-05, + "loss": 2.95, + "step": 21528 + }, + { + "epoch": 1.0023279093046535, + "grad_norm": 0.3491448491421695, + "learning_rate": 8.42123455576023e-05, + "loss": 2.8119, + "step": 21529 + }, + { + "epoch": 1.0023744674907467, + "grad_norm": 0.3344828025889341, + "learning_rate": 8.421037016014923e-05, + "loss": 2.7692, + "step": 21530 + }, + { + "epoch": 1.0024210256768395, + "grad_norm": 0.3592718126892963, + "learning_rate": 8.420839466229211e-05, + "loss": 2.7858, + "step": 21531 + }, + { + "epoch": 1.0024675838629327, + "grad_norm": 0.3153581847939026, + "learning_rate": 8.420641906403671e-05, + "loss": 2.8657, + "step": 21532 + }, + { + "epoch": 1.0025141420490258, + "grad_norm": 0.35049042591788726, + "learning_rate": 8.420444336538885e-05, + "loss": 2.9196, + "step": 21533 + }, + { + "epoch": 1.002560700235119, + "grad_norm": 0.3216525125817222, + "learning_rate": 8.42024675663543e-05, + "loss": 2.8162, + "step": 21534 + }, + { + "epoch": 1.002607258421212, + "grad_norm": 0.3382923463810527, + "learning_rate": 8.420049166693888e-05, + "loss": 2.7835, + "step": 21535 + }, + { + "epoch": 1.002653816607305, + "grad_norm": 0.32063546646780006, + "learning_rate": 8.419851566714841e-05, + "loss": 2.8072, + "step": 21536 + }, + { + "epoch": 1.002700374793398, + "grad_norm": 0.34245598444527714, + "learning_rate": 8.419653956698866e-05, + "loss": 2.7596, + "step": 21537 + }, + { + "epoch": 1.0027469329794911, + "grad_norm": 0.32568390097508765, + "learning_rate": 8.419456336646543e-05, + "loss": 2.801, + "step": 21538 + }, + { + "epoch": 1.0027934911655842, + "grad_norm": 0.3121429328915303, + "learning_rate": 8.419258706558451e-05, + "loss": 2.8256, + "step": 21539 + }, + { + "epoch": 1.0028400493516774, + "grad_norm": 0.3358685294385916, + "learning_rate": 8.419061066435172e-05, + "loss": 2.7849, + "step": 21540 + }, + { + "epoch": 1.0028866075377703, + "grad_norm": 0.31581846962245985, + "learning_rate": 8.418863416277287e-05, + "loss": 2.8078, + "step": 21541 + }, + { + "epoch": 1.0029331657238634, + "grad_norm": 0.3286215584217832, + "learning_rate": 8.418665756085375e-05, + "loss": 2.7906, + "step": 21542 + }, + { + "epoch": 1.0029797239099565, + "grad_norm": 0.32768588841475316, + "learning_rate": 8.418468085860014e-05, + "loss": 2.8309, + "step": 21543 + }, + { + "epoch": 1.0030262820960496, + "grad_norm": 0.34146132995411377, + "learning_rate": 8.418270405601787e-05, + "loss": 2.8522, + "step": 21544 + }, + { + "epoch": 1.0030728402821425, + "grad_norm": 0.315338481964039, + "learning_rate": 8.418072715311273e-05, + "loss": 2.8962, + "step": 21545 + }, + { + "epoch": 1.0031193984682356, + "grad_norm": 0.31837223513643975, + "learning_rate": 8.417875014989053e-05, + "loss": 2.8443, + "step": 21546 + }, + { + "epoch": 1.0031659566543287, + "grad_norm": 0.29470725771373424, + "learning_rate": 8.417677304635705e-05, + "loss": 2.7983, + "step": 21547 + }, + { + "epoch": 1.0032125148404218, + 
"grad_norm": 0.3166142191573755, + "learning_rate": 8.417479584251812e-05, + "loss": 3.0186, + "step": 21548 + }, + { + "epoch": 1.003259073026515, + "grad_norm": 0.3145836905711556, + "learning_rate": 8.417281853837953e-05, + "loss": 2.8318, + "step": 21549 + }, + { + "epoch": 1.0033056312126079, + "grad_norm": 0.31102587177220686, + "learning_rate": 8.417084113394708e-05, + "loss": 2.8105, + "step": 21550 + }, + { + "epoch": 1.003352189398701, + "grad_norm": 0.31981203942816494, + "learning_rate": 8.416886362922658e-05, + "loss": 2.8292, + "step": 21551 + }, + { + "epoch": 1.003398747584794, + "grad_norm": 0.3211335714291316, + "learning_rate": 8.416688602422382e-05, + "loss": 2.9105, + "step": 21552 + }, + { + "epoch": 1.0034453057708872, + "grad_norm": 0.31910901821269383, + "learning_rate": 8.416490831894464e-05, + "loss": 2.8181, + "step": 21553 + }, + { + "epoch": 1.0034918639569803, + "grad_norm": 0.34262677018430393, + "learning_rate": 8.416293051339478e-05, + "loss": 2.8188, + "step": 21554 + }, + { + "epoch": 1.0035384221430732, + "grad_norm": 0.33511876570981525, + "learning_rate": 8.416095260758012e-05, + "loss": 2.8781, + "step": 21555 + }, + { + "epoch": 1.0035849803291663, + "grad_norm": 0.3272458306282797, + "learning_rate": 8.415897460150642e-05, + "loss": 2.9059, + "step": 21556 + }, + { + "epoch": 1.0036315385152594, + "grad_norm": 0.3517198524055732, + "learning_rate": 8.41569964951795e-05, + "loss": 3.0018, + "step": 21557 + }, + { + "epoch": 1.0036780967013526, + "grad_norm": 0.3475932416884142, + "learning_rate": 8.415501828860515e-05, + "loss": 2.8921, + "step": 21558 + }, + { + "epoch": 1.0037246548874457, + "grad_norm": 0.350751015904121, + "learning_rate": 8.41530399817892e-05, + "loss": 2.8479, + "step": 21559 + }, + { + "epoch": 1.0037712130735386, + "grad_norm": 0.3433248711255729, + "learning_rate": 8.415106157473743e-05, + "loss": 2.9049, + "step": 21560 + }, + { + "epoch": 1.0038177712596317, + "grad_norm": 0.3318409821805468, + "learning_rate": 8.414908306745566e-05, + "loss": 2.8695, + "step": 21561 + }, + { + "epoch": 1.0038643294457248, + "grad_norm": 0.3243080997014941, + "learning_rate": 8.41471044599497e-05, + "loss": 2.8435, + "step": 21562 + }, + { + "epoch": 1.003910887631818, + "grad_norm": 0.33352410421191664, + "learning_rate": 8.414512575222535e-05, + "loss": 2.9077, + "step": 21563 + }, + { + "epoch": 1.003957445817911, + "grad_norm": 0.3334898422343072, + "learning_rate": 8.414314694428843e-05, + "loss": 2.7857, + "step": 21564 + }, + { + "epoch": 1.004004004004004, + "grad_norm": 0.32603652807249794, + "learning_rate": 8.414116803614471e-05, + "loss": 2.801, + "step": 21565 + }, + { + "epoch": 1.004050562190097, + "grad_norm": 0.33445045814293084, + "learning_rate": 8.413918902780005e-05, + "loss": 2.8445, + "step": 21566 + }, + { + "epoch": 1.0040971203761901, + "grad_norm": 0.34043575646114227, + "learning_rate": 8.413720991926022e-05, + "loss": 2.9539, + "step": 21567 + }, + { + "epoch": 1.0041436785622833, + "grad_norm": 0.32084393547717643, + "learning_rate": 8.413523071053105e-05, + "loss": 2.8843, + "step": 21568 + }, + { + "epoch": 1.0041902367483764, + "grad_norm": 0.3414128231655438, + "learning_rate": 8.413325140161834e-05, + "loss": 2.842, + "step": 21569 + }, + { + "epoch": 1.0042367949344693, + "grad_norm": 0.3225419300877905, + "learning_rate": 8.41312719925279e-05, + "loss": 2.9204, + "step": 21570 + }, + { + "epoch": 1.0042833531205624, + "grad_norm": 0.33360909124292115, + "learning_rate": 8.412929248326553e-05, + "loss": 
2.7285, + "step": 21571 + }, + { + "epoch": 1.0043299113066555, + "grad_norm": 0.3118479148657688, + "learning_rate": 8.412731287383706e-05, + "loss": 2.8759, + "step": 21572 + }, + { + "epoch": 1.0043764694927486, + "grad_norm": 0.38254608226595826, + "learning_rate": 8.412533316424829e-05, + "loss": 2.9369, + "step": 21573 + }, + { + "epoch": 1.0044230276788417, + "grad_norm": 0.3431250208242674, + "learning_rate": 8.412335335450502e-05, + "loss": 2.8139, + "step": 21574 + }, + { + "epoch": 1.0044695858649346, + "grad_norm": 0.33977217255848974, + "learning_rate": 8.412137344461309e-05, + "loss": 2.8732, + "step": 21575 + }, + { + "epoch": 1.0045161440510277, + "grad_norm": 0.37636034469072166, + "learning_rate": 8.411939343457826e-05, + "loss": 2.8283, + "step": 21576 + }, + { + "epoch": 1.0045627022371209, + "grad_norm": 0.3263317109894245, + "learning_rate": 8.411741332440638e-05, + "loss": 2.8073, + "step": 21577 + }, + { + "epoch": 1.004609260423214, + "grad_norm": 0.3526434519108468, + "learning_rate": 8.411543311410326e-05, + "loss": 2.8781, + "step": 21578 + }, + { + "epoch": 1.004655818609307, + "grad_norm": 0.3463882117284183, + "learning_rate": 8.411345280367471e-05, + "loss": 2.7926, + "step": 21579 + }, + { + "epoch": 1.0047023767954, + "grad_norm": 0.3575864512897079, + "learning_rate": 8.411147239312654e-05, + "loss": 2.817, + "step": 21580 + }, + { + "epoch": 1.004748934981493, + "grad_norm": 0.3619799908607513, + "learning_rate": 8.410949188246453e-05, + "loss": 2.8574, + "step": 21581 + }, + { + "epoch": 1.0047954931675862, + "grad_norm": 0.37965358565639085, + "learning_rate": 8.410751127169455e-05, + "loss": 2.8452, + "step": 21582 + }, + { + "epoch": 1.0048420513536793, + "grad_norm": 0.35014191540467404, + "learning_rate": 8.410553056082238e-05, + "loss": 2.9108, + "step": 21583 + }, + { + "epoch": 1.0048886095397724, + "grad_norm": 0.38582133120211465, + "learning_rate": 8.410354974985383e-05, + "loss": 2.7185, + "step": 21584 + }, + { + "epoch": 1.0049351677258653, + "grad_norm": 0.34413724733843776, + "learning_rate": 8.410156883879473e-05, + "loss": 2.8696, + "step": 21585 + }, + { + "epoch": 1.0049817259119584, + "grad_norm": 0.3759301676447954, + "learning_rate": 8.409958782765086e-05, + "loss": 2.8202, + "step": 21586 + }, + { + "epoch": 1.0050282840980516, + "grad_norm": 0.3598389613273122, + "learning_rate": 8.409760671642809e-05, + "loss": 2.8204, + "step": 21587 + }, + { + "epoch": 1.0050748422841447, + "grad_norm": 0.3497222431757991, + "learning_rate": 8.409562550513219e-05, + "loss": 2.8599, + "step": 21588 + }, + { + "epoch": 1.0051214004702376, + "grad_norm": 0.3589342593453575, + "learning_rate": 8.409364419376899e-05, + "loss": 2.8213, + "step": 21589 + }, + { + "epoch": 1.0051679586563307, + "grad_norm": 0.3398014905186985, + "learning_rate": 8.40916627823443e-05, + "loss": 2.835, + "step": 21590 + }, + { + "epoch": 1.0052145168424238, + "grad_norm": 0.3686795979109012, + "learning_rate": 8.408968127086396e-05, + "loss": 2.7357, + "step": 21591 + }, + { + "epoch": 1.005261075028517, + "grad_norm": 0.33960465716009763, + "learning_rate": 8.408769965933373e-05, + "loss": 2.816, + "step": 21592 + }, + { + "epoch": 1.00530763321461, + "grad_norm": 0.39178566983200974, + "learning_rate": 8.408571794775948e-05, + "loss": 2.9241, + "step": 21593 + }, + { + "epoch": 1.005354191400703, + "grad_norm": 0.31150994314730784, + "learning_rate": 8.4083736136147e-05, + "loss": 2.8855, + "step": 21594 + }, + { + "epoch": 1.005400749586796, + "grad_norm": 
0.40698742325091875, + "learning_rate": 8.40817542245021e-05, + "loss": 2.748, + "step": 21595 + }, + { + "epoch": 1.0054473077728892, + "grad_norm": 0.3091506552256253, + "learning_rate": 8.407977221283063e-05, + "loss": 2.8504, + "step": 21596 + }, + { + "epoch": 1.0054938659589823, + "grad_norm": 0.3322053394821156, + "learning_rate": 8.407779010113839e-05, + "loss": 2.8659, + "step": 21597 + }, + { + "epoch": 1.0055404241450754, + "grad_norm": 0.3195731575729175, + "learning_rate": 8.407580788943117e-05, + "loss": 2.7901, + "step": 21598 + }, + { + "epoch": 1.0055869823311683, + "grad_norm": 0.3483557421837734, + "learning_rate": 8.407382557771483e-05, + "loss": 2.7801, + "step": 21599 + }, + { + "epoch": 1.0056335405172614, + "grad_norm": 0.32452279559663505, + "learning_rate": 8.407184316599517e-05, + "loss": 2.7486, + "step": 21600 + }, + { + "epoch": 1.0056800987033545, + "grad_norm": 0.32823738315133416, + "learning_rate": 8.4069860654278e-05, + "loss": 2.8164, + "step": 21601 + }, + { + "epoch": 1.0057266568894476, + "grad_norm": 0.32760703178995065, + "learning_rate": 8.406787804256912e-05, + "loss": 2.8898, + "step": 21602 + }, + { + "epoch": 1.0057732150755407, + "grad_norm": 0.31345853226265863, + "learning_rate": 8.406589533087441e-05, + "loss": 2.837, + "step": 21603 + }, + { + "epoch": 1.0058197732616336, + "grad_norm": 0.3362361333037254, + "learning_rate": 8.406391251919965e-05, + "loss": 2.9033, + "step": 21604 + }, + { + "epoch": 1.0058663314477267, + "grad_norm": 0.32932524515789857, + "learning_rate": 8.406192960755065e-05, + "loss": 2.7631, + "step": 21605 + }, + { + "epoch": 1.0059128896338199, + "grad_norm": 0.333871361745165, + "learning_rate": 8.405994659593323e-05, + "loss": 2.7086, + "step": 21606 + }, + { + "epoch": 1.005959447819913, + "grad_norm": 0.34923044899675293, + "learning_rate": 8.405796348435324e-05, + "loss": 2.8886, + "step": 21607 + }, + { + "epoch": 1.006006006006006, + "grad_norm": 0.3393461814577458, + "learning_rate": 8.405598027281649e-05, + "loss": 2.8412, + "step": 21608 + }, + { + "epoch": 1.006052564192099, + "grad_norm": 0.36672681774688565, + "learning_rate": 8.405399696132879e-05, + "loss": 2.7799, + "step": 21609 + }, + { + "epoch": 1.006099122378192, + "grad_norm": 0.33093369442601556, + "learning_rate": 8.405201354989596e-05, + "loss": 2.9168, + "step": 21610 + }, + { + "epoch": 1.0061456805642852, + "grad_norm": 0.35597462645470257, + "learning_rate": 8.40500300385238e-05, + "loss": 2.9103, + "step": 21611 + }, + { + "epoch": 1.0061922387503783, + "grad_norm": 0.33802903979428556, + "learning_rate": 8.404804642721818e-05, + "loss": 2.8715, + "step": 21612 + }, + { + "epoch": 1.0062387969364714, + "grad_norm": 0.3226479515094418, + "learning_rate": 8.404606271598491e-05, + "loss": 2.7955, + "step": 21613 + }, + { + "epoch": 1.0062853551225643, + "grad_norm": 0.35695039694707875, + "learning_rate": 8.404407890482978e-05, + "loss": 2.8895, + "step": 21614 + }, + { + "epoch": 1.0063319133086575, + "grad_norm": 0.3320462256932762, + "learning_rate": 8.404209499375865e-05, + "loss": 2.8011, + "step": 21615 + }, + { + "epoch": 1.0063784714947506, + "grad_norm": 0.3575427666720136, + "learning_rate": 8.40401109827773e-05, + "loss": 2.7907, + "step": 21616 + }, + { + "epoch": 1.0064250296808437, + "grad_norm": 0.32940295697512373, + "learning_rate": 8.403812687189158e-05, + "loss": 2.8951, + "step": 21617 + }, + { + "epoch": 1.0064715878669368, + "grad_norm": 0.3395843445940132, + "learning_rate": 8.403614266110734e-05, + "loss": 2.8521, 
+ "step": 21618 + }, + { + "epoch": 1.0065181460530297, + "grad_norm": 0.3062404018007429, + "learning_rate": 8.403415835043036e-05, + "loss": 2.789, + "step": 21619 + }, + { + "epoch": 1.0065647042391228, + "grad_norm": 0.358168813765066, + "learning_rate": 8.403217393986648e-05, + "loss": 2.9635, + "step": 21620 + }, + { + "epoch": 1.006611262425216, + "grad_norm": 0.33307856927573587, + "learning_rate": 8.403018942942151e-05, + "loss": 2.8851, + "step": 21621 + }, + { + "epoch": 1.006657820611309, + "grad_norm": 0.32567206291828726, + "learning_rate": 8.40282048191013e-05, + "loss": 2.8766, + "step": 21622 + }, + { + "epoch": 1.0067043787974022, + "grad_norm": 0.35414046929831233, + "learning_rate": 8.402622010891166e-05, + "loss": 2.8323, + "step": 21623 + }, + { + "epoch": 1.006750936983495, + "grad_norm": 0.3375717424448548, + "learning_rate": 8.40242352988584e-05, + "loss": 2.6619, + "step": 21624 + }, + { + "epoch": 1.0067974951695882, + "grad_norm": 0.3726581039166422, + "learning_rate": 8.402225038894737e-05, + "loss": 2.8971, + "step": 21625 + }, + { + "epoch": 1.0068440533556813, + "grad_norm": 0.33629072769176177, + "learning_rate": 8.402026537918439e-05, + "loss": 2.8681, + "step": 21626 + }, + { + "epoch": 1.0068906115417744, + "grad_norm": 0.3308794343313437, + "learning_rate": 8.401828026957528e-05, + "loss": 2.7698, + "step": 21627 + }, + { + "epoch": 1.0069371697278675, + "grad_norm": 0.35374229384668365, + "learning_rate": 8.401629506012587e-05, + "loss": 2.7346, + "step": 21628 + }, + { + "epoch": 1.0069837279139604, + "grad_norm": 0.3380117268303181, + "learning_rate": 8.401430975084199e-05, + "loss": 2.9518, + "step": 21629 + }, + { + "epoch": 1.0070302861000535, + "grad_norm": 0.3384011071120337, + "learning_rate": 8.401232434172945e-05, + "loss": 2.8273, + "step": 21630 + }, + { + "epoch": 1.0070768442861466, + "grad_norm": 0.33670939524543275, + "learning_rate": 8.40103388327941e-05, + "loss": 2.8697, + "step": 21631 + }, + { + "epoch": 1.0071234024722397, + "grad_norm": 0.30969473728038116, + "learning_rate": 8.400835322404176e-05, + "loss": 2.9343, + "step": 21632 + }, + { + "epoch": 1.0071699606583326, + "grad_norm": 0.3194741706416474, + "learning_rate": 8.400636751547825e-05, + "loss": 2.8558, + "step": 21633 + }, + { + "epoch": 1.0072165188444258, + "grad_norm": 0.3026747953810383, + "learning_rate": 8.400438170710939e-05, + "loss": 2.7562, + "step": 21634 + }, + { + "epoch": 1.0072630770305189, + "grad_norm": 0.36967003374540064, + "learning_rate": 8.400239579894103e-05, + "loss": 2.7556, + "step": 21635 + }, + { + "epoch": 1.007309635216612, + "grad_norm": 0.287732120240115, + "learning_rate": 8.400040979097899e-05, + "loss": 3.0223, + "step": 21636 + }, + { + "epoch": 1.007356193402705, + "grad_norm": 0.3696474770057566, + "learning_rate": 8.399842368322909e-05, + "loss": 2.8019, + "step": 21637 + }, + { + "epoch": 1.007402751588798, + "grad_norm": 0.35301534313647326, + "learning_rate": 8.399643747569716e-05, + "loss": 2.741, + "step": 21638 + }, + { + "epoch": 1.007449309774891, + "grad_norm": 0.34720585643983115, + "learning_rate": 8.399445116838905e-05, + "loss": 2.8537, + "step": 21639 + }, + { + "epoch": 1.0074958679609842, + "grad_norm": 0.3382546315052241, + "learning_rate": 8.399246476131058e-05, + "loss": 2.9083, + "step": 21640 + }, + { + "epoch": 1.0075424261470773, + "grad_norm": 0.34015710449448316, + "learning_rate": 8.399047825446755e-05, + "loss": 2.8829, + "step": 21641 + }, + { + "epoch": 1.0075889843331705, + "grad_norm": 
0.3327666160099577, + "learning_rate": 8.398849164786583e-05, + "loss": 2.7548, + "step": 21642 + }, + { + "epoch": 1.0076355425192633, + "grad_norm": 0.3171938806419445, + "learning_rate": 8.398650494151123e-05, + "loss": 2.9086, + "step": 21643 + }, + { + "epoch": 1.0076821007053565, + "grad_norm": 0.3573304359617878, + "learning_rate": 8.398451813540959e-05, + "loss": 2.8456, + "step": 21644 + }, + { + "epoch": 1.0077286588914496, + "grad_norm": 0.29292244236725284, + "learning_rate": 8.398253122956673e-05, + "loss": 2.8242, + "step": 21645 + }, + { + "epoch": 1.0077752170775427, + "grad_norm": 0.35179697025473944, + "learning_rate": 8.398054422398851e-05, + "loss": 2.7709, + "step": 21646 + }, + { + "epoch": 1.0078217752636358, + "grad_norm": 0.30617434238386404, + "learning_rate": 8.397855711868073e-05, + "loss": 2.8271, + "step": 21647 + }, + { + "epoch": 1.0078683334497287, + "grad_norm": 0.36152656580251835, + "learning_rate": 8.397656991364922e-05, + "loss": 2.856, + "step": 21648 + }, + { + "epoch": 1.0079148916358218, + "grad_norm": 0.3213719843367126, + "learning_rate": 8.397458260889984e-05, + "loss": 2.8333, + "step": 21649 + }, + { + "epoch": 1.007961449821915, + "grad_norm": 0.34134969800387116, + "learning_rate": 8.39725952044384e-05, + "loss": 2.8446, + "step": 21650 + }, + { + "epoch": 1.008008008008008, + "grad_norm": 0.34802061154871855, + "learning_rate": 8.397060770027074e-05, + "loss": 2.7943, + "step": 21651 + }, + { + "epoch": 1.0080545661941012, + "grad_norm": 0.3197311328331022, + "learning_rate": 8.39686200964027e-05, + "loss": 2.8131, + "step": 21652 + }, + { + "epoch": 1.008101124380194, + "grad_norm": 0.3140103820001245, + "learning_rate": 8.396663239284008e-05, + "loss": 2.8324, + "step": 21653 + }, + { + "epoch": 1.0081476825662872, + "grad_norm": 0.30551981887761653, + "learning_rate": 8.396464458958877e-05, + "loss": 2.8827, + "step": 21654 + }, + { + "epoch": 1.0081942407523803, + "grad_norm": 0.32130417986517096, + "learning_rate": 8.396265668665456e-05, + "loss": 2.7767, + "step": 21655 + }, + { + "epoch": 1.0082407989384734, + "grad_norm": 0.31810768310769155, + "learning_rate": 8.39606686840433e-05, + "loss": 2.8118, + "step": 21656 + }, + { + "epoch": 1.0082873571245665, + "grad_norm": 0.3067017858698938, + "learning_rate": 8.395868058176083e-05, + "loss": 2.7564, + "step": 21657 + }, + { + "epoch": 1.0083339153106594, + "grad_norm": 0.3522641357786937, + "learning_rate": 8.395669237981297e-05, + "loss": 2.8272, + "step": 21658 + }, + { + "epoch": 1.0083804734967525, + "grad_norm": 0.306692419842018, + "learning_rate": 8.395470407820559e-05, + "loss": 2.9522, + "step": 21659 + }, + { + "epoch": 1.0084270316828456, + "grad_norm": 0.31294946097867377, + "learning_rate": 8.395271567694446e-05, + "loss": 2.8378, + "step": 21660 + }, + { + "epoch": 1.0084735898689388, + "grad_norm": 0.3449064318743026, + "learning_rate": 8.395072717603547e-05, + "loss": 2.9202, + "step": 21661 + }, + { + "epoch": 1.0085201480550319, + "grad_norm": 0.34300660443861436, + "learning_rate": 8.394873857548443e-05, + "loss": 2.6971, + "step": 21662 + }, + { + "epoch": 1.0085667062411248, + "grad_norm": 0.32277529074206757, + "learning_rate": 8.39467498752972e-05, + "loss": 3.0497, + "step": 21663 + }, + { + "epoch": 1.0086132644272179, + "grad_norm": 0.3565337142133573, + "learning_rate": 8.394476107547961e-05, + "loss": 2.8129, + "step": 21664 + }, + { + "epoch": 1.008659822613311, + "grad_norm": 0.3327425571594077, + "learning_rate": 8.394277217603747e-05, + "loss": 2.9176, 
+ "step": 21665 + }, + { + "epoch": 1.0087063807994041, + "grad_norm": 0.3765698953851909, + "learning_rate": 8.394078317697664e-05, + "loss": 2.7683, + "step": 21666 + }, + { + "epoch": 1.0087529389854972, + "grad_norm": 0.3108082896114925, + "learning_rate": 8.393879407830297e-05, + "loss": 2.813, + "step": 21667 + }, + { + "epoch": 1.0087994971715901, + "grad_norm": 0.35782228228465596, + "learning_rate": 8.39368048800223e-05, + "loss": 2.7771, + "step": 21668 + }, + { + "epoch": 1.0088460553576832, + "grad_norm": 0.3244455033947439, + "learning_rate": 8.39348155821404e-05, + "loss": 2.8863, + "step": 21669 + }, + { + "epoch": 1.0088926135437764, + "grad_norm": 0.33963389460426824, + "learning_rate": 8.39328261846632e-05, + "loss": 2.8827, + "step": 21670 + }, + { + "epoch": 1.0089391717298695, + "grad_norm": 0.3366802158391251, + "learning_rate": 8.393083668759648e-05, + "loss": 2.8113, + "step": 21671 + }, + { + "epoch": 1.0089857299159626, + "grad_norm": 0.3360498575004068, + "learning_rate": 8.392884709094611e-05, + "loss": 2.9079, + "step": 21672 + }, + { + "epoch": 1.0090322881020555, + "grad_norm": 0.3607815315103188, + "learning_rate": 8.392685739471791e-05, + "loss": 2.7943, + "step": 21673 + }, + { + "epoch": 1.0090788462881486, + "grad_norm": 0.3174730116829441, + "learning_rate": 8.392486759891771e-05, + "loss": 2.7973, + "step": 21674 + }, + { + "epoch": 1.0091254044742417, + "grad_norm": 0.3641811482924554, + "learning_rate": 8.392287770355139e-05, + "loss": 2.8848, + "step": 21675 + }, + { + "epoch": 1.0091719626603348, + "grad_norm": 0.3231374749274764, + "learning_rate": 8.392088770862476e-05, + "loss": 3.0084, + "step": 21676 + }, + { + "epoch": 1.0092185208464277, + "grad_norm": 0.36957131300874074, + "learning_rate": 8.391889761414365e-05, + "loss": 2.8266, + "step": 21677 + }, + { + "epoch": 1.0092650790325208, + "grad_norm": 0.3398406737491613, + "learning_rate": 8.391690742011393e-05, + "loss": 2.8469, + "step": 21678 + }, + { + "epoch": 1.009311637218614, + "grad_norm": 0.35192040342958064, + "learning_rate": 8.391491712654143e-05, + "loss": 2.9274, + "step": 21679 + }, + { + "epoch": 1.009358195404707, + "grad_norm": 0.319696720783233, + "learning_rate": 8.391292673343198e-05, + "loss": 2.7942, + "step": 21680 + }, + { + "epoch": 1.0094047535908002, + "grad_norm": 0.34013986282587366, + "learning_rate": 8.391093624079145e-05, + "loss": 2.7095, + "step": 21681 + }, + { + "epoch": 1.009451311776893, + "grad_norm": 0.34149257967422647, + "learning_rate": 8.390894564862565e-05, + "loss": 2.9043, + "step": 21682 + }, + { + "epoch": 1.0094978699629862, + "grad_norm": 0.3283835501146894, + "learning_rate": 8.390695495694043e-05, + "loss": 2.8038, + "step": 21683 + }, + { + "epoch": 1.0095444281490793, + "grad_norm": 0.3518357612639494, + "learning_rate": 8.390496416574165e-05, + "loss": 2.8661, + "step": 21684 + }, + { + "epoch": 1.0095909863351724, + "grad_norm": 0.34028070414727235, + "learning_rate": 8.390297327503512e-05, + "loss": 2.8066, + "step": 21685 + }, + { + "epoch": 1.0096375445212655, + "grad_norm": 0.3648321326359781, + "learning_rate": 8.390098228482673e-05, + "loss": 2.8847, + "step": 21686 + }, + { + "epoch": 1.0096841027073584, + "grad_norm": 0.3288277435163114, + "learning_rate": 8.389899119512228e-05, + "loss": 2.8532, + "step": 21687 + }, + { + "epoch": 1.0097306608934515, + "grad_norm": 0.3249398723647331, + "learning_rate": 8.389700000592764e-05, + "loss": 2.8999, + "step": 21688 + }, + { + "epoch": 1.0097772190795447, + "grad_norm": 
0.34055308822078956, + "learning_rate": 8.389500871724864e-05, + "loss": 2.9012, + "step": 21689 + }, + { + "epoch": 1.0098237772656378, + "grad_norm": 0.35396537242672177, + "learning_rate": 8.389301732909112e-05, + "loss": 2.782, + "step": 21690 + }, + { + "epoch": 1.0098703354517309, + "grad_norm": 0.32766819493815247, + "learning_rate": 8.389102584146094e-05, + "loss": 2.9203, + "step": 21691 + }, + { + "epoch": 1.0099168936378238, + "grad_norm": 0.34456308687925125, + "learning_rate": 8.388903425436394e-05, + "loss": 2.8033, + "step": 21692 + }, + { + "epoch": 1.009963451823917, + "grad_norm": 0.339156500454289, + "learning_rate": 8.388704256780595e-05, + "loss": 2.951, + "step": 21693 + }, + { + "epoch": 1.01001001001001, + "grad_norm": 0.3398308226104027, + "learning_rate": 8.388505078179285e-05, + "loss": 2.8145, + "step": 21694 + }, + { + "epoch": 1.0100565681961031, + "grad_norm": 0.3524092632370109, + "learning_rate": 8.388305889633046e-05, + "loss": 2.7209, + "step": 21695 + }, + { + "epoch": 1.0101031263821962, + "grad_norm": 0.3497590028721872, + "learning_rate": 8.388106691142462e-05, + "loss": 2.8411, + "step": 21696 + }, + { + "epoch": 1.0101496845682891, + "grad_norm": 0.3460547540752001, + "learning_rate": 8.387907482708119e-05, + "loss": 2.8396, + "step": 21697 + }, + { + "epoch": 1.0101962427543822, + "grad_norm": 0.3301233167356336, + "learning_rate": 8.387708264330601e-05, + "loss": 2.8295, + "step": 21698 + }, + { + "epoch": 1.0102428009404754, + "grad_norm": 0.3426596938615317, + "learning_rate": 8.387509036010494e-05, + "loss": 2.8445, + "step": 21699 + }, + { + "epoch": 1.0102893591265685, + "grad_norm": 0.34130723003584185, + "learning_rate": 8.387309797748382e-05, + "loss": 2.9285, + "step": 21700 + }, + { + "epoch": 1.0103359173126616, + "grad_norm": 0.3940321881640394, + "learning_rate": 8.387110549544847e-05, + "loss": 2.8614, + "step": 21701 + }, + { + "epoch": 1.0103824754987545, + "grad_norm": 0.357776772574222, + "learning_rate": 8.386911291400479e-05, + "loss": 2.8007, + "step": 21702 + }, + { + "epoch": 1.0104290336848476, + "grad_norm": 0.34629510524190726, + "learning_rate": 8.386712023315858e-05, + "loss": 2.8752, + "step": 21703 + }, + { + "epoch": 1.0104755918709407, + "grad_norm": 0.37235158324591927, + "learning_rate": 8.386512745291573e-05, + "loss": 2.8081, + "step": 21704 + }, + { + "epoch": 1.0105221500570338, + "grad_norm": 0.33486147535855426, + "learning_rate": 8.386313457328205e-05, + "loss": 2.8254, + "step": 21705 + }, + { + "epoch": 1.010568708243127, + "grad_norm": 0.36210244634671535, + "learning_rate": 8.386114159426339e-05, + "loss": 2.9078, + "step": 21706 + }, + { + "epoch": 1.0106152664292198, + "grad_norm": 0.3251279036698758, + "learning_rate": 8.385914851586565e-05, + "loss": 2.8671, + "step": 21707 + }, + { + "epoch": 1.010661824615313, + "grad_norm": 0.3865974527947705, + "learning_rate": 8.385715533809464e-05, + "loss": 2.842, + "step": 21708 + }, + { + "epoch": 1.010708382801406, + "grad_norm": 0.3241507073081863, + "learning_rate": 8.385516206095618e-05, + "loss": 2.8154, + "step": 21709 + }, + { + "epoch": 1.0107549409874992, + "grad_norm": 0.35191026396543934, + "learning_rate": 8.385316868445619e-05, + "loss": 2.8917, + "step": 21710 + }, + { + "epoch": 1.0108014991735923, + "grad_norm": 0.35537682075472793, + "learning_rate": 8.385117520860047e-05, + "loss": 2.7667, + "step": 21711 + }, + { + "epoch": 1.0108480573596852, + "grad_norm": 0.31512307670949347, + "learning_rate": 8.384918163339489e-05, + "loss": 2.8038, 
+ "step": 21712 + }, + { + "epoch": 1.0108946155457783, + "grad_norm": 0.36526323931345756, + "learning_rate": 8.384718795884529e-05, + "loss": 2.8806, + "step": 21713 + }, + { + "epoch": 1.0109411737318714, + "grad_norm": 0.33498478906150503, + "learning_rate": 8.384519418495754e-05, + "loss": 2.8949, + "step": 21714 + }, + { + "epoch": 1.0109877319179645, + "grad_norm": 0.35546740995002496, + "learning_rate": 8.384320031173748e-05, + "loss": 2.7828, + "step": 21715 + }, + { + "epoch": 1.0110342901040577, + "grad_norm": 0.34262580507641777, + "learning_rate": 8.384120633919094e-05, + "loss": 2.8709, + "step": 21716 + }, + { + "epoch": 1.0110808482901505, + "grad_norm": 0.3253330773364867, + "learning_rate": 8.383921226732383e-05, + "loss": 2.8476, + "step": 21717 + }, + { + "epoch": 1.0111274064762437, + "grad_norm": 0.3801828490881513, + "learning_rate": 8.383721809614194e-05, + "loss": 2.8581, + "step": 21718 + }, + { + "epoch": 1.0111739646623368, + "grad_norm": 0.3047850602663275, + "learning_rate": 8.383522382565114e-05, + "loss": 2.8392, + "step": 21719 + }, + { + "epoch": 1.01122052284843, + "grad_norm": 0.3800750126224487, + "learning_rate": 8.38332294558573e-05, + "loss": 2.8712, + "step": 21720 + }, + { + "epoch": 1.0112670810345228, + "grad_norm": 0.3531762767793229, + "learning_rate": 8.383123498676628e-05, + "loss": 2.9056, + "step": 21721 + }, + { + "epoch": 1.011313639220616, + "grad_norm": 0.40759333480538207, + "learning_rate": 8.38292404183839e-05, + "loss": 2.8282, + "step": 21722 + }, + { + "epoch": 1.011360197406709, + "grad_norm": 0.359531434222951, + "learning_rate": 8.382724575071604e-05, + "loss": 2.9914, + "step": 21723 + }, + { + "epoch": 1.0114067555928021, + "grad_norm": 0.37257412034117043, + "learning_rate": 8.382525098376855e-05, + "loss": 2.824, + "step": 21724 + }, + { + "epoch": 1.0114533137788952, + "grad_norm": 0.3888897051668246, + "learning_rate": 8.382325611754727e-05, + "loss": 2.9549, + "step": 21725 + }, + { + "epoch": 1.0114998719649881, + "grad_norm": 0.3703132084215187, + "learning_rate": 8.382126115205808e-05, + "loss": 2.8882, + "step": 21726 + }, + { + "epoch": 1.0115464301510813, + "grad_norm": 0.37822907710341286, + "learning_rate": 8.381926608730682e-05, + "loss": 2.8538, + "step": 21727 + }, + { + "epoch": 1.0115929883371744, + "grad_norm": 0.33090173125846795, + "learning_rate": 8.381727092329931e-05, + "loss": 2.9083, + "step": 21728 + }, + { + "epoch": 1.0116395465232675, + "grad_norm": 0.3746022210650675, + "learning_rate": 8.381527566004147e-05, + "loss": 2.8028, + "step": 21729 + }, + { + "epoch": 1.0116861047093606, + "grad_norm": 0.3440169917177196, + "learning_rate": 8.381328029753915e-05, + "loss": 2.8311, + "step": 21730 + }, + { + "epoch": 1.0117326628954535, + "grad_norm": 0.3581229502720108, + "learning_rate": 8.381128483579814e-05, + "loss": 2.8305, + "step": 21731 + }, + { + "epoch": 1.0117792210815466, + "grad_norm": 0.3136551861964617, + "learning_rate": 8.380928927482436e-05, + "loss": 2.7919, + "step": 21732 + }, + { + "epoch": 1.0118257792676397, + "grad_norm": 0.37539391713939535, + "learning_rate": 8.380729361462365e-05, + "loss": 2.8358, + "step": 21733 + }, + { + "epoch": 1.0118723374537328, + "grad_norm": 0.33966578777527007, + "learning_rate": 8.380529785520186e-05, + "loss": 2.9306, + "step": 21734 + }, + { + "epoch": 1.011918895639826, + "grad_norm": 0.3687176304718955, + "learning_rate": 8.380330199656485e-05, + "loss": 2.8093, + "step": 21735 + }, + { + "epoch": 1.0119654538259188, + "grad_norm": 
0.3430363746202916, + "learning_rate": 8.380130603871849e-05, + "loss": 2.8566, + "step": 21736 + }, + { + "epoch": 1.012012012012012, + "grad_norm": 0.33386918614926375, + "learning_rate": 8.37993099816686e-05, + "loss": 2.8403, + "step": 21737 + }, + { + "epoch": 1.012058570198105, + "grad_norm": 0.335634186800074, + "learning_rate": 8.379731382542109e-05, + "loss": 2.8335, + "step": 21738 + }, + { + "epoch": 1.0121051283841982, + "grad_norm": 0.3491644172133998, + "learning_rate": 8.379531756998178e-05, + "loss": 2.9049, + "step": 21739 + }, + { + "epoch": 1.0121516865702913, + "grad_norm": 0.3279885623525553, + "learning_rate": 8.379332121535656e-05, + "loss": 2.864, + "step": 21740 + }, + { + "epoch": 1.0121982447563842, + "grad_norm": 0.3421446599875431, + "learning_rate": 8.379132476155125e-05, + "loss": 2.8438, + "step": 21741 + }, + { + "epoch": 1.0122448029424773, + "grad_norm": 0.33344553874806093, + "learning_rate": 8.378932820857175e-05, + "loss": 2.7945, + "step": 21742 + }, + { + "epoch": 1.0122913611285704, + "grad_norm": 0.3232698419190119, + "learning_rate": 8.378733155642389e-05, + "loss": 2.8497, + "step": 21743 + }, + { + "epoch": 1.0123379193146635, + "grad_norm": 0.34239291569667757, + "learning_rate": 8.378533480511355e-05, + "loss": 2.8821, + "step": 21744 + }, + { + "epoch": 1.0123844775007567, + "grad_norm": 0.3199130491444208, + "learning_rate": 8.378333795464656e-05, + "loss": 2.83, + "step": 21745 + }, + { + "epoch": 1.0124310356868496, + "grad_norm": 0.3355880680332376, + "learning_rate": 8.378134100502881e-05, + "loss": 2.8853, + "step": 21746 + }, + { + "epoch": 1.0124775938729427, + "grad_norm": 0.3369940002613593, + "learning_rate": 8.377934395626615e-05, + "loss": 2.7652, + "step": 21747 + }, + { + "epoch": 1.0125241520590358, + "grad_norm": 0.33862498816991465, + "learning_rate": 8.377734680836444e-05, + "loss": 2.796, + "step": 21748 + }, + { + "epoch": 1.012570710245129, + "grad_norm": 0.3132911910223228, + "learning_rate": 8.377534956132958e-05, + "loss": 2.9006, + "step": 21749 + }, + { + "epoch": 1.012617268431222, + "grad_norm": 0.36722984631350025, + "learning_rate": 8.377335221516735e-05, + "loss": 2.8336, + "step": 21750 + }, + { + "epoch": 1.012663826617315, + "grad_norm": 0.3008067716225297, + "learning_rate": 8.377135476988369e-05, + "loss": 2.856, + "step": 21751 + }, + { + "epoch": 1.012710384803408, + "grad_norm": 0.36338638724327094, + "learning_rate": 8.37693572254844e-05, + "loss": 2.8001, + "step": 21752 + }, + { + "epoch": 1.0127569429895011, + "grad_norm": 0.33139444253282685, + "learning_rate": 8.376735958197541e-05, + "loss": 2.8184, + "step": 21753 + }, + { + "epoch": 1.0128035011755943, + "grad_norm": 0.38611589395839335, + "learning_rate": 8.376536183936252e-05, + "loss": 2.8639, + "step": 21754 + }, + { + "epoch": 1.0128500593616874, + "grad_norm": 0.3221132833578756, + "learning_rate": 8.376336399765163e-05, + "loss": 2.7158, + "step": 21755 + }, + { + "epoch": 1.0128966175477803, + "grad_norm": 0.33968504643475755, + "learning_rate": 8.376136605684858e-05, + "loss": 2.7432, + "step": 21756 + }, + { + "epoch": 1.0129431757338734, + "grad_norm": 0.3320739909539841, + "learning_rate": 8.375936801695927e-05, + "loss": 2.7666, + "step": 21757 + }, + { + "epoch": 1.0129897339199665, + "grad_norm": 0.3541169977784638, + "learning_rate": 8.375736987798953e-05, + "loss": 2.8412, + "step": 21758 + }, + { + "epoch": 1.0130362921060596, + "grad_norm": 0.35901155977017896, + "learning_rate": 8.375537163994522e-05, + "loss": 2.8381, + 
"step": 21759 + }, + { + "epoch": 1.0130828502921527, + "grad_norm": 0.37760176754906694, + "learning_rate": 8.375337330283223e-05, + "loss": 2.7964, + "step": 21760 + }, + { + "epoch": 1.0131294084782456, + "grad_norm": 0.35418081197429063, + "learning_rate": 8.375137486665643e-05, + "loss": 2.6863, + "step": 21761 + }, + { + "epoch": 1.0131759666643387, + "grad_norm": 0.33557124405016203, + "learning_rate": 8.374937633142365e-05, + "loss": 2.7583, + "step": 21762 + }, + { + "epoch": 1.0132225248504318, + "grad_norm": 0.35372817406972823, + "learning_rate": 8.37473776971398e-05, + "loss": 2.8233, + "step": 21763 + }, + { + "epoch": 1.013269083036525, + "grad_norm": 0.329342814544086, + "learning_rate": 8.37453789638107e-05, + "loss": 2.8901, + "step": 21764 + }, + { + "epoch": 1.0133156412226179, + "grad_norm": 0.37408093823860694, + "learning_rate": 8.374338013144223e-05, + "loss": 2.894, + "step": 21765 + }, + { + "epoch": 1.013362199408711, + "grad_norm": 0.3052861991198151, + "learning_rate": 8.374138120004028e-05, + "loss": 2.8453, + "step": 21766 + }, + { + "epoch": 1.013408757594804, + "grad_norm": 0.34313531345865017, + "learning_rate": 8.37393821696107e-05, + "loss": 2.7696, + "step": 21767 + }, + { + "epoch": 1.0134553157808972, + "grad_norm": 0.3220152242483597, + "learning_rate": 8.373738304015934e-05, + "loss": 2.8813, + "step": 21768 + }, + { + "epoch": 1.0135018739669903, + "grad_norm": 0.3217940800434388, + "learning_rate": 8.37353838116921e-05, + "loss": 2.8804, + "step": 21769 + }, + { + "epoch": 1.0135484321530832, + "grad_norm": 0.3449508107098927, + "learning_rate": 8.373338448421483e-05, + "loss": 2.889, + "step": 21770 + }, + { + "epoch": 1.0135949903391763, + "grad_norm": 0.35386311938276327, + "learning_rate": 8.373138505773341e-05, + "loss": 2.9293, + "step": 21771 + }, + { + "epoch": 1.0136415485252694, + "grad_norm": 0.31933471969047283, + "learning_rate": 8.372938553225367e-05, + "loss": 2.8022, + "step": 21772 + }, + { + "epoch": 1.0136881067113626, + "grad_norm": 0.35508404457271714, + "learning_rate": 8.372738590778153e-05, + "loss": 2.8973, + "step": 21773 + }, + { + "epoch": 1.0137346648974557, + "grad_norm": 0.33023028634387, + "learning_rate": 8.372538618432283e-05, + "loss": 2.8618, + "step": 21774 + }, + { + "epoch": 1.0137812230835486, + "grad_norm": 0.3361547881715359, + "learning_rate": 8.372338636188344e-05, + "loss": 2.8144, + "step": 21775 + }, + { + "epoch": 1.0138277812696417, + "grad_norm": 0.32567656489425656, + "learning_rate": 8.372138644046923e-05, + "loss": 2.8304, + "step": 21776 + }, + { + "epoch": 1.0138743394557348, + "grad_norm": 0.3426481694435497, + "learning_rate": 8.371938642008608e-05, + "loss": 2.8977, + "step": 21777 + }, + { + "epoch": 1.013920897641828, + "grad_norm": 0.3281813889132848, + "learning_rate": 8.371738630073985e-05, + "loss": 2.7831, + "step": 21778 + }, + { + "epoch": 1.013967455827921, + "grad_norm": 0.3726141386261468, + "learning_rate": 8.371538608243641e-05, + "loss": 2.8535, + "step": 21779 + }, + { + "epoch": 1.014014014014014, + "grad_norm": 0.34129119250515894, + "learning_rate": 8.371338576518164e-05, + "loss": 2.9132, + "step": 21780 + }, + { + "epoch": 1.014060572200107, + "grad_norm": 0.3438448759773361, + "learning_rate": 8.371138534898141e-05, + "loss": 2.8442, + "step": 21781 + }, + { + "epoch": 1.0141071303862002, + "grad_norm": 0.3449753565619682, + "learning_rate": 8.370938483384156e-05, + "loss": 2.7932, + "step": 21782 + }, + { + "epoch": 1.0141536885722933, + "grad_norm": 
0.3212584427929666, + "learning_rate": 8.370738421976801e-05, + "loss": 2.7209, + "step": 21783 + }, + { + "epoch": 1.0142002467583864, + "grad_norm": 0.37766966160958004, + "learning_rate": 8.37053835067666e-05, + "loss": 2.8457, + "step": 21784 + }, + { + "epoch": 1.0142468049444793, + "grad_norm": 0.39037180360996615, + "learning_rate": 8.370338269484322e-05, + "loss": 2.818, + "step": 21785 + }, + { + "epoch": 1.0142933631305724, + "grad_norm": 0.3361662306506184, + "learning_rate": 8.370138178400373e-05, + "loss": 2.9269, + "step": 21786 + }, + { + "epoch": 1.0143399213166655, + "grad_norm": 0.33021099324435677, + "learning_rate": 8.369938077425399e-05, + "loss": 2.8356, + "step": 21787 + }, + { + "epoch": 1.0143864795027586, + "grad_norm": 0.41299287544547264, + "learning_rate": 8.36973796655999e-05, + "loss": 2.8431, + "step": 21788 + }, + { + "epoch": 1.0144330376888517, + "grad_norm": 0.33397364198934226, + "learning_rate": 8.369537845804731e-05, + "loss": 2.9522, + "step": 21789 + }, + { + "epoch": 1.0144795958749446, + "grad_norm": 0.3560640041637921, + "learning_rate": 8.36933771516021e-05, + "loss": 2.9444, + "step": 21790 + }, + { + "epoch": 1.0145261540610377, + "grad_norm": 0.3324246405035992, + "learning_rate": 8.369137574627017e-05, + "loss": 2.7622, + "step": 21791 + }, + { + "epoch": 1.0145727122471309, + "grad_norm": 0.36908812821083586, + "learning_rate": 8.368937424205735e-05, + "loss": 2.7549, + "step": 21792 + }, + { + "epoch": 1.014619270433224, + "grad_norm": 0.3096159034646269, + "learning_rate": 8.368737263896954e-05, + "loss": 2.8457, + "step": 21793 + }, + { + "epoch": 1.014665828619317, + "grad_norm": 0.3456700612462519, + "learning_rate": 8.368537093701261e-05, + "loss": 2.8824, + "step": 21794 + }, + { + "epoch": 1.01471238680541, + "grad_norm": 0.3418890129672616, + "learning_rate": 8.368336913619244e-05, + "loss": 2.835, + "step": 21795 + }, + { + "epoch": 1.014758944991503, + "grad_norm": 0.3095642123740878, + "learning_rate": 8.36813672365149e-05, + "loss": 2.8416, + "step": 21796 + }, + { + "epoch": 1.0148055031775962, + "grad_norm": 0.3278799363324566, + "learning_rate": 8.367936523798586e-05, + "loss": 2.8111, + "step": 21797 + }, + { + "epoch": 1.0148520613636893, + "grad_norm": 0.30840753759453965, + "learning_rate": 8.367736314061121e-05, + "loss": 2.8073, + "step": 21798 + }, + { + "epoch": 1.0148986195497824, + "grad_norm": 0.36697937365742805, + "learning_rate": 8.367536094439682e-05, + "loss": 2.8342, + "step": 21799 + }, + { + "epoch": 1.0149451777358753, + "grad_norm": 0.3513935129447986, + "learning_rate": 8.367335864934855e-05, + "loss": 2.7877, + "step": 21800 + }, + { + "epoch": 1.0149917359219685, + "grad_norm": 0.3799833238383188, + "learning_rate": 8.367135625547229e-05, + "loss": 2.871, + "step": 21801 + }, + { + "epoch": 1.0150382941080616, + "grad_norm": 0.3752333348241835, + "learning_rate": 8.366935376277392e-05, + "loss": 2.8394, + "step": 21802 + }, + { + "epoch": 1.0150848522941547, + "grad_norm": 0.3542588534103447, + "learning_rate": 8.366735117125932e-05, + "loss": 2.8758, + "step": 21803 + }, + { + "epoch": 1.0151314104802478, + "grad_norm": 0.3776747657671918, + "learning_rate": 8.366534848093434e-05, + "loss": 2.8563, + "step": 21804 + }, + { + "epoch": 1.0151779686663407, + "grad_norm": 0.32615255935241644, + "learning_rate": 8.366334569180492e-05, + "loss": 2.7909, + "step": 21805 + }, + { + "epoch": 1.0152245268524338, + "grad_norm": 0.386552445495569, + "learning_rate": 8.366134280387685e-05, + "loss": 2.9116, + 
"step": 21806 + }, + { + "epoch": 1.015271085038527, + "grad_norm": 0.34660125891751303, + "learning_rate": 8.36593398171561e-05, + "loss": 2.7392, + "step": 21807 + }, + { + "epoch": 1.01531764322462, + "grad_norm": 0.37357342017428125, + "learning_rate": 8.365733673164847e-05, + "loss": 2.8371, + "step": 21808 + }, + { + "epoch": 1.015364201410713, + "grad_norm": 0.355170331415631, + "learning_rate": 8.365533354735988e-05, + "loss": 2.8098, + "step": 21809 + }, + { + "epoch": 1.015410759596806, + "grad_norm": 0.34646318780388985, + "learning_rate": 8.36533302642962e-05, + "loss": 2.8413, + "step": 21810 + }, + { + "epoch": 1.0154573177828992, + "grad_norm": 0.3304700944402215, + "learning_rate": 8.36513268824633e-05, + "loss": 2.8088, + "step": 21811 + }, + { + "epoch": 1.0155038759689923, + "grad_norm": 0.3374412007292585, + "learning_rate": 8.36493234018671e-05, + "loss": 2.8071, + "step": 21812 + }, + { + "epoch": 1.0155504341550854, + "grad_norm": 0.3319015403293378, + "learning_rate": 8.364731982251344e-05, + "loss": 2.7927, + "step": 21813 + }, + { + "epoch": 1.0155969923411783, + "grad_norm": 0.3549032815329995, + "learning_rate": 8.364531614440821e-05, + "loss": 2.829, + "step": 21814 + }, + { + "epoch": 1.0156435505272714, + "grad_norm": 0.3332081496881611, + "learning_rate": 8.364331236755729e-05, + "loss": 2.8748, + "step": 21815 + }, + { + "epoch": 1.0156901087133645, + "grad_norm": 0.34231980475810725, + "learning_rate": 8.364130849196656e-05, + "loss": 2.8268, + "step": 21816 + }, + { + "epoch": 1.0157366668994576, + "grad_norm": 0.3534467150512264, + "learning_rate": 8.36393045176419e-05, + "loss": 2.8635, + "step": 21817 + }, + { + "epoch": 1.0157832250855507, + "grad_norm": 0.319309479712095, + "learning_rate": 8.363730044458922e-05, + "loss": 2.8397, + "step": 21818 + }, + { + "epoch": 1.0158297832716436, + "grad_norm": 0.33591567211026885, + "learning_rate": 8.363529627281434e-05, + "loss": 2.896, + "step": 21819 + }, + { + "epoch": 1.0158763414577368, + "grad_norm": 0.349643545665307, + "learning_rate": 8.36332920023232e-05, + "loss": 2.863, + "step": 21820 + }, + { + "epoch": 1.0159228996438299, + "grad_norm": 0.33722932813603446, + "learning_rate": 8.363128763312167e-05, + "loss": 2.9234, + "step": 21821 + }, + { + "epoch": 1.015969457829923, + "grad_norm": 0.3871324336449725, + "learning_rate": 8.362928316521561e-05, + "loss": 2.8937, + "step": 21822 + }, + { + "epoch": 1.016016016016016, + "grad_norm": 0.3143684138462887, + "learning_rate": 8.362727859861091e-05, + "loss": 2.8533, + "step": 21823 + }, + { + "epoch": 1.016062574202109, + "grad_norm": 0.3862994415140265, + "learning_rate": 8.362527393331348e-05, + "loss": 2.919, + "step": 21824 + }, + { + "epoch": 1.016109132388202, + "grad_norm": 0.32641551985799727, + "learning_rate": 8.362326916932918e-05, + "loss": 2.8986, + "step": 21825 + }, + { + "epoch": 1.0161556905742952, + "grad_norm": 0.3580586706261374, + "learning_rate": 8.362126430666388e-05, + "loss": 2.7373, + "step": 21826 + }, + { + "epoch": 1.0162022487603883, + "grad_norm": 0.31747078752907054, + "learning_rate": 8.36192593453235e-05, + "loss": 2.8833, + "step": 21827 + }, + { + "epoch": 1.0162488069464815, + "grad_norm": 0.3444490184538604, + "learning_rate": 8.361725428531388e-05, + "loss": 2.8532, + "step": 21828 + }, + { + "epoch": 1.0162953651325743, + "grad_norm": 0.2997969025767597, + "learning_rate": 8.361524912664097e-05, + "loss": 2.817, + "step": 21829 + }, + { + "epoch": 1.0163419233186675, + "grad_norm": 0.33707475848621327, + 
"learning_rate": 8.361324386931058e-05, + "loss": 2.793, + "step": 21830 + }, + { + "epoch": 1.0163884815047606, + "grad_norm": 0.31529015775114894, + "learning_rate": 8.361123851332866e-05, + "loss": 2.8404, + "step": 21831 + }, + { + "epoch": 1.0164350396908537, + "grad_norm": 0.31265197805407907, + "learning_rate": 8.360923305870106e-05, + "loss": 2.8814, + "step": 21832 + }, + { + "epoch": 1.0164815978769468, + "grad_norm": 0.3219973975801322, + "learning_rate": 8.360722750543364e-05, + "loss": 2.8058, + "step": 21833 + }, + { + "epoch": 1.0165281560630397, + "grad_norm": 0.30558422164714205, + "learning_rate": 8.360522185353234e-05, + "loss": 2.7336, + "step": 21834 + }, + { + "epoch": 1.0165747142491328, + "grad_norm": 0.31072199520193067, + "learning_rate": 8.360321610300304e-05, + "loss": 2.9475, + "step": 21835 + }, + { + "epoch": 1.016621272435226, + "grad_norm": 0.33695283816213456, + "learning_rate": 8.360121025385159e-05, + "loss": 2.8604, + "step": 21836 + }, + { + "epoch": 1.016667830621319, + "grad_norm": 0.31879661602109727, + "learning_rate": 8.359920430608392e-05, + "loss": 2.824, + "step": 21837 + }, + { + "epoch": 1.0167143888074122, + "grad_norm": 0.31588260362737125, + "learning_rate": 8.359719825970587e-05, + "loss": 2.9159, + "step": 21838 + }, + { + "epoch": 1.016760946993505, + "grad_norm": 0.3300766897488729, + "learning_rate": 8.359519211472337e-05, + "loss": 2.9634, + "step": 21839 + }, + { + "epoch": 1.0168075051795982, + "grad_norm": 0.33579086092854105, + "learning_rate": 8.359318587114229e-05, + "loss": 2.8944, + "step": 21840 + }, + { + "epoch": 1.0168540633656913, + "grad_norm": 0.33747108685646743, + "learning_rate": 8.35911795289685e-05, + "loss": 2.9048, + "step": 21841 + }, + { + "epoch": 1.0169006215517844, + "grad_norm": 0.3421352673451309, + "learning_rate": 8.358917308820793e-05, + "loss": 2.865, + "step": 21842 + }, + { + "epoch": 1.0169471797378775, + "grad_norm": 0.3714305678967251, + "learning_rate": 8.358716654886643e-05, + "loss": 2.872, + "step": 21843 + }, + { + "epoch": 1.0169937379239704, + "grad_norm": 0.38989010805491114, + "learning_rate": 8.358515991094991e-05, + "loss": 2.8475, + "step": 21844 + }, + { + "epoch": 1.0170402961100635, + "grad_norm": 0.34827249520160747, + "learning_rate": 8.358315317446426e-05, + "loss": 2.9284, + "step": 21845 + }, + { + "epoch": 1.0170868542961566, + "grad_norm": 0.3380219904134286, + "learning_rate": 8.358114633941536e-05, + "loss": 2.8292, + "step": 21846 + }, + { + "epoch": 1.0171334124822498, + "grad_norm": 0.33349181841409065, + "learning_rate": 8.35791394058091e-05, + "loss": 2.82, + "step": 21847 + }, + { + "epoch": 1.0171799706683429, + "grad_norm": 0.36063149227655183, + "learning_rate": 8.357713237365139e-05, + "loss": 2.7774, + "step": 21848 + }, + { + "epoch": 1.0172265288544358, + "grad_norm": 0.35730032823774144, + "learning_rate": 8.357512524294808e-05, + "loss": 2.7924, + "step": 21849 + }, + { + "epoch": 1.0172730870405289, + "grad_norm": 0.36259906482286425, + "learning_rate": 8.357311801370511e-05, + "loss": 2.7436, + "step": 21850 + }, + { + "epoch": 1.017319645226622, + "grad_norm": 0.3978451958530111, + "learning_rate": 8.357111068592832e-05, + "loss": 2.9065, + "step": 21851 + }, + { + "epoch": 1.017366203412715, + "grad_norm": 0.3329714601231351, + "learning_rate": 8.356910325962366e-05, + "loss": 2.7769, + "step": 21852 + }, + { + "epoch": 1.017412761598808, + "grad_norm": 0.38392705621747986, + "learning_rate": 8.356709573479696e-05, + "loss": 2.8904, + "step": 21853 + }, 
+ { + "epoch": 1.0174593197849011, + "grad_norm": 0.34415823852764715, + "learning_rate": 8.356508811145414e-05, + "loss": 2.7596, + "step": 21854 + }, + { + "epoch": 1.0175058779709942, + "grad_norm": 0.35231714737162095, + "learning_rate": 8.356308038960112e-05, + "loss": 2.981, + "step": 21855 + }, + { + "epoch": 1.0175524361570873, + "grad_norm": 0.3928893008252827, + "learning_rate": 8.356107256924375e-05, + "loss": 2.893, + "step": 21856 + }, + { + "epoch": 1.0175989943431805, + "grad_norm": 0.3311708868956781, + "learning_rate": 8.355906465038794e-05, + "loss": 2.8137, + "step": 21857 + }, + { + "epoch": 1.0176455525292734, + "grad_norm": 0.36287925206691857, + "learning_rate": 8.355705663303957e-05, + "loss": 2.8034, + "step": 21858 + }, + { + "epoch": 1.0176921107153665, + "grad_norm": 0.3453928742177841, + "learning_rate": 8.355504851720455e-05, + "loss": 2.8304, + "step": 21859 + }, + { + "epoch": 1.0177386689014596, + "grad_norm": 0.379917669185934, + "learning_rate": 8.355304030288877e-05, + "loss": 2.6916, + "step": 21860 + }, + { + "epoch": 1.0177852270875527, + "grad_norm": 0.3338569163234431, + "learning_rate": 8.355103199009812e-05, + "loss": 2.8638, + "step": 21861 + }, + { + "epoch": 1.0178317852736458, + "grad_norm": 0.3359382592939474, + "learning_rate": 8.354902357883852e-05, + "loss": 2.7471, + "step": 21862 + }, + { + "epoch": 1.0178783434597387, + "grad_norm": 0.34298815938755167, + "learning_rate": 8.354701506911581e-05, + "loss": 2.8749, + "step": 21863 + }, + { + "epoch": 1.0179249016458318, + "grad_norm": 0.32633971741948953, + "learning_rate": 8.354500646093593e-05, + "loss": 2.8438, + "step": 21864 + }, + { + "epoch": 1.017971459831925, + "grad_norm": 0.3258419621261484, + "learning_rate": 8.354299775430476e-05, + "loss": 2.762, + "step": 21865 + }, + { + "epoch": 1.018018018018018, + "grad_norm": 0.31516578602682915, + "learning_rate": 8.354098894922819e-05, + "loss": 2.7314, + "step": 21866 + }, + { + "epoch": 1.0180645762041112, + "grad_norm": 0.35522054353881993, + "learning_rate": 8.353898004571213e-05, + "loss": 2.8463, + "step": 21867 + }, + { + "epoch": 1.018111134390204, + "grad_norm": 0.30604291698246894, + "learning_rate": 8.353697104376247e-05, + "loss": 2.8072, + "step": 21868 + }, + { + "epoch": 1.0181576925762972, + "grad_norm": 0.32906254613741176, + "learning_rate": 8.35349619433851e-05, + "loss": 2.9025, + "step": 21869 + }, + { + "epoch": 1.0182042507623903, + "grad_norm": 0.3285169540229122, + "learning_rate": 8.353295274458593e-05, + "loss": 2.8738, + "step": 21870 + }, + { + "epoch": 1.0182508089484834, + "grad_norm": 0.318060399490612, + "learning_rate": 8.353094344737083e-05, + "loss": 2.8029, + "step": 21871 + }, + { + "epoch": 1.0182973671345765, + "grad_norm": 0.33025244409064447, + "learning_rate": 8.352893405174572e-05, + "loss": 2.939, + "step": 21872 + }, + { + "epoch": 1.0183439253206694, + "grad_norm": 0.33262601655253154, + "learning_rate": 8.352692455771648e-05, + "loss": 2.8243, + "step": 21873 + }, + { + "epoch": 1.0183904835067625, + "grad_norm": 0.3548042866334724, + "learning_rate": 8.352491496528904e-05, + "loss": 2.9238, + "step": 21874 + }, + { + "epoch": 1.0184370416928556, + "grad_norm": 0.31867804778034675, + "learning_rate": 8.352290527446927e-05, + "loss": 2.8438, + "step": 21875 + }, + { + "epoch": 1.0184835998789488, + "grad_norm": 0.3705373646139808, + "learning_rate": 8.352089548526308e-05, + "loss": 2.7951, + "step": 21876 + }, + { + "epoch": 1.0185301580650419, + "grad_norm": 0.31249655973283885, + 
"learning_rate": 8.351888559767634e-05, + "loss": 2.9486, + "step": 21877 + }, + { + "epoch": 1.0185767162511348, + "grad_norm": 0.3562145123973896, + "learning_rate": 8.351687561171499e-05, + "loss": 2.9117, + "step": 21878 + }, + { + "epoch": 1.0186232744372279, + "grad_norm": 0.33514878255936886, + "learning_rate": 8.351486552738493e-05, + "loss": 2.7782, + "step": 21879 + }, + { + "epoch": 1.018669832623321, + "grad_norm": 0.3340222640173923, + "learning_rate": 8.351285534469202e-05, + "loss": 2.7664, + "step": 21880 + }, + { + "epoch": 1.0187163908094141, + "grad_norm": 0.32863770515234625, + "learning_rate": 8.351084506364218e-05, + "loss": 2.7728, + "step": 21881 + }, + { + "epoch": 1.0187629489955072, + "grad_norm": 0.3330286078171439, + "learning_rate": 8.350883468424131e-05, + "loss": 2.9148, + "step": 21882 + }, + { + "epoch": 1.0188095071816001, + "grad_norm": 0.3153877541775562, + "learning_rate": 8.350682420649533e-05, + "loss": 2.7966, + "step": 21883 + }, + { + "epoch": 1.0188560653676932, + "grad_norm": 0.32069455396364577, + "learning_rate": 8.350481363041012e-05, + "loss": 2.8561, + "step": 21884 + }, + { + "epoch": 1.0189026235537864, + "grad_norm": 0.3408724492704194, + "learning_rate": 8.350280295599156e-05, + "loss": 2.9383, + "step": 21885 + }, + { + "epoch": 1.0189491817398795, + "grad_norm": 0.33031106161048845, + "learning_rate": 8.350079218324558e-05, + "loss": 2.8753, + "step": 21886 + }, + { + "epoch": 1.0189957399259726, + "grad_norm": 0.32963666462892655, + "learning_rate": 8.349878131217809e-05, + "loss": 2.8465, + "step": 21887 + }, + { + "epoch": 1.0190422981120655, + "grad_norm": 0.3382270656304605, + "learning_rate": 8.349677034279497e-05, + "loss": 2.9246, + "step": 21888 + }, + { + "epoch": 1.0190888562981586, + "grad_norm": 0.3397021058397973, + "learning_rate": 8.349475927510213e-05, + "loss": 2.7404, + "step": 21889 + }, + { + "epoch": 1.0191354144842517, + "grad_norm": 0.2770364244452568, + "learning_rate": 8.349274810910547e-05, + "loss": 2.7862, + "step": 21890 + }, + { + "epoch": 1.0191819726703448, + "grad_norm": 0.3404910241781182, + "learning_rate": 8.349073684481089e-05, + "loss": 2.8302, + "step": 21891 + }, + { + "epoch": 1.019228530856438, + "grad_norm": 0.301091016944792, + "learning_rate": 8.34887254822243e-05, + "loss": 2.7125, + "step": 21892 + }, + { + "epoch": 1.0192750890425308, + "grad_norm": 0.3642363401721165, + "learning_rate": 8.34867140213516e-05, + "loss": 2.9085, + "step": 21893 + }, + { + "epoch": 1.019321647228624, + "grad_norm": 0.3172652360630751, + "learning_rate": 8.348470246219871e-05, + "loss": 2.8747, + "step": 21894 + }, + { + "epoch": 1.019368205414717, + "grad_norm": 0.35316204570986565, + "learning_rate": 8.348269080477148e-05, + "loss": 2.9157, + "step": 21895 + }, + { + "epoch": 1.0194147636008102, + "grad_norm": 0.32985540754743925, + "learning_rate": 8.348067904907589e-05, + "loss": 2.8301, + "step": 21896 + }, + { + "epoch": 1.019461321786903, + "grad_norm": 0.323309617778043, + "learning_rate": 8.347866719511776e-05, + "loss": 2.8459, + "step": 21897 + }, + { + "epoch": 1.0195078799729962, + "grad_norm": 0.32241247244072796, + "learning_rate": 8.347665524290309e-05, + "loss": 2.8047, + "step": 21898 + }, + { + "epoch": 1.0195544381590893, + "grad_norm": 0.32593153066325475, + "learning_rate": 8.34746431924377e-05, + "loss": 2.9405, + "step": 21899 + }, + { + "epoch": 1.0196009963451824, + "grad_norm": 0.3324661810402436, + "learning_rate": 8.347263104372754e-05, + "loss": 2.8557, + "step": 21900 + }, + { 
+ "epoch": 1.0196475545312755, + "grad_norm": 0.3348686054712129, + "learning_rate": 8.34706187967785e-05, + "loss": 2.8378, + "step": 21901 + }, + { + "epoch": 1.0196941127173684, + "grad_norm": 0.31260258099614874, + "learning_rate": 8.346860645159651e-05, + "loss": 2.8311, + "step": 21902 + }, + { + "epoch": 1.0197406709034615, + "grad_norm": 0.3384645828267204, + "learning_rate": 8.346659400818745e-05, + "loss": 2.6882, + "step": 21903 + }, + { + "epoch": 1.0197872290895547, + "grad_norm": 0.3055919980166471, + "learning_rate": 8.346458146655722e-05, + "loss": 2.6833, + "step": 21904 + }, + { + "epoch": 1.0198337872756478, + "grad_norm": 0.3290166927725903, + "learning_rate": 8.346256882671175e-05, + "loss": 2.9055, + "step": 21905 + }, + { + "epoch": 1.019880345461741, + "grad_norm": 0.3116797817802974, + "learning_rate": 8.346055608865694e-05, + "loss": 2.8097, + "step": 21906 + }, + { + "epoch": 1.0199269036478338, + "grad_norm": 0.32629278983122645, + "learning_rate": 8.34585432523987e-05, + "loss": 2.8824, + "step": 21907 + }, + { + "epoch": 1.019973461833927, + "grad_norm": 0.3295094984167595, + "learning_rate": 8.345653031794292e-05, + "loss": 2.8538, + "step": 21908 + }, + { + "epoch": 1.02002002002002, + "grad_norm": 0.35851912008563863, + "learning_rate": 8.345451728529552e-05, + "loss": 2.8107, + "step": 21909 + }, + { + "epoch": 1.0200665782061131, + "grad_norm": 0.31687400581226377, + "learning_rate": 8.34525041544624e-05, + "loss": 2.7473, + "step": 21910 + }, + { + "epoch": 1.0201131363922062, + "grad_norm": 0.3600145922255489, + "learning_rate": 8.34504909254495e-05, + "loss": 2.8369, + "step": 21911 + }, + { + "epoch": 1.0201596945782991, + "grad_norm": 0.321923880486482, + "learning_rate": 8.344847759826269e-05, + "loss": 2.7441, + "step": 21912 + }, + { + "epoch": 1.0202062527643923, + "grad_norm": 0.37933403251510756, + "learning_rate": 8.344646417290789e-05, + "loss": 2.8423, + "step": 21913 + }, + { + "epoch": 1.0202528109504854, + "grad_norm": 0.35558593011658024, + "learning_rate": 8.344445064939101e-05, + "loss": 2.9419, + "step": 21914 + }, + { + "epoch": 1.0202993691365785, + "grad_norm": 0.35469065297070634, + "learning_rate": 8.344243702771797e-05, + "loss": 2.8747, + "step": 21915 + }, + { + "epoch": 1.0203459273226716, + "grad_norm": 0.35533432016419625, + "learning_rate": 8.344042330789468e-05, + "loss": 2.9112, + "step": 21916 + }, + { + "epoch": 1.0203924855087645, + "grad_norm": 0.3291658825498159, + "learning_rate": 8.343840948992703e-05, + "loss": 2.8443, + "step": 21917 + }, + { + "epoch": 1.0204390436948576, + "grad_norm": 0.33458613953347505, + "learning_rate": 8.343639557382095e-05, + "loss": 2.8462, + "step": 21918 + }, + { + "epoch": 1.0204856018809507, + "grad_norm": 0.36968518497863745, + "learning_rate": 8.343438155958232e-05, + "loss": 2.8793, + "step": 21919 + }, + { + "epoch": 1.0205321600670438, + "grad_norm": 0.3189619068922601, + "learning_rate": 8.343236744721711e-05, + "loss": 2.8052, + "step": 21920 + }, + { + "epoch": 1.020578718253137, + "grad_norm": 0.35764911162812707, + "learning_rate": 8.343035323673117e-05, + "loss": 2.9394, + "step": 21921 + }, + { + "epoch": 1.0206252764392298, + "grad_norm": 0.32860962660853427, + "learning_rate": 8.342833892813045e-05, + "loss": 2.8175, + "step": 21922 + }, + { + "epoch": 1.020671834625323, + "grad_norm": 0.33309942945518894, + "learning_rate": 8.342632452142083e-05, + "loss": 2.8011, + "step": 21923 + }, + { + "epoch": 1.020718392811416, + "grad_norm": 0.346326017236241, + 
"learning_rate": 8.342431001660827e-05, + "loss": 2.7907, + "step": 21924 + }, + { + "epoch": 1.0207649509975092, + "grad_norm": 0.3521820485541113, + "learning_rate": 8.342229541369863e-05, + "loss": 2.8307, + "step": 21925 + }, + { + "epoch": 1.0208115091836023, + "grad_norm": 0.3767991194479533, + "learning_rate": 8.342028071269785e-05, + "loss": 2.8201, + "step": 21926 + }, + { + "epoch": 1.0208580673696952, + "grad_norm": 0.3525666727981052, + "learning_rate": 8.341826591361184e-05, + "loss": 2.8649, + "step": 21927 + }, + { + "epoch": 1.0209046255557883, + "grad_norm": 0.3443841813424099, + "learning_rate": 8.34162510164465e-05, + "loss": 2.855, + "step": 21928 + }, + { + "epoch": 1.0209511837418814, + "grad_norm": 0.3221575328726094, + "learning_rate": 8.341423602120778e-05, + "loss": 2.792, + "step": 21929 + }, + { + "epoch": 1.0209977419279745, + "grad_norm": 0.33377435984020243, + "learning_rate": 8.341222092790155e-05, + "loss": 2.7961, + "step": 21930 + }, + { + "epoch": 1.0210443001140677, + "grad_norm": 0.37304105498171664, + "learning_rate": 8.341020573653376e-05, + "loss": 2.7839, + "step": 21931 + }, + { + "epoch": 1.0210908583001606, + "grad_norm": 0.340266057734977, + "learning_rate": 8.34081904471103e-05, + "loss": 2.8491, + "step": 21932 + }, + { + "epoch": 1.0211374164862537, + "grad_norm": 0.35002779647148546, + "learning_rate": 8.340617505963708e-05, + "loss": 2.7409, + "step": 21933 + }, + { + "epoch": 1.0211839746723468, + "grad_norm": 0.3324552042546855, + "learning_rate": 8.340415957412005e-05, + "loss": 2.9327, + "step": 21934 + }, + { + "epoch": 1.02123053285844, + "grad_norm": 0.36815612190484726, + "learning_rate": 8.340214399056508e-05, + "loss": 2.8087, + "step": 21935 + }, + { + "epoch": 1.021277091044533, + "grad_norm": 0.38209395909028354, + "learning_rate": 8.340012830897811e-05, + "loss": 2.8031, + "step": 21936 + }, + { + "epoch": 1.021323649230626, + "grad_norm": 0.3146918528239148, + "learning_rate": 8.339811252936507e-05, + "loss": 2.8275, + "step": 21937 + }, + { + "epoch": 1.021370207416719, + "grad_norm": 0.34916046350612756, + "learning_rate": 8.339609665173185e-05, + "loss": 2.7187, + "step": 21938 + }, + { + "epoch": 1.0214167656028121, + "grad_norm": 0.3102203131386109, + "learning_rate": 8.339408067608438e-05, + "loss": 2.8175, + "step": 21939 + }, + { + "epoch": 1.0214633237889053, + "grad_norm": 0.3522167615843727, + "learning_rate": 8.339206460242857e-05, + "loss": 2.7453, + "step": 21940 + }, + { + "epoch": 1.0215098819749981, + "grad_norm": 0.3317685386170248, + "learning_rate": 8.339004843077033e-05, + "loss": 2.7493, + "step": 21941 + }, + { + "epoch": 1.0215564401610913, + "grad_norm": 0.3421083459798459, + "learning_rate": 8.338803216111559e-05, + "loss": 2.8911, + "step": 21942 + }, + { + "epoch": 1.0216029983471844, + "grad_norm": 0.3468231478556524, + "learning_rate": 8.338601579347028e-05, + "loss": 2.8289, + "step": 21943 + }, + { + "epoch": 1.0216495565332775, + "grad_norm": 0.32633717820931246, + "learning_rate": 8.33839993278403e-05, + "loss": 2.8808, + "step": 21944 + }, + { + "epoch": 1.0216961147193706, + "grad_norm": 0.32508188077675376, + "learning_rate": 8.338198276423157e-05, + "loss": 2.6831, + "step": 21945 + }, + { + "epoch": 1.0217426729054635, + "grad_norm": 0.32153706357428635, + "learning_rate": 8.337996610265e-05, + "loss": 2.765, + "step": 21946 + }, + { + "epoch": 1.0217892310915566, + "grad_norm": 0.3129032374537278, + "learning_rate": 8.337794934310152e-05, + "loss": 2.854, + "step": 21947 + }, + { + 
"epoch": 1.0218357892776497, + "grad_norm": 0.3236395733993621, + "learning_rate": 8.337593248559206e-05, + "loss": 2.8354, + "step": 21948 + }, + { + "epoch": 1.0218823474637428, + "grad_norm": 0.3163246355850305, + "learning_rate": 8.337391553012752e-05, + "loss": 2.7449, + "step": 21949 + }, + { + "epoch": 1.021928905649836, + "grad_norm": 0.31674509785554766, + "learning_rate": 8.337189847671382e-05, + "loss": 2.7516, + "step": 21950 + }, + { + "epoch": 1.0219754638359289, + "grad_norm": 0.31852849878976797, + "learning_rate": 8.336988132535689e-05, + "loss": 2.9402, + "step": 21951 + }, + { + "epoch": 1.022022022022022, + "grad_norm": 0.3363667541033416, + "learning_rate": 8.336786407606264e-05, + "loss": 2.7254, + "step": 21952 + }, + { + "epoch": 1.022068580208115, + "grad_norm": 0.30366865290428113, + "learning_rate": 8.3365846728837e-05, + "loss": 2.8233, + "step": 21953 + }, + { + "epoch": 1.0221151383942082, + "grad_norm": 0.33963297593112995, + "learning_rate": 8.336382928368589e-05, + "loss": 2.8095, + "step": 21954 + }, + { + "epoch": 1.0221616965803013, + "grad_norm": 0.3181192522220269, + "learning_rate": 8.336181174061521e-05, + "loss": 2.7821, + "step": 21955 + }, + { + "epoch": 1.0222082547663942, + "grad_norm": 0.35960976153064944, + "learning_rate": 8.335979409963093e-05, + "loss": 2.8358, + "step": 21956 + }, + { + "epoch": 1.0222548129524873, + "grad_norm": 0.3266753614340967, + "learning_rate": 8.335777636073892e-05, + "loss": 2.706, + "step": 21957 + }, + { + "epoch": 1.0223013711385804, + "grad_norm": 0.34334549499797623, + "learning_rate": 8.335575852394513e-05, + "loss": 2.8753, + "step": 21958 + }, + { + "epoch": 1.0223479293246736, + "grad_norm": 0.36799755125398304, + "learning_rate": 8.335374058925548e-05, + "loss": 2.8681, + "step": 21959 + }, + { + "epoch": 1.0223944875107667, + "grad_norm": 0.36418004610809723, + "learning_rate": 8.335172255667588e-05, + "loss": 2.8793, + "step": 21960 + }, + { + "epoch": 1.0224410456968596, + "grad_norm": 0.353894133596513, + "learning_rate": 8.334970442621226e-05, + "loss": 2.8065, + "step": 21961 + }, + { + "epoch": 1.0224876038829527, + "grad_norm": 0.3610600567517667, + "learning_rate": 8.334768619787054e-05, + "loss": 2.8083, + "step": 21962 + }, + { + "epoch": 1.0225341620690458, + "grad_norm": 0.34586496369268815, + "learning_rate": 8.334566787165664e-05, + "loss": 2.8273, + "step": 21963 + }, + { + "epoch": 1.022580720255139, + "grad_norm": 0.3705515366141159, + "learning_rate": 8.33436494475765e-05, + "loss": 2.8918, + "step": 21964 + }, + { + "epoch": 1.022627278441232, + "grad_norm": 0.3446588492273203, + "learning_rate": 8.334163092563604e-05, + "loss": 2.8399, + "step": 21965 + }, + { + "epoch": 1.022673836627325, + "grad_norm": 0.3531884185828627, + "learning_rate": 8.333961230584117e-05, + "loss": 2.8478, + "step": 21966 + }, + { + "epoch": 1.022720394813418, + "grad_norm": 0.33663341390597296, + "learning_rate": 8.333759358819782e-05, + "loss": 2.8702, + "step": 21967 + }, + { + "epoch": 1.0227669529995111, + "grad_norm": 0.36244119623326176, + "learning_rate": 8.333557477271192e-05, + "loss": 2.8846, + "step": 21968 + }, + { + "epoch": 1.0228135111856043, + "grad_norm": 0.3208673393217828, + "learning_rate": 8.333355585938938e-05, + "loss": 2.9386, + "step": 21969 + }, + { + "epoch": 1.0228600693716974, + "grad_norm": 0.3651369750588539, + "learning_rate": 8.333153684823616e-05, + "loss": 2.8272, + "step": 21970 + }, + { + "epoch": 1.0229066275577903, + "grad_norm": 0.33635999176335224, + 
"learning_rate": 8.332951773925815e-05, + "loss": 2.8362, + "step": 21971 + }, + { + "epoch": 1.0229531857438834, + "grad_norm": 0.32787550590744236, + "learning_rate": 8.33274985324613e-05, + "loss": 2.7751, + "step": 21972 + }, + { + "epoch": 1.0229997439299765, + "grad_norm": 0.3571835227256991, + "learning_rate": 8.332547922785151e-05, + "loss": 2.946, + "step": 21973 + }, + { + "epoch": 1.0230463021160696, + "grad_norm": 0.31424589149490995, + "learning_rate": 8.332345982543473e-05, + "loss": 2.8488, + "step": 21974 + }, + { + "epoch": 1.0230928603021627, + "grad_norm": 0.3275928986812148, + "learning_rate": 8.332144032521688e-05, + "loss": 2.7745, + "step": 21975 + }, + { + "epoch": 1.0231394184882556, + "grad_norm": 0.35545887641282775, + "learning_rate": 8.331942072720388e-05, + "loss": 2.8347, + "step": 21976 + }, + { + "epoch": 1.0231859766743487, + "grad_norm": 0.30397951332595163, + "learning_rate": 8.331740103140167e-05, + "loss": 2.6974, + "step": 21977 + }, + { + "epoch": 1.0232325348604419, + "grad_norm": 0.34366990398015174, + "learning_rate": 8.331538123781616e-05, + "loss": 2.863, + "step": 21978 + }, + { + "epoch": 1.023279093046535, + "grad_norm": 0.3311333017921222, + "learning_rate": 8.33133613464533e-05, + "loss": 2.8889, + "step": 21979 + }, + { + "epoch": 1.023325651232628, + "grad_norm": 0.3377851661147261, + "learning_rate": 8.3311341357319e-05, + "loss": 2.8052, + "step": 21980 + }, + { + "epoch": 1.023372209418721, + "grad_norm": 0.3300394325841532, + "learning_rate": 8.330932127041919e-05, + "loss": 2.8817, + "step": 21981 + }, + { + "epoch": 1.023418767604814, + "grad_norm": 0.3268858770328474, + "learning_rate": 8.330730108575982e-05, + "loss": 2.8136, + "step": 21982 + }, + { + "epoch": 1.0234653257909072, + "grad_norm": 0.3153512531190855, + "learning_rate": 8.330528080334679e-05, + "loss": 2.8742, + "step": 21983 + }, + { + "epoch": 1.0235118839770003, + "grad_norm": 0.3150405619855635, + "learning_rate": 8.330326042318604e-05, + "loss": 2.8762, + "step": 21984 + }, + { + "epoch": 1.0235584421630932, + "grad_norm": 0.3158118977826282, + "learning_rate": 8.33012399452835e-05, + "loss": 2.8053, + "step": 21985 + }, + { + "epoch": 1.0236050003491863, + "grad_norm": 0.323457397885458, + "learning_rate": 8.329921936964509e-05, + "loss": 2.8754, + "step": 21986 + }, + { + "epoch": 1.0236515585352794, + "grad_norm": 0.32265410393315347, + "learning_rate": 8.329719869627677e-05, + "loss": 2.8498, + "step": 21987 + }, + { + "epoch": 1.0236981167213726, + "grad_norm": 0.32146083243800994, + "learning_rate": 8.329517792518447e-05, + "loss": 2.8558, + "step": 21988 + }, + { + "epoch": 1.0237446749074657, + "grad_norm": 0.3264395652722364, + "learning_rate": 8.329315705637407e-05, + "loss": 2.8528, + "step": 21989 + }, + { + "epoch": 1.0237912330935586, + "grad_norm": 0.33597653827449864, + "learning_rate": 8.329113608985155e-05, + "loss": 2.8257, + "step": 21990 + }, + { + "epoch": 1.0238377912796517, + "grad_norm": 0.2995200738716625, + "learning_rate": 8.328911502562282e-05, + "loss": 2.843, + "step": 21991 + }, + { + "epoch": 1.0238843494657448, + "grad_norm": 0.3247496672935825, + "learning_rate": 8.328709386369383e-05, + "loss": 2.8982, + "step": 21992 + }, + { + "epoch": 1.023930907651838, + "grad_norm": 0.3337965466809663, + "learning_rate": 8.328507260407047e-05, + "loss": 2.8872, + "step": 21993 + }, + { + "epoch": 1.023977465837931, + "grad_norm": 0.3302689889913724, + "learning_rate": 8.328305124675873e-05, + "loss": 2.8953, + "step": 21994 + }, + { + 
"epoch": 1.024024024024024, + "grad_norm": 0.3067043891453619, + "learning_rate": 8.328102979176449e-05, + "loss": 2.7411, + "step": 21995 + }, + { + "epoch": 1.024070582210117, + "grad_norm": 0.3438382854970852, + "learning_rate": 8.327900823909372e-05, + "loss": 2.8484, + "step": 21996 + }, + { + "epoch": 1.0241171403962102, + "grad_norm": 0.3366049375143766, + "learning_rate": 8.327698658875234e-05, + "loss": 2.9166, + "step": 21997 + }, + { + "epoch": 1.0241636985823033, + "grad_norm": 0.3063752350773126, + "learning_rate": 8.327496484074628e-05, + "loss": 2.8581, + "step": 21998 + }, + { + "epoch": 1.0242102567683964, + "grad_norm": 0.349353117500191, + "learning_rate": 8.327294299508149e-05, + "loss": 2.8202, + "step": 21999 + }, + { + "epoch": 1.0242568149544893, + "grad_norm": 0.30294542322870543, + "learning_rate": 8.327092105176386e-05, + "loss": 2.8308, + "step": 22000 + }, + { + "epoch": 1.0243033731405824, + "grad_norm": 0.3652833169198087, + "learning_rate": 8.326889901079937e-05, + "loss": 2.8355, + "step": 22001 + }, + { + "epoch": 1.0243499313266755, + "grad_norm": 0.340176582417386, + "learning_rate": 8.326687687219394e-05, + "loss": 2.9091, + "step": 22002 + }, + { + "epoch": 1.0243964895127686, + "grad_norm": 0.34291782231816953, + "learning_rate": 8.326485463595351e-05, + "loss": 2.9079, + "step": 22003 + }, + { + "epoch": 1.0244430476988617, + "grad_norm": 0.351498906406926, + "learning_rate": 8.3262832302084e-05, + "loss": 2.9203, + "step": 22004 + }, + { + "epoch": 1.0244896058849546, + "grad_norm": 0.33441629284586033, + "learning_rate": 8.326080987059135e-05, + "loss": 2.8148, + "step": 22005 + }, + { + "epoch": 1.0245361640710478, + "grad_norm": 0.3487668533859865, + "learning_rate": 8.325878734148152e-05, + "loss": 2.9187, + "step": 22006 + }, + { + "epoch": 1.0245827222571409, + "grad_norm": 0.343823968099662, + "learning_rate": 8.32567647147604e-05, + "loss": 2.9202, + "step": 22007 + }, + { + "epoch": 1.024629280443234, + "grad_norm": 0.3599641564549299, + "learning_rate": 8.325474199043396e-05, + "loss": 2.8488, + "step": 22008 + }, + { + "epoch": 1.024675838629327, + "grad_norm": 0.36846505298743754, + "learning_rate": 8.325271916850815e-05, + "loss": 2.7467, + "step": 22009 + }, + { + "epoch": 1.02472239681542, + "grad_norm": 0.3806985231732881, + "learning_rate": 8.325069624898887e-05, + "loss": 2.7893, + "step": 22010 + }, + { + "epoch": 1.024768955001513, + "grad_norm": 0.34251220967561974, + "learning_rate": 8.324867323188207e-05, + "loss": 2.8569, + "step": 22011 + }, + { + "epoch": 1.0248155131876062, + "grad_norm": 0.35500643930621456, + "learning_rate": 8.324665011719368e-05, + "loss": 2.7984, + "step": 22012 + }, + { + "epoch": 1.0248620713736993, + "grad_norm": 0.37533009540365775, + "learning_rate": 8.324462690492965e-05, + "loss": 2.8108, + "step": 22013 + }, + { + "epoch": 1.0249086295597925, + "grad_norm": 0.35720124828813465, + "learning_rate": 8.324260359509593e-05, + "loss": 2.7843, + "step": 22014 + }, + { + "epoch": 1.0249551877458853, + "grad_norm": 0.3576936053609158, + "learning_rate": 8.324058018769844e-05, + "loss": 2.7723, + "step": 22015 + }, + { + "epoch": 1.0250017459319785, + "grad_norm": 0.3429463304870878, + "learning_rate": 8.323855668274312e-05, + "loss": 2.948, + "step": 22016 + }, + { + "epoch": 1.0250483041180716, + "grad_norm": 0.34522835062619034, + "learning_rate": 8.32365330802359e-05, + "loss": 2.8698, + "step": 22017 + }, + { + "epoch": 1.0250948623041647, + "grad_norm": 0.3606308279464261, + "learning_rate": 
8.323450938018274e-05, + "loss": 2.8696, + "step": 22018 + }, + { + "epoch": 1.0251414204902578, + "grad_norm": 0.36445500320386287, + "learning_rate": 8.323248558258958e-05, + "loss": 2.9182, + "step": 22019 + }, + { + "epoch": 1.0251879786763507, + "grad_norm": 0.41146731630565775, + "learning_rate": 8.323046168746232e-05, + "loss": 2.7687, + "step": 22020 + }, + { + "epoch": 1.0252345368624438, + "grad_norm": 0.356373241096913, + "learning_rate": 8.322843769480695e-05, + "loss": 2.8482, + "step": 22021 + }, + { + "epoch": 1.025281095048537, + "grad_norm": 0.37974114704407436, + "learning_rate": 8.322641360462938e-05, + "loss": 2.8631, + "step": 22022 + }, + { + "epoch": 1.02532765323463, + "grad_norm": 0.33019121261123535, + "learning_rate": 8.322438941693556e-05, + "loss": 2.8562, + "step": 22023 + }, + { + "epoch": 1.025374211420723, + "grad_norm": 0.3498303092193862, + "learning_rate": 8.322236513173145e-05, + "loss": 2.7646, + "step": 22024 + }, + { + "epoch": 1.025420769606816, + "grad_norm": 0.31610942772141437, + "learning_rate": 8.322034074902294e-05, + "loss": 2.7556, + "step": 22025 + }, + { + "epoch": 1.0254673277929092, + "grad_norm": 0.3345534389576541, + "learning_rate": 8.321831626881602e-05, + "loss": 2.8692, + "step": 22026 + }, + { + "epoch": 1.0255138859790023, + "grad_norm": 0.34646119425989086, + "learning_rate": 8.321629169111661e-05, + "loss": 2.8855, + "step": 22027 + }, + { + "epoch": 1.0255604441650954, + "grad_norm": 0.31537152176986744, + "learning_rate": 8.321426701593065e-05, + "loss": 2.8278, + "step": 22028 + }, + { + "epoch": 1.0256070023511883, + "grad_norm": 0.3366279383929888, + "learning_rate": 8.321224224326412e-05, + "loss": 2.8585, + "step": 22029 + }, + { + "epoch": 1.0256535605372814, + "grad_norm": 0.3122887984571695, + "learning_rate": 8.321021737312289e-05, + "loss": 2.7639, + "step": 22030 + }, + { + "epoch": 1.0257001187233745, + "grad_norm": 0.3231860267013295, + "learning_rate": 8.320819240551297e-05, + "loss": 2.7746, + "step": 22031 + }, + { + "epoch": 1.0257466769094676, + "grad_norm": 0.3621791115112266, + "learning_rate": 8.320616734044025e-05, + "loss": 2.8676, + "step": 22032 + }, + { + "epoch": 1.0257932350955608, + "grad_norm": 0.3061116962871978, + "learning_rate": 8.320414217791073e-05, + "loss": 2.8712, + "step": 22033 + }, + { + "epoch": 1.0258397932816536, + "grad_norm": 0.35624777268052865, + "learning_rate": 8.320211691793031e-05, + "loss": 2.836, + "step": 22034 + }, + { + "epoch": 1.0258863514677468, + "grad_norm": 0.31328013918701253, + "learning_rate": 8.320009156050495e-05, + "loss": 2.7528, + "step": 22035 + }, + { + "epoch": 1.0259329096538399, + "grad_norm": 0.3506849974696731, + "learning_rate": 8.319806610564059e-05, + "loss": 2.8235, + "step": 22036 + }, + { + "epoch": 1.025979467839933, + "grad_norm": 0.29688841749560557, + "learning_rate": 8.319604055334318e-05, + "loss": 2.9247, + "step": 22037 + }, + { + "epoch": 1.026026026026026, + "grad_norm": 0.3100152194584874, + "learning_rate": 8.319401490361867e-05, + "loss": 2.8042, + "step": 22038 + }, + { + "epoch": 1.026072584212119, + "grad_norm": 0.3274198811152971, + "learning_rate": 8.319198915647297e-05, + "loss": 2.8202, + "step": 22039 + }, + { + "epoch": 1.0261191423982121, + "grad_norm": 0.3182611882947306, + "learning_rate": 8.318996331191207e-05, + "loss": 2.9174, + "step": 22040 + }, + { + "epoch": 1.0261657005843052, + "grad_norm": 0.3419529763964038, + "learning_rate": 8.31879373699419e-05, + "loss": 2.857, + "step": 22041 + }, + { + "epoch": 
1.0262122587703983, + "grad_norm": 0.3001225281136106, + "learning_rate": 8.31859113305684e-05, + "loss": 2.9169, + "step": 22042 + }, + { + "epoch": 1.0262588169564915, + "grad_norm": 0.33909157578628324, + "learning_rate": 8.318388519379751e-05, + "loss": 2.7395, + "step": 22043 + }, + { + "epoch": 1.0263053751425844, + "grad_norm": 0.2953827829854107, + "learning_rate": 8.318185895963519e-05, + "loss": 2.8509, + "step": 22044 + }, + { + "epoch": 1.0263519333286775, + "grad_norm": 0.3335674243957997, + "learning_rate": 8.317983262808739e-05, + "loss": 2.9254, + "step": 22045 + }, + { + "epoch": 1.0263984915147706, + "grad_norm": 0.34829652830002766, + "learning_rate": 8.317780619916005e-05, + "loss": 2.8837, + "step": 22046 + }, + { + "epoch": 1.0264450497008637, + "grad_norm": 0.3416079573061251, + "learning_rate": 8.317577967285913e-05, + "loss": 2.8294, + "step": 22047 + }, + { + "epoch": 1.0264916078869568, + "grad_norm": 0.3484123620191241, + "learning_rate": 8.317375304919052e-05, + "loss": 2.818, + "step": 22048 + }, + { + "epoch": 1.0265381660730497, + "grad_norm": 0.3867719756183367, + "learning_rate": 8.317172632816026e-05, + "loss": 2.7429, + "step": 22049 + }, + { + "epoch": 1.0265847242591428, + "grad_norm": 0.3442663604612283, + "learning_rate": 8.316969950977422e-05, + "loss": 2.8381, + "step": 22050 + }, + { + "epoch": 1.026631282445236, + "grad_norm": 0.36335955752076343, + "learning_rate": 8.31676725940384e-05, + "loss": 2.9054, + "step": 22051 + }, + { + "epoch": 1.026677840631329, + "grad_norm": 0.33679399649358804, + "learning_rate": 8.316564558095872e-05, + "loss": 3.0274, + "step": 22052 + }, + { + "epoch": 1.0267243988174222, + "grad_norm": 0.34995193894066245, + "learning_rate": 8.316361847054115e-05, + "loss": 2.8426, + "step": 22053 + }, + { + "epoch": 1.026770957003515, + "grad_norm": 0.362090061753851, + "learning_rate": 8.31615912627916e-05, + "loss": 2.8459, + "step": 22054 + }, + { + "epoch": 1.0268175151896082, + "grad_norm": 0.35467573755387394, + "learning_rate": 8.315956395771607e-05, + "loss": 2.7577, + "step": 22055 + }, + { + "epoch": 1.0268640733757013, + "grad_norm": 0.37330376777535784, + "learning_rate": 8.315753655532048e-05, + "loss": 2.8493, + "step": 22056 + }, + { + "epoch": 1.0269106315617944, + "grad_norm": 0.396490970850455, + "learning_rate": 8.315550905561077e-05, + "loss": 2.7553, + "step": 22057 + }, + { + "epoch": 1.0269571897478875, + "grad_norm": 0.3381240913309444, + "learning_rate": 8.315348145859295e-05, + "loss": 2.8813, + "step": 22058 + }, + { + "epoch": 1.0270037479339804, + "grad_norm": 0.35683439026928404, + "learning_rate": 8.315145376427289e-05, + "loss": 2.8452, + "step": 22059 + }, + { + "epoch": 1.0270503061200735, + "grad_norm": 0.35769248827181827, + "learning_rate": 8.31494259726566e-05, + "loss": 2.7945, + "step": 22060 + }, + { + "epoch": 1.0270968643061666, + "grad_norm": 0.3582371118012965, + "learning_rate": 8.314739808375e-05, + "loss": 2.7721, + "step": 22061 + }, + { + "epoch": 1.0271434224922598, + "grad_norm": 0.32639775736624144, + "learning_rate": 8.314537009755905e-05, + "loss": 2.869, + "step": 22062 + }, + { + "epoch": 1.0271899806783529, + "grad_norm": 0.3472311554798562, + "learning_rate": 8.31433420140897e-05, + "loss": 2.9523, + "step": 22063 + }, + { + "epoch": 1.0272365388644458, + "grad_norm": 0.3367709469498275, + "learning_rate": 8.314131383334791e-05, + "loss": 2.7989, + "step": 22064 + }, + { + "epoch": 1.0272830970505389, + "grad_norm": 0.3332965911105398, + "learning_rate": 
8.313928555533965e-05, + "loss": 2.8145, + "step": 22065 + }, + { + "epoch": 1.027329655236632, + "grad_norm": 0.33047557276067824, + "learning_rate": 8.313725718007083e-05, + "loss": 2.8373, + "step": 22066 + }, + { + "epoch": 1.0273762134227251, + "grad_norm": 0.3561481142599312, + "learning_rate": 8.313522870754742e-05, + "loss": 2.8395, + "step": 22067 + }, + { + "epoch": 1.0274227716088182, + "grad_norm": 0.3312522621908958, + "learning_rate": 8.313320013777539e-05, + "loss": 2.8391, + "step": 22068 + }, + { + "epoch": 1.0274693297949111, + "grad_norm": 0.36336530707647774, + "learning_rate": 8.313117147076069e-05, + "loss": 2.7938, + "step": 22069 + }, + { + "epoch": 1.0275158879810042, + "grad_norm": 0.32501997581224973, + "learning_rate": 8.312914270650924e-05, + "loss": 2.9134, + "step": 22070 + }, + { + "epoch": 1.0275624461670974, + "grad_norm": 0.34657827130892355, + "learning_rate": 8.312711384502703e-05, + "loss": 2.8104, + "step": 22071 + }, + { + "epoch": 1.0276090043531905, + "grad_norm": 0.3280728748329618, + "learning_rate": 8.312508488632002e-05, + "loss": 2.7539, + "step": 22072 + }, + { + "epoch": 1.0276555625392834, + "grad_norm": 0.3558622710710018, + "learning_rate": 8.312305583039414e-05, + "loss": 2.9328, + "step": 22073 + }, + { + "epoch": 1.0277021207253765, + "grad_norm": 0.3419018116490357, + "learning_rate": 8.312102667725534e-05, + "loss": 2.8145, + "step": 22074 + }, + { + "epoch": 1.0277486789114696, + "grad_norm": 0.3455001459928195, + "learning_rate": 8.31189974269096e-05, + "loss": 2.8759, + "step": 22075 + }, + { + "epoch": 1.0277952370975627, + "grad_norm": 0.4065771814851396, + "learning_rate": 8.311696807936287e-05, + "loss": 2.9636, + "step": 22076 + }, + { + "epoch": 1.0278417952836558, + "grad_norm": 0.3193025983295817, + "learning_rate": 8.31149386346211e-05, + "loss": 2.8208, + "step": 22077 + }, + { + "epoch": 1.0278883534697487, + "grad_norm": 0.375874570121351, + "learning_rate": 8.311290909269025e-05, + "loss": 2.8223, + "step": 22078 + }, + { + "epoch": 1.0279349116558418, + "grad_norm": 0.35371361416711855, + "learning_rate": 8.311087945357627e-05, + "loss": 2.7747, + "step": 22079 + }, + { + "epoch": 1.027981469841935, + "grad_norm": 0.3614362490342382, + "learning_rate": 8.310884971728512e-05, + "loss": 2.9161, + "step": 22080 + }, + { + "epoch": 1.028028028028028, + "grad_norm": 0.3384201433526402, + "learning_rate": 8.310681988382277e-05, + "loss": 2.676, + "step": 22081 + }, + { + "epoch": 1.0280745862141212, + "grad_norm": 0.3580322948115711, + "learning_rate": 8.310478995319515e-05, + "loss": 2.8717, + "step": 22082 + }, + { + "epoch": 1.028121144400214, + "grad_norm": 0.32372113644213224, + "learning_rate": 8.310275992540824e-05, + "loss": 2.8264, + "step": 22083 + }, + { + "epoch": 1.0281677025863072, + "grad_norm": 0.389460664475877, + "learning_rate": 8.310072980046797e-05, + "loss": 2.8221, + "step": 22084 + }, + { + "epoch": 1.0282142607724003, + "grad_norm": 0.3631768239395199, + "learning_rate": 8.309869957838034e-05, + "loss": 2.8184, + "step": 22085 + }, + { + "epoch": 1.0282608189584934, + "grad_norm": 0.3551574092670634, + "learning_rate": 8.309666925915129e-05, + "loss": 2.8411, + "step": 22086 + }, + { + "epoch": 1.0283073771445865, + "grad_norm": 0.31971514125518585, + "learning_rate": 8.309463884278676e-05, + "loss": 2.8262, + "step": 22087 + }, + { + "epoch": 1.0283539353306794, + "grad_norm": 0.3213254735737083, + "learning_rate": 8.309260832929275e-05, + "loss": 2.8259, + "step": 22088 + }, + { + "epoch": 
1.0284004935167725, + "grad_norm": 0.37295035750689653, + "learning_rate": 8.309057771867516e-05, + "loss": 2.8837, + "step": 22089 + }, + { + "epoch": 1.0284470517028657, + "grad_norm": 0.32408428331507766, + "learning_rate": 8.308854701094002e-05, + "loss": 2.8799, + "step": 22090 + }, + { + "epoch": 1.0284936098889588, + "grad_norm": 0.37743065718947083, + "learning_rate": 8.308651620609325e-05, + "loss": 2.8754, + "step": 22091 + }, + { + "epoch": 1.0285401680750519, + "grad_norm": 0.31757890707120834, + "learning_rate": 8.308448530414079e-05, + "loss": 2.8881, + "step": 22092 + }, + { + "epoch": 1.0285867262611448, + "grad_norm": 0.35384416155410314, + "learning_rate": 8.308245430508864e-05, + "loss": 2.8669, + "step": 22093 + }, + { + "epoch": 1.028633284447238, + "grad_norm": 0.3283865895311173, + "learning_rate": 8.308042320894274e-05, + "loss": 2.8921, + "step": 22094 + }, + { + "epoch": 1.028679842633331, + "grad_norm": 0.34956249724360017, + "learning_rate": 8.307839201570906e-05, + "loss": 2.8614, + "step": 22095 + }, + { + "epoch": 1.0287264008194241, + "grad_norm": 0.30579521216971894, + "learning_rate": 8.307636072539358e-05, + "loss": 2.8228, + "step": 22096 + }, + { + "epoch": 1.0287729590055172, + "grad_norm": 0.3731773639044642, + "learning_rate": 8.307432933800221e-05, + "loss": 2.9477, + "step": 22097 + }, + { + "epoch": 1.0288195171916101, + "grad_norm": 0.3388696035148085, + "learning_rate": 8.307229785354095e-05, + "loss": 2.8761, + "step": 22098 + }, + { + "epoch": 1.0288660753777032, + "grad_norm": 0.34124290309768035, + "learning_rate": 8.307026627201575e-05, + "loss": 2.7177, + "step": 22099 + }, + { + "epoch": 1.0289126335637964, + "grad_norm": 0.34930027902493094, + "learning_rate": 8.306823459343258e-05, + "loss": 2.8189, + "step": 22100 + }, + { + "epoch": 1.0289591917498895, + "grad_norm": 0.3255139000146951, + "learning_rate": 8.30662028177974e-05, + "loss": 2.7317, + "step": 22101 + }, + { + "epoch": 1.0290057499359826, + "grad_norm": 0.39893492245928536, + "learning_rate": 8.306417094511618e-05, + "loss": 2.7547, + "step": 22102 + }, + { + "epoch": 1.0290523081220755, + "grad_norm": 0.3180449109154535, + "learning_rate": 8.306213897539487e-05, + "loss": 2.7821, + "step": 22103 + }, + { + "epoch": 1.0290988663081686, + "grad_norm": 0.4201175659873142, + "learning_rate": 8.306010690863943e-05, + "loss": 2.7306, + "step": 22104 + }, + { + "epoch": 1.0291454244942617, + "grad_norm": 0.29047984089218276, + "learning_rate": 8.305807474485586e-05, + "loss": 2.8155, + "step": 22105 + }, + { + "epoch": 1.0291919826803548, + "grad_norm": 0.38134895824221504, + "learning_rate": 8.305604248405005e-05, + "loss": 2.8377, + "step": 22106 + }, + { + "epoch": 1.029238540866448, + "grad_norm": 0.30728967701580756, + "learning_rate": 8.305401012622804e-05, + "loss": 2.9046, + "step": 22107 + }, + { + "epoch": 1.0292850990525408, + "grad_norm": 0.3830706628141842, + "learning_rate": 8.305197767139577e-05, + "loss": 2.8062, + "step": 22108 + }, + { + "epoch": 1.029331657238634, + "grad_norm": 0.334953583673501, + "learning_rate": 8.304994511955922e-05, + "loss": 2.9257, + "step": 22109 + }, + { + "epoch": 1.029378215424727, + "grad_norm": 0.38855301499333333, + "learning_rate": 8.30479124707243e-05, + "loss": 2.7585, + "step": 22110 + }, + { + "epoch": 1.0294247736108202, + "grad_norm": 0.3412312827840194, + "learning_rate": 8.304587972489703e-05, + "loss": 2.8356, + "step": 22111 + }, + { + "epoch": 1.029471331796913, + "grad_norm": 0.3683972314220885, + "learning_rate": 
8.304384688208335e-05, + "loss": 2.7696, + "step": 22112 + }, + { + "epoch": 1.0295178899830062, + "grad_norm": 0.3717701927477945, + "learning_rate": 8.304181394228925e-05, + "loss": 2.9068, + "step": 22113 + }, + { + "epoch": 1.0295644481690993, + "grad_norm": 0.37317786126246894, + "learning_rate": 8.303978090552067e-05, + "loss": 2.8824, + "step": 22114 + }, + { + "epoch": 1.0296110063551924, + "grad_norm": 0.3653386617161878, + "learning_rate": 8.303774777178359e-05, + "loss": 2.851, + "step": 22115 + }, + { + "epoch": 1.0296575645412855, + "grad_norm": 0.3824357135679091, + "learning_rate": 8.303571454108398e-05, + "loss": 2.8359, + "step": 22116 + }, + { + "epoch": 1.0297041227273784, + "grad_norm": 0.3975000910450641, + "learning_rate": 8.30336812134278e-05, + "loss": 2.8022, + "step": 22117 + }, + { + "epoch": 1.0297506809134715, + "grad_norm": 0.3851474650023451, + "learning_rate": 8.303164778882102e-05, + "loss": 2.873, + "step": 22118 + }, + { + "epoch": 1.0297972390995647, + "grad_norm": 0.3578448900526859, + "learning_rate": 8.302961426726961e-05, + "loss": 2.8102, + "step": 22119 + }, + { + "epoch": 1.0298437972856578, + "grad_norm": 0.3711008213519944, + "learning_rate": 8.302758064877952e-05, + "loss": 2.8877, + "step": 22120 + }, + { + "epoch": 1.029890355471751, + "grad_norm": 0.33409206409998965, + "learning_rate": 8.302554693335676e-05, + "loss": 2.9651, + "step": 22121 + }, + { + "epoch": 1.0299369136578438, + "grad_norm": 0.42942787574156654, + "learning_rate": 8.302351312100724e-05, + "loss": 2.8204, + "step": 22122 + }, + { + "epoch": 1.029983471843937, + "grad_norm": 0.3608951424200901, + "learning_rate": 8.3021479211737e-05, + "loss": 2.8625, + "step": 22123 + }, + { + "epoch": 1.03003003003003, + "grad_norm": 0.37715139785690366, + "learning_rate": 8.301944520555195e-05, + "loss": 2.7926, + "step": 22124 + }, + { + "epoch": 1.0300765882161231, + "grad_norm": 0.3641314354077324, + "learning_rate": 8.301741110245809e-05, + "loss": 2.7565, + "step": 22125 + }, + { + "epoch": 1.0301231464022162, + "grad_norm": 0.39412847491907493, + "learning_rate": 8.301537690246137e-05, + "loss": 2.7847, + "step": 22126 + }, + { + "epoch": 1.0301697045883091, + "grad_norm": 0.38476383384840546, + "learning_rate": 8.301334260556778e-05, + "loss": 2.8776, + "step": 22127 + }, + { + "epoch": 1.0302162627744023, + "grad_norm": 0.4319340248927201, + "learning_rate": 8.301130821178328e-05, + "loss": 2.8367, + "step": 22128 + }, + { + "epoch": 1.0302628209604954, + "grad_norm": 0.3396709863938248, + "learning_rate": 8.300927372111384e-05, + "loss": 2.7372, + "step": 22129 + }, + { + "epoch": 1.0303093791465885, + "grad_norm": 0.3932888438264599, + "learning_rate": 8.300723913356543e-05, + "loss": 2.8027, + "step": 22130 + }, + { + "epoch": 1.0303559373326816, + "grad_norm": 0.3545141401594881, + "learning_rate": 8.300520444914404e-05, + "loss": 2.8522, + "step": 22131 + }, + { + "epoch": 1.0304024955187745, + "grad_norm": 0.40262655297747074, + "learning_rate": 8.300316966785562e-05, + "loss": 2.971, + "step": 22132 + }, + { + "epoch": 1.0304490537048676, + "grad_norm": 0.35754013629465353, + "learning_rate": 8.300113478970614e-05, + "loss": 2.6965, + "step": 22133 + }, + { + "epoch": 1.0304956118909607, + "grad_norm": 0.3813132915777125, + "learning_rate": 8.299909981470158e-05, + "loss": 2.8628, + "step": 22134 + }, + { + "epoch": 1.0305421700770538, + "grad_norm": 0.3525794764679712, + "learning_rate": 8.299706474284792e-05, + "loss": 2.8851, + "step": 22135 + }, + { + "epoch": 
1.030588728263147, + "grad_norm": 0.3566908604849304, + "learning_rate": 8.299502957415113e-05, + "loss": 2.8421, + "step": 22136 + }, + { + "epoch": 1.0306352864492399, + "grad_norm": 0.33805133657804487, + "learning_rate": 8.299299430861718e-05, + "loss": 2.8521, + "step": 22137 + }, + { + "epoch": 1.030681844635333, + "grad_norm": 0.3499562166125696, + "learning_rate": 8.299095894625203e-05, + "loss": 2.7476, + "step": 22138 + }, + { + "epoch": 1.030728402821426, + "grad_norm": 0.3564175170629689, + "learning_rate": 8.298892348706167e-05, + "loss": 2.8843, + "step": 22139 + }, + { + "epoch": 1.0307749610075192, + "grad_norm": 0.33734834854552864, + "learning_rate": 8.298688793105207e-05, + "loss": 2.8537, + "step": 22140 + }, + { + "epoch": 1.0308215191936123, + "grad_norm": 0.3481486491800548, + "learning_rate": 8.298485227822921e-05, + "loss": 2.8868, + "step": 22141 + }, + { + "epoch": 1.0308680773797052, + "grad_norm": 0.37682205278390174, + "learning_rate": 8.298281652859905e-05, + "loss": 2.833, + "step": 22142 + }, + { + "epoch": 1.0309146355657983, + "grad_norm": 0.3374279101722899, + "learning_rate": 8.298078068216757e-05, + "loss": 2.8269, + "step": 22143 + }, + { + "epoch": 1.0309611937518914, + "grad_norm": 0.3705987630775791, + "learning_rate": 8.297874473894075e-05, + "loss": 2.8681, + "step": 22144 + }, + { + "epoch": 1.0310077519379846, + "grad_norm": 0.3703303604480022, + "learning_rate": 8.297670869892458e-05, + "loss": 2.8463, + "step": 22145 + }, + { + "epoch": 1.0310543101240777, + "grad_norm": 0.35677192896111704, + "learning_rate": 8.2974672562125e-05, + "loss": 2.7396, + "step": 22146 + }, + { + "epoch": 1.0311008683101706, + "grad_norm": 0.29942360577715965, + "learning_rate": 8.297263632854802e-05, + "loss": 2.9219, + "step": 22147 + }, + { + "epoch": 1.0311474264962637, + "grad_norm": 0.3558434784038553, + "learning_rate": 8.297059999819959e-05, + "loss": 2.8451, + "step": 22148 + }, + { + "epoch": 1.0311939846823568, + "grad_norm": 0.3304381660822176, + "learning_rate": 8.29685635710857e-05, + "loss": 2.8898, + "step": 22149 + }, + { + "epoch": 1.03124054286845, + "grad_norm": 0.3355050767574701, + "learning_rate": 8.29665270472123e-05, + "loss": 2.8655, + "step": 22150 + }, + { + "epoch": 1.031287101054543, + "grad_norm": 0.33188124581572115, + "learning_rate": 8.296449042658542e-05, + "loss": 2.8969, + "step": 22151 + }, + { + "epoch": 1.031333659240636, + "grad_norm": 0.33323920771719895, + "learning_rate": 8.296245370921101e-05, + "loss": 2.7717, + "step": 22152 + }, + { + "epoch": 1.031380217426729, + "grad_norm": 0.31617044028115765, + "learning_rate": 8.296041689509502e-05, + "loss": 2.8588, + "step": 22153 + }, + { + "epoch": 1.0314267756128221, + "grad_norm": 0.32667507763565196, + "learning_rate": 8.295837998424347e-05, + "loss": 2.8279, + "step": 22154 + }, + { + "epoch": 1.0314733337989153, + "grad_norm": 0.3114479849457896, + "learning_rate": 8.295634297666233e-05, + "loss": 2.8269, + "step": 22155 + }, + { + "epoch": 1.0315198919850084, + "grad_norm": 0.3429906790869424, + "learning_rate": 8.295430587235756e-05, + "loss": 2.8373, + "step": 22156 + }, + { + "epoch": 1.0315664501711013, + "grad_norm": 0.3260205216026554, + "learning_rate": 8.295226867133514e-05, + "loss": 2.7252, + "step": 22157 + }, + { + "epoch": 1.0316130083571944, + "grad_norm": 0.3231487791899592, + "learning_rate": 8.295023137360108e-05, + "loss": 2.7908, + "step": 22158 + }, + { + "epoch": 1.0316595665432875, + "grad_norm": 0.3645935455694702, + "learning_rate": 
8.294819397916133e-05, + "loss": 2.8102, + "step": 22159 + }, + { + "epoch": 1.0317061247293806, + "grad_norm": 0.3338294267235227, + "learning_rate": 8.294615648802187e-05, + "loss": 2.9407, + "step": 22160 + }, + { + "epoch": 1.0317526829154735, + "grad_norm": 0.3173060243880283, + "learning_rate": 8.294411890018868e-05, + "loss": 2.843, + "step": 22161 + }, + { + "epoch": 1.0317992411015666, + "grad_norm": 0.3342268448986254, + "learning_rate": 8.294208121566776e-05, + "loss": 2.8137, + "step": 22162 + }, + { + "epoch": 1.0318457992876597, + "grad_norm": 0.3400561477242693, + "learning_rate": 8.294004343446508e-05, + "loss": 2.7592, + "step": 22163 + }, + { + "epoch": 1.0318923574737529, + "grad_norm": 0.3205690894952657, + "learning_rate": 8.29380055565866e-05, + "loss": 2.8139, + "step": 22164 + }, + { + "epoch": 1.031938915659846, + "grad_norm": 0.33657563404377444, + "learning_rate": 8.293596758203833e-05, + "loss": 2.8589, + "step": 22165 + }, + { + "epoch": 1.0319854738459389, + "grad_norm": 0.3175826276321888, + "learning_rate": 8.293392951082623e-05, + "loss": 2.8583, + "step": 22166 + }, + { + "epoch": 1.032032032032032, + "grad_norm": 0.3658701018615691, + "learning_rate": 8.293189134295631e-05, + "loss": 2.8674, + "step": 22167 + }, + { + "epoch": 1.032078590218125, + "grad_norm": 0.29138461953807077, + "learning_rate": 8.292985307843452e-05, + "loss": 2.7669, + "step": 22168 + }, + { + "epoch": 1.0321251484042182, + "grad_norm": 0.40691363402066694, + "learning_rate": 8.292781471726686e-05, + "loss": 2.7569, + "step": 22169 + }, + { + "epoch": 1.0321717065903113, + "grad_norm": 0.32085780314928736, + "learning_rate": 8.29257762594593e-05, + "loss": 2.8676, + "step": 22170 + }, + { + "epoch": 1.0322182647764042, + "grad_norm": 0.4012616235193257, + "learning_rate": 8.292373770501783e-05, + "loss": 2.8629, + "step": 22171 + }, + { + "epoch": 1.0322648229624973, + "grad_norm": 0.3520172127610486, + "learning_rate": 8.292169905394844e-05, + "loss": 2.8334, + "step": 22172 + }, + { + "epoch": 1.0323113811485904, + "grad_norm": 0.33877379075463054, + "learning_rate": 8.291966030625711e-05, + "loss": 2.6932, + "step": 22173 + }, + { + "epoch": 1.0323579393346836, + "grad_norm": 0.32818805847736, + "learning_rate": 8.291762146194982e-05, + "loss": 2.821, + "step": 22174 + }, + { + "epoch": 1.0324044975207767, + "grad_norm": 0.30613173185046944, + "learning_rate": 8.291558252103255e-05, + "loss": 2.7633, + "step": 22175 + }, + { + "epoch": 1.0324510557068696, + "grad_norm": 0.31383240877207214, + "learning_rate": 8.291354348351128e-05, + "loss": 2.761, + "step": 22176 + }, + { + "epoch": 1.0324976138929627, + "grad_norm": 0.3136744797651253, + "learning_rate": 8.291150434939202e-05, + "loss": 2.8622, + "step": 22177 + }, + { + "epoch": 1.0325441720790558, + "grad_norm": 0.3214001213766325, + "learning_rate": 8.290946511868073e-05, + "loss": 2.885, + "step": 22178 + }, + { + "epoch": 1.032590730265149, + "grad_norm": 0.34942754350929134, + "learning_rate": 8.290742579138339e-05, + "loss": 2.8523, + "step": 22179 + }, + { + "epoch": 1.032637288451242, + "grad_norm": 0.3120787097131473, + "learning_rate": 8.290538636750602e-05, + "loss": 2.884, + "step": 22180 + }, + { + "epoch": 1.032683846637335, + "grad_norm": 0.3673549671193936, + "learning_rate": 8.290334684705457e-05, + "loss": 2.8962, + "step": 22181 + }, + { + "epoch": 1.032730404823428, + "grad_norm": 0.3206245143087574, + "learning_rate": 8.290130723003503e-05, + "loss": 2.8581, + "step": 22182 + }, + { + "epoch": 
1.0327769630095212, + "grad_norm": 0.30266495967453627, + "learning_rate": 8.28992675164534e-05, + "loss": 2.8259, + "step": 22183 + }, + { + "epoch": 1.0328235211956143, + "grad_norm": 0.32481758910935205, + "learning_rate": 8.289722770631567e-05, + "loss": 2.8043, + "step": 22184 + }, + { + "epoch": 1.0328700793817074, + "grad_norm": 0.31092301495786295, + "learning_rate": 8.28951877996278e-05, + "loss": 2.7894, + "step": 22185 + }, + { + "epoch": 1.0329166375678003, + "grad_norm": 0.31418001482279684, + "learning_rate": 8.28931477963958e-05, + "loss": 2.8394, + "step": 22186 + }, + { + "epoch": 1.0329631957538934, + "grad_norm": 0.32948155714653443, + "learning_rate": 8.289110769662567e-05, + "loss": 2.8562, + "step": 22187 + }, + { + "epoch": 1.0330097539399865, + "grad_norm": 0.35358658915520647, + "learning_rate": 8.288906750032336e-05, + "loss": 2.9594, + "step": 22188 + }, + { + "epoch": 1.0330563121260796, + "grad_norm": 0.311399928111791, + "learning_rate": 8.288702720749488e-05, + "loss": 2.8818, + "step": 22189 + }, + { + "epoch": 1.0331028703121727, + "grad_norm": 0.3591741626685473, + "learning_rate": 8.288498681814623e-05, + "loss": 2.841, + "step": 22190 + }, + { + "epoch": 1.0331494284982656, + "grad_norm": 0.33917359512549683, + "learning_rate": 8.288294633228338e-05, + "loss": 2.8461, + "step": 22191 + }, + { + "epoch": 1.0331959866843587, + "grad_norm": 0.36945677028884866, + "learning_rate": 8.28809057499123e-05, + "loss": 2.9393, + "step": 22192 + }, + { + "epoch": 1.0332425448704519, + "grad_norm": 0.362273368718963, + "learning_rate": 8.287886507103901e-05, + "loss": 2.8772, + "step": 22193 + }, + { + "epoch": 1.033289103056545, + "grad_norm": 0.3406297143430167, + "learning_rate": 8.287682429566949e-05, + "loss": 2.7715, + "step": 22194 + }, + { + "epoch": 1.033335661242638, + "grad_norm": 0.344230762304959, + "learning_rate": 8.287478342380976e-05, + "loss": 2.8298, + "step": 22195 + }, + { + "epoch": 1.033382219428731, + "grad_norm": 0.36620546720781955, + "learning_rate": 8.287274245546575e-05, + "loss": 2.8841, + "step": 22196 + }, + { + "epoch": 1.033428777614824, + "grad_norm": 0.3329067151938115, + "learning_rate": 8.287070139064348e-05, + "loss": 2.6886, + "step": 22197 + }, + { + "epoch": 1.0334753358009172, + "grad_norm": 0.3632442337556329, + "learning_rate": 8.286866022934894e-05, + "loss": 2.872, + "step": 22198 + }, + { + "epoch": 1.0335218939870103, + "grad_norm": 0.3357880182973042, + "learning_rate": 8.286661897158811e-05, + "loss": 2.8361, + "step": 22199 + }, + { + "epoch": 1.0335684521731032, + "grad_norm": 0.38025478408677954, + "learning_rate": 8.286457761736702e-05, + "loss": 2.845, + "step": 22200 + }, + { + "epoch": 1.0336150103591963, + "grad_norm": 0.32913338871533027, + "learning_rate": 8.286253616669162e-05, + "loss": 2.9263, + "step": 22201 + }, + { + "epoch": 1.0336615685452895, + "grad_norm": 0.4041669909718849, + "learning_rate": 8.286049461956791e-05, + "loss": 2.8622, + "step": 22202 + }, + { + "epoch": 1.0337081267313826, + "grad_norm": 0.3142051730032596, + "learning_rate": 8.285845297600188e-05, + "loss": 2.8931, + "step": 22203 + }, + { + "epoch": 1.0337546849174757, + "grad_norm": 0.39563481536043704, + "learning_rate": 8.285641123599954e-05, + "loss": 2.8572, + "step": 22204 + }, + { + "epoch": 1.0338012431035686, + "grad_norm": 0.34326976525148534, + "learning_rate": 8.285436939956686e-05, + "loss": 2.9342, + "step": 22205 + }, + { + "epoch": 1.0338478012896617, + "grad_norm": 0.38041795518891036, + "learning_rate": 
8.285232746670986e-05, + "loss": 2.8176, + "step": 22206 + }, + { + "epoch": 1.0338943594757548, + "grad_norm": 0.3615080253155261, + "learning_rate": 8.285028543743449e-05, + "loss": 2.8751, + "step": 22207 + }, + { + "epoch": 1.033940917661848, + "grad_norm": 0.36115849662940075, + "learning_rate": 8.284824331174678e-05, + "loss": 2.8284, + "step": 22208 + }, + { + "epoch": 1.033987475847941, + "grad_norm": 0.33572448510994796, + "learning_rate": 8.284620108965271e-05, + "loss": 2.9023, + "step": 22209 + }, + { + "epoch": 1.034034034034034, + "grad_norm": 0.35044568529176423, + "learning_rate": 8.284415877115827e-05, + "loss": 2.7973, + "step": 22210 + }, + { + "epoch": 1.034080592220127, + "grad_norm": 0.3343376768318162, + "learning_rate": 8.284211635626949e-05, + "loss": 2.8677, + "step": 22211 + }, + { + "epoch": 1.0341271504062202, + "grad_norm": 0.39611890094768015, + "learning_rate": 8.284007384499231e-05, + "loss": 2.9506, + "step": 22212 + }, + { + "epoch": 1.0341737085923133, + "grad_norm": 0.3158735275424697, + "learning_rate": 8.283803123733274e-05, + "loss": 2.7965, + "step": 22213 + }, + { + "epoch": 1.0342202667784064, + "grad_norm": 0.36548856605981167, + "learning_rate": 8.28359885332968e-05, + "loss": 2.8224, + "step": 22214 + }, + { + "epoch": 1.0342668249644993, + "grad_norm": 0.3262581064741626, + "learning_rate": 8.283394573289046e-05, + "loss": 2.8527, + "step": 22215 + }, + { + "epoch": 1.0343133831505924, + "grad_norm": 0.3693790082513379, + "learning_rate": 8.283190283611973e-05, + "loss": 2.9265, + "step": 22216 + }, + { + "epoch": 1.0343599413366855, + "grad_norm": 0.35585758859277833, + "learning_rate": 8.28298598429906e-05, + "loss": 2.8624, + "step": 22217 + }, + { + "epoch": 1.0344064995227786, + "grad_norm": 0.3542299635498818, + "learning_rate": 8.282781675350906e-05, + "loss": 2.8335, + "step": 22218 + }, + { + "epoch": 1.0344530577088717, + "grad_norm": 0.33951817028373266, + "learning_rate": 8.282577356768111e-05, + "loss": 2.8303, + "step": 22219 + }, + { + "epoch": 1.0344996158949646, + "grad_norm": 0.35554073264727776, + "learning_rate": 8.282373028551276e-05, + "loss": 3.002, + "step": 22220 + }, + { + "epoch": 1.0345461740810578, + "grad_norm": 0.34576481879557386, + "learning_rate": 8.282168690700998e-05, + "loss": 2.8525, + "step": 22221 + }, + { + "epoch": 1.0345927322671509, + "grad_norm": 0.3611152046201076, + "learning_rate": 8.281964343217879e-05, + "loss": 2.8349, + "step": 22222 + }, + { + "epoch": 1.034639290453244, + "grad_norm": 0.3378095700052374, + "learning_rate": 8.281759986102518e-05, + "loss": 2.8184, + "step": 22223 + }, + { + "epoch": 1.034685848639337, + "grad_norm": 0.3948318280633087, + "learning_rate": 8.281555619355515e-05, + "loss": 2.8691, + "step": 22224 + }, + { + "epoch": 1.03473240682543, + "grad_norm": 0.3401591790690386, + "learning_rate": 8.281351242977469e-05, + "loss": 2.8523, + "step": 22225 + }, + { + "epoch": 1.034778965011523, + "grad_norm": 0.39373627795651567, + "learning_rate": 8.28114685696898e-05, + "loss": 2.8829, + "step": 22226 + }, + { + "epoch": 1.0348255231976162, + "grad_norm": 0.33412689249907207, + "learning_rate": 8.28094246133065e-05, + "loss": 2.7972, + "step": 22227 + }, + { + "epoch": 1.0348720813837093, + "grad_norm": 0.37916995198015957, + "learning_rate": 8.280738056063076e-05, + "loss": 2.8476, + "step": 22228 + }, + { + "epoch": 1.0349186395698025, + "grad_norm": 0.36187621049126717, + "learning_rate": 8.280533641166857e-05, + "loss": 2.8665, + "step": 22229 + }, + { + "epoch": 
1.0349651977558953, + "grad_norm": 0.3704739284406004, + "learning_rate": 8.280329216642597e-05, + "loss": 2.894, + "step": 22230 + }, + { + "epoch": 1.0350117559419885, + "grad_norm": 0.39155677616173384, + "learning_rate": 8.280124782490895e-05, + "loss": 2.8625, + "step": 22231 + }, + { + "epoch": 1.0350583141280816, + "grad_norm": 0.3218974755337167, + "learning_rate": 8.279920338712349e-05, + "loss": 2.8051, + "step": 22232 + }, + { + "epoch": 1.0351048723141747, + "grad_norm": 0.3854049643582846, + "learning_rate": 8.279715885307559e-05, + "loss": 2.7683, + "step": 22233 + }, + { + "epoch": 1.0351514305002678, + "grad_norm": 0.32862176035208795, + "learning_rate": 8.279511422277127e-05, + "loss": 2.7846, + "step": 22234 + }, + { + "epoch": 1.0351979886863607, + "grad_norm": 0.3619298637122412, + "learning_rate": 8.27930694962165e-05, + "loss": 2.8202, + "step": 22235 + }, + { + "epoch": 1.0352445468724538, + "grad_norm": 0.30744721842776956, + "learning_rate": 8.279102467341732e-05, + "loss": 2.7689, + "step": 22236 + }, + { + "epoch": 1.035291105058547, + "grad_norm": 0.3797439873945807, + "learning_rate": 8.278897975437973e-05, + "loss": 2.7922, + "step": 22237 + }, + { + "epoch": 1.03533766324464, + "grad_norm": 0.35543621384999535, + "learning_rate": 8.278693473910969e-05, + "loss": 2.6875, + "step": 22238 + }, + { + "epoch": 1.0353842214307332, + "grad_norm": 0.3526481057468054, + "learning_rate": 8.278488962761323e-05, + "loss": 2.7819, + "step": 22239 + }, + { + "epoch": 1.035430779616826, + "grad_norm": 0.3668008173093488, + "learning_rate": 8.278284441989636e-05, + "loss": 2.8522, + "step": 22240 + }, + { + "epoch": 1.0354773378029192, + "grad_norm": 0.3416447989762699, + "learning_rate": 8.278079911596505e-05, + "loss": 2.8556, + "step": 22241 + }, + { + "epoch": 1.0355238959890123, + "grad_norm": 0.35973979169603065, + "learning_rate": 8.277875371582535e-05, + "loss": 2.8619, + "step": 22242 + }, + { + "epoch": 1.0355704541751054, + "grad_norm": 0.33749969187716894, + "learning_rate": 8.277670821948323e-05, + "loss": 2.8067, + "step": 22243 + }, + { + "epoch": 1.0356170123611985, + "grad_norm": 0.3657505940088052, + "learning_rate": 8.277466262694469e-05, + "loss": 2.837, + "step": 22244 + }, + { + "epoch": 1.0356635705472914, + "grad_norm": 0.31319135550329646, + "learning_rate": 8.277261693821574e-05, + "loss": 2.7915, + "step": 22245 + }, + { + "epoch": 1.0357101287333845, + "grad_norm": 0.3361403781172457, + "learning_rate": 8.27705711533024e-05, + "loss": 2.7917, + "step": 22246 + }, + { + "epoch": 1.0357566869194776, + "grad_norm": 0.33952926727354377, + "learning_rate": 8.276852527221066e-05, + "loss": 2.8528, + "step": 22247 + }, + { + "epoch": 1.0358032451055708, + "grad_norm": 0.3215392634016823, + "learning_rate": 8.276647929494652e-05, + "loss": 2.811, + "step": 22248 + }, + { + "epoch": 1.0358498032916637, + "grad_norm": 0.3783401424982455, + "learning_rate": 8.2764433221516e-05, + "loss": 2.922, + "step": 22249 + }, + { + "epoch": 1.0358963614777568, + "grad_norm": 0.32724450100536806, + "learning_rate": 8.276238705192509e-05, + "loss": 2.8804, + "step": 22250 + }, + { + "epoch": 1.0359429196638499, + "grad_norm": 0.35810444555252374, + "learning_rate": 8.276034078617981e-05, + "loss": 2.7803, + "step": 22251 + }, + { + "epoch": 1.035989477849943, + "grad_norm": 0.3627247337259134, + "learning_rate": 8.275829442428615e-05, + "loss": 2.8937, + "step": 22252 + }, + { + "epoch": 1.0360360360360361, + "grad_norm": 0.3299849739771982, + "learning_rate": 
8.275624796625012e-05, + "loss": 2.8964, + "step": 22253 + }, + { + "epoch": 1.036082594222129, + "grad_norm": 0.35060297625755965, + "learning_rate": 8.275420141207774e-05, + "loss": 2.8376, + "step": 22254 + }, + { + "epoch": 1.0361291524082221, + "grad_norm": 0.35191326652195715, + "learning_rate": 8.2752154761775e-05, + "loss": 2.8568, + "step": 22255 + }, + { + "epoch": 1.0361757105943152, + "grad_norm": 0.31548190853146185, + "learning_rate": 8.27501080153479e-05, + "loss": 2.8374, + "step": 22256 + }, + { + "epoch": 1.0362222687804084, + "grad_norm": 0.34646194797468777, + "learning_rate": 8.274806117280249e-05, + "loss": 2.868, + "step": 22257 + }, + { + "epoch": 1.0362688269665015, + "grad_norm": 0.31970491497793135, + "learning_rate": 8.274601423414472e-05, + "loss": 2.8245, + "step": 22258 + }, + { + "epoch": 1.0363153851525944, + "grad_norm": 0.3495702077756185, + "learning_rate": 8.274396719938063e-05, + "loss": 2.8506, + "step": 22259 + }, + { + "epoch": 1.0363619433386875, + "grad_norm": 0.358672225010104, + "learning_rate": 8.274192006851622e-05, + "loss": 2.913, + "step": 22260 + }, + { + "epoch": 1.0364085015247806, + "grad_norm": 0.37413604499263103, + "learning_rate": 8.27398728415575e-05, + "loss": 2.6831, + "step": 22261 + }, + { + "epoch": 1.0364550597108737, + "grad_norm": 0.34915994733504296, + "learning_rate": 8.273782551851049e-05, + "loss": 2.8133, + "step": 22262 + }, + { + "epoch": 1.0365016178969668, + "grad_norm": 0.3493585962374505, + "learning_rate": 8.273577809938118e-05, + "loss": 2.9141, + "step": 22263 + }, + { + "epoch": 1.0365481760830597, + "grad_norm": 0.3546327899875576, + "learning_rate": 8.273373058417559e-05, + "loss": 2.9529, + "step": 22264 + }, + { + "epoch": 1.0365947342691528, + "grad_norm": 0.3528128279355359, + "learning_rate": 8.273168297289973e-05, + "loss": 2.8861, + "step": 22265 + }, + { + "epoch": 1.036641292455246, + "grad_norm": 0.3373019340506752, + "learning_rate": 8.272963526555958e-05, + "loss": 2.7765, + "step": 22266 + }, + { + "epoch": 1.036687850641339, + "grad_norm": 0.3224331931206074, + "learning_rate": 8.27275874621612e-05, + "loss": 2.8299, + "step": 22267 + }, + { + "epoch": 1.0367344088274322, + "grad_norm": 0.35104610738639447, + "learning_rate": 8.272553956271057e-05, + "loss": 2.726, + "step": 22268 + }, + { + "epoch": 1.036780967013525, + "grad_norm": 0.3718512862610803, + "learning_rate": 8.27234915672137e-05, + "loss": 2.9274, + "step": 22269 + }, + { + "epoch": 1.0368275251996182, + "grad_norm": 0.353412969241136, + "learning_rate": 8.27214434756766e-05, + "loss": 2.9473, + "step": 22270 + }, + { + "epoch": 1.0368740833857113, + "grad_norm": 0.3241663714394966, + "learning_rate": 8.27193952881053e-05, + "loss": 2.8985, + "step": 22271 + }, + { + "epoch": 1.0369206415718044, + "grad_norm": 0.35648321336087707, + "learning_rate": 8.27173470045058e-05, + "loss": 2.7425, + "step": 22272 + }, + { + "epoch": 1.0369671997578975, + "grad_norm": 0.3191715576697317, + "learning_rate": 8.271529862488411e-05, + "loss": 2.8449, + "step": 22273 + }, + { + "epoch": 1.0370137579439904, + "grad_norm": 0.34186883462299456, + "learning_rate": 8.271325014924623e-05, + "loss": 2.8497, + "step": 22274 + }, + { + "epoch": 1.0370603161300835, + "grad_norm": 0.34518771156356176, + "learning_rate": 8.27112015775982e-05, + "loss": 2.8488, + "step": 22275 + }, + { + "epoch": 1.0371068743161767, + "grad_norm": 0.3112839258866734, + "learning_rate": 8.2709152909946e-05, + "loss": 2.9115, + "step": 22276 + }, + { + "epoch": 
1.0371534325022698, + "grad_norm": 0.3452210050974219, + "learning_rate": 8.270710414629567e-05, + "loss": 2.8051, + "step": 22277 + }, + { + "epoch": 1.0371999906883629, + "grad_norm": 0.3077521753160558, + "learning_rate": 8.27050552866532e-05, + "loss": 2.8099, + "step": 22278 + }, + { + "epoch": 1.0372465488744558, + "grad_norm": 0.36580052821487047, + "learning_rate": 8.270300633102461e-05, + "loss": 2.7801, + "step": 22279 + }, + { + "epoch": 1.037293107060549, + "grad_norm": 0.32673362822325286, + "learning_rate": 8.270095727941594e-05, + "loss": 2.8585, + "step": 22280 + }, + { + "epoch": 1.037339665246642, + "grad_norm": 0.3542631854845262, + "learning_rate": 8.269890813183316e-05, + "loss": 2.8607, + "step": 22281 + }, + { + "epoch": 1.0373862234327351, + "grad_norm": 0.35667404793639584, + "learning_rate": 8.269685888828232e-05, + "loss": 2.8644, + "step": 22282 + }, + { + "epoch": 1.0374327816188282, + "grad_norm": 0.3409884762245974, + "learning_rate": 8.269480954876942e-05, + "loss": 2.7599, + "step": 22283 + }, + { + "epoch": 1.0374793398049211, + "grad_norm": 0.3308189139735994, + "learning_rate": 8.269276011330047e-05, + "loss": 2.7873, + "step": 22284 + }, + { + "epoch": 1.0375258979910142, + "grad_norm": 0.3230243009404296, + "learning_rate": 8.269071058188149e-05, + "loss": 2.8561, + "step": 22285 + }, + { + "epoch": 1.0375724561771074, + "grad_norm": 0.31072485240352965, + "learning_rate": 8.26886609545185e-05, + "loss": 2.9063, + "step": 22286 + }, + { + "epoch": 1.0376190143632005, + "grad_norm": 0.3081430522369242, + "learning_rate": 8.268661123121751e-05, + "loss": 2.8338, + "step": 22287 + }, + { + "epoch": 1.0376655725492934, + "grad_norm": 0.3331510212522646, + "learning_rate": 8.268456141198452e-05, + "loss": 2.89, + "step": 22288 + }, + { + "epoch": 1.0377121307353865, + "grad_norm": 0.3126150830314704, + "learning_rate": 8.268251149682557e-05, + "loss": 2.8754, + "step": 22289 + }, + { + "epoch": 1.0377586889214796, + "grad_norm": 0.3782477159813796, + "learning_rate": 8.268046148574669e-05, + "loss": 2.801, + "step": 22290 + }, + { + "epoch": 1.0378052471075727, + "grad_norm": 0.3188318981369292, + "learning_rate": 8.267841137875385e-05, + "loss": 2.841, + "step": 22291 + }, + { + "epoch": 1.0378518052936658, + "grad_norm": 0.35170678626134033, + "learning_rate": 8.26763611758531e-05, + "loss": 2.8184, + "step": 22292 + }, + { + "epoch": 1.0378983634797587, + "grad_norm": 0.3204309599170917, + "learning_rate": 8.267431087705045e-05, + "loss": 2.8031, + "step": 22293 + }, + { + "epoch": 1.0379449216658518, + "grad_norm": 0.36761054999220694, + "learning_rate": 8.267226048235193e-05, + "loss": 2.7601, + "step": 22294 + }, + { + "epoch": 1.037991479851945, + "grad_norm": 0.3300852242259973, + "learning_rate": 8.267020999176351e-05, + "loss": 2.8456, + "step": 22295 + }, + { + "epoch": 1.038038038038038, + "grad_norm": 0.35302639759748494, + "learning_rate": 8.266815940529127e-05, + "loss": 2.8974, + "step": 22296 + }, + { + "epoch": 1.0380845962241312, + "grad_norm": 0.38255908200429317, + "learning_rate": 8.26661087229412e-05, + "loss": 2.8666, + "step": 22297 + }, + { + "epoch": 1.038131154410224, + "grad_norm": 0.3428100376658651, + "learning_rate": 8.26640579447193e-05, + "loss": 2.8469, + "step": 22298 + }, + { + "epoch": 1.0381777125963172, + "grad_norm": 0.3426599363271273, + "learning_rate": 8.266200707063163e-05, + "loss": 2.8578, + "step": 22299 + }, + { + "epoch": 1.0382242707824103, + "grad_norm": 0.3537271075178595, + "learning_rate": 
8.265995610068417e-05, + "loss": 2.8841, + "step": 22300 + }, + { + "epoch": 1.0382708289685034, + "grad_norm": 0.40097355514539745, + "learning_rate": 8.265790503488298e-05, + "loss": 2.9004, + "step": 22301 + }, + { + "epoch": 1.0383173871545965, + "grad_norm": 0.3211400558982752, + "learning_rate": 8.265585387323403e-05, + "loss": 2.903, + "step": 22302 + }, + { + "epoch": 1.0383639453406894, + "grad_norm": 0.34722499594325607, + "learning_rate": 8.265380261574338e-05, + "loss": 2.8655, + "step": 22303 + }, + { + "epoch": 1.0384105035267825, + "grad_norm": 0.3232785862223469, + "learning_rate": 8.265175126241703e-05, + "loss": 2.7912, + "step": 22304 + }, + { + "epoch": 1.0384570617128757, + "grad_norm": 0.31900598080954135, + "learning_rate": 8.264969981326101e-05, + "loss": 2.8761, + "step": 22305 + }, + { + "epoch": 1.0385036198989688, + "grad_norm": 0.34489988250015136, + "learning_rate": 8.264764826828132e-05, + "loss": 2.8944, + "step": 22306 + }, + { + "epoch": 1.038550178085062, + "grad_norm": 0.3450719985266346, + "learning_rate": 8.264559662748402e-05, + "loss": 2.841, + "step": 22307 + }, + { + "epoch": 1.0385967362711548, + "grad_norm": 0.33247545124486116, + "learning_rate": 8.264354489087511e-05, + "loss": 2.6615, + "step": 22308 + }, + { + "epoch": 1.038643294457248, + "grad_norm": 0.31587927758765705, + "learning_rate": 8.264149305846061e-05, + "loss": 2.7689, + "step": 22309 + }, + { + "epoch": 1.038689852643341, + "grad_norm": 0.3473081651915177, + "learning_rate": 8.263944113024653e-05, + "loss": 2.8308, + "step": 22310 + }, + { + "epoch": 1.0387364108294341, + "grad_norm": 0.325412503802552, + "learning_rate": 8.263738910623892e-05, + "loss": 2.9652, + "step": 22311 + }, + { + "epoch": 1.0387829690155272, + "grad_norm": 0.3654536470052504, + "learning_rate": 8.263533698644379e-05, + "loss": 2.7419, + "step": 22312 + }, + { + "epoch": 1.0388295272016201, + "grad_norm": 0.34538917300956923, + "learning_rate": 8.263328477086716e-05, + "loss": 2.7491, + "step": 22313 + }, + { + "epoch": 1.0388760853877133, + "grad_norm": 0.33366328353550073, + "learning_rate": 8.263123245951505e-05, + "loss": 2.8436, + "step": 22314 + }, + { + "epoch": 1.0389226435738064, + "grad_norm": 0.3397315742699905, + "learning_rate": 8.262918005239348e-05, + "loss": 2.8402, + "step": 22315 + }, + { + "epoch": 1.0389692017598995, + "grad_norm": 0.32538490905596773, + "learning_rate": 8.262712754950848e-05, + "loss": 2.6812, + "step": 22316 + }, + { + "epoch": 1.0390157599459926, + "grad_norm": 0.315415405887383, + "learning_rate": 8.262507495086609e-05, + "loss": 2.8122, + "step": 22317 + }, + { + "epoch": 1.0390623181320855, + "grad_norm": 0.3447683725102274, + "learning_rate": 8.262302225647231e-05, + "loss": 2.7685, + "step": 22318 + }, + { + "epoch": 1.0391088763181786, + "grad_norm": 0.3262305168786376, + "learning_rate": 8.262096946633317e-05, + "loss": 2.8142, + "step": 22319 + }, + { + "epoch": 1.0391554345042717, + "grad_norm": 0.32352140431141524, + "learning_rate": 8.261891658045471e-05, + "loss": 2.8598, + "step": 22320 + }, + { + "epoch": 1.0392019926903648, + "grad_norm": 0.3645472507158894, + "learning_rate": 8.261686359884293e-05, + "loss": 2.7511, + "step": 22321 + }, + { + "epoch": 1.039248550876458, + "grad_norm": 0.352831899666089, + "learning_rate": 8.261481052150388e-05, + "loss": 2.8888, + "step": 22322 + }, + { + "epoch": 1.0392951090625508, + "grad_norm": 0.3819476276866832, + "learning_rate": 8.261275734844358e-05, + "loss": 2.9013, + "step": 22323 + }, + { + "epoch": 
1.039341667248644, + "grad_norm": 0.383700009355152, + "learning_rate": 8.261070407966802e-05, + "loss": 2.8458, + "step": 22324 + }, + { + "epoch": 1.039388225434737, + "grad_norm": 0.3545763528816318, + "learning_rate": 8.26086507151833e-05, + "loss": 2.7427, + "step": 22325 + }, + { + "epoch": 1.0394347836208302, + "grad_norm": 0.366365093694944, + "learning_rate": 8.260659725499537e-05, + "loss": 2.7602, + "step": 22326 + }, + { + "epoch": 1.0394813418069233, + "grad_norm": 0.34372518063939733, + "learning_rate": 8.260454369911031e-05, + "loss": 2.8372, + "step": 22327 + }, + { + "epoch": 1.0395278999930162, + "grad_norm": 0.38329438265261684, + "learning_rate": 8.260249004753409e-05, + "loss": 2.8202, + "step": 22328 + }, + { + "epoch": 1.0395744581791093, + "grad_norm": 0.32154940018360556, + "learning_rate": 8.260043630027281e-05, + "loss": 2.8838, + "step": 22329 + }, + { + "epoch": 1.0396210163652024, + "grad_norm": 0.34715705988807094, + "learning_rate": 8.259838245733244e-05, + "loss": 2.8619, + "step": 22330 + }, + { + "epoch": 1.0396675745512955, + "grad_norm": 0.3336024634055465, + "learning_rate": 8.259632851871904e-05, + "loss": 2.8149, + "step": 22331 + }, + { + "epoch": 1.0397141327373887, + "grad_norm": 0.33226776031146743, + "learning_rate": 8.259427448443863e-05, + "loss": 2.8167, + "step": 22332 + }, + { + "epoch": 1.0397606909234816, + "grad_norm": 0.36117282653409183, + "learning_rate": 8.259222035449723e-05, + "loss": 2.7629, + "step": 22333 + }, + { + "epoch": 1.0398072491095747, + "grad_norm": 0.3353941209133302, + "learning_rate": 8.259016612890087e-05, + "loss": 2.864, + "step": 22334 + }, + { + "epoch": 1.0398538072956678, + "grad_norm": 0.3726647077565098, + "learning_rate": 8.25881118076556e-05, + "loss": 2.8322, + "step": 22335 + }, + { + "epoch": 1.039900365481761, + "grad_norm": 0.3077180010398886, + "learning_rate": 8.258605739076742e-05, + "loss": 2.828, + "step": 22336 + }, + { + "epoch": 1.0399469236678538, + "grad_norm": 0.35980450041236006, + "learning_rate": 8.258400287824237e-05, + "loss": 2.7779, + "step": 22337 + }, + { + "epoch": 1.039993481853947, + "grad_norm": 0.295151908609422, + "learning_rate": 8.258194827008648e-05, + "loss": 2.8981, + "step": 22338 + }, + { + "epoch": 1.04004004004004, + "grad_norm": 0.35355089439778437, + "learning_rate": 8.257989356630578e-05, + "loss": 2.926, + "step": 22339 + }, + { + "epoch": 1.0400865982261331, + "grad_norm": 0.3217823142482882, + "learning_rate": 8.257783876690632e-05, + "loss": 2.8628, + "step": 22340 + }, + { + "epoch": 1.0401331564122263, + "grad_norm": 0.32401395342087613, + "learning_rate": 8.25757838718941e-05, + "loss": 2.9042, + "step": 22341 + }, + { + "epoch": 1.0401797145983191, + "grad_norm": 0.3553670327668361, + "learning_rate": 8.257372888127516e-05, + "loss": 2.8507, + "step": 22342 + }, + { + "epoch": 1.0402262727844123, + "grad_norm": 0.3234768707940971, + "learning_rate": 8.257167379505552e-05, + "loss": 2.7542, + "step": 22343 + }, + { + "epoch": 1.0402728309705054, + "grad_norm": 0.33208047867131685, + "learning_rate": 8.256961861324127e-05, + "loss": 2.845, + "step": 22344 + }, + { + "epoch": 1.0403193891565985, + "grad_norm": 0.32822508394452266, + "learning_rate": 8.256756333583837e-05, + "loss": 2.6919, + "step": 22345 + }, + { + "epoch": 1.0403659473426916, + "grad_norm": 0.3594444509008107, + "learning_rate": 8.256550796285287e-05, + "loss": 2.7732, + "step": 22346 + }, + { + "epoch": 1.0404125055287845, + "grad_norm": 0.33160299839314783, + "learning_rate": 
8.256345249429083e-05, + "loss": 2.8691, + "step": 22347 + }, + { + "epoch": 1.0404590637148776, + "grad_norm": 0.3525539976217166, + "learning_rate": 8.256139693015827e-05, + "loss": 2.8815, + "step": 22348 + }, + { + "epoch": 1.0405056219009707, + "grad_norm": 0.32796570234179834, + "learning_rate": 8.255934127046121e-05, + "loss": 2.8288, + "step": 22349 + }, + { + "epoch": 1.0405521800870638, + "grad_norm": 0.3530170451127392, + "learning_rate": 8.255728551520568e-05, + "loss": 2.7882, + "step": 22350 + }, + { + "epoch": 1.040598738273157, + "grad_norm": 0.3298934457790814, + "learning_rate": 8.255522966439773e-05, + "loss": 2.9358, + "step": 22351 + }, + { + "epoch": 1.0406452964592499, + "grad_norm": 0.3621141533654242, + "learning_rate": 8.255317371804339e-05, + "loss": 2.7121, + "step": 22352 + }, + { + "epoch": 1.040691854645343, + "grad_norm": 0.3386356613547266, + "learning_rate": 8.255111767614871e-05, + "loss": 2.8809, + "step": 22353 + }, + { + "epoch": 1.040738412831436, + "grad_norm": 0.34704627694657963, + "learning_rate": 8.254906153871969e-05, + "loss": 2.8663, + "step": 22354 + }, + { + "epoch": 1.0407849710175292, + "grad_norm": 0.3594799670633545, + "learning_rate": 8.254700530576237e-05, + "loss": 2.8175, + "step": 22355 + }, + { + "epoch": 1.0408315292036223, + "grad_norm": 0.3481800900356068, + "learning_rate": 8.25449489772828e-05, + "loss": 2.8248, + "step": 22356 + }, + { + "epoch": 1.0408780873897152, + "grad_norm": 0.3666836471617029, + "learning_rate": 8.254289255328703e-05, + "loss": 2.8162, + "step": 22357 + }, + { + "epoch": 1.0409246455758083, + "grad_norm": 0.35694213573915123, + "learning_rate": 8.254083603378107e-05, + "loss": 2.776, + "step": 22358 + }, + { + "epoch": 1.0409712037619014, + "grad_norm": 0.3405875782373081, + "learning_rate": 8.253877941877095e-05, + "loss": 2.8127, + "step": 22359 + }, + { + "epoch": 1.0410177619479946, + "grad_norm": 0.3575594776507431, + "learning_rate": 8.253672270826271e-05, + "loss": 2.8881, + "step": 22360 + }, + { + "epoch": 1.0410643201340877, + "grad_norm": 0.35278324348714835, + "learning_rate": 8.253466590226241e-05, + "loss": 2.7955, + "step": 22361 + }, + { + "epoch": 1.0411108783201806, + "grad_norm": 0.3759471905229173, + "learning_rate": 8.253260900077608e-05, + "loss": 2.8451, + "step": 22362 + }, + { + "epoch": 1.0411574365062737, + "grad_norm": 0.34235052556726175, + "learning_rate": 8.253055200380974e-05, + "loss": 2.8058, + "step": 22363 + }, + { + "epoch": 1.0412039946923668, + "grad_norm": 0.3789325536790865, + "learning_rate": 8.252849491136942e-05, + "loss": 2.8594, + "step": 22364 + }, + { + "epoch": 1.04125055287846, + "grad_norm": 0.36516276710854767, + "learning_rate": 8.252643772346119e-05, + "loss": 2.73, + "step": 22365 + }, + { + "epoch": 1.041297111064553, + "grad_norm": 0.34763545743306806, + "learning_rate": 8.252438044009108e-05, + "loss": 2.8082, + "step": 22366 + }, + { + "epoch": 1.041343669250646, + "grad_norm": 0.3901008083307801, + "learning_rate": 8.252232306126508e-05, + "loss": 2.8797, + "step": 22367 + }, + { + "epoch": 1.041390227436739, + "grad_norm": 0.31754307791972886, + "learning_rate": 8.25202655869893e-05, + "loss": 2.8198, + "step": 22368 + }, + { + "epoch": 1.0414367856228322, + "grad_norm": 0.35518885756845187, + "learning_rate": 8.251820801726972e-05, + "loss": 2.7937, + "step": 22369 + }, + { + "epoch": 1.0414833438089253, + "grad_norm": 0.3216277137372914, + "learning_rate": 8.251615035211241e-05, + "loss": 2.7748, + "step": 22370 + }, + { + "epoch": 
1.0415299019950184, + "grad_norm": 0.37453646061531576, + "learning_rate": 8.25140925915234e-05, + "loss": 2.9686, + "step": 22371 + }, + { + "epoch": 1.0415764601811113, + "grad_norm": 0.31336246814434965, + "learning_rate": 8.251203473550875e-05, + "loss": 2.9414, + "step": 22372 + }, + { + "epoch": 1.0416230183672044, + "grad_norm": 0.32235302565394996, + "learning_rate": 8.250997678407447e-05, + "loss": 2.7978, + "step": 22373 + }, + { + "epoch": 1.0416695765532975, + "grad_norm": 0.3441736958040432, + "learning_rate": 8.250791873722661e-05, + "loss": 2.8107, + "step": 22374 + }, + { + "epoch": 1.0417161347393906, + "grad_norm": 0.3114826694178305, + "learning_rate": 8.25058605949712e-05, + "loss": 2.9145, + "step": 22375 + }, + { + "epoch": 1.0417626929254835, + "grad_norm": 0.3576196018252522, + "learning_rate": 8.250380235731432e-05, + "loss": 2.6924, + "step": 22376 + }, + { + "epoch": 1.0418092511115766, + "grad_norm": 0.3238632501397246, + "learning_rate": 8.250174402426196e-05, + "loss": 2.8841, + "step": 22377 + }, + { + "epoch": 1.0418558092976697, + "grad_norm": 0.3804674148640694, + "learning_rate": 8.249968559582017e-05, + "loss": 2.8254, + "step": 22378 + }, + { + "epoch": 1.0419023674837629, + "grad_norm": 0.33336754190552625, + "learning_rate": 8.249762707199503e-05, + "loss": 2.8299, + "step": 22379 + }, + { + "epoch": 1.041948925669856, + "grad_norm": 0.39479143650732645, + "learning_rate": 8.249556845279256e-05, + "loss": 2.903, + "step": 22380 + }, + { + "epoch": 1.0419954838559489, + "grad_norm": 0.3355052514406129, + "learning_rate": 8.249350973821878e-05, + "loss": 2.8643, + "step": 22381 + }, + { + "epoch": 1.042042042042042, + "grad_norm": 0.3409342544551718, + "learning_rate": 8.249145092827976e-05, + "loss": 2.8972, + "step": 22382 + }, + { + "epoch": 1.042088600228135, + "grad_norm": 0.34943619842046325, + "learning_rate": 8.248939202298153e-05, + "loss": 2.8892, + "step": 22383 + }, + { + "epoch": 1.0421351584142282, + "grad_norm": 0.33475456390735825, + "learning_rate": 8.248733302233014e-05, + "loss": 2.9035, + "step": 22384 + }, + { + "epoch": 1.0421817166003213, + "grad_norm": 0.336857305065677, + "learning_rate": 8.24852739263316e-05, + "loss": 2.8915, + "step": 22385 + }, + { + "epoch": 1.0422282747864142, + "grad_norm": 0.34085739643065843, + "learning_rate": 8.248321473499201e-05, + "loss": 2.8274, + "step": 22386 + }, + { + "epoch": 1.0422748329725073, + "grad_norm": 0.31509387036075287, + "learning_rate": 8.248115544831738e-05, + "loss": 2.8587, + "step": 22387 + }, + { + "epoch": 1.0423213911586005, + "grad_norm": 0.37685267211580453, + "learning_rate": 8.247909606631375e-05, + "loss": 2.8695, + "step": 22388 + }, + { + "epoch": 1.0423679493446936, + "grad_norm": 0.32767307182346456, + "learning_rate": 8.247703658898719e-05, + "loss": 2.8464, + "step": 22389 + }, + { + "epoch": 1.0424145075307867, + "grad_norm": 0.39655701206304644, + "learning_rate": 8.24749770163437e-05, + "loss": 2.7294, + "step": 22390 + }, + { + "epoch": 1.0424610657168796, + "grad_norm": 0.3343791273298634, + "learning_rate": 8.247291734838938e-05, + "loss": 2.8743, + "step": 22391 + }, + { + "epoch": 1.0425076239029727, + "grad_norm": 0.34287292876639863, + "learning_rate": 8.247085758513024e-05, + "loss": 2.9157, + "step": 22392 + }, + { + "epoch": 1.0425541820890658, + "grad_norm": 0.3358884753528925, + "learning_rate": 8.246879772657232e-05, + "loss": 2.8207, + "step": 22393 + }, + { + "epoch": 1.042600740275159, + "grad_norm": 0.31679368684840525, + "learning_rate": 
8.246673777272169e-05, + "loss": 2.7246, + "step": 22394 + }, + { + "epoch": 1.042647298461252, + "grad_norm": 0.3476279385317482, + "learning_rate": 8.246467772358437e-05, + "loss": 2.8458, + "step": 22395 + }, + { + "epoch": 1.042693856647345, + "grad_norm": 0.3239963517318185, + "learning_rate": 8.246261757916643e-05, + "loss": 2.863, + "step": 22396 + }, + { + "epoch": 1.042740414833438, + "grad_norm": 0.31661067771826956, + "learning_rate": 8.246055733947391e-05, + "loss": 2.8396, + "step": 22397 + }, + { + "epoch": 1.0427869730195312, + "grad_norm": 0.30626911100432097, + "learning_rate": 8.245849700451285e-05, + "loss": 2.8081, + "step": 22398 + }, + { + "epoch": 1.0428335312056243, + "grad_norm": 0.3288404210883915, + "learning_rate": 8.245643657428929e-05, + "loss": 2.8275, + "step": 22399 + }, + { + "epoch": 1.0428800893917174, + "grad_norm": 0.3149583656237434, + "learning_rate": 8.245437604880927e-05, + "loss": 2.769, + "step": 22400 + }, + { + "epoch": 1.0429266475778103, + "grad_norm": 0.3081072235207099, + "learning_rate": 8.245231542807888e-05, + "loss": 2.8605, + "step": 22401 + }, + { + "epoch": 1.0429732057639034, + "grad_norm": 0.3225572073919768, + "learning_rate": 8.245025471210415e-05, + "loss": 2.8329, + "step": 22402 + }, + { + "epoch": 1.0430197639499965, + "grad_norm": 0.3486027729803392, + "learning_rate": 8.244819390089109e-05, + "loss": 2.894, + "step": 22403 + }, + { + "epoch": 1.0430663221360896, + "grad_norm": 0.30959437571346343, + "learning_rate": 8.24461329944458e-05, + "loss": 2.8582, + "step": 22404 + }, + { + "epoch": 1.0431128803221827, + "grad_norm": 0.34069867515765306, + "learning_rate": 8.24440719927743e-05, + "loss": 2.9349, + "step": 22405 + }, + { + "epoch": 1.0431594385082756, + "grad_norm": 0.2974150488740631, + "learning_rate": 8.244201089588263e-05, + "loss": 2.8976, + "step": 22406 + }, + { + "epoch": 1.0432059966943688, + "grad_norm": 0.3458410980693287, + "learning_rate": 8.243994970377687e-05, + "loss": 2.8277, + "step": 22407 + }, + { + "epoch": 1.0432525548804619, + "grad_norm": 0.3099262997325864, + "learning_rate": 8.243788841646305e-05, + "loss": 2.9553, + "step": 22408 + }, + { + "epoch": 1.043299113066555, + "grad_norm": 0.33128777347686034, + "learning_rate": 8.243582703394723e-05, + "loss": 2.7834, + "step": 22409 + }, + { + "epoch": 1.043345671252648, + "grad_norm": 0.32144331019029093, + "learning_rate": 8.243376555623544e-05, + "loss": 2.905, + "step": 22410 + }, + { + "epoch": 1.043392229438741, + "grad_norm": 0.35423285852059766, + "learning_rate": 8.243170398333376e-05, + "loss": 2.7656, + "step": 22411 + }, + { + "epoch": 1.043438787624834, + "grad_norm": 0.2943355043969449, + "learning_rate": 8.242964231524821e-05, + "loss": 2.7955, + "step": 22412 + }, + { + "epoch": 1.0434853458109272, + "grad_norm": 0.34574912004807956, + "learning_rate": 8.242758055198485e-05, + "loss": 2.8377, + "step": 22413 + }, + { + "epoch": 1.0435319039970203, + "grad_norm": 0.31465815510632467, + "learning_rate": 8.242551869354975e-05, + "loss": 2.7979, + "step": 22414 + }, + { + "epoch": 1.0435784621831135, + "grad_norm": 0.3538479818455859, + "learning_rate": 8.242345673994893e-05, + "loss": 2.8226, + "step": 22415 + }, + { + "epoch": 1.0436250203692063, + "grad_norm": 0.321402814874127, + "learning_rate": 8.242139469118848e-05, + "loss": 2.8613, + "step": 22416 + }, + { + "epoch": 1.0436715785552995, + "grad_norm": 0.35397749542098345, + "learning_rate": 8.241933254727442e-05, + "loss": 2.881, + "step": 22417 + }, + { + "epoch": 
1.0437181367413926, + "grad_norm": 0.33890337740229576, + "learning_rate": 8.241727030821282e-05, + "loss": 2.9201, + "step": 22418 + }, + { + "epoch": 1.0437646949274857, + "grad_norm": 0.33609532560019145, + "learning_rate": 8.241520797400971e-05, + "loss": 2.8963, + "step": 22419 + }, + { + "epoch": 1.0438112531135788, + "grad_norm": 0.35632610088159666, + "learning_rate": 8.241314554467116e-05, + "loss": 2.9138, + "step": 22420 + }, + { + "epoch": 1.0438578112996717, + "grad_norm": 0.34308963473699994, + "learning_rate": 8.241108302020324e-05, + "loss": 2.9179, + "step": 22421 + }, + { + "epoch": 1.0439043694857648, + "grad_norm": 0.3311707030102168, + "learning_rate": 8.240902040061198e-05, + "loss": 2.7666, + "step": 22422 + }, + { + "epoch": 1.043950927671858, + "grad_norm": 0.3601459801154229, + "learning_rate": 8.240695768590342e-05, + "loss": 2.7632, + "step": 22423 + }, + { + "epoch": 1.043997485857951, + "grad_norm": 0.33798863140186597, + "learning_rate": 8.240489487608364e-05, + "loss": 2.8483, + "step": 22424 + }, + { + "epoch": 1.044044044044044, + "grad_norm": 0.33839804925031775, + "learning_rate": 8.240283197115869e-05, + "loss": 2.8016, + "step": 22425 + }, + { + "epoch": 1.044090602230137, + "grad_norm": 0.31346624335896295, + "learning_rate": 8.240076897113461e-05, + "loss": 2.9388, + "step": 22426 + }, + { + "epoch": 1.0441371604162302, + "grad_norm": 0.33546801214616917, + "learning_rate": 8.239870587601747e-05, + "loss": 2.8477, + "step": 22427 + }, + { + "epoch": 1.0441837186023233, + "grad_norm": 0.31238222504919644, + "learning_rate": 8.239664268581333e-05, + "loss": 2.9046, + "step": 22428 + }, + { + "epoch": 1.0442302767884164, + "grad_norm": 0.3297932633330722, + "learning_rate": 8.239457940052823e-05, + "loss": 2.8057, + "step": 22429 + }, + { + "epoch": 1.0442768349745093, + "grad_norm": 0.32280041364663487, + "learning_rate": 8.239251602016823e-05, + "loss": 2.7534, + "step": 22430 + }, + { + "epoch": 1.0443233931606024, + "grad_norm": 0.3224971774165453, + "learning_rate": 8.239045254473938e-05, + "loss": 2.8679, + "step": 22431 + }, + { + "epoch": 1.0443699513466955, + "grad_norm": 0.3472626947665112, + "learning_rate": 8.238838897424775e-05, + "loss": 2.884, + "step": 22432 + }, + { + "epoch": 1.0444165095327886, + "grad_norm": 0.34822095695886185, + "learning_rate": 8.238632530869939e-05, + "loss": 2.8467, + "step": 22433 + }, + { + "epoch": 1.0444630677188818, + "grad_norm": 0.3451631756564732, + "learning_rate": 8.238426154810035e-05, + "loss": 2.8631, + "step": 22434 + }, + { + "epoch": 1.0445096259049746, + "grad_norm": 0.3842531914993933, + "learning_rate": 8.23821976924567e-05, + "loss": 2.845, + "step": 22435 + }, + { + "epoch": 1.0445561840910678, + "grad_norm": 0.32041099680155116, + "learning_rate": 8.238013374177448e-05, + "loss": 2.8012, + "step": 22436 + }, + { + "epoch": 1.0446027422771609, + "grad_norm": 0.35261583865010027, + "learning_rate": 8.237806969605977e-05, + "loss": 2.8702, + "step": 22437 + }, + { + "epoch": 1.044649300463254, + "grad_norm": 0.34090883885575196, + "learning_rate": 8.237600555531861e-05, + "loss": 2.8342, + "step": 22438 + }, + { + "epoch": 1.044695858649347, + "grad_norm": 0.3180313757394135, + "learning_rate": 8.237394131955707e-05, + "loss": 2.875, + "step": 22439 + }, + { + "epoch": 1.04474241683544, + "grad_norm": 0.34774694222666025, + "learning_rate": 8.237187698878118e-05, + "loss": 2.7403, + "step": 22440 + }, + { + "epoch": 1.0447889750215331, + "grad_norm": 0.37792770817430515, + "learning_rate": 
8.236981256299704e-05, + "loss": 2.8581, + "step": 22441 + }, + { + "epoch": 1.0448355332076262, + "grad_norm": 0.39165570665051225, + "learning_rate": 8.236774804221069e-05, + "loss": 2.8737, + "step": 22442 + }, + { + "epoch": 1.0448820913937193, + "grad_norm": 0.3566280863784794, + "learning_rate": 8.236568342642818e-05, + "loss": 2.8106, + "step": 22443 + }, + { + "epoch": 1.0449286495798125, + "grad_norm": 0.34358565282735554, + "learning_rate": 8.236361871565557e-05, + "loss": 2.7925, + "step": 22444 + }, + { + "epoch": 1.0449752077659054, + "grad_norm": 0.34340338533781045, + "learning_rate": 8.236155390989894e-05, + "loss": 2.9224, + "step": 22445 + }, + { + "epoch": 1.0450217659519985, + "grad_norm": 0.3396438560121098, + "learning_rate": 8.235948900916433e-05, + "loss": 2.8967, + "step": 22446 + }, + { + "epoch": 1.0450683241380916, + "grad_norm": 0.35687603828064324, + "learning_rate": 8.235742401345782e-05, + "loss": 2.9124, + "step": 22447 + }, + { + "epoch": 1.0451148823241847, + "grad_norm": 0.38142468299176163, + "learning_rate": 8.235535892278545e-05, + "loss": 2.9635, + "step": 22448 + }, + { + "epoch": 1.0451614405102778, + "grad_norm": 0.33235170879518317, + "learning_rate": 8.235329373715328e-05, + "loss": 2.8707, + "step": 22449 + }, + { + "epoch": 1.0452079986963707, + "grad_norm": 0.37205317703969093, + "learning_rate": 8.23512284565674e-05, + "loss": 2.8903, + "step": 22450 + }, + { + "epoch": 1.0452545568824638, + "grad_norm": 0.3487084320736098, + "learning_rate": 8.234916308103383e-05, + "loss": 2.8768, + "step": 22451 + }, + { + "epoch": 1.045301115068557, + "grad_norm": 0.3382975229560781, + "learning_rate": 8.234709761055868e-05, + "loss": 2.8002, + "step": 22452 + }, + { + "epoch": 1.04534767325465, + "grad_norm": 0.3304323925933438, + "learning_rate": 8.234503204514796e-05, + "loss": 2.8159, + "step": 22453 + }, + { + "epoch": 1.0453942314407432, + "grad_norm": 0.33735300597521845, + "learning_rate": 8.234296638480777e-05, + "loss": 2.8213, + "step": 22454 + }, + { + "epoch": 1.045440789626836, + "grad_norm": 0.32466142429552425, + "learning_rate": 8.234090062954414e-05, + "loss": 2.8339, + "step": 22455 + }, + { + "epoch": 1.0454873478129292, + "grad_norm": 0.3252722330174641, + "learning_rate": 8.233883477936319e-05, + "loss": 2.7881, + "step": 22456 + }, + { + "epoch": 1.0455339059990223, + "grad_norm": 0.3115540183468381, + "learning_rate": 8.233676883427091e-05, + "loss": 2.7695, + "step": 22457 + }, + { + "epoch": 1.0455804641851154, + "grad_norm": 0.32837002700815626, + "learning_rate": 8.233470279427342e-05, + "loss": 2.7433, + "step": 22458 + }, + { + "epoch": 1.0456270223712085, + "grad_norm": 0.30889842310084265, + "learning_rate": 8.233263665937676e-05, + "loss": 2.8195, + "step": 22459 + }, + { + "epoch": 1.0456735805573014, + "grad_norm": 0.3317372258909906, + "learning_rate": 8.233057042958698e-05, + "loss": 2.8321, + "step": 22460 + }, + { + "epoch": 1.0457201387433945, + "grad_norm": 0.34391829127113244, + "learning_rate": 8.232850410491019e-05, + "loss": 2.845, + "step": 22461 + }, + { + "epoch": 1.0457666969294876, + "grad_norm": 0.3460444298713915, + "learning_rate": 8.232643768535241e-05, + "loss": 2.8809, + "step": 22462 + }, + { + "epoch": 1.0458132551155808, + "grad_norm": 0.3646958798821745, + "learning_rate": 8.232437117091972e-05, + "loss": 2.8077, + "step": 22463 + }, + { + "epoch": 1.0458598133016737, + "grad_norm": 0.31680381611174274, + "learning_rate": 8.232230456161818e-05, + "loss": 2.7589, + "step": 22464 + }, + { + 
"epoch": 1.0459063714877668, + "grad_norm": 0.3670082228851265, + "learning_rate": 8.232023785745386e-05, + "loss": 2.851, + "step": 22465 + }, + { + "epoch": 1.0459529296738599, + "grad_norm": 0.3337506421909298, + "learning_rate": 8.231817105843283e-05, + "loss": 2.809, + "step": 22466 + }, + { + "epoch": 1.045999487859953, + "grad_norm": 0.3435304556166751, + "learning_rate": 8.231610416456118e-05, + "loss": 2.89, + "step": 22467 + }, + { + "epoch": 1.0460460460460461, + "grad_norm": 0.325067911598013, + "learning_rate": 8.23140371758449e-05, + "loss": 2.7882, + "step": 22468 + }, + { + "epoch": 1.046092604232139, + "grad_norm": 0.3570651196174715, + "learning_rate": 8.231197009229013e-05, + "loss": 2.821, + "step": 22469 + }, + { + "epoch": 1.0461391624182321, + "grad_norm": 0.3028359002074005, + "learning_rate": 8.230990291390291e-05, + "loss": 2.9438, + "step": 22470 + }, + { + "epoch": 1.0461857206043252, + "grad_norm": 0.3489355536998566, + "learning_rate": 8.23078356406893e-05, + "loss": 2.8901, + "step": 22471 + }, + { + "epoch": 1.0462322787904184, + "grad_norm": 0.32464519034533007, + "learning_rate": 8.230576827265537e-05, + "loss": 2.8073, + "step": 22472 + }, + { + "epoch": 1.0462788369765115, + "grad_norm": 0.32942487580411084, + "learning_rate": 8.23037008098072e-05, + "loss": 2.865, + "step": 22473 + }, + { + "epoch": 1.0463253951626044, + "grad_norm": 0.32501495195564867, + "learning_rate": 8.230163325215087e-05, + "loss": 2.8515, + "step": 22474 + }, + { + "epoch": 1.0463719533486975, + "grad_norm": 0.33811126757968474, + "learning_rate": 8.229956559969241e-05, + "loss": 2.9548, + "step": 22475 + }, + { + "epoch": 1.0464185115347906, + "grad_norm": 0.32272435349989353, + "learning_rate": 8.22974978524379e-05, + "loss": 2.8556, + "step": 22476 + }, + { + "epoch": 1.0464650697208837, + "grad_norm": 0.34561448464223815, + "learning_rate": 8.229543001039343e-05, + "loss": 2.8658, + "step": 22477 + }, + { + "epoch": 1.0465116279069768, + "grad_norm": 0.2984123714640672, + "learning_rate": 8.229336207356504e-05, + "loss": 2.8522, + "step": 22478 + }, + { + "epoch": 1.0465581860930697, + "grad_norm": 0.34048912501144585, + "learning_rate": 8.229129404195883e-05, + "loss": 2.8919, + "step": 22479 + }, + { + "epoch": 1.0466047442791628, + "grad_norm": 0.3327768740638541, + "learning_rate": 8.228922591558086e-05, + "loss": 2.8414, + "step": 22480 + }, + { + "epoch": 1.046651302465256, + "grad_norm": 0.31070855331992125, + "learning_rate": 8.228715769443717e-05, + "loss": 2.7824, + "step": 22481 + }, + { + "epoch": 1.046697860651349, + "grad_norm": 0.371477053041324, + "learning_rate": 8.228508937853385e-05, + "loss": 2.826, + "step": 22482 + }, + { + "epoch": 1.0467444188374422, + "grad_norm": 0.34399001292896164, + "learning_rate": 8.2283020967877e-05, + "loss": 2.8599, + "step": 22483 + }, + { + "epoch": 1.046790977023535, + "grad_norm": 0.3571986805046523, + "learning_rate": 8.228095246247265e-05, + "loss": 2.9119, + "step": 22484 + }, + { + "epoch": 1.0468375352096282, + "grad_norm": 0.3278205215032985, + "learning_rate": 8.227888386232688e-05, + "loss": 2.9309, + "step": 22485 + }, + { + "epoch": 1.0468840933957213, + "grad_norm": 0.35245758961579476, + "learning_rate": 8.227681516744576e-05, + "loss": 2.9269, + "step": 22486 + }, + { + "epoch": 1.0469306515818144, + "grad_norm": 0.3648175211809506, + "learning_rate": 8.227474637783538e-05, + "loss": 2.8456, + "step": 22487 + }, + { + "epoch": 1.0469772097679075, + "grad_norm": 0.34406019246436254, + "learning_rate": 
8.22726774935018e-05, + "loss": 2.8016, + "step": 22488 + }, + { + "epoch": 1.0470237679540004, + "grad_norm": 0.3632056569458706, + "learning_rate": 8.227060851445109e-05, + "loss": 2.9074, + "step": 22489 + }, + { + "epoch": 1.0470703261400935, + "grad_norm": 0.3499339738853222, + "learning_rate": 8.226853944068932e-05, + "loss": 2.9132, + "step": 22490 + }, + { + "epoch": 1.0471168843261867, + "grad_norm": 0.3708610203344141, + "learning_rate": 8.226647027222256e-05, + "loss": 2.869, + "step": 22491 + }, + { + "epoch": 1.0471634425122798, + "grad_norm": 0.36855736416959606, + "learning_rate": 8.22644010090569e-05, + "loss": 2.8997, + "step": 22492 + }, + { + "epoch": 1.047210000698373, + "grad_norm": 0.34383313896396006, + "learning_rate": 8.22623316511984e-05, + "loss": 2.9022, + "step": 22493 + }, + { + "epoch": 1.0472565588844658, + "grad_norm": 0.34162215691428344, + "learning_rate": 8.226026219865313e-05, + "loss": 2.886, + "step": 22494 + }, + { + "epoch": 1.047303117070559, + "grad_norm": 0.3616004633545826, + "learning_rate": 8.225819265142717e-05, + "loss": 2.8066, + "step": 22495 + }, + { + "epoch": 1.047349675256652, + "grad_norm": 0.34178382984604205, + "learning_rate": 8.225612300952659e-05, + "loss": 2.7675, + "step": 22496 + }, + { + "epoch": 1.0473962334427451, + "grad_norm": 0.3355368170947035, + "learning_rate": 8.225405327295747e-05, + "loss": 2.8891, + "step": 22497 + }, + { + "epoch": 1.0474427916288382, + "grad_norm": 0.3677492780948015, + "learning_rate": 8.225198344172588e-05, + "loss": 2.7193, + "step": 22498 + }, + { + "epoch": 1.0474893498149311, + "grad_norm": 0.3490135892765779, + "learning_rate": 8.224991351583789e-05, + "loss": 2.8951, + "step": 22499 + }, + { + "epoch": 1.0475359080010243, + "grad_norm": 0.3692935139543481, + "learning_rate": 8.22478434952996e-05, + "loss": 2.9019, + "step": 22500 + }, + { + "epoch": 1.0475824661871174, + "grad_norm": 0.3861639034016546, + "learning_rate": 8.224577338011704e-05, + "loss": 2.8185, + "step": 22501 + }, + { + "epoch": 1.0476290243732105, + "grad_norm": 0.3705767547446809, + "learning_rate": 8.224370317029633e-05, + "loss": 2.6879, + "step": 22502 + }, + { + "epoch": 1.0476755825593034, + "grad_norm": 0.35278637779859073, + "learning_rate": 8.224163286584353e-05, + "loss": 2.7981, + "step": 22503 + }, + { + "epoch": 1.0477221407453965, + "grad_norm": 0.3933263837798705, + "learning_rate": 8.22395624667647e-05, + "loss": 2.9529, + "step": 22504 + }, + { + "epoch": 1.0477686989314896, + "grad_norm": 0.3889436936769089, + "learning_rate": 8.223749197306594e-05, + "loss": 2.7758, + "step": 22505 + }, + { + "epoch": 1.0478152571175827, + "grad_norm": 0.37927763568210876, + "learning_rate": 8.22354213847533e-05, + "loss": 2.8969, + "step": 22506 + }, + { + "epoch": 1.0478618153036758, + "grad_norm": 0.35915466182225675, + "learning_rate": 8.22333507018329e-05, + "loss": 2.8633, + "step": 22507 + }, + { + "epoch": 1.047908373489769, + "grad_norm": 0.36627526582911124, + "learning_rate": 8.223127992431077e-05, + "loss": 2.8327, + "step": 22508 + }, + { + "epoch": 1.0479549316758618, + "grad_norm": 0.3692043912169619, + "learning_rate": 8.222920905219303e-05, + "loss": 2.6631, + "step": 22509 + }, + { + "epoch": 1.048001489861955, + "grad_norm": 0.35121298465063744, + "learning_rate": 8.222713808548572e-05, + "loss": 2.8121, + "step": 22510 + }, + { + "epoch": 1.048048048048048, + "grad_norm": 0.3516363552749533, + "learning_rate": 8.222506702419494e-05, + "loss": 2.8252, + "step": 22511 + }, + { + "epoch": 
1.0480946062341412, + "grad_norm": 0.36680422988780403, + "learning_rate": 8.222299586832675e-05, + "loss": 2.8067, + "step": 22512 + }, + { + "epoch": 1.048141164420234, + "grad_norm": 0.3097782648852382, + "learning_rate": 8.222092461788727e-05, + "loss": 2.8351, + "step": 22513 + }, + { + "epoch": 1.0481877226063272, + "grad_norm": 0.37476634442923284, + "learning_rate": 8.221885327288252e-05, + "loss": 2.8351, + "step": 22514 + }, + { + "epoch": 1.0482342807924203, + "grad_norm": 0.3164586078128953, + "learning_rate": 8.221678183331864e-05, + "loss": 2.9117, + "step": 22515 + }, + { + "epoch": 1.0482808389785134, + "grad_norm": 0.33095567912871815, + "learning_rate": 8.221471029920165e-05, + "loss": 2.8473, + "step": 22516 + }, + { + "epoch": 1.0483273971646065, + "grad_norm": 0.32458506923926567, + "learning_rate": 8.221263867053768e-05, + "loss": 2.8071, + "step": 22517 + }, + { + "epoch": 1.0483739553506994, + "grad_norm": 0.33287285660157, + "learning_rate": 8.221056694733278e-05, + "loss": 2.8171, + "step": 22518 + }, + { + "epoch": 1.0484205135367926, + "grad_norm": 0.3302879871350885, + "learning_rate": 8.220849512959303e-05, + "loss": 2.7883, + "step": 22519 + }, + { + "epoch": 1.0484670717228857, + "grad_norm": 0.33730102392723793, + "learning_rate": 8.220642321732454e-05, + "loss": 2.7825, + "step": 22520 + }, + { + "epoch": 1.0485136299089788, + "grad_norm": 0.3129222809262885, + "learning_rate": 8.220435121053335e-05, + "loss": 2.9243, + "step": 22521 + }, + { + "epoch": 1.048560188095072, + "grad_norm": 0.33207422612546567, + "learning_rate": 8.220227910922558e-05, + "loss": 2.8593, + "step": 22522 + }, + { + "epoch": 1.0486067462811648, + "grad_norm": 0.35642762082240553, + "learning_rate": 8.220020691340729e-05, + "loss": 2.8584, + "step": 22523 + }, + { + "epoch": 1.048653304467258, + "grad_norm": 0.3300234298071259, + "learning_rate": 8.219813462308457e-05, + "loss": 2.9165, + "step": 22524 + }, + { + "epoch": 1.048699862653351, + "grad_norm": 0.33912087098135457, + "learning_rate": 8.219606223826348e-05, + "loss": 2.8889, + "step": 22525 + }, + { + "epoch": 1.0487464208394441, + "grad_norm": 0.3297727545584178, + "learning_rate": 8.219398975895014e-05, + "loss": 2.74, + "step": 22526 + }, + { + "epoch": 1.0487929790255373, + "grad_norm": 0.3122156783373451, + "learning_rate": 8.219191718515058e-05, + "loss": 2.953, + "step": 22527 + }, + { + "epoch": 1.0488395372116301, + "grad_norm": 0.32276422348690326, + "learning_rate": 8.218984451687093e-05, + "loss": 2.8439, + "step": 22528 + }, + { + "epoch": 1.0488860953977233, + "grad_norm": 0.312518267145749, + "learning_rate": 8.218777175411727e-05, + "loss": 2.9301, + "step": 22529 + }, + { + "epoch": 1.0489326535838164, + "grad_norm": 0.3447436130260767, + "learning_rate": 8.218569889689565e-05, + "loss": 3.0062, + "step": 22530 + }, + { + "epoch": 1.0489792117699095, + "grad_norm": 0.31259946150916845, + "learning_rate": 8.21836259452122e-05, + "loss": 2.8537, + "step": 22531 + }, + { + "epoch": 1.0490257699560026, + "grad_norm": 0.3374613163719987, + "learning_rate": 8.218155289907297e-05, + "loss": 2.8132, + "step": 22532 + }, + { + "epoch": 1.0490723281420955, + "grad_norm": 0.29968302852897194, + "learning_rate": 8.217947975848405e-05, + "loss": 2.8461, + "step": 22533 + }, + { + "epoch": 1.0491188863281886, + "grad_norm": 0.33318235896260534, + "learning_rate": 8.217740652345152e-05, + "loss": 2.8415, + "step": 22534 + }, + { + "epoch": 1.0491654445142817, + "grad_norm": 0.3200643894761675, + "learning_rate": 
8.217533319398147e-05, + "loss": 2.8843, + "step": 22535 + }, + { + "epoch": 1.0492120027003748, + "grad_norm": 0.31769833294507743, + "learning_rate": 8.217325977007999e-05, + "loss": 2.8125, + "step": 22536 + }, + { + "epoch": 1.049258560886468, + "grad_norm": 0.322058050290588, + "learning_rate": 8.217118625175318e-05, + "loss": 2.7608, + "step": 22537 + }, + { + "epoch": 1.0493051190725609, + "grad_norm": 0.3589696696543355, + "learning_rate": 8.216911263900708e-05, + "loss": 2.7861, + "step": 22538 + }, + { + "epoch": 1.049351677258654, + "grad_norm": 0.3143294033572957, + "learning_rate": 8.216703893184783e-05, + "loss": 2.8827, + "step": 22539 + }, + { + "epoch": 1.049398235444747, + "grad_norm": 0.3558317636858879, + "learning_rate": 8.216496513028147e-05, + "loss": 2.8117, + "step": 22540 + }, + { + "epoch": 1.0494447936308402, + "grad_norm": 0.335023112178441, + "learning_rate": 8.216289123431411e-05, + "loss": 2.8185, + "step": 22541 + }, + { + "epoch": 1.0494913518169333, + "grad_norm": 0.32622096148640556, + "learning_rate": 8.216081724395183e-05, + "loss": 2.8728, + "step": 22542 + }, + { + "epoch": 1.0495379100030262, + "grad_norm": 0.31423183682330935, + "learning_rate": 8.215874315920073e-05, + "loss": 2.8209, + "step": 22543 + }, + { + "epoch": 1.0495844681891193, + "grad_norm": 0.3160133099672373, + "learning_rate": 8.215666898006689e-05, + "loss": 2.8071, + "step": 22544 + }, + { + "epoch": 1.0496310263752124, + "grad_norm": 0.3543898282519946, + "learning_rate": 8.215459470655638e-05, + "loss": 2.8719, + "step": 22545 + }, + { + "epoch": 1.0496775845613056, + "grad_norm": 0.34444154209085254, + "learning_rate": 8.215252033867531e-05, + "loss": 2.8074, + "step": 22546 + }, + { + "epoch": 1.0497241427473987, + "grad_norm": 0.33560070010535875, + "learning_rate": 8.215044587642976e-05, + "loss": 2.7918, + "step": 22547 + }, + { + "epoch": 1.0497707009334916, + "grad_norm": 0.33301811098029493, + "learning_rate": 8.214837131982581e-05, + "loss": 2.7632, + "step": 22548 + }, + { + "epoch": 1.0498172591195847, + "grad_norm": 0.3400986134111014, + "learning_rate": 8.214629666886957e-05, + "loss": 2.8441, + "step": 22549 + }, + { + "epoch": 1.0498638173056778, + "grad_norm": 0.321069820801509, + "learning_rate": 8.21442219235671e-05, + "loss": 2.8211, + "step": 22550 + }, + { + "epoch": 1.049910375491771, + "grad_norm": 0.34442564622028266, + "learning_rate": 8.214214708392454e-05, + "loss": 2.8728, + "step": 22551 + }, + { + "epoch": 1.0499569336778638, + "grad_norm": 0.33148838471840386, + "learning_rate": 8.21400721499479e-05, + "loss": 2.8468, + "step": 22552 + }, + { + "epoch": 1.050003491863957, + "grad_norm": 0.3478440927057904, + "learning_rate": 8.213799712164334e-05, + "loss": 2.8197, + "step": 22553 + }, + { + "epoch": 1.05005005005005, + "grad_norm": 0.3462845486644319, + "learning_rate": 8.213592199901691e-05, + "loss": 2.8725, + "step": 22554 + }, + { + "epoch": 1.0500966082361431, + "grad_norm": 0.34231556725328993, + "learning_rate": 8.213384678207474e-05, + "loss": 2.8743, + "step": 22555 + }, + { + "epoch": 1.0501431664222363, + "grad_norm": 0.36412294518816263, + "learning_rate": 8.213177147082288e-05, + "loss": 2.7881, + "step": 22556 + }, + { + "epoch": 1.0501897246083292, + "grad_norm": 0.32244713308167156, + "learning_rate": 8.212969606526744e-05, + "loss": 2.8514, + "step": 22557 + }, + { + "epoch": 1.0502362827944223, + "grad_norm": 0.3648113467208136, + "learning_rate": 8.21276205654145e-05, + "loss": 2.7286, + "step": 22558 + }, + { + "epoch": 
1.0502828409805154, + "grad_norm": 0.31656672269508573, + "learning_rate": 8.212554497127017e-05, + "loss": 2.8748, + "step": 22559 + }, + { + "epoch": 1.0503293991666085, + "grad_norm": 0.3530798305118056, + "learning_rate": 8.212346928284053e-05, + "loss": 2.8855, + "step": 22560 + }, + { + "epoch": 1.0503759573527016, + "grad_norm": 0.3122027202124222, + "learning_rate": 8.212139350013166e-05, + "loss": 2.779, + "step": 22561 + }, + { + "epoch": 1.0504225155387945, + "grad_norm": 0.3476952708451474, + "learning_rate": 8.211931762314967e-05, + "loss": 2.8115, + "step": 22562 + }, + { + "epoch": 1.0504690737248876, + "grad_norm": 0.3409162650659611, + "learning_rate": 8.211724165190064e-05, + "loss": 2.712, + "step": 22563 + }, + { + "epoch": 1.0505156319109807, + "grad_norm": 0.31536946991888, + "learning_rate": 8.211516558639069e-05, + "loss": 2.8362, + "step": 22564 + }, + { + "epoch": 1.0505621900970739, + "grad_norm": 0.3265205944598476, + "learning_rate": 8.211308942662587e-05, + "loss": 2.8726, + "step": 22565 + }, + { + "epoch": 1.050608748283167, + "grad_norm": 0.32549378572117993, + "learning_rate": 8.211101317261232e-05, + "loss": 2.8845, + "step": 22566 + }, + { + "epoch": 1.0506553064692599, + "grad_norm": 0.31897783864787066, + "learning_rate": 8.210893682435608e-05, + "loss": 2.676, + "step": 22567 + }, + { + "epoch": 1.050701864655353, + "grad_norm": 0.33092107477607763, + "learning_rate": 8.21068603818633e-05, + "loss": 2.8146, + "step": 22568 + }, + { + "epoch": 1.050748422841446, + "grad_norm": 0.3200486776148789, + "learning_rate": 8.210478384514004e-05, + "loss": 2.8372, + "step": 22569 + }, + { + "epoch": 1.0507949810275392, + "grad_norm": 0.3736599991818031, + "learning_rate": 8.21027072141924e-05, + "loss": 2.9276, + "step": 22570 + }, + { + "epoch": 1.0508415392136323, + "grad_norm": 0.34412934568535364, + "learning_rate": 8.210063048902647e-05, + "loss": 2.7932, + "step": 22571 + }, + { + "epoch": 1.0508880973997252, + "grad_norm": 0.3115773710998619, + "learning_rate": 8.209855366964836e-05, + "loss": 2.8342, + "step": 22572 + }, + { + "epoch": 1.0509346555858183, + "grad_norm": 0.36080284552885855, + "learning_rate": 8.209647675606416e-05, + "loss": 2.805, + "step": 22573 + }, + { + "epoch": 1.0509812137719114, + "grad_norm": 0.3413355435086217, + "learning_rate": 8.209439974827996e-05, + "loss": 2.7823, + "step": 22574 + }, + { + "epoch": 1.0510277719580046, + "grad_norm": 0.3539553333743832, + "learning_rate": 8.209232264630187e-05, + "loss": 2.8597, + "step": 22575 + }, + { + "epoch": 1.0510743301440977, + "grad_norm": 0.3652780053707895, + "learning_rate": 8.209024545013595e-05, + "loss": 2.8407, + "step": 22576 + }, + { + "epoch": 1.0511208883301906, + "grad_norm": 0.35457013405219845, + "learning_rate": 8.208816815978833e-05, + "loss": 2.8835, + "step": 22577 + }, + { + "epoch": 1.0511674465162837, + "grad_norm": 0.3625704788164517, + "learning_rate": 8.208609077526511e-05, + "loss": 2.8605, + "step": 22578 + }, + { + "epoch": 1.0512140047023768, + "grad_norm": 0.36171896551319876, + "learning_rate": 8.208401329657236e-05, + "loss": 2.8221, + "step": 22579 + }, + { + "epoch": 1.05126056288847, + "grad_norm": 0.344973628782657, + "learning_rate": 8.208193572371619e-05, + "loss": 2.8833, + "step": 22580 + }, + { + "epoch": 1.051307121074563, + "grad_norm": 0.3717032825642123, + "learning_rate": 8.207985805670271e-05, + "loss": 2.8233, + "step": 22581 + }, + { + "epoch": 1.051353679260656, + "grad_norm": 0.35017615637936583, + "learning_rate": 
8.207778029553802e-05, + "loss": 2.836, + "step": 22582 + }, + { + "epoch": 1.051400237446749, + "grad_norm": 0.34109900427280115, + "learning_rate": 8.207570244022818e-05, + "loss": 2.8464, + "step": 22583 + }, + { + "epoch": 1.0514467956328422, + "grad_norm": 0.3176095635723196, + "learning_rate": 8.207362449077932e-05, + "loss": 2.8206, + "step": 22584 + }, + { + "epoch": 1.0514933538189353, + "grad_norm": 0.30135321455684294, + "learning_rate": 8.207154644719753e-05, + "loss": 2.8459, + "step": 22585 + }, + { + "epoch": 1.0515399120050284, + "grad_norm": 0.3248404902790427, + "learning_rate": 8.206946830948892e-05, + "loss": 2.7758, + "step": 22586 + }, + { + "epoch": 1.0515864701911213, + "grad_norm": 0.32528776879077803, + "learning_rate": 8.206739007765958e-05, + "loss": 2.8354, + "step": 22587 + }, + { + "epoch": 1.0516330283772144, + "grad_norm": 0.32674721466104234, + "learning_rate": 8.20653117517156e-05, + "loss": 2.852, + "step": 22588 + }, + { + "epoch": 1.0516795865633075, + "grad_norm": 0.32979286680462483, + "learning_rate": 8.206323333166309e-05, + "loss": 2.7546, + "step": 22589 + }, + { + "epoch": 1.0517261447494006, + "grad_norm": 0.3350139819562694, + "learning_rate": 8.206115481750815e-05, + "loss": 2.799, + "step": 22590 + }, + { + "epoch": 1.0517727029354935, + "grad_norm": 0.30316616458691564, + "learning_rate": 8.205907620925688e-05, + "loss": 2.8279, + "step": 22591 + }, + { + "epoch": 1.0518192611215866, + "grad_norm": 0.33875201848822806, + "learning_rate": 8.205699750691538e-05, + "loss": 2.8812, + "step": 22592 + }, + { + "epoch": 1.0518658193076798, + "grad_norm": 0.3305875710791919, + "learning_rate": 8.205491871048976e-05, + "loss": 2.8157, + "step": 22593 + }, + { + "epoch": 1.0519123774937729, + "grad_norm": 0.35048466289287283, + "learning_rate": 8.20528398199861e-05, + "loss": 2.7253, + "step": 22594 + }, + { + "epoch": 1.051958935679866, + "grad_norm": 0.312728358641594, + "learning_rate": 8.205076083541052e-05, + "loss": 2.8296, + "step": 22595 + }, + { + "epoch": 1.052005493865959, + "grad_norm": 0.35135386599630397, + "learning_rate": 8.20486817567691e-05, + "loss": 2.8601, + "step": 22596 + }, + { + "epoch": 1.052052052052052, + "grad_norm": 0.3183134115817673, + "learning_rate": 8.204660258406798e-05, + "loss": 2.8457, + "step": 22597 + }, + { + "epoch": 1.052098610238145, + "grad_norm": 0.33716217675182003, + "learning_rate": 8.20445233173132e-05, + "loss": 2.9047, + "step": 22598 + }, + { + "epoch": 1.0521451684242382, + "grad_norm": 0.3136756912136805, + "learning_rate": 8.204244395651093e-05, + "loss": 2.9085, + "step": 22599 + }, + { + "epoch": 1.0521917266103313, + "grad_norm": 0.3480035879944588, + "learning_rate": 8.204036450166725e-05, + "loss": 2.8642, + "step": 22600 + }, + { + "epoch": 1.0522382847964242, + "grad_norm": 0.33351067730732203, + "learning_rate": 8.203828495278824e-05, + "loss": 2.6991, + "step": 22601 + }, + { + "epoch": 1.0522848429825173, + "grad_norm": 0.3475817583587742, + "learning_rate": 8.203620530988003e-05, + "loss": 2.9541, + "step": 22602 + }, + { + "epoch": 1.0523314011686105, + "grad_norm": 0.33450108995514255, + "learning_rate": 8.203412557294872e-05, + "loss": 2.8496, + "step": 22603 + }, + { + "epoch": 1.0523779593547036, + "grad_norm": 0.36084723023525866, + "learning_rate": 8.203204574200039e-05, + "loss": 2.8873, + "step": 22604 + }, + { + "epoch": 1.0524245175407967, + "grad_norm": 0.33846547359480034, + "learning_rate": 8.202996581704117e-05, + "loss": 2.8001, + "step": 22605 + }, + { + "epoch": 
1.0524710757268896, + "grad_norm": 0.34081124439683114, + "learning_rate": 8.202788579807715e-05, + "loss": 2.8432, + "step": 22606 + }, + { + "epoch": 1.0525176339129827, + "grad_norm": 0.3490569924319456, + "learning_rate": 8.202580568511444e-05, + "loss": 2.8368, + "step": 22607 + }, + { + "epoch": 1.0525641920990758, + "grad_norm": 0.3479763828841698, + "learning_rate": 8.202372547815915e-05, + "loss": 2.8469, + "step": 22608 + }, + { + "epoch": 1.052610750285169, + "grad_norm": 0.32096374040595715, + "learning_rate": 8.202164517721738e-05, + "loss": 2.7641, + "step": 22609 + }, + { + "epoch": 1.052657308471262, + "grad_norm": 0.3738340769607326, + "learning_rate": 8.201956478229525e-05, + "loss": 2.783, + "step": 22610 + }, + { + "epoch": 1.052703866657355, + "grad_norm": 0.37451322617956795, + "learning_rate": 8.201748429339882e-05, + "loss": 2.7778, + "step": 22611 + }, + { + "epoch": 1.052750424843448, + "grad_norm": 0.3459900523435269, + "learning_rate": 8.201540371053426e-05, + "loss": 2.7571, + "step": 22612 + }, + { + "epoch": 1.0527969830295412, + "grad_norm": 0.39381284129119937, + "learning_rate": 8.201332303370763e-05, + "loss": 2.9832, + "step": 22613 + }, + { + "epoch": 1.0528435412156343, + "grad_norm": 0.3769740705964052, + "learning_rate": 8.201124226292505e-05, + "loss": 2.7675, + "step": 22614 + }, + { + "epoch": 1.0528900994017274, + "grad_norm": 0.3537618613576805, + "learning_rate": 8.200916139819263e-05, + "loss": 2.8611, + "step": 22615 + }, + { + "epoch": 1.0529366575878203, + "grad_norm": 0.393449306095607, + "learning_rate": 8.200708043951647e-05, + "loss": 2.8536, + "step": 22616 + }, + { + "epoch": 1.0529832157739134, + "grad_norm": 0.34602374900003197, + "learning_rate": 8.200499938690268e-05, + "loss": 2.8027, + "step": 22617 + }, + { + "epoch": 1.0530297739600065, + "grad_norm": 0.36790716193281964, + "learning_rate": 8.200291824035737e-05, + "loss": 2.9494, + "step": 22618 + }, + { + "epoch": 1.0530763321460996, + "grad_norm": 0.31875871869401473, + "learning_rate": 8.200083699988666e-05, + "loss": 2.7786, + "step": 22619 + }, + { + "epoch": 1.0531228903321928, + "grad_norm": 0.3829655847031482, + "learning_rate": 8.199875566549664e-05, + "loss": 2.7454, + "step": 22620 + }, + { + "epoch": 1.0531694485182856, + "grad_norm": 0.34555514298778006, + "learning_rate": 8.199667423719343e-05, + "loss": 2.8568, + "step": 22621 + }, + { + "epoch": 1.0532160067043788, + "grad_norm": 0.34637744221270345, + "learning_rate": 8.199459271498312e-05, + "loss": 2.7157, + "step": 22622 + }, + { + "epoch": 1.0532625648904719, + "grad_norm": 0.343819164433773, + "learning_rate": 8.199251109887185e-05, + "loss": 2.787, + "step": 22623 + }, + { + "epoch": 1.053309123076565, + "grad_norm": 0.4033667701694803, + "learning_rate": 8.19904293888657e-05, + "loss": 2.9012, + "step": 22624 + }, + { + "epoch": 1.053355681262658, + "grad_norm": 0.3412604149175389, + "learning_rate": 8.198834758497078e-05, + "loss": 2.6401, + "step": 22625 + }, + { + "epoch": 1.053402239448751, + "grad_norm": 0.3785200580283326, + "learning_rate": 8.198626568719323e-05, + "loss": 2.6749, + "step": 22626 + }, + { + "epoch": 1.0534487976348441, + "grad_norm": 0.34378644170311223, + "learning_rate": 8.198418369553913e-05, + "loss": 2.9, + "step": 22627 + }, + { + "epoch": 1.0534953558209372, + "grad_norm": 0.39480172477194325, + "learning_rate": 8.198210161001462e-05, + "loss": 2.8544, + "step": 22628 + }, + { + "epoch": 1.0535419140070303, + "grad_norm": 0.35917587638092263, + "learning_rate": 
8.198001943062577e-05, + "loss": 2.7551, + "step": 22629 + }, + { + "epoch": 1.0535884721931235, + "grad_norm": 0.37093374725038386, + "learning_rate": 8.197793715737872e-05, + "loss": 2.9099, + "step": 22630 + }, + { + "epoch": 1.0536350303792164, + "grad_norm": 0.36633574337071356, + "learning_rate": 8.197585479027959e-05, + "loss": 2.8449, + "step": 22631 + }, + { + "epoch": 1.0536815885653095, + "grad_norm": 0.34756268456434636, + "learning_rate": 8.197377232933445e-05, + "loss": 2.7913, + "step": 22632 + }, + { + "epoch": 1.0537281467514026, + "grad_norm": 0.3384848066341604, + "learning_rate": 8.197168977454947e-05, + "loss": 2.8373, + "step": 22633 + }, + { + "epoch": 1.0537747049374957, + "grad_norm": 0.32971973916237673, + "learning_rate": 8.196960712593071e-05, + "loss": 2.7833, + "step": 22634 + }, + { + "epoch": 1.0538212631235888, + "grad_norm": 0.3401778231536121, + "learning_rate": 8.196752438348429e-05, + "loss": 2.8947, + "step": 22635 + }, + { + "epoch": 1.0538678213096817, + "grad_norm": 0.33965418267024644, + "learning_rate": 8.196544154721634e-05, + "loss": 2.9068, + "step": 22636 + }, + { + "epoch": 1.0539143794957748, + "grad_norm": 0.34012496845495005, + "learning_rate": 8.1963358617133e-05, + "loss": 2.8739, + "step": 22637 + }, + { + "epoch": 1.053960937681868, + "grad_norm": 0.3182096163329676, + "learning_rate": 8.196127559324031e-05, + "loss": 2.7989, + "step": 22638 + }, + { + "epoch": 1.054007495867961, + "grad_norm": 0.34687311700215917, + "learning_rate": 8.195919247554446e-05, + "loss": 2.8886, + "step": 22639 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 0.31061758894726027, + "learning_rate": 8.195710926405152e-05, + "loss": 2.9216, + "step": 22640 + }, + { + "epoch": 1.054100612240147, + "grad_norm": 0.3416004397763965, + "learning_rate": 8.19550259587676e-05, + "loss": 2.7816, + "step": 22641 + }, + { + "epoch": 1.0541471704262402, + "grad_norm": 0.33052037576430154, + "learning_rate": 8.195294255969883e-05, + "loss": 2.7723, + "step": 22642 + }, + { + "epoch": 1.0541937286123333, + "grad_norm": 0.32206269952819566, + "learning_rate": 8.195085906685133e-05, + "loss": 2.9221, + "step": 22643 + }, + { + "epoch": 1.0542402867984264, + "grad_norm": 0.34030153416941916, + "learning_rate": 8.194877548023119e-05, + "loss": 2.7249, + "step": 22644 + }, + { + "epoch": 1.0542868449845193, + "grad_norm": 0.3170553550623369, + "learning_rate": 8.194669179984456e-05, + "loss": 2.9209, + "step": 22645 + }, + { + "epoch": 1.0543334031706124, + "grad_norm": 0.34902411905501046, + "learning_rate": 8.194460802569754e-05, + "loss": 2.8938, + "step": 22646 + }, + { + "epoch": 1.0543799613567055, + "grad_norm": 0.36500266805536397, + "learning_rate": 8.194252415779622e-05, + "loss": 2.9115, + "step": 22647 + }, + { + "epoch": 1.0544265195427986, + "grad_norm": 0.3538386139446115, + "learning_rate": 8.194044019614675e-05, + "loss": 2.8057, + "step": 22648 + }, + { + "epoch": 1.0544730777288918, + "grad_norm": 0.3639679362660791, + "learning_rate": 8.193835614075522e-05, + "loss": 2.7766, + "step": 22649 + }, + { + "epoch": 1.0545196359149847, + "grad_norm": 0.3429529545274254, + "learning_rate": 8.193627199162778e-05, + "loss": 2.9344, + "step": 22650 + }, + { + "epoch": 1.0545661941010778, + "grad_norm": 0.3524950140670044, + "learning_rate": 8.193418774877053e-05, + "loss": 2.9015, + "step": 22651 + }, + { + "epoch": 1.0546127522871709, + "grad_norm": 0.35323075444013646, + "learning_rate": 8.193210341218958e-05, + "loss": 2.8795, + "step": 22652 + }, + { + 
"epoch": 1.054659310473264, + "grad_norm": 0.33389775276897826, + "learning_rate": 8.193001898189106e-05, + "loss": 2.9253, + "step": 22653 + }, + { + "epoch": 1.0547058686593571, + "grad_norm": 0.3314376229079333, + "learning_rate": 8.192793445788106e-05, + "loss": 2.865, + "step": 22654 + }, + { + "epoch": 1.05475242684545, + "grad_norm": 0.33551417326437466, + "learning_rate": 8.192584984016574e-05, + "loss": 2.8605, + "step": 22655 + }, + { + "epoch": 1.0547989850315431, + "grad_norm": 0.3552246784948444, + "learning_rate": 8.192376512875118e-05, + "loss": 2.8804, + "step": 22656 + }, + { + "epoch": 1.0548455432176362, + "grad_norm": 0.31990814391480804, + "learning_rate": 8.192168032364351e-05, + "loss": 2.8463, + "step": 22657 + }, + { + "epoch": 1.0548921014037294, + "grad_norm": 0.3495345443049844, + "learning_rate": 8.191959542484886e-05, + "loss": 2.8526, + "step": 22658 + }, + { + "epoch": 1.0549386595898225, + "grad_norm": 0.3103392611947559, + "learning_rate": 8.191751043237335e-05, + "loss": 2.7561, + "step": 22659 + }, + { + "epoch": 1.0549852177759154, + "grad_norm": 0.31621908431637813, + "learning_rate": 8.191542534622309e-05, + "loss": 2.8696, + "step": 22660 + }, + { + "epoch": 1.0550317759620085, + "grad_norm": 0.31962860413372773, + "learning_rate": 8.19133401664042e-05, + "loss": 2.8326, + "step": 22661 + }, + { + "epoch": 1.0550783341481016, + "grad_norm": 0.3148259489618497, + "learning_rate": 8.191125489292279e-05, + "loss": 2.7367, + "step": 22662 + }, + { + "epoch": 1.0551248923341947, + "grad_norm": 0.3340026606032594, + "learning_rate": 8.1909169525785e-05, + "loss": 2.7014, + "step": 22663 + }, + { + "epoch": 1.0551714505202878, + "grad_norm": 0.34185204336268504, + "learning_rate": 8.190708406499695e-05, + "loss": 2.7072, + "step": 22664 + }, + { + "epoch": 1.0552180087063807, + "grad_norm": 0.3146817983530071, + "learning_rate": 8.190499851056473e-05, + "loss": 2.8056, + "step": 22665 + }, + { + "epoch": 1.0552645668924738, + "grad_norm": 0.3074884399639588, + "learning_rate": 8.19029128624945e-05, + "loss": 2.7777, + "step": 22666 + }, + { + "epoch": 1.055311125078567, + "grad_norm": 0.3152835772032244, + "learning_rate": 8.190082712079235e-05, + "loss": 2.7918, + "step": 22667 + }, + { + "epoch": 1.05535768326466, + "grad_norm": 0.3294799023216288, + "learning_rate": 8.189874128546442e-05, + "loss": 2.9351, + "step": 22668 + }, + { + "epoch": 1.0554042414507532, + "grad_norm": 0.32569744539918283, + "learning_rate": 8.189665535651683e-05, + "loss": 2.8714, + "step": 22669 + }, + { + "epoch": 1.055450799636846, + "grad_norm": 0.33544598934473635, + "learning_rate": 8.18945693339557e-05, + "loss": 2.8321, + "step": 22670 + }, + { + "epoch": 1.0554973578229392, + "grad_norm": 0.33751230286161643, + "learning_rate": 8.189248321778714e-05, + "loss": 2.803, + "step": 22671 + }, + { + "epoch": 1.0555439160090323, + "grad_norm": 0.30691072461021557, + "learning_rate": 8.189039700801729e-05, + "loss": 2.9603, + "step": 22672 + }, + { + "epoch": 1.0555904741951254, + "grad_norm": 0.3401983713659163, + "learning_rate": 8.188831070465229e-05, + "loss": 2.9076, + "step": 22673 + }, + { + "epoch": 1.0556370323812185, + "grad_norm": 0.3075820817331031, + "learning_rate": 8.18862243076982e-05, + "loss": 2.6647, + "step": 22674 + }, + { + "epoch": 1.0556835905673114, + "grad_norm": 0.31255166981573385, + "learning_rate": 8.18841378171612e-05, + "loss": 2.8231, + "step": 22675 + }, + { + "epoch": 1.0557301487534045, + "grad_norm": 0.3041146921181538, + "learning_rate": 
8.18820512330474e-05, + "loss": 2.7071, + "step": 22676 + }, + { + "epoch": 1.0557767069394977, + "grad_norm": 0.33027739786666843, + "learning_rate": 8.187996455536292e-05, + "loss": 2.8492, + "step": 22677 + }, + { + "epoch": 1.0558232651255908, + "grad_norm": 0.31253386663039007, + "learning_rate": 8.187787778411388e-05, + "loss": 2.8602, + "step": 22678 + }, + { + "epoch": 1.0558698233116837, + "grad_norm": 0.34775534060260554, + "learning_rate": 8.187579091930642e-05, + "loss": 2.7324, + "step": 22679 + }, + { + "epoch": 1.0559163814977768, + "grad_norm": 0.3298476784358764, + "learning_rate": 8.187370396094663e-05, + "loss": 2.9016, + "step": 22680 + }, + { + "epoch": 1.05596293968387, + "grad_norm": 0.34634553104438653, + "learning_rate": 8.187161690904069e-05, + "loss": 2.8118, + "step": 22681 + }, + { + "epoch": 1.056009497869963, + "grad_norm": 0.29481383340830575, + "learning_rate": 8.186952976359467e-05, + "loss": 2.8795, + "step": 22682 + }, + { + "epoch": 1.0560560560560561, + "grad_norm": 0.3290537629773158, + "learning_rate": 8.186744252461474e-05, + "loss": 2.7987, + "step": 22683 + }, + { + "epoch": 1.0561026142421492, + "grad_norm": 0.29772581937917175, + "learning_rate": 8.186535519210699e-05, + "loss": 2.8456, + "step": 22684 + }, + { + "epoch": 1.0561491724282421, + "grad_norm": 0.3014561799854039, + "learning_rate": 8.186326776607755e-05, + "loss": 2.7592, + "step": 22685 + }, + { + "epoch": 1.0561957306143352, + "grad_norm": 0.3036686906977272, + "learning_rate": 8.186118024653259e-05, + "loss": 2.8978, + "step": 22686 + }, + { + "epoch": 1.0562422888004284, + "grad_norm": 0.3397284837688788, + "learning_rate": 8.18590926334782e-05, + "loss": 2.8641, + "step": 22687 + }, + { + "epoch": 1.0562888469865215, + "grad_norm": 0.3011771179286248, + "learning_rate": 8.185700492692048e-05, + "loss": 2.859, + "step": 22688 + }, + { + "epoch": 1.0563354051726144, + "grad_norm": 0.3330020258628523, + "learning_rate": 8.185491712686561e-05, + "loss": 2.9561, + "step": 22689 + }, + { + "epoch": 1.0563819633587075, + "grad_norm": 0.33604739803650563, + "learning_rate": 8.185282923331971e-05, + "loss": 2.8765, + "step": 22690 + }, + { + "epoch": 1.0564285215448006, + "grad_norm": 0.30553259377719194, + "learning_rate": 8.185074124628889e-05, + "loss": 2.7935, + "step": 22691 + }, + { + "epoch": 1.0564750797308937, + "grad_norm": 0.3439221318274523, + "learning_rate": 8.184865316577927e-05, + "loss": 2.8048, + "step": 22692 + }, + { + "epoch": 1.0565216379169868, + "grad_norm": 0.294682944922023, + "learning_rate": 8.184656499179699e-05, + "loss": 2.9067, + "step": 22693 + }, + { + "epoch": 1.0565681961030797, + "grad_norm": 0.33810574903092816, + "learning_rate": 8.184447672434817e-05, + "loss": 2.8672, + "step": 22694 + }, + { + "epoch": 1.0566147542891728, + "grad_norm": 0.29694271603559036, + "learning_rate": 8.184238836343898e-05, + "loss": 2.9028, + "step": 22695 + }, + { + "epoch": 1.056661312475266, + "grad_norm": 0.32854814851721276, + "learning_rate": 8.18402999090755e-05, + "loss": 2.7823, + "step": 22696 + }, + { + "epoch": 1.056707870661359, + "grad_norm": 0.31692834877075854, + "learning_rate": 8.183821136126386e-05, + "loss": 2.7611, + "step": 22697 + }, + { + "epoch": 1.0567544288474522, + "grad_norm": 0.3311746613407382, + "learning_rate": 8.183612272001023e-05, + "loss": 2.8678, + "step": 22698 + }, + { + "epoch": 1.056800987033545, + "grad_norm": 0.3117497224274168, + "learning_rate": 8.18340339853207e-05, + "loss": 2.7306, + "step": 22699 + }, + { + "epoch": 
1.0568475452196382, + "grad_norm": 0.33767258019075835, + "learning_rate": 8.183194515720143e-05, + "loss": 2.9591, + "step": 22700 + }, + { + "epoch": 1.0568941034057313, + "grad_norm": 0.3174755596182239, + "learning_rate": 8.182985623565853e-05, + "loss": 2.7852, + "step": 22701 + }, + { + "epoch": 1.0569406615918244, + "grad_norm": 0.3161602433158804, + "learning_rate": 8.182776722069813e-05, + "loss": 2.8635, + "step": 22702 + }, + { + "epoch": 1.0569872197779175, + "grad_norm": 0.35439996892359293, + "learning_rate": 8.182567811232638e-05, + "loss": 2.8423, + "step": 22703 + }, + { + "epoch": 1.0570337779640104, + "grad_norm": 0.33526907339894235, + "learning_rate": 8.18235889105494e-05, + "loss": 2.7263, + "step": 22704 + }, + { + "epoch": 1.0570803361501035, + "grad_norm": 0.311293714765863, + "learning_rate": 8.182149961537333e-05, + "loss": 2.7569, + "step": 22705 + }, + { + "epoch": 1.0571268943361967, + "grad_norm": 0.3205563530998557, + "learning_rate": 8.181941022680426e-05, + "loss": 2.8024, + "step": 22706 + }, + { + "epoch": 1.0571734525222898, + "grad_norm": 0.3221684964401008, + "learning_rate": 8.181732074484838e-05, + "loss": 2.8634, + "step": 22707 + }, + { + "epoch": 1.057220010708383, + "grad_norm": 0.29415877492446263, + "learning_rate": 8.18152311695118e-05, + "loss": 2.8612, + "step": 22708 + }, + { + "epoch": 1.0572665688944758, + "grad_norm": 0.314298641248465, + "learning_rate": 8.181314150080064e-05, + "loss": 2.8596, + "step": 22709 + }, + { + "epoch": 1.057313127080569, + "grad_norm": 0.3139683188960607, + "learning_rate": 8.181105173872106e-05, + "loss": 2.7725, + "step": 22710 + }, + { + "epoch": 1.057359685266662, + "grad_norm": 0.3146432061023297, + "learning_rate": 8.180896188327916e-05, + "loss": 2.7197, + "step": 22711 + }, + { + "epoch": 1.0574062434527551, + "grad_norm": 0.35202208492122683, + "learning_rate": 8.180687193448111e-05, + "loss": 2.8678, + "step": 22712 + }, + { + "epoch": 1.0574528016388482, + "grad_norm": 0.3406469714128078, + "learning_rate": 8.180478189233302e-05, + "loss": 2.7923, + "step": 22713 + }, + { + "epoch": 1.0574993598249411, + "grad_norm": 0.3382190159561567, + "learning_rate": 8.1802691756841e-05, + "loss": 2.8568, + "step": 22714 + }, + { + "epoch": 1.0575459180110343, + "grad_norm": 0.3074817591663984, + "learning_rate": 8.180060152801126e-05, + "loss": 2.8627, + "step": 22715 + }, + { + "epoch": 1.0575924761971274, + "grad_norm": 0.3277368262364185, + "learning_rate": 8.179851120584985e-05, + "loss": 2.7565, + "step": 22716 + }, + { + "epoch": 1.0576390343832205, + "grad_norm": 0.3194694046418506, + "learning_rate": 8.179642079036295e-05, + "loss": 2.8058, + "step": 22717 + }, + { + "epoch": 1.0576855925693136, + "grad_norm": 0.33463702775694065, + "learning_rate": 8.17943302815567e-05, + "loss": 2.8519, + "step": 22718 + }, + { + "epoch": 1.0577321507554065, + "grad_norm": 0.436062084784653, + "learning_rate": 8.17922396794372e-05, + "loss": 2.7868, + "step": 22719 + }, + { + "epoch": 1.0577787089414996, + "grad_norm": 0.36023699439249096, + "learning_rate": 8.179014898401062e-05, + "loss": 2.828, + "step": 22720 + }, + { + "epoch": 1.0578252671275927, + "grad_norm": 0.34429745472473383, + "learning_rate": 8.17880581952831e-05, + "loss": 2.881, + "step": 22721 + }, + { + "epoch": 1.0578718253136858, + "grad_norm": 0.3598878606097253, + "learning_rate": 8.178596731326075e-05, + "loss": 2.7991, + "step": 22722 + }, + { + "epoch": 1.057918383499779, + "grad_norm": 0.3605911845881327, + "learning_rate": 
8.17838763379497e-05, + "loss": 2.829, + "step": 22723 + }, + { + "epoch": 1.0579649416858719, + "grad_norm": 0.3607525339891839, + "learning_rate": 8.178178526935614e-05, + "loss": 2.8092, + "step": 22724 + }, + { + "epoch": 1.058011499871965, + "grad_norm": 0.3495697558014863, + "learning_rate": 8.177969410748613e-05, + "loss": 2.8466, + "step": 22725 + }, + { + "epoch": 1.058058058058058, + "grad_norm": 0.3548252091434927, + "learning_rate": 8.177760285234588e-05, + "loss": 2.7886, + "step": 22726 + }, + { + "epoch": 1.0581046162441512, + "grad_norm": 0.33265277096226037, + "learning_rate": 8.177551150394149e-05, + "loss": 2.923, + "step": 22727 + }, + { + "epoch": 1.058151174430244, + "grad_norm": 0.36295155862571366, + "learning_rate": 8.17734200622791e-05, + "loss": 2.7153, + "step": 22728 + }, + { + "epoch": 1.0581977326163372, + "grad_norm": 0.33748799780219096, + "learning_rate": 8.177132852736484e-05, + "loss": 2.856, + "step": 22729 + }, + { + "epoch": 1.0582442908024303, + "grad_norm": 0.3601270919684306, + "learning_rate": 8.176923689920488e-05, + "loss": 2.7892, + "step": 22730 + }, + { + "epoch": 1.0582908489885234, + "grad_norm": 0.3444090046529982, + "learning_rate": 8.176714517780534e-05, + "loss": 2.7393, + "step": 22731 + }, + { + "epoch": 1.0583374071746166, + "grad_norm": 0.3591297092563629, + "learning_rate": 8.176505336317235e-05, + "loss": 2.8104, + "step": 22732 + }, + { + "epoch": 1.0583839653607094, + "grad_norm": 0.3298534861146022, + "learning_rate": 8.176296145531206e-05, + "loss": 2.8836, + "step": 22733 + }, + { + "epoch": 1.0584305235468026, + "grad_norm": 0.34502986701672506, + "learning_rate": 8.176086945423061e-05, + "loss": 2.8288, + "step": 22734 + }, + { + "epoch": 1.0584770817328957, + "grad_norm": 0.32528444355586605, + "learning_rate": 8.175877735993411e-05, + "loss": 2.6623, + "step": 22735 + }, + { + "epoch": 1.0585236399189888, + "grad_norm": 0.3600596765089615, + "learning_rate": 8.175668517242876e-05, + "loss": 2.9011, + "step": 22736 + }, + { + "epoch": 1.058570198105082, + "grad_norm": 0.3116770560995734, + "learning_rate": 8.175459289172067e-05, + "loss": 2.8166, + "step": 22737 + }, + { + "epoch": 1.0586167562911748, + "grad_norm": 0.3361383598475175, + "learning_rate": 8.175250051781595e-05, + "loss": 2.8044, + "step": 22738 + }, + { + "epoch": 1.058663314477268, + "grad_norm": 0.3524988962398669, + "learning_rate": 8.175040805072078e-05, + "loss": 2.8801, + "step": 22739 + }, + { + "epoch": 1.058709872663361, + "grad_norm": 0.3129917820084299, + "learning_rate": 8.174831549044131e-05, + "loss": 2.7415, + "step": 22740 + }, + { + "epoch": 1.0587564308494541, + "grad_norm": 0.313003486179208, + "learning_rate": 8.174622283698366e-05, + "loss": 2.8415, + "step": 22741 + }, + { + "epoch": 1.0588029890355473, + "grad_norm": 0.3159847413305058, + "learning_rate": 8.174413009035396e-05, + "loss": 2.7913, + "step": 22742 + }, + { + "epoch": 1.0588495472216402, + "grad_norm": 0.3296163446249307, + "learning_rate": 8.174203725055836e-05, + "loss": 2.8542, + "step": 22743 + }, + { + "epoch": 1.0588961054077333, + "grad_norm": 0.2854180306674709, + "learning_rate": 8.1739944317603e-05, + "loss": 2.7938, + "step": 22744 + }, + { + "epoch": 1.0589426635938264, + "grad_norm": 0.3261041610801528, + "learning_rate": 8.173785129149405e-05, + "loss": 2.8411, + "step": 22745 + }, + { + "epoch": 1.0589892217799195, + "grad_norm": 0.31065760709333756, + "learning_rate": 8.173575817223763e-05, + "loss": 2.8253, + "step": 22746 + }, + { + "epoch": 
1.0590357799660126, + "grad_norm": 0.319032034049674, + "learning_rate": 8.173366495983989e-05, + "loss": 2.8, + "step": 22747 + }, + { + "epoch": 1.0590823381521055, + "grad_norm": 0.3093834585193564, + "learning_rate": 8.173157165430696e-05, + "loss": 2.8434, + "step": 22748 + }, + { + "epoch": 1.0591288963381986, + "grad_norm": 0.3305032896769114, + "learning_rate": 8.1729478255645e-05, + "loss": 2.7793, + "step": 22749 + }, + { + "epoch": 1.0591754545242917, + "grad_norm": 0.29278601669291027, + "learning_rate": 8.172738476386014e-05, + "loss": 2.7422, + "step": 22750 + }, + { + "epoch": 1.0592220127103849, + "grad_norm": 0.31437894762692004, + "learning_rate": 8.172529117895853e-05, + "loss": 2.8743, + "step": 22751 + }, + { + "epoch": 1.059268570896478, + "grad_norm": 0.3363213350213074, + "learning_rate": 8.172319750094632e-05, + "loss": 2.7984, + "step": 22752 + }, + { + "epoch": 1.0593151290825709, + "grad_norm": 0.30212458637159034, + "learning_rate": 8.172110372982966e-05, + "loss": 2.8201, + "step": 22753 + }, + { + "epoch": 1.059361687268664, + "grad_norm": 0.2964193620233929, + "learning_rate": 8.171900986561467e-05, + "loss": 2.8587, + "step": 22754 + }, + { + "epoch": 1.059408245454757, + "grad_norm": 0.31722075261956967, + "learning_rate": 8.171691590830752e-05, + "loss": 2.7747, + "step": 22755 + }, + { + "epoch": 1.0594548036408502, + "grad_norm": 0.29489353668038776, + "learning_rate": 8.171482185791435e-05, + "loss": 2.8149, + "step": 22756 + }, + { + "epoch": 1.0595013618269433, + "grad_norm": 0.31390214500446045, + "learning_rate": 8.17127277144413e-05, + "loss": 2.8172, + "step": 22757 + }, + { + "epoch": 1.0595479200130362, + "grad_norm": 0.2966707169936316, + "learning_rate": 8.171063347789452e-05, + "loss": 2.8219, + "step": 22758 + }, + { + "epoch": 1.0595944781991293, + "grad_norm": 0.3213800560577973, + "learning_rate": 8.170853914828014e-05, + "loss": 2.8248, + "step": 22759 + }, + { + "epoch": 1.0596410363852224, + "grad_norm": 0.33860801643022514, + "learning_rate": 8.170644472560433e-05, + "loss": 2.8148, + "step": 22760 + }, + { + "epoch": 1.0596875945713156, + "grad_norm": 0.3541083361353601, + "learning_rate": 8.170435020987324e-05, + "loss": 2.8791, + "step": 22761 + }, + { + "epoch": 1.0597341527574087, + "grad_norm": 0.38711393316123743, + "learning_rate": 8.170225560109298e-05, + "loss": 2.8239, + "step": 22762 + }, + { + "epoch": 1.0597807109435016, + "grad_norm": 0.35815657473708556, + "learning_rate": 8.170016089926975e-05, + "loss": 2.9013, + "step": 22763 + }, + { + "epoch": 1.0598272691295947, + "grad_norm": 0.330617835728097, + "learning_rate": 8.169806610440966e-05, + "loss": 2.8648, + "step": 22764 + }, + { + "epoch": 1.0598738273156878, + "grad_norm": 0.3904772632303063, + "learning_rate": 8.169597121651887e-05, + "loss": 2.833, + "step": 22765 + }, + { + "epoch": 1.059920385501781, + "grad_norm": 0.31288971495371076, + "learning_rate": 8.169387623560352e-05, + "loss": 2.8262, + "step": 22766 + }, + { + "epoch": 1.0599669436878738, + "grad_norm": 0.3315750210221355, + "learning_rate": 8.169178116166977e-05, + "loss": 2.8705, + "step": 22767 + }, + { + "epoch": 1.060013501873967, + "grad_norm": 0.31649603092756207, + "learning_rate": 8.168968599472376e-05, + "loss": 2.8575, + "step": 22768 + }, + { + "epoch": 1.06006006006006, + "grad_norm": 0.35795330254239127, + "learning_rate": 8.168759073477165e-05, + "loss": 2.8366, + "step": 22769 + }, + { + "epoch": 1.0601066182461532, + "grad_norm": 0.30224767852706835, + "learning_rate": 
8.168549538181958e-05, + "loss": 2.8558, + "step": 22770 + }, + { + "epoch": 1.0601531764322463, + "grad_norm": 0.37817594163362894, + "learning_rate": 8.168339993587371e-05, + "loss": 2.8425, + "step": 22771 + }, + { + "epoch": 1.0601997346183392, + "grad_norm": 0.31534517360043407, + "learning_rate": 8.168130439694017e-05, + "loss": 2.7555, + "step": 22772 + }, + { + "epoch": 1.0602462928044323, + "grad_norm": 0.3401338300266948, + "learning_rate": 8.167920876502513e-05, + "loss": 2.8406, + "step": 22773 + }, + { + "epoch": 1.0602928509905254, + "grad_norm": 0.3579127820744418, + "learning_rate": 8.167711304013473e-05, + "loss": 2.8713, + "step": 22774 + }, + { + "epoch": 1.0603394091766185, + "grad_norm": 0.3539613352132229, + "learning_rate": 8.167501722227513e-05, + "loss": 2.8948, + "step": 22775 + }, + { + "epoch": 1.0603859673627116, + "grad_norm": 0.3703734887421032, + "learning_rate": 8.167292131145248e-05, + "loss": 2.8472, + "step": 22776 + }, + { + "epoch": 1.0604325255488045, + "grad_norm": 0.36576428602193206, + "learning_rate": 8.167082530767292e-05, + "loss": 2.7619, + "step": 22777 + }, + { + "epoch": 1.0604790837348976, + "grad_norm": 0.314971352746899, + "learning_rate": 8.16687292109426e-05, + "loss": 2.6184, + "step": 22778 + }, + { + "epoch": 1.0605256419209907, + "grad_norm": 0.30671932191677065, + "learning_rate": 8.166663302126769e-05, + "loss": 2.7881, + "step": 22779 + }, + { + "epoch": 1.0605722001070839, + "grad_norm": 0.3453684059577057, + "learning_rate": 8.166453673865432e-05, + "loss": 2.8829, + "step": 22780 + }, + { + "epoch": 1.060618758293177, + "grad_norm": 0.3307628736630519, + "learning_rate": 8.166244036310867e-05, + "loss": 2.9233, + "step": 22781 + }, + { + "epoch": 1.0606653164792699, + "grad_norm": 0.3537077258381982, + "learning_rate": 8.166034389463686e-05, + "loss": 2.8639, + "step": 22782 + }, + { + "epoch": 1.060711874665363, + "grad_norm": 0.3506532036268881, + "learning_rate": 8.165824733324507e-05, + "loss": 2.8326, + "step": 22783 + }, + { + "epoch": 1.060758432851456, + "grad_norm": 0.31462910806117644, + "learning_rate": 8.165615067893944e-05, + "loss": 2.8688, + "step": 22784 + }, + { + "epoch": 1.0608049910375492, + "grad_norm": 0.3524017299262312, + "learning_rate": 8.165405393172612e-05, + "loss": 2.9018, + "step": 22785 + }, + { + "epoch": 1.0608515492236423, + "grad_norm": 0.30988906446904035, + "learning_rate": 8.165195709161128e-05, + "loss": 2.8464, + "step": 22786 + }, + { + "epoch": 1.0608981074097352, + "grad_norm": 0.3646051564984406, + "learning_rate": 8.164986015860106e-05, + "loss": 2.8607, + "step": 22787 + }, + { + "epoch": 1.0609446655958283, + "grad_norm": 0.29261418848548226, + "learning_rate": 8.164776313270162e-05, + "loss": 2.8081, + "step": 22788 + }, + { + "epoch": 1.0609912237819215, + "grad_norm": 0.3326126836359904, + "learning_rate": 8.164566601391911e-05, + "loss": 2.7648, + "step": 22789 + }, + { + "epoch": 1.0610377819680146, + "grad_norm": 0.3129590372859171, + "learning_rate": 8.164356880225969e-05, + "loss": 2.8509, + "step": 22790 + }, + { + "epoch": 1.0610843401541077, + "grad_norm": 0.3282898573328453, + "learning_rate": 8.164147149772952e-05, + "loss": 2.7633, + "step": 22791 + }, + { + "epoch": 1.0611308983402006, + "grad_norm": 0.3038795671836354, + "learning_rate": 8.163937410033474e-05, + "loss": 2.8186, + "step": 22792 + }, + { + "epoch": 1.0611774565262937, + "grad_norm": 0.3180949019115955, + "learning_rate": 8.163727661008151e-05, + "loss": 2.7546, + "step": 22793 + }, + { + "epoch": 
1.0612240147123868, + "grad_norm": 0.33599327834285203, + "learning_rate": 8.163517902697601e-05, + "loss": 2.8807, + "step": 22794 + }, + { + "epoch": 1.06127057289848, + "grad_norm": 0.30904910522030304, + "learning_rate": 8.163308135102437e-05, + "loss": 2.6681, + "step": 22795 + }, + { + "epoch": 1.061317131084573, + "grad_norm": 0.31992273440268776, + "learning_rate": 8.163098358223274e-05, + "loss": 2.8525, + "step": 22796 + }, + { + "epoch": 1.061363689270666, + "grad_norm": 0.33108273260832916, + "learning_rate": 8.162888572060729e-05, + "loss": 2.8867, + "step": 22797 + }, + { + "epoch": 1.061410247456759, + "grad_norm": 0.32674983014558173, + "learning_rate": 8.162678776615418e-05, + "loss": 2.7256, + "step": 22798 + }, + { + "epoch": 1.0614568056428522, + "grad_norm": 0.3190872883315153, + "learning_rate": 8.162468971887956e-05, + "loss": 2.7783, + "step": 22799 + }, + { + "epoch": 1.0615033638289453, + "grad_norm": 0.287767311211975, + "learning_rate": 8.16225915787896e-05, + "loss": 2.775, + "step": 22800 + }, + { + "epoch": 1.0615499220150384, + "grad_norm": 0.30354841545193734, + "learning_rate": 8.162049334589043e-05, + "loss": 2.8175, + "step": 22801 + }, + { + "epoch": 1.0615964802011313, + "grad_norm": 0.2981367209993329, + "learning_rate": 8.161839502018825e-05, + "loss": 2.7725, + "step": 22802 + }, + { + "epoch": 1.0616430383872244, + "grad_norm": 0.31324190629466403, + "learning_rate": 8.161629660168919e-05, + "loss": 2.8352, + "step": 22803 + }, + { + "epoch": 1.0616895965733175, + "grad_norm": 0.30174307006134565, + "learning_rate": 8.161419809039941e-05, + "loss": 2.8446, + "step": 22804 + }, + { + "epoch": 1.0617361547594106, + "grad_norm": 0.3374682135390637, + "learning_rate": 8.161209948632508e-05, + "loss": 2.7907, + "step": 22805 + }, + { + "epoch": 1.0617827129455037, + "grad_norm": 0.3355293728698393, + "learning_rate": 8.161000078947234e-05, + "loss": 2.779, + "step": 22806 + }, + { + "epoch": 1.0618292711315966, + "grad_norm": 0.2947001517920354, + "learning_rate": 8.160790199984735e-05, + "loss": 2.7857, + "step": 22807 + }, + { + "epoch": 1.0618758293176898, + "grad_norm": 0.3304897967044375, + "learning_rate": 8.160580311745632e-05, + "loss": 2.7806, + "step": 22808 + }, + { + "epoch": 1.0619223875037829, + "grad_norm": 0.292740714570337, + "learning_rate": 8.160370414230535e-05, + "loss": 2.9167, + "step": 22809 + }, + { + "epoch": 1.061968945689876, + "grad_norm": 0.3390276325891325, + "learning_rate": 8.160160507440062e-05, + "loss": 2.9285, + "step": 22810 + }, + { + "epoch": 1.062015503875969, + "grad_norm": 0.3459394459044233, + "learning_rate": 8.15995059137483e-05, + "loss": 2.8134, + "step": 22811 + }, + { + "epoch": 1.062062062062062, + "grad_norm": 0.34296011147455724, + "learning_rate": 8.159740666035453e-05, + "loss": 2.7539, + "step": 22812 + }, + { + "epoch": 1.062108620248155, + "grad_norm": 0.3474402902744647, + "learning_rate": 8.159530731422551e-05, + "loss": 2.8125, + "step": 22813 + }, + { + "epoch": 1.0621551784342482, + "grad_norm": 0.3235981427009662, + "learning_rate": 8.159320787536735e-05, + "loss": 2.8173, + "step": 22814 + }, + { + "epoch": 1.0622017366203413, + "grad_norm": 0.41780825166286417, + "learning_rate": 8.159110834378625e-05, + "loss": 2.8112, + "step": 22815 + }, + { + "epoch": 1.0622482948064342, + "grad_norm": 0.3559288212612964, + "learning_rate": 8.158900871948836e-05, + "loss": 2.944, + "step": 22816 + }, + { + "epoch": 1.0622948529925273, + "grad_norm": 0.3816466605957546, + "learning_rate": 
8.158690900247983e-05, + "loss": 2.9895, + "step": 22817 + }, + { + "epoch": 1.0623414111786205, + "grad_norm": 0.33399340286274387, + "learning_rate": 8.158480919276684e-05, + "loss": 2.901, + "step": 22818 + }, + { + "epoch": 1.0623879693647136, + "grad_norm": 0.3775927057162003, + "learning_rate": 8.158270929035554e-05, + "loss": 2.8958, + "step": 22819 + }, + { + "epoch": 1.0624345275508067, + "grad_norm": 0.346766457336317, + "learning_rate": 8.158060929525211e-05, + "loss": 2.7617, + "step": 22820 + }, + { + "epoch": 1.0624810857368996, + "grad_norm": 0.3567537409289081, + "learning_rate": 8.15785092074627e-05, + "loss": 2.9042, + "step": 22821 + }, + { + "epoch": 1.0625276439229927, + "grad_norm": 0.3645920737139877, + "learning_rate": 8.157640902699349e-05, + "loss": 2.7697, + "step": 22822 + }, + { + "epoch": 1.0625742021090858, + "grad_norm": 0.37158817845277575, + "learning_rate": 8.157430875385061e-05, + "loss": 2.8881, + "step": 22823 + }, + { + "epoch": 1.062620760295179, + "grad_norm": 0.3734651123318791, + "learning_rate": 8.157220838804025e-05, + "loss": 2.8296, + "step": 22824 + }, + { + "epoch": 1.062667318481272, + "grad_norm": 0.3235892376681929, + "learning_rate": 8.157010792956857e-05, + "loss": 2.7917, + "step": 22825 + }, + { + "epoch": 1.062713876667365, + "grad_norm": 0.38899646319389214, + "learning_rate": 8.156800737844175e-05, + "loss": 2.8498, + "step": 22826 + }, + { + "epoch": 1.062760434853458, + "grad_norm": 0.33733939709670674, + "learning_rate": 8.15659067346659e-05, + "loss": 2.8156, + "step": 22827 + }, + { + "epoch": 1.0628069930395512, + "grad_norm": 0.36566004364122917, + "learning_rate": 8.156380599824725e-05, + "loss": 2.8851, + "step": 22828 + }, + { + "epoch": 1.0628535512256443, + "grad_norm": 0.32091323831849006, + "learning_rate": 8.156170516919193e-05, + "loss": 2.8077, + "step": 22829 + }, + { + "epoch": 1.0629001094117374, + "grad_norm": 0.3698570021828402, + "learning_rate": 8.155960424750613e-05, + "loss": 2.8354, + "step": 22830 + }, + { + "epoch": 1.0629466675978303, + "grad_norm": 0.34887770794676415, + "learning_rate": 8.155750323319599e-05, + "loss": 2.881, + "step": 22831 + }, + { + "epoch": 1.0629932257839234, + "grad_norm": 0.360681521240873, + "learning_rate": 8.155540212626768e-05, + "loss": 2.8332, + "step": 22832 + }, + { + "epoch": 1.0630397839700165, + "grad_norm": 0.33489405495459157, + "learning_rate": 8.155330092672738e-05, + "loss": 2.7867, + "step": 22833 + }, + { + "epoch": 1.0630863421561096, + "grad_norm": 0.3800686035423158, + "learning_rate": 8.155119963458124e-05, + "loss": 2.8472, + "step": 22834 + }, + { + "epoch": 1.0631329003422028, + "grad_norm": 0.3260089915505418, + "learning_rate": 8.154909824983546e-05, + "loss": 2.8511, + "step": 22835 + }, + { + "epoch": 1.0631794585282957, + "grad_norm": 0.3282917302290211, + "learning_rate": 8.154699677249618e-05, + "loss": 2.8522, + "step": 22836 + }, + { + "epoch": 1.0632260167143888, + "grad_norm": 0.3860908639678273, + "learning_rate": 8.154489520256956e-05, + "loss": 2.9238, + "step": 22837 + }, + { + "epoch": 1.0632725749004819, + "grad_norm": 0.34086849941483405, + "learning_rate": 8.154279354006179e-05, + "loss": 2.7602, + "step": 22838 + }, + { + "epoch": 1.063319133086575, + "grad_norm": 0.34537672518712564, + "learning_rate": 8.1540691784979e-05, + "loss": 2.8958, + "step": 22839 + }, + { + "epoch": 1.0633656912726681, + "grad_norm": 0.35486156620902265, + "learning_rate": 8.153858993732743e-05, + "loss": 2.9061, + "step": 22840 + }, + { + "epoch": 
1.063412249458761, + "grad_norm": 0.3120901022605622, + "learning_rate": 8.153648799711318e-05, + "loss": 2.9684, + "step": 22841 + }, + { + "epoch": 1.0634588076448541, + "grad_norm": 0.39398709258676334, + "learning_rate": 8.153438596434246e-05, + "loss": 2.8264, + "step": 22842 + }, + { + "epoch": 1.0635053658309472, + "grad_norm": 0.3220080315668482, + "learning_rate": 8.153228383902141e-05, + "loss": 2.836, + "step": 22843 + }, + { + "epoch": 1.0635519240170404, + "grad_norm": 0.35652493217024817, + "learning_rate": 8.153018162115623e-05, + "loss": 2.8323, + "step": 22844 + }, + { + "epoch": 1.0635984822031335, + "grad_norm": 0.3077794125700411, + "learning_rate": 8.152807931075304e-05, + "loss": 2.9682, + "step": 22845 + }, + { + "epoch": 1.0636450403892264, + "grad_norm": 0.3693528935003607, + "learning_rate": 8.152597690781807e-05, + "loss": 2.7939, + "step": 22846 + }, + { + "epoch": 1.0636915985753195, + "grad_norm": 0.34808827588406593, + "learning_rate": 8.152387441235745e-05, + "loss": 2.8933, + "step": 22847 + }, + { + "epoch": 1.0637381567614126, + "grad_norm": 0.299341183948404, + "learning_rate": 8.152177182437738e-05, + "loss": 2.7692, + "step": 22848 + }, + { + "epoch": 1.0637847149475057, + "grad_norm": 0.324192187753196, + "learning_rate": 8.151966914388401e-05, + "loss": 2.8397, + "step": 22849 + }, + { + "epoch": 1.0638312731335988, + "grad_norm": 0.3361855051207049, + "learning_rate": 8.151756637088351e-05, + "loss": 2.7811, + "step": 22850 + }, + { + "epoch": 1.0638778313196917, + "grad_norm": 0.3378138979583001, + "learning_rate": 8.151546350538206e-05, + "loss": 2.8621, + "step": 22851 + }, + { + "epoch": 1.0639243895057848, + "grad_norm": 0.32690079494244323, + "learning_rate": 8.151336054738583e-05, + "loss": 2.8967, + "step": 22852 + }, + { + "epoch": 1.063970947691878, + "grad_norm": 0.3319239497193315, + "learning_rate": 8.151125749690101e-05, + "loss": 2.8898, + "step": 22853 + }, + { + "epoch": 1.064017505877971, + "grad_norm": 0.31791044636430704, + "learning_rate": 8.150915435393371e-05, + "loss": 2.8209, + "step": 22854 + }, + { + "epoch": 1.064064064064064, + "grad_norm": 0.32541345825644563, + "learning_rate": 8.150705111849018e-05, + "loss": 2.806, + "step": 22855 + }, + { + "epoch": 1.064110622250157, + "grad_norm": 0.3460172370856939, + "learning_rate": 8.150494779057656e-05, + "loss": 2.9016, + "step": 22856 + }, + { + "epoch": 1.0641571804362502, + "grad_norm": 0.3183208869027999, + "learning_rate": 8.1502844370199e-05, + "loss": 2.83, + "step": 22857 + }, + { + "epoch": 1.0642037386223433, + "grad_norm": 0.36914114188659675, + "learning_rate": 8.150074085736373e-05, + "loss": 2.8482, + "step": 22858 + }, + { + "epoch": 1.0642502968084364, + "grad_norm": 0.32816213036612424, + "learning_rate": 8.149863725207684e-05, + "loss": 2.7826, + "step": 22859 + }, + { + "epoch": 1.0642968549945295, + "grad_norm": 0.31149963439654943, + "learning_rate": 8.14965335543446e-05, + "loss": 2.8098, + "step": 22860 + }, + { + "epoch": 1.0643434131806224, + "grad_norm": 0.32670007702351384, + "learning_rate": 8.14944297641731e-05, + "loss": 2.875, + "step": 22861 + }, + { + "epoch": 1.0643899713667155, + "grad_norm": 0.33252993924504043, + "learning_rate": 8.149232588156858e-05, + "loss": 2.813, + "step": 22862 + }, + { + "epoch": 1.0644365295528087, + "grad_norm": 0.3151948014780078, + "learning_rate": 8.149022190653717e-05, + "loss": 2.854, + "step": 22863 + }, + { + "epoch": 1.0644830877389018, + "grad_norm": 0.31753346226827245, + "learning_rate": 
8.148811783908506e-05, + "loss": 2.8512, + "step": 22864 + }, + { + "epoch": 1.0645296459249947, + "grad_norm": 0.3086485122670442, + "learning_rate": 8.148601367921843e-05, + "loss": 2.9328, + "step": 22865 + }, + { + "epoch": 1.0645762041110878, + "grad_norm": 0.3109644334113784, + "learning_rate": 8.148390942694345e-05, + "loss": 2.8083, + "step": 22866 + }, + { + "epoch": 1.064622762297181, + "grad_norm": 0.33238912180072444, + "learning_rate": 8.14818050822663e-05, + "loss": 2.9032, + "step": 22867 + }, + { + "epoch": 1.064669320483274, + "grad_norm": 0.315057399605973, + "learning_rate": 8.147970064519317e-05, + "loss": 2.8126, + "step": 22868 + }, + { + "epoch": 1.0647158786693671, + "grad_norm": 0.331237888073195, + "learning_rate": 8.14775961157302e-05, + "loss": 2.775, + "step": 22869 + }, + { + "epoch": 1.06476243685546, + "grad_norm": 0.3358383106060512, + "learning_rate": 8.147549149388358e-05, + "loss": 2.7817, + "step": 22870 + }, + { + "epoch": 1.0648089950415531, + "grad_norm": 0.3623995064664076, + "learning_rate": 8.147338677965953e-05, + "loss": 2.8956, + "step": 22871 + }, + { + "epoch": 1.0648555532276462, + "grad_norm": 0.34382251133362435, + "learning_rate": 8.147128197306415e-05, + "loss": 2.7761, + "step": 22872 + }, + { + "epoch": 1.0649021114137394, + "grad_norm": 0.3386380550358278, + "learning_rate": 8.146917707410366e-05, + "loss": 2.8222, + "step": 22873 + }, + { + "epoch": 1.0649486695998325, + "grad_norm": 0.34658556276074226, + "learning_rate": 8.146707208278425e-05, + "loss": 2.7346, + "step": 22874 + }, + { + "epoch": 1.0649952277859254, + "grad_norm": 0.32422199546862984, + "learning_rate": 8.146496699911208e-05, + "loss": 2.8353, + "step": 22875 + }, + { + "epoch": 1.0650417859720185, + "grad_norm": 0.3515638073186797, + "learning_rate": 8.146286182309335e-05, + "loss": 2.7442, + "step": 22876 + }, + { + "epoch": 1.0650883441581116, + "grad_norm": 0.31724410571817235, + "learning_rate": 8.14607565547342e-05, + "loss": 2.8661, + "step": 22877 + }, + { + "epoch": 1.0651349023442047, + "grad_norm": 0.35704225834206355, + "learning_rate": 8.145865119404081e-05, + "loss": 2.8348, + "step": 22878 + }, + { + "epoch": 1.0651814605302978, + "grad_norm": 0.3086948463769818, + "learning_rate": 8.14565457410194e-05, + "loss": 2.795, + "step": 22879 + }, + { + "epoch": 1.0652280187163907, + "grad_norm": 0.3813945277328126, + "learning_rate": 8.145444019567614e-05, + "loss": 2.9156, + "step": 22880 + }, + { + "epoch": 1.0652745769024838, + "grad_norm": 0.3646160251030981, + "learning_rate": 8.145233455801719e-05, + "loss": 2.8457, + "step": 22881 + }, + { + "epoch": 1.065321135088577, + "grad_norm": 0.35445441738846917, + "learning_rate": 8.145022882804873e-05, + "loss": 2.8383, + "step": 22882 + }, + { + "epoch": 1.06536769327467, + "grad_norm": 0.3529609895825357, + "learning_rate": 8.144812300577693e-05, + "loss": 2.9003, + "step": 22883 + }, + { + "epoch": 1.0654142514607632, + "grad_norm": 0.3305648955589079, + "learning_rate": 8.1446017091208e-05, + "loss": 2.8667, + "step": 22884 + }, + { + "epoch": 1.065460809646856, + "grad_norm": 0.3163207074812139, + "learning_rate": 8.144391108434813e-05, + "loss": 2.9596, + "step": 22885 + }, + { + "epoch": 1.0655073678329492, + "grad_norm": 0.34431795044711694, + "learning_rate": 8.144180498520346e-05, + "loss": 2.758, + "step": 22886 + }, + { + "epoch": 1.0655539260190423, + "grad_norm": 0.31678000687616154, + "learning_rate": 8.14396987937802e-05, + "loss": 2.8522, + "step": 22887 + }, + { + "epoch": 
1.0656004842051354, + "grad_norm": 0.36310633609243625, + "learning_rate": 8.14375925100845e-05, + "loss": 2.827, + "step": 22888 + }, + { + "epoch": 1.0656470423912285, + "grad_norm": 0.3173761954943226, + "learning_rate": 8.143548613412258e-05, + "loss": 2.9034, + "step": 22889 + }, + { + "epoch": 1.0656936005773214, + "grad_norm": 0.3633030507983586, + "learning_rate": 8.143337966590058e-05, + "loss": 2.8251, + "step": 22890 + }, + { + "epoch": 1.0657401587634145, + "grad_norm": 0.3411907707388206, + "learning_rate": 8.143127310542475e-05, + "loss": 2.8097, + "step": 22891 + }, + { + "epoch": 1.0657867169495077, + "grad_norm": 0.3297522690745922, + "learning_rate": 8.14291664527012e-05, + "loss": 2.7986, + "step": 22892 + }, + { + "epoch": 1.0658332751356008, + "grad_norm": 0.3072290821683601, + "learning_rate": 8.142705970773614e-05, + "loss": 2.7978, + "step": 22893 + }, + { + "epoch": 1.0658798333216937, + "grad_norm": 0.32560177948366076, + "learning_rate": 8.142495287053577e-05, + "loss": 2.7974, + "step": 22894 + }, + { + "epoch": 1.0659263915077868, + "grad_norm": 0.33258831202191946, + "learning_rate": 8.142284594110626e-05, + "loss": 2.8545, + "step": 22895 + }, + { + "epoch": 1.06597294969388, + "grad_norm": 0.3511931453481193, + "learning_rate": 8.142073891945379e-05, + "loss": 2.7848, + "step": 22896 + }, + { + "epoch": 1.066019507879973, + "grad_norm": 0.3461249919704258, + "learning_rate": 8.141863180558453e-05, + "loss": 2.7298, + "step": 22897 + }, + { + "epoch": 1.0660660660660661, + "grad_norm": 0.3456221077927683, + "learning_rate": 8.14165245995047e-05, + "loss": 2.7916, + "step": 22898 + }, + { + "epoch": 1.0661126242521592, + "grad_norm": 0.35974057947685356, + "learning_rate": 8.141441730122047e-05, + "loss": 2.8183, + "step": 22899 + }, + { + "epoch": 1.0661591824382521, + "grad_norm": 0.3158227431323407, + "learning_rate": 8.141230991073802e-05, + "loss": 2.9387, + "step": 22900 + }, + { + "epoch": 1.0662057406243453, + "grad_norm": 0.3528747002268111, + "learning_rate": 8.141020242806352e-05, + "loss": 2.8285, + "step": 22901 + }, + { + "epoch": 1.0662522988104384, + "grad_norm": 0.32968831983890867, + "learning_rate": 8.140809485320318e-05, + "loss": 2.7588, + "step": 22902 + }, + { + "epoch": 1.0662988569965315, + "grad_norm": 0.3305596710465386, + "learning_rate": 8.140598718616316e-05, + "loss": 2.7083, + "step": 22903 + }, + { + "epoch": 1.0663454151826244, + "grad_norm": 0.32951689726279615, + "learning_rate": 8.140387942694969e-05, + "loss": 2.8241, + "step": 22904 + }, + { + "epoch": 1.0663919733687175, + "grad_norm": 0.3388772836307602, + "learning_rate": 8.14017715755689e-05, + "loss": 2.7359, + "step": 22905 + }, + { + "epoch": 1.0664385315548106, + "grad_norm": 0.30626914198973887, + "learning_rate": 8.139966363202703e-05, + "loss": 2.8521, + "step": 22906 + }, + { + "epoch": 1.0664850897409037, + "grad_norm": 0.3365289214349202, + "learning_rate": 8.139755559633022e-05, + "loss": 2.8039, + "step": 22907 + }, + { + "epoch": 1.0665316479269968, + "grad_norm": 0.31510021449437753, + "learning_rate": 8.139544746848469e-05, + "loss": 2.8184, + "step": 22908 + }, + { + "epoch": 1.0665782061130897, + "grad_norm": 0.3954525819805711, + "learning_rate": 8.139333924849659e-05, + "loss": 2.7147, + "step": 22909 + }, + { + "epoch": 1.0666247642991828, + "grad_norm": 0.3043680853571148, + "learning_rate": 8.139123093637215e-05, + "loss": 2.7628, + "step": 22910 + }, + { + "epoch": 1.066671322485276, + "grad_norm": 0.35244846730425783, + "learning_rate": 
8.138912253211754e-05, + "loss": 2.8495, + "step": 22911 + }, + { + "epoch": 1.066717880671369, + "grad_norm": 0.3518488295485771, + "learning_rate": 8.138701403573896e-05, + "loss": 2.9025, + "step": 22912 + }, + { + "epoch": 1.0667644388574622, + "grad_norm": 0.31839579083348224, + "learning_rate": 8.138490544724258e-05, + "loss": 2.8388, + "step": 22913 + }, + { + "epoch": 1.066810997043555, + "grad_norm": 0.3471414121625963, + "learning_rate": 8.138279676663458e-05, + "loss": 2.7725, + "step": 22914 + }, + { + "epoch": 1.0668575552296482, + "grad_norm": 0.30250534590335254, + "learning_rate": 8.138068799392116e-05, + "loss": 2.8486, + "step": 22915 + }, + { + "epoch": 1.0669041134157413, + "grad_norm": 0.31426798458823274, + "learning_rate": 8.137857912910854e-05, + "loss": 2.7549, + "step": 22916 + }, + { + "epoch": 1.0669506716018344, + "grad_norm": 0.34693014793951593, + "learning_rate": 8.137647017220286e-05, + "loss": 2.8063, + "step": 22917 + }, + { + "epoch": 1.0669972297879275, + "grad_norm": 0.32654898404037536, + "learning_rate": 8.137436112321033e-05, + "loss": 2.8771, + "step": 22918 + }, + { + "epoch": 1.0670437879740204, + "grad_norm": 0.35974068008795396, + "learning_rate": 8.137225198213715e-05, + "loss": 2.7954, + "step": 22919 + }, + { + "epoch": 1.0670903461601136, + "grad_norm": 0.30076225664093975, + "learning_rate": 8.13701427489895e-05, + "loss": 2.8063, + "step": 22920 + }, + { + "epoch": 1.0671369043462067, + "grad_norm": 0.3533065871730509, + "learning_rate": 8.136803342377357e-05, + "loss": 2.8699, + "step": 22921 + }, + { + "epoch": 1.0671834625322998, + "grad_norm": 0.3196631564079811, + "learning_rate": 8.136592400649555e-05, + "loss": 2.7875, + "step": 22922 + }, + { + "epoch": 1.067230020718393, + "grad_norm": 0.33790328648406176, + "learning_rate": 8.136381449716164e-05, + "loss": 2.7573, + "step": 22923 + }, + { + "epoch": 1.0672765789044858, + "grad_norm": 0.3405619626422945, + "learning_rate": 8.136170489577801e-05, + "loss": 2.7893, + "step": 22924 + }, + { + "epoch": 1.067323137090579, + "grad_norm": 0.3337238997229289, + "learning_rate": 8.135959520235088e-05, + "loss": 2.7747, + "step": 22925 + }, + { + "epoch": 1.067369695276672, + "grad_norm": 0.32917481296912887, + "learning_rate": 8.135748541688644e-05, + "loss": 2.7926, + "step": 22926 + }, + { + "epoch": 1.0674162534627651, + "grad_norm": 0.31215934820748054, + "learning_rate": 8.135537553939083e-05, + "loss": 2.7725, + "step": 22927 + }, + { + "epoch": 1.0674628116488583, + "grad_norm": 0.35390315095975505, + "learning_rate": 8.135326556987031e-05, + "loss": 2.8152, + "step": 22928 + }, + { + "epoch": 1.0675093698349511, + "grad_norm": 0.30632530329678087, + "learning_rate": 8.135115550833104e-05, + "loss": 2.7946, + "step": 22929 + }, + { + "epoch": 1.0675559280210443, + "grad_norm": 0.3525705298966246, + "learning_rate": 8.134904535477921e-05, + "loss": 2.9537, + "step": 22930 + }, + { + "epoch": 1.0676024862071374, + "grad_norm": 0.3097856146222624, + "learning_rate": 8.134693510922102e-05, + "loss": 2.851, + "step": 22931 + }, + { + "epoch": 1.0676490443932305, + "grad_norm": 0.3526000323395083, + "learning_rate": 8.134482477166268e-05, + "loss": 2.946, + "step": 22932 + }, + { + "epoch": 1.0676956025793236, + "grad_norm": 0.32610604920045616, + "learning_rate": 8.134271434211036e-05, + "loss": 2.7946, + "step": 22933 + }, + { + "epoch": 1.0677421607654165, + "grad_norm": 0.3228076769050629, + "learning_rate": 8.134060382057026e-05, + "loss": 2.8442, + "step": 22934 + }, + { + "epoch": 
1.0677887189515096, + "grad_norm": 0.3262169331251844, + "learning_rate": 8.133849320704857e-05, + "loss": 2.8325, + "step": 22935 + }, + { + "epoch": 1.0678352771376027, + "grad_norm": 0.3409630424840796, + "learning_rate": 8.133638250155149e-05, + "loss": 2.7647, + "step": 22936 + }, + { + "epoch": 1.0678818353236958, + "grad_norm": 0.3184800611450415, + "learning_rate": 8.133427170408523e-05, + "loss": 2.8594, + "step": 22937 + }, + { + "epoch": 1.067928393509789, + "grad_norm": 0.3282117982448999, + "learning_rate": 8.133216081465594e-05, + "loss": 2.8058, + "step": 22938 + }, + { + "epoch": 1.0679749516958819, + "grad_norm": 0.34583726807079623, + "learning_rate": 8.133004983326987e-05, + "loss": 2.7893, + "step": 22939 + }, + { + "epoch": 1.068021509881975, + "grad_norm": 0.3373002378756214, + "learning_rate": 8.132793875993318e-05, + "loss": 2.9247, + "step": 22940 + }, + { + "epoch": 1.068068068068068, + "grad_norm": 0.3626595075570545, + "learning_rate": 8.132582759465208e-05, + "loss": 2.8897, + "step": 22941 + }, + { + "epoch": 1.0681146262541612, + "grad_norm": 0.34300527217184273, + "learning_rate": 8.132371633743276e-05, + "loss": 2.827, + "step": 22942 + }, + { + "epoch": 1.068161184440254, + "grad_norm": 0.33960889940376243, + "learning_rate": 8.132160498828143e-05, + "loss": 2.8291, + "step": 22943 + }, + { + "epoch": 1.0682077426263472, + "grad_norm": 0.33565556398224483, + "learning_rate": 8.131949354720426e-05, + "loss": 2.8422, + "step": 22944 + }, + { + "epoch": 1.0682543008124403, + "grad_norm": 0.3204922125645282, + "learning_rate": 8.131738201420747e-05, + "loss": 2.9297, + "step": 22945 + }, + { + "epoch": 1.0683008589985334, + "grad_norm": 0.31653852653031606, + "learning_rate": 8.131527038929723e-05, + "loss": 2.7769, + "step": 22946 + }, + { + "epoch": 1.0683474171846266, + "grad_norm": 0.3198422482156504, + "learning_rate": 8.131315867247978e-05, + "loss": 2.853, + "step": 22947 + }, + { + "epoch": 1.0683939753707197, + "grad_norm": 0.32288919826974566, + "learning_rate": 8.131104686376129e-05, + "loss": 2.8348, + "step": 22948 + }, + { + "epoch": 1.0684405335568126, + "grad_norm": 0.3080171566479922, + "learning_rate": 8.130893496314795e-05, + "loss": 2.7734, + "step": 22949 + }, + { + "epoch": 1.0684870917429057, + "grad_norm": 0.30733619356538705, + "learning_rate": 8.130682297064597e-05, + "loss": 2.8385, + "step": 22950 + }, + { + "epoch": 1.0685336499289988, + "grad_norm": 0.3188133653525264, + "learning_rate": 8.130471088626157e-05, + "loss": 2.8443, + "step": 22951 + }, + { + "epoch": 1.068580208115092, + "grad_norm": 0.3737529366481776, + "learning_rate": 8.130259871000092e-05, + "loss": 2.8231, + "step": 22952 + }, + { + "epoch": 1.0686267663011848, + "grad_norm": 0.3352097829107982, + "learning_rate": 8.130048644187021e-05, + "loss": 2.7221, + "step": 22953 + }, + { + "epoch": 1.068673324487278, + "grad_norm": 0.31493385453234224, + "learning_rate": 8.129837408187566e-05, + "loss": 2.8069, + "step": 22954 + }, + { + "epoch": 1.068719882673371, + "grad_norm": 0.3607992639472302, + "learning_rate": 8.129626163002349e-05, + "loss": 2.8868, + "step": 22955 + }, + { + "epoch": 1.0687664408594642, + "grad_norm": 0.329316378523256, + "learning_rate": 8.129414908631985e-05, + "loss": 2.7758, + "step": 22956 + }, + { + "epoch": 1.0688129990455573, + "grad_norm": 0.35985968472384733, + "learning_rate": 8.129203645077097e-05, + "loss": 2.7801, + "step": 22957 + }, + { + "epoch": 1.0688595572316502, + "grad_norm": 0.3272149889567515, + "learning_rate": 
8.128992372338305e-05, + "loss": 2.7552, + "step": 22958 + }, + { + "epoch": 1.0689061154177433, + "grad_norm": 0.33904084688665015, + "learning_rate": 8.128781090416228e-05, + "loss": 2.8347, + "step": 22959 + }, + { + "epoch": 1.0689526736038364, + "grad_norm": 0.32637944293544, + "learning_rate": 8.128569799311488e-05, + "loss": 2.7094, + "step": 22960 + }, + { + "epoch": 1.0689992317899295, + "grad_norm": 0.3525250430841987, + "learning_rate": 8.128358499024703e-05, + "loss": 2.8577, + "step": 22961 + }, + { + "epoch": 1.0690457899760226, + "grad_norm": 0.3044518091832155, + "learning_rate": 8.128147189556494e-05, + "loss": 2.8954, + "step": 22962 + }, + { + "epoch": 1.0690923481621155, + "grad_norm": 0.3385394236232162, + "learning_rate": 8.127935870907482e-05, + "loss": 2.7506, + "step": 22963 + }, + { + "epoch": 1.0691389063482086, + "grad_norm": 0.29708674480267516, + "learning_rate": 8.127724543078286e-05, + "loss": 2.8445, + "step": 22964 + }, + { + "epoch": 1.0691854645343017, + "grad_norm": 0.32656882007271965, + "learning_rate": 8.127513206069525e-05, + "loss": 2.8289, + "step": 22965 + }, + { + "epoch": 1.0692320227203949, + "grad_norm": 0.3034318415627645, + "learning_rate": 8.127301859881822e-05, + "loss": 2.9057, + "step": 22966 + }, + { + "epoch": 1.069278580906488, + "grad_norm": 0.34918040171297726, + "learning_rate": 8.127090504515797e-05, + "loss": 2.7251, + "step": 22967 + }, + { + "epoch": 1.0693251390925809, + "grad_norm": 0.3415835080787357, + "learning_rate": 8.126879139972068e-05, + "loss": 2.9319, + "step": 22968 + }, + { + "epoch": 1.069371697278674, + "grad_norm": 0.356038531718173, + "learning_rate": 8.126667766251258e-05, + "loss": 2.9288, + "step": 22969 + }, + { + "epoch": 1.069418255464767, + "grad_norm": 0.34989342948213215, + "learning_rate": 8.126456383353985e-05, + "loss": 2.8839, + "step": 22970 + }, + { + "epoch": 1.0694648136508602, + "grad_norm": 0.33603572155876626, + "learning_rate": 8.126244991280872e-05, + "loss": 2.8671, + "step": 22971 + }, + { + "epoch": 1.0695113718369533, + "grad_norm": 0.3058473598090409, + "learning_rate": 8.126033590032535e-05, + "loss": 2.8053, + "step": 22972 + }, + { + "epoch": 1.0695579300230462, + "grad_norm": 0.32364050505461095, + "learning_rate": 8.125822179609602e-05, + "loss": 2.7504, + "step": 22973 + }, + { + "epoch": 1.0696044882091393, + "grad_norm": 0.3091915562674597, + "learning_rate": 8.125610760012686e-05, + "loss": 2.9504, + "step": 22974 + }, + { + "epoch": 1.0696510463952325, + "grad_norm": 0.31785311116770104, + "learning_rate": 8.12539933124241e-05, + "loss": 2.803, + "step": 22975 + }, + { + "epoch": 1.0696976045813256, + "grad_norm": 0.3113315122784824, + "learning_rate": 8.125187893299396e-05, + "loss": 2.7712, + "step": 22976 + }, + { + "epoch": 1.0697441627674187, + "grad_norm": 0.3350962318406503, + "learning_rate": 8.124976446184263e-05, + "loss": 2.848, + "step": 22977 + }, + { + "epoch": 1.0697907209535116, + "grad_norm": 0.3425533313992694, + "learning_rate": 8.124764989897631e-05, + "loss": 2.7281, + "step": 22978 + }, + { + "epoch": 1.0698372791396047, + "grad_norm": 0.3174755005116539, + "learning_rate": 8.124553524440122e-05, + "loss": 2.7022, + "step": 22979 + }, + { + "epoch": 1.0698838373256978, + "grad_norm": 0.3513072681275377, + "learning_rate": 8.124342049812357e-05, + "loss": 2.7843, + "step": 22980 + }, + { + "epoch": 1.069930395511791, + "grad_norm": 0.3429409019000956, + "learning_rate": 8.124130566014955e-05, + "loss": 2.7479, + "step": 22981 + }, + { + "epoch": 
1.0699769536978838, + "grad_norm": 0.3565145937010906, + "learning_rate": 8.123919073048539e-05, + "loss": 2.781, + "step": 22982 + }, + { + "epoch": 1.070023511883977, + "grad_norm": 0.34285642034701885, + "learning_rate": 8.123707570913728e-05, + "loss": 2.9494, + "step": 22983 + }, + { + "epoch": 1.07007007007007, + "grad_norm": 0.33514159579567165, + "learning_rate": 8.123496059611142e-05, + "loss": 2.7812, + "step": 22984 + }, + { + "epoch": 1.0701166282561632, + "grad_norm": 0.3523527157014966, + "learning_rate": 8.123284539141404e-05, + "loss": 2.8707, + "step": 22985 + }, + { + "epoch": 1.0701631864422563, + "grad_norm": 0.3407875982848166, + "learning_rate": 8.123073009505132e-05, + "loss": 2.7637, + "step": 22986 + }, + { + "epoch": 1.0702097446283494, + "grad_norm": 0.3289145727641783, + "learning_rate": 8.122861470702949e-05, + "loss": 2.8048, + "step": 22987 + }, + { + "epoch": 1.0702563028144423, + "grad_norm": 0.3513219054046168, + "learning_rate": 8.122649922735475e-05, + "loss": 2.7039, + "step": 22988 + }, + { + "epoch": 1.0703028610005354, + "grad_norm": 0.34337645604335126, + "learning_rate": 8.122438365603331e-05, + "loss": 2.879, + "step": 22989 + }, + { + "epoch": 1.0703494191866285, + "grad_norm": 0.3729729897319149, + "learning_rate": 8.12222679930714e-05, + "loss": 2.8354, + "step": 22990 + }, + { + "epoch": 1.0703959773727216, + "grad_norm": 0.33819367998633265, + "learning_rate": 8.122015223847519e-05, + "loss": 2.9445, + "step": 22991 + }, + { + "epoch": 1.0704425355588145, + "grad_norm": 0.3258802031456565, + "learning_rate": 8.121803639225091e-05, + "loss": 2.7293, + "step": 22992 + }, + { + "epoch": 1.0704890937449076, + "grad_norm": 0.3566417646351434, + "learning_rate": 8.121592045440478e-05, + "loss": 2.8397, + "step": 22993 + }, + { + "epoch": 1.0705356519310008, + "grad_norm": 0.32485277384521416, + "learning_rate": 8.1213804424943e-05, + "loss": 2.9146, + "step": 22994 + }, + { + "epoch": 1.0705822101170939, + "grad_norm": 0.35400707944985443, + "learning_rate": 8.121168830387176e-05, + "loss": 2.8382, + "step": 22995 + }, + { + "epoch": 1.070628768303187, + "grad_norm": 0.3161522191174665, + "learning_rate": 8.12095720911973e-05, + "loss": 2.723, + "step": 22996 + }, + { + "epoch": 1.0706753264892799, + "grad_norm": 0.3121516114800423, + "learning_rate": 8.120745578692582e-05, + "loss": 2.8669, + "step": 22997 + }, + { + "epoch": 1.070721884675373, + "grad_norm": 0.36392075382787376, + "learning_rate": 8.120533939106354e-05, + "loss": 2.792, + "step": 22998 + }, + { + "epoch": 1.070768442861466, + "grad_norm": 0.3317158507858396, + "learning_rate": 8.120322290361665e-05, + "loss": 2.8236, + "step": 22999 + }, + { + "epoch": 1.0708150010475592, + "grad_norm": 0.3827458403811071, + "learning_rate": 8.120110632459137e-05, + "loss": 2.6912, + "step": 23000 + }, + { + "epoch": 1.0708615592336523, + "grad_norm": 0.31562533132841547, + "learning_rate": 8.119898965399393e-05, + "loss": 2.8459, + "step": 23001 + }, + { + "epoch": 1.0709081174197452, + "grad_norm": 0.36391147127978324, + "learning_rate": 8.119687289183054e-05, + "loss": 2.8239, + "step": 23002 + }, + { + "epoch": 1.0709546756058383, + "grad_norm": 0.33120468779964285, + "learning_rate": 8.119475603810737e-05, + "loss": 2.823, + "step": 23003 + }, + { + "epoch": 1.0710012337919315, + "grad_norm": 0.36047266360531777, + "learning_rate": 8.119263909283069e-05, + "loss": 2.9181, + "step": 23004 + }, + { + "epoch": 1.0710477919780246, + "grad_norm": 0.32121714121688677, + "learning_rate": 
8.119052205600667e-05, + "loss": 2.7672, + "step": 23005 + }, + { + "epoch": 1.0710943501641177, + "grad_norm": 0.35665122551641315, + "learning_rate": 8.118840492764157e-05, + "loss": 2.7041, + "step": 23006 + }, + { + "epoch": 1.0711409083502106, + "grad_norm": 0.31593507184435005, + "learning_rate": 8.118628770774155e-05, + "loss": 2.8164, + "step": 23007 + }, + { + "epoch": 1.0711874665363037, + "grad_norm": 0.34159567189502343, + "learning_rate": 8.118417039631285e-05, + "loss": 2.7773, + "step": 23008 + }, + { + "epoch": 1.0712340247223968, + "grad_norm": 0.3133956309074034, + "learning_rate": 8.118205299336168e-05, + "loss": 2.9121, + "step": 23009 + }, + { + "epoch": 1.07128058290849, + "grad_norm": 0.3372451559379932, + "learning_rate": 8.117993549889425e-05, + "loss": 2.6777, + "step": 23010 + }, + { + "epoch": 1.071327141094583, + "grad_norm": 0.31488988613173824, + "learning_rate": 8.117781791291681e-05, + "loss": 2.7032, + "step": 23011 + }, + { + "epoch": 1.071373699280676, + "grad_norm": 0.3411841937255037, + "learning_rate": 8.117570023543552e-05, + "loss": 2.8489, + "step": 23012 + }, + { + "epoch": 1.071420257466769, + "grad_norm": 0.31608959526227154, + "learning_rate": 8.117358246645662e-05, + "loss": 2.9296, + "step": 23013 + }, + { + "epoch": 1.0714668156528622, + "grad_norm": 0.3346738621113223, + "learning_rate": 8.117146460598633e-05, + "loss": 2.8327, + "step": 23014 + }, + { + "epoch": 1.0715133738389553, + "grad_norm": 0.3012404614949212, + "learning_rate": 8.116934665403087e-05, + "loss": 2.8552, + "step": 23015 + }, + { + "epoch": 1.0715599320250484, + "grad_norm": 0.3477471893426923, + "learning_rate": 8.116722861059644e-05, + "loss": 2.7764, + "step": 23016 + }, + { + "epoch": 1.0716064902111413, + "grad_norm": 0.2999252409412398, + "learning_rate": 8.116511047568929e-05, + "loss": 2.8055, + "step": 23017 + }, + { + "epoch": 1.0716530483972344, + "grad_norm": 0.3647422752709596, + "learning_rate": 8.116299224931558e-05, + "loss": 2.7825, + "step": 23018 + }, + { + "epoch": 1.0716996065833275, + "grad_norm": 0.32820815631329897, + "learning_rate": 8.116087393148156e-05, + "loss": 2.882, + "step": 23019 + }, + { + "epoch": 1.0717461647694206, + "grad_norm": 0.3433339338662264, + "learning_rate": 8.115875552219346e-05, + "loss": 2.8108, + "step": 23020 + }, + { + "epoch": 1.0717927229555138, + "grad_norm": 0.34013640664555456, + "learning_rate": 8.115663702145748e-05, + "loss": 2.785, + "step": 23021 + }, + { + "epoch": 1.0718392811416066, + "grad_norm": 0.3485326316849244, + "learning_rate": 8.115451842927984e-05, + "loss": 2.8332, + "step": 23022 + }, + { + "epoch": 1.0718858393276998, + "grad_norm": 0.3351607411277697, + "learning_rate": 8.115239974566676e-05, + "loss": 2.7725, + "step": 23023 + }, + { + "epoch": 1.0719323975137929, + "grad_norm": 0.33405722747132477, + "learning_rate": 8.115028097062443e-05, + "loss": 2.8279, + "step": 23024 + }, + { + "epoch": 1.071978955699886, + "grad_norm": 0.30321288436602556, + "learning_rate": 8.114816210415912e-05, + "loss": 2.8633, + "step": 23025 + }, + { + "epoch": 1.072025513885979, + "grad_norm": 0.3368176614650233, + "learning_rate": 8.114604314627701e-05, + "loss": 2.7774, + "step": 23026 + }, + { + "epoch": 1.072072072072072, + "grad_norm": 0.33047773997329233, + "learning_rate": 8.114392409698435e-05, + "loss": 2.8392, + "step": 23027 + }, + { + "epoch": 1.0721186302581651, + "grad_norm": 0.3052545779153116, + "learning_rate": 8.114180495628731e-05, + "loss": 2.7276, + "step": 23028 + }, + { + "epoch": 
1.0721651884442582, + "grad_norm": 0.3294701044299396, + "learning_rate": 8.113968572419217e-05, + "loss": 2.7681, + "step": 23029 + }, + { + "epoch": 1.0722117466303513, + "grad_norm": 0.3299567211183842, + "learning_rate": 8.113756640070511e-05, + "loss": 2.878, + "step": 23030 + }, + { + "epoch": 1.0722583048164442, + "grad_norm": 0.33568305621858624, + "learning_rate": 8.113544698583236e-05, + "loss": 2.8489, + "step": 23031 + }, + { + "epoch": 1.0723048630025374, + "grad_norm": 0.327821807236339, + "learning_rate": 8.113332747958014e-05, + "loss": 2.8699, + "step": 23032 + }, + { + "epoch": 1.0723514211886305, + "grad_norm": 0.32919211850233704, + "learning_rate": 8.113120788195467e-05, + "loss": 2.8259, + "step": 23033 + }, + { + "epoch": 1.0723979793747236, + "grad_norm": 0.33721302565819167, + "learning_rate": 8.112908819296217e-05, + "loss": 2.8284, + "step": 23034 + }, + { + "epoch": 1.0724445375608167, + "grad_norm": 0.3355068497206907, + "learning_rate": 8.112696841260887e-05, + "loss": 2.8764, + "step": 23035 + }, + { + "epoch": 1.0724910957469098, + "grad_norm": 0.35386668296892887, + "learning_rate": 8.112484854090096e-05, + "loss": 2.9284, + "step": 23036 + }, + { + "epoch": 1.0725376539330027, + "grad_norm": 0.3428346703732439, + "learning_rate": 8.11227285778447e-05, + "loss": 2.8237, + "step": 23037 + }, + { + "epoch": 1.0725842121190958, + "grad_norm": 0.3577347720514544, + "learning_rate": 8.112060852344632e-05, + "loss": 2.7987, + "step": 23038 + }, + { + "epoch": 1.072630770305189, + "grad_norm": 0.35991791508584897, + "learning_rate": 8.111848837771201e-05, + "loss": 2.8593, + "step": 23039 + }, + { + "epoch": 1.072677328491282, + "grad_norm": 0.3338023762558568, + "learning_rate": 8.111636814064799e-05, + "loss": 2.9261, + "step": 23040 + }, + { + "epoch": 1.072723886677375, + "grad_norm": 0.3333950436248223, + "learning_rate": 8.111424781226051e-05, + "loss": 2.8017, + "step": 23041 + }, + { + "epoch": 1.072770444863468, + "grad_norm": 0.3625781966319111, + "learning_rate": 8.111212739255577e-05, + "loss": 2.8299, + "step": 23042 + }, + { + "epoch": 1.0728170030495612, + "grad_norm": 0.31940327926237716, + "learning_rate": 8.111000688153999e-05, + "loss": 2.8689, + "step": 23043 + }, + { + "epoch": 1.0728635612356543, + "grad_norm": 0.365077716943073, + "learning_rate": 8.110788627921942e-05, + "loss": 2.8887, + "step": 23044 + }, + { + "epoch": 1.0729101194217474, + "grad_norm": 0.35185788277210034, + "learning_rate": 8.110576558560026e-05, + "loss": 2.7487, + "step": 23045 + }, + { + "epoch": 1.0729566776078403, + "grad_norm": 0.3348942675676699, + "learning_rate": 8.110364480068874e-05, + "loss": 2.8898, + "step": 23046 + }, + { + "epoch": 1.0730032357939334, + "grad_norm": 0.3800240478255495, + "learning_rate": 8.11015239244911e-05, + "loss": 2.8271, + "step": 23047 + }, + { + "epoch": 1.0730497939800265, + "grad_norm": 0.33894399356055366, + "learning_rate": 8.109940295701353e-05, + "loss": 2.8109, + "step": 23048 + }, + { + "epoch": 1.0730963521661196, + "grad_norm": 0.38298617196289125, + "learning_rate": 8.109728189826229e-05, + "loss": 2.7704, + "step": 23049 + }, + { + "epoch": 1.0731429103522128, + "grad_norm": 0.3678363135806929, + "learning_rate": 8.10951607482436e-05, + "loss": 2.7592, + "step": 23050 + }, + { + "epoch": 1.0731894685383057, + "grad_norm": 0.3539507317585997, + "learning_rate": 8.109303950696366e-05, + "loss": 2.928, + "step": 23051 + }, + { + "epoch": 1.0732360267243988, + "grad_norm": 0.3661757351735761, + "learning_rate": 
8.109091817442872e-05, + "loss": 2.9135, + "step": 23052 + }, + { + "epoch": 1.0732825849104919, + "grad_norm": 0.38555646566309765, + "learning_rate": 8.1088796750645e-05, + "loss": 2.8544, + "step": 23053 + }, + { + "epoch": 1.073329143096585, + "grad_norm": 0.3425999727468205, + "learning_rate": 8.108667523561872e-05, + "loss": 2.8385, + "step": 23054 + }, + { + "epoch": 1.0733757012826781, + "grad_norm": 0.36482938732729814, + "learning_rate": 8.10845536293561e-05, + "loss": 2.8528, + "step": 23055 + }, + { + "epoch": 1.073422259468771, + "grad_norm": 0.3802529773724813, + "learning_rate": 8.10824319318634e-05, + "loss": 2.7381, + "step": 23056 + }, + { + "epoch": 1.0734688176548641, + "grad_norm": 0.38100650884058795, + "learning_rate": 8.10803101431468e-05, + "loss": 2.7403, + "step": 23057 + }, + { + "epoch": 1.0735153758409572, + "grad_norm": 0.34449307705229537, + "learning_rate": 8.107818826321258e-05, + "loss": 2.8989, + "step": 23058 + }, + { + "epoch": 1.0735619340270504, + "grad_norm": 0.4002075295572134, + "learning_rate": 8.107606629206692e-05, + "loss": 2.7403, + "step": 23059 + }, + { + "epoch": 1.0736084922131435, + "grad_norm": 0.3486688264325251, + "learning_rate": 8.107394422971608e-05, + "loss": 2.6986, + "step": 23060 + }, + { + "epoch": 1.0736550503992364, + "grad_norm": 0.3617897229286262, + "learning_rate": 8.107182207616627e-05, + "loss": 2.8976, + "step": 23061 + }, + { + "epoch": 1.0737016085853295, + "grad_norm": 0.32282882878704433, + "learning_rate": 8.106969983142372e-05, + "loss": 2.7217, + "step": 23062 + }, + { + "epoch": 1.0737481667714226, + "grad_norm": 0.361427969349698, + "learning_rate": 8.106757749549465e-05, + "loss": 2.8576, + "step": 23063 + }, + { + "epoch": 1.0737947249575157, + "grad_norm": 0.33724272779479375, + "learning_rate": 8.106545506838532e-05, + "loss": 2.7433, + "step": 23064 + }, + { + "epoch": 1.0738412831436088, + "grad_norm": 0.3544061577243622, + "learning_rate": 8.106333255010194e-05, + "loss": 2.9057, + "step": 23065 + }, + { + "epoch": 1.0738878413297017, + "grad_norm": 0.3291748847418098, + "learning_rate": 8.106120994065072e-05, + "loss": 2.8963, + "step": 23066 + }, + { + "epoch": 1.0739343995157948, + "grad_norm": 0.32444646473240096, + "learning_rate": 8.105908724003792e-05, + "loss": 2.7496, + "step": 23067 + }, + { + "epoch": 1.073980957701888, + "grad_norm": 0.35236945475540277, + "learning_rate": 8.105696444826976e-05, + "loss": 2.7351, + "step": 23068 + }, + { + "epoch": 1.074027515887981, + "grad_norm": 0.3272297225210977, + "learning_rate": 8.105484156535248e-05, + "loss": 2.7478, + "step": 23069 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 0.3409715970548576, + "learning_rate": 8.105271859129227e-05, + "loss": 2.8844, + "step": 23070 + }, + { + "epoch": 1.074120632260167, + "grad_norm": 0.3269307545249925, + "learning_rate": 8.105059552609542e-05, + "loss": 2.8322, + "step": 23071 + }, + { + "epoch": 1.0741671904462602, + "grad_norm": 0.32886787312137256, + "learning_rate": 8.104847236976811e-05, + "loss": 2.7065, + "step": 23072 + }, + { + "epoch": 1.0742137486323533, + "grad_norm": 0.32526937568187503, + "learning_rate": 8.10463491223166e-05, + "loss": 2.8837, + "step": 23073 + }, + { + "epoch": 1.0742603068184464, + "grad_norm": 0.3646623315766045, + "learning_rate": 8.104422578374713e-05, + "loss": 2.7198, + "step": 23074 + }, + { + "epoch": 1.0743068650045395, + "grad_norm": 0.3222958640604594, + "learning_rate": 8.104210235406588e-05, + "loss": 2.8442, + "step": 23075 + }, + { + "epoch": 
1.0743534231906324, + "grad_norm": 0.3415616521341552, + "learning_rate": 8.103997883327915e-05, + "loss": 2.8637, + "step": 23076 + }, + { + "epoch": 1.0743999813767255, + "grad_norm": 0.36121627368832626, + "learning_rate": 8.103785522139314e-05, + "loss": 2.7668, + "step": 23077 + }, + { + "epoch": 1.0744465395628187, + "grad_norm": 0.3545222250617019, + "learning_rate": 8.103573151841407e-05, + "loss": 2.8147, + "step": 23078 + }, + { + "epoch": 1.0744930977489118, + "grad_norm": 0.36425046077145573, + "learning_rate": 8.103360772434818e-05, + "loss": 2.7719, + "step": 23079 + }, + { + "epoch": 1.0745396559350047, + "grad_norm": 0.34475858666285664, + "learning_rate": 8.103148383920171e-05, + "loss": 2.8417, + "step": 23080 + }, + { + "epoch": 1.0745862141210978, + "grad_norm": 0.36654554933265926, + "learning_rate": 8.10293598629809e-05, + "loss": 2.8414, + "step": 23081 + }, + { + "epoch": 1.074632772307191, + "grad_norm": 0.33090850135825395, + "learning_rate": 8.102723579569197e-05, + "loss": 2.8523, + "step": 23082 + }, + { + "epoch": 1.074679330493284, + "grad_norm": 0.35397724246786205, + "learning_rate": 8.102511163734117e-05, + "loss": 2.8291, + "step": 23083 + }, + { + "epoch": 1.0747258886793771, + "grad_norm": 0.32726332454351065, + "learning_rate": 8.102298738793472e-05, + "loss": 2.8519, + "step": 23084 + }, + { + "epoch": 1.07477244686547, + "grad_norm": 0.39099930685774964, + "learning_rate": 8.102086304747884e-05, + "loss": 2.7942, + "step": 23085 + }, + { + "epoch": 1.0748190050515631, + "grad_norm": 0.3128678039836637, + "learning_rate": 8.101873861597981e-05, + "loss": 2.8179, + "step": 23086 + }, + { + "epoch": 1.0748655632376563, + "grad_norm": 0.3898014563084258, + "learning_rate": 8.101661409344383e-05, + "loss": 2.8988, + "step": 23087 + }, + { + "epoch": 1.0749121214237494, + "grad_norm": 0.3391209397797679, + "learning_rate": 8.101448947987713e-05, + "loss": 2.8032, + "step": 23088 + }, + { + "epoch": 1.0749586796098425, + "grad_norm": 0.3729213957375432, + "learning_rate": 8.101236477528598e-05, + "loss": 2.7851, + "step": 23089 + }, + { + "epoch": 1.0750052377959354, + "grad_norm": 0.3286261704737552, + "learning_rate": 8.101023997967658e-05, + "loss": 2.7819, + "step": 23090 + }, + { + "epoch": 1.0750517959820285, + "grad_norm": 0.37675112576454844, + "learning_rate": 8.100811509305517e-05, + "loss": 2.8387, + "step": 23091 + }, + { + "epoch": 1.0750983541681216, + "grad_norm": 0.3595125536125418, + "learning_rate": 8.100599011542801e-05, + "loss": 2.8161, + "step": 23092 + }, + { + "epoch": 1.0751449123542147, + "grad_norm": 0.34296807329731316, + "learning_rate": 8.100386504680133e-05, + "loss": 2.8007, + "step": 23093 + }, + { + "epoch": 1.0751914705403078, + "grad_norm": 0.3776022357341621, + "learning_rate": 8.100173988718135e-05, + "loss": 2.8396, + "step": 23094 + }, + { + "epoch": 1.0752380287264007, + "grad_norm": 0.3257974504554939, + "learning_rate": 8.099961463657433e-05, + "loss": 2.7529, + "step": 23095 + }, + { + "epoch": 1.0752845869124938, + "grad_norm": 0.36627612412039134, + "learning_rate": 8.09974892949865e-05, + "loss": 2.9328, + "step": 23096 + }, + { + "epoch": 1.075331145098587, + "grad_norm": 0.31428572640397484, + "learning_rate": 8.099536386242406e-05, + "loss": 2.7576, + "step": 23097 + }, + { + "epoch": 1.07537770328468, + "grad_norm": 0.33158161073453163, + "learning_rate": 8.09932383388933e-05, + "loss": 2.7441, + "step": 23098 + }, + { + "epoch": 1.0754242614707732, + "grad_norm": 0.2993967314652666, + "learning_rate": 
8.099111272440044e-05, + "loss": 2.8309, + "step": 23099 + }, + { + "epoch": 1.075470819656866, + "grad_norm": 0.3438078726951876, + "learning_rate": 8.098898701895172e-05, + "loss": 2.8582, + "step": 23100 + }, + { + "epoch": 1.0755173778429592, + "grad_norm": 0.3213045348611252, + "learning_rate": 8.098686122255337e-05, + "loss": 2.8425, + "step": 23101 + }, + { + "epoch": 1.0755639360290523, + "grad_norm": 0.365431143961202, + "learning_rate": 8.098473533521165e-05, + "loss": 2.7837, + "step": 23102 + }, + { + "epoch": 1.0756104942151454, + "grad_norm": 0.3191316714692311, + "learning_rate": 8.098260935693276e-05, + "loss": 2.8114, + "step": 23103 + }, + { + "epoch": 1.0756570524012385, + "grad_norm": 0.38107502265719995, + "learning_rate": 8.098048328772298e-05, + "loss": 2.8364, + "step": 23104 + }, + { + "epoch": 1.0757036105873314, + "grad_norm": 0.32384849733706744, + "learning_rate": 8.097835712758854e-05, + "loss": 2.7946, + "step": 23105 + }, + { + "epoch": 1.0757501687734246, + "grad_norm": 0.3860634671788494, + "learning_rate": 8.097623087653565e-05, + "loss": 2.8482, + "step": 23106 + }, + { + "epoch": 1.0757967269595177, + "grad_norm": 0.3063006229557089, + "learning_rate": 8.097410453457059e-05, + "loss": 2.8438, + "step": 23107 + }, + { + "epoch": 1.0758432851456108, + "grad_norm": 0.37202077544102274, + "learning_rate": 8.097197810169958e-05, + "loss": 2.8008, + "step": 23108 + }, + { + "epoch": 1.075889843331704, + "grad_norm": 0.32632319542935995, + "learning_rate": 8.096985157792886e-05, + "loss": 2.7847, + "step": 23109 + }, + { + "epoch": 1.0759364015177968, + "grad_norm": 0.3402451903235548, + "learning_rate": 8.096772496326468e-05, + "loss": 2.8036, + "step": 23110 + }, + { + "epoch": 1.07598295970389, + "grad_norm": 0.32592678858515395, + "learning_rate": 8.096559825771329e-05, + "loss": 2.8093, + "step": 23111 + }, + { + "epoch": 1.076029517889983, + "grad_norm": 0.3384614414994567, + "learning_rate": 8.096347146128092e-05, + "loss": 2.7587, + "step": 23112 + }, + { + "epoch": 1.0760760760760761, + "grad_norm": 0.32016660071051095, + "learning_rate": 8.096134457397379e-05, + "loss": 2.9241, + "step": 23113 + }, + { + "epoch": 1.0761226342621693, + "grad_norm": 0.3203980652017616, + "learning_rate": 8.095921759579819e-05, + "loss": 2.7108, + "step": 23114 + }, + { + "epoch": 1.0761691924482621, + "grad_norm": 0.3480620421225932, + "learning_rate": 8.095709052676033e-05, + "loss": 2.8652, + "step": 23115 + }, + { + "epoch": 1.0762157506343553, + "grad_norm": 0.33567918383946616, + "learning_rate": 8.095496336686645e-05, + "loss": 2.8556, + "step": 23116 + }, + { + "epoch": 1.0762623088204484, + "grad_norm": 0.33990734001912226, + "learning_rate": 8.09528361161228e-05, + "loss": 2.8586, + "step": 23117 + }, + { + "epoch": 1.0763088670065415, + "grad_norm": 0.37188317206734944, + "learning_rate": 8.095070877453562e-05, + "loss": 2.7791, + "step": 23118 + }, + { + "epoch": 1.0763554251926344, + "grad_norm": 0.3225562112851916, + "learning_rate": 8.094858134211118e-05, + "loss": 2.8071, + "step": 23119 + }, + { + "epoch": 1.0764019833787275, + "grad_norm": 0.3595494432350461, + "learning_rate": 8.094645381885568e-05, + "loss": 2.7612, + "step": 23120 + }, + { + "epoch": 1.0764485415648206, + "grad_norm": 0.32538629690110116, + "learning_rate": 8.094432620477541e-05, + "loss": 2.8644, + "step": 23121 + }, + { + "epoch": 1.0764950997509137, + "grad_norm": 0.36076432426994043, + "learning_rate": 8.094219849987658e-05, + "loss": 2.7729, + "step": 23122 + }, + { + "epoch": 
1.0765416579370068, + "grad_norm": 0.3426809267082576, + "learning_rate": 8.094007070416544e-05, + "loss": 2.7931, + "step": 23123 + }, + { + "epoch": 1.0765882161231, + "grad_norm": 0.2974290627477892, + "learning_rate": 8.093794281764825e-05, + "loss": 2.8605, + "step": 23124 + }, + { + "epoch": 1.0766347743091929, + "grad_norm": 0.3447147994968307, + "learning_rate": 8.093581484033123e-05, + "loss": 2.8042, + "step": 23125 + }, + { + "epoch": 1.076681332495286, + "grad_norm": 0.33047469465218, + "learning_rate": 8.093368677222067e-05, + "loss": 2.8134, + "step": 23126 + }, + { + "epoch": 1.076727890681379, + "grad_norm": 0.3298691646377796, + "learning_rate": 8.093155861332276e-05, + "loss": 2.8426, + "step": 23127 + }, + { + "epoch": 1.0767744488674722, + "grad_norm": 0.33189425023749175, + "learning_rate": 8.092943036364378e-05, + "loss": 2.8303, + "step": 23128 + }, + { + "epoch": 1.076821007053565, + "grad_norm": 0.3259331737642627, + "learning_rate": 8.092730202318997e-05, + "loss": 2.9045, + "step": 23129 + }, + { + "epoch": 1.0768675652396582, + "grad_norm": 0.3108500980435813, + "learning_rate": 8.092517359196758e-05, + "loss": 2.9316, + "step": 23130 + }, + { + "epoch": 1.0769141234257513, + "grad_norm": 0.3460445384237571, + "learning_rate": 8.092304506998285e-05, + "loss": 2.8399, + "step": 23131 + }, + { + "epoch": 1.0769606816118444, + "grad_norm": 0.3146675684523217, + "learning_rate": 8.092091645724202e-05, + "loss": 2.8012, + "step": 23132 + }, + { + "epoch": 1.0770072397979376, + "grad_norm": 0.3523136584577778, + "learning_rate": 8.091878775375136e-05, + "loss": 2.9067, + "step": 23133 + }, + { + "epoch": 1.0770537979840304, + "grad_norm": 0.31414454165835815, + "learning_rate": 8.09166589595171e-05, + "loss": 2.8493, + "step": 23134 + }, + { + "epoch": 1.0771003561701236, + "grad_norm": 0.3527028986209977, + "learning_rate": 8.091453007454549e-05, + "loss": 2.8032, + "step": 23135 + }, + { + "epoch": 1.0771469143562167, + "grad_norm": 0.3113584223184275, + "learning_rate": 8.091240109884277e-05, + "loss": 2.9043, + "step": 23136 + }, + { + "epoch": 1.0771934725423098, + "grad_norm": 0.36198146069300285, + "learning_rate": 8.091027203241523e-05, + "loss": 2.7851, + "step": 23137 + }, + { + "epoch": 1.077240030728403, + "grad_norm": 0.32278303398584546, + "learning_rate": 8.090814287526905e-05, + "loss": 2.7512, + "step": 23138 + }, + { + "epoch": 1.0772865889144958, + "grad_norm": 0.3376307680757118, + "learning_rate": 8.090601362741052e-05, + "loss": 2.8413, + "step": 23139 + }, + { + "epoch": 1.077333147100589, + "grad_norm": 0.30814679300059156, + "learning_rate": 8.09038842888459e-05, + "loss": 2.8664, + "step": 23140 + }, + { + "epoch": 1.077379705286682, + "grad_norm": 0.36548623182791173, + "learning_rate": 8.090175485958142e-05, + "loss": 2.8721, + "step": 23141 + }, + { + "epoch": 1.0774262634727751, + "grad_norm": 0.3218651707078561, + "learning_rate": 8.089962533962334e-05, + "loss": 2.7454, + "step": 23142 + }, + { + "epoch": 1.0774728216588683, + "grad_norm": 0.35198396918454167, + "learning_rate": 8.08974957289779e-05, + "loss": 2.7711, + "step": 23143 + }, + { + "epoch": 1.0775193798449612, + "grad_norm": 0.3336946777451743, + "learning_rate": 8.089536602765135e-05, + "loss": 2.9155, + "step": 23144 + }, + { + "epoch": 1.0775659380310543, + "grad_norm": 0.3277019046171763, + "learning_rate": 8.089323623564994e-05, + "loss": 2.8992, + "step": 23145 + }, + { + "epoch": 1.0776124962171474, + "grad_norm": 0.3584513731002909, + "learning_rate": 
8.089110635297994e-05, + "loss": 2.7487, + "step": 23146 + }, + { + "epoch": 1.0776590544032405, + "grad_norm": 0.3062600610421736, + "learning_rate": 8.088897637964756e-05, + "loss": 2.7404, + "step": 23147 + }, + { + "epoch": 1.0777056125893336, + "grad_norm": 0.3645628177577571, + "learning_rate": 8.08868463156591e-05, + "loss": 2.8854, + "step": 23148 + }, + { + "epoch": 1.0777521707754265, + "grad_norm": 0.34163170209482807, + "learning_rate": 8.088471616102078e-05, + "loss": 2.8234, + "step": 23149 + }, + { + "epoch": 1.0777987289615196, + "grad_norm": 0.344442046122775, + "learning_rate": 8.088258591573888e-05, + "loss": 2.714, + "step": 23150 + }, + { + "epoch": 1.0778452871476127, + "grad_norm": 0.38756575150465444, + "learning_rate": 8.088045557981961e-05, + "loss": 2.8002, + "step": 23151 + }, + { + "epoch": 1.0778918453337059, + "grad_norm": 0.3145332599834159, + "learning_rate": 8.087832515326925e-05, + "loss": 2.8178, + "step": 23152 + }, + { + "epoch": 1.077938403519799, + "grad_norm": 0.3578312927221534, + "learning_rate": 8.087619463609404e-05, + "loss": 2.8428, + "step": 23153 + }, + { + "epoch": 1.0779849617058919, + "grad_norm": 0.3214781500182924, + "learning_rate": 8.087406402830025e-05, + "loss": 2.8894, + "step": 23154 + }, + { + "epoch": 1.078031519891985, + "grad_norm": 0.34530848130364455, + "learning_rate": 8.087193332989413e-05, + "loss": 2.8414, + "step": 23155 + }, + { + "epoch": 1.078078078078078, + "grad_norm": 0.3098103065142123, + "learning_rate": 8.086980254088192e-05, + "loss": 2.8346, + "step": 23156 + }, + { + "epoch": 1.0781246362641712, + "grad_norm": 0.33447924532321577, + "learning_rate": 8.086767166126988e-05, + "loss": 2.94, + "step": 23157 + }, + { + "epoch": 1.078171194450264, + "grad_norm": 0.35106638979588195, + "learning_rate": 8.086554069106427e-05, + "loss": 2.8474, + "step": 23158 + }, + { + "epoch": 1.0782177526363572, + "grad_norm": 0.36176810059090736, + "learning_rate": 8.086340963027133e-05, + "loss": 2.8076, + "step": 23159 + }, + { + "epoch": 1.0782643108224503, + "grad_norm": 0.32538764094511713, + "learning_rate": 8.086127847889733e-05, + "loss": 2.9285, + "step": 23160 + }, + { + "epoch": 1.0783108690085434, + "grad_norm": 0.31866506260680066, + "learning_rate": 8.085914723694852e-05, + "loss": 2.7233, + "step": 23161 + }, + { + "epoch": 1.0783574271946366, + "grad_norm": 0.3075587138309918, + "learning_rate": 8.085701590443115e-05, + "loss": 2.9488, + "step": 23162 + }, + { + "epoch": 1.0784039853807297, + "grad_norm": 0.33814678396589454, + "learning_rate": 8.085488448135148e-05, + "loss": 2.7301, + "step": 23163 + }, + { + "epoch": 1.0784505435668226, + "grad_norm": 0.28381346289488213, + "learning_rate": 8.085275296771576e-05, + "loss": 2.8907, + "step": 23164 + }, + { + "epoch": 1.0784971017529157, + "grad_norm": 0.32960498542411987, + "learning_rate": 8.085062136353025e-05, + "loss": 2.8885, + "step": 23165 + }, + { + "epoch": 1.0785436599390088, + "grad_norm": 0.32985045245854855, + "learning_rate": 8.08484896688012e-05, + "loss": 2.7787, + "step": 23166 + }, + { + "epoch": 1.078590218125102, + "grad_norm": 0.3159583207398079, + "learning_rate": 8.084635788353488e-05, + "loss": 2.7317, + "step": 23167 + }, + { + "epoch": 1.0786367763111948, + "grad_norm": 0.3756843225407979, + "learning_rate": 8.084422600773754e-05, + "loss": 2.8145, + "step": 23168 + }, + { + "epoch": 1.078683334497288, + "grad_norm": 0.3034273705158181, + "learning_rate": 8.084209404141543e-05, + "loss": 2.7859, + "step": 23169 + }, + { + "epoch": 
1.078729892683381, + "grad_norm": 0.38409267270834285, + "learning_rate": 8.083996198457482e-05, + "loss": 2.8891, + "step": 23170 + }, + { + "epoch": 1.0787764508694742, + "grad_norm": 0.29469970120477446, + "learning_rate": 8.083782983722194e-05, + "loss": 2.6983, + "step": 23171 + }, + { + "epoch": 1.0788230090555673, + "grad_norm": 0.3688615600327763, + "learning_rate": 8.083569759936308e-05, + "loss": 2.8133, + "step": 23172 + }, + { + "epoch": 1.0788695672416602, + "grad_norm": 0.31234085648034254, + "learning_rate": 8.08335652710045e-05, + "loss": 2.8622, + "step": 23173 + }, + { + "epoch": 1.0789161254277533, + "grad_norm": 0.35200727781339103, + "learning_rate": 8.083143285215242e-05, + "loss": 2.8168, + "step": 23174 + }, + { + "epoch": 1.0789626836138464, + "grad_norm": 0.3453567657794643, + "learning_rate": 8.082930034281313e-05, + "loss": 2.8133, + "step": 23175 + }, + { + "epoch": 1.0790092417999395, + "grad_norm": 0.30995697704385305, + "learning_rate": 8.082716774299288e-05, + "loss": 2.7754, + "step": 23176 + }, + { + "epoch": 1.0790557999860326, + "grad_norm": 0.29891295820028024, + "learning_rate": 8.082503505269794e-05, + "loss": 2.7533, + "step": 23177 + }, + { + "epoch": 1.0791023581721255, + "grad_norm": 0.3346192071595185, + "learning_rate": 8.082290227193455e-05, + "loss": 2.6956, + "step": 23178 + }, + { + "epoch": 1.0791489163582186, + "grad_norm": 0.2940728214867851, + "learning_rate": 8.082076940070899e-05, + "loss": 2.7979, + "step": 23179 + }, + { + "epoch": 1.0791954745443118, + "grad_norm": 0.3698411909737315, + "learning_rate": 8.08186364390275e-05, + "loss": 2.7905, + "step": 23180 + }, + { + "epoch": 1.0792420327304049, + "grad_norm": 0.3114289844958212, + "learning_rate": 8.081650338689634e-05, + "loss": 2.815, + "step": 23181 + }, + { + "epoch": 1.079288590916498, + "grad_norm": 0.3573792172951647, + "learning_rate": 8.081437024432179e-05, + "loss": 2.8784, + "step": 23182 + }, + { + "epoch": 1.0793351491025909, + "grad_norm": 0.3123276175790119, + "learning_rate": 8.081223701131009e-05, + "loss": 2.7746, + "step": 23183 + }, + { + "epoch": 1.079381707288684, + "grad_norm": 0.3651958412606217, + "learning_rate": 8.081010368786752e-05, + "loss": 2.8216, + "step": 23184 + }, + { + "epoch": 1.079428265474777, + "grad_norm": 0.39548856082305156, + "learning_rate": 8.08079702740003e-05, + "loss": 2.8232, + "step": 23185 + }, + { + "epoch": 1.0794748236608702, + "grad_norm": 0.3803980647594993, + "learning_rate": 8.080583676971477e-05, + "loss": 2.8538, + "step": 23186 + }, + { + "epoch": 1.0795213818469633, + "grad_norm": 0.3390114344862458, + "learning_rate": 8.080370317501712e-05, + "loss": 2.9244, + "step": 23187 + }, + { + "epoch": 1.0795679400330562, + "grad_norm": 0.3705566423281669, + "learning_rate": 8.080156948991363e-05, + "loss": 2.8575, + "step": 23188 + }, + { + "epoch": 1.0796144982191493, + "grad_norm": 0.33728075249856443, + "learning_rate": 8.079943571441058e-05, + "loss": 2.8446, + "step": 23189 + }, + { + "epoch": 1.0796610564052425, + "grad_norm": 0.3651128902536072, + "learning_rate": 8.079730184851421e-05, + "loss": 2.8353, + "step": 23190 + }, + { + "epoch": 1.0797076145913356, + "grad_norm": 0.33837812667325706, + "learning_rate": 8.07951678922308e-05, + "loss": 2.7938, + "step": 23191 + }, + { + "epoch": 1.0797541727774287, + "grad_norm": 0.3977240848816379, + "learning_rate": 8.07930338455666e-05, + "loss": 2.8321, + "step": 23192 + }, + { + "epoch": 1.0798007309635216, + "grad_norm": 0.3361189348495671, + "learning_rate": 
8.079089970852788e-05, + "loss": 2.7526, + "step": 23193 + }, + { + "epoch": 1.0798472891496147, + "grad_norm": 0.37137977398244915, + "learning_rate": 8.078876548112091e-05, + "loss": 2.893, + "step": 23194 + }, + { + "epoch": 1.0798938473357078, + "grad_norm": 0.3207451569209717, + "learning_rate": 8.078663116335194e-05, + "loss": 2.8873, + "step": 23195 + }, + { + "epoch": 1.079940405521801, + "grad_norm": 0.36188780786348473, + "learning_rate": 8.078449675522726e-05, + "loss": 2.7723, + "step": 23196 + }, + { + "epoch": 1.079986963707894, + "grad_norm": 0.32384954841658387, + "learning_rate": 8.078236225675309e-05, + "loss": 2.8855, + "step": 23197 + }, + { + "epoch": 1.080033521893987, + "grad_norm": 0.37351140553819995, + "learning_rate": 8.078022766793573e-05, + "loss": 2.831, + "step": 23198 + }, + { + "epoch": 1.08008008008008, + "grad_norm": 0.3248269276422415, + "learning_rate": 8.077809298878143e-05, + "loss": 2.6604, + "step": 23199 + }, + { + "epoch": 1.0801266382661732, + "grad_norm": 0.3394459760333499, + "learning_rate": 8.077595821929647e-05, + "loss": 2.819, + "step": 23200 + }, + { + "epoch": 1.0801731964522663, + "grad_norm": 0.31358163882289214, + "learning_rate": 8.077382335948708e-05, + "loss": 2.7211, + "step": 23201 + }, + { + "epoch": 1.0802197546383594, + "grad_norm": 0.34552699747404225, + "learning_rate": 8.077168840935956e-05, + "loss": 2.8706, + "step": 23202 + }, + { + "epoch": 1.0802663128244523, + "grad_norm": 0.33513664552917505, + "learning_rate": 8.076955336892018e-05, + "loss": 2.7529, + "step": 23203 + }, + { + "epoch": 1.0803128710105454, + "grad_norm": 0.36224161361179547, + "learning_rate": 8.076741823817518e-05, + "loss": 2.8162, + "step": 23204 + }, + { + "epoch": 1.0803594291966385, + "grad_norm": 0.34146717368576285, + "learning_rate": 8.076528301713084e-05, + "loss": 2.9159, + "step": 23205 + }, + { + "epoch": 1.0804059873827316, + "grad_norm": 0.3826313962773885, + "learning_rate": 8.076314770579344e-05, + "loss": 2.735, + "step": 23206 + }, + { + "epoch": 1.0804525455688245, + "grad_norm": 0.34078133216581236, + "learning_rate": 8.076101230416922e-05, + "loss": 2.7971, + "step": 23207 + }, + { + "epoch": 1.0804991037549176, + "grad_norm": 0.3755635832729254, + "learning_rate": 8.075887681226447e-05, + "loss": 2.8189, + "step": 23208 + }, + { + "epoch": 1.0805456619410108, + "grad_norm": 0.3394534159158555, + "learning_rate": 8.075674123008543e-05, + "loss": 2.8129, + "step": 23209 + }, + { + "epoch": 1.0805922201271039, + "grad_norm": 0.3534384151046421, + "learning_rate": 8.075460555763838e-05, + "loss": 2.857, + "step": 23210 + }, + { + "epoch": 1.080638778313197, + "grad_norm": 0.34507015804975405, + "learning_rate": 8.07524697949296e-05, + "loss": 2.8242, + "step": 23211 + }, + { + "epoch": 1.0806853364992899, + "grad_norm": 0.35973018841575577, + "learning_rate": 8.075033394196536e-05, + "loss": 2.7795, + "step": 23212 + }, + { + "epoch": 1.080731894685383, + "grad_norm": 0.31032404492650156, + "learning_rate": 8.074819799875191e-05, + "loss": 2.7294, + "step": 23213 + }, + { + "epoch": 1.0807784528714761, + "grad_norm": 0.34737705767578364, + "learning_rate": 8.074606196529554e-05, + "loss": 2.8531, + "step": 23214 + }, + { + "epoch": 1.0808250110575692, + "grad_norm": 0.3337816458887506, + "learning_rate": 8.07439258416025e-05, + "loss": 2.7083, + "step": 23215 + }, + { + "epoch": 1.0808715692436623, + "grad_norm": 0.32203156950291695, + "learning_rate": 8.074178962767906e-05, + "loss": 2.7963, + "step": 23216 + }, + { + "epoch": 
1.0809181274297552, + "grad_norm": 0.3485225094448968, + "learning_rate": 8.07396533235315e-05, + "loss": 2.8592, + "step": 23217 + }, + { + "epoch": 1.0809646856158484, + "grad_norm": 0.3200018526623507, + "learning_rate": 8.07375169291661e-05, + "loss": 2.8848, + "step": 23218 + }, + { + "epoch": 1.0810112438019415, + "grad_norm": 0.3621484246834876, + "learning_rate": 8.073538044458909e-05, + "loss": 2.9098, + "step": 23219 + }, + { + "epoch": 1.0810578019880346, + "grad_norm": 0.32385737879695525, + "learning_rate": 8.073324386980678e-05, + "loss": 2.8655, + "step": 23220 + }, + { + "epoch": 1.0811043601741277, + "grad_norm": 0.3321141194426429, + "learning_rate": 8.073110720482542e-05, + "loss": 2.7966, + "step": 23221 + }, + { + "epoch": 1.0811509183602206, + "grad_norm": 0.33446744897104064, + "learning_rate": 8.072897044965129e-05, + "loss": 2.8323, + "step": 23222 + }, + { + "epoch": 1.0811974765463137, + "grad_norm": 0.3177964363009779, + "learning_rate": 8.072683360429067e-05, + "loss": 2.7431, + "step": 23223 + }, + { + "epoch": 1.0812440347324068, + "grad_norm": 0.33062353586956317, + "learning_rate": 8.072469666874981e-05, + "loss": 2.8156, + "step": 23224 + }, + { + "epoch": 1.0812905929185, + "grad_norm": 0.3290172280600273, + "learning_rate": 8.0722559643035e-05, + "loss": 2.7848, + "step": 23225 + }, + { + "epoch": 1.081337151104593, + "grad_norm": 0.36185698778173114, + "learning_rate": 8.07204225271525e-05, + "loss": 2.7406, + "step": 23226 + }, + { + "epoch": 1.081383709290686, + "grad_norm": 0.3183879286660412, + "learning_rate": 8.071828532110859e-05, + "loss": 2.777, + "step": 23227 + }, + { + "epoch": 1.081430267476779, + "grad_norm": 0.36141349246374205, + "learning_rate": 8.071614802490953e-05, + "loss": 2.8788, + "step": 23228 + }, + { + "epoch": 1.0814768256628722, + "grad_norm": 0.35022668629399784, + "learning_rate": 8.07140106385616e-05, + "loss": 2.6496, + "step": 23229 + }, + { + "epoch": 1.0815233838489653, + "grad_norm": 0.3141176534110325, + "learning_rate": 8.07118731620711e-05, + "loss": 2.8073, + "step": 23230 + }, + { + "epoch": 1.0815699420350584, + "grad_norm": 0.33515361906257596, + "learning_rate": 8.070973559544425e-05, + "loss": 2.7465, + "step": 23231 + }, + { + "epoch": 1.0816165002211513, + "grad_norm": 0.33079094597536185, + "learning_rate": 8.070759793868736e-05, + "loss": 2.7753, + "step": 23232 + }, + { + "epoch": 1.0816630584072444, + "grad_norm": 0.34077934281146577, + "learning_rate": 8.070546019180669e-05, + "loss": 2.9077, + "step": 23233 + }, + { + "epoch": 1.0817096165933375, + "grad_norm": 0.31744254937198846, + "learning_rate": 8.070332235480853e-05, + "loss": 2.9523, + "step": 23234 + }, + { + "epoch": 1.0817561747794306, + "grad_norm": 0.31406254890572183, + "learning_rate": 8.070118442769914e-05, + "loss": 2.8677, + "step": 23235 + }, + { + "epoch": 1.0818027329655238, + "grad_norm": 0.3445466137584238, + "learning_rate": 8.06990464104848e-05, + "loss": 2.9721, + "step": 23236 + }, + { + "epoch": 1.0818492911516167, + "grad_norm": 0.3554913207607974, + "learning_rate": 8.069690830317179e-05, + "loss": 2.9039, + "step": 23237 + }, + { + "epoch": 1.0818958493377098, + "grad_norm": 0.363326073824035, + "learning_rate": 8.069477010576636e-05, + "loss": 2.8022, + "step": 23238 + }, + { + "epoch": 1.0819424075238029, + "grad_norm": 0.3587408168620295, + "learning_rate": 8.06926318182748e-05, + "loss": 2.926, + "step": 23239 + }, + { + "epoch": 1.081988965709896, + "grad_norm": 0.3553761341241099, + "learning_rate": 
8.06904934407034e-05, + "loss": 2.8027, + "step": 23240 + }, + { + "epoch": 1.0820355238959891, + "grad_norm": 0.33548096516390513, + "learning_rate": 8.068835497305843e-05, + "loss": 2.8229, + "step": 23241 + }, + { + "epoch": 1.082082082082082, + "grad_norm": 0.36708773749360996, + "learning_rate": 8.068621641534617e-05, + "loss": 2.812, + "step": 23242 + }, + { + "epoch": 1.0821286402681751, + "grad_norm": 0.3471414018925525, + "learning_rate": 8.068407776757287e-05, + "loss": 2.8219, + "step": 23243 + }, + { + "epoch": 1.0821751984542682, + "grad_norm": 0.34783697043016587, + "learning_rate": 8.068193902974483e-05, + "loss": 2.7634, + "step": 23244 + }, + { + "epoch": 1.0822217566403614, + "grad_norm": 0.36018369088423263, + "learning_rate": 8.067980020186831e-05, + "loss": 2.8472, + "step": 23245 + }, + { + "epoch": 1.0822683148264542, + "grad_norm": 0.3383279101500207, + "learning_rate": 8.067766128394962e-05, + "loss": 2.947, + "step": 23246 + }, + { + "epoch": 1.0823148730125474, + "grad_norm": 0.3544170104324382, + "learning_rate": 8.0675522275995e-05, + "loss": 2.8187, + "step": 23247 + }, + { + "epoch": 1.0823614311986405, + "grad_norm": 0.3317371557645891, + "learning_rate": 8.067338317801077e-05, + "loss": 2.7118, + "step": 23248 + }, + { + "epoch": 1.0824079893847336, + "grad_norm": 0.33181906822304996, + "learning_rate": 8.067124399000316e-05, + "loss": 2.8073, + "step": 23249 + }, + { + "epoch": 1.0824545475708267, + "grad_norm": 0.3362582425598607, + "learning_rate": 8.066910471197846e-05, + "loss": 2.8004, + "step": 23250 + }, + { + "epoch": 1.0825011057569198, + "grad_norm": 0.3056325289664215, + "learning_rate": 8.066696534394298e-05, + "loss": 2.7868, + "step": 23251 + }, + { + "epoch": 1.0825476639430127, + "grad_norm": 0.33923587563943397, + "learning_rate": 8.066482588590297e-05, + "loss": 2.8416, + "step": 23252 + }, + { + "epoch": 1.0825942221291058, + "grad_norm": 0.3384805860240464, + "learning_rate": 8.066268633786471e-05, + "loss": 2.7423, + "step": 23253 + }, + { + "epoch": 1.082640780315199, + "grad_norm": 0.3331992738659648, + "learning_rate": 8.066054669983452e-05, + "loss": 2.6853, + "step": 23254 + }, + { + "epoch": 1.082687338501292, + "grad_norm": 0.33131987386251044, + "learning_rate": 8.065840697181862e-05, + "loss": 2.748, + "step": 23255 + }, + { + "epoch": 1.082733896687385, + "grad_norm": 0.33828643738559566, + "learning_rate": 8.065626715382331e-05, + "loss": 2.9217, + "step": 23256 + }, + { + "epoch": 1.082780454873478, + "grad_norm": 0.3200614060660282, + "learning_rate": 8.065412724585489e-05, + "loss": 2.8336, + "step": 23257 + }, + { + "epoch": 1.0828270130595712, + "grad_norm": 0.3387974475088806, + "learning_rate": 8.06519872479196e-05, + "loss": 2.9032, + "step": 23258 + }, + { + "epoch": 1.0828735712456643, + "grad_norm": 0.3384438084425701, + "learning_rate": 8.064984716002378e-05, + "loss": 2.9414, + "step": 23259 + }, + { + "epoch": 1.0829201294317574, + "grad_norm": 0.3500336727948865, + "learning_rate": 8.064770698217365e-05, + "loss": 2.7239, + "step": 23260 + }, + { + "epoch": 1.0829666876178503, + "grad_norm": 0.3307572358345725, + "learning_rate": 8.064556671437553e-05, + "loss": 2.7792, + "step": 23261 + }, + { + "epoch": 1.0830132458039434, + "grad_norm": 0.3712891218932595, + "learning_rate": 8.064342635663571e-05, + "loss": 2.7092, + "step": 23262 + }, + { + "epoch": 1.0830598039900365, + "grad_norm": 0.3017505061453431, + "learning_rate": 8.064128590896042e-05, + "loss": 2.8669, + "step": 23263 + }, + { + "epoch": 
1.0831063621761297, + "grad_norm": 0.344532265311221, + "learning_rate": 8.063914537135601e-05, + "loss": 2.7453, + "step": 23264 + }, + { + "epoch": 1.0831529203622228, + "grad_norm": 0.2952940006515169, + "learning_rate": 8.063700474382871e-05, + "loss": 2.8225, + "step": 23265 + }, + { + "epoch": 1.0831994785483157, + "grad_norm": 0.3341931265307319, + "learning_rate": 8.063486402638481e-05, + "loss": 2.9078, + "step": 23266 + }, + { + "epoch": 1.0832460367344088, + "grad_norm": 0.2961175234119598, + "learning_rate": 8.06327232190306e-05, + "loss": 2.7122, + "step": 23267 + }, + { + "epoch": 1.083292594920502, + "grad_norm": 0.3531319794853892, + "learning_rate": 8.063058232177239e-05, + "loss": 2.7966, + "step": 23268 + }, + { + "epoch": 1.083339153106595, + "grad_norm": 0.3179336887789009, + "learning_rate": 8.062844133461642e-05, + "loss": 2.8189, + "step": 23269 + }, + { + "epoch": 1.0833857112926881, + "grad_norm": 0.3501828256288188, + "learning_rate": 8.062630025756898e-05, + "loss": 2.8266, + "step": 23270 + }, + { + "epoch": 1.083432269478781, + "grad_norm": 0.3492182806490637, + "learning_rate": 8.062415909063638e-05, + "loss": 2.7595, + "step": 23271 + }, + { + "epoch": 1.0834788276648741, + "grad_norm": 0.31525769117688623, + "learning_rate": 8.06220178338249e-05, + "loss": 2.8596, + "step": 23272 + }, + { + "epoch": 1.0835253858509672, + "grad_norm": 0.35382717500724886, + "learning_rate": 8.061987648714081e-05, + "loss": 2.8871, + "step": 23273 + }, + { + "epoch": 1.0835719440370604, + "grad_norm": 0.33895259660568916, + "learning_rate": 8.06177350505904e-05, + "loss": 2.8088, + "step": 23274 + }, + { + "epoch": 1.0836185022231535, + "grad_norm": 0.34248174440087664, + "learning_rate": 8.061559352417994e-05, + "loss": 2.9172, + "step": 23275 + }, + { + "epoch": 1.0836650604092464, + "grad_norm": 0.34763272919340127, + "learning_rate": 8.061345190791573e-05, + "loss": 2.9358, + "step": 23276 + }, + { + "epoch": 1.0837116185953395, + "grad_norm": 0.3358359347401994, + "learning_rate": 8.061131020180408e-05, + "loss": 2.7758, + "step": 23277 + }, + { + "epoch": 1.0837581767814326, + "grad_norm": 0.33057007964611806, + "learning_rate": 8.060916840585124e-05, + "loss": 2.8259, + "step": 23278 + }, + { + "epoch": 1.0838047349675257, + "grad_norm": 0.31889754960025724, + "learning_rate": 8.060702652006348e-05, + "loss": 2.7914, + "step": 23279 + }, + { + "epoch": 1.0838512931536188, + "grad_norm": 0.3231801269945854, + "learning_rate": 8.060488454444713e-05, + "loss": 2.7643, + "step": 23280 + }, + { + "epoch": 1.0838978513397117, + "grad_norm": 0.3404458985424907, + "learning_rate": 8.060274247900848e-05, + "loss": 2.7778, + "step": 23281 + }, + { + "epoch": 1.0839444095258048, + "grad_norm": 0.3408515708635242, + "learning_rate": 8.060060032375377e-05, + "loss": 2.779, + "step": 23282 + }, + { + "epoch": 1.083990967711898, + "grad_norm": 0.32316875318630417, + "learning_rate": 8.059845807868932e-05, + "loss": 2.8164, + "step": 23283 + }, + { + "epoch": 1.084037525897991, + "grad_norm": 0.34568451676533873, + "learning_rate": 8.05963157438214e-05, + "loss": 2.8235, + "step": 23284 + }, + { + "epoch": 1.0840840840840842, + "grad_norm": 0.320702578324139, + "learning_rate": 8.059417331915632e-05, + "loss": 2.9288, + "step": 23285 + }, + { + "epoch": 1.084130642270177, + "grad_norm": 0.34812130277383, + "learning_rate": 8.059203080470036e-05, + "loss": 2.856, + "step": 23286 + }, + { + "epoch": 1.0841772004562702, + "grad_norm": 0.32566757385992073, + "learning_rate": 
8.058988820045978e-05, + "loss": 2.8697, + "step": 23287 + }, + { + "epoch": 1.0842237586423633, + "grad_norm": 0.32653178008547906, + "learning_rate": 8.058774550644093e-05, + "loss": 2.7212, + "step": 23288 + }, + { + "epoch": 1.0842703168284564, + "grad_norm": 0.3160153973533559, + "learning_rate": 8.058560272265002e-05, + "loss": 2.8694, + "step": 23289 + }, + { + "epoch": 1.0843168750145495, + "grad_norm": 0.32385219719954783, + "learning_rate": 8.058345984909341e-05, + "loss": 2.7352, + "step": 23290 + }, + { + "epoch": 1.0843634332006424, + "grad_norm": 0.31685427101976815, + "learning_rate": 8.058131688577734e-05, + "loss": 2.8182, + "step": 23291 + }, + { + "epoch": 1.0844099913867356, + "grad_norm": 0.33837748264934114, + "learning_rate": 8.057917383270812e-05, + "loss": 2.8573, + "step": 23292 + }, + { + "epoch": 1.0844565495728287, + "grad_norm": 0.3395163494235989, + "learning_rate": 8.057703068989205e-05, + "loss": 2.7534, + "step": 23293 + }, + { + "epoch": 1.0845031077589218, + "grad_norm": 0.32492646958881666, + "learning_rate": 8.05748874573354e-05, + "loss": 2.798, + "step": 23294 + }, + { + "epoch": 1.0845496659450147, + "grad_norm": 0.3440947165972427, + "learning_rate": 8.057274413504445e-05, + "loss": 2.9125, + "step": 23295 + }, + { + "epoch": 1.0845962241311078, + "grad_norm": 0.3386573587547164, + "learning_rate": 8.057060072302552e-05, + "loss": 2.8074, + "step": 23296 + }, + { + "epoch": 1.084642782317201, + "grad_norm": 0.341999832288027, + "learning_rate": 8.05684572212849e-05, + "loss": 2.8729, + "step": 23297 + }, + { + "epoch": 1.084689340503294, + "grad_norm": 0.32960071600650465, + "learning_rate": 8.056631362982885e-05, + "loss": 2.6776, + "step": 23298 + }, + { + "epoch": 1.0847358986893871, + "grad_norm": 0.361768263911342, + "learning_rate": 8.056416994866369e-05, + "loss": 2.8929, + "step": 23299 + }, + { + "epoch": 1.08478245687548, + "grad_norm": 0.32787766064107193, + "learning_rate": 8.056202617779571e-05, + "loss": 2.8986, + "step": 23300 + }, + { + "epoch": 1.0848290150615731, + "grad_norm": 0.3429300675927437, + "learning_rate": 8.05598823172312e-05, + "loss": 2.6961, + "step": 23301 + }, + { + "epoch": 1.0848755732476663, + "grad_norm": 0.36025589562331334, + "learning_rate": 8.055773836697641e-05, + "loss": 2.7882, + "step": 23302 + }, + { + "epoch": 1.0849221314337594, + "grad_norm": 0.3714690343351966, + "learning_rate": 8.05555943270377e-05, + "loss": 2.7619, + "step": 23303 + }, + { + "epoch": 1.0849686896198525, + "grad_norm": 0.32377826584064123, + "learning_rate": 8.055345019742133e-05, + "loss": 2.8829, + "step": 23304 + }, + { + "epoch": 1.0850152478059454, + "grad_norm": 0.3534674873382776, + "learning_rate": 8.055130597813359e-05, + "loss": 2.6887, + "step": 23305 + }, + { + "epoch": 1.0850618059920385, + "grad_norm": 0.3175485561960958, + "learning_rate": 8.054916166918076e-05, + "loss": 2.7389, + "step": 23306 + }, + { + "epoch": 1.0851083641781316, + "grad_norm": 0.34092621965700043, + "learning_rate": 8.054701727056917e-05, + "loss": 2.8139, + "step": 23307 + }, + { + "epoch": 1.0851549223642247, + "grad_norm": 0.3356656233732604, + "learning_rate": 8.05448727823051e-05, + "loss": 2.7608, + "step": 23308 + }, + { + "epoch": 1.0852014805503178, + "grad_norm": 0.3392580202683499, + "learning_rate": 8.054272820439482e-05, + "loss": 2.8047, + "step": 23309 + }, + { + "epoch": 1.0852480387364107, + "grad_norm": 0.33174540679190806, + "learning_rate": 8.054058353684464e-05, + "loss": 2.9217, + "step": 23310 + }, + { + "epoch": 
1.0852945969225039, + "grad_norm": 0.3150557162983916, + "learning_rate": 8.053843877966085e-05, + "loss": 2.6299, + "step": 23311 + }, + { + "epoch": 1.085341155108597, + "grad_norm": 0.3546383907233325, + "learning_rate": 8.053629393284977e-05, + "loss": 2.8161, + "step": 23312 + }, + { + "epoch": 1.08538771329469, + "grad_norm": 0.3219215216703072, + "learning_rate": 8.053414899641767e-05, + "loss": 2.7733, + "step": 23313 + }, + { + "epoch": 1.0854342714807832, + "grad_norm": 0.32216348543705714, + "learning_rate": 8.053200397037086e-05, + "loss": 2.8203, + "step": 23314 + }, + { + "epoch": 1.085480829666876, + "grad_norm": 0.3221537749816695, + "learning_rate": 8.05298588547156e-05, + "loss": 2.8084, + "step": 23315 + }, + { + "epoch": 1.0855273878529692, + "grad_norm": 0.3418119215972772, + "learning_rate": 8.052771364945824e-05, + "loss": 2.7911, + "step": 23316 + }, + { + "epoch": 1.0855739460390623, + "grad_norm": 0.3161908194617218, + "learning_rate": 8.052556835460503e-05, + "loss": 2.8022, + "step": 23317 + }, + { + "epoch": 1.0856205042251554, + "grad_norm": 0.33687837041199403, + "learning_rate": 8.052342297016229e-05, + "loss": 2.7388, + "step": 23318 + }, + { + "epoch": 1.0856670624112486, + "grad_norm": 0.3513939954350514, + "learning_rate": 8.052127749613631e-05, + "loss": 2.8694, + "step": 23319 + }, + { + "epoch": 1.0857136205973414, + "grad_norm": 0.34298210123097894, + "learning_rate": 8.05191319325334e-05, + "loss": 2.8715, + "step": 23320 + }, + { + "epoch": 1.0857601787834346, + "grad_norm": 0.3558927855848118, + "learning_rate": 8.051698627935982e-05, + "loss": 2.8179, + "step": 23321 + }, + { + "epoch": 1.0858067369695277, + "grad_norm": 0.35819658887743144, + "learning_rate": 8.051484053662191e-05, + "loss": 2.8692, + "step": 23322 + }, + { + "epoch": 1.0858532951556208, + "grad_norm": 0.35480922964945055, + "learning_rate": 8.051269470432594e-05, + "loss": 2.8365, + "step": 23323 + }, + { + "epoch": 1.085899853341714, + "grad_norm": 0.32554748284157364, + "learning_rate": 8.051054878247821e-05, + "loss": 2.7919, + "step": 23324 + }, + { + "epoch": 1.0859464115278068, + "grad_norm": 0.3360226216732347, + "learning_rate": 8.050840277108505e-05, + "loss": 2.7898, + "step": 23325 + }, + { + "epoch": 1.0859929697139, + "grad_norm": 0.30663730846342063, + "learning_rate": 8.050625667015272e-05, + "loss": 2.9372, + "step": 23326 + }, + { + "epoch": 1.086039527899993, + "grad_norm": 0.34158409298881665, + "learning_rate": 8.050411047968753e-05, + "loss": 2.7507, + "step": 23327 + }, + { + "epoch": 1.0860860860860861, + "grad_norm": 0.30315956340869493, + "learning_rate": 8.050196419969577e-05, + "loss": 2.712, + "step": 23328 + }, + { + "epoch": 1.0861326442721793, + "grad_norm": 0.31693801436683083, + "learning_rate": 8.049981783018377e-05, + "loss": 2.8688, + "step": 23329 + }, + { + "epoch": 1.0861792024582722, + "grad_norm": 0.3003590357922414, + "learning_rate": 8.04976713711578e-05, + "loss": 2.7157, + "step": 23330 + }, + { + "epoch": 1.0862257606443653, + "grad_norm": 0.33809388923685707, + "learning_rate": 8.049552482262417e-05, + "loss": 2.8428, + "step": 23331 + }, + { + "epoch": 1.0862723188304584, + "grad_norm": 0.28931321645030644, + "learning_rate": 8.049337818458917e-05, + "loss": 2.8182, + "step": 23332 + }, + { + "epoch": 1.0863188770165515, + "grad_norm": 0.31522190525817906, + "learning_rate": 8.049123145705912e-05, + "loss": 2.8362, + "step": 23333 + }, + { + "epoch": 1.0863654352026444, + "grad_norm": 0.31378339052379967, + "learning_rate": 
8.048908464004032e-05, + "loss": 2.8859, + "step": 23334 + }, + { + "epoch": 1.0864119933887375, + "grad_norm": 0.37080065166397524, + "learning_rate": 8.048693773353904e-05, + "loss": 2.8574, + "step": 23335 + }, + { + "epoch": 1.0864585515748306, + "grad_norm": 0.32481448727091145, + "learning_rate": 8.048479073756161e-05, + "loss": 2.8944, + "step": 23336 + }, + { + "epoch": 1.0865051097609237, + "grad_norm": 0.3368867599582968, + "learning_rate": 8.048264365211433e-05, + "loss": 2.8392, + "step": 23337 + }, + { + "epoch": 1.0865516679470169, + "grad_norm": 0.34093640581298457, + "learning_rate": 8.048049647720348e-05, + "loss": 2.8633, + "step": 23338 + }, + { + "epoch": 1.08659822613311, + "grad_norm": 0.304014225448638, + "learning_rate": 8.047834921283538e-05, + "loss": 2.9183, + "step": 23339 + }, + { + "epoch": 1.0866447843192029, + "grad_norm": 0.3312816674670186, + "learning_rate": 8.047620185901634e-05, + "loss": 2.7976, + "step": 23340 + }, + { + "epoch": 1.086691342505296, + "grad_norm": 0.3296279292004948, + "learning_rate": 8.047405441575264e-05, + "loss": 2.8062, + "step": 23341 + }, + { + "epoch": 1.086737900691389, + "grad_norm": 0.3423867146860411, + "learning_rate": 8.047190688305058e-05, + "loss": 2.8943, + "step": 23342 + }, + { + "epoch": 1.0867844588774822, + "grad_norm": 0.3506248497420904, + "learning_rate": 8.046975926091649e-05, + "loss": 2.8751, + "step": 23343 + }, + { + "epoch": 1.086831017063575, + "grad_norm": 0.3275347060125595, + "learning_rate": 8.046761154935666e-05, + "loss": 2.8432, + "step": 23344 + }, + { + "epoch": 1.0868775752496682, + "grad_norm": 0.3422453741132221, + "learning_rate": 8.046546374837738e-05, + "loss": 2.8605, + "step": 23345 + }, + { + "epoch": 1.0869241334357613, + "grad_norm": 0.3547261278443568, + "learning_rate": 8.046331585798498e-05, + "loss": 2.8813, + "step": 23346 + }, + { + "epoch": 1.0869706916218544, + "grad_norm": 0.34217966764010105, + "learning_rate": 8.046116787818573e-05, + "loss": 2.8367, + "step": 23347 + }, + { + "epoch": 1.0870172498079476, + "grad_norm": 0.34092617062432734, + "learning_rate": 8.045901980898596e-05, + "loss": 2.7487, + "step": 23348 + }, + { + "epoch": 1.0870638079940405, + "grad_norm": 0.34563427286858817, + "learning_rate": 8.045687165039199e-05, + "loss": 2.7189, + "step": 23349 + }, + { + "epoch": 1.0871103661801336, + "grad_norm": 0.33928862134980237, + "learning_rate": 8.045472340241008e-05, + "loss": 2.8835, + "step": 23350 + }, + { + "epoch": 1.0871569243662267, + "grad_norm": 0.34255501175852276, + "learning_rate": 8.045257506504655e-05, + "loss": 2.8395, + "step": 23351 + }, + { + "epoch": 1.0872034825523198, + "grad_norm": 0.34909467007134654, + "learning_rate": 8.045042663830773e-05, + "loss": 2.7919, + "step": 23352 + }, + { + "epoch": 1.087250040738413, + "grad_norm": 0.3596310225716844, + "learning_rate": 8.04482781221999e-05, + "loss": 2.7918, + "step": 23353 + }, + { + "epoch": 1.0872965989245058, + "grad_norm": 0.38916055944566874, + "learning_rate": 8.044612951672938e-05, + "loss": 2.7867, + "step": 23354 + }, + { + "epoch": 1.087343157110599, + "grad_norm": 0.36839422857543547, + "learning_rate": 8.044398082190246e-05, + "loss": 2.8052, + "step": 23355 + }, + { + "epoch": 1.087389715296692, + "grad_norm": 0.5130224001985588, + "learning_rate": 8.044183203772546e-05, + "loss": 2.8275, + "step": 23356 + }, + { + "epoch": 1.0874362734827852, + "grad_norm": 0.4469503061071411, + "learning_rate": 8.043968316420468e-05, + "loss": 2.7528, + "step": 23357 + }, + { + "epoch": 
1.0874828316688783, + "grad_norm": 0.40768195093112625, + "learning_rate": 8.043753420134644e-05, + "loss": 2.7519, + "step": 23358 + }, + { + "epoch": 1.0875293898549712, + "grad_norm": 0.3892532407236647, + "learning_rate": 8.043538514915703e-05, + "loss": 2.8428, + "step": 23359 + }, + { + "epoch": 1.0875759480410643, + "grad_norm": 0.39165367739715595, + "learning_rate": 8.043323600764274e-05, + "loss": 2.8056, + "step": 23360 + }, + { + "epoch": 1.0876225062271574, + "grad_norm": 0.3667780419593418, + "learning_rate": 8.043108677680993e-05, + "loss": 2.7984, + "step": 23361 + }, + { + "epoch": 1.0876690644132505, + "grad_norm": 0.3874866473509387, + "learning_rate": 8.042893745666486e-05, + "loss": 2.7861, + "step": 23362 + }, + { + "epoch": 1.0877156225993436, + "grad_norm": 0.3921612498555152, + "learning_rate": 8.042678804721386e-05, + "loss": 2.901, + "step": 23363 + }, + { + "epoch": 1.0877621807854365, + "grad_norm": 0.38745596793558096, + "learning_rate": 8.042463854846325e-05, + "loss": 2.833, + "step": 23364 + }, + { + "epoch": 1.0878087389715296, + "grad_norm": 0.3731766901823232, + "learning_rate": 8.042248896041931e-05, + "loss": 2.9113, + "step": 23365 + }, + { + "epoch": 1.0878552971576227, + "grad_norm": 0.3752350199192995, + "learning_rate": 8.042033928308837e-05, + "loss": 2.8247, + "step": 23366 + }, + { + "epoch": 1.0879018553437159, + "grad_norm": 0.35006931762144883, + "learning_rate": 8.041818951647673e-05, + "loss": 2.8018, + "step": 23367 + }, + { + "epoch": 1.087948413529809, + "grad_norm": 0.38632816037839357, + "learning_rate": 8.041603966059069e-05, + "loss": 2.8549, + "step": 23368 + }, + { + "epoch": 1.0879949717159019, + "grad_norm": 0.35858876554167113, + "learning_rate": 8.041388971543657e-05, + "loss": 2.8029, + "step": 23369 + }, + { + "epoch": 1.088041529901995, + "grad_norm": 0.36534481791855544, + "learning_rate": 8.041173968102068e-05, + "loss": 2.8122, + "step": 23370 + }, + { + "epoch": 1.088088088088088, + "grad_norm": 0.32467344623299865, + "learning_rate": 8.040958955734935e-05, + "loss": 2.7776, + "step": 23371 + }, + { + "epoch": 1.0881346462741812, + "grad_norm": 0.3552944580503512, + "learning_rate": 8.040743934442884e-05, + "loss": 2.7695, + "step": 23372 + }, + { + "epoch": 1.0881812044602743, + "grad_norm": 0.3587497682418664, + "learning_rate": 8.04052890422655e-05, + "loss": 2.8199, + "step": 23373 + }, + { + "epoch": 1.0882277626463672, + "grad_norm": 0.3792577509623591, + "learning_rate": 8.040313865086563e-05, + "loss": 2.6708, + "step": 23374 + }, + { + "epoch": 1.0882743208324603, + "grad_norm": 0.35241399530797374, + "learning_rate": 8.040098817023557e-05, + "loss": 2.9054, + "step": 23375 + }, + { + "epoch": 1.0883208790185535, + "grad_norm": 0.3848217534455445, + "learning_rate": 8.039883760038158e-05, + "loss": 2.8639, + "step": 23376 + }, + { + "epoch": 1.0883674372046466, + "grad_norm": 0.318990402858631, + "learning_rate": 8.039668694131002e-05, + "loss": 2.7722, + "step": 23377 + }, + { + "epoch": 1.0884139953907397, + "grad_norm": 0.3508691592611848, + "learning_rate": 8.039453619302714e-05, + "loss": 2.7313, + "step": 23378 + }, + { + "epoch": 1.0884605535768326, + "grad_norm": 0.35045031348179106, + "learning_rate": 8.039238535553932e-05, + "loss": 2.9356, + "step": 23379 + }, + { + "epoch": 1.0885071117629257, + "grad_norm": 0.34382985588596343, + "learning_rate": 8.039023442885286e-05, + "loss": 2.7682, + "step": 23380 + }, + { + "epoch": 1.0885536699490188, + "grad_norm": 0.3509287634908236, + "learning_rate": 
8.038808341297401e-05, + "loss": 2.8979, + "step": 23381 + }, + { + "epoch": 1.088600228135112, + "grad_norm": 0.36277437465334483, + "learning_rate": 8.038593230790915e-05, + "loss": 2.8528, + "step": 23382 + }, + { + "epoch": 1.0886467863212048, + "grad_norm": 0.3374626645801731, + "learning_rate": 8.038378111366459e-05, + "loss": 2.949, + "step": 23383 + }, + { + "epoch": 1.088693344507298, + "grad_norm": 0.39049556631721255, + "learning_rate": 8.038162983024659e-05, + "loss": 2.85, + "step": 23384 + }, + { + "epoch": 1.088739902693391, + "grad_norm": 0.33969130513651385, + "learning_rate": 8.037947845766152e-05, + "loss": 2.8605, + "step": 23385 + }, + { + "epoch": 1.0887864608794842, + "grad_norm": 0.3625706720449069, + "learning_rate": 8.037732699591568e-05, + "loss": 2.8059, + "step": 23386 + }, + { + "epoch": 1.0888330190655773, + "grad_norm": 0.3468957626462385, + "learning_rate": 8.037517544501536e-05, + "loss": 2.7397, + "step": 23387 + }, + { + "epoch": 1.0888795772516702, + "grad_norm": 0.33071894788373646, + "learning_rate": 8.037302380496689e-05, + "loss": 2.8726, + "step": 23388 + }, + { + "epoch": 1.0889261354377633, + "grad_norm": 0.34321948390986745, + "learning_rate": 8.037087207577661e-05, + "loss": 2.8423, + "step": 23389 + }, + { + "epoch": 1.0889726936238564, + "grad_norm": 0.3411892354783966, + "learning_rate": 8.036872025745081e-05, + "loss": 2.8273, + "step": 23390 + }, + { + "epoch": 1.0890192518099495, + "grad_norm": 0.34118477955776705, + "learning_rate": 8.036656834999579e-05, + "loss": 2.7681, + "step": 23391 + }, + { + "epoch": 1.0890658099960426, + "grad_norm": 0.3085168392808763, + "learning_rate": 8.03644163534179e-05, + "loss": 2.8377, + "step": 23392 + }, + { + "epoch": 1.0891123681821355, + "grad_norm": 0.3276010625185087, + "learning_rate": 8.036226426772343e-05, + "loss": 2.8482, + "step": 23393 + }, + { + "epoch": 1.0891589263682286, + "grad_norm": 0.31272116299674124, + "learning_rate": 8.036011209291871e-05, + "loss": 2.6978, + "step": 23394 + }, + { + "epoch": 1.0892054845543218, + "grad_norm": 0.3069128182159847, + "learning_rate": 8.035795982901004e-05, + "loss": 2.882, + "step": 23395 + }, + { + "epoch": 1.0892520427404149, + "grad_norm": 0.3251643307868852, + "learning_rate": 8.035580747600377e-05, + "loss": 2.869, + "step": 23396 + }, + { + "epoch": 1.089298600926508, + "grad_norm": 0.33593309802562865, + "learning_rate": 8.035365503390618e-05, + "loss": 2.8667, + "step": 23397 + }, + { + "epoch": 1.0893451591126009, + "grad_norm": 0.3128784580313512, + "learning_rate": 8.035150250272361e-05, + "loss": 2.8276, + "step": 23398 + }, + { + "epoch": 1.089391717298694, + "grad_norm": 0.33918507575827017, + "learning_rate": 8.034934988246237e-05, + "loss": 2.8295, + "step": 23399 + }, + { + "epoch": 1.089438275484787, + "grad_norm": 0.31733602192758253, + "learning_rate": 8.034719717312877e-05, + "loss": 2.7038, + "step": 23400 + }, + { + "epoch": 1.0894848336708802, + "grad_norm": 0.36771387606788775, + "learning_rate": 8.034504437472914e-05, + "loss": 2.894, + "step": 23401 + }, + { + "epoch": 1.0895313918569733, + "grad_norm": 0.2987249334462044, + "learning_rate": 8.03428914872698e-05, + "loss": 2.751, + "step": 23402 + }, + { + "epoch": 1.0895779500430662, + "grad_norm": 0.33938576803584186, + "learning_rate": 8.034073851075706e-05, + "loss": 2.8145, + "step": 23403 + }, + { + "epoch": 1.0896245082291593, + "grad_norm": 0.345565650857453, + "learning_rate": 8.033858544519723e-05, + "loss": 2.7827, + "step": 23404 + }, + { + "epoch": 
1.0896710664152525, + "grad_norm": 0.32917435048981303, + "learning_rate": 8.033643229059667e-05, + "loss": 2.7869, + "step": 23405 + }, + { + "epoch": 1.0897176246013456, + "grad_norm": 0.32742974424636906, + "learning_rate": 8.033427904696164e-05, + "loss": 2.8751, + "step": 23406 + }, + { + "epoch": 1.0897641827874387, + "grad_norm": 0.32457110793351895, + "learning_rate": 8.033212571429851e-05, + "loss": 2.7521, + "step": 23407 + }, + { + "epoch": 1.0898107409735316, + "grad_norm": 0.3492421774461364, + "learning_rate": 8.032997229261356e-05, + "loss": 2.8104, + "step": 23408 + }, + { + "epoch": 1.0898572991596247, + "grad_norm": 0.2962879331756358, + "learning_rate": 8.032781878191314e-05, + "loss": 2.8841, + "step": 23409 + }, + { + "epoch": 1.0899038573457178, + "grad_norm": 0.35049340890817415, + "learning_rate": 8.032566518220356e-05, + "loss": 2.7389, + "step": 23410 + }, + { + "epoch": 1.089950415531811, + "grad_norm": 0.3116435953919963, + "learning_rate": 8.032351149349113e-05, + "loss": 2.7384, + "step": 23411 + }, + { + "epoch": 1.089996973717904, + "grad_norm": 0.3181335408870271, + "learning_rate": 8.03213577157822e-05, + "loss": 2.6983, + "step": 23412 + }, + { + "epoch": 1.090043531903997, + "grad_norm": 0.3496746542179787, + "learning_rate": 8.031920384908305e-05, + "loss": 2.8629, + "step": 23413 + }, + { + "epoch": 1.09009009009009, + "grad_norm": 0.3363440804307103, + "learning_rate": 8.031704989340004e-05, + "loss": 2.8483, + "step": 23414 + }, + { + "epoch": 1.0901366482761832, + "grad_norm": 0.35097232121397065, + "learning_rate": 8.031489584873947e-05, + "loss": 2.8272, + "step": 23415 + }, + { + "epoch": 1.0901832064622763, + "grad_norm": 0.3375630305520118, + "learning_rate": 8.031274171510768e-05, + "loss": 2.6523, + "step": 23416 + }, + { + "epoch": 1.0902297646483694, + "grad_norm": 0.35038954351144, + "learning_rate": 8.031058749251096e-05, + "loss": 2.9304, + "step": 23417 + }, + { + "epoch": 1.0902763228344623, + "grad_norm": 0.3608462600908659, + "learning_rate": 8.030843318095566e-05, + "loss": 2.9339, + "step": 23418 + }, + { + "epoch": 1.0903228810205554, + "grad_norm": 0.36486469850469566, + "learning_rate": 8.030627878044808e-05, + "loss": 2.9084, + "step": 23419 + }, + { + "epoch": 1.0903694392066485, + "grad_norm": 0.32633584315727227, + "learning_rate": 8.030412429099458e-05, + "loss": 2.7884, + "step": 23420 + }, + { + "epoch": 1.0904159973927416, + "grad_norm": 0.33133187697084865, + "learning_rate": 8.030196971260145e-05, + "loss": 2.8136, + "step": 23421 + }, + { + "epoch": 1.0904625555788345, + "grad_norm": 0.33871470591758346, + "learning_rate": 8.029981504527503e-05, + "loss": 2.8662, + "step": 23422 + }, + { + "epoch": 1.0905091137649277, + "grad_norm": 0.33674236292514115, + "learning_rate": 8.029766028902164e-05, + "loss": 2.7209, + "step": 23423 + }, + { + "epoch": 1.0905556719510208, + "grad_norm": 0.3294323912792689, + "learning_rate": 8.029550544384759e-05, + "loss": 2.7715, + "step": 23424 + }, + { + "epoch": 1.0906022301371139, + "grad_norm": 0.31723237549589717, + "learning_rate": 8.029335050975922e-05, + "loss": 2.787, + "step": 23425 + }, + { + "epoch": 1.090648788323207, + "grad_norm": 0.39869821246332365, + "learning_rate": 8.029119548676285e-05, + "loss": 2.8966, + "step": 23426 + }, + { + "epoch": 1.0906953465093001, + "grad_norm": 0.3225349127138904, + "learning_rate": 8.02890403748648e-05, + "loss": 2.8119, + "step": 23427 + }, + { + "epoch": 1.090741904695393, + "grad_norm": 0.325030708880003, + "learning_rate": 
8.028688517407141e-05, + "loss": 2.8034, + "step": 23428 + }, + { + "epoch": 1.0907884628814861, + "grad_norm": 0.3167634073386224, + "learning_rate": 8.0284729884389e-05, + "loss": 2.812, + "step": 23429 + }, + { + "epoch": 1.0908350210675792, + "grad_norm": 0.34128959257526603, + "learning_rate": 8.028257450582388e-05, + "loss": 2.7605, + "step": 23430 + }, + { + "epoch": 1.0908815792536724, + "grad_norm": 0.30055669134305163, + "learning_rate": 8.028041903838239e-05, + "loss": 2.7727, + "step": 23431 + }, + { + "epoch": 1.0909281374397652, + "grad_norm": 0.33890389201545623, + "learning_rate": 8.027826348207086e-05, + "loss": 2.7715, + "step": 23432 + }, + { + "epoch": 1.0909746956258584, + "grad_norm": 0.29893346167163376, + "learning_rate": 8.027610783689561e-05, + "loss": 2.7967, + "step": 23433 + }, + { + "epoch": 1.0910212538119515, + "grad_norm": 0.33770346285850006, + "learning_rate": 8.027395210286298e-05, + "loss": 2.8545, + "step": 23434 + }, + { + "epoch": 1.0910678119980446, + "grad_norm": 0.3374664542579617, + "learning_rate": 8.027179627997925e-05, + "loss": 2.8614, + "step": 23435 + }, + { + "epoch": 1.0911143701841377, + "grad_norm": 0.3049251069542274, + "learning_rate": 8.02696403682508e-05, + "loss": 2.8272, + "step": 23436 + }, + { + "epoch": 1.0911609283702306, + "grad_norm": 0.33168996096232883, + "learning_rate": 8.026748436768392e-05, + "loss": 2.6781, + "step": 23437 + }, + { + "epoch": 1.0912074865563237, + "grad_norm": 0.3079065780240443, + "learning_rate": 8.026532827828497e-05, + "loss": 2.8923, + "step": 23438 + }, + { + "epoch": 1.0912540447424168, + "grad_norm": 0.3136621661713066, + "learning_rate": 8.026317210006026e-05, + "loss": 2.8932, + "step": 23439 + }, + { + "epoch": 1.09130060292851, + "grad_norm": 0.30422490138239533, + "learning_rate": 8.026101583301613e-05, + "loss": 2.8703, + "step": 23440 + }, + { + "epoch": 1.091347161114603, + "grad_norm": 0.3154129062233641, + "learning_rate": 8.025885947715888e-05, + "loss": 2.7169, + "step": 23441 + }, + { + "epoch": 1.091393719300696, + "grad_norm": 0.31480186646447456, + "learning_rate": 8.025670303249488e-05, + "loss": 2.8431, + "step": 23442 + }, + { + "epoch": 1.091440277486789, + "grad_norm": 0.3284770350082601, + "learning_rate": 8.025454649903042e-05, + "loss": 2.8065, + "step": 23443 + }, + { + "epoch": 1.0914868356728822, + "grad_norm": 0.3024057398099295, + "learning_rate": 8.025238987677186e-05, + "loss": 2.8302, + "step": 23444 + }, + { + "epoch": 1.0915333938589753, + "grad_norm": 0.3407964302233829, + "learning_rate": 8.025023316572551e-05, + "loss": 2.8813, + "step": 23445 + }, + { + "epoch": 1.0915799520450684, + "grad_norm": 0.2887711150169053, + "learning_rate": 8.024807636589769e-05, + "loss": 2.8357, + "step": 23446 + }, + { + "epoch": 1.0916265102311613, + "grad_norm": 0.3395003331742657, + "learning_rate": 8.024591947729477e-05, + "loss": 2.8903, + "step": 23447 + }, + { + "epoch": 1.0916730684172544, + "grad_norm": 0.3212564787923579, + "learning_rate": 8.024376249992305e-05, + "loss": 2.889, + "step": 23448 + }, + { + "epoch": 1.0917196266033475, + "grad_norm": 0.31766239762729404, + "learning_rate": 8.024160543378887e-05, + "loss": 2.8511, + "step": 23449 + }, + { + "epoch": 1.0917661847894407, + "grad_norm": 0.34743788851145674, + "learning_rate": 8.023944827889854e-05, + "loss": 2.7975, + "step": 23450 + }, + { + "epoch": 1.0918127429755338, + "grad_norm": 0.3270788348937371, + "learning_rate": 8.023729103525842e-05, + "loss": 2.8479, + "step": 23451 + }, + { + "epoch": 
1.0918593011616267, + "grad_norm": 0.3534778177162906, + "learning_rate": 8.023513370287483e-05, + "loss": 2.7458, + "step": 23452 + }, + { + "epoch": 1.0919058593477198, + "grad_norm": 0.33967104290085187, + "learning_rate": 8.02329762817541e-05, + "loss": 2.7952, + "step": 23453 + }, + { + "epoch": 1.091952417533813, + "grad_norm": 0.36220926759223177, + "learning_rate": 8.023081877190257e-05, + "loss": 2.7999, + "step": 23454 + }, + { + "epoch": 1.091998975719906, + "grad_norm": 0.3505878858756822, + "learning_rate": 8.022866117332656e-05, + "loss": 2.7174, + "step": 23455 + }, + { + "epoch": 1.0920455339059991, + "grad_norm": 0.3251739260672201, + "learning_rate": 8.02265034860324e-05, + "loss": 2.687, + "step": 23456 + }, + { + "epoch": 1.092092092092092, + "grad_norm": 0.3672648818919965, + "learning_rate": 8.022434571002644e-05, + "loss": 2.7734, + "step": 23457 + }, + { + "epoch": 1.0921386502781851, + "grad_norm": 0.31003049592866594, + "learning_rate": 8.0222187845315e-05, + "loss": 2.7584, + "step": 23458 + }, + { + "epoch": 1.0921852084642782, + "grad_norm": 0.32507861258154586, + "learning_rate": 8.022002989190441e-05, + "loss": 2.931, + "step": 23459 + }, + { + "epoch": 1.0922317666503714, + "grad_norm": 0.32840835532495083, + "learning_rate": 8.021787184980103e-05, + "loss": 2.8238, + "step": 23460 + }, + { + "epoch": 1.0922783248364643, + "grad_norm": 0.3448771299709363, + "learning_rate": 8.021571371901115e-05, + "loss": 2.8508, + "step": 23461 + }, + { + "epoch": 1.0923248830225574, + "grad_norm": 0.3541791658350751, + "learning_rate": 8.021355549954114e-05, + "loss": 2.8701, + "step": 23462 + }, + { + "epoch": 1.0923714412086505, + "grad_norm": 0.34075388154920006, + "learning_rate": 8.021139719139732e-05, + "loss": 2.8288, + "step": 23463 + }, + { + "epoch": 1.0924179993947436, + "grad_norm": 0.37312628090559075, + "learning_rate": 8.020923879458601e-05, + "loss": 2.8289, + "step": 23464 + }, + { + "epoch": 1.0924645575808367, + "grad_norm": 0.32838171423064394, + "learning_rate": 8.020708030911357e-05, + "loss": 2.8397, + "step": 23465 + }, + { + "epoch": 1.0925111157669298, + "grad_norm": 0.35722426412997016, + "learning_rate": 8.020492173498634e-05, + "loss": 2.8715, + "step": 23466 + }, + { + "epoch": 1.0925576739530227, + "grad_norm": 0.3362895117670462, + "learning_rate": 8.020276307221062e-05, + "loss": 2.8348, + "step": 23467 + }, + { + "epoch": 1.0926042321391158, + "grad_norm": 0.34255108253432903, + "learning_rate": 8.020060432079277e-05, + "loss": 2.8588, + "step": 23468 + }, + { + "epoch": 1.092650790325209, + "grad_norm": 0.3483398255531289, + "learning_rate": 8.019844548073912e-05, + "loss": 2.8062, + "step": 23469 + }, + { + "epoch": 1.092697348511302, + "grad_norm": 0.3401983211047264, + "learning_rate": 8.019628655205603e-05, + "loss": 2.8112, + "step": 23470 + }, + { + "epoch": 1.092743906697395, + "grad_norm": 0.3186259825157137, + "learning_rate": 8.01941275347498e-05, + "loss": 2.8487, + "step": 23471 + }, + { + "epoch": 1.092790464883488, + "grad_norm": 0.3621372753526441, + "learning_rate": 8.019196842882677e-05, + "loss": 2.9148, + "step": 23472 + }, + { + "epoch": 1.0928370230695812, + "grad_norm": 0.30044311972620796, + "learning_rate": 8.018980923429329e-05, + "loss": 2.7471, + "step": 23473 + }, + { + "epoch": 1.0928835812556743, + "grad_norm": 0.33481480186273777, + "learning_rate": 8.018764995115569e-05, + "loss": 2.749, + "step": 23474 + }, + { + "epoch": 1.0929301394417674, + "grad_norm": 0.31456344947665354, + "learning_rate": 
8.018549057942033e-05, + "loss": 2.7092, + "step": 23475 + }, + { + "epoch": 1.0929766976278603, + "grad_norm": 0.3295624951524598, + "learning_rate": 8.018333111909352e-05, + "loss": 2.8288, + "step": 23476 + }, + { + "epoch": 1.0930232558139534, + "grad_norm": 0.3113398817887515, + "learning_rate": 8.01811715701816e-05, + "loss": 2.7764, + "step": 23477 + }, + { + "epoch": 1.0930698140000465, + "grad_norm": 0.3251410460027406, + "learning_rate": 8.017901193269091e-05, + "loss": 2.8054, + "step": 23478 + }, + { + "epoch": 1.0931163721861397, + "grad_norm": 0.3312232223423686, + "learning_rate": 8.01768522066278e-05, + "loss": 2.8966, + "step": 23479 + }, + { + "epoch": 1.0931629303722328, + "grad_norm": 0.3446527008967292, + "learning_rate": 8.017469239199862e-05, + "loss": 2.832, + "step": 23480 + }, + { + "epoch": 1.0932094885583257, + "grad_norm": 0.30889300155031135, + "learning_rate": 8.017253248880967e-05, + "loss": 2.7394, + "step": 23481 + }, + { + "epoch": 1.0932560467444188, + "grad_norm": 0.3161365517981073, + "learning_rate": 8.017037249706732e-05, + "loss": 2.8054, + "step": 23482 + }, + { + "epoch": 1.093302604930512, + "grad_norm": 0.32180394749993135, + "learning_rate": 8.016821241677789e-05, + "loss": 2.9313, + "step": 23483 + }, + { + "epoch": 1.093349163116605, + "grad_norm": 0.33751228014844303, + "learning_rate": 8.016605224794773e-05, + "loss": 2.7507, + "step": 23484 + }, + { + "epoch": 1.0933957213026981, + "grad_norm": 0.3246821820960742, + "learning_rate": 8.016389199058319e-05, + "loss": 2.7953, + "step": 23485 + }, + { + "epoch": 1.093442279488791, + "grad_norm": 0.3370540062855974, + "learning_rate": 8.016173164469059e-05, + "loss": 2.7518, + "step": 23486 + }, + { + "epoch": 1.0934888376748841, + "grad_norm": 0.305465518155819, + "learning_rate": 8.015957121027627e-05, + "loss": 2.8544, + "step": 23487 + }, + { + "epoch": 1.0935353958609773, + "grad_norm": 0.358237835411317, + "learning_rate": 8.01574106873466e-05, + "loss": 2.8102, + "step": 23488 + }, + { + "epoch": 1.0935819540470704, + "grad_norm": 0.33819802228485474, + "learning_rate": 8.015525007590788e-05, + "loss": 2.8645, + "step": 23489 + }, + { + "epoch": 1.0936285122331635, + "grad_norm": 0.3486904845438401, + "learning_rate": 8.015308937596648e-05, + "loss": 2.7648, + "step": 23490 + }, + { + "epoch": 1.0936750704192564, + "grad_norm": 0.3473494003856862, + "learning_rate": 8.015092858752873e-05, + "loss": 2.7955, + "step": 23491 + }, + { + "epoch": 1.0937216286053495, + "grad_norm": 0.3423872705704528, + "learning_rate": 8.014876771060097e-05, + "loss": 2.8139, + "step": 23492 + }, + { + "epoch": 1.0937681867914426, + "grad_norm": 0.35394048166965003, + "learning_rate": 8.014660674518957e-05, + "loss": 2.828, + "step": 23493 + }, + { + "epoch": 1.0938147449775357, + "grad_norm": 0.3602713456462668, + "learning_rate": 8.014444569130084e-05, + "loss": 2.8557, + "step": 23494 + }, + { + "epoch": 1.0938613031636288, + "grad_norm": 0.36300927980825226, + "learning_rate": 8.014228454894111e-05, + "loss": 2.8821, + "step": 23495 + }, + { + "epoch": 1.0939078613497217, + "grad_norm": 0.3479146151184011, + "learning_rate": 8.014012331811676e-05, + "loss": 2.6955, + "step": 23496 + }, + { + "epoch": 1.0939544195358148, + "grad_norm": 0.3387376100689418, + "learning_rate": 8.013796199883412e-05, + "loss": 2.7843, + "step": 23497 + }, + { + "epoch": 1.094000977721908, + "grad_norm": 0.35116927631072525, + "learning_rate": 8.013580059109954e-05, + "loss": 2.8789, + "step": 23498 + }, + { + "epoch": 
1.094047535908001, + "grad_norm": 0.3493135903829758, + "learning_rate": 8.013363909491935e-05, + "loss": 2.802, + "step": 23499 + }, + { + "epoch": 1.0940940940940942, + "grad_norm": 0.3288089138665283, + "learning_rate": 8.013147751029988e-05, + "loss": 2.8631, + "step": 23500 + }, + { + "epoch": 1.094140652280187, + "grad_norm": 0.3597789986297748, + "learning_rate": 8.01293158372475e-05, + "loss": 2.7395, + "step": 23501 + }, + { + "epoch": 1.0941872104662802, + "grad_norm": 0.3185395361338763, + "learning_rate": 8.012715407576856e-05, + "loss": 2.7788, + "step": 23502 + }, + { + "epoch": 1.0942337686523733, + "grad_norm": 0.35303773013876893, + "learning_rate": 8.012499222586937e-05, + "loss": 2.8882, + "step": 23503 + }, + { + "epoch": 1.0942803268384664, + "grad_norm": 0.3518313948748512, + "learning_rate": 8.012283028755631e-05, + "loss": 2.7322, + "step": 23504 + }, + { + "epoch": 1.0943268850245595, + "grad_norm": 0.35142424321550464, + "learning_rate": 8.01206682608357e-05, + "loss": 2.8386, + "step": 23505 + }, + { + "epoch": 1.0943734432106524, + "grad_norm": 0.36358816870172744, + "learning_rate": 8.01185061457139e-05, + "loss": 2.8497, + "step": 23506 + }, + { + "epoch": 1.0944200013967456, + "grad_norm": 0.3825319500454229, + "learning_rate": 8.011634394219726e-05, + "loss": 2.8951, + "step": 23507 + }, + { + "epoch": 1.0944665595828387, + "grad_norm": 0.3310001552137936, + "learning_rate": 8.011418165029212e-05, + "loss": 2.9509, + "step": 23508 + }, + { + "epoch": 1.0945131177689318, + "grad_norm": 0.39699004910395125, + "learning_rate": 8.01120192700048e-05, + "loss": 2.7656, + "step": 23509 + }, + { + "epoch": 1.0945596759550247, + "grad_norm": 0.31405536525346345, + "learning_rate": 8.010985680134168e-05, + "loss": 2.7484, + "step": 23510 + }, + { + "epoch": 1.0946062341411178, + "grad_norm": 0.3657596108406007, + "learning_rate": 8.01076942443091e-05, + "loss": 2.8166, + "step": 23511 + }, + { + "epoch": 1.094652792327211, + "grad_norm": 0.331600364948962, + "learning_rate": 8.010553159891341e-05, + "loss": 2.8616, + "step": 23512 + }, + { + "epoch": 1.094699350513304, + "grad_norm": 0.3546682410142303, + "learning_rate": 8.010336886516095e-05, + "loss": 2.7886, + "step": 23513 + }, + { + "epoch": 1.0947459086993971, + "grad_norm": 0.34281056416364863, + "learning_rate": 8.010120604305806e-05, + "loss": 2.8638, + "step": 23514 + }, + { + "epoch": 1.0947924668854903, + "grad_norm": 0.31820308234082145, + "learning_rate": 8.00990431326111e-05, + "loss": 2.8062, + "step": 23515 + }, + { + "epoch": 1.0948390250715831, + "grad_norm": 0.3341706789273983, + "learning_rate": 8.009688013382642e-05, + "loss": 2.7584, + "step": 23516 + }, + { + "epoch": 1.0948855832576763, + "grad_norm": 0.3320278751714263, + "learning_rate": 8.009471704671036e-05, + "loss": 2.857, + "step": 23517 + }, + { + "epoch": 1.0949321414437694, + "grad_norm": 0.3599299616230351, + "learning_rate": 8.009255387126926e-05, + "loss": 2.8548, + "step": 23518 + }, + { + "epoch": 1.0949786996298625, + "grad_norm": 0.3538208713867126, + "learning_rate": 8.009039060750949e-05, + "loss": 2.8586, + "step": 23519 + }, + { + "epoch": 1.0950252578159554, + "grad_norm": 0.31189267335664406, + "learning_rate": 8.008822725543739e-05, + "loss": 2.7693, + "step": 23520 + }, + { + "epoch": 1.0950718160020485, + "grad_norm": 0.3630415660292583, + "learning_rate": 8.008606381505928e-05, + "loss": 2.8958, + "step": 23521 + }, + { + "epoch": 1.0951183741881416, + "grad_norm": 0.30865974467944124, + "learning_rate": 
8.008390028638155e-05, + "loss": 2.8169, + "step": 23522 + }, + { + "epoch": 1.0951649323742347, + "grad_norm": 0.33008109366843363, + "learning_rate": 8.008173666941055e-05, + "loss": 2.9041, + "step": 23523 + }, + { + "epoch": 1.0952114905603278, + "grad_norm": 0.32556794342171663, + "learning_rate": 8.007957296415261e-05, + "loss": 2.8153, + "step": 23524 + }, + { + "epoch": 1.0952580487464207, + "grad_norm": 0.3792667727624232, + "learning_rate": 8.00774091706141e-05, + "loss": 2.9816, + "step": 23525 + }, + { + "epoch": 1.0953046069325139, + "grad_norm": 0.3143003041991729, + "learning_rate": 8.007524528880132e-05, + "loss": 2.7948, + "step": 23526 + }, + { + "epoch": 1.095351165118607, + "grad_norm": 0.3461706706018253, + "learning_rate": 8.007308131872068e-05, + "loss": 2.906, + "step": 23527 + }, + { + "epoch": 1.0953977233047, + "grad_norm": 0.33719208543018553, + "learning_rate": 8.007091726037852e-05, + "loss": 2.7339, + "step": 23528 + }, + { + "epoch": 1.0954442814907932, + "grad_norm": 0.33222448770399204, + "learning_rate": 8.006875311378117e-05, + "loss": 2.8841, + "step": 23529 + }, + { + "epoch": 1.095490839676886, + "grad_norm": 0.32410177402940027, + "learning_rate": 8.006658887893498e-05, + "loss": 2.7268, + "step": 23530 + }, + { + "epoch": 1.0955373978629792, + "grad_norm": 0.3758912763944541, + "learning_rate": 8.006442455584634e-05, + "loss": 2.8622, + "step": 23531 + }, + { + "epoch": 1.0955839560490723, + "grad_norm": 0.3542881337204667, + "learning_rate": 8.006226014452156e-05, + "loss": 2.672, + "step": 23532 + }, + { + "epoch": 1.0956305142351654, + "grad_norm": 0.3326832185329604, + "learning_rate": 8.006009564496702e-05, + "loss": 2.7976, + "step": 23533 + }, + { + "epoch": 1.0956770724212586, + "grad_norm": 0.3307647169352494, + "learning_rate": 8.005793105718904e-05, + "loss": 2.8011, + "step": 23534 + }, + { + "epoch": 1.0957236306073515, + "grad_norm": 0.3487255630845031, + "learning_rate": 8.005576638119401e-05, + "loss": 2.6985, + "step": 23535 + }, + { + "epoch": 1.0957701887934446, + "grad_norm": 0.35264990536859253, + "learning_rate": 8.005360161698824e-05, + "loss": 2.8653, + "step": 23536 + }, + { + "epoch": 1.0958167469795377, + "grad_norm": 0.37154979153081835, + "learning_rate": 8.005143676457814e-05, + "loss": 2.8642, + "step": 23537 + }, + { + "epoch": 1.0958633051656308, + "grad_norm": 0.3250734994156861, + "learning_rate": 8.004927182397004e-05, + "loss": 2.876, + "step": 23538 + }, + { + "epoch": 1.095909863351724, + "grad_norm": 0.3398918123585785, + "learning_rate": 8.004710679517027e-05, + "loss": 2.971, + "step": 23539 + }, + { + "epoch": 1.0959564215378168, + "grad_norm": 0.33331927516340976, + "learning_rate": 8.00449416781852e-05, + "loss": 2.8027, + "step": 23540 + }, + { + "epoch": 1.09600297972391, + "grad_norm": 0.3345490650481108, + "learning_rate": 8.004277647302119e-05, + "loss": 2.8469, + "step": 23541 + }, + { + "epoch": 1.096049537910003, + "grad_norm": 0.3621870063319083, + "learning_rate": 8.004061117968458e-05, + "loss": 2.7498, + "step": 23542 + }, + { + "epoch": 1.0960960960960962, + "grad_norm": 0.3900171297775831, + "learning_rate": 8.003844579818174e-05, + "loss": 2.7927, + "step": 23543 + }, + { + "epoch": 1.0961426542821893, + "grad_norm": 0.32361795891623424, + "learning_rate": 8.003628032851904e-05, + "loss": 2.9211, + "step": 23544 + }, + { + "epoch": 1.0961892124682822, + "grad_norm": 0.3787177708269827, + "learning_rate": 8.003411477070279e-05, + "loss": 2.7704, + "step": 23545 + }, + { + "epoch": 
1.0962357706543753, + "grad_norm": 0.355281637389443, + "learning_rate": 8.00319491247394e-05, + "loss": 2.7678, + "step": 23546 + }, + { + "epoch": 1.0962823288404684, + "grad_norm": 0.38117414667021476, + "learning_rate": 8.002978339063519e-05, + "loss": 2.7234, + "step": 23547 + }, + { + "epoch": 1.0963288870265615, + "grad_norm": 0.3251633507111982, + "learning_rate": 8.002761756839651e-05, + "loss": 2.8631, + "step": 23548 + }, + { + "epoch": 1.0963754452126544, + "grad_norm": 0.4040629748606208, + "learning_rate": 8.002545165802974e-05, + "loss": 2.812, + "step": 23549 + }, + { + "epoch": 1.0964220033987475, + "grad_norm": 0.30656114824962893, + "learning_rate": 8.002328565954123e-05, + "loss": 2.832, + "step": 23550 + }, + { + "epoch": 1.0964685615848406, + "grad_norm": 0.3548251658206662, + "learning_rate": 8.002111957293732e-05, + "loss": 2.7425, + "step": 23551 + }, + { + "epoch": 1.0965151197709337, + "grad_norm": 0.3442802942484616, + "learning_rate": 8.001895339822439e-05, + "loss": 2.8043, + "step": 23552 + }, + { + "epoch": 1.0965616779570269, + "grad_norm": 0.33589760799453544, + "learning_rate": 8.001678713540879e-05, + "loss": 2.8336, + "step": 23553 + }, + { + "epoch": 1.09660823614312, + "grad_norm": 0.34465157833847004, + "learning_rate": 8.001462078449687e-05, + "loss": 2.754, + "step": 23554 + }, + { + "epoch": 1.0966547943292129, + "grad_norm": 0.364646214049698, + "learning_rate": 8.0012454345495e-05, + "loss": 2.9014, + "step": 23555 + }, + { + "epoch": 1.096701352515306, + "grad_norm": 0.343487581120075, + "learning_rate": 8.001028781840954e-05, + "loss": 2.872, + "step": 23556 + }, + { + "epoch": 1.096747910701399, + "grad_norm": 0.3063016292698869, + "learning_rate": 8.000812120324684e-05, + "loss": 2.7645, + "step": 23557 + }, + { + "epoch": 1.0967944688874922, + "grad_norm": 0.34545387862734245, + "learning_rate": 8.000595450001325e-05, + "loss": 2.8422, + "step": 23558 + }, + { + "epoch": 1.096841027073585, + "grad_norm": 0.33055598759636534, + "learning_rate": 8.000378770871515e-05, + "loss": 2.8676, + "step": 23559 + }, + { + "epoch": 1.0968875852596782, + "grad_norm": 0.3521145401713108, + "learning_rate": 8.000162082935889e-05, + "loss": 2.7956, + "step": 23560 + }, + { + "epoch": 1.0969341434457713, + "grad_norm": 0.3083900463134873, + "learning_rate": 7.999945386195082e-05, + "loss": 2.7974, + "step": 23561 + }, + { + "epoch": 1.0969807016318645, + "grad_norm": 0.37804811305940633, + "learning_rate": 7.999728680649732e-05, + "loss": 2.8306, + "step": 23562 + }, + { + "epoch": 1.0970272598179576, + "grad_norm": 0.3177138979368095, + "learning_rate": 7.999511966300472e-05, + "loss": 2.8437, + "step": 23563 + }, + { + "epoch": 1.0970738180040505, + "grad_norm": 0.3314295200455369, + "learning_rate": 7.999295243147942e-05, + "loss": 2.7534, + "step": 23564 + }, + { + "epoch": 1.0971203761901436, + "grad_norm": 0.3096654003162406, + "learning_rate": 7.999078511192775e-05, + "loss": 2.8176, + "step": 23565 + }, + { + "epoch": 1.0971669343762367, + "grad_norm": 0.31625094230816836, + "learning_rate": 7.998861770435608e-05, + "loss": 2.8661, + "step": 23566 + }, + { + "epoch": 1.0972134925623298, + "grad_norm": 0.3350724409304405, + "learning_rate": 7.998645020877077e-05, + "loss": 2.794, + "step": 23567 + }, + { + "epoch": 1.097260050748423, + "grad_norm": 0.3142055870518805, + "learning_rate": 7.998428262517818e-05, + "loss": 2.696, + "step": 23568 + }, + { + "epoch": 1.0973066089345158, + "grad_norm": 0.30878431953271346, + "learning_rate": 
7.998211495358469e-05, + "loss": 2.7919, + "step": 23569 + }, + { + "epoch": 1.097353167120609, + "grad_norm": 0.3239306610185577, + "learning_rate": 7.997994719399665e-05, + "loss": 2.7729, + "step": 23570 + }, + { + "epoch": 1.097399725306702, + "grad_norm": 0.3160750550382847, + "learning_rate": 7.997777934642039e-05, + "loss": 2.8646, + "step": 23571 + }, + { + "epoch": 1.0974462834927952, + "grad_norm": 0.30471600302119234, + "learning_rate": 7.997561141086231e-05, + "loss": 2.8941, + "step": 23572 + }, + { + "epoch": 1.0974928416788883, + "grad_norm": 0.3090645625262796, + "learning_rate": 7.997344338732877e-05, + "loss": 2.8931, + "step": 23573 + }, + { + "epoch": 1.0975393998649812, + "grad_norm": 0.3117616059276839, + "learning_rate": 7.997127527582612e-05, + "loss": 2.7934, + "step": 23574 + }, + { + "epoch": 1.0975859580510743, + "grad_norm": 0.3096956223912706, + "learning_rate": 7.996910707636074e-05, + "loss": 2.76, + "step": 23575 + }, + { + "epoch": 1.0976325162371674, + "grad_norm": 0.3025721634692464, + "learning_rate": 7.996693878893897e-05, + "loss": 2.8288, + "step": 23576 + }, + { + "epoch": 1.0976790744232605, + "grad_norm": 0.3227673022430819, + "learning_rate": 7.99647704135672e-05, + "loss": 2.8869, + "step": 23577 + }, + { + "epoch": 1.0977256326093536, + "grad_norm": 0.30521466985435564, + "learning_rate": 7.996260195025177e-05, + "loss": 2.8397, + "step": 23578 + }, + { + "epoch": 1.0977721907954465, + "grad_norm": 0.32325991491792233, + "learning_rate": 7.996043339899906e-05, + "loss": 2.8557, + "step": 23579 + }, + { + "epoch": 1.0978187489815396, + "grad_norm": 0.29412469698894206, + "learning_rate": 7.995826475981543e-05, + "loss": 2.8411, + "step": 23580 + }, + { + "epoch": 1.0978653071676328, + "grad_norm": 0.32798798613420277, + "learning_rate": 7.995609603270725e-05, + "loss": 2.8108, + "step": 23581 + }, + { + "epoch": 1.0979118653537259, + "grad_norm": 0.32038589526513767, + "learning_rate": 7.995392721768088e-05, + "loss": 2.8013, + "step": 23582 + }, + { + "epoch": 1.097958423539819, + "grad_norm": 0.3113185188604377, + "learning_rate": 7.995175831474267e-05, + "loss": 2.7415, + "step": 23583 + }, + { + "epoch": 1.0980049817259119, + "grad_norm": 0.33155452092011994, + "learning_rate": 7.994958932389901e-05, + "loss": 2.7726, + "step": 23584 + }, + { + "epoch": 1.098051539912005, + "grad_norm": 0.2974456002555476, + "learning_rate": 7.994742024515625e-05, + "loss": 3.0162, + "step": 23585 + }, + { + "epoch": 1.098098098098098, + "grad_norm": 0.36657980265696716, + "learning_rate": 7.994525107852076e-05, + "loss": 2.7383, + "step": 23586 + }, + { + "epoch": 1.0981446562841912, + "grad_norm": 0.32431647530239566, + "learning_rate": 7.994308182399893e-05, + "loss": 2.8491, + "step": 23587 + }, + { + "epoch": 1.0981912144702843, + "grad_norm": 0.3315563837567974, + "learning_rate": 7.994091248159709e-05, + "loss": 2.8148, + "step": 23588 + }, + { + "epoch": 1.0982377726563772, + "grad_norm": 0.3468840882580838, + "learning_rate": 7.993874305132161e-05, + "loss": 2.8313, + "step": 23589 + }, + { + "epoch": 1.0982843308424703, + "grad_norm": 0.3323428400191824, + "learning_rate": 7.993657353317888e-05, + "loss": 2.83, + "step": 23590 + }, + { + "epoch": 1.0983308890285635, + "grad_norm": 0.34131910444886565, + "learning_rate": 7.993440392717525e-05, + "loss": 2.7942, + "step": 23591 + }, + { + "epoch": 1.0983774472146566, + "grad_norm": 0.34321214887631546, + "learning_rate": 7.99322342333171e-05, + "loss": 2.8692, + "step": 23592 + }, + { + "epoch": 
1.0984240054007497, + "grad_norm": 0.3004179162197071, + "learning_rate": 7.99300644516108e-05, + "loss": 2.7492, + "step": 23593 + }, + { + "epoch": 1.0984705635868426, + "grad_norm": 0.33198085011176226, + "learning_rate": 7.992789458206269e-05, + "loss": 2.8494, + "step": 23594 + }, + { + "epoch": 1.0985171217729357, + "grad_norm": 0.33976878672797645, + "learning_rate": 7.992572462467917e-05, + "loss": 2.7825, + "step": 23595 + }, + { + "epoch": 1.0985636799590288, + "grad_norm": 0.31943507298165813, + "learning_rate": 7.992355457946659e-05, + "loss": 2.8878, + "step": 23596 + }, + { + "epoch": 1.098610238145122, + "grad_norm": 0.3219446512806555, + "learning_rate": 7.992138444643134e-05, + "loss": 2.7972, + "step": 23597 + }, + { + "epoch": 1.0986567963312148, + "grad_norm": 0.33557986251950106, + "learning_rate": 7.991921422557976e-05, + "loss": 2.8074, + "step": 23598 + }, + { + "epoch": 1.098703354517308, + "grad_norm": 0.3301740625037715, + "learning_rate": 7.991704391691824e-05, + "loss": 2.7154, + "step": 23599 + }, + { + "epoch": 1.098749912703401, + "grad_norm": 0.3182743192532753, + "learning_rate": 7.991487352045312e-05, + "loss": 2.8234, + "step": 23600 + }, + { + "epoch": 1.0987964708894942, + "grad_norm": 0.3235585310191933, + "learning_rate": 7.991270303619083e-05, + "loss": 2.8528, + "step": 23601 + }, + { + "epoch": 1.0988430290755873, + "grad_norm": 0.33064762075770965, + "learning_rate": 7.99105324641377e-05, + "loss": 2.8767, + "step": 23602 + }, + { + "epoch": 1.0988895872616804, + "grad_norm": 0.3342212711528679, + "learning_rate": 7.990836180430008e-05, + "loss": 2.837, + "step": 23603 + }, + { + "epoch": 1.0989361454477733, + "grad_norm": 0.33174808047131515, + "learning_rate": 7.990619105668437e-05, + "loss": 2.7592, + "step": 23604 + }, + { + "epoch": 1.0989827036338664, + "grad_norm": 0.36109913622774337, + "learning_rate": 7.990402022129694e-05, + "loss": 2.9391, + "step": 23605 + }, + { + "epoch": 1.0990292618199595, + "grad_norm": 0.3417953784012496, + "learning_rate": 7.990184929814415e-05, + "loss": 2.7804, + "step": 23606 + }, + { + "epoch": 1.0990758200060526, + "grad_norm": 0.3751882183697252, + "learning_rate": 7.98996782872324e-05, + "loss": 2.85, + "step": 23607 + }, + { + "epoch": 1.0991223781921455, + "grad_norm": 0.35636981666506384, + "learning_rate": 7.989750718856801e-05, + "loss": 2.7452, + "step": 23608 + }, + { + "epoch": 1.0991689363782386, + "grad_norm": 0.36853889091862824, + "learning_rate": 7.989533600215739e-05, + "loss": 2.8139, + "step": 23609 + }, + { + "epoch": 1.0992154945643318, + "grad_norm": 0.401524488390913, + "learning_rate": 7.989316472800693e-05, + "loss": 2.8801, + "step": 23610 + }, + { + "epoch": 1.0992620527504249, + "grad_norm": 0.37774450227887085, + "learning_rate": 7.989099336612296e-05, + "loss": 2.8332, + "step": 23611 + }, + { + "epoch": 1.099308610936518, + "grad_norm": 0.3389155267201799, + "learning_rate": 7.988882191651185e-05, + "loss": 2.8399, + "step": 23612 + }, + { + "epoch": 1.0993551691226109, + "grad_norm": 0.3407800510231637, + "learning_rate": 7.988665037918002e-05, + "loss": 2.8611, + "step": 23613 + }, + { + "epoch": 1.099401727308704, + "grad_norm": 0.3548050779373519, + "learning_rate": 7.988447875413381e-05, + "loss": 2.8932, + "step": 23614 + }, + { + "epoch": 1.0994482854947971, + "grad_norm": 0.35874739407743306, + "learning_rate": 7.98823070413796e-05, + "loss": 2.7737, + "step": 23615 + }, + { + "epoch": 1.0994948436808902, + "grad_norm": 0.30463618627844613, + "learning_rate": 
7.988013524092375e-05, + "loss": 2.8463, + "step": 23616 + }, + { + "epoch": 1.0995414018669833, + "grad_norm": 0.339721064174535, + "learning_rate": 7.987796335277264e-05, + "loss": 2.7797, + "step": 23617 + }, + { + "epoch": 1.0995879600530762, + "grad_norm": 0.3045340824849152, + "learning_rate": 7.987579137693267e-05, + "loss": 2.7445, + "step": 23618 + }, + { + "epoch": 1.0996345182391694, + "grad_norm": 0.3678473491960337, + "learning_rate": 7.987361931341019e-05, + "loss": 2.7968, + "step": 23619 + }, + { + "epoch": 1.0996810764252625, + "grad_norm": 0.3287021805861133, + "learning_rate": 7.987144716221159e-05, + "loss": 2.9014, + "step": 23620 + }, + { + "epoch": 1.0997276346113556, + "grad_norm": 0.3431914713605652, + "learning_rate": 7.986927492334322e-05, + "loss": 2.7386, + "step": 23621 + }, + { + "epoch": 1.0997741927974487, + "grad_norm": 0.34833008549052963, + "learning_rate": 7.986710259681147e-05, + "loss": 2.7798, + "step": 23622 + }, + { + "epoch": 1.0998207509835416, + "grad_norm": 0.29723337634991587, + "learning_rate": 7.986493018262272e-05, + "loss": 2.8261, + "step": 23623 + }, + { + "epoch": 1.0998673091696347, + "grad_norm": 0.3351305840697373, + "learning_rate": 7.986275768078335e-05, + "loss": 2.7469, + "step": 23624 + }, + { + "epoch": 1.0999138673557278, + "grad_norm": 0.3145239935473204, + "learning_rate": 7.986058509129972e-05, + "loss": 2.865, + "step": 23625 + }, + { + "epoch": 1.099960425541821, + "grad_norm": 0.3238216839556257, + "learning_rate": 7.985841241417821e-05, + "loss": 2.7355, + "step": 23626 + }, + { + "epoch": 1.100006983727914, + "grad_norm": 0.3285560811428548, + "learning_rate": 7.985623964942521e-05, + "loss": 2.7146, + "step": 23627 + }, + { + "epoch": 1.100053541914007, + "grad_norm": 0.3338771804897516, + "learning_rate": 7.98540667970471e-05, + "loss": 2.8354, + "step": 23628 + }, + { + "epoch": 1.1001001001001, + "grad_norm": 0.328211461354697, + "learning_rate": 7.985189385705023e-05, + "loss": 2.8036, + "step": 23629 + }, + { + "epoch": 1.1001466582861932, + "grad_norm": 0.3542735027796123, + "learning_rate": 7.9849720829441e-05, + "loss": 2.8289, + "step": 23630 + }, + { + "epoch": 1.1001932164722863, + "grad_norm": 0.35153150006555905, + "learning_rate": 7.984754771422576e-05, + "loss": 2.7758, + "step": 23631 + }, + { + "epoch": 1.1002397746583794, + "grad_norm": 0.29678874410804384, + "learning_rate": 7.984537451141093e-05, + "loss": 2.769, + "step": 23632 + }, + { + "epoch": 1.1002863328444723, + "grad_norm": 0.3777225841950535, + "learning_rate": 7.984320122100286e-05, + "loss": 2.7833, + "step": 23633 + }, + { + "epoch": 1.1003328910305654, + "grad_norm": 0.3212238103190866, + "learning_rate": 7.984102784300793e-05, + "loss": 2.8211, + "step": 23634 + }, + { + "epoch": 1.1003794492166585, + "grad_norm": 0.3501978505479023, + "learning_rate": 7.983885437743252e-05, + "loss": 2.7812, + "step": 23635 + }, + { + "epoch": 1.1004260074027516, + "grad_norm": 0.3452219654352596, + "learning_rate": 7.983668082428302e-05, + "loss": 2.7668, + "step": 23636 + }, + { + "epoch": 1.1004725655888445, + "grad_norm": 0.3433221538000239, + "learning_rate": 7.98345071835658e-05, + "loss": 2.7432, + "step": 23637 + }, + { + "epoch": 1.1005191237749377, + "grad_norm": 0.329311536974898, + "learning_rate": 7.983233345528724e-05, + "loss": 2.8713, + "step": 23638 + }, + { + "epoch": 1.1005656819610308, + "grad_norm": 0.3301377415606832, + "learning_rate": 7.983015963945373e-05, + "loss": 2.8738, + "step": 23639 + }, + { + "epoch": 
1.1006122401471239, + "grad_norm": 0.348731970903604, + "learning_rate": 7.982798573607162e-05, + "loss": 2.8776, + "step": 23640 + }, + { + "epoch": 1.100658798333217, + "grad_norm": 0.3339734777971369, + "learning_rate": 7.982581174514731e-05, + "loss": 2.8026, + "step": 23641 + }, + { + "epoch": 1.1007053565193101, + "grad_norm": 0.3476624443846781, + "learning_rate": 7.98236376666872e-05, + "loss": 2.674, + "step": 23642 + }, + { + "epoch": 1.100751914705403, + "grad_norm": 0.35336142871301796, + "learning_rate": 7.982146350069764e-05, + "loss": 2.751, + "step": 23643 + }, + { + "epoch": 1.1007984728914961, + "grad_norm": 0.32866687488352003, + "learning_rate": 7.981928924718503e-05, + "loss": 2.7677, + "step": 23644 + }, + { + "epoch": 1.1008450310775892, + "grad_norm": 0.3275588559226995, + "learning_rate": 7.981711490615573e-05, + "loss": 2.829, + "step": 23645 + }, + { + "epoch": 1.1008915892636824, + "grad_norm": 0.32944904712944545, + "learning_rate": 7.981494047761615e-05, + "loss": 2.8001, + "step": 23646 + }, + { + "epoch": 1.1009381474497753, + "grad_norm": 0.3251239736077838, + "learning_rate": 7.981276596157265e-05, + "loss": 2.8172, + "step": 23647 + }, + { + "epoch": 1.1009847056358684, + "grad_norm": 0.3413754048755105, + "learning_rate": 7.981059135803163e-05, + "loss": 2.8119, + "step": 23648 + }, + { + "epoch": 1.1010312638219615, + "grad_norm": 0.30536365130062837, + "learning_rate": 7.980841666699944e-05, + "loss": 2.8782, + "step": 23649 + }, + { + "epoch": 1.1010778220080546, + "grad_norm": 0.3284828530803108, + "learning_rate": 7.98062418884825e-05, + "loss": 2.8854, + "step": 23650 + }, + { + "epoch": 1.1011243801941477, + "grad_norm": 0.32665905917496596, + "learning_rate": 7.980406702248717e-05, + "loss": 2.7551, + "step": 23651 + }, + { + "epoch": 1.1011709383802406, + "grad_norm": 0.29919298359822233, + "learning_rate": 7.980189206901984e-05, + "loss": 2.8142, + "step": 23652 + }, + { + "epoch": 1.1012174965663337, + "grad_norm": 0.33980231244701, + "learning_rate": 7.97997170280869e-05, + "loss": 2.8583, + "step": 23653 + }, + { + "epoch": 1.1012640547524268, + "grad_norm": 0.3101553542033539, + "learning_rate": 7.979754189969472e-05, + "loss": 2.9078, + "step": 23654 + }, + { + "epoch": 1.10131061293852, + "grad_norm": 0.3494699989149216, + "learning_rate": 7.97953666838497e-05, + "loss": 2.8816, + "step": 23655 + }, + { + "epoch": 1.101357171124613, + "grad_norm": 0.29362194008880693, + "learning_rate": 7.979319138055818e-05, + "loss": 2.8599, + "step": 23656 + }, + { + "epoch": 1.101403729310706, + "grad_norm": 0.35355010166368084, + "learning_rate": 7.97910159898266e-05, + "loss": 2.8535, + "step": 23657 + }, + { + "epoch": 1.101450287496799, + "grad_norm": 0.35130299278286375, + "learning_rate": 7.978884051166133e-05, + "loss": 2.8023, + "step": 23658 + }, + { + "epoch": 1.1014968456828922, + "grad_norm": 0.3222991569506884, + "learning_rate": 7.978666494606875e-05, + "loss": 2.8269, + "step": 23659 + }, + { + "epoch": 1.1015434038689853, + "grad_norm": 0.33728465013124176, + "learning_rate": 7.978448929305524e-05, + "loss": 2.7065, + "step": 23660 + }, + { + "epoch": 1.1015899620550784, + "grad_norm": 0.33823076524646195, + "learning_rate": 7.978231355262718e-05, + "loss": 2.7061, + "step": 23661 + }, + { + "epoch": 1.1016365202411713, + "grad_norm": 0.3339455047164025, + "learning_rate": 7.978013772479098e-05, + "loss": 2.7758, + "step": 23662 + }, + { + "epoch": 1.1016830784272644, + "grad_norm": 0.318718989274465, + "learning_rate": 
7.977796180955299e-05, + "loss": 2.783, + "step": 23663 + }, + { + "epoch": 1.1017296366133575, + "grad_norm": 0.32754169644159054, + "learning_rate": 7.977578580691962e-05, + "loss": 2.7112, + "step": 23664 + }, + { + "epoch": 1.1017761947994507, + "grad_norm": 0.30751432981577176, + "learning_rate": 7.977360971689727e-05, + "loss": 2.8881, + "step": 23665 + }, + { + "epoch": 1.1018227529855438, + "grad_norm": 0.3218245049795089, + "learning_rate": 7.977143353949228e-05, + "loss": 2.8569, + "step": 23666 + }, + { + "epoch": 1.1018693111716367, + "grad_norm": 0.31272039718378203, + "learning_rate": 7.976925727471109e-05, + "loss": 2.837, + "step": 23667 + }, + { + "epoch": 1.1019158693577298, + "grad_norm": 0.3056384916012816, + "learning_rate": 7.976708092256005e-05, + "loss": 2.9088, + "step": 23668 + }, + { + "epoch": 1.101962427543823, + "grad_norm": 0.3279584649586431, + "learning_rate": 7.976490448304557e-05, + "loss": 2.8947, + "step": 23669 + }, + { + "epoch": 1.102008985729916, + "grad_norm": 0.3192916653524862, + "learning_rate": 7.9762727956174e-05, + "loss": 2.8241, + "step": 23670 + }, + { + "epoch": 1.1020555439160091, + "grad_norm": 0.32900680654038983, + "learning_rate": 7.976055134195178e-05, + "loss": 2.8738, + "step": 23671 + }, + { + "epoch": 1.102102102102102, + "grad_norm": 0.3191664231626438, + "learning_rate": 7.975837464038527e-05, + "loss": 2.7327, + "step": 23672 + }, + { + "epoch": 1.1021486602881951, + "grad_norm": 0.31580622984798923, + "learning_rate": 7.975619785148087e-05, + "loss": 2.7443, + "step": 23673 + }, + { + "epoch": 1.1021952184742883, + "grad_norm": 0.3162483731430414, + "learning_rate": 7.975402097524495e-05, + "loss": 2.8194, + "step": 23674 + }, + { + "epoch": 1.1022417766603814, + "grad_norm": 0.3176909257511464, + "learning_rate": 7.975184401168391e-05, + "loss": 2.7366, + "step": 23675 + }, + { + "epoch": 1.1022883348464745, + "grad_norm": 0.3376063225852818, + "learning_rate": 7.974966696080414e-05, + "loss": 2.9347, + "step": 23676 + }, + { + "epoch": 1.1023348930325674, + "grad_norm": 0.3388373540031863, + "learning_rate": 7.974748982261202e-05, + "loss": 2.8277, + "step": 23677 + }, + { + "epoch": 1.1023814512186605, + "grad_norm": 0.3090106116482054, + "learning_rate": 7.974531259711397e-05, + "loss": 2.7581, + "step": 23678 + }, + { + "epoch": 1.1024280094047536, + "grad_norm": 0.35408570809272827, + "learning_rate": 7.974313528431634e-05, + "loss": 2.8101, + "step": 23679 + }, + { + "epoch": 1.1024745675908467, + "grad_norm": 0.3028082714591488, + "learning_rate": 7.974095788422554e-05, + "loss": 2.8596, + "step": 23680 + }, + { + "epoch": 1.1025211257769398, + "grad_norm": 0.35428280946685775, + "learning_rate": 7.973878039684796e-05, + "loss": 2.778, + "step": 23681 + }, + { + "epoch": 1.1025676839630327, + "grad_norm": 0.3141437898084279, + "learning_rate": 7.973660282219e-05, + "loss": 2.819, + "step": 23682 + }, + { + "epoch": 1.1026142421491258, + "grad_norm": 0.32186318576900985, + "learning_rate": 7.973442516025803e-05, + "loss": 2.7324, + "step": 23683 + }, + { + "epoch": 1.102660800335219, + "grad_norm": 0.337936226464623, + "learning_rate": 7.973224741105845e-05, + "loss": 2.8047, + "step": 23684 + }, + { + "epoch": 1.102707358521312, + "grad_norm": 0.38817915130344977, + "learning_rate": 7.973006957459765e-05, + "loss": 2.8671, + "step": 23685 + }, + { + "epoch": 1.102753916707405, + "grad_norm": 0.3219681758510719, + "learning_rate": 7.972789165088204e-05, + "loss": 2.7593, + "step": 23686 + }, + { + "epoch": 
1.102800474893498, + "grad_norm": 0.37294869069370673, + "learning_rate": 7.972571363991799e-05, + "loss": 2.743, + "step": 23687 + }, + { + "epoch": 1.1028470330795912, + "grad_norm": 0.34190189258435766, + "learning_rate": 7.97235355417119e-05, + "loss": 2.8479, + "step": 23688 + }, + { + "epoch": 1.1028935912656843, + "grad_norm": 0.3306837558600708, + "learning_rate": 7.972135735627015e-05, + "loss": 2.8292, + "step": 23689 + }, + { + "epoch": 1.1029401494517774, + "grad_norm": 0.3524500087130859, + "learning_rate": 7.971917908359915e-05, + "loss": 2.864, + "step": 23690 + }, + { + "epoch": 1.1029867076378705, + "grad_norm": 0.3494906138336641, + "learning_rate": 7.97170007237053e-05, + "loss": 2.7788, + "step": 23691 + }, + { + "epoch": 1.1030332658239634, + "grad_norm": 0.3402027756598089, + "learning_rate": 7.971482227659497e-05, + "loss": 2.7757, + "step": 23692 + }, + { + "epoch": 1.1030798240100566, + "grad_norm": 0.3470491289344412, + "learning_rate": 7.971264374227455e-05, + "loss": 2.7135, + "step": 23693 + }, + { + "epoch": 1.1031263821961497, + "grad_norm": 0.32971646854830317, + "learning_rate": 7.971046512075046e-05, + "loss": 2.8114, + "step": 23694 + }, + { + "epoch": 1.1031729403822428, + "grad_norm": 0.33456219411485333, + "learning_rate": 7.970828641202909e-05, + "loss": 2.8943, + "step": 23695 + }, + { + "epoch": 1.1032194985683357, + "grad_norm": 0.3347960842544264, + "learning_rate": 7.970610761611681e-05, + "loss": 2.8259, + "step": 23696 + }, + { + "epoch": 1.1032660567544288, + "grad_norm": 0.35134647687876613, + "learning_rate": 7.970392873302005e-05, + "loss": 2.809, + "step": 23697 + }, + { + "epoch": 1.103312614940522, + "grad_norm": 0.33898159542282147, + "learning_rate": 7.970174976274518e-05, + "loss": 2.908, + "step": 23698 + }, + { + "epoch": 1.103359173126615, + "grad_norm": 0.3349034299561177, + "learning_rate": 7.969957070529858e-05, + "loss": 2.7675, + "step": 23699 + }, + { + "epoch": 1.1034057313127081, + "grad_norm": 0.299089255307725, + "learning_rate": 7.96973915606867e-05, + "loss": 2.7746, + "step": 23700 + }, + { + "epoch": 1.103452289498801, + "grad_norm": 0.34032364898594736, + "learning_rate": 7.969521232891589e-05, + "loss": 2.8686, + "step": 23701 + }, + { + "epoch": 1.1034988476848941, + "grad_norm": 0.3223384022533916, + "learning_rate": 7.969303300999254e-05, + "loss": 2.8103, + "step": 23702 + }, + { + "epoch": 1.1035454058709873, + "grad_norm": 0.3127821059537461, + "learning_rate": 7.969085360392306e-05, + "loss": 2.7511, + "step": 23703 + }, + { + "epoch": 1.1035919640570804, + "grad_norm": 0.350341948930927, + "learning_rate": 7.968867411071386e-05, + "loss": 2.8577, + "step": 23704 + }, + { + "epoch": 1.1036385222431735, + "grad_norm": 0.34184527270528453, + "learning_rate": 7.968649453037133e-05, + "loss": 2.8261, + "step": 23705 + }, + { + "epoch": 1.1036850804292664, + "grad_norm": 0.34234832442471325, + "learning_rate": 7.968431486290188e-05, + "loss": 2.7539, + "step": 23706 + }, + { + "epoch": 1.1037316386153595, + "grad_norm": 0.33338525325412516, + "learning_rate": 7.968213510831187e-05, + "loss": 2.7802, + "step": 23707 + }, + { + "epoch": 1.1037781968014526, + "grad_norm": 0.3038200429719403, + "learning_rate": 7.967995526660773e-05, + "loss": 2.794, + "step": 23708 + }, + { + "epoch": 1.1038247549875457, + "grad_norm": 0.355097543707926, + "learning_rate": 7.967777533779583e-05, + "loss": 2.9071, + "step": 23709 + }, + { + "epoch": 1.1038713131736388, + "grad_norm": 0.3038011922279071, + "learning_rate": 
7.967559532188258e-05, + "loss": 2.7657, + "step": 23710 + }, + { + "epoch": 1.1039178713597317, + "grad_norm": 0.3223903099060447, + "learning_rate": 7.967341521887439e-05, + "loss": 2.8568, + "step": 23711 + }, + { + "epoch": 1.1039644295458249, + "grad_norm": 0.3375138983492348, + "learning_rate": 7.967123502877765e-05, + "loss": 2.7995, + "step": 23712 + }, + { + "epoch": 1.104010987731918, + "grad_norm": 0.327258466648093, + "learning_rate": 7.966905475159876e-05, + "loss": 2.8611, + "step": 23713 + }, + { + "epoch": 1.104057545918011, + "grad_norm": 0.3308717332001255, + "learning_rate": 7.966687438734412e-05, + "loss": 2.8547, + "step": 23714 + }, + { + "epoch": 1.1041041041041042, + "grad_norm": 0.3419737791436773, + "learning_rate": 7.966469393602011e-05, + "loss": 2.8131, + "step": 23715 + }, + { + "epoch": 1.104150662290197, + "grad_norm": 0.34708454233103947, + "learning_rate": 7.966251339763317e-05, + "loss": 2.7912, + "step": 23716 + }, + { + "epoch": 1.1041972204762902, + "grad_norm": 0.30234034268668963, + "learning_rate": 7.966033277218967e-05, + "loss": 2.7186, + "step": 23717 + }, + { + "epoch": 1.1042437786623833, + "grad_norm": 0.3521569468387941, + "learning_rate": 7.9658152059696e-05, + "loss": 2.8851, + "step": 23718 + }, + { + "epoch": 1.1042903368484764, + "grad_norm": 0.3213561322885044, + "learning_rate": 7.965597126015859e-05, + "loss": 2.784, + "step": 23719 + }, + { + "epoch": 1.1043368950345696, + "grad_norm": 0.3719891308857902, + "learning_rate": 7.965379037358382e-05, + "loss": 2.795, + "step": 23720 + }, + { + "epoch": 1.1043834532206624, + "grad_norm": 0.3281084224492263, + "learning_rate": 7.965160939997808e-05, + "loss": 2.7346, + "step": 23721 + }, + { + "epoch": 1.1044300114067556, + "grad_norm": 0.3554635890245189, + "learning_rate": 7.96494283393478e-05, + "loss": 2.7568, + "step": 23722 + }, + { + "epoch": 1.1044765695928487, + "grad_norm": 0.33304868374003643, + "learning_rate": 7.964724719169939e-05, + "loss": 2.7859, + "step": 23723 + }, + { + "epoch": 1.1045231277789418, + "grad_norm": 0.338132023980452, + "learning_rate": 7.964506595703923e-05, + "loss": 2.8624, + "step": 23724 + }, + { + "epoch": 1.1045696859650347, + "grad_norm": 0.3376240214238579, + "learning_rate": 7.964288463537368e-05, + "loss": 2.6994, + "step": 23725 + }, + { + "epoch": 1.1046162441511278, + "grad_norm": 0.34421135527773145, + "learning_rate": 7.964070322670921e-05, + "loss": 2.9059, + "step": 23726 + }, + { + "epoch": 1.104662802337221, + "grad_norm": 0.3221961096227279, + "learning_rate": 7.963852173105221e-05, + "loss": 2.7923, + "step": 23727 + }, + { + "epoch": 1.104709360523314, + "grad_norm": 0.3279772354038178, + "learning_rate": 7.963634014840905e-05, + "loss": 2.6939, + "step": 23728 + }, + { + "epoch": 1.1047559187094071, + "grad_norm": 0.3246291674184692, + "learning_rate": 7.963415847878614e-05, + "loss": 2.9417, + "step": 23729 + }, + { + "epoch": 1.1048024768955003, + "grad_norm": 0.3440942841185207, + "learning_rate": 7.963197672218991e-05, + "loss": 2.7565, + "step": 23730 + }, + { + "epoch": 1.1048490350815932, + "grad_norm": 0.3448307034896748, + "learning_rate": 7.962979487862677e-05, + "loss": 2.7832, + "step": 23731 + }, + { + "epoch": 1.1048955932676863, + "grad_norm": 0.3014932517795913, + "learning_rate": 7.962761294810307e-05, + "loss": 2.7741, + "step": 23732 + }, + { + "epoch": 1.1049421514537794, + "grad_norm": 0.3385203826674822, + "learning_rate": 7.962543093062525e-05, + "loss": 2.7893, + "step": 23733 + }, + { + "epoch": 
1.1049887096398725, + "grad_norm": 0.3405815505401848, + "learning_rate": 7.962324882619971e-05, + "loss": 2.8378, + "step": 23734 + }, + { + "epoch": 1.1050352678259654, + "grad_norm": 0.3328827791312728, + "learning_rate": 7.962106663483285e-05, + "loss": 2.8651, + "step": 23735 + }, + { + "epoch": 1.1050818260120585, + "grad_norm": 0.3373308084501944, + "learning_rate": 7.961888435653109e-05, + "loss": 2.873, + "step": 23736 + }, + { + "epoch": 1.1051283841981516, + "grad_norm": 0.3288782289246601, + "learning_rate": 7.961670199130083e-05, + "loss": 2.8114, + "step": 23737 + }, + { + "epoch": 1.1051749423842447, + "grad_norm": 0.3145419879028686, + "learning_rate": 7.961451953914844e-05, + "loss": 2.7762, + "step": 23738 + }, + { + "epoch": 1.1052215005703379, + "grad_norm": 0.3405103882322316, + "learning_rate": 7.961233700008034e-05, + "loss": 2.9008, + "step": 23739 + }, + { + "epoch": 1.1052680587564307, + "grad_norm": 0.3258759084554485, + "learning_rate": 7.9610154374103e-05, + "loss": 2.7788, + "step": 23740 + }, + { + "epoch": 1.1053146169425239, + "grad_norm": 0.30399211787766134, + "learning_rate": 7.960797166122275e-05, + "loss": 2.8343, + "step": 23741 + }, + { + "epoch": 1.105361175128617, + "grad_norm": 0.3191688110417979, + "learning_rate": 7.960578886144601e-05, + "loss": 2.7894, + "step": 23742 + }, + { + "epoch": 1.10540773331471, + "grad_norm": 0.31955240237854565, + "learning_rate": 7.96036059747792e-05, + "loss": 2.7315, + "step": 23743 + }, + { + "epoch": 1.1054542915008032, + "grad_norm": 0.3564597185841955, + "learning_rate": 7.960142300122873e-05, + "loss": 2.9162, + "step": 23744 + }, + { + "epoch": 1.105500849686896, + "grad_norm": 0.312686279731029, + "learning_rate": 7.9599239940801e-05, + "loss": 2.8207, + "step": 23745 + }, + { + "epoch": 1.1055474078729892, + "grad_norm": 0.356487332475729, + "learning_rate": 7.95970567935024e-05, + "loss": 2.802, + "step": 23746 + }, + { + "epoch": 1.1055939660590823, + "grad_norm": 0.2983520523091964, + "learning_rate": 7.959487355933938e-05, + "loss": 2.8055, + "step": 23747 + }, + { + "epoch": 1.1056405242451754, + "grad_norm": 0.32959790523410276, + "learning_rate": 7.95926902383183e-05, + "loss": 2.839, + "step": 23748 + }, + { + "epoch": 1.1056870824312686, + "grad_norm": 0.3469984598565512, + "learning_rate": 7.95905068304456e-05, + "loss": 2.8906, + "step": 23749 + }, + { + "epoch": 1.1057336406173615, + "grad_norm": 0.3322531735183711, + "learning_rate": 7.958832333572768e-05, + "loss": 2.8515, + "step": 23750 + }, + { + "epoch": 1.1057801988034546, + "grad_norm": 0.33317831718450164, + "learning_rate": 7.958613975417094e-05, + "loss": 2.878, + "step": 23751 + }, + { + "epoch": 1.1058267569895477, + "grad_norm": 0.31994003262128823, + "learning_rate": 7.95839560857818e-05, + "loss": 2.8343, + "step": 23752 + }, + { + "epoch": 1.1058733151756408, + "grad_norm": 0.3394835398764299, + "learning_rate": 7.958177233056666e-05, + "loss": 2.8134, + "step": 23753 + }, + { + "epoch": 1.105919873361734, + "grad_norm": 0.3398097147014165, + "learning_rate": 7.957958848853193e-05, + "loss": 2.7133, + "step": 23754 + }, + { + "epoch": 1.1059664315478268, + "grad_norm": 0.3442808185733227, + "learning_rate": 7.957740455968403e-05, + "loss": 2.8856, + "step": 23755 + }, + { + "epoch": 1.10601298973392, + "grad_norm": 0.35426475054864515, + "learning_rate": 7.957522054402935e-05, + "loss": 2.8587, + "step": 23756 + }, + { + "epoch": 1.106059547920013, + "grad_norm": 0.36020474952716225, + "learning_rate": 7.95730364415743e-05, + 
"loss": 2.8644, + "step": 23757 + }, + { + "epoch": 1.1061061061061062, + "grad_norm": 0.31533511060660985, + "learning_rate": 7.957085225232532e-05, + "loss": 2.8208, + "step": 23758 + }, + { + "epoch": 1.1061526642921993, + "grad_norm": 0.3381948970701355, + "learning_rate": 7.956866797628882e-05, + "loss": 2.8735, + "step": 23759 + }, + { + "epoch": 1.1061992224782922, + "grad_norm": 0.3180188860042158, + "learning_rate": 7.956648361347116e-05, + "loss": 2.8132, + "step": 23760 + }, + { + "epoch": 1.1062457806643853, + "grad_norm": 0.32121119450253055, + "learning_rate": 7.95642991638788e-05, + "loss": 2.7387, + "step": 23761 + }, + { + "epoch": 1.1062923388504784, + "grad_norm": 0.33817842019258476, + "learning_rate": 7.956211462751812e-05, + "loss": 2.8472, + "step": 23762 + }, + { + "epoch": 1.1063388970365715, + "grad_norm": 0.324183563652482, + "learning_rate": 7.955993000439557e-05, + "loss": 2.7828, + "step": 23763 + }, + { + "epoch": 1.1063854552226646, + "grad_norm": 0.3131628970228193, + "learning_rate": 7.955774529451752e-05, + "loss": 2.9014, + "step": 23764 + }, + { + "epoch": 1.1064320134087575, + "grad_norm": 0.3373728822179667, + "learning_rate": 7.95555604978904e-05, + "loss": 2.8311, + "step": 23765 + }, + { + "epoch": 1.1064785715948506, + "grad_norm": 0.33094958485757625, + "learning_rate": 7.955337561452064e-05, + "loss": 2.7877, + "step": 23766 + }, + { + "epoch": 1.1065251297809438, + "grad_norm": 0.3263886382572239, + "learning_rate": 7.955119064441461e-05, + "loss": 2.8917, + "step": 23767 + }, + { + "epoch": 1.1065716879670369, + "grad_norm": 0.3015131799057097, + "learning_rate": 7.954900558757876e-05, + "loss": 2.8854, + "step": 23768 + }, + { + "epoch": 1.10661824615313, + "grad_norm": 0.3524343658645357, + "learning_rate": 7.954682044401949e-05, + "loss": 2.7682, + "step": 23769 + }, + { + "epoch": 1.1066648043392229, + "grad_norm": 0.3311447755535528, + "learning_rate": 7.95446352137432e-05, + "loss": 2.7432, + "step": 23770 + }, + { + "epoch": 1.106711362525316, + "grad_norm": 0.33119415254843493, + "learning_rate": 7.954244989675632e-05, + "loss": 2.7079, + "step": 23771 + }, + { + "epoch": 1.106757920711409, + "grad_norm": 0.3232005745405964, + "learning_rate": 7.954026449306527e-05, + "loss": 2.7649, + "step": 23772 + }, + { + "epoch": 1.1068044788975022, + "grad_norm": 0.35710781429850347, + "learning_rate": 7.953807900267644e-05, + "loss": 2.7217, + "step": 23773 + }, + { + "epoch": 1.1068510370835951, + "grad_norm": 0.32442741630107064, + "learning_rate": 7.953589342559627e-05, + "loss": 2.8888, + "step": 23774 + }, + { + "epoch": 1.1068975952696882, + "grad_norm": 0.3559929357886097, + "learning_rate": 7.953370776183116e-05, + "loss": 2.8929, + "step": 23775 + }, + { + "epoch": 1.1069441534557813, + "grad_norm": 0.3747960102780727, + "learning_rate": 7.953152201138752e-05, + "loss": 2.796, + "step": 23776 + }, + { + "epoch": 1.1069907116418745, + "grad_norm": 0.31366838885521514, + "learning_rate": 7.952933617427179e-05, + "loss": 2.8795, + "step": 23777 + }, + { + "epoch": 1.1070372698279676, + "grad_norm": 0.33684408579436437, + "learning_rate": 7.952715025049035e-05, + "loss": 2.7526, + "step": 23778 + }, + { + "epoch": 1.1070838280140607, + "grad_norm": 0.3435957308482857, + "learning_rate": 7.952496424004964e-05, + "loss": 2.784, + "step": 23779 + }, + { + "epoch": 1.1071303862001536, + "grad_norm": 0.3354501739774894, + "learning_rate": 7.952277814295606e-05, + "loss": 2.9115, + "step": 23780 + }, + { + "epoch": 1.1071769443862467, + 
"grad_norm": 0.36697479217188866, + "learning_rate": 7.952059195921604e-05, + "loss": 2.8223, + "step": 23781 + }, + { + "epoch": 1.1072235025723398, + "grad_norm": 0.3645342550027781, + "learning_rate": 7.9518405688836e-05, + "loss": 2.8277, + "step": 23782 + }, + { + "epoch": 1.107270060758433, + "grad_norm": 0.35213468127418834, + "learning_rate": 7.951621933182233e-05, + "loss": 2.8796, + "step": 23783 + }, + { + "epoch": 1.1073166189445258, + "grad_norm": 0.3383695540037432, + "learning_rate": 7.951403288818149e-05, + "loss": 2.8666, + "step": 23784 + }, + { + "epoch": 1.107363177130619, + "grad_norm": 0.3786693758799477, + "learning_rate": 7.951184635791986e-05, + "loss": 2.7083, + "step": 23785 + }, + { + "epoch": 1.107409735316712, + "grad_norm": 0.35735615728667264, + "learning_rate": 7.950965974104387e-05, + "loss": 2.833, + "step": 23786 + }, + { + "epoch": 1.1074562935028052, + "grad_norm": 0.39652522210549696, + "learning_rate": 7.950747303755992e-05, + "loss": 2.7694, + "step": 23787 + }, + { + "epoch": 1.1075028516888983, + "grad_norm": 0.3590101815637306, + "learning_rate": 7.950528624747446e-05, + "loss": 2.7283, + "step": 23788 + }, + { + "epoch": 1.1075494098749912, + "grad_norm": 0.3554435601111324, + "learning_rate": 7.950309937079388e-05, + "loss": 2.8333, + "step": 23789 + }, + { + "epoch": 1.1075959680610843, + "grad_norm": 0.3874940750490949, + "learning_rate": 7.950091240752462e-05, + "loss": 2.8689, + "step": 23790 + }, + { + "epoch": 1.1076425262471774, + "grad_norm": 0.35098152942007427, + "learning_rate": 7.949872535767309e-05, + "loss": 2.8667, + "step": 23791 + }, + { + "epoch": 1.1076890844332705, + "grad_norm": 0.3328211327858321, + "learning_rate": 7.94965382212457e-05, + "loss": 2.8056, + "step": 23792 + }, + { + "epoch": 1.1077356426193636, + "grad_norm": 0.3428720801695196, + "learning_rate": 7.949435099824887e-05, + "loss": 2.7973, + "step": 23793 + }, + { + "epoch": 1.1077822008054565, + "grad_norm": 0.3454989946129621, + "learning_rate": 7.949216368868903e-05, + "loss": 2.7971, + "step": 23794 + }, + { + "epoch": 1.1078287589915496, + "grad_norm": 0.32739654018859593, + "learning_rate": 7.94899762925726e-05, + "loss": 2.7324, + "step": 23795 + }, + { + "epoch": 1.1078753171776428, + "grad_norm": 0.37938675080251594, + "learning_rate": 7.948778880990599e-05, + "loss": 2.82, + "step": 23796 + }, + { + "epoch": 1.1079218753637359, + "grad_norm": 0.33781470822560344, + "learning_rate": 7.948560124069563e-05, + "loss": 2.8397, + "step": 23797 + }, + { + "epoch": 1.107968433549829, + "grad_norm": 0.3508312344836712, + "learning_rate": 7.948341358494793e-05, + "loss": 2.8887, + "step": 23798 + }, + { + "epoch": 1.1080149917359219, + "grad_norm": 0.3294536746282353, + "learning_rate": 7.948122584266932e-05, + "loss": 2.871, + "step": 23799 + }, + { + "epoch": 1.108061549922015, + "grad_norm": 0.34128992924053464, + "learning_rate": 7.947903801386621e-05, + "loss": 2.882, + "step": 23800 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 0.32931711142044945, + "learning_rate": 7.947685009854505e-05, + "loss": 2.7767, + "step": 23801 + }, + { + "epoch": 1.1081546662942012, + "grad_norm": 0.3604815355415801, + "learning_rate": 7.94746620967122e-05, + "loss": 2.8576, + "step": 23802 + }, + { + "epoch": 1.1082012244802943, + "grad_norm": 0.34038121722018944, + "learning_rate": 7.947247400837414e-05, + "loss": 2.7409, + "step": 23803 + }, + { + "epoch": 1.1082477826663872, + "grad_norm": 0.32938057824344147, + "learning_rate": 7.947028583353728e-05, + "loss": 
2.7936, + "step": 23804 + }, + { + "epoch": 1.1082943408524804, + "grad_norm": 0.33911222504760036, + "learning_rate": 7.946809757220804e-05, + "loss": 2.9443, + "step": 23805 + }, + { + "epoch": 1.1083408990385735, + "grad_norm": 0.3738163878587794, + "learning_rate": 7.946590922439281e-05, + "loss": 2.8979, + "step": 23806 + }, + { + "epoch": 1.1083874572246666, + "grad_norm": 0.32363174877825407, + "learning_rate": 7.946372079009805e-05, + "loss": 2.791, + "step": 23807 + }, + { + "epoch": 1.1084340154107597, + "grad_norm": 0.37911691596856345, + "learning_rate": 7.946153226933018e-05, + "loss": 2.8804, + "step": 23808 + }, + { + "epoch": 1.1084805735968526, + "grad_norm": 0.30827252279592643, + "learning_rate": 7.945934366209562e-05, + "loss": 2.9147, + "step": 23809 + }, + { + "epoch": 1.1085271317829457, + "grad_norm": 0.3875441056263973, + "learning_rate": 7.945715496840078e-05, + "loss": 2.7928, + "step": 23810 + }, + { + "epoch": 1.1085736899690388, + "grad_norm": 0.3242167028384409, + "learning_rate": 7.945496618825209e-05, + "loss": 2.857, + "step": 23811 + }, + { + "epoch": 1.108620248155132, + "grad_norm": 0.3710614019957602, + "learning_rate": 7.945277732165597e-05, + "loss": 2.8659, + "step": 23812 + }, + { + "epoch": 1.1086668063412248, + "grad_norm": 0.30230099755095885, + "learning_rate": 7.945058836861887e-05, + "loss": 2.7508, + "step": 23813 + }, + { + "epoch": 1.108713364527318, + "grad_norm": 0.3439927106773298, + "learning_rate": 7.944839932914718e-05, + "loss": 2.8469, + "step": 23814 + }, + { + "epoch": 1.108759922713411, + "grad_norm": 0.31704543388972667, + "learning_rate": 7.944621020324734e-05, + "loss": 2.873, + "step": 23815 + }, + { + "epoch": 1.1088064808995042, + "grad_norm": 0.37752925786287833, + "learning_rate": 7.944402099092579e-05, + "loss": 2.8028, + "step": 23816 + }, + { + "epoch": 1.1088530390855973, + "grad_norm": 0.3551092330236038, + "learning_rate": 7.94418316921889e-05, + "loss": 2.8092, + "step": 23817 + }, + { + "epoch": 1.1088995972716904, + "grad_norm": 0.3418568458221713, + "learning_rate": 7.943964230704317e-05, + "loss": 2.8137, + "step": 23818 + }, + { + "epoch": 1.1089461554577833, + "grad_norm": 0.37741932157063013, + "learning_rate": 7.943745283549499e-05, + "loss": 2.8025, + "step": 23819 + }, + { + "epoch": 1.1089927136438764, + "grad_norm": 0.3156246878581067, + "learning_rate": 7.943526327755076e-05, + "loss": 2.7065, + "step": 23820 + }, + { + "epoch": 1.1090392718299695, + "grad_norm": 0.38303304823435824, + "learning_rate": 7.943307363321695e-05, + "loss": 2.8702, + "step": 23821 + }, + { + "epoch": 1.1090858300160626, + "grad_norm": 0.33326197589591483, + "learning_rate": 7.943088390249997e-05, + "loss": 2.8294, + "step": 23822 + }, + { + "epoch": 1.1091323882021555, + "grad_norm": 0.39459194292520483, + "learning_rate": 7.942869408540626e-05, + "loss": 2.9014, + "step": 23823 + }, + { + "epoch": 1.1091789463882487, + "grad_norm": 0.359197123513003, + "learning_rate": 7.94265041819422e-05, + "loss": 2.8445, + "step": 23824 + }, + { + "epoch": 1.1092255045743418, + "grad_norm": 0.3643579684609369, + "learning_rate": 7.942431419211425e-05, + "loss": 2.8069, + "step": 23825 + }, + { + "epoch": 1.1092720627604349, + "grad_norm": 0.3582808839717492, + "learning_rate": 7.942212411592886e-05, + "loss": 2.832, + "step": 23826 + }, + { + "epoch": 1.109318620946528, + "grad_norm": 0.3459250556909942, + "learning_rate": 7.941993395339243e-05, + "loss": 2.7854, + "step": 23827 + }, + { + "epoch": 1.109365179132621, + "grad_norm": 
0.36300962923630237, + "learning_rate": 7.941774370451139e-05, + "loss": 2.8305, + "step": 23828 + }, + { + "epoch": 1.109411737318714, + "grad_norm": 0.3441949474058612, + "learning_rate": 7.941555336929216e-05, + "loss": 2.8382, + "step": 23829 + }, + { + "epoch": 1.1094582955048071, + "grad_norm": 0.3858595078882398, + "learning_rate": 7.941336294774118e-05, + "loss": 2.8771, + "step": 23830 + }, + { + "epoch": 1.1095048536909002, + "grad_norm": 0.3476500938437628, + "learning_rate": 7.94111724398649e-05, + "loss": 2.8633, + "step": 23831 + }, + { + "epoch": 1.1095514118769934, + "grad_norm": 0.3783865769372966, + "learning_rate": 7.940898184566969e-05, + "loss": 2.8212, + "step": 23832 + }, + { + "epoch": 1.1095979700630862, + "grad_norm": 0.35671134529900994, + "learning_rate": 7.940679116516203e-05, + "loss": 2.8336, + "step": 23833 + }, + { + "epoch": 1.1096445282491794, + "grad_norm": 0.3344061588071367, + "learning_rate": 7.940460039834833e-05, + "loss": 2.8184, + "step": 23834 + }, + { + "epoch": 1.1096910864352725, + "grad_norm": 0.38041692512469333, + "learning_rate": 7.940240954523506e-05, + "loss": 2.8018, + "step": 23835 + }, + { + "epoch": 1.1097376446213656, + "grad_norm": 0.35985019041176664, + "learning_rate": 7.940021860582858e-05, + "loss": 2.7884, + "step": 23836 + }, + { + "epoch": 1.1097842028074587, + "grad_norm": 0.3598971076956592, + "learning_rate": 7.939802758013537e-05, + "loss": 2.8343, + "step": 23837 + }, + { + "epoch": 1.1098307609935516, + "grad_norm": 0.3292789727435888, + "learning_rate": 7.939583646816183e-05, + "loss": 2.8361, + "step": 23838 + }, + { + "epoch": 1.1098773191796447, + "grad_norm": 0.3476991136677265, + "learning_rate": 7.939364526991442e-05, + "loss": 2.898, + "step": 23839 + }, + { + "epoch": 1.1099238773657378, + "grad_norm": 0.3423538216569305, + "learning_rate": 7.939145398539955e-05, + "loss": 2.8433, + "step": 23840 + }, + { + "epoch": 1.109970435551831, + "grad_norm": 0.3391762540935956, + "learning_rate": 7.938926261462366e-05, + "loss": 2.762, + "step": 23841 + }, + { + "epoch": 1.110016993737924, + "grad_norm": 0.3165046450251363, + "learning_rate": 7.938707115759318e-05, + "loss": 2.7829, + "step": 23842 + }, + { + "epoch": 1.110063551924017, + "grad_norm": 0.3526780325917921, + "learning_rate": 7.938487961431453e-05, + "loss": 2.8188, + "step": 23843 + }, + { + "epoch": 1.11011011011011, + "grad_norm": 0.3262839792898097, + "learning_rate": 7.938268798479418e-05, + "loss": 2.7399, + "step": 23844 + }, + { + "epoch": 1.1101566682962032, + "grad_norm": 0.3544524480279073, + "learning_rate": 7.938049626903852e-05, + "loss": 2.8649, + "step": 23845 + }, + { + "epoch": 1.1102032264822963, + "grad_norm": 0.32639767882230075, + "learning_rate": 7.937830446705398e-05, + "loss": 2.7946, + "step": 23846 + }, + { + "epoch": 1.1102497846683894, + "grad_norm": 0.3346488960244022, + "learning_rate": 7.937611257884704e-05, + "loss": 2.7654, + "step": 23847 + }, + { + "epoch": 1.1102963428544823, + "grad_norm": 0.34412004465460583, + "learning_rate": 7.937392060442409e-05, + "loss": 2.8225, + "step": 23848 + }, + { + "epoch": 1.1103429010405754, + "grad_norm": 0.3413687986524646, + "learning_rate": 7.937172854379158e-05, + "loss": 2.8112, + "step": 23849 + }, + { + "epoch": 1.1103894592266685, + "grad_norm": 0.3294288073356275, + "learning_rate": 7.936953639695595e-05, + "loss": 2.9325, + "step": 23850 + }, + { + "epoch": 1.1104360174127617, + "grad_norm": 0.35493326943696757, + "learning_rate": 7.93673441639236e-05, + "loss": 2.7773, + 
"step": 23851 + }, + { + "epoch": 1.1104825755988548, + "grad_norm": 0.33038944097302964, + "learning_rate": 7.9365151844701e-05, + "loss": 2.8519, + "step": 23852 + }, + { + "epoch": 1.1105291337849477, + "grad_norm": 0.35505489838924215, + "learning_rate": 7.936295943929458e-05, + "loss": 2.8184, + "step": 23853 + }, + { + "epoch": 1.1105756919710408, + "grad_norm": 0.3469757974524099, + "learning_rate": 7.936076694771076e-05, + "loss": 2.6997, + "step": 23854 + }, + { + "epoch": 1.110622250157134, + "grad_norm": 0.31809571289550714, + "learning_rate": 7.935857436995597e-05, + "loss": 2.8398, + "step": 23855 + }, + { + "epoch": 1.110668808343227, + "grad_norm": 0.38360865396490723, + "learning_rate": 7.935638170603667e-05, + "loss": 2.7376, + "step": 23856 + }, + { + "epoch": 1.1107153665293201, + "grad_norm": 0.29293599972933315, + "learning_rate": 7.935418895595928e-05, + "loss": 2.7485, + "step": 23857 + }, + { + "epoch": 1.110761924715413, + "grad_norm": 0.36409759163480887, + "learning_rate": 7.935199611973025e-05, + "loss": 2.7282, + "step": 23858 + }, + { + "epoch": 1.1108084829015061, + "grad_norm": 0.3098982909220931, + "learning_rate": 7.934980319735599e-05, + "loss": 2.8273, + "step": 23859 + }, + { + "epoch": 1.1108550410875992, + "grad_norm": 0.34359440399492186, + "learning_rate": 7.934761018884293e-05, + "loss": 2.803, + "step": 23860 + }, + { + "epoch": 1.1109015992736924, + "grad_norm": 0.32788913522179086, + "learning_rate": 7.934541709419757e-05, + "loss": 2.8544, + "step": 23861 + }, + { + "epoch": 1.1109481574597853, + "grad_norm": 0.30542142892158836, + "learning_rate": 7.934322391342626e-05, + "loss": 2.8898, + "step": 23862 + }, + { + "epoch": 1.1109947156458784, + "grad_norm": 0.3279839731310198, + "learning_rate": 7.934103064653549e-05, + "loss": 2.8246, + "step": 23863 + }, + { + "epoch": 1.1110412738319715, + "grad_norm": 0.315864710168605, + "learning_rate": 7.93388372935317e-05, + "loss": 2.8227, + "step": 23864 + }, + { + "epoch": 1.1110878320180646, + "grad_norm": 0.36082972365259103, + "learning_rate": 7.93366438544213e-05, + "loss": 2.8547, + "step": 23865 + }, + { + "epoch": 1.1111343902041577, + "grad_norm": 0.34562867056119084, + "learning_rate": 7.933445032921074e-05, + "loss": 2.8324, + "step": 23866 + }, + { + "epoch": 1.1111809483902508, + "grad_norm": 0.33619147929464466, + "learning_rate": 7.933225671790647e-05, + "loss": 2.8198, + "step": 23867 + }, + { + "epoch": 1.1112275065763437, + "grad_norm": 0.3360304802064837, + "learning_rate": 7.933006302051491e-05, + "loss": 2.8191, + "step": 23868 + }, + { + "epoch": 1.1112740647624368, + "grad_norm": 0.3147815060532406, + "learning_rate": 7.93278692370425e-05, + "loss": 2.8447, + "step": 23869 + }, + { + "epoch": 1.11132062294853, + "grad_norm": 0.3360901589808304, + "learning_rate": 7.932567536749568e-05, + "loss": 2.8013, + "step": 23870 + }, + { + "epoch": 1.111367181134623, + "grad_norm": 0.3396173142148855, + "learning_rate": 7.93234814118809e-05, + "loss": 2.9488, + "step": 23871 + }, + { + "epoch": 1.111413739320716, + "grad_norm": 0.346488204832859, + "learning_rate": 7.932128737020459e-05, + "loss": 2.8198, + "step": 23872 + }, + { + "epoch": 1.111460297506809, + "grad_norm": 0.3223441218987898, + "learning_rate": 7.931909324247318e-05, + "loss": 2.8085, + "step": 23873 + }, + { + "epoch": 1.1115068556929022, + "grad_norm": 0.34971099505360576, + "learning_rate": 7.931689902869314e-05, + "loss": 2.8248, + "step": 23874 + }, + { + "epoch": 1.1115534138789953, + "grad_norm": 
0.32306511608237315, + "learning_rate": 7.931470472887087e-05, + "loss": 2.7281, + "step": 23875 + }, + { + "epoch": 1.1115999720650884, + "grad_norm": 0.32722261223148325, + "learning_rate": 7.931251034301283e-05, + "loss": 2.8834, + "step": 23876 + }, + { + "epoch": 1.1116465302511813, + "grad_norm": 0.31294702930102164, + "learning_rate": 7.931031587112548e-05, + "loss": 2.7803, + "step": 23877 + }, + { + "epoch": 1.1116930884372744, + "grad_norm": 0.31542099056885925, + "learning_rate": 7.930812131321522e-05, + "loss": 2.7853, + "step": 23878 + }, + { + "epoch": 1.1117396466233676, + "grad_norm": 0.3282464886210352, + "learning_rate": 7.930592666928852e-05, + "loss": 2.8592, + "step": 23879 + }, + { + "epoch": 1.1117862048094607, + "grad_norm": 0.32616220836606463, + "learning_rate": 7.930373193935182e-05, + "loss": 2.8286, + "step": 23880 + }, + { + "epoch": 1.1118327629955538, + "grad_norm": 0.3383728638412998, + "learning_rate": 7.930153712341155e-05, + "loss": 2.8815, + "step": 23881 + }, + { + "epoch": 1.1118793211816467, + "grad_norm": 0.3053672795004568, + "learning_rate": 7.929934222147416e-05, + "loss": 2.888, + "step": 23882 + }, + { + "epoch": 1.1119258793677398, + "grad_norm": 0.3363946177662063, + "learning_rate": 7.929714723354607e-05, + "loss": 2.9664, + "step": 23883 + }, + { + "epoch": 1.111972437553833, + "grad_norm": 0.32640313162335255, + "learning_rate": 7.929495215963373e-05, + "loss": 2.8118, + "step": 23884 + }, + { + "epoch": 1.112018995739926, + "grad_norm": 0.3373054177993344, + "learning_rate": 7.929275699974361e-05, + "loss": 2.7231, + "step": 23885 + }, + { + "epoch": 1.1120655539260191, + "grad_norm": 0.31287649773537574, + "learning_rate": 7.929056175388215e-05, + "loss": 2.8018, + "step": 23886 + }, + { + "epoch": 1.112112112112112, + "grad_norm": 0.3256736850625, + "learning_rate": 7.928836642205575e-05, + "loss": 2.9438, + "step": 23887 + }, + { + "epoch": 1.1121586702982051, + "grad_norm": 0.32580813608133075, + "learning_rate": 7.928617100427088e-05, + "loss": 2.771, + "step": 23888 + }, + { + "epoch": 1.1122052284842983, + "grad_norm": 0.3409261305136276, + "learning_rate": 7.9283975500534e-05, + "loss": 2.8374, + "step": 23889 + }, + { + "epoch": 1.1122517866703914, + "grad_norm": 0.3837019028728709, + "learning_rate": 7.928177991085153e-05, + "loss": 2.9315, + "step": 23890 + }, + { + "epoch": 1.1122983448564845, + "grad_norm": 0.33941495992115434, + "learning_rate": 7.927958423522992e-05, + "loss": 2.9144, + "step": 23891 + }, + { + "epoch": 1.1123449030425774, + "grad_norm": 0.3758697004995665, + "learning_rate": 7.927738847367561e-05, + "loss": 2.733, + "step": 23892 + }, + { + "epoch": 1.1123914612286705, + "grad_norm": 0.3192520339581768, + "learning_rate": 7.927519262619505e-05, + "loss": 2.7887, + "step": 23893 + }, + { + "epoch": 1.1124380194147636, + "grad_norm": 0.3857687197966913, + "learning_rate": 7.927299669279469e-05, + "loss": 2.7988, + "step": 23894 + }, + { + "epoch": 1.1124845776008567, + "grad_norm": 0.36431659914031395, + "learning_rate": 7.927080067348096e-05, + "loss": 2.8427, + "step": 23895 + }, + { + "epoch": 1.1125311357869498, + "grad_norm": 0.3715214562244176, + "learning_rate": 7.926860456826032e-05, + "loss": 2.8095, + "step": 23896 + }, + { + "epoch": 1.1125776939730427, + "grad_norm": 0.395979338020268, + "learning_rate": 7.926640837713922e-05, + "loss": 2.8728, + "step": 23897 + }, + { + "epoch": 1.1126242521591359, + "grad_norm": 0.34693858151549645, + "learning_rate": 7.926421210012406e-05, + "loss": 2.6791, + 
"step": 23898 + }, + { + "epoch": 1.112670810345229, + "grad_norm": 0.37762404533557364, + "learning_rate": 7.926201573722137e-05, + "loss": 2.8234, + "step": 23899 + }, + { + "epoch": 1.112717368531322, + "grad_norm": 0.301318902373053, + "learning_rate": 7.925981928843752e-05, + "loss": 2.7997, + "step": 23900 + }, + { + "epoch": 1.112763926717415, + "grad_norm": 0.37658593454484723, + "learning_rate": 7.925762275377897e-05, + "loss": 2.906, + "step": 23901 + }, + { + "epoch": 1.112810484903508, + "grad_norm": 0.31833717640126113, + "learning_rate": 7.925542613325219e-05, + "loss": 2.9857, + "step": 23902 + }, + { + "epoch": 1.1128570430896012, + "grad_norm": 0.36653913484813033, + "learning_rate": 7.925322942686362e-05, + "loss": 2.7937, + "step": 23903 + }, + { + "epoch": 1.1129036012756943, + "grad_norm": 0.3540781766202771, + "learning_rate": 7.92510326346197e-05, + "loss": 2.7639, + "step": 23904 + }, + { + "epoch": 1.1129501594617874, + "grad_norm": 0.38611811097704546, + "learning_rate": 7.924883575652689e-05, + "loss": 2.8711, + "step": 23905 + }, + { + "epoch": 1.1129967176478806, + "grad_norm": 0.3585101284480906, + "learning_rate": 7.924663879259162e-05, + "loss": 2.7374, + "step": 23906 + }, + { + "epoch": 1.1130432758339734, + "grad_norm": 0.3954362462556675, + "learning_rate": 7.924444174282036e-05, + "loss": 2.7936, + "step": 23907 + }, + { + "epoch": 1.1130898340200666, + "grad_norm": 0.36890727735410483, + "learning_rate": 7.924224460721953e-05, + "loss": 2.91, + "step": 23908 + }, + { + "epoch": 1.1131363922061597, + "grad_norm": 0.4040181374132372, + "learning_rate": 7.92400473857956e-05, + "loss": 2.7709, + "step": 23909 + }, + { + "epoch": 1.1131829503922528, + "grad_norm": 0.3627505678751574, + "learning_rate": 7.923785007855499e-05, + "loss": 2.8582, + "step": 23910 + }, + { + "epoch": 1.1132295085783457, + "grad_norm": 0.34968656215230665, + "learning_rate": 7.923565268550419e-05, + "loss": 2.7004, + "step": 23911 + }, + { + "epoch": 1.1132760667644388, + "grad_norm": 0.3896747571713127, + "learning_rate": 7.923345520664964e-05, + "loss": 2.8523, + "step": 23912 + }, + { + "epoch": 1.113322624950532, + "grad_norm": 0.3732117550712896, + "learning_rate": 7.923125764199777e-05, + "loss": 2.794, + "step": 23913 + }, + { + "epoch": 1.113369183136625, + "grad_norm": 0.3433975147112726, + "learning_rate": 7.922905999155503e-05, + "loss": 2.7613, + "step": 23914 + }, + { + "epoch": 1.1134157413227181, + "grad_norm": 0.3836321290449372, + "learning_rate": 7.922686225532788e-05, + "loss": 2.7984, + "step": 23915 + }, + { + "epoch": 1.113462299508811, + "grad_norm": 0.34458335998299805, + "learning_rate": 7.922466443332277e-05, + "loss": 2.8758, + "step": 23916 + }, + { + "epoch": 1.1135088576949042, + "grad_norm": 0.36188171130761215, + "learning_rate": 7.922246652554616e-05, + "loss": 2.7396, + "step": 23917 + }, + { + "epoch": 1.1135554158809973, + "grad_norm": 0.36827657615342363, + "learning_rate": 7.922026853200448e-05, + "loss": 2.7479, + "step": 23918 + }, + { + "epoch": 1.1136019740670904, + "grad_norm": 0.3212101328862706, + "learning_rate": 7.921807045270419e-05, + "loss": 2.7899, + "step": 23919 + }, + { + "epoch": 1.1136485322531835, + "grad_norm": 0.36409516177440615, + "learning_rate": 7.921587228765174e-05, + "loss": 2.8822, + "step": 23920 + }, + { + "epoch": 1.1136950904392764, + "grad_norm": 0.3268443130737074, + "learning_rate": 7.92136740368536e-05, + "loss": 2.8091, + "step": 23921 + }, + { + "epoch": 1.1137416486253695, + "grad_norm": 
0.35212428444208255, + "learning_rate": 7.921147570031617e-05, + "loss": 2.9035, + "step": 23922 + }, + { + "epoch": 1.1137882068114626, + "grad_norm": 0.3378748676762492, + "learning_rate": 7.920927727804596e-05, + "loss": 2.7484, + "step": 23923 + }, + { + "epoch": 1.1138347649975557, + "grad_norm": 0.3402714158476696, + "learning_rate": 7.92070787700494e-05, + "loss": 2.7058, + "step": 23924 + }, + { + "epoch": 1.1138813231836489, + "grad_norm": 0.34290144552582685, + "learning_rate": 7.920488017633294e-05, + "loss": 2.7728, + "step": 23925 + }, + { + "epoch": 1.1139278813697417, + "grad_norm": 0.3397032361429518, + "learning_rate": 7.920268149690304e-05, + "loss": 2.8558, + "step": 23926 + }, + { + "epoch": 1.1139744395558349, + "grad_norm": 0.35691819977540973, + "learning_rate": 7.920048273176613e-05, + "loss": 2.8068, + "step": 23927 + }, + { + "epoch": 1.114020997741928, + "grad_norm": 0.34258294208294704, + "learning_rate": 7.919828388092869e-05, + "loss": 2.788, + "step": 23928 + }, + { + "epoch": 1.114067555928021, + "grad_norm": 0.3529631665939149, + "learning_rate": 7.919608494439716e-05, + "loss": 2.7673, + "step": 23929 + }, + { + "epoch": 1.1141141141141142, + "grad_norm": 0.3424340758141807, + "learning_rate": 7.9193885922178e-05, + "loss": 2.7324, + "step": 23930 + }, + { + "epoch": 1.114160672300207, + "grad_norm": 0.3474496942021374, + "learning_rate": 7.919168681427766e-05, + "loss": 2.9089, + "step": 23931 + }, + { + "epoch": 1.1142072304863002, + "grad_norm": 0.352643131696681, + "learning_rate": 7.91894876207026e-05, + "loss": 2.9423, + "step": 23932 + }, + { + "epoch": 1.1142537886723933, + "grad_norm": 0.3721767445410592, + "learning_rate": 7.918728834145926e-05, + "loss": 2.8722, + "step": 23933 + }, + { + "epoch": 1.1143003468584864, + "grad_norm": 0.3147985137353613, + "learning_rate": 7.91850889765541e-05, + "loss": 2.8167, + "step": 23934 + }, + { + "epoch": 1.1143469050445796, + "grad_norm": 0.3514780128071944, + "learning_rate": 7.91828895259936e-05, + "loss": 2.7232, + "step": 23935 + }, + { + "epoch": 1.1143934632306725, + "grad_norm": 0.3048114769899479, + "learning_rate": 7.918068998978418e-05, + "loss": 2.7919, + "step": 23936 + }, + { + "epoch": 1.1144400214167656, + "grad_norm": 0.32784761591866096, + "learning_rate": 7.91784903679323e-05, + "loss": 2.8, + "step": 23937 + }, + { + "epoch": 1.1144865796028587, + "grad_norm": 0.32997937407014194, + "learning_rate": 7.917629066044443e-05, + "loss": 2.8786, + "step": 23938 + }, + { + "epoch": 1.1145331377889518, + "grad_norm": 0.31406963557979983, + "learning_rate": 7.917409086732703e-05, + "loss": 2.8433, + "step": 23939 + }, + { + "epoch": 1.114579695975045, + "grad_norm": 0.3174049772821928, + "learning_rate": 7.917189098858654e-05, + "loss": 2.8955, + "step": 23940 + }, + { + "epoch": 1.1146262541611378, + "grad_norm": 0.32870922680614856, + "learning_rate": 7.916969102422944e-05, + "loss": 2.769, + "step": 23941 + }, + { + "epoch": 1.114672812347231, + "grad_norm": 0.35238923784395554, + "learning_rate": 7.916749097426215e-05, + "loss": 2.7595, + "step": 23942 + }, + { + "epoch": 1.114719370533324, + "grad_norm": 0.30377499857503043, + "learning_rate": 7.916529083869116e-05, + "loss": 2.8512, + "step": 23943 + }, + { + "epoch": 1.1147659287194172, + "grad_norm": 0.32288941115960773, + "learning_rate": 7.916309061752291e-05, + "loss": 2.8285, + "step": 23944 + }, + { + "epoch": 1.1148124869055103, + "grad_norm": 0.30597216435856595, + "learning_rate": 7.916089031076388e-05, + "loss": 2.7887, + 
"step": 23945 + }, + { + "epoch": 1.1148590450916032, + "grad_norm": 0.34393304425096427, + "learning_rate": 7.915868991842048e-05, + "loss": 2.8627, + "step": 23946 + }, + { + "epoch": 1.1149056032776963, + "grad_norm": 0.338485857358189, + "learning_rate": 7.915648944049921e-05, + "loss": 2.7751, + "step": 23947 + }, + { + "epoch": 1.1149521614637894, + "grad_norm": 0.3578676284019428, + "learning_rate": 7.915428887700654e-05, + "loss": 2.7989, + "step": 23948 + }, + { + "epoch": 1.1149987196498825, + "grad_norm": 0.3415560882017389, + "learning_rate": 7.915208822794886e-05, + "loss": 2.837, + "step": 23949 + }, + { + "epoch": 1.1150452778359754, + "grad_norm": 0.35071500118055554, + "learning_rate": 7.91498874933327e-05, + "loss": 2.8278, + "step": 23950 + }, + { + "epoch": 1.1150918360220685, + "grad_norm": 0.345411509644357, + "learning_rate": 7.914768667316448e-05, + "loss": 2.8318, + "step": 23951 + }, + { + "epoch": 1.1151383942081616, + "grad_norm": 0.3256931554237053, + "learning_rate": 7.914548576745068e-05, + "loss": 2.7275, + "step": 23952 + }, + { + "epoch": 1.1151849523942547, + "grad_norm": 0.34893877697191766, + "learning_rate": 7.914328477619775e-05, + "loss": 2.7893, + "step": 23953 + }, + { + "epoch": 1.1152315105803479, + "grad_norm": 0.34088895459937407, + "learning_rate": 7.914108369941215e-05, + "loss": 2.8683, + "step": 23954 + }, + { + "epoch": 1.1152780687664408, + "grad_norm": 0.35890363896905747, + "learning_rate": 7.913888253710034e-05, + "loss": 2.7442, + "step": 23955 + }, + { + "epoch": 1.1153246269525339, + "grad_norm": 0.34534818850294785, + "learning_rate": 7.913668128926877e-05, + "loss": 2.7395, + "step": 23956 + }, + { + "epoch": 1.115371185138627, + "grad_norm": 0.3148862024087229, + "learning_rate": 7.913447995592393e-05, + "loss": 2.9101, + "step": 23957 + }, + { + "epoch": 1.11541774332472, + "grad_norm": 0.35431959519524653, + "learning_rate": 7.913227853707225e-05, + "loss": 2.772, + "step": 23958 + }, + { + "epoch": 1.1154643015108132, + "grad_norm": 0.3192275256561977, + "learning_rate": 7.913007703272019e-05, + "loss": 2.7732, + "step": 23959 + }, + { + "epoch": 1.115510859696906, + "grad_norm": 0.32311487553407603, + "learning_rate": 7.912787544287424e-05, + "loss": 2.9328, + "step": 23960 + }, + { + "epoch": 1.1155574178829992, + "grad_norm": 0.34275266548195066, + "learning_rate": 7.912567376754084e-05, + "loss": 2.8706, + "step": 23961 + }, + { + "epoch": 1.1156039760690923, + "grad_norm": 0.34611008170512564, + "learning_rate": 7.912347200672648e-05, + "loss": 2.7702, + "step": 23962 + }, + { + "epoch": 1.1156505342551855, + "grad_norm": 0.3355990292378774, + "learning_rate": 7.912127016043756e-05, + "loss": 2.8151, + "step": 23963 + }, + { + "epoch": 1.1156970924412786, + "grad_norm": 0.343475391817846, + "learning_rate": 7.91190682286806e-05, + "loss": 2.8806, + "step": 23964 + }, + { + "epoch": 1.1157436506273715, + "grad_norm": 0.3835331406301484, + "learning_rate": 7.911686621146203e-05, + "loss": 2.9076, + "step": 23965 + }, + { + "epoch": 1.1157902088134646, + "grad_norm": 0.3454736747589592, + "learning_rate": 7.911466410878836e-05, + "loss": 2.8106, + "step": 23966 + }, + { + "epoch": 1.1158367669995577, + "grad_norm": 0.36187066991769845, + "learning_rate": 7.9112461920666e-05, + "loss": 2.849, + "step": 23967 + }, + { + "epoch": 1.1158833251856508, + "grad_norm": 0.30883244536486565, + "learning_rate": 7.911025964710142e-05, + "loss": 2.7523, + "step": 23968 + }, + { + "epoch": 1.115929883371744, + "grad_norm": 
0.345867827942442, + "learning_rate": 7.910805728810112e-05, + "loss": 2.8401, + "step": 23969 + }, + { + "epoch": 1.1159764415578368, + "grad_norm": 0.343404072071495, + "learning_rate": 7.910585484367151e-05, + "loss": 2.8688, + "step": 23970 + }, + { + "epoch": 1.11602299974393, + "grad_norm": 0.32184214718408377, + "learning_rate": 7.910365231381911e-05, + "loss": 2.751, + "step": 23971 + }, + { + "epoch": 1.116069557930023, + "grad_norm": 0.3317470314590191, + "learning_rate": 7.910144969855033e-05, + "loss": 2.7978, + "step": 23972 + }, + { + "epoch": 1.1161161161161162, + "grad_norm": 0.3177228701166754, + "learning_rate": 7.90992469978717e-05, + "loss": 2.8751, + "step": 23973 + }, + { + "epoch": 1.1161626743022093, + "grad_norm": 0.352095850173098, + "learning_rate": 7.90970442117896e-05, + "loss": 2.8599, + "step": 23974 + }, + { + "epoch": 1.1162092324883022, + "grad_norm": 0.3419969591808185, + "learning_rate": 7.909484134031058e-05, + "loss": 2.7021, + "step": 23975 + }, + { + "epoch": 1.1162557906743953, + "grad_norm": 0.3344618749580402, + "learning_rate": 7.909263838344105e-05, + "loss": 2.8504, + "step": 23976 + }, + { + "epoch": 1.1163023488604884, + "grad_norm": 0.31406016449948965, + "learning_rate": 7.90904353411875e-05, + "loss": 2.784, + "step": 23977 + }, + { + "epoch": 1.1163489070465815, + "grad_norm": 0.3329973792740651, + "learning_rate": 7.90882322135564e-05, + "loss": 2.8846, + "step": 23978 + }, + { + "epoch": 1.1163954652326746, + "grad_norm": 0.3296642436232766, + "learning_rate": 7.908602900055418e-05, + "loss": 2.7895, + "step": 23979 + }, + { + "epoch": 1.1164420234187675, + "grad_norm": 0.32320869357370613, + "learning_rate": 7.908382570218733e-05, + "loss": 2.8741, + "step": 23980 + }, + { + "epoch": 1.1164885816048606, + "grad_norm": 0.3353464552090488, + "learning_rate": 7.908162231846232e-05, + "loss": 2.8113, + "step": 23981 + }, + { + "epoch": 1.1165351397909538, + "grad_norm": 0.32905877011708107, + "learning_rate": 7.907941884938563e-05, + "loss": 2.7865, + "step": 23982 + }, + { + "epoch": 1.1165816979770469, + "grad_norm": 0.33850063696740257, + "learning_rate": 7.90772152949637e-05, + "loss": 2.9057, + "step": 23983 + }, + { + "epoch": 1.11662825616314, + "grad_norm": 0.3393409301927044, + "learning_rate": 7.907501165520302e-05, + "loss": 2.7432, + "step": 23984 + }, + { + "epoch": 1.1166748143492329, + "grad_norm": 0.3431976805788864, + "learning_rate": 7.907280793011003e-05, + "loss": 2.8664, + "step": 23985 + }, + { + "epoch": 1.116721372535326, + "grad_norm": 0.34478100640837256, + "learning_rate": 7.90706041196912e-05, + "loss": 2.7833, + "step": 23986 + }, + { + "epoch": 1.116767930721419, + "grad_norm": 0.33045095909547373, + "learning_rate": 7.906840022395303e-05, + "loss": 2.8589, + "step": 23987 + }, + { + "epoch": 1.1168144889075122, + "grad_norm": 0.3498780177102051, + "learning_rate": 7.906619624290197e-05, + "loss": 2.7864, + "step": 23988 + }, + { + "epoch": 1.1168610470936051, + "grad_norm": 0.33527775698214507, + "learning_rate": 7.906399217654448e-05, + "loss": 2.8007, + "step": 23989 + }, + { + "epoch": 1.1169076052796982, + "grad_norm": 0.33682342112919844, + "learning_rate": 7.906178802488704e-05, + "loss": 2.8004, + "step": 23990 + }, + { + "epoch": 1.1169541634657913, + "grad_norm": 0.35241966059638735, + "learning_rate": 7.905958378793611e-05, + "loss": 2.8376, + "step": 23991 + }, + { + "epoch": 1.1170007216518845, + "grad_norm": 0.3171256268696276, + "learning_rate": 7.905737946569817e-05, + "loss": 2.8714, + "step": 
23992 + }, + { + "epoch": 1.1170472798379776, + "grad_norm": 0.3277231585268201, + "learning_rate": 7.90551750581797e-05, + "loss": 2.845, + "step": 23993 + }, + { + "epoch": 1.1170938380240707, + "grad_norm": 0.34167270464909094, + "learning_rate": 7.905297056538712e-05, + "loss": 2.9212, + "step": 23994 + }, + { + "epoch": 1.1171403962101636, + "grad_norm": 0.35387512500659807, + "learning_rate": 7.905076598732696e-05, + "loss": 2.8584, + "step": 23995 + }, + { + "epoch": 1.1171869543962567, + "grad_norm": 0.35397756555987586, + "learning_rate": 7.904856132400565e-05, + "loss": 2.773, + "step": 23996 + }, + { + "epoch": 1.1172335125823498, + "grad_norm": 0.34247588274532176, + "learning_rate": 7.904635657542967e-05, + "loss": 2.8035, + "step": 23997 + }, + { + "epoch": 1.117280070768443, + "grad_norm": 0.37083913500004745, + "learning_rate": 7.904415174160551e-05, + "loss": 2.8264, + "step": 23998 + }, + { + "epoch": 1.1173266289545358, + "grad_norm": 0.34251463590829645, + "learning_rate": 7.904194682253962e-05, + "loss": 2.8727, + "step": 23999 + }, + { + "epoch": 1.117373187140629, + "grad_norm": 0.3746523037489907, + "learning_rate": 7.903974181823848e-05, + "loss": 2.8231, + "step": 24000 + }, + { + "epoch": 1.117419745326722, + "grad_norm": 0.3256972951317739, + "learning_rate": 7.903753672870855e-05, + "loss": 2.8475, + "step": 24001 + }, + { + "epoch": 1.1174663035128152, + "grad_norm": 0.3968578867305732, + "learning_rate": 7.903533155395632e-05, + "loss": 2.8441, + "step": 24002 + }, + { + "epoch": 1.1175128616989083, + "grad_norm": 0.3544439768999604, + "learning_rate": 7.903312629398825e-05, + "loss": 2.8106, + "step": 24003 + }, + { + "epoch": 1.1175594198850012, + "grad_norm": 0.3756583568380797, + "learning_rate": 7.90309209488108e-05, + "loss": 2.8222, + "step": 24004 + }, + { + "epoch": 1.1176059780710943, + "grad_norm": 0.3807582558711663, + "learning_rate": 7.902871551843048e-05, + "loss": 2.8457, + "step": 24005 + }, + { + "epoch": 1.1176525362571874, + "grad_norm": 0.3860805710081122, + "learning_rate": 7.902651000285372e-05, + "loss": 2.7316, + "step": 24006 + }, + { + "epoch": 1.1176990944432805, + "grad_norm": 0.36247948680021974, + "learning_rate": 7.902430440208704e-05, + "loss": 2.9236, + "step": 24007 + }, + { + "epoch": 1.1177456526293736, + "grad_norm": 0.36032683109857033, + "learning_rate": 7.902209871613686e-05, + "loss": 2.7288, + "step": 24008 + }, + { + "epoch": 1.1177922108154665, + "grad_norm": 0.3467511071809119, + "learning_rate": 7.901989294500969e-05, + "loss": 2.8589, + "step": 24009 + }, + { + "epoch": 1.1178387690015597, + "grad_norm": 0.34736369269154155, + "learning_rate": 7.901768708871198e-05, + "loss": 2.8514, + "step": 24010 + }, + { + "epoch": 1.1178853271876528, + "grad_norm": 0.3372019000727836, + "learning_rate": 7.901548114725023e-05, + "loss": 2.7849, + "step": 24011 + }, + { + "epoch": 1.1179318853737459, + "grad_norm": 0.3284117808338192, + "learning_rate": 7.90132751206309e-05, + "loss": 2.8067, + "step": 24012 + }, + { + "epoch": 1.117978443559839, + "grad_norm": 0.34292182840896046, + "learning_rate": 7.901106900886047e-05, + "loss": 2.7757, + "step": 24013 + }, + { + "epoch": 1.118025001745932, + "grad_norm": 0.32941960278945204, + "learning_rate": 7.90088628119454e-05, + "loss": 2.7715, + "step": 24014 + }, + { + "epoch": 1.118071559932025, + "grad_norm": 0.31888277230661516, + "learning_rate": 7.900665652989217e-05, + "loss": 2.7468, + "step": 24015 + }, + { + "epoch": 1.1181181181181181, + "grad_norm": 0.3347145671612606, 
+ "learning_rate": 7.900445016270727e-05, + "loss": 2.7346, + "step": 24016 + }, + { + "epoch": 1.1181646763042112, + "grad_norm": 0.32669579499952073, + "learning_rate": 7.900224371039717e-05, + "loss": 2.8761, + "step": 24017 + }, + { + "epoch": 1.1182112344903044, + "grad_norm": 0.32349164519143564, + "learning_rate": 7.900003717296834e-05, + "loss": 2.8567, + "step": 24018 + }, + { + "epoch": 1.1182577926763972, + "grad_norm": 0.29527232632561135, + "learning_rate": 7.899783055042726e-05, + "loss": 2.8876, + "step": 24019 + }, + { + "epoch": 1.1183043508624904, + "grad_norm": 0.33934007350094014, + "learning_rate": 7.899562384278041e-05, + "loss": 2.9506, + "step": 24020 + }, + { + "epoch": 1.1183509090485835, + "grad_norm": 0.3179987153604606, + "learning_rate": 7.899341705003424e-05, + "loss": 2.8821, + "step": 24021 + }, + { + "epoch": 1.1183974672346766, + "grad_norm": 0.3257583847480135, + "learning_rate": 7.899121017219526e-05, + "loss": 2.8084, + "step": 24022 + }, + { + "epoch": 1.1184440254207697, + "grad_norm": 0.35461203872660557, + "learning_rate": 7.898900320926994e-05, + "loss": 2.7289, + "step": 24023 + }, + { + "epoch": 1.1184905836068626, + "grad_norm": 0.31389507846994547, + "learning_rate": 7.898679616126474e-05, + "loss": 2.7953, + "step": 24024 + }, + { + "epoch": 1.1185371417929557, + "grad_norm": 0.3541954645817473, + "learning_rate": 7.898458902818616e-05, + "loss": 2.7891, + "step": 24025 + }, + { + "epoch": 1.1185836999790488, + "grad_norm": 0.3316587383766137, + "learning_rate": 7.898238181004067e-05, + "loss": 2.9174, + "step": 24026 + }, + { + "epoch": 1.118630258165142, + "grad_norm": 0.33072198469710595, + "learning_rate": 7.898017450683473e-05, + "loss": 2.7613, + "step": 24027 + }, + { + "epoch": 1.118676816351235, + "grad_norm": 0.30526324642192987, + "learning_rate": 7.897796711857484e-05, + "loss": 2.8093, + "step": 24028 + }, + { + "epoch": 1.118723374537328, + "grad_norm": 0.37498261814955636, + "learning_rate": 7.897575964526746e-05, + "loss": 2.7574, + "step": 24029 + }, + { + "epoch": 1.118769932723421, + "grad_norm": 0.296629149232686, + "learning_rate": 7.89735520869191e-05, + "loss": 2.7992, + "step": 24030 + }, + { + "epoch": 1.1188164909095142, + "grad_norm": 0.3664727767909322, + "learning_rate": 7.897134444353619e-05, + "loss": 2.9042, + "step": 24031 + }, + { + "epoch": 1.1188630490956073, + "grad_norm": 0.32897806551836334, + "learning_rate": 7.896913671512527e-05, + "loss": 2.8478, + "step": 24032 + }, + { + "epoch": 1.1189096072817004, + "grad_norm": 0.3549274965809141, + "learning_rate": 7.896692890169278e-05, + "loss": 2.7583, + "step": 24033 + }, + { + "epoch": 1.1189561654677933, + "grad_norm": 0.3280975542906072, + "learning_rate": 7.896472100324518e-05, + "loss": 2.7571, + "step": 24034 + }, + { + "epoch": 1.1190027236538864, + "grad_norm": 0.3368637443412826, + "learning_rate": 7.896251301978901e-05, + "loss": 2.807, + "step": 24035 + }, + { + "epoch": 1.1190492818399795, + "grad_norm": 0.34586252797394745, + "learning_rate": 7.896030495133071e-05, + "loss": 2.839, + "step": 24036 + }, + { + "epoch": 1.1190958400260727, + "grad_norm": 0.3328379186448991, + "learning_rate": 7.895809679787676e-05, + "loss": 2.9058, + "step": 24037 + }, + { + "epoch": 1.1191423982121655, + "grad_norm": 0.37671637091307103, + "learning_rate": 7.895588855943365e-05, + "loss": 2.8084, + "step": 24038 + }, + { + "epoch": 1.1191889563982587, + "grad_norm": 0.3561438766823432, + "learning_rate": 7.895368023600786e-05, + "loss": 2.7479, + "step": 24039 + 
}, + { + "epoch": 1.1192355145843518, + "grad_norm": 0.3679629832386914, + "learning_rate": 7.895147182760589e-05, + "loss": 2.9049, + "step": 24040 + }, + { + "epoch": 1.119282072770445, + "grad_norm": 0.35307268500737843, + "learning_rate": 7.894926333423419e-05, + "loss": 2.8579, + "step": 24041 + }, + { + "epoch": 1.119328630956538, + "grad_norm": 0.35858026463329834, + "learning_rate": 7.894705475589922e-05, + "loss": 2.7559, + "step": 24042 + }, + { + "epoch": 1.119375189142631, + "grad_norm": 0.36192629974890644, + "learning_rate": 7.894484609260755e-05, + "loss": 2.8185, + "step": 24043 + }, + { + "epoch": 1.119421747328724, + "grad_norm": 0.3370452588238446, + "learning_rate": 7.894263734436557e-05, + "loss": 2.7908, + "step": 24044 + }, + { + "epoch": 1.1194683055148171, + "grad_norm": 0.3368046181938118, + "learning_rate": 7.894042851117981e-05, + "loss": 2.8134, + "step": 24045 + }, + { + "epoch": 1.1195148637009102, + "grad_norm": 0.34936627025195, + "learning_rate": 7.893821959305676e-05, + "loss": 2.7936, + "step": 24046 + }, + { + "epoch": 1.1195614218870034, + "grad_norm": 0.36143536312061697, + "learning_rate": 7.893601059000288e-05, + "loss": 2.8538, + "step": 24047 + }, + { + "epoch": 1.1196079800730963, + "grad_norm": 0.3463537353327032, + "learning_rate": 7.893380150202465e-05, + "loss": 2.9179, + "step": 24048 + }, + { + "epoch": 1.1196545382591894, + "grad_norm": 0.37171629619796354, + "learning_rate": 7.893159232912857e-05, + "loss": 2.8136, + "step": 24049 + }, + { + "epoch": 1.1197010964452825, + "grad_norm": 0.3514562942583763, + "learning_rate": 7.892938307132111e-05, + "loss": 2.8611, + "step": 24050 + }, + { + "epoch": 1.1197476546313756, + "grad_norm": 0.3633940508532121, + "learning_rate": 7.892717372860877e-05, + "loss": 2.8395, + "step": 24051 + }, + { + "epoch": 1.1197942128174687, + "grad_norm": 0.35784248031362725, + "learning_rate": 7.892496430099803e-05, + "loss": 2.8922, + "step": 24052 + }, + { + "epoch": 1.1198407710035616, + "grad_norm": 0.35321121159199687, + "learning_rate": 7.892275478849535e-05, + "loss": 2.774, + "step": 24053 + }, + { + "epoch": 1.1198873291896547, + "grad_norm": 0.3355159146513702, + "learning_rate": 7.892054519110725e-05, + "loss": 2.7249, + "step": 24054 + }, + { + "epoch": 1.1199338873757478, + "grad_norm": 0.3121212514694167, + "learning_rate": 7.891833550884018e-05, + "loss": 2.735, + "step": 24055 + }, + { + "epoch": 1.119980445561841, + "grad_norm": 0.330545017623262, + "learning_rate": 7.891612574170066e-05, + "loss": 2.798, + "step": 24056 + }, + { + "epoch": 1.120027003747934, + "grad_norm": 0.32906510247239257, + "learning_rate": 7.891391588969516e-05, + "loss": 2.8454, + "step": 24057 + }, + { + "epoch": 1.120073561934027, + "grad_norm": 0.3121898132136529, + "learning_rate": 7.891170595283016e-05, + "loss": 2.8013, + "step": 24058 + }, + { + "epoch": 1.12012012012012, + "grad_norm": 0.32848350491419415, + "learning_rate": 7.890949593111216e-05, + "loss": 2.8782, + "step": 24059 + }, + { + "epoch": 1.1201666783062132, + "grad_norm": 0.3120695390564126, + "learning_rate": 7.890728582454763e-05, + "loss": 2.8084, + "step": 24060 + }, + { + "epoch": 1.1202132364923063, + "grad_norm": 0.3435345825679159, + "learning_rate": 7.890507563314306e-05, + "loss": 2.8739, + "step": 24061 + }, + { + "epoch": 1.1202597946783994, + "grad_norm": 0.3226457443336153, + "learning_rate": 7.890286535690495e-05, + "loss": 2.9149, + "step": 24062 + }, + { + "epoch": 1.1203063528644923, + "grad_norm": 0.3233401836510117, + 
"learning_rate": 7.890065499583977e-05, + "loss": 2.797, + "step": 24063 + }, + { + "epoch": 1.1203529110505854, + "grad_norm": 0.3212640694601025, + "learning_rate": 7.889844454995402e-05, + "loss": 2.8724, + "step": 24064 + }, + { + "epoch": 1.1203994692366785, + "grad_norm": 0.3199356706814171, + "learning_rate": 7.889623401925418e-05, + "loss": 2.7718, + "step": 24065 + }, + { + "epoch": 1.1204460274227717, + "grad_norm": 0.34184215304370485, + "learning_rate": 7.889402340374675e-05, + "loss": 2.7549, + "step": 24066 + }, + { + "epoch": 1.1204925856088648, + "grad_norm": 0.2988382417879489, + "learning_rate": 7.88918127034382e-05, + "loss": 2.8114, + "step": 24067 + }, + { + "epoch": 1.1205391437949577, + "grad_norm": 0.3144215026434049, + "learning_rate": 7.888960191833501e-05, + "loss": 2.8712, + "step": 24068 + }, + { + "epoch": 1.1205857019810508, + "grad_norm": 0.3001244877767974, + "learning_rate": 7.888739104844369e-05, + "loss": 2.8394, + "step": 24069 + }, + { + "epoch": 1.120632260167144, + "grad_norm": 0.30823556836391774, + "learning_rate": 7.888518009377075e-05, + "loss": 2.8662, + "step": 24070 + }, + { + "epoch": 1.120678818353237, + "grad_norm": 0.29840439383886563, + "learning_rate": 7.888296905432262e-05, + "loss": 2.851, + "step": 24071 + }, + { + "epoch": 1.1207253765393301, + "grad_norm": 0.3362843693977761, + "learning_rate": 7.888075793010585e-05, + "loss": 2.8353, + "step": 24072 + }, + { + "epoch": 1.120771934725423, + "grad_norm": 0.3011232690325778, + "learning_rate": 7.887854672112687e-05, + "loss": 2.8061, + "step": 24073 + }, + { + "epoch": 1.1208184929115161, + "grad_norm": 0.3206784368020288, + "learning_rate": 7.887633542739221e-05, + "loss": 2.7994, + "step": 24074 + }, + { + "epoch": 1.1208650510976093, + "grad_norm": 0.3209530754882212, + "learning_rate": 7.887412404890837e-05, + "loss": 2.7674, + "step": 24075 + }, + { + "epoch": 1.1209116092837024, + "grad_norm": 0.36189671199156354, + "learning_rate": 7.887191258568181e-05, + "loss": 2.8324, + "step": 24076 + }, + { + "epoch": 1.1209581674697953, + "grad_norm": 0.30220171111101685, + "learning_rate": 7.886970103771903e-05, + "loss": 2.9169, + "step": 24077 + }, + { + "epoch": 1.1210047256558884, + "grad_norm": 0.33722747492121197, + "learning_rate": 7.88674894050265e-05, + "loss": 2.723, + "step": 24078 + }, + { + "epoch": 1.1210512838419815, + "grad_norm": 0.28539419638112046, + "learning_rate": 7.886527768761076e-05, + "loss": 2.8126, + "step": 24079 + }, + { + "epoch": 1.1210978420280746, + "grad_norm": 0.3545672722445007, + "learning_rate": 7.886306588547827e-05, + "loss": 2.8692, + "step": 24080 + }, + { + "epoch": 1.1211444002141677, + "grad_norm": 0.3036249580439, + "learning_rate": 7.886085399863552e-05, + "loss": 2.7724, + "step": 24081 + }, + { + "epoch": 1.1211909584002608, + "grad_norm": 0.3306249535780529, + "learning_rate": 7.885864202708899e-05, + "loss": 2.8879, + "step": 24082 + }, + { + "epoch": 1.1212375165863537, + "grad_norm": 0.3305687682606922, + "learning_rate": 7.885642997084522e-05, + "loss": 2.7605, + "step": 24083 + }, + { + "epoch": 1.1212840747724468, + "grad_norm": 0.31472361893623674, + "learning_rate": 7.885421782991064e-05, + "loss": 2.8448, + "step": 24084 + }, + { + "epoch": 1.12133063295854, + "grad_norm": 0.3077474258784104, + "learning_rate": 7.885200560429181e-05, + "loss": 2.8713, + "step": 24085 + }, + { + "epoch": 1.121377191144633, + "grad_norm": 0.31807247048277293, + "learning_rate": 7.884979329399515e-05, + "loss": 2.8336, + "step": 24086 + }, + { + 
"epoch": 1.121423749330726, + "grad_norm": 0.32554278119534713, + "learning_rate": 7.884758089902722e-05, + "loss": 2.8131, + "step": 24087 + }, + { + "epoch": 1.121470307516819, + "grad_norm": 0.3266869829179933, + "learning_rate": 7.884536841939447e-05, + "loss": 2.8713, + "step": 24088 + }, + { + "epoch": 1.1215168657029122, + "grad_norm": 0.3367375497708759, + "learning_rate": 7.88431558551034e-05, + "loss": 2.8008, + "step": 24089 + }, + { + "epoch": 1.1215634238890053, + "grad_norm": 0.3387047588032309, + "learning_rate": 7.884094320616051e-05, + "loss": 2.8264, + "step": 24090 + }, + { + "epoch": 1.1216099820750984, + "grad_norm": 0.31567736647068595, + "learning_rate": 7.883873047257229e-05, + "loss": 2.8502, + "step": 24091 + }, + { + "epoch": 1.1216565402611913, + "grad_norm": 0.32000488348064304, + "learning_rate": 7.883651765434523e-05, + "loss": 2.8425, + "step": 24092 + }, + { + "epoch": 1.1217030984472844, + "grad_norm": 0.3354060206323742, + "learning_rate": 7.883430475148586e-05, + "loss": 2.9626, + "step": 24093 + }, + { + "epoch": 1.1217496566333776, + "grad_norm": 0.3501060367033388, + "learning_rate": 7.883209176400062e-05, + "loss": 2.8386, + "step": 24094 + }, + { + "epoch": 1.1217962148194707, + "grad_norm": 0.3772431772696176, + "learning_rate": 7.882987869189605e-05, + "loss": 2.8589, + "step": 24095 + }, + { + "epoch": 1.1218427730055638, + "grad_norm": 0.3362828998074935, + "learning_rate": 7.882766553517861e-05, + "loss": 2.7576, + "step": 24096 + }, + { + "epoch": 1.1218893311916567, + "grad_norm": 0.36691219484221005, + "learning_rate": 7.882545229385483e-05, + "loss": 2.8429, + "step": 24097 + }, + { + "epoch": 1.1219358893777498, + "grad_norm": 0.36222224781373447, + "learning_rate": 7.882323896793116e-05, + "loss": 2.7825, + "step": 24098 + }, + { + "epoch": 1.121982447563843, + "grad_norm": 0.3499883875902806, + "learning_rate": 7.882102555741415e-05, + "loss": 2.7779, + "step": 24099 + }, + { + "epoch": 1.122029005749936, + "grad_norm": 0.38534465101501236, + "learning_rate": 7.881881206231025e-05, + "loss": 2.7961, + "step": 24100 + }, + { + "epoch": 1.1220755639360291, + "grad_norm": 0.3563872348279372, + "learning_rate": 7.881659848262598e-05, + "loss": 2.815, + "step": 24101 + }, + { + "epoch": 1.122122122122122, + "grad_norm": 0.33130819966953745, + "learning_rate": 7.881438481836785e-05, + "loss": 2.83, + "step": 24102 + }, + { + "epoch": 1.1221686803082151, + "grad_norm": 0.3415972666819842, + "learning_rate": 7.881217106954232e-05, + "loss": 2.802, + "step": 24103 + }, + { + "epoch": 1.1222152384943083, + "grad_norm": 0.3424523832147882, + "learning_rate": 7.880995723615591e-05, + "loss": 2.8532, + "step": 24104 + }, + { + "epoch": 1.1222617966804014, + "grad_norm": 0.34688004894429403, + "learning_rate": 7.88077433182151e-05, + "loss": 2.742, + "step": 24105 + }, + { + "epoch": 1.1223083548664945, + "grad_norm": 0.32535437134193923, + "learning_rate": 7.880552931572644e-05, + "loss": 2.7472, + "step": 24106 + }, + { + "epoch": 1.1223549130525874, + "grad_norm": 0.3205334332190653, + "learning_rate": 7.880331522869636e-05, + "loss": 2.8302, + "step": 24107 + }, + { + "epoch": 1.1224014712386805, + "grad_norm": 0.3343264824262363, + "learning_rate": 7.880110105713139e-05, + "loss": 2.8183, + "step": 24108 + }, + { + "epoch": 1.1224480294247736, + "grad_norm": 0.3584259407991692, + "learning_rate": 7.879888680103803e-05, + "loss": 2.7958, + "step": 24109 + }, + { + "epoch": 1.1224945876108667, + "grad_norm": 0.32816660130994196, + "learning_rate": 
7.879667246042278e-05, + "loss": 2.8434, + "step": 24110 + }, + { + "epoch": 1.1225411457969598, + "grad_norm": 0.3253049174462859, + "learning_rate": 7.879445803529213e-05, + "loss": 2.8481, + "step": 24111 + }, + { + "epoch": 1.1225877039830527, + "grad_norm": 0.3391279794064507, + "learning_rate": 7.879224352565258e-05, + "loss": 2.8881, + "step": 24112 + }, + { + "epoch": 1.1226342621691459, + "grad_norm": 0.29710180387391544, + "learning_rate": 7.879002893151061e-05, + "loss": 2.752, + "step": 24113 + }, + { + "epoch": 1.122680820355239, + "grad_norm": 0.3046543874898442, + "learning_rate": 7.878781425287277e-05, + "loss": 2.9142, + "step": 24114 + }, + { + "epoch": 1.122727378541332, + "grad_norm": 0.3103326957690777, + "learning_rate": 7.878559948974554e-05, + "loss": 2.8161, + "step": 24115 + }, + { + "epoch": 1.1227739367274252, + "grad_norm": 0.310906867052864, + "learning_rate": 7.87833846421354e-05, + "loss": 2.877, + "step": 24116 + }, + { + "epoch": 1.122820494913518, + "grad_norm": 0.3242475382078229, + "learning_rate": 7.878116971004885e-05, + "loss": 2.8587, + "step": 24117 + }, + { + "epoch": 1.1228670530996112, + "grad_norm": 0.34986165774547906, + "learning_rate": 7.87789546934924e-05, + "loss": 2.7994, + "step": 24118 + }, + { + "epoch": 1.1229136112857043, + "grad_norm": 0.35036339556549195, + "learning_rate": 7.877673959247256e-05, + "loss": 2.7977, + "step": 24119 + }, + { + "epoch": 1.1229601694717974, + "grad_norm": 0.3349050070536185, + "learning_rate": 7.877452440699583e-05, + "loss": 2.8176, + "step": 24120 + }, + { + "epoch": 1.1230067276578906, + "grad_norm": 0.32348086659427566, + "learning_rate": 7.87723091370687e-05, + "loss": 2.8618, + "step": 24121 + }, + { + "epoch": 1.1230532858439835, + "grad_norm": 0.3069663457072848, + "learning_rate": 7.877009378269768e-05, + "loss": 2.7874, + "step": 24122 + }, + { + "epoch": 1.1230998440300766, + "grad_norm": 0.3212037366880143, + "learning_rate": 7.876787834388927e-05, + "loss": 2.8802, + "step": 24123 + }, + { + "epoch": 1.1231464022161697, + "grad_norm": 0.3052587509164805, + "learning_rate": 7.876566282064998e-05, + "loss": 2.8278, + "step": 24124 + }, + { + "epoch": 1.1231929604022628, + "grad_norm": 0.3524981065327235, + "learning_rate": 7.876344721298628e-05, + "loss": 2.8855, + "step": 24125 + }, + { + "epoch": 1.1232395185883557, + "grad_norm": 0.31705115200449135, + "learning_rate": 7.876123152090472e-05, + "loss": 2.7907, + "step": 24126 + }, + { + "epoch": 1.1232860767744488, + "grad_norm": 0.3387496087466643, + "learning_rate": 7.875901574441175e-05, + "loss": 2.8807, + "step": 24127 + }, + { + "epoch": 1.123332634960542, + "grad_norm": 0.3422074239791181, + "learning_rate": 7.875679988351393e-05, + "loss": 2.9692, + "step": 24128 + }, + { + "epoch": 1.123379193146635, + "grad_norm": 0.3201911725100595, + "learning_rate": 7.875458393821772e-05, + "loss": 2.7841, + "step": 24129 + }, + { + "epoch": 1.1234257513327282, + "grad_norm": 0.3471557836673408, + "learning_rate": 7.875236790852966e-05, + "loss": 2.8308, + "step": 24130 + }, + { + "epoch": 1.123472309518821, + "grad_norm": 0.33090104803914017, + "learning_rate": 7.875015179445621e-05, + "loss": 2.7581, + "step": 24131 + }, + { + "epoch": 1.1235188677049142, + "grad_norm": 0.3211575444040225, + "learning_rate": 7.87479355960039e-05, + "loss": 2.8833, + "step": 24132 + }, + { + "epoch": 1.1235654258910073, + "grad_norm": 0.35714556323169017, + "learning_rate": 7.874571931317924e-05, + "loss": 2.8224, + "step": 24133 + }, + { + "epoch": 
1.1236119840771004, + "grad_norm": 0.32511242705318844, + "learning_rate": 7.874350294598872e-05, + "loss": 2.8661, + "step": 24134 + }, + { + "epoch": 1.1236585422631935, + "grad_norm": 0.3232280531869755, + "learning_rate": 7.874128649443886e-05, + "loss": 2.6984, + "step": 24135 + }, + { + "epoch": 1.1237051004492864, + "grad_norm": 0.35899209570154506, + "learning_rate": 7.873906995853614e-05, + "loss": 2.8247, + "step": 24136 + }, + { + "epoch": 1.1237516586353795, + "grad_norm": 0.32312742255096033, + "learning_rate": 7.873685333828709e-05, + "loss": 2.7531, + "step": 24137 + }, + { + "epoch": 1.1237982168214726, + "grad_norm": 0.3223387546267549, + "learning_rate": 7.87346366336982e-05, + "loss": 2.8376, + "step": 24138 + }, + { + "epoch": 1.1238447750075657, + "grad_norm": 0.32994568491741916, + "learning_rate": 7.8732419844776e-05, + "loss": 2.7183, + "step": 24139 + }, + { + "epoch": 1.1238913331936589, + "grad_norm": 0.3337373466153219, + "learning_rate": 7.873020297152696e-05, + "loss": 2.7576, + "step": 24140 + }, + { + "epoch": 1.1239378913797518, + "grad_norm": 0.349041679851906, + "learning_rate": 7.872798601395761e-05, + "loss": 2.7435, + "step": 24141 + }, + { + "epoch": 1.1239844495658449, + "grad_norm": 0.3528578253903042, + "learning_rate": 7.872576897207444e-05, + "loss": 2.7993, + "step": 24142 + }, + { + "epoch": 1.124031007751938, + "grad_norm": 0.35258338181179666, + "learning_rate": 7.872355184588398e-05, + "loss": 2.9199, + "step": 24143 + }, + { + "epoch": 1.124077565938031, + "grad_norm": 0.3512955807846841, + "learning_rate": 7.872133463539273e-05, + "loss": 2.8254, + "step": 24144 + }, + { + "epoch": 1.1241241241241242, + "grad_norm": 0.3334454930340987, + "learning_rate": 7.871911734060719e-05, + "loss": 2.8043, + "step": 24145 + }, + { + "epoch": 1.124170682310217, + "grad_norm": 0.33303571808105764, + "learning_rate": 7.871689996153387e-05, + "loss": 2.8729, + "step": 24146 + }, + { + "epoch": 1.1242172404963102, + "grad_norm": 0.3309493857997413, + "learning_rate": 7.871468249817927e-05, + "loss": 2.8891, + "step": 24147 + }, + { + "epoch": 1.1242637986824033, + "grad_norm": 0.372252326009676, + "learning_rate": 7.871246495054992e-05, + "loss": 2.799, + "step": 24148 + }, + { + "epoch": 1.1243103568684965, + "grad_norm": 0.3284913190387946, + "learning_rate": 7.871024731865229e-05, + "loss": 2.9178, + "step": 24149 + }, + { + "epoch": 1.1243569150545896, + "grad_norm": 0.36082498074525243, + "learning_rate": 7.870802960249292e-05, + "loss": 2.9577, + "step": 24150 + }, + { + "epoch": 1.1244034732406825, + "grad_norm": 0.32815047929053953, + "learning_rate": 7.870581180207833e-05, + "loss": 2.8681, + "step": 24151 + }, + { + "epoch": 1.1244500314267756, + "grad_norm": 0.3761023980866103, + "learning_rate": 7.8703593917415e-05, + "loss": 2.8111, + "step": 24152 + }, + { + "epoch": 1.1244965896128687, + "grad_norm": 0.35661206646603044, + "learning_rate": 7.870137594850944e-05, + "loss": 2.8203, + "step": 24153 + }, + { + "epoch": 1.1245431477989618, + "grad_norm": 0.370968614457781, + "learning_rate": 7.869915789536818e-05, + "loss": 2.9255, + "step": 24154 + }, + { + "epoch": 1.124589705985055, + "grad_norm": 0.33925511542552667, + "learning_rate": 7.869693975799772e-05, + "loss": 2.8146, + "step": 24155 + }, + { + "epoch": 1.1246362641711478, + "grad_norm": 0.38706717308612026, + "learning_rate": 7.869472153640456e-05, + "loss": 2.8263, + "step": 24156 + }, + { + "epoch": 1.124682822357241, + "grad_norm": 0.34043238694769146, + "learning_rate": 
7.869250323059523e-05, + "loss": 2.759, + "step": 24157 + }, + { + "epoch": 1.124729380543334, + "grad_norm": 0.36665772058481916, + "learning_rate": 7.869028484057621e-05, + "loss": 2.864, + "step": 24158 + }, + { + "epoch": 1.1247759387294272, + "grad_norm": 0.34843246862337385, + "learning_rate": 7.868806636635405e-05, + "loss": 2.8121, + "step": 24159 + }, + { + "epoch": 1.1248224969155203, + "grad_norm": 0.3940114429765837, + "learning_rate": 7.868584780793525e-05, + "loss": 2.8141, + "step": 24160 + }, + { + "epoch": 1.1248690551016132, + "grad_norm": 0.3402471905548546, + "learning_rate": 7.86836291653263e-05, + "loss": 2.7077, + "step": 24161 + }, + { + "epoch": 1.1249156132877063, + "grad_norm": 0.31728473720919326, + "learning_rate": 7.868141043853372e-05, + "loss": 2.7831, + "step": 24162 + }, + { + "epoch": 1.1249621714737994, + "grad_norm": 0.3283573398833088, + "learning_rate": 7.867919162756402e-05, + "loss": 2.8359, + "step": 24163 + }, + { + "epoch": 1.1250087296598925, + "grad_norm": 0.3443519025459766, + "learning_rate": 7.867697273242372e-05, + "loss": 2.7775, + "step": 24164 + }, + { + "epoch": 1.1250552878459854, + "grad_norm": 0.31531435015406944, + "learning_rate": 7.867475375311934e-05, + "loss": 2.8375, + "step": 24165 + }, + { + "epoch": 1.1251018460320785, + "grad_norm": 0.3426494807975622, + "learning_rate": 7.867253468965739e-05, + "loss": 2.8213, + "step": 24166 + }, + { + "epoch": 1.1251484042181716, + "grad_norm": 0.3087348602086498, + "learning_rate": 7.867031554204436e-05, + "loss": 2.8889, + "step": 24167 + }, + { + "epoch": 1.1251949624042648, + "grad_norm": 0.3776516108586442, + "learning_rate": 7.866809631028677e-05, + "loss": 2.8945, + "step": 24168 + }, + { + "epoch": 1.1252415205903579, + "grad_norm": 0.30629010778070487, + "learning_rate": 7.866587699439116e-05, + "loss": 2.8503, + "step": 24169 + }, + { + "epoch": 1.125288078776451, + "grad_norm": 0.35157394719509494, + "learning_rate": 7.866365759436403e-05, + "loss": 2.8495, + "step": 24170 + }, + { + "epoch": 1.1253346369625439, + "grad_norm": 0.3084593548600797, + "learning_rate": 7.866143811021187e-05, + "loss": 2.8269, + "step": 24171 + }, + { + "epoch": 1.125381195148637, + "grad_norm": 0.33961887183573136, + "learning_rate": 7.865921854194122e-05, + "loss": 2.9195, + "step": 24172 + }, + { + "epoch": 1.12542775333473, + "grad_norm": 0.32239006776051493, + "learning_rate": 7.865699888955859e-05, + "loss": 2.9133, + "step": 24173 + }, + { + "epoch": 1.1254743115208232, + "grad_norm": 0.318130861955315, + "learning_rate": 7.86547791530705e-05, + "loss": 2.7915, + "step": 24174 + }, + { + "epoch": 1.1255208697069161, + "grad_norm": 0.30107466853548226, + "learning_rate": 7.865255933248344e-05, + "loss": 2.7392, + "step": 24175 + }, + { + "epoch": 1.1255674278930092, + "grad_norm": 0.3203824911895364, + "learning_rate": 7.865033942780395e-05, + "loss": 2.8296, + "step": 24176 + }, + { + "epoch": 1.1256139860791023, + "grad_norm": 0.30497456992044414, + "learning_rate": 7.864811943903853e-05, + "loss": 2.8458, + "step": 24177 + }, + { + "epoch": 1.1256605442651955, + "grad_norm": 0.3226300478404272, + "learning_rate": 7.86458993661937e-05, + "loss": 2.8878, + "step": 24178 + }, + { + "epoch": 1.1257071024512886, + "grad_norm": 0.29617811524111837, + "learning_rate": 7.864367920927599e-05, + "loss": 2.8381, + "step": 24179 + }, + { + "epoch": 1.1257536606373817, + "grad_norm": 0.3435077342639725, + "learning_rate": 7.864145896829189e-05, + "loss": 2.8833, + "step": 24180 + }, + { + "epoch": 
1.1258002188234746, + "grad_norm": 0.31744856903107443, + "learning_rate": 7.863923864324793e-05, + "loss": 2.8289, + "step": 24181 + }, + { + "epoch": 1.1258467770095677, + "grad_norm": 0.3055007199781715, + "learning_rate": 7.863701823415064e-05, + "loss": 2.8702, + "step": 24182 + }, + { + "epoch": 1.1258933351956608, + "grad_norm": 0.31323401138201645, + "learning_rate": 7.863479774100653e-05, + "loss": 2.7467, + "step": 24183 + }, + { + "epoch": 1.125939893381754, + "grad_norm": 0.3115543525179002, + "learning_rate": 7.863257716382208e-05, + "loss": 2.9622, + "step": 24184 + }, + { + "epoch": 1.1259864515678468, + "grad_norm": 0.33256691812816974, + "learning_rate": 7.863035650260386e-05, + "loss": 2.9102, + "step": 24185 + }, + { + "epoch": 1.12603300975394, + "grad_norm": 0.33363480202966894, + "learning_rate": 7.862813575735835e-05, + "loss": 2.7859, + "step": 24186 + }, + { + "epoch": 1.126079567940033, + "grad_norm": 0.3086255521266231, + "learning_rate": 7.86259149280921e-05, + "loss": 2.8695, + "step": 24187 + }, + { + "epoch": 1.1261261261261262, + "grad_norm": 0.3257369144321254, + "learning_rate": 7.862369401481157e-05, + "loss": 2.8429, + "step": 24188 + }, + { + "epoch": 1.1261726843122193, + "grad_norm": 0.3326034866461616, + "learning_rate": 7.862147301752336e-05, + "loss": 2.871, + "step": 24189 + }, + { + "epoch": 1.1262192424983122, + "grad_norm": 0.31245898770369046, + "learning_rate": 7.861925193623392e-05, + "loss": 2.6937, + "step": 24190 + }, + { + "epoch": 1.1262658006844053, + "grad_norm": 0.3334412424868139, + "learning_rate": 7.861703077094982e-05, + "loss": 2.7815, + "step": 24191 + }, + { + "epoch": 1.1263123588704984, + "grad_norm": 0.3588226477467225, + "learning_rate": 7.861480952167753e-05, + "loss": 2.9379, + "step": 24192 + }, + { + "epoch": 1.1263589170565915, + "grad_norm": 0.3314459293031671, + "learning_rate": 7.861258818842361e-05, + "loss": 2.8285, + "step": 24193 + }, + { + "epoch": 1.1264054752426846, + "grad_norm": 0.3375387676152624, + "learning_rate": 7.861036677119456e-05, + "loss": 2.8813, + "step": 24194 + }, + { + "epoch": 1.1264520334287775, + "grad_norm": 0.36175876993393696, + "learning_rate": 7.86081452699969e-05, + "loss": 2.7463, + "step": 24195 + }, + { + "epoch": 1.1264985916148706, + "grad_norm": 0.3145907712767604, + "learning_rate": 7.860592368483715e-05, + "loss": 2.8358, + "step": 24196 + }, + { + "epoch": 1.1265451498009638, + "grad_norm": 0.3454498648695931, + "learning_rate": 7.860370201572182e-05, + "loss": 2.8522, + "step": 24197 + }, + { + "epoch": 1.1265917079870569, + "grad_norm": 0.33486570144252265, + "learning_rate": 7.860148026265746e-05, + "loss": 2.7546, + "step": 24198 + }, + { + "epoch": 1.12663826617315, + "grad_norm": 0.3550687809089399, + "learning_rate": 7.859925842565057e-05, + "loss": 2.8138, + "step": 24199 + }, + { + "epoch": 1.1266848243592429, + "grad_norm": 0.32908975443162436, + "learning_rate": 7.859703650470767e-05, + "loss": 2.7869, + "step": 24200 + }, + { + "epoch": 1.126731382545336, + "grad_norm": 0.3661446897874773, + "learning_rate": 7.859481449983528e-05, + "loss": 2.8395, + "step": 24201 + }, + { + "epoch": 1.1267779407314291, + "grad_norm": 0.33997139715502656, + "learning_rate": 7.859259241103994e-05, + "loss": 2.7983, + "step": 24202 + }, + { + "epoch": 1.1268244989175222, + "grad_norm": 0.35926641791501296, + "learning_rate": 7.859037023832813e-05, + "loss": 2.9452, + "step": 24203 + }, + { + "epoch": 1.1268710571036151, + "grad_norm": 0.33301216099345976, + "learning_rate": 
7.858814798170644e-05, + "loss": 2.7886, + "step": 24204 + }, + { + "epoch": 1.1269176152897082, + "grad_norm": 0.3354957043721329, + "learning_rate": 7.858592564118133e-05, + "loss": 2.7606, + "step": 24205 + }, + { + "epoch": 1.1269641734758014, + "grad_norm": 0.33109540180355523, + "learning_rate": 7.858370321675935e-05, + "loss": 2.875, + "step": 24206 + }, + { + "epoch": 1.1270107316618945, + "grad_norm": 0.349476220220011, + "learning_rate": 7.858148070844702e-05, + "loss": 2.7851, + "step": 24207 + }, + { + "epoch": 1.1270572898479876, + "grad_norm": 0.34596845758918093, + "learning_rate": 7.857925811625085e-05, + "loss": 2.825, + "step": 24208 + }, + { + "epoch": 1.1271038480340807, + "grad_norm": 0.36957217001436643, + "learning_rate": 7.857703544017737e-05, + "loss": 2.7642, + "step": 24209 + }, + { + "epoch": 1.1271504062201736, + "grad_norm": 0.32198463156481094, + "learning_rate": 7.857481268023312e-05, + "loss": 2.7979, + "step": 24210 + }, + { + "epoch": 1.1271969644062667, + "grad_norm": 0.34677565786659836, + "learning_rate": 7.857258983642461e-05, + "loss": 2.7763, + "step": 24211 + }, + { + "epoch": 1.1272435225923598, + "grad_norm": 0.34017617855440596, + "learning_rate": 7.857036690875836e-05, + "loss": 2.8736, + "step": 24212 + }, + { + "epoch": 1.127290080778453, + "grad_norm": 0.30901816226478845, + "learning_rate": 7.856814389724088e-05, + "loss": 2.7818, + "step": 24213 + }, + { + "epoch": 1.1273366389645458, + "grad_norm": 0.3669492454050701, + "learning_rate": 7.856592080187875e-05, + "loss": 2.8434, + "step": 24214 + }, + { + "epoch": 1.127383197150639, + "grad_norm": 0.3182918775188449, + "learning_rate": 7.856369762267843e-05, + "loss": 2.7531, + "step": 24215 + }, + { + "epoch": 1.127429755336732, + "grad_norm": 0.3470038112058868, + "learning_rate": 7.856147435964647e-05, + "loss": 2.7302, + "step": 24216 + }, + { + "epoch": 1.1274763135228252, + "grad_norm": 0.3068423561385258, + "learning_rate": 7.85592510127894e-05, + "loss": 2.8119, + "step": 24217 + }, + { + "epoch": 1.1275228717089183, + "grad_norm": 0.36896329880190953, + "learning_rate": 7.855702758211374e-05, + "loss": 2.7269, + "step": 24218 + }, + { + "epoch": 1.1275694298950114, + "grad_norm": 0.3148954525151845, + "learning_rate": 7.855480406762603e-05, + "loss": 2.8832, + "step": 24219 + }, + { + "epoch": 1.1276159880811043, + "grad_norm": 0.35414887254833605, + "learning_rate": 7.855258046933276e-05, + "loss": 2.8229, + "step": 24220 + }, + { + "epoch": 1.1276625462671974, + "grad_norm": 0.31644175641463956, + "learning_rate": 7.85503567872405e-05, + "loss": 2.8566, + "step": 24221 + }, + { + "epoch": 1.1277091044532905, + "grad_norm": 0.3536111009526177, + "learning_rate": 7.854813302135575e-05, + "loss": 2.9196, + "step": 24222 + }, + { + "epoch": 1.1277556626393836, + "grad_norm": 0.31594365779516753, + "learning_rate": 7.854590917168505e-05, + "loss": 2.8652, + "step": 24223 + }, + { + "epoch": 1.1278022208254765, + "grad_norm": 0.3686481989056276, + "learning_rate": 7.85436852382349e-05, + "loss": 2.786, + "step": 24224 + }, + { + "epoch": 1.1278487790115697, + "grad_norm": 0.33940114549345324, + "learning_rate": 7.854146122101185e-05, + "loss": 2.8226, + "step": 24225 + }, + { + "epoch": 1.1278953371976628, + "grad_norm": 0.3496317890327111, + "learning_rate": 7.853923712002243e-05, + "loss": 2.8315, + "step": 24226 + }, + { + "epoch": 1.1279418953837559, + "grad_norm": 0.3296054547171012, + "learning_rate": 7.853701293527316e-05, + "loss": 2.8452, + "step": 24227 + }, + { + "epoch": 
1.127988453569849, + "grad_norm": 0.3306710783068424, + "learning_rate": 7.853478866677058e-05, + "loss": 2.8154, + "step": 24228 + }, + { + "epoch": 1.128035011755942, + "grad_norm": 0.3580072192150842, + "learning_rate": 7.853256431452118e-05, + "loss": 2.7913, + "step": 24229 + }, + { + "epoch": 1.128081569942035, + "grad_norm": 0.3460469136551244, + "learning_rate": 7.853033987853154e-05, + "loss": 2.8452, + "step": 24230 + }, + { + "epoch": 1.1281281281281281, + "grad_norm": 0.33292847767411987, + "learning_rate": 7.852811535880815e-05, + "loss": 2.8412, + "step": 24231 + }, + { + "epoch": 1.1281746863142212, + "grad_norm": 0.3289225234105657, + "learning_rate": 7.852589075535756e-05, + "loss": 2.7813, + "step": 24232 + }, + { + "epoch": 1.1282212445003144, + "grad_norm": 0.3113376353299858, + "learning_rate": 7.852366606818629e-05, + "loss": 2.7086, + "step": 24233 + }, + { + "epoch": 1.1282678026864073, + "grad_norm": 0.3106143340163224, + "learning_rate": 7.852144129730086e-05, + "loss": 2.7819, + "step": 24234 + }, + { + "epoch": 1.1283143608725004, + "grad_norm": 0.30982813456831654, + "learning_rate": 7.851921644270782e-05, + "loss": 2.8169, + "step": 24235 + }, + { + "epoch": 1.1283609190585935, + "grad_norm": 0.3361975331795765, + "learning_rate": 7.851699150441368e-05, + "loss": 2.7488, + "step": 24236 + }, + { + "epoch": 1.1284074772446866, + "grad_norm": 0.3410302332680139, + "learning_rate": 7.851476648242498e-05, + "loss": 2.8364, + "step": 24237 + }, + { + "epoch": 1.1284540354307797, + "grad_norm": 0.3282038373597605, + "learning_rate": 7.851254137674825e-05, + "loss": 2.6932, + "step": 24238 + }, + { + "epoch": 1.1285005936168726, + "grad_norm": 0.3436578885106205, + "learning_rate": 7.851031618739002e-05, + "loss": 2.8458, + "step": 24239 + }, + { + "epoch": 1.1285471518029657, + "grad_norm": 0.3274712062892773, + "learning_rate": 7.850809091435683e-05, + "loss": 2.7827, + "step": 24240 + }, + { + "epoch": 1.1285937099890588, + "grad_norm": 0.33029853188632535, + "learning_rate": 7.85058655576552e-05, + "loss": 2.8041, + "step": 24241 + }, + { + "epoch": 1.128640268175152, + "grad_norm": 0.365918149206762, + "learning_rate": 7.850364011729166e-05, + "loss": 2.8143, + "step": 24242 + }, + { + "epoch": 1.1286868263612448, + "grad_norm": 0.32850310865477184, + "learning_rate": 7.850141459327274e-05, + "loss": 2.8525, + "step": 24243 + }, + { + "epoch": 1.128733384547338, + "grad_norm": 0.3642306673281727, + "learning_rate": 7.849918898560498e-05, + "loss": 2.7557, + "step": 24244 + }, + { + "epoch": 1.128779942733431, + "grad_norm": 0.3501229575159532, + "learning_rate": 7.84969632942949e-05, + "loss": 2.8534, + "step": 24245 + }, + { + "epoch": 1.1288265009195242, + "grad_norm": 0.33591095479229777, + "learning_rate": 7.849473751934904e-05, + "loss": 2.8709, + "step": 24246 + }, + { + "epoch": 1.1288730591056173, + "grad_norm": 0.33724955991365085, + "learning_rate": 7.849251166077394e-05, + "loss": 2.8065, + "step": 24247 + }, + { + "epoch": 1.1289196172917104, + "grad_norm": 0.31901547375357314, + "learning_rate": 7.849028571857613e-05, + "loss": 2.8418, + "step": 24248 + }, + { + "epoch": 1.1289661754778033, + "grad_norm": 0.3429865347523959, + "learning_rate": 7.848805969276212e-05, + "loss": 2.8346, + "step": 24249 + }, + { + "epoch": 1.1290127336638964, + "grad_norm": 0.3365017267074372, + "learning_rate": 7.848583358333848e-05, + "loss": 2.7813, + "step": 24250 + }, + { + "epoch": 1.1290592918499895, + "grad_norm": 0.31890989957360727, + "learning_rate": 
7.848360739031173e-05, + "loss": 2.7023, + "step": 24251 + }, + { + "epoch": 1.1291058500360827, + "grad_norm": 0.31803298737285374, + "learning_rate": 7.848138111368837e-05, + "loss": 2.8559, + "step": 24252 + }, + { + "epoch": 1.1291524082221756, + "grad_norm": 0.359136870569315, + "learning_rate": 7.847915475347499e-05, + "loss": 2.9172, + "step": 24253 + }, + { + "epoch": 1.1291989664082687, + "grad_norm": 0.3248503049993484, + "learning_rate": 7.847692830967808e-05, + "loss": 2.8328, + "step": 24254 + }, + { + "epoch": 1.1292455245943618, + "grad_norm": 0.3435769926137733, + "learning_rate": 7.84747017823042e-05, + "loss": 2.8028, + "step": 24255 + }, + { + "epoch": 1.129292082780455, + "grad_norm": 0.3262082627640915, + "learning_rate": 7.847247517135988e-05, + "loss": 2.8226, + "step": 24256 + }, + { + "epoch": 1.129338640966548, + "grad_norm": 0.3102187938216054, + "learning_rate": 7.847024847685164e-05, + "loss": 2.7958, + "step": 24257 + }, + { + "epoch": 1.1293851991526411, + "grad_norm": 0.36258005222444284, + "learning_rate": 7.846802169878602e-05, + "loss": 2.8641, + "step": 24258 + }, + { + "epoch": 1.129431757338734, + "grad_norm": 0.35707172618758026, + "learning_rate": 7.846579483716958e-05, + "loss": 2.7959, + "step": 24259 + }, + { + "epoch": 1.1294783155248271, + "grad_norm": 0.3559572768802686, + "learning_rate": 7.846356789200882e-05, + "loss": 2.8721, + "step": 24260 + }, + { + "epoch": 1.1295248737109203, + "grad_norm": 0.37830534994928466, + "learning_rate": 7.84613408633103e-05, + "loss": 2.7671, + "step": 24261 + }, + { + "epoch": 1.1295714318970134, + "grad_norm": 0.30647833487870463, + "learning_rate": 7.845911375108055e-05, + "loss": 2.8327, + "step": 24262 + }, + { + "epoch": 1.1296179900831063, + "grad_norm": 0.3479367837445202, + "learning_rate": 7.84568865553261e-05, + "loss": 2.787, + "step": 24263 + }, + { + "epoch": 1.1296645482691994, + "grad_norm": 0.3114596930743949, + "learning_rate": 7.845465927605349e-05, + "loss": 2.807, + "step": 24264 + }, + { + "epoch": 1.1297111064552925, + "grad_norm": 0.3329478637635509, + "learning_rate": 7.845243191326927e-05, + "loss": 2.8565, + "step": 24265 + }, + { + "epoch": 1.1297576646413856, + "grad_norm": 0.32137530874846154, + "learning_rate": 7.845020446697995e-05, + "loss": 2.8397, + "step": 24266 + }, + { + "epoch": 1.1298042228274787, + "grad_norm": 0.34003207528676815, + "learning_rate": 7.844797693719208e-05, + "loss": 2.8458, + "step": 24267 + }, + { + "epoch": 1.1298507810135716, + "grad_norm": 0.3187142118388772, + "learning_rate": 7.844574932391222e-05, + "loss": 2.8719, + "step": 24268 + }, + { + "epoch": 1.1298973391996647, + "grad_norm": 0.3245197462830862, + "learning_rate": 7.844352162714688e-05, + "loss": 2.7796, + "step": 24269 + }, + { + "epoch": 1.1299438973857578, + "grad_norm": 0.3137447273973675, + "learning_rate": 7.84412938469026e-05, + "loss": 2.8548, + "step": 24270 + }, + { + "epoch": 1.129990455571851, + "grad_norm": 0.3086041576452652, + "learning_rate": 7.843906598318593e-05, + "loss": 2.7104, + "step": 24271 + }, + { + "epoch": 1.130037013757944, + "grad_norm": 0.28141133939141844, + "learning_rate": 7.843683803600339e-05, + "loss": 2.7956, + "step": 24272 + }, + { + "epoch": 1.130083571944037, + "grad_norm": 0.3296765877046014, + "learning_rate": 7.843461000536156e-05, + "loss": 2.8558, + "step": 24273 + }, + { + "epoch": 1.13013013013013, + "grad_norm": 0.3301572215992106, + "learning_rate": 7.843238189126691e-05, + "loss": 2.864, + "step": 24274 + }, + { + "epoch": 
1.1301766883162232, + "grad_norm": 0.3070007454906071, + "learning_rate": 7.843015369372606e-05, + "loss": 2.7918, + "step": 24275 + }, + { + "epoch": 1.1302232465023163, + "grad_norm": 0.34618489740023695, + "learning_rate": 7.842792541274548e-05, + "loss": 2.7437, + "step": 24276 + }, + { + "epoch": 1.1302698046884094, + "grad_norm": 0.29465040566182643, + "learning_rate": 7.842569704833177e-05, + "loss": 2.8371, + "step": 24277 + }, + { + "epoch": 1.1303163628745023, + "grad_norm": 0.3298214734152486, + "learning_rate": 7.842346860049141e-05, + "loss": 2.8848, + "step": 24278 + }, + { + "epoch": 1.1303629210605954, + "grad_norm": 0.2956449242644349, + "learning_rate": 7.8421240069231e-05, + "loss": 2.7175, + "step": 24279 + }, + { + "epoch": 1.1304094792466886, + "grad_norm": 0.32490532678566797, + "learning_rate": 7.841901145455703e-05, + "loss": 2.7625, + "step": 24280 + }, + { + "epoch": 1.1304560374327817, + "grad_norm": 0.3053060231609339, + "learning_rate": 7.841678275647605e-05, + "loss": 2.8131, + "step": 24281 + }, + { + "epoch": 1.1305025956188748, + "grad_norm": 0.323887002564951, + "learning_rate": 7.841455397499465e-05, + "loss": 2.8353, + "step": 24282 + }, + { + "epoch": 1.1305491538049677, + "grad_norm": 0.3190673031421919, + "learning_rate": 7.84123251101193e-05, + "loss": 2.8843, + "step": 24283 + }, + { + "epoch": 1.1305957119910608, + "grad_norm": 0.332310899111926, + "learning_rate": 7.841009616185657e-05, + "loss": 2.8147, + "step": 24284 + }, + { + "epoch": 1.130642270177154, + "grad_norm": 0.3316118914138421, + "learning_rate": 7.840786713021302e-05, + "loss": 2.8127, + "step": 24285 + }, + { + "epoch": 1.130688828363247, + "grad_norm": 0.3251079994073416, + "learning_rate": 7.840563801519519e-05, + "loss": 2.899, + "step": 24286 + }, + { + "epoch": 1.1307353865493401, + "grad_norm": 0.3457527620614639, + "learning_rate": 7.84034088168096e-05, + "loss": 2.8448, + "step": 24287 + }, + { + "epoch": 1.130781944735433, + "grad_norm": 0.34427092635398737, + "learning_rate": 7.840117953506281e-05, + "loss": 2.8352, + "step": 24288 + }, + { + "epoch": 1.1308285029215261, + "grad_norm": 0.336550798813674, + "learning_rate": 7.839895016996135e-05, + "loss": 2.8111, + "step": 24289 + }, + { + "epoch": 1.1308750611076193, + "grad_norm": 0.3525342582614626, + "learning_rate": 7.839672072151177e-05, + "loss": 2.8312, + "step": 24290 + }, + { + "epoch": 1.1309216192937124, + "grad_norm": 0.3471291431262084, + "learning_rate": 7.83944911897206e-05, + "loss": 2.8165, + "step": 24291 + }, + { + "epoch": 1.1309681774798053, + "grad_norm": 0.3435215660422372, + "learning_rate": 7.83922615745944e-05, + "loss": 2.8455, + "step": 24292 + }, + { + "epoch": 1.1310147356658984, + "grad_norm": 0.35637623958040926, + "learning_rate": 7.839003187613972e-05, + "loss": 2.7352, + "step": 24293 + }, + { + "epoch": 1.1310612938519915, + "grad_norm": 0.3162108533555256, + "learning_rate": 7.838780209436309e-05, + "loss": 2.8773, + "step": 24294 + }, + { + "epoch": 1.1311078520380846, + "grad_norm": 0.3341466104564472, + "learning_rate": 7.838557222927108e-05, + "loss": 2.7144, + "step": 24295 + }, + { + "epoch": 1.1311544102241777, + "grad_norm": 0.30778103360998865, + "learning_rate": 7.838334228087019e-05, + "loss": 2.849, + "step": 24296 + }, + { + "epoch": 1.1312009684102708, + "grad_norm": 0.3407375276679134, + "learning_rate": 7.838111224916697e-05, + "loss": 2.8297, + "step": 24297 + }, + { + "epoch": 1.1312475265963637, + "grad_norm": 0.3278285098704316, + "learning_rate": 
7.8378882134168e-05, + "loss": 2.802, + "step": 24298 + }, + { + "epoch": 1.1312940847824569, + "grad_norm": 0.3308136209680482, + "learning_rate": 7.83766519358798e-05, + "loss": 2.7546, + "step": 24299 + }, + { + "epoch": 1.13134064296855, + "grad_norm": 0.3234832022352622, + "learning_rate": 7.837442165430893e-05, + "loss": 2.7607, + "step": 24300 + }, + { + "epoch": 1.131387201154643, + "grad_norm": 0.3316103490772076, + "learning_rate": 7.837219128946193e-05, + "loss": 2.824, + "step": 24301 + }, + { + "epoch": 1.131433759340736, + "grad_norm": 0.33176112229087684, + "learning_rate": 7.836996084134534e-05, + "loss": 2.84, + "step": 24302 + }, + { + "epoch": 1.131480317526829, + "grad_norm": 0.37939736656860673, + "learning_rate": 7.836773030996569e-05, + "loss": 2.825, + "step": 24303 + }, + { + "epoch": 1.1315268757129222, + "grad_norm": 0.3303366817772484, + "learning_rate": 7.836549969532957e-05, + "loss": 2.8226, + "step": 24304 + }, + { + "epoch": 1.1315734338990153, + "grad_norm": 0.3657429597089602, + "learning_rate": 7.836326899744351e-05, + "loss": 2.8026, + "step": 24305 + }, + { + "epoch": 1.1316199920851084, + "grad_norm": 0.3304699616565305, + "learning_rate": 7.836103821631402e-05, + "loss": 2.8155, + "step": 24306 + }, + { + "epoch": 1.1316665502712016, + "grad_norm": 0.335402875367481, + "learning_rate": 7.835880735194769e-05, + "loss": 2.8354, + "step": 24307 + }, + { + "epoch": 1.1317131084572944, + "grad_norm": 0.30866483428134606, + "learning_rate": 7.835657640435107e-05, + "loss": 2.8695, + "step": 24308 + }, + { + "epoch": 1.1317596666433876, + "grad_norm": 0.3465497655423328, + "learning_rate": 7.835434537353068e-05, + "loss": 2.7768, + "step": 24309 + }, + { + "epoch": 1.1318062248294807, + "grad_norm": 0.35616945384348514, + "learning_rate": 7.835211425949307e-05, + "loss": 2.8912, + "step": 24310 + }, + { + "epoch": 1.1318527830155738, + "grad_norm": 0.322566272358701, + "learning_rate": 7.83498830622448e-05, + "loss": 2.8212, + "step": 24311 + }, + { + "epoch": 1.1318993412016667, + "grad_norm": 0.3319054123247439, + "learning_rate": 7.834765178179244e-05, + "loss": 2.8185, + "step": 24312 + }, + { + "epoch": 1.1319458993877598, + "grad_norm": 0.3521933971099203, + "learning_rate": 7.83454204181425e-05, + "loss": 2.8411, + "step": 24313 + }, + { + "epoch": 1.131992457573853, + "grad_norm": 0.3165532634972966, + "learning_rate": 7.834318897130154e-05, + "loss": 2.7838, + "step": 24314 + }, + { + "epoch": 1.132039015759946, + "grad_norm": 0.3717414148545998, + "learning_rate": 7.834095744127611e-05, + "loss": 2.8414, + "step": 24315 + }, + { + "epoch": 1.1320855739460391, + "grad_norm": 0.3026572381620996, + "learning_rate": 7.833872582807276e-05, + "loss": 2.8122, + "step": 24316 + }, + { + "epoch": 1.132132132132132, + "grad_norm": 0.3394736348446499, + "learning_rate": 7.833649413169805e-05, + "loss": 2.7932, + "step": 24317 + }, + { + "epoch": 1.1321786903182252, + "grad_norm": 0.32146194122380783, + "learning_rate": 7.833426235215852e-05, + "loss": 2.835, + "step": 24318 + }, + { + "epoch": 1.1322252485043183, + "grad_norm": 0.3219736779836005, + "learning_rate": 7.833203048946072e-05, + "loss": 2.8808, + "step": 24319 + }, + { + "epoch": 1.1322718066904114, + "grad_norm": 0.3678902870004003, + "learning_rate": 7.83297985436112e-05, + "loss": 2.8316, + "step": 24320 + }, + { + "epoch": 1.1323183648765045, + "grad_norm": 0.3108801976544151, + "learning_rate": 7.832756651461652e-05, + "loss": 2.836, + "step": 24321 + }, + { + "epoch": 1.1323649230625974, + 
"grad_norm": 0.35340370863381715, + "learning_rate": 7.832533440248322e-05, + "loss": 2.7268, + "step": 24322 + }, + { + "epoch": 1.1324114812486905, + "grad_norm": 0.305721310487253, + "learning_rate": 7.832310220721784e-05, + "loss": 2.7872, + "step": 24323 + }, + { + "epoch": 1.1324580394347836, + "grad_norm": 0.32800998585891766, + "learning_rate": 7.832086992882697e-05, + "loss": 2.7947, + "step": 24324 + }, + { + "epoch": 1.1325045976208767, + "grad_norm": 0.336153750901851, + "learning_rate": 7.831863756731712e-05, + "loss": 2.8243, + "step": 24325 + }, + { + "epoch": 1.1325511558069699, + "grad_norm": 0.3058707960784252, + "learning_rate": 7.831640512269484e-05, + "loss": 2.6464, + "step": 24326 + }, + { + "epoch": 1.1325977139930627, + "grad_norm": 0.33371592494677277, + "learning_rate": 7.831417259496674e-05, + "loss": 2.821, + "step": 24327 + }, + { + "epoch": 1.1326442721791559, + "grad_norm": 0.29703872213361066, + "learning_rate": 7.831193998413932e-05, + "loss": 2.7617, + "step": 24328 + }, + { + "epoch": 1.132690830365249, + "grad_norm": 0.364029894836916, + "learning_rate": 7.830970729021914e-05, + "loss": 2.7551, + "step": 24329 + }, + { + "epoch": 1.132737388551342, + "grad_norm": 0.3353450346667563, + "learning_rate": 7.830747451321275e-05, + "loss": 2.7826, + "step": 24330 + }, + { + "epoch": 1.132783946737435, + "grad_norm": 0.35857879977972634, + "learning_rate": 7.830524165312673e-05, + "loss": 2.8308, + "step": 24331 + }, + { + "epoch": 1.132830504923528, + "grad_norm": 0.34249404909711617, + "learning_rate": 7.83030087099676e-05, + "loss": 2.8271, + "step": 24332 + }, + { + "epoch": 1.1328770631096212, + "grad_norm": 0.36848883808285016, + "learning_rate": 7.830077568374193e-05, + "loss": 2.7879, + "step": 24333 + }, + { + "epoch": 1.1329236212957143, + "grad_norm": 0.32934413798886536, + "learning_rate": 7.829854257445629e-05, + "loss": 2.8282, + "step": 24334 + }, + { + "epoch": 1.1329701794818074, + "grad_norm": 0.3326324329238653, + "learning_rate": 7.829630938211719e-05, + "loss": 2.7874, + "step": 24335 + }, + { + "epoch": 1.1330167376679006, + "grad_norm": 0.3441309332494161, + "learning_rate": 7.829407610673121e-05, + "loss": 2.762, + "step": 24336 + }, + { + "epoch": 1.1330632958539935, + "grad_norm": 0.3217993098226881, + "learning_rate": 7.829184274830491e-05, + "loss": 2.8028, + "step": 24337 + }, + { + "epoch": 1.1331098540400866, + "grad_norm": 0.3360149104295978, + "learning_rate": 7.828960930684484e-05, + "loss": 2.863, + "step": 24338 + }, + { + "epoch": 1.1331564122261797, + "grad_norm": 0.29436746117913953, + "learning_rate": 7.828737578235756e-05, + "loss": 2.7635, + "step": 24339 + }, + { + "epoch": 1.1332029704122728, + "grad_norm": 0.3220705714272268, + "learning_rate": 7.828514217484961e-05, + "loss": 2.8068, + "step": 24340 + }, + { + "epoch": 1.1332495285983657, + "grad_norm": 0.31726858863945817, + "learning_rate": 7.828290848432755e-05, + "loss": 2.9138, + "step": 24341 + }, + { + "epoch": 1.1332960867844588, + "grad_norm": 0.33451492663584564, + "learning_rate": 7.828067471079796e-05, + "loss": 2.8015, + "step": 24342 + }, + { + "epoch": 1.133342644970552, + "grad_norm": 0.3112051268844233, + "learning_rate": 7.827844085426737e-05, + "loss": 2.7862, + "step": 24343 + }, + { + "epoch": 1.133389203156645, + "grad_norm": 0.3234816518398205, + "learning_rate": 7.827620691474232e-05, + "loss": 2.7659, + "step": 24344 + }, + { + "epoch": 1.1334357613427382, + "grad_norm": 0.3087035488703951, + "learning_rate": 7.827397289222941e-05, + 
"loss": 2.7172, + "step": 24345 + }, + { + "epoch": 1.1334823195288313, + "grad_norm": 0.3172954685027719, + "learning_rate": 7.827173878673517e-05, + "loss": 2.8807, + "step": 24346 + }, + { + "epoch": 1.1335288777149242, + "grad_norm": 0.2966683526650353, + "learning_rate": 7.826950459826616e-05, + "loss": 2.7718, + "step": 24347 + }, + { + "epoch": 1.1335754359010173, + "grad_norm": 0.32199584560936295, + "learning_rate": 7.826727032682893e-05, + "loss": 2.8903, + "step": 24348 + }, + { + "epoch": 1.1336219940871104, + "grad_norm": 0.3272787735614661, + "learning_rate": 7.826503597243007e-05, + "loss": 2.9632, + "step": 24349 + }, + { + "epoch": 1.1336685522732035, + "grad_norm": 0.3485195410912208, + "learning_rate": 7.826280153507611e-05, + "loss": 2.8806, + "step": 24350 + }, + { + "epoch": 1.1337151104592964, + "grad_norm": 0.34392060305913885, + "learning_rate": 7.826056701477358e-05, + "loss": 2.7952, + "step": 24351 + }, + { + "epoch": 1.1337616686453895, + "grad_norm": 0.3510445080406487, + "learning_rate": 7.82583324115291e-05, + "loss": 2.7762, + "step": 24352 + }, + { + "epoch": 1.1338082268314826, + "grad_norm": 0.33387738333992784, + "learning_rate": 7.825609772534919e-05, + "loss": 2.7523, + "step": 24353 + }, + { + "epoch": 1.1338547850175758, + "grad_norm": 0.3395559600949679, + "learning_rate": 7.825386295624043e-05, + "loss": 2.78, + "step": 24354 + }, + { + "epoch": 1.1339013432036689, + "grad_norm": 0.30722197987168526, + "learning_rate": 7.825162810420934e-05, + "loss": 2.7743, + "step": 24355 + }, + { + "epoch": 1.1339479013897618, + "grad_norm": 0.3311833668686822, + "learning_rate": 7.824939316926252e-05, + "loss": 2.8491, + "step": 24356 + }, + { + "epoch": 1.1339944595758549, + "grad_norm": 0.30165615963525305, + "learning_rate": 7.82471581514065e-05, + "loss": 2.6948, + "step": 24357 + }, + { + "epoch": 1.134041017761948, + "grad_norm": 0.35774635101575797, + "learning_rate": 7.824492305064788e-05, + "loss": 2.8843, + "step": 24358 + }, + { + "epoch": 1.134087575948041, + "grad_norm": 0.3377530090800156, + "learning_rate": 7.824268786699318e-05, + "loss": 2.8245, + "step": 24359 + }, + { + "epoch": 1.1341341341341342, + "grad_norm": 0.32211478767093277, + "learning_rate": 7.824045260044896e-05, + "loss": 2.886, + "step": 24360 + }, + { + "epoch": 1.1341806923202271, + "grad_norm": 0.36229503462544277, + "learning_rate": 7.823821725102181e-05, + "loss": 2.8996, + "step": 24361 + }, + { + "epoch": 1.1342272505063202, + "grad_norm": 0.3397904118813611, + "learning_rate": 7.823598181871827e-05, + "loss": 2.8171, + "step": 24362 + }, + { + "epoch": 1.1342738086924133, + "grad_norm": 0.33743593307193315, + "learning_rate": 7.82337463035449e-05, + "loss": 2.6699, + "step": 24363 + }, + { + "epoch": 1.1343203668785065, + "grad_norm": 0.334681423498277, + "learning_rate": 7.823151070550827e-05, + "loss": 2.781, + "step": 24364 + }, + { + "epoch": 1.1343669250645996, + "grad_norm": 0.30828555906758387, + "learning_rate": 7.822927502461495e-05, + "loss": 2.7483, + "step": 24365 + }, + { + "epoch": 1.1344134832506925, + "grad_norm": 0.34053069495013644, + "learning_rate": 7.822703926087147e-05, + "loss": 2.8013, + "step": 24366 + }, + { + "epoch": 1.1344600414367856, + "grad_norm": 0.3216289077560583, + "learning_rate": 7.822480341428442e-05, + "loss": 2.8309, + "step": 24367 + }, + { + "epoch": 1.1345065996228787, + "grad_norm": 0.3137070945452479, + "learning_rate": 7.822256748486036e-05, + "loss": 2.7735, + "step": 24368 + }, + { + "epoch": 1.1345531578089718, + 
"grad_norm": 0.35384429755690994, + "learning_rate": 7.822033147260584e-05, + "loss": 2.765, + "step": 24369 + }, + { + "epoch": 1.134599715995065, + "grad_norm": 0.3243304759250156, + "learning_rate": 7.821809537752741e-05, + "loss": 2.7325, + "step": 24370 + }, + { + "epoch": 1.1346462741811578, + "grad_norm": 0.37502849606777866, + "learning_rate": 7.821585919963166e-05, + "loss": 2.7898, + "step": 24371 + }, + { + "epoch": 1.134692832367251, + "grad_norm": 0.3220685676341018, + "learning_rate": 7.821362293892515e-05, + "loss": 2.8722, + "step": 24372 + }, + { + "epoch": 1.134739390553344, + "grad_norm": 0.3962314634174989, + "learning_rate": 7.821138659541444e-05, + "loss": 2.8414, + "step": 24373 + }, + { + "epoch": 1.1347859487394372, + "grad_norm": 0.32646803269754127, + "learning_rate": 7.820915016910608e-05, + "loss": 2.8726, + "step": 24374 + }, + { + "epoch": 1.1348325069255303, + "grad_norm": 0.3816631706198856, + "learning_rate": 7.820691366000664e-05, + "loss": 2.7149, + "step": 24375 + }, + { + "epoch": 1.1348790651116232, + "grad_norm": 0.34881642561420995, + "learning_rate": 7.820467706812269e-05, + "loss": 2.7819, + "step": 24376 + }, + { + "epoch": 1.1349256232977163, + "grad_norm": 0.3420877290486459, + "learning_rate": 7.82024403934608e-05, + "loss": 2.7187, + "step": 24377 + }, + { + "epoch": 1.1349721814838094, + "grad_norm": 0.34681508643050896, + "learning_rate": 7.820020363602752e-05, + "loss": 2.8101, + "step": 24378 + }, + { + "epoch": 1.1350187396699025, + "grad_norm": 0.30192687566220955, + "learning_rate": 7.819796679582941e-05, + "loss": 2.7183, + "step": 24379 + }, + { + "epoch": 1.1350652978559954, + "grad_norm": 0.3296899512691402, + "learning_rate": 7.819572987287306e-05, + "loss": 2.7154, + "step": 24380 + }, + { + "epoch": 1.1351118560420885, + "grad_norm": 0.3033409734206575, + "learning_rate": 7.819349286716502e-05, + "loss": 2.7447, + "step": 24381 + }, + { + "epoch": 1.1351584142281816, + "grad_norm": 0.34715939095306775, + "learning_rate": 7.819125577871185e-05, + "loss": 2.8374, + "step": 24382 + }, + { + "epoch": 1.1352049724142748, + "grad_norm": 0.34144355346695954, + "learning_rate": 7.818901860752013e-05, + "loss": 2.8253, + "step": 24383 + }, + { + "epoch": 1.1352515306003679, + "grad_norm": 0.32308327142581855, + "learning_rate": 7.818678135359641e-05, + "loss": 2.7978, + "step": 24384 + }, + { + "epoch": 1.135298088786461, + "grad_norm": 0.32038428832746463, + "learning_rate": 7.818454401694726e-05, + "loss": 2.8369, + "step": 24385 + }, + { + "epoch": 1.1353446469725539, + "grad_norm": 0.30999801811653405, + "learning_rate": 7.818230659757925e-05, + "loss": 2.8715, + "step": 24386 + }, + { + "epoch": 1.135391205158647, + "grad_norm": 0.32367401985956473, + "learning_rate": 7.818006909549895e-05, + "loss": 2.7267, + "step": 24387 + }, + { + "epoch": 1.1354377633447401, + "grad_norm": 0.28900841492280555, + "learning_rate": 7.817783151071293e-05, + "loss": 2.6997, + "step": 24388 + }, + { + "epoch": 1.1354843215308332, + "grad_norm": 0.32459456847795143, + "learning_rate": 7.817559384322774e-05, + "loss": 2.772, + "step": 24389 + }, + { + "epoch": 1.1355308797169261, + "grad_norm": 0.29603254746544033, + "learning_rate": 7.817335609304997e-05, + "loss": 2.8737, + "step": 24390 + }, + { + "epoch": 1.1355774379030192, + "grad_norm": 0.34023391443100115, + "learning_rate": 7.817111826018617e-05, + "loss": 2.8482, + "step": 24391 + }, + { + "epoch": 1.1356239960891124, + "grad_norm": 0.3303067431382448, + "learning_rate": 
7.816888034464292e-05, + "loss": 2.9386, + "step": 24392 + }, + { + "epoch": 1.1356705542752055, + "grad_norm": 0.3216090729285982, + "learning_rate": 7.816664234642675e-05, + "loss": 2.9113, + "step": 24393 + }, + { + "epoch": 1.1357171124612986, + "grad_norm": 0.3507823328754913, + "learning_rate": 7.816440426554429e-05, + "loss": 2.7965, + "step": 24394 + }, + { + "epoch": 1.1357636706473917, + "grad_norm": 0.3094252470166677, + "learning_rate": 7.816216610200207e-05, + "loss": 2.8775, + "step": 24395 + }, + { + "epoch": 1.1358102288334846, + "grad_norm": 0.3308735837294189, + "learning_rate": 7.815992785580668e-05, + "loss": 2.7553, + "step": 24396 + }, + { + "epoch": 1.1358567870195777, + "grad_norm": 0.32323625239303644, + "learning_rate": 7.815768952696465e-05, + "loss": 2.8644, + "step": 24397 + }, + { + "epoch": 1.1359033452056708, + "grad_norm": 0.2801650880909512, + "learning_rate": 7.815545111548259e-05, + "loss": 2.7876, + "step": 24398 + }, + { + "epoch": 1.135949903391764, + "grad_norm": 0.4014304439516781, + "learning_rate": 7.815321262136705e-05, + "loss": 2.7676, + "step": 24399 + }, + { + "epoch": 1.1359964615778568, + "grad_norm": 0.3106759545339381, + "learning_rate": 7.815097404462461e-05, + "loss": 2.846, + "step": 24400 + }, + { + "epoch": 1.13604301976395, + "grad_norm": 0.3909746452056768, + "learning_rate": 7.814873538526184e-05, + "loss": 2.8549, + "step": 24401 + }, + { + "epoch": 1.136089577950043, + "grad_norm": 0.3111160037066135, + "learning_rate": 7.814649664328529e-05, + "loss": 2.6719, + "step": 24402 + }, + { + "epoch": 1.1361361361361362, + "grad_norm": 0.32865882616661474, + "learning_rate": 7.814425781870156e-05, + "loss": 2.8568, + "step": 24403 + }, + { + "epoch": 1.1361826943222293, + "grad_norm": 0.35502107124573257, + "learning_rate": 7.814201891151721e-05, + "loss": 2.8014, + "step": 24404 + }, + { + "epoch": 1.1362292525083222, + "grad_norm": 0.3451977616104722, + "learning_rate": 7.813977992173878e-05, + "loss": 2.7234, + "step": 24405 + }, + { + "epoch": 1.1362758106944153, + "grad_norm": 0.34981854687532005, + "learning_rate": 7.813754084937288e-05, + "loss": 2.8118, + "step": 24406 + }, + { + "epoch": 1.1363223688805084, + "grad_norm": 0.36810742863624246, + "learning_rate": 7.813530169442607e-05, + "loss": 2.74, + "step": 24407 + }, + { + "epoch": 1.1363689270666015, + "grad_norm": 0.3231510110564648, + "learning_rate": 7.813306245690493e-05, + "loss": 2.8091, + "step": 24408 + }, + { + "epoch": 1.1364154852526946, + "grad_norm": 0.33769981061212584, + "learning_rate": 7.813082313681602e-05, + "loss": 2.9307, + "step": 24409 + }, + { + "epoch": 1.1364620434387875, + "grad_norm": 0.3429087676896746, + "learning_rate": 7.812858373416591e-05, + "loss": 2.7977, + "step": 24410 + }, + { + "epoch": 1.1365086016248807, + "grad_norm": 0.3269523888084544, + "learning_rate": 7.812634424896118e-05, + "loss": 2.807, + "step": 24411 + }, + { + "epoch": 1.1365551598109738, + "grad_norm": 0.3286674197918678, + "learning_rate": 7.81241046812084e-05, + "loss": 2.8403, + "step": 24412 + }, + { + "epoch": 1.1366017179970669, + "grad_norm": 0.32738439427928934, + "learning_rate": 7.812186503091415e-05, + "loss": 2.9049, + "step": 24413 + }, + { + "epoch": 1.13664827618316, + "grad_norm": 0.3652428555035814, + "learning_rate": 7.811962529808499e-05, + "loss": 2.8469, + "step": 24414 + }, + { + "epoch": 1.136694834369253, + "grad_norm": 0.3092843592334208, + "learning_rate": 7.81173854827275e-05, + "loss": 2.746, + "step": 24415 + }, + { + "epoch": 
1.136741392555346, + "grad_norm": 0.3810262161131103, + "learning_rate": 7.811514558484825e-05, + "loss": 2.7986, + "step": 24416 + }, + { + "epoch": 1.1367879507414391, + "grad_norm": 0.3095155913999309, + "learning_rate": 7.811290560445384e-05, + "loss": 2.9225, + "step": 24417 + }, + { + "epoch": 1.1368345089275322, + "grad_norm": 0.38822010610393165, + "learning_rate": 7.81106655415508e-05, + "loss": 2.8341, + "step": 24418 + }, + { + "epoch": 1.1368810671136251, + "grad_norm": 0.33177095024187403, + "learning_rate": 7.810842539614572e-05, + "loss": 2.8561, + "step": 24419 + }, + { + "epoch": 1.1369276252997182, + "grad_norm": 0.3431938519394103, + "learning_rate": 7.81061851682452e-05, + "loss": 2.8668, + "step": 24420 + }, + { + "epoch": 1.1369741834858114, + "grad_norm": 0.36163920926518967, + "learning_rate": 7.810394485785578e-05, + "loss": 2.831, + "step": 24421 + }, + { + "epoch": 1.1370207416719045, + "grad_norm": 0.3223956738510313, + "learning_rate": 7.810170446498406e-05, + "loss": 2.7087, + "step": 24422 + }, + { + "epoch": 1.1370672998579976, + "grad_norm": 0.35893796687190155, + "learning_rate": 7.80994639896366e-05, + "loss": 2.834, + "step": 24423 + }, + { + "epoch": 1.1371138580440907, + "grad_norm": 0.34416691182544995, + "learning_rate": 7.809722343181998e-05, + "loss": 2.8199, + "step": 24424 + }, + { + "epoch": 1.1371604162301836, + "grad_norm": 0.38612794490707625, + "learning_rate": 7.809498279154078e-05, + "loss": 2.7955, + "step": 24425 + }, + { + "epoch": 1.1372069744162767, + "grad_norm": 0.34911592723667934, + "learning_rate": 7.80927420688056e-05, + "loss": 2.8956, + "step": 24426 + }, + { + "epoch": 1.1372535326023698, + "grad_norm": 0.34583879142780427, + "learning_rate": 7.809050126362094e-05, + "loss": 2.8266, + "step": 24427 + }, + { + "epoch": 1.137300090788463, + "grad_norm": 0.3522061391407821, + "learning_rate": 7.808826037599346e-05, + "loss": 2.8569, + "step": 24428 + }, + { + "epoch": 1.1373466489745558, + "grad_norm": 0.338541147409678, + "learning_rate": 7.808601940592969e-05, + "loss": 2.8829, + "step": 24429 + }, + { + "epoch": 1.137393207160649, + "grad_norm": 0.33742121770795125, + "learning_rate": 7.808377835343622e-05, + "loss": 2.8175, + "step": 24430 + }, + { + "epoch": 1.137439765346742, + "grad_norm": 0.32451446457003036, + "learning_rate": 7.808153721851963e-05, + "loss": 2.8409, + "step": 24431 + }, + { + "epoch": 1.1374863235328352, + "grad_norm": 0.34165093211331027, + "learning_rate": 7.80792960011865e-05, + "loss": 2.797, + "step": 24432 + }, + { + "epoch": 1.1375328817189283, + "grad_norm": 0.3182938232501972, + "learning_rate": 7.80770547014434e-05, + "loss": 2.806, + "step": 24433 + }, + { + "epoch": 1.1375794399050214, + "grad_norm": 0.3530056040361892, + "learning_rate": 7.807481331929692e-05, + "loss": 2.7821, + "step": 24434 + }, + { + "epoch": 1.1376259980911143, + "grad_norm": 0.34770567791041473, + "learning_rate": 7.807257185475361e-05, + "loss": 2.9039, + "step": 24435 + }, + { + "epoch": 1.1376725562772074, + "grad_norm": 0.3352537285329844, + "learning_rate": 7.807033030782007e-05, + "loss": 2.8162, + "step": 24436 + }, + { + "epoch": 1.1377191144633005, + "grad_norm": 0.35016722058062166, + "learning_rate": 7.806808867850288e-05, + "loss": 2.8399, + "step": 24437 + }, + { + "epoch": 1.1377656726493937, + "grad_norm": 0.3033414542307259, + "learning_rate": 7.806584696680861e-05, + "loss": 2.792, + "step": 24438 + }, + { + "epoch": 1.1378122308354865, + "grad_norm": 0.3469771173628692, + "learning_rate": 
7.806360517274384e-05, + "loss": 2.8945, + "step": 24439 + }, + { + "epoch": 1.1378587890215797, + "grad_norm": 0.3391527729162471, + "learning_rate": 7.806136329631517e-05, + "loss": 2.781, + "step": 24440 + }, + { + "epoch": 1.1379053472076728, + "grad_norm": 0.30983635125408987, + "learning_rate": 7.805912133752917e-05, + "loss": 2.8301, + "step": 24441 + }, + { + "epoch": 1.137951905393766, + "grad_norm": 0.3512385726862516, + "learning_rate": 7.80568792963924e-05, + "loss": 2.8354, + "step": 24442 + }, + { + "epoch": 1.137998463579859, + "grad_norm": 0.31979191525876344, + "learning_rate": 7.805463717291143e-05, + "loss": 2.8276, + "step": 24443 + }, + { + "epoch": 1.138045021765952, + "grad_norm": 0.35797259685556154, + "learning_rate": 7.805239496709292e-05, + "loss": 2.8059, + "step": 24444 + }, + { + "epoch": 1.138091579952045, + "grad_norm": 0.3073533736390375, + "learning_rate": 7.805015267894334e-05, + "loss": 2.814, + "step": 24445 + }, + { + "epoch": 1.1381381381381381, + "grad_norm": 0.3614819556151191, + "learning_rate": 7.804791030846934e-05, + "loss": 2.7732, + "step": 24446 + }, + { + "epoch": 1.1381846963242312, + "grad_norm": 0.3137949285264329, + "learning_rate": 7.80456678556775e-05, + "loss": 2.7499, + "step": 24447 + }, + { + "epoch": 1.1382312545103244, + "grad_norm": 0.36039845118028785, + "learning_rate": 7.804342532057438e-05, + "loss": 2.7872, + "step": 24448 + }, + { + "epoch": 1.1382778126964173, + "grad_norm": 0.34108584845072104, + "learning_rate": 7.804118270316658e-05, + "loss": 2.8385, + "step": 24449 + }, + { + "epoch": 1.1383243708825104, + "grad_norm": 0.3365705682469718, + "learning_rate": 7.803894000346066e-05, + "loss": 2.8218, + "step": 24450 + }, + { + "epoch": 1.1383709290686035, + "grad_norm": 0.3795426767496709, + "learning_rate": 7.80366972214632e-05, + "loss": 2.8597, + "step": 24451 + }, + { + "epoch": 1.1384174872546966, + "grad_norm": 0.3335865761607865, + "learning_rate": 7.803445435718082e-05, + "loss": 2.7947, + "step": 24452 + }, + { + "epoch": 1.1384640454407897, + "grad_norm": 0.38136014607752067, + "learning_rate": 7.803221141062008e-05, + "loss": 2.7975, + "step": 24453 + }, + { + "epoch": 1.1385106036268826, + "grad_norm": 0.36655679379447786, + "learning_rate": 7.802996838178753e-05, + "loss": 2.7571, + "step": 24454 + }, + { + "epoch": 1.1385571618129757, + "grad_norm": 0.33132164048926627, + "learning_rate": 7.802772527068981e-05, + "loss": 2.8398, + "step": 24455 + }, + { + "epoch": 1.1386037199990688, + "grad_norm": 0.3416321937440384, + "learning_rate": 7.802548207733347e-05, + "loss": 2.848, + "step": 24456 + }, + { + "epoch": 1.138650278185162, + "grad_norm": 0.3280256698524142, + "learning_rate": 7.802323880172509e-05, + "loss": 2.848, + "step": 24457 + }, + { + "epoch": 1.138696836371255, + "grad_norm": 0.3469773683826181, + "learning_rate": 7.80209954438713e-05, + "loss": 2.7067, + "step": 24458 + }, + { + "epoch": 1.138743394557348, + "grad_norm": 0.32746969542685994, + "learning_rate": 7.801875200377862e-05, + "loss": 2.7779, + "step": 24459 + }, + { + "epoch": 1.138789952743441, + "grad_norm": 0.33297044755281835, + "learning_rate": 7.801650848145366e-05, + "loss": 2.8422, + "step": 24460 + }, + { + "epoch": 1.1388365109295342, + "grad_norm": 0.3405653299892822, + "learning_rate": 7.801426487690302e-05, + "loss": 2.8191, + "step": 24461 + }, + { + "epoch": 1.1388830691156273, + "grad_norm": 0.34829000297803564, + "learning_rate": 7.801202119013328e-05, + "loss": 2.7776, + "step": 24462 + }, + { + "epoch": 
1.1389296273017204, + "grad_norm": 0.3387140798409141, + "learning_rate": 7.800977742115099e-05, + "loss": 2.7785, + "step": 24463 + }, + { + "epoch": 1.1389761854878133, + "grad_norm": 0.322696237217838, + "learning_rate": 7.800753356996279e-05, + "loss": 2.7593, + "step": 24464 + }, + { + "epoch": 1.1390227436739064, + "grad_norm": 0.35643264909973665, + "learning_rate": 7.800528963657523e-05, + "loss": 2.8372, + "step": 24465 + }, + { + "epoch": 1.1390693018599996, + "grad_norm": 0.33346352973411064, + "learning_rate": 7.80030456209949e-05, + "loss": 2.7892, + "step": 24466 + }, + { + "epoch": 1.1391158600460927, + "grad_norm": 0.3501201267584414, + "learning_rate": 7.800080152322839e-05, + "loss": 2.8149, + "step": 24467 + }, + { + "epoch": 1.1391624182321856, + "grad_norm": 0.3308244011630936, + "learning_rate": 7.799855734328229e-05, + "loss": 2.89, + "step": 24468 + }, + { + "epoch": 1.1392089764182787, + "grad_norm": 0.3285345529810286, + "learning_rate": 7.799631308116317e-05, + "loss": 2.8885, + "step": 24469 + }, + { + "epoch": 1.1392555346043718, + "grad_norm": 0.3207103013750931, + "learning_rate": 7.799406873687764e-05, + "loss": 2.7507, + "step": 24470 + }, + { + "epoch": 1.139302092790465, + "grad_norm": 0.3215275181748571, + "learning_rate": 7.799182431043228e-05, + "loss": 2.7036, + "step": 24471 + }, + { + "epoch": 1.139348650976558, + "grad_norm": 0.33159804183884517, + "learning_rate": 7.798957980183366e-05, + "loss": 2.8207, + "step": 24472 + }, + { + "epoch": 1.1393952091626511, + "grad_norm": 0.35820981679006664, + "learning_rate": 7.79873352110884e-05, + "loss": 2.8259, + "step": 24473 + }, + { + "epoch": 1.139441767348744, + "grad_norm": 0.38188968279648267, + "learning_rate": 7.798509053820305e-05, + "loss": 2.7743, + "step": 24474 + }, + { + "epoch": 1.1394883255348371, + "grad_norm": 0.3501941417726084, + "learning_rate": 7.798284578318423e-05, + "loss": 2.7296, + "step": 24475 + }, + { + "epoch": 1.1395348837209303, + "grad_norm": 0.35449343134101, + "learning_rate": 7.798060094603852e-05, + "loss": 2.9409, + "step": 24476 + }, + { + "epoch": 1.1395814419070234, + "grad_norm": 0.32882989506174526, + "learning_rate": 7.797835602677248e-05, + "loss": 2.9495, + "step": 24477 + }, + { + "epoch": 1.1396280000931163, + "grad_norm": 0.3860037319494881, + "learning_rate": 7.797611102539272e-05, + "loss": 2.8199, + "step": 24478 + }, + { + "epoch": 1.1396745582792094, + "grad_norm": 0.3039668921020276, + "learning_rate": 7.797386594190585e-05, + "loss": 2.6763, + "step": 24479 + }, + { + "epoch": 1.1397211164653025, + "grad_norm": 0.33586742136567693, + "learning_rate": 7.797162077631844e-05, + "loss": 2.7674, + "step": 24480 + }, + { + "epoch": 1.1397676746513956, + "grad_norm": 0.33293739462954164, + "learning_rate": 7.796937552863707e-05, + "loss": 2.7565, + "step": 24481 + }, + { + "epoch": 1.1398142328374887, + "grad_norm": 0.35399259580034315, + "learning_rate": 7.796713019886835e-05, + "loss": 2.8993, + "step": 24482 + }, + { + "epoch": 1.1398607910235818, + "grad_norm": 0.3193310632029329, + "learning_rate": 7.796488478701884e-05, + "loss": 2.7589, + "step": 24483 + }, + { + "epoch": 1.1399073492096747, + "grad_norm": 0.38594126307120136, + "learning_rate": 7.796263929309516e-05, + "loss": 2.7979, + "step": 24484 + }, + { + "epoch": 1.1399539073957679, + "grad_norm": 0.3781266502087375, + "learning_rate": 7.796039371710387e-05, + "loss": 2.811, + "step": 24485 + }, + { + "epoch": 1.140000465581861, + "grad_norm": 0.3786690360215315, + "learning_rate": 
7.795814805905159e-05, + "loss": 2.7928, + "step": 24486 + }, + { + "epoch": 1.140047023767954, + "grad_norm": 0.3550825870404305, + "learning_rate": 7.795590231894491e-05, + "loss": 2.8066, + "step": 24487 + }, + { + "epoch": 1.140093581954047, + "grad_norm": 0.35806293059294253, + "learning_rate": 7.79536564967904e-05, + "loss": 2.7893, + "step": 24488 + }, + { + "epoch": 1.14014014014014, + "grad_norm": 0.3393237121152223, + "learning_rate": 7.795141059259467e-05, + "loss": 2.6223, + "step": 24489 + }, + { + "epoch": 1.1401866983262332, + "grad_norm": 0.3424060739741076, + "learning_rate": 7.79491646063643e-05, + "loss": 2.7855, + "step": 24490 + }, + { + "epoch": 1.1402332565123263, + "grad_norm": 0.3232998690928566, + "learning_rate": 7.794691853810589e-05, + "loss": 2.8698, + "step": 24491 + }, + { + "epoch": 1.1402798146984194, + "grad_norm": 0.3386485846365671, + "learning_rate": 7.794467238782601e-05, + "loss": 2.8535, + "step": 24492 + }, + { + "epoch": 1.1403263728845123, + "grad_norm": 0.31214964600829936, + "learning_rate": 7.79424261555313e-05, + "loss": 2.7584, + "step": 24493 + }, + { + "epoch": 1.1403729310706054, + "grad_norm": 0.3534522395740845, + "learning_rate": 7.79401798412283e-05, + "loss": 2.8643, + "step": 24494 + }, + { + "epoch": 1.1404194892566986, + "grad_norm": 0.32993703664421825, + "learning_rate": 7.793793344492362e-05, + "loss": 2.8447, + "step": 24495 + }, + { + "epoch": 1.1404660474427917, + "grad_norm": 0.3438011771861377, + "learning_rate": 7.793568696662385e-05, + "loss": 2.7837, + "step": 24496 + }, + { + "epoch": 1.1405126056288848, + "grad_norm": 0.34397366545962393, + "learning_rate": 7.793344040633563e-05, + "loss": 2.8285, + "step": 24497 + }, + { + "epoch": 1.1405591638149777, + "grad_norm": 0.32314863002003597, + "learning_rate": 7.793119376406548e-05, + "loss": 2.8307, + "step": 24498 + }, + { + "epoch": 1.1406057220010708, + "grad_norm": 0.36926270995866856, + "learning_rate": 7.792894703982005e-05, + "loss": 2.7945, + "step": 24499 + }, + { + "epoch": 1.140652280187164, + "grad_norm": 0.31143760326971337, + "learning_rate": 7.79267002336059e-05, + "loss": 2.7527, + "step": 24500 + }, + { + "epoch": 1.140698838373257, + "grad_norm": 0.3647024932143988, + "learning_rate": 7.792445334542963e-05, + "loss": 2.8413, + "step": 24501 + }, + { + "epoch": 1.1407453965593501, + "grad_norm": 0.3247378425686803, + "learning_rate": 7.792220637529784e-05, + "loss": 2.844, + "step": 24502 + }, + { + "epoch": 1.140791954745443, + "grad_norm": 0.33795968799282633, + "learning_rate": 7.791995932321713e-05, + "loss": 2.7424, + "step": 24503 + }, + { + "epoch": 1.1408385129315362, + "grad_norm": 0.3250674520664744, + "learning_rate": 7.79177121891941e-05, + "loss": 2.8224, + "step": 24504 + }, + { + "epoch": 1.1408850711176293, + "grad_norm": 0.31718450791486913, + "learning_rate": 7.791546497323533e-05, + "loss": 2.8057, + "step": 24505 + }, + { + "epoch": 1.1409316293037224, + "grad_norm": 0.29181614112124965, + "learning_rate": 7.79132176753474e-05, + "loss": 2.7617, + "step": 24506 + }, + { + "epoch": 1.1409781874898153, + "grad_norm": 0.33177646003436767, + "learning_rate": 7.791097029553696e-05, + "loss": 2.8284, + "step": 24507 + }, + { + "epoch": 1.1410247456759084, + "grad_norm": 0.31917004830646717, + "learning_rate": 7.790872283381055e-05, + "loss": 2.8519, + "step": 24508 + }, + { + "epoch": 1.1410713038620015, + "grad_norm": 0.35314908480107665, + "learning_rate": 7.79064752901748e-05, + "loss": 2.8265, + "step": 24509 + }, + { + "epoch": 
1.1411178620480946, + "grad_norm": 0.3604951400149117, + "learning_rate": 7.790422766463628e-05, + "loss": 2.8499, + "step": 24510 + }, + { + "epoch": 1.1411644202341877, + "grad_norm": 0.3721153001427049, + "learning_rate": 7.79019799572016e-05, + "loss": 2.8, + "step": 24511 + }, + { + "epoch": 1.1412109784202809, + "grad_norm": 0.3349976501823064, + "learning_rate": 7.789973216787739e-05, + "loss": 2.7912, + "step": 24512 + }, + { + "epoch": 1.1412575366063737, + "grad_norm": 0.3353287211614319, + "learning_rate": 7.78974842966702e-05, + "loss": 2.8282, + "step": 24513 + }, + { + "epoch": 1.1413040947924669, + "grad_norm": 0.35136731920079745, + "learning_rate": 7.789523634358661e-05, + "loss": 2.8092, + "step": 24514 + }, + { + "epoch": 1.14135065297856, + "grad_norm": 0.3229228750242645, + "learning_rate": 7.789298830863327e-05, + "loss": 2.7905, + "step": 24515 + }, + { + "epoch": 1.141397211164653, + "grad_norm": 0.35906467602315684, + "learning_rate": 7.789074019181676e-05, + "loss": 2.7236, + "step": 24516 + }, + { + "epoch": 1.141443769350746, + "grad_norm": 0.3377523954439459, + "learning_rate": 7.788849199314368e-05, + "loss": 2.7726, + "step": 24517 + }, + { + "epoch": 1.141490327536839, + "grad_norm": 0.39729619088725404, + "learning_rate": 7.788624371262062e-05, + "loss": 2.8847, + "step": 24518 + }, + { + "epoch": 1.1415368857229322, + "grad_norm": 0.3304225994927981, + "learning_rate": 7.788399535025418e-05, + "loss": 2.8731, + "step": 24519 + }, + { + "epoch": 1.1415834439090253, + "grad_norm": 0.33702299544709946, + "learning_rate": 7.788174690605097e-05, + "loss": 2.6818, + "step": 24520 + }, + { + "epoch": 1.1416300020951184, + "grad_norm": 0.31820320743907027, + "learning_rate": 7.787949838001757e-05, + "loss": 2.9073, + "step": 24521 + }, + { + "epoch": 1.1416765602812116, + "grad_norm": 0.37393319947559345, + "learning_rate": 7.787724977216058e-05, + "loss": 2.7818, + "step": 24522 + }, + { + "epoch": 1.1417231184673045, + "grad_norm": 0.3205081390542097, + "learning_rate": 7.787500108248662e-05, + "loss": 2.7959, + "step": 24523 + }, + { + "epoch": 1.1417696766533976, + "grad_norm": 0.3478382439644625, + "learning_rate": 7.787275231100229e-05, + "loss": 2.7351, + "step": 24524 + }, + { + "epoch": 1.1418162348394907, + "grad_norm": 0.3453109291252322, + "learning_rate": 7.787050345771416e-05, + "loss": 2.9036, + "step": 24525 + }, + { + "epoch": 1.1418627930255838, + "grad_norm": 0.32084057137138855, + "learning_rate": 7.786825452262885e-05, + "loss": 2.7991, + "step": 24526 + }, + { + "epoch": 1.1419093512116767, + "grad_norm": 0.34197163875978215, + "learning_rate": 7.786600550575296e-05, + "loss": 2.7438, + "step": 24527 + }, + { + "epoch": 1.1419559093977698, + "grad_norm": 0.33842316509802345, + "learning_rate": 7.786375640709309e-05, + "loss": 2.7947, + "step": 24528 + }, + { + "epoch": 1.142002467583863, + "grad_norm": 0.32188025181095004, + "learning_rate": 7.786150722665584e-05, + "loss": 2.8156, + "step": 24529 + }, + { + "epoch": 1.142049025769956, + "grad_norm": 0.38064608786293547, + "learning_rate": 7.785925796444779e-05, + "loss": 2.8261, + "step": 24530 + }, + { + "epoch": 1.1420955839560492, + "grad_norm": 0.3401078129280063, + "learning_rate": 7.78570086204756e-05, + "loss": 2.8187, + "step": 24531 + }, + { + "epoch": 1.142142142142142, + "grad_norm": 0.34416621156951704, + "learning_rate": 7.785475919474581e-05, + "loss": 2.782, + "step": 24532 + }, + { + "epoch": 1.1421887003282352, + "grad_norm": 0.34920378307416927, + "learning_rate": 
7.785250968726505e-05, + "loss": 2.7773, + "step": 24533 + }, + { + "epoch": 1.1422352585143283, + "grad_norm": 0.34236544770785127, + "learning_rate": 7.785026009803993e-05, + "loss": 2.8511, + "step": 24534 + }, + { + "epoch": 1.1422818167004214, + "grad_norm": 0.31944981087279195, + "learning_rate": 7.784801042707704e-05, + "loss": 2.8008, + "step": 24535 + }, + { + "epoch": 1.1423283748865145, + "grad_norm": 0.3412915385658948, + "learning_rate": 7.784576067438296e-05, + "loss": 2.8257, + "step": 24536 + }, + { + "epoch": 1.1423749330726074, + "grad_norm": 0.35305548472487436, + "learning_rate": 7.784351083996433e-05, + "loss": 2.7726, + "step": 24537 + }, + { + "epoch": 1.1424214912587005, + "grad_norm": 0.36611693999882855, + "learning_rate": 7.784126092382773e-05, + "loss": 2.7686, + "step": 24538 + }, + { + "epoch": 1.1424680494447936, + "grad_norm": 0.36172701024926074, + "learning_rate": 7.783901092597979e-05, + "loss": 2.7469, + "step": 24539 + }, + { + "epoch": 1.1425146076308867, + "grad_norm": 0.33390031369849743, + "learning_rate": 7.783676084642709e-05, + "loss": 2.7541, + "step": 24540 + }, + { + "epoch": 1.1425611658169799, + "grad_norm": 0.36407175780410217, + "learning_rate": 7.783451068517624e-05, + "loss": 2.8689, + "step": 24541 + }, + { + "epoch": 1.1426077240030728, + "grad_norm": 0.340001743220207, + "learning_rate": 7.783226044223383e-05, + "loss": 2.8763, + "step": 24542 + }, + { + "epoch": 1.1426542821891659, + "grad_norm": 0.34266116453181644, + "learning_rate": 7.783001011760649e-05, + "loss": 2.7511, + "step": 24543 + }, + { + "epoch": 1.142700840375259, + "grad_norm": 0.322210179979833, + "learning_rate": 7.782775971130081e-05, + "loss": 2.8614, + "step": 24544 + }, + { + "epoch": 1.142747398561352, + "grad_norm": 0.3251424647913429, + "learning_rate": 7.782550922332339e-05, + "loss": 2.726, + "step": 24545 + }, + { + "epoch": 1.1427939567474452, + "grad_norm": 0.3072864028290155, + "learning_rate": 7.782325865368085e-05, + "loss": 2.6343, + "step": 24546 + }, + { + "epoch": 1.142840514933538, + "grad_norm": 0.35369266727286125, + "learning_rate": 7.782100800237976e-05, + "loss": 2.7544, + "step": 24547 + }, + { + "epoch": 1.1428870731196312, + "grad_norm": 0.3445765354949117, + "learning_rate": 7.78187572694268e-05, + "loss": 2.8286, + "step": 24548 + }, + { + "epoch": 1.1429336313057243, + "grad_norm": 0.3529765617141406, + "learning_rate": 7.78165064548285e-05, + "loss": 2.8142, + "step": 24549 + }, + { + "epoch": 1.1429801894918175, + "grad_norm": 0.3159909530989146, + "learning_rate": 7.78142555585915e-05, + "loss": 2.7582, + "step": 24550 + }, + { + "epoch": 1.1430267476779106, + "grad_norm": 0.34349445186400857, + "learning_rate": 7.781200458072239e-05, + "loss": 2.8116, + "step": 24551 + }, + { + "epoch": 1.1430733058640035, + "grad_norm": 0.3437914996960524, + "learning_rate": 7.78097535212278e-05, + "loss": 2.8128, + "step": 24552 + }, + { + "epoch": 1.1431198640500966, + "grad_norm": 0.31000110807944387, + "learning_rate": 7.780750238011431e-05, + "loss": 2.7891, + "step": 24553 + }, + { + "epoch": 1.1431664222361897, + "grad_norm": 0.35841333756802607, + "learning_rate": 7.780525115738855e-05, + "loss": 2.7661, + "step": 24554 + }, + { + "epoch": 1.1432129804222828, + "grad_norm": 0.36281324102719553, + "learning_rate": 7.780299985305712e-05, + "loss": 2.8352, + "step": 24555 + }, + { + "epoch": 1.1432595386083757, + "grad_norm": 0.33776842315654176, + "learning_rate": 7.780074846712662e-05, + "loss": 2.9457, + "step": 24556 + }, + { + "epoch": 
1.1433060967944688, + "grad_norm": 0.355630051060849, + "learning_rate": 7.779849699960367e-05, + "loss": 2.7979, + "step": 24557 + }, + { + "epoch": 1.143352654980562, + "grad_norm": 0.32263235533238305, + "learning_rate": 7.779624545049485e-05, + "loss": 2.8541, + "step": 24558 + }, + { + "epoch": 1.143399213166655, + "grad_norm": 0.34304312612868404, + "learning_rate": 7.77939938198068e-05, + "loss": 2.8658, + "step": 24559 + }, + { + "epoch": 1.1434457713527482, + "grad_norm": 0.3476756157510791, + "learning_rate": 7.779174210754611e-05, + "loss": 2.8193, + "step": 24560 + }, + { + "epoch": 1.1434923295388413, + "grad_norm": 0.32025079958695185, + "learning_rate": 7.77894903137194e-05, + "loss": 2.7674, + "step": 24561 + }, + { + "epoch": 1.1435388877249342, + "grad_norm": 0.35051524023609204, + "learning_rate": 7.778723843833329e-05, + "loss": 2.8291, + "step": 24562 + }, + { + "epoch": 1.1435854459110273, + "grad_norm": 0.3171829566582175, + "learning_rate": 7.778498648139433e-05, + "loss": 2.77, + "step": 24563 + }, + { + "epoch": 1.1436320040971204, + "grad_norm": 0.3375278991411372, + "learning_rate": 7.778273444290921e-05, + "loss": 2.9328, + "step": 24564 + }, + { + "epoch": 1.1436785622832135, + "grad_norm": 0.33823382648611894, + "learning_rate": 7.778048232288449e-05, + "loss": 2.7526, + "step": 24565 + }, + { + "epoch": 1.1437251204693064, + "grad_norm": 0.3296169967611445, + "learning_rate": 7.777823012132679e-05, + "loss": 2.7637, + "step": 24566 + }, + { + "epoch": 1.1437716786553995, + "grad_norm": 0.3285251179828172, + "learning_rate": 7.777597783824273e-05, + "loss": 2.7656, + "step": 24567 + }, + { + "epoch": 1.1438182368414926, + "grad_norm": 0.3212739890338619, + "learning_rate": 7.77737254736389e-05, + "loss": 2.8812, + "step": 24568 + }, + { + "epoch": 1.1438647950275858, + "grad_norm": 0.32988473404505225, + "learning_rate": 7.777147302752192e-05, + "loss": 2.8149, + "step": 24569 + }, + { + "epoch": 1.1439113532136789, + "grad_norm": 0.3212884743044029, + "learning_rate": 7.776922049989842e-05, + "loss": 2.8921, + "step": 24570 + }, + { + "epoch": 1.143957911399772, + "grad_norm": 0.33530140730394237, + "learning_rate": 7.776696789077497e-05, + "loss": 2.7504, + "step": 24571 + }, + { + "epoch": 1.1440044695858649, + "grad_norm": 0.308030901565125, + "learning_rate": 7.776471520015822e-05, + "loss": 2.8261, + "step": 24572 + }, + { + "epoch": 1.144051027771958, + "grad_norm": 0.325570027939812, + "learning_rate": 7.776246242805476e-05, + "loss": 2.8328, + "step": 24573 + }, + { + "epoch": 1.144097585958051, + "grad_norm": 0.32676443346202305, + "learning_rate": 7.776020957447121e-05, + "loss": 2.8115, + "step": 24574 + }, + { + "epoch": 1.1441441441441442, + "grad_norm": 0.3072299026360458, + "learning_rate": 7.775795663941419e-05, + "loss": 2.7871, + "step": 24575 + }, + { + "epoch": 1.1441907023302371, + "grad_norm": 0.32701506767566624, + "learning_rate": 7.775570362289029e-05, + "loss": 2.7987, + "step": 24576 + }, + { + "epoch": 1.1442372605163302, + "grad_norm": 0.33035723633546604, + "learning_rate": 7.775345052490613e-05, + "loss": 2.9101, + "step": 24577 + }, + { + "epoch": 1.1442838187024233, + "grad_norm": 0.32306024250096077, + "learning_rate": 7.775119734546834e-05, + "loss": 2.8723, + "step": 24578 + }, + { + "epoch": 1.1443303768885165, + "grad_norm": 0.349726664147626, + "learning_rate": 7.774894408458351e-05, + "loss": 2.8457, + "step": 24579 + }, + { + "epoch": 1.1443769350746096, + "grad_norm": 0.32395912145904615, + "learning_rate": 
7.774669074225827e-05, + "loss": 2.8646, + "step": 24580 + }, + { + "epoch": 1.1444234932607025, + "grad_norm": 0.34225206648912, + "learning_rate": 7.774443731849924e-05, + "loss": 2.7262, + "step": 24581 + }, + { + "epoch": 1.1444700514467956, + "grad_norm": 0.33024459098751396, + "learning_rate": 7.774218381331299e-05, + "loss": 2.8335, + "step": 24582 + }, + { + "epoch": 1.1445166096328887, + "grad_norm": 0.31567932217340794, + "learning_rate": 7.773993022670617e-05, + "loss": 2.7636, + "step": 24583 + }, + { + "epoch": 1.1445631678189818, + "grad_norm": 0.3411728721697804, + "learning_rate": 7.77376765586854e-05, + "loss": 2.7849, + "step": 24584 + }, + { + "epoch": 1.144609726005075, + "grad_norm": 0.3436008945061513, + "learning_rate": 7.773542280925728e-05, + "loss": 2.7165, + "step": 24585 + }, + { + "epoch": 1.1446562841911678, + "grad_norm": 0.3528023722514057, + "learning_rate": 7.77331689784284e-05, + "loss": 2.8047, + "step": 24586 + }, + { + "epoch": 1.144702842377261, + "grad_norm": 0.34756850271825523, + "learning_rate": 7.773091506620543e-05, + "loss": 2.6875, + "step": 24587 + }, + { + "epoch": 1.144749400563354, + "grad_norm": 0.35950866804501047, + "learning_rate": 7.772866107259495e-05, + "loss": 2.7737, + "step": 24588 + }, + { + "epoch": 1.1447959587494472, + "grad_norm": 0.33939756373770574, + "learning_rate": 7.772640699760358e-05, + "loss": 2.8658, + "step": 24589 + }, + { + "epoch": 1.1448425169355403, + "grad_norm": 0.33358825438374545, + "learning_rate": 7.772415284123795e-05, + "loss": 2.7183, + "step": 24590 + }, + { + "epoch": 1.1448890751216332, + "grad_norm": 0.33233397574160384, + "learning_rate": 7.772189860350463e-05, + "loss": 2.8407, + "step": 24591 + }, + { + "epoch": 1.1449356333077263, + "grad_norm": 0.3132414070953905, + "learning_rate": 7.771964428441029e-05, + "loss": 2.8734, + "step": 24592 + }, + { + "epoch": 1.1449821914938194, + "grad_norm": 0.31660500661929253, + "learning_rate": 7.771738988396153e-05, + "loss": 2.9146, + "step": 24593 + }, + { + "epoch": 1.1450287496799125, + "grad_norm": 0.3445240451188835, + "learning_rate": 7.771513540216495e-05, + "loss": 2.7939, + "step": 24594 + }, + { + "epoch": 1.1450753078660054, + "grad_norm": 0.3108659175272908, + "learning_rate": 7.77128808390272e-05, + "loss": 2.8375, + "step": 24595 + }, + { + "epoch": 1.1451218660520985, + "grad_norm": 0.34452291720797923, + "learning_rate": 7.771062619455485e-05, + "loss": 2.9154, + "step": 24596 + }, + { + "epoch": 1.1451684242381917, + "grad_norm": 0.3197030371092813, + "learning_rate": 7.770837146875454e-05, + "loss": 2.7923, + "step": 24597 + }, + { + "epoch": 1.1452149824242848, + "grad_norm": 0.31552370363604526, + "learning_rate": 7.77061166616329e-05, + "loss": 2.7125, + "step": 24598 + }, + { + "epoch": 1.1452615406103779, + "grad_norm": 0.310484262521674, + "learning_rate": 7.770386177319654e-05, + "loss": 2.7629, + "step": 24599 + }, + { + "epoch": 1.145308098796471, + "grad_norm": 0.3309022756600427, + "learning_rate": 7.770160680345207e-05, + "loss": 2.7418, + "step": 24600 + }, + { + "epoch": 1.145354656982564, + "grad_norm": 0.3307864020446313, + "learning_rate": 7.769935175240611e-05, + "loss": 2.7876, + "step": 24601 + }, + { + "epoch": 1.145401215168657, + "grad_norm": 0.3165628725867534, + "learning_rate": 7.76970966200653e-05, + "loss": 2.7598, + "step": 24602 + }, + { + "epoch": 1.1454477733547501, + "grad_norm": 0.3119147348187792, + "learning_rate": 7.769484140643622e-05, + "loss": 2.9357, + "step": 24603 + }, + { + "epoch": 
1.1454943315408432, + "grad_norm": 0.3211475344542441, + "learning_rate": 7.769258611152551e-05, + "loss": 2.6873, + "step": 24604 + }, + { + "epoch": 1.1455408897269361, + "grad_norm": 0.322939781739697, + "learning_rate": 7.76903307353398e-05, + "loss": 2.7859, + "step": 24605 + }, + { + "epoch": 1.1455874479130292, + "grad_norm": 0.33249488412580264, + "learning_rate": 7.768807527788568e-05, + "loss": 2.8039, + "step": 24606 + }, + { + "epoch": 1.1456340060991224, + "grad_norm": 0.3563082944217947, + "learning_rate": 7.768581973916981e-05, + "loss": 2.7963, + "step": 24607 + }, + { + "epoch": 1.1456805642852155, + "grad_norm": 0.3525813452065169, + "learning_rate": 7.768356411919878e-05, + "loss": 2.8496, + "step": 24608 + }, + { + "epoch": 1.1457271224713086, + "grad_norm": 0.3228671788986514, + "learning_rate": 7.768130841797919e-05, + "loss": 2.7146, + "step": 24609 + }, + { + "epoch": 1.1457736806574017, + "grad_norm": 0.35722503702398034, + "learning_rate": 7.767905263551772e-05, + "loss": 2.8049, + "step": 24610 + }, + { + "epoch": 1.1458202388434946, + "grad_norm": 0.32887038338041197, + "learning_rate": 7.767679677182093e-05, + "loss": 2.8417, + "step": 24611 + }, + { + "epoch": 1.1458667970295877, + "grad_norm": 0.36155165504649245, + "learning_rate": 7.767454082689549e-05, + "loss": 2.7607, + "step": 24612 + }, + { + "epoch": 1.1459133552156808, + "grad_norm": 0.332636688534588, + "learning_rate": 7.767228480074799e-05, + "loss": 2.8661, + "step": 24613 + }, + { + "epoch": 1.145959913401774, + "grad_norm": 0.3531631219210377, + "learning_rate": 7.767002869338506e-05, + "loss": 2.8062, + "step": 24614 + }, + { + "epoch": 1.1460064715878668, + "grad_norm": 0.3249098747601661, + "learning_rate": 7.766777250481333e-05, + "loss": 2.8463, + "step": 24615 + }, + { + "epoch": 1.14605302977396, + "grad_norm": 0.33594729305492804, + "learning_rate": 7.76655162350394e-05, + "loss": 2.7612, + "step": 24616 + }, + { + "epoch": 1.146099587960053, + "grad_norm": 0.3544769052419841, + "learning_rate": 7.766325988406992e-05, + "loss": 2.8738, + "step": 24617 + }, + { + "epoch": 1.1461461461461462, + "grad_norm": 0.32083326835728165, + "learning_rate": 7.766100345191149e-05, + "loss": 2.8122, + "step": 24618 + }, + { + "epoch": 1.1461927043322393, + "grad_norm": 0.36356201805911187, + "learning_rate": 7.765874693857074e-05, + "loss": 2.7655, + "step": 24619 + }, + { + "epoch": 1.1462392625183322, + "grad_norm": 0.30830769027601446, + "learning_rate": 7.765649034405429e-05, + "loss": 2.8579, + "step": 24620 + }, + { + "epoch": 1.1462858207044253, + "grad_norm": 0.3391790754704748, + "learning_rate": 7.765423366836876e-05, + "loss": 2.8458, + "step": 24621 + }, + { + "epoch": 1.1463323788905184, + "grad_norm": 0.34741585159755584, + "learning_rate": 7.76519769115208e-05, + "loss": 2.7753, + "step": 24622 + }, + { + "epoch": 1.1463789370766115, + "grad_norm": 0.3257860287398306, + "learning_rate": 7.764972007351698e-05, + "loss": 2.9139, + "step": 24623 + }, + { + "epoch": 1.1464254952627047, + "grad_norm": 0.34784826747589126, + "learning_rate": 7.764746315436399e-05, + "loss": 2.805, + "step": 24624 + }, + { + "epoch": 1.1464720534487975, + "grad_norm": 0.3449493455007693, + "learning_rate": 7.764520615406839e-05, + "loss": 2.8472, + "step": 24625 + }, + { + "epoch": 1.1465186116348907, + "grad_norm": 0.3599177825402058, + "learning_rate": 7.764294907263685e-05, + "loss": 2.8688, + "step": 24626 + }, + { + "epoch": 1.1465651698209838, + "grad_norm": 0.3648327873904073, + "learning_rate": 
7.764069191007597e-05, + "loss": 2.6557, + "step": 24627 + }, + { + "epoch": 1.146611728007077, + "grad_norm": 0.33594321494089424, + "learning_rate": 7.763843466639239e-05, + "loss": 2.8013, + "step": 24628 + }, + { + "epoch": 1.14665828619317, + "grad_norm": 0.3369188730500126, + "learning_rate": 7.763617734159273e-05, + "loss": 2.8085, + "step": 24629 + }, + { + "epoch": 1.146704844379263, + "grad_norm": 0.32334801131427837, + "learning_rate": 7.763391993568361e-05, + "loss": 2.9092, + "step": 24630 + }, + { + "epoch": 1.146751402565356, + "grad_norm": 0.34318903079753244, + "learning_rate": 7.763166244867164e-05, + "loss": 2.8857, + "step": 24631 + }, + { + "epoch": 1.1467979607514491, + "grad_norm": 0.36268039504996985, + "learning_rate": 7.762940488056348e-05, + "loss": 2.8008, + "step": 24632 + }, + { + "epoch": 1.1468445189375422, + "grad_norm": 0.3320812928840989, + "learning_rate": 7.762714723136575e-05, + "loss": 2.7868, + "step": 24633 + }, + { + "epoch": 1.1468910771236351, + "grad_norm": 0.3587251747110332, + "learning_rate": 7.762488950108504e-05, + "loss": 2.8362, + "step": 24634 + }, + { + "epoch": 1.1469376353097283, + "grad_norm": 0.3553624232733085, + "learning_rate": 7.762263168972802e-05, + "loss": 2.852, + "step": 24635 + }, + { + "epoch": 1.1469841934958214, + "grad_norm": 0.3325592538900461, + "learning_rate": 7.762037379730128e-05, + "loss": 2.863, + "step": 24636 + }, + { + "epoch": 1.1470307516819145, + "grad_norm": 0.3588761014607251, + "learning_rate": 7.761811582381147e-05, + "loss": 2.8311, + "step": 24637 + }, + { + "epoch": 1.1470773098680076, + "grad_norm": 0.3185901270970377, + "learning_rate": 7.761585776926521e-05, + "loss": 2.8087, + "step": 24638 + }, + { + "epoch": 1.1471238680541007, + "grad_norm": 0.3600196747569812, + "learning_rate": 7.761359963366914e-05, + "loss": 2.8658, + "step": 24639 + }, + { + "epoch": 1.1471704262401936, + "grad_norm": 0.3270531225572835, + "learning_rate": 7.761134141702986e-05, + "loss": 2.8172, + "step": 24640 + }, + { + "epoch": 1.1472169844262867, + "grad_norm": 0.3142589330387711, + "learning_rate": 7.7609083119354e-05, + "loss": 2.8906, + "step": 24641 + }, + { + "epoch": 1.1472635426123798, + "grad_norm": 0.3383450394413864, + "learning_rate": 7.760682474064824e-05, + "loss": 2.8057, + "step": 24642 + }, + { + "epoch": 1.147310100798473, + "grad_norm": 0.3040592784829832, + "learning_rate": 7.760456628091914e-05, + "loss": 2.8197, + "step": 24643 + }, + { + "epoch": 1.1473566589845658, + "grad_norm": 0.33452278526443685, + "learning_rate": 7.760230774017336e-05, + "loss": 2.8598, + "step": 24644 + }, + { + "epoch": 1.147403217170659, + "grad_norm": 0.3338903905826336, + "learning_rate": 7.760004911841753e-05, + "loss": 2.8479, + "step": 24645 + }, + { + "epoch": 1.147449775356752, + "grad_norm": 0.34375513594084356, + "learning_rate": 7.75977904156583e-05, + "loss": 2.8913, + "step": 24646 + }, + { + "epoch": 1.1474963335428452, + "grad_norm": 0.3895906537441849, + "learning_rate": 7.759553163190224e-05, + "loss": 2.8321, + "step": 24647 + }, + { + "epoch": 1.1475428917289383, + "grad_norm": 0.33086583953864535, + "learning_rate": 7.759327276715602e-05, + "loss": 2.838, + "step": 24648 + }, + { + "epoch": 1.1475894499150314, + "grad_norm": 0.3645491174880512, + "learning_rate": 7.759101382142626e-05, + "loss": 2.8543, + "step": 24649 + }, + { + "epoch": 1.1476360081011243, + "grad_norm": 0.3750606616105835, + "learning_rate": 7.758875479471961e-05, + "loss": 2.8375, + "step": 24650 + }, + { + "epoch": 
1.1476825662872174, + "grad_norm": 0.3462233657385798, + "learning_rate": 7.758649568704268e-05, + "loss": 2.7869, + "step": 24651 + }, + { + "epoch": 1.1477291244733105, + "grad_norm": 0.3223566198454575, + "learning_rate": 7.75842364984021e-05, + "loss": 2.72, + "step": 24652 + }, + { + "epoch": 1.1477756826594037, + "grad_norm": 0.3434583570127065, + "learning_rate": 7.75819772288045e-05, + "loss": 2.8603, + "step": 24653 + }, + { + "epoch": 1.1478222408454966, + "grad_norm": 0.37672167081897207, + "learning_rate": 7.75797178782565e-05, + "loss": 2.8031, + "step": 24654 + }, + { + "epoch": 1.1478687990315897, + "grad_norm": 0.3337741495602233, + "learning_rate": 7.757745844676476e-05, + "loss": 2.8525, + "step": 24655 + }, + { + "epoch": 1.1479153572176828, + "grad_norm": 0.3349309153259881, + "learning_rate": 7.757519893433588e-05, + "loss": 2.7553, + "step": 24656 + }, + { + "epoch": 1.147961915403776, + "grad_norm": 0.34428834402971037, + "learning_rate": 7.757293934097654e-05, + "loss": 2.8162, + "step": 24657 + }, + { + "epoch": 1.148008473589869, + "grad_norm": 0.33065690310465545, + "learning_rate": 7.757067966669332e-05, + "loss": 2.8778, + "step": 24658 + }, + { + "epoch": 1.1480550317759621, + "grad_norm": 0.3519781121360109, + "learning_rate": 7.756841991149287e-05, + "loss": 2.8496, + "step": 24659 + }, + { + "epoch": 1.148101589962055, + "grad_norm": 0.34769501908850264, + "learning_rate": 7.756616007538183e-05, + "loss": 2.7835, + "step": 24660 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.33247050552636354, + "learning_rate": 7.756390015836682e-05, + "loss": 2.8833, + "step": 24661 + }, + { + "epoch": 1.1481947063342413, + "grad_norm": 0.3424762642363497, + "learning_rate": 7.756164016045448e-05, + "loss": 2.8295, + "step": 24662 + }, + { + "epoch": 1.1482412645203344, + "grad_norm": 0.32044778603852275, + "learning_rate": 7.755938008165145e-05, + "loss": 2.7671, + "step": 24663 + }, + { + "epoch": 1.1482878227064273, + "grad_norm": 0.3191084369635058, + "learning_rate": 7.755711992196434e-05, + "loss": 2.8008, + "step": 24664 + }, + { + "epoch": 1.1483343808925204, + "grad_norm": 0.30205669866565676, + "learning_rate": 7.755485968139981e-05, + "loss": 2.7942, + "step": 24665 + }, + { + "epoch": 1.1483809390786135, + "grad_norm": 0.3270461427796758, + "learning_rate": 7.755259935996447e-05, + "loss": 2.862, + "step": 24666 + }, + { + "epoch": 1.1484274972647066, + "grad_norm": 0.3203639538220415, + "learning_rate": 7.755033895766497e-05, + "loss": 2.7667, + "step": 24667 + }, + { + "epoch": 1.1484740554507997, + "grad_norm": 0.3261278336457557, + "learning_rate": 7.754807847450794e-05, + "loss": 2.8544, + "step": 24668 + }, + { + "epoch": 1.1485206136368926, + "grad_norm": 0.3204807341083108, + "learning_rate": 7.754581791050002e-05, + "loss": 2.8242, + "step": 24669 + }, + { + "epoch": 1.1485671718229857, + "grad_norm": 0.3210493174678671, + "learning_rate": 7.754355726564782e-05, + "loss": 2.7114, + "step": 24670 + }, + { + "epoch": 1.1486137300090788, + "grad_norm": 0.3186016374940202, + "learning_rate": 7.7541296539958e-05, + "loss": 2.8269, + "step": 24671 + }, + { + "epoch": 1.148660288195172, + "grad_norm": 0.3316852161378581, + "learning_rate": 7.753903573343721e-05, + "loss": 2.8544, + "step": 24672 + }, + { + "epoch": 1.148706846381265, + "grad_norm": 0.32526029114467275, + "learning_rate": 7.753677484609203e-05, + "loss": 2.8269, + "step": 24673 + }, + { + "epoch": 1.148753404567358, + "grad_norm": 0.33028507125105144, + "learning_rate": 
7.753451387792916e-05, + "loss": 2.8009, + "step": 24674 + }, + { + "epoch": 1.148799962753451, + "grad_norm": 0.36556191519582726, + "learning_rate": 7.753225282895519e-05, + "loss": 2.8035, + "step": 24675 + }, + { + "epoch": 1.1488465209395442, + "grad_norm": 0.30940000636858184, + "learning_rate": 7.752999169917677e-05, + "loss": 2.766, + "step": 24676 + }, + { + "epoch": 1.1488930791256373, + "grad_norm": 0.3478350822314044, + "learning_rate": 7.752773048860054e-05, + "loss": 2.8729, + "step": 24677 + }, + { + "epoch": 1.1489396373117304, + "grad_norm": 0.339033078328098, + "learning_rate": 7.752546919723314e-05, + "loss": 2.8867, + "step": 24678 + }, + { + "epoch": 1.1489861954978233, + "grad_norm": 0.34841081769802223, + "learning_rate": 7.752320782508118e-05, + "loss": 2.8468, + "step": 24679 + }, + { + "epoch": 1.1490327536839164, + "grad_norm": 0.3247307172510966, + "learning_rate": 7.752094637215132e-05, + "loss": 2.7875, + "step": 24680 + }, + { + "epoch": 1.1490793118700096, + "grad_norm": 0.3341349460510453, + "learning_rate": 7.751868483845019e-05, + "loss": 2.6457, + "step": 24681 + }, + { + "epoch": 1.1491258700561027, + "grad_norm": 0.31461783019595896, + "learning_rate": 7.751642322398444e-05, + "loss": 2.9054, + "step": 24682 + }, + { + "epoch": 1.1491724282421956, + "grad_norm": 0.32563493539692523, + "learning_rate": 7.75141615287607e-05, + "loss": 2.8003, + "step": 24683 + }, + { + "epoch": 1.1492189864282887, + "grad_norm": 0.31774752445916654, + "learning_rate": 7.75118997527856e-05, + "loss": 2.8606, + "step": 24684 + }, + { + "epoch": 1.1492655446143818, + "grad_norm": 0.3283685695597317, + "learning_rate": 7.75096378960658e-05, + "loss": 2.8021, + "step": 24685 + }, + { + "epoch": 1.149312102800475, + "grad_norm": 0.35465001322670403, + "learning_rate": 7.75073759586079e-05, + "loss": 2.8946, + "step": 24686 + }, + { + "epoch": 1.149358660986568, + "grad_norm": 0.33486709143352267, + "learning_rate": 7.750511394041858e-05, + "loss": 2.7934, + "step": 24687 + }, + { + "epoch": 1.1494052191726611, + "grad_norm": 0.3294359920828668, + "learning_rate": 7.750285184150446e-05, + "loss": 2.8797, + "step": 24688 + }, + { + "epoch": 1.149451777358754, + "grad_norm": 0.35096549598577115, + "learning_rate": 7.750058966187217e-05, + "loss": 2.7581, + "step": 24689 + }, + { + "epoch": 1.1494983355448471, + "grad_norm": 0.34260658711963715, + "learning_rate": 7.749832740152836e-05, + "loss": 2.7723, + "step": 24690 + }, + { + "epoch": 1.1495448937309403, + "grad_norm": 0.32381123643380955, + "learning_rate": 7.749606506047967e-05, + "loss": 2.8175, + "step": 24691 + }, + { + "epoch": 1.1495914519170334, + "grad_norm": 0.34884126927902637, + "learning_rate": 7.749380263873275e-05, + "loss": 2.8491, + "step": 24692 + }, + { + "epoch": 1.1496380101031263, + "grad_norm": 0.33890303267242017, + "learning_rate": 7.74915401362942e-05, + "loss": 2.8006, + "step": 24693 + }, + { + "epoch": 1.1496845682892194, + "grad_norm": 0.31283100638000705, + "learning_rate": 7.748927755317072e-05, + "loss": 2.6658, + "step": 24694 + }, + { + "epoch": 1.1497311264753125, + "grad_norm": 0.34849638506312186, + "learning_rate": 7.74870148893689e-05, + "loss": 2.8146, + "step": 24695 + }, + { + "epoch": 1.1497776846614056, + "grad_norm": 0.32683369023380715, + "learning_rate": 7.74847521448954e-05, + "loss": 2.8247, + "step": 24696 + }, + { + "epoch": 1.1498242428474987, + "grad_norm": 0.3716860870728825, + "learning_rate": 7.748248931975687e-05, + "loss": 2.7667, + "step": 24697 + }, + { + "epoch": 
1.1498708010335918, + "grad_norm": 0.2946702496106332, + "learning_rate": 7.748022641395994e-05, + "loss": 2.9093, + "step": 24698 + }, + { + "epoch": 1.1499173592196847, + "grad_norm": 0.34854907912169236, + "learning_rate": 7.747796342751123e-05, + "loss": 2.7831, + "step": 24699 + }, + { + "epoch": 1.1499639174057779, + "grad_norm": 0.2959467209104286, + "learning_rate": 7.747570036041742e-05, + "loss": 2.8658, + "step": 24700 + }, + { + "epoch": 1.150010475591871, + "grad_norm": 0.3348507227550946, + "learning_rate": 7.747343721268515e-05, + "loss": 2.8051, + "step": 24701 + }, + { + "epoch": 1.150057033777964, + "grad_norm": 0.29706222374030655, + "learning_rate": 7.747117398432103e-05, + "loss": 2.798, + "step": 24702 + }, + { + "epoch": 1.150103591964057, + "grad_norm": 0.3333810900047888, + "learning_rate": 7.746891067533173e-05, + "loss": 2.8973, + "step": 24703 + }, + { + "epoch": 1.15015015015015, + "grad_norm": 0.29590076214949335, + "learning_rate": 7.746664728572387e-05, + "loss": 2.9186, + "step": 24704 + }, + { + "epoch": 1.1501967083362432, + "grad_norm": 0.3218331724409628, + "learning_rate": 7.746438381550412e-05, + "loss": 2.7871, + "step": 24705 + }, + { + "epoch": 1.1502432665223363, + "grad_norm": 0.3118937250862926, + "learning_rate": 7.74621202646791e-05, + "loss": 2.7333, + "step": 24706 + }, + { + "epoch": 1.1502898247084294, + "grad_norm": 0.3308036290397148, + "learning_rate": 7.745985663325546e-05, + "loss": 2.8127, + "step": 24707 + }, + { + "epoch": 1.1503363828945223, + "grad_norm": 0.32133350204915645, + "learning_rate": 7.745759292123986e-05, + "loss": 2.7607, + "step": 24708 + }, + { + "epoch": 1.1503829410806155, + "grad_norm": 0.34300819092094037, + "learning_rate": 7.745532912863892e-05, + "loss": 2.8934, + "step": 24709 + }, + { + "epoch": 1.1504294992667086, + "grad_norm": 0.32036603876475717, + "learning_rate": 7.74530652554593e-05, + "loss": 2.7985, + "step": 24710 + }, + { + "epoch": 1.1504760574528017, + "grad_norm": 0.3237131999364907, + "learning_rate": 7.745080130170763e-05, + "loss": 2.844, + "step": 24711 + }, + { + "epoch": 1.1505226156388948, + "grad_norm": 0.31923233665322454, + "learning_rate": 7.744853726739056e-05, + "loss": 2.7841, + "step": 24712 + }, + { + "epoch": 1.1505691738249877, + "grad_norm": 0.3186697528050196, + "learning_rate": 7.744627315251476e-05, + "loss": 2.8255, + "step": 24713 + }, + { + "epoch": 1.1506157320110808, + "grad_norm": 0.32419456199917496, + "learning_rate": 7.744400895708683e-05, + "loss": 2.8069, + "step": 24714 + }, + { + "epoch": 1.150662290197174, + "grad_norm": 0.3220639437112312, + "learning_rate": 7.744174468111345e-05, + "loss": 2.7734, + "step": 24715 + }, + { + "epoch": 1.150708848383267, + "grad_norm": 0.30981638951751556, + "learning_rate": 7.743948032460125e-05, + "loss": 2.7344, + "step": 24716 + }, + { + "epoch": 1.1507554065693602, + "grad_norm": 0.2976906131932938, + "learning_rate": 7.743721588755687e-05, + "loss": 2.811, + "step": 24717 + }, + { + "epoch": 1.150801964755453, + "grad_norm": 0.33266407196665376, + "learning_rate": 7.743495136998697e-05, + "loss": 2.9233, + "step": 24718 + }, + { + "epoch": 1.1508485229415462, + "grad_norm": 0.29912015557847976, + "learning_rate": 7.74326867718982e-05, + "loss": 2.7908, + "step": 24719 + }, + { + "epoch": 1.1508950811276393, + "grad_norm": 0.31626277716201096, + "learning_rate": 7.743042209329719e-05, + "loss": 2.75, + "step": 24720 + }, + { + "epoch": 1.1509416393137324, + "grad_norm": 0.2987235006325692, + "learning_rate": 
7.742815733419058e-05, + "loss": 2.7116, + "step": 24721 + }, + { + "epoch": 1.1509881974998253, + "grad_norm": 0.3017866616074923, + "learning_rate": 7.742589249458504e-05, + "loss": 2.8108, + "step": 24722 + }, + { + "epoch": 1.1510347556859184, + "grad_norm": 0.3371171771465486, + "learning_rate": 7.74236275744872e-05, + "loss": 2.8525, + "step": 24723 + }, + { + "epoch": 1.1510813138720115, + "grad_norm": 0.31718528749010383, + "learning_rate": 7.742136257390374e-05, + "loss": 2.7637, + "step": 24724 + }, + { + "epoch": 1.1511278720581046, + "grad_norm": 0.32634530603967515, + "learning_rate": 7.741909749284126e-05, + "loss": 2.8159, + "step": 24725 + }, + { + "epoch": 1.1511744302441977, + "grad_norm": 0.31686722702911874, + "learning_rate": 7.741683233130644e-05, + "loss": 2.8225, + "step": 24726 + }, + { + "epoch": 1.1512209884302909, + "grad_norm": 0.29291219154424886, + "learning_rate": 7.74145670893059e-05, + "loss": 2.6599, + "step": 24727 + }, + { + "epoch": 1.1512675466163838, + "grad_norm": 0.33723949876242554, + "learning_rate": 7.741230176684633e-05, + "loss": 2.8705, + "step": 24728 + }, + { + "epoch": 1.1513141048024769, + "grad_norm": 0.31344983110712266, + "learning_rate": 7.741003636393434e-05, + "loss": 2.8564, + "step": 24729 + }, + { + "epoch": 1.15136066298857, + "grad_norm": 0.32493689067284115, + "learning_rate": 7.74077708805766e-05, + "loss": 2.7637, + "step": 24730 + }, + { + "epoch": 1.151407221174663, + "grad_norm": 0.322504342892261, + "learning_rate": 7.740550531677975e-05, + "loss": 2.7551, + "step": 24731 + }, + { + "epoch": 1.151453779360756, + "grad_norm": 0.3010329558758491, + "learning_rate": 7.740323967255045e-05, + "loss": 2.8791, + "step": 24732 + }, + { + "epoch": 1.151500337546849, + "grad_norm": 0.3065280934010829, + "learning_rate": 7.740097394789533e-05, + "loss": 2.8105, + "step": 24733 + }, + { + "epoch": 1.1515468957329422, + "grad_norm": 0.3107115235957714, + "learning_rate": 7.739870814282107e-05, + "loss": 2.8095, + "step": 24734 + }, + { + "epoch": 1.1515934539190353, + "grad_norm": 0.31770144276544326, + "learning_rate": 7.739644225733427e-05, + "loss": 2.8128, + "step": 24735 + }, + { + "epoch": 1.1516400121051285, + "grad_norm": 0.3118009556322614, + "learning_rate": 7.739417629144162e-05, + "loss": 2.8059, + "step": 24736 + }, + { + "epoch": 1.1516865702912216, + "grad_norm": 0.3040445951088122, + "learning_rate": 7.739191024514978e-05, + "loss": 2.807, + "step": 24737 + }, + { + "epoch": 1.1517331284773145, + "grad_norm": 0.311737834765235, + "learning_rate": 7.738964411846537e-05, + "loss": 2.8212, + "step": 24738 + }, + { + "epoch": 1.1517796866634076, + "grad_norm": 0.32707853066781817, + "learning_rate": 7.738737791139506e-05, + "loss": 2.8363, + "step": 24739 + }, + { + "epoch": 1.1518262448495007, + "grad_norm": 0.3344830905323124, + "learning_rate": 7.738511162394548e-05, + "loss": 2.7818, + "step": 24740 + }, + { + "epoch": 1.1518728030355938, + "grad_norm": 0.3367984968996374, + "learning_rate": 7.73828452561233e-05, + "loss": 2.8014, + "step": 24741 + }, + { + "epoch": 1.1519193612216867, + "grad_norm": 0.3365803753186106, + "learning_rate": 7.738057880793517e-05, + "loss": 2.8299, + "step": 24742 + }, + { + "epoch": 1.1519659194077798, + "grad_norm": 0.3102471783988992, + "learning_rate": 7.737831227938775e-05, + "loss": 2.7717, + "step": 24743 + }, + { + "epoch": 1.152012477593873, + "grad_norm": 0.359315632043346, + "learning_rate": 7.737604567048765e-05, + "loss": 2.8003, + "step": 24744 + }, + { + "epoch": 
1.152059035779966, + "grad_norm": 0.3165825157120197, + "learning_rate": 7.737377898124158e-05, + "loss": 2.806, + "step": 24745 + }, + { + "epoch": 1.1521055939660592, + "grad_norm": 0.359681675327469, + "learning_rate": 7.737151221165615e-05, + "loss": 2.8055, + "step": 24746 + }, + { + "epoch": 1.1521521521521523, + "grad_norm": 0.3359198254461111, + "learning_rate": 7.736924536173805e-05, + "loss": 2.9236, + "step": 24747 + }, + { + "epoch": 1.1521987103382452, + "grad_norm": 0.3375475483407257, + "learning_rate": 7.736697843149388e-05, + "loss": 2.8783, + "step": 24748 + }, + { + "epoch": 1.1522452685243383, + "grad_norm": 0.3275875648336664, + "learning_rate": 7.736471142093033e-05, + "loss": 2.8636, + "step": 24749 + }, + { + "epoch": 1.1522918267104314, + "grad_norm": 0.34363956534738843, + "learning_rate": 7.736244433005407e-05, + "loss": 2.8509, + "step": 24750 + }, + { + "epoch": 1.1523383848965245, + "grad_norm": 0.3386554750019322, + "learning_rate": 7.736017715887172e-05, + "loss": 2.846, + "step": 24751 + }, + { + "epoch": 1.1523849430826174, + "grad_norm": 0.34770965315934266, + "learning_rate": 7.735790990738995e-05, + "loss": 2.8534, + "step": 24752 + }, + { + "epoch": 1.1524315012687105, + "grad_norm": 0.3119526595070215, + "learning_rate": 7.73556425756154e-05, + "loss": 2.7731, + "step": 24753 + }, + { + "epoch": 1.1524780594548036, + "grad_norm": 0.31854521469413855, + "learning_rate": 7.735337516355474e-05, + "loss": 2.6855, + "step": 24754 + }, + { + "epoch": 1.1525246176408968, + "grad_norm": 0.3173003660882044, + "learning_rate": 7.735110767121463e-05, + "loss": 2.8093, + "step": 24755 + }, + { + "epoch": 1.1525711758269899, + "grad_norm": 0.30786501691057405, + "learning_rate": 7.734884009860169e-05, + "loss": 2.8532, + "step": 24756 + }, + { + "epoch": 1.1526177340130828, + "grad_norm": 0.32129815402795453, + "learning_rate": 7.734657244572261e-05, + "loss": 2.7333, + "step": 24757 + }, + { + "epoch": 1.1526642921991759, + "grad_norm": 0.32977171175733827, + "learning_rate": 7.734430471258403e-05, + "loss": 2.8092, + "step": 24758 + }, + { + "epoch": 1.152710850385269, + "grad_norm": 0.3538398110017761, + "learning_rate": 7.734203689919261e-05, + "loss": 2.7929, + "step": 24759 + }, + { + "epoch": 1.152757408571362, + "grad_norm": 0.3124554256176581, + "learning_rate": 7.733976900555502e-05, + "loss": 2.8228, + "step": 24760 + }, + { + "epoch": 1.1528039667574552, + "grad_norm": 0.3178619943657016, + "learning_rate": 7.733750103167789e-05, + "loss": 2.7691, + "step": 24761 + }, + { + "epoch": 1.1528505249435481, + "grad_norm": 0.34340431617908557, + "learning_rate": 7.733523297756788e-05, + "loss": 2.8582, + "step": 24762 + }, + { + "epoch": 1.1528970831296412, + "grad_norm": 0.3641836257456462, + "learning_rate": 7.733296484323167e-05, + "loss": 2.768, + "step": 24763 + }, + { + "epoch": 1.1529436413157343, + "grad_norm": 0.31706442467099144, + "learning_rate": 7.73306966286759e-05, + "loss": 2.8589, + "step": 24764 + }, + { + "epoch": 1.1529901995018275, + "grad_norm": 0.36419664326761086, + "learning_rate": 7.73284283339072e-05, + "loss": 2.8321, + "step": 24765 + }, + { + "epoch": 1.1530367576879206, + "grad_norm": 0.3312845280172563, + "learning_rate": 7.732615995893229e-05, + "loss": 2.7784, + "step": 24766 + }, + { + "epoch": 1.1530833158740135, + "grad_norm": 0.333833798947657, + "learning_rate": 7.732389150375777e-05, + "loss": 2.7841, + "step": 24767 + }, + { + "epoch": 1.1531298740601066, + "grad_norm": 0.32348886186385967, + "learning_rate": 
7.732162296839033e-05, + "loss": 2.8553, + "step": 24768 + }, + { + "epoch": 1.1531764322461997, + "grad_norm": 0.3472625971996959, + "learning_rate": 7.731935435283662e-05, + "loss": 2.8182, + "step": 24769 + }, + { + "epoch": 1.1532229904322928, + "grad_norm": 0.3157016545022548, + "learning_rate": 7.73170856571033e-05, + "loss": 2.8342, + "step": 24770 + }, + { + "epoch": 1.1532695486183857, + "grad_norm": 0.3282625300848373, + "learning_rate": 7.731481688119702e-05, + "loss": 2.7821, + "step": 24771 + }, + { + "epoch": 1.1533161068044788, + "grad_norm": 0.3220659887017693, + "learning_rate": 7.731254802512443e-05, + "loss": 2.8629, + "step": 24772 + }, + { + "epoch": 1.153362664990572, + "grad_norm": 0.3310681791827929, + "learning_rate": 7.731027908889222e-05, + "loss": 2.7623, + "step": 24773 + }, + { + "epoch": 1.153409223176665, + "grad_norm": 0.3779269174811157, + "learning_rate": 7.730801007250704e-05, + "loss": 2.8344, + "step": 24774 + }, + { + "epoch": 1.1534557813627582, + "grad_norm": 0.3203302138930759, + "learning_rate": 7.730574097597552e-05, + "loss": 2.8476, + "step": 24775 + }, + { + "epoch": 1.1535023395488513, + "grad_norm": 0.37456025563592826, + "learning_rate": 7.730347179930432e-05, + "loss": 2.7425, + "step": 24776 + }, + { + "epoch": 1.1535488977349442, + "grad_norm": 0.3222123326952126, + "learning_rate": 7.730120254250016e-05, + "loss": 2.7592, + "step": 24777 + }, + { + "epoch": 1.1535954559210373, + "grad_norm": 0.3390613457072739, + "learning_rate": 7.729893320556964e-05, + "loss": 2.8323, + "step": 24778 + }, + { + "epoch": 1.1536420141071304, + "grad_norm": 0.3435289877347848, + "learning_rate": 7.729666378851946e-05, + "loss": 2.8825, + "step": 24779 + }, + { + "epoch": 1.1536885722932235, + "grad_norm": 0.3367177466391609, + "learning_rate": 7.729439429135625e-05, + "loss": 2.8146, + "step": 24780 + }, + { + "epoch": 1.1537351304793164, + "grad_norm": 0.3546456461901762, + "learning_rate": 7.729212471408666e-05, + "loss": 2.7874, + "step": 24781 + }, + { + "epoch": 1.1537816886654095, + "grad_norm": 0.3191724731398402, + "learning_rate": 7.72898550567174e-05, + "loss": 2.7858, + "step": 24782 + }, + { + "epoch": 1.1538282468515026, + "grad_norm": 0.3415722023725058, + "learning_rate": 7.728758531925511e-05, + "loss": 2.8574, + "step": 24783 + }, + { + "epoch": 1.1538748050375958, + "grad_norm": 0.39149940048061627, + "learning_rate": 7.728531550170643e-05, + "loss": 2.8461, + "step": 24784 + }, + { + "epoch": 1.1539213632236889, + "grad_norm": 0.3348393159520362, + "learning_rate": 7.728304560407805e-05, + "loss": 2.8474, + "step": 24785 + }, + { + "epoch": 1.153967921409782, + "grad_norm": 0.3616377701945618, + "learning_rate": 7.72807756263766e-05, + "loss": 2.7553, + "step": 24786 + }, + { + "epoch": 1.1540144795958749, + "grad_norm": 0.32270908973366985, + "learning_rate": 7.727850556860876e-05, + "loss": 2.7705, + "step": 24787 + }, + { + "epoch": 1.154061037781968, + "grad_norm": 0.33783789146634907, + "learning_rate": 7.727623543078119e-05, + "loss": 2.7949, + "step": 24788 + }, + { + "epoch": 1.1541075959680611, + "grad_norm": 0.32576573060157027, + "learning_rate": 7.727396521290058e-05, + "loss": 2.6985, + "step": 24789 + }, + { + "epoch": 1.1541541541541542, + "grad_norm": 0.3457124796646495, + "learning_rate": 7.727169491497356e-05, + "loss": 2.6524, + "step": 24790 + }, + { + "epoch": 1.1542007123402471, + "grad_norm": 0.30854254438393197, + "learning_rate": 7.726942453700681e-05, + "loss": 2.8456, + "step": 24791 + }, + { + "epoch": 
1.1542472705263402, + "grad_norm": 0.3247725084448067, + "learning_rate": 7.726715407900696e-05, + "loss": 2.8916, + "step": 24792 + }, + { + "epoch": 1.1542938287124334, + "grad_norm": 0.3439112161793135, + "learning_rate": 7.726488354098072e-05, + "loss": 2.7738, + "step": 24793 + }, + { + "epoch": 1.1543403868985265, + "grad_norm": 0.30242545589365993, + "learning_rate": 7.726261292293473e-05, + "loss": 2.9003, + "step": 24794 + }, + { + "epoch": 1.1543869450846196, + "grad_norm": 0.30544916734773747, + "learning_rate": 7.726034222487564e-05, + "loss": 2.7964, + "step": 24795 + }, + { + "epoch": 1.1544335032707125, + "grad_norm": 0.31863633710958156, + "learning_rate": 7.725807144681015e-05, + "loss": 2.8329, + "step": 24796 + }, + { + "epoch": 1.1544800614568056, + "grad_norm": 0.30302783048478177, + "learning_rate": 7.725580058874491e-05, + "loss": 2.8398, + "step": 24797 + }, + { + "epoch": 1.1545266196428987, + "grad_norm": 0.32238361071689087, + "learning_rate": 7.725352965068658e-05, + "loss": 2.8255, + "step": 24798 + }, + { + "epoch": 1.1545731778289918, + "grad_norm": 0.3049942523982731, + "learning_rate": 7.72512586326418e-05, + "loss": 2.7563, + "step": 24799 + }, + { + "epoch": 1.154619736015085, + "grad_norm": 0.31890529993325556, + "learning_rate": 7.724898753461729e-05, + "loss": 2.8558, + "step": 24800 + }, + { + "epoch": 1.1546662942011778, + "grad_norm": 0.31115367925438886, + "learning_rate": 7.724671635661967e-05, + "loss": 2.8375, + "step": 24801 + }, + { + "epoch": 1.154712852387271, + "grad_norm": 0.31661633038425074, + "learning_rate": 7.724444509865561e-05, + "loss": 2.8143, + "step": 24802 + }, + { + "epoch": 1.154759410573364, + "grad_norm": 0.3097329098097371, + "learning_rate": 7.724217376073181e-05, + "loss": 2.7026, + "step": 24803 + }, + { + "epoch": 1.1548059687594572, + "grad_norm": 0.31814838717466914, + "learning_rate": 7.723990234285489e-05, + "loss": 2.8286, + "step": 24804 + }, + { + "epoch": 1.1548525269455503, + "grad_norm": 0.32324339660069085, + "learning_rate": 7.723763084503157e-05, + "loss": 2.8985, + "step": 24805 + }, + { + "epoch": 1.1548990851316432, + "grad_norm": 0.34149649474921134, + "learning_rate": 7.723535926726848e-05, + "loss": 2.8147, + "step": 24806 + }, + { + "epoch": 1.1549456433177363, + "grad_norm": 0.34125684951100443, + "learning_rate": 7.723308760957229e-05, + "loss": 2.8154, + "step": 24807 + }, + { + "epoch": 1.1549922015038294, + "grad_norm": 0.34240578374832054, + "learning_rate": 7.723081587194966e-05, + "loss": 2.776, + "step": 24808 + }, + { + "epoch": 1.1550387596899225, + "grad_norm": 0.3443221197071118, + "learning_rate": 7.722854405440728e-05, + "loss": 2.8022, + "step": 24809 + }, + { + "epoch": 1.1550853178760154, + "grad_norm": 0.36481381161317916, + "learning_rate": 7.72262721569518e-05, + "loss": 2.8277, + "step": 24810 + }, + { + "epoch": 1.1551318760621085, + "grad_norm": 0.3328534879261192, + "learning_rate": 7.72240001795899e-05, + "loss": 2.7776, + "step": 24811 + }, + { + "epoch": 1.1551784342482017, + "grad_norm": 0.3495713694913831, + "learning_rate": 7.722172812232823e-05, + "loss": 2.8195, + "step": 24812 + }, + { + "epoch": 1.1552249924342948, + "grad_norm": 0.346837801076393, + "learning_rate": 7.721945598517347e-05, + "loss": 2.8066, + "step": 24813 + }, + { + "epoch": 1.1552715506203879, + "grad_norm": 0.3381749362066462, + "learning_rate": 7.72171837681323e-05, + "loss": 2.7898, + "step": 24814 + }, + { + "epoch": 1.155318108806481, + "grad_norm": 0.41289652978331315, + "learning_rate": 
7.721491147121137e-05, + "loss": 2.7903, + "step": 24815 + }, + { + "epoch": 1.155364666992574, + "grad_norm": 0.34365316003161966, + "learning_rate": 7.721263909441737e-05, + "loss": 2.8394, + "step": 24816 + }, + { + "epoch": 1.155411225178667, + "grad_norm": 0.384733519313404, + "learning_rate": 7.721036663775694e-05, + "loss": 2.8052, + "step": 24817 + }, + { + "epoch": 1.1554577833647601, + "grad_norm": 0.40912959141023314, + "learning_rate": 7.720809410123677e-05, + "loss": 2.8277, + "step": 24818 + }, + { + "epoch": 1.1555043415508532, + "grad_norm": 0.3432249392483784, + "learning_rate": 7.720582148486352e-05, + "loss": 2.8099, + "step": 24819 + }, + { + "epoch": 1.1555508997369461, + "grad_norm": 0.38904361636677615, + "learning_rate": 7.720354878864387e-05, + "loss": 2.7802, + "step": 24820 + }, + { + "epoch": 1.1555974579230393, + "grad_norm": 0.3297962642769426, + "learning_rate": 7.720127601258447e-05, + "loss": 2.8425, + "step": 24821 + }, + { + "epoch": 1.1556440161091324, + "grad_norm": 0.35566462157197626, + "learning_rate": 7.719900315669202e-05, + "loss": 2.8729, + "step": 24822 + }, + { + "epoch": 1.1556905742952255, + "grad_norm": 0.3414234850699333, + "learning_rate": 7.719673022097316e-05, + "loss": 2.8555, + "step": 24823 + }, + { + "epoch": 1.1557371324813186, + "grad_norm": 0.34315002286117435, + "learning_rate": 7.719445720543458e-05, + "loss": 2.7801, + "step": 24824 + }, + { + "epoch": 1.1557836906674117, + "grad_norm": 0.36815176858951604, + "learning_rate": 7.719218411008295e-05, + "loss": 2.8123, + "step": 24825 + }, + { + "epoch": 1.1558302488535046, + "grad_norm": 0.3652245242036294, + "learning_rate": 7.718991093492495e-05, + "loss": 2.8935, + "step": 24826 + }, + { + "epoch": 1.1558768070395977, + "grad_norm": 0.3889511559761762, + "learning_rate": 7.718763767996724e-05, + "loss": 2.905, + "step": 24827 + }, + { + "epoch": 1.1559233652256908, + "grad_norm": 0.3148063652834636, + "learning_rate": 7.718536434521649e-05, + "loss": 2.7785, + "step": 24828 + }, + { + "epoch": 1.155969923411784, + "grad_norm": 0.35500885699879287, + "learning_rate": 7.718309093067936e-05, + "loss": 2.8221, + "step": 24829 + }, + { + "epoch": 1.1560164815978768, + "grad_norm": 0.33818532906030746, + "learning_rate": 7.718081743636254e-05, + "loss": 2.9082, + "step": 24830 + }, + { + "epoch": 1.15606303978397, + "grad_norm": 0.32379027997635185, + "learning_rate": 7.717854386227271e-05, + "loss": 2.7662, + "step": 24831 + }, + { + "epoch": 1.156109597970063, + "grad_norm": 0.3308000281658405, + "learning_rate": 7.717627020841651e-05, + "loss": 2.8356, + "step": 24832 + }, + { + "epoch": 1.1561561561561562, + "grad_norm": 0.3071564926267477, + "learning_rate": 7.717399647480065e-05, + "loss": 2.7877, + "step": 24833 + }, + { + "epoch": 1.1562027143422493, + "grad_norm": 0.3439213018691367, + "learning_rate": 7.717172266143178e-05, + "loss": 2.7657, + "step": 24834 + }, + { + "epoch": 1.1562492725283424, + "grad_norm": 0.33098517671665384, + "learning_rate": 7.71694487683166e-05, + "loss": 2.7821, + "step": 24835 + }, + { + "epoch": 1.1562958307144353, + "grad_norm": 0.3086431904523378, + "learning_rate": 7.716717479546174e-05, + "loss": 2.7741, + "step": 24836 + }, + { + "epoch": 1.1563423889005284, + "grad_norm": 0.32910805180266056, + "learning_rate": 7.716490074287392e-05, + "loss": 2.9067, + "step": 24837 + }, + { + "epoch": 1.1563889470866215, + "grad_norm": 0.307581873058548, + "learning_rate": 7.716262661055978e-05, + "loss": 2.7675, + "step": 24838 + }, + { + "epoch": 
1.1564355052727147, + "grad_norm": 0.32825311680521924, + "learning_rate": 7.7160352398526e-05, + "loss": 2.8405, + "step": 24839 + }, + { + "epoch": 1.1564820634588076, + "grad_norm": 0.3124688615401199, + "learning_rate": 7.715807810677926e-05, + "loss": 2.769, + "step": 24840 + }, + { + "epoch": 1.1565286216449007, + "grad_norm": 0.31823885907976524, + "learning_rate": 7.715580373532626e-05, + "loss": 2.7889, + "step": 24841 + }, + { + "epoch": 1.1565751798309938, + "grad_norm": 0.34567658597655854, + "learning_rate": 7.715352928417363e-05, + "loss": 2.7079, + "step": 24842 + }, + { + "epoch": 1.156621738017087, + "grad_norm": 0.3110091243189706, + "learning_rate": 7.715125475332807e-05, + "loss": 2.8755, + "step": 24843 + }, + { + "epoch": 1.15666829620318, + "grad_norm": 0.3391149332964205, + "learning_rate": 7.714898014279626e-05, + "loss": 2.7547, + "step": 24844 + }, + { + "epoch": 1.156714854389273, + "grad_norm": 0.3428393714636109, + "learning_rate": 7.714670545258487e-05, + "loss": 2.77, + "step": 24845 + }, + { + "epoch": 1.156761412575366, + "grad_norm": 0.3388656834964711, + "learning_rate": 7.714443068270057e-05, + "loss": 2.8991, + "step": 24846 + }, + { + "epoch": 1.1568079707614591, + "grad_norm": 0.33292198603842704, + "learning_rate": 7.714215583315003e-05, + "loss": 2.7965, + "step": 24847 + }, + { + "epoch": 1.1568545289475523, + "grad_norm": 0.32493333678105313, + "learning_rate": 7.713988090393996e-05, + "loss": 2.7485, + "step": 24848 + }, + { + "epoch": 1.1569010871336454, + "grad_norm": 0.3370969958513304, + "learning_rate": 7.7137605895077e-05, + "loss": 2.8185, + "step": 24849 + }, + { + "epoch": 1.1569476453197383, + "grad_norm": 0.3425442576210582, + "learning_rate": 7.713533080656783e-05, + "loss": 2.8961, + "step": 24850 + }, + { + "epoch": 1.1569942035058314, + "grad_norm": 0.3352623803393919, + "learning_rate": 7.713305563841915e-05, + "loss": 2.6375, + "step": 24851 + }, + { + "epoch": 1.1570407616919245, + "grad_norm": 0.3308364657608359, + "learning_rate": 7.713078039063761e-05, + "loss": 2.8212, + "step": 24852 + }, + { + "epoch": 1.1570873198780176, + "grad_norm": 0.31018526918918476, + "learning_rate": 7.712850506322991e-05, + "loss": 2.87, + "step": 24853 + }, + { + "epoch": 1.1571338780641107, + "grad_norm": 0.34114809099628873, + "learning_rate": 7.712622965620273e-05, + "loss": 2.7449, + "step": 24854 + }, + { + "epoch": 1.1571804362502036, + "grad_norm": 0.324854047687415, + "learning_rate": 7.712395416956272e-05, + "loss": 2.686, + "step": 24855 + }, + { + "epoch": 1.1572269944362967, + "grad_norm": 0.3249538300508521, + "learning_rate": 7.712167860331659e-05, + "loss": 2.837, + "step": 24856 + }, + { + "epoch": 1.1572735526223898, + "grad_norm": 0.31965620019213437, + "learning_rate": 7.7119402957471e-05, + "loss": 2.7304, + "step": 24857 + }, + { + "epoch": 1.157320110808483, + "grad_norm": 0.31678816005183824, + "learning_rate": 7.711712723203264e-05, + "loss": 2.8203, + "step": 24858 + }, + { + "epoch": 1.1573666689945759, + "grad_norm": 0.3280771286814943, + "learning_rate": 7.711485142700818e-05, + "loss": 2.847, + "step": 24859 + }, + { + "epoch": 1.157413227180669, + "grad_norm": 0.3276098282281702, + "learning_rate": 7.71125755424043e-05, + "loss": 2.7621, + "step": 24860 + }, + { + "epoch": 1.157459785366762, + "grad_norm": 0.31826294568314406, + "learning_rate": 7.711029957822769e-05, + "loss": 2.8582, + "step": 24861 + }, + { + "epoch": 1.1575063435528552, + "grad_norm": 0.3070786662832804, + "learning_rate": 
7.710802353448502e-05, + "loss": 2.7758, + "step": 24862 + }, + { + "epoch": 1.1575529017389483, + "grad_norm": 0.31343120914138994, + "learning_rate": 7.710574741118297e-05, + "loss": 2.834, + "step": 24863 + }, + { + "epoch": 1.1575994599250414, + "grad_norm": 0.30070159762289295, + "learning_rate": 7.710347120832821e-05, + "loss": 2.7825, + "step": 24864 + }, + { + "epoch": 1.1576460181111343, + "grad_norm": 0.3390914984051213, + "learning_rate": 7.710119492592744e-05, + "loss": 2.8943, + "step": 24865 + }, + { + "epoch": 1.1576925762972274, + "grad_norm": 0.3205600098711156, + "learning_rate": 7.709891856398734e-05, + "loss": 2.7813, + "step": 24866 + }, + { + "epoch": 1.1577391344833206, + "grad_norm": 0.31841095568041916, + "learning_rate": 7.709664212251458e-05, + "loss": 2.7637, + "step": 24867 + }, + { + "epoch": 1.1577856926694137, + "grad_norm": 0.32244288661645787, + "learning_rate": 7.709436560151584e-05, + "loss": 2.7963, + "step": 24868 + }, + { + "epoch": 1.1578322508555066, + "grad_norm": 0.34515974824786316, + "learning_rate": 7.70920890009978e-05, + "loss": 2.8677, + "step": 24869 + }, + { + "epoch": 1.1578788090415997, + "grad_norm": 0.3341949419703212, + "learning_rate": 7.708981232096717e-05, + "loss": 2.8291, + "step": 24870 + }, + { + "epoch": 1.1579253672276928, + "grad_norm": 0.3269920977655545, + "learning_rate": 7.708753556143057e-05, + "loss": 2.8319, + "step": 24871 + }, + { + "epoch": 1.157971925413786, + "grad_norm": 0.33528686124310764, + "learning_rate": 7.708525872239477e-05, + "loss": 2.8509, + "step": 24872 + }, + { + "epoch": 1.158018483599879, + "grad_norm": 0.3182462875727565, + "learning_rate": 7.708298180386637e-05, + "loss": 2.7604, + "step": 24873 + }, + { + "epoch": 1.1580650417859721, + "grad_norm": 0.3124041489049159, + "learning_rate": 7.708070480585211e-05, + "loss": 2.7525, + "step": 24874 + }, + { + "epoch": 1.158111599972065, + "grad_norm": 0.33507966631925534, + "learning_rate": 7.707842772835863e-05, + "loss": 2.8181, + "step": 24875 + }, + { + "epoch": 1.1581581581581581, + "grad_norm": 0.3194085072423544, + "learning_rate": 7.707615057139264e-05, + "loss": 2.8738, + "step": 24876 + }, + { + "epoch": 1.1582047163442513, + "grad_norm": 0.3314333733342211, + "learning_rate": 7.707387333496083e-05, + "loss": 2.7956, + "step": 24877 + }, + { + "epoch": 1.1582512745303444, + "grad_norm": 0.3193593192181125, + "learning_rate": 7.707159601906984e-05, + "loss": 2.7574, + "step": 24878 + }, + { + "epoch": 1.1582978327164373, + "grad_norm": 0.3063647053921446, + "learning_rate": 7.706931862372639e-05, + "loss": 2.8122, + "step": 24879 + }, + { + "epoch": 1.1583443909025304, + "grad_norm": 0.2905710668139983, + "learning_rate": 7.706704114893718e-05, + "loss": 2.7649, + "step": 24880 + }, + { + "epoch": 1.1583909490886235, + "grad_norm": 0.3167341892036761, + "learning_rate": 7.706476359470886e-05, + "loss": 2.936, + "step": 24881 + }, + { + "epoch": 1.1584375072747166, + "grad_norm": 0.3099079854880104, + "learning_rate": 7.706248596104811e-05, + "loss": 2.8698, + "step": 24882 + }, + { + "epoch": 1.1584840654608097, + "grad_norm": 0.3114814574032243, + "learning_rate": 7.706020824796164e-05, + "loss": 2.7621, + "step": 24883 + }, + { + "epoch": 1.1585306236469026, + "grad_norm": 0.3005110307150797, + "learning_rate": 7.705793045545612e-05, + "loss": 2.7743, + "step": 24884 + }, + { + "epoch": 1.1585771818329957, + "grad_norm": 0.3261923280354732, + "learning_rate": 7.705565258353825e-05, + "loss": 2.8643, + "step": 24885 + }, + { + "epoch": 
1.1586237400190889, + "grad_norm": 0.29896522978948026, + "learning_rate": 7.705337463221469e-05, + "loss": 2.9377, + "step": 24886 + }, + { + "epoch": 1.158670298205182, + "grad_norm": 0.3390637501128864, + "learning_rate": 7.705109660149215e-05, + "loss": 2.9519, + "step": 24887 + }, + { + "epoch": 1.158716856391275, + "grad_norm": 0.3162942716113636, + "learning_rate": 7.704881849137731e-05, + "loss": 2.791, + "step": 24888 + }, + { + "epoch": 1.158763414577368, + "grad_norm": 0.3208204237824735, + "learning_rate": 7.704654030187684e-05, + "loss": 2.7263, + "step": 24889 + }, + { + "epoch": 1.158809972763461, + "grad_norm": 0.3390304287385094, + "learning_rate": 7.704426203299746e-05, + "loss": 2.9004, + "step": 24890 + }, + { + "epoch": 1.1588565309495542, + "grad_norm": 0.38298657907887335, + "learning_rate": 7.704198368474582e-05, + "loss": 2.7943, + "step": 24891 + }, + { + "epoch": 1.1589030891356473, + "grad_norm": 0.3219139198229874, + "learning_rate": 7.703970525712862e-05, + "loss": 2.8443, + "step": 24892 + }, + { + "epoch": 1.1589496473217404, + "grad_norm": 0.36606326374529496, + "learning_rate": 7.703742675015255e-05, + "loss": 2.8672, + "step": 24893 + }, + { + "epoch": 1.1589962055078333, + "grad_norm": 0.3290399093583573, + "learning_rate": 7.70351481638243e-05, + "loss": 2.7946, + "step": 24894 + }, + { + "epoch": 1.1590427636939264, + "grad_norm": 0.32340577851846103, + "learning_rate": 7.703286949815057e-05, + "loss": 2.8484, + "step": 24895 + }, + { + "epoch": 1.1590893218800196, + "grad_norm": 0.34011630285192873, + "learning_rate": 7.7030590753138e-05, + "loss": 2.8654, + "step": 24896 + }, + { + "epoch": 1.1591358800661127, + "grad_norm": 0.3359309840665489, + "learning_rate": 7.702831192879332e-05, + "loss": 2.7316, + "step": 24897 + }, + { + "epoch": 1.1591824382522056, + "grad_norm": 0.3539451850691031, + "learning_rate": 7.702603302512321e-05, + "loss": 2.8329, + "step": 24898 + }, + { + "epoch": 1.1592289964382987, + "grad_norm": 0.3584460700688841, + "learning_rate": 7.702375404213437e-05, + "loss": 2.8046, + "step": 24899 + }, + { + "epoch": 1.1592755546243918, + "grad_norm": 0.33077266300775826, + "learning_rate": 7.702147497983345e-05, + "loss": 2.7991, + "step": 24900 + }, + { + "epoch": 1.159322112810485, + "grad_norm": 0.3510703228192557, + "learning_rate": 7.701919583822717e-05, + "loss": 2.7688, + "step": 24901 + }, + { + "epoch": 1.159368670996578, + "grad_norm": 0.32257956506974617, + "learning_rate": 7.701691661732221e-05, + "loss": 2.6993, + "step": 24902 + }, + { + "epoch": 1.1594152291826711, + "grad_norm": 0.34135215921685547, + "learning_rate": 7.701463731712528e-05, + "loss": 2.7357, + "step": 24903 + }, + { + "epoch": 1.159461787368764, + "grad_norm": 0.38185882154055495, + "learning_rate": 7.701235793764304e-05, + "loss": 2.8777, + "step": 24904 + }, + { + "epoch": 1.1595083455548572, + "grad_norm": 0.34483072430670464, + "learning_rate": 7.701007847888219e-05, + "loss": 2.7727, + "step": 24905 + }, + { + "epoch": 1.1595549037409503, + "grad_norm": 0.3699696967201602, + "learning_rate": 7.700779894084942e-05, + "loss": 2.7339, + "step": 24906 + }, + { + "epoch": 1.1596014619270434, + "grad_norm": 0.34890096848808316, + "learning_rate": 7.70055193235514e-05, + "loss": 2.7347, + "step": 24907 + }, + { + "epoch": 1.1596480201131363, + "grad_norm": 0.33936160453180336, + "learning_rate": 7.700323962699488e-05, + "loss": 2.8332, + "step": 24908 + }, + { + "epoch": 1.1596945782992294, + "grad_norm": 0.38586106042963053, + "learning_rate": 
7.700095985118649e-05, + "loss": 2.8462, + "step": 24909 + }, + { + "epoch": 1.1597411364853225, + "grad_norm": 0.3268670610850222, + "learning_rate": 7.699867999613294e-05, + "loss": 2.8353, + "step": 24910 + }, + { + "epoch": 1.1597876946714156, + "grad_norm": 0.36681160413681924, + "learning_rate": 7.699640006184092e-05, + "loss": 2.7082, + "step": 24911 + }, + { + "epoch": 1.1598342528575087, + "grad_norm": 0.33304081198200947, + "learning_rate": 7.699412004831714e-05, + "loss": 2.8799, + "step": 24912 + }, + { + "epoch": 1.1598808110436019, + "grad_norm": 0.3508367953105334, + "learning_rate": 7.699183995556828e-05, + "loss": 2.7653, + "step": 24913 + }, + { + "epoch": 1.1599273692296947, + "grad_norm": 0.3883065011633208, + "learning_rate": 7.698955978360101e-05, + "loss": 2.7736, + "step": 24914 + }, + { + "epoch": 1.1599739274157879, + "grad_norm": 0.36375229552641836, + "learning_rate": 7.698727953242206e-05, + "loss": 2.8687, + "step": 24915 + }, + { + "epoch": 1.160020485601881, + "grad_norm": 0.4248314613575151, + "learning_rate": 7.698499920203809e-05, + "loss": 2.8152, + "step": 24916 + }, + { + "epoch": 1.160067043787974, + "grad_norm": 0.3315292582310624, + "learning_rate": 7.698271879245582e-05, + "loss": 2.7732, + "step": 24917 + }, + { + "epoch": 1.160113601974067, + "grad_norm": 0.4076775961731668, + "learning_rate": 7.698043830368192e-05, + "loss": 2.7527, + "step": 24918 + }, + { + "epoch": 1.16016016016016, + "grad_norm": 0.3334661872635092, + "learning_rate": 7.697815773572309e-05, + "loss": 2.8097, + "step": 24919 + }, + { + "epoch": 1.1602067183462532, + "grad_norm": 0.4098406748730538, + "learning_rate": 7.697587708858602e-05, + "loss": 2.9173, + "step": 24920 + }, + { + "epoch": 1.1602532765323463, + "grad_norm": 0.3144653399465887, + "learning_rate": 7.697359636227741e-05, + "loss": 2.8742, + "step": 24921 + }, + { + "epoch": 1.1602998347184394, + "grad_norm": 0.3986359338013452, + "learning_rate": 7.697131555680397e-05, + "loss": 2.865, + "step": 24922 + }, + { + "epoch": 1.1603463929045326, + "grad_norm": 0.3593212991999608, + "learning_rate": 7.696903467217236e-05, + "loss": 2.7554, + "step": 24923 + }, + { + "epoch": 1.1603929510906255, + "grad_norm": 0.38009095274514887, + "learning_rate": 7.696675370838929e-05, + "loss": 2.8181, + "step": 24924 + }, + { + "epoch": 1.1604395092767186, + "grad_norm": 0.3409168497385155, + "learning_rate": 7.696447266546145e-05, + "loss": 2.8835, + "step": 24925 + }, + { + "epoch": 1.1604860674628117, + "grad_norm": 0.40034327387347646, + "learning_rate": 7.696219154339555e-05, + "loss": 2.8434, + "step": 24926 + }, + { + "epoch": 1.1605326256489048, + "grad_norm": 0.3725575304778479, + "learning_rate": 7.695991034219828e-05, + "loss": 2.7843, + "step": 24927 + }, + { + "epoch": 1.1605791838349977, + "grad_norm": 0.3409432383675532, + "learning_rate": 7.695762906187631e-05, + "loss": 2.9244, + "step": 24928 + }, + { + "epoch": 1.1606257420210908, + "grad_norm": 0.39859787976132727, + "learning_rate": 7.695534770243638e-05, + "loss": 2.7246, + "step": 24929 + }, + { + "epoch": 1.160672300207184, + "grad_norm": 0.32815660961168464, + "learning_rate": 7.695306626388512e-05, + "loss": 2.7686, + "step": 24930 + }, + { + "epoch": 1.160718858393277, + "grad_norm": 0.38963020140151283, + "learning_rate": 7.69507847462293e-05, + "loss": 2.7601, + "step": 24931 + }, + { + "epoch": 1.1607654165793702, + "grad_norm": 0.3619817654262272, + "learning_rate": 7.694850314947557e-05, + "loss": 2.7522, + "step": 24932 + }, + { + "epoch": 
1.160811974765463, + "grad_norm": 0.40859828373810747, + "learning_rate": 7.694622147363065e-05, + "loss": 2.8374, + "step": 24933 + }, + { + "epoch": 1.1608585329515562, + "grad_norm": 0.335410772966112, + "learning_rate": 7.69439397187012e-05, + "loss": 2.8122, + "step": 24934 + }, + { + "epoch": 1.1609050911376493, + "grad_norm": 0.38649251904923904, + "learning_rate": 7.694165788469397e-05, + "loss": 2.9077, + "step": 24935 + }, + { + "epoch": 1.1609516493237424, + "grad_norm": 0.3606290119265617, + "learning_rate": 7.69393759716156e-05, + "loss": 2.805, + "step": 24936 + }, + { + "epoch": 1.1609982075098355, + "grad_norm": 0.34731460784554913, + "learning_rate": 7.693709397947283e-05, + "loss": 2.7759, + "step": 24937 + }, + { + "epoch": 1.1610447656959284, + "grad_norm": 0.34674908766841844, + "learning_rate": 7.693481190827234e-05, + "loss": 2.7499, + "step": 24938 + }, + { + "epoch": 1.1610913238820215, + "grad_norm": 0.32711835592375843, + "learning_rate": 7.693252975802083e-05, + "loss": 2.9267, + "step": 24939 + }, + { + "epoch": 1.1611378820681146, + "grad_norm": 0.3718267478338295, + "learning_rate": 7.693024752872501e-05, + "loss": 2.8793, + "step": 24940 + }, + { + "epoch": 1.1611844402542078, + "grad_norm": 0.312515172898655, + "learning_rate": 7.692796522039156e-05, + "loss": 2.8362, + "step": 24941 + }, + { + "epoch": 1.1612309984403009, + "grad_norm": 0.35322302381108545, + "learning_rate": 7.69256828330272e-05, + "loss": 2.7699, + "step": 24942 + }, + { + "epoch": 1.1612775566263938, + "grad_norm": 0.307707579980769, + "learning_rate": 7.692340036663857e-05, + "loss": 2.7307, + "step": 24943 + }, + { + "epoch": 1.1613241148124869, + "grad_norm": 0.3604181838041388, + "learning_rate": 7.692111782123246e-05, + "loss": 2.7341, + "step": 24944 + }, + { + "epoch": 1.16137067299858, + "grad_norm": 0.28439723339957174, + "learning_rate": 7.691883519681549e-05, + "loss": 2.6187, + "step": 24945 + }, + { + "epoch": 1.161417231184673, + "grad_norm": 0.3541616349645986, + "learning_rate": 7.69165524933944e-05, + "loss": 2.8254, + "step": 24946 + }, + { + "epoch": 1.161463789370766, + "grad_norm": 0.31701532110959635, + "learning_rate": 7.691426971097587e-05, + "loss": 2.8886, + "step": 24947 + }, + { + "epoch": 1.1615103475568591, + "grad_norm": 0.3334357441398717, + "learning_rate": 7.691198684956662e-05, + "loss": 2.7681, + "step": 24948 + }, + { + "epoch": 1.1615569057429522, + "grad_norm": 0.3380092260093168, + "learning_rate": 7.690970390917335e-05, + "loss": 2.7193, + "step": 24949 + }, + { + "epoch": 1.1616034639290453, + "grad_norm": 0.30340472674547836, + "learning_rate": 7.690742088980273e-05, + "loss": 2.8447, + "step": 24950 + }, + { + "epoch": 1.1616500221151385, + "grad_norm": 0.3373609473229237, + "learning_rate": 7.690513779146149e-05, + "loss": 2.8564, + "step": 24951 + }, + { + "epoch": 1.1616965803012316, + "grad_norm": 0.31092772748901604, + "learning_rate": 7.690285461415634e-05, + "loss": 2.7646, + "step": 24952 + }, + { + "epoch": 1.1617431384873245, + "grad_norm": 0.35807651214668246, + "learning_rate": 7.690057135789394e-05, + "loss": 2.831, + "step": 24953 + }, + { + "epoch": 1.1617896966734176, + "grad_norm": 0.32980838079926483, + "learning_rate": 7.689828802268102e-05, + "loss": 2.8389, + "step": 24954 + }, + { + "epoch": 1.1618362548595107, + "grad_norm": 0.32788482698806953, + "learning_rate": 7.689600460852426e-05, + "loss": 2.7509, + "step": 24955 + }, + { + "epoch": 1.1618828130456038, + "grad_norm": 0.33006450496830253, + "learning_rate": 
7.68937211154304e-05, + "loss": 2.723, + "step": 24956 + }, + { + "epoch": 1.1619293712316967, + "grad_norm": 0.3585651435795517, + "learning_rate": 7.68914375434061e-05, + "loss": 2.8847, + "step": 24957 + }, + { + "epoch": 1.1619759294177898, + "grad_norm": 0.3336674915027993, + "learning_rate": 7.68891538924581e-05, + "loss": 2.8133, + "step": 24958 + }, + { + "epoch": 1.162022487603883, + "grad_norm": 0.32294739807286843, + "learning_rate": 7.688687016259306e-05, + "loss": 2.7349, + "step": 24959 + }, + { + "epoch": 1.162069045789976, + "grad_norm": 0.3174580576096802, + "learning_rate": 7.688458635381772e-05, + "loss": 2.7889, + "step": 24960 + }, + { + "epoch": 1.1621156039760692, + "grad_norm": 0.31328277422441625, + "learning_rate": 7.688230246613875e-05, + "loss": 2.8341, + "step": 24961 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.30525983213186897, + "learning_rate": 7.688001849956288e-05, + "loss": 2.7446, + "step": 24962 + }, + { + "epoch": 1.1622087203482552, + "grad_norm": 0.3030167985513494, + "learning_rate": 7.68777344540968e-05, + "loss": 2.784, + "step": 24963 + }, + { + "epoch": 1.1622552785343483, + "grad_norm": 0.3298301674605544, + "learning_rate": 7.687545032974723e-05, + "loss": 2.8593, + "step": 24964 + }, + { + "epoch": 1.1623018367204414, + "grad_norm": 0.32924747030121443, + "learning_rate": 7.687316612652086e-05, + "loss": 2.9063, + "step": 24965 + }, + { + "epoch": 1.1623483949065345, + "grad_norm": 0.31017552275926613, + "learning_rate": 7.687088184442439e-05, + "loss": 2.6947, + "step": 24966 + }, + { + "epoch": 1.1623949530926274, + "grad_norm": 0.3096928383916843, + "learning_rate": 7.686859748346453e-05, + "loss": 2.7516, + "step": 24967 + }, + { + "epoch": 1.1624415112787205, + "grad_norm": 0.33438434108441506, + "learning_rate": 7.686631304364798e-05, + "loss": 2.7672, + "step": 24968 + }, + { + "epoch": 1.1624880694648136, + "grad_norm": 0.2911581885720044, + "learning_rate": 7.686402852498145e-05, + "loss": 2.8028, + "step": 24969 + }, + { + "epoch": 1.1625346276509068, + "grad_norm": 0.3093058575009137, + "learning_rate": 7.686174392747164e-05, + "loss": 2.6739, + "step": 24970 + }, + { + "epoch": 1.1625811858369999, + "grad_norm": 0.3386138165423734, + "learning_rate": 7.685945925112525e-05, + "loss": 2.8188, + "step": 24971 + }, + { + "epoch": 1.1626277440230928, + "grad_norm": 0.314193017030752, + "learning_rate": 7.685717449594901e-05, + "loss": 2.7865, + "step": 24972 + }, + { + "epoch": 1.1626743022091859, + "grad_norm": 0.34816201029421245, + "learning_rate": 7.68548896619496e-05, + "loss": 2.7593, + "step": 24973 + }, + { + "epoch": 1.162720860395279, + "grad_norm": 0.3088821620256311, + "learning_rate": 7.685260474913373e-05, + "loss": 2.9133, + "step": 24974 + }, + { + "epoch": 1.1627674185813721, + "grad_norm": 0.3651461549946969, + "learning_rate": 7.685031975750812e-05, + "loss": 2.8179, + "step": 24975 + }, + { + "epoch": 1.1628139767674652, + "grad_norm": 0.33942597714353057, + "learning_rate": 7.684803468707946e-05, + "loss": 2.7518, + "step": 24976 + }, + { + "epoch": 1.1628605349535581, + "grad_norm": 0.34150916461138986, + "learning_rate": 7.684574953785447e-05, + "loss": 2.8994, + "step": 24977 + }, + { + "epoch": 1.1629070931396512, + "grad_norm": 0.35077761235975363, + "learning_rate": 7.684346430983985e-05, + "loss": 2.7256, + "step": 24978 + }, + { + "epoch": 1.1629536513257444, + "grad_norm": 0.3212841363011415, + "learning_rate": 7.68411790030423e-05, + "loss": 2.9086, + "step": 24979 + }, + { + "epoch": 
1.1630002095118375, + "grad_norm": 0.3565026033946436, + "learning_rate": 7.683889361746854e-05, + "loss": 2.8222, + "step": 24980 + }, + { + "epoch": 1.1630467676979306, + "grad_norm": 0.33322425763222197, + "learning_rate": 7.683660815312528e-05, + "loss": 2.8125, + "step": 24981 + }, + { + "epoch": 1.1630933258840235, + "grad_norm": 0.3556095760214961, + "learning_rate": 7.683432261001919e-05, + "loss": 2.8467, + "step": 24982 + }, + { + "epoch": 1.1631398840701166, + "grad_norm": 0.34369318543898486, + "learning_rate": 7.683203698815703e-05, + "loss": 2.7076, + "step": 24983 + }, + { + "epoch": 1.1631864422562097, + "grad_norm": 0.3735905799509682, + "learning_rate": 7.682975128754549e-05, + "loss": 2.7621, + "step": 24984 + }, + { + "epoch": 1.1632330004423028, + "grad_norm": 0.33822074197143126, + "learning_rate": 7.682746550819125e-05, + "loss": 2.9114, + "step": 24985 + }, + { + "epoch": 1.1632795586283957, + "grad_norm": 0.33864788918282707, + "learning_rate": 7.682517965010105e-05, + "loss": 2.8211, + "step": 24986 + }, + { + "epoch": 1.1633261168144888, + "grad_norm": 0.37118410556396497, + "learning_rate": 7.68228937132816e-05, + "loss": 2.8174, + "step": 24987 + }, + { + "epoch": 1.163372675000582, + "grad_norm": 0.37068338694976005, + "learning_rate": 7.682060769773958e-05, + "loss": 2.8871, + "step": 24988 + }, + { + "epoch": 1.163419233186675, + "grad_norm": 0.3584146186059052, + "learning_rate": 7.681832160348174e-05, + "loss": 2.8291, + "step": 24989 + }, + { + "epoch": 1.1634657913727682, + "grad_norm": 0.38361939703862297, + "learning_rate": 7.681603543051477e-05, + "loss": 2.7968, + "step": 24990 + }, + { + "epoch": 1.1635123495588613, + "grad_norm": 0.31698092086203183, + "learning_rate": 7.681374917884536e-05, + "loss": 2.8192, + "step": 24991 + }, + { + "epoch": 1.1635589077449542, + "grad_norm": 0.4062639890294243, + "learning_rate": 7.681146284848025e-05, + "loss": 2.7854, + "step": 24992 + }, + { + "epoch": 1.1636054659310473, + "grad_norm": 0.3725831361924801, + "learning_rate": 7.680917643942613e-05, + "loss": 2.9491, + "step": 24993 + }, + { + "epoch": 1.1636520241171404, + "grad_norm": 0.3405486775313027, + "learning_rate": 7.680688995168973e-05, + "loss": 2.735, + "step": 24994 + }, + { + "epoch": 1.1636985823032335, + "grad_norm": 0.342291725842192, + "learning_rate": 7.680460338527774e-05, + "loss": 2.8632, + "step": 24995 + }, + { + "epoch": 1.1637451404893264, + "grad_norm": 0.34597102474119235, + "learning_rate": 7.680231674019688e-05, + "loss": 2.7406, + "step": 24996 + }, + { + "epoch": 1.1637916986754195, + "grad_norm": 0.3268845524230492, + "learning_rate": 7.680003001645386e-05, + "loss": 2.7711, + "step": 24997 + }, + { + "epoch": 1.1638382568615127, + "grad_norm": 0.315266422685378, + "learning_rate": 7.679774321405539e-05, + "loss": 2.8839, + "step": 24998 + }, + { + "epoch": 1.1638848150476058, + "grad_norm": 0.36818352194475085, + "learning_rate": 7.67954563330082e-05, + "loss": 2.7183, + "step": 24999 + }, + { + "epoch": 1.1639313732336989, + "grad_norm": 0.3304923278865911, + "learning_rate": 7.679316937331897e-05, + "loss": 2.8175, + "step": 25000 + }, + { + "epoch": 1.163977931419792, + "grad_norm": 0.32189065046272275, + "learning_rate": 7.679088233499443e-05, + "loss": 2.7578, + "step": 25001 + }, + { + "epoch": 1.164024489605885, + "grad_norm": 0.33382692133966585, + "learning_rate": 7.678859521804127e-05, + "loss": 2.7449, + "step": 25002 + }, + { + "epoch": 1.164071047791978, + "grad_norm": 0.33354271707578487, + "learning_rate": 
7.678630802246626e-05, + "loss": 2.7987, + "step": 25003 + }, + { + "epoch": 1.1641176059780711, + "grad_norm": 0.3131412971367024, + "learning_rate": 7.678402074827605e-05, + "loss": 2.8306, + "step": 25004 + }, + { + "epoch": 1.1641641641641642, + "grad_norm": 0.356632565111525, + "learning_rate": 7.67817333954774e-05, + "loss": 2.7449, + "step": 25005 + }, + { + "epoch": 1.1642107223502571, + "grad_norm": 0.31230655524653844, + "learning_rate": 7.677944596407698e-05, + "loss": 2.6985, + "step": 25006 + }, + { + "epoch": 1.1642572805363502, + "grad_norm": 0.3393295533701042, + "learning_rate": 7.677715845408152e-05, + "loss": 2.7887, + "step": 25007 + }, + { + "epoch": 1.1643038387224434, + "grad_norm": 0.34072664812837167, + "learning_rate": 7.677487086549776e-05, + "loss": 2.8602, + "step": 25008 + }, + { + "epoch": 1.1643503969085365, + "grad_norm": 0.3251400573204341, + "learning_rate": 7.677258319833238e-05, + "loss": 2.8914, + "step": 25009 + }, + { + "epoch": 1.1643969550946296, + "grad_norm": 0.3128136415239921, + "learning_rate": 7.67702954525921e-05, + "loss": 2.8568, + "step": 25010 + }, + { + "epoch": 1.1644435132807225, + "grad_norm": 0.343261388420435, + "learning_rate": 7.676800762828365e-05, + "loss": 2.8215, + "step": 25011 + }, + { + "epoch": 1.1644900714668156, + "grad_norm": 0.3145654663155027, + "learning_rate": 7.676571972541374e-05, + "loss": 2.907, + "step": 25012 + }, + { + "epoch": 1.1645366296529087, + "grad_norm": 0.3423412766812871, + "learning_rate": 7.676343174398905e-05, + "loss": 2.869, + "step": 25013 + }, + { + "epoch": 1.1645831878390018, + "grad_norm": 0.3447222209115707, + "learning_rate": 7.676114368401635e-05, + "loss": 2.8665, + "step": 25014 + }, + { + "epoch": 1.164629746025095, + "grad_norm": 0.33034533800451066, + "learning_rate": 7.675885554550232e-05, + "loss": 2.7562, + "step": 25015 + }, + { + "epoch": 1.1646763042111878, + "grad_norm": 0.34003750646662156, + "learning_rate": 7.675656732845369e-05, + "loss": 2.8335, + "step": 25016 + }, + { + "epoch": 1.164722862397281, + "grad_norm": 0.34373181706726774, + "learning_rate": 7.675427903287717e-05, + "loss": 2.8389, + "step": 25017 + }, + { + "epoch": 1.164769420583374, + "grad_norm": 0.33637900141851723, + "learning_rate": 7.67519906587795e-05, + "loss": 2.8125, + "step": 25018 + }, + { + "epoch": 1.1648159787694672, + "grad_norm": 0.3274927248018022, + "learning_rate": 7.674970220616732e-05, + "loss": 2.767, + "step": 25019 + }, + { + "epoch": 1.1648625369555603, + "grad_norm": 0.33032503196068724, + "learning_rate": 7.674741367504744e-05, + "loss": 2.8594, + "step": 25020 + }, + { + "epoch": 1.1649090951416532, + "grad_norm": 0.29983352836656174, + "learning_rate": 7.674512506542653e-05, + "loss": 2.8296, + "step": 25021 + }, + { + "epoch": 1.1649556533277463, + "grad_norm": 0.33224820945251543, + "learning_rate": 7.67428363773113e-05, + "loss": 2.7638, + "step": 25022 + }, + { + "epoch": 1.1650022115138394, + "grad_norm": 0.32570797256372636, + "learning_rate": 7.674054761070849e-05, + "loss": 2.7808, + "step": 25023 + }, + { + "epoch": 1.1650487696999325, + "grad_norm": 0.2994666666658031, + "learning_rate": 7.673825876562481e-05, + "loss": 2.8286, + "step": 25024 + }, + { + "epoch": 1.1650953278860257, + "grad_norm": 0.329867166357322, + "learning_rate": 7.673596984206697e-05, + "loss": 2.729, + "step": 25025 + }, + { + "epoch": 1.1651418860721185, + "grad_norm": 0.3070273296567292, + "learning_rate": 7.673368084004169e-05, + "loss": 2.8634, + "step": 25026 + }, + { + "epoch": 
1.1651884442582117, + "grad_norm": 0.3072878674807352, + "learning_rate": 7.673139175955568e-05, + "loss": 2.859, + "step": 25027 + }, + { + "epoch": 1.1652350024443048, + "grad_norm": 0.32228381502038267, + "learning_rate": 7.672910260061568e-05, + "loss": 2.8534, + "step": 25028 + }, + { + "epoch": 1.165281560630398, + "grad_norm": 0.3017586896944558, + "learning_rate": 7.672681336322841e-05, + "loss": 2.8855, + "step": 25029 + }, + { + "epoch": 1.165328118816491, + "grad_norm": 0.330443974119625, + "learning_rate": 7.672452404740056e-05, + "loss": 2.7607, + "step": 25030 + }, + { + "epoch": 1.165374677002584, + "grad_norm": 0.31027162749828346, + "learning_rate": 7.672223465313888e-05, + "loss": 2.863, + "step": 25031 + }, + { + "epoch": 1.165421235188677, + "grad_norm": 0.3141420216205558, + "learning_rate": 7.671994518045006e-05, + "loss": 2.7903, + "step": 25032 + }, + { + "epoch": 1.1654677933747701, + "grad_norm": 0.31916824049123516, + "learning_rate": 7.671765562934083e-05, + "loss": 2.772, + "step": 25033 + }, + { + "epoch": 1.1655143515608632, + "grad_norm": 0.33241937777211344, + "learning_rate": 7.671536599981792e-05, + "loss": 2.8901, + "step": 25034 + }, + { + "epoch": 1.1655609097469561, + "grad_norm": 0.3193129430467811, + "learning_rate": 7.671307629188805e-05, + "loss": 2.7264, + "step": 25035 + }, + { + "epoch": 1.1656074679330493, + "grad_norm": 0.3163924141548303, + "learning_rate": 7.671078650555792e-05, + "loss": 2.7868, + "step": 25036 + }, + { + "epoch": 1.1656540261191424, + "grad_norm": 0.3100270651614407, + "learning_rate": 7.670849664083426e-05, + "loss": 2.7556, + "step": 25037 + }, + { + "epoch": 1.1657005843052355, + "grad_norm": 0.31512550995175226, + "learning_rate": 7.670620669772379e-05, + "loss": 2.7949, + "step": 25038 + }, + { + "epoch": 1.1657471424913286, + "grad_norm": 0.31814443578667984, + "learning_rate": 7.670391667623327e-05, + "loss": 2.8077, + "step": 25039 + }, + { + "epoch": 1.1657937006774217, + "grad_norm": 0.3274184388455481, + "learning_rate": 7.670162657636934e-05, + "loss": 2.8455, + "step": 25040 + }, + { + "epoch": 1.1658402588635146, + "grad_norm": 0.3188430643866392, + "learning_rate": 7.66993363981388e-05, + "loss": 2.8303, + "step": 25041 + }, + { + "epoch": 1.1658868170496077, + "grad_norm": 0.3186930913013015, + "learning_rate": 7.669704614154833e-05, + "loss": 2.8049, + "step": 25042 + }, + { + "epoch": 1.1659333752357008, + "grad_norm": 0.30786840364678947, + "learning_rate": 7.669475580660465e-05, + "loss": 2.7655, + "step": 25043 + }, + { + "epoch": 1.165979933421794, + "grad_norm": 0.3489647541969575, + "learning_rate": 7.669246539331449e-05, + "loss": 2.8048, + "step": 25044 + }, + { + "epoch": 1.1660264916078869, + "grad_norm": 0.31162080753622545, + "learning_rate": 7.669017490168459e-05, + "loss": 2.7158, + "step": 25045 + }, + { + "epoch": 1.16607304979398, + "grad_norm": 0.31090937215008446, + "learning_rate": 7.668788433172165e-05, + "loss": 2.7887, + "step": 25046 + }, + { + "epoch": 1.166119607980073, + "grad_norm": 0.3379444451870776, + "learning_rate": 7.668559368343237e-05, + "loss": 2.787, + "step": 25047 + }, + { + "epoch": 1.1661661661661662, + "grad_norm": 0.31676381248990937, + "learning_rate": 7.668330295682355e-05, + "loss": 2.7969, + "step": 25048 + }, + { + "epoch": 1.1662127243522593, + "grad_norm": 0.3166825782656067, + "learning_rate": 7.668101215190185e-05, + "loss": 2.7452, + "step": 25049 + }, + { + "epoch": 1.1662592825383524, + "grad_norm": 0.33082489327719344, + "learning_rate": 
7.667872126867398e-05, + "loss": 2.8227, + "step": 25050 + }, + { + "epoch": 1.1663058407244453, + "grad_norm": 0.3091216197611559, + "learning_rate": 7.667643030714671e-05, + "loss": 2.7551, + "step": 25051 + }, + { + "epoch": 1.1663523989105384, + "grad_norm": 0.3231850559631468, + "learning_rate": 7.667413926732673e-05, + "loss": 2.7697, + "step": 25052 + }, + { + "epoch": 1.1663989570966316, + "grad_norm": 0.3354314442888807, + "learning_rate": 7.667184814922082e-05, + "loss": 2.8015, + "step": 25053 + }, + { + "epoch": 1.1664455152827247, + "grad_norm": 0.3252846887746511, + "learning_rate": 7.666955695283562e-05, + "loss": 2.696, + "step": 25054 + }, + { + "epoch": 1.1664920734688176, + "grad_norm": 0.3308791720939887, + "learning_rate": 7.666726567817792e-05, + "loss": 2.8479, + "step": 25055 + }, + { + "epoch": 1.1665386316549107, + "grad_norm": 0.34766875002648473, + "learning_rate": 7.666497432525443e-05, + "loss": 2.8035, + "step": 25056 + }, + { + "epoch": 1.1665851898410038, + "grad_norm": 0.30690256242437913, + "learning_rate": 7.666268289407186e-05, + "loss": 2.8226, + "step": 25057 + }, + { + "epoch": 1.166631748027097, + "grad_norm": 0.3328485321056059, + "learning_rate": 7.666039138463693e-05, + "loss": 2.8123, + "step": 25058 + }, + { + "epoch": 1.16667830621319, + "grad_norm": 0.33087884030173637, + "learning_rate": 7.665809979695638e-05, + "loss": 2.7834, + "step": 25059 + }, + { + "epoch": 1.166724864399283, + "grad_norm": 0.35327111837319264, + "learning_rate": 7.665580813103695e-05, + "loss": 2.7683, + "step": 25060 + }, + { + "epoch": 1.166771422585376, + "grad_norm": 0.30440448483892973, + "learning_rate": 7.665351638688532e-05, + "loss": 2.7592, + "step": 25061 + }, + { + "epoch": 1.1668179807714691, + "grad_norm": 0.36296374625066175, + "learning_rate": 7.665122456450827e-05, + "loss": 2.772, + "step": 25062 + }, + { + "epoch": 1.1668645389575623, + "grad_norm": 0.3282480262365573, + "learning_rate": 7.664893266391249e-05, + "loss": 2.8704, + "step": 25063 + }, + { + "epoch": 1.1669110971436554, + "grad_norm": 0.33298378026013775, + "learning_rate": 7.664664068510473e-05, + "loss": 2.7971, + "step": 25064 + }, + { + "epoch": 1.1669576553297483, + "grad_norm": 0.3296453202775534, + "learning_rate": 7.66443486280917e-05, + "loss": 2.7486, + "step": 25065 + }, + { + "epoch": 1.1670042135158414, + "grad_norm": 0.32759472346221996, + "learning_rate": 7.664205649288012e-05, + "loss": 2.8925, + "step": 25066 + }, + { + "epoch": 1.1670507717019345, + "grad_norm": 0.3223140535225721, + "learning_rate": 7.663976427947674e-05, + "loss": 2.7553, + "step": 25067 + }, + { + "epoch": 1.1670973298880276, + "grad_norm": 0.3299569080921131, + "learning_rate": 7.663747198788827e-05, + "loss": 2.8421, + "step": 25068 + }, + { + "epoch": 1.1671438880741207, + "grad_norm": 0.3314745345329713, + "learning_rate": 7.663517961812146e-05, + "loss": 2.9249, + "step": 25069 + }, + { + "epoch": 1.1671904462602136, + "grad_norm": 0.34823186092703545, + "learning_rate": 7.6632887170183e-05, + "loss": 2.8176, + "step": 25070 + }, + { + "epoch": 1.1672370044463067, + "grad_norm": 0.30155763451791856, + "learning_rate": 7.663059464407965e-05, + "loss": 2.8696, + "step": 25071 + }, + { + "epoch": 1.1672835626323999, + "grad_norm": 0.3593464254976281, + "learning_rate": 7.662830203981814e-05, + "loss": 2.8014, + "step": 25072 + }, + { + "epoch": 1.167330120818493, + "grad_norm": 0.29250598518358395, + "learning_rate": 7.662600935740516e-05, + "loss": 2.8157, + "step": 25073 + }, + { + "epoch": 
1.1673766790045859, + "grad_norm": 0.3354164196326429, + "learning_rate": 7.662371659684748e-05, + "loss": 2.7563, + "step": 25074 + }, + { + "epoch": 1.167423237190679, + "grad_norm": 0.32932123667119484, + "learning_rate": 7.662142375815182e-05, + "loss": 2.8197, + "step": 25075 + }, + { + "epoch": 1.167469795376772, + "grad_norm": 0.3513086848725081, + "learning_rate": 7.661913084132491e-05, + "loss": 2.8077, + "step": 25076 + }, + { + "epoch": 1.1675163535628652, + "grad_norm": 0.31052968140595405, + "learning_rate": 7.661683784637346e-05, + "loss": 2.7852, + "step": 25077 + }, + { + "epoch": 1.1675629117489583, + "grad_norm": 0.3432102781490338, + "learning_rate": 7.661454477330421e-05, + "loss": 2.9094, + "step": 25078 + }, + { + "epoch": 1.1676094699350514, + "grad_norm": 0.3276396306531654, + "learning_rate": 7.66122516221239e-05, + "loss": 2.7741, + "step": 25079 + }, + { + "epoch": 1.1676560281211443, + "grad_norm": 0.3170834672526927, + "learning_rate": 7.660995839283925e-05, + "loss": 2.7952, + "step": 25080 + }, + { + "epoch": 1.1677025863072374, + "grad_norm": 0.31366957747865376, + "learning_rate": 7.6607665085457e-05, + "loss": 2.7314, + "step": 25081 + }, + { + "epoch": 1.1677491444933306, + "grad_norm": 0.32279228723845976, + "learning_rate": 7.660537169998386e-05, + "loss": 2.8341, + "step": 25082 + }, + { + "epoch": 1.1677957026794237, + "grad_norm": 0.3330949395349113, + "learning_rate": 7.660307823642659e-05, + "loss": 2.6996, + "step": 25083 + }, + { + "epoch": 1.1678422608655166, + "grad_norm": 0.3319405628456417, + "learning_rate": 7.660078469479191e-05, + "loss": 2.8022, + "step": 25084 + }, + { + "epoch": 1.1678888190516097, + "grad_norm": 0.32650647797471977, + "learning_rate": 7.659849107508653e-05, + "loss": 2.8293, + "step": 25085 + }, + { + "epoch": 1.1679353772377028, + "grad_norm": 0.3150003730836619, + "learning_rate": 7.65961973773172e-05, + "loss": 2.7769, + "step": 25086 + }, + { + "epoch": 1.167981935423796, + "grad_norm": 0.32385368907692014, + "learning_rate": 7.659390360149066e-05, + "loss": 2.7951, + "step": 25087 + }, + { + "epoch": 1.168028493609889, + "grad_norm": 0.3044841533321693, + "learning_rate": 7.659160974761363e-05, + "loss": 2.8127, + "step": 25088 + }, + { + "epoch": 1.1680750517959821, + "grad_norm": 0.3298659340959631, + "learning_rate": 7.658931581569285e-05, + "loss": 2.762, + "step": 25089 + }, + { + "epoch": 1.168121609982075, + "grad_norm": 0.31276180948541255, + "learning_rate": 7.658702180573505e-05, + "loss": 2.8086, + "step": 25090 + }, + { + "epoch": 1.1681681681681682, + "grad_norm": 0.32282205420087473, + "learning_rate": 7.658472771774694e-05, + "loss": 2.7656, + "step": 25091 + }, + { + "epoch": 1.1682147263542613, + "grad_norm": 0.29839974094898997, + "learning_rate": 7.65824335517353e-05, + "loss": 2.8413, + "step": 25092 + }, + { + "epoch": 1.1682612845403544, + "grad_norm": 0.3573260052730654, + "learning_rate": 7.658013930770682e-05, + "loss": 2.6993, + "step": 25093 + }, + { + "epoch": 1.1683078427264473, + "grad_norm": 0.3427120944863804, + "learning_rate": 7.657784498566826e-05, + "loss": 2.9262, + "step": 25094 + }, + { + "epoch": 1.1683544009125404, + "grad_norm": 0.3495247172959819, + "learning_rate": 7.657555058562633e-05, + "loss": 2.8711, + "step": 25095 + }, + { + "epoch": 1.1684009590986335, + "grad_norm": 0.3742471192867498, + "learning_rate": 7.657325610758779e-05, + "loss": 2.8045, + "step": 25096 + }, + { + "epoch": 1.1684475172847266, + "grad_norm": 0.3330958818953124, + "learning_rate": 
7.657096155155936e-05, + "loss": 2.7934, + "step": 25097 + }, + { + "epoch": 1.1684940754708197, + "grad_norm": 0.31925912651602023, + "learning_rate": 7.656866691754776e-05, + "loss": 2.8007, + "step": 25098 + }, + { + "epoch": 1.1685406336569126, + "grad_norm": 0.37277601150147804, + "learning_rate": 7.656637220555976e-05, + "loss": 2.9099, + "step": 25099 + }, + { + "epoch": 1.1685871918430057, + "grad_norm": 0.36233160205192827, + "learning_rate": 7.656407741560205e-05, + "loss": 2.8977, + "step": 25100 + }, + { + "epoch": 1.1686337500290989, + "grad_norm": 0.3351492752804851, + "learning_rate": 7.65617825476814e-05, + "loss": 2.841, + "step": 25101 + }, + { + "epoch": 1.168680308215192, + "grad_norm": 0.36951745504839284, + "learning_rate": 7.655948760180456e-05, + "loss": 2.7742, + "step": 25102 + }, + { + "epoch": 1.168726866401285, + "grad_norm": 0.3226400900359681, + "learning_rate": 7.655719257797821e-05, + "loss": 2.731, + "step": 25103 + }, + { + "epoch": 1.168773424587378, + "grad_norm": 0.3297247466583751, + "learning_rate": 7.655489747620913e-05, + "loss": 2.7112, + "step": 25104 + }, + { + "epoch": 1.168819982773471, + "grad_norm": 0.34669063904998665, + "learning_rate": 7.655260229650405e-05, + "loss": 2.8003, + "step": 25105 + }, + { + "epoch": 1.1688665409595642, + "grad_norm": 0.3098762854951285, + "learning_rate": 7.655030703886967e-05, + "loss": 2.8494, + "step": 25106 + }, + { + "epoch": 1.1689130991456573, + "grad_norm": 0.31472068831356753, + "learning_rate": 7.654801170331278e-05, + "loss": 2.8253, + "step": 25107 + }, + { + "epoch": 1.1689596573317504, + "grad_norm": 0.3033399270226632, + "learning_rate": 7.654571628984008e-05, + "loss": 2.8487, + "step": 25108 + }, + { + "epoch": 1.1690062155178433, + "grad_norm": 0.3156479288326918, + "learning_rate": 7.654342079845834e-05, + "loss": 2.8069, + "step": 25109 + }, + { + "epoch": 1.1690527737039365, + "grad_norm": 0.31044642734383837, + "learning_rate": 7.654112522917424e-05, + "loss": 2.8059, + "step": 25110 + }, + { + "epoch": 1.1690993318900296, + "grad_norm": 0.31123667628779184, + "learning_rate": 7.653882958199456e-05, + "loss": 2.7323, + "step": 25111 + }, + { + "epoch": 1.1691458900761227, + "grad_norm": 0.29784991209515055, + "learning_rate": 7.653653385692605e-05, + "loss": 2.9126, + "step": 25112 + }, + { + "epoch": 1.1691924482622158, + "grad_norm": 0.31833971884351714, + "learning_rate": 7.65342380539754e-05, + "loss": 2.8728, + "step": 25113 + }, + { + "epoch": 1.1692390064483087, + "grad_norm": 0.31816104810878776, + "learning_rate": 7.65319421731494e-05, + "loss": 2.8194, + "step": 25114 + }, + { + "epoch": 1.1692855646344018, + "grad_norm": 0.2940079626061797, + "learning_rate": 7.652964621445475e-05, + "loss": 2.7164, + "step": 25115 + }, + { + "epoch": 1.169332122820495, + "grad_norm": 0.2921950155818951, + "learning_rate": 7.65273501778982e-05, + "loss": 2.8506, + "step": 25116 + }, + { + "epoch": 1.169378681006588, + "grad_norm": 0.3100281471244914, + "learning_rate": 7.65250540634865e-05, + "loss": 2.867, + "step": 25117 + }, + { + "epoch": 1.1694252391926812, + "grad_norm": 0.3099249706976646, + "learning_rate": 7.652275787122638e-05, + "loss": 2.8885, + "step": 25118 + }, + { + "epoch": 1.169471797378774, + "grad_norm": 0.29036874960639336, + "learning_rate": 7.652046160112459e-05, + "loss": 2.8248, + "step": 25119 + }, + { + "epoch": 1.1695183555648672, + "grad_norm": 0.3355064039652092, + "learning_rate": 7.651816525318783e-05, + "loss": 2.8398, + "step": 25120 + }, + { + "epoch": 
1.1695649137509603, + "grad_norm": 0.3442382012995971, + "learning_rate": 7.651586882742287e-05, + "loss": 2.8218, + "step": 25121 + }, + { + "epoch": 1.1696114719370534, + "grad_norm": 0.313231112833934, + "learning_rate": 7.651357232383647e-05, + "loss": 2.8562, + "step": 25122 + }, + { + "epoch": 1.1696580301231463, + "grad_norm": 0.3158634336448239, + "learning_rate": 7.651127574243533e-05, + "loss": 2.7472, + "step": 25123 + }, + { + "epoch": 1.1697045883092394, + "grad_norm": 0.33481927913870796, + "learning_rate": 7.65089790832262e-05, + "loss": 2.8376, + "step": 25124 + }, + { + "epoch": 1.1697511464953325, + "grad_norm": 0.3018991084113367, + "learning_rate": 7.650668234621586e-05, + "loss": 2.8723, + "step": 25125 + }, + { + "epoch": 1.1697977046814256, + "grad_norm": 0.34391473846959286, + "learning_rate": 7.650438553141098e-05, + "loss": 2.7523, + "step": 25126 + }, + { + "epoch": 1.1698442628675187, + "grad_norm": 0.3090954618463741, + "learning_rate": 7.650208863881837e-05, + "loss": 2.8326, + "step": 25127 + }, + { + "epoch": 1.1698908210536119, + "grad_norm": 0.3443591774554235, + "learning_rate": 7.649979166844472e-05, + "loss": 2.7488, + "step": 25128 + }, + { + "epoch": 1.1699373792397048, + "grad_norm": 0.3307925856480163, + "learning_rate": 7.649749462029681e-05, + "loss": 2.7247, + "step": 25129 + }, + { + "epoch": 1.1699839374257979, + "grad_norm": 0.37186255162330356, + "learning_rate": 7.649519749438134e-05, + "loss": 2.8957, + "step": 25130 + }, + { + "epoch": 1.170030495611891, + "grad_norm": 0.31888453188693083, + "learning_rate": 7.64929002907051e-05, + "loss": 2.7571, + "step": 25131 + }, + { + "epoch": 1.170077053797984, + "grad_norm": 0.3279981044945918, + "learning_rate": 7.649060300927479e-05, + "loss": 2.8505, + "step": 25132 + }, + { + "epoch": 1.170123611984077, + "grad_norm": 0.32610098844316576, + "learning_rate": 7.648830565009718e-05, + "loss": 2.7591, + "step": 25133 + }, + { + "epoch": 1.17017017017017, + "grad_norm": 0.328879810250856, + "learning_rate": 7.648600821317901e-05, + "loss": 2.7732, + "step": 25134 + }, + { + "epoch": 1.1702167283562632, + "grad_norm": 0.32576557744313694, + "learning_rate": 7.6483710698527e-05, + "loss": 2.8679, + "step": 25135 + }, + { + "epoch": 1.1702632865423563, + "grad_norm": 0.3018884946146474, + "learning_rate": 7.648141310614791e-05, + "loss": 2.77, + "step": 25136 + }, + { + "epoch": 1.1703098447284495, + "grad_norm": 0.35885128050809106, + "learning_rate": 7.64791154360485e-05, + "loss": 2.8004, + "step": 25137 + }, + { + "epoch": 1.1703564029145426, + "grad_norm": 0.289588038792041, + "learning_rate": 7.647681768823547e-05, + "loss": 2.7874, + "step": 25138 + }, + { + "epoch": 1.1704029611006355, + "grad_norm": 0.3495200091413295, + "learning_rate": 7.647451986271559e-05, + "loss": 2.7513, + "step": 25139 + }, + { + "epoch": 1.1704495192867286, + "grad_norm": 0.3237956782636104, + "learning_rate": 7.647222195949562e-05, + "loss": 2.7778, + "step": 25140 + }, + { + "epoch": 1.1704960774728217, + "grad_norm": 0.33656886208296205, + "learning_rate": 7.646992397858226e-05, + "loss": 2.8238, + "step": 25141 + }, + { + "epoch": 1.1705426356589148, + "grad_norm": 0.3119347768349188, + "learning_rate": 7.64676259199823e-05, + "loss": 2.8044, + "step": 25142 + }, + { + "epoch": 1.1705891938450077, + "grad_norm": 0.36272654979876123, + "learning_rate": 7.646532778370246e-05, + "loss": 2.7874, + "step": 25143 + }, + { + "epoch": 1.1706357520311008, + "grad_norm": 0.2907299974131692, + "learning_rate": 
7.646302956974951e-05, + "loss": 2.7916, + "step": 25144 + }, + { + "epoch": 1.170682310217194, + "grad_norm": 0.34452150815031474, + "learning_rate": 7.646073127813016e-05, + "loss": 2.7133, + "step": 25145 + }, + { + "epoch": 1.170728868403287, + "grad_norm": 0.32722797490490907, + "learning_rate": 7.645843290885116e-05, + "loss": 2.8602, + "step": 25146 + }, + { + "epoch": 1.1707754265893802, + "grad_norm": 0.3680780277026911, + "learning_rate": 7.645613446191929e-05, + "loss": 2.7626, + "step": 25147 + }, + { + "epoch": 1.170821984775473, + "grad_norm": 0.3511715252458849, + "learning_rate": 7.645383593734125e-05, + "loss": 2.7069, + "step": 25148 + }, + { + "epoch": 1.1708685429615662, + "grad_norm": 0.3599853668266737, + "learning_rate": 7.64515373351238e-05, + "loss": 2.8255, + "step": 25149 + }, + { + "epoch": 1.1709151011476593, + "grad_norm": 0.33180877315615925, + "learning_rate": 7.644923865527371e-05, + "loss": 2.9051, + "step": 25150 + }, + { + "epoch": 1.1709616593337524, + "grad_norm": 0.347568304731053, + "learning_rate": 7.644693989779771e-05, + "loss": 2.9101, + "step": 25151 + }, + { + "epoch": 1.1710082175198455, + "grad_norm": 0.32631558998927596, + "learning_rate": 7.644464106270254e-05, + "loss": 2.7693, + "step": 25152 + }, + { + "epoch": 1.1710547757059384, + "grad_norm": 0.3644651035275315, + "learning_rate": 7.644234214999497e-05, + "loss": 2.8259, + "step": 25153 + }, + { + "epoch": 1.1711013338920315, + "grad_norm": 0.29551185566257687, + "learning_rate": 7.644004315968171e-05, + "loss": 2.9149, + "step": 25154 + }, + { + "epoch": 1.1711478920781246, + "grad_norm": 0.3781390675404227, + "learning_rate": 7.643774409176954e-05, + "loss": 2.837, + "step": 25155 + }, + { + "epoch": 1.1711944502642178, + "grad_norm": 0.30452579688570613, + "learning_rate": 7.64354449462652e-05, + "loss": 2.7288, + "step": 25156 + }, + { + "epoch": 1.1712410084503109, + "grad_norm": 0.336190104610583, + "learning_rate": 7.643314572317542e-05, + "loss": 2.7726, + "step": 25157 + }, + { + "epoch": 1.1712875666364038, + "grad_norm": 0.3252534076935773, + "learning_rate": 7.643084642250698e-05, + "loss": 2.7192, + "step": 25158 + }, + { + "epoch": 1.1713341248224969, + "grad_norm": 0.3309569874425891, + "learning_rate": 7.64285470442666e-05, + "loss": 2.8174, + "step": 25159 + }, + { + "epoch": 1.17138068300859, + "grad_norm": 0.34367755503755093, + "learning_rate": 7.642624758846103e-05, + "loss": 2.8117, + "step": 25160 + }, + { + "epoch": 1.171427241194683, + "grad_norm": 0.3337390937095412, + "learning_rate": 7.642394805509704e-05, + "loss": 2.7763, + "step": 25161 + }, + { + "epoch": 1.171473799380776, + "grad_norm": 0.35703574260462734, + "learning_rate": 7.642164844418136e-05, + "loss": 2.6941, + "step": 25162 + }, + { + "epoch": 1.1715203575668691, + "grad_norm": 0.3222523659843422, + "learning_rate": 7.641934875572073e-05, + "loss": 2.7755, + "step": 25163 + }, + { + "epoch": 1.1715669157529622, + "grad_norm": 0.3459258265894081, + "learning_rate": 7.641704898972193e-05, + "loss": 2.79, + "step": 25164 + }, + { + "epoch": 1.1716134739390553, + "grad_norm": 0.3437149862095875, + "learning_rate": 7.641474914619169e-05, + "loss": 2.7767, + "step": 25165 + }, + { + "epoch": 1.1716600321251485, + "grad_norm": 0.33906607782514225, + "learning_rate": 7.641244922513677e-05, + "loss": 2.7816, + "step": 25166 + }, + { + "epoch": 1.1717065903112416, + "grad_norm": 0.34874862535265316, + "learning_rate": 7.641014922656391e-05, + "loss": 2.9072, + "step": 25167 + }, + { + "epoch": 
1.1717531484973345, + "grad_norm": 0.34936660325025565, + "learning_rate": 7.640784915047985e-05, + "loss": 2.7729, + "step": 25168 + }, + { + "epoch": 1.1717997066834276, + "grad_norm": 0.34188969288396526, + "learning_rate": 7.640554899689138e-05, + "loss": 2.6757, + "step": 25169 + }, + { + "epoch": 1.1718462648695207, + "grad_norm": 0.3264472556071936, + "learning_rate": 7.64032487658052e-05, + "loss": 2.8067, + "step": 25170 + }, + { + "epoch": 1.1718928230556138, + "grad_norm": 0.3051491473509828, + "learning_rate": 7.640094845722812e-05, + "loss": 2.7657, + "step": 25171 + }, + { + "epoch": 1.1719393812417067, + "grad_norm": 0.32709165058637946, + "learning_rate": 7.639864807116682e-05, + "loss": 2.7512, + "step": 25172 + }, + { + "epoch": 1.1719859394277998, + "grad_norm": 0.30674010827268683, + "learning_rate": 7.63963476076281e-05, + "loss": 2.8152, + "step": 25173 + }, + { + "epoch": 1.172032497613893, + "grad_norm": 0.31761525961347303, + "learning_rate": 7.639404706661871e-05, + "loss": 2.7326, + "step": 25174 + }, + { + "epoch": 1.172079055799986, + "grad_norm": 0.29346853076635715, + "learning_rate": 7.639174644814538e-05, + "loss": 2.7644, + "step": 25175 + }, + { + "epoch": 1.1721256139860792, + "grad_norm": 0.32889616819191264, + "learning_rate": 7.638944575221489e-05, + "loss": 2.8216, + "step": 25176 + }, + { + "epoch": 1.1721721721721723, + "grad_norm": 0.3012155085395701, + "learning_rate": 7.638714497883396e-05, + "loss": 2.7242, + "step": 25177 + }, + { + "epoch": 1.1722187303582652, + "grad_norm": 0.35803556409931103, + "learning_rate": 7.638484412800936e-05, + "loss": 2.8271, + "step": 25178 + }, + { + "epoch": 1.1722652885443583, + "grad_norm": 0.3050276506395928, + "learning_rate": 7.638254319974785e-05, + "loss": 2.809, + "step": 25179 + }, + { + "epoch": 1.1723118467304514, + "grad_norm": 0.3958448257477693, + "learning_rate": 7.638024219405618e-05, + "loss": 2.8914, + "step": 25180 + }, + { + "epoch": 1.1723584049165445, + "grad_norm": 0.3313487328123296, + "learning_rate": 7.637794111094108e-05, + "loss": 2.8436, + "step": 25181 + }, + { + "epoch": 1.1724049631026374, + "grad_norm": 0.355283970499354, + "learning_rate": 7.637563995040934e-05, + "loss": 2.8398, + "step": 25182 + }, + { + "epoch": 1.1724515212887305, + "grad_norm": 0.32561016934632786, + "learning_rate": 7.637333871246768e-05, + "loss": 2.7335, + "step": 25183 + }, + { + "epoch": 1.1724980794748237, + "grad_norm": 0.35440771311880037, + "learning_rate": 7.637103739712288e-05, + "loss": 2.8414, + "step": 25184 + }, + { + "epoch": 1.1725446376609168, + "grad_norm": 0.3230384147099099, + "learning_rate": 7.636873600438168e-05, + "loss": 2.86, + "step": 25185 + }, + { + "epoch": 1.1725911958470099, + "grad_norm": 0.34897600515191884, + "learning_rate": 7.636643453425083e-05, + "loss": 2.8307, + "step": 25186 + }, + { + "epoch": 1.1726377540331028, + "grad_norm": 0.3345414027115194, + "learning_rate": 7.63641329867371e-05, + "loss": 2.7756, + "step": 25187 + }, + { + "epoch": 1.172684312219196, + "grad_norm": 0.32558766264061223, + "learning_rate": 7.636183136184725e-05, + "loss": 2.759, + "step": 25188 + }, + { + "epoch": 1.172730870405289, + "grad_norm": 0.3083094012849433, + "learning_rate": 7.635952965958799e-05, + "loss": 2.6603, + "step": 25189 + }, + { + "epoch": 1.1727774285913821, + "grad_norm": 0.3128312948639309, + "learning_rate": 7.635722787996613e-05, + "loss": 2.8081, + "step": 25190 + }, + { + "epoch": 1.1728239867774752, + "grad_norm": 0.3201924927273694, + "learning_rate": 
7.635492602298839e-05, + "loss": 2.795, + "step": 25191 + }, + { + "epoch": 1.1728705449635681, + "grad_norm": 0.32669020893122597, + "learning_rate": 7.635262408866156e-05, + "loss": 2.8508, + "step": 25192 + }, + { + "epoch": 1.1729171031496612, + "grad_norm": 0.29722331284376674, + "learning_rate": 7.635032207699235e-05, + "loss": 2.8777, + "step": 25193 + }, + { + "epoch": 1.1729636613357544, + "grad_norm": 0.3372705529881339, + "learning_rate": 7.634801998798755e-05, + "loss": 2.8132, + "step": 25194 + }, + { + "epoch": 1.1730102195218475, + "grad_norm": 0.3608973099631359, + "learning_rate": 7.634571782165391e-05, + "loss": 2.7914, + "step": 25195 + }, + { + "epoch": 1.1730567777079406, + "grad_norm": 0.29444522487094454, + "learning_rate": 7.634341557799819e-05, + "loss": 2.8359, + "step": 25196 + }, + { + "epoch": 1.1731033358940335, + "grad_norm": 0.360141190979845, + "learning_rate": 7.634111325702714e-05, + "loss": 2.7644, + "step": 25197 + }, + { + "epoch": 1.1731498940801266, + "grad_norm": 0.31474430566951517, + "learning_rate": 7.633881085874751e-05, + "loss": 2.7041, + "step": 25198 + }, + { + "epoch": 1.1731964522662197, + "grad_norm": 0.3070378102464796, + "learning_rate": 7.633650838316606e-05, + "loss": 2.8354, + "step": 25199 + }, + { + "epoch": 1.1732430104523128, + "grad_norm": 0.3124987066597227, + "learning_rate": 7.633420583028956e-05, + "loss": 2.8876, + "step": 25200 + }, + { + "epoch": 1.173289568638406, + "grad_norm": 0.33504463886143976, + "learning_rate": 7.633190320012476e-05, + "loss": 2.6512, + "step": 25201 + }, + { + "epoch": 1.1733361268244988, + "grad_norm": 0.3285374491881232, + "learning_rate": 7.632960049267842e-05, + "loss": 2.7538, + "step": 25202 + }, + { + "epoch": 1.173382685010592, + "grad_norm": 0.3351793477125871, + "learning_rate": 7.632729770795727e-05, + "loss": 2.7516, + "step": 25203 + }, + { + "epoch": 1.173429243196685, + "grad_norm": 0.3412229007231882, + "learning_rate": 7.632499484596813e-05, + "loss": 2.7555, + "step": 25204 + }, + { + "epoch": 1.1734758013827782, + "grad_norm": 0.3422171088691304, + "learning_rate": 7.63226919067177e-05, + "loss": 2.8709, + "step": 25205 + }, + { + "epoch": 1.1735223595688713, + "grad_norm": 0.3516022674426881, + "learning_rate": 7.632038889021279e-05, + "loss": 2.7954, + "step": 25206 + }, + { + "epoch": 1.1735689177549642, + "grad_norm": 0.3255006715366249, + "learning_rate": 7.631808579646012e-05, + "loss": 2.7101, + "step": 25207 + }, + { + "epoch": 1.1736154759410573, + "grad_norm": 0.32697289776724425, + "learning_rate": 7.631578262546646e-05, + "loss": 2.7292, + "step": 25208 + }, + { + "epoch": 1.1736620341271504, + "grad_norm": 0.29841834609626666, + "learning_rate": 7.631347937723856e-05, + "loss": 2.7784, + "step": 25209 + }, + { + "epoch": 1.1737085923132435, + "grad_norm": 0.3600013812947791, + "learning_rate": 7.63111760517832e-05, + "loss": 2.8684, + "step": 25210 + }, + { + "epoch": 1.1737551504993364, + "grad_norm": 0.2998508824520613, + "learning_rate": 7.630887264910714e-05, + "loss": 2.7727, + "step": 25211 + }, + { + "epoch": 1.1738017086854295, + "grad_norm": 0.3699536593191408, + "learning_rate": 7.63065691692171e-05, + "loss": 2.9292, + "step": 25212 + }, + { + "epoch": 1.1738482668715227, + "grad_norm": 0.33469140788938173, + "learning_rate": 7.63042656121199e-05, + "loss": 2.8436, + "step": 25213 + }, + { + "epoch": 1.1738948250576158, + "grad_norm": 0.351316066361374, + "learning_rate": 7.630196197782224e-05, + "loss": 2.8082, + "step": 25214 + }, + { + "epoch": 
1.173941383243709, + "grad_norm": 0.33364783077248583, + "learning_rate": 7.629965826633094e-05, + "loss": 2.8756, + "step": 25215 + }, + { + "epoch": 1.173987941429802, + "grad_norm": 0.3495754091212044, + "learning_rate": 7.629735447765273e-05, + "loss": 2.8403, + "step": 25216 + }, + { + "epoch": 1.174034499615895, + "grad_norm": 0.3375508060250566, + "learning_rate": 7.629505061179438e-05, + "loss": 2.8205, + "step": 25217 + }, + { + "epoch": 1.174081057801988, + "grad_norm": 0.3471761363377871, + "learning_rate": 7.629274666876262e-05, + "loss": 2.7563, + "step": 25218 + }, + { + "epoch": 1.1741276159880811, + "grad_norm": 0.31006064049864457, + "learning_rate": 7.629044264856428e-05, + "loss": 2.738, + "step": 25219 + }, + { + "epoch": 1.1741741741741742, + "grad_norm": 0.35875750047105537, + "learning_rate": 7.628813855120604e-05, + "loss": 2.7966, + "step": 25220 + }, + { + "epoch": 1.1742207323602671, + "grad_norm": 0.3142434564427838, + "learning_rate": 7.628583437669475e-05, + "loss": 2.7116, + "step": 25221 + }, + { + "epoch": 1.1742672905463603, + "grad_norm": 0.3647777431723886, + "learning_rate": 7.628353012503709e-05, + "loss": 2.8776, + "step": 25222 + }, + { + "epoch": 1.1743138487324534, + "grad_norm": 0.3358619619809415, + "learning_rate": 7.628122579623987e-05, + "loss": 2.7144, + "step": 25223 + }, + { + "epoch": 1.1743604069185465, + "grad_norm": 0.37486193843288673, + "learning_rate": 7.627892139030984e-05, + "loss": 2.9473, + "step": 25224 + }, + { + "epoch": 1.1744069651046396, + "grad_norm": 0.3614725279004538, + "learning_rate": 7.627661690725376e-05, + "loss": 2.6998, + "step": 25225 + }, + { + "epoch": 1.1744535232907327, + "grad_norm": 0.3631484678946449, + "learning_rate": 7.62743123470784e-05, + "loss": 2.7755, + "step": 25226 + }, + { + "epoch": 1.1745000814768256, + "grad_norm": 0.3554361768758482, + "learning_rate": 7.627200770979052e-05, + "loss": 2.7384, + "step": 25227 + }, + { + "epoch": 1.1745466396629187, + "grad_norm": 0.3770950748408646, + "learning_rate": 7.626970299539689e-05, + "loss": 2.8474, + "step": 25228 + }, + { + "epoch": 1.1745931978490118, + "grad_norm": 0.34288132083099987, + "learning_rate": 7.626739820390428e-05, + "loss": 2.8804, + "step": 25229 + }, + { + "epoch": 1.174639756035105, + "grad_norm": 0.4165330773249658, + "learning_rate": 7.626509333531942e-05, + "loss": 2.7628, + "step": 25230 + }, + { + "epoch": 1.1746863142211978, + "grad_norm": 0.3331917963975369, + "learning_rate": 7.62627883896491e-05, + "loss": 2.8506, + "step": 25231 + }, + { + "epoch": 1.174732872407291, + "grad_norm": 0.3751966637406505, + "learning_rate": 7.62604833669001e-05, + "loss": 2.8582, + "step": 25232 + }, + { + "epoch": 1.174779430593384, + "grad_norm": 0.33198743763046673, + "learning_rate": 7.625817826707917e-05, + "loss": 2.8054, + "step": 25233 + }, + { + "epoch": 1.1748259887794772, + "grad_norm": 0.35046712293136806, + "learning_rate": 7.625587309019306e-05, + "loss": 2.7388, + "step": 25234 + }, + { + "epoch": 1.1748725469655703, + "grad_norm": 0.3432643818114126, + "learning_rate": 7.625356783624857e-05, + "loss": 2.8428, + "step": 25235 + }, + { + "epoch": 1.1749191051516632, + "grad_norm": 0.37308543002280725, + "learning_rate": 7.625126250525242e-05, + "loss": 2.7747, + "step": 25236 + }, + { + "epoch": 1.1749656633377563, + "grad_norm": 0.3467212400550016, + "learning_rate": 7.624895709721141e-05, + "loss": 2.8544, + "step": 25237 + }, + { + "epoch": 1.1750122215238494, + "grad_norm": 0.34502570104634156, + "learning_rate": 
7.624665161213231e-05, + "loss": 2.6972, + "step": 25238 + }, + { + "epoch": 1.1750587797099425, + "grad_norm": 0.350075495491652, + "learning_rate": 7.624434605002185e-05, + "loss": 2.8706, + "step": 25239 + }, + { + "epoch": 1.1751053378960357, + "grad_norm": 0.35404702634681817, + "learning_rate": 7.624204041088683e-05, + "loss": 2.8325, + "step": 25240 + }, + { + "epoch": 1.1751518960821286, + "grad_norm": 0.3216077191861946, + "learning_rate": 7.6239734694734e-05, + "loss": 2.764, + "step": 25241 + }, + { + "epoch": 1.1751984542682217, + "grad_norm": 0.3577323943677905, + "learning_rate": 7.623742890157015e-05, + "loss": 2.8296, + "step": 25242 + }, + { + "epoch": 1.1752450124543148, + "grad_norm": 0.3240327859455002, + "learning_rate": 7.623512303140202e-05, + "loss": 2.7201, + "step": 25243 + }, + { + "epoch": 1.175291570640408, + "grad_norm": 0.3454412243888424, + "learning_rate": 7.623281708423638e-05, + "loss": 2.7958, + "step": 25244 + }, + { + "epoch": 1.175338128826501, + "grad_norm": 0.3453765493394118, + "learning_rate": 7.623051106008003e-05, + "loss": 2.8487, + "step": 25245 + }, + { + "epoch": 1.175384687012594, + "grad_norm": 0.3280414294345467, + "learning_rate": 7.62282049589397e-05, + "loss": 2.8002, + "step": 25246 + }, + { + "epoch": 1.175431245198687, + "grad_norm": 0.3333691685767123, + "learning_rate": 7.622589878082218e-05, + "loss": 2.729, + "step": 25247 + }, + { + "epoch": 1.1754778033847801, + "grad_norm": 0.3298862769056461, + "learning_rate": 7.622359252573423e-05, + "loss": 2.7842, + "step": 25248 + }, + { + "epoch": 1.1755243615708733, + "grad_norm": 0.3541781159141391, + "learning_rate": 7.62212861936826e-05, + "loss": 2.7834, + "step": 25249 + }, + { + "epoch": 1.1755709197569661, + "grad_norm": 0.32777512259184266, + "learning_rate": 7.62189797846741e-05, + "loss": 2.8924, + "step": 25250 + }, + { + "epoch": 1.1756174779430593, + "grad_norm": 0.3586975200986762, + "learning_rate": 7.621667329871546e-05, + "loss": 2.7635, + "step": 25251 + }, + { + "epoch": 1.1756640361291524, + "grad_norm": 0.3476230024133144, + "learning_rate": 7.621436673581348e-05, + "loss": 2.7155, + "step": 25252 + }, + { + "epoch": 1.1757105943152455, + "grad_norm": 0.29381963108770964, + "learning_rate": 7.62120600959749e-05, + "loss": 2.702, + "step": 25253 + }, + { + "epoch": 1.1757571525013386, + "grad_norm": 0.34267652971413015, + "learning_rate": 7.620975337920653e-05, + "loss": 2.8088, + "step": 25254 + }, + { + "epoch": 1.1758037106874317, + "grad_norm": 0.31687116273824145, + "learning_rate": 7.62074465855151e-05, + "loss": 2.6801, + "step": 25255 + }, + { + "epoch": 1.1758502688735246, + "grad_norm": 0.35136626434375806, + "learning_rate": 7.62051397149074e-05, + "loss": 2.7744, + "step": 25256 + }, + { + "epoch": 1.1758968270596177, + "grad_norm": 0.32888591286469915, + "learning_rate": 7.62028327673902e-05, + "loss": 2.8442, + "step": 25257 + }, + { + "epoch": 1.1759433852457108, + "grad_norm": 0.3614766815635723, + "learning_rate": 7.620052574297027e-05, + "loss": 2.781, + "step": 25258 + }, + { + "epoch": 1.175989943431804, + "grad_norm": 0.3468869800708885, + "learning_rate": 7.619821864165437e-05, + "loss": 2.7336, + "step": 25259 + }, + { + "epoch": 1.1760365016178969, + "grad_norm": 0.3252255994074745, + "learning_rate": 7.619591146344928e-05, + "loss": 2.8309, + "step": 25260 + }, + { + "epoch": 1.17608305980399, + "grad_norm": 0.36605962995271724, + "learning_rate": 7.619360420836176e-05, + "loss": 2.7699, + "step": 25261 + }, + { + "epoch": 
1.176129617990083, + "grad_norm": 0.3121420308218842, + "learning_rate": 7.619129687639862e-05, + "loss": 2.797, + "step": 25262 + }, + { + "epoch": 1.1761761761761762, + "grad_norm": 0.334562657343489, + "learning_rate": 7.618898946756658e-05, + "loss": 2.7475, + "step": 25263 + }, + { + "epoch": 1.1762227343622693, + "grad_norm": 0.32089878458606813, + "learning_rate": 7.618668198187244e-05, + "loss": 2.768, + "step": 25264 + }, + { + "epoch": 1.1762692925483624, + "grad_norm": 0.3479637377669638, + "learning_rate": 7.618437441932298e-05, + "loss": 2.8361, + "step": 25265 + }, + { + "epoch": 1.1763158507344553, + "grad_norm": 0.30420591276824027, + "learning_rate": 7.618206677992494e-05, + "loss": 2.7273, + "step": 25266 + }, + { + "epoch": 1.1763624089205484, + "grad_norm": 0.3289797739013791, + "learning_rate": 7.617975906368513e-05, + "loss": 2.8344, + "step": 25267 + }, + { + "epoch": 1.1764089671066416, + "grad_norm": 0.3496254888440474, + "learning_rate": 7.61774512706103e-05, + "loss": 2.7834, + "step": 25268 + }, + { + "epoch": 1.1764555252927347, + "grad_norm": 0.3155052094474735, + "learning_rate": 7.617514340070724e-05, + "loss": 2.8163, + "step": 25269 + }, + { + "epoch": 1.1765020834788276, + "grad_norm": 0.3223679506184145, + "learning_rate": 7.61728354539827e-05, + "loss": 2.7619, + "step": 25270 + }, + { + "epoch": 1.1765486416649207, + "grad_norm": 0.3226095910374193, + "learning_rate": 7.617052743044349e-05, + "loss": 2.6945, + "step": 25271 + }, + { + "epoch": 1.1765951998510138, + "grad_norm": 0.33744051629735305, + "learning_rate": 7.616821933009633e-05, + "loss": 2.7816, + "step": 25272 + }, + { + "epoch": 1.176641758037107, + "grad_norm": 0.3009330771014772, + "learning_rate": 7.616591115294804e-05, + "loss": 2.7919, + "step": 25273 + }, + { + "epoch": 1.1766883162232, + "grad_norm": 0.31847073676705756, + "learning_rate": 7.616360289900537e-05, + "loss": 2.8185, + "step": 25274 + }, + { + "epoch": 1.176734874409293, + "grad_norm": 0.3104257550467824, + "learning_rate": 7.61612945682751e-05, + "loss": 2.8496, + "step": 25275 + }, + { + "epoch": 1.176781432595386, + "grad_norm": 0.32322085541774487, + "learning_rate": 7.615898616076402e-05, + "loss": 2.7871, + "step": 25276 + }, + { + "epoch": 1.1768279907814791, + "grad_norm": 0.2819063272787528, + "learning_rate": 7.615667767647888e-05, + "loss": 2.8006, + "step": 25277 + }, + { + "epoch": 1.1768745489675723, + "grad_norm": 0.32216139968248514, + "learning_rate": 7.615436911542649e-05, + "loss": 2.8503, + "step": 25278 + }, + { + "epoch": 1.1769211071536654, + "grad_norm": 0.30556629734471086, + "learning_rate": 7.615206047761358e-05, + "loss": 2.8339, + "step": 25279 + }, + { + "epoch": 1.1769676653397583, + "grad_norm": 0.31504213488505145, + "learning_rate": 7.614975176304695e-05, + "loss": 2.6959, + "step": 25280 + }, + { + "epoch": 1.1770142235258514, + "grad_norm": 0.31285193636533987, + "learning_rate": 7.614744297173338e-05, + "loss": 2.8123, + "step": 25281 + }, + { + "epoch": 1.1770607817119445, + "grad_norm": 0.31006543414263504, + "learning_rate": 7.614513410367966e-05, + "loss": 2.7356, + "step": 25282 + }, + { + "epoch": 1.1771073398980376, + "grad_norm": 0.3304225602482786, + "learning_rate": 7.614282515889253e-05, + "loss": 2.7418, + "step": 25283 + }, + { + "epoch": 1.1771538980841307, + "grad_norm": 0.3156263435138245, + "learning_rate": 7.614051613737878e-05, + "loss": 2.8113, + "step": 25284 + }, + { + "epoch": 1.1772004562702236, + "grad_norm": 0.3599867274742041, + "learning_rate": 
7.61382070391452e-05, + "loss": 2.8188, + "step": 25285 + }, + { + "epoch": 1.1772470144563167, + "grad_norm": 0.3008592161857707, + "learning_rate": 7.613589786419855e-05, + "loss": 2.8201, + "step": 25286 + }, + { + "epoch": 1.1772935726424099, + "grad_norm": 0.36009423890440373, + "learning_rate": 7.613358861254561e-05, + "loss": 2.8273, + "step": 25287 + }, + { + "epoch": 1.177340130828503, + "grad_norm": 0.3211123743198118, + "learning_rate": 7.613127928419316e-05, + "loss": 2.8157, + "step": 25288 + }, + { + "epoch": 1.177386689014596, + "grad_norm": 0.33255832136310964, + "learning_rate": 7.6128969879148e-05, + "loss": 2.8527, + "step": 25289 + }, + { + "epoch": 1.177433247200689, + "grad_norm": 0.33613982991460456, + "learning_rate": 7.612666039741687e-05, + "loss": 2.7653, + "step": 25290 + }, + { + "epoch": 1.177479805386782, + "grad_norm": 0.31855979572499954, + "learning_rate": 7.612435083900657e-05, + "loss": 2.6906, + "step": 25291 + }, + { + "epoch": 1.1775263635728752, + "grad_norm": 0.354749265270917, + "learning_rate": 7.612204120392387e-05, + "loss": 2.7228, + "step": 25292 + }, + { + "epoch": 1.1775729217589683, + "grad_norm": 0.3046493321921584, + "learning_rate": 7.611973149217556e-05, + "loss": 2.7366, + "step": 25293 + }, + { + "epoch": 1.1776194799450614, + "grad_norm": 0.33245662641472573, + "learning_rate": 7.611742170376841e-05, + "loss": 2.6342, + "step": 25294 + }, + { + "epoch": 1.1776660381311543, + "grad_norm": 0.3207149807663388, + "learning_rate": 7.611511183870919e-05, + "loss": 2.7347, + "step": 25295 + }, + { + "epoch": 1.1777125963172475, + "grad_norm": 0.3138356407439757, + "learning_rate": 7.611280189700471e-05, + "loss": 2.7682, + "step": 25296 + }, + { + "epoch": 1.1777591545033406, + "grad_norm": 0.3469029749154553, + "learning_rate": 7.611049187866173e-05, + "loss": 2.8002, + "step": 25297 + }, + { + "epoch": 1.1778057126894337, + "grad_norm": 0.31639573346388716, + "learning_rate": 7.610818178368702e-05, + "loss": 2.8174, + "step": 25298 + }, + { + "epoch": 1.1778522708755266, + "grad_norm": 0.3254093952366903, + "learning_rate": 7.610587161208736e-05, + "loss": 2.7973, + "step": 25299 + }, + { + "epoch": 1.1778988290616197, + "grad_norm": 0.3151516659674868, + "learning_rate": 7.610356136386953e-05, + "loss": 2.7655, + "step": 25300 + }, + { + "epoch": 1.1779453872477128, + "grad_norm": 0.3201484361265562, + "learning_rate": 7.610125103904033e-05, + "loss": 2.769, + "step": 25301 + }, + { + "epoch": 1.177991945433806, + "grad_norm": 0.3120931187166136, + "learning_rate": 7.609894063760654e-05, + "loss": 2.8197, + "step": 25302 + }, + { + "epoch": 1.178038503619899, + "grad_norm": 0.32169288250336137, + "learning_rate": 7.609663015957492e-05, + "loss": 2.8529, + "step": 25303 + }, + { + "epoch": 1.1780850618059922, + "grad_norm": 0.3507298295361572, + "learning_rate": 7.609431960495225e-05, + "loss": 2.9383, + "step": 25304 + }, + { + "epoch": 1.178131619992085, + "grad_norm": 0.3287246624787708, + "learning_rate": 7.609200897374536e-05, + "loss": 2.8062, + "step": 25305 + }, + { + "epoch": 1.1781781781781782, + "grad_norm": 0.3576841409765427, + "learning_rate": 7.608969826596096e-05, + "loss": 2.8376, + "step": 25306 + }, + { + "epoch": 1.1782247363642713, + "grad_norm": 0.2943314884613224, + "learning_rate": 7.608738748160588e-05, + "loss": 2.6637, + "step": 25307 + }, + { + "epoch": 1.1782712945503644, + "grad_norm": 0.3235638663998967, + "learning_rate": 7.608507662068687e-05, + "loss": 2.7799, + "step": 25308 + }, + { + "epoch": 
1.1783178527364573, + "grad_norm": 0.3277536462908273, + "learning_rate": 7.608276568321076e-05, + "loss": 2.8683, + "step": 25309 + }, + { + "epoch": 1.1783644109225504, + "grad_norm": 0.32692489936189184, + "learning_rate": 7.60804546691843e-05, + "loss": 2.8682, + "step": 25310 + }, + { + "epoch": 1.1784109691086435, + "grad_norm": 0.33991656613597665, + "learning_rate": 7.607814357861425e-05, + "loss": 2.8847, + "step": 25311 + }, + { + "epoch": 1.1784575272947366, + "grad_norm": 0.33155398785552326, + "learning_rate": 7.607583241150743e-05, + "loss": 2.7985, + "step": 25312 + }, + { + "epoch": 1.1785040854808297, + "grad_norm": 0.3297330676193773, + "learning_rate": 7.607352116787061e-05, + "loss": 2.8547, + "step": 25313 + }, + { + "epoch": 1.1785506436669229, + "grad_norm": 0.3294881924914098, + "learning_rate": 7.607120984771058e-05, + "loss": 2.8968, + "step": 25314 + }, + { + "epoch": 1.1785972018530158, + "grad_norm": 0.33722111154088447, + "learning_rate": 7.606889845103411e-05, + "loss": 2.8497, + "step": 25315 + }, + { + "epoch": 1.1786437600391089, + "grad_norm": 0.32447997985899163, + "learning_rate": 7.606658697784799e-05, + "loss": 2.9074, + "step": 25316 + }, + { + "epoch": 1.178690318225202, + "grad_norm": 0.3538613383363475, + "learning_rate": 7.6064275428159e-05, + "loss": 2.827, + "step": 25317 + }, + { + "epoch": 1.178736876411295, + "grad_norm": 0.31591803803451934, + "learning_rate": 7.606196380197393e-05, + "loss": 2.7016, + "step": 25318 + }, + { + "epoch": 1.178783434597388, + "grad_norm": 0.35910291093136537, + "learning_rate": 7.605965209929958e-05, + "loss": 2.7059, + "step": 25319 + }, + { + "epoch": 1.178829992783481, + "grad_norm": 0.3197703033896694, + "learning_rate": 7.60573403201427e-05, + "loss": 2.6749, + "step": 25320 + }, + { + "epoch": 1.1788765509695742, + "grad_norm": 0.3317552008269556, + "learning_rate": 7.605502846451012e-05, + "loss": 2.812, + "step": 25321 + }, + { + "epoch": 1.1789231091556673, + "grad_norm": 0.3176539961810135, + "learning_rate": 7.605271653240857e-05, + "loss": 2.8842, + "step": 25322 + }, + { + "epoch": 1.1789696673417605, + "grad_norm": 0.32828793632948416, + "learning_rate": 7.605040452384489e-05, + "loss": 2.8486, + "step": 25323 + }, + { + "epoch": 1.1790162255278533, + "grad_norm": 0.32779142754976964, + "learning_rate": 7.604809243882583e-05, + "loss": 2.8698, + "step": 25324 + }, + { + "epoch": 1.1790627837139465, + "grad_norm": 0.3030269046734282, + "learning_rate": 7.604578027735816e-05, + "loss": 2.7647, + "step": 25325 + }, + { + "epoch": 1.1791093419000396, + "grad_norm": 0.3490259674560083, + "learning_rate": 7.60434680394487e-05, + "loss": 2.8061, + "step": 25326 + }, + { + "epoch": 1.1791559000861327, + "grad_norm": 0.33653145179739485, + "learning_rate": 7.604115572510424e-05, + "loss": 2.8561, + "step": 25327 + }, + { + "epoch": 1.1792024582722258, + "grad_norm": 0.3347612525155088, + "learning_rate": 7.603884333433156e-05, + "loss": 2.863, + "step": 25328 + }, + { + "epoch": 1.1792490164583187, + "grad_norm": 0.3127421764119221, + "learning_rate": 7.603653086713743e-05, + "loss": 2.8857, + "step": 25329 + }, + { + "epoch": 1.1792955746444118, + "grad_norm": 0.3328727395850684, + "learning_rate": 7.603421832352863e-05, + "loss": 2.8043, + "step": 25330 + }, + { + "epoch": 1.179342132830505, + "grad_norm": 0.32962634501757165, + "learning_rate": 7.6031905703512e-05, + "loss": 2.9137, + "step": 25331 + }, + { + "epoch": 1.179388691016598, + "grad_norm": 0.33031156504983794, + "learning_rate": 
7.602959300709426e-05, + "loss": 2.7707, + "step": 25332 + }, + { + "epoch": 1.1794352492026912, + "grad_norm": 0.330857031851301, + "learning_rate": 7.602728023428225e-05, + "loss": 2.7316, + "step": 25333 + }, + { + "epoch": 1.179481807388784, + "grad_norm": 0.3907926807067543, + "learning_rate": 7.602496738508273e-05, + "loss": 2.7106, + "step": 25334 + }, + { + "epoch": 1.1795283655748772, + "grad_norm": 0.31213554947834443, + "learning_rate": 7.60226544595025e-05, + "loss": 2.7796, + "step": 25335 + }, + { + "epoch": 1.1795749237609703, + "grad_norm": 0.3576407274666998, + "learning_rate": 7.602034145754833e-05, + "loss": 2.8121, + "step": 25336 + }, + { + "epoch": 1.1796214819470634, + "grad_norm": 0.3272137897564453, + "learning_rate": 7.601802837922702e-05, + "loss": 2.8402, + "step": 25337 + }, + { + "epoch": 1.1796680401331563, + "grad_norm": 0.3685684533609718, + "learning_rate": 7.601571522454538e-05, + "loss": 2.8194, + "step": 25338 + }, + { + "epoch": 1.1797145983192494, + "grad_norm": 0.35731948604949004, + "learning_rate": 7.601340199351015e-05, + "loss": 2.8604, + "step": 25339 + }, + { + "epoch": 1.1797611565053425, + "grad_norm": 0.34165195176242297, + "learning_rate": 7.601108868612816e-05, + "loss": 2.6849, + "step": 25340 + }, + { + "epoch": 1.1798077146914356, + "grad_norm": 0.35645493064120426, + "learning_rate": 7.60087753024062e-05, + "loss": 2.778, + "step": 25341 + }, + { + "epoch": 1.1798542728775288, + "grad_norm": 0.3324070764312445, + "learning_rate": 7.600646184235104e-05, + "loss": 2.9069, + "step": 25342 + }, + { + "epoch": 1.1799008310636219, + "grad_norm": 0.3629836238615005, + "learning_rate": 7.600414830596947e-05, + "loss": 2.8206, + "step": 25343 + }, + { + "epoch": 1.1799473892497148, + "grad_norm": 0.37438409581145005, + "learning_rate": 7.600183469326829e-05, + "loss": 2.8166, + "step": 25344 + }, + { + "epoch": 1.1799939474358079, + "grad_norm": 0.36802660979523444, + "learning_rate": 7.599952100425428e-05, + "loss": 2.8156, + "step": 25345 + }, + { + "epoch": 1.180040505621901, + "grad_norm": 0.361684041039262, + "learning_rate": 7.599720723893425e-05, + "loss": 2.8126, + "step": 25346 + }, + { + "epoch": 1.180087063807994, + "grad_norm": 0.373636598050905, + "learning_rate": 7.599489339731497e-05, + "loss": 2.7947, + "step": 25347 + }, + { + "epoch": 1.180133621994087, + "grad_norm": 0.35639331335329505, + "learning_rate": 7.599257947940325e-05, + "loss": 2.7975, + "step": 25348 + }, + { + "epoch": 1.1801801801801801, + "grad_norm": 0.33927169456454526, + "learning_rate": 7.599026548520586e-05, + "loss": 2.7286, + "step": 25349 + }, + { + "epoch": 1.1802267383662732, + "grad_norm": 0.36448289491451974, + "learning_rate": 7.598795141472961e-05, + "loss": 2.8636, + "step": 25350 + }, + { + "epoch": 1.1802732965523663, + "grad_norm": 0.3514475485529038, + "learning_rate": 7.598563726798126e-05, + "loss": 2.8731, + "step": 25351 + }, + { + "epoch": 1.1803198547384595, + "grad_norm": 0.3532226157982793, + "learning_rate": 7.598332304496764e-05, + "loss": 2.6306, + "step": 25352 + }, + { + "epoch": 1.1803664129245526, + "grad_norm": 0.33566791916644867, + "learning_rate": 7.598100874569552e-05, + "loss": 2.7146, + "step": 25353 + }, + { + "epoch": 1.1804129711106455, + "grad_norm": 0.3461280186256451, + "learning_rate": 7.597869437017171e-05, + "loss": 2.8073, + "step": 25354 + }, + { + "epoch": 1.1804595292967386, + "grad_norm": 0.33255893220925553, + "learning_rate": 7.597637991840299e-05, + "loss": 2.8091, + "step": 25355 + }, + { + "epoch": 
1.1805060874828317, + "grad_norm": 0.3093383777702447, + "learning_rate": 7.597406539039616e-05, + "loss": 2.8296, + "step": 25356 + }, + { + "epoch": 1.1805526456689248, + "grad_norm": 0.3728927440999813, + "learning_rate": 7.5971750786158e-05, + "loss": 2.8072, + "step": 25357 + }, + { + "epoch": 1.1805992038550177, + "grad_norm": 0.3224190576901831, + "learning_rate": 7.596943610569529e-05, + "loss": 2.7386, + "step": 25358 + }, + { + "epoch": 1.1806457620411108, + "grad_norm": 0.33501580600779424, + "learning_rate": 7.596712134901487e-05, + "loss": 2.8758, + "step": 25359 + }, + { + "epoch": 1.180692320227204, + "grad_norm": 0.33991273537172284, + "learning_rate": 7.59648065161235e-05, + "loss": 2.7942, + "step": 25360 + }, + { + "epoch": 1.180738878413297, + "grad_norm": 0.33594472634850864, + "learning_rate": 7.596249160702797e-05, + "loss": 2.7484, + "step": 25361 + }, + { + "epoch": 1.1807854365993902, + "grad_norm": 0.3222657459338324, + "learning_rate": 7.59601766217351e-05, + "loss": 2.8125, + "step": 25362 + }, + { + "epoch": 1.180831994785483, + "grad_norm": 0.32930345308835607, + "learning_rate": 7.595786156025167e-05, + "loss": 2.8729, + "step": 25363 + }, + { + "epoch": 1.1808785529715762, + "grad_norm": 0.32177437734993464, + "learning_rate": 7.595554642258447e-05, + "loss": 2.7735, + "step": 25364 + }, + { + "epoch": 1.1809251111576693, + "grad_norm": 0.3529463647813082, + "learning_rate": 7.595323120874029e-05, + "loss": 2.781, + "step": 25365 + }, + { + "epoch": 1.1809716693437624, + "grad_norm": 0.3170913948567809, + "learning_rate": 7.595091591872593e-05, + "loss": 2.8173, + "step": 25366 + }, + { + "epoch": 1.1810182275298555, + "grad_norm": 0.33900995297443093, + "learning_rate": 7.59486005525482e-05, + "loss": 2.8635, + "step": 25367 + }, + { + "epoch": 1.1810647857159484, + "grad_norm": 0.32924662317952136, + "learning_rate": 7.594628511021389e-05, + "loss": 2.9071, + "step": 25368 + }, + { + "epoch": 1.1811113439020415, + "grad_norm": 0.31895443838858956, + "learning_rate": 7.594396959172977e-05, + "loss": 2.8258, + "step": 25369 + }, + { + "epoch": 1.1811579020881346, + "grad_norm": 0.32046420033189016, + "learning_rate": 7.594165399710266e-05, + "loss": 2.8114, + "step": 25370 + }, + { + "epoch": 1.1812044602742278, + "grad_norm": 0.32112041546621795, + "learning_rate": 7.593933832633935e-05, + "loss": 2.8935, + "step": 25371 + }, + { + "epoch": 1.1812510184603209, + "grad_norm": 0.3242741857573181, + "learning_rate": 7.593702257944666e-05, + "loss": 2.7758, + "step": 25372 + }, + { + "epoch": 1.1812975766464138, + "grad_norm": 0.29770723134488075, + "learning_rate": 7.593470675643133e-05, + "loss": 2.7445, + "step": 25373 + }, + { + "epoch": 1.1813441348325069, + "grad_norm": 0.3014011865454879, + "learning_rate": 7.593239085730021e-05, + "loss": 2.721, + "step": 25374 + }, + { + "epoch": 1.1813906930186, + "grad_norm": 0.317281582716297, + "learning_rate": 7.593007488206008e-05, + "loss": 2.8086, + "step": 25375 + }, + { + "epoch": 1.1814372512046931, + "grad_norm": 0.2999953268626573, + "learning_rate": 7.592775883071774e-05, + "loss": 2.7774, + "step": 25376 + }, + { + "epoch": 1.181483809390786, + "grad_norm": 0.30706385964745203, + "learning_rate": 7.592544270327998e-05, + "loss": 2.8141, + "step": 25377 + }, + { + "epoch": 1.1815303675768791, + "grad_norm": 0.3073875750889773, + "learning_rate": 7.592312649975357e-05, + "loss": 2.7384, + "step": 25378 + }, + { + "epoch": 1.1815769257629722, + "grad_norm": 0.28986634720285803, + "learning_rate": 
7.592081022014536e-05, + "loss": 2.8404, + "step": 25379 + }, + { + "epoch": 1.1816234839490654, + "grad_norm": 0.32527088492503425, + "learning_rate": 7.591849386446213e-05, + "loss": 2.8003, + "step": 25380 + }, + { + "epoch": 1.1816700421351585, + "grad_norm": 0.3150390844509285, + "learning_rate": 7.591617743271066e-05, + "loss": 2.705, + "step": 25381 + }, + { + "epoch": 1.1817166003212516, + "grad_norm": 0.3135398540236668, + "learning_rate": 7.591386092489778e-05, + "loss": 2.8189, + "step": 25382 + }, + { + "epoch": 1.1817631585073445, + "grad_norm": 0.3433341166936485, + "learning_rate": 7.591154434103026e-05, + "loss": 2.8279, + "step": 25383 + }, + { + "epoch": 1.1818097166934376, + "grad_norm": 0.3258129571475475, + "learning_rate": 7.590922768111491e-05, + "loss": 2.7844, + "step": 25384 + }, + { + "epoch": 1.1818562748795307, + "grad_norm": 0.3302887446261851, + "learning_rate": 7.590691094515854e-05, + "loss": 2.8549, + "step": 25385 + }, + { + "epoch": 1.1819028330656238, + "grad_norm": 0.3161063802826006, + "learning_rate": 7.590459413316793e-05, + "loss": 2.7309, + "step": 25386 + }, + { + "epoch": 1.1819493912517167, + "grad_norm": 0.3311660853964647, + "learning_rate": 7.590227724514989e-05, + "loss": 2.7404, + "step": 25387 + }, + { + "epoch": 1.1819959494378098, + "grad_norm": 0.3288079106656367, + "learning_rate": 7.589996028111122e-05, + "loss": 2.7681, + "step": 25388 + }, + { + "epoch": 1.182042507623903, + "grad_norm": 0.3172512596985851, + "learning_rate": 7.589764324105872e-05, + "loss": 2.8195, + "step": 25389 + }, + { + "epoch": 1.182089065809996, + "grad_norm": 0.33557796018952646, + "learning_rate": 7.589532612499916e-05, + "loss": 2.779, + "step": 25390 + }, + { + "epoch": 1.1821356239960892, + "grad_norm": 0.327618833834151, + "learning_rate": 7.589300893293942e-05, + "loss": 2.7455, + "step": 25391 + }, + { + "epoch": 1.1821821821821823, + "grad_norm": 0.3583797203129204, + "learning_rate": 7.589069166488621e-05, + "loss": 2.8816, + "step": 25392 + }, + { + "epoch": 1.1822287403682752, + "grad_norm": 0.32984770253277884, + "learning_rate": 7.588837432084639e-05, + "loss": 2.7456, + "step": 25393 + }, + { + "epoch": 1.1822752985543683, + "grad_norm": 0.3477059041617906, + "learning_rate": 7.588605690082673e-05, + "loss": 2.8141, + "step": 25394 + }, + { + "epoch": 1.1823218567404614, + "grad_norm": 0.31596597063536697, + "learning_rate": 7.588373940483404e-05, + "loss": 2.8164, + "step": 25395 + }, + { + "epoch": 1.1823684149265545, + "grad_norm": 0.35919614579581743, + "learning_rate": 7.588142183287515e-05, + "loss": 2.7348, + "step": 25396 + }, + { + "epoch": 1.1824149731126474, + "grad_norm": 0.32958831800209715, + "learning_rate": 7.58791041849568e-05, + "loss": 2.8386, + "step": 25397 + }, + { + "epoch": 1.1824615312987405, + "grad_norm": 0.3408950193761992, + "learning_rate": 7.587678646108586e-05, + "loss": 2.7446, + "step": 25398 + }, + { + "epoch": 1.1825080894848337, + "grad_norm": 0.3285815630894222, + "learning_rate": 7.587446866126908e-05, + "loss": 2.7291, + "step": 25399 + }, + { + "epoch": 1.1825546476709268, + "grad_norm": 0.34750076030597143, + "learning_rate": 7.58721507855133e-05, + "loss": 2.8809, + "step": 25400 + }, + { + "epoch": 1.1826012058570199, + "grad_norm": 0.3230119657949287, + "learning_rate": 7.586983283382529e-05, + "loss": 2.678, + "step": 25401 + }, + { + "epoch": 1.182647764043113, + "grad_norm": 0.37162037328181063, + "learning_rate": 7.586751480621188e-05, + "loss": 2.8139, + "step": 25402 + }, + { + "epoch": 
1.182694322229206, + "grad_norm": 0.33048911748584486, + "learning_rate": 7.586519670267987e-05, + "loss": 2.7953, + "step": 25403 + }, + { + "epoch": 1.182740880415299, + "grad_norm": 0.3639455753172167, + "learning_rate": 7.586287852323605e-05, + "loss": 2.8637, + "step": 25404 + }, + { + "epoch": 1.1827874386013921, + "grad_norm": 0.3103639021056781, + "learning_rate": 7.586056026788721e-05, + "loss": 2.8604, + "step": 25405 + }, + { + "epoch": 1.1828339967874852, + "grad_norm": 0.3650278808122221, + "learning_rate": 7.58582419366402e-05, + "loss": 2.7968, + "step": 25406 + }, + { + "epoch": 1.1828805549735781, + "grad_norm": 0.33029749787053986, + "learning_rate": 7.585592352950177e-05, + "loss": 2.8629, + "step": 25407 + }, + { + "epoch": 1.1829271131596713, + "grad_norm": 0.3523142096831236, + "learning_rate": 7.585360504647879e-05, + "loss": 2.8243, + "step": 25408 + }, + { + "epoch": 1.1829736713457644, + "grad_norm": 0.3147770679860196, + "learning_rate": 7.5851286487578e-05, + "loss": 2.778, + "step": 25409 + }, + { + "epoch": 1.1830202295318575, + "grad_norm": 0.35938409335993265, + "learning_rate": 7.584896785280622e-05, + "loss": 2.8814, + "step": 25410 + }, + { + "epoch": 1.1830667877179506, + "grad_norm": 0.30908359553070747, + "learning_rate": 7.58466491421703e-05, + "loss": 2.8421, + "step": 25411 + }, + { + "epoch": 1.1831133459040435, + "grad_norm": 0.34377324008395843, + "learning_rate": 7.584433035567698e-05, + "loss": 2.7937, + "step": 25412 + }, + { + "epoch": 1.1831599040901366, + "grad_norm": 0.31209858109523186, + "learning_rate": 7.584201149333312e-05, + "loss": 2.819, + "step": 25413 + }, + { + "epoch": 1.1832064622762297, + "grad_norm": 0.337034120822326, + "learning_rate": 7.583969255514549e-05, + "loss": 2.8601, + "step": 25414 + }, + { + "epoch": 1.1832530204623228, + "grad_norm": 0.3183165068887073, + "learning_rate": 7.58373735411209e-05, + "loss": 2.8563, + "step": 25415 + }, + { + "epoch": 1.183299578648416, + "grad_norm": 0.33068935008030503, + "learning_rate": 7.583505445126618e-05, + "loss": 2.6641, + "step": 25416 + }, + { + "epoch": 1.1833461368345088, + "grad_norm": 0.32171369020306156, + "learning_rate": 7.58327352855881e-05, + "loss": 2.8455, + "step": 25417 + }, + { + "epoch": 1.183392695020602, + "grad_norm": 0.3395775602450102, + "learning_rate": 7.583041604409352e-05, + "loss": 2.8187, + "step": 25418 + }, + { + "epoch": 1.183439253206695, + "grad_norm": 0.32567663394851487, + "learning_rate": 7.582809672678918e-05, + "loss": 2.9375, + "step": 25419 + }, + { + "epoch": 1.1834858113927882, + "grad_norm": 0.3621881690866923, + "learning_rate": 7.582577733368193e-05, + "loss": 2.8292, + "step": 25420 + }, + { + "epoch": 1.1835323695788813, + "grad_norm": 0.3506519294406286, + "learning_rate": 7.582345786477858e-05, + "loss": 2.6932, + "step": 25421 + }, + { + "epoch": 1.1835789277649742, + "grad_norm": 0.37855111269706804, + "learning_rate": 7.582113832008591e-05, + "loss": 2.7764, + "step": 25422 + }, + { + "epoch": 1.1836254859510673, + "grad_norm": 0.31896605914712955, + "learning_rate": 7.581881869961076e-05, + "loss": 2.7914, + "step": 25423 + }, + { + "epoch": 1.1836720441371604, + "grad_norm": 0.3347231018781282, + "learning_rate": 7.581649900335992e-05, + "loss": 2.8604, + "step": 25424 + }, + { + "epoch": 1.1837186023232535, + "grad_norm": 0.31920689428512056, + "learning_rate": 7.581417923134018e-05, + "loss": 2.7576, + "step": 25425 + }, + { + "epoch": 1.1837651605093464, + "grad_norm": 0.3288741180259493, + "learning_rate": 
7.581185938355837e-05, + "loss": 2.7512, + "step": 25426 + }, + { + "epoch": 1.1838117186954396, + "grad_norm": 0.3298599472771655, + "learning_rate": 7.580953946002132e-05, + "loss": 2.7589, + "step": 25427 + }, + { + "epoch": 1.1838582768815327, + "grad_norm": 0.3369438612776168, + "learning_rate": 7.580721946073579e-05, + "loss": 2.7597, + "step": 25428 + }, + { + "epoch": 1.1839048350676258, + "grad_norm": 0.31926712451506223, + "learning_rate": 7.580489938570862e-05, + "loss": 2.8108, + "step": 25429 + }, + { + "epoch": 1.183951393253719, + "grad_norm": 0.32568377794014114, + "learning_rate": 7.58025792349466e-05, + "loss": 2.7262, + "step": 25430 + }, + { + "epoch": 1.183997951439812, + "grad_norm": 0.3275241369621026, + "learning_rate": 7.580025900845658e-05, + "loss": 2.799, + "step": 25431 + }, + { + "epoch": 1.184044509625905, + "grad_norm": 0.3554933944282296, + "learning_rate": 7.579793870624532e-05, + "loss": 2.7698, + "step": 25432 + }, + { + "epoch": 1.184091067811998, + "grad_norm": 0.3381636901733654, + "learning_rate": 7.579561832831965e-05, + "loss": 2.8848, + "step": 25433 + }, + { + "epoch": 1.1841376259980911, + "grad_norm": 0.32438895898822656, + "learning_rate": 7.579329787468638e-05, + "loss": 2.8386, + "step": 25434 + }, + { + "epoch": 1.1841841841841843, + "grad_norm": 0.3265377612131075, + "learning_rate": 7.579097734535234e-05, + "loss": 2.8137, + "step": 25435 + }, + { + "epoch": 1.1842307423702771, + "grad_norm": 0.3260223095131801, + "learning_rate": 7.578865674032431e-05, + "loss": 2.7269, + "step": 25436 + }, + { + "epoch": 1.1842773005563703, + "grad_norm": 0.3239590480298281, + "learning_rate": 7.578633605960913e-05, + "loss": 2.7748, + "step": 25437 + }, + { + "epoch": 1.1843238587424634, + "grad_norm": 0.3154269756869676, + "learning_rate": 7.578401530321358e-05, + "loss": 2.8731, + "step": 25438 + }, + { + "epoch": 1.1843704169285565, + "grad_norm": 0.3285107021203325, + "learning_rate": 7.578169447114448e-05, + "loss": 2.7275, + "step": 25439 + }, + { + "epoch": 1.1844169751146496, + "grad_norm": 0.3081177459802387, + "learning_rate": 7.577937356340865e-05, + "loss": 2.8291, + "step": 25440 + }, + { + "epoch": 1.1844635333007427, + "grad_norm": 0.29912548159255586, + "learning_rate": 7.577705258001291e-05, + "loss": 2.807, + "step": 25441 + }, + { + "epoch": 1.1845100914868356, + "grad_norm": 0.34170325705363364, + "learning_rate": 7.577473152096404e-05, + "loss": 2.9636, + "step": 25442 + }, + { + "epoch": 1.1845566496729287, + "grad_norm": 0.3374137320006008, + "learning_rate": 7.577241038626888e-05, + "loss": 2.8126, + "step": 25443 + }, + { + "epoch": 1.1846032078590218, + "grad_norm": 0.35620747018646354, + "learning_rate": 7.577008917593424e-05, + "loss": 2.734, + "step": 25444 + }, + { + "epoch": 1.184649766045115, + "grad_norm": 0.33439916113419155, + "learning_rate": 7.576776788996694e-05, + "loss": 2.8624, + "step": 25445 + }, + { + "epoch": 1.1846963242312079, + "grad_norm": 0.33783766720232894, + "learning_rate": 7.576544652837376e-05, + "loss": 2.8273, + "step": 25446 + }, + { + "epoch": 1.184742882417301, + "grad_norm": 0.3107298645236482, + "learning_rate": 7.576312509116152e-05, + "loss": 2.7648, + "step": 25447 + }, + { + "epoch": 1.184789440603394, + "grad_norm": 0.32729269971861835, + "learning_rate": 7.576080357833706e-05, + "loss": 2.7773, + "step": 25448 + }, + { + "epoch": 1.1848359987894872, + "grad_norm": 0.33908945517869427, + "learning_rate": 7.575848198990719e-05, + "loss": 2.7764, + "step": 25449 + }, + { + "epoch": 
1.1848825569755803, + "grad_norm": 0.3176266183530845, + "learning_rate": 7.575616032587871e-05, + "loss": 2.7882, + "step": 25450 + }, + { + "epoch": 1.1849291151616732, + "grad_norm": 0.3332207307755742, + "learning_rate": 7.575383858625844e-05, + "loss": 2.8101, + "step": 25451 + }, + { + "epoch": 1.1849756733477663, + "grad_norm": 0.3500938492554489, + "learning_rate": 7.575151677105317e-05, + "loss": 2.841, + "step": 25452 + }, + { + "epoch": 1.1850222315338594, + "grad_norm": 0.3464720798175715, + "learning_rate": 7.574919488026974e-05, + "loss": 2.8511, + "step": 25453 + }, + { + "epoch": 1.1850687897199526, + "grad_norm": 0.33488896324675416, + "learning_rate": 7.574687291391498e-05, + "loss": 2.7969, + "step": 25454 + }, + { + "epoch": 1.1851153479060457, + "grad_norm": 0.30196582048122145, + "learning_rate": 7.574455087199568e-05, + "loss": 2.8111, + "step": 25455 + }, + { + "epoch": 1.1851619060921386, + "grad_norm": 0.34466552345790014, + "learning_rate": 7.574222875451864e-05, + "loss": 2.8043, + "step": 25456 + }, + { + "epoch": 1.1852084642782317, + "grad_norm": 0.314987650950101, + "learning_rate": 7.57399065614907e-05, + "loss": 2.8479, + "step": 25457 + }, + { + "epoch": 1.1852550224643248, + "grad_norm": 0.3587179369413497, + "learning_rate": 7.573758429291868e-05, + "loss": 2.8104, + "step": 25458 + }, + { + "epoch": 1.185301580650418, + "grad_norm": 0.3246165273890355, + "learning_rate": 7.573526194880938e-05, + "loss": 2.7275, + "step": 25459 + }, + { + "epoch": 1.185348138836511, + "grad_norm": 0.3506655724501585, + "learning_rate": 7.573293952916962e-05, + "loss": 2.8013, + "step": 25460 + }, + { + "epoch": 1.185394697022604, + "grad_norm": 0.3415222526664532, + "learning_rate": 7.573061703400623e-05, + "loss": 2.7375, + "step": 25461 + }, + { + "epoch": 1.185441255208697, + "grad_norm": 0.31861598202898705, + "learning_rate": 7.572829446332601e-05, + "loss": 2.6749, + "step": 25462 + }, + { + "epoch": 1.1854878133947901, + "grad_norm": 0.3494670037387607, + "learning_rate": 7.572597181713578e-05, + "loss": 2.9361, + "step": 25463 + }, + { + "epoch": 1.1855343715808833, + "grad_norm": 0.31554568082567186, + "learning_rate": 7.572364909544235e-05, + "loss": 2.7687, + "step": 25464 + }, + { + "epoch": 1.1855809297669762, + "grad_norm": 0.36696338775220105, + "learning_rate": 7.572132629825255e-05, + "loss": 2.7867, + "step": 25465 + }, + { + "epoch": 1.1856274879530693, + "grad_norm": 0.32540488117153155, + "learning_rate": 7.571900342557319e-05, + "loss": 2.7148, + "step": 25466 + }, + { + "epoch": 1.1856740461391624, + "grad_norm": 0.3515766677898424, + "learning_rate": 7.571668047741108e-05, + "loss": 2.6926, + "step": 25467 + }, + { + "epoch": 1.1857206043252555, + "grad_norm": 0.3525405934182657, + "learning_rate": 7.571435745377306e-05, + "loss": 2.7952, + "step": 25468 + }, + { + "epoch": 1.1857671625113486, + "grad_norm": 0.3575386066361545, + "learning_rate": 7.571203435466592e-05, + "loss": 2.7719, + "step": 25469 + }, + { + "epoch": 1.1858137206974417, + "grad_norm": 0.34332816764280166, + "learning_rate": 7.570971118009649e-05, + "loss": 2.7533, + "step": 25470 + }, + { + "epoch": 1.1858602788835346, + "grad_norm": 0.3493376249852132, + "learning_rate": 7.57073879300716e-05, + "loss": 2.7245, + "step": 25471 + }, + { + "epoch": 1.1859068370696277, + "grad_norm": 0.3361717570439968, + "learning_rate": 7.570506460459807e-05, + "loss": 2.6766, + "step": 25472 + }, + { + "epoch": 1.1859533952557209, + "grad_norm": 0.32634299897382735, + "learning_rate": 
7.570274120368269e-05, + "loss": 2.6976, + "step": 25473 + }, + { + "epoch": 1.185999953441814, + "grad_norm": 0.3232730360395053, + "learning_rate": 7.57004177273323e-05, + "loss": 2.7772, + "step": 25474 + }, + { + "epoch": 1.1860465116279069, + "grad_norm": 0.3140813492468717, + "learning_rate": 7.569809417555374e-05, + "loss": 2.9068, + "step": 25475 + }, + { + "epoch": 1.186093069814, + "grad_norm": 0.33017018267158366, + "learning_rate": 7.569577054835377e-05, + "loss": 2.8574, + "step": 25476 + }, + { + "epoch": 1.186139628000093, + "grad_norm": 0.31373212019178487, + "learning_rate": 7.569344684573927e-05, + "loss": 2.7814, + "step": 25477 + }, + { + "epoch": 1.1861861861861862, + "grad_norm": 0.3011167348745936, + "learning_rate": 7.569112306771702e-05, + "loss": 2.803, + "step": 25478 + }, + { + "epoch": 1.1862327443722793, + "grad_norm": 0.34908237781007584, + "learning_rate": 7.568879921429386e-05, + "loss": 2.8632, + "step": 25479 + }, + { + "epoch": 1.1862793025583724, + "grad_norm": 0.33574996083445013, + "learning_rate": 7.568647528547659e-05, + "loss": 2.8182, + "step": 25480 + }, + { + "epoch": 1.1863258607444653, + "grad_norm": 0.32712007343297705, + "learning_rate": 7.568415128127206e-05, + "loss": 2.7767, + "step": 25481 + }, + { + "epoch": 1.1863724189305584, + "grad_norm": 0.3384865633290968, + "learning_rate": 7.568182720168708e-05, + "loss": 2.827, + "step": 25482 + }, + { + "epoch": 1.1864189771166516, + "grad_norm": 0.3368820080009234, + "learning_rate": 7.567950304672846e-05, + "loss": 2.8798, + "step": 25483 + }, + { + "epoch": 1.1864655353027447, + "grad_norm": 0.3268123451323246, + "learning_rate": 7.567717881640304e-05, + "loss": 2.8763, + "step": 25484 + }, + { + "epoch": 1.1865120934888376, + "grad_norm": 0.32467839220565303, + "learning_rate": 7.567485451071762e-05, + "loss": 2.8868, + "step": 25485 + }, + { + "epoch": 1.1865586516749307, + "grad_norm": 0.3299179990973656, + "learning_rate": 7.567253012967904e-05, + "loss": 2.7713, + "step": 25486 + }, + { + "epoch": 1.1866052098610238, + "grad_norm": 0.3241490184078039, + "learning_rate": 7.56702056732941e-05, + "loss": 2.7733, + "step": 25487 + }, + { + "epoch": 1.186651768047117, + "grad_norm": 0.31860581967175045, + "learning_rate": 7.566788114156964e-05, + "loss": 2.837, + "step": 25488 + }, + { + "epoch": 1.18669832623321, + "grad_norm": 0.3418660779527823, + "learning_rate": 7.566555653451249e-05, + "loss": 2.8, + "step": 25489 + }, + { + "epoch": 1.1867448844193031, + "grad_norm": 0.3102010686761335, + "learning_rate": 7.566323185212945e-05, + "loss": 2.8312, + "step": 25490 + }, + { + "epoch": 1.186791442605396, + "grad_norm": 0.32406284623042664, + "learning_rate": 7.566090709442735e-05, + "loss": 2.8115, + "step": 25491 + }, + { + "epoch": 1.1868380007914892, + "grad_norm": 0.31119140749124785, + "learning_rate": 7.565858226141301e-05, + "loss": 2.8075, + "step": 25492 + }, + { + "epoch": 1.1868845589775823, + "grad_norm": 0.33175675189435794, + "learning_rate": 7.565625735309326e-05, + "loss": 2.8097, + "step": 25493 + }, + { + "epoch": 1.1869311171636754, + "grad_norm": 0.31566306204090394, + "learning_rate": 7.565393236947494e-05, + "loss": 2.7787, + "step": 25494 + }, + { + "epoch": 1.1869776753497683, + "grad_norm": 0.30161776000341967, + "learning_rate": 7.565160731056485e-05, + "loss": 2.8139, + "step": 25495 + }, + { + "epoch": 1.1870242335358614, + "grad_norm": 0.3195952730519128, + "learning_rate": 7.56492821763698e-05, + "loss": 2.8748, + "step": 25496 + }, + { + "epoch": 
1.1870707917219545, + "grad_norm": 0.309611784075393, + "learning_rate": 7.564695696689664e-05, + "loss": 2.7618, + "step": 25497 + }, + { + "epoch": 1.1871173499080476, + "grad_norm": 0.30010812771800377, + "learning_rate": 7.564463168215219e-05, + "loss": 2.8575, + "step": 25498 + }, + { + "epoch": 1.1871639080941407, + "grad_norm": 0.3165201596866341, + "learning_rate": 7.564230632214329e-05, + "loss": 2.784, + "step": 25499 + }, + { + "epoch": 1.1872104662802336, + "grad_norm": 0.30012951952197936, + "learning_rate": 7.563998088687674e-05, + "loss": 2.8139, + "step": 25500 + }, + { + "epoch": 1.1872570244663267, + "grad_norm": 0.3053523198750831, + "learning_rate": 7.563765537635936e-05, + "loss": 2.8658, + "step": 25501 + }, + { + "epoch": 1.1873035826524199, + "grad_norm": 0.2999203583965762, + "learning_rate": 7.563532979059799e-05, + "loss": 2.8269, + "step": 25502 + }, + { + "epoch": 1.187350140838513, + "grad_norm": 0.30322150272250165, + "learning_rate": 7.563300412959947e-05, + "loss": 2.7557, + "step": 25503 + }, + { + "epoch": 1.187396699024606, + "grad_norm": 0.3116147745389433, + "learning_rate": 7.563067839337059e-05, + "loss": 2.7885, + "step": 25504 + }, + { + "epoch": 1.187443257210699, + "grad_norm": 0.29530571376589065, + "learning_rate": 7.56283525819182e-05, + "loss": 2.8082, + "step": 25505 + }, + { + "epoch": 1.187489815396792, + "grad_norm": 0.2953828923746001, + "learning_rate": 7.562602669524912e-05, + "loss": 2.8353, + "step": 25506 + }, + { + "epoch": 1.1875363735828852, + "grad_norm": 0.320679860419003, + "learning_rate": 7.562370073337018e-05, + "loss": 2.8611, + "step": 25507 + }, + { + "epoch": 1.1875829317689783, + "grad_norm": 0.2953490333501421, + "learning_rate": 7.56213746962882e-05, + "loss": 2.8659, + "step": 25508 + }, + { + "epoch": 1.1876294899550714, + "grad_norm": 0.2804491896415201, + "learning_rate": 7.561904858400999e-05, + "loss": 2.8035, + "step": 25509 + }, + { + "epoch": 1.1876760481411643, + "grad_norm": 0.3046440172141779, + "learning_rate": 7.561672239654242e-05, + "loss": 2.7983, + "step": 25510 + }, + { + "epoch": 1.1877226063272575, + "grad_norm": 0.3124570685698419, + "learning_rate": 7.561439613389229e-05, + "loss": 2.7862, + "step": 25511 + }, + { + "epoch": 1.1877691645133506, + "grad_norm": 0.32787187430083603, + "learning_rate": 7.561206979606644e-05, + "loss": 2.7759, + "step": 25512 + }, + { + "epoch": 1.1878157226994437, + "grad_norm": 0.31900019629907256, + "learning_rate": 7.560974338307167e-05, + "loss": 2.7585, + "step": 25513 + }, + { + "epoch": 1.1878622808855366, + "grad_norm": 0.3358347805238783, + "learning_rate": 7.560741689491484e-05, + "loss": 2.7714, + "step": 25514 + }, + { + "epoch": 1.1879088390716297, + "grad_norm": 0.3231101001237376, + "learning_rate": 7.560509033160274e-05, + "loss": 2.8367, + "step": 25515 + }, + { + "epoch": 1.1879553972577228, + "grad_norm": 0.32622005611403515, + "learning_rate": 7.560276369314224e-05, + "loss": 2.6802, + "step": 25516 + }, + { + "epoch": 1.188001955443816, + "grad_norm": 0.31324051614693427, + "learning_rate": 7.560043697954015e-05, + "loss": 2.7574, + "step": 25517 + }, + { + "epoch": 1.188048513629909, + "grad_norm": 0.32452259676708467, + "learning_rate": 7.55981101908033e-05, + "loss": 2.8107, + "step": 25518 + }, + { + "epoch": 1.1880950718160022, + "grad_norm": 0.3302559298858153, + "learning_rate": 7.559578332693852e-05, + "loss": 2.8106, + "step": 25519 + }, + { + "epoch": 1.188141630002095, + "grad_norm": 0.3386548005229765, + "learning_rate": 
7.559345638795264e-05, + "loss": 2.8274, + "step": 25520 + }, + { + "epoch": 1.1881881881881882, + "grad_norm": 0.33201929709476347, + "learning_rate": 7.55911293738525e-05, + "loss": 2.7547, + "step": 25521 + }, + { + "epoch": 1.1882347463742813, + "grad_norm": 0.3270058188506855, + "learning_rate": 7.558880228464488e-05, + "loss": 2.7983, + "step": 25522 + }, + { + "epoch": 1.1882813045603744, + "grad_norm": 0.33027465666616157, + "learning_rate": 7.558647512033665e-05, + "loss": 2.7919, + "step": 25523 + }, + { + "epoch": 1.1883278627464673, + "grad_norm": 0.3678430744269951, + "learning_rate": 7.558414788093466e-05, + "loss": 2.733, + "step": 25524 + }, + { + "epoch": 1.1883744209325604, + "grad_norm": 0.3124879471003527, + "learning_rate": 7.558182056644572e-05, + "loss": 2.7832, + "step": 25525 + }, + { + "epoch": 1.1884209791186535, + "grad_norm": 0.41847485641599425, + "learning_rate": 7.557949317687663e-05, + "loss": 2.831, + "step": 25526 + }, + { + "epoch": 1.1884675373047466, + "grad_norm": 0.29880516407063784, + "learning_rate": 7.557716571223427e-05, + "loss": 2.735, + "step": 25527 + }, + { + "epoch": 1.1885140954908398, + "grad_norm": 0.3503978808347431, + "learning_rate": 7.557483817252544e-05, + "loss": 2.6597, + "step": 25528 + }, + { + "epoch": 1.1885606536769329, + "grad_norm": 0.3141602716652653, + "learning_rate": 7.557251055775698e-05, + "loss": 2.8132, + "step": 25529 + }, + { + "epoch": 1.1886072118630258, + "grad_norm": 0.3733037164124708, + "learning_rate": 7.557018286793573e-05, + "loss": 2.8387, + "step": 25530 + }, + { + "epoch": 1.1886537700491189, + "grad_norm": 0.30559835679180036, + "learning_rate": 7.55678551030685e-05, + "loss": 2.7761, + "step": 25531 + }, + { + "epoch": 1.188700328235212, + "grad_norm": 0.32846315718051544, + "learning_rate": 7.556552726316213e-05, + "loss": 2.7914, + "step": 25532 + }, + { + "epoch": 1.188746886421305, + "grad_norm": 0.3330724023231281, + "learning_rate": 7.556319934822346e-05, + "loss": 2.85, + "step": 25533 + }, + { + "epoch": 1.188793444607398, + "grad_norm": 0.3181124918074995, + "learning_rate": 7.556087135825931e-05, + "loss": 2.8169, + "step": 25534 + }, + { + "epoch": 1.1888400027934911, + "grad_norm": 0.30735852663368024, + "learning_rate": 7.555854329327653e-05, + "loss": 2.7958, + "step": 25535 + }, + { + "epoch": 1.1888865609795842, + "grad_norm": 0.30919729834388304, + "learning_rate": 7.555621515328196e-05, + "loss": 2.7482, + "step": 25536 + }, + { + "epoch": 1.1889331191656773, + "grad_norm": 0.3130712147619863, + "learning_rate": 7.555388693828241e-05, + "loss": 2.8089, + "step": 25537 + }, + { + "epoch": 1.1889796773517705, + "grad_norm": 0.3051273676758538, + "learning_rate": 7.55515586482847e-05, + "loss": 2.8144, + "step": 25538 + }, + { + "epoch": 1.1890262355378634, + "grad_norm": 0.3188143933326625, + "learning_rate": 7.55492302832957e-05, + "loss": 2.7965, + "step": 25539 + }, + { + "epoch": 1.1890727937239565, + "grad_norm": 0.3188265026980984, + "learning_rate": 7.554690184332223e-05, + "loss": 2.8181, + "step": 25540 + }, + { + "epoch": 1.1891193519100496, + "grad_norm": 0.3216392054141694, + "learning_rate": 7.55445733283711e-05, + "loss": 2.7953, + "step": 25541 + }, + { + "epoch": 1.1891659100961427, + "grad_norm": 0.33788987564583517, + "learning_rate": 7.554224473844917e-05, + "loss": 2.7715, + "step": 25542 + }, + { + "epoch": 1.1892124682822358, + "grad_norm": 0.3185534406207546, + "learning_rate": 7.553991607356328e-05, + "loss": 2.8055, + "step": 25543 + }, + { + "epoch": 
1.1892590264683287, + "grad_norm": 0.3621429348787586, + "learning_rate": 7.553758733372024e-05, + "loss": 2.8458, + "step": 25544 + }, + { + "epoch": 1.1893055846544218, + "grad_norm": 0.325184377714465, + "learning_rate": 7.55352585189269e-05, + "loss": 2.8007, + "step": 25545 + }, + { + "epoch": 1.189352142840515, + "grad_norm": 0.33377534709659157, + "learning_rate": 7.55329296291901e-05, + "loss": 2.9432, + "step": 25546 + }, + { + "epoch": 1.189398701026608, + "grad_norm": 0.3652861655652491, + "learning_rate": 7.553060066451665e-05, + "loss": 2.7245, + "step": 25547 + }, + { + "epoch": 1.1894452592127012, + "grad_norm": 0.3282787390221533, + "learning_rate": 7.552827162491341e-05, + "loss": 2.7703, + "step": 25548 + }, + { + "epoch": 1.189491817398794, + "grad_norm": 0.34869340389074954, + "learning_rate": 7.552594251038722e-05, + "loss": 2.9178, + "step": 25549 + }, + { + "epoch": 1.1895383755848872, + "grad_norm": 0.35009066755667867, + "learning_rate": 7.552361332094488e-05, + "loss": 2.7261, + "step": 25550 + }, + { + "epoch": 1.1895849337709803, + "grad_norm": 0.309429435285724, + "learning_rate": 7.552128405659327e-05, + "loss": 2.7495, + "step": 25551 + }, + { + "epoch": 1.1896314919570734, + "grad_norm": 0.320438945405208, + "learning_rate": 7.55189547173392e-05, + "loss": 2.8402, + "step": 25552 + }, + { + "epoch": 1.1896780501431663, + "grad_norm": 0.32218551468132167, + "learning_rate": 7.551662530318952e-05, + "loss": 2.8972, + "step": 25553 + }, + { + "epoch": 1.1897246083292594, + "grad_norm": 0.31030542762353513, + "learning_rate": 7.551429581415104e-05, + "loss": 2.7483, + "step": 25554 + }, + { + "epoch": 1.1897711665153525, + "grad_norm": 0.29795252771351394, + "learning_rate": 7.551196625023063e-05, + "loss": 2.8539, + "step": 25555 + }, + { + "epoch": 1.1898177247014456, + "grad_norm": 0.33242049210219266, + "learning_rate": 7.55096366114351e-05, + "loss": 2.7562, + "step": 25556 + }, + { + "epoch": 1.1898642828875388, + "grad_norm": 0.28292598443301414, + "learning_rate": 7.55073068977713e-05, + "loss": 2.7061, + "step": 25557 + }, + { + "epoch": 1.1899108410736319, + "grad_norm": 0.33184795170176595, + "learning_rate": 7.550497710924608e-05, + "loss": 2.8029, + "step": 25558 + }, + { + "epoch": 1.1899573992597248, + "grad_norm": 0.3236002396225058, + "learning_rate": 7.550264724586625e-05, + "loss": 2.798, + "step": 25559 + }, + { + "epoch": 1.1900039574458179, + "grad_norm": 0.3165774888200517, + "learning_rate": 7.550031730763867e-05, + "loss": 2.7904, + "step": 25560 + }, + { + "epoch": 1.190050515631911, + "grad_norm": 0.3123118797405086, + "learning_rate": 7.549798729457017e-05, + "loss": 2.8266, + "step": 25561 + }, + { + "epoch": 1.1900970738180041, + "grad_norm": 0.3401467604564095, + "learning_rate": 7.549565720666761e-05, + "loss": 2.7236, + "step": 25562 + }, + { + "epoch": 1.190143632004097, + "grad_norm": 0.33772447483526574, + "learning_rate": 7.549332704393778e-05, + "loss": 2.8054, + "step": 25563 + }, + { + "epoch": 1.1901901901901901, + "grad_norm": 0.31689707575060605, + "learning_rate": 7.549099680638756e-05, + "loss": 2.8438, + "step": 25564 + }, + { + "epoch": 1.1902367483762832, + "grad_norm": 0.3445234818064885, + "learning_rate": 7.548866649402377e-05, + "loss": 2.7675, + "step": 25565 + }, + { + "epoch": 1.1902833065623764, + "grad_norm": 0.36863677291196034, + "learning_rate": 7.548633610685326e-05, + "loss": 2.7995, + "step": 25566 + }, + { + "epoch": 1.1903298647484695, + "grad_norm": 0.3068631661364154, + "learning_rate": 
7.548400564488286e-05, + "loss": 2.8265, + "step": 25567 + }, + { + "epoch": 1.1903764229345626, + "grad_norm": 0.3536716420702282, + "learning_rate": 7.548167510811941e-05, + "loss": 2.8137, + "step": 25568 + }, + { + "epoch": 1.1904229811206555, + "grad_norm": 0.34916782390961937, + "learning_rate": 7.547934449656975e-05, + "loss": 2.7292, + "step": 25569 + }, + { + "epoch": 1.1904695393067486, + "grad_norm": 0.3554294886929068, + "learning_rate": 7.547701381024073e-05, + "loss": 2.914, + "step": 25570 + }, + { + "epoch": 1.1905160974928417, + "grad_norm": 0.33375829864577206, + "learning_rate": 7.547468304913921e-05, + "loss": 2.7832, + "step": 25571 + }, + { + "epoch": 1.1905626556789348, + "grad_norm": 0.3330353651246881, + "learning_rate": 7.547235221327197e-05, + "loss": 2.8001, + "step": 25572 + }, + { + "epoch": 1.1906092138650277, + "grad_norm": 0.32536384090410897, + "learning_rate": 7.54700213026459e-05, + "loss": 2.7973, + "step": 25573 + }, + { + "epoch": 1.1906557720511208, + "grad_norm": 0.3127317687658286, + "learning_rate": 7.546769031726781e-05, + "loss": 2.7361, + "step": 25574 + }, + { + "epoch": 1.190702330237214, + "grad_norm": 0.30305719864932595, + "learning_rate": 7.546535925714459e-05, + "loss": 2.8035, + "step": 25575 + }, + { + "epoch": 1.190748888423307, + "grad_norm": 0.3539438683960621, + "learning_rate": 7.546302812228302e-05, + "loss": 2.8622, + "step": 25576 + }, + { + "epoch": 1.1907954466094002, + "grad_norm": 0.3106682282968135, + "learning_rate": 7.546069691269e-05, + "loss": 2.7735, + "step": 25577 + }, + { + "epoch": 1.1908420047954933, + "grad_norm": 0.3503625028365376, + "learning_rate": 7.545836562837231e-05, + "loss": 2.807, + "step": 25578 + }, + { + "epoch": 1.1908885629815862, + "grad_norm": 0.33975276673978133, + "learning_rate": 7.545603426933684e-05, + "loss": 2.7985, + "step": 25579 + }, + { + "epoch": 1.1909351211676793, + "grad_norm": 0.33822551336841766, + "learning_rate": 7.545370283559042e-05, + "loss": 2.7409, + "step": 25580 + }, + { + "epoch": 1.1909816793537724, + "grad_norm": 0.3360864486097886, + "learning_rate": 7.545137132713989e-05, + "loss": 2.7811, + "step": 25581 + }, + { + "epoch": 1.1910282375398655, + "grad_norm": 0.32660420164874376, + "learning_rate": 7.544903974399209e-05, + "loss": 2.8295, + "step": 25582 + }, + { + "epoch": 1.1910747957259584, + "grad_norm": 0.3536261641874879, + "learning_rate": 7.544670808615386e-05, + "loss": 2.8238, + "step": 25583 + }, + { + "epoch": 1.1911213539120515, + "grad_norm": 0.32444164245661766, + "learning_rate": 7.544437635363206e-05, + "loss": 2.7698, + "step": 25584 + }, + { + "epoch": 1.1911679120981447, + "grad_norm": 0.33423613366335264, + "learning_rate": 7.54420445464335e-05, + "loss": 2.7118, + "step": 25585 + }, + { + "epoch": 1.1912144702842378, + "grad_norm": 0.3673547850021964, + "learning_rate": 7.543971266456506e-05, + "loss": 2.7961, + "step": 25586 + }, + { + "epoch": 1.1912610284703309, + "grad_norm": 0.3236670048747639, + "learning_rate": 7.543738070803358e-05, + "loss": 2.794, + "step": 25587 + }, + { + "epoch": 1.1913075866564238, + "grad_norm": 0.35192994025443347, + "learning_rate": 7.543504867684588e-05, + "loss": 2.7261, + "step": 25588 + }, + { + "epoch": 1.191354144842517, + "grad_norm": 0.3340851817101388, + "learning_rate": 7.543271657100883e-05, + "loss": 2.8986, + "step": 25589 + }, + { + "epoch": 1.19140070302861, + "grad_norm": 0.40435710128911095, + "learning_rate": 7.543038439052927e-05, + "loss": 2.7954, + "step": 25590 + }, + { + "epoch": 
1.1914472612147031, + "grad_norm": 0.3439871623729627, + "learning_rate": 7.542805213541402e-05, + "loss": 2.7746, + "step": 25591 + }, + { + "epoch": 1.1914938194007962, + "grad_norm": 0.3346136911770035, + "learning_rate": 7.542571980566993e-05, + "loss": 2.7866, + "step": 25592 + }, + { + "epoch": 1.1915403775868891, + "grad_norm": 0.3498367102714299, + "learning_rate": 7.542338740130388e-05, + "loss": 2.787, + "step": 25593 + }, + { + "epoch": 1.1915869357729822, + "grad_norm": 0.358386409684216, + "learning_rate": 7.54210549223227e-05, + "loss": 2.9492, + "step": 25594 + }, + { + "epoch": 1.1916334939590754, + "grad_norm": 0.3364276652401316, + "learning_rate": 7.541872236873319e-05, + "loss": 2.7994, + "step": 25595 + }, + { + "epoch": 1.1916800521451685, + "grad_norm": 0.3421333131661171, + "learning_rate": 7.541638974054226e-05, + "loss": 2.9294, + "step": 25596 + }, + { + "epoch": 1.1917266103312616, + "grad_norm": 0.34818298216015253, + "learning_rate": 7.541405703775673e-05, + "loss": 2.884, + "step": 25597 + }, + { + "epoch": 1.1917731685173545, + "grad_norm": 0.33775346050377303, + "learning_rate": 7.541172426038344e-05, + "loss": 2.8846, + "step": 25598 + }, + { + "epoch": 1.1918197267034476, + "grad_norm": 0.3630897638042207, + "learning_rate": 7.540939140842924e-05, + "loss": 2.7406, + "step": 25599 + }, + { + "epoch": 1.1918662848895407, + "grad_norm": 0.33432547636538346, + "learning_rate": 7.5407058481901e-05, + "loss": 2.7359, + "step": 25600 + }, + { + "epoch": 1.1919128430756338, + "grad_norm": 0.3493757715888301, + "learning_rate": 7.540472548080551e-05, + "loss": 2.7485, + "step": 25601 + }, + { + "epoch": 1.1919594012617267, + "grad_norm": 0.34537016989414465, + "learning_rate": 7.540239240514968e-05, + "loss": 2.723, + "step": 25602 + }, + { + "epoch": 1.1920059594478198, + "grad_norm": 0.329160788658348, + "learning_rate": 7.540005925494034e-05, + "loss": 2.7457, + "step": 25603 + }, + { + "epoch": 1.192052517633913, + "grad_norm": 0.3116459173637406, + "learning_rate": 7.53977260301843e-05, + "loss": 2.6498, + "step": 25604 + }, + { + "epoch": 1.192099075820006, + "grad_norm": 0.3345367271448063, + "learning_rate": 7.539539273088847e-05, + "loss": 2.753, + "step": 25605 + }, + { + "epoch": 1.1921456340060992, + "grad_norm": 0.30075821595518454, + "learning_rate": 7.539305935705962e-05, + "loss": 2.7003, + "step": 25606 + }, + { + "epoch": 1.1921921921921923, + "grad_norm": 0.32970688910005846, + "learning_rate": 7.539072590870468e-05, + "loss": 2.8068, + "step": 25607 + }, + { + "epoch": 1.1922387503782852, + "grad_norm": 0.32998796718303763, + "learning_rate": 7.538839238583044e-05, + "loss": 2.8209, + "step": 25608 + }, + { + "epoch": 1.1922853085643783, + "grad_norm": 0.31210354526214007, + "learning_rate": 7.538605878844378e-05, + "loss": 2.7579, + "step": 25609 + }, + { + "epoch": 1.1923318667504714, + "grad_norm": 0.33633815322865196, + "learning_rate": 7.538372511655154e-05, + "loss": 2.7967, + "step": 25610 + }, + { + "epoch": 1.1923784249365645, + "grad_norm": 0.27069286484618804, + "learning_rate": 7.538139137016057e-05, + "loss": 2.8143, + "step": 25611 + }, + { + "epoch": 1.1924249831226574, + "grad_norm": 0.350456037115935, + "learning_rate": 7.537905754927769e-05, + "loss": 2.8654, + "step": 25612 + }, + { + "epoch": 1.1924715413087505, + "grad_norm": 0.3063376676655966, + "learning_rate": 7.53767236539098e-05, + "loss": 2.8161, + "step": 25613 + }, + { + "epoch": 1.1925180994948437, + "grad_norm": 0.3023766117709094, + "learning_rate": 
7.537438968406372e-05, + "loss": 2.8108, + "step": 25614 + }, + { + "epoch": 1.1925646576809368, + "grad_norm": 0.30593934462503136, + "learning_rate": 7.537205563974632e-05, + "loss": 2.7719, + "step": 25615 + }, + { + "epoch": 1.19261121586703, + "grad_norm": 0.32251711506212233, + "learning_rate": 7.536972152096444e-05, + "loss": 2.8053, + "step": 25616 + }, + { + "epoch": 1.192657774053123, + "grad_norm": 0.30227701959104625, + "learning_rate": 7.53673873277249e-05, + "loss": 2.7011, + "step": 25617 + }, + { + "epoch": 1.192704332239216, + "grad_norm": 0.3171978403706224, + "learning_rate": 7.536505306003459e-05, + "loss": 2.8675, + "step": 25618 + }, + { + "epoch": 1.192750890425309, + "grad_norm": 0.2993821373055314, + "learning_rate": 7.536271871790035e-05, + "loss": 2.7535, + "step": 25619 + }, + { + "epoch": 1.1927974486114021, + "grad_norm": 0.3156737982881946, + "learning_rate": 7.536038430132901e-05, + "loss": 2.8149, + "step": 25620 + }, + { + "epoch": 1.1928440067974952, + "grad_norm": 0.32185185114592424, + "learning_rate": 7.535804981032747e-05, + "loss": 2.811, + "step": 25621 + }, + { + "epoch": 1.1928905649835881, + "grad_norm": 0.31671288776378675, + "learning_rate": 7.535571524490253e-05, + "loss": 2.7083, + "step": 25622 + }, + { + "epoch": 1.1929371231696813, + "grad_norm": 0.311629531535939, + "learning_rate": 7.535338060506108e-05, + "loss": 2.6937, + "step": 25623 + }, + { + "epoch": 1.1929836813557744, + "grad_norm": 0.3229811961513961, + "learning_rate": 7.535104589080994e-05, + "loss": 2.8308, + "step": 25624 + }, + { + "epoch": 1.1930302395418675, + "grad_norm": 0.28616872583585473, + "learning_rate": 7.534871110215598e-05, + "loss": 2.6799, + "step": 25625 + }, + { + "epoch": 1.1930767977279606, + "grad_norm": 0.3436056818592091, + "learning_rate": 7.534637623910607e-05, + "loss": 2.7589, + "step": 25626 + }, + { + "epoch": 1.1931233559140535, + "grad_norm": 0.31017119118189407, + "learning_rate": 7.534404130166702e-05, + "loss": 2.6817, + "step": 25627 + }, + { + "epoch": 1.1931699141001466, + "grad_norm": 0.29224908436650227, + "learning_rate": 7.534170628984571e-05, + "loss": 2.8519, + "step": 25628 + }, + { + "epoch": 1.1932164722862397, + "grad_norm": 0.3250272157678213, + "learning_rate": 7.5339371203649e-05, + "loss": 2.8686, + "step": 25629 + }, + { + "epoch": 1.1932630304723328, + "grad_norm": 0.3222472121697495, + "learning_rate": 7.533703604308373e-05, + "loss": 2.8458, + "step": 25630 + }, + { + "epoch": 1.193309588658426, + "grad_norm": 0.31285751164918696, + "learning_rate": 7.533470080815674e-05, + "loss": 2.7722, + "step": 25631 + }, + { + "epoch": 1.1933561468445189, + "grad_norm": 0.30829902610154963, + "learning_rate": 7.533236549887492e-05, + "loss": 2.8418, + "step": 25632 + }, + { + "epoch": 1.193402705030612, + "grad_norm": 0.3270013049184519, + "learning_rate": 7.533003011524508e-05, + "loss": 2.6749, + "step": 25633 + }, + { + "epoch": 1.193449263216705, + "grad_norm": 0.3067130562228972, + "learning_rate": 7.532769465727412e-05, + "loss": 2.797, + "step": 25634 + }, + { + "epoch": 1.1934958214027982, + "grad_norm": 0.30081213681747543, + "learning_rate": 7.532535912496886e-05, + "loss": 2.709, + "step": 25635 + }, + { + "epoch": 1.1935423795888913, + "grad_norm": 0.33364167565269404, + "learning_rate": 7.532302351833616e-05, + "loss": 2.8055, + "step": 25636 + }, + { + "epoch": 1.1935889377749842, + "grad_norm": 0.31439438840533046, + "learning_rate": 7.532068783738288e-05, + "loss": 2.7377, + "step": 25637 + }, + { + "epoch": 
1.1936354959610773, + "grad_norm": 0.31894435745459054, + "learning_rate": 7.531835208211589e-05, + "loss": 2.8366, + "step": 25638 + }, + { + "epoch": 1.1936820541471704, + "grad_norm": 0.3389737106531867, + "learning_rate": 7.531601625254203e-05, + "loss": 2.7605, + "step": 25639 + }, + { + "epoch": 1.1937286123332636, + "grad_norm": 0.32793886121055166, + "learning_rate": 7.531368034866815e-05, + "loss": 2.7846, + "step": 25640 + }, + { + "epoch": 1.1937751705193564, + "grad_norm": 0.33645389833808126, + "learning_rate": 7.531134437050111e-05, + "loss": 2.7927, + "step": 25641 + }, + { + "epoch": 1.1938217287054496, + "grad_norm": 0.33344019449050594, + "learning_rate": 7.530900831804777e-05, + "loss": 2.7481, + "step": 25642 + }, + { + "epoch": 1.1938682868915427, + "grad_norm": 0.3441722550487317, + "learning_rate": 7.5306672191315e-05, + "loss": 2.8252, + "step": 25643 + }, + { + "epoch": 1.1939148450776358, + "grad_norm": 0.3029284264423711, + "learning_rate": 7.530433599030961e-05, + "loss": 2.8033, + "step": 25644 + }, + { + "epoch": 1.193961403263729, + "grad_norm": 0.3320971899297746, + "learning_rate": 7.530199971503851e-05, + "loss": 2.7623, + "step": 25645 + }, + { + "epoch": 1.194007961449822, + "grad_norm": 0.3095878904205157, + "learning_rate": 7.52996633655085e-05, + "loss": 2.8242, + "step": 25646 + }, + { + "epoch": 1.194054519635915, + "grad_norm": 0.34309855886786, + "learning_rate": 7.529732694172651e-05, + "loss": 2.752, + "step": 25647 + }, + { + "epoch": 1.194101077822008, + "grad_norm": 0.30550163337165326, + "learning_rate": 7.529499044369935e-05, + "loss": 2.8781, + "step": 25648 + }, + { + "epoch": 1.1941476360081011, + "grad_norm": 0.3488250337621532, + "learning_rate": 7.529265387143387e-05, + "loss": 2.8785, + "step": 25649 + }, + { + "epoch": 1.1941941941941943, + "grad_norm": 0.3145889541027423, + "learning_rate": 7.529031722493693e-05, + "loss": 2.7985, + "step": 25650 + }, + { + "epoch": 1.1942407523802872, + "grad_norm": 0.3112906298423452, + "learning_rate": 7.528798050421544e-05, + "loss": 2.7858, + "step": 25651 + }, + { + "epoch": 1.1942873105663803, + "grad_norm": 0.31743022350098377, + "learning_rate": 7.52856437092762e-05, + "loss": 2.7289, + "step": 25652 + }, + { + "epoch": 1.1943338687524734, + "grad_norm": 0.314900861113789, + "learning_rate": 7.528330684012608e-05, + "loss": 2.7745, + "step": 25653 + }, + { + "epoch": 1.1943804269385665, + "grad_norm": 0.3518783738707984, + "learning_rate": 7.528096989677196e-05, + "loss": 2.8306, + "step": 25654 + }, + { + "epoch": 1.1944269851246596, + "grad_norm": 0.2923090200863993, + "learning_rate": 7.527863287922066e-05, + "loss": 2.848, + "step": 25655 + }, + { + "epoch": 1.1944735433107527, + "grad_norm": 0.3884028981372034, + "learning_rate": 7.527629578747908e-05, + "loss": 2.8377, + "step": 25656 + }, + { + "epoch": 1.1945201014968456, + "grad_norm": 0.29895217379717887, + "learning_rate": 7.527395862155407e-05, + "loss": 2.8142, + "step": 25657 + }, + { + "epoch": 1.1945666596829387, + "grad_norm": 0.3478044389420508, + "learning_rate": 7.527162138145247e-05, + "loss": 2.7685, + "step": 25658 + }, + { + "epoch": 1.1946132178690319, + "grad_norm": 0.3562741571272746, + "learning_rate": 7.526928406718114e-05, + "loss": 2.8344, + "step": 25659 + }, + { + "epoch": 1.194659776055125, + "grad_norm": 0.36390888307572694, + "learning_rate": 7.526694667874696e-05, + "loss": 2.8896, + "step": 25660 + }, + { + "epoch": 1.1947063342412179, + "grad_norm": 0.3446814671571082, + "learning_rate": 
7.52646092161568e-05, + "loss": 2.7914, + "step": 25661 + }, + { + "epoch": 1.194752892427311, + "grad_norm": 0.3904566883601023, + "learning_rate": 7.526227167941749e-05, + "loss": 2.769, + "step": 25662 + }, + { + "epoch": 1.194799450613404, + "grad_norm": 0.3409722484924757, + "learning_rate": 7.525993406853589e-05, + "loss": 2.7624, + "step": 25663 + }, + { + "epoch": 1.1948460087994972, + "grad_norm": 0.3423690278642048, + "learning_rate": 7.525759638351886e-05, + "loss": 2.8897, + "step": 25664 + }, + { + "epoch": 1.1948925669855903, + "grad_norm": 0.35831655878277824, + "learning_rate": 7.52552586243733e-05, + "loss": 2.8906, + "step": 25665 + }, + { + "epoch": 1.1949391251716832, + "grad_norm": 0.3201052802197407, + "learning_rate": 7.525292079110605e-05, + "loss": 2.8688, + "step": 25666 + }, + { + "epoch": 1.1949856833577763, + "grad_norm": 0.35007608398147194, + "learning_rate": 7.525058288372394e-05, + "loss": 2.8011, + "step": 25667 + }, + { + "epoch": 1.1950322415438694, + "grad_norm": 0.32098617356221404, + "learning_rate": 7.524824490223386e-05, + "loss": 2.8704, + "step": 25668 + }, + { + "epoch": 1.1950787997299626, + "grad_norm": 0.32010150787426617, + "learning_rate": 7.524590684664268e-05, + "loss": 2.885, + "step": 25669 + }, + { + "epoch": 1.1951253579160557, + "grad_norm": 0.33665233382877424, + "learning_rate": 7.524356871695726e-05, + "loss": 2.8793, + "step": 25670 + }, + { + "epoch": 1.1951719161021486, + "grad_norm": 0.3221903784667935, + "learning_rate": 7.524123051318443e-05, + "loss": 2.7698, + "step": 25671 + }, + { + "epoch": 1.1952184742882417, + "grad_norm": 0.32964245318355756, + "learning_rate": 7.523889223533107e-05, + "loss": 2.81, + "step": 25672 + }, + { + "epoch": 1.1952650324743348, + "grad_norm": 0.31755688101246354, + "learning_rate": 7.523655388340408e-05, + "loss": 2.8378, + "step": 25673 + }, + { + "epoch": 1.195311590660428, + "grad_norm": 0.32685457319873257, + "learning_rate": 7.523421545741025e-05, + "loss": 2.8168, + "step": 25674 + }, + { + "epoch": 1.195358148846521, + "grad_norm": 0.3287796964956008, + "learning_rate": 7.523187695735652e-05, + "loss": 2.8288, + "step": 25675 + }, + { + "epoch": 1.195404707032614, + "grad_norm": 0.32850574586623676, + "learning_rate": 7.522953838324969e-05, + "loss": 2.7119, + "step": 25676 + }, + { + "epoch": 1.195451265218707, + "grad_norm": 0.31560882272982294, + "learning_rate": 7.522719973509667e-05, + "loss": 2.7138, + "step": 25677 + }, + { + "epoch": 1.1954978234048002, + "grad_norm": 0.3381309649019111, + "learning_rate": 7.522486101290431e-05, + "loss": 2.7677, + "step": 25678 + }, + { + "epoch": 1.1955443815908933, + "grad_norm": 0.3133744659835718, + "learning_rate": 7.522252221667945e-05, + "loss": 2.8036, + "step": 25679 + }, + { + "epoch": 1.1955909397769864, + "grad_norm": 0.3292088536003661, + "learning_rate": 7.522018334642897e-05, + "loss": 2.8442, + "step": 25680 + }, + { + "epoch": 1.1956374979630793, + "grad_norm": 0.34179459567430853, + "learning_rate": 7.521784440215975e-05, + "loss": 2.731, + "step": 25681 + }, + { + "epoch": 1.1956840561491724, + "grad_norm": 0.3391866848301557, + "learning_rate": 7.521550538387863e-05, + "loss": 2.7692, + "step": 25682 + }, + { + "epoch": 1.1957306143352655, + "grad_norm": 0.33706797937180183, + "learning_rate": 7.52131662915925e-05, + "loss": 2.7804, + "step": 25683 + }, + { + "epoch": 1.1957771725213586, + "grad_norm": 0.33621575083915295, + "learning_rate": 7.52108271253082e-05, + "loss": 2.8547, + "step": 25684 + }, + { + "epoch": 
1.1958237307074517, + "grad_norm": 0.34195894264127763, + "learning_rate": 7.52084878850326e-05, + "loss": 2.7438, + "step": 25685 + }, + { + "epoch": 1.1958702888935446, + "grad_norm": 0.3531419095247618, + "learning_rate": 7.520614857077257e-05, + "loss": 2.7948, + "step": 25686 + }, + { + "epoch": 1.1959168470796377, + "grad_norm": 0.3264870679823367, + "learning_rate": 7.520380918253499e-05, + "loss": 2.7263, + "step": 25687 + }, + { + "epoch": 1.1959634052657309, + "grad_norm": 0.347569000607571, + "learning_rate": 7.520146972032671e-05, + "loss": 2.8443, + "step": 25688 + }, + { + "epoch": 1.196009963451824, + "grad_norm": 0.33194932449892034, + "learning_rate": 7.51991301841546e-05, + "loss": 2.8992, + "step": 25689 + }, + { + "epoch": 1.1960565216379169, + "grad_norm": 0.37597508031336996, + "learning_rate": 7.519679057402552e-05, + "loss": 2.8773, + "step": 25690 + }, + { + "epoch": 1.19610307982401, + "grad_norm": 0.33275432441850145, + "learning_rate": 7.519445088994635e-05, + "loss": 2.8285, + "step": 25691 + }, + { + "epoch": 1.196149638010103, + "grad_norm": 0.40842934445200224, + "learning_rate": 7.519211113192394e-05, + "loss": 2.8451, + "step": 25692 + }, + { + "epoch": 1.1961961961961962, + "grad_norm": 0.30505629500604703, + "learning_rate": 7.518977129996518e-05, + "loss": 2.8381, + "step": 25693 + }, + { + "epoch": 1.1962427543822893, + "grad_norm": 0.37490172268521377, + "learning_rate": 7.518743139407691e-05, + "loss": 2.8327, + "step": 25694 + }, + { + "epoch": 1.1962893125683824, + "grad_norm": 0.32983975725189624, + "learning_rate": 7.518509141426601e-05, + "loss": 2.8989, + "step": 25695 + }, + { + "epoch": 1.1963358707544753, + "grad_norm": 0.34110599410036396, + "learning_rate": 7.518275136053935e-05, + "loss": 2.8488, + "step": 25696 + }, + { + "epoch": 1.1963824289405685, + "grad_norm": 0.3356083041346019, + "learning_rate": 7.51804112329038e-05, + "loss": 2.8812, + "step": 25697 + }, + { + "epoch": 1.1964289871266616, + "grad_norm": 0.3424857777169848, + "learning_rate": 7.517807103136623e-05, + "loss": 2.7158, + "step": 25698 + }, + { + "epoch": 1.1964755453127547, + "grad_norm": 0.366457026087504, + "learning_rate": 7.517573075593348e-05, + "loss": 2.8988, + "step": 25699 + }, + { + "epoch": 1.1965221034988476, + "grad_norm": 0.30465180688329613, + "learning_rate": 7.517339040661245e-05, + "loss": 2.6635, + "step": 25700 + }, + { + "epoch": 1.1965686616849407, + "grad_norm": 0.3658620936313327, + "learning_rate": 7.517104998341e-05, + "loss": 2.7626, + "step": 25701 + }, + { + "epoch": 1.1966152198710338, + "grad_norm": 0.3210591208444302, + "learning_rate": 7.516870948633302e-05, + "loss": 2.7807, + "step": 25702 + }, + { + "epoch": 1.196661778057127, + "grad_norm": 0.3314745057445759, + "learning_rate": 7.516636891538833e-05, + "loss": 2.7386, + "step": 25703 + }, + { + "epoch": 1.19670833624322, + "grad_norm": 0.35370197334605896, + "learning_rate": 7.516402827058283e-05, + "loss": 2.8236, + "step": 25704 + }, + { + "epoch": 1.1967548944293132, + "grad_norm": 0.39003602216554595, + "learning_rate": 7.516168755192339e-05, + "loss": 2.8835, + "step": 25705 + }, + { + "epoch": 1.196801452615406, + "grad_norm": 0.3335285005383226, + "learning_rate": 7.515934675941688e-05, + "loss": 2.873, + "step": 25706 + }, + { + "epoch": 1.1968480108014992, + "grad_norm": 0.35737648593856897, + "learning_rate": 7.515700589307016e-05, + "loss": 2.7601, + "step": 25707 + }, + { + "epoch": 1.1968945689875923, + "grad_norm": 0.3290453365608128, + "learning_rate": 
7.515466495289009e-05, + "loss": 2.7044, + "step": 25708 + }, + { + "epoch": 1.1969411271736854, + "grad_norm": 0.3460724526896841, + "learning_rate": 7.515232393888359e-05, + "loss": 2.915, + "step": 25709 + }, + { + "epoch": 1.1969876853597783, + "grad_norm": 0.3471297789604031, + "learning_rate": 7.514998285105748e-05, + "loss": 2.7353, + "step": 25710 + }, + { + "epoch": 1.1970342435458714, + "grad_norm": 0.3359185271588295, + "learning_rate": 7.514764168941866e-05, + "loss": 2.7614, + "step": 25711 + }, + { + "epoch": 1.1970808017319645, + "grad_norm": 0.31743124796098693, + "learning_rate": 7.514530045397397e-05, + "loss": 2.8034, + "step": 25712 + }, + { + "epoch": 1.1971273599180576, + "grad_norm": 0.33246762750794256, + "learning_rate": 7.514295914473031e-05, + "loss": 2.7697, + "step": 25713 + }, + { + "epoch": 1.1971739181041507, + "grad_norm": 0.31196654205584823, + "learning_rate": 7.514061776169453e-05, + "loss": 2.792, + "step": 25714 + }, + { + "epoch": 1.1972204762902436, + "grad_norm": 0.33384214067471085, + "learning_rate": 7.513827630487354e-05, + "loss": 2.7997, + "step": 25715 + }, + { + "epoch": 1.1972670344763368, + "grad_norm": 0.3191352602582847, + "learning_rate": 7.513593477427416e-05, + "loss": 2.7256, + "step": 25716 + }, + { + "epoch": 1.1973135926624299, + "grad_norm": 0.3091814739035392, + "learning_rate": 7.51335931699033e-05, + "loss": 2.8703, + "step": 25717 + }, + { + "epoch": 1.197360150848523, + "grad_norm": 0.32569157478356453, + "learning_rate": 7.513125149176782e-05, + "loss": 2.8152, + "step": 25718 + }, + { + "epoch": 1.197406709034616, + "grad_norm": 0.32441378201343984, + "learning_rate": 7.512890973987458e-05, + "loss": 2.7097, + "step": 25719 + }, + { + "epoch": 1.197453267220709, + "grad_norm": 0.3653712623112295, + "learning_rate": 7.512656791423049e-05, + "loss": 2.8161, + "step": 25720 + }, + { + "epoch": 1.197499825406802, + "grad_norm": 0.3236746712855718, + "learning_rate": 7.512422601484237e-05, + "loss": 2.6882, + "step": 25721 + }, + { + "epoch": 1.1975463835928952, + "grad_norm": 0.3261383582757282, + "learning_rate": 7.512188404171714e-05, + "loss": 2.8468, + "step": 25722 + }, + { + "epoch": 1.1975929417789883, + "grad_norm": 0.2966395898600168, + "learning_rate": 7.511954199486165e-05, + "loss": 2.8608, + "step": 25723 + }, + { + "epoch": 1.1976394999650815, + "grad_norm": 0.345748804925744, + "learning_rate": 7.511719987428278e-05, + "loss": 2.7891, + "step": 25724 + }, + { + "epoch": 1.1976860581511743, + "grad_norm": 0.29807230591078293, + "learning_rate": 7.51148576799874e-05, + "loss": 2.7631, + "step": 25725 + }, + { + "epoch": 1.1977326163372675, + "grad_norm": 0.35656933634827415, + "learning_rate": 7.51125154119824e-05, + "loss": 2.7907, + "step": 25726 + }, + { + "epoch": 1.1977791745233606, + "grad_norm": 0.31984766325611497, + "learning_rate": 7.511017307027463e-05, + "loss": 2.8307, + "step": 25727 + }, + { + "epoch": 1.1978257327094537, + "grad_norm": 0.34774223154819905, + "learning_rate": 7.510783065487098e-05, + "loss": 2.7524, + "step": 25728 + }, + { + "epoch": 1.1978722908955466, + "grad_norm": 0.3308128824893376, + "learning_rate": 7.510548816577832e-05, + "loss": 2.7646, + "step": 25729 + }, + { + "epoch": 1.1979188490816397, + "grad_norm": 0.32370381036880386, + "learning_rate": 7.510314560300352e-05, + "loss": 2.8595, + "step": 25730 + }, + { + "epoch": 1.1979654072677328, + "grad_norm": 0.3295349220097824, + "learning_rate": 7.510080296655346e-05, + "loss": 2.9252, + "step": 25731 + }, + { + "epoch": 
1.198011965453826, + "grad_norm": 0.3393514376118982, + "learning_rate": 7.509846025643502e-05, + "loss": 2.7434, + "step": 25732 + }, + { + "epoch": 1.198058523639919, + "grad_norm": 0.33026942308270885, + "learning_rate": 7.509611747265508e-05, + "loss": 2.7034, + "step": 25733 + }, + { + "epoch": 1.1981050818260122, + "grad_norm": 0.3525677131473295, + "learning_rate": 7.509377461522049e-05, + "loss": 2.8794, + "step": 25734 + }, + { + "epoch": 1.198151640012105, + "grad_norm": 0.3480179482069483, + "learning_rate": 7.509143168413816e-05, + "loss": 2.7668, + "step": 25735 + }, + { + "epoch": 1.1981981981981982, + "grad_norm": 0.337897174666739, + "learning_rate": 7.508908867941493e-05, + "loss": 2.8151, + "step": 25736 + }, + { + "epoch": 1.1982447563842913, + "grad_norm": 0.3614839956591202, + "learning_rate": 7.508674560105772e-05, + "loss": 2.7438, + "step": 25737 + }, + { + "epoch": 1.1982913145703844, + "grad_norm": 0.3167300654520959, + "learning_rate": 7.508440244907338e-05, + "loss": 2.7574, + "step": 25738 + }, + { + "epoch": 1.1983378727564773, + "grad_norm": 0.38925452381162584, + "learning_rate": 7.508205922346878e-05, + "loss": 2.8077, + "step": 25739 + }, + { + "epoch": 1.1983844309425704, + "grad_norm": 0.33748819655787393, + "learning_rate": 7.507971592425081e-05, + "loss": 2.8331, + "step": 25740 + }, + { + "epoch": 1.1984309891286635, + "grad_norm": 0.41948872907682305, + "learning_rate": 7.507737255142633e-05, + "loss": 2.8477, + "step": 25741 + }, + { + "epoch": 1.1984775473147566, + "grad_norm": 0.329924638469366, + "learning_rate": 7.507502910500226e-05, + "loss": 2.7801, + "step": 25742 + }, + { + "epoch": 1.1985241055008498, + "grad_norm": 0.3736819099723286, + "learning_rate": 7.507268558498543e-05, + "loss": 2.759, + "step": 25743 + }, + { + "epoch": 1.1985706636869429, + "grad_norm": 0.33384124056410247, + "learning_rate": 7.507034199138275e-05, + "loss": 2.8202, + "step": 25744 + }, + { + "epoch": 1.1986172218730358, + "grad_norm": 0.37702708568335164, + "learning_rate": 7.506799832420109e-05, + "loss": 2.8067, + "step": 25745 + }, + { + "epoch": 1.1986637800591289, + "grad_norm": 0.3301472595000553, + "learning_rate": 7.50656545834473e-05, + "loss": 2.8037, + "step": 25746 + }, + { + "epoch": 1.198710338245222, + "grad_norm": 0.3585964543149947, + "learning_rate": 7.506331076912831e-05, + "loss": 2.8987, + "step": 25747 + }, + { + "epoch": 1.198756896431315, + "grad_norm": 0.3519243074233975, + "learning_rate": 7.506096688125094e-05, + "loss": 2.7566, + "step": 25748 + }, + { + "epoch": 1.198803454617408, + "grad_norm": 0.3459493829548694, + "learning_rate": 7.505862291982213e-05, + "loss": 2.7543, + "step": 25749 + }, + { + "epoch": 1.1988500128035011, + "grad_norm": 0.3250850921747277, + "learning_rate": 7.505627888484872e-05, + "loss": 2.7622, + "step": 25750 + }, + { + "epoch": 1.1988965709895942, + "grad_norm": 0.3458781889750457, + "learning_rate": 7.505393477633761e-05, + "loss": 2.7312, + "step": 25751 + }, + { + "epoch": 1.1989431291756873, + "grad_norm": 0.3423277045045609, + "learning_rate": 7.505159059429565e-05, + "loss": 2.9212, + "step": 25752 + }, + { + "epoch": 1.1989896873617805, + "grad_norm": 0.35636380521501515, + "learning_rate": 7.504924633872976e-05, + "loss": 2.8135, + "step": 25753 + }, + { + "epoch": 1.1990362455478734, + "grad_norm": 0.31727557621779423, + "learning_rate": 7.504690200964678e-05, + "loss": 2.7971, + "step": 25754 + }, + { + "epoch": 1.1990828037339665, + "grad_norm": 0.32702357899030016, + "learning_rate": 
7.504455760705362e-05, + "loss": 2.8201, + "step": 25755 + }, + { + "epoch": 1.1991293619200596, + "grad_norm": 0.29564217275493676, + "learning_rate": 7.504221313095716e-05, + "loss": 2.6612, + "step": 25756 + }, + { + "epoch": 1.1991759201061527, + "grad_norm": 0.29904310290979796, + "learning_rate": 7.503986858136427e-05, + "loss": 2.8274, + "step": 25757 + }, + { + "epoch": 1.1992224782922458, + "grad_norm": 0.30375812613927305, + "learning_rate": 7.503752395828182e-05, + "loss": 2.8607, + "step": 25758 + }, + { + "epoch": 1.1992690364783387, + "grad_norm": 0.3386963694927706, + "learning_rate": 7.50351792617167e-05, + "loss": 2.8654, + "step": 25759 + }, + { + "epoch": 1.1993155946644318, + "grad_norm": 0.30629506877571977, + "learning_rate": 7.503283449167582e-05, + "loss": 2.7507, + "step": 25760 + }, + { + "epoch": 1.199362152850525, + "grad_norm": 0.3124085468335415, + "learning_rate": 7.503048964816601e-05, + "loss": 2.8326, + "step": 25761 + }, + { + "epoch": 1.199408711036618, + "grad_norm": 0.3495574057393848, + "learning_rate": 7.502814473119419e-05, + "loss": 2.9258, + "step": 25762 + }, + { + "epoch": 1.1994552692227112, + "grad_norm": 0.2994332314316145, + "learning_rate": 7.502579974076723e-05, + "loss": 2.7352, + "step": 25763 + }, + { + "epoch": 1.199501827408804, + "grad_norm": 0.32473075316775785, + "learning_rate": 7.502345467689202e-05, + "loss": 2.7292, + "step": 25764 + }, + { + "epoch": 1.1995483855948972, + "grad_norm": 0.3116125552715895, + "learning_rate": 7.502110953957543e-05, + "loss": 2.7388, + "step": 25765 + }, + { + "epoch": 1.1995949437809903, + "grad_norm": 0.3218649296848924, + "learning_rate": 7.501876432882436e-05, + "loss": 2.772, + "step": 25766 + }, + { + "epoch": 1.1996415019670834, + "grad_norm": 0.33413281642607023, + "learning_rate": 7.501641904464565e-05, + "loss": 2.77, + "step": 25767 + }, + { + "epoch": 1.1996880601531765, + "grad_norm": 0.3166714959721855, + "learning_rate": 7.501407368704625e-05, + "loss": 2.7613, + "step": 25768 + }, + { + "epoch": 1.1997346183392694, + "grad_norm": 0.33704074846058885, + "learning_rate": 7.5011728256033e-05, + "loss": 2.8475, + "step": 25769 + }, + { + "epoch": 1.1997811765253625, + "grad_norm": 0.3115408473847986, + "learning_rate": 7.500938275161278e-05, + "loss": 2.7393, + "step": 25770 + }, + { + "epoch": 1.1998277347114557, + "grad_norm": 0.32855097823610047, + "learning_rate": 7.500703717379248e-05, + "loss": 2.7739, + "step": 25771 + }, + { + "epoch": 1.1998742928975488, + "grad_norm": 0.3197830876576775, + "learning_rate": 7.5004691522579e-05, + "loss": 2.7699, + "step": 25772 + }, + { + "epoch": 1.1999208510836419, + "grad_norm": 0.3130232301082906, + "learning_rate": 7.500234579797921e-05, + "loss": 2.8131, + "step": 25773 + }, + { + "epoch": 1.1999674092697348, + "grad_norm": 0.3148038438309782, + "learning_rate": 7.500000000000001e-05, + "loss": 2.8232, + "step": 25774 + }, + { + "epoch": 1.200013967455828, + "grad_norm": 0.3226154503656416, + "learning_rate": 7.499765412864825e-05, + "loss": 2.7856, + "step": 25775 + }, + { + "epoch": 1.200060525641921, + "grad_norm": 0.32435124164985496, + "learning_rate": 7.499530818393086e-05, + "loss": 2.7996, + "step": 25776 + }, + { + "epoch": 1.2001070838280141, + "grad_norm": 0.31406819680456105, + "learning_rate": 7.49929621658547e-05, + "loss": 2.7584, + "step": 25777 + }, + { + "epoch": 1.200153642014107, + "grad_norm": 0.3294341685331358, + "learning_rate": 7.499061607442666e-05, + "loss": 2.8079, + "step": 25778 + }, + { + "epoch": 
1.2002002002002001, + "grad_norm": 0.34479515994606785, + "learning_rate": 7.498826990965361e-05, + "loss": 2.8146, + "step": 25779 + }, + { + "epoch": 1.2002467583862932, + "grad_norm": 0.33434768359250394, + "learning_rate": 7.498592367154245e-05, + "loss": 2.8242, + "step": 25780 + }, + { + "epoch": 1.2002933165723864, + "grad_norm": 0.2998257702906858, + "learning_rate": 7.498357736010007e-05, + "loss": 2.7194, + "step": 25781 + }, + { + "epoch": 1.2003398747584795, + "grad_norm": 0.31572192693651924, + "learning_rate": 7.498123097533336e-05, + "loss": 2.7641, + "step": 25782 + }, + { + "epoch": 1.2003864329445726, + "grad_norm": 0.3340958625980701, + "learning_rate": 7.49788845172492e-05, + "loss": 2.7438, + "step": 25783 + }, + { + "epoch": 1.2004329911306655, + "grad_norm": 0.31790924472968857, + "learning_rate": 7.497653798585447e-05, + "loss": 2.8702, + "step": 25784 + }, + { + "epoch": 1.2004795493167586, + "grad_norm": 0.3458753363585147, + "learning_rate": 7.497419138115604e-05, + "loss": 2.7925, + "step": 25785 + }, + { + "epoch": 1.2005261075028517, + "grad_norm": 0.31134947107312066, + "learning_rate": 7.497184470316084e-05, + "loss": 2.6806, + "step": 25786 + }, + { + "epoch": 1.2005726656889448, + "grad_norm": 0.3389855968802293, + "learning_rate": 7.496949795187574e-05, + "loss": 2.8134, + "step": 25787 + }, + { + "epoch": 1.2006192238750377, + "grad_norm": 0.3492925412110036, + "learning_rate": 7.496715112730761e-05, + "loss": 2.7671, + "step": 25788 + }, + { + "epoch": 1.2006657820611308, + "grad_norm": 0.310238833123197, + "learning_rate": 7.496480422946336e-05, + "loss": 2.7346, + "step": 25789 + }, + { + "epoch": 1.200712340247224, + "grad_norm": 0.3464956873449146, + "learning_rate": 7.496245725834985e-05, + "loss": 2.7806, + "step": 25790 + }, + { + "epoch": 1.200758898433317, + "grad_norm": 0.32676810184439864, + "learning_rate": 7.496011021397402e-05, + "loss": 2.6283, + "step": 25791 + }, + { + "epoch": 1.2008054566194102, + "grad_norm": 0.34559304817492714, + "learning_rate": 7.49577630963427e-05, + "loss": 2.8637, + "step": 25792 + }, + { + "epoch": 1.2008520148055033, + "grad_norm": 0.34105672001841114, + "learning_rate": 7.495541590546282e-05, + "loss": 2.7639, + "step": 25793 + }, + { + "epoch": 1.2008985729915962, + "grad_norm": 0.3449680035936689, + "learning_rate": 7.495306864134125e-05, + "loss": 2.7226, + "step": 25794 + }, + { + "epoch": 1.2009451311776893, + "grad_norm": 0.3467299872271794, + "learning_rate": 7.495072130398489e-05, + "loss": 2.8041, + "step": 25795 + }, + { + "epoch": 1.2009916893637824, + "grad_norm": 0.35640689660303354, + "learning_rate": 7.49483738934006e-05, + "loss": 2.8447, + "step": 25796 + }, + { + "epoch": 1.2010382475498755, + "grad_norm": 0.33049802589434846, + "learning_rate": 7.494602640959529e-05, + "loss": 2.8579, + "step": 25797 + }, + { + "epoch": 1.2010848057359684, + "grad_norm": 0.3293282203858116, + "learning_rate": 7.494367885257587e-05, + "loss": 2.7142, + "step": 25798 + }, + { + "epoch": 1.2011313639220615, + "grad_norm": 0.3248832285365294, + "learning_rate": 7.494133122234919e-05, + "loss": 2.7768, + "step": 25799 + }, + { + "epoch": 1.2011779221081547, + "grad_norm": 0.3166127074187054, + "learning_rate": 7.493898351892217e-05, + "loss": 2.7943, + "step": 25800 + }, + { + "epoch": 1.2012244802942478, + "grad_norm": 0.33478047332746386, + "learning_rate": 7.49366357423017e-05, + "loss": 2.8766, + "step": 25801 + }, + { + "epoch": 1.201271038480341, + "grad_norm": 0.3172132641060159, + "learning_rate": 
7.493428789249463e-05, + "loss": 2.7259, + "step": 25802 + }, + { + "epoch": 1.2013175966664338, + "grad_norm": 0.3155135555048, + "learning_rate": 7.493193996950791e-05, + "loss": 2.8913, + "step": 25803 + }, + { + "epoch": 1.201364154852527, + "grad_norm": 0.29298757547450954, + "learning_rate": 7.492959197334839e-05, + "loss": 2.7235, + "step": 25804 + }, + { + "epoch": 1.20141071303862, + "grad_norm": 0.3118253728109291, + "learning_rate": 7.492724390402299e-05, + "loss": 2.7852, + "step": 25805 + }, + { + "epoch": 1.2014572712247131, + "grad_norm": 0.3107848132538292, + "learning_rate": 7.492489576153857e-05, + "loss": 2.7656, + "step": 25806 + }, + { + "epoch": 1.2015038294108062, + "grad_norm": 0.286941679372035, + "learning_rate": 7.492254754590205e-05, + "loss": 2.8272, + "step": 25807 + }, + { + "epoch": 1.2015503875968991, + "grad_norm": 0.3404573459833188, + "learning_rate": 7.492019925712028e-05, + "loss": 2.8004, + "step": 25808 + }, + { + "epoch": 1.2015969457829923, + "grad_norm": 0.34228943476266144, + "learning_rate": 7.491785089520021e-05, + "loss": 2.7357, + "step": 25809 + }, + { + "epoch": 1.2016435039690854, + "grad_norm": 0.33316427961826234, + "learning_rate": 7.49155024601487e-05, + "loss": 2.8184, + "step": 25810 + }, + { + "epoch": 1.2016900621551785, + "grad_norm": 0.3697642166536506, + "learning_rate": 7.491315395197262e-05, + "loss": 2.763, + "step": 25811 + }, + { + "epoch": 1.2017366203412716, + "grad_norm": 0.31518293119113594, + "learning_rate": 7.49108053706789e-05, + "loss": 2.7378, + "step": 25812 + }, + { + "epoch": 1.2017831785273645, + "grad_norm": 0.36293146369676854, + "learning_rate": 7.490845671627443e-05, + "loss": 2.6852, + "step": 25813 + }, + { + "epoch": 1.2018297367134576, + "grad_norm": 0.31828053056180167, + "learning_rate": 7.490610798876609e-05, + "loss": 2.8339, + "step": 25814 + }, + { + "epoch": 1.2018762948995507, + "grad_norm": 0.3578400651554948, + "learning_rate": 7.490375918816077e-05, + "loss": 2.7799, + "step": 25815 + }, + { + "epoch": 1.2019228530856438, + "grad_norm": 0.312828942319125, + "learning_rate": 7.490141031446535e-05, + "loss": 2.7557, + "step": 25816 + }, + { + "epoch": 1.2019694112717367, + "grad_norm": 0.3319465162196244, + "learning_rate": 7.489906136768679e-05, + "loss": 2.8407, + "step": 25817 + }, + { + "epoch": 1.2020159694578298, + "grad_norm": 0.3353415229897859, + "learning_rate": 7.489671234783191e-05, + "loss": 2.7858, + "step": 25818 + }, + { + "epoch": 1.202062527643923, + "grad_norm": 0.33721076698027513, + "learning_rate": 7.489436325490764e-05, + "loss": 2.793, + "step": 25819 + }, + { + "epoch": 1.202109085830016, + "grad_norm": 0.3382563390367734, + "learning_rate": 7.489201408892085e-05, + "loss": 2.7982, + "step": 25820 + }, + { + "epoch": 1.2021556440161092, + "grad_norm": 0.3276300918093611, + "learning_rate": 7.488966484987847e-05, + "loss": 2.7746, + "step": 25821 + }, + { + "epoch": 1.2022022022022023, + "grad_norm": 0.35249950327013385, + "learning_rate": 7.488731553778738e-05, + "loss": 2.8904, + "step": 25822 + }, + { + "epoch": 1.2022487603882952, + "grad_norm": 0.3180730135768668, + "learning_rate": 7.488496615265446e-05, + "loss": 2.7514, + "step": 25823 + }, + { + "epoch": 1.2022953185743883, + "grad_norm": 0.3491353329323558, + "learning_rate": 7.488261669448662e-05, + "loss": 2.8653, + "step": 25824 + }, + { + "epoch": 1.2023418767604814, + "grad_norm": 0.340826932394763, + "learning_rate": 7.488026716329075e-05, + "loss": 2.764, + "step": 25825 + }, + { + "epoch": 
1.2023884349465745, + "grad_norm": 0.35688715223944234, + "learning_rate": 7.487791755907373e-05, + "loss": 2.892, + "step": 25826 + }, + { + "epoch": 1.2024349931326674, + "grad_norm": 0.32351824412735436, + "learning_rate": 7.487556788184249e-05, + "loss": 2.8397, + "step": 25827 + }, + { + "epoch": 1.2024815513187606, + "grad_norm": 0.368212999219034, + "learning_rate": 7.487321813160391e-05, + "loss": 2.89, + "step": 25828 + }, + { + "epoch": 1.2025281095048537, + "grad_norm": 0.3129983960945787, + "learning_rate": 7.487086830836487e-05, + "loss": 2.8569, + "step": 25829 + }, + { + "epoch": 1.2025746676909468, + "grad_norm": 0.3616483383632694, + "learning_rate": 7.486851841213229e-05, + "loss": 2.9444, + "step": 25830 + }, + { + "epoch": 1.20262122587704, + "grad_norm": 0.3217233424689347, + "learning_rate": 7.486616844291307e-05, + "loss": 2.7982, + "step": 25831 + }, + { + "epoch": 1.202667784063133, + "grad_norm": 0.34625768407319785, + "learning_rate": 7.486381840071409e-05, + "loss": 2.674, + "step": 25832 + }, + { + "epoch": 1.202714342249226, + "grad_norm": 0.32646125199214765, + "learning_rate": 7.486146828554226e-05, + "loss": 2.7564, + "step": 25833 + }, + { + "epoch": 1.202760900435319, + "grad_norm": 0.33890895695506623, + "learning_rate": 7.485911809740445e-05, + "loss": 2.8318, + "step": 25834 + }, + { + "epoch": 1.2028074586214121, + "grad_norm": 0.3311265784169416, + "learning_rate": 7.48567678363076e-05, + "loss": 2.8119, + "step": 25835 + }, + { + "epoch": 1.2028540168075053, + "grad_norm": 0.36309692309907565, + "learning_rate": 7.485441750225855e-05, + "loss": 2.7855, + "step": 25836 + }, + { + "epoch": 1.2029005749935981, + "grad_norm": 0.33175258773154753, + "learning_rate": 7.485206709526426e-05, + "loss": 2.719, + "step": 25837 + }, + { + "epoch": 1.2029471331796913, + "grad_norm": 0.34864400430504183, + "learning_rate": 7.48497166153316e-05, + "loss": 2.7709, + "step": 25838 + }, + { + "epoch": 1.2029936913657844, + "grad_norm": 0.3222256794994188, + "learning_rate": 7.484736606246747e-05, + "loss": 2.8843, + "step": 25839 + }, + { + "epoch": 1.2030402495518775, + "grad_norm": 0.3395532752333288, + "learning_rate": 7.484501543667877e-05, + "loss": 2.8036, + "step": 25840 + }, + { + "epoch": 1.2030868077379706, + "grad_norm": 0.3381852004145155, + "learning_rate": 7.484266473797239e-05, + "loss": 2.7782, + "step": 25841 + }, + { + "epoch": 1.2031333659240635, + "grad_norm": 0.32059170368232587, + "learning_rate": 7.484031396635524e-05, + "loss": 2.7419, + "step": 25842 + }, + { + "epoch": 1.2031799241101566, + "grad_norm": 0.3397016445816123, + "learning_rate": 7.483796312183421e-05, + "loss": 2.7344, + "step": 25843 + }, + { + "epoch": 1.2032264822962497, + "grad_norm": 0.30856505467172596, + "learning_rate": 7.483561220441621e-05, + "loss": 2.7552, + "step": 25844 + }, + { + "epoch": 1.2032730404823428, + "grad_norm": 0.3305806707318884, + "learning_rate": 7.483326121410814e-05, + "loss": 2.8067, + "step": 25845 + }, + { + "epoch": 1.203319598668436, + "grad_norm": 0.3198933735883826, + "learning_rate": 7.483091015091689e-05, + "loss": 2.7679, + "step": 25846 + }, + { + "epoch": 1.2033661568545289, + "grad_norm": 0.3009487839989389, + "learning_rate": 7.482855901484937e-05, + "loss": 2.7333, + "step": 25847 + }, + { + "epoch": 1.203412715040622, + "grad_norm": 0.34698749061278067, + "learning_rate": 7.482620780591246e-05, + "loss": 2.7905, + "step": 25848 + }, + { + "epoch": 1.203459273226715, + "grad_norm": 0.30247640944553245, + "learning_rate": 
7.482385652411308e-05, + "loss": 2.9584, + "step": 25849 + }, + { + "epoch": 1.2035058314128082, + "grad_norm": 0.3215662515499641, + "learning_rate": 7.482150516945814e-05, + "loss": 2.7186, + "step": 25850 + }, + { + "epoch": 1.2035523895989013, + "grad_norm": 0.3109695499454093, + "learning_rate": 7.481915374195451e-05, + "loss": 2.7081, + "step": 25851 + }, + { + "epoch": 1.2035989477849942, + "grad_norm": 0.3263773530292173, + "learning_rate": 7.481680224160913e-05, + "loss": 2.7263, + "step": 25852 + }, + { + "epoch": 1.2036455059710873, + "grad_norm": 0.324812065983447, + "learning_rate": 7.481445066842886e-05, + "loss": 2.7636, + "step": 25853 + }, + { + "epoch": 1.2036920641571804, + "grad_norm": 0.33063949612677607, + "learning_rate": 7.481209902242064e-05, + "loss": 2.855, + "step": 25854 + }, + { + "epoch": 1.2037386223432736, + "grad_norm": 0.3015444067164665, + "learning_rate": 7.480974730359133e-05, + "loss": 2.7135, + "step": 25855 + }, + { + "epoch": 1.2037851805293667, + "grad_norm": 0.30817444582605913, + "learning_rate": 7.480739551194788e-05, + "loss": 2.7249, + "step": 25856 + }, + { + "epoch": 1.2038317387154596, + "grad_norm": 0.32693701203899983, + "learning_rate": 7.480504364749717e-05, + "loss": 2.8244, + "step": 25857 + }, + { + "epoch": 1.2038782969015527, + "grad_norm": 0.3193662886498509, + "learning_rate": 7.480269171024608e-05, + "loss": 2.8567, + "step": 25858 + }, + { + "epoch": 1.2039248550876458, + "grad_norm": 0.3142931445360644, + "learning_rate": 7.480033970020155e-05, + "loss": 2.8257, + "step": 25859 + }, + { + "epoch": 1.203971413273739, + "grad_norm": 0.33634299182894617, + "learning_rate": 7.479798761737046e-05, + "loss": 2.7791, + "step": 25860 + }, + { + "epoch": 1.204017971459832, + "grad_norm": 0.33342380314051523, + "learning_rate": 7.479563546175971e-05, + "loss": 2.8216, + "step": 25861 + }, + { + "epoch": 1.204064529645925, + "grad_norm": 0.3430172267682303, + "learning_rate": 7.479328323337622e-05, + "loss": 2.8668, + "step": 25862 + }, + { + "epoch": 1.204111087832018, + "grad_norm": 0.3419203365462434, + "learning_rate": 7.479093093222689e-05, + "loss": 2.8391, + "step": 25863 + }, + { + "epoch": 1.2041576460181111, + "grad_norm": 0.32781799490086383, + "learning_rate": 7.478857855831862e-05, + "loss": 2.8384, + "step": 25864 + }, + { + "epoch": 1.2042042042042043, + "grad_norm": 0.3519125989212542, + "learning_rate": 7.478622611165831e-05, + "loss": 2.7666, + "step": 25865 + }, + { + "epoch": 1.2042507623902972, + "grad_norm": 0.28950892538924206, + "learning_rate": 7.478387359225286e-05, + "loss": 2.7432, + "step": 25866 + }, + { + "epoch": 1.2042973205763903, + "grad_norm": 0.32350297168229075, + "learning_rate": 7.478152100010919e-05, + "loss": 2.7323, + "step": 25867 + }, + { + "epoch": 1.2043438787624834, + "grad_norm": 0.3215189113885166, + "learning_rate": 7.47791683352342e-05, + "loss": 2.7752, + "step": 25868 + }, + { + "epoch": 1.2043904369485765, + "grad_norm": 0.29420022786040023, + "learning_rate": 7.47768155976348e-05, + "loss": 2.763, + "step": 25869 + }, + { + "epoch": 1.2044369951346696, + "grad_norm": 0.30745934840186573, + "learning_rate": 7.477446278731788e-05, + "loss": 2.8722, + "step": 25870 + }, + { + "epoch": 1.2044835533207627, + "grad_norm": 0.3128682085512243, + "learning_rate": 7.477210990429035e-05, + "loss": 2.7435, + "step": 25871 + }, + { + "epoch": 1.2045301115068556, + "grad_norm": 0.30705432410605404, + "learning_rate": 7.476975694855912e-05, + "loss": 2.8425, + "step": 25872 + }, + { + "epoch": 
1.2045766696929487, + "grad_norm": 0.29281867239715514, + "learning_rate": 7.476740392013109e-05, + "loss": 2.7934, + "step": 25873 + }, + { + "epoch": 1.2046232278790419, + "grad_norm": 0.32152795601509493, + "learning_rate": 7.476505081901318e-05, + "loss": 2.7592, + "step": 25874 + }, + { + "epoch": 1.204669786065135, + "grad_norm": 0.28595041821830436, + "learning_rate": 7.476269764521227e-05, + "loss": 2.7348, + "step": 25875 + }, + { + "epoch": 1.2047163442512279, + "grad_norm": 0.32045708646292964, + "learning_rate": 7.476034439873529e-05, + "loss": 2.834, + "step": 25876 + }, + { + "epoch": 1.204762902437321, + "grad_norm": 0.30952995913462195, + "learning_rate": 7.475799107958914e-05, + "loss": 2.8063, + "step": 25877 + }, + { + "epoch": 1.204809460623414, + "grad_norm": 0.31324819723487085, + "learning_rate": 7.475563768778073e-05, + "loss": 2.8099, + "step": 25878 + }, + { + "epoch": 1.2048560188095072, + "grad_norm": 0.3404431777186428, + "learning_rate": 7.475328422331694e-05, + "loss": 2.7132, + "step": 25879 + }, + { + "epoch": 1.2049025769956003, + "grad_norm": 0.34118483039679454, + "learning_rate": 7.475093068620471e-05, + "loss": 2.857, + "step": 25880 + }, + { + "epoch": 1.2049491351816934, + "grad_norm": 0.33347185461479867, + "learning_rate": 7.474857707645094e-05, + "loss": 2.7242, + "step": 25881 + }, + { + "epoch": 1.2049956933677863, + "grad_norm": 0.3363002267531844, + "learning_rate": 7.474622339406255e-05, + "loss": 2.6781, + "step": 25882 + }, + { + "epoch": 1.2050422515538795, + "grad_norm": 0.3275478864434109, + "learning_rate": 7.474386963904643e-05, + "loss": 2.8467, + "step": 25883 + }, + { + "epoch": 1.2050888097399726, + "grad_norm": 0.328771915094205, + "learning_rate": 7.474151581140946e-05, + "loss": 2.7123, + "step": 25884 + }, + { + "epoch": 1.2051353679260657, + "grad_norm": 0.3375676076771547, + "learning_rate": 7.47391619111586e-05, + "loss": 2.8387, + "step": 25885 + }, + { + "epoch": 1.2051819261121586, + "grad_norm": 0.3266613958611643, + "learning_rate": 7.473680793830073e-05, + "loss": 2.7813, + "step": 25886 + }, + { + "epoch": 1.2052284842982517, + "grad_norm": 0.3443445252598033, + "learning_rate": 7.473445389284278e-05, + "loss": 2.8651, + "step": 25887 + }, + { + "epoch": 1.2052750424843448, + "grad_norm": 0.3212220994514091, + "learning_rate": 7.473209977479163e-05, + "loss": 2.8283, + "step": 25888 + }, + { + "epoch": 1.205321600670438, + "grad_norm": 0.3407730995302934, + "learning_rate": 7.47297455841542e-05, + "loss": 2.8301, + "step": 25889 + }, + { + "epoch": 1.205368158856531, + "grad_norm": 0.32369416969848186, + "learning_rate": 7.472739132093741e-05, + "loss": 2.8656, + "step": 25890 + }, + { + "epoch": 1.205414717042624, + "grad_norm": 0.38402211407714737, + "learning_rate": 7.472503698514817e-05, + "loss": 2.8323, + "step": 25891 + }, + { + "epoch": 1.205461275228717, + "grad_norm": 0.3335500889339141, + "learning_rate": 7.472268257679338e-05, + "loss": 2.806, + "step": 25892 + }, + { + "epoch": 1.2055078334148102, + "grad_norm": 0.3852354497186842, + "learning_rate": 7.472032809587995e-05, + "loss": 2.8221, + "step": 25893 + }, + { + "epoch": 1.2055543916009033, + "grad_norm": 0.3400508956052656, + "learning_rate": 7.471797354241477e-05, + "loss": 2.8123, + "step": 25894 + }, + { + "epoch": 1.2056009497869964, + "grad_norm": 0.3390086756470501, + "learning_rate": 7.47156189164048e-05, + "loss": 2.7389, + "step": 25895 + }, + { + "epoch": 1.2056475079730893, + "grad_norm": 0.35060907188332957, + "learning_rate": 
7.471326421785693e-05, + "loss": 2.8153, + "step": 25896 + }, + { + "epoch": 1.2056940661591824, + "grad_norm": 0.3093941954460951, + "learning_rate": 7.471090944677805e-05, + "loss": 2.8251, + "step": 25897 + }, + { + "epoch": 1.2057406243452755, + "grad_norm": 0.36064955487747474, + "learning_rate": 7.470855460317509e-05, + "loss": 2.7906, + "step": 25898 + }, + { + "epoch": 1.2057871825313686, + "grad_norm": 0.3413369985453724, + "learning_rate": 7.470619968705495e-05, + "loss": 2.816, + "step": 25899 + }, + { + "epoch": 1.2058337407174617, + "grad_norm": 0.35999731625690845, + "learning_rate": 7.470384469842455e-05, + "loss": 2.8436, + "step": 25900 + }, + { + "epoch": 1.2058802989035546, + "grad_norm": 0.35246872968270737, + "learning_rate": 7.47014896372908e-05, + "loss": 2.698, + "step": 25901 + }, + { + "epoch": 1.2059268570896478, + "grad_norm": 0.3321791425328036, + "learning_rate": 7.469913450366061e-05, + "loss": 2.7791, + "step": 25902 + }, + { + "epoch": 1.2059734152757409, + "grad_norm": 0.321095074851932, + "learning_rate": 7.469677929754089e-05, + "loss": 2.7137, + "step": 25903 + }, + { + "epoch": 1.206019973461834, + "grad_norm": 0.3411988978798358, + "learning_rate": 7.469442401893856e-05, + "loss": 2.7565, + "step": 25904 + }, + { + "epoch": 1.2060665316479269, + "grad_norm": 0.3339157410448551, + "learning_rate": 7.469206866786052e-05, + "loss": 2.8304, + "step": 25905 + }, + { + "epoch": 1.20611308983402, + "grad_norm": 0.37182274957417843, + "learning_rate": 7.468971324431371e-05, + "loss": 2.8139, + "step": 25906 + }, + { + "epoch": 1.206159648020113, + "grad_norm": 0.351843820729408, + "learning_rate": 7.4687357748305e-05, + "loss": 2.9216, + "step": 25907 + }, + { + "epoch": 1.2062062062062062, + "grad_norm": 0.3525341378880799, + "learning_rate": 7.468500217984136e-05, + "loss": 2.8565, + "step": 25908 + }, + { + "epoch": 1.2062527643922993, + "grad_norm": 0.3597834552762465, + "learning_rate": 7.468264653892965e-05, + "loss": 2.7249, + "step": 25909 + }, + { + "epoch": 1.2062993225783925, + "grad_norm": 0.32153793522657287, + "learning_rate": 7.46802908255768e-05, + "loss": 2.8522, + "step": 25910 + }, + { + "epoch": 1.2063458807644853, + "grad_norm": 0.3582043584036917, + "learning_rate": 7.467793503978974e-05, + "loss": 2.7145, + "step": 25911 + }, + { + "epoch": 1.2063924389505785, + "grad_norm": 0.32259177865133837, + "learning_rate": 7.467557918157537e-05, + "loss": 2.835, + "step": 25912 + }, + { + "epoch": 1.2064389971366716, + "grad_norm": 0.3377103863735791, + "learning_rate": 7.46732232509406e-05, + "loss": 2.7737, + "step": 25913 + }, + { + "epoch": 1.2064855553227647, + "grad_norm": 0.3275029665569058, + "learning_rate": 7.467086724789234e-05, + "loss": 2.8205, + "step": 25914 + }, + { + "epoch": 1.2065321135088576, + "grad_norm": 0.35580845647025, + "learning_rate": 7.466851117243753e-05, + "loss": 2.7051, + "step": 25915 + }, + { + "epoch": 1.2065786716949507, + "grad_norm": 0.32440257205093714, + "learning_rate": 7.466615502458306e-05, + "loss": 2.7889, + "step": 25916 + }, + { + "epoch": 1.2066252298810438, + "grad_norm": 0.35296851279627905, + "learning_rate": 7.466379880433586e-05, + "loss": 2.7442, + "step": 25917 + }, + { + "epoch": 1.206671788067137, + "grad_norm": 0.32440215527701094, + "learning_rate": 7.466144251170285e-05, + "loss": 2.6908, + "step": 25918 + }, + { + "epoch": 1.20671834625323, + "grad_norm": 0.37529953223045304, + "learning_rate": 7.465908614669093e-05, + "loss": 2.9015, + "step": 25919 + }, + { + "epoch": 
1.2067649044393232, + "grad_norm": 0.29491786598022435, + "learning_rate": 7.465672970930702e-05, + "loss": 2.873, + "step": 25920 + }, + { + "epoch": 1.206811462625416, + "grad_norm": 0.36753496574628697, + "learning_rate": 7.465437319955804e-05, + "loss": 2.8182, + "step": 25921 + }, + { + "epoch": 1.2068580208115092, + "grad_norm": 0.34490111155716985, + "learning_rate": 7.465201661745091e-05, + "loss": 2.8592, + "step": 25922 + }, + { + "epoch": 1.2069045789976023, + "grad_norm": 0.3429786801039276, + "learning_rate": 7.464965996299252e-05, + "loss": 2.7758, + "step": 25923 + }, + { + "epoch": 1.2069511371836954, + "grad_norm": 0.3676739483719754, + "learning_rate": 7.464730323618981e-05, + "loss": 2.7459, + "step": 25924 + }, + { + "epoch": 1.2069976953697883, + "grad_norm": 0.33222593940629036, + "learning_rate": 7.46449464370497e-05, + "loss": 2.818, + "step": 25925 + }, + { + "epoch": 1.2070442535558814, + "grad_norm": 0.3628085165221836, + "learning_rate": 7.46425895655791e-05, + "loss": 2.7498, + "step": 25926 + }, + { + "epoch": 1.2070908117419745, + "grad_norm": 0.3169554116591772, + "learning_rate": 7.464023262178493e-05, + "loss": 2.7651, + "step": 25927 + }, + { + "epoch": 1.2071373699280676, + "grad_norm": 0.40031049709888933, + "learning_rate": 7.46378756056741e-05, + "loss": 2.8618, + "step": 25928 + }, + { + "epoch": 1.2071839281141608, + "grad_norm": 0.33197284123750137, + "learning_rate": 7.463551851725353e-05, + "loss": 2.8984, + "step": 25929 + }, + { + "epoch": 1.2072304863002536, + "grad_norm": 0.3565958126592716, + "learning_rate": 7.463316135653015e-05, + "loss": 2.8274, + "step": 25930 + }, + { + "epoch": 1.2072770444863468, + "grad_norm": 0.33741739102100465, + "learning_rate": 7.463080412351086e-05, + "loss": 2.7804, + "step": 25931 + }, + { + "epoch": 1.2073236026724399, + "grad_norm": 0.3700224609848764, + "learning_rate": 7.462844681820259e-05, + "loss": 2.7376, + "step": 25932 + }, + { + "epoch": 1.207370160858533, + "grad_norm": 0.3181601949826434, + "learning_rate": 7.462608944061225e-05, + "loss": 2.7561, + "step": 25933 + }, + { + "epoch": 1.207416719044626, + "grad_norm": 0.3457341052897369, + "learning_rate": 7.462373199074676e-05, + "loss": 2.6918, + "step": 25934 + }, + { + "epoch": 1.207463277230719, + "grad_norm": 0.33088963398883176, + "learning_rate": 7.462137446861306e-05, + "loss": 2.7942, + "step": 25935 + }, + { + "epoch": 1.2075098354168121, + "grad_norm": 0.33529707163745576, + "learning_rate": 7.461901687421804e-05, + "loss": 2.8803, + "step": 25936 + }, + { + "epoch": 1.2075563936029052, + "grad_norm": 0.3479502022868516, + "learning_rate": 7.461665920756864e-05, + "loss": 2.9682, + "step": 25937 + }, + { + "epoch": 1.2076029517889983, + "grad_norm": 0.3484987482298823, + "learning_rate": 7.461430146867175e-05, + "loss": 2.9117, + "step": 25938 + }, + { + "epoch": 1.2076495099750915, + "grad_norm": 0.31259010001471127, + "learning_rate": 7.461194365753432e-05, + "loss": 2.7121, + "step": 25939 + }, + { + "epoch": 1.2076960681611844, + "grad_norm": 0.3363081294875892, + "learning_rate": 7.460958577416327e-05, + "loss": 2.8119, + "step": 25940 + }, + { + "epoch": 1.2077426263472775, + "grad_norm": 0.3482709781424454, + "learning_rate": 7.460722781856551e-05, + "loss": 2.7942, + "step": 25941 + }, + { + "epoch": 1.2077891845333706, + "grad_norm": 0.30270486068620756, + "learning_rate": 7.460486979074795e-05, + "loss": 2.7748, + "step": 25942 + }, + { + "epoch": 1.2078357427194637, + "grad_norm": 0.3378900103269151, + "learning_rate": 
7.460251169071752e-05, + "loss": 2.8446, + "step": 25943 + }, + { + "epoch": 1.2078823009055568, + "grad_norm": 0.34646182082540145, + "learning_rate": 7.460015351848114e-05, + "loss": 2.7913, + "step": 25944 + }, + { + "epoch": 1.2079288590916497, + "grad_norm": 0.337142559011673, + "learning_rate": 7.459779527404576e-05, + "loss": 2.8943, + "step": 25945 + }, + { + "epoch": 1.2079754172777428, + "grad_norm": 0.3582430330029254, + "learning_rate": 7.459543695741825e-05, + "loss": 2.8444, + "step": 25946 + }, + { + "epoch": 1.208021975463836, + "grad_norm": 0.3116289938754884, + "learning_rate": 7.459307856860556e-05, + "loss": 2.8867, + "step": 25947 + }, + { + "epoch": 1.208068533649929, + "grad_norm": 0.34851018806163386, + "learning_rate": 7.45907201076146e-05, + "loss": 2.7152, + "step": 25948 + }, + { + "epoch": 1.2081150918360222, + "grad_norm": 0.3403257117864482, + "learning_rate": 7.45883615744523e-05, + "loss": 2.8904, + "step": 25949 + }, + { + "epoch": 1.208161650022115, + "grad_norm": 0.3469582695703105, + "learning_rate": 7.45860029691256e-05, + "loss": 2.7408, + "step": 25950 + }, + { + "epoch": 1.2082082082082082, + "grad_norm": 0.33741029600109507, + "learning_rate": 7.458364429164139e-05, + "loss": 2.8161, + "step": 25951 + }, + { + "epoch": 1.2082547663943013, + "grad_norm": 0.36230367143798564, + "learning_rate": 7.45812855420066e-05, + "loss": 2.7368, + "step": 25952 + }, + { + "epoch": 1.2083013245803944, + "grad_norm": 0.34164880564721783, + "learning_rate": 7.457892672022816e-05, + "loss": 2.8073, + "step": 25953 + }, + { + "epoch": 1.2083478827664873, + "grad_norm": 0.334324786008253, + "learning_rate": 7.4576567826313e-05, + "loss": 2.7817, + "step": 25954 + }, + { + "epoch": 1.2083944409525804, + "grad_norm": 0.3152710698643386, + "learning_rate": 7.457420886026802e-05, + "loss": 2.8246, + "step": 25955 + }, + { + "epoch": 1.2084409991386735, + "grad_norm": 0.33714092636223286, + "learning_rate": 7.457184982210017e-05, + "loss": 2.8375, + "step": 25956 + }, + { + "epoch": 1.2084875573247666, + "grad_norm": 0.3041233353244034, + "learning_rate": 7.456949071181635e-05, + "loss": 2.7269, + "step": 25957 + }, + { + "epoch": 1.2085341155108598, + "grad_norm": 0.35374185745627157, + "learning_rate": 7.456713152942354e-05, + "loss": 2.7143, + "step": 25958 + }, + { + "epoch": 1.2085806736969529, + "grad_norm": 0.3266745285138698, + "learning_rate": 7.456477227492858e-05, + "loss": 2.781, + "step": 25959 + }, + { + "epoch": 1.2086272318830458, + "grad_norm": 0.34191209080668555, + "learning_rate": 7.456241294833843e-05, + "loss": 2.8833, + "step": 25960 + }, + { + "epoch": 1.2086737900691389, + "grad_norm": 0.37614475807129066, + "learning_rate": 7.456005354966003e-05, + "loss": 2.9151, + "step": 25961 + }, + { + "epoch": 1.208720348255232, + "grad_norm": 0.34875510727451536, + "learning_rate": 7.455769407890028e-05, + "loss": 2.8514, + "step": 25962 + }, + { + "epoch": 1.2087669064413251, + "grad_norm": 0.3727736733203404, + "learning_rate": 7.455533453606613e-05, + "loss": 2.8278, + "step": 25963 + }, + { + "epoch": 1.208813464627418, + "grad_norm": 0.32391596748840285, + "learning_rate": 7.455297492116448e-05, + "loss": 2.8268, + "step": 25964 + }, + { + "epoch": 1.2088600228135111, + "grad_norm": 0.3442668109692425, + "learning_rate": 7.455061523420228e-05, + "loss": 2.7502, + "step": 25965 + }, + { + "epoch": 1.2089065809996042, + "grad_norm": 0.33990157807986227, + "learning_rate": 7.454825547518643e-05, + "loss": 2.8351, + "step": 25966 + }, + { + "epoch": 
1.2089531391856974, + "grad_norm": 0.34523372811812747, + "learning_rate": 7.454589564412389e-05, + "loss": 2.753, + "step": 25967 + }, + { + "epoch": 1.2089996973717905, + "grad_norm": 0.3417226831896319, + "learning_rate": 7.454353574102155e-05, + "loss": 2.7717, + "step": 25968 + }, + { + "epoch": 1.2090462555578836, + "grad_norm": 0.37320995411416635, + "learning_rate": 7.454117576588635e-05, + "loss": 2.8343, + "step": 25969 + }, + { + "epoch": 1.2090928137439765, + "grad_norm": 0.33790941826059245, + "learning_rate": 7.45388157187252e-05, + "loss": 2.7075, + "step": 25970 + }, + { + "epoch": 1.2091393719300696, + "grad_norm": 0.3386526662687349, + "learning_rate": 7.453645559954509e-05, + "loss": 2.8593, + "step": 25971 + }, + { + "epoch": 1.2091859301161627, + "grad_norm": 0.3861041321653118, + "learning_rate": 7.453409540835285e-05, + "loss": 2.8149, + "step": 25972 + }, + { + "epoch": 1.2092324883022558, + "grad_norm": 0.3612735253639783, + "learning_rate": 7.45317351451555e-05, + "loss": 2.823, + "step": 25973 + }, + { + "epoch": 1.2092790464883487, + "grad_norm": 0.3640377793529847, + "learning_rate": 7.45293748099599e-05, + "loss": 2.7991, + "step": 25974 + }, + { + "epoch": 1.2093256046744418, + "grad_norm": 0.35964585809735844, + "learning_rate": 7.4527014402773e-05, + "loss": 2.7301, + "step": 25975 + }, + { + "epoch": 1.209372162860535, + "grad_norm": 0.38034061546922365, + "learning_rate": 7.452465392360173e-05, + "loss": 2.7909, + "step": 25976 + }, + { + "epoch": 1.209418721046628, + "grad_norm": 0.32785755138919215, + "learning_rate": 7.452229337245303e-05, + "loss": 2.7891, + "step": 25977 + }, + { + "epoch": 1.2094652792327212, + "grad_norm": 0.4088743342344242, + "learning_rate": 7.45199327493338e-05, + "loss": 2.7607, + "step": 25978 + }, + { + "epoch": 1.209511837418814, + "grad_norm": 0.3787353469596656, + "learning_rate": 7.451757205425098e-05, + "loss": 2.7666, + "step": 25979 + }, + { + "epoch": 1.2095583956049072, + "grad_norm": 0.3935841712994926, + "learning_rate": 7.451521128721151e-05, + "loss": 2.776, + "step": 25980 + }, + { + "epoch": 1.2096049537910003, + "grad_norm": 0.34596010967342644, + "learning_rate": 7.451285044822231e-05, + "loss": 2.8289, + "step": 25981 + }, + { + "epoch": 1.2096515119770934, + "grad_norm": 0.4141621324546151, + "learning_rate": 7.451048953729032e-05, + "loss": 2.7703, + "step": 25982 + }, + { + "epoch": 1.2096980701631865, + "grad_norm": 0.32679919868939467, + "learning_rate": 7.450812855442244e-05, + "loss": 2.812, + "step": 25983 + }, + { + "epoch": 1.2097446283492794, + "grad_norm": 0.3317580905272218, + "learning_rate": 7.450576749962563e-05, + "loss": 2.7384, + "step": 25984 + }, + { + "epoch": 1.2097911865353725, + "grad_norm": 0.3293244944992527, + "learning_rate": 7.450340637290679e-05, + "loss": 2.7219, + "step": 25985 + }, + { + "epoch": 1.2098377447214657, + "grad_norm": 0.36198901900109415, + "learning_rate": 7.450104517427289e-05, + "loss": 2.9197, + "step": 25986 + }, + { + "epoch": 1.2098843029075588, + "grad_norm": 0.32775231538253463, + "learning_rate": 7.449868390373081e-05, + "loss": 2.916, + "step": 25987 + }, + { + "epoch": 1.2099308610936519, + "grad_norm": 0.37982965858704143, + "learning_rate": 7.449632256128752e-05, + "loss": 2.7792, + "step": 25988 + }, + { + "epoch": 1.2099774192797448, + "grad_norm": 0.3350977021384222, + "learning_rate": 7.449396114694993e-05, + "loss": 2.8546, + "step": 25989 + }, + { + "epoch": 1.210023977465838, + "grad_norm": 0.35362877427962636, + "learning_rate": 
7.4491599660725e-05, + "loss": 2.7086, + "step": 25990 + }, + { + "epoch": 1.210070535651931, + "grad_norm": 0.3521526650534156, + "learning_rate": 7.448923810261961e-05, + "loss": 2.898, + "step": 25991 + }, + { + "epoch": 1.2101170938380241, + "grad_norm": 0.3700402846714625, + "learning_rate": 7.448687647264073e-05, + "loss": 2.7663, + "step": 25992 + }, + { + "epoch": 1.210163652024117, + "grad_norm": 0.31500764270067033, + "learning_rate": 7.448451477079527e-05, + "loss": 2.6293, + "step": 25993 + }, + { + "epoch": 1.2102102102102101, + "grad_norm": 0.32821696413491813, + "learning_rate": 7.448215299709019e-05, + "loss": 2.782, + "step": 25994 + }, + { + "epoch": 1.2102567683963033, + "grad_norm": 0.3564823368774807, + "learning_rate": 7.447979115153239e-05, + "loss": 2.744, + "step": 25995 + }, + { + "epoch": 1.2103033265823964, + "grad_norm": 0.3604966478044568, + "learning_rate": 7.447742923412882e-05, + "loss": 2.8535, + "step": 25996 + }, + { + "epoch": 1.2103498847684895, + "grad_norm": 0.3901349018568981, + "learning_rate": 7.447506724488641e-05, + "loss": 2.8331, + "step": 25997 + }, + { + "epoch": 1.2103964429545826, + "grad_norm": 0.37089453485740037, + "learning_rate": 7.447270518381208e-05, + "loss": 2.7892, + "step": 25998 + }, + { + "epoch": 1.2104430011406755, + "grad_norm": 0.431119325319127, + "learning_rate": 7.447034305091279e-05, + "loss": 2.8133, + "step": 25999 + }, + { + "epoch": 1.2104895593267686, + "grad_norm": 0.33773519131777036, + "learning_rate": 7.446798084619543e-05, + "loss": 2.838, + "step": 26000 + }, + { + "epoch": 1.2105361175128617, + "grad_norm": 0.3629963747920485, + "learning_rate": 7.446561856966696e-05, + "loss": 2.9212, + "step": 26001 + }, + { + "epoch": 1.2105826756989548, + "grad_norm": 0.3347308959201073, + "learning_rate": 7.44632562213343e-05, + "loss": 2.7815, + "step": 26002 + }, + { + "epoch": 1.2106292338850477, + "grad_norm": 0.3689337637959461, + "learning_rate": 7.446089380120441e-05, + "loss": 2.7626, + "step": 26003 + }, + { + "epoch": 1.2106757920711408, + "grad_norm": 0.3232573017848272, + "learning_rate": 7.445853130928422e-05, + "loss": 2.7227, + "step": 26004 + }, + { + "epoch": 1.210722350257234, + "grad_norm": 0.3607326609666796, + "learning_rate": 7.445616874558063e-05, + "loss": 2.8847, + "step": 26005 + }, + { + "epoch": 1.210768908443327, + "grad_norm": 0.3642490532257769, + "learning_rate": 7.44538061101006e-05, + "loss": 2.8302, + "step": 26006 + }, + { + "epoch": 1.2108154666294202, + "grad_norm": 0.3402788023643176, + "learning_rate": 7.445144340285104e-05, + "loss": 2.7913, + "step": 26007 + }, + { + "epoch": 1.2108620248155133, + "grad_norm": 0.35824772176152275, + "learning_rate": 7.444908062383894e-05, + "loss": 2.7663, + "step": 26008 + }, + { + "epoch": 1.2109085830016062, + "grad_norm": 0.32899346679873315, + "learning_rate": 7.444671777307118e-05, + "loss": 2.754, + "step": 26009 + }, + { + "epoch": 1.2109551411876993, + "grad_norm": 0.3759849886974961, + "learning_rate": 7.44443548505547e-05, + "loss": 2.7747, + "step": 26010 + }, + { + "epoch": 1.2110016993737924, + "grad_norm": 0.32900238607359283, + "learning_rate": 7.444199185629645e-05, + "loss": 2.9076, + "step": 26011 + }, + { + "epoch": 1.2110482575598855, + "grad_norm": 0.34298719323494176, + "learning_rate": 7.443962879030338e-05, + "loss": 2.6619, + "step": 26012 + }, + { + "epoch": 1.2110948157459784, + "grad_norm": 0.32690592944656366, + "learning_rate": 7.443726565258239e-05, + "loss": 2.7181, + "step": 26013 + }, + { + "epoch": 
1.2111413739320716, + "grad_norm": 0.33537134803695384, + "learning_rate": 7.443490244314042e-05, + "loss": 2.9006, + "step": 26014 + }, + { + "epoch": 1.2111879321181647, + "grad_norm": 0.3183554684358193, + "learning_rate": 7.443253916198445e-05, + "loss": 2.6143, + "step": 26015 + }, + { + "epoch": 1.2112344903042578, + "grad_norm": 0.32297777562350166, + "learning_rate": 7.443017580912136e-05, + "loss": 2.787, + "step": 26016 + }, + { + "epoch": 1.211281048490351, + "grad_norm": 0.3082894225068658, + "learning_rate": 7.442781238455813e-05, + "loss": 2.808, + "step": 26017 + }, + { + "epoch": 1.2113276066764438, + "grad_norm": 0.3470270049429812, + "learning_rate": 7.442544888830166e-05, + "loss": 2.764, + "step": 26018 + }, + { + "epoch": 1.211374164862537, + "grad_norm": 0.3199458351501264, + "learning_rate": 7.442308532035891e-05, + "loss": 2.7423, + "step": 26019 + }, + { + "epoch": 1.21142072304863, + "grad_norm": 0.32621948772742554, + "learning_rate": 7.44207216807368e-05, + "loss": 2.7671, + "step": 26020 + }, + { + "epoch": 1.2114672812347231, + "grad_norm": 0.34036738120317556, + "learning_rate": 7.44183579694423e-05, + "loss": 2.8064, + "step": 26021 + }, + { + "epoch": 1.2115138394208163, + "grad_norm": 0.34756151225301907, + "learning_rate": 7.441599418648232e-05, + "loss": 2.8843, + "step": 26022 + }, + { + "epoch": 1.2115603976069091, + "grad_norm": 0.34507259540072494, + "learning_rate": 7.44136303318638e-05, + "loss": 2.6801, + "step": 26023 + }, + { + "epoch": 1.2116069557930023, + "grad_norm": 0.3083182220774025, + "learning_rate": 7.441126640559367e-05, + "loss": 2.7335, + "step": 26024 + }, + { + "epoch": 1.2116535139790954, + "grad_norm": 0.34012321467719925, + "learning_rate": 7.440890240767888e-05, + "loss": 2.8605, + "step": 26025 + }, + { + "epoch": 1.2117000721651885, + "grad_norm": 0.29080843363265163, + "learning_rate": 7.440653833812639e-05, + "loss": 2.7656, + "step": 26026 + }, + { + "epoch": 1.2117466303512816, + "grad_norm": 0.31265593626054017, + "learning_rate": 7.440417419694309e-05, + "loss": 2.786, + "step": 26027 + }, + { + "epoch": 1.2117931885373745, + "grad_norm": 0.2996390354446513, + "learning_rate": 7.440180998413595e-05, + "loss": 2.8302, + "step": 26028 + }, + { + "epoch": 1.2118397467234676, + "grad_norm": 0.37414505597284486, + "learning_rate": 7.43994456997119e-05, + "loss": 2.7033, + "step": 26029 + }, + { + "epoch": 1.2118863049095607, + "grad_norm": 0.29418842808913603, + "learning_rate": 7.439708134367786e-05, + "loss": 2.7269, + "step": 26030 + }, + { + "epoch": 1.2119328630956538, + "grad_norm": 0.36884018451696104, + "learning_rate": 7.439471691604083e-05, + "loss": 2.7209, + "step": 26031 + }, + { + "epoch": 1.2119794212817467, + "grad_norm": 0.330814211028287, + "learning_rate": 7.439235241680769e-05, + "loss": 2.7992, + "step": 26032 + }, + { + "epoch": 1.2120259794678399, + "grad_norm": 0.34559135860115925, + "learning_rate": 7.438998784598539e-05, + "loss": 2.7578, + "step": 26033 + }, + { + "epoch": 1.212072537653933, + "grad_norm": 0.33172676712999105, + "learning_rate": 7.438762320358088e-05, + "loss": 2.8549, + "step": 26034 + }, + { + "epoch": 1.212119095840026, + "grad_norm": 0.3340854996659901, + "learning_rate": 7.438525848960112e-05, + "loss": 2.9209, + "step": 26035 + }, + { + "epoch": 1.2121656540261192, + "grad_norm": 0.3117620200629832, + "learning_rate": 7.438289370405303e-05, + "loss": 2.9077, + "step": 26036 + }, + { + "epoch": 1.2122122122122123, + "grad_norm": 0.31853818734083794, + "learning_rate": 
7.438052884694352e-05, + "loss": 2.7902, + "step": 26037 + }, + { + "epoch": 1.2122587703983052, + "grad_norm": 0.3268167720225167, + "learning_rate": 7.437816391827957e-05, + "loss": 2.6782, + "step": 26038 + }, + { + "epoch": 1.2123053285843983, + "grad_norm": 0.3074386358501483, + "learning_rate": 7.437579891806811e-05, + "loss": 2.7773, + "step": 26039 + }, + { + "epoch": 1.2123518867704914, + "grad_norm": 0.3444157705508517, + "learning_rate": 7.437343384631609e-05, + "loss": 2.7229, + "step": 26040 + }, + { + "epoch": 1.2123984449565846, + "grad_norm": 0.3304149238334242, + "learning_rate": 7.437106870303043e-05, + "loss": 2.8556, + "step": 26041 + }, + { + "epoch": 1.2124450031426774, + "grad_norm": 0.3539049927053086, + "learning_rate": 7.43687034882181e-05, + "loss": 2.8281, + "step": 26042 + }, + { + "epoch": 1.2124915613287706, + "grad_norm": 0.31749449910814026, + "learning_rate": 7.436633820188601e-05, + "loss": 2.9003, + "step": 26043 + }, + { + "epoch": 1.2125381195148637, + "grad_norm": 0.351391979629593, + "learning_rate": 7.436397284404113e-05, + "loss": 2.8086, + "step": 26044 + }, + { + "epoch": 1.2125846777009568, + "grad_norm": 0.3396688353364477, + "learning_rate": 7.436160741469039e-05, + "loss": 2.7459, + "step": 26045 + }, + { + "epoch": 1.21263123588705, + "grad_norm": 0.34732160331678247, + "learning_rate": 7.435924191384072e-05, + "loss": 2.8247, + "step": 26046 + }, + { + "epoch": 1.212677794073143, + "grad_norm": 0.3592024241756694, + "learning_rate": 7.435687634149909e-05, + "loss": 2.8848, + "step": 26047 + }, + { + "epoch": 1.212724352259236, + "grad_norm": 0.3377903676418528, + "learning_rate": 7.435451069767242e-05, + "loss": 2.8489, + "step": 26048 + }, + { + "epoch": 1.212770910445329, + "grad_norm": 0.33373624468257346, + "learning_rate": 7.435214498236766e-05, + "loss": 2.8718, + "step": 26049 + }, + { + "epoch": 1.2128174686314221, + "grad_norm": 0.38363880306489917, + "learning_rate": 7.434977919559176e-05, + "loss": 2.7987, + "step": 26050 + }, + { + "epoch": 1.2128640268175153, + "grad_norm": 0.33493636909786323, + "learning_rate": 7.434741333735165e-05, + "loss": 2.8179, + "step": 26051 + }, + { + "epoch": 1.2129105850036082, + "grad_norm": 0.35549786724850563, + "learning_rate": 7.434504740765428e-05, + "loss": 2.7222, + "step": 26052 + }, + { + "epoch": 1.2129571431897013, + "grad_norm": 0.3378965742850521, + "learning_rate": 7.434268140650659e-05, + "loss": 2.781, + "step": 26053 + }, + { + "epoch": 1.2130037013757944, + "grad_norm": 0.3450747965175028, + "learning_rate": 7.434031533391555e-05, + "loss": 2.6747, + "step": 26054 + }, + { + "epoch": 1.2130502595618875, + "grad_norm": 0.3449638138631965, + "learning_rate": 7.433794918988806e-05, + "loss": 2.7859, + "step": 26055 + }, + { + "epoch": 1.2130968177479806, + "grad_norm": 0.367506085686798, + "learning_rate": 7.433558297443108e-05, + "loss": 2.8914, + "step": 26056 + }, + { + "epoch": 1.2131433759340737, + "grad_norm": 0.31343698969875133, + "learning_rate": 7.433321668755158e-05, + "loss": 2.8282, + "step": 26057 + }, + { + "epoch": 1.2131899341201666, + "grad_norm": 0.3370257257271691, + "learning_rate": 7.433085032925649e-05, + "loss": 2.7181, + "step": 26058 + }, + { + "epoch": 1.2132364923062597, + "grad_norm": 0.32136407779094933, + "learning_rate": 7.432848389955274e-05, + "loss": 2.8106, + "step": 26059 + }, + { + "epoch": 1.2132830504923529, + "grad_norm": 0.30699985639276767, + "learning_rate": 7.432611739844728e-05, + "loss": 2.7415, + "step": 26060 + }, + { + "epoch": 
1.213329608678446, + "grad_norm": 0.32367461074940856, + "learning_rate": 7.432375082594708e-05, + "loss": 2.7463, + "step": 26061 + }, + { + "epoch": 1.2133761668645389, + "grad_norm": 0.31125211297634564, + "learning_rate": 7.432138418205907e-05, + "loss": 2.8092, + "step": 26062 + }, + { + "epoch": 1.213422725050632, + "grad_norm": 0.31986831373404223, + "learning_rate": 7.431901746679017e-05, + "loss": 2.8151, + "step": 26063 + }, + { + "epoch": 1.213469283236725, + "grad_norm": 0.3384010045740546, + "learning_rate": 7.431665068014737e-05, + "loss": 2.7611, + "step": 26064 + }, + { + "epoch": 1.2135158414228182, + "grad_norm": 0.32530143371120024, + "learning_rate": 7.431428382213759e-05, + "loss": 2.7294, + "step": 26065 + }, + { + "epoch": 1.2135623996089113, + "grad_norm": 0.34252131123568236, + "learning_rate": 7.431191689276778e-05, + "loss": 2.8743, + "step": 26066 + }, + { + "epoch": 1.2136089577950042, + "grad_norm": 0.34855622334327113, + "learning_rate": 7.430954989204489e-05, + "loss": 2.894, + "step": 26067 + }, + { + "epoch": 1.2136555159810973, + "grad_norm": 0.3617670470326081, + "learning_rate": 7.430718281997587e-05, + "loss": 2.8263, + "step": 26068 + }, + { + "epoch": 1.2137020741671904, + "grad_norm": 0.343333637914538, + "learning_rate": 7.430481567656764e-05, + "loss": 2.8098, + "step": 26069 + }, + { + "epoch": 1.2137486323532836, + "grad_norm": 0.32891255488594234, + "learning_rate": 7.430244846182718e-05, + "loss": 2.8616, + "step": 26070 + }, + { + "epoch": 1.2137951905393767, + "grad_norm": 0.3790355207666316, + "learning_rate": 7.430008117576144e-05, + "loss": 2.8947, + "step": 26071 + }, + { + "epoch": 1.2138417487254696, + "grad_norm": 0.3828998567225358, + "learning_rate": 7.429771381837736e-05, + "loss": 2.8801, + "step": 26072 + }, + { + "epoch": 1.2138883069115627, + "grad_norm": 0.33775096486597744, + "learning_rate": 7.429534638968188e-05, + "loss": 2.7096, + "step": 26073 + }, + { + "epoch": 1.2139348650976558, + "grad_norm": 0.35798822042035405, + "learning_rate": 7.429297888968196e-05, + "loss": 2.8912, + "step": 26074 + }, + { + "epoch": 1.213981423283749, + "grad_norm": 0.3568212352413708, + "learning_rate": 7.429061131838451e-05, + "loss": 2.9127, + "step": 26075 + }, + { + "epoch": 1.214027981469842, + "grad_norm": 0.3662695445034252, + "learning_rate": 7.428824367579655e-05, + "loss": 2.7844, + "step": 26076 + }, + { + "epoch": 1.214074539655935, + "grad_norm": 0.33138242197329487, + "learning_rate": 7.428587596192495e-05, + "loss": 2.7679, + "step": 26077 + }, + { + "epoch": 1.214121097842028, + "grad_norm": 0.34437903117393937, + "learning_rate": 7.428350817677671e-05, + "loss": 2.7278, + "step": 26078 + }, + { + "epoch": 1.2141676560281212, + "grad_norm": 0.3475279733104242, + "learning_rate": 7.428114032035876e-05, + "loss": 2.8851, + "step": 26079 + }, + { + "epoch": 1.2142142142142143, + "grad_norm": 0.3270393788325498, + "learning_rate": 7.427877239267807e-05, + "loss": 2.7347, + "step": 26080 + }, + { + "epoch": 1.2142607724003072, + "grad_norm": 0.34857993266892046, + "learning_rate": 7.427640439374156e-05, + "loss": 2.8339, + "step": 26081 + }, + { + "epoch": 1.2143073305864003, + "grad_norm": 0.31026758796255943, + "learning_rate": 7.427403632355621e-05, + "loss": 2.8127, + "step": 26082 + }, + { + "epoch": 1.2143538887724934, + "grad_norm": 0.3057353987400984, + "learning_rate": 7.427166818212894e-05, + "loss": 2.8269, + "step": 26083 + }, + { + "epoch": 1.2144004469585865, + "grad_norm": 0.3343338269920998, + "learning_rate": 
7.426929996946671e-05, + "loss": 2.7787, + "step": 26084 + }, + { + "epoch": 1.2144470051446796, + "grad_norm": 0.2943276330810824, + "learning_rate": 7.42669316855765e-05, + "loss": 2.7877, + "step": 26085 + }, + { + "epoch": 1.2144935633307727, + "grad_norm": 0.29376151829377783, + "learning_rate": 7.426456333046522e-05, + "loss": 2.7809, + "step": 26086 + }, + { + "epoch": 1.2145401215168656, + "grad_norm": 0.32756278435481745, + "learning_rate": 7.426219490413985e-05, + "loss": 2.9511, + "step": 26087 + }, + { + "epoch": 1.2145866797029587, + "grad_norm": 0.32830796704657156, + "learning_rate": 7.425982640660731e-05, + "loss": 2.7386, + "step": 26088 + }, + { + "epoch": 1.2146332378890519, + "grad_norm": 0.3340248675324639, + "learning_rate": 7.425745783787459e-05, + "loss": 2.7536, + "step": 26089 + }, + { + "epoch": 1.214679796075145, + "grad_norm": 0.32926376241771826, + "learning_rate": 7.42550891979486e-05, + "loss": 2.8585, + "step": 26090 + }, + { + "epoch": 1.2147263542612379, + "grad_norm": 0.30629520322956416, + "learning_rate": 7.425272048683631e-05, + "loss": 2.847, + "step": 26091 + }, + { + "epoch": 1.214772912447331, + "grad_norm": 0.3261527794220713, + "learning_rate": 7.42503517045447e-05, + "loss": 2.7915, + "step": 26092 + }, + { + "epoch": 1.214819470633424, + "grad_norm": 0.3088449833481041, + "learning_rate": 7.424798285108068e-05, + "loss": 2.877, + "step": 26093 + }, + { + "epoch": 1.2148660288195172, + "grad_norm": 0.29080295798079087, + "learning_rate": 7.424561392645121e-05, + "loss": 2.7409, + "step": 26094 + }, + { + "epoch": 1.2149125870056103, + "grad_norm": 0.32094503711610534, + "learning_rate": 7.424324493066326e-05, + "loss": 2.738, + "step": 26095 + }, + { + "epoch": 1.2149591451917034, + "grad_norm": 0.3008408063970535, + "learning_rate": 7.424087586372379e-05, + "loss": 2.8165, + "step": 26096 + }, + { + "epoch": 1.2150057033777963, + "grad_norm": 0.31636285932462244, + "learning_rate": 7.423850672563972e-05, + "loss": 2.7916, + "step": 26097 + }, + { + "epoch": 1.2150522615638895, + "grad_norm": 0.31554168102841307, + "learning_rate": 7.423613751641804e-05, + "loss": 2.8911, + "step": 26098 + }, + { + "epoch": 1.2150988197499826, + "grad_norm": 0.3310411142653752, + "learning_rate": 7.423376823606566e-05, + "loss": 2.7757, + "step": 26099 + }, + { + "epoch": 1.2151453779360757, + "grad_norm": 0.34998444857778777, + "learning_rate": 7.423139888458957e-05, + "loss": 2.88, + "step": 26100 + }, + { + "epoch": 1.2151919361221686, + "grad_norm": 0.3234774840887119, + "learning_rate": 7.422902946199671e-05, + "loss": 2.7952, + "step": 26101 + }, + { + "epoch": 1.2152384943082617, + "grad_norm": 0.3585429302613099, + "learning_rate": 7.422665996829403e-05, + "loss": 2.8216, + "step": 26102 + }, + { + "epoch": 1.2152850524943548, + "grad_norm": 0.34163453525630294, + "learning_rate": 7.42242904034885e-05, + "loss": 2.7797, + "step": 26103 + }, + { + "epoch": 1.215331610680448, + "grad_norm": 0.35242017020828165, + "learning_rate": 7.422192076758706e-05, + "loss": 3.0137, + "step": 26104 + }, + { + "epoch": 1.215378168866541, + "grad_norm": 0.32828090668029275, + "learning_rate": 7.421955106059667e-05, + "loss": 2.7264, + "step": 26105 + }, + { + "epoch": 1.215424727052634, + "grad_norm": 0.3283831510285009, + "learning_rate": 7.421718128252428e-05, + "loss": 2.7687, + "step": 26106 + }, + { + "epoch": 1.215471285238727, + "grad_norm": 0.3132155320414748, + "learning_rate": 7.421481143337685e-05, + "loss": 2.7259, + "step": 26107 + }, + { + "epoch": 
1.2155178434248202, + "grad_norm": 0.3053627879537246, + "learning_rate": 7.421244151316133e-05, + "loss": 2.7624, + "step": 26108 + }, + { + "epoch": 1.2155644016109133, + "grad_norm": 0.28652275143511774, + "learning_rate": 7.421007152188468e-05, + "loss": 2.7576, + "step": 26109 + }, + { + "epoch": 1.2156109597970064, + "grad_norm": 0.34981497376526405, + "learning_rate": 7.420770145955387e-05, + "loss": 2.6915, + "step": 26110 + }, + { + "epoch": 1.2156575179830993, + "grad_norm": 0.28117939554222143, + "learning_rate": 7.420533132617583e-05, + "loss": 2.779, + "step": 26111 + }, + { + "epoch": 1.2157040761691924, + "grad_norm": 0.31748098692181, + "learning_rate": 7.420296112175753e-05, + "loss": 2.7528, + "step": 26112 + }, + { + "epoch": 1.2157506343552855, + "grad_norm": 0.3101391669045503, + "learning_rate": 7.420059084630592e-05, + "loss": 2.7696, + "step": 26113 + }, + { + "epoch": 1.2157971925413786, + "grad_norm": 0.3168226802809587, + "learning_rate": 7.419822049982796e-05, + "loss": 2.8562, + "step": 26114 + }, + { + "epoch": 1.2158437507274718, + "grad_norm": 0.30437957652343955, + "learning_rate": 7.419585008233061e-05, + "loss": 2.6062, + "step": 26115 + }, + { + "epoch": 1.2158903089135646, + "grad_norm": 0.32186779884173905, + "learning_rate": 7.419347959382082e-05, + "loss": 2.8425, + "step": 26116 + }, + { + "epoch": 1.2159368670996578, + "grad_norm": 0.31780614723024075, + "learning_rate": 7.419110903430555e-05, + "loss": 2.7501, + "step": 26117 + }, + { + "epoch": 1.2159834252857509, + "grad_norm": 0.3286203925954782, + "learning_rate": 7.418873840379175e-05, + "loss": 2.7469, + "step": 26118 + }, + { + "epoch": 1.216029983471844, + "grad_norm": 0.3397420236844833, + "learning_rate": 7.41863677022864e-05, + "loss": 2.7088, + "step": 26119 + }, + { + "epoch": 1.2160765416579369, + "grad_norm": 0.317140335371947, + "learning_rate": 7.418399692979644e-05, + "loss": 2.7342, + "step": 26120 + }, + { + "epoch": 1.21612309984403, + "grad_norm": 0.3219467071099622, + "learning_rate": 7.418162608632883e-05, + "loss": 2.8226, + "step": 26121 + }, + { + "epoch": 1.2161696580301231, + "grad_norm": 0.3378524018594423, + "learning_rate": 7.417925517189053e-05, + "loss": 2.8102, + "step": 26122 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.3057968045366728, + "learning_rate": 7.417688418648849e-05, + "loss": 2.816, + "step": 26123 + }, + { + "epoch": 1.2162627744023093, + "grad_norm": 0.34929028497995257, + "learning_rate": 7.417451313012971e-05, + "loss": 2.693, + "step": 26124 + }, + { + "epoch": 1.2163093325884025, + "grad_norm": 0.32726576483980235, + "learning_rate": 7.417214200282108e-05, + "loss": 2.874, + "step": 26125 + }, + { + "epoch": 1.2163558907744954, + "grad_norm": 0.3649284838064497, + "learning_rate": 7.416977080456961e-05, + "loss": 2.9, + "step": 26126 + }, + { + "epoch": 1.2164024489605885, + "grad_norm": 0.36165344740174576, + "learning_rate": 7.416739953538225e-05, + "loss": 2.7251, + "step": 26127 + }, + { + "epoch": 1.2164490071466816, + "grad_norm": 0.3176587925430269, + "learning_rate": 7.416502819526592e-05, + "loss": 2.844, + "step": 26128 + }, + { + "epoch": 1.2164955653327747, + "grad_norm": 0.3135193965661166, + "learning_rate": 7.416265678422764e-05, + "loss": 2.8121, + "step": 26129 + }, + { + "epoch": 1.2165421235188676, + "grad_norm": 0.34717566987696313, + "learning_rate": 7.416028530227434e-05, + "loss": 2.8091, + "step": 26130 + }, + { + "epoch": 1.2165886817049607, + "grad_norm": 0.3037093653082209, + "learning_rate": 
7.415791374941298e-05, + "loss": 2.7887, + "step": 26131 + }, + { + "epoch": 1.2166352398910538, + "grad_norm": 0.338429359688284, + "learning_rate": 7.415554212565052e-05, + "loss": 2.7399, + "step": 26132 + }, + { + "epoch": 1.216681798077147, + "grad_norm": 0.34715837209840145, + "learning_rate": 7.415317043099394e-05, + "loss": 2.7662, + "step": 26133 + }, + { + "epoch": 1.21672835626324, + "grad_norm": 0.3074793484864675, + "learning_rate": 7.415079866545017e-05, + "loss": 2.6845, + "step": 26134 + }, + { + "epoch": 1.2167749144493332, + "grad_norm": 0.331005285592979, + "learning_rate": 7.414842682902618e-05, + "loss": 2.8806, + "step": 26135 + }, + { + "epoch": 1.216821472635426, + "grad_norm": 0.35516765394518957, + "learning_rate": 7.414605492172895e-05, + "loss": 2.7489, + "step": 26136 + }, + { + "epoch": 1.2168680308215192, + "grad_norm": 0.3306767646823082, + "learning_rate": 7.414368294356541e-05, + "loss": 2.7411, + "step": 26137 + }, + { + "epoch": 1.2169145890076123, + "grad_norm": 0.34199296655159606, + "learning_rate": 7.414131089454255e-05, + "loss": 2.8685, + "step": 26138 + }, + { + "epoch": 1.2169611471937054, + "grad_norm": 0.3515414998511876, + "learning_rate": 7.413893877466734e-05, + "loss": 2.7783, + "step": 26139 + }, + { + "epoch": 1.2170077053797983, + "grad_norm": 0.3544074294473454, + "learning_rate": 7.41365665839467e-05, + "loss": 2.8602, + "step": 26140 + }, + { + "epoch": 1.2170542635658914, + "grad_norm": 0.35628995894178783, + "learning_rate": 7.413419432238761e-05, + "loss": 2.7681, + "step": 26141 + }, + { + "epoch": 1.2171008217519845, + "grad_norm": 0.3669985562313594, + "learning_rate": 7.413182198999705e-05, + "loss": 2.7061, + "step": 26142 + }, + { + "epoch": 1.2171473799380776, + "grad_norm": 0.32363271712398195, + "learning_rate": 7.412944958678197e-05, + "loss": 2.7504, + "step": 26143 + }, + { + "epoch": 1.2171939381241708, + "grad_norm": 0.3914485651534967, + "learning_rate": 7.412707711274934e-05, + "loss": 2.7671, + "step": 26144 + }, + { + "epoch": 1.2172404963102639, + "grad_norm": 0.33298412303269387, + "learning_rate": 7.412470456790609e-05, + "loss": 2.7883, + "step": 26145 + }, + { + "epoch": 1.2172870544963568, + "grad_norm": 0.43172488263339426, + "learning_rate": 7.412233195225921e-05, + "loss": 2.8244, + "step": 26146 + }, + { + "epoch": 1.2173336126824499, + "grad_norm": 0.3372645593489396, + "learning_rate": 7.411995926581568e-05, + "loss": 2.7547, + "step": 26147 + }, + { + "epoch": 1.217380170868543, + "grad_norm": 0.3696567010027468, + "learning_rate": 7.411758650858245e-05, + "loss": 2.8069, + "step": 26148 + }, + { + "epoch": 1.2174267290546361, + "grad_norm": 0.34773740044447715, + "learning_rate": 7.411521368056646e-05, + "loss": 2.712, + "step": 26149 + }, + { + "epoch": 1.217473287240729, + "grad_norm": 0.3755624143785627, + "learning_rate": 7.411284078177471e-05, + "loss": 2.7988, + "step": 26150 + }, + { + "epoch": 1.2175198454268221, + "grad_norm": 0.35693116040787015, + "learning_rate": 7.411046781221414e-05, + "loss": 2.683, + "step": 26151 + }, + { + "epoch": 1.2175664036129152, + "grad_norm": 0.3604516223659428, + "learning_rate": 7.410809477189174e-05, + "loss": 2.839, + "step": 26152 + }, + { + "epoch": 1.2176129617990084, + "grad_norm": 0.36637567792397685, + "learning_rate": 7.410572166081444e-05, + "loss": 2.6551, + "step": 26153 + }, + { + "epoch": 1.2176595199851015, + "grad_norm": 0.31767396921981106, + "learning_rate": 7.410334847898921e-05, + "loss": 2.7777, + "step": 26154 + }, + { + "epoch": 
1.2177060781711944, + "grad_norm": 0.35486592132610656, + "learning_rate": 7.410097522642304e-05, + "loss": 2.7387, + "step": 26155 + }, + { + "epoch": 1.2177526363572875, + "grad_norm": 0.33646091855094745, + "learning_rate": 7.409860190312288e-05, + "loss": 2.7953, + "step": 26156 + }, + { + "epoch": 1.2177991945433806, + "grad_norm": 0.34320255059389804, + "learning_rate": 7.40962285090957e-05, + "loss": 2.7794, + "step": 26157 + }, + { + "epoch": 1.2178457527294737, + "grad_norm": 0.36852362676716566, + "learning_rate": 7.409385504434847e-05, + "loss": 2.7336, + "step": 26158 + }, + { + "epoch": 1.2178923109155668, + "grad_norm": 0.34822579734121606, + "learning_rate": 7.409148150888813e-05, + "loss": 2.7648, + "step": 26159 + }, + { + "epoch": 1.2179388691016597, + "grad_norm": 0.3328156865526548, + "learning_rate": 7.408910790272167e-05, + "loss": 2.7812, + "step": 26160 + }, + { + "epoch": 1.2179854272877528, + "grad_norm": 0.36667381307421315, + "learning_rate": 7.408673422585607e-05, + "loss": 2.8182, + "step": 26161 + }, + { + "epoch": 1.218031985473846, + "grad_norm": 0.32874598478643685, + "learning_rate": 7.408436047829827e-05, + "loss": 2.7785, + "step": 26162 + }, + { + "epoch": 1.218078543659939, + "grad_norm": 0.3559498648101655, + "learning_rate": 7.408198666005524e-05, + "loss": 2.8693, + "step": 26163 + }, + { + "epoch": 1.2181251018460322, + "grad_norm": 0.3648639066262862, + "learning_rate": 7.407961277113396e-05, + "loss": 2.7095, + "step": 26164 + }, + { + "epoch": 1.218171660032125, + "grad_norm": 0.37355134136691703, + "learning_rate": 7.407723881154137e-05, + "loss": 2.756, + "step": 26165 + }, + { + "epoch": 1.2182182182182182, + "grad_norm": 0.30660077616571957, + "learning_rate": 7.407486478128447e-05, + "loss": 2.662, + "step": 26166 + }, + { + "epoch": 1.2182647764043113, + "grad_norm": 0.33391755153130603, + "learning_rate": 7.407249068037021e-05, + "loss": 2.8395, + "step": 26167 + }, + { + "epoch": 1.2183113345904044, + "grad_norm": 0.34079292691448076, + "learning_rate": 7.407011650880556e-05, + "loss": 2.7411, + "step": 26168 + }, + { + "epoch": 1.2183578927764973, + "grad_norm": 0.32818207819612466, + "learning_rate": 7.406774226659749e-05, + "loss": 2.7744, + "step": 26169 + }, + { + "epoch": 1.2184044509625904, + "grad_norm": 0.3541082695684275, + "learning_rate": 7.406536795375297e-05, + "loss": 2.7978, + "step": 26170 + }, + { + "epoch": 1.2184510091486835, + "grad_norm": 0.3251518439491753, + "learning_rate": 7.406299357027897e-05, + "loss": 2.6124, + "step": 26171 + }, + { + "epoch": 1.2184975673347767, + "grad_norm": 0.33129861617068723, + "learning_rate": 7.406061911618245e-05, + "loss": 2.8103, + "step": 26172 + }, + { + "epoch": 1.2185441255208698, + "grad_norm": 0.3538921011068061, + "learning_rate": 7.405824459147037e-05, + "loss": 2.8541, + "step": 26173 + }, + { + "epoch": 1.2185906837069629, + "grad_norm": 0.35598291130958587, + "learning_rate": 7.405586999614973e-05, + "loss": 2.7423, + "step": 26174 + }, + { + "epoch": 1.2186372418930558, + "grad_norm": 0.3270492976249713, + "learning_rate": 7.405349533022747e-05, + "loss": 2.6911, + "step": 26175 + }, + { + "epoch": 1.218683800079149, + "grad_norm": 0.33408829933092576, + "learning_rate": 7.40511205937106e-05, + "loss": 2.6464, + "step": 26176 + }, + { + "epoch": 1.218730358265242, + "grad_norm": 0.3208275816604202, + "learning_rate": 7.404874578660603e-05, + "loss": 2.8757, + "step": 26177 + }, + { + "epoch": 1.2187769164513351, + "grad_norm": 0.34466755677160843, + "learning_rate": 
7.404637090892075e-05, + "loss": 2.8058, + "step": 26178 + }, + { + "epoch": 1.218823474637428, + "grad_norm": 0.3099772895260942, + "learning_rate": 7.404399596066178e-05, + "loss": 2.8522, + "step": 26179 + }, + { + "epoch": 1.2188700328235211, + "grad_norm": 0.33875609805731843, + "learning_rate": 7.404162094183603e-05, + "loss": 2.7713, + "step": 26180 + }, + { + "epoch": 1.2189165910096142, + "grad_norm": 0.33976639070264353, + "learning_rate": 7.403924585245047e-05, + "loss": 2.7657, + "step": 26181 + }, + { + "epoch": 1.2189631491957074, + "grad_norm": 0.3144137401132937, + "learning_rate": 7.403687069251212e-05, + "loss": 2.6801, + "step": 26182 + }, + { + "epoch": 1.2190097073818005, + "grad_norm": 0.35263012807915606, + "learning_rate": 7.403449546202791e-05, + "loss": 2.8543, + "step": 26183 + }, + { + "epoch": 1.2190562655678936, + "grad_norm": 0.30574345387432766, + "learning_rate": 7.403212016100484e-05, + "loss": 2.7585, + "step": 26184 + }, + { + "epoch": 1.2191028237539865, + "grad_norm": 0.35383566519285237, + "learning_rate": 7.402974478944985e-05, + "loss": 2.7749, + "step": 26185 + }, + { + "epoch": 1.2191493819400796, + "grad_norm": 0.3406245209862486, + "learning_rate": 7.402736934736992e-05, + "loss": 2.8719, + "step": 26186 + }, + { + "epoch": 1.2191959401261727, + "grad_norm": 0.3461681990315239, + "learning_rate": 7.402499383477204e-05, + "loss": 2.8224, + "step": 26187 + }, + { + "epoch": 1.2192424983122658, + "grad_norm": 0.32426170216519834, + "learning_rate": 7.402261825166317e-05, + "loss": 2.8596, + "step": 26188 + }, + { + "epoch": 1.2192890564983587, + "grad_norm": 0.3414103450181562, + "learning_rate": 7.402024259805028e-05, + "loss": 2.7168, + "step": 26189 + }, + { + "epoch": 1.2193356146844518, + "grad_norm": 0.34189324177767905, + "learning_rate": 7.401786687394034e-05, + "loss": 2.6516, + "step": 26190 + }, + { + "epoch": 1.219382172870545, + "grad_norm": 0.3713992830208775, + "learning_rate": 7.401549107934033e-05, + "loss": 2.9078, + "step": 26191 + }, + { + "epoch": 1.219428731056638, + "grad_norm": 0.32937739717344106, + "learning_rate": 7.401311521425723e-05, + "loss": 2.7072, + "step": 26192 + }, + { + "epoch": 1.2194752892427312, + "grad_norm": 0.3597275022367544, + "learning_rate": 7.4010739278698e-05, + "loss": 2.7617, + "step": 26193 + }, + { + "epoch": 1.219521847428824, + "grad_norm": 0.3554917838635625, + "learning_rate": 7.40083632726696e-05, + "loss": 2.8692, + "step": 26194 + }, + { + "epoch": 1.2195684056149172, + "grad_norm": 0.3317234424877104, + "learning_rate": 7.400598719617902e-05, + "loss": 2.7873, + "step": 26195 + }, + { + "epoch": 1.2196149638010103, + "grad_norm": 0.33973619468135496, + "learning_rate": 7.400361104923323e-05, + "loss": 2.8092, + "step": 26196 + }, + { + "epoch": 1.2196615219871034, + "grad_norm": 0.3263859093329429, + "learning_rate": 7.400123483183922e-05, + "loss": 2.8537, + "step": 26197 + }, + { + "epoch": 1.2197080801731965, + "grad_norm": 0.3448831856275919, + "learning_rate": 7.399885854400396e-05, + "loss": 2.7518, + "step": 26198 + }, + { + "epoch": 1.2197546383592894, + "grad_norm": 0.2976348011553661, + "learning_rate": 7.399648218573439e-05, + "loss": 2.8115, + "step": 26199 + }, + { + "epoch": 1.2198011965453825, + "grad_norm": 0.3445570232128307, + "learning_rate": 7.399410575703751e-05, + "loss": 2.8302, + "step": 26200 + }, + { + "epoch": 1.2198477547314757, + "grad_norm": 0.31919703161728674, + "learning_rate": 7.399172925792032e-05, + "loss": 2.8286, + "step": 26201 + }, + { + "epoch": 
1.2198943129175688, + "grad_norm": 0.31276793815640536, + "learning_rate": 7.398935268838976e-05, + "loss": 2.7242, + "step": 26202 + }, + { + "epoch": 1.219940871103662, + "grad_norm": 0.32588642016424696, + "learning_rate": 7.398697604845279e-05, + "loss": 2.8312, + "step": 26203 + }, + { + "epoch": 1.2199874292897548, + "grad_norm": 0.3041280277415698, + "learning_rate": 7.398459933811644e-05, + "loss": 2.865, + "step": 26204 + }, + { + "epoch": 1.220033987475848, + "grad_norm": 0.3401409045288768, + "learning_rate": 7.398222255738762e-05, + "loss": 2.7885, + "step": 26205 + }, + { + "epoch": 1.220080545661941, + "grad_norm": 0.3300502141546955, + "learning_rate": 7.397984570627337e-05, + "loss": 2.7266, + "step": 26206 + }, + { + "epoch": 1.2201271038480341, + "grad_norm": 0.31249955881483976, + "learning_rate": 7.39774687847806e-05, + "loss": 2.7931, + "step": 26207 + }, + { + "epoch": 1.220173662034127, + "grad_norm": 0.314669983520315, + "learning_rate": 7.397509179291635e-05, + "loss": 2.8151, + "step": 26208 + }, + { + "epoch": 1.2202202202202201, + "grad_norm": 0.31901153098205615, + "learning_rate": 7.397271473068756e-05, + "loss": 2.6883, + "step": 26209 + }, + { + "epoch": 1.2202667784063133, + "grad_norm": 0.32442001772579576, + "learning_rate": 7.397033759810122e-05, + "loss": 2.8566, + "step": 26210 + }, + { + "epoch": 1.2203133365924064, + "grad_norm": 0.35728999344688633, + "learning_rate": 7.39679603951643e-05, + "loss": 2.8339, + "step": 26211 + }, + { + "epoch": 1.2203598947784995, + "grad_norm": 0.3191681089220806, + "learning_rate": 7.396558312188376e-05, + "loss": 2.819, + "step": 26212 + }, + { + "epoch": 1.2204064529645926, + "grad_norm": 0.3537345462517474, + "learning_rate": 7.396320577826662e-05, + "loss": 2.7238, + "step": 26213 + }, + { + "epoch": 1.2204530111506855, + "grad_norm": 0.32161103570778876, + "learning_rate": 7.396082836431981e-05, + "loss": 2.7709, + "step": 26214 + }, + { + "epoch": 1.2204995693367786, + "grad_norm": 0.35790256094387535, + "learning_rate": 7.395845088005035e-05, + "loss": 2.8346, + "step": 26215 + }, + { + "epoch": 1.2205461275228717, + "grad_norm": 0.3052756576071824, + "learning_rate": 7.395607332546517e-05, + "loss": 2.6988, + "step": 26216 + }, + { + "epoch": 1.2205926857089648, + "grad_norm": 0.3225892391363323, + "learning_rate": 7.395369570057129e-05, + "loss": 2.7785, + "step": 26217 + }, + { + "epoch": 1.2206392438950577, + "grad_norm": 0.3356627583660474, + "learning_rate": 7.395131800537566e-05, + "loss": 2.8848, + "step": 26218 + }, + { + "epoch": 1.2206858020811509, + "grad_norm": 0.3574344471371773, + "learning_rate": 7.39489402398853e-05, + "loss": 2.8436, + "step": 26219 + }, + { + "epoch": 1.220732360267244, + "grad_norm": 0.3452120022458841, + "learning_rate": 7.394656240410714e-05, + "loss": 2.8004, + "step": 26220 + }, + { + "epoch": 1.220778918453337, + "grad_norm": 0.35324031537808775, + "learning_rate": 7.394418449804818e-05, + "loss": 2.8637, + "step": 26221 + }, + { + "epoch": 1.2208254766394302, + "grad_norm": 0.31620163959584857, + "learning_rate": 7.394180652171541e-05, + "loss": 2.821, + "step": 26222 + }, + { + "epoch": 1.2208720348255233, + "grad_norm": 0.32690577499563656, + "learning_rate": 7.393942847511576e-05, + "loss": 2.7344, + "step": 26223 + }, + { + "epoch": 1.2209185930116162, + "grad_norm": 0.3502127083783529, + "learning_rate": 7.393705035825628e-05, + "loss": 2.8522, + "step": 26224 + }, + { + "epoch": 1.2209651511977093, + "grad_norm": 0.34667062080130495, + "learning_rate": 
7.393467217114391e-05, + "loss": 2.8451, + "step": 26225 + }, + { + "epoch": 1.2210117093838024, + "grad_norm": 0.32943838065457254, + "learning_rate": 7.393229391378564e-05, + "loss": 2.689, + "step": 26226 + }, + { + "epoch": 1.2210582675698956, + "grad_norm": 0.3425710019235365, + "learning_rate": 7.392991558618845e-05, + "loss": 2.8275, + "step": 26227 + }, + { + "epoch": 1.2211048257559884, + "grad_norm": 0.3355153769015697, + "learning_rate": 7.392753718835929e-05, + "loss": 2.8348, + "step": 26228 + }, + { + "epoch": 1.2211513839420816, + "grad_norm": 0.3265172394069756, + "learning_rate": 7.392515872030519e-05, + "loss": 2.7673, + "step": 26229 + }, + { + "epoch": 1.2211979421281747, + "grad_norm": 0.34853607535337244, + "learning_rate": 7.392278018203309e-05, + "loss": 2.8495, + "step": 26230 + }, + { + "epoch": 1.2212445003142678, + "grad_norm": 0.36542069538929894, + "learning_rate": 7.392040157354999e-05, + "loss": 2.7636, + "step": 26231 + }, + { + "epoch": 1.221291058500361, + "grad_norm": 0.3181745410954106, + "learning_rate": 7.391802289486286e-05, + "loss": 2.8441, + "step": 26232 + }, + { + "epoch": 1.221337616686454, + "grad_norm": 0.3432313034957303, + "learning_rate": 7.391564414597872e-05, + "loss": 2.7894, + "step": 26233 + }, + { + "epoch": 1.221384174872547, + "grad_norm": 0.31967479221984485, + "learning_rate": 7.391326532690449e-05, + "loss": 2.7527, + "step": 26234 + }, + { + "epoch": 1.22143073305864, + "grad_norm": 0.3449616516463126, + "learning_rate": 7.391088643764719e-05, + "loss": 2.6944, + "step": 26235 + }, + { + "epoch": 1.2214772912447331, + "grad_norm": 0.34328234609663383, + "learning_rate": 7.390850747821379e-05, + "loss": 2.8709, + "step": 26236 + }, + { + "epoch": 1.2215238494308263, + "grad_norm": 0.3416124979064241, + "learning_rate": 7.390612844861128e-05, + "loss": 2.685, + "step": 26237 + }, + { + "epoch": 1.2215704076169192, + "grad_norm": 0.34419021906264485, + "learning_rate": 7.390374934884665e-05, + "loss": 2.7567, + "step": 26238 + }, + { + "epoch": 1.2216169658030123, + "grad_norm": 0.3742729096260431, + "learning_rate": 7.390137017892684e-05, + "loss": 2.7396, + "step": 26239 + }, + { + "epoch": 1.2216635239891054, + "grad_norm": 0.3562829263042809, + "learning_rate": 7.38989909388589e-05, + "loss": 2.8736, + "step": 26240 + }, + { + "epoch": 1.2217100821751985, + "grad_norm": 0.3581723037478722, + "learning_rate": 7.389661162864976e-05, + "loss": 2.7734, + "step": 26241 + }, + { + "epoch": 1.2217566403612916, + "grad_norm": 0.34532360487681196, + "learning_rate": 7.389423224830641e-05, + "loss": 2.7634, + "step": 26242 + }, + { + "epoch": 1.2218031985473845, + "grad_norm": 0.3586388035673692, + "learning_rate": 7.389185279783586e-05, + "loss": 2.8011, + "step": 26243 + }, + { + "epoch": 1.2218497567334776, + "grad_norm": 0.3338460582261849, + "learning_rate": 7.388947327724506e-05, + "loss": 2.7028, + "step": 26244 + }, + { + "epoch": 1.2218963149195707, + "grad_norm": 0.3606401367671041, + "learning_rate": 7.388709368654101e-05, + "loss": 2.809, + "step": 26245 + }, + { + "epoch": 1.2219428731056639, + "grad_norm": 0.34184553178656124, + "learning_rate": 7.38847140257307e-05, + "loss": 2.7727, + "step": 26246 + }, + { + "epoch": 1.221989431291757, + "grad_norm": 0.3409134812341095, + "learning_rate": 7.388233429482111e-05, + "loss": 2.7181, + "step": 26247 + }, + { + "epoch": 1.2220359894778499, + "grad_norm": 0.36330279979383395, + "learning_rate": 7.387995449381922e-05, + "loss": 2.8126, + "step": 26248 + }, + { + "epoch": 
1.222082547663943, + "grad_norm": 0.321611604087524, + "learning_rate": 7.3877574622732e-05, + "loss": 2.8009, + "step": 26249 + }, + { + "epoch": 1.222129105850036, + "grad_norm": 0.3502686266671757, + "learning_rate": 7.387519468156647e-05, + "loss": 2.7273, + "step": 26250 + }, + { + "epoch": 1.2221756640361292, + "grad_norm": 0.345179616063284, + "learning_rate": 7.38728146703296e-05, + "loss": 2.7305, + "step": 26251 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.3330576696209643, + "learning_rate": 7.387043458902836e-05, + "loss": 2.7129, + "step": 26252 + }, + { + "epoch": 1.2222687804083152, + "grad_norm": 0.34458208048906447, + "learning_rate": 7.386805443766975e-05, + "loss": 2.883, + "step": 26253 + }, + { + "epoch": 1.2223153385944083, + "grad_norm": 0.3487090111198732, + "learning_rate": 7.386567421626074e-05, + "loss": 2.9351, + "step": 26254 + }, + { + "epoch": 1.2223618967805014, + "grad_norm": 0.3885466556930606, + "learning_rate": 7.386329392480834e-05, + "loss": 2.7328, + "step": 26255 + }, + { + "epoch": 1.2224084549665946, + "grad_norm": 0.33945293479257704, + "learning_rate": 7.386091356331952e-05, + "loss": 2.8024, + "step": 26256 + }, + { + "epoch": 1.2224550131526875, + "grad_norm": 0.3168394742663853, + "learning_rate": 7.385853313180126e-05, + "loss": 2.8595, + "step": 26257 + }, + { + "epoch": 1.2225015713387806, + "grad_norm": 0.4576655004944907, + "learning_rate": 7.385615263026058e-05, + "loss": 2.7157, + "step": 26258 + }, + { + "epoch": 1.2225481295248737, + "grad_norm": 0.33678875550242066, + "learning_rate": 7.385377205870442e-05, + "loss": 2.7735, + "step": 26259 + }, + { + "epoch": 1.2225946877109668, + "grad_norm": 0.34935282782802524, + "learning_rate": 7.38513914171398e-05, + "loss": 2.7287, + "step": 26260 + }, + { + "epoch": 1.22264124589706, + "grad_norm": 0.3341481752115489, + "learning_rate": 7.384901070557368e-05, + "loss": 2.7473, + "step": 26261 + }, + { + "epoch": 1.222687804083153, + "grad_norm": 0.3492150490222742, + "learning_rate": 7.384662992401308e-05, + "loss": 2.7646, + "step": 26262 + }, + { + "epoch": 1.222734362269246, + "grad_norm": 0.3514812615601246, + "learning_rate": 7.384424907246497e-05, + "loss": 2.7302, + "step": 26263 + }, + { + "epoch": 1.222780920455339, + "grad_norm": 0.3322539547622578, + "learning_rate": 7.384186815093634e-05, + "loss": 2.7297, + "step": 26264 + }, + { + "epoch": 1.2228274786414322, + "grad_norm": 0.36880980323008145, + "learning_rate": 7.383948715943417e-05, + "loss": 2.7924, + "step": 26265 + }, + { + "epoch": 1.2228740368275253, + "grad_norm": 0.32383990007560365, + "learning_rate": 7.383710609796545e-05, + "loss": 2.8264, + "step": 26266 + }, + { + "epoch": 1.2229205950136182, + "grad_norm": 0.33405651519811713, + "learning_rate": 7.383472496653717e-05, + "loss": 2.6164, + "step": 26267 + }, + { + "epoch": 1.2229671531997113, + "grad_norm": 0.342536091695155, + "learning_rate": 7.383234376515634e-05, + "loss": 2.7994, + "step": 26268 + }, + { + "epoch": 1.2230137113858044, + "grad_norm": 0.31994373404225507, + "learning_rate": 7.382996249382989e-05, + "loss": 2.8494, + "step": 26269 + }, + { + "epoch": 1.2230602695718975, + "grad_norm": 0.36069093154740006, + "learning_rate": 7.382758115256488e-05, + "loss": 2.7469, + "step": 26270 + }, + { + "epoch": 1.2231068277579906, + "grad_norm": 0.31406831727862994, + "learning_rate": 7.382519974136825e-05, + "loss": 2.7581, + "step": 26271 + }, + { + "epoch": 1.2231533859440837, + "grad_norm": 0.36020513042882624, + "learning_rate": 
7.382281826024702e-05, + "loss": 2.8498, + "step": 26272 + }, + { + "epoch": 1.2231999441301766, + "grad_norm": 0.32658089245090904, + "learning_rate": 7.382043670920816e-05, + "loss": 2.7963, + "step": 26273 + }, + { + "epoch": 1.2232465023162697, + "grad_norm": 0.3358600519622216, + "learning_rate": 7.381805508825868e-05, + "loss": 2.7586, + "step": 26274 + }, + { + "epoch": 1.2232930605023629, + "grad_norm": 0.31236374564139907, + "learning_rate": 7.381567339740553e-05, + "loss": 2.8606, + "step": 26275 + }, + { + "epoch": 1.223339618688456, + "grad_norm": 0.3475149099024857, + "learning_rate": 7.381329163665574e-05, + "loss": 2.7924, + "step": 26276 + }, + { + "epoch": 1.2233861768745489, + "grad_norm": 0.3816010219418679, + "learning_rate": 7.381090980601628e-05, + "loss": 2.8182, + "step": 26277 + }, + { + "epoch": 1.223432735060642, + "grad_norm": 0.36077277314413886, + "learning_rate": 7.380852790549414e-05, + "loss": 2.8087, + "step": 26278 + }, + { + "epoch": 1.223479293246735, + "grad_norm": 0.34357953944942066, + "learning_rate": 7.380614593509633e-05, + "loss": 2.825, + "step": 26279 + }, + { + "epoch": 1.2235258514328282, + "grad_norm": 0.36251409851504923, + "learning_rate": 7.380376389482981e-05, + "loss": 2.7996, + "step": 26280 + }, + { + "epoch": 1.2235724096189213, + "grad_norm": 0.3367373318154501, + "learning_rate": 7.38013817847016e-05, + "loss": 2.7224, + "step": 26281 + }, + { + "epoch": 1.2236189678050142, + "grad_norm": 0.36418137508152915, + "learning_rate": 7.379899960471866e-05, + "loss": 2.7988, + "step": 26282 + }, + { + "epoch": 1.2236655259911073, + "grad_norm": 0.33021404982961705, + "learning_rate": 7.379661735488803e-05, + "loss": 2.8024, + "step": 26283 + }, + { + "epoch": 1.2237120841772005, + "grad_norm": 0.3258844164974304, + "learning_rate": 7.379423503521665e-05, + "loss": 2.7956, + "step": 26284 + }, + { + "epoch": 1.2237586423632936, + "grad_norm": 0.3152857115140414, + "learning_rate": 7.379185264571155e-05, + "loss": 2.8464, + "step": 26285 + }, + { + "epoch": 1.2238052005493867, + "grad_norm": 0.3111814133418417, + "learning_rate": 7.37894701863797e-05, + "loss": 2.7878, + "step": 26286 + }, + { + "epoch": 1.2238517587354796, + "grad_norm": 0.3382248430577973, + "learning_rate": 7.378708765722809e-05, + "loss": 2.7803, + "step": 26287 + }, + { + "epoch": 1.2238983169215727, + "grad_norm": 0.3101565925222612, + "learning_rate": 7.378470505826374e-05, + "loss": 2.8067, + "step": 26288 + }, + { + "epoch": 1.2239448751076658, + "grad_norm": 0.29942542205450196, + "learning_rate": 7.37823223894936e-05, + "loss": 2.765, + "step": 26289 + }, + { + "epoch": 1.223991433293759, + "grad_norm": 0.31126472577897146, + "learning_rate": 7.377993965092471e-05, + "loss": 2.861, + "step": 26290 + }, + { + "epoch": 1.224037991479852, + "grad_norm": 0.3356871149294289, + "learning_rate": 7.377755684256404e-05, + "loss": 2.8452, + "step": 26291 + }, + { + "epoch": 1.224084549665945, + "grad_norm": 0.3084971752413795, + "learning_rate": 7.377517396441856e-05, + "loss": 2.7953, + "step": 26292 + }, + { + "epoch": 1.224131107852038, + "grad_norm": 0.3375000562214523, + "learning_rate": 7.37727910164953e-05, + "loss": 2.7786, + "step": 26293 + }, + { + "epoch": 1.2241776660381312, + "grad_norm": 0.30992280084054014, + "learning_rate": 7.377040799880124e-05, + "loss": 2.6983, + "step": 26294 + }, + { + "epoch": 1.2242242242242243, + "grad_norm": 0.3269689737132484, + "learning_rate": 7.376802491134338e-05, + "loss": 2.7316, + "step": 26295 + }, + { + "epoch": 
1.2242707824103172, + "grad_norm": 0.3173477409984855, + "learning_rate": 7.376564175412869e-05, + "loss": 2.7589, + "step": 26296 + }, + { + "epoch": 1.2243173405964103, + "grad_norm": 0.3243239248657678, + "learning_rate": 7.37632585271642e-05, + "loss": 2.7391, + "step": 26297 + }, + { + "epoch": 1.2243638987825034, + "grad_norm": 0.3251161916634539, + "learning_rate": 7.376087523045689e-05, + "loss": 2.7611, + "step": 26298 + }, + { + "epoch": 1.2244104569685965, + "grad_norm": 0.3289804780079661, + "learning_rate": 7.375849186401373e-05, + "loss": 2.7721, + "step": 26299 + }, + { + "epoch": 1.2244570151546896, + "grad_norm": 0.3090182164516086, + "learning_rate": 7.375610842784175e-05, + "loss": 2.6799, + "step": 26300 + }, + { + "epoch": 1.2245035733407827, + "grad_norm": 0.34144805158401514, + "learning_rate": 7.375372492194795e-05, + "loss": 2.7546, + "step": 26301 + }, + { + "epoch": 1.2245501315268756, + "grad_norm": 0.35941095180809507, + "learning_rate": 7.375134134633929e-05, + "loss": 2.8451, + "step": 26302 + }, + { + "epoch": 1.2245966897129688, + "grad_norm": 0.3311672854809076, + "learning_rate": 7.374895770102279e-05, + "loss": 2.7368, + "step": 26303 + }, + { + "epoch": 1.2246432478990619, + "grad_norm": 0.3463346768879093, + "learning_rate": 7.374657398600542e-05, + "loss": 2.6901, + "step": 26304 + }, + { + "epoch": 1.224689806085155, + "grad_norm": 0.3268230002396035, + "learning_rate": 7.374419020129421e-05, + "loss": 2.8226, + "step": 26305 + }, + { + "epoch": 1.2247363642712479, + "grad_norm": 0.32388711072244325, + "learning_rate": 7.374180634689615e-05, + "loss": 2.7776, + "step": 26306 + }, + { + "epoch": 1.224782922457341, + "grad_norm": 0.32176240813178336, + "learning_rate": 7.373942242281821e-05, + "loss": 2.7855, + "step": 26307 + }, + { + "epoch": 1.224829480643434, + "grad_norm": 0.3460859693815707, + "learning_rate": 7.373703842906741e-05, + "loss": 2.6517, + "step": 26308 + }, + { + "epoch": 1.2248760388295272, + "grad_norm": 0.3407847585067476, + "learning_rate": 7.373465436565073e-05, + "loss": 2.7639, + "step": 26309 + }, + { + "epoch": 1.2249225970156203, + "grad_norm": 0.3309808669767721, + "learning_rate": 7.37322702325752e-05, + "loss": 2.9045, + "step": 26310 + }, + { + "epoch": 1.2249691552017135, + "grad_norm": 0.31377255528183917, + "learning_rate": 7.372988602984778e-05, + "loss": 2.7581, + "step": 26311 + }, + { + "epoch": 1.2250157133878063, + "grad_norm": 0.32260349251568154, + "learning_rate": 7.372750175747549e-05, + "loss": 2.7558, + "step": 26312 + }, + { + "epoch": 1.2250622715738995, + "grad_norm": 0.3175578196354593, + "learning_rate": 7.37251174154653e-05, + "loss": 2.8189, + "step": 26313 + }, + { + "epoch": 1.2251088297599926, + "grad_norm": 0.3322502928537297, + "learning_rate": 7.372273300382425e-05, + "loss": 2.7977, + "step": 26314 + }, + { + "epoch": 1.2251553879460857, + "grad_norm": 0.31619630979464763, + "learning_rate": 7.372034852255932e-05, + "loss": 2.7868, + "step": 26315 + }, + { + "epoch": 1.2252019461321786, + "grad_norm": 0.32396993496127036, + "learning_rate": 7.371796397167749e-05, + "loss": 2.785, + "step": 26316 + }, + { + "epoch": 1.2252485043182717, + "grad_norm": 0.3177234576992557, + "learning_rate": 7.371557935118577e-05, + "loss": 2.7205, + "step": 26317 + }, + { + "epoch": 1.2252950625043648, + "grad_norm": 0.30226504290178763, + "learning_rate": 7.371319466109116e-05, + "loss": 2.8203, + "step": 26318 + }, + { + "epoch": 1.225341620690458, + "grad_norm": 0.3208907436539321, + "learning_rate": 
7.371080990140068e-05, + "loss": 2.7621, + "step": 26319 + }, + { + "epoch": 1.225388178876551, + "grad_norm": 0.31512042944177476, + "learning_rate": 7.37084250721213e-05, + "loss": 2.8295, + "step": 26320 + }, + { + "epoch": 1.2254347370626442, + "grad_norm": 0.331534716375117, + "learning_rate": 7.370604017326001e-05, + "loss": 2.8501, + "step": 26321 + }, + { + "epoch": 1.225481295248737, + "grad_norm": 0.3350990099154658, + "learning_rate": 7.370365520482384e-05, + "loss": 2.7416, + "step": 26322 + }, + { + "epoch": 1.2255278534348302, + "grad_norm": 0.31588285847999, + "learning_rate": 7.370127016681977e-05, + "loss": 2.7239, + "step": 26323 + }, + { + "epoch": 1.2255744116209233, + "grad_norm": 0.3627472229255031, + "learning_rate": 7.369888505925482e-05, + "loss": 2.8345, + "step": 26324 + }, + { + "epoch": 1.2256209698070164, + "grad_norm": 0.3452025337076336, + "learning_rate": 7.369649988213598e-05, + "loss": 2.9063, + "step": 26325 + }, + { + "epoch": 1.2256675279931093, + "grad_norm": 0.38629387204735416, + "learning_rate": 7.369411463547023e-05, + "loss": 2.6209, + "step": 26326 + }, + { + "epoch": 1.2257140861792024, + "grad_norm": 0.3116606479827369, + "learning_rate": 7.36917293192646e-05, + "loss": 2.7592, + "step": 26327 + }, + { + "epoch": 1.2257606443652955, + "grad_norm": 0.37047992590894546, + "learning_rate": 7.368934393352607e-05, + "loss": 2.7898, + "step": 26328 + }, + { + "epoch": 1.2258072025513886, + "grad_norm": 0.3161803215951271, + "learning_rate": 7.368695847826167e-05, + "loss": 2.7804, + "step": 26329 + }, + { + "epoch": 1.2258537607374818, + "grad_norm": 0.3603696501072644, + "learning_rate": 7.368457295347836e-05, + "loss": 2.7898, + "step": 26330 + }, + { + "epoch": 1.2259003189235747, + "grad_norm": 0.34432288429156477, + "learning_rate": 7.368218735918316e-05, + "loss": 2.8295, + "step": 26331 + }, + { + "epoch": 1.2259468771096678, + "grad_norm": 0.3340685583264291, + "learning_rate": 7.367980169538308e-05, + "loss": 2.7462, + "step": 26332 + }, + { + "epoch": 1.2259934352957609, + "grad_norm": 0.34131688834106316, + "learning_rate": 7.367741596208512e-05, + "loss": 2.7955, + "step": 26333 + }, + { + "epoch": 1.226039993481854, + "grad_norm": 0.34099817094602214, + "learning_rate": 7.367503015929626e-05, + "loss": 2.7965, + "step": 26334 + }, + { + "epoch": 1.226086551667947, + "grad_norm": 0.3373902380095284, + "learning_rate": 7.367264428702354e-05, + "loss": 2.7247, + "step": 26335 + }, + { + "epoch": 1.22613310985404, + "grad_norm": 0.31875455025664456, + "learning_rate": 7.367025834527393e-05, + "loss": 2.8202, + "step": 26336 + }, + { + "epoch": 1.2261796680401331, + "grad_norm": 0.32183046227518924, + "learning_rate": 7.366787233405445e-05, + "loss": 2.7833, + "step": 26337 + }, + { + "epoch": 1.2262262262262262, + "grad_norm": 0.33435952681722625, + "learning_rate": 7.366548625337209e-05, + "loss": 2.799, + "step": 26338 + }, + { + "epoch": 1.2262727844123193, + "grad_norm": 0.30437734458993176, + "learning_rate": 7.366310010323385e-05, + "loss": 2.8124, + "step": 26339 + }, + { + "epoch": 1.2263193425984125, + "grad_norm": 0.34874529165082796, + "learning_rate": 7.366071388364677e-05, + "loss": 2.7195, + "step": 26340 + }, + { + "epoch": 1.2263659007845054, + "grad_norm": 0.3015263242719351, + "learning_rate": 7.365832759461779e-05, + "loss": 2.8467, + "step": 26341 + }, + { + "epoch": 1.2264124589705985, + "grad_norm": 0.3317138608511808, + "learning_rate": 7.365594123615398e-05, + "loss": 2.732, + "step": 26342 + }, + { + "epoch": 
1.2264590171566916, + "grad_norm": 0.3114880825417547, + "learning_rate": 7.36535548082623e-05, + "loss": 2.7223, + "step": 26343 + }, + { + "epoch": 1.2265055753427847, + "grad_norm": 0.3901093986408228, + "learning_rate": 7.365116831094978e-05, + "loss": 2.8146, + "step": 26344 + }, + { + "epoch": 1.2265521335288776, + "grad_norm": 0.3492916879465431, + "learning_rate": 7.364878174422339e-05, + "loss": 2.8602, + "step": 26345 + }, + { + "epoch": 1.2265986917149707, + "grad_norm": 0.3543879045885109, + "learning_rate": 7.364639510809017e-05, + "loss": 2.8472, + "step": 26346 + }, + { + "epoch": 1.2266452499010638, + "grad_norm": 0.3442382925936812, + "learning_rate": 7.36440084025571e-05, + "loss": 2.7913, + "step": 26347 + }, + { + "epoch": 1.226691808087157, + "grad_norm": 0.3242322172691895, + "learning_rate": 7.36416216276312e-05, + "loss": 2.8119, + "step": 26348 + }, + { + "epoch": 1.22673836627325, + "grad_norm": 0.3477233428785778, + "learning_rate": 7.363923478331947e-05, + "loss": 2.7516, + "step": 26349 + }, + { + "epoch": 1.2267849244593432, + "grad_norm": 0.3134323088659048, + "learning_rate": 7.36368478696289e-05, + "loss": 2.7249, + "step": 26350 + }, + { + "epoch": 1.226831482645436, + "grad_norm": 0.32795281606811677, + "learning_rate": 7.363446088656653e-05, + "loss": 2.7869, + "step": 26351 + }, + { + "epoch": 1.2268780408315292, + "grad_norm": 0.33350777167485546, + "learning_rate": 7.363207383413934e-05, + "loss": 2.7375, + "step": 26352 + }, + { + "epoch": 1.2269245990176223, + "grad_norm": 0.31431996899153863, + "learning_rate": 7.362968671235434e-05, + "loss": 2.7891, + "step": 26353 + }, + { + "epoch": 1.2269711572037154, + "grad_norm": 0.33740370304873757, + "learning_rate": 7.362729952121854e-05, + "loss": 2.7348, + "step": 26354 + }, + { + "epoch": 1.2270177153898083, + "grad_norm": 0.31072103374590926, + "learning_rate": 7.362491226073895e-05, + "loss": 2.8328, + "step": 26355 + }, + { + "epoch": 1.2270642735759014, + "grad_norm": 0.3426469234297731, + "learning_rate": 7.362252493092255e-05, + "loss": 2.7569, + "step": 26356 + }, + { + "epoch": 1.2271108317619945, + "grad_norm": 0.32691277346227554, + "learning_rate": 7.362013753177638e-05, + "loss": 2.7782, + "step": 26357 + }, + { + "epoch": 1.2271573899480877, + "grad_norm": 0.3130507577635744, + "learning_rate": 7.361775006330743e-05, + "loss": 2.78, + "step": 26358 + }, + { + "epoch": 1.2272039481341808, + "grad_norm": 0.3313696543141152, + "learning_rate": 7.361536252552271e-05, + "loss": 2.8071, + "step": 26359 + }, + { + "epoch": 1.2272505063202739, + "grad_norm": 0.3334687359872421, + "learning_rate": 7.361297491842923e-05, + "loss": 2.8279, + "step": 26360 + }, + { + "epoch": 1.2272970645063668, + "grad_norm": 0.3188421826617489, + "learning_rate": 7.361058724203401e-05, + "loss": 2.889, + "step": 26361 + }, + { + "epoch": 1.22734362269246, + "grad_norm": 0.3238039736351997, + "learning_rate": 7.360819949634401e-05, + "loss": 2.7954, + "step": 26362 + }, + { + "epoch": 1.227390180878553, + "grad_norm": 0.32380776176615794, + "learning_rate": 7.360581168136628e-05, + "loss": 2.7747, + "step": 26363 + }, + { + "epoch": 1.2274367390646461, + "grad_norm": 0.3060111391554299, + "learning_rate": 7.360342379710785e-05, + "loss": 2.8365, + "step": 26364 + }, + { + "epoch": 1.227483297250739, + "grad_norm": 0.3326426953870273, + "learning_rate": 7.360103584357565e-05, + "loss": 2.9388, + "step": 26365 + }, + { + "epoch": 1.2275298554368321, + "grad_norm": 0.31555472596346795, + "learning_rate": 
7.359864782077675e-05, + "loss": 2.7451, + "step": 26366 + }, + { + "epoch": 1.2275764136229252, + "grad_norm": 0.32297947317119907, + "learning_rate": 7.359625972871817e-05, + "loss": 2.8968, + "step": 26367 + }, + { + "epoch": 1.2276229718090184, + "grad_norm": 0.3398967636178767, + "learning_rate": 7.359387156740687e-05, + "loss": 2.7926, + "step": 26368 + }, + { + "epoch": 1.2276695299951115, + "grad_norm": 0.323743664393424, + "learning_rate": 7.359148333684988e-05, + "loss": 2.8405, + "step": 26369 + }, + { + "epoch": 1.2277160881812044, + "grad_norm": 0.31119259590852894, + "learning_rate": 7.358909503705422e-05, + "loss": 2.8881, + "step": 26370 + }, + { + "epoch": 1.2277626463672975, + "grad_norm": 0.33096143044784615, + "learning_rate": 7.358670666802687e-05, + "loss": 2.7326, + "step": 26371 + }, + { + "epoch": 1.2278092045533906, + "grad_norm": 0.313853085407359, + "learning_rate": 7.358431822977487e-05, + "loss": 2.7827, + "step": 26372 + }, + { + "epoch": 1.2278557627394837, + "grad_norm": 0.34266247172268466, + "learning_rate": 7.358192972230525e-05, + "loss": 2.7722, + "step": 26373 + }, + { + "epoch": 1.2279023209255768, + "grad_norm": 0.34228322765923835, + "learning_rate": 7.357954114562495e-05, + "loss": 2.7933, + "step": 26374 + }, + { + "epoch": 1.2279488791116697, + "grad_norm": 0.34610479122401805, + "learning_rate": 7.357715249974103e-05, + "loss": 2.9129, + "step": 26375 + }, + { + "epoch": 1.2279954372977628, + "grad_norm": 0.3318662179196357, + "learning_rate": 7.357476378466049e-05, + "loss": 2.8082, + "step": 26376 + }, + { + "epoch": 1.228041995483856, + "grad_norm": 0.320256308328133, + "learning_rate": 7.357237500039034e-05, + "loss": 2.7222, + "step": 26377 + }, + { + "epoch": 1.228088553669949, + "grad_norm": 0.32597054305527257, + "learning_rate": 7.35699861469376e-05, + "loss": 2.7684, + "step": 26378 + }, + { + "epoch": 1.2281351118560422, + "grad_norm": 0.3362182084460622, + "learning_rate": 7.356759722430927e-05, + "loss": 2.6723, + "step": 26379 + }, + { + "epoch": 1.228181670042135, + "grad_norm": 0.35575461545734693, + "learning_rate": 7.356520823251236e-05, + "loss": 2.8032, + "step": 26380 + }, + { + "epoch": 1.2282282282282282, + "grad_norm": 0.3507271085676483, + "learning_rate": 7.356281917155388e-05, + "loss": 2.7221, + "step": 26381 + }, + { + "epoch": 1.2282747864143213, + "grad_norm": 0.3372727670899324, + "learning_rate": 7.356043004144086e-05, + "loss": 2.768, + "step": 26382 + }, + { + "epoch": 1.2283213446004144, + "grad_norm": 0.36398821425196426, + "learning_rate": 7.35580408421803e-05, + "loss": 2.7927, + "step": 26383 + }, + { + "epoch": 1.2283679027865073, + "grad_norm": 0.3157682920565722, + "learning_rate": 7.355565157377921e-05, + "loss": 2.7784, + "step": 26384 + }, + { + "epoch": 1.2284144609726004, + "grad_norm": 0.38076847630682376, + "learning_rate": 7.355326223624459e-05, + "loss": 2.775, + "step": 26385 + }, + { + "epoch": 1.2284610191586935, + "grad_norm": 0.3171785274432278, + "learning_rate": 7.355087282958346e-05, + "loss": 2.7634, + "step": 26386 + }, + { + "epoch": 1.2285075773447867, + "grad_norm": 0.34420665467887124, + "learning_rate": 7.354848335380287e-05, + "loss": 2.785, + "step": 26387 + }, + { + "epoch": 1.2285541355308798, + "grad_norm": 0.3247025058147308, + "learning_rate": 7.354609380890978e-05, + "loss": 2.8731, + "step": 26388 + }, + { + "epoch": 1.228600693716973, + "grad_norm": 0.3577694926052742, + "learning_rate": 7.354370419491122e-05, + "loss": 2.698, + "step": 26389 + }, + { + "epoch": 
1.2286472519030658, + "grad_norm": 0.3584113597370584, + "learning_rate": 7.354131451181421e-05, + "loss": 2.7175, + "step": 26390 + }, + { + "epoch": 1.228693810089159, + "grad_norm": 0.3509496643898557, + "learning_rate": 7.353892475962577e-05, + "loss": 2.7083, + "step": 26391 + }, + { + "epoch": 1.228740368275252, + "grad_norm": 0.3315210485605024, + "learning_rate": 7.35365349383529e-05, + "loss": 2.7759, + "step": 26392 + }, + { + "epoch": 1.2287869264613451, + "grad_norm": 0.3747549782561222, + "learning_rate": 7.353414504800263e-05, + "loss": 2.8102, + "step": 26393 + }, + { + "epoch": 1.228833484647438, + "grad_norm": 0.3066078176729401, + "learning_rate": 7.353175508858195e-05, + "loss": 2.841, + "step": 26394 + }, + { + "epoch": 1.2288800428335311, + "grad_norm": 0.37094693408773316, + "learning_rate": 7.35293650600979e-05, + "loss": 2.7248, + "step": 26395 + }, + { + "epoch": 1.2289266010196243, + "grad_norm": 0.3306099180858847, + "learning_rate": 7.352697496255747e-05, + "loss": 2.7925, + "step": 26396 + }, + { + "epoch": 1.2289731592057174, + "grad_norm": 0.31769456826872916, + "learning_rate": 7.352458479596767e-05, + "loss": 2.7474, + "step": 26397 + }, + { + "epoch": 1.2290197173918105, + "grad_norm": 0.30729187983426226, + "learning_rate": 7.352219456033554e-05, + "loss": 2.7886, + "step": 26398 + }, + { + "epoch": 1.2290662755779036, + "grad_norm": 0.3336871810181144, + "learning_rate": 7.35198042556681e-05, + "loss": 2.866, + "step": 26399 + }, + { + "epoch": 1.2291128337639965, + "grad_norm": 0.33998872252823886, + "learning_rate": 7.351741388197235e-05, + "loss": 2.8622, + "step": 26400 + }, + { + "epoch": 1.2291593919500896, + "grad_norm": 0.3387369515092536, + "learning_rate": 7.35150234392553e-05, + "loss": 2.7249, + "step": 26401 + }, + { + "epoch": 1.2292059501361827, + "grad_norm": 0.36688590922675646, + "learning_rate": 7.351263292752397e-05, + "loss": 2.7957, + "step": 26402 + }, + { + "epoch": 1.2292525083222758, + "grad_norm": 0.31568253533593166, + "learning_rate": 7.351024234678539e-05, + "loss": 2.7513, + "step": 26403 + }, + { + "epoch": 1.2292990665083687, + "grad_norm": 0.37618924406449394, + "learning_rate": 7.350785169704657e-05, + "loss": 2.741, + "step": 26404 + }, + { + "epoch": 1.2293456246944618, + "grad_norm": 0.3539347901228596, + "learning_rate": 7.35054609783145e-05, + "loss": 2.866, + "step": 26405 + }, + { + "epoch": 1.229392182880555, + "grad_norm": 0.36391368337081537, + "learning_rate": 7.350307019059623e-05, + "loss": 2.8176, + "step": 26406 + }, + { + "epoch": 1.229438741066648, + "grad_norm": 0.3251617565818432, + "learning_rate": 7.350067933389877e-05, + "loss": 2.8021, + "step": 26407 + }, + { + "epoch": 1.2294852992527412, + "grad_norm": 0.34870807452173275, + "learning_rate": 7.349828840822912e-05, + "loss": 2.7642, + "step": 26408 + }, + { + "epoch": 1.229531857438834, + "grad_norm": 0.3384930941751885, + "learning_rate": 7.349589741359432e-05, + "loss": 2.7441, + "step": 26409 + }, + { + "epoch": 1.2295784156249272, + "grad_norm": 0.38603568800501553, + "learning_rate": 7.349350635000137e-05, + "loss": 2.7925, + "step": 26410 + }, + { + "epoch": 1.2296249738110203, + "grad_norm": 0.33406184046370946, + "learning_rate": 7.349111521745728e-05, + "loss": 2.7924, + "step": 26411 + }, + { + "epoch": 1.2296715319971134, + "grad_norm": 0.39941695409512246, + "learning_rate": 7.348872401596908e-05, + "loss": 2.7786, + "step": 26412 + }, + { + "epoch": 1.2297180901832065, + "grad_norm": 0.3545322809465295, + "learning_rate": 
7.348633274554381e-05, + "loss": 2.7443, + "step": 26413 + }, + { + "epoch": 1.2297646483692994, + "grad_norm": 0.36548942482195074, + "learning_rate": 7.348394140618846e-05, + "loss": 2.668, + "step": 26414 + }, + { + "epoch": 1.2298112065553926, + "grad_norm": 0.32936867901994604, + "learning_rate": 7.348154999791006e-05, + "loss": 2.8211, + "step": 26415 + }, + { + "epoch": 1.2298577647414857, + "grad_norm": 0.34360846631850367, + "learning_rate": 7.347915852071562e-05, + "loss": 2.8106, + "step": 26416 + }, + { + "epoch": 1.2299043229275788, + "grad_norm": 0.3384325901563401, + "learning_rate": 7.347676697461217e-05, + "loss": 2.792, + "step": 26417 + }, + { + "epoch": 1.229950881113672, + "grad_norm": 0.3185952079404803, + "learning_rate": 7.34743753596067e-05, + "loss": 2.7484, + "step": 26418 + }, + { + "epoch": 1.2299974392997648, + "grad_norm": 0.33979517520053826, + "learning_rate": 7.347198367570627e-05, + "loss": 2.8096, + "step": 26419 + }, + { + "epoch": 1.230043997485858, + "grad_norm": 0.33742244403985516, + "learning_rate": 7.346959192291788e-05, + "loss": 2.6592, + "step": 26420 + }, + { + "epoch": 1.230090555671951, + "grad_norm": 0.3280330796485139, + "learning_rate": 7.346720010124854e-05, + "loss": 2.7959, + "step": 26421 + }, + { + "epoch": 1.2301371138580441, + "grad_norm": 0.3523564949622916, + "learning_rate": 7.346480821070528e-05, + "loss": 2.8339, + "step": 26422 + }, + { + "epoch": 1.2301836720441373, + "grad_norm": 0.31636462496523476, + "learning_rate": 7.346241625129513e-05, + "loss": 2.7522, + "step": 26423 + }, + { + "epoch": 1.2302302302302301, + "grad_norm": 0.33382846561182505, + "learning_rate": 7.34600242230251e-05, + "loss": 2.8298, + "step": 26424 + }, + { + "epoch": 1.2302767884163233, + "grad_norm": 0.3152839660241687, + "learning_rate": 7.345763212590218e-05, + "loss": 2.7214, + "step": 26425 + }, + { + "epoch": 1.2303233466024164, + "grad_norm": 0.31988683716382604, + "learning_rate": 7.345523995993345e-05, + "loss": 2.6945, + "step": 26426 + }, + { + "epoch": 1.2303699047885095, + "grad_norm": 0.30554209310104113, + "learning_rate": 7.345284772512591e-05, + "loss": 2.7542, + "step": 26427 + }, + { + "epoch": 1.2304164629746026, + "grad_norm": 0.3407029397898617, + "learning_rate": 7.345045542148656e-05, + "loss": 2.8876, + "step": 26428 + }, + { + "epoch": 1.2304630211606955, + "grad_norm": 0.3168588189723538, + "learning_rate": 7.344806304902242e-05, + "loss": 2.7613, + "step": 26429 + }, + { + "epoch": 1.2305095793467886, + "grad_norm": 0.33133943347371964, + "learning_rate": 7.344567060774055e-05, + "loss": 2.7612, + "step": 26430 + }, + { + "epoch": 1.2305561375328817, + "grad_norm": 0.3477804553318822, + "learning_rate": 7.344327809764793e-05, + "loss": 2.8438, + "step": 26431 + }, + { + "epoch": 1.2306026957189748, + "grad_norm": 0.36649980427424184, + "learning_rate": 7.34408855187516e-05, + "loss": 2.8153, + "step": 26432 + }, + { + "epoch": 1.2306492539050677, + "grad_norm": 0.34239186203320693, + "learning_rate": 7.343849287105858e-05, + "loss": 2.7547, + "step": 26433 + }, + { + "epoch": 1.2306958120911609, + "grad_norm": 0.33039213941822415, + "learning_rate": 7.34361001545759e-05, + "loss": 2.7254, + "step": 26434 + }, + { + "epoch": 1.230742370277254, + "grad_norm": 0.35382208172864055, + "learning_rate": 7.343370736931056e-05, + "loss": 2.812, + "step": 26435 + }, + { + "epoch": 1.230788928463347, + "grad_norm": 0.3362767277850259, + "learning_rate": 7.343131451526962e-05, + "loss": 2.7272, + "step": 26436 + }, + { + "epoch": 
1.2308354866494402, + "grad_norm": 0.3542509415374562, + "learning_rate": 7.342892159246005e-05, + "loss": 2.836, + "step": 26437 + }, + { + "epoch": 1.2308820448355333, + "grad_norm": 0.3506326058443512, + "learning_rate": 7.342652860088892e-05, + "loss": 2.8363, + "step": 26438 + }, + { + "epoch": 1.2309286030216262, + "grad_norm": 0.3421626406775728, + "learning_rate": 7.342413554056324e-05, + "loss": 2.6647, + "step": 26439 + }, + { + "epoch": 1.2309751612077193, + "grad_norm": 0.33932717305874965, + "learning_rate": 7.342174241149002e-05, + "loss": 2.6689, + "step": 26440 + }, + { + "epoch": 1.2310217193938124, + "grad_norm": 0.35474103569702, + "learning_rate": 7.34193492136763e-05, + "loss": 2.7421, + "step": 26441 + }, + { + "epoch": 1.2310682775799056, + "grad_norm": 0.34988700445258636, + "learning_rate": 7.341695594712909e-05, + "loss": 2.7836, + "step": 26442 + }, + { + "epoch": 1.2311148357659984, + "grad_norm": 0.34555010885130966, + "learning_rate": 7.341456261185544e-05, + "loss": 2.8167, + "step": 26443 + }, + { + "epoch": 1.2311613939520916, + "grad_norm": 0.3740154446845194, + "learning_rate": 7.341216920786234e-05, + "loss": 2.8198, + "step": 26444 + }, + { + "epoch": 1.2312079521381847, + "grad_norm": 0.34728046754993086, + "learning_rate": 7.340977573515684e-05, + "loss": 2.796, + "step": 26445 + }, + { + "epoch": 1.2312545103242778, + "grad_norm": 0.3297718658279498, + "learning_rate": 7.340738219374595e-05, + "loss": 2.7625, + "step": 26446 + }, + { + "epoch": 1.231301068510371, + "grad_norm": 0.3591700797656742, + "learning_rate": 7.34049885836367e-05, + "loss": 2.7922, + "step": 26447 + }, + { + "epoch": 1.231347626696464, + "grad_norm": 0.3120432065526835, + "learning_rate": 7.34025949048361e-05, + "loss": 2.69, + "step": 26448 + }, + { + "epoch": 1.231394184882557, + "grad_norm": 0.3352608760146322, + "learning_rate": 7.34002011573512e-05, + "loss": 2.7313, + "step": 26449 + }, + { + "epoch": 1.23144074306865, + "grad_norm": 0.35167818106737153, + "learning_rate": 7.339780734118903e-05, + "loss": 2.751, + "step": 26450 + }, + { + "epoch": 1.2314873012547431, + "grad_norm": 0.30622655047164976, + "learning_rate": 7.339541345635658e-05, + "loss": 2.6874, + "step": 26451 + }, + { + "epoch": 1.2315338594408363, + "grad_norm": 0.35610600717503854, + "learning_rate": 7.339301950286091e-05, + "loss": 2.7102, + "step": 26452 + }, + { + "epoch": 1.2315804176269292, + "grad_norm": 0.29995873145863367, + "learning_rate": 7.3390625480709e-05, + "loss": 2.819, + "step": 26453 + }, + { + "epoch": 1.2316269758130223, + "grad_norm": 0.35948497432622833, + "learning_rate": 7.338823138990796e-05, + "loss": 2.8372, + "step": 26454 + }, + { + "epoch": 1.2316735339991154, + "grad_norm": 0.3292918437481381, + "learning_rate": 7.338583723046473e-05, + "loss": 2.8675, + "step": 26455 + }, + { + "epoch": 1.2317200921852085, + "grad_norm": 0.3402107047780877, + "learning_rate": 7.338344300238639e-05, + "loss": 2.7778, + "step": 26456 + }, + { + "epoch": 1.2317666503713016, + "grad_norm": 0.33218782105946554, + "learning_rate": 7.338104870567993e-05, + "loss": 2.8568, + "step": 26457 + }, + { + "epoch": 1.2318132085573945, + "grad_norm": 0.37028928123831867, + "learning_rate": 7.337865434035241e-05, + "loss": 2.8295, + "step": 26458 + }, + { + "epoch": 1.2318597667434876, + "grad_norm": 0.3694375919021222, + "learning_rate": 7.337625990641084e-05, + "loss": 2.8488, + "step": 26459 + }, + { + "epoch": 1.2319063249295807, + "grad_norm": 0.3349690309960983, + "learning_rate": 
7.337386540386224e-05, + "loss": 2.7324, + "step": 26460 + }, + { + "epoch": 1.2319528831156739, + "grad_norm": 0.35242426023243223, + "learning_rate": 7.337147083271365e-05, + "loss": 2.8199, + "step": 26461 + }, + { + "epoch": 1.231999441301767, + "grad_norm": 0.3322443805862409, + "learning_rate": 7.336907619297209e-05, + "loss": 2.7842, + "step": 26462 + }, + { + "epoch": 1.2320459994878599, + "grad_norm": 0.32749901801708914, + "learning_rate": 7.336668148464462e-05, + "loss": 2.7237, + "step": 26463 + }, + { + "epoch": 1.232092557673953, + "grad_norm": 0.3199777184176636, + "learning_rate": 7.336428670773822e-05, + "loss": 2.686, + "step": 26464 + }, + { + "epoch": 1.232139115860046, + "grad_norm": 0.3289751555789801, + "learning_rate": 7.336189186225995e-05, + "loss": 2.7601, + "step": 26465 + }, + { + "epoch": 1.2321856740461392, + "grad_norm": 0.337109366387213, + "learning_rate": 7.335949694821682e-05, + "loss": 2.7604, + "step": 26466 + }, + { + "epoch": 1.2322322322322323, + "grad_norm": 0.32210525757217867, + "learning_rate": 7.335710196561588e-05, + "loss": 2.9491, + "step": 26467 + }, + { + "epoch": 1.2322787904183252, + "grad_norm": 0.3466180281801223, + "learning_rate": 7.335470691446413e-05, + "loss": 2.8543, + "step": 26468 + }, + { + "epoch": 1.2323253486044183, + "grad_norm": 0.31001694316537265, + "learning_rate": 7.335231179476863e-05, + "loss": 2.773, + "step": 26469 + }, + { + "epoch": 1.2323719067905115, + "grad_norm": 0.35035910453543473, + "learning_rate": 7.334991660653639e-05, + "loss": 2.7663, + "step": 26470 + }, + { + "epoch": 1.2324184649766046, + "grad_norm": 0.3305521857724238, + "learning_rate": 7.334752134977446e-05, + "loss": 2.9295, + "step": 26471 + }, + { + "epoch": 1.2324650231626975, + "grad_norm": 0.3305774692498397, + "learning_rate": 7.334512602448984e-05, + "loss": 2.7399, + "step": 26472 + }, + { + "epoch": 1.2325115813487906, + "grad_norm": 0.3113219161239593, + "learning_rate": 7.334273063068957e-05, + "loss": 2.8097, + "step": 26473 + }, + { + "epoch": 1.2325581395348837, + "grad_norm": 0.32110508168110746, + "learning_rate": 7.334033516838069e-05, + "loss": 2.8633, + "step": 26474 + }, + { + "epoch": 1.2326046977209768, + "grad_norm": 0.3495834340377626, + "learning_rate": 7.333793963757023e-05, + "loss": 2.8379, + "step": 26475 + }, + { + "epoch": 1.23265125590707, + "grad_norm": 0.34318492158229197, + "learning_rate": 7.33355440382652e-05, + "loss": 2.6896, + "step": 26476 + }, + { + "epoch": 1.232697814093163, + "grad_norm": 0.3197923519295422, + "learning_rate": 7.333314837047267e-05, + "loss": 2.8045, + "step": 26477 + }, + { + "epoch": 1.232744372279256, + "grad_norm": 0.31963282724229175, + "learning_rate": 7.333075263419964e-05, + "loss": 2.8099, + "step": 26478 + }, + { + "epoch": 1.232790930465349, + "grad_norm": 0.35579241066061157, + "learning_rate": 7.332835682945315e-05, + "loss": 2.8186, + "step": 26479 + }, + { + "epoch": 1.2328374886514422, + "grad_norm": 0.33181453125428634, + "learning_rate": 7.332596095624022e-05, + "loss": 2.8618, + "step": 26480 + }, + { + "epoch": 1.2328840468375353, + "grad_norm": 0.3060799350511398, + "learning_rate": 7.33235650145679e-05, + "loss": 2.7882, + "step": 26481 + }, + { + "epoch": 1.2329306050236282, + "grad_norm": 0.3071467576584334, + "learning_rate": 7.332116900444324e-05, + "loss": 2.8134, + "step": 26482 + }, + { + "epoch": 1.2329771632097213, + "grad_norm": 0.32421012987298337, + "learning_rate": 7.331877292587321e-05, + "loss": 2.8034, + "step": 26483 + }, + { + "epoch": 
1.2330237213958144, + "grad_norm": 0.3257444298828531, + "learning_rate": 7.331637677886489e-05, + "loss": 2.788, + "step": 26484 + }, + { + "epoch": 1.2330702795819075, + "grad_norm": 0.3373376495902308, + "learning_rate": 7.33139805634253e-05, + "loss": 2.8317, + "step": 26485 + }, + { + "epoch": 1.2331168377680006, + "grad_norm": 0.3206605061843419, + "learning_rate": 7.331158427956149e-05, + "loss": 2.8157, + "step": 26486 + }, + { + "epoch": 1.2331633959540937, + "grad_norm": 0.35801000237774727, + "learning_rate": 7.330918792728045e-05, + "loss": 2.7287, + "step": 26487 + }, + { + "epoch": 1.2332099541401866, + "grad_norm": 0.31426727132778254, + "learning_rate": 7.330679150658925e-05, + "loss": 2.742, + "step": 26488 + }, + { + "epoch": 1.2332565123262798, + "grad_norm": 0.3438523838069115, + "learning_rate": 7.330439501749493e-05, + "loss": 2.7381, + "step": 26489 + }, + { + "epoch": 1.2333030705123729, + "grad_norm": 0.3098268549336871, + "learning_rate": 7.330199846000449e-05, + "loss": 2.7515, + "step": 26490 + }, + { + "epoch": 1.233349628698466, + "grad_norm": 0.31594180108307746, + "learning_rate": 7.329960183412498e-05, + "loss": 2.8741, + "step": 26491 + }, + { + "epoch": 1.2333961868845589, + "grad_norm": 0.32981087828769295, + "learning_rate": 7.329720513986343e-05, + "loss": 2.8624, + "step": 26492 + }, + { + "epoch": 1.233442745070652, + "grad_norm": 0.34500055166313043, + "learning_rate": 7.32948083772269e-05, + "loss": 2.8354, + "step": 26493 + }, + { + "epoch": 1.233489303256745, + "grad_norm": 0.3457771076929976, + "learning_rate": 7.329241154622237e-05, + "loss": 2.8048, + "step": 26494 + }, + { + "epoch": 1.2335358614428382, + "grad_norm": 0.3216048766964142, + "learning_rate": 7.329001464685694e-05, + "loss": 2.8134, + "step": 26495 + }, + { + "epoch": 1.2335824196289313, + "grad_norm": 0.34244787346961564, + "learning_rate": 7.328761767913759e-05, + "loss": 2.7676, + "step": 26496 + }, + { + "epoch": 1.2336289778150242, + "grad_norm": 0.3332729492810748, + "learning_rate": 7.328522064307139e-05, + "loss": 2.8389, + "step": 26497 + }, + { + "epoch": 1.2336755360011173, + "grad_norm": 0.35435242791869137, + "learning_rate": 7.328282353866533e-05, + "loss": 2.7674, + "step": 26498 + }, + { + "epoch": 1.2337220941872105, + "grad_norm": 0.33111832754944137, + "learning_rate": 7.328042636592651e-05, + "loss": 2.8801, + "step": 26499 + }, + { + "epoch": 1.2337686523733036, + "grad_norm": 0.3531823014495089, + "learning_rate": 7.327802912486192e-05, + "loss": 2.7912, + "step": 26500 + }, + { + "epoch": 1.2338152105593967, + "grad_norm": 0.390304927438083, + "learning_rate": 7.327563181547862e-05, + "loss": 2.6581, + "step": 26501 + }, + { + "epoch": 1.2338617687454896, + "grad_norm": 0.3931902070008918, + "learning_rate": 7.327323443778361e-05, + "loss": 3.001, + "step": 26502 + }, + { + "epoch": 1.2339083269315827, + "grad_norm": 0.3520670513849223, + "learning_rate": 7.327083699178397e-05, + "loss": 2.7075, + "step": 26503 + }, + { + "epoch": 1.2339548851176758, + "grad_norm": 0.3535805504419311, + "learning_rate": 7.326843947748671e-05, + "loss": 2.7359, + "step": 26504 + }, + { + "epoch": 1.234001443303769, + "grad_norm": 0.32329990150092314, + "learning_rate": 7.326604189489887e-05, + "loss": 2.7422, + "step": 26505 + }, + { + "epoch": 1.234048001489862, + "grad_norm": 0.33569935547982305, + "learning_rate": 7.326364424402749e-05, + "loss": 2.7558, + "step": 26506 + }, + { + "epoch": 1.234094559675955, + "grad_norm": 0.34196792592339853, + "learning_rate": 
7.326124652487961e-05, + "loss": 2.7553, + "step": 26507 + }, + { + "epoch": 1.234141117862048, + "grad_norm": 0.30786988335075527, + "learning_rate": 7.325884873746226e-05, + "loss": 2.7671, + "step": 26508 + }, + { + "epoch": 1.2341876760481412, + "grad_norm": 0.3424266328773402, + "learning_rate": 7.325645088178248e-05, + "loss": 2.7784, + "step": 26509 + }, + { + "epoch": 1.2342342342342343, + "grad_norm": 0.337564591896823, + "learning_rate": 7.325405295784731e-05, + "loss": 2.8208, + "step": 26510 + }, + { + "epoch": 1.2342807924203274, + "grad_norm": 0.3425388725425636, + "learning_rate": 7.325165496566377e-05, + "loss": 2.7802, + "step": 26511 + }, + { + "epoch": 1.2343273506064203, + "grad_norm": 0.33945135085659633, + "learning_rate": 7.324925690523893e-05, + "loss": 2.7097, + "step": 26512 + }, + { + "epoch": 1.2343739087925134, + "grad_norm": 0.3089121669664406, + "learning_rate": 7.324685877657982e-05, + "loss": 2.7732, + "step": 26513 + }, + { + "epoch": 1.2344204669786065, + "grad_norm": 0.3261683770767788, + "learning_rate": 7.324446057969345e-05, + "loss": 2.8143, + "step": 26514 + }, + { + "epoch": 1.2344670251646996, + "grad_norm": 0.336529837082746, + "learning_rate": 7.32420623145869e-05, + "loss": 2.6851, + "step": 26515 + }, + { + "epoch": 1.2345135833507928, + "grad_norm": 0.34834832571767915, + "learning_rate": 7.323966398126715e-05, + "loss": 2.8247, + "step": 26516 + }, + { + "epoch": 1.2345601415368856, + "grad_norm": 0.3382568442395232, + "learning_rate": 7.323726557974131e-05, + "loss": 2.9063, + "step": 26517 + }, + { + "epoch": 1.2346066997229788, + "grad_norm": 0.3392427011065727, + "learning_rate": 7.323486711001637e-05, + "loss": 2.8659, + "step": 26518 + }, + { + "epoch": 1.2346532579090719, + "grad_norm": 0.3492849480295111, + "learning_rate": 7.32324685720994e-05, + "loss": 2.7412, + "step": 26519 + }, + { + "epoch": 1.234699816095165, + "grad_norm": 0.3344306529107795, + "learning_rate": 7.323006996599741e-05, + "loss": 2.6749, + "step": 26520 + }, + { + "epoch": 1.2347463742812579, + "grad_norm": 0.3249712802745267, + "learning_rate": 7.322767129171745e-05, + "loss": 2.7785, + "step": 26521 + }, + { + "epoch": 1.234792932467351, + "grad_norm": 0.36196542034608903, + "learning_rate": 7.322527254926657e-05, + "loss": 2.7616, + "step": 26522 + }, + { + "epoch": 1.2348394906534441, + "grad_norm": 0.33193981082086765, + "learning_rate": 7.32228737386518e-05, + "loss": 2.6777, + "step": 26523 + }, + { + "epoch": 1.2348860488395372, + "grad_norm": 0.3369400228592982, + "learning_rate": 7.322047485988017e-05, + "loss": 2.801, + "step": 26524 + }, + { + "epoch": 1.2349326070256303, + "grad_norm": 0.3533010565477624, + "learning_rate": 7.321807591295876e-05, + "loss": 2.8099, + "step": 26525 + }, + { + "epoch": 1.2349791652117235, + "grad_norm": 0.33715433282063345, + "learning_rate": 7.321567689789457e-05, + "loss": 2.799, + "step": 26526 + }, + { + "epoch": 1.2350257233978164, + "grad_norm": 0.33179813706246475, + "learning_rate": 7.321327781469466e-05, + "loss": 2.8725, + "step": 26527 + }, + { + "epoch": 1.2350722815839095, + "grad_norm": 0.30864541784093846, + "learning_rate": 7.321087866336605e-05, + "loss": 2.8004, + "step": 26528 + }, + { + "epoch": 1.2351188397700026, + "grad_norm": 0.3513243437365929, + "learning_rate": 7.320847944391581e-05, + "loss": 2.7335, + "step": 26529 + }, + { + "epoch": 1.2351653979560957, + "grad_norm": 0.3115067283398685, + "learning_rate": 7.320608015635097e-05, + "loss": 2.7756, + "step": 26530 + }, + { + "epoch": 
1.2352119561421886, + "grad_norm": 0.3292276180261853, + "learning_rate": 7.320368080067859e-05, + "loss": 2.8034, + "step": 26531 + }, + { + "epoch": 1.2352585143282817, + "grad_norm": 0.3159072014762306, + "learning_rate": 7.320128137690567e-05, + "loss": 2.8003, + "step": 26532 + }, + { + "epoch": 1.2353050725143748, + "grad_norm": 0.3553201857530531, + "learning_rate": 7.319888188503928e-05, + "loss": 2.8307, + "step": 26533 + }, + { + "epoch": 1.235351630700468, + "grad_norm": 0.3047796073731296, + "learning_rate": 7.319648232508645e-05, + "loss": 2.8189, + "step": 26534 + }, + { + "epoch": 1.235398188886561, + "grad_norm": 0.358775181803215, + "learning_rate": 7.319408269705425e-05, + "loss": 2.7799, + "step": 26535 + }, + { + "epoch": 1.2354447470726542, + "grad_norm": 0.329040146538899, + "learning_rate": 7.319168300094969e-05, + "loss": 2.857, + "step": 26536 + }, + { + "epoch": 1.235491305258747, + "grad_norm": 0.3338275150823478, + "learning_rate": 7.318928323677982e-05, + "loss": 2.8478, + "step": 26537 + }, + { + "epoch": 1.2355378634448402, + "grad_norm": 0.3236552271716981, + "learning_rate": 7.318688340455169e-05, + "loss": 2.8298, + "step": 26538 + }, + { + "epoch": 1.2355844216309333, + "grad_norm": 0.330986240659591, + "learning_rate": 7.318448350427234e-05, + "loss": 2.8307, + "step": 26539 + }, + { + "epoch": 1.2356309798170264, + "grad_norm": 0.337625633802003, + "learning_rate": 7.318208353594882e-05, + "loss": 2.7113, + "step": 26540 + }, + { + "epoch": 1.2356775380031193, + "grad_norm": 0.3082434693471409, + "learning_rate": 7.317968349958817e-05, + "loss": 2.7201, + "step": 26541 + }, + { + "epoch": 1.2357240961892124, + "grad_norm": 0.3478503003971455, + "learning_rate": 7.317728339519741e-05, + "loss": 2.8208, + "step": 26542 + }, + { + "epoch": 1.2357706543753055, + "grad_norm": 0.3454701397848708, + "learning_rate": 7.317488322278363e-05, + "loss": 2.6787, + "step": 26543 + }, + { + "epoch": 1.2358172125613986, + "grad_norm": 0.3136595515626396, + "learning_rate": 7.317248298235387e-05, + "loss": 2.728, + "step": 26544 + }, + { + "epoch": 1.2358637707474918, + "grad_norm": 0.3384018876855973, + "learning_rate": 7.317008267391512e-05, + "loss": 2.6855, + "step": 26545 + }, + { + "epoch": 1.2359103289335847, + "grad_norm": 0.30133568439434943, + "learning_rate": 7.316768229747447e-05, + "loss": 2.6538, + "step": 26546 + }, + { + "epoch": 1.2359568871196778, + "grad_norm": 0.35589346874345296, + "learning_rate": 7.316528185303896e-05, + "loss": 2.8139, + "step": 26547 + }, + { + "epoch": 1.2360034453057709, + "grad_norm": 0.3311488725830214, + "learning_rate": 7.316288134061562e-05, + "loss": 2.8049, + "step": 26548 + }, + { + "epoch": 1.236050003491864, + "grad_norm": 0.34885063814879774, + "learning_rate": 7.316048076021152e-05, + "loss": 2.7684, + "step": 26549 + }, + { + "epoch": 1.2360965616779571, + "grad_norm": 0.3305218896356596, + "learning_rate": 7.315808011183368e-05, + "loss": 2.7494, + "step": 26550 + }, + { + "epoch": 1.23614311986405, + "grad_norm": 0.3580860024337563, + "learning_rate": 7.315567939548917e-05, + "loss": 2.8754, + "step": 26551 + }, + { + "epoch": 1.2361896780501431, + "grad_norm": 0.3352743819534538, + "learning_rate": 7.3153278611185e-05, + "loss": 2.6682, + "step": 26552 + }, + { + "epoch": 1.2362362362362362, + "grad_norm": 0.34386892425785626, + "learning_rate": 7.315087775892826e-05, + "loss": 2.9546, + "step": 26553 + }, + { + "epoch": 1.2362827944223294, + "grad_norm": 0.37021025560135296, + "learning_rate": 
7.314847683872596e-05, + "loss": 2.8111, + "step": 26554 + }, + { + "epoch": 1.2363293526084225, + "grad_norm": 0.3206906469243316, + "learning_rate": 7.314607585058517e-05, + "loss": 2.9118, + "step": 26555 + }, + { + "epoch": 1.2363759107945154, + "grad_norm": 0.36148036135546613, + "learning_rate": 7.314367479451291e-05, + "loss": 2.8247, + "step": 26556 + }, + { + "epoch": 1.2364224689806085, + "grad_norm": 0.3061494644530512, + "learning_rate": 7.314127367051626e-05, + "loss": 2.7762, + "step": 26557 + }, + { + "epoch": 1.2364690271667016, + "grad_norm": 0.33771056084823425, + "learning_rate": 7.313887247860225e-05, + "loss": 2.8421, + "step": 26558 + }, + { + "epoch": 1.2365155853527947, + "grad_norm": 0.3399627760009363, + "learning_rate": 7.313647121877793e-05, + "loss": 2.7469, + "step": 26559 + }, + { + "epoch": 1.2365621435388876, + "grad_norm": 0.3412442365518263, + "learning_rate": 7.313406989105034e-05, + "loss": 2.6769, + "step": 26560 + }, + { + "epoch": 1.2366087017249807, + "grad_norm": 0.31528580158741176, + "learning_rate": 7.313166849542654e-05, + "loss": 2.7756, + "step": 26561 + }, + { + "epoch": 1.2366552599110738, + "grad_norm": 0.3483273728072096, + "learning_rate": 7.312926703191356e-05, + "loss": 2.7247, + "step": 26562 + }, + { + "epoch": 1.236701818097167, + "grad_norm": 0.3414363310827488, + "learning_rate": 7.312686550051848e-05, + "loss": 2.7644, + "step": 26563 + }, + { + "epoch": 1.23674837628326, + "grad_norm": 0.3490755722918831, + "learning_rate": 7.312446390124832e-05, + "loss": 2.7346, + "step": 26564 + }, + { + "epoch": 1.2367949344693532, + "grad_norm": 0.33149096832050834, + "learning_rate": 7.312206223411013e-05, + "loss": 2.781, + "step": 26565 + }, + { + "epoch": 1.236841492655446, + "grad_norm": 0.35761641348149276, + "learning_rate": 7.311966049911096e-05, + "loss": 2.7646, + "step": 26566 + }, + { + "epoch": 1.2368880508415392, + "grad_norm": 0.3462341338613798, + "learning_rate": 7.311725869625788e-05, + "loss": 2.8814, + "step": 26567 + }, + { + "epoch": 1.2369346090276323, + "grad_norm": 0.3264078032335543, + "learning_rate": 7.311485682555792e-05, + "loss": 2.7957, + "step": 26568 + }, + { + "epoch": 1.2369811672137254, + "grad_norm": 0.3281626286422198, + "learning_rate": 7.311245488701812e-05, + "loss": 2.6522, + "step": 26569 + }, + { + "epoch": 1.2370277253998183, + "grad_norm": 0.3219368330260232, + "learning_rate": 7.311005288064556e-05, + "loss": 2.748, + "step": 26570 + }, + { + "epoch": 1.2370742835859114, + "grad_norm": 0.31279930036374143, + "learning_rate": 7.310765080644726e-05, + "loss": 2.8367, + "step": 26571 + }, + { + "epoch": 1.2371208417720045, + "grad_norm": 0.36735727820378156, + "learning_rate": 7.310524866443029e-05, + "loss": 2.7439, + "step": 26572 + }, + { + "epoch": 1.2371673999580977, + "grad_norm": 0.3188232472691799, + "learning_rate": 7.310284645460169e-05, + "loss": 2.7175, + "step": 26573 + }, + { + "epoch": 1.2372139581441908, + "grad_norm": 0.35390788227672604, + "learning_rate": 7.310044417696851e-05, + "loss": 2.8115, + "step": 26574 + }, + { + "epoch": 1.2372605163302839, + "grad_norm": 0.3440191997883829, + "learning_rate": 7.309804183153778e-05, + "loss": 2.7547, + "step": 26575 + }, + { + "epoch": 1.2373070745163768, + "grad_norm": 0.3344606227131343, + "learning_rate": 7.309563941831661e-05, + "loss": 2.8825, + "step": 26576 + }, + { + "epoch": 1.23735363270247, + "grad_norm": 0.3333060988945619, + "learning_rate": 7.309323693731199e-05, + "loss": 2.7338, + "step": 26577 + }, + { + "epoch": 
1.237400190888563, + "grad_norm": 0.3362446856002507, + "learning_rate": 7.3090834388531e-05, + "loss": 2.7355, + "step": 26578 + }, + { + "epoch": 1.2374467490746561, + "grad_norm": 0.32832180334393507, + "learning_rate": 7.308843177198068e-05, + "loss": 2.8481, + "step": 26579 + }, + { + "epoch": 1.237493307260749, + "grad_norm": 0.33289763866565275, + "learning_rate": 7.308602908766809e-05, + "loss": 2.7578, + "step": 26580 + }, + { + "epoch": 1.2375398654468421, + "grad_norm": 0.34189449509894987, + "learning_rate": 7.308362633560029e-05, + "loss": 2.7841, + "step": 26581 + }, + { + "epoch": 1.2375864236329353, + "grad_norm": 0.3253524728382024, + "learning_rate": 7.308122351578432e-05, + "loss": 2.8268, + "step": 26582 + }, + { + "epoch": 1.2376329818190284, + "grad_norm": 0.3311921320974665, + "learning_rate": 7.307882062822722e-05, + "loss": 2.8458, + "step": 26583 + }, + { + "epoch": 1.2376795400051215, + "grad_norm": 0.34921906130482355, + "learning_rate": 7.307641767293606e-05, + "loss": 2.8656, + "step": 26584 + }, + { + "epoch": 1.2377260981912144, + "grad_norm": 0.31450561417085027, + "learning_rate": 7.30740146499179e-05, + "loss": 2.801, + "step": 26585 + }, + { + "epoch": 1.2377726563773075, + "grad_norm": 0.3278579073730851, + "learning_rate": 7.307161155917977e-05, + "loss": 2.6404, + "step": 26586 + }, + { + "epoch": 1.2378192145634006, + "grad_norm": 0.29722407429494657, + "learning_rate": 7.306920840072872e-05, + "loss": 2.7833, + "step": 26587 + }, + { + "epoch": 1.2378657727494937, + "grad_norm": 0.3448848387231781, + "learning_rate": 7.306680517457183e-05, + "loss": 2.7643, + "step": 26588 + }, + { + "epoch": 1.2379123309355868, + "grad_norm": 0.3226997175031019, + "learning_rate": 7.306440188071613e-05, + "loss": 2.7663, + "step": 26589 + }, + { + "epoch": 1.2379588891216797, + "grad_norm": 0.3217292881155583, + "learning_rate": 7.30619985191687e-05, + "loss": 2.77, + "step": 26590 + }, + { + "epoch": 1.2380054473077728, + "grad_norm": 0.3442244290383469, + "learning_rate": 7.305959508993656e-05, + "loss": 2.8461, + "step": 26591 + }, + { + "epoch": 1.238052005493866, + "grad_norm": 0.33371689816499234, + "learning_rate": 7.305719159302679e-05, + "loss": 2.8073, + "step": 26592 + }, + { + "epoch": 1.238098563679959, + "grad_norm": 0.31530262795743236, + "learning_rate": 7.305478802844643e-05, + "loss": 2.7569, + "step": 26593 + }, + { + "epoch": 1.2381451218660522, + "grad_norm": 0.32632742708424683, + "learning_rate": 7.305238439620255e-05, + "loss": 2.7591, + "step": 26594 + }, + { + "epoch": 1.238191680052145, + "grad_norm": 0.36975266006848956, + "learning_rate": 7.304998069630217e-05, + "loss": 2.762, + "step": 26595 + }, + { + "epoch": 1.2382382382382382, + "grad_norm": 0.33777556513324714, + "learning_rate": 7.304757692875239e-05, + "loss": 2.8225, + "step": 26596 + }, + { + "epoch": 1.2382847964243313, + "grad_norm": 0.3537228599951694, + "learning_rate": 7.304517309356023e-05, + "loss": 2.699, + "step": 26597 + }, + { + "epoch": 1.2383313546104244, + "grad_norm": 0.31740633306326416, + "learning_rate": 7.304276919073275e-05, + "loss": 2.6887, + "step": 26598 + }, + { + "epoch": 1.2383779127965175, + "grad_norm": 0.33357032092420413, + "learning_rate": 7.304036522027703e-05, + "loss": 2.8234, + "step": 26599 + }, + { + "epoch": 1.2384244709826104, + "grad_norm": 0.33424048533681344, + "learning_rate": 7.30379611822001e-05, + "loss": 2.776, + "step": 26600 + }, + { + "epoch": 1.2384710291687036, + "grad_norm": 0.32987993479795746, + "learning_rate": 
7.303555707650902e-05, + "loss": 2.7771, + "step": 26601 + }, + { + "epoch": 1.2385175873547967, + "grad_norm": 0.3167747032612201, + "learning_rate": 7.303315290321085e-05, + "loss": 2.745, + "step": 26602 + }, + { + "epoch": 1.2385641455408898, + "grad_norm": 0.31606758424416603, + "learning_rate": 7.303074866231266e-05, + "loss": 2.7362, + "step": 26603 + }, + { + "epoch": 1.238610703726983, + "grad_norm": 0.32776489321157626, + "learning_rate": 7.302834435382146e-05, + "loss": 2.7508, + "step": 26604 + }, + { + "epoch": 1.2386572619130758, + "grad_norm": 0.325408970337207, + "learning_rate": 7.302593997774436e-05, + "loss": 2.7316, + "step": 26605 + }, + { + "epoch": 1.238703820099169, + "grad_norm": 0.3284171156371119, + "learning_rate": 7.302353553408838e-05, + "loss": 2.768, + "step": 26606 + }, + { + "epoch": 1.238750378285262, + "grad_norm": 0.33109256246209745, + "learning_rate": 7.302113102286063e-05, + "loss": 2.6885, + "step": 26607 + }, + { + "epoch": 1.2387969364713551, + "grad_norm": 0.3306446386673597, + "learning_rate": 7.301872644406809e-05, + "loss": 2.8882, + "step": 26608 + }, + { + "epoch": 1.238843494657448, + "grad_norm": 0.336727076944491, + "learning_rate": 7.301632179771787e-05, + "loss": 2.7218, + "step": 26609 + }, + { + "epoch": 1.2388900528435411, + "grad_norm": 0.33008740064777553, + "learning_rate": 7.301391708381701e-05, + "loss": 2.7869, + "step": 26610 + }, + { + "epoch": 1.2389366110296343, + "grad_norm": 0.33053568137853356, + "learning_rate": 7.301151230237256e-05, + "loss": 2.7972, + "step": 26611 + }, + { + "epoch": 1.2389831692157274, + "grad_norm": 0.31330964733195793, + "learning_rate": 7.300910745339161e-05, + "loss": 2.713, + "step": 26612 + }, + { + "epoch": 1.2390297274018205, + "grad_norm": 0.35119614508837216, + "learning_rate": 7.30067025368812e-05, + "loss": 2.7971, + "step": 26613 + }, + { + "epoch": 1.2390762855879136, + "grad_norm": 0.3188999278827503, + "learning_rate": 7.300429755284835e-05, + "loss": 2.6821, + "step": 26614 + }, + { + "epoch": 1.2391228437740065, + "grad_norm": 0.34850323090504903, + "learning_rate": 7.300189250130017e-05, + "loss": 2.7597, + "step": 26615 + }, + { + "epoch": 1.2391694019600996, + "grad_norm": 0.33431390366047936, + "learning_rate": 7.29994873822437e-05, + "loss": 2.8516, + "step": 26616 + }, + { + "epoch": 1.2392159601461927, + "grad_norm": 0.37294902527248064, + "learning_rate": 7.299708219568601e-05, + "loss": 2.812, + "step": 26617 + }, + { + "epoch": 1.2392625183322858, + "grad_norm": 0.32706674517711276, + "learning_rate": 7.299467694163415e-05, + "loss": 2.7712, + "step": 26618 + }, + { + "epoch": 1.2393090765183787, + "grad_norm": 0.3451469706990938, + "learning_rate": 7.299227162009517e-05, + "loss": 2.8001, + "step": 26619 + }, + { + "epoch": 1.2393556347044719, + "grad_norm": 0.33809251032545856, + "learning_rate": 7.298986623107616e-05, + "loss": 2.822, + "step": 26620 + }, + { + "epoch": 1.239402192890565, + "grad_norm": 0.31313098342156975, + "learning_rate": 7.298746077458412e-05, + "loss": 2.8439, + "step": 26621 + }, + { + "epoch": 1.239448751076658, + "grad_norm": 0.3318161659901619, + "learning_rate": 7.298505525062618e-05, + "loss": 2.8272, + "step": 26622 + }, + { + "epoch": 1.2394953092627512, + "grad_norm": 0.30837032934513353, + "learning_rate": 7.298264965920935e-05, + "loss": 2.7066, + "step": 26623 + }, + { + "epoch": 1.2395418674488443, + "grad_norm": 0.3437433706717979, + "learning_rate": 7.298024400034071e-05, + "loss": 2.8029, + "step": 26624 + }, + { + "epoch": 
1.2395884256349372, + "grad_norm": 0.3090319356172999, + "learning_rate": 7.297783827402732e-05, + "loss": 2.7279, + "step": 26625 + }, + { + "epoch": 1.2396349838210303, + "grad_norm": 0.33879501769561854, + "learning_rate": 7.297543248027623e-05, + "loss": 2.7648, + "step": 26626 + }, + { + "epoch": 1.2396815420071234, + "grad_norm": 0.3246071736365207, + "learning_rate": 7.297302661909451e-05, + "loss": 2.7701, + "step": 26627 + }, + { + "epoch": 1.2397281001932166, + "grad_norm": 0.3269854448218954, + "learning_rate": 7.297062069048923e-05, + "loss": 2.868, + "step": 26628 + }, + { + "epoch": 1.2397746583793094, + "grad_norm": 0.34126937282889536, + "learning_rate": 7.296821469446743e-05, + "loss": 2.8729, + "step": 26629 + }, + { + "epoch": 1.2398212165654026, + "grad_norm": 0.3049881885346356, + "learning_rate": 7.29658086310362e-05, + "loss": 2.8336, + "step": 26630 + }, + { + "epoch": 1.2398677747514957, + "grad_norm": 0.3741018119722087, + "learning_rate": 7.296340250020257e-05, + "loss": 2.8685, + "step": 26631 + }, + { + "epoch": 1.2399143329375888, + "grad_norm": 0.3423218185282289, + "learning_rate": 7.296099630197361e-05, + "loss": 2.8087, + "step": 26632 + }, + { + "epoch": 1.239960891123682, + "grad_norm": 0.3580168621127864, + "learning_rate": 7.29585900363564e-05, + "loss": 2.7778, + "step": 26633 + }, + { + "epoch": 1.2400074493097748, + "grad_norm": 0.3601963565101005, + "learning_rate": 7.2956183703358e-05, + "loss": 2.7484, + "step": 26634 + }, + { + "epoch": 1.240054007495868, + "grad_norm": 0.347268026555999, + "learning_rate": 7.295377730298545e-05, + "loss": 2.7845, + "step": 26635 + }, + { + "epoch": 1.240100565681961, + "grad_norm": 0.3382112553955501, + "learning_rate": 7.295137083524583e-05, + "loss": 2.7447, + "step": 26636 + }, + { + "epoch": 1.2401471238680541, + "grad_norm": 0.35303787274396486, + "learning_rate": 7.294896430014618e-05, + "loss": 2.8126, + "step": 26637 + }, + { + "epoch": 1.2401936820541473, + "grad_norm": 0.35204875230606436, + "learning_rate": 7.29465576976936e-05, + "loss": 2.7353, + "step": 26638 + }, + { + "epoch": 1.2402402402402402, + "grad_norm": 0.3502231764970556, + "learning_rate": 7.294415102789513e-05, + "loss": 2.8168, + "step": 26639 + }, + { + "epoch": 1.2402867984263333, + "grad_norm": 0.32825787634597886, + "learning_rate": 7.294174429075783e-05, + "loss": 2.7568, + "step": 26640 + }, + { + "epoch": 1.2403333566124264, + "grad_norm": 0.36244738466658116, + "learning_rate": 7.293933748628877e-05, + "loss": 2.8419, + "step": 26641 + }, + { + "epoch": 1.2403799147985195, + "grad_norm": 0.3506709413109607, + "learning_rate": 7.293693061449502e-05, + "loss": 2.804, + "step": 26642 + }, + { + "epoch": 1.2404264729846126, + "grad_norm": 0.3675785924718193, + "learning_rate": 7.293452367538363e-05, + "loss": 2.8001, + "step": 26643 + }, + { + "epoch": 1.2404730311707055, + "grad_norm": 0.31875490276105994, + "learning_rate": 7.29321166689617e-05, + "loss": 2.6957, + "step": 26644 + }, + { + "epoch": 1.2405195893567986, + "grad_norm": 0.3792182076102757, + "learning_rate": 7.292970959523624e-05, + "loss": 2.8204, + "step": 26645 + }, + { + "epoch": 1.2405661475428917, + "grad_norm": 0.3429945508793732, + "learning_rate": 7.292730245421436e-05, + "loss": 2.8674, + "step": 26646 + }, + { + "epoch": 1.2406127057289849, + "grad_norm": 0.3789000863321366, + "learning_rate": 7.292489524590308e-05, + "loss": 2.6729, + "step": 26647 + }, + { + "epoch": 1.2406592639150777, + "grad_norm": 0.35793773106948507, + "learning_rate": 
7.292248797030952e-05, + "loss": 2.8665, + "step": 26648 + }, + { + "epoch": 1.2407058221011709, + "grad_norm": 0.34362086824014326, + "learning_rate": 7.292008062744069e-05, + "loss": 2.7921, + "step": 26649 + }, + { + "epoch": 1.240752380287264, + "grad_norm": 0.3392719929856501, + "learning_rate": 7.291767321730369e-05, + "loss": 2.7357, + "step": 26650 + }, + { + "epoch": 1.240798938473357, + "grad_norm": 0.3497853256827298, + "learning_rate": 7.291526573990557e-05, + "loss": 2.6725, + "step": 26651 + }, + { + "epoch": 1.2408454966594502, + "grad_norm": 0.31379296379508553, + "learning_rate": 7.291285819525342e-05, + "loss": 2.7955, + "step": 26652 + }, + { + "epoch": 1.2408920548455433, + "grad_norm": 0.3577577355056213, + "learning_rate": 7.291045058335428e-05, + "loss": 2.7947, + "step": 26653 + }, + { + "epoch": 1.2409386130316362, + "grad_norm": 0.30183855634183404, + "learning_rate": 7.290804290421523e-05, + "loss": 2.8011, + "step": 26654 + }, + { + "epoch": 1.2409851712177293, + "grad_norm": 0.31966009555949115, + "learning_rate": 7.290563515784332e-05, + "loss": 2.7677, + "step": 26655 + }, + { + "epoch": 1.2410317294038224, + "grad_norm": 0.33902850672838497, + "learning_rate": 7.290322734424563e-05, + "loss": 2.7767, + "step": 26656 + }, + { + "epoch": 1.2410782875899156, + "grad_norm": 0.32779838299666625, + "learning_rate": 7.290081946342923e-05, + "loss": 2.7318, + "step": 26657 + }, + { + "epoch": 1.2411248457760085, + "grad_norm": 0.35121890806538975, + "learning_rate": 7.289841151540117e-05, + "loss": 2.6732, + "step": 26658 + }, + { + "epoch": 1.2411714039621016, + "grad_norm": 0.3433032647229464, + "learning_rate": 7.289600350016854e-05, + "loss": 2.7566, + "step": 26659 + }, + { + "epoch": 1.2412179621481947, + "grad_norm": 0.3090463194185019, + "learning_rate": 7.28935954177384e-05, + "loss": 2.8302, + "step": 26660 + }, + { + "epoch": 1.2412645203342878, + "grad_norm": 0.3144165062829992, + "learning_rate": 7.28911872681178e-05, + "loss": 2.8642, + "step": 26661 + }, + { + "epoch": 1.241311078520381, + "grad_norm": 0.33479073172747037, + "learning_rate": 7.288877905131383e-05, + "loss": 2.8276, + "step": 26662 + }, + { + "epoch": 1.241357636706474, + "grad_norm": 0.33421917061106765, + "learning_rate": 7.288637076733355e-05, + "loss": 2.7259, + "step": 26663 + }, + { + "epoch": 1.241404194892567, + "grad_norm": 0.3139147461615722, + "learning_rate": 7.2883962416184e-05, + "loss": 2.7168, + "step": 26664 + }, + { + "epoch": 1.24145075307866, + "grad_norm": 0.3191145303567755, + "learning_rate": 7.28815539978723e-05, + "loss": 2.8199, + "step": 26665 + }, + { + "epoch": 1.2414973112647532, + "grad_norm": 0.31504082251168375, + "learning_rate": 7.287914551240549e-05, + "loss": 2.6892, + "step": 26666 + }, + { + "epoch": 1.2415438694508463, + "grad_norm": 0.32701561308395855, + "learning_rate": 7.287673695979065e-05, + "loss": 2.7688, + "step": 26667 + }, + { + "epoch": 1.2415904276369392, + "grad_norm": 0.333536199480082, + "learning_rate": 7.287432834003483e-05, + "loss": 2.8307, + "step": 26668 + }, + { + "epoch": 1.2416369858230323, + "grad_norm": 0.33052295031968704, + "learning_rate": 7.287191965314509e-05, + "loss": 2.7737, + "step": 26669 + }, + { + "epoch": 1.2416835440091254, + "grad_norm": 0.32004688200148707, + "learning_rate": 7.286951089912854e-05, + "loss": 2.8598, + "step": 26670 + }, + { + "epoch": 1.2417301021952185, + "grad_norm": 0.3568754885860813, + "learning_rate": 7.286710207799224e-05, + "loss": 2.7797, + "step": 26671 + }, + { + "epoch": 
1.2417766603813116, + "grad_norm": 0.33728124797056486, + "learning_rate": 7.286469318974322e-05, + "loss": 2.704, + "step": 26672 + }, + { + "epoch": 1.2418232185674045, + "grad_norm": 0.3166098556115633, + "learning_rate": 7.286228423438861e-05, + "loss": 2.8461, + "step": 26673 + }, + { + "epoch": 1.2418697767534976, + "grad_norm": 0.3251882640821675, + "learning_rate": 7.285987521193544e-05, + "loss": 2.7222, + "step": 26674 + }, + { + "epoch": 1.2419163349395907, + "grad_norm": 0.32948321639115397, + "learning_rate": 7.285746612239078e-05, + "loss": 2.6915, + "step": 26675 + }, + { + "epoch": 1.2419628931256839, + "grad_norm": 0.3374761612941414, + "learning_rate": 7.28550569657617e-05, + "loss": 2.8511, + "step": 26676 + }, + { + "epoch": 1.242009451311777, + "grad_norm": 0.3332809119098704, + "learning_rate": 7.285264774205529e-05, + "loss": 2.7864, + "step": 26677 + }, + { + "epoch": 1.2420560094978699, + "grad_norm": 0.3477805448755138, + "learning_rate": 7.28502384512786e-05, + "loss": 2.798, + "step": 26678 + }, + { + "epoch": 1.242102567683963, + "grad_norm": 0.3623491512128169, + "learning_rate": 7.284782909343872e-05, + "loss": 2.8449, + "step": 26679 + }, + { + "epoch": 1.242149125870056, + "grad_norm": 0.3537501258689353, + "learning_rate": 7.284541966854273e-05, + "loss": 2.7455, + "step": 26680 + }, + { + "epoch": 1.2421956840561492, + "grad_norm": 0.3628062301749943, + "learning_rate": 7.284301017659766e-05, + "loss": 2.8262, + "step": 26681 + }, + { + "epoch": 1.2422422422422423, + "grad_norm": 0.3649070845825063, + "learning_rate": 7.28406006176106e-05, + "loss": 2.7188, + "step": 26682 + }, + { + "epoch": 1.2422888004283352, + "grad_norm": 0.34801687762899, + "learning_rate": 7.283819099158864e-05, + "loss": 2.8489, + "step": 26683 + }, + { + "epoch": 1.2423353586144283, + "grad_norm": 0.3209510978611282, + "learning_rate": 7.283578129853885e-05, + "loss": 2.7289, + "step": 26684 + }, + { + "epoch": 1.2423819168005215, + "grad_norm": 0.3372826845611689, + "learning_rate": 7.283337153846827e-05, + "loss": 2.839, + "step": 26685 + }, + { + "epoch": 1.2424284749866146, + "grad_norm": 0.3193524577602342, + "learning_rate": 7.283096171138401e-05, + "loss": 2.7674, + "step": 26686 + }, + { + "epoch": 1.2424750331727077, + "grad_norm": 0.3414209430228419, + "learning_rate": 7.282855181729313e-05, + "loss": 2.7709, + "step": 26687 + }, + { + "epoch": 1.2425215913588006, + "grad_norm": 0.341244827761226, + "learning_rate": 7.282614185620268e-05, + "loss": 2.7644, + "step": 26688 + }, + { + "epoch": 1.2425681495448937, + "grad_norm": 0.3180572284121189, + "learning_rate": 7.282373182811978e-05, + "loss": 2.7895, + "step": 26689 + }, + { + "epoch": 1.2426147077309868, + "grad_norm": 0.3360753949619897, + "learning_rate": 7.282132173305145e-05, + "loss": 2.6852, + "step": 26690 + }, + { + "epoch": 1.24266126591708, + "grad_norm": 0.3320235277997513, + "learning_rate": 7.28189115710048e-05, + "loss": 2.7682, + "step": 26691 + }, + { + "epoch": 1.242707824103173, + "grad_norm": 0.32479896971862127, + "learning_rate": 7.281650134198688e-05, + "loss": 2.8086, + "step": 26692 + }, + { + "epoch": 1.242754382289266, + "grad_norm": 0.3452044274752312, + "learning_rate": 7.28140910460048e-05, + "loss": 2.9435, + "step": 26693 + }, + { + "epoch": 1.242800940475359, + "grad_norm": 0.3168662472447709, + "learning_rate": 7.281168068306558e-05, + "loss": 2.7833, + "step": 26694 + }, + { + "epoch": 1.2428474986614522, + "grad_norm": 0.3069929408349735, + "learning_rate": 7.280927025317634e-05, 
+ "loss": 2.7252, + "step": 26695 + }, + { + "epoch": 1.2428940568475453, + "grad_norm": 0.3363249723285881, + "learning_rate": 7.280685975634414e-05, + "loss": 2.7607, + "step": 26696 + }, + { + "epoch": 1.2429406150336382, + "grad_norm": 0.3187304010247119, + "learning_rate": 7.280444919257607e-05, + "loss": 2.7851, + "step": 26697 + }, + { + "epoch": 1.2429871732197313, + "grad_norm": 0.33638769883822367, + "learning_rate": 7.280203856187917e-05, + "loss": 2.7613, + "step": 26698 + }, + { + "epoch": 1.2430337314058244, + "grad_norm": 0.3239028217608374, + "learning_rate": 7.279962786426053e-05, + "loss": 2.8043, + "step": 26699 + }, + { + "epoch": 1.2430802895919175, + "grad_norm": 0.33958201499385476, + "learning_rate": 7.279721709972725e-05, + "loss": 2.8306, + "step": 26700 + }, + { + "epoch": 1.2431268477780106, + "grad_norm": 0.3949941435964016, + "learning_rate": 7.279480626828636e-05, + "loss": 2.7521, + "step": 26701 + }, + { + "epoch": 1.2431734059641038, + "grad_norm": 0.3693685225903047, + "learning_rate": 7.279239536994497e-05, + "loss": 2.7927, + "step": 26702 + }, + { + "epoch": 1.2432199641501966, + "grad_norm": 0.33622529974982357, + "learning_rate": 7.278998440471013e-05, + "loss": 2.7271, + "step": 26703 + }, + { + "epoch": 1.2432665223362898, + "grad_norm": 0.3624300138317164, + "learning_rate": 7.278757337258894e-05, + "loss": 2.7338, + "step": 26704 + }, + { + "epoch": 1.2433130805223829, + "grad_norm": 0.3502869851357335, + "learning_rate": 7.278516227358848e-05, + "loss": 2.7037, + "step": 26705 + }, + { + "epoch": 1.243359638708476, + "grad_norm": 0.36352810793974055, + "learning_rate": 7.27827511077158e-05, + "loss": 2.7545, + "step": 26706 + }, + { + "epoch": 1.2434061968945689, + "grad_norm": 0.3626963274628251, + "learning_rate": 7.278033987497799e-05, + "loss": 2.81, + "step": 26707 + }, + { + "epoch": 1.243452755080662, + "grad_norm": 0.3250430191377753, + "learning_rate": 7.277792857538213e-05, + "loss": 2.9115, + "step": 26708 + }, + { + "epoch": 1.2434993132667551, + "grad_norm": 0.354097933851077, + "learning_rate": 7.277551720893529e-05, + "loss": 2.8106, + "step": 26709 + }, + { + "epoch": 1.2435458714528482, + "grad_norm": 0.31482464026674484, + "learning_rate": 7.277310577564455e-05, + "loss": 2.7539, + "step": 26710 + }, + { + "epoch": 1.2435924296389413, + "grad_norm": 0.3535337480383941, + "learning_rate": 7.2770694275517e-05, + "loss": 2.9033, + "step": 26711 + }, + { + "epoch": 1.2436389878250345, + "grad_norm": 0.36211027497444354, + "learning_rate": 7.276828270855969e-05, + "loss": 2.8455, + "step": 26712 + }, + { + "epoch": 1.2436855460111274, + "grad_norm": 0.3754423964546267, + "learning_rate": 7.276587107477973e-05, + "loss": 2.7788, + "step": 26713 + }, + { + "epoch": 1.2437321041972205, + "grad_norm": 0.3700685546985148, + "learning_rate": 7.276345937418417e-05, + "loss": 2.7389, + "step": 26714 + }, + { + "epoch": 1.2437786623833136, + "grad_norm": 0.32509551194342673, + "learning_rate": 7.27610476067801e-05, + "loss": 2.8834, + "step": 26715 + }, + { + "epoch": 1.2438252205694067, + "grad_norm": 0.36623858676022547, + "learning_rate": 7.27586357725746e-05, + "loss": 2.7328, + "step": 26716 + }, + { + "epoch": 1.2438717787554996, + "grad_norm": 0.34079699391170243, + "learning_rate": 7.275622387157474e-05, + "loss": 2.6984, + "step": 26717 + }, + { + "epoch": 1.2439183369415927, + "grad_norm": 0.34544032093084154, + "learning_rate": 7.275381190378762e-05, + "loss": 2.8335, + "step": 26718 + }, + { + "epoch": 1.2439648951276858, + 
"grad_norm": 0.3942944492366238, + "learning_rate": 7.275139986922028e-05, + "loss": 2.8001, + "step": 26719 + }, + { + "epoch": 1.244011453313779, + "grad_norm": 0.34521472298059264, + "learning_rate": 7.274898776787985e-05, + "loss": 2.8671, + "step": 26720 + }, + { + "epoch": 1.244058011499872, + "grad_norm": 0.35476236855088633, + "learning_rate": 7.274657559977335e-05, + "loss": 2.8915, + "step": 26721 + }, + { + "epoch": 1.244104569685965, + "grad_norm": 0.3455235909618908, + "learning_rate": 7.274416336490793e-05, + "loss": 2.8032, + "step": 26722 + }, + { + "epoch": 1.244151127872058, + "grad_norm": 0.3316747389061634, + "learning_rate": 7.274175106329061e-05, + "loss": 2.7511, + "step": 26723 + }, + { + "epoch": 1.2441976860581512, + "grad_norm": 0.3362180819082447, + "learning_rate": 7.273933869492848e-05, + "loss": 2.862, + "step": 26724 + }, + { + "epoch": 1.2442442442442443, + "grad_norm": 0.31753383328837964, + "learning_rate": 7.273692625982866e-05, + "loss": 2.677, + "step": 26725 + }, + { + "epoch": 1.2442908024303374, + "grad_norm": 0.35385492114323375, + "learning_rate": 7.273451375799818e-05, + "loss": 2.7046, + "step": 26726 + }, + { + "epoch": 1.2443373606164303, + "grad_norm": 0.3186489233709049, + "learning_rate": 7.273210118944415e-05, + "loss": 2.769, + "step": 26727 + }, + { + "epoch": 1.2443839188025234, + "grad_norm": 0.33175662231434133, + "learning_rate": 7.272968855417364e-05, + "loss": 2.8082, + "step": 26728 + }, + { + "epoch": 1.2444304769886165, + "grad_norm": 0.33284164855252146, + "learning_rate": 7.272727585219373e-05, + "loss": 2.7746, + "step": 26729 + }, + { + "epoch": 1.2444770351747096, + "grad_norm": 0.3144298350514717, + "learning_rate": 7.272486308351151e-05, + "loss": 2.6412, + "step": 26730 + }, + { + "epoch": 1.2445235933608028, + "grad_norm": 0.325984710075936, + "learning_rate": 7.272245024813405e-05, + "loss": 2.8105, + "step": 26731 + }, + { + "epoch": 1.2445701515468957, + "grad_norm": 0.31893312388681005, + "learning_rate": 7.272003734606843e-05, + "loss": 2.8249, + "step": 26732 + }, + { + "epoch": 1.2446167097329888, + "grad_norm": 0.31764098703135313, + "learning_rate": 7.271762437732175e-05, + "loss": 2.7813, + "step": 26733 + }, + { + "epoch": 1.2446632679190819, + "grad_norm": 0.3147792168069315, + "learning_rate": 7.271521134190108e-05, + "loss": 2.8038, + "step": 26734 + }, + { + "epoch": 1.244709826105175, + "grad_norm": 0.31546874695596316, + "learning_rate": 7.27127982398135e-05, + "loss": 2.7869, + "step": 26735 + }, + { + "epoch": 1.244756384291268, + "grad_norm": 0.3129100592525034, + "learning_rate": 7.27103850710661e-05, + "loss": 2.7856, + "step": 26736 + }, + { + "epoch": 1.244802942477361, + "grad_norm": 0.29201361661814207, + "learning_rate": 7.270797183566596e-05, + "loss": 2.6984, + "step": 26737 + }, + { + "epoch": 1.2448495006634541, + "grad_norm": 0.3307125429326681, + "learning_rate": 7.270555853362015e-05, + "loss": 2.6782, + "step": 26738 + }, + { + "epoch": 1.2448960588495472, + "grad_norm": 0.33519679852082906, + "learning_rate": 7.270314516493578e-05, + "loss": 2.8223, + "step": 26739 + }, + { + "epoch": 1.2449426170356404, + "grad_norm": 0.2900260102101877, + "learning_rate": 7.270073172961988e-05, + "loss": 2.8372, + "step": 26740 + }, + { + "epoch": 1.2449891752217335, + "grad_norm": 0.3211016581031932, + "learning_rate": 7.269831822767959e-05, + "loss": 2.7955, + "step": 26741 + }, + { + "epoch": 1.2450357334078264, + "grad_norm": 0.32070031704106766, + "learning_rate": 7.269590465912197e-05, + 
"loss": 2.8236, + "step": 26742 + }, + { + "epoch": 1.2450822915939195, + "grad_norm": 0.32479837727949157, + "learning_rate": 7.269349102395411e-05, + "loss": 2.8235, + "step": 26743 + }, + { + "epoch": 1.2451288497800126, + "grad_norm": 0.32597711365797055, + "learning_rate": 7.269107732218309e-05, + "loss": 2.8878, + "step": 26744 + }, + { + "epoch": 1.2451754079661057, + "grad_norm": 0.3470448957605131, + "learning_rate": 7.268866355381599e-05, + "loss": 2.8445, + "step": 26745 + }, + { + "epoch": 1.2452219661521986, + "grad_norm": 0.3373035030614546, + "learning_rate": 7.268624971885989e-05, + "loss": 2.7015, + "step": 26746 + }, + { + "epoch": 1.2452685243382917, + "grad_norm": 0.32658979484393474, + "learning_rate": 7.26838358173219e-05, + "loss": 2.8, + "step": 26747 + }, + { + "epoch": 1.2453150825243848, + "grad_norm": 0.35768121918490625, + "learning_rate": 7.268142184920908e-05, + "loss": 2.6905, + "step": 26748 + }, + { + "epoch": 1.245361640710478, + "grad_norm": 0.3238218243061986, + "learning_rate": 7.267900781452852e-05, + "loss": 2.6993, + "step": 26749 + }, + { + "epoch": 1.245408198896571, + "grad_norm": 0.3726563247364948, + "learning_rate": 7.26765937132873e-05, + "loss": 2.7161, + "step": 26750 + }, + { + "epoch": 1.2454547570826642, + "grad_norm": 0.3058729349964353, + "learning_rate": 7.267417954549252e-05, + "loss": 2.8094, + "step": 26751 + }, + { + "epoch": 1.245501315268757, + "grad_norm": 0.3613224831234773, + "learning_rate": 7.267176531115125e-05, + "loss": 2.7101, + "step": 26752 + }, + { + "epoch": 1.2455478734548502, + "grad_norm": 0.3476052064104046, + "learning_rate": 7.266935101027059e-05, + "loss": 2.6971, + "step": 26753 + }, + { + "epoch": 1.2455944316409433, + "grad_norm": 0.31146658606626715, + "learning_rate": 7.26669366428576e-05, + "loss": 2.7642, + "step": 26754 + }, + { + "epoch": 1.2456409898270364, + "grad_norm": 0.36646014275770306, + "learning_rate": 7.266452220891941e-05, + "loss": 2.7454, + "step": 26755 + }, + { + "epoch": 1.2456875480131293, + "grad_norm": 0.32794513701227523, + "learning_rate": 7.266210770846307e-05, + "loss": 2.7631, + "step": 26756 + }, + { + "epoch": 1.2457341061992224, + "grad_norm": 0.3515480126145511, + "learning_rate": 7.265969314149567e-05, + "loss": 2.8395, + "step": 26757 + }, + { + "epoch": 1.2457806643853155, + "grad_norm": 0.3362758388265222, + "learning_rate": 7.26572785080243e-05, + "loss": 2.7343, + "step": 26758 + }, + { + "epoch": 1.2458272225714087, + "grad_norm": 0.3178236841760479, + "learning_rate": 7.265486380805606e-05, + "loss": 2.7441, + "step": 26759 + }, + { + "epoch": 1.2458737807575018, + "grad_norm": 0.3428830185237752, + "learning_rate": 7.265244904159802e-05, + "loss": 2.8556, + "step": 26760 + }, + { + "epoch": 1.2459203389435947, + "grad_norm": 0.343040608246698, + "learning_rate": 7.265003420865729e-05, + "loss": 2.7621, + "step": 26761 + }, + { + "epoch": 1.2459668971296878, + "grad_norm": 0.33702277881161535, + "learning_rate": 7.264761930924092e-05, + "loss": 2.762, + "step": 26762 + }, + { + "epoch": 1.246013455315781, + "grad_norm": 0.32258228261047284, + "learning_rate": 7.264520434335603e-05, + "loss": 2.9553, + "step": 26763 + }, + { + "epoch": 1.246060013501874, + "grad_norm": 0.344246508647265, + "learning_rate": 7.264278931100969e-05, + "loss": 2.8096, + "step": 26764 + }, + { + "epoch": 1.2461065716879671, + "grad_norm": 0.3264057155320088, + "learning_rate": 7.2640374212209e-05, + "loss": 2.7784, + "step": 26765 + }, + { + "epoch": 1.24615312987406, + "grad_norm": 
0.3356970479892979, + "learning_rate": 7.263795904696103e-05, + "loss": 2.7935, + "step": 26766 + }, + { + "epoch": 1.2461996880601531, + "grad_norm": 0.34748584476959166, + "learning_rate": 7.26355438152729e-05, + "loss": 2.7833, + "step": 26767 + }, + { + "epoch": 1.2462462462462462, + "grad_norm": 0.32752508519487544, + "learning_rate": 7.263312851715167e-05, + "loss": 2.8275, + "step": 26768 + }, + { + "epoch": 1.2462928044323394, + "grad_norm": 0.37263568773257577, + "learning_rate": 7.263071315260442e-05, + "loss": 2.8489, + "step": 26769 + }, + { + "epoch": 1.2463393626184325, + "grad_norm": 0.3347180563137914, + "learning_rate": 7.262829772163829e-05, + "loss": 2.8126, + "step": 26770 + }, + { + "epoch": 1.2463859208045254, + "grad_norm": 0.3350615771315815, + "learning_rate": 7.26258822242603e-05, + "loss": 2.7603, + "step": 26771 + }, + { + "epoch": 1.2464324789906185, + "grad_norm": 0.3322716449404678, + "learning_rate": 7.26234666604776e-05, + "loss": 2.7403, + "step": 26772 + }, + { + "epoch": 1.2464790371767116, + "grad_norm": 0.3534033453099548, + "learning_rate": 7.262105103029725e-05, + "loss": 2.7723, + "step": 26773 + }, + { + "epoch": 1.2465255953628047, + "grad_norm": 0.33321893599349617, + "learning_rate": 7.261863533372633e-05, + "loss": 2.8689, + "step": 26774 + }, + { + "epoch": 1.2465721535488976, + "grad_norm": 0.325277484492373, + "learning_rate": 7.261621957077198e-05, + "loss": 2.7316, + "step": 26775 + }, + { + "epoch": 1.2466187117349907, + "grad_norm": 0.3831296283084658, + "learning_rate": 7.26138037414412e-05, + "loss": 2.7853, + "step": 26776 + }, + { + "epoch": 1.2466652699210838, + "grad_norm": 0.31232176692407393, + "learning_rate": 7.261138784574117e-05, + "loss": 2.762, + "step": 26777 + }, + { + "epoch": 1.246711828107177, + "grad_norm": 0.3878872764971274, + "learning_rate": 7.260897188367893e-05, + "loss": 2.8194, + "step": 26778 + }, + { + "epoch": 1.24675838629327, + "grad_norm": 0.3187559395462058, + "learning_rate": 7.26065558552616e-05, + "loss": 2.8332, + "step": 26779 + }, + { + "epoch": 1.2468049444793632, + "grad_norm": 0.35619564761528416, + "learning_rate": 7.260413976049625e-05, + "loss": 2.8091, + "step": 26780 + }, + { + "epoch": 1.246851502665456, + "grad_norm": 0.3319365879784598, + "learning_rate": 7.260172359938997e-05, + "loss": 2.7666, + "step": 26781 + }, + { + "epoch": 1.2468980608515492, + "grad_norm": 0.34535002627979433, + "learning_rate": 7.259930737194986e-05, + "loss": 2.7075, + "step": 26782 + }, + { + "epoch": 1.2469446190376423, + "grad_norm": 0.3023945621938207, + "learning_rate": 7.259689107818301e-05, + "loss": 2.6572, + "step": 26783 + }, + { + "epoch": 1.2469911772237354, + "grad_norm": 0.3267671082171484, + "learning_rate": 7.25944747180965e-05, + "loss": 2.7804, + "step": 26784 + }, + { + "epoch": 1.2470377354098283, + "grad_norm": 0.3087290695762564, + "learning_rate": 7.259205829169744e-05, + "loss": 2.8691, + "step": 26785 + }, + { + "epoch": 1.2470842935959214, + "grad_norm": 0.34383473300658146, + "learning_rate": 7.258964179899293e-05, + "loss": 2.7477, + "step": 26786 + }, + { + "epoch": 1.2471308517820145, + "grad_norm": 0.29762403277480826, + "learning_rate": 7.258722523999003e-05, + "loss": 2.8507, + "step": 26787 + }, + { + "epoch": 1.2471774099681077, + "grad_norm": 0.34678577006365274, + "learning_rate": 7.258480861469587e-05, + "loss": 2.6747, + "step": 26788 + }, + { + "epoch": 1.2472239681542008, + "grad_norm": 0.2939691281914807, + "learning_rate": 7.25823919231175e-05, + "loss": 2.6954, + 
"step": 26789 + }, + { + "epoch": 1.247270526340294, + "grad_norm": 0.3085477671742526, + "learning_rate": 7.257997516526204e-05, + "loss": 2.7293, + "step": 26790 + }, + { + "epoch": 1.2473170845263868, + "grad_norm": 0.3173478027132889, + "learning_rate": 7.257755834113657e-05, + "loss": 2.8388, + "step": 26791 + }, + { + "epoch": 1.24736364271248, + "grad_norm": 0.3163380459137609, + "learning_rate": 7.257514145074821e-05, + "loss": 2.8673, + "step": 26792 + }, + { + "epoch": 1.247410200898573, + "grad_norm": 0.3643092830254884, + "learning_rate": 7.257272449410401e-05, + "loss": 2.8193, + "step": 26793 + }, + { + "epoch": 1.2474567590846661, + "grad_norm": 0.30613786121788755, + "learning_rate": 7.25703074712111e-05, + "loss": 2.8242, + "step": 26794 + }, + { + "epoch": 1.247503317270759, + "grad_norm": 0.3757534219503514, + "learning_rate": 7.256789038207655e-05, + "loss": 2.8338, + "step": 26795 + }, + { + "epoch": 1.2475498754568521, + "grad_norm": 0.34853693859664064, + "learning_rate": 7.256547322670748e-05, + "loss": 2.7842, + "step": 26796 + }, + { + "epoch": 1.2475964336429453, + "grad_norm": 0.37165877098973105, + "learning_rate": 7.256305600511098e-05, + "loss": 2.7537, + "step": 26797 + }, + { + "epoch": 1.2476429918290384, + "grad_norm": 0.352321657296789, + "learning_rate": 7.25606387172941e-05, + "loss": 2.8237, + "step": 26798 + }, + { + "epoch": 1.2476895500151315, + "grad_norm": 0.3155341119561526, + "learning_rate": 7.2558221363264e-05, + "loss": 2.6335, + "step": 26799 + }, + { + "epoch": 1.2477361082012246, + "grad_norm": 0.3375709635951691, + "learning_rate": 7.255580394302772e-05, + "loss": 2.6682, + "step": 26800 + }, + { + "epoch": 1.2477826663873175, + "grad_norm": 0.3327874737579025, + "learning_rate": 7.25533864565924e-05, + "loss": 2.737, + "step": 26801 + }, + { + "epoch": 1.2478292245734106, + "grad_norm": 0.3082014825772926, + "learning_rate": 7.255096890396509e-05, + "loss": 2.8391, + "step": 26802 + }, + { + "epoch": 1.2478757827595037, + "grad_norm": 0.3491464640091208, + "learning_rate": 7.254855128515293e-05, + "loss": 2.7421, + "step": 26803 + }, + { + "epoch": 1.2479223409455968, + "grad_norm": 0.3419866063164114, + "learning_rate": 7.254613360016296e-05, + "loss": 2.8934, + "step": 26804 + }, + { + "epoch": 1.2479688991316897, + "grad_norm": 0.3486288647752905, + "learning_rate": 7.254371584900232e-05, + "loss": 2.7554, + "step": 26805 + }, + { + "epoch": 1.2480154573177829, + "grad_norm": 0.3529320108938411, + "learning_rate": 7.254129803167812e-05, + "loss": 2.7643, + "step": 26806 + }, + { + "epoch": 1.248062015503876, + "grad_norm": 0.32272183543220534, + "learning_rate": 7.253888014819741e-05, + "loss": 2.8165, + "step": 26807 + }, + { + "epoch": 1.248108573689969, + "grad_norm": 0.3633910173683539, + "learning_rate": 7.253646219856731e-05, + "loss": 2.7957, + "step": 26808 + }, + { + "epoch": 1.2481551318760622, + "grad_norm": 0.34717815198516555, + "learning_rate": 7.253404418279491e-05, + "loss": 2.7804, + "step": 26809 + }, + { + "epoch": 1.248201690062155, + "grad_norm": 0.3453921839333953, + "learning_rate": 7.253162610088732e-05, + "loss": 2.8289, + "step": 26810 + }, + { + "epoch": 1.2482482482482482, + "grad_norm": 0.3398555476140883, + "learning_rate": 7.252920795285163e-05, + "loss": 2.8496, + "step": 26811 + }, + { + "epoch": 1.2482948064343413, + "grad_norm": 0.3328300398307361, + "learning_rate": 7.252678973869493e-05, + "loss": 2.818, + "step": 26812 + }, + { + "epoch": 1.2483413646204344, + "grad_norm": 0.34860488027509895, + 
"learning_rate": 7.252437145842432e-05, + "loss": 2.7337, + "step": 26813 + }, + { + "epoch": 1.2483879228065276, + "grad_norm": 0.3331142131377817, + "learning_rate": 7.252195311204689e-05, + "loss": 2.8638, + "step": 26814 + }, + { + "epoch": 1.2484344809926204, + "grad_norm": 0.3426516836599338, + "learning_rate": 7.251953469956975e-05, + "loss": 2.7495, + "step": 26815 + }, + { + "epoch": 1.2484810391787136, + "grad_norm": 0.31998806696291393, + "learning_rate": 7.251711622100001e-05, + "loss": 2.8283, + "step": 26816 + }, + { + "epoch": 1.2485275973648067, + "grad_norm": 0.36913328555911107, + "learning_rate": 7.251469767634473e-05, + "loss": 2.6768, + "step": 26817 + }, + { + "epoch": 1.2485741555508998, + "grad_norm": 0.33903137668051264, + "learning_rate": 7.251227906561105e-05, + "loss": 2.8485, + "step": 26818 + }, + { + "epoch": 1.248620713736993, + "grad_norm": 0.34304265432732123, + "learning_rate": 7.250986038880604e-05, + "loss": 2.7268, + "step": 26819 + }, + { + "epoch": 1.2486672719230858, + "grad_norm": 0.3312900736225682, + "learning_rate": 7.25074416459368e-05, + "loss": 2.7505, + "step": 26820 + }, + { + "epoch": 1.248713830109179, + "grad_norm": 0.334795339486265, + "learning_rate": 7.250502283701044e-05, + "loss": 2.8157, + "step": 26821 + }, + { + "epoch": 1.248760388295272, + "grad_norm": 0.32250181051385834, + "learning_rate": 7.250260396203405e-05, + "loss": 2.7713, + "step": 26822 + }, + { + "epoch": 1.2488069464813651, + "grad_norm": 0.3280514396053062, + "learning_rate": 7.250018502101474e-05, + "loss": 2.8115, + "step": 26823 + }, + { + "epoch": 1.248853504667458, + "grad_norm": 0.32973619539510707, + "learning_rate": 7.249776601395963e-05, + "loss": 2.7465, + "step": 26824 + }, + { + "epoch": 1.2489000628535512, + "grad_norm": 0.35211564732744577, + "learning_rate": 7.249534694087576e-05, + "loss": 2.7321, + "step": 26825 + }, + { + "epoch": 1.2489466210396443, + "grad_norm": 0.3634983669506364, + "learning_rate": 7.249292780177028e-05, + "loss": 2.818, + "step": 26826 + }, + { + "epoch": 1.2489931792257374, + "grad_norm": 0.3358481386785262, + "learning_rate": 7.249050859665025e-05, + "loss": 2.7175, + "step": 26827 + }, + { + "epoch": 1.2490397374118305, + "grad_norm": 0.34415472892549476, + "learning_rate": 7.248808932552282e-05, + "loss": 2.8528, + "step": 26828 + }, + { + "epoch": 1.2490862955979236, + "grad_norm": 0.33484092504959173, + "learning_rate": 7.248566998839505e-05, + "loss": 2.7791, + "step": 26829 + }, + { + "epoch": 1.2491328537840165, + "grad_norm": 0.34500145782821584, + "learning_rate": 7.248325058527405e-05, + "loss": 2.8497, + "step": 26830 + }, + { + "epoch": 1.2491794119701096, + "grad_norm": 0.3664531314964418, + "learning_rate": 7.248083111616692e-05, + "loss": 2.7113, + "step": 26831 + }, + { + "epoch": 1.2492259701562027, + "grad_norm": 0.35703034523648913, + "learning_rate": 7.247841158108079e-05, + "loss": 2.8044, + "step": 26832 + }, + { + "epoch": 1.2492725283422959, + "grad_norm": 0.3599675018231438, + "learning_rate": 7.247599198002271e-05, + "loss": 2.7373, + "step": 26833 + }, + { + "epoch": 1.2493190865283887, + "grad_norm": 0.3092249556261461, + "learning_rate": 7.247357231299983e-05, + "loss": 2.7459, + "step": 26834 + }, + { + "epoch": 1.2493656447144819, + "grad_norm": 0.40185443931661063, + "learning_rate": 7.247115258001921e-05, + "loss": 2.896, + "step": 26835 + }, + { + "epoch": 1.249412202900575, + "grad_norm": 0.31438685632393254, + "learning_rate": 7.246873278108798e-05, + "loss": 2.8203, + "step": 26836 + 
}, + { + "epoch": 1.249458761086668, + "grad_norm": 0.3515231058219354, + "learning_rate": 7.246631291621324e-05, + "loss": 2.802, + "step": 26837 + }, + { + "epoch": 1.2495053192727612, + "grad_norm": 0.31981545308515663, + "learning_rate": 7.246389298540207e-05, + "loss": 2.717, + "step": 26838 + }, + { + "epoch": 1.2495518774588543, + "grad_norm": 0.3627192552137181, + "learning_rate": 7.246147298866161e-05, + "loss": 2.7464, + "step": 26839 + }, + { + "epoch": 1.2495984356449472, + "grad_norm": 0.3430364460751451, + "learning_rate": 7.245905292599892e-05, + "loss": 2.7079, + "step": 26840 + }, + { + "epoch": 1.2496449938310403, + "grad_norm": 0.3613959278417091, + "learning_rate": 7.245663279742112e-05, + "loss": 2.9261, + "step": 26841 + }, + { + "epoch": 1.2496915520171334, + "grad_norm": 0.3424236341603244, + "learning_rate": 7.245421260293532e-05, + "loss": 2.8168, + "step": 26842 + }, + { + "epoch": 1.2497381102032266, + "grad_norm": 0.33847811211894696, + "learning_rate": 7.245179234254862e-05, + "loss": 2.8589, + "step": 26843 + }, + { + "epoch": 1.2497846683893195, + "grad_norm": 0.3368434682934618, + "learning_rate": 7.244937201626812e-05, + "loss": 2.8627, + "step": 26844 + }, + { + "epoch": 1.2498312265754126, + "grad_norm": 0.32121125346198154, + "learning_rate": 7.244695162410092e-05, + "loss": 2.8722, + "step": 26845 + }, + { + "epoch": 1.2498777847615057, + "grad_norm": 0.329364909522829, + "learning_rate": 7.244453116605415e-05, + "loss": 2.8094, + "step": 26846 + }, + { + "epoch": 1.2499243429475988, + "grad_norm": 0.32534151266216177, + "learning_rate": 7.244211064213487e-05, + "loss": 2.8489, + "step": 26847 + }, + { + "epoch": 1.249970901133692, + "grad_norm": 0.3334568889552409, + "learning_rate": 7.243969005235021e-05, + "loss": 2.8099, + "step": 26848 + }, + { + "epoch": 1.250017459319785, + "grad_norm": 0.33979828667923806, + "learning_rate": 7.243726939670726e-05, + "loss": 2.6708, + "step": 26849 + }, + { + "epoch": 1.250064017505878, + "grad_norm": 0.36114859398786053, + "learning_rate": 7.243484867521315e-05, + "loss": 2.7582, + "step": 26850 + }, + { + "epoch": 1.250110575691971, + "grad_norm": 0.3424790026271394, + "learning_rate": 7.243242788787497e-05, + "loss": 2.8311, + "step": 26851 + }, + { + "epoch": 1.2501571338780642, + "grad_norm": 0.40554182205114353, + "learning_rate": 7.243000703469983e-05, + "loss": 2.7273, + "step": 26852 + }, + { + "epoch": 1.250203692064157, + "grad_norm": 0.33937322801640313, + "learning_rate": 7.242758611569481e-05, + "loss": 2.7732, + "step": 26853 + }, + { + "epoch": 1.2502502502502502, + "grad_norm": 0.3324065581538709, + "learning_rate": 7.242516513086704e-05, + "loss": 2.6812, + "step": 26854 + }, + { + "epoch": 1.2502968084363433, + "grad_norm": 0.3439855560064537, + "learning_rate": 7.242274408022364e-05, + "loss": 2.6095, + "step": 26855 + }, + { + "epoch": 1.2503433666224364, + "grad_norm": 0.3388790226637624, + "learning_rate": 7.242032296377167e-05, + "loss": 2.7784, + "step": 26856 + }, + { + "epoch": 1.2503899248085295, + "grad_norm": 0.3614377494198906, + "learning_rate": 7.241790178151827e-05, + "loss": 2.8159, + "step": 26857 + }, + { + "epoch": 1.2504364829946226, + "grad_norm": 0.32556911410025197, + "learning_rate": 7.241548053347054e-05, + "loss": 2.6807, + "step": 26858 + }, + { + "epoch": 1.2504830411807157, + "grad_norm": 0.3479544859196681, + "learning_rate": 7.241305921963557e-05, + "loss": 2.8608, + "step": 26859 + }, + { + "epoch": 1.2505295993668086, + "grad_norm": 0.3195595729994363, + 
"learning_rate": 7.241063784002049e-05, + "loss": 2.8347, + "step": 26860 + }, + { + "epoch": 1.2505761575529017, + "grad_norm": 0.33441553546339753, + "learning_rate": 7.24082163946324e-05, + "loss": 2.7031, + "step": 26861 + }, + { + "epoch": 1.2506227157389949, + "grad_norm": 0.3586930585378278, + "learning_rate": 7.240579488347838e-05, + "loss": 2.7081, + "step": 26862 + }, + { + "epoch": 1.2506692739250878, + "grad_norm": 0.3228053668210197, + "learning_rate": 7.24033733065656e-05, + "loss": 2.7921, + "step": 26863 + }, + { + "epoch": 1.2507158321111809, + "grad_norm": 0.3290289523100023, + "learning_rate": 7.240095166390108e-05, + "loss": 2.7919, + "step": 26864 + }, + { + "epoch": 1.250762390297274, + "grad_norm": 0.33642296036166747, + "learning_rate": 7.239852995549201e-05, + "loss": 2.7653, + "step": 26865 + }, + { + "epoch": 1.250808948483367, + "grad_norm": 0.33768355492157737, + "learning_rate": 7.239610818134545e-05, + "loss": 2.7197, + "step": 26866 + }, + { + "epoch": 1.2508555066694602, + "grad_norm": 0.3375327387880129, + "learning_rate": 7.239368634146852e-05, + "loss": 2.7541, + "step": 26867 + }, + { + "epoch": 1.2509020648555533, + "grad_norm": 0.33735365536858886, + "learning_rate": 7.239126443586831e-05, + "loss": 2.7494, + "step": 26868 + }, + { + "epoch": 1.2509486230416462, + "grad_norm": 0.330993350458568, + "learning_rate": 7.238884246455196e-05, + "loss": 2.7431, + "step": 26869 + }, + { + "epoch": 1.2509951812277393, + "grad_norm": 0.3572675919267547, + "learning_rate": 7.238642042752657e-05, + "loss": 2.7461, + "step": 26870 + }, + { + "epoch": 1.2510417394138325, + "grad_norm": 0.34057562138073344, + "learning_rate": 7.238399832479923e-05, + "loss": 2.7494, + "step": 26871 + }, + { + "epoch": 1.2510882975999256, + "grad_norm": 0.35703565795542164, + "learning_rate": 7.238157615637705e-05, + "loss": 2.805, + "step": 26872 + }, + { + "epoch": 1.2511348557860185, + "grad_norm": 0.3651164119285794, + "learning_rate": 7.237915392226716e-05, + "loss": 2.7278, + "step": 26873 + }, + { + "epoch": 1.2511814139721116, + "grad_norm": 0.34944189716856877, + "learning_rate": 7.237673162247667e-05, + "loss": 2.8587, + "step": 26874 + }, + { + "epoch": 1.2512279721582047, + "grad_norm": 0.380966188677701, + "learning_rate": 7.237430925701267e-05, + "loss": 2.855, + "step": 26875 + }, + { + "epoch": 1.2512745303442978, + "grad_norm": 0.35215057380494963, + "learning_rate": 7.237188682588228e-05, + "loss": 2.7943, + "step": 26876 + }, + { + "epoch": 1.251321088530391, + "grad_norm": 0.3301312180892008, + "learning_rate": 7.236946432909259e-05, + "loss": 2.7177, + "step": 26877 + }, + { + "epoch": 1.251367646716484, + "grad_norm": 0.3523820223536243, + "learning_rate": 7.236704176665075e-05, + "loss": 2.7855, + "step": 26878 + }, + { + "epoch": 1.251414204902577, + "grad_norm": 0.33829558200636, + "learning_rate": 7.236461913856381e-05, + "loss": 2.8326, + "step": 26879 + }, + { + "epoch": 1.25146076308867, + "grad_norm": 0.36742785719467846, + "learning_rate": 7.236219644483895e-05, + "loss": 2.8276, + "step": 26880 + }, + { + "epoch": 1.2515073212747632, + "grad_norm": 0.3172064841716833, + "learning_rate": 7.235977368548322e-05, + "loss": 2.7839, + "step": 26881 + }, + { + "epoch": 1.2515538794608563, + "grad_norm": 0.3590193478237244, + "learning_rate": 7.235735086050378e-05, + "loss": 2.8387, + "step": 26882 + }, + { + "epoch": 1.2516004376469492, + "grad_norm": 0.3101017448384973, + "learning_rate": 7.235492796990771e-05, + "loss": 2.7626, + "step": 26883 + }, + { + 
"epoch": 1.2516469958330423, + "grad_norm": 0.36783152689686177, + "learning_rate": 7.235250501370212e-05, + "loss": 2.7882, + "step": 26884 + }, + { + "epoch": 1.2516935540191354, + "grad_norm": 0.31294516048951915, + "learning_rate": 7.235008199189413e-05, + "loss": 2.7382, + "step": 26885 + }, + { + "epoch": 1.2517401122052285, + "grad_norm": 0.38268675602167723, + "learning_rate": 7.234765890449085e-05, + "loss": 2.799, + "step": 26886 + }, + { + "epoch": 1.2517866703913216, + "grad_norm": 0.3419311026196624, + "learning_rate": 7.234523575149942e-05, + "loss": 2.8902, + "step": 26887 + }, + { + "epoch": 1.2518332285774147, + "grad_norm": 0.36671831630446283, + "learning_rate": 7.23428125329269e-05, + "loss": 2.7407, + "step": 26888 + }, + { + "epoch": 1.2518797867635076, + "grad_norm": 0.351509005036492, + "learning_rate": 7.234038924878044e-05, + "loss": 2.8851, + "step": 26889 + }, + { + "epoch": 1.2519263449496008, + "grad_norm": 0.37935055072309265, + "learning_rate": 7.233796589906713e-05, + "loss": 2.8322, + "step": 26890 + }, + { + "epoch": 1.2519729031356939, + "grad_norm": 0.36305913959556746, + "learning_rate": 7.233554248379408e-05, + "loss": 2.7855, + "step": 26891 + }, + { + "epoch": 1.252019461321787, + "grad_norm": 0.36078467041661627, + "learning_rate": 7.233311900296844e-05, + "loss": 2.7826, + "step": 26892 + }, + { + "epoch": 1.2520660195078799, + "grad_norm": 0.3491930543207984, + "learning_rate": 7.233069545659728e-05, + "loss": 2.6233, + "step": 26893 + }, + { + "epoch": 1.252112577693973, + "grad_norm": 0.3426199647863347, + "learning_rate": 7.232827184468773e-05, + "loss": 2.7414, + "step": 26894 + }, + { + "epoch": 1.252159135880066, + "grad_norm": 0.3457072362401168, + "learning_rate": 7.23258481672469e-05, + "loss": 2.789, + "step": 26895 + }, + { + "epoch": 1.2522056940661592, + "grad_norm": 0.33502997261082323, + "learning_rate": 7.23234244242819e-05, + "loss": 2.7834, + "step": 26896 + }, + { + "epoch": 1.2522522522522523, + "grad_norm": 0.37470252672895205, + "learning_rate": 7.232100061579985e-05, + "loss": 2.8434, + "step": 26897 + }, + { + "epoch": 1.2522988104383455, + "grad_norm": 0.3378576072083579, + "learning_rate": 7.231857674180787e-05, + "loss": 2.7278, + "step": 26898 + }, + { + "epoch": 1.2523453686244383, + "grad_norm": 0.3596319562282816, + "learning_rate": 7.231615280231306e-05, + "loss": 2.81, + "step": 26899 + }, + { + "epoch": 1.2523919268105315, + "grad_norm": 0.34565976702916856, + "learning_rate": 7.231372879732255e-05, + "loss": 2.7947, + "step": 26900 + }, + { + "epoch": 1.2524384849966246, + "grad_norm": 0.348332365915521, + "learning_rate": 7.231130472684341e-05, + "loss": 2.8075, + "step": 26901 + }, + { + "epoch": 1.2524850431827175, + "grad_norm": 0.3626854861617941, + "learning_rate": 7.230888059088285e-05, + "loss": 2.7235, + "step": 26902 + }, + { + "epoch": 1.2525316013688106, + "grad_norm": 0.3266752738514761, + "learning_rate": 7.230645638944788e-05, + "loss": 2.8693, + "step": 26903 + }, + { + "epoch": 1.2525781595549037, + "grad_norm": 0.35963912512636786, + "learning_rate": 7.230403212254566e-05, + "loss": 2.7498, + "step": 26904 + }, + { + "epoch": 1.2526247177409968, + "grad_norm": 0.33317008456919517, + "learning_rate": 7.23016077901833e-05, + "loss": 2.8276, + "step": 26905 + }, + { + "epoch": 1.25267127592709, + "grad_norm": 0.3625701759002902, + "learning_rate": 7.229918339236794e-05, + "loss": 2.7292, + "step": 26906 + }, + { + "epoch": 1.252717834113183, + "grad_norm": 0.32931918972990587, + "learning_rate": 
7.229675892910665e-05, + "loss": 2.7903, + "step": 26907 + }, + { + "epoch": 1.252764392299276, + "grad_norm": 0.32256679223291135, + "learning_rate": 7.229433440040659e-05, + "loss": 2.7272, + "step": 26908 + }, + { + "epoch": 1.252810950485369, + "grad_norm": 0.33626631630848103, + "learning_rate": 7.229190980627484e-05, + "loss": 2.7439, + "step": 26909 + }, + { + "epoch": 1.2528575086714622, + "grad_norm": 0.34113707889436573, + "learning_rate": 7.228948514671853e-05, + "loss": 2.9036, + "step": 26910 + }, + { + "epoch": 1.2529040668575553, + "grad_norm": 0.3325973010096664, + "learning_rate": 7.22870604217448e-05, + "loss": 2.6522, + "step": 26911 + }, + { + "epoch": 1.2529506250436482, + "grad_norm": 0.30366465060081116, + "learning_rate": 7.228463563136073e-05, + "loss": 2.8407, + "step": 26912 + }, + { + "epoch": 1.2529971832297413, + "grad_norm": 0.33221567549740455, + "learning_rate": 7.228221077557345e-05, + "loss": 2.7651, + "step": 26913 + }, + { + "epoch": 1.2530437414158344, + "grad_norm": 0.3114350718208959, + "learning_rate": 7.227978585439006e-05, + "loss": 2.8238, + "step": 26914 + }, + { + "epoch": 1.2530902996019275, + "grad_norm": 0.3371305895925452, + "learning_rate": 7.227736086781771e-05, + "loss": 2.8153, + "step": 26915 + }, + { + "epoch": 1.2531368577880206, + "grad_norm": 0.31864728390394703, + "learning_rate": 7.22749358158635e-05, + "loss": 2.8182, + "step": 26916 + }, + { + "epoch": 1.2531834159741138, + "grad_norm": 0.33678410826568517, + "learning_rate": 7.227251069853454e-05, + "loss": 2.7533, + "step": 26917 + }, + { + "epoch": 1.2532299741602067, + "grad_norm": 0.3275036746653927, + "learning_rate": 7.227008551583797e-05, + "loss": 2.7397, + "step": 26918 + }, + { + "epoch": 1.2532765323462998, + "grad_norm": 0.3387617604080654, + "learning_rate": 7.226766026778088e-05, + "loss": 2.5858, + "step": 26919 + }, + { + "epoch": 1.2533230905323929, + "grad_norm": 0.3181659280588365, + "learning_rate": 7.22652349543704e-05, + "loss": 2.7406, + "step": 26920 + }, + { + "epoch": 1.253369648718486, + "grad_norm": 0.3021745206648623, + "learning_rate": 7.226280957561364e-05, + "loss": 2.6441, + "step": 26921 + }, + { + "epoch": 1.253416206904579, + "grad_norm": 0.31447658305557163, + "learning_rate": 7.226038413151775e-05, + "loss": 2.8404, + "step": 26922 + }, + { + "epoch": 1.253462765090672, + "grad_norm": 0.341628399763967, + "learning_rate": 7.225795862208981e-05, + "loss": 2.8589, + "step": 26923 + }, + { + "epoch": 1.2535093232767651, + "grad_norm": 0.32023858236867586, + "learning_rate": 7.225553304733696e-05, + "loss": 2.8177, + "step": 26924 + }, + { + "epoch": 1.2535558814628582, + "grad_norm": 0.3428835077474551, + "learning_rate": 7.225310740726632e-05, + "loss": 2.7556, + "step": 26925 + }, + { + "epoch": 1.2536024396489513, + "grad_norm": 0.3025382713799828, + "learning_rate": 7.2250681701885e-05, + "loss": 2.8423, + "step": 26926 + }, + { + "epoch": 1.2536489978350445, + "grad_norm": 0.3526506875707908, + "learning_rate": 7.224825593120012e-05, + "loss": 2.8285, + "step": 26927 + }, + { + "epoch": 1.2536955560211374, + "grad_norm": 0.33839798218476946, + "learning_rate": 7.22458300952188e-05, + "loss": 2.8428, + "step": 26928 + }, + { + "epoch": 1.2537421142072305, + "grad_norm": 0.3191638313703143, + "learning_rate": 7.224340419394815e-05, + "loss": 2.8283, + "step": 26929 + }, + { + "epoch": 1.2537886723933236, + "grad_norm": 0.3477907233343578, + "learning_rate": 7.22409782273953e-05, + "loss": 2.8183, + "step": 26930 + }, + { + "epoch": 
1.2538352305794167, + "grad_norm": 0.3067687605289736, + "learning_rate": 7.223855219556738e-05, + "loss": 2.8961, + "step": 26931 + }, + { + "epoch": 1.2538817887655096, + "grad_norm": 0.32227173025399436, + "learning_rate": 7.22361260984715e-05, + "loss": 2.7655, + "step": 26932 + }, + { + "epoch": 1.2539283469516027, + "grad_norm": 0.34221227247986374, + "learning_rate": 7.223369993611478e-05, + "loss": 2.696, + "step": 26933 + }, + { + "epoch": 1.2539749051376958, + "grad_norm": 0.30594602674600535, + "learning_rate": 7.223127370850433e-05, + "loss": 2.7313, + "step": 26934 + }, + { + "epoch": 1.254021463323789, + "grad_norm": 0.3214118158508765, + "learning_rate": 7.22288474156473e-05, + "loss": 2.8571, + "step": 26935 + }, + { + "epoch": 1.254068021509882, + "grad_norm": 0.3317339815891841, + "learning_rate": 7.222642105755077e-05, + "loss": 2.7181, + "step": 26936 + }, + { + "epoch": 1.2541145796959752, + "grad_norm": 0.34016923514475716, + "learning_rate": 7.22239946342219e-05, + "loss": 2.6687, + "step": 26937 + }, + { + "epoch": 1.254161137882068, + "grad_norm": 0.34222977944488614, + "learning_rate": 7.22215681456678e-05, + "loss": 2.8408, + "step": 26938 + }, + { + "epoch": 1.2542076960681612, + "grad_norm": 0.36193355084242074, + "learning_rate": 7.221914159189558e-05, + "loss": 2.7348, + "step": 26939 + }, + { + "epoch": 1.2542542542542543, + "grad_norm": 0.31280812131825286, + "learning_rate": 7.221671497291236e-05, + "loss": 2.8683, + "step": 26940 + }, + { + "epoch": 1.2543008124403472, + "grad_norm": 0.35856142627420085, + "learning_rate": 7.22142882887253e-05, + "loss": 2.8653, + "step": 26941 + }, + { + "epoch": 1.2543473706264403, + "grad_norm": 0.30007726867553197, + "learning_rate": 7.221186153934146e-05, + "loss": 2.826, + "step": 26942 + }, + { + "epoch": 1.2543939288125334, + "grad_norm": 0.3496691092069009, + "learning_rate": 7.2209434724768e-05, + "loss": 2.8133, + "step": 26943 + }, + { + "epoch": 1.2544404869986265, + "grad_norm": 0.3048813614324529, + "learning_rate": 7.220700784501204e-05, + "loss": 2.7802, + "step": 26944 + }, + { + "epoch": 1.2544870451847197, + "grad_norm": 0.3750274162399759, + "learning_rate": 7.22045809000807e-05, + "loss": 2.8707, + "step": 26945 + }, + { + "epoch": 1.2545336033708128, + "grad_norm": 0.33170463131235695, + "learning_rate": 7.22021538899811e-05, + "loss": 2.84, + "step": 26946 + }, + { + "epoch": 1.2545801615569057, + "grad_norm": 0.3741292804196812, + "learning_rate": 7.219972681472038e-05, + "loss": 2.763, + "step": 26947 + }, + { + "epoch": 1.2546267197429988, + "grad_norm": 0.3056224827352741, + "learning_rate": 7.219729967430563e-05, + "loss": 2.7988, + "step": 26948 + }, + { + "epoch": 1.254673277929092, + "grad_norm": 0.3486504659826264, + "learning_rate": 7.2194872468744e-05, + "loss": 2.8146, + "step": 26949 + }, + { + "epoch": 1.254719836115185, + "grad_norm": 0.3295343438109228, + "learning_rate": 7.219244519804262e-05, + "loss": 2.7078, + "step": 26950 + }, + { + "epoch": 1.254766394301278, + "grad_norm": 0.39343566740310254, + "learning_rate": 7.219001786220858e-05, + "loss": 2.7948, + "step": 26951 + }, + { + "epoch": 1.254812952487371, + "grad_norm": 0.31862345240515055, + "learning_rate": 7.218759046124903e-05, + "loss": 2.8633, + "step": 26952 + }, + { + "epoch": 1.2548595106734641, + "grad_norm": 0.38029520862068733, + "learning_rate": 7.218516299517109e-05, + "loss": 2.7262, + "step": 26953 + }, + { + "epoch": 1.2549060688595572, + "grad_norm": 0.3135134476946485, + "learning_rate": 
7.218273546398188e-05, + "loss": 2.7575, + "step": 26954 + }, + { + "epoch": 1.2549526270456504, + "grad_norm": 0.4080903738081024, + "learning_rate": 7.218030786768854e-05, + "loss": 2.7756, + "step": 26955 + }, + { + "epoch": 1.2549991852317435, + "grad_norm": 0.3125758343297044, + "learning_rate": 7.217788020629816e-05, + "loss": 2.7592, + "step": 26956 + }, + { + "epoch": 1.2550457434178364, + "grad_norm": 0.3442345107067197, + "learning_rate": 7.217545247981789e-05, + "loss": 2.7654, + "step": 26957 + }, + { + "epoch": 1.2550923016039295, + "grad_norm": 0.38900304578298095, + "learning_rate": 7.217302468825485e-05, + "loss": 2.852, + "step": 26958 + }, + { + "epoch": 1.2551388597900226, + "grad_norm": 0.33931051460833306, + "learning_rate": 7.217059683161617e-05, + "loss": 2.8239, + "step": 26959 + }, + { + "epoch": 1.2551854179761157, + "grad_norm": 0.41268219151109137, + "learning_rate": 7.216816890990899e-05, + "loss": 2.8802, + "step": 26960 + }, + { + "epoch": 1.2552319761622086, + "grad_norm": 0.3147044502238841, + "learning_rate": 7.21657409231404e-05, + "loss": 2.7866, + "step": 26961 + }, + { + "epoch": 1.2552785343483017, + "grad_norm": 0.3911016771669727, + "learning_rate": 7.216331287131755e-05, + "loss": 2.7315, + "step": 26962 + }, + { + "epoch": 1.2553250925343948, + "grad_norm": 0.350954073384502, + "learning_rate": 7.216088475444755e-05, + "loss": 2.7558, + "step": 26963 + }, + { + "epoch": 1.255371650720488, + "grad_norm": 0.3279640172556301, + "learning_rate": 7.215845657253754e-05, + "loss": 2.8166, + "step": 26964 + }, + { + "epoch": 1.255418208906581, + "grad_norm": 0.33425904178522553, + "learning_rate": 7.215602832559467e-05, + "loss": 2.749, + "step": 26965 + }, + { + "epoch": 1.2554647670926742, + "grad_norm": 0.31310887847609875, + "learning_rate": 7.215360001362601e-05, + "loss": 2.7199, + "step": 26966 + }, + { + "epoch": 1.255511325278767, + "grad_norm": 0.3133356699605501, + "learning_rate": 7.215117163663872e-05, + "loss": 2.8319, + "step": 26967 + }, + { + "epoch": 1.2555578834648602, + "grad_norm": 0.33598893604602725, + "learning_rate": 7.214874319463993e-05, + "loss": 2.8478, + "step": 26968 + }, + { + "epoch": 1.2556044416509533, + "grad_norm": 0.3383681045953801, + "learning_rate": 7.214631468763676e-05, + "loss": 2.8678, + "step": 26969 + }, + { + "epoch": 1.2556509998370464, + "grad_norm": 0.3576214989748959, + "learning_rate": 7.214388611563634e-05, + "loss": 2.7682, + "step": 26970 + }, + { + "epoch": 1.2556975580231393, + "grad_norm": 0.32489935811987525, + "learning_rate": 7.214145747864579e-05, + "loss": 2.8313, + "step": 26971 + }, + { + "epoch": 1.2557441162092324, + "grad_norm": 0.33421447935696974, + "learning_rate": 7.213902877667223e-05, + "loss": 2.7738, + "step": 26972 + }, + { + "epoch": 1.2557906743953255, + "grad_norm": 0.33151418556576, + "learning_rate": 7.213660000972283e-05, + "loss": 2.7468, + "step": 26973 + }, + { + "epoch": 1.2558372325814187, + "grad_norm": 0.32107395910916436, + "learning_rate": 7.213417117780468e-05, + "loss": 2.7854, + "step": 26974 + }, + { + "epoch": 1.2558837907675118, + "grad_norm": 0.3471828523147679, + "learning_rate": 7.213174228092491e-05, + "loss": 2.7869, + "step": 26975 + }, + { + "epoch": 1.255930348953605, + "grad_norm": 0.3502524735067032, + "learning_rate": 7.212931331909066e-05, + "loss": 2.845, + "step": 26976 + }, + { + "epoch": 1.2559769071396978, + "grad_norm": 0.3233898554489324, + "learning_rate": 7.212688429230906e-05, + "loss": 2.7806, + "step": 26977 + }, + { + "epoch": 
1.256023465325791, + "grad_norm": 0.34044098925513555, + "learning_rate": 7.212445520058724e-05, + "loss": 2.8652, + "step": 26978 + }, + { + "epoch": 1.256070023511884, + "grad_norm": 0.3256990202045375, + "learning_rate": 7.212202604393231e-05, + "loss": 2.7849, + "step": 26979 + }, + { + "epoch": 1.256116581697977, + "grad_norm": 0.34555558964444116, + "learning_rate": 7.21195968223514e-05, + "loss": 2.7967, + "step": 26980 + }, + { + "epoch": 1.25616313988407, + "grad_norm": 0.335687279563491, + "learning_rate": 7.211716753585169e-05, + "loss": 2.8166, + "step": 26981 + }, + { + "epoch": 1.2562096980701631, + "grad_norm": 0.3330426135904864, + "learning_rate": 7.211473818444026e-05, + "loss": 2.763, + "step": 26982 + }, + { + "epoch": 1.2562562562562563, + "grad_norm": 0.36094350324718366, + "learning_rate": 7.211230876812423e-05, + "loss": 2.7914, + "step": 26983 + }, + { + "epoch": 1.2563028144423494, + "grad_norm": 0.32910567798297324, + "learning_rate": 7.210987928691078e-05, + "loss": 2.9099, + "step": 26984 + }, + { + "epoch": 1.2563493726284425, + "grad_norm": 0.3407018625145278, + "learning_rate": 7.2107449740807e-05, + "loss": 2.7334, + "step": 26985 + }, + { + "epoch": 1.2563959308145356, + "grad_norm": 0.32660302029991695, + "learning_rate": 7.210502012982003e-05, + "loss": 2.8353, + "step": 26986 + }, + { + "epoch": 1.2564424890006285, + "grad_norm": 0.30171269239019644, + "learning_rate": 7.2102590453957e-05, + "loss": 2.674, + "step": 26987 + }, + { + "epoch": 1.2564890471867216, + "grad_norm": 0.3715753111325325, + "learning_rate": 7.210016071322505e-05, + "loss": 2.7343, + "step": 26988 + }, + { + "epoch": 1.2565356053728147, + "grad_norm": 0.3150128874113438, + "learning_rate": 7.20977309076313e-05, + "loss": 2.8622, + "step": 26989 + }, + { + "epoch": 1.2565821635589076, + "grad_norm": 0.33168594975081717, + "learning_rate": 7.20953010371829e-05, + "loss": 2.7214, + "step": 26990 + }, + { + "epoch": 1.2566287217450007, + "grad_norm": 0.3271517232152054, + "learning_rate": 7.209287110188697e-05, + "loss": 2.7417, + "step": 26991 + }, + { + "epoch": 1.2566752799310938, + "grad_norm": 0.35956594464313274, + "learning_rate": 7.209044110175062e-05, + "loss": 2.8192, + "step": 26992 + }, + { + "epoch": 1.256721838117187, + "grad_norm": 0.34705427316949433, + "learning_rate": 7.208801103678102e-05, + "loss": 2.8095, + "step": 26993 + }, + { + "epoch": 1.25676839630328, + "grad_norm": 0.33573205412736307, + "learning_rate": 7.208558090698527e-05, + "loss": 2.7869, + "step": 26994 + }, + { + "epoch": 1.2568149544893732, + "grad_norm": 0.32661393969381436, + "learning_rate": 7.208315071237053e-05, + "loss": 2.7064, + "step": 26995 + }, + { + "epoch": 1.256861512675466, + "grad_norm": 0.3126465353095605, + "learning_rate": 7.20807204529439e-05, + "loss": 2.7065, + "step": 26996 + }, + { + "epoch": 1.2569080708615592, + "grad_norm": 0.3321817302398732, + "learning_rate": 7.207829012871254e-05, + "loss": 2.7082, + "step": 26997 + }, + { + "epoch": 1.2569546290476523, + "grad_norm": 0.3226890291057112, + "learning_rate": 7.207585973968358e-05, + "loss": 2.8082, + "step": 26998 + }, + { + "epoch": 1.2570011872337454, + "grad_norm": 0.3588790039109765, + "learning_rate": 7.207342928586414e-05, + "loss": 2.7371, + "step": 26999 + }, + { + "epoch": 1.2570477454198383, + "grad_norm": 0.31672448579168017, + "learning_rate": 7.207099876726137e-05, + "loss": 2.7617, + "step": 27000 + }, + { + "epoch": 1.2570943036059314, + "grad_norm": 0.3299690594947103, + "learning_rate": 
7.206856818388238e-05, + "loss": 2.7889, + "step": 27001 + }, + { + "epoch": 1.2571408617920246, + "grad_norm": 0.3079269762692978, + "learning_rate": 7.206613753573432e-05, + "loss": 2.7601, + "step": 27002 + }, + { + "epoch": 1.2571874199781177, + "grad_norm": 0.33468218252462856, + "learning_rate": 7.206370682282433e-05, + "loss": 2.8266, + "step": 27003 + }, + { + "epoch": 1.2572339781642108, + "grad_norm": 0.34487783603300576, + "learning_rate": 7.206127604515951e-05, + "loss": 2.7632, + "step": 27004 + }, + { + "epoch": 1.257280536350304, + "grad_norm": 0.3113106578789782, + "learning_rate": 7.205884520274704e-05, + "loss": 2.7426, + "step": 27005 + }, + { + "epoch": 1.2573270945363968, + "grad_norm": 0.32569512029759007, + "learning_rate": 7.205641429559402e-05, + "loss": 2.7995, + "step": 27006 + }, + { + "epoch": 1.25737365272249, + "grad_norm": 0.3198351589699826, + "learning_rate": 7.205398332370761e-05, + "loss": 2.8402, + "step": 27007 + }, + { + "epoch": 1.257420210908583, + "grad_norm": 0.3249119991013574, + "learning_rate": 7.205155228709491e-05, + "loss": 2.746, + "step": 27008 + }, + { + "epoch": 1.2574667690946761, + "grad_norm": 0.31051128688841734, + "learning_rate": 7.20491211857631e-05, + "loss": 2.6515, + "step": 27009 + }, + { + "epoch": 1.257513327280769, + "grad_norm": 0.30672893243116417, + "learning_rate": 7.204669001971928e-05, + "loss": 2.7954, + "step": 27010 + }, + { + "epoch": 1.2575598854668621, + "grad_norm": 0.3342498349894861, + "learning_rate": 7.20442587889706e-05, + "loss": 2.7062, + "step": 27011 + }, + { + "epoch": 1.2576064436529553, + "grad_norm": 0.3155191447923252, + "learning_rate": 7.204182749352419e-05, + "loss": 2.7833, + "step": 27012 + }, + { + "epoch": 1.2576530018390484, + "grad_norm": 0.3498995449199473, + "learning_rate": 7.20393961333872e-05, + "loss": 2.8126, + "step": 27013 + }, + { + "epoch": 1.2576995600251415, + "grad_norm": 0.31325033431765675, + "learning_rate": 7.203696470856673e-05, + "loss": 2.6944, + "step": 27014 + }, + { + "epoch": 1.2577461182112346, + "grad_norm": 0.34132221699885984, + "learning_rate": 7.203453321906995e-05, + "loss": 2.7401, + "step": 27015 + }, + { + "epoch": 1.2577926763973275, + "grad_norm": 0.38542267454594725, + "learning_rate": 7.2032101664904e-05, + "loss": 2.8191, + "step": 27016 + }, + { + "epoch": 1.2578392345834206, + "grad_norm": 0.37408998737800736, + "learning_rate": 7.202967004607598e-05, + "loss": 2.8239, + "step": 27017 + }, + { + "epoch": 1.2578857927695137, + "grad_norm": 0.3470988950752111, + "learning_rate": 7.202723836259306e-05, + "loss": 2.6665, + "step": 27018 + }, + { + "epoch": 1.2579323509556068, + "grad_norm": 0.3984062148919179, + "learning_rate": 7.202480661446237e-05, + "loss": 2.7708, + "step": 27019 + }, + { + "epoch": 1.2579789091416997, + "grad_norm": 0.3435250294184553, + "learning_rate": 7.202237480169102e-05, + "loss": 2.7608, + "step": 27020 + }, + { + "epoch": 1.2580254673277929, + "grad_norm": 0.34224953191172475, + "learning_rate": 7.20199429242862e-05, + "loss": 2.7753, + "step": 27021 + }, + { + "epoch": 1.258072025513886, + "grad_norm": 0.342698291181316, + "learning_rate": 7.201751098225502e-05, + "loss": 2.8057, + "step": 27022 + }, + { + "epoch": 1.258118583699979, + "grad_norm": 0.3463484658544194, + "learning_rate": 7.201507897560457e-05, + "loss": 2.7816, + "step": 27023 + }, + { + "epoch": 1.2581651418860722, + "grad_norm": 0.3341191075274622, + "learning_rate": 7.201264690434207e-05, + "loss": 2.8331, + "step": 27024 + }, + { + "epoch": 
1.2582117000721653, + "grad_norm": 0.3074102253374372, + "learning_rate": 7.201021476847461e-05, + "loss": 2.7358, + "step": 27025 + }, + { + "epoch": 1.2582582582582582, + "grad_norm": 0.35053941609365075, + "learning_rate": 7.200778256800933e-05, + "loss": 2.7152, + "step": 27026 + }, + { + "epoch": 1.2583048164443513, + "grad_norm": 0.3400232569058652, + "learning_rate": 7.200535030295339e-05, + "loss": 2.7508, + "step": 27027 + }, + { + "epoch": 1.2583513746304444, + "grad_norm": 0.3198788033576351, + "learning_rate": 7.20029179733139e-05, + "loss": 2.8282, + "step": 27028 + }, + { + "epoch": 1.2583979328165373, + "grad_norm": 0.3351874707595801, + "learning_rate": 7.200048557909804e-05, + "loss": 2.8022, + "step": 27029 + }, + { + "epoch": 1.2584444910026304, + "grad_norm": 0.31460475251593045, + "learning_rate": 7.199805312031289e-05, + "loss": 2.7266, + "step": 27030 + }, + { + "epoch": 1.2584910491887236, + "grad_norm": 0.3440001159010273, + "learning_rate": 7.199562059696564e-05, + "loss": 2.7756, + "step": 27031 + }, + { + "epoch": 1.2585376073748167, + "grad_norm": 0.3239846961283872, + "learning_rate": 7.199318800906341e-05, + "loss": 2.7347, + "step": 27032 + }, + { + "epoch": 1.2585841655609098, + "grad_norm": 0.33258448295072246, + "learning_rate": 7.199075535661332e-05, + "loss": 2.771, + "step": 27033 + }, + { + "epoch": 1.258630723747003, + "grad_norm": 0.362722912618912, + "learning_rate": 7.198832263962255e-05, + "loss": 2.7496, + "step": 27034 + }, + { + "epoch": 1.2586772819330958, + "grad_norm": 0.3497367243127419, + "learning_rate": 7.198588985809821e-05, + "loss": 2.792, + "step": 27035 + }, + { + "epoch": 1.258723840119189, + "grad_norm": 0.37801946412792076, + "learning_rate": 7.198345701204746e-05, + "loss": 2.7197, + "step": 27036 + }, + { + "epoch": 1.258770398305282, + "grad_norm": 0.3584084367322822, + "learning_rate": 7.198102410147742e-05, + "loss": 2.7775, + "step": 27037 + }, + { + "epoch": 1.2588169564913751, + "grad_norm": 0.37539337437823156, + "learning_rate": 7.197859112639525e-05, + "loss": 2.6539, + "step": 27038 + }, + { + "epoch": 1.258863514677468, + "grad_norm": 0.34443274738640234, + "learning_rate": 7.197615808680807e-05, + "loss": 2.6998, + "step": 27039 + }, + { + "epoch": 1.2589100728635612, + "grad_norm": 0.37289842886271135, + "learning_rate": 7.197372498272304e-05, + "loss": 2.7892, + "step": 27040 + }, + { + "epoch": 1.2589566310496543, + "grad_norm": 0.3467118407012034, + "learning_rate": 7.197129181414727e-05, + "loss": 2.6838, + "step": 27041 + }, + { + "epoch": 1.2590031892357474, + "grad_norm": 0.3554561658886045, + "learning_rate": 7.196885858108795e-05, + "loss": 2.85, + "step": 27042 + }, + { + "epoch": 1.2590497474218405, + "grad_norm": 0.3338303648927502, + "learning_rate": 7.196642528355218e-05, + "loss": 2.7588, + "step": 27043 + }, + { + "epoch": 1.2590963056079336, + "grad_norm": 0.35612228132425156, + "learning_rate": 7.196399192154713e-05, + "loss": 2.8512, + "step": 27044 + }, + { + "epoch": 1.2591428637940265, + "grad_norm": 0.32571838285284044, + "learning_rate": 7.19615584950799e-05, + "loss": 2.8262, + "step": 27045 + }, + { + "epoch": 1.2591894219801196, + "grad_norm": 0.32840200523295343, + "learning_rate": 7.19591250041577e-05, + "loss": 2.8013, + "step": 27046 + }, + { + "epoch": 1.2592359801662127, + "grad_norm": 0.33813253487348677, + "learning_rate": 7.195669144878758e-05, + "loss": 2.8468, + "step": 27047 + }, + { + "epoch": 1.2592825383523059, + "grad_norm": 0.31479128395759975, + "learning_rate": 
7.195425782897677e-05, + "loss": 2.8726, + "step": 27048 + }, + { + "epoch": 1.2593290965383988, + "grad_norm": 0.35075529007198814, + "learning_rate": 7.195182414473237e-05, + "loss": 2.7898, + "step": 27049 + }, + { + "epoch": 1.2593756547244919, + "grad_norm": 0.3302125062832777, + "learning_rate": 7.194939039606152e-05, + "loss": 2.7494, + "step": 27050 + }, + { + "epoch": 1.259422212910585, + "grad_norm": 0.3320731117418522, + "learning_rate": 7.194695658297138e-05, + "loss": 2.8857, + "step": 27051 + }, + { + "epoch": 1.259468771096678, + "grad_norm": 0.3622468559491834, + "learning_rate": 7.194452270546908e-05, + "loss": 2.8183, + "step": 27052 + }, + { + "epoch": 1.2595153292827712, + "grad_norm": 0.3585749899996492, + "learning_rate": 7.194208876356178e-05, + "loss": 2.8731, + "step": 27053 + }, + { + "epoch": 1.2595618874688643, + "grad_norm": 0.34263313671223883, + "learning_rate": 7.19396547572566e-05, + "loss": 2.7041, + "step": 27054 + }, + { + "epoch": 1.2596084456549572, + "grad_norm": 0.3466101098253712, + "learning_rate": 7.19372206865607e-05, + "loss": 2.6889, + "step": 27055 + }, + { + "epoch": 1.2596550038410503, + "grad_norm": 0.32331094691210077, + "learning_rate": 7.19347865514812e-05, + "loss": 2.8884, + "step": 27056 + }, + { + "epoch": 1.2597015620271435, + "grad_norm": 0.3787932716596055, + "learning_rate": 7.193235235202528e-05, + "loss": 2.7675, + "step": 27057 + }, + { + "epoch": 1.2597481202132366, + "grad_norm": 0.28706212771801914, + "learning_rate": 7.192991808820006e-05, + "loss": 2.6696, + "step": 27058 + }, + { + "epoch": 1.2597946783993295, + "grad_norm": 0.3462116115181931, + "learning_rate": 7.19274837600127e-05, + "loss": 2.7447, + "step": 27059 + }, + { + "epoch": 1.2598412365854226, + "grad_norm": 0.32526317945644767, + "learning_rate": 7.192504936747033e-05, + "loss": 2.839, + "step": 27060 + }, + { + "epoch": 1.2598877947715157, + "grad_norm": 0.34488366821547417, + "learning_rate": 7.19226149105801e-05, + "loss": 2.8015, + "step": 27061 + }, + { + "epoch": 1.2599343529576088, + "grad_norm": 0.33969532697372623, + "learning_rate": 7.192018038934914e-05, + "loss": 2.7332, + "step": 27062 + }, + { + "epoch": 1.259980911143702, + "grad_norm": 0.3437959392465613, + "learning_rate": 7.191774580378464e-05, + "loss": 2.7641, + "step": 27063 + }, + { + "epoch": 1.260027469329795, + "grad_norm": 0.32431033576379165, + "learning_rate": 7.19153111538937e-05, + "loss": 2.7565, + "step": 27064 + }, + { + "epoch": 1.260074027515888, + "grad_norm": 0.3491855911130861, + "learning_rate": 7.191287643968347e-05, + "loss": 2.7465, + "step": 27065 + }, + { + "epoch": 1.260120585701981, + "grad_norm": 0.33349105838812254, + "learning_rate": 7.191044166116112e-05, + "loss": 2.7159, + "step": 27066 + }, + { + "epoch": 1.2601671438880742, + "grad_norm": 0.3700896251002376, + "learning_rate": 7.190800681833378e-05, + "loss": 2.8173, + "step": 27067 + }, + { + "epoch": 1.260213702074167, + "grad_norm": 0.3278705749923721, + "learning_rate": 7.190557191120861e-05, + "loss": 2.7129, + "step": 27068 + }, + { + "epoch": 1.2602602602602602, + "grad_norm": 0.3489134464192946, + "learning_rate": 7.190313693979272e-05, + "loss": 2.8413, + "step": 27069 + }, + { + "epoch": 1.2603068184463533, + "grad_norm": 0.3244759580918791, + "learning_rate": 7.19007019040933e-05, + "loss": 2.7857, + "step": 27070 + }, + { + "epoch": 1.2603533766324464, + "grad_norm": 0.3867515344673581, + "learning_rate": 7.189826680411746e-05, + "loss": 2.7837, + "step": 27071 + }, + { + "epoch": 
1.2603999348185395, + "grad_norm": 0.31917102762508326, + "learning_rate": 7.18958316398724e-05, + "loss": 2.7625, + "step": 27072 + }, + { + "epoch": 1.2604464930046326, + "grad_norm": 0.3745564034612543, + "learning_rate": 7.189339641136518e-05, + "loss": 2.8236, + "step": 27073 + }, + { + "epoch": 1.2604930511907257, + "grad_norm": 0.34421563456019555, + "learning_rate": 7.189096111860303e-05, + "loss": 2.7851, + "step": 27074 + }, + { + "epoch": 1.2605396093768186, + "grad_norm": 0.35262313805149886, + "learning_rate": 7.188852576159306e-05, + "loss": 2.913, + "step": 27075 + }, + { + "epoch": 1.2605861675629118, + "grad_norm": 0.3358327937532336, + "learning_rate": 7.188609034034242e-05, + "loss": 2.7917, + "step": 27076 + }, + { + "epoch": 1.2606327257490049, + "grad_norm": 0.33161244142992635, + "learning_rate": 7.188365485485828e-05, + "loss": 2.7381, + "step": 27077 + }, + { + "epoch": 1.2606792839350978, + "grad_norm": 0.30961885009060786, + "learning_rate": 7.188121930514774e-05, + "loss": 2.6976, + "step": 27078 + }, + { + "epoch": 1.2607258421211909, + "grad_norm": 0.3216528872621166, + "learning_rate": 7.1878783691218e-05, + "loss": 2.7428, + "step": 27079 + }, + { + "epoch": 1.260772400307284, + "grad_norm": 0.29666672010296985, + "learning_rate": 7.187634801307616e-05, + "loss": 2.7186, + "step": 27080 + }, + { + "epoch": 1.260818958493377, + "grad_norm": 0.32193433085871503, + "learning_rate": 7.187391227072943e-05, + "loss": 2.8202, + "step": 27081 + }, + { + "epoch": 1.2608655166794702, + "grad_norm": 0.34287017357787575, + "learning_rate": 7.18714764641849e-05, + "loss": 2.7442, + "step": 27082 + }, + { + "epoch": 1.2609120748655633, + "grad_norm": 0.3063126041721597, + "learning_rate": 7.186904059344973e-05, + "loss": 2.8009, + "step": 27083 + }, + { + "epoch": 1.2609586330516562, + "grad_norm": 0.34099125336587227, + "learning_rate": 7.18666046585311e-05, + "loss": 2.7905, + "step": 27084 + }, + { + "epoch": 1.2610051912377493, + "grad_norm": 0.30933323273297353, + "learning_rate": 7.186416865943614e-05, + "loss": 2.8836, + "step": 27085 + }, + { + "epoch": 1.2610517494238425, + "grad_norm": 0.3522458708393977, + "learning_rate": 7.186173259617198e-05, + "loss": 2.8115, + "step": 27086 + }, + { + "epoch": 1.2610983076099356, + "grad_norm": 0.3251722379674008, + "learning_rate": 7.185929646874581e-05, + "loss": 2.7777, + "step": 27087 + }, + { + "epoch": 1.2611448657960285, + "grad_norm": 0.3373186619960764, + "learning_rate": 7.185686027716474e-05, + "loss": 2.666, + "step": 27088 + }, + { + "epoch": 1.2611914239821216, + "grad_norm": 0.3315742032552603, + "learning_rate": 7.185442402143594e-05, + "loss": 2.7926, + "step": 27089 + }, + { + "epoch": 1.2612379821682147, + "grad_norm": 0.3438821606853357, + "learning_rate": 7.185198770156659e-05, + "loss": 2.7687, + "step": 27090 + }, + { + "epoch": 1.2612845403543078, + "grad_norm": 0.31832118660297193, + "learning_rate": 7.184955131756377e-05, + "loss": 2.7652, + "step": 27091 + }, + { + "epoch": 1.261331098540401, + "grad_norm": 0.35699244906678446, + "learning_rate": 7.184711486943469e-05, + "loss": 2.7891, + "step": 27092 + }, + { + "epoch": 1.261377656726494, + "grad_norm": 0.3368121516994266, + "learning_rate": 7.184467835718646e-05, + "loss": 2.7096, + "step": 27093 + }, + { + "epoch": 1.261424214912587, + "grad_norm": 0.35922051617098716, + "learning_rate": 7.184224178082628e-05, + "loss": 2.678, + "step": 27094 + }, + { + "epoch": 1.26147077309868, + "grad_norm": 0.3758669590450349, + "learning_rate": 
7.183980514036126e-05, + "loss": 2.8133, + "step": 27095 + }, + { + "epoch": 1.2615173312847732, + "grad_norm": 0.35417600701304225, + "learning_rate": 7.183736843579856e-05, + "loss": 2.7543, + "step": 27096 + }, + { + "epoch": 1.2615638894708663, + "grad_norm": 0.374648477857062, + "learning_rate": 7.183493166714532e-05, + "loss": 2.7662, + "step": 27097 + }, + { + "epoch": 1.2616104476569592, + "grad_norm": 0.31492399471957017, + "learning_rate": 7.183249483440873e-05, + "loss": 2.7814, + "step": 27098 + }, + { + "epoch": 1.2616570058430523, + "grad_norm": 0.32622325996196433, + "learning_rate": 7.183005793759592e-05, + "loss": 2.8386, + "step": 27099 + }, + { + "epoch": 1.2617035640291454, + "grad_norm": 0.34786932754535294, + "learning_rate": 7.182762097671402e-05, + "loss": 2.7946, + "step": 27100 + }, + { + "epoch": 1.2617501222152385, + "grad_norm": 0.3402108861200794, + "learning_rate": 7.182518395177021e-05, + "loss": 2.7844, + "step": 27101 + }, + { + "epoch": 1.2617966804013316, + "grad_norm": 0.34175102333502283, + "learning_rate": 7.182274686277164e-05, + "loss": 2.891, + "step": 27102 + }, + { + "epoch": 1.2618432385874248, + "grad_norm": 0.34228466168958904, + "learning_rate": 7.182030970972546e-05, + "loss": 2.8601, + "step": 27103 + }, + { + "epoch": 1.2618897967735176, + "grad_norm": 0.34234271358026463, + "learning_rate": 7.181787249263881e-05, + "loss": 2.7144, + "step": 27104 + }, + { + "epoch": 1.2619363549596108, + "grad_norm": 0.32241623320395396, + "learning_rate": 7.181543521151887e-05, + "loss": 2.8526, + "step": 27105 + }, + { + "epoch": 1.2619829131457039, + "grad_norm": 0.3370811170159731, + "learning_rate": 7.181299786637278e-05, + "loss": 2.8847, + "step": 27106 + }, + { + "epoch": 1.262029471331797, + "grad_norm": 0.33904746083408277, + "learning_rate": 7.181056045720767e-05, + "loss": 2.8707, + "step": 27107 + }, + { + "epoch": 1.2620760295178899, + "grad_norm": 0.3434487499306827, + "learning_rate": 7.180812298403073e-05, + "loss": 2.7393, + "step": 27108 + }, + { + "epoch": 1.262122587703983, + "grad_norm": 0.32566649697726535, + "learning_rate": 7.18056854468491e-05, + "loss": 2.7536, + "step": 27109 + }, + { + "epoch": 1.2621691458900761, + "grad_norm": 0.3589378638058736, + "learning_rate": 7.180324784566991e-05, + "loss": 2.8578, + "step": 27110 + }, + { + "epoch": 1.2622157040761692, + "grad_norm": 0.3149165475456209, + "learning_rate": 7.180081018050034e-05, + "loss": 2.6957, + "step": 27111 + }, + { + "epoch": 1.2622622622622623, + "grad_norm": 0.3414922861832536, + "learning_rate": 7.179837245134755e-05, + "loss": 2.752, + "step": 27112 + }, + { + "epoch": 1.2623088204483555, + "grad_norm": 0.3203733687084584, + "learning_rate": 7.179593465821867e-05, + "loss": 2.8251, + "step": 27113 + }, + { + "epoch": 1.2623553786344484, + "grad_norm": 0.3336797964635459, + "learning_rate": 7.179349680112088e-05, + "loss": 2.676, + "step": 27114 + }, + { + "epoch": 1.2624019368205415, + "grad_norm": 0.3355210294076395, + "learning_rate": 7.179105888006133e-05, + "loss": 2.8889, + "step": 27115 + }, + { + "epoch": 1.2624484950066346, + "grad_norm": 0.3274690754826084, + "learning_rate": 7.178862089504717e-05, + "loss": 2.7173, + "step": 27116 + }, + { + "epoch": 1.2624950531927275, + "grad_norm": 0.3277044742391772, + "learning_rate": 7.178618284608554e-05, + "loss": 2.8295, + "step": 27117 + }, + { + "epoch": 1.2625416113788206, + "grad_norm": 0.3253119201971104, + "learning_rate": 7.178374473318364e-05, + "loss": 2.707, + "step": 27118 + }, + { + "epoch": 
1.2625881695649137, + "grad_norm": 0.30348967344643407, + "learning_rate": 7.178130655634857e-05, + "loss": 2.7103, + "step": 27119 + }, + { + "epoch": 1.2626347277510068, + "grad_norm": 0.3335602349677604, + "learning_rate": 7.17788683155875e-05, + "loss": 2.7331, + "step": 27120 + }, + { + "epoch": 1.2626812859371, + "grad_norm": 0.3189199471865573, + "learning_rate": 7.177643001090762e-05, + "loss": 2.7733, + "step": 27121 + }, + { + "epoch": 1.262727844123193, + "grad_norm": 0.35783379863166676, + "learning_rate": 7.177399164231604e-05, + "loss": 2.731, + "step": 27122 + }, + { + "epoch": 1.262774402309286, + "grad_norm": 0.3414641969348151, + "learning_rate": 7.177155320981995e-05, + "loss": 2.7974, + "step": 27123 + }, + { + "epoch": 1.262820960495379, + "grad_norm": 0.33183519498882214, + "learning_rate": 7.17691147134265e-05, + "loss": 2.82, + "step": 27124 + }, + { + "epoch": 1.2628675186814722, + "grad_norm": 0.3448690584808969, + "learning_rate": 7.176667615314284e-05, + "loss": 2.8165, + "step": 27125 + }, + { + "epoch": 1.2629140768675653, + "grad_norm": 0.354638771437061, + "learning_rate": 7.176423752897614e-05, + "loss": 2.8126, + "step": 27126 + }, + { + "epoch": 1.2629606350536582, + "grad_norm": 0.35494426973838483, + "learning_rate": 7.176179884093354e-05, + "loss": 2.7595, + "step": 27127 + }, + { + "epoch": 1.2630071932397513, + "grad_norm": 0.3648728182361681, + "learning_rate": 7.17593600890222e-05, + "loss": 2.8171, + "step": 27128 + }, + { + "epoch": 1.2630537514258444, + "grad_norm": 0.35746036204095594, + "learning_rate": 7.175692127324928e-05, + "loss": 2.8024, + "step": 27129 + }, + { + "epoch": 1.2631003096119375, + "grad_norm": 0.3957413816356702, + "learning_rate": 7.175448239362195e-05, + "loss": 2.7467, + "step": 27130 + }, + { + "epoch": 1.2631468677980306, + "grad_norm": 0.3513233170905545, + "learning_rate": 7.175204345014737e-05, + "loss": 2.7691, + "step": 27131 + }, + { + "epoch": 1.2631934259841238, + "grad_norm": 0.3899142705403181, + "learning_rate": 7.174960444283265e-05, + "loss": 2.7249, + "step": 27132 + }, + { + "epoch": 1.2632399841702167, + "grad_norm": 0.3966846895963745, + "learning_rate": 7.1747165371685e-05, + "loss": 2.8646, + "step": 27133 + }, + { + "epoch": 1.2632865423563098, + "grad_norm": 0.33081354953501574, + "learning_rate": 7.174472623671157e-05, + "loss": 2.7405, + "step": 27134 + }, + { + "epoch": 1.2633331005424029, + "grad_norm": 0.3920172529128778, + "learning_rate": 7.174228703791951e-05, + "loss": 2.7282, + "step": 27135 + }, + { + "epoch": 1.263379658728496, + "grad_norm": 0.29826372237637555, + "learning_rate": 7.173984777531597e-05, + "loss": 2.75, + "step": 27136 + }, + { + "epoch": 1.263426216914589, + "grad_norm": 0.36528686285844286, + "learning_rate": 7.173740844890811e-05, + "loss": 2.8727, + "step": 27137 + }, + { + "epoch": 1.263472775100682, + "grad_norm": 0.33263408395226857, + "learning_rate": 7.173496905870312e-05, + "loss": 2.8216, + "step": 27138 + }, + { + "epoch": 1.2635193332867751, + "grad_norm": 0.3251514214021756, + "learning_rate": 7.173252960470813e-05, + "loss": 2.7492, + "step": 27139 + }, + { + "epoch": 1.2635658914728682, + "grad_norm": 0.3320340817833819, + "learning_rate": 7.17300900869303e-05, + "loss": 2.7537, + "step": 27140 + }, + { + "epoch": 1.2636124496589614, + "grad_norm": 0.3196922867505699, + "learning_rate": 7.172765050537679e-05, + "loss": 2.823, + "step": 27141 + }, + { + "epoch": 1.2636590078450545, + "grad_norm": 0.3154788623959028, + "learning_rate": 
7.172521086005478e-05, + "loss": 2.7829, + "step": 27142 + }, + { + "epoch": 1.2637055660311474, + "grad_norm": 0.3295270579176727, + "learning_rate": 7.172277115097141e-05, + "loss": 2.7655, + "step": 27143 + }, + { + "epoch": 1.2637521242172405, + "grad_norm": 0.3091875538788555, + "learning_rate": 7.172033137813386e-05, + "loss": 2.7727, + "step": 27144 + }, + { + "epoch": 1.2637986824033336, + "grad_norm": 0.3243043306300235, + "learning_rate": 7.171789154154926e-05, + "loss": 2.8123, + "step": 27145 + }, + { + "epoch": 1.2638452405894267, + "grad_norm": 0.3528235171598297, + "learning_rate": 7.17154516412248e-05, + "loss": 2.8401, + "step": 27146 + }, + { + "epoch": 1.2638917987755196, + "grad_norm": 0.30851079493393935, + "learning_rate": 7.171301167716762e-05, + "loss": 2.8023, + "step": 27147 + }, + { + "epoch": 1.2639383569616127, + "grad_norm": 0.32616395576036444, + "learning_rate": 7.17105716493849e-05, + "loss": 2.6932, + "step": 27148 + }, + { + "epoch": 1.2639849151477058, + "grad_norm": 0.3382328321554806, + "learning_rate": 7.170813155788378e-05, + "loss": 2.8363, + "step": 27149 + }, + { + "epoch": 1.264031473333799, + "grad_norm": 0.3315717976756031, + "learning_rate": 7.170569140267142e-05, + "loss": 2.8889, + "step": 27150 + }, + { + "epoch": 1.264078031519892, + "grad_norm": 0.3298395936688945, + "learning_rate": 7.170325118375502e-05, + "loss": 2.7267, + "step": 27151 + }, + { + "epoch": 1.2641245897059852, + "grad_norm": 0.3431349127181067, + "learning_rate": 7.170081090114171e-05, + "loss": 2.7733, + "step": 27152 + }, + { + "epoch": 1.264171147892078, + "grad_norm": 0.3115422498132179, + "learning_rate": 7.169837055483866e-05, + "loss": 2.7148, + "step": 27153 + }, + { + "epoch": 1.2642177060781712, + "grad_norm": 0.3578110713549033, + "learning_rate": 7.169593014485302e-05, + "loss": 2.7651, + "step": 27154 + }, + { + "epoch": 1.2642642642642643, + "grad_norm": 0.289565872707501, + "learning_rate": 7.169348967119196e-05, + "loss": 2.7676, + "step": 27155 + }, + { + "epoch": 1.2643108224503572, + "grad_norm": 0.353898496385773, + "learning_rate": 7.169104913386267e-05, + "loss": 2.9122, + "step": 27156 + }, + { + "epoch": 1.2643573806364503, + "grad_norm": 0.31739862359208293, + "learning_rate": 7.168860853287227e-05, + "loss": 2.7978, + "step": 27157 + }, + { + "epoch": 1.2644039388225434, + "grad_norm": 0.33749007401517145, + "learning_rate": 7.168616786822792e-05, + "loss": 2.6899, + "step": 27158 + }, + { + "epoch": 1.2644504970086365, + "grad_norm": 0.30028991699160845, + "learning_rate": 7.168372713993683e-05, + "loss": 2.6914, + "step": 27159 + }, + { + "epoch": 1.2644970551947297, + "grad_norm": 0.3343742300538591, + "learning_rate": 7.168128634800613e-05, + "loss": 2.7273, + "step": 27160 + }, + { + "epoch": 1.2645436133808228, + "grad_norm": 0.30018254646047954, + "learning_rate": 7.167884549244299e-05, + "loss": 2.751, + "step": 27161 + }, + { + "epoch": 1.2645901715669159, + "grad_norm": 0.30117470795011986, + "learning_rate": 7.167640457325459e-05, + "loss": 2.6897, + "step": 27162 + }, + { + "epoch": 1.2646367297530088, + "grad_norm": 0.3254798665942688, + "learning_rate": 7.167396359044805e-05, + "loss": 2.8222, + "step": 27163 + }, + { + "epoch": 1.264683287939102, + "grad_norm": 0.3188628813568995, + "learning_rate": 7.167152254403057e-05, + "loss": 2.8535, + "step": 27164 + }, + { + "epoch": 1.264729846125195, + "grad_norm": 0.309565428660199, + "learning_rate": 7.166908143400932e-05, + "loss": 2.7994, + "step": 27165 + }, + { + "epoch": 
1.264776404311288, + "grad_norm": 0.30585050824445315, + "learning_rate": 7.166664026039145e-05, + "loss": 2.7767, + "step": 27166 + }, + { + "epoch": 1.264822962497381, + "grad_norm": 0.32385411212837284, + "learning_rate": 7.16641990231841e-05, + "loss": 2.832, + "step": 27167 + }, + { + "epoch": 1.2648695206834741, + "grad_norm": 0.31436007332504556, + "learning_rate": 7.166175772239449e-05, + "loss": 2.7906, + "step": 27168 + }, + { + "epoch": 1.2649160788695673, + "grad_norm": 0.3027363004863869, + "learning_rate": 7.165931635802975e-05, + "loss": 2.7093, + "step": 27169 + }, + { + "epoch": 1.2649626370556604, + "grad_norm": 0.3174361443131162, + "learning_rate": 7.165687493009703e-05, + "loss": 2.8017, + "step": 27170 + }, + { + "epoch": 1.2650091952417535, + "grad_norm": 0.32246483111289576, + "learning_rate": 7.165443343860355e-05, + "loss": 2.8956, + "step": 27171 + }, + { + "epoch": 1.2650557534278464, + "grad_norm": 0.33644061835054334, + "learning_rate": 7.165199188355642e-05, + "loss": 2.7598, + "step": 27172 + }, + { + "epoch": 1.2651023116139395, + "grad_norm": 0.33927122728013004, + "learning_rate": 7.164955026496282e-05, + "loss": 2.7947, + "step": 27173 + }, + { + "epoch": 1.2651488698000326, + "grad_norm": 0.3212062280917655, + "learning_rate": 7.164710858282993e-05, + "loss": 2.7589, + "step": 27174 + }, + { + "epoch": 1.2651954279861257, + "grad_norm": 0.32585051040673235, + "learning_rate": 7.164466683716491e-05, + "loss": 2.9431, + "step": 27175 + }, + { + "epoch": 1.2652419861722186, + "grad_norm": 0.3126846465554972, + "learning_rate": 7.164222502797492e-05, + "loss": 2.8134, + "step": 27176 + }, + { + "epoch": 1.2652885443583117, + "grad_norm": 0.34200980718391216, + "learning_rate": 7.163978315526714e-05, + "loss": 2.698, + "step": 27177 + }, + { + "epoch": 1.2653351025444048, + "grad_norm": 0.3132296343748134, + "learning_rate": 7.163734121904871e-05, + "loss": 2.689, + "step": 27178 + }, + { + "epoch": 1.265381660730498, + "grad_norm": 0.33514673631212555, + "learning_rate": 7.163489921932683e-05, + "loss": 2.8963, + "step": 27179 + }, + { + "epoch": 1.265428218916591, + "grad_norm": 0.38151814554248187, + "learning_rate": 7.163245715610867e-05, + "loss": 2.7903, + "step": 27180 + }, + { + "epoch": 1.2654747771026842, + "grad_norm": 0.3126417491848295, + "learning_rate": 7.163001502940135e-05, + "loss": 2.7388, + "step": 27181 + }, + { + "epoch": 1.265521335288777, + "grad_norm": 0.36981629919876513, + "learning_rate": 7.162757283921208e-05, + "loss": 2.789, + "step": 27182 + }, + { + "epoch": 1.2655678934748702, + "grad_norm": 0.3394141241554634, + "learning_rate": 7.162513058554802e-05, + "loss": 2.7463, + "step": 27183 + }, + { + "epoch": 1.2656144516609633, + "grad_norm": 0.3488998559531148, + "learning_rate": 7.162268826841631e-05, + "loss": 2.7487, + "step": 27184 + }, + { + "epoch": 1.2656610098470564, + "grad_norm": 0.33728129355751557, + "learning_rate": 7.162024588782416e-05, + "loss": 2.7716, + "step": 27185 + }, + { + "epoch": 1.2657075680331493, + "grad_norm": 0.3406527930552307, + "learning_rate": 7.161780344377871e-05, + "loss": 2.851, + "step": 27186 + }, + { + "epoch": 1.2657541262192424, + "grad_norm": 0.3263329532340226, + "learning_rate": 7.161536093628715e-05, + "loss": 2.6377, + "step": 27187 + }, + { + "epoch": 1.2658006844053356, + "grad_norm": 0.32805994976709013, + "learning_rate": 7.161291836535661e-05, + "loss": 2.7737, + "step": 27188 + }, + { + "epoch": 1.2658472425914287, + "grad_norm": 0.32940827144994483, + "learning_rate": 
7.161047573099431e-05, + "loss": 2.8576, + "step": 27189 + }, + { + "epoch": 1.2658938007775218, + "grad_norm": 0.3169749673450179, + "learning_rate": 7.160803303320738e-05, + "loss": 2.7293, + "step": 27190 + }, + { + "epoch": 1.265940358963615, + "grad_norm": 0.32981797444689714, + "learning_rate": 7.160559027200299e-05, + "loss": 2.842, + "step": 27191 + }, + { + "epoch": 1.2659869171497078, + "grad_norm": 0.35358306869292805, + "learning_rate": 7.160314744738834e-05, + "loss": 2.8508, + "step": 27192 + }, + { + "epoch": 1.266033475335801, + "grad_norm": 0.2901069048634095, + "learning_rate": 7.160070455937058e-05, + "loss": 2.7699, + "step": 27193 + }, + { + "epoch": 1.266080033521894, + "grad_norm": 0.3686384710970721, + "learning_rate": 7.159826160795687e-05, + "loss": 2.8588, + "step": 27194 + }, + { + "epoch": 1.2661265917079871, + "grad_norm": 0.3056463675594146, + "learning_rate": 7.159581859315441e-05, + "loss": 2.7749, + "step": 27195 + }, + { + "epoch": 1.26617314989408, + "grad_norm": 0.3513953443847333, + "learning_rate": 7.159337551497033e-05, + "loss": 2.8106, + "step": 27196 + }, + { + "epoch": 1.2662197080801731, + "grad_norm": 0.2881210488064745, + "learning_rate": 7.159093237341182e-05, + "loss": 2.8019, + "step": 27197 + }, + { + "epoch": 1.2662662662662663, + "grad_norm": 0.33463638743545887, + "learning_rate": 7.158848916848607e-05, + "loss": 2.8203, + "step": 27198 + }, + { + "epoch": 1.2663128244523594, + "grad_norm": 0.3158669687171635, + "learning_rate": 7.15860459002002e-05, + "loss": 2.8227, + "step": 27199 + }, + { + "epoch": 1.2663593826384525, + "grad_norm": 0.3672439853158372, + "learning_rate": 7.158360256856142e-05, + "loss": 2.7063, + "step": 27200 + }, + { + "epoch": 1.2664059408245456, + "grad_norm": 0.30389150854331126, + "learning_rate": 7.158115917357691e-05, + "loss": 2.7702, + "step": 27201 + }, + { + "epoch": 1.2664524990106385, + "grad_norm": 0.3418636430448293, + "learning_rate": 7.157871571525381e-05, + "loss": 2.7562, + "step": 27202 + }, + { + "epoch": 1.2664990571967316, + "grad_norm": 0.3378604632325794, + "learning_rate": 7.157627219359932e-05, + "loss": 2.881, + "step": 27203 + }, + { + "epoch": 1.2665456153828247, + "grad_norm": 0.3536859928971389, + "learning_rate": 7.157382860862059e-05, + "loss": 2.717, + "step": 27204 + }, + { + "epoch": 1.2665921735689176, + "grad_norm": 0.3251555710176004, + "learning_rate": 7.157138496032479e-05, + "loss": 2.7349, + "step": 27205 + }, + { + "epoch": 1.2666387317550107, + "grad_norm": 0.31086162734742173, + "learning_rate": 7.15689412487191e-05, + "loss": 2.7102, + "step": 27206 + }, + { + "epoch": 1.2666852899411039, + "grad_norm": 0.3482445063386275, + "learning_rate": 7.15664974738107e-05, + "loss": 2.7633, + "step": 27207 + }, + { + "epoch": 1.266731848127197, + "grad_norm": 0.30388891799253076, + "learning_rate": 7.156405363560676e-05, + "loss": 2.8324, + "step": 27208 + }, + { + "epoch": 1.26677840631329, + "grad_norm": 0.34787581937725165, + "learning_rate": 7.156160973411444e-05, + "loss": 2.8016, + "step": 27209 + }, + { + "epoch": 1.2668249644993832, + "grad_norm": 0.3356875049421595, + "learning_rate": 7.155916576934092e-05, + "loss": 2.7097, + "step": 27210 + }, + { + "epoch": 1.266871522685476, + "grad_norm": 0.33031573443069306, + "learning_rate": 7.155672174129336e-05, + "loss": 2.8513, + "step": 27211 + }, + { + "epoch": 1.2669180808715692, + "grad_norm": 0.33733968921170415, + "learning_rate": 7.155427764997896e-05, + "loss": 2.8091, + "step": 27212 + }, + { + "epoch": 
1.2669646390576623, + "grad_norm": 0.34966992593625784, + "learning_rate": 7.155183349540489e-05, + "loss": 2.8147, + "step": 27213 + }, + { + "epoch": 1.2670111972437554, + "grad_norm": 0.33405543003222915, + "learning_rate": 7.154938927757829e-05, + "loss": 2.8345, + "step": 27214 + }, + { + "epoch": 1.2670577554298483, + "grad_norm": 0.29871156372463187, + "learning_rate": 7.154694499650635e-05, + "loss": 2.7554, + "step": 27215 + }, + { + "epoch": 1.2671043136159414, + "grad_norm": 0.3770882865122635, + "learning_rate": 7.154450065219627e-05, + "loss": 2.7524, + "step": 27216 + }, + { + "epoch": 1.2671508718020346, + "grad_norm": 0.32853891220833986, + "learning_rate": 7.15420562446552e-05, + "loss": 2.8087, + "step": 27217 + }, + { + "epoch": 1.2671974299881277, + "grad_norm": 0.3494608320237596, + "learning_rate": 7.15396117738903e-05, + "loss": 2.8006, + "step": 27218 + }, + { + "epoch": 1.2672439881742208, + "grad_norm": 0.3331141211535617, + "learning_rate": 7.153716723990879e-05, + "loss": 2.7645, + "step": 27219 + }, + { + "epoch": 1.267290546360314, + "grad_norm": 0.34727582663246975, + "learning_rate": 7.153472264271778e-05, + "loss": 2.8224, + "step": 27220 + }, + { + "epoch": 1.2673371045464068, + "grad_norm": 0.3253650788460148, + "learning_rate": 7.153227798232452e-05, + "loss": 2.749, + "step": 27221 + }, + { + "epoch": 1.2673836627325, + "grad_norm": 0.3131287628965301, + "learning_rate": 7.152983325873613e-05, + "loss": 2.8483, + "step": 27222 + }, + { + "epoch": 1.267430220918593, + "grad_norm": 0.35399420078179944, + "learning_rate": 7.15273884719598e-05, + "loss": 2.7975, + "step": 27223 + }, + { + "epoch": 1.2674767791046861, + "grad_norm": 0.315679429897498, + "learning_rate": 7.152494362200268e-05, + "loss": 2.8598, + "step": 27224 + }, + { + "epoch": 1.267523337290779, + "grad_norm": 0.31304803773301987, + "learning_rate": 7.1522498708872e-05, + "loss": 2.7763, + "step": 27225 + }, + { + "epoch": 1.2675698954768722, + "grad_norm": 0.32011408950603626, + "learning_rate": 7.15200537325749e-05, + "loss": 2.8005, + "step": 27226 + }, + { + "epoch": 1.2676164536629653, + "grad_norm": 0.30605973078723137, + "learning_rate": 7.151760869311856e-05, + "loss": 2.833, + "step": 27227 + }, + { + "epoch": 1.2676630118490584, + "grad_norm": 0.3329692986427749, + "learning_rate": 7.151516359051017e-05, + "loss": 2.7576, + "step": 27228 + }, + { + "epoch": 1.2677095700351515, + "grad_norm": 0.30381192330080253, + "learning_rate": 7.151271842475686e-05, + "loss": 2.6982, + "step": 27229 + }, + { + "epoch": 1.2677561282212446, + "grad_norm": 0.31045452358390807, + "learning_rate": 7.151027319586587e-05, + "loss": 2.7676, + "step": 27230 + }, + { + "epoch": 1.2678026864073375, + "grad_norm": 0.3214221267870246, + "learning_rate": 7.150782790384435e-05, + "loss": 2.77, + "step": 27231 + }, + { + "epoch": 1.2678492445934306, + "grad_norm": 0.33786495836527464, + "learning_rate": 7.150538254869948e-05, + "loss": 2.8167, + "step": 27232 + }, + { + "epoch": 1.2678958027795237, + "grad_norm": 0.3102192318673718, + "learning_rate": 7.15029371304384e-05, + "loss": 2.7381, + "step": 27233 + }, + { + "epoch": 1.2679423609656169, + "grad_norm": 0.3320482714042464, + "learning_rate": 7.150049164906834e-05, + "loss": 2.7867, + "step": 27234 + }, + { + "epoch": 1.2679889191517097, + "grad_norm": 0.3390938196876315, + "learning_rate": 7.149804610459644e-05, + "loss": 2.7289, + "step": 27235 + }, + { + "epoch": 1.2680354773378029, + "grad_norm": 0.28890170009290045, + "learning_rate": 
7.149560049702991e-05, + "loss": 2.9023, + "step": 27236 + }, + { + "epoch": 1.268082035523896, + "grad_norm": 0.35204591843544003, + "learning_rate": 7.149315482637591e-05, + "loss": 2.7299, + "step": 27237 + }, + { + "epoch": 1.268128593709989, + "grad_norm": 0.31228228555025356, + "learning_rate": 7.14907090926416e-05, + "loss": 2.7232, + "step": 27238 + }, + { + "epoch": 1.2681751518960822, + "grad_norm": 0.3515611461892749, + "learning_rate": 7.14882632958342e-05, + "loss": 2.733, + "step": 27239 + }, + { + "epoch": 1.2682217100821753, + "grad_norm": 0.30864025052404687, + "learning_rate": 7.148581743596085e-05, + "loss": 2.7926, + "step": 27240 + }, + { + "epoch": 1.2682682682682682, + "grad_norm": 0.3780006507316731, + "learning_rate": 7.148337151302873e-05, + "loss": 2.7167, + "step": 27241 + }, + { + "epoch": 1.2683148264543613, + "grad_norm": 0.3146284471889247, + "learning_rate": 7.148092552704505e-05, + "loss": 2.8071, + "step": 27242 + }, + { + "epoch": 1.2683613846404544, + "grad_norm": 0.33663077007261805, + "learning_rate": 7.147847947801696e-05, + "loss": 2.7771, + "step": 27243 + }, + { + "epoch": 1.2684079428265473, + "grad_norm": 0.34331516642570103, + "learning_rate": 7.147603336595166e-05, + "loss": 2.815, + "step": 27244 + }, + { + "epoch": 1.2684545010126405, + "grad_norm": 0.3259113536083867, + "learning_rate": 7.147358719085632e-05, + "loss": 2.7673, + "step": 27245 + }, + { + "epoch": 1.2685010591987336, + "grad_norm": 0.3292863724986034, + "learning_rate": 7.147114095273811e-05, + "loss": 2.7944, + "step": 27246 + }, + { + "epoch": 1.2685476173848267, + "grad_norm": 0.3243618017263555, + "learning_rate": 7.146869465160421e-05, + "loss": 2.7666, + "step": 27247 + }, + { + "epoch": 1.2685941755709198, + "grad_norm": 0.3238414253245839, + "learning_rate": 7.146624828746181e-05, + "loss": 2.6817, + "step": 27248 + }, + { + "epoch": 1.268640733757013, + "grad_norm": 0.3174283093533159, + "learning_rate": 7.14638018603181e-05, + "loss": 2.753, + "step": 27249 + }, + { + "epoch": 1.268687291943106, + "grad_norm": 0.34215014254248366, + "learning_rate": 7.146135537018022e-05, + "loss": 2.7173, + "step": 27250 + }, + { + "epoch": 1.268733850129199, + "grad_norm": 0.32450967337958986, + "learning_rate": 7.145890881705538e-05, + "loss": 2.6866, + "step": 27251 + }, + { + "epoch": 1.268780408315292, + "grad_norm": 0.3579648301995307, + "learning_rate": 7.145646220095079e-05, + "loss": 2.7505, + "step": 27252 + }, + { + "epoch": 1.2688269665013852, + "grad_norm": 0.33748129399397886, + "learning_rate": 7.145401552187357e-05, + "loss": 2.8143, + "step": 27253 + }, + { + "epoch": 1.268873524687478, + "grad_norm": 0.3280541678065447, + "learning_rate": 7.145156877983091e-05, + "loss": 2.7443, + "step": 27254 + }, + { + "epoch": 1.2689200828735712, + "grad_norm": 0.36865260257477306, + "learning_rate": 7.144912197483003e-05, + "loss": 2.8622, + "step": 27255 + }, + { + "epoch": 1.2689666410596643, + "grad_norm": 0.34033693723022374, + "learning_rate": 7.14466751068781e-05, + "loss": 2.8569, + "step": 27256 + }, + { + "epoch": 1.2690131992457574, + "grad_norm": 0.3687218615580006, + "learning_rate": 7.144422817598229e-05, + "loss": 2.806, + "step": 27257 + }, + { + "epoch": 1.2690597574318505, + "grad_norm": 0.33645457610612656, + "learning_rate": 7.144178118214977e-05, + "loss": 2.7585, + "step": 27258 + }, + { + "epoch": 1.2691063156179436, + "grad_norm": 0.33129283488881833, + "learning_rate": 7.143933412538774e-05, + "loss": 2.6197, + "step": 27259 + }, + { + "epoch": 
1.2691528738040365, + "grad_norm": 0.31678963343947897, + "learning_rate": 7.143688700570337e-05, + "loss": 2.7227, + "step": 27260 + }, + { + "epoch": 1.2691994319901296, + "grad_norm": 0.3169921741499689, + "learning_rate": 7.143443982310386e-05, + "loss": 2.8184, + "step": 27261 + }, + { + "epoch": 1.2692459901762227, + "grad_norm": 0.32653562267884373, + "learning_rate": 7.143199257759637e-05, + "loss": 2.8075, + "step": 27262 + }, + { + "epoch": 1.2692925483623159, + "grad_norm": 0.32270515052854976, + "learning_rate": 7.14295452691881e-05, + "loss": 2.66, + "step": 27263 + }, + { + "epoch": 1.2693391065484088, + "grad_norm": 0.3423470252231397, + "learning_rate": 7.142709789788621e-05, + "loss": 2.7756, + "step": 27264 + }, + { + "epoch": 1.2693856647345019, + "grad_norm": 0.3184490172716382, + "learning_rate": 7.142465046369793e-05, + "loss": 2.8387, + "step": 27265 + }, + { + "epoch": 1.269432222920595, + "grad_norm": 0.3464233863752256, + "learning_rate": 7.14222029666304e-05, + "loss": 2.8362, + "step": 27266 + }, + { + "epoch": 1.269478781106688, + "grad_norm": 0.3269433652974366, + "learning_rate": 7.14197554066908e-05, + "loss": 2.7952, + "step": 27267 + }, + { + "epoch": 1.2695253392927812, + "grad_norm": 0.3298149747908621, + "learning_rate": 7.141730778388633e-05, + "loss": 2.8161, + "step": 27268 + }, + { + "epoch": 1.2695718974788743, + "grad_norm": 0.32493571209921923, + "learning_rate": 7.141486009822417e-05, + "loss": 2.7272, + "step": 27269 + }, + { + "epoch": 1.2696184556649672, + "grad_norm": 0.32247440798049737, + "learning_rate": 7.141241234971153e-05, + "loss": 2.6965, + "step": 27270 + }, + { + "epoch": 1.2696650138510603, + "grad_norm": 0.32523795719142073, + "learning_rate": 7.140996453835555e-05, + "loss": 2.794, + "step": 27271 + }, + { + "epoch": 1.2697115720371535, + "grad_norm": 0.3273978858979888, + "learning_rate": 7.140751666416344e-05, + "loss": 2.799, + "step": 27272 + }, + { + "epoch": 1.2697581302232466, + "grad_norm": 0.329256819225623, + "learning_rate": 7.140506872714238e-05, + "loss": 2.7421, + "step": 27273 + }, + { + "epoch": 1.2698046884093395, + "grad_norm": 0.3259649396810522, + "learning_rate": 7.140262072729954e-05, + "loss": 2.7354, + "step": 27274 + }, + { + "epoch": 1.2698512465954326, + "grad_norm": 0.3367739315468881, + "learning_rate": 7.140017266464213e-05, + "loss": 2.887, + "step": 27275 + }, + { + "epoch": 1.2698978047815257, + "grad_norm": 0.3396873169418453, + "learning_rate": 7.139772453917732e-05, + "loss": 2.7281, + "step": 27276 + }, + { + "epoch": 1.2699443629676188, + "grad_norm": 0.31137391210349086, + "learning_rate": 7.139527635091228e-05, + "loss": 2.8458, + "step": 27277 + }, + { + "epoch": 1.269990921153712, + "grad_norm": 0.36076376571965946, + "learning_rate": 7.139282809985422e-05, + "loss": 2.7763, + "step": 27278 + }, + { + "epoch": 1.270037479339805, + "grad_norm": 0.3176435600591945, + "learning_rate": 7.139037978601033e-05, + "loss": 2.8242, + "step": 27279 + }, + { + "epoch": 1.270084037525898, + "grad_norm": 0.3327727810653125, + "learning_rate": 7.138793140938777e-05, + "loss": 2.7398, + "step": 27280 + }, + { + "epoch": 1.270130595711991, + "grad_norm": 0.33652286897203276, + "learning_rate": 7.138548296999373e-05, + "loss": 2.7319, + "step": 27281 + }, + { + "epoch": 1.2701771538980842, + "grad_norm": 0.3833799657386562, + "learning_rate": 7.138303446783541e-05, + "loss": 2.8066, + "step": 27282 + }, + { + "epoch": 1.2702237120841773, + "grad_norm": 0.3140579754416075, + "learning_rate": 
7.138058590292001e-05, + "loss": 2.8426, + "step": 27283 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 0.353931973650375, + "learning_rate": 7.137813727525469e-05, + "loss": 2.7314, + "step": 27284 + }, + { + "epoch": 1.2703168284563633, + "grad_norm": 0.3253763186740578, + "learning_rate": 7.137568858484663e-05, + "loss": 2.8348, + "step": 27285 + }, + { + "epoch": 1.2703633866424564, + "grad_norm": 0.34373002638436717, + "learning_rate": 7.137323983170304e-05, + "loss": 2.6759, + "step": 27286 + }, + { + "epoch": 1.2704099448285495, + "grad_norm": 0.35926967396453424, + "learning_rate": 7.137079101583109e-05, + "loss": 2.7489, + "step": 27287 + }, + { + "epoch": 1.2704565030146426, + "grad_norm": 0.3442739645276042, + "learning_rate": 7.136834213723797e-05, + "loss": 2.7329, + "step": 27288 + }, + { + "epoch": 1.2705030612007358, + "grad_norm": 0.3525601216661663, + "learning_rate": 7.136589319593088e-05, + "loss": 2.9061, + "step": 27289 + }, + { + "epoch": 1.2705496193868286, + "grad_norm": 0.3636504661768284, + "learning_rate": 7.136344419191699e-05, + "loss": 2.784, + "step": 27290 + }, + { + "epoch": 1.2705961775729218, + "grad_norm": 0.3101483088991151, + "learning_rate": 7.13609951252035e-05, + "loss": 2.7128, + "step": 27291 + }, + { + "epoch": 1.2706427357590149, + "grad_norm": 0.3238392785856867, + "learning_rate": 7.135854599579758e-05, + "loss": 2.7837, + "step": 27292 + }, + { + "epoch": 1.2706892939451078, + "grad_norm": 0.33961801317357404, + "learning_rate": 7.135609680370646e-05, + "loss": 2.6128, + "step": 27293 + }, + { + "epoch": 1.2707358521312009, + "grad_norm": 0.30304214287135844, + "learning_rate": 7.135364754893729e-05, + "loss": 2.7779, + "step": 27294 + }, + { + "epoch": 1.270782410317294, + "grad_norm": 0.36266905880834777, + "learning_rate": 7.135119823149725e-05, + "loss": 2.768, + "step": 27295 + }, + { + "epoch": 1.2708289685033871, + "grad_norm": 0.3231112259128155, + "learning_rate": 7.134874885139358e-05, + "loss": 2.8049, + "step": 27296 + }, + { + "epoch": 1.2708755266894802, + "grad_norm": 0.34467389976834, + "learning_rate": 7.134629940863342e-05, + "loss": 2.5916, + "step": 27297 + }, + { + "epoch": 1.2709220848755733, + "grad_norm": 0.3139148079068707, + "learning_rate": 7.134384990322395e-05, + "loss": 2.7936, + "step": 27298 + }, + { + "epoch": 1.2709686430616662, + "grad_norm": 0.3090027678851458, + "learning_rate": 7.134140033517241e-05, + "loss": 2.7973, + "step": 27299 + }, + { + "epoch": 1.2710152012477594, + "grad_norm": 0.3758203699136146, + "learning_rate": 7.133895070448595e-05, + "loss": 2.8103, + "step": 27300 + }, + { + "epoch": 1.2710617594338525, + "grad_norm": 0.3111169481412183, + "learning_rate": 7.133650101117178e-05, + "loss": 2.7929, + "step": 27301 + }, + { + "epoch": 1.2711083176199456, + "grad_norm": 0.3591423155137053, + "learning_rate": 7.133405125523709e-05, + "loss": 2.7557, + "step": 27302 + }, + { + "epoch": 1.2711548758060385, + "grad_norm": 0.3009491106392658, + "learning_rate": 7.133160143668904e-05, + "loss": 2.8911, + "step": 27303 + }, + { + "epoch": 1.2712014339921316, + "grad_norm": 0.34401264288643735, + "learning_rate": 7.132915155553485e-05, + "loss": 2.6709, + "step": 27304 + }, + { + "epoch": 1.2712479921782247, + "grad_norm": 0.30359837386790267, + "learning_rate": 7.132670161178171e-05, + "loss": 2.8222, + "step": 27305 + }, + { + "epoch": 1.2712945503643178, + "grad_norm": 0.32858157281071043, + "learning_rate": 7.13242516054368e-05, + "loss": 2.7799, + "step": 27306 + }, + { + "epoch": 
1.271341108550411, + "grad_norm": 0.31985847006050894, + "learning_rate": 7.132180153650731e-05, + "loss": 2.8362, + "step": 27307 + }, + { + "epoch": 1.271387666736504, + "grad_norm": 0.33466982349001084, + "learning_rate": 7.131935140500042e-05, + "loss": 2.6542, + "step": 27308 + }, + { + "epoch": 1.271434224922597, + "grad_norm": 0.33301037059619437, + "learning_rate": 7.131690121092336e-05, + "loss": 2.8802, + "step": 27309 + }, + { + "epoch": 1.27148078310869, + "grad_norm": 0.3227040788108427, + "learning_rate": 7.131445095428328e-05, + "loss": 2.6733, + "step": 27310 + }, + { + "epoch": 1.2715273412947832, + "grad_norm": 0.31715033293975203, + "learning_rate": 7.131200063508738e-05, + "loss": 2.8261, + "step": 27311 + }, + { + "epoch": 1.2715738994808763, + "grad_norm": 0.3657848203678353, + "learning_rate": 7.130955025334285e-05, + "loss": 2.7664, + "step": 27312 + }, + { + "epoch": 1.2716204576669692, + "grad_norm": 0.33480636671501535, + "learning_rate": 7.130709980905691e-05, + "loss": 2.71, + "step": 27313 + }, + { + "epoch": 1.2716670158530623, + "grad_norm": 0.3640749821809668, + "learning_rate": 7.130464930223673e-05, + "loss": 2.7913, + "step": 27314 + }, + { + "epoch": 1.2717135740391554, + "grad_norm": 0.3553097476729438, + "learning_rate": 7.130219873288951e-05, + "loss": 2.8087, + "step": 27315 + }, + { + "epoch": 1.2717601322252485, + "grad_norm": 0.3414169933288865, + "learning_rate": 7.129974810102241e-05, + "loss": 2.7611, + "step": 27316 + }, + { + "epoch": 1.2718066904113416, + "grad_norm": 0.34040534994348226, + "learning_rate": 7.129729740664267e-05, + "loss": 2.7784, + "step": 27317 + }, + { + "epoch": 1.2718532485974348, + "grad_norm": 0.3569873110795894, + "learning_rate": 7.129484664975744e-05, + "loss": 2.6948, + "step": 27318 + }, + { + "epoch": 1.2718998067835277, + "grad_norm": 0.3708345939998532, + "learning_rate": 7.129239583037394e-05, + "loss": 2.7197, + "step": 27319 + }, + { + "epoch": 1.2719463649696208, + "grad_norm": 0.3409150236945732, + "learning_rate": 7.128994494849936e-05, + "loss": 2.7215, + "step": 27320 + }, + { + "epoch": 1.2719929231557139, + "grad_norm": 0.3835860346655456, + "learning_rate": 7.128749400414088e-05, + "loss": 2.8603, + "step": 27321 + }, + { + "epoch": 1.272039481341807, + "grad_norm": 0.32852957274047456, + "learning_rate": 7.128504299730573e-05, + "loss": 2.6735, + "step": 27322 + }, + { + "epoch": 1.2720860395279, + "grad_norm": 0.32991817891365743, + "learning_rate": 7.128259192800105e-05, + "loss": 2.7216, + "step": 27323 + }, + { + "epoch": 1.272132597713993, + "grad_norm": 0.33448554660522345, + "learning_rate": 7.128014079623407e-05, + "loss": 2.7495, + "step": 27324 + }, + { + "epoch": 1.2721791559000861, + "grad_norm": 0.3434351254776029, + "learning_rate": 7.127768960201197e-05, + "loss": 2.8126, + "step": 27325 + }, + { + "epoch": 1.2722257140861792, + "grad_norm": 0.38398780897135704, + "learning_rate": 7.127523834534194e-05, + "loss": 2.7933, + "step": 27326 + }, + { + "epoch": 1.2722722722722724, + "grad_norm": 0.34576622656003825, + "learning_rate": 7.127278702623119e-05, + "loss": 2.7498, + "step": 27327 + }, + { + "epoch": 1.2723188304583655, + "grad_norm": 0.3532407269472322, + "learning_rate": 7.127033564468689e-05, + "loss": 2.7621, + "step": 27328 + }, + { + "epoch": 1.2723653886444584, + "grad_norm": 0.30001024783769165, + "learning_rate": 7.126788420071627e-05, + "loss": 2.7901, + "step": 27329 + }, + { + "epoch": 1.2724119468305515, + "grad_norm": 0.37584770676066553, + "learning_rate": 
7.126543269432651e-05, + "loss": 2.7978, + "step": 27330 + }, + { + "epoch": 1.2724585050166446, + "grad_norm": 0.32071045620653627, + "learning_rate": 7.126298112552479e-05, + "loss": 2.8052, + "step": 27331 + }, + { + "epoch": 1.2725050632027375, + "grad_norm": 0.3364172456510707, + "learning_rate": 7.12605294943183e-05, + "loss": 2.7913, + "step": 27332 + }, + { + "epoch": 1.2725516213888306, + "grad_norm": 0.355232653277491, + "learning_rate": 7.125807780071428e-05, + "loss": 2.7407, + "step": 27333 + }, + { + "epoch": 1.2725981795749237, + "grad_norm": 0.3136087324847638, + "learning_rate": 7.125562604471987e-05, + "loss": 2.7968, + "step": 27334 + }, + { + "epoch": 1.2726447377610168, + "grad_norm": 0.31641503765862916, + "learning_rate": 7.125317422634231e-05, + "loss": 2.6362, + "step": 27335 + }, + { + "epoch": 1.27269129594711, + "grad_norm": 0.3358296650976045, + "learning_rate": 7.125072234558876e-05, + "loss": 2.7335, + "step": 27336 + }, + { + "epoch": 1.272737854133203, + "grad_norm": 0.31266864123379856, + "learning_rate": 7.124827040246644e-05, + "loss": 2.7887, + "step": 27337 + }, + { + "epoch": 1.2727844123192962, + "grad_norm": 0.35488721516470867, + "learning_rate": 7.124581839698256e-05, + "loss": 2.8196, + "step": 27338 + }, + { + "epoch": 1.272830970505389, + "grad_norm": 0.35547827284993233, + "learning_rate": 7.124336632914427e-05, + "loss": 2.6906, + "step": 27339 + }, + { + "epoch": 1.2728775286914822, + "grad_norm": 0.3322002934956685, + "learning_rate": 7.124091419895879e-05, + "loss": 2.7772, + "step": 27340 + }, + { + "epoch": 1.2729240868775753, + "grad_norm": 0.364674218922871, + "learning_rate": 7.123846200643334e-05, + "loss": 2.8205, + "step": 27341 + }, + { + "epoch": 1.2729706450636682, + "grad_norm": 0.32142037922532557, + "learning_rate": 7.12360097515751e-05, + "loss": 2.8217, + "step": 27342 + }, + { + "epoch": 1.2730172032497613, + "grad_norm": 0.34467981370355455, + "learning_rate": 7.123355743439124e-05, + "loss": 2.8425, + "step": 27343 + }, + { + "epoch": 1.2730637614358544, + "grad_norm": 0.3241245191958693, + "learning_rate": 7.123110505488899e-05, + "loss": 2.7985, + "step": 27344 + }, + { + "epoch": 1.2731103196219475, + "grad_norm": 0.3362555069671585, + "learning_rate": 7.122865261307554e-05, + "loss": 2.7438, + "step": 27345 + }, + { + "epoch": 1.2731568778080407, + "grad_norm": 0.3128855071755228, + "learning_rate": 7.122620010895809e-05, + "loss": 2.7402, + "step": 27346 + }, + { + "epoch": 1.2732034359941338, + "grad_norm": 0.33321225278068045, + "learning_rate": 7.122374754254382e-05, + "loss": 2.6951, + "step": 27347 + }, + { + "epoch": 1.2732499941802267, + "grad_norm": 0.32509298702388667, + "learning_rate": 7.122129491383996e-05, + "loss": 2.748, + "step": 27348 + }, + { + "epoch": 1.2732965523663198, + "grad_norm": 0.34437227426562245, + "learning_rate": 7.121884222285368e-05, + "loss": 2.8542, + "step": 27349 + }, + { + "epoch": 1.273343110552413, + "grad_norm": 0.33058467891032906, + "learning_rate": 7.121638946959219e-05, + "loss": 2.8139, + "step": 27350 + }, + { + "epoch": 1.273389668738506, + "grad_norm": 0.34811169816483595, + "learning_rate": 7.121393665406269e-05, + "loss": 2.7998, + "step": 27351 + }, + { + "epoch": 1.273436226924599, + "grad_norm": 0.3378937937824588, + "learning_rate": 7.121148377627237e-05, + "loss": 2.8091, + "step": 27352 + }, + { + "epoch": 1.273482785110692, + "grad_norm": 0.3270181230026062, + "learning_rate": 7.120903083622843e-05, + "loss": 2.77, + "step": 27353 + }, + { + "epoch": 
1.2735293432967851, + "grad_norm": 0.348946141106933, + "learning_rate": 7.120657783393808e-05, + "loss": 2.791, + "step": 27354 + }, + { + "epoch": 1.2735759014828782, + "grad_norm": 0.3354975487804975, + "learning_rate": 7.120412476940851e-05, + "loss": 2.7206, + "step": 27355 + }, + { + "epoch": 1.2736224596689714, + "grad_norm": 0.3591961386132282, + "learning_rate": 7.120167164264693e-05, + "loss": 2.884, + "step": 27356 + }, + { + "epoch": 1.2736690178550645, + "grad_norm": 0.3481166337085402, + "learning_rate": 7.119921845366054e-05, + "loss": 2.8344, + "step": 27357 + }, + { + "epoch": 1.2737155760411574, + "grad_norm": 0.3450930042691236, + "learning_rate": 7.119676520245652e-05, + "loss": 2.6511, + "step": 27358 + }, + { + "epoch": 1.2737621342272505, + "grad_norm": 0.32048814262307884, + "learning_rate": 7.119431188904208e-05, + "loss": 2.6969, + "step": 27359 + }, + { + "epoch": 1.2738086924133436, + "grad_norm": 0.3582658589508139, + "learning_rate": 7.119185851342443e-05, + "loss": 2.7758, + "step": 27360 + }, + { + "epoch": 1.2738552505994367, + "grad_norm": 0.32579284386869606, + "learning_rate": 7.118940507561078e-05, + "loss": 2.6877, + "step": 27361 + }, + { + "epoch": 1.2739018087855296, + "grad_norm": 0.3112971098904683, + "learning_rate": 7.118695157560829e-05, + "loss": 2.7462, + "step": 27362 + }, + { + "epoch": 1.2739483669716227, + "grad_norm": 0.33801473717619, + "learning_rate": 7.118449801342417e-05, + "loss": 2.8386, + "step": 27363 + }, + { + "epoch": 1.2739949251577158, + "grad_norm": 0.32880198354106166, + "learning_rate": 7.118204438906566e-05, + "loss": 2.7658, + "step": 27364 + }, + { + "epoch": 1.274041483343809, + "grad_norm": 0.34264752551431976, + "learning_rate": 7.117959070253993e-05, + "loss": 2.7921, + "step": 27365 + }, + { + "epoch": 1.274088041529902, + "grad_norm": 0.3539881436771083, + "learning_rate": 7.117713695385418e-05, + "loss": 2.7392, + "step": 27366 + }, + { + "epoch": 1.2741345997159952, + "grad_norm": 0.3241740383931327, + "learning_rate": 7.117468314301563e-05, + "loss": 2.7448, + "step": 27367 + }, + { + "epoch": 1.274181157902088, + "grad_norm": 0.33772572547574603, + "learning_rate": 7.117222927003147e-05, + "loss": 2.6524, + "step": 27368 + }, + { + "epoch": 1.2742277160881812, + "grad_norm": 0.3337755355332752, + "learning_rate": 7.11697753349089e-05, + "loss": 2.8206, + "step": 27369 + }, + { + "epoch": 1.2742742742742743, + "grad_norm": 0.34634705236857233, + "learning_rate": 7.116732133765513e-05, + "loss": 2.8621, + "step": 27370 + }, + { + "epoch": 1.2743208324603674, + "grad_norm": 0.34196068161945237, + "learning_rate": 7.116486727827735e-05, + "loss": 2.8488, + "step": 27371 + }, + { + "epoch": 1.2743673906464603, + "grad_norm": 0.33182627639630247, + "learning_rate": 7.116241315678279e-05, + "loss": 2.8551, + "step": 27372 + }, + { + "epoch": 1.2744139488325534, + "grad_norm": 0.3319052429312525, + "learning_rate": 7.11599589731786e-05, + "loss": 2.809, + "step": 27373 + }, + { + "epoch": 1.2744605070186465, + "grad_norm": 0.3510423346108675, + "learning_rate": 7.115750472747204e-05, + "loss": 2.7468, + "step": 27374 + }, + { + "epoch": 1.2745070652047397, + "grad_norm": 0.3407766682752879, + "learning_rate": 7.115505041967028e-05, + "loss": 2.7812, + "step": 27375 + }, + { + "epoch": 1.2745536233908328, + "grad_norm": 0.3508992653869267, + "learning_rate": 7.115259604978052e-05, + "loss": 2.8671, + "step": 27376 + }, + { + "epoch": 1.274600181576926, + "grad_norm": 0.33536970338881417, + "learning_rate": 
7.115014161780998e-05, + "loss": 2.8014, + "step": 27377 + }, + { + "epoch": 1.2746467397630188, + "grad_norm": 0.36030437583287994, + "learning_rate": 7.114768712376587e-05, + "loss": 2.7909, + "step": 27378 + }, + { + "epoch": 1.274693297949112, + "grad_norm": 0.3373669151303093, + "learning_rate": 7.114523256765537e-05, + "loss": 2.8165, + "step": 27379 + }, + { + "epoch": 1.274739856135205, + "grad_norm": 0.363278922964659, + "learning_rate": 7.114277794948569e-05, + "loss": 2.7931, + "step": 27380 + }, + { + "epoch": 1.274786414321298, + "grad_norm": 0.35104589935445585, + "learning_rate": 7.114032326926406e-05, + "loss": 2.8677, + "step": 27381 + }, + { + "epoch": 1.274832972507391, + "grad_norm": 0.3359584068005292, + "learning_rate": 7.113786852699764e-05, + "loss": 2.7724, + "step": 27382 + }, + { + "epoch": 1.2748795306934841, + "grad_norm": 0.37291405179373843, + "learning_rate": 7.113541372269368e-05, + "loss": 2.7897, + "step": 27383 + }, + { + "epoch": 1.2749260888795773, + "grad_norm": 0.3081245193021037, + "learning_rate": 7.113295885635935e-05, + "loss": 2.6577, + "step": 27384 + }, + { + "epoch": 1.2749726470656704, + "grad_norm": 0.33930822498614965, + "learning_rate": 7.113050392800188e-05, + "loss": 2.8343, + "step": 27385 + }, + { + "epoch": 1.2750192052517635, + "grad_norm": 0.36494042332906645, + "learning_rate": 7.112804893762845e-05, + "loss": 2.8206, + "step": 27386 + }, + { + "epoch": 1.2750657634378564, + "grad_norm": 0.376002224910059, + "learning_rate": 7.112559388524629e-05, + "loss": 2.8699, + "step": 27387 + }, + { + "epoch": 1.2751123216239495, + "grad_norm": 0.32688743059332515, + "learning_rate": 7.112313877086259e-05, + "loss": 2.9162, + "step": 27388 + }, + { + "epoch": 1.2751588798100426, + "grad_norm": 0.3693964704249135, + "learning_rate": 7.112068359448454e-05, + "loss": 2.7691, + "step": 27389 + }, + { + "epoch": 1.2752054379961357, + "grad_norm": 0.34123216336468504, + "learning_rate": 7.111822835611938e-05, + "loss": 2.6979, + "step": 27390 + }, + { + "epoch": 1.2752519961822286, + "grad_norm": 0.37739839888367227, + "learning_rate": 7.111577305577429e-05, + "loss": 2.8614, + "step": 27391 + }, + { + "epoch": 1.2752985543683217, + "grad_norm": 0.32634048408471833, + "learning_rate": 7.111331769345651e-05, + "loss": 2.7817, + "step": 27392 + }, + { + "epoch": 1.2753451125544149, + "grad_norm": 0.33309861507468236, + "learning_rate": 7.111086226917319e-05, + "loss": 2.7285, + "step": 27393 + }, + { + "epoch": 1.275391670740508, + "grad_norm": 0.34734597131235734, + "learning_rate": 7.11084067829316e-05, + "loss": 2.8163, + "step": 27394 + }, + { + "epoch": 1.275438228926601, + "grad_norm": 0.3299240614589857, + "learning_rate": 7.110595123473889e-05, + "loss": 2.6354, + "step": 27395 + }, + { + "epoch": 1.2754847871126942, + "grad_norm": 0.31412472652687456, + "learning_rate": 7.110349562460231e-05, + "loss": 2.8066, + "step": 27396 + }, + { + "epoch": 1.275531345298787, + "grad_norm": 0.3284889029458003, + "learning_rate": 7.110103995252903e-05, + "loss": 2.8925, + "step": 27397 + }, + { + "epoch": 1.2755779034848802, + "grad_norm": 0.3152474170723344, + "learning_rate": 7.109858421852631e-05, + "loss": 2.8145, + "step": 27398 + }, + { + "epoch": 1.2756244616709733, + "grad_norm": 0.3267010470814225, + "learning_rate": 7.109612842260129e-05, + "loss": 2.7356, + "step": 27399 + }, + { + "epoch": 1.2756710198570664, + "grad_norm": 0.31898214134137376, + "learning_rate": 7.109367256476123e-05, + "loss": 2.7671, + "step": 27400 + }, + { + "epoch": 
1.2757175780431593, + "grad_norm": 0.3209414578843298, + "learning_rate": 7.109121664501333e-05, + "loss": 2.7982, + "step": 27401 + }, + { + "epoch": 1.2757641362292524, + "grad_norm": 0.32740880260631283, + "learning_rate": 7.108876066336476e-05, + "loss": 2.7572, + "step": 27402 + }, + { + "epoch": 1.2758106944153456, + "grad_norm": 0.3211256746283242, + "learning_rate": 7.108630461982276e-05, + "loss": 2.7812, + "step": 27403 + }, + { + "epoch": 1.2758572526014387, + "grad_norm": 0.36288430779094194, + "learning_rate": 7.108384851439454e-05, + "loss": 2.7958, + "step": 27404 + }, + { + "epoch": 1.2759038107875318, + "grad_norm": 0.32950881232448537, + "learning_rate": 7.10813923470873e-05, + "loss": 2.7378, + "step": 27405 + }, + { + "epoch": 1.275950368973625, + "grad_norm": 0.35843434571464966, + "learning_rate": 7.107893611790826e-05, + "loss": 2.8109, + "step": 27406 + }, + { + "epoch": 1.2759969271597178, + "grad_norm": 0.3408562804413587, + "learning_rate": 7.107647982686461e-05, + "loss": 2.8331, + "step": 27407 + }, + { + "epoch": 1.276043485345811, + "grad_norm": 0.35626374888519824, + "learning_rate": 7.107402347396357e-05, + "loss": 2.801, + "step": 27408 + }, + { + "epoch": 1.276090043531904, + "grad_norm": 0.3469633050707746, + "learning_rate": 7.107156705921234e-05, + "loss": 2.8386, + "step": 27409 + }, + { + "epoch": 1.2761366017179971, + "grad_norm": 0.3220942383080726, + "learning_rate": 7.106911058261814e-05, + "loss": 2.8354, + "step": 27410 + }, + { + "epoch": 1.27618315990409, + "grad_norm": 0.34014870844946393, + "learning_rate": 7.10666540441882e-05, + "loss": 2.7061, + "step": 27411 + }, + { + "epoch": 1.2762297180901832, + "grad_norm": 0.36661430076048823, + "learning_rate": 7.106419744392966e-05, + "loss": 2.8232, + "step": 27412 + }, + { + "epoch": 1.2762762762762763, + "grad_norm": 0.31994227746537673, + "learning_rate": 7.106174078184982e-05, + "loss": 2.6957, + "step": 27413 + }, + { + "epoch": 1.2763228344623694, + "grad_norm": 0.3502628794305368, + "learning_rate": 7.105928405795584e-05, + "loss": 2.7829, + "step": 27414 + }, + { + "epoch": 1.2763693926484625, + "grad_norm": 0.3364455287656948, + "learning_rate": 7.105682727225491e-05, + "loss": 2.6991, + "step": 27415 + }, + { + "epoch": 1.2764159508345556, + "grad_norm": 0.3678729180221769, + "learning_rate": 7.105437042475427e-05, + "loss": 2.7925, + "step": 27416 + }, + { + "epoch": 1.2764625090206485, + "grad_norm": 0.3391761012636893, + "learning_rate": 7.105191351546113e-05, + "loss": 2.7992, + "step": 27417 + }, + { + "epoch": 1.2765090672067416, + "grad_norm": 0.351666319524918, + "learning_rate": 7.10494565443827e-05, + "loss": 2.8147, + "step": 27418 + }, + { + "epoch": 1.2765556253928347, + "grad_norm": 0.35225256441725045, + "learning_rate": 7.104699951152621e-05, + "loss": 2.7836, + "step": 27419 + }, + { + "epoch": 1.2766021835789276, + "grad_norm": 0.36515121056439104, + "learning_rate": 7.104454241689882e-05, + "loss": 2.8071, + "step": 27420 + }, + { + "epoch": 1.2766487417650207, + "grad_norm": 0.3101047116772637, + "learning_rate": 7.104208526050779e-05, + "loss": 2.8319, + "step": 27421 + }, + { + "epoch": 1.2766952999511139, + "grad_norm": 0.34238465491850517, + "learning_rate": 7.10396280423603e-05, + "loss": 2.7046, + "step": 27422 + }, + { + "epoch": 1.276741858137207, + "grad_norm": 0.32391047569403614, + "learning_rate": 7.10371707624636e-05, + "loss": 2.7249, + "step": 27423 + }, + { + "epoch": 1.2767884163233, + "grad_norm": 0.3130399496683906, + "learning_rate": 
7.103471342082486e-05, + "loss": 2.8307, + "step": 27424 + }, + { + "epoch": 1.2768349745093932, + "grad_norm": 0.3091830402792629, + "learning_rate": 7.103225601745129e-05, + "loss": 2.7049, + "step": 27425 + }, + { + "epoch": 1.2768815326954863, + "grad_norm": 0.33633940766970943, + "learning_rate": 7.102979855235014e-05, + "loss": 2.7873, + "step": 27426 + }, + { + "epoch": 1.2769280908815792, + "grad_norm": 0.31741388572430257, + "learning_rate": 7.10273410255286e-05, + "loss": 2.8296, + "step": 27427 + }, + { + "epoch": 1.2769746490676723, + "grad_norm": 0.3466167079742481, + "learning_rate": 7.102488343699388e-05, + "loss": 2.927, + "step": 27428 + }, + { + "epoch": 1.2770212072537654, + "grad_norm": 0.34531601216318936, + "learning_rate": 7.10224257867532e-05, + "loss": 2.7544, + "step": 27429 + }, + { + "epoch": 1.2770677654398583, + "grad_norm": 0.33567954769342656, + "learning_rate": 7.101996807481377e-05, + "loss": 2.7982, + "step": 27430 + }, + { + "epoch": 1.2771143236259515, + "grad_norm": 0.3351607987519383, + "learning_rate": 7.101751030118281e-05, + "loss": 2.6969, + "step": 27431 + }, + { + "epoch": 1.2771608818120446, + "grad_norm": 0.3417315101088353, + "learning_rate": 7.101505246586753e-05, + "loss": 2.8023, + "step": 27432 + }, + { + "epoch": 1.2772074399981377, + "grad_norm": 0.3416433258100387, + "learning_rate": 7.101259456887514e-05, + "loss": 2.831, + "step": 27433 + }, + { + "epoch": 1.2772539981842308, + "grad_norm": 0.3574851999866583, + "learning_rate": 7.101013661021285e-05, + "loss": 2.7111, + "step": 27434 + }, + { + "epoch": 1.277300556370324, + "grad_norm": 0.2935337683775412, + "learning_rate": 7.100767858988787e-05, + "loss": 2.7057, + "step": 27435 + }, + { + "epoch": 1.2773471145564168, + "grad_norm": 0.3420761683247421, + "learning_rate": 7.100522050790744e-05, + "loss": 2.7823, + "step": 27436 + }, + { + "epoch": 1.27739367274251, + "grad_norm": 0.3367200243968146, + "learning_rate": 7.100276236427876e-05, + "loss": 2.6472, + "step": 27437 + }, + { + "epoch": 1.277440230928603, + "grad_norm": 0.34029094902807355, + "learning_rate": 7.100030415900903e-05, + "loss": 2.8022, + "step": 27438 + }, + { + "epoch": 1.2774867891146962, + "grad_norm": 0.34316973814832036, + "learning_rate": 7.099784589210549e-05, + "loss": 2.8659, + "step": 27439 + }, + { + "epoch": 1.277533347300789, + "grad_norm": 0.38330012960056437, + "learning_rate": 7.099538756357532e-05, + "loss": 2.6178, + "step": 27440 + }, + { + "epoch": 1.2775799054868822, + "grad_norm": 0.32573585862785853, + "learning_rate": 7.099292917342577e-05, + "loss": 2.749, + "step": 27441 + }, + { + "epoch": 1.2776264636729753, + "grad_norm": 0.383830699730264, + "learning_rate": 7.099047072166404e-05, + "loss": 2.795, + "step": 27442 + }, + { + "epoch": 1.2776730218590684, + "grad_norm": 0.31719206317970033, + "learning_rate": 7.098801220829734e-05, + "loss": 2.867, + "step": 27443 + }, + { + "epoch": 1.2777195800451615, + "grad_norm": 0.38276450539554147, + "learning_rate": 7.098555363333289e-05, + "loss": 2.7768, + "step": 27444 + }, + { + "epoch": 1.2777661382312546, + "grad_norm": 0.3163329771750194, + "learning_rate": 7.098309499677791e-05, + "loss": 2.6304, + "step": 27445 + }, + { + "epoch": 1.2778126964173475, + "grad_norm": 0.31235466090228664, + "learning_rate": 7.098063629863962e-05, + "loss": 2.7836, + "step": 27446 + }, + { + "epoch": 1.2778592546034406, + "grad_norm": 0.33540777528128934, + "learning_rate": 7.097817753892522e-05, + "loss": 2.9195, + "step": 27447 + }, + { + "epoch": 
1.2779058127895337, + "grad_norm": 0.3325854585432203, + "learning_rate": 7.097571871764195e-05, + "loss": 2.6583, + "step": 27448 + }, + { + "epoch": 1.2779523709756269, + "grad_norm": 0.31039234232642166, + "learning_rate": 7.097325983479702e-05, + "loss": 2.7428, + "step": 27449 + }, + { + "epoch": 1.2779989291617198, + "grad_norm": 0.3609671035633839, + "learning_rate": 7.097080089039761e-05, + "loss": 2.68, + "step": 27450 + }, + { + "epoch": 1.2780454873478129, + "grad_norm": 0.31793474820544343, + "learning_rate": 7.096834188445098e-05, + "loss": 2.8033, + "step": 27451 + }, + { + "epoch": 1.278092045533906, + "grad_norm": 0.3463434098418904, + "learning_rate": 7.096588281696433e-05, + "loss": 2.8302, + "step": 27452 + }, + { + "epoch": 1.278138603719999, + "grad_norm": 0.3155882050813786, + "learning_rate": 7.096342368794488e-05, + "loss": 2.8054, + "step": 27453 + }, + { + "epoch": 1.2781851619060922, + "grad_norm": 0.34729387775240905, + "learning_rate": 7.096096449739985e-05, + "loss": 2.7609, + "step": 27454 + }, + { + "epoch": 1.2782317200921853, + "grad_norm": 0.30069612072655566, + "learning_rate": 7.095850524533645e-05, + "loss": 2.6485, + "step": 27455 + }, + { + "epoch": 1.2782782782782782, + "grad_norm": 0.3430497864209726, + "learning_rate": 7.09560459317619e-05, + "loss": 2.6509, + "step": 27456 + }, + { + "epoch": 1.2783248364643713, + "grad_norm": 0.28903016284054783, + "learning_rate": 7.095358655668342e-05, + "loss": 2.7409, + "step": 27457 + }, + { + "epoch": 1.2783713946504645, + "grad_norm": 0.32967561788428457, + "learning_rate": 7.095112712010821e-05, + "loss": 2.6971, + "step": 27458 + }, + { + "epoch": 1.2784179528365576, + "grad_norm": 0.33416615521618287, + "learning_rate": 7.094866762204354e-05, + "loss": 2.918, + "step": 27459 + }, + { + "epoch": 1.2784645110226505, + "grad_norm": 0.32461505792163575, + "learning_rate": 7.094620806249657e-05, + "loss": 2.7654, + "step": 27460 + }, + { + "epoch": 1.2785110692087436, + "grad_norm": 0.34914947276658703, + "learning_rate": 7.094374844147456e-05, + "loss": 2.7541, + "step": 27461 + }, + { + "epoch": 1.2785576273948367, + "grad_norm": 0.33061827826960166, + "learning_rate": 7.094128875898471e-05, + "loss": 2.842, + "step": 27462 + }, + { + "epoch": 1.2786041855809298, + "grad_norm": 0.37912338370324755, + "learning_rate": 7.093882901503424e-05, + "loss": 2.7087, + "step": 27463 + }, + { + "epoch": 1.278650743767023, + "grad_norm": 0.2975271851215918, + "learning_rate": 7.093636920963036e-05, + "loss": 2.7799, + "step": 27464 + }, + { + "epoch": 1.278697301953116, + "grad_norm": 0.3690884443686636, + "learning_rate": 7.093390934278031e-05, + "loss": 2.8361, + "step": 27465 + }, + { + "epoch": 1.278743860139209, + "grad_norm": 0.3255868589486396, + "learning_rate": 7.093144941449127e-05, + "loss": 2.7298, + "step": 27466 + }, + { + "epoch": 1.278790418325302, + "grad_norm": 0.36512051813383073, + "learning_rate": 7.092898942477052e-05, + "loss": 2.779, + "step": 27467 + }, + { + "epoch": 1.2788369765113952, + "grad_norm": 0.353256518430049, + "learning_rate": 7.092652937362525e-05, + "loss": 2.8377, + "step": 27468 + }, + { + "epoch": 1.278883534697488, + "grad_norm": 0.362395241647074, + "learning_rate": 7.092406926106265e-05, + "loss": 2.6458, + "step": 27469 + }, + { + "epoch": 1.2789300928835812, + "grad_norm": 0.3141798162140212, + "learning_rate": 7.092160908708999e-05, + "loss": 2.7621, + "step": 27470 + }, + { + "epoch": 1.2789766510696743, + "grad_norm": 0.3070951931707911, + "learning_rate": 
7.091914885171446e-05, + "loss": 2.7031, + "step": 27471 + }, + { + "epoch": 1.2790232092557674, + "grad_norm": 0.3576512654564153, + "learning_rate": 7.091668855494328e-05, + "loss": 2.785, + "step": 27472 + }, + { + "epoch": 1.2790697674418605, + "grad_norm": 0.3078522556682146, + "learning_rate": 7.09142281967837e-05, + "loss": 2.8192, + "step": 27473 + }, + { + "epoch": 1.2791163256279536, + "grad_norm": 0.33822611790227397, + "learning_rate": 7.09117677772429e-05, + "loss": 2.8095, + "step": 27474 + }, + { + "epoch": 1.2791628838140465, + "grad_norm": 0.2993460953287874, + "learning_rate": 7.090930729632815e-05, + "loss": 2.7047, + "step": 27475 + }, + { + "epoch": 1.2792094420001396, + "grad_norm": 0.3301933671243626, + "learning_rate": 7.090684675404663e-05, + "loss": 2.8096, + "step": 27476 + }, + { + "epoch": 1.2792560001862328, + "grad_norm": 0.2990363708582231, + "learning_rate": 7.090438615040558e-05, + "loss": 2.7777, + "step": 27477 + }, + { + "epoch": 1.2793025583723259, + "grad_norm": 0.3419813071142738, + "learning_rate": 7.09019254854122e-05, + "loss": 2.8612, + "step": 27478 + }, + { + "epoch": 1.2793491165584188, + "grad_norm": 0.32202458145960966, + "learning_rate": 7.089946475907373e-05, + "loss": 2.7895, + "step": 27479 + }, + { + "epoch": 1.2793956747445119, + "grad_norm": 0.33876679666955234, + "learning_rate": 7.089700397139738e-05, + "loss": 2.8033, + "step": 27480 + }, + { + "epoch": 1.279442232930605, + "grad_norm": 0.30925742199791767, + "learning_rate": 7.089454312239041e-05, + "loss": 2.7725, + "step": 27481 + }, + { + "epoch": 1.279488791116698, + "grad_norm": 0.3207644557020893, + "learning_rate": 7.089208221206e-05, + "loss": 2.7672, + "step": 27482 + }, + { + "epoch": 1.2795353493027912, + "grad_norm": 0.353472807759236, + "learning_rate": 7.088962124041338e-05, + "loss": 2.8266, + "step": 27483 + }, + { + "epoch": 1.2795819074888843, + "grad_norm": 0.3106015941705489, + "learning_rate": 7.088716020745779e-05, + "loss": 2.7183, + "step": 27484 + }, + { + "epoch": 1.2796284656749772, + "grad_norm": 0.353620561731807, + "learning_rate": 7.088469911320043e-05, + "loss": 2.8546, + "step": 27485 + }, + { + "epoch": 1.2796750238610703, + "grad_norm": 0.3226711044585625, + "learning_rate": 7.088223795764857e-05, + "loss": 2.8542, + "step": 27486 + }, + { + "epoch": 1.2797215820471635, + "grad_norm": 0.3581780444325256, + "learning_rate": 7.087977674080936e-05, + "loss": 2.6963, + "step": 27487 + }, + { + "epoch": 1.2797681402332566, + "grad_norm": 0.33499186551240495, + "learning_rate": 7.087731546269009e-05, + "loss": 2.7452, + "step": 27488 + }, + { + "epoch": 1.2798146984193495, + "grad_norm": 0.3618451484201659, + "learning_rate": 7.087485412329794e-05, + "loss": 2.7952, + "step": 27489 + }, + { + "epoch": 1.2798612566054426, + "grad_norm": 0.3142175341575432, + "learning_rate": 7.087239272264014e-05, + "loss": 2.5525, + "step": 27490 + }, + { + "epoch": 1.2799078147915357, + "grad_norm": 0.3551662503449451, + "learning_rate": 7.086993126072395e-05, + "loss": 2.7902, + "step": 27491 + }, + { + "epoch": 1.2799543729776288, + "grad_norm": 0.31929183217318474, + "learning_rate": 7.086746973755655e-05, + "loss": 2.8372, + "step": 27492 + }, + { + "epoch": 1.280000931163722, + "grad_norm": 0.34645361927135426, + "learning_rate": 7.086500815314519e-05, + "loss": 2.7141, + "step": 27493 + }, + { + "epoch": 1.280047489349815, + "grad_norm": 0.33954255347824785, + "learning_rate": 7.086254650749708e-05, + "loss": 2.8365, + "step": 27494 + }, + { + "epoch": 
1.280094047535908, + "grad_norm": 0.32159041106852365, + "learning_rate": 7.086008480061944e-05, + "loss": 2.7687, + "step": 27495 + }, + { + "epoch": 1.280140605722001, + "grad_norm": 0.3486305145099877, + "learning_rate": 7.085762303251953e-05, + "loss": 2.8047, + "step": 27496 + }, + { + "epoch": 1.2801871639080942, + "grad_norm": 0.3210357456206909, + "learning_rate": 7.085516120320454e-05, + "loss": 2.8321, + "step": 27497 + }, + { + "epoch": 1.2802337220941873, + "grad_norm": 0.35220533424019046, + "learning_rate": 7.08526993126817e-05, + "loss": 2.7299, + "step": 27498 + }, + { + "epoch": 1.2802802802802802, + "grad_norm": 0.33913320506855704, + "learning_rate": 7.085023736095825e-05, + "loss": 2.616, + "step": 27499 + }, + { + "epoch": 1.2803268384663733, + "grad_norm": 0.35855616365206655, + "learning_rate": 7.084777534804141e-05, + "loss": 2.873, + "step": 27500 + }, + { + "epoch": 1.2803733966524664, + "grad_norm": 0.361176628074832, + "learning_rate": 7.084531327393841e-05, + "loss": 2.7456, + "step": 27501 + }, + { + "epoch": 1.2804199548385595, + "grad_norm": 0.3467810378806595, + "learning_rate": 7.084285113865645e-05, + "loss": 2.806, + "step": 27502 + }, + { + "epoch": 1.2804665130246526, + "grad_norm": 0.3646321852779925, + "learning_rate": 7.084038894220278e-05, + "loss": 2.7864, + "step": 27503 + }, + { + "epoch": 1.2805130712107458, + "grad_norm": 0.3436260589764457, + "learning_rate": 7.083792668458463e-05, + "loss": 2.7073, + "step": 27504 + }, + { + "epoch": 1.2805596293968387, + "grad_norm": 0.3602272520270145, + "learning_rate": 7.083546436580923e-05, + "loss": 2.8554, + "step": 27505 + }, + { + "epoch": 1.2806061875829318, + "grad_norm": 0.3352269583070273, + "learning_rate": 7.083300198588377e-05, + "loss": 2.7833, + "step": 27506 + }, + { + "epoch": 1.2806527457690249, + "grad_norm": 0.3451177580240167, + "learning_rate": 7.083053954481551e-05, + "loss": 2.8283, + "step": 27507 + }, + { + "epoch": 1.2806993039551178, + "grad_norm": 0.3494188143371117, + "learning_rate": 7.082807704261166e-05, + "loss": 2.7606, + "step": 27508 + }, + { + "epoch": 1.280745862141211, + "grad_norm": 0.32040095648807054, + "learning_rate": 7.082561447927947e-05, + "loss": 2.8273, + "step": 27509 + }, + { + "epoch": 1.280792420327304, + "grad_norm": 0.3552730920031925, + "learning_rate": 7.082315185482615e-05, + "loss": 2.8047, + "step": 27510 + }, + { + "epoch": 1.2808389785133971, + "grad_norm": 0.3276829944408712, + "learning_rate": 7.082068916925892e-05, + "loss": 2.6922, + "step": 27511 + }, + { + "epoch": 1.2808855366994902, + "grad_norm": 0.3684240926365403, + "learning_rate": 7.081822642258504e-05, + "loss": 2.6986, + "step": 27512 + }, + { + "epoch": 1.2809320948855833, + "grad_norm": 0.327482922536436, + "learning_rate": 7.08157636148117e-05, + "loss": 2.807, + "step": 27513 + }, + { + "epoch": 1.2809786530716765, + "grad_norm": 0.39467631066151515, + "learning_rate": 7.081330074594616e-05, + "loss": 2.7735, + "step": 27514 + }, + { + "epoch": 1.2810252112577694, + "grad_norm": 0.36025503339666015, + "learning_rate": 7.081083781599561e-05, + "loss": 2.7343, + "step": 27515 + }, + { + "epoch": 1.2810717694438625, + "grad_norm": 0.3400235096946511, + "learning_rate": 7.080837482496732e-05, + "loss": 2.7943, + "step": 27516 + }, + { + "epoch": 1.2811183276299556, + "grad_norm": 0.3554204279451737, + "learning_rate": 7.080591177286848e-05, + "loss": 2.757, + "step": 27517 + }, + { + "epoch": 1.2811648858160485, + "grad_norm": 0.3531040515602355, + "learning_rate": 
7.080344865970636e-05, + "loss": 2.7941, + "step": 27518 + }, + { + "epoch": 1.2812114440021416, + "grad_norm": 0.32842908218462474, + "learning_rate": 7.080098548548815e-05, + "loss": 2.7787, + "step": 27519 + }, + { + "epoch": 1.2812580021882347, + "grad_norm": 0.3522052800219384, + "learning_rate": 7.07985222502211e-05, + "loss": 2.7655, + "step": 27520 + }, + { + "epoch": 1.2813045603743278, + "grad_norm": 0.33892783774143764, + "learning_rate": 7.079605895391244e-05, + "loss": 2.8058, + "step": 27521 + }, + { + "epoch": 1.281351118560421, + "grad_norm": 0.36537335420405126, + "learning_rate": 7.079359559656939e-05, + "loss": 2.7462, + "step": 27522 + }, + { + "epoch": 1.281397676746514, + "grad_norm": 0.3253493717891332, + "learning_rate": 7.07911321781992e-05, + "loss": 2.5886, + "step": 27523 + }, + { + "epoch": 1.281444234932607, + "grad_norm": 0.3660016135144184, + "learning_rate": 7.078866869880908e-05, + "loss": 2.7294, + "step": 27524 + }, + { + "epoch": 1.2814907931187, + "grad_norm": 0.33365214819324557, + "learning_rate": 7.078620515840628e-05, + "loss": 2.7383, + "step": 27525 + }, + { + "epoch": 1.2815373513047932, + "grad_norm": 0.342988748147713, + "learning_rate": 7.078374155699798e-05, + "loss": 2.7504, + "step": 27526 + }, + { + "epoch": 1.2815839094908863, + "grad_norm": 0.35740030826766855, + "learning_rate": 7.078127789459148e-05, + "loss": 2.8047, + "step": 27527 + }, + { + "epoch": 1.2816304676769792, + "grad_norm": 0.3379893724513776, + "learning_rate": 7.077881417119396e-05, + "loss": 2.7813, + "step": 27528 + }, + { + "epoch": 1.2816770258630723, + "grad_norm": 0.3643963521298794, + "learning_rate": 7.077635038681266e-05, + "loss": 2.6645, + "step": 27529 + }, + { + "epoch": 1.2817235840491654, + "grad_norm": 0.29414459373863316, + "learning_rate": 7.077388654145483e-05, + "loss": 2.7489, + "step": 27530 + }, + { + "epoch": 1.2817701422352585, + "grad_norm": 0.31475679009943, + "learning_rate": 7.077142263512768e-05, + "loss": 2.6962, + "step": 27531 + }, + { + "epoch": 1.2818167004213517, + "grad_norm": 0.32180328884726034, + "learning_rate": 7.076895866783846e-05, + "loss": 2.7608, + "step": 27532 + }, + { + "epoch": 1.2818632586074448, + "grad_norm": 0.3154546113664863, + "learning_rate": 7.07664946395944e-05, + "loss": 2.8448, + "step": 27533 + }, + { + "epoch": 1.2819098167935377, + "grad_norm": 0.3497105565672039, + "learning_rate": 7.07640305504027e-05, + "loss": 2.6742, + "step": 27534 + }, + { + "epoch": 1.2819563749796308, + "grad_norm": 0.3207536079210245, + "learning_rate": 7.076156640027063e-05, + "loss": 2.8371, + "step": 27535 + }, + { + "epoch": 1.282002933165724, + "grad_norm": 0.32106814890720586, + "learning_rate": 7.075910218920541e-05, + "loss": 2.6329, + "step": 27536 + }, + { + "epoch": 1.282049491351817, + "grad_norm": 0.32853535901463893, + "learning_rate": 7.075663791721428e-05, + "loss": 2.7905, + "step": 27537 + }, + { + "epoch": 1.28209604953791, + "grad_norm": 0.3386626194054026, + "learning_rate": 7.075417358430445e-05, + "loss": 2.8641, + "step": 27538 + }, + { + "epoch": 1.282142607724003, + "grad_norm": 0.3427089729234748, + "learning_rate": 7.075170919048317e-05, + "loss": 2.6997, + "step": 27539 + }, + { + "epoch": 1.2821891659100961, + "grad_norm": 0.3218539266811145, + "learning_rate": 7.074924473575766e-05, + "loss": 2.7687, + "step": 27540 + }, + { + "epoch": 1.2822357240961892, + "grad_norm": 0.2887415518859082, + "learning_rate": 7.074678022013517e-05, + "loss": 2.733, + "step": 27541 + }, + { + "epoch": 
1.2822822822822824, + "grad_norm": 0.33668653267467796, + "learning_rate": 7.074431564362292e-05, + "loss": 2.7169, + "step": 27542 + }, + { + "epoch": 1.2823288404683755, + "grad_norm": 0.3024881492263512, + "learning_rate": 7.074185100622814e-05, + "loss": 2.6882, + "step": 27543 + }, + { + "epoch": 1.2823753986544684, + "grad_norm": 0.3119082891703998, + "learning_rate": 7.073938630795806e-05, + "loss": 2.75, + "step": 27544 + }, + { + "epoch": 1.2824219568405615, + "grad_norm": 0.3763519849135089, + "learning_rate": 7.073692154881996e-05, + "loss": 2.8019, + "step": 27545 + }, + { + "epoch": 1.2824685150266546, + "grad_norm": 0.30383927510269454, + "learning_rate": 7.073445672882101e-05, + "loss": 2.7321, + "step": 27546 + }, + { + "epoch": 1.2825150732127477, + "grad_norm": 0.3738345079576748, + "learning_rate": 7.073199184796849e-05, + "loss": 2.8741, + "step": 27547 + }, + { + "epoch": 1.2825616313988406, + "grad_norm": 0.30390964351608923, + "learning_rate": 7.07295269062696e-05, + "loss": 2.749, + "step": 27548 + }, + { + "epoch": 1.2826081895849337, + "grad_norm": 0.3528527829223225, + "learning_rate": 7.07270619037316e-05, + "loss": 2.7525, + "step": 27549 + }, + { + "epoch": 1.2826547477710268, + "grad_norm": 0.34987829941115306, + "learning_rate": 7.072459684036172e-05, + "loss": 2.8575, + "step": 27550 + }, + { + "epoch": 1.28270130595712, + "grad_norm": 0.3383536806816191, + "learning_rate": 7.072213171616718e-05, + "loss": 2.7226, + "step": 27551 + }, + { + "epoch": 1.282747864143213, + "grad_norm": 0.33475176962019343, + "learning_rate": 7.071966653115523e-05, + "loss": 2.7874, + "step": 27552 + }, + { + "epoch": 1.2827944223293062, + "grad_norm": 0.3442531530535472, + "learning_rate": 7.07172012853331e-05, + "loss": 2.7983, + "step": 27553 + }, + { + "epoch": 1.282840980515399, + "grad_norm": 0.32311019005747155, + "learning_rate": 7.071473597870802e-05, + "loss": 2.7575, + "step": 27554 + }, + { + "epoch": 1.2828875387014922, + "grad_norm": 0.32765220749479335, + "learning_rate": 7.071227061128724e-05, + "loss": 2.8553, + "step": 27555 + }, + { + "epoch": 1.2829340968875853, + "grad_norm": 0.32723783009133545, + "learning_rate": 7.070980518307797e-05, + "loss": 2.7814, + "step": 27556 + }, + { + "epoch": 1.2829806550736782, + "grad_norm": 0.32443244680376787, + "learning_rate": 7.070733969408747e-05, + "loss": 2.8383, + "step": 27557 + }, + { + "epoch": 1.2830272132597713, + "grad_norm": 0.3519872115781187, + "learning_rate": 7.070487414432296e-05, + "loss": 2.8109, + "step": 27558 + }, + { + "epoch": 1.2830737714458644, + "grad_norm": 0.3359579453056559, + "learning_rate": 7.07024085337917e-05, + "loss": 2.7756, + "step": 27559 + }, + { + "epoch": 1.2831203296319575, + "grad_norm": 0.3271216857524921, + "learning_rate": 7.06999428625009e-05, + "loss": 2.8711, + "step": 27560 + }, + { + "epoch": 1.2831668878180507, + "grad_norm": 0.35277566826163975, + "learning_rate": 7.069747713045783e-05, + "loss": 2.7821, + "step": 27561 + }, + { + "epoch": 1.2832134460041438, + "grad_norm": 0.30977453727649545, + "learning_rate": 7.069501133766968e-05, + "loss": 2.7622, + "step": 27562 + }, + { + "epoch": 1.2832600041902367, + "grad_norm": 0.34039285894743715, + "learning_rate": 7.069254548414373e-05, + "loss": 2.8229, + "step": 27563 + }, + { + "epoch": 1.2833065623763298, + "grad_norm": 0.3390747185329035, + "learning_rate": 7.069007956988718e-05, + "loss": 2.9437, + "step": 27564 + }, + { + "epoch": 1.283353120562423, + "grad_norm": 0.3393197019128002, + "learning_rate": 
7.068761359490729e-05, + "loss": 2.799, + "step": 27565 + }, + { + "epoch": 1.283399678748516, + "grad_norm": 0.3502300917873219, + "learning_rate": 7.068514755921129e-05, + "loss": 2.8993, + "step": 27566 + }, + { + "epoch": 1.283446236934609, + "grad_norm": 0.33056489384674265, + "learning_rate": 7.068268146280642e-05, + "loss": 2.6756, + "step": 27567 + }, + { + "epoch": 1.283492795120702, + "grad_norm": 0.3329298004203064, + "learning_rate": 7.068021530569993e-05, + "loss": 2.7066, + "step": 27568 + }, + { + "epoch": 1.2835393533067951, + "grad_norm": 0.3156813782700637, + "learning_rate": 7.067774908789901e-05, + "loss": 2.7452, + "step": 27569 + }, + { + "epoch": 1.2835859114928883, + "grad_norm": 0.3665701720509173, + "learning_rate": 7.067528280941096e-05, + "loss": 2.7495, + "step": 27570 + }, + { + "epoch": 1.2836324696789814, + "grad_norm": 0.310570513711226, + "learning_rate": 7.067281647024298e-05, + "loss": 2.7147, + "step": 27571 + }, + { + "epoch": 1.2836790278650745, + "grad_norm": 0.32439634845752296, + "learning_rate": 7.067035007040235e-05, + "loss": 2.8018, + "step": 27572 + }, + { + "epoch": 1.2837255860511674, + "grad_norm": 0.339743126370965, + "learning_rate": 7.066788360989625e-05, + "loss": 2.7733, + "step": 27573 + }, + { + "epoch": 1.2837721442372605, + "grad_norm": 0.3355652484184572, + "learning_rate": 7.066541708873196e-05, + "loss": 2.7589, + "step": 27574 + }, + { + "epoch": 1.2838187024233536, + "grad_norm": 0.3315928846877796, + "learning_rate": 7.066295050691669e-05, + "loss": 2.7461, + "step": 27575 + }, + { + "epoch": 1.2838652606094467, + "grad_norm": 0.33498976004835135, + "learning_rate": 7.066048386445771e-05, + "loss": 2.6308, + "step": 27576 + }, + { + "epoch": 1.2839118187955396, + "grad_norm": 0.33087876867385685, + "learning_rate": 7.065801716136227e-05, + "loss": 2.8782, + "step": 27577 + }, + { + "epoch": 1.2839583769816327, + "grad_norm": 0.33744045259450467, + "learning_rate": 7.065555039763755e-05, + "loss": 2.7806, + "step": 27578 + }, + { + "epoch": 1.2840049351677258, + "grad_norm": 0.34139678712630106, + "learning_rate": 7.065308357329083e-05, + "loss": 2.8214, + "step": 27579 + }, + { + "epoch": 1.284051493353819, + "grad_norm": 0.34591517373044656, + "learning_rate": 7.065061668832935e-05, + "loss": 2.7564, + "step": 27580 + }, + { + "epoch": 1.284098051539912, + "grad_norm": 0.3321567493189195, + "learning_rate": 7.064814974276034e-05, + "loss": 2.733, + "step": 27581 + }, + { + "epoch": 1.2841446097260052, + "grad_norm": 0.3439588261865581, + "learning_rate": 7.064568273659104e-05, + "loss": 2.8237, + "step": 27582 + }, + { + "epoch": 1.284191167912098, + "grad_norm": 0.33555215760229384, + "learning_rate": 7.06432156698287e-05, + "loss": 2.8248, + "step": 27583 + }, + { + "epoch": 1.2842377260981912, + "grad_norm": 0.3393687355822465, + "learning_rate": 7.064074854248055e-05, + "loss": 2.8302, + "step": 27584 + }, + { + "epoch": 1.2842842842842843, + "grad_norm": 0.3174787632236683, + "learning_rate": 7.063828135455384e-05, + "loss": 2.8019, + "step": 27585 + }, + { + "epoch": 1.2843308424703774, + "grad_norm": 0.35516954748859225, + "learning_rate": 7.063581410605582e-05, + "loss": 2.7726, + "step": 27586 + }, + { + "epoch": 1.2843774006564703, + "grad_norm": 0.32137879609705083, + "learning_rate": 7.063334679699371e-05, + "loss": 2.7722, + "step": 27587 + }, + { + "epoch": 1.2844239588425634, + "grad_norm": 0.33550390544997394, + "learning_rate": 7.063087942737476e-05, + "loss": 2.7927, + "step": 27588 + }, + { + "epoch": 
1.2844705170286566, + "grad_norm": 0.32592774266886854, + "learning_rate": 7.062841199720622e-05, + "loss": 2.675, + "step": 27589 + }, + { + "epoch": 1.2845170752147497, + "grad_norm": 0.3306053775178157, + "learning_rate": 7.062594450649532e-05, + "loss": 2.7786, + "step": 27590 + }, + { + "epoch": 1.2845636334008428, + "grad_norm": 0.38902176760257795, + "learning_rate": 7.06234769552493e-05, + "loss": 2.8236, + "step": 27591 + }, + { + "epoch": 1.284610191586936, + "grad_norm": 0.32151045865664624, + "learning_rate": 7.06210093434754e-05, + "loss": 2.7051, + "step": 27592 + }, + { + "epoch": 1.2846567497730288, + "grad_norm": 0.3277946202883401, + "learning_rate": 7.061854167118087e-05, + "loss": 2.7359, + "step": 27593 + }, + { + "epoch": 1.284703307959122, + "grad_norm": 0.3491396970128269, + "learning_rate": 7.061607393837295e-05, + "loss": 2.8673, + "step": 27594 + }, + { + "epoch": 1.284749866145215, + "grad_norm": 0.30305431364890634, + "learning_rate": 7.061360614505889e-05, + "loss": 2.7976, + "step": 27595 + }, + { + "epoch": 1.284796424331308, + "grad_norm": 0.295939221546353, + "learning_rate": 7.061113829124593e-05, + "loss": 2.732, + "step": 27596 + }, + { + "epoch": 1.284842982517401, + "grad_norm": 0.3211321801559614, + "learning_rate": 7.060867037694131e-05, + "loss": 2.7451, + "step": 27597 + }, + { + "epoch": 1.2848895407034941, + "grad_norm": 0.3092788520612306, + "learning_rate": 7.060620240215227e-05, + "loss": 2.7637, + "step": 27598 + }, + { + "epoch": 1.2849360988895873, + "grad_norm": 0.3248542337917541, + "learning_rate": 7.060373436688606e-05, + "loss": 2.7219, + "step": 27599 + }, + { + "epoch": 1.2849826570756804, + "grad_norm": 0.3315319029183516, + "learning_rate": 7.060126627114992e-05, + "loss": 2.8042, + "step": 27600 + }, + { + "epoch": 1.2850292152617735, + "grad_norm": 0.33867999823159256, + "learning_rate": 7.059879811495107e-05, + "loss": 2.8228, + "step": 27601 + }, + { + "epoch": 1.2850757734478666, + "grad_norm": 0.32145192536031214, + "learning_rate": 7.05963298982968e-05, + "loss": 2.787, + "step": 27602 + }, + { + "epoch": 1.2851223316339595, + "grad_norm": 0.3070751025169529, + "learning_rate": 7.059386162119433e-05, + "loss": 2.7383, + "step": 27603 + }, + { + "epoch": 1.2851688898200526, + "grad_norm": 0.32387189891119744, + "learning_rate": 7.059139328365091e-05, + "loss": 2.8533, + "step": 27604 + }, + { + "epoch": 1.2852154480061457, + "grad_norm": 0.3215265213838132, + "learning_rate": 7.058892488567377e-05, + "loss": 2.8404, + "step": 27605 + }, + { + "epoch": 1.2852620061922386, + "grad_norm": 0.571662827114118, + "learning_rate": 7.058645642727016e-05, + "loss": 2.842, + "step": 27606 + }, + { + "epoch": 1.2853085643783317, + "grad_norm": 0.36113290857141817, + "learning_rate": 7.058398790844732e-05, + "loss": 2.6826, + "step": 27607 + }, + { + "epoch": 1.2853551225644249, + "grad_norm": 0.3640682991730454, + "learning_rate": 7.058151932921254e-05, + "loss": 2.7073, + "step": 27608 + }, + { + "epoch": 1.285401680750518, + "grad_norm": 0.3418339878962266, + "learning_rate": 7.057905068957298e-05, + "loss": 2.7848, + "step": 27609 + }, + { + "epoch": 1.285448238936611, + "grad_norm": 0.37039152821175214, + "learning_rate": 7.057658198953596e-05, + "loss": 2.801, + "step": 27610 + }, + { + "epoch": 1.2854947971227042, + "grad_norm": 0.41550622352179095, + "learning_rate": 7.05741132291087e-05, + "loss": 2.7301, + "step": 27611 + }, + { + "epoch": 1.285541355308797, + "grad_norm": 0.34699294518707835, + "learning_rate": 
7.057164440829844e-05, + "loss": 2.8277, + "step": 27612 + }, + { + "epoch": 1.2855879134948902, + "grad_norm": 0.31702054787573647, + "learning_rate": 7.056917552711244e-05, + "loss": 2.7588, + "step": 27613 + }, + { + "epoch": 1.2856344716809833, + "grad_norm": 0.3437088891327026, + "learning_rate": 7.056670658555791e-05, + "loss": 2.7584, + "step": 27614 + }, + { + "epoch": 1.2856810298670764, + "grad_norm": 0.3416200706225733, + "learning_rate": 7.056423758364216e-05, + "loss": 2.8152, + "step": 27615 + }, + { + "epoch": 1.2857275880531693, + "grad_norm": 0.30927688371474255, + "learning_rate": 7.056176852137237e-05, + "loss": 2.7848, + "step": 27616 + }, + { + "epoch": 1.2857741462392624, + "grad_norm": 0.3402315746032749, + "learning_rate": 7.055929939875583e-05, + "loss": 2.7229, + "step": 27617 + }, + { + "epoch": 1.2858207044253556, + "grad_norm": 0.3189405018920224, + "learning_rate": 7.055683021579976e-05, + "loss": 2.7025, + "step": 27618 + }, + { + "epoch": 1.2858672626114487, + "grad_norm": 0.34904662504161105, + "learning_rate": 7.055436097251142e-05, + "loss": 2.8552, + "step": 27619 + }, + { + "epoch": 1.2859138207975418, + "grad_norm": 0.3456988580850209, + "learning_rate": 7.055189166889806e-05, + "loss": 2.8213, + "step": 27620 + }, + { + "epoch": 1.285960378983635, + "grad_norm": 0.3413496459760938, + "learning_rate": 7.054942230496693e-05, + "loss": 2.8104, + "step": 27621 + }, + { + "epoch": 1.2860069371697278, + "grad_norm": 0.33736235067344317, + "learning_rate": 7.054695288072526e-05, + "loss": 2.8291, + "step": 27622 + }, + { + "epoch": 1.286053495355821, + "grad_norm": 0.34390431580345704, + "learning_rate": 7.054448339618031e-05, + "loss": 2.8099, + "step": 27623 + }, + { + "epoch": 1.286100053541914, + "grad_norm": 0.427662256740111, + "learning_rate": 7.054201385133932e-05, + "loss": 2.801, + "step": 27624 + }, + { + "epoch": 1.2861466117280071, + "grad_norm": 0.3644253447864923, + "learning_rate": 7.053954424620954e-05, + "loss": 2.8838, + "step": 27625 + }, + { + "epoch": 1.2861931699141, + "grad_norm": 0.3624064865237384, + "learning_rate": 7.053707458079825e-05, + "loss": 2.7302, + "step": 27626 + }, + { + "epoch": 1.2862397281001932, + "grad_norm": 0.3774615921634013, + "learning_rate": 7.053460485511264e-05, + "loss": 2.6565, + "step": 27627 + }, + { + "epoch": 1.2862862862862863, + "grad_norm": 0.30934085199030753, + "learning_rate": 7.053213506916e-05, + "loss": 2.8235, + "step": 27628 + }, + { + "epoch": 1.2863328444723794, + "grad_norm": 0.3423710412112102, + "learning_rate": 7.052966522294756e-05, + "loss": 2.81, + "step": 27629 + }, + { + "epoch": 1.2863794026584725, + "grad_norm": 0.31698214473034325, + "learning_rate": 7.052719531648258e-05, + "loss": 2.7641, + "step": 27630 + }, + { + "epoch": 1.2864259608445656, + "grad_norm": 0.3043938200549575, + "learning_rate": 7.05247253497723e-05, + "loss": 2.7879, + "step": 27631 + }, + { + "epoch": 1.2864725190306585, + "grad_norm": 0.3286418148695611, + "learning_rate": 7.052225532282397e-05, + "loss": 2.846, + "step": 27632 + }, + { + "epoch": 1.2865190772167516, + "grad_norm": 0.3249750168268111, + "learning_rate": 7.051978523564484e-05, + "loss": 2.7796, + "step": 27633 + }, + { + "epoch": 1.2865656354028447, + "grad_norm": 0.3239685727382203, + "learning_rate": 7.051731508824218e-05, + "loss": 2.8128, + "step": 27634 + }, + { + "epoch": 1.2866121935889379, + "grad_norm": 0.35085493381054716, + "learning_rate": 7.051484488062321e-05, + "loss": 2.8035, + "step": 27635 + }, + { + "epoch": 
1.2866587517750308, + "grad_norm": 0.3417702934580964, + "learning_rate": 7.051237461279519e-05, + "loss": 2.6546, + "step": 27636 + }, + { + "epoch": 1.2867053099611239, + "grad_norm": 0.35474934119856344, + "learning_rate": 7.050990428476537e-05, + "loss": 2.831, + "step": 27637 + }, + { + "epoch": 1.286751868147217, + "grad_norm": 0.3184012374167769, + "learning_rate": 7.050743389654099e-05, + "loss": 2.8221, + "step": 27638 + }, + { + "epoch": 1.28679842633331, + "grad_norm": 0.3476948334028524, + "learning_rate": 7.050496344812934e-05, + "loss": 2.774, + "step": 27639 + }, + { + "epoch": 1.2868449845194032, + "grad_norm": 0.33523469497417324, + "learning_rate": 7.050249293953763e-05, + "loss": 2.7765, + "step": 27640 + }, + { + "epoch": 1.2868915427054963, + "grad_norm": 0.34215543221751077, + "learning_rate": 7.050002237077313e-05, + "loss": 2.6836, + "step": 27641 + }, + { + "epoch": 1.2869381008915892, + "grad_norm": 0.3535700180005226, + "learning_rate": 7.049755174184308e-05, + "loss": 2.8691, + "step": 27642 + }, + { + "epoch": 1.2869846590776823, + "grad_norm": 0.3227953637140154, + "learning_rate": 7.049508105275471e-05, + "loss": 2.7459, + "step": 27643 + }, + { + "epoch": 1.2870312172637755, + "grad_norm": 0.32849037161531547, + "learning_rate": 7.049261030351534e-05, + "loss": 2.7517, + "step": 27644 + }, + { + "epoch": 1.2870777754498683, + "grad_norm": 0.32456448063122856, + "learning_rate": 7.049013949413214e-05, + "loss": 2.7431, + "step": 27645 + }, + { + "epoch": 1.2871243336359615, + "grad_norm": 0.3272349177043684, + "learning_rate": 7.048766862461241e-05, + "loss": 2.7983, + "step": 27646 + }, + { + "epoch": 1.2871708918220546, + "grad_norm": 0.3273211991753211, + "learning_rate": 7.04851976949634e-05, + "loss": 2.825, + "step": 27647 + }, + { + "epoch": 1.2872174500081477, + "grad_norm": 0.3533056184103023, + "learning_rate": 7.048272670519234e-05, + "loss": 2.7476, + "step": 27648 + }, + { + "epoch": 1.2872640081942408, + "grad_norm": 0.34824150946037424, + "learning_rate": 7.048025565530651e-05, + "loss": 2.7906, + "step": 27649 + }, + { + "epoch": 1.287310566380334, + "grad_norm": 0.3499496276335422, + "learning_rate": 7.047778454531314e-05, + "loss": 2.8176, + "step": 27650 + }, + { + "epoch": 1.2873571245664268, + "grad_norm": 0.3976034496491144, + "learning_rate": 7.047531337521948e-05, + "loss": 2.8262, + "step": 27651 + }, + { + "epoch": 1.28740368275252, + "grad_norm": 0.32685373452021665, + "learning_rate": 7.047284214503281e-05, + "loss": 2.781, + "step": 27652 + }, + { + "epoch": 1.287450240938613, + "grad_norm": 0.35459377751694393, + "learning_rate": 7.047037085476036e-05, + "loss": 2.7521, + "step": 27653 + }, + { + "epoch": 1.2874967991247062, + "grad_norm": 0.34788946088707123, + "learning_rate": 7.04678995044094e-05, + "loss": 2.7114, + "step": 27654 + }, + { + "epoch": 1.287543357310799, + "grad_norm": 0.3846892922636588, + "learning_rate": 7.046542809398715e-05, + "loss": 2.7904, + "step": 27655 + }, + { + "epoch": 1.2875899154968922, + "grad_norm": 0.3493015724551526, + "learning_rate": 7.046295662350089e-05, + "loss": 2.819, + "step": 27656 + }, + { + "epoch": 1.2876364736829853, + "grad_norm": 0.3543998145244514, + "learning_rate": 7.046048509295787e-05, + "loss": 2.7862, + "step": 27657 + }, + { + "epoch": 1.2876830318690784, + "grad_norm": 0.37352114345573045, + "learning_rate": 7.045801350236535e-05, + "loss": 2.8732, + "step": 27658 + }, + { + "epoch": 1.2877295900551715, + "grad_norm": 0.3468139274874077, + "learning_rate": 
7.045554185173056e-05, + "loss": 2.7786, + "step": 27659 + }, + { + "epoch": 1.2877761482412646, + "grad_norm": 0.3709156303918308, + "learning_rate": 7.045307014106078e-05, + "loss": 2.7144, + "step": 27660 + }, + { + "epoch": 1.2878227064273575, + "grad_norm": 0.34707669188112283, + "learning_rate": 7.045059837036325e-05, + "loss": 2.7774, + "step": 27661 + }, + { + "epoch": 1.2878692646134506, + "grad_norm": 0.3591202536861882, + "learning_rate": 7.044812653964522e-05, + "loss": 2.7149, + "step": 27662 + }, + { + "epoch": 1.2879158227995438, + "grad_norm": 0.34825426650150393, + "learning_rate": 7.044565464891396e-05, + "loss": 2.7334, + "step": 27663 + }, + { + "epoch": 1.2879623809856369, + "grad_norm": 0.3620239020829689, + "learning_rate": 7.044318269817673e-05, + "loss": 2.7508, + "step": 27664 + }, + { + "epoch": 1.2880089391717298, + "grad_norm": 0.33355294712080463, + "learning_rate": 7.044071068744076e-05, + "loss": 2.7827, + "step": 27665 + }, + { + "epoch": 1.2880554973578229, + "grad_norm": 0.3505366132445528, + "learning_rate": 7.043823861671332e-05, + "loss": 2.7175, + "step": 27666 + }, + { + "epoch": 1.288102055543916, + "grad_norm": 0.340651354376317, + "learning_rate": 7.043576648600168e-05, + "loss": 2.612, + "step": 27667 + }, + { + "epoch": 1.288148613730009, + "grad_norm": 0.3354214309732132, + "learning_rate": 7.043329429531306e-05, + "loss": 2.6876, + "step": 27668 + }, + { + "epoch": 1.2881951719161022, + "grad_norm": 0.3422325950973206, + "learning_rate": 7.043082204465473e-05, + "loss": 2.7466, + "step": 27669 + }, + { + "epoch": 1.2882417301021953, + "grad_norm": 0.3144582243342485, + "learning_rate": 7.042834973403395e-05, + "loss": 2.6776, + "step": 27670 + }, + { + "epoch": 1.2882882882882882, + "grad_norm": 0.3137270040755919, + "learning_rate": 7.0425877363458e-05, + "loss": 2.7667, + "step": 27671 + }, + { + "epoch": 1.2883348464743813, + "grad_norm": 0.3269552013915792, + "learning_rate": 7.04234049329341e-05, + "loss": 2.8791, + "step": 27672 + }, + { + "epoch": 1.2883814046604745, + "grad_norm": 0.3290770470110519, + "learning_rate": 7.04209324424695e-05, + "loss": 2.7423, + "step": 27673 + }, + { + "epoch": 1.2884279628465676, + "grad_norm": 0.34312761460563074, + "learning_rate": 7.041845989207149e-05, + "loss": 2.779, + "step": 27674 + }, + { + "epoch": 1.2884745210326605, + "grad_norm": 0.3110936598319468, + "learning_rate": 7.041598728174729e-05, + "loss": 2.8683, + "step": 27675 + }, + { + "epoch": 1.2885210792187536, + "grad_norm": 0.36715557816603006, + "learning_rate": 7.041351461150421e-05, + "loss": 2.7773, + "step": 27676 + }, + { + "epoch": 1.2885676374048467, + "grad_norm": 0.3423122164452602, + "learning_rate": 7.041104188134948e-05, + "loss": 2.7712, + "step": 27677 + }, + { + "epoch": 1.2886141955909398, + "grad_norm": 0.3343014432895765, + "learning_rate": 7.040856909129035e-05, + "loss": 2.6844, + "step": 27678 + }, + { + "epoch": 1.288660753777033, + "grad_norm": 0.33915881709601126, + "learning_rate": 7.040609624133405e-05, + "loss": 2.7606, + "step": 27679 + }, + { + "epoch": 1.288707311963126, + "grad_norm": 0.3459626215364932, + "learning_rate": 7.04036233314879e-05, + "loss": 2.7369, + "step": 27680 + }, + { + "epoch": 1.288753870149219, + "grad_norm": 0.31819384163507414, + "learning_rate": 7.04011503617591e-05, + "loss": 2.7856, + "step": 27681 + }, + { + "epoch": 1.288800428335312, + "grad_norm": 0.3410857771890478, + "learning_rate": 7.039867733215495e-05, + "loss": 2.8739, + "step": 27682 + }, + { + "epoch": 
1.2888469865214052, + "grad_norm": 0.30666774162396077, + "learning_rate": 7.039620424268268e-05, + "loss": 2.7494, + "step": 27683 + }, + { + "epoch": 1.288893544707498, + "grad_norm": 0.3449208261379872, + "learning_rate": 7.039373109334957e-05, + "loss": 2.7775, + "step": 27684 + }, + { + "epoch": 1.2889401028935912, + "grad_norm": 0.38499475456323384, + "learning_rate": 7.039125788416287e-05, + "loss": 2.7635, + "step": 27685 + }, + { + "epoch": 1.2889866610796843, + "grad_norm": 0.3670562730983952, + "learning_rate": 7.038878461512983e-05, + "loss": 2.8752, + "step": 27686 + }, + { + "epoch": 1.2890332192657774, + "grad_norm": 0.38406160829106645, + "learning_rate": 7.038631128625772e-05, + "loss": 2.8125, + "step": 27687 + }, + { + "epoch": 1.2890797774518705, + "grad_norm": 0.35117384671716845, + "learning_rate": 7.038383789755378e-05, + "loss": 2.7159, + "step": 27688 + }, + { + "epoch": 1.2891263356379636, + "grad_norm": 0.3319478775267397, + "learning_rate": 7.038136444902531e-05, + "loss": 2.7009, + "step": 27689 + }, + { + "epoch": 1.2891728938240565, + "grad_norm": 0.3601443835617374, + "learning_rate": 7.037889094067953e-05, + "loss": 2.8265, + "step": 27690 + }, + { + "epoch": 1.2892194520101496, + "grad_norm": 0.3366223354012665, + "learning_rate": 7.037641737252371e-05, + "loss": 2.7147, + "step": 27691 + }, + { + "epoch": 1.2892660101962428, + "grad_norm": 0.35069086310448805, + "learning_rate": 7.037394374456512e-05, + "loss": 2.808, + "step": 27692 + }, + { + "epoch": 1.2893125683823359, + "grad_norm": 0.350627376661193, + "learning_rate": 7.0371470056811e-05, + "loss": 2.7952, + "step": 27693 + }, + { + "epoch": 1.2893591265684288, + "grad_norm": 0.362297599614803, + "learning_rate": 7.036899630926865e-05, + "loss": 2.7303, + "step": 27694 + }, + { + "epoch": 1.2894056847545219, + "grad_norm": 0.34696367081242824, + "learning_rate": 7.036652250194527e-05, + "loss": 2.8582, + "step": 27695 + }, + { + "epoch": 1.289452242940615, + "grad_norm": 0.3405101200681285, + "learning_rate": 7.036404863484816e-05, + "loss": 2.7487, + "step": 27696 + }, + { + "epoch": 1.2894988011267081, + "grad_norm": 0.34694203022011333, + "learning_rate": 7.036157470798458e-05, + "loss": 2.7844, + "step": 27697 + }, + { + "epoch": 1.2895453593128012, + "grad_norm": 0.3309994466349863, + "learning_rate": 7.035910072136179e-05, + "loss": 2.7724, + "step": 27698 + }, + { + "epoch": 1.2895919174988943, + "grad_norm": 0.33096385341012063, + "learning_rate": 7.035662667498704e-05, + "loss": 2.7719, + "step": 27699 + }, + { + "epoch": 1.2896384756849872, + "grad_norm": 0.35130079714886875, + "learning_rate": 7.03541525688676e-05, + "loss": 2.7695, + "step": 27700 + }, + { + "epoch": 1.2896850338710804, + "grad_norm": 0.348780523084102, + "learning_rate": 7.03516784030107e-05, + "loss": 2.8729, + "step": 27701 + }, + { + "epoch": 1.2897315920571735, + "grad_norm": 0.34529143457649997, + "learning_rate": 7.034920417742365e-05, + "loss": 2.8301, + "step": 27702 + }, + { + "epoch": 1.2897781502432666, + "grad_norm": 0.34818619840971227, + "learning_rate": 7.03467298921137e-05, + "loss": 2.8568, + "step": 27703 + }, + { + "epoch": 1.2898247084293595, + "grad_norm": 0.32011083566597137, + "learning_rate": 7.03442555470881e-05, + "loss": 2.693, + "step": 27704 + }, + { + "epoch": 1.2898712666154526, + "grad_norm": 0.33200976896663165, + "learning_rate": 7.034178114235411e-05, + "loss": 2.8272, + "step": 27705 + }, + { + "epoch": 1.2899178248015457, + "grad_norm": 0.36324563094644535, + "learning_rate": 
7.033930667791899e-05, + "loss": 2.7651, + "step": 27706 + }, + { + "epoch": 1.2899643829876388, + "grad_norm": 0.3127539197446907, + "learning_rate": 7.033683215379002e-05, + "loss": 2.7482, + "step": 27707 + }, + { + "epoch": 1.290010941173732, + "grad_norm": 0.324498630389769, + "learning_rate": 7.033435756997444e-05, + "loss": 2.6901, + "step": 27708 + }, + { + "epoch": 1.290057499359825, + "grad_norm": 0.31463018258325065, + "learning_rate": 7.033188292647953e-05, + "loss": 2.805, + "step": 27709 + }, + { + "epoch": 1.290104057545918, + "grad_norm": 0.3288647532324907, + "learning_rate": 7.032940822331255e-05, + "loss": 2.788, + "step": 27710 + }, + { + "epoch": 1.290150615732011, + "grad_norm": 0.31821517239571245, + "learning_rate": 7.032693346048074e-05, + "loss": 2.7602, + "step": 27711 + }, + { + "epoch": 1.2901971739181042, + "grad_norm": 0.2964254043008604, + "learning_rate": 7.032445863799141e-05, + "loss": 2.7124, + "step": 27712 + }, + { + "epoch": 1.2902437321041973, + "grad_norm": 0.3394116588213578, + "learning_rate": 7.032198375585177e-05, + "loss": 2.8282, + "step": 27713 + }, + { + "epoch": 1.2902902902902902, + "grad_norm": 0.29907609066941293, + "learning_rate": 7.031950881406913e-05, + "loss": 2.779, + "step": 27714 + }, + { + "epoch": 1.2903368484763833, + "grad_norm": 0.34862363655202544, + "learning_rate": 7.031703381265072e-05, + "loss": 2.821, + "step": 27715 + }, + { + "epoch": 1.2903834066624764, + "grad_norm": 0.34725607331910235, + "learning_rate": 7.031455875160383e-05, + "loss": 2.8428, + "step": 27716 + }, + { + "epoch": 1.2904299648485695, + "grad_norm": 0.33260857097026125, + "learning_rate": 7.031208363093571e-05, + "loss": 2.8328, + "step": 27717 + }, + { + "epoch": 1.2904765230346626, + "grad_norm": 0.3395049278542966, + "learning_rate": 7.030960845065362e-05, + "loss": 2.774, + "step": 27718 + }, + { + "epoch": 1.2905230812207558, + "grad_norm": 0.3496875211656551, + "learning_rate": 7.030713321076484e-05, + "loss": 2.872, + "step": 27719 + }, + { + "epoch": 1.2905696394068487, + "grad_norm": 0.3252158181235193, + "learning_rate": 7.030465791127659e-05, + "loss": 2.8958, + "step": 27720 + }, + { + "epoch": 1.2906161975929418, + "grad_norm": 0.36470765682433737, + "learning_rate": 7.030218255219621e-05, + "loss": 2.8054, + "step": 27721 + }, + { + "epoch": 1.2906627557790349, + "grad_norm": 0.33162267960807557, + "learning_rate": 7.02997071335309e-05, + "loss": 2.8573, + "step": 27722 + }, + { + "epoch": 1.2907093139651278, + "grad_norm": 0.3595108299296833, + "learning_rate": 7.029723165528796e-05, + "loss": 2.6481, + "step": 27723 + }, + { + "epoch": 1.290755872151221, + "grad_norm": 0.3447730251251618, + "learning_rate": 7.029475611747464e-05, + "loss": 2.7651, + "step": 27724 + }, + { + "epoch": 1.290802430337314, + "grad_norm": 0.33869377458411415, + "learning_rate": 7.029228052009822e-05, + "loss": 2.6819, + "step": 27725 + }, + { + "epoch": 1.2908489885234071, + "grad_norm": 0.37026866333664493, + "learning_rate": 7.028980486316595e-05, + "loss": 2.822, + "step": 27726 + }, + { + "epoch": 1.2908955467095002, + "grad_norm": 0.3502918566867178, + "learning_rate": 7.02873291466851e-05, + "loss": 2.7703, + "step": 27727 + }, + { + "epoch": 1.2909421048955934, + "grad_norm": 0.3400055418638091, + "learning_rate": 7.028485337066293e-05, + "loss": 2.7094, + "step": 27728 + }, + { + "epoch": 1.2909886630816865, + "grad_norm": 0.360383376125598, + "learning_rate": 7.028237753510672e-05, + "loss": 2.8562, + "step": 27729 + }, + { + "epoch": 
1.2910352212677794, + "grad_norm": 0.33366255441075876, + "learning_rate": 7.027990164002375e-05, + "loss": 2.7068, + "step": 27730 + }, + { + "epoch": 1.2910817794538725, + "grad_norm": 0.33532932736734233, + "learning_rate": 7.027742568542124e-05, + "loss": 2.7721, + "step": 27731 + }, + { + "epoch": 1.2911283376399656, + "grad_norm": 0.3273545648952967, + "learning_rate": 7.027494967130649e-05, + "loss": 2.7336, + "step": 27732 + }, + { + "epoch": 1.2911748958260585, + "grad_norm": 0.3298155793236389, + "learning_rate": 7.027247359768676e-05, + "loss": 2.7193, + "step": 27733 + }, + { + "epoch": 1.2912214540121516, + "grad_norm": 0.3305553379966469, + "learning_rate": 7.026999746456933e-05, + "loss": 2.6928, + "step": 27734 + }, + { + "epoch": 1.2912680121982447, + "grad_norm": 0.3274795810132721, + "learning_rate": 7.026752127196144e-05, + "loss": 2.7401, + "step": 27735 + }, + { + "epoch": 1.2913145703843378, + "grad_norm": 0.3250865112741355, + "learning_rate": 7.026504501987037e-05, + "loss": 2.7029, + "step": 27736 + }, + { + "epoch": 1.291361128570431, + "grad_norm": 0.33690007860187376, + "learning_rate": 7.026256870830339e-05, + "loss": 2.7849, + "step": 27737 + }, + { + "epoch": 1.291407686756524, + "grad_norm": 0.3258522832963386, + "learning_rate": 7.026009233726776e-05, + "loss": 2.7307, + "step": 27738 + }, + { + "epoch": 1.291454244942617, + "grad_norm": 0.3140299151415103, + "learning_rate": 7.025761590677078e-05, + "loss": 2.7122, + "step": 27739 + }, + { + "epoch": 1.29150080312871, + "grad_norm": 0.33755224787245, + "learning_rate": 7.025513941681967e-05, + "loss": 2.7704, + "step": 27740 + }, + { + "epoch": 1.2915473613148032, + "grad_norm": 0.3316288266645875, + "learning_rate": 7.025266286742174e-05, + "loss": 2.7858, + "step": 27741 + }, + { + "epoch": 1.2915939195008963, + "grad_norm": 0.3232877116980922, + "learning_rate": 7.025018625858424e-05, + "loss": 2.8221, + "step": 27742 + }, + { + "epoch": 1.2916404776869892, + "grad_norm": 0.32381970891944944, + "learning_rate": 7.024770959031442e-05, + "loss": 2.6967, + "step": 27743 + }, + { + "epoch": 1.2916870358730823, + "grad_norm": 0.32889165875229553, + "learning_rate": 7.024523286261958e-05, + "loss": 2.73, + "step": 27744 + }, + { + "epoch": 1.2917335940591754, + "grad_norm": 0.3418774631556283, + "learning_rate": 7.024275607550697e-05, + "loss": 2.7311, + "step": 27745 + }, + { + "epoch": 1.2917801522452685, + "grad_norm": 0.3454165592325005, + "learning_rate": 7.024027922898387e-05, + "loss": 2.7082, + "step": 27746 + }, + { + "epoch": 1.2918267104313617, + "grad_norm": 0.3289129456523808, + "learning_rate": 7.023780232305753e-05, + "loss": 2.821, + "step": 27747 + }, + { + "epoch": 1.2918732686174548, + "grad_norm": 0.337550559035577, + "learning_rate": 7.023532535773525e-05, + "loss": 2.8451, + "step": 27748 + }, + { + "epoch": 1.2919198268035477, + "grad_norm": 0.36148754925908944, + "learning_rate": 7.023284833302429e-05, + "loss": 2.8688, + "step": 27749 + }, + { + "epoch": 1.2919663849896408, + "grad_norm": 0.3310220274396, + "learning_rate": 7.023037124893189e-05, + "loss": 2.7925, + "step": 27750 + }, + { + "epoch": 1.292012943175734, + "grad_norm": 0.36634932105892626, + "learning_rate": 7.022789410546536e-05, + "loss": 2.7477, + "step": 27751 + }, + { + "epoch": 1.292059501361827, + "grad_norm": 0.31874464702038996, + "learning_rate": 7.022541690263195e-05, + "loss": 2.8031, + "step": 27752 + }, + { + "epoch": 1.29210605954792, + "grad_norm": 0.34482600500557814, + "learning_rate": 
7.022293964043894e-05, + "loss": 2.8678, + "step": 27753 + }, + { + "epoch": 1.292152617734013, + "grad_norm": 0.3395716265350398, + "learning_rate": 7.022046231889358e-05, + "loss": 2.8089, + "step": 27754 + }, + { + "epoch": 1.2921991759201061, + "grad_norm": 0.31901252669221847, + "learning_rate": 7.021798493800317e-05, + "loss": 2.7885, + "step": 27755 + }, + { + "epoch": 1.2922457341061993, + "grad_norm": 0.3410473559106746, + "learning_rate": 7.021550749777494e-05, + "loss": 2.775, + "step": 27756 + }, + { + "epoch": 1.2922922922922924, + "grad_norm": 0.306036508451084, + "learning_rate": 7.021302999821622e-05, + "loss": 2.7158, + "step": 27757 + }, + { + "epoch": 1.2923388504783855, + "grad_norm": 0.33903353150280285, + "learning_rate": 7.021055243933423e-05, + "loss": 2.8262, + "step": 27758 + }, + { + "epoch": 1.2923854086644784, + "grad_norm": 0.32062058083855466, + "learning_rate": 7.020807482113627e-05, + "loss": 2.8024, + "step": 27759 + }, + { + "epoch": 1.2924319668505715, + "grad_norm": 0.3282175584414324, + "learning_rate": 7.020559714362958e-05, + "loss": 2.6678, + "step": 27760 + }, + { + "epoch": 1.2924785250366646, + "grad_norm": 0.3318015411094987, + "learning_rate": 7.020311940682149e-05, + "loss": 2.7652, + "step": 27761 + }, + { + "epoch": 1.2925250832227577, + "grad_norm": 0.3369790423385102, + "learning_rate": 7.02006416107192e-05, + "loss": 2.8235, + "step": 27762 + }, + { + "epoch": 1.2925716414088506, + "grad_norm": 0.3442258447973797, + "learning_rate": 7.019816375533002e-05, + "loss": 2.8557, + "step": 27763 + }, + { + "epoch": 1.2926181995949437, + "grad_norm": 0.3339087296696033, + "learning_rate": 7.019568584066123e-05, + "loss": 2.8355, + "step": 27764 + }, + { + "epoch": 1.2926647577810368, + "grad_norm": 0.3376515004355524, + "learning_rate": 7.019320786672009e-05, + "loss": 2.8424, + "step": 27765 + }, + { + "epoch": 1.29271131596713, + "grad_norm": 0.32736493534639516, + "learning_rate": 7.019072983351389e-05, + "loss": 2.7909, + "step": 27766 + }, + { + "epoch": 1.292757874153223, + "grad_norm": 0.346874920297913, + "learning_rate": 7.018825174104985e-05, + "loss": 2.898, + "step": 27767 + }, + { + "epoch": 1.2928044323393162, + "grad_norm": 0.3257761116685229, + "learning_rate": 7.018577358933532e-05, + "loss": 2.8064, + "step": 27768 + }, + { + "epoch": 1.292850990525409, + "grad_norm": 0.32526000685325257, + "learning_rate": 7.018329537837751e-05, + "loss": 2.8057, + "step": 27769 + }, + { + "epoch": 1.2928975487115022, + "grad_norm": 0.34079205466533563, + "learning_rate": 7.018081710818374e-05, + "loss": 2.729, + "step": 27770 + }, + { + "epoch": 1.2929441068975953, + "grad_norm": 0.31637505175550135, + "learning_rate": 7.017833877876123e-05, + "loss": 2.8453, + "step": 27771 + }, + { + "epoch": 1.2929906650836882, + "grad_norm": 0.3480048822959662, + "learning_rate": 7.01758603901173e-05, + "loss": 2.7772, + "step": 27772 + }, + { + "epoch": 1.2930372232697813, + "grad_norm": 0.35480733484528193, + "learning_rate": 7.01733819422592e-05, + "loss": 2.6958, + "step": 27773 + }, + { + "epoch": 1.2930837814558744, + "grad_norm": 0.33133062531730306, + "learning_rate": 7.017090343519421e-05, + "loss": 2.8009, + "step": 27774 + }, + { + "epoch": 1.2931303396419676, + "grad_norm": 0.37844181378420055, + "learning_rate": 7.016842486892961e-05, + "loss": 2.7387, + "step": 27775 + }, + { + "epoch": 1.2931768978280607, + "grad_norm": 0.30296327723935057, + "learning_rate": 7.016594624347267e-05, + "loss": 2.7709, + "step": 27776 + }, + { + "epoch": 
1.2932234560141538, + "grad_norm": 0.3733513685178336, + "learning_rate": 7.016346755883067e-05, + "loss": 2.7666, + "step": 27777 + }, + { + "epoch": 1.2932700142002467, + "grad_norm": 0.35824873078620995, + "learning_rate": 7.016098881501086e-05, + "loss": 2.8568, + "step": 27778 + }, + { + "epoch": 1.2933165723863398, + "grad_norm": 0.4189663451282638, + "learning_rate": 7.015851001202056e-05, + "loss": 2.8362, + "step": 27779 + }, + { + "epoch": 1.293363130572433, + "grad_norm": 0.3217148460801413, + "learning_rate": 7.0156031149867e-05, + "loss": 2.716, + "step": 27780 + }, + { + "epoch": 1.293409688758526, + "grad_norm": 0.37809682136744527, + "learning_rate": 7.015355222855748e-05, + "loss": 2.7965, + "step": 27781 + }, + { + "epoch": 1.293456246944619, + "grad_norm": 0.3241030433494469, + "learning_rate": 7.015107324809928e-05, + "loss": 2.7523, + "step": 27782 + }, + { + "epoch": 1.293502805130712, + "grad_norm": 0.36096355879349235, + "learning_rate": 7.014859420849964e-05, + "loss": 2.7882, + "step": 27783 + }, + { + "epoch": 1.2935493633168051, + "grad_norm": 0.35492406231597157, + "learning_rate": 7.014611510976589e-05, + "loss": 2.8599, + "step": 27784 + }, + { + "epoch": 1.2935959215028983, + "grad_norm": 0.37201725192555446, + "learning_rate": 7.014363595190527e-05, + "loss": 2.8184, + "step": 27785 + }, + { + "epoch": 1.2936424796889914, + "grad_norm": 0.34757298150962296, + "learning_rate": 7.014115673492504e-05, + "loss": 2.7264, + "step": 27786 + }, + { + "epoch": 1.2936890378750845, + "grad_norm": 0.35691644321787513, + "learning_rate": 7.013867745883252e-05, + "loss": 2.8162, + "step": 27787 + }, + { + "epoch": 1.2937355960611774, + "grad_norm": 0.3750121628799242, + "learning_rate": 7.013619812363496e-05, + "loss": 2.684, + "step": 27788 + }, + { + "epoch": 1.2937821542472705, + "grad_norm": 0.3413500847589907, + "learning_rate": 7.013371872933965e-05, + "loss": 2.7253, + "step": 27789 + }, + { + "epoch": 1.2938287124333636, + "grad_norm": 0.342501000794441, + "learning_rate": 7.013123927595384e-05, + "loss": 2.8212, + "step": 27790 + }, + { + "epoch": 1.2938752706194567, + "grad_norm": 0.3673018922172579, + "learning_rate": 7.012875976348484e-05, + "loss": 2.7816, + "step": 27791 + }, + { + "epoch": 1.2939218288055496, + "grad_norm": 0.34772142087119595, + "learning_rate": 7.012628019193992e-05, + "loss": 2.8069, + "step": 27792 + }, + { + "epoch": 1.2939683869916427, + "grad_norm": 0.3493253916098589, + "learning_rate": 7.012380056132633e-05, + "loss": 2.8173, + "step": 27793 + }, + { + "epoch": 1.2940149451777359, + "grad_norm": 0.3478026400462755, + "learning_rate": 7.01213208716514e-05, + "loss": 2.7907, + "step": 27794 + }, + { + "epoch": 1.294061503363829, + "grad_norm": 0.37397455138637686, + "learning_rate": 7.011884112292235e-05, + "loss": 2.8344, + "step": 27795 + }, + { + "epoch": 1.294108061549922, + "grad_norm": 0.3006844970684139, + "learning_rate": 7.011636131514648e-05, + "loss": 2.7046, + "step": 27796 + }, + { + "epoch": 1.2941546197360152, + "grad_norm": 0.354629893553511, + "learning_rate": 7.011388144833108e-05, + "loss": 2.7307, + "step": 27797 + }, + { + "epoch": 1.294201177922108, + "grad_norm": 0.3258465066398046, + "learning_rate": 7.011140152248343e-05, + "loss": 2.7747, + "step": 27798 + }, + { + "epoch": 1.2942477361082012, + "grad_norm": 0.3385231046918459, + "learning_rate": 7.010892153761077e-05, + "loss": 2.7862, + "step": 27799 + }, + { + "epoch": 1.2942942942942943, + "grad_norm": 0.3426549058126923, + "learning_rate": 
7.010644149372043e-05, + "loss": 2.8753, + "step": 27800 + }, + { + "epoch": 1.2943408524803874, + "grad_norm": 0.3190772942076941, + "learning_rate": 7.010396139081964e-05, + "loss": 2.7348, + "step": 27801 + }, + { + "epoch": 1.2943874106664803, + "grad_norm": 0.35462789144487566, + "learning_rate": 7.010148122891575e-05, + "loss": 2.7803, + "step": 27802 + }, + { + "epoch": 1.2944339688525734, + "grad_norm": 0.3438110267794532, + "learning_rate": 7.009900100801596e-05, + "loss": 2.7389, + "step": 27803 + }, + { + "epoch": 1.2944805270386666, + "grad_norm": 0.3499083281593559, + "learning_rate": 7.009652072812758e-05, + "loss": 2.7561, + "step": 27804 + }, + { + "epoch": 1.2945270852247597, + "grad_norm": 0.34208650905363863, + "learning_rate": 7.009404038925791e-05, + "loss": 2.8399, + "step": 27805 + }, + { + "epoch": 1.2945736434108528, + "grad_norm": 0.3335649871395571, + "learning_rate": 7.009155999141419e-05, + "loss": 2.7616, + "step": 27806 + }, + { + "epoch": 1.294620201596946, + "grad_norm": 0.30372261474564993, + "learning_rate": 7.008907953460374e-05, + "loss": 2.7283, + "step": 27807 + }, + { + "epoch": 1.2946667597830388, + "grad_norm": 0.357080388002178, + "learning_rate": 7.008659901883381e-05, + "loss": 2.8195, + "step": 27808 + }, + { + "epoch": 1.294713317969132, + "grad_norm": 0.3165643812419169, + "learning_rate": 7.008411844411168e-05, + "loss": 2.7633, + "step": 27809 + }, + { + "epoch": 1.294759876155225, + "grad_norm": 0.3541393933167981, + "learning_rate": 7.008163781044466e-05, + "loss": 2.7211, + "step": 27810 + }, + { + "epoch": 1.294806434341318, + "grad_norm": 0.3394373444812307, + "learning_rate": 7.007915711784001e-05, + "loss": 2.8561, + "step": 27811 + }, + { + "epoch": 1.294852992527411, + "grad_norm": 0.3811636170163217, + "learning_rate": 7.0076676366305e-05, + "loss": 2.7282, + "step": 27812 + }, + { + "epoch": 1.2948995507135042, + "grad_norm": 0.3259588076639833, + "learning_rate": 7.007419555584693e-05, + "loss": 2.7769, + "step": 27813 + }, + { + "epoch": 1.2949461088995973, + "grad_norm": 0.3994945949579904, + "learning_rate": 7.007171468647306e-05, + "loss": 2.919, + "step": 27814 + }, + { + "epoch": 1.2949926670856904, + "grad_norm": 0.33826527922025557, + "learning_rate": 7.006923375819071e-05, + "loss": 2.8496, + "step": 27815 + }, + { + "epoch": 1.2950392252717835, + "grad_norm": 0.4003310205449397, + "learning_rate": 7.006675277100713e-05, + "loss": 2.7256, + "step": 27816 + }, + { + "epoch": 1.2950857834578766, + "grad_norm": 0.3141781488361197, + "learning_rate": 7.00642717249296e-05, + "loss": 2.73, + "step": 27817 + }, + { + "epoch": 1.2951323416439695, + "grad_norm": 0.39196767181494807, + "learning_rate": 7.006179061996542e-05, + "loss": 2.855, + "step": 27818 + }, + { + "epoch": 1.2951788998300626, + "grad_norm": 0.35216695647280016, + "learning_rate": 7.005930945612185e-05, + "loss": 2.7649, + "step": 27819 + }, + { + "epoch": 1.2952254580161557, + "grad_norm": 0.359095503946206, + "learning_rate": 7.005682823340618e-05, + "loss": 2.7364, + "step": 27820 + }, + { + "epoch": 1.2952720162022486, + "grad_norm": 0.36875032666373897, + "learning_rate": 7.00543469518257e-05, + "loss": 2.7864, + "step": 27821 + }, + { + "epoch": 1.2953185743883417, + "grad_norm": 0.3743490194448081, + "learning_rate": 7.005186561138767e-05, + "loss": 2.7868, + "step": 27822 + }, + { + "epoch": 1.2953651325744349, + "grad_norm": 0.3771506548584311, + "learning_rate": 7.004938421209943e-05, + "loss": 2.7503, + "step": 27823 + }, + { + "epoch": 
1.295411690760528, + "grad_norm": 0.33545527294203387, + "learning_rate": 7.004690275396819e-05, + "loss": 2.7611, + "step": 27824 + }, + { + "epoch": 1.295458248946621, + "grad_norm": 0.3601912382435935, + "learning_rate": 7.004442123700128e-05, + "loss": 2.7524, + "step": 27825 + }, + { + "epoch": 1.2955048071327142, + "grad_norm": 0.34418902832699455, + "learning_rate": 7.004193966120596e-05, + "loss": 2.8406, + "step": 27826 + }, + { + "epoch": 1.295551365318807, + "grad_norm": 0.365285866357453, + "learning_rate": 7.003945802658953e-05, + "loss": 2.651, + "step": 27827 + }, + { + "epoch": 1.2955979235049002, + "grad_norm": 0.37226656942495, + "learning_rate": 7.003697633315925e-05, + "loss": 2.8058, + "step": 27828 + }, + { + "epoch": 1.2956444816909933, + "grad_norm": 0.36600110865373586, + "learning_rate": 7.003449458092243e-05, + "loss": 2.8471, + "step": 27829 + }, + { + "epoch": 1.2956910398770864, + "grad_norm": 0.3537771258152756, + "learning_rate": 7.003201276988634e-05, + "loss": 2.6841, + "step": 27830 + }, + { + "epoch": 1.2957375980631793, + "grad_norm": 0.3705981241870897, + "learning_rate": 7.002953090005827e-05, + "loss": 2.6643, + "step": 27831 + }, + { + "epoch": 1.2957841562492725, + "grad_norm": 0.32460974870910575, + "learning_rate": 7.00270489714455e-05, + "loss": 2.8333, + "step": 27832 + }, + { + "epoch": 1.2958307144353656, + "grad_norm": 0.3297819036517937, + "learning_rate": 7.00245669840553e-05, + "loss": 2.7465, + "step": 27833 + }, + { + "epoch": 1.2958772726214587, + "grad_norm": 0.3219243487251772, + "learning_rate": 7.002208493789499e-05, + "loss": 2.7683, + "step": 27834 + }, + { + "epoch": 1.2959238308075518, + "grad_norm": 0.3378852502365936, + "learning_rate": 7.001960283297182e-05, + "loss": 2.7667, + "step": 27835 + }, + { + "epoch": 1.295970388993645, + "grad_norm": 0.33097569710636865, + "learning_rate": 7.001712066929308e-05, + "loss": 2.849, + "step": 27836 + }, + { + "epoch": 1.2960169471797378, + "grad_norm": 0.33960802926940303, + "learning_rate": 7.001463844686608e-05, + "loss": 2.8078, + "step": 27837 + }, + { + "epoch": 1.296063505365831, + "grad_norm": 0.31135088679702416, + "learning_rate": 7.001215616569808e-05, + "loss": 2.7492, + "step": 27838 + }, + { + "epoch": 1.296110063551924, + "grad_norm": 0.33972538543887504, + "learning_rate": 7.000967382579638e-05, + "loss": 2.6486, + "step": 27839 + }, + { + "epoch": 1.2961566217380172, + "grad_norm": 0.332235907668138, + "learning_rate": 7.000719142716823e-05, + "loss": 2.7959, + "step": 27840 + }, + { + "epoch": 1.29620317992411, + "grad_norm": 0.3264509080568059, + "learning_rate": 7.000470896982098e-05, + "loss": 2.7682, + "step": 27841 + }, + { + "epoch": 1.2962497381102032, + "grad_norm": 0.3622773513008891, + "learning_rate": 7.000222645376187e-05, + "loss": 2.7783, + "step": 27842 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.33784410942281917, + "learning_rate": 6.999974387899819e-05, + "loss": 2.8002, + "step": 27843 + }, + { + "epoch": 1.2963428544823894, + "grad_norm": 0.35847112543278514, + "learning_rate": 6.999726124553725e-05, + "loss": 2.8463, + "step": 27844 + }, + { + "epoch": 1.2963894126684825, + "grad_norm": 0.32591008848586406, + "learning_rate": 6.999477855338629e-05, + "loss": 2.6974, + "step": 27845 + }, + { + "epoch": 1.2964359708545756, + "grad_norm": 0.32730033919827434, + "learning_rate": 6.999229580255265e-05, + "loss": 2.7723, + "step": 27846 + }, + { + "epoch": 1.2964825290406685, + "grad_norm": 0.3233555754036459, + "learning_rate": 
6.998981299304357e-05, + "loss": 2.8362, + "step": 27847 + }, + { + "epoch": 1.2965290872267616, + "grad_norm": 0.32809754607485586, + "learning_rate": 6.998733012486637e-05, + "loss": 2.7746, + "step": 27848 + }, + { + "epoch": 1.2965756454128547, + "grad_norm": 0.3256927494215797, + "learning_rate": 6.998484719802831e-05, + "loss": 2.7676, + "step": 27849 + }, + { + "epoch": 1.2966222035989479, + "grad_norm": 0.35173313089635794, + "learning_rate": 6.998236421253671e-05, + "loss": 2.8194, + "step": 27850 + }, + { + "epoch": 1.2966687617850408, + "grad_norm": 0.3363372077495129, + "learning_rate": 6.997988116839883e-05, + "loss": 2.7163, + "step": 27851 + }, + { + "epoch": 1.2967153199711339, + "grad_norm": 0.3546387208635759, + "learning_rate": 6.997739806562198e-05, + "loss": 2.8763, + "step": 27852 + }, + { + "epoch": 1.296761878157227, + "grad_norm": 0.30848969282373834, + "learning_rate": 6.997491490421342e-05, + "loss": 2.8259, + "step": 27853 + }, + { + "epoch": 1.29680843634332, + "grad_norm": 0.33859970522216487, + "learning_rate": 6.997243168418047e-05, + "loss": 2.7979, + "step": 27854 + }, + { + "epoch": 1.2968549945294132, + "grad_norm": 0.3530418714067815, + "learning_rate": 6.996994840553039e-05, + "loss": 2.7576, + "step": 27855 + }, + { + "epoch": 1.2969015527155063, + "grad_norm": 0.31857285032775384, + "learning_rate": 6.996746506827048e-05, + "loss": 2.8052, + "step": 27856 + }, + { + "epoch": 1.2969481109015992, + "grad_norm": 0.34918304123738375, + "learning_rate": 6.996498167240803e-05, + "loss": 2.7996, + "step": 27857 + }, + { + "epoch": 1.2969946690876923, + "grad_norm": 0.36217867545511656, + "learning_rate": 6.996249821795031e-05, + "loss": 2.7739, + "step": 27858 + }, + { + "epoch": 1.2970412272737855, + "grad_norm": 0.32954935097201443, + "learning_rate": 6.996001470490463e-05, + "loss": 2.764, + "step": 27859 + }, + { + "epoch": 1.2970877854598784, + "grad_norm": 0.37451824324059785, + "learning_rate": 6.995753113327828e-05, + "loss": 2.8114, + "step": 27860 + }, + { + "epoch": 1.2971343436459715, + "grad_norm": 0.3506682899005845, + "learning_rate": 6.995504750307855e-05, + "loss": 2.8453, + "step": 27861 + }, + { + "epoch": 1.2971809018320646, + "grad_norm": 0.335188617943264, + "learning_rate": 6.995256381431271e-05, + "loss": 2.6614, + "step": 27862 + }, + { + "epoch": 1.2972274600181577, + "grad_norm": 0.3636109065859595, + "learning_rate": 6.995008006698807e-05, + "loss": 2.734, + "step": 27863 + }, + { + "epoch": 1.2972740182042508, + "grad_norm": 0.32017998222755933, + "learning_rate": 6.99475962611119e-05, + "loss": 2.858, + "step": 27864 + }, + { + "epoch": 1.297320576390344, + "grad_norm": 0.38037349956217315, + "learning_rate": 6.99451123966915e-05, + "loss": 2.7122, + "step": 27865 + }, + { + "epoch": 1.2973671345764368, + "grad_norm": 0.35806069853699685, + "learning_rate": 6.994262847373416e-05, + "loss": 2.7992, + "step": 27866 + }, + { + "epoch": 1.29741369276253, + "grad_norm": 0.3590684499864688, + "learning_rate": 6.994014449224717e-05, + "loss": 2.6454, + "step": 27867 + }, + { + "epoch": 1.297460250948623, + "grad_norm": 0.349099322158503, + "learning_rate": 6.993766045223783e-05, + "loss": 2.8106, + "step": 27868 + }, + { + "epoch": 1.2975068091347162, + "grad_norm": 0.3711106429696921, + "learning_rate": 6.993517635371341e-05, + "loss": 2.7031, + "step": 27869 + }, + { + "epoch": 1.297553367320809, + "grad_norm": 0.33370633832014207, + "learning_rate": 6.993269219668123e-05, + "loss": 2.7648, + "step": 27870 + }, + { + "epoch": 
1.2975999255069022, + "grad_norm": 0.3555117539029057, + "learning_rate": 6.993020798114856e-05, + "loss": 2.782, + "step": 27871 + }, + { + "epoch": 1.2976464836929953, + "grad_norm": 0.34608282104639376, + "learning_rate": 6.992772370712265e-05, + "loss": 2.7689, + "step": 27872 + }, + { + "epoch": 1.2976930418790884, + "grad_norm": 0.3174173115576789, + "learning_rate": 6.992523937461089e-05, + "loss": 2.7344, + "step": 27873 + }, + { + "epoch": 1.2977396000651815, + "grad_norm": 0.33867654658785523, + "learning_rate": 6.992275498362049e-05, + "loss": 2.7555, + "step": 27874 + }, + { + "epoch": 1.2977861582512746, + "grad_norm": 0.3537268420461756, + "learning_rate": 6.992027053415876e-05, + "loss": 2.778, + "step": 27875 + }, + { + "epoch": 1.2978327164373675, + "grad_norm": 0.3215502768690522, + "learning_rate": 6.9917786026233e-05, + "loss": 2.7495, + "step": 27876 + }, + { + "epoch": 1.2978792746234606, + "grad_norm": 0.33227665655522726, + "learning_rate": 6.991530145985051e-05, + "loss": 2.7554, + "step": 27877 + }, + { + "epoch": 1.2979258328095538, + "grad_norm": 0.33811302623959044, + "learning_rate": 6.991281683501857e-05, + "loss": 2.7912, + "step": 27878 + }, + { + "epoch": 1.2979723909956469, + "grad_norm": 0.33178570503317506, + "learning_rate": 6.991033215174448e-05, + "loss": 2.7484, + "step": 27879 + }, + { + "epoch": 1.2980189491817398, + "grad_norm": 0.3632938729087031, + "learning_rate": 6.990784741003551e-05, + "loss": 2.8203, + "step": 27880 + }, + { + "epoch": 1.2980655073678329, + "grad_norm": 0.3140857654416164, + "learning_rate": 6.990536260989898e-05, + "loss": 2.7688, + "step": 27881 + }, + { + "epoch": 1.298112065553926, + "grad_norm": 0.3687543852566095, + "learning_rate": 6.990287775134217e-05, + "loss": 2.7463, + "step": 27882 + }, + { + "epoch": 1.2981586237400191, + "grad_norm": 0.3135575463011128, + "learning_rate": 6.990039283437239e-05, + "loss": 2.6841, + "step": 27883 + }, + { + "epoch": 1.2982051819261122, + "grad_norm": 0.34867859723259065, + "learning_rate": 6.989790785899691e-05, + "loss": 2.7517, + "step": 27884 + }, + { + "epoch": 1.2982517401122053, + "grad_norm": 0.3507245349577244, + "learning_rate": 6.989542282522302e-05, + "loss": 2.7474, + "step": 27885 + }, + { + "epoch": 1.2982982982982982, + "grad_norm": 0.3666478171798878, + "learning_rate": 6.989293773305802e-05, + "loss": 2.737, + "step": 27886 + }, + { + "epoch": 1.2983448564843914, + "grad_norm": 0.3446442616421025, + "learning_rate": 6.989045258250921e-05, + "loss": 2.7169, + "step": 27887 + }, + { + "epoch": 1.2983914146704845, + "grad_norm": 0.34885285708538505, + "learning_rate": 6.98879673735839e-05, + "loss": 2.7642, + "step": 27888 + }, + { + "epoch": 1.2984379728565776, + "grad_norm": 0.3432092710491913, + "learning_rate": 6.988548210628934e-05, + "loss": 2.809, + "step": 27889 + }, + { + "epoch": 1.2984845310426705, + "grad_norm": 0.33298181526032306, + "learning_rate": 6.988299678063285e-05, + "loss": 2.825, + "step": 27890 + }, + { + "epoch": 1.2985310892287636, + "grad_norm": 0.3542754960164419, + "learning_rate": 6.988051139662173e-05, + "loss": 2.8504, + "step": 27891 + }, + { + "epoch": 1.2985776474148567, + "grad_norm": 0.3358931464836129, + "learning_rate": 6.98780259542633e-05, + "loss": 2.9152, + "step": 27892 + }, + { + "epoch": 1.2986242056009498, + "grad_norm": 0.3402135514664298, + "learning_rate": 6.987554045356479e-05, + "loss": 2.7584, + "step": 27893 + }, + { + "epoch": 1.298670763787043, + "grad_norm": 0.3433041806049204, + "learning_rate": 
6.987305489453351e-05, + "loss": 2.7841, + "step": 27894 + }, + { + "epoch": 1.298717321973136, + "grad_norm": 0.32698675377541797, + "learning_rate": 6.98705692771768e-05, + "loss": 2.7074, + "step": 27895 + }, + { + "epoch": 1.298763880159229, + "grad_norm": 0.3324484497258436, + "learning_rate": 6.986808360150192e-05, + "loss": 2.7651, + "step": 27896 + }, + { + "epoch": 1.298810438345322, + "grad_norm": 0.3406570273632049, + "learning_rate": 6.986559786751618e-05, + "loss": 2.8345, + "step": 27897 + }, + { + "epoch": 1.2988569965314152, + "grad_norm": 0.3307725421256476, + "learning_rate": 6.986311207522686e-05, + "loss": 2.8752, + "step": 27898 + }, + { + "epoch": 1.298903554717508, + "grad_norm": 0.3641981217223333, + "learning_rate": 6.986062622464125e-05, + "loss": 2.7525, + "step": 27899 + }, + { + "epoch": 1.2989501129036012, + "grad_norm": 0.34532545119687974, + "learning_rate": 6.985814031576668e-05, + "loss": 2.7286, + "step": 27900 + }, + { + "epoch": 1.2989966710896943, + "grad_norm": 0.3276013488905388, + "learning_rate": 6.985565434861041e-05, + "loss": 2.7312, + "step": 27901 + }, + { + "epoch": 1.2990432292757874, + "grad_norm": 0.3315817192357637, + "learning_rate": 6.985316832317975e-05, + "loss": 2.7887, + "step": 27902 + }, + { + "epoch": 1.2990897874618805, + "grad_norm": 0.3307147546803711, + "learning_rate": 6.985068223948201e-05, + "loss": 2.8452, + "step": 27903 + }, + { + "epoch": 1.2991363456479736, + "grad_norm": 0.3263762918745629, + "learning_rate": 6.984819609752445e-05, + "loss": 2.6822, + "step": 27904 + }, + { + "epoch": 1.2991829038340668, + "grad_norm": 0.3760659871377058, + "learning_rate": 6.98457098973144e-05, + "loss": 2.804, + "step": 27905 + }, + { + "epoch": 1.2992294620201597, + "grad_norm": 0.32514079556624237, + "learning_rate": 6.984322363885916e-05, + "loss": 2.8582, + "step": 27906 + }, + { + "epoch": 1.2992760202062528, + "grad_norm": 0.35482692514745157, + "learning_rate": 6.9840737322166e-05, + "loss": 2.8579, + "step": 27907 + }, + { + "epoch": 1.2993225783923459, + "grad_norm": 0.33458885849494435, + "learning_rate": 6.983825094724224e-05, + "loss": 2.655, + "step": 27908 + }, + { + "epoch": 1.2993691365784388, + "grad_norm": 0.31879701490589135, + "learning_rate": 6.983576451409518e-05, + "loss": 2.7568, + "step": 27909 + }, + { + "epoch": 1.299415694764532, + "grad_norm": 0.3289225317472932, + "learning_rate": 6.983327802273208e-05, + "loss": 2.7619, + "step": 27910 + }, + { + "epoch": 1.299462252950625, + "grad_norm": 0.33391914923694765, + "learning_rate": 6.983079147316027e-05, + "loss": 2.6912, + "step": 27911 + }, + { + "epoch": 1.2995088111367181, + "grad_norm": 0.3056834962044804, + "learning_rate": 6.982830486538704e-05, + "loss": 2.8179, + "step": 27912 + }, + { + "epoch": 1.2995553693228112, + "grad_norm": 0.3374391685300728, + "learning_rate": 6.982581819941969e-05, + "loss": 2.7816, + "step": 27913 + }, + { + "epoch": 1.2996019275089044, + "grad_norm": 0.3384466962936142, + "learning_rate": 6.98233314752655e-05, + "loss": 2.7177, + "step": 27914 + }, + { + "epoch": 1.2996484856949972, + "grad_norm": 0.3437061030740878, + "learning_rate": 6.982084469293182e-05, + "loss": 2.8171, + "step": 27915 + }, + { + "epoch": 1.2996950438810904, + "grad_norm": 0.34532977793412284, + "learning_rate": 6.98183578524259e-05, + "loss": 2.7561, + "step": 27916 + }, + { + "epoch": 1.2997416020671835, + "grad_norm": 0.3221146873825397, + "learning_rate": 6.981587095375503e-05, + "loss": 2.8354, + "step": 27917 + }, + { + "epoch": 
1.2997881602532766, + "grad_norm": 0.33314836347056404, + "learning_rate": 6.981338399692655e-05, + "loss": 2.6719, + "step": 27918 + }, + { + "epoch": 1.2998347184393695, + "grad_norm": 0.32474227315444626, + "learning_rate": 6.981089698194775e-05, + "loss": 2.6309, + "step": 27919 + }, + { + "epoch": 1.2998812766254626, + "grad_norm": 0.3021274116548685, + "learning_rate": 6.980840990882591e-05, + "loss": 2.836, + "step": 27920 + }, + { + "epoch": 1.2999278348115557, + "grad_norm": 0.3500444999672992, + "learning_rate": 6.980592277756835e-05, + "loss": 2.7909, + "step": 27921 + }, + { + "epoch": 1.2999743929976488, + "grad_norm": 0.35473683077216667, + "learning_rate": 6.980343558818233e-05, + "loss": 2.784, + "step": 27922 + }, + { + "epoch": 1.300020951183742, + "grad_norm": 0.3230644567291237, + "learning_rate": 6.980094834067518e-05, + "loss": 2.6409, + "step": 27923 + }, + { + "epoch": 1.300067509369835, + "grad_norm": 0.35155615803525064, + "learning_rate": 6.979846103505423e-05, + "loss": 2.7143, + "step": 27924 + }, + { + "epoch": 1.300114067555928, + "grad_norm": 0.3573382663177148, + "learning_rate": 6.979597367132672e-05, + "loss": 2.8711, + "step": 27925 + }, + { + "epoch": 1.300160625742021, + "grad_norm": 0.36962529222617624, + "learning_rate": 6.979348624949999e-05, + "loss": 2.8305, + "step": 27926 + }, + { + "epoch": 1.3002071839281142, + "grad_norm": 0.34487960119403677, + "learning_rate": 6.97909987695813e-05, + "loss": 2.743, + "step": 27927 + }, + { + "epoch": 1.3002537421142073, + "grad_norm": 0.34010205212299705, + "learning_rate": 6.978851123157802e-05, + "loss": 2.8649, + "step": 27928 + }, + { + "epoch": 1.3003003003003002, + "grad_norm": 0.3509679845155078, + "learning_rate": 6.978602363549738e-05, + "loss": 2.6849, + "step": 27929 + }, + { + "epoch": 1.3003468584863933, + "grad_norm": 0.3343965065873738, + "learning_rate": 6.978353598134671e-05, + "loss": 2.7724, + "step": 27930 + }, + { + "epoch": 1.3003934166724864, + "grad_norm": 0.35759186805915016, + "learning_rate": 6.978104826913332e-05, + "loss": 2.6828, + "step": 27931 + }, + { + "epoch": 1.3004399748585795, + "grad_norm": 0.3311169927548047, + "learning_rate": 6.97785604988645e-05, + "loss": 2.8408, + "step": 27932 + }, + { + "epoch": 1.3004865330446727, + "grad_norm": 0.3466147056199083, + "learning_rate": 6.977607267054756e-05, + "loss": 2.8208, + "step": 27933 + }, + { + "epoch": 1.3005330912307658, + "grad_norm": 0.3809668957716149, + "learning_rate": 6.97735847841898e-05, + "loss": 2.7141, + "step": 27934 + }, + { + "epoch": 1.3005796494168587, + "grad_norm": 0.3272382696098141, + "learning_rate": 6.977109683979852e-05, + "loss": 2.7232, + "step": 27935 + }, + { + "epoch": 1.3006262076029518, + "grad_norm": 0.36763125157585175, + "learning_rate": 6.976860883738098e-05, + "loss": 2.7208, + "step": 27936 + }, + { + "epoch": 1.300672765789045, + "grad_norm": 0.35347737537235374, + "learning_rate": 6.976612077694457e-05, + "loss": 2.8912, + "step": 27937 + }, + { + "epoch": 1.300719323975138, + "grad_norm": 0.3574395963612336, + "learning_rate": 6.976363265849652e-05, + "loss": 2.759, + "step": 27938 + }, + { + "epoch": 1.300765882161231, + "grad_norm": 0.33944827242585013, + "learning_rate": 6.976114448204414e-05, + "loss": 2.7157, + "step": 27939 + }, + { + "epoch": 1.300812440347324, + "grad_norm": 0.3526323356986452, + "learning_rate": 6.975865624759476e-05, + "loss": 2.7661, + "step": 27940 + }, + { + "epoch": 1.3008589985334171, + "grad_norm": 0.3413454777337644, + "learning_rate": 
6.975616795515567e-05, + "loss": 2.7907, + "step": 27941 + }, + { + "epoch": 1.3009055567195102, + "grad_norm": 0.3672014301708363, + "learning_rate": 6.975367960473418e-05, + "loss": 2.6699, + "step": 27942 + }, + { + "epoch": 1.3009521149056034, + "grad_norm": 0.29841449947773385, + "learning_rate": 6.975119119633759e-05, + "loss": 2.6608, + "step": 27943 + }, + { + "epoch": 1.3009986730916965, + "grad_norm": 0.3329291499993766, + "learning_rate": 6.974870272997318e-05, + "loss": 2.704, + "step": 27944 + }, + { + "epoch": 1.3010452312777894, + "grad_norm": 0.3132491654369671, + "learning_rate": 6.97462142056483e-05, + "loss": 2.6875, + "step": 27945 + }, + { + "epoch": 1.3010917894638825, + "grad_norm": 0.3161852319137463, + "learning_rate": 6.97437256233702e-05, + "loss": 2.7374, + "step": 27946 + }, + { + "epoch": 1.3011383476499756, + "grad_norm": 0.362037043017863, + "learning_rate": 6.974123698314622e-05, + "loss": 2.7371, + "step": 27947 + }, + { + "epoch": 1.3011849058360685, + "grad_norm": 0.3304252734552001, + "learning_rate": 6.973874828498367e-05, + "loss": 2.7595, + "step": 27948 + }, + { + "epoch": 1.3012314640221616, + "grad_norm": 0.3446993971762144, + "learning_rate": 6.973625952888982e-05, + "loss": 2.7069, + "step": 27949 + }, + { + "epoch": 1.3012780222082547, + "grad_norm": 0.30795373153068506, + "learning_rate": 6.973377071487199e-05, + "loss": 2.7336, + "step": 27950 + }, + { + "epoch": 1.3013245803943478, + "grad_norm": 0.3380134204607757, + "learning_rate": 6.97312818429375e-05, + "loss": 2.7157, + "step": 27951 + }, + { + "epoch": 1.301371138580441, + "grad_norm": 0.3155716718156314, + "learning_rate": 6.972879291309363e-05, + "loss": 2.7416, + "step": 27952 + }, + { + "epoch": 1.301417696766534, + "grad_norm": 0.33464595227691735, + "learning_rate": 6.97263039253477e-05, + "loss": 2.7289, + "step": 27953 + }, + { + "epoch": 1.301464254952627, + "grad_norm": 0.30740929101403996, + "learning_rate": 6.972381487970702e-05, + "loss": 2.8086, + "step": 27954 + }, + { + "epoch": 1.30151081313872, + "grad_norm": 0.36069438487654765, + "learning_rate": 6.972132577617888e-05, + "loss": 2.7735, + "step": 27955 + }, + { + "epoch": 1.3015573713248132, + "grad_norm": 0.36155904038212167, + "learning_rate": 6.971883661477058e-05, + "loss": 2.7547, + "step": 27956 + }, + { + "epoch": 1.3016039295109063, + "grad_norm": 0.3642723809994679, + "learning_rate": 6.971634739548946e-05, + "loss": 2.8365, + "step": 27957 + }, + { + "epoch": 1.3016504876969992, + "grad_norm": 0.359171982076123, + "learning_rate": 6.97138581183428e-05, + "loss": 2.7126, + "step": 27958 + }, + { + "epoch": 1.3016970458830923, + "grad_norm": 0.34140783565163313, + "learning_rate": 6.971136878333789e-05, + "loss": 2.6642, + "step": 27959 + }, + { + "epoch": 1.3017436040691854, + "grad_norm": 0.3280625354338193, + "learning_rate": 6.970887939048207e-05, + "loss": 2.8161, + "step": 27960 + }, + { + "epoch": 1.3017901622552785, + "grad_norm": 0.3162646973896384, + "learning_rate": 6.970638993978263e-05, + "loss": 2.7284, + "step": 27961 + }, + { + "epoch": 1.3018367204413717, + "grad_norm": 0.31482137015884043, + "learning_rate": 6.970390043124688e-05, + "loss": 2.6999, + "step": 27962 + }, + { + "epoch": 1.3018832786274648, + "grad_norm": 0.3154620301195368, + "learning_rate": 6.97014108648821e-05, + "loss": 2.8171, + "step": 27963 + }, + { + "epoch": 1.3019298368135577, + "grad_norm": 0.32344293238823546, + "learning_rate": 6.969892124069565e-05, + "loss": 2.8977, + "step": 27964 + }, + { + "epoch": 
1.3019763949996508, + "grad_norm": 0.3214133575854039, + "learning_rate": 6.969643155869479e-05, + "loss": 2.7874, + "step": 27965 + }, + { + "epoch": 1.302022953185744, + "grad_norm": 0.3374660849194402, + "learning_rate": 6.969394181888685e-05, + "loss": 2.7856, + "step": 27966 + }, + { + "epoch": 1.302069511371837, + "grad_norm": 0.3267688156060565, + "learning_rate": 6.969145202127911e-05, + "loss": 2.7745, + "step": 27967 + }, + { + "epoch": 1.30211606955793, + "grad_norm": 0.2912956204136221, + "learning_rate": 6.968896216587891e-05, + "loss": 2.7501, + "step": 27968 + }, + { + "epoch": 1.302162627744023, + "grad_norm": 0.31814633381500207, + "learning_rate": 6.968647225269356e-05, + "loss": 2.7232, + "step": 27969 + }, + { + "epoch": 1.3022091859301161, + "grad_norm": 0.30253487297534765, + "learning_rate": 6.968398228173034e-05, + "loss": 2.8424, + "step": 27970 + }, + { + "epoch": 1.3022557441162093, + "grad_norm": 0.32033294452609606, + "learning_rate": 6.968149225299658e-05, + "loss": 2.6193, + "step": 27971 + }, + { + "epoch": 1.3023023023023024, + "grad_norm": 0.3186328731969546, + "learning_rate": 6.967900216649959e-05, + "loss": 2.7395, + "step": 27972 + }, + { + "epoch": 1.3023488604883955, + "grad_norm": 0.32432854920161713, + "learning_rate": 6.967651202224665e-05, + "loss": 2.7902, + "step": 27973 + }, + { + "epoch": 1.3023954186744884, + "grad_norm": 0.33076220642593446, + "learning_rate": 6.967402182024508e-05, + "loss": 2.6919, + "step": 27974 + }, + { + "epoch": 1.3024419768605815, + "grad_norm": 0.3067677458160323, + "learning_rate": 6.96715315605022e-05, + "loss": 2.787, + "step": 27975 + }, + { + "epoch": 1.3024885350466746, + "grad_norm": 0.32377606158517874, + "learning_rate": 6.966904124302531e-05, + "loss": 2.8784, + "step": 27976 + }, + { + "epoch": 1.3025350932327677, + "grad_norm": 0.337253507920401, + "learning_rate": 6.966655086782173e-05, + "loss": 2.6854, + "step": 27977 + }, + { + "epoch": 1.3025816514188606, + "grad_norm": 0.3351644912256026, + "learning_rate": 6.966406043489877e-05, + "loss": 2.8174, + "step": 27978 + }, + { + "epoch": 1.3026282096049537, + "grad_norm": 0.36885614551522705, + "learning_rate": 6.966156994426371e-05, + "loss": 2.7346, + "step": 27979 + }, + { + "epoch": 1.3026747677910469, + "grad_norm": 0.3264112818804702, + "learning_rate": 6.965907939592388e-05, + "loss": 2.6095, + "step": 27980 + }, + { + "epoch": 1.30272132597714, + "grad_norm": 0.3173113049788821, + "learning_rate": 6.96565887898866e-05, + "loss": 2.907, + "step": 27981 + }, + { + "epoch": 1.302767884163233, + "grad_norm": 0.331614006036109, + "learning_rate": 6.965409812615917e-05, + "loss": 2.78, + "step": 27982 + }, + { + "epoch": 1.3028144423493262, + "grad_norm": 0.3364798068090838, + "learning_rate": 6.965160740474888e-05, + "loss": 2.7785, + "step": 27983 + }, + { + "epoch": 1.302861000535419, + "grad_norm": 0.3201959308188735, + "learning_rate": 6.964911662566308e-05, + "loss": 2.7952, + "step": 27984 + }, + { + "epoch": 1.3029075587215122, + "grad_norm": 0.337716255066498, + "learning_rate": 6.964662578890904e-05, + "loss": 2.7724, + "step": 27985 + }, + { + "epoch": 1.3029541169076053, + "grad_norm": 0.32544179348776164, + "learning_rate": 6.96441348944941e-05, + "loss": 2.849, + "step": 27986 + }, + { + "epoch": 1.3030006750936982, + "grad_norm": 0.32898865956597945, + "learning_rate": 6.964164394242557e-05, + "loss": 2.8131, + "step": 27987 + }, + { + "epoch": 1.3030472332797913, + "grad_norm": 0.3273055154661419, + "learning_rate": 
6.963915293271073e-05, + "loss": 2.7236, + "step": 27988 + }, + { + "epoch": 1.3030937914658844, + "grad_norm": 0.31158032790466866, + "learning_rate": 6.963666186535691e-05, + "loss": 2.7786, + "step": 27989 + }, + { + "epoch": 1.3031403496519776, + "grad_norm": 0.31443897577964175, + "learning_rate": 6.963417074037144e-05, + "loss": 2.788, + "step": 27990 + }, + { + "epoch": 1.3031869078380707, + "grad_norm": 0.3203156339960115, + "learning_rate": 6.96316795577616e-05, + "loss": 2.7445, + "step": 27991 + }, + { + "epoch": 1.3032334660241638, + "grad_norm": 0.318318526570437, + "learning_rate": 6.962918831753472e-05, + "loss": 2.7839, + "step": 27992 + }, + { + "epoch": 1.303280024210257, + "grad_norm": 0.3258308542406534, + "learning_rate": 6.96266970196981e-05, + "loss": 2.828, + "step": 27993 + }, + { + "epoch": 1.3033265823963498, + "grad_norm": 0.3579437718780717, + "learning_rate": 6.962420566425906e-05, + "loss": 2.7061, + "step": 27994 + }, + { + "epoch": 1.303373140582443, + "grad_norm": 0.320206817016075, + "learning_rate": 6.962171425122491e-05, + "loss": 2.7827, + "step": 27995 + }, + { + "epoch": 1.303419698768536, + "grad_norm": 0.34834292952795876, + "learning_rate": 6.961922278060296e-05, + "loss": 2.7835, + "step": 27996 + }, + { + "epoch": 1.303466256954629, + "grad_norm": 0.3359125120970493, + "learning_rate": 6.961673125240053e-05, + "loss": 2.8016, + "step": 27997 + }, + { + "epoch": 1.303512815140722, + "grad_norm": 0.3235210018300189, + "learning_rate": 6.961423966662493e-05, + "loss": 2.7876, + "step": 27998 + }, + { + "epoch": 1.3035593733268152, + "grad_norm": 0.34954079797365156, + "learning_rate": 6.961174802328346e-05, + "loss": 2.8298, + "step": 27999 + }, + { + "epoch": 1.3036059315129083, + "grad_norm": 0.3046853882610506, + "learning_rate": 6.960925632238345e-05, + "loss": 2.7389, + "step": 28000 + }, + { + "epoch": 1.3036524896990014, + "grad_norm": 0.32992990496044705, + "learning_rate": 6.96067645639322e-05, + "loss": 2.8871, + "step": 28001 + }, + { + "epoch": 1.3036990478850945, + "grad_norm": 0.31165277050390094, + "learning_rate": 6.960427274793702e-05, + "loss": 2.7133, + "step": 28002 + }, + { + "epoch": 1.3037456060711874, + "grad_norm": 0.33462968545112226, + "learning_rate": 6.960178087440523e-05, + "loss": 2.8399, + "step": 28003 + }, + { + "epoch": 1.3037921642572805, + "grad_norm": 0.33408473533632466, + "learning_rate": 6.959928894334415e-05, + "loss": 2.7788, + "step": 28004 + }, + { + "epoch": 1.3038387224433736, + "grad_norm": 0.3325948932154951, + "learning_rate": 6.959679695476108e-05, + "loss": 2.7441, + "step": 28005 + }, + { + "epoch": 1.3038852806294667, + "grad_norm": 0.3380578676201832, + "learning_rate": 6.959430490866337e-05, + "loss": 2.7067, + "step": 28006 + }, + { + "epoch": 1.3039318388155596, + "grad_norm": 0.3205584513384028, + "learning_rate": 6.959181280505828e-05, + "loss": 2.7354, + "step": 28007 + }, + { + "epoch": 1.3039783970016527, + "grad_norm": 0.3184978795078579, + "learning_rate": 6.958932064395316e-05, + "loss": 2.8165, + "step": 28008 + }, + { + "epoch": 1.3040249551877459, + "grad_norm": 0.32847675969159, + "learning_rate": 6.95868284253553e-05, + "loss": 2.7951, + "step": 28009 + }, + { + "epoch": 1.304071513373839, + "grad_norm": 0.337842982759919, + "learning_rate": 6.958433614927205e-05, + "loss": 2.7565, + "step": 28010 + }, + { + "epoch": 1.304118071559932, + "grad_norm": 0.33667362992699673, + "learning_rate": 6.958184381571069e-05, + "loss": 2.7661, + "step": 28011 + }, + { + "epoch": 
1.3041646297460252, + "grad_norm": 0.33262903632676427, + "learning_rate": 6.957935142467853e-05, + "loss": 2.8879, + "step": 28012 + }, + { + "epoch": 1.304211187932118, + "grad_norm": 0.3403011884672446, + "learning_rate": 6.957685897618292e-05, + "loss": 2.7883, + "step": 28013 + }, + { + "epoch": 1.3042577461182112, + "grad_norm": 0.34046973965805044, + "learning_rate": 6.957436647023116e-05, + "loss": 2.7332, + "step": 28014 + }, + { + "epoch": 1.3043043043043043, + "grad_norm": 0.3371660006004233, + "learning_rate": 6.957187390683057e-05, + "loss": 2.7731, + "step": 28015 + }, + { + "epoch": 1.3043508624903974, + "grad_norm": 0.3483150293794736, + "learning_rate": 6.956938128598844e-05, + "loss": 2.9812, + "step": 28016 + }, + { + "epoch": 1.3043974206764903, + "grad_norm": 0.35831584236404396, + "learning_rate": 6.956688860771212e-05, + "loss": 2.8311, + "step": 28017 + }, + { + "epoch": 1.3044439788625835, + "grad_norm": 0.35302129189263143, + "learning_rate": 6.956439587200891e-05, + "loss": 2.8237, + "step": 28018 + }, + { + "epoch": 1.3044905370486766, + "grad_norm": 0.32621530564281503, + "learning_rate": 6.95619030788861e-05, + "loss": 2.7444, + "step": 28019 + }, + { + "epoch": 1.3045370952347697, + "grad_norm": 0.3691917438924559, + "learning_rate": 6.955941022835105e-05, + "loss": 2.8413, + "step": 28020 + }, + { + "epoch": 1.3045836534208628, + "grad_norm": 0.32927537495232784, + "learning_rate": 6.955691732041106e-05, + "loss": 2.748, + "step": 28021 + }, + { + "epoch": 1.304630211606956, + "grad_norm": 0.3423205270445947, + "learning_rate": 6.955442435507344e-05, + "loss": 2.7684, + "step": 28022 + }, + { + "epoch": 1.3046767697930488, + "grad_norm": 0.3274742177383386, + "learning_rate": 6.955193133234553e-05, + "loss": 2.792, + "step": 28023 + }, + { + "epoch": 1.304723327979142, + "grad_norm": 0.35173647780275025, + "learning_rate": 6.95494382522346e-05, + "loss": 2.8066, + "step": 28024 + }, + { + "epoch": 1.304769886165235, + "grad_norm": 0.3278014311296156, + "learning_rate": 6.9546945114748e-05, + "loss": 2.7649, + "step": 28025 + }, + { + "epoch": 1.3048164443513282, + "grad_norm": 0.37407982830736186, + "learning_rate": 6.954445191989305e-05, + "loss": 2.7612, + "step": 28026 + }, + { + "epoch": 1.304863002537421, + "grad_norm": 0.3370858072173939, + "learning_rate": 6.954195866767705e-05, + "loss": 2.9161, + "step": 28027 + }, + { + "epoch": 1.3049095607235142, + "grad_norm": 0.3282245472929875, + "learning_rate": 6.953946535810734e-05, + "loss": 2.7569, + "step": 28028 + }, + { + "epoch": 1.3049561189096073, + "grad_norm": 0.35759152658473436, + "learning_rate": 6.953697199119121e-05, + "loss": 2.7618, + "step": 28029 + }, + { + "epoch": 1.3050026770957004, + "grad_norm": 0.3155696767645431, + "learning_rate": 6.9534478566936e-05, + "loss": 2.7642, + "step": 28030 + }, + { + "epoch": 1.3050492352817935, + "grad_norm": 0.333477019817981, + "learning_rate": 6.953198508534902e-05, + "loss": 2.8045, + "step": 28031 + }, + { + "epoch": 1.3050957934678866, + "grad_norm": 0.3406545492406506, + "learning_rate": 6.952949154643759e-05, + "loss": 2.7999, + "step": 28032 + }, + { + "epoch": 1.3051423516539795, + "grad_norm": 0.34824514861503725, + "learning_rate": 6.952699795020901e-05, + "loss": 2.7909, + "step": 28033 + }, + { + "epoch": 1.3051889098400726, + "grad_norm": 0.35228899846549355, + "learning_rate": 6.952450429667063e-05, + "loss": 2.7043, + "step": 28034 + }, + { + "epoch": 1.3052354680261657, + "grad_norm": 0.35767593782643514, + "learning_rate": 
6.952201058582976e-05, + "loss": 2.8328, + "step": 28035 + }, + { + "epoch": 1.3052820262122586, + "grad_norm": 0.3222663625053848, + "learning_rate": 6.951951681769371e-05, + "loss": 2.764, + "step": 28036 + }, + { + "epoch": 1.3053285843983518, + "grad_norm": 0.3598284778403272, + "learning_rate": 6.951702299226978e-05, + "loss": 2.7627, + "step": 28037 + }, + { + "epoch": 1.3053751425844449, + "grad_norm": 0.3273053364568491, + "learning_rate": 6.951452910956532e-05, + "loss": 2.7393, + "step": 28038 + }, + { + "epoch": 1.305421700770538, + "grad_norm": 0.30386094475984954, + "learning_rate": 6.951203516958764e-05, + "loss": 2.813, + "step": 28039 + }, + { + "epoch": 1.305468258956631, + "grad_norm": 0.3710885493497132, + "learning_rate": 6.950954117234407e-05, + "loss": 2.6494, + "step": 28040 + }, + { + "epoch": 1.3055148171427242, + "grad_norm": 0.33432120926871406, + "learning_rate": 6.950704711784192e-05, + "loss": 2.7478, + "step": 28041 + }, + { + "epoch": 1.305561375328817, + "grad_norm": 0.33551330849757666, + "learning_rate": 6.950455300608849e-05, + "loss": 2.7188, + "step": 28042 + }, + { + "epoch": 1.3056079335149102, + "grad_norm": 0.3739191698853256, + "learning_rate": 6.950205883709114e-05, + "loss": 2.7138, + "step": 28043 + }, + { + "epoch": 1.3056544917010033, + "grad_norm": 0.34047205992214447, + "learning_rate": 6.949956461085714e-05, + "loss": 2.7948, + "step": 28044 + }, + { + "epoch": 1.3057010498870965, + "grad_norm": 0.40116809750034055, + "learning_rate": 6.949707032739387e-05, + "loss": 2.7951, + "step": 28045 + }, + { + "epoch": 1.3057476080731893, + "grad_norm": 0.32920160793367564, + "learning_rate": 6.94945759867086e-05, + "loss": 2.8311, + "step": 28046 + }, + { + "epoch": 1.3057941662592825, + "grad_norm": 0.34628474692169214, + "learning_rate": 6.949208158880867e-05, + "loss": 2.8162, + "step": 28047 + }, + { + "epoch": 1.3058407244453756, + "grad_norm": 0.3409256353249462, + "learning_rate": 6.948958713370141e-05, + "loss": 2.8401, + "step": 28048 + }, + { + "epoch": 1.3058872826314687, + "grad_norm": 0.3534142698123914, + "learning_rate": 6.948709262139412e-05, + "loss": 2.7752, + "step": 28049 + }, + { + "epoch": 1.3059338408175618, + "grad_norm": 0.3429236752289309, + "learning_rate": 6.948459805189415e-05, + "loss": 2.6739, + "step": 28050 + }, + { + "epoch": 1.305980399003655, + "grad_norm": 0.352842394283678, + "learning_rate": 6.94821034252088e-05, + "loss": 2.767, + "step": 28051 + }, + { + "epoch": 1.3060269571897478, + "grad_norm": 0.347198505613523, + "learning_rate": 6.947960874134538e-05, + "loss": 2.7166, + "step": 28052 + }, + { + "epoch": 1.306073515375841, + "grad_norm": 0.35320672273555304, + "learning_rate": 6.947711400031124e-05, + "loss": 2.8019, + "step": 28053 + }, + { + "epoch": 1.306120073561934, + "grad_norm": 0.32126425035557277, + "learning_rate": 6.947461920211369e-05, + "loss": 2.8048, + "step": 28054 + }, + { + "epoch": 1.3061666317480272, + "grad_norm": 0.3667103697036318, + "learning_rate": 6.947212434676005e-05, + "loss": 2.8189, + "step": 28055 + }, + { + "epoch": 1.30621318993412, + "grad_norm": 0.3391078431414716, + "learning_rate": 6.946962943425763e-05, + "loss": 2.8307, + "step": 28056 + }, + { + "epoch": 1.3062597481202132, + "grad_norm": 0.35365439816225047, + "learning_rate": 6.946713446461378e-05, + "loss": 2.6941, + "step": 28057 + }, + { + "epoch": 1.3063063063063063, + "grad_norm": 0.3612872977288796, + "learning_rate": 6.94646394378358e-05, + "loss": 2.7621, + "step": 28058 + }, + { + "epoch": 
1.3063528644923994, + "grad_norm": 0.3498681660313938, + "learning_rate": 6.946214435393104e-05, + "loss": 2.8093, + "step": 28059 + }, + { + "epoch": 1.3063994226784925, + "grad_norm": 0.35829673652337934, + "learning_rate": 6.945964921290678e-05, + "loss": 2.6693, + "step": 28060 + }, + { + "epoch": 1.3064459808645856, + "grad_norm": 0.37582532231585297, + "learning_rate": 6.945715401477039e-05, + "loss": 2.6317, + "step": 28061 + }, + { + "epoch": 1.3064925390506785, + "grad_norm": 0.33719636623449445, + "learning_rate": 6.945465875952915e-05, + "loss": 2.7537, + "step": 28062 + }, + { + "epoch": 1.3065390972367716, + "grad_norm": 0.389679188421017, + "learning_rate": 6.94521634471904e-05, + "loss": 2.7454, + "step": 28063 + }, + { + "epoch": 1.3065856554228648, + "grad_norm": 0.35018103727481475, + "learning_rate": 6.944966807776149e-05, + "loss": 2.7663, + "step": 28064 + }, + { + "epoch": 1.3066322136089579, + "grad_norm": 0.33014465898221823, + "learning_rate": 6.94471726512497e-05, + "loss": 2.8081, + "step": 28065 + }, + { + "epoch": 1.3066787717950508, + "grad_norm": 0.36036563817462763, + "learning_rate": 6.944467716766239e-05, + "loss": 2.824, + "step": 28066 + }, + { + "epoch": 1.3067253299811439, + "grad_norm": 0.3566963896978283, + "learning_rate": 6.944218162700684e-05, + "loss": 2.6926, + "step": 28067 + }, + { + "epoch": 1.306771888167237, + "grad_norm": 0.35797805904321417, + "learning_rate": 6.943968602929045e-05, + "loss": 2.7737, + "step": 28068 + }, + { + "epoch": 1.30681844635333, + "grad_norm": 0.3194996476873145, + "learning_rate": 6.943719037452046e-05, + "loss": 2.7783, + "step": 28069 + }, + { + "epoch": 1.3068650045394232, + "grad_norm": 0.35346178285825597, + "learning_rate": 6.943469466270423e-05, + "loss": 2.7857, + "step": 28070 + }, + { + "epoch": 1.3069115627255163, + "grad_norm": 0.3470323742259647, + "learning_rate": 6.943219889384909e-05, + "loss": 2.8485, + "step": 28071 + }, + { + "epoch": 1.3069581209116092, + "grad_norm": 0.3572682380273047, + "learning_rate": 6.942970306796237e-05, + "loss": 2.7763, + "step": 28072 + }, + { + "epoch": 1.3070046790977023, + "grad_norm": 0.3161921684164834, + "learning_rate": 6.942720718505138e-05, + "loss": 2.7124, + "step": 28073 + }, + { + "epoch": 1.3070512372837955, + "grad_norm": 0.3317989009495038, + "learning_rate": 6.942471124512346e-05, + "loss": 2.8126, + "step": 28074 + }, + { + "epoch": 1.3070977954698884, + "grad_norm": 0.3254694084709697, + "learning_rate": 6.942221524818592e-05, + "loss": 2.7312, + "step": 28075 + }, + { + "epoch": 1.3071443536559815, + "grad_norm": 0.3038339377515319, + "learning_rate": 6.941971919424608e-05, + "loss": 2.752, + "step": 28076 + }, + { + "epoch": 1.3071909118420746, + "grad_norm": 0.3404275381575678, + "learning_rate": 6.94172230833113e-05, + "loss": 2.7604, + "step": 28077 + }, + { + "epoch": 1.3072374700281677, + "grad_norm": 0.3363378746108286, + "learning_rate": 6.941472691538887e-05, + "loss": 2.7289, + "step": 28078 + }, + { + "epoch": 1.3072840282142608, + "grad_norm": 0.35674818696940175, + "learning_rate": 6.941223069048613e-05, + "loss": 2.749, + "step": 28079 + }, + { + "epoch": 1.307330586400354, + "grad_norm": 0.3353898016968812, + "learning_rate": 6.94097344086104e-05, + "loss": 2.7493, + "step": 28080 + }, + { + "epoch": 1.307377144586447, + "grad_norm": 0.35719810886877906, + "learning_rate": 6.940723806976901e-05, + "loss": 2.7609, + "step": 28081 + }, + { + "epoch": 1.30742370277254, + "grad_norm": 0.32519387024824536, + "learning_rate": 
6.94047416739693e-05, + "loss": 2.6926, + "step": 28082 + }, + { + "epoch": 1.307470260958633, + "grad_norm": 0.34877353231307856, + "learning_rate": 6.940224522121857e-05, + "loss": 2.6944, + "step": 28083 + }, + { + "epoch": 1.3075168191447262, + "grad_norm": 0.31548241959313755, + "learning_rate": 6.939974871152417e-05, + "loss": 2.7659, + "step": 28084 + }, + { + "epoch": 1.307563377330819, + "grad_norm": 0.3427327975809711, + "learning_rate": 6.939725214489344e-05, + "loss": 2.6693, + "step": 28085 + }, + { + "epoch": 1.3076099355169122, + "grad_norm": 0.3182181221839119, + "learning_rate": 6.939475552133365e-05, + "loss": 2.7561, + "step": 28086 + }, + { + "epoch": 1.3076564937030053, + "grad_norm": 0.3690794733978278, + "learning_rate": 6.939225884085218e-05, + "loss": 2.7245, + "step": 28087 + }, + { + "epoch": 1.3077030518890984, + "grad_norm": 0.32470080873872065, + "learning_rate": 6.938976210345635e-05, + "loss": 2.8321, + "step": 28088 + }, + { + "epoch": 1.3077496100751915, + "grad_norm": 0.3740677816468517, + "learning_rate": 6.938726530915346e-05, + "loss": 2.8479, + "step": 28089 + }, + { + "epoch": 1.3077961682612846, + "grad_norm": 0.33015020386531446, + "learning_rate": 6.938476845795086e-05, + "loss": 2.6726, + "step": 28090 + }, + { + "epoch": 1.3078427264473775, + "grad_norm": 0.3413059526424273, + "learning_rate": 6.938227154985588e-05, + "loss": 2.7284, + "step": 28091 + }, + { + "epoch": 1.3078892846334707, + "grad_norm": 0.34446449725938094, + "learning_rate": 6.937977458487584e-05, + "loss": 2.8732, + "step": 28092 + }, + { + "epoch": 1.3079358428195638, + "grad_norm": 0.34532684480130477, + "learning_rate": 6.937727756301807e-05, + "loss": 2.7141, + "step": 28093 + }, + { + "epoch": 1.3079824010056569, + "grad_norm": 0.34329787481132756, + "learning_rate": 6.93747804842899e-05, + "loss": 2.8429, + "step": 28094 + }, + { + "epoch": 1.3080289591917498, + "grad_norm": 0.3315518132167814, + "learning_rate": 6.937228334869867e-05, + "loss": 2.8566, + "step": 28095 + }, + { + "epoch": 1.308075517377843, + "grad_norm": 0.3095226860491817, + "learning_rate": 6.936978615625168e-05, + "loss": 2.8268, + "step": 28096 + }, + { + "epoch": 1.308122075563936, + "grad_norm": 0.32264824731436387, + "learning_rate": 6.936728890695629e-05, + "loss": 2.7336, + "step": 28097 + }, + { + "epoch": 1.3081686337500291, + "grad_norm": 0.3156184857364887, + "learning_rate": 6.936479160081981e-05, + "loss": 2.8104, + "step": 28098 + }, + { + "epoch": 1.3082151919361222, + "grad_norm": 0.3263157945269234, + "learning_rate": 6.936229423784957e-05, + "loss": 2.758, + "step": 28099 + }, + { + "epoch": 1.3082617501222153, + "grad_norm": 0.3435842431079749, + "learning_rate": 6.935979681805292e-05, + "loss": 2.8227, + "step": 28100 + }, + { + "epoch": 1.3083083083083082, + "grad_norm": 0.32921291997085056, + "learning_rate": 6.935729934143714e-05, + "loss": 2.7651, + "step": 28101 + }, + { + "epoch": 1.3083548664944014, + "grad_norm": 0.3175188575108218, + "learning_rate": 6.935480180800962e-05, + "loss": 2.8839, + "step": 28102 + }, + { + "epoch": 1.3084014246804945, + "grad_norm": 0.3383456300302114, + "learning_rate": 6.935230421777765e-05, + "loss": 2.7738, + "step": 28103 + }, + { + "epoch": 1.3084479828665876, + "grad_norm": 0.3276038920199929, + "learning_rate": 6.934980657074859e-05, + "loss": 2.717, + "step": 28104 + }, + { + "epoch": 1.3084945410526805, + "grad_norm": 0.3369888856299676, + "learning_rate": 6.934730886692973e-05, + "loss": 2.7648, + "step": 28105 + }, + { + "epoch": 
1.3085410992387736, + "grad_norm": 0.32935601136338494, + "learning_rate": 6.934481110632844e-05, + "loss": 2.8453, + "step": 28106 + }, + { + "epoch": 1.3085876574248667, + "grad_norm": 0.37425795506006654, + "learning_rate": 6.934231328895203e-05, + "loss": 2.8149, + "step": 28107 + }, + { + "epoch": 1.3086342156109598, + "grad_norm": 0.326083165994121, + "learning_rate": 6.933981541480785e-05, + "loss": 2.6814, + "step": 28108 + }, + { + "epoch": 1.308680773797053, + "grad_norm": 0.34888483308413853, + "learning_rate": 6.93373174839032e-05, + "loss": 2.7996, + "step": 28109 + }, + { + "epoch": 1.308727331983146, + "grad_norm": 0.32020417376546073, + "learning_rate": 6.933481949624542e-05, + "loss": 2.7694, + "step": 28110 + }, + { + "epoch": 1.308773890169239, + "grad_norm": 0.3733558819324692, + "learning_rate": 6.933232145184188e-05, + "loss": 2.7377, + "step": 28111 + }, + { + "epoch": 1.308820448355332, + "grad_norm": 0.31371923661141676, + "learning_rate": 6.932982335069986e-05, + "loss": 2.7881, + "step": 28112 + }, + { + "epoch": 1.3088670065414252, + "grad_norm": 0.3364753442803065, + "learning_rate": 6.932732519282674e-05, + "loss": 2.6956, + "step": 28113 + }, + { + "epoch": 1.3089135647275183, + "grad_norm": 0.36014734800586723, + "learning_rate": 6.93248269782298e-05, + "loss": 2.8378, + "step": 28114 + }, + { + "epoch": 1.3089601229136112, + "grad_norm": 0.33994371469142154, + "learning_rate": 6.932232870691638e-05, + "loss": 2.6819, + "step": 28115 + }, + { + "epoch": 1.3090066810997043, + "grad_norm": 0.33408004302782274, + "learning_rate": 6.931983037889385e-05, + "loss": 2.7206, + "step": 28116 + }, + { + "epoch": 1.3090532392857974, + "grad_norm": 0.3386252531365915, + "learning_rate": 6.931733199416952e-05, + "loss": 2.8695, + "step": 28117 + }, + { + "epoch": 1.3090997974718905, + "grad_norm": 0.32082681405682206, + "learning_rate": 6.931483355275073e-05, + "loss": 2.7649, + "step": 28118 + }, + { + "epoch": 1.3091463556579837, + "grad_norm": 0.34478137626779704, + "learning_rate": 6.93123350546448e-05, + "loss": 2.7959, + "step": 28119 + }, + { + "epoch": 1.3091929138440768, + "grad_norm": 0.33607853453369607, + "learning_rate": 6.930983649985906e-05, + "loss": 2.768, + "step": 28120 + }, + { + "epoch": 1.3092394720301697, + "grad_norm": 0.3408237171671128, + "learning_rate": 6.930733788840086e-05, + "loss": 2.9097, + "step": 28121 + }, + { + "epoch": 1.3092860302162628, + "grad_norm": 0.3278715293653515, + "learning_rate": 6.930483922027753e-05, + "loss": 2.7725, + "step": 28122 + }, + { + "epoch": 1.309332588402356, + "grad_norm": 0.33149603329527116, + "learning_rate": 6.93023404954964e-05, + "loss": 2.6573, + "step": 28123 + }, + { + "epoch": 1.3093791465884488, + "grad_norm": 0.2901321277663653, + "learning_rate": 6.92998417140648e-05, + "loss": 2.7662, + "step": 28124 + }, + { + "epoch": 1.309425704774542, + "grad_norm": 0.4033523546191077, + "learning_rate": 6.929734287599006e-05, + "loss": 2.8148, + "step": 28125 + }, + { + "epoch": 1.309472262960635, + "grad_norm": 0.3215402140253431, + "learning_rate": 6.929484398127952e-05, + "loss": 2.758, + "step": 28126 + }, + { + "epoch": 1.3095188211467281, + "grad_norm": 0.3639292399244954, + "learning_rate": 6.929234502994052e-05, + "loss": 2.7328, + "step": 28127 + }, + { + "epoch": 1.3095653793328212, + "grad_norm": 0.3410425376130938, + "learning_rate": 6.928984602198038e-05, + "loss": 2.6654, + "step": 28128 + }, + { + "epoch": 1.3096119375189144, + "grad_norm": 0.3582222637016682, + "learning_rate": 
6.928734695740644e-05, + "loss": 2.7593, + "step": 28129 + }, + { + "epoch": 1.3096584957050073, + "grad_norm": 0.36156563265725283, + "learning_rate": 6.928484783622604e-05, + "loss": 2.7994, + "step": 28130 + }, + { + "epoch": 1.3097050538911004, + "grad_norm": 0.3836284689830168, + "learning_rate": 6.928234865844653e-05, + "loss": 2.8872, + "step": 28131 + }, + { + "epoch": 1.3097516120771935, + "grad_norm": 0.3711095409589625, + "learning_rate": 6.927984942407521e-05, + "loss": 2.8056, + "step": 28132 + }, + { + "epoch": 1.3097981702632866, + "grad_norm": 0.34784345763307095, + "learning_rate": 6.927735013311942e-05, + "loss": 2.7593, + "step": 28133 + }, + { + "epoch": 1.3098447284493795, + "grad_norm": 0.3360045225296354, + "learning_rate": 6.927485078558651e-05, + "loss": 2.7674, + "step": 28134 + }, + { + "epoch": 1.3098912866354726, + "grad_norm": 0.3615331832529016, + "learning_rate": 6.927235138148384e-05, + "loss": 2.7305, + "step": 28135 + }, + { + "epoch": 1.3099378448215657, + "grad_norm": 0.337717846127296, + "learning_rate": 6.92698519208187e-05, + "loss": 2.7221, + "step": 28136 + }, + { + "epoch": 1.3099844030076588, + "grad_norm": 0.4356697135680733, + "learning_rate": 6.926735240359845e-05, + "loss": 2.7478, + "step": 28137 + }, + { + "epoch": 1.310030961193752, + "grad_norm": 0.3212483322653806, + "learning_rate": 6.926485282983039e-05, + "loss": 2.7776, + "step": 28138 + }, + { + "epoch": 1.310077519379845, + "grad_norm": 0.4203100742241246, + "learning_rate": 6.926235319952192e-05, + "loss": 2.8695, + "step": 28139 + }, + { + "epoch": 1.310124077565938, + "grad_norm": 0.34222219725196246, + "learning_rate": 6.925985351268033e-05, + "loss": 2.7881, + "step": 28140 + }, + { + "epoch": 1.310170635752031, + "grad_norm": 0.3430112712431843, + "learning_rate": 6.925735376931296e-05, + "loss": 2.8239, + "step": 28141 + }, + { + "epoch": 1.3102171939381242, + "grad_norm": 0.3554308853687525, + "learning_rate": 6.925485396942715e-05, + "loss": 2.7823, + "step": 28142 + }, + { + "epoch": 1.3102637521242173, + "grad_norm": 0.3454228200752186, + "learning_rate": 6.925235411303025e-05, + "loss": 2.6747, + "step": 28143 + }, + { + "epoch": 1.3103103103103102, + "grad_norm": 0.36567341872053827, + "learning_rate": 6.924985420012959e-05, + "loss": 2.7632, + "step": 28144 + }, + { + "epoch": 1.3103568684964033, + "grad_norm": 0.3358126708525124, + "learning_rate": 6.92473542307325e-05, + "loss": 2.7616, + "step": 28145 + }, + { + "epoch": 1.3104034266824964, + "grad_norm": 0.3826232871235829, + "learning_rate": 6.92448542048463e-05, + "loss": 2.7895, + "step": 28146 + }, + { + "epoch": 1.3104499848685895, + "grad_norm": 0.35955784333906793, + "learning_rate": 6.924235412247838e-05, + "loss": 2.8265, + "step": 28147 + }, + { + "epoch": 1.3104965430546827, + "grad_norm": 0.3584136927741611, + "learning_rate": 6.923985398363604e-05, + "loss": 2.805, + "step": 28148 + }, + { + "epoch": 1.3105431012407758, + "grad_norm": 0.342244767520328, + "learning_rate": 6.923735378832663e-05, + "loss": 2.7455, + "step": 28149 + }, + { + "epoch": 1.3105896594268687, + "grad_norm": 0.3773852480050041, + "learning_rate": 6.923485353655748e-05, + "loss": 2.846, + "step": 28150 + }, + { + "epoch": 1.3106362176129618, + "grad_norm": 0.3744393451433115, + "learning_rate": 6.923235322833591e-05, + "loss": 2.7719, + "step": 28151 + }, + { + "epoch": 1.310682775799055, + "grad_norm": 0.3567350435417813, + "learning_rate": 6.922985286366929e-05, + "loss": 2.8318, + "step": 28152 + }, + { + "epoch": 
1.310729333985148, + "grad_norm": 0.35242467139368666, + "learning_rate": 6.922735244256494e-05, + "loss": 2.7633, + "step": 28153 + }, + { + "epoch": 1.310775892171241, + "grad_norm": 0.34704718759480657, + "learning_rate": 6.922485196503021e-05, + "loss": 2.8417, + "step": 28154 + }, + { + "epoch": 1.310822450357334, + "grad_norm": 0.3451772503855369, + "learning_rate": 6.922235143107244e-05, + "loss": 2.7963, + "step": 28155 + }, + { + "epoch": 1.3108690085434271, + "grad_norm": 0.3425422724091099, + "learning_rate": 6.921985084069895e-05, + "loss": 2.7755, + "step": 28156 + }, + { + "epoch": 1.3109155667295203, + "grad_norm": 0.3348687602237237, + "learning_rate": 6.92173501939171e-05, + "loss": 2.7194, + "step": 28157 + }, + { + "epoch": 1.3109621249156134, + "grad_norm": 0.3520072992569176, + "learning_rate": 6.921484949073422e-05, + "loss": 2.7991, + "step": 28158 + }, + { + "epoch": 1.3110086831017065, + "grad_norm": 0.34490509606589653, + "learning_rate": 6.921234873115765e-05, + "loss": 2.783, + "step": 28159 + }, + { + "epoch": 1.3110552412877994, + "grad_norm": 0.3522700696720651, + "learning_rate": 6.920984791519473e-05, + "loss": 2.8295, + "step": 28160 + }, + { + "epoch": 1.3111017994738925, + "grad_norm": 0.3510533124353367, + "learning_rate": 6.920734704285279e-05, + "loss": 2.8432, + "step": 28161 + }, + { + "epoch": 1.3111483576599856, + "grad_norm": 0.36143744604840433, + "learning_rate": 6.920484611413918e-05, + "loss": 2.7515, + "step": 28162 + }, + { + "epoch": 1.3111949158460785, + "grad_norm": 0.3350395696772062, + "learning_rate": 6.920234512906125e-05, + "loss": 2.7731, + "step": 28163 + }, + { + "epoch": 1.3112414740321716, + "grad_norm": 0.38150030659142764, + "learning_rate": 6.919984408762632e-05, + "loss": 2.7632, + "step": 28164 + }, + { + "epoch": 1.3112880322182647, + "grad_norm": 0.33040459566053, + "learning_rate": 6.919734298984175e-05, + "loss": 2.7755, + "step": 28165 + }, + { + "epoch": 1.3113345904043578, + "grad_norm": 0.3987920732760064, + "learning_rate": 6.919484183571485e-05, + "loss": 2.7692, + "step": 28166 + }, + { + "epoch": 1.311381148590451, + "grad_norm": 0.34320639112006507, + "learning_rate": 6.919234062525299e-05, + "loss": 2.658, + "step": 28167 + }, + { + "epoch": 1.311427706776544, + "grad_norm": 0.3561372837404737, + "learning_rate": 6.91898393584635e-05, + "loss": 2.8138, + "step": 28168 + }, + { + "epoch": 1.3114742649626372, + "grad_norm": 0.31480718227437887, + "learning_rate": 6.918733803535374e-05, + "loss": 2.7131, + "step": 28169 + }, + { + "epoch": 1.31152082314873, + "grad_norm": 0.3614357321947628, + "learning_rate": 6.918483665593101e-05, + "loss": 2.7598, + "step": 28170 + }, + { + "epoch": 1.3115673813348232, + "grad_norm": 0.31366305981487735, + "learning_rate": 6.918233522020267e-05, + "loss": 2.8457, + "step": 28171 + }, + { + "epoch": 1.3116139395209163, + "grad_norm": 0.36658993941781975, + "learning_rate": 6.917983372817609e-05, + "loss": 2.8668, + "step": 28172 + }, + { + "epoch": 1.3116604977070092, + "grad_norm": 0.3435116858914658, + "learning_rate": 6.917733217985856e-05, + "loss": 2.8511, + "step": 28173 + }, + { + "epoch": 1.3117070558931023, + "grad_norm": 0.32989884591510743, + "learning_rate": 6.917483057525748e-05, + "loss": 2.8071, + "step": 28174 + }, + { + "epoch": 1.3117536140791954, + "grad_norm": 0.36743980980480495, + "learning_rate": 6.917232891438014e-05, + "loss": 2.7805, + "step": 28175 + }, + { + "epoch": 1.3118001722652886, + "grad_norm": 0.34877518482150605, + "learning_rate": 
6.916982719723393e-05, + "loss": 2.834, + "step": 28176 + }, + { + "epoch": 1.3118467304513817, + "grad_norm": 0.36342709392994976, + "learning_rate": 6.916732542382613e-05, + "loss": 2.8065, + "step": 28177 + }, + { + "epoch": 1.3118932886374748, + "grad_norm": 0.3426042553803364, + "learning_rate": 6.916482359416413e-05, + "loss": 2.7405, + "step": 28178 + }, + { + "epoch": 1.3119398468235677, + "grad_norm": 0.3757603024341506, + "learning_rate": 6.916232170825526e-05, + "loss": 2.8703, + "step": 28179 + }, + { + "epoch": 1.3119864050096608, + "grad_norm": 0.3629457571524165, + "learning_rate": 6.915981976610686e-05, + "loss": 2.8789, + "step": 28180 + }, + { + "epoch": 1.312032963195754, + "grad_norm": 0.3542752488530839, + "learning_rate": 6.915731776772629e-05, + "loss": 2.8159, + "step": 28181 + }, + { + "epoch": 1.312079521381847, + "grad_norm": 0.32884074429387217, + "learning_rate": 6.915481571312088e-05, + "loss": 2.7415, + "step": 28182 + }, + { + "epoch": 1.31212607956794, + "grad_norm": 0.32586193672408786, + "learning_rate": 6.915231360229797e-05, + "loss": 2.7645, + "step": 28183 + }, + { + "epoch": 1.312172637754033, + "grad_norm": 0.350320728984334, + "learning_rate": 6.914981143526488e-05, + "loss": 2.7335, + "step": 28184 + }, + { + "epoch": 1.3122191959401261, + "grad_norm": 0.3231946706970403, + "learning_rate": 6.914730921202902e-05, + "loss": 2.8076, + "step": 28185 + }, + { + "epoch": 1.3122657541262193, + "grad_norm": 0.337995362026059, + "learning_rate": 6.914480693259769e-05, + "loss": 2.848, + "step": 28186 + }, + { + "epoch": 1.3123123123123124, + "grad_norm": 0.3310717375629977, + "learning_rate": 6.914230459697821e-05, + "loss": 2.8695, + "step": 28187 + }, + { + "epoch": 1.3123588704984055, + "grad_norm": 0.3406200217903613, + "learning_rate": 6.913980220517797e-05, + "loss": 2.7975, + "step": 28188 + }, + { + "epoch": 1.3124054286844984, + "grad_norm": 0.337972386685527, + "learning_rate": 6.913729975720429e-05, + "loss": 2.5963, + "step": 28189 + }, + { + "epoch": 1.3124519868705915, + "grad_norm": 0.3142794102808634, + "learning_rate": 6.913479725306453e-05, + "loss": 2.6289, + "step": 28190 + }, + { + "epoch": 1.3124985450566846, + "grad_norm": 0.3234906067327104, + "learning_rate": 6.913229469276601e-05, + "loss": 2.7242, + "step": 28191 + }, + { + "epoch": 1.3125451032427777, + "grad_norm": 0.31727540376047714, + "learning_rate": 6.91297920763161e-05, + "loss": 2.8083, + "step": 28192 + }, + { + "epoch": 1.3125916614288706, + "grad_norm": 0.3232260227295941, + "learning_rate": 6.912728940372213e-05, + "loss": 2.7595, + "step": 28193 + }, + { + "epoch": 1.3126382196149637, + "grad_norm": 0.2905638135135248, + "learning_rate": 6.912478667499146e-05, + "loss": 2.7859, + "step": 28194 + }, + { + "epoch": 1.3126847778010569, + "grad_norm": 0.3366180759034099, + "learning_rate": 6.912228389013142e-05, + "loss": 2.83, + "step": 28195 + }, + { + "epoch": 1.31273133598715, + "grad_norm": 0.3232623980252689, + "learning_rate": 6.911978104914936e-05, + "loss": 2.8305, + "step": 28196 + }, + { + "epoch": 1.312777894173243, + "grad_norm": 0.3219593371123784, + "learning_rate": 6.911727815205262e-05, + "loss": 2.8692, + "step": 28197 + }, + { + "epoch": 1.3128244523593362, + "grad_norm": 0.346026292977174, + "learning_rate": 6.911477519884854e-05, + "loss": 2.7346, + "step": 28198 + }, + { + "epoch": 1.312871010545429, + "grad_norm": 0.32359758589891563, + "learning_rate": 6.911227218954452e-05, + "loss": 2.7677, + "step": 28199 + }, + { + "epoch": 
1.3129175687315222, + "grad_norm": 0.3706253896026966, + "learning_rate": 6.910976912414783e-05, + "loss": 2.6479, + "step": 28200 + }, + { + "epoch": 1.3129641269176153, + "grad_norm": 0.30761029054440703, + "learning_rate": 6.910726600266586e-05, + "loss": 2.8613, + "step": 28201 + }, + { + "epoch": 1.3130106851037084, + "grad_norm": 0.34885579303610115, + "learning_rate": 6.910476282510595e-05, + "loss": 2.7629, + "step": 28202 + }, + { + "epoch": 1.3130572432898013, + "grad_norm": 0.36031861141942545, + "learning_rate": 6.910225959147545e-05, + "loss": 2.8678, + "step": 28203 + }, + { + "epoch": 1.3131038014758944, + "grad_norm": 0.3462545983962093, + "learning_rate": 6.909975630178168e-05, + "loss": 2.8829, + "step": 28204 + }, + { + "epoch": 1.3131503596619876, + "grad_norm": 0.3536103148292622, + "learning_rate": 6.909725295603201e-05, + "loss": 2.8231, + "step": 28205 + }, + { + "epoch": 1.3131969178480807, + "grad_norm": 0.3735092142767792, + "learning_rate": 6.909474955423378e-05, + "loss": 2.8443, + "step": 28206 + }, + { + "epoch": 1.3132434760341738, + "grad_norm": 0.38955702833171557, + "learning_rate": 6.909224609639436e-05, + "loss": 2.7638, + "step": 28207 + }, + { + "epoch": 1.313290034220267, + "grad_norm": 0.33436494333872313, + "learning_rate": 6.908974258252108e-05, + "loss": 2.7466, + "step": 28208 + }, + { + "epoch": 1.3133365924063598, + "grad_norm": 0.42129874667376754, + "learning_rate": 6.908723901262126e-05, + "loss": 2.7006, + "step": 28209 + }, + { + "epoch": 1.313383150592453, + "grad_norm": 0.3545247785991581, + "learning_rate": 6.908473538670229e-05, + "loss": 2.7324, + "step": 28210 + }, + { + "epoch": 1.313429708778546, + "grad_norm": 0.4202276292020673, + "learning_rate": 6.90822317047715e-05, + "loss": 2.7667, + "step": 28211 + }, + { + "epoch": 1.313476266964639, + "grad_norm": 0.3775229197761591, + "learning_rate": 6.907972796683625e-05, + "loss": 2.7838, + "step": 28212 + }, + { + "epoch": 1.313522825150732, + "grad_norm": 0.37128687854357295, + "learning_rate": 6.907722417290386e-05, + "loss": 2.7477, + "step": 28213 + }, + { + "epoch": 1.3135693833368252, + "grad_norm": 0.40486146880166324, + "learning_rate": 6.907472032298172e-05, + "loss": 2.9267, + "step": 28214 + }, + { + "epoch": 1.3136159415229183, + "grad_norm": 0.37625746165380636, + "learning_rate": 6.907221641707714e-05, + "loss": 2.7999, + "step": 28215 + }, + { + "epoch": 1.3136624997090114, + "grad_norm": 0.3833754332781913, + "learning_rate": 6.906971245519747e-05, + "loss": 2.7772, + "step": 28216 + }, + { + "epoch": 1.3137090578951045, + "grad_norm": 0.32941731493025184, + "learning_rate": 6.906720843735011e-05, + "loss": 2.8518, + "step": 28217 + }, + { + "epoch": 1.3137556160811974, + "grad_norm": 0.40282529950811286, + "learning_rate": 6.906470436354235e-05, + "loss": 2.7633, + "step": 28218 + }, + { + "epoch": 1.3138021742672905, + "grad_norm": 0.3225339547840897, + "learning_rate": 6.906220023378155e-05, + "loss": 2.7078, + "step": 28219 + }, + { + "epoch": 1.3138487324533836, + "grad_norm": 0.3518421478781051, + "learning_rate": 6.905969604807507e-05, + "loss": 2.7374, + "step": 28220 + }, + { + "epoch": 1.3138952906394767, + "grad_norm": 0.3395700388765309, + "learning_rate": 6.905719180643028e-05, + "loss": 2.7072, + "step": 28221 + }, + { + "epoch": 1.3139418488255696, + "grad_norm": 0.33936576614992525, + "learning_rate": 6.90546875088545e-05, + "loss": 2.8205, + "step": 28222 + }, + { + "epoch": 1.3139884070116628, + "grad_norm": 0.35273123376155036, + "learning_rate": 
6.905218315535509e-05, + "loss": 2.7842, + "step": 28223 + }, + { + "epoch": 1.3140349651977559, + "grad_norm": 0.3567132818374996, + "learning_rate": 6.90496787459394e-05, + "loss": 2.7577, + "step": 28224 + }, + { + "epoch": 1.314081523383849, + "grad_norm": 0.37793325646636544, + "learning_rate": 6.904717428061478e-05, + "loss": 2.801, + "step": 28225 + }, + { + "epoch": 1.314128081569942, + "grad_norm": 0.3292471043179029, + "learning_rate": 6.904466975938857e-05, + "loss": 2.7361, + "step": 28226 + }, + { + "epoch": 1.3141746397560352, + "grad_norm": 0.361087662261617, + "learning_rate": 6.904216518226814e-05, + "loss": 2.7104, + "step": 28227 + }, + { + "epoch": 1.314221197942128, + "grad_norm": 0.34726354337961085, + "learning_rate": 6.903966054926082e-05, + "loss": 2.7385, + "step": 28228 + }, + { + "epoch": 1.3142677561282212, + "grad_norm": 0.7951785070547601, + "learning_rate": 6.903715586037399e-05, + "loss": 2.6936, + "step": 28229 + }, + { + "epoch": 1.3143143143143143, + "grad_norm": 0.4144365232580793, + "learning_rate": 6.903465111561497e-05, + "loss": 2.7276, + "step": 28230 + }, + { + "epoch": 1.3143608725004075, + "grad_norm": 0.3667499565296697, + "learning_rate": 6.903214631499113e-05, + "loss": 2.7641, + "step": 28231 + }, + { + "epoch": 1.3144074306865003, + "grad_norm": 0.39429025960639486, + "learning_rate": 6.902964145850982e-05, + "loss": 2.6309, + "step": 28232 + }, + { + "epoch": 1.3144539888725935, + "grad_norm": 0.406727338946652, + "learning_rate": 6.902713654617838e-05, + "loss": 2.6777, + "step": 28233 + }, + { + "epoch": 1.3145005470586866, + "grad_norm": 0.3910659739209491, + "learning_rate": 6.902463157800416e-05, + "loss": 2.7773, + "step": 28234 + }, + { + "epoch": 1.3145471052447797, + "grad_norm": 0.38104987652161004, + "learning_rate": 6.902212655399454e-05, + "loss": 2.7688, + "step": 28235 + }, + { + "epoch": 1.3145936634308728, + "grad_norm": 0.3921959878330515, + "learning_rate": 6.901962147415684e-05, + "loss": 2.6518, + "step": 28236 + }, + { + "epoch": 1.314640221616966, + "grad_norm": 0.3634465647859006, + "learning_rate": 6.901711633849843e-05, + "loss": 2.8579, + "step": 28237 + }, + { + "epoch": 1.3146867798030588, + "grad_norm": 0.35880231416646347, + "learning_rate": 6.901461114702665e-05, + "loss": 2.8005, + "step": 28238 + }, + { + "epoch": 1.314733337989152, + "grad_norm": 0.3470260309377104, + "learning_rate": 6.901210589974887e-05, + "loss": 2.7451, + "step": 28239 + }, + { + "epoch": 1.314779896175245, + "grad_norm": 0.36669695288296816, + "learning_rate": 6.900960059667243e-05, + "loss": 2.8307, + "step": 28240 + }, + { + "epoch": 1.3148264543613382, + "grad_norm": 0.34080433752104394, + "learning_rate": 6.900709523780467e-05, + "loss": 2.65, + "step": 28241 + }, + { + "epoch": 1.314873012547431, + "grad_norm": 0.36209168524633806, + "learning_rate": 6.900458982315297e-05, + "loss": 2.7718, + "step": 28242 + }, + { + "epoch": 1.3149195707335242, + "grad_norm": 0.35531043249797006, + "learning_rate": 6.900208435272468e-05, + "loss": 2.7752, + "step": 28243 + }, + { + "epoch": 1.3149661289196173, + "grad_norm": 0.3556643618706255, + "learning_rate": 6.899957882652714e-05, + "loss": 2.6711, + "step": 28244 + }, + { + "epoch": 1.3150126871057104, + "grad_norm": 0.3308889577492836, + "learning_rate": 6.89970732445677e-05, + "loss": 2.7483, + "step": 28245 + }, + { + "epoch": 1.3150592452918035, + "grad_norm": 0.34583493358031636, + "learning_rate": 6.899456760685373e-05, + "loss": 2.7212, + "step": 28246 + }, + { + "epoch": 
1.3151058034778966, + "grad_norm": 0.3307231920869383, + "learning_rate": 6.899206191339256e-05, + "loss": 2.7624, + "step": 28247 + }, + { + "epoch": 1.3151523616639895, + "grad_norm": 0.3339120016946993, + "learning_rate": 6.898955616419158e-05, + "loss": 2.8575, + "step": 28248 + }, + { + "epoch": 1.3151989198500826, + "grad_norm": 0.3420450376644126, + "learning_rate": 6.89870503592581e-05, + "loss": 2.6482, + "step": 28249 + }, + { + "epoch": 1.3152454780361758, + "grad_norm": 0.36798206328851635, + "learning_rate": 6.89845444985995e-05, + "loss": 2.8132, + "step": 28250 + }, + { + "epoch": 1.3152920362222686, + "grad_norm": 0.35137752295131736, + "learning_rate": 6.898203858222316e-05, + "loss": 2.7579, + "step": 28251 + }, + { + "epoch": 1.3153385944083618, + "grad_norm": 0.35560414280408253, + "learning_rate": 6.897953261013639e-05, + "loss": 2.8511, + "step": 28252 + }, + { + "epoch": 1.3153851525944549, + "grad_norm": 0.36910752453618334, + "learning_rate": 6.897702658234655e-05, + "loss": 2.782, + "step": 28253 + }, + { + "epoch": 1.315431710780548, + "grad_norm": 0.34695884253359255, + "learning_rate": 6.897452049886103e-05, + "loss": 2.6991, + "step": 28254 + }, + { + "epoch": 1.315478268966641, + "grad_norm": 0.336429659285117, + "learning_rate": 6.897201435968715e-05, + "loss": 2.7425, + "step": 28255 + }, + { + "epoch": 1.3155248271527342, + "grad_norm": 0.34665978295192473, + "learning_rate": 6.896950816483226e-05, + "loss": 2.7304, + "step": 28256 + }, + { + "epoch": 1.3155713853388273, + "grad_norm": 0.37727397558237624, + "learning_rate": 6.896700191430376e-05, + "loss": 2.8079, + "step": 28257 + }, + { + "epoch": 1.3156179435249202, + "grad_norm": 0.37765418703911696, + "learning_rate": 6.896449560810895e-05, + "loss": 2.7116, + "step": 28258 + }, + { + "epoch": 1.3156645017110133, + "grad_norm": 0.307363046114659, + "learning_rate": 6.896198924625524e-05, + "loss": 2.8008, + "step": 28259 + }, + { + "epoch": 1.3157110598971065, + "grad_norm": 0.35245885210311034, + "learning_rate": 6.895948282874994e-05, + "loss": 2.7619, + "step": 28260 + }, + { + "epoch": 1.3157576180831994, + "grad_norm": 0.3089656890194223, + "learning_rate": 6.895697635560044e-05, + "loss": 2.7822, + "step": 28261 + }, + { + "epoch": 1.3158041762692925, + "grad_norm": 0.36662115244436444, + "learning_rate": 6.895446982681408e-05, + "loss": 2.7928, + "step": 28262 + }, + { + "epoch": 1.3158507344553856, + "grad_norm": 0.33060807072729, + "learning_rate": 6.895196324239821e-05, + "loss": 2.7908, + "step": 28263 + }, + { + "epoch": 1.3158972926414787, + "grad_norm": 0.34724394911908174, + "learning_rate": 6.89494566023602e-05, + "loss": 2.7215, + "step": 28264 + }, + { + "epoch": 1.3159438508275718, + "grad_norm": 0.32363153180776366, + "learning_rate": 6.894694990670741e-05, + "loss": 2.7369, + "step": 28265 + }, + { + "epoch": 1.315990409013665, + "grad_norm": 0.34378337202440834, + "learning_rate": 6.894444315544719e-05, + "loss": 2.7708, + "step": 28266 + }, + { + "epoch": 1.3160369671997578, + "grad_norm": 0.339726872364358, + "learning_rate": 6.894193634858689e-05, + "loss": 2.6713, + "step": 28267 + }, + { + "epoch": 1.316083525385851, + "grad_norm": 0.33226030441979854, + "learning_rate": 6.893942948613386e-05, + "loss": 2.8072, + "step": 28268 + }, + { + "epoch": 1.316130083571944, + "grad_norm": 0.38654924197609297, + "learning_rate": 6.893692256809548e-05, + "loss": 2.72, + "step": 28269 + }, + { + "epoch": 1.3161766417580372, + "grad_norm": 0.32457040369071716, + "learning_rate": 
6.893441559447909e-05, + "loss": 2.7771, + "step": 28270 + }, + { + "epoch": 1.31622319994413, + "grad_norm": 0.35846695304818477, + "learning_rate": 6.893190856529207e-05, + "loss": 2.824, + "step": 28271 + }, + { + "epoch": 1.3162697581302232, + "grad_norm": 0.34586509291077466, + "learning_rate": 6.892940148054175e-05, + "loss": 2.8316, + "step": 28272 + }, + { + "epoch": 1.3163163163163163, + "grad_norm": 0.322126318696154, + "learning_rate": 6.892689434023552e-05, + "loss": 2.6924, + "step": 28273 + }, + { + "epoch": 1.3163628745024094, + "grad_norm": 0.3270941947899938, + "learning_rate": 6.89243871443807e-05, + "loss": 2.5937, + "step": 28274 + }, + { + "epoch": 1.3164094326885025, + "grad_norm": 0.33351390716665225, + "learning_rate": 6.892187989298469e-05, + "loss": 2.7702, + "step": 28275 + }, + { + "epoch": 1.3164559908745956, + "grad_norm": 0.3129496445022337, + "learning_rate": 6.89193725860548e-05, + "loss": 2.7646, + "step": 28276 + }, + { + "epoch": 1.3165025490606885, + "grad_norm": 0.3220912754302151, + "learning_rate": 6.891686522359843e-05, + "loss": 2.6679, + "step": 28277 + }, + { + "epoch": 1.3165491072467816, + "grad_norm": 0.33198769650451465, + "learning_rate": 6.891435780562292e-05, + "loss": 2.6653, + "step": 28278 + }, + { + "epoch": 1.3165956654328748, + "grad_norm": 0.3030188971006181, + "learning_rate": 6.891185033213564e-05, + "loss": 2.7591, + "step": 28279 + }, + { + "epoch": 1.3166422236189679, + "grad_norm": 0.3649387931640125, + "learning_rate": 6.890934280314394e-05, + "loss": 2.748, + "step": 28280 + }, + { + "epoch": 1.3166887818050608, + "grad_norm": 0.31983714117951956, + "learning_rate": 6.890683521865518e-05, + "loss": 2.7085, + "step": 28281 + }, + { + "epoch": 1.3167353399911539, + "grad_norm": 0.35324388494125586, + "learning_rate": 6.890432757867673e-05, + "loss": 2.7804, + "step": 28282 + }, + { + "epoch": 1.316781898177247, + "grad_norm": 0.3528673503618801, + "learning_rate": 6.890181988321591e-05, + "loss": 2.8843, + "step": 28283 + }, + { + "epoch": 1.3168284563633401, + "grad_norm": 0.3238783004551185, + "learning_rate": 6.889931213228015e-05, + "loss": 2.7651, + "step": 28284 + }, + { + "epoch": 1.3168750145494332, + "grad_norm": 0.3382835732215854, + "learning_rate": 6.889680432587675e-05, + "loss": 2.8568, + "step": 28285 + }, + { + "epoch": 1.3169215727355263, + "grad_norm": 0.31139523254511275, + "learning_rate": 6.889429646401308e-05, + "loss": 2.7158, + "step": 28286 + }, + { + "epoch": 1.3169681309216192, + "grad_norm": 0.3180035139964075, + "learning_rate": 6.889178854669653e-05, + "loss": 2.7218, + "step": 28287 + }, + { + "epoch": 1.3170146891077124, + "grad_norm": 0.3252189695524449, + "learning_rate": 6.888928057393444e-05, + "loss": 2.7576, + "step": 28288 + }, + { + "epoch": 1.3170612472938055, + "grad_norm": 0.3109639072460655, + "learning_rate": 6.888677254573416e-05, + "loss": 2.8111, + "step": 28289 + }, + { + "epoch": 1.3171078054798986, + "grad_norm": 0.3411537179749863, + "learning_rate": 6.888426446210308e-05, + "loss": 2.6788, + "step": 28290 + }, + { + "epoch": 1.3171543636659915, + "grad_norm": 0.317859630576008, + "learning_rate": 6.888175632304854e-05, + "loss": 2.7876, + "step": 28291 + }, + { + "epoch": 1.3172009218520846, + "grad_norm": 0.34147761552965455, + "learning_rate": 6.887924812857791e-05, + "loss": 2.9481, + "step": 28292 + }, + { + "epoch": 1.3172474800381777, + "grad_norm": 0.29780648884361344, + "learning_rate": 6.887673987869853e-05, + "loss": 2.6834, + "step": 28293 + }, + { + "epoch": 
1.3172940382242708, + "grad_norm": 0.33856685329756897, + "learning_rate": 6.88742315734178e-05, + "loss": 2.8791, + "step": 28294 + }, + { + "epoch": 1.317340596410364, + "grad_norm": 0.3334411150814909, + "learning_rate": 6.887172321274304e-05, + "loss": 2.86, + "step": 28295 + }, + { + "epoch": 1.317387154596457, + "grad_norm": 0.31673924517047103, + "learning_rate": 6.886921479668164e-05, + "loss": 2.7893, + "step": 28296 + }, + { + "epoch": 1.31743371278255, + "grad_norm": 0.3403459745524733, + "learning_rate": 6.886670632524095e-05, + "loss": 2.8075, + "step": 28297 + }, + { + "epoch": 1.317480270968643, + "grad_norm": 0.35674406850046697, + "learning_rate": 6.886419779842835e-05, + "loss": 2.85, + "step": 28298 + }, + { + "epoch": 1.3175268291547362, + "grad_norm": 0.3356312487320147, + "learning_rate": 6.886168921625116e-05, + "loss": 2.7586, + "step": 28299 + }, + { + "epoch": 1.317573387340829, + "grad_norm": 0.3654554642392466, + "learning_rate": 6.885918057871679e-05, + "loss": 2.7654, + "step": 28300 + }, + { + "epoch": 1.3176199455269222, + "grad_norm": 0.3329466706733507, + "learning_rate": 6.885667188583258e-05, + "loss": 2.7019, + "step": 28301 + }, + { + "epoch": 1.3176665037130153, + "grad_norm": 0.32587931377665613, + "learning_rate": 6.885416313760588e-05, + "loss": 2.7204, + "step": 28302 + }, + { + "epoch": 1.3177130618991084, + "grad_norm": 0.32721151380627317, + "learning_rate": 6.88516543340441e-05, + "loss": 2.6732, + "step": 28303 + }, + { + "epoch": 1.3177596200852015, + "grad_norm": 0.3857931589205173, + "learning_rate": 6.884914547515455e-05, + "loss": 2.7552, + "step": 28304 + }, + { + "epoch": 1.3178061782712946, + "grad_norm": 0.3493609765923686, + "learning_rate": 6.884663656094462e-05, + "loss": 2.7528, + "step": 28305 + }, + { + "epoch": 1.3178527364573875, + "grad_norm": 0.3578088929821709, + "learning_rate": 6.884412759142167e-05, + "loss": 2.8747, + "step": 28306 + }, + { + "epoch": 1.3178992946434807, + "grad_norm": 0.36725007642455587, + "learning_rate": 6.884161856659308e-05, + "loss": 2.7417, + "step": 28307 + }, + { + "epoch": 1.3179458528295738, + "grad_norm": 0.3234434330519558, + "learning_rate": 6.883910948646615e-05, + "loss": 2.7347, + "step": 28308 + }, + { + "epoch": 1.3179924110156669, + "grad_norm": 0.34220744733230607, + "learning_rate": 6.883660035104832e-05, + "loss": 2.772, + "step": 28309 + }, + { + "epoch": 1.3180389692017598, + "grad_norm": 0.338129411249941, + "learning_rate": 6.883409116034692e-05, + "loss": 2.6913, + "step": 28310 + }, + { + "epoch": 1.318085527387853, + "grad_norm": 0.33658428241592614, + "learning_rate": 6.883158191436933e-05, + "loss": 2.8612, + "step": 28311 + }, + { + "epoch": 1.318132085573946, + "grad_norm": 0.336171490590678, + "learning_rate": 6.882907261312289e-05, + "loss": 2.8464, + "step": 28312 + }, + { + "epoch": 1.3181786437600391, + "grad_norm": 0.3482892986482662, + "learning_rate": 6.882656325661497e-05, + "loss": 2.8251, + "step": 28313 + }, + { + "epoch": 1.3182252019461322, + "grad_norm": 0.3675188806769836, + "learning_rate": 6.882405384485294e-05, + "loss": 2.8065, + "step": 28314 + }, + { + "epoch": 1.3182717601322254, + "grad_norm": 0.34165256502108976, + "learning_rate": 6.882154437784418e-05, + "loss": 2.7359, + "step": 28315 + }, + { + "epoch": 1.3183183183183182, + "grad_norm": 0.36122188206829714, + "learning_rate": 6.881903485559604e-05, + "loss": 2.8589, + "step": 28316 + }, + { + "epoch": 1.3183648765044114, + "grad_norm": 0.35529143435903376, + "learning_rate": 
6.881652527811588e-05, + "loss": 2.7595, + "step": 28317 + }, + { + "epoch": 1.3184114346905045, + "grad_norm": 0.36248617441905834, + "learning_rate": 6.881401564541108e-05, + "loss": 2.7764, + "step": 28318 + }, + { + "epoch": 1.3184579928765976, + "grad_norm": 0.350532315538084, + "learning_rate": 6.881150595748897e-05, + "loss": 2.7756, + "step": 28319 + }, + { + "epoch": 1.3185045510626905, + "grad_norm": 0.34633762959735004, + "learning_rate": 6.880899621435699e-05, + "loss": 2.7578, + "step": 28320 + }, + { + "epoch": 1.3185511092487836, + "grad_norm": 0.3670985711940209, + "learning_rate": 6.880648641602243e-05, + "loss": 2.7804, + "step": 28321 + }, + { + "epoch": 1.3185976674348767, + "grad_norm": 0.3268081832198881, + "learning_rate": 6.88039765624927e-05, + "loss": 2.6587, + "step": 28322 + }, + { + "epoch": 1.3186442256209698, + "grad_norm": 0.3366714948392882, + "learning_rate": 6.880146665377512e-05, + "loss": 2.7641, + "step": 28323 + }, + { + "epoch": 1.318690783807063, + "grad_norm": 0.34866341141223883, + "learning_rate": 6.87989566898771e-05, + "loss": 2.7415, + "step": 28324 + }, + { + "epoch": 1.318737341993156, + "grad_norm": 0.3573367126540576, + "learning_rate": 6.879644667080601e-05, + "loss": 2.7422, + "step": 28325 + }, + { + "epoch": 1.318783900179249, + "grad_norm": 0.35368425143407345, + "learning_rate": 6.87939365965692e-05, + "loss": 2.7899, + "step": 28326 + }, + { + "epoch": 1.318830458365342, + "grad_norm": 0.3735390703687603, + "learning_rate": 6.879142646717403e-05, + "loss": 2.8571, + "step": 28327 + }, + { + "epoch": 1.3188770165514352, + "grad_norm": 0.3474341976288762, + "learning_rate": 6.878891628262788e-05, + "loss": 2.7105, + "step": 28328 + }, + { + "epoch": 1.3189235747375283, + "grad_norm": 0.34322615718021837, + "learning_rate": 6.878640604293811e-05, + "loss": 2.7984, + "step": 28329 + }, + { + "epoch": 1.3189701329236212, + "grad_norm": 0.34475204860593817, + "learning_rate": 6.878389574811209e-05, + "loss": 2.87, + "step": 28330 + }, + { + "epoch": 1.3190166911097143, + "grad_norm": 0.38971101143625003, + "learning_rate": 6.878138539815717e-05, + "loss": 2.7408, + "step": 28331 + }, + { + "epoch": 1.3190632492958074, + "grad_norm": 0.3629761232427879, + "learning_rate": 6.877887499308076e-05, + "loss": 2.7824, + "step": 28332 + }, + { + "epoch": 1.3191098074819005, + "grad_norm": 0.37580615302670256, + "learning_rate": 6.877636453289018e-05, + "loss": 2.7091, + "step": 28333 + }, + { + "epoch": 1.3191563656679937, + "grad_norm": 0.39762671189769644, + "learning_rate": 6.877385401759284e-05, + "loss": 2.7683, + "step": 28334 + }, + { + "epoch": 1.3192029238540868, + "grad_norm": 0.3832329022270312, + "learning_rate": 6.877134344719607e-05, + "loss": 2.868, + "step": 28335 + }, + { + "epoch": 1.3192494820401797, + "grad_norm": 0.37318126362848375, + "learning_rate": 6.876883282170726e-05, + "loss": 2.722, + "step": 28336 + }, + { + "epoch": 1.3192960402262728, + "grad_norm": 0.3298135891901789, + "learning_rate": 6.876632214113377e-05, + "loss": 2.6967, + "step": 28337 + }, + { + "epoch": 1.319342598412366, + "grad_norm": 0.3707665012218629, + "learning_rate": 6.8763811405483e-05, + "loss": 2.8834, + "step": 28338 + }, + { + "epoch": 1.3193891565984588, + "grad_norm": 0.32979256818755354, + "learning_rate": 6.876130061476226e-05, + "loss": 2.7523, + "step": 28339 + }, + { + "epoch": 1.319435714784552, + "grad_norm": 0.35158388209696007, + "learning_rate": 6.875878976897897e-05, + "loss": 2.8602, + "step": 28340 + }, + { + "epoch": 
1.319482272970645, + "grad_norm": 0.33598317636878955, + "learning_rate": 6.875627886814047e-05, + "loss": 2.7701, + "step": 28341 + }, + { + "epoch": 1.3195288311567381, + "grad_norm": 0.34091216747272934, + "learning_rate": 6.875376791225414e-05, + "loss": 2.8239, + "step": 28342 + }, + { + "epoch": 1.3195753893428313, + "grad_norm": 0.33900337228699073, + "learning_rate": 6.875125690132737e-05, + "loss": 2.8429, + "step": 28343 + }, + { + "epoch": 1.3196219475289244, + "grad_norm": 0.3248168792441142, + "learning_rate": 6.874874583536748e-05, + "loss": 2.8346, + "step": 28344 + }, + { + "epoch": 1.3196685057150175, + "grad_norm": 0.3314282382647401, + "learning_rate": 6.874623471438188e-05, + "loss": 2.8508, + "step": 28345 + }, + { + "epoch": 1.3197150639011104, + "grad_norm": 0.32571648793893127, + "learning_rate": 6.874372353837792e-05, + "loss": 2.7708, + "step": 28346 + }, + { + "epoch": 1.3197616220872035, + "grad_norm": 0.33251226332429035, + "learning_rate": 6.874121230736299e-05, + "loss": 2.753, + "step": 28347 + }, + { + "epoch": 1.3198081802732966, + "grad_norm": 0.3192173517678669, + "learning_rate": 6.873870102134444e-05, + "loss": 2.792, + "step": 28348 + }, + { + "epoch": 1.3198547384593895, + "grad_norm": 0.34662934023373104, + "learning_rate": 6.873618968032964e-05, + "loss": 2.8375, + "step": 28349 + }, + { + "epoch": 1.3199012966454826, + "grad_norm": 0.3260730580977744, + "learning_rate": 6.873367828432598e-05, + "loss": 2.7622, + "step": 28350 + }, + { + "epoch": 1.3199478548315757, + "grad_norm": 0.3338885553884671, + "learning_rate": 6.873116683334082e-05, + "loss": 2.8375, + "step": 28351 + }, + { + "epoch": 1.3199944130176688, + "grad_norm": 0.33532227437049467, + "learning_rate": 6.872865532738154e-05, + "loss": 2.7466, + "step": 28352 + }, + { + "epoch": 1.320040971203762, + "grad_norm": 0.32223872624308, + "learning_rate": 6.872614376645549e-05, + "loss": 2.695, + "step": 28353 + }, + { + "epoch": 1.320087529389855, + "grad_norm": 0.3579823815457121, + "learning_rate": 6.872363215057006e-05, + "loss": 2.8543, + "step": 28354 + }, + { + "epoch": 1.320134087575948, + "grad_norm": 0.338080478523226, + "learning_rate": 6.87211204797326e-05, + "loss": 2.7106, + "step": 28355 + }, + { + "epoch": 1.320180645762041, + "grad_norm": 0.349186373450619, + "learning_rate": 6.871860875395052e-05, + "loss": 2.7769, + "step": 28356 + }, + { + "epoch": 1.3202272039481342, + "grad_norm": 0.33340090198415734, + "learning_rate": 6.871609697323116e-05, + "loss": 2.753, + "step": 28357 + }, + { + "epoch": 1.3202737621342273, + "grad_norm": 0.33749022518757343, + "learning_rate": 6.871358513758188e-05, + "loss": 2.7974, + "step": 28358 + }, + { + "epoch": 1.3203203203203202, + "grad_norm": 0.3621850193134138, + "learning_rate": 6.871107324701009e-05, + "loss": 2.8471, + "step": 28359 + }, + { + "epoch": 1.3203668785064133, + "grad_norm": 0.33575085054330633, + "learning_rate": 6.870856130152312e-05, + "loss": 2.7951, + "step": 28360 + }, + { + "epoch": 1.3204134366925064, + "grad_norm": 0.398500325981283, + "learning_rate": 6.87060493011284e-05, + "loss": 2.8311, + "step": 28361 + }, + { + "epoch": 1.3204599948785996, + "grad_norm": 0.2965321732698524, + "learning_rate": 6.870353724583324e-05, + "loss": 2.7868, + "step": 28362 + }, + { + "epoch": 1.3205065530646927, + "grad_norm": 0.39460777618200654, + "learning_rate": 6.870102513564505e-05, + "loss": 2.7431, + "step": 28363 + }, + { + "epoch": 1.3205531112507858, + "grad_norm": 0.3317314889146149, + "learning_rate": 
6.86985129705712e-05, + "loss": 2.8165, + "step": 28364 + }, + { + "epoch": 1.3205996694368787, + "grad_norm": 0.34302455608758625, + "learning_rate": 6.869600075061904e-05, + "loss": 2.7265, + "step": 28365 + }, + { + "epoch": 1.3206462276229718, + "grad_norm": 0.3518630211457892, + "learning_rate": 6.869348847579597e-05, + "loss": 2.788, + "step": 28366 + }, + { + "epoch": 1.320692785809065, + "grad_norm": 0.34238682426530415, + "learning_rate": 6.869097614610937e-05, + "loss": 2.7869, + "step": 28367 + }, + { + "epoch": 1.320739343995158, + "grad_norm": 0.33050260072285886, + "learning_rate": 6.868846376156657e-05, + "loss": 2.8126, + "step": 28368 + }, + { + "epoch": 1.320785902181251, + "grad_norm": 0.3481642850678808, + "learning_rate": 6.868595132217499e-05, + "loss": 2.7724, + "step": 28369 + }, + { + "epoch": 1.320832460367344, + "grad_norm": 0.32044975666056724, + "learning_rate": 6.868343882794197e-05, + "loss": 2.8266, + "step": 28370 + }, + { + "epoch": 1.3208790185534371, + "grad_norm": 0.3111593072201482, + "learning_rate": 6.86809262788749e-05, + "loss": 2.6913, + "step": 28371 + }, + { + "epoch": 1.3209255767395303, + "grad_norm": 0.3271020968179269, + "learning_rate": 6.867841367498116e-05, + "loss": 2.814, + "step": 28372 + }, + { + "epoch": 1.3209721349256234, + "grad_norm": 0.31667616837772206, + "learning_rate": 6.867590101626811e-05, + "loss": 2.756, + "step": 28373 + }, + { + "epoch": 1.3210186931117165, + "grad_norm": 0.3422723085083442, + "learning_rate": 6.867338830274312e-05, + "loss": 2.8242, + "step": 28374 + }, + { + "epoch": 1.3210652512978094, + "grad_norm": 0.31666324996817774, + "learning_rate": 6.86708755344136e-05, + "loss": 2.8542, + "step": 28375 + }, + { + "epoch": 1.3211118094839025, + "grad_norm": 0.3332954510494722, + "learning_rate": 6.866836271128687e-05, + "loss": 2.7911, + "step": 28376 + }, + { + "epoch": 1.3211583676699956, + "grad_norm": 0.30442331245402804, + "learning_rate": 6.866584983337035e-05, + "loss": 2.7953, + "step": 28377 + }, + { + "epoch": 1.3212049258560887, + "grad_norm": 0.31853720531515817, + "learning_rate": 6.86633369006714e-05, + "loss": 2.6936, + "step": 28378 + }, + { + "epoch": 1.3212514840421816, + "grad_norm": 0.3393112716490456, + "learning_rate": 6.866082391319739e-05, + "loss": 2.719, + "step": 28379 + }, + { + "epoch": 1.3212980422282747, + "grad_norm": 0.31582661073612633, + "learning_rate": 6.86583108709557e-05, + "loss": 2.8072, + "step": 28380 + }, + { + "epoch": 1.3213446004143679, + "grad_norm": 0.3265285932680386, + "learning_rate": 6.865579777395372e-05, + "loss": 2.774, + "step": 28381 + }, + { + "epoch": 1.321391158600461, + "grad_norm": 0.32229422800529567, + "learning_rate": 6.86532846221988e-05, + "loss": 2.7879, + "step": 28382 + }, + { + "epoch": 1.321437716786554, + "grad_norm": 0.33413928595905995, + "learning_rate": 6.865077141569833e-05, + "loss": 2.8263, + "step": 28383 + }, + { + "epoch": 1.3214842749726472, + "grad_norm": 0.3081889778451673, + "learning_rate": 6.86482581544597e-05, + "loss": 2.8847, + "step": 28384 + }, + { + "epoch": 1.32153083315874, + "grad_norm": 0.3472752739270331, + "learning_rate": 6.864574483849023e-05, + "loss": 2.7461, + "step": 28385 + }, + { + "epoch": 1.3215773913448332, + "grad_norm": 0.3354150202446637, + "learning_rate": 6.864323146779735e-05, + "loss": 2.7206, + "step": 28386 + }, + { + "epoch": 1.3216239495309263, + "grad_norm": 0.34654252122063456, + "learning_rate": 6.864071804238843e-05, + "loss": 2.6866, + "step": 28387 + }, + { + "epoch": 
1.3216705077170192, + "grad_norm": 0.3074518646502162, + "learning_rate": 6.863820456227085e-05, + "loss": 2.7443, + "step": 28388 + }, + { + "epoch": 1.3217170659031123, + "grad_norm": 0.33771508072548845, + "learning_rate": 6.863569102745196e-05, + "loss": 2.7314, + "step": 28389 + }, + { + "epoch": 1.3217636240892054, + "grad_norm": 0.3493902480949508, + "learning_rate": 6.863317743793916e-05, + "loss": 2.7769, + "step": 28390 + }, + { + "epoch": 1.3218101822752986, + "grad_norm": 0.3531799163313661, + "learning_rate": 6.863066379373981e-05, + "loss": 2.7395, + "step": 28391 + }, + { + "epoch": 1.3218567404613917, + "grad_norm": 0.3714072250829914, + "learning_rate": 6.862815009486131e-05, + "loss": 2.8203, + "step": 28392 + }, + { + "epoch": 1.3219032986474848, + "grad_norm": 0.3281273038037336, + "learning_rate": 6.862563634131103e-05, + "loss": 2.7678, + "step": 28393 + }, + { + "epoch": 1.3219498568335777, + "grad_norm": 0.3848667427670665, + "learning_rate": 6.862312253309632e-05, + "loss": 2.8326, + "step": 28394 + }, + { + "epoch": 1.3219964150196708, + "grad_norm": 0.33192316105431435, + "learning_rate": 6.862060867022459e-05, + "loss": 2.9125, + "step": 28395 + }, + { + "epoch": 1.322042973205764, + "grad_norm": 0.3652778748836394, + "learning_rate": 6.86180947527032e-05, + "loss": 2.8664, + "step": 28396 + }, + { + "epoch": 1.322089531391857, + "grad_norm": 0.31067643390549043, + "learning_rate": 6.861558078053955e-05, + "loss": 2.6894, + "step": 28397 + }, + { + "epoch": 1.32213608957795, + "grad_norm": 0.33716362521336773, + "learning_rate": 6.861306675374099e-05, + "loss": 2.6945, + "step": 28398 + }, + { + "epoch": 1.322182647764043, + "grad_norm": 0.36263184924917746, + "learning_rate": 6.861055267231493e-05, + "loss": 2.8584, + "step": 28399 + }, + { + "epoch": 1.3222292059501362, + "grad_norm": 0.33233942551329204, + "learning_rate": 6.860803853626872e-05, + "loss": 2.7941, + "step": 28400 + }, + { + "epoch": 1.3222757641362293, + "grad_norm": 0.34384716891619366, + "learning_rate": 6.860552434560976e-05, + "loss": 2.7707, + "step": 28401 + }, + { + "epoch": 1.3223223223223224, + "grad_norm": 0.35059204686998, + "learning_rate": 6.860301010034541e-05, + "loss": 2.7832, + "step": 28402 + }, + { + "epoch": 1.3223688805084155, + "grad_norm": 0.3536339711167593, + "learning_rate": 6.860049580048305e-05, + "loss": 2.7891, + "step": 28403 + }, + { + "epoch": 1.3224154386945084, + "grad_norm": 0.37004463090063133, + "learning_rate": 6.859798144603009e-05, + "loss": 2.8071, + "step": 28404 + }, + { + "epoch": 1.3224619968806015, + "grad_norm": 0.3120649222051068, + "learning_rate": 6.859546703699388e-05, + "loss": 2.7022, + "step": 28405 + }, + { + "epoch": 1.3225085550666946, + "grad_norm": 0.36560652569932406, + "learning_rate": 6.859295257338181e-05, + "loss": 2.7875, + "step": 28406 + }, + { + "epoch": 1.3225551132527877, + "grad_norm": 0.3452648888197412, + "learning_rate": 6.859043805520124e-05, + "loss": 2.832, + "step": 28407 + }, + { + "epoch": 1.3226016714388806, + "grad_norm": 0.33647512671391827, + "learning_rate": 6.858792348245957e-05, + "loss": 2.6802, + "step": 28408 + }, + { + "epoch": 1.3226482296249737, + "grad_norm": 0.354795737527487, + "learning_rate": 6.858540885516417e-05, + "loss": 2.7341, + "step": 28409 + }, + { + "epoch": 1.3226947878110669, + "grad_norm": 0.33588528427458325, + "learning_rate": 6.858289417332244e-05, + "loss": 2.7807, + "step": 28410 + }, + { + "epoch": 1.32274134599716, + "grad_norm": 0.35345059313226834, + "learning_rate": 
6.858037943694173e-05, + "loss": 2.808, + "step": 28411 + }, + { + "epoch": 1.322787904183253, + "grad_norm": 0.3417135708188758, + "learning_rate": 6.857786464602946e-05, + "loss": 2.7599, + "step": 28412 + }, + { + "epoch": 1.3228344623693462, + "grad_norm": 0.358508497014442, + "learning_rate": 6.857534980059296e-05, + "loss": 2.7286, + "step": 28413 + }, + { + "epoch": 1.322881020555439, + "grad_norm": 0.34658793575615443, + "learning_rate": 6.857283490063966e-05, + "loss": 2.799, + "step": 28414 + }, + { + "epoch": 1.3229275787415322, + "grad_norm": 0.3493518208046425, + "learning_rate": 6.857031994617691e-05, + "loss": 2.8282, + "step": 28415 + }, + { + "epoch": 1.3229741369276253, + "grad_norm": 0.3236472759142407, + "learning_rate": 6.856780493721211e-05, + "loss": 2.7528, + "step": 28416 + }, + { + "epoch": 1.3230206951137184, + "grad_norm": 0.3718584640991136, + "learning_rate": 6.856528987375262e-05, + "loss": 2.9262, + "step": 28417 + }, + { + "epoch": 1.3230672532998113, + "grad_norm": 0.3448720083870354, + "learning_rate": 6.856277475580584e-05, + "loss": 2.6836, + "step": 28418 + }, + { + "epoch": 1.3231138114859045, + "grad_norm": 0.31901636587554355, + "learning_rate": 6.856025958337915e-05, + "loss": 2.6499, + "step": 28419 + }, + { + "epoch": 1.3231603696719976, + "grad_norm": 0.36514436579473053, + "learning_rate": 6.855774435647991e-05, + "loss": 2.7326, + "step": 28420 + }, + { + "epoch": 1.3232069278580907, + "grad_norm": 0.3488143782713352, + "learning_rate": 6.855522907511554e-05, + "loss": 2.7787, + "step": 28421 + }, + { + "epoch": 1.3232534860441838, + "grad_norm": 0.35556189931021515, + "learning_rate": 6.855271373929338e-05, + "loss": 2.7519, + "step": 28422 + }, + { + "epoch": 1.323300044230277, + "grad_norm": 0.34331022800680017, + "learning_rate": 6.855019834902083e-05, + "loss": 2.8335, + "step": 28423 + }, + { + "epoch": 1.3233466024163698, + "grad_norm": 0.363977754464519, + "learning_rate": 6.85476829043053e-05, + "loss": 2.7911, + "step": 28424 + }, + { + "epoch": 1.323393160602463, + "grad_norm": 0.31987643299544, + "learning_rate": 6.854516740515413e-05, + "loss": 2.8591, + "step": 28425 + }, + { + "epoch": 1.323439718788556, + "grad_norm": 0.3516552689115528, + "learning_rate": 6.854265185157472e-05, + "loss": 2.7292, + "step": 28426 + }, + { + "epoch": 1.323486276974649, + "grad_norm": 0.3222495387998436, + "learning_rate": 6.854013624357446e-05, + "loss": 2.7422, + "step": 28427 + }, + { + "epoch": 1.323532835160742, + "grad_norm": 0.34044017643621566, + "learning_rate": 6.853762058116073e-05, + "loss": 2.802, + "step": 28428 + }, + { + "epoch": 1.3235793933468352, + "grad_norm": 0.295438937416184, + "learning_rate": 6.85351048643409e-05, + "loss": 2.7475, + "step": 28429 + }, + { + "epoch": 1.3236259515329283, + "grad_norm": 0.3034633474614249, + "learning_rate": 6.853258909312237e-05, + "loss": 2.7039, + "step": 28430 + }, + { + "epoch": 1.3236725097190214, + "grad_norm": 0.3519538569288598, + "learning_rate": 6.853007326751252e-05, + "loss": 2.789, + "step": 28431 + }, + { + "epoch": 1.3237190679051145, + "grad_norm": 0.32074447396467287, + "learning_rate": 6.852755738751873e-05, + "loss": 2.6609, + "step": 28432 + }, + { + "epoch": 1.3237656260912074, + "grad_norm": 0.3226508747456858, + "learning_rate": 6.852504145314838e-05, + "loss": 2.6658, + "step": 28433 + }, + { + "epoch": 1.3238121842773005, + "grad_norm": 0.35080814872277466, + "learning_rate": 6.852252546440885e-05, + "loss": 2.7371, + "step": 28434 + }, + { + "epoch": 
1.3238587424633936, + "grad_norm": 0.3558641557557801, + "learning_rate": 6.852000942130754e-05, + "loss": 2.8186, + "step": 28435 + }, + { + "epoch": 1.3239053006494867, + "grad_norm": 0.3647806773179542, + "learning_rate": 6.851749332385182e-05, + "loss": 2.7451, + "step": 28436 + }, + { + "epoch": 1.3239518588355796, + "grad_norm": 0.35673308490895994, + "learning_rate": 6.851497717204909e-05, + "loss": 2.7437, + "step": 28437 + }, + { + "epoch": 1.3239984170216728, + "grad_norm": 0.3636317767548909, + "learning_rate": 6.851246096590673e-05, + "loss": 2.7251, + "step": 28438 + }, + { + "epoch": 1.3240449752077659, + "grad_norm": 0.3980059223719053, + "learning_rate": 6.850994470543212e-05, + "loss": 2.8974, + "step": 28439 + }, + { + "epoch": 1.324091533393859, + "grad_norm": 0.3467029323836526, + "learning_rate": 6.850742839063265e-05, + "loss": 2.8066, + "step": 28440 + }, + { + "epoch": 1.324138091579952, + "grad_norm": 0.3598948352948784, + "learning_rate": 6.850491202151569e-05, + "loss": 2.6863, + "step": 28441 + }, + { + "epoch": 1.3241846497660452, + "grad_norm": 0.32706934886995037, + "learning_rate": 6.850239559808863e-05, + "loss": 2.7539, + "step": 28442 + }, + { + "epoch": 1.3242312079521381, + "grad_norm": 0.3725967807944612, + "learning_rate": 6.849987912035888e-05, + "loss": 2.8137, + "step": 28443 + }, + { + "epoch": 1.3242777661382312, + "grad_norm": 0.3256582631083546, + "learning_rate": 6.849736258833379e-05, + "loss": 2.7921, + "step": 28444 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.37439022044197084, + "learning_rate": 6.849484600202077e-05, + "loss": 2.6738, + "step": 28445 + }, + { + "epoch": 1.3243708825104175, + "grad_norm": 0.328461968142155, + "learning_rate": 6.84923293614272e-05, + "loss": 2.8461, + "step": 28446 + }, + { + "epoch": 1.3244174406965104, + "grad_norm": 0.36184251434404213, + "learning_rate": 6.848981266656047e-05, + "loss": 2.74, + "step": 28447 + }, + { + "epoch": 1.3244639988826035, + "grad_norm": 0.3600840132735764, + "learning_rate": 6.848729591742796e-05, + "loss": 2.8077, + "step": 28448 + }, + { + "epoch": 1.3245105570686966, + "grad_norm": 0.34674382977726176, + "learning_rate": 6.848477911403706e-05, + "loss": 2.8736, + "step": 28449 + }, + { + "epoch": 1.3245571152547897, + "grad_norm": 0.35818154829082677, + "learning_rate": 6.848226225639513e-05, + "loss": 2.7707, + "step": 28450 + }, + { + "epoch": 1.3246036734408828, + "grad_norm": 0.3704993840356647, + "learning_rate": 6.847974534450962e-05, + "loss": 2.7286, + "step": 28451 + }, + { + "epoch": 1.324650231626976, + "grad_norm": 0.3462002817984229, + "learning_rate": 6.847722837838784e-05, + "loss": 2.7986, + "step": 28452 + }, + { + "epoch": 1.3246967898130688, + "grad_norm": 0.3543482468368903, + "learning_rate": 6.847471135803724e-05, + "loss": 2.8001, + "step": 28453 + }, + { + "epoch": 1.324743347999162, + "grad_norm": 0.3304753437781163, + "learning_rate": 6.847219428346519e-05, + "loss": 2.849, + "step": 28454 + }, + { + "epoch": 1.324789906185255, + "grad_norm": 0.3654522366949799, + "learning_rate": 6.846967715467905e-05, + "loss": 2.9267, + "step": 28455 + }, + { + "epoch": 1.3248364643713482, + "grad_norm": 0.32732690774709994, + "learning_rate": 6.846715997168624e-05, + "loss": 2.7043, + "step": 28456 + }, + { + "epoch": 1.324883022557441, + "grad_norm": 0.32374043390967433, + "learning_rate": 6.846464273449411e-05, + "loss": 2.7162, + "step": 28457 + }, + { + "epoch": 1.3249295807435342, + "grad_norm": 0.37519128147085556, + "learning_rate": 
6.846212544311009e-05, + "loss": 2.7292, + "step": 28458 + }, + { + "epoch": 1.3249761389296273, + "grad_norm": 0.30308325793797114, + "learning_rate": 6.845960809754155e-05, + "loss": 2.743, + "step": 28459 + }, + { + "epoch": 1.3250226971157204, + "grad_norm": 0.3323013675459274, + "learning_rate": 6.845709069779588e-05, + "loss": 2.5901, + "step": 28460 + }, + { + "epoch": 1.3250692553018135, + "grad_norm": 0.34346265951239646, + "learning_rate": 6.845457324388047e-05, + "loss": 2.8261, + "step": 28461 + }, + { + "epoch": 1.3251158134879066, + "grad_norm": 0.3346098336079866, + "learning_rate": 6.845205573580269e-05, + "loss": 2.7629, + "step": 28462 + }, + { + "epoch": 1.3251623716739995, + "grad_norm": 0.3263597286619224, + "learning_rate": 6.844953817356997e-05, + "loss": 2.67, + "step": 28463 + }, + { + "epoch": 1.3252089298600926, + "grad_norm": 0.33303068373403455, + "learning_rate": 6.844702055718965e-05, + "loss": 2.8144, + "step": 28464 + }, + { + "epoch": 1.3252554880461858, + "grad_norm": 0.3084526150692894, + "learning_rate": 6.844450288666915e-05, + "loss": 2.7402, + "step": 28465 + }, + { + "epoch": 1.3253020462322787, + "grad_norm": 0.35864975590202364, + "learning_rate": 6.844198516201585e-05, + "loss": 2.8003, + "step": 28466 + }, + { + "epoch": 1.3253486044183718, + "grad_norm": 0.33192844754984013, + "learning_rate": 6.843946738323715e-05, + "loss": 2.7921, + "step": 28467 + }, + { + "epoch": 1.3253951626044649, + "grad_norm": 0.3213833726685042, + "learning_rate": 6.843694955034041e-05, + "loss": 2.6935, + "step": 28468 + }, + { + "epoch": 1.325441720790558, + "grad_norm": 0.3497287814001843, + "learning_rate": 6.843443166333306e-05, + "loss": 2.8485, + "step": 28469 + }, + { + "epoch": 1.3254882789766511, + "grad_norm": 0.3487202250995114, + "learning_rate": 6.843191372222246e-05, + "loss": 2.7475, + "step": 28470 + }, + { + "epoch": 1.3255348371627442, + "grad_norm": 0.32743733884185444, + "learning_rate": 6.842939572701601e-05, + "loss": 2.827, + "step": 28471 + }, + { + "epoch": 1.3255813953488373, + "grad_norm": 0.3907951522813447, + "learning_rate": 6.842687767772111e-05, + "loss": 2.7741, + "step": 28472 + }, + { + "epoch": 1.3256279535349302, + "grad_norm": 0.32560131035070367, + "learning_rate": 6.84243595743451e-05, + "loss": 2.74, + "step": 28473 + }, + { + "epoch": 1.3256745117210234, + "grad_norm": 0.3563005821833698, + "learning_rate": 6.842184141689546e-05, + "loss": 2.758, + "step": 28474 + }, + { + "epoch": 1.3257210699071165, + "grad_norm": 0.328743045454977, + "learning_rate": 6.84193232053795e-05, + "loss": 2.7017, + "step": 28475 + }, + { + "epoch": 1.3257676280932094, + "grad_norm": 0.3263979850926159, + "learning_rate": 6.841680493980465e-05, + "loss": 2.7393, + "step": 28476 + }, + { + "epoch": 1.3258141862793025, + "grad_norm": 0.3591784023991016, + "learning_rate": 6.841428662017829e-05, + "loss": 2.6557, + "step": 28477 + }, + { + "epoch": 1.3258607444653956, + "grad_norm": 0.3274035939092708, + "learning_rate": 6.841176824650781e-05, + "loss": 2.7772, + "step": 28478 + }, + { + "epoch": 1.3259073026514887, + "grad_norm": 0.36658264886461134, + "learning_rate": 6.840924981880062e-05, + "loss": 2.7867, + "step": 28479 + }, + { + "epoch": 1.3259538608375818, + "grad_norm": 0.3441323833636888, + "learning_rate": 6.840673133706408e-05, + "loss": 2.783, + "step": 28480 + }, + { + "epoch": 1.326000419023675, + "grad_norm": 0.31836364335409933, + "learning_rate": 6.840421280130561e-05, + "loss": 2.8209, + "step": 28481 + }, + { + "epoch": 
1.3260469772097678, + "grad_norm": 0.32754588018489844, + "learning_rate": 6.840169421153258e-05, + "loss": 2.6958, + "step": 28482 + }, + { + "epoch": 1.326093535395861, + "grad_norm": 0.31895684865806034, + "learning_rate": 6.839917556775239e-05, + "loss": 2.7816, + "step": 28483 + }, + { + "epoch": 1.326140093581954, + "grad_norm": 0.3388153341957314, + "learning_rate": 6.839665686997242e-05, + "loss": 2.7678, + "step": 28484 + }, + { + "epoch": 1.3261866517680472, + "grad_norm": 0.3367264485684956, + "learning_rate": 6.839413811820007e-05, + "loss": 2.7631, + "step": 28485 + }, + { + "epoch": 1.32623320995414, + "grad_norm": 0.32812226489659363, + "learning_rate": 6.839161931244276e-05, + "loss": 2.7707, + "step": 28486 + }, + { + "epoch": 1.3262797681402332, + "grad_norm": 0.3283358603716718, + "learning_rate": 6.838910045270786e-05, + "loss": 2.852, + "step": 28487 + }, + { + "epoch": 1.3263263263263263, + "grad_norm": 0.3025661936129245, + "learning_rate": 6.838658153900276e-05, + "loss": 2.6775, + "step": 28488 + }, + { + "epoch": 1.3263728845124194, + "grad_norm": 0.32832586420694815, + "learning_rate": 6.838406257133483e-05, + "loss": 2.7515, + "step": 28489 + }, + { + "epoch": 1.3264194426985125, + "grad_norm": 0.33238207107388285, + "learning_rate": 6.838154354971151e-05, + "loss": 2.7454, + "step": 28490 + }, + { + "epoch": 1.3264660008846056, + "grad_norm": 0.31273118879122996, + "learning_rate": 6.837902447414018e-05, + "loss": 2.7395, + "step": 28491 + }, + { + "epoch": 1.3265125590706985, + "grad_norm": 0.33429682094665375, + "learning_rate": 6.83765053446282e-05, + "loss": 2.8363, + "step": 28492 + }, + { + "epoch": 1.3265591172567917, + "grad_norm": 0.3298521612563568, + "learning_rate": 6.837398616118302e-05, + "loss": 2.6348, + "step": 28493 + }, + { + "epoch": 1.3266056754428848, + "grad_norm": 0.3461230184241504, + "learning_rate": 6.837146692381197e-05, + "loss": 2.7695, + "step": 28494 + }, + { + "epoch": 1.3266522336289779, + "grad_norm": 0.3444232832731512, + "learning_rate": 6.836894763252247e-05, + "loss": 2.6863, + "step": 28495 + }, + { + "epoch": 1.3266987918150708, + "grad_norm": 0.32246610494572897, + "learning_rate": 6.836642828732194e-05, + "loss": 2.8753, + "step": 28496 + }, + { + "epoch": 1.326745350001164, + "grad_norm": 0.35149715050667424, + "learning_rate": 6.836390888821775e-05, + "loss": 2.747, + "step": 28497 + }, + { + "epoch": 1.326791908187257, + "grad_norm": 0.3396107301566443, + "learning_rate": 6.836138943521729e-05, + "loss": 2.7287, + "step": 28498 + }, + { + "epoch": 1.3268384663733501, + "grad_norm": 0.33541959417206385, + "learning_rate": 6.835886992832795e-05, + "loss": 2.6959, + "step": 28499 + }, + { + "epoch": 1.3268850245594432, + "grad_norm": 0.3307608405705824, + "learning_rate": 6.835635036755715e-05, + "loss": 2.8023, + "step": 28500 + }, + { + "epoch": 1.3269315827455364, + "grad_norm": 0.3447650554018944, + "learning_rate": 6.835383075291226e-05, + "loss": 2.8139, + "step": 28501 + }, + { + "epoch": 1.3269781409316292, + "grad_norm": 0.3187797402294976, + "learning_rate": 6.83513110844007e-05, + "loss": 2.6567, + "step": 28502 + }, + { + "epoch": 1.3270246991177224, + "grad_norm": 0.3196046849365304, + "learning_rate": 6.834879136202982e-05, + "loss": 2.7814, + "step": 28503 + }, + { + "epoch": 1.3270712573038155, + "grad_norm": 0.343814866108031, + "learning_rate": 6.834627158580708e-05, + "loss": 2.831, + "step": 28504 + }, + { + "epoch": 1.3271178154899086, + "grad_norm": 0.32397404254068024, + "learning_rate": 
6.834375175573983e-05, + "loss": 2.7533, + "step": 28505 + }, + { + "epoch": 1.3271643736760015, + "grad_norm": 0.305976447100953, + "learning_rate": 6.834123187183548e-05, + "loss": 2.7441, + "step": 28506 + }, + { + "epoch": 1.3272109318620946, + "grad_norm": 0.32306286976435017, + "learning_rate": 6.83387119341014e-05, + "loss": 2.7732, + "step": 28507 + }, + { + "epoch": 1.3272574900481877, + "grad_norm": 0.3015343398035092, + "learning_rate": 6.833619194254502e-05, + "loss": 2.6506, + "step": 28508 + }, + { + "epoch": 1.3273040482342808, + "grad_norm": 0.3012182978742293, + "learning_rate": 6.833367189717372e-05, + "loss": 2.7895, + "step": 28509 + }, + { + "epoch": 1.327350606420374, + "grad_norm": 0.33677828678288296, + "learning_rate": 6.833115179799492e-05, + "loss": 2.7073, + "step": 28510 + }, + { + "epoch": 1.327397164606467, + "grad_norm": 0.33114259816360475, + "learning_rate": 6.832863164501597e-05, + "loss": 2.8221, + "step": 28511 + }, + { + "epoch": 1.32744372279256, + "grad_norm": 0.34460918715960825, + "learning_rate": 6.83261114382443e-05, + "loss": 2.86, + "step": 28512 + }, + { + "epoch": 1.327490280978653, + "grad_norm": 0.3344533479279107, + "learning_rate": 6.832359117768729e-05, + "loss": 2.8421, + "step": 28513 + }, + { + "epoch": 1.3275368391647462, + "grad_norm": 0.3377231989326693, + "learning_rate": 6.832107086335236e-05, + "loss": 2.7274, + "step": 28514 + }, + { + "epoch": 1.327583397350839, + "grad_norm": 0.31588464176784714, + "learning_rate": 6.831855049524689e-05, + "loss": 2.8676, + "step": 28515 + }, + { + "epoch": 1.3276299555369322, + "grad_norm": 0.3504988492412768, + "learning_rate": 6.831603007337826e-05, + "loss": 2.7387, + "step": 28516 + }, + { + "epoch": 1.3276765137230253, + "grad_norm": 0.3132504743178347, + "learning_rate": 6.831350959775391e-05, + "loss": 2.7325, + "step": 28517 + }, + { + "epoch": 1.3277230719091184, + "grad_norm": 0.32430039933334953, + "learning_rate": 6.831098906838121e-05, + "loss": 2.7587, + "step": 28518 + }, + { + "epoch": 1.3277696300952115, + "grad_norm": 0.31962473920235607, + "learning_rate": 6.830846848526756e-05, + "loss": 2.6456, + "step": 28519 + }, + { + "epoch": 1.3278161882813047, + "grad_norm": 0.30713051661420204, + "learning_rate": 6.830594784842035e-05, + "loss": 2.7142, + "step": 28520 + }, + { + "epoch": 1.3278627464673975, + "grad_norm": 0.3217858565962609, + "learning_rate": 6.830342715784699e-05, + "loss": 2.7616, + "step": 28521 + }, + { + "epoch": 1.3279093046534907, + "grad_norm": 0.32489627863818704, + "learning_rate": 6.830090641355487e-05, + "loss": 2.7863, + "step": 28522 + }, + { + "epoch": 1.3279558628395838, + "grad_norm": 0.31369184802038313, + "learning_rate": 6.82983856155514e-05, + "loss": 2.7613, + "step": 28523 + }, + { + "epoch": 1.328002421025677, + "grad_norm": 0.34683667134706014, + "learning_rate": 6.829586476384397e-05, + "loss": 2.7749, + "step": 28524 + }, + { + "epoch": 1.3280489792117698, + "grad_norm": 0.3232869760311581, + "learning_rate": 6.829334385843998e-05, + "loss": 2.7784, + "step": 28525 + }, + { + "epoch": 1.328095537397863, + "grad_norm": 0.32199967793767437, + "learning_rate": 6.829082289934682e-05, + "loss": 2.7902, + "step": 28526 + }, + { + "epoch": 1.328142095583956, + "grad_norm": 0.33149136335188767, + "learning_rate": 6.828830188657189e-05, + "loss": 2.6613, + "step": 28527 + }, + { + "epoch": 1.3281886537700491, + "grad_norm": 0.31627476015058814, + "learning_rate": 6.828578082012263e-05, + "loss": 2.7248, + "step": 28528 + }, + { + "epoch": 
1.3282352119561422, + "grad_norm": 0.33321002222986285, + "learning_rate": 6.828325970000637e-05, + "loss": 2.8878, + "step": 28529 + }, + { + "epoch": 1.3282817701422354, + "grad_norm": 0.31963167463646297, + "learning_rate": 6.828073852623056e-05, + "loss": 2.6604, + "step": 28530 + }, + { + "epoch": 1.3283283283283283, + "grad_norm": 0.35407995968927497, + "learning_rate": 6.827821729880258e-05, + "loss": 2.7683, + "step": 28531 + }, + { + "epoch": 1.3283748865144214, + "grad_norm": 0.3188447852044419, + "learning_rate": 6.827569601772983e-05, + "loss": 2.819, + "step": 28532 + }, + { + "epoch": 1.3284214447005145, + "grad_norm": 0.339040428126224, + "learning_rate": 6.827317468301972e-05, + "loss": 2.7843, + "step": 28533 + }, + { + "epoch": 1.3284680028866076, + "grad_norm": 0.3064885593349414, + "learning_rate": 6.827065329467963e-05, + "loss": 2.7692, + "step": 28534 + }, + { + "epoch": 1.3285145610727005, + "grad_norm": 0.3368008295887369, + "learning_rate": 6.826813185271698e-05, + "loss": 2.816, + "step": 28535 + }, + { + "epoch": 1.3285611192587936, + "grad_norm": 0.309209286602897, + "learning_rate": 6.826561035713916e-05, + "loss": 2.7352, + "step": 28536 + }, + { + "epoch": 1.3286076774448867, + "grad_norm": 0.31565915987281934, + "learning_rate": 6.826308880795359e-05, + "loss": 2.7373, + "step": 28537 + }, + { + "epoch": 1.3286542356309798, + "grad_norm": 0.3439400998196809, + "learning_rate": 6.826056720516764e-05, + "loss": 2.7672, + "step": 28538 + }, + { + "epoch": 1.328700793817073, + "grad_norm": 0.3244780909458827, + "learning_rate": 6.825804554878872e-05, + "loss": 2.7817, + "step": 28539 + }, + { + "epoch": 1.328747352003166, + "grad_norm": 0.32638529289577967, + "learning_rate": 6.825552383882424e-05, + "loss": 2.7328, + "step": 28540 + }, + { + "epoch": 1.328793910189259, + "grad_norm": 0.35456834683835364, + "learning_rate": 6.825300207528161e-05, + "loss": 2.7105, + "step": 28541 + }, + { + "epoch": 1.328840468375352, + "grad_norm": 0.32316695938191553, + "learning_rate": 6.825048025816819e-05, + "loss": 2.7402, + "step": 28542 + }, + { + "epoch": 1.3288870265614452, + "grad_norm": 0.3338465981346048, + "learning_rate": 6.824795838749142e-05, + "loss": 2.7759, + "step": 28543 + }, + { + "epoch": 1.3289335847475383, + "grad_norm": 0.31683133571141464, + "learning_rate": 6.824543646325869e-05, + "loss": 2.6831, + "step": 28544 + }, + { + "epoch": 1.3289801429336312, + "grad_norm": 0.3488490521978195, + "learning_rate": 6.82429144854774e-05, + "loss": 2.7647, + "step": 28545 + }, + { + "epoch": 1.3290267011197243, + "grad_norm": 0.3251395961507714, + "learning_rate": 6.824039245415497e-05, + "loss": 2.7174, + "step": 28546 + }, + { + "epoch": 1.3290732593058174, + "grad_norm": 0.38599672581287037, + "learning_rate": 6.823787036929876e-05, + "loss": 2.7374, + "step": 28547 + }, + { + "epoch": 1.3291198174919105, + "grad_norm": 0.3186446911291327, + "learning_rate": 6.82353482309162e-05, + "loss": 2.6954, + "step": 28548 + }, + { + "epoch": 1.3291663756780037, + "grad_norm": 0.37429607384478486, + "learning_rate": 6.82328260390147e-05, + "loss": 2.6879, + "step": 28549 + }, + { + "epoch": 1.3292129338640968, + "grad_norm": 0.34356998720178117, + "learning_rate": 6.823030379360166e-05, + "loss": 2.7029, + "step": 28550 + }, + { + "epoch": 1.3292594920501897, + "grad_norm": 0.3455905230032403, + "learning_rate": 6.822778149468445e-05, + "loss": 2.8366, + "step": 28551 + }, + { + "epoch": 1.3293060502362828, + "grad_norm": 0.3381706574945121, + "learning_rate": 
6.822525914227051e-05, + "loss": 2.7003, + "step": 28552 + }, + { + "epoch": 1.329352608422376, + "grad_norm": 0.34227018663862313, + "learning_rate": 6.822273673636724e-05, + "loss": 2.7386, + "step": 28553 + }, + { + "epoch": 1.3293991666084688, + "grad_norm": 0.3479765085429277, + "learning_rate": 6.8220214276982e-05, + "loss": 2.8607, + "step": 28554 + }, + { + "epoch": 1.329445724794562, + "grad_norm": 0.364886772116193, + "learning_rate": 6.821769176412228e-05, + "loss": 2.732, + "step": 28555 + }, + { + "epoch": 1.329492282980655, + "grad_norm": 0.32763632369466833, + "learning_rate": 6.821516919779538e-05, + "loss": 2.7976, + "step": 28556 + }, + { + "epoch": 1.3295388411667481, + "grad_norm": 0.3475867287315517, + "learning_rate": 6.821264657800879e-05, + "loss": 2.7662, + "step": 28557 + }, + { + "epoch": 1.3295853993528413, + "grad_norm": 0.3551459319515715, + "learning_rate": 6.821012390476984e-05, + "loss": 2.7714, + "step": 28558 + }, + { + "epoch": 1.3296319575389344, + "grad_norm": 0.3172298399825129, + "learning_rate": 6.8207601178086e-05, + "loss": 2.6285, + "step": 28559 + }, + { + "epoch": 1.3296785157250275, + "grad_norm": 0.3487840200893592, + "learning_rate": 6.820507839796462e-05, + "loss": 2.769, + "step": 28560 + }, + { + "epoch": 1.3297250739111204, + "grad_norm": 0.34359443197694717, + "learning_rate": 6.820255556441315e-05, + "loss": 2.7843, + "step": 28561 + }, + { + "epoch": 1.3297716320972135, + "grad_norm": 0.3464071846966845, + "learning_rate": 6.820003267743897e-05, + "loss": 2.7935, + "step": 28562 + }, + { + "epoch": 1.3298181902833066, + "grad_norm": 0.3341871093369256, + "learning_rate": 6.819750973704947e-05, + "loss": 2.8265, + "step": 28563 + }, + { + "epoch": 1.3298647484693995, + "grad_norm": 0.3169852313181013, + "learning_rate": 6.81949867432521e-05, + "loss": 2.7695, + "step": 28564 + }, + { + "epoch": 1.3299113066554926, + "grad_norm": 0.32120727208707794, + "learning_rate": 6.819246369605421e-05, + "loss": 2.8514, + "step": 28565 + }, + { + "epoch": 1.3299578648415857, + "grad_norm": 0.3165364216595678, + "learning_rate": 6.818994059546323e-05, + "loss": 2.7649, + "step": 28566 + }, + { + "epoch": 1.3300044230276789, + "grad_norm": 0.319065944734753, + "learning_rate": 6.818741744148657e-05, + "loss": 2.7737, + "step": 28567 + }, + { + "epoch": 1.330050981213772, + "grad_norm": 0.3291198946585849, + "learning_rate": 6.818489423413165e-05, + "loss": 2.7489, + "step": 28568 + }, + { + "epoch": 1.330097539399865, + "grad_norm": 0.3241346755472206, + "learning_rate": 6.818237097340584e-05, + "loss": 2.71, + "step": 28569 + }, + { + "epoch": 1.330144097585958, + "grad_norm": 0.36337721988173594, + "learning_rate": 6.817984765931657e-05, + "loss": 2.7418, + "step": 28570 + }, + { + "epoch": 1.330190655772051, + "grad_norm": 0.3149948320333038, + "learning_rate": 6.817732429187124e-05, + "loss": 2.7125, + "step": 28571 + }, + { + "epoch": 1.3302372139581442, + "grad_norm": 0.35353596857386016, + "learning_rate": 6.817480087107725e-05, + "loss": 2.8208, + "step": 28572 + }, + { + "epoch": 1.3302837721442373, + "grad_norm": 0.33980907888238765, + "learning_rate": 6.8172277396942e-05, + "loss": 2.8088, + "step": 28573 + }, + { + "epoch": 1.3303303303303302, + "grad_norm": 0.35841215926248227, + "learning_rate": 6.816975386947293e-05, + "loss": 2.8384, + "step": 28574 + }, + { + "epoch": 1.3303768885164233, + "grad_norm": 0.37632317235855856, + "learning_rate": 6.81672302886774e-05, + "loss": 2.7639, + "step": 28575 + }, + { + "epoch": 
1.3304234467025164, + "grad_norm": 0.34196815698249144, + "learning_rate": 6.816470665456285e-05, + "loss": 2.78, + "step": 28576 + }, + { + "epoch": 1.3304700048886096, + "grad_norm": 0.37973974783561637, + "learning_rate": 6.816218296713668e-05, + "loss": 2.8108, + "step": 28577 + }, + { + "epoch": 1.3305165630747027, + "grad_norm": 0.32467705925081036, + "learning_rate": 6.815965922640627e-05, + "loss": 2.7694, + "step": 28578 + }, + { + "epoch": 1.3305631212607958, + "grad_norm": 0.38229379022316645, + "learning_rate": 6.815713543237908e-05, + "loss": 2.769, + "step": 28579 + }, + { + "epoch": 1.3306096794468887, + "grad_norm": 0.39107087717899375, + "learning_rate": 6.815461158506246e-05, + "loss": 2.8412, + "step": 28580 + }, + { + "epoch": 1.3306562376329818, + "grad_norm": 0.3725385761224217, + "learning_rate": 6.815208768446386e-05, + "loss": 2.7362, + "step": 28581 + }, + { + "epoch": 1.330702795819075, + "grad_norm": 0.35298856277065455, + "learning_rate": 6.814956373059066e-05, + "loss": 2.8428, + "step": 28582 + }, + { + "epoch": 1.330749354005168, + "grad_norm": 0.3511513429428128, + "learning_rate": 6.814703972345029e-05, + "loss": 2.7352, + "step": 28583 + }, + { + "epoch": 1.330795912191261, + "grad_norm": 0.34526361363150765, + "learning_rate": 6.814451566305013e-05, + "loss": 2.8358, + "step": 28584 + }, + { + "epoch": 1.330842470377354, + "grad_norm": 0.3335967464026729, + "learning_rate": 6.814199154939763e-05, + "loss": 2.8011, + "step": 28585 + }, + { + "epoch": 1.3308890285634472, + "grad_norm": 0.3578734469974029, + "learning_rate": 6.813946738250014e-05, + "loss": 2.6828, + "step": 28586 + }, + { + "epoch": 1.3309355867495403, + "grad_norm": 0.36064204192204635, + "learning_rate": 6.813694316236512e-05, + "loss": 2.7365, + "step": 28587 + }, + { + "epoch": 1.3309821449356334, + "grad_norm": 0.354950861766155, + "learning_rate": 6.813441888899995e-05, + "loss": 2.8005, + "step": 28588 + }, + { + "epoch": 1.3310287031217265, + "grad_norm": 0.33794522119698633, + "learning_rate": 6.813189456241205e-05, + "loss": 2.8358, + "step": 28589 + }, + { + "epoch": 1.3310752613078194, + "grad_norm": 0.37078469865858843, + "learning_rate": 6.812937018260882e-05, + "loss": 2.7946, + "step": 28590 + }, + { + "epoch": 1.3311218194939125, + "grad_norm": 0.3528610138322274, + "learning_rate": 6.812684574959769e-05, + "loss": 2.7862, + "step": 28591 + }, + { + "epoch": 1.3311683776800056, + "grad_norm": 0.3766153968938417, + "learning_rate": 6.812432126338605e-05, + "loss": 2.7443, + "step": 28592 + }, + { + "epoch": 1.3312149358660987, + "grad_norm": 0.34204184709200974, + "learning_rate": 6.81217967239813e-05, + "loss": 2.6988, + "step": 28593 + }, + { + "epoch": 1.3312614940521916, + "grad_norm": 0.3862074501656839, + "learning_rate": 6.811927213139087e-05, + "loss": 2.7294, + "step": 28594 + }, + { + "epoch": 1.3313080522382847, + "grad_norm": 0.3429231127380416, + "learning_rate": 6.811674748562217e-05, + "loss": 2.7228, + "step": 28595 + }, + { + "epoch": 1.3313546104243779, + "grad_norm": 0.33983728456213264, + "learning_rate": 6.81142227866826e-05, + "loss": 2.6621, + "step": 28596 + }, + { + "epoch": 1.331401168610471, + "grad_norm": 0.3223639469857753, + "learning_rate": 6.811169803457955e-05, + "loss": 2.7495, + "step": 28597 + }, + { + "epoch": 1.331447726796564, + "grad_norm": 0.36842021366808847, + "learning_rate": 6.810917322932046e-05, + "loss": 2.7076, + "step": 28598 + }, + { + "epoch": 1.3314942849826572, + "grad_norm": 0.3297759465934442, + "learning_rate": 
6.810664837091275e-05, + "loss": 2.8379, + "step": 28599 + }, + { + "epoch": 1.33154084316875, + "grad_norm": 0.3445903233236275, + "learning_rate": 6.810412345936378e-05, + "loss": 2.7608, + "step": 28600 + }, + { + "epoch": 1.3315874013548432, + "grad_norm": 0.33083706887655906, + "learning_rate": 6.810159849468101e-05, + "loss": 2.7794, + "step": 28601 + }, + { + "epoch": 1.3316339595409363, + "grad_norm": 0.34105185881298455, + "learning_rate": 6.809907347687182e-05, + "loss": 2.762, + "step": 28602 + }, + { + "epoch": 1.3316805177270292, + "grad_norm": 0.35204161469402606, + "learning_rate": 6.809654840594364e-05, + "loss": 2.801, + "step": 28603 + }, + { + "epoch": 1.3317270759131223, + "grad_norm": 0.35159364989373654, + "learning_rate": 6.809402328190389e-05, + "loss": 2.7279, + "step": 28604 + }, + { + "epoch": 1.3317736340992155, + "grad_norm": 0.3391564015237691, + "learning_rate": 6.809149810475995e-05, + "loss": 2.8289, + "step": 28605 + }, + { + "epoch": 1.3318201922853086, + "grad_norm": 0.3325962739956947, + "learning_rate": 6.808897287451923e-05, + "loss": 2.7799, + "step": 28606 + }, + { + "epoch": 1.3318667504714017, + "grad_norm": 0.35303107663237066, + "learning_rate": 6.808644759118917e-05, + "loss": 2.7634, + "step": 28607 + }, + { + "epoch": 1.3319133086574948, + "grad_norm": 0.3183763521070469, + "learning_rate": 6.808392225477716e-05, + "loss": 2.7702, + "step": 28608 + }, + { + "epoch": 1.3319598668435877, + "grad_norm": 0.3464389769332486, + "learning_rate": 6.808139686529064e-05, + "loss": 2.7437, + "step": 28609 + }, + { + "epoch": 1.3320064250296808, + "grad_norm": 0.3121106224126508, + "learning_rate": 6.807887142273698e-05, + "loss": 2.7938, + "step": 28610 + }, + { + "epoch": 1.332052983215774, + "grad_norm": 0.37059403119965717, + "learning_rate": 6.807634592712363e-05, + "loss": 2.9123, + "step": 28611 + }, + { + "epoch": 1.332099541401867, + "grad_norm": 0.31926322492422127, + "learning_rate": 6.807382037845798e-05, + "loss": 2.8458, + "step": 28612 + }, + { + "epoch": 1.33214609958796, + "grad_norm": 0.33708134117030897, + "learning_rate": 6.807129477674745e-05, + "loss": 2.8023, + "step": 28613 + }, + { + "epoch": 1.332192657774053, + "grad_norm": 0.33996202412143045, + "learning_rate": 6.806876912199945e-05, + "loss": 2.7901, + "step": 28614 + }, + { + "epoch": 1.3322392159601462, + "grad_norm": 0.31661825169701396, + "learning_rate": 6.806624341422138e-05, + "loss": 2.7569, + "step": 28615 + }, + { + "epoch": 1.3322857741462393, + "grad_norm": 0.3261671424259673, + "learning_rate": 6.806371765342066e-05, + "loss": 2.7071, + "step": 28616 + }, + { + "epoch": 1.3323323323323324, + "grad_norm": 0.34176447566579193, + "learning_rate": 6.806119183960473e-05, + "loss": 2.9049, + "step": 28617 + }, + { + "epoch": 1.3323788905184255, + "grad_norm": 0.3434449813992461, + "learning_rate": 6.805866597278098e-05, + "loss": 2.6636, + "step": 28618 + }, + { + "epoch": 1.3324254487045184, + "grad_norm": 0.3465162942207503, + "learning_rate": 6.805614005295681e-05, + "loss": 2.7692, + "step": 28619 + }, + { + "epoch": 1.3324720068906115, + "grad_norm": 0.3677125787472005, + "learning_rate": 6.805361408013966e-05, + "loss": 2.8108, + "step": 28620 + }, + { + "epoch": 1.3325185650767046, + "grad_norm": 0.38202158419993637, + "learning_rate": 6.805108805433694e-05, + "loss": 2.8156, + "step": 28621 + }, + { + "epoch": 1.3325651232627977, + "grad_norm": 0.3403512765472395, + "learning_rate": 6.804856197555604e-05, + "loss": 2.6646, + "step": 28622 + }, + { + "epoch": 
1.3326116814488906, + "grad_norm": 0.35986118906850195, + "learning_rate": 6.804603584380438e-05, + "loss": 2.804, + "step": 28623 + }, + { + "epoch": 1.3326582396349838, + "grad_norm": 0.35876132950211487, + "learning_rate": 6.804350965908939e-05, + "loss": 2.7627, + "step": 28624 + }, + { + "epoch": 1.3327047978210769, + "grad_norm": 0.3737406923987933, + "learning_rate": 6.804098342141848e-05, + "loss": 2.7445, + "step": 28625 + }, + { + "epoch": 1.33275135600717, + "grad_norm": 0.3462586298110136, + "learning_rate": 6.803845713079906e-05, + "loss": 2.7907, + "step": 28626 + }, + { + "epoch": 1.332797914193263, + "grad_norm": 0.3594137156490068, + "learning_rate": 6.803593078723855e-05, + "loss": 2.8529, + "step": 28627 + }, + { + "epoch": 1.3328444723793562, + "grad_norm": 0.3705382896863766, + "learning_rate": 6.803340439074436e-05, + "loss": 2.7433, + "step": 28628 + }, + { + "epoch": 1.332891030565449, + "grad_norm": 0.34072749419466597, + "learning_rate": 6.80308779413239e-05, + "loss": 2.7003, + "step": 28629 + }, + { + "epoch": 1.3329375887515422, + "grad_norm": 0.3280129851996252, + "learning_rate": 6.802835143898458e-05, + "loss": 2.8938, + "step": 28630 + }, + { + "epoch": 1.3329841469376353, + "grad_norm": 0.3220332654130371, + "learning_rate": 6.802582488373384e-05, + "loss": 2.7149, + "step": 28631 + }, + { + "epoch": 1.3330307051237285, + "grad_norm": 0.36935977824838806, + "learning_rate": 6.802329827557908e-05, + "loss": 2.6612, + "step": 28632 + }, + { + "epoch": 1.3330772633098213, + "grad_norm": 0.3492103641452705, + "learning_rate": 6.802077161452771e-05, + "loss": 2.8453, + "step": 28633 + }, + { + "epoch": 1.3331238214959145, + "grad_norm": 0.35891677209889306, + "learning_rate": 6.801824490058715e-05, + "loss": 2.7202, + "step": 28634 + }, + { + "epoch": 1.3331703796820076, + "grad_norm": 0.3428761194681837, + "learning_rate": 6.801571813376482e-05, + "loss": 2.8433, + "step": 28635 + }, + { + "epoch": 1.3332169378681007, + "grad_norm": 0.36418737953304703, + "learning_rate": 6.801319131406813e-05, + "loss": 2.8125, + "step": 28636 + }, + { + "epoch": 1.3332634960541938, + "grad_norm": 0.3506215453662156, + "learning_rate": 6.80106644415045e-05, + "loss": 2.8007, + "step": 28637 + }, + { + "epoch": 1.333310054240287, + "grad_norm": 0.33176468154644767, + "learning_rate": 6.800813751608133e-05, + "loss": 2.7546, + "step": 28638 + }, + { + "epoch": 1.3333566124263798, + "grad_norm": 0.34164070092636856, + "learning_rate": 6.800561053780606e-05, + "loss": 2.8101, + "step": 28639 + }, + { + "epoch": 1.333403170612473, + "grad_norm": 0.3611173559302516, + "learning_rate": 6.800308350668611e-05, + "loss": 2.8211, + "step": 28640 + }, + { + "epoch": 1.333449728798566, + "grad_norm": 0.3767646853073636, + "learning_rate": 6.800055642272887e-05, + "loss": 2.8049, + "step": 28641 + }, + { + "epoch": 1.333496286984659, + "grad_norm": 0.3251314846996156, + "learning_rate": 6.799802928594176e-05, + "loss": 2.8564, + "step": 28642 + }, + { + "epoch": 1.333542845170752, + "grad_norm": 0.3612138650861071, + "learning_rate": 6.799550209633223e-05, + "loss": 2.7136, + "step": 28643 + }, + { + "epoch": 1.3335894033568452, + "grad_norm": 0.34063985194593777, + "learning_rate": 6.799297485390765e-05, + "loss": 2.7608, + "step": 28644 + }, + { + "epoch": 1.3336359615429383, + "grad_norm": 0.32780024833968474, + "learning_rate": 6.799044755867548e-05, + "loss": 2.724, + "step": 28645 + }, + { + "epoch": 1.3336825197290314, + "grad_norm": 0.3511152240135039, + "learning_rate": 
6.798792021064313e-05, + "loss": 2.7511, + "step": 28646 + }, + { + "epoch": 1.3337290779151245, + "grad_norm": 0.31209738627955264, + "learning_rate": 6.798539280981799e-05, + "loss": 2.6851, + "step": 28647 + }, + { + "epoch": 1.3337756361012176, + "grad_norm": 0.31227816602127634, + "learning_rate": 6.79828653562075e-05, + "loss": 2.6924, + "step": 28648 + }, + { + "epoch": 1.3338221942873105, + "grad_norm": 0.3192753095973436, + "learning_rate": 6.798033784981907e-05, + "loss": 2.7888, + "step": 28649 + }, + { + "epoch": 1.3338687524734036, + "grad_norm": 0.3269355199703066, + "learning_rate": 6.797781029066011e-05, + "loss": 2.74, + "step": 28650 + }, + { + "epoch": 1.3339153106594968, + "grad_norm": 0.3317597090689892, + "learning_rate": 6.797528267873805e-05, + "loss": 2.7372, + "step": 28651 + }, + { + "epoch": 1.3339618688455896, + "grad_norm": 0.32652878690273435, + "learning_rate": 6.797275501406031e-05, + "loss": 2.7255, + "step": 28652 + }, + { + "epoch": 1.3340084270316828, + "grad_norm": 0.35573167103352327, + "learning_rate": 6.797022729663429e-05, + "loss": 2.7293, + "step": 28653 + }, + { + "epoch": 1.3340549852177759, + "grad_norm": 0.3166905022992532, + "learning_rate": 6.796769952646744e-05, + "loss": 2.7786, + "step": 28654 + }, + { + "epoch": 1.334101543403869, + "grad_norm": 0.37828388704572435, + "learning_rate": 6.796517170356717e-05, + "loss": 2.8563, + "step": 28655 + }, + { + "epoch": 1.334148101589962, + "grad_norm": 0.31646236584483733, + "learning_rate": 6.796264382794087e-05, + "loss": 2.6355, + "step": 28656 + }, + { + "epoch": 1.3341946597760552, + "grad_norm": 0.37377460459619044, + "learning_rate": 6.796011589959598e-05, + "loss": 2.7011, + "step": 28657 + }, + { + "epoch": 1.3342412179621481, + "grad_norm": 0.35175002367103225, + "learning_rate": 6.795758791853994e-05, + "loss": 2.7904, + "step": 28658 + }, + { + "epoch": 1.3342877761482412, + "grad_norm": 0.32528425068919226, + "learning_rate": 6.795505988478015e-05, + "loss": 2.8167, + "step": 28659 + }, + { + "epoch": 1.3343343343343343, + "grad_norm": 0.3528079803894705, + "learning_rate": 6.795253179832402e-05, + "loss": 2.7672, + "step": 28660 + }, + { + "epoch": 1.3343808925204275, + "grad_norm": 0.32870731079155374, + "learning_rate": 6.795000365917897e-05, + "loss": 2.7097, + "step": 28661 + }, + { + "epoch": 1.3344274507065204, + "grad_norm": 0.3116570172976549, + "learning_rate": 6.794747546735243e-05, + "loss": 2.7917, + "step": 28662 + }, + { + "epoch": 1.3344740088926135, + "grad_norm": 0.35023583910601275, + "learning_rate": 6.794494722285183e-05, + "loss": 2.6801, + "step": 28663 + }, + { + "epoch": 1.3345205670787066, + "grad_norm": 0.35684185233735105, + "learning_rate": 6.794241892568456e-05, + "loss": 2.6587, + "step": 28664 + }, + { + "epoch": 1.3345671252647997, + "grad_norm": 0.31241158760897064, + "learning_rate": 6.793989057585807e-05, + "loss": 2.7885, + "step": 28665 + }, + { + "epoch": 1.3346136834508928, + "grad_norm": 0.3499134737275863, + "learning_rate": 6.793736217337976e-05, + "loss": 2.734, + "step": 28666 + }, + { + "epoch": 1.334660241636986, + "grad_norm": 0.31116569501902847, + "learning_rate": 6.793483371825706e-05, + "loss": 2.7574, + "step": 28667 + }, + { + "epoch": 1.3347067998230788, + "grad_norm": 0.3338711213261711, + "learning_rate": 6.793230521049739e-05, + "loss": 2.8312, + "step": 28668 + }, + { + "epoch": 1.334753358009172, + "grad_norm": 0.3108492065833482, + "learning_rate": 6.792977665010818e-05, + "loss": 2.743, + "step": 28669 + }, + { + "epoch": 
1.334799916195265, + "grad_norm": 0.3610143938095951, + "learning_rate": 6.792724803709682e-05, + "loss": 2.8333, + "step": 28670 + }, + { + "epoch": 1.3348464743813582, + "grad_norm": 0.31129694722091056, + "learning_rate": 6.792471937147078e-05, + "loss": 2.7987, + "step": 28671 + }, + { + "epoch": 1.334893032567451, + "grad_norm": 0.3465121755901053, + "learning_rate": 6.792219065323746e-05, + "loss": 2.6935, + "step": 28672 + }, + { + "epoch": 1.3349395907535442, + "grad_norm": 0.3392808094289991, + "learning_rate": 6.791966188240426e-05, + "loss": 2.6279, + "step": 28673 + }, + { + "epoch": 1.3349861489396373, + "grad_norm": 0.34844557592235603, + "learning_rate": 6.791713305897861e-05, + "loss": 2.7203, + "step": 28674 + }, + { + "epoch": 1.3350327071257304, + "grad_norm": 0.32409237841140637, + "learning_rate": 6.791460418296794e-05, + "loss": 2.6687, + "step": 28675 + }, + { + "epoch": 1.3350792653118235, + "grad_norm": 0.3585531058423946, + "learning_rate": 6.79120752543797e-05, + "loss": 2.7527, + "step": 28676 + }, + { + "epoch": 1.3351258234979166, + "grad_norm": 0.34227605143109974, + "learning_rate": 6.790954627322124e-05, + "loss": 2.8035, + "step": 28677 + }, + { + "epoch": 1.3351723816840095, + "grad_norm": 0.3390158700768096, + "learning_rate": 6.790701723950006e-05, + "loss": 2.7191, + "step": 28678 + }, + { + "epoch": 1.3352189398701027, + "grad_norm": 0.3237615328371198, + "learning_rate": 6.790448815322352e-05, + "loss": 2.7661, + "step": 28679 + }, + { + "epoch": 1.3352654980561958, + "grad_norm": 0.3486801618196395, + "learning_rate": 6.790195901439909e-05, + "loss": 2.5314, + "step": 28680 + }, + { + "epoch": 1.3353120562422889, + "grad_norm": 0.3180354558420103, + "learning_rate": 6.789942982303417e-05, + "loss": 2.7777, + "step": 28681 + }, + { + "epoch": 1.3353586144283818, + "grad_norm": 0.3638815667728283, + "learning_rate": 6.78969005791362e-05, + "loss": 2.6885, + "step": 28682 + }, + { + "epoch": 1.335405172614475, + "grad_norm": 0.3204599877723328, + "learning_rate": 6.789437128271258e-05, + "loss": 2.7025, + "step": 28683 + }, + { + "epoch": 1.335451730800568, + "grad_norm": 0.33539286374741456, + "learning_rate": 6.789184193377074e-05, + "loss": 2.7121, + "step": 28684 + }, + { + "epoch": 1.3354982889866611, + "grad_norm": 0.3404899881896352, + "learning_rate": 6.78893125323181e-05, + "loss": 2.6149, + "step": 28685 + }, + { + "epoch": 1.3355448471727542, + "grad_norm": 0.2952461759538529, + "learning_rate": 6.788678307836212e-05, + "loss": 2.7147, + "step": 28686 + }, + { + "epoch": 1.3355914053588474, + "grad_norm": 0.3102801577778309, + "learning_rate": 6.788425357191016e-05, + "loss": 2.7344, + "step": 28687 + }, + { + "epoch": 1.3356379635449402, + "grad_norm": 0.3024855662636314, + "learning_rate": 6.788172401296969e-05, + "loss": 2.7692, + "step": 28688 + }, + { + "epoch": 1.3356845217310334, + "grad_norm": 0.33265270757939513, + "learning_rate": 6.787919440154811e-05, + "loss": 2.6804, + "step": 28689 + }, + { + "epoch": 1.3357310799171265, + "grad_norm": 0.31275556821173517, + "learning_rate": 6.787666473765289e-05, + "loss": 2.8364, + "step": 28690 + }, + { + "epoch": 1.3357776381032194, + "grad_norm": 0.3197244319359373, + "learning_rate": 6.787413502129138e-05, + "loss": 2.8739, + "step": 28691 + }, + { + "epoch": 1.3358241962893125, + "grad_norm": 0.31238374359689214, + "learning_rate": 6.787160525247107e-05, + "loss": 2.6911, + "step": 28692 + }, + { + "epoch": 1.3358707544754056, + "grad_norm": 0.33100637791554766, + "learning_rate": 
6.786907543119934e-05, + "loss": 2.6175, + "step": 28693 + }, + { + "epoch": 1.3359173126614987, + "grad_norm": 0.30961688293325157, + "learning_rate": 6.786654555748366e-05, + "loss": 2.7215, + "step": 28694 + }, + { + "epoch": 1.3359638708475918, + "grad_norm": 0.33591304099645247, + "learning_rate": 6.78640156313314e-05, + "loss": 2.7267, + "step": 28695 + }, + { + "epoch": 1.336010429033685, + "grad_norm": 0.3303393381248806, + "learning_rate": 6.786148565275004e-05, + "loss": 2.7499, + "step": 28696 + }, + { + "epoch": 1.3360569872197778, + "grad_norm": 0.3464394109407099, + "learning_rate": 6.785895562174698e-05, + "loss": 2.7846, + "step": 28697 + }, + { + "epoch": 1.336103545405871, + "grad_norm": 0.3361469504600103, + "learning_rate": 6.785642553832963e-05, + "loss": 2.7311, + "step": 28698 + }, + { + "epoch": 1.336150103591964, + "grad_norm": 0.33102880584986477, + "learning_rate": 6.785389540250543e-05, + "loss": 2.8, + "step": 28699 + }, + { + "epoch": 1.3361966617780572, + "grad_norm": 0.3308070503344857, + "learning_rate": 6.785136521428182e-05, + "loss": 2.7569, + "step": 28700 + }, + { + "epoch": 1.33624321996415, + "grad_norm": 0.32969736726791526, + "learning_rate": 6.784883497366618e-05, + "loss": 2.8352, + "step": 28701 + }, + { + "epoch": 1.3362897781502432, + "grad_norm": 0.3075992811637534, + "learning_rate": 6.784630468066599e-05, + "loss": 2.7872, + "step": 28702 + }, + { + "epoch": 1.3363363363363363, + "grad_norm": 0.32875776159431547, + "learning_rate": 6.784377433528866e-05, + "loss": 2.803, + "step": 28703 + }, + { + "epoch": 1.3363828945224294, + "grad_norm": 0.3123853770916577, + "learning_rate": 6.78412439375416e-05, + "loss": 2.7344, + "step": 28704 + }, + { + "epoch": 1.3364294527085225, + "grad_norm": 0.2840077040658021, + "learning_rate": 6.783871348743225e-05, + "loss": 2.7314, + "step": 28705 + }, + { + "epoch": 1.3364760108946157, + "grad_norm": 0.29603055740618167, + "learning_rate": 6.783618298496802e-05, + "loss": 2.7975, + "step": 28706 + }, + { + "epoch": 1.3365225690807085, + "grad_norm": 0.3198403017253799, + "learning_rate": 6.783365243015635e-05, + "loss": 2.7498, + "step": 28707 + }, + { + "epoch": 1.3365691272668017, + "grad_norm": 0.30693443556520084, + "learning_rate": 6.783112182300471e-05, + "loss": 2.7407, + "step": 28708 + }, + { + "epoch": 1.3366156854528948, + "grad_norm": 0.2984076348996094, + "learning_rate": 6.782859116352044e-05, + "loss": 2.7469, + "step": 28709 + }, + { + "epoch": 1.336662243638988, + "grad_norm": 0.3177735719238914, + "learning_rate": 6.782606045171103e-05, + "loss": 2.7733, + "step": 28710 + }, + { + "epoch": 1.3367088018250808, + "grad_norm": 0.30560278586912987, + "learning_rate": 6.782352968758389e-05, + "loss": 2.7161, + "step": 28711 + }, + { + "epoch": 1.336755360011174, + "grad_norm": 0.3219695145749676, + "learning_rate": 6.782099887114644e-05, + "loss": 2.7246, + "step": 28712 + }, + { + "epoch": 1.336801918197267, + "grad_norm": 0.33717919053549805, + "learning_rate": 6.781846800240611e-05, + "loss": 2.7552, + "step": 28713 + }, + { + "epoch": 1.3368484763833601, + "grad_norm": 0.2904819419876341, + "learning_rate": 6.781593708137033e-05, + "loss": 2.7887, + "step": 28714 + }, + { + "epoch": 1.3368950345694532, + "grad_norm": 0.33244550119482996, + "learning_rate": 6.781340610804653e-05, + "loss": 2.8094, + "step": 28715 + }, + { + "epoch": 1.3369415927555464, + "grad_norm": 0.2958390859537403, + "learning_rate": 6.781087508244215e-05, + "loss": 2.7923, + "step": 28716 + }, + { + "epoch": 
1.3369881509416393, + "grad_norm": 0.3086853402084156, + "learning_rate": 6.78083440045646e-05, + "loss": 2.768, + "step": 28717 + }, + { + "epoch": 1.3370347091277324, + "grad_norm": 0.2968268293755128, + "learning_rate": 6.780581287442132e-05, + "loss": 2.7382, + "step": 28718 + }, + { + "epoch": 1.3370812673138255, + "grad_norm": 0.3256398526287738, + "learning_rate": 6.780328169201973e-05, + "loss": 2.7094, + "step": 28719 + }, + { + "epoch": 1.3371278254999186, + "grad_norm": 0.31718069590506764, + "learning_rate": 6.780075045736725e-05, + "loss": 2.7183, + "step": 28720 + }, + { + "epoch": 1.3371743836860115, + "grad_norm": 0.31070161967861454, + "learning_rate": 6.779821917047134e-05, + "loss": 2.7786, + "step": 28721 + }, + { + "epoch": 1.3372209418721046, + "grad_norm": 0.31880082639002677, + "learning_rate": 6.77956878313394e-05, + "loss": 2.8181, + "step": 28722 + }, + { + "epoch": 1.3372675000581977, + "grad_norm": 0.31787209492386304, + "learning_rate": 6.779315643997888e-05, + "loss": 2.7736, + "step": 28723 + }, + { + "epoch": 1.3373140582442908, + "grad_norm": 0.30678428276164216, + "learning_rate": 6.779062499639718e-05, + "loss": 2.8372, + "step": 28724 + }, + { + "epoch": 1.337360616430384, + "grad_norm": 0.33894734224500817, + "learning_rate": 6.778809350060176e-05, + "loss": 2.6224, + "step": 28725 + }, + { + "epoch": 1.337407174616477, + "grad_norm": 0.31059559794515457, + "learning_rate": 6.778556195260004e-05, + "loss": 2.7338, + "step": 28726 + }, + { + "epoch": 1.33745373280257, + "grad_norm": 0.35212206807503343, + "learning_rate": 6.778303035239944e-05, + "loss": 2.8031, + "step": 28727 + }, + { + "epoch": 1.337500290988663, + "grad_norm": 0.30893293970628083, + "learning_rate": 6.77804987000074e-05, + "loss": 2.7394, + "step": 28728 + }, + { + "epoch": 1.3375468491747562, + "grad_norm": 0.3278816074032583, + "learning_rate": 6.777796699543135e-05, + "loss": 2.8486, + "step": 28729 + }, + { + "epoch": 1.337593407360849, + "grad_norm": 0.35300839633861486, + "learning_rate": 6.777543523867872e-05, + "loss": 2.6883, + "step": 28730 + }, + { + "epoch": 1.3376399655469422, + "grad_norm": 0.3326798387174399, + "learning_rate": 6.777290342975692e-05, + "loss": 2.7064, + "step": 28731 + }, + { + "epoch": 1.3376865237330353, + "grad_norm": 0.34811692457091997, + "learning_rate": 6.777037156867342e-05, + "loss": 2.7898, + "step": 28732 + }, + { + "epoch": 1.3377330819191284, + "grad_norm": 0.3754917053637758, + "learning_rate": 6.776783965543562e-05, + "loss": 2.7704, + "step": 28733 + }, + { + "epoch": 1.3377796401052215, + "grad_norm": 0.3391685182047189, + "learning_rate": 6.776530769005098e-05, + "loss": 2.7489, + "step": 28734 + }, + { + "epoch": 1.3378261982913147, + "grad_norm": 0.3694337630430042, + "learning_rate": 6.77627756725269e-05, + "loss": 2.7135, + "step": 28735 + }, + { + "epoch": 1.3378727564774078, + "grad_norm": 0.3803448408674719, + "learning_rate": 6.776024360287083e-05, + "loss": 2.7956, + "step": 28736 + }, + { + "epoch": 1.3379193146635007, + "grad_norm": 0.32985757877956845, + "learning_rate": 6.775771148109018e-05, + "loss": 2.7003, + "step": 28737 + }, + { + "epoch": 1.3379658728495938, + "grad_norm": 0.389248426688559, + "learning_rate": 6.775517930719239e-05, + "loss": 2.7971, + "step": 28738 + }, + { + "epoch": 1.338012431035687, + "grad_norm": 0.33646202794138513, + "learning_rate": 6.775264708118491e-05, + "loss": 2.807, + "step": 28739 + }, + { + "epoch": 1.3380589892217798, + "grad_norm": 0.3429549602978245, + "learning_rate": 
6.775011480307519e-05, + "loss": 2.7918, + "step": 28740 + }, + { + "epoch": 1.338105547407873, + "grad_norm": 0.36064415585526216, + "learning_rate": 6.774758247287058e-05, + "loss": 2.7574, + "step": 28741 + }, + { + "epoch": 1.338152105593966, + "grad_norm": 0.3225359483375682, + "learning_rate": 6.774505009057859e-05, + "loss": 2.8249, + "step": 28742 + }, + { + "epoch": 1.3381986637800591, + "grad_norm": 0.34178365172471437, + "learning_rate": 6.774251765620661e-05, + "loss": 2.7892, + "step": 28743 + }, + { + "epoch": 1.3382452219661523, + "grad_norm": 0.3158123723828294, + "learning_rate": 6.773998516976212e-05, + "loss": 2.7002, + "step": 28744 + }, + { + "epoch": 1.3382917801522454, + "grad_norm": 0.35933296195173636, + "learning_rate": 6.77374526312525e-05, + "loss": 2.8453, + "step": 28745 + }, + { + "epoch": 1.3383383383383383, + "grad_norm": 0.3600582376221403, + "learning_rate": 6.77349200406852e-05, + "loss": 2.849, + "step": 28746 + }, + { + "epoch": 1.3383848965244314, + "grad_norm": 0.34623290508190097, + "learning_rate": 6.773238739806767e-05, + "loss": 2.8399, + "step": 28747 + }, + { + "epoch": 1.3384314547105245, + "grad_norm": 0.34575686032976793, + "learning_rate": 6.772985470340732e-05, + "loss": 2.7406, + "step": 28748 + }, + { + "epoch": 1.3384780128966176, + "grad_norm": 0.34335286687349464, + "learning_rate": 6.772732195671161e-05, + "loss": 2.8672, + "step": 28749 + }, + { + "epoch": 1.3385245710827105, + "grad_norm": 0.3331295034100139, + "learning_rate": 6.772478915798793e-05, + "loss": 2.8099, + "step": 28750 + }, + { + "epoch": 1.3385711292688036, + "grad_norm": 0.3343196961481179, + "learning_rate": 6.772225630724375e-05, + "loss": 2.7754, + "step": 28751 + }, + { + "epoch": 1.3386176874548967, + "grad_norm": 0.35177681291636737, + "learning_rate": 6.771972340448651e-05, + "loss": 2.8257, + "step": 28752 + }, + { + "epoch": 1.3386642456409898, + "grad_norm": 0.35834900506382006, + "learning_rate": 6.771719044972361e-05, + "loss": 2.6465, + "step": 28753 + }, + { + "epoch": 1.338710803827083, + "grad_norm": 0.36249081521958515, + "learning_rate": 6.77146574429625e-05, + "loss": 2.7868, + "step": 28754 + }, + { + "epoch": 1.338757362013176, + "grad_norm": 0.3852386559153691, + "learning_rate": 6.771212438421062e-05, + "loss": 2.7971, + "step": 28755 + }, + { + "epoch": 1.338803920199269, + "grad_norm": 0.36271404066415985, + "learning_rate": 6.77095912734754e-05, + "loss": 2.7577, + "step": 28756 + }, + { + "epoch": 1.338850478385362, + "grad_norm": 0.342646082462535, + "learning_rate": 6.770705811076428e-05, + "loss": 2.742, + "step": 28757 + }, + { + "epoch": 1.3388970365714552, + "grad_norm": 0.3452377103243281, + "learning_rate": 6.770452489608467e-05, + "loss": 2.7705, + "step": 28758 + }, + { + "epoch": 1.3389435947575483, + "grad_norm": 0.34999764607640654, + "learning_rate": 6.770199162944404e-05, + "loss": 2.8338, + "step": 28759 + }, + { + "epoch": 1.3389901529436412, + "grad_norm": 0.3462137925989522, + "learning_rate": 6.769945831084982e-05, + "loss": 2.8405, + "step": 28760 + }, + { + "epoch": 1.3390367111297343, + "grad_norm": 0.35129544330911555, + "learning_rate": 6.769692494030941e-05, + "loss": 2.8313, + "step": 28761 + }, + { + "epoch": 1.3390832693158274, + "grad_norm": 0.34473361339300335, + "learning_rate": 6.769439151783029e-05, + "loss": 2.7498, + "step": 28762 + }, + { + "epoch": 1.3391298275019206, + "grad_norm": 0.3625930218157237, + "learning_rate": 6.769185804341987e-05, + "loss": 2.822, + "step": 28763 + }, + { + "epoch": 
1.3391763856880137, + "grad_norm": 0.35764334872817555, + "learning_rate": 6.768932451708558e-05, + "loss": 2.7823, + "step": 28764 + }, + { + "epoch": 1.3392229438741068, + "grad_norm": 0.3581764284496782, + "learning_rate": 6.768679093883486e-05, + "loss": 2.8352, + "step": 28765 + }, + { + "epoch": 1.3392695020601997, + "grad_norm": 0.3073343545642614, + "learning_rate": 6.768425730867515e-05, + "loss": 2.7861, + "step": 28766 + }, + { + "epoch": 1.3393160602462928, + "grad_norm": 0.35001377270233214, + "learning_rate": 6.76817236266139e-05, + "loss": 2.7113, + "step": 28767 + }, + { + "epoch": 1.339362618432386, + "grad_norm": 0.31488774660183744, + "learning_rate": 6.767918989265854e-05, + "loss": 2.6522, + "step": 28768 + }, + { + "epoch": 1.339409176618479, + "grad_norm": 0.3319961299275612, + "learning_rate": 6.767665610681647e-05, + "loss": 2.8178, + "step": 28769 + }, + { + "epoch": 1.339455734804572, + "grad_norm": 0.3090264335344627, + "learning_rate": 6.767412226909517e-05, + "loss": 2.7745, + "step": 28770 + }, + { + "epoch": 1.339502292990665, + "grad_norm": 0.34392520514301844, + "learning_rate": 6.767158837950207e-05, + "loss": 2.7719, + "step": 28771 + }, + { + "epoch": 1.3395488511767581, + "grad_norm": 0.35214432334309775, + "learning_rate": 6.76690544380446e-05, + "loss": 2.8627, + "step": 28772 + }, + { + "epoch": 1.3395954093628513, + "grad_norm": 0.3379401053965479, + "learning_rate": 6.76665204447302e-05, + "loss": 2.6346, + "step": 28773 + }, + { + "epoch": 1.3396419675489444, + "grad_norm": 0.3297229615720525, + "learning_rate": 6.76639863995663e-05, + "loss": 2.8164, + "step": 28774 + }, + { + "epoch": 1.3396885257350375, + "grad_norm": 0.3616790526072783, + "learning_rate": 6.766145230256032e-05, + "loss": 2.7147, + "step": 28775 + }, + { + "epoch": 1.3397350839211304, + "grad_norm": 0.3323479315038053, + "learning_rate": 6.765891815371974e-05, + "loss": 2.6686, + "step": 28776 + }, + { + "epoch": 1.3397816421072235, + "grad_norm": 0.3469816428936112, + "learning_rate": 6.765638395305196e-05, + "loss": 2.8124, + "step": 28777 + }, + { + "epoch": 1.3398282002933166, + "grad_norm": 0.3303601699325127, + "learning_rate": 6.765384970056444e-05, + "loss": 2.7978, + "step": 28778 + }, + { + "epoch": 1.3398747584794095, + "grad_norm": 0.33157163606945467, + "learning_rate": 6.76513153962646e-05, + "loss": 2.8261, + "step": 28779 + }, + { + "epoch": 1.3399213166655026, + "grad_norm": 0.3260534632025111, + "learning_rate": 6.76487810401599e-05, + "loss": 2.7789, + "step": 28780 + }, + { + "epoch": 1.3399678748515957, + "grad_norm": 0.36394719313471274, + "learning_rate": 6.764624663225777e-05, + "loss": 2.7789, + "step": 28781 + }, + { + "epoch": 1.3400144330376889, + "grad_norm": 0.34148303537274294, + "learning_rate": 6.764371217256563e-05, + "loss": 2.7737, + "step": 28782 + }, + { + "epoch": 1.340060991223782, + "grad_norm": 0.35803536009588277, + "learning_rate": 6.764117766109094e-05, + "loss": 2.7744, + "step": 28783 + }, + { + "epoch": 1.340107549409875, + "grad_norm": 0.35467129979252077, + "learning_rate": 6.763864309784114e-05, + "loss": 2.7123, + "step": 28784 + }, + { + "epoch": 1.340154107595968, + "grad_norm": 0.3533632399117385, + "learning_rate": 6.763610848282365e-05, + "loss": 2.7401, + "step": 28785 + }, + { + "epoch": 1.340200665782061, + "grad_norm": 0.34914523201176206, + "learning_rate": 6.763357381604594e-05, + "loss": 2.8242, + "step": 28786 + }, + { + "epoch": 1.3402472239681542, + "grad_norm": 0.36450182164107114, + "learning_rate": 
6.76310390975154e-05, + "loss": 2.8321, + "step": 28787 + }, + { + "epoch": 1.3402937821542473, + "grad_norm": 0.33110881089501043, + "learning_rate": 6.76285043272395e-05, + "loss": 2.7264, + "step": 28788 + }, + { + "epoch": 1.3403403403403402, + "grad_norm": 0.36780425451623866, + "learning_rate": 6.76259695052257e-05, + "loss": 2.7626, + "step": 28789 + }, + { + "epoch": 1.3403868985264333, + "grad_norm": 0.3524238612410261, + "learning_rate": 6.76234346314814e-05, + "loss": 2.6564, + "step": 28790 + }, + { + "epoch": 1.3404334567125264, + "grad_norm": 0.33787861226355437, + "learning_rate": 6.762089970601404e-05, + "loss": 2.8341, + "step": 28791 + }, + { + "epoch": 1.3404800148986196, + "grad_norm": 0.3584100091617254, + "learning_rate": 6.76183647288311e-05, + "loss": 2.7907, + "step": 28792 + }, + { + "epoch": 1.3405265730847127, + "grad_norm": 0.3501823534131769, + "learning_rate": 6.761582969993999e-05, + "loss": 2.7375, + "step": 28793 + }, + { + "epoch": 1.3405731312708058, + "grad_norm": 0.33566032579902044, + "learning_rate": 6.761329461934814e-05, + "loss": 2.7392, + "step": 28794 + }, + { + "epoch": 1.3406196894568987, + "grad_norm": 0.3221141354851347, + "learning_rate": 6.761075948706301e-05, + "loss": 2.8877, + "step": 28795 + }, + { + "epoch": 1.3406662476429918, + "grad_norm": 0.34109866564525004, + "learning_rate": 6.760822430309204e-05, + "loss": 2.7173, + "step": 28796 + }, + { + "epoch": 1.340712805829085, + "grad_norm": 0.31342834658351965, + "learning_rate": 6.760568906744267e-05, + "loss": 2.7053, + "step": 28797 + }, + { + "epoch": 1.340759364015178, + "grad_norm": 0.3297424950106684, + "learning_rate": 6.760315378012234e-05, + "loss": 2.8869, + "step": 28798 + }, + { + "epoch": 1.340805922201271, + "grad_norm": 0.3338908844532592, + "learning_rate": 6.76006184411385e-05, + "loss": 2.8018, + "step": 28799 + }, + { + "epoch": 1.340852480387364, + "grad_norm": 0.33281537835659364, + "learning_rate": 6.759808305049855e-05, + "loss": 2.8228, + "step": 28800 + }, + { + "epoch": 1.3408990385734572, + "grad_norm": 0.3525658029943841, + "learning_rate": 6.759554760820998e-05, + "loss": 2.7354, + "step": 28801 + }, + { + "epoch": 1.3409455967595503, + "grad_norm": 0.3890579791524693, + "learning_rate": 6.75930121142802e-05, + "loss": 2.7518, + "step": 28802 + }, + { + "epoch": 1.3409921549456434, + "grad_norm": 0.32036727719744956, + "learning_rate": 6.759047656871667e-05, + "loss": 2.7166, + "step": 28803 + }, + { + "epoch": 1.3410387131317365, + "grad_norm": 0.3929598886838559, + "learning_rate": 6.758794097152681e-05, + "loss": 2.7242, + "step": 28804 + }, + { + "epoch": 1.3410852713178294, + "grad_norm": 0.3232038609272512, + "learning_rate": 6.758540532271808e-05, + "loss": 2.7705, + "step": 28805 + }, + { + "epoch": 1.3411318295039225, + "grad_norm": 0.3557421275365665, + "learning_rate": 6.758286962229792e-05, + "loss": 2.7319, + "step": 28806 + }, + { + "epoch": 1.3411783876900156, + "grad_norm": 0.33252710025005805, + "learning_rate": 6.75803338702738e-05, + "loss": 2.8368, + "step": 28807 + }, + { + "epoch": 1.3412249458761087, + "grad_norm": 0.3454988503250305, + "learning_rate": 6.75777980666531e-05, + "loss": 2.7723, + "step": 28808 + }, + { + "epoch": 1.3412715040622016, + "grad_norm": 0.38820417566283566, + "learning_rate": 6.757526221144329e-05, + "loss": 2.777, + "step": 28809 + }, + { + "epoch": 1.3413180622482948, + "grad_norm": 0.31307478484142065, + "learning_rate": 6.757272630465183e-05, + "loss": 2.8051, + "step": 28810 + }, + { + "epoch": 
1.3413646204343879, + "grad_norm": 0.3496311751226706, + "learning_rate": 6.757019034628616e-05, + "loss": 2.664, + "step": 28811 + }, + { + "epoch": 1.341411178620481, + "grad_norm": 0.35447484697073633, + "learning_rate": 6.756765433635371e-05, + "loss": 2.7442, + "step": 28812 + }, + { + "epoch": 1.341457736806574, + "grad_norm": 0.3331081811160395, + "learning_rate": 6.75651182748619e-05, + "loss": 2.7448, + "step": 28813 + }, + { + "epoch": 1.3415042949926672, + "grad_norm": 0.35873058346207387, + "learning_rate": 6.756258216181821e-05, + "loss": 2.8412, + "step": 28814 + }, + { + "epoch": 1.34155085317876, + "grad_norm": 0.3363155292533419, + "learning_rate": 6.756004599723007e-05, + "loss": 2.7441, + "step": 28815 + }, + { + "epoch": 1.3415974113648532, + "grad_norm": 0.35152993444819947, + "learning_rate": 6.755750978110493e-05, + "loss": 2.8259, + "step": 28816 + }, + { + "epoch": 1.3416439695509463, + "grad_norm": 0.3446330257890509, + "learning_rate": 6.755497351345024e-05, + "loss": 2.7297, + "step": 28817 + }, + { + "epoch": 1.3416905277370392, + "grad_norm": 0.31662762734004646, + "learning_rate": 6.755243719427341e-05, + "loss": 2.675, + "step": 28818 + }, + { + "epoch": 1.3417370859231323, + "grad_norm": 0.32827074781208404, + "learning_rate": 6.754990082358191e-05, + "loss": 2.725, + "step": 28819 + }, + { + "epoch": 1.3417836441092255, + "grad_norm": 0.3119585974778347, + "learning_rate": 6.754736440138318e-05, + "loss": 2.82, + "step": 28820 + }, + { + "epoch": 1.3418302022953186, + "grad_norm": 0.3516181865259012, + "learning_rate": 6.754482792768468e-05, + "loss": 2.7556, + "step": 28821 + }, + { + "epoch": 1.3418767604814117, + "grad_norm": 0.31161657544285554, + "learning_rate": 6.754229140249383e-05, + "loss": 2.8176, + "step": 28822 + }, + { + "epoch": 1.3419233186675048, + "grad_norm": 0.3472123248807708, + "learning_rate": 6.753975482581808e-05, + "loss": 2.7017, + "step": 28823 + }, + { + "epoch": 1.341969876853598, + "grad_norm": 0.33645355282560097, + "learning_rate": 6.753721819766489e-05, + "loss": 2.743, + "step": 28824 + }, + { + "epoch": 1.3420164350396908, + "grad_norm": 0.3569286676364115, + "learning_rate": 6.753468151804169e-05, + "loss": 2.7122, + "step": 28825 + }, + { + "epoch": 1.342062993225784, + "grad_norm": 0.3310554380929553, + "learning_rate": 6.75321447869559e-05, + "loss": 2.7403, + "step": 28826 + }, + { + "epoch": 1.342109551411877, + "grad_norm": 0.35443013759880226, + "learning_rate": 6.752960800441501e-05, + "loss": 2.7716, + "step": 28827 + }, + { + "epoch": 1.34215610959797, + "grad_norm": 0.32795424914616594, + "learning_rate": 6.752707117042645e-05, + "loss": 2.6956, + "step": 28828 + }, + { + "epoch": 1.342202667784063, + "grad_norm": 0.3395004000790808, + "learning_rate": 6.752453428499766e-05, + "loss": 2.7512, + "step": 28829 + }, + { + "epoch": 1.3422492259701562, + "grad_norm": 0.32081998562158914, + "learning_rate": 6.75219973481361e-05, + "loss": 2.6842, + "step": 28830 + }, + { + "epoch": 1.3422957841562493, + "grad_norm": 0.35038111190726356, + "learning_rate": 6.75194603598492e-05, + "loss": 2.7549, + "step": 28831 + }, + { + "epoch": 1.3423423423423424, + "grad_norm": 0.3562142438430957, + "learning_rate": 6.751692332014439e-05, + "loss": 2.829, + "step": 28832 + }, + { + "epoch": 1.3423889005284355, + "grad_norm": 0.32225033526852903, + "learning_rate": 6.751438622902915e-05, + "loss": 2.7232, + "step": 28833 + }, + { + "epoch": 1.3424354587145284, + "grad_norm": 0.36846596625445777, + "learning_rate": 
6.751184908651091e-05, + "loss": 2.8042, + "step": 28834 + }, + { + "epoch": 1.3424820169006215, + "grad_norm": 0.33731137820322615, + "learning_rate": 6.750931189259712e-05, + "loss": 2.7603, + "step": 28835 + }, + { + "epoch": 1.3425285750867146, + "grad_norm": 0.35934889055956487, + "learning_rate": 6.750677464729522e-05, + "loss": 2.7838, + "step": 28836 + }, + { + "epoch": 1.3425751332728078, + "grad_norm": 0.35174067006351867, + "learning_rate": 6.750423735061267e-05, + "loss": 2.7603, + "step": 28837 + }, + { + "epoch": 1.3426216914589006, + "grad_norm": 0.324723893839081, + "learning_rate": 6.75017000025569e-05, + "loss": 2.8781, + "step": 28838 + }, + { + "epoch": 1.3426682496449938, + "grad_norm": 0.3306170077506411, + "learning_rate": 6.749916260313538e-05, + "loss": 2.6741, + "step": 28839 + }, + { + "epoch": 1.3427148078310869, + "grad_norm": 0.35267906756285833, + "learning_rate": 6.749662515235553e-05, + "loss": 2.8105, + "step": 28840 + }, + { + "epoch": 1.34276136601718, + "grad_norm": 0.3473832156912224, + "learning_rate": 6.749408765022481e-05, + "loss": 2.8167, + "step": 28841 + }, + { + "epoch": 1.342807924203273, + "grad_norm": 0.3324253744998713, + "learning_rate": 6.749155009675066e-05, + "loss": 2.7442, + "step": 28842 + }, + { + "epoch": 1.3428544823893662, + "grad_norm": 0.3417827392373352, + "learning_rate": 6.748901249194052e-05, + "loss": 2.6985, + "step": 28843 + }, + { + "epoch": 1.3429010405754591, + "grad_norm": 0.32498829091632114, + "learning_rate": 6.748647483580189e-05, + "loss": 2.755, + "step": 28844 + }, + { + "epoch": 1.3429475987615522, + "grad_norm": 0.37040605129488036, + "learning_rate": 6.748393712834215e-05, + "loss": 2.7407, + "step": 28845 + }, + { + "epoch": 1.3429941569476453, + "grad_norm": 0.32914262073539097, + "learning_rate": 6.748139936956877e-05, + "loss": 2.733, + "step": 28846 + }, + { + "epoch": 1.3430407151337385, + "grad_norm": 0.3464028871802787, + "learning_rate": 6.747886155948923e-05, + "loss": 2.7547, + "step": 28847 + }, + { + "epoch": 1.3430872733198314, + "grad_norm": 0.3469859014974377, + "learning_rate": 6.747632369811095e-05, + "loss": 2.8011, + "step": 28848 + }, + { + "epoch": 1.3431338315059245, + "grad_norm": 0.39710618839652395, + "learning_rate": 6.747378578544136e-05, + "loss": 2.8551, + "step": 28849 + }, + { + "epoch": 1.3431803896920176, + "grad_norm": 0.35686989017153753, + "learning_rate": 6.747124782148796e-05, + "loss": 2.746, + "step": 28850 + }, + { + "epoch": 1.3432269478781107, + "grad_norm": 0.34969819322181556, + "learning_rate": 6.746870980625815e-05, + "loss": 2.722, + "step": 28851 + }, + { + "epoch": 1.3432735060642038, + "grad_norm": 0.34647182322210973, + "learning_rate": 6.746617173975941e-05, + "loss": 2.7375, + "step": 28852 + }, + { + "epoch": 1.343320064250297, + "grad_norm": 0.33522102021540556, + "learning_rate": 6.746363362199916e-05, + "loss": 2.7439, + "step": 28853 + }, + { + "epoch": 1.3433666224363898, + "grad_norm": 0.32035008483152233, + "learning_rate": 6.746109545298488e-05, + "loss": 2.7154, + "step": 28854 + }, + { + "epoch": 1.343413180622483, + "grad_norm": 0.35420902213690447, + "learning_rate": 6.7458557232724e-05, + "loss": 2.733, + "step": 28855 + }, + { + "epoch": 1.343459738808576, + "grad_norm": 0.31080772875461893, + "learning_rate": 6.745601896122396e-05, + "loss": 2.7839, + "step": 28856 + }, + { + "epoch": 1.3435062969946692, + "grad_norm": 0.3436216823931349, + "learning_rate": 6.745348063849226e-05, + "loss": 2.7215, + "step": 28857 + }, + { + "epoch": 
1.343552855180762, + "grad_norm": 0.30434469052860613, + "learning_rate": 6.745094226453628e-05, + "loss": 2.8452, + "step": 28858 + }, + { + "epoch": 1.3435994133668552, + "grad_norm": 0.3246033158738771, + "learning_rate": 6.74484038393635e-05, + "loss": 2.7854, + "step": 28859 + }, + { + "epoch": 1.3436459715529483, + "grad_norm": 0.34549427659963156, + "learning_rate": 6.74458653629814e-05, + "loss": 2.8367, + "step": 28860 + }, + { + "epoch": 1.3436925297390414, + "grad_norm": 0.34577832789596286, + "learning_rate": 6.744332683539741e-05, + "loss": 2.6716, + "step": 28861 + }, + { + "epoch": 1.3437390879251345, + "grad_norm": 0.3490151046756836, + "learning_rate": 6.744078825661895e-05, + "loss": 2.793, + "step": 28862 + }, + { + "epoch": 1.3437856461112276, + "grad_norm": 0.38995569791838325, + "learning_rate": 6.743824962665351e-05, + "loss": 2.9296, + "step": 28863 + }, + { + "epoch": 1.3438322042973205, + "grad_norm": 0.3396272479450575, + "learning_rate": 6.74357109455085e-05, + "loss": 2.7446, + "step": 28864 + }, + { + "epoch": 1.3438787624834136, + "grad_norm": 0.3900579333821595, + "learning_rate": 6.743317221319141e-05, + "loss": 2.7687, + "step": 28865 + }, + { + "epoch": 1.3439253206695068, + "grad_norm": 0.3467584282882013, + "learning_rate": 6.74306334297097e-05, + "loss": 2.8728, + "step": 28866 + }, + { + "epoch": 1.3439718788555997, + "grad_norm": 0.39064053706537966, + "learning_rate": 6.742809459507077e-05, + "loss": 2.7344, + "step": 28867 + }, + { + "epoch": 1.3440184370416928, + "grad_norm": 0.32285946811256616, + "learning_rate": 6.742555570928211e-05, + "loss": 2.639, + "step": 28868 + }, + { + "epoch": 1.3440649952277859, + "grad_norm": 0.34812921407742514, + "learning_rate": 6.742301677235115e-05, + "loss": 2.8237, + "step": 28869 + }, + { + "epoch": 1.344111553413879, + "grad_norm": 0.34587613617553514, + "learning_rate": 6.742047778428536e-05, + "loss": 2.8158, + "step": 28870 + }, + { + "epoch": 1.3441581115999721, + "grad_norm": 0.35161222443340523, + "learning_rate": 6.74179387450922e-05, + "loss": 2.8121, + "step": 28871 + }, + { + "epoch": 1.3442046697860652, + "grad_norm": 0.35076514027564004, + "learning_rate": 6.741539965477908e-05, + "loss": 2.755, + "step": 28872 + }, + { + "epoch": 1.3442512279721581, + "grad_norm": 0.3783806855498109, + "learning_rate": 6.741286051335347e-05, + "loss": 2.7069, + "step": 28873 + }, + { + "epoch": 1.3442977861582512, + "grad_norm": 0.3544081634193199, + "learning_rate": 6.741032132082286e-05, + "loss": 2.6683, + "step": 28874 + }, + { + "epoch": 1.3443443443443444, + "grad_norm": 0.33515002764765983, + "learning_rate": 6.740778207719466e-05, + "loss": 2.7031, + "step": 28875 + }, + { + "epoch": 1.3443909025304375, + "grad_norm": 0.34632836044357945, + "learning_rate": 6.740524278247634e-05, + "loss": 2.7194, + "step": 28876 + }, + { + "epoch": 1.3444374607165304, + "grad_norm": 0.3492993148688092, + "learning_rate": 6.740270343667534e-05, + "loss": 2.7509, + "step": 28877 + }, + { + "epoch": 1.3444840189026235, + "grad_norm": 0.36821212780955154, + "learning_rate": 6.740016403979912e-05, + "loss": 2.7983, + "step": 28878 + }, + { + "epoch": 1.3445305770887166, + "grad_norm": 0.31069329263436635, + "learning_rate": 6.739762459185513e-05, + "loss": 2.8922, + "step": 28879 + }, + { + "epoch": 1.3445771352748097, + "grad_norm": 0.38808748064782544, + "learning_rate": 6.739508509285084e-05, + "loss": 2.8051, + "step": 28880 + }, + { + "epoch": 1.3446236934609028, + "grad_norm": 0.3177932873704922, + "learning_rate": 
6.739254554279367e-05, + "loss": 2.8356, + "step": 28881 + }, + { + "epoch": 1.344670251646996, + "grad_norm": 0.35654220739109643, + "learning_rate": 6.73900059416911e-05, + "loss": 2.7088, + "step": 28882 + }, + { + "epoch": 1.3447168098330888, + "grad_norm": 0.3864192912419427, + "learning_rate": 6.738746628955057e-05, + "loss": 2.8562, + "step": 28883 + }, + { + "epoch": 1.344763368019182, + "grad_norm": 0.33687196693824445, + "learning_rate": 6.738492658637955e-05, + "loss": 2.8381, + "step": 28884 + }, + { + "epoch": 1.344809926205275, + "grad_norm": 0.3895879505746486, + "learning_rate": 6.738238683218548e-05, + "loss": 2.6776, + "step": 28885 + }, + { + "epoch": 1.3448564843913682, + "grad_norm": 0.3722720984082618, + "learning_rate": 6.737984702697581e-05, + "loss": 2.664, + "step": 28886 + }, + { + "epoch": 1.344903042577461, + "grad_norm": 0.38169791074364057, + "learning_rate": 6.737730717075802e-05, + "loss": 2.7329, + "step": 28887 + }, + { + "epoch": 1.3449496007635542, + "grad_norm": 0.34743473065404473, + "learning_rate": 6.737476726353952e-05, + "loss": 2.7615, + "step": 28888 + }, + { + "epoch": 1.3449961589496473, + "grad_norm": 0.3656790330362786, + "learning_rate": 6.737222730532782e-05, + "loss": 2.734, + "step": 28889 + }, + { + "epoch": 1.3450427171357404, + "grad_norm": 0.34575993083912543, + "learning_rate": 6.736968729613033e-05, + "loss": 2.8487, + "step": 28890 + }, + { + "epoch": 1.3450892753218335, + "grad_norm": 0.3272896642240356, + "learning_rate": 6.736714723595451e-05, + "loss": 2.773, + "step": 28891 + }, + { + "epoch": 1.3451358335079266, + "grad_norm": 0.3390494128457105, + "learning_rate": 6.736460712480783e-05, + "loss": 2.7639, + "step": 28892 + }, + { + "epoch": 1.3451823916940195, + "grad_norm": 0.3155773802678688, + "learning_rate": 6.736206696269774e-05, + "loss": 2.7487, + "step": 28893 + }, + { + "epoch": 1.3452289498801127, + "grad_norm": 0.31237299868163393, + "learning_rate": 6.735952674963168e-05, + "loss": 2.7035, + "step": 28894 + }, + { + "epoch": 1.3452755080662058, + "grad_norm": 0.32675764864662016, + "learning_rate": 6.735698648561714e-05, + "loss": 2.7814, + "step": 28895 + }, + { + "epoch": 1.3453220662522989, + "grad_norm": 0.3489214531123791, + "learning_rate": 6.735444617066154e-05, + "loss": 2.806, + "step": 28896 + }, + { + "epoch": 1.3453686244383918, + "grad_norm": 0.3183042079171647, + "learning_rate": 6.735190580477236e-05, + "loss": 2.8259, + "step": 28897 + }, + { + "epoch": 1.345415182624485, + "grad_norm": 0.37233318431897644, + "learning_rate": 6.734936538795704e-05, + "loss": 2.769, + "step": 28898 + }, + { + "epoch": 1.345461740810578, + "grad_norm": 0.3263186242723377, + "learning_rate": 6.734682492022303e-05, + "loss": 2.7428, + "step": 28899 + }, + { + "epoch": 1.3455082989966711, + "grad_norm": 0.3438921950150877, + "learning_rate": 6.734428440157783e-05, + "loss": 2.7473, + "step": 28900 + }, + { + "epoch": 1.3455548571827642, + "grad_norm": 0.31207307279420954, + "learning_rate": 6.734174383202885e-05, + "loss": 2.7417, + "step": 28901 + }, + { + "epoch": 1.3456014153688574, + "grad_norm": 0.3361574040766697, + "learning_rate": 6.733920321158356e-05, + "loss": 2.7657, + "step": 28902 + }, + { + "epoch": 1.3456479735549502, + "grad_norm": 0.30310532897855696, + "learning_rate": 6.733666254024942e-05, + "loss": 2.8678, + "step": 28903 + }, + { + "epoch": 1.3456945317410434, + "grad_norm": 0.3230185529117149, + "learning_rate": 6.733412181803387e-05, + "loss": 2.8254, + "step": 28904 + }, + { + "epoch": 
1.3457410899271365, + "grad_norm": 0.3229584499614716, + "learning_rate": 6.733158104494439e-05, + "loss": 2.7779, + "step": 28905 + }, + { + "epoch": 1.3457876481132294, + "grad_norm": 0.3095990947718254, + "learning_rate": 6.732904022098842e-05, + "loss": 2.7356, + "step": 28906 + }, + { + "epoch": 1.3458342062993225, + "grad_norm": 0.3251865921392912, + "learning_rate": 6.732649934617342e-05, + "loss": 2.7357, + "step": 28907 + }, + { + "epoch": 1.3458807644854156, + "grad_norm": 0.32604049761239345, + "learning_rate": 6.732395842050686e-05, + "loss": 2.6984, + "step": 28908 + }, + { + "epoch": 1.3459273226715087, + "grad_norm": 0.3342314613148453, + "learning_rate": 6.732141744399619e-05, + "loss": 2.7202, + "step": 28909 + }, + { + "epoch": 1.3459738808576018, + "grad_norm": 0.33100716282551534, + "learning_rate": 6.731887641664886e-05, + "loss": 2.7592, + "step": 28910 + }, + { + "epoch": 1.346020439043695, + "grad_norm": 0.36275448023788764, + "learning_rate": 6.731633533847235e-05, + "loss": 2.7434, + "step": 28911 + }, + { + "epoch": 1.346066997229788, + "grad_norm": 0.3544255896056648, + "learning_rate": 6.731379420947408e-05, + "loss": 2.8109, + "step": 28912 + }, + { + "epoch": 1.346113555415881, + "grad_norm": 0.34355595787009324, + "learning_rate": 6.731125302966155e-05, + "loss": 2.7259, + "step": 28913 + }, + { + "epoch": 1.346160113601974, + "grad_norm": 0.331971914234728, + "learning_rate": 6.730871179904218e-05, + "loss": 2.7466, + "step": 28914 + }, + { + "epoch": 1.3462066717880672, + "grad_norm": 0.3234941279177817, + "learning_rate": 6.730617051762347e-05, + "loss": 2.732, + "step": 28915 + }, + { + "epoch": 1.34625322997416, + "grad_norm": 0.33036049429126835, + "learning_rate": 6.730362918541283e-05, + "loss": 2.7425, + "step": 28916 + }, + { + "epoch": 1.3462997881602532, + "grad_norm": 0.3235142224531837, + "learning_rate": 6.730108780241775e-05, + "loss": 2.7251, + "step": 28917 + }, + { + "epoch": 1.3463463463463463, + "grad_norm": 0.31130111900715474, + "learning_rate": 6.729854636864567e-05, + "loss": 2.6892, + "step": 28918 + }, + { + "epoch": 1.3463929045324394, + "grad_norm": 0.3175161947446001, + "learning_rate": 6.729600488410408e-05, + "loss": 2.7846, + "step": 28919 + }, + { + "epoch": 1.3464394627185325, + "grad_norm": 0.3331774188573425, + "learning_rate": 6.729346334880042e-05, + "loss": 2.7539, + "step": 28920 + }, + { + "epoch": 1.3464860209046257, + "grad_norm": 0.32440471096229323, + "learning_rate": 6.729092176274213e-05, + "loss": 2.8481, + "step": 28921 + }, + { + "epoch": 1.3465325790907186, + "grad_norm": 0.32677732593599973, + "learning_rate": 6.72883801259367e-05, + "loss": 2.7365, + "step": 28922 + }, + { + "epoch": 1.3465791372768117, + "grad_norm": 0.32120706701720725, + "learning_rate": 6.728583843839158e-05, + "loss": 2.7934, + "step": 28923 + }, + { + "epoch": 1.3466256954629048, + "grad_norm": 0.3174685356345705, + "learning_rate": 6.728329670011422e-05, + "loss": 2.761, + "step": 28924 + }, + { + "epoch": 1.346672253648998, + "grad_norm": 0.3415650258485411, + "learning_rate": 6.72807549111121e-05, + "loss": 2.8034, + "step": 28925 + }, + { + "epoch": 1.3467188118350908, + "grad_norm": 0.32123023601511325, + "learning_rate": 6.727821307139266e-05, + "loss": 2.8369, + "step": 28926 + }, + { + "epoch": 1.346765370021184, + "grad_norm": 0.3317936905727447, + "learning_rate": 6.727567118096336e-05, + "loss": 2.7214, + "step": 28927 + }, + { + "epoch": 1.346811928207277, + "grad_norm": 0.344216070968203, + "learning_rate": 
6.727312923983166e-05, + "loss": 2.6924, + "step": 28928 + }, + { + "epoch": 1.3468584863933701, + "grad_norm": 0.2984475061278853, + "learning_rate": 6.727058724800505e-05, + "loss": 2.779, + "step": 28929 + }, + { + "epoch": 1.3469050445794633, + "grad_norm": 0.33358185041822497, + "learning_rate": 6.726804520549096e-05, + "loss": 2.6662, + "step": 28930 + }, + { + "epoch": 1.3469516027655564, + "grad_norm": 0.3425814196946316, + "learning_rate": 6.726550311229686e-05, + "loss": 2.7042, + "step": 28931 + }, + { + "epoch": 1.3469981609516493, + "grad_norm": 0.34003287514274194, + "learning_rate": 6.726296096843019e-05, + "loss": 2.8744, + "step": 28932 + }, + { + "epoch": 1.3470447191377424, + "grad_norm": 0.33524470069287465, + "learning_rate": 6.726041877389846e-05, + "loss": 2.8828, + "step": 28933 + }, + { + "epoch": 1.3470912773238355, + "grad_norm": 0.34404335649949414, + "learning_rate": 6.725787652870907e-05, + "loss": 2.6339, + "step": 28934 + }, + { + "epoch": 1.3471378355099286, + "grad_norm": 0.3383467936385292, + "learning_rate": 6.725533423286954e-05, + "loss": 2.7929, + "step": 28935 + }, + { + "epoch": 1.3471843936960215, + "grad_norm": 0.3186571742924077, + "learning_rate": 6.725279188638729e-05, + "loss": 2.7908, + "step": 28936 + }, + { + "epoch": 1.3472309518821146, + "grad_norm": 0.3485818459037686, + "learning_rate": 6.725024948926979e-05, + "loss": 2.7408, + "step": 28937 + }, + { + "epoch": 1.3472775100682077, + "grad_norm": 0.3244602603072105, + "learning_rate": 6.724770704152452e-05, + "loss": 2.7107, + "step": 28938 + }, + { + "epoch": 1.3473240682543008, + "grad_norm": 0.39135615972268034, + "learning_rate": 6.724516454315893e-05, + "loss": 2.7193, + "step": 28939 + }, + { + "epoch": 1.347370626440394, + "grad_norm": 0.3384464155550096, + "learning_rate": 6.724262199418048e-05, + "loss": 2.7746, + "step": 28940 + }, + { + "epoch": 1.347417184626487, + "grad_norm": 0.37446844320586103, + "learning_rate": 6.724007939459662e-05, + "loss": 2.7715, + "step": 28941 + }, + { + "epoch": 1.34746374281258, + "grad_norm": 0.3513403630740338, + "learning_rate": 6.723753674441484e-05, + "loss": 2.6841, + "step": 28942 + }, + { + "epoch": 1.347510300998673, + "grad_norm": 0.36351909616125905, + "learning_rate": 6.723499404364258e-05, + "loss": 2.7387, + "step": 28943 + }, + { + "epoch": 1.3475568591847662, + "grad_norm": 0.3570385198020405, + "learning_rate": 6.723245129228731e-05, + "loss": 2.7954, + "step": 28944 + }, + { + "epoch": 1.3476034173708593, + "grad_norm": 0.370672934117853, + "learning_rate": 6.72299084903565e-05, + "loss": 2.756, + "step": 28945 + }, + { + "epoch": 1.3476499755569522, + "grad_norm": 0.38176415265027425, + "learning_rate": 6.72273656378576e-05, + "loss": 2.8071, + "step": 28946 + }, + { + "epoch": 1.3476965337430453, + "grad_norm": 0.38116031341928935, + "learning_rate": 6.722482273479809e-05, + "loss": 2.7563, + "step": 28947 + }, + { + "epoch": 1.3477430919291384, + "grad_norm": 0.3508610224419542, + "learning_rate": 6.72222797811854e-05, + "loss": 2.7507, + "step": 28948 + }, + { + "epoch": 1.3477896501152316, + "grad_norm": 0.3672691377383987, + "learning_rate": 6.721973677702704e-05, + "loss": 2.7818, + "step": 28949 + }, + { + "epoch": 1.3478362083013247, + "grad_norm": 0.35168545964763376, + "learning_rate": 6.721719372233042e-05, + "loss": 2.7223, + "step": 28950 + }, + { + "epoch": 1.3478827664874178, + "grad_norm": 0.33994297361895814, + "learning_rate": 6.721465061710306e-05, + "loss": 2.6606, + "step": 28951 + }, + { + "epoch": 
1.3479293246735107, + "grad_norm": 0.36126104444777846, + "learning_rate": 6.721210746135239e-05, + "loss": 2.6836, + "step": 28952 + }, + { + "epoch": 1.3479758828596038, + "grad_norm": 0.33633255552922003, + "learning_rate": 6.720956425508588e-05, + "loss": 2.6644, + "step": 28953 + }, + { + "epoch": 1.348022441045697, + "grad_norm": 0.35652815494658335, + "learning_rate": 6.720702099831098e-05, + "loss": 2.7544, + "step": 28954 + }, + { + "epoch": 1.3480689992317898, + "grad_norm": 0.3925046240596863, + "learning_rate": 6.720447769103518e-05, + "loss": 2.8156, + "step": 28955 + }, + { + "epoch": 1.348115557417883, + "grad_norm": 0.3525339692638589, + "learning_rate": 6.720193433326593e-05, + "loss": 2.8153, + "step": 28956 + }, + { + "epoch": 1.348162115603976, + "grad_norm": 0.3713638366005711, + "learning_rate": 6.719939092501071e-05, + "loss": 2.7884, + "step": 28957 + }, + { + "epoch": 1.3482086737900691, + "grad_norm": 0.3305777822823474, + "learning_rate": 6.719684746627696e-05, + "loss": 2.7676, + "step": 28958 + }, + { + "epoch": 1.3482552319761623, + "grad_norm": 0.35473307834962936, + "learning_rate": 6.719430395707215e-05, + "loss": 2.6543, + "step": 28959 + }, + { + "epoch": 1.3483017901622554, + "grad_norm": 0.33716880840343955, + "learning_rate": 6.719176039740376e-05, + "loss": 2.8118, + "step": 28960 + }, + { + "epoch": 1.3483483483483483, + "grad_norm": 0.33176418309382816, + "learning_rate": 6.718921678727925e-05, + "loss": 2.7214, + "step": 28961 + }, + { + "epoch": 1.3483949065344414, + "grad_norm": 0.35046446912682905, + "learning_rate": 6.718667312670608e-05, + "loss": 2.7201, + "step": 28962 + }, + { + "epoch": 1.3484414647205345, + "grad_norm": 0.34743400810460806, + "learning_rate": 6.718412941569172e-05, + "loss": 2.8137, + "step": 28963 + }, + { + "epoch": 1.3484880229066276, + "grad_norm": 0.3577394522247536, + "learning_rate": 6.718158565424364e-05, + "loss": 2.707, + "step": 28964 + }, + { + "epoch": 1.3485345810927205, + "grad_norm": 0.33459257544219534, + "learning_rate": 6.717904184236929e-05, + "loss": 2.7998, + "step": 28965 + }, + { + "epoch": 1.3485811392788136, + "grad_norm": 0.34676866469010265, + "learning_rate": 6.717649798007614e-05, + "loss": 2.8012, + "step": 28966 + }, + { + "epoch": 1.3486276974649067, + "grad_norm": 0.3520827786274423, + "learning_rate": 6.717395406737166e-05, + "loss": 2.6448, + "step": 28967 + }, + { + "epoch": 1.3486742556509999, + "grad_norm": 0.3692917538424702, + "learning_rate": 6.717141010426332e-05, + "loss": 2.8156, + "step": 28968 + }, + { + "epoch": 1.348720813837093, + "grad_norm": 0.3364211717438112, + "learning_rate": 6.716886609075859e-05, + "loss": 2.7591, + "step": 28969 + }, + { + "epoch": 1.348767372023186, + "grad_norm": 0.3721615643285278, + "learning_rate": 6.716632202686493e-05, + "loss": 2.7654, + "step": 28970 + }, + { + "epoch": 1.348813930209279, + "grad_norm": 0.368593979701792, + "learning_rate": 6.71637779125898e-05, + "loss": 2.7582, + "step": 28971 + }, + { + "epoch": 1.348860488395372, + "grad_norm": 0.3332390909583944, + "learning_rate": 6.716123374794066e-05, + "loss": 2.7836, + "step": 28972 + }, + { + "epoch": 1.3489070465814652, + "grad_norm": 0.3683998794053614, + "learning_rate": 6.715868953292499e-05, + "loss": 2.9068, + "step": 28973 + }, + { + "epoch": 1.3489536047675583, + "grad_norm": 0.3467759216306372, + "learning_rate": 6.715614526755027e-05, + "loss": 2.7351, + "step": 28974 + }, + { + "epoch": 1.3490001629536512, + "grad_norm": 0.3294929424371066, + "learning_rate": 
6.715360095182396e-05, + "loss": 2.7879, + "step": 28975 + }, + { + "epoch": 1.3490467211397443, + "grad_norm": 0.37311882085672443, + "learning_rate": 6.715105658575352e-05, + "loss": 2.7914, + "step": 28976 + }, + { + "epoch": 1.3490932793258374, + "grad_norm": 0.3334474953386275, + "learning_rate": 6.71485121693464e-05, + "loss": 2.7089, + "step": 28977 + }, + { + "epoch": 1.3491398375119306, + "grad_norm": 0.3519689973749558, + "learning_rate": 6.714596770261011e-05, + "loss": 2.8255, + "step": 28978 + }, + { + "epoch": 1.3491863956980237, + "grad_norm": 0.35088066916596783, + "learning_rate": 6.714342318555208e-05, + "loss": 2.7529, + "step": 28979 + }, + { + "epoch": 1.3492329538841168, + "grad_norm": 0.35296969559970975, + "learning_rate": 6.71408786181798e-05, + "loss": 2.6826, + "step": 28980 + }, + { + "epoch": 1.3492795120702097, + "grad_norm": 0.3288109285719, + "learning_rate": 6.713833400050071e-05, + "loss": 2.6853, + "step": 28981 + }, + { + "epoch": 1.3493260702563028, + "grad_norm": 0.35672008292818186, + "learning_rate": 6.713578933252232e-05, + "loss": 2.682, + "step": 28982 + }, + { + "epoch": 1.349372628442396, + "grad_norm": 0.333042182977988, + "learning_rate": 6.713324461425207e-05, + "loss": 2.7258, + "step": 28983 + }, + { + "epoch": 1.349419186628489, + "grad_norm": 0.33432471103677397, + "learning_rate": 6.713069984569743e-05, + "loss": 2.7925, + "step": 28984 + }, + { + "epoch": 1.349465744814582, + "grad_norm": 0.3301882453240349, + "learning_rate": 6.712815502686587e-05, + "loss": 2.7576, + "step": 28985 + }, + { + "epoch": 1.349512303000675, + "grad_norm": 0.3140444428964354, + "learning_rate": 6.712561015776487e-05, + "loss": 2.759, + "step": 28986 + }, + { + "epoch": 1.3495588611867682, + "grad_norm": 0.33032315439236887, + "learning_rate": 6.71230652384019e-05, + "loss": 2.7317, + "step": 28987 + }, + { + "epoch": 1.3496054193728613, + "grad_norm": 0.3291455733524793, + "learning_rate": 6.71205202687844e-05, + "loss": 2.7422, + "step": 28988 + }, + { + "epoch": 1.3496519775589544, + "grad_norm": 0.3307360223078461, + "learning_rate": 6.711797524891987e-05, + "loss": 2.7566, + "step": 28989 + }, + { + "epoch": 1.3496985357450475, + "grad_norm": 0.33293414376772107, + "learning_rate": 6.711543017881577e-05, + "loss": 2.7716, + "step": 28990 + }, + { + "epoch": 1.3497450939311404, + "grad_norm": 0.33732580518490723, + "learning_rate": 6.711288505847957e-05, + "loss": 2.744, + "step": 28991 + }, + { + "epoch": 1.3497916521172335, + "grad_norm": 0.3352312801803231, + "learning_rate": 6.711033988791875e-05, + "loss": 2.762, + "step": 28992 + }, + { + "epoch": 1.3498382103033266, + "grad_norm": 0.3361486326420764, + "learning_rate": 6.710779466714075e-05, + "loss": 2.7402, + "step": 28993 + }, + { + "epoch": 1.3498847684894195, + "grad_norm": 0.3156599987488597, + "learning_rate": 6.710524939615306e-05, + "loss": 2.7557, + "step": 28994 + }, + { + "epoch": 1.3499313266755126, + "grad_norm": 0.3071578152899109, + "learning_rate": 6.710270407496315e-05, + "loss": 2.7357, + "step": 28995 + }, + { + "epoch": 1.3499778848616057, + "grad_norm": 0.3355849690257265, + "learning_rate": 6.710015870357848e-05, + "loss": 2.8873, + "step": 28996 + }, + { + "epoch": 1.3500244430476989, + "grad_norm": 0.3307595575342572, + "learning_rate": 6.709761328200654e-05, + "loss": 2.8345, + "step": 28997 + }, + { + "epoch": 1.350071001233792, + "grad_norm": 0.33264312398156576, + "learning_rate": 6.709506781025479e-05, + "loss": 2.8754, + "step": 28998 + }, + { + "epoch": 
1.350117559419885, + "grad_norm": 0.33734885082520405, + "learning_rate": 6.70925222883307e-05, + "loss": 2.7313, + "step": 28999 + }, + { + "epoch": 1.3501641176059782, + "grad_norm": 0.33477164487226363, + "learning_rate": 6.708997671624173e-05, + "loss": 2.8428, + "step": 29000 + }, + { + "epoch": 1.350210675792071, + "grad_norm": 0.3339599139810029, + "learning_rate": 6.708743109399538e-05, + "loss": 2.8725, + "step": 29001 + }, + { + "epoch": 1.3502572339781642, + "grad_norm": 0.31234552597424653, + "learning_rate": 6.70848854215991e-05, + "loss": 2.655, + "step": 29002 + }, + { + "epoch": 1.3503037921642573, + "grad_norm": 0.37916337044558424, + "learning_rate": 6.708233969906036e-05, + "loss": 2.6707, + "step": 29003 + }, + { + "epoch": 1.3503503503503502, + "grad_norm": 0.30545578715294514, + "learning_rate": 6.707979392638663e-05, + "loss": 2.7642, + "step": 29004 + }, + { + "epoch": 1.3503969085364433, + "grad_norm": 0.3567309875750391, + "learning_rate": 6.70772481035854e-05, + "loss": 2.7761, + "step": 29005 + }, + { + "epoch": 1.3504434667225365, + "grad_norm": 0.3050124515232016, + "learning_rate": 6.707470223066412e-05, + "loss": 2.6921, + "step": 29006 + }, + { + "epoch": 1.3504900249086296, + "grad_norm": 0.3530259769904021, + "learning_rate": 6.707215630763026e-05, + "loss": 2.8076, + "step": 29007 + }, + { + "epoch": 1.3505365830947227, + "grad_norm": 0.3346654933082411, + "learning_rate": 6.706961033449131e-05, + "loss": 2.871, + "step": 29008 + }, + { + "epoch": 1.3505831412808158, + "grad_norm": 0.3491716215190029, + "learning_rate": 6.706706431125474e-05, + "loss": 2.8128, + "step": 29009 + }, + { + "epoch": 1.3506296994669087, + "grad_norm": 0.33448674541814477, + "learning_rate": 6.706451823792803e-05, + "loss": 2.7468, + "step": 29010 + }, + { + "epoch": 1.3506762576530018, + "grad_norm": 0.3494366183508085, + "learning_rate": 6.706197211451862e-05, + "loss": 2.7079, + "step": 29011 + }, + { + "epoch": 1.350722815839095, + "grad_norm": 0.3492915003949005, + "learning_rate": 6.7059425941034e-05, + "loss": 2.6921, + "step": 29012 + }, + { + "epoch": 1.350769374025188, + "grad_norm": 0.33779549445513385, + "learning_rate": 6.705687971748167e-05, + "loss": 2.8563, + "step": 29013 + }, + { + "epoch": 1.350815932211281, + "grad_norm": 0.33746629448286347, + "learning_rate": 6.705433344386907e-05, + "loss": 2.7929, + "step": 29014 + }, + { + "epoch": 1.350862490397374, + "grad_norm": 0.35592314183870494, + "learning_rate": 6.705178712020367e-05, + "loss": 2.839, + "step": 29015 + }, + { + "epoch": 1.3509090485834672, + "grad_norm": 0.3360278443111862, + "learning_rate": 6.704924074649297e-05, + "loss": 2.7999, + "step": 29016 + }, + { + "epoch": 1.3509556067695603, + "grad_norm": 0.33093711508736745, + "learning_rate": 6.704669432274442e-05, + "loss": 2.85, + "step": 29017 + }, + { + "epoch": 1.3510021649556534, + "grad_norm": 0.3551818878070986, + "learning_rate": 6.704414784896548e-05, + "loss": 2.7099, + "step": 29018 + }, + { + "epoch": 1.3510487231417465, + "grad_norm": 0.32228805535632166, + "learning_rate": 6.704160132516368e-05, + "loss": 2.7782, + "step": 29019 + }, + { + "epoch": 1.3510952813278394, + "grad_norm": 0.35316319809122315, + "learning_rate": 6.703905475134645e-05, + "loss": 2.799, + "step": 29020 + }, + { + "epoch": 1.3511418395139325, + "grad_norm": 0.3506140319002881, + "learning_rate": 6.703650812752126e-05, + "loss": 2.7513, + "step": 29021 + }, + { + "epoch": 1.3511883977000256, + "grad_norm": 0.3222636937917864, + "learning_rate": 
6.703396145369561e-05, + "loss": 2.8302, + "step": 29022 + }, + { + "epoch": 1.3512349558861187, + "grad_norm": 0.3551746974919497, + "learning_rate": 6.703141472987695e-05, + "loss": 2.8031, + "step": 29023 + }, + { + "epoch": 1.3512815140722116, + "grad_norm": 0.3433780734443921, + "learning_rate": 6.702886795607277e-05, + "loss": 2.7329, + "step": 29024 + }, + { + "epoch": 1.3513280722583048, + "grad_norm": 0.3311199927340674, + "learning_rate": 6.702632113229054e-05, + "loss": 2.8129, + "step": 29025 + }, + { + "epoch": 1.3513746304443979, + "grad_norm": 0.3636738228768246, + "learning_rate": 6.702377425853773e-05, + "loss": 2.7474, + "step": 29026 + }, + { + "epoch": 1.351421188630491, + "grad_norm": 0.33889913150552103, + "learning_rate": 6.702122733482183e-05, + "loss": 2.7534, + "step": 29027 + }, + { + "epoch": 1.351467746816584, + "grad_norm": 0.32304124193022, + "learning_rate": 6.701868036115031e-05, + "loss": 2.6445, + "step": 29028 + }, + { + "epoch": 1.3515143050026772, + "grad_norm": 0.35461615046514183, + "learning_rate": 6.701613333753063e-05, + "loss": 2.829, + "step": 29029 + }, + { + "epoch": 1.3515608631887701, + "grad_norm": 0.3575150747022847, + "learning_rate": 6.701358626397028e-05, + "loss": 2.7487, + "step": 29030 + }, + { + "epoch": 1.3516074213748632, + "grad_norm": 0.3208639647665489, + "learning_rate": 6.701103914047672e-05, + "loss": 2.7578, + "step": 29031 + }, + { + "epoch": 1.3516539795609563, + "grad_norm": 0.3543003701821678, + "learning_rate": 6.700849196705746e-05, + "loss": 2.8284, + "step": 29032 + }, + { + "epoch": 1.3517005377470495, + "grad_norm": 0.3636945263168061, + "learning_rate": 6.700594474371991e-05, + "loss": 2.8583, + "step": 29033 + }, + { + "epoch": 1.3517470959331424, + "grad_norm": 0.3591225874214014, + "learning_rate": 6.700339747047162e-05, + "loss": 2.8379, + "step": 29034 + }, + { + "epoch": 1.3517936541192355, + "grad_norm": 0.34674835533205456, + "learning_rate": 6.700085014732002e-05, + "loss": 2.7872, + "step": 29035 + }, + { + "epoch": 1.3518402123053286, + "grad_norm": 0.31449374908525707, + "learning_rate": 6.69983027742726e-05, + "loss": 2.7235, + "step": 29036 + }, + { + "epoch": 1.3518867704914217, + "grad_norm": 0.3580761322620119, + "learning_rate": 6.699575535133684e-05, + "loss": 2.8535, + "step": 29037 + }, + { + "epoch": 1.3519333286775148, + "grad_norm": 0.3269927520834826, + "learning_rate": 6.699320787852021e-05, + "loss": 2.7226, + "step": 29038 + }, + { + "epoch": 1.351979886863608, + "grad_norm": 0.33546131750111274, + "learning_rate": 6.699066035583019e-05, + "loss": 2.7615, + "step": 29039 + }, + { + "epoch": 1.3520264450497008, + "grad_norm": 0.32670023741904025, + "learning_rate": 6.698811278327427e-05, + "loss": 2.7766, + "step": 29040 + }, + { + "epoch": 1.352073003235794, + "grad_norm": 0.36162621927547184, + "learning_rate": 6.69855651608599e-05, + "loss": 2.8216, + "step": 29041 + }, + { + "epoch": 1.352119561421887, + "grad_norm": 0.3100261743210483, + "learning_rate": 6.698301748859456e-05, + "loss": 2.7941, + "step": 29042 + }, + { + "epoch": 1.35216611960798, + "grad_norm": 0.3538175475583512, + "learning_rate": 6.698046976648576e-05, + "loss": 2.7254, + "step": 29043 + }, + { + "epoch": 1.352212677794073, + "grad_norm": 0.3154548990064645, + "learning_rate": 6.697792199454094e-05, + "loss": 2.7601, + "step": 29044 + }, + { + "epoch": 1.3522592359801662, + "grad_norm": 0.3712496854188142, + "learning_rate": 6.697537417276757e-05, + "loss": 2.7201, + "step": 29045 + }, + { + "epoch": 
1.3523057941662593, + "grad_norm": 0.31892524808407097, + "learning_rate": 6.697282630117319e-05, + "loss": 2.6535, + "step": 29046 + }, + { + "epoch": 1.3523523523523524, + "grad_norm": 0.32678305967423565, + "learning_rate": 6.697027837976522e-05, + "loss": 2.7863, + "step": 29047 + }, + { + "epoch": 1.3523989105384455, + "grad_norm": 0.33057076862410834, + "learning_rate": 6.696773040855114e-05, + "loss": 2.8102, + "step": 29048 + }, + { + "epoch": 1.3524454687245384, + "grad_norm": 0.33636908072484945, + "learning_rate": 6.696518238753846e-05, + "loss": 2.782, + "step": 29049 + }, + { + "epoch": 1.3524920269106315, + "grad_norm": 0.3355610550082539, + "learning_rate": 6.696263431673462e-05, + "loss": 2.6352, + "step": 29050 + }, + { + "epoch": 1.3525385850967246, + "grad_norm": 0.31975931950148684, + "learning_rate": 6.696008619614715e-05, + "loss": 2.8156, + "step": 29051 + }, + { + "epoch": 1.3525851432828178, + "grad_norm": 0.35442443910286103, + "learning_rate": 6.695753802578349e-05, + "loss": 2.787, + "step": 29052 + }, + { + "epoch": 1.3526317014689107, + "grad_norm": 0.31060762156045996, + "learning_rate": 6.695498980565112e-05, + "loss": 2.6565, + "step": 29053 + }, + { + "epoch": 1.3526782596550038, + "grad_norm": 0.3149643121287024, + "learning_rate": 6.695244153575753e-05, + "loss": 2.8572, + "step": 29054 + }, + { + "epoch": 1.3527248178410969, + "grad_norm": 0.34431571872696953, + "learning_rate": 6.694989321611019e-05, + "loss": 2.7331, + "step": 29055 + }, + { + "epoch": 1.35277137602719, + "grad_norm": 0.33284207817821354, + "learning_rate": 6.694734484671659e-05, + "loss": 2.8533, + "step": 29056 + }, + { + "epoch": 1.3528179342132831, + "grad_norm": 0.3333008346989517, + "learning_rate": 6.694479642758419e-05, + "loss": 2.8974, + "step": 29057 + }, + { + "epoch": 1.3528644923993762, + "grad_norm": 0.32937684273130824, + "learning_rate": 6.694224795872049e-05, + "loss": 2.7246, + "step": 29058 + }, + { + "epoch": 1.3529110505854691, + "grad_norm": 0.3299642350084328, + "learning_rate": 6.693969944013296e-05, + "loss": 2.7324, + "step": 29059 + }, + { + "epoch": 1.3529576087715622, + "grad_norm": 0.3133266421596068, + "learning_rate": 6.693715087182908e-05, + "loss": 2.7387, + "step": 29060 + }, + { + "epoch": 1.3530041669576554, + "grad_norm": 0.3150228774974911, + "learning_rate": 6.693460225381632e-05, + "loss": 2.7999, + "step": 29061 + }, + { + "epoch": 1.3530507251437485, + "grad_norm": 0.334329351646458, + "learning_rate": 6.693205358610218e-05, + "loss": 2.7452, + "step": 29062 + }, + { + "epoch": 1.3530972833298414, + "grad_norm": 0.3304064672069005, + "learning_rate": 6.692950486869414e-05, + "loss": 2.7783, + "step": 29063 + }, + { + "epoch": 1.3531438415159345, + "grad_norm": 0.3194235154748024, + "learning_rate": 6.692695610159965e-05, + "loss": 2.6715, + "step": 29064 + }, + { + "epoch": 1.3531903997020276, + "grad_norm": 0.33349385953858424, + "learning_rate": 6.692440728482622e-05, + "loss": 2.7371, + "step": 29065 + }, + { + "epoch": 1.3532369578881207, + "grad_norm": 0.33258726859311394, + "learning_rate": 6.692185841838135e-05, + "loss": 2.7376, + "step": 29066 + }, + { + "epoch": 1.3532835160742138, + "grad_norm": 0.3178713728192033, + "learning_rate": 6.691930950227246e-05, + "loss": 2.7155, + "step": 29067 + }, + { + "epoch": 1.353330074260307, + "grad_norm": 0.3234326172391391, + "learning_rate": 6.691676053650707e-05, + "loss": 2.709, + "step": 29068 + }, + { + "epoch": 1.3533766324463998, + "grad_norm": 0.3155799019326437, + "learning_rate": 
6.691421152109268e-05, + "loss": 2.8256, + "step": 29069 + }, + { + "epoch": 1.353423190632493, + "grad_norm": 0.3298911860929628, + "learning_rate": 6.69116624560367e-05, + "loss": 2.7631, + "step": 29070 + }, + { + "epoch": 1.353469748818586, + "grad_norm": 0.33677741031812297, + "learning_rate": 6.690911334134669e-05, + "loss": 2.8361, + "step": 29071 + }, + { + "epoch": 1.3535163070046792, + "grad_norm": 0.33955007144452226, + "learning_rate": 6.690656417703007e-05, + "loss": 2.7642, + "step": 29072 + }, + { + "epoch": 1.353562865190772, + "grad_norm": 0.3355099131953941, + "learning_rate": 6.690401496309438e-05, + "loss": 2.7672, + "step": 29073 + }, + { + "epoch": 1.3536094233768652, + "grad_norm": 0.3476780965110458, + "learning_rate": 6.690146569954706e-05, + "loss": 2.8062, + "step": 29074 + }, + { + "epoch": 1.3536559815629583, + "grad_norm": 0.324472478675393, + "learning_rate": 6.68989163863956e-05, + "loss": 2.7769, + "step": 29075 + }, + { + "epoch": 1.3537025397490514, + "grad_norm": 0.3135583992749703, + "learning_rate": 6.689636702364748e-05, + "loss": 2.6213, + "step": 29076 + }, + { + "epoch": 1.3537490979351445, + "grad_norm": 0.3456575054926991, + "learning_rate": 6.68938176113102e-05, + "loss": 2.7435, + "step": 29077 + }, + { + "epoch": 1.3537956561212376, + "grad_norm": 0.33099846253293963, + "learning_rate": 6.689126814939122e-05, + "loss": 2.6747, + "step": 29078 + }, + { + "epoch": 1.3538422143073305, + "grad_norm": 0.34671786543277183, + "learning_rate": 6.688871863789805e-05, + "loss": 2.8017, + "step": 29079 + }, + { + "epoch": 1.3538887724934237, + "grad_norm": 0.3473144375771951, + "learning_rate": 6.688616907683814e-05, + "loss": 2.7231, + "step": 29080 + }, + { + "epoch": 1.3539353306795168, + "grad_norm": 0.37601993974419373, + "learning_rate": 6.688361946621898e-05, + "loss": 2.7645, + "step": 29081 + }, + { + "epoch": 1.3539818888656097, + "grad_norm": 0.32728656865038186, + "learning_rate": 6.688106980604807e-05, + "loss": 2.741, + "step": 29082 + }, + { + "epoch": 1.3540284470517028, + "grad_norm": 0.3531379558466418, + "learning_rate": 6.687852009633288e-05, + "loss": 2.788, + "step": 29083 + }, + { + "epoch": 1.354075005237796, + "grad_norm": 0.3702552771273612, + "learning_rate": 6.68759703370809e-05, + "loss": 2.7126, + "step": 29084 + }, + { + "epoch": 1.354121563423889, + "grad_norm": 0.3378511639357303, + "learning_rate": 6.68734205282996e-05, + "loss": 2.7465, + "step": 29085 + }, + { + "epoch": 1.3541681216099821, + "grad_norm": 0.3630223411577148, + "learning_rate": 6.687087066999648e-05, + "loss": 2.8767, + "step": 29086 + }, + { + "epoch": 1.3542146797960752, + "grad_norm": 0.393117274612425, + "learning_rate": 6.686832076217902e-05, + "loss": 2.6982, + "step": 29087 + }, + { + "epoch": 1.3542612379821684, + "grad_norm": 0.3456214987360955, + "learning_rate": 6.68657708048547e-05, + "loss": 2.7916, + "step": 29088 + }, + { + "epoch": 1.3543077961682612, + "grad_norm": 0.3939607214292642, + "learning_rate": 6.6863220798031e-05, + "loss": 2.7276, + "step": 29089 + }, + { + "epoch": 1.3543543543543544, + "grad_norm": 0.31160429605047, + "learning_rate": 6.68606707417154e-05, + "loss": 2.7329, + "step": 29090 + }, + { + "epoch": 1.3544009125404475, + "grad_norm": 0.35105981477681675, + "learning_rate": 6.685812063591542e-05, + "loss": 2.7086, + "step": 29091 + }, + { + "epoch": 1.3544474707265404, + "grad_norm": 0.33385624286500454, + "learning_rate": 6.68555704806385e-05, + "loss": 2.6961, + "step": 29092 + }, + { + "epoch": 
1.3544940289126335, + "grad_norm": 0.32460725628892306, + "learning_rate": 6.685302027589214e-05, + "loss": 2.6337, + "step": 29093 + }, + { + "epoch": 1.3545405870987266, + "grad_norm": 0.3630955734632527, + "learning_rate": 6.685047002168382e-05, + "loss": 2.7609, + "step": 29094 + }, + { + "epoch": 1.3545871452848197, + "grad_norm": 0.3193639835560093, + "learning_rate": 6.684791971802104e-05, + "loss": 2.8375, + "step": 29095 + }, + { + "epoch": 1.3546337034709128, + "grad_norm": 0.34989534808582895, + "learning_rate": 6.684536936491127e-05, + "loss": 2.7855, + "step": 29096 + }, + { + "epoch": 1.354680261657006, + "grad_norm": 0.3367078911153691, + "learning_rate": 6.684281896236199e-05, + "loss": 2.7523, + "step": 29097 + }, + { + "epoch": 1.3547268198430988, + "grad_norm": 0.3163658223729935, + "learning_rate": 6.684026851038072e-05, + "loss": 2.6685, + "step": 29098 + }, + { + "epoch": 1.354773378029192, + "grad_norm": 0.3199443958968531, + "learning_rate": 6.68377180089749e-05, + "loss": 2.7448, + "step": 29099 + }, + { + "epoch": 1.354819936215285, + "grad_norm": 0.3525813776758031, + "learning_rate": 6.683516745815206e-05, + "loss": 2.9093, + "step": 29100 + }, + { + "epoch": 1.3548664944013782, + "grad_norm": 0.3188611810089747, + "learning_rate": 6.683261685791964e-05, + "loss": 2.8329, + "step": 29101 + }, + { + "epoch": 1.354913052587471, + "grad_norm": 0.33617697350846937, + "learning_rate": 6.683006620828516e-05, + "loss": 2.654, + "step": 29102 + }, + { + "epoch": 1.3549596107735642, + "grad_norm": 0.3528990601865813, + "learning_rate": 6.682751550925608e-05, + "loss": 2.8418, + "step": 29103 + }, + { + "epoch": 1.3550061689596573, + "grad_norm": 0.3226496360253956, + "learning_rate": 6.682496476083991e-05, + "loss": 2.7328, + "step": 29104 + }, + { + "epoch": 1.3550527271457504, + "grad_norm": 0.3399347153563513, + "learning_rate": 6.682241396304413e-05, + "loss": 2.7494, + "step": 29105 + }, + { + "epoch": 1.3550992853318435, + "grad_norm": 0.32595721075831696, + "learning_rate": 6.681986311587623e-05, + "loss": 2.7787, + "step": 29106 + }, + { + "epoch": 1.3551458435179367, + "grad_norm": 0.3231848589140375, + "learning_rate": 6.681731221934366e-05, + "loss": 2.7445, + "step": 29107 + }, + { + "epoch": 1.3551924017040295, + "grad_norm": 0.3170441924133103, + "learning_rate": 6.681476127345396e-05, + "loss": 2.7567, + "step": 29108 + }, + { + "epoch": 1.3552389598901227, + "grad_norm": 0.3158651572803637, + "learning_rate": 6.681221027821458e-05, + "loss": 2.7965, + "step": 29109 + }, + { + "epoch": 1.3552855180762158, + "grad_norm": 0.3220582209517199, + "learning_rate": 6.680965923363303e-05, + "loss": 2.8038, + "step": 29110 + }, + { + "epoch": 1.355332076262309, + "grad_norm": 0.34422907974899003, + "learning_rate": 6.680710813971677e-05, + "loss": 2.7357, + "step": 29111 + }, + { + "epoch": 1.3553786344484018, + "grad_norm": 0.31173832994913486, + "learning_rate": 6.680455699647331e-05, + "loss": 2.7445, + "step": 29112 + }, + { + "epoch": 1.355425192634495, + "grad_norm": 0.32955011688730657, + "learning_rate": 6.680200580391013e-05, + "loss": 2.7841, + "step": 29113 + }, + { + "epoch": 1.355471750820588, + "grad_norm": 0.32589413902699815, + "learning_rate": 6.679945456203472e-05, + "loss": 2.8242, + "step": 29114 + }, + { + "epoch": 1.3555183090066811, + "grad_norm": 0.36710638501175186, + "learning_rate": 6.679690327085457e-05, + "loss": 2.7771, + "step": 29115 + }, + { + "epoch": 1.3555648671927742, + "grad_norm": 0.3066571164399902, + "learning_rate": 
6.679435193037716e-05, + "loss": 2.7648, + "step": 29116 + }, + { + "epoch": 1.3556114253788674, + "grad_norm": 0.3643148337055663, + "learning_rate": 6.679180054060999e-05, + "loss": 2.771, + "step": 29117 + }, + { + "epoch": 1.3556579835649603, + "grad_norm": 0.35316089633343956, + "learning_rate": 6.678924910156053e-05, + "loss": 2.7833, + "step": 29118 + }, + { + "epoch": 1.3557045417510534, + "grad_norm": 0.3485434963693245, + "learning_rate": 6.678669761323628e-05, + "loss": 2.7984, + "step": 29119 + }, + { + "epoch": 1.3557510999371465, + "grad_norm": 0.326862055980581, + "learning_rate": 6.678414607564472e-05, + "loss": 2.7262, + "step": 29120 + }, + { + "epoch": 1.3557976581232396, + "grad_norm": 0.3533259439820182, + "learning_rate": 6.678159448879335e-05, + "loss": 2.7945, + "step": 29121 + }, + { + "epoch": 1.3558442163093325, + "grad_norm": 0.33686057795102464, + "learning_rate": 6.677904285268965e-05, + "loss": 2.6574, + "step": 29122 + }, + { + "epoch": 1.3558907744954256, + "grad_norm": 0.3173768803639268, + "learning_rate": 6.677649116734113e-05, + "loss": 2.8331, + "step": 29123 + }, + { + "epoch": 1.3559373326815187, + "grad_norm": 0.37650520302782003, + "learning_rate": 6.677393943275525e-05, + "loss": 2.8332, + "step": 29124 + }, + { + "epoch": 1.3559838908676118, + "grad_norm": 0.3711513330854503, + "learning_rate": 6.67713876489395e-05, + "loss": 2.769, + "step": 29125 + }, + { + "epoch": 1.356030449053705, + "grad_norm": 0.37078383818942795, + "learning_rate": 6.676883581590139e-05, + "loss": 2.843, + "step": 29126 + }, + { + "epoch": 1.356077007239798, + "grad_norm": 0.3606485725156157, + "learning_rate": 6.67662839336484e-05, + "loss": 2.7808, + "step": 29127 + }, + { + "epoch": 1.356123565425891, + "grad_norm": 0.3573810857262784, + "learning_rate": 6.676373200218801e-05, + "loss": 2.782, + "step": 29128 + }, + { + "epoch": 1.356170123611984, + "grad_norm": 0.3418837624985374, + "learning_rate": 6.676118002152771e-05, + "loss": 2.758, + "step": 29129 + }, + { + "epoch": 1.3562166817980772, + "grad_norm": 0.39896258700144244, + "learning_rate": 6.675862799167503e-05, + "loss": 2.8064, + "step": 29130 + }, + { + "epoch": 1.35626323998417, + "grad_norm": 0.3641877594340962, + "learning_rate": 6.67560759126374e-05, + "loss": 2.661, + "step": 29131 + }, + { + "epoch": 1.3563097981702632, + "grad_norm": 0.36476360797650864, + "learning_rate": 6.675352378442236e-05, + "loss": 2.8266, + "step": 29132 + }, + { + "epoch": 1.3563563563563563, + "grad_norm": 0.36165954260606015, + "learning_rate": 6.675097160703736e-05, + "loss": 2.8295, + "step": 29133 + }, + { + "epoch": 1.3564029145424494, + "grad_norm": 0.324957095762274, + "learning_rate": 6.674841938048991e-05, + "loss": 2.7138, + "step": 29134 + }, + { + "epoch": 1.3564494727285425, + "grad_norm": 0.35445781261036846, + "learning_rate": 6.67458671047875e-05, + "loss": 2.7524, + "step": 29135 + }, + { + "epoch": 1.3564960309146357, + "grad_norm": 0.3482471753834887, + "learning_rate": 6.674331477993762e-05, + "loss": 2.7041, + "step": 29136 + }, + { + "epoch": 1.3565425891007286, + "grad_norm": 0.3259220435037308, + "learning_rate": 6.674076240594778e-05, + "loss": 2.8672, + "step": 29137 + }, + { + "epoch": 1.3565891472868217, + "grad_norm": 0.345770220867675, + "learning_rate": 6.673820998282542e-05, + "loss": 2.7956, + "step": 29138 + }, + { + "epoch": 1.3566357054729148, + "grad_norm": 0.336136951936312, + "learning_rate": 6.673565751057808e-05, + "loss": 2.7678, + "step": 29139 + }, + { + "epoch": 
1.356682263659008, + "grad_norm": 0.3221246526899194, + "learning_rate": 6.673310498921322e-05, + "loss": 2.7398, + "step": 29140 + }, + { + "epoch": 1.3567288218451008, + "grad_norm": 0.3285724294237721, + "learning_rate": 6.673055241873836e-05, + "loss": 2.7591, + "step": 29141 + }, + { + "epoch": 1.356775380031194, + "grad_norm": 0.32118072683672505, + "learning_rate": 6.672799979916098e-05, + "loss": 2.7695, + "step": 29142 + }, + { + "epoch": 1.356821938217287, + "grad_norm": 0.31102598431621425, + "learning_rate": 6.672544713048855e-05, + "loss": 2.7989, + "step": 29143 + }, + { + "epoch": 1.3568684964033801, + "grad_norm": 0.3282341240992049, + "learning_rate": 6.672289441272862e-05, + "loss": 2.8105, + "step": 29144 + }, + { + "epoch": 1.3569150545894733, + "grad_norm": 0.318921535293034, + "learning_rate": 6.672034164588861e-05, + "loss": 2.8438, + "step": 29145 + }, + { + "epoch": 1.3569616127755664, + "grad_norm": 0.31226764633321785, + "learning_rate": 6.671778882997605e-05, + "loss": 2.6992, + "step": 29146 + }, + { + "epoch": 1.3570081709616593, + "grad_norm": 0.3115006317850899, + "learning_rate": 6.671523596499842e-05, + "loss": 2.6664, + "step": 29147 + }, + { + "epoch": 1.3570547291477524, + "grad_norm": 0.30263128845881776, + "learning_rate": 6.671268305096322e-05, + "loss": 2.7208, + "step": 29148 + }, + { + "epoch": 1.3571012873338455, + "grad_norm": 0.33316426650048603, + "learning_rate": 6.671013008787795e-05, + "loss": 2.7273, + "step": 29149 + }, + { + "epoch": 1.3571478455199386, + "grad_norm": 0.3130393659462977, + "learning_rate": 6.67075770757501e-05, + "loss": 2.7904, + "step": 29150 + }, + { + "epoch": 1.3571944037060315, + "grad_norm": 0.3169536935700468, + "learning_rate": 6.670502401458713e-05, + "loss": 2.7446, + "step": 29151 + }, + { + "epoch": 1.3572409618921246, + "grad_norm": 0.33395588860199277, + "learning_rate": 6.670247090439658e-05, + "loss": 2.7695, + "step": 29152 + }, + { + "epoch": 1.3572875200782177, + "grad_norm": 0.29329123310190364, + "learning_rate": 6.669991774518592e-05, + "loss": 2.5894, + "step": 29153 + }, + { + "epoch": 1.3573340782643109, + "grad_norm": 0.36206203706020396, + "learning_rate": 6.669736453696266e-05, + "loss": 2.7656, + "step": 29154 + }, + { + "epoch": 1.357380636450404, + "grad_norm": 0.3062606774279148, + "learning_rate": 6.669481127973426e-05, + "loss": 2.7071, + "step": 29155 + }, + { + "epoch": 1.357427194636497, + "grad_norm": 0.36220944761063517, + "learning_rate": 6.669225797350826e-05, + "loss": 2.8124, + "step": 29156 + }, + { + "epoch": 1.35747375282259, + "grad_norm": 0.3071714705236992, + "learning_rate": 6.668970461829211e-05, + "loss": 2.843, + "step": 29157 + }, + { + "epoch": 1.357520311008683, + "grad_norm": 0.32337393761816813, + "learning_rate": 6.66871512140933e-05, + "loss": 2.7836, + "step": 29158 + }, + { + "epoch": 1.3575668691947762, + "grad_norm": 0.3353848454904254, + "learning_rate": 6.668459776091938e-05, + "loss": 2.7641, + "step": 29159 + }, + { + "epoch": 1.3576134273808693, + "grad_norm": 0.31901719121245875, + "learning_rate": 6.66820442587778e-05, + "loss": 2.7971, + "step": 29160 + }, + { + "epoch": 1.3576599855669622, + "grad_norm": 0.3176856275435756, + "learning_rate": 6.667949070767605e-05, + "loss": 2.7912, + "step": 29161 + }, + { + "epoch": 1.3577065437530553, + "grad_norm": 0.344371443629426, + "learning_rate": 6.667693710762164e-05, + "loss": 2.7531, + "step": 29162 + }, + { + "epoch": 1.3577531019391484, + "grad_norm": 0.31516708496547446, + "learning_rate": 
6.667438345862208e-05, + "loss": 2.9262, + "step": 29163 + }, + { + "epoch": 1.3577996601252416, + "grad_norm": 0.3376091697763398, + "learning_rate": 6.667182976068483e-05, + "loss": 2.8674, + "step": 29164 + }, + { + "epoch": 1.3578462183113347, + "grad_norm": 0.31441679610074325, + "learning_rate": 6.66692760138174e-05, + "loss": 2.8446, + "step": 29165 + }, + { + "epoch": 1.3578927764974278, + "grad_norm": 0.35108635594344967, + "learning_rate": 6.66667222180273e-05, + "loss": 2.6713, + "step": 29166 + }, + { + "epoch": 1.3579393346835207, + "grad_norm": 0.3101943645082529, + "learning_rate": 6.666416837332201e-05, + "loss": 2.7466, + "step": 29167 + }, + { + "epoch": 1.3579858928696138, + "grad_norm": 0.33277490218224703, + "learning_rate": 6.666161447970904e-05, + "loss": 2.6458, + "step": 29168 + }, + { + "epoch": 1.358032451055707, + "grad_norm": 0.3134478850927337, + "learning_rate": 6.665906053719586e-05, + "loss": 2.6657, + "step": 29169 + }, + { + "epoch": 1.3580790092417998, + "grad_norm": 0.313054718122094, + "learning_rate": 6.665650654578997e-05, + "loss": 2.8308, + "step": 29170 + }, + { + "epoch": 1.358125567427893, + "grad_norm": 0.3386464081660819, + "learning_rate": 6.665395250549889e-05, + "loss": 2.7513, + "step": 29171 + }, + { + "epoch": 1.358172125613986, + "grad_norm": 0.30742912267771133, + "learning_rate": 6.665139841633007e-05, + "loss": 2.725, + "step": 29172 + }, + { + "epoch": 1.3582186838000792, + "grad_norm": 0.3324261277084952, + "learning_rate": 6.664884427829107e-05, + "loss": 2.7637, + "step": 29173 + }, + { + "epoch": 1.3582652419861723, + "grad_norm": 0.32472978551776055, + "learning_rate": 6.664629009138934e-05, + "loss": 2.8049, + "step": 29174 + }, + { + "epoch": 1.3583118001722654, + "grad_norm": 0.31811997383204826, + "learning_rate": 6.664373585563239e-05, + "loss": 2.7944, + "step": 29175 + }, + { + "epoch": 1.3583583583583583, + "grad_norm": 0.3451802005490732, + "learning_rate": 6.66411815710277e-05, + "loss": 2.7458, + "step": 29176 + }, + { + "epoch": 1.3584049165444514, + "grad_norm": 0.3176575363516919, + "learning_rate": 6.66386272375828e-05, + "loss": 2.7521, + "step": 29177 + }, + { + "epoch": 1.3584514747305445, + "grad_norm": 0.34445257822630587, + "learning_rate": 6.663607285530517e-05, + "loss": 2.7854, + "step": 29178 + }, + { + "epoch": 1.3584980329166376, + "grad_norm": 0.3333108687249039, + "learning_rate": 6.663351842420229e-05, + "loss": 2.7032, + "step": 29179 + }, + { + "epoch": 1.3585445911027305, + "grad_norm": 0.31502713583115854, + "learning_rate": 6.663096394428169e-05, + "loss": 2.7699, + "step": 29180 + }, + { + "epoch": 1.3585911492888236, + "grad_norm": 0.36748794865574863, + "learning_rate": 6.662840941555083e-05, + "loss": 2.6769, + "step": 29181 + }, + { + "epoch": 1.3586377074749167, + "grad_norm": 0.33972645639733984, + "learning_rate": 6.662585483801725e-05, + "loss": 2.6661, + "step": 29182 + }, + { + "epoch": 1.3586842656610099, + "grad_norm": 0.3406290801592735, + "learning_rate": 6.66233002116884e-05, + "loss": 2.7845, + "step": 29183 + }, + { + "epoch": 1.358730823847103, + "grad_norm": 0.33886957025736725, + "learning_rate": 6.66207455365718e-05, + "loss": 2.7817, + "step": 29184 + }, + { + "epoch": 1.358777382033196, + "grad_norm": 0.34879118354444005, + "learning_rate": 6.661819081267498e-05, + "loss": 2.7606, + "step": 29185 + }, + { + "epoch": 1.358823940219289, + "grad_norm": 0.3247700937338528, + "learning_rate": 6.661563604000538e-05, + "loss": 2.7588, + "step": 29186 + }, + { + "epoch": 
1.358870498405382, + "grad_norm": 0.37321599981012643, + "learning_rate": 6.661308121857053e-05, + "loss": 2.7956, + "step": 29187 + }, + { + "epoch": 1.3589170565914752, + "grad_norm": 0.3140021503054861, + "learning_rate": 6.661052634837792e-05, + "loss": 2.7204, + "step": 29188 + }, + { + "epoch": 1.3589636147775683, + "grad_norm": 0.32231119431427047, + "learning_rate": 6.660797142943507e-05, + "loss": 2.7648, + "step": 29189 + }, + { + "epoch": 1.3590101729636612, + "grad_norm": 0.3687877135144469, + "learning_rate": 6.660541646174945e-05, + "loss": 2.6946, + "step": 29190 + }, + { + "epoch": 1.3590567311497543, + "grad_norm": 0.3187327279815482, + "learning_rate": 6.660286144532856e-05, + "loss": 2.7144, + "step": 29191 + }, + { + "epoch": 1.3591032893358475, + "grad_norm": 0.33111497811765866, + "learning_rate": 6.660030638017991e-05, + "loss": 2.7519, + "step": 29192 + }, + { + "epoch": 1.3591498475219406, + "grad_norm": 0.37418321464953996, + "learning_rate": 6.6597751266311e-05, + "loss": 2.8687, + "step": 29193 + }, + { + "epoch": 1.3591964057080337, + "grad_norm": 0.3448708453142041, + "learning_rate": 6.659519610372933e-05, + "loss": 2.7308, + "step": 29194 + }, + { + "epoch": 1.3592429638941268, + "grad_norm": 0.368510970601577, + "learning_rate": 6.659264089244239e-05, + "loss": 2.7868, + "step": 29195 + }, + { + "epoch": 1.3592895220802197, + "grad_norm": 0.370214797835829, + "learning_rate": 6.659008563245769e-05, + "loss": 2.7565, + "step": 29196 + }, + { + "epoch": 1.3593360802663128, + "grad_norm": 0.3481682680959652, + "learning_rate": 6.658753032378271e-05, + "loss": 2.7883, + "step": 29197 + }, + { + "epoch": 1.359382638452406, + "grad_norm": 0.3629548383856979, + "learning_rate": 6.658497496642496e-05, + "loss": 2.7098, + "step": 29198 + }, + { + "epoch": 1.359429196638499, + "grad_norm": 0.3535334924914258, + "learning_rate": 6.658241956039196e-05, + "loss": 2.7246, + "step": 29199 + }, + { + "epoch": 1.359475754824592, + "grad_norm": 0.3441170765147799, + "learning_rate": 6.657986410569118e-05, + "loss": 2.7866, + "step": 29200 + }, + { + "epoch": 1.359522313010685, + "grad_norm": 0.3534448682101108, + "learning_rate": 6.657730860233013e-05, + "loss": 2.7498, + "step": 29201 + }, + { + "epoch": 1.3595688711967782, + "grad_norm": 0.3416929818359198, + "learning_rate": 6.657475305031632e-05, + "loss": 2.756, + "step": 29202 + }, + { + "epoch": 1.3596154293828713, + "grad_norm": 0.36413583090943574, + "learning_rate": 6.657219744965724e-05, + "loss": 2.7433, + "step": 29203 + }, + { + "epoch": 1.3596619875689644, + "grad_norm": 0.3312381042980602, + "learning_rate": 6.656964180036041e-05, + "loss": 2.7236, + "step": 29204 + }, + { + "epoch": 1.3597085457550575, + "grad_norm": 0.35226311231568813, + "learning_rate": 6.656708610243329e-05, + "loss": 2.7261, + "step": 29205 + }, + { + "epoch": 1.3597551039411504, + "grad_norm": 0.38476688255288233, + "learning_rate": 6.656453035588342e-05, + "loss": 2.7562, + "step": 29206 + }, + { + "epoch": 1.3598016621272435, + "grad_norm": 0.3442332182389249, + "learning_rate": 6.656197456071826e-05, + "loss": 2.6731, + "step": 29207 + }, + { + "epoch": 1.3598482203133366, + "grad_norm": 0.3487865132989662, + "learning_rate": 6.655941871694536e-05, + "loss": 2.7226, + "step": 29208 + }, + { + "epoch": 1.3598947784994295, + "grad_norm": 0.35187938757678144, + "learning_rate": 6.65568628245722e-05, + "loss": 2.7209, + "step": 29209 + }, + { + "epoch": 1.3599413366855226, + "grad_norm": 0.3282631733627652, + "learning_rate": 
6.655430688360627e-05, + "loss": 2.6935, + "step": 29210 + }, + { + "epoch": 1.3599878948716158, + "grad_norm": 0.34691301831134225, + "learning_rate": 6.655175089405506e-05, + "loss": 2.805, + "step": 29211 + }, + { + "epoch": 1.3600344530577089, + "grad_norm": 0.321367979222349, + "learning_rate": 6.654919485592613e-05, + "loss": 2.7948, + "step": 29212 + }, + { + "epoch": 1.360081011243802, + "grad_norm": 0.3328096221792693, + "learning_rate": 6.654663876922692e-05, + "loss": 2.6717, + "step": 29213 + }, + { + "epoch": 1.360127569429895, + "grad_norm": 0.3139208250995226, + "learning_rate": 6.654408263396496e-05, + "loss": 2.8412, + "step": 29214 + }, + { + "epoch": 1.3601741276159882, + "grad_norm": 0.3729857687911069, + "learning_rate": 6.654152645014774e-05, + "loss": 2.7408, + "step": 29215 + }, + { + "epoch": 1.360220685802081, + "grad_norm": 0.33530474803360333, + "learning_rate": 6.653897021778277e-05, + "loss": 2.7058, + "step": 29216 + }, + { + "epoch": 1.3602672439881742, + "grad_norm": 0.33893527214208846, + "learning_rate": 6.653641393687757e-05, + "loss": 2.7711, + "step": 29217 + }, + { + "epoch": 1.3603138021742673, + "grad_norm": 0.35993999059887366, + "learning_rate": 6.653385760743961e-05, + "loss": 2.7406, + "step": 29218 + }, + { + "epoch": 1.3603603603603602, + "grad_norm": 0.34089698279825975, + "learning_rate": 6.653130122947641e-05, + "loss": 2.7664, + "step": 29219 + }, + { + "epoch": 1.3604069185464533, + "grad_norm": 0.3191370212787387, + "learning_rate": 6.652874480299547e-05, + "loss": 2.7571, + "step": 29220 + }, + { + "epoch": 1.3604534767325465, + "grad_norm": 0.32822711459411974, + "learning_rate": 6.65261883280043e-05, + "loss": 2.8336, + "step": 29221 + }, + { + "epoch": 1.3605000349186396, + "grad_norm": 0.31433371121307613, + "learning_rate": 6.652363180451039e-05, + "loss": 2.7642, + "step": 29222 + }, + { + "epoch": 1.3605465931047327, + "grad_norm": 0.3435022720044361, + "learning_rate": 6.652107523252124e-05, + "loss": 2.8365, + "step": 29223 + }, + { + "epoch": 1.3605931512908258, + "grad_norm": 0.34148648915356833, + "learning_rate": 6.651851861204437e-05, + "loss": 2.7379, + "step": 29224 + }, + { + "epoch": 1.3606397094769187, + "grad_norm": 0.3179397188868624, + "learning_rate": 6.651596194308727e-05, + "loss": 2.6869, + "step": 29225 + }, + { + "epoch": 1.3606862676630118, + "grad_norm": 0.32985021699367534, + "learning_rate": 6.651340522565745e-05, + "loss": 2.8548, + "step": 29226 + }, + { + "epoch": 1.360732825849105, + "grad_norm": 0.3332251422428236, + "learning_rate": 6.651084845976241e-05, + "loss": 2.6759, + "step": 29227 + }, + { + "epoch": 1.360779384035198, + "grad_norm": 0.35625692939404763, + "learning_rate": 6.650829164540965e-05, + "loss": 2.8981, + "step": 29228 + }, + { + "epoch": 1.360825942221291, + "grad_norm": 0.3532757680496382, + "learning_rate": 6.65057347826067e-05, + "loss": 2.7663, + "step": 29229 + }, + { + "epoch": 1.360872500407384, + "grad_norm": 0.36004782112530986, + "learning_rate": 6.650317787136103e-05, + "loss": 2.7234, + "step": 29230 + }, + { + "epoch": 1.3609190585934772, + "grad_norm": 0.368867924600945, + "learning_rate": 6.650062091168016e-05, + "loss": 2.6861, + "step": 29231 + }, + { + "epoch": 1.3609656167795703, + "grad_norm": 0.32501171419382174, + "learning_rate": 6.649806390357162e-05, + "loss": 2.8099, + "step": 29232 + }, + { + "epoch": 1.3610121749656634, + "grad_norm": 0.37526618562771547, + "learning_rate": 6.649550684704286e-05, + "loss": 2.7771, + "step": 29233 + }, + { + "epoch": 
1.3610587331517565, + "grad_norm": 0.3528427502270014, + "learning_rate": 6.649294974210141e-05, + "loss": 2.7339, + "step": 29234 + }, + { + "epoch": 1.3611052913378494, + "grad_norm": 0.3364139462328825, + "learning_rate": 6.64903925887548e-05, + "loss": 2.8127, + "step": 29235 + }, + { + "epoch": 1.3611518495239425, + "grad_norm": 0.35108028650107287, + "learning_rate": 6.648783538701049e-05, + "loss": 2.8132, + "step": 29236 + }, + { + "epoch": 1.3611984077100356, + "grad_norm": 0.3210448782993621, + "learning_rate": 6.648527813687601e-05, + "loss": 2.6529, + "step": 29237 + }, + { + "epoch": 1.3612449658961288, + "grad_norm": 0.3340298821539012, + "learning_rate": 6.648272083835886e-05, + "loss": 2.9208, + "step": 29238 + }, + { + "epoch": 1.3612915240822216, + "grad_norm": 0.3615914906052284, + "learning_rate": 6.648016349146655e-05, + "loss": 2.8489, + "step": 29239 + }, + { + "epoch": 1.3613380822683148, + "grad_norm": 0.340037728673866, + "learning_rate": 6.64776060962066e-05, + "loss": 2.8423, + "step": 29240 + }, + { + "epoch": 1.3613846404544079, + "grad_norm": 0.35726048977941316, + "learning_rate": 6.647504865258649e-05, + "loss": 2.7412, + "step": 29241 + }, + { + "epoch": 1.361431198640501, + "grad_norm": 0.3486155417771274, + "learning_rate": 6.647249116061372e-05, + "loss": 2.739, + "step": 29242 + }, + { + "epoch": 1.361477756826594, + "grad_norm": 0.3613479674652074, + "learning_rate": 6.646993362029584e-05, + "loss": 2.7709, + "step": 29243 + }, + { + "epoch": 1.3615243150126872, + "grad_norm": 0.3460222470394564, + "learning_rate": 6.64673760316403e-05, + "loss": 2.823, + "step": 29244 + }, + { + "epoch": 1.3615708731987801, + "grad_norm": 0.37659169677137, + "learning_rate": 6.646481839465466e-05, + "loss": 2.7729, + "step": 29245 + }, + { + "epoch": 1.3616174313848732, + "grad_norm": 0.34167044211078584, + "learning_rate": 6.646226070934638e-05, + "loss": 2.7932, + "step": 29246 + }, + { + "epoch": 1.3616639895709663, + "grad_norm": 0.3624471920251959, + "learning_rate": 6.6459702975723e-05, + "loss": 2.8575, + "step": 29247 + }, + { + "epoch": 1.3617105477570595, + "grad_norm": 0.36731498778356947, + "learning_rate": 6.6457145193792e-05, + "loss": 2.8288, + "step": 29248 + }, + { + "epoch": 1.3617571059431524, + "grad_norm": 0.32847722603844687, + "learning_rate": 6.645458736356092e-05, + "loss": 2.6107, + "step": 29249 + }, + { + "epoch": 1.3618036641292455, + "grad_norm": 0.31363664277742076, + "learning_rate": 6.645202948503722e-05, + "loss": 2.77, + "step": 29250 + }, + { + "epoch": 1.3618502223153386, + "grad_norm": 0.3685793264307039, + "learning_rate": 6.644947155822845e-05, + "loss": 2.7699, + "step": 29251 + }, + { + "epoch": 1.3618967805014317, + "grad_norm": 0.3443563928025387, + "learning_rate": 6.644691358314209e-05, + "loss": 2.8302, + "step": 29252 + }, + { + "epoch": 1.3619433386875248, + "grad_norm": 0.37340518306580406, + "learning_rate": 6.644435555978568e-05, + "loss": 2.7222, + "step": 29253 + }, + { + "epoch": 1.361989896873618, + "grad_norm": 0.34412632407507077, + "learning_rate": 6.644179748816669e-05, + "loss": 2.8317, + "step": 29254 + }, + { + "epoch": 1.3620364550597108, + "grad_norm": 0.3337259052732803, + "learning_rate": 6.643923936829264e-05, + "loss": 2.7789, + "step": 29255 + }, + { + "epoch": 1.362083013245804, + "grad_norm": 0.36359194167657694, + "learning_rate": 6.643668120017106e-05, + "loss": 2.8806, + "step": 29256 + }, + { + "epoch": 1.362129571431897, + "grad_norm": 0.3338369084789408, + "learning_rate": 
6.643412298380944e-05, + "loss": 2.8038, + "step": 29257 + }, + { + "epoch": 1.36217612961799, + "grad_norm": 0.32501585315833054, + "learning_rate": 6.643156471921528e-05, + "loss": 2.686, + "step": 29258 + }, + { + "epoch": 1.362222687804083, + "grad_norm": 0.36839493206012686, + "learning_rate": 6.64290064063961e-05, + "loss": 2.7342, + "step": 29259 + }, + { + "epoch": 1.3622692459901762, + "grad_norm": 0.3319092263034211, + "learning_rate": 6.642644804535938e-05, + "loss": 2.7791, + "step": 29260 + }, + { + "epoch": 1.3623158041762693, + "grad_norm": 0.32683706038562665, + "learning_rate": 6.642388963611267e-05, + "loss": 2.7808, + "step": 29261 + }, + { + "epoch": 1.3623623623623624, + "grad_norm": 0.35156653248956354, + "learning_rate": 6.642133117866347e-05, + "loss": 2.754, + "step": 29262 + }, + { + "epoch": 1.3624089205484555, + "grad_norm": 0.3849916462272545, + "learning_rate": 6.641877267301928e-05, + "loss": 2.6901, + "step": 29263 + }, + { + "epoch": 1.3624554787345484, + "grad_norm": 0.37112518735020156, + "learning_rate": 6.64162141191876e-05, + "loss": 2.6912, + "step": 29264 + }, + { + "epoch": 1.3625020369206415, + "grad_norm": 0.36504145166145935, + "learning_rate": 6.641365551717596e-05, + "loss": 2.7585, + "step": 29265 + }, + { + "epoch": 1.3625485951067347, + "grad_norm": 0.34480365583477135, + "learning_rate": 6.641109686699184e-05, + "loss": 2.7936, + "step": 29266 + }, + { + "epoch": 1.3625951532928278, + "grad_norm": 0.33709107146607453, + "learning_rate": 6.640853816864279e-05, + "loss": 2.6404, + "step": 29267 + }, + { + "epoch": 1.3626417114789207, + "grad_norm": 0.3202284696636588, + "learning_rate": 6.640597942213629e-05, + "loss": 2.7819, + "step": 29268 + }, + { + "epoch": 1.3626882696650138, + "grad_norm": 0.31754143693285386, + "learning_rate": 6.640342062747984e-05, + "loss": 2.7399, + "step": 29269 + }, + { + "epoch": 1.362734827851107, + "grad_norm": 0.3453266465539284, + "learning_rate": 6.640086178468098e-05, + "loss": 2.775, + "step": 29270 + }, + { + "epoch": 1.3627813860372, + "grad_norm": 0.3167504936833964, + "learning_rate": 6.639830289374722e-05, + "loss": 2.8557, + "step": 29271 + }, + { + "epoch": 1.3628279442232931, + "grad_norm": 0.36537616767287157, + "learning_rate": 6.639574395468604e-05, + "loss": 2.8043, + "step": 29272 + }, + { + "epoch": 1.3628745024093862, + "grad_norm": 0.3207049679457635, + "learning_rate": 6.639318496750498e-05, + "loss": 2.8253, + "step": 29273 + }, + { + "epoch": 1.3629210605954791, + "grad_norm": 0.3308111318696908, + "learning_rate": 6.639062593221152e-05, + "loss": 2.7837, + "step": 29274 + }, + { + "epoch": 1.3629676187815722, + "grad_norm": 0.3304344566514927, + "learning_rate": 6.638806684881318e-05, + "loss": 2.7483, + "step": 29275 + }, + { + "epoch": 1.3630141769676654, + "grad_norm": 0.3511824863977674, + "learning_rate": 6.638550771731751e-05, + "loss": 2.7633, + "step": 29276 + }, + { + "epoch": 1.3630607351537585, + "grad_norm": 0.3195229180914484, + "learning_rate": 6.638294853773196e-05, + "loss": 2.7603, + "step": 29277 + }, + { + "epoch": 1.3631072933398514, + "grad_norm": 0.33074877133747116, + "learning_rate": 6.638038931006408e-05, + "loss": 2.6419, + "step": 29278 + }, + { + "epoch": 1.3631538515259445, + "grad_norm": 0.3636290614726169, + "learning_rate": 6.637783003432136e-05, + "loss": 2.7996, + "step": 29279 + }, + { + "epoch": 1.3632004097120376, + "grad_norm": 0.33062813490428916, + "learning_rate": 6.637527071051135e-05, + "loss": 2.6621, + "step": 29280 + }, + { + "epoch": 
1.3632469678981307, + "grad_norm": 0.34773296231615436, + "learning_rate": 6.637271133864151e-05, + "loss": 2.727, + "step": 29281 + }, + { + "epoch": 1.3632935260842238, + "grad_norm": 0.34549920113859756, + "learning_rate": 6.637015191871939e-05, + "loss": 2.7227, + "step": 29282 + }, + { + "epoch": 1.363340084270317, + "grad_norm": 0.3578826616348473, + "learning_rate": 6.636759245075249e-05, + "loss": 2.8035, + "step": 29283 + }, + { + "epoch": 1.3633866424564098, + "grad_norm": 0.3324629600515735, + "learning_rate": 6.63650329347483e-05, + "loss": 2.7467, + "step": 29284 + }, + { + "epoch": 1.363433200642503, + "grad_norm": 0.34755269239930375, + "learning_rate": 6.636247337071437e-05, + "loss": 2.8991, + "step": 29285 + }, + { + "epoch": 1.363479758828596, + "grad_norm": 0.33873394556599484, + "learning_rate": 6.635991375865818e-05, + "loss": 2.7252, + "step": 29286 + }, + { + "epoch": 1.3635263170146892, + "grad_norm": 0.3300918769233548, + "learning_rate": 6.635735409858726e-05, + "loss": 2.8465, + "step": 29287 + }, + { + "epoch": 1.363572875200782, + "grad_norm": 0.33049104030374504, + "learning_rate": 6.635479439050911e-05, + "loss": 2.7154, + "step": 29288 + }, + { + "epoch": 1.3636194333868752, + "grad_norm": 0.36024419928218526, + "learning_rate": 6.635223463443126e-05, + "loss": 2.8168, + "step": 29289 + }, + { + "epoch": 1.3636659915729683, + "grad_norm": 0.3046052003185362, + "learning_rate": 6.634967483036121e-05, + "loss": 2.8045, + "step": 29290 + }, + { + "epoch": 1.3637125497590614, + "grad_norm": 0.3420257601309216, + "learning_rate": 6.634711497830645e-05, + "loss": 2.8096, + "step": 29291 + }, + { + "epoch": 1.3637591079451545, + "grad_norm": 0.34156967968231594, + "learning_rate": 6.634455507827455e-05, + "loss": 2.8153, + "step": 29292 + }, + { + "epoch": 1.3638056661312477, + "grad_norm": 0.3216764125697062, + "learning_rate": 6.634199513027297e-05, + "loss": 2.7987, + "step": 29293 + }, + { + "epoch": 1.3638522243173405, + "grad_norm": 0.3432403982629932, + "learning_rate": 6.633943513430927e-05, + "loss": 2.8954, + "step": 29294 + }, + { + "epoch": 1.3638987825034337, + "grad_norm": 0.32734051805132697, + "learning_rate": 6.633687509039092e-05, + "loss": 2.8079, + "step": 29295 + }, + { + "epoch": 1.3639453406895268, + "grad_norm": 0.3398529092137757, + "learning_rate": 6.633431499852547e-05, + "loss": 2.7597, + "step": 29296 + }, + { + "epoch": 1.3639918988756197, + "grad_norm": 0.32702386505316966, + "learning_rate": 6.633175485872038e-05, + "loss": 2.7331, + "step": 29297 + }, + { + "epoch": 1.3640384570617128, + "grad_norm": 0.34036748276591655, + "learning_rate": 6.632919467098323e-05, + "loss": 2.8449, + "step": 29298 + }, + { + "epoch": 1.364085015247806, + "grad_norm": 0.3385610596508996, + "learning_rate": 6.632663443532149e-05, + "loss": 2.8288, + "step": 29299 + }, + { + "epoch": 1.364131573433899, + "grad_norm": 0.35591969340689644, + "learning_rate": 6.632407415174267e-05, + "loss": 2.7221, + "step": 29300 + }, + { + "epoch": 1.3641781316199921, + "grad_norm": 0.34230782578310615, + "learning_rate": 6.632151382025432e-05, + "loss": 2.7557, + "step": 29301 + }, + { + "epoch": 1.3642246898060852, + "grad_norm": 0.34590978222193836, + "learning_rate": 6.631895344086392e-05, + "loss": 2.8106, + "step": 29302 + }, + { + "epoch": 1.3642712479921784, + "grad_norm": 0.35219525991032313, + "learning_rate": 6.631639301357903e-05, + "loss": 2.7407, + "step": 29303 + }, + { + "epoch": 1.3643178061782713, + "grad_norm": 0.3283941780010703, + "learning_rate": 
6.631383253840711e-05, + "loss": 2.7666, + "step": 29304 + }, + { + "epoch": 1.3643643643643644, + "grad_norm": 0.340545670209312, + "learning_rate": 6.63112720153557e-05, + "loss": 2.723, + "step": 29305 + }, + { + "epoch": 1.3644109225504575, + "grad_norm": 0.35692252775620015, + "learning_rate": 6.630871144443231e-05, + "loss": 2.7262, + "step": 29306 + }, + { + "epoch": 1.3644574807365504, + "grad_norm": 0.35922213458417207, + "learning_rate": 6.630615082564448e-05, + "loss": 2.6757, + "step": 29307 + }, + { + "epoch": 1.3645040389226435, + "grad_norm": 0.3537519939149396, + "learning_rate": 6.630359015899968e-05, + "loss": 2.6901, + "step": 29308 + }, + { + "epoch": 1.3645505971087366, + "grad_norm": 0.3461260088306556, + "learning_rate": 6.630102944450547e-05, + "loss": 2.7254, + "step": 29309 + }, + { + "epoch": 1.3645971552948297, + "grad_norm": 0.337731914142678, + "learning_rate": 6.629846868216932e-05, + "loss": 2.7336, + "step": 29310 + }, + { + "epoch": 1.3646437134809228, + "grad_norm": 0.3706337087132971, + "learning_rate": 6.629590787199879e-05, + "loss": 2.7524, + "step": 29311 + }, + { + "epoch": 1.364690271667016, + "grad_norm": 0.31847715137366867, + "learning_rate": 6.629334701400137e-05, + "loss": 2.7265, + "step": 29312 + }, + { + "epoch": 1.3647368298531088, + "grad_norm": 0.37035697320144423, + "learning_rate": 6.629078610818457e-05, + "loss": 2.8041, + "step": 29313 + }, + { + "epoch": 1.364783388039202, + "grad_norm": 0.3528828678544101, + "learning_rate": 6.628822515455593e-05, + "loss": 2.7505, + "step": 29314 + }, + { + "epoch": 1.364829946225295, + "grad_norm": 0.3308414223283063, + "learning_rate": 6.628566415312295e-05, + "loss": 2.776, + "step": 29315 + }, + { + "epoch": 1.3648765044113882, + "grad_norm": 0.3552451648001167, + "learning_rate": 6.628310310389317e-05, + "loss": 2.7731, + "step": 29316 + }, + { + "epoch": 1.364923062597481, + "grad_norm": 0.33463216520789135, + "learning_rate": 6.628054200687406e-05, + "loss": 2.7378, + "step": 29317 + }, + { + "epoch": 1.3649696207835742, + "grad_norm": 0.3289464860894325, + "learning_rate": 6.627798086207317e-05, + "loss": 2.8339, + "step": 29318 + }, + { + "epoch": 1.3650161789696673, + "grad_norm": 0.3220500964249987, + "learning_rate": 6.627541966949799e-05, + "loss": 2.8802, + "step": 29319 + }, + { + "epoch": 1.3650627371557604, + "grad_norm": 0.35795557950756346, + "learning_rate": 6.627285842915609e-05, + "loss": 2.6757, + "step": 29320 + }, + { + "epoch": 1.3651092953418535, + "grad_norm": 0.3680394644909415, + "learning_rate": 6.627029714105494e-05, + "loss": 2.745, + "step": 29321 + }, + { + "epoch": 1.3651558535279467, + "grad_norm": 0.34861117091938293, + "learning_rate": 6.626773580520208e-05, + "loss": 2.7296, + "step": 29322 + }, + { + "epoch": 1.3652024117140396, + "grad_norm": 0.36721277165865146, + "learning_rate": 6.6265174421605e-05, + "loss": 2.6668, + "step": 29323 + }, + { + "epoch": 1.3652489699001327, + "grad_norm": 0.3331304194223109, + "learning_rate": 6.626261299027125e-05, + "loss": 2.6972, + "step": 29324 + }, + { + "epoch": 1.3652955280862258, + "grad_norm": 0.3491875984621792, + "learning_rate": 6.626005151120834e-05, + "loss": 2.7474, + "step": 29325 + }, + { + "epoch": 1.365342086272319, + "grad_norm": 0.3371809036002674, + "learning_rate": 6.625748998442376e-05, + "loss": 2.733, + "step": 29326 + }, + { + "epoch": 1.3653886444584118, + "grad_norm": 0.359441782842917, + "learning_rate": 6.625492840992505e-05, + "loss": 2.9084, + "step": 29327 + }, + { + "epoch": 
1.365435202644505, + "grad_norm": 0.3415573791717036, + "learning_rate": 6.625236678771973e-05, + "loss": 2.8056, + "step": 29328 + }, + { + "epoch": 1.365481760830598, + "grad_norm": 0.3162236304687433, + "learning_rate": 6.624980511781532e-05, + "loss": 2.7764, + "step": 29329 + }, + { + "epoch": 1.3655283190166911, + "grad_norm": 0.36687816307394105, + "learning_rate": 6.624724340021935e-05, + "loss": 2.8632, + "step": 29330 + }, + { + "epoch": 1.3655748772027843, + "grad_norm": 0.3059197938335158, + "learning_rate": 6.62446816349393e-05, + "loss": 2.7193, + "step": 29331 + }, + { + "epoch": 1.3656214353888774, + "grad_norm": 0.38661198978873623, + "learning_rate": 6.62421198219827e-05, + "loss": 2.7796, + "step": 29332 + }, + { + "epoch": 1.3656679935749703, + "grad_norm": 0.3403212638454574, + "learning_rate": 6.62395579613571e-05, + "loss": 2.7882, + "step": 29333 + }, + { + "epoch": 1.3657145517610634, + "grad_norm": 0.351090640449789, + "learning_rate": 6.623699605306999e-05, + "loss": 2.6826, + "step": 29334 + }, + { + "epoch": 1.3657611099471565, + "grad_norm": 0.36801862532659574, + "learning_rate": 6.62344340971289e-05, + "loss": 2.7852, + "step": 29335 + }, + { + "epoch": 1.3658076681332496, + "grad_norm": 0.3360495915585429, + "learning_rate": 6.623187209354134e-05, + "loss": 2.721, + "step": 29336 + }, + { + "epoch": 1.3658542263193425, + "grad_norm": 0.36812822450955796, + "learning_rate": 6.622931004231483e-05, + "loss": 2.7531, + "step": 29337 + }, + { + "epoch": 1.3659007845054356, + "grad_norm": 0.34035385064733215, + "learning_rate": 6.62267479434569e-05, + "loss": 2.7433, + "step": 29338 + }, + { + "epoch": 1.3659473426915287, + "grad_norm": 0.3550605249837452, + "learning_rate": 6.622418579697506e-05, + "loss": 2.773, + "step": 29339 + }, + { + "epoch": 1.3659939008776218, + "grad_norm": 0.3431295429427383, + "learning_rate": 6.622162360287683e-05, + "loss": 2.5661, + "step": 29340 + }, + { + "epoch": 1.366040459063715, + "grad_norm": 0.32797284035667496, + "learning_rate": 6.621906136116974e-05, + "loss": 2.8075, + "step": 29341 + }, + { + "epoch": 1.366087017249808, + "grad_norm": 0.35050990854605263, + "learning_rate": 6.621649907186129e-05, + "loss": 2.6406, + "step": 29342 + }, + { + "epoch": 1.366133575435901, + "grad_norm": 0.33335175935244193, + "learning_rate": 6.621393673495903e-05, + "loss": 2.8259, + "step": 29343 + }, + { + "epoch": 1.366180133621994, + "grad_norm": 0.3296804388549955, + "learning_rate": 6.621137435047046e-05, + "loss": 2.8131, + "step": 29344 + }, + { + "epoch": 1.3662266918080872, + "grad_norm": 0.3695882541987192, + "learning_rate": 6.620881191840309e-05, + "loss": 2.7496, + "step": 29345 + }, + { + "epoch": 1.36627324999418, + "grad_norm": 0.34115563870013854, + "learning_rate": 6.620624943876448e-05, + "loss": 2.6644, + "step": 29346 + }, + { + "epoch": 1.3663198081802732, + "grad_norm": 0.34975598510659317, + "learning_rate": 6.62036869115621e-05, + "loss": 2.7454, + "step": 29347 + }, + { + "epoch": 1.3663663663663663, + "grad_norm": 0.33252101537018924, + "learning_rate": 6.62011243368035e-05, + "loss": 2.8214, + "step": 29348 + }, + { + "epoch": 1.3664129245524594, + "grad_norm": 0.3632225747024501, + "learning_rate": 6.619856171449621e-05, + "loss": 2.7673, + "step": 29349 + }, + { + "epoch": 1.3664594827385526, + "grad_norm": 0.33573930348156633, + "learning_rate": 6.619599904464772e-05, + "loss": 2.6901, + "step": 29350 + }, + { + "epoch": 1.3665060409246457, + "grad_norm": 0.39644984820422197, + "learning_rate": 
6.619343632726557e-05, + "loss": 2.7161, + "step": 29351 + }, + { + "epoch": 1.3665525991107386, + "grad_norm": 0.3233921737468294, + "learning_rate": 6.619087356235729e-05, + "loss": 2.8214, + "step": 29352 + }, + { + "epoch": 1.3665991572968317, + "grad_norm": 0.3956336454457362, + "learning_rate": 6.61883107499304e-05, + "loss": 2.7984, + "step": 29353 + }, + { + "epoch": 1.3666457154829248, + "grad_norm": 0.31243940438063933, + "learning_rate": 6.618574788999239e-05, + "loss": 2.7479, + "step": 29354 + }, + { + "epoch": 1.366692273669018, + "grad_norm": 0.4188515593389182, + "learning_rate": 6.61831849825508e-05, + "loss": 2.7827, + "step": 29355 + }, + { + "epoch": 1.3667388318551108, + "grad_norm": 0.31683866376919617, + "learning_rate": 6.618062202761317e-05, + "loss": 2.7809, + "step": 29356 + }, + { + "epoch": 1.366785390041204, + "grad_norm": 0.36038356774060304, + "learning_rate": 6.617805902518702e-05, + "loss": 2.7302, + "step": 29357 + }, + { + "epoch": 1.366831948227297, + "grad_norm": 0.34483512158316476, + "learning_rate": 6.617549597527984e-05, + "loss": 2.7929, + "step": 29358 + }, + { + "epoch": 1.3668785064133901, + "grad_norm": 0.3624949172598781, + "learning_rate": 6.617293287789919e-05, + "loss": 2.6759, + "step": 29359 + }, + { + "epoch": 1.3669250645994833, + "grad_norm": 0.33726693477715736, + "learning_rate": 6.617036973305256e-05, + "loss": 2.7546, + "step": 29360 + }, + { + "epoch": 1.3669716227855764, + "grad_norm": 0.3218321338877213, + "learning_rate": 6.61678065407475e-05, + "loss": 2.784, + "step": 29361 + }, + { + "epoch": 1.3670181809716693, + "grad_norm": 0.34130463082123635, + "learning_rate": 6.616524330099152e-05, + "loss": 2.7078, + "step": 29362 + }, + { + "epoch": 1.3670647391577624, + "grad_norm": 0.3276166735180879, + "learning_rate": 6.616268001379214e-05, + "loss": 2.6668, + "step": 29363 + }, + { + "epoch": 1.3671112973438555, + "grad_norm": 0.31774403219960623, + "learning_rate": 6.616011667915687e-05, + "loss": 2.6246, + "step": 29364 + }, + { + "epoch": 1.3671578555299486, + "grad_norm": 0.3224198818039092, + "learning_rate": 6.615755329709327e-05, + "loss": 2.8281, + "step": 29365 + }, + { + "epoch": 1.3672044137160415, + "grad_norm": 0.3049647029065293, + "learning_rate": 6.615498986760884e-05, + "loss": 2.7424, + "step": 29366 + }, + { + "epoch": 1.3672509719021346, + "grad_norm": 0.30547550912526367, + "learning_rate": 6.61524263907111e-05, + "loss": 2.739, + "step": 29367 + }, + { + "epoch": 1.3672975300882277, + "grad_norm": 0.30562479530743925, + "learning_rate": 6.614986286640759e-05, + "loss": 2.7209, + "step": 29368 + }, + { + "epoch": 1.3673440882743209, + "grad_norm": 0.32218999940454124, + "learning_rate": 6.61472992947058e-05, + "loss": 2.7889, + "step": 29369 + }, + { + "epoch": 1.367390646460414, + "grad_norm": 0.32579765703589875, + "learning_rate": 6.61447356756133e-05, + "loss": 2.6792, + "step": 29370 + }, + { + "epoch": 1.367437204646507, + "grad_norm": 0.304728031707583, + "learning_rate": 6.61421720091376e-05, + "loss": 2.6784, + "step": 29371 + }, + { + "epoch": 1.3674837628326, + "grad_norm": 0.2984472754027981, + "learning_rate": 6.613960829528621e-05, + "loss": 2.8079, + "step": 29372 + }, + { + "epoch": 1.367530321018693, + "grad_norm": 0.3346180186114851, + "learning_rate": 6.613704453406664e-05, + "loss": 2.7411, + "step": 29373 + }, + { + "epoch": 1.3675768792047862, + "grad_norm": 0.3231518487530156, + "learning_rate": 6.613448072548644e-05, + "loss": 2.8049, + "step": 29374 + }, + { + "epoch": 
1.3676234373908793, + "grad_norm": 0.3250402603776917, + "learning_rate": 6.613191686955313e-05, + "loss": 2.7422, + "step": 29375 + }, + { + "epoch": 1.3676699955769722, + "grad_norm": 0.3108360695589732, + "learning_rate": 6.612935296627424e-05, + "loss": 2.7867, + "step": 29376 + }, + { + "epoch": 1.3677165537630653, + "grad_norm": 0.3332619375312631, + "learning_rate": 6.612678901565728e-05, + "loss": 2.6727, + "step": 29377 + }, + { + "epoch": 1.3677631119491584, + "grad_norm": 0.3264036446327881, + "learning_rate": 6.612422501770978e-05, + "loss": 2.7558, + "step": 29378 + }, + { + "epoch": 1.3678096701352516, + "grad_norm": 0.3285020145231432, + "learning_rate": 6.612166097243928e-05, + "loss": 2.7467, + "step": 29379 + }, + { + "epoch": 1.3678562283213447, + "grad_norm": 0.3114203123719866, + "learning_rate": 6.61190968798533e-05, + "loss": 2.7725, + "step": 29380 + }, + { + "epoch": 1.3679027865074378, + "grad_norm": 0.3164362404061643, + "learning_rate": 6.611653273995934e-05, + "loss": 2.7373, + "step": 29381 + }, + { + "epoch": 1.3679493446935307, + "grad_norm": 0.34411554934729405, + "learning_rate": 6.611396855276495e-05, + "loss": 2.8234, + "step": 29382 + }, + { + "epoch": 1.3679959028796238, + "grad_norm": 0.3274026370493718, + "learning_rate": 6.611140431827765e-05, + "loss": 2.8304, + "step": 29383 + }, + { + "epoch": 1.368042461065717, + "grad_norm": 0.3343200319240383, + "learning_rate": 6.610884003650497e-05, + "loss": 2.8725, + "step": 29384 + }, + { + "epoch": 1.3680890192518098, + "grad_norm": 0.35850646812884224, + "learning_rate": 6.610627570745443e-05, + "loss": 2.7278, + "step": 29385 + }, + { + "epoch": 1.368135577437903, + "grad_norm": 0.31637880889253733, + "learning_rate": 6.610371133113355e-05, + "loss": 2.7744, + "step": 29386 + }, + { + "epoch": 1.368182135623996, + "grad_norm": 0.35193254924163103, + "learning_rate": 6.610114690754987e-05, + "loss": 2.7133, + "step": 29387 + }, + { + "epoch": 1.3682286938100892, + "grad_norm": 0.338149520496097, + "learning_rate": 6.609858243671092e-05, + "loss": 2.8053, + "step": 29388 + }, + { + "epoch": 1.3682752519961823, + "grad_norm": 0.35181662580365525, + "learning_rate": 6.609601791862421e-05, + "loss": 2.7922, + "step": 29389 + }, + { + "epoch": 1.3683218101822754, + "grad_norm": 0.33435501554073926, + "learning_rate": 6.609345335329726e-05, + "loss": 2.6558, + "step": 29390 + }, + { + "epoch": 1.3683683683683685, + "grad_norm": 0.3378803950835449, + "learning_rate": 6.609088874073762e-05, + "loss": 2.7927, + "step": 29391 + }, + { + "epoch": 1.3684149265544614, + "grad_norm": 0.3728385572296317, + "learning_rate": 6.60883240809528e-05, + "loss": 2.8057, + "step": 29392 + }, + { + "epoch": 1.3684614847405545, + "grad_norm": 0.3269956738461911, + "learning_rate": 6.608575937395035e-05, + "loss": 2.8731, + "step": 29393 + }, + { + "epoch": 1.3685080429266476, + "grad_norm": 0.36898188169306273, + "learning_rate": 6.608319461973778e-05, + "loss": 2.7245, + "step": 29394 + }, + { + "epoch": 1.3685546011127405, + "grad_norm": 0.3483810860351253, + "learning_rate": 6.60806298183226e-05, + "loss": 2.8181, + "step": 29395 + }, + { + "epoch": 1.3686011592988336, + "grad_norm": 0.3332844853576831, + "learning_rate": 6.607806496971236e-05, + "loss": 2.7739, + "step": 29396 + }, + { + "epoch": 1.3686477174849268, + "grad_norm": 0.3525335005760787, + "learning_rate": 6.607550007391459e-05, + "loss": 2.7942, + "step": 29397 + }, + { + "epoch": 1.3686942756710199, + "grad_norm": 0.33566781550704666, + "learning_rate": 
6.607293513093682e-05, + "loss": 2.8194, + "step": 29398 + }, + { + "epoch": 1.368740833857113, + "grad_norm": 0.350873037227757, + "learning_rate": 6.607037014078655e-05, + "loss": 2.8542, + "step": 29399 + }, + { + "epoch": 1.368787392043206, + "grad_norm": 0.35686580047322675, + "learning_rate": 6.606780510347133e-05, + "loss": 2.7825, + "step": 29400 + }, + { + "epoch": 1.368833950229299, + "grad_norm": 0.3300373679359047, + "learning_rate": 6.60652400189987e-05, + "loss": 2.7577, + "step": 29401 + }, + { + "epoch": 1.368880508415392, + "grad_norm": 0.33119360015739013, + "learning_rate": 6.606267488737616e-05, + "loss": 2.8393, + "step": 29402 + }, + { + "epoch": 1.3689270666014852, + "grad_norm": 0.34677800624830335, + "learning_rate": 6.606010970861126e-05, + "loss": 2.7008, + "step": 29403 + }, + { + "epoch": 1.3689736247875783, + "grad_norm": 0.3312407567234945, + "learning_rate": 6.60575444827115e-05, + "loss": 2.6729, + "step": 29404 + }, + { + "epoch": 1.3690201829736712, + "grad_norm": 0.33662626234669896, + "learning_rate": 6.605497920968445e-05, + "loss": 2.7778, + "step": 29405 + }, + { + "epoch": 1.3690667411597643, + "grad_norm": 0.3393803090410208, + "learning_rate": 6.60524138895376e-05, + "loss": 2.7123, + "step": 29406 + }, + { + "epoch": 1.3691132993458575, + "grad_norm": 0.308260515039789, + "learning_rate": 6.604984852227853e-05, + "loss": 2.797, + "step": 29407 + }, + { + "epoch": 1.3691598575319506, + "grad_norm": 0.3462880056097714, + "learning_rate": 6.60472831079147e-05, + "loss": 2.6739, + "step": 29408 + }, + { + "epoch": 1.3692064157180437, + "grad_norm": 0.33291258958123976, + "learning_rate": 6.604471764645369e-05, + "loss": 2.7097, + "step": 29409 + }, + { + "epoch": 1.3692529739041368, + "grad_norm": 0.32982493385481937, + "learning_rate": 6.604215213790301e-05, + "loss": 2.7063, + "step": 29410 + }, + { + "epoch": 1.3692995320902297, + "grad_norm": 0.3230376746881714, + "learning_rate": 6.60395865822702e-05, + "loss": 2.7426, + "step": 29411 + }, + { + "epoch": 1.3693460902763228, + "grad_norm": 0.36946013322741034, + "learning_rate": 6.603702097956279e-05, + "loss": 2.7682, + "step": 29412 + }, + { + "epoch": 1.369392648462416, + "grad_norm": 0.3177142694178044, + "learning_rate": 6.603445532978829e-05, + "loss": 2.6666, + "step": 29413 + }, + { + "epoch": 1.369439206648509, + "grad_norm": 0.3238088144581363, + "learning_rate": 6.603188963295425e-05, + "loss": 2.7432, + "step": 29414 + }, + { + "epoch": 1.369485764834602, + "grad_norm": 0.34981213132058414, + "learning_rate": 6.602932388906818e-05, + "loss": 2.83, + "step": 29415 + }, + { + "epoch": 1.369532323020695, + "grad_norm": 0.32976630084443753, + "learning_rate": 6.602675809813765e-05, + "loss": 2.8617, + "step": 29416 + }, + { + "epoch": 1.3695788812067882, + "grad_norm": 0.3416015140572717, + "learning_rate": 6.602419226017014e-05, + "loss": 2.8281, + "step": 29417 + }, + { + "epoch": 1.3696254393928813, + "grad_norm": 0.32565280310672756, + "learning_rate": 6.60216263751732e-05, + "loss": 2.7616, + "step": 29418 + }, + { + "epoch": 1.3696719975789744, + "grad_norm": 0.3178216306103058, + "learning_rate": 6.601906044315438e-05, + "loss": 2.7152, + "step": 29419 + }, + { + "epoch": 1.3697185557650675, + "grad_norm": 0.3332772195008748, + "learning_rate": 6.60164944641212e-05, + "loss": 2.6421, + "step": 29420 + }, + { + "epoch": 1.3697651139511604, + "grad_norm": 0.3247090315497459, + "learning_rate": 6.601392843808118e-05, + "loss": 2.8126, + "step": 29421 + }, + { + "epoch": 
1.3698116721372535, + "grad_norm": 0.33401494437759705, + "learning_rate": 6.601136236504185e-05, + "loss": 2.8356, + "step": 29422 + }, + { + "epoch": 1.3698582303233466, + "grad_norm": 0.33575828369690613, + "learning_rate": 6.600879624501077e-05, + "loss": 2.766, + "step": 29423 + }, + { + "epoch": 1.3699047885094398, + "grad_norm": 0.35404499321742583, + "learning_rate": 6.600623007799542e-05, + "loss": 2.8723, + "step": 29424 + }, + { + "epoch": 1.3699513466955326, + "grad_norm": 0.3477424163280808, + "learning_rate": 6.60036638640034e-05, + "loss": 2.7869, + "step": 29425 + }, + { + "epoch": 1.3699979048816258, + "grad_norm": 0.3825501296805324, + "learning_rate": 6.600109760304218e-05, + "loss": 2.8865, + "step": 29426 + }, + { + "epoch": 1.3700444630677189, + "grad_norm": 0.3474508639002481, + "learning_rate": 6.599853129511931e-05, + "loss": 2.7678, + "step": 29427 + }, + { + "epoch": 1.370091021253812, + "grad_norm": 0.3555278175848072, + "learning_rate": 6.599596494024233e-05, + "loss": 2.8602, + "step": 29428 + }, + { + "epoch": 1.370137579439905, + "grad_norm": 0.4054953335453503, + "learning_rate": 6.599339853841877e-05, + "loss": 2.7491, + "step": 29429 + }, + { + "epoch": 1.3701841376259982, + "grad_norm": 0.38684565096803075, + "learning_rate": 6.599083208965617e-05, + "loss": 2.8404, + "step": 29430 + }, + { + "epoch": 1.3702306958120911, + "grad_norm": 0.37359281899706254, + "learning_rate": 6.598826559396204e-05, + "loss": 2.7507, + "step": 29431 + }, + { + "epoch": 1.3702772539981842, + "grad_norm": 0.35190679136170755, + "learning_rate": 6.598569905134392e-05, + "loss": 2.7625, + "step": 29432 + }, + { + "epoch": 1.3703238121842773, + "grad_norm": 0.3774524496133735, + "learning_rate": 6.598313246180937e-05, + "loss": 2.7278, + "step": 29433 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 0.37935958822712884, + "learning_rate": 6.598056582536589e-05, + "loss": 2.8115, + "step": 29434 + }, + { + "epoch": 1.3704169285564634, + "grad_norm": 0.36618298663972687, + "learning_rate": 6.597799914202101e-05, + "loss": 2.801, + "step": 29435 + }, + { + "epoch": 1.3704634867425565, + "grad_norm": 0.36486950957011133, + "learning_rate": 6.59754324117823e-05, + "loss": 2.6958, + "step": 29436 + }, + { + "epoch": 1.3705100449286496, + "grad_norm": 0.37520179234151607, + "learning_rate": 6.597286563465727e-05, + "loss": 2.7987, + "step": 29437 + }, + { + "epoch": 1.3705566031147427, + "grad_norm": 0.3406207870556677, + "learning_rate": 6.597029881065345e-05, + "loss": 2.7038, + "step": 29438 + }, + { + "epoch": 1.3706031613008358, + "grad_norm": 0.3984194655796185, + "learning_rate": 6.596773193977836e-05, + "loss": 2.7076, + "step": 29439 + }, + { + "epoch": 1.3706497194869287, + "grad_norm": 0.3172096366695096, + "learning_rate": 6.596516502203955e-05, + "loss": 2.8305, + "step": 29440 + }, + { + "epoch": 1.3706962776730218, + "grad_norm": 0.3992880176804886, + "learning_rate": 6.596259805744457e-05, + "loss": 2.8012, + "step": 29441 + }, + { + "epoch": 1.370742835859115, + "grad_norm": 0.3486869325014791, + "learning_rate": 6.596003104600092e-05, + "loss": 2.699, + "step": 29442 + }, + { + "epoch": 1.370789394045208, + "grad_norm": 0.3851447684180647, + "learning_rate": 6.595746398771617e-05, + "loss": 2.7519, + "step": 29443 + }, + { + "epoch": 1.370835952231301, + "grad_norm": 0.32186262943828037, + "learning_rate": 6.595489688259782e-05, + "loss": 2.78, + "step": 29444 + }, + { + "epoch": 1.370882510417394, + "grad_norm": 0.3618088951466935, + "learning_rate": 
6.595232973065343e-05, + "loss": 2.7252, + "step": 29445 + }, + { + "epoch": 1.3709290686034872, + "grad_norm": 0.32424249803361893, + "learning_rate": 6.594976253189051e-05, + "loss": 2.7209, + "step": 29446 + }, + { + "epoch": 1.3709756267895803, + "grad_norm": 0.3465433429397514, + "learning_rate": 6.594719528631663e-05, + "loss": 2.7359, + "step": 29447 + }, + { + "epoch": 1.3710221849756734, + "grad_norm": 0.31601562718189846, + "learning_rate": 6.594462799393928e-05, + "loss": 2.7488, + "step": 29448 + }, + { + "epoch": 1.3710687431617665, + "grad_norm": 0.3218590459183171, + "learning_rate": 6.594206065476604e-05, + "loss": 2.8078, + "step": 29449 + }, + { + "epoch": 1.3711153013478594, + "grad_norm": 0.33302596661719636, + "learning_rate": 6.59394932688044e-05, + "loss": 2.8172, + "step": 29450 + }, + { + "epoch": 1.3711618595339525, + "grad_norm": 0.34289635759593784, + "learning_rate": 6.593692583606194e-05, + "loss": 2.8235, + "step": 29451 + }, + { + "epoch": 1.3712084177200456, + "grad_norm": 0.36288409352918305, + "learning_rate": 6.593435835654616e-05, + "loss": 2.8182, + "step": 29452 + }, + { + "epoch": 1.3712549759061388, + "grad_norm": 0.3508429093602704, + "learning_rate": 6.593179083026461e-05, + "loss": 2.7616, + "step": 29453 + }, + { + "epoch": 1.3713015340922317, + "grad_norm": 0.43672730061961973, + "learning_rate": 6.592922325722483e-05, + "loss": 2.7235, + "step": 29454 + }, + { + "epoch": 1.3713480922783248, + "grad_norm": 0.398257192886638, + "learning_rate": 6.592665563743434e-05, + "loss": 2.7373, + "step": 29455 + }, + { + "epoch": 1.3713946504644179, + "grad_norm": 0.4213447060107641, + "learning_rate": 6.592408797090068e-05, + "loss": 2.7271, + "step": 29456 + }, + { + "epoch": 1.371441208650511, + "grad_norm": 0.35082459998720505, + "learning_rate": 6.59215202576314e-05, + "loss": 2.6453, + "step": 29457 + }, + { + "epoch": 1.3714877668366041, + "grad_norm": 0.3819709735761283, + "learning_rate": 6.591895249763402e-05, + "loss": 2.7277, + "step": 29458 + }, + { + "epoch": 1.3715343250226972, + "grad_norm": 0.39751251826296996, + "learning_rate": 6.591638469091608e-05, + "loss": 2.7206, + "step": 29459 + }, + { + "epoch": 1.3715808832087901, + "grad_norm": 0.38270356418194773, + "learning_rate": 6.591381683748515e-05, + "loss": 2.6883, + "step": 29460 + }, + { + "epoch": 1.3716274413948832, + "grad_norm": 0.35631527441431315, + "learning_rate": 6.591124893734871e-05, + "loss": 2.7728, + "step": 29461 + }, + { + "epoch": 1.3716739995809764, + "grad_norm": 0.3825900075435556, + "learning_rate": 6.590868099051431e-05, + "loss": 2.7046, + "step": 29462 + }, + { + "epoch": 1.3717205577670695, + "grad_norm": 0.3437782353748543, + "learning_rate": 6.590611299698951e-05, + "loss": 2.6667, + "step": 29463 + }, + { + "epoch": 1.3717671159531624, + "grad_norm": 0.34227553033193164, + "learning_rate": 6.590354495678183e-05, + "loss": 2.7438, + "step": 29464 + }, + { + "epoch": 1.3718136741392555, + "grad_norm": 0.3774130383309273, + "learning_rate": 6.590097686989883e-05, + "loss": 2.7985, + "step": 29465 + }, + { + "epoch": 1.3718602323253486, + "grad_norm": 0.33772329272422924, + "learning_rate": 6.589840873634802e-05, + "loss": 2.7128, + "step": 29466 + }, + { + "epoch": 1.3719067905114417, + "grad_norm": 0.35611755339107154, + "learning_rate": 6.589584055613693e-05, + "loss": 2.7563, + "step": 29467 + }, + { + "epoch": 1.3719533486975348, + "grad_norm": 0.3784285696482976, + "learning_rate": 6.589327232927313e-05, + "loss": 2.7842, + "step": 29468 + }, + { + 
"epoch": 1.371999906883628, + "grad_norm": 0.34715811428450105, + "learning_rate": 6.589070405576414e-05, + "loss": 2.7302, + "step": 29469 + }, + { + "epoch": 1.3720464650697208, + "grad_norm": 0.32697369505660834, + "learning_rate": 6.58881357356175e-05, + "loss": 2.6863, + "step": 29470 + }, + { + "epoch": 1.372093023255814, + "grad_norm": 0.3942046086634751, + "learning_rate": 6.588556736884075e-05, + "loss": 2.8274, + "step": 29471 + }, + { + "epoch": 1.372139581441907, + "grad_norm": 0.33862922586501587, + "learning_rate": 6.588299895544143e-05, + "loss": 2.7997, + "step": 29472 + }, + { + "epoch": 1.372186139628, + "grad_norm": 0.35443716569601597, + "learning_rate": 6.588043049542705e-05, + "loss": 2.814, + "step": 29473 + }, + { + "epoch": 1.372232697814093, + "grad_norm": 0.3416971785423052, + "learning_rate": 6.587786198880518e-05, + "loss": 2.7465, + "step": 29474 + }, + { + "epoch": 1.3722792560001862, + "grad_norm": 0.333282556548029, + "learning_rate": 6.587529343558337e-05, + "loss": 2.8469, + "step": 29475 + }, + { + "epoch": 1.3723258141862793, + "grad_norm": 0.3421663035382484, + "learning_rate": 6.587272483576913e-05, + "loss": 2.8081, + "step": 29476 + }, + { + "epoch": 1.3723723723723724, + "grad_norm": 0.3393103922060311, + "learning_rate": 6.587015618937e-05, + "loss": 2.7023, + "step": 29477 + }, + { + "epoch": 1.3724189305584655, + "grad_norm": 0.3353430297354736, + "learning_rate": 6.586758749639352e-05, + "loss": 2.6193, + "step": 29478 + }, + { + "epoch": 1.3724654887445586, + "grad_norm": 0.32907255771691357, + "learning_rate": 6.586501875684722e-05, + "loss": 2.8412, + "step": 29479 + }, + { + "epoch": 1.3725120469306515, + "grad_norm": 0.36158282472480424, + "learning_rate": 6.586244997073868e-05, + "loss": 2.788, + "step": 29480 + }, + { + "epoch": 1.3725586051167447, + "grad_norm": 0.3278937885483245, + "learning_rate": 6.58598811380754e-05, + "loss": 2.7564, + "step": 29481 + }, + { + "epoch": 1.3726051633028378, + "grad_norm": 0.36791271789532826, + "learning_rate": 6.585731225886493e-05, + "loss": 2.7909, + "step": 29482 + }, + { + "epoch": 1.3726517214889307, + "grad_norm": 0.32783395405436266, + "learning_rate": 6.585474333311482e-05, + "loss": 2.8489, + "step": 29483 + }, + { + "epoch": 1.3726982796750238, + "grad_norm": 0.34316288433999104, + "learning_rate": 6.58521743608326e-05, + "loss": 2.8116, + "step": 29484 + }, + { + "epoch": 1.372744837861117, + "grad_norm": 0.3221769257436314, + "learning_rate": 6.58496053420258e-05, + "loss": 2.7098, + "step": 29485 + }, + { + "epoch": 1.37279139604721, + "grad_norm": 0.38162358524897316, + "learning_rate": 6.584703627670197e-05, + "loss": 2.7862, + "step": 29486 + }, + { + "epoch": 1.3728379542333031, + "grad_norm": 0.33053346227035, + "learning_rate": 6.584446716486867e-05, + "loss": 2.7613, + "step": 29487 + }, + { + "epoch": 1.3728845124193962, + "grad_norm": 0.3750297074447288, + "learning_rate": 6.58418980065334e-05, + "loss": 2.7641, + "step": 29488 + }, + { + "epoch": 1.3729310706054891, + "grad_norm": 0.32918567501230234, + "learning_rate": 6.583932880170371e-05, + "loss": 2.7602, + "step": 29489 + }, + { + "epoch": 1.3729776287915822, + "grad_norm": 0.35773439919686256, + "learning_rate": 6.583675955038717e-05, + "loss": 2.7287, + "step": 29490 + }, + { + "epoch": 1.3730241869776754, + "grad_norm": 0.32449667203302196, + "learning_rate": 6.58341902525913e-05, + "loss": 2.7914, + "step": 29491 + }, + { + "epoch": 1.3730707451637685, + "grad_norm": 0.3157928130426436, + "learning_rate": 
6.583162090832364e-05, + "loss": 2.7236, + "step": 29492 + }, + { + "epoch": 1.3731173033498614, + "grad_norm": 0.33754355278067805, + "learning_rate": 6.582905151759172e-05, + "loss": 2.8352, + "step": 29493 + }, + { + "epoch": 1.3731638615359545, + "grad_norm": 0.3503946766943493, + "learning_rate": 6.582648208040309e-05, + "loss": 2.763, + "step": 29494 + }, + { + "epoch": 1.3732104197220476, + "grad_norm": 0.36117927796827254, + "learning_rate": 6.582391259676531e-05, + "loss": 2.7587, + "step": 29495 + }, + { + "epoch": 1.3732569779081407, + "grad_norm": 0.34008485032250435, + "learning_rate": 6.58213430666859e-05, + "loss": 2.8192, + "step": 29496 + }, + { + "epoch": 1.3733035360942338, + "grad_norm": 0.37372696431657637, + "learning_rate": 6.581877349017242e-05, + "loss": 2.7163, + "step": 29497 + }, + { + "epoch": 1.373350094280327, + "grad_norm": 0.33242830803818996, + "learning_rate": 6.581620386723239e-05, + "loss": 2.7598, + "step": 29498 + }, + { + "epoch": 1.3733966524664198, + "grad_norm": 0.3310370788408756, + "learning_rate": 6.581363419787336e-05, + "loss": 2.7383, + "step": 29499 + }, + { + "epoch": 1.373443210652513, + "grad_norm": 0.36046423879331024, + "learning_rate": 6.581106448210285e-05, + "loss": 2.8083, + "step": 29500 + }, + { + "epoch": 1.373489768838606, + "grad_norm": 0.3311128129352558, + "learning_rate": 6.580849471992847e-05, + "loss": 2.7535, + "step": 29501 + }, + { + "epoch": 1.3735363270246992, + "grad_norm": 0.34615756930790126, + "learning_rate": 6.580592491135768e-05, + "loss": 2.7867, + "step": 29502 + }, + { + "epoch": 1.373582885210792, + "grad_norm": 0.3736706618899882, + "learning_rate": 6.580335505639805e-05, + "loss": 2.8203, + "step": 29503 + }, + { + "epoch": 1.3736294433968852, + "grad_norm": 0.32881334927603795, + "learning_rate": 6.580078515505714e-05, + "loss": 2.8026, + "step": 29504 + }, + { + "epoch": 1.3736760015829783, + "grad_norm": 0.3696279774247501, + "learning_rate": 6.579821520734249e-05, + "loss": 2.7605, + "step": 29505 + }, + { + "epoch": 1.3737225597690714, + "grad_norm": 0.3527929984621968, + "learning_rate": 6.579564521326163e-05, + "loss": 2.837, + "step": 29506 + }, + { + "epoch": 1.3737691179551645, + "grad_norm": 0.36346111785751156, + "learning_rate": 6.579307517282212e-05, + "loss": 2.7404, + "step": 29507 + }, + { + "epoch": 1.3738156761412577, + "grad_norm": 0.3403310147148813, + "learning_rate": 6.579050508603148e-05, + "loss": 2.6695, + "step": 29508 + }, + { + "epoch": 1.3738622343273506, + "grad_norm": 0.3654994952571832, + "learning_rate": 6.578793495289726e-05, + "loss": 2.777, + "step": 29509 + }, + { + "epoch": 1.3739087925134437, + "grad_norm": 0.33108659678022384, + "learning_rate": 6.578536477342701e-05, + "loss": 2.7311, + "step": 29510 + }, + { + "epoch": 1.3739553506995368, + "grad_norm": 0.32770327504113855, + "learning_rate": 6.578279454762825e-05, + "loss": 2.6043, + "step": 29511 + }, + { + "epoch": 1.37400190888563, + "grad_norm": 0.3188019135903683, + "learning_rate": 6.578022427550859e-05, + "loss": 2.768, + "step": 29512 + }, + { + "epoch": 1.3740484670717228, + "grad_norm": 0.3555230441274456, + "learning_rate": 6.577765395707548e-05, + "loss": 2.7663, + "step": 29513 + }, + { + "epoch": 1.374095025257816, + "grad_norm": 0.31547728324239266, + "learning_rate": 6.577508359233653e-05, + "loss": 2.7436, + "step": 29514 + }, + { + "epoch": 1.374141583443909, + "grad_norm": 0.380480736985084, + "learning_rate": 6.577251318129926e-05, + "loss": 2.847, + "step": 29515 + }, + { + "epoch": 
1.3741881416300021, + "grad_norm": 0.32155619587894707, + "learning_rate": 6.576994272397122e-05, + "loss": 2.772, + "step": 29516 + }, + { + "epoch": 1.3742346998160953, + "grad_norm": 0.3569704911903245, + "learning_rate": 6.576737222035994e-05, + "loss": 2.7464, + "step": 29517 + }, + { + "epoch": 1.3742812580021884, + "grad_norm": 0.32509248778453764, + "learning_rate": 6.576480167047298e-05, + "loss": 2.7023, + "step": 29518 + }, + { + "epoch": 1.3743278161882813, + "grad_norm": 0.33464874635245745, + "learning_rate": 6.576223107431791e-05, + "loss": 2.7973, + "step": 29519 + }, + { + "epoch": 1.3743743743743744, + "grad_norm": 0.32218454183172973, + "learning_rate": 6.575966043190222e-05, + "loss": 2.7602, + "step": 29520 + }, + { + "epoch": 1.3744209325604675, + "grad_norm": 0.34070571638400066, + "learning_rate": 6.575708974323348e-05, + "loss": 2.7323, + "step": 29521 + }, + { + "epoch": 1.3744674907465604, + "grad_norm": 0.3567136955368228, + "learning_rate": 6.575451900831921e-05, + "loss": 2.7938, + "step": 29522 + }, + { + "epoch": 1.3745140489326535, + "grad_norm": 0.3561566564826962, + "learning_rate": 6.575194822716702e-05, + "loss": 2.7252, + "step": 29523 + }, + { + "epoch": 1.3745606071187466, + "grad_norm": 0.3705176806915045, + "learning_rate": 6.57493773997844e-05, + "loss": 2.8074, + "step": 29524 + }, + { + "epoch": 1.3746071653048397, + "grad_norm": 0.3384657400689002, + "learning_rate": 6.574680652617891e-05, + "loss": 2.7451, + "step": 29525 + }, + { + "epoch": 1.3746537234909328, + "grad_norm": 0.3611155187032015, + "learning_rate": 6.574423560635809e-05, + "loss": 2.7999, + "step": 29526 + }, + { + "epoch": 1.374700281677026, + "grad_norm": 0.3455007716338129, + "learning_rate": 6.574166464032949e-05, + "loss": 2.6987, + "step": 29527 + }, + { + "epoch": 1.3747468398631189, + "grad_norm": 0.348775885807629, + "learning_rate": 6.573909362810065e-05, + "loss": 2.7874, + "step": 29528 + }, + { + "epoch": 1.374793398049212, + "grad_norm": 0.3297301730645848, + "learning_rate": 6.573652256967912e-05, + "loss": 2.6637, + "step": 29529 + }, + { + "epoch": 1.374839956235305, + "grad_norm": 0.35655433108772167, + "learning_rate": 6.573395146507245e-05, + "loss": 2.8255, + "step": 29530 + }, + { + "epoch": 1.3748865144213982, + "grad_norm": 0.33772270952986533, + "learning_rate": 6.573138031428818e-05, + "loss": 2.6204, + "step": 29531 + }, + { + "epoch": 1.374933072607491, + "grad_norm": 0.3445960158049464, + "learning_rate": 6.572880911733386e-05, + "loss": 2.7872, + "step": 29532 + }, + { + "epoch": 1.3749796307935842, + "grad_norm": 0.3744194008191136, + "learning_rate": 6.572623787421704e-05, + "loss": 2.7217, + "step": 29533 + }, + { + "epoch": 1.3750261889796773, + "grad_norm": 0.3209452063279204, + "learning_rate": 6.572366658494526e-05, + "loss": 2.7837, + "step": 29534 + }, + { + "epoch": 1.3750727471657704, + "grad_norm": 0.3496193842168961, + "learning_rate": 6.572109524952607e-05, + "loss": 2.7169, + "step": 29535 + }, + { + "epoch": 1.3751193053518636, + "grad_norm": 0.3597554352228827, + "learning_rate": 6.571852386796701e-05, + "loss": 2.7869, + "step": 29536 + }, + { + "epoch": 1.3751658635379567, + "grad_norm": 0.3152374755999184, + "learning_rate": 6.571595244027563e-05, + "loss": 2.6647, + "step": 29537 + }, + { + "epoch": 1.3752124217240496, + "grad_norm": 0.4053988191595931, + "learning_rate": 6.57133809664595e-05, + "loss": 2.7915, + "step": 29538 + }, + { + "epoch": 1.3752589799101427, + "grad_norm": 0.35781976673350685, + "learning_rate": 
6.571080944652612e-05, + "loss": 2.7218, + "step": 29539 + }, + { + "epoch": 1.3753055380962358, + "grad_norm": 0.3716271384585842, + "learning_rate": 6.570823788048305e-05, + "loss": 2.7982, + "step": 29540 + }, + { + "epoch": 1.375352096282329, + "grad_norm": 0.3697567955284539, + "learning_rate": 6.570566626833787e-05, + "loss": 2.7806, + "step": 29541 + }, + { + "epoch": 1.3753986544684218, + "grad_norm": 0.3569181109368075, + "learning_rate": 6.570309461009811e-05, + "loss": 2.72, + "step": 29542 + }, + { + "epoch": 1.375445212654515, + "grad_norm": 0.3224455760050615, + "learning_rate": 6.57005229057713e-05, + "loss": 2.8383, + "step": 29543 + }, + { + "epoch": 1.375491770840608, + "grad_norm": 0.3627707586286391, + "learning_rate": 6.5697951155365e-05, + "loss": 2.7418, + "step": 29544 + }, + { + "epoch": 1.3755383290267011, + "grad_norm": 0.3007242632234968, + "learning_rate": 6.569537935888677e-05, + "loss": 2.7194, + "step": 29545 + }, + { + "epoch": 1.3755848872127943, + "grad_norm": 0.3308803672698873, + "learning_rate": 6.569280751634415e-05, + "loss": 2.6802, + "step": 29546 + }, + { + "epoch": 1.3756314453988874, + "grad_norm": 0.32106087205545836, + "learning_rate": 6.569023562774468e-05, + "loss": 2.8561, + "step": 29547 + }, + { + "epoch": 1.3756780035849803, + "grad_norm": 0.3453517492548231, + "learning_rate": 6.568766369309591e-05, + "loss": 2.7729, + "step": 29548 + }, + { + "epoch": 1.3757245617710734, + "grad_norm": 0.3251665212531857, + "learning_rate": 6.56850917124054e-05, + "loss": 2.7515, + "step": 29549 + }, + { + "epoch": 1.3757711199571665, + "grad_norm": 0.3367580708181117, + "learning_rate": 6.56825196856807e-05, + "loss": 2.7887, + "step": 29550 + }, + { + "epoch": 1.3758176781432596, + "grad_norm": 0.33076450065855273, + "learning_rate": 6.567994761292935e-05, + "loss": 2.8263, + "step": 29551 + }, + { + "epoch": 1.3758642363293525, + "grad_norm": 0.34800541396330253, + "learning_rate": 6.56773754941589e-05, + "loss": 2.7081, + "step": 29552 + }, + { + "epoch": 1.3759107945154456, + "grad_norm": 0.34660231359835775, + "learning_rate": 6.567480332937686e-05, + "loss": 2.8263, + "step": 29553 + }, + { + "epoch": 1.3759573527015387, + "grad_norm": 0.3050896618216723, + "learning_rate": 6.567223111859085e-05, + "loss": 2.7084, + "step": 29554 + }, + { + "epoch": 1.3760039108876319, + "grad_norm": 0.34642499647562697, + "learning_rate": 6.566965886180838e-05, + "loss": 2.7216, + "step": 29555 + }, + { + "epoch": 1.376050469073725, + "grad_norm": 0.3393086233669767, + "learning_rate": 6.5667086559037e-05, + "loss": 2.8916, + "step": 29556 + }, + { + "epoch": 1.376097027259818, + "grad_norm": 0.3289203515909889, + "learning_rate": 6.566451421028427e-05, + "loss": 2.659, + "step": 29557 + }, + { + "epoch": 1.376143585445911, + "grad_norm": 0.32935991442352713, + "learning_rate": 6.566194181555773e-05, + "loss": 2.7021, + "step": 29558 + }, + { + "epoch": 1.376190143632004, + "grad_norm": 0.3639662463301376, + "learning_rate": 6.565936937486493e-05, + "loss": 2.7205, + "step": 29559 + }, + { + "epoch": 1.3762367018180972, + "grad_norm": 0.3203152455624787, + "learning_rate": 6.565679688821344e-05, + "loss": 2.7906, + "step": 29560 + }, + { + "epoch": 1.37628326000419, + "grad_norm": 0.34890169184663866, + "learning_rate": 6.565422435561077e-05, + "loss": 2.7981, + "step": 29561 + }, + { + "epoch": 1.3763298181902832, + "grad_norm": 0.3141305852948295, + "learning_rate": 6.56516517770645e-05, + "loss": 2.8208, + "step": 29562 + }, + { + "epoch": 
1.3763763763763763, + "grad_norm": 0.35200200035301743, + "learning_rate": 6.564907915258219e-05, + "loss": 2.7655, + "step": 29563 + }, + { + "epoch": 1.3764229345624694, + "grad_norm": 0.35212931246758183, + "learning_rate": 6.564650648217136e-05, + "loss": 2.7786, + "step": 29564 + }, + { + "epoch": 1.3764694927485626, + "grad_norm": 0.3210036514712657, + "learning_rate": 6.564393376583959e-05, + "loss": 2.847, + "step": 29565 + }, + { + "epoch": 1.3765160509346557, + "grad_norm": 0.346444194116812, + "learning_rate": 6.56413610035944e-05, + "loss": 2.6522, + "step": 29566 + }, + { + "epoch": 1.3765626091207488, + "grad_norm": 0.3477306432931421, + "learning_rate": 6.563878819544336e-05, + "loss": 2.8054, + "step": 29567 + }, + { + "epoch": 1.3766091673068417, + "grad_norm": 0.34504124603650765, + "learning_rate": 6.563621534139401e-05, + "loss": 2.7781, + "step": 29568 + }, + { + "epoch": 1.3766557254929348, + "grad_norm": 0.3633171196218811, + "learning_rate": 6.563364244145393e-05, + "loss": 2.773, + "step": 29569 + }, + { + "epoch": 1.376702283679028, + "grad_norm": 0.3255846827034128, + "learning_rate": 6.563106949563063e-05, + "loss": 2.6878, + "step": 29570 + }, + { + "epoch": 1.3767488418651208, + "grad_norm": 0.3656357473014939, + "learning_rate": 6.562849650393168e-05, + "loss": 2.7496, + "step": 29571 + }, + { + "epoch": 1.376795400051214, + "grad_norm": 0.32095732615966954, + "learning_rate": 6.562592346636465e-05, + "loss": 2.7585, + "step": 29572 + }, + { + "epoch": 1.376841958237307, + "grad_norm": 0.34153317912090914, + "learning_rate": 6.562335038293707e-05, + "loss": 2.6859, + "step": 29573 + }, + { + "epoch": 1.3768885164234002, + "grad_norm": 0.32498054208934024, + "learning_rate": 6.562077725365647e-05, + "loss": 2.778, + "step": 29574 + }, + { + "epoch": 1.3769350746094933, + "grad_norm": 0.3347790812524379, + "learning_rate": 6.561820407853045e-05, + "loss": 2.816, + "step": 29575 + }, + { + "epoch": 1.3769816327955864, + "grad_norm": 0.3490041152112968, + "learning_rate": 6.561563085756655e-05, + "loss": 2.7235, + "step": 29576 + }, + { + "epoch": 1.3770281909816793, + "grad_norm": 0.33242552624921373, + "learning_rate": 6.56130575907723e-05, + "loss": 2.6458, + "step": 29577 + }, + { + "epoch": 1.3770747491677724, + "grad_norm": 0.32617313325468417, + "learning_rate": 6.561048427815526e-05, + "loss": 2.7733, + "step": 29578 + }, + { + "epoch": 1.3771213073538655, + "grad_norm": 0.34443574880614497, + "learning_rate": 6.560791091972299e-05, + "loss": 2.7974, + "step": 29579 + }, + { + "epoch": 1.3771678655399586, + "grad_norm": 0.31081334568345115, + "learning_rate": 6.560533751548303e-05, + "loss": 2.6793, + "step": 29580 + }, + { + "epoch": 1.3772144237260515, + "grad_norm": 0.3316341565431312, + "learning_rate": 6.560276406544295e-05, + "loss": 2.6481, + "step": 29581 + }, + { + "epoch": 1.3772609819121446, + "grad_norm": 0.30911689234635475, + "learning_rate": 6.560019056961031e-05, + "loss": 2.7441, + "step": 29582 + }, + { + "epoch": 1.3773075400982377, + "grad_norm": 0.3486719040681301, + "learning_rate": 6.559761702799263e-05, + "loss": 2.6854, + "step": 29583 + }, + { + "epoch": 1.3773540982843309, + "grad_norm": 0.3028976212482853, + "learning_rate": 6.559504344059748e-05, + "loss": 2.7613, + "step": 29584 + }, + { + "epoch": 1.377400656470424, + "grad_norm": 0.30018660724020446, + "learning_rate": 6.559246980743241e-05, + "loss": 2.7274, + "step": 29585 + }, + { + "epoch": 1.377447214656517, + "grad_norm": 0.3157968589124764, + "learning_rate": 
6.558989612850498e-05, + "loss": 2.8097, + "step": 29586 + }, + { + "epoch": 1.37749377284261, + "grad_norm": 0.3280531435772544, + "learning_rate": 6.558732240382275e-05, + "loss": 2.7269, + "step": 29587 + }, + { + "epoch": 1.377540331028703, + "grad_norm": 0.3097116008677878, + "learning_rate": 6.558474863339326e-05, + "loss": 2.7899, + "step": 29588 + }, + { + "epoch": 1.3775868892147962, + "grad_norm": 0.3400461878771905, + "learning_rate": 6.558217481722407e-05, + "loss": 2.7454, + "step": 29589 + }, + { + "epoch": 1.3776334474008893, + "grad_norm": 0.32147264504249895, + "learning_rate": 6.557960095532273e-05, + "loss": 2.7342, + "step": 29590 + }, + { + "epoch": 1.3776800055869822, + "grad_norm": 0.30667803456204845, + "learning_rate": 6.55770270476968e-05, + "loss": 2.7927, + "step": 29591 + }, + { + "epoch": 1.3777265637730753, + "grad_norm": 0.3200778397417475, + "learning_rate": 6.557445309435383e-05, + "loss": 2.5951, + "step": 29592 + }, + { + "epoch": 1.3777731219591685, + "grad_norm": 0.3344399364800402, + "learning_rate": 6.557187909530135e-05, + "loss": 2.6851, + "step": 29593 + }, + { + "epoch": 1.3778196801452616, + "grad_norm": 0.3233718534321278, + "learning_rate": 6.556930505054697e-05, + "loss": 2.7387, + "step": 29594 + }, + { + "epoch": 1.3778662383313547, + "grad_norm": 0.3314072159013813, + "learning_rate": 6.55667309600982e-05, + "loss": 2.7224, + "step": 29595 + }, + { + "epoch": 1.3779127965174478, + "grad_norm": 0.3232124834094844, + "learning_rate": 6.556415682396261e-05, + "loss": 2.8685, + "step": 29596 + }, + { + "epoch": 1.3779593547035407, + "grad_norm": 0.34639300775405124, + "learning_rate": 6.556158264214776e-05, + "loss": 2.6922, + "step": 29597 + }, + { + "epoch": 1.3780059128896338, + "grad_norm": 0.3097759979554444, + "learning_rate": 6.555900841466118e-05, + "loss": 2.7673, + "step": 29598 + }, + { + "epoch": 1.378052471075727, + "grad_norm": 0.37375155607537014, + "learning_rate": 6.555643414151045e-05, + "loss": 2.7792, + "step": 29599 + }, + { + "epoch": 1.37809902926182, + "grad_norm": 0.3157661669734966, + "learning_rate": 6.555385982270315e-05, + "loss": 2.6901, + "step": 29600 + }, + { + "epoch": 1.378145587447913, + "grad_norm": 0.3240257666455579, + "learning_rate": 6.555128545824676e-05, + "loss": 2.7596, + "step": 29601 + }, + { + "epoch": 1.378192145634006, + "grad_norm": 0.3413426261021223, + "learning_rate": 6.55487110481489e-05, + "loss": 2.7079, + "step": 29602 + }, + { + "epoch": 1.3782387038200992, + "grad_norm": 0.34189333187012594, + "learning_rate": 6.55461365924171e-05, + "loss": 2.78, + "step": 29603 + }, + { + "epoch": 1.3782852620061923, + "grad_norm": 0.3454887613441185, + "learning_rate": 6.554356209105892e-05, + "loss": 2.7542, + "step": 29604 + }, + { + "epoch": 1.3783318201922854, + "grad_norm": 0.3367060621971102, + "learning_rate": 6.554098754408194e-05, + "loss": 2.6543, + "step": 29605 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 0.32994647698910223, + "learning_rate": 6.553841295149366e-05, + "loss": 2.7786, + "step": 29606 + }, + { + "epoch": 1.3784249365644714, + "grad_norm": 0.3349899191101422, + "learning_rate": 6.553583831330167e-05, + "loss": 2.6948, + "step": 29607 + }, + { + "epoch": 1.3784714947505645, + "grad_norm": 0.3203811788055051, + "learning_rate": 6.553326362951353e-05, + "loss": 2.8161, + "step": 29608 + }, + { + "epoch": 1.3785180529366576, + "grad_norm": 0.32599775620871385, + "learning_rate": 6.553068890013681e-05, + "loss": 2.8276, + "step": 29609 + }, + { + "epoch": 
1.3785646111227505, + "grad_norm": 0.33949560279273056, + "learning_rate": 6.552811412517903e-05, + "loss": 2.6773, + "step": 29610 + }, + { + "epoch": 1.3786111693088436, + "grad_norm": 0.3202273952134901, + "learning_rate": 6.552553930464777e-05, + "loss": 2.7767, + "step": 29611 + }, + { + "epoch": 1.3786577274949368, + "grad_norm": 0.3455246970739094, + "learning_rate": 6.552296443855057e-05, + "loss": 2.8635, + "step": 29612 + }, + { + "epoch": 1.3787042856810299, + "grad_norm": 0.3499869402791991, + "learning_rate": 6.552038952689502e-05, + "loss": 2.753, + "step": 29613 + }, + { + "epoch": 1.378750843867123, + "grad_norm": 0.3364656350479904, + "learning_rate": 6.551781456968864e-05, + "loss": 2.7585, + "step": 29614 + }, + { + "epoch": 1.378797402053216, + "grad_norm": 0.34401682552550467, + "learning_rate": 6.551523956693901e-05, + "loss": 2.6767, + "step": 29615 + }, + { + "epoch": 1.378843960239309, + "grad_norm": 0.35481122797324166, + "learning_rate": 6.551266451865369e-05, + "loss": 2.6345, + "step": 29616 + }, + { + "epoch": 1.3788905184254021, + "grad_norm": 0.3635632289656059, + "learning_rate": 6.55100894248402e-05, + "loss": 2.7578, + "step": 29617 + }, + { + "epoch": 1.3789370766114952, + "grad_norm": 0.31926260276642815, + "learning_rate": 6.550751428550616e-05, + "loss": 2.7551, + "step": 29618 + }, + { + "epoch": 1.3789836347975883, + "grad_norm": 0.3612510535131482, + "learning_rate": 6.550493910065907e-05, + "loss": 2.6387, + "step": 29619 + }, + { + "epoch": 1.3790301929836812, + "grad_norm": 0.320427627261037, + "learning_rate": 6.550236387030651e-05, + "loss": 2.7032, + "step": 29620 + }, + { + "epoch": 1.3790767511697744, + "grad_norm": 0.3299609914915809, + "learning_rate": 6.549978859445604e-05, + "loss": 2.6825, + "step": 29621 + }, + { + "epoch": 1.3791233093558675, + "grad_norm": 0.37822503977468136, + "learning_rate": 6.549721327311522e-05, + "loss": 2.6676, + "step": 29622 + }, + { + "epoch": 1.3791698675419606, + "grad_norm": 0.3250278355063686, + "learning_rate": 6.549463790629163e-05, + "loss": 2.6811, + "step": 29623 + }, + { + "epoch": 1.3792164257280537, + "grad_norm": 0.37553939024899474, + "learning_rate": 6.549206249399278e-05, + "loss": 2.8594, + "step": 29624 + }, + { + "epoch": 1.3792629839141468, + "grad_norm": 0.32520693057648564, + "learning_rate": 6.548948703622625e-05, + "loss": 2.8822, + "step": 29625 + }, + { + "epoch": 1.3793095421002397, + "grad_norm": 0.35683887208999987, + "learning_rate": 6.548691153299962e-05, + "loss": 2.7398, + "step": 29626 + }, + { + "epoch": 1.3793561002863328, + "grad_norm": 0.3253340288856503, + "learning_rate": 6.548433598432041e-05, + "loss": 2.7777, + "step": 29627 + }, + { + "epoch": 1.379402658472426, + "grad_norm": 0.3331153498176309, + "learning_rate": 6.548176039019621e-05, + "loss": 2.8093, + "step": 29628 + }, + { + "epoch": 1.379449216658519, + "grad_norm": 0.3287387917952718, + "learning_rate": 6.547918475063456e-05, + "loss": 2.8086, + "step": 29629 + }, + { + "epoch": 1.379495774844612, + "grad_norm": 0.35704202169851185, + "learning_rate": 6.547660906564303e-05, + "loss": 2.7202, + "step": 29630 + }, + { + "epoch": 1.379542333030705, + "grad_norm": 0.3285811272965743, + "learning_rate": 6.54740333352292e-05, + "loss": 2.7677, + "step": 29631 + }, + { + "epoch": 1.3795888912167982, + "grad_norm": 0.34682784905732117, + "learning_rate": 6.547145755940059e-05, + "loss": 2.683, + "step": 29632 + }, + { + "epoch": 1.3796354494028913, + "grad_norm": 0.3190408307926991, + "learning_rate": 
6.546888173816476e-05, + "loss": 2.7113, + "step": 29633 + }, + { + "epoch": 1.3796820075889844, + "grad_norm": 0.32485552657286504, + "learning_rate": 6.54663058715293e-05, + "loss": 2.8229, + "step": 29634 + }, + { + "epoch": 1.3797285657750775, + "grad_norm": 0.33831771137338085, + "learning_rate": 6.546372995950175e-05, + "loss": 2.7494, + "step": 29635 + }, + { + "epoch": 1.3797751239611704, + "grad_norm": 0.31613157433940553, + "learning_rate": 6.546115400208969e-05, + "loss": 2.8158, + "step": 29636 + }, + { + "epoch": 1.3798216821472635, + "grad_norm": 0.3269562802362158, + "learning_rate": 6.545857799930065e-05, + "loss": 2.7083, + "step": 29637 + }, + { + "epoch": 1.3798682403333566, + "grad_norm": 0.35790436528714, + "learning_rate": 6.545600195114221e-05, + "loss": 2.7123, + "step": 29638 + }, + { + "epoch": 1.3799147985194498, + "grad_norm": 0.3473297499196903, + "learning_rate": 6.545342585762194e-05, + "loss": 2.7476, + "step": 29639 + }, + { + "epoch": 1.3799613567055427, + "grad_norm": 0.3256165882803064, + "learning_rate": 6.545084971874738e-05, + "loss": 2.7749, + "step": 29640 + }, + { + "epoch": 1.3800079148916358, + "grad_norm": 0.34739569931735687, + "learning_rate": 6.54482735345261e-05, + "loss": 2.6966, + "step": 29641 + }, + { + "epoch": 1.3800544730777289, + "grad_norm": 0.33185631473357724, + "learning_rate": 6.544569730496566e-05, + "loss": 2.6714, + "step": 29642 + }, + { + "epoch": 1.380101031263822, + "grad_norm": 0.3345849535799646, + "learning_rate": 6.544312103007361e-05, + "loss": 2.7821, + "step": 29643 + }, + { + "epoch": 1.3801475894499151, + "grad_norm": 0.33166496688638025, + "learning_rate": 6.544054470985753e-05, + "loss": 2.7554, + "step": 29644 + }, + { + "epoch": 1.3801941476360082, + "grad_norm": 0.3128047030712355, + "learning_rate": 6.543796834432496e-05, + "loss": 2.8794, + "step": 29645 + }, + { + "epoch": 1.3802407058221011, + "grad_norm": 0.33482995051140285, + "learning_rate": 6.54353919334835e-05, + "loss": 2.7742, + "step": 29646 + }, + { + "epoch": 1.3802872640081942, + "grad_norm": 0.3035378790531325, + "learning_rate": 6.543281547734066e-05, + "loss": 2.821, + "step": 29647 + }, + { + "epoch": 1.3803338221942874, + "grad_norm": 0.34613496199262045, + "learning_rate": 6.543023897590402e-05, + "loss": 2.8194, + "step": 29648 + }, + { + "epoch": 1.3803803803803802, + "grad_norm": 0.32238370603288, + "learning_rate": 6.542766242918117e-05, + "loss": 2.814, + "step": 29649 + }, + { + "epoch": 1.3804269385664734, + "grad_norm": 0.3581647529800704, + "learning_rate": 6.542508583717965e-05, + "loss": 2.7777, + "step": 29650 + }, + { + "epoch": 1.3804734967525665, + "grad_norm": 0.31585284707670325, + "learning_rate": 6.542250919990702e-05, + "loss": 2.8173, + "step": 29651 + }, + { + "epoch": 1.3805200549386596, + "grad_norm": 0.3499249003685464, + "learning_rate": 6.541993251737084e-05, + "loss": 2.7321, + "step": 29652 + }, + { + "epoch": 1.3805666131247527, + "grad_norm": 0.321173924440591, + "learning_rate": 6.541735578957868e-05, + "loss": 2.8105, + "step": 29653 + }, + { + "epoch": 1.3806131713108458, + "grad_norm": 0.32648491423957454, + "learning_rate": 6.54147790165381e-05, + "loss": 2.6841, + "step": 29654 + }, + { + "epoch": 1.380659729496939, + "grad_norm": 0.31840480347371086, + "learning_rate": 6.541220219825666e-05, + "loss": 2.5778, + "step": 29655 + }, + { + "epoch": 1.3807062876830318, + "grad_norm": 0.32373907444647854, + "learning_rate": 6.540962533474192e-05, + "loss": 2.7635, + "step": 29656 + }, + { + "epoch": 
1.380752845869125, + "grad_norm": 0.34254652548152326, + "learning_rate": 6.540704842600146e-05, + "loss": 2.7459, + "step": 29657 + }, + { + "epoch": 1.380799404055218, + "grad_norm": 0.3504742202597999, + "learning_rate": 6.540447147204283e-05, + "loss": 2.8274, + "step": 29658 + }, + { + "epoch": 1.380845962241311, + "grad_norm": 0.3431546408422811, + "learning_rate": 6.54018944728736e-05, + "loss": 2.7013, + "step": 29659 + }, + { + "epoch": 1.380892520427404, + "grad_norm": 0.33633129958415287, + "learning_rate": 6.53993174285013e-05, + "loss": 2.719, + "step": 29660 + }, + { + "epoch": 1.3809390786134972, + "grad_norm": 0.3311921450744178, + "learning_rate": 6.539674033893353e-05, + "loss": 2.7207, + "step": 29661 + }, + { + "epoch": 1.3809856367995903, + "grad_norm": 0.34132088827718704, + "learning_rate": 6.539416320417786e-05, + "loss": 2.7315, + "step": 29662 + }, + { + "epoch": 1.3810321949856834, + "grad_norm": 0.34585896463144333, + "learning_rate": 6.539158602424183e-05, + "loss": 2.7777, + "step": 29663 + }, + { + "epoch": 1.3810787531717765, + "grad_norm": 0.3446289532967389, + "learning_rate": 6.5389008799133e-05, + "loss": 2.7414, + "step": 29664 + }, + { + "epoch": 1.3811253113578694, + "grad_norm": 0.3220778257883407, + "learning_rate": 6.538643152885897e-05, + "loss": 2.73, + "step": 29665 + }, + { + "epoch": 1.3811718695439625, + "grad_norm": 0.3369838701108945, + "learning_rate": 6.538385421342725e-05, + "loss": 2.7317, + "step": 29666 + }, + { + "epoch": 1.3812184277300557, + "grad_norm": 0.3232207130736359, + "learning_rate": 6.538127685284545e-05, + "loss": 2.8038, + "step": 29667 + }, + { + "epoch": 1.3812649859161488, + "grad_norm": 0.3700931526462278, + "learning_rate": 6.537869944712112e-05, + "loss": 2.8032, + "step": 29668 + }, + { + "epoch": 1.3813115441022417, + "grad_norm": 0.35168809334150625, + "learning_rate": 6.537612199626181e-05, + "loss": 2.7345, + "step": 29669 + }, + { + "epoch": 1.3813581022883348, + "grad_norm": 0.35831908204900226, + "learning_rate": 6.537354450027511e-05, + "loss": 2.7984, + "step": 29670 + }, + { + "epoch": 1.381404660474428, + "grad_norm": 0.38222517661298955, + "learning_rate": 6.537096695916856e-05, + "loss": 2.7499, + "step": 29671 + }, + { + "epoch": 1.381451218660521, + "grad_norm": 0.3346623486806712, + "learning_rate": 6.536838937294974e-05, + "loss": 2.8064, + "step": 29672 + }, + { + "epoch": 1.3814977768466141, + "grad_norm": 0.3405426037131302, + "learning_rate": 6.536581174162621e-05, + "loss": 2.6788, + "step": 29673 + }, + { + "epoch": 1.3815443350327072, + "grad_norm": 0.36924165333531495, + "learning_rate": 6.536323406520555e-05, + "loss": 2.7239, + "step": 29674 + }, + { + "epoch": 1.3815908932188001, + "grad_norm": 0.3494299617451716, + "learning_rate": 6.536065634369529e-05, + "loss": 2.7904, + "step": 29675 + }, + { + "epoch": 1.3816374514048932, + "grad_norm": 0.31299149905126045, + "learning_rate": 6.535807857710304e-05, + "loss": 2.7332, + "step": 29676 + }, + { + "epoch": 1.3816840095909864, + "grad_norm": 0.36029158823142743, + "learning_rate": 6.535550076543633e-05, + "loss": 2.7199, + "step": 29677 + }, + { + "epoch": 1.3817305677770795, + "grad_norm": 0.33724784649378997, + "learning_rate": 6.535292290870274e-05, + "loss": 2.6426, + "step": 29678 + }, + { + "epoch": 1.3817771259631724, + "grad_norm": 0.33862290736891193, + "learning_rate": 6.535034500690983e-05, + "loss": 2.7286, + "step": 29679 + }, + { + "epoch": 1.3818236841492655, + "grad_norm": 0.3518663019043714, + "learning_rate": 
6.534776706006517e-05, + "loss": 2.7322, + "step": 29680 + }, + { + "epoch": 1.3818702423353586, + "grad_norm": 0.3370323265547954, + "learning_rate": 6.534518906817632e-05, + "loss": 2.8158, + "step": 29681 + }, + { + "epoch": 1.3819168005214517, + "grad_norm": 0.3416954507617078, + "learning_rate": 6.534261103125086e-05, + "loss": 2.6422, + "step": 29682 + }, + { + "epoch": 1.3819633587075448, + "grad_norm": 0.3290562167103, + "learning_rate": 6.534003294929636e-05, + "loss": 2.7693, + "step": 29683 + }, + { + "epoch": 1.382009916893638, + "grad_norm": 0.3136667714396712, + "learning_rate": 6.533745482232035e-05, + "loss": 2.7755, + "step": 29684 + }, + { + "epoch": 1.3820564750797308, + "grad_norm": 0.3572272481884378, + "learning_rate": 6.533487665033043e-05, + "loss": 2.7922, + "step": 29685 + }, + { + "epoch": 1.382103033265824, + "grad_norm": 0.32122006190128827, + "learning_rate": 6.533229843333417e-05, + "loss": 2.7405, + "step": 29686 + }, + { + "epoch": 1.382149591451917, + "grad_norm": 0.32433871827957855, + "learning_rate": 6.532972017133912e-05, + "loss": 2.8112, + "step": 29687 + }, + { + "epoch": 1.3821961496380102, + "grad_norm": 0.30327188440876124, + "learning_rate": 6.532714186435283e-05, + "loss": 2.7646, + "step": 29688 + }, + { + "epoch": 1.382242707824103, + "grad_norm": 0.3379628382864632, + "learning_rate": 6.532456351238292e-05, + "loss": 2.7606, + "step": 29689 + }, + { + "epoch": 1.3822892660101962, + "grad_norm": 0.3174787499771071, + "learning_rate": 6.53219851154369e-05, + "loss": 2.7135, + "step": 29690 + }, + { + "epoch": 1.3823358241962893, + "grad_norm": 0.3329543908710795, + "learning_rate": 6.531940667352239e-05, + "loss": 2.7743, + "step": 29691 + }, + { + "epoch": 1.3823823823823824, + "grad_norm": 0.3247211860806968, + "learning_rate": 6.53168281866469e-05, + "loss": 2.7115, + "step": 29692 + }, + { + "epoch": 1.3824289405684755, + "grad_norm": 0.3356857217577396, + "learning_rate": 6.531424965481805e-05, + "loss": 2.7169, + "step": 29693 + }, + { + "epoch": 1.3824754987545687, + "grad_norm": 0.35916726873643784, + "learning_rate": 6.531167107804337e-05, + "loss": 2.7642, + "step": 29694 + }, + { + "epoch": 1.3825220569406615, + "grad_norm": 0.3628601220394676, + "learning_rate": 6.530909245633047e-05, + "loss": 2.7594, + "step": 29695 + }, + { + "epoch": 1.3825686151267547, + "grad_norm": 0.33612995947052754, + "learning_rate": 6.530651378968686e-05, + "loss": 2.7551, + "step": 29696 + }, + { + "epoch": 1.3826151733128478, + "grad_norm": 0.3376138879240666, + "learning_rate": 6.530393507812015e-05, + "loss": 2.786, + "step": 29697 + }, + { + "epoch": 1.3826617314989407, + "grad_norm": 0.36241894123254637, + "learning_rate": 6.53013563216379e-05, + "loss": 2.8136, + "step": 29698 + }, + { + "epoch": 1.3827082896850338, + "grad_norm": 0.35451857664111003, + "learning_rate": 6.529877752024769e-05, + "loss": 2.7743, + "step": 29699 + }, + { + "epoch": 1.382754847871127, + "grad_norm": 0.3658611511732602, + "learning_rate": 6.529619867395707e-05, + "loss": 2.7589, + "step": 29700 + }, + { + "epoch": 1.38280140605722, + "grad_norm": 0.3364221899191697, + "learning_rate": 6.529361978277359e-05, + "loss": 2.8038, + "step": 29701 + }, + { + "epoch": 1.3828479642433131, + "grad_norm": 0.38244289375762336, + "learning_rate": 6.529104084670488e-05, + "loss": 2.71, + "step": 29702 + }, + { + "epoch": 1.3828945224294062, + "grad_norm": 0.3470055272486459, + "learning_rate": 6.528846186575845e-05, + "loss": 2.7913, + "step": 29703 + }, + { + "epoch": 
1.3829410806154991, + "grad_norm": 0.3938561651717545, + "learning_rate": 6.528588283994191e-05, + "loss": 2.895, + "step": 29704 + }, + { + "epoch": 1.3829876388015923, + "grad_norm": 0.3616624779362021, + "learning_rate": 6.528330376926279e-05, + "loss": 2.9195, + "step": 29705 + }, + { + "epoch": 1.3830341969876854, + "grad_norm": 0.34640573831112526, + "learning_rate": 6.528072465372868e-05, + "loss": 2.6913, + "step": 29706 + }, + { + "epoch": 1.3830807551737785, + "grad_norm": 0.32849547473140434, + "learning_rate": 6.527814549334714e-05, + "loss": 2.7844, + "step": 29707 + }, + { + "epoch": 1.3831273133598714, + "grad_norm": 0.35215031004671893, + "learning_rate": 6.527556628812575e-05, + "loss": 2.7766, + "step": 29708 + }, + { + "epoch": 1.3831738715459645, + "grad_norm": 0.3376339718325006, + "learning_rate": 6.527298703807209e-05, + "loss": 2.7044, + "step": 29709 + }, + { + "epoch": 1.3832204297320576, + "grad_norm": 0.3440941626175142, + "learning_rate": 6.527040774319372e-05, + "loss": 2.759, + "step": 29710 + }, + { + "epoch": 1.3832669879181507, + "grad_norm": 0.35397713521823704, + "learning_rate": 6.52678284034982e-05, + "loss": 2.7432, + "step": 29711 + }, + { + "epoch": 1.3833135461042438, + "grad_norm": 0.32693994281501915, + "learning_rate": 6.526524901899311e-05, + "loss": 2.8039, + "step": 29712 + }, + { + "epoch": 1.383360104290337, + "grad_norm": 0.3488025690295765, + "learning_rate": 6.526266958968602e-05, + "loss": 2.7545, + "step": 29713 + }, + { + "epoch": 1.3834066624764298, + "grad_norm": 0.32527579965683656, + "learning_rate": 6.526009011558449e-05, + "loss": 2.7761, + "step": 29714 + }, + { + "epoch": 1.383453220662523, + "grad_norm": 0.3404543295976514, + "learning_rate": 6.52575105966961e-05, + "loss": 2.7693, + "step": 29715 + }, + { + "epoch": 1.383499778848616, + "grad_norm": 0.3429730729086449, + "learning_rate": 6.525493103302843e-05, + "loss": 2.7539, + "step": 29716 + }, + { + "epoch": 1.3835463370347092, + "grad_norm": 0.3473603059957221, + "learning_rate": 6.525235142458903e-05, + "loss": 2.7949, + "step": 29717 + }, + { + "epoch": 1.383592895220802, + "grad_norm": 0.3285862706063005, + "learning_rate": 6.52497717713855e-05, + "loss": 2.7105, + "step": 29718 + }, + { + "epoch": 1.3836394534068952, + "grad_norm": 0.3720522032029334, + "learning_rate": 6.524719207342538e-05, + "loss": 2.68, + "step": 29719 + }, + { + "epoch": 1.3836860115929883, + "grad_norm": 0.3422147117068387, + "learning_rate": 6.524461233071624e-05, + "loss": 2.8625, + "step": 29720 + }, + { + "epoch": 1.3837325697790814, + "grad_norm": 0.35061519045881484, + "learning_rate": 6.524203254326567e-05, + "loss": 2.6391, + "step": 29721 + }, + { + "epoch": 1.3837791279651745, + "grad_norm": 0.34610959300417954, + "learning_rate": 6.523945271108125e-05, + "loss": 2.772, + "step": 29722 + }, + { + "epoch": 1.3838256861512677, + "grad_norm": 0.3356250143008105, + "learning_rate": 6.523687283417052e-05, + "loss": 2.7577, + "step": 29723 + }, + { + "epoch": 1.3838722443373606, + "grad_norm": 0.36208009581286404, + "learning_rate": 6.523429291254108e-05, + "loss": 2.8288, + "step": 29724 + }, + { + "epoch": 1.3839188025234537, + "grad_norm": 0.313531911326264, + "learning_rate": 6.52317129462005e-05, + "loss": 2.8, + "step": 29725 + }, + { + "epoch": 1.3839653607095468, + "grad_norm": 0.3486133741149013, + "learning_rate": 6.522913293515633e-05, + "loss": 2.8891, + "step": 29726 + }, + { + "epoch": 1.38401191889564, + "grad_norm": 0.3559439207151251, + "learning_rate": 
6.522655287941616e-05, + "loss": 2.8376, + "step": 29727 + }, + { + "epoch": 1.3840584770817328, + "grad_norm": 0.3423587915160363, + "learning_rate": 6.522397277898755e-05, + "loss": 2.7296, + "step": 29728 + }, + { + "epoch": 1.384105035267826, + "grad_norm": 0.3399734398356366, + "learning_rate": 6.522139263387809e-05, + "loss": 2.7455, + "step": 29729 + }, + { + "epoch": 1.384151593453919, + "grad_norm": 0.34880340737593823, + "learning_rate": 6.521881244409534e-05, + "loss": 2.6937, + "step": 29730 + }, + { + "epoch": 1.3841981516400121, + "grad_norm": 0.3615006289946967, + "learning_rate": 6.521623220964688e-05, + "loss": 2.6705, + "step": 29731 + }, + { + "epoch": 1.3842447098261053, + "grad_norm": 0.35058554515343127, + "learning_rate": 6.521365193054026e-05, + "loss": 2.6128, + "step": 29732 + }, + { + "epoch": 1.3842912680121984, + "grad_norm": 0.34224642820405515, + "learning_rate": 6.521107160678308e-05, + "loss": 2.852, + "step": 29733 + }, + { + "epoch": 1.3843378261982913, + "grad_norm": 0.3970240218308968, + "learning_rate": 6.52084912383829e-05, + "loss": 2.8263, + "step": 29734 + }, + { + "epoch": 1.3843843843843844, + "grad_norm": 0.3353466800847121, + "learning_rate": 6.520591082534731e-05, + "loss": 2.73, + "step": 29735 + }, + { + "epoch": 1.3844309425704775, + "grad_norm": 0.34067045620276487, + "learning_rate": 6.520333036768387e-05, + "loss": 2.8111, + "step": 29736 + }, + { + "epoch": 1.3844775007565704, + "grad_norm": 0.3397062966713954, + "learning_rate": 6.520074986540013e-05, + "loss": 2.6023, + "step": 29737 + }, + { + "epoch": 1.3845240589426635, + "grad_norm": 0.3351351624655546, + "learning_rate": 6.51981693185037e-05, + "loss": 2.8364, + "step": 29738 + }, + { + "epoch": 1.3845706171287566, + "grad_norm": 0.340993384970584, + "learning_rate": 6.519558872700213e-05, + "loss": 2.8277, + "step": 29739 + }, + { + "epoch": 1.3846171753148497, + "grad_norm": 0.35383448113264715, + "learning_rate": 6.519300809090303e-05, + "loss": 2.7002, + "step": 29740 + }, + { + "epoch": 1.3846637335009429, + "grad_norm": 0.33314995267867326, + "learning_rate": 6.519042741021393e-05, + "loss": 2.7792, + "step": 29741 + }, + { + "epoch": 1.384710291687036, + "grad_norm": 0.32915619774714683, + "learning_rate": 6.518784668494242e-05, + "loss": 2.7766, + "step": 29742 + }, + { + "epoch": 1.384756849873129, + "grad_norm": 0.355959505907522, + "learning_rate": 6.518526591509609e-05, + "loss": 2.746, + "step": 29743 + }, + { + "epoch": 1.384803408059222, + "grad_norm": 0.3323742926704807, + "learning_rate": 6.518268510068248e-05, + "loss": 2.7736, + "step": 29744 + }, + { + "epoch": 1.384849966245315, + "grad_norm": 0.33190897422588583, + "learning_rate": 6.51801042417092e-05, + "loss": 2.7007, + "step": 29745 + }, + { + "epoch": 1.3848965244314082, + "grad_norm": 0.32309264096338136, + "learning_rate": 6.517752333818381e-05, + "loss": 2.772, + "step": 29746 + }, + { + "epoch": 1.384943082617501, + "grad_norm": 0.3293935714211282, + "learning_rate": 6.517494239011389e-05, + "loss": 2.7072, + "step": 29747 + }, + { + "epoch": 1.3849896408035942, + "grad_norm": 0.32184941836155434, + "learning_rate": 6.517236139750698e-05, + "loss": 2.7963, + "step": 29748 + }, + { + "epoch": 1.3850361989896873, + "grad_norm": 0.3368595730772743, + "learning_rate": 6.516978036037071e-05, + "loss": 2.834, + "step": 29749 + }, + { + "epoch": 1.3850827571757804, + "grad_norm": 0.31473178456344203, + "learning_rate": 6.516719927871263e-05, + "loss": 2.7959, + "step": 29750 + }, + { + "epoch": 
1.3851293153618736, + "grad_norm": 0.3418404258082345, + "learning_rate": 6.516461815254029e-05, + "loss": 2.7916, + "step": 29751 + }, + { + "epoch": 1.3851758735479667, + "grad_norm": 0.329038374686537, + "learning_rate": 6.516203698186131e-05, + "loss": 2.8794, + "step": 29752 + }, + { + "epoch": 1.3852224317340596, + "grad_norm": 0.3259231506475236, + "learning_rate": 6.515945576668325e-05, + "loss": 2.8078, + "step": 29753 + }, + { + "epoch": 1.3852689899201527, + "grad_norm": 0.3437513229514423, + "learning_rate": 6.515687450701366e-05, + "loss": 2.7435, + "step": 29754 + }, + { + "epoch": 1.3853155481062458, + "grad_norm": 0.31574160157024217, + "learning_rate": 6.515429320286016e-05, + "loss": 2.6472, + "step": 29755 + }, + { + "epoch": 1.385362106292339, + "grad_norm": 0.3377453723807207, + "learning_rate": 6.515171185423029e-05, + "loss": 2.8261, + "step": 29756 + }, + { + "epoch": 1.3854086644784318, + "grad_norm": 0.33267094465047686, + "learning_rate": 6.514913046113164e-05, + "loss": 2.761, + "step": 29757 + }, + { + "epoch": 1.385455222664525, + "grad_norm": 0.3577069030304553, + "learning_rate": 6.514654902357179e-05, + "loss": 2.7687, + "step": 29758 + }, + { + "epoch": 1.385501780850618, + "grad_norm": 0.3173238624422381, + "learning_rate": 6.514396754155829e-05, + "loss": 2.7978, + "step": 29759 + }, + { + "epoch": 1.3855483390367112, + "grad_norm": 0.35655878000532115, + "learning_rate": 6.514138601509876e-05, + "loss": 2.6426, + "step": 29760 + }, + { + "epoch": 1.3855948972228043, + "grad_norm": 0.3200580331473192, + "learning_rate": 6.513880444420075e-05, + "loss": 2.8518, + "step": 29761 + }, + { + "epoch": 1.3856414554088974, + "grad_norm": 0.36707706093608355, + "learning_rate": 6.513622282887182e-05, + "loss": 2.7544, + "step": 29762 + }, + { + "epoch": 1.3856880135949903, + "grad_norm": 0.31158746778491164, + "learning_rate": 6.51336411691196e-05, + "loss": 2.7535, + "step": 29763 + }, + { + "epoch": 1.3857345717810834, + "grad_norm": 0.38884567291282185, + "learning_rate": 6.513105946495161e-05, + "loss": 2.7142, + "step": 29764 + }, + { + "epoch": 1.3857811299671765, + "grad_norm": 0.34499915720970353, + "learning_rate": 6.512847771637544e-05, + "loss": 2.8025, + "step": 29765 + }, + { + "epoch": 1.3858276881532696, + "grad_norm": 0.3687370251653053, + "learning_rate": 6.512589592339871e-05, + "loss": 2.7301, + "step": 29766 + }, + { + "epoch": 1.3858742463393625, + "grad_norm": 0.3218805228056981, + "learning_rate": 6.512331408602895e-05, + "loss": 2.6698, + "step": 29767 + }, + { + "epoch": 1.3859208045254556, + "grad_norm": 0.36261024514724116, + "learning_rate": 6.512073220427377e-05, + "loss": 2.7554, + "step": 29768 + }, + { + "epoch": 1.3859673627115487, + "grad_norm": 0.3426819520536622, + "learning_rate": 6.511815027814072e-05, + "loss": 2.819, + "step": 29769 + }, + { + "epoch": 1.3860139208976419, + "grad_norm": 0.32594398170157357, + "learning_rate": 6.511556830763738e-05, + "loss": 2.7745, + "step": 29770 + }, + { + "epoch": 1.386060479083735, + "grad_norm": 0.35416599970209195, + "learning_rate": 6.511298629277133e-05, + "loss": 2.7707, + "step": 29771 + }, + { + "epoch": 1.386107037269828, + "grad_norm": 0.35235920605120147, + "learning_rate": 6.511040423355018e-05, + "loss": 2.7946, + "step": 29772 + }, + { + "epoch": 1.386153595455921, + "grad_norm": 0.33185395408043084, + "learning_rate": 6.510782212998145e-05, + "loss": 2.7514, + "step": 29773 + }, + { + "epoch": 1.386200153642014, + "grad_norm": 0.35818968517457206, + "learning_rate": 
6.510523998207277e-05, + "loss": 2.6816, + "step": 29774 + }, + { + "epoch": 1.3862467118281072, + "grad_norm": 0.34780873233916065, + "learning_rate": 6.510265778983169e-05, + "loss": 2.7337, + "step": 29775 + }, + { + "epoch": 1.3862932700142003, + "grad_norm": 0.3377851404503798, + "learning_rate": 6.510007555326582e-05, + "loss": 2.7132, + "step": 29776 + }, + { + "epoch": 1.3863398282002932, + "grad_norm": 0.36541991529023926, + "learning_rate": 6.509749327238269e-05, + "loss": 2.708, + "step": 29777 + }, + { + "epoch": 1.3863863863863863, + "grad_norm": 0.3633492088399752, + "learning_rate": 6.509491094718991e-05, + "loss": 2.6632, + "step": 29778 + }, + { + "epoch": 1.3864329445724795, + "grad_norm": 0.3583909865439108, + "learning_rate": 6.509232857769506e-05, + "loss": 2.7371, + "step": 29779 + }, + { + "epoch": 1.3864795027585726, + "grad_norm": 0.3419754022320789, + "learning_rate": 6.50897461639057e-05, + "loss": 2.8305, + "step": 29780 + }, + { + "epoch": 1.3865260609446657, + "grad_norm": 0.34030141355081145, + "learning_rate": 6.508716370582945e-05, + "loss": 2.7234, + "step": 29781 + }, + { + "epoch": 1.3865726191307588, + "grad_norm": 0.34947797787653795, + "learning_rate": 6.508458120347383e-05, + "loss": 2.7743, + "step": 29782 + }, + { + "epoch": 1.3866191773168517, + "grad_norm": 0.3207980053659672, + "learning_rate": 6.508199865684646e-05, + "loss": 2.7897, + "step": 29783 + }, + { + "epoch": 1.3866657355029448, + "grad_norm": 0.35919124061506386, + "learning_rate": 6.507941606595491e-05, + "loss": 2.7347, + "step": 29784 + }, + { + "epoch": 1.386712293689038, + "grad_norm": 0.3397979093098516, + "learning_rate": 6.507683343080677e-05, + "loss": 2.6428, + "step": 29785 + }, + { + "epoch": 1.3867588518751308, + "grad_norm": 0.34865096043169924, + "learning_rate": 6.507425075140961e-05, + "loss": 2.7897, + "step": 29786 + }, + { + "epoch": 1.386805410061224, + "grad_norm": 0.3347740902530637, + "learning_rate": 6.507166802777099e-05, + "loss": 2.8364, + "step": 29787 + }, + { + "epoch": 1.386851968247317, + "grad_norm": 0.3471115529533505, + "learning_rate": 6.506908525989852e-05, + "loss": 2.7205, + "step": 29788 + }, + { + "epoch": 1.3868985264334102, + "grad_norm": 0.3326388630012217, + "learning_rate": 6.506650244779976e-05, + "loss": 2.7579, + "step": 29789 + }, + { + "epoch": 1.3869450846195033, + "grad_norm": 0.32456169282809344, + "learning_rate": 6.506391959148232e-05, + "loss": 2.835, + "step": 29790 + }, + { + "epoch": 1.3869916428055964, + "grad_norm": 0.3189929397743439, + "learning_rate": 6.506133669095373e-05, + "loss": 2.7213, + "step": 29791 + }, + { + "epoch": 1.3870382009916893, + "grad_norm": 0.33549917237720767, + "learning_rate": 6.505875374622163e-05, + "loss": 2.821, + "step": 29792 + }, + { + "epoch": 1.3870847591777824, + "grad_norm": 0.34824818676106434, + "learning_rate": 6.505617075729355e-05, + "loss": 2.811, + "step": 29793 + }, + { + "epoch": 1.3871313173638755, + "grad_norm": 0.3595426255827888, + "learning_rate": 6.505358772417712e-05, + "loss": 2.7242, + "step": 29794 + }, + { + "epoch": 1.3871778755499686, + "grad_norm": 0.3225858034555647, + "learning_rate": 6.505100464687987e-05, + "loss": 2.8105, + "step": 29795 + }, + { + "epoch": 1.3872244337360615, + "grad_norm": 0.3517811350878059, + "learning_rate": 6.50484215254094e-05, + "loss": 2.7319, + "step": 29796 + }, + { + "epoch": 1.3872709919221546, + "grad_norm": 0.3279428443038625, + "learning_rate": 6.504583835977331e-05, + "loss": 2.7928, + "step": 29797 + }, + { + "epoch": 
1.3873175501082478, + "grad_norm": 0.3483224235477951, + "learning_rate": 6.504325514997915e-05, + "loss": 2.7995, + "step": 29798 + }, + { + "epoch": 1.3873641082943409, + "grad_norm": 0.3328561694727535, + "learning_rate": 6.504067189603454e-05, + "loss": 2.7053, + "step": 29799 + }, + { + "epoch": 1.387410666480434, + "grad_norm": 0.3783164271540521, + "learning_rate": 6.503808859794704e-05, + "loss": 2.8706, + "step": 29800 + }, + { + "epoch": 1.387457224666527, + "grad_norm": 0.36823337547130836, + "learning_rate": 6.503550525572421e-05, + "loss": 2.7712, + "step": 29801 + }, + { + "epoch": 1.38750378285262, + "grad_norm": 0.35325446272701916, + "learning_rate": 6.503292186937367e-05, + "loss": 2.7658, + "step": 29802 + }, + { + "epoch": 1.387550341038713, + "grad_norm": 0.3656901746126555, + "learning_rate": 6.503033843890297e-05, + "loss": 2.7613, + "step": 29803 + }, + { + "epoch": 1.3875968992248062, + "grad_norm": 0.3838445956251863, + "learning_rate": 6.502775496431972e-05, + "loss": 2.8284, + "step": 29804 + }, + { + "epoch": 1.3876434574108993, + "grad_norm": 0.35703649800274173, + "learning_rate": 6.502517144563149e-05, + "loss": 2.8626, + "step": 29805 + }, + { + "epoch": 1.3876900155969922, + "grad_norm": 0.36611805353610066, + "learning_rate": 6.502258788284585e-05, + "loss": 2.7735, + "step": 29806 + }, + { + "epoch": 1.3877365737830853, + "grad_norm": 0.32932259906204653, + "learning_rate": 6.50200042759704e-05, + "loss": 2.6471, + "step": 29807 + }, + { + "epoch": 1.3877831319691785, + "grad_norm": 0.34592226838513357, + "learning_rate": 6.501742062501273e-05, + "loss": 2.8108, + "step": 29808 + }, + { + "epoch": 1.3878296901552716, + "grad_norm": 0.34950981247557766, + "learning_rate": 6.501483692998041e-05, + "loss": 2.7569, + "step": 29809 + }, + { + "epoch": 1.3878762483413647, + "grad_norm": 0.36069583643890224, + "learning_rate": 6.5012253190881e-05, + "loss": 2.7999, + "step": 29810 + }, + { + "epoch": 1.3879228065274578, + "grad_norm": 0.3705042371442151, + "learning_rate": 6.500966940772211e-05, + "loss": 2.7205, + "step": 29811 + }, + { + "epoch": 1.3879693647135507, + "grad_norm": 0.37125859700989056, + "learning_rate": 6.500708558051134e-05, + "loss": 2.7443, + "step": 29812 + }, + { + "epoch": 1.3880159228996438, + "grad_norm": 0.3387844340483252, + "learning_rate": 6.500450170925624e-05, + "loss": 2.7003, + "step": 29813 + }, + { + "epoch": 1.388062481085737, + "grad_norm": 0.37335855630736176, + "learning_rate": 6.50019177939644e-05, + "loss": 2.7301, + "step": 29814 + }, + { + "epoch": 1.38810903927183, + "grad_norm": 0.33875548804737765, + "learning_rate": 6.49993338346434e-05, + "loss": 2.7703, + "step": 29815 + }, + { + "epoch": 1.388155597457923, + "grad_norm": 0.33958168980629183, + "learning_rate": 6.499674983130086e-05, + "loss": 2.8266, + "step": 29816 + }, + { + "epoch": 1.388202155644016, + "grad_norm": 0.33737452032814175, + "learning_rate": 6.499416578394433e-05, + "loss": 2.6963, + "step": 29817 + }, + { + "epoch": 1.3882487138301092, + "grad_norm": 0.3598565933267639, + "learning_rate": 6.499158169258139e-05, + "loss": 2.8305, + "step": 29818 + }, + { + "epoch": 1.3882952720162023, + "grad_norm": 0.3291272527012962, + "learning_rate": 6.498899755721964e-05, + "loss": 2.7486, + "step": 29819 + }, + { + "epoch": 1.3883418302022954, + "grad_norm": 0.3183293301003088, + "learning_rate": 6.498641337786666e-05, + "loss": 2.6989, + "step": 29820 + }, + { + "epoch": 1.3883883883883885, + "grad_norm": 0.32724336005448706, + "learning_rate": 
6.498382915453004e-05, + "loss": 2.7616, + "step": 29821 + }, + { + "epoch": 1.3884349465744814, + "grad_norm": 0.35204442126525975, + "learning_rate": 6.498124488721734e-05, + "loss": 2.7669, + "step": 29822 + }, + { + "epoch": 1.3884815047605745, + "grad_norm": 0.318115249401673, + "learning_rate": 6.497866057593617e-05, + "loss": 2.7246, + "step": 29823 + }, + { + "epoch": 1.3885280629466676, + "grad_norm": 0.33894244561585585, + "learning_rate": 6.49760762206941e-05, + "loss": 2.8447, + "step": 29824 + }, + { + "epoch": 1.3885746211327605, + "grad_norm": 0.30782239375265597, + "learning_rate": 6.497349182149873e-05, + "loss": 2.671, + "step": 29825 + }, + { + "epoch": 1.3886211793188536, + "grad_norm": 0.3372939777020337, + "learning_rate": 6.497090737835765e-05, + "loss": 2.7166, + "step": 29826 + }, + { + "epoch": 1.3886677375049468, + "grad_norm": 0.31309261230746743, + "learning_rate": 6.49683228912784e-05, + "loss": 2.7422, + "step": 29827 + }, + { + "epoch": 1.3887142956910399, + "grad_norm": 0.3049133943507981, + "learning_rate": 6.496573836026862e-05, + "loss": 2.8424, + "step": 29828 + }, + { + "epoch": 1.388760853877133, + "grad_norm": 0.323128851766035, + "learning_rate": 6.496315378533587e-05, + "loss": 2.7898, + "step": 29829 + }, + { + "epoch": 1.388807412063226, + "grad_norm": 0.3439772330912178, + "learning_rate": 6.496056916648775e-05, + "loss": 2.7373, + "step": 29830 + }, + { + "epoch": 1.3888539702493192, + "grad_norm": 0.31794650649139466, + "learning_rate": 6.495798450373182e-05, + "loss": 2.786, + "step": 29831 + }, + { + "epoch": 1.3889005284354121, + "grad_norm": 0.3830122445714033, + "learning_rate": 6.495539979707567e-05, + "loss": 2.8589, + "step": 29832 + }, + { + "epoch": 1.3889470866215052, + "grad_norm": 0.3370708491381747, + "learning_rate": 6.495281504652689e-05, + "loss": 2.6897, + "step": 29833 + }, + { + "epoch": 1.3889936448075983, + "grad_norm": 0.3679043919173834, + "learning_rate": 6.495023025209307e-05, + "loss": 2.6703, + "step": 29834 + }, + { + "epoch": 1.3890402029936912, + "grad_norm": 0.31150766669385527, + "learning_rate": 6.494764541378183e-05, + "loss": 2.6923, + "step": 29835 + }, + { + "epoch": 1.3890867611797844, + "grad_norm": 0.35096917984269055, + "learning_rate": 6.49450605316007e-05, + "loss": 2.7416, + "step": 29836 + }, + { + "epoch": 1.3891333193658775, + "grad_norm": 0.32862157352154664, + "learning_rate": 6.49424756055573e-05, + "loss": 2.6504, + "step": 29837 + }, + { + "epoch": 1.3891798775519706, + "grad_norm": 0.34116131780980374, + "learning_rate": 6.49398906356592e-05, + "loss": 2.683, + "step": 29838 + }, + { + "epoch": 1.3892264357380637, + "grad_norm": 0.3331582740301447, + "learning_rate": 6.493730562191398e-05, + "loss": 2.7215, + "step": 29839 + }, + { + "epoch": 1.3892729939241568, + "grad_norm": 0.31068954701136337, + "learning_rate": 6.493472056432926e-05, + "loss": 2.7334, + "step": 29840 + }, + { + "epoch": 1.3893195521102497, + "grad_norm": 0.34441500341808917, + "learning_rate": 6.49321354629126e-05, + "loss": 2.756, + "step": 29841 + }, + { + "epoch": 1.3893661102963428, + "grad_norm": 0.314490231715025, + "learning_rate": 6.492955031767159e-05, + "loss": 2.6577, + "step": 29842 + }, + { + "epoch": 1.389412668482436, + "grad_norm": 0.36070154046773784, + "learning_rate": 6.492696512861382e-05, + "loss": 2.7542, + "step": 29843 + }, + { + "epoch": 1.389459226668529, + "grad_norm": 0.3418253701926167, + "learning_rate": 6.492437989574689e-05, + "loss": 2.7853, + "step": 29844 + }, + { + "epoch": 
1.389505784854622, + "grad_norm": 0.3570911045224813, + "learning_rate": 6.492179461907837e-05, + "loss": 2.6664, + "step": 29845 + }, + { + "epoch": 1.389552343040715, + "grad_norm": 0.3830860226543312, + "learning_rate": 6.491920929861584e-05, + "loss": 2.8485, + "step": 29846 + }, + { + "epoch": 1.3895989012268082, + "grad_norm": 0.423935428707735, + "learning_rate": 6.49166239343669e-05, + "loss": 2.695, + "step": 29847 + }, + { + "epoch": 1.3896454594129013, + "grad_norm": 0.3256153568532445, + "learning_rate": 6.491403852633916e-05, + "loss": 2.7075, + "step": 29848 + }, + { + "epoch": 1.3896920175989944, + "grad_norm": 0.3452027043337535, + "learning_rate": 6.491145307454017e-05, + "loss": 2.7205, + "step": 29849 + }, + { + "epoch": 1.3897385757850875, + "grad_norm": 0.3336596748683501, + "learning_rate": 6.490886757897754e-05, + "loss": 2.8194, + "step": 29850 + }, + { + "epoch": 1.3897851339711804, + "grad_norm": 0.34918657752419996, + "learning_rate": 6.490628203965884e-05, + "loss": 2.7475, + "step": 29851 + }, + { + "epoch": 1.3898316921572735, + "grad_norm": 0.3181675742032034, + "learning_rate": 6.490369645659168e-05, + "loss": 2.7396, + "step": 29852 + }, + { + "epoch": 1.3898782503433667, + "grad_norm": 0.3638300657176117, + "learning_rate": 6.490111082978364e-05, + "loss": 2.763, + "step": 29853 + }, + { + "epoch": 1.3899248085294598, + "grad_norm": 0.3154134917555822, + "learning_rate": 6.48985251592423e-05, + "loss": 2.7427, + "step": 29854 + }, + { + "epoch": 1.3899713667155527, + "grad_norm": 0.36085558472795815, + "learning_rate": 6.489593944497528e-05, + "loss": 2.6406, + "step": 29855 + }, + { + "epoch": 1.3900179249016458, + "grad_norm": 0.33652178712765046, + "learning_rate": 6.489335368699013e-05, + "loss": 2.8573, + "step": 29856 + }, + { + "epoch": 1.390064483087739, + "grad_norm": 0.3752117709245556, + "learning_rate": 6.489076788529445e-05, + "loss": 2.7633, + "step": 29857 + }, + { + "epoch": 1.390111041273832, + "grad_norm": 0.3350044512027879, + "learning_rate": 6.488818203989585e-05, + "loss": 2.7659, + "step": 29858 + }, + { + "epoch": 1.3901575994599251, + "grad_norm": 0.3726511914689241, + "learning_rate": 6.488559615080187e-05, + "loss": 2.8365, + "step": 29859 + }, + { + "epoch": 1.3902041576460182, + "grad_norm": 0.3443032764962149, + "learning_rate": 6.488301021802015e-05, + "loss": 2.7858, + "step": 29860 + }, + { + "epoch": 1.3902507158321111, + "grad_norm": 0.38093635580064933, + "learning_rate": 6.488042424155827e-05, + "loss": 2.7473, + "step": 29861 + }, + { + "epoch": 1.3902972740182042, + "grad_norm": 0.33479572369361826, + "learning_rate": 6.48778382214238e-05, + "loss": 2.7407, + "step": 29862 + }, + { + "epoch": 1.3903438322042974, + "grad_norm": 0.363933395161069, + "learning_rate": 6.487525215762435e-05, + "loss": 2.7898, + "step": 29863 + }, + { + "epoch": 1.3903903903903903, + "grad_norm": 0.33380177001822997, + "learning_rate": 6.487266605016749e-05, + "loss": 2.8177, + "step": 29864 + }, + { + "epoch": 1.3904369485764834, + "grad_norm": 0.36669176187432817, + "learning_rate": 6.487007989906082e-05, + "loss": 2.7988, + "step": 29865 + }, + { + "epoch": 1.3904835067625765, + "grad_norm": 0.3774818255879403, + "learning_rate": 6.486749370431194e-05, + "loss": 2.6345, + "step": 29866 + }, + { + "epoch": 1.3905300649486696, + "grad_norm": 0.3449032022340156, + "learning_rate": 6.486490746592843e-05, + "loss": 2.7156, + "step": 29867 + }, + { + "epoch": 1.3905766231347627, + "grad_norm": 0.350850393083166, + "learning_rate": 
6.486232118391787e-05, + "loss": 2.6891, + "step": 29868 + }, + { + "epoch": 1.3906231813208558, + "grad_norm": 0.3433667747175773, + "learning_rate": 6.485973485828787e-05, + "loss": 2.6931, + "step": 29869 + }, + { + "epoch": 1.390669739506949, + "grad_norm": 0.3367383555102291, + "learning_rate": 6.4857148489046e-05, + "loss": 2.5637, + "step": 29870 + }, + { + "epoch": 1.3907162976930418, + "grad_norm": 0.3539550590479178, + "learning_rate": 6.485456207619989e-05, + "loss": 2.7735, + "step": 29871 + }, + { + "epoch": 1.390762855879135, + "grad_norm": 0.3460429977073846, + "learning_rate": 6.485197561975708e-05, + "loss": 2.7349, + "step": 29872 + }, + { + "epoch": 1.390809414065228, + "grad_norm": 0.3702435487124811, + "learning_rate": 6.484938911972519e-05, + "loss": 2.9214, + "step": 29873 + }, + { + "epoch": 1.390855972251321, + "grad_norm": 0.3436560512840645, + "learning_rate": 6.484680257611179e-05, + "loss": 2.7038, + "step": 29874 + }, + { + "epoch": 1.390902530437414, + "grad_norm": 0.3621688026001379, + "learning_rate": 6.484421598892453e-05, + "loss": 2.8371, + "step": 29875 + }, + { + "epoch": 1.3909490886235072, + "grad_norm": 0.36381306011505715, + "learning_rate": 6.484162935817094e-05, + "loss": 2.775, + "step": 29876 + }, + { + "epoch": 1.3909956468096003, + "grad_norm": 0.34909860383412633, + "learning_rate": 6.483904268385861e-05, + "loss": 2.6613, + "step": 29877 + }, + { + "epoch": 1.3910422049956934, + "grad_norm": 0.3695912000753118, + "learning_rate": 6.483645596599516e-05, + "loss": 2.6191, + "step": 29878 + }, + { + "epoch": 1.3910887631817865, + "grad_norm": 0.32914373569777794, + "learning_rate": 6.483386920458817e-05, + "loss": 2.7781, + "step": 29879 + }, + { + "epoch": 1.3911353213678794, + "grad_norm": 0.36566909254799534, + "learning_rate": 6.483128239964524e-05, + "loss": 2.7599, + "step": 29880 + }, + { + "epoch": 1.3911818795539725, + "grad_norm": 0.35086668373907104, + "learning_rate": 6.482869555117397e-05, + "loss": 2.7467, + "step": 29881 + }, + { + "epoch": 1.3912284377400657, + "grad_norm": 0.3467009126477914, + "learning_rate": 6.482610865918193e-05, + "loss": 2.6795, + "step": 29882 + }, + { + "epoch": 1.3912749959261588, + "grad_norm": 0.37819142538093353, + "learning_rate": 6.482352172367671e-05, + "loss": 2.874, + "step": 29883 + }, + { + "epoch": 1.3913215541122517, + "grad_norm": 0.3356940035823214, + "learning_rate": 6.482093474466595e-05, + "loss": 2.7194, + "step": 29884 + }, + { + "epoch": 1.3913681122983448, + "grad_norm": 0.3871887272438314, + "learning_rate": 6.481834772215718e-05, + "loss": 2.7777, + "step": 29885 + }, + { + "epoch": 1.391414670484438, + "grad_norm": 0.3479466249798362, + "learning_rate": 6.481576065615801e-05, + "loss": 2.7887, + "step": 29886 + }, + { + "epoch": 1.391461228670531, + "grad_norm": 0.3625374528688668, + "learning_rate": 6.481317354667606e-05, + "loss": 2.8466, + "step": 29887 + }, + { + "epoch": 1.3915077868566241, + "grad_norm": 0.3560059133569271, + "learning_rate": 6.481058639371889e-05, + "loss": 2.724, + "step": 29888 + }, + { + "epoch": 1.3915543450427172, + "grad_norm": 0.3759472926384968, + "learning_rate": 6.480799919729412e-05, + "loss": 2.8667, + "step": 29889 + }, + { + "epoch": 1.3916009032288101, + "grad_norm": 0.34326697881293344, + "learning_rate": 6.480541195740932e-05, + "loss": 2.7031, + "step": 29890 + }, + { + "epoch": 1.3916474614149033, + "grad_norm": 0.35584277491964117, + "learning_rate": 6.480282467407211e-05, + "loss": 2.7966, + "step": 29891 + }, + { + "epoch": 
1.3916940196009964, + "grad_norm": 0.34118102514465376, + "learning_rate": 6.480023734729005e-05, + "loss": 2.6221, + "step": 29892 + }, + { + "epoch": 1.3917405777870895, + "grad_norm": 0.3517680897288714, + "learning_rate": 6.479764997707078e-05, + "loss": 2.755, + "step": 29893 + }, + { + "epoch": 1.3917871359731824, + "grad_norm": 0.32673699129279843, + "learning_rate": 6.479506256342184e-05, + "loss": 2.7045, + "step": 29894 + }, + { + "epoch": 1.3918336941592755, + "grad_norm": 0.357201404037083, + "learning_rate": 6.479247510635085e-05, + "loss": 2.7098, + "step": 29895 + }, + { + "epoch": 1.3918802523453686, + "grad_norm": 0.3341102292156228, + "learning_rate": 6.478988760586541e-05, + "loss": 2.6654, + "step": 29896 + }, + { + "epoch": 1.3919268105314617, + "grad_norm": 0.32406365944236737, + "learning_rate": 6.478730006197311e-05, + "loss": 2.9056, + "step": 29897 + }, + { + "epoch": 1.3919733687175548, + "grad_norm": 0.3705086293074479, + "learning_rate": 6.478471247468154e-05, + "loss": 2.6687, + "step": 29898 + }, + { + "epoch": 1.392019926903648, + "grad_norm": 0.33210931167806323, + "learning_rate": 6.478212484399828e-05, + "loss": 2.7481, + "step": 29899 + }, + { + "epoch": 1.3920664850897408, + "grad_norm": 0.37492474079310534, + "learning_rate": 6.477953716993094e-05, + "loss": 2.8226, + "step": 29900 + }, + { + "epoch": 1.392113043275834, + "grad_norm": 0.3523364643980886, + "learning_rate": 6.477694945248714e-05, + "loss": 2.7962, + "step": 29901 + }, + { + "epoch": 1.392159601461927, + "grad_norm": 0.3359310445954139, + "learning_rate": 6.477436169167444e-05, + "loss": 2.7583, + "step": 29902 + }, + { + "epoch": 1.3922061596480202, + "grad_norm": 0.36263371826649377, + "learning_rate": 6.477177388750044e-05, + "loss": 2.6986, + "step": 29903 + }, + { + "epoch": 1.392252717834113, + "grad_norm": 0.3346493993429107, + "learning_rate": 6.476918603997272e-05, + "loss": 2.8464, + "step": 29904 + }, + { + "epoch": 1.3922992760202062, + "grad_norm": 0.3595869312594974, + "learning_rate": 6.476659814909892e-05, + "loss": 2.7151, + "step": 29905 + }, + { + "epoch": 1.3923458342062993, + "grad_norm": 0.337236624076748, + "learning_rate": 6.47640102148866e-05, + "loss": 2.7285, + "step": 29906 + }, + { + "epoch": 1.3923923923923924, + "grad_norm": 0.3629934784784106, + "learning_rate": 6.476142223734337e-05, + "loss": 2.6469, + "step": 29907 + }, + { + "epoch": 1.3924389505784855, + "grad_norm": 0.35767367822003887, + "learning_rate": 6.475883421647683e-05, + "loss": 2.6214, + "step": 29908 + }, + { + "epoch": 1.3924855087645787, + "grad_norm": 0.32459382259136466, + "learning_rate": 6.475624615229454e-05, + "loss": 2.7701, + "step": 29909 + }, + { + "epoch": 1.3925320669506716, + "grad_norm": 0.3690892002607751, + "learning_rate": 6.475365804480413e-05, + "loss": 2.8286, + "step": 29910 + }, + { + "epoch": 1.3925786251367647, + "grad_norm": 0.35488998527004095, + "learning_rate": 6.47510698940132e-05, + "loss": 2.8084, + "step": 29911 + }, + { + "epoch": 1.3926251833228578, + "grad_norm": 0.3063230898841957, + "learning_rate": 6.474848169992932e-05, + "loss": 2.7719, + "step": 29912 + }, + { + "epoch": 1.3926717415089507, + "grad_norm": 0.3492930240454628, + "learning_rate": 6.47458934625601e-05, + "loss": 2.8296, + "step": 29913 + }, + { + "epoch": 1.3927182996950438, + "grad_norm": 0.31546091930341935, + "learning_rate": 6.474330518191313e-05, + "loss": 2.6804, + "step": 29914 + }, + { + "epoch": 1.392764857881137, + "grad_norm": 0.3358019191368642, + "learning_rate": 
6.474071685799603e-05, + "loss": 2.8347, + "step": 29915 + }, + { + "epoch": 1.39281141606723, + "grad_norm": 0.3421268234843618, + "learning_rate": 6.473812849081636e-05, + "loss": 2.7127, + "step": 29916 + }, + { + "epoch": 1.3928579742533231, + "grad_norm": 0.33153370662697346, + "learning_rate": 6.473554008038174e-05, + "loss": 2.7743, + "step": 29917 + }, + { + "epoch": 1.3929045324394163, + "grad_norm": 0.3259811959350787, + "learning_rate": 6.473295162669977e-05, + "loss": 2.8173, + "step": 29918 + }, + { + "epoch": 1.3929510906255091, + "grad_norm": 0.3466490421436185, + "learning_rate": 6.473036312977802e-05, + "loss": 2.8547, + "step": 29919 + }, + { + "epoch": 1.3929976488116023, + "grad_norm": 0.34878926199996657, + "learning_rate": 6.472777458962412e-05, + "loss": 2.7729, + "step": 29920 + }, + { + "epoch": 1.3930442069976954, + "grad_norm": 0.36522336014717005, + "learning_rate": 6.472518600624566e-05, + "loss": 2.831, + "step": 29921 + }, + { + "epoch": 1.3930907651837885, + "grad_norm": 0.3221711782236134, + "learning_rate": 6.472259737965022e-05, + "loss": 2.7667, + "step": 29922 + }, + { + "epoch": 1.3931373233698814, + "grad_norm": 0.37887292840708714, + "learning_rate": 6.47200087098454e-05, + "loss": 2.7179, + "step": 29923 + }, + { + "epoch": 1.3931838815559745, + "grad_norm": 0.32910624140985884, + "learning_rate": 6.471741999683882e-05, + "loss": 2.745, + "step": 29924 + }, + { + "epoch": 1.3932304397420676, + "grad_norm": 0.343122468943179, + "learning_rate": 6.471483124063805e-05, + "loss": 2.755, + "step": 29925 + }, + { + "epoch": 1.3932769979281607, + "grad_norm": 0.33636862035151516, + "learning_rate": 6.471224244125071e-05, + "loss": 2.775, + "step": 29926 + }, + { + "epoch": 1.3933235561142538, + "grad_norm": 0.32453055311685347, + "learning_rate": 6.470965359868438e-05, + "loss": 2.8521, + "step": 29927 + }, + { + "epoch": 1.393370114300347, + "grad_norm": 0.32080698421065296, + "learning_rate": 6.470706471294665e-05, + "loss": 2.7178, + "step": 29928 + }, + { + "epoch": 1.3934166724864399, + "grad_norm": 0.3426694951498541, + "learning_rate": 6.470447578404517e-05, + "loss": 2.8278, + "step": 29929 + }, + { + "epoch": 1.393463230672533, + "grad_norm": 0.3244144538295638, + "learning_rate": 6.470188681198748e-05, + "loss": 2.6841, + "step": 29930 + }, + { + "epoch": 1.393509788858626, + "grad_norm": 0.3456272385367771, + "learning_rate": 6.46992977967812e-05, + "loss": 2.8461, + "step": 29931 + }, + { + "epoch": 1.3935563470447192, + "grad_norm": 0.31768241017356164, + "learning_rate": 6.469670873843394e-05, + "loss": 2.665, + "step": 29932 + }, + { + "epoch": 1.393602905230812, + "grad_norm": 0.3509384725120311, + "learning_rate": 6.469411963695327e-05, + "loss": 2.7556, + "step": 29933 + }, + { + "epoch": 1.3936494634169052, + "grad_norm": 0.33986202434683876, + "learning_rate": 6.469153049234684e-05, + "loss": 2.7508, + "step": 29934 + }, + { + "epoch": 1.3936960216029983, + "grad_norm": 0.3434175817052007, + "learning_rate": 6.468894130462218e-05, + "loss": 2.8568, + "step": 29935 + }, + { + "epoch": 1.3937425797890914, + "grad_norm": 0.39421295618118884, + "learning_rate": 6.468635207378692e-05, + "loss": 2.8276, + "step": 29936 + }, + { + "epoch": 1.3937891379751846, + "grad_norm": 0.5809620860685034, + "learning_rate": 6.46837627998487e-05, + "loss": 2.8172, + "step": 29937 + }, + { + "epoch": 1.3938356961612777, + "grad_norm": 0.40511366724425163, + "learning_rate": 6.468117348281505e-05, + "loss": 2.8186, + "step": 29938 + }, + { + "epoch": 
1.3938822543473706, + "grad_norm": 0.36837745522560766, + "learning_rate": 6.467858412269362e-05, + "loss": 2.7282, + "step": 29939 + }, + { + "epoch": 1.3939288125334637, + "grad_norm": 0.3757985642493906, + "learning_rate": 6.467599471949198e-05, + "loss": 2.7976, + "step": 29940 + }, + { + "epoch": 1.3939753707195568, + "grad_norm": 0.3127822587662957, + "learning_rate": 6.467340527321774e-05, + "loss": 2.6647, + "step": 29941 + }, + { + "epoch": 1.39402192890565, + "grad_norm": 0.34440220366411795, + "learning_rate": 6.467081578387852e-05, + "loss": 2.7621, + "step": 29942 + }, + { + "epoch": 1.3940684870917428, + "grad_norm": 0.3275154863333927, + "learning_rate": 6.46682262514819e-05, + "loss": 2.7559, + "step": 29943 + }, + { + "epoch": 1.394115045277836, + "grad_norm": 0.3461603510395315, + "learning_rate": 6.466563667603547e-05, + "loss": 2.8766, + "step": 29944 + }, + { + "epoch": 1.394161603463929, + "grad_norm": 0.36556441524685857, + "learning_rate": 6.466304705754684e-05, + "loss": 2.7057, + "step": 29945 + }, + { + "epoch": 1.3942081616500221, + "grad_norm": 0.32868611149431914, + "learning_rate": 6.466045739602362e-05, + "loss": 2.726, + "step": 29946 + }, + { + "epoch": 1.3942547198361153, + "grad_norm": 0.33180436545451775, + "learning_rate": 6.46578676914734e-05, + "loss": 2.8538, + "step": 29947 + }, + { + "epoch": 1.3943012780222084, + "grad_norm": 0.3396269632094414, + "learning_rate": 6.465527794390378e-05, + "loss": 2.6204, + "step": 29948 + }, + { + "epoch": 1.3943478362083013, + "grad_norm": 0.3339594925897294, + "learning_rate": 6.465268815332235e-05, + "loss": 2.7751, + "step": 29949 + }, + { + "epoch": 1.3943943943943944, + "grad_norm": 0.3542450756321482, + "learning_rate": 6.465009831973674e-05, + "loss": 2.759, + "step": 29950 + }, + { + "epoch": 1.3944409525804875, + "grad_norm": 0.3271339412197006, + "learning_rate": 6.464750844315453e-05, + "loss": 2.724, + "step": 29951 + }, + { + "epoch": 1.3944875107665804, + "grad_norm": 0.38185269720466086, + "learning_rate": 6.464491852358332e-05, + "loss": 2.6643, + "step": 29952 + }, + { + "epoch": 1.3945340689526735, + "grad_norm": 0.36370058453746035, + "learning_rate": 6.464232856103073e-05, + "loss": 2.8225, + "step": 29953 + }, + { + "epoch": 1.3945806271387666, + "grad_norm": 0.39895437170313225, + "learning_rate": 6.463973855550434e-05, + "loss": 2.8028, + "step": 29954 + }, + { + "epoch": 1.3946271853248597, + "grad_norm": 0.3525499407350031, + "learning_rate": 6.463714850701175e-05, + "loss": 2.7704, + "step": 29955 + }, + { + "epoch": 1.3946737435109529, + "grad_norm": 0.3799987695919597, + "learning_rate": 6.463455841556058e-05, + "loss": 2.7886, + "step": 29956 + }, + { + "epoch": 1.394720301697046, + "grad_norm": 0.3569269590428331, + "learning_rate": 6.463196828115843e-05, + "loss": 2.7605, + "step": 29957 + }, + { + "epoch": 1.394766859883139, + "grad_norm": 0.36503408511830493, + "learning_rate": 6.46293781038129e-05, + "loss": 2.7241, + "step": 29958 + }, + { + "epoch": 1.394813418069232, + "grad_norm": 0.3556326251231624, + "learning_rate": 6.462678788353157e-05, + "loss": 2.8045, + "step": 29959 + }, + { + "epoch": 1.394859976255325, + "grad_norm": 0.37165025213857705, + "learning_rate": 6.462419762032205e-05, + "loss": 2.7651, + "step": 29960 + }, + { + "epoch": 1.3949065344414182, + "grad_norm": 0.3403487640990849, + "learning_rate": 6.462160731419198e-05, + "loss": 2.7297, + "step": 29961 + }, + { + "epoch": 1.394953092627511, + "grad_norm": 0.33732042871456475, + "learning_rate": 
6.461901696514892e-05, + "loss": 2.7333, + "step": 29962 + }, + { + "epoch": 1.3949996508136042, + "grad_norm": 0.35088251466566595, + "learning_rate": 6.461642657320047e-05, + "loss": 2.7127, + "step": 29963 + }, + { + "epoch": 1.3950462089996973, + "grad_norm": 0.3578712610676997, + "learning_rate": 6.461383613835427e-05, + "loss": 2.8023, + "step": 29964 + }, + { + "epoch": 1.3950927671857904, + "grad_norm": 0.335798407932044, + "learning_rate": 6.461124566061789e-05, + "loss": 2.7023, + "step": 29965 + }, + { + "epoch": 1.3951393253718836, + "grad_norm": 0.33169811174209324, + "learning_rate": 6.460865513999895e-05, + "loss": 2.8084, + "step": 29966 + }, + { + "epoch": 1.3951858835579767, + "grad_norm": 0.3608278955052094, + "learning_rate": 6.460606457650503e-05, + "loss": 2.7785, + "step": 29967 + }, + { + "epoch": 1.3952324417440696, + "grad_norm": 0.3258146842347811, + "learning_rate": 6.460347397014376e-05, + "loss": 2.7934, + "step": 29968 + }, + { + "epoch": 1.3952789999301627, + "grad_norm": 0.3625168868123464, + "learning_rate": 6.460088332092274e-05, + "loss": 2.7189, + "step": 29969 + }, + { + "epoch": 1.3953255581162558, + "grad_norm": 0.32535039681437106, + "learning_rate": 6.459829262884956e-05, + "loss": 2.7663, + "step": 29970 + }, + { + "epoch": 1.395372116302349, + "grad_norm": 0.35126444462459927, + "learning_rate": 6.459570189393184e-05, + "loss": 2.694, + "step": 29971 + }, + { + "epoch": 1.3954186744884418, + "grad_norm": 0.36030689177475694, + "learning_rate": 6.459311111617716e-05, + "loss": 2.8671, + "step": 29972 + }, + { + "epoch": 1.395465232674535, + "grad_norm": 0.3662264660178734, + "learning_rate": 6.459052029559313e-05, + "loss": 2.7738, + "step": 29973 + }, + { + "epoch": 1.395511790860628, + "grad_norm": 0.3320328447126796, + "learning_rate": 6.458792943218736e-05, + "loss": 2.7669, + "step": 29974 + }, + { + "epoch": 1.3955583490467212, + "grad_norm": 0.3769340649623985, + "learning_rate": 6.458533852596748e-05, + "loss": 2.6984, + "step": 29975 + }, + { + "epoch": 1.3956049072328143, + "grad_norm": 0.3433496940461951, + "learning_rate": 6.458274757694104e-05, + "loss": 2.8175, + "step": 29976 + }, + { + "epoch": 1.3956514654189074, + "grad_norm": 0.3863965185948799, + "learning_rate": 6.458015658511568e-05, + "loss": 2.7709, + "step": 29977 + }, + { + "epoch": 1.3956980236050003, + "grad_norm": 0.3379548290018621, + "learning_rate": 6.4577565550499e-05, + "loss": 2.7655, + "step": 29978 + }, + { + "epoch": 1.3957445817910934, + "grad_norm": 0.38727409175268696, + "learning_rate": 6.45749744730986e-05, + "loss": 2.7114, + "step": 29979 + }, + { + "epoch": 1.3957911399771865, + "grad_norm": 0.3192952980845071, + "learning_rate": 6.457238335292208e-05, + "loss": 2.8138, + "step": 29980 + }, + { + "epoch": 1.3958376981632796, + "grad_norm": 0.36493013770889343, + "learning_rate": 6.456979218997707e-05, + "loss": 2.7106, + "step": 29981 + }, + { + "epoch": 1.3958842563493725, + "grad_norm": 0.33279770121493624, + "learning_rate": 6.456720098427114e-05, + "loss": 2.8689, + "step": 29982 + }, + { + "epoch": 1.3959308145354656, + "grad_norm": 0.37159730177838063, + "learning_rate": 6.45646097358119e-05, + "loss": 2.8047, + "step": 29983 + }, + { + "epoch": 1.3959773727215588, + "grad_norm": 0.34254638562964523, + "learning_rate": 6.4562018444607e-05, + "loss": 2.7576, + "step": 29984 + }, + { + "epoch": 1.3960239309076519, + "grad_norm": 0.33813676799342535, + "learning_rate": 6.455942711066398e-05, + "loss": 2.7858, + "step": 29985 + }, + { + "epoch": 
1.396070489093745, + "grad_norm": 0.3472557392447753, + "learning_rate": 6.45568357339905e-05, + "loss": 2.6986, + "step": 29986 + }, + { + "epoch": 1.396117047279838, + "grad_norm": 0.3754799215133676, + "learning_rate": 6.455424431459411e-05, + "loss": 2.7538, + "step": 29987 + }, + { + "epoch": 1.396163605465931, + "grad_norm": 0.34651075546418, + "learning_rate": 6.455165285248247e-05, + "loss": 2.6948, + "step": 29988 + }, + { + "epoch": 1.396210163652024, + "grad_norm": 0.37784941356591506, + "learning_rate": 6.454906134766315e-05, + "loss": 2.8401, + "step": 29989 + }, + { + "epoch": 1.3962567218381172, + "grad_norm": 0.3640168726143871, + "learning_rate": 6.454646980014378e-05, + "loss": 2.7763, + "step": 29990 + }, + { + "epoch": 1.3963032800242103, + "grad_norm": 0.35556528668076937, + "learning_rate": 6.454387820993194e-05, + "loss": 2.6699, + "step": 29991 + }, + { + "epoch": 1.3963498382103032, + "grad_norm": 0.38131017852044596, + "learning_rate": 6.454128657703526e-05, + "loss": 2.834, + "step": 29992 + }, + { + "epoch": 1.3963963963963963, + "grad_norm": 0.3935785132885129, + "learning_rate": 6.453869490146134e-05, + "loss": 2.7358, + "step": 29993 + }, + { + "epoch": 1.3964429545824895, + "grad_norm": 0.33506687194329504, + "learning_rate": 6.453610318321777e-05, + "loss": 2.7865, + "step": 29994 + }, + { + "epoch": 1.3964895127685826, + "grad_norm": 0.37603332159009156, + "learning_rate": 6.453351142231219e-05, + "loss": 2.8115, + "step": 29995 + }, + { + "epoch": 1.3965360709546757, + "grad_norm": 0.338466209301207, + "learning_rate": 6.453091961875215e-05, + "loss": 2.6905, + "step": 29996 + }, + { + "epoch": 1.3965826291407688, + "grad_norm": 0.3680555171697984, + "learning_rate": 6.452832777254534e-05, + "loss": 2.7493, + "step": 29997 + }, + { + "epoch": 1.3966291873268617, + "grad_norm": 0.358172794858424, + "learning_rate": 6.452573588369928e-05, + "loss": 2.6929, + "step": 29998 + }, + { + "epoch": 1.3966757455129548, + "grad_norm": 0.3670718417090985, + "learning_rate": 6.452314395222164e-05, + "loss": 2.74, + "step": 29999 + }, + { + "epoch": 1.396722303699048, + "grad_norm": 0.34035949227007367, + "learning_rate": 6.452055197811998e-05, + "loss": 2.6276, + "step": 30000 + }, + { + "epoch": 1.3967688618851408, + "grad_norm": 0.32535119040389754, + "learning_rate": 6.451795996140195e-05, + "loss": 2.8077, + "step": 30001 + }, + { + "epoch": 1.396815420071234, + "grad_norm": 0.32950306627049863, + "learning_rate": 6.451536790207513e-05, + "loss": 2.7507, + "step": 30002 + }, + { + "epoch": 1.396861978257327, + "grad_norm": 0.3279958489747167, + "learning_rate": 6.451277580014714e-05, + "loss": 2.6437, + "step": 30003 + }, + { + "epoch": 1.3969085364434202, + "grad_norm": 0.33240430796416903, + "learning_rate": 6.451018365562557e-05, + "loss": 2.7671, + "step": 30004 + }, + { + "epoch": 1.3969550946295133, + "grad_norm": 0.29443925530936527, + "learning_rate": 6.450759146851804e-05, + "loss": 2.6283, + "step": 30005 + }, + { + "epoch": 1.3970016528156064, + "grad_norm": 0.36437132137325307, + "learning_rate": 6.450499923883219e-05, + "loss": 2.7049, + "step": 30006 + }, + { + "epoch": 1.3970482110016993, + "grad_norm": 0.28184173107895844, + "learning_rate": 6.450240696657557e-05, + "loss": 2.7956, + "step": 30007 + }, + { + "epoch": 1.3970947691877924, + "grad_norm": 0.36015021297681044, + "learning_rate": 6.449981465175582e-05, + "loss": 2.7711, + "step": 30008 + }, + { + "epoch": 1.3971413273738855, + "grad_norm": 0.30576189499734935, + "learning_rate": 
6.449722229438054e-05, + "loss": 2.7906, + "step": 30009 + }, + { + "epoch": 1.3971878855599786, + "grad_norm": 0.3194826371510929, + "learning_rate": 6.449462989445734e-05, + "loss": 2.7439, + "step": 30010 + }, + { + "epoch": 1.3972344437460715, + "grad_norm": 0.3461478094462017, + "learning_rate": 6.449203745199384e-05, + "loss": 2.8452, + "step": 30011 + }, + { + "epoch": 1.3972810019321646, + "grad_norm": 0.34721706840846955, + "learning_rate": 6.448944496699763e-05, + "loss": 2.735, + "step": 30012 + }, + { + "epoch": 1.3973275601182578, + "grad_norm": 0.3393062124029977, + "learning_rate": 6.448685243947633e-05, + "loss": 2.8002, + "step": 30013 + }, + { + "epoch": 1.3973741183043509, + "grad_norm": 0.3519146674079171, + "learning_rate": 6.448425986943752e-05, + "loss": 2.7392, + "step": 30014 + }, + { + "epoch": 1.397420676490444, + "grad_norm": 0.32813520098641275, + "learning_rate": 6.448166725688887e-05, + "loss": 2.8106, + "step": 30015 + }, + { + "epoch": 1.397467234676537, + "grad_norm": 0.3455067310025203, + "learning_rate": 6.447907460183794e-05, + "loss": 2.7688, + "step": 30016 + }, + { + "epoch": 1.39751379286263, + "grad_norm": 0.341045060305314, + "learning_rate": 6.447648190429235e-05, + "loss": 2.6798, + "step": 30017 + }, + { + "epoch": 1.3975603510487231, + "grad_norm": 0.3243512830358712, + "learning_rate": 6.44738891642597e-05, + "loss": 2.721, + "step": 30018 + }, + { + "epoch": 1.3976069092348162, + "grad_norm": 0.3636047070557136, + "learning_rate": 6.447129638174761e-05, + "loss": 2.8234, + "step": 30019 + }, + { + "epoch": 1.3976534674209093, + "grad_norm": 0.3430892524648066, + "learning_rate": 6.446870355676372e-05, + "loss": 2.7293, + "step": 30020 + }, + { + "epoch": 1.3977000256070022, + "grad_norm": 0.33837796525743885, + "learning_rate": 6.446611068931561e-05, + "loss": 2.8281, + "step": 30021 + }, + { + "epoch": 1.3977465837930954, + "grad_norm": 0.39314033188814934, + "learning_rate": 6.446351777941087e-05, + "loss": 2.7844, + "step": 30022 + }, + { + "epoch": 1.3977931419791885, + "grad_norm": 0.3208914404992284, + "learning_rate": 6.446092482705713e-05, + "loss": 2.6484, + "step": 30023 + }, + { + "epoch": 1.3978397001652816, + "grad_norm": 0.406372998963001, + "learning_rate": 6.445833183226201e-05, + "loss": 2.7195, + "step": 30024 + }, + { + "epoch": 1.3978862583513747, + "grad_norm": 0.3503378517884979, + "learning_rate": 6.44557387950331e-05, + "loss": 2.7585, + "step": 30025 + }, + { + "epoch": 1.3979328165374678, + "grad_norm": 0.35169651548971104, + "learning_rate": 6.445314571537802e-05, + "loss": 2.6835, + "step": 30026 + }, + { + "epoch": 1.3979793747235607, + "grad_norm": 0.342240820672556, + "learning_rate": 6.445055259330439e-05, + "loss": 2.8404, + "step": 30027 + }, + { + "epoch": 1.3980259329096538, + "grad_norm": 0.3590093743824234, + "learning_rate": 6.44479594288198e-05, + "loss": 2.7412, + "step": 30028 + }, + { + "epoch": 1.398072491095747, + "grad_norm": 0.3429201990299255, + "learning_rate": 6.444536622193189e-05, + "loss": 2.7535, + "step": 30029 + }, + { + "epoch": 1.39811904928184, + "grad_norm": 0.3478742326788819, + "learning_rate": 6.444277297264824e-05, + "loss": 2.7344, + "step": 30030 + }, + { + "epoch": 1.398165607467933, + "grad_norm": 0.32330180818983534, + "learning_rate": 6.444017968097649e-05, + "loss": 2.6994, + "step": 30031 + }, + { + "epoch": 1.398212165654026, + "grad_norm": 0.3404591740379692, + "learning_rate": 6.443758634692421e-05, + "loss": 2.6978, + "step": 30032 + }, + { + "epoch": 
1.3982587238401192, + "grad_norm": 0.3471913227633492, + "learning_rate": 6.443499297049907e-05, + "loss": 2.7436, + "step": 30033 + }, + { + "epoch": 1.3983052820262123, + "grad_norm": 0.3594210034372788, + "learning_rate": 6.443239955170864e-05, + "loss": 2.8261, + "step": 30034 + }, + { + "epoch": 1.3983518402123054, + "grad_norm": 0.3261873216379757, + "learning_rate": 6.442980609056051e-05, + "loss": 2.7737, + "step": 30035 + }, + { + "epoch": 1.3983983983983985, + "grad_norm": 0.34603381855646653, + "learning_rate": 6.442721258706235e-05, + "loss": 2.7061, + "step": 30036 + }, + { + "epoch": 1.3984449565844914, + "grad_norm": 0.32985301294901437, + "learning_rate": 6.442461904122172e-05, + "loss": 2.814, + "step": 30037 + }, + { + "epoch": 1.3984915147705845, + "grad_norm": 0.34755943168459486, + "learning_rate": 6.442202545304627e-05, + "loss": 2.7115, + "step": 30038 + }, + { + "epoch": 1.3985380729566776, + "grad_norm": 0.330133801736471, + "learning_rate": 6.441943182254359e-05, + "loss": 2.8007, + "step": 30039 + }, + { + "epoch": 1.3985846311427705, + "grad_norm": 0.3273080213640648, + "learning_rate": 6.44168381497213e-05, + "loss": 2.7594, + "step": 30040 + }, + { + "epoch": 1.3986311893288637, + "grad_norm": 0.3108862558302189, + "learning_rate": 6.441424443458702e-05, + "loss": 2.8145, + "step": 30041 + }, + { + "epoch": 1.3986777475149568, + "grad_norm": 0.34082274959786446, + "learning_rate": 6.441165067714834e-05, + "loss": 2.7317, + "step": 30042 + }, + { + "epoch": 1.3987243057010499, + "grad_norm": 0.32391403104945155, + "learning_rate": 6.440905687741288e-05, + "loss": 2.6879, + "step": 30043 + }, + { + "epoch": 1.398770863887143, + "grad_norm": 0.3531177433482551, + "learning_rate": 6.440646303538826e-05, + "loss": 2.6949, + "step": 30044 + }, + { + "epoch": 1.3988174220732361, + "grad_norm": 0.32871394674436555, + "learning_rate": 6.440386915108209e-05, + "loss": 2.7854, + "step": 30045 + }, + { + "epoch": 1.3988639802593292, + "grad_norm": 0.34152689913568857, + "learning_rate": 6.4401275224502e-05, + "loss": 2.8365, + "step": 30046 + }, + { + "epoch": 1.3989105384454221, + "grad_norm": 0.33821124559931415, + "learning_rate": 6.439868125565557e-05, + "loss": 2.7203, + "step": 30047 + }, + { + "epoch": 1.3989570966315152, + "grad_norm": 0.3478646454877812, + "learning_rate": 6.439608724455044e-05, + "loss": 2.7737, + "step": 30048 + }, + { + "epoch": 1.3990036548176084, + "grad_norm": 0.32375552827912857, + "learning_rate": 6.439349319119422e-05, + "loss": 2.7904, + "step": 30049 + }, + { + "epoch": 1.3990502130037012, + "grad_norm": 0.3247912881875331, + "learning_rate": 6.439089909559448e-05, + "loss": 2.7591, + "step": 30050 + }, + { + "epoch": 1.3990967711897944, + "grad_norm": 0.3810676019119703, + "learning_rate": 6.438830495775891e-05, + "loss": 2.8044, + "step": 30051 + }, + { + "epoch": 1.3991433293758875, + "grad_norm": 0.31413350214416974, + "learning_rate": 6.438571077769506e-05, + "loss": 2.7143, + "step": 30052 + }, + { + "epoch": 1.3991898875619806, + "grad_norm": 0.3721776366178493, + "learning_rate": 6.438311655541056e-05, + "loss": 2.7636, + "step": 30053 + }, + { + "epoch": 1.3992364457480737, + "grad_norm": 0.33187877274883865, + "learning_rate": 6.438052229091302e-05, + "loss": 2.5992, + "step": 30054 + }, + { + "epoch": 1.3992830039341668, + "grad_norm": 0.3296321684519479, + "learning_rate": 6.43779279842101e-05, + "loss": 2.7846, + "step": 30055 + }, + { + "epoch": 1.3993295621202597, + "grad_norm": 0.307813719904972, + "learning_rate": 
6.437533363530937e-05, + "loss": 2.8079, + "step": 30056 + }, + { + "epoch": 1.3993761203063528, + "grad_norm": 0.3306455210192152, + "learning_rate": 6.437273924421842e-05, + "loss": 2.7736, + "step": 30057 + }, + { + "epoch": 1.399422678492446, + "grad_norm": 0.34602482835486525, + "learning_rate": 6.437014481094491e-05, + "loss": 2.7493, + "step": 30058 + }, + { + "epoch": 1.399469236678539, + "grad_norm": 0.3299484199637516, + "learning_rate": 6.436755033549647e-05, + "loss": 2.7132, + "step": 30059 + }, + { + "epoch": 1.399515794864632, + "grad_norm": 0.3223852098001541, + "learning_rate": 6.436495581788066e-05, + "loss": 2.7145, + "step": 30060 + }, + { + "epoch": 1.399562353050725, + "grad_norm": 0.3539486258356785, + "learning_rate": 6.436236125810513e-05, + "loss": 2.7193, + "step": 30061 + }, + { + "epoch": 1.3996089112368182, + "grad_norm": 0.332699523437742, + "learning_rate": 6.435976665617748e-05, + "loss": 2.7666, + "step": 30062 + }, + { + "epoch": 1.3996554694229113, + "grad_norm": 0.33315226731059056, + "learning_rate": 6.435717201210533e-05, + "loss": 2.7151, + "step": 30063 + }, + { + "epoch": 1.3997020276090044, + "grad_norm": 0.320768522832949, + "learning_rate": 6.435457732589628e-05, + "loss": 2.8029, + "step": 30064 + }, + { + "epoch": 1.3997485857950975, + "grad_norm": 0.31941710195507667, + "learning_rate": 6.435198259755797e-05, + "loss": 2.7431, + "step": 30065 + }, + { + "epoch": 1.3997951439811904, + "grad_norm": 0.31208888658426087, + "learning_rate": 6.434938782709802e-05, + "loss": 2.8417, + "step": 30066 + }, + { + "epoch": 1.3998417021672835, + "grad_norm": 0.3160317498154388, + "learning_rate": 6.434679301452401e-05, + "loss": 2.7972, + "step": 30067 + }, + { + "epoch": 1.3998882603533767, + "grad_norm": 0.3297761284551774, + "learning_rate": 6.434419815984359e-05, + "loss": 2.7227, + "step": 30068 + }, + { + "epoch": 1.3999348185394698, + "grad_norm": 0.35240580662437687, + "learning_rate": 6.434160326306435e-05, + "loss": 2.7104, + "step": 30069 + }, + { + "epoch": 1.3999813767255627, + "grad_norm": 0.3270613233295277, + "learning_rate": 6.433900832419392e-05, + "loss": 2.7701, + "step": 30070 + }, + { + "epoch": 1.4000279349116558, + "grad_norm": 0.35867508888871596, + "learning_rate": 6.433641334323993e-05, + "loss": 2.728, + "step": 30071 + }, + { + "epoch": 1.400074493097749, + "grad_norm": 0.31502605839309894, + "learning_rate": 6.433381832020996e-05, + "loss": 2.7053, + "step": 30072 + }, + { + "epoch": 1.400121051283842, + "grad_norm": 0.34609985405966914, + "learning_rate": 6.433122325511165e-05, + "loss": 2.7051, + "step": 30073 + }, + { + "epoch": 1.4001676094699351, + "grad_norm": 0.3412833600511892, + "learning_rate": 6.432862814795262e-05, + "loss": 2.8368, + "step": 30074 + }, + { + "epoch": 1.4002141676560282, + "grad_norm": 0.34421311489476836, + "learning_rate": 6.432603299874047e-05, + "loss": 2.7714, + "step": 30075 + }, + { + "epoch": 1.4002607258421211, + "grad_norm": 0.37501989616637793, + "learning_rate": 6.432343780748283e-05, + "loss": 2.8505, + "step": 30076 + }, + { + "epoch": 1.4003072840282142, + "grad_norm": 0.3097379305252338, + "learning_rate": 6.43208425741873e-05, + "loss": 2.8071, + "step": 30077 + }, + { + "epoch": 1.4003538422143074, + "grad_norm": 0.34540996822986797, + "learning_rate": 6.431824729886151e-05, + "loss": 2.7596, + "step": 30078 + }, + { + "epoch": 1.4004004004004005, + "grad_norm": 0.3469935775598596, + "learning_rate": 6.431565198151309e-05, + "loss": 2.8396, + "step": 30079 + }, + { + "epoch": 
1.4004469585864934, + "grad_norm": 0.32287160236196216, + "learning_rate": 6.431305662214962e-05, + "loss": 2.791, + "step": 30080 + }, + { + "epoch": 1.4004935167725865, + "grad_norm": 0.354076164650487, + "learning_rate": 6.431046122077874e-05, + "loss": 2.6817, + "step": 30081 + }, + { + "epoch": 1.4005400749586796, + "grad_norm": 0.3391514554643375, + "learning_rate": 6.430786577740808e-05, + "loss": 2.8526, + "step": 30082 + }, + { + "epoch": 1.4005866331447727, + "grad_norm": 0.3230110755543016, + "learning_rate": 6.430527029204524e-05, + "loss": 2.8304, + "step": 30083 + }, + { + "epoch": 1.4006331913308658, + "grad_norm": 0.3633202293914141, + "learning_rate": 6.430267476469783e-05, + "loss": 2.6761, + "step": 30084 + }, + { + "epoch": 1.400679749516959, + "grad_norm": 0.33769374105631256, + "learning_rate": 6.43000791953735e-05, + "loss": 2.633, + "step": 30085 + }, + { + "epoch": 1.4007263077030518, + "grad_norm": 0.3479871774729496, + "learning_rate": 6.429748358407983e-05, + "loss": 2.7465, + "step": 30086 + }, + { + "epoch": 1.400772865889145, + "grad_norm": 0.34215281756675214, + "learning_rate": 6.429488793082447e-05, + "loss": 2.7719, + "step": 30087 + }, + { + "epoch": 1.400819424075238, + "grad_norm": 0.32993096233910285, + "learning_rate": 6.429229223561501e-05, + "loss": 2.8997, + "step": 30088 + }, + { + "epoch": 1.400865982261331, + "grad_norm": 0.307957058312904, + "learning_rate": 6.428969649845907e-05, + "loss": 2.605, + "step": 30089 + }, + { + "epoch": 1.400912540447424, + "grad_norm": 0.31591343249431975, + "learning_rate": 6.428710071936429e-05, + "loss": 2.7624, + "step": 30090 + }, + { + "epoch": 1.4009590986335172, + "grad_norm": 0.3297484153129109, + "learning_rate": 6.428450489833828e-05, + "loss": 2.7844, + "step": 30091 + }, + { + "epoch": 1.4010056568196103, + "grad_norm": 0.33663374360888726, + "learning_rate": 6.428190903538866e-05, + "loss": 2.82, + "step": 30092 + }, + { + "epoch": 1.4010522150057034, + "grad_norm": 0.34171224011641343, + "learning_rate": 6.427931313052303e-05, + "loss": 2.6952, + "step": 30093 + }, + { + "epoch": 1.4010987731917965, + "grad_norm": 0.32873263172088024, + "learning_rate": 6.427671718374903e-05, + "loss": 2.7961, + "step": 30094 + }, + { + "epoch": 1.4011453313778894, + "grad_norm": 0.34412614315111206, + "learning_rate": 6.427412119507426e-05, + "loss": 2.7503, + "step": 30095 + }, + { + "epoch": 1.4011918895639826, + "grad_norm": 0.3229841823174569, + "learning_rate": 6.427152516450636e-05, + "loss": 2.6826, + "step": 30096 + }, + { + "epoch": 1.4012384477500757, + "grad_norm": 0.3208235397995803, + "learning_rate": 6.426892909205294e-05, + "loss": 2.7143, + "step": 30097 + }, + { + "epoch": 1.4012850059361688, + "grad_norm": 0.3742346350330658, + "learning_rate": 6.426633297772163e-05, + "loss": 2.8093, + "step": 30098 + }, + { + "epoch": 1.4013315641222617, + "grad_norm": 0.35181138242898885, + "learning_rate": 6.426373682152002e-05, + "loss": 2.8232, + "step": 30099 + }, + { + "epoch": 1.4013781223083548, + "grad_norm": 0.36926856204181474, + "learning_rate": 6.426114062345576e-05, + "loss": 2.6728, + "step": 30100 + }, + { + "epoch": 1.401424680494448, + "grad_norm": 0.34152849596259927, + "learning_rate": 6.425854438353645e-05, + "loss": 2.8054, + "step": 30101 + }, + { + "epoch": 1.401471238680541, + "grad_norm": 0.3723444474477748, + "learning_rate": 6.425594810176973e-05, + "loss": 2.7035, + "step": 30102 + }, + { + "epoch": 1.4015177968666341, + "grad_norm": 0.32339447267913485, + "learning_rate": 
6.425335177816318e-05, + "loss": 2.782, + "step": 30103 + }, + { + "epoch": 1.4015643550527273, + "grad_norm": 0.36958505190193147, + "learning_rate": 6.425075541272447e-05, + "loss": 2.906, + "step": 30104 + }, + { + "epoch": 1.4016109132388201, + "grad_norm": 0.3542379131169002, + "learning_rate": 6.42481590054612e-05, + "loss": 2.7266, + "step": 30105 + }, + { + "epoch": 1.4016574714249133, + "grad_norm": 0.35149172231222775, + "learning_rate": 6.424556255638097e-05, + "loss": 2.7664, + "step": 30106 + }, + { + "epoch": 1.4017040296110064, + "grad_norm": 0.35831051683298853, + "learning_rate": 6.424296606549141e-05, + "loss": 2.7658, + "step": 30107 + }, + { + "epoch": 1.4017505877970995, + "grad_norm": 0.329084434631502, + "learning_rate": 6.424036953280017e-05, + "loss": 2.6873, + "step": 30108 + }, + { + "epoch": 1.4017971459831924, + "grad_norm": 0.3747735962980784, + "learning_rate": 6.423777295831484e-05, + "loss": 2.6849, + "step": 30109 + }, + { + "epoch": 1.4018437041692855, + "grad_norm": 0.33455460453073166, + "learning_rate": 6.423517634204304e-05, + "loss": 2.8041, + "step": 30110 + }, + { + "epoch": 1.4018902623553786, + "grad_norm": 0.363010109125109, + "learning_rate": 6.423257968399242e-05, + "loss": 2.7599, + "step": 30111 + }, + { + "epoch": 1.4019368205414717, + "grad_norm": 0.3504604310299589, + "learning_rate": 6.422998298417058e-05, + "loss": 2.7552, + "step": 30112 + }, + { + "epoch": 1.4019833787275648, + "grad_norm": 0.354526753565058, + "learning_rate": 6.422738624258512e-05, + "loss": 2.7796, + "step": 30113 + }, + { + "epoch": 1.402029936913658, + "grad_norm": 0.36685082342591663, + "learning_rate": 6.42247894592437e-05, + "loss": 2.7288, + "step": 30114 + }, + { + "epoch": 1.4020764950997509, + "grad_norm": 0.3575872953765405, + "learning_rate": 6.422219263415392e-05, + "loss": 2.7489, + "step": 30115 + }, + { + "epoch": 1.402123053285844, + "grad_norm": 0.34653156860450335, + "learning_rate": 6.421959576732339e-05, + "loss": 2.7739, + "step": 30116 + }, + { + "epoch": 1.402169611471937, + "grad_norm": 0.37531007977821335, + "learning_rate": 6.421699885875977e-05, + "loss": 2.7495, + "step": 30117 + }, + { + "epoch": 1.4022161696580302, + "grad_norm": 0.34970335875142516, + "learning_rate": 6.421440190847065e-05, + "loss": 2.7687, + "step": 30118 + }, + { + "epoch": 1.402262727844123, + "grad_norm": 0.38833080185654023, + "learning_rate": 6.421180491646365e-05, + "loss": 2.7648, + "step": 30119 + }, + { + "epoch": 1.4023092860302162, + "grad_norm": 0.35024686828958546, + "learning_rate": 6.420920788274643e-05, + "loss": 2.7409, + "step": 30120 + }, + { + "epoch": 1.4023558442163093, + "grad_norm": 0.3705325004945523, + "learning_rate": 6.420661080732657e-05, + "loss": 2.7991, + "step": 30121 + }, + { + "epoch": 1.4024024024024024, + "grad_norm": 0.343018502537183, + "learning_rate": 6.420401369021171e-05, + "loss": 2.8236, + "step": 30122 + }, + { + "epoch": 1.4024489605884956, + "grad_norm": 0.368708227815318, + "learning_rate": 6.420141653140945e-05, + "loss": 2.7708, + "step": 30123 + }, + { + "epoch": 1.4024955187745887, + "grad_norm": 0.3286245360200842, + "learning_rate": 6.419881933092746e-05, + "loss": 2.8084, + "step": 30124 + }, + { + "epoch": 1.4025420769606816, + "grad_norm": 0.35315096887002567, + "learning_rate": 6.419622208877332e-05, + "loss": 2.7155, + "step": 30125 + }, + { + "epoch": 1.4025886351467747, + "grad_norm": 0.34057687994609387, + "learning_rate": 6.419362480495467e-05, + "loss": 2.7492, + "step": 30126 + }, + { + "epoch": 
1.4026351933328678, + "grad_norm": 0.3544841137748121, + "learning_rate": 6.419102747947911e-05, + "loss": 2.7094, + "step": 30127 + }, + { + "epoch": 1.4026817515189607, + "grad_norm": 0.35730903153898547, + "learning_rate": 6.418843011235432e-05, + "loss": 2.7399, + "step": 30128 + }, + { + "epoch": 1.4027283097050538, + "grad_norm": 0.3303199849521615, + "learning_rate": 6.418583270358786e-05, + "loss": 2.7214, + "step": 30129 + }, + { + "epoch": 1.402774867891147, + "grad_norm": 0.3695391389247161, + "learning_rate": 6.418323525318737e-05, + "loss": 2.7641, + "step": 30130 + }, + { + "epoch": 1.40282142607724, + "grad_norm": 0.34503922235083545, + "learning_rate": 6.41806377611605e-05, + "loss": 2.6447, + "step": 30131 + }, + { + "epoch": 1.4028679842633331, + "grad_norm": 0.37045975278892257, + "learning_rate": 6.417804022751486e-05, + "loss": 2.6407, + "step": 30132 + }, + { + "epoch": 1.4029145424494263, + "grad_norm": 0.35671136494039946, + "learning_rate": 6.417544265225806e-05, + "loss": 2.7928, + "step": 30133 + }, + { + "epoch": 1.4029611006355194, + "grad_norm": 0.36649174818608, + "learning_rate": 6.417284503539772e-05, + "loss": 2.7187, + "step": 30134 + }, + { + "epoch": 1.4030076588216123, + "grad_norm": 0.37628768172865923, + "learning_rate": 6.41702473769415e-05, + "loss": 2.6209, + "step": 30135 + }, + { + "epoch": 1.4030542170077054, + "grad_norm": 0.35629226545880816, + "learning_rate": 6.416764967689698e-05, + "loss": 2.8457, + "step": 30136 + }, + { + "epoch": 1.4031007751937985, + "grad_norm": 0.36350087343322235, + "learning_rate": 6.416505193527182e-05, + "loss": 2.6619, + "step": 30137 + }, + { + "epoch": 1.4031473333798914, + "grad_norm": 0.3458675322079772, + "learning_rate": 6.416245415207361e-05, + "loss": 2.7277, + "step": 30138 + }, + { + "epoch": 1.4031938915659845, + "grad_norm": 0.3689241448249146, + "learning_rate": 6.415985632731e-05, + "loss": 2.8105, + "step": 30139 + }, + { + "epoch": 1.4032404497520776, + "grad_norm": 0.3331257174239469, + "learning_rate": 6.41572584609886e-05, + "loss": 2.7081, + "step": 30140 + }, + { + "epoch": 1.4032870079381707, + "grad_norm": 0.37654884422184437, + "learning_rate": 6.415466055311707e-05, + "loss": 2.7048, + "step": 30141 + }, + { + "epoch": 1.4033335661242639, + "grad_norm": 0.33284903884296635, + "learning_rate": 6.415206260370296e-05, + "loss": 2.8042, + "step": 30142 + }, + { + "epoch": 1.403380124310357, + "grad_norm": 0.3510406376835672, + "learning_rate": 6.414946461275398e-05, + "loss": 2.6195, + "step": 30143 + }, + { + "epoch": 1.4034266824964499, + "grad_norm": 0.3296676296067176, + "learning_rate": 6.414686658027769e-05, + "loss": 2.7106, + "step": 30144 + }, + { + "epoch": 1.403473240682543, + "grad_norm": 0.34432566122091346, + "learning_rate": 6.414426850628174e-05, + "loss": 2.7128, + "step": 30145 + }, + { + "epoch": 1.403519798868636, + "grad_norm": 0.3471951606665834, + "learning_rate": 6.414167039077379e-05, + "loss": 2.7283, + "step": 30146 + }, + { + "epoch": 1.4035663570547292, + "grad_norm": 0.3209169547697427, + "learning_rate": 6.41390722337614e-05, + "loss": 2.7814, + "step": 30147 + }, + { + "epoch": 1.403612915240822, + "grad_norm": 0.3553376976045056, + "learning_rate": 6.413647403525224e-05, + "loss": 2.7966, + "step": 30148 + }, + { + "epoch": 1.4036594734269152, + "grad_norm": 0.3267202213509119, + "learning_rate": 6.413387579525392e-05, + "loss": 2.7575, + "step": 30149 + }, + { + "epoch": 1.4037060316130083, + "grad_norm": 0.35056481592273225, + "learning_rate": 
6.413127751377406e-05, + "loss": 2.6785, + "step": 30150 + }, + { + "epoch": 1.4037525897991014, + "grad_norm": 0.8695575378950227, + "learning_rate": 6.412867919082029e-05, + "loss": 2.8336, + "step": 30151 + }, + { + "epoch": 1.4037991479851946, + "grad_norm": 0.5105441668157694, + "learning_rate": 6.412608082640024e-05, + "loss": 2.6827, + "step": 30152 + }, + { + "epoch": 1.4038457061712877, + "grad_norm": 0.4239284606055597, + "learning_rate": 6.412348242052154e-05, + "loss": 2.7436, + "step": 30153 + }, + { + "epoch": 1.4038922643573806, + "grad_norm": 0.4136712399954021, + "learning_rate": 6.412088397319181e-05, + "loss": 2.7495, + "step": 30154 + }, + { + "epoch": 1.4039388225434737, + "grad_norm": 0.463765999828474, + "learning_rate": 6.411828548441867e-05, + "loss": 2.7546, + "step": 30155 + }, + { + "epoch": 1.4039853807295668, + "grad_norm": 0.43811569058724903, + "learning_rate": 6.411568695420975e-05, + "loss": 2.7792, + "step": 30156 + }, + { + "epoch": 1.40403193891566, + "grad_norm": 0.40968541934610275, + "learning_rate": 6.411308838257268e-05, + "loss": 2.8321, + "step": 30157 + }, + { + "epoch": 1.4040784971017528, + "grad_norm": 0.41685787092556514, + "learning_rate": 6.411048976951509e-05, + "loss": 2.7342, + "step": 30158 + }, + { + "epoch": 1.404125055287846, + "grad_norm": 0.43854868349898374, + "learning_rate": 6.410789111504462e-05, + "loss": 2.7416, + "step": 30159 + }, + { + "epoch": 1.404171613473939, + "grad_norm": 0.359621496561754, + "learning_rate": 6.410529241916886e-05, + "loss": 2.7687, + "step": 30160 + }, + { + "epoch": 1.4042181716600322, + "grad_norm": 0.41937003610295476, + "learning_rate": 6.410269368189547e-05, + "loss": 2.8448, + "step": 30161 + }, + { + "epoch": 1.4042647298461253, + "grad_norm": 0.3674358129132161, + "learning_rate": 6.410009490323206e-05, + "loss": 2.8655, + "step": 30162 + }, + { + "epoch": 1.4043112880322184, + "grad_norm": 0.3897699160196769, + "learning_rate": 6.409749608318624e-05, + "loss": 2.7707, + "step": 30163 + }, + { + "epoch": 1.4043578462183113, + "grad_norm": 0.3263793167317789, + "learning_rate": 6.409489722176569e-05, + "loss": 2.7422, + "step": 30164 + }, + { + "epoch": 1.4044044044044044, + "grad_norm": 0.3360506919028654, + "learning_rate": 6.409229831897797e-05, + "loss": 2.7903, + "step": 30165 + }, + { + "epoch": 1.4044509625904975, + "grad_norm": 0.35219181668663724, + "learning_rate": 6.408969937483076e-05, + "loss": 2.8694, + "step": 30166 + }, + { + "epoch": 1.4044975207765906, + "grad_norm": 0.35019109066240506, + "learning_rate": 6.408710038933168e-05, + "loss": 2.8142, + "step": 30167 + }, + { + "epoch": 1.4045440789626835, + "grad_norm": 0.37489878697497125, + "learning_rate": 6.408450136248834e-05, + "loss": 2.7717, + "step": 30168 + }, + { + "epoch": 1.4045906371487766, + "grad_norm": 0.3771171927587933, + "learning_rate": 6.408190229430837e-05, + "loss": 2.8251, + "step": 30169 + }, + { + "epoch": 1.4046371953348697, + "grad_norm": 0.351595148300282, + "learning_rate": 6.40793031847994e-05, + "loss": 2.8285, + "step": 30170 + }, + { + "epoch": 1.4046837535209629, + "grad_norm": 0.35591718850084725, + "learning_rate": 6.407670403396906e-05, + "loss": 2.8143, + "step": 30171 + }, + { + "epoch": 1.404730311707056, + "grad_norm": 0.35510232051210644, + "learning_rate": 6.4074104841825e-05, + "loss": 2.7338, + "step": 30172 + }, + { + "epoch": 1.404776869893149, + "grad_norm": 0.3354128003597446, + "learning_rate": 6.407150560837483e-05, + "loss": 2.83, + "step": 30173 + }, + { + "epoch": 
1.404823428079242, + "grad_norm": 0.3291744216589957, + "learning_rate": 6.406890633362617e-05, + "loss": 2.7041, + "step": 30174 + }, + { + "epoch": 1.404869986265335, + "grad_norm": 0.331135098083598, + "learning_rate": 6.406630701758665e-05, + "loss": 2.7152, + "step": 30175 + }, + { + "epoch": 1.4049165444514282, + "grad_norm": 0.3605795018221197, + "learning_rate": 6.406370766026391e-05, + "loss": 2.782, + "step": 30176 + }, + { + "epoch": 1.404963102637521, + "grad_norm": 0.30778375728535035, + "learning_rate": 6.406110826166558e-05, + "loss": 2.7294, + "step": 30177 + }, + { + "epoch": 1.4050096608236142, + "grad_norm": 0.33133068938115895, + "learning_rate": 6.405850882179929e-05, + "loss": 2.7571, + "step": 30178 + }, + { + "epoch": 1.4050562190097073, + "grad_norm": 0.3442201142199863, + "learning_rate": 6.405590934067262e-05, + "loss": 2.7061, + "step": 30179 + }, + { + "epoch": 1.4051027771958005, + "grad_norm": 0.31297420055564606, + "learning_rate": 6.405330981829327e-05, + "loss": 2.7652, + "step": 30180 + }, + { + "epoch": 1.4051493353818936, + "grad_norm": 0.3401795997737381, + "learning_rate": 6.405071025466884e-05, + "loss": 2.7764, + "step": 30181 + }, + { + "epoch": 1.4051958935679867, + "grad_norm": 0.3370208064527791, + "learning_rate": 6.404811064980696e-05, + "loss": 2.6392, + "step": 30182 + }, + { + "epoch": 1.4052424517540796, + "grad_norm": 0.3307565952945082, + "learning_rate": 6.404551100371525e-05, + "loss": 2.8927, + "step": 30183 + }, + { + "epoch": 1.4052890099401727, + "grad_norm": 0.3200563583590875, + "learning_rate": 6.404291131640135e-05, + "loss": 2.6831, + "step": 30184 + }, + { + "epoch": 1.4053355681262658, + "grad_norm": 0.3202494572367041, + "learning_rate": 6.40403115878729e-05, + "loss": 2.6801, + "step": 30185 + }, + { + "epoch": 1.405382126312359, + "grad_norm": 0.3383736919635006, + "learning_rate": 6.403771181813751e-05, + "loss": 2.7769, + "step": 30186 + }, + { + "epoch": 1.4054286844984518, + "grad_norm": 0.3260875829771529, + "learning_rate": 6.403511200720284e-05, + "loss": 2.664, + "step": 30187 + }, + { + "epoch": 1.405475242684545, + "grad_norm": 0.3194507912075758, + "learning_rate": 6.403251215507647e-05, + "loss": 2.7668, + "step": 30188 + }, + { + "epoch": 1.405521800870638, + "grad_norm": 0.3250023921190284, + "learning_rate": 6.402991226176607e-05, + "loss": 2.6393, + "step": 30189 + }, + { + "epoch": 1.4055683590567312, + "grad_norm": 0.325927636590695, + "learning_rate": 6.402731232727925e-05, + "loss": 2.6732, + "step": 30190 + }, + { + "epoch": 1.4056149172428243, + "grad_norm": 0.320029584453591, + "learning_rate": 6.402471235162367e-05, + "loss": 2.8113, + "step": 30191 + }, + { + "epoch": 1.4056614754289174, + "grad_norm": 0.3252346410492232, + "learning_rate": 6.402211233480692e-05, + "loss": 2.7617, + "step": 30192 + }, + { + "epoch": 1.4057080336150103, + "grad_norm": 0.3232642896399949, + "learning_rate": 6.401951227683667e-05, + "loss": 2.7409, + "step": 30193 + }, + { + "epoch": 1.4057545918011034, + "grad_norm": 0.3309623217166064, + "learning_rate": 6.401691217772053e-05, + "loss": 2.7558, + "step": 30194 + }, + { + "epoch": 1.4058011499871965, + "grad_norm": 0.3130527438637143, + "learning_rate": 6.401431203746612e-05, + "loss": 2.7162, + "step": 30195 + }, + { + "epoch": 1.4058477081732896, + "grad_norm": 0.3487588980731625, + "learning_rate": 6.401171185608109e-05, + "loss": 2.7062, + "step": 30196 + }, + { + "epoch": 1.4058942663593825, + "grad_norm": 0.30848671726569094, + "learning_rate": 
6.400911163357308e-05, + "loss": 2.6608, + "step": 30197 + }, + { + "epoch": 1.4059408245454756, + "grad_norm": 0.35035472858122696, + "learning_rate": 6.400651136994968e-05, + "loss": 2.8688, + "step": 30198 + }, + { + "epoch": 1.4059873827315688, + "grad_norm": 0.3298042595585951, + "learning_rate": 6.400391106521858e-05, + "loss": 2.7396, + "step": 30199 + }, + { + "epoch": 1.4060339409176619, + "grad_norm": 0.3631016530214438, + "learning_rate": 6.400131071938738e-05, + "loss": 2.7736, + "step": 30200 + }, + { + "epoch": 1.406080499103755, + "grad_norm": 0.3361737369005882, + "learning_rate": 6.399871033246369e-05, + "loss": 2.7927, + "step": 30201 + }, + { + "epoch": 1.406127057289848, + "grad_norm": 0.3453372315087436, + "learning_rate": 6.399610990445517e-05, + "loss": 2.7111, + "step": 30202 + }, + { + "epoch": 1.406173615475941, + "grad_norm": 0.3628452139223148, + "learning_rate": 6.399350943536944e-05, + "loss": 2.646, + "step": 30203 + }, + { + "epoch": 1.4062201736620341, + "grad_norm": 0.329631023757735, + "learning_rate": 6.399090892521415e-05, + "loss": 2.682, + "step": 30204 + }, + { + "epoch": 1.4062667318481272, + "grad_norm": 0.34673722976705623, + "learning_rate": 6.398830837399692e-05, + "loss": 2.8327, + "step": 30205 + }, + { + "epoch": 1.4063132900342203, + "grad_norm": 0.3674611633352948, + "learning_rate": 6.398570778172538e-05, + "loss": 2.7246, + "step": 30206 + }, + { + "epoch": 1.4063598482203132, + "grad_norm": 0.33873352864688555, + "learning_rate": 6.398310714840716e-05, + "loss": 2.7325, + "step": 30207 + }, + { + "epoch": 1.4064064064064064, + "grad_norm": 0.34764874365535026, + "learning_rate": 6.398050647404991e-05, + "loss": 2.6802, + "step": 30208 + }, + { + "epoch": 1.4064529645924995, + "grad_norm": 0.39150157152237997, + "learning_rate": 6.397790575866124e-05, + "loss": 2.8812, + "step": 30209 + }, + { + "epoch": 1.4064995227785926, + "grad_norm": 0.34518124756427687, + "learning_rate": 6.397530500224881e-05, + "loss": 2.7277, + "step": 30210 + }, + { + "epoch": 1.4065460809646857, + "grad_norm": 0.3677872529236488, + "learning_rate": 6.397270420482022e-05, + "loss": 2.7458, + "step": 30211 + }, + { + "epoch": 1.4065926391507788, + "grad_norm": 0.3807134647887145, + "learning_rate": 6.397010336638313e-05, + "loss": 2.8224, + "step": 30212 + }, + { + "epoch": 1.4066391973368717, + "grad_norm": 0.36133910370771305, + "learning_rate": 6.396750248694517e-05, + "loss": 2.716, + "step": 30213 + }, + { + "epoch": 1.4066857555229648, + "grad_norm": 0.3809573483617206, + "learning_rate": 6.396490156651395e-05, + "loss": 2.9037, + "step": 30214 + }, + { + "epoch": 1.406732313709058, + "grad_norm": 0.371353384410366, + "learning_rate": 6.396230060509711e-05, + "loss": 2.7913, + "step": 30215 + }, + { + "epoch": 1.4067788718951508, + "grad_norm": 0.4085475640206733, + "learning_rate": 6.395969960270231e-05, + "loss": 2.8387, + "step": 30216 + }, + { + "epoch": 1.406825430081244, + "grad_norm": 0.3599914006526336, + "learning_rate": 6.395709855933718e-05, + "loss": 2.6503, + "step": 30217 + }, + { + "epoch": 1.406871988267337, + "grad_norm": 0.37788547005782863, + "learning_rate": 6.395449747500933e-05, + "loss": 2.7297, + "step": 30218 + }, + { + "epoch": 1.4069185464534302, + "grad_norm": 0.35125112173814255, + "learning_rate": 6.395189634972639e-05, + "loss": 2.8789, + "step": 30219 + }, + { + "epoch": 1.4069651046395233, + "grad_norm": 0.3896077198164858, + "learning_rate": 6.394929518349602e-05, + "loss": 2.7562, + "step": 30220 + }, + { + "epoch": 
1.4070116628256164, + "grad_norm": 0.36119214940050387, + "learning_rate": 6.394669397632583e-05, + "loss": 2.6528, + "step": 30221 + }, + { + "epoch": 1.4070582210117095, + "grad_norm": 0.34182795173430774, + "learning_rate": 6.394409272822348e-05, + "loss": 2.7237, + "step": 30222 + }, + { + "epoch": 1.4071047791978024, + "grad_norm": 0.3595067712961275, + "learning_rate": 6.394149143919659e-05, + "loss": 2.7052, + "step": 30223 + }, + { + "epoch": 1.4071513373838955, + "grad_norm": 0.35460925825560075, + "learning_rate": 6.393889010925281e-05, + "loss": 2.7775, + "step": 30224 + }, + { + "epoch": 1.4071978955699886, + "grad_norm": 0.3515423465311689, + "learning_rate": 6.393628873839975e-05, + "loss": 2.7362, + "step": 30225 + }, + { + "epoch": 1.4072444537560815, + "grad_norm": 0.35242907505178495, + "learning_rate": 6.393368732664505e-05, + "loss": 2.7995, + "step": 30226 + }, + { + "epoch": 1.4072910119421747, + "grad_norm": 0.37386866259634266, + "learning_rate": 6.393108587399636e-05, + "loss": 2.8063, + "step": 30227 + }, + { + "epoch": 1.4073375701282678, + "grad_norm": 0.36656167850048227, + "learning_rate": 6.392848438046128e-05, + "loss": 2.7463, + "step": 30228 + }, + { + "epoch": 1.4073841283143609, + "grad_norm": 0.34241302499995785, + "learning_rate": 6.39258828460475e-05, + "loss": 2.6707, + "step": 30229 + }, + { + "epoch": 1.407430686500454, + "grad_norm": 0.35116632254102187, + "learning_rate": 6.39232812707626e-05, + "loss": 2.8375, + "step": 30230 + }, + { + "epoch": 1.4074772446865471, + "grad_norm": 0.3263348989646203, + "learning_rate": 6.392067965461427e-05, + "loss": 2.7494, + "step": 30231 + }, + { + "epoch": 1.40752380287264, + "grad_norm": 0.35102855972833985, + "learning_rate": 6.391807799761009e-05, + "loss": 2.6787, + "step": 30232 + }, + { + "epoch": 1.4075703610587331, + "grad_norm": 0.34330226085957605, + "learning_rate": 6.391547629975773e-05, + "loss": 2.845, + "step": 30233 + }, + { + "epoch": 1.4076169192448262, + "grad_norm": 0.37372189040860965, + "learning_rate": 6.391287456106482e-05, + "loss": 2.7938, + "step": 30234 + }, + { + "epoch": 1.4076634774309194, + "grad_norm": 0.3736327809671282, + "learning_rate": 6.3910272781539e-05, + "loss": 2.8225, + "step": 30235 + }, + { + "epoch": 1.4077100356170122, + "grad_norm": 0.377693941645937, + "learning_rate": 6.39076709611879e-05, + "loss": 2.7653, + "step": 30236 + }, + { + "epoch": 1.4077565938031054, + "grad_norm": 0.33868993414391957, + "learning_rate": 6.390506910001916e-05, + "loss": 2.746, + "step": 30237 + }, + { + "epoch": 1.4078031519891985, + "grad_norm": 0.3769192623521441, + "learning_rate": 6.39024671980404e-05, + "loss": 2.7452, + "step": 30238 + }, + { + "epoch": 1.4078497101752916, + "grad_norm": 0.3397220560127389, + "learning_rate": 6.389986525525927e-05, + "loss": 2.6952, + "step": 30239 + }, + { + "epoch": 1.4078962683613847, + "grad_norm": 0.3343571483178454, + "learning_rate": 6.389726327168341e-05, + "loss": 2.7399, + "step": 30240 + }, + { + "epoch": 1.4079428265474778, + "grad_norm": 0.33618485813999816, + "learning_rate": 6.389466124732046e-05, + "loss": 2.7298, + "step": 30241 + }, + { + "epoch": 1.4079893847335707, + "grad_norm": 0.35222012943518877, + "learning_rate": 6.389205918217803e-05, + "loss": 2.7238, + "step": 30242 + }, + { + "epoch": 1.4080359429196638, + "grad_norm": 0.33841873595955485, + "learning_rate": 6.388945707626378e-05, + "loss": 2.6522, + "step": 30243 + }, + { + "epoch": 1.408082501105757, + "grad_norm": 0.3285706484501208, + "learning_rate": 
6.388685492958534e-05, + "loss": 2.7818, + "step": 30244 + }, + { + "epoch": 1.40812905929185, + "grad_norm": 0.3637231097249868, + "learning_rate": 6.388425274215036e-05, + "loss": 2.7643, + "step": 30245 + }, + { + "epoch": 1.408175617477943, + "grad_norm": 0.3277781979694778, + "learning_rate": 6.388165051396644e-05, + "loss": 2.7536, + "step": 30246 + }, + { + "epoch": 1.408222175664036, + "grad_norm": 0.33000495928954715, + "learning_rate": 6.387904824504128e-05, + "loss": 2.736, + "step": 30247 + }, + { + "epoch": 1.4082687338501292, + "grad_norm": 0.31657946473745946, + "learning_rate": 6.387644593538245e-05, + "loss": 2.846, + "step": 30248 + }, + { + "epoch": 1.4083152920362223, + "grad_norm": 0.3407204655434336, + "learning_rate": 6.387384358499764e-05, + "loss": 2.7335, + "step": 30249 + }, + { + "epoch": 1.4083618502223154, + "grad_norm": 0.31695467689901713, + "learning_rate": 6.387124119389445e-05, + "loss": 2.7442, + "step": 30250 + }, + { + "epoch": 1.4084084084084085, + "grad_norm": 0.3304499611897924, + "learning_rate": 6.386863876208056e-05, + "loss": 2.8051, + "step": 30251 + }, + { + "epoch": 1.4084549665945014, + "grad_norm": 0.33110161389420034, + "learning_rate": 6.386603628956355e-05, + "loss": 2.6662, + "step": 30252 + }, + { + "epoch": 1.4085015247805945, + "grad_norm": 0.36478606314500744, + "learning_rate": 6.38634337763511e-05, + "loss": 2.801, + "step": 30253 + }, + { + "epoch": 1.4085480829666877, + "grad_norm": 0.32995999153271516, + "learning_rate": 6.386083122245084e-05, + "loss": 2.7789, + "step": 30254 + }, + { + "epoch": 1.4085946411527808, + "grad_norm": 0.3326999028966097, + "learning_rate": 6.38582286278704e-05, + "loss": 2.8703, + "step": 30255 + }, + { + "epoch": 1.4086411993388737, + "grad_norm": 0.3627260984391894, + "learning_rate": 6.385562599261743e-05, + "loss": 2.7758, + "step": 30256 + }, + { + "epoch": 1.4086877575249668, + "grad_norm": 0.33927869604467487, + "learning_rate": 6.385302331669955e-05, + "loss": 2.7236, + "step": 30257 + }, + { + "epoch": 1.40873431571106, + "grad_norm": 0.38315056003631637, + "learning_rate": 6.385042060012444e-05, + "loss": 2.8007, + "step": 30258 + }, + { + "epoch": 1.408780873897153, + "grad_norm": 0.33061592383375893, + "learning_rate": 6.38478178428997e-05, + "loss": 2.7557, + "step": 30259 + }, + { + "epoch": 1.4088274320832461, + "grad_norm": 0.3630597198476818, + "learning_rate": 6.384521504503295e-05, + "loss": 2.7357, + "step": 30260 + }, + { + "epoch": 1.4088739902693392, + "grad_norm": 0.3460525634732163, + "learning_rate": 6.384261220653189e-05, + "loss": 2.7671, + "step": 30261 + }, + { + "epoch": 1.4089205484554321, + "grad_norm": 0.3791540791975215, + "learning_rate": 6.384000932740412e-05, + "loss": 2.7946, + "step": 30262 + }, + { + "epoch": 1.4089671066415252, + "grad_norm": 0.32582973781286945, + "learning_rate": 6.383740640765728e-05, + "loss": 2.8229, + "step": 30263 + }, + { + "epoch": 1.4090136648276184, + "grad_norm": 0.3586542443046478, + "learning_rate": 6.383480344729903e-05, + "loss": 2.6801, + "step": 30264 + }, + { + "epoch": 1.4090602230137113, + "grad_norm": 0.35376933626155216, + "learning_rate": 6.383220044633697e-05, + "loss": 2.7511, + "step": 30265 + }, + { + "epoch": 1.4091067811998044, + "grad_norm": 0.355224482646107, + "learning_rate": 6.382959740477878e-05, + "loss": 2.8138, + "step": 30266 + }, + { + "epoch": 1.4091533393858975, + "grad_norm": 0.3277372532537404, + "learning_rate": 6.382699432263208e-05, + "loss": 2.8049, + "step": 30267 + }, + { + "epoch": 
1.4091998975719906, + "grad_norm": 0.34255227198335403, + "learning_rate": 6.382439119990452e-05, + "loss": 2.8643, + "step": 30268 + }, + { + "epoch": 1.4092464557580837, + "grad_norm": 0.364411163763562, + "learning_rate": 6.382178803660373e-05, + "loss": 2.6261, + "step": 30269 + }, + { + "epoch": 1.4092930139441768, + "grad_norm": 0.3565286976449799, + "learning_rate": 6.381918483273734e-05, + "loss": 2.7177, + "step": 30270 + }, + { + "epoch": 1.4093395721302697, + "grad_norm": 0.344643404746427, + "learning_rate": 6.381658158831302e-05, + "loss": 2.7647, + "step": 30271 + }, + { + "epoch": 1.4093861303163628, + "grad_norm": 0.35092412049148536, + "learning_rate": 6.38139783033384e-05, + "loss": 2.729, + "step": 30272 + }, + { + "epoch": 1.409432688502456, + "grad_norm": 0.323178887134211, + "learning_rate": 6.381137497782111e-05, + "loss": 2.6189, + "step": 30273 + }, + { + "epoch": 1.409479246688549, + "grad_norm": 0.32489923771012175, + "learning_rate": 6.380877161176878e-05, + "loss": 2.7568, + "step": 30274 + }, + { + "epoch": 1.409525804874642, + "grad_norm": 0.3180952905094107, + "learning_rate": 6.380616820518908e-05, + "loss": 2.7403, + "step": 30275 + }, + { + "epoch": 1.409572363060735, + "grad_norm": 0.3456845125153037, + "learning_rate": 6.380356475808964e-05, + "loss": 2.8355, + "step": 30276 + }, + { + "epoch": 1.4096189212468282, + "grad_norm": 0.38123923338278554, + "learning_rate": 6.380096127047809e-05, + "loss": 2.7683, + "step": 30277 + }, + { + "epoch": 1.4096654794329213, + "grad_norm": 0.3370848616819132, + "learning_rate": 6.379835774236208e-05, + "loss": 2.6842, + "step": 30278 + }, + { + "epoch": 1.4097120376190144, + "grad_norm": 0.3467836290955997, + "learning_rate": 6.379575417374924e-05, + "loss": 2.7865, + "step": 30279 + }, + { + "epoch": 1.4097585958051075, + "grad_norm": 0.34202469332943986, + "learning_rate": 6.379315056464723e-05, + "loss": 2.7821, + "step": 30280 + }, + { + "epoch": 1.4098051539912004, + "grad_norm": 0.3092192292159449, + "learning_rate": 6.379054691506369e-05, + "loss": 2.7636, + "step": 30281 + }, + { + "epoch": 1.4098517121772935, + "grad_norm": 0.348403755289127, + "learning_rate": 6.378794322500623e-05, + "loss": 2.7517, + "step": 30282 + }, + { + "epoch": 1.4098982703633867, + "grad_norm": 0.324247937651746, + "learning_rate": 6.378533949448253e-05, + "loss": 2.7213, + "step": 30283 + }, + { + "epoch": 1.4099448285494798, + "grad_norm": 0.3595001639225778, + "learning_rate": 6.378273572350022e-05, + "loss": 2.7848, + "step": 30284 + }, + { + "epoch": 1.4099913867355727, + "grad_norm": 0.31718617631916196, + "learning_rate": 6.378013191206692e-05, + "loss": 2.7831, + "step": 30285 + }, + { + "epoch": 1.4100379449216658, + "grad_norm": 0.351082486358494, + "learning_rate": 6.377752806019031e-05, + "loss": 2.8305, + "step": 30286 + }, + { + "epoch": 1.410084503107759, + "grad_norm": 0.3815235598780718, + "learning_rate": 6.3774924167878e-05, + "loss": 2.7799, + "step": 30287 + }, + { + "epoch": 1.410131061293852, + "grad_norm": 0.31133866008263256, + "learning_rate": 6.377232023513767e-05, + "loss": 2.7373, + "step": 30288 + }, + { + "epoch": 1.4101776194799451, + "grad_norm": 0.3687144338182243, + "learning_rate": 6.376971626197691e-05, + "loss": 2.7455, + "step": 30289 + }, + { + "epoch": 1.4102241776660382, + "grad_norm": 0.3116005887065366, + "learning_rate": 6.376711224840342e-05, + "loss": 2.7048, + "step": 30290 + }, + { + "epoch": 1.4102707358521311, + "grad_norm": 0.3220158215974059, + "learning_rate": 
6.376450819442478e-05, + "loss": 2.799, + "step": 30291 + }, + { + "epoch": 1.4103172940382243, + "grad_norm": 0.3390665932435265, + "learning_rate": 6.376190410004867e-05, + "loss": 2.6455, + "step": 30292 + }, + { + "epoch": 1.4103638522243174, + "grad_norm": 0.32576572096884326, + "learning_rate": 6.375929996528273e-05, + "loss": 2.8098, + "step": 30293 + }, + { + "epoch": 1.4104104104104105, + "grad_norm": 0.35532292200834986, + "learning_rate": 6.37566957901346e-05, + "loss": 2.7822, + "step": 30294 + }, + { + "epoch": 1.4104569685965034, + "grad_norm": 0.3542359954070713, + "learning_rate": 6.375409157461192e-05, + "loss": 2.7088, + "step": 30295 + }, + { + "epoch": 1.4105035267825965, + "grad_norm": 0.3554935843486001, + "learning_rate": 6.375148731872234e-05, + "loss": 2.6903, + "step": 30296 + }, + { + "epoch": 1.4105500849686896, + "grad_norm": 0.35999570459170877, + "learning_rate": 6.37488830224735e-05, + "loss": 2.7316, + "step": 30297 + }, + { + "epoch": 1.4105966431547827, + "grad_norm": 0.38255632527238875, + "learning_rate": 6.374627868587304e-05, + "loss": 2.7723, + "step": 30298 + }, + { + "epoch": 1.4106432013408758, + "grad_norm": 0.36683425268711023, + "learning_rate": 6.374367430892862e-05, + "loss": 2.8423, + "step": 30299 + }, + { + "epoch": 1.410689759526969, + "grad_norm": 0.4078335940091295, + "learning_rate": 6.374106989164785e-05, + "loss": 2.7485, + "step": 30300 + }, + { + "epoch": 1.4107363177130618, + "grad_norm": 0.3986516810051667, + "learning_rate": 6.37384654340384e-05, + "loss": 2.7318, + "step": 30301 + }, + { + "epoch": 1.410782875899155, + "grad_norm": 0.3703169954611642, + "learning_rate": 6.373586093610791e-05, + "loss": 2.7235, + "step": 30302 + }, + { + "epoch": 1.410829434085248, + "grad_norm": 0.3936054837398863, + "learning_rate": 6.373325639786402e-05, + "loss": 2.7837, + "step": 30303 + }, + { + "epoch": 1.410875992271341, + "grad_norm": 0.39555569870594987, + "learning_rate": 6.373065181931439e-05, + "loss": 2.7948, + "step": 30304 + }, + { + "epoch": 1.410922550457434, + "grad_norm": 0.37459684421970385, + "learning_rate": 6.372804720046664e-05, + "loss": 2.8155, + "step": 30305 + }, + { + "epoch": 1.4109691086435272, + "grad_norm": 0.3742986183327808, + "learning_rate": 6.372544254132842e-05, + "loss": 2.8135, + "step": 30306 + }, + { + "epoch": 1.4110156668296203, + "grad_norm": 0.38231122210868645, + "learning_rate": 6.372283784190737e-05, + "loss": 2.6564, + "step": 30307 + }, + { + "epoch": 1.4110622250157134, + "grad_norm": 0.33150876465846985, + "learning_rate": 6.372023310221115e-05, + "loss": 2.7048, + "step": 30308 + }, + { + "epoch": 1.4111087832018065, + "grad_norm": 0.3642492684213307, + "learning_rate": 6.371762832224741e-05, + "loss": 2.8242, + "step": 30309 + }, + { + "epoch": 1.4111553413878997, + "grad_norm": 0.3266420924895798, + "learning_rate": 6.371502350202377e-05, + "loss": 2.7198, + "step": 30310 + }, + { + "epoch": 1.4112018995739926, + "grad_norm": 0.35334508721104696, + "learning_rate": 6.371241864154788e-05, + "loss": 2.7768, + "step": 30311 + }, + { + "epoch": 1.4112484577600857, + "grad_norm": 0.31499287536136983, + "learning_rate": 6.370981374082739e-05, + "loss": 2.8078, + "step": 30312 + }, + { + "epoch": 1.4112950159461788, + "grad_norm": 0.3570911623625266, + "learning_rate": 6.370720879986996e-05, + "loss": 2.8699, + "step": 30313 + }, + { + "epoch": 1.4113415741322717, + "grad_norm": 0.3370411202969839, + "learning_rate": 6.370460381868324e-05, + "loss": 2.7768, + "step": 30314 + }, + { + "epoch": 
1.4113881323183648, + "grad_norm": 0.34612610550462525, + "learning_rate": 6.370199879727483e-05, + "loss": 2.8187, + "step": 30315 + }, + { + "epoch": 1.411434690504458, + "grad_norm": 0.34830458661217556, + "learning_rate": 6.369939373565241e-05, + "loss": 2.6991, + "step": 30316 + }, + { + "epoch": 1.411481248690551, + "grad_norm": 0.3086722521614043, + "learning_rate": 6.369678863382363e-05, + "loss": 2.6547, + "step": 30317 + }, + { + "epoch": 1.4115278068766441, + "grad_norm": 0.36620846379315486, + "learning_rate": 6.369418349179612e-05, + "loss": 2.7746, + "step": 30318 + }, + { + "epoch": 1.4115743650627373, + "grad_norm": 0.33379668039531907, + "learning_rate": 6.369157830957751e-05, + "loss": 2.7794, + "step": 30319 + }, + { + "epoch": 1.4116209232488302, + "grad_norm": 0.3486021554755372, + "learning_rate": 6.368897308717548e-05, + "loss": 2.7644, + "step": 30320 + }, + { + "epoch": 1.4116674814349233, + "grad_norm": 0.35266277513896177, + "learning_rate": 6.368636782459767e-05, + "loss": 2.7547, + "step": 30321 + }, + { + "epoch": 1.4117140396210164, + "grad_norm": 0.34847010964472, + "learning_rate": 6.368376252185172e-05, + "loss": 2.8411, + "step": 30322 + }, + { + "epoch": 1.4117605978071095, + "grad_norm": 0.36814494056959945, + "learning_rate": 6.368115717894528e-05, + "loss": 2.8137, + "step": 30323 + }, + { + "epoch": 1.4118071559932024, + "grad_norm": 0.33677575530234277, + "learning_rate": 6.367855179588597e-05, + "loss": 2.6524, + "step": 30324 + }, + { + "epoch": 1.4118537141792955, + "grad_norm": 0.33063603307615297, + "learning_rate": 6.367594637268146e-05, + "loss": 2.8155, + "step": 30325 + }, + { + "epoch": 1.4119002723653886, + "grad_norm": 0.3968868596519044, + "learning_rate": 6.367334090933941e-05, + "loss": 2.8241, + "step": 30326 + }, + { + "epoch": 1.4119468305514817, + "grad_norm": 0.3501152817860972, + "learning_rate": 6.367073540586746e-05, + "loss": 2.8736, + "step": 30327 + }, + { + "epoch": 1.4119933887375749, + "grad_norm": 0.34764330380455793, + "learning_rate": 6.366812986227321e-05, + "loss": 2.7574, + "step": 30328 + }, + { + "epoch": 1.412039946923668, + "grad_norm": 0.363551014072396, + "learning_rate": 6.366552427856438e-05, + "loss": 2.7815, + "step": 30329 + }, + { + "epoch": 1.4120865051097609, + "grad_norm": 0.3777403960199233, + "learning_rate": 6.366291865474856e-05, + "loss": 2.6843, + "step": 30330 + }, + { + "epoch": 1.412133063295854, + "grad_norm": 0.3456307442786039, + "learning_rate": 6.366031299083344e-05, + "loss": 2.7899, + "step": 30331 + }, + { + "epoch": 1.412179621481947, + "grad_norm": 0.3680576391852672, + "learning_rate": 6.365770728682664e-05, + "loss": 2.7962, + "step": 30332 + }, + { + "epoch": 1.4122261796680402, + "grad_norm": 0.37567579645478916, + "learning_rate": 6.365510154273581e-05, + "loss": 2.7029, + "step": 30333 + }, + { + "epoch": 1.412272737854133, + "grad_norm": 0.349572384331284, + "learning_rate": 6.36524957585686e-05, + "loss": 2.657, + "step": 30334 + }, + { + "epoch": 1.4123192960402262, + "grad_norm": 0.349554222510534, + "learning_rate": 6.364988993433267e-05, + "loss": 2.7418, + "step": 30335 + }, + { + "epoch": 1.4123658542263193, + "grad_norm": 0.4015333509397996, + "learning_rate": 6.364728407003565e-05, + "loss": 2.7512, + "step": 30336 + }, + { + "epoch": 1.4124124124124124, + "grad_norm": 0.359555048180609, + "learning_rate": 6.36446781656852e-05, + "loss": 2.7158, + "step": 30337 + }, + { + "epoch": 1.4124589705985056, + "grad_norm": 0.3657345938912369, + "learning_rate": 
6.364207222128895e-05, + "loss": 2.7056, + "step": 30338 + }, + { + "epoch": 1.4125055287845987, + "grad_norm": 0.3427972474731718, + "learning_rate": 6.363946623685459e-05, + "loss": 2.652, + "step": 30339 + }, + { + "epoch": 1.4125520869706916, + "grad_norm": 0.349709450289055, + "learning_rate": 6.363686021238972e-05, + "loss": 2.7979, + "step": 30340 + }, + { + "epoch": 1.4125986451567847, + "grad_norm": 0.3300013819599489, + "learning_rate": 6.363425414790202e-05, + "loss": 2.8816, + "step": 30341 + }, + { + "epoch": 1.4126452033428778, + "grad_norm": 0.3791515253972249, + "learning_rate": 6.363164804339912e-05, + "loss": 2.7932, + "step": 30342 + }, + { + "epoch": 1.412691761528971, + "grad_norm": 0.3392112775025257, + "learning_rate": 6.362904189888868e-05, + "loss": 2.7471, + "step": 30343 + }, + { + "epoch": 1.4127383197150638, + "grad_norm": 0.3256981141601583, + "learning_rate": 6.362643571437835e-05, + "loss": 2.7532, + "step": 30344 + }, + { + "epoch": 1.412784877901157, + "grad_norm": 0.3330835959775431, + "learning_rate": 6.362382948987575e-05, + "loss": 2.7571, + "step": 30345 + }, + { + "epoch": 1.41283143608725, + "grad_norm": 0.34432172896754426, + "learning_rate": 6.362122322538857e-05, + "loss": 2.7719, + "step": 30346 + }, + { + "epoch": 1.4128779942733432, + "grad_norm": 0.34045944955518304, + "learning_rate": 6.361861692092443e-05, + "loss": 2.7464, + "step": 30347 + }, + { + "epoch": 1.4129245524594363, + "grad_norm": 0.34586918846643416, + "learning_rate": 6.3616010576491e-05, + "loss": 2.6505, + "step": 30348 + }, + { + "epoch": 1.4129711106455294, + "grad_norm": 0.3209746513242424, + "learning_rate": 6.361340419209593e-05, + "loss": 2.7461, + "step": 30349 + }, + { + "epoch": 1.4130176688316223, + "grad_norm": 0.3443942310577927, + "learning_rate": 6.361079776774684e-05, + "loss": 2.6876, + "step": 30350 + }, + { + "epoch": 1.4130642270177154, + "grad_norm": 0.33896381077973764, + "learning_rate": 6.360819130345141e-05, + "loss": 2.831, + "step": 30351 + }, + { + "epoch": 1.4131107852038085, + "grad_norm": 0.340569702137603, + "learning_rate": 6.360558479921728e-05, + "loss": 2.7732, + "step": 30352 + }, + { + "epoch": 1.4131573433899014, + "grad_norm": 0.39380156405715394, + "learning_rate": 6.360297825505211e-05, + "loss": 2.7097, + "step": 30353 + }, + { + "epoch": 1.4132039015759945, + "grad_norm": 0.3188191092873575, + "learning_rate": 6.360037167096352e-05, + "loss": 2.7785, + "step": 30354 + }, + { + "epoch": 1.4132504597620876, + "grad_norm": 0.35560968547801164, + "learning_rate": 6.35977650469592e-05, + "loss": 2.6178, + "step": 30355 + }, + { + "epoch": 1.4132970179481807, + "grad_norm": 0.3315345566668916, + "learning_rate": 6.359515838304675e-05, + "loss": 2.8859, + "step": 30356 + }, + { + "epoch": 1.4133435761342739, + "grad_norm": 0.3363311922031913, + "learning_rate": 6.359255167923385e-05, + "loss": 2.6237, + "step": 30357 + }, + { + "epoch": 1.413390134320367, + "grad_norm": 0.3157318044283925, + "learning_rate": 6.358994493552818e-05, + "loss": 2.8081, + "step": 30358 + }, + { + "epoch": 1.4134366925064599, + "grad_norm": 0.34801238084747804, + "learning_rate": 6.358733815193735e-05, + "loss": 2.6691, + "step": 30359 + }, + { + "epoch": 1.413483250692553, + "grad_norm": 0.341108692880975, + "learning_rate": 6.358473132846901e-05, + "loss": 2.7142, + "step": 30360 + }, + { + "epoch": 1.413529808878646, + "grad_norm": 0.3377516423286148, + "learning_rate": 6.358212446513082e-05, + "loss": 2.7757, + "step": 30361 + }, + { + "epoch": 
1.4135763670647392, + "grad_norm": 0.37947040748341343, + "learning_rate": 6.357951756193044e-05, + "loss": 2.6511, + "step": 30362 + }, + { + "epoch": 1.413622925250832, + "grad_norm": 0.3357459804592907, + "learning_rate": 6.357691061887551e-05, + "loss": 2.7542, + "step": 30363 + }, + { + "epoch": 1.4136694834369252, + "grad_norm": 0.3400760503525272, + "learning_rate": 6.35743036359737e-05, + "loss": 2.7103, + "step": 30364 + }, + { + "epoch": 1.4137160416230183, + "grad_norm": 0.3530129597539701, + "learning_rate": 6.357169661323264e-05, + "loss": 2.6858, + "step": 30365 + }, + { + "epoch": 1.4137625998091115, + "grad_norm": 0.33297005555949766, + "learning_rate": 6.356908955065998e-05, + "loss": 2.6961, + "step": 30366 + }, + { + "epoch": 1.4138091579952046, + "grad_norm": 0.3353102406828794, + "learning_rate": 6.356648244826338e-05, + "loss": 2.712, + "step": 30367 + }, + { + "epoch": 1.4138557161812977, + "grad_norm": 0.3470442054991848, + "learning_rate": 6.35638753060505e-05, + "loss": 2.7051, + "step": 30368 + }, + { + "epoch": 1.4139022743673906, + "grad_norm": 0.3141354818743408, + "learning_rate": 6.356126812402897e-05, + "loss": 2.6864, + "step": 30369 + }, + { + "epoch": 1.4139488325534837, + "grad_norm": 0.3378781888684815, + "learning_rate": 6.355866090220646e-05, + "loss": 2.8033, + "step": 30370 + }, + { + "epoch": 1.4139953907395768, + "grad_norm": 0.3294732193039081, + "learning_rate": 6.355605364059061e-05, + "loss": 2.6956, + "step": 30371 + }, + { + "epoch": 1.41404194892567, + "grad_norm": 0.33086546665381966, + "learning_rate": 6.35534463391891e-05, + "loss": 2.7531, + "step": 30372 + }, + { + "epoch": 1.4140885071117628, + "grad_norm": 0.3309456755029973, + "learning_rate": 6.355083899800954e-05, + "loss": 2.7819, + "step": 30373 + }, + { + "epoch": 1.414135065297856, + "grad_norm": 0.3450644405684001, + "learning_rate": 6.354823161705958e-05, + "loss": 2.8, + "step": 30374 + }, + { + "epoch": 1.414181623483949, + "grad_norm": 0.32363925957255446, + "learning_rate": 6.354562419634693e-05, + "loss": 2.6897, + "step": 30375 + }, + { + "epoch": 1.4142281816700422, + "grad_norm": 0.33939185137379085, + "learning_rate": 6.35430167358792e-05, + "loss": 2.7876, + "step": 30376 + }, + { + "epoch": 1.4142747398561353, + "grad_norm": 0.3339795749470188, + "learning_rate": 6.354040923566404e-05, + "loss": 2.7987, + "step": 30377 + }, + { + "epoch": 1.4143212980422284, + "grad_norm": 0.3642837179718069, + "learning_rate": 6.353780169570912e-05, + "loss": 2.8065, + "step": 30378 + }, + { + "epoch": 1.4143678562283213, + "grad_norm": 0.3353051945126873, + "learning_rate": 6.35351941160221e-05, + "loss": 2.7203, + "step": 30379 + }, + { + "epoch": 1.4144144144144144, + "grad_norm": 0.3538707543909288, + "learning_rate": 6.353258649661061e-05, + "loss": 2.797, + "step": 30380 + }, + { + "epoch": 1.4144609726005075, + "grad_norm": 0.36190619673941, + "learning_rate": 6.352997883748231e-05, + "loss": 2.7543, + "step": 30381 + }, + { + "epoch": 1.4145075307866006, + "grad_norm": 0.401424087484275, + "learning_rate": 6.352737113864484e-05, + "loss": 2.7018, + "step": 30382 + }, + { + "epoch": 1.4145540889726935, + "grad_norm": 0.38545581234722415, + "learning_rate": 6.352476340010588e-05, + "loss": 2.8342, + "step": 30383 + }, + { + "epoch": 1.4146006471587866, + "grad_norm": 0.3760164685685694, + "learning_rate": 6.352215562187307e-05, + "loss": 2.7783, + "step": 30384 + }, + { + "epoch": 1.4146472053448798, + "grad_norm": 0.3478207927288725, + "learning_rate": 
6.351954780395407e-05, + "loss": 2.7917, + "step": 30385 + }, + { + "epoch": 1.4146937635309729, + "grad_norm": 0.369952569244448, + "learning_rate": 6.351693994635653e-05, + "loss": 2.7588, + "step": 30386 + }, + { + "epoch": 1.414740321717066, + "grad_norm": 0.3685053636714482, + "learning_rate": 6.35143320490881e-05, + "loss": 2.6714, + "step": 30387 + }, + { + "epoch": 1.414786879903159, + "grad_norm": 0.3511363447998739, + "learning_rate": 6.351172411215642e-05, + "loss": 2.6884, + "step": 30388 + }, + { + "epoch": 1.414833438089252, + "grad_norm": 0.31976200961427953, + "learning_rate": 6.35091161355692e-05, + "loss": 2.7348, + "step": 30389 + }, + { + "epoch": 1.414879996275345, + "grad_norm": 0.3463790731883485, + "learning_rate": 6.350650811933401e-05, + "loss": 2.7607, + "step": 30390 + }, + { + "epoch": 1.4149265544614382, + "grad_norm": 0.34080785219630483, + "learning_rate": 6.350390006345858e-05, + "loss": 2.7614, + "step": 30391 + }, + { + "epoch": 1.4149731126475311, + "grad_norm": 0.34212921708618627, + "learning_rate": 6.350129196795054e-05, + "loss": 2.673, + "step": 30392 + }, + { + "epoch": 1.4150196708336242, + "grad_norm": 0.3464674815815227, + "learning_rate": 6.349868383281751e-05, + "loss": 2.6572, + "step": 30393 + }, + { + "epoch": 1.4150662290197173, + "grad_norm": 0.3169645630852689, + "learning_rate": 6.34960756580672e-05, + "loss": 2.7283, + "step": 30394 + }, + { + "epoch": 1.4151127872058105, + "grad_norm": 0.31478895356140374, + "learning_rate": 6.34934674437072e-05, + "loss": 2.7509, + "step": 30395 + }, + { + "epoch": 1.4151593453919036, + "grad_norm": 0.35682609746124744, + "learning_rate": 6.349085918974525e-05, + "loss": 2.7974, + "step": 30396 + }, + { + "epoch": 1.4152059035779967, + "grad_norm": 0.34664845736012634, + "learning_rate": 6.348825089618892e-05, + "loss": 2.7267, + "step": 30397 + }, + { + "epoch": 1.4152524617640898, + "grad_norm": 0.36258722319603764, + "learning_rate": 6.348564256304592e-05, + "loss": 2.6922, + "step": 30398 + }, + { + "epoch": 1.4152990199501827, + "grad_norm": 0.3361930811332413, + "learning_rate": 6.348303419032389e-05, + "loss": 2.78, + "step": 30399 + }, + { + "epoch": 1.4153455781362758, + "grad_norm": 0.3630624848049582, + "learning_rate": 6.348042577803046e-05, + "loss": 2.7022, + "step": 30400 + }, + { + "epoch": 1.415392136322369, + "grad_norm": 0.3522319296924499, + "learning_rate": 6.347781732617332e-05, + "loss": 2.7036, + "step": 30401 + }, + { + "epoch": 1.4154386945084618, + "grad_norm": 0.35962135864545697, + "learning_rate": 6.347520883476014e-05, + "loss": 2.6761, + "step": 30402 + }, + { + "epoch": 1.415485252694555, + "grad_norm": 0.34900197037398495, + "learning_rate": 6.347260030379852e-05, + "loss": 2.797, + "step": 30403 + }, + { + "epoch": 1.415531810880648, + "grad_norm": 0.3242091589976891, + "learning_rate": 6.346999173329616e-05, + "loss": 2.6907, + "step": 30404 + }, + { + "epoch": 1.4155783690667412, + "grad_norm": 0.37199071688423613, + "learning_rate": 6.346738312326069e-05, + "loss": 2.7128, + "step": 30405 + }, + { + "epoch": 1.4156249272528343, + "grad_norm": 0.3334283351718046, + "learning_rate": 6.346477447369979e-05, + "loss": 2.7032, + "step": 30406 + }, + { + "epoch": 1.4156714854389274, + "grad_norm": 0.3460499821880253, + "learning_rate": 6.34621657846211e-05, + "loss": 2.704, + "step": 30407 + }, + { + "epoch": 1.4157180436250203, + "grad_norm": 0.35134787344575025, + "learning_rate": 6.345955705603227e-05, + "loss": 2.8453, + "step": 30408 + }, + { + "epoch": 
1.4157646018111134, + "grad_norm": 0.3583361251288892, + "learning_rate": 6.345694828794097e-05, + "loss": 2.7536, + "step": 30409 + }, + { + "epoch": 1.4158111599972065, + "grad_norm": 0.3377433262729189, + "learning_rate": 6.345433948035484e-05, + "loss": 2.7822, + "step": 30410 + }, + { + "epoch": 1.4158577181832996, + "grad_norm": 0.35622560871101955, + "learning_rate": 6.345173063328156e-05, + "loss": 2.8029, + "step": 30411 + }, + { + "epoch": 1.4159042763693925, + "grad_norm": 0.333791413464441, + "learning_rate": 6.34491217467288e-05, + "loss": 2.8068, + "step": 30412 + }, + { + "epoch": 1.4159508345554856, + "grad_norm": 0.3350912925841908, + "learning_rate": 6.344651282070416e-05, + "loss": 2.6391, + "step": 30413 + }, + { + "epoch": 1.4159973927415788, + "grad_norm": 0.33753755328899077, + "learning_rate": 6.344390385521533e-05, + "loss": 2.8656, + "step": 30414 + }, + { + "epoch": 1.4160439509276719, + "grad_norm": 0.3711554280857016, + "learning_rate": 6.344129485026999e-05, + "loss": 2.7804, + "step": 30415 + }, + { + "epoch": 1.416090509113765, + "grad_norm": 0.30406120104682904, + "learning_rate": 6.343868580587576e-05, + "loss": 2.7397, + "step": 30416 + }, + { + "epoch": 1.416137067299858, + "grad_norm": 0.3523259390482244, + "learning_rate": 6.343607672204032e-05, + "loss": 2.7421, + "step": 30417 + }, + { + "epoch": 1.416183625485951, + "grad_norm": 0.3469131608929806, + "learning_rate": 6.343346759877131e-05, + "loss": 2.8018, + "step": 30418 + }, + { + "epoch": 1.4162301836720441, + "grad_norm": 0.33166601225752707, + "learning_rate": 6.34308584360764e-05, + "loss": 2.7385, + "step": 30419 + }, + { + "epoch": 1.4162767418581372, + "grad_norm": 0.3414985769162479, + "learning_rate": 6.342824923396324e-05, + "loss": 2.6933, + "step": 30420 + }, + { + "epoch": 1.4163233000442303, + "grad_norm": 0.34033960209985786, + "learning_rate": 6.342563999243949e-05, + "loss": 2.7386, + "step": 30421 + }, + { + "epoch": 1.4163698582303232, + "grad_norm": 0.31976370498114903, + "learning_rate": 6.34230307115128e-05, + "loss": 2.6865, + "step": 30422 + }, + { + "epoch": 1.4164164164164164, + "grad_norm": 0.3356207499421074, + "learning_rate": 6.342042139119085e-05, + "loss": 2.7569, + "step": 30423 + }, + { + "epoch": 1.4164629746025095, + "grad_norm": 0.33856814675030544, + "learning_rate": 6.341781203148129e-05, + "loss": 2.8219, + "step": 30424 + }, + { + "epoch": 1.4165095327886026, + "grad_norm": 0.3463002898169703, + "learning_rate": 6.341520263239178e-05, + "loss": 2.7204, + "step": 30425 + }, + { + "epoch": 1.4165560909746957, + "grad_norm": 0.3410735249358559, + "learning_rate": 6.341259319392995e-05, + "loss": 2.7428, + "step": 30426 + }, + { + "epoch": 1.4166026491607888, + "grad_norm": 0.31466753639013106, + "learning_rate": 6.340998371610348e-05, + "loss": 2.7575, + "step": 30427 + }, + { + "epoch": 1.4166492073468817, + "grad_norm": 0.35152649818404186, + "learning_rate": 6.340737419892006e-05, + "loss": 2.9016, + "step": 30428 + }, + { + "epoch": 1.4166957655329748, + "grad_norm": 0.35155292160511015, + "learning_rate": 6.340476464238728e-05, + "loss": 2.7302, + "step": 30429 + }, + { + "epoch": 1.416742323719068, + "grad_norm": 0.3362974612052523, + "learning_rate": 6.340215504651287e-05, + "loss": 2.7316, + "step": 30430 + }, + { + "epoch": 1.416788881905161, + "grad_norm": 0.36952661474604703, + "learning_rate": 6.339954541130443e-05, + "loss": 2.8246, + "step": 30431 + }, + { + "epoch": 1.416835440091254, + "grad_norm": 0.3513998493305478, + "learning_rate": 
6.339693573676965e-05, + "loss": 2.7302, + "step": 30432 + }, + { + "epoch": 1.416881998277347, + "grad_norm": 0.34775344833479754, + "learning_rate": 6.339432602291618e-05, + "loss": 2.7674, + "step": 30433 + }, + { + "epoch": 1.4169285564634402, + "grad_norm": 0.32787472300103776, + "learning_rate": 6.33917162697517e-05, + "loss": 2.7438, + "step": 30434 + }, + { + "epoch": 1.4169751146495333, + "grad_norm": 0.3782099556949571, + "learning_rate": 6.338910647728383e-05, + "loss": 2.7232, + "step": 30435 + }, + { + "epoch": 1.4170216728356264, + "grad_norm": 0.3150893464038583, + "learning_rate": 6.338649664552026e-05, + "loss": 2.6649, + "step": 30436 + }, + { + "epoch": 1.4170682310217195, + "grad_norm": 0.33788907673286633, + "learning_rate": 6.338388677446865e-05, + "loss": 2.7967, + "step": 30437 + }, + { + "epoch": 1.4171147892078124, + "grad_norm": 0.36133116262846643, + "learning_rate": 6.338127686413665e-05, + "loss": 2.6635, + "step": 30438 + }, + { + "epoch": 1.4171613473939055, + "grad_norm": 0.33870951498057966, + "learning_rate": 6.33786669145319e-05, + "loss": 2.7647, + "step": 30439 + }, + { + "epoch": 1.4172079055799987, + "grad_norm": 0.366587540617181, + "learning_rate": 6.33760569256621e-05, + "loss": 2.7469, + "step": 30440 + }, + { + "epoch": 1.4172544637660915, + "grad_norm": 0.32848811236571557, + "learning_rate": 6.337344689753489e-05, + "loss": 2.7129, + "step": 30441 + }, + { + "epoch": 1.4173010219521847, + "grad_norm": 0.35959635550827285, + "learning_rate": 6.337083683015791e-05, + "loss": 2.7072, + "step": 30442 + }, + { + "epoch": 1.4173475801382778, + "grad_norm": 0.37529500351408035, + "learning_rate": 6.336822672353888e-05, + "loss": 2.7653, + "step": 30443 + }, + { + "epoch": 1.417394138324371, + "grad_norm": 0.3356760317819838, + "learning_rate": 6.33656165776854e-05, + "loss": 2.7816, + "step": 30444 + }, + { + "epoch": 1.417440696510464, + "grad_norm": 0.3717214576705185, + "learning_rate": 6.336300639260515e-05, + "loss": 2.7852, + "step": 30445 + }, + { + "epoch": 1.4174872546965571, + "grad_norm": 0.37720303953236484, + "learning_rate": 6.336039616830578e-05, + "loss": 2.7519, + "step": 30446 + }, + { + "epoch": 1.41753381288265, + "grad_norm": 0.35065890477614764, + "learning_rate": 6.335778590479497e-05, + "loss": 2.7988, + "step": 30447 + }, + { + "epoch": 1.4175803710687431, + "grad_norm": 0.36637461450684433, + "learning_rate": 6.33551756020804e-05, + "loss": 2.7238, + "step": 30448 + }, + { + "epoch": 1.4176269292548362, + "grad_norm": 0.33505356802225933, + "learning_rate": 6.335256526016968e-05, + "loss": 2.7471, + "step": 30449 + }, + { + "epoch": 1.4176734874409294, + "grad_norm": 0.3590248628942165, + "learning_rate": 6.33499548790705e-05, + "loss": 2.6914, + "step": 30450 + }, + { + "epoch": 1.4177200456270223, + "grad_norm": 0.37247538884000764, + "learning_rate": 6.334734445879053e-05, + "loss": 2.6803, + "step": 30451 + }, + { + "epoch": 1.4177666038131154, + "grad_norm": 0.366187374800368, + "learning_rate": 6.33447339993374e-05, + "loss": 2.8938, + "step": 30452 + }, + { + "epoch": 1.4178131619992085, + "grad_norm": 0.354456827587651, + "learning_rate": 6.334212350071881e-05, + "loss": 2.8065, + "step": 30453 + }, + { + "epoch": 1.4178597201853016, + "grad_norm": 0.37463161160903546, + "learning_rate": 6.33395129629424e-05, + "loss": 2.6977, + "step": 30454 + }, + { + "epoch": 1.4179062783713947, + "grad_norm": 0.3519886423736896, + "learning_rate": 6.333690238601582e-05, + "loss": 2.7122, + "step": 30455 + }, + { + "epoch": 
1.4179528365574878, + "grad_norm": 0.3234514431980096, + "learning_rate": 6.333429176994676e-05, + "loss": 2.7718, + "step": 30456 + }, + { + "epoch": 1.4179993947435807, + "grad_norm": 0.3679120489486335, + "learning_rate": 6.333168111474286e-05, + "loss": 2.7485, + "step": 30457 + }, + { + "epoch": 1.4180459529296738, + "grad_norm": 0.3512948990339378, + "learning_rate": 6.332907042041179e-05, + "loss": 2.782, + "step": 30458 + }, + { + "epoch": 1.418092511115767, + "grad_norm": 0.3537481005280984, + "learning_rate": 6.332645968696121e-05, + "loss": 2.6242, + "step": 30459 + }, + { + "epoch": 1.41813906930186, + "grad_norm": 0.32420283586543813, + "learning_rate": 6.332384891439877e-05, + "loss": 2.7831, + "step": 30460 + }, + { + "epoch": 1.418185627487953, + "grad_norm": 0.3871006403859048, + "learning_rate": 6.332123810273216e-05, + "loss": 2.8474, + "step": 30461 + }, + { + "epoch": 1.418232185674046, + "grad_norm": 0.3271877857822072, + "learning_rate": 6.331862725196903e-05, + "loss": 2.7589, + "step": 30462 + }, + { + "epoch": 1.4182787438601392, + "grad_norm": 0.3293942890228652, + "learning_rate": 6.331601636211704e-05, + "loss": 2.8297, + "step": 30463 + }, + { + "epoch": 1.4183253020462323, + "grad_norm": 0.35968258373017925, + "learning_rate": 6.331340543318385e-05, + "loss": 2.7904, + "step": 30464 + }, + { + "epoch": 1.4183718602323254, + "grad_norm": 0.33005351267847094, + "learning_rate": 6.331079446517712e-05, + "loss": 2.7802, + "step": 30465 + }, + { + "epoch": 1.4184184184184185, + "grad_norm": 0.3587729798600443, + "learning_rate": 6.330818345810454e-05, + "loss": 2.7004, + "step": 30466 + }, + { + "epoch": 1.4184649766045114, + "grad_norm": 0.33719249850806016, + "learning_rate": 6.330557241197374e-05, + "loss": 2.6953, + "step": 30467 + }, + { + "epoch": 1.4185115347906045, + "grad_norm": 0.352740718374584, + "learning_rate": 6.33029613267924e-05, + "loss": 2.74, + "step": 30468 + }, + { + "epoch": 1.4185580929766977, + "grad_norm": 0.35171533296660723, + "learning_rate": 6.330035020256816e-05, + "loss": 2.7504, + "step": 30469 + }, + { + "epoch": 1.4186046511627908, + "grad_norm": 0.35759575033495367, + "learning_rate": 6.329773903930874e-05, + "loss": 2.6842, + "step": 30470 + }, + { + "epoch": 1.4186512093488837, + "grad_norm": 0.34655523472748556, + "learning_rate": 6.329512783702174e-05, + "loss": 2.7239, + "step": 30471 + }, + { + "epoch": 1.4186977675349768, + "grad_norm": 0.3625806571154301, + "learning_rate": 6.329251659571486e-05, + "loss": 2.6775, + "step": 30472 + }, + { + "epoch": 1.41874432572107, + "grad_norm": 0.3651301801180677, + "learning_rate": 6.328990531539574e-05, + "loss": 2.7309, + "step": 30473 + }, + { + "epoch": 1.418790883907163, + "grad_norm": 0.3587707878929472, + "learning_rate": 6.328729399607207e-05, + "loss": 2.7449, + "step": 30474 + }, + { + "epoch": 1.4188374420932561, + "grad_norm": 0.3652309203017688, + "learning_rate": 6.328468263775149e-05, + "loss": 2.7601, + "step": 30475 + }, + { + "epoch": 1.4188840002793492, + "grad_norm": 0.3588207587674654, + "learning_rate": 6.328207124044168e-05, + "loss": 2.7477, + "step": 30476 + }, + { + "epoch": 1.4189305584654421, + "grad_norm": 0.35848827704261294, + "learning_rate": 6.32794598041503e-05, + "loss": 2.8482, + "step": 30477 + }, + { + "epoch": 1.4189771166515353, + "grad_norm": 0.34284752890876163, + "learning_rate": 6.327684832888501e-05, + "loss": 2.7007, + "step": 30478 + }, + { + "epoch": 1.4190236748376284, + "grad_norm": 0.3167502047669574, + "learning_rate": 
6.32742368146535e-05, + "loss": 2.8117, + "step": 30479 + }, + { + "epoch": 1.4190702330237213, + "grad_norm": 0.3552606297964634, + "learning_rate": 6.32716252614634e-05, + "loss": 2.8084, + "step": 30480 + }, + { + "epoch": 1.4191167912098144, + "grad_norm": 0.36187503428869044, + "learning_rate": 6.326901366932237e-05, + "loss": 2.7369, + "step": 30481 + }, + { + "epoch": 1.4191633493959075, + "grad_norm": 0.33884087850502265, + "learning_rate": 6.326640203823811e-05, + "loss": 2.6999, + "step": 30482 + }, + { + "epoch": 1.4192099075820006, + "grad_norm": 0.3523657440518169, + "learning_rate": 6.326379036821825e-05, + "loss": 2.8315, + "step": 30483 + }, + { + "epoch": 1.4192564657680937, + "grad_norm": 0.34777445943917296, + "learning_rate": 6.32611786592705e-05, + "loss": 2.7984, + "step": 30484 + }, + { + "epoch": 1.4193030239541868, + "grad_norm": 0.3513807081597392, + "learning_rate": 6.325856691140248e-05, + "loss": 2.7749, + "step": 30485 + }, + { + "epoch": 1.41934958214028, + "grad_norm": 0.32473020005027114, + "learning_rate": 6.325595512462187e-05, + "loss": 2.8388, + "step": 30486 + }, + { + "epoch": 1.4193961403263728, + "grad_norm": 0.3542018410811904, + "learning_rate": 6.325334329893633e-05, + "loss": 2.7587, + "step": 30487 + }, + { + "epoch": 1.419442698512466, + "grad_norm": 0.345739937812984, + "learning_rate": 6.325073143435356e-05, + "loss": 2.7753, + "step": 30488 + }, + { + "epoch": 1.419489256698559, + "grad_norm": 0.3737192078004881, + "learning_rate": 6.324811953088118e-05, + "loss": 2.7824, + "step": 30489 + }, + { + "epoch": 1.419535814884652, + "grad_norm": 0.3342297938856848, + "learning_rate": 6.324550758852687e-05, + "loss": 2.7185, + "step": 30490 + }, + { + "epoch": 1.419582373070745, + "grad_norm": 0.361393392569378, + "learning_rate": 6.32428956072983e-05, + "loss": 2.6962, + "step": 30491 + }, + { + "epoch": 1.4196289312568382, + "grad_norm": 0.3282941360328595, + "learning_rate": 6.324028358720316e-05, + "loss": 2.6715, + "step": 30492 + }, + { + "epoch": 1.4196754894429313, + "grad_norm": 0.3308337895070475, + "learning_rate": 6.323767152824908e-05, + "loss": 2.753, + "step": 30493 + }, + { + "epoch": 1.4197220476290244, + "grad_norm": 0.3471764443041233, + "learning_rate": 6.323505943044372e-05, + "loss": 2.7183, + "step": 30494 + }, + { + "epoch": 1.4197686058151175, + "grad_norm": 0.33057160268263974, + "learning_rate": 6.323244729379478e-05, + "loss": 2.7814, + "step": 30495 + }, + { + "epoch": 1.4198151640012104, + "grad_norm": 0.3542752804012373, + "learning_rate": 6.322983511830991e-05, + "loss": 2.7453, + "step": 30496 + }, + { + "epoch": 1.4198617221873036, + "grad_norm": 0.3349791105904551, + "learning_rate": 6.322722290399678e-05, + "loss": 2.7375, + "step": 30497 + }, + { + "epoch": 1.4199082803733967, + "grad_norm": 0.33504727454838445, + "learning_rate": 6.322461065086305e-05, + "loss": 2.7411, + "step": 30498 + }, + { + "epoch": 1.4199548385594898, + "grad_norm": 0.33111775884865485, + "learning_rate": 6.322199835891638e-05, + "loss": 2.7355, + "step": 30499 + }, + { + "epoch": 1.4200013967455827, + "grad_norm": 0.33779190559677547, + "learning_rate": 6.321938602816446e-05, + "loss": 2.7163, + "step": 30500 + }, + { + "epoch": 1.4200479549316758, + "grad_norm": 0.33608712428767146, + "learning_rate": 6.321677365861494e-05, + "loss": 2.776, + "step": 30501 + }, + { + "epoch": 1.420094513117769, + "grad_norm": 0.34458330659186587, + "learning_rate": 6.32141612502755e-05, + "loss": 2.6803, + "step": 30502 + }, + { + "epoch": 
1.420141071303862, + "grad_norm": 0.30334567021626446, + "learning_rate": 6.321154880315379e-05, + "loss": 2.6829, + "step": 30503 + }, + { + "epoch": 1.4201876294899551, + "grad_norm": 0.3282800427214941, + "learning_rate": 6.320893631725749e-05, + "loss": 2.7403, + "step": 30504 + }, + { + "epoch": 1.4202341876760483, + "grad_norm": 0.28671309747812307, + "learning_rate": 6.320632379259426e-05, + "loss": 2.5548, + "step": 30505 + }, + { + "epoch": 1.4202807458621411, + "grad_norm": 0.32087608015983043, + "learning_rate": 6.320371122917179e-05, + "loss": 2.6329, + "step": 30506 + }, + { + "epoch": 1.4203273040482343, + "grad_norm": 0.35777694077580857, + "learning_rate": 6.32010986269977e-05, + "loss": 2.7464, + "step": 30507 + }, + { + "epoch": 1.4203738622343274, + "grad_norm": 0.35063274816472073, + "learning_rate": 6.31984859860797e-05, + "loss": 2.7474, + "step": 30508 + }, + { + "epoch": 1.4204204204204205, + "grad_norm": 0.35729473738501133, + "learning_rate": 6.319587330642542e-05, + "loss": 2.763, + "step": 30509 + }, + { + "epoch": 1.4204669786065134, + "grad_norm": 0.35907265576208974, + "learning_rate": 6.319326058804258e-05, + "loss": 2.7188, + "step": 30510 + }, + { + "epoch": 1.4205135367926065, + "grad_norm": 0.3615952953558071, + "learning_rate": 6.31906478309388e-05, + "loss": 2.6154, + "step": 30511 + }, + { + "epoch": 1.4205600949786996, + "grad_norm": 0.35250016570944703, + "learning_rate": 6.318803503512178e-05, + "loss": 2.747, + "step": 30512 + }, + { + "epoch": 1.4206066531647927, + "grad_norm": 0.31570626113406175, + "learning_rate": 6.318542220059917e-05, + "loss": 2.7359, + "step": 30513 + }, + { + "epoch": 1.4206532113508858, + "grad_norm": 0.3717350147434919, + "learning_rate": 6.318280932737865e-05, + "loss": 2.713, + "step": 30514 + }, + { + "epoch": 1.420699769536979, + "grad_norm": 0.32970927044464643, + "learning_rate": 6.31801964154679e-05, + "loss": 2.6978, + "step": 30515 + }, + { + "epoch": 1.4207463277230719, + "grad_norm": 0.3472918249072508, + "learning_rate": 6.317758346487455e-05, + "loss": 2.6981, + "step": 30516 + }, + { + "epoch": 1.420792885909165, + "grad_norm": 0.35369194750073646, + "learning_rate": 6.31749704756063e-05, + "loss": 2.7355, + "step": 30517 + }, + { + "epoch": 1.420839444095258, + "grad_norm": 0.3361649669848029, + "learning_rate": 6.317235744767081e-05, + "loss": 2.7654, + "step": 30518 + }, + { + "epoch": 1.4208860022813512, + "grad_norm": 0.3683675101288166, + "learning_rate": 6.316974438107575e-05, + "loss": 2.7608, + "step": 30519 + }, + { + "epoch": 1.420932560467444, + "grad_norm": 0.3697596963141369, + "learning_rate": 6.316713127582879e-05, + "loss": 2.7135, + "step": 30520 + }, + { + "epoch": 1.4209791186535372, + "grad_norm": 0.350122571028279, + "learning_rate": 6.316451813193758e-05, + "loss": 2.7183, + "step": 30521 + }, + { + "epoch": 1.4210256768396303, + "grad_norm": 0.4153979879082717, + "learning_rate": 6.316190494940981e-05, + "loss": 2.7862, + "step": 30522 + }, + { + "epoch": 1.4210722350257234, + "grad_norm": 0.3433543116744578, + "learning_rate": 6.315929172825315e-05, + "loss": 2.6987, + "step": 30523 + }, + { + "epoch": 1.4211187932118166, + "grad_norm": 0.3802991992612278, + "learning_rate": 6.315667846847528e-05, + "loss": 2.7811, + "step": 30524 + }, + { + "epoch": 1.4211653513979097, + "grad_norm": 0.32284888903434783, + "learning_rate": 6.315406517008385e-05, + "loss": 2.5901, + "step": 30525 + }, + { + "epoch": 1.4212119095840026, + "grad_norm": 0.37655894027261216, + "learning_rate": 
6.315145183308651e-05, + "loss": 2.8086, + "step": 30526 + }, + { + "epoch": 1.4212584677700957, + "grad_norm": 0.3205078198432112, + "learning_rate": 6.314883845749096e-05, + "loss": 2.6914, + "step": 30527 + }, + { + "epoch": 1.4213050259561888, + "grad_norm": 0.3928023875054693, + "learning_rate": 6.314622504330487e-05, + "loss": 2.7519, + "step": 30528 + }, + { + "epoch": 1.4213515841422817, + "grad_norm": 0.33223227067253625, + "learning_rate": 6.314361159053592e-05, + "loss": 2.6801, + "step": 30529 + }, + { + "epoch": 1.4213981423283748, + "grad_norm": 0.3613368270417059, + "learning_rate": 6.314099809919176e-05, + "loss": 2.6982, + "step": 30530 + }, + { + "epoch": 1.421444700514468, + "grad_norm": 0.3523653982883749, + "learning_rate": 6.313838456928007e-05, + "loss": 2.736, + "step": 30531 + }, + { + "epoch": 1.421491258700561, + "grad_norm": 0.37009278009241287, + "learning_rate": 6.31357710008085e-05, + "loss": 2.7108, + "step": 30532 + }, + { + "epoch": 1.4215378168866541, + "grad_norm": 0.3593494646704688, + "learning_rate": 6.313315739378475e-05, + "loss": 2.6646, + "step": 30533 + }, + { + "epoch": 1.4215843750727473, + "grad_norm": 0.37229289698680235, + "learning_rate": 6.313054374821647e-05, + "loss": 2.7137, + "step": 30534 + }, + { + "epoch": 1.4216309332588402, + "grad_norm": 0.32761064054477085, + "learning_rate": 6.312793006411133e-05, + "loss": 2.7205, + "step": 30535 + }, + { + "epoch": 1.4216774914449333, + "grad_norm": 0.34519495905609, + "learning_rate": 6.312531634147701e-05, + "loss": 2.9717, + "step": 30536 + }, + { + "epoch": 1.4217240496310264, + "grad_norm": 0.34083022482329384, + "learning_rate": 6.312270258032117e-05, + "loss": 2.6496, + "step": 30537 + }, + { + "epoch": 1.4217706078171195, + "grad_norm": 0.334125186617526, + "learning_rate": 6.312008878065152e-05, + "loss": 2.7549, + "step": 30538 + }, + { + "epoch": 1.4218171660032124, + "grad_norm": 0.3164012961756035, + "learning_rate": 6.311747494247569e-05, + "loss": 2.703, + "step": 30539 + }, + { + "epoch": 1.4218637241893055, + "grad_norm": 0.33533412232610504, + "learning_rate": 6.311486106580136e-05, + "loss": 2.7157, + "step": 30540 + }, + { + "epoch": 1.4219102823753986, + "grad_norm": 0.3407606413410466, + "learning_rate": 6.31122471506362e-05, + "loss": 2.8503, + "step": 30541 + }, + { + "epoch": 1.4219568405614917, + "grad_norm": 0.3177711424017295, + "learning_rate": 6.310963319698789e-05, + "loss": 2.781, + "step": 30542 + }, + { + "epoch": 1.4220033987475849, + "grad_norm": 0.32352161197900414, + "learning_rate": 6.31070192048641e-05, + "loss": 2.739, + "step": 30543 + }, + { + "epoch": 1.422049956933678, + "grad_norm": 0.3410706039491845, + "learning_rate": 6.310440517427252e-05, + "loss": 2.7817, + "step": 30544 + }, + { + "epoch": 1.4220965151197709, + "grad_norm": 0.31666301625260185, + "learning_rate": 6.310179110522077e-05, + "loss": 2.7392, + "step": 30545 + }, + { + "epoch": 1.422143073305864, + "grad_norm": 0.34289713495993074, + "learning_rate": 6.309917699771657e-05, + "loss": 2.8767, + "step": 30546 + }, + { + "epoch": 1.422189631491957, + "grad_norm": 0.33292398227868986, + "learning_rate": 6.309656285176757e-05, + "loss": 2.6712, + "step": 30547 + }, + { + "epoch": 1.4222361896780502, + "grad_norm": 0.329450901854685, + "learning_rate": 6.309394866738145e-05, + "loss": 2.7699, + "step": 30548 + }, + { + "epoch": 1.422282747864143, + "grad_norm": 0.32791908618331717, + "learning_rate": 6.309133444456587e-05, + "loss": 2.7583, + "step": 30549 + }, + { + "epoch": 
1.4223293060502362, + "grad_norm": 0.33115260326925466, + "learning_rate": 6.308872018332852e-05, + "loss": 2.8423, + "step": 30550 + }, + { + "epoch": 1.4223758642363293, + "grad_norm": 0.32570336465709326, + "learning_rate": 6.308610588367708e-05, + "loss": 2.7222, + "step": 30551 + }, + { + "epoch": 1.4224224224224224, + "grad_norm": 0.31877396115367723, + "learning_rate": 6.30834915456192e-05, + "loss": 2.6849, + "step": 30552 + }, + { + "epoch": 1.4224689806085156, + "grad_norm": 0.30937454676233606, + "learning_rate": 6.308087716916255e-05, + "loss": 2.7748, + "step": 30553 + }, + { + "epoch": 1.4225155387946087, + "grad_norm": 0.3633326381164088, + "learning_rate": 6.307826275431481e-05, + "loss": 2.762, + "step": 30554 + }, + { + "epoch": 1.4225620969807016, + "grad_norm": 0.31510053003358, + "learning_rate": 6.307564830108369e-05, + "loss": 2.7497, + "step": 30555 + }, + { + "epoch": 1.4226086551667947, + "grad_norm": 0.3807680113412287, + "learning_rate": 6.307303380947681e-05, + "loss": 2.8486, + "step": 30556 + }, + { + "epoch": 1.4226552133528878, + "grad_norm": 0.3664972141301171, + "learning_rate": 6.307041927950187e-05, + "loss": 2.7189, + "step": 30557 + }, + { + "epoch": 1.422701771538981, + "grad_norm": 0.3478764688835054, + "learning_rate": 6.306780471116653e-05, + "loss": 2.883, + "step": 30558 + }, + { + "epoch": 1.4227483297250738, + "grad_norm": 0.3868212497283302, + "learning_rate": 6.306519010447847e-05, + "loss": 2.8158, + "step": 30559 + }, + { + "epoch": 1.422794887911167, + "grad_norm": 0.35863840902146515, + "learning_rate": 6.306257545944537e-05, + "loss": 2.6861, + "step": 30560 + }, + { + "epoch": 1.42284144609726, + "grad_norm": 0.37562939274237894, + "learning_rate": 6.305996077607489e-05, + "loss": 2.7948, + "step": 30561 + }, + { + "epoch": 1.4228880042833532, + "grad_norm": 0.34932275200576046, + "learning_rate": 6.305734605437471e-05, + "loss": 2.7679, + "step": 30562 + }, + { + "epoch": 1.4229345624694463, + "grad_norm": 0.34210499682778545, + "learning_rate": 6.30547312943525e-05, + "loss": 2.767, + "step": 30563 + }, + { + "epoch": 1.4229811206555394, + "grad_norm": 0.3463533459861164, + "learning_rate": 6.305211649601595e-05, + "loss": 2.8209, + "step": 30564 + }, + { + "epoch": 1.4230276788416323, + "grad_norm": 0.33263004479175473, + "learning_rate": 6.304950165937275e-05, + "loss": 2.6344, + "step": 30565 + }, + { + "epoch": 1.4230742370277254, + "grad_norm": 0.3678968630560199, + "learning_rate": 6.304688678443052e-05, + "loss": 2.803, + "step": 30566 + }, + { + "epoch": 1.4231207952138185, + "grad_norm": 0.33233600318689915, + "learning_rate": 6.304427187119696e-05, + "loss": 2.6566, + "step": 30567 + }, + { + "epoch": 1.4231673533999114, + "grad_norm": 0.3419167256381195, + "learning_rate": 6.304165691967975e-05, + "loss": 2.7544, + "step": 30568 + }, + { + "epoch": 1.4232139115860045, + "grad_norm": 0.3344280662443453, + "learning_rate": 6.303904192988657e-05, + "loss": 2.7316, + "step": 30569 + }, + { + "epoch": 1.4232604697720976, + "grad_norm": 0.33991819607577667, + "learning_rate": 6.303642690182509e-05, + "loss": 2.7812, + "step": 30570 + }, + { + "epoch": 1.4233070279581908, + "grad_norm": 0.31019781986410155, + "learning_rate": 6.303381183550296e-05, + "loss": 2.7828, + "step": 30571 + }, + { + "epoch": 1.4233535861442839, + "grad_norm": 0.368317155742618, + "learning_rate": 6.30311967309279e-05, + "loss": 2.7373, + "step": 30572 + }, + { + "epoch": 1.423400144330377, + "grad_norm": 0.3280385113855335, + "learning_rate": 
6.302858158810754e-05, + "loss": 2.701, + "step": 30573 + }, + { + "epoch": 1.42344670251647, + "grad_norm": 0.37719587142927713, + "learning_rate": 6.30259664070496e-05, + "loss": 2.7886, + "step": 30574 + }, + { + "epoch": 1.423493260702563, + "grad_norm": 0.33966369962716375, + "learning_rate": 6.30233511877617e-05, + "loss": 2.7413, + "step": 30575 + }, + { + "epoch": 1.423539818888656, + "grad_norm": 0.33475245006720195, + "learning_rate": 6.302073593025157e-05, + "loss": 2.8214, + "step": 30576 + }, + { + "epoch": 1.4235863770747492, + "grad_norm": 0.3297361069793992, + "learning_rate": 6.301812063452686e-05, + "loss": 2.7273, + "step": 30577 + }, + { + "epoch": 1.4236329352608421, + "grad_norm": 0.3494161245198452, + "learning_rate": 6.301550530059526e-05, + "loss": 2.7502, + "step": 30578 + }, + { + "epoch": 1.4236794934469352, + "grad_norm": 0.3374102834032504, + "learning_rate": 6.301288992846442e-05, + "loss": 2.6851, + "step": 30579 + }, + { + "epoch": 1.4237260516330283, + "grad_norm": 0.35406360252625313, + "learning_rate": 6.301027451814202e-05, + "loss": 2.7222, + "step": 30580 + }, + { + "epoch": 1.4237726098191215, + "grad_norm": 0.3663978400430956, + "learning_rate": 6.300765906963577e-05, + "loss": 2.824, + "step": 30581 + }, + { + "epoch": 1.4238191680052146, + "grad_norm": 0.3521029772865637, + "learning_rate": 6.300504358295331e-05, + "loss": 2.7338, + "step": 30582 + }, + { + "epoch": 1.4238657261913077, + "grad_norm": 0.35236778890386367, + "learning_rate": 6.300242805810233e-05, + "loss": 2.7994, + "step": 30583 + }, + { + "epoch": 1.4239122843774006, + "grad_norm": 0.3343973068924094, + "learning_rate": 6.299981249509049e-05, + "loss": 2.7455, + "step": 30584 + }, + { + "epoch": 1.4239588425634937, + "grad_norm": 0.3279044719609178, + "learning_rate": 6.299719689392549e-05, + "loss": 2.8131, + "step": 30585 + }, + { + "epoch": 1.4240054007495868, + "grad_norm": 0.31233556593476897, + "learning_rate": 6.299458125461501e-05, + "loss": 2.7529, + "step": 30586 + }, + { + "epoch": 1.42405195893568, + "grad_norm": 0.33858458481607845, + "learning_rate": 6.29919655771667e-05, + "loss": 2.7921, + "step": 30587 + }, + { + "epoch": 1.4240985171217728, + "grad_norm": 0.31311287244037156, + "learning_rate": 6.298934986158827e-05, + "loss": 2.6663, + "step": 30588 + }, + { + "epoch": 1.424145075307866, + "grad_norm": 0.33269590656906883, + "learning_rate": 6.298673410788736e-05, + "loss": 2.7441, + "step": 30589 + }, + { + "epoch": 1.424191633493959, + "grad_norm": 0.35141318611285194, + "learning_rate": 6.298411831607167e-05, + "loss": 2.7351, + "step": 30590 + }, + { + "epoch": 1.4242381916800522, + "grad_norm": 0.33961751602117324, + "learning_rate": 6.298150248614885e-05, + "loss": 2.8526, + "step": 30591 + }, + { + "epoch": 1.4242847498661453, + "grad_norm": 0.36580995865987015, + "learning_rate": 6.297888661812663e-05, + "loss": 2.6743, + "step": 30592 + }, + { + "epoch": 1.4243313080522384, + "grad_norm": 0.35274358679766826, + "learning_rate": 6.297627071201264e-05, + "loss": 2.82, + "step": 30593 + }, + { + "epoch": 1.4243778662383313, + "grad_norm": 0.36695237721839985, + "learning_rate": 6.297365476781459e-05, + "loss": 2.6327, + "step": 30594 + }, + { + "epoch": 1.4244244244244244, + "grad_norm": 0.34505755592276627, + "learning_rate": 6.297103878554013e-05, + "loss": 2.7126, + "step": 30595 + }, + { + "epoch": 1.4244709826105175, + "grad_norm": 0.3551556649336527, + "learning_rate": 6.296842276519697e-05, + "loss": 2.7475, + "step": 30596 + }, + { + "epoch": 
1.4245175407966106, + "grad_norm": 0.3573532776174108, + "learning_rate": 6.296580670679273e-05, + "loss": 2.7026, + "step": 30597 + }, + { + "epoch": 1.4245640989827035, + "grad_norm": 0.3591167003183342, + "learning_rate": 6.296319061033515e-05, + "loss": 2.7915, + "step": 30598 + }, + { + "epoch": 1.4246106571687966, + "grad_norm": 0.35906231958830576, + "learning_rate": 6.296057447583188e-05, + "loss": 2.8201, + "step": 30599 + }, + { + "epoch": 1.4246572153548898, + "grad_norm": 0.36356540078307037, + "learning_rate": 6.295795830329058e-05, + "loss": 2.7521, + "step": 30600 + }, + { + "epoch": 1.4247037735409829, + "grad_norm": 0.3257990530782601, + "learning_rate": 6.295534209271898e-05, + "loss": 2.7676, + "step": 30601 + }, + { + "epoch": 1.424750331727076, + "grad_norm": 0.35670711146371026, + "learning_rate": 6.29527258441247e-05, + "loss": 2.7574, + "step": 30602 + }, + { + "epoch": 1.424796889913169, + "grad_norm": 0.3219214996145641, + "learning_rate": 6.295010955751546e-05, + "loss": 2.7028, + "step": 30603 + }, + { + "epoch": 1.424843448099262, + "grad_norm": 0.33702741424006205, + "learning_rate": 6.294749323289892e-05, + "loss": 2.7663, + "step": 30604 + }, + { + "epoch": 1.4248900062853551, + "grad_norm": 0.3138413798186376, + "learning_rate": 6.294487687028276e-05, + "loss": 2.6397, + "step": 30605 + }, + { + "epoch": 1.4249365644714482, + "grad_norm": 0.3318030827768893, + "learning_rate": 6.294226046967466e-05, + "loss": 2.7161, + "step": 30606 + }, + { + "epoch": 1.4249831226575411, + "grad_norm": 0.3245281233525372, + "learning_rate": 6.293964403108232e-05, + "loss": 2.7121, + "step": 30607 + }, + { + "epoch": 1.4250296808436342, + "grad_norm": 0.32391763226538733, + "learning_rate": 6.293702755451338e-05, + "loss": 2.6987, + "step": 30608 + }, + { + "epoch": 1.4250762390297274, + "grad_norm": 0.30898131337371626, + "learning_rate": 6.293441103997555e-05, + "loss": 2.8163, + "step": 30609 + }, + { + "epoch": 1.4251227972158205, + "grad_norm": 0.34650644884051485, + "learning_rate": 6.29317944874765e-05, + "loss": 2.792, + "step": 30610 + }, + { + "epoch": 1.4251693554019136, + "grad_norm": 0.3190826139118084, + "learning_rate": 6.29291778970239e-05, + "loss": 2.7688, + "step": 30611 + }, + { + "epoch": 1.4252159135880067, + "grad_norm": 0.3799530196149253, + "learning_rate": 6.292656126862542e-05, + "loss": 2.7823, + "step": 30612 + }, + { + "epoch": 1.4252624717740998, + "grad_norm": 0.33556251345250354, + "learning_rate": 6.292394460228877e-05, + "loss": 2.7911, + "step": 30613 + }, + { + "epoch": 1.4253090299601927, + "grad_norm": 0.33551503640236685, + "learning_rate": 6.292132789802162e-05, + "loss": 2.7272, + "step": 30614 + }, + { + "epoch": 1.4253555881462858, + "grad_norm": 0.3262904797263534, + "learning_rate": 6.291871115583166e-05, + "loss": 2.7809, + "step": 30615 + }, + { + "epoch": 1.425402146332379, + "grad_norm": 0.33563399430891744, + "learning_rate": 6.291609437572653e-05, + "loss": 2.683, + "step": 30616 + }, + { + "epoch": 1.4254487045184718, + "grad_norm": 0.3111933067402593, + "learning_rate": 6.291347755771394e-05, + "loss": 2.6991, + "step": 30617 + }, + { + "epoch": 1.425495262704565, + "grad_norm": 0.34146157437740754, + "learning_rate": 6.291086070180157e-05, + "loss": 2.6814, + "step": 30618 + }, + { + "epoch": 1.425541820890658, + "grad_norm": 0.31746214088336, + "learning_rate": 6.290824380799711e-05, + "loss": 2.7782, + "step": 30619 + }, + { + "epoch": 1.4255883790767512, + "grad_norm": 0.3097612916066916, + "learning_rate": 
6.290562687630821e-05, + "loss": 2.7676, + "step": 30620 + }, + { + "epoch": 1.4256349372628443, + "grad_norm": 0.324949970438849, + "learning_rate": 6.290300990674256e-05, + "loss": 2.6579, + "step": 30621 + }, + { + "epoch": 1.4256814954489374, + "grad_norm": 0.3055719660113905, + "learning_rate": 6.290039289930785e-05, + "loss": 2.8094, + "step": 30622 + }, + { + "epoch": 1.4257280536350303, + "grad_norm": 0.3521672346936862, + "learning_rate": 6.289777585401178e-05, + "loss": 2.7634, + "step": 30623 + }, + { + "epoch": 1.4257746118211234, + "grad_norm": 0.30243864456312475, + "learning_rate": 6.289515877086199e-05, + "loss": 2.7872, + "step": 30624 + }, + { + "epoch": 1.4258211700072165, + "grad_norm": 0.3391152726394294, + "learning_rate": 6.289254164986617e-05, + "loss": 2.7268, + "step": 30625 + }, + { + "epoch": 1.4258677281933096, + "grad_norm": 0.3342566885499592, + "learning_rate": 6.288992449103203e-05, + "loss": 2.8439, + "step": 30626 + }, + { + "epoch": 1.4259142863794025, + "grad_norm": 0.33421193999224286, + "learning_rate": 6.288730729436722e-05, + "loss": 2.7502, + "step": 30627 + }, + { + "epoch": 1.4259608445654957, + "grad_norm": 0.28602694386332206, + "learning_rate": 6.288469005987945e-05, + "loss": 2.6543, + "step": 30628 + }, + { + "epoch": 1.4260074027515888, + "grad_norm": 0.3346433121969652, + "learning_rate": 6.288207278757637e-05, + "loss": 2.7688, + "step": 30629 + }, + { + "epoch": 1.4260539609376819, + "grad_norm": 0.33030979961512286, + "learning_rate": 6.287945547746566e-05, + "loss": 2.7005, + "step": 30630 + }, + { + "epoch": 1.426100519123775, + "grad_norm": 0.3281779180407518, + "learning_rate": 6.287683812955503e-05, + "loss": 2.7807, + "step": 30631 + }, + { + "epoch": 1.4261470773098681, + "grad_norm": 0.34043387600976155, + "learning_rate": 6.287422074385216e-05, + "loss": 2.6565, + "step": 30632 + }, + { + "epoch": 1.426193635495961, + "grad_norm": 0.32463572398639895, + "learning_rate": 6.287160332036472e-05, + "loss": 2.7629, + "step": 30633 + }, + { + "epoch": 1.4262401936820541, + "grad_norm": 0.3714858826924725, + "learning_rate": 6.286898585910038e-05, + "loss": 2.7531, + "step": 30634 + }, + { + "epoch": 1.4262867518681472, + "grad_norm": 0.3241693401279574, + "learning_rate": 6.286636836006683e-05, + "loss": 2.7436, + "step": 30635 + }, + { + "epoch": 1.4263333100542404, + "grad_norm": 0.3545465442359897, + "learning_rate": 6.286375082327176e-05, + "loss": 2.7538, + "step": 30636 + }, + { + "epoch": 1.4263798682403332, + "grad_norm": 0.3499397625579753, + "learning_rate": 6.286113324872287e-05, + "loss": 2.8148, + "step": 30637 + }, + { + "epoch": 1.4264264264264264, + "grad_norm": 0.33459322365774413, + "learning_rate": 6.285851563642779e-05, + "loss": 2.7252, + "step": 30638 + }, + { + "epoch": 1.4264729846125195, + "grad_norm": 0.3439996191101324, + "learning_rate": 6.285589798639425e-05, + "loss": 2.6367, + "step": 30639 + }, + { + "epoch": 1.4265195427986126, + "grad_norm": 0.3348219358861261, + "learning_rate": 6.285328029862992e-05, + "loss": 2.7212, + "step": 30640 + }, + { + "epoch": 1.4265661009847057, + "grad_norm": 0.3326542512899405, + "learning_rate": 6.285066257314247e-05, + "loss": 2.7485, + "step": 30641 + }, + { + "epoch": 1.4266126591707988, + "grad_norm": 0.34036617468005914, + "learning_rate": 6.28480448099396e-05, + "loss": 2.7586, + "step": 30642 + }, + { + "epoch": 1.4266592173568917, + "grad_norm": 0.3485797264788136, + "learning_rate": 6.284542700902896e-05, + "loss": 2.9076, + "step": 30643 + }, + { + "epoch": 
1.4267057755429848, + "grad_norm": 0.34966356674495913, + "learning_rate": 6.284280917041828e-05, + "loss": 2.7048, + "step": 30644 + }, + { + "epoch": 1.426752333729078, + "grad_norm": 0.34637175028639106, + "learning_rate": 6.284019129411522e-05, + "loss": 2.7328, + "step": 30645 + }, + { + "epoch": 1.426798891915171, + "grad_norm": 0.3358638286566457, + "learning_rate": 6.283757338012747e-05, + "loss": 2.8201, + "step": 30646 + }, + { + "epoch": 1.426845450101264, + "grad_norm": 0.32691522734135847, + "learning_rate": 6.283495542846269e-05, + "loss": 2.8113, + "step": 30647 + }, + { + "epoch": 1.426892008287357, + "grad_norm": 0.3397333872235265, + "learning_rate": 6.283233743912858e-05, + "loss": 2.8199, + "step": 30648 + }, + { + "epoch": 1.4269385664734502, + "grad_norm": 0.33375327163435603, + "learning_rate": 6.282971941213284e-05, + "loss": 2.754, + "step": 30649 + }, + { + "epoch": 1.4269851246595433, + "grad_norm": 0.33717530244072536, + "learning_rate": 6.282710134748311e-05, + "loss": 2.7639, + "step": 30650 + }, + { + "epoch": 1.4270316828456364, + "grad_norm": 0.32564241220862916, + "learning_rate": 6.282448324518713e-05, + "loss": 2.7247, + "step": 30651 + }, + { + "epoch": 1.4270782410317295, + "grad_norm": 0.3198923241543243, + "learning_rate": 6.282186510525254e-05, + "loss": 2.8756, + "step": 30652 + }, + { + "epoch": 1.4271247992178224, + "grad_norm": 0.32238708347377043, + "learning_rate": 6.281924692768703e-05, + "loss": 2.6306, + "step": 30653 + }, + { + "epoch": 1.4271713574039155, + "grad_norm": 0.3355773156828839, + "learning_rate": 6.281662871249831e-05, + "loss": 2.7692, + "step": 30654 + }, + { + "epoch": 1.4272179155900087, + "grad_norm": 0.33607057609045826, + "learning_rate": 6.281401045969406e-05, + "loss": 2.7985, + "step": 30655 + }, + { + "epoch": 1.4272644737761015, + "grad_norm": 0.387032906150411, + "learning_rate": 6.281139216928193e-05, + "loss": 2.6107, + "step": 30656 + }, + { + "epoch": 1.4273110319621947, + "grad_norm": 0.3249120644168771, + "learning_rate": 6.280877384126963e-05, + "loss": 2.6551, + "step": 30657 + }, + { + "epoch": 1.4273575901482878, + "grad_norm": 0.3601131859506641, + "learning_rate": 6.280615547566484e-05, + "loss": 2.645, + "step": 30658 + }, + { + "epoch": 1.427404148334381, + "grad_norm": 0.370452480235282, + "learning_rate": 6.280353707247525e-05, + "loss": 2.7546, + "step": 30659 + }, + { + "epoch": 1.427450706520474, + "grad_norm": 0.3304661135075192, + "learning_rate": 6.280091863170854e-05, + "loss": 2.6695, + "step": 30660 + }, + { + "epoch": 1.4274972647065671, + "grad_norm": 0.3234011998835864, + "learning_rate": 6.279830015337239e-05, + "loss": 2.6836, + "step": 30661 + }, + { + "epoch": 1.42754382289266, + "grad_norm": 0.38398129305046586, + "learning_rate": 6.279568163747449e-05, + "loss": 2.717, + "step": 30662 + }, + { + "epoch": 1.4275903810787531, + "grad_norm": 0.3323640688224806, + "learning_rate": 6.279306308402252e-05, + "loss": 2.7851, + "step": 30663 + }, + { + "epoch": 1.4276369392648462, + "grad_norm": 0.3703283190053917, + "learning_rate": 6.279044449302419e-05, + "loss": 2.7973, + "step": 30664 + }, + { + "epoch": 1.4276834974509394, + "grad_norm": 0.3351574004597174, + "learning_rate": 6.278782586448714e-05, + "loss": 2.7416, + "step": 30665 + }, + { + "epoch": 1.4277300556370323, + "grad_norm": 0.34986007421713067, + "learning_rate": 6.27852071984191e-05, + "loss": 2.7257, + "step": 30666 + }, + { + "epoch": 1.4277766138231254, + "grad_norm": 0.31150825765567913, + "learning_rate": 
6.278258849482772e-05, + "loss": 2.6878, + "step": 30667 + }, + { + "epoch": 1.4278231720092185, + "grad_norm": 0.33831418719857814, + "learning_rate": 6.277996975372072e-05, + "loss": 2.7597, + "step": 30668 + }, + { + "epoch": 1.4278697301953116, + "grad_norm": 0.33852611063068605, + "learning_rate": 6.277735097510576e-05, + "loss": 2.6735, + "step": 30669 + }, + { + "epoch": 1.4279162883814047, + "grad_norm": 0.3331794168446133, + "learning_rate": 6.277473215899053e-05, + "loss": 2.7848, + "step": 30670 + }, + { + "epoch": 1.4279628465674978, + "grad_norm": 0.33342293791635363, + "learning_rate": 6.277211330538272e-05, + "loss": 2.7842, + "step": 30671 + }, + { + "epoch": 1.4280094047535907, + "grad_norm": 0.34624971682423794, + "learning_rate": 6.276949441429003e-05, + "loss": 2.8716, + "step": 30672 + }, + { + "epoch": 1.4280559629396838, + "grad_norm": 0.3569922237652282, + "learning_rate": 6.276687548572014e-05, + "loss": 2.6374, + "step": 30673 + }, + { + "epoch": 1.428102521125777, + "grad_norm": 0.3226404980275427, + "learning_rate": 6.27642565196807e-05, + "loss": 2.6411, + "step": 30674 + }, + { + "epoch": 1.42814907931187, + "grad_norm": 0.36636835782961996, + "learning_rate": 6.276163751617943e-05, + "loss": 2.6375, + "step": 30675 + }, + { + "epoch": 1.428195637497963, + "grad_norm": 0.3281997016337125, + "learning_rate": 6.275901847522401e-05, + "loss": 2.7313, + "step": 30676 + }, + { + "epoch": 1.428242195684056, + "grad_norm": 0.33541801867537135, + "learning_rate": 6.275639939682211e-05, + "loss": 2.8346, + "step": 30677 + }, + { + "epoch": 1.4282887538701492, + "grad_norm": 0.36090180605147787, + "learning_rate": 6.275378028098148e-05, + "loss": 2.8026, + "step": 30678 + }, + { + "epoch": 1.4283353120562423, + "grad_norm": 0.35223090534414014, + "learning_rate": 6.275116112770974e-05, + "loss": 2.8576, + "step": 30679 + }, + { + "epoch": 1.4283818702423354, + "grad_norm": 0.36338432337642396, + "learning_rate": 6.274854193701458e-05, + "loss": 2.7797, + "step": 30680 + }, + { + "epoch": 1.4284284284284285, + "grad_norm": 0.3460249796603782, + "learning_rate": 6.274592270890373e-05, + "loss": 2.7013, + "step": 30681 + }, + { + "epoch": 1.4284749866145214, + "grad_norm": 0.31507321613767214, + "learning_rate": 6.274330344338485e-05, + "loss": 2.7445, + "step": 30682 + }, + { + "epoch": 1.4285215448006146, + "grad_norm": 0.3206011581668105, + "learning_rate": 6.27406841404656e-05, + "loss": 2.7293, + "step": 30683 + }, + { + "epoch": 1.4285681029867077, + "grad_norm": 0.3610945268424643, + "learning_rate": 6.273806480015374e-05, + "loss": 2.7807, + "step": 30684 + }, + { + "epoch": 1.4286146611728008, + "grad_norm": 0.3442340835882511, + "learning_rate": 6.27354454224569e-05, + "loss": 2.704, + "step": 30685 + }, + { + "epoch": 1.4286612193588937, + "grad_norm": 0.31427209088644253, + "learning_rate": 6.273282600738277e-05, + "loss": 2.7953, + "step": 30686 + }, + { + "epoch": 1.4287077775449868, + "grad_norm": 0.32092570797749076, + "learning_rate": 6.273020655493906e-05, + "loss": 2.8163, + "step": 30687 + }, + { + "epoch": 1.42875433573108, + "grad_norm": 0.33950987164987584, + "learning_rate": 6.272758706513346e-05, + "loss": 2.9134, + "step": 30688 + }, + { + "epoch": 1.428800893917173, + "grad_norm": 0.3265346645907604, + "learning_rate": 6.272496753797363e-05, + "loss": 2.7314, + "step": 30689 + }, + { + "epoch": 1.4288474521032661, + "grad_norm": 0.34226284673128254, + "learning_rate": 6.272234797346726e-05, + "loss": 2.6609, + "step": 30690 + }, + { + "epoch": 
1.4288940102893593, + "grad_norm": 0.3326510519361366, + "learning_rate": 6.271972837162208e-05, + "loss": 2.7834, + "step": 30691 + }, + { + "epoch": 1.4289405684754521, + "grad_norm": 0.3669691521198066, + "learning_rate": 6.271710873244574e-05, + "loss": 2.7628, + "step": 30692 + }, + { + "epoch": 1.4289871266615453, + "grad_norm": 0.3387427498266699, + "learning_rate": 6.271448905594592e-05, + "loss": 2.7047, + "step": 30693 + }, + { + "epoch": 1.4290336848476384, + "grad_norm": 0.3554022134642956, + "learning_rate": 6.271186934213036e-05, + "loss": 2.7493, + "step": 30694 + }, + { + "epoch": 1.4290802430337313, + "grad_norm": 0.34853504161439414, + "learning_rate": 6.27092495910067e-05, + "loss": 2.7084, + "step": 30695 + }, + { + "epoch": 1.4291268012198244, + "grad_norm": 0.33742936592803485, + "learning_rate": 6.270662980258264e-05, + "loss": 2.6993, + "step": 30696 + }, + { + "epoch": 1.4291733594059175, + "grad_norm": 0.35519354106016887, + "learning_rate": 6.270400997686588e-05, + "loss": 2.7687, + "step": 30697 + }, + { + "epoch": 1.4292199175920106, + "grad_norm": 0.3566221099977008, + "learning_rate": 6.27013901138641e-05, + "loss": 2.6862, + "step": 30698 + }, + { + "epoch": 1.4292664757781037, + "grad_norm": 0.3376267917175338, + "learning_rate": 6.269877021358498e-05, + "loss": 2.7579, + "step": 30699 + }, + { + "epoch": 1.4293130339641968, + "grad_norm": 0.3855028754133514, + "learning_rate": 6.269615027603625e-05, + "loss": 2.8139, + "step": 30700 + }, + { + "epoch": 1.42935959215029, + "grad_norm": 0.3182136431542198, + "learning_rate": 6.269353030122556e-05, + "loss": 2.7153, + "step": 30701 + }, + { + "epoch": 1.4294061503363829, + "grad_norm": 0.3493034445638602, + "learning_rate": 6.269091028916059e-05, + "loss": 2.7429, + "step": 30702 + }, + { + "epoch": 1.429452708522476, + "grad_norm": 0.3477852572084105, + "learning_rate": 6.268829023984906e-05, + "loss": 2.7254, + "step": 30703 + }, + { + "epoch": 1.429499266708569, + "grad_norm": 0.3436299098137142, + "learning_rate": 6.268567015329863e-05, + "loss": 2.7606, + "step": 30704 + }, + { + "epoch": 1.429545824894662, + "grad_norm": 0.33299490060785786, + "learning_rate": 6.268305002951704e-05, + "loss": 2.8298, + "step": 30705 + }, + { + "epoch": 1.429592383080755, + "grad_norm": 0.3215369837370924, + "learning_rate": 6.268042986851192e-05, + "loss": 2.8499, + "step": 30706 + }, + { + "epoch": 1.4296389412668482, + "grad_norm": 0.3378000793540269, + "learning_rate": 6.267780967029101e-05, + "loss": 2.7294, + "step": 30707 + }, + { + "epoch": 1.4296854994529413, + "grad_norm": 0.3356210189802683, + "learning_rate": 6.267518943486197e-05, + "loss": 2.6793, + "step": 30708 + }, + { + "epoch": 1.4297320576390344, + "grad_norm": 0.3850176852098397, + "learning_rate": 6.26725691622325e-05, + "loss": 2.8228, + "step": 30709 + }, + { + "epoch": 1.4297786158251276, + "grad_norm": 0.391176388762258, + "learning_rate": 6.266994885241029e-05, + "loss": 2.7147, + "step": 30710 + }, + { + "epoch": 1.4298251740112204, + "grad_norm": 0.37847542880530854, + "learning_rate": 6.266732850540302e-05, + "loss": 2.7761, + "step": 30711 + }, + { + "epoch": 1.4298717321973136, + "grad_norm": 0.3390615267130555, + "learning_rate": 6.266470812121838e-05, + "loss": 2.7511, + "step": 30712 + }, + { + "epoch": 1.4299182903834067, + "grad_norm": 0.3872873398946267, + "learning_rate": 6.266208769986409e-05, + "loss": 2.7157, + "step": 30713 + }, + { + "epoch": 1.4299648485694998, + "grad_norm": 0.32900889703776487, + "learning_rate": 
6.265946724134781e-05, + "loss": 2.7189, + "step": 30714 + }, + { + "epoch": 1.4300114067555927, + "grad_norm": 0.34478287917644096, + "learning_rate": 6.265684674567724e-05, + "loss": 2.6277, + "step": 30715 + }, + { + "epoch": 1.4300579649416858, + "grad_norm": 0.3291021476105801, + "learning_rate": 6.265422621286006e-05, + "loss": 2.8193, + "step": 30716 + }, + { + "epoch": 1.430104523127779, + "grad_norm": 0.33809892142415343, + "learning_rate": 6.2651605642904e-05, + "loss": 2.7986, + "step": 30717 + }, + { + "epoch": 1.430151081313872, + "grad_norm": 0.33868125047408437, + "learning_rate": 6.264898503581671e-05, + "loss": 2.6576, + "step": 30718 + }, + { + "epoch": 1.4301976394999651, + "grad_norm": 0.316342486028887, + "learning_rate": 6.264636439160589e-05, + "loss": 2.9323, + "step": 30719 + }, + { + "epoch": 1.4302441976860583, + "grad_norm": 0.36034956416095926, + "learning_rate": 6.264374371027924e-05, + "loss": 2.8089, + "step": 30720 + }, + { + "epoch": 1.4302907558721512, + "grad_norm": 0.34707570942535976, + "learning_rate": 6.264112299184446e-05, + "loss": 2.7123, + "step": 30721 + }, + { + "epoch": 1.4303373140582443, + "grad_norm": 0.33637274960702773, + "learning_rate": 6.26385022363092e-05, + "loss": 2.7738, + "step": 30722 + }, + { + "epoch": 1.4303838722443374, + "grad_norm": 0.36580415462088317, + "learning_rate": 6.263588144368123e-05, + "loss": 2.8307, + "step": 30723 + }, + { + "epoch": 1.4304304304304305, + "grad_norm": 0.3782939855975085, + "learning_rate": 6.263326061396814e-05, + "loss": 2.7712, + "step": 30724 + }, + { + "epoch": 1.4304769886165234, + "grad_norm": 0.32957056053280126, + "learning_rate": 6.263063974717769e-05, + "loss": 2.7566, + "step": 30725 + }, + { + "epoch": 1.4305235468026165, + "grad_norm": 0.37950939816360646, + "learning_rate": 6.262801884331757e-05, + "loss": 2.7251, + "step": 30726 + }, + { + "epoch": 1.4305701049887096, + "grad_norm": 0.367539195197047, + "learning_rate": 6.262539790239546e-05, + "loss": 2.7673, + "step": 30727 + }, + { + "epoch": 1.4306166631748027, + "grad_norm": 0.34974544498358473, + "learning_rate": 6.262277692441903e-05, + "loss": 2.6962, + "step": 30728 + }, + { + "epoch": 1.4306632213608959, + "grad_norm": 0.36063676362262614, + "learning_rate": 6.262015590939601e-05, + "loss": 2.7697, + "step": 30729 + }, + { + "epoch": 1.430709779546989, + "grad_norm": 0.3442043963673319, + "learning_rate": 6.261753485733407e-05, + "loss": 2.702, + "step": 30730 + }, + { + "epoch": 1.4307563377330819, + "grad_norm": 0.3616548020305251, + "learning_rate": 6.261491376824092e-05, + "loss": 2.8434, + "step": 30731 + }, + { + "epoch": 1.430802895919175, + "grad_norm": 0.3416449745827904, + "learning_rate": 6.261229264212423e-05, + "loss": 2.8233, + "step": 30732 + }, + { + "epoch": 1.430849454105268, + "grad_norm": 0.39424048484076984, + "learning_rate": 6.26096714789917e-05, + "loss": 2.7044, + "step": 30733 + }, + { + "epoch": 1.4308960122913612, + "grad_norm": 0.38044896666917805, + "learning_rate": 6.260705027885103e-05, + "loss": 2.7585, + "step": 30734 + }, + { + "epoch": 1.430942570477454, + "grad_norm": 0.3524515436269723, + "learning_rate": 6.260442904170991e-05, + "loss": 2.7545, + "step": 30735 + }, + { + "epoch": 1.4309891286635472, + "grad_norm": 0.3814868912419372, + "learning_rate": 6.260180776757605e-05, + "loss": 2.8152, + "step": 30736 + }, + { + "epoch": 1.4310356868496403, + "grad_norm": 0.32504831365479314, + "learning_rate": 6.25991864564571e-05, + "loss": 2.7387, + "step": 30737 + }, + { + "epoch": 
1.4310822450357334, + "grad_norm": 0.3688641093339854, + "learning_rate": 6.259656510836078e-05, + "loss": 2.7136, + "step": 30738 + }, + { + "epoch": 1.4311288032218266, + "grad_norm": 0.34433047254540394, + "learning_rate": 6.259394372329478e-05, + "loss": 2.7517, + "step": 30739 + }, + { + "epoch": 1.4311753614079197, + "grad_norm": 0.3607583754586333, + "learning_rate": 6.259132230126681e-05, + "loss": 2.8397, + "step": 30740 + }, + { + "epoch": 1.4312219195940126, + "grad_norm": 0.3457539085962482, + "learning_rate": 6.258870084228455e-05, + "loss": 2.7441, + "step": 30741 + }, + { + "epoch": 1.4312684777801057, + "grad_norm": 0.37282976812427243, + "learning_rate": 6.25860793463557e-05, + "loss": 2.7411, + "step": 30742 + }, + { + "epoch": 1.4313150359661988, + "grad_norm": 0.33182883645287997, + "learning_rate": 6.258345781348792e-05, + "loss": 2.7234, + "step": 30743 + }, + { + "epoch": 1.4313615941522917, + "grad_norm": 0.3818475329393688, + "learning_rate": 6.258083624368895e-05, + "loss": 2.7688, + "step": 30744 + }, + { + "epoch": 1.4314081523383848, + "grad_norm": 0.29823635638601204, + "learning_rate": 6.257821463696647e-05, + "loss": 2.8723, + "step": 30745 + }, + { + "epoch": 1.431454710524478, + "grad_norm": 0.3293836677199085, + "learning_rate": 6.257559299332816e-05, + "loss": 2.7104, + "step": 30746 + }, + { + "epoch": 1.431501268710571, + "grad_norm": 0.37011603615756755, + "learning_rate": 6.257297131278172e-05, + "loss": 2.7649, + "step": 30747 + }, + { + "epoch": 1.4315478268966642, + "grad_norm": 0.33312717704451117, + "learning_rate": 6.257034959533485e-05, + "loss": 2.7375, + "step": 30748 + }, + { + "epoch": 1.4315943850827573, + "grad_norm": 0.3593805494799063, + "learning_rate": 6.256772784099525e-05, + "loss": 2.7379, + "step": 30749 + }, + { + "epoch": 1.4316409432688502, + "grad_norm": 0.331159195286329, + "learning_rate": 6.25651060497706e-05, + "loss": 2.7109, + "step": 30750 + }, + { + "epoch": 1.4316875014549433, + "grad_norm": 0.3598238196358949, + "learning_rate": 6.256248422166861e-05, + "loss": 2.6559, + "step": 30751 + }, + { + "epoch": 1.4317340596410364, + "grad_norm": 0.3423254866568245, + "learning_rate": 6.255986235669695e-05, + "loss": 2.8489, + "step": 30752 + }, + { + "epoch": 1.4317806178271295, + "grad_norm": 0.3509377682759486, + "learning_rate": 6.255724045486334e-05, + "loss": 2.7948, + "step": 30753 + }, + { + "epoch": 1.4318271760132224, + "grad_norm": 0.31709676102902495, + "learning_rate": 6.255461851617548e-05, + "loss": 2.7506, + "step": 30754 + }, + { + "epoch": 1.4318737341993155, + "grad_norm": 0.3346213119303369, + "learning_rate": 6.255199654064103e-05, + "loss": 2.7624, + "step": 30755 + }, + { + "epoch": 1.4319202923854086, + "grad_norm": 0.3178536884398005, + "learning_rate": 6.25493745282677e-05, + "loss": 2.6697, + "step": 30756 + }, + { + "epoch": 1.4319668505715017, + "grad_norm": 0.31961290349765836, + "learning_rate": 6.254675247906322e-05, + "loss": 2.6976, + "step": 30757 + }, + { + "epoch": 1.4320134087575949, + "grad_norm": 0.3035441462113061, + "learning_rate": 6.254413039303524e-05, + "loss": 2.748, + "step": 30758 + }, + { + "epoch": 1.432059966943688, + "grad_norm": 0.32268956697075857, + "learning_rate": 6.254150827019149e-05, + "loss": 2.7756, + "step": 30759 + }, + { + "epoch": 1.4321065251297809, + "grad_norm": 0.3166019592378278, + "learning_rate": 6.253888611053964e-05, + "loss": 2.7378, + "step": 30760 + }, + { + "epoch": 1.432153083315874, + "grad_norm": 0.3150460370425633, + "learning_rate": 
6.253626391408739e-05, + "loss": 2.7049, + "step": 30761 + }, + { + "epoch": 1.432199641501967, + "grad_norm": 0.318893564732735, + "learning_rate": 6.253364168084243e-05, + "loss": 2.7616, + "step": 30762 + }, + { + "epoch": 1.4322461996880602, + "grad_norm": 0.3378633476001429, + "learning_rate": 6.253101941081249e-05, + "loss": 2.7703, + "step": 30763 + }, + { + "epoch": 1.432292757874153, + "grad_norm": 0.30129371581309516, + "learning_rate": 6.252839710400523e-05, + "loss": 2.8061, + "step": 30764 + }, + { + "epoch": 1.4323393160602462, + "grad_norm": 0.3357856086902276, + "learning_rate": 6.252577476042835e-05, + "loss": 2.6948, + "step": 30765 + }, + { + "epoch": 1.4323858742463393, + "grad_norm": 0.30809756442849234, + "learning_rate": 6.252315238008957e-05, + "loss": 2.7015, + "step": 30766 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 0.3358096655873682, + "learning_rate": 6.252052996299656e-05, + "loss": 2.7908, + "step": 30767 + }, + { + "epoch": 1.4324789906185256, + "grad_norm": 0.3341339844618952, + "learning_rate": 6.251790750915705e-05, + "loss": 2.778, + "step": 30768 + }, + { + "epoch": 1.4325255488046187, + "grad_norm": 0.3311612926457446, + "learning_rate": 6.251528501857869e-05, + "loss": 2.7952, + "step": 30769 + }, + { + "epoch": 1.4325721069907116, + "grad_norm": 0.33941483091842156, + "learning_rate": 6.251266249126921e-05, + "loss": 2.8066, + "step": 30770 + }, + { + "epoch": 1.4326186651768047, + "grad_norm": 0.33502010135449933, + "learning_rate": 6.251003992723629e-05, + "loss": 2.7913, + "step": 30771 + }, + { + "epoch": 1.4326652233628978, + "grad_norm": 0.3312629670928838, + "learning_rate": 6.250741732648766e-05, + "loss": 2.706, + "step": 30772 + }, + { + "epoch": 1.432711781548991, + "grad_norm": 0.33070871838187843, + "learning_rate": 6.250479468903098e-05, + "loss": 2.7274, + "step": 30773 + }, + { + "epoch": 1.4327583397350838, + "grad_norm": 0.3250603193054846, + "learning_rate": 6.250217201487395e-05, + "loss": 2.7808, + "step": 30774 + }, + { + "epoch": 1.432804897921177, + "grad_norm": 0.3486220609644949, + "learning_rate": 6.249954930402429e-05, + "loss": 2.7329, + "step": 30775 + }, + { + "epoch": 1.43285145610727, + "grad_norm": 0.3144403406366584, + "learning_rate": 6.249692655648966e-05, + "loss": 2.7471, + "step": 30776 + }, + { + "epoch": 1.4328980142933632, + "grad_norm": 0.3659462519867792, + "learning_rate": 6.24943037722778e-05, + "loss": 2.7087, + "step": 30777 + }, + { + "epoch": 1.4329445724794563, + "grad_norm": 0.3277524768047254, + "learning_rate": 6.24916809513964e-05, + "loss": 2.6842, + "step": 30778 + }, + { + "epoch": 1.4329911306655494, + "grad_norm": 0.3299738120535917, + "learning_rate": 6.248905809385312e-05, + "loss": 2.7564, + "step": 30779 + }, + { + "epoch": 1.4330376888516423, + "grad_norm": 0.3360730998078788, + "learning_rate": 6.24864351996557e-05, + "loss": 2.7545, + "step": 30780 + }, + { + "epoch": 1.4330842470377354, + "grad_norm": 0.3242726373807186, + "learning_rate": 6.248381226881184e-05, + "loss": 2.6513, + "step": 30781 + }, + { + "epoch": 1.4331308052238285, + "grad_norm": 0.33803397608264046, + "learning_rate": 6.24811893013292e-05, + "loss": 2.7292, + "step": 30782 + }, + { + "epoch": 1.4331773634099214, + "grad_norm": 0.35115310965974245, + "learning_rate": 6.24785662972155e-05, + "loss": 2.7665, + "step": 30783 + }, + { + "epoch": 1.4332239215960145, + "grad_norm": 0.3358306041987929, + "learning_rate": 6.247594325647843e-05, + "loss": 2.7765, + "step": 30784 + }, + { + "epoch": 
1.4332704797821076, + "grad_norm": 0.3576641878991557, + "learning_rate": 6.247332017912572e-05, + "loss": 2.7463, + "step": 30785 + }, + { + "epoch": 1.4333170379682008, + "grad_norm": 0.3449470379238749, + "learning_rate": 6.247069706516503e-05, + "loss": 2.7996, + "step": 30786 + }, + { + "epoch": 1.4333635961542939, + "grad_norm": 0.36185069492658356, + "learning_rate": 6.246807391460408e-05, + "loss": 2.8455, + "step": 30787 + }, + { + "epoch": 1.433410154340387, + "grad_norm": 0.3631817887078417, + "learning_rate": 6.246545072745054e-05, + "loss": 2.7959, + "step": 30788 + }, + { + "epoch": 1.43345671252648, + "grad_norm": 0.3501850235512692, + "learning_rate": 6.246282750371215e-05, + "loss": 2.8336, + "step": 30789 + }, + { + "epoch": 1.433503270712573, + "grad_norm": 0.33949791794362666, + "learning_rate": 6.246020424339658e-05, + "loss": 2.8015, + "step": 30790 + }, + { + "epoch": 1.4335498288986661, + "grad_norm": 0.36119034224323904, + "learning_rate": 6.245758094651153e-05, + "loss": 2.8041, + "step": 30791 + }, + { + "epoch": 1.4335963870847592, + "grad_norm": 0.3202913915345816, + "learning_rate": 6.245495761306473e-05, + "loss": 2.7706, + "step": 30792 + }, + { + "epoch": 1.4336429452708521, + "grad_norm": 0.33819745811703, + "learning_rate": 6.245233424306384e-05, + "loss": 2.6995, + "step": 30793 + }, + { + "epoch": 1.4336895034569452, + "grad_norm": 0.3198901391740047, + "learning_rate": 6.244971083651657e-05, + "loss": 2.7821, + "step": 30794 + }, + { + "epoch": 1.4337360616430384, + "grad_norm": 0.33736330900017375, + "learning_rate": 6.244708739343065e-05, + "loss": 2.7863, + "step": 30795 + }, + { + "epoch": 1.4337826198291315, + "grad_norm": 0.34490269965331255, + "learning_rate": 6.244446391381374e-05, + "loss": 2.7601, + "step": 30796 + }, + { + "epoch": 1.4338291780152246, + "grad_norm": 0.35209080192290865, + "learning_rate": 6.244184039767355e-05, + "loss": 2.747, + "step": 30797 + }, + { + "epoch": 1.4338757362013177, + "grad_norm": 0.33841142078859915, + "learning_rate": 6.243921684501779e-05, + "loss": 2.631, + "step": 30798 + }, + { + "epoch": 1.4339222943874106, + "grad_norm": 0.315081221062118, + "learning_rate": 6.243659325585416e-05, + "loss": 2.8296, + "step": 30799 + }, + { + "epoch": 1.4339688525735037, + "grad_norm": 0.34888181575040855, + "learning_rate": 6.243396963019035e-05, + "loss": 2.7431, + "step": 30800 + }, + { + "epoch": 1.4340154107595968, + "grad_norm": 0.31761418086177545, + "learning_rate": 6.243134596803405e-05, + "loss": 2.7398, + "step": 30801 + }, + { + "epoch": 1.43406196894569, + "grad_norm": 0.3188255198564566, + "learning_rate": 6.242872226939298e-05, + "loss": 2.7125, + "step": 30802 + }, + { + "epoch": 1.4341085271317828, + "grad_norm": 0.3388062240643158, + "learning_rate": 6.242609853427483e-05, + "loss": 2.7876, + "step": 30803 + }, + { + "epoch": 1.434155085317876, + "grad_norm": 0.32705529639861636, + "learning_rate": 6.242347476268733e-05, + "loss": 2.7831, + "step": 30804 + }, + { + "epoch": 1.434201643503969, + "grad_norm": 0.3378172754406191, + "learning_rate": 6.242085095463815e-05, + "loss": 2.7227, + "step": 30805 + }, + { + "epoch": 1.4342482016900622, + "grad_norm": 0.33599438613219507, + "learning_rate": 6.241822711013498e-05, + "loss": 2.7298, + "step": 30806 + }, + { + "epoch": 1.4342947598761553, + "grad_norm": 0.36574759161239234, + "learning_rate": 6.241560322918553e-05, + "loss": 2.7076, + "step": 30807 + }, + { + "epoch": 1.4343413180622484, + "grad_norm": 0.38006048333676384, + "learning_rate": 
6.241297931179752e-05, + "loss": 2.6579, + "step": 30808 + }, + { + "epoch": 1.4343878762483413, + "grad_norm": 0.3299329771914129, + "learning_rate": 6.241035535797865e-05, + "loss": 2.7524, + "step": 30809 + }, + { + "epoch": 1.4344344344344344, + "grad_norm": 0.37105606322865614, + "learning_rate": 6.240773136773658e-05, + "loss": 2.6822, + "step": 30810 + }, + { + "epoch": 1.4344809926205275, + "grad_norm": 0.34007633715290914, + "learning_rate": 6.240510734107908e-05, + "loss": 2.8498, + "step": 30811 + }, + { + "epoch": 1.4345275508066206, + "grad_norm": 0.38414332668969137, + "learning_rate": 6.240248327801379e-05, + "loss": 2.7722, + "step": 30812 + }, + { + "epoch": 1.4345741089927135, + "grad_norm": 0.33148538956969364, + "learning_rate": 6.239985917854844e-05, + "loss": 2.8182, + "step": 30813 + }, + { + "epoch": 1.4346206671788067, + "grad_norm": 0.3910692532428741, + "learning_rate": 6.239723504269072e-05, + "loss": 2.7418, + "step": 30814 + }, + { + "epoch": 1.4346672253648998, + "grad_norm": 0.369684202005731, + "learning_rate": 6.239461087044833e-05, + "loss": 2.7403, + "step": 30815 + }, + { + "epoch": 1.4347137835509929, + "grad_norm": 0.33969666716261343, + "learning_rate": 6.239198666182899e-05, + "loss": 2.6883, + "step": 30816 + }, + { + "epoch": 1.434760341737086, + "grad_norm": 0.35307333631621035, + "learning_rate": 6.238936241684038e-05, + "loss": 2.8286, + "step": 30817 + }, + { + "epoch": 1.4348068999231791, + "grad_norm": 0.35901304401759304, + "learning_rate": 6.238673813549023e-05, + "loss": 2.8309, + "step": 30818 + }, + { + "epoch": 1.434853458109272, + "grad_norm": 0.3572203168012491, + "learning_rate": 6.238411381778619e-05, + "loss": 2.8802, + "step": 30819 + }, + { + "epoch": 1.4349000162953651, + "grad_norm": 0.3526312327085642, + "learning_rate": 6.238148946373602e-05, + "loss": 2.7289, + "step": 30820 + }, + { + "epoch": 1.4349465744814582, + "grad_norm": 0.35115971237175136, + "learning_rate": 6.237886507334739e-05, + "loss": 2.7271, + "step": 30821 + }, + { + "epoch": 1.4349931326675514, + "grad_norm": 0.34549861604048654, + "learning_rate": 6.237624064662803e-05, + "loss": 2.6659, + "step": 30822 + }, + { + "epoch": 1.4350396908536442, + "grad_norm": 0.3389501063799543, + "learning_rate": 6.23736161835856e-05, + "loss": 2.6559, + "step": 30823 + }, + { + "epoch": 1.4350862490397374, + "grad_norm": 0.3456263602316414, + "learning_rate": 6.237099168422785e-05, + "loss": 2.7285, + "step": 30824 + }, + { + "epoch": 1.4351328072258305, + "grad_norm": 0.326745945887756, + "learning_rate": 6.236836714856242e-05, + "loss": 2.7502, + "step": 30825 + }, + { + "epoch": 1.4351793654119236, + "grad_norm": 0.347546832613071, + "learning_rate": 6.236574257659709e-05, + "loss": 2.7347, + "step": 30826 + }, + { + "epoch": 1.4352259235980167, + "grad_norm": 0.349837002727561, + "learning_rate": 6.23631179683395e-05, + "loss": 2.6712, + "step": 30827 + }, + { + "epoch": 1.4352724817841098, + "grad_norm": 0.31780369536050096, + "learning_rate": 6.236049332379739e-05, + "loss": 2.7352, + "step": 30828 + }, + { + "epoch": 1.4353190399702027, + "grad_norm": 0.33465339684891376, + "learning_rate": 6.235786864297843e-05, + "loss": 2.7753, + "step": 30829 + }, + { + "epoch": 1.4353655981562958, + "grad_norm": 0.34589383469043733, + "learning_rate": 6.235524392589035e-05, + "loss": 2.6854, + "step": 30830 + }, + { + "epoch": 1.435412156342389, + "grad_norm": 0.3109795340470698, + "learning_rate": 6.235261917254086e-05, + "loss": 2.7722, + "step": 30831 + }, + { + "epoch": 
1.4354587145284818, + "grad_norm": 0.3529275742904064, + "learning_rate": 6.234999438293765e-05, + "loss": 2.8577, + "step": 30832 + }, + { + "epoch": 1.435505272714575, + "grad_norm": 0.33823735498769386, + "learning_rate": 6.234736955708841e-05, + "loss": 2.6649, + "step": 30833 + }, + { + "epoch": 1.435551830900668, + "grad_norm": 0.33449624901793895, + "learning_rate": 6.234474469500085e-05, + "loss": 2.6514, + "step": 30834 + }, + { + "epoch": 1.4355983890867612, + "grad_norm": 0.3475673886154983, + "learning_rate": 6.234211979668271e-05, + "loss": 2.6955, + "step": 30835 + }, + { + "epoch": 1.4356449472728543, + "grad_norm": 0.3482492425863055, + "learning_rate": 6.233949486214164e-05, + "loss": 2.658, + "step": 30836 + }, + { + "epoch": 1.4356915054589474, + "grad_norm": 0.32837552998361497, + "learning_rate": 6.233686989138539e-05, + "loss": 2.7448, + "step": 30837 + }, + { + "epoch": 1.4357380636450403, + "grad_norm": 0.3416173713601209, + "learning_rate": 6.233424488442161e-05, + "loss": 2.7357, + "step": 30838 + }, + { + "epoch": 1.4357846218311334, + "grad_norm": 0.34180343889813114, + "learning_rate": 6.233161984125806e-05, + "loss": 2.8014, + "step": 30839 + }, + { + "epoch": 1.4358311800172265, + "grad_norm": 0.3194414362884092, + "learning_rate": 6.232899476190241e-05, + "loss": 2.7617, + "step": 30840 + }, + { + "epoch": 1.4358777382033197, + "grad_norm": 0.352487773201885, + "learning_rate": 6.232636964636239e-05, + "loss": 2.617, + "step": 30841 + }, + { + "epoch": 1.4359242963894125, + "grad_norm": 0.3551214843818399, + "learning_rate": 6.232374449464567e-05, + "loss": 2.7517, + "step": 30842 + }, + { + "epoch": 1.4359708545755057, + "grad_norm": 0.3341565101878372, + "learning_rate": 6.232111930675998e-05, + "loss": 2.7174, + "step": 30843 + }, + { + "epoch": 1.4360174127615988, + "grad_norm": 0.3806023113519671, + "learning_rate": 6.231849408271302e-05, + "loss": 2.7664, + "step": 30844 + }, + { + "epoch": 1.436063970947692, + "grad_norm": 0.3100235582769082, + "learning_rate": 6.23158688225125e-05, + "loss": 2.7731, + "step": 30845 + }, + { + "epoch": 1.436110529133785, + "grad_norm": 0.3718370065216262, + "learning_rate": 6.231324352616611e-05, + "loss": 2.7804, + "step": 30846 + }, + { + "epoch": 1.4361570873198781, + "grad_norm": 0.29513349797748933, + "learning_rate": 6.231061819368155e-05, + "loss": 2.7714, + "step": 30847 + }, + { + "epoch": 1.436203645505971, + "grad_norm": 0.3520319995734509, + "learning_rate": 6.230799282506657e-05, + "loss": 2.6304, + "step": 30848 + }, + { + "epoch": 1.4362502036920641, + "grad_norm": 0.3361412765511433, + "learning_rate": 6.230536742032883e-05, + "loss": 2.6786, + "step": 30849 + }, + { + "epoch": 1.4362967618781572, + "grad_norm": 0.3152997576905514, + "learning_rate": 6.230274197947606e-05, + "loss": 2.7305, + "step": 30850 + }, + { + "epoch": 1.4363433200642504, + "grad_norm": 0.34505808996066817, + "learning_rate": 6.230011650251592e-05, + "loss": 2.6663, + "step": 30851 + }, + { + "epoch": 1.4363898782503433, + "grad_norm": 0.3276761066923676, + "learning_rate": 6.229749098945618e-05, + "loss": 2.7846, + "step": 30852 + }, + { + "epoch": 1.4364364364364364, + "grad_norm": 0.3271224759617636, + "learning_rate": 6.229486544030451e-05, + "loss": 2.6196, + "step": 30853 + }, + { + "epoch": 1.4364829946225295, + "grad_norm": 0.33868025566931725, + "learning_rate": 6.22922398550686e-05, + "loss": 2.7149, + "step": 30854 + }, + { + "epoch": 1.4365295528086226, + "grad_norm": 0.29815159456994894, + "learning_rate": 
6.228961423375621e-05, + "loss": 2.7521, + "step": 30855 + }, + { + "epoch": 1.4365761109947157, + "grad_norm": 0.32691103901714735, + "learning_rate": 6.2286988576375e-05, + "loss": 2.6733, + "step": 30856 + }, + { + "epoch": 1.4366226691808088, + "grad_norm": 0.28942293200913766, + "learning_rate": 6.228436288293268e-05, + "loss": 2.6549, + "step": 30857 + }, + { + "epoch": 1.4366692273669017, + "grad_norm": 0.33696225738498603, + "learning_rate": 6.228173715343699e-05, + "loss": 2.782, + "step": 30858 + }, + { + "epoch": 1.4367157855529948, + "grad_norm": 0.31539017117865564, + "learning_rate": 6.227911138789558e-05, + "loss": 2.8712, + "step": 30859 + }, + { + "epoch": 1.436762343739088, + "grad_norm": 0.3201788510112009, + "learning_rate": 6.227648558631619e-05, + "loss": 2.678, + "step": 30860 + }, + { + "epoch": 1.436808901925181, + "grad_norm": 0.3515044491818835, + "learning_rate": 6.227385974870656e-05, + "loss": 2.7605, + "step": 30861 + }, + { + "epoch": 1.436855460111274, + "grad_norm": 0.3054140817260686, + "learning_rate": 6.227123387507432e-05, + "loss": 2.7482, + "step": 30862 + }, + { + "epoch": 1.436902018297367, + "grad_norm": 0.3639614148026395, + "learning_rate": 6.226860796542724e-05, + "loss": 2.7869, + "step": 30863 + }, + { + "epoch": 1.4369485764834602, + "grad_norm": 0.31898196452006655, + "learning_rate": 6.2265982019773e-05, + "loss": 2.7222, + "step": 30864 + }, + { + "epoch": 1.4369951346695533, + "grad_norm": 0.3200891657946268, + "learning_rate": 6.22633560381193e-05, + "loss": 2.7548, + "step": 30865 + }, + { + "epoch": 1.4370416928556464, + "grad_norm": 0.3445417449843892, + "learning_rate": 6.226073002047387e-05, + "loss": 2.7015, + "step": 30866 + }, + { + "epoch": 1.4370882510417395, + "grad_norm": 0.34097319214405225, + "learning_rate": 6.225810396684441e-05, + "loss": 2.8292, + "step": 30867 + }, + { + "epoch": 1.4371348092278324, + "grad_norm": 0.32107290511451964, + "learning_rate": 6.225547787723861e-05, + "loss": 2.7148, + "step": 30868 + }, + { + "epoch": 1.4371813674139255, + "grad_norm": 0.35878209906518443, + "learning_rate": 6.22528517516642e-05, + "loss": 2.7346, + "step": 30869 + }, + { + "epoch": 1.4372279256000187, + "grad_norm": 0.32236143437090664, + "learning_rate": 6.225022559012886e-05, + "loss": 2.6844, + "step": 30870 + }, + { + "epoch": 1.4372744837861116, + "grad_norm": 0.3645549410285995, + "learning_rate": 6.224759939264033e-05, + "loss": 2.7025, + "step": 30871 + }, + { + "epoch": 1.4373210419722047, + "grad_norm": 0.35402291773030803, + "learning_rate": 6.22449731592063e-05, + "loss": 2.7307, + "step": 30872 + }, + { + "epoch": 1.4373676001582978, + "grad_norm": 0.34179406222670916, + "learning_rate": 6.224234688983448e-05, + "loss": 2.8084, + "step": 30873 + }, + { + "epoch": 1.437414158344391, + "grad_norm": 0.36547929227974724, + "learning_rate": 6.223972058453258e-05, + "loss": 2.7643, + "step": 30874 + }, + { + "epoch": 1.437460716530484, + "grad_norm": 0.3353568164975621, + "learning_rate": 6.22370942433083e-05, + "loss": 2.7518, + "step": 30875 + }, + { + "epoch": 1.4375072747165771, + "grad_norm": 0.34739497865360486, + "learning_rate": 6.223446786616936e-05, + "loss": 2.7533, + "step": 30876 + }, + { + "epoch": 1.4375538329026702, + "grad_norm": 0.34215534250484336, + "learning_rate": 6.223184145312347e-05, + "loss": 2.8247, + "step": 30877 + }, + { + "epoch": 1.4376003910887631, + "grad_norm": 0.32066333323161805, + "learning_rate": 6.222921500417831e-05, + "loss": 2.7533, + "step": 30878 + }, + { + "epoch": 
1.4376469492748563, + "grad_norm": 0.3393117467048666, + "learning_rate": 6.222658851934163e-05, + "loss": 2.7352, + "step": 30879 + }, + { + "epoch": 1.4376935074609494, + "grad_norm": 0.3420121755428391, + "learning_rate": 6.222396199862111e-05, + "loss": 2.8272, + "step": 30880 + }, + { + "epoch": 1.4377400656470423, + "grad_norm": 0.3259229120805392, + "learning_rate": 6.222133544202444e-05, + "loss": 2.7068, + "step": 30881 + }, + { + "epoch": 1.4377866238331354, + "grad_norm": 0.33636664454651194, + "learning_rate": 6.221870884955938e-05, + "loss": 2.7445, + "step": 30882 + }, + { + "epoch": 1.4378331820192285, + "grad_norm": 0.3088774133378726, + "learning_rate": 6.221608222123361e-05, + "loss": 2.8346, + "step": 30883 + }, + { + "epoch": 1.4378797402053216, + "grad_norm": 0.32303833480742766, + "learning_rate": 6.221345555705483e-05, + "loss": 2.7246, + "step": 30884 + }, + { + "epoch": 1.4379262983914147, + "grad_norm": 0.3180028040644863, + "learning_rate": 6.22108288570308e-05, + "loss": 2.7494, + "step": 30885 + }, + { + "epoch": 1.4379728565775078, + "grad_norm": 0.3239614998592153, + "learning_rate": 6.220820212116915e-05, + "loss": 2.7304, + "step": 30886 + }, + { + "epoch": 1.4380194147636007, + "grad_norm": 0.32850286286965086, + "learning_rate": 6.220557534947765e-05, + "loss": 2.7504, + "step": 30887 + }, + { + "epoch": 1.4380659729496938, + "grad_norm": 0.3298053095770675, + "learning_rate": 6.220294854196397e-05, + "loss": 2.9115, + "step": 30888 + }, + { + "epoch": 1.438112531135787, + "grad_norm": 0.33521454054679417, + "learning_rate": 6.220032169863585e-05, + "loss": 2.6871, + "step": 30889 + }, + { + "epoch": 1.43815908932188, + "grad_norm": 0.31381405308913585, + "learning_rate": 6.2197694819501e-05, + "loss": 2.7265, + "step": 30890 + }, + { + "epoch": 1.438205647507973, + "grad_norm": 0.32520811145312695, + "learning_rate": 6.21950679045671e-05, + "loss": 2.7333, + "step": 30891 + }, + { + "epoch": 1.438252205694066, + "grad_norm": 0.3423917714404742, + "learning_rate": 6.219244095384187e-05, + "loss": 2.8202, + "step": 30892 + }, + { + "epoch": 1.4382987638801592, + "grad_norm": 0.34236907478547096, + "learning_rate": 6.218981396733303e-05, + "loss": 2.822, + "step": 30893 + }, + { + "epoch": 1.4383453220662523, + "grad_norm": 0.3370383210778089, + "learning_rate": 6.218718694504831e-05, + "loss": 2.7893, + "step": 30894 + }, + { + "epoch": 1.4383918802523454, + "grad_norm": 0.36594530082480525, + "learning_rate": 6.218455988699538e-05, + "loss": 2.7853, + "step": 30895 + }, + { + "epoch": 1.4384384384384385, + "grad_norm": 0.32395749471716057, + "learning_rate": 6.218193279318196e-05, + "loss": 2.8153, + "step": 30896 + }, + { + "epoch": 1.4384849966245314, + "grad_norm": 0.3530881433321103, + "learning_rate": 6.217930566361579e-05, + "loss": 2.8019, + "step": 30897 + }, + { + "epoch": 1.4385315548106246, + "grad_norm": 0.3396591402696828, + "learning_rate": 6.217667849830455e-05, + "loss": 2.6865, + "step": 30898 + }, + { + "epoch": 1.4385781129967177, + "grad_norm": 0.3348367069347291, + "learning_rate": 6.217405129725594e-05, + "loss": 2.73, + "step": 30899 + }, + { + "epoch": 1.4386246711828108, + "grad_norm": 0.34612696780982943, + "learning_rate": 6.217142406047772e-05, + "loss": 2.7495, + "step": 30900 + }, + { + "epoch": 1.4386712293689037, + "grad_norm": 0.3389814315196332, + "learning_rate": 6.216879678797753e-05, + "loss": 2.7226, + "step": 30901 + }, + { + "epoch": 1.4387177875549968, + "grad_norm": 0.3679252303348938, + "learning_rate": 
6.216616947976315e-05, + "loss": 2.6868, + "step": 30902 + }, + { + "epoch": 1.43876434574109, + "grad_norm": 0.3469447034208035, + "learning_rate": 6.216354213584226e-05, + "loss": 2.6525, + "step": 30903 + }, + { + "epoch": 1.438810903927183, + "grad_norm": 0.3500670543401055, + "learning_rate": 6.216091475622255e-05, + "loss": 2.6827, + "step": 30904 + }, + { + "epoch": 1.4388574621132761, + "grad_norm": 0.353901180060876, + "learning_rate": 6.215828734091178e-05, + "loss": 2.818, + "step": 30905 + }, + { + "epoch": 1.4389040202993693, + "grad_norm": 0.40734679393348283, + "learning_rate": 6.21556598899176e-05, + "loss": 2.8729, + "step": 30906 + }, + { + "epoch": 1.4389505784854622, + "grad_norm": 0.38339937532906254, + "learning_rate": 6.21530324032478e-05, + "loss": 2.6852, + "step": 30907 + }, + { + "epoch": 1.4389971366715553, + "grad_norm": 0.3678459618146178, + "learning_rate": 6.215040488091001e-05, + "loss": 2.7919, + "step": 30908 + }, + { + "epoch": 1.4390436948576484, + "grad_norm": 0.36024785690974365, + "learning_rate": 6.214777732291201e-05, + "loss": 2.6802, + "step": 30909 + }, + { + "epoch": 1.4390902530437415, + "grad_norm": 0.37288745245767135, + "learning_rate": 6.214514972926146e-05, + "loss": 2.8042, + "step": 30910 + }, + { + "epoch": 1.4391368112298344, + "grad_norm": 0.3360775954398617, + "learning_rate": 6.214252209996609e-05, + "loss": 2.8299, + "step": 30911 + }, + { + "epoch": 1.4391833694159275, + "grad_norm": 0.41373476299200035, + "learning_rate": 6.213989443503362e-05, + "loss": 2.7291, + "step": 30912 + }, + { + "epoch": 1.4392299276020206, + "grad_norm": 0.3186685947426947, + "learning_rate": 6.213726673447177e-05, + "loss": 2.7877, + "step": 30913 + }, + { + "epoch": 1.4392764857881137, + "grad_norm": 0.41893236111674487, + "learning_rate": 6.213463899828821e-05, + "loss": 2.7192, + "step": 30914 + }, + { + "epoch": 1.4393230439742069, + "grad_norm": 0.35793518012726877, + "learning_rate": 6.213201122649069e-05, + "loss": 2.6463, + "step": 30915 + }, + { + "epoch": 1.4393696021603, + "grad_norm": 0.3941840558040026, + "learning_rate": 6.212938341908693e-05, + "loss": 2.8773, + "step": 30916 + }, + { + "epoch": 1.4394161603463929, + "grad_norm": 0.37632922943497954, + "learning_rate": 6.21267555760846e-05, + "loss": 2.8201, + "step": 30917 + }, + { + "epoch": 1.439462718532486, + "grad_norm": 0.3807510854937538, + "learning_rate": 6.212412769749145e-05, + "loss": 2.7845, + "step": 30918 + }, + { + "epoch": 1.439509276718579, + "grad_norm": 0.3577634394532913, + "learning_rate": 6.212149978331517e-05, + "loss": 2.8251, + "step": 30919 + }, + { + "epoch": 1.439555834904672, + "grad_norm": 0.3754963248123084, + "learning_rate": 6.211887183356349e-05, + "loss": 2.7428, + "step": 30920 + }, + { + "epoch": 1.439602393090765, + "grad_norm": 0.3675784224741303, + "learning_rate": 6.211624384824413e-05, + "loss": 2.7475, + "step": 30921 + }, + { + "epoch": 1.4396489512768582, + "grad_norm": 0.3859152522983353, + "learning_rate": 6.211361582736476e-05, + "loss": 2.7962, + "step": 30922 + }, + { + "epoch": 1.4396955094629513, + "grad_norm": 0.35276152682317224, + "learning_rate": 6.211098777093314e-05, + "loss": 2.7992, + "step": 30923 + }, + { + "epoch": 1.4397420676490444, + "grad_norm": 0.3807481630896933, + "learning_rate": 6.210835967895696e-05, + "loss": 2.8557, + "step": 30924 + }, + { + "epoch": 1.4397886258351376, + "grad_norm": 0.34200419579145275, + "learning_rate": 6.210573155144394e-05, + "loss": 2.8055, + "step": 30925 + }, + { + "epoch": 
1.4398351840212305, + "grad_norm": 0.36820185302249253, + "learning_rate": 6.210310338840179e-05, + "loss": 2.6652, + "step": 30926 + }, + { + "epoch": 1.4398817422073236, + "grad_norm": 0.3507097225922141, + "learning_rate": 6.210047518983821e-05, + "loss": 2.7158, + "step": 30927 + }, + { + "epoch": 1.4399283003934167, + "grad_norm": 0.37530970017309484, + "learning_rate": 6.209784695576094e-05, + "loss": 2.8039, + "step": 30928 + }, + { + "epoch": 1.4399748585795098, + "grad_norm": 0.36111224234133177, + "learning_rate": 6.209521868617767e-05, + "loss": 2.7599, + "step": 30929 + }, + { + "epoch": 1.4400214167656027, + "grad_norm": 0.3718010415152636, + "learning_rate": 6.209259038109614e-05, + "loss": 2.7181, + "step": 30930 + }, + { + "epoch": 1.4400679749516958, + "grad_norm": 0.3370831981119942, + "learning_rate": 6.208996204052405e-05, + "loss": 2.8297, + "step": 30931 + }, + { + "epoch": 1.440114533137789, + "grad_norm": 0.37333387640292903, + "learning_rate": 6.208733366446908e-05, + "loss": 2.7121, + "step": 30932 + }, + { + "epoch": 1.440161091323882, + "grad_norm": 0.33426247565465406, + "learning_rate": 6.2084705252939e-05, + "loss": 2.7934, + "step": 30933 + }, + { + "epoch": 1.4402076495099752, + "grad_norm": 0.33988103767104694, + "learning_rate": 6.20820768059415e-05, + "loss": 2.6488, + "step": 30934 + }, + { + "epoch": 1.4402542076960683, + "grad_norm": 0.32375795061281665, + "learning_rate": 6.207944832348431e-05, + "loss": 2.7572, + "step": 30935 + }, + { + "epoch": 1.4403007658821612, + "grad_norm": 0.34815021780753647, + "learning_rate": 6.20768198055751e-05, + "loss": 2.7157, + "step": 30936 + }, + { + "epoch": 1.4403473240682543, + "grad_norm": 0.3838480095358991, + "learning_rate": 6.207419125222162e-05, + "loss": 2.7745, + "step": 30937 + }, + { + "epoch": 1.4403938822543474, + "grad_norm": 0.34300882299682417, + "learning_rate": 6.20715626634316e-05, + "loss": 2.8085, + "step": 30938 + }, + { + "epoch": 1.4404404404404405, + "grad_norm": 0.35474560185465887, + "learning_rate": 6.206893403921273e-05, + "loss": 2.83, + "step": 30939 + }, + { + "epoch": 1.4404869986265334, + "grad_norm": 0.342623998102498, + "learning_rate": 6.206630537957272e-05, + "loss": 2.7101, + "step": 30940 + }, + { + "epoch": 1.4405335568126265, + "grad_norm": 0.3254028145019506, + "learning_rate": 6.206367668451928e-05, + "loss": 2.749, + "step": 30941 + }, + { + "epoch": 1.4405801149987196, + "grad_norm": 0.31280538790932816, + "learning_rate": 6.206104795406014e-05, + "loss": 2.7675, + "step": 30942 + }, + { + "epoch": 1.4406266731848127, + "grad_norm": 0.307759431704614, + "learning_rate": 6.205841918820302e-05, + "loss": 2.6825, + "step": 30943 + }, + { + "epoch": 1.4406732313709059, + "grad_norm": 0.3465584732093795, + "learning_rate": 6.205579038695563e-05, + "loss": 2.7283, + "step": 30944 + }, + { + "epoch": 1.440719789556999, + "grad_norm": 0.303107151730602, + "learning_rate": 6.205316155032567e-05, + "loss": 2.8178, + "step": 30945 + }, + { + "epoch": 1.4407663477430919, + "grad_norm": 0.3444171139646108, + "learning_rate": 6.205053267832088e-05, + "loss": 2.8318, + "step": 30946 + }, + { + "epoch": 1.440812905929185, + "grad_norm": 0.33454242951496593, + "learning_rate": 6.204790377094897e-05, + "loss": 2.7783, + "step": 30947 + }, + { + "epoch": 1.440859464115278, + "grad_norm": 0.3477768908099706, + "learning_rate": 6.204527482821764e-05, + "loss": 2.8478, + "step": 30948 + }, + { + "epoch": 1.4409060223013712, + "grad_norm": 0.32990918009947867, + "learning_rate": 
6.204264585013462e-05, + "loss": 2.7562, + "step": 30949 + }, + { + "epoch": 1.440952580487464, + "grad_norm": 0.3220603588763063, + "learning_rate": 6.204001683670761e-05, + "loss": 2.7571, + "step": 30950 + }, + { + "epoch": 1.4409991386735572, + "grad_norm": 0.33243036700037726, + "learning_rate": 6.203738778794435e-05, + "loss": 2.6446, + "step": 30951 + }, + { + "epoch": 1.4410456968596503, + "grad_norm": 0.3130113858827331, + "learning_rate": 6.203475870385253e-05, + "loss": 2.7026, + "step": 30952 + }, + { + "epoch": 1.4410922550457435, + "grad_norm": 0.33214240084415586, + "learning_rate": 6.203212958443989e-05, + "loss": 2.6607, + "step": 30953 + }, + { + "epoch": 1.4411388132318366, + "grad_norm": 0.30526819530038063, + "learning_rate": 6.202950042971413e-05, + "loss": 2.7945, + "step": 30954 + }, + { + "epoch": 1.4411853714179297, + "grad_norm": 0.3178923612724832, + "learning_rate": 6.202687123968297e-05, + "loss": 2.7717, + "step": 30955 + }, + { + "epoch": 1.4412319296040226, + "grad_norm": 0.33729102460155774, + "learning_rate": 6.202424201435413e-05, + "loss": 2.6631, + "step": 30956 + }, + { + "epoch": 1.4412784877901157, + "grad_norm": 0.3035835648527324, + "learning_rate": 6.202161275373532e-05, + "loss": 2.6228, + "step": 30957 + }, + { + "epoch": 1.4413250459762088, + "grad_norm": 0.3488124140294064, + "learning_rate": 6.201898345783427e-05, + "loss": 2.756, + "step": 30958 + }, + { + "epoch": 1.4413716041623017, + "grad_norm": 0.3043678222007916, + "learning_rate": 6.201635412665869e-05, + "loss": 2.6605, + "step": 30959 + }, + { + "epoch": 1.4414181623483948, + "grad_norm": 0.3465321192177301, + "learning_rate": 6.201372476021627e-05, + "loss": 2.7005, + "step": 30960 + }, + { + "epoch": 1.441464720534488, + "grad_norm": 0.2928308197886627, + "learning_rate": 6.201109535851477e-05, + "loss": 2.6858, + "step": 30961 + }, + { + "epoch": 1.441511278720581, + "grad_norm": 0.3575475334948672, + "learning_rate": 6.200846592156189e-05, + "loss": 2.7586, + "step": 30962 + }, + { + "epoch": 1.4415578369066742, + "grad_norm": 0.29936728946832153, + "learning_rate": 6.200583644936534e-05, + "loss": 2.6552, + "step": 30963 + }, + { + "epoch": 1.4416043950927673, + "grad_norm": 0.36819790809140307, + "learning_rate": 6.200320694193285e-05, + "loss": 2.7559, + "step": 30964 + }, + { + "epoch": 1.4416509532788604, + "grad_norm": 0.36434018971971655, + "learning_rate": 6.200057739927211e-05, + "loss": 2.8481, + "step": 30965 + }, + { + "epoch": 1.4416975114649533, + "grad_norm": 0.3387638744887158, + "learning_rate": 6.19979478213909e-05, + "loss": 2.6495, + "step": 30966 + }, + { + "epoch": 1.4417440696510464, + "grad_norm": 0.3861492687177462, + "learning_rate": 6.199531820829686e-05, + "loss": 2.7251, + "step": 30967 + }, + { + "epoch": 1.4417906278371395, + "grad_norm": 0.34691965671594466, + "learning_rate": 6.199268855999774e-05, + "loss": 2.6897, + "step": 30968 + }, + { + "epoch": 1.4418371860232324, + "grad_norm": 0.39542180615577904, + "learning_rate": 6.199005887650128e-05, + "loss": 2.7052, + "step": 30969 + }, + { + "epoch": 1.4418837442093255, + "grad_norm": 0.33293694829355924, + "learning_rate": 6.198742915781516e-05, + "loss": 2.7374, + "step": 30970 + }, + { + "epoch": 1.4419303023954186, + "grad_norm": 0.33971188907073513, + "learning_rate": 6.198479940394714e-05, + "loss": 2.5847, + "step": 30971 + }, + { + "epoch": 1.4419768605815118, + "grad_norm": 0.3639818095783632, + "learning_rate": 6.198216961490489e-05, + "loss": 2.6863, + "step": 30972 + }, + { + 
"epoch": 1.4420234187676049, + "grad_norm": 0.3502846979493163, + "learning_rate": 6.197953979069616e-05, + "loss": 2.8116, + "step": 30973 + }, + { + "epoch": 1.442069976953698, + "grad_norm": 0.3425248775114231, + "learning_rate": 6.197690993132867e-05, + "loss": 2.8096, + "step": 30974 + }, + { + "epoch": 1.4421165351397909, + "grad_norm": 0.39196081547139705, + "learning_rate": 6.197428003681012e-05, + "loss": 2.718, + "step": 30975 + }, + { + "epoch": 1.442163093325884, + "grad_norm": 0.3367617399843344, + "learning_rate": 6.197165010714824e-05, + "loss": 2.7776, + "step": 30976 + }, + { + "epoch": 1.442209651511977, + "grad_norm": 0.39255358560597, + "learning_rate": 6.196902014235075e-05, + "loss": 2.8235, + "step": 30977 + }, + { + "epoch": 1.4422562096980702, + "grad_norm": 0.3259725521963285, + "learning_rate": 6.196639014242536e-05, + "loss": 2.7936, + "step": 30978 + }, + { + "epoch": 1.4423027678841631, + "grad_norm": 0.3734261949255069, + "learning_rate": 6.196376010737979e-05, + "loss": 2.6668, + "step": 30979 + }, + { + "epoch": 1.4423493260702562, + "grad_norm": 0.3263206614740131, + "learning_rate": 6.196113003722178e-05, + "loss": 2.7387, + "step": 30980 + }, + { + "epoch": 1.4423958842563493, + "grad_norm": 0.3231319167249064, + "learning_rate": 6.195849993195901e-05, + "loss": 2.6894, + "step": 30981 + }, + { + "epoch": 1.4424424424424425, + "grad_norm": 0.3586854672107893, + "learning_rate": 6.195586979159923e-05, + "loss": 2.7601, + "step": 30982 + }, + { + "epoch": 1.4424890006285356, + "grad_norm": 0.3445877846569936, + "learning_rate": 6.195323961615014e-05, + "loss": 2.7387, + "step": 30983 + }, + { + "epoch": 1.4425355588146287, + "grad_norm": 0.3202967060143219, + "learning_rate": 6.19506094056195e-05, + "loss": 2.7285, + "step": 30984 + }, + { + "epoch": 1.4425821170007216, + "grad_norm": 0.35806496474395627, + "learning_rate": 6.194797916001499e-05, + "loss": 2.7369, + "step": 30985 + }, + { + "epoch": 1.4426286751868147, + "grad_norm": 0.3178599360929182, + "learning_rate": 6.194534887934431e-05, + "loss": 2.743, + "step": 30986 + }, + { + "epoch": 1.4426752333729078, + "grad_norm": 0.33128705561220956, + "learning_rate": 6.194271856361522e-05, + "loss": 2.6789, + "step": 30987 + }, + { + "epoch": 1.442721791559001, + "grad_norm": 0.34065432031001364, + "learning_rate": 6.194008821283544e-05, + "loss": 2.6898, + "step": 30988 + }, + { + "epoch": 1.4427683497450938, + "grad_norm": 0.3550316457760751, + "learning_rate": 6.193745782701267e-05, + "loss": 2.8013, + "step": 30989 + }, + { + "epoch": 1.442814907931187, + "grad_norm": 0.3418192771839563, + "learning_rate": 6.193482740615465e-05, + "loss": 2.882, + "step": 30990 + }, + { + "epoch": 1.44286146611728, + "grad_norm": 0.33855073713002043, + "learning_rate": 6.193219695026906e-05, + "loss": 2.723, + "step": 30991 + }, + { + "epoch": 1.4429080243033732, + "grad_norm": 0.343844983861686, + "learning_rate": 6.192956645936366e-05, + "loss": 2.8181, + "step": 30992 + }, + { + "epoch": 1.4429545824894663, + "grad_norm": 0.36966880857926265, + "learning_rate": 6.192693593344617e-05, + "loss": 2.8575, + "step": 30993 + }, + { + "epoch": 1.4430011406755594, + "grad_norm": 0.35791603043769327, + "learning_rate": 6.192430537252429e-05, + "loss": 2.7319, + "step": 30994 + }, + { + "epoch": 1.4430476988616523, + "grad_norm": 0.3322648807906789, + "learning_rate": 6.192167477660574e-05, + "loss": 2.687, + "step": 30995 + }, + { + "epoch": 1.4430942570477454, + "grad_norm": 0.35324585364783695, + "learning_rate": 
6.191904414569826e-05, + "loss": 2.8082, + "step": 30996 + }, + { + "epoch": 1.4431408152338385, + "grad_norm": 0.3565440037770284, + "learning_rate": 6.191641347980955e-05, + "loss": 2.8242, + "step": 30997 + }, + { + "epoch": 1.4431873734199316, + "grad_norm": 0.3387524257504678, + "learning_rate": 6.191378277894735e-05, + "loss": 2.6915, + "step": 30998 + }, + { + "epoch": 1.4432339316060245, + "grad_norm": 0.353199270769559, + "learning_rate": 6.191115204311937e-05, + "loss": 2.6661, + "step": 30999 + }, + { + "epoch": 1.4432804897921176, + "grad_norm": 0.3600734012352092, + "learning_rate": 6.190852127233331e-05, + "loss": 2.7245, + "step": 31000 + }, + { + "epoch": 1.4433270479782108, + "grad_norm": 0.3408776264857631, + "learning_rate": 6.190589046659695e-05, + "loss": 2.7379, + "step": 31001 + }, + { + "epoch": 1.4433736061643039, + "grad_norm": 0.33582058659504843, + "learning_rate": 6.190325962591795e-05, + "loss": 2.7062, + "step": 31002 + }, + { + "epoch": 1.443420164350397, + "grad_norm": 0.3389625965271597, + "learning_rate": 6.190062875030407e-05, + "loss": 2.6491, + "step": 31003 + }, + { + "epoch": 1.44346672253649, + "grad_norm": 0.31150590638801084, + "learning_rate": 6.189799783976301e-05, + "loss": 2.7845, + "step": 31004 + }, + { + "epoch": 1.443513280722583, + "grad_norm": 0.33395379226324184, + "learning_rate": 6.18953668943025e-05, + "loss": 2.7272, + "step": 31005 + }, + { + "epoch": 1.4435598389086761, + "grad_norm": 0.32108489114155325, + "learning_rate": 6.189273591393024e-05, + "loss": 2.7619, + "step": 31006 + }, + { + "epoch": 1.4436063970947692, + "grad_norm": 0.3559961743975721, + "learning_rate": 6.1890104898654e-05, + "loss": 2.695, + "step": 31007 + }, + { + "epoch": 1.4436529552808621, + "grad_norm": 0.35136887300718606, + "learning_rate": 6.188747384848147e-05, + "loss": 2.6407, + "step": 31008 + }, + { + "epoch": 1.4436995134669552, + "grad_norm": 0.33217839713024433, + "learning_rate": 6.188484276342036e-05, + "loss": 2.7809, + "step": 31009 + }, + { + "epoch": 1.4437460716530484, + "grad_norm": 0.3693526816003526, + "learning_rate": 6.188221164347842e-05, + "loss": 2.7713, + "step": 31010 + }, + { + "epoch": 1.4437926298391415, + "grad_norm": 0.3323808673871227, + "learning_rate": 6.187958048866336e-05, + "loss": 2.7484, + "step": 31011 + }, + { + "epoch": 1.4438391880252346, + "grad_norm": 0.3401307604214839, + "learning_rate": 6.187694929898289e-05, + "loss": 2.7052, + "step": 31012 + }, + { + "epoch": 1.4438857462113277, + "grad_norm": 0.3502920096025311, + "learning_rate": 6.187431807444475e-05, + "loss": 2.7018, + "step": 31013 + }, + { + "epoch": 1.4439323043974206, + "grad_norm": 0.35651945087888276, + "learning_rate": 6.187168681505667e-05, + "loss": 2.7323, + "step": 31014 + }, + { + "epoch": 1.4439788625835137, + "grad_norm": 0.3422238118149191, + "learning_rate": 6.186905552082634e-05, + "loss": 2.8022, + "step": 31015 + }, + { + "epoch": 1.4440254207696068, + "grad_norm": 0.3591550772265444, + "learning_rate": 6.18664241917615e-05, + "loss": 2.7675, + "step": 31016 + }, + { + "epoch": 1.4440719789557, + "grad_norm": 0.3500104268475604, + "learning_rate": 6.18637928278699e-05, + "loss": 2.7616, + "step": 31017 + }, + { + "epoch": 1.4441185371417928, + "grad_norm": 0.33072624315280047, + "learning_rate": 6.18611614291592e-05, + "loss": 2.7118, + "step": 31018 + }, + { + "epoch": 1.444165095327886, + "grad_norm": 0.35570866420051, + "learning_rate": 6.185852999563717e-05, + "loss": 2.7081, + "step": 31019 + }, + { + "epoch": 
1.444211653513979, + "grad_norm": 0.33594196289127404, + "learning_rate": 6.185589852731154e-05, + "loss": 2.7707, + "step": 31020 + }, + { + "epoch": 1.4442582117000722, + "grad_norm": 0.3112032419297898, + "learning_rate": 6.185326702419001e-05, + "loss": 2.7312, + "step": 31021 + }, + { + "epoch": 1.4443047698861653, + "grad_norm": 0.3556826345517436, + "learning_rate": 6.18506354862803e-05, + "loss": 2.7519, + "step": 31022 + }, + { + "epoch": 1.4443513280722584, + "grad_norm": 0.3339205064328415, + "learning_rate": 6.184800391359015e-05, + "loss": 2.7709, + "step": 31023 + }, + { + "epoch": 1.4443978862583513, + "grad_norm": 0.3689797198433419, + "learning_rate": 6.184537230612725e-05, + "loss": 2.7313, + "step": 31024 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.32317770080810837, + "learning_rate": 6.184274066389939e-05, + "loss": 2.7079, + "step": 31025 + }, + { + "epoch": 1.4444910026305375, + "grad_norm": 0.33021000166028547, + "learning_rate": 6.184010898691424e-05, + "loss": 2.709, + "step": 31026 + }, + { + "epoch": 1.4445375608166307, + "grad_norm": 0.31745932844521996, + "learning_rate": 6.183747727517954e-05, + "loss": 2.7787, + "step": 31027 + }, + { + "epoch": 1.4445841190027235, + "grad_norm": 0.36018727100225806, + "learning_rate": 6.1834845528703e-05, + "loss": 2.718, + "step": 31028 + }, + { + "epoch": 1.4446306771888167, + "grad_norm": 0.34546605945120135, + "learning_rate": 6.183221374749236e-05, + "loss": 2.7471, + "step": 31029 + }, + { + "epoch": 1.4446772353749098, + "grad_norm": 0.32623054306798055, + "learning_rate": 6.182958193155533e-05, + "loss": 2.6979, + "step": 31030 + }, + { + "epoch": 1.444723793561003, + "grad_norm": 0.3236968475506916, + "learning_rate": 6.182695008089964e-05, + "loss": 2.691, + "step": 31031 + }, + { + "epoch": 1.444770351747096, + "grad_norm": 0.3356140652901392, + "learning_rate": 6.182431819553303e-05, + "loss": 2.6409, + "step": 31032 + }, + { + "epoch": 1.4448169099331891, + "grad_norm": 0.3325858765857383, + "learning_rate": 6.182168627546319e-05, + "loss": 2.7847, + "step": 31033 + }, + { + "epoch": 1.444863468119282, + "grad_norm": 0.3335067158656762, + "learning_rate": 6.18190543206979e-05, + "loss": 2.7755, + "step": 31034 + }, + { + "epoch": 1.4449100263053751, + "grad_norm": 0.3300743916212851, + "learning_rate": 6.181642233124482e-05, + "loss": 2.7289, + "step": 31035 + }, + { + "epoch": 1.4449565844914682, + "grad_norm": 0.3269202406410659, + "learning_rate": 6.181379030711172e-05, + "loss": 2.7127, + "step": 31036 + }, + { + "epoch": 1.4450031426775614, + "grad_norm": 0.31524013155309916, + "learning_rate": 6.18111582483063e-05, + "loss": 2.8608, + "step": 31037 + }, + { + "epoch": 1.4450497008636543, + "grad_norm": 0.3614294151534917, + "learning_rate": 6.18085261548363e-05, + "loss": 2.7705, + "step": 31038 + }, + { + "epoch": 1.4450962590497474, + "grad_norm": 0.3338063045655896, + "learning_rate": 6.180589402670943e-05, + "loss": 2.692, + "step": 31039 + }, + { + "epoch": 1.4451428172358405, + "grad_norm": 0.2930327246547927, + "learning_rate": 6.180326186393344e-05, + "loss": 2.7024, + "step": 31040 + }, + { + "epoch": 1.4451893754219336, + "grad_norm": 0.35904114358453887, + "learning_rate": 6.180062966651603e-05, + "loss": 2.7106, + "step": 31041 + }, + { + "epoch": 1.4452359336080267, + "grad_norm": 0.33219663570830016, + "learning_rate": 6.179799743446494e-05, + "loss": 2.8382, + "step": 31042 + }, + { + "epoch": 1.4452824917941198, + "grad_norm": 0.34628494129870185, + "learning_rate": 
6.17953651677879e-05, + "loss": 2.7868, + "step": 31043 + }, + { + "epoch": 1.4453290499802127, + "grad_norm": 0.3918546057786502, + "learning_rate": 6.17927328664926e-05, + "loss": 2.7598, + "step": 31044 + }, + { + "epoch": 1.4453756081663058, + "grad_norm": 0.3317011437291141, + "learning_rate": 6.179010053058679e-05, + "loss": 2.7474, + "step": 31045 + }, + { + "epoch": 1.445422166352399, + "grad_norm": 0.40269466571243445, + "learning_rate": 6.17874681600782e-05, + "loss": 2.7258, + "step": 31046 + }, + { + "epoch": 1.4454687245384918, + "grad_norm": 0.33201998598433197, + "learning_rate": 6.178483575497457e-05, + "loss": 2.752, + "step": 31047 + }, + { + "epoch": 1.445515282724585, + "grad_norm": 0.30701560847505294, + "learning_rate": 6.178220331528361e-05, + "loss": 2.6711, + "step": 31048 + }, + { + "epoch": 1.445561840910678, + "grad_norm": 0.3490637650868459, + "learning_rate": 6.177957084101303e-05, + "loss": 2.8007, + "step": 31049 + }, + { + "epoch": 1.4456083990967712, + "grad_norm": 0.3231116343408275, + "learning_rate": 6.177693833217055e-05, + "loss": 2.6694, + "step": 31050 + }, + { + "epoch": 1.4456549572828643, + "grad_norm": 0.3469679706575545, + "learning_rate": 6.177430578876395e-05, + "loss": 2.7323, + "step": 31051 + }, + { + "epoch": 1.4457015154689574, + "grad_norm": 0.35368136050939186, + "learning_rate": 6.177167321080091e-05, + "loss": 2.8068, + "step": 31052 + }, + { + "epoch": 1.4457480736550505, + "grad_norm": 0.3514742414475975, + "learning_rate": 6.176904059828917e-05, + "loss": 2.8097, + "step": 31053 + }, + { + "epoch": 1.4457946318411434, + "grad_norm": 0.3543796754187917, + "learning_rate": 6.176640795123646e-05, + "loss": 2.7571, + "step": 31054 + }, + { + "epoch": 1.4458411900272365, + "grad_norm": 0.3440583265330192, + "learning_rate": 6.176377526965049e-05, + "loss": 2.7508, + "step": 31055 + }, + { + "epoch": 1.4458877482133297, + "grad_norm": 0.34901542907321204, + "learning_rate": 6.176114255353902e-05, + "loss": 2.757, + "step": 31056 + }, + { + "epoch": 1.4459343063994226, + "grad_norm": 0.33462895166867335, + "learning_rate": 6.175850980290974e-05, + "loss": 2.7735, + "step": 31057 + }, + { + "epoch": 1.4459808645855157, + "grad_norm": 0.36495417086230136, + "learning_rate": 6.175587701777039e-05, + "loss": 2.6625, + "step": 31058 + }, + { + "epoch": 1.4460274227716088, + "grad_norm": 0.32160955881531306, + "learning_rate": 6.17532441981287e-05, + "loss": 2.6682, + "step": 31059 + }, + { + "epoch": 1.446073980957702, + "grad_norm": 0.35719817155075695, + "learning_rate": 6.175061134399238e-05, + "loss": 2.7921, + "step": 31060 + }, + { + "epoch": 1.446120539143795, + "grad_norm": 0.3485301206007477, + "learning_rate": 6.174797845536921e-05, + "loss": 2.7019, + "step": 31061 + }, + { + "epoch": 1.4461670973298881, + "grad_norm": 0.35775036437990004, + "learning_rate": 6.174534553226685e-05, + "loss": 2.7612, + "step": 31062 + }, + { + "epoch": 1.446213655515981, + "grad_norm": 0.35167653153889844, + "learning_rate": 6.174271257469308e-05, + "loss": 2.7376, + "step": 31063 + }, + { + "epoch": 1.4462602137020741, + "grad_norm": 0.35785328312020404, + "learning_rate": 6.17400795826556e-05, + "loss": 2.82, + "step": 31064 + }, + { + "epoch": 1.4463067718881673, + "grad_norm": 0.35212975364809657, + "learning_rate": 6.173744655616212e-05, + "loss": 2.6775, + "step": 31065 + }, + { + "epoch": 1.4463533300742604, + "grad_norm": 0.34306486604845293, + "learning_rate": 6.173481349522042e-05, + "loss": 2.7786, + "step": 31066 + }, + { + "epoch": 
1.4463998882603533, + "grad_norm": 0.39114782340026916, + "learning_rate": 6.173218039983819e-05, + "loss": 2.7953, + "step": 31067 + }, + { + "epoch": 1.4464464464464464, + "grad_norm": 0.3285203264891972, + "learning_rate": 6.172954727002315e-05, + "loss": 2.7251, + "step": 31068 + }, + { + "epoch": 1.4464930046325395, + "grad_norm": 0.3733163925095157, + "learning_rate": 6.172691410578306e-05, + "loss": 2.6664, + "step": 31069 + }, + { + "epoch": 1.4465395628186326, + "grad_norm": 0.3501705360125606, + "learning_rate": 6.172428090712564e-05, + "loss": 2.7304, + "step": 31070 + }, + { + "epoch": 1.4465861210047257, + "grad_norm": 0.36261487222347977, + "learning_rate": 6.17216476740586e-05, + "loss": 2.7563, + "step": 31071 + }, + { + "epoch": 1.4466326791908188, + "grad_norm": 0.3424610284232994, + "learning_rate": 6.171901440658969e-05, + "loss": 2.6861, + "step": 31072 + }, + { + "epoch": 1.4466792373769117, + "grad_norm": 0.356932099496758, + "learning_rate": 6.17163811047266e-05, + "loss": 2.7066, + "step": 31073 + }, + { + "epoch": 1.4467257955630048, + "grad_norm": 0.3325558006045536, + "learning_rate": 6.171374776847711e-05, + "loss": 2.7916, + "step": 31074 + }, + { + "epoch": 1.446772353749098, + "grad_norm": 0.3583714158767743, + "learning_rate": 6.17111143978489e-05, + "loss": 2.6866, + "step": 31075 + }, + { + "epoch": 1.446818911935191, + "grad_norm": 0.3537517192170318, + "learning_rate": 6.170848099284974e-05, + "loss": 2.9016, + "step": 31076 + }, + { + "epoch": 1.446865470121284, + "grad_norm": 0.3460376985557814, + "learning_rate": 6.170584755348735e-05, + "loss": 2.762, + "step": 31077 + }, + { + "epoch": 1.446912028307377, + "grad_norm": 0.3601567650923395, + "learning_rate": 6.170321407976944e-05, + "loss": 2.7245, + "step": 31078 + }, + { + "epoch": 1.4469585864934702, + "grad_norm": 0.34735628362301374, + "learning_rate": 6.170058057170377e-05, + "loss": 2.7861, + "step": 31079 + }, + { + "epoch": 1.4470051446795633, + "grad_norm": 0.3328031341251307, + "learning_rate": 6.169794702929803e-05, + "loss": 2.7659, + "step": 31080 + }, + { + "epoch": 1.4470517028656564, + "grad_norm": 0.35968703132972796, + "learning_rate": 6.169531345255997e-05, + "loss": 2.6693, + "step": 31081 + }, + { + "epoch": 1.4470982610517495, + "grad_norm": 0.32535551039772476, + "learning_rate": 6.16926798414973e-05, + "loss": 2.8555, + "step": 31082 + }, + { + "epoch": 1.4471448192378424, + "grad_norm": 0.3781041622369061, + "learning_rate": 6.16900461961178e-05, + "loss": 2.6637, + "step": 31083 + }, + { + "epoch": 1.4471913774239356, + "grad_norm": 0.32524787602211336, + "learning_rate": 6.168741251642915e-05, + "loss": 2.689, + "step": 31084 + }, + { + "epoch": 1.4472379356100287, + "grad_norm": 0.3738474868667356, + "learning_rate": 6.168477880243908e-05, + "loss": 2.7828, + "step": 31085 + }, + { + "epoch": 1.4472844937961218, + "grad_norm": 0.3381621641929969, + "learning_rate": 6.168214505415534e-05, + "loss": 2.713, + "step": 31086 + }, + { + "epoch": 1.4473310519822147, + "grad_norm": 0.35457565958865966, + "learning_rate": 6.167951127158566e-05, + "loss": 2.6799, + "step": 31087 + }, + { + "epoch": 1.4473776101683078, + "grad_norm": 0.3370766782361098, + "learning_rate": 6.167687745473779e-05, + "loss": 2.6973, + "step": 31088 + }, + { + "epoch": 1.447424168354401, + "grad_norm": 0.3842869358405753, + "learning_rate": 6.16742436036194e-05, + "loss": 2.768, + "step": 31089 + }, + { + "epoch": 1.447470726540494, + "grad_norm": 0.33716684378998757, + "learning_rate": 
6.167160971823827e-05, + "loss": 2.7792, + "step": 31090 + }, + { + "epoch": 1.4475172847265871, + "grad_norm": 0.34510482064288595, + "learning_rate": 6.166897579860214e-05, + "loss": 2.7017, + "step": 31091 + }, + { + "epoch": 1.4475638429126803, + "grad_norm": 0.3459189772216682, + "learning_rate": 6.166634184471869e-05, + "loss": 2.6677, + "step": 31092 + }, + { + "epoch": 1.4476104010987731, + "grad_norm": 0.33445516114912627, + "learning_rate": 6.166370785659568e-05, + "loss": 2.784, + "step": 31093 + }, + { + "epoch": 1.4476569592848663, + "grad_norm": 0.3505230848728178, + "learning_rate": 6.166107383424082e-05, + "loss": 2.7531, + "step": 31094 + }, + { + "epoch": 1.4477035174709594, + "grad_norm": 0.31501267986565396, + "learning_rate": 6.165843977766186e-05, + "loss": 2.81, + "step": 31095 + }, + { + "epoch": 1.4477500756570523, + "grad_norm": 0.34857997461135376, + "learning_rate": 6.165580568686654e-05, + "loss": 2.6717, + "step": 31096 + }, + { + "epoch": 1.4477966338431454, + "grad_norm": 0.3401058666003254, + "learning_rate": 6.165317156186259e-05, + "loss": 2.804, + "step": 31097 + }, + { + "epoch": 1.4478431920292385, + "grad_norm": 0.3353508819700731, + "learning_rate": 6.165053740265771e-05, + "loss": 2.666, + "step": 31098 + }, + { + "epoch": 1.4478897502153316, + "grad_norm": 0.32709051161910674, + "learning_rate": 6.164790320925966e-05, + "loss": 2.7287, + "step": 31099 + }, + { + "epoch": 1.4479363084014247, + "grad_norm": 0.3578828241023681, + "learning_rate": 6.164526898167615e-05, + "loss": 2.7418, + "step": 31100 + }, + { + "epoch": 1.4479828665875178, + "grad_norm": 0.3067881614872079, + "learning_rate": 6.164263471991494e-05, + "loss": 2.6653, + "step": 31101 + }, + { + "epoch": 1.4480294247736107, + "grad_norm": 0.30455623959592687, + "learning_rate": 6.164000042398374e-05, + "loss": 2.74, + "step": 31102 + }, + { + "epoch": 1.4480759829597039, + "grad_norm": 0.4020209773420776, + "learning_rate": 6.163736609389028e-05, + "loss": 2.7722, + "step": 31103 + }, + { + "epoch": 1.448122541145797, + "grad_norm": 0.33163120471577356, + "learning_rate": 6.163473172964229e-05, + "loss": 2.6134, + "step": 31104 + }, + { + "epoch": 1.44816909933189, + "grad_norm": 0.3933710977253118, + "learning_rate": 6.163209733124752e-05, + "loss": 2.8242, + "step": 31105 + }, + { + "epoch": 1.448215657517983, + "grad_norm": 0.3406092648407081, + "learning_rate": 6.162946289871369e-05, + "loss": 2.8405, + "step": 31106 + }, + { + "epoch": 1.448262215704076, + "grad_norm": 0.41915987504060637, + "learning_rate": 6.162682843204853e-05, + "loss": 2.7, + "step": 31107 + }, + { + "epoch": 1.4483087738901692, + "grad_norm": 0.3204255059471327, + "learning_rate": 6.162419393125977e-05, + "loss": 2.7344, + "step": 31108 + }, + { + "epoch": 1.4483553320762623, + "grad_norm": 0.391046510592853, + "learning_rate": 6.162155939635515e-05, + "loss": 2.7025, + "step": 31109 + }, + { + "epoch": 1.4484018902623554, + "grad_norm": 0.3328884222950663, + "learning_rate": 6.16189248273424e-05, + "loss": 2.7238, + "step": 31110 + }, + { + "epoch": 1.4484484484484486, + "grad_norm": 0.3864139901446084, + "learning_rate": 6.161629022422924e-05, + "loss": 2.8027, + "step": 31111 + }, + { + "epoch": 1.4484950066345414, + "grad_norm": 0.34058736590576427, + "learning_rate": 6.161365558702342e-05, + "loss": 2.7579, + "step": 31112 + }, + { + "epoch": 1.4485415648206346, + "grad_norm": 0.3373190047833088, + "learning_rate": 6.161102091573266e-05, + "loss": 2.7958, + "step": 31113 + }, + { + "epoch": 
1.4485881230067277, + "grad_norm": 0.3562247810473777, + "learning_rate": 6.16083862103647e-05, + "loss": 2.802, + "step": 31114 + }, + { + "epoch": 1.4486346811928208, + "grad_norm": 0.32767262842951866, + "learning_rate": 6.160575147092728e-05, + "loss": 2.8137, + "step": 31115 + }, + { + "epoch": 1.4486812393789137, + "grad_norm": 0.3505547344048952, + "learning_rate": 6.160311669742811e-05, + "loss": 2.7864, + "step": 31116 + }, + { + "epoch": 1.4487277975650068, + "grad_norm": 0.317987723518522, + "learning_rate": 6.160048188987495e-05, + "loss": 2.7714, + "step": 31117 + }, + { + "epoch": 1.4487743557511, + "grad_norm": 0.32231323532329736, + "learning_rate": 6.15978470482755e-05, + "loss": 2.8411, + "step": 31118 + }, + { + "epoch": 1.448820913937193, + "grad_norm": 0.33435071921093584, + "learning_rate": 6.159521217263753e-05, + "loss": 2.7206, + "step": 31119 + }, + { + "epoch": 1.4488674721232861, + "grad_norm": 0.33752799919427046, + "learning_rate": 6.159257726296874e-05, + "loss": 2.7356, + "step": 31120 + }, + { + "epoch": 1.4489140303093793, + "grad_norm": 0.3216919841289087, + "learning_rate": 6.158994231927689e-05, + "loss": 2.6868, + "step": 31121 + }, + { + "epoch": 1.4489605884954722, + "grad_norm": 0.3824425568955406, + "learning_rate": 6.15873073415697e-05, + "loss": 2.7784, + "step": 31122 + }, + { + "epoch": 1.4490071466815653, + "grad_norm": 0.3454273497105467, + "learning_rate": 6.158467232985489e-05, + "loss": 2.7934, + "step": 31123 + }, + { + "epoch": 1.4490537048676584, + "grad_norm": 0.325612755769251, + "learning_rate": 6.158203728414022e-05, + "loss": 2.7561, + "step": 31124 + }, + { + "epoch": 1.4491002630537515, + "grad_norm": 0.3577463205163639, + "learning_rate": 6.15794022044334e-05, + "loss": 2.7872, + "step": 31125 + }, + { + "epoch": 1.4491468212398444, + "grad_norm": 0.3561809296698525, + "learning_rate": 6.157676709074219e-05, + "loss": 2.7568, + "step": 31126 + }, + { + "epoch": 1.4491933794259375, + "grad_norm": 0.32469034018403786, + "learning_rate": 6.157413194307428e-05, + "loss": 2.7863, + "step": 31127 + }, + { + "epoch": 1.4492399376120306, + "grad_norm": 0.3679729621393764, + "learning_rate": 6.157149676143747e-05, + "loss": 2.7287, + "step": 31128 + }, + { + "epoch": 1.4492864957981237, + "grad_norm": 0.3274969286810169, + "learning_rate": 6.156886154583943e-05, + "loss": 2.8141, + "step": 31129 + }, + { + "epoch": 1.4493330539842169, + "grad_norm": 0.330822165247754, + "learning_rate": 6.156622629628796e-05, + "loss": 2.7191, + "step": 31130 + }, + { + "epoch": 1.44937961217031, + "grad_norm": 0.38659037372620536, + "learning_rate": 6.156359101279072e-05, + "loss": 2.6673, + "step": 31131 + }, + { + "epoch": 1.4494261703564029, + "grad_norm": 0.31909308772533307, + "learning_rate": 6.156095569535548e-05, + "loss": 2.7731, + "step": 31132 + }, + { + "epoch": 1.449472728542496, + "grad_norm": 0.35826765903110774, + "learning_rate": 6.155832034398999e-05, + "loss": 2.7363, + "step": 31133 + }, + { + "epoch": 1.449519286728589, + "grad_norm": 0.3197203515324091, + "learning_rate": 6.155568495870197e-05, + "loss": 2.7002, + "step": 31134 + }, + { + "epoch": 1.449565844914682, + "grad_norm": 0.38002296334683205, + "learning_rate": 6.155304953949913e-05, + "loss": 2.8036, + "step": 31135 + }, + { + "epoch": 1.449612403100775, + "grad_norm": 0.3408346823200666, + "learning_rate": 6.155041408638925e-05, + "loss": 2.7892, + "step": 31136 + }, + { + "epoch": 1.4496589612868682, + "grad_norm": 0.3502174181134108, + "learning_rate": 
6.154777859938004e-05, + "loss": 2.6598, + "step": 31137 + }, + { + "epoch": 1.4497055194729613, + "grad_norm": 0.35695655389603365, + "learning_rate": 6.154514307847923e-05, + "loss": 2.87, + "step": 31138 + }, + { + "epoch": 1.4497520776590545, + "grad_norm": 0.3275676085910098, + "learning_rate": 6.154250752369455e-05, + "loss": 2.7576, + "step": 31139 + }, + { + "epoch": 1.4497986358451476, + "grad_norm": 0.363601368106426, + "learning_rate": 6.153987193503377e-05, + "loss": 2.6528, + "step": 31140 + }, + { + "epoch": 1.4498451940312407, + "grad_norm": 0.3524602426678834, + "learning_rate": 6.15372363125046e-05, + "loss": 2.7462, + "step": 31141 + }, + { + "epoch": 1.4498917522173336, + "grad_norm": 0.33369369841509217, + "learning_rate": 6.153460065611476e-05, + "loss": 2.6967, + "step": 31142 + }, + { + "epoch": 1.4499383104034267, + "grad_norm": 0.3377119318464403, + "learning_rate": 6.153196496587202e-05, + "loss": 2.8055, + "step": 31143 + }, + { + "epoch": 1.4499848685895198, + "grad_norm": 0.36495909529372117, + "learning_rate": 6.152932924178409e-05, + "loss": 2.7811, + "step": 31144 + }, + { + "epoch": 1.4500314267756127, + "grad_norm": 0.3379341002365885, + "learning_rate": 6.152669348385874e-05, + "loss": 2.8165, + "step": 31145 + }, + { + "epoch": 1.4500779849617058, + "grad_norm": 0.3852388328455601, + "learning_rate": 6.152405769210366e-05, + "loss": 2.7666, + "step": 31146 + }, + { + "epoch": 1.450124543147799, + "grad_norm": 0.3129025309968202, + "learning_rate": 6.15214218665266e-05, + "loss": 2.7509, + "step": 31147 + }, + { + "epoch": 1.450171101333892, + "grad_norm": 0.40166104162683897, + "learning_rate": 6.15187860071353e-05, + "loss": 2.7643, + "step": 31148 + }, + { + "epoch": 1.4502176595199852, + "grad_norm": 0.3375235161394664, + "learning_rate": 6.15161501139375e-05, + "loss": 2.7683, + "step": 31149 + }, + { + "epoch": 1.4502642177060783, + "grad_norm": 0.40896897502280066, + "learning_rate": 6.151351418694093e-05, + "loss": 2.6444, + "step": 31150 + }, + { + "epoch": 1.4503107758921712, + "grad_norm": 0.3336880202662311, + "learning_rate": 6.151087822615335e-05, + "loss": 2.6887, + "step": 31151 + }, + { + "epoch": 1.4503573340782643, + "grad_norm": 0.3868780885845355, + "learning_rate": 6.150824223158247e-05, + "loss": 2.6708, + "step": 31152 + }, + { + "epoch": 1.4504038922643574, + "grad_norm": 0.33147816897275173, + "learning_rate": 6.150560620323602e-05, + "loss": 2.7547, + "step": 31153 + }, + { + "epoch": 1.4504504504504505, + "grad_norm": 0.36260739036253803, + "learning_rate": 6.150297014112177e-05, + "loss": 2.6213, + "step": 31154 + }, + { + "epoch": 1.4504970086365434, + "grad_norm": 0.30827115221872875, + "learning_rate": 6.150033404524743e-05, + "loss": 2.7615, + "step": 31155 + }, + { + "epoch": 1.4505435668226365, + "grad_norm": 0.3337832282086594, + "learning_rate": 6.149769791562073e-05, + "loss": 2.7892, + "step": 31156 + }, + { + "epoch": 1.4505901250087296, + "grad_norm": 0.3136635435704032, + "learning_rate": 6.149506175224943e-05, + "loss": 2.7831, + "step": 31157 + }, + { + "epoch": 1.4506366831948228, + "grad_norm": 0.3414335922792916, + "learning_rate": 6.149242555514124e-05, + "loss": 2.7536, + "step": 31158 + }, + { + "epoch": 1.4506832413809159, + "grad_norm": 0.33873080614434564, + "learning_rate": 6.148978932430392e-05, + "loss": 2.7549, + "step": 31159 + }, + { + "epoch": 1.450729799567009, + "grad_norm": 0.3494754790240007, + "learning_rate": 6.148715305974522e-05, + "loss": 2.734, + "step": 31160 + }, + { + "epoch": 
1.4507763577531019, + "grad_norm": 0.34411190388522517, + "learning_rate": 6.148451676147284e-05, + "loss": 2.7939, + "step": 31161 + }, + { + "epoch": 1.450822915939195, + "grad_norm": 0.3447994838941994, + "learning_rate": 6.148188042949453e-05, + "loss": 2.7133, + "step": 31162 + }, + { + "epoch": 1.450869474125288, + "grad_norm": 0.37552094871233077, + "learning_rate": 6.147924406381804e-05, + "loss": 2.7579, + "step": 31163 + }, + { + "epoch": 1.4509160323113812, + "grad_norm": 0.32443283099871445, + "learning_rate": 6.14766076644511e-05, + "loss": 2.7292, + "step": 31164 + }, + { + "epoch": 1.4509625904974741, + "grad_norm": 0.3676024675939542, + "learning_rate": 6.147397123140145e-05, + "loss": 2.7383, + "step": 31165 + }, + { + "epoch": 1.4510091486835672, + "grad_norm": 0.35347463708470866, + "learning_rate": 6.147133476467682e-05, + "loss": 2.675, + "step": 31166 + }, + { + "epoch": 1.4510557068696603, + "grad_norm": 0.31580434291722476, + "learning_rate": 6.146869826428496e-05, + "loss": 2.732, + "step": 31167 + }, + { + "epoch": 1.4511022650557535, + "grad_norm": 0.3575970829783053, + "learning_rate": 6.146606173023359e-05, + "loss": 2.6679, + "step": 31168 + }, + { + "epoch": 1.4511488232418466, + "grad_norm": 0.3414222784064015, + "learning_rate": 6.146342516253047e-05, + "loss": 2.7611, + "step": 31169 + }, + { + "epoch": 1.4511953814279397, + "grad_norm": 0.34272265239555805, + "learning_rate": 6.146078856118331e-05, + "loss": 2.6841, + "step": 31170 + }, + { + "epoch": 1.4512419396140326, + "grad_norm": 0.32739250322991614, + "learning_rate": 6.145815192619989e-05, + "loss": 2.6955, + "step": 31171 + }, + { + "epoch": 1.4512884978001257, + "grad_norm": 0.3372036127757552, + "learning_rate": 6.14555152575879e-05, + "loss": 2.8383, + "step": 31172 + }, + { + "epoch": 1.4513350559862188, + "grad_norm": 0.33729871606482886, + "learning_rate": 6.145287855535511e-05, + "loss": 2.7796, + "step": 31173 + }, + { + "epoch": 1.451381614172312, + "grad_norm": 0.3184017610444734, + "learning_rate": 6.145024181950923e-05, + "loss": 2.7324, + "step": 31174 + }, + { + "epoch": 1.4514281723584048, + "grad_norm": 0.31740405665832566, + "learning_rate": 6.144760505005804e-05, + "loss": 2.7198, + "step": 31175 + }, + { + "epoch": 1.451474730544498, + "grad_norm": 0.2955146282892549, + "learning_rate": 6.144496824700925e-05, + "loss": 2.664, + "step": 31176 + }, + { + "epoch": 1.451521288730591, + "grad_norm": 0.3205977771069317, + "learning_rate": 6.144233141037061e-05, + "loss": 2.7993, + "step": 31177 + }, + { + "epoch": 1.4515678469166842, + "grad_norm": 0.3273242590259075, + "learning_rate": 6.143969454014986e-05, + "loss": 2.769, + "step": 31178 + }, + { + "epoch": 1.4516144051027773, + "grad_norm": 0.30786779067763137, + "learning_rate": 6.143705763635473e-05, + "loss": 2.6214, + "step": 31179 + }, + { + "epoch": 1.4516609632888704, + "grad_norm": 0.3367898540243633, + "learning_rate": 6.143442069899296e-05, + "loss": 2.7011, + "step": 31180 + }, + { + "epoch": 1.4517075214749633, + "grad_norm": 0.34007350102862016, + "learning_rate": 6.143178372807228e-05, + "loss": 2.7395, + "step": 31181 + }, + { + "epoch": 1.4517540796610564, + "grad_norm": 0.35029203665214637, + "learning_rate": 6.142914672360046e-05, + "loss": 2.6444, + "step": 31182 + }, + { + "epoch": 1.4518006378471495, + "grad_norm": 0.31354949181898395, + "learning_rate": 6.14265096855852e-05, + "loss": 2.7274, + "step": 31183 + }, + { + "epoch": 1.4518471960332424, + "grad_norm": 0.3377575026514169, + "learning_rate": 
6.142387261403428e-05, + "loss": 2.7787, + "step": 31184 + }, + { + "epoch": 1.4518937542193355, + "grad_norm": 0.31865833490078604, + "learning_rate": 6.142123550895542e-05, + "loss": 2.6182, + "step": 31185 + }, + { + "epoch": 1.4519403124054286, + "grad_norm": 0.32498892681966457, + "learning_rate": 6.141859837035634e-05, + "loss": 2.6966, + "step": 31186 + }, + { + "epoch": 1.4519868705915218, + "grad_norm": 0.33190623171554184, + "learning_rate": 6.141596119824483e-05, + "loss": 2.757, + "step": 31187 + }, + { + "epoch": 1.4520334287776149, + "grad_norm": 0.3078662523285555, + "learning_rate": 6.141332399262856e-05, + "loss": 2.8064, + "step": 31188 + }, + { + "epoch": 1.452079986963708, + "grad_norm": 0.3293648963743334, + "learning_rate": 6.141068675351533e-05, + "loss": 2.6836, + "step": 31189 + }, + { + "epoch": 1.4521265451498009, + "grad_norm": 0.3314801325434306, + "learning_rate": 6.140804948091285e-05, + "loss": 2.7914, + "step": 31190 + }, + { + "epoch": 1.452173103335894, + "grad_norm": 0.36826778673040095, + "learning_rate": 6.140541217482887e-05, + "loss": 2.8277, + "step": 31191 + }, + { + "epoch": 1.4522196615219871, + "grad_norm": 0.37021700146909514, + "learning_rate": 6.140277483527113e-05, + "loss": 2.7714, + "step": 31192 + }, + { + "epoch": 1.4522662197080802, + "grad_norm": 0.3587447801879137, + "learning_rate": 6.140013746224738e-05, + "loss": 2.834, + "step": 31193 + }, + { + "epoch": 1.4523127778941731, + "grad_norm": 0.36806306458873483, + "learning_rate": 6.139750005576535e-05, + "loss": 2.7773, + "step": 31194 + }, + { + "epoch": 1.4523593360802662, + "grad_norm": 0.35035132712387995, + "learning_rate": 6.139486261583276e-05, + "loss": 2.7944, + "step": 31195 + }, + { + "epoch": 1.4524058942663594, + "grad_norm": 0.37844832397834144, + "learning_rate": 6.13922251424574e-05, + "loss": 2.6165, + "step": 31196 + }, + { + "epoch": 1.4524524524524525, + "grad_norm": 0.36251495417405466, + "learning_rate": 6.138958763564695e-05, + "loss": 2.7308, + "step": 31197 + }, + { + "epoch": 1.4524990106385456, + "grad_norm": 0.38722721910323205, + "learning_rate": 6.138695009540919e-05, + "loss": 2.7782, + "step": 31198 + }, + { + "epoch": 1.4525455688246387, + "grad_norm": 0.37905856693152024, + "learning_rate": 6.138431252175185e-05, + "loss": 2.6532, + "step": 31199 + }, + { + "epoch": 1.4525921270107316, + "grad_norm": 0.372989122927206, + "learning_rate": 6.13816749146827e-05, + "loss": 2.7481, + "step": 31200 + }, + { + "epoch": 1.4526386851968247, + "grad_norm": 0.35566703515539744, + "learning_rate": 6.137903727420943e-05, + "loss": 2.7722, + "step": 31201 + }, + { + "epoch": 1.4526852433829178, + "grad_norm": 0.3608928984464229, + "learning_rate": 6.137639960033981e-05, + "loss": 2.8169, + "step": 31202 + }, + { + "epoch": 1.452731801569011, + "grad_norm": 0.3412926636953329, + "learning_rate": 6.137376189308158e-05, + "loss": 2.6979, + "step": 31203 + }, + { + "epoch": 1.4527783597551038, + "grad_norm": 0.3331320725223831, + "learning_rate": 6.13711241524425e-05, + "loss": 2.7935, + "step": 31204 + }, + { + "epoch": 1.452824917941197, + "grad_norm": 0.3388362917825301, + "learning_rate": 6.136848637843027e-05, + "loss": 2.8202, + "step": 31205 + }, + { + "epoch": 1.45287147612729, + "grad_norm": 0.3327519418446245, + "learning_rate": 6.136584857105265e-05, + "loss": 2.7068, + "step": 31206 + }, + { + "epoch": 1.4529180343133832, + "grad_norm": 0.34867863498522106, + "learning_rate": 6.13632107303174e-05, + "loss": 2.6751, + "step": 31207 + }, + { + "epoch": 
1.4529645924994763, + "grad_norm": 0.3236885660098381, + "learning_rate": 6.136057285623223e-05, + "loss": 2.7921, + "step": 31208 + }, + { + "epoch": 1.4530111506855694, + "grad_norm": 0.38164288510852407, + "learning_rate": 6.13579349488049e-05, + "loss": 2.7335, + "step": 31209 + }, + { + "epoch": 1.4530577088716623, + "grad_norm": 0.32159614781244233, + "learning_rate": 6.135529700804317e-05, + "loss": 2.6668, + "step": 31210 + }, + { + "epoch": 1.4531042670577554, + "grad_norm": 0.3501060561652411, + "learning_rate": 6.135265903395473e-05, + "loss": 2.7241, + "step": 31211 + }, + { + "epoch": 1.4531508252438485, + "grad_norm": 0.3474943386175126, + "learning_rate": 6.135002102654737e-05, + "loss": 2.7625, + "step": 31212 + }, + { + "epoch": 1.4531973834299416, + "grad_norm": 0.34407040706086306, + "learning_rate": 6.134738298582882e-05, + "loss": 2.8274, + "step": 31213 + }, + { + "epoch": 1.4532439416160345, + "grad_norm": 0.334009748768223, + "learning_rate": 6.134474491180681e-05, + "loss": 2.7331, + "step": 31214 + }, + { + "epoch": 1.4532904998021277, + "grad_norm": 0.3327379832684132, + "learning_rate": 6.134210680448909e-05, + "loss": 2.7674, + "step": 31215 + }, + { + "epoch": 1.4533370579882208, + "grad_norm": 0.36910180302982376, + "learning_rate": 6.133946866388342e-05, + "loss": 2.7604, + "step": 31216 + }, + { + "epoch": 1.4533836161743139, + "grad_norm": 0.3299686425918083, + "learning_rate": 6.133683048999752e-05, + "loss": 2.7821, + "step": 31217 + }, + { + "epoch": 1.453430174360407, + "grad_norm": 0.3567768311181036, + "learning_rate": 6.133419228283913e-05, + "loss": 2.7004, + "step": 31218 + }, + { + "epoch": 1.4534767325465001, + "grad_norm": 0.31605555097247273, + "learning_rate": 6.133155404241602e-05, + "loss": 2.7436, + "step": 31219 + }, + { + "epoch": 1.453523290732593, + "grad_norm": 0.3178952573266489, + "learning_rate": 6.13289157687359e-05, + "loss": 2.6801, + "step": 31220 + }, + { + "epoch": 1.4535698489186861, + "grad_norm": 0.36226045861328526, + "learning_rate": 6.132627746180653e-05, + "loss": 2.7596, + "step": 31221 + }, + { + "epoch": 1.4536164071047792, + "grad_norm": 0.34708182013048067, + "learning_rate": 6.132363912163566e-05, + "loss": 2.8492, + "step": 31222 + }, + { + "epoch": 1.4536629652908721, + "grad_norm": 0.364889616392616, + "learning_rate": 6.132100074823102e-05, + "loss": 2.8375, + "step": 31223 + }, + { + "epoch": 1.4537095234769652, + "grad_norm": 0.3459156398448149, + "learning_rate": 6.131836234160035e-05, + "loss": 2.7551, + "step": 31224 + }, + { + "epoch": 1.4537560816630584, + "grad_norm": 0.3599756777357531, + "learning_rate": 6.131572390175141e-05, + "loss": 2.7819, + "step": 31225 + }, + { + "epoch": 1.4538026398491515, + "grad_norm": 0.3108830104331953, + "learning_rate": 6.131308542869193e-05, + "loss": 2.8172, + "step": 31226 + }, + { + "epoch": 1.4538491980352446, + "grad_norm": 0.3574153945804848, + "learning_rate": 6.131044692242967e-05, + "loss": 2.8039, + "step": 31227 + }, + { + "epoch": 1.4538957562213377, + "grad_norm": 0.3232943998568134, + "learning_rate": 6.130780838297234e-05, + "loss": 2.7304, + "step": 31228 + }, + { + "epoch": 1.4539423144074308, + "grad_norm": 0.33453811648078874, + "learning_rate": 6.130516981032773e-05, + "loss": 2.7801, + "step": 31229 + }, + { + "epoch": 1.4539888725935237, + "grad_norm": 0.33106568146334214, + "learning_rate": 6.130253120450354e-05, + "loss": 2.8791, + "step": 31230 + }, + { + "epoch": 1.4540354307796168, + "grad_norm": 0.3465790843205633, + "learning_rate": 
6.129989256550756e-05, + "loss": 2.84, + "step": 31231 + }, + { + "epoch": 1.45408198896571, + "grad_norm": 0.32197222674812753, + "learning_rate": 6.12972538933475e-05, + "loss": 2.7374, + "step": 31232 + }, + { + "epoch": 1.4541285471518028, + "grad_norm": 0.32007228300053253, + "learning_rate": 6.12946151880311e-05, + "loss": 2.7801, + "step": 31233 + }, + { + "epoch": 1.454175105337896, + "grad_norm": 0.36148900671308765, + "learning_rate": 6.129197644956612e-05, + "loss": 2.7364, + "step": 31234 + }, + { + "epoch": 1.454221663523989, + "grad_norm": 0.3509984268109123, + "learning_rate": 6.12893376779603e-05, + "loss": 2.7838, + "step": 31235 + }, + { + "epoch": 1.4542682217100822, + "grad_norm": 0.338414810260501, + "learning_rate": 6.128669887322139e-05, + "loss": 2.8154, + "step": 31236 + }, + { + "epoch": 1.4543147798961753, + "grad_norm": 0.3307032719259685, + "learning_rate": 6.128406003535714e-05, + "loss": 2.6202, + "step": 31237 + }, + { + "epoch": 1.4543613380822684, + "grad_norm": 0.3463582704303479, + "learning_rate": 6.128142116437528e-05, + "loss": 2.7694, + "step": 31238 + }, + { + "epoch": 1.4544078962683613, + "grad_norm": 0.37512251239727795, + "learning_rate": 6.127878226028355e-05, + "loss": 2.7548, + "step": 31239 + }, + { + "epoch": 1.4544544544544544, + "grad_norm": 0.33625369691149976, + "learning_rate": 6.127614332308973e-05, + "loss": 2.7112, + "step": 31240 + }, + { + "epoch": 1.4545010126405475, + "grad_norm": 0.36911382559234934, + "learning_rate": 6.127350435280151e-05, + "loss": 2.7422, + "step": 31241 + }, + { + "epoch": 1.4545475708266407, + "grad_norm": 0.31749316857583487, + "learning_rate": 6.127086534942669e-05, + "loss": 2.7479, + "step": 31242 + }, + { + "epoch": 1.4545941290127335, + "grad_norm": 0.38003869404615803, + "learning_rate": 6.126822631297297e-05, + "loss": 2.7279, + "step": 31243 + }, + { + "epoch": 1.4546406871988267, + "grad_norm": 0.3379715656986246, + "learning_rate": 6.126558724344813e-05, + "loss": 2.7872, + "step": 31244 + }, + { + "epoch": 1.4546872453849198, + "grad_norm": 0.35063872827529835, + "learning_rate": 6.12629481408599e-05, + "loss": 2.7327, + "step": 31245 + }, + { + "epoch": 1.454733803571013, + "grad_norm": 0.33889914603648663, + "learning_rate": 6.126030900521603e-05, + "loss": 2.7113, + "step": 31246 + }, + { + "epoch": 1.454780361757106, + "grad_norm": 0.3295716346131605, + "learning_rate": 6.125766983652426e-05, + "loss": 2.7021, + "step": 31247 + }, + { + "epoch": 1.4548269199431991, + "grad_norm": 0.36102366627908783, + "learning_rate": 6.125503063479232e-05, + "loss": 2.7198, + "step": 31248 + }, + { + "epoch": 1.454873478129292, + "grad_norm": 0.3423447607893282, + "learning_rate": 6.125239140002799e-05, + "loss": 2.7748, + "step": 31249 + }, + { + "epoch": 1.4549200363153851, + "grad_norm": 0.3473656318696598, + "learning_rate": 6.1249752132239e-05, + "loss": 2.7499, + "step": 31250 + }, + { + "epoch": 1.4549665945014782, + "grad_norm": 0.3341509786383225, + "learning_rate": 6.12471128314331e-05, + "loss": 2.6774, + "step": 31251 + }, + { + "epoch": 1.4550131526875714, + "grad_norm": 0.35364330923462156, + "learning_rate": 6.124447349761801e-05, + "loss": 2.7752, + "step": 31252 + }, + { + "epoch": 1.4550597108736643, + "grad_norm": 0.35264293096201965, + "learning_rate": 6.124183413080152e-05, + "loss": 2.7164, + "step": 31253 + }, + { + "epoch": 1.4551062690597574, + "grad_norm": 0.35290348790376813, + "learning_rate": 6.123919473099133e-05, + "loss": 2.7293, + "step": 31254 + }, + { + "epoch": 
1.4551528272458505, + "grad_norm": 0.35205012683001063, + "learning_rate": 6.123655529819524e-05, + "loss": 2.6949, + "step": 31255 + }, + { + "epoch": 1.4551993854319436, + "grad_norm": 0.3392598771658006, + "learning_rate": 6.123391583242097e-05, + "loss": 2.8285, + "step": 31256 + }, + { + "epoch": 1.4552459436180367, + "grad_norm": 0.3605728137901722, + "learning_rate": 6.123127633367625e-05, + "loss": 2.7594, + "step": 31257 + }, + { + "epoch": 1.4552925018041298, + "grad_norm": 0.36327558297515794, + "learning_rate": 6.122863680196883e-05, + "loss": 2.6718, + "step": 31258 + }, + { + "epoch": 1.4553390599902227, + "grad_norm": 0.37507137459139467, + "learning_rate": 6.122599723730648e-05, + "loss": 2.7345, + "step": 31259 + }, + { + "epoch": 1.4553856181763158, + "grad_norm": 0.30236954404598604, + "learning_rate": 6.122335763969693e-05, + "loss": 2.6449, + "step": 31260 + }, + { + "epoch": 1.455432176362409, + "grad_norm": 0.3806211412422196, + "learning_rate": 6.122071800914793e-05, + "loss": 2.7312, + "step": 31261 + }, + { + "epoch": 1.455478734548502, + "grad_norm": 0.321803425763198, + "learning_rate": 6.121807834566723e-05, + "loss": 2.8146, + "step": 31262 + }, + { + "epoch": 1.455525292734595, + "grad_norm": 0.38971287465750154, + "learning_rate": 6.121543864926257e-05, + "loss": 2.8803, + "step": 31263 + }, + { + "epoch": 1.455571850920688, + "grad_norm": 0.32116288672590654, + "learning_rate": 6.121279891994171e-05, + "loss": 2.8113, + "step": 31264 + }, + { + "epoch": 1.4556184091067812, + "grad_norm": 0.3621146933435823, + "learning_rate": 6.12101591577124e-05, + "loss": 2.6492, + "step": 31265 + }, + { + "epoch": 1.4556649672928743, + "grad_norm": 0.3279042060728751, + "learning_rate": 6.120751936258236e-05, + "loss": 2.7193, + "step": 31266 + }, + { + "epoch": 1.4557115254789674, + "grad_norm": 0.3771968344191898, + "learning_rate": 6.120487953455935e-05, + "loss": 2.7302, + "step": 31267 + }, + { + "epoch": 1.4557580836650605, + "grad_norm": 0.3395774246815891, + "learning_rate": 6.120223967365114e-05, + "loss": 2.7556, + "step": 31268 + }, + { + "epoch": 1.4558046418511534, + "grad_norm": 0.37923366146000204, + "learning_rate": 6.119959977986544e-05, + "loss": 2.784, + "step": 31269 + }, + { + "epoch": 1.4558512000372466, + "grad_norm": 0.3142415230288606, + "learning_rate": 6.119695985321005e-05, + "loss": 2.6683, + "step": 31270 + }, + { + "epoch": 1.4558977582233397, + "grad_norm": 0.32919463668852644, + "learning_rate": 6.119431989369267e-05, + "loss": 2.6749, + "step": 31271 + }, + { + "epoch": 1.4559443164094326, + "grad_norm": 0.329816885340013, + "learning_rate": 6.119167990132105e-05, + "loss": 2.7958, + "step": 31272 + }, + { + "epoch": 1.4559908745955257, + "grad_norm": 0.3479791242100534, + "learning_rate": 6.118903987610298e-05, + "loss": 2.8383, + "step": 31273 + }, + { + "epoch": 1.4560374327816188, + "grad_norm": 0.3409116591654127, + "learning_rate": 6.118639981804615e-05, + "loss": 2.8182, + "step": 31274 + }, + { + "epoch": 1.456083990967712, + "grad_norm": 0.35588297243213357, + "learning_rate": 6.118375972715835e-05, + "loss": 2.7647, + "step": 31275 + }, + { + "epoch": 1.456130549153805, + "grad_norm": 0.30537698912668904, + "learning_rate": 6.118111960344731e-05, + "loss": 2.6844, + "step": 31276 + }, + { + "epoch": 1.4561771073398981, + "grad_norm": 0.3396223711858337, + "learning_rate": 6.11784794469208e-05, + "loss": 2.8049, + "step": 31277 + }, + { + "epoch": 1.456223665525991, + "grad_norm": 0.3367102731587036, + "learning_rate": 
6.117583925758655e-05, + "loss": 2.69, + "step": 31278 + }, + { + "epoch": 1.4562702237120841, + "grad_norm": 0.32705783982361236, + "learning_rate": 6.11731990354523e-05, + "loss": 2.8225, + "step": 31279 + }, + { + "epoch": 1.4563167818981773, + "grad_norm": 0.3669449096531272, + "learning_rate": 6.117055878052582e-05, + "loss": 2.746, + "step": 31280 + }, + { + "epoch": 1.4563633400842704, + "grad_norm": 0.32387029729061145, + "learning_rate": 6.116791849281487e-05, + "loss": 2.8438, + "step": 31281 + }, + { + "epoch": 1.4564098982703633, + "grad_norm": 0.38595698099386594, + "learning_rate": 6.116527817232716e-05, + "loss": 2.7163, + "step": 31282 + }, + { + "epoch": 1.4564564564564564, + "grad_norm": 0.3254340460873914, + "learning_rate": 6.116263781907046e-05, + "loss": 2.7697, + "step": 31283 + }, + { + "epoch": 1.4565030146425495, + "grad_norm": 0.3525348740733981, + "learning_rate": 6.115999743305252e-05, + "loss": 2.8159, + "step": 31284 + }, + { + "epoch": 1.4565495728286426, + "grad_norm": 0.37653063328160724, + "learning_rate": 6.115735701428108e-05, + "loss": 2.7494, + "step": 31285 + }, + { + "epoch": 1.4565961310147357, + "grad_norm": 0.3286871015570858, + "learning_rate": 6.115471656276391e-05, + "loss": 2.7123, + "step": 31286 + }, + { + "epoch": 1.4566426892008288, + "grad_norm": 0.3909990199846953, + "learning_rate": 6.115207607850874e-05, + "loss": 2.685, + "step": 31287 + }, + { + "epoch": 1.4566892473869217, + "grad_norm": 0.33856322112151005, + "learning_rate": 6.114943556152331e-05, + "loss": 2.6456, + "step": 31288 + }, + { + "epoch": 1.4567358055730149, + "grad_norm": 0.3465496971105258, + "learning_rate": 6.11467950118154e-05, + "loss": 2.7221, + "step": 31289 + }, + { + "epoch": 1.456782363759108, + "grad_norm": 0.3499027139996321, + "learning_rate": 6.114415442939273e-05, + "loss": 2.8108, + "step": 31290 + }, + { + "epoch": 1.456828921945201, + "grad_norm": 0.31321122835062193, + "learning_rate": 6.114151381426309e-05, + "loss": 2.704, + "step": 31291 + }, + { + "epoch": 1.456875480131294, + "grad_norm": 0.36620279940713857, + "learning_rate": 6.113887316643419e-05, + "loss": 2.7922, + "step": 31292 + }, + { + "epoch": 1.456922038317387, + "grad_norm": 0.3559588524831424, + "learning_rate": 6.113623248591379e-05, + "loss": 2.7022, + "step": 31293 + }, + { + "epoch": 1.4569685965034802, + "grad_norm": 0.31946321722383353, + "learning_rate": 6.113359177270966e-05, + "loss": 2.7156, + "step": 31294 + }, + { + "epoch": 1.4570151546895733, + "grad_norm": 0.3869139618418053, + "learning_rate": 6.113095102682952e-05, + "loss": 2.7701, + "step": 31295 + }, + { + "epoch": 1.4570617128756664, + "grad_norm": 0.31894238969649613, + "learning_rate": 6.112831024828116e-05, + "loss": 2.7335, + "step": 31296 + }, + { + "epoch": 1.4571082710617596, + "grad_norm": 0.4054029230123764, + "learning_rate": 6.112566943707227e-05, + "loss": 2.7011, + "step": 31297 + }, + { + "epoch": 1.4571548292478524, + "grad_norm": 0.34933597151595497, + "learning_rate": 6.112302859321065e-05, + "loss": 2.7714, + "step": 31298 + }, + { + "epoch": 1.4572013874339456, + "grad_norm": 0.36073366003169616, + "learning_rate": 6.112038771670404e-05, + "loss": 2.7959, + "step": 31299 + }, + { + "epoch": 1.4572479456200387, + "grad_norm": 0.3697494517359631, + "learning_rate": 6.11177468075602e-05, + "loss": 2.791, + "step": 31300 + }, + { + "epoch": 1.4572945038061318, + "grad_norm": 0.35255830380444386, + "learning_rate": 6.111510586578685e-05, + "loss": 2.6477, + "step": 31301 + }, + { + "epoch": 
1.4573410619922247, + "grad_norm": 0.3495022809586749, + "learning_rate": 6.111246489139176e-05, + "loss": 2.7762, + "step": 31302 + }, + { + "epoch": 1.4573876201783178, + "grad_norm": 0.34795298252076495, + "learning_rate": 6.110982388438269e-05, + "loss": 2.6652, + "step": 31303 + }, + { + "epoch": 1.457434178364411, + "grad_norm": 0.33356986787194104, + "learning_rate": 6.110718284476739e-05, + "loss": 2.8226, + "step": 31304 + }, + { + "epoch": 1.457480736550504, + "grad_norm": 0.33639857585910427, + "learning_rate": 6.110454177255358e-05, + "loss": 2.825, + "step": 31305 + }, + { + "epoch": 1.4575272947365971, + "grad_norm": 0.3568805881730262, + "learning_rate": 6.110190066774904e-05, + "loss": 2.8258, + "step": 31306 + }, + { + "epoch": 1.4575738529226903, + "grad_norm": 0.3573793237002641, + "learning_rate": 6.109925953036154e-05, + "loss": 2.7851, + "step": 31307 + }, + { + "epoch": 1.4576204111087832, + "grad_norm": 0.3644136612285732, + "learning_rate": 6.109661836039877e-05, + "loss": 2.7796, + "step": 31308 + }, + { + "epoch": 1.4576669692948763, + "grad_norm": 0.35714753970861424, + "learning_rate": 6.109397715786854e-05, + "loss": 2.7453, + "step": 31309 + }, + { + "epoch": 1.4577135274809694, + "grad_norm": 0.36705894809584116, + "learning_rate": 6.109133592277857e-05, + "loss": 2.7035, + "step": 31310 + }, + { + "epoch": 1.4577600856670623, + "grad_norm": 0.36929288864458365, + "learning_rate": 6.108869465513663e-05, + "loss": 2.7911, + "step": 31311 + }, + { + "epoch": 1.4578066438531554, + "grad_norm": 0.3288418750670609, + "learning_rate": 6.108605335495044e-05, + "loss": 2.6645, + "step": 31312 + }, + { + "epoch": 1.4578532020392485, + "grad_norm": 0.3557454789477509, + "learning_rate": 6.108341202222782e-05, + "loss": 2.7982, + "step": 31313 + }, + { + "epoch": 1.4578997602253416, + "grad_norm": 0.3456428621834756, + "learning_rate": 6.108077065697644e-05, + "loss": 2.7695, + "step": 31314 + }, + { + "epoch": 1.4579463184114347, + "grad_norm": 0.36143751374211847, + "learning_rate": 6.10781292592041e-05, + "loss": 2.8118, + "step": 31315 + }, + { + "epoch": 1.4579928765975279, + "grad_norm": 0.3465056328194367, + "learning_rate": 6.107548782891854e-05, + "loss": 2.7307, + "step": 31316 + }, + { + "epoch": 1.4580394347836207, + "grad_norm": 0.34969287472873517, + "learning_rate": 6.10728463661275e-05, + "loss": 2.7155, + "step": 31317 + }, + { + "epoch": 1.4580859929697139, + "grad_norm": 0.39200810959862814, + "learning_rate": 6.107020487083878e-05, + "loss": 2.7999, + "step": 31318 + }, + { + "epoch": 1.458132551155807, + "grad_norm": 0.3392749038214568, + "learning_rate": 6.10675633430601e-05, + "loss": 2.8039, + "step": 31319 + }, + { + "epoch": 1.4581791093419, + "grad_norm": 0.36724059825720473, + "learning_rate": 6.106492178279919e-05, + "loss": 2.8043, + "step": 31320 + }, + { + "epoch": 1.458225667527993, + "grad_norm": 0.3414185809403208, + "learning_rate": 6.106228019006384e-05, + "loss": 2.7319, + "step": 31321 + }, + { + "epoch": 1.458272225714086, + "grad_norm": 0.3680721980092916, + "learning_rate": 6.105963856486178e-05, + "loss": 2.7526, + "step": 31322 + }, + { + "epoch": 1.4583187839001792, + "grad_norm": 0.347253013332836, + "learning_rate": 6.105699690720077e-05, + "loss": 2.6699, + "step": 31323 + }, + { + "epoch": 1.4583653420862723, + "grad_norm": 0.35429528038275004, + "learning_rate": 6.105435521708856e-05, + "loss": 2.7556, + "step": 31324 + }, + { + "epoch": 1.4584119002723654, + "grad_norm": 0.3558652398959108, + "learning_rate": 
6.10517134945329e-05, + "loss": 2.7762, + "step": 31325 + }, + { + "epoch": 1.4584584584584586, + "grad_norm": 0.3365113236779553, + "learning_rate": 6.104907173954156e-05, + "loss": 2.8339, + "step": 31326 + }, + { + "epoch": 1.4585050166445515, + "grad_norm": 0.3562065101333535, + "learning_rate": 6.104642995212228e-05, + "loss": 2.6887, + "step": 31327 + }, + { + "epoch": 1.4585515748306446, + "grad_norm": 0.34036527132833677, + "learning_rate": 6.104378813228282e-05, + "loss": 2.7966, + "step": 31328 + }, + { + "epoch": 1.4585981330167377, + "grad_norm": 0.34423449970990144, + "learning_rate": 6.104114628003092e-05, + "loss": 2.6922, + "step": 31329 + }, + { + "epoch": 1.4586446912028308, + "grad_norm": 0.3330582218121373, + "learning_rate": 6.103850439537435e-05, + "loss": 2.7415, + "step": 31330 + }, + { + "epoch": 1.4586912493889237, + "grad_norm": 0.3228579141356936, + "learning_rate": 6.103586247832087e-05, + "loss": 2.7351, + "step": 31331 + }, + { + "epoch": 1.4587378075750168, + "grad_norm": 0.3412477098456749, + "learning_rate": 6.103322052887821e-05, + "loss": 2.7234, + "step": 31332 + }, + { + "epoch": 1.45878436576111, + "grad_norm": 0.32700673290590493, + "learning_rate": 6.103057854705414e-05, + "loss": 2.6501, + "step": 31333 + }, + { + "epoch": 1.458830923947203, + "grad_norm": 0.34255328944429386, + "learning_rate": 6.1027936532856403e-05, + "loss": 2.6759, + "step": 31334 + }, + { + "epoch": 1.4588774821332962, + "grad_norm": 0.30748146015998873, + "learning_rate": 6.102529448629275e-05, + "loss": 2.7583, + "step": 31335 + }, + { + "epoch": 1.4589240403193893, + "grad_norm": 0.34103521592970193, + "learning_rate": 6.1022652407370965e-05, + "loss": 2.7935, + "step": 31336 + }, + { + "epoch": 1.4589705985054822, + "grad_norm": 0.32524104189780734, + "learning_rate": 6.102001029609876e-05, + "loss": 2.7734, + "step": 31337 + }, + { + "epoch": 1.4590171566915753, + "grad_norm": 0.33680812850420344, + "learning_rate": 6.101736815248391e-05, + "loss": 2.7778, + "step": 31338 + }, + { + "epoch": 1.4590637148776684, + "grad_norm": 0.3776390696313306, + "learning_rate": 6.101472597653418e-05, + "loss": 2.801, + "step": 31339 + }, + { + "epoch": 1.4591102730637615, + "grad_norm": 0.34481807038705503, + "learning_rate": 6.101208376825732e-05, + "loss": 2.7345, + "step": 31340 + }, + { + "epoch": 1.4591568312498544, + "grad_norm": 0.38434998388858227, + "learning_rate": 6.100944152766107e-05, + "loss": 2.7571, + "step": 31341 + }, + { + "epoch": 1.4592033894359475, + "grad_norm": 0.33854539006653434, + "learning_rate": 6.1006799254753186e-05, + "loss": 2.8183, + "step": 31342 + }, + { + "epoch": 1.4592499476220406, + "grad_norm": 0.3869564408947341, + "learning_rate": 6.100415694954144e-05, + "loss": 2.7521, + "step": 31343 + }, + { + "epoch": 1.4592965058081337, + "grad_norm": 0.3331023369409895, + "learning_rate": 6.1001514612033584e-05, + "loss": 2.7608, + "step": 31344 + }, + { + "epoch": 1.4593430639942269, + "grad_norm": 0.38478024946455375, + "learning_rate": 6.099887224223736e-05, + "loss": 2.8787, + "step": 31345 + }, + { + "epoch": 1.45938962218032, + "grad_norm": 0.3152645216448238, + "learning_rate": 6.099622984016053e-05, + "loss": 2.6904, + "step": 31346 + }, + { + "epoch": 1.4594361803664129, + "grad_norm": 0.3867862242897013, + "learning_rate": 6.0993587405810845e-05, + "loss": 2.724, + "step": 31347 + }, + { + "epoch": 1.459482738552506, + "grad_norm": 0.3432104954457883, + "learning_rate": 6.0990944939196074e-05, + "loss": 2.7003, + "step": 31348 + }, + { + 
"epoch": 1.459529296738599, + "grad_norm": 0.3437554060426132, + "learning_rate": 6.0988302440323964e-05, + "loss": 2.7601, + "step": 31349 + }, + { + "epoch": 1.459575854924692, + "grad_norm": 0.3497082127179667, + "learning_rate": 6.0985659909202254e-05, + "loss": 2.7623, + "step": 31350 + }, + { + "epoch": 1.459622413110785, + "grad_norm": 0.3339574079471316, + "learning_rate": 6.098301734583871e-05, + "loss": 2.7956, + "step": 31351 + }, + { + "epoch": 1.4596689712968782, + "grad_norm": 0.3628148452837894, + "learning_rate": 6.098037475024111e-05, + "loss": 2.7817, + "step": 31352 + }, + { + "epoch": 1.4597155294829713, + "grad_norm": 0.3303294070712749, + "learning_rate": 6.097773212241718e-05, + "loss": 2.8384, + "step": 31353 + }, + { + "epoch": 1.4597620876690645, + "grad_norm": 0.34912485949503597, + "learning_rate": 6.09750894623747e-05, + "loss": 2.7251, + "step": 31354 + }, + { + "epoch": 1.4598086458551576, + "grad_norm": 0.3330640078026784, + "learning_rate": 6.097244677012139e-05, + "loss": 2.6773, + "step": 31355 + }, + { + "epoch": 1.4598552040412507, + "grad_norm": 0.32457447316917815, + "learning_rate": 6.0969804045665055e-05, + "loss": 2.6919, + "step": 31356 + }, + { + "epoch": 1.4599017622273436, + "grad_norm": 0.3358609822205213, + "learning_rate": 6.0967161289013416e-05, + "loss": 2.8043, + "step": 31357 + }, + { + "epoch": 1.4599483204134367, + "grad_norm": 0.3556694654576139, + "learning_rate": 6.096451850017424e-05, + "loss": 2.8575, + "step": 31358 + }, + { + "epoch": 1.4599948785995298, + "grad_norm": 0.34838215789713645, + "learning_rate": 6.096187567915529e-05, + "loss": 2.7846, + "step": 31359 + }, + { + "epoch": 1.4600414367856227, + "grad_norm": 0.37563764756179313, + "learning_rate": 6.095923282596431e-05, + "loss": 2.8029, + "step": 31360 + }, + { + "epoch": 1.4600879949717158, + "grad_norm": 0.3328264260211646, + "learning_rate": 6.0956589940609056e-05, + "loss": 2.7376, + "step": 31361 + }, + { + "epoch": 1.460134553157809, + "grad_norm": 0.38469022482931087, + "learning_rate": 6.095394702309728e-05, + "loss": 2.7473, + "step": 31362 + }, + { + "epoch": 1.460181111343902, + "grad_norm": 0.3346686492818875, + "learning_rate": 6.095130407343678e-05, + "loss": 2.707, + "step": 31363 + }, + { + "epoch": 1.4602276695299952, + "grad_norm": 0.4011330501282446, + "learning_rate": 6.094866109163525e-05, + "loss": 2.7347, + "step": 31364 + }, + { + "epoch": 1.4602742277160883, + "grad_norm": 0.32996045000216545, + "learning_rate": 6.0946018077700485e-05, + "loss": 2.7654, + "step": 31365 + }, + { + "epoch": 1.4603207859021812, + "grad_norm": 0.3521445957917417, + "learning_rate": 6.0943375031640247e-05, + "loss": 2.7405, + "step": 31366 + }, + { + "epoch": 1.4603673440882743, + "grad_norm": 0.33121748516350447, + "learning_rate": 6.094073195346228e-05, + "loss": 2.7499, + "step": 31367 + }, + { + "epoch": 1.4604139022743674, + "grad_norm": 0.36018360789940795, + "learning_rate": 6.093808884317433e-05, + "loss": 2.7851, + "step": 31368 + }, + { + "epoch": 1.4604604604604605, + "grad_norm": 0.3379244131355419, + "learning_rate": 6.093544570078418e-05, + "loss": 2.6827, + "step": 31369 + }, + { + "epoch": 1.4605070186465534, + "grad_norm": 0.32095476148961277, + "learning_rate": 6.093280252629957e-05, + "loss": 2.6425, + "step": 31370 + }, + { + "epoch": 1.4605535768326465, + "grad_norm": 0.343954544481438, + "learning_rate": 6.0930159319728255e-05, + "loss": 2.7771, + "step": 31371 + }, + { + "epoch": 1.4606001350187396, + "grad_norm": 0.33295200725096136, + 
"learning_rate": 6.092751608107802e-05, + "loss": 2.7855, + "step": 31372 + }, + { + "epoch": 1.4606466932048328, + "grad_norm": 0.3484588403601383, + "learning_rate": 6.0924872810356583e-05, + "loss": 2.809, + "step": 31373 + }, + { + "epoch": 1.4606932513909259, + "grad_norm": 0.34078113581598846, + "learning_rate": 6.092222950757172e-05, + "loss": 2.7281, + "step": 31374 + }, + { + "epoch": 1.460739809577019, + "grad_norm": 0.3604802815458829, + "learning_rate": 6.0919586172731194e-05, + "loss": 2.726, + "step": 31375 + }, + { + "epoch": 1.4607863677631119, + "grad_norm": 0.3337814391274244, + "learning_rate": 6.091694280584276e-05, + "loss": 2.7802, + "step": 31376 + }, + { + "epoch": 1.460832925949205, + "grad_norm": 0.35963187823826437, + "learning_rate": 6.0914299406914165e-05, + "loss": 2.7127, + "step": 31377 + }, + { + "epoch": 1.4608794841352981, + "grad_norm": 0.3429859405050554, + "learning_rate": 6.091165597595318e-05, + "loss": 2.7648, + "step": 31378 + }, + { + "epoch": 1.4609260423213912, + "grad_norm": 0.35334320606913816, + "learning_rate": 6.0909012512967566e-05, + "loss": 2.5947, + "step": 31379 + }, + { + "epoch": 1.4609726005074841, + "grad_norm": 0.3292045634423467, + "learning_rate": 6.090636901796507e-05, + "loss": 2.8079, + "step": 31380 + }, + { + "epoch": 1.4610191586935772, + "grad_norm": 0.3530158963429044, + "learning_rate": 6.090372549095347e-05, + "loss": 2.7647, + "step": 31381 + }, + { + "epoch": 1.4610657168796704, + "grad_norm": 0.3339210175770392, + "learning_rate": 6.0901081931940485e-05, + "loss": 2.7359, + "step": 31382 + }, + { + "epoch": 1.4611122750657635, + "grad_norm": 0.323914973126665, + "learning_rate": 6.0898438340933916e-05, + "loss": 2.8306, + "step": 31383 + }, + { + "epoch": 1.4611588332518566, + "grad_norm": 0.36959088242160254, + "learning_rate": 6.08957947179415e-05, + "loss": 2.8439, + "step": 31384 + }, + { + "epoch": 1.4612053914379497, + "grad_norm": 0.35480134526824114, + "learning_rate": 6.089315106297101e-05, + "loss": 2.7993, + "step": 31385 + }, + { + "epoch": 1.4612519496240426, + "grad_norm": 0.3297178085356277, + "learning_rate": 6.089050737603018e-05, + "loss": 2.7717, + "step": 31386 + }, + { + "epoch": 1.4612985078101357, + "grad_norm": 0.35678571034049783, + "learning_rate": 6.088786365712679e-05, + "loss": 2.766, + "step": 31387 + }, + { + "epoch": 1.4613450659962288, + "grad_norm": 0.34936301252607643, + "learning_rate": 6.088521990626857e-05, + "loss": 2.7618, + "step": 31388 + }, + { + "epoch": 1.461391624182322, + "grad_norm": 0.3380930457012246, + "learning_rate": 6.088257612346333e-05, + "loss": 2.665, + "step": 31389 + }, + { + "epoch": 1.4614381823684148, + "grad_norm": 0.32005355592734097, + "learning_rate": 6.0879932308718804e-05, + "loss": 2.6381, + "step": 31390 + }, + { + "epoch": 1.461484740554508, + "grad_norm": 0.36758864065394453, + "learning_rate": 6.087728846204274e-05, + "loss": 2.6892, + "step": 31391 + }, + { + "epoch": 1.461531298740601, + "grad_norm": 0.3389327200340073, + "learning_rate": 6.0874644583442896e-05, + "loss": 2.7347, + "step": 31392 + }, + { + "epoch": 1.4615778569266942, + "grad_norm": 0.3786847825345002, + "learning_rate": 6.087200067292704e-05, + "loss": 2.8293, + "step": 31393 + }, + { + "epoch": 1.4616244151127873, + "grad_norm": 0.3618930599021265, + "learning_rate": 6.0869356730502955e-05, + "loss": 2.8224, + "step": 31394 + }, + { + "epoch": 1.4616709732988804, + "grad_norm": 0.39249458065275994, + "learning_rate": 6.086671275617837e-05, + "loss": 2.7126, + "step": 31395 
+ }, + { + "epoch": 1.4617175314849733, + "grad_norm": 0.38885849129903566, + "learning_rate": 6.0864068749961045e-05, + "loss": 2.7799, + "step": 31396 + }, + { + "epoch": 1.4617640896710664, + "grad_norm": 0.3590090981156348, + "learning_rate": 6.086142471185876e-05, + "loss": 2.8224, + "step": 31397 + }, + { + "epoch": 1.4618106478571595, + "grad_norm": 0.36902810888988996, + "learning_rate": 6.0858780641879256e-05, + "loss": 2.7602, + "step": 31398 + }, + { + "epoch": 1.4618572060432524, + "grad_norm": 0.3753224147524422, + "learning_rate": 6.085613654003031e-05, + "loss": 2.8062, + "step": 31399 + }, + { + "epoch": 1.4619037642293455, + "grad_norm": 0.33124702850698223, + "learning_rate": 6.085349240631966e-05, + "loss": 2.8274, + "step": 31400 + }, + { + "epoch": 1.4619503224154387, + "grad_norm": 0.3629972159557534, + "learning_rate": 6.0850848240755076e-05, + "loss": 2.7094, + "step": 31401 + }, + { + "epoch": 1.4619968806015318, + "grad_norm": 0.36226381916671124, + "learning_rate": 6.0848204043344325e-05, + "loss": 2.7516, + "step": 31402 + }, + { + "epoch": 1.4620434387876249, + "grad_norm": 0.3356846093102756, + "learning_rate": 6.084555981409518e-05, + "loss": 2.6398, + "step": 31403 + }, + { + "epoch": 1.462089996973718, + "grad_norm": 0.37505281915435745, + "learning_rate": 6.0842915553015364e-05, + "loss": 2.7339, + "step": 31404 + }, + { + "epoch": 1.462136555159811, + "grad_norm": 0.31533373982380813, + "learning_rate": 6.0840271260112666e-05, + "loss": 2.7299, + "step": 31405 + }, + { + "epoch": 1.462183113345904, + "grad_norm": 0.3376395443609577, + "learning_rate": 6.083762693539484e-05, + "loss": 2.6738, + "step": 31406 + }, + { + "epoch": 1.4622296715319971, + "grad_norm": 0.3389753278924694, + "learning_rate": 6.083498257886965e-05, + "loss": 2.7702, + "step": 31407 + }, + { + "epoch": 1.4622762297180902, + "grad_norm": 0.36937105548171856, + "learning_rate": 6.083233819054486e-05, + "loss": 2.7584, + "step": 31408 + }, + { + "epoch": 1.4623227879041831, + "grad_norm": 0.36677682233002085, + "learning_rate": 6.082969377042821e-05, + "loss": 2.7636, + "step": 31409 + }, + { + "epoch": 1.4623693460902762, + "grad_norm": 0.39582537511630617, + "learning_rate": 6.0827049318527485e-05, + "loss": 2.7673, + "step": 31410 + }, + { + "epoch": 1.4624159042763694, + "grad_norm": 0.3259215415821667, + "learning_rate": 6.0824404834850436e-05, + "loss": 2.7935, + "step": 31411 + }, + { + "epoch": 1.4624624624624625, + "grad_norm": 0.38287105133908184, + "learning_rate": 6.082176031940483e-05, + "loss": 2.7538, + "step": 31412 + }, + { + "epoch": 1.4625090206485556, + "grad_norm": 0.35001210984803355, + "learning_rate": 6.081911577219841e-05, + "loss": 2.8555, + "step": 31413 + }, + { + "epoch": 1.4625555788346487, + "grad_norm": 0.35061203489034554, + "learning_rate": 6.0816471193238956e-05, + "loss": 2.7617, + "step": 31414 + }, + { + "epoch": 1.4626021370207416, + "grad_norm": 0.31111564583594764, + "learning_rate": 6.0813826582534226e-05, + "loss": 2.7665, + "step": 31415 + }, + { + "epoch": 1.4626486952068347, + "grad_norm": 0.351450708855776, + "learning_rate": 6.081118194009198e-05, + "loss": 2.7838, + "step": 31416 + }, + { + "epoch": 1.4626952533929278, + "grad_norm": 0.33948481333925706, + "learning_rate": 6.0808537265919983e-05, + "loss": 2.7596, + "step": 31417 + }, + { + "epoch": 1.462741811579021, + "grad_norm": 0.3106807166433132, + "learning_rate": 6.080589256002599e-05, + "loss": 2.8601, + "step": 31418 + }, + { + "epoch": 1.4627883697651138, + "grad_norm": 
0.3615384886119516, + "learning_rate": 6.080324782241777e-05, + "loss": 2.761, + "step": 31419 + }, + { + "epoch": 1.462834927951207, + "grad_norm": 0.33032755913495393, + "learning_rate": 6.080060305310308e-05, + "loss": 2.681, + "step": 31420 + }, + { + "epoch": 1.4628814861373, + "grad_norm": 0.3206814874398463, + "learning_rate": 6.079795825208969e-05, + "loss": 2.7318, + "step": 31421 + }, + { + "epoch": 1.4629280443233932, + "grad_norm": 0.35477569564842565, + "learning_rate": 6.0795313419385344e-05, + "loss": 2.5965, + "step": 31422 + }, + { + "epoch": 1.4629746025094863, + "grad_norm": 0.3294813066524918, + "learning_rate": 6.079266855499783e-05, + "loss": 2.7437, + "step": 31423 + }, + { + "epoch": 1.4630211606955794, + "grad_norm": 0.36948228683735473, + "learning_rate": 6.079002365893489e-05, + "loss": 2.7065, + "step": 31424 + }, + { + "epoch": 1.4630677188816723, + "grad_norm": 0.3491210682107893, + "learning_rate": 6.0787378731204295e-05, + "loss": 2.7135, + "step": 31425 + }, + { + "epoch": 1.4631142770677654, + "grad_norm": 0.3350966136177528, + "learning_rate": 6.078473377181382e-05, + "loss": 2.7621, + "step": 31426 + }, + { + "epoch": 1.4631608352538585, + "grad_norm": 0.32611621864418083, + "learning_rate": 6.078208878077118e-05, + "loss": 2.8049, + "step": 31427 + }, + { + "epoch": 1.4632073934399517, + "grad_norm": 0.35537706329872254, + "learning_rate": 6.0779443758084195e-05, + "loss": 2.7281, + "step": 31428 + }, + { + "epoch": 1.4632539516260445, + "grad_norm": 0.3451871682652348, + "learning_rate": 6.077679870376061e-05, + "loss": 2.7178, + "step": 31429 + }, + { + "epoch": 1.4633005098121377, + "grad_norm": 0.3483024630697333, + "learning_rate": 6.077415361780817e-05, + "loss": 2.8181, + "step": 31430 + }, + { + "epoch": 1.4633470679982308, + "grad_norm": 0.3417133246413145, + "learning_rate": 6.0771508500234654e-05, + "loss": 2.8013, + "step": 31431 + }, + { + "epoch": 1.463393626184324, + "grad_norm": 0.3563503862442042, + "learning_rate": 6.076886335104781e-05, + "loss": 2.8094, + "step": 31432 + }, + { + "epoch": 1.463440184370417, + "grad_norm": 0.35719342842208324, + "learning_rate": 6.076621817025543e-05, + "loss": 2.6593, + "step": 31433 + }, + { + "epoch": 1.4634867425565101, + "grad_norm": 0.35626623497568893, + "learning_rate": 6.0763572957865256e-05, + "loss": 2.738, + "step": 31434 + }, + { + "epoch": 1.463533300742603, + "grad_norm": 0.3240704223644245, + "learning_rate": 6.076092771388505e-05, + "loss": 2.7632, + "step": 31435 + }, + { + "epoch": 1.4635798589286961, + "grad_norm": 0.36426573272079504, + "learning_rate": 6.07582824383226e-05, + "loss": 2.7563, + "step": 31436 + }, + { + "epoch": 1.4636264171147892, + "grad_norm": 0.3038427765687805, + "learning_rate": 6.0755637131185637e-05, + "loss": 2.7207, + "step": 31437 + }, + { + "epoch": 1.4636729753008821, + "grad_norm": 0.3880145758703289, + "learning_rate": 6.0752991792481926e-05, + "loss": 2.7745, + "step": 31438 + }, + { + "epoch": 1.4637195334869753, + "grad_norm": 0.32924164775820636, + "learning_rate": 6.075034642221927e-05, + "loss": 2.747, + "step": 31439 + }, + { + "epoch": 1.4637660916730684, + "grad_norm": 0.37838184080845133, + "learning_rate": 6.074770102040539e-05, + "loss": 2.6873, + "step": 31440 + }, + { + "epoch": 1.4638126498591615, + "grad_norm": 0.34975506785222166, + "learning_rate": 6.074505558704806e-05, + "loss": 2.7372, + "step": 31441 + }, + { + "epoch": 1.4638592080452546, + "grad_norm": 0.3826231498229247, + "learning_rate": 6.074241012215506e-05, + "loss": 
2.6783, + "step": 31442 + }, + { + "epoch": 1.4639057662313477, + "grad_norm": 0.35188665937117375, + "learning_rate": 6.0739764625734144e-05, + "loss": 2.6738, + "step": 31443 + }, + { + "epoch": 1.4639523244174408, + "grad_norm": 0.3720707052283658, + "learning_rate": 6.073711909779308e-05, + "loss": 2.6491, + "step": 31444 + }, + { + "epoch": 1.4639988826035337, + "grad_norm": 0.36577098647150375, + "learning_rate": 6.073447353833962e-05, + "loss": 2.8284, + "step": 31445 + }, + { + "epoch": 1.4640454407896268, + "grad_norm": 0.39802933167766963, + "learning_rate": 6.0731827947381544e-05, + "loss": 2.7972, + "step": 31446 + }, + { + "epoch": 1.46409199897572, + "grad_norm": 0.37836202614836634, + "learning_rate": 6.0729182324926614e-05, + "loss": 2.7031, + "step": 31447 + }, + { + "epoch": 1.4641385571618128, + "grad_norm": 0.34446765103268273, + "learning_rate": 6.0726536670982584e-05, + "loss": 2.6602, + "step": 31448 + }, + { + "epoch": 1.464185115347906, + "grad_norm": 0.348514920789916, + "learning_rate": 6.072389098555724e-05, + "loss": 2.7796, + "step": 31449 + }, + { + "epoch": 1.464231673533999, + "grad_norm": 0.3452011898683761, + "learning_rate": 6.072124526865831e-05, + "loss": 2.7904, + "step": 31450 + }, + { + "epoch": 1.4642782317200922, + "grad_norm": 0.34842065278985884, + "learning_rate": 6.0718599520293595e-05, + "loss": 2.7795, + "step": 31451 + }, + { + "epoch": 1.4643247899061853, + "grad_norm": 0.3390983210616472, + "learning_rate": 6.071595374047085e-05, + "loss": 2.6459, + "step": 31452 + }, + { + "epoch": 1.4643713480922784, + "grad_norm": 0.34017129084888753, + "learning_rate": 6.0713307929197826e-05, + "loss": 2.6705, + "step": 31453 + }, + { + "epoch": 1.4644179062783713, + "grad_norm": 0.33093791743050327, + "learning_rate": 6.07106620864823e-05, + "loss": 2.7364, + "step": 31454 + }, + { + "epoch": 1.4644644644644644, + "grad_norm": 0.367819476854014, + "learning_rate": 6.070801621233203e-05, + "loss": 2.8172, + "step": 31455 + }, + { + "epoch": 1.4645110226505575, + "grad_norm": 0.3334253549935032, + "learning_rate": 6.070537030675479e-05, + "loss": 2.7646, + "step": 31456 + }, + { + "epoch": 1.4645575808366507, + "grad_norm": 0.35499048386961235, + "learning_rate": 6.0702724369758365e-05, + "loss": 2.775, + "step": 31457 + }, + { + "epoch": 1.4646041390227436, + "grad_norm": 0.33298090889423704, + "learning_rate": 6.070007840135048e-05, + "loss": 2.6129, + "step": 31458 + }, + { + "epoch": 1.4646506972088367, + "grad_norm": 0.34359166717402145, + "learning_rate": 6.069743240153891e-05, + "loss": 2.7067, + "step": 31459 + }, + { + "epoch": 1.4646972553949298, + "grad_norm": 0.3260509810217092, + "learning_rate": 6.069478637033146e-05, + "loss": 2.7577, + "step": 31460 + }, + { + "epoch": 1.464743813581023, + "grad_norm": 0.349393541408123, + "learning_rate": 6.0692140307735835e-05, + "loss": 2.7923, + "step": 31461 + }, + { + "epoch": 1.464790371767116, + "grad_norm": 0.33039726625134724, + "learning_rate": 6.068949421375984e-05, + "loss": 2.6933, + "step": 31462 + }, + { + "epoch": 1.4648369299532091, + "grad_norm": 0.37903447209873126, + "learning_rate": 6.068684808841124e-05, + "loss": 2.832, + "step": 31463 + }, + { + "epoch": 1.464883488139302, + "grad_norm": 0.3146331639427258, + "learning_rate": 6.0684201931697794e-05, + "loss": 2.7857, + "step": 31464 + }, + { + "epoch": 1.4649300463253951, + "grad_norm": 0.36789165941838153, + "learning_rate": 6.068155574362726e-05, + "loss": 2.6967, + "step": 31465 + }, + { + "epoch": 1.4649766045114883, + 
"grad_norm": 0.32542069276012736, + "learning_rate": 6.067890952420742e-05, + "loss": 2.7025, + "step": 31466 + }, + { + "epoch": 1.4650231626975814, + "grad_norm": 0.3912669982852322, + "learning_rate": 6.067626327344602e-05, + "loss": 2.7885, + "step": 31467 + }, + { + "epoch": 1.4650697208836743, + "grad_norm": 0.34123337586958563, + "learning_rate": 6.0673616991350846e-05, + "loss": 2.826, + "step": 31468 + }, + { + "epoch": 1.4651162790697674, + "grad_norm": 0.3494808822362056, + "learning_rate": 6.0670970677929664e-05, + "loss": 2.7901, + "step": 31469 + }, + { + "epoch": 1.4651628372558605, + "grad_norm": 0.3575224705905393, + "learning_rate": 6.066832433319022e-05, + "loss": 2.7958, + "step": 31470 + }, + { + "epoch": 1.4652093954419536, + "grad_norm": 0.3883438719229981, + "learning_rate": 6.066567795714031e-05, + "loss": 2.602, + "step": 31471 + }, + { + "epoch": 1.4652559536280467, + "grad_norm": 0.3201231394159968, + "learning_rate": 6.0663031549787676e-05, + "loss": 2.7827, + "step": 31472 + }, + { + "epoch": 1.4653025118141398, + "grad_norm": 0.3787251490111444, + "learning_rate": 6.0660385111140106e-05, + "loss": 2.7751, + "step": 31473 + }, + { + "epoch": 1.4653490700002327, + "grad_norm": 0.3270192471787279, + "learning_rate": 6.065773864120534e-05, + "loss": 2.6861, + "step": 31474 + }, + { + "epoch": 1.4653956281863258, + "grad_norm": 0.3731760088617168, + "learning_rate": 6.065509213999119e-05, + "loss": 2.7018, + "step": 31475 + }, + { + "epoch": 1.465442186372419, + "grad_norm": 0.34561490293884667, + "learning_rate": 6.065244560750536e-05, + "loss": 2.6616, + "step": 31476 + }, + { + "epoch": 1.465488744558512, + "grad_norm": 0.35848406879947536, + "learning_rate": 6.064979904375566e-05, + "loss": 2.7915, + "step": 31477 + }, + { + "epoch": 1.465535302744605, + "grad_norm": 0.3293566870440324, + "learning_rate": 6.0647152448749856e-05, + "loss": 2.777, + "step": 31478 + }, + { + "epoch": 1.465581860930698, + "grad_norm": 0.3528737966285844, + "learning_rate": 6.06445058224957e-05, + "loss": 2.8315, + "step": 31479 + }, + { + "epoch": 1.4656284191167912, + "grad_norm": 0.3466918576463468, + "learning_rate": 6.064185916500099e-05, + "loss": 2.7308, + "step": 31480 + }, + { + "epoch": 1.4656749773028843, + "grad_norm": 0.3364245867896824, + "learning_rate": 6.063921247627345e-05, + "loss": 2.72, + "step": 31481 + }, + { + "epoch": 1.4657215354889774, + "grad_norm": 0.3350129337146783, + "learning_rate": 6.0636565756320874e-05, + "loss": 2.7482, + "step": 31482 + }, + { + "epoch": 1.4657680936750705, + "grad_norm": 0.3684546967177586, + "learning_rate": 6.063391900515103e-05, + "loss": 2.8061, + "step": 31483 + }, + { + "epoch": 1.4658146518611634, + "grad_norm": 0.3167772728988206, + "learning_rate": 6.0631272222771685e-05, + "loss": 2.7072, + "step": 31484 + }, + { + "epoch": 1.4658612100472566, + "grad_norm": 0.3519231232703957, + "learning_rate": 6.0628625409190584e-05, + "loss": 2.6787, + "step": 31485 + }, + { + "epoch": 1.4659077682333497, + "grad_norm": 0.3160008005048446, + "learning_rate": 6.0625978564415544e-05, + "loss": 2.7411, + "step": 31486 + }, + { + "epoch": 1.4659543264194426, + "grad_norm": 0.33054969119118977, + "learning_rate": 6.062333168845429e-05, + "loss": 2.7789, + "step": 31487 + }, + { + "epoch": 1.4660008846055357, + "grad_norm": 0.3364406061027076, + "learning_rate": 6.06206847813146e-05, + "loss": 2.6525, + "step": 31488 + }, + { + "epoch": 1.4660474427916288, + "grad_norm": 0.3359085930147064, + "learning_rate": 6.061803784300425e-05, + 
"loss": 2.7498, + "step": 31489 + }, + { + "epoch": 1.466094000977722, + "grad_norm": 0.33914141612615195, + "learning_rate": 6.0615390873531e-05, + "loss": 2.7012, + "step": 31490 + }, + { + "epoch": 1.466140559163815, + "grad_norm": 0.32918894480435623, + "learning_rate": 6.061274387290262e-05, + "loss": 2.7466, + "step": 31491 + }, + { + "epoch": 1.4661871173499081, + "grad_norm": 0.35344074835543, + "learning_rate": 6.0610096841126885e-05, + "loss": 2.7042, + "step": 31492 + }, + { + "epoch": 1.466233675536001, + "grad_norm": 0.3325872731385883, + "learning_rate": 6.060744977821158e-05, + "loss": 2.6379, + "step": 31493 + }, + { + "epoch": 1.4662802337220942, + "grad_norm": 0.3359032137540971, + "learning_rate": 6.060480268416443e-05, + "loss": 2.7187, + "step": 31494 + }, + { + "epoch": 1.4663267919081873, + "grad_norm": 0.3583597164591425, + "learning_rate": 6.060215555899323e-05, + "loss": 2.8319, + "step": 31495 + }, + { + "epoch": 1.4663733500942804, + "grad_norm": 0.33307150429381155, + "learning_rate": 6.0599508402705765e-05, + "loss": 2.6544, + "step": 31496 + }, + { + "epoch": 1.4664199082803733, + "grad_norm": 0.36984538389812865, + "learning_rate": 6.059686121530979e-05, + "loss": 2.6654, + "step": 31497 + }, + { + "epoch": 1.4664664664664664, + "grad_norm": 0.3224015562448036, + "learning_rate": 6.059421399681306e-05, + "loss": 2.697, + "step": 31498 + }, + { + "epoch": 1.4665130246525595, + "grad_norm": 0.35664917741365065, + "learning_rate": 6.0591566747223346e-05, + "loss": 2.7726, + "step": 31499 + }, + { + "epoch": 1.4665595828386526, + "grad_norm": 0.36367925690868796, + "learning_rate": 6.058891946654844e-05, + "loss": 2.7857, + "step": 31500 + }, + { + "epoch": 1.4666061410247457, + "grad_norm": 0.3274015525440347, + "learning_rate": 6.05862721547961e-05, + "loss": 2.8262, + "step": 31501 + }, + { + "epoch": 1.4666526992108389, + "grad_norm": 0.3425299913876004, + "learning_rate": 6.058362481197409e-05, + "loss": 2.762, + "step": 31502 + }, + { + "epoch": 1.4666992573969317, + "grad_norm": 0.319883341507404, + "learning_rate": 6.058097743809018e-05, + "loss": 2.8254, + "step": 31503 + }, + { + "epoch": 1.4667458155830249, + "grad_norm": 0.36060840342453104, + "learning_rate": 6.057833003315214e-05, + "loss": 2.6543, + "step": 31504 + }, + { + "epoch": 1.466792373769118, + "grad_norm": 0.32464053209694577, + "learning_rate": 6.057568259716775e-05, + "loss": 2.9494, + "step": 31505 + }, + { + "epoch": 1.466838931955211, + "grad_norm": 0.36296960060465305, + "learning_rate": 6.0573035130144776e-05, + "loss": 2.8654, + "step": 31506 + }, + { + "epoch": 1.466885490141304, + "grad_norm": 0.34546722480768666, + "learning_rate": 6.057038763209099e-05, + "loss": 2.7419, + "step": 31507 + }, + { + "epoch": 1.466932048327397, + "grad_norm": 0.3680905515458552, + "learning_rate": 6.056774010301415e-05, + "loss": 2.714, + "step": 31508 + }, + { + "epoch": 1.4669786065134902, + "grad_norm": 0.33431704398494283, + "learning_rate": 6.056509254292203e-05, + "loss": 2.6006, + "step": 31509 + }, + { + "epoch": 1.4670251646995833, + "grad_norm": 0.3619596565763458, + "learning_rate": 6.056244495182242e-05, + "loss": 2.7343, + "step": 31510 + }, + { + "epoch": 1.4670717228856764, + "grad_norm": 0.38932975201892145, + "learning_rate": 6.055979732972305e-05, + "loss": 2.612, + "step": 31511 + }, + { + "epoch": 1.4671182810717696, + "grad_norm": 0.3327469748814395, + "learning_rate": 6.055714967663174e-05, + "loss": 2.766, + "step": 31512 + }, + { + "epoch": 1.4671648392578625, + 
"grad_norm": 0.4032342731386779, + "learning_rate": 6.0554501992556223e-05, + "loss": 2.7048, + "step": 31513 + }, + { + "epoch": 1.4672113974439556, + "grad_norm": 0.3316059928649636, + "learning_rate": 6.055185427750428e-05, + "loss": 2.7578, + "step": 31514 + }, + { + "epoch": 1.4672579556300487, + "grad_norm": 0.3844947513257263, + "learning_rate": 6.054920653148368e-05, + "loss": 2.7701, + "step": 31515 + }, + { + "epoch": 1.4673045138161418, + "grad_norm": 0.29692876777215005, + "learning_rate": 6.054655875450223e-05, + "loss": 2.8023, + "step": 31516 + }, + { + "epoch": 1.4673510720022347, + "grad_norm": 0.3631518511777685, + "learning_rate": 6.054391094656764e-05, + "loss": 2.7327, + "step": 31517 + }, + { + "epoch": 1.4673976301883278, + "grad_norm": 0.3726612084038277, + "learning_rate": 6.0541263107687705e-05, + "loss": 2.7624, + "step": 31518 + }, + { + "epoch": 1.467444188374421, + "grad_norm": 0.40146630763730806, + "learning_rate": 6.0538615237870214e-05, + "loss": 2.7647, + "step": 31519 + }, + { + "epoch": 1.467490746560514, + "grad_norm": 0.35902964690992306, + "learning_rate": 6.053596733712292e-05, + "loss": 2.729, + "step": 31520 + }, + { + "epoch": 1.4675373047466072, + "grad_norm": 0.36282458350231056, + "learning_rate": 6.05333194054536e-05, + "loss": 2.6173, + "step": 31521 + }, + { + "epoch": 1.4675838629327003, + "grad_norm": 0.35375334346815035, + "learning_rate": 6.053067144287003e-05, + "loss": 2.7681, + "step": 31522 + }, + { + "epoch": 1.4676304211187932, + "grad_norm": 0.3607961011008354, + "learning_rate": 6.052802344937999e-05, + "loss": 2.7148, + "step": 31523 + }, + { + "epoch": 1.4676769793048863, + "grad_norm": 0.33790307098916933, + "learning_rate": 6.052537542499121e-05, + "loss": 2.7195, + "step": 31524 + }, + { + "epoch": 1.4677235374909794, + "grad_norm": 0.3882493655163638, + "learning_rate": 6.052272736971152e-05, + "loss": 2.8765, + "step": 31525 + }, + { + "epoch": 1.4677700956770723, + "grad_norm": 0.3414323964008096, + "learning_rate": 6.0520079283548636e-05, + "loss": 2.7391, + "step": 31526 + }, + { + "epoch": 1.4678166538631654, + "grad_norm": 0.3841530864846264, + "learning_rate": 6.051743116651036e-05, + "loss": 2.7681, + "step": 31527 + }, + { + "epoch": 1.4678632120492585, + "grad_norm": 0.34461965837090985, + "learning_rate": 6.051478301860447e-05, + "loss": 2.8308, + "step": 31528 + }, + { + "epoch": 1.4679097702353516, + "grad_norm": 0.36726403177814726, + "learning_rate": 6.051213483983873e-05, + "loss": 2.7513, + "step": 31529 + }, + { + "epoch": 1.4679563284214447, + "grad_norm": 0.3007260591049047, + "learning_rate": 6.05094866302209e-05, + "loss": 2.7632, + "step": 31530 + }, + { + "epoch": 1.4680028866075379, + "grad_norm": 0.36248007104806135, + "learning_rate": 6.050683838975876e-05, + "loss": 2.6729, + "step": 31531 + }, + { + "epoch": 1.468049444793631, + "grad_norm": 0.3446504588348002, + "learning_rate": 6.0504190118460094e-05, + "loss": 2.7432, + "step": 31532 + }, + { + "epoch": 1.4680960029797239, + "grad_norm": 0.3640973458681846, + "learning_rate": 6.0501541816332654e-05, + "loss": 2.7609, + "step": 31533 + }, + { + "epoch": 1.468142561165817, + "grad_norm": 0.33504336953997393, + "learning_rate": 6.049889348338423e-05, + "loss": 2.762, + "step": 31534 + }, + { + "epoch": 1.46818911935191, + "grad_norm": 0.3537073189369313, + "learning_rate": 6.049624511962259e-05, + "loss": 2.7101, + "step": 31535 + }, + { + "epoch": 1.468235677538003, + "grad_norm": 0.34061619472523025, + "learning_rate": 6.04935967250555e-05, + 
"loss": 2.7604, + "step": 31536 + }, + { + "epoch": 1.468282235724096, + "grad_norm": 0.3408357792791647, + "learning_rate": 6.0490948299690745e-05, + "loss": 2.751, + "step": 31537 + }, + { + "epoch": 1.4683287939101892, + "grad_norm": 0.3206063567700554, + "learning_rate": 6.048829984353609e-05, + "loss": 2.6528, + "step": 31538 + }, + { + "epoch": 1.4683753520962823, + "grad_norm": 0.355370989612786, + "learning_rate": 6.0485651356599294e-05, + "loss": 2.7812, + "step": 31539 + }, + { + "epoch": 1.4684219102823755, + "grad_norm": 0.30637353057739947, + "learning_rate": 6.0483002838888145e-05, + "loss": 2.682, + "step": 31540 + }, + { + "epoch": 1.4684684684684686, + "grad_norm": 0.3373035443256753, + "learning_rate": 6.048035429041042e-05, + "loss": 2.7152, + "step": 31541 + }, + { + "epoch": 1.4685150266545615, + "grad_norm": 0.32834630450406216, + "learning_rate": 6.0477705711173896e-05, + "loss": 2.8557, + "step": 31542 + }, + { + "epoch": 1.4685615848406546, + "grad_norm": 0.3616128383902954, + "learning_rate": 6.047505710118634e-05, + "loss": 2.6939, + "step": 31543 + }, + { + "epoch": 1.4686081430267477, + "grad_norm": 0.33264246333904507, + "learning_rate": 6.047240846045551e-05, + "loss": 2.786, + "step": 31544 + }, + { + "epoch": 1.4686547012128408, + "grad_norm": 0.33608572826205513, + "learning_rate": 6.04697597889892e-05, + "loss": 2.647, + "step": 31545 + }, + { + "epoch": 1.4687012593989337, + "grad_norm": 0.3326754212920769, + "learning_rate": 6.046711108679517e-05, + "loss": 2.7342, + "step": 31546 + }, + { + "epoch": 1.4687478175850268, + "grad_norm": 0.35034030601101673, + "learning_rate": 6.0464462353881203e-05, + "loss": 2.7239, + "step": 31547 + }, + { + "epoch": 1.46879437577112, + "grad_norm": 0.34629295182578734, + "learning_rate": 6.046181359025508e-05, + "loss": 2.7714, + "step": 31548 + }, + { + "epoch": 1.468840933957213, + "grad_norm": 0.34430896537303723, + "learning_rate": 6.045916479592457e-05, + "loss": 2.7463, + "step": 31549 + }, + { + "epoch": 1.4688874921433062, + "grad_norm": 0.32355964791258446, + "learning_rate": 6.045651597089742e-05, + "loss": 2.6377, + "step": 31550 + }, + { + "epoch": 1.4689340503293993, + "grad_norm": 0.3080199699125012, + "learning_rate": 6.045386711518144e-05, + "loss": 2.7625, + "step": 31551 + }, + { + "epoch": 1.4689806085154922, + "grad_norm": 0.35018671585067707, + "learning_rate": 6.045121822878439e-05, + "loss": 2.794, + "step": 31552 + }, + { + "epoch": 1.4690271667015853, + "grad_norm": 0.3140256660744866, + "learning_rate": 6.044856931171403e-05, + "loss": 2.6115, + "step": 31553 + }, + { + "epoch": 1.4690737248876784, + "grad_norm": 0.38232802136379535, + "learning_rate": 6.044592036397816e-05, + "loss": 2.7805, + "step": 31554 + }, + { + "epoch": 1.4691202830737715, + "grad_norm": 0.33855403294570874, + "learning_rate": 6.0443271385584534e-05, + "loss": 2.7335, + "step": 31555 + }, + { + "epoch": 1.4691668412598644, + "grad_norm": 0.348298932095468, + "learning_rate": 6.044062237654093e-05, + "loss": 2.7942, + "step": 31556 + }, + { + "epoch": 1.4692133994459575, + "grad_norm": 0.3717061512101682, + "learning_rate": 6.0437973336855156e-05, + "loss": 2.8206, + "step": 31557 + }, + { + "epoch": 1.4692599576320506, + "grad_norm": 0.3105223020042199, + "learning_rate": 6.043532426653494e-05, + "loss": 2.7643, + "step": 31558 + }, + { + "epoch": 1.4693065158181438, + "grad_norm": 0.3720200578254046, + "learning_rate": 6.043267516558807e-05, + "loss": 2.8425, + "step": 31559 + }, + { + "epoch": 1.4693530740042369, + 
"grad_norm": 0.3193053165174144, + "learning_rate": 6.043002603402234e-05, + "loss": 2.7535, + "step": 31560 + }, + { + "epoch": 1.46939963219033, + "grad_norm": 0.33766787871345455, + "learning_rate": 6.042737687184551e-05, + "loss": 2.6715, + "step": 31561 + }, + { + "epoch": 1.4694461903764229, + "grad_norm": 0.3336943086471956, + "learning_rate": 6.042472767906534e-05, + "loss": 2.7562, + "step": 31562 + }, + { + "epoch": 1.469492748562516, + "grad_norm": 0.38070208667070904, + "learning_rate": 6.042207845568964e-05, + "loss": 2.7848, + "step": 31563 + }, + { + "epoch": 1.469539306748609, + "grad_norm": 0.3579487693907362, + "learning_rate": 6.041942920172616e-05, + "loss": 2.7435, + "step": 31564 + }, + { + "epoch": 1.4695858649347022, + "grad_norm": 0.3654848736555957, + "learning_rate": 6.0416779917182674e-05, + "loss": 2.7437, + "step": 31565 + }, + { + "epoch": 1.4696324231207951, + "grad_norm": 0.3471837012648401, + "learning_rate": 6.041413060206698e-05, + "loss": 2.7398, + "step": 31566 + }, + { + "epoch": 1.4696789813068882, + "grad_norm": 0.3514049033114051, + "learning_rate": 6.041148125638683e-05, + "loss": 2.7872, + "step": 31567 + }, + { + "epoch": 1.4697255394929813, + "grad_norm": 0.3488307516566108, + "learning_rate": 6.0408831880150006e-05, + "loss": 2.6494, + "step": 31568 + }, + { + "epoch": 1.4697720976790745, + "grad_norm": 0.33544100651443265, + "learning_rate": 6.040618247336428e-05, + "loss": 2.6553, + "step": 31569 + }, + { + "epoch": 1.4698186558651676, + "grad_norm": 0.3174042515768049, + "learning_rate": 6.040353303603745e-05, + "loss": 2.8085, + "step": 31570 + }, + { + "epoch": 1.4698652140512607, + "grad_norm": 0.3372414817055996, + "learning_rate": 6.0400883568177266e-05, + "loss": 2.7932, + "step": 31571 + }, + { + "epoch": 1.4699117722373536, + "grad_norm": 0.3724815522430879, + "learning_rate": 6.039823406979152e-05, + "loss": 2.6873, + "step": 31572 + }, + { + "epoch": 1.4699583304234467, + "grad_norm": 0.3395454198397006, + "learning_rate": 6.0395584540887963e-05, + "loss": 2.8108, + "step": 31573 + }, + { + "epoch": 1.4700048886095398, + "grad_norm": 0.35154014795537897, + "learning_rate": 6.039293498147441e-05, + "loss": 2.6208, + "step": 31574 + }, + { + "epoch": 1.4700514467956327, + "grad_norm": 0.3434798591120209, + "learning_rate": 6.039028539155861e-05, + "loss": 2.7044, + "step": 31575 + }, + { + "epoch": 1.4700980049817258, + "grad_norm": 0.34332872202432296, + "learning_rate": 6.0387635771148357e-05, + "loss": 2.6465, + "step": 31576 + }, + { + "epoch": 1.470144563167819, + "grad_norm": 0.33912015899645126, + "learning_rate": 6.03849861202514e-05, + "loss": 2.7202, + "step": 31577 + }, + { + "epoch": 1.470191121353912, + "grad_norm": 0.35865110248018955, + "learning_rate": 6.038233643887553e-05, + "loss": 2.6337, + "step": 31578 + }, + { + "epoch": 1.4702376795400052, + "grad_norm": 0.3508432211572796, + "learning_rate": 6.037968672702854e-05, + "loss": 2.7177, + "step": 31579 + }, + { + "epoch": 1.4702842377260983, + "grad_norm": 0.35263100037107403, + "learning_rate": 6.037703698471818e-05, + "loss": 2.6386, + "step": 31580 + }, + { + "epoch": 1.4703307959121912, + "grad_norm": 0.33844560638135035, + "learning_rate": 6.037438721195224e-05, + "loss": 2.6072, + "step": 31581 + }, + { + "epoch": 1.4703773540982843, + "grad_norm": 0.3775253822219014, + "learning_rate": 6.037173740873849e-05, + "loss": 2.7766, + "step": 31582 + }, + { + "epoch": 1.4704239122843774, + "grad_norm": 0.36864423675171465, + "learning_rate": 
6.036908757508473e-05, + "loss": 2.7856, + "step": 31583 + }, + { + "epoch": 1.4704704704704705, + "grad_norm": 0.3814929940572873, + "learning_rate": 6.036643771099871e-05, + "loss": 2.7359, + "step": 31584 + }, + { + "epoch": 1.4705170286565634, + "grad_norm": 0.3442956600321022, + "learning_rate": 6.036378781648822e-05, + "loss": 2.7095, + "step": 31585 + }, + { + "epoch": 1.4705635868426565, + "grad_norm": 0.35020779837155286, + "learning_rate": 6.036113789156102e-05, + "loss": 2.6915, + "step": 31586 + }, + { + "epoch": 1.4706101450287496, + "grad_norm": 0.3707181258344647, + "learning_rate": 6.035848793622492e-05, + "loss": 2.722, + "step": 31587 + }, + { + "epoch": 1.4706567032148428, + "grad_norm": 0.344836401921393, + "learning_rate": 6.035583795048767e-05, + "loss": 2.7009, + "step": 31588 + }, + { + "epoch": 1.4707032614009359, + "grad_norm": 0.395952764205338, + "learning_rate": 6.035318793435706e-05, + "loss": 2.6819, + "step": 31589 + }, + { + "epoch": 1.470749819587029, + "grad_norm": 0.34650564408955115, + "learning_rate": 6.035053788784085e-05, + "loss": 2.804, + "step": 31590 + }, + { + "epoch": 1.4707963777731219, + "grad_norm": 0.3707284146793721, + "learning_rate": 6.0347887810946846e-05, + "loss": 2.6213, + "step": 31591 + }, + { + "epoch": 1.470842935959215, + "grad_norm": 0.3614075329286889, + "learning_rate": 6.03452377036828e-05, + "loss": 2.7987, + "step": 31592 + }, + { + "epoch": 1.4708894941453081, + "grad_norm": 0.3824557175949744, + "learning_rate": 6.0342587566056504e-05, + "loss": 2.72, + "step": 31593 + }, + { + "epoch": 1.4709360523314012, + "grad_norm": 0.3521356452474596, + "learning_rate": 6.033993739807574e-05, + "loss": 2.7754, + "step": 31594 + }, + { + "epoch": 1.4709826105174941, + "grad_norm": 0.3653895927466009, + "learning_rate": 6.033728719974827e-05, + "loss": 2.7837, + "step": 31595 + }, + { + "epoch": 1.4710291687035872, + "grad_norm": 0.36157834977225234, + "learning_rate": 6.033463697108188e-05, + "loss": 2.7432, + "step": 31596 + }, + { + "epoch": 1.4710757268896804, + "grad_norm": 0.3420302346080589, + "learning_rate": 6.0331986712084355e-05, + "loss": 2.7706, + "step": 31597 + }, + { + "epoch": 1.4711222850757735, + "grad_norm": 0.38065583255582447, + "learning_rate": 6.032933642276346e-05, + "loss": 2.772, + "step": 31598 + }, + { + "epoch": 1.4711688432618666, + "grad_norm": 0.3238973042613638, + "learning_rate": 6.032668610312698e-05, + "loss": 2.8565, + "step": 31599 + }, + { + "epoch": 1.4712154014479597, + "grad_norm": 0.38252120024115643, + "learning_rate": 6.032403575318271e-05, + "loss": 2.6991, + "step": 31600 + }, + { + "epoch": 1.4712619596340526, + "grad_norm": 0.3583389266285371, + "learning_rate": 6.032138537293839e-05, + "loss": 2.7348, + "step": 31601 + }, + { + "epoch": 1.4713085178201457, + "grad_norm": 0.34804006240348834, + "learning_rate": 6.031873496240183e-05, + "loss": 2.7385, + "step": 31602 + }, + { + "epoch": 1.4713550760062388, + "grad_norm": 0.3310948448355341, + "learning_rate": 6.0316084521580805e-05, + "loss": 2.6799, + "step": 31603 + }, + { + "epoch": 1.471401634192332, + "grad_norm": 0.34256635020738907, + "learning_rate": 6.0313434050483086e-05, + "loss": 2.7043, + "step": 31604 + }, + { + "epoch": 1.4714481923784248, + "grad_norm": 0.34765386135913934, + "learning_rate": 6.031078354911644e-05, + "loss": 2.7326, + "step": 31605 + }, + { + "epoch": 1.471494750564518, + "grad_norm": 0.3551922414657311, + "learning_rate": 6.030813301748868e-05, + "loss": 2.7996, + "step": 31606 + }, + { + "epoch": 
1.471541308750611, + "grad_norm": 0.34160645765244574, + "learning_rate": 6.0305482455607555e-05, + "loss": 2.6729, + "step": 31607 + }, + { + "epoch": 1.4715878669367042, + "grad_norm": 0.34460337918833317, + "learning_rate": 6.030283186348085e-05, + "loss": 2.6759, + "step": 31608 + }, + { + "epoch": 1.4716344251227973, + "grad_norm": 0.35382455579039207, + "learning_rate": 6.030018124111635e-05, + "loss": 2.7902, + "step": 31609 + }, + { + "epoch": 1.4716809833088904, + "grad_norm": 0.35203587364547856, + "learning_rate": 6.029753058852183e-05, + "loss": 2.7633, + "step": 31610 + }, + { + "epoch": 1.4717275414949833, + "grad_norm": 0.4029291406800997, + "learning_rate": 6.029487990570509e-05, + "loss": 2.7583, + "step": 31611 + }, + { + "epoch": 1.4717740996810764, + "grad_norm": 0.37786621262813563, + "learning_rate": 6.029222919267388e-05, + "loss": 2.6647, + "step": 31612 + }, + { + "epoch": 1.4718206578671695, + "grad_norm": 0.3671247075610522, + "learning_rate": 6.028957844943599e-05, + "loss": 2.7689, + "step": 31613 + }, + { + "epoch": 1.4718672160532624, + "grad_norm": 0.3505534281910215, + "learning_rate": 6.028692767599921e-05, + "loss": 2.7165, + "step": 31614 + }, + { + "epoch": 1.4719137742393555, + "grad_norm": 0.374711199683241, + "learning_rate": 6.0284276872371303e-05, + "loss": 2.7905, + "step": 31615 + }, + { + "epoch": 1.4719603324254487, + "grad_norm": 0.35936931785945114, + "learning_rate": 6.028162603856006e-05, + "loss": 2.735, + "step": 31616 + }, + { + "epoch": 1.4720068906115418, + "grad_norm": 0.3483015987466102, + "learning_rate": 6.027897517457325e-05, + "loss": 2.7149, + "step": 31617 + }, + { + "epoch": 1.472053448797635, + "grad_norm": 0.3680945946628401, + "learning_rate": 6.027632428041866e-05, + "loss": 2.7538, + "step": 31618 + }, + { + "epoch": 1.472100006983728, + "grad_norm": 0.3722691400971606, + "learning_rate": 6.0273673356104074e-05, + "loss": 2.7751, + "step": 31619 + }, + { + "epoch": 1.4721465651698211, + "grad_norm": 0.3433070530995209, + "learning_rate": 6.027102240163728e-05, + "loss": 2.7429, + "step": 31620 + }, + { + "epoch": 1.472193123355914, + "grad_norm": 0.4107673254663396, + "learning_rate": 6.026837141702603e-05, + "loss": 2.7804, + "step": 31621 + }, + { + "epoch": 1.4722396815420071, + "grad_norm": 0.3215524273866393, + "learning_rate": 6.026572040227813e-05, + "loss": 2.8509, + "step": 31622 + }, + { + "epoch": 1.4722862397281002, + "grad_norm": 0.34199741884242746, + "learning_rate": 6.026306935740135e-05, + "loss": 2.775, + "step": 31623 + }, + { + "epoch": 1.4723327979141931, + "grad_norm": 0.32354443280530865, + "learning_rate": 6.0260418282403474e-05, + "loss": 2.6261, + "step": 31624 + }, + { + "epoch": 1.4723793561002863, + "grad_norm": 0.32469462839397045, + "learning_rate": 6.025776717729228e-05, + "loss": 2.7812, + "step": 31625 + }, + { + "epoch": 1.4724259142863794, + "grad_norm": 0.3434516519806569, + "learning_rate": 6.025511604207556e-05, + "loss": 2.722, + "step": 31626 + }, + { + "epoch": 1.4724724724724725, + "grad_norm": 0.3070880884037306, + "learning_rate": 6.025246487676106e-05, + "loss": 2.7934, + "step": 31627 + }, + { + "epoch": 1.4725190306585656, + "grad_norm": 0.36399972112320156, + "learning_rate": 6.0249813681356605e-05, + "loss": 2.8281, + "step": 31628 + }, + { + "epoch": 1.4725655888446587, + "grad_norm": 0.33502743403430973, + "learning_rate": 6.0247162455869965e-05, + "loss": 2.7079, + "step": 31629 + }, + { + "epoch": 1.4726121470307516, + "grad_norm": 0.3097622800685772, + 
"learning_rate": 6.02445112003089e-05, + "loss": 2.7255, + "step": 31630 + }, + { + "epoch": 1.4726587052168447, + "grad_norm": 0.3577412315891619, + "learning_rate": 6.0241859914681184e-05, + "loss": 2.8173, + "step": 31631 + }, + { + "epoch": 1.4727052634029378, + "grad_norm": 0.3083536408533939, + "learning_rate": 6.023920859899464e-05, + "loss": 2.8142, + "step": 31632 + }, + { + "epoch": 1.472751821589031, + "grad_norm": 0.32797015502538035, + "learning_rate": 6.023655725325703e-05, + "loss": 2.6365, + "step": 31633 + }, + { + "epoch": 1.4727983797751238, + "grad_norm": 0.3405580617912321, + "learning_rate": 6.0233905877476124e-05, + "loss": 2.7533, + "step": 31634 + }, + { + "epoch": 1.472844937961217, + "grad_norm": 0.33479736432454527, + "learning_rate": 6.023125447165973e-05, + "loss": 2.6283, + "step": 31635 + }, + { + "epoch": 1.47289149614731, + "grad_norm": 0.3312816584883406, + "learning_rate": 6.022860303581559e-05, + "loss": 2.7661, + "step": 31636 + }, + { + "epoch": 1.4729380543334032, + "grad_norm": 0.3221393857762935, + "learning_rate": 6.0225951569951524e-05, + "loss": 2.7309, + "step": 31637 + }, + { + "epoch": 1.4729846125194963, + "grad_norm": 0.3573368768417678, + "learning_rate": 6.0223300074075284e-05, + "loss": 2.7663, + "step": 31638 + }, + { + "epoch": 1.4730311707055894, + "grad_norm": 0.3230879573026728, + "learning_rate": 6.022064854819468e-05, + "loss": 2.7433, + "step": 31639 + }, + { + "epoch": 1.4730777288916823, + "grad_norm": 0.3168215977626667, + "learning_rate": 6.021799699231747e-05, + "loss": 2.6712, + "step": 31640 + }, + { + "epoch": 1.4731242870777754, + "grad_norm": 0.3502712618674528, + "learning_rate": 6.021534540645144e-05, + "loss": 2.7588, + "step": 31641 + }, + { + "epoch": 1.4731708452638685, + "grad_norm": 0.3420608751430948, + "learning_rate": 6.021269379060439e-05, + "loss": 2.7842, + "step": 31642 + }, + { + "epoch": 1.4732174034499617, + "grad_norm": 0.3389307156563688, + "learning_rate": 6.021004214478408e-05, + "loss": 2.8015, + "step": 31643 + }, + { + "epoch": 1.4732639616360546, + "grad_norm": 0.33291599982568126, + "learning_rate": 6.02073904689983e-05, + "loss": 2.6619, + "step": 31644 + }, + { + "epoch": 1.4733105198221477, + "grad_norm": 0.2998282134342455, + "learning_rate": 6.020473876325484e-05, + "loss": 2.7243, + "step": 31645 + }, + { + "epoch": 1.4733570780082408, + "grad_norm": 0.3318418179007037, + "learning_rate": 6.0202087027561485e-05, + "loss": 2.8409, + "step": 31646 + }, + { + "epoch": 1.473403636194334, + "grad_norm": 0.34454978873560105, + "learning_rate": 6.019943526192601e-05, + "loss": 2.7799, + "step": 31647 + }, + { + "epoch": 1.473450194380427, + "grad_norm": 0.30577527365570406, + "learning_rate": 6.0196783466356174e-05, + "loss": 2.7369, + "step": 31648 + }, + { + "epoch": 1.4734967525665201, + "grad_norm": 0.34946223225650136, + "learning_rate": 6.019413164085981e-05, + "loss": 2.8188, + "step": 31649 + }, + { + "epoch": 1.473543310752613, + "grad_norm": 0.3080936392082223, + "learning_rate": 6.019147978544466e-05, + "loss": 2.7387, + "step": 31650 + }, + { + "epoch": 1.4735898689387061, + "grad_norm": 0.32568484794431335, + "learning_rate": 6.0188827900118524e-05, + "loss": 2.7903, + "step": 31651 + }, + { + "epoch": 1.4736364271247993, + "grad_norm": 0.33401922975672044, + "learning_rate": 6.0186175984889194e-05, + "loss": 2.6691, + "step": 31652 + }, + { + "epoch": 1.4736829853108924, + "grad_norm": 0.3336642782856622, + "learning_rate": 6.018352403976442e-05, + "loss": 2.7397, + "step": 31653 
+ }, + { + "epoch": 1.4737295434969853, + "grad_norm": 0.3568268710445028, + "learning_rate": 6.018087206475201e-05, + "loss": 2.7892, + "step": 31654 + }, + { + "epoch": 1.4737761016830784, + "grad_norm": 0.3246251298748704, + "learning_rate": 6.0178220059859744e-05, + "loss": 2.7646, + "step": 31655 + }, + { + "epoch": 1.4738226598691715, + "grad_norm": 0.3677448027127592, + "learning_rate": 6.017556802509542e-05, + "loss": 2.7728, + "step": 31656 + }, + { + "epoch": 1.4738692180552646, + "grad_norm": 0.3327884541213691, + "learning_rate": 6.017291596046679e-05, + "loss": 2.9042, + "step": 31657 + }, + { + "epoch": 1.4739157762413577, + "grad_norm": 0.3622691619391163, + "learning_rate": 6.017026386598166e-05, + "loss": 2.734, + "step": 31658 + }, + { + "epoch": 1.4739623344274508, + "grad_norm": 0.337012180977195, + "learning_rate": 6.01676117416478e-05, + "loss": 2.7049, + "step": 31659 + }, + { + "epoch": 1.4740088926135437, + "grad_norm": 0.32212758221567356, + "learning_rate": 6.0164959587473015e-05, + "loss": 2.7156, + "step": 31660 + }, + { + "epoch": 1.4740554507996368, + "grad_norm": 0.317052509191874, + "learning_rate": 6.016230740346507e-05, + "loss": 2.6589, + "step": 31661 + }, + { + "epoch": 1.47410200898573, + "grad_norm": 0.3367635643823991, + "learning_rate": 6.015965518963175e-05, + "loss": 2.6979, + "step": 31662 + }, + { + "epoch": 1.4741485671718229, + "grad_norm": 0.33384094911908807, + "learning_rate": 6.015700294598085e-05, + "loss": 2.7327, + "step": 31663 + }, + { + "epoch": 1.474195125357916, + "grad_norm": 0.3339417318134173, + "learning_rate": 6.015435067252013e-05, + "loss": 2.7023, + "step": 31664 + }, + { + "epoch": 1.474241683544009, + "grad_norm": 0.31065438100897874, + "learning_rate": 6.0151698369257417e-05, + "loss": 2.7836, + "step": 31665 + }, + { + "epoch": 1.4742882417301022, + "grad_norm": 0.34033566435403995, + "learning_rate": 6.014904603620045e-05, + "loss": 2.6331, + "step": 31666 + }, + { + "epoch": 1.4743347999161953, + "grad_norm": 0.3406138801930642, + "learning_rate": 6.014639367335704e-05, + "loss": 2.7175, + "step": 31667 + }, + { + "epoch": 1.4743813581022884, + "grad_norm": 0.36146386815736464, + "learning_rate": 6.014374128073495e-05, + "loss": 2.7392, + "step": 31668 + }, + { + "epoch": 1.4744279162883813, + "grad_norm": 0.34750883604517213, + "learning_rate": 6.014108885834201e-05, + "loss": 2.6499, + "step": 31669 + }, + { + "epoch": 1.4744744744744744, + "grad_norm": 0.3536097385765049, + "learning_rate": 6.013843640618595e-05, + "loss": 2.7301, + "step": 31670 + }, + { + "epoch": 1.4745210326605676, + "grad_norm": 0.3371909901486109, + "learning_rate": 6.013578392427458e-05, + "loss": 2.7623, + "step": 31671 + }, + { + "epoch": 1.4745675908466607, + "grad_norm": 0.37381646412755465, + "learning_rate": 6.0133131412615675e-05, + "loss": 2.745, + "step": 31672 + }, + { + "epoch": 1.4746141490327536, + "grad_norm": 0.37107553220992306, + "learning_rate": 6.0130478871217035e-05, + "loss": 2.7471, + "step": 31673 + }, + { + "epoch": 1.4746607072188467, + "grad_norm": 0.3389476061085183, + "learning_rate": 6.0127826300086455e-05, + "loss": 2.7172, + "step": 31674 + }, + { + "epoch": 1.4747072654049398, + "grad_norm": 0.4107469381816038, + "learning_rate": 6.012517369923167e-05, + "loss": 2.7171, + "step": 31675 + }, + { + "epoch": 1.474753823591033, + "grad_norm": 0.3569272608322742, + "learning_rate": 6.012252106866053e-05, + "loss": 2.6662, + "step": 31676 + }, + { + "epoch": 1.474800381777126, + "grad_norm": 0.3210651914699391, + 
"learning_rate": 6.0119868408380773e-05, + "loss": 2.7512, + "step": 31677 + }, + { + "epoch": 1.4748469399632191, + "grad_norm": 0.3720000542917313, + "learning_rate": 6.011721571840021e-05, + "loss": 2.7174, + "step": 31678 + }, + { + "epoch": 1.474893498149312, + "grad_norm": 0.3289295488805404, + "learning_rate": 6.01145629987266e-05, + "loss": 2.6647, + "step": 31679 + }, + { + "epoch": 1.4749400563354051, + "grad_norm": 0.3560823357447553, + "learning_rate": 6.011191024936774e-05, + "loss": 2.7161, + "step": 31680 + }, + { + "epoch": 1.4749866145214983, + "grad_norm": 0.3252589741906685, + "learning_rate": 6.010925747033143e-05, + "loss": 2.6685, + "step": 31681 + }, + { + "epoch": 1.4750331727075914, + "grad_norm": 0.37270046894374437, + "learning_rate": 6.010660466162544e-05, + "loss": 2.712, + "step": 31682 + }, + { + "epoch": 1.4750797308936843, + "grad_norm": 0.3135995367372936, + "learning_rate": 6.010395182325758e-05, + "loss": 2.6955, + "step": 31683 + }, + { + "epoch": 1.4751262890797774, + "grad_norm": 0.37402665262483004, + "learning_rate": 6.010129895523559e-05, + "loss": 2.7536, + "step": 31684 + }, + { + "epoch": 1.4751728472658705, + "grad_norm": 0.35546638539251935, + "learning_rate": 6.0098646057567286e-05, + "loss": 2.6254, + "step": 31685 + }, + { + "epoch": 1.4752194054519636, + "grad_norm": 0.38221277826698613, + "learning_rate": 6.009599313026046e-05, + "loss": 2.6663, + "step": 31686 + }, + { + "epoch": 1.4752659636380567, + "grad_norm": 0.3596947494614066, + "learning_rate": 6.0093340173322896e-05, + "loss": 2.7736, + "step": 31687 + }, + { + "epoch": 1.4753125218241498, + "grad_norm": 0.36434921570842355, + "learning_rate": 6.0090687186762364e-05, + "loss": 2.6285, + "step": 31688 + }, + { + "epoch": 1.4753590800102427, + "grad_norm": 0.3653668644163674, + "learning_rate": 6.008803417058666e-05, + "loss": 2.6719, + "step": 31689 + }, + { + "epoch": 1.4754056381963359, + "grad_norm": 0.36617215370614536, + "learning_rate": 6.0085381124803576e-05, + "loss": 2.8167, + "step": 31690 + }, + { + "epoch": 1.475452196382429, + "grad_norm": 0.3368547707024225, + "learning_rate": 6.008272804942088e-05, + "loss": 2.6858, + "step": 31691 + }, + { + "epoch": 1.475498754568522, + "grad_norm": 0.3652521116981252, + "learning_rate": 6.008007494444639e-05, + "loss": 2.6964, + "step": 31692 + }, + { + "epoch": 1.475545312754615, + "grad_norm": 0.3522784936411801, + "learning_rate": 6.007742180988785e-05, + "loss": 2.7835, + "step": 31693 + }, + { + "epoch": 1.475591870940708, + "grad_norm": 0.3484403533488746, + "learning_rate": 6.007476864575308e-05, + "loss": 2.6454, + "step": 31694 + }, + { + "epoch": 1.4756384291268012, + "grad_norm": 0.365788109240766, + "learning_rate": 6.007211545204985e-05, + "loss": 2.7126, + "step": 31695 + }, + { + "epoch": 1.4756849873128943, + "grad_norm": 0.36473210542245715, + "learning_rate": 6.006946222878596e-05, + "loss": 2.7394, + "step": 31696 + }, + { + "epoch": 1.4757315454989874, + "grad_norm": 0.3385977961290845, + "learning_rate": 6.006680897596919e-05, + "loss": 2.7498, + "step": 31697 + }, + { + "epoch": 1.4757781036850806, + "grad_norm": 0.38434036988351805, + "learning_rate": 6.006415569360732e-05, + "loss": 2.6885, + "step": 31698 + }, + { + "epoch": 1.4758246618711734, + "grad_norm": 0.3187194029707756, + "learning_rate": 6.006150238170815e-05, + "loss": 2.6732, + "step": 31699 + }, + { + "epoch": 1.4758712200572666, + "grad_norm": 0.36770238660164634, + "learning_rate": 6.0058849040279455e-05, + "loss": 2.7137, + "step": 31700 
+ }, + { + "epoch": 1.4759177782433597, + "grad_norm": 0.3574906951270678, + "learning_rate": 6.005619566932905e-05, + "loss": 2.8585, + "step": 31701 + }, + { + "epoch": 1.4759643364294526, + "grad_norm": 0.3724472086187227, + "learning_rate": 6.0053542268864695e-05, + "loss": 2.693, + "step": 31702 + }, + { + "epoch": 1.4760108946155457, + "grad_norm": 0.382718233723773, + "learning_rate": 6.005088883889417e-05, + "loss": 2.7387, + "step": 31703 + }, + { + "epoch": 1.4760574528016388, + "grad_norm": 0.3685589634294189, + "learning_rate": 6.004823537942528e-05, + "loss": 2.7556, + "step": 31704 + }, + { + "epoch": 1.476104010987732, + "grad_norm": 0.36135405158288336, + "learning_rate": 6.0045581890465816e-05, + "loss": 2.7752, + "step": 31705 + }, + { + "epoch": 1.476150569173825, + "grad_norm": 0.344480513815763, + "learning_rate": 6.0042928372023556e-05, + "loss": 2.7631, + "step": 31706 + }, + { + "epoch": 1.4761971273599181, + "grad_norm": 0.347745033939924, + "learning_rate": 6.004027482410629e-05, + "loss": 2.7599, + "step": 31707 + }, + { + "epoch": 1.4762436855460113, + "grad_norm": 0.3453034987085505, + "learning_rate": 6.003762124672181e-05, + "loss": 2.7664, + "step": 31708 + }, + { + "epoch": 1.4762902437321042, + "grad_norm": 0.39769034151875304, + "learning_rate": 6.003496763987788e-05, + "loss": 2.7245, + "step": 31709 + }, + { + "epoch": 1.4763368019181973, + "grad_norm": 0.3300976748148412, + "learning_rate": 6.0032314003582336e-05, + "loss": 2.7589, + "step": 31710 + }, + { + "epoch": 1.4763833601042904, + "grad_norm": 0.38081331470982316, + "learning_rate": 6.002966033784293e-05, + "loss": 2.803, + "step": 31711 + }, + { + "epoch": 1.4764299182903833, + "grad_norm": 0.3371276787252305, + "learning_rate": 6.002700664266745e-05, + "loss": 2.6482, + "step": 31712 + }, + { + "epoch": 1.4764764764764764, + "grad_norm": 0.33716818111058594, + "learning_rate": 6.0024352918063697e-05, + "loss": 2.7349, + "step": 31713 + }, + { + "epoch": 1.4765230346625695, + "grad_norm": 0.3492366724354261, + "learning_rate": 6.002169916403947e-05, + "loss": 2.7038, + "step": 31714 + }, + { + "epoch": 1.4765695928486626, + "grad_norm": 0.3511187965061835, + "learning_rate": 6.0019045380602544e-05, + "loss": 2.7284, + "step": 31715 + }, + { + "epoch": 1.4766161510347557, + "grad_norm": 0.34754966805940213, + "learning_rate": 6.001639156776069e-05, + "loss": 2.6141, + "step": 31716 + }, + { + "epoch": 1.4766627092208489, + "grad_norm": 0.34559229248805334, + "learning_rate": 6.001373772552171e-05, + "loss": 2.6515, + "step": 31717 + }, + { + "epoch": 1.4767092674069418, + "grad_norm": 0.3400169237460196, + "learning_rate": 6.001108385389341e-05, + "loss": 2.7663, + "step": 31718 + }, + { + "epoch": 1.4767558255930349, + "grad_norm": 0.3368861252672414, + "learning_rate": 6.000842995288357e-05, + "loss": 2.7651, + "step": 31719 + }, + { + "epoch": 1.476802383779128, + "grad_norm": 0.3401358278513385, + "learning_rate": 6.000577602249996e-05, + "loss": 2.6693, + "step": 31720 + }, + { + "epoch": 1.476848941965221, + "grad_norm": 0.30786423336895785, + "learning_rate": 6.0003122062750384e-05, + "loss": 2.7109, + "step": 31721 + }, + { + "epoch": 1.476895500151314, + "grad_norm": 0.3433591136553589, + "learning_rate": 6.000046807364263e-05, + "loss": 2.6985, + "step": 31722 + }, + { + "epoch": 1.476942058337407, + "grad_norm": 0.30589639870121577, + "learning_rate": 5.9997814055184496e-05, + "loss": 2.6924, + "step": 31723 + }, + { + "epoch": 1.4769886165235002, + "grad_norm": 0.34421946774198114, 
+ "learning_rate": 5.999516000738376e-05, + "loss": 2.741, + "step": 31724 + }, + { + "epoch": 1.4770351747095933, + "grad_norm": 0.30678253685002, + "learning_rate": 5.999250593024821e-05, + "loss": 2.8394, + "step": 31725 + }, + { + "epoch": 1.4770817328956865, + "grad_norm": 0.33724713201805007, + "learning_rate": 5.998985182378564e-05, + "loss": 2.8291, + "step": 31726 + }, + { + "epoch": 1.4771282910817796, + "grad_norm": 0.3462290654538711, + "learning_rate": 5.998719768800385e-05, + "loss": 2.7556, + "step": 31727 + }, + { + "epoch": 1.4771748492678725, + "grad_norm": 0.33222290821135597, + "learning_rate": 5.99845435229106e-05, + "loss": 2.6822, + "step": 31728 + }, + { + "epoch": 1.4772214074539656, + "grad_norm": 0.3333571199305611, + "learning_rate": 5.998188932851372e-05, + "loss": 2.7495, + "step": 31729 + }, + { + "epoch": 1.4772679656400587, + "grad_norm": 0.3474692663757784, + "learning_rate": 5.997923510482095e-05, + "loss": 2.7876, + "step": 31730 + }, + { + "epoch": 1.4773145238261518, + "grad_norm": 0.40681149178318105, + "learning_rate": 5.9976580851840127e-05, + "loss": 2.7646, + "step": 31731 + }, + { + "epoch": 1.4773610820122447, + "grad_norm": 0.36896689449724357, + "learning_rate": 5.997392656957903e-05, + "loss": 2.8452, + "step": 31732 + }, + { + "epoch": 1.4774076401983378, + "grad_norm": 0.3682326293318561, + "learning_rate": 5.997127225804543e-05, + "loss": 2.7469, + "step": 31733 + }, + { + "epoch": 1.477454198384431, + "grad_norm": 0.35493921699868025, + "learning_rate": 5.996861791724713e-05, + "loss": 2.7491, + "step": 31734 + }, + { + "epoch": 1.477500756570524, + "grad_norm": 0.36236780364705945, + "learning_rate": 5.9965963547191904e-05, + "loss": 2.7451, + "step": 31735 + }, + { + "epoch": 1.4775473147566172, + "grad_norm": 0.36046304571402565, + "learning_rate": 5.996330914788757e-05, + "loss": 2.7429, + "step": 31736 + }, + { + "epoch": 1.4775938729427103, + "grad_norm": 0.3444721625127536, + "learning_rate": 5.9960654719341915e-05, + "loss": 2.763, + "step": 31737 + }, + { + "epoch": 1.4776404311288032, + "grad_norm": 0.3259317188998407, + "learning_rate": 5.995800026156271e-05, + "loss": 2.7261, + "step": 31738 + }, + { + "epoch": 1.4776869893148963, + "grad_norm": 0.35956171507345885, + "learning_rate": 5.995534577455775e-05, + "loss": 2.7148, + "step": 31739 + }, + { + "epoch": 1.4777335475009894, + "grad_norm": 0.339869575277293, + "learning_rate": 5.995269125833484e-05, + "loss": 2.8577, + "step": 31740 + }, + { + "epoch": 1.4777801056870825, + "grad_norm": 0.331140662492501, + "learning_rate": 5.995003671290175e-05, + "loss": 2.7563, + "step": 31741 + }, + { + "epoch": 1.4778266638731754, + "grad_norm": 0.32158106510043244, + "learning_rate": 5.9947382138266296e-05, + "loss": 2.8308, + "step": 31742 + }, + { + "epoch": 1.4778732220592685, + "grad_norm": 0.3672391947607638, + "learning_rate": 5.9944727534436254e-05, + "loss": 2.8048, + "step": 31743 + }, + { + "epoch": 1.4779197802453616, + "grad_norm": 0.3211171090701122, + "learning_rate": 5.99420729014194e-05, + "loss": 2.6729, + "step": 31744 + }, + { + "epoch": 1.4779663384314548, + "grad_norm": 0.34082788197805114, + "learning_rate": 5.993941823922356e-05, + "loss": 2.6816, + "step": 31745 + }, + { + "epoch": 1.4780128966175479, + "grad_norm": 0.3669579225798507, + "learning_rate": 5.99367635478565e-05, + "loss": 2.818, + "step": 31746 + }, + { + "epoch": 1.478059454803641, + "grad_norm": 0.3694202859417576, + "learning_rate": 5.9934108827326015e-05, + "loss": 2.833, + "step": 31747 + 
}, + { + "epoch": 1.4781060129897339, + "grad_norm": 0.3376016536358751, + "learning_rate": 5.99314540776399e-05, + "loss": 2.7567, + "step": 31748 + }, + { + "epoch": 1.478152571175827, + "grad_norm": 0.3212266395314335, + "learning_rate": 5.992879929880595e-05, + "loss": 2.7011, + "step": 31749 + }, + { + "epoch": 1.47819912936192, + "grad_norm": 0.32660517912430376, + "learning_rate": 5.992614449083196e-05, + "loss": 2.6933, + "step": 31750 + }, + { + "epoch": 1.478245687548013, + "grad_norm": 0.3239547158091912, + "learning_rate": 5.9923489653725697e-05, + "loss": 2.7922, + "step": 31751 + }, + { + "epoch": 1.4782922457341061, + "grad_norm": 0.3232552488014948, + "learning_rate": 5.992083478749497e-05, + "loss": 2.7252, + "step": 31752 + }, + { + "epoch": 1.4783388039201992, + "grad_norm": 0.3383969656091889, + "learning_rate": 5.991817989214758e-05, + "loss": 2.8366, + "step": 31753 + }, + { + "epoch": 1.4783853621062923, + "grad_norm": 0.34447184137781306, + "learning_rate": 5.9915524967691305e-05, + "loss": 2.6179, + "step": 31754 + }, + { + "epoch": 1.4784319202923855, + "grad_norm": 0.33976761499957653, + "learning_rate": 5.991287001413395e-05, + "loss": 2.6469, + "step": 31755 + }, + { + "epoch": 1.4784784784784786, + "grad_norm": 0.34030144902683496, + "learning_rate": 5.991021503148329e-05, + "loss": 2.7449, + "step": 31756 + }, + { + "epoch": 1.4785250366645715, + "grad_norm": 0.3459312012882793, + "learning_rate": 5.990756001974712e-05, + "loss": 2.6538, + "step": 31757 + }, + { + "epoch": 1.4785715948506646, + "grad_norm": 0.3779130394022488, + "learning_rate": 5.9904904978933244e-05, + "loss": 2.7443, + "step": 31758 + }, + { + "epoch": 1.4786181530367577, + "grad_norm": 0.33811649780014014, + "learning_rate": 5.9902249909049445e-05, + "loss": 2.6733, + "step": 31759 + }, + { + "epoch": 1.4786647112228508, + "grad_norm": 0.3684182420628938, + "learning_rate": 5.989959481010352e-05, + "loss": 2.7725, + "step": 31760 + }, + { + "epoch": 1.4787112694089437, + "grad_norm": 0.3363606983252268, + "learning_rate": 5.989693968210326e-05, + "loss": 2.7587, + "step": 31761 + }, + { + "epoch": 1.4787578275950368, + "grad_norm": 0.3678136201978903, + "learning_rate": 5.989428452505644e-05, + "loss": 2.7159, + "step": 31762 + }, + { + "epoch": 1.47880438578113, + "grad_norm": 0.3313645431704695, + "learning_rate": 5.989162933897089e-05, + "loss": 2.6919, + "step": 31763 + }, + { + "epoch": 1.478850943967223, + "grad_norm": 0.36278232659099113, + "learning_rate": 5.988897412385439e-05, + "loss": 2.7206, + "step": 31764 + }, + { + "epoch": 1.4788975021533162, + "grad_norm": 0.320590715527487, + "learning_rate": 5.988631887971471e-05, + "loss": 2.8031, + "step": 31765 + }, + { + "epoch": 1.4789440603394093, + "grad_norm": 0.3378791869076443, + "learning_rate": 5.988366360655966e-05, + "loss": 2.792, + "step": 31766 + }, + { + "epoch": 1.4789906185255022, + "grad_norm": 0.3497467812950911, + "learning_rate": 5.988100830439704e-05, + "loss": 2.7549, + "step": 31767 + }, + { + "epoch": 1.4790371767115953, + "grad_norm": 0.3098801371659423, + "learning_rate": 5.987835297323463e-05, + "loss": 2.6235, + "step": 31768 + }, + { + "epoch": 1.4790837348976884, + "grad_norm": 0.3554954645695001, + "learning_rate": 5.9875697613080216e-05, + "loss": 2.7406, + "step": 31769 + }, + { + "epoch": 1.4791302930837815, + "grad_norm": 0.3170608333629625, + "learning_rate": 5.987304222394161e-05, + "loss": 2.663, + "step": 31770 + }, + { + "epoch": 1.4791768512698744, + "grad_norm": 0.3207508396461309, + 
"learning_rate": 5.9870386805826594e-05, + "loss": 2.6614, + "step": 31771 + }, + { + "epoch": 1.4792234094559675, + "grad_norm": 0.31818384583578696, + "learning_rate": 5.9867731358742964e-05, + "loss": 2.6697, + "step": 31772 + }, + { + "epoch": 1.4792699676420606, + "grad_norm": 0.3315563851993793, + "learning_rate": 5.986507588269853e-05, + "loss": 2.7221, + "step": 31773 + }, + { + "epoch": 1.4793165258281538, + "grad_norm": 0.3201842876317868, + "learning_rate": 5.986242037770106e-05, + "loss": 2.7, + "step": 31774 + }, + { + "epoch": 1.4793630840142469, + "grad_norm": 0.323311520453161, + "learning_rate": 5.9859764843758346e-05, + "loss": 2.8051, + "step": 31775 + }, + { + "epoch": 1.47940964220034, + "grad_norm": 0.3265736240630969, + "learning_rate": 5.98571092808782e-05, + "loss": 2.6906, + "step": 31776 + }, + { + "epoch": 1.4794562003864329, + "grad_norm": 0.32425918236038115, + "learning_rate": 5.9854453689068423e-05, + "loss": 2.7633, + "step": 31777 + }, + { + "epoch": 1.479502758572526, + "grad_norm": 0.33661102868299825, + "learning_rate": 5.9851798068336785e-05, + "loss": 2.7011, + "step": 31778 + }, + { + "epoch": 1.4795493167586191, + "grad_norm": 0.3645407179695484, + "learning_rate": 5.9849142418691094e-05, + "loss": 2.7592, + "step": 31779 + }, + { + "epoch": 1.4795958749447122, + "grad_norm": 0.3282137590309233, + "learning_rate": 5.984648674013914e-05, + "loss": 2.7893, + "step": 31780 + }, + { + "epoch": 1.4796424331308051, + "grad_norm": 0.3545961933999121, + "learning_rate": 5.984383103268871e-05, + "loss": 2.7975, + "step": 31781 + }, + { + "epoch": 1.4796889913168982, + "grad_norm": 0.3501083630337093, + "learning_rate": 5.984117529634762e-05, + "loss": 2.6712, + "step": 31782 + }, + { + "epoch": 1.4797355495029914, + "grad_norm": 0.3332928211780521, + "learning_rate": 5.983851953112364e-05, + "loss": 2.8299, + "step": 31783 + }, + { + "epoch": 1.4797821076890845, + "grad_norm": 0.37366844758061246, + "learning_rate": 5.983586373702457e-05, + "loss": 2.8454, + "step": 31784 + }, + { + "epoch": 1.4798286658751776, + "grad_norm": 0.35513325576267574, + "learning_rate": 5.983320791405821e-05, + "loss": 2.7268, + "step": 31785 + }, + { + "epoch": 1.4798752240612707, + "grad_norm": 0.37060685927734066, + "learning_rate": 5.983055206223237e-05, + "loss": 2.8553, + "step": 31786 + }, + { + "epoch": 1.4799217822473636, + "grad_norm": 0.33525870476924, + "learning_rate": 5.982789618155481e-05, + "loss": 2.7441, + "step": 31787 + }, + { + "epoch": 1.4799683404334567, + "grad_norm": 0.40491009853335636, + "learning_rate": 5.9825240272033346e-05, + "loss": 2.865, + "step": 31788 + }, + { + "epoch": 1.4800148986195498, + "grad_norm": 0.3421325545787541, + "learning_rate": 5.982258433367577e-05, + "loss": 2.7428, + "step": 31789 + }, + { + "epoch": 1.4800614568056427, + "grad_norm": 0.38152034669508733, + "learning_rate": 5.981992836648989e-05, + "loss": 2.6984, + "step": 31790 + }, + { + "epoch": 1.4801080149917358, + "grad_norm": 0.3142749930955042, + "learning_rate": 5.981727237048347e-05, + "loss": 2.6892, + "step": 31791 + }, + { + "epoch": 1.480154573177829, + "grad_norm": 0.344812521074185, + "learning_rate": 5.9814616345664346e-05, + "loss": 2.7018, + "step": 31792 + }, + { + "epoch": 1.480201131363922, + "grad_norm": 0.3752223037595402, + "learning_rate": 5.9811960292040257e-05, + "loss": 2.6621, + "step": 31793 + }, + { + "epoch": 1.4802476895500152, + "grad_norm": 0.3020516131433341, + "learning_rate": 5.980930420961905e-05, + "loss": 2.7065, + "step": 31794 + }, 
+ { + "epoch": 1.4802942477361083, + "grad_norm": 0.3724894789303063, + "learning_rate": 5.98066480984085e-05, + "loss": 2.6998, + "step": 31795 + }, + { + "epoch": 1.4803408059222014, + "grad_norm": 0.3583862424281986, + "learning_rate": 5.9803991958416405e-05, + "loss": 2.7248, + "step": 31796 + }, + { + "epoch": 1.4803873641082943, + "grad_norm": 0.34183562529144634, + "learning_rate": 5.980133578965056e-05, + "loss": 2.7735, + "step": 31797 + }, + { + "epoch": 1.4804339222943874, + "grad_norm": 0.3638793202685983, + "learning_rate": 5.979867959211876e-05, + "loss": 2.7957, + "step": 31798 + }, + { + "epoch": 1.4804804804804805, + "grad_norm": 0.3516177497352781, + "learning_rate": 5.979602336582879e-05, + "loss": 2.8007, + "step": 31799 + }, + { + "epoch": 1.4805270386665734, + "grad_norm": 0.3392611772554713, + "learning_rate": 5.9793367110788466e-05, + "loss": 2.7746, + "step": 31800 + }, + { + "epoch": 1.4805735968526665, + "grad_norm": 0.34753022556306307, + "learning_rate": 5.979071082700557e-05, + "loss": 2.7367, + "step": 31801 + }, + { + "epoch": 1.4806201550387597, + "grad_norm": 0.3451793510844958, + "learning_rate": 5.97880545144879e-05, + "loss": 2.7514, + "step": 31802 + }, + { + "epoch": 1.4806667132248528, + "grad_norm": 0.3288226906494676, + "learning_rate": 5.978539817324327e-05, + "loss": 2.758, + "step": 31803 + }, + { + "epoch": 1.4807132714109459, + "grad_norm": 0.3375058605512863, + "learning_rate": 5.978274180327945e-05, + "loss": 2.8679, + "step": 31804 + }, + { + "epoch": 1.480759829597039, + "grad_norm": 0.3472871573836895, + "learning_rate": 5.978008540460426e-05, + "loss": 2.7351, + "step": 31805 + }, + { + "epoch": 1.480806387783132, + "grad_norm": 0.36269080403855364, + "learning_rate": 5.977742897722546e-05, + "loss": 2.7953, + "step": 31806 + }, + { + "epoch": 1.480852945969225, + "grad_norm": 0.3618540037971789, + "learning_rate": 5.977477252115088e-05, + "loss": 2.7946, + "step": 31807 + }, + { + "epoch": 1.4808995041553181, + "grad_norm": 0.3444282635952479, + "learning_rate": 5.9772116036388305e-05, + "loss": 2.822, + "step": 31808 + }, + { + "epoch": 1.4809460623414112, + "grad_norm": 0.35012324955535146, + "learning_rate": 5.9769459522945537e-05, + "loss": 2.7294, + "step": 31809 + }, + { + "epoch": 1.4809926205275041, + "grad_norm": 0.3397418793914549, + "learning_rate": 5.976680298083035e-05, + "loss": 2.7895, + "step": 31810 + }, + { + "epoch": 1.4810391787135972, + "grad_norm": 0.36229193171040197, + "learning_rate": 5.976414641005057e-05, + "loss": 2.8296, + "step": 31811 + }, + { + "epoch": 1.4810857368996904, + "grad_norm": 0.36348685156699106, + "learning_rate": 5.9761489810613976e-05, + "loss": 2.7525, + "step": 31812 + }, + { + "epoch": 1.4811322950857835, + "grad_norm": 0.35393500986617604, + "learning_rate": 5.9758833182528395e-05, + "loss": 2.7719, + "step": 31813 + }, + { + "epoch": 1.4811788532718766, + "grad_norm": 0.34454955210366334, + "learning_rate": 5.975617652580158e-05, + "loss": 2.7931, + "step": 31814 + }, + { + "epoch": 1.4812254114579697, + "grad_norm": 0.363221822046504, + "learning_rate": 5.975351984044134e-05, + "loss": 2.7582, + "step": 31815 + }, + { + "epoch": 1.4812719696440626, + "grad_norm": 0.33406061837174345, + "learning_rate": 5.97508631264555e-05, + "loss": 2.7524, + "step": 31816 + }, + { + "epoch": 1.4813185278301557, + "grad_norm": 0.341000786746023, + "learning_rate": 5.974820638385183e-05, + "loss": 2.783, + "step": 31817 + }, + { + "epoch": 1.4813650860162488, + "grad_norm": 0.3423554718060572, + 
"learning_rate": 5.974554961263814e-05, + "loss": 2.6839, + "step": 31818 + }, + { + "epoch": 1.481411644202342, + "grad_norm": 0.3381852456753834, + "learning_rate": 5.974289281282221e-05, + "loss": 2.82, + "step": 31819 + }, + { + "epoch": 1.4814582023884348, + "grad_norm": 0.3755972446912957, + "learning_rate": 5.974023598441185e-05, + "loss": 2.8109, + "step": 31820 + }, + { + "epoch": 1.481504760574528, + "grad_norm": 0.3372507298794354, + "learning_rate": 5.973757912741487e-05, + "loss": 2.6967, + "step": 31821 + }, + { + "epoch": 1.481551318760621, + "grad_norm": 0.36176728238550704, + "learning_rate": 5.973492224183905e-05, + "loss": 2.6705, + "step": 31822 + }, + { + "epoch": 1.4815978769467142, + "grad_norm": 0.346482243058429, + "learning_rate": 5.973226532769218e-05, + "loss": 2.7628, + "step": 31823 + }, + { + "epoch": 1.4816444351328073, + "grad_norm": 0.34009418942346, + "learning_rate": 5.9729608384982085e-05, + "loss": 2.6444, + "step": 31824 + }, + { + "epoch": 1.4816909933189004, + "grad_norm": 0.34756319016315446, + "learning_rate": 5.972695141371654e-05, + "loss": 2.7767, + "step": 31825 + }, + { + "epoch": 1.4817375515049933, + "grad_norm": 0.3845963466975184, + "learning_rate": 5.972429441390334e-05, + "loss": 2.8852, + "step": 31826 + }, + { + "epoch": 1.4817841096910864, + "grad_norm": 0.36198318589856543, + "learning_rate": 5.972163738555031e-05, + "loss": 2.7962, + "step": 31827 + }, + { + "epoch": 1.4818306678771795, + "grad_norm": 0.3809330130049281, + "learning_rate": 5.971898032866523e-05, + "loss": 2.7668, + "step": 31828 + }, + { + "epoch": 1.4818772260632727, + "grad_norm": 0.371449941503774, + "learning_rate": 5.9716323243255915e-05, + "loss": 2.704, + "step": 31829 + }, + { + "epoch": 1.4819237842493655, + "grad_norm": 0.35655812189964553, + "learning_rate": 5.971366612933012e-05, + "loss": 2.8252, + "step": 31830 + }, + { + "epoch": 1.4819703424354587, + "grad_norm": 0.38088943656924107, + "learning_rate": 5.971100898689569e-05, + "loss": 2.7311, + "step": 31831 + }, + { + "epoch": 1.4820169006215518, + "grad_norm": 0.3696766397342462, + "learning_rate": 5.9708351815960396e-05, + "loss": 2.7385, + "step": 31832 + }, + { + "epoch": 1.482063458807645, + "grad_norm": 0.36753762479439867, + "learning_rate": 5.970569461653205e-05, + "loss": 2.7167, + "step": 31833 + }, + { + "epoch": 1.482110016993738, + "grad_norm": 0.36879158780738586, + "learning_rate": 5.970303738861844e-05, + "loss": 2.6133, + "step": 31834 + }, + { + "epoch": 1.4821565751798311, + "grad_norm": 0.3630204629141984, + "learning_rate": 5.970038013222737e-05, + "loss": 2.7337, + "step": 31835 + }, + { + "epoch": 1.482203133365924, + "grad_norm": 0.3335374283193665, + "learning_rate": 5.9697722847366654e-05, + "loss": 2.8036, + "step": 31836 + }, + { + "epoch": 1.4822496915520171, + "grad_norm": 0.37956951560602703, + "learning_rate": 5.969506553404407e-05, + "loss": 2.5782, + "step": 31837 + }, + { + "epoch": 1.4822962497381102, + "grad_norm": 0.3232891594402544, + "learning_rate": 5.9692408192267426e-05, + "loss": 2.731, + "step": 31838 + }, + { + "epoch": 1.4823428079242031, + "grad_norm": 0.3644124631888167, + "learning_rate": 5.968975082204451e-05, + "loss": 2.623, + "step": 31839 + }, + { + "epoch": 1.4823893661102963, + "grad_norm": 0.33263768966878077, + "learning_rate": 5.9687093423383145e-05, + "loss": 2.8147, + "step": 31840 + }, + { + "epoch": 1.4824359242963894, + "grad_norm": 0.33225046544458203, + "learning_rate": 5.96844359962911e-05, + "loss": 2.83, + "step": 31841 + }, + { 
+ "epoch": 1.4824824824824825, + "grad_norm": 0.3457767703879352, + "learning_rate": 5.96817785407762e-05, + "loss": 2.7377, + "step": 31842 + }, + { + "epoch": 1.4825290406685756, + "grad_norm": 0.3741462914642373, + "learning_rate": 5.967912105684623e-05, + "loss": 2.7535, + "step": 31843 + }, + { + "epoch": 1.4825755988546687, + "grad_norm": 0.34511805186440164, + "learning_rate": 5.9676463544508987e-05, + "loss": 2.5953, + "step": 31844 + }, + { + "epoch": 1.4826221570407616, + "grad_norm": 0.3305721781549945, + "learning_rate": 5.967380600377229e-05, + "loss": 2.8343, + "step": 31845 + }, + { + "epoch": 1.4826687152268547, + "grad_norm": 0.3837765456486618, + "learning_rate": 5.967114843464392e-05, + "loss": 2.7946, + "step": 31846 + }, + { + "epoch": 1.4827152734129478, + "grad_norm": 0.3147748620213386, + "learning_rate": 5.966849083713167e-05, + "loss": 2.8184, + "step": 31847 + }, + { + "epoch": 1.482761831599041, + "grad_norm": 0.39079612532918406, + "learning_rate": 5.966583321124336e-05, + "loss": 2.6858, + "step": 31848 + }, + { + "epoch": 1.4828083897851339, + "grad_norm": 0.3321238873410882, + "learning_rate": 5.9663175556986796e-05, + "loss": 2.7258, + "step": 31849 + }, + { + "epoch": 1.482854947971227, + "grad_norm": 0.3635355065093083, + "learning_rate": 5.9660517874369745e-05, + "loss": 2.6103, + "step": 31850 + }, + { + "epoch": 1.48290150615732, + "grad_norm": 0.3185145609678005, + "learning_rate": 5.965786016340003e-05, + "loss": 2.6301, + "step": 31851 + }, + { + "epoch": 1.4829480643434132, + "grad_norm": 0.3853846402883488, + "learning_rate": 5.9655202424085446e-05, + "loss": 2.7733, + "step": 31852 + }, + { + "epoch": 1.4829946225295063, + "grad_norm": 0.3431675611618639, + "learning_rate": 5.9652544656433796e-05, + "loss": 2.6966, + "step": 31853 + }, + { + "epoch": 1.4830411807155994, + "grad_norm": 0.364637564120659, + "learning_rate": 5.9649886860452885e-05, + "loss": 2.6947, + "step": 31854 + }, + { + "epoch": 1.4830877389016923, + "grad_norm": 0.3565292936653835, + "learning_rate": 5.964722903615051e-05, + "loss": 2.7774, + "step": 31855 + }, + { + "epoch": 1.4831342970877854, + "grad_norm": 0.36311969569039554, + "learning_rate": 5.964457118353446e-05, + "loss": 2.7063, + "step": 31856 + }, + { + "epoch": 1.4831808552738786, + "grad_norm": 0.37384996856215025, + "learning_rate": 5.964191330261253e-05, + "loss": 2.759, + "step": 31857 + }, + { + "epoch": 1.4832274134599717, + "grad_norm": 0.34239926516529834, + "learning_rate": 5.963925539339255e-05, + "loss": 2.707, + "step": 31858 + }, + { + "epoch": 1.4832739716460646, + "grad_norm": 0.3747720756382981, + "learning_rate": 5.96365974558823e-05, + "loss": 2.8658, + "step": 31859 + }, + { + "epoch": 1.4833205298321577, + "grad_norm": 0.337415341887544, + "learning_rate": 5.9633939490089596e-05, + "loss": 2.6235, + "step": 31860 + }, + { + "epoch": 1.4833670880182508, + "grad_norm": 0.3466573586868972, + "learning_rate": 5.963128149602222e-05, + "loss": 2.705, + "step": 31861 + }, + { + "epoch": 1.483413646204344, + "grad_norm": 0.3464726318983959, + "learning_rate": 5.962862347368797e-05, + "loss": 2.6907, + "step": 31862 + }, + { + "epoch": 1.483460204390437, + "grad_norm": 0.3486311906070392, + "learning_rate": 5.9625965423094684e-05, + "loss": 2.7445, + "step": 31863 + }, + { + "epoch": 1.4835067625765301, + "grad_norm": 0.3468767459423429, + "learning_rate": 5.962330734425012e-05, + "loss": 2.7695, + "step": 31864 + }, + { + "epoch": 1.483553320762623, + "grad_norm": 0.3720151905336839, + 
"learning_rate": 5.9620649237162094e-05, + "loss": 2.8258, + "step": 31865 + }, + { + "epoch": 1.4835998789487161, + "grad_norm": 0.3551003218597519, + "learning_rate": 5.9617991101838425e-05, + "loss": 2.7407, + "step": 31866 + }, + { + "epoch": 1.4836464371348093, + "grad_norm": 0.3558028000609352, + "learning_rate": 5.9615332938286896e-05, + "loss": 2.7801, + "step": 31867 + }, + { + "epoch": 1.4836929953209024, + "grad_norm": 0.3460814096966928, + "learning_rate": 5.961267474651532e-05, + "loss": 2.75, + "step": 31868 + }, + { + "epoch": 1.4837395535069953, + "grad_norm": 0.34393899648542015, + "learning_rate": 5.961001652653147e-05, + "loss": 2.7889, + "step": 31869 + }, + { + "epoch": 1.4837861116930884, + "grad_norm": 0.3534363809126016, + "learning_rate": 5.9607358278343185e-05, + "loss": 2.801, + "step": 31870 + }, + { + "epoch": 1.4838326698791815, + "grad_norm": 0.36923747447453453, + "learning_rate": 5.9604700001958234e-05, + "loss": 2.778, + "step": 31871 + }, + { + "epoch": 1.4838792280652746, + "grad_norm": 0.37604548794576115, + "learning_rate": 5.9602041697384447e-05, + "loss": 2.8458, + "step": 31872 + }, + { + "epoch": 1.4839257862513677, + "grad_norm": 0.3751577743118916, + "learning_rate": 5.9599383364629616e-05, + "loss": 2.7268, + "step": 31873 + }, + { + "epoch": 1.4839723444374608, + "grad_norm": 0.39516833572955573, + "learning_rate": 5.959672500370153e-05, + "loss": 2.848, + "step": 31874 + }, + { + "epoch": 1.4840189026235537, + "grad_norm": 0.34426434912583537, + "learning_rate": 5.9594066614608e-05, + "loss": 2.6867, + "step": 31875 + }, + { + "epoch": 1.4840654608096469, + "grad_norm": 0.3768217559766294, + "learning_rate": 5.9591408197356836e-05, + "loss": 2.7178, + "step": 31876 + }, + { + "epoch": 1.48411201899574, + "grad_norm": 0.32337353317895484, + "learning_rate": 5.958874975195584e-05, + "loss": 2.786, + "step": 31877 + }, + { + "epoch": 1.4841585771818329, + "grad_norm": 0.37143722792300254, + "learning_rate": 5.9586091278412795e-05, + "loss": 2.7429, + "step": 31878 + }, + { + "epoch": 1.484205135367926, + "grad_norm": 0.34969348797238814, + "learning_rate": 5.9583432776735525e-05, + "loss": 2.7787, + "step": 31879 + }, + { + "epoch": 1.484251693554019, + "grad_norm": 0.3476713176644319, + "learning_rate": 5.958077424693183e-05, + "loss": 2.7458, + "step": 31880 + }, + { + "epoch": 1.4842982517401122, + "grad_norm": 0.36051165605358415, + "learning_rate": 5.9578115689009504e-05, + "loss": 2.8001, + "step": 31881 + }, + { + "epoch": 1.4843448099262053, + "grad_norm": 0.3568942506260602, + "learning_rate": 5.957545710297634e-05, + "loss": 2.7872, + "step": 31882 + }, + { + "epoch": 1.4843913681122984, + "grad_norm": 0.35890956304932564, + "learning_rate": 5.9572798488840164e-05, + "loss": 2.633, + "step": 31883 + }, + { + "epoch": 1.4844379262983916, + "grad_norm": 0.3546782195091973, + "learning_rate": 5.9570139846608754e-05, + "loss": 2.5965, + "step": 31884 + }, + { + "epoch": 1.4844844844844844, + "grad_norm": 0.31455536276650736, + "learning_rate": 5.956748117628993e-05, + "loss": 2.641, + "step": 31885 + }, + { + "epoch": 1.4845310426705776, + "grad_norm": 0.3704144913593371, + "learning_rate": 5.95648224778915e-05, + "loss": 2.6678, + "step": 31886 + }, + { + "epoch": 1.4845776008566707, + "grad_norm": 0.3331281996249011, + "learning_rate": 5.956216375142126e-05, + "loss": 2.686, + "step": 31887 + }, + { + "epoch": 1.4846241590427636, + "grad_norm": 0.3509840610992747, + "learning_rate": 5.9559504996887e-05, + "loss": 2.7732, + "step": 31888 + 
}, + { + "epoch": 1.4846707172288567, + "grad_norm": 0.3486129947585734, + "learning_rate": 5.9556846214296535e-05, + "loss": 2.7294, + "step": 31889 + }, + { + "epoch": 1.4847172754149498, + "grad_norm": 0.32157008486565997, + "learning_rate": 5.955418740365769e-05, + "loss": 2.7473, + "step": 31890 + }, + { + "epoch": 1.484763833601043, + "grad_norm": 0.36297601149083847, + "learning_rate": 5.9551528564978224e-05, + "loss": 2.687, + "step": 31891 + }, + { + "epoch": 1.484810391787136, + "grad_norm": 0.33667310661330885, + "learning_rate": 5.9548869698265966e-05, + "loss": 2.6841, + "step": 31892 + }, + { + "epoch": 1.4848569499732291, + "grad_norm": 0.33293468385446673, + "learning_rate": 5.954621080352872e-05, + "loss": 2.6916, + "step": 31893 + }, + { + "epoch": 1.484903508159322, + "grad_norm": 0.34844929523129103, + "learning_rate": 5.9543551880774284e-05, + "loss": 2.6708, + "step": 31894 + }, + { + "epoch": 1.4849500663454152, + "grad_norm": 0.36247756190175007, + "learning_rate": 5.954089293001047e-05, + "loss": 2.7143, + "step": 31895 + }, + { + "epoch": 1.4849966245315083, + "grad_norm": 0.37058781338848035, + "learning_rate": 5.953823395124507e-05, + "loss": 2.7763, + "step": 31896 + }, + { + "epoch": 1.4850431827176014, + "grad_norm": 0.38065365431532466, + "learning_rate": 5.953557494448589e-05, + "loss": 2.7734, + "step": 31897 + }, + { + "epoch": 1.4850897409036943, + "grad_norm": 0.362352474865827, + "learning_rate": 5.9532915909740736e-05, + "loss": 2.7379, + "step": 31898 + }, + { + "epoch": 1.4851362990897874, + "grad_norm": 0.35822934800814177, + "learning_rate": 5.953025684701743e-05, + "loss": 2.7288, + "step": 31899 + }, + { + "epoch": 1.4851828572758805, + "grad_norm": 0.3611635320016251, + "learning_rate": 5.9527597756323736e-05, + "loss": 2.7365, + "step": 31900 + }, + { + "epoch": 1.4852294154619736, + "grad_norm": 0.3722667164306974, + "learning_rate": 5.952493863766749e-05, + "loss": 2.7869, + "step": 31901 + }, + { + "epoch": 1.4852759736480667, + "grad_norm": 0.3550291463517095, + "learning_rate": 5.952227949105649e-05, + "loss": 2.7216, + "step": 31902 + }, + { + "epoch": 1.4853225318341599, + "grad_norm": 0.3822532927172085, + "learning_rate": 5.951962031649852e-05, + "loss": 2.7481, + "step": 31903 + }, + { + "epoch": 1.4853690900202527, + "grad_norm": 0.3737231862824362, + "learning_rate": 5.951696111400143e-05, + "loss": 2.7297, + "step": 31904 + }, + { + "epoch": 1.4854156482063459, + "grad_norm": 0.3656820408146065, + "learning_rate": 5.951430188357299e-05, + "loss": 2.7544, + "step": 31905 + }, + { + "epoch": 1.485462206392439, + "grad_norm": 0.3457201919858855, + "learning_rate": 5.951164262522101e-05, + "loss": 2.7486, + "step": 31906 + }, + { + "epoch": 1.485508764578532, + "grad_norm": 0.3707088625978392, + "learning_rate": 5.950898333895328e-05, + "loss": 2.6573, + "step": 31907 + }, + { + "epoch": 1.485555322764625, + "grad_norm": 0.345258478786033, + "learning_rate": 5.950632402477765e-05, + "loss": 2.743, + "step": 31908 + }, + { + "epoch": 1.485601880950718, + "grad_norm": 0.3897118167932788, + "learning_rate": 5.950366468270187e-05, + "loss": 2.771, + "step": 31909 + }, + { + "epoch": 1.4856484391368112, + "grad_norm": 0.32958043862602016, + "learning_rate": 5.950100531273377e-05, + "loss": 2.7847, + "step": 31910 + }, + { + "epoch": 1.4856949973229043, + "grad_norm": 0.3482486594534419, + "learning_rate": 5.949834591488117e-05, + "loss": 2.7623, + "step": 31911 + }, + { + "epoch": 1.4857415555089974, + "grad_norm": 0.3502287971223598, + 
"learning_rate": 5.9495686489151846e-05, + "loss": 2.6609, + "step": 31912 + }, + { + "epoch": 1.4857881136950906, + "grad_norm": 0.329177715057278, + "learning_rate": 5.949302703555364e-05, + "loss": 2.8346, + "step": 31913 + }, + { + "epoch": 1.4858346718811835, + "grad_norm": 0.3416338092165529, + "learning_rate": 5.949036755409432e-05, + "loss": 2.8101, + "step": 31914 + }, + { + "epoch": 1.4858812300672766, + "grad_norm": 0.3405142776008806, + "learning_rate": 5.9487708044781695e-05, + "loss": 2.7605, + "step": 31915 + }, + { + "epoch": 1.4859277882533697, + "grad_norm": 0.3458047492683351, + "learning_rate": 5.9485048507623595e-05, + "loss": 2.6857, + "step": 31916 + }, + { + "epoch": 1.4859743464394628, + "grad_norm": 0.3538609657844964, + "learning_rate": 5.948238894262782e-05, + "loss": 2.6803, + "step": 31917 + }, + { + "epoch": 1.4860209046255557, + "grad_norm": 0.32429400556835697, + "learning_rate": 5.947972934980215e-05, + "loss": 2.5706, + "step": 31918 + }, + { + "epoch": 1.4860674628116488, + "grad_norm": 0.3427241878809443, + "learning_rate": 5.947706972915442e-05, + "loss": 2.8172, + "step": 31919 + }, + { + "epoch": 1.486114020997742, + "grad_norm": 0.32297491004561696, + "learning_rate": 5.947441008069242e-05, + "loss": 2.7904, + "step": 31920 + }, + { + "epoch": 1.486160579183835, + "grad_norm": 0.34999853738212733, + "learning_rate": 5.947175040442396e-05, + "loss": 2.6616, + "step": 31921 + }, + { + "epoch": 1.4862071373699282, + "grad_norm": 0.3718381120432923, + "learning_rate": 5.9469090700356855e-05, + "loss": 2.7699, + "step": 31922 + }, + { + "epoch": 1.4862536955560213, + "grad_norm": 0.3621677959408162, + "learning_rate": 5.946643096849889e-05, + "loss": 2.7569, + "step": 31923 + }, + { + "epoch": 1.4863002537421142, + "grad_norm": 0.3599697807047929, + "learning_rate": 5.9463771208857886e-05, + "loss": 2.7866, + "step": 31924 + }, + { + "epoch": 1.4863468119282073, + "grad_norm": 0.3588840090033091, + "learning_rate": 5.946111142144164e-05, + "loss": 2.6943, + "step": 31925 + }, + { + "epoch": 1.4863933701143004, + "grad_norm": 0.3518560179201506, + "learning_rate": 5.945845160625798e-05, + "loss": 2.8531, + "step": 31926 + }, + { + "epoch": 1.4864399283003933, + "grad_norm": 0.3618710500174609, + "learning_rate": 5.945579176331468e-05, + "loss": 2.739, + "step": 31927 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.3648633709841611, + "learning_rate": 5.945313189261956e-05, + "loss": 2.7649, + "step": 31928 + }, + { + "epoch": 1.4865330446725795, + "grad_norm": 0.3434476979156347, + "learning_rate": 5.9450471994180436e-05, + "loss": 2.8473, + "step": 31929 + }, + { + "epoch": 1.4865796028586726, + "grad_norm": 0.36356885026898206, + "learning_rate": 5.9447812068005114e-05, + "loss": 2.679, + "step": 31930 + }, + { + "epoch": 1.4866261610447657, + "grad_norm": 0.3631969325238173, + "learning_rate": 5.944515211410139e-05, + "loss": 2.7528, + "step": 31931 + }, + { + "epoch": 1.4866727192308589, + "grad_norm": 0.34050404262952216, + "learning_rate": 5.944249213247708e-05, + "loss": 2.6948, + "step": 31932 + }, + { + "epoch": 1.4867192774169518, + "grad_norm": 0.33525434645275853, + "learning_rate": 5.9439832123139983e-05, + "loss": 2.786, + "step": 31933 + }, + { + "epoch": 1.4867658356030449, + "grad_norm": 0.3505977249525047, + "learning_rate": 5.94371720860979e-05, + "loss": 2.7879, + "step": 31934 + }, + { + "epoch": 1.486812393789138, + "grad_norm": 0.3660838885622769, + "learning_rate": 5.9434512021358655e-05, + "loss": 2.8413, + "step": 31935 
+ }, + { + "epoch": 1.486858951975231, + "grad_norm": 0.3795662561672278, + "learning_rate": 5.943185192893005e-05, + "loss": 2.7711, + "step": 31936 + }, + { + "epoch": 1.486905510161324, + "grad_norm": 0.36293435966577636, + "learning_rate": 5.9429191808819875e-05, + "loss": 2.6704, + "step": 31937 + }, + { + "epoch": 1.486952068347417, + "grad_norm": 0.3708145039685035, + "learning_rate": 5.942653166103595e-05, + "loss": 2.7911, + "step": 31938 + }, + { + "epoch": 1.4869986265335102, + "grad_norm": 0.37041664695855514, + "learning_rate": 5.9423871485586105e-05, + "loss": 2.8067, + "step": 31939 + }, + { + "epoch": 1.4870451847196033, + "grad_norm": 0.3620003476016596, + "learning_rate": 5.942121128247812e-05, + "loss": 2.778, + "step": 31940 + }, + { + "epoch": 1.4870917429056965, + "grad_norm": 0.33492469454380847, + "learning_rate": 5.94185510517198e-05, + "loss": 2.6754, + "step": 31941 + }, + { + "epoch": 1.4871383010917896, + "grad_norm": 0.33284401483933546, + "learning_rate": 5.941589079331895e-05, + "loss": 2.6315, + "step": 31942 + }, + { + "epoch": 1.4871848592778825, + "grad_norm": 0.32128440947225734, + "learning_rate": 5.9413230507283415e-05, + "loss": 2.7723, + "step": 31943 + }, + { + "epoch": 1.4872314174639756, + "grad_norm": 0.32288159589398063, + "learning_rate": 5.941057019362095e-05, + "loss": 2.7252, + "step": 31944 + }, + { + "epoch": 1.4872779756500687, + "grad_norm": 0.3571954693613021, + "learning_rate": 5.940790985233941e-05, + "loss": 2.7553, + "step": 31945 + }, + { + "epoch": 1.4873245338361618, + "grad_norm": 0.3097687165634667, + "learning_rate": 5.940524948344657e-05, + "loss": 2.5903, + "step": 31946 + }, + { + "epoch": 1.4873710920222547, + "grad_norm": 0.35597524107562806, + "learning_rate": 5.9402589086950245e-05, + "loss": 2.7517, + "step": 31947 + }, + { + "epoch": 1.4874176502083478, + "grad_norm": 0.3373469356711097, + "learning_rate": 5.9399928662858254e-05, + "loss": 2.6806, + "step": 31948 + }, + { + "epoch": 1.487464208394441, + "grad_norm": 0.350945531513656, + "learning_rate": 5.939726821117839e-05, + "loss": 2.75, + "step": 31949 + }, + { + "epoch": 1.487510766580534, + "grad_norm": 0.35138103355328965, + "learning_rate": 5.939460773191848e-05, + "loss": 2.7134, + "step": 31950 + }, + { + "epoch": 1.4875573247666272, + "grad_norm": 0.34772713972434266, + "learning_rate": 5.939194722508632e-05, + "loss": 2.5791, + "step": 31951 + }, + { + "epoch": 1.4876038829527203, + "grad_norm": 0.32161460821995536, + "learning_rate": 5.938928669068972e-05, + "loss": 2.7475, + "step": 31952 + }, + { + "epoch": 1.4876504411388132, + "grad_norm": 0.3649665599961778, + "learning_rate": 5.938662612873649e-05, + "loss": 2.6896, + "step": 31953 + }, + { + "epoch": 1.4876969993249063, + "grad_norm": 0.32699058540548004, + "learning_rate": 5.938396553923442e-05, + "loss": 2.7789, + "step": 31954 + }, + { + "epoch": 1.4877435575109994, + "grad_norm": 0.35074300396196734, + "learning_rate": 5.938130492219135e-05, + "loss": 2.7716, + "step": 31955 + }, + { + "epoch": 1.4877901156970925, + "grad_norm": 0.3354081293631261, + "learning_rate": 5.9378644277615084e-05, + "loss": 2.6856, + "step": 31956 + }, + { + "epoch": 1.4878366738831854, + "grad_norm": 0.29953860449537206, + "learning_rate": 5.937598360551341e-05, + "loss": 2.7182, + "step": 31957 + }, + { + "epoch": 1.4878832320692785, + "grad_norm": 0.34909464892949016, + "learning_rate": 5.937332290589416e-05, + "loss": 2.7537, + "step": 31958 + }, + { + "epoch": 1.4879297902553716, + "grad_norm": 
0.34618492469702794, + "learning_rate": 5.937066217876511e-05, + "loss": 2.7269, + "step": 31959 + }, + { + "epoch": 1.4879763484414648, + "grad_norm": 0.3188806893948159, + "learning_rate": 5.9368001424134104e-05, + "loss": 2.6911, + "step": 31960 + }, + { + "epoch": 1.4880229066275579, + "grad_norm": 0.34872074684788734, + "learning_rate": 5.9365340642008915e-05, + "loss": 2.6295, + "step": 31961 + }, + { + "epoch": 1.488069464813651, + "grad_norm": 0.32161848854381075, + "learning_rate": 5.9362679832397406e-05, + "loss": 2.7134, + "step": 31962 + }, + { + "epoch": 1.4881160229997439, + "grad_norm": 0.3363324415331602, + "learning_rate": 5.936001899530733e-05, + "loss": 2.8283, + "step": 31963 + }, + { + "epoch": 1.488162581185837, + "grad_norm": 0.35753417130853293, + "learning_rate": 5.935735813074653e-05, + "loss": 2.8167, + "step": 31964 + }, + { + "epoch": 1.4882091393719301, + "grad_norm": 0.349253367710327, + "learning_rate": 5.93546972387228e-05, + "loss": 2.6174, + "step": 31965 + }, + { + "epoch": 1.488255697558023, + "grad_norm": 0.32050960875101314, + "learning_rate": 5.9352036319243955e-05, + "loss": 2.6694, + "step": 31966 + }, + { + "epoch": 1.4883022557441161, + "grad_norm": 0.34298317513740134, + "learning_rate": 5.9349375372317814e-05, + "loss": 2.7279, + "step": 31967 + }, + { + "epoch": 1.4883488139302092, + "grad_norm": 0.3309340436653115, + "learning_rate": 5.9346714397952176e-05, + "loss": 2.7144, + "step": 31968 + }, + { + "epoch": 1.4883953721163024, + "grad_norm": 0.35261218544265277, + "learning_rate": 5.9344053396154854e-05, + "loss": 2.7763, + "step": 31969 + }, + { + "epoch": 1.4884419303023955, + "grad_norm": 0.3442841845848651, + "learning_rate": 5.9341392366933655e-05, + "loss": 2.6647, + "step": 31970 + }, + { + "epoch": 1.4884884884884886, + "grad_norm": 0.31407810598155544, + "learning_rate": 5.933873131029639e-05, + "loss": 2.7638, + "step": 31971 + }, + { + "epoch": 1.4885350466745817, + "grad_norm": 0.34394519953039604, + "learning_rate": 5.933607022625086e-05, + "loss": 2.7736, + "step": 31972 + }, + { + "epoch": 1.4885816048606746, + "grad_norm": 0.3237795428708393, + "learning_rate": 5.933340911480488e-05, + "loss": 2.6975, + "step": 31973 + }, + { + "epoch": 1.4886281630467677, + "grad_norm": 0.33537957437800353, + "learning_rate": 5.933074797596627e-05, + "loss": 2.772, + "step": 31974 + }, + { + "epoch": 1.4886747212328608, + "grad_norm": 0.31901281935602593, + "learning_rate": 5.932808680974283e-05, + "loss": 2.7129, + "step": 31975 + }, + { + "epoch": 1.4887212794189537, + "grad_norm": 0.34265969501103255, + "learning_rate": 5.932542561614239e-05, + "loss": 2.8118, + "step": 31976 + }, + { + "epoch": 1.4887678376050468, + "grad_norm": 0.33482132565695166, + "learning_rate": 5.9322764395172734e-05, + "loss": 2.7478, + "step": 31977 + }, + { + "epoch": 1.48881439579114, + "grad_norm": 0.3402699430528006, + "learning_rate": 5.9320103146841675e-05, + "loss": 2.7757, + "step": 31978 + }, + { + "epoch": 1.488860953977233, + "grad_norm": 0.342709158525426, + "learning_rate": 5.931744187115704e-05, + "loss": 2.7145, + "step": 31979 + }, + { + "epoch": 1.4889075121633262, + "grad_norm": 0.3425813936186421, + "learning_rate": 5.9314780568126636e-05, + "loss": 2.7828, + "step": 31980 + }, + { + "epoch": 1.4889540703494193, + "grad_norm": 0.34777596394213217, + "learning_rate": 5.931211923775827e-05, + "loss": 2.7658, + "step": 31981 + }, + { + "epoch": 1.4890006285355122, + "grad_norm": 0.35663303791933343, + "learning_rate": 5.930945788005975e-05, + 
"loss": 2.6828, + "step": 31982 + }, + { + "epoch": 1.4890471867216053, + "grad_norm": 0.3516326081190645, + "learning_rate": 5.9306796495038874e-05, + "loss": 2.8595, + "step": 31983 + }, + { + "epoch": 1.4890937449076984, + "grad_norm": 0.37178188545246005, + "learning_rate": 5.930413508270347e-05, + "loss": 2.7636, + "step": 31984 + }, + { + "epoch": 1.4891403030937915, + "grad_norm": 0.3536333078432513, + "learning_rate": 5.930147364306137e-05, + "loss": 2.699, + "step": 31985 + }, + { + "epoch": 1.4891868612798844, + "grad_norm": 0.3375976796862036, + "learning_rate": 5.929881217612033e-05, + "loss": 2.7755, + "step": 31986 + }, + { + "epoch": 1.4892334194659775, + "grad_norm": 0.355358809804424, + "learning_rate": 5.929615068188821e-05, + "loss": 2.7405, + "step": 31987 + }, + { + "epoch": 1.4892799776520707, + "grad_norm": 0.3469714323586505, + "learning_rate": 5.9293489160372794e-05, + "loss": 2.8584, + "step": 31988 + }, + { + "epoch": 1.4893265358381638, + "grad_norm": 0.3463975814370587, + "learning_rate": 5.9290827611581924e-05, + "loss": 2.6988, + "step": 31989 + }, + { + "epoch": 1.4893730940242569, + "grad_norm": 0.3126533845113684, + "learning_rate": 5.928816603552336e-05, + "loss": 2.6937, + "step": 31990 + }, + { + "epoch": 1.48941965221035, + "grad_norm": 0.3507855621270895, + "learning_rate": 5.928550443220496e-05, + "loss": 2.7254, + "step": 31991 + }, + { + "epoch": 1.489466210396443, + "grad_norm": 0.32817597142236027, + "learning_rate": 5.928284280163452e-05, + "loss": 2.761, + "step": 31992 + }, + { + "epoch": 1.489512768582536, + "grad_norm": 0.3380451016447675, + "learning_rate": 5.9280181143819835e-05, + "loss": 2.8603, + "step": 31993 + }, + { + "epoch": 1.4895593267686291, + "grad_norm": 0.34542394418030314, + "learning_rate": 5.927751945876876e-05, + "loss": 2.6944, + "step": 31994 + }, + { + "epoch": 1.4896058849547222, + "grad_norm": 0.3345093669460899, + "learning_rate": 5.927485774648908e-05, + "loss": 2.7524, + "step": 31995 + }, + { + "epoch": 1.4896524431408151, + "grad_norm": 0.33214205941353336, + "learning_rate": 5.927219600698858e-05, + "loss": 2.777, + "step": 31996 + }, + { + "epoch": 1.4896990013269082, + "grad_norm": 0.33333926803789954, + "learning_rate": 5.9269534240275104e-05, + "loss": 2.7656, + "step": 31997 + }, + { + "epoch": 1.4897455595130014, + "grad_norm": 0.3364966927751972, + "learning_rate": 5.926687244635647e-05, + "loss": 2.7439, + "step": 31998 + }, + { + "epoch": 1.4897921176990945, + "grad_norm": 0.3587062247445395, + "learning_rate": 5.926421062524048e-05, + "loss": 2.7832, + "step": 31999 + }, + { + "epoch": 1.4898386758851876, + "grad_norm": 0.36334270506990957, + "learning_rate": 5.9261548776934926e-05, + "loss": 2.7511, + "step": 32000 + }, + { + "epoch": 1.4898852340712807, + "grad_norm": 0.35295746298026853, + "learning_rate": 5.925888690144765e-05, + "loss": 2.8603, + "step": 32001 + }, + { + "epoch": 1.4899317922573736, + "grad_norm": 0.31127176753727254, + "learning_rate": 5.9256224998786444e-05, + "loss": 2.7365, + "step": 32002 + }, + { + "epoch": 1.4899783504434667, + "grad_norm": 0.3378658105716432, + "learning_rate": 5.925356306895915e-05, + "loss": 2.7047, + "step": 32003 + }, + { + "epoch": 1.4900249086295598, + "grad_norm": 0.31768603851528876, + "learning_rate": 5.925090111197354e-05, + "loss": 2.7929, + "step": 32004 + }, + { + "epoch": 1.490071466815653, + "grad_norm": 0.32575182655454515, + "learning_rate": 5.924823912783746e-05, + "loss": 2.5215, + "step": 32005 + }, + { + "epoch": 1.4901180250017458, + 
"grad_norm": 0.31428553375717727, + "learning_rate": 5.924557711655869e-05, + "loss": 2.6834, + "step": 32006 + }, + { + "epoch": 1.490164583187839, + "grad_norm": 0.3569955762837739, + "learning_rate": 5.924291507814509e-05, + "loss": 2.6966, + "step": 32007 + }, + { + "epoch": 1.490211141373932, + "grad_norm": 0.37321628849829247, + "learning_rate": 5.924025301260444e-05, + "loss": 2.6983, + "step": 32008 + }, + { + "epoch": 1.4902576995600252, + "grad_norm": 0.3382763820712091, + "learning_rate": 5.923759091994454e-05, + "loss": 2.6336, + "step": 32009 + }, + { + "epoch": 1.4903042577461183, + "grad_norm": 0.36708778500608713, + "learning_rate": 5.923492880017323e-05, + "loss": 2.8194, + "step": 32010 + }, + { + "epoch": 1.4903508159322114, + "grad_norm": 0.3708799038426401, + "learning_rate": 5.923226665329831e-05, + "loss": 2.6698, + "step": 32011 + }, + { + "epoch": 1.4903973741183043, + "grad_norm": 0.3502104202530254, + "learning_rate": 5.9229604479327614e-05, + "loss": 2.8138, + "step": 32012 + }, + { + "epoch": 1.4904439323043974, + "grad_norm": 0.3236563361529239, + "learning_rate": 5.922694227826891e-05, + "loss": 2.6573, + "step": 32013 + }, + { + "epoch": 1.4904904904904905, + "grad_norm": 0.3762622039528113, + "learning_rate": 5.922428005013005e-05, + "loss": 2.7451, + "step": 32014 + }, + { + "epoch": 1.4905370486765834, + "grad_norm": 0.3234772262110667, + "learning_rate": 5.922161779491885e-05, + "loss": 2.7628, + "step": 32015 + }, + { + "epoch": 1.4905836068626765, + "grad_norm": 0.36515852581901986, + "learning_rate": 5.9218955512643103e-05, + "loss": 2.7017, + "step": 32016 + }, + { + "epoch": 1.4906301650487697, + "grad_norm": 0.3388306386730946, + "learning_rate": 5.9216293203310627e-05, + "loss": 2.803, + "step": 32017 + }, + { + "epoch": 1.4906767232348628, + "grad_norm": 0.38065878922873203, + "learning_rate": 5.921363086692924e-05, + "loss": 2.7111, + "step": 32018 + }, + { + "epoch": 1.490723281420956, + "grad_norm": 0.33169766993607813, + "learning_rate": 5.921096850350675e-05, + "loss": 2.6856, + "step": 32019 + }, + { + "epoch": 1.490769839607049, + "grad_norm": 0.32756428770545576, + "learning_rate": 5.9208306113050984e-05, + "loss": 2.6987, + "step": 32020 + }, + { + "epoch": 1.490816397793142, + "grad_norm": 0.37274482940439935, + "learning_rate": 5.920564369556975e-05, + "loss": 2.7218, + "step": 32021 + }, + { + "epoch": 1.490862955979235, + "grad_norm": 0.3366878997154056, + "learning_rate": 5.920298125107084e-05, + "loss": 2.7313, + "step": 32022 + }, + { + "epoch": 1.4909095141653281, + "grad_norm": 0.35189297348449083, + "learning_rate": 5.920031877956209e-05, + "loss": 2.7466, + "step": 32023 + }, + { + "epoch": 1.4909560723514212, + "grad_norm": 0.3533553675771304, + "learning_rate": 5.9197656281051316e-05, + "loss": 2.7094, + "step": 32024 + }, + { + "epoch": 1.4910026305375141, + "grad_norm": 0.34749538054054824, + "learning_rate": 5.9194993755546334e-05, + "loss": 2.7186, + "step": 32025 + }, + { + "epoch": 1.4910491887236073, + "grad_norm": 0.34730258760244315, + "learning_rate": 5.919233120305494e-05, + "loss": 2.7305, + "step": 32026 + }, + { + "epoch": 1.4910957469097004, + "grad_norm": 0.34689917361428885, + "learning_rate": 5.918966862358496e-05, + "loss": 2.7592, + "step": 32027 + }, + { + "epoch": 1.4911423050957935, + "grad_norm": 0.34719680235766104, + "learning_rate": 5.918700601714421e-05, + "loss": 2.685, + "step": 32028 + }, + { + "epoch": 1.4911888632818866, + "grad_norm": 0.33985835107186946, + "learning_rate": 
5.9184343383740504e-05, + "loss": 2.8461, + "step": 32029 + }, + { + "epoch": 1.4912354214679797, + "grad_norm": 0.3724747385524471, + "learning_rate": 5.918168072338166e-05, + "loss": 2.7661, + "step": 32030 + }, + { + "epoch": 1.4912819796540726, + "grad_norm": 0.3302509786540392, + "learning_rate": 5.9179018036075485e-05, + "loss": 2.7887, + "step": 32031 + }, + { + "epoch": 1.4913285378401657, + "grad_norm": 0.3470929057138632, + "learning_rate": 5.917635532182979e-05, + "loss": 2.7529, + "step": 32032 + }, + { + "epoch": 1.4913750960262588, + "grad_norm": 0.3326043986512569, + "learning_rate": 5.9173692580652406e-05, + "loss": 2.7913, + "step": 32033 + }, + { + "epoch": 1.491421654212352, + "grad_norm": 0.354692015133671, + "learning_rate": 5.9171029812551136e-05, + "loss": 2.7062, + "step": 32034 + }, + { + "epoch": 1.4914682123984448, + "grad_norm": 0.34098816217607286, + "learning_rate": 5.9168367017533796e-05, + "loss": 2.6725, + "step": 32035 + }, + { + "epoch": 1.491514770584538, + "grad_norm": 0.34852841264586587, + "learning_rate": 5.91657041956082e-05, + "loss": 2.7553, + "step": 32036 + }, + { + "epoch": 1.491561328770631, + "grad_norm": 0.3464365729023993, + "learning_rate": 5.9163041346782154e-05, + "loss": 2.8428, + "step": 32037 + }, + { + "epoch": 1.4916078869567242, + "grad_norm": 0.34610272354310667, + "learning_rate": 5.916037847106349e-05, + "loss": 2.633, + "step": 32038 + }, + { + "epoch": 1.4916544451428173, + "grad_norm": 0.35611747927652393, + "learning_rate": 5.915771556846004e-05, + "loss": 2.9311, + "step": 32039 + }, + { + "epoch": 1.4917010033289104, + "grad_norm": 0.3501905993193789, + "learning_rate": 5.915505263897958e-05, + "loss": 2.7963, + "step": 32040 + }, + { + "epoch": 1.4917475615150033, + "grad_norm": 0.32602142323841904, + "learning_rate": 5.915238968262993e-05, + "loss": 2.7715, + "step": 32041 + }, + { + "epoch": 1.4917941197010964, + "grad_norm": 0.328908994307833, + "learning_rate": 5.914972669941893e-05, + "loss": 2.7344, + "step": 32042 + }, + { + "epoch": 1.4918406778871895, + "grad_norm": 0.3649435733082028, + "learning_rate": 5.914706368935439e-05, + "loss": 2.7353, + "step": 32043 + }, + { + "epoch": 1.4918872360732827, + "grad_norm": 0.3380664773567707, + "learning_rate": 5.914440065244411e-05, + "loss": 2.7931, + "step": 32044 + }, + { + "epoch": 1.4919337942593756, + "grad_norm": 0.3406548740795411, + "learning_rate": 5.914173758869592e-05, + "loss": 2.8027, + "step": 32045 + }, + { + "epoch": 1.4919803524454687, + "grad_norm": 0.340095858118444, + "learning_rate": 5.913907449811763e-05, + "loss": 2.7021, + "step": 32046 + }, + { + "epoch": 1.4920269106315618, + "grad_norm": 0.3375276631349095, + "learning_rate": 5.913641138071706e-05, + "loss": 2.6916, + "step": 32047 + }, + { + "epoch": 1.492073468817655, + "grad_norm": 0.3469315497196033, + "learning_rate": 5.913374823650202e-05, + "loss": 2.8335, + "step": 32048 + }, + { + "epoch": 1.492120027003748, + "grad_norm": 0.355316272372739, + "learning_rate": 5.913108506548033e-05, + "loss": 2.6554, + "step": 32049 + }, + { + "epoch": 1.4921665851898411, + "grad_norm": 0.34786023719452636, + "learning_rate": 5.912842186765979e-05, + "loss": 2.7248, + "step": 32050 + }, + { + "epoch": 1.492213143375934, + "grad_norm": 0.32713007505776803, + "learning_rate": 5.9125758643048244e-05, + "loss": 2.7163, + "step": 32051 + }, + { + "epoch": 1.4922597015620271, + "grad_norm": 0.35818362886210714, + "learning_rate": 5.91230953916535e-05, + "loss": 2.7981, + "step": 32052 + }, + { + "epoch": 
1.4923062597481203, + "grad_norm": 0.3268251279202909, + "learning_rate": 5.912043211348336e-05, + "loss": 2.6666, + "step": 32053 + }, + { + "epoch": 1.4923528179342131, + "grad_norm": 0.33816327032683363, + "learning_rate": 5.911776880854565e-05, + "loss": 2.7306, + "step": 32054 + }, + { + "epoch": 1.4923993761203063, + "grad_norm": 0.33991276383888597, + "learning_rate": 5.911510547684819e-05, + "loss": 2.7083, + "step": 32055 + }, + { + "epoch": 1.4924459343063994, + "grad_norm": 0.30305553443553557, + "learning_rate": 5.911244211839879e-05, + "loss": 2.7584, + "step": 32056 + }, + { + "epoch": 1.4924924924924925, + "grad_norm": 0.348392554859105, + "learning_rate": 5.910977873320529e-05, + "loss": 2.7665, + "step": 32057 + }, + { + "epoch": 1.4925390506785856, + "grad_norm": 0.3126382181209864, + "learning_rate": 5.9107115321275466e-05, + "loss": 2.7962, + "step": 32058 + }, + { + "epoch": 1.4925856088646787, + "grad_norm": 0.351833448757663, + "learning_rate": 5.910445188261716e-05, + "loss": 2.7417, + "step": 32059 + }, + { + "epoch": 1.4926321670507716, + "grad_norm": 0.34249312792160963, + "learning_rate": 5.910178841723819e-05, + "loss": 2.683, + "step": 32060 + }, + { + "epoch": 1.4926787252368647, + "grad_norm": 0.35165466107634696, + "learning_rate": 5.909912492514638e-05, + "loss": 2.846, + "step": 32061 + }, + { + "epoch": 1.4927252834229578, + "grad_norm": 0.3645339305597344, + "learning_rate": 5.90964614063495e-05, + "loss": 2.7854, + "step": 32062 + }, + { + "epoch": 1.492771841609051, + "grad_norm": 0.3573803805682951, + "learning_rate": 5.9093797860855426e-05, + "loss": 2.7251, + "step": 32063 + }, + { + "epoch": 1.4928183997951439, + "grad_norm": 0.3530951059014473, + "learning_rate": 5.9091134288671946e-05, + "loss": 2.7151, + "step": 32064 + }, + { + "epoch": 1.492864957981237, + "grad_norm": 0.3609916692928956, + "learning_rate": 5.9088470689806874e-05, + "loss": 2.7722, + "step": 32065 + }, + { + "epoch": 1.49291151616733, + "grad_norm": 0.31040524792652663, + "learning_rate": 5.908580706426805e-05, + "loss": 2.69, + "step": 32066 + }, + { + "epoch": 1.4929580743534232, + "grad_norm": 0.3527666072854314, + "learning_rate": 5.908314341206327e-05, + "loss": 2.7009, + "step": 32067 + }, + { + "epoch": 1.4930046325395163, + "grad_norm": 0.3463593352310209, + "learning_rate": 5.908047973320035e-05, + "loss": 2.6854, + "step": 32068 + }, + { + "epoch": 1.4930511907256094, + "grad_norm": 0.34234025968243736, + "learning_rate": 5.9077816027687124e-05, + "loss": 2.7148, + "step": 32069 + }, + { + "epoch": 1.4930977489117023, + "grad_norm": 0.3448567567014196, + "learning_rate": 5.9075152295531423e-05, + "loss": 2.687, + "step": 32070 + }, + { + "epoch": 1.4931443070977954, + "grad_norm": 0.33474000419184546, + "learning_rate": 5.907248853674102e-05, + "loss": 2.6542, + "step": 32071 + }, + { + "epoch": 1.4931908652838886, + "grad_norm": 0.3478865931686223, + "learning_rate": 5.906982475132377e-05, + "loss": 2.6078, + "step": 32072 + }, + { + "epoch": 1.4932374234699817, + "grad_norm": 0.3333283131476858, + "learning_rate": 5.9067160939287466e-05, + "loss": 2.8475, + "step": 32073 + }, + { + "epoch": 1.4932839816560746, + "grad_norm": 0.34055347012104015, + "learning_rate": 5.906449710063994e-05, + "loss": 2.6938, + "step": 32074 + }, + { + "epoch": 1.4933305398421677, + "grad_norm": 0.3192682716850813, + "learning_rate": 5.906183323538902e-05, + "loss": 2.7921, + "step": 32075 + }, + { + "epoch": 1.4933770980282608, + "grad_norm": 0.33724167100204033, + "learning_rate": 
5.9059169343542496e-05, + "loss": 2.6647, + "step": 32076 + }, + { + "epoch": 1.493423656214354, + "grad_norm": 0.31303132695876684, + "learning_rate": 5.905650542510821e-05, + "loss": 2.7345, + "step": 32077 + }, + { + "epoch": 1.493470214400447, + "grad_norm": 0.3506970765423488, + "learning_rate": 5.9053841480093964e-05, + "loss": 2.7048, + "step": 32078 + }, + { + "epoch": 1.4935167725865401, + "grad_norm": 0.32781077024351596, + "learning_rate": 5.90511775085076e-05, + "loss": 2.7262, + "step": 32079 + }, + { + "epoch": 1.493563330772633, + "grad_norm": 0.3310648729111194, + "learning_rate": 5.904851351035692e-05, + "loss": 2.6885, + "step": 32080 + }, + { + "epoch": 1.4936098889587262, + "grad_norm": 0.37026008292084095, + "learning_rate": 5.904584948564973e-05, + "loss": 2.6717, + "step": 32081 + }, + { + "epoch": 1.4936564471448193, + "grad_norm": 0.3661429833315386, + "learning_rate": 5.9043185434393864e-05, + "loss": 2.7116, + "step": 32082 + }, + { + "epoch": 1.4937030053309124, + "grad_norm": 0.3192534203193486, + "learning_rate": 5.904052135659717e-05, + "loss": 2.7661, + "step": 32083 + }, + { + "epoch": 1.4937495635170053, + "grad_norm": 0.3471734248111816, + "learning_rate": 5.9037857252267406e-05, + "loss": 2.7779, + "step": 32084 + }, + { + "epoch": 1.4937961217030984, + "grad_norm": 0.33650566506310137, + "learning_rate": 5.9035193121412437e-05, + "loss": 2.6999, + "step": 32085 + }, + { + "epoch": 1.4938426798891915, + "grad_norm": 0.31903167629057666, + "learning_rate": 5.903252896404006e-05, + "loss": 2.7091, + "step": 32086 + }, + { + "epoch": 1.4938892380752846, + "grad_norm": 0.3826827586025655, + "learning_rate": 5.9029864780158094e-05, + "loss": 2.8198, + "step": 32087 + }, + { + "epoch": 1.4939357962613777, + "grad_norm": 0.34068999816652124, + "learning_rate": 5.9027200569774374e-05, + "loss": 2.8437, + "step": 32088 + }, + { + "epoch": 1.4939823544474709, + "grad_norm": 0.3868380686549936, + "learning_rate": 5.902453633289671e-05, + "loss": 2.6904, + "step": 32089 + }, + { + "epoch": 1.4940289126335637, + "grad_norm": 0.3606481813004007, + "learning_rate": 5.90218720695329e-05, + "loss": 2.715, + "step": 32090 + }, + { + "epoch": 1.4940754708196569, + "grad_norm": 0.34389709437674254, + "learning_rate": 5.90192077796908e-05, + "loss": 2.7135, + "step": 32091 + }, + { + "epoch": 1.49412202900575, + "grad_norm": 0.3397908215537564, + "learning_rate": 5.9016543463378214e-05, + "loss": 2.6044, + "step": 32092 + }, + { + "epoch": 1.4941685871918429, + "grad_norm": 0.3597765302735947, + "learning_rate": 5.901387912060297e-05, + "loss": 2.7891, + "step": 32093 + }, + { + "epoch": 1.494215145377936, + "grad_norm": 0.3579724889505021, + "learning_rate": 5.901121475137287e-05, + "loss": 2.6742, + "step": 32094 + }, + { + "epoch": 1.494261703564029, + "grad_norm": 0.3423222614789506, + "learning_rate": 5.900855035569574e-05, + "loss": 2.7777, + "step": 32095 + }, + { + "epoch": 1.4943082617501222, + "grad_norm": 0.3577341789061069, + "learning_rate": 5.900588593357941e-05, + "loss": 2.7005, + "step": 32096 + }, + { + "epoch": 1.4943548199362153, + "grad_norm": 0.33128555243408037, + "learning_rate": 5.900322148503168e-05, + "loss": 2.6613, + "step": 32097 + }, + { + "epoch": 1.4944013781223084, + "grad_norm": 0.3490146770275145, + "learning_rate": 5.9000557010060395e-05, + "loss": 2.774, + "step": 32098 + }, + { + "epoch": 1.4944479363084016, + "grad_norm": 0.34300356575038804, + "learning_rate": 5.8997892508673356e-05, + "loss": 2.8005, + "step": 32099 + }, + { + 
"epoch": 1.4944944944944945, + "grad_norm": 0.3294559468333187, + "learning_rate": 5.8995227980878384e-05, + "loss": 2.7155, + "step": 32100 + }, + { + "epoch": 1.4945410526805876, + "grad_norm": 0.34864526835602455, + "learning_rate": 5.899256342668331e-05, + "loss": 2.745, + "step": 32101 + }, + { + "epoch": 1.4945876108666807, + "grad_norm": 0.3291332408670312, + "learning_rate": 5.8989898846095956e-05, + "loss": 2.7343, + "step": 32102 + }, + { + "epoch": 1.4946341690527736, + "grad_norm": 0.3496914188682138, + "learning_rate": 5.8987234239124126e-05, + "loss": 2.8265, + "step": 32103 + }, + { + "epoch": 1.4946807272388667, + "grad_norm": 0.34965740609457097, + "learning_rate": 5.898456960577564e-05, + "loss": 2.7623, + "step": 32104 + }, + { + "epoch": 1.4947272854249598, + "grad_norm": 0.3485758132431256, + "learning_rate": 5.898190494605833e-05, + "loss": 2.8382, + "step": 32105 + }, + { + "epoch": 1.494773843611053, + "grad_norm": 0.35740347602107786, + "learning_rate": 5.897924025998003e-05, + "loss": 2.7933, + "step": 32106 + }, + { + "epoch": 1.494820401797146, + "grad_norm": 0.3227012719510706, + "learning_rate": 5.897657554754853e-05, + "loss": 2.7245, + "step": 32107 + }, + { + "epoch": 1.4948669599832392, + "grad_norm": 0.3696533306486873, + "learning_rate": 5.897391080877167e-05, + "loss": 2.7147, + "step": 32108 + }, + { + "epoch": 1.494913518169332, + "grad_norm": 0.36290305173165205, + "learning_rate": 5.897124604365727e-05, + "loss": 2.7045, + "step": 32109 + }, + { + "epoch": 1.4949600763554252, + "grad_norm": 0.34665177376943856, + "learning_rate": 5.896858125221314e-05, + "loss": 2.7011, + "step": 32110 + }, + { + "epoch": 1.4950066345415183, + "grad_norm": 0.35353958409791963, + "learning_rate": 5.896591643444712e-05, + "loss": 2.6721, + "step": 32111 + }, + { + "epoch": 1.4950531927276114, + "grad_norm": 0.3654279027644535, + "learning_rate": 5.8963251590367e-05, + "loss": 2.7273, + "step": 32112 + }, + { + "epoch": 1.4950997509137043, + "grad_norm": 0.37479597129181724, + "learning_rate": 5.896058671998063e-05, + "loss": 2.7974, + "step": 32113 + }, + { + "epoch": 1.4951463090997974, + "grad_norm": 0.36805895881476675, + "learning_rate": 5.8957921823295815e-05, + "loss": 2.7797, + "step": 32114 + }, + { + "epoch": 1.4951928672858905, + "grad_norm": 0.38719298412709574, + "learning_rate": 5.895525690032039e-05, + "loss": 2.7836, + "step": 32115 + }, + { + "epoch": 1.4952394254719836, + "grad_norm": 0.42426872011363925, + "learning_rate": 5.895259195106216e-05, + "loss": 2.7038, + "step": 32116 + }, + { + "epoch": 1.4952859836580767, + "grad_norm": 0.3346557701219372, + "learning_rate": 5.894992697552896e-05, + "loss": 2.7519, + "step": 32117 + }, + { + "epoch": 1.4953325418441699, + "grad_norm": 0.4308433289866079, + "learning_rate": 5.8947261973728595e-05, + "loss": 2.777, + "step": 32118 + }, + { + "epoch": 1.4953791000302628, + "grad_norm": 0.33757869841307736, + "learning_rate": 5.8944596945668904e-05, + "loss": 2.7275, + "step": 32119 + }, + { + "epoch": 1.4954256582163559, + "grad_norm": 0.3734455409084003, + "learning_rate": 5.894193189135772e-05, + "loss": 2.785, + "step": 32120 + }, + { + "epoch": 1.495472216402449, + "grad_norm": 0.3373130792156663, + "learning_rate": 5.893926681080282e-05, + "loss": 2.847, + "step": 32121 + }, + { + "epoch": 1.495518774588542, + "grad_norm": 0.3623273467820175, + "learning_rate": 5.8936601704012075e-05, + "loss": 2.6455, + "step": 32122 + }, + { + "epoch": 1.495565332774635, + "grad_norm": 0.31868853055488366, + 
"learning_rate": 5.8933936570993265e-05, + "loss": 2.6826, + "step": 32123 + }, + { + "epoch": 1.495611890960728, + "grad_norm": 0.34524349458879755, + "learning_rate": 5.893127141175425e-05, + "loss": 2.7533, + "step": 32124 + }, + { + "epoch": 1.4956584491468212, + "grad_norm": 0.3476228482027381, + "learning_rate": 5.8928606226302816e-05, + "loss": 2.636, + "step": 32125 + }, + { + "epoch": 1.4957050073329143, + "grad_norm": 0.37056952261915566, + "learning_rate": 5.8925941014646816e-05, + "loss": 2.7913, + "step": 32126 + }, + { + "epoch": 1.4957515655190075, + "grad_norm": 0.3252841278163389, + "learning_rate": 5.892327577679404e-05, + "loss": 2.7292, + "step": 32127 + }, + { + "epoch": 1.4957981237051006, + "grad_norm": 0.3836347134189261, + "learning_rate": 5.892061051275234e-05, + "loss": 2.7738, + "step": 32128 + }, + { + "epoch": 1.4958446818911935, + "grad_norm": 0.32508389735390225, + "learning_rate": 5.891794522252954e-05, + "loss": 2.7406, + "step": 32129 + }, + { + "epoch": 1.4958912400772866, + "grad_norm": 0.3477683143834565, + "learning_rate": 5.8915279906133435e-05, + "loss": 2.7372, + "step": 32130 + }, + { + "epoch": 1.4959377982633797, + "grad_norm": 0.355205969433022, + "learning_rate": 5.8912614563571854e-05, + "loss": 2.7906, + "step": 32131 + }, + { + "epoch": 1.4959843564494728, + "grad_norm": 0.4043880135868509, + "learning_rate": 5.890994919485264e-05, + "loss": 2.66, + "step": 32132 + }, + { + "epoch": 1.4960309146355657, + "grad_norm": 0.3470995888099858, + "learning_rate": 5.8907283799983605e-05, + "loss": 2.6832, + "step": 32133 + }, + { + "epoch": 1.4960774728216588, + "grad_norm": 0.39830981117107866, + "learning_rate": 5.8904618378972564e-05, + "loss": 2.7732, + "step": 32134 + }, + { + "epoch": 1.496124031007752, + "grad_norm": 0.36600072912893206, + "learning_rate": 5.890195293182735e-05, + "loss": 2.6388, + "step": 32135 + }, + { + "epoch": 1.496170589193845, + "grad_norm": 0.36837571390629437, + "learning_rate": 5.889928745855577e-05, + "loss": 2.6961, + "step": 32136 + }, + { + "epoch": 1.4962171473799382, + "grad_norm": 0.3426591391370842, + "learning_rate": 5.889662195916567e-05, + "loss": 2.7719, + "step": 32137 + }, + { + "epoch": 1.4962637055660313, + "grad_norm": 0.35440976934840906, + "learning_rate": 5.8893956433664865e-05, + "loss": 2.6967, + "step": 32138 + }, + { + "epoch": 1.4963102637521242, + "grad_norm": 0.374046053508731, + "learning_rate": 5.889129088206116e-05, + "loss": 2.7549, + "step": 32139 + }, + { + "epoch": 1.4963568219382173, + "grad_norm": 0.34746285802923643, + "learning_rate": 5.888862530436239e-05, + "loss": 2.6252, + "step": 32140 + }, + { + "epoch": 1.4964033801243104, + "grad_norm": 0.3712386937677607, + "learning_rate": 5.888595970057639e-05, + "loss": 2.8445, + "step": 32141 + }, + { + "epoch": 1.4964499383104033, + "grad_norm": 0.34549744554852474, + "learning_rate": 5.8883294070710984e-05, + "loss": 2.7297, + "step": 32142 + }, + { + "epoch": 1.4964964964964964, + "grad_norm": 0.3758508849932467, + "learning_rate": 5.8880628414773965e-05, + "loss": 2.7088, + "step": 32143 + }, + { + "epoch": 1.4965430546825895, + "grad_norm": 0.3753294937645646, + "learning_rate": 5.8877962732773174e-05, + "loss": 2.727, + "step": 32144 + }, + { + "epoch": 1.4965896128686826, + "grad_norm": 0.33421117791324007, + "learning_rate": 5.887529702471645e-05, + "loss": 2.6303, + "step": 32145 + }, + { + "epoch": 1.4966361710547758, + "grad_norm": 0.3767355031769954, + "learning_rate": 5.887263129061159e-05, + "loss": 2.6732, + "step": 
32146 + }, + { + "epoch": 1.4966827292408689, + "grad_norm": 0.3379747381519812, + "learning_rate": 5.886996553046645e-05, + "loss": 2.7788, + "step": 32147 + }, + { + "epoch": 1.4967292874269618, + "grad_norm": 0.3495890958255157, + "learning_rate": 5.886729974428884e-05, + "loss": 2.673, + "step": 32148 + }, + { + "epoch": 1.4967758456130549, + "grad_norm": 0.38840053145554915, + "learning_rate": 5.8864633932086564e-05, + "loss": 2.752, + "step": 32149 + }, + { + "epoch": 1.496822403799148, + "grad_norm": 0.376725948505513, + "learning_rate": 5.886196809386746e-05, + "loss": 2.6257, + "step": 32150 + }, + { + "epoch": 1.496868961985241, + "grad_norm": 0.3543961154576412, + "learning_rate": 5.885930222963936e-05, + "loss": 2.7684, + "step": 32151 + }, + { + "epoch": 1.496915520171334, + "grad_norm": 0.33697484108299475, + "learning_rate": 5.885663633941007e-05, + "loss": 2.7489, + "step": 32152 + }, + { + "epoch": 1.4969620783574271, + "grad_norm": 0.3640247986706687, + "learning_rate": 5.885397042318743e-05, + "loss": 2.7475, + "step": 32153 + }, + { + "epoch": 1.4970086365435202, + "grad_norm": 0.37779249728626, + "learning_rate": 5.885130448097925e-05, + "loss": 2.7728, + "step": 32154 + }, + { + "epoch": 1.4970551947296133, + "grad_norm": 0.3217393706320797, + "learning_rate": 5.8848638512793375e-05, + "loss": 2.7638, + "step": 32155 + }, + { + "epoch": 1.4971017529157065, + "grad_norm": 0.36565393782399336, + "learning_rate": 5.8845972518637625e-05, + "loss": 2.8087, + "step": 32156 + }, + { + "epoch": 1.4971483111017996, + "grad_norm": 0.34259442552518865, + "learning_rate": 5.88433064985198e-05, + "loss": 2.8044, + "step": 32157 + }, + { + "epoch": 1.4971948692878925, + "grad_norm": 0.34151280738207634, + "learning_rate": 5.884064045244775e-05, + "loss": 2.6353, + "step": 32158 + }, + { + "epoch": 1.4972414274739856, + "grad_norm": 0.3734887042661194, + "learning_rate": 5.883797438042929e-05, + "loss": 2.6877, + "step": 32159 + }, + { + "epoch": 1.4972879856600787, + "grad_norm": 0.3305361939170342, + "learning_rate": 5.883530828247226e-05, + "loss": 2.8038, + "step": 32160 + }, + { + "epoch": 1.4973345438461718, + "grad_norm": 0.38101121359950346, + "learning_rate": 5.883264215858446e-05, + "loss": 2.804, + "step": 32161 + }, + { + "epoch": 1.4973811020322647, + "grad_norm": 0.31528013500960295, + "learning_rate": 5.8829976008773715e-05, + "loss": 2.7037, + "step": 32162 + }, + { + "epoch": 1.4974276602183578, + "grad_norm": 0.3734429767733107, + "learning_rate": 5.8827309833047874e-05, + "loss": 2.8443, + "step": 32163 + }, + { + "epoch": 1.497474218404451, + "grad_norm": 0.3543326330813946, + "learning_rate": 5.882464363141473e-05, + "loss": 2.7489, + "step": 32164 + }, + { + "epoch": 1.497520776590544, + "grad_norm": 0.3214570857179854, + "learning_rate": 5.8821977403882146e-05, + "loss": 2.7772, + "step": 32165 + }, + { + "epoch": 1.4975673347766372, + "grad_norm": 0.36043345171851154, + "learning_rate": 5.8819311150457925e-05, + "loss": 2.7647, + "step": 32166 + }, + { + "epoch": 1.4976138929627303, + "grad_norm": 0.34496754693948384, + "learning_rate": 5.881664487114988e-05, + "loss": 2.7371, + "step": 32167 + }, + { + "epoch": 1.4976604511488232, + "grad_norm": 0.3450296640578584, + "learning_rate": 5.8813978565965856e-05, + "loss": 2.6529, + "step": 32168 + }, + { + "epoch": 1.4977070093349163, + "grad_norm": 0.36868610792789763, + "learning_rate": 5.8811312234913675e-05, + "loss": 2.7679, + "step": 32169 + }, + { + "epoch": 1.4977535675210094, + "grad_norm": 
0.362781328478024, + "learning_rate": 5.8808645878001164e-05, + "loss": 2.7724, + "step": 32170 + }, + { + "epoch": 1.4978001257071025, + "grad_norm": 0.3555729957978509, + "learning_rate": 5.880597949523615e-05, + "loss": 2.6792, + "step": 32171 + }, + { + "epoch": 1.4978466838931954, + "grad_norm": 0.34659202493036456, + "learning_rate": 5.880331308662644e-05, + "loss": 2.6747, + "step": 32172 + }, + { + "epoch": 1.4978932420792885, + "grad_norm": 0.3320486882215376, + "learning_rate": 5.880064665217989e-05, + "loss": 2.5406, + "step": 32173 + }, + { + "epoch": 1.4979398002653816, + "grad_norm": 0.3756485409395514, + "learning_rate": 5.8797980191904314e-05, + "loss": 2.8068, + "step": 32174 + }, + { + "epoch": 1.4979863584514748, + "grad_norm": 0.355248968973808, + "learning_rate": 5.879531370580752e-05, + "loss": 2.629, + "step": 32175 + }, + { + "epoch": 1.4980329166375679, + "grad_norm": 0.3804151966892523, + "learning_rate": 5.8792647193897346e-05, + "loss": 2.7663, + "step": 32176 + }, + { + "epoch": 1.498079474823661, + "grad_norm": 0.35447182587414716, + "learning_rate": 5.878998065618161e-05, + "loss": 2.6177, + "step": 32177 + }, + { + "epoch": 1.4981260330097539, + "grad_norm": 0.3829961310373156, + "learning_rate": 5.878731409266815e-05, + "loss": 2.7698, + "step": 32178 + }, + { + "epoch": 1.498172591195847, + "grad_norm": 0.333318666768177, + "learning_rate": 5.878464750336481e-05, + "loss": 2.7302, + "step": 32179 + }, + { + "epoch": 1.4982191493819401, + "grad_norm": 0.4154283466683973, + "learning_rate": 5.878198088827938e-05, + "loss": 2.7295, + "step": 32180 + }, + { + "epoch": 1.498265707568033, + "grad_norm": 0.36092524399313297, + "learning_rate": 5.877931424741969e-05, + "loss": 2.6945, + "step": 32181 + }, + { + "epoch": 1.4983122657541261, + "grad_norm": 0.35771962697185833, + "learning_rate": 5.877664758079359e-05, + "loss": 2.6815, + "step": 32182 + }, + { + "epoch": 1.4983588239402192, + "grad_norm": 0.3693901883932829, + "learning_rate": 5.8773980888408896e-05, + "loss": 2.7175, + "step": 32183 + }, + { + "epoch": 1.4984053821263124, + "grad_norm": 0.3204828548030222, + "learning_rate": 5.8771314170273425e-05, + "loss": 2.7768, + "step": 32184 + }, + { + "epoch": 1.4984519403124055, + "grad_norm": 0.3971677237510278, + "learning_rate": 5.876864742639501e-05, + "loss": 2.7783, + "step": 32185 + }, + { + "epoch": 1.4984984984984986, + "grad_norm": 0.33213541537686464, + "learning_rate": 5.876598065678149e-05, + "loss": 2.7371, + "step": 32186 + }, + { + "epoch": 1.4985450566845917, + "grad_norm": 0.35769074234623127, + "learning_rate": 5.8763313861440675e-05, + "loss": 2.7412, + "step": 32187 + }, + { + "epoch": 1.4985916148706846, + "grad_norm": 0.34354128514135623, + "learning_rate": 5.87606470403804e-05, + "loss": 2.8347, + "step": 32188 + }, + { + "epoch": 1.4986381730567777, + "grad_norm": 0.369168988461428, + "learning_rate": 5.875798019360848e-05, + "loss": 2.6246, + "step": 32189 + }, + { + "epoch": 1.4986847312428708, + "grad_norm": 0.34113538499084267, + "learning_rate": 5.8755313321132754e-05, + "loss": 2.6522, + "step": 32190 + }, + { + "epoch": 1.4987312894289637, + "grad_norm": 0.35401606701117466, + "learning_rate": 5.875264642296105e-05, + "loss": 2.7276, + "step": 32191 + }, + { + "epoch": 1.4987778476150568, + "grad_norm": 0.35307698158746237, + "learning_rate": 5.8749979499101195e-05, + "loss": 2.7798, + "step": 32192 + }, + { + "epoch": 1.49882440580115, + "grad_norm": 0.34666881762436824, + "learning_rate": 5.8747312549561006e-05, + "loss": 
2.8226, + "step": 32193 + }, + { + "epoch": 1.498870963987243, + "grad_norm": 0.40125600702568265, + "learning_rate": 5.874464557434831e-05, + "loss": 2.6902, + "step": 32194 + }, + { + "epoch": 1.4989175221733362, + "grad_norm": 0.33089590569906674, + "learning_rate": 5.8741978573470946e-05, + "loss": 2.6087, + "step": 32195 + }, + { + "epoch": 1.4989640803594293, + "grad_norm": 0.36692563750430046, + "learning_rate": 5.873931154693673e-05, + "loss": 2.8184, + "step": 32196 + }, + { + "epoch": 1.4990106385455222, + "grad_norm": 0.34403143868359093, + "learning_rate": 5.873664449475351e-05, + "loss": 2.7811, + "step": 32197 + }, + { + "epoch": 1.4990571967316153, + "grad_norm": 0.38306650617814325, + "learning_rate": 5.87339774169291e-05, + "loss": 2.8195, + "step": 32198 + }, + { + "epoch": 1.4991037549177084, + "grad_norm": 0.3496478056747849, + "learning_rate": 5.873131031347132e-05, + "loss": 2.6718, + "step": 32199 + }, + { + "epoch": 1.4991503131038015, + "grad_norm": 0.3197877952363806, + "learning_rate": 5.8728643184387997e-05, + "loss": 2.7955, + "step": 32200 + }, + { + "epoch": 1.4991968712898944, + "grad_norm": 0.34975308217286566, + "learning_rate": 5.872597602968698e-05, + "loss": 2.8056, + "step": 32201 + }, + { + "epoch": 1.4992434294759875, + "grad_norm": 0.31572254558766355, + "learning_rate": 5.8723308849376067e-05, + "loss": 2.6331, + "step": 32202 + }, + { + "epoch": 1.4992899876620807, + "grad_norm": 0.3509855216269301, + "learning_rate": 5.8720641643463114e-05, + "loss": 2.698, + "step": 32203 + }, + { + "epoch": 1.4993365458481738, + "grad_norm": 0.3272397117271585, + "learning_rate": 5.8717974411955936e-05, + "loss": 2.6577, + "step": 32204 + }, + { + "epoch": 1.499383104034267, + "grad_norm": 0.3419147846263713, + "learning_rate": 5.871530715486235e-05, + "loss": 2.7698, + "step": 32205 + }, + { + "epoch": 1.49942966222036, + "grad_norm": 0.3368685090641928, + "learning_rate": 5.871263987219021e-05, + "loss": 2.7484, + "step": 32206 + }, + { + "epoch": 1.499476220406453, + "grad_norm": 0.3499578859165415, + "learning_rate": 5.870997256394732e-05, + "loss": 2.7868, + "step": 32207 + }, + { + "epoch": 1.499522778592546, + "grad_norm": 0.35053739201263523, + "learning_rate": 5.870730523014153e-05, + "loss": 2.6668, + "step": 32208 + }, + { + "epoch": 1.4995693367786391, + "grad_norm": 0.365554797197386, + "learning_rate": 5.870463787078064e-05, + "loss": 2.7519, + "step": 32209 + }, + { + "epoch": 1.4996158949647322, + "grad_norm": 0.3413474562437681, + "learning_rate": 5.870197048587252e-05, + "loss": 2.805, + "step": 32210 + }, + { + "epoch": 1.4996624531508251, + "grad_norm": 0.3490675470124289, + "learning_rate": 5.869930307542496e-05, + "loss": 2.6525, + "step": 32211 + }, + { + "epoch": 1.4997090113369183, + "grad_norm": 0.3457067925948527, + "learning_rate": 5.8696635639445804e-05, + "loss": 2.816, + "step": 32212 + }, + { + "epoch": 1.4997555695230114, + "grad_norm": 0.34244660747960676, + "learning_rate": 5.869396817794288e-05, + "loss": 2.6913, + "step": 32213 + }, + { + "epoch": 1.4998021277091045, + "grad_norm": 0.3377845574771877, + "learning_rate": 5.8691300690924014e-05, + "loss": 2.5781, + "step": 32214 + }, + { + "epoch": 1.4998486858951976, + "grad_norm": 0.32332662211976354, + "learning_rate": 5.868863317839705e-05, + "loss": 2.7379, + "step": 32215 + }, + { + "epoch": 1.4998952440812907, + "grad_norm": 0.35576711316489806, + "learning_rate": 5.868596564036978e-05, + "loss": 2.8051, + "step": 32216 + }, + { + "epoch": 1.4999418022673836, + 
"grad_norm": 0.33639007645958224, + "learning_rate": 5.868329807685007e-05, + "loss": 2.6946, + "step": 32217 + }, + { + "epoch": 1.4999883604534767, + "grad_norm": 0.35525191896931785, + "learning_rate": 5.868063048784572e-05, + "loss": 2.756, + "step": 32218 + }, + { + "epoch": 1.5000349186395698, + "grad_norm": 0.3247744086735514, + "learning_rate": 5.86779628733646e-05, + "loss": 2.7126, + "step": 32219 + }, + { + "epoch": 1.5000814768256627, + "grad_norm": 0.3959001198572118, + "learning_rate": 5.8675295233414494e-05, + "loss": 2.7397, + "step": 32220 + }, + { + "epoch": 1.5001280350117558, + "grad_norm": 0.3187264403904092, + "learning_rate": 5.867262756800326e-05, + "loss": 2.6924, + "step": 32221 + }, + { + "epoch": 1.500174593197849, + "grad_norm": 0.3711182362236355, + "learning_rate": 5.8669959877138714e-05, + "loss": 2.6521, + "step": 32222 + }, + { + "epoch": 1.500221151383942, + "grad_norm": 0.331107025042917, + "learning_rate": 5.866729216082869e-05, + "loss": 2.7273, + "step": 32223 + }, + { + "epoch": 1.5002677095700352, + "grad_norm": 0.34767377879854905, + "learning_rate": 5.866462441908103e-05, + "loss": 2.8259, + "step": 32224 + }, + { + "epoch": 1.5003142677561283, + "grad_norm": 0.3776247499819839, + "learning_rate": 5.866195665190354e-05, + "loss": 2.6991, + "step": 32225 + }, + { + "epoch": 1.5003608259422214, + "grad_norm": 0.33353875768205615, + "learning_rate": 5.865928885930406e-05, + "loss": 2.7939, + "step": 32226 + }, + { + "epoch": 1.5004073841283143, + "grad_norm": 0.3308273354854514, + "learning_rate": 5.865662104129042e-05, + "loss": 2.6418, + "step": 32227 + }, + { + "epoch": 1.5004539423144074, + "grad_norm": 0.3590012812689798, + "learning_rate": 5.865395319787046e-05, + "loss": 2.6863, + "step": 32228 + }, + { + "epoch": 1.5005005005005005, + "grad_norm": 0.33664967101994725, + "learning_rate": 5.865128532905199e-05, + "loss": 2.7202, + "step": 32229 + }, + { + "epoch": 1.5005470586865934, + "grad_norm": 0.3189864271126406, + "learning_rate": 5.864861743484285e-05, + "loss": 2.7066, + "step": 32230 + }, + { + "epoch": 1.5005936168726866, + "grad_norm": 0.3252733949824618, + "learning_rate": 5.864594951525086e-05, + "loss": 2.6621, + "step": 32231 + }, + { + "epoch": 1.5006401750587797, + "grad_norm": 0.3234705524548565, + "learning_rate": 5.864328157028387e-05, + "loss": 2.7676, + "step": 32232 + }, + { + "epoch": 1.5006867332448728, + "grad_norm": 0.32630437521856104, + "learning_rate": 5.8640613599949714e-05, + "loss": 2.834, + "step": 32233 + }, + { + "epoch": 1.500733291430966, + "grad_norm": 0.36033855999552833, + "learning_rate": 5.8637945604256194e-05, + "loss": 2.6484, + "step": 32234 + }, + { + "epoch": 1.500779849617059, + "grad_norm": 0.31692335241258374, + "learning_rate": 5.863527758321115e-05, + "loss": 2.7289, + "step": 32235 + }, + { + "epoch": 1.5008264078031521, + "grad_norm": 0.34097203843335444, + "learning_rate": 5.8632609536822434e-05, + "loss": 2.7537, + "step": 32236 + }, + { + "epoch": 1.500872965989245, + "grad_norm": 0.3506394864362561, + "learning_rate": 5.8629941465097834e-05, + "loss": 2.8366, + "step": 32237 + }, + { + "epoch": 1.5009195241753381, + "grad_norm": 0.32570668485432475, + "learning_rate": 5.8627273368045234e-05, + "loss": 2.6639, + "step": 32238 + }, + { + "epoch": 1.5009660823614313, + "grad_norm": 0.35698586537507837, + "learning_rate": 5.862460524567243e-05, + "loss": 2.751, + "step": 32239 + }, + { + "epoch": 1.5010126405475241, + "grad_norm": 0.34480394708432477, + "learning_rate": 
5.862193709798725e-05, + "loss": 2.7227, + "step": 32240 + }, + { + "epoch": 1.5010591987336173, + "grad_norm": 0.33175316289101753, + "learning_rate": 5.861926892499753e-05, + "loss": 2.8356, + "step": 32241 + }, + { + "epoch": 1.5011057569197104, + "grad_norm": 0.3637879184895081, + "learning_rate": 5.8616600726711115e-05, + "loss": 2.7253, + "step": 32242 + }, + { + "epoch": 1.5011523151058035, + "grad_norm": 0.3293893455421968, + "learning_rate": 5.861393250313583e-05, + "loss": 2.6472, + "step": 32243 + }, + { + "epoch": 1.5011988732918966, + "grad_norm": 0.30139671204149715, + "learning_rate": 5.861126425427949e-05, + "loss": 2.6997, + "step": 32244 + }, + { + "epoch": 1.5012454314779897, + "grad_norm": 0.36280005310499147, + "learning_rate": 5.860859598014993e-05, + "loss": 2.6338, + "step": 32245 + }, + { + "epoch": 1.5012919896640828, + "grad_norm": 0.32113794522705214, + "learning_rate": 5.8605927680755014e-05, + "loss": 2.7178, + "step": 32246 + }, + { + "epoch": 1.5013385478501757, + "grad_norm": 0.3341371637217397, + "learning_rate": 5.860325935610253e-05, + "loss": 2.6802, + "step": 32247 + }, + { + "epoch": 1.5013851060362688, + "grad_norm": 0.33391935044827226, + "learning_rate": 5.860059100620032e-05, + "loss": 2.768, + "step": 32248 + }, + { + "epoch": 1.5014316642223617, + "grad_norm": 0.31425513863062154, + "learning_rate": 5.8597922631056246e-05, + "loss": 2.6555, + "step": 32249 + }, + { + "epoch": 1.5014782224084549, + "grad_norm": 0.3135902219146828, + "learning_rate": 5.859525423067811e-05, + "loss": 2.648, + "step": 32250 + }, + { + "epoch": 1.501524780594548, + "grad_norm": 0.37680700359913466, + "learning_rate": 5.859258580507375e-05, + "loss": 2.762, + "step": 32251 + }, + { + "epoch": 1.501571338780641, + "grad_norm": 0.31682714235761267, + "learning_rate": 5.858991735425099e-05, + "loss": 2.6445, + "step": 32252 + }, + { + "epoch": 1.5016178969667342, + "grad_norm": 0.3276078193806415, + "learning_rate": 5.858724887821766e-05, + "loss": 2.8818, + "step": 32253 + }, + { + "epoch": 1.5016644551528273, + "grad_norm": 0.3285701792499052, + "learning_rate": 5.8584580376981615e-05, + "loss": 2.8601, + "step": 32254 + }, + { + "epoch": 1.5017110133389204, + "grad_norm": 0.323784866571081, + "learning_rate": 5.8581911850550674e-05, + "loss": 2.8328, + "step": 32255 + }, + { + "epoch": 1.5017575715250135, + "grad_norm": 0.35178588111027054, + "learning_rate": 5.857924329893266e-05, + "loss": 2.6996, + "step": 32256 + }, + { + "epoch": 1.5018041297111064, + "grad_norm": 0.3378346417697514, + "learning_rate": 5.857657472213541e-05, + "loss": 2.7221, + "step": 32257 + }, + { + "epoch": 1.5018506878971996, + "grad_norm": 0.3678994993490967, + "learning_rate": 5.8573906120166756e-05, + "loss": 2.815, + "step": 32258 + }, + { + "epoch": 1.5018972460832924, + "grad_norm": 0.3486360607240116, + "learning_rate": 5.8571237493034535e-05, + "loss": 2.6364, + "step": 32259 + }, + { + "epoch": 1.5019438042693856, + "grad_norm": 0.35662515241819803, + "learning_rate": 5.8568568840746587e-05, + "loss": 2.6764, + "step": 32260 + }, + { + "epoch": 1.5019903624554787, + "grad_norm": 0.32941233215511967, + "learning_rate": 5.8565900163310725e-05, + "loss": 2.8252, + "step": 32261 + }, + { + "epoch": 1.5020369206415718, + "grad_norm": 0.36739898537776966, + "learning_rate": 5.85632314607348e-05, + "loss": 2.8453, + "step": 32262 + }, + { + "epoch": 1.502083478827665, + "grad_norm": 0.33762594542674623, + "learning_rate": 5.856056273302663e-05, + "loss": 2.7663, + "step": 32263 + }, + { + 
"epoch": 1.502130037013758, + "grad_norm": 0.38083517634639946, + "learning_rate": 5.8557893980194046e-05, + "loss": 2.6998, + "step": 32264 + }, + { + "epoch": 1.5021765951998511, + "grad_norm": 0.32149669694251043, + "learning_rate": 5.855522520224488e-05, + "loss": 2.7723, + "step": 32265 + }, + { + "epoch": 1.502223153385944, + "grad_norm": 0.3999936227057464, + "learning_rate": 5.8552556399186976e-05, + "loss": 2.7363, + "step": 32266 + }, + { + "epoch": 1.5022697115720371, + "grad_norm": 0.32609819734758505, + "learning_rate": 5.854988757102816e-05, + "loss": 2.8099, + "step": 32267 + }, + { + "epoch": 1.5023162697581303, + "grad_norm": 0.41768284776568343, + "learning_rate": 5.8547218717776265e-05, + "loss": 2.7477, + "step": 32268 + }, + { + "epoch": 1.5023628279442232, + "grad_norm": 0.3746523191936492, + "learning_rate": 5.8544549839439143e-05, + "loss": 2.6997, + "step": 32269 + }, + { + "epoch": 1.5024093861303163, + "grad_norm": 0.39704332467897013, + "learning_rate": 5.8541880936024595e-05, + "loss": 2.7812, + "step": 32270 + }, + { + "epoch": 1.5024559443164094, + "grad_norm": 0.3701555147649126, + "learning_rate": 5.8539212007540467e-05, + "loss": 2.8559, + "step": 32271 + }, + { + "epoch": 1.5025025025025025, + "grad_norm": 0.343928772400919, + "learning_rate": 5.8536543053994585e-05, + "loss": 2.6897, + "step": 32272 + }, + { + "epoch": 1.5025490606885956, + "grad_norm": 0.34067001685720133, + "learning_rate": 5.853387407539481e-05, + "loss": 2.8344, + "step": 32273 + }, + { + "epoch": 1.5025956188746887, + "grad_norm": 0.3834574206535008, + "learning_rate": 5.853120507174894e-05, + "loss": 2.6868, + "step": 32274 + }, + { + "epoch": 1.5026421770607818, + "grad_norm": 0.3522304531913796, + "learning_rate": 5.852853604306484e-05, + "loss": 2.7609, + "step": 32275 + }, + { + "epoch": 1.5026887352468747, + "grad_norm": 0.37129521307654617, + "learning_rate": 5.852586698935031e-05, + "loss": 2.7975, + "step": 32276 + }, + { + "epoch": 1.5027352934329679, + "grad_norm": 0.3333189231395127, + "learning_rate": 5.8523197910613206e-05, + "loss": 2.7115, + "step": 32277 + }, + { + "epoch": 1.502781851619061, + "grad_norm": 0.39119096376287066, + "learning_rate": 5.8520528806861366e-05, + "loss": 2.7383, + "step": 32278 + }, + { + "epoch": 1.5028284098051539, + "grad_norm": 0.37038315053535276, + "learning_rate": 5.851785967810259e-05, + "loss": 2.6678, + "step": 32279 + }, + { + "epoch": 1.502874967991247, + "grad_norm": 0.36725302318751424, + "learning_rate": 5.851519052434475e-05, + "loss": 2.8119, + "step": 32280 + }, + { + "epoch": 1.50292152617734, + "grad_norm": 0.3716998530491669, + "learning_rate": 5.851252134559566e-05, + "loss": 2.7829, + "step": 32281 + }, + { + "epoch": 1.5029680843634332, + "grad_norm": 0.34363407554168823, + "learning_rate": 5.8509852141863174e-05, + "loss": 2.6556, + "step": 32282 + }, + { + "epoch": 1.5030146425495263, + "grad_norm": 0.34660052834223704, + "learning_rate": 5.850718291315509e-05, + "loss": 2.7144, + "step": 32283 + }, + { + "epoch": 1.5030612007356194, + "grad_norm": 0.3340748515042564, + "learning_rate": 5.8504513659479264e-05, + "loss": 2.7029, + "step": 32284 + }, + { + "epoch": 1.5031077589217126, + "grad_norm": 0.3382196709503476, + "learning_rate": 5.850184438084354e-05, + "loss": 2.7307, + "step": 32285 + }, + { + "epoch": 1.5031543171078054, + "grad_norm": 0.3484565533668159, + "learning_rate": 5.849917507725573e-05, + "loss": 2.7636, + "step": 32286 + }, + { + "epoch": 1.5032008752938986, + "grad_norm": 0.3552529257892935, + 
"learning_rate": 5.849650574872369e-05, + "loss": 2.7676, + "step": 32287 + }, + { + "epoch": 1.5032474334799915, + "grad_norm": 0.3368099018192939, + "learning_rate": 5.8493836395255244e-05, + "loss": 2.7003, + "step": 32288 + }, + { + "epoch": 1.5032939916660846, + "grad_norm": 0.38116320929698727, + "learning_rate": 5.8491167016858215e-05, + "loss": 2.7384, + "step": 32289 + }, + { + "epoch": 1.5033405498521777, + "grad_norm": 0.3343907770313738, + "learning_rate": 5.848849761354045e-05, + "loss": 2.7708, + "step": 32290 + }, + { + "epoch": 1.5033871080382708, + "grad_norm": 0.3707992292712135, + "learning_rate": 5.848582818530979e-05, + "loss": 2.7523, + "step": 32291 + }, + { + "epoch": 1.503433666224364, + "grad_norm": 0.3244967897801569, + "learning_rate": 5.8483158732174034e-05, + "loss": 2.5999, + "step": 32292 + }, + { + "epoch": 1.503480224410457, + "grad_norm": 0.3349690612736657, + "learning_rate": 5.8480489254141055e-05, + "loss": 2.715, + "step": 32293 + }, + { + "epoch": 1.5035267825965501, + "grad_norm": 0.34550889877615826, + "learning_rate": 5.847781975121869e-05, + "loss": 2.7597, + "step": 32294 + }, + { + "epoch": 1.5035733407826433, + "grad_norm": 0.34144884859079133, + "learning_rate": 5.847515022341474e-05, + "loss": 2.8009, + "step": 32295 + }, + { + "epoch": 1.5036198989687362, + "grad_norm": 0.3355640853342574, + "learning_rate": 5.847248067073707e-05, + "loss": 2.7962, + "step": 32296 + }, + { + "epoch": 1.5036664571548293, + "grad_norm": 0.37038742245021744, + "learning_rate": 5.84698110931935e-05, + "loss": 2.7674, + "step": 32297 + }, + { + "epoch": 1.5037130153409222, + "grad_norm": 0.3198727666224018, + "learning_rate": 5.846714149079187e-05, + "loss": 2.7854, + "step": 32298 + }, + { + "epoch": 1.5037595735270153, + "grad_norm": 0.3193387319960017, + "learning_rate": 5.8464471863540017e-05, + "loss": 2.8093, + "step": 32299 + }, + { + "epoch": 1.5038061317131084, + "grad_norm": 0.331721099272083, + "learning_rate": 5.846180221144577e-05, + "loss": 2.7216, + "step": 32300 + }, + { + "epoch": 1.5038526898992015, + "grad_norm": 0.338015628070094, + "learning_rate": 5.845913253451698e-05, + "loss": 2.8044, + "step": 32301 + }, + { + "epoch": 1.5038992480852946, + "grad_norm": 0.34485045254976915, + "learning_rate": 5.8456462832761446e-05, + "loss": 2.7413, + "step": 32302 + }, + { + "epoch": 1.5039458062713877, + "grad_norm": 0.34610719171066656, + "learning_rate": 5.845379310618704e-05, + "loss": 2.7799, + "step": 32303 + }, + { + "epoch": 1.5039923644574809, + "grad_norm": 0.35821224246671074, + "learning_rate": 5.8451123354801586e-05, + "loss": 2.8273, + "step": 32304 + }, + { + "epoch": 1.504038922643574, + "grad_norm": 0.32577591522054294, + "learning_rate": 5.8448453578612915e-05, + "loss": 2.7112, + "step": 32305 + }, + { + "epoch": 1.5040854808296669, + "grad_norm": 0.3337662505411152, + "learning_rate": 5.844578377762886e-05, + "loss": 2.6315, + "step": 32306 + }, + { + "epoch": 1.50413203901576, + "grad_norm": 0.3304012016697068, + "learning_rate": 5.8443113951857264e-05, + "loss": 2.6722, + "step": 32307 + }, + { + "epoch": 1.5041785972018529, + "grad_norm": 0.3299224260287912, + "learning_rate": 5.844044410130596e-05, + "loss": 2.7414, + "step": 32308 + }, + { + "epoch": 1.504225155387946, + "grad_norm": 0.3241445282227154, + "learning_rate": 5.843777422598279e-05, + "loss": 2.7565, + "step": 32309 + }, + { + "epoch": 1.504271713574039, + "grad_norm": 0.3221439581316033, + "learning_rate": 5.843510432589557e-05, + "loss": 2.7162, + "step": 32310 + 
}, + { + "epoch": 1.5043182717601322, + "grad_norm": 0.3600286335699451, + "learning_rate": 5.843243440105216e-05, + "loss": 2.715, + "step": 32311 + }, + { + "epoch": 1.5043648299462253, + "grad_norm": 0.33940641707451413, + "learning_rate": 5.842976445146038e-05, + "loss": 2.8107, + "step": 32312 + }, + { + "epoch": 1.5044113881323185, + "grad_norm": 0.3309561524005618, + "learning_rate": 5.842709447712809e-05, + "loss": 2.7411, + "step": 32313 + }, + { + "epoch": 1.5044579463184116, + "grad_norm": 0.3389044396822173, + "learning_rate": 5.84244244780631e-05, + "loss": 2.7199, + "step": 32314 + }, + { + "epoch": 1.5045045045045045, + "grad_norm": 0.32226151219358834, + "learning_rate": 5.842175445427325e-05, + "loss": 2.7559, + "step": 32315 + }, + { + "epoch": 1.5045510626905976, + "grad_norm": 0.36428537086025065, + "learning_rate": 5.8419084405766364e-05, + "loss": 2.6546, + "step": 32316 + }, + { + "epoch": 1.5045976208766907, + "grad_norm": 0.3264477844467494, + "learning_rate": 5.8416414332550316e-05, + "loss": 2.7887, + "step": 32317 + }, + { + "epoch": 1.5046441790627836, + "grad_norm": 0.3625529499825002, + "learning_rate": 5.841374423463292e-05, + "loss": 2.7395, + "step": 32318 + }, + { + "epoch": 1.5046907372488767, + "grad_norm": 0.32844879889517364, + "learning_rate": 5.8411074112022e-05, + "loss": 2.6681, + "step": 32319 + }, + { + "epoch": 1.5047372954349698, + "grad_norm": 0.3313410889987279, + "learning_rate": 5.8408403964725414e-05, + "loss": 2.6932, + "step": 32320 + }, + { + "epoch": 1.504783853621063, + "grad_norm": 0.3441078831235558, + "learning_rate": 5.8405733792750996e-05, + "loss": 2.6479, + "step": 32321 + }, + { + "epoch": 1.504830411807156, + "grad_norm": 0.3441481775155635, + "learning_rate": 5.840306359610657e-05, + "loss": 2.6469, + "step": 32322 + }, + { + "epoch": 1.5048769699932492, + "grad_norm": 0.33712396687567614, + "learning_rate": 5.840039337479999e-05, + "loss": 2.6639, + "step": 32323 + }, + { + "epoch": 1.5049235281793423, + "grad_norm": 0.3439288681189601, + "learning_rate": 5.8397723128839066e-05, + "loss": 2.6206, + "step": 32324 + }, + { + "epoch": 1.5049700863654352, + "grad_norm": 0.35311393373488176, + "learning_rate": 5.839505285823166e-05, + "loss": 2.8076, + "step": 32325 + }, + { + "epoch": 1.5050166445515283, + "grad_norm": 0.3412321566904542, + "learning_rate": 5.83923825629856e-05, + "loss": 2.7171, + "step": 32326 + }, + { + "epoch": 1.5050632027376214, + "grad_norm": 0.34720293256875295, + "learning_rate": 5.838971224310874e-05, + "loss": 2.6794, + "step": 32327 + }, + { + "epoch": 1.5051097609237143, + "grad_norm": 0.3271833955724969, + "learning_rate": 5.8387041898608886e-05, + "loss": 2.7932, + "step": 32328 + }, + { + "epoch": 1.5051563191098074, + "grad_norm": 0.3343862179691206, + "learning_rate": 5.8384371529493885e-05, + "loss": 2.6902, + "step": 32329 + }, + { + "epoch": 1.5052028772959005, + "grad_norm": 0.3262073507139691, + "learning_rate": 5.8381701135771595e-05, + "loss": 2.721, + "step": 32330 + }, + { + "epoch": 1.5052494354819936, + "grad_norm": 0.31045976850266804, + "learning_rate": 5.837903071744982e-05, + "loss": 2.5638, + "step": 32331 + }, + { + "epoch": 1.5052959936680868, + "grad_norm": 0.33068812386810503, + "learning_rate": 5.837636027453643e-05, + "loss": 2.7088, + "step": 32332 + }, + { + "epoch": 1.5053425518541799, + "grad_norm": 0.3195948056869437, + "learning_rate": 5.8373689807039246e-05, + "loss": 2.7702, + "step": 32333 + }, + { + "epoch": 1.505389110040273, + "grad_norm": 0.3298804555832647, 
+ "learning_rate": 5.8371019314966104e-05, + "loss": 2.6911, + "step": 32334 + }, + { + "epoch": 1.5054356682263659, + "grad_norm": 0.3673340300745741, + "learning_rate": 5.836834879832483e-05, + "loss": 2.7196, + "step": 32335 + }, + { + "epoch": 1.505482226412459, + "grad_norm": 0.3407163173074789, + "learning_rate": 5.8365678257123305e-05, + "loss": 2.747, + "step": 32336 + }, + { + "epoch": 1.5055287845985519, + "grad_norm": 0.31841162406716833, + "learning_rate": 5.836300769136932e-05, + "loss": 2.7147, + "step": 32337 + }, + { + "epoch": 1.505575342784645, + "grad_norm": 0.35413606988360335, + "learning_rate": 5.836033710107074e-05, + "loss": 2.763, + "step": 32338 + }, + { + "epoch": 1.5056219009707381, + "grad_norm": 0.3118522052290433, + "learning_rate": 5.835766648623539e-05, + "loss": 2.7952, + "step": 32339 + }, + { + "epoch": 1.5056684591568312, + "grad_norm": 0.34943138664876544, + "learning_rate": 5.8354995846871116e-05, + "loss": 2.7489, + "step": 32340 + }, + { + "epoch": 1.5057150173429243, + "grad_norm": 0.3502132206080986, + "learning_rate": 5.8352325182985756e-05, + "loss": 2.7615, + "step": 32341 + }, + { + "epoch": 1.5057615755290175, + "grad_norm": 0.3316352208136591, + "learning_rate": 5.8349654494587134e-05, + "loss": 2.6774, + "step": 32342 + }, + { + "epoch": 1.5058081337151106, + "grad_norm": 0.37389761643044395, + "learning_rate": 5.83469837816831e-05, + "loss": 2.7473, + "step": 32343 + }, + { + "epoch": 1.5058546919012037, + "grad_norm": 0.3503410347424746, + "learning_rate": 5.8344313044281485e-05, + "loss": 2.7035, + "step": 32344 + }, + { + "epoch": 1.5059012500872966, + "grad_norm": 0.3619975545164392, + "learning_rate": 5.834164228239015e-05, + "loss": 2.8022, + "step": 32345 + }, + { + "epoch": 1.5059478082733897, + "grad_norm": 0.3388703633384345, + "learning_rate": 5.83389714960169e-05, + "loss": 2.6457, + "step": 32346 + }, + { + "epoch": 1.5059943664594826, + "grad_norm": 0.3452692783181093, + "learning_rate": 5.83363006851696e-05, + "loss": 2.7788, + "step": 32347 + }, + { + "epoch": 1.5060409246455757, + "grad_norm": 0.3564585092804882, + "learning_rate": 5.8333629849856073e-05, + "loss": 2.7199, + "step": 32348 + }, + { + "epoch": 1.5060874828316688, + "grad_norm": 0.36333791796806064, + "learning_rate": 5.833095899008416e-05, + "loss": 2.7403, + "step": 32349 + }, + { + "epoch": 1.506134041017762, + "grad_norm": 0.3489027103640201, + "learning_rate": 5.8328288105861726e-05, + "loss": 2.678, + "step": 32350 + }, + { + "epoch": 1.506180599203855, + "grad_norm": 0.3361161508015058, + "learning_rate": 5.832561719719657e-05, + "loss": 2.7624, + "step": 32351 + }, + { + "epoch": 1.5062271573899482, + "grad_norm": 0.3421365069717582, + "learning_rate": 5.8322946264096554e-05, + "loss": 2.7514, + "step": 32352 + }, + { + "epoch": 1.5062737155760413, + "grad_norm": 0.36013004256393605, + "learning_rate": 5.8320275306569504e-05, + "loss": 2.8221, + "step": 32353 + }, + { + "epoch": 1.5063202737621342, + "grad_norm": 0.33691093289869906, + "learning_rate": 5.831760432462328e-05, + "loss": 2.8114, + "step": 32354 + }, + { + "epoch": 1.5063668319482273, + "grad_norm": 0.35558115588878314, + "learning_rate": 5.831493331826569e-05, + "loss": 2.7714, + "step": 32355 + }, + { + "epoch": 1.5064133901343204, + "grad_norm": 0.3478341041505751, + "learning_rate": 5.831226228750458e-05, + "loss": 2.7155, + "step": 32356 + }, + { + "epoch": 1.5064599483204133, + "grad_norm": 0.32567857870007705, + "learning_rate": 5.8309591232347816e-05, + "loss": 2.5722, + "step": 
32357 + }, + { + "epoch": 1.5065065065065064, + "grad_norm": 0.3448737028838495, + "learning_rate": 5.8306920152803216e-05, + "loss": 2.7665, + "step": 32358 + }, + { + "epoch": 1.5065530646925995, + "grad_norm": 0.3300565161835254, + "learning_rate": 5.8304249048878637e-05, + "loss": 2.7554, + "step": 32359 + }, + { + "epoch": 1.5065996228786926, + "grad_norm": 0.31457608766731937, + "learning_rate": 5.830157792058188e-05, + "loss": 2.5935, + "step": 32360 + }, + { + "epoch": 1.5066461810647858, + "grad_norm": 0.35485735522287476, + "learning_rate": 5.8298906767920824e-05, + "loss": 2.6939, + "step": 32361 + }, + { + "epoch": 1.5066927392508789, + "grad_norm": 0.3419910376674363, + "learning_rate": 5.829623559090329e-05, + "loss": 2.7223, + "step": 32362 + }, + { + "epoch": 1.506739297436972, + "grad_norm": 0.36339085905630547, + "learning_rate": 5.829356438953714e-05, + "loss": 2.7702, + "step": 32363 + }, + { + "epoch": 1.5067858556230649, + "grad_norm": 0.35453465025769143, + "learning_rate": 5.8290893163830174e-05, + "loss": 2.7117, + "step": 32364 + }, + { + "epoch": 1.506832413809158, + "grad_norm": 0.3766672756728098, + "learning_rate": 5.828822191379026e-05, + "loss": 2.7286, + "step": 32365 + }, + { + "epoch": 1.5068789719952511, + "grad_norm": 0.3345143273055649, + "learning_rate": 5.828555063942523e-05, + "loss": 2.7537, + "step": 32366 + }, + { + "epoch": 1.506925530181344, + "grad_norm": 0.3993261396476894, + "learning_rate": 5.828287934074292e-05, + "loss": 2.7753, + "step": 32367 + }, + { + "epoch": 1.5069720883674371, + "grad_norm": 0.341073165507414, + "learning_rate": 5.8280208017751195e-05, + "loss": 2.7409, + "step": 32368 + }, + { + "epoch": 1.5070186465535302, + "grad_norm": 0.3682941246345933, + "learning_rate": 5.8277536670457854e-05, + "loss": 2.7524, + "step": 32369 + }, + { + "epoch": 1.5070652047396234, + "grad_norm": 0.3576295257216906, + "learning_rate": 5.8274865298870775e-05, + "loss": 2.7338, + "step": 32370 + }, + { + "epoch": 1.5071117629257165, + "grad_norm": 0.36720071154523964, + "learning_rate": 5.8272193902997764e-05, + "loss": 2.7949, + "step": 32371 + }, + { + "epoch": 1.5071583211118096, + "grad_norm": 0.3610881289118563, + "learning_rate": 5.8269522482846695e-05, + "loss": 2.7342, + "step": 32372 + }, + { + "epoch": 1.5072048792979027, + "grad_norm": 0.3629987475861835, + "learning_rate": 5.826685103842539e-05, + "loss": 2.6675, + "step": 32373 + }, + { + "epoch": 1.5072514374839956, + "grad_norm": 0.3608198928719174, + "learning_rate": 5.8264179569741675e-05, + "loss": 2.6691, + "step": 32374 + }, + { + "epoch": 1.5072979956700887, + "grad_norm": 0.3616088924072986, + "learning_rate": 5.826150807680342e-05, + "loss": 2.7755, + "step": 32375 + }, + { + "epoch": 1.5073445538561816, + "grad_norm": 0.3773999448958417, + "learning_rate": 5.8258836559618456e-05, + "loss": 2.7513, + "step": 32376 + }, + { + "epoch": 1.5073911120422747, + "grad_norm": 0.33856518789153284, + "learning_rate": 5.825616501819461e-05, + "loss": 2.6976, + "step": 32377 + }, + { + "epoch": 1.5074376702283678, + "grad_norm": 0.37706943579805985, + "learning_rate": 5.825349345253974e-05, + "loss": 2.7238, + "step": 32378 + }, + { + "epoch": 1.507484228414461, + "grad_norm": 0.31825843388418135, + "learning_rate": 5.8250821862661686e-05, + "loss": 2.7685, + "step": 32379 + }, + { + "epoch": 1.507530786600554, + "grad_norm": 0.3592651692492772, + "learning_rate": 5.824815024856827e-05, + "loss": 2.8076, + "step": 32380 + }, + { + "epoch": 1.5075773447866472, + "grad_norm": 
0.34518588133706524, + "learning_rate": 5.824547861026735e-05, + "loss": 2.7399, + "step": 32381 + }, + { + "epoch": 1.5076239029727403, + "grad_norm": 0.34842484105127236, + "learning_rate": 5.824280694776676e-05, + "loss": 2.7645, + "step": 32382 + }, + { + "epoch": 1.5076704611588334, + "grad_norm": 0.34085589528066496, + "learning_rate": 5.8240135261074346e-05, + "loss": 2.661, + "step": 32383 + }, + { + "epoch": 1.5077170193449263, + "grad_norm": 0.3326153200727794, + "learning_rate": 5.823746355019795e-05, + "loss": 2.7152, + "step": 32384 + }, + { + "epoch": 1.5077635775310194, + "grad_norm": 0.33961455148375475, + "learning_rate": 5.82347918151454e-05, + "loss": 2.8348, + "step": 32385 + }, + { + "epoch": 1.5078101357171123, + "grad_norm": 0.35753753353517154, + "learning_rate": 5.823212005592457e-05, + "loss": 2.7067, + "step": 32386 + }, + { + "epoch": 1.5078566939032054, + "grad_norm": 0.34244235708117743, + "learning_rate": 5.822944827254325e-05, + "loss": 2.7582, + "step": 32387 + }, + { + "epoch": 1.5079032520892985, + "grad_norm": 0.35160445904241544, + "learning_rate": 5.822677646500932e-05, + "loss": 2.8361, + "step": 32388 + }, + { + "epoch": 1.5079498102753917, + "grad_norm": 0.33099912498092765, + "learning_rate": 5.822410463333062e-05, + "loss": 2.7495, + "step": 32389 + }, + { + "epoch": 1.5079963684614848, + "grad_norm": 0.34871514774424733, + "learning_rate": 5.822143277751499e-05, + "loss": 2.7338, + "step": 32390 + }, + { + "epoch": 1.5080429266475779, + "grad_norm": 0.3369893417253438, + "learning_rate": 5.8218760897570256e-05, + "loss": 2.682, + "step": 32391 + }, + { + "epoch": 1.508089484833671, + "grad_norm": 0.33282375360783534, + "learning_rate": 5.8216088993504257e-05, + "loss": 2.7591, + "step": 32392 + }, + { + "epoch": 1.5081360430197641, + "grad_norm": 0.3456329790973983, + "learning_rate": 5.821341706532486e-05, + "loss": 2.8352, + "step": 32393 + }, + { + "epoch": 1.508182601205857, + "grad_norm": 0.3426631418219196, + "learning_rate": 5.821074511303988e-05, + "loss": 2.7487, + "step": 32394 + }, + { + "epoch": 1.5082291593919501, + "grad_norm": 0.33793060657569235, + "learning_rate": 5.820807313665718e-05, + "loss": 2.7551, + "step": 32395 + }, + { + "epoch": 1.508275717578043, + "grad_norm": 0.3336946333487676, + "learning_rate": 5.82054011361846e-05, + "loss": 2.71, + "step": 32396 + }, + { + "epoch": 1.5083222757641361, + "grad_norm": 0.3351171559412983, + "learning_rate": 5.820272911162996e-05, + "loss": 2.8006, + "step": 32397 + }, + { + "epoch": 1.5083688339502292, + "grad_norm": 0.32634949492643456, + "learning_rate": 5.8200057063001126e-05, + "loss": 2.7721, + "step": 32398 + }, + { + "epoch": 1.5084153921363224, + "grad_norm": 0.35084204896934823, + "learning_rate": 5.819738499030595e-05, + "loss": 2.8054, + "step": 32399 + }, + { + "epoch": 1.5084619503224155, + "grad_norm": 0.30863841382398066, + "learning_rate": 5.819471289355223e-05, + "loss": 2.6116, + "step": 32400 + }, + { + "epoch": 1.5085085085085086, + "grad_norm": 0.3177255113563022, + "learning_rate": 5.819204077274785e-05, + "loss": 2.6689, + "step": 32401 + }, + { + "epoch": 1.5085550666946017, + "grad_norm": 0.3174280487085543, + "learning_rate": 5.818936862790063e-05, + "loss": 2.793, + "step": 32402 + }, + { + "epoch": 1.5086016248806946, + "grad_norm": 0.31899812730824684, + "learning_rate": 5.818669645901842e-05, + "loss": 2.6873, + "step": 32403 + }, + { + "epoch": 1.5086481830667877, + "grad_norm": 0.30647551186123784, + "learning_rate": 5.8184024266109085e-05, + 
"loss": 2.7268, + "step": 32404 + }, + { + "epoch": 1.5086947412528808, + "grad_norm": 0.343381537099066, + "learning_rate": 5.8181352049180414e-05, + "loss": 2.753, + "step": 32405 + }, + { + "epoch": 1.5087412994389737, + "grad_norm": 0.3642867722625741, + "learning_rate": 5.817867980824029e-05, + "loss": 2.9121, + "step": 32406 + }, + { + "epoch": 1.5087878576250668, + "grad_norm": 0.31829822685738696, + "learning_rate": 5.8176007543296554e-05, + "loss": 2.6833, + "step": 32407 + }, + { + "epoch": 1.50883441581116, + "grad_norm": 0.3860694384524359, + "learning_rate": 5.8173335254357045e-05, + "loss": 2.7136, + "step": 32408 + }, + { + "epoch": 1.508880973997253, + "grad_norm": 0.3181548440303155, + "learning_rate": 5.8170662941429577e-05, + "loss": 2.6885, + "step": 32409 + }, + { + "epoch": 1.5089275321833462, + "grad_norm": 0.35804453732355057, + "learning_rate": 5.8167990604522036e-05, + "loss": 2.7624, + "step": 32410 + }, + { + "epoch": 1.5089740903694393, + "grad_norm": 0.34884659221559505, + "learning_rate": 5.8165318243642244e-05, + "loss": 2.7283, + "step": 32411 + }, + { + "epoch": 1.5090206485555324, + "grad_norm": 0.34516500389438753, + "learning_rate": 5.8162645858798046e-05, + "loss": 2.6753, + "step": 32412 + }, + { + "epoch": 1.5090672067416253, + "grad_norm": 0.331392845828865, + "learning_rate": 5.81599734499973e-05, + "loss": 2.7336, + "step": 32413 + }, + { + "epoch": 1.5091137649277184, + "grad_norm": 0.36410558156341044, + "learning_rate": 5.815730101724782e-05, + "loss": 2.8201, + "step": 32414 + }, + { + "epoch": 1.5091603231138113, + "grad_norm": 0.33647702478125985, + "learning_rate": 5.815462856055748e-05, + "loss": 2.7268, + "step": 32415 + }, + { + "epoch": 1.5092068812999044, + "grad_norm": 0.3439333856693485, + "learning_rate": 5.815195607993409e-05, + "loss": 2.777, + "step": 32416 + }, + { + "epoch": 1.5092534394859975, + "grad_norm": 0.33376830327024426, + "learning_rate": 5.814928357538554e-05, + "loss": 2.7722, + "step": 32417 + }, + { + "epoch": 1.5092999976720907, + "grad_norm": 0.3496730237315108, + "learning_rate": 5.814661104691962e-05, + "loss": 2.7742, + "step": 32418 + }, + { + "epoch": 1.5093465558581838, + "grad_norm": 0.3494008669939117, + "learning_rate": 5.8143938494544206e-05, + "loss": 2.7612, + "step": 32419 + }, + { + "epoch": 1.509393114044277, + "grad_norm": 0.37402365102861196, + "learning_rate": 5.814126591826712e-05, + "loss": 2.755, + "step": 32420 + }, + { + "epoch": 1.50943967223037, + "grad_norm": 0.35499448815200046, + "learning_rate": 5.813859331809624e-05, + "loss": 2.6958, + "step": 32421 + }, + { + "epoch": 1.5094862304164631, + "grad_norm": 0.3659420741774566, + "learning_rate": 5.813592069403939e-05, + "loss": 2.7223, + "step": 32422 + }, + { + "epoch": 1.509532788602556, + "grad_norm": 0.3167539072416042, + "learning_rate": 5.81332480461044e-05, + "loss": 2.6281, + "step": 32423 + }, + { + "epoch": 1.5095793467886491, + "grad_norm": 0.36614422800907426, + "learning_rate": 5.8130575374299143e-05, + "loss": 2.7779, + "step": 32424 + }, + { + "epoch": 1.509625904974742, + "grad_norm": 0.33350112348684424, + "learning_rate": 5.812790267863144e-05, + "loss": 2.7867, + "step": 32425 + }, + { + "epoch": 1.5096724631608351, + "grad_norm": 0.34970138528213496, + "learning_rate": 5.812522995910915e-05, + "loss": 2.8538, + "step": 32426 + }, + { + "epoch": 1.5097190213469283, + "grad_norm": 0.3217979478373686, + "learning_rate": 5.8122557215740105e-05, + "loss": 2.8043, + "step": 32427 + }, + { + "epoch": 1.5097655795330214, + 
"grad_norm": 0.35854437707383446, + "learning_rate": 5.811988444853217e-05, + "loss": 2.7195, + "step": 32428 + }, + { + "epoch": 1.5098121377191145, + "grad_norm": 0.34867831193736293, + "learning_rate": 5.8117211657493154e-05, + "loss": 2.7962, + "step": 32429 + }, + { + "epoch": 1.5098586959052076, + "grad_norm": 0.3380913900450454, + "learning_rate": 5.811453884263093e-05, + "loss": 2.7313, + "step": 32430 + }, + { + "epoch": 1.5099052540913007, + "grad_norm": 0.3648306462252905, + "learning_rate": 5.811186600395334e-05, + "loss": 2.7063, + "step": 32431 + }, + { + "epoch": 1.5099518122773938, + "grad_norm": 0.31873563385594994, + "learning_rate": 5.810919314146821e-05, + "loss": 2.6844, + "step": 32432 + }, + { + "epoch": 1.5099983704634867, + "grad_norm": 0.3256842749298894, + "learning_rate": 5.810652025518339e-05, + "loss": 2.8094, + "step": 32433 + }, + { + "epoch": 1.5100449286495798, + "grad_norm": 0.34470689841533586, + "learning_rate": 5.8103847345106755e-05, + "loss": 2.8187, + "step": 32434 + }, + { + "epoch": 1.5100914868356727, + "grad_norm": 0.34065786457991337, + "learning_rate": 5.810117441124612e-05, + "loss": 2.7465, + "step": 32435 + }, + { + "epoch": 1.5101380450217659, + "grad_norm": 0.33222710520549814, + "learning_rate": 5.809850145360932e-05, + "loss": 2.7081, + "step": 32436 + }, + { + "epoch": 1.510184603207859, + "grad_norm": 0.36099771934661473, + "learning_rate": 5.809582847220423e-05, + "loss": 2.7276, + "step": 32437 + }, + { + "epoch": 1.510231161393952, + "grad_norm": 0.3408516326298677, + "learning_rate": 5.809315546703867e-05, + "loss": 2.7701, + "step": 32438 + }, + { + "epoch": 1.5102777195800452, + "grad_norm": 0.35162074511860825, + "learning_rate": 5.809048243812051e-05, + "loss": 2.7511, + "step": 32439 + }, + { + "epoch": 1.5103242777661383, + "grad_norm": 0.329205427934601, + "learning_rate": 5.8087809385457584e-05, + "loss": 2.7267, + "step": 32440 + }, + { + "epoch": 1.5103708359522314, + "grad_norm": 0.3556507891762837, + "learning_rate": 5.8085136309057734e-05, + "loss": 2.7798, + "step": 32441 + }, + { + "epoch": 1.5104173941383243, + "grad_norm": 0.36887892842277276, + "learning_rate": 5.808246320892879e-05, + "loss": 2.6628, + "step": 32442 + }, + { + "epoch": 1.5104639523244174, + "grad_norm": 0.34719157980097476, + "learning_rate": 5.8079790085078624e-05, + "loss": 2.7241, + "step": 32443 + }, + { + "epoch": 1.5105105105105106, + "grad_norm": 0.34692853535281104, + "learning_rate": 5.8077116937515064e-05, + "loss": 2.7085, + "step": 32444 + }, + { + "epoch": 1.5105570686966034, + "grad_norm": 0.36158586787498964, + "learning_rate": 5.8074443766245965e-05, + "loss": 2.8304, + "step": 32445 + }, + { + "epoch": 1.5106036268826966, + "grad_norm": 0.32364624261639097, + "learning_rate": 5.807177057127916e-05, + "loss": 2.6699, + "step": 32446 + }, + { + "epoch": 1.5106501850687897, + "grad_norm": 0.34539608973163927, + "learning_rate": 5.8069097352622514e-05, + "loss": 2.6809, + "step": 32447 + }, + { + "epoch": 1.5106967432548828, + "grad_norm": 0.3443958161874284, + "learning_rate": 5.8066424110283855e-05, + "loss": 2.6635, + "step": 32448 + }, + { + "epoch": 1.510743301440976, + "grad_norm": 0.3408631549013534, + "learning_rate": 5.8063750844271045e-05, + "loss": 2.7956, + "step": 32449 + }, + { + "epoch": 1.510789859627069, + "grad_norm": 0.34492521815033683, + "learning_rate": 5.806107755459191e-05, + "loss": 2.7408, + "step": 32450 + }, + { + "epoch": 1.5108364178131621, + "grad_norm": 0.3633291795137626, + "learning_rate": 
5.805840424125431e-05, + "loss": 2.7188, + "step": 32451 + }, + { + "epoch": 1.510882975999255, + "grad_norm": 0.34210789060800756, + "learning_rate": 5.805573090426609e-05, + "loss": 2.7684, + "step": 32452 + }, + { + "epoch": 1.5109295341853481, + "grad_norm": 0.3731960715160732, + "learning_rate": 5.80530575436351e-05, + "loss": 2.7161, + "step": 32453 + }, + { + "epoch": 1.5109760923714413, + "grad_norm": 0.38361937698144477, + "learning_rate": 5.805038415936919e-05, + "loss": 2.7591, + "step": 32454 + }, + { + "epoch": 1.5110226505575342, + "grad_norm": 0.3278211418835913, + "learning_rate": 5.804771075147617e-05, + "loss": 2.7596, + "step": 32455 + }, + { + "epoch": 1.5110692087436273, + "grad_norm": 0.38821261187629463, + "learning_rate": 5.8045037319963916e-05, + "loss": 2.6533, + "step": 32456 + }, + { + "epoch": 1.5111157669297204, + "grad_norm": 0.3445340481713328, + "learning_rate": 5.804236386484027e-05, + "loss": 2.7447, + "step": 32457 + }, + { + "epoch": 1.5111623251158135, + "grad_norm": 0.36814835091579223, + "learning_rate": 5.803969038611309e-05, + "loss": 2.7234, + "step": 32458 + }, + { + "epoch": 1.5112088833019066, + "grad_norm": 0.3729281685780898, + "learning_rate": 5.80370168837902e-05, + "loss": 2.7517, + "step": 32459 + }, + { + "epoch": 1.5112554414879997, + "grad_norm": 0.33286716380846654, + "learning_rate": 5.803434335787946e-05, + "loss": 2.6663, + "step": 32460 + }, + { + "epoch": 1.5113019996740928, + "grad_norm": 0.3738789079289925, + "learning_rate": 5.803166980838871e-05, + "loss": 2.7785, + "step": 32461 + }, + { + "epoch": 1.5113485578601857, + "grad_norm": 0.3358714996573902, + "learning_rate": 5.80289962353258e-05, + "loss": 2.6864, + "step": 32462 + }, + { + "epoch": 1.5113951160462789, + "grad_norm": 0.38693479280598386, + "learning_rate": 5.802632263869859e-05, + "loss": 2.7117, + "step": 32463 + }, + { + "epoch": 1.5114416742323717, + "grad_norm": 0.3505563090242771, + "learning_rate": 5.8023649018514906e-05, + "loss": 2.7203, + "step": 32464 + }, + { + "epoch": 1.5114882324184649, + "grad_norm": 0.3881650149266905, + "learning_rate": 5.802097537478259e-05, + "loss": 2.7336, + "step": 32465 + }, + { + "epoch": 1.511534790604558, + "grad_norm": 0.3464402386862901, + "learning_rate": 5.801830170750953e-05, + "loss": 2.7756, + "step": 32466 + }, + { + "epoch": 1.511581348790651, + "grad_norm": 0.366389834153412, + "learning_rate": 5.801562801670353e-05, + "loss": 2.8099, + "step": 32467 + }, + { + "epoch": 1.5116279069767442, + "grad_norm": 0.37279549453027794, + "learning_rate": 5.801295430237245e-05, + "loss": 2.7157, + "step": 32468 + }, + { + "epoch": 1.5116744651628373, + "grad_norm": 0.33604813096365155, + "learning_rate": 5.801028056452413e-05, + "loss": 2.6696, + "step": 32469 + }, + { + "epoch": 1.5117210233489304, + "grad_norm": 0.35856347475431055, + "learning_rate": 5.800760680316644e-05, + "loss": 2.734, + "step": 32470 + }, + { + "epoch": 1.5117675815350236, + "grad_norm": 0.3323156218584989, + "learning_rate": 5.800493301830721e-05, + "loss": 2.6628, + "step": 32471 + }, + { + "epoch": 1.5118141397211164, + "grad_norm": 0.33910555535920517, + "learning_rate": 5.800225920995428e-05, + "loss": 2.6376, + "step": 32472 + }, + { + "epoch": 1.5118606979072096, + "grad_norm": 0.32463825341514974, + "learning_rate": 5.799958537811552e-05, + "loss": 2.7581, + "step": 32473 + }, + { + "epoch": 1.5119072560933025, + "grad_norm": 0.37454875630523216, + "learning_rate": 5.799691152279875e-05, + "loss": 2.7, + "step": 32474 + }, + { + "epoch": 
1.5119538142793956, + "grad_norm": 0.3311567725798947, + "learning_rate": 5.799423764401184e-05, + "loss": 2.7941, + "step": 32475 + }, + { + "epoch": 1.5120003724654887, + "grad_norm": 0.3405659448554318, + "learning_rate": 5.7991563741762636e-05, + "loss": 2.7604, + "step": 32476 + }, + { + "epoch": 1.5120469306515818, + "grad_norm": 0.3567078135927803, + "learning_rate": 5.798888981605898e-05, + "loss": 2.8071, + "step": 32477 + }, + { + "epoch": 1.512093488837675, + "grad_norm": 0.36712073869629425, + "learning_rate": 5.798621586690871e-05, + "loss": 2.743, + "step": 32478 + }, + { + "epoch": 1.512140047023768, + "grad_norm": 0.34204317172468096, + "learning_rate": 5.798354189431969e-05, + "loss": 2.7088, + "step": 32479 + }, + { + "epoch": 1.5121866052098611, + "grad_norm": 0.3522550949975096, + "learning_rate": 5.798086789829976e-05, + "loss": 2.7033, + "step": 32480 + }, + { + "epoch": 1.512233163395954, + "grad_norm": 0.35742573432958785, + "learning_rate": 5.797819387885677e-05, + "loss": 2.7623, + "step": 32481 + }, + { + "epoch": 1.5122797215820472, + "grad_norm": 0.3621815501214173, + "learning_rate": 5.7975519835998574e-05, + "loss": 2.6836, + "step": 32482 + }, + { + "epoch": 1.5123262797681403, + "grad_norm": 0.3632828584322807, + "learning_rate": 5.7972845769732996e-05, + "loss": 2.6908, + "step": 32483 + }, + { + "epoch": 1.5123728379542332, + "grad_norm": 0.35568780066981115, + "learning_rate": 5.797017168006791e-05, + "loss": 2.6812, + "step": 32484 + }, + { + "epoch": 1.5124193961403263, + "grad_norm": 0.3586360935745265, + "learning_rate": 5.7967497567011155e-05, + "loss": 2.7684, + "step": 32485 + }, + { + "epoch": 1.5124659543264194, + "grad_norm": 0.379757000158209, + "learning_rate": 5.796482343057058e-05, + "loss": 2.7422, + "step": 32486 + }, + { + "epoch": 1.5125125125125125, + "grad_norm": 0.36623107341841193, + "learning_rate": 5.796214927075403e-05, + "loss": 2.7358, + "step": 32487 + }, + { + "epoch": 1.5125590706986056, + "grad_norm": 0.3298659060995354, + "learning_rate": 5.795947508756935e-05, + "loss": 2.6932, + "step": 32488 + }, + { + "epoch": 1.5126056288846987, + "grad_norm": 0.3845758265538881, + "learning_rate": 5.79568008810244e-05, + "loss": 2.6292, + "step": 32489 + }, + { + "epoch": 1.5126521870707919, + "grad_norm": 0.34064131762356437, + "learning_rate": 5.795412665112704e-05, + "loss": 2.7268, + "step": 32490 + }, + { + "epoch": 1.5126987452568847, + "grad_norm": 0.3523753400713448, + "learning_rate": 5.7951452397885085e-05, + "loss": 2.7321, + "step": 32491 + }, + { + "epoch": 1.5127453034429779, + "grad_norm": 0.3306110067941531, + "learning_rate": 5.794877812130641e-05, + "loss": 2.6156, + "step": 32492 + }, + { + "epoch": 1.512791861629071, + "grad_norm": 0.3229599225322585, + "learning_rate": 5.794610382139884e-05, + "loss": 2.7858, + "step": 32493 + }, + { + "epoch": 1.5128384198151639, + "grad_norm": 0.35606900911308387, + "learning_rate": 5.7943429498170255e-05, + "loss": 2.7856, + "step": 32494 + }, + { + "epoch": 1.512884978001257, + "grad_norm": 0.3521766611306131, + "learning_rate": 5.7940755151628475e-05, + "loss": 2.7758, + "step": 32495 + }, + { + "epoch": 1.51293153618735, + "grad_norm": 0.36755873501452757, + "learning_rate": 5.793808078178137e-05, + "loss": 2.7564, + "step": 32496 + }, + { + "epoch": 1.5129780943734432, + "grad_norm": 0.3607931229827706, + "learning_rate": 5.793540638863677e-05, + "loss": 2.7099, + "step": 32497 + }, + { + "epoch": 1.5130246525595363, + "grad_norm": 0.32925591808190224, + "learning_rate": 
5.793273197220252e-05, + "loss": 2.7291, + "step": 32498 + }, + { + "epoch": 1.5130712107456294, + "grad_norm": 0.35068837844354717, + "learning_rate": 5.7930057532486515e-05, + "loss": 2.7611, + "step": 32499 + }, + { + "epoch": 1.5131177689317226, + "grad_norm": 0.36917463492385266, + "learning_rate": 5.792738306949656e-05, + "loss": 2.7094, + "step": 32500 + }, + { + "epoch": 1.5131643271178155, + "grad_norm": 0.3527214618297624, + "learning_rate": 5.792470858324051e-05, + "loss": 2.819, + "step": 32501 + }, + { + "epoch": 1.5132108853039086, + "grad_norm": 0.35098895263149693, + "learning_rate": 5.792203407372623e-05, + "loss": 2.7881, + "step": 32502 + }, + { + "epoch": 1.5132574434900015, + "grad_norm": 0.3677224447659276, + "learning_rate": 5.791935954096156e-05, + "loss": 2.7219, + "step": 32503 + }, + { + "epoch": 1.5133040016760946, + "grad_norm": 0.33655991241491534, + "learning_rate": 5.7916684984954336e-05, + "loss": 2.8558, + "step": 32504 + }, + { + "epoch": 1.5133505598621877, + "grad_norm": 0.3357923519514257, + "learning_rate": 5.7914010405712446e-05, + "loss": 2.8256, + "step": 32505 + }, + { + "epoch": 1.5133971180482808, + "grad_norm": 0.3151846230382611, + "learning_rate": 5.791133580324371e-05, + "loss": 2.676, + "step": 32506 + }, + { + "epoch": 1.513443676234374, + "grad_norm": 0.3172114101116925, + "learning_rate": 5.790866117755596e-05, + "loss": 2.743, + "step": 32507 + }, + { + "epoch": 1.513490234420467, + "grad_norm": 0.33086586951760705, + "learning_rate": 5.790598652865709e-05, + "loss": 2.7648, + "step": 32508 + }, + { + "epoch": 1.5135367926065602, + "grad_norm": 0.36642933068440137, + "learning_rate": 5.790331185655492e-05, + "loss": 2.6933, + "step": 32509 + }, + { + "epoch": 1.5135833507926533, + "grad_norm": 0.3336294353469335, + "learning_rate": 5.7900637161257306e-05, + "loss": 2.6924, + "step": 32510 + }, + { + "epoch": 1.5136299089787462, + "grad_norm": 0.34472557660541614, + "learning_rate": 5.7897962442772104e-05, + "loss": 2.6443, + "step": 32511 + }, + { + "epoch": 1.5136764671648393, + "grad_norm": 0.293141596183735, + "learning_rate": 5.789528770110717e-05, + "loss": 2.8128, + "step": 32512 + }, + { + "epoch": 1.5137230253509322, + "grad_norm": 0.3369203507180274, + "learning_rate": 5.789261293627033e-05, + "loss": 2.7419, + "step": 32513 + }, + { + "epoch": 1.5137695835370253, + "grad_norm": 0.3472521818968703, + "learning_rate": 5.788993814826944e-05, + "loss": 2.7741, + "step": 32514 + }, + { + "epoch": 1.5138161417231184, + "grad_norm": 0.352621938534127, + "learning_rate": 5.788726333711237e-05, + "loss": 2.6908, + "step": 32515 + }, + { + "epoch": 1.5138626999092115, + "grad_norm": 0.3298008677119441, + "learning_rate": 5.788458850280697e-05, + "loss": 2.7012, + "step": 32516 + }, + { + "epoch": 1.5139092580953046, + "grad_norm": 0.35195774954520487, + "learning_rate": 5.788191364536107e-05, + "loss": 2.7772, + "step": 32517 + }, + { + "epoch": 1.5139558162813977, + "grad_norm": 0.3373401892803673, + "learning_rate": 5.787923876478254e-05, + "loss": 2.7426, + "step": 32518 + }, + { + "epoch": 1.5140023744674909, + "grad_norm": 0.3580724430592615, + "learning_rate": 5.78765638610792e-05, + "loss": 2.746, + "step": 32519 + }, + { + "epoch": 1.514048932653584, + "grad_norm": 0.32666433521866206, + "learning_rate": 5.787388893425893e-05, + "loss": 2.8237, + "step": 32520 + }, + { + "epoch": 1.5140954908396769, + "grad_norm": 0.32644169987733157, + "learning_rate": 5.7871213984329586e-05, + "loss": 2.7922, + "step": 32521 + }, + { + 
"epoch": 1.51414204902577, + "grad_norm": 0.34521341129309513, + "learning_rate": 5.7868539011298996e-05, + "loss": 2.7277, + "step": 32522 + }, + { + "epoch": 1.5141886072118629, + "grad_norm": 0.3151225594501641, + "learning_rate": 5.786586401517501e-05, + "loss": 2.7339, + "step": 32523 + }, + { + "epoch": 1.514235165397956, + "grad_norm": 0.33230241045022507, + "learning_rate": 5.786318899596549e-05, + "loss": 2.8313, + "step": 32524 + }, + { + "epoch": 1.514281723584049, + "grad_norm": 0.32488671897815874, + "learning_rate": 5.7860513953678287e-05, + "loss": 2.7041, + "step": 32525 + }, + { + "epoch": 1.5143282817701422, + "grad_norm": 0.32634842881610704, + "learning_rate": 5.785783888832126e-05, + "loss": 2.6275, + "step": 32526 + }, + { + "epoch": 1.5143748399562353, + "grad_norm": 0.3835246264586379, + "learning_rate": 5.785516379990223e-05, + "loss": 2.6923, + "step": 32527 + }, + { + "epoch": 1.5144213981423285, + "grad_norm": 0.33234321971364816, + "learning_rate": 5.785248868842907e-05, + "loss": 2.8453, + "step": 32528 + }, + { + "epoch": 1.5144679563284216, + "grad_norm": 0.38403136984998876, + "learning_rate": 5.784981355390966e-05, + "loss": 2.7034, + "step": 32529 + }, + { + "epoch": 1.5145145145145145, + "grad_norm": 0.3253079320758933, + "learning_rate": 5.784713839635178e-05, + "loss": 2.7643, + "step": 32530 + }, + { + "epoch": 1.5145610727006076, + "grad_norm": 0.3395169840563476, + "learning_rate": 5.784446321576336e-05, + "loss": 2.6673, + "step": 32531 + }, + { + "epoch": 1.5146076308867007, + "grad_norm": 0.36437691607757156, + "learning_rate": 5.7841788012152185e-05, + "loss": 2.7488, + "step": 32532 + }, + { + "epoch": 1.5146541890727936, + "grad_norm": 0.3581905193846175, + "learning_rate": 5.783911278552615e-05, + "loss": 2.7477, + "step": 32533 + }, + { + "epoch": 1.5147007472588867, + "grad_norm": 0.3418944267059718, + "learning_rate": 5.7836437535893075e-05, + "loss": 2.7317, + "step": 32534 + }, + { + "epoch": 1.5147473054449798, + "grad_norm": 0.37800688873636457, + "learning_rate": 5.7833762263260846e-05, + "loss": 2.8161, + "step": 32535 + }, + { + "epoch": 1.514793863631073, + "grad_norm": 0.3669787612770714, + "learning_rate": 5.783108696763729e-05, + "loss": 2.7529, + "step": 32536 + }, + { + "epoch": 1.514840421817166, + "grad_norm": 0.347729491835399, + "learning_rate": 5.7828411649030255e-05, + "loss": 2.672, + "step": 32537 + }, + { + "epoch": 1.5148869800032592, + "grad_norm": 0.3949976749654423, + "learning_rate": 5.7825736307447606e-05, + "loss": 2.736, + "step": 32538 + }, + { + "epoch": 1.5149335381893523, + "grad_norm": 0.3338121594018807, + "learning_rate": 5.782306094289721e-05, + "loss": 2.7398, + "step": 32539 + }, + { + "epoch": 1.5149800963754452, + "grad_norm": 0.34067769786499225, + "learning_rate": 5.782038555538689e-05, + "loss": 2.7281, + "step": 32540 + }, + { + "epoch": 1.5150266545615383, + "grad_norm": 0.3567538225844946, + "learning_rate": 5.78177101449245e-05, + "loss": 2.7648, + "step": 32541 + }, + { + "epoch": 1.5150732127476314, + "grad_norm": 0.33774772253016494, + "learning_rate": 5.781503471151792e-05, + "loss": 2.703, + "step": 32542 + }, + { + "epoch": 1.5151197709337243, + "grad_norm": 0.34371413559371705, + "learning_rate": 5.781235925517496e-05, + "loss": 2.6869, + "step": 32543 + }, + { + "epoch": 1.5151663291198174, + "grad_norm": 0.3391991582123228, + "learning_rate": 5.780968377590352e-05, + "loss": 2.7317, + "step": 32544 + }, + { + "epoch": 1.5152128873059105, + "grad_norm": 0.3292603147193549, + 
"learning_rate": 5.780700827371142e-05, + "loss": 2.6304, + "step": 32545 + }, + { + "epoch": 1.5152594454920036, + "grad_norm": 0.31834299322713056, + "learning_rate": 5.7804332748606505e-05, + "loss": 2.7401, + "step": 32546 + }, + { + "epoch": 1.5153060036780968, + "grad_norm": 0.3317384253559993, + "learning_rate": 5.780165720059665e-05, + "loss": 2.7295, + "step": 32547 + }, + { + "epoch": 1.5153525618641899, + "grad_norm": 0.3212792888674888, + "learning_rate": 5.779898162968971e-05, + "loss": 2.7372, + "step": 32548 + }, + { + "epoch": 1.515399120050283, + "grad_norm": 0.32867784280176415, + "learning_rate": 5.779630603589351e-05, + "loss": 2.5817, + "step": 32549 + }, + { + "epoch": 1.5154456782363759, + "grad_norm": 0.33088521673471477, + "learning_rate": 5.779363041921593e-05, + "loss": 2.7329, + "step": 32550 + }, + { + "epoch": 1.515492236422469, + "grad_norm": 0.3169284064392399, + "learning_rate": 5.779095477966481e-05, + "loss": 2.7763, + "step": 32551 + }, + { + "epoch": 1.515538794608562, + "grad_norm": 0.3426098870783474, + "learning_rate": 5.7788279117248e-05, + "loss": 2.7393, + "step": 32552 + }, + { + "epoch": 1.515585352794655, + "grad_norm": 0.3889014082765383, + "learning_rate": 5.7785603431973365e-05, + "loss": 2.7859, + "step": 32553 + }, + { + "epoch": 1.5156319109807481, + "grad_norm": 0.3564602356415489, + "learning_rate": 5.778292772384875e-05, + "loss": 2.7388, + "step": 32554 + }, + { + "epoch": 1.5156784691668412, + "grad_norm": 0.3877908747341761, + "learning_rate": 5.778025199288202e-05, + "loss": 2.8019, + "step": 32555 + }, + { + "epoch": 1.5157250273529344, + "grad_norm": 0.3483934326729525, + "learning_rate": 5.7777576239081e-05, + "loss": 2.6677, + "step": 32556 + }, + { + "epoch": 1.5157715855390275, + "grad_norm": 0.36211747404216427, + "learning_rate": 5.777490046245358e-05, + "loss": 2.7912, + "step": 32557 + }, + { + "epoch": 1.5158181437251206, + "grad_norm": 0.39831292908922505, + "learning_rate": 5.7772224663007566e-05, + "loss": 2.7873, + "step": 32558 + }, + { + "epoch": 1.5158647019112137, + "grad_norm": 0.37001239676387965, + "learning_rate": 5.776954884075085e-05, + "loss": 2.7697, + "step": 32559 + }, + { + "epoch": 1.5159112600973066, + "grad_norm": 0.38304359962450335, + "learning_rate": 5.776687299569127e-05, + "loss": 2.6985, + "step": 32560 + }, + { + "epoch": 1.5159578182833997, + "grad_norm": 0.3745086036188636, + "learning_rate": 5.776419712783668e-05, + "loss": 2.7087, + "step": 32561 + }, + { + "epoch": 1.5160043764694926, + "grad_norm": 0.3748836636236355, + "learning_rate": 5.776152123719496e-05, + "loss": 2.737, + "step": 32562 + }, + { + "epoch": 1.5160509346555857, + "grad_norm": 0.36581242454238416, + "learning_rate": 5.7758845323773916e-05, + "loss": 2.6643, + "step": 32563 + }, + { + "epoch": 1.5160974928416788, + "grad_norm": 0.35097650613786635, + "learning_rate": 5.775616938758143e-05, + "loss": 2.6367, + "step": 32564 + }, + { + "epoch": 1.516144051027772, + "grad_norm": 0.41724824831784035, + "learning_rate": 5.775349342862535e-05, + "loss": 2.7669, + "step": 32565 + }, + { + "epoch": 1.516190609213865, + "grad_norm": 0.33781411235552583, + "learning_rate": 5.775081744691354e-05, + "loss": 2.7158, + "step": 32566 + }, + { + "epoch": 1.5162371673999582, + "grad_norm": 0.37972826248180724, + "learning_rate": 5.774814144245384e-05, + "loss": 2.7282, + "step": 32567 + }, + { + "epoch": 1.5162837255860513, + "grad_norm": 0.3430847556999369, + "learning_rate": 5.774546541525412e-05, + "loss": 2.6756, + "step": 32568 + 
}, + { + "epoch": 1.5163302837721442, + "grad_norm": 0.3271376169375793, + "learning_rate": 5.77427893653222e-05, + "loss": 2.7431, + "step": 32569 + }, + { + "epoch": 1.5163768419582373, + "grad_norm": 0.36330770979053784, + "learning_rate": 5.7740113292665965e-05, + "loss": 2.8134, + "step": 32570 + }, + { + "epoch": 1.5164234001443304, + "grad_norm": 0.35426075728536, + "learning_rate": 5.773743719729327e-05, + "loss": 2.7572, + "step": 32571 + }, + { + "epoch": 1.5164699583304233, + "grad_norm": 0.36026736252629427, + "learning_rate": 5.773476107921194e-05, + "loss": 2.6534, + "step": 32572 + }, + { + "epoch": 1.5165165165165164, + "grad_norm": 0.346648137115331, + "learning_rate": 5.7732084938429854e-05, + "loss": 2.8184, + "step": 32573 + }, + { + "epoch": 1.5165630747026095, + "grad_norm": 0.36615823768138483, + "learning_rate": 5.772940877495486e-05, + "loss": 2.7127, + "step": 32574 + }, + { + "epoch": 1.5166096328887027, + "grad_norm": 0.32844491100647505, + "learning_rate": 5.772673258879483e-05, + "loss": 2.7059, + "step": 32575 + }, + { + "epoch": 1.5166561910747958, + "grad_norm": 0.3645228136226498, + "learning_rate": 5.772405637995758e-05, + "loss": 2.686, + "step": 32576 + }, + { + "epoch": 1.5167027492608889, + "grad_norm": 0.34049843505701416, + "learning_rate": 5.7721380148450985e-05, + "loss": 2.7012, + "step": 32577 + }, + { + "epoch": 1.516749307446982, + "grad_norm": 0.31936520803348506, + "learning_rate": 5.771870389428291e-05, + "loss": 2.6447, + "step": 32578 + }, + { + "epoch": 1.516795865633075, + "grad_norm": 0.34813763989797414, + "learning_rate": 5.7716027617461196e-05, + "loss": 2.7034, + "step": 32579 + }, + { + "epoch": 1.516842423819168, + "grad_norm": 0.3376009376607679, + "learning_rate": 5.7713351317993714e-05, + "loss": 2.8579, + "step": 32580 + }, + { + "epoch": 1.5168889820052611, + "grad_norm": 0.36956099132906056, + "learning_rate": 5.771067499588829e-05, + "loss": 2.7379, + "step": 32581 + }, + { + "epoch": 1.516935540191354, + "grad_norm": 0.36480777381638807, + "learning_rate": 5.770799865115281e-05, + "loss": 2.8644, + "step": 32582 + }, + { + "epoch": 1.5169820983774471, + "grad_norm": 0.38929771647009886, + "learning_rate": 5.7705322283795094e-05, + "loss": 2.6619, + "step": 32583 + }, + { + "epoch": 1.5170286565635402, + "grad_norm": 0.3732657359907202, + "learning_rate": 5.770264589382303e-05, + "loss": 2.6888, + "step": 32584 + }, + { + "epoch": 1.5170752147496334, + "grad_norm": 0.38784687190570416, + "learning_rate": 5.7699969481244454e-05, + "loss": 2.7581, + "step": 32585 + }, + { + "epoch": 1.5171217729357265, + "grad_norm": 0.3769979354328813, + "learning_rate": 5.7697293046067217e-05, + "loss": 2.818, + "step": 32586 + }, + { + "epoch": 1.5171683311218196, + "grad_norm": 0.3429984604976727, + "learning_rate": 5.769461658829919e-05, + "loss": 2.7115, + "step": 32587 + }, + { + "epoch": 1.5172148893079127, + "grad_norm": 0.3844995856032382, + "learning_rate": 5.7691940107948215e-05, + "loss": 2.7591, + "step": 32588 + }, + { + "epoch": 1.5172614474940056, + "grad_norm": 0.3551516330347563, + "learning_rate": 5.768926360502218e-05, + "loss": 2.7089, + "step": 32589 + }, + { + "epoch": 1.5173080056800987, + "grad_norm": 0.3861271282152973, + "learning_rate": 5.768658707952889e-05, + "loss": 2.675, + "step": 32590 + }, + { + "epoch": 1.5173545638661916, + "grad_norm": 0.3679883368534198, + "learning_rate": 5.768391053147623e-05, + "loss": 2.7234, + "step": 32591 + }, + { + "epoch": 1.5174011220522847, + "grad_norm": 
0.37146121772521035, + "learning_rate": 5.768123396087204e-05, + "loss": 2.7817, + "step": 32592 + }, + { + "epoch": 1.5174476802383778, + "grad_norm": 0.3532499312937444, + "learning_rate": 5.7678557367724205e-05, + "loss": 2.6794, + "step": 32593 + }, + { + "epoch": 1.517494238424471, + "grad_norm": 0.3622775457235994, + "learning_rate": 5.767588075204055e-05, + "loss": 2.8161, + "step": 32594 + }, + { + "epoch": 1.517540796610564, + "grad_norm": 0.34103247934947545, + "learning_rate": 5.767320411382895e-05, + "loss": 2.7617, + "step": 32595 + }, + { + "epoch": 1.5175873547966572, + "grad_norm": 0.3587732250234616, + "learning_rate": 5.767052745309723e-05, + "loss": 2.6785, + "step": 32596 + }, + { + "epoch": 1.5176339129827503, + "grad_norm": 0.3485865257956092, + "learning_rate": 5.7667850769853284e-05, + "loss": 2.7115, + "step": 32597 + }, + { + "epoch": 1.5176804711688434, + "grad_norm": 0.3449037623100011, + "learning_rate": 5.766517406410495e-05, + "loss": 2.7841, + "step": 32598 + }, + { + "epoch": 1.5177270293549363, + "grad_norm": 0.36236195188179177, + "learning_rate": 5.766249733586009e-05, + "loss": 2.7277, + "step": 32599 + }, + { + "epoch": 1.5177735875410294, + "grad_norm": 0.3612165988024146, + "learning_rate": 5.7659820585126546e-05, + "loss": 2.7632, + "step": 32600 + }, + { + "epoch": 1.5178201457271223, + "grad_norm": 0.32960078954474614, + "learning_rate": 5.7657143811912174e-05, + "loss": 2.6649, + "step": 32601 + }, + { + "epoch": 1.5178667039132154, + "grad_norm": 0.33579779281016076, + "learning_rate": 5.765446701622487e-05, + "loss": 2.7718, + "step": 32602 + }, + { + "epoch": 1.5179132620993085, + "grad_norm": 0.34554321636119134, + "learning_rate": 5.765179019807243e-05, + "loss": 2.8266, + "step": 32603 + }, + { + "epoch": 1.5179598202854017, + "grad_norm": 0.3361639655726679, + "learning_rate": 5.764911335746275e-05, + "loss": 2.7446, + "step": 32604 + }, + { + "epoch": 1.5180063784714948, + "grad_norm": 0.3389155309931119, + "learning_rate": 5.764643649440367e-05, + "loss": 2.7171, + "step": 32605 + }, + { + "epoch": 1.518052936657588, + "grad_norm": 0.3485832930419532, + "learning_rate": 5.764375960890307e-05, + "loss": 2.7354, + "step": 32606 + }, + { + "epoch": 1.518099494843681, + "grad_norm": 0.32787623276788136, + "learning_rate": 5.764108270096879e-05, + "loss": 2.7207, + "step": 32607 + }, + { + "epoch": 1.5181460530297741, + "grad_norm": 0.34286012300826363, + "learning_rate": 5.763840577060866e-05, + "loss": 2.7639, + "step": 32608 + }, + { + "epoch": 1.518192611215867, + "grad_norm": 0.3167288670857554, + "learning_rate": 5.763572881783057e-05, + "loss": 2.7775, + "step": 32609 + }, + { + "epoch": 1.5182391694019601, + "grad_norm": 0.34882804022179575, + "learning_rate": 5.763305184264237e-05, + "loss": 2.7867, + "step": 32610 + }, + { + "epoch": 1.518285727588053, + "grad_norm": 0.3341083741309827, + "learning_rate": 5.7630374845051925e-05, + "loss": 2.8159, + "step": 32611 + }, + { + "epoch": 1.5183322857741461, + "grad_norm": 0.33128557911893974, + "learning_rate": 5.762769782506706e-05, + "loss": 2.7653, + "step": 32612 + }, + { + "epoch": 1.5183788439602393, + "grad_norm": 0.3746339271680038, + "learning_rate": 5.762502078269567e-05, + "loss": 2.7632, + "step": 32613 + }, + { + "epoch": 1.5184254021463324, + "grad_norm": 0.34737092001650777, + "learning_rate": 5.762234371794559e-05, + "loss": 2.8161, + "step": 32614 + }, + { + "epoch": 1.5184719603324255, + "grad_norm": 0.35525309881367545, + "learning_rate": 5.761966663082468e-05, + 
"loss": 2.6301, + "step": 32615 + }, + { + "epoch": 1.5185185185185186, + "grad_norm": 0.33788513250678326, + "learning_rate": 5.76169895213408e-05, + "loss": 2.7749, + "step": 32616 + }, + { + "epoch": 1.5185650767046117, + "grad_norm": 0.36333470638302434, + "learning_rate": 5.7614312389501814e-05, + "loss": 2.8227, + "step": 32617 + }, + { + "epoch": 1.5186116348907046, + "grad_norm": 0.32857975402164213, + "learning_rate": 5.761163523531556e-05, + "loss": 2.6946, + "step": 32618 + }, + { + "epoch": 1.5186581930767977, + "grad_norm": 0.3619256725549869, + "learning_rate": 5.7608958058789916e-05, + "loss": 2.6576, + "step": 32619 + }, + { + "epoch": 1.5187047512628908, + "grad_norm": 0.3292276592988994, + "learning_rate": 5.760628085993274e-05, + "loss": 2.7546, + "step": 32620 + }, + { + "epoch": 1.5187513094489837, + "grad_norm": 0.34711748751429583, + "learning_rate": 5.760360363875187e-05, + "loss": 2.8051, + "step": 32621 + }, + { + "epoch": 1.5187978676350768, + "grad_norm": 0.3504407478921813, + "learning_rate": 5.760092639525516e-05, + "loss": 2.6736, + "step": 32622 + }, + { + "epoch": 1.51884442582117, + "grad_norm": 0.34218718747864846, + "learning_rate": 5.759824912945049e-05, + "loss": 2.7833, + "step": 32623 + }, + { + "epoch": 1.518890984007263, + "grad_norm": 0.3435031915186887, + "learning_rate": 5.75955718413457e-05, + "loss": 2.7812, + "step": 32624 + }, + { + "epoch": 1.5189375421933562, + "grad_norm": 0.340052917098045, + "learning_rate": 5.759289453094866e-05, + "loss": 2.6857, + "step": 32625 + }, + { + "epoch": 1.5189841003794493, + "grad_norm": 0.3276491938196838, + "learning_rate": 5.759021719826723e-05, + "loss": 2.6507, + "step": 32626 + }, + { + "epoch": 1.5190306585655424, + "grad_norm": 0.3285771472729774, + "learning_rate": 5.7587539843309245e-05, + "loss": 2.7408, + "step": 32627 + }, + { + "epoch": 1.5190772167516353, + "grad_norm": 0.34230500818779525, + "learning_rate": 5.758486246608258e-05, + "loss": 2.7809, + "step": 32628 + }, + { + "epoch": 1.5191237749377284, + "grad_norm": 0.3385850842264275, + "learning_rate": 5.7582185066595104e-05, + "loss": 2.7777, + "step": 32629 + }, + { + "epoch": 1.5191703331238215, + "grad_norm": 0.33998967962405524, + "learning_rate": 5.7579507644854645e-05, + "loss": 2.7279, + "step": 32630 + }, + { + "epoch": 1.5192168913099144, + "grad_norm": 0.33822111902700797, + "learning_rate": 5.757683020086909e-05, + "loss": 2.6712, + "step": 32631 + }, + { + "epoch": 1.5192634494960076, + "grad_norm": 0.3532403900573938, + "learning_rate": 5.757415273464629e-05, + "loss": 2.7737, + "step": 32632 + }, + { + "epoch": 1.5193100076821007, + "grad_norm": 0.3543827629318298, + "learning_rate": 5.7571475246194084e-05, + "loss": 2.7949, + "step": 32633 + }, + { + "epoch": 1.5193565658681938, + "grad_norm": 0.3344977164629068, + "learning_rate": 5.7568797735520364e-05, + "loss": 2.6897, + "step": 32634 + }, + { + "epoch": 1.519403124054287, + "grad_norm": 0.35655590136109655, + "learning_rate": 5.7566120202632947e-05, + "loss": 2.7235, + "step": 32635 + }, + { + "epoch": 1.51944968224038, + "grad_norm": 0.32720820909274645, + "learning_rate": 5.7563442647539714e-05, + "loss": 2.768, + "step": 32636 + }, + { + "epoch": 1.5194962404264731, + "grad_norm": 0.3412110677040886, + "learning_rate": 5.7560765070248525e-05, + "loss": 2.692, + "step": 32637 + }, + { + "epoch": 1.519542798612566, + "grad_norm": 0.31703492447871784, + "learning_rate": 5.755808747076725e-05, + "loss": 2.6815, + "step": 32638 + }, + { + "epoch": 1.5195893567986591, 
+ "grad_norm": 0.33717617209818984, + "learning_rate": 5.755540984910371e-05, + "loss": 2.7919, + "step": 32639 + }, + { + "epoch": 1.519635914984752, + "grad_norm": 0.3413941495237141, + "learning_rate": 5.75527322052658e-05, + "loss": 2.8408, + "step": 32640 + }, + { + "epoch": 1.5196824731708451, + "grad_norm": 0.30649986387921047, + "learning_rate": 5.7550054539261355e-05, + "loss": 2.7332, + "step": 32641 + }, + { + "epoch": 1.5197290313569383, + "grad_norm": 0.3636959716083465, + "learning_rate": 5.7547376851098256e-05, + "loss": 2.8113, + "step": 32642 + }, + { + "epoch": 1.5197755895430314, + "grad_norm": 0.3315281680511049, + "learning_rate": 5.754469914078435e-05, + "loss": 2.62, + "step": 32643 + }, + { + "epoch": 1.5198221477291245, + "grad_norm": 0.33749104532360347, + "learning_rate": 5.754202140832748e-05, + "loss": 2.7798, + "step": 32644 + }, + { + "epoch": 1.5198687059152176, + "grad_norm": 0.3599908014227781, + "learning_rate": 5.753934365373555e-05, + "loss": 2.7592, + "step": 32645 + }, + { + "epoch": 1.5199152641013107, + "grad_norm": 0.331925268251772, + "learning_rate": 5.753666587701636e-05, + "loss": 2.7969, + "step": 32646 + }, + { + "epoch": 1.5199618222874038, + "grad_norm": 0.3259934225715031, + "learning_rate": 5.7533988078177816e-05, + "loss": 2.6155, + "step": 32647 + }, + { + "epoch": 1.5200083804734967, + "grad_norm": 0.383296975855549, + "learning_rate": 5.753131025722774e-05, + "loss": 2.8225, + "step": 32648 + }, + { + "epoch": 1.5200549386595898, + "grad_norm": 0.34155845880002317, + "learning_rate": 5.752863241417402e-05, + "loss": 2.7287, + "step": 32649 + }, + { + "epoch": 1.5201014968456827, + "grad_norm": 0.3967565806861943, + "learning_rate": 5.752595454902451e-05, + "loss": 2.6916, + "step": 32650 + }, + { + "epoch": 1.5201480550317759, + "grad_norm": 0.340641017117493, + "learning_rate": 5.752327666178705e-05, + "loss": 2.6887, + "step": 32651 + }, + { + "epoch": 1.520194613217869, + "grad_norm": 0.37868774811420647, + "learning_rate": 5.752059875246953e-05, + "loss": 2.8441, + "step": 32652 + }, + { + "epoch": 1.520241171403962, + "grad_norm": 0.3376526288564606, + "learning_rate": 5.75179208210798e-05, + "loss": 2.6761, + "step": 32653 + }, + { + "epoch": 1.5202877295900552, + "grad_norm": 0.3574531880148379, + "learning_rate": 5.7515242867625697e-05, + "loss": 2.7796, + "step": 32654 + }, + { + "epoch": 1.5203342877761483, + "grad_norm": 0.3709452430909489, + "learning_rate": 5.7512564892115094e-05, + "loss": 2.7898, + "step": 32655 + }, + { + "epoch": 1.5203808459622414, + "grad_norm": 0.35978715338267064, + "learning_rate": 5.750988689455588e-05, + "loss": 2.7267, + "step": 32656 + }, + { + "epoch": 1.5204274041483343, + "grad_norm": 0.3238529031897923, + "learning_rate": 5.7507208874955855e-05, + "loss": 2.7445, + "step": 32657 + }, + { + "epoch": 1.5204739623344274, + "grad_norm": 0.3501581385085348, + "learning_rate": 5.750453083332293e-05, + "loss": 2.824, + "step": 32658 + }, + { + "epoch": 1.5205205205205206, + "grad_norm": 0.34889032898671474, + "learning_rate": 5.7501852769664945e-05, + "loss": 2.7342, + "step": 32659 + }, + { + "epoch": 1.5205670787066135, + "grad_norm": 0.32459882275668744, + "learning_rate": 5.749917468398975e-05, + "loss": 2.7976, + "step": 32660 + }, + { + "epoch": 1.5206136368927066, + "grad_norm": 0.33565623766974106, + "learning_rate": 5.7496496576305235e-05, + "loss": 2.7567, + "step": 32661 + }, + { + "epoch": 1.5206601950787997, + "grad_norm": 0.34776568629499816, + "learning_rate": 
5.749381844661923e-05, + "loss": 2.7777, + "step": 32662 + }, + { + "epoch": 1.5207067532648928, + "grad_norm": 0.3188242288029685, + "learning_rate": 5.74911402949396e-05, + "loss": 2.7652, + "step": 32663 + }, + { + "epoch": 1.520753311450986, + "grad_norm": 0.3241221313595101, + "learning_rate": 5.748846212127421e-05, + "loss": 2.7207, + "step": 32664 + }, + { + "epoch": 1.520799869637079, + "grad_norm": 0.3605243538213447, + "learning_rate": 5.748578392563094e-05, + "loss": 2.7288, + "step": 32665 + }, + { + "epoch": 1.5208464278231721, + "grad_norm": 0.3310322010211339, + "learning_rate": 5.7483105708017614e-05, + "loss": 2.7356, + "step": 32666 + }, + { + "epoch": 1.520892986009265, + "grad_norm": 0.3466375533961842, + "learning_rate": 5.748042746844211e-05, + "loss": 2.7775, + "step": 32667 + }, + { + "epoch": 1.5209395441953582, + "grad_norm": 0.32349259865616287, + "learning_rate": 5.74777492069123e-05, + "loss": 2.7982, + "step": 32668 + }, + { + "epoch": 1.5209861023814513, + "grad_norm": 0.3374794442527739, + "learning_rate": 5.747507092343604e-05, + "loss": 2.7523, + "step": 32669 + }, + { + "epoch": 1.5210326605675442, + "grad_norm": 0.3586299332518471, + "learning_rate": 5.747239261802115e-05, + "loss": 2.8343, + "step": 32670 + }, + { + "epoch": 1.5210792187536373, + "grad_norm": 0.3669365133144203, + "learning_rate": 5.746971429067556e-05, + "loss": 2.7241, + "step": 32671 + }, + { + "epoch": 1.5211257769397304, + "grad_norm": 0.343790216371368, + "learning_rate": 5.7467035941407074e-05, + "loss": 2.6796, + "step": 32672 + }, + { + "epoch": 1.5211723351258235, + "grad_norm": 0.3435183830192113, + "learning_rate": 5.7464357570223584e-05, + "loss": 2.8422, + "step": 32673 + }, + { + "epoch": 1.5212188933119166, + "grad_norm": 0.3351209261638795, + "learning_rate": 5.7461679177132934e-05, + "loss": 2.8355, + "step": 32674 + }, + { + "epoch": 1.5212654514980097, + "grad_norm": 0.3184371489910152, + "learning_rate": 5.745900076214299e-05, + "loss": 2.8173, + "step": 32675 + }, + { + "epoch": 1.5213120096841029, + "grad_norm": 0.35700679886013503, + "learning_rate": 5.7456322325261615e-05, + "loss": 2.8015, + "step": 32676 + }, + { + "epoch": 1.5213585678701957, + "grad_norm": 0.3537656029747744, + "learning_rate": 5.7453643866496666e-05, + "loss": 2.7933, + "step": 32677 + }, + { + "epoch": 1.5214051260562889, + "grad_norm": 0.3252570347128935, + "learning_rate": 5.745096538585601e-05, + "loss": 2.7902, + "step": 32678 + }, + { + "epoch": 1.5214516842423818, + "grad_norm": 0.3402582003096639, + "learning_rate": 5.7448286883347505e-05, + "loss": 2.7581, + "step": 32679 + }, + { + "epoch": 1.5214982424284749, + "grad_norm": 0.3429933532013909, + "learning_rate": 5.7445608358979e-05, + "loss": 2.6951, + "step": 32680 + }, + { + "epoch": 1.521544800614568, + "grad_norm": 0.32430081131169036, + "learning_rate": 5.7442929812758374e-05, + "loss": 2.7358, + "step": 32681 + }, + { + "epoch": 1.521591358800661, + "grad_norm": 0.37316921773066947, + "learning_rate": 5.74402512446935e-05, + "loss": 2.7212, + "step": 32682 + }, + { + "epoch": 1.5216379169867542, + "grad_norm": 0.33090488804369184, + "learning_rate": 5.74375726547922e-05, + "loss": 2.7443, + "step": 32683 + }, + { + "epoch": 1.5216844751728473, + "grad_norm": 0.31853632233803786, + "learning_rate": 5.743489404306237e-05, + "loss": 2.7886, + "step": 32684 + }, + { + "epoch": 1.5217310333589404, + "grad_norm": 0.3484286685552971, + "learning_rate": 5.7432215409511844e-05, + "loss": 2.7951, + "step": 32685 + }, + { + "epoch": 
1.5217775915450336, + "grad_norm": 0.32873436407794154, + "learning_rate": 5.7429536754148515e-05, + "loss": 2.7316, + "step": 32686 + }, + { + "epoch": 1.5218241497311265, + "grad_norm": 0.33591723421920827, + "learning_rate": 5.7426858076980213e-05, + "loss": 2.7962, + "step": 32687 + }, + { + "epoch": 1.5218707079172196, + "grad_norm": 0.3316076417600824, + "learning_rate": 5.7424179378014833e-05, + "loss": 2.731, + "step": 32688 + }, + { + "epoch": 1.5219172661033125, + "grad_norm": 0.352215188033757, + "learning_rate": 5.7421500657260195e-05, + "loss": 2.6696, + "step": 32689 + }, + { + "epoch": 1.5219638242894056, + "grad_norm": 0.34740497818401195, + "learning_rate": 5.74188219147242e-05, + "loss": 2.5645, + "step": 32690 + }, + { + "epoch": 1.5220103824754987, + "grad_norm": 0.35127279760951235, + "learning_rate": 5.741614315041468e-05, + "loss": 2.6133, + "step": 32691 + }, + { + "epoch": 1.5220569406615918, + "grad_norm": 0.3518769080030469, + "learning_rate": 5.741346436433952e-05, + "loss": 2.7543, + "step": 32692 + }, + { + "epoch": 1.522103498847685, + "grad_norm": 0.3640171398771237, + "learning_rate": 5.741078555650658e-05, + "loss": 2.8077, + "step": 32693 + }, + { + "epoch": 1.522150057033778, + "grad_norm": 0.35765330550109886, + "learning_rate": 5.7408106726923696e-05, + "loss": 2.7115, + "step": 32694 + }, + { + "epoch": 1.5221966152198712, + "grad_norm": 0.36786915277574395, + "learning_rate": 5.740542787559876e-05, + "loss": 2.7086, + "step": 32695 + }, + { + "epoch": 1.5222431734059643, + "grad_norm": 0.33891925849324883, + "learning_rate": 5.740274900253963e-05, + "loss": 2.7584, + "step": 32696 + }, + { + "epoch": 1.5222897315920572, + "grad_norm": 0.359005995449592, + "learning_rate": 5.7400070107754155e-05, + "loss": 2.7724, + "step": 32697 + }, + { + "epoch": 1.5223362897781503, + "grad_norm": 0.3495576898911255, + "learning_rate": 5.7397391191250196e-05, + "loss": 2.7026, + "step": 32698 + }, + { + "epoch": 1.5223828479642432, + "grad_norm": 0.32625916724953463, + "learning_rate": 5.7394712253035624e-05, + "loss": 2.8447, + "step": 32699 + }, + { + "epoch": 1.5224294061503363, + "grad_norm": 0.35835488371776, + "learning_rate": 5.73920332931183e-05, + "loss": 2.7767, + "step": 32700 + }, + { + "epoch": 1.5224759643364294, + "grad_norm": 0.34438689441543063, + "learning_rate": 5.7389354311506094e-05, + "loss": 2.6917, + "step": 32701 + }, + { + "epoch": 1.5225225225225225, + "grad_norm": 0.35214320579722336, + "learning_rate": 5.738667530820685e-05, + "loss": 2.7837, + "step": 32702 + }, + { + "epoch": 1.5225690807086156, + "grad_norm": 0.37279438854759495, + "learning_rate": 5.7383996283228445e-05, + "loss": 2.772, + "step": 32703 + }, + { + "epoch": 1.5226156388947087, + "grad_norm": 0.3285668995007116, + "learning_rate": 5.7381317236578734e-05, + "loss": 2.7643, + "step": 32704 + }, + { + "epoch": 1.5226621970808019, + "grad_norm": 0.5020723749157525, + "learning_rate": 5.73786381682656e-05, + "loss": 2.625, + "step": 32705 + }, + { + "epoch": 1.5227087552668948, + "grad_norm": 0.36450611860656784, + "learning_rate": 5.737595907829688e-05, + "loss": 2.7508, + "step": 32706 + }, + { + "epoch": 1.5227553134529879, + "grad_norm": 0.3710992794048009, + "learning_rate": 5.737327996668045e-05, + "loss": 2.6908, + "step": 32707 + }, + { + "epoch": 1.522801871639081, + "grad_norm": 0.37257938836948423, + "learning_rate": 5.737060083342417e-05, + "loss": 2.7413, + "step": 32708 + }, + { + "epoch": 1.5228484298251739, + "grad_norm": 0.3300160239466713, + 
"learning_rate": 5.7367921678535896e-05, + "loss": 2.5709, + "step": 32709 + }, + { + "epoch": 1.522894988011267, + "grad_norm": 0.36884614038210917, + "learning_rate": 5.736524250202351e-05, + "loss": 2.8333, + "step": 32710 + }, + { + "epoch": 1.52294154619736, + "grad_norm": 0.36146376431865, + "learning_rate": 5.736256330389485e-05, + "loss": 2.7598, + "step": 32711 + }, + { + "epoch": 1.5229881043834532, + "grad_norm": 0.35900717098414675, + "learning_rate": 5.73598840841578e-05, + "loss": 2.767, + "step": 32712 + }, + { + "epoch": 1.5230346625695463, + "grad_norm": 0.38132131303533123, + "learning_rate": 5.735720484282021e-05, + "loss": 2.8002, + "step": 32713 + }, + { + "epoch": 1.5230812207556395, + "grad_norm": 0.3433865405536115, + "learning_rate": 5.735452557988994e-05, + "loss": 2.7268, + "step": 32714 + }, + { + "epoch": 1.5231277789417326, + "grad_norm": 0.35106397732117567, + "learning_rate": 5.735184629537489e-05, + "loss": 2.638, + "step": 32715 + }, + { + "epoch": 1.5231743371278255, + "grad_norm": 0.3537037407858714, + "learning_rate": 5.734916698928288e-05, + "loss": 2.6319, + "step": 32716 + }, + { + "epoch": 1.5232208953139186, + "grad_norm": 0.3122590942294297, + "learning_rate": 5.7346487661621774e-05, + "loss": 2.7267, + "step": 32717 + }, + { + "epoch": 1.5232674535000117, + "grad_norm": 0.3892297045849107, + "learning_rate": 5.734380831239946e-05, + "loss": 2.7203, + "step": 32718 + }, + { + "epoch": 1.5233140116861046, + "grad_norm": 0.3500117976437904, + "learning_rate": 5.73411289416238e-05, + "loss": 2.7618, + "step": 32719 + }, + { + "epoch": 1.5233605698721977, + "grad_norm": 0.3759983378288691, + "learning_rate": 5.733844954930264e-05, + "loss": 2.7614, + "step": 32720 + }, + { + "epoch": 1.5234071280582908, + "grad_norm": 0.3521090281678714, + "learning_rate": 5.733577013544387e-05, + "loss": 2.7701, + "step": 32721 + }, + { + "epoch": 1.523453686244384, + "grad_norm": 0.3252407782241044, + "learning_rate": 5.7333090700055324e-05, + "loss": 2.7066, + "step": 32722 + }, + { + "epoch": 1.523500244430477, + "grad_norm": 0.37009837752853764, + "learning_rate": 5.733041124314488e-05, + "loss": 2.8445, + "step": 32723 + }, + { + "epoch": 1.5235468026165702, + "grad_norm": 0.31707815641684134, + "learning_rate": 5.7327731764720415e-05, + "loss": 2.7372, + "step": 32724 + }, + { + "epoch": 1.5235933608026633, + "grad_norm": 0.35975081370436646, + "learning_rate": 5.7325052264789756e-05, + "loss": 2.6021, + "step": 32725 + }, + { + "epoch": 1.5236399189887562, + "grad_norm": 0.3329349035119075, + "learning_rate": 5.73223727433608e-05, + "loss": 2.7477, + "step": 32726 + }, + { + "epoch": 1.5236864771748493, + "grad_norm": 0.3455929521493436, + "learning_rate": 5.73196932004414e-05, + "loss": 2.7476, + "step": 32727 + }, + { + "epoch": 1.5237330353609422, + "grad_norm": 0.393091720780867, + "learning_rate": 5.731701363603943e-05, + "loss": 2.7233, + "step": 32728 + }, + { + "epoch": 1.5237795935470353, + "grad_norm": 0.355176297286357, + "learning_rate": 5.7314334050162735e-05, + "loss": 2.7868, + "step": 32729 + }, + { + "epoch": 1.5238261517331284, + "grad_norm": 0.3446543800680125, + "learning_rate": 5.73116544428192e-05, + "loss": 2.6902, + "step": 32730 + }, + { + "epoch": 1.5238727099192215, + "grad_norm": 0.3611466122393605, + "learning_rate": 5.730897481401667e-05, + "loss": 2.6366, + "step": 32731 + }, + { + "epoch": 1.5239192681053146, + "grad_norm": 0.33989943353452995, + "learning_rate": 5.730629516376302e-05, + "loss": 2.7748, + "step": 32732 + }, + { 
+ "epoch": 1.5239658262914078, + "grad_norm": 0.37447319167199095, + "learning_rate": 5.730361549206612e-05, + "loss": 2.7359, + "step": 32733 + }, + { + "epoch": 1.5240123844775009, + "grad_norm": 0.34080536597434286, + "learning_rate": 5.730093579893383e-05, + "loss": 2.6123, + "step": 32734 + }, + { + "epoch": 1.524058942663594, + "grad_norm": 0.3569903019467748, + "learning_rate": 5.7298256084374e-05, + "loss": 2.6876, + "step": 32735 + }, + { + "epoch": 1.5241055008496869, + "grad_norm": 0.3441869285547991, + "learning_rate": 5.729557634839452e-05, + "loss": 2.7761, + "step": 32736 + }, + { + "epoch": 1.52415205903578, + "grad_norm": 0.40547228420014725, + "learning_rate": 5.729289659100324e-05, + "loss": 2.7385, + "step": 32737 + }, + { + "epoch": 1.5241986172218729, + "grad_norm": 0.3264769443210618, + "learning_rate": 5.729021681220803e-05, + "loss": 2.6781, + "step": 32738 + }, + { + "epoch": 1.524245175407966, + "grad_norm": 0.36732478419658154, + "learning_rate": 5.728753701201673e-05, + "loss": 2.8623, + "step": 32739 + }, + { + "epoch": 1.5242917335940591, + "grad_norm": 0.3360935544822294, + "learning_rate": 5.7284857190437245e-05, + "loss": 2.7402, + "step": 32740 + }, + { + "epoch": 1.5243382917801522, + "grad_norm": 0.3457754959590276, + "learning_rate": 5.728217734747742e-05, + "loss": 2.5541, + "step": 32741 + }, + { + "epoch": 1.5243848499662453, + "grad_norm": 0.3449397007000504, + "learning_rate": 5.727949748314513e-05, + "loss": 2.7023, + "step": 32742 + }, + { + "epoch": 1.5244314081523385, + "grad_norm": 0.38233059509583517, + "learning_rate": 5.727681759744822e-05, + "loss": 2.7304, + "step": 32743 + }, + { + "epoch": 1.5244779663384316, + "grad_norm": 0.3269685458388052, + "learning_rate": 5.727413769039457e-05, + "loss": 2.823, + "step": 32744 + }, + { + "epoch": 1.5245245245245245, + "grad_norm": 0.4032501530447187, + "learning_rate": 5.727145776199205e-05, + "loss": 2.6868, + "step": 32745 + }, + { + "epoch": 1.5245710827106176, + "grad_norm": 0.3469444701307541, + "learning_rate": 5.726877781224851e-05, + "loss": 2.7291, + "step": 32746 + }, + { + "epoch": 1.5246176408967107, + "grad_norm": 0.3862324335193097, + "learning_rate": 5.726609784117183e-05, + "loss": 2.6575, + "step": 32747 + }, + { + "epoch": 1.5246641990828036, + "grad_norm": 0.32274602185066303, + "learning_rate": 5.7263417848769864e-05, + "loss": 2.7459, + "step": 32748 + }, + { + "epoch": 1.5247107572688967, + "grad_norm": 0.33663858060634044, + "learning_rate": 5.7260737835050474e-05, + "loss": 2.618, + "step": 32749 + }, + { + "epoch": 1.5247573154549898, + "grad_norm": 0.3428774353036148, + "learning_rate": 5.725805780002154e-05, + "loss": 2.7447, + "step": 32750 + }, + { + "epoch": 1.524803873641083, + "grad_norm": 0.33043146872418255, + "learning_rate": 5.725537774369093e-05, + "loss": 2.7331, + "step": 32751 + }, + { + "epoch": 1.524850431827176, + "grad_norm": 0.3404420213501878, + "learning_rate": 5.725269766606649e-05, + "loss": 2.7298, + "step": 32752 + }, + { + "epoch": 1.5248969900132692, + "grad_norm": 0.33049257391333187, + "learning_rate": 5.7250017567156086e-05, + "loss": 2.7122, + "step": 32753 + }, + { + "epoch": 1.5249435481993623, + "grad_norm": 0.3538315735508961, + "learning_rate": 5.724733744696762e-05, + "loss": 2.7835, + "step": 32754 + }, + { + "epoch": 1.5249901063854552, + "grad_norm": 0.32428409826414417, + "learning_rate": 5.724465730550892e-05, + "loss": 2.7088, + "step": 32755 + }, + { + "epoch": 1.5250366645715483, + "grad_norm": 0.32475887593380787, + 
"learning_rate": 5.724197714278786e-05, + "loss": 2.6791, + "step": 32756 + }, + { + "epoch": 1.5250832227576414, + "grad_norm": 0.3236875284436796, + "learning_rate": 5.7239296958812316e-05, + "loss": 2.7245, + "step": 32757 + }, + { + "epoch": 1.5251297809437343, + "grad_norm": 0.34719371897315, + "learning_rate": 5.7236616753590154e-05, + "loss": 2.7349, + "step": 32758 + }, + { + "epoch": 1.5251763391298274, + "grad_norm": 0.3131336732899863, + "learning_rate": 5.723393652712923e-05, + "loss": 2.806, + "step": 32759 + }, + { + "epoch": 1.5252228973159205, + "grad_norm": 0.3265921667945142, + "learning_rate": 5.723125627943743e-05, + "loss": 2.7745, + "step": 32760 + }, + { + "epoch": 1.5252694555020136, + "grad_norm": 0.33700295161170307, + "learning_rate": 5.7228576010522586e-05, + "loss": 2.7117, + "step": 32761 + }, + { + "epoch": 1.5253160136881068, + "grad_norm": 0.32735197275390326, + "learning_rate": 5.722589572039259e-05, + "loss": 2.7746, + "step": 32762 + }, + { + "epoch": 1.5253625718741999, + "grad_norm": 0.3322337390758674, + "learning_rate": 5.722321540905529e-05, + "loss": 2.6968, + "step": 32763 + }, + { + "epoch": 1.525409130060293, + "grad_norm": 0.35760244832407095, + "learning_rate": 5.722053507651858e-05, + "loss": 2.7108, + "step": 32764 + }, + { + "epoch": 1.5254556882463859, + "grad_norm": 0.3349534613309489, + "learning_rate": 5.72178547227903e-05, + "loss": 2.8048, + "step": 32765 + }, + { + "epoch": 1.525502246432479, + "grad_norm": 0.33589335554944383, + "learning_rate": 5.721517434787833e-05, + "loss": 2.762, + "step": 32766 + }, + { + "epoch": 1.525548804618572, + "grad_norm": 0.3322006542466724, + "learning_rate": 5.7212493951790535e-05, + "loss": 2.8095, + "step": 32767 + }, + { + "epoch": 1.525595362804665, + "grad_norm": 0.34189548091150607, + "learning_rate": 5.720981353453478e-05, + "loss": 2.8189, + "step": 32768 + }, + { + "epoch": 1.5256419209907581, + "grad_norm": 0.3323140144502253, + "learning_rate": 5.7207133096118934e-05, + "loss": 2.7606, + "step": 32769 + }, + { + "epoch": 1.5256884791768512, + "grad_norm": 0.37100427556313426, + "learning_rate": 5.7204452636550863e-05, + "loss": 2.6766, + "step": 32770 + }, + { + "epoch": 1.5257350373629444, + "grad_norm": 0.3395846727198718, + "learning_rate": 5.720177215583844e-05, + "loss": 2.662, + "step": 32771 + }, + { + "epoch": 1.5257815955490375, + "grad_norm": 0.3427147112652054, + "learning_rate": 5.7199091653989525e-05, + "loss": 2.7899, + "step": 32772 + }, + { + "epoch": 1.5258281537351306, + "grad_norm": 0.32556683159621785, + "learning_rate": 5.7196411131011985e-05, + "loss": 2.7062, + "step": 32773 + }, + { + "epoch": 1.5258747119212237, + "grad_norm": 0.3638331934599628, + "learning_rate": 5.719373058691367e-05, + "loss": 2.7921, + "step": 32774 + }, + { + "epoch": 1.5259212701073166, + "grad_norm": 0.34698796612218735, + "learning_rate": 5.7191050021702473e-05, + "loss": 2.793, + "step": 32775 + }, + { + "epoch": 1.5259678282934097, + "grad_norm": 0.367538688157354, + "learning_rate": 5.7188369435386256e-05, + "loss": 2.7866, + "step": 32776 + }, + { + "epoch": 1.5260143864795026, + "grad_norm": 0.34993762971308917, + "learning_rate": 5.718568882797288e-05, + "loss": 2.7389, + "step": 32777 + }, + { + "epoch": 1.5260609446655957, + "grad_norm": 0.32015446147866017, + "learning_rate": 5.718300819947022e-05, + "loss": 2.7765, + "step": 32778 + }, + { + "epoch": 1.5261075028516888, + "grad_norm": 0.34334873549638506, + "learning_rate": 5.718032754988614e-05, + "loss": 2.6662, + "step": 
32779 + }, + { + "epoch": 1.526154061037782, + "grad_norm": 0.34973495429351026, + "learning_rate": 5.71776468792285e-05, + "loss": 2.7452, + "step": 32780 + }, + { + "epoch": 1.526200619223875, + "grad_norm": 0.3580996936792415, + "learning_rate": 5.717496618750518e-05, + "loss": 2.6496, + "step": 32781 + }, + { + "epoch": 1.5262471774099682, + "grad_norm": 0.3581210988685986, + "learning_rate": 5.717228547472403e-05, + "loss": 2.5903, + "step": 32782 + }, + { + "epoch": 1.5262937355960613, + "grad_norm": 0.34159349565419295, + "learning_rate": 5.7169604740892936e-05, + "loss": 2.7907, + "step": 32783 + }, + { + "epoch": 1.5263402937821544, + "grad_norm": 0.3537624383357582, + "learning_rate": 5.716692398601975e-05, + "loss": 2.7817, + "step": 32784 + }, + { + "epoch": 1.5263868519682473, + "grad_norm": 0.36344633934318443, + "learning_rate": 5.7164243210112365e-05, + "loss": 2.7503, + "step": 32785 + }, + { + "epoch": 1.5264334101543404, + "grad_norm": 0.3465498745087456, + "learning_rate": 5.716156241317863e-05, + "loss": 2.726, + "step": 32786 + }, + { + "epoch": 1.5264799683404333, + "grad_norm": 0.3495310310345347, + "learning_rate": 5.715888159522641e-05, + "loss": 2.7676, + "step": 32787 + }, + { + "epoch": 1.5265265265265264, + "grad_norm": 0.3664552073352449, + "learning_rate": 5.715620075626357e-05, + "loss": 2.7729, + "step": 32788 + }, + { + "epoch": 1.5265730847126195, + "grad_norm": 0.35087281402570386, + "learning_rate": 5.715351989629799e-05, + "loss": 2.8406, + "step": 32789 + }, + { + "epoch": 1.5266196428987127, + "grad_norm": 0.36630968024051075, + "learning_rate": 5.715083901533753e-05, + "loss": 2.7509, + "step": 32790 + }, + { + "epoch": 1.5266662010848058, + "grad_norm": 0.3361645399177752, + "learning_rate": 5.714815811339006e-05, + "loss": 2.7696, + "step": 32791 + }, + { + "epoch": 1.526712759270899, + "grad_norm": 0.3599885446292574, + "learning_rate": 5.714547719046347e-05, + "loss": 2.7137, + "step": 32792 + }, + { + "epoch": 1.526759317456992, + "grad_norm": 0.34400957936358306, + "learning_rate": 5.7142796246565586e-05, + "loss": 2.8323, + "step": 32793 + }, + { + "epoch": 1.526805875643085, + "grad_norm": 0.3500691437147837, + "learning_rate": 5.7140115281704306e-05, + "loss": 2.7342, + "step": 32794 + }, + { + "epoch": 1.526852433829178, + "grad_norm": 0.3587551871623031, + "learning_rate": 5.713743429588749e-05, + "loss": 2.6868, + "step": 32795 + }, + { + "epoch": 1.5268989920152711, + "grad_norm": 0.3382937931250668, + "learning_rate": 5.713475328912302e-05, + "loss": 2.7097, + "step": 32796 + }, + { + "epoch": 1.526945550201364, + "grad_norm": 0.32665542248946855, + "learning_rate": 5.713207226141875e-05, + "loss": 2.767, + "step": 32797 + }, + { + "epoch": 1.5269921083874571, + "grad_norm": 0.33419979258376403, + "learning_rate": 5.712939121278255e-05, + "loss": 2.73, + "step": 32798 + }, + { + "epoch": 1.5270386665735503, + "grad_norm": 0.3330149306487764, + "learning_rate": 5.712671014322227e-05, + "loss": 2.6927, + "step": 32799 + }, + { + "epoch": 1.5270852247596434, + "grad_norm": 0.36744762763217403, + "learning_rate": 5.712402905274581e-05, + "loss": 2.6763, + "step": 32800 + }, + { + "epoch": 1.5271317829457365, + "grad_norm": 0.34241310972711975, + "learning_rate": 5.712134794136104e-05, + "loss": 2.7499, + "step": 32801 + }, + { + "epoch": 1.5271783411318296, + "grad_norm": 0.3207848146757488, + "learning_rate": 5.7118666809075794e-05, + "loss": 2.7642, + "step": 32802 + }, + { + "epoch": 1.5272248993179227, + "grad_norm": 
0.3492239128257299, + "learning_rate": 5.7115985655897976e-05, + "loss": 2.7272, + "step": 32803 + }, + { + "epoch": 1.5272714575040156, + "grad_norm": 0.33252372719396506, + "learning_rate": 5.711330448183543e-05, + "loss": 2.7058, + "step": 32804 + }, + { + "epoch": 1.5273180156901087, + "grad_norm": 0.35242237722692343, + "learning_rate": 5.711062328689605e-05, + "loss": 2.7034, + "step": 32805 + }, + { + "epoch": 1.5273645738762018, + "grad_norm": 0.33177149347893586, + "learning_rate": 5.710794207108767e-05, + "loss": 2.711, + "step": 32806 + }, + { + "epoch": 1.5274111320622947, + "grad_norm": 0.3280841796684235, + "learning_rate": 5.7105260834418195e-05, + "loss": 2.6137, + "step": 32807 + }, + { + "epoch": 1.5274576902483878, + "grad_norm": 0.3521178502963899, + "learning_rate": 5.710257957689548e-05, + "loss": 2.7137, + "step": 32808 + }, + { + "epoch": 1.527504248434481, + "grad_norm": 0.33546570152942007, + "learning_rate": 5.70998982985274e-05, + "loss": 2.8522, + "step": 32809 + }, + { + "epoch": 1.527550806620574, + "grad_norm": 0.33230275303223644, + "learning_rate": 5.7097216999321804e-05, + "loss": 2.6593, + "step": 32810 + }, + { + "epoch": 1.5275973648066672, + "grad_norm": 0.34977300065914524, + "learning_rate": 5.7094535679286586e-05, + "loss": 2.7939, + "step": 32811 + }, + { + "epoch": 1.5276439229927603, + "grad_norm": 0.3574471844258376, + "learning_rate": 5.70918543384296e-05, + "loss": 2.8444, + "step": 32812 + }, + { + "epoch": 1.5276904811788534, + "grad_norm": 0.31411513678496644, + "learning_rate": 5.708917297675871e-05, + "loss": 2.6567, + "step": 32813 + }, + { + "epoch": 1.5277370393649463, + "grad_norm": 0.3730220298266605, + "learning_rate": 5.708649159428181e-05, + "loss": 2.7233, + "step": 32814 + }, + { + "epoch": 1.5277835975510394, + "grad_norm": 0.36057724719184914, + "learning_rate": 5.708381019100675e-05, + "loss": 2.8341, + "step": 32815 + }, + { + "epoch": 1.5278301557371323, + "grad_norm": 0.3422100785287526, + "learning_rate": 5.70811287669414e-05, + "loss": 2.6934, + "step": 32816 + }, + { + "epoch": 1.5278767139232254, + "grad_norm": 0.3679560899914573, + "learning_rate": 5.7078447322093644e-05, + "loss": 2.7318, + "step": 32817 + }, + { + "epoch": 1.5279232721093186, + "grad_norm": 0.3620798603154099, + "learning_rate": 5.707576585647133e-05, + "loss": 2.6677, + "step": 32818 + }, + { + "epoch": 1.5279698302954117, + "grad_norm": 0.36274245570906466, + "learning_rate": 5.7073084370082366e-05, + "loss": 2.8123, + "step": 32819 + }, + { + "epoch": 1.5280163884815048, + "grad_norm": 0.34381317020972874, + "learning_rate": 5.707040286293457e-05, + "loss": 2.6405, + "step": 32820 + }, + { + "epoch": 1.528062946667598, + "grad_norm": 0.37720949616804383, + "learning_rate": 5.706772133503584e-05, + "loss": 2.7172, + "step": 32821 + }, + { + "epoch": 1.528109504853691, + "grad_norm": 0.38631165852906874, + "learning_rate": 5.706503978639406e-05, + "loss": 2.6928, + "step": 32822 + }, + { + "epoch": 1.5281560630397841, + "grad_norm": 0.36765657876734237, + "learning_rate": 5.7062358217017074e-05, + "loss": 2.7477, + "step": 32823 + }, + { + "epoch": 1.528202621225877, + "grad_norm": 0.3887820306156837, + "learning_rate": 5.7059676626912774e-05, + "loss": 2.7176, + "step": 32824 + }, + { + "epoch": 1.5282491794119701, + "grad_norm": 0.34350870428749336, + "learning_rate": 5.7056995016089e-05, + "loss": 2.6793, + "step": 32825 + }, + { + "epoch": 1.528295737598063, + "grad_norm": 0.4022801746105658, + "learning_rate": 5.7054313384553645e-05, + 
"loss": 2.649, + "step": 32826 + }, + { + "epoch": 1.5283422957841561, + "grad_norm": 0.3379830273502957, + "learning_rate": 5.705163173231458e-05, + "loss": 2.7206, + "step": 32827 + }, + { + "epoch": 1.5283888539702493, + "grad_norm": 0.3847437864550947, + "learning_rate": 5.704895005937968e-05, + "loss": 2.6748, + "step": 32828 + }, + { + "epoch": 1.5284354121563424, + "grad_norm": 0.3222123915326973, + "learning_rate": 5.704626836575679e-05, + "loss": 2.6535, + "step": 32829 + }, + { + "epoch": 1.5284819703424355, + "grad_norm": 0.40300243774621797, + "learning_rate": 5.70435866514538e-05, + "loss": 2.6711, + "step": 32830 + }, + { + "epoch": 1.5285285285285286, + "grad_norm": 0.355526158283242, + "learning_rate": 5.704090491647858e-05, + "loss": 2.74, + "step": 32831 + }, + { + "epoch": 1.5285750867146217, + "grad_norm": 0.397565474103943, + "learning_rate": 5.7038223160839e-05, + "loss": 2.6478, + "step": 32832 + }, + { + "epoch": 1.5286216449007146, + "grad_norm": 0.38005263385633664, + "learning_rate": 5.703554138454292e-05, + "loss": 2.7649, + "step": 32833 + }, + { + "epoch": 1.5286682030868077, + "grad_norm": 0.4067597382802014, + "learning_rate": 5.703285958759823e-05, + "loss": 2.7441, + "step": 32834 + }, + { + "epoch": 1.5287147612729008, + "grad_norm": 0.38476894543193146, + "learning_rate": 5.7030177770012796e-05, + "loss": 2.7422, + "step": 32835 + }, + { + "epoch": 1.5287613194589937, + "grad_norm": 0.3848577663914915, + "learning_rate": 5.702749593179446e-05, + "loss": 2.632, + "step": 32836 + }, + { + "epoch": 1.5288078776450869, + "grad_norm": 0.3954691547007786, + "learning_rate": 5.702481407295114e-05, + "loss": 2.8243, + "step": 32837 + }, + { + "epoch": 1.52885443583118, + "grad_norm": 0.32982751196252585, + "learning_rate": 5.702213219349066e-05, + "loss": 2.6754, + "step": 32838 + }, + { + "epoch": 1.528900994017273, + "grad_norm": 0.38885136401465814, + "learning_rate": 5.701945029342093e-05, + "loss": 2.6593, + "step": 32839 + }, + { + "epoch": 1.5289475522033662, + "grad_norm": 0.32690467135108936, + "learning_rate": 5.7016768372749796e-05, + "loss": 2.6914, + "step": 32840 + }, + { + "epoch": 1.5289941103894593, + "grad_norm": 0.36542005922390847, + "learning_rate": 5.7014086431485145e-05, + "loss": 2.7113, + "step": 32841 + }, + { + "epoch": 1.5290406685755524, + "grad_norm": 0.3133029550177677, + "learning_rate": 5.7011404469634834e-05, + "loss": 2.7608, + "step": 32842 + }, + { + "epoch": 1.5290872267616453, + "grad_norm": 0.3603884374272024, + "learning_rate": 5.7008722487206746e-05, + "loss": 2.749, + "step": 32843 + }, + { + "epoch": 1.5291337849477384, + "grad_norm": 0.33588733139954124, + "learning_rate": 5.700604048420875e-05, + "loss": 2.7174, + "step": 32844 + }, + { + "epoch": 1.5291803431338316, + "grad_norm": 0.351156866983623, + "learning_rate": 5.7003358460648706e-05, + "loss": 2.6455, + "step": 32845 + }, + { + "epoch": 1.5292269013199244, + "grad_norm": 0.3224948058219692, + "learning_rate": 5.70006764165345e-05, + "loss": 2.6791, + "step": 32846 + }, + { + "epoch": 1.5292734595060176, + "grad_norm": 0.34743183018785584, + "learning_rate": 5.6997994351873996e-05, + "loss": 2.685, + "step": 32847 + }, + { + "epoch": 1.5293200176921107, + "grad_norm": 0.3367076202098247, + "learning_rate": 5.699531226667508e-05, + "loss": 2.7812, + "step": 32848 + }, + { + "epoch": 1.5293665758782038, + "grad_norm": 0.31714647837815885, + "learning_rate": 5.699263016094561e-05, + "loss": 2.7439, + "step": 32849 + }, + { + "epoch": 1.529413134064297, + 
"grad_norm": 0.3677430449672735, + "learning_rate": 5.698994803469345e-05, + "loss": 2.7658, + "step": 32850 + }, + { + "epoch": 1.52945969225039, + "grad_norm": 0.31163829981256136, + "learning_rate": 5.698726588792649e-05, + "loss": 2.7948, + "step": 32851 + }, + { + "epoch": 1.5295062504364831, + "grad_norm": 0.3886941721583886, + "learning_rate": 5.6984583720652586e-05, + "loss": 2.8069, + "step": 32852 + }, + { + "epoch": 1.529552808622576, + "grad_norm": 0.33987160528145277, + "learning_rate": 5.698190153287961e-05, + "loss": 2.7025, + "step": 32853 + }, + { + "epoch": 1.5295993668086691, + "grad_norm": 0.31499770024816, + "learning_rate": 5.697921932461544e-05, + "loss": 2.6854, + "step": 32854 + }, + { + "epoch": 1.529645924994762, + "grad_norm": 0.3569369716412114, + "learning_rate": 5.697653709586797e-05, + "loss": 2.7158, + "step": 32855 + }, + { + "epoch": 1.5296924831808552, + "grad_norm": 0.3353859987158949, + "learning_rate": 5.6973854846645034e-05, + "loss": 2.5717, + "step": 32856 + }, + { + "epoch": 1.5297390413669483, + "grad_norm": 0.3401732451517191, + "learning_rate": 5.6971172576954525e-05, + "loss": 2.8234, + "step": 32857 + }, + { + "epoch": 1.5297855995530414, + "grad_norm": 0.3802058588172684, + "learning_rate": 5.6968490286804314e-05, + "loss": 2.7707, + "step": 32858 + }, + { + "epoch": 1.5298321577391345, + "grad_norm": 0.3687569799347175, + "learning_rate": 5.696580797620227e-05, + "loss": 2.815, + "step": 32859 + }, + { + "epoch": 1.5298787159252276, + "grad_norm": 0.38329339421556774, + "learning_rate": 5.6963125645156256e-05, + "loss": 2.6906, + "step": 32860 + }, + { + "epoch": 1.5299252741113207, + "grad_norm": 0.35284671203123186, + "learning_rate": 5.696044329367417e-05, + "loss": 2.6601, + "step": 32861 + }, + { + "epoch": 1.5299718322974138, + "grad_norm": 0.39828349273022173, + "learning_rate": 5.695776092176386e-05, + "loss": 2.9022, + "step": 32862 + }, + { + "epoch": 1.5300183904835067, + "grad_norm": 0.3572291154604573, + "learning_rate": 5.6955078529433206e-05, + "loss": 2.701, + "step": 32863 + }, + { + "epoch": 1.5300649486695999, + "grad_norm": 0.39631568739865897, + "learning_rate": 5.695239611669009e-05, + "loss": 2.7061, + "step": 32864 + }, + { + "epoch": 1.5301115068556927, + "grad_norm": 0.3790664594330343, + "learning_rate": 5.694971368354237e-05, + "loss": 2.7401, + "step": 32865 + }, + { + "epoch": 1.5301580650417859, + "grad_norm": 0.3632368238468383, + "learning_rate": 5.694703122999791e-05, + "loss": 2.6486, + "step": 32866 + }, + { + "epoch": 1.530204623227879, + "grad_norm": 0.36395058832356103, + "learning_rate": 5.6944348756064625e-05, + "loss": 2.7831, + "step": 32867 + }, + { + "epoch": 1.530251181413972, + "grad_norm": 0.34946418759361314, + "learning_rate": 5.694166626175035e-05, + "loss": 2.7504, + "step": 32868 + }, + { + "epoch": 1.5302977396000652, + "grad_norm": 0.34622484619286353, + "learning_rate": 5.6938983747062966e-05, + "loss": 2.7292, + "step": 32869 + }, + { + "epoch": 1.5303442977861583, + "grad_norm": 0.3361143688523657, + "learning_rate": 5.693630121201034e-05, + "loss": 2.7378, + "step": 32870 + }, + { + "epoch": 1.5303908559722514, + "grad_norm": 0.3509001125291918, + "learning_rate": 5.693361865660036e-05, + "loss": 2.6705, + "step": 32871 + }, + { + "epoch": 1.5304374141583446, + "grad_norm": 0.32294919174401443, + "learning_rate": 5.6930936080840905e-05, + "loss": 2.8452, + "step": 32872 + }, + { + "epoch": 1.5304839723444374, + "grad_norm": 0.35480480753009963, + "learning_rate": 
5.6928253484739825e-05, + "loss": 2.6417, + "step": 32873 + }, + { + "epoch": 1.5305305305305306, + "grad_norm": 0.3466270844781887, + "learning_rate": 5.6925570868305015e-05, + "loss": 2.7318, + "step": 32874 + }, + { + "epoch": 1.5305770887166235, + "grad_norm": 0.33334138627061205, + "learning_rate": 5.6922888231544314e-05, + "loss": 2.6864, + "step": 32875 + }, + { + "epoch": 1.5306236469027166, + "grad_norm": 0.3226396772800289, + "learning_rate": 5.6920205574465624e-05, + "loss": 2.8185, + "step": 32876 + }, + { + "epoch": 1.5306702050888097, + "grad_norm": 0.3592361202912981, + "learning_rate": 5.691752289707684e-05, + "loss": 2.7539, + "step": 32877 + }, + { + "epoch": 1.5307167632749028, + "grad_norm": 0.3197700355784782, + "learning_rate": 5.691484019938578e-05, + "loss": 2.7754, + "step": 32878 + }, + { + "epoch": 1.530763321460996, + "grad_norm": 0.34059906849482946, + "learning_rate": 5.691215748140034e-05, + "loss": 2.7468, + "step": 32879 + }, + { + "epoch": 1.530809879647089, + "grad_norm": 0.29760146251577174, + "learning_rate": 5.690947474312841e-05, + "loss": 2.6672, + "step": 32880 + }, + { + "epoch": 1.5308564378331821, + "grad_norm": 0.333934728158805, + "learning_rate": 5.690679198457785e-05, + "loss": 2.8385, + "step": 32881 + }, + { + "epoch": 1.530902996019275, + "grad_norm": 0.34226564529216436, + "learning_rate": 5.690410920575654e-05, + "loss": 2.7454, + "step": 32882 + }, + { + "epoch": 1.5309495542053682, + "grad_norm": 0.3317692188155294, + "learning_rate": 5.6901426406672344e-05, + "loss": 2.7211, + "step": 32883 + }, + { + "epoch": 1.5309961123914613, + "grad_norm": 0.33453711424417776, + "learning_rate": 5.689874358733315e-05, + "loss": 2.7308, + "step": 32884 + }, + { + "epoch": 1.5310426705775542, + "grad_norm": 0.36018612655664034, + "learning_rate": 5.689606074774682e-05, + "loss": 2.7774, + "step": 32885 + }, + { + "epoch": 1.5310892287636473, + "grad_norm": 0.3290710911064408, + "learning_rate": 5.689337788792123e-05, + "loss": 2.7552, + "step": 32886 + }, + { + "epoch": 1.5311357869497404, + "grad_norm": 0.3599526483263799, + "learning_rate": 5.689069500786426e-05, + "loss": 2.7239, + "step": 32887 + }, + { + "epoch": 1.5311823451358335, + "grad_norm": 0.3681009518259066, + "learning_rate": 5.688801210758376e-05, + "loss": 2.8202, + "step": 32888 + }, + { + "epoch": 1.5312289033219266, + "grad_norm": 0.3503336228966088, + "learning_rate": 5.6885329187087645e-05, + "loss": 2.7564, + "step": 32889 + }, + { + "epoch": 1.5312754615080197, + "grad_norm": 0.3434201083778466, + "learning_rate": 5.6882646246383753e-05, + "loss": 2.6646, + "step": 32890 + }, + { + "epoch": 1.5313220196941129, + "grad_norm": 0.36143218603871174, + "learning_rate": 5.687996328547999e-05, + "loss": 2.7149, + "step": 32891 + }, + { + "epoch": 1.5313685778802058, + "grad_norm": 0.3665345735706558, + "learning_rate": 5.6877280304384194e-05, + "loss": 2.6657, + "step": 32892 + }, + { + "epoch": 1.5314151360662989, + "grad_norm": 0.3210581224837278, + "learning_rate": 5.687459730310426e-05, + "loss": 2.7184, + "step": 32893 + }, + { + "epoch": 1.531461694252392, + "grad_norm": 0.389345529240323, + "learning_rate": 5.687191428164806e-05, + "loss": 2.7249, + "step": 32894 + }, + { + "epoch": 1.5315082524384849, + "grad_norm": 0.3357440547188328, + "learning_rate": 5.686923124002348e-05, + "loss": 2.7361, + "step": 32895 + }, + { + "epoch": 1.531554810624578, + "grad_norm": 0.40999649384584574, + "learning_rate": 5.686654817823838e-05, + "loss": 2.7707, + "step": 32896 + }, + { + 
"epoch": 1.531601368810671, + "grad_norm": 0.3294514195142426, + "learning_rate": 5.6863865096300626e-05, + "loss": 2.7197, + "step": 32897 + }, + { + "epoch": 1.5316479269967642, + "grad_norm": 0.3716152226966677, + "learning_rate": 5.6861181994218114e-05, + "loss": 2.7488, + "step": 32898 + }, + { + "epoch": 1.5316944851828573, + "grad_norm": 0.3303592408419536, + "learning_rate": 5.6858498871998714e-05, + "loss": 2.6837, + "step": 32899 + }, + { + "epoch": 1.5317410433689505, + "grad_norm": 0.35209206981788765, + "learning_rate": 5.685581572965029e-05, + "loss": 2.6662, + "step": 32900 + }, + { + "epoch": 1.5317876015550436, + "grad_norm": 0.34661783505544336, + "learning_rate": 5.685313256718071e-05, + "loss": 2.7751, + "step": 32901 + }, + { + "epoch": 1.5318341597411365, + "grad_norm": 0.34946368107479875, + "learning_rate": 5.685044938459787e-05, + "loss": 2.7283, + "step": 32902 + }, + { + "epoch": 1.5318807179272296, + "grad_norm": 0.3550698706070844, + "learning_rate": 5.684776618190963e-05, + "loss": 2.5894, + "step": 32903 + }, + { + "epoch": 1.5319272761133225, + "grad_norm": 0.33836936663364614, + "learning_rate": 5.684508295912389e-05, + "loss": 2.6081, + "step": 32904 + }, + { + "epoch": 1.5319738342994156, + "grad_norm": 0.36270597232970075, + "learning_rate": 5.684239971624849e-05, + "loss": 2.6753, + "step": 32905 + }, + { + "epoch": 1.5320203924855087, + "grad_norm": 0.3570936708424499, + "learning_rate": 5.683971645329132e-05, + "loss": 2.8137, + "step": 32906 + }, + { + "epoch": 1.5320669506716018, + "grad_norm": 0.31867066428690227, + "learning_rate": 5.6837033170260265e-05, + "loss": 2.7721, + "step": 32907 + }, + { + "epoch": 1.532113508857695, + "grad_norm": 0.352207440285029, + "learning_rate": 5.6834349867163186e-05, + "loss": 2.7251, + "step": 32908 + }, + { + "epoch": 1.532160067043788, + "grad_norm": 0.34674862990339295, + "learning_rate": 5.683166654400797e-05, + "loss": 2.7149, + "step": 32909 + }, + { + "epoch": 1.5322066252298812, + "grad_norm": 0.36971715629215535, + "learning_rate": 5.6828983200802477e-05, + "loss": 2.7236, + "step": 32910 + }, + { + "epoch": 1.5322531834159743, + "grad_norm": 0.3284727631270798, + "learning_rate": 5.68262998375546e-05, + "loss": 2.7249, + "step": 32911 + }, + { + "epoch": 1.5322997416020672, + "grad_norm": 0.3639248297860735, + "learning_rate": 5.68236164542722e-05, + "loss": 2.7489, + "step": 32912 + }, + { + "epoch": 1.5323462997881603, + "grad_norm": 0.3611747156873458, + "learning_rate": 5.682093305096317e-05, + "loss": 2.7495, + "step": 32913 + }, + { + "epoch": 1.5323928579742532, + "grad_norm": 0.3445921468889478, + "learning_rate": 5.681824962763536e-05, + "loss": 2.8515, + "step": 32914 + }, + { + "epoch": 1.5324394161603463, + "grad_norm": 0.3404084384487992, + "learning_rate": 5.6815566184296655e-05, + "loss": 2.6741, + "step": 32915 + }, + { + "epoch": 1.5324859743464394, + "grad_norm": 0.34045353887803215, + "learning_rate": 5.681288272095493e-05, + "loss": 2.6807, + "step": 32916 + }, + { + "epoch": 1.5325325325325325, + "grad_norm": 0.34452702803447116, + "learning_rate": 5.6810199237618076e-05, + "loss": 2.7425, + "step": 32917 + }, + { + "epoch": 1.5325790907186256, + "grad_norm": 0.31914789773612795, + "learning_rate": 5.680751573429396e-05, + "loss": 2.7201, + "step": 32918 + }, + { + "epoch": 1.5326256489047188, + "grad_norm": 0.3422065703065089, + "learning_rate": 5.6804832210990455e-05, + "loss": 2.6732, + "step": 32919 + }, + { + "epoch": 1.5326722070908119, + "grad_norm": 0.31533425518697616, + 
"learning_rate": 5.680214866771543e-05, + "loss": 2.7987, + "step": 32920 + }, + { + "epoch": 1.5327187652769048, + "grad_norm": 0.33477869932544924, + "learning_rate": 5.679946510447678e-05, + "loss": 2.701, + "step": 32921 + }, + { + "epoch": 1.5327653234629979, + "grad_norm": 0.3374943378096062, + "learning_rate": 5.679678152128236e-05, + "loss": 2.7306, + "step": 32922 + }, + { + "epoch": 1.532811881649091, + "grad_norm": 0.3407597852476404, + "learning_rate": 5.6794097918140066e-05, + "loss": 2.7736, + "step": 32923 + }, + { + "epoch": 1.5328584398351839, + "grad_norm": 0.35271429612841826, + "learning_rate": 5.679141429505775e-05, + "loss": 2.7356, + "step": 32924 + }, + { + "epoch": 1.532904998021277, + "grad_norm": 0.35154640593489245, + "learning_rate": 5.678873065204332e-05, + "loss": 2.6425, + "step": 32925 + }, + { + "epoch": 1.5329515562073701, + "grad_norm": 0.3732936637421208, + "learning_rate": 5.6786046989104625e-05, + "loss": 2.8044, + "step": 32926 + }, + { + "epoch": 1.5329981143934632, + "grad_norm": 0.3914376507717871, + "learning_rate": 5.678336330624956e-05, + "loss": 2.7094, + "step": 32927 + }, + { + "epoch": 1.5330446725795563, + "grad_norm": 0.36722264397139076, + "learning_rate": 5.6780679603485976e-05, + "loss": 2.7308, + "step": 32928 + }, + { + "epoch": 1.5330912307656495, + "grad_norm": 0.38273756863053476, + "learning_rate": 5.6777995880821766e-05, + "loss": 2.6523, + "step": 32929 + }, + { + "epoch": 1.5331377889517426, + "grad_norm": 0.37497646112042393, + "learning_rate": 5.677531213826481e-05, + "loss": 2.8134, + "step": 32930 + }, + { + "epoch": 1.5331843471378355, + "grad_norm": 0.3543319052370504, + "learning_rate": 5.677262837582299e-05, + "loss": 2.7088, + "step": 32931 + }, + { + "epoch": 1.5332309053239286, + "grad_norm": 0.3539782958217989, + "learning_rate": 5.676994459350416e-05, + "loss": 2.735, + "step": 32932 + }, + { + "epoch": 1.5332774635100217, + "grad_norm": 0.36351197550671116, + "learning_rate": 5.676726079131621e-05, + "loss": 2.7669, + "step": 32933 + }, + { + "epoch": 1.5333240216961146, + "grad_norm": 0.34290410772074725, + "learning_rate": 5.676457696926703e-05, + "loss": 2.6693, + "step": 32934 + }, + { + "epoch": 1.5333705798822077, + "grad_norm": 0.33536742196725705, + "learning_rate": 5.6761893127364465e-05, + "loss": 2.7831, + "step": 32935 + }, + { + "epoch": 1.5334171380683008, + "grad_norm": 0.3546016374823758, + "learning_rate": 5.675920926561643e-05, + "loss": 2.6521, + "step": 32936 + }, + { + "epoch": 1.533463696254394, + "grad_norm": 0.32986290010172137, + "learning_rate": 5.6756525384030765e-05, + "loss": 2.7763, + "step": 32937 + }, + { + "epoch": 1.533510254440487, + "grad_norm": 0.33413599733888727, + "learning_rate": 5.6753841482615376e-05, + "loss": 2.6802, + "step": 32938 + }, + { + "epoch": 1.5335568126265802, + "grad_norm": 0.35172072121552744, + "learning_rate": 5.675115756137812e-05, + "loss": 2.6837, + "step": 32939 + }, + { + "epoch": 1.5336033708126733, + "grad_norm": 0.3370745141632629, + "learning_rate": 5.674847362032689e-05, + "loss": 2.7215, + "step": 32940 + }, + { + "epoch": 1.5336499289987662, + "grad_norm": 0.3711452296272776, + "learning_rate": 5.674578965946954e-05, + "loss": 2.7241, + "step": 32941 + }, + { + "epoch": 1.5336964871848593, + "grad_norm": 0.3544246303341508, + "learning_rate": 5.674310567881397e-05, + "loss": 2.8147, + "step": 32942 + }, + { + "epoch": 1.5337430453709522, + "grad_norm": 0.32235071002856225, + "learning_rate": 5.674042167836804e-05, + "loss": 2.6535, + "step": 
32943 + }, + { + "epoch": 1.5337896035570453, + "grad_norm": 0.37440602056510564, + "learning_rate": 5.6737737658139645e-05, + "loss": 2.7231, + "step": 32944 + }, + { + "epoch": 1.5338361617431384, + "grad_norm": 0.3326393195297494, + "learning_rate": 5.6735053618136655e-05, + "loss": 2.6969, + "step": 32945 + }, + { + "epoch": 1.5338827199292315, + "grad_norm": 0.3607247964723038, + "learning_rate": 5.673236955836695e-05, + "loss": 2.7348, + "step": 32946 + }, + { + "epoch": 1.5339292781153246, + "grad_norm": 0.366379400358705, + "learning_rate": 5.672968547883839e-05, + "loss": 2.7037, + "step": 32947 + }, + { + "epoch": 1.5339758363014178, + "grad_norm": 0.3512239296315369, + "learning_rate": 5.6727001379558876e-05, + "loss": 2.8674, + "step": 32948 + }, + { + "epoch": 1.5340223944875109, + "grad_norm": 0.37202926590632707, + "learning_rate": 5.6724317260536284e-05, + "loss": 2.738, + "step": 32949 + }, + { + "epoch": 1.534068952673604, + "grad_norm": 0.3412726662375555, + "learning_rate": 5.6721633121778475e-05, + "loss": 2.6848, + "step": 32950 + }, + { + "epoch": 1.5341155108596969, + "grad_norm": 0.3300978189692565, + "learning_rate": 5.671894896329333e-05, + "loss": 2.7069, + "step": 32951 + }, + { + "epoch": 1.53416206904579, + "grad_norm": 0.36550918450280295, + "learning_rate": 5.6716264785088736e-05, + "loss": 2.7287, + "step": 32952 + }, + { + "epoch": 1.534208627231883, + "grad_norm": 0.3796933494122937, + "learning_rate": 5.6713580587172554e-05, + "loss": 2.7873, + "step": 32953 + }, + { + "epoch": 1.534255185417976, + "grad_norm": 0.3495123420602304, + "learning_rate": 5.6710896369552704e-05, + "loss": 2.7012, + "step": 32954 + }, + { + "epoch": 1.5343017436040691, + "grad_norm": 0.3859399975525795, + "learning_rate": 5.6708212132237e-05, + "loss": 2.6979, + "step": 32955 + }, + { + "epoch": 1.5343483017901622, + "grad_norm": 0.352863182640343, + "learning_rate": 5.670552787523338e-05, + "loss": 2.766, + "step": 32956 + }, + { + "epoch": 1.5343948599762554, + "grad_norm": 0.37761340648966685, + "learning_rate": 5.670284359854968e-05, + "loss": 2.8082, + "step": 32957 + }, + { + "epoch": 1.5344414181623485, + "grad_norm": 0.3504427975242951, + "learning_rate": 5.6700159302193814e-05, + "loss": 2.6264, + "step": 32958 + }, + { + "epoch": 1.5344879763484416, + "grad_norm": 0.3832326289434719, + "learning_rate": 5.669747498617361e-05, + "loss": 2.7515, + "step": 32959 + }, + { + "epoch": 1.5345345345345347, + "grad_norm": 0.35154340745578844, + "learning_rate": 5.6694790650497e-05, + "loss": 2.7459, + "step": 32960 + }, + { + "epoch": 1.5345810927206276, + "grad_norm": 0.35989617720728734, + "learning_rate": 5.6692106295171834e-05, + "loss": 2.7559, + "step": 32961 + }, + { + "epoch": 1.5346276509067207, + "grad_norm": 0.36347335289059823, + "learning_rate": 5.6689421920206e-05, + "loss": 2.674, + "step": 32962 + }, + { + "epoch": 1.5346742090928136, + "grad_norm": 0.32079375150292166, + "learning_rate": 5.668673752560736e-05, + "loss": 2.8163, + "step": 32963 + }, + { + "epoch": 1.5347207672789067, + "grad_norm": 0.31094640209904, + "learning_rate": 5.6684053111383816e-05, + "loss": 2.6649, + "step": 32964 + }, + { + "epoch": 1.5347673254649998, + "grad_norm": 0.5339258631303201, + "learning_rate": 5.6681368677543235e-05, + "loss": 2.6553, + "step": 32965 + }, + { + "epoch": 1.534813883651093, + "grad_norm": 0.35463577229247806, + "learning_rate": 5.667868422409348e-05, + "loss": 2.7452, + "step": 32966 + }, + { + "epoch": 1.534860441837186, + "grad_norm": 0.3854856668239865, 
+ "learning_rate": 5.6675999751042466e-05, + "loss": 2.7912, + "step": 32967 + }, + { + "epoch": 1.5349070000232792, + "grad_norm": 0.39937358361215425, + "learning_rate": 5.667331525839803e-05, + "loss": 2.6259, + "step": 32968 + }, + { + "epoch": 1.5349535582093723, + "grad_norm": 0.3887938180184056, + "learning_rate": 5.6670630746168075e-05, + "loss": 2.6677, + "step": 32969 + }, + { + "epoch": 1.5350001163954652, + "grad_norm": 0.38375441830098617, + "learning_rate": 5.666794621436048e-05, + "loss": 2.7471, + "step": 32970 + }, + { + "epoch": 1.5350466745815583, + "grad_norm": 0.37402254236541715, + "learning_rate": 5.666526166298312e-05, + "loss": 2.7221, + "step": 32971 + }, + { + "epoch": 1.5350932327676514, + "grad_norm": 0.33643131706514595, + "learning_rate": 5.666257709204388e-05, + "loss": 2.6758, + "step": 32972 + }, + { + "epoch": 1.5351397909537443, + "grad_norm": 0.358528334605831, + "learning_rate": 5.6659892501550625e-05, + "loss": 2.7471, + "step": 32973 + }, + { + "epoch": 1.5351863491398374, + "grad_norm": 0.36241602596861566, + "learning_rate": 5.665720789151123e-05, + "loss": 2.7582, + "step": 32974 + }, + { + "epoch": 1.5352329073259305, + "grad_norm": 0.34613192305251067, + "learning_rate": 5.665452326193361e-05, + "loss": 2.8114, + "step": 32975 + }, + { + "epoch": 1.5352794655120237, + "grad_norm": 0.3752144223925983, + "learning_rate": 5.6651838612825616e-05, + "loss": 2.7983, + "step": 32976 + }, + { + "epoch": 1.5353260236981168, + "grad_norm": 0.3831192231709736, + "learning_rate": 5.664915394419512e-05, + "loss": 2.8386, + "step": 32977 + }, + { + "epoch": 1.5353725818842099, + "grad_norm": 0.3937010458376344, + "learning_rate": 5.664646925605002e-05, + "loss": 2.715, + "step": 32978 + }, + { + "epoch": 1.535419140070303, + "grad_norm": 0.36897143626045475, + "learning_rate": 5.6643784548398184e-05, + "loss": 2.633, + "step": 32979 + }, + { + "epoch": 1.535465698256396, + "grad_norm": 0.39808218046173355, + "learning_rate": 5.664109982124749e-05, + "loss": 2.776, + "step": 32980 + }, + { + "epoch": 1.535512256442489, + "grad_norm": 0.37096864188586526, + "learning_rate": 5.663841507460583e-05, + "loss": 2.7555, + "step": 32981 + }, + { + "epoch": 1.5355588146285821, + "grad_norm": 0.37152875443643896, + "learning_rate": 5.663573030848107e-05, + "loss": 2.6783, + "step": 32982 + }, + { + "epoch": 1.535605372814675, + "grad_norm": 0.38455467754900663, + "learning_rate": 5.663304552288109e-05, + "loss": 2.7566, + "step": 32983 + }, + { + "epoch": 1.5356519310007681, + "grad_norm": 0.37172016584937123, + "learning_rate": 5.663036071781378e-05, + "loss": 2.6925, + "step": 32984 + }, + { + "epoch": 1.5356984891868612, + "grad_norm": 0.4058734005995058, + "learning_rate": 5.662767589328702e-05, + "loss": 2.8006, + "step": 32985 + }, + { + "epoch": 1.5357450473729544, + "grad_norm": 0.3658964541542481, + "learning_rate": 5.6624991049308684e-05, + "loss": 2.6704, + "step": 32986 + }, + { + "epoch": 1.5357916055590475, + "grad_norm": 0.37769989563346074, + "learning_rate": 5.6622306185886643e-05, + "loss": 2.8476, + "step": 32987 + }, + { + "epoch": 1.5358381637451406, + "grad_norm": 0.39192204780712747, + "learning_rate": 5.66196213030288e-05, + "loss": 2.7334, + "step": 32988 + }, + { + "epoch": 1.5358847219312337, + "grad_norm": 0.36995993965554896, + "learning_rate": 5.661693640074301e-05, + "loss": 2.6913, + "step": 32989 + }, + { + "epoch": 1.5359312801173266, + "grad_norm": 0.36945888737023214, + "learning_rate": 5.661425147903717e-05, + "loss": 2.7221, + 
"step": 32990 + }, + { + "epoch": 1.5359778383034197, + "grad_norm": 0.3627894743535702, + "learning_rate": 5.661156653791915e-05, + "loss": 2.6876, + "step": 32991 + }, + { + "epoch": 1.5360243964895126, + "grad_norm": 0.3496698232212734, + "learning_rate": 5.6608881577396835e-05, + "loss": 2.7653, + "step": 32992 + }, + { + "epoch": 1.5360709546756057, + "grad_norm": 0.34570488627198337, + "learning_rate": 5.66061965974781e-05, + "loss": 2.6694, + "step": 32993 + }, + { + "epoch": 1.5361175128616988, + "grad_norm": 0.3231297464829761, + "learning_rate": 5.660351159817083e-05, + "loss": 2.6698, + "step": 32994 + }, + { + "epoch": 1.536164071047792, + "grad_norm": 0.33632030435505184, + "learning_rate": 5.66008265794829e-05, + "loss": 2.6756, + "step": 32995 + }, + { + "epoch": 1.536210629233885, + "grad_norm": 0.33449805633248797, + "learning_rate": 5.65981415414222e-05, + "loss": 2.7439, + "step": 32996 + }, + { + "epoch": 1.5362571874199782, + "grad_norm": 0.3266280212577084, + "learning_rate": 5.65954564839966e-05, + "loss": 2.6512, + "step": 32997 + }, + { + "epoch": 1.5363037456060713, + "grad_norm": 0.32983710472909783, + "learning_rate": 5.659277140721398e-05, + "loss": 2.7373, + "step": 32998 + }, + { + "epoch": 1.5363503037921644, + "grad_norm": 0.33646346785671954, + "learning_rate": 5.659008631108225e-05, + "loss": 2.6972, + "step": 32999 + }, + { + "epoch": 1.5363968619782573, + "grad_norm": 0.34981029889654386, + "learning_rate": 5.6587401195609244e-05, + "loss": 2.7004, + "step": 33000 + }, + { + "epoch": 1.5364434201643504, + "grad_norm": 0.34351877802620484, + "learning_rate": 5.6584716060802875e-05, + "loss": 2.668, + "step": 33001 + }, + { + "epoch": 1.5364899783504433, + "grad_norm": 0.34440050442834574, + "learning_rate": 5.6582030906671e-05, + "loss": 2.7937, + "step": 33002 + }, + { + "epoch": 1.5365365365365364, + "grad_norm": 0.33322032256443546, + "learning_rate": 5.657934573322153e-05, + "loss": 2.6888, + "step": 33003 + }, + { + "epoch": 1.5365830947226295, + "grad_norm": 0.3403417154887751, + "learning_rate": 5.657666054046232e-05, + "loss": 2.6978, + "step": 33004 + }, + { + "epoch": 1.5366296529087227, + "grad_norm": 0.3383530402219122, + "learning_rate": 5.657397532840125e-05, + "loss": 2.6485, + "step": 33005 + }, + { + "epoch": 1.5366762110948158, + "grad_norm": 0.34328170919167267, + "learning_rate": 5.657129009704622e-05, + "loss": 2.7126, + "step": 33006 + }, + { + "epoch": 1.536722769280909, + "grad_norm": 0.3477877970048742, + "learning_rate": 5.65686048464051e-05, + "loss": 2.706, + "step": 33007 + }, + { + "epoch": 1.536769327467002, + "grad_norm": 0.3632632990447639, + "learning_rate": 5.6565919576485784e-05, + "loss": 2.6333, + "step": 33008 + }, + { + "epoch": 1.536815885653095, + "grad_norm": 0.36706383648967744, + "learning_rate": 5.656323428729612e-05, + "loss": 2.7688, + "step": 33009 + }, + { + "epoch": 1.536862443839188, + "grad_norm": 0.3852501401195009, + "learning_rate": 5.656054897884402e-05, + "loss": 2.8056, + "step": 33010 + }, + { + "epoch": 1.5369090020252811, + "grad_norm": 0.3457838518766834, + "learning_rate": 5.655786365113734e-05, + "loss": 2.7205, + "step": 33011 + }, + { + "epoch": 1.536955560211374, + "grad_norm": 0.3744023453934818, + "learning_rate": 5.6555178304183995e-05, + "loss": 2.6298, + "step": 33012 + }, + { + "epoch": 1.5370021183974671, + "grad_norm": 0.3312191437133302, + "learning_rate": 5.6552492937991854e-05, + "loss": 2.6713, + "step": 33013 + }, + { + "epoch": 1.5370486765835603, + "grad_norm": 
0.3776042225458032, + "learning_rate": 5.654980755256878e-05, + "loss": 2.8029, + "step": 33014 + }, + { + "epoch": 1.5370952347696534, + "grad_norm": 0.3581247087105368, + "learning_rate": 5.654712214792266e-05, + "loss": 2.7628, + "step": 33015 + }, + { + "epoch": 1.5371417929557465, + "grad_norm": 0.35717880992265433, + "learning_rate": 5.654443672406139e-05, + "loss": 2.6214, + "step": 33016 + }, + { + "epoch": 1.5371883511418396, + "grad_norm": 0.3960508628172517, + "learning_rate": 5.654175128099285e-05, + "loss": 2.8012, + "step": 33017 + }, + { + "epoch": 1.5372349093279327, + "grad_norm": 0.3249362970314809, + "learning_rate": 5.65390658187249e-05, + "loss": 2.8465, + "step": 33018 + }, + { + "epoch": 1.5372814675140256, + "grad_norm": 0.4329243742972597, + "learning_rate": 5.6536380337265435e-05, + "loss": 2.7678, + "step": 33019 + }, + { + "epoch": 1.5373280257001187, + "grad_norm": 0.3309402492857654, + "learning_rate": 5.6533694836622344e-05, + "loss": 2.7688, + "step": 33020 + }, + { + "epoch": 1.5373745838862118, + "grad_norm": 0.40438462745984205, + "learning_rate": 5.6531009316803505e-05, + "loss": 2.7605, + "step": 33021 + }, + { + "epoch": 1.5374211420723047, + "grad_norm": 0.35289665236910306, + "learning_rate": 5.652832377781679e-05, + "loss": 2.8389, + "step": 33022 + }, + { + "epoch": 1.5374677002583979, + "grad_norm": 0.3741055043858965, + "learning_rate": 5.65256382196701e-05, + "loss": 2.7682, + "step": 33023 + }, + { + "epoch": 1.537514258444491, + "grad_norm": 0.3718761520059822, + "learning_rate": 5.6522952642371284e-05, + "loss": 2.6688, + "step": 33024 + }, + { + "epoch": 1.537560816630584, + "grad_norm": 0.35786364457433784, + "learning_rate": 5.6520267045928266e-05, + "loss": 2.728, + "step": 33025 + }, + { + "epoch": 1.5376073748166772, + "grad_norm": 0.3561308632077157, + "learning_rate": 5.651758143034889e-05, + "loss": 2.761, + "step": 33026 + }, + { + "epoch": 1.5376539330027703, + "grad_norm": 0.33876019382793765, + "learning_rate": 5.6514895795641076e-05, + "loss": 2.7225, + "step": 33027 + }, + { + "epoch": 1.5377004911888634, + "grad_norm": 0.37037326168996526, + "learning_rate": 5.651221014181266e-05, + "loss": 2.7553, + "step": 33028 + }, + { + "epoch": 1.5377470493749563, + "grad_norm": 0.358522294972226, + "learning_rate": 5.650952446887157e-05, + "loss": 2.799, + "step": 33029 + }, + { + "epoch": 1.5377936075610494, + "grad_norm": 0.32187130823304855, + "learning_rate": 5.6506838776825656e-05, + "loss": 2.6832, + "step": 33030 + }, + { + "epoch": 1.5378401657471423, + "grad_norm": 0.35073651238794484, + "learning_rate": 5.650415306568281e-05, + "loss": 2.6497, + "step": 33031 + }, + { + "epoch": 1.5378867239332354, + "grad_norm": 0.30974802820519964, + "learning_rate": 5.650146733545091e-05, + "loss": 2.7884, + "step": 33032 + }, + { + "epoch": 1.5379332821193286, + "grad_norm": 0.3375383653676668, + "learning_rate": 5.649878158613785e-05, + "loss": 2.6427, + "step": 33033 + }, + { + "epoch": 1.5379798403054217, + "grad_norm": 0.34341313495697423, + "learning_rate": 5.64960958177515e-05, + "loss": 2.7475, + "step": 33034 + }, + { + "epoch": 1.5380263984915148, + "grad_norm": 0.32546337842271506, + "learning_rate": 5.649341003029976e-05, + "loss": 2.6434, + "step": 33035 + }, + { + "epoch": 1.538072956677608, + "grad_norm": 0.37196151968458663, + "learning_rate": 5.6490724223790506e-05, + "loss": 2.7829, + "step": 33036 + }, + { + "epoch": 1.538119514863701, + "grad_norm": 0.32123030715352086, + "learning_rate": 5.64880383982316e-05, + "loss": 
2.7375, + "step": 33037 + }, + { + "epoch": 1.5381660730497941, + "grad_norm": 0.33366207502339285, + "learning_rate": 5.6485352553630955e-05, + "loss": 2.6399, + "step": 33038 + }, + { + "epoch": 1.538212631235887, + "grad_norm": 0.3401940734068626, + "learning_rate": 5.6482666689996435e-05, + "loss": 2.7118, + "step": 33039 + }, + { + "epoch": 1.5382591894219801, + "grad_norm": 0.3412854260044749, + "learning_rate": 5.647998080733593e-05, + "loss": 2.7227, + "step": 33040 + }, + { + "epoch": 1.538305747608073, + "grad_norm": 0.368076410103991, + "learning_rate": 5.647729490565731e-05, + "loss": 2.6875, + "step": 33041 + }, + { + "epoch": 1.5383523057941662, + "grad_norm": 0.3616303803358023, + "learning_rate": 5.6474608984968466e-05, + "loss": 2.8041, + "step": 33042 + }, + { + "epoch": 1.5383988639802593, + "grad_norm": 0.33627966636961737, + "learning_rate": 5.6471923045277286e-05, + "loss": 2.7408, + "step": 33043 + }, + { + "epoch": 1.5384454221663524, + "grad_norm": 0.34021687148087437, + "learning_rate": 5.646923708659166e-05, + "loss": 2.804, + "step": 33044 + }, + { + "epoch": 1.5384919803524455, + "grad_norm": 0.37898474089501427, + "learning_rate": 5.646655110891945e-05, + "loss": 2.7466, + "step": 33045 + }, + { + "epoch": 1.5385385385385386, + "grad_norm": 0.34763064885033645, + "learning_rate": 5.646386511226854e-05, + "loss": 2.8068, + "step": 33046 + }, + { + "epoch": 1.5385850967246317, + "grad_norm": 0.361425628737302, + "learning_rate": 5.6461179096646835e-05, + "loss": 2.7601, + "step": 33047 + }, + { + "epoch": 1.5386316549107248, + "grad_norm": 0.359431445532785, + "learning_rate": 5.6458493062062215e-05, + "loss": 2.7122, + "step": 33048 + }, + { + "epoch": 1.5386782130968177, + "grad_norm": 0.37416508012176575, + "learning_rate": 5.645580700852254e-05, + "loss": 2.8289, + "step": 33049 + }, + { + "epoch": 1.5387247712829109, + "grad_norm": 0.3615950556331725, + "learning_rate": 5.645312093603571e-05, + "loss": 2.6247, + "step": 33050 + }, + { + "epoch": 1.5387713294690037, + "grad_norm": 0.3521084732576271, + "learning_rate": 5.645043484460961e-05, + "loss": 2.6628, + "step": 33051 + }, + { + "epoch": 1.5388178876550969, + "grad_norm": 0.3475668506845918, + "learning_rate": 5.644774873425211e-05, + "loss": 2.7534, + "step": 33052 + }, + { + "epoch": 1.53886444584119, + "grad_norm": 0.35936577733332636, + "learning_rate": 5.644506260497112e-05, + "loss": 2.6666, + "step": 33053 + }, + { + "epoch": 1.538911004027283, + "grad_norm": 0.38146737722883434, + "learning_rate": 5.644237645677449e-05, + "loss": 2.7397, + "step": 33054 + }, + { + "epoch": 1.5389575622133762, + "grad_norm": 0.34643906473586517, + "learning_rate": 5.643969028967012e-05, + "loss": 2.7792, + "step": 33055 + }, + { + "epoch": 1.5390041203994693, + "grad_norm": 0.3682562349976031, + "learning_rate": 5.64370041036659e-05, + "loss": 2.755, + "step": 33056 + }, + { + "epoch": 1.5390506785855624, + "grad_norm": 0.36584181804666804, + "learning_rate": 5.6434317898769716e-05, + "loss": 2.7477, + "step": 33057 + }, + { + "epoch": 1.5390972367716553, + "grad_norm": 0.34590867569446987, + "learning_rate": 5.6431631674989424e-05, + "loss": 2.716, + "step": 33058 + }, + { + "epoch": 1.5391437949577484, + "grad_norm": 0.39415208062037715, + "learning_rate": 5.6428945432332936e-05, + "loss": 2.8017, + "step": 33059 + }, + { + "epoch": 1.5391903531438416, + "grad_norm": 0.35579274458394994, + "learning_rate": 5.6426259170808124e-05, + "loss": 2.7988, + "step": 33060 + }, + { + "epoch": 1.5392369113299345, + 
"grad_norm": 0.3725567058617231, + "learning_rate": 5.642357289042287e-05, + "loss": 2.7778, + "step": 33061 + }, + { + "epoch": 1.5392834695160276, + "grad_norm": 0.35414780019417247, + "learning_rate": 5.6420886591185076e-05, + "loss": 2.7156, + "step": 33062 + }, + { + "epoch": 1.5393300277021207, + "grad_norm": 0.3543953458358616, + "learning_rate": 5.641820027310261e-05, + "loss": 2.7299, + "step": 33063 + }, + { + "epoch": 1.5393765858882138, + "grad_norm": 0.3627243614538424, + "learning_rate": 5.641551393618335e-05, + "loss": 2.7038, + "step": 33064 + }, + { + "epoch": 1.539423144074307, + "grad_norm": 0.3507265686305702, + "learning_rate": 5.641282758043519e-05, + "loss": 2.7785, + "step": 33065 + }, + { + "epoch": 1.5394697022604, + "grad_norm": 0.3538247139902216, + "learning_rate": 5.641014120586603e-05, + "loss": 2.6561, + "step": 33066 + }, + { + "epoch": 1.5395162604464931, + "grad_norm": 0.3633567051293308, + "learning_rate": 5.640745481248372e-05, + "loss": 2.7961, + "step": 33067 + }, + { + "epoch": 1.539562818632586, + "grad_norm": 0.3384681702785784, + "learning_rate": 5.640476840029616e-05, + "loss": 2.6975, + "step": 33068 + }, + { + "epoch": 1.5396093768186792, + "grad_norm": 0.37679468330386084, + "learning_rate": 5.640208196931123e-05, + "loss": 2.6409, + "step": 33069 + }, + { + "epoch": 1.5396559350047723, + "grad_norm": 0.3625595045950856, + "learning_rate": 5.639939551953683e-05, + "loss": 2.6382, + "step": 33070 + }, + { + "epoch": 1.5397024931908652, + "grad_norm": 0.34508195276124126, + "learning_rate": 5.639670905098083e-05, + "loss": 2.7285, + "step": 33071 + }, + { + "epoch": 1.5397490513769583, + "grad_norm": 0.38423149272511303, + "learning_rate": 5.6394022563651125e-05, + "loss": 2.7027, + "step": 33072 + }, + { + "epoch": 1.5397956095630514, + "grad_norm": 0.3555536040953169, + "learning_rate": 5.6391336057555586e-05, + "loss": 2.6594, + "step": 33073 + }, + { + "epoch": 1.5398421677491445, + "grad_norm": 0.34722793631319415, + "learning_rate": 5.638864953270211e-05, + "loss": 2.7576, + "step": 33074 + }, + { + "epoch": 1.5398887259352376, + "grad_norm": 0.35862942283198546, + "learning_rate": 5.638596298909858e-05, + "loss": 2.6823, + "step": 33075 + }, + { + "epoch": 1.5399352841213307, + "grad_norm": 0.3589125647916723, + "learning_rate": 5.638327642675287e-05, + "loss": 2.7095, + "step": 33076 + }, + { + "epoch": 1.5399818423074239, + "grad_norm": 0.36228611664708593, + "learning_rate": 5.6380589845672894e-05, + "loss": 2.7524, + "step": 33077 + }, + { + "epoch": 1.5400284004935167, + "grad_norm": 0.36908342467569616, + "learning_rate": 5.6377903245866494e-05, + "loss": 2.7047, + "step": 33078 + }, + { + "epoch": 1.5400749586796099, + "grad_norm": 0.38292328735091175, + "learning_rate": 5.6375216627341575e-05, + "loss": 2.7698, + "step": 33079 + }, + { + "epoch": 1.5401215168657028, + "grad_norm": 0.3678942576909445, + "learning_rate": 5.637252999010605e-05, + "loss": 2.7247, + "step": 33080 + }, + { + "epoch": 1.5401680750517959, + "grad_norm": 0.3748681464882564, + "learning_rate": 5.6369843334167747e-05, + "loss": 2.7359, + "step": 33081 + }, + { + "epoch": 1.540214633237889, + "grad_norm": 0.33457130208341296, + "learning_rate": 5.636715665953459e-05, + "loss": 2.7338, + "step": 33082 + }, + { + "epoch": 1.540261191423982, + "grad_norm": 0.36009481009696004, + "learning_rate": 5.636446996621445e-05, + "loss": 2.6992, + "step": 33083 + }, + { + "epoch": 1.5403077496100752, + "grad_norm": 0.3372085500916653, + "learning_rate": 
5.636178325421524e-05, + "loss": 2.6846, + "step": 33084 + }, + { + "epoch": 1.5403543077961683, + "grad_norm": 0.37080394152839624, + "learning_rate": 5.63590965235448e-05, + "loss": 2.8556, + "step": 33085 + }, + { + "epoch": 1.5404008659822614, + "grad_norm": 0.35015712831492013, + "learning_rate": 5.635640977421104e-05, + "loss": 2.765, + "step": 33086 + }, + { + "epoch": 1.5404474241683546, + "grad_norm": 0.39860119801155264, + "learning_rate": 5.635372300622185e-05, + "loss": 2.6969, + "step": 33087 + }, + { + "epoch": 1.5404939823544475, + "grad_norm": 0.36801232167534575, + "learning_rate": 5.635103621958511e-05, + "loss": 2.6771, + "step": 33088 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 0.3784450029540872, + "learning_rate": 5.634834941430871e-05, + "loss": 2.7523, + "step": 33089 + }, + { + "epoch": 1.5405870987266335, + "grad_norm": 0.3553105212427795, + "learning_rate": 5.634566259040052e-05, + "loss": 2.8403, + "step": 33090 + }, + { + "epoch": 1.5406336569127266, + "grad_norm": 0.39283306593706485, + "learning_rate": 5.6342975747868446e-05, + "loss": 2.7107, + "step": 33091 + }, + { + "epoch": 1.5406802150988197, + "grad_norm": 0.38344474415485813, + "learning_rate": 5.634028888672035e-05, + "loss": 2.6502, + "step": 33092 + }, + { + "epoch": 1.5407267732849128, + "grad_norm": 0.37137744203851686, + "learning_rate": 5.633760200696414e-05, + "loss": 2.6679, + "step": 33093 + }, + { + "epoch": 1.540773331471006, + "grad_norm": 0.3506817949308671, + "learning_rate": 5.633491510860769e-05, + "loss": 2.6004, + "step": 33094 + }, + { + "epoch": 1.540819889657099, + "grad_norm": 0.3425787130437016, + "learning_rate": 5.6332228191658876e-05, + "loss": 2.7349, + "step": 33095 + }, + { + "epoch": 1.5408664478431922, + "grad_norm": 0.36551053375087655, + "learning_rate": 5.632954125612561e-05, + "loss": 2.7909, + "step": 33096 + }, + { + "epoch": 1.540913006029285, + "grad_norm": 0.40663250140439516, + "learning_rate": 5.6326854302015755e-05, + "loss": 2.7341, + "step": 33097 + }, + { + "epoch": 1.5409595642153782, + "grad_norm": 0.3315387309982882, + "learning_rate": 5.632416732933722e-05, + "loss": 2.7825, + "step": 33098 + }, + { + "epoch": 1.5410061224014713, + "grad_norm": 0.3469447002948898, + "learning_rate": 5.632148033809787e-05, + "loss": 2.6719, + "step": 33099 + }, + { + "epoch": 1.5410526805875642, + "grad_norm": 0.34261873569739193, + "learning_rate": 5.631879332830559e-05, + "loss": 2.699, + "step": 33100 + }, + { + "epoch": 1.5410992387736573, + "grad_norm": 0.33315605843847146, + "learning_rate": 5.631610629996827e-05, + "loss": 2.8218, + "step": 33101 + }, + { + "epoch": 1.5411457969597504, + "grad_norm": 0.3607876564777546, + "learning_rate": 5.6313419253093825e-05, + "loss": 2.8074, + "step": 33102 + }, + { + "epoch": 1.5411923551458435, + "grad_norm": 0.3568474817577744, + "learning_rate": 5.63107321876901e-05, + "loss": 2.6783, + "step": 33103 + }, + { + "epoch": 1.5412389133319366, + "grad_norm": 0.32926128752776895, + "learning_rate": 5.6308045103765006e-05, + "loss": 2.6994, + "step": 33104 + }, + { + "epoch": 1.5412854715180297, + "grad_norm": 0.351192600649533, + "learning_rate": 5.6305358001326416e-05, + "loss": 2.7269, + "step": 33105 + }, + { + "epoch": 1.5413320297041229, + "grad_norm": 0.32663293863120507, + "learning_rate": 5.630267088038222e-05, + "loss": 2.7024, + "step": 33106 + }, + { + "epoch": 1.5413785878902158, + "grad_norm": 0.3543437842750719, + "learning_rate": 5.629998374094031e-05, + "loss": 2.757, + "step": 33107 + }, + { + 
"epoch": 1.5414251460763089, + "grad_norm": 0.3429190486921004, + "learning_rate": 5.629729658300855e-05, + "loss": 2.7259, + "step": 33108 + }, + { + "epoch": 1.541471704262402, + "grad_norm": 0.34813005582523976, + "learning_rate": 5.629460940659487e-05, + "loss": 2.6916, + "step": 33109 + }, + { + "epoch": 1.5415182624484949, + "grad_norm": 0.3789430139874197, + "learning_rate": 5.629192221170711e-05, + "loss": 2.7193, + "step": 33110 + }, + { + "epoch": 1.541564820634588, + "grad_norm": 0.330168446721719, + "learning_rate": 5.628923499835319e-05, + "loss": 2.7052, + "step": 33111 + }, + { + "epoch": 1.541611378820681, + "grad_norm": 0.35333010724564684, + "learning_rate": 5.6286547766540984e-05, + "loss": 2.7477, + "step": 33112 + }, + { + "epoch": 1.5416579370067742, + "grad_norm": 0.3529060558841699, + "learning_rate": 5.628386051627839e-05, + "loss": 2.7922, + "step": 33113 + }, + { + "epoch": 1.5417044951928673, + "grad_norm": 0.31781765476249474, + "learning_rate": 5.628117324757326e-05, + "loss": 2.7366, + "step": 33114 + }, + { + "epoch": 1.5417510533789605, + "grad_norm": 0.3573317677819985, + "learning_rate": 5.627848596043353e-05, + "loss": 2.675, + "step": 33115 + }, + { + "epoch": 1.5417976115650536, + "grad_norm": 0.358162778616665, + "learning_rate": 5.627579865486705e-05, + "loss": 2.7197, + "step": 33116 + }, + { + "epoch": 1.5418441697511465, + "grad_norm": 0.3292320128316464, + "learning_rate": 5.627311133088174e-05, + "loss": 2.7379, + "step": 33117 + }, + { + "epoch": 1.5418907279372396, + "grad_norm": 0.34201817459579037, + "learning_rate": 5.6270423988485436e-05, + "loss": 2.8313, + "step": 33118 + }, + { + "epoch": 1.5419372861233325, + "grad_norm": 0.3212772945118606, + "learning_rate": 5.626773662768606e-05, + "loss": 2.7126, + "step": 33119 + }, + { + "epoch": 1.5419838443094256, + "grad_norm": 0.36109919019167297, + "learning_rate": 5.62650492484915e-05, + "loss": 2.7517, + "step": 33120 + }, + { + "epoch": 1.5420304024955187, + "grad_norm": 0.3323205719952829, + "learning_rate": 5.626236185090964e-05, + "loss": 2.5966, + "step": 33121 + }, + { + "epoch": 1.5420769606816118, + "grad_norm": 0.3628956864993527, + "learning_rate": 5.6259674434948364e-05, + "loss": 2.6779, + "step": 33122 + }, + { + "epoch": 1.542123518867705, + "grad_norm": 0.362749782735889, + "learning_rate": 5.625698700061556e-05, + "loss": 2.6992, + "step": 33123 + }, + { + "epoch": 1.542170077053798, + "grad_norm": 0.4070526652388093, + "learning_rate": 5.625429954791911e-05, + "loss": 2.7804, + "step": 33124 + }, + { + "epoch": 1.5422166352398912, + "grad_norm": 0.3454206000223784, + "learning_rate": 5.625161207686692e-05, + "loss": 2.7065, + "step": 33125 + }, + { + "epoch": 1.5422631934259843, + "grad_norm": 0.38416724373217526, + "learning_rate": 5.624892458746686e-05, + "loss": 2.8343, + "step": 33126 + }, + { + "epoch": 1.5423097516120772, + "grad_norm": 0.38668555318003545, + "learning_rate": 5.6246237079726805e-05, + "loss": 2.6599, + "step": 33127 + }, + { + "epoch": 1.5423563097981703, + "grad_norm": 0.32009827743334385, + "learning_rate": 5.624354955365468e-05, + "loss": 2.7549, + "step": 33128 + }, + { + "epoch": 1.5424028679842632, + "grad_norm": 0.34226067785387093, + "learning_rate": 5.624086200925834e-05, + "loss": 2.6595, + "step": 33129 + }, + { + "epoch": 1.5424494261703563, + "grad_norm": 0.33581393780645763, + "learning_rate": 5.623817444654571e-05, + "loss": 2.7058, + "step": 33130 + }, + { + "epoch": 1.5424959843564494, + "grad_norm": 0.3407885802046241, + 
"learning_rate": 5.623548686552462e-05, + "loss": 2.7728, + "step": 33131 + }, + { + "epoch": 1.5425425425425425, + "grad_norm": 0.34298166303110666, + "learning_rate": 5.6232799266203006e-05, + "loss": 2.7407, + "step": 33132 + }, + { + "epoch": 1.5425891007286356, + "grad_norm": 0.32721580624153596, + "learning_rate": 5.623011164858873e-05, + "loss": 2.7943, + "step": 33133 + }, + { + "epoch": 1.5426356589147288, + "grad_norm": 0.3297976364035448, + "learning_rate": 5.6227424012689714e-05, + "loss": 2.6816, + "step": 33134 + }, + { + "epoch": 1.5426822171008219, + "grad_norm": 0.32905000613842833, + "learning_rate": 5.62247363585138e-05, + "loss": 2.7953, + "step": 33135 + }, + { + "epoch": 1.5427287752869148, + "grad_norm": 0.3339778947041184, + "learning_rate": 5.6222048686068894e-05, + "loss": 2.7663, + "step": 33136 + }, + { + "epoch": 1.5427753334730079, + "grad_norm": 0.34844956457707266, + "learning_rate": 5.62193609953629e-05, + "loss": 2.7077, + "step": 33137 + }, + { + "epoch": 1.542821891659101, + "grad_norm": 0.33529869633125436, + "learning_rate": 5.6216673286403686e-05, + "loss": 2.6519, + "step": 33138 + }, + { + "epoch": 1.542868449845194, + "grad_norm": 0.33945171172928984, + "learning_rate": 5.6213985559199164e-05, + "loss": 2.8024, + "step": 33139 + }, + { + "epoch": 1.542915008031287, + "grad_norm": 0.337243026445609, + "learning_rate": 5.621129781375719e-05, + "loss": 2.6522, + "step": 33140 + }, + { + "epoch": 1.5429615662173801, + "grad_norm": 0.3653893479291481, + "learning_rate": 5.6208610050085685e-05, + "loss": 2.8435, + "step": 33141 + }, + { + "epoch": 1.5430081244034732, + "grad_norm": 0.34312416806307633, + "learning_rate": 5.6205922268192515e-05, + "loss": 2.7103, + "step": 33142 + }, + { + "epoch": 1.5430546825895664, + "grad_norm": 0.3389132872545462, + "learning_rate": 5.6203234468085576e-05, + "loss": 2.672, + "step": 33143 + }, + { + "epoch": 1.5431012407756595, + "grad_norm": 0.35029213311189966, + "learning_rate": 5.620054664977275e-05, + "loss": 2.8056, + "step": 33144 + }, + { + "epoch": 1.5431477989617526, + "grad_norm": 0.34011828949257733, + "learning_rate": 5.619785881326192e-05, + "loss": 2.8046, + "step": 33145 + }, + { + "epoch": 1.5431943571478455, + "grad_norm": 0.3421213051043061, + "learning_rate": 5.6195170958560994e-05, + "loss": 2.7495, + "step": 33146 + }, + { + "epoch": 1.5432409153339386, + "grad_norm": 0.312146732218237, + "learning_rate": 5.619248308567785e-05, + "loss": 2.7064, + "step": 33147 + }, + { + "epoch": 1.5432874735200317, + "grad_norm": 0.35906350702424894, + "learning_rate": 5.618979519462039e-05, + "loss": 2.813, + "step": 33148 + }, + { + "epoch": 1.5433340317061246, + "grad_norm": 0.31969300871736533, + "learning_rate": 5.618710728539648e-05, + "loss": 2.7244, + "step": 33149 + }, + { + "epoch": 1.5433805898922177, + "grad_norm": 0.311076938344574, + "learning_rate": 5.6184419358014015e-05, + "loss": 2.6086, + "step": 33150 + }, + { + "epoch": 1.5434271480783108, + "grad_norm": 0.35748331831999836, + "learning_rate": 5.61817314124809e-05, + "loss": 2.7391, + "step": 33151 + }, + { + "epoch": 1.543473706264404, + "grad_norm": 0.3133549273422109, + "learning_rate": 5.617904344880501e-05, + "loss": 2.784, + "step": 33152 + }, + { + "epoch": 1.543520264450497, + "grad_norm": 0.351072286979701, + "learning_rate": 5.617635546699424e-05, + "loss": 2.6812, + "step": 33153 + }, + { + "epoch": 1.5435668226365902, + "grad_norm": 0.3385142308709299, + "learning_rate": 5.6173667467056464e-05, + "loss": 2.6922, + "step": 33154 
+ }, + { + "epoch": 1.5436133808226833, + "grad_norm": 0.35143579719302936, + "learning_rate": 5.6170979448999586e-05, + "loss": 2.755, + "step": 33155 + }, + { + "epoch": 1.5436599390087762, + "grad_norm": 0.3386033184219816, + "learning_rate": 5.616829141283149e-05, + "loss": 2.6877, + "step": 33156 + }, + { + "epoch": 1.5437064971948693, + "grad_norm": 0.34717018969004015, + "learning_rate": 5.616560335856008e-05, + "loss": 2.742, + "step": 33157 + }, + { + "epoch": 1.5437530553809622, + "grad_norm": 0.34695964649317146, + "learning_rate": 5.616291528619322e-05, + "loss": 2.7195, + "step": 33158 + }, + { + "epoch": 1.5437996135670553, + "grad_norm": 0.36331067292993785, + "learning_rate": 5.61602271957388e-05, + "loss": 2.716, + "step": 33159 + }, + { + "epoch": 1.5438461717531484, + "grad_norm": 0.3893751085440253, + "learning_rate": 5.615753908720474e-05, + "loss": 2.7537, + "step": 33160 + }, + { + "epoch": 1.5438927299392415, + "grad_norm": 0.32104051144051504, + "learning_rate": 5.6154850960598906e-05, + "loss": 2.7521, + "step": 33161 + }, + { + "epoch": 1.5439392881253347, + "grad_norm": 0.41722271263518934, + "learning_rate": 5.6152162815929176e-05, + "loss": 2.7788, + "step": 33162 + }, + { + "epoch": 1.5439858463114278, + "grad_norm": 0.3474297920411224, + "learning_rate": 5.614947465320346e-05, + "loss": 2.8112, + "step": 33163 + }, + { + "epoch": 1.5440324044975209, + "grad_norm": 0.3548352201416759, + "learning_rate": 5.614678647242965e-05, + "loss": 2.7011, + "step": 33164 + }, + { + "epoch": 1.544078962683614, + "grad_norm": 0.35228799175666, + "learning_rate": 5.614409827361562e-05, + "loss": 2.7687, + "step": 33165 + }, + { + "epoch": 1.544125520869707, + "grad_norm": 0.34269895036702425, + "learning_rate": 5.614141005676927e-05, + "loss": 2.6449, + "step": 33166 + }, + { + "epoch": 1.5441720790558, + "grad_norm": 0.3171141138568157, + "learning_rate": 5.61387218218985e-05, + "loss": 2.7459, + "step": 33167 + }, + { + "epoch": 1.544218637241893, + "grad_norm": 0.3487515992424862, + "learning_rate": 5.6136033569011156e-05, + "loss": 2.7255, + "step": 33168 + }, + { + "epoch": 1.544265195427986, + "grad_norm": 0.3247967666790333, + "learning_rate": 5.613334529811518e-05, + "loss": 2.7281, + "step": 33169 + }, + { + "epoch": 1.5443117536140791, + "grad_norm": 0.3822776839074819, + "learning_rate": 5.613065700921843e-05, + "loss": 2.6934, + "step": 33170 + }, + { + "epoch": 1.5443583118001722, + "grad_norm": 0.3354396878696098, + "learning_rate": 5.6127968702328814e-05, + "loss": 2.7505, + "step": 33171 + }, + { + "epoch": 1.5444048699862654, + "grad_norm": 0.3782745902917971, + "learning_rate": 5.612528037745421e-05, + "loss": 2.875, + "step": 33172 + }, + { + "epoch": 1.5444514281723585, + "grad_norm": 0.35685773224048706, + "learning_rate": 5.6122592034602504e-05, + "loss": 2.7911, + "step": 33173 + }, + { + "epoch": 1.5444979863584516, + "grad_norm": 0.35944036305699795, + "learning_rate": 5.611990367378159e-05, + "loss": 2.8377, + "step": 33174 + }, + { + "epoch": 1.5445445445445447, + "grad_norm": 0.33978834142338904, + "learning_rate": 5.611721529499939e-05, + "loss": 2.7166, + "step": 33175 + }, + { + "epoch": 1.5445911027306376, + "grad_norm": 0.32737200647282727, + "learning_rate": 5.611452689826374e-05, + "loss": 2.7299, + "step": 33176 + }, + { + "epoch": 1.5446376609167307, + "grad_norm": 0.3183838255884295, + "learning_rate": 5.6111838483582566e-05, + "loss": 2.6436, + "step": 33177 + }, + { + "epoch": 1.5446842191028236, + "grad_norm": 0.30608394437580494, + 
"learning_rate": 5.6109150050963755e-05, + "loss": 2.6699, + "step": 33178 + }, + { + "epoch": 1.5447307772889167, + "grad_norm": 0.3309873652594135, + "learning_rate": 5.6106461600415173e-05, + "loss": 2.7688, + "step": 33179 + }, + { + "epoch": 1.5447773354750098, + "grad_norm": 0.31728400956210745, + "learning_rate": 5.610377313194475e-05, + "loss": 2.767, + "step": 33180 + }, + { + "epoch": 1.544823893661103, + "grad_norm": 0.3338395559003583, + "learning_rate": 5.610108464556033e-05, + "loss": 2.6785, + "step": 33181 + }, + { + "epoch": 1.544870451847196, + "grad_norm": 0.3269184661025159, + "learning_rate": 5.6098396141269836e-05, + "loss": 2.8452, + "step": 33182 + }, + { + "epoch": 1.5449170100332892, + "grad_norm": 0.33554403629855106, + "learning_rate": 5.609570761908115e-05, + "loss": 2.7075, + "step": 33183 + }, + { + "epoch": 1.5449635682193823, + "grad_norm": 0.3412904041629648, + "learning_rate": 5.609301907900217e-05, + "loss": 2.7769, + "step": 33184 + }, + { + "epoch": 1.5450101264054752, + "grad_norm": 0.3138074376617902, + "learning_rate": 5.609033052104077e-05, + "loss": 2.6478, + "step": 33185 + }, + { + "epoch": 1.5450566845915683, + "grad_norm": 0.35724381050602066, + "learning_rate": 5.608764194520485e-05, + "loss": 2.7794, + "step": 33186 + }, + { + "epoch": 1.5451032427776614, + "grad_norm": 0.3345910153730117, + "learning_rate": 5.60849533515023e-05, + "loss": 2.6467, + "step": 33187 + }, + { + "epoch": 1.5451498009637543, + "grad_norm": 0.34083225373390147, + "learning_rate": 5.6082264739941025e-05, + "loss": 2.6206, + "step": 33188 + }, + { + "epoch": 1.5451963591498474, + "grad_norm": 0.34927471213987704, + "learning_rate": 5.6079576110528894e-05, + "loss": 2.7296, + "step": 33189 + }, + { + "epoch": 1.5452429173359405, + "grad_norm": 0.35351603587673497, + "learning_rate": 5.60768874632738e-05, + "loss": 2.7608, + "step": 33190 + }, + { + "epoch": 1.5452894755220337, + "grad_norm": 0.32369790300301493, + "learning_rate": 5.607419879818365e-05, + "loss": 2.7217, + "step": 33191 + }, + { + "epoch": 1.5453360337081268, + "grad_norm": 0.3682835794226438, + "learning_rate": 5.6071510115266325e-05, + "loss": 2.7168, + "step": 33192 + }, + { + "epoch": 1.54538259189422, + "grad_norm": 0.33609765914439277, + "learning_rate": 5.606882141452972e-05, + "loss": 2.6512, + "step": 33193 + }, + { + "epoch": 1.545429150080313, + "grad_norm": 0.35396331300170175, + "learning_rate": 5.6066132695981714e-05, + "loss": 2.7546, + "step": 33194 + }, + { + "epoch": 1.545475708266406, + "grad_norm": 0.34219569954335605, + "learning_rate": 5.606344395963019e-05, + "loss": 2.7673, + "step": 33195 + }, + { + "epoch": 1.545522266452499, + "grad_norm": 0.3568688281080975, + "learning_rate": 5.606075520548307e-05, + "loss": 2.748, + "step": 33196 + }, + { + "epoch": 1.5455688246385921, + "grad_norm": 0.3342778032407426, + "learning_rate": 5.605806643354824e-05, + "loss": 2.7372, + "step": 33197 + }, + { + "epoch": 1.545615382824685, + "grad_norm": 0.33890132162876063, + "learning_rate": 5.6055377643833574e-05, + "loss": 2.7409, + "step": 33198 + }, + { + "epoch": 1.5456619410107781, + "grad_norm": 0.3316629249811501, + "learning_rate": 5.6052688836346956e-05, + "loss": 2.8653, + "step": 33199 + }, + { + "epoch": 1.5457084991968713, + "grad_norm": 0.32107452721478946, + "learning_rate": 5.60500000110963e-05, + "loss": 2.6437, + "step": 33200 + }, + { + "epoch": 1.5457550573829644, + "grad_norm": 0.360298104703344, + "learning_rate": 5.60473111680895e-05, + "loss": 2.7279, + "step": 33201 + 
}, + { + "epoch": 1.5458016155690575, + "grad_norm": 0.3130050573394655, + "learning_rate": 5.604462230733445e-05, + "loss": 2.7439, + "step": 33202 + }, + { + "epoch": 1.5458481737551506, + "grad_norm": 0.3526512096508953, + "learning_rate": 5.6041933428839e-05, + "loss": 2.8184, + "step": 33203 + }, + { + "epoch": 1.5458947319412437, + "grad_norm": 0.33916130493942115, + "learning_rate": 5.603924453261109e-05, + "loss": 2.7034, + "step": 33204 + }, + { + "epoch": 1.5459412901273366, + "grad_norm": 0.34270598500368227, + "learning_rate": 5.6036555618658596e-05, + "loss": 2.6392, + "step": 33205 + }, + { + "epoch": 1.5459878483134297, + "grad_norm": 0.3455798714692816, + "learning_rate": 5.60338666869894e-05, + "loss": 2.8043, + "step": 33206 + }, + { + "epoch": 1.5460344064995226, + "grad_norm": 0.34039189573142714, + "learning_rate": 5.6031177737611385e-05, + "loss": 2.8187, + "step": 33207 + }, + { + "epoch": 1.5460809646856157, + "grad_norm": 0.3125823094057982, + "learning_rate": 5.602848877053247e-05, + "loss": 2.7435, + "step": 33208 + }, + { + "epoch": 1.5461275228717088, + "grad_norm": 0.3486666041130198, + "learning_rate": 5.602579978576053e-05, + "loss": 2.7786, + "step": 33209 + }, + { + "epoch": 1.546174081057802, + "grad_norm": 0.3478359052469795, + "learning_rate": 5.602311078330346e-05, + "loss": 2.7652, + "step": 33210 + }, + { + "epoch": 1.546220639243895, + "grad_norm": 0.3235907513645084, + "learning_rate": 5.6020421763169164e-05, + "loss": 2.734, + "step": 33211 + }, + { + "epoch": 1.5462671974299882, + "grad_norm": 0.33863579566854657, + "learning_rate": 5.601773272536551e-05, + "loss": 2.7617, + "step": 33212 + }, + { + "epoch": 1.5463137556160813, + "grad_norm": 0.33700645328946705, + "learning_rate": 5.601504366990041e-05, + "loss": 2.7218, + "step": 33213 + }, + { + "epoch": 1.5463603138021744, + "grad_norm": 0.3197753607986159, + "learning_rate": 5.6012354596781745e-05, + "loss": 2.737, + "step": 33214 + }, + { + "epoch": 1.5464068719882673, + "grad_norm": 0.34295179646493745, + "learning_rate": 5.6009665506017426e-05, + "loss": 2.7705, + "step": 33215 + }, + { + "epoch": 1.5464534301743604, + "grad_norm": 0.34482197000313675, + "learning_rate": 5.600697639761532e-05, + "loss": 2.7959, + "step": 33216 + }, + { + "epoch": 1.5464999883604533, + "grad_norm": 0.3272502015573612, + "learning_rate": 5.600428727158333e-05, + "loss": 2.7529, + "step": 33217 + }, + { + "epoch": 1.5465465465465464, + "grad_norm": 0.3539721634939982, + "learning_rate": 5.600159812792934e-05, + "loss": 2.7705, + "step": 33218 + }, + { + "epoch": 1.5465931047326396, + "grad_norm": 0.3343894466640315, + "learning_rate": 5.5998908966661266e-05, + "loss": 2.6409, + "step": 33219 + }, + { + "epoch": 1.5466396629187327, + "grad_norm": 0.3482739662638819, + "learning_rate": 5.599621978778699e-05, + "loss": 2.7979, + "step": 33220 + }, + { + "epoch": 1.5466862211048258, + "grad_norm": 0.3479631005497971, + "learning_rate": 5.599353059131438e-05, + "loss": 2.7467, + "step": 33221 + }, + { + "epoch": 1.546732779290919, + "grad_norm": 0.3586102284359667, + "learning_rate": 5.5990841377251356e-05, + "loss": 2.7582, + "step": 33222 + }, + { + "epoch": 1.546779337477012, + "grad_norm": 0.35637976123290444, + "learning_rate": 5.59881521456058e-05, + "loss": 2.7655, + "step": 33223 + }, + { + "epoch": 1.546825895663105, + "grad_norm": 0.33626485812873863, + "learning_rate": 5.598546289638561e-05, + "loss": 2.7233, + "step": 33224 + }, + { + "epoch": 1.546872453849198, + "grad_norm": 0.3374641378180017, + 
"learning_rate": 5.598277362959867e-05, + "loss": 2.743, + "step": 33225 + }, + { + "epoch": 1.5469190120352911, + "grad_norm": 0.36885481832012174, + "learning_rate": 5.598008434525289e-05, + "loss": 2.7267, + "step": 33226 + }, + { + "epoch": 1.546965570221384, + "grad_norm": 0.3832540226982622, + "learning_rate": 5.597739504335614e-05, + "loss": 2.8347, + "step": 33227 + }, + { + "epoch": 1.5470121284074771, + "grad_norm": 0.36166616600062984, + "learning_rate": 5.5974705723916334e-05, + "loss": 2.7174, + "step": 33228 + }, + { + "epoch": 1.5470586865935703, + "grad_norm": 0.3298919448971255, + "learning_rate": 5.597201638694136e-05, + "loss": 2.7063, + "step": 33229 + }, + { + "epoch": 1.5471052447796634, + "grad_norm": 0.34291082246005655, + "learning_rate": 5.596932703243911e-05, + "loss": 2.674, + "step": 33230 + }, + { + "epoch": 1.5471518029657565, + "grad_norm": 0.3426595826706135, + "learning_rate": 5.5966637660417455e-05, + "loss": 2.707, + "step": 33231 + }, + { + "epoch": 1.5471983611518496, + "grad_norm": 0.32975522394373613, + "learning_rate": 5.5963948270884314e-05, + "loss": 2.6801, + "step": 33232 + }, + { + "epoch": 1.5472449193379427, + "grad_norm": 0.31336473707053863, + "learning_rate": 5.5961258863847575e-05, + "loss": 2.7006, + "step": 33233 + }, + { + "epoch": 1.5472914775240356, + "grad_norm": 0.32928426864404275, + "learning_rate": 5.595856943931512e-05, + "loss": 2.6866, + "step": 33234 + }, + { + "epoch": 1.5473380357101287, + "grad_norm": 0.3323825376287209, + "learning_rate": 5.595587999729486e-05, + "loss": 2.6742, + "step": 33235 + }, + { + "epoch": 1.5473845938962218, + "grad_norm": 0.3248861163654182, + "learning_rate": 5.5953190537794667e-05, + "loss": 2.7201, + "step": 33236 + }, + { + "epoch": 1.5474311520823147, + "grad_norm": 0.3546354776407601, + "learning_rate": 5.595050106082245e-05, + "loss": 2.7447, + "step": 33237 + }, + { + "epoch": 1.5474777102684079, + "grad_norm": 0.33429411212777316, + "learning_rate": 5.594781156638611e-05, + "loss": 2.6766, + "step": 33238 + }, + { + "epoch": 1.547524268454501, + "grad_norm": 0.3237810091357579, + "learning_rate": 5.594512205449353e-05, + "loss": 2.8566, + "step": 33239 + }, + { + "epoch": 1.547570826640594, + "grad_norm": 0.35175698687897716, + "learning_rate": 5.594243252515259e-05, + "loss": 2.7521, + "step": 33240 + }, + { + "epoch": 1.5476173848266872, + "grad_norm": 0.33509621036925535, + "learning_rate": 5.593974297837119e-05, + "loss": 2.5847, + "step": 33241 + }, + { + "epoch": 1.5476639430127803, + "grad_norm": 0.3627183179823136, + "learning_rate": 5.593705341415725e-05, + "loss": 2.8253, + "step": 33242 + }, + { + "epoch": 1.5477105011988734, + "grad_norm": 0.3536249317894852, + "learning_rate": 5.5934363832518634e-05, + "loss": 2.8503, + "step": 33243 + }, + { + "epoch": 1.5477570593849663, + "grad_norm": 0.34980787476289854, + "learning_rate": 5.593167423346325e-05, + "loss": 2.7083, + "step": 33244 + }, + { + "epoch": 1.5478036175710594, + "grad_norm": 0.36960260942146833, + "learning_rate": 5.592898461699897e-05, + "loss": 2.7184, + "step": 33245 + }, + { + "epoch": 1.5478501757571523, + "grad_norm": 0.36316720954277126, + "learning_rate": 5.592629498313371e-05, + "loss": 2.7673, + "step": 33246 + }, + { + "epoch": 1.5478967339432455, + "grad_norm": 0.33727774745915884, + "learning_rate": 5.592360533187537e-05, + "loss": 2.6367, + "step": 33247 + }, + { + "epoch": 1.5479432921293386, + "grad_norm": 0.3472062111047437, + "learning_rate": 5.5920915663231835e-05, + "loss": 2.6635, + "step": 
33248 + }, + { + "epoch": 1.5479898503154317, + "grad_norm": 0.33826412325311306, + "learning_rate": 5.591822597721097e-05, + "loss": 2.6752, + "step": 33249 + }, + { + "epoch": 1.5480364085015248, + "grad_norm": 0.33258446199195396, + "learning_rate": 5.5915536273820716e-05, + "loss": 2.7085, + "step": 33250 + }, + { + "epoch": 1.548082966687618, + "grad_norm": 0.36098201780172867, + "learning_rate": 5.591284655306894e-05, + "loss": 2.7549, + "step": 33251 + }, + { + "epoch": 1.548129524873711, + "grad_norm": 0.32272104366589094, + "learning_rate": 5.5910156814963546e-05, + "loss": 2.7386, + "step": 33252 + }, + { + "epoch": 1.5481760830598041, + "grad_norm": 0.349651239789219, + "learning_rate": 5.590746705951241e-05, + "loss": 2.7706, + "step": 33253 + }, + { + "epoch": 1.548222641245897, + "grad_norm": 0.35953717781168887, + "learning_rate": 5.5904777286723456e-05, + "loss": 2.6514, + "step": 33254 + }, + { + "epoch": 1.5482691994319902, + "grad_norm": 0.36597808016177935, + "learning_rate": 5.590208749660456e-05, + "loss": 2.7665, + "step": 33255 + }, + { + "epoch": 1.548315757618083, + "grad_norm": 0.350540716277969, + "learning_rate": 5.589939768916361e-05, + "loss": 2.7191, + "step": 33256 + }, + { + "epoch": 1.5483623158041762, + "grad_norm": 0.3508735510398282, + "learning_rate": 5.589670786440853e-05, + "loss": 2.9455, + "step": 33257 + }, + { + "epoch": 1.5484088739902693, + "grad_norm": 0.3660129446010836, + "learning_rate": 5.589401802234716e-05, + "loss": 2.7863, + "step": 33258 + }, + { + "epoch": 1.5484554321763624, + "grad_norm": 0.3273459733364794, + "learning_rate": 5.589132816298744e-05, + "loss": 2.6565, + "step": 33259 + }, + { + "epoch": 1.5485019903624555, + "grad_norm": 0.3492831655945884, + "learning_rate": 5.588863828633727e-05, + "loss": 2.6269, + "step": 33260 + }, + { + "epoch": 1.5485485485485486, + "grad_norm": 0.29838909783859935, + "learning_rate": 5.588594839240451e-05, + "loss": 2.5864, + "step": 33261 + }, + { + "epoch": 1.5485951067346417, + "grad_norm": 0.3567735127397041, + "learning_rate": 5.588325848119708e-05, + "loss": 2.718, + "step": 33262 + }, + { + "epoch": 1.5486416649207349, + "grad_norm": 0.31809147887205325, + "learning_rate": 5.588056855272285e-05, + "loss": 2.6837, + "step": 33263 + }, + { + "epoch": 1.5486882231068277, + "grad_norm": 0.3160190762707425, + "learning_rate": 5.587787860698974e-05, + "loss": 2.7328, + "step": 33264 + }, + { + "epoch": 1.5487347812929209, + "grad_norm": 0.3289321368423107, + "learning_rate": 5.587518864400565e-05, + "loss": 2.8344, + "step": 33265 + }, + { + "epoch": 1.5487813394790138, + "grad_norm": 0.31987060549465596, + "learning_rate": 5.587249866377844e-05, + "loss": 2.723, + "step": 33266 + }, + { + "epoch": 1.5488278976651069, + "grad_norm": 0.3414411126652175, + "learning_rate": 5.5869808666316035e-05, + "loss": 2.7981, + "step": 33267 + }, + { + "epoch": 1.5488744558512, + "grad_norm": 0.306178245417455, + "learning_rate": 5.5867118651626324e-05, + "loss": 2.678, + "step": 33268 + }, + { + "epoch": 1.548921014037293, + "grad_norm": 0.316618228518755, + "learning_rate": 5.586442861971719e-05, + "loss": 2.7045, + "step": 33269 + }, + { + "epoch": 1.5489675722233862, + "grad_norm": 0.33613892439588766, + "learning_rate": 5.5861738570596547e-05, + "loss": 2.6887, + "step": 33270 + }, + { + "epoch": 1.5490141304094793, + "grad_norm": 0.31255109980807144, + "learning_rate": 5.585904850427226e-05, + "loss": 2.8008, + "step": 33271 + }, + { + "epoch": 1.5490606885955724, + "grad_norm": 
0.31219107129936435, + "learning_rate": 5.585635842075224e-05, + "loss": 2.7243, + "step": 33272 + }, + { + "epoch": 1.5491072467816653, + "grad_norm": 0.357257164125703, + "learning_rate": 5.5853668320044395e-05, + "loss": 2.7952, + "step": 33273 + }, + { + "epoch": 1.5491538049677585, + "grad_norm": 0.33730370951801036, + "learning_rate": 5.585097820215662e-05, + "loss": 2.8902, + "step": 33274 + }, + { + "epoch": 1.5492003631538516, + "grad_norm": 0.38986530209159986, + "learning_rate": 5.5848288067096786e-05, + "loss": 2.6901, + "step": 33275 + }, + { + "epoch": 1.5492469213399445, + "grad_norm": 0.33194253328026396, + "learning_rate": 5.584559791487282e-05, + "loss": 2.74, + "step": 33276 + }, + { + "epoch": 1.5492934795260376, + "grad_norm": 0.4091258184499454, + "learning_rate": 5.584290774549258e-05, + "loss": 2.7338, + "step": 33277 + }, + { + "epoch": 1.5493400377121307, + "grad_norm": 0.36179651211268427, + "learning_rate": 5.5840217558963995e-05, + "loss": 2.783, + "step": 33278 + }, + { + "epoch": 1.5493865958982238, + "grad_norm": 0.36907175248225554, + "learning_rate": 5.583752735529494e-05, + "loss": 2.6532, + "step": 33279 + }, + { + "epoch": 1.549433154084317, + "grad_norm": 0.31660016450404277, + "learning_rate": 5.583483713449331e-05, + "loss": 2.6831, + "step": 33280 + }, + { + "epoch": 1.54947971227041, + "grad_norm": 0.3559908966919418, + "learning_rate": 5.583214689656703e-05, + "loss": 2.7646, + "step": 33281 + }, + { + "epoch": 1.5495262704565032, + "grad_norm": 0.31892371243914047, + "learning_rate": 5.582945664152396e-05, + "loss": 2.8069, + "step": 33282 + }, + { + "epoch": 1.549572828642596, + "grad_norm": 0.35413623469510663, + "learning_rate": 5.582676636937202e-05, + "loss": 2.7327, + "step": 33283 + }, + { + "epoch": 1.5496193868286892, + "grad_norm": 0.34965480783761477, + "learning_rate": 5.582407608011908e-05, + "loss": 2.7093, + "step": 33284 + }, + { + "epoch": 1.5496659450147823, + "grad_norm": 0.3270738879173025, + "learning_rate": 5.582138577377305e-05, + "loss": 2.6581, + "step": 33285 + }, + { + "epoch": 1.5497125032008752, + "grad_norm": 0.35576235790892763, + "learning_rate": 5.581869545034183e-05, + "loss": 2.6999, + "step": 33286 + }, + { + "epoch": 1.5497590613869683, + "grad_norm": 0.33576744003909975, + "learning_rate": 5.5816005109833315e-05, + "loss": 2.6105, + "step": 33287 + }, + { + "epoch": 1.5498056195730614, + "grad_norm": 0.32033516777609067, + "learning_rate": 5.5813314752255396e-05, + "loss": 2.6531, + "step": 33288 + }, + { + "epoch": 1.5498521777591545, + "grad_norm": 0.3355198461416582, + "learning_rate": 5.581062437761596e-05, + "loss": 2.6261, + "step": 33289 + }, + { + "epoch": 1.5498987359452476, + "grad_norm": 0.3405573932968797, + "learning_rate": 5.580793398592292e-05, + "loss": 2.7181, + "step": 33290 + }, + { + "epoch": 1.5499452941313407, + "grad_norm": 0.35592666524410566, + "learning_rate": 5.580524357718417e-05, + "loss": 2.7705, + "step": 33291 + }, + { + "epoch": 1.5499918523174339, + "grad_norm": 0.34206656220928644, + "learning_rate": 5.580255315140761e-05, + "loss": 2.7162, + "step": 33292 + }, + { + "epoch": 1.5500384105035268, + "grad_norm": 0.35723532777573624, + "learning_rate": 5.579986270860112e-05, + "loss": 2.7643, + "step": 33293 + }, + { + "epoch": 1.5500849686896199, + "grad_norm": 0.3594994122983917, + "learning_rate": 5.579717224877261e-05, + "loss": 2.7033, + "step": 33294 + }, + { + "epoch": 1.5501315268757128, + "grad_norm": 0.3396863567606321, + "learning_rate": 5.5794481771929975e-05, + 
"loss": 2.8046, + "step": 33295 + }, + { + "epoch": 1.5501780850618059, + "grad_norm": 0.34497513760270876, + "learning_rate": 5.57917912780811e-05, + "loss": 2.7201, + "step": 33296 + }, + { + "epoch": 1.550224643247899, + "grad_norm": 0.32593647942775067, + "learning_rate": 5.578910076723388e-05, + "loss": 2.7174, + "step": 33297 + }, + { + "epoch": 1.550271201433992, + "grad_norm": 0.34575622321391153, + "learning_rate": 5.578641023939622e-05, + "loss": 2.7078, + "step": 33298 + }, + { + "epoch": 1.5503177596200852, + "grad_norm": 0.3238441330921234, + "learning_rate": 5.578371969457602e-05, + "loss": 2.7991, + "step": 33299 + }, + { + "epoch": 1.5503643178061783, + "grad_norm": 0.36254156957434863, + "learning_rate": 5.578102913278117e-05, + "loss": 2.7153, + "step": 33300 + }, + { + "epoch": 1.5504108759922715, + "grad_norm": 0.3391492096227046, + "learning_rate": 5.577833855401958e-05, + "loss": 2.6968, + "step": 33301 + }, + { + "epoch": 1.5504574341783646, + "grad_norm": 0.34279905580716447, + "learning_rate": 5.5775647958299124e-05, + "loss": 2.7087, + "step": 33302 + }, + { + "epoch": 1.5505039923644575, + "grad_norm": 0.3680911233183446, + "learning_rate": 5.577295734562772e-05, + "loss": 2.7017, + "step": 33303 + }, + { + "epoch": 1.5505505505505506, + "grad_norm": 0.319395345220653, + "learning_rate": 5.5770266716013244e-05, + "loss": 2.7401, + "step": 33304 + }, + { + "epoch": 1.5505971087366435, + "grad_norm": 0.34974329009768623, + "learning_rate": 5.576757606946361e-05, + "loss": 2.7512, + "step": 33305 + }, + { + "epoch": 1.5506436669227366, + "grad_norm": 0.32338549643484144, + "learning_rate": 5.576488540598671e-05, + "loss": 2.6798, + "step": 33306 + }, + { + "epoch": 1.5506902251088297, + "grad_norm": 0.31516821349102114, + "learning_rate": 5.576219472559044e-05, + "loss": 2.7502, + "step": 33307 + }, + { + "epoch": 1.5507367832949228, + "grad_norm": 0.35060574909922154, + "learning_rate": 5.575950402828269e-05, + "loss": 2.8124, + "step": 33308 + }, + { + "epoch": 1.550783341481016, + "grad_norm": 0.30911926560664443, + "learning_rate": 5.5756813314071364e-05, + "loss": 2.6488, + "step": 33309 + }, + { + "epoch": 1.550829899667109, + "grad_norm": 0.3469702898689932, + "learning_rate": 5.575412258296438e-05, + "loss": 2.6928, + "step": 33310 + }, + { + "epoch": 1.5508764578532022, + "grad_norm": 0.32212502918192726, + "learning_rate": 5.575143183496958e-05, + "loss": 2.8136, + "step": 33311 + }, + { + "epoch": 1.550923016039295, + "grad_norm": 0.32161533367445216, + "learning_rate": 5.574874107009491e-05, + "loss": 2.6636, + "step": 33312 + }, + { + "epoch": 1.5509695742253882, + "grad_norm": 0.3406089884361752, + "learning_rate": 5.574605028834825e-05, + "loss": 2.7396, + "step": 33313 + }, + { + "epoch": 1.5510161324114813, + "grad_norm": 0.33284467095685344, + "learning_rate": 5.57433594897375e-05, + "loss": 2.6815, + "step": 33314 + }, + { + "epoch": 1.5510626905975742, + "grad_norm": 0.34484090465192896, + "learning_rate": 5.574066867427055e-05, + "loss": 2.7335, + "step": 33315 + }, + { + "epoch": 1.5511092487836673, + "grad_norm": 0.37413198891949895, + "learning_rate": 5.57379778419553e-05, + "loss": 2.8249, + "step": 33316 + }, + { + "epoch": 1.5511558069697604, + "grad_norm": 0.33621071227670557, + "learning_rate": 5.573528699279965e-05, + "loss": 2.6995, + "step": 33317 + }, + { + "epoch": 1.5512023651558535, + "grad_norm": 0.36143985086257097, + "learning_rate": 5.573259612681152e-05, + "loss": 2.7459, + "step": 33318 + }, + { + "epoch": 
1.5512489233419466, + "grad_norm": 0.3373741882483647, + "learning_rate": 5.572990524399877e-05, + "loss": 2.6885, + "step": 33319 + }, + { + "epoch": 1.5512954815280398, + "grad_norm": 0.35431119111188836, + "learning_rate": 5.572721434436932e-05, + "loss": 2.7268, + "step": 33320 + }, + { + "epoch": 1.5513420397141329, + "grad_norm": 0.346791996614779, + "learning_rate": 5.572452342793104e-05, + "loss": 2.7675, + "step": 33321 + }, + { + "epoch": 1.5513885979002258, + "grad_norm": 0.35199125092766975, + "learning_rate": 5.572183249469186e-05, + "loss": 2.7748, + "step": 33322 + }, + { + "epoch": 1.5514351560863189, + "grad_norm": 0.33518790223450434, + "learning_rate": 5.5719141544659667e-05, + "loss": 2.6543, + "step": 33323 + }, + { + "epoch": 1.551481714272412, + "grad_norm": 0.35297260260764585, + "learning_rate": 5.5716450577842364e-05, + "loss": 2.7615, + "step": 33324 + }, + { + "epoch": 1.5515282724585049, + "grad_norm": 0.3718300476667012, + "learning_rate": 5.5713759594247826e-05, + "loss": 2.6261, + "step": 33325 + }, + { + "epoch": 1.551574830644598, + "grad_norm": 0.3226338771671539, + "learning_rate": 5.5711068593883966e-05, + "loss": 2.7803, + "step": 33326 + }, + { + "epoch": 1.5516213888306911, + "grad_norm": 0.36668604355406337, + "learning_rate": 5.570837757675869e-05, + "loss": 2.8292, + "step": 33327 + }, + { + "epoch": 1.5516679470167842, + "grad_norm": 0.3561339419325891, + "learning_rate": 5.570568654287989e-05, + "loss": 2.6952, + "step": 33328 + }, + { + "epoch": 1.5517145052028773, + "grad_norm": 0.33972086343505487, + "learning_rate": 5.570299549225546e-05, + "loss": 2.7141, + "step": 33329 + }, + { + "epoch": 1.5517610633889705, + "grad_norm": 0.33606742205084594, + "learning_rate": 5.570030442489329e-05, + "loss": 2.7779, + "step": 33330 + }, + { + "epoch": 1.5518076215750636, + "grad_norm": 0.371750323319692, + "learning_rate": 5.569761334080132e-05, + "loss": 2.7738, + "step": 33331 + }, + { + "epoch": 1.5518541797611565, + "grad_norm": 0.35728307055554426, + "learning_rate": 5.569492223998738e-05, + "loss": 2.7228, + "step": 33332 + }, + { + "epoch": 1.5519007379472496, + "grad_norm": 0.3537158866404245, + "learning_rate": 5.569223112245943e-05, + "loss": 2.5868, + "step": 33333 + }, + { + "epoch": 1.5519472961333425, + "grad_norm": 0.36576616969578873, + "learning_rate": 5.5689539988225326e-05, + "loss": 2.7784, + "step": 33334 + }, + { + "epoch": 1.5519938543194356, + "grad_norm": 0.3399190768398132, + "learning_rate": 5.5686848837292985e-05, + "loss": 2.5822, + "step": 33335 + }, + { + "epoch": 1.5520404125055287, + "grad_norm": 0.35272371749139775, + "learning_rate": 5.56841576696703e-05, + "loss": 2.775, + "step": 33336 + }, + { + "epoch": 1.5520869706916218, + "grad_norm": 0.3585207209895485, + "learning_rate": 5.5681466485365185e-05, + "loss": 2.7157, + "step": 33337 + }, + { + "epoch": 1.552133528877715, + "grad_norm": 0.31969418326423965, + "learning_rate": 5.567877528438552e-05, + "loss": 2.6382, + "step": 33338 + }, + { + "epoch": 1.552180087063808, + "grad_norm": 0.34976385459652065, + "learning_rate": 5.56760840667392e-05, + "loss": 2.7982, + "step": 33339 + }, + { + "epoch": 1.5522266452499012, + "grad_norm": 0.33618123844826225, + "learning_rate": 5.567339283243414e-05, + "loss": 2.6923, + "step": 33340 + }, + { + "epoch": 1.5522732034359943, + "grad_norm": 0.3369703835796084, + "learning_rate": 5.567070158147824e-05, + "loss": 2.7704, + "step": 33341 + }, + { + "epoch": 1.5523197616220872, + "grad_norm": 0.3093948591014543, + 
"learning_rate": 5.566801031387937e-05, + "loss": 2.6402, + "step": 33342 + }, + { + "epoch": 1.5523663198081803, + "grad_norm": 0.34956232314546964, + "learning_rate": 5.566531902964546e-05, + "loss": 2.7833, + "step": 33343 + }, + { + "epoch": 1.5524128779942732, + "grad_norm": 0.3609666997256935, + "learning_rate": 5.56626277287844e-05, + "loss": 2.7309, + "step": 33344 + }, + { + "epoch": 1.5524594361803663, + "grad_norm": 0.3416348495797281, + "learning_rate": 5.565993641130408e-05, + "loss": 2.7966, + "step": 33345 + }, + { + "epoch": 1.5525059943664594, + "grad_norm": 0.3616794433547169, + "learning_rate": 5.5657245077212416e-05, + "loss": 2.7936, + "step": 33346 + }, + { + "epoch": 1.5525525525525525, + "grad_norm": 0.32513460193484556, + "learning_rate": 5.5654553726517275e-05, + "loss": 2.7454, + "step": 33347 + }, + { + "epoch": 1.5525991107386456, + "grad_norm": 0.35656267304967837, + "learning_rate": 5.565186235922658e-05, + "loss": 2.7329, + "step": 33348 + }, + { + "epoch": 1.5526456689247388, + "grad_norm": 0.34235742681052783, + "learning_rate": 5.564917097534823e-05, + "loss": 2.7641, + "step": 33349 + }, + { + "epoch": 1.5526922271108319, + "grad_norm": 0.3379933948590826, + "learning_rate": 5.564647957489013e-05, + "loss": 2.7427, + "step": 33350 + }, + { + "epoch": 1.552738785296925, + "grad_norm": 0.3302329685204595, + "learning_rate": 5.564378815786015e-05, + "loss": 2.7022, + "step": 33351 + }, + { + "epoch": 1.5527853434830179, + "grad_norm": 0.34964782304484127, + "learning_rate": 5.5641096724266215e-05, + "loss": 2.6809, + "step": 33352 + }, + { + "epoch": 1.552831901669111, + "grad_norm": 0.3472658520685641, + "learning_rate": 5.563840527411621e-05, + "loss": 2.8501, + "step": 33353 + }, + { + "epoch": 1.552878459855204, + "grad_norm": 0.35037476634056614, + "learning_rate": 5.563571380741805e-05, + "loss": 2.7541, + "step": 33354 + }, + { + "epoch": 1.552925018041297, + "grad_norm": 0.32037808913124566, + "learning_rate": 5.563302232417964e-05, + "loss": 2.7722, + "step": 33355 + }, + { + "epoch": 1.5529715762273901, + "grad_norm": 0.36832339974790657, + "learning_rate": 5.5630330824408837e-05, + "loss": 2.781, + "step": 33356 + }, + { + "epoch": 1.5530181344134832, + "grad_norm": 0.3360592654645675, + "learning_rate": 5.562763930811359e-05, + "loss": 2.7462, + "step": 33357 + }, + { + "epoch": 1.5530646925995764, + "grad_norm": 0.3369549886170329, + "learning_rate": 5.5624947775301764e-05, + "loss": 2.6283, + "step": 33358 + }, + { + "epoch": 1.5531112507856695, + "grad_norm": 0.33422807765140145, + "learning_rate": 5.5622256225981286e-05, + "loss": 2.6963, + "step": 33359 + }, + { + "epoch": 1.5531578089717626, + "grad_norm": 0.3485585026682428, + "learning_rate": 5.561956466016002e-05, + "loss": 2.6869, + "step": 33360 + }, + { + "epoch": 1.5532043671578555, + "grad_norm": 0.3824645522218255, + "learning_rate": 5.561687307784589e-05, + "loss": 2.7962, + "step": 33361 + }, + { + "epoch": 1.5532509253439486, + "grad_norm": 0.34563471650464456, + "learning_rate": 5.5614181479046776e-05, + "loss": 2.6806, + "step": 33362 + }, + { + "epoch": 1.5532974835300417, + "grad_norm": 0.33349683289380266, + "learning_rate": 5.5611489863770614e-05, + "loss": 2.7627, + "step": 33363 + }, + { + "epoch": 1.5533440417161346, + "grad_norm": 0.3832588398044697, + "learning_rate": 5.5608798232025285e-05, + "loss": 2.7212, + "step": 33364 + }, + { + "epoch": 1.5533905999022277, + "grad_norm": 0.3620761345318035, + "learning_rate": 5.560610658381867e-05, + "loss": 2.8208, + "step": 
33365 + }, + { + "epoch": 1.5534371580883208, + "grad_norm": 0.36915508800429936, + "learning_rate": 5.560341491915869e-05, + "loss": 2.7039, + "step": 33366 + }, + { + "epoch": 1.553483716274414, + "grad_norm": 0.3549940930768301, + "learning_rate": 5.5600723238053244e-05, + "loss": 2.7518, + "step": 33367 + }, + { + "epoch": 1.553530274460507, + "grad_norm": 0.3734271107137098, + "learning_rate": 5.559803154051023e-05, + "loss": 2.7639, + "step": 33368 + }, + { + "epoch": 1.5535768326466002, + "grad_norm": 0.3374501521171078, + "learning_rate": 5.559533982653753e-05, + "loss": 2.7098, + "step": 33369 + }, + { + "epoch": 1.5536233908326933, + "grad_norm": 0.3562021852421041, + "learning_rate": 5.559264809614308e-05, + "loss": 2.8934, + "step": 33370 + }, + { + "epoch": 1.5536699490187862, + "grad_norm": 0.36537932079295116, + "learning_rate": 5.5589956349334745e-05, + "loss": 2.6682, + "step": 33371 + }, + { + "epoch": 1.5537165072048793, + "grad_norm": 0.3338703206982366, + "learning_rate": 5.558726458612044e-05, + "loss": 2.7571, + "step": 33372 + }, + { + "epoch": 1.5537630653909724, + "grad_norm": 0.3333384709826755, + "learning_rate": 5.558457280650807e-05, + "loss": 2.8149, + "step": 33373 + }, + { + "epoch": 1.5538096235770653, + "grad_norm": 0.3507967874077334, + "learning_rate": 5.558188101050552e-05, + "loss": 2.6592, + "step": 33374 + }, + { + "epoch": 1.5538561817631584, + "grad_norm": 0.3301915962251323, + "learning_rate": 5.5579189198120696e-05, + "loss": 2.6932, + "step": 33375 + }, + { + "epoch": 1.5539027399492515, + "grad_norm": 0.3469558591380236, + "learning_rate": 5.5576497369361514e-05, + "loss": 2.7952, + "step": 33376 + }, + { + "epoch": 1.5539492981353447, + "grad_norm": 0.3197536022438742, + "learning_rate": 5.5573805524235854e-05, + "loss": 2.7119, + "step": 33377 + }, + { + "epoch": 1.5539958563214378, + "grad_norm": 0.33330285326396025, + "learning_rate": 5.557111366275163e-05, + "loss": 2.663, + "step": 33378 + }, + { + "epoch": 1.554042414507531, + "grad_norm": 0.3488004840343355, + "learning_rate": 5.556842178491673e-05, + "loss": 2.8348, + "step": 33379 + }, + { + "epoch": 1.554088972693624, + "grad_norm": 0.3527771074101076, + "learning_rate": 5.556572989073905e-05, + "loss": 2.7582, + "step": 33380 + }, + { + "epoch": 1.554135530879717, + "grad_norm": 0.3375762474368149, + "learning_rate": 5.556303798022652e-05, + "loss": 2.7079, + "step": 33381 + }, + { + "epoch": 1.55418208906581, + "grad_norm": 0.3633324090446946, + "learning_rate": 5.556034605338701e-05, + "loss": 2.7186, + "step": 33382 + }, + { + "epoch": 1.554228647251903, + "grad_norm": 0.36139816137682335, + "learning_rate": 5.555765411022845e-05, + "loss": 2.7595, + "step": 33383 + }, + { + "epoch": 1.554275205437996, + "grad_norm": 0.34552898363394685, + "learning_rate": 5.555496215075871e-05, + "loss": 2.7405, + "step": 33384 + }, + { + "epoch": 1.5543217636240891, + "grad_norm": 0.35723757790809235, + "learning_rate": 5.55522701749857e-05, + "loss": 2.7322, + "step": 33385 + }, + { + "epoch": 1.5543683218101823, + "grad_norm": 0.3589154009889013, + "learning_rate": 5.554957818291734e-05, + "loss": 2.808, + "step": 33386 + }, + { + "epoch": 1.5544148799962754, + "grad_norm": 0.36237469685647755, + "learning_rate": 5.55468861745615e-05, + "loss": 2.7107, + "step": 33387 + }, + { + "epoch": 1.5544614381823685, + "grad_norm": 0.32720690223276705, + "learning_rate": 5.5544194149926085e-05, + "loss": 2.7286, + "step": 33388 + }, + { + "epoch": 1.5545079963684616, + "grad_norm": 0.3347703784372412, 
+ "learning_rate": 5.554150210901902e-05, + "loss": 2.7233, + "step": 33389 + }, + { + "epoch": 1.5545545545545547, + "grad_norm": 0.3569214160677813, + "learning_rate": 5.5538810051848186e-05, + "loss": 2.766, + "step": 33390 + }, + { + "epoch": 1.5546011127406476, + "grad_norm": 0.3472159272959203, + "learning_rate": 5.553611797842151e-05, + "loss": 2.714, + "step": 33391 + }, + { + "epoch": 1.5546476709267407, + "grad_norm": 0.3646176191009767, + "learning_rate": 5.553342588874685e-05, + "loss": 2.6793, + "step": 33392 + }, + { + "epoch": 1.5546942291128336, + "grad_norm": 0.38148136898957713, + "learning_rate": 5.5530733782832136e-05, + "loss": 2.8365, + "step": 33393 + }, + { + "epoch": 1.5547407872989267, + "grad_norm": 0.3334693557985043, + "learning_rate": 5.552804166068526e-05, + "loss": 2.7165, + "step": 33394 + }, + { + "epoch": 1.5547873454850198, + "grad_norm": 0.3566786145028071, + "learning_rate": 5.552534952231414e-05, + "loss": 2.7199, + "step": 33395 + }, + { + "epoch": 1.554833903671113, + "grad_norm": 0.3670070675382762, + "learning_rate": 5.552265736772666e-05, + "loss": 2.7885, + "step": 33396 + }, + { + "epoch": 1.554880461857206, + "grad_norm": 0.3566911045389039, + "learning_rate": 5.551996519693072e-05, + "loss": 2.7727, + "step": 33397 + }, + { + "epoch": 1.5549270200432992, + "grad_norm": 0.36219240730773977, + "learning_rate": 5.551727300993422e-05, + "loss": 2.7209, + "step": 33398 + }, + { + "epoch": 1.5549735782293923, + "grad_norm": 0.35190523633046955, + "learning_rate": 5.5514580806745076e-05, + "loss": 2.7596, + "step": 33399 + }, + { + "epoch": 1.5550201364154852, + "grad_norm": 0.33656966253486525, + "learning_rate": 5.551188858737119e-05, + "loss": 2.6635, + "step": 33400 + }, + { + "epoch": 1.5550666946015783, + "grad_norm": 0.3330049268726283, + "learning_rate": 5.5509196351820437e-05, + "loss": 2.7277, + "step": 33401 + }, + { + "epoch": 1.5551132527876714, + "grad_norm": 0.30489399541529366, + "learning_rate": 5.550650410010073e-05, + "loss": 2.6705, + "step": 33402 + }, + { + "epoch": 1.5551598109737643, + "grad_norm": 0.34935661035311816, + "learning_rate": 5.5503811832219986e-05, + "loss": 2.701, + "step": 33403 + }, + { + "epoch": 1.5552063691598574, + "grad_norm": 0.31432279946127417, + "learning_rate": 5.550111954818611e-05, + "loss": 2.7134, + "step": 33404 + }, + { + "epoch": 1.5552529273459506, + "grad_norm": 0.3287133525570455, + "learning_rate": 5.549842724800697e-05, + "loss": 2.6888, + "step": 33405 + }, + { + "epoch": 1.5552994855320437, + "grad_norm": 0.3696390639110225, + "learning_rate": 5.549573493169049e-05, + "loss": 2.7302, + "step": 33406 + }, + { + "epoch": 1.5553460437181368, + "grad_norm": 0.33660721647524267, + "learning_rate": 5.549304259924457e-05, + "loss": 2.7911, + "step": 33407 + }, + { + "epoch": 1.55539260190423, + "grad_norm": 0.3607843958095608, + "learning_rate": 5.549035025067713e-05, + "loss": 2.6856, + "step": 33408 + }, + { + "epoch": 1.555439160090323, + "grad_norm": 0.332934108394471, + "learning_rate": 5.548765788599604e-05, + "loss": 2.7516, + "step": 33409 + }, + { + "epoch": 1.555485718276416, + "grad_norm": 0.36877495544823763, + "learning_rate": 5.5484965505209216e-05, + "loss": 2.6403, + "step": 33410 + }, + { + "epoch": 1.555532276462509, + "grad_norm": 0.3381072721931639, + "learning_rate": 5.5482273108324545e-05, + "loss": 2.8264, + "step": 33411 + }, + { + "epoch": 1.5555788346486021, + "grad_norm": 0.33621255306567877, + "learning_rate": 5.5479580695349955e-05, + "loss": 2.6858, + "step": 
33412 + }, + { + "epoch": 1.555625392834695, + "grad_norm": 0.34102475260969567, + "learning_rate": 5.547688826629334e-05, + "loss": 2.7042, + "step": 33413 + }, + { + "epoch": 1.5556719510207881, + "grad_norm": 0.31457402949632784, + "learning_rate": 5.547419582116259e-05, + "loss": 2.711, + "step": 33414 + }, + { + "epoch": 1.5557185092068813, + "grad_norm": 0.3539768283117873, + "learning_rate": 5.547150335996562e-05, + "loss": 2.8006, + "step": 33415 + }, + { + "epoch": 1.5557650673929744, + "grad_norm": 0.36396570105873177, + "learning_rate": 5.546881088271031e-05, + "loss": 2.701, + "step": 33416 + }, + { + "epoch": 1.5558116255790675, + "grad_norm": 0.3284502420792298, + "learning_rate": 5.546611838940459e-05, + "loss": 2.7514, + "step": 33417 + }, + { + "epoch": 1.5558581837651606, + "grad_norm": 0.365164783955525, + "learning_rate": 5.5463425880056366e-05, + "loss": 2.7069, + "step": 33418 + }, + { + "epoch": 1.5559047419512537, + "grad_norm": 0.3613535859997033, + "learning_rate": 5.54607333546735e-05, + "loss": 2.7291, + "step": 33419 + }, + { + "epoch": 1.5559513001373466, + "grad_norm": 0.3335560656286327, + "learning_rate": 5.545804081326393e-05, + "loss": 2.8056, + "step": 33420 + }, + { + "epoch": 1.5559978583234397, + "grad_norm": 0.34910531922631205, + "learning_rate": 5.545534825583556e-05, + "loss": 2.6412, + "step": 33421 + }, + { + "epoch": 1.5560444165095326, + "grad_norm": 0.33461915608384285, + "learning_rate": 5.545265568239626e-05, + "loss": 2.7018, + "step": 33422 + }, + { + "epoch": 1.5560909746956257, + "grad_norm": 0.34464232797203553, + "learning_rate": 5.544996309295397e-05, + "loss": 2.8158, + "step": 33423 + }, + { + "epoch": 1.5561375328817189, + "grad_norm": 0.3457507719508425, + "learning_rate": 5.544727048751657e-05, + "loss": 2.7798, + "step": 33424 + }, + { + "epoch": 1.556184091067812, + "grad_norm": 0.3433674479145432, + "learning_rate": 5.544457786609196e-05, + "loss": 2.721, + "step": 33425 + }, + { + "epoch": 1.556230649253905, + "grad_norm": 0.3599337503164102, + "learning_rate": 5.544188522868805e-05, + "loss": 2.6667, + "step": 33426 + }, + { + "epoch": 1.5562772074399982, + "grad_norm": 0.3411505143826022, + "learning_rate": 5.543919257531276e-05, + "loss": 2.7979, + "step": 33427 + }, + { + "epoch": 1.5563237656260913, + "grad_norm": 0.36454192986081263, + "learning_rate": 5.543649990597396e-05, + "loss": 2.7197, + "step": 33428 + }, + { + "epoch": 1.5563703238121844, + "grad_norm": 0.3198964394479134, + "learning_rate": 5.543380722067957e-05, + "loss": 2.6712, + "step": 33429 + }, + { + "epoch": 1.5564168819982773, + "grad_norm": 0.3501735099487659, + "learning_rate": 5.543111451943749e-05, + "loss": 2.685, + "step": 33430 + }, + { + "epoch": 1.5564634401843704, + "grad_norm": 0.3321329074873273, + "learning_rate": 5.5428421802255625e-05, + "loss": 2.7666, + "step": 33431 + }, + { + "epoch": 1.5565099983704633, + "grad_norm": 0.3895597578195053, + "learning_rate": 5.542572906914188e-05, + "loss": 2.6776, + "step": 33432 + }, + { + "epoch": 1.5565565565565564, + "grad_norm": 0.34022757747846477, + "learning_rate": 5.542303632010416e-05, + "loss": 2.6997, + "step": 33433 + }, + { + "epoch": 1.5566031147426496, + "grad_norm": 0.3481348471511306, + "learning_rate": 5.5420343555150353e-05, + "loss": 2.7558, + "step": 33434 + }, + { + "epoch": 1.5566496729287427, + "grad_norm": 0.3513301045483253, + "learning_rate": 5.5417650774288375e-05, + "loss": 2.7595, + "step": 33435 + }, + { + "epoch": 1.5566962311148358, + "grad_norm": 
0.3356302916771437, + "learning_rate": 5.541495797752614e-05, + "loss": 2.7349, + "step": 33436 + }, + { + "epoch": 1.556742789300929, + "grad_norm": 0.35297815399131144, + "learning_rate": 5.541226516487151e-05, + "loss": 2.6518, + "step": 33437 + }, + { + "epoch": 1.556789347487022, + "grad_norm": 0.34817157483092176, + "learning_rate": 5.5409572336332425e-05, + "loss": 2.7089, + "step": 33438 + }, + { + "epoch": 1.5568359056731151, + "grad_norm": 0.3403330202276081, + "learning_rate": 5.5406879491916785e-05, + "loss": 2.7353, + "step": 33439 + }, + { + "epoch": 1.556882463859208, + "grad_norm": 0.3672788419434118, + "learning_rate": 5.5404186631632485e-05, + "loss": 2.7051, + "step": 33440 + }, + { + "epoch": 1.5569290220453011, + "grad_norm": 0.344875872661301, + "learning_rate": 5.540149375548743e-05, + "loss": 2.734, + "step": 33441 + }, + { + "epoch": 1.556975580231394, + "grad_norm": 0.3819140580456619, + "learning_rate": 5.539880086348952e-05, + "loss": 2.7837, + "step": 33442 + }, + { + "epoch": 1.5570221384174872, + "grad_norm": 0.3692340614441255, + "learning_rate": 5.539610795564665e-05, + "loss": 2.6874, + "step": 33443 + }, + { + "epoch": 1.5570686966035803, + "grad_norm": 0.35041280451055606, + "learning_rate": 5.5393415031966744e-05, + "loss": 2.7796, + "step": 33444 + }, + { + "epoch": 1.5571152547896734, + "grad_norm": 0.38140605228029034, + "learning_rate": 5.5390722092457703e-05, + "loss": 2.6953, + "step": 33445 + }, + { + "epoch": 1.5571618129757665, + "grad_norm": 0.35076427008837985, + "learning_rate": 5.5388029137127416e-05, + "loss": 2.7942, + "step": 33446 + }, + { + "epoch": 1.5572083711618596, + "grad_norm": 0.44602436104636595, + "learning_rate": 5.53853361659838e-05, + "loss": 2.839, + "step": 33447 + }, + { + "epoch": 1.5572549293479527, + "grad_norm": 0.35896834977264946, + "learning_rate": 5.538264317903476e-05, + "loss": 2.7227, + "step": 33448 + }, + { + "epoch": 1.5573014875340456, + "grad_norm": 0.346448766435892, + "learning_rate": 5.5379950176288175e-05, + "loss": 2.6835, + "step": 33449 + }, + { + "epoch": 1.5573480457201387, + "grad_norm": 0.3843907744523429, + "learning_rate": 5.5377257157751984e-05, + "loss": 2.6679, + "step": 33450 + }, + { + "epoch": 1.5573946039062319, + "grad_norm": 0.34974504367619236, + "learning_rate": 5.537456412343406e-05, + "loss": 2.6215, + "step": 33451 + }, + { + "epoch": 1.5574411620923247, + "grad_norm": 0.33196796368122744, + "learning_rate": 5.537187107334232e-05, + "loss": 2.7892, + "step": 33452 + }, + { + "epoch": 1.5574877202784179, + "grad_norm": 0.3602061765170983, + "learning_rate": 5.5369178007484666e-05, + "loss": 2.7408, + "step": 33453 + }, + { + "epoch": 1.557534278464511, + "grad_norm": 0.3558482689908379, + "learning_rate": 5.536648492586901e-05, + "loss": 2.7352, + "step": 33454 + }, + { + "epoch": 1.557580836650604, + "grad_norm": 0.331631647902159, + "learning_rate": 5.536379182850324e-05, + "loss": 2.7692, + "step": 33455 + }, + { + "epoch": 1.5576273948366972, + "grad_norm": 0.372121854391132, + "learning_rate": 5.5361098715395274e-05, + "loss": 2.8318, + "step": 33456 + }, + { + "epoch": 1.5576739530227903, + "grad_norm": 0.35750038032766185, + "learning_rate": 5.535840558655302e-05, + "loss": 2.7296, + "step": 33457 + }, + { + "epoch": 1.5577205112088834, + "grad_norm": 0.37451956501004746, + "learning_rate": 5.5355712441984376e-05, + "loss": 2.7195, + "step": 33458 + }, + { + "epoch": 1.5577670693949763, + "grad_norm": 0.3506767734205793, + "learning_rate": 5.535301928169723e-05, + "loss": 
2.7035, + "step": 33459 + }, + { + "epoch": 1.5578136275810694, + "grad_norm": 0.3625580837517946, + "learning_rate": 5.5350326105699516e-05, + "loss": 2.6403, + "step": 33460 + }, + { + "epoch": 1.5578601857671626, + "grad_norm": 0.334208986167028, + "learning_rate": 5.53476329139991e-05, + "loss": 2.6985, + "step": 33461 + }, + { + "epoch": 1.5579067439532555, + "grad_norm": 0.3577334516554652, + "learning_rate": 5.534493970660392e-05, + "loss": 2.7597, + "step": 33462 + }, + { + "epoch": 1.5579533021393486, + "grad_norm": 0.34103856230451285, + "learning_rate": 5.534224648352187e-05, + "loss": 2.7097, + "step": 33463 + }, + { + "epoch": 1.5579998603254417, + "grad_norm": 0.33067720025786834, + "learning_rate": 5.5339553244760856e-05, + "loss": 2.7888, + "step": 33464 + }, + { + "epoch": 1.5580464185115348, + "grad_norm": 0.3483003043683003, + "learning_rate": 5.5336859990328774e-05, + "loss": 2.7766, + "step": 33465 + }, + { + "epoch": 1.558092976697628, + "grad_norm": 0.3364346411626363, + "learning_rate": 5.533416672023353e-05, + "loss": 2.7729, + "step": 33466 + }, + { + "epoch": 1.558139534883721, + "grad_norm": 0.3469729661119866, + "learning_rate": 5.5331473434483026e-05, + "loss": 2.6422, + "step": 33467 + }, + { + "epoch": 1.5581860930698141, + "grad_norm": 0.345821389389218, + "learning_rate": 5.532878013308519e-05, + "loss": 2.7611, + "step": 33468 + }, + { + "epoch": 1.558232651255907, + "grad_norm": 0.35715642184320046, + "learning_rate": 5.532608681604789e-05, + "loss": 2.6906, + "step": 33469 + }, + { + "epoch": 1.5582792094420002, + "grad_norm": 0.3559202995978614, + "learning_rate": 5.532339348337907e-05, + "loss": 2.7225, + "step": 33470 + }, + { + "epoch": 1.558325767628093, + "grad_norm": 0.3654810657530525, + "learning_rate": 5.532070013508661e-05, + "loss": 2.8906, + "step": 33471 + }, + { + "epoch": 1.5583723258141862, + "grad_norm": 0.3373342853329375, + "learning_rate": 5.531800677117841e-05, + "loss": 2.8088, + "step": 33472 + }, + { + "epoch": 1.5584188840002793, + "grad_norm": 0.3667780498371531, + "learning_rate": 5.53153133916624e-05, + "loss": 2.6574, + "step": 33473 + }, + { + "epoch": 1.5584654421863724, + "grad_norm": 0.3253051210081486, + "learning_rate": 5.5312619996546455e-05, + "loss": 2.7345, + "step": 33474 + }, + { + "epoch": 1.5585120003724655, + "grad_norm": 0.3860822847821775, + "learning_rate": 5.530992658583849e-05, + "loss": 2.802, + "step": 33475 + }, + { + "epoch": 1.5585585585585586, + "grad_norm": 0.336000958283527, + "learning_rate": 5.530723315954642e-05, + "loss": 2.7394, + "step": 33476 + }, + { + "epoch": 1.5586051167446517, + "grad_norm": 0.37070453096878364, + "learning_rate": 5.530453971767815e-05, + "loss": 2.7043, + "step": 33477 + }, + { + "epoch": 1.5586516749307449, + "grad_norm": 0.3386709794411424, + "learning_rate": 5.5301846260241575e-05, + "loss": 2.7441, + "step": 33478 + }, + { + "epoch": 1.5586982331168378, + "grad_norm": 0.33753462162647985, + "learning_rate": 5.529915278724459e-05, + "loss": 2.8424, + "step": 33479 + }, + { + "epoch": 1.5587447913029309, + "grad_norm": 0.36392640971890144, + "learning_rate": 5.5296459298695126e-05, + "loss": 2.6988, + "step": 33480 + }, + { + "epoch": 1.5587913494890238, + "grad_norm": 0.3423589145910604, + "learning_rate": 5.529376579460108e-05, + "loss": 2.6211, + "step": 33481 + }, + { + "epoch": 1.5588379076751169, + "grad_norm": 0.3593709896892263, + "learning_rate": 5.529107227497035e-05, + "loss": 2.6588, + "step": 33482 + }, + { + "epoch": 1.55888446586121, + "grad_norm": 
0.3199258191588442, + "learning_rate": 5.528837873981083e-05, + "loss": 2.7578, + "step": 33483 + }, + { + "epoch": 1.558931024047303, + "grad_norm": 0.3923030814553268, + "learning_rate": 5.528568518913047e-05, + "loss": 2.6697, + "step": 33484 + }, + { + "epoch": 1.5589775822333962, + "grad_norm": 0.315943339091007, + "learning_rate": 5.528299162293712e-05, + "loss": 2.7365, + "step": 33485 + }, + { + "epoch": 1.5590241404194893, + "grad_norm": 0.3705567304863792, + "learning_rate": 5.528029804123872e-05, + "loss": 2.746, + "step": 33486 + }, + { + "epoch": 1.5590706986055825, + "grad_norm": 0.3540695870031709, + "learning_rate": 5.527760444404315e-05, + "loss": 2.74, + "step": 33487 + }, + { + "epoch": 1.5591172567916753, + "grad_norm": 0.3799485921411704, + "learning_rate": 5.527491083135834e-05, + "loss": 2.7718, + "step": 33488 + }, + { + "epoch": 1.5591638149777685, + "grad_norm": 0.3473850306362868, + "learning_rate": 5.527221720319219e-05, + "loss": 2.8064, + "step": 33489 + }, + { + "epoch": 1.5592103731638616, + "grad_norm": 0.3443101334534744, + "learning_rate": 5.526952355955261e-05, + "loss": 2.6616, + "step": 33490 + }, + { + "epoch": 1.5592569313499545, + "grad_norm": 0.3690416441233206, + "learning_rate": 5.526682990044748e-05, + "loss": 2.7899, + "step": 33491 + }, + { + "epoch": 1.5593034895360476, + "grad_norm": 0.31346880232045965, + "learning_rate": 5.526413622588473e-05, + "loss": 2.7329, + "step": 33492 + }, + { + "epoch": 1.5593500477221407, + "grad_norm": 0.37469580271213415, + "learning_rate": 5.526144253587226e-05, + "loss": 2.7427, + "step": 33493 + }, + { + "epoch": 1.5593966059082338, + "grad_norm": 0.3242740404637105, + "learning_rate": 5.5258748830417974e-05, + "loss": 2.643, + "step": 33494 + }, + { + "epoch": 1.559443164094327, + "grad_norm": 0.3535996924214293, + "learning_rate": 5.525605510952978e-05, + "loss": 2.7451, + "step": 33495 + }, + { + "epoch": 1.55948972228042, + "grad_norm": 0.3696555253812029, + "learning_rate": 5.525336137321557e-05, + "loss": 2.709, + "step": 33496 + }, + { + "epoch": 1.5595362804665132, + "grad_norm": 0.3650065428155276, + "learning_rate": 5.5250667621483285e-05, + "loss": 2.7526, + "step": 33497 + }, + { + "epoch": 1.559582838652606, + "grad_norm": 0.344243118638841, + "learning_rate": 5.52479738543408e-05, + "loss": 2.64, + "step": 33498 + }, + { + "epoch": 1.5596293968386992, + "grad_norm": 0.3540965391912752, + "learning_rate": 5.524528007179603e-05, + "loss": 2.6426, + "step": 33499 + }, + { + "epoch": 1.5596759550247923, + "grad_norm": 0.3585494529475405, + "learning_rate": 5.5242586273856865e-05, + "loss": 2.6455, + "step": 33500 + }, + { + "epoch": 1.5597225132108852, + "grad_norm": 0.33359891008974013, + "learning_rate": 5.523989246053123e-05, + "loss": 2.7723, + "step": 33501 + }, + { + "epoch": 1.5597690713969783, + "grad_norm": 0.3470900738678322, + "learning_rate": 5.523719863182704e-05, + "loss": 2.7047, + "step": 33502 + }, + { + "epoch": 1.5598156295830714, + "grad_norm": 0.35712638126977725, + "learning_rate": 5.523450478775216e-05, + "loss": 2.6788, + "step": 33503 + }, + { + "epoch": 1.5598621877691645, + "grad_norm": 0.3625955078435422, + "learning_rate": 5.523181092831456e-05, + "loss": 2.6822, + "step": 33504 + }, + { + "epoch": 1.5599087459552576, + "grad_norm": 0.3382444475048453, + "learning_rate": 5.5229117053522083e-05, + "loss": 2.7495, + "step": 33505 + }, + { + "epoch": 1.5599553041413508, + "grad_norm": 0.35954183105420684, + "learning_rate": 5.522642316338268e-05, + "loss": 2.7007, + 
"step": 33506 + }, + { + "epoch": 1.5600018623274439, + "grad_norm": 0.356078965972228, + "learning_rate": 5.522372925790422e-05, + "loss": 2.6687, + "step": 33507 + }, + { + "epoch": 1.5600484205135368, + "grad_norm": 0.3286572250096421, + "learning_rate": 5.522103533709465e-05, + "loss": 2.705, + "step": 33508 + }, + { + "epoch": 1.5600949786996299, + "grad_norm": 0.3562602844905755, + "learning_rate": 5.521834140096185e-05, + "loss": 2.6915, + "step": 33509 + }, + { + "epoch": 1.5601415368857228, + "grad_norm": 0.3371732474243016, + "learning_rate": 5.521564744951374e-05, + "loss": 2.7161, + "step": 33510 + }, + { + "epoch": 1.5601880950718159, + "grad_norm": 0.33506163563117963, + "learning_rate": 5.52129534827582e-05, + "loss": 2.6918, + "step": 33511 + }, + { + "epoch": 1.560234653257909, + "grad_norm": 0.32303617889851055, + "learning_rate": 5.521025950070317e-05, + "loss": 2.7548, + "step": 33512 + }, + { + "epoch": 1.5602812114440021, + "grad_norm": 0.32993784358700695, + "learning_rate": 5.520756550335654e-05, + "loss": 2.6937, + "step": 33513 + }, + { + "epoch": 1.5603277696300952, + "grad_norm": 0.3209738632222962, + "learning_rate": 5.5204871490726205e-05, + "loss": 2.7145, + "step": 33514 + }, + { + "epoch": 1.5603743278161883, + "grad_norm": 0.3410188913068448, + "learning_rate": 5.520217746282009e-05, + "loss": 2.6263, + "step": 33515 + }, + { + "epoch": 1.5604208860022815, + "grad_norm": 0.34751737037332014, + "learning_rate": 5.51994834196461e-05, + "loss": 2.7212, + "step": 33516 + }, + { + "epoch": 1.5604674441883746, + "grad_norm": 0.32973877968685944, + "learning_rate": 5.5196789361212155e-05, + "loss": 2.8241, + "step": 33517 + }, + { + "epoch": 1.5605140023744675, + "grad_norm": 0.3290042817209646, + "learning_rate": 5.5194095287526114e-05, + "loss": 2.8092, + "step": 33518 + }, + { + "epoch": 1.5605605605605606, + "grad_norm": 0.31472978818151826, + "learning_rate": 5.5191401198595925e-05, + "loss": 2.794, + "step": 33519 + }, + { + "epoch": 1.5606071187466535, + "grad_norm": 0.3229172228502022, + "learning_rate": 5.518870709442949e-05, + "loss": 2.7003, + "step": 33520 + }, + { + "epoch": 1.5606536769327466, + "grad_norm": 0.3323306067785912, + "learning_rate": 5.5186012975034705e-05, + "loss": 2.8043, + "step": 33521 + }, + { + "epoch": 1.5607002351188397, + "grad_norm": 0.3347784851379577, + "learning_rate": 5.5183318840419504e-05, + "loss": 2.7705, + "step": 33522 + }, + { + "epoch": 1.5607467933049328, + "grad_norm": 0.3466167770699726, + "learning_rate": 5.5180624690591766e-05, + "loss": 2.723, + "step": 33523 + }, + { + "epoch": 1.560793351491026, + "grad_norm": 0.3277937462289472, + "learning_rate": 5.5177930525559394e-05, + "loss": 2.7214, + "step": 33524 + }, + { + "epoch": 1.560839909677119, + "grad_norm": 0.3435270347042847, + "learning_rate": 5.51752363453303e-05, + "loss": 2.7939, + "step": 33525 + }, + { + "epoch": 1.5608864678632122, + "grad_norm": 0.3231425436015704, + "learning_rate": 5.5172542149912406e-05, + "loss": 2.8711, + "step": 33526 + }, + { + "epoch": 1.5609330260493053, + "grad_norm": 0.3409633733191957, + "learning_rate": 5.516984793931361e-05, + "loss": 2.7344, + "step": 33527 + }, + { + "epoch": 1.5609795842353982, + "grad_norm": 0.31976408741481394, + "learning_rate": 5.516715371354182e-05, + "loss": 2.5762, + "step": 33528 + }, + { + "epoch": 1.5610261424214913, + "grad_norm": 0.34453857875879434, + "learning_rate": 5.5164459472604945e-05, + "loss": 2.6289, + "step": 33529 + }, + { + "epoch": 1.5610727006075842, + "grad_norm": 
0.33540768219416284, + "learning_rate": 5.5161765216510886e-05, + "loss": 2.6894, + "step": 33530 + }, + { + "epoch": 1.5611192587936773, + "grad_norm": 0.3541154028368624, + "learning_rate": 5.5159070945267556e-05, + "loss": 2.7187, + "step": 33531 + }, + { + "epoch": 1.5611658169797704, + "grad_norm": 0.3401684094772755, + "learning_rate": 5.515637665888286e-05, + "loss": 2.6079, + "step": 33532 + }, + { + "epoch": 1.5612123751658635, + "grad_norm": 0.31925275597295544, + "learning_rate": 5.51536823573647e-05, + "loss": 2.6635, + "step": 33533 + }, + { + "epoch": 1.5612589333519566, + "grad_norm": 0.36599728582814806, + "learning_rate": 5.5150988040720995e-05, + "loss": 2.7533, + "step": 33534 + }, + { + "epoch": 1.5613054915380498, + "grad_norm": 0.3202411268729306, + "learning_rate": 5.514829370895966e-05, + "loss": 2.7259, + "step": 33535 + }, + { + "epoch": 1.5613520497241429, + "grad_norm": 0.34376889850049064, + "learning_rate": 5.5145599362088594e-05, + "loss": 2.7691, + "step": 33536 + }, + { + "epoch": 1.5613986079102358, + "grad_norm": 0.33134220189436686, + "learning_rate": 5.514290500011569e-05, + "loss": 2.7423, + "step": 33537 + }, + { + "epoch": 1.5614451660963289, + "grad_norm": 0.33971179216720127, + "learning_rate": 5.514021062304885e-05, + "loss": 2.7586, + "step": 33538 + }, + { + "epoch": 1.561491724282422, + "grad_norm": 0.33903656937797594, + "learning_rate": 5.5137516230896013e-05, + "loss": 2.7292, + "step": 33539 + }, + { + "epoch": 1.561538282468515, + "grad_norm": 0.3482533568768695, + "learning_rate": 5.5134821823665074e-05, + "loss": 2.693, + "step": 33540 + }, + { + "epoch": 1.561584840654608, + "grad_norm": 0.3525521943576745, + "learning_rate": 5.5132127401363934e-05, + "loss": 2.6772, + "step": 33541 + }, + { + "epoch": 1.5616313988407011, + "grad_norm": 0.3704201608827421, + "learning_rate": 5.51294329640005e-05, + "loss": 2.734, + "step": 33542 + }, + { + "epoch": 1.5616779570267942, + "grad_norm": 0.3779597393655251, + "learning_rate": 5.51267385115827e-05, + "loss": 2.7031, + "step": 33543 + }, + { + "epoch": 1.5617245152128874, + "grad_norm": 0.3463902296116382, + "learning_rate": 5.5124044044118426e-05, + "loss": 2.7027, + "step": 33544 + }, + { + "epoch": 1.5617710733989805, + "grad_norm": 0.39305499112228465, + "learning_rate": 5.512134956161558e-05, + "loss": 2.7494, + "step": 33545 + }, + { + "epoch": 1.5618176315850736, + "grad_norm": 0.35455216117740646, + "learning_rate": 5.511865506408207e-05, + "loss": 2.7188, + "step": 33546 + }, + { + "epoch": 1.5618641897711665, + "grad_norm": 0.35495350091585304, + "learning_rate": 5.511596055152583e-05, + "loss": 2.6908, + "step": 33547 + }, + { + "epoch": 1.5619107479572596, + "grad_norm": 0.3747869952398969, + "learning_rate": 5.5113266023954735e-05, + "loss": 2.7052, + "step": 33548 + }, + { + "epoch": 1.5619573061433527, + "grad_norm": 0.35899572850759914, + "learning_rate": 5.5110571481376726e-05, + "loss": 2.756, + "step": 33549 + }, + { + "epoch": 1.5620038643294456, + "grad_norm": 0.34355336523310553, + "learning_rate": 5.510787692379968e-05, + "loss": 2.8266, + "step": 33550 + }, + { + "epoch": 1.5620504225155387, + "grad_norm": 0.34705550038481453, + "learning_rate": 5.510518235123151e-05, + "loss": 2.6704, + "step": 33551 + }, + { + "epoch": 1.5620969807016318, + "grad_norm": 0.3604086014354453, + "learning_rate": 5.510248776368014e-05, + "loss": 2.8016, + "step": 33552 + }, + { + "epoch": 1.562143538887725, + "grad_norm": 0.36834712652597656, + "learning_rate": 5.509979316115349e-05, + 
"loss": 2.7313, + "step": 33553 + }, + { + "epoch": 1.562190097073818, + "grad_norm": 0.3395537156002042, + "learning_rate": 5.509709854365943e-05, + "loss": 2.7786, + "step": 33554 + }, + { + "epoch": 1.5622366552599112, + "grad_norm": 0.338851719344283, + "learning_rate": 5.5094403911205885e-05, + "loss": 2.7843, + "step": 33555 + }, + { + "epoch": 1.5622832134460043, + "grad_norm": 0.39172015481667516, + "learning_rate": 5.509170926380076e-05, + "loss": 2.5911, + "step": 33556 + }, + { + "epoch": 1.5623297716320972, + "grad_norm": 0.33351557236856116, + "learning_rate": 5.5089014601451986e-05, + "loss": 2.6815, + "step": 33557 + }, + { + "epoch": 1.5623763298181903, + "grad_norm": 0.37184946067459357, + "learning_rate": 5.508631992416746e-05, + "loss": 2.644, + "step": 33558 + }, + { + "epoch": 1.5624228880042832, + "grad_norm": 0.3569509327652908, + "learning_rate": 5.508362523195507e-05, + "loss": 2.7216, + "step": 33559 + }, + { + "epoch": 1.5624694461903763, + "grad_norm": 0.34380847167346873, + "learning_rate": 5.508093052482275e-05, + "loss": 2.6721, + "step": 33560 + }, + { + "epoch": 1.5625160043764694, + "grad_norm": 0.33714623836937396, + "learning_rate": 5.507823580277841e-05, + "loss": 2.7606, + "step": 33561 + }, + { + "epoch": 1.5625625625625625, + "grad_norm": 0.36477936976957887, + "learning_rate": 5.507554106582994e-05, + "loss": 2.6564, + "step": 33562 + }, + { + "epoch": 1.5626091207486557, + "grad_norm": 0.33617148178608824, + "learning_rate": 5.507284631398525e-05, + "loss": 2.6464, + "step": 33563 + }, + { + "epoch": 1.5626556789347488, + "grad_norm": 0.3351084096507938, + "learning_rate": 5.507015154725226e-05, + "loss": 2.6296, + "step": 33564 + }, + { + "epoch": 1.5627022371208419, + "grad_norm": 0.3558136836050739, + "learning_rate": 5.506745676563887e-05, + "loss": 2.8075, + "step": 33565 + }, + { + "epoch": 1.562748795306935, + "grad_norm": 0.33812758198764187, + "learning_rate": 5.506476196915299e-05, + "loss": 2.7985, + "step": 33566 + }, + { + "epoch": 1.562795353493028, + "grad_norm": 0.36067772265511666, + "learning_rate": 5.506206715780254e-05, + "loss": 2.7192, + "step": 33567 + }, + { + "epoch": 1.562841911679121, + "grad_norm": 0.37110483784475146, + "learning_rate": 5.505937233159543e-05, + "loss": 2.7289, + "step": 33568 + }, + { + "epoch": 1.562888469865214, + "grad_norm": 0.33919609654109395, + "learning_rate": 5.505667749053954e-05, + "loss": 2.7956, + "step": 33569 + }, + { + "epoch": 1.562935028051307, + "grad_norm": 0.3546483189129538, + "learning_rate": 5.505398263464282e-05, + "loss": 2.6714, + "step": 33570 + }, + { + "epoch": 1.5629815862374001, + "grad_norm": 0.3768111227768876, + "learning_rate": 5.5051287763913154e-05, + "loss": 2.6219, + "step": 33571 + }, + { + "epoch": 1.5630281444234932, + "grad_norm": 0.33993217360233197, + "learning_rate": 5.504859287835845e-05, + "loss": 2.6881, + "step": 33572 + }, + { + "epoch": 1.5630747026095864, + "grad_norm": 0.3516843598089847, + "learning_rate": 5.5045897977986626e-05, + "loss": 2.8154, + "step": 33573 + }, + { + "epoch": 1.5631212607956795, + "grad_norm": 0.3510976465693414, + "learning_rate": 5.504320306280559e-05, + "loss": 2.716, + "step": 33574 + }, + { + "epoch": 1.5631678189817726, + "grad_norm": 0.36544920555826765, + "learning_rate": 5.504050813282324e-05, + "loss": 2.7955, + "step": 33575 + }, + { + "epoch": 1.5632143771678655, + "grad_norm": 0.380221947545911, + "learning_rate": 5.5037813188047514e-05, + "loss": 2.6434, + "step": 33576 + }, + { + "epoch": 1.5632609353539586, + 
"grad_norm": 0.3716557237759272, + "learning_rate": 5.503511822848629e-05, + "loss": 2.7721, + "step": 33577 + }, + { + "epoch": 1.5633074935400517, + "grad_norm": 0.34711445289305987, + "learning_rate": 5.5032423254147494e-05, + "loss": 2.7763, + "step": 33578 + }, + { + "epoch": 1.5633540517261446, + "grad_norm": 0.3947887863552929, + "learning_rate": 5.5029728265039024e-05, + "loss": 2.7377, + "step": 33579 + }, + { + "epoch": 1.5634006099122377, + "grad_norm": 0.33598369855031934, + "learning_rate": 5.502703326116882e-05, + "loss": 2.691, + "step": 33580 + }, + { + "epoch": 1.5634471680983308, + "grad_norm": 0.40235297095203254, + "learning_rate": 5.502433824254474e-05, + "loss": 2.7134, + "step": 33581 + }, + { + "epoch": 1.563493726284424, + "grad_norm": 0.35472968663672466, + "learning_rate": 5.502164320917473e-05, + "loss": 2.6986, + "step": 33582 + }, + { + "epoch": 1.563540284470517, + "grad_norm": 0.36706427205012077, + "learning_rate": 5.50189481610667e-05, + "loss": 2.7336, + "step": 33583 + }, + { + "epoch": 1.5635868426566102, + "grad_norm": 0.35566368945177973, + "learning_rate": 5.501625309822854e-05, + "loss": 2.7121, + "step": 33584 + }, + { + "epoch": 1.5636334008427033, + "grad_norm": 0.3773021934543136, + "learning_rate": 5.501355802066819e-05, + "loss": 2.7799, + "step": 33585 + }, + { + "epoch": 1.5636799590287962, + "grad_norm": 0.3741422731310821, + "learning_rate": 5.5010862928393526e-05, + "loss": 2.7239, + "step": 33586 + }, + { + "epoch": 1.5637265172148893, + "grad_norm": 0.3513510210208137, + "learning_rate": 5.500816782141248e-05, + "loss": 2.7523, + "step": 33587 + }, + { + "epoch": 1.5637730754009824, + "grad_norm": 0.36981486511071326, + "learning_rate": 5.500547269973295e-05, + "loss": 2.6632, + "step": 33588 + }, + { + "epoch": 1.5638196335870753, + "grad_norm": 0.3416212238276438, + "learning_rate": 5.500277756336286e-05, + "loss": 2.7297, + "step": 33589 + }, + { + "epoch": 1.5638661917731684, + "grad_norm": 0.3694943714087835, + "learning_rate": 5.500008241231009e-05, + "loss": 2.8871, + "step": 33590 + }, + { + "epoch": 1.5639127499592616, + "grad_norm": 0.3553001989260154, + "learning_rate": 5.499738724658259e-05, + "loss": 2.7139, + "step": 33591 + }, + { + "epoch": 1.5639593081453547, + "grad_norm": 0.3777982350942316, + "learning_rate": 5.4994692066188234e-05, + "loss": 2.6849, + "step": 33592 + }, + { + "epoch": 1.5640058663314478, + "grad_norm": 0.3300149533405391, + "learning_rate": 5.4991996871134965e-05, + "loss": 2.6441, + "step": 33593 + }, + { + "epoch": 1.564052424517541, + "grad_norm": 0.3641853147672703, + "learning_rate": 5.498930166143068e-05, + "loss": 2.8209, + "step": 33594 + }, + { + "epoch": 1.564098982703634, + "grad_norm": 0.35340056399138553, + "learning_rate": 5.4986606437083274e-05, + "loss": 2.743, + "step": 33595 + }, + { + "epoch": 1.564145540889727, + "grad_norm": 0.3305367320039279, + "learning_rate": 5.498391119810067e-05, + "loss": 2.8146, + "step": 33596 + }, + { + "epoch": 1.56419209907582, + "grad_norm": 0.3636653494805206, + "learning_rate": 5.498121594449077e-05, + "loss": 2.7355, + "step": 33597 + }, + { + "epoch": 1.564238657261913, + "grad_norm": 0.3514766958301517, + "learning_rate": 5.4978520676261514e-05, + "loss": 2.8143, + "step": 33598 + }, + { + "epoch": 1.564285215448006, + "grad_norm": 0.3446147429312619, + "learning_rate": 5.4975825393420774e-05, + "loss": 2.8562, + "step": 33599 + }, + { + "epoch": 1.5643317736340991, + "grad_norm": 0.3312662490259109, + "learning_rate": 5.497313009597649e-05, + 
"loss": 2.7879, + "step": 33600 + }, + { + "epoch": 1.5643783318201923, + "grad_norm": 0.3697516616545916, + "learning_rate": 5.497043478393655e-05, + "loss": 2.8074, + "step": 33601 + }, + { + "epoch": 1.5644248900062854, + "grad_norm": 0.3542066674796479, + "learning_rate": 5.4967739457308866e-05, + "loss": 2.6726, + "step": 33602 + }, + { + "epoch": 1.5644714481923785, + "grad_norm": 0.3212596136606406, + "learning_rate": 5.496504411610137e-05, + "loss": 2.7854, + "step": 33603 + }, + { + "epoch": 1.5645180063784716, + "grad_norm": 0.34876872434271544, + "learning_rate": 5.496234876032196e-05, + "loss": 2.6427, + "step": 33604 + }, + { + "epoch": 1.5645645645645647, + "grad_norm": 0.35236283723206613, + "learning_rate": 5.495965338997853e-05, + "loss": 2.6092, + "step": 33605 + }, + { + "epoch": 1.5646111227506576, + "grad_norm": 0.3069079446755781, + "learning_rate": 5.495695800507901e-05, + "loss": 2.7297, + "step": 33606 + }, + { + "epoch": 1.5646576809367507, + "grad_norm": 0.32363994598135687, + "learning_rate": 5.495426260563132e-05, + "loss": 2.6872, + "step": 33607 + }, + { + "epoch": 1.5647042391228436, + "grad_norm": 0.3433035678248066, + "learning_rate": 5.4951567191643346e-05, + "loss": 2.733, + "step": 33608 + }, + { + "epoch": 1.5647507973089367, + "grad_norm": 0.32254972570371776, + "learning_rate": 5.494887176312301e-05, + "loss": 2.781, + "step": 33609 + }, + { + "epoch": 1.5647973554950299, + "grad_norm": 0.3376127077684056, + "learning_rate": 5.494617632007821e-05, + "loss": 2.6215, + "step": 33610 + }, + { + "epoch": 1.564843913681123, + "grad_norm": 0.33441444911456303, + "learning_rate": 5.4943480862516905e-05, + "loss": 2.6826, + "step": 33611 + }, + { + "epoch": 1.564890471867216, + "grad_norm": 0.34440654251266484, + "learning_rate": 5.4940785390446945e-05, + "loss": 2.7368, + "step": 33612 + }, + { + "epoch": 1.5649370300533092, + "grad_norm": 0.330254789409452, + "learning_rate": 5.4938089903876276e-05, + "loss": 2.7564, + "step": 33613 + }, + { + "epoch": 1.5649835882394023, + "grad_norm": 0.39362762039492716, + "learning_rate": 5.493539440281279e-05, + "loss": 2.7405, + "step": 33614 + }, + { + "epoch": 1.5650301464254954, + "grad_norm": 0.3323712816948557, + "learning_rate": 5.493269888726441e-05, + "loss": 2.6173, + "step": 33615 + }, + { + "epoch": 1.5650767046115883, + "grad_norm": 0.3656644556271822, + "learning_rate": 5.493000335723906e-05, + "loss": 2.7996, + "step": 33616 + }, + { + "epoch": 1.5651232627976814, + "grad_norm": 0.3475362058797654, + "learning_rate": 5.4927307812744624e-05, + "loss": 2.6922, + "step": 33617 + }, + { + "epoch": 1.5651698209837743, + "grad_norm": 0.394695844375094, + "learning_rate": 5.492461225378902e-05, + "loss": 2.7111, + "step": 33618 + }, + { + "epoch": 1.5652163791698674, + "grad_norm": 0.3464747070367402, + "learning_rate": 5.492191668038017e-05, + "loss": 2.6765, + "step": 33619 + }, + { + "epoch": 1.5652629373559606, + "grad_norm": 0.3330627675956605, + "learning_rate": 5.4919221092525974e-05, + "loss": 2.7107, + "step": 33620 + }, + { + "epoch": 1.5653094955420537, + "grad_norm": 0.40420270768558936, + "learning_rate": 5.491652549023436e-05, + "loss": 2.6587, + "step": 33621 + }, + { + "epoch": 1.5653560537281468, + "grad_norm": 0.3260037936977225, + "learning_rate": 5.491382987351321e-05, + "loss": 2.8047, + "step": 33622 + }, + { + "epoch": 1.56540261191424, + "grad_norm": 0.3746928127146371, + "learning_rate": 5.4911134242370465e-05, + "loss": 2.745, + "step": 33623 + }, + { + "epoch": 1.565449170100333, + 
"grad_norm": 0.3524258452780028, + "learning_rate": 5.4908438596814036e-05, + "loss": 2.7574, + "step": 33624 + }, + { + "epoch": 1.565495728286426, + "grad_norm": 0.37707444852000044, + "learning_rate": 5.49057429368518e-05, + "loss": 2.6618, + "step": 33625 + }, + { + "epoch": 1.565542286472519, + "grad_norm": 0.32617907957558784, + "learning_rate": 5.490304726249172e-05, + "loss": 2.6863, + "step": 33626 + }, + { + "epoch": 1.5655888446586121, + "grad_norm": 0.3812962219474688, + "learning_rate": 5.490035157374166e-05, + "loss": 2.7817, + "step": 33627 + }, + { + "epoch": 1.565635402844705, + "grad_norm": 0.3291405667903202, + "learning_rate": 5.4897655870609545e-05, + "loss": 2.676, + "step": 33628 + }, + { + "epoch": 1.5656819610307982, + "grad_norm": 0.37445428444595813, + "learning_rate": 5.4894960153103305e-05, + "loss": 2.6763, + "step": 33629 + }, + { + "epoch": 1.5657285192168913, + "grad_norm": 0.31844972895265555, + "learning_rate": 5.4892264421230844e-05, + "loss": 2.6746, + "step": 33630 + }, + { + "epoch": 1.5657750774029844, + "grad_norm": 0.3654856073456484, + "learning_rate": 5.488956867500005e-05, + "loss": 2.6674, + "step": 33631 + }, + { + "epoch": 1.5658216355890775, + "grad_norm": 0.3134876370067936, + "learning_rate": 5.4886872914418865e-05, + "loss": 2.6602, + "step": 33632 + }, + { + "epoch": 1.5658681937751706, + "grad_norm": 0.359507571598555, + "learning_rate": 5.488417713949518e-05, + "loss": 2.7928, + "step": 33633 + }, + { + "epoch": 1.5659147519612637, + "grad_norm": 0.372746726708355, + "learning_rate": 5.4881481350236944e-05, + "loss": 2.707, + "step": 33634 + }, + { + "epoch": 1.5659613101473566, + "grad_norm": 0.3649740857079103, + "learning_rate": 5.487878554665201e-05, + "loss": 2.7329, + "step": 33635 + }, + { + "epoch": 1.5660078683334497, + "grad_norm": 0.3747561730673667, + "learning_rate": 5.4876089728748324e-05, + "loss": 2.7696, + "step": 33636 + }, + { + "epoch": 1.5660544265195429, + "grad_norm": 0.3860532686308836, + "learning_rate": 5.487339389653382e-05, + "loss": 2.6762, + "step": 33637 + }, + { + "epoch": 1.5661009847056357, + "grad_norm": 0.3395747045546125, + "learning_rate": 5.4870698050016365e-05, + "loss": 2.6339, + "step": 33638 + }, + { + "epoch": 1.5661475428917289, + "grad_norm": 0.3733907064305044, + "learning_rate": 5.486800218920391e-05, + "loss": 2.7422, + "step": 33639 + }, + { + "epoch": 1.566194101077822, + "grad_norm": 0.35003906339950047, + "learning_rate": 5.486530631410433e-05, + "loss": 2.8559, + "step": 33640 + }, + { + "epoch": 1.566240659263915, + "grad_norm": 0.38617363342826927, + "learning_rate": 5.486261042472555e-05, + "loss": 2.8078, + "step": 33641 + }, + { + "epoch": 1.5662872174500082, + "grad_norm": 0.35404744528688364, + "learning_rate": 5.4859914521075484e-05, + "loss": 2.6961, + "step": 33642 + }, + { + "epoch": 1.5663337756361013, + "grad_norm": 0.3878089537130718, + "learning_rate": 5.4857218603162075e-05, + "loss": 2.8195, + "step": 33643 + }, + { + "epoch": 1.5663803338221944, + "grad_norm": 0.3722687166080289, + "learning_rate": 5.4854522670993183e-05, + "loss": 2.631, + "step": 33644 + }, + { + "epoch": 1.5664268920082873, + "grad_norm": 0.3537794397187724, + "learning_rate": 5.485182672457676e-05, + "loss": 2.6949, + "step": 33645 + }, + { + "epoch": 1.5664734501943804, + "grad_norm": 0.3962686395767663, + "learning_rate": 5.4849130763920684e-05, + "loss": 2.7199, + "step": 33646 + }, + { + "epoch": 1.5665200083804733, + "grad_norm": 0.3542543830387391, + "learning_rate": 
5.4846434789032905e-05, + "loss": 2.7055, + "step": 33647 + }, + { + "epoch": 1.5665665665665665, + "grad_norm": 0.3989749244631978, + "learning_rate": 5.4843738799921316e-05, + "loss": 2.7825, + "step": 33648 + }, + { + "epoch": 1.5666131247526596, + "grad_norm": 0.33384140522715594, + "learning_rate": 5.4841042796593837e-05, + "loss": 2.8625, + "step": 33649 + }, + { + "epoch": 1.5666596829387527, + "grad_norm": 0.38365631724599125, + "learning_rate": 5.483834677905837e-05, + "loss": 2.7908, + "step": 33650 + }, + { + "epoch": 1.5667062411248458, + "grad_norm": 0.324385234293018, + "learning_rate": 5.483565074732282e-05, + "loss": 2.72, + "step": 33651 + }, + { + "epoch": 1.566752799310939, + "grad_norm": 0.36572689813737996, + "learning_rate": 5.483295470139512e-05, + "loss": 2.7664, + "step": 33652 + }, + { + "epoch": 1.566799357497032, + "grad_norm": 0.3530411425769141, + "learning_rate": 5.483025864128317e-05, + "loss": 2.6525, + "step": 33653 + }, + { + "epoch": 1.5668459156831251, + "grad_norm": 0.33030282358791213, + "learning_rate": 5.4827562566994896e-05, + "loss": 2.7832, + "step": 33654 + }, + { + "epoch": 1.566892473869218, + "grad_norm": 0.3611866384644919, + "learning_rate": 5.482486647853819e-05, + "loss": 2.6323, + "step": 33655 + }, + { + "epoch": 1.5669390320553112, + "grad_norm": 0.34190858847850525, + "learning_rate": 5.482217037592098e-05, + "loss": 2.7331, + "step": 33656 + }, + { + "epoch": 1.566985590241404, + "grad_norm": 0.4031104193191089, + "learning_rate": 5.481947425915118e-05, + "loss": 2.7137, + "step": 33657 + }, + { + "epoch": 1.5670321484274972, + "grad_norm": 0.33430059029121767, + "learning_rate": 5.481677812823669e-05, + "loss": 2.8327, + "step": 33658 + }, + { + "epoch": 1.5670787066135903, + "grad_norm": 0.3887614248273292, + "learning_rate": 5.4814081983185425e-05, + "loss": 2.7532, + "step": 33659 + }, + { + "epoch": 1.5671252647996834, + "grad_norm": 0.3520768746201073, + "learning_rate": 5.481138582400532e-05, + "loss": 2.7177, + "step": 33660 + }, + { + "epoch": 1.5671718229857765, + "grad_norm": 0.35268359946577527, + "learning_rate": 5.480868965070427e-05, + "loss": 2.675, + "step": 33661 + }, + { + "epoch": 1.5672183811718696, + "grad_norm": 0.340028210467268, + "learning_rate": 5.4805993463290184e-05, + "loss": 2.6262, + "step": 33662 + }, + { + "epoch": 1.5672649393579627, + "grad_norm": 0.34972046987856564, + "learning_rate": 5.4803297261770995e-05, + "loss": 2.7969, + "step": 33663 + }, + { + "epoch": 1.5673114975440556, + "grad_norm": 0.3646716960736034, + "learning_rate": 5.4800601046154576e-05, + "loss": 2.679, + "step": 33664 + }, + { + "epoch": 1.5673580557301487, + "grad_norm": 0.33498943088674815, + "learning_rate": 5.4797904816448875e-05, + "loss": 2.7029, + "step": 33665 + }, + { + "epoch": 1.5674046139162419, + "grad_norm": 0.3742123206096774, + "learning_rate": 5.479520857266181e-05, + "loss": 2.7971, + "step": 33666 + }, + { + "epoch": 1.5674511721023348, + "grad_norm": 0.33332117572594705, + "learning_rate": 5.479251231480127e-05, + "loss": 2.7067, + "step": 33667 + }, + { + "epoch": 1.5674977302884279, + "grad_norm": 0.3723591345088086, + "learning_rate": 5.4789816042875174e-05, + "loss": 2.8129, + "step": 33668 + }, + { + "epoch": 1.567544288474521, + "grad_norm": 0.3339339026226527, + "learning_rate": 5.478711975689144e-05, + "loss": 2.7861, + "step": 33669 + }, + { + "epoch": 1.567590846660614, + "grad_norm": 0.33976906307117627, + "learning_rate": 5.4784423456857994e-05, + "loss": 2.7727, + "step": 33670 + }, + { + 
"epoch": 1.5676374048467072, + "grad_norm": 0.34645151222678916, + "learning_rate": 5.478172714278272e-05, + "loss": 2.7341, + "step": 33671 + }, + { + "epoch": 1.5676839630328003, + "grad_norm": 0.33518058788946586, + "learning_rate": 5.477903081467355e-05, + "loss": 2.7031, + "step": 33672 + }, + { + "epoch": 1.5677305212188934, + "grad_norm": 0.350481281896642, + "learning_rate": 5.47763344725384e-05, + "loss": 2.7341, + "step": 33673 + }, + { + "epoch": 1.5677770794049863, + "grad_norm": 0.3292546861431389, + "learning_rate": 5.477363811638517e-05, + "loss": 2.7118, + "step": 33674 + }, + { + "epoch": 1.5678236375910795, + "grad_norm": 0.3298768154054099, + "learning_rate": 5.47709417462218e-05, + "loss": 2.7448, + "step": 33675 + }, + { + "epoch": 1.5678701957771726, + "grad_norm": 0.3524491367859022, + "learning_rate": 5.4768245362056195e-05, + "loss": 2.7285, + "step": 33676 + }, + { + "epoch": 1.5679167539632655, + "grad_norm": 0.36897511645118236, + "learning_rate": 5.476554896389623e-05, + "loss": 2.7726, + "step": 33677 + }, + { + "epoch": 1.5679633121493586, + "grad_norm": 0.3342794473718938, + "learning_rate": 5.476285255174987e-05, + "loss": 2.7645, + "step": 33678 + }, + { + "epoch": 1.5680098703354517, + "grad_norm": 0.35005999224713835, + "learning_rate": 5.476015612562499e-05, + "loss": 2.7739, + "step": 33679 + }, + { + "epoch": 1.5680564285215448, + "grad_norm": 0.33527473691878995, + "learning_rate": 5.4757459685529534e-05, + "loss": 2.7022, + "step": 33680 + }, + { + "epoch": 1.568102986707638, + "grad_norm": 0.3516751741997304, + "learning_rate": 5.4754763231471396e-05, + "loss": 2.7066, + "step": 33681 + }, + { + "epoch": 1.568149544893731, + "grad_norm": 0.333091888565076, + "learning_rate": 5.4752066763458484e-05, + "loss": 2.724, + "step": 33682 + }, + { + "epoch": 1.5681961030798242, + "grad_norm": 0.32749157150626773, + "learning_rate": 5.4749370281498734e-05, + "loss": 2.7817, + "step": 33683 + }, + { + "epoch": 1.568242661265917, + "grad_norm": 0.3371118399177612, + "learning_rate": 5.474667378560007e-05, + "loss": 2.7537, + "step": 33684 + }, + { + "epoch": 1.5682892194520102, + "grad_norm": 0.3461500331113172, + "learning_rate": 5.474397727577037e-05, + "loss": 2.7023, + "step": 33685 + }, + { + "epoch": 1.568335777638103, + "grad_norm": 0.3412900448266066, + "learning_rate": 5.474128075201756e-05, + "loss": 2.6473, + "step": 33686 + }, + { + "epoch": 1.5683823358241962, + "grad_norm": 0.3434983557398766, + "learning_rate": 5.4738584214349555e-05, + "loss": 2.8348, + "step": 33687 + }, + { + "epoch": 1.5684288940102893, + "grad_norm": 0.3541038744788099, + "learning_rate": 5.473588766277429e-05, + "loss": 2.6956, + "step": 33688 + }, + { + "epoch": 1.5684754521963824, + "grad_norm": 0.3351854980951238, + "learning_rate": 5.473319109729965e-05, + "loss": 2.7293, + "step": 33689 + }, + { + "epoch": 1.5685220103824755, + "grad_norm": 0.3277810039614032, + "learning_rate": 5.473049451793356e-05, + "loss": 2.6335, + "step": 33690 + }, + { + "epoch": 1.5685685685685686, + "grad_norm": 0.3409142585996955, + "learning_rate": 5.4727797924683934e-05, + "loss": 2.7065, + "step": 33691 + }, + { + "epoch": 1.5686151267546617, + "grad_norm": 0.32124390343705383, + "learning_rate": 5.472510131755868e-05, + "loss": 2.7338, + "step": 33692 + }, + { + "epoch": 1.5686616849407549, + "grad_norm": 0.35957907294094343, + "learning_rate": 5.4722404696565744e-05, + "loss": 2.6943, + "step": 33693 + }, + { + "epoch": 1.5687082431268478, + "grad_norm": 0.33006158410913883, + 
"learning_rate": 5.471970806171299e-05, + "loss": 2.7409, + "step": 33694 + }, + { + "epoch": 1.5687548013129409, + "grad_norm": 0.309528760254637, + "learning_rate": 5.471701141300837e-05, + "loss": 2.6664, + "step": 33695 + }, + { + "epoch": 1.5688013594990338, + "grad_norm": 0.3342545177509314, + "learning_rate": 5.471431475045977e-05, + "loss": 2.7255, + "step": 33696 + }, + { + "epoch": 1.5688479176851269, + "grad_norm": 0.33782904773595385, + "learning_rate": 5.471161807407514e-05, + "loss": 2.7255, + "step": 33697 + }, + { + "epoch": 1.56889447587122, + "grad_norm": 0.3356761010877345, + "learning_rate": 5.470892138386237e-05, + "loss": 2.7211, + "step": 33698 + }, + { + "epoch": 1.568941034057313, + "grad_norm": 0.35424517506128667, + "learning_rate": 5.4706224679829376e-05, + "loss": 2.6879, + "step": 33699 + }, + { + "epoch": 1.5689875922434062, + "grad_norm": 0.33970976440277717, + "learning_rate": 5.470352796198408e-05, + "loss": 2.706, + "step": 33700 + }, + { + "epoch": 1.5690341504294993, + "grad_norm": 0.3264306561506959, + "learning_rate": 5.4700831230334396e-05, + "loss": 2.8054, + "step": 33701 + }, + { + "epoch": 1.5690807086155925, + "grad_norm": 0.36228179970192376, + "learning_rate": 5.469813448488823e-05, + "loss": 2.6976, + "step": 33702 + }, + { + "epoch": 1.5691272668016856, + "grad_norm": 0.33545453556140226, + "learning_rate": 5.469543772565351e-05, + "loss": 2.7302, + "step": 33703 + }, + { + "epoch": 1.5691738249877785, + "grad_norm": 0.33946266257554314, + "learning_rate": 5.469274095263813e-05, + "loss": 2.8252, + "step": 33704 + }, + { + "epoch": 1.5692203831738716, + "grad_norm": 0.3570452306067874, + "learning_rate": 5.469004416585002e-05, + "loss": 2.7252, + "step": 33705 + }, + { + "epoch": 1.5692669413599645, + "grad_norm": 0.32902761920040335, + "learning_rate": 5.46873473652971e-05, + "loss": 2.7837, + "step": 33706 + }, + { + "epoch": 1.5693134995460576, + "grad_norm": 0.336233472062935, + "learning_rate": 5.468465055098727e-05, + "loss": 2.7571, + "step": 33707 + }, + { + "epoch": 1.5693600577321507, + "grad_norm": 0.36371791873851006, + "learning_rate": 5.468195372292846e-05, + "loss": 2.7094, + "step": 33708 + }, + { + "epoch": 1.5694066159182438, + "grad_norm": 0.34644557598641307, + "learning_rate": 5.467925688112856e-05, + "loss": 2.8081, + "step": 33709 + }, + { + "epoch": 1.569453174104337, + "grad_norm": 0.37389790078120944, + "learning_rate": 5.4676560025595514e-05, + "loss": 2.826, + "step": 33710 + }, + { + "epoch": 1.56949973229043, + "grad_norm": 0.33623358637918266, + "learning_rate": 5.4673863156337234e-05, + "loss": 2.741, + "step": 33711 + }, + { + "epoch": 1.5695462904765232, + "grad_norm": 0.3599687659584725, + "learning_rate": 5.467116627336161e-05, + "loss": 2.7428, + "step": 33712 + }, + { + "epoch": 1.569592848662616, + "grad_norm": 0.3605606494889694, + "learning_rate": 5.466846937667658e-05, + "loss": 2.6935, + "step": 33713 + }, + { + "epoch": 1.5696394068487092, + "grad_norm": 0.3394737637069209, + "learning_rate": 5.4665772466290056e-05, + "loss": 2.7329, + "step": 33714 + }, + { + "epoch": 1.5696859650348023, + "grad_norm": 0.37765119139569375, + "learning_rate": 5.4663075542209954e-05, + "loss": 2.8087, + "step": 33715 + }, + { + "epoch": 1.5697325232208952, + "grad_norm": 0.36280111544573657, + "learning_rate": 5.466037860444417e-05, + "loss": 2.8221, + "step": 33716 + }, + { + "epoch": 1.5697790814069883, + "grad_norm": 0.37337714771841807, + "learning_rate": 5.465768165300065e-05, + "loss": 2.7634, + "step": 33717 
+ }, + { + "epoch": 1.5698256395930814, + "grad_norm": 0.3766459044198288, + "learning_rate": 5.465498468788728e-05, + "loss": 2.7718, + "step": 33718 + }, + { + "epoch": 1.5698721977791745, + "grad_norm": 0.34090820220546236, + "learning_rate": 5.4652287709111985e-05, + "loss": 2.7116, + "step": 33719 + }, + { + "epoch": 1.5699187559652676, + "grad_norm": 0.36645191254619064, + "learning_rate": 5.46495907166827e-05, + "loss": 2.7221, + "step": 33720 + }, + { + "epoch": 1.5699653141513608, + "grad_norm": 0.34430437159585847, + "learning_rate": 5.4646893710607315e-05, + "loss": 2.6718, + "step": 33721 + }, + { + "epoch": 1.5700118723374539, + "grad_norm": 0.34832186752712174, + "learning_rate": 5.464419669089374e-05, + "loss": 2.7705, + "step": 33722 + }, + { + "epoch": 1.5700584305235468, + "grad_norm": 0.360566564638833, + "learning_rate": 5.464149965754992e-05, + "loss": 2.7735, + "step": 33723 + }, + { + "epoch": 1.5701049887096399, + "grad_norm": 0.37514060808613064, + "learning_rate": 5.463880261058377e-05, + "loss": 2.7696, + "step": 33724 + }, + { + "epoch": 1.570151546895733, + "grad_norm": 0.3386993056819872, + "learning_rate": 5.463610555000318e-05, + "loss": 2.6826, + "step": 33725 + }, + { + "epoch": 1.570198105081826, + "grad_norm": 0.3481117598484653, + "learning_rate": 5.463340847581606e-05, + "loss": 2.6567, + "step": 33726 + }, + { + "epoch": 1.570244663267919, + "grad_norm": 0.3681515828362318, + "learning_rate": 5.4630711388030376e-05, + "loss": 2.8106, + "step": 33727 + }, + { + "epoch": 1.5702912214540121, + "grad_norm": 0.340455893338637, + "learning_rate": 5.4628014286653985e-05, + "loss": 2.7754, + "step": 33728 + }, + { + "epoch": 1.5703377796401052, + "grad_norm": 0.36233968967446145, + "learning_rate": 5.4625317171694846e-05, + "loss": 2.6573, + "step": 33729 + }, + { + "epoch": 1.5703843378261984, + "grad_norm": 0.3390225082545784, + "learning_rate": 5.4622620043160844e-05, + "loss": 2.6794, + "step": 33730 + }, + { + "epoch": 1.5704308960122915, + "grad_norm": 0.3574537472235389, + "learning_rate": 5.46199229010599e-05, + "loss": 2.7373, + "step": 33731 + }, + { + "epoch": 1.5704774541983846, + "grad_norm": 0.3452136238318129, + "learning_rate": 5.461722574539995e-05, + "loss": 2.787, + "step": 33732 + }, + { + "epoch": 1.5705240123844775, + "grad_norm": 0.35321216947637246, + "learning_rate": 5.4614528576188884e-05, + "loss": 2.7879, + "step": 33733 + }, + { + "epoch": 1.5705705705705706, + "grad_norm": 0.3461590831695624, + "learning_rate": 5.461183139343466e-05, + "loss": 2.6951, + "step": 33734 + }, + { + "epoch": 1.5706171287566635, + "grad_norm": 0.33104205252458224, + "learning_rate": 5.460913419714513e-05, + "loss": 2.6466, + "step": 33735 + }, + { + "epoch": 1.5706636869427566, + "grad_norm": 0.39263704580633046, + "learning_rate": 5.460643698732826e-05, + "loss": 2.6077, + "step": 33736 + }, + { + "epoch": 1.5707102451288497, + "grad_norm": 0.33720116922916016, + "learning_rate": 5.4603739763991945e-05, + "loss": 2.7242, + "step": 33737 + }, + { + "epoch": 1.5707568033149428, + "grad_norm": 0.3582275930193214, + "learning_rate": 5.4601042527144134e-05, + "loss": 2.7434, + "step": 33738 + }, + { + "epoch": 1.570803361501036, + "grad_norm": 0.361574612468944, + "learning_rate": 5.4598345276792686e-05, + "loss": 2.6962, + "step": 33739 + }, + { + "epoch": 1.570849919687129, + "grad_norm": 0.34849828712828146, + "learning_rate": 5.4595648012945565e-05, + "loss": 2.8367, + "step": 33740 + }, + { + "epoch": 1.5708964778732222, + "grad_norm": 
0.36093595558626884, + "learning_rate": 5.459295073561066e-05, + "loss": 2.771, + "step": 33741 + }, + { + "epoch": 1.5709430360593153, + "grad_norm": 0.3475541344430758, + "learning_rate": 5.45902534447959e-05, + "loss": 2.7158, + "step": 33742 + }, + { + "epoch": 1.5709895942454082, + "grad_norm": 0.3792138778216444, + "learning_rate": 5.458755614050921e-05, + "loss": 2.784, + "step": 33743 + }, + { + "epoch": 1.5710361524315013, + "grad_norm": 0.3453545001651924, + "learning_rate": 5.458485882275848e-05, + "loss": 2.6555, + "step": 33744 + }, + { + "epoch": 1.5710827106175942, + "grad_norm": 0.3531059800870568, + "learning_rate": 5.4582161491551645e-05, + "loss": 2.7274, + "step": 33745 + }, + { + "epoch": 1.5711292688036873, + "grad_norm": 0.3561756312835125, + "learning_rate": 5.457946414689661e-05, + "loss": 2.7595, + "step": 33746 + }, + { + "epoch": 1.5711758269897804, + "grad_norm": 0.3779102728249514, + "learning_rate": 5.457676678880131e-05, + "loss": 2.8256, + "step": 33747 + }, + { + "epoch": 1.5712223851758735, + "grad_norm": 0.3591221918438594, + "learning_rate": 5.4574069417273656e-05, + "loss": 2.6281, + "step": 33748 + }, + { + "epoch": 1.5712689433619667, + "grad_norm": 0.3438038183401706, + "learning_rate": 5.4571372032321545e-05, + "loss": 2.7303, + "step": 33749 + }, + { + "epoch": 1.5713155015480598, + "grad_norm": 0.37112180154084845, + "learning_rate": 5.45686746339529e-05, + "loss": 2.8069, + "step": 33750 + }, + { + "epoch": 1.5713620597341529, + "grad_norm": 0.38629523132593613, + "learning_rate": 5.456597722217567e-05, + "loss": 2.8174, + "step": 33751 + }, + { + "epoch": 1.5714086179202458, + "grad_norm": 0.3683279785560227, + "learning_rate": 5.4563279796997724e-05, + "loss": 2.7304, + "step": 33752 + }, + { + "epoch": 1.571455176106339, + "grad_norm": 0.3785754579172606, + "learning_rate": 5.4560582358427024e-05, + "loss": 2.7898, + "step": 33753 + }, + { + "epoch": 1.571501734292432, + "grad_norm": 0.41219012492846346, + "learning_rate": 5.455788490647145e-05, + "loss": 2.7487, + "step": 33754 + }, + { + "epoch": 1.571548292478525, + "grad_norm": 0.35618548586161947, + "learning_rate": 5.455518744113893e-05, + "loss": 2.7495, + "step": 33755 + }, + { + "epoch": 1.571594850664618, + "grad_norm": 0.40733193331052114, + "learning_rate": 5.455248996243739e-05, + "loss": 2.7898, + "step": 33756 + }, + { + "epoch": 1.5716414088507111, + "grad_norm": 0.35216182816481606, + "learning_rate": 5.454979247037474e-05, + "loss": 2.7047, + "step": 33757 + }, + { + "epoch": 1.5716879670368042, + "grad_norm": 0.39312949870738706, + "learning_rate": 5.454709496495889e-05, + "loss": 2.7711, + "step": 33758 + }, + { + "epoch": 1.5717345252228974, + "grad_norm": 0.3253140434443871, + "learning_rate": 5.454439744619777e-05, + "loss": 2.8658, + "step": 33759 + }, + { + "epoch": 1.5717810834089905, + "grad_norm": 0.4112641228388617, + "learning_rate": 5.4541699914099285e-05, + "loss": 2.6684, + "step": 33760 + }, + { + "epoch": 1.5718276415950836, + "grad_norm": 0.32280820566322754, + "learning_rate": 5.4539002368671374e-05, + "loss": 2.8454, + "step": 33761 + }, + { + "epoch": 1.5718741997811765, + "grad_norm": 0.3666114681329445, + "learning_rate": 5.4536304809921926e-05, + "loss": 2.7206, + "step": 33762 + }, + { + "epoch": 1.5719207579672696, + "grad_norm": 0.36505840962237446, + "learning_rate": 5.453360723785886e-05, + "loss": 2.7714, + "step": 33763 + }, + { + "epoch": 1.5719673161533627, + "grad_norm": 0.3397242322831299, + "learning_rate": 5.453090965249013e-05, + "loss": 
2.6672, + "step": 33764 + }, + { + "epoch": 1.5720138743394556, + "grad_norm": 0.37160634384477553, + "learning_rate": 5.4528212053823615e-05, + "loss": 2.7313, + "step": 33765 + }, + { + "epoch": 1.5720604325255487, + "grad_norm": 0.389632569700827, + "learning_rate": 5.452551444186724e-05, + "loss": 2.737, + "step": 33766 + }, + { + "epoch": 1.5721069907116418, + "grad_norm": 0.3437993329396103, + "learning_rate": 5.452281681662893e-05, + "loss": 2.7416, + "step": 33767 + }, + { + "epoch": 1.572153548897735, + "grad_norm": 0.3945672976717012, + "learning_rate": 5.4520119178116594e-05, + "loss": 2.6739, + "step": 33768 + }, + { + "epoch": 1.572200107083828, + "grad_norm": 0.34146753139216485, + "learning_rate": 5.451742152633814e-05, + "loss": 2.81, + "step": 33769 + }, + { + "epoch": 1.5722466652699212, + "grad_norm": 0.3876342992816911, + "learning_rate": 5.4514723861301534e-05, + "loss": 2.7133, + "step": 33770 + }, + { + "epoch": 1.5722932234560143, + "grad_norm": 0.3483735449278461, + "learning_rate": 5.451202618301463e-05, + "loss": 2.7448, + "step": 33771 + }, + { + "epoch": 1.5723397816421072, + "grad_norm": 0.36001236402370906, + "learning_rate": 5.450932849148538e-05, + "loss": 2.6785, + "step": 33772 + }, + { + "epoch": 1.5723863398282003, + "grad_norm": 0.35756590718988845, + "learning_rate": 5.4506630786721694e-05, + "loss": 2.6361, + "step": 33773 + }, + { + "epoch": 1.5724328980142932, + "grad_norm": 0.36380635452641874, + "learning_rate": 5.4503933068731494e-05, + "loss": 2.7252, + "step": 33774 + }, + { + "epoch": 1.5724794562003863, + "grad_norm": 0.34525322662385477, + "learning_rate": 5.4501235337522695e-05, + "loss": 2.7674, + "step": 33775 + }, + { + "epoch": 1.5725260143864794, + "grad_norm": 0.32309302546848845, + "learning_rate": 5.4498537593103205e-05, + "loss": 2.6908, + "step": 33776 + }, + { + "epoch": 1.5725725725725725, + "grad_norm": 0.3483358631727719, + "learning_rate": 5.4495839835480964e-05, + "loss": 2.7741, + "step": 33777 + }, + { + "epoch": 1.5726191307586657, + "grad_norm": 0.3320496227119607, + "learning_rate": 5.4493142064663874e-05, + "loss": 2.6864, + "step": 33778 + }, + { + "epoch": 1.5726656889447588, + "grad_norm": 0.3359479778354461, + "learning_rate": 5.449044428065986e-05, + "loss": 2.7976, + "step": 33779 + }, + { + "epoch": 1.572712247130852, + "grad_norm": 0.3433227017588452, + "learning_rate": 5.448774648347682e-05, + "loss": 2.6878, + "step": 33780 + }, + { + "epoch": 1.572758805316945, + "grad_norm": 0.3381564116292159, + "learning_rate": 5.4485048673122686e-05, + "loss": 2.7606, + "step": 33781 + }, + { + "epoch": 1.572805363503038, + "grad_norm": 0.3561941916058709, + "learning_rate": 5.448235084960538e-05, + "loss": 2.7675, + "step": 33782 + }, + { + "epoch": 1.572851921689131, + "grad_norm": 0.36177554973401177, + "learning_rate": 5.447965301293282e-05, + "loss": 2.7471, + "step": 33783 + }, + { + "epoch": 1.572898479875224, + "grad_norm": 0.3667770522736619, + "learning_rate": 5.447695516311291e-05, + "loss": 2.7818, + "step": 33784 + }, + { + "epoch": 1.572945038061317, + "grad_norm": 0.3473571090428199, + "learning_rate": 5.447425730015359e-05, + "loss": 2.7678, + "step": 33785 + }, + { + "epoch": 1.5729915962474101, + "grad_norm": 0.35671756091890333, + "learning_rate": 5.4471559424062745e-05, + "loss": 2.6711, + "step": 33786 + }, + { + "epoch": 1.5730381544335033, + "grad_norm": 0.32939041072457714, + "learning_rate": 5.446886153484834e-05, + "loss": 2.8151, + "step": 33787 + }, + { + "epoch": 1.5730847126195964, + 
"grad_norm": 0.35519936368372285, + "learning_rate": 5.446616363251825e-05, + "loss": 2.705, + "step": 33788 + }, + { + "epoch": 1.5731312708056895, + "grad_norm": 0.34132495566714055, + "learning_rate": 5.446346571708042e-05, + "loss": 2.6647, + "step": 33789 + }, + { + "epoch": 1.5731778289917826, + "grad_norm": 0.35862010568430425, + "learning_rate": 5.4460767788542756e-05, + "loss": 2.7299, + "step": 33790 + }, + { + "epoch": 1.5732243871778757, + "grad_norm": 0.3450929591968391, + "learning_rate": 5.445806984691317e-05, + "loss": 2.5995, + "step": 33791 + }, + { + "epoch": 1.5732709453639686, + "grad_norm": 0.3176100226507552, + "learning_rate": 5.4455371892199605e-05, + "loss": 2.6924, + "step": 33792 + }, + { + "epoch": 1.5733175035500617, + "grad_norm": 0.36898597878916556, + "learning_rate": 5.445267392440995e-05, + "loss": 2.7311, + "step": 33793 + }, + { + "epoch": 1.5733640617361546, + "grad_norm": 0.3352375094342139, + "learning_rate": 5.444997594355212e-05, + "loss": 2.6476, + "step": 33794 + }, + { + "epoch": 1.5734106199222477, + "grad_norm": 0.3280473838040572, + "learning_rate": 5.444727794963407e-05, + "loss": 2.7512, + "step": 33795 + }, + { + "epoch": 1.5734571781083408, + "grad_norm": 0.34039908977033734, + "learning_rate": 5.4444579942663685e-05, + "loss": 2.7456, + "step": 33796 + }, + { + "epoch": 1.573503736294434, + "grad_norm": 0.35664959146467357, + "learning_rate": 5.444188192264892e-05, + "loss": 2.7282, + "step": 33797 + }, + { + "epoch": 1.573550294480527, + "grad_norm": 0.32079621383422896, + "learning_rate": 5.4439183889597646e-05, + "loss": 2.6395, + "step": 33798 + }, + { + "epoch": 1.5735968526666202, + "grad_norm": 0.3318463436931099, + "learning_rate": 5.4436485843517807e-05, + "loss": 2.6303, + "step": 33799 + }, + { + "epoch": 1.5736434108527133, + "grad_norm": 0.3640452961262633, + "learning_rate": 5.443378778441731e-05, + "loss": 2.7743, + "step": 33800 + }, + { + "epoch": 1.5736899690388062, + "grad_norm": 0.3544424188120715, + "learning_rate": 5.443108971230411e-05, + "loss": 2.7632, + "step": 33801 + }, + { + "epoch": 1.5737365272248993, + "grad_norm": 0.36724032250536753, + "learning_rate": 5.442839162718608e-05, + "loss": 2.8964, + "step": 33802 + }, + { + "epoch": 1.5737830854109924, + "grad_norm": 0.3860826937741279, + "learning_rate": 5.442569352907116e-05, + "loss": 2.7834, + "step": 33803 + }, + { + "epoch": 1.5738296435970853, + "grad_norm": 0.3489451838520457, + "learning_rate": 5.442299541796727e-05, + "loss": 2.6593, + "step": 33804 + }, + { + "epoch": 1.5738762017831784, + "grad_norm": 0.3641196602073075, + "learning_rate": 5.442029729388232e-05, + "loss": 2.7967, + "step": 33805 + }, + { + "epoch": 1.5739227599692716, + "grad_norm": 0.35413308388800896, + "learning_rate": 5.4417599156824236e-05, + "loss": 2.8018, + "step": 33806 + }, + { + "epoch": 1.5739693181553647, + "grad_norm": 0.34767395564995884, + "learning_rate": 5.441490100680093e-05, + "loss": 2.6901, + "step": 33807 + }, + { + "epoch": 1.5740158763414578, + "grad_norm": 0.3396399803826628, + "learning_rate": 5.441220284382033e-05, + "loss": 2.7412, + "step": 33808 + }, + { + "epoch": 1.574062434527551, + "grad_norm": 0.34049625504888953, + "learning_rate": 5.440950466789033e-05, + "loss": 2.7692, + "step": 33809 + }, + { + "epoch": 1.574108992713644, + "grad_norm": 0.3277187536372243, + "learning_rate": 5.440680647901889e-05, + "loss": 2.749, + "step": 33810 + }, + { + "epoch": 1.574155550899737, + "grad_norm": 0.3168594841443709, + "learning_rate": 
5.4404108277213905e-05, + "loss": 2.7747, + "step": 33811 + }, + { + "epoch": 1.57420210908583, + "grad_norm": 0.31094195194480634, + "learning_rate": 5.440141006248328e-05, + "loss": 2.7035, + "step": 33812 + }, + { + "epoch": 1.5742486672719231, + "grad_norm": 0.32590598920123326, + "learning_rate": 5.4398711834834956e-05, + "loss": 2.7566, + "step": 33813 + }, + { + "epoch": 1.574295225458016, + "grad_norm": 0.3430506326773998, + "learning_rate": 5.439601359427685e-05, + "loss": 2.7126, + "step": 33814 + }, + { + "epoch": 1.5743417836441091, + "grad_norm": 0.2972717844748349, + "learning_rate": 5.439331534081689e-05, + "loss": 2.6165, + "step": 33815 + }, + { + "epoch": 1.5743883418302023, + "grad_norm": 0.337068991103836, + "learning_rate": 5.439061707446298e-05, + "loss": 2.8458, + "step": 33816 + }, + { + "epoch": 1.5744349000162954, + "grad_norm": 0.36162144160669063, + "learning_rate": 5.438791879522302e-05, + "loss": 2.7422, + "step": 33817 + }, + { + "epoch": 1.5744814582023885, + "grad_norm": 0.34632188143363846, + "learning_rate": 5.4385220503104974e-05, + "loss": 2.7442, + "step": 33818 + }, + { + "epoch": 1.5745280163884816, + "grad_norm": 0.35935261150685777, + "learning_rate": 5.438252219811673e-05, + "loss": 2.6213, + "step": 33819 + }, + { + "epoch": 1.5745745745745747, + "grad_norm": 0.3439846483661235, + "learning_rate": 5.437982388026621e-05, + "loss": 2.7704, + "step": 33820 + }, + { + "epoch": 1.5746211327606676, + "grad_norm": 0.34505880088625607, + "learning_rate": 5.437712554956134e-05, + "loss": 2.6909, + "step": 33821 + }, + { + "epoch": 1.5746676909467607, + "grad_norm": 0.3272508606598851, + "learning_rate": 5.437442720601003e-05, + "loss": 2.7177, + "step": 33822 + }, + { + "epoch": 1.5747142491328536, + "grad_norm": 0.34574877821614997, + "learning_rate": 5.437172884962021e-05, + "loss": 2.7129, + "step": 33823 + }, + { + "epoch": 1.5747608073189467, + "grad_norm": 0.3673985666902407, + "learning_rate": 5.436903048039981e-05, + "loss": 2.6568, + "step": 33824 + }, + { + "epoch": 1.5748073655050399, + "grad_norm": 0.36001981836192076, + "learning_rate": 5.436633209835672e-05, + "loss": 2.8755, + "step": 33825 + }, + { + "epoch": 1.574853923691133, + "grad_norm": 0.3515908819813884, + "learning_rate": 5.4363633703498884e-05, + "loss": 2.7594, + "step": 33826 + }, + { + "epoch": 1.574900481877226, + "grad_norm": 0.379731046913876, + "learning_rate": 5.4360935295834204e-05, + "loss": 2.7732, + "step": 33827 + }, + { + "epoch": 1.5749470400633192, + "grad_norm": 0.35922792233691336, + "learning_rate": 5.435823687537063e-05, + "loss": 2.675, + "step": 33828 + }, + { + "epoch": 1.5749935982494123, + "grad_norm": 0.33301398508947194, + "learning_rate": 5.435553844211605e-05, + "loss": 2.8404, + "step": 33829 + }, + { + "epoch": 1.5750401564355054, + "grad_norm": 0.36557448680035565, + "learning_rate": 5.435283999607839e-05, + "loss": 2.7175, + "step": 33830 + }, + { + "epoch": 1.5750867146215983, + "grad_norm": 0.32824621632223155, + "learning_rate": 5.4350141537265564e-05, + "loss": 2.7991, + "step": 33831 + }, + { + "epoch": 1.5751332728076914, + "grad_norm": 0.32371417204161124, + "learning_rate": 5.434744306568551e-05, + "loss": 2.716, + "step": 33832 + }, + { + "epoch": 1.5751798309937843, + "grad_norm": 0.35007200280815814, + "learning_rate": 5.434474458134615e-05, + "loss": 2.7483, + "step": 33833 + }, + { + "epoch": 1.5752263891798775, + "grad_norm": 0.34761186067736105, + "learning_rate": 5.4342046084255385e-05, + "loss": 2.8196, + "step": 33834 + }, + { + 
"epoch": 1.5752729473659706, + "grad_norm": 0.3674038845263445, + "learning_rate": 5.433934757442113e-05, + "loss": 2.7943, + "step": 33835 + }, + { + "epoch": 1.5753195055520637, + "grad_norm": 0.3216528324507848, + "learning_rate": 5.433664905185133e-05, + "loss": 2.696, + "step": 33836 + }, + { + "epoch": 1.5753660637381568, + "grad_norm": 0.3810147641791036, + "learning_rate": 5.433395051655389e-05, + "loss": 2.7787, + "step": 33837 + }, + { + "epoch": 1.57541262192425, + "grad_norm": 0.3365818524937126, + "learning_rate": 5.433125196853673e-05, + "loss": 2.676, + "step": 33838 + }, + { + "epoch": 1.575459180110343, + "grad_norm": 0.3566402780660987, + "learning_rate": 5.432855340780777e-05, + "loss": 2.6991, + "step": 33839 + }, + { + "epoch": 1.575505738296436, + "grad_norm": 0.3452218684403086, + "learning_rate": 5.4325854834374936e-05, + "loss": 2.8049, + "step": 33840 + }, + { + "epoch": 1.575552296482529, + "grad_norm": 0.3485250042364343, + "learning_rate": 5.4323156248246154e-05, + "loss": 2.7787, + "step": 33841 + }, + { + "epoch": 1.5755988546686222, + "grad_norm": 0.37593977402069856, + "learning_rate": 5.4320457649429334e-05, + "loss": 2.7335, + "step": 33842 + }, + { + "epoch": 1.575645412854715, + "grad_norm": 0.34511437099415965, + "learning_rate": 5.431775903793238e-05, + "loss": 2.7559, + "step": 33843 + }, + { + "epoch": 1.5756919710408082, + "grad_norm": 0.357415995237294, + "learning_rate": 5.431506041376323e-05, + "loss": 2.7318, + "step": 33844 + }, + { + "epoch": 1.5757385292269013, + "grad_norm": 0.3122951842279658, + "learning_rate": 5.431236177692981e-05, + "loss": 2.6986, + "step": 33845 + }, + { + "epoch": 1.5757850874129944, + "grad_norm": 0.33686289251365703, + "learning_rate": 5.4309663127440035e-05, + "loss": 2.7155, + "step": 33846 + }, + { + "epoch": 1.5758316455990875, + "grad_norm": 0.34301819083915375, + "learning_rate": 5.430696446530181e-05, + "loss": 2.7477, + "step": 33847 + }, + { + "epoch": 1.5758782037851806, + "grad_norm": 0.3243307380834859, + "learning_rate": 5.430426579052308e-05, + "loss": 2.7161, + "step": 33848 + }, + { + "epoch": 1.5759247619712737, + "grad_norm": 0.36655727783632686, + "learning_rate": 5.430156710311175e-05, + "loss": 2.7249, + "step": 33849 + }, + { + "epoch": 1.5759713201573666, + "grad_norm": 0.33818969575078917, + "learning_rate": 5.429886840307574e-05, + "loss": 2.663, + "step": 33850 + }, + { + "epoch": 1.5760178783434597, + "grad_norm": 0.3184368110900805, + "learning_rate": 5.429616969042299e-05, + "loss": 2.7038, + "step": 33851 + }, + { + "epoch": 1.5760644365295529, + "grad_norm": 0.3371238681657221, + "learning_rate": 5.42934709651614e-05, + "loss": 2.7958, + "step": 33852 + }, + { + "epoch": 1.5761109947156458, + "grad_norm": 0.3234808942420323, + "learning_rate": 5.4290772227298895e-05, + "loss": 2.6276, + "step": 33853 + }, + { + "epoch": 1.5761575529017389, + "grad_norm": 0.3667268458159745, + "learning_rate": 5.428807347684339e-05, + "loss": 2.7907, + "step": 33854 + }, + { + "epoch": 1.576204111087832, + "grad_norm": 0.3239471681123524, + "learning_rate": 5.4285374713802826e-05, + "loss": 2.596, + "step": 33855 + }, + { + "epoch": 1.576250669273925, + "grad_norm": 0.33825401318843107, + "learning_rate": 5.42826759381851e-05, + "loss": 2.7456, + "step": 33856 + }, + { + "epoch": 1.5762972274600182, + "grad_norm": 0.3262270833350068, + "learning_rate": 5.427997714999813e-05, + "loss": 2.7036, + "step": 33857 + }, + { + "epoch": 1.5763437856461113, + "grad_norm": 0.3346033487840135, + "learning_rate": 
5.427727834924986e-05, + "loss": 2.8412, + "step": 33858 + }, + { + "epoch": 1.5763903438322044, + "grad_norm": 0.3338395455689591, + "learning_rate": 5.42745795359482e-05, + "loss": 2.6675, + "step": 33859 + }, + { + "epoch": 1.5764369020182973, + "grad_norm": 0.3263278267612462, + "learning_rate": 5.4271880710101076e-05, + "loss": 2.7564, + "step": 33860 + }, + { + "epoch": 1.5764834602043905, + "grad_norm": 0.3492227354225113, + "learning_rate": 5.426918187171639e-05, + "loss": 2.717, + "step": 33861 + }, + { + "epoch": 1.5765300183904833, + "grad_norm": 0.34187309207867267, + "learning_rate": 5.4266483020802085e-05, + "loss": 2.7712, + "step": 33862 + }, + { + "epoch": 1.5765765765765765, + "grad_norm": 0.3380706539764919, + "learning_rate": 5.426378415736606e-05, + "loss": 2.6885, + "step": 33863 + }, + { + "epoch": 1.5766231347626696, + "grad_norm": 0.3335441487993098, + "learning_rate": 5.426108528141627e-05, + "loss": 2.7038, + "step": 33864 + }, + { + "epoch": 1.5766696929487627, + "grad_norm": 0.3555362857132929, + "learning_rate": 5.425838639296059e-05, + "loss": 2.725, + "step": 33865 + }, + { + "epoch": 1.5767162511348558, + "grad_norm": 0.3513924612054388, + "learning_rate": 5.4255687492006976e-05, + "loss": 2.7006, + "step": 33866 + }, + { + "epoch": 1.576762809320949, + "grad_norm": 0.3484818498953603, + "learning_rate": 5.425298857856336e-05, + "loss": 2.7901, + "step": 33867 + }, + { + "epoch": 1.576809367507042, + "grad_norm": 0.33119407469771783, + "learning_rate": 5.4250289652637606e-05, + "loss": 2.6971, + "step": 33868 + }, + { + "epoch": 1.5768559256931352, + "grad_norm": 0.3459022612285956, + "learning_rate": 5.4247590714237695e-05, + "loss": 2.8292, + "step": 33869 + }, + { + "epoch": 1.576902483879228, + "grad_norm": 0.3782559723782156, + "learning_rate": 5.424489176337152e-05, + "loss": 2.7442, + "step": 33870 + }, + { + "epoch": 1.5769490420653212, + "grad_norm": 0.34542941288444384, + "learning_rate": 5.4242192800047e-05, + "loss": 2.7664, + "step": 33871 + }, + { + "epoch": 1.576995600251414, + "grad_norm": 0.39385799950669065, + "learning_rate": 5.4239493824272056e-05, + "loss": 2.6254, + "step": 33872 + }, + { + "epoch": 1.5770421584375072, + "grad_norm": 0.32483051364958127, + "learning_rate": 5.423679483605463e-05, + "loss": 2.6343, + "step": 33873 + }, + { + "epoch": 1.5770887166236003, + "grad_norm": 0.3800295063215648, + "learning_rate": 5.423409583540262e-05, + "loss": 2.8692, + "step": 33874 + }, + { + "epoch": 1.5771352748096934, + "grad_norm": 0.3485619711158178, + "learning_rate": 5.423139682232395e-05, + "loss": 2.712, + "step": 33875 + }, + { + "epoch": 1.5771818329957865, + "grad_norm": 0.37093788848405107, + "learning_rate": 5.4228697796826557e-05, + "loss": 2.7232, + "step": 33876 + }, + { + "epoch": 1.5772283911818796, + "grad_norm": 0.3340227181355586, + "learning_rate": 5.422599875891834e-05, + "loss": 2.7207, + "step": 33877 + }, + { + "epoch": 1.5772749493679727, + "grad_norm": 0.35606224956193355, + "learning_rate": 5.422329970860724e-05, + "loss": 2.8069, + "step": 33878 + }, + { + "epoch": 1.5773215075540656, + "grad_norm": 0.3569163551249862, + "learning_rate": 5.422060064590117e-05, + "loss": 2.7011, + "step": 33879 + }, + { + "epoch": 1.5773680657401588, + "grad_norm": 0.3449539950189378, + "learning_rate": 5.4217901570808074e-05, + "loss": 2.7765, + "step": 33880 + }, + { + "epoch": 1.5774146239262519, + "grad_norm": 0.36716306527337444, + "learning_rate": 5.421520248333584e-05, + "loss": 2.8099, + "step": 33881 + }, + { + 
"epoch": 1.5774611821123448, + "grad_norm": 0.33015419277037855, + "learning_rate": 5.4212503383492396e-05, + "loss": 2.7298, + "step": 33882 + }, + { + "epoch": 1.5775077402984379, + "grad_norm": 0.3573571188065144, + "learning_rate": 5.420980427128567e-05, + "loss": 2.8227, + "step": 33883 + }, + { + "epoch": 1.577554298484531, + "grad_norm": 0.33146953657449213, + "learning_rate": 5.420710514672358e-05, + "loss": 2.7505, + "step": 33884 + }, + { + "epoch": 1.577600856670624, + "grad_norm": 0.3074539084956098, + "learning_rate": 5.420440600981406e-05, + "loss": 2.7167, + "step": 33885 + }, + { + "epoch": 1.5776474148567172, + "grad_norm": 0.33751806088963293, + "learning_rate": 5.4201706860565006e-05, + "loss": 2.8031, + "step": 33886 + }, + { + "epoch": 1.5776939730428103, + "grad_norm": 0.3354407013929049, + "learning_rate": 5.4199007698984375e-05, + "loss": 2.8717, + "step": 33887 + }, + { + "epoch": 1.5777405312289035, + "grad_norm": 0.318122641866859, + "learning_rate": 5.4196308525080064e-05, + "loss": 2.7386, + "step": 33888 + }, + { + "epoch": 1.5777870894149963, + "grad_norm": 0.3363953149129782, + "learning_rate": 5.4193609338859996e-05, + "loss": 2.7321, + "step": 33889 + }, + { + "epoch": 1.5778336476010895, + "grad_norm": 0.32004741964607697, + "learning_rate": 5.41909101403321e-05, + "loss": 2.6004, + "step": 33890 + }, + { + "epoch": 1.5778802057871826, + "grad_norm": 0.3366649603345337, + "learning_rate": 5.41882109295043e-05, + "loss": 2.7124, + "step": 33891 + }, + { + "epoch": 1.5779267639732755, + "grad_norm": 0.3436717938171945, + "learning_rate": 5.4185511706384504e-05, + "loss": 2.7409, + "step": 33892 + }, + { + "epoch": 1.5779733221593686, + "grad_norm": 0.3269138167826337, + "learning_rate": 5.418281247098066e-05, + "loss": 2.775, + "step": 33893 + }, + { + "epoch": 1.5780198803454617, + "grad_norm": 0.3622209644354907, + "learning_rate": 5.418011322330067e-05, + "loss": 2.6975, + "step": 33894 + }, + { + "epoch": 1.5780664385315548, + "grad_norm": 0.3361338388197573, + "learning_rate": 5.417741396335244e-05, + "loss": 2.8169, + "step": 33895 + }, + { + "epoch": 1.578112996717648, + "grad_norm": 0.37128844846651815, + "learning_rate": 5.417471469114393e-05, + "loss": 2.7139, + "step": 33896 + }, + { + "epoch": 1.578159554903741, + "grad_norm": 0.3561929869879279, + "learning_rate": 5.417201540668304e-05, + "loss": 2.6885, + "step": 33897 + }, + { + "epoch": 1.5782061130898342, + "grad_norm": 0.335427485115939, + "learning_rate": 5.416931610997769e-05, + "loss": 2.7016, + "step": 33898 + }, + { + "epoch": 1.578252671275927, + "grad_norm": 0.32612270224453227, + "learning_rate": 5.416661680103581e-05, + "loss": 2.6694, + "step": 33899 + }, + { + "epoch": 1.5782992294620202, + "grad_norm": 0.357523639079529, + "learning_rate": 5.416391747986533e-05, + "loss": 2.7593, + "step": 33900 + }, + { + "epoch": 1.578345787648113, + "grad_norm": 0.33149351422672163, + "learning_rate": 5.416121814647414e-05, + "loss": 2.7179, + "step": 33901 + }, + { + "epoch": 1.5783923458342062, + "grad_norm": 0.32727899597333787, + "learning_rate": 5.41585188008702e-05, + "loss": 2.8123, + "step": 33902 + }, + { + "epoch": 1.5784389040202993, + "grad_norm": 0.35544824812970194, + "learning_rate": 5.415581944306142e-05, + "loss": 2.7948, + "step": 33903 + }, + { + "epoch": 1.5784854622063924, + "grad_norm": 0.3569482196422881, + "learning_rate": 5.415312007305573e-05, + "loss": 2.8245, + "step": 33904 + }, + { + "epoch": 1.5785320203924855, + "grad_norm": 0.3848287601050768, + 
"learning_rate": 5.4150420690861014e-05, + "loss": 2.6592, + "step": 33905 + }, + { + "epoch": 1.5785785785785786, + "grad_norm": 0.3367074895041754, + "learning_rate": 5.414772129648524e-05, + "loss": 2.8095, + "step": 33906 + }, + { + "epoch": 1.5786251367646718, + "grad_norm": 0.4094439133577609, + "learning_rate": 5.4145021889936306e-05, + "loss": 2.6803, + "step": 33907 + }, + { + "epoch": 1.5786716949507649, + "grad_norm": 0.32142348047920205, + "learning_rate": 5.4142322471222154e-05, + "loss": 2.6914, + "step": 33908 + }, + { + "epoch": 1.5787182531368578, + "grad_norm": 0.3203041637532445, + "learning_rate": 5.4139623040350687e-05, + "loss": 2.6782, + "step": 33909 + }, + { + "epoch": 1.5787648113229509, + "grad_norm": 0.3736882391306053, + "learning_rate": 5.4136923597329826e-05, + "loss": 2.8417, + "step": 33910 + }, + { + "epoch": 1.5788113695090438, + "grad_norm": 0.3489695300896204, + "learning_rate": 5.41342241421675e-05, + "loss": 2.608, + "step": 33911 + }, + { + "epoch": 1.5788579276951369, + "grad_norm": 0.3471609955896462, + "learning_rate": 5.413152467487164e-05, + "loss": 2.7256, + "step": 33912 + }, + { + "epoch": 1.57890448588123, + "grad_norm": 0.3278530491798663, + "learning_rate": 5.412882519545015e-05, + "loss": 2.6311, + "step": 33913 + }, + { + "epoch": 1.5789510440673231, + "grad_norm": 0.34630718037741165, + "learning_rate": 5.412612570391099e-05, + "loss": 2.8127, + "step": 33914 + }, + { + "epoch": 1.5789976022534162, + "grad_norm": 0.36067408354634456, + "learning_rate": 5.412342620026204e-05, + "loss": 2.7351, + "step": 33915 + }, + { + "epoch": 1.5790441604395093, + "grad_norm": 0.32063628172984726, + "learning_rate": 5.4120726684511234e-05, + "loss": 2.7793, + "step": 33916 + }, + { + "epoch": 1.5790907186256025, + "grad_norm": 0.3342073998366815, + "learning_rate": 5.411802715666653e-05, + "loss": 2.7966, + "step": 33917 + }, + { + "epoch": 1.5791372768116956, + "grad_norm": 0.355338903447714, + "learning_rate": 5.411532761673579e-05, + "loss": 2.6928, + "step": 33918 + }, + { + "epoch": 1.5791838349977885, + "grad_norm": 0.31582824626660977, + "learning_rate": 5.4112628064727e-05, + "loss": 2.6623, + "step": 33919 + }, + { + "epoch": 1.5792303931838816, + "grad_norm": 0.3510121967499831, + "learning_rate": 5.4109928500648024e-05, + "loss": 2.8164, + "step": 33920 + }, + { + "epoch": 1.5792769513699745, + "grad_norm": 0.33789108459505857, + "learning_rate": 5.410722892450683e-05, + "loss": 2.5704, + "step": 33921 + }, + { + "epoch": 1.5793235095560676, + "grad_norm": 0.3517599040204802, + "learning_rate": 5.4104529336311304e-05, + "loss": 2.7105, + "step": 33922 + }, + { + "epoch": 1.5793700677421607, + "grad_norm": 0.3345861441068367, + "learning_rate": 5.410182973606941e-05, + "loss": 2.7188, + "step": 33923 + }, + { + "epoch": 1.5794166259282538, + "grad_norm": 0.34117275740224706, + "learning_rate": 5.4099130123789034e-05, + "loss": 2.6552, + "step": 33924 + }, + { + "epoch": 1.579463184114347, + "grad_norm": 0.34301361109789597, + "learning_rate": 5.4096430499478125e-05, + "loss": 2.6958, + "step": 33925 + }, + { + "epoch": 1.57950974230044, + "grad_norm": 0.3489683721983144, + "learning_rate": 5.409373086314459e-05, + "loss": 2.6989, + "step": 33926 + }, + { + "epoch": 1.5795563004865332, + "grad_norm": 0.3314921301912498, + "learning_rate": 5.409103121479636e-05, + "loss": 2.6475, + "step": 33927 + }, + { + "epoch": 1.579602858672626, + "grad_norm": 0.34829312349791414, + "learning_rate": 5.408833155444136e-05, + "loss": 2.7931, + "step": 33928 
+ }, + { + "epoch": 1.5796494168587192, + "grad_norm": 0.3436710461391491, + "learning_rate": 5.408563188208751e-05, + "loss": 2.7412, + "step": 33929 + }, + { + "epoch": 1.5796959750448123, + "grad_norm": 0.3471546249169132, + "learning_rate": 5.408293219774273e-05, + "loss": 2.7103, + "step": 33930 + }, + { + "epoch": 1.5797425332309052, + "grad_norm": 0.36254480982836473, + "learning_rate": 5.408023250141494e-05, + "loss": 2.7029, + "step": 33931 + }, + { + "epoch": 1.5797890914169983, + "grad_norm": 0.34896158557833407, + "learning_rate": 5.407753279311208e-05, + "loss": 2.6864, + "step": 33932 + }, + { + "epoch": 1.5798356496030914, + "grad_norm": 0.36818393808421773, + "learning_rate": 5.4074833072842065e-05, + "loss": 2.6811, + "step": 33933 + }, + { + "epoch": 1.5798822077891845, + "grad_norm": 0.3294203300038981, + "learning_rate": 5.407213334061281e-05, + "loss": 2.836, + "step": 33934 + }, + { + "epoch": 1.5799287659752776, + "grad_norm": 0.3788254551989647, + "learning_rate": 5.406943359643224e-05, + "loss": 2.819, + "step": 33935 + }, + { + "epoch": 1.5799753241613708, + "grad_norm": 0.3454248864614055, + "learning_rate": 5.406673384030829e-05, + "loss": 2.7409, + "step": 33936 + }, + { + "epoch": 1.5800218823474639, + "grad_norm": 0.3418848051787264, + "learning_rate": 5.4064034072248884e-05, + "loss": 2.652, + "step": 33937 + }, + { + "epoch": 1.5800684405335568, + "grad_norm": 0.32926973849702484, + "learning_rate": 5.4061334292261925e-05, + "loss": 2.8206, + "step": 33938 + }, + { + "epoch": 1.5801149987196499, + "grad_norm": 0.3511358323408059, + "learning_rate": 5.405863450035534e-05, + "loss": 2.7631, + "step": 33939 + }, + { + "epoch": 1.580161556905743, + "grad_norm": 0.31833030248919664, + "learning_rate": 5.405593469653708e-05, + "loss": 2.7812, + "step": 33940 + }, + { + "epoch": 1.580208115091836, + "grad_norm": 0.3173430480196841, + "learning_rate": 5.405323488081505e-05, + "loss": 2.632, + "step": 33941 + }, + { + "epoch": 1.580254673277929, + "grad_norm": 0.35403910775869196, + "learning_rate": 5.405053505319717e-05, + "loss": 2.7487, + "step": 33942 + }, + { + "epoch": 1.5803012314640221, + "grad_norm": 0.3585125695055694, + "learning_rate": 5.4047835213691386e-05, + "loss": 2.766, + "step": 33943 + }, + { + "epoch": 1.5803477896501152, + "grad_norm": 0.32384093909908596, + "learning_rate": 5.404513536230559e-05, + "loss": 2.73, + "step": 33944 + }, + { + "epoch": 1.5803943478362084, + "grad_norm": 0.38964728704475027, + "learning_rate": 5.404243549904773e-05, + "loss": 2.6994, + "step": 33945 + }, + { + "epoch": 1.5804409060223015, + "grad_norm": 0.31935283813363163, + "learning_rate": 5.40397356239257e-05, + "loss": 2.7263, + "step": 33946 + }, + { + "epoch": 1.5804874642083946, + "grad_norm": 0.32598224426737393, + "learning_rate": 5.403703573694745e-05, + "loss": 2.6616, + "step": 33947 + }, + { + "epoch": 1.5805340223944875, + "grad_norm": 0.30968436560934004, + "learning_rate": 5.403433583812091e-05, + "loss": 2.5547, + "step": 33948 + }, + { + "epoch": 1.5805805805805806, + "grad_norm": 0.3408128210774899, + "learning_rate": 5.403163592745397e-05, + "loss": 2.5802, + "step": 33949 + }, + { + "epoch": 1.5806271387666735, + "grad_norm": 0.29946617143680837, + "learning_rate": 5.40289360049546e-05, + "loss": 2.6336, + "step": 33950 + }, + { + "epoch": 1.5806736969527666, + "grad_norm": 0.37242352383787924, + "learning_rate": 5.402623607063069e-05, + "loss": 2.8199, + "step": 33951 + }, + { + "epoch": 1.5807202551388597, + "grad_norm": 0.3705128372940097, + 
"learning_rate": 5.402353612449017e-05, + "loss": 2.7695, + "step": 33952 + }, + { + "epoch": 1.5807668133249528, + "grad_norm": 0.3510122366876137, + "learning_rate": 5.402083616654097e-05, + "loss": 2.7427, + "step": 33953 + }, + { + "epoch": 1.580813371511046, + "grad_norm": 0.39449678820106676, + "learning_rate": 5.401813619679101e-05, + "loss": 2.6919, + "step": 33954 + }, + { + "epoch": 1.580859929697139, + "grad_norm": 0.32919233487185945, + "learning_rate": 5.4015436215248226e-05, + "loss": 2.7719, + "step": 33955 + }, + { + "epoch": 1.5809064878832322, + "grad_norm": 0.3656677142684626, + "learning_rate": 5.4012736221920535e-05, + "loss": 2.7539, + "step": 33956 + }, + { + "epoch": 1.5809530460693253, + "grad_norm": 0.36503477340028767, + "learning_rate": 5.401003621681584e-05, + "loss": 2.6159, + "step": 33957 + }, + { + "epoch": 1.5809996042554182, + "grad_norm": 0.34782521999307653, + "learning_rate": 5.400733619994209e-05, + "loss": 2.8913, + "step": 33958 + }, + { + "epoch": 1.5810461624415113, + "grad_norm": 0.3739924700119465, + "learning_rate": 5.400463617130722e-05, + "loss": 2.757, + "step": 33959 + }, + { + "epoch": 1.5810927206276042, + "grad_norm": 0.3452508307250773, + "learning_rate": 5.400193613091912e-05, + "loss": 2.7667, + "step": 33960 + }, + { + "epoch": 1.5811392788136973, + "grad_norm": 0.3613574238734695, + "learning_rate": 5.3999236078785734e-05, + "loss": 2.745, + "step": 33961 + }, + { + "epoch": 1.5811858369997904, + "grad_norm": 0.36947669844543424, + "learning_rate": 5.399653601491498e-05, + "loss": 2.693, + "step": 33962 + }, + { + "epoch": 1.5812323951858835, + "grad_norm": 0.3285934012771708, + "learning_rate": 5.3993835939314795e-05, + "loss": 2.7446, + "step": 33963 + }, + { + "epoch": 1.5812789533719767, + "grad_norm": 0.3349186527732108, + "learning_rate": 5.399113585199309e-05, + "loss": 2.7496, + "step": 33964 + }, + { + "epoch": 1.5813255115580698, + "grad_norm": 0.3568929539249949, + "learning_rate": 5.398843575295779e-05, + "loss": 2.6884, + "step": 33965 + }, + { + "epoch": 1.581372069744163, + "grad_norm": 0.3475475824299785, + "learning_rate": 5.3985735642216826e-05, + "loss": 2.7721, + "step": 33966 + }, + { + "epoch": 1.5814186279302558, + "grad_norm": 0.37077300127941537, + "learning_rate": 5.3983035519778126e-05, + "loss": 2.7761, + "step": 33967 + }, + { + "epoch": 1.581465186116349, + "grad_norm": 0.357086006848434, + "learning_rate": 5.398033538564962e-05, + "loss": 2.7408, + "step": 33968 + }, + { + "epoch": 1.581511744302442, + "grad_norm": 0.3657638254819534, + "learning_rate": 5.397763523983921e-05, + "loss": 2.7347, + "step": 33969 + }, + { + "epoch": 1.581558302488535, + "grad_norm": 0.31997895931212755, + "learning_rate": 5.3974935082354824e-05, + "loss": 2.7193, + "step": 33970 + }, + { + "epoch": 1.581604860674628, + "grad_norm": 0.35775239752905713, + "learning_rate": 5.397223491320441e-05, + "loss": 2.6928, + "step": 33971 + }, + { + "epoch": 1.5816514188607211, + "grad_norm": 0.32352524749388906, + "learning_rate": 5.396953473239588e-05, + "loss": 2.7193, + "step": 33972 + }, + { + "epoch": 1.5816979770468143, + "grad_norm": 0.349536335441744, + "learning_rate": 5.396683453993714e-05, + "loss": 2.7151, + "step": 33973 + }, + { + "epoch": 1.5817445352329074, + "grad_norm": 0.3351293966840792, + "learning_rate": 5.3964134335836136e-05, + "loss": 2.6125, + "step": 33974 + }, + { + "epoch": 1.5817910934190005, + "grad_norm": 0.3240187882591512, + "learning_rate": 5.396143412010078e-05, + "loss": 2.71, + "step": 33975 + }, 
+ { + "epoch": 1.5818376516050936, + "grad_norm": 0.36599575900364273, + "learning_rate": 5.395873389273902e-05, + "loss": 2.7097, + "step": 33976 + }, + { + "epoch": 1.5818842097911865, + "grad_norm": 0.3383310890185304, + "learning_rate": 5.395603365375877e-05, + "loss": 2.7364, + "step": 33977 + }, + { + "epoch": 1.5819307679772796, + "grad_norm": 0.374376092306133, + "learning_rate": 5.395333340316794e-05, + "loss": 2.7037, + "step": 33978 + }, + { + "epoch": 1.5819773261633727, + "grad_norm": 0.3426129428113138, + "learning_rate": 5.3950633140974473e-05, + "loss": 2.7028, + "step": 33979 + }, + { + "epoch": 1.5820238843494656, + "grad_norm": 0.32291573811937535, + "learning_rate": 5.394793286718628e-05, + "loss": 2.6919, + "step": 33980 + }, + { + "epoch": 1.5820704425355587, + "grad_norm": 0.3510844392546083, + "learning_rate": 5.39452325818113e-05, + "loss": 2.6485, + "step": 33981 + }, + { + "epoch": 1.5821170007216518, + "grad_norm": 0.34380173382959445, + "learning_rate": 5.394253228485745e-05, + "loss": 2.7462, + "step": 33982 + }, + { + "epoch": 1.582163558907745, + "grad_norm": 0.35126437217697165, + "learning_rate": 5.3939831976332655e-05, + "loss": 2.721, + "step": 33983 + }, + { + "epoch": 1.582210117093838, + "grad_norm": 0.3362845026276414, + "learning_rate": 5.3937131656244834e-05, + "loss": 2.7757, + "step": 33984 + }, + { + "epoch": 1.5822566752799312, + "grad_norm": 0.36744591267637405, + "learning_rate": 5.393443132460192e-05, + "loss": 2.702, + "step": 33985 + }, + { + "epoch": 1.5823032334660243, + "grad_norm": 0.3387142835430559, + "learning_rate": 5.393173098141186e-05, + "loss": 2.5971, + "step": 33986 + }, + { + "epoch": 1.5823497916521172, + "grad_norm": 0.3403528617510984, + "learning_rate": 5.392903062668253e-05, + "loss": 2.7166, + "step": 33987 + }, + { + "epoch": 1.5823963498382103, + "grad_norm": 0.3661043864983644, + "learning_rate": 5.392633026042189e-05, + "loss": 2.6892, + "step": 33988 + }, + { + "epoch": 1.5824429080243032, + "grad_norm": 0.36823533057446434, + "learning_rate": 5.392362988263786e-05, + "loss": 2.7663, + "step": 33989 + }, + { + "epoch": 1.5824894662103963, + "grad_norm": 0.3822539454655167, + "learning_rate": 5.392092949333837e-05, + "loss": 2.7395, + "step": 33990 + }, + { + "epoch": 1.5825360243964894, + "grad_norm": 0.4177264514276926, + "learning_rate": 5.3918229092531323e-05, + "loss": 2.6875, + "step": 33991 + }, + { + "epoch": 1.5825825825825826, + "grad_norm": 0.3851421228699418, + "learning_rate": 5.391552868022467e-05, + "loss": 2.5969, + "step": 33992 + }, + { + "epoch": 1.5826291407686757, + "grad_norm": 0.3478604281569619, + "learning_rate": 5.3912828256426316e-05, + "loss": 2.711, + "step": 33993 + }, + { + "epoch": 1.5826756989547688, + "grad_norm": 0.3740218080807005, + "learning_rate": 5.391012782114421e-05, + "loss": 2.6801, + "step": 33994 + }, + { + "epoch": 1.582722257140862, + "grad_norm": 0.38292731711656297, + "learning_rate": 5.3907427374386264e-05, + "loss": 2.6396, + "step": 33995 + }, + { + "epoch": 1.582768815326955, + "grad_norm": 0.3470166284373308, + "learning_rate": 5.39047269161604e-05, + "loss": 2.672, + "step": 33996 + }, + { + "epoch": 1.582815373513048, + "grad_norm": 0.3680199929257928, + "learning_rate": 5.390202644647454e-05, + "loss": 2.8534, + "step": 33997 + }, + { + "epoch": 1.582861931699141, + "grad_norm": 0.3622785717656822, + "learning_rate": 5.3899325965336625e-05, + "loss": 2.7377, + "step": 33998 + }, + { + "epoch": 1.582908489885234, + "grad_norm": 0.3903704451596072, + 
"learning_rate": 5.389662547275457e-05, + "loss": 2.8306, + "step": 33999 + }, + { + "epoch": 1.582955048071327, + "grad_norm": 0.35695944969641163, + "learning_rate": 5.3893924968736296e-05, + "loss": 2.5557, + "step": 34000 + }, + { + "epoch": 1.5830016062574201, + "grad_norm": 0.3572310362237809, + "learning_rate": 5.389122445328973e-05, + "loss": 2.6797, + "step": 34001 + }, + { + "epoch": 1.5830481644435133, + "grad_norm": 0.3636229329219687, + "learning_rate": 5.388852392642282e-05, + "loss": 2.7057, + "step": 34002 + }, + { + "epoch": 1.5830947226296064, + "grad_norm": 0.3996752517159902, + "learning_rate": 5.388582338814348e-05, + "loss": 2.7767, + "step": 34003 + }, + { + "epoch": 1.5831412808156995, + "grad_norm": 0.3535404785447451, + "learning_rate": 5.388312283845962e-05, + "loss": 2.7809, + "step": 34004 + }, + { + "epoch": 1.5831878390017926, + "grad_norm": 0.34821364654719794, + "learning_rate": 5.3880422277379174e-05, + "loss": 2.7179, + "step": 34005 + }, + { + "epoch": 1.5832343971878857, + "grad_norm": 0.3680927283647347, + "learning_rate": 5.387772170491008e-05, + "loss": 2.8032, + "step": 34006 + }, + { + "epoch": 1.5832809553739786, + "grad_norm": 0.35313196069417874, + "learning_rate": 5.387502112106025e-05, + "loss": 2.8096, + "step": 34007 + }, + { + "epoch": 1.5833275135600717, + "grad_norm": 0.3560563180777547, + "learning_rate": 5.387232052583763e-05, + "loss": 2.7578, + "step": 34008 + }, + { + "epoch": 1.5833740717461646, + "grad_norm": 0.32373908372221977, + "learning_rate": 5.3869619919250104e-05, + "loss": 2.6522, + "step": 34009 + }, + { + "epoch": 1.5834206299322577, + "grad_norm": 0.38483325223956477, + "learning_rate": 5.386691930130564e-05, + "loss": 2.7743, + "step": 34010 + }, + { + "epoch": 1.5834671881183509, + "grad_norm": 0.32049814187792613, + "learning_rate": 5.386421867201214e-05, + "loss": 2.7026, + "step": 34011 + }, + { + "epoch": 1.583513746304444, + "grad_norm": 0.34001252859538084, + "learning_rate": 5.3861518031377545e-05, + "loss": 2.6819, + "step": 34012 + }, + { + "epoch": 1.583560304490537, + "grad_norm": 0.33950932384005406, + "learning_rate": 5.3858817379409774e-05, + "loss": 2.6924, + "step": 34013 + }, + { + "epoch": 1.5836068626766302, + "grad_norm": 0.33422436350011625, + "learning_rate": 5.385611671611675e-05, + "loss": 2.7893, + "step": 34014 + }, + { + "epoch": 1.5836534208627233, + "grad_norm": 0.3864866053488252, + "learning_rate": 5.385341604150641e-05, + "loss": 2.6956, + "step": 34015 + }, + { + "epoch": 1.5836999790488162, + "grad_norm": 0.3386793886270337, + "learning_rate": 5.3850715355586664e-05, + "loss": 2.7587, + "step": 34016 + }, + { + "epoch": 1.5837465372349093, + "grad_norm": 0.35890244452872094, + "learning_rate": 5.384801465836547e-05, + "loss": 2.7076, + "step": 34017 + }, + { + "epoch": 1.5837930954210024, + "grad_norm": 0.33478041154356175, + "learning_rate": 5.384531394985072e-05, + "loss": 2.7188, + "step": 34018 + }, + { + "epoch": 1.5838396536070953, + "grad_norm": 0.36396194760845624, + "learning_rate": 5.384261323005034e-05, + "loss": 2.7041, + "step": 34019 + }, + { + "epoch": 1.5838862117931884, + "grad_norm": 0.3678071963929303, + "learning_rate": 5.383991249897229e-05, + "loss": 2.7604, + "step": 34020 + }, + { + "epoch": 1.5839327699792816, + "grad_norm": 0.364350520802548, + "learning_rate": 5.383721175662446e-05, + "loss": 2.781, + "step": 34021 + }, + { + "epoch": 1.5839793281653747, + "grad_norm": 0.3430968683019517, + "learning_rate": 5.38345110030148e-05, + "loss": 2.7911, + "step": 
34022 + }, + { + "epoch": 1.5840258863514678, + "grad_norm": 0.35749199244984575, + "learning_rate": 5.383181023815122e-05, + "loss": 2.7408, + "step": 34023 + }, + { + "epoch": 1.584072444537561, + "grad_norm": 0.34174994707020323, + "learning_rate": 5.382910946204165e-05, + "loss": 2.8104, + "step": 34024 + }, + { + "epoch": 1.584119002723654, + "grad_norm": 0.36844731837088446, + "learning_rate": 5.382640867469403e-05, + "loss": 2.7419, + "step": 34025 + }, + { + "epoch": 1.584165560909747, + "grad_norm": 0.35220386355119104, + "learning_rate": 5.382370787611628e-05, + "loss": 2.6667, + "step": 34026 + }, + { + "epoch": 1.58421211909584, + "grad_norm": 0.33990786792431127, + "learning_rate": 5.382100706631632e-05, + "loss": 2.7128, + "step": 34027 + }, + { + "epoch": 1.5842586772819331, + "grad_norm": 0.33088811222214404, + "learning_rate": 5.381830624530207e-05, + "loss": 2.7573, + "step": 34028 + }, + { + "epoch": 1.584305235468026, + "grad_norm": 0.3283091339314815, + "learning_rate": 5.381560541308148e-05, + "loss": 2.712, + "step": 34029 + }, + { + "epoch": 1.5843517936541192, + "grad_norm": 0.3432036080632094, + "learning_rate": 5.381290456966246e-05, + "loss": 2.7313, + "step": 34030 + }, + { + "epoch": 1.5843983518402123, + "grad_norm": 0.3459163371731317, + "learning_rate": 5.381020371505295e-05, + "loss": 2.695, + "step": 34031 + }, + { + "epoch": 1.5844449100263054, + "grad_norm": 0.32649541548543776, + "learning_rate": 5.380750284926086e-05, + "loss": 2.6501, + "step": 34032 + }, + { + "epoch": 1.5844914682123985, + "grad_norm": 0.3413971612121692, + "learning_rate": 5.380480197229413e-05, + "loss": 2.6088, + "step": 34033 + }, + { + "epoch": 1.5845380263984916, + "grad_norm": 0.33207875109386287, + "learning_rate": 5.380210108416067e-05, + "loss": 2.7768, + "step": 34034 + }, + { + "epoch": 1.5845845845845847, + "grad_norm": 0.3410581333978921, + "learning_rate": 5.379940018486843e-05, + "loss": 2.6551, + "step": 34035 + }, + { + "epoch": 1.5846311427706776, + "grad_norm": 0.33797055116145014, + "learning_rate": 5.379669927442532e-05, + "loss": 2.7078, + "step": 34036 + }, + { + "epoch": 1.5846777009567707, + "grad_norm": 0.3376457068383107, + "learning_rate": 5.379399835283927e-05, + "loss": 2.7949, + "step": 34037 + }, + { + "epoch": 1.5847242591428636, + "grad_norm": 0.3552091688972745, + "learning_rate": 5.3791297420118204e-05, + "loss": 2.7137, + "step": 34038 + }, + { + "epoch": 1.5847708173289567, + "grad_norm": 0.3428319905580089, + "learning_rate": 5.3788596476270056e-05, + "loss": 2.7115, + "step": 34039 + }, + { + "epoch": 1.5848173755150499, + "grad_norm": 0.33808972656930086, + "learning_rate": 5.378589552130276e-05, + "loss": 2.718, + "step": 34040 + }, + { + "epoch": 1.584863933701143, + "grad_norm": 0.34801577490635627, + "learning_rate": 5.378319455522422e-05, + "loss": 2.7832, + "step": 34041 + }, + { + "epoch": 1.584910491887236, + "grad_norm": 0.33830499409962317, + "learning_rate": 5.378049357804238e-05, + "loss": 2.7228, + "step": 34042 + }, + { + "epoch": 1.5849570500733292, + "grad_norm": 0.3344949911739394, + "learning_rate": 5.3777792589765165e-05, + "loss": 2.671, + "step": 34043 + }, + { + "epoch": 1.5850036082594223, + "grad_norm": 0.3576785426003456, + "learning_rate": 5.3775091590400506e-05, + "loss": 2.7943, + "step": 34044 + }, + { + "epoch": 1.5850501664455154, + "grad_norm": 0.32212087299787634, + "learning_rate": 5.377239057995632e-05, + "loss": 2.8859, + "step": 34045 + }, + { + "epoch": 1.5850967246316083, + "grad_norm": 
0.3307213102649265, + "learning_rate": 5.376968955844055e-05, + "loss": 2.7062, + "step": 34046 + }, + { + "epoch": 1.5851432828177014, + "grad_norm": 0.36801850423010946, + "learning_rate": 5.376698852586111e-05, + "loss": 2.7699, + "step": 34047 + }, + { + "epoch": 1.5851898410037943, + "grad_norm": 0.31358814221206754, + "learning_rate": 5.3764287482225914e-05, + "loss": 2.6086, + "step": 34048 + }, + { + "epoch": 1.5852363991898875, + "grad_norm": 0.35281650582440927, + "learning_rate": 5.376158642754292e-05, + "loss": 2.8261, + "step": 34049 + }, + { + "epoch": 1.5852829573759806, + "grad_norm": 0.34650970627482247, + "learning_rate": 5.3758885361820034e-05, + "loss": 2.5725, + "step": 34050 + }, + { + "epoch": 1.5853295155620737, + "grad_norm": 0.33697463397918453, + "learning_rate": 5.375618428506519e-05, + "loss": 2.748, + "step": 34051 + }, + { + "epoch": 1.5853760737481668, + "grad_norm": 0.35486199145121167, + "learning_rate": 5.375348319728631e-05, + "loss": 2.6625, + "step": 34052 + }, + { + "epoch": 1.58542263193426, + "grad_norm": 0.319058476558761, + "learning_rate": 5.375078209849133e-05, + "loss": 2.6326, + "step": 34053 + }, + { + "epoch": 1.585469190120353, + "grad_norm": 0.33001869887138824, + "learning_rate": 5.374808098868819e-05, + "loss": 2.7953, + "step": 34054 + }, + { + "epoch": 1.585515748306446, + "grad_norm": 0.32595532675591676, + "learning_rate": 5.374537986788478e-05, + "loss": 2.6816, + "step": 34055 + }, + { + "epoch": 1.585562306492539, + "grad_norm": 0.3220829849988952, + "learning_rate": 5.374267873608905e-05, + "loss": 2.6823, + "step": 34056 + }, + { + "epoch": 1.5856088646786322, + "grad_norm": 0.34994413962581705, + "learning_rate": 5.3739977593308945e-05, + "loss": 2.7315, + "step": 34057 + }, + { + "epoch": 1.585655422864725, + "grad_norm": 0.33261844209176644, + "learning_rate": 5.373727643955235e-05, + "loss": 2.6933, + "step": 34058 + }, + { + "epoch": 1.5857019810508182, + "grad_norm": 0.31062080749186244, + "learning_rate": 5.373457527482725e-05, + "loss": 2.7183, + "step": 34059 + }, + { + "epoch": 1.5857485392369113, + "grad_norm": 0.3434424657437283, + "learning_rate": 5.373187409914151e-05, + "loss": 2.7238, + "step": 34060 + }, + { + "epoch": 1.5857950974230044, + "grad_norm": 0.32192819843361886, + "learning_rate": 5.37291729125031e-05, + "loss": 2.7134, + "step": 34061 + }, + { + "epoch": 1.5858416556090975, + "grad_norm": 0.3680274788134046, + "learning_rate": 5.3726471714919926e-05, + "loss": 2.7073, + "step": 34062 + }, + { + "epoch": 1.5858882137951906, + "grad_norm": 0.3586789045801827, + "learning_rate": 5.372377050639994e-05, + "loss": 2.7983, + "step": 34063 + }, + { + "epoch": 1.5859347719812837, + "grad_norm": 0.34396617290304604, + "learning_rate": 5.3721069286951044e-05, + "loss": 2.7663, + "step": 34064 + }, + { + "epoch": 1.5859813301673766, + "grad_norm": 0.34869105964515007, + "learning_rate": 5.371836805658117e-05, + "loss": 2.6485, + "step": 34065 + }, + { + "epoch": 1.5860278883534698, + "grad_norm": 0.3653509531282362, + "learning_rate": 5.3715666815298246e-05, + "loss": 2.8712, + "step": 34066 + }, + { + "epoch": 1.5860744465395629, + "grad_norm": 0.34276958081581876, + "learning_rate": 5.371296556311023e-05, + "loss": 2.6484, + "step": 34067 + }, + { + "epoch": 1.5861210047256558, + "grad_norm": 0.37065513885805373, + "learning_rate": 5.3710264300025014e-05, + "loss": 2.8593, + "step": 34068 + }, + { + "epoch": 1.5861675629117489, + "grad_norm": 0.3454327620010096, + "learning_rate": 5.370756302605053e-05, + 
"loss": 2.6242, + "step": 34069 + }, + { + "epoch": 1.586214121097842, + "grad_norm": 0.36687061280713235, + "learning_rate": 5.370486174119473e-05, + "loss": 2.6302, + "step": 34070 + }, + { + "epoch": 1.586260679283935, + "grad_norm": 0.34821666852448346, + "learning_rate": 5.370216044546552e-05, + "loss": 2.6973, + "step": 34071 + }, + { + "epoch": 1.5863072374700282, + "grad_norm": 0.35828745696944736, + "learning_rate": 5.3699459138870846e-05, + "loss": 2.7736, + "step": 34072 + }, + { + "epoch": 1.5863537956561213, + "grad_norm": 0.34293448430463036, + "learning_rate": 5.3696757821418596e-05, + "loss": 2.6384, + "step": 34073 + }, + { + "epoch": 1.5864003538422145, + "grad_norm": 0.32863826388658596, + "learning_rate": 5.369405649311674e-05, + "loss": 2.8057, + "step": 34074 + }, + { + "epoch": 1.5864469120283073, + "grad_norm": 0.3547551730633041, + "learning_rate": 5.369135515397319e-05, + "loss": 2.795, + "step": 34075 + }, + { + "epoch": 1.5864934702144005, + "grad_norm": 0.35335030471317314, + "learning_rate": 5.368865380399588e-05, + "loss": 2.6224, + "step": 34076 + }, + { + "epoch": 1.5865400284004934, + "grad_norm": 0.3426382526224221, + "learning_rate": 5.368595244319273e-05, + "loss": 2.7464, + "step": 34077 + }, + { + "epoch": 1.5865865865865865, + "grad_norm": 0.37259262710520885, + "learning_rate": 5.368325107157167e-05, + "loss": 2.7546, + "step": 34078 + }, + { + "epoch": 1.5866331447726796, + "grad_norm": 0.33748739462127525, + "learning_rate": 5.3680549689140635e-05, + "loss": 2.7047, + "step": 34079 + }, + { + "epoch": 1.5866797029587727, + "grad_norm": 0.37526364534893303, + "learning_rate": 5.367784829590755e-05, + "loss": 2.7152, + "step": 34080 + }, + { + "epoch": 1.5867262611448658, + "grad_norm": 0.37819202958336057, + "learning_rate": 5.3675146891880356e-05, + "loss": 2.7702, + "step": 34081 + }, + { + "epoch": 1.586772819330959, + "grad_norm": 0.3688798618830281, + "learning_rate": 5.367244547706695e-05, + "loss": 2.7721, + "step": 34082 + }, + { + "epoch": 1.586819377517052, + "grad_norm": 0.35340263167103125, + "learning_rate": 5.366974405147529e-05, + "loss": 2.7373, + "step": 34083 + }, + { + "epoch": 1.5868659357031452, + "grad_norm": 0.3558877067116353, + "learning_rate": 5.366704261511328e-05, + "loss": 2.796, + "step": 34084 + }, + { + "epoch": 1.586912493889238, + "grad_norm": 0.3762437747530571, + "learning_rate": 5.366434116798887e-05, + "loss": 2.7472, + "step": 34085 + }, + { + "epoch": 1.5869590520753312, + "grad_norm": 0.35224064473209377, + "learning_rate": 5.366163971010998e-05, + "loss": 2.7164, + "step": 34086 + }, + { + "epoch": 1.587005610261424, + "grad_norm": 0.3799376364440659, + "learning_rate": 5.365893824148453e-05, + "loss": 2.8265, + "step": 34087 + }, + { + "epoch": 1.5870521684475172, + "grad_norm": 0.35119112778829364, + "learning_rate": 5.365623676212046e-05, + "loss": 2.7689, + "step": 34088 + }, + { + "epoch": 1.5870987266336103, + "grad_norm": 0.3760823845030091, + "learning_rate": 5.365353527202569e-05, + "loss": 2.7893, + "step": 34089 + }, + { + "epoch": 1.5871452848197034, + "grad_norm": 0.3764245535811323, + "learning_rate": 5.365083377120817e-05, + "loss": 2.7081, + "step": 34090 + }, + { + "epoch": 1.5871918430057965, + "grad_norm": 0.317657508734616, + "learning_rate": 5.36481322596758e-05, + "loss": 2.7925, + "step": 34091 + }, + { + "epoch": 1.5872384011918896, + "grad_norm": 0.36511485047529246, + "learning_rate": 5.364543073743652e-05, + "loss": 2.755, + "step": 34092 + }, + { + "epoch": 1.5872849593779828, + 
"grad_norm": 0.342522255329592, + "learning_rate": 5.364272920449826e-05, + "loss": 2.5713, + "step": 34093 + }, + { + "epoch": 1.5873315175640759, + "grad_norm": 0.3382331769200758, + "learning_rate": 5.3640027660868965e-05, + "loss": 2.7653, + "step": 34094 + }, + { + "epoch": 1.5873780757501688, + "grad_norm": 0.355785235241467, + "learning_rate": 5.363732610655654e-05, + "loss": 2.7664, + "step": 34095 + }, + { + "epoch": 1.5874246339362619, + "grad_norm": 0.32562012011018554, + "learning_rate": 5.363462454156892e-05, + "loss": 2.7421, + "step": 34096 + }, + { + "epoch": 1.5874711921223548, + "grad_norm": 0.3600042442876992, + "learning_rate": 5.3631922965914026e-05, + "loss": 2.7655, + "step": 34097 + }, + { + "epoch": 1.5875177503084479, + "grad_norm": 0.32918327421486376, + "learning_rate": 5.36292213795998e-05, + "loss": 2.777, + "step": 34098 + }, + { + "epoch": 1.587564308494541, + "grad_norm": 0.33895405007863594, + "learning_rate": 5.3626519782634175e-05, + "loss": 2.6984, + "step": 34099 + }, + { + "epoch": 1.5876108666806341, + "grad_norm": 0.3252366646321914, + "learning_rate": 5.362381817502506e-05, + "loss": 2.7284, + "step": 34100 + }, + { + "epoch": 1.5876574248667272, + "grad_norm": 0.32109147296466545, + "learning_rate": 5.36211165567804e-05, + "loss": 2.7131, + "step": 34101 + }, + { + "epoch": 1.5877039830528203, + "grad_norm": 0.35991918695828895, + "learning_rate": 5.361841492790812e-05, + "loss": 2.693, + "step": 34102 + }, + { + "epoch": 1.5877505412389135, + "grad_norm": 0.33412443284655163, + "learning_rate": 5.361571328841616e-05, + "loss": 2.6614, + "step": 34103 + }, + { + "epoch": 1.5877970994250064, + "grad_norm": 0.32497726685235606, + "learning_rate": 5.3613011638312414e-05, + "loss": 2.6654, + "step": 34104 + }, + { + "epoch": 1.5878436576110995, + "grad_norm": 0.3313731080644337, + "learning_rate": 5.3610309977604853e-05, + "loss": 2.7344, + "step": 34105 + }, + { + "epoch": 1.5878902157971926, + "grad_norm": 0.35352329603537247, + "learning_rate": 5.3607608306301385e-05, + "loss": 2.8354, + "step": 34106 + }, + { + "epoch": 1.5879367739832855, + "grad_norm": 0.32876255126105514, + "learning_rate": 5.3604906624409936e-05, + "loss": 2.623, + "step": 34107 + }, + { + "epoch": 1.5879833321693786, + "grad_norm": 0.3229892423815717, + "learning_rate": 5.360220493193845e-05, + "loss": 2.5968, + "step": 34108 + }, + { + "epoch": 1.5880298903554717, + "grad_norm": 0.3508734207890707, + "learning_rate": 5.359950322889484e-05, + "loss": 2.697, + "step": 34109 + }, + { + "epoch": 1.5880764485415648, + "grad_norm": 0.32394851189892854, + "learning_rate": 5.359680151528704e-05, + "loss": 2.7321, + "step": 34110 + }, + { + "epoch": 1.588123006727658, + "grad_norm": 0.35174974342050436, + "learning_rate": 5.3594099791122985e-05, + "loss": 2.6764, + "step": 34111 + }, + { + "epoch": 1.588169564913751, + "grad_norm": 0.34469901639394773, + "learning_rate": 5.359139805641062e-05, + "loss": 2.7136, + "step": 34112 + }, + { + "epoch": 1.5882161230998442, + "grad_norm": 0.3439864572697768, + "learning_rate": 5.358869631115784e-05, + "loss": 2.5928, + "step": 34113 + }, + { + "epoch": 1.588262681285937, + "grad_norm": 0.35466497725689244, + "learning_rate": 5.3585994555372577e-05, + "loss": 2.7278, + "step": 34114 + }, + { + "epoch": 1.5883092394720302, + "grad_norm": 0.33788499318429743, + "learning_rate": 5.3583292789062775e-05, + "loss": 2.7564, + "step": 34115 + }, + { + "epoch": 1.5883557976581233, + "grad_norm": 0.3297323933000545, + "learning_rate": 
5.358059101223638e-05, + "loss": 2.75, + "step": 34116 + }, + { + "epoch": 1.5884023558442162, + "grad_norm": 0.35712516440523523, + "learning_rate": 5.35778892249013e-05, + "loss": 2.6787, + "step": 34117 + }, + { + "epoch": 1.5884489140303093, + "grad_norm": 0.343821616149651, + "learning_rate": 5.357518742706545e-05, + "loss": 2.6519, + "step": 34118 + }, + { + "epoch": 1.5884954722164024, + "grad_norm": 0.3965564926642869, + "learning_rate": 5.357248561873679e-05, + "loss": 2.7293, + "step": 34119 + }, + { + "epoch": 1.5885420304024955, + "grad_norm": 0.3557077741220144, + "learning_rate": 5.3569783799923235e-05, + "loss": 2.7606, + "step": 34120 + }, + { + "epoch": 1.5885885885885886, + "grad_norm": 0.4015597802404642, + "learning_rate": 5.3567081970632726e-05, + "loss": 2.6995, + "step": 34121 + }, + { + "epoch": 1.5886351467746818, + "grad_norm": 0.3684587565018508, + "learning_rate": 5.356438013087318e-05, + "loss": 2.7508, + "step": 34122 + }, + { + "epoch": 1.5886817049607749, + "grad_norm": 0.3735264794836421, + "learning_rate": 5.3561678280652514e-05, + "loss": 2.7765, + "step": 34123 + }, + { + "epoch": 1.5887282631468678, + "grad_norm": 0.409693054811605, + "learning_rate": 5.355897641997869e-05, + "loss": 2.8228, + "step": 34124 + }, + { + "epoch": 1.5887748213329609, + "grad_norm": 0.35524380231280994, + "learning_rate": 5.3556274548859606e-05, + "loss": 2.6995, + "step": 34125 + }, + { + "epoch": 1.5888213795190538, + "grad_norm": 0.3717720645429417, + "learning_rate": 5.355357266730322e-05, + "loss": 2.7736, + "step": 34126 + }, + { + "epoch": 1.588867937705147, + "grad_norm": 0.37578733201368847, + "learning_rate": 5.355087077531744e-05, + "loss": 2.7075, + "step": 34127 + }, + { + "epoch": 1.58891449589124, + "grad_norm": 0.38026933972783167, + "learning_rate": 5.3548168872910206e-05, + "loss": 2.7719, + "step": 34128 + }, + { + "epoch": 1.5889610540773331, + "grad_norm": 0.35005493111150127, + "learning_rate": 5.354546696008944e-05, + "loss": 2.8619, + "step": 34129 + }, + { + "epoch": 1.5890076122634262, + "grad_norm": 0.3756344841565726, + "learning_rate": 5.3542765036863093e-05, + "loss": 2.6612, + "step": 34130 + }, + { + "epoch": 1.5890541704495194, + "grad_norm": 0.37483939629466023, + "learning_rate": 5.354006310323907e-05, + "loss": 2.7892, + "step": 34131 + }, + { + "epoch": 1.5891007286356125, + "grad_norm": 0.3547573825293556, + "learning_rate": 5.353736115922531e-05, + "loss": 2.7957, + "step": 34132 + }, + { + "epoch": 1.5891472868217056, + "grad_norm": 0.38522594207465827, + "learning_rate": 5.353465920482974e-05, + "loss": 2.819, + "step": 34133 + }, + { + "epoch": 1.5891938450077985, + "grad_norm": 0.38145082826371784, + "learning_rate": 5.35319572400603e-05, + "loss": 2.6831, + "step": 34134 + }, + { + "epoch": 1.5892404031938916, + "grad_norm": 0.36265987672534494, + "learning_rate": 5.352925526492493e-05, + "loss": 2.7158, + "step": 34135 + }, + { + "epoch": 1.5892869613799845, + "grad_norm": 0.37262603832802305, + "learning_rate": 5.3526553279431516e-05, + "loss": 2.7823, + "step": 34136 + }, + { + "epoch": 1.5893335195660776, + "grad_norm": 0.39425845597092235, + "learning_rate": 5.3523851283588024e-05, + "loss": 2.7937, + "step": 34137 + }, + { + "epoch": 1.5893800777521707, + "grad_norm": 0.31935210245304196, + "learning_rate": 5.3521149277402374e-05, + "loss": 2.6684, + "step": 34138 + }, + { + "epoch": 1.5894266359382638, + "grad_norm": 0.3688602638108882, + "learning_rate": 5.3518447260882496e-05, + "loss": 2.6688, + "step": 34139 + }, + { + 
"epoch": 1.589473194124357, + "grad_norm": 0.32561875953254676, + "learning_rate": 5.351574523403633e-05, + "loss": 2.693, + "step": 34140 + }, + { + "epoch": 1.58951975231045, + "grad_norm": 0.3597713679160088, + "learning_rate": 5.3513043196871794e-05, + "loss": 2.7956, + "step": 34141 + }, + { + "epoch": 1.5895663104965432, + "grad_norm": 0.3438577115922951, + "learning_rate": 5.351034114939681e-05, + "loss": 2.6774, + "step": 34142 + }, + { + "epoch": 1.589612868682636, + "grad_norm": 0.34659929105357057, + "learning_rate": 5.3507639091619335e-05, + "loss": 2.7982, + "step": 34143 + }, + { + "epoch": 1.5896594268687292, + "grad_norm": 0.3267069349003022, + "learning_rate": 5.350493702354728e-05, + "loss": 2.7124, + "step": 34144 + }, + { + "epoch": 1.5897059850548223, + "grad_norm": 0.34836351088799367, + "learning_rate": 5.350223494518859e-05, + "loss": 2.7006, + "step": 34145 + }, + { + "epoch": 1.5897525432409152, + "grad_norm": 0.33975063444940234, + "learning_rate": 5.349953285655117e-05, + "loss": 2.7144, + "step": 34146 + }, + { + "epoch": 1.5897991014270083, + "grad_norm": 0.32797194925443224, + "learning_rate": 5.349683075764299e-05, + "loss": 2.743, + "step": 34147 + }, + { + "epoch": 1.5898456596131014, + "grad_norm": 0.33772652074320597, + "learning_rate": 5.349412864847194e-05, + "loss": 2.748, + "step": 34148 + }, + { + "epoch": 1.5898922177991945, + "grad_norm": 0.3270341297517108, + "learning_rate": 5.3491426529045976e-05, + "loss": 2.6837, + "step": 34149 + }, + { + "epoch": 1.5899387759852877, + "grad_norm": 0.33096522410639695, + "learning_rate": 5.3488724399373e-05, + "loss": 2.7359, + "step": 34150 + }, + { + "epoch": 1.5899853341713808, + "grad_norm": 0.3314089431815705, + "learning_rate": 5.348602225946097e-05, + "loss": 2.564, + "step": 34151 + }, + { + "epoch": 1.5900318923574739, + "grad_norm": 0.35803172589477994, + "learning_rate": 5.3483320109317806e-05, + "loss": 2.7899, + "step": 34152 + }, + { + "epoch": 1.5900784505435668, + "grad_norm": 0.31945095348763175, + "learning_rate": 5.348061794895145e-05, + "loss": 2.708, + "step": 34153 + }, + { + "epoch": 1.59012500872966, + "grad_norm": 0.3242754273974784, + "learning_rate": 5.347791577836981e-05, + "loss": 2.5817, + "step": 34154 + }, + { + "epoch": 1.590171566915753, + "grad_norm": 0.3460538221977639, + "learning_rate": 5.3475213597580845e-05, + "loss": 2.7042, + "step": 34155 + }, + { + "epoch": 1.590218125101846, + "grad_norm": 0.31473613227994013, + "learning_rate": 5.3472511406592454e-05, + "loss": 2.7456, + "step": 34156 + }, + { + "epoch": 1.590264683287939, + "grad_norm": 0.33108788036785164, + "learning_rate": 5.3469809205412604e-05, + "loss": 2.725, + "step": 34157 + }, + { + "epoch": 1.5903112414740321, + "grad_norm": 0.32092224538509506, + "learning_rate": 5.346710699404919e-05, + "loss": 2.7107, + "step": 34158 + }, + { + "epoch": 1.5903577996601252, + "grad_norm": 0.3297733139015911, + "learning_rate": 5.3464404772510155e-05, + "loss": 2.8077, + "step": 34159 + }, + { + "epoch": 1.5904043578462184, + "grad_norm": 0.3156933498806614, + "learning_rate": 5.346170254080345e-05, + "loss": 2.7256, + "step": 34160 + }, + { + "epoch": 1.5904509160323115, + "grad_norm": 0.31800137806132855, + "learning_rate": 5.345900029893698e-05, + "loss": 2.682, + "step": 34161 + }, + { + "epoch": 1.5904974742184046, + "grad_norm": 0.31583575170004985, + "learning_rate": 5.345629804691869e-05, + "loss": 2.6929, + "step": 34162 + }, + { + "epoch": 1.5905440324044975, + "grad_norm": 0.3234709196196422, + 
"learning_rate": 5.34535957847565e-05, + "loss": 2.7291, + "step": 34163 + }, + { + "epoch": 1.5905905905905906, + "grad_norm": 0.31570282055784826, + "learning_rate": 5.345089351245834e-05, + "loss": 2.7082, + "step": 34164 + }, + { + "epoch": 1.5906371487766835, + "grad_norm": 0.3443205051854875, + "learning_rate": 5.344819123003216e-05, + "loss": 2.7511, + "step": 34165 + }, + { + "epoch": 1.5906837069627766, + "grad_norm": 0.32515988089436554, + "learning_rate": 5.344548893748589e-05, + "loss": 2.7635, + "step": 34166 + }, + { + "epoch": 1.5907302651488697, + "grad_norm": 0.332336232329763, + "learning_rate": 5.344278663482743e-05, + "loss": 2.6729, + "step": 34167 + }, + { + "epoch": 1.5907768233349628, + "grad_norm": 0.3296015716831712, + "learning_rate": 5.344008432206472e-05, + "loss": 2.7061, + "step": 34168 + }, + { + "epoch": 1.590823381521056, + "grad_norm": 0.3585625614769359, + "learning_rate": 5.3437381999205715e-05, + "loss": 2.7164, + "step": 34169 + }, + { + "epoch": 1.590869939707149, + "grad_norm": 0.34970227257388004, + "learning_rate": 5.343467966625832e-05, + "loss": 2.6448, + "step": 34170 + }, + { + "epoch": 1.5909164978932422, + "grad_norm": 0.3614111889140807, + "learning_rate": 5.343197732323051e-05, + "loss": 2.7633, + "step": 34171 + }, + { + "epoch": 1.5909630560793353, + "grad_norm": 0.3680854906001637, + "learning_rate": 5.342927497013016e-05, + "loss": 2.7741, + "step": 34172 + }, + { + "epoch": 1.5910096142654282, + "grad_norm": 0.3245731092317316, + "learning_rate": 5.342657260696524e-05, + "loss": 2.65, + "step": 34173 + }, + { + "epoch": 1.5910561724515213, + "grad_norm": 0.35709051962639765, + "learning_rate": 5.3423870233743654e-05, + "loss": 2.7159, + "step": 34174 + }, + { + "epoch": 1.5911027306376142, + "grad_norm": 0.3522254940378773, + "learning_rate": 5.3421167850473364e-05, + "loss": 2.761, + "step": 34175 + }, + { + "epoch": 1.5911492888237073, + "grad_norm": 0.3712222733833448, + "learning_rate": 5.3418465457162257e-05, + "loss": 2.7813, + "step": 34176 + }, + { + "epoch": 1.5911958470098004, + "grad_norm": 0.3611759835644147, + "learning_rate": 5.34157630538183e-05, + "loss": 2.6324, + "step": 34177 + }, + { + "epoch": 1.5912424051958936, + "grad_norm": 0.3761037282896647, + "learning_rate": 5.341306064044942e-05, + "loss": 2.7576, + "step": 34178 + }, + { + "epoch": 1.5912889633819867, + "grad_norm": 0.35678036611782876, + "learning_rate": 5.341035821706354e-05, + "loss": 2.6982, + "step": 34179 + }, + { + "epoch": 1.5913355215680798, + "grad_norm": 0.3595588926152776, + "learning_rate": 5.340765578366861e-05, + "loss": 2.7443, + "step": 34180 + }, + { + "epoch": 1.591382079754173, + "grad_norm": 0.39695510086125446, + "learning_rate": 5.340495334027252e-05, + "loss": 2.7422, + "step": 34181 + }, + { + "epoch": 1.591428637940266, + "grad_norm": 0.3573866555514929, + "learning_rate": 5.3402250886883244e-05, + "loss": 2.669, + "step": 34182 + }, + { + "epoch": 1.591475196126359, + "grad_norm": 0.35586319194567834, + "learning_rate": 5.339954842350868e-05, + "loss": 2.829, + "step": 34183 + }, + { + "epoch": 1.591521754312452, + "grad_norm": 0.36297809345032717, + "learning_rate": 5.33968459501568e-05, + "loss": 2.7443, + "step": 34184 + }, + { + "epoch": 1.591568312498545, + "grad_norm": 0.3309160679196846, + "learning_rate": 5.33941434668355e-05, + "loss": 2.7208, + "step": 34185 + }, + { + "epoch": 1.591614870684638, + "grad_norm": 0.3828880650955875, + "learning_rate": 5.339144097355273e-05, + "loss": 2.7248, + "step": 34186 + }, + { + 
"epoch": 1.5916614288707311, + "grad_norm": 0.3308409644404046, + "learning_rate": 5.3388738470316416e-05, + "loss": 2.7706, + "step": 34187 + }, + { + "epoch": 1.5917079870568243, + "grad_norm": 0.34045368404801274, + "learning_rate": 5.338603595713447e-05, + "loss": 2.7763, + "step": 34188 + }, + { + "epoch": 1.5917545452429174, + "grad_norm": 0.37620376957334456, + "learning_rate": 5.338333343401486e-05, + "loss": 2.7322, + "step": 34189 + }, + { + "epoch": 1.5918011034290105, + "grad_norm": 0.33774356854772997, + "learning_rate": 5.338063090096549e-05, + "loss": 2.5418, + "step": 34190 + }, + { + "epoch": 1.5918476616151036, + "grad_norm": 0.33659175849920103, + "learning_rate": 5.3377928357994314e-05, + "loss": 2.7542, + "step": 34191 + }, + { + "epoch": 1.5918942198011965, + "grad_norm": 0.3376308685836114, + "learning_rate": 5.337522580510923e-05, + "loss": 2.7546, + "step": 34192 + }, + { + "epoch": 1.5919407779872896, + "grad_norm": 0.34600509760118214, + "learning_rate": 5.3372523242318204e-05, + "loss": 2.6185, + "step": 34193 + }, + { + "epoch": 1.5919873361733827, + "grad_norm": 0.3223167886348424, + "learning_rate": 5.3369820669629154e-05, + "loss": 2.7873, + "step": 34194 + }, + { + "epoch": 1.5920338943594756, + "grad_norm": 0.4039846499177739, + "learning_rate": 5.3367118087050005e-05, + "loss": 2.6611, + "step": 34195 + }, + { + "epoch": 1.5920804525455687, + "grad_norm": 0.3337247122665219, + "learning_rate": 5.3364415494588706e-05, + "loss": 2.7526, + "step": 34196 + }, + { + "epoch": 1.5921270107316619, + "grad_norm": 0.3813885840749021, + "learning_rate": 5.336171289225318e-05, + "loss": 2.7647, + "step": 34197 + }, + { + "epoch": 1.592173568917755, + "grad_norm": 0.34877669026112545, + "learning_rate": 5.335901028005135e-05, + "loss": 2.6371, + "step": 34198 + }, + { + "epoch": 1.592220127103848, + "grad_norm": 0.38067510514804925, + "learning_rate": 5.335630765799117e-05, + "loss": 2.7656, + "step": 34199 + }, + { + "epoch": 1.5922666852899412, + "grad_norm": 0.3350197413000092, + "learning_rate": 5.335360502608054e-05, + "loss": 2.714, + "step": 34200 + }, + { + "epoch": 1.5923132434760343, + "grad_norm": 0.3433254611012668, + "learning_rate": 5.335090238432742e-05, + "loss": 2.6995, + "step": 34201 + }, + { + "epoch": 1.5923598016621272, + "grad_norm": 0.3389191957308433, + "learning_rate": 5.334819973273973e-05, + "loss": 2.7231, + "step": 34202 + }, + { + "epoch": 1.5924063598482203, + "grad_norm": 0.3543048555391961, + "learning_rate": 5.33454970713254e-05, + "loss": 2.8252, + "step": 34203 + }, + { + "epoch": 1.5924529180343134, + "grad_norm": 0.3283094718388188, + "learning_rate": 5.3342794400092364e-05, + "loss": 2.714, + "step": 34204 + }, + { + "epoch": 1.5924994762204063, + "grad_norm": 0.3839015350873165, + "learning_rate": 5.3340091719048556e-05, + "loss": 2.7875, + "step": 34205 + }, + { + "epoch": 1.5925460344064994, + "grad_norm": 0.32990974467043943, + "learning_rate": 5.333738902820191e-05, + "loss": 2.6583, + "step": 34206 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.3516848519522389, + "learning_rate": 5.333468632756036e-05, + "loss": 2.7562, + "step": 34207 + }, + { + "epoch": 1.5926391507786857, + "grad_norm": 0.34108000812344047, + "learning_rate": 5.333198361713183e-05, + "loss": 2.7327, + "step": 34208 + }, + { + "epoch": 1.5926857089647788, + "grad_norm": 0.3624196404284248, + "learning_rate": 5.332928089692425e-05, + "loss": 2.6541, + "step": 34209 + }, + { + "epoch": 1.592732267150872, + "grad_norm": 0.34893069064664006, + 
"learning_rate": 5.332657816694557e-05, + "loss": 2.7457, + "step": 34210 + }, + { + "epoch": 1.592778825336965, + "grad_norm": 0.3731345030580511, + "learning_rate": 5.332387542720371e-05, + "loss": 2.5608, + "step": 34211 + }, + { + "epoch": 1.592825383523058, + "grad_norm": 0.3356366605275002, + "learning_rate": 5.332117267770661e-05, + "loss": 2.7129, + "step": 34212 + }, + { + "epoch": 1.592871941709151, + "grad_norm": 0.3418226371262075, + "learning_rate": 5.3318469918462175e-05, + "loss": 2.7402, + "step": 34213 + }, + { + "epoch": 1.592918499895244, + "grad_norm": 0.34508507274010897, + "learning_rate": 5.331576714947837e-05, + "loss": 2.6181, + "step": 34214 + }, + { + "epoch": 1.592965058081337, + "grad_norm": 0.3472020010140943, + "learning_rate": 5.331306437076311e-05, + "loss": 2.7131, + "step": 34215 + }, + { + "epoch": 1.5930116162674302, + "grad_norm": 0.3510123602647008, + "learning_rate": 5.331036158232434e-05, + "loss": 2.7377, + "step": 34216 + }, + { + "epoch": 1.5930581744535233, + "grad_norm": 0.3142859720491679, + "learning_rate": 5.330765878416998e-05, + "loss": 2.707, + "step": 34217 + }, + { + "epoch": 1.5931047326396164, + "grad_norm": 0.3549814306499711, + "learning_rate": 5.330495597630797e-05, + "loss": 2.6979, + "step": 34218 + }, + { + "epoch": 1.5931512908257095, + "grad_norm": 0.33710455361528285, + "learning_rate": 5.330225315874623e-05, + "loss": 2.8297, + "step": 34219 + }, + { + "epoch": 1.5931978490118026, + "grad_norm": 0.331322565200179, + "learning_rate": 5.329955033149272e-05, + "loss": 2.692, + "step": 34220 + }, + { + "epoch": 1.5932444071978957, + "grad_norm": 0.34161638128585997, + "learning_rate": 5.3296847494555334e-05, + "loss": 2.7338, + "step": 34221 + }, + { + "epoch": 1.5932909653839886, + "grad_norm": 0.35851439962777226, + "learning_rate": 5.3294144647942036e-05, + "loss": 2.6023, + "step": 34222 + }, + { + "epoch": 1.5933375235700817, + "grad_norm": 0.35942857207001455, + "learning_rate": 5.329144179166076e-05, + "loss": 2.7587, + "step": 34223 + }, + { + "epoch": 1.5933840817561746, + "grad_norm": 0.3482910091843324, + "learning_rate": 5.328873892571941e-05, + "loss": 2.7663, + "step": 34224 + }, + { + "epoch": 1.5934306399422677, + "grad_norm": 0.3374464170271846, + "learning_rate": 5.328603605012594e-05, + "loss": 2.69, + "step": 34225 + }, + { + "epoch": 1.5934771981283609, + "grad_norm": 0.3345464911676524, + "learning_rate": 5.328333316488828e-05, + "loss": 2.7907, + "step": 34226 + }, + { + "epoch": 1.593523756314454, + "grad_norm": 0.3641948977286447, + "learning_rate": 5.328063027001435e-05, + "loss": 2.6727, + "step": 34227 + }, + { + "epoch": 1.593570314500547, + "grad_norm": 0.344072163588421, + "learning_rate": 5.3277927365512105e-05, + "loss": 2.742, + "step": 34228 + }, + { + "epoch": 1.5936168726866402, + "grad_norm": 0.3212916180080134, + "learning_rate": 5.327522445138947e-05, + "loss": 2.7137, + "step": 34229 + }, + { + "epoch": 1.5936634308727333, + "grad_norm": 0.325771981156719, + "learning_rate": 5.327252152765436e-05, + "loss": 2.7502, + "step": 34230 + }, + { + "epoch": 1.5937099890588262, + "grad_norm": 0.35421886139760966, + "learning_rate": 5.326981859431472e-05, + "loss": 2.7357, + "step": 34231 + }, + { + "epoch": 1.5937565472449193, + "grad_norm": 0.3229786478090792, + "learning_rate": 5.32671156513785e-05, + "loss": 2.6993, + "step": 34232 + }, + { + "epoch": 1.5938031054310124, + "grad_norm": 0.37054993731546915, + "learning_rate": 5.3264412698853604e-05, + "loss": 2.6691, + "step": 34233 + }, + { + 
"epoch": 1.5938496636171053, + "grad_norm": 0.3514919976929014, + "learning_rate": 5.326170973674799e-05, + "loss": 2.6402, + "step": 34234 + }, + { + "epoch": 1.5938962218031985, + "grad_norm": 0.323257356305754, + "learning_rate": 5.3259006765069576e-05, + "loss": 2.6932, + "step": 34235 + }, + { + "epoch": 1.5939427799892916, + "grad_norm": 0.4144995417474857, + "learning_rate": 5.32563037838263e-05, + "loss": 2.7157, + "step": 34236 + }, + { + "epoch": 1.5939893381753847, + "grad_norm": 0.3373587647749607, + "learning_rate": 5.325360079302608e-05, + "loss": 2.7449, + "step": 34237 + }, + { + "epoch": 1.5940358963614778, + "grad_norm": 0.3896984947997369, + "learning_rate": 5.325089779267689e-05, + "loss": 2.637, + "step": 34238 + }, + { + "epoch": 1.594082454547571, + "grad_norm": 0.32859535222464675, + "learning_rate": 5.324819478278661e-05, + "loss": 2.7459, + "step": 34239 + }, + { + "epoch": 1.594129012733664, + "grad_norm": 0.3697472280297446, + "learning_rate": 5.3245491763363194e-05, + "loss": 2.6512, + "step": 34240 + }, + { + "epoch": 1.594175570919757, + "grad_norm": 0.33506829759567597, + "learning_rate": 5.3242788734414596e-05, + "loss": 2.7307, + "step": 34241 + }, + { + "epoch": 1.59422212910585, + "grad_norm": 0.3341141966494128, + "learning_rate": 5.324008569594873e-05, + "loss": 2.7263, + "step": 34242 + }, + { + "epoch": 1.5942686872919432, + "grad_norm": 0.3170248538502661, + "learning_rate": 5.3237382647973534e-05, + "loss": 2.7426, + "step": 34243 + }, + { + "epoch": 1.594315245478036, + "grad_norm": 0.3463895044461798, + "learning_rate": 5.3234679590496927e-05, + "loss": 2.7594, + "step": 34244 + }, + { + "epoch": 1.5943618036641292, + "grad_norm": 0.32369042480833016, + "learning_rate": 5.323197652352686e-05, + "loss": 2.6938, + "step": 34245 + }, + { + "epoch": 1.5944083618502223, + "grad_norm": 0.3151090341375255, + "learning_rate": 5.322927344707126e-05, + "loss": 2.726, + "step": 34246 + }, + { + "epoch": 1.5944549200363154, + "grad_norm": 0.3385025037814257, + "learning_rate": 5.322657036113806e-05, + "loss": 2.7121, + "step": 34247 + }, + { + "epoch": 1.5945014782224085, + "grad_norm": 0.32868496851642154, + "learning_rate": 5.3223867265735195e-05, + "loss": 2.7615, + "step": 34248 + }, + { + "epoch": 1.5945480364085016, + "grad_norm": 0.3302024198298769, + "learning_rate": 5.3221164160870604e-05, + "loss": 2.6903, + "step": 34249 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 0.3604643473219002, + "learning_rate": 5.3218461046552204e-05, + "loss": 2.7663, + "step": 34250 + }, + { + "epoch": 1.5946411527806876, + "grad_norm": 0.3565916363502355, + "learning_rate": 5.3215757922787944e-05, + "loss": 2.7074, + "step": 34251 + }, + { + "epoch": 1.5946877109667807, + "grad_norm": 0.3720628376753195, + "learning_rate": 5.321305478958575e-05, + "loss": 2.7605, + "step": 34252 + }, + { + "epoch": 1.5947342691528736, + "grad_norm": 0.357128037876627, + "learning_rate": 5.321035164695356e-05, + "loss": 2.77, + "step": 34253 + }, + { + "epoch": 1.5947808273389668, + "grad_norm": 0.35553449762277983, + "learning_rate": 5.320764849489929e-05, + "loss": 2.768, + "step": 34254 + }, + { + "epoch": 1.5948273855250599, + "grad_norm": 0.351856887466941, + "learning_rate": 5.32049453334309e-05, + "loss": 2.7153, + "step": 34255 + }, + { + "epoch": 1.594873943711153, + "grad_norm": 0.35331644825955183, + "learning_rate": 5.3202242162556314e-05, + "loss": 2.7447, + "step": 34256 + }, + { + "epoch": 1.594920501897246, + "grad_norm": 0.32736799148722945, + "learning_rate": 
5.319953898228345e-05, + "loss": 2.796, + "step": 34257 + }, + { + "epoch": 1.5949670600833392, + "grad_norm": 0.35046913562374354, + "learning_rate": 5.3196835792620257e-05, + "loss": 2.6908, + "step": 34258 + }, + { + "epoch": 1.5950136182694323, + "grad_norm": 0.34885578096434533, + "learning_rate": 5.319413259357466e-05, + "loss": 2.7159, + "step": 34259 + }, + { + "epoch": 1.5950601764555254, + "grad_norm": 0.3424058554260304, + "learning_rate": 5.319142938515462e-05, + "loss": 2.775, + "step": 34260 + }, + { + "epoch": 1.5951067346416183, + "grad_norm": 0.3375883463663009, + "learning_rate": 5.3188726167368044e-05, + "loss": 2.6546, + "step": 34261 + }, + { + "epoch": 1.5951532928277115, + "grad_norm": 0.34391685843316566, + "learning_rate": 5.318602294022287e-05, + "loss": 2.7397, + "step": 34262 + }, + { + "epoch": 1.5951998510138043, + "grad_norm": 0.3240404949106551, + "learning_rate": 5.318331970372702e-05, + "loss": 2.7427, + "step": 34263 + }, + { + "epoch": 1.5952464091998975, + "grad_norm": 0.34781257595241605, + "learning_rate": 5.318061645788844e-05, + "loss": 2.6679, + "step": 34264 + }, + { + "epoch": 1.5952929673859906, + "grad_norm": 0.3689600136946627, + "learning_rate": 5.317791320271508e-05, + "loss": 2.8686, + "step": 34265 + }, + { + "epoch": 1.5953395255720837, + "grad_norm": 0.3515742801287378, + "learning_rate": 5.3175209938214855e-05, + "loss": 2.8059, + "step": 34266 + }, + { + "epoch": 1.5953860837581768, + "grad_norm": 0.34918974215520737, + "learning_rate": 5.3172506664395686e-05, + "loss": 2.7249, + "step": 34267 + }, + { + "epoch": 1.59543264194427, + "grad_norm": 0.3743164255473494, + "learning_rate": 5.3169803381265524e-05, + "loss": 2.7336, + "step": 34268 + }, + { + "epoch": 1.595479200130363, + "grad_norm": 0.3457935239112415, + "learning_rate": 5.316710008883231e-05, + "loss": 2.7045, + "step": 34269 + }, + { + "epoch": 1.5955257583164562, + "grad_norm": 0.3650119343020333, + "learning_rate": 5.3164396787103975e-05, + "loss": 2.6503, + "step": 34270 + }, + { + "epoch": 1.595572316502549, + "grad_norm": 0.3391013960556064, + "learning_rate": 5.3161693476088445e-05, + "loss": 2.7228, + "step": 34271 + }, + { + "epoch": 1.5956188746886422, + "grad_norm": 0.37059515751181066, + "learning_rate": 5.315899015579364e-05, + "loss": 2.6449, + "step": 34272 + }, + { + "epoch": 1.595665432874735, + "grad_norm": 0.3149893197956652, + "learning_rate": 5.315628682622753e-05, + "loss": 2.6488, + "step": 34273 + }, + { + "epoch": 1.5957119910608282, + "grad_norm": 0.37033856569612755, + "learning_rate": 5.315358348739803e-05, + "loss": 2.7435, + "step": 34274 + }, + { + "epoch": 1.5957585492469213, + "grad_norm": 0.3374690450854275, + "learning_rate": 5.3150880139313066e-05, + "loss": 2.6612, + "step": 34275 + }, + { + "epoch": 1.5958051074330144, + "grad_norm": 0.34992545027811256, + "learning_rate": 5.3148176781980574e-05, + "loss": 2.6145, + "step": 34276 + }, + { + "epoch": 1.5958516656191075, + "grad_norm": 0.33540818317927146, + "learning_rate": 5.31454734154085e-05, + "loss": 2.7445, + "step": 34277 + }, + { + "epoch": 1.5958982238052006, + "grad_norm": 0.36452804853653187, + "learning_rate": 5.314277003960476e-05, + "loss": 2.7444, + "step": 34278 + }, + { + "epoch": 1.5959447819912937, + "grad_norm": 0.33182639926686347, + "learning_rate": 5.3140066654577314e-05, + "loss": 2.7763, + "step": 34279 + }, + { + "epoch": 1.5959913401773866, + "grad_norm": 0.3438042481614952, + "learning_rate": 5.3137363260334075e-05, + "loss": 2.6893, + "step": 34280 + }, + { + 
"epoch": 1.5960378983634798, + "grad_norm": 0.34204149126686373, + "learning_rate": 5.3134659856882984e-05, + "loss": 2.7765, + "step": 34281 + }, + { + "epoch": 1.5960844565495729, + "grad_norm": 0.3369826869851793, + "learning_rate": 5.313195644423197e-05, + "loss": 2.7349, + "step": 34282 + }, + { + "epoch": 1.5961310147356658, + "grad_norm": 0.33649812431652515, + "learning_rate": 5.312925302238899e-05, + "loss": 2.6751, + "step": 34283 + }, + { + "epoch": 1.5961775729217589, + "grad_norm": 0.3509386234040123, + "learning_rate": 5.312654959136194e-05, + "loss": 2.7851, + "step": 34284 + }, + { + "epoch": 1.596224131107852, + "grad_norm": 0.35007381158797596, + "learning_rate": 5.312384615115878e-05, + "loss": 2.7671, + "step": 34285 + }, + { + "epoch": 1.596270689293945, + "grad_norm": 0.3398129849972221, + "learning_rate": 5.312114270178744e-05, + "loss": 2.6087, + "step": 34286 + }, + { + "epoch": 1.5963172474800382, + "grad_norm": 0.36444875780025215, + "learning_rate": 5.311843924325587e-05, + "loss": 2.7605, + "step": 34287 + }, + { + "epoch": 1.5963638056661313, + "grad_norm": 0.3635447071053205, + "learning_rate": 5.311573577557198e-05, + "loss": 2.8079, + "step": 34288 + }, + { + "epoch": 1.5964103638522245, + "grad_norm": 0.35100811397896403, + "learning_rate": 5.31130322987437e-05, + "loss": 2.7452, + "step": 34289 + }, + { + "epoch": 1.5964569220383173, + "grad_norm": 0.3921082615588329, + "learning_rate": 5.3110328812778984e-05, + "loss": 2.6969, + "step": 34290 + }, + { + "epoch": 1.5965034802244105, + "grad_norm": 0.3187311040488166, + "learning_rate": 5.310762531768576e-05, + "loss": 2.7727, + "step": 34291 + }, + { + "epoch": 1.5965500384105036, + "grad_norm": 0.37561477115190994, + "learning_rate": 5.310492181347196e-05, + "loss": 2.6841, + "step": 34292 + }, + { + "epoch": 1.5965965965965965, + "grad_norm": 0.3475003265561363, + "learning_rate": 5.3102218300145526e-05, + "loss": 2.7259, + "step": 34293 + }, + { + "epoch": 1.5966431547826896, + "grad_norm": 0.3533103903040003, + "learning_rate": 5.309951477771437e-05, + "loss": 2.7827, + "step": 34294 + }, + { + "epoch": 1.5966897129687827, + "grad_norm": 0.3811311051768784, + "learning_rate": 5.309681124618646e-05, + "loss": 2.7495, + "step": 34295 + }, + { + "epoch": 1.5967362711548758, + "grad_norm": 0.3508910883842924, + "learning_rate": 5.309410770556971e-05, + "loss": 2.727, + "step": 34296 + }, + { + "epoch": 1.596782829340969, + "grad_norm": 0.3836586789912095, + "learning_rate": 5.309140415587207e-05, + "loss": 2.7748, + "step": 34297 + }, + { + "epoch": 1.596829387527062, + "grad_norm": 0.342184016165861, + "learning_rate": 5.308870059710145e-05, + "loss": 2.7742, + "step": 34298 + }, + { + "epoch": 1.5968759457131552, + "grad_norm": 0.37551232805771995, + "learning_rate": 5.3085997029265786e-05, + "loss": 2.717, + "step": 34299 + }, + { + "epoch": 1.596922503899248, + "grad_norm": 0.360759822801753, + "learning_rate": 5.308329345237305e-05, + "loss": 2.7497, + "step": 34300 + }, + { + "epoch": 1.5969690620853412, + "grad_norm": 0.3628843253125453, + "learning_rate": 5.308058986643115e-05, + "loss": 2.8591, + "step": 34301 + }, + { + "epoch": 1.597015620271434, + "grad_norm": 0.36434629545330616, + "learning_rate": 5.307788627144801e-05, + "loss": 2.7311, + "step": 34302 + }, + { + "epoch": 1.5970621784575272, + "grad_norm": 0.3372029046400025, + "learning_rate": 5.307518266743158e-05, + "loss": 2.6666, + "step": 34303 + }, + { + "epoch": 1.5971087366436203, + "grad_norm": 0.34470789472940905, + 
"learning_rate": 5.307247905438979e-05, + "loss": 2.7072, + "step": 34304 + }, + { + "epoch": 1.5971552948297134, + "grad_norm": 0.3586528177552945, + "learning_rate": 5.306977543233057e-05, + "loss": 2.7572, + "step": 34305 + }, + { + "epoch": 1.5972018530158065, + "grad_norm": 0.32296408977510954, + "learning_rate": 5.306707180126188e-05, + "loss": 2.795, + "step": 34306 + }, + { + "epoch": 1.5972484112018996, + "grad_norm": 0.362132982058813, + "learning_rate": 5.306436816119162e-05, + "loss": 2.8138, + "step": 34307 + }, + { + "epoch": 1.5972949693879928, + "grad_norm": 0.34444265459373624, + "learning_rate": 5.306166451212775e-05, + "loss": 2.7307, + "step": 34308 + }, + { + "epoch": 1.5973415275740859, + "grad_norm": 0.3440258097966307, + "learning_rate": 5.3058960854078186e-05, + "loss": 2.6804, + "step": 34309 + }, + { + "epoch": 1.5973880857601788, + "grad_norm": 0.36649463766916973, + "learning_rate": 5.3056257187050893e-05, + "loss": 2.6956, + "step": 34310 + }, + { + "epoch": 1.5974346439462719, + "grad_norm": 0.3270986026092851, + "learning_rate": 5.3053553511053766e-05, + "loss": 2.6991, + "step": 34311 + }, + { + "epoch": 1.5974812021323648, + "grad_norm": 0.36832011141808585, + "learning_rate": 5.3050849826094765e-05, + "loss": 2.7437, + "step": 34312 + }, + { + "epoch": 1.597527760318458, + "grad_norm": 0.3384264677127298, + "learning_rate": 5.304814613218183e-05, + "loss": 2.6021, + "step": 34313 + }, + { + "epoch": 1.597574318504551, + "grad_norm": 0.3438634200534363, + "learning_rate": 5.304544242932288e-05, + "loss": 2.7127, + "step": 34314 + }, + { + "epoch": 1.5976208766906441, + "grad_norm": 0.3518136842046231, + "learning_rate": 5.304273871752586e-05, + "loss": 2.6916, + "step": 34315 + }, + { + "epoch": 1.5976674348767372, + "grad_norm": 0.356975903266722, + "learning_rate": 5.3040034996798706e-05, + "loss": 2.6747, + "step": 34316 + }, + { + "epoch": 1.5977139930628304, + "grad_norm": 0.36223214918330654, + "learning_rate": 5.303733126714934e-05, + "loss": 2.7959, + "step": 34317 + }, + { + "epoch": 1.5977605512489235, + "grad_norm": 0.36455280987534894, + "learning_rate": 5.30346275285857e-05, + "loss": 2.705, + "step": 34318 + }, + { + "epoch": 1.5978071094350164, + "grad_norm": 0.35942230084682014, + "learning_rate": 5.3031923781115745e-05, + "loss": 2.7444, + "step": 34319 + }, + { + "epoch": 1.5978536676211095, + "grad_norm": 0.36784345631119164, + "learning_rate": 5.3029220024747374e-05, + "loss": 2.7552, + "step": 34320 + }, + { + "epoch": 1.5979002258072026, + "grad_norm": 0.34371604576052806, + "learning_rate": 5.3026516259488545e-05, + "loss": 2.7336, + "step": 34321 + }, + { + "epoch": 1.5979467839932955, + "grad_norm": 0.38983428729183045, + "learning_rate": 5.3023812485347194e-05, + "loss": 2.772, + "step": 34322 + }, + { + "epoch": 1.5979933421793886, + "grad_norm": 0.37113971975806825, + "learning_rate": 5.3021108702331234e-05, + "loss": 2.683, + "step": 34323 + }, + { + "epoch": 1.5980399003654817, + "grad_norm": 0.35159504070923947, + "learning_rate": 5.301840491044865e-05, + "loss": 2.7614, + "step": 34324 + }, + { + "epoch": 1.5980864585515748, + "grad_norm": 0.367569619691553, + "learning_rate": 5.3015701109707325e-05, + "loss": 2.6247, + "step": 34325 + }, + { + "epoch": 1.598133016737668, + "grad_norm": 0.38969736628484003, + "learning_rate": 5.301299730011522e-05, + "loss": 2.7925, + "step": 34326 + }, + { + "epoch": 1.598179574923761, + "grad_norm": 0.3433691445092774, + "learning_rate": 5.3010293481680265e-05, + "loss": 2.7479, + "step": 
34327 + }, + { + "epoch": 1.5982261331098542, + "grad_norm": 0.38182976468478363, + "learning_rate": 5.300758965441039e-05, + "loss": 2.7077, + "step": 34328 + }, + { + "epoch": 1.598272691295947, + "grad_norm": 0.35202248336551634, + "learning_rate": 5.300488581831354e-05, + "loss": 2.6516, + "step": 34329 + }, + { + "epoch": 1.5983192494820402, + "grad_norm": 0.35550143205119744, + "learning_rate": 5.3002181973397646e-05, + "loss": 2.691, + "step": 34330 + }, + { + "epoch": 1.5983658076681333, + "grad_norm": 0.38017754693529765, + "learning_rate": 5.299947811967063e-05, + "loss": 2.6664, + "step": 34331 + }, + { + "epoch": 1.5984123658542262, + "grad_norm": 0.35005417387607435, + "learning_rate": 5.2996774257140456e-05, + "loss": 2.6803, + "step": 34332 + }, + { + "epoch": 1.5984589240403193, + "grad_norm": 0.32008278593236233, + "learning_rate": 5.299407038581505e-05, + "loss": 2.679, + "step": 34333 + }, + { + "epoch": 1.5985054822264124, + "grad_norm": 0.3770475478805752, + "learning_rate": 5.2991366505702335e-05, + "loss": 2.6504, + "step": 34334 + }, + { + "epoch": 1.5985520404125055, + "grad_norm": 0.31007642854202266, + "learning_rate": 5.2988662616810245e-05, + "loss": 2.7899, + "step": 34335 + }, + { + "epoch": 1.5985985985985987, + "grad_norm": 0.34730879305064505, + "learning_rate": 5.2985958719146736e-05, + "loss": 2.6516, + "step": 34336 + }, + { + "epoch": 1.5986451567846918, + "grad_norm": 0.33501980137490533, + "learning_rate": 5.298325481271973e-05, + "loss": 2.6577, + "step": 34337 + }, + { + "epoch": 1.5986917149707849, + "grad_norm": 0.34133157742273645, + "learning_rate": 5.298055089753716e-05, + "loss": 2.7059, + "step": 34338 + }, + { + "epoch": 1.5987382731568778, + "grad_norm": 0.3329267409993298, + "learning_rate": 5.297784697360698e-05, + "loss": 2.8077, + "step": 34339 + }, + { + "epoch": 1.598784831342971, + "grad_norm": 0.37824023587372124, + "learning_rate": 5.29751430409371e-05, + "loss": 2.8128, + "step": 34340 + }, + { + "epoch": 1.5988313895290638, + "grad_norm": 0.34225833900461994, + "learning_rate": 5.2972439099535466e-05, + "loss": 2.7216, + "step": 34341 + }, + { + "epoch": 1.598877947715157, + "grad_norm": 0.3518085863523912, + "learning_rate": 5.296973514941004e-05, + "loss": 2.7079, + "step": 34342 + }, + { + "epoch": 1.59892450590125, + "grad_norm": 0.35043153231855356, + "learning_rate": 5.2967031190568706e-05, + "loss": 2.6757, + "step": 34343 + }, + { + "epoch": 1.5989710640873431, + "grad_norm": 0.40295316464205755, + "learning_rate": 5.2964327223019436e-05, + "loss": 2.7227, + "step": 34344 + }, + { + "epoch": 1.5990176222734362, + "grad_norm": 0.32084190395870177, + "learning_rate": 5.2961623246770155e-05, + "loss": 2.8093, + "step": 34345 + }, + { + "epoch": 1.5990641804595294, + "grad_norm": 0.3919841536351808, + "learning_rate": 5.295891926182881e-05, + "loss": 2.7995, + "step": 34346 + }, + { + "epoch": 1.5991107386456225, + "grad_norm": 0.3530812138314287, + "learning_rate": 5.295621526820333e-05, + "loss": 2.6868, + "step": 34347 + }, + { + "epoch": 1.5991572968317156, + "grad_norm": 0.3346947127132004, + "learning_rate": 5.295351126590164e-05, + "loss": 2.7532, + "step": 34348 + }, + { + "epoch": 1.5992038550178085, + "grad_norm": 0.3663241489817768, + "learning_rate": 5.2950807254931676e-05, + "loss": 2.8137, + "step": 34349 + }, + { + "epoch": 1.5992504132039016, + "grad_norm": 0.3207925568583628, + "learning_rate": 5.2948103235301406e-05, + "loss": 2.6562, + "step": 34350 + }, + { + "epoch": 1.5992969713899945, + "grad_norm": 
0.3466922759465708, + "learning_rate": 5.294539920701873e-05, + "loss": 2.7529, + "step": 34351 + }, + { + "epoch": 1.5993435295760876, + "grad_norm": 0.3418288009770333, + "learning_rate": 5.294269517009162e-05, + "loss": 2.7515, + "step": 34352 + }, + { + "epoch": 1.5993900877621807, + "grad_norm": 0.3243586329403978, + "learning_rate": 5.293999112452795e-05, + "loss": 2.7844, + "step": 34353 + }, + { + "epoch": 1.5994366459482738, + "grad_norm": 0.3752276535160341, + "learning_rate": 5.2937287070335715e-05, + "loss": 2.6935, + "step": 34354 + }, + { + "epoch": 1.599483204134367, + "grad_norm": 0.3264740566655288, + "learning_rate": 5.2934583007522845e-05, + "loss": 2.7421, + "step": 34355 + }, + { + "epoch": 1.59952976232046, + "grad_norm": 0.36495234321630343, + "learning_rate": 5.2931878936097256e-05, + "loss": 2.6798, + "step": 34356 + }, + { + "epoch": 1.5995763205065532, + "grad_norm": 0.34923165358440955, + "learning_rate": 5.2929174856066876e-05, + "loss": 2.7017, + "step": 34357 + }, + { + "epoch": 1.5996228786926463, + "grad_norm": 0.3532822198156418, + "learning_rate": 5.292647076743966e-05, + "loss": 2.8217, + "step": 34358 + }, + { + "epoch": 1.5996694368787392, + "grad_norm": 0.40980441594590344, + "learning_rate": 5.292376667022355e-05, + "loss": 2.6819, + "step": 34359 + }, + { + "epoch": 1.5997159950648323, + "grad_norm": 0.33928124354096273, + "learning_rate": 5.292106256442648e-05, + "loss": 2.7363, + "step": 34360 + }, + { + "epoch": 1.5997625532509252, + "grad_norm": 0.35381600495858523, + "learning_rate": 5.291835845005637e-05, + "loss": 2.5901, + "step": 34361 + }, + { + "epoch": 1.5998091114370183, + "grad_norm": 0.39434950911934497, + "learning_rate": 5.2915654327121156e-05, + "loss": 2.7278, + "step": 34362 + }, + { + "epoch": 1.5998556696231114, + "grad_norm": 0.32671326861903516, + "learning_rate": 5.291295019562881e-05, + "loss": 2.7514, + "step": 34363 + }, + { + "epoch": 1.5999022278092045, + "grad_norm": 0.3773171612666682, + "learning_rate": 5.291024605558721e-05, + "loss": 2.6977, + "step": 34364 + }, + { + "epoch": 1.5999487859952977, + "grad_norm": 0.3530604262446818, + "learning_rate": 5.290754190700436e-05, + "loss": 2.742, + "step": 34365 + }, + { + "epoch": 1.5999953441813908, + "grad_norm": 0.35828240807882555, + "learning_rate": 5.2904837749888124e-05, + "loss": 2.8226, + "step": 34366 + }, + { + "epoch": 1.600041902367484, + "grad_norm": 0.3786962897089551, + "learning_rate": 5.290213358424649e-05, + "loss": 2.6879, + "step": 34367 + }, + { + "epoch": 1.6000884605535768, + "grad_norm": 0.378785505074336, + "learning_rate": 5.2899429410087386e-05, + "loss": 2.6863, + "step": 34368 + }, + { + "epoch": 1.60013501873967, + "grad_norm": 0.3433888692398908, + "learning_rate": 5.2896725227418744e-05, + "loss": 2.6688, + "step": 34369 + }, + { + "epoch": 1.600181576925763, + "grad_norm": 0.3540508825211572, + "learning_rate": 5.2894021036248487e-05, + "loss": 2.7951, + "step": 34370 + }, + { + "epoch": 1.600228135111856, + "grad_norm": 0.36896901744185784, + "learning_rate": 5.289131683658457e-05, + "loss": 2.8051, + "step": 34371 + }, + { + "epoch": 1.600274693297949, + "grad_norm": 0.32154284104078584, + "learning_rate": 5.288861262843492e-05, + "loss": 2.6488, + "step": 34372 + }, + { + "epoch": 1.6003212514840421, + "grad_norm": 0.35141955215954274, + "learning_rate": 5.288590841180747e-05, + "loss": 2.6145, + "step": 34373 + }, + { + "epoch": 1.6003678096701353, + "grad_norm": 0.3198966137752828, + "learning_rate": 5.288320418671018e-05, + "loss": 
2.6491, + "step": 34374 + }, + { + "epoch": 1.6004143678562284, + "grad_norm": 0.3381373738036083, + "learning_rate": 5.288049995315095e-05, + "loss": 2.6999, + "step": 34375 + }, + { + "epoch": 1.6004609260423215, + "grad_norm": 0.36897488933036443, + "learning_rate": 5.287779571113777e-05, + "loss": 2.6883, + "step": 34376 + }, + { + "epoch": 1.6005074842284146, + "grad_norm": 0.32884854227819, + "learning_rate": 5.287509146067852e-05, + "loss": 2.621, + "step": 34377 + }, + { + "epoch": 1.6005540424145075, + "grad_norm": 0.35611526442743263, + "learning_rate": 5.287238720178116e-05, + "loss": 2.6807, + "step": 34378 + }, + { + "epoch": 1.6006006006006006, + "grad_norm": 0.38111965075336657, + "learning_rate": 5.286968293445362e-05, + "loss": 2.7264, + "step": 34379 + }, + { + "epoch": 1.6006471587866937, + "grad_norm": 0.34799249963469503, + "learning_rate": 5.2866978658703845e-05, + "loss": 2.7915, + "step": 34380 + }, + { + "epoch": 1.6006937169727866, + "grad_norm": 0.3808763923256011, + "learning_rate": 5.286427437453978e-05, + "loss": 2.7058, + "step": 34381 + }, + { + "epoch": 1.6007402751588797, + "grad_norm": 0.3857754435235164, + "learning_rate": 5.286157008196935e-05, + "loss": 2.7377, + "step": 34382 + }, + { + "epoch": 1.6007868333449728, + "grad_norm": 0.3220455559752624, + "learning_rate": 5.285886578100049e-05, + "loss": 2.7614, + "step": 34383 + }, + { + "epoch": 1.600833391531066, + "grad_norm": 0.3806624019384452, + "learning_rate": 5.285616147164114e-05, + "loss": 2.6744, + "step": 34384 + }, + { + "epoch": 1.600879949717159, + "grad_norm": 0.34885181565988255, + "learning_rate": 5.285345715389923e-05, + "loss": 2.8122, + "step": 34385 + }, + { + "epoch": 1.6009265079032522, + "grad_norm": 0.3325974967482454, + "learning_rate": 5.2850752827782715e-05, + "loss": 2.6927, + "step": 34386 + }, + { + "epoch": 1.6009730660893453, + "grad_norm": 0.42042595241388686, + "learning_rate": 5.284804849329953e-05, + "loss": 2.7744, + "step": 34387 + }, + { + "epoch": 1.6010196242754382, + "grad_norm": 0.3722721857616147, + "learning_rate": 5.284534415045759e-05, + "loss": 2.7286, + "step": 34388 + }, + { + "epoch": 1.6010661824615313, + "grad_norm": 0.39597783519051705, + "learning_rate": 5.284263979926486e-05, + "loss": 2.72, + "step": 34389 + }, + { + "epoch": 1.6011127406476242, + "grad_norm": 0.3686446912250058, + "learning_rate": 5.283993543972925e-05, + "loss": 2.71, + "step": 34390 + }, + { + "epoch": 1.6011592988337173, + "grad_norm": 0.37978896035229986, + "learning_rate": 5.2837231071858705e-05, + "loss": 2.6773, + "step": 34391 + }, + { + "epoch": 1.6012058570198104, + "grad_norm": 0.3709510263627962, + "learning_rate": 5.283452669566118e-05, + "loss": 2.7072, + "step": 34392 + }, + { + "epoch": 1.6012524152059036, + "grad_norm": 0.35971006880682505, + "learning_rate": 5.283182231114459e-05, + "loss": 2.6774, + "step": 34393 + }, + { + "epoch": 1.6012989733919967, + "grad_norm": 0.36167721011427484, + "learning_rate": 5.2829117918316874e-05, + "loss": 2.7297, + "step": 34394 + }, + { + "epoch": 1.6013455315780898, + "grad_norm": 0.37074062481427583, + "learning_rate": 5.2826413517185966e-05, + "loss": 2.6507, + "step": 34395 + }, + { + "epoch": 1.601392089764183, + "grad_norm": 0.3431909639699145, + "learning_rate": 5.282370910775984e-05, + "loss": 2.7574, + "step": 34396 + }, + { + "epoch": 1.601438647950276, + "grad_norm": 0.37659921206443636, + "learning_rate": 5.2821004690046394e-05, + "loss": 2.7502, + "step": 34397 + }, + { + "epoch": 1.601485206136369, + 
"grad_norm": 0.36637389320883573, + "learning_rate": 5.281830026405358e-05, + "loss": 2.7267, + "step": 34398 + }, + { + "epoch": 1.601531764322462, + "grad_norm": 0.3497717581815499, + "learning_rate": 5.2815595829789324e-05, + "loss": 2.7052, + "step": 34399 + }, + { + "epoch": 1.601578322508555, + "grad_norm": 0.3375464642596151, + "learning_rate": 5.2812891387261586e-05, + "loss": 2.7592, + "step": 34400 + }, + { + "epoch": 1.601624880694648, + "grad_norm": 0.37630517264986046, + "learning_rate": 5.281018693647828e-05, + "loss": 2.7086, + "step": 34401 + }, + { + "epoch": 1.6016714388807411, + "grad_norm": 0.3624438780268858, + "learning_rate": 5.2807482477447356e-05, + "loss": 2.6646, + "step": 34402 + }, + { + "epoch": 1.6017179970668343, + "grad_norm": 0.362313348270089, + "learning_rate": 5.2804778010176734e-05, + "loss": 2.694, + "step": 34403 + }, + { + "epoch": 1.6017645552529274, + "grad_norm": 0.35733644090717065, + "learning_rate": 5.2802073534674376e-05, + "loss": 2.6832, + "step": 34404 + }, + { + "epoch": 1.6018111134390205, + "grad_norm": 0.36984029104602617, + "learning_rate": 5.279936905094822e-05, + "loss": 2.7538, + "step": 34405 + }, + { + "epoch": 1.6018576716251136, + "grad_norm": 0.3694441247748218, + "learning_rate": 5.2796664559006174e-05, + "loss": 2.8793, + "step": 34406 + }, + { + "epoch": 1.6019042298112065, + "grad_norm": 0.3792603367379591, + "learning_rate": 5.279396005885619e-05, + "loss": 2.7288, + "step": 34407 + }, + { + "epoch": 1.6019507879972996, + "grad_norm": 0.3219610637752026, + "learning_rate": 5.2791255550506215e-05, + "loss": 2.7272, + "step": 34408 + }, + { + "epoch": 1.6019973461833927, + "grad_norm": 0.4135854476219877, + "learning_rate": 5.278855103396418e-05, + "loss": 2.7696, + "step": 34409 + }, + { + "epoch": 1.6020439043694856, + "grad_norm": 0.31025807351097684, + "learning_rate": 5.2785846509238035e-05, + "loss": 2.6551, + "step": 34410 + }, + { + "epoch": 1.6020904625555787, + "grad_norm": 0.42261334254758137, + "learning_rate": 5.278314197633569e-05, + "loss": 2.7271, + "step": 34411 + }, + { + "epoch": 1.6021370207416719, + "grad_norm": 0.3502262801190602, + "learning_rate": 5.278043743526509e-05, + "loss": 2.6814, + "step": 34412 + }, + { + "epoch": 1.602183578927765, + "grad_norm": 0.38993692599323104, + "learning_rate": 5.27777328860342e-05, + "loss": 2.7126, + "step": 34413 + }, + { + "epoch": 1.602230137113858, + "grad_norm": 0.3327694987091991, + "learning_rate": 5.277502832865093e-05, + "loss": 2.6568, + "step": 34414 + }, + { + "epoch": 1.6022766952999512, + "grad_norm": 0.35182789829166783, + "learning_rate": 5.2772323763123233e-05, + "loss": 2.7282, + "step": 34415 + }, + { + "epoch": 1.6023232534860443, + "grad_norm": 0.36611284256684273, + "learning_rate": 5.276961918945903e-05, + "loss": 2.7061, + "step": 34416 + }, + { + "epoch": 1.6023698116721372, + "grad_norm": 0.3642679616779878, + "learning_rate": 5.2766914607666264e-05, + "loss": 2.6407, + "step": 34417 + }, + { + "epoch": 1.6024163698582303, + "grad_norm": 0.35190605555745835, + "learning_rate": 5.276421001775288e-05, + "loss": 2.7169, + "step": 34418 + }, + { + "epoch": 1.6024629280443234, + "grad_norm": 0.39446433714800117, + "learning_rate": 5.276150541972682e-05, + "loss": 2.7204, + "step": 34419 + }, + { + "epoch": 1.6025094862304163, + "grad_norm": 0.3358586214318883, + "learning_rate": 5.2758800813596e-05, + "loss": 2.7169, + "step": 34420 + }, + { + "epoch": 1.6025560444165095, + "grad_norm": 0.37239220644382015, + "learning_rate": 
5.2756096199368364e-05, + "loss": 2.744, + "step": 34421 + }, + { + "epoch": 1.6026026026026026, + "grad_norm": 0.3813825713948235, + "learning_rate": 5.275339157705187e-05, + "loss": 2.7594, + "step": 34422 + }, + { + "epoch": 1.6026491607886957, + "grad_norm": 0.3506280262717097, + "learning_rate": 5.275068694665446e-05, + "loss": 2.786, + "step": 34423 + }, + { + "epoch": 1.6026957189747888, + "grad_norm": 0.40770971456010635, + "learning_rate": 5.274798230818403e-05, + "loss": 2.6477, + "step": 34424 + }, + { + "epoch": 1.602742277160882, + "grad_norm": 0.33267530983277316, + "learning_rate": 5.2745277661648553e-05, + "loss": 2.7942, + "step": 34425 + }, + { + "epoch": 1.602788835346975, + "grad_norm": 0.40126529114483916, + "learning_rate": 5.274257300705595e-05, + "loss": 2.581, + "step": 34426 + }, + { + "epoch": 1.602835393533068, + "grad_norm": 0.33457946187739357, + "learning_rate": 5.2739868344414166e-05, + "loss": 2.7296, + "step": 34427 + }, + { + "epoch": 1.602881951719161, + "grad_norm": 0.36933664386936654, + "learning_rate": 5.273716367373115e-05, + "loss": 2.6251, + "step": 34428 + }, + { + "epoch": 1.602928509905254, + "grad_norm": 0.3387226865293293, + "learning_rate": 5.273445899501482e-05, + "loss": 2.7667, + "step": 34429 + }, + { + "epoch": 1.602975068091347, + "grad_norm": 0.3602012790561009, + "learning_rate": 5.273175430827312e-05, + "loss": 2.7531, + "step": 34430 + }, + { + "epoch": 1.6030216262774402, + "grad_norm": 0.353119218192465, + "learning_rate": 5.272904961351399e-05, + "loss": 2.8357, + "step": 34431 + }, + { + "epoch": 1.6030681844635333, + "grad_norm": 0.37104587315498255, + "learning_rate": 5.2726344910745374e-05, + "loss": 2.7769, + "step": 34432 + }, + { + "epoch": 1.6031147426496264, + "grad_norm": 0.36117127113514497, + "learning_rate": 5.27236401999752e-05, + "loss": 2.7722, + "step": 34433 + }, + { + "epoch": 1.6031613008357195, + "grad_norm": 0.33565444662951605, + "learning_rate": 5.2720935481211406e-05, + "loss": 2.716, + "step": 34434 + }, + { + "epoch": 1.6032078590218126, + "grad_norm": 0.3681145937609183, + "learning_rate": 5.271823075446194e-05, + "loss": 2.6971, + "step": 34435 + }, + { + "epoch": 1.6032544172079057, + "grad_norm": 0.3635546434491315, + "learning_rate": 5.271552601973473e-05, + "loss": 2.7304, + "step": 34436 + }, + { + "epoch": 1.6033009753939986, + "grad_norm": 0.3063654186768704, + "learning_rate": 5.271282127703773e-05, + "loss": 2.579, + "step": 34437 + }, + { + "epoch": 1.6033475335800917, + "grad_norm": 0.3453454094738937, + "learning_rate": 5.271011652637886e-05, + "loss": 2.7494, + "step": 34438 + }, + { + "epoch": 1.6033940917661846, + "grad_norm": 0.3449996421879488, + "learning_rate": 5.270741176776606e-05, + "loss": 2.679, + "step": 34439 + }, + { + "epoch": 1.6034406499522778, + "grad_norm": 0.35349235488155073, + "learning_rate": 5.270470700120729e-05, + "loss": 2.7152, + "step": 34440 + }, + { + "epoch": 1.6034872081383709, + "grad_norm": 0.3370454334702509, + "learning_rate": 5.2702002226710466e-05, + "loss": 2.7232, + "step": 34441 + }, + { + "epoch": 1.603533766324464, + "grad_norm": 0.31700408520405216, + "learning_rate": 5.2699297444283516e-05, + "loss": 2.7774, + "step": 34442 + }, + { + "epoch": 1.603580324510557, + "grad_norm": 0.38459768733533434, + "learning_rate": 5.2696592653934403e-05, + "loss": 2.7012, + "step": 34443 + }, + { + "epoch": 1.6036268826966502, + "grad_norm": 0.3425904380839207, + "learning_rate": 5.2693887855671055e-05, + "loss": 2.7452, + "step": 34444 + }, + { + "epoch": 
1.6036734408827433, + "grad_norm": 0.3713854031129902, + "learning_rate": 5.26911830495014e-05, + "loss": 2.6856, + "step": 34445 + }, + { + "epoch": 1.6037199990688364, + "grad_norm": 0.32317341275622785, + "learning_rate": 5.268847823543341e-05, + "loss": 2.7604, + "step": 34446 + }, + { + "epoch": 1.6037665572549293, + "grad_norm": 0.3489880657242495, + "learning_rate": 5.2685773413474995e-05, + "loss": 2.8161, + "step": 34447 + }, + { + "epoch": 1.6038131154410225, + "grad_norm": 0.3634005697041323, + "learning_rate": 5.268306858363409e-05, + "loss": 2.6513, + "step": 34448 + }, + { + "epoch": 1.6038596736271153, + "grad_norm": 0.3747259342837427, + "learning_rate": 5.268036374591865e-05, + "loss": 2.7437, + "step": 34449 + }, + { + "epoch": 1.6039062318132085, + "grad_norm": 0.31383994329251885, + "learning_rate": 5.267765890033661e-05, + "loss": 2.8437, + "step": 34450 + }, + { + "epoch": 1.6039527899993016, + "grad_norm": 0.34800950944166353, + "learning_rate": 5.26749540468959e-05, + "loss": 2.7312, + "step": 34451 + }, + { + "epoch": 1.6039993481853947, + "grad_norm": 0.392761300627461, + "learning_rate": 5.267224918560446e-05, + "loss": 2.7958, + "step": 34452 + }, + { + "epoch": 1.6040459063714878, + "grad_norm": 0.3395346020367483, + "learning_rate": 5.266954431647024e-05, + "loss": 2.6977, + "step": 34453 + }, + { + "epoch": 1.604092464557581, + "grad_norm": 0.36642899311788996, + "learning_rate": 5.266683943950117e-05, + "loss": 2.6478, + "step": 34454 + }, + { + "epoch": 1.604139022743674, + "grad_norm": 0.3590082396890868, + "learning_rate": 5.266413455470519e-05, + "loss": 2.7509, + "step": 34455 + }, + { + "epoch": 1.604185580929767, + "grad_norm": 0.34103262232015996, + "learning_rate": 5.266142966209023e-05, + "loss": 2.5881, + "step": 34456 + }, + { + "epoch": 1.60423213911586, + "grad_norm": 0.3382425081224164, + "learning_rate": 5.2658724761664234e-05, + "loss": 2.7433, + "step": 34457 + }, + { + "epoch": 1.6042786973019532, + "grad_norm": 0.33455059553840555, + "learning_rate": 5.2656019853435145e-05, + "loss": 2.7506, + "step": 34458 + }, + { + "epoch": 1.604325255488046, + "grad_norm": 0.37866336578826365, + "learning_rate": 5.265331493741091e-05, + "loss": 2.7721, + "step": 34459 + }, + { + "epoch": 1.6043718136741392, + "grad_norm": 0.35006857752713744, + "learning_rate": 5.265061001359945e-05, + "loss": 2.6036, + "step": 34460 + }, + { + "epoch": 1.6044183718602323, + "grad_norm": 0.3289679265347293, + "learning_rate": 5.264790508200871e-05, + "loss": 2.6711, + "step": 34461 + }, + { + "epoch": 1.6044649300463254, + "grad_norm": 0.3341159162376538, + "learning_rate": 5.2645200142646625e-05, + "loss": 2.7266, + "step": 34462 + }, + { + "epoch": 1.6045114882324185, + "grad_norm": 0.3489445240155388, + "learning_rate": 5.264249519552113e-05, + "loss": 2.6934, + "step": 34463 + }, + { + "epoch": 1.6045580464185116, + "grad_norm": 0.3150126758205691, + "learning_rate": 5.26397902406402e-05, + "loss": 2.7765, + "step": 34464 + }, + { + "epoch": 1.6046046046046047, + "grad_norm": 0.3506547506427427, + "learning_rate": 5.263708527801172e-05, + "loss": 2.8049, + "step": 34465 + }, + { + "epoch": 1.6046511627906976, + "grad_norm": 0.33889664755954374, + "learning_rate": 5.2634380307643685e-05, + "loss": 2.7864, + "step": 34466 + }, + { + "epoch": 1.6046977209767908, + "grad_norm": 0.3572275096813242, + "learning_rate": 5.263167532954397e-05, + "loss": 2.7427, + "step": 34467 + }, + { + "epoch": 1.6047442791628839, + "grad_norm": 0.3471614669151578, + "learning_rate": 
5.2628970343720574e-05, + "loss": 2.7444, + "step": 34468 + }, + { + "epoch": 1.6047908373489768, + "grad_norm": 0.357909782489036, + "learning_rate": 5.262626535018139e-05, + "loss": 2.7842, + "step": 34469 + }, + { + "epoch": 1.6048373955350699, + "grad_norm": 0.33329996682203633, + "learning_rate": 5.2623560348934375e-05, + "loss": 2.6742, + "step": 34470 + }, + { + "epoch": 1.604883953721163, + "grad_norm": 0.3347487543731992, + "learning_rate": 5.262085533998747e-05, + "loss": 2.7417, + "step": 34471 + }, + { + "epoch": 1.604930511907256, + "grad_norm": 0.3509532496809419, + "learning_rate": 5.2618150323348614e-05, + "loss": 2.8405, + "step": 34472 + }, + { + "epoch": 1.6049770700933492, + "grad_norm": 0.3383120071454611, + "learning_rate": 5.261544529902576e-05, + "loss": 2.5772, + "step": 34473 + }, + { + "epoch": 1.6050236282794423, + "grad_norm": 0.34184682435760283, + "learning_rate": 5.261274026702682e-05, + "loss": 2.7572, + "step": 34474 + }, + { + "epoch": 1.6050701864655355, + "grad_norm": 0.342632843781228, + "learning_rate": 5.2610035227359745e-05, + "loss": 2.6385, + "step": 34475 + }, + { + "epoch": 1.6051167446516283, + "grad_norm": 0.3562521365948033, + "learning_rate": 5.2607330180032464e-05, + "loss": 2.6363, + "step": 34476 + }, + { + "epoch": 1.6051633028377215, + "grad_norm": 0.34212442847076047, + "learning_rate": 5.2604625125052945e-05, + "loss": 2.7576, + "step": 34477 + }, + { + "epoch": 1.6052098610238144, + "grad_norm": 0.37310050161729463, + "learning_rate": 5.26019200624291e-05, + "loss": 2.709, + "step": 34478 + }, + { + "epoch": 1.6052564192099075, + "grad_norm": 0.34794655344361913, + "learning_rate": 5.259921499216888e-05, + "loss": 2.7479, + "step": 34479 + }, + { + "epoch": 1.6053029773960006, + "grad_norm": 0.37878550426149993, + "learning_rate": 5.259650991428021e-05, + "loss": 2.8223, + "step": 34480 + }, + { + "epoch": 1.6053495355820937, + "grad_norm": 0.3416026648109937, + "learning_rate": 5.259380482877104e-05, + "loss": 2.8156, + "step": 34481 + }, + { + "epoch": 1.6053960937681868, + "grad_norm": 0.35171731259910477, + "learning_rate": 5.259109973564932e-05, + "loss": 2.7185, + "step": 34482 + }, + { + "epoch": 1.60544265195428, + "grad_norm": 0.38124787775757046, + "learning_rate": 5.2588394634922965e-05, + "loss": 2.741, + "step": 34483 + }, + { + "epoch": 1.605489210140373, + "grad_norm": 0.33065521537448467, + "learning_rate": 5.2585689526599926e-05, + "loss": 2.663, + "step": 34484 + }, + { + "epoch": 1.6055357683264662, + "grad_norm": 0.3436219135749133, + "learning_rate": 5.258298441068815e-05, + "loss": 2.7733, + "step": 34485 + }, + { + "epoch": 1.605582326512559, + "grad_norm": 0.35029837508155753, + "learning_rate": 5.258027928719558e-05, + "loss": 2.7015, + "step": 34486 + }, + { + "epoch": 1.6056288846986522, + "grad_norm": 0.36857450717982576, + "learning_rate": 5.2577574156130126e-05, + "loss": 2.7347, + "step": 34487 + }, + { + "epoch": 1.605675442884745, + "grad_norm": 0.3133820749207479, + "learning_rate": 5.257486901749974e-05, + "loss": 2.6771, + "step": 34488 + }, + { + "epoch": 1.6057220010708382, + "grad_norm": 0.36320516341943104, + "learning_rate": 5.2572163871312384e-05, + "loss": 2.7633, + "step": 34489 + }, + { + "epoch": 1.6057685592569313, + "grad_norm": 0.36494371763742717, + "learning_rate": 5.256945871757598e-05, + "loss": 2.6899, + "step": 34490 + }, + { + "epoch": 1.6058151174430244, + "grad_norm": 0.37204290279728475, + "learning_rate": 5.2566753556298454e-05, + "loss": 2.6636, + "step": 34491 + }, + { + 
"epoch": 1.6058616756291175, + "grad_norm": 0.3369527606336971, + "learning_rate": 5.256404838748778e-05, + "loss": 2.8134, + "step": 34492 + }, + { + "epoch": 1.6059082338152106, + "grad_norm": 0.3490524325028894, + "learning_rate": 5.256134321115186e-05, + "loss": 2.6622, + "step": 34493 + }, + { + "epoch": 1.6059547920013038, + "grad_norm": 0.3865877282306478, + "learning_rate": 5.255863802729866e-05, + "loss": 2.6545, + "step": 34494 + }, + { + "epoch": 1.6060013501873966, + "grad_norm": 0.341968796797192, + "learning_rate": 5.255593283593611e-05, + "loss": 2.7366, + "step": 34495 + }, + { + "epoch": 1.6060479083734898, + "grad_norm": 0.3977344106384986, + "learning_rate": 5.255322763707213e-05, + "loss": 2.7134, + "step": 34496 + }, + { + "epoch": 1.6060944665595829, + "grad_norm": 0.3109025932417557, + "learning_rate": 5.25505224307147e-05, + "loss": 2.7076, + "step": 34497 + }, + { + "epoch": 1.6061410247456758, + "grad_norm": 0.3778446944416335, + "learning_rate": 5.254781721687172e-05, + "loss": 2.8131, + "step": 34498 + }, + { + "epoch": 1.6061875829317689, + "grad_norm": 0.3642436961854422, + "learning_rate": 5.254511199555116e-05, + "loss": 2.6732, + "step": 34499 + }, + { + "epoch": 1.606234141117862, + "grad_norm": 0.3339747177273876, + "learning_rate": 5.2542406766760946e-05, + "loss": 2.8099, + "step": 34500 + }, + { + "epoch": 1.6062806993039551, + "grad_norm": 0.36937318462732244, + "learning_rate": 5.253970153050901e-05, + "loss": 2.7515, + "step": 34501 + }, + { + "epoch": 1.6063272574900482, + "grad_norm": 0.3477182422303274, + "learning_rate": 5.253699628680331e-05, + "loss": 2.7362, + "step": 34502 + }, + { + "epoch": 1.6063738156761413, + "grad_norm": 0.34187514025311067, + "learning_rate": 5.2534291035651784e-05, + "loss": 2.7313, + "step": 34503 + }, + { + "epoch": 1.6064203738622345, + "grad_norm": 0.3549293424679494, + "learning_rate": 5.253158577706234e-05, + "loss": 2.7287, + "step": 34504 + }, + { + "epoch": 1.6064669320483274, + "grad_norm": 0.37929870985710634, + "learning_rate": 5.2528880511042966e-05, + "loss": 2.7921, + "step": 34505 + }, + { + "epoch": 1.6065134902344205, + "grad_norm": 0.36246232237505593, + "learning_rate": 5.252617523760156e-05, + "loss": 2.7197, + "step": 34506 + }, + { + "epoch": 1.6065600484205136, + "grad_norm": 0.35264086788178045, + "learning_rate": 5.2523469956746084e-05, + "loss": 2.7766, + "step": 34507 + }, + { + "epoch": 1.6066066066066065, + "grad_norm": 0.3575459691461444, + "learning_rate": 5.252076466848447e-05, + "loss": 2.6792, + "step": 34508 + }, + { + "epoch": 1.6066531647926996, + "grad_norm": 0.3509289620266342, + "learning_rate": 5.251805937282467e-05, + "loss": 2.6753, + "step": 34509 + }, + { + "epoch": 1.6066997229787927, + "grad_norm": 0.37095965605998804, + "learning_rate": 5.251535406977459e-05, + "loss": 2.7107, + "step": 34510 + }, + { + "epoch": 1.6067462811648858, + "grad_norm": 0.3903489648509479, + "learning_rate": 5.251264875934221e-05, + "loss": 2.6504, + "step": 34511 + }, + { + "epoch": 1.606792839350979, + "grad_norm": 0.3731951615503718, + "learning_rate": 5.250994344153545e-05, + "loss": 2.6668, + "step": 34512 + }, + { + "epoch": 1.606839397537072, + "grad_norm": 0.3754404505621374, + "learning_rate": 5.2507238116362264e-05, + "loss": 2.7168, + "step": 34513 + }, + { + "epoch": 1.6068859557231652, + "grad_norm": 0.34995970963808054, + "learning_rate": 5.250453278383056e-05, + "loss": 2.6806, + "step": 34514 + }, + { + "epoch": 1.606932513909258, + "grad_norm": 0.3851855034561205, + 
"learning_rate": 5.250182744394832e-05, + "loss": 2.7587, + "step": 34515 + }, + { + "epoch": 1.6069790720953512, + "grad_norm": 0.351079837527861, + "learning_rate": 5.249912209672345e-05, + "loss": 2.6737, + "step": 34516 + }, + { + "epoch": 1.607025630281444, + "grad_norm": 0.36153982584521077, + "learning_rate": 5.249641674216391e-05, + "loss": 2.6683, + "step": 34517 + }, + { + "epoch": 1.6070721884675372, + "grad_norm": 0.3520873278139342, + "learning_rate": 5.249371138027764e-05, + "loss": 2.7496, + "step": 34518 + }, + { + "epoch": 1.6071187466536303, + "grad_norm": 0.3495508676480687, + "learning_rate": 5.249100601107256e-05, + "loss": 2.7611, + "step": 34519 + }, + { + "epoch": 1.6071653048397234, + "grad_norm": 0.33299241506505906, + "learning_rate": 5.248830063455662e-05, + "loss": 2.6937, + "step": 34520 + }, + { + "epoch": 1.6072118630258165, + "grad_norm": 0.3636008964831884, + "learning_rate": 5.248559525073776e-05, + "loss": 2.7411, + "step": 34521 + }, + { + "epoch": 1.6072584212119096, + "grad_norm": 0.38367347301350896, + "learning_rate": 5.248288985962394e-05, + "loss": 2.79, + "step": 34522 + }, + { + "epoch": 1.6073049793980028, + "grad_norm": 0.36295498979700186, + "learning_rate": 5.248018446122308e-05, + "loss": 2.6969, + "step": 34523 + }, + { + "epoch": 1.6073515375840959, + "grad_norm": 0.34221996164306373, + "learning_rate": 5.247747905554311e-05, + "loss": 2.7477, + "step": 34524 + }, + { + "epoch": 1.6073980957701888, + "grad_norm": 0.37418721462272325, + "learning_rate": 5.247477364259199e-05, + "loss": 2.6696, + "step": 34525 + }, + { + "epoch": 1.6074446539562819, + "grad_norm": 0.3334825265616686, + "learning_rate": 5.2472068222377646e-05, + "loss": 2.6936, + "step": 34526 + }, + { + "epoch": 1.6074912121423748, + "grad_norm": 0.3657209614804139, + "learning_rate": 5.2469362794908036e-05, + "loss": 2.7644, + "step": 34527 + }, + { + "epoch": 1.607537770328468, + "grad_norm": 0.3971996551650686, + "learning_rate": 5.2466657360191084e-05, + "loss": 2.7804, + "step": 34528 + }, + { + "epoch": 1.607584328514561, + "grad_norm": 0.37060102015772683, + "learning_rate": 5.2463951918234744e-05, + "loss": 2.6637, + "step": 34529 + }, + { + "epoch": 1.6076308867006541, + "grad_norm": 0.3427971717664297, + "learning_rate": 5.2461246469046945e-05, + "loss": 2.7294, + "step": 34530 + }, + { + "epoch": 1.6076774448867472, + "grad_norm": 0.3709151949373101, + "learning_rate": 5.245854101263563e-05, + "loss": 2.6681, + "step": 34531 + }, + { + "epoch": 1.6077240030728404, + "grad_norm": 0.36284513380878436, + "learning_rate": 5.245583554900874e-05, + "loss": 2.6936, + "step": 34532 + }, + { + "epoch": 1.6077705612589335, + "grad_norm": 0.3411753887929887, + "learning_rate": 5.2453130078174205e-05, + "loss": 2.8638, + "step": 34533 + }, + { + "epoch": 1.6078171194450266, + "grad_norm": 0.36176089844139525, + "learning_rate": 5.245042460013997e-05, + "loss": 2.7181, + "step": 34534 + }, + { + "epoch": 1.6078636776311195, + "grad_norm": 0.3508294906161481, + "learning_rate": 5.244771911491399e-05, + "loss": 2.7975, + "step": 34535 + }, + { + "epoch": 1.6079102358172126, + "grad_norm": 0.3307138303671638, + "learning_rate": 5.2445013622504204e-05, + "loss": 2.7733, + "step": 34536 + }, + { + "epoch": 1.6079567940033055, + "grad_norm": 0.3545096361810919, + "learning_rate": 5.244230812291854e-05, + "loss": 2.7883, + "step": 34537 + }, + { + "epoch": 1.6080033521893986, + "grad_norm": 0.3226241124995642, + "learning_rate": 5.2439602616164926e-05, + "loss": 2.8155, + "step": 
34538 + }, + { + "epoch": 1.6080499103754917, + "grad_norm": 0.35620280954634864, + "learning_rate": 5.243689710225133e-05, + "loss": 2.7058, + "step": 34539 + }, + { + "epoch": 1.6080964685615848, + "grad_norm": 0.340476382428518, + "learning_rate": 5.243419158118569e-05, + "loss": 2.7257, + "step": 34540 + }, + { + "epoch": 1.608143026747678, + "grad_norm": 0.36164623900516146, + "learning_rate": 5.243148605297593e-05, + "loss": 2.8251, + "step": 34541 + }, + { + "epoch": 1.608189584933771, + "grad_norm": 0.34627014549438145, + "learning_rate": 5.242878051763001e-05, + "loss": 2.779, + "step": 34542 + }, + { + "epoch": 1.6082361431198642, + "grad_norm": 0.35047491447978446, + "learning_rate": 5.2426074975155833e-05, + "loss": 2.7933, + "step": 34543 + }, + { + "epoch": 1.608282701305957, + "grad_norm": 0.3658609520309645, + "learning_rate": 5.242336942556137e-05, + "loss": 2.7315, + "step": 34544 + }, + { + "epoch": 1.6083292594920502, + "grad_norm": 0.38117909574298575, + "learning_rate": 5.242066386885458e-05, + "loss": 2.7126, + "step": 34545 + }, + { + "epoch": 1.6083758176781433, + "grad_norm": 0.37841709190504014, + "learning_rate": 5.2417958305043355e-05, + "loss": 2.7641, + "step": 34546 + }, + { + "epoch": 1.6084223758642362, + "grad_norm": 0.33964078351967303, + "learning_rate": 5.241525273413567e-05, + "loss": 2.6952, + "step": 34547 + }, + { + "epoch": 1.6084689340503293, + "grad_norm": 0.3893274748408032, + "learning_rate": 5.241254715613946e-05, + "loss": 2.7399, + "step": 34548 + }, + { + "epoch": 1.6085154922364224, + "grad_norm": 0.3279012997623337, + "learning_rate": 5.2409841571062654e-05, + "loss": 2.7772, + "step": 34549 + }, + { + "epoch": 1.6085620504225155, + "grad_norm": 0.3974296682523808, + "learning_rate": 5.24071359789132e-05, + "loss": 2.7205, + "step": 34550 + }, + { + "epoch": 1.6086086086086087, + "grad_norm": 0.3285646772279798, + "learning_rate": 5.2404430379699044e-05, + "loss": 2.7343, + "step": 34551 + }, + { + "epoch": 1.6086551667947018, + "grad_norm": 0.3532362790174832, + "learning_rate": 5.240172477342812e-05, + "loss": 2.6635, + "step": 34552 + }, + { + "epoch": 1.608701724980795, + "grad_norm": 0.36063080747929227, + "learning_rate": 5.239901916010837e-05, + "loss": 2.7469, + "step": 34553 + }, + { + "epoch": 1.6087482831668878, + "grad_norm": 0.3390759396051817, + "learning_rate": 5.2396313539747745e-05, + "loss": 2.6171, + "step": 34554 + }, + { + "epoch": 1.608794841352981, + "grad_norm": 0.336548759926068, + "learning_rate": 5.239360791235417e-05, + "loss": 2.7995, + "step": 34555 + }, + { + "epoch": 1.608841399539074, + "grad_norm": 0.373897673350249, + "learning_rate": 5.239090227793559e-05, + "loss": 2.5866, + "step": 34556 + }, + { + "epoch": 1.608887957725167, + "grad_norm": 0.34207205803120944, + "learning_rate": 5.2388196636499944e-05, + "loss": 2.7018, + "step": 34557 + }, + { + "epoch": 1.60893451591126, + "grad_norm": 0.3534129005423483, + "learning_rate": 5.2385490988055186e-05, + "loss": 2.756, + "step": 34558 + }, + { + "epoch": 1.6089810740973531, + "grad_norm": 0.3585926289854505, + "learning_rate": 5.238278533260923e-05, + "loss": 2.6804, + "step": 34559 + }, + { + "epoch": 1.6090276322834463, + "grad_norm": 0.3510294166221355, + "learning_rate": 5.238007967017005e-05, + "loss": 2.6217, + "step": 34560 + }, + { + "epoch": 1.6090741904695394, + "grad_norm": 0.38450244175995874, + "learning_rate": 5.237737400074556e-05, + "loss": 2.7157, + "step": 34561 + }, + { + "epoch": 1.6091207486556325, + "grad_norm": 
0.3260949013630796, + "learning_rate": 5.237466832434371e-05, + "loss": 2.6649, + "step": 34562 + }, + { + "epoch": 1.6091673068417256, + "grad_norm": 0.3835637893928635, + "learning_rate": 5.237196264097245e-05, + "loss": 2.6928, + "step": 34563 + }, + { + "epoch": 1.6092138650278185, + "grad_norm": 0.33233168687739756, + "learning_rate": 5.236925695063971e-05, + "loss": 2.6981, + "step": 34564 + }, + { + "epoch": 1.6092604232139116, + "grad_norm": 0.35988370059649544, + "learning_rate": 5.236655125335344e-05, + "loss": 2.815, + "step": 34565 + }, + { + "epoch": 1.6093069814000045, + "grad_norm": 0.3756309038891744, + "learning_rate": 5.236384554912157e-05, + "loss": 2.8347, + "step": 34566 + }, + { + "epoch": 1.6093535395860976, + "grad_norm": 0.34791944637240557, + "learning_rate": 5.2361139837952045e-05, + "loss": 2.6701, + "step": 34567 + }, + { + "epoch": 1.6094000977721907, + "grad_norm": 0.35740170367234453, + "learning_rate": 5.235843411985282e-05, + "loss": 2.5739, + "step": 34568 + }, + { + "epoch": 1.6094466559582838, + "grad_norm": 0.3295190412933784, + "learning_rate": 5.2355728394831805e-05, + "loss": 2.7508, + "step": 34569 + }, + { + "epoch": 1.609493214144377, + "grad_norm": 0.37504397738977563, + "learning_rate": 5.235302266289697e-05, + "loss": 2.7978, + "step": 34570 + }, + { + "epoch": 1.60953977233047, + "grad_norm": 0.34420436235108, + "learning_rate": 5.235031692405623e-05, + "loss": 2.6782, + "step": 34571 + }, + { + "epoch": 1.6095863305165632, + "grad_norm": 0.3595831417796784, + "learning_rate": 5.234761117831757e-05, + "loss": 2.7987, + "step": 34572 + }, + { + "epoch": 1.6096328887026563, + "grad_norm": 0.35512589433549885, + "learning_rate": 5.234490542568887e-05, + "loss": 2.7834, + "step": 34573 + }, + { + "epoch": 1.6096794468887492, + "grad_norm": 0.3690868091820078, + "learning_rate": 5.2342199666178124e-05, + "loss": 2.7772, + "step": 34574 + }, + { + "epoch": 1.6097260050748423, + "grad_norm": 0.35953097167642284, + "learning_rate": 5.2339493899793244e-05, + "loss": 2.7781, + "step": 34575 + }, + { + "epoch": 1.6097725632609352, + "grad_norm": 0.3453037603402602, + "learning_rate": 5.233678812654219e-05, + "loss": 2.6875, + "step": 34576 + }, + { + "epoch": 1.6098191214470283, + "grad_norm": 0.36140052501211206, + "learning_rate": 5.233408234643288e-05, + "loss": 2.7497, + "step": 34577 + }, + { + "epoch": 1.6098656796331214, + "grad_norm": 0.35280780049546345, + "learning_rate": 5.233137655947328e-05, + "loss": 2.7937, + "step": 34578 + }, + { + "epoch": 1.6099122378192146, + "grad_norm": 0.3403631191087765, + "learning_rate": 5.232867076567132e-05, + "loss": 2.67, + "step": 34579 + }, + { + "epoch": 1.6099587960053077, + "grad_norm": 0.34097744565871246, + "learning_rate": 5.2325964965034925e-05, + "loss": 2.8023, + "step": 34580 + }, + { + "epoch": 1.6100053541914008, + "grad_norm": 0.4012224449645406, + "learning_rate": 5.232325915757208e-05, + "loss": 2.6634, + "step": 34581 + }, + { + "epoch": 1.610051912377494, + "grad_norm": 0.3246397927535752, + "learning_rate": 5.232055334329067e-05, + "loss": 2.6599, + "step": 34582 + }, + { + "epoch": 1.6100984705635868, + "grad_norm": 0.34740401517498193, + "learning_rate": 5.231784752219867e-05, + "loss": 2.7568, + "step": 34583 + }, + { + "epoch": 1.61014502874968, + "grad_norm": 0.3218430696338186, + "learning_rate": 5.2315141694304025e-05, + "loss": 2.6592, + "step": 34584 + }, + { + "epoch": 1.610191586935773, + "grad_norm": 0.3480301616299633, + "learning_rate": 5.231243585961467e-05, + "loss": 
2.7003, + "step": 34585 + }, + { + "epoch": 1.610238145121866, + "grad_norm": 0.3463486747999081, + "learning_rate": 5.2309730018138527e-05, + "loss": 2.6735, + "step": 34586 + }, + { + "epoch": 1.610284703307959, + "grad_norm": 0.35997853462550256, + "learning_rate": 5.230702416988357e-05, + "loss": 2.692, + "step": 34587 + }, + { + "epoch": 1.6103312614940521, + "grad_norm": 0.3564760609107465, + "learning_rate": 5.2304318314857714e-05, + "loss": 2.7606, + "step": 34588 + }, + { + "epoch": 1.6103778196801453, + "grad_norm": 0.3280705010110631, + "learning_rate": 5.2301612453068905e-05, + "loss": 2.7722, + "step": 34589 + }, + { + "epoch": 1.6104243778662384, + "grad_norm": 0.346672673519313, + "learning_rate": 5.229890658452511e-05, + "loss": 2.729, + "step": 34590 + }, + { + "epoch": 1.6104709360523315, + "grad_norm": 0.36887745797690935, + "learning_rate": 5.2296200709234235e-05, + "loss": 2.6929, + "step": 34591 + }, + { + "epoch": 1.6105174942384246, + "grad_norm": 0.33152273847562236, + "learning_rate": 5.2293494827204245e-05, + "loss": 2.7132, + "step": 34592 + }, + { + "epoch": 1.6105640524245175, + "grad_norm": 0.3461438969645458, + "learning_rate": 5.229078893844308e-05, + "loss": 2.7823, + "step": 34593 + }, + { + "epoch": 1.6106106106106106, + "grad_norm": 0.3576662048762579, + "learning_rate": 5.228808304295867e-05, + "loss": 2.7016, + "step": 34594 + }, + { + "epoch": 1.6106571687967037, + "grad_norm": 0.3430844015835819, + "learning_rate": 5.228537714075895e-05, + "loss": 2.7146, + "step": 34595 + }, + { + "epoch": 1.6107037269827966, + "grad_norm": 0.35804597164509294, + "learning_rate": 5.2282671231851885e-05, + "loss": 2.7233, + "step": 34596 + }, + { + "epoch": 1.6107502851688897, + "grad_norm": 0.35634997943903485, + "learning_rate": 5.2279965316245394e-05, + "loss": 2.6813, + "step": 34597 + }, + { + "epoch": 1.6107968433549829, + "grad_norm": 0.3440712222090547, + "learning_rate": 5.227725939394743e-05, + "loss": 2.6895, + "step": 34598 + }, + { + "epoch": 1.610843401541076, + "grad_norm": 0.4073117735903934, + "learning_rate": 5.227455346496595e-05, + "loss": 2.702, + "step": 34599 + }, + { + "epoch": 1.610889959727169, + "grad_norm": 0.36434759299015, + "learning_rate": 5.227184752930887e-05, + "loss": 2.8147, + "step": 34600 + }, + { + "epoch": 1.6109365179132622, + "grad_norm": 0.37342840049764026, + "learning_rate": 5.226914158698414e-05, + "loss": 2.7584, + "step": 34601 + }, + { + "epoch": 1.6109830760993553, + "grad_norm": 0.3703490487088488, + "learning_rate": 5.22664356379997e-05, + "loss": 2.6853, + "step": 34602 + }, + { + "epoch": 1.6110296342854482, + "grad_norm": 0.3600478752436759, + "learning_rate": 5.2263729682363506e-05, + "loss": 2.6737, + "step": 34603 + }, + { + "epoch": 1.6110761924715413, + "grad_norm": 0.3601502055825766, + "learning_rate": 5.226102372008348e-05, + "loss": 2.733, + "step": 34604 + }, + { + "epoch": 1.6111227506576342, + "grad_norm": 0.38236475026331884, + "learning_rate": 5.2258317751167575e-05, + "loss": 2.6905, + "step": 34605 + }, + { + "epoch": 1.6111693088437273, + "grad_norm": 0.3583444549741824, + "learning_rate": 5.225561177562374e-05, + "loss": 2.7264, + "step": 34606 + }, + { + "epoch": 1.6112158670298204, + "grad_norm": 0.34683145012199573, + "learning_rate": 5.2252905793459885e-05, + "loss": 2.6885, + "step": 34607 + }, + { + "epoch": 1.6112624252159136, + "grad_norm": 0.3705446667251668, + "learning_rate": 5.2250199804683984e-05, + "loss": 2.5933, + "step": 34608 + }, + { + "epoch": 1.6113089834020067, + 
"grad_norm": 0.3461763877740314, + "learning_rate": 5.224749380930397e-05, + "loss": 2.7195, + "step": 34609 + }, + { + "epoch": 1.6113555415880998, + "grad_norm": 0.3761432823251839, + "learning_rate": 5.224478780732778e-05, + "loss": 2.7804, + "step": 34610 + }, + { + "epoch": 1.611402099774193, + "grad_norm": 0.3627818804819463, + "learning_rate": 5.224208179876335e-05, + "loss": 2.729, + "step": 34611 + }, + { + "epoch": 1.611448657960286, + "grad_norm": 0.3783847056091203, + "learning_rate": 5.223937578361866e-05, + "loss": 2.8296, + "step": 34612 + }, + { + "epoch": 1.611495216146379, + "grad_norm": 0.3439287957799168, + "learning_rate": 5.223666976190159e-05, + "loss": 2.7306, + "step": 34613 + }, + { + "epoch": 1.611541774332472, + "grad_norm": 0.37050152695568717, + "learning_rate": 5.223396373362013e-05, + "loss": 2.6664, + "step": 34614 + }, + { + "epoch": 1.611588332518565, + "grad_norm": 0.3382247783966125, + "learning_rate": 5.2231257698782196e-05, + "loss": 2.7056, + "step": 34615 + }, + { + "epoch": 1.611634890704658, + "grad_norm": 0.3835074222150414, + "learning_rate": 5.222855165739575e-05, + "loss": 2.6173, + "step": 34616 + }, + { + "epoch": 1.6116814488907512, + "grad_norm": 0.35971430642008856, + "learning_rate": 5.222584560946874e-05, + "loss": 2.8068, + "step": 34617 + }, + { + "epoch": 1.6117280070768443, + "grad_norm": 0.3539130812483853, + "learning_rate": 5.2223139555009074e-05, + "loss": 2.8442, + "step": 34618 + }, + { + "epoch": 1.6117745652629374, + "grad_norm": 0.3466879688363509, + "learning_rate": 5.222043349402471e-05, + "loss": 2.7503, + "step": 34619 + }, + { + "epoch": 1.6118211234490305, + "grad_norm": 0.35665970419155446, + "learning_rate": 5.22177274265236e-05, + "loss": 2.7543, + "step": 34620 + }, + { + "epoch": 1.6118676816351236, + "grad_norm": 0.3560210613292587, + "learning_rate": 5.2215021352513684e-05, + "loss": 2.668, + "step": 34621 + }, + { + "epoch": 1.6119142398212165, + "grad_norm": 0.3494872155355687, + "learning_rate": 5.221231527200289e-05, + "loss": 2.7291, + "step": 34622 + }, + { + "epoch": 1.6119607980073096, + "grad_norm": 0.3493645629758732, + "learning_rate": 5.220960918499915e-05, + "loss": 2.6742, + "step": 34623 + }, + { + "epoch": 1.6120073561934027, + "grad_norm": 0.3194598104472837, + "learning_rate": 5.220690309151045e-05, + "loss": 2.7221, + "step": 34624 + }, + { + "epoch": 1.6120539143794956, + "grad_norm": 0.32113339552940956, + "learning_rate": 5.2204196991544696e-05, + "loss": 2.7949, + "step": 34625 + }, + { + "epoch": 1.6121004725655887, + "grad_norm": 0.36173274883495776, + "learning_rate": 5.220149088510985e-05, + "loss": 2.7238, + "step": 34626 + }, + { + "epoch": 1.6121470307516819, + "grad_norm": 0.3598470133737563, + "learning_rate": 5.219878477221384e-05, + "loss": 2.7188, + "step": 34627 + }, + { + "epoch": 1.612193588937775, + "grad_norm": 0.3566446431373504, + "learning_rate": 5.219607865286461e-05, + "loss": 2.7127, + "step": 34628 + }, + { + "epoch": 1.612240147123868, + "grad_norm": 0.33052665718563334, + "learning_rate": 5.2193372527070105e-05, + "loss": 2.5755, + "step": 34629 + }, + { + "epoch": 1.6122867053099612, + "grad_norm": 0.3608556349455629, + "learning_rate": 5.2190666394838285e-05, + "loss": 2.6111, + "step": 34630 + }, + { + "epoch": 1.6123332634960543, + "grad_norm": 0.33910058472129223, + "learning_rate": 5.218796025617705e-05, + "loss": 2.7146, + "step": 34631 + }, + { + "epoch": 1.6123798216821472, + "grad_norm": 0.34577753383275434, + "learning_rate": 5.218525411109439e-05, + 
"loss": 2.6462, + "step": 34632 + }, + { + "epoch": 1.6124263798682403, + "grad_norm": 0.3334368244662164, + "learning_rate": 5.218254795959822e-05, + "loss": 2.7017, + "step": 34633 + }, + { + "epoch": 1.6124729380543334, + "grad_norm": 0.37776017661420486, + "learning_rate": 5.2179841801696475e-05, + "loss": 2.7185, + "step": 34634 + }, + { + "epoch": 1.6125194962404263, + "grad_norm": 0.32073352952108825, + "learning_rate": 5.217713563739711e-05, + "loss": 2.7553, + "step": 34635 + }, + { + "epoch": 1.6125660544265195, + "grad_norm": 0.36773063562881125, + "learning_rate": 5.217442946670807e-05, + "loss": 2.7578, + "step": 34636 + }, + { + "epoch": 1.6126126126126126, + "grad_norm": 0.36105672651853404, + "learning_rate": 5.2171723289637296e-05, + "loss": 2.6363, + "step": 34637 + }, + { + "epoch": 1.6126591707987057, + "grad_norm": 0.33412645689164877, + "learning_rate": 5.216901710619272e-05, + "loss": 2.6449, + "step": 34638 + }, + { + "epoch": 1.6127057289847988, + "grad_norm": 0.34831798464081737, + "learning_rate": 5.216631091638231e-05, + "loss": 2.7056, + "step": 34639 + }, + { + "epoch": 1.612752287170892, + "grad_norm": 0.34699024565664516, + "learning_rate": 5.2163604720213976e-05, + "loss": 2.7081, + "step": 34640 + }, + { + "epoch": 1.612798845356985, + "grad_norm": 0.3260423205481697, + "learning_rate": 5.216089851769568e-05, + "loss": 2.7547, + "step": 34641 + }, + { + "epoch": 1.612845403543078, + "grad_norm": 0.3725250402276182, + "learning_rate": 5.215819230883535e-05, + "loss": 2.7784, + "step": 34642 + }, + { + "epoch": 1.612891961729171, + "grad_norm": 0.3693833436860447, + "learning_rate": 5.215548609364095e-05, + "loss": 2.7257, + "step": 34643 + }, + { + "epoch": 1.612938519915264, + "grad_norm": 0.32769633228523487, + "learning_rate": 5.2152779872120405e-05, + "loss": 2.7389, + "step": 34644 + }, + { + "epoch": 1.612985078101357, + "grad_norm": 0.3894788877674522, + "learning_rate": 5.215007364428167e-05, + "loss": 2.6764, + "step": 34645 + }, + { + "epoch": 1.6130316362874502, + "grad_norm": 0.34505815589048383, + "learning_rate": 5.214736741013267e-05, + "loss": 2.619, + "step": 34646 + }, + { + "epoch": 1.6130781944735433, + "grad_norm": 0.31986451507631664, + "learning_rate": 5.2144661169681376e-05, + "loss": 2.6357, + "step": 34647 + }, + { + "epoch": 1.6131247526596364, + "grad_norm": 0.32217337824321457, + "learning_rate": 5.214195492293571e-05, + "loss": 2.6084, + "step": 34648 + }, + { + "epoch": 1.6131713108457295, + "grad_norm": 0.33350787473137, + "learning_rate": 5.2139248669903594e-05, + "loss": 2.7048, + "step": 34649 + }, + { + "epoch": 1.6132178690318226, + "grad_norm": 0.3573778794740643, + "learning_rate": 5.2136542410593014e-05, + "loss": 2.6661, + "step": 34650 + }, + { + "epoch": 1.6132644272179157, + "grad_norm": 0.35499808097860275, + "learning_rate": 5.213383614501188e-05, + "loss": 2.6987, + "step": 34651 + }, + { + "epoch": 1.6133109854040086, + "grad_norm": 0.3611448096923304, + "learning_rate": 5.213112987316815e-05, + "loss": 2.6243, + "step": 34652 + }, + { + "epoch": 1.6133575435901018, + "grad_norm": 0.3478174783908297, + "learning_rate": 5.212842359506977e-05, + "loss": 2.6244, + "step": 34653 + }, + { + "epoch": 1.6134041017761946, + "grad_norm": 0.38097689261320267, + "learning_rate": 5.212571731072467e-05, + "loss": 2.6164, + "step": 34654 + }, + { + "epoch": 1.6134506599622878, + "grad_norm": 0.3501192980004126, + "learning_rate": 5.212301102014081e-05, + "loss": 2.7778, + "step": 34655 + }, + { + "epoch": 
1.6134972181483809, + "grad_norm": 0.36780383055644944, + "learning_rate": 5.212030472332612e-05, + "loss": 2.6819, + "step": 34656 + }, + { + "epoch": 1.613543776334474, + "grad_norm": 0.3641949609787312, + "learning_rate": 5.211759842028854e-05, + "loss": 2.696, + "step": 34657 + }, + { + "epoch": 1.613590334520567, + "grad_norm": 0.31702958688855826, + "learning_rate": 5.211489211103603e-05, + "loss": 2.7027, + "step": 34658 + }, + { + "epoch": 1.6136368927066602, + "grad_norm": 0.37663449454692877, + "learning_rate": 5.211218579557649e-05, + "loss": 2.6976, + "step": 34659 + }, + { + "epoch": 1.6136834508927533, + "grad_norm": 0.34792787698553546, + "learning_rate": 5.210947947391791e-05, + "loss": 2.7698, + "step": 34660 + }, + { + "epoch": 1.6137300090788465, + "grad_norm": 0.3491499454975237, + "learning_rate": 5.210677314606822e-05, + "loss": 2.7268, + "step": 34661 + }, + { + "epoch": 1.6137765672649393, + "grad_norm": 0.4066596205080904, + "learning_rate": 5.210406681203536e-05, + "loss": 2.7668, + "step": 34662 + }, + { + "epoch": 1.6138231254510325, + "grad_norm": 0.3475611766702873, + "learning_rate": 5.210136047182727e-05, + "loss": 2.8036, + "step": 34663 + }, + { + "epoch": 1.6138696836371254, + "grad_norm": 0.3288057366348442, + "learning_rate": 5.209865412545187e-05, + "loss": 2.6578, + "step": 34664 + }, + { + "epoch": 1.6139162418232185, + "grad_norm": 0.3470676217478911, + "learning_rate": 5.209594777291715e-05, + "loss": 2.7293, + "step": 34665 + }, + { + "epoch": 1.6139628000093116, + "grad_norm": 0.37164717916552326, + "learning_rate": 5.209324141423102e-05, + "loss": 2.7775, + "step": 34666 + }, + { + "epoch": 1.6140093581954047, + "grad_norm": 0.3278582862359037, + "learning_rate": 5.209053504940144e-05, + "loss": 2.715, + "step": 34667 + }, + { + "epoch": 1.6140559163814978, + "grad_norm": 0.3671053949673624, + "learning_rate": 5.208782867843635e-05, + "loss": 2.7751, + "step": 34668 + }, + { + "epoch": 1.614102474567591, + "grad_norm": 0.3351233332486273, + "learning_rate": 5.2085122301343695e-05, + "loss": 2.7006, + "step": 34669 + }, + { + "epoch": 1.614149032753684, + "grad_norm": 0.35659539222773623, + "learning_rate": 5.208241591813139e-05, + "loss": 2.5541, + "step": 34670 + }, + { + "epoch": 1.614195590939777, + "grad_norm": 0.33215984485920674, + "learning_rate": 5.207970952880742e-05, + "loss": 2.7439, + "step": 34671 + }, + { + "epoch": 1.61424214912587, + "grad_norm": 0.35416642191463793, + "learning_rate": 5.207700313337969e-05, + "loss": 2.6413, + "step": 34672 + }, + { + "epoch": 1.6142887073119632, + "grad_norm": 0.32413649118837007, + "learning_rate": 5.2074296731856165e-05, + "loss": 2.6176, + "step": 34673 + }, + { + "epoch": 1.614335265498056, + "grad_norm": 0.34831058427447986, + "learning_rate": 5.207159032424478e-05, + "loss": 2.6975, + "step": 34674 + }, + { + "epoch": 1.6143818236841492, + "grad_norm": 0.34102493255867405, + "learning_rate": 5.206888391055348e-05, + "loss": 2.7176, + "step": 34675 + }, + { + "epoch": 1.6144283818702423, + "grad_norm": 0.3023491072616363, + "learning_rate": 5.2066177490790226e-05, + "loss": 2.6825, + "step": 34676 + }, + { + "epoch": 1.6144749400563354, + "grad_norm": 0.38368496058367174, + "learning_rate": 5.206347106496293e-05, + "loss": 2.7409, + "step": 34677 + }, + { + "epoch": 1.6145214982424285, + "grad_norm": 0.3268813073069852, + "learning_rate": 5.206076463307955e-05, + "loss": 2.6559, + "step": 34678 + }, + { + "epoch": 1.6145680564285216, + "grad_norm": 0.33945524681726835, + "learning_rate": 
5.205805819514803e-05, + "loss": 2.7399, + "step": 34679 + }, + { + "epoch": 1.6146146146146148, + "grad_norm": 0.3591777071960536, + "learning_rate": 5.205535175117633e-05, + "loss": 2.7589, + "step": 34680 + }, + { + "epoch": 1.6146611728007076, + "grad_norm": 0.35488920927972295, + "learning_rate": 5.2052645301172356e-05, + "loss": 2.7263, + "step": 34681 + }, + { + "epoch": 1.6147077309868008, + "grad_norm": 0.33936021107477266, + "learning_rate": 5.204993884514407e-05, + "loss": 2.7269, + "step": 34682 + }, + { + "epoch": 1.6147542891728939, + "grad_norm": 0.36481059074179023, + "learning_rate": 5.204723238309942e-05, + "loss": 2.7288, + "step": 34683 + }, + { + "epoch": 1.6148008473589868, + "grad_norm": 0.3435889822786171, + "learning_rate": 5.204452591504635e-05, + "loss": 2.7672, + "step": 34684 + }, + { + "epoch": 1.6148474055450799, + "grad_norm": 0.34444833706932937, + "learning_rate": 5.204181944099279e-05, + "loss": 2.6399, + "step": 34685 + }, + { + "epoch": 1.614893963731173, + "grad_norm": 0.3403952325202294, + "learning_rate": 5.203911296094669e-05, + "loss": 2.748, + "step": 34686 + }, + { + "epoch": 1.6149405219172661, + "grad_norm": 0.39322668883381245, + "learning_rate": 5.2036406474916e-05, + "loss": 2.8455, + "step": 34687 + }, + { + "epoch": 1.6149870801033592, + "grad_norm": 0.33405757540710357, + "learning_rate": 5.203369998290865e-05, + "loss": 2.7888, + "step": 34688 + }, + { + "epoch": 1.6150336382894523, + "grad_norm": 0.3485814001613456, + "learning_rate": 5.20309934849326e-05, + "loss": 2.7472, + "step": 34689 + }, + { + "epoch": 1.6150801964755455, + "grad_norm": 0.3734815531076267, + "learning_rate": 5.2028286980995775e-05, + "loss": 2.7264, + "step": 34690 + }, + { + "epoch": 1.6151267546616384, + "grad_norm": 0.36054027232549535, + "learning_rate": 5.2025580471106136e-05, + "loss": 2.7598, + "step": 34691 + }, + { + "epoch": 1.6151733128477315, + "grad_norm": 0.34481872269006214, + "learning_rate": 5.2022873955271614e-05, + "loss": 2.7201, + "step": 34692 + }, + { + "epoch": 1.6152198710338244, + "grad_norm": 0.36241801600569395, + "learning_rate": 5.202016743350016e-05, + "loss": 2.8483, + "step": 34693 + }, + { + "epoch": 1.6152664292199175, + "grad_norm": 0.32838020506614596, + "learning_rate": 5.2017460905799705e-05, + "loss": 2.6412, + "step": 34694 + }, + { + "epoch": 1.6153129874060106, + "grad_norm": 0.34920982420501373, + "learning_rate": 5.201475437217822e-05, + "loss": 2.6838, + "step": 34695 + }, + { + "epoch": 1.6153595455921037, + "grad_norm": 0.36223064377838676, + "learning_rate": 5.201204783264362e-05, + "loss": 2.6182, + "step": 34696 + }, + { + "epoch": 1.6154061037781968, + "grad_norm": 0.33245363456024574, + "learning_rate": 5.2009341287203837e-05, + "loss": 2.7451, + "step": 34697 + }, + { + "epoch": 1.61545266196429, + "grad_norm": 0.31661187488714854, + "learning_rate": 5.2006634735866863e-05, + "loss": 2.7279, + "step": 34698 + }, + { + "epoch": 1.615499220150383, + "grad_norm": 0.34444474632417454, + "learning_rate": 5.2003928178640606e-05, + "loss": 2.5601, + "step": 34699 + }, + { + "epoch": 1.6155457783364762, + "grad_norm": 0.3335284457893771, + "learning_rate": 5.2001221615533005e-05, + "loss": 2.7307, + "step": 34700 + }, + { + "epoch": 1.615592336522569, + "grad_norm": 0.3177913824504605, + "learning_rate": 5.1998515046552024e-05, + "loss": 2.6714, + "step": 34701 + }, + { + "epoch": 1.6156388947086622, + "grad_norm": 0.34133601601311175, + "learning_rate": 5.19958084717056e-05, + "loss": 2.7738, + "step": 34702 + }, + { 
+ "epoch": 1.615685452894755, + "grad_norm": 0.32886435673233505, + "learning_rate": 5.199310189100167e-05, + "loss": 2.7637, + "step": 34703 + }, + { + "epoch": 1.6157320110808482, + "grad_norm": 0.35104607901255064, + "learning_rate": 5.1990395304448194e-05, + "loss": 2.7364, + "step": 34704 + }, + { + "epoch": 1.6157785692669413, + "grad_norm": 0.3424333976551575, + "learning_rate": 5.198768871205308e-05, + "loss": 2.5638, + "step": 34705 + }, + { + "epoch": 1.6158251274530344, + "grad_norm": 0.34011210041514855, + "learning_rate": 5.198498211382433e-05, + "loss": 2.7091, + "step": 34706 + }, + { + "epoch": 1.6158716856391275, + "grad_norm": 0.3444898711602947, + "learning_rate": 5.198227550976983e-05, + "loss": 2.7497, + "step": 34707 + }, + { + "epoch": 1.6159182438252206, + "grad_norm": 0.3538468265577257, + "learning_rate": 5.197956889989756e-05, + "loss": 2.821, + "step": 34708 + }, + { + "epoch": 1.6159648020113138, + "grad_norm": 0.3284157031059895, + "learning_rate": 5.197686228421543e-05, + "loss": 2.7635, + "step": 34709 + }, + { + "epoch": 1.6160113601974067, + "grad_norm": 0.3859606253340549, + "learning_rate": 5.197415566273142e-05, + "loss": 2.6762, + "step": 34710 + }, + { + "epoch": 1.6160579183834998, + "grad_norm": 0.3411426458802058, + "learning_rate": 5.197144903545345e-05, + "loss": 2.7113, + "step": 34711 + }, + { + "epoch": 1.6161044765695929, + "grad_norm": 0.35123936090152835, + "learning_rate": 5.1968742402389484e-05, + "loss": 2.6763, + "step": 34712 + }, + { + "epoch": 1.6161510347556858, + "grad_norm": 0.3582551907154658, + "learning_rate": 5.196603576354744e-05, + "loss": 2.6314, + "step": 34713 + }, + { + "epoch": 1.616197592941779, + "grad_norm": 0.35574089006875564, + "learning_rate": 5.196332911893527e-05, + "loss": 2.7242, + "step": 34714 + }, + { + "epoch": 1.616244151127872, + "grad_norm": 0.36745828365540745, + "learning_rate": 5.1960622468560926e-05, + "loss": 2.6744, + "step": 34715 + }, + { + "epoch": 1.6162907093139651, + "grad_norm": 0.3762708381758389, + "learning_rate": 5.1957915812432356e-05, + "loss": 2.653, + "step": 34716 + }, + { + "epoch": 1.6163372675000582, + "grad_norm": 0.34436132478521014, + "learning_rate": 5.1955209150557496e-05, + "loss": 2.7118, + "step": 34717 + }, + { + "epoch": 1.6163838256861514, + "grad_norm": 0.3909299540411646, + "learning_rate": 5.195250248294428e-05, + "loss": 2.7534, + "step": 34718 + }, + { + "epoch": 1.6164303838722445, + "grad_norm": 0.36839385339623726, + "learning_rate": 5.194979580960068e-05, + "loss": 2.7245, + "step": 34719 + }, + { + "epoch": 1.6164769420583374, + "grad_norm": 0.35737368366131644, + "learning_rate": 5.19470891305346e-05, + "loss": 2.7818, + "step": 34720 + }, + { + "epoch": 1.6165235002444305, + "grad_norm": 0.31797578022410744, + "learning_rate": 5.1944382445754024e-05, + "loss": 2.6281, + "step": 34721 + }, + { + "epoch": 1.6165700584305236, + "grad_norm": 0.3509739463984396, + "learning_rate": 5.194167575526687e-05, + "loss": 2.658, + "step": 34722 + }, + { + "epoch": 1.6166166166166165, + "grad_norm": 0.323589141452231, + "learning_rate": 5.193896905908108e-05, + "loss": 2.7127, + "step": 34723 + }, + { + "epoch": 1.6166631748027096, + "grad_norm": 0.3480907921368293, + "learning_rate": 5.193626235720461e-05, + "loss": 2.6442, + "step": 34724 + }, + { + "epoch": 1.6167097329888027, + "grad_norm": 0.3535262621324987, + "learning_rate": 5.1933555649645414e-05, + "loss": 2.7735, + "step": 34725 + }, + { + "epoch": 1.6167562911748958, + "grad_norm": 0.32797261300794595, + 
"learning_rate": 5.19308489364114e-05, + "loss": 2.75, + "step": 34726 + }, + { + "epoch": 1.616802849360989, + "grad_norm": 0.38070400533769017, + "learning_rate": 5.1928142217510556e-05, + "loss": 2.6996, + "step": 34727 + }, + { + "epoch": 1.616849407547082, + "grad_norm": 0.31544769828940833, + "learning_rate": 5.192543549295079e-05, + "loss": 2.7715, + "step": 34728 + }, + { + "epoch": 1.6168959657331752, + "grad_norm": 0.3499227160562331, + "learning_rate": 5.192272876274006e-05, + "loss": 2.8013, + "step": 34729 + }, + { + "epoch": 1.616942523919268, + "grad_norm": 0.3170250659136328, + "learning_rate": 5.192002202688633e-05, + "loss": 2.7278, + "step": 34730 + }, + { + "epoch": 1.6169890821053612, + "grad_norm": 0.32566289405750287, + "learning_rate": 5.1917315285397505e-05, + "loss": 2.7401, + "step": 34731 + }, + { + "epoch": 1.617035640291454, + "grad_norm": 0.34682636880506973, + "learning_rate": 5.191460853828156e-05, + "loss": 2.7371, + "step": 34732 + }, + { + "epoch": 1.6170821984775472, + "grad_norm": 0.3331178539689889, + "learning_rate": 5.191190178554642e-05, + "loss": 2.7093, + "step": 34733 + }, + { + "epoch": 1.6171287566636403, + "grad_norm": 0.34506689296967885, + "learning_rate": 5.190919502720005e-05, + "loss": 2.7098, + "step": 34734 + }, + { + "epoch": 1.6171753148497334, + "grad_norm": 0.32282837148598736, + "learning_rate": 5.190648826325036e-05, + "loss": 2.6584, + "step": 34735 + }, + { + "epoch": 1.6172218730358265, + "grad_norm": 0.33033070648634705, + "learning_rate": 5.190378149370533e-05, + "loss": 2.7123, + "step": 34736 + }, + { + "epoch": 1.6172684312219197, + "grad_norm": 0.35479607079339026, + "learning_rate": 5.190107471857287e-05, + "loss": 2.7627, + "step": 34737 + }, + { + "epoch": 1.6173149894080128, + "grad_norm": 0.3565259759587507, + "learning_rate": 5.189836793786096e-05, + "loss": 2.71, + "step": 34738 + }, + { + "epoch": 1.6173615475941059, + "grad_norm": 0.3332432857481139, + "learning_rate": 5.1895661151577534e-05, + "loss": 2.7333, + "step": 34739 + }, + { + "epoch": 1.6174081057801988, + "grad_norm": 0.340719233106777, + "learning_rate": 5.1892954359730515e-05, + "loss": 2.7621, + "step": 34740 + }, + { + "epoch": 1.617454663966292, + "grad_norm": 0.35570294752479636, + "learning_rate": 5.1890247562327855e-05, + "loss": 2.7311, + "step": 34741 + }, + { + "epoch": 1.6175012221523848, + "grad_norm": 0.3491029029835729, + "learning_rate": 5.1887540759377516e-05, + "loss": 2.759, + "step": 34742 + }, + { + "epoch": 1.617547780338478, + "grad_norm": 0.35253139673138423, + "learning_rate": 5.188483395088743e-05, + "loss": 2.7397, + "step": 34743 + }, + { + "epoch": 1.617594338524571, + "grad_norm": 0.3493297589525763, + "learning_rate": 5.1882127136865545e-05, + "loss": 2.7669, + "step": 34744 + }, + { + "epoch": 1.6176408967106641, + "grad_norm": 0.3690560963936442, + "learning_rate": 5.1879420317319804e-05, + "loss": 2.7367, + "step": 34745 + }, + { + "epoch": 1.6176874548967572, + "grad_norm": 0.32132454641480257, + "learning_rate": 5.1876713492258144e-05, + "loss": 2.8272, + "step": 34746 + }, + { + "epoch": 1.6177340130828504, + "grad_norm": 0.3519788346785228, + "learning_rate": 5.187400666168851e-05, + "loss": 2.766, + "step": 34747 + }, + { + "epoch": 1.6177805712689435, + "grad_norm": 0.35558983769316405, + "learning_rate": 5.1871299825618867e-05, + "loss": 2.7617, + "step": 34748 + }, + { + "epoch": 1.6178271294550366, + "grad_norm": 0.3477775758209821, + "learning_rate": 5.1868592984057126e-05, + "loss": 2.7804, + "step": 34749 
+ }, + { + "epoch": 1.6178736876411295, + "grad_norm": 0.3611124496100531, + "learning_rate": 5.1865886137011255e-05, + "loss": 2.6507, + "step": 34750 + }, + { + "epoch": 1.6179202458272226, + "grad_norm": 0.34248170105675, + "learning_rate": 5.186317928448919e-05, + "loss": 2.762, + "step": 34751 + }, + { + "epoch": 1.6179668040133155, + "grad_norm": 0.4017451883238093, + "learning_rate": 5.186047242649888e-05, + "loss": 2.7473, + "step": 34752 + }, + { + "epoch": 1.6180133621994086, + "grad_norm": 0.31809401944623933, + "learning_rate": 5.1857765563048266e-05, + "loss": 2.6356, + "step": 34753 + }, + { + "epoch": 1.6180599203855017, + "grad_norm": 0.36680160993135785, + "learning_rate": 5.18550586941453e-05, + "loss": 2.7117, + "step": 34754 + }, + { + "epoch": 1.6181064785715948, + "grad_norm": 0.34470603489469387, + "learning_rate": 5.185235181979791e-05, + "loss": 2.6997, + "step": 34755 + }, + { + "epoch": 1.618153036757688, + "grad_norm": 0.3388047840513958, + "learning_rate": 5.1849644940014044e-05, + "loss": 2.6435, + "step": 34756 + }, + { + "epoch": 1.618199594943781, + "grad_norm": 0.366044693978496, + "learning_rate": 5.1846938054801674e-05, + "loss": 2.7039, + "step": 34757 + }, + { + "epoch": 1.6182461531298742, + "grad_norm": 0.3641798253593284, + "learning_rate": 5.184423116416871e-05, + "loss": 2.7407, + "step": 34758 + }, + { + "epoch": 1.618292711315967, + "grad_norm": 0.3380210406888313, + "learning_rate": 5.184152426812311e-05, + "loss": 2.6655, + "step": 34759 + }, + { + "epoch": 1.6183392695020602, + "grad_norm": 0.36465540701079624, + "learning_rate": 5.1838817366672806e-05, + "loss": 2.7529, + "step": 34760 + }, + { + "epoch": 1.6183858276881533, + "grad_norm": 0.34951160203319387, + "learning_rate": 5.1836110459825785e-05, + "loss": 2.688, + "step": 34761 + }, + { + "epoch": 1.6184323858742462, + "grad_norm": 0.35093087719277355, + "learning_rate": 5.183340354758993e-05, + "loss": 2.7094, + "step": 34762 + }, + { + "epoch": 1.6184789440603393, + "grad_norm": 0.38183739874351663, + "learning_rate": 5.183069662997323e-05, + "loss": 2.8112, + "step": 34763 + }, + { + "epoch": 1.6185255022464324, + "grad_norm": 0.33602168975947067, + "learning_rate": 5.182798970698361e-05, + "loss": 2.5676, + "step": 34764 + }, + { + "epoch": 1.6185720604325256, + "grad_norm": 0.3672110315700723, + "learning_rate": 5.182528277862902e-05, + "loss": 2.7424, + "step": 34765 + }, + { + "epoch": 1.6186186186186187, + "grad_norm": 0.3562269783991748, + "learning_rate": 5.182257584491741e-05, + "loss": 2.6507, + "step": 34766 + }, + { + "epoch": 1.6186651768047118, + "grad_norm": 0.3446684920109906, + "learning_rate": 5.1819868905856716e-05, + "loss": 2.6999, + "step": 34767 + }, + { + "epoch": 1.618711734990805, + "grad_norm": 0.3423121517725854, + "learning_rate": 5.181716196145488e-05, + "loss": 2.6041, + "step": 34768 + }, + { + "epoch": 1.6187582931768978, + "grad_norm": 0.3661908416745456, + "learning_rate": 5.1814455011719867e-05, + "loss": 2.8022, + "step": 34769 + }, + { + "epoch": 1.618804851362991, + "grad_norm": 0.34276802271980755, + "learning_rate": 5.18117480566596e-05, + "loss": 2.7913, + "step": 34770 + }, + { + "epoch": 1.618851409549084, + "grad_norm": 0.3490999805812756, + "learning_rate": 5.180904109628203e-05, + "loss": 2.7068, + "step": 34771 + }, + { + "epoch": 1.618897967735177, + "grad_norm": 0.35678219525325816, + "learning_rate": 5.180633413059511e-05, + "loss": 2.7517, + "step": 34772 + }, + { + "epoch": 1.61894452592127, + "grad_norm": 0.3533647360555077, + 
"learning_rate": 5.180362715960677e-05, + "loss": 2.7355, + "step": 34773 + }, + { + "epoch": 1.6189910841073631, + "grad_norm": 0.3549997814076492, + "learning_rate": 5.180092018332496e-05, + "loss": 2.7187, + "step": 34774 + }, + { + "epoch": 1.6190376422934563, + "grad_norm": 0.3751682426654892, + "learning_rate": 5.179821320175764e-05, + "loss": 2.6429, + "step": 34775 + }, + { + "epoch": 1.6190842004795494, + "grad_norm": 0.31845401821581926, + "learning_rate": 5.179550621491273e-05, + "loss": 2.8177, + "step": 34776 + }, + { + "epoch": 1.6191307586656425, + "grad_norm": 0.38859202663007647, + "learning_rate": 5.179279922279818e-05, + "loss": 2.7044, + "step": 34777 + }, + { + "epoch": 1.6191773168517356, + "grad_norm": 0.3725997921968525, + "learning_rate": 5.179009222542195e-05, + "loss": 2.7404, + "step": 34778 + }, + { + "epoch": 1.6192238750378285, + "grad_norm": 0.35940012018990003, + "learning_rate": 5.1787385222791977e-05, + "loss": 2.8025, + "step": 34779 + }, + { + "epoch": 1.6192704332239216, + "grad_norm": 0.41142118220216556, + "learning_rate": 5.1784678214916205e-05, + "loss": 2.6942, + "step": 34780 + }, + { + "epoch": 1.6193169914100145, + "grad_norm": 0.3481045622105126, + "learning_rate": 5.178197120180257e-05, + "loss": 2.672, + "step": 34781 + }, + { + "epoch": 1.6193635495961076, + "grad_norm": 0.38227377956515174, + "learning_rate": 5.177926418345902e-05, + "loss": 2.7063, + "step": 34782 + }, + { + "epoch": 1.6194101077822007, + "grad_norm": 0.33661545454047764, + "learning_rate": 5.1776557159893526e-05, + "loss": 2.7079, + "step": 34783 + }, + { + "epoch": 1.6194566659682939, + "grad_norm": 0.3474536205485335, + "learning_rate": 5.1773850131114e-05, + "loss": 2.7771, + "step": 34784 + }, + { + "epoch": 1.619503224154387, + "grad_norm": 0.3457346543027482, + "learning_rate": 5.177114309712841e-05, + "loss": 2.8409, + "step": 34785 + }, + { + "epoch": 1.61954978234048, + "grad_norm": 0.3383227238265709, + "learning_rate": 5.176843605794467e-05, + "loss": 2.6686, + "step": 34786 + }, + { + "epoch": 1.6195963405265732, + "grad_norm": 0.34348217503219447, + "learning_rate": 5.176572901357075e-05, + "loss": 2.6359, + "step": 34787 + }, + { + "epoch": 1.6196428987126663, + "grad_norm": 0.3319350196915145, + "learning_rate": 5.1763021964014604e-05, + "loss": 2.6514, + "step": 34788 + }, + { + "epoch": 1.6196894568987592, + "grad_norm": 0.3331314726499527, + "learning_rate": 5.176031490928415e-05, + "loss": 2.6505, + "step": 34789 + }, + { + "epoch": 1.6197360150848523, + "grad_norm": 0.3332853424898223, + "learning_rate": 5.175760784938733e-05, + "loss": 2.6811, + "step": 34790 + }, + { + "epoch": 1.6197825732709452, + "grad_norm": 0.35162059426369674, + "learning_rate": 5.1754900784332114e-05, + "loss": 2.7871, + "step": 34791 + }, + { + "epoch": 1.6198291314570383, + "grad_norm": 0.342968688844302, + "learning_rate": 5.1752193714126454e-05, + "loss": 2.6711, + "step": 34792 + }, + { + "epoch": 1.6198756896431314, + "grad_norm": 0.35257354771862065, + "learning_rate": 5.174948663877827e-05, + "loss": 2.7912, + "step": 34793 + }, + { + "epoch": 1.6199222478292246, + "grad_norm": 0.3350454539303931, + "learning_rate": 5.174677955829551e-05, + "loss": 2.7761, + "step": 34794 + }, + { + "epoch": 1.6199688060153177, + "grad_norm": 0.3506549488573666, + "learning_rate": 5.174407247268612e-05, + "loss": 2.6369, + "step": 34795 + }, + { + "epoch": 1.6200153642014108, + "grad_norm": 0.3361668383840239, + "learning_rate": 5.1741365381958066e-05, + "loss": 2.7813, + "step": 
34796 + }, + { + "epoch": 1.620061922387504, + "grad_norm": 0.3432986250467573, + "learning_rate": 5.173865828611926e-05, + "loss": 2.7557, + "step": 34797 + }, + { + "epoch": 1.6201084805735968, + "grad_norm": 0.3199861928668094, + "learning_rate": 5.173595118517768e-05, + "loss": 2.6545, + "step": 34798 + }, + { + "epoch": 1.62015503875969, + "grad_norm": 0.33364203029305645, + "learning_rate": 5.173324407914123e-05, + "loss": 2.7871, + "step": 34799 + }, + { + "epoch": 1.620201596945783, + "grad_norm": 0.34667570770226314, + "learning_rate": 5.1730536968017896e-05, + "loss": 2.7261, + "step": 34800 + }, + { + "epoch": 1.620248155131876, + "grad_norm": 0.34952963271489856, + "learning_rate": 5.17278298518156e-05, + "loss": 2.8232, + "step": 34801 + }, + { + "epoch": 1.620294713317969, + "grad_norm": 0.36600976960519555, + "learning_rate": 5.17251227305423e-05, + "loss": 2.7233, + "step": 34802 + }, + { + "epoch": 1.6203412715040622, + "grad_norm": 0.33430687976941237, + "learning_rate": 5.1722415604205934e-05, + "loss": 2.7415, + "step": 34803 + }, + { + "epoch": 1.6203878296901553, + "grad_norm": 0.3526011284702797, + "learning_rate": 5.171970847281443e-05, + "loss": 2.7326, + "step": 34804 + }, + { + "epoch": 1.6204343878762484, + "grad_norm": 0.3298500030877372, + "learning_rate": 5.171700133637577e-05, + "loss": 2.6778, + "step": 34805 + }, + { + "epoch": 1.6204809460623415, + "grad_norm": 0.3045934512263968, + "learning_rate": 5.171429419489788e-05, + "loss": 2.7458, + "step": 34806 + }, + { + "epoch": 1.6205275042484346, + "grad_norm": 0.38906667707574794, + "learning_rate": 5.17115870483887e-05, + "loss": 2.6997, + "step": 34807 + }, + { + "epoch": 1.6205740624345275, + "grad_norm": 0.33728910984768384, + "learning_rate": 5.1708879896856175e-05, + "loss": 2.6931, + "step": 34808 + }, + { + "epoch": 1.6206206206206206, + "grad_norm": 0.34798618904569245, + "learning_rate": 5.170617274030828e-05, + "loss": 2.6549, + "step": 34809 + }, + { + "epoch": 1.6206671788067137, + "grad_norm": 0.34424236437566047, + "learning_rate": 5.170346557875291e-05, + "loss": 2.7949, + "step": 34810 + }, + { + "epoch": 1.6207137369928066, + "grad_norm": 0.35835418602422975, + "learning_rate": 5.170075841219806e-05, + "loss": 2.6177, + "step": 34811 + }, + { + "epoch": 1.6207602951788997, + "grad_norm": 0.3686845314630452, + "learning_rate": 5.169805124065163e-05, + "loss": 2.8201, + "step": 34812 + }, + { + "epoch": 1.6208068533649929, + "grad_norm": 0.3656563897282712, + "learning_rate": 5.169534406412159e-05, + "loss": 2.7428, + "step": 34813 + }, + { + "epoch": 1.620853411551086, + "grad_norm": 0.38656998404100007, + "learning_rate": 5.1692636882615885e-05, + "loss": 2.6743, + "step": 34814 + }, + { + "epoch": 1.620899969737179, + "grad_norm": 0.37437015155674613, + "learning_rate": 5.168992969614247e-05, + "loss": 2.8023, + "step": 34815 + }, + { + "epoch": 1.6209465279232722, + "grad_norm": 0.36799983978379747, + "learning_rate": 5.168722250470925e-05, + "loss": 2.711, + "step": 34816 + }, + { + "epoch": 1.6209930861093653, + "grad_norm": 0.38614966835809245, + "learning_rate": 5.168451530832422e-05, + "loss": 2.7875, + "step": 34817 + }, + { + "epoch": 1.6210396442954582, + "grad_norm": 0.3370630327883406, + "learning_rate": 5.16818081069953e-05, + "loss": 2.675, + "step": 34818 + }, + { + "epoch": 1.6210862024815513, + "grad_norm": 0.3444776585618578, + "learning_rate": 5.167910090073043e-05, + "loss": 2.7662, + "step": 34819 + }, + { + "epoch": 1.6211327606676442, + "grad_norm": 
0.37666999359880476, + "learning_rate": 5.1676393689537585e-05, + "loss": 2.6812, + "step": 34820 + }, + { + "epoch": 1.6211793188537373, + "grad_norm": 0.3393156649384053, + "learning_rate": 5.1673686473424666e-05, + "loss": 2.7084, + "step": 34821 + }, + { + "epoch": 1.6212258770398305, + "grad_norm": 0.3967147710113806, + "learning_rate": 5.167097925239967e-05, + "loss": 2.7691, + "step": 34822 + }, + { + "epoch": 1.6212724352259236, + "grad_norm": 0.35954480137509237, + "learning_rate": 5.166827202647049e-05, + "loss": 2.7314, + "step": 34823 + }, + { + "epoch": 1.6213189934120167, + "grad_norm": 0.36897087925676475, + "learning_rate": 5.16655647956451e-05, + "loss": 2.6771, + "step": 34824 + }, + { + "epoch": 1.6213655515981098, + "grad_norm": 0.34591019011269697, + "learning_rate": 5.166285755993144e-05, + "loss": 2.7258, + "step": 34825 + }, + { + "epoch": 1.621412109784203, + "grad_norm": 0.38901787605125987, + "learning_rate": 5.1660150319337466e-05, + "loss": 2.7273, + "step": 34826 + }, + { + "epoch": 1.621458667970296, + "grad_norm": 0.3553727010517703, + "learning_rate": 5.16574430738711e-05, + "loss": 2.5989, + "step": 34827 + }, + { + "epoch": 1.621505226156389, + "grad_norm": 0.3988158804962613, + "learning_rate": 5.165473582354031e-05, + "loss": 2.7118, + "step": 34828 + }, + { + "epoch": 1.621551784342482, + "grad_norm": 0.36812674328124406, + "learning_rate": 5.165202856835304e-05, + "loss": 2.6856, + "step": 34829 + }, + { + "epoch": 1.621598342528575, + "grad_norm": 0.31935999870166754, + "learning_rate": 5.164932130831722e-05, + "loss": 2.7548, + "step": 34830 + }, + { + "epoch": 1.621644900714668, + "grad_norm": 0.36370861051990927, + "learning_rate": 5.164661404344081e-05, + "loss": 2.7856, + "step": 34831 + }, + { + "epoch": 1.6216914589007612, + "grad_norm": 0.3160893170898827, + "learning_rate": 5.1643906773731744e-05, + "loss": 2.7663, + "step": 34832 + }, + { + "epoch": 1.6217380170868543, + "grad_norm": 0.3319556464382778, + "learning_rate": 5.164119949919799e-05, + "loss": 2.6656, + "step": 34833 + }, + { + "epoch": 1.6217845752729474, + "grad_norm": 0.3554682980037796, + "learning_rate": 5.163849221984746e-05, + "loss": 2.6717, + "step": 34834 + }, + { + "epoch": 1.6218311334590405, + "grad_norm": 0.3517265611872869, + "learning_rate": 5.1635784935688126e-05, + "loss": 2.8055, + "step": 34835 + }, + { + "epoch": 1.6218776916451336, + "grad_norm": 0.3660472712643874, + "learning_rate": 5.163307764672791e-05, + "loss": 2.6753, + "step": 34836 + }, + { + "epoch": 1.6219242498312267, + "grad_norm": 0.3422596904281164, + "learning_rate": 5.163037035297479e-05, + "loss": 2.592, + "step": 34837 + }, + { + "epoch": 1.6219708080173196, + "grad_norm": 0.33998781204702855, + "learning_rate": 5.162766305443669e-05, + "loss": 2.6416, + "step": 34838 + }, + { + "epoch": 1.6220173662034127, + "grad_norm": 0.34894416114999455, + "learning_rate": 5.162495575112155e-05, + "loss": 2.7227, + "step": 34839 + }, + { + "epoch": 1.6220639243895056, + "grad_norm": 0.36013644404217965, + "learning_rate": 5.1622248443037323e-05, + "loss": 2.6275, + "step": 34840 + }, + { + "epoch": 1.6221104825755988, + "grad_norm": 0.3706349559264908, + "learning_rate": 5.161954113019196e-05, + "loss": 2.6883, + "step": 34841 + }, + { + "epoch": 1.6221570407616919, + "grad_norm": 0.34355484041754286, + "learning_rate": 5.1616833812593415e-05, + "loss": 2.7388, + "step": 34842 + }, + { + "epoch": 1.622203598947785, + "grad_norm": 0.366295728787992, + "learning_rate": 5.1614126490249614e-05, + "loss": 
2.6432, + "step": 34843 + }, + { + "epoch": 1.622250157133878, + "grad_norm": 0.34719444357327506, + "learning_rate": 5.161141916316851e-05, + "loss": 2.7444, + "step": 34844 + }, + { + "epoch": 1.6222967153199712, + "grad_norm": 0.394592032831658, + "learning_rate": 5.160871183135805e-05, + "loss": 2.6582, + "step": 34845 + }, + { + "epoch": 1.6223432735060643, + "grad_norm": 0.33793918762356207, + "learning_rate": 5.1606004494826176e-05, + "loss": 2.6789, + "step": 34846 + }, + { + "epoch": 1.6223898316921572, + "grad_norm": 0.38946986736796846, + "learning_rate": 5.160329715358084e-05, + "loss": 2.6664, + "step": 34847 + }, + { + "epoch": 1.6224363898782503, + "grad_norm": 0.34998220173211936, + "learning_rate": 5.1600589807629996e-05, + "loss": 2.7846, + "step": 34848 + }, + { + "epoch": 1.6224829480643435, + "grad_norm": 0.3659689366804928, + "learning_rate": 5.159788245698156e-05, + "loss": 2.7111, + "step": 34849 + }, + { + "epoch": 1.6225295062504363, + "grad_norm": 0.3701156326166378, + "learning_rate": 5.15951751016435e-05, + "loss": 2.6788, + "step": 34850 + }, + { + "epoch": 1.6225760644365295, + "grad_norm": 0.3505680082935393, + "learning_rate": 5.159246774162377e-05, + "loss": 2.5671, + "step": 34851 + }, + { + "epoch": 1.6226226226226226, + "grad_norm": 0.34941154957980236, + "learning_rate": 5.1589760376930294e-05, + "loss": 2.7695, + "step": 34852 + }, + { + "epoch": 1.6226691808087157, + "grad_norm": 0.34044054797681955, + "learning_rate": 5.158705300757103e-05, + "loss": 2.7157, + "step": 34853 + }, + { + "epoch": 1.6227157389948088, + "grad_norm": 0.35506860368697124, + "learning_rate": 5.158434563355392e-05, + "loss": 2.7316, + "step": 34854 + }, + { + "epoch": 1.622762297180902, + "grad_norm": 0.3582566092879835, + "learning_rate": 5.158163825488691e-05, + "loss": 2.7049, + "step": 34855 + }, + { + "epoch": 1.622808855366995, + "grad_norm": 0.3477461294049402, + "learning_rate": 5.157893087157796e-05, + "loss": 2.7167, + "step": 34856 + }, + { + "epoch": 1.622855413553088, + "grad_norm": 0.33759756307665506, + "learning_rate": 5.157622348363499e-05, + "loss": 2.7026, + "step": 34857 + }, + { + "epoch": 1.622901971739181, + "grad_norm": 0.3561361376619478, + "learning_rate": 5.157351609106596e-05, + "loss": 2.7518, + "step": 34858 + }, + { + "epoch": 1.6229485299252742, + "grad_norm": 0.3399327286906508, + "learning_rate": 5.1570808693878826e-05, + "loss": 2.7029, + "step": 34859 + }, + { + "epoch": 1.622995088111367, + "grad_norm": 0.3346768853931308, + "learning_rate": 5.156810129208152e-05, + "loss": 2.808, + "step": 34860 + }, + { + "epoch": 1.6230416462974602, + "grad_norm": 0.330792912192498, + "learning_rate": 5.156539388568199e-05, + "loss": 2.6163, + "step": 34861 + }, + { + "epoch": 1.6230882044835533, + "grad_norm": 0.3482138085262894, + "learning_rate": 5.156268647468818e-05, + "loss": 2.6928, + "step": 34862 + }, + { + "epoch": 1.6231347626696464, + "grad_norm": 0.34038084045895844, + "learning_rate": 5.155997905910803e-05, + "loss": 2.7006, + "step": 34863 + }, + { + "epoch": 1.6231813208557395, + "grad_norm": 0.3295080582519802, + "learning_rate": 5.1557271638949515e-05, + "loss": 2.7295, + "step": 34864 + }, + { + "epoch": 1.6232278790418326, + "grad_norm": 0.31997194503271204, + "learning_rate": 5.1554564214220546e-05, + "loss": 2.7439, + "step": 34865 + }, + { + "epoch": 1.6232744372279257, + "grad_norm": 0.34607407205174856, + "learning_rate": 5.1551856784929084e-05, + "loss": 2.6928, + "step": 34866 + }, + { + "epoch": 1.6233209954140186, + 
"grad_norm": 0.32699530576709107, + "learning_rate": 5.154914935108308e-05, + "loss": 2.6575, + "step": 34867 + }, + { + "epoch": 1.6233675536001118, + "grad_norm": 0.3303412411976506, + "learning_rate": 5.154644191269048e-05, + "loss": 2.7032, + "step": 34868 + }, + { + "epoch": 1.6234141117862046, + "grad_norm": 0.34258889709909934, + "learning_rate": 5.154373446975923e-05, + "loss": 2.7203, + "step": 34869 + }, + { + "epoch": 1.6234606699722978, + "grad_norm": 0.3222953181739511, + "learning_rate": 5.154102702229726e-05, + "loss": 2.6591, + "step": 34870 + }, + { + "epoch": 1.6235072281583909, + "grad_norm": 0.362804526120507, + "learning_rate": 5.153831957031252e-05, + "loss": 2.7286, + "step": 34871 + }, + { + "epoch": 1.623553786344484, + "grad_norm": 0.33197770379265756, + "learning_rate": 5.153561211381298e-05, + "loss": 2.6617, + "step": 34872 + }, + { + "epoch": 1.623600344530577, + "grad_norm": 0.30586204988669885, + "learning_rate": 5.153290465280657e-05, + "loss": 2.6752, + "step": 34873 + }, + { + "epoch": 1.6236469027166702, + "grad_norm": 0.3450407627647458, + "learning_rate": 5.153019718730123e-05, + "loss": 2.7189, + "step": 34874 + }, + { + "epoch": 1.6236934609027633, + "grad_norm": 0.32162770686729, + "learning_rate": 5.152748971730491e-05, + "loss": 2.6553, + "step": 34875 + }, + { + "epoch": 1.6237400190888565, + "grad_norm": 0.31191651805800097, + "learning_rate": 5.152478224282555e-05, + "loss": 2.7051, + "step": 34876 + }, + { + "epoch": 1.6237865772749493, + "grad_norm": 0.32899973936827254, + "learning_rate": 5.152207476387112e-05, + "loss": 2.6812, + "step": 34877 + }, + { + "epoch": 1.6238331354610425, + "grad_norm": 0.3411421861679203, + "learning_rate": 5.151936728044955e-05, + "loss": 2.8492, + "step": 34878 + }, + { + "epoch": 1.6238796936471354, + "grad_norm": 0.3516321798337284, + "learning_rate": 5.151665979256878e-05, + "loss": 2.7474, + "step": 34879 + }, + { + "epoch": 1.6239262518332285, + "grad_norm": 0.35221122206897365, + "learning_rate": 5.1513952300236754e-05, + "loss": 2.7403, + "step": 34880 + }, + { + "epoch": 1.6239728100193216, + "grad_norm": 0.3429558529875659, + "learning_rate": 5.151124480346143e-05, + "loss": 2.6925, + "step": 34881 + }, + { + "epoch": 1.6240193682054147, + "grad_norm": 0.3510012674015573, + "learning_rate": 5.1508537302250756e-05, + "loss": 2.7667, + "step": 34882 + }, + { + "epoch": 1.6240659263915078, + "grad_norm": 0.3596054030469909, + "learning_rate": 5.150582979661269e-05, + "loss": 2.6614, + "step": 34883 + }, + { + "epoch": 1.624112484577601, + "grad_norm": 0.33936628517180406, + "learning_rate": 5.150312228655515e-05, + "loss": 2.608, + "step": 34884 + }, + { + "epoch": 1.624159042763694, + "grad_norm": 0.3391808381366419, + "learning_rate": 5.15004147720861e-05, + "loss": 2.6021, + "step": 34885 + }, + { + "epoch": 1.624205600949787, + "grad_norm": 0.3566587863283097, + "learning_rate": 5.1497707253213465e-05, + "loss": 2.7195, + "step": 34886 + }, + { + "epoch": 1.62425215913588, + "grad_norm": 0.33249944710403134, + "learning_rate": 5.149499972994522e-05, + "loss": 2.793, + "step": 34887 + }, + { + "epoch": 1.6242987173219732, + "grad_norm": 0.3489321275011971, + "learning_rate": 5.1492292202289296e-05, + "loss": 2.671, + "step": 34888 + }, + { + "epoch": 1.624345275508066, + "grad_norm": 0.3577621798479268, + "learning_rate": 5.148958467025363e-05, + "loss": 2.7956, + "step": 34889 + }, + { + "epoch": 1.6243918336941592, + "grad_norm": 0.31789551794190307, + "learning_rate": 5.148687713384619e-05, + 
"loss": 2.7715, + "step": 34890 + }, + { + "epoch": 1.6244383918802523, + "grad_norm": 0.36883287122200015, + "learning_rate": 5.1484169593074905e-05, + "loss": 2.8351, + "step": 34891 + }, + { + "epoch": 1.6244849500663454, + "grad_norm": 0.36234041668426736, + "learning_rate": 5.148146204794774e-05, + "loss": 2.6286, + "step": 34892 + }, + { + "epoch": 1.6245315082524385, + "grad_norm": 0.3262566419887119, + "learning_rate": 5.147875449847263e-05, + "loss": 2.7123, + "step": 34893 + }, + { + "epoch": 1.6245780664385316, + "grad_norm": 0.34294089258417215, + "learning_rate": 5.14760469446575e-05, + "loss": 2.6493, + "step": 34894 + }, + { + "epoch": 1.6246246246246248, + "grad_norm": 0.3483539161092387, + "learning_rate": 5.147333938651033e-05, + "loss": 2.7187, + "step": 34895 + }, + { + "epoch": 1.6246711828107177, + "grad_norm": 0.35014743086875333, + "learning_rate": 5.147063182403906e-05, + "loss": 2.73, + "step": 34896 + }, + { + "epoch": 1.6247177409968108, + "grad_norm": 0.3316284814025636, + "learning_rate": 5.146792425725163e-05, + "loss": 2.6253, + "step": 34897 + }, + { + "epoch": 1.6247642991829039, + "grad_norm": 0.33643179212676816, + "learning_rate": 5.1465216686155993e-05, + "loss": 2.6038, + "step": 34898 + }, + { + "epoch": 1.6248108573689968, + "grad_norm": 0.3466109853873129, + "learning_rate": 5.146250911076007e-05, + "loss": 2.7114, + "step": 34899 + }, + { + "epoch": 1.62485741555509, + "grad_norm": 0.3268830536558776, + "learning_rate": 5.145980153107184e-05, + "loss": 2.6435, + "step": 34900 + }, + { + "epoch": 1.624903973741183, + "grad_norm": 0.36646916299915633, + "learning_rate": 5.145709394709923e-05, + "loss": 2.6653, + "step": 34901 + }, + { + "epoch": 1.6249505319272761, + "grad_norm": 0.33846466397941855, + "learning_rate": 5.1454386358850206e-05, + "loss": 2.7124, + "step": 34902 + }, + { + "epoch": 1.6249970901133692, + "grad_norm": 0.3447597277750355, + "learning_rate": 5.1451678766332677e-05, + "loss": 2.8059, + "step": 34903 + }, + { + "epoch": 1.6250436482994624, + "grad_norm": 0.32934474614803777, + "learning_rate": 5.1448971169554626e-05, + "loss": 2.7126, + "step": 34904 + }, + { + "epoch": 1.6250902064855555, + "grad_norm": 0.3411325689232228, + "learning_rate": 5.1446263568524e-05, + "loss": 2.6602, + "step": 34905 + }, + { + "epoch": 1.6251367646716484, + "grad_norm": 0.34786639743523157, + "learning_rate": 5.144355596324873e-05, + "loss": 2.7084, + "step": 34906 + }, + { + "epoch": 1.6251833228577415, + "grad_norm": 0.33000149984571203, + "learning_rate": 5.144084835373675e-05, + "loss": 2.6312, + "step": 34907 + }, + { + "epoch": 1.6252298810438344, + "grad_norm": 0.3496853657070848, + "learning_rate": 5.1438140739996024e-05, + "loss": 2.6433, + "step": 34908 + }, + { + "epoch": 1.6252764392299275, + "grad_norm": 0.3333065470742375, + "learning_rate": 5.143543312203449e-05, + "loss": 2.6797, + "step": 34909 + }, + { + "epoch": 1.6253229974160206, + "grad_norm": 0.35906997586398726, + "learning_rate": 5.1432725499860124e-05, + "loss": 2.7089, + "step": 34910 + }, + { + "epoch": 1.6253695556021137, + "grad_norm": 0.3354851067656601, + "learning_rate": 5.1430017873480853e-05, + "loss": 2.7178, + "step": 34911 + }, + { + "epoch": 1.6254161137882068, + "grad_norm": 0.3514635495216347, + "learning_rate": 5.14273102429046e-05, + "loss": 2.7387, + "step": 34912 + }, + { + "epoch": 1.6254626719743, + "grad_norm": 0.33667550035968247, + "learning_rate": 5.142460260813934e-05, + "loss": 2.5657, + "step": 34913 + }, + { + "epoch": 1.625509230160393, + 
"grad_norm": 0.34790422020012934, + "learning_rate": 5.142189496919302e-05, + "loss": 2.7382, + "step": 34914 + }, + { + "epoch": 1.6255557883464862, + "grad_norm": 0.383671474849683, + "learning_rate": 5.141918732607356e-05, + "loss": 2.7147, + "step": 34915 + }, + { + "epoch": 1.625602346532579, + "grad_norm": 0.3943005931727044, + "learning_rate": 5.141647967878893e-05, + "loss": 2.7384, + "step": 34916 + }, + { + "epoch": 1.6256489047186722, + "grad_norm": 0.3479103957296589, + "learning_rate": 5.141377202734707e-05, + "loss": 2.6738, + "step": 34917 + }, + { + "epoch": 1.625695462904765, + "grad_norm": 0.342338801644875, + "learning_rate": 5.141106437175594e-05, + "loss": 2.7206, + "step": 34918 + }, + { + "epoch": 1.6257420210908582, + "grad_norm": 0.3461083807807584, + "learning_rate": 5.140835671202348e-05, + "loss": 2.7312, + "step": 34919 + }, + { + "epoch": 1.6257885792769513, + "grad_norm": 0.31999609460693373, + "learning_rate": 5.140564904815761e-05, + "loss": 2.6669, + "step": 34920 + }, + { + "epoch": 1.6258351374630444, + "grad_norm": 0.33641984337640446, + "learning_rate": 5.1402941380166304e-05, + "loss": 2.6107, + "step": 34921 + }, + { + "epoch": 1.6258816956491375, + "grad_norm": 0.3296133581709722, + "learning_rate": 5.140023370805751e-05, + "loss": 2.7334, + "step": 34922 + }, + { + "epoch": 1.6259282538352307, + "grad_norm": 0.34244050605982557, + "learning_rate": 5.139752603183917e-05, + "loss": 2.735, + "step": 34923 + }, + { + "epoch": 1.6259748120213238, + "grad_norm": 0.36464880204849365, + "learning_rate": 5.1394818351519236e-05, + "loss": 2.6789, + "step": 34924 + }, + { + "epoch": 1.6260213702074169, + "grad_norm": 0.3173293244196586, + "learning_rate": 5.139211066710563e-05, + "loss": 2.673, + "step": 34925 + }, + { + "epoch": 1.6260679283935098, + "grad_norm": 0.3655162714140506, + "learning_rate": 5.138940297860633e-05, + "loss": 2.7026, + "step": 34926 + }, + { + "epoch": 1.626114486579603, + "grad_norm": 0.34025111148704235, + "learning_rate": 5.138669528602925e-05, + "loss": 2.7574, + "step": 34927 + }, + { + "epoch": 1.6261610447656958, + "grad_norm": 0.33386379937892613, + "learning_rate": 5.1383987589382376e-05, + "loss": 2.6628, + "step": 34928 + }, + { + "epoch": 1.626207602951789, + "grad_norm": 0.3250671501297504, + "learning_rate": 5.138127988867363e-05, + "loss": 2.7273, + "step": 34929 + }, + { + "epoch": 1.626254161137882, + "grad_norm": 0.35265725401500825, + "learning_rate": 5.137857218391096e-05, + "loss": 2.7279, + "step": 34930 + }, + { + "epoch": 1.6263007193239751, + "grad_norm": 0.3525215184196576, + "learning_rate": 5.137586447510231e-05, + "loss": 2.5616, + "step": 34931 + }, + { + "epoch": 1.6263472775100682, + "grad_norm": 0.33349160619697954, + "learning_rate": 5.1373156762255647e-05, + "loss": 2.8006, + "step": 34932 + }, + { + "epoch": 1.6263938356961614, + "grad_norm": 0.3514586982030938, + "learning_rate": 5.137044904537889e-05, + "loss": 2.7264, + "step": 34933 + }, + { + "epoch": 1.6264403938822545, + "grad_norm": 0.3353318400811861, + "learning_rate": 5.136774132448001e-05, + "loss": 2.6585, + "step": 34934 + }, + { + "epoch": 1.6264869520683474, + "grad_norm": 0.3404509247720738, + "learning_rate": 5.1365033599566936e-05, + "loss": 2.7089, + "step": 34935 + }, + { + "epoch": 1.6265335102544405, + "grad_norm": 0.34391158403738253, + "learning_rate": 5.136232587064763e-05, + "loss": 2.7548, + "step": 34936 + }, + { + "epoch": 1.6265800684405336, + "grad_norm": 0.3819109369335068, + "learning_rate": 5.1359618137730035e-05, 
+ "loss": 2.6355, + "step": 34937 + }, + { + "epoch": 1.6266266266266265, + "grad_norm": 0.3783116970593219, + "learning_rate": 5.13569104008221e-05, + "loss": 2.7139, + "step": 34938 + }, + { + "epoch": 1.6266731848127196, + "grad_norm": 0.38097294390908887, + "learning_rate": 5.135420265993175e-05, + "loss": 2.7677, + "step": 34939 + }, + { + "epoch": 1.6267197429988127, + "grad_norm": 0.3923876586523921, + "learning_rate": 5.1351494915066954e-05, + "loss": 2.7259, + "step": 34940 + }, + { + "epoch": 1.6267663011849058, + "grad_norm": 0.33281480141586306, + "learning_rate": 5.1348787166235654e-05, + "loss": 2.7303, + "step": 34941 + }, + { + "epoch": 1.626812859370999, + "grad_norm": 0.37140852997783347, + "learning_rate": 5.13460794134458e-05, + "loss": 2.6336, + "step": 34942 + }, + { + "epoch": 1.626859417557092, + "grad_norm": 0.3614292985729983, + "learning_rate": 5.134337165670533e-05, + "loss": 2.6659, + "step": 34943 + }, + { + "epoch": 1.6269059757431852, + "grad_norm": 0.3628846283877602, + "learning_rate": 5.13406638960222e-05, + "loss": 2.6991, + "step": 34944 + }, + { + "epoch": 1.626952533929278, + "grad_norm": 0.3551573757090096, + "learning_rate": 5.133795613140434e-05, + "loss": 2.7355, + "step": 34945 + }, + { + "epoch": 1.6269990921153712, + "grad_norm": 0.373605289562476, + "learning_rate": 5.1335248362859733e-05, + "loss": 2.7516, + "step": 34946 + }, + { + "epoch": 1.6270456503014643, + "grad_norm": 0.35750789526413945, + "learning_rate": 5.1332540590396296e-05, + "loss": 2.7309, + "step": 34947 + }, + { + "epoch": 1.6270922084875572, + "grad_norm": 0.3667409223485034, + "learning_rate": 5.1329832814021985e-05, + "loss": 2.7353, + "step": 34948 + }, + { + "epoch": 1.6271387666736503, + "grad_norm": 0.3538978557246192, + "learning_rate": 5.1327125033744736e-05, + "loss": 2.742, + "step": 34949 + }, + { + "epoch": 1.6271853248597434, + "grad_norm": 0.35287658241401954, + "learning_rate": 5.132441724957252e-05, + "loss": 2.6766, + "step": 34950 + }, + { + "epoch": 1.6272318830458365, + "grad_norm": 0.382160586018904, + "learning_rate": 5.1321709461513265e-05, + "loss": 2.7361, + "step": 34951 + }, + { + "epoch": 1.6272784412319297, + "grad_norm": 0.3042991627657486, + "learning_rate": 5.131900166957492e-05, + "loss": 2.6983, + "step": 34952 + }, + { + "epoch": 1.6273249994180228, + "grad_norm": 0.3423523233671559, + "learning_rate": 5.131629387376543e-05, + "loss": 2.7481, + "step": 34953 + }, + { + "epoch": 1.627371557604116, + "grad_norm": 0.3277377944424055, + "learning_rate": 5.131358607409275e-05, + "loss": 2.6483, + "step": 34954 + }, + { + "epoch": 1.6274181157902088, + "grad_norm": 0.31564531518878114, + "learning_rate": 5.131087827056483e-05, + "loss": 2.7418, + "step": 34955 + }, + { + "epoch": 1.627464673976302, + "grad_norm": 0.3540644954142715, + "learning_rate": 5.130817046318961e-05, + "loss": 2.8295, + "step": 34956 + }, + { + "epoch": 1.6275112321623948, + "grad_norm": 0.32634751376460897, + "learning_rate": 5.130546265197504e-05, + "loss": 2.6411, + "step": 34957 + }, + { + "epoch": 1.627557790348488, + "grad_norm": 0.3217579635914085, + "learning_rate": 5.130275483692906e-05, + "loss": 2.7691, + "step": 34958 + }, + { + "epoch": 1.627604348534581, + "grad_norm": 0.34361215454845073, + "learning_rate": 5.1300047018059625e-05, + "loss": 2.7586, + "step": 34959 + }, + { + "epoch": 1.6276509067206741, + "grad_norm": 0.3650257929654347, + "learning_rate": 5.129733919537467e-05, + "loss": 2.7556, + "step": 34960 + }, + { + "epoch": 1.6276974649067673, + 
"grad_norm": 0.34440300948031394, + "learning_rate": 5.1294631368882164e-05, + "loss": 2.7063, + "step": 34961 + }, + { + "epoch": 1.6277440230928604, + "grad_norm": 0.31820762301594535, + "learning_rate": 5.1291923538590045e-05, + "loss": 2.7187, + "step": 34962 + }, + { + "epoch": 1.6277905812789535, + "grad_norm": 0.3479116228524477, + "learning_rate": 5.1289215704506255e-05, + "loss": 2.7148, + "step": 34963 + }, + { + "epoch": 1.6278371394650466, + "grad_norm": 0.3161004216700271, + "learning_rate": 5.1286507866638755e-05, + "loss": 2.7892, + "step": 34964 + }, + { + "epoch": 1.6278836976511395, + "grad_norm": 0.3121667881575688, + "learning_rate": 5.1283800024995466e-05, + "loss": 2.6581, + "step": 34965 + }, + { + "epoch": 1.6279302558372326, + "grad_norm": 0.3318821577872822, + "learning_rate": 5.1281092179584345e-05, + "loss": 2.6859, + "step": 34966 + }, + { + "epoch": 1.6279768140233255, + "grad_norm": 0.3424476941121352, + "learning_rate": 5.127838433041335e-05, + "loss": 2.7136, + "step": 34967 + }, + { + "epoch": 1.6280233722094186, + "grad_norm": 0.3212164478947076, + "learning_rate": 5.127567647749043e-05, + "loss": 2.7123, + "step": 34968 + }, + { + "epoch": 1.6280699303955117, + "grad_norm": 0.32618803302499755, + "learning_rate": 5.127296862082352e-05, + "loss": 2.6602, + "step": 34969 + }, + { + "epoch": 1.6281164885816048, + "grad_norm": 0.32046925423482736, + "learning_rate": 5.1270260760420576e-05, + "loss": 2.709, + "step": 34970 + }, + { + "epoch": 1.628163046767698, + "grad_norm": 0.3471437094595307, + "learning_rate": 5.1267552896289525e-05, + "loss": 2.7664, + "step": 34971 + }, + { + "epoch": 1.628209604953791, + "grad_norm": 0.31626179905761376, + "learning_rate": 5.1264845028438346e-05, + "loss": 2.6733, + "step": 34972 + }, + { + "epoch": 1.6282561631398842, + "grad_norm": 0.3492499350245946, + "learning_rate": 5.126213715687498e-05, + "loss": 2.6647, + "step": 34973 + }, + { + "epoch": 1.628302721325977, + "grad_norm": 0.3316916264042233, + "learning_rate": 5.125942928160735e-05, + "loss": 2.6559, + "step": 34974 + }, + { + "epoch": 1.6283492795120702, + "grad_norm": 0.31988828402819525, + "learning_rate": 5.125672140264344e-05, + "loss": 2.5984, + "step": 34975 + }, + { + "epoch": 1.6283958376981633, + "grad_norm": 0.3415829629872098, + "learning_rate": 5.1254013519991165e-05, + "loss": 2.7147, + "step": 34976 + }, + { + "epoch": 1.6284423958842562, + "grad_norm": 0.31625032686450727, + "learning_rate": 5.125130563365847e-05, + "loss": 2.6765, + "step": 34977 + }, + { + "epoch": 1.6284889540703493, + "grad_norm": 0.33738464995928313, + "learning_rate": 5.1248597743653336e-05, + "loss": 2.791, + "step": 34978 + }, + { + "epoch": 1.6285355122564424, + "grad_norm": 0.3271162134349839, + "learning_rate": 5.124588984998368e-05, + "loss": 2.7508, + "step": 34979 + }, + { + "epoch": 1.6285820704425356, + "grad_norm": 0.32926220896455827, + "learning_rate": 5.124318195265746e-05, + "loss": 2.8311, + "step": 34980 + }, + { + "epoch": 1.6286286286286287, + "grad_norm": 0.32435097898831666, + "learning_rate": 5.124047405168262e-05, + "loss": 2.7584, + "step": 34981 + }, + { + "epoch": 1.6286751868147218, + "grad_norm": 0.34423371004352254, + "learning_rate": 5.1237766147067125e-05, + "loss": 2.7419, + "step": 34982 + }, + { + "epoch": 1.628721745000815, + "grad_norm": 0.3216672878607982, + "learning_rate": 5.12350582388189e-05, + "loss": 2.7508, + "step": 34983 + }, + { + "epoch": 1.6287683031869078, + "grad_norm": 0.3263242482883397, + "learning_rate": 
5.1232350326945886e-05, + "loss": 2.6343, + "step": 34984 + }, + { + "epoch": 1.628814861373001, + "grad_norm": 0.3322582831229886, + "learning_rate": 5.122964241145607e-05, + "loss": 2.7012, + "step": 34985 + }, + { + "epoch": 1.628861419559094, + "grad_norm": 0.3242022807892549, + "learning_rate": 5.122693449235737e-05, + "loss": 2.7519, + "step": 34986 + }, + { + "epoch": 1.628907977745187, + "grad_norm": 0.32638039833454213, + "learning_rate": 5.122422656965773e-05, + "loss": 2.6893, + "step": 34987 + }, + { + "epoch": 1.62895453593128, + "grad_norm": 0.3243693397996273, + "learning_rate": 5.122151864336511e-05, + "loss": 2.7886, + "step": 34988 + }, + { + "epoch": 1.6290010941173731, + "grad_norm": 0.35263355246285844, + "learning_rate": 5.1218810713487455e-05, + "loss": 2.79, + "step": 34989 + }, + { + "epoch": 1.6290476523034663, + "grad_norm": 0.33908076767706846, + "learning_rate": 5.121610278003269e-05, + "loss": 2.6852, + "step": 34990 + }, + { + "epoch": 1.6290942104895594, + "grad_norm": 0.38151850209072063, + "learning_rate": 5.121339484300881e-05, + "loss": 2.7497, + "step": 34991 + }, + { + "epoch": 1.6291407686756525, + "grad_norm": 0.35195970929063003, + "learning_rate": 5.1210686902423734e-05, + "loss": 2.7258, + "step": 34992 + }, + { + "epoch": 1.6291873268617456, + "grad_norm": 0.3657863733822307, + "learning_rate": 5.1207978958285393e-05, + "loss": 2.7112, + "step": 34993 + }, + { + "epoch": 1.6292338850478385, + "grad_norm": 0.33691613888409533, + "learning_rate": 5.1205271010601766e-05, + "loss": 2.7311, + "step": 34994 + }, + { + "epoch": 1.6292804432339316, + "grad_norm": 0.34882114871548553, + "learning_rate": 5.120256305938077e-05, + "loss": 2.7472, + "step": 34995 + }, + { + "epoch": 1.6293270014200245, + "grad_norm": 0.3278171028119042, + "learning_rate": 5.11998551046304e-05, + "loss": 2.6669, + "step": 34996 + }, + { + "epoch": 1.6293735596061176, + "grad_norm": 0.33955550464107986, + "learning_rate": 5.1197147146358545e-05, + "loss": 2.6747, + "step": 34997 + }, + { + "epoch": 1.6294201177922107, + "grad_norm": 0.3301950880448231, + "learning_rate": 5.1194439184573194e-05, + "loss": 2.7289, + "step": 34998 + }, + { + "epoch": 1.6294666759783039, + "grad_norm": 0.33502501661818324, + "learning_rate": 5.119173121928229e-05, + "loss": 2.7909, + "step": 34999 + }, + { + "epoch": 1.629513234164397, + "grad_norm": 0.35208180947364315, + "learning_rate": 5.118902325049376e-05, + "loss": 2.7021, + "step": 35000 + }, + { + "epoch": 1.62955979235049, + "grad_norm": 0.3422450066922149, + "learning_rate": 5.118631527821558e-05, + "loss": 2.7104, + "step": 35001 + }, + { + "epoch": 1.6296063505365832, + "grad_norm": 0.37348195437685394, + "learning_rate": 5.118360730245566e-05, + "loss": 2.7279, + "step": 35002 + }, + { + "epoch": 1.6296529087226763, + "grad_norm": 0.3521344152932278, + "learning_rate": 5.118089932322197e-05, + "loss": 2.7924, + "step": 35003 + }, + { + "epoch": 1.6296994669087692, + "grad_norm": 0.3585670207831926, + "learning_rate": 5.1178191340522464e-05, + "loss": 2.6692, + "step": 35004 + }, + { + "epoch": 1.6297460250948623, + "grad_norm": 0.3996067939748397, + "learning_rate": 5.1175483354365095e-05, + "loss": 2.7197, + "step": 35005 + }, + { + "epoch": 1.6297925832809552, + "grad_norm": 0.37143416068197166, + "learning_rate": 5.117277536475779e-05, + "loss": 2.8171, + "step": 35006 + }, + { + "epoch": 1.6298391414670483, + "grad_norm": 0.4288419754422424, + "learning_rate": 5.1170067371708486e-05, + "loss": 2.7099, + "step": 35007 + }, + { + 
"epoch": 1.6298856996531415, + "grad_norm": 0.35623930244047897, + "learning_rate": 5.116735937522517e-05, + "loss": 2.7053, + "step": 35008 + }, + { + "epoch": 1.6299322578392346, + "grad_norm": 0.40200815938575857, + "learning_rate": 5.1164651375315766e-05, + "loss": 2.7463, + "step": 35009 + }, + { + "epoch": 1.6299788160253277, + "grad_norm": 0.3634392321809939, + "learning_rate": 5.116194337198822e-05, + "loss": 2.6736, + "step": 35010 + }, + { + "epoch": 1.6300253742114208, + "grad_norm": 0.3928962909583927, + "learning_rate": 5.115923536525049e-05, + "loss": 2.7663, + "step": 35011 + }, + { + "epoch": 1.630071932397514, + "grad_norm": 0.3913093497954036, + "learning_rate": 5.115652735511052e-05, + "loss": 2.6719, + "step": 35012 + }, + { + "epoch": 1.630118490583607, + "grad_norm": 0.386673886029001, + "learning_rate": 5.115381934157625e-05, + "loss": 2.7944, + "step": 35013 + }, + { + "epoch": 1.6301650487697, + "grad_norm": 0.3742981565843701, + "learning_rate": 5.115111132465564e-05, + "loss": 2.714, + "step": 35014 + }, + { + "epoch": 1.630211606955793, + "grad_norm": 0.37700092666363855, + "learning_rate": 5.114840330435663e-05, + "loss": 2.7858, + "step": 35015 + }, + { + "epoch": 1.630258165141886, + "grad_norm": 0.34861624015571807, + "learning_rate": 5.1145695280687156e-05, + "loss": 2.668, + "step": 35016 + }, + { + "epoch": 1.630304723327979, + "grad_norm": 0.36323458626962474, + "learning_rate": 5.1142987253655196e-05, + "loss": 2.6845, + "step": 35017 + }, + { + "epoch": 1.6303512815140722, + "grad_norm": 0.37324716454222284, + "learning_rate": 5.114027922326868e-05, + "loss": 2.6978, + "step": 35018 + }, + { + "epoch": 1.6303978397001653, + "grad_norm": 0.3817429382491845, + "learning_rate": 5.113757118953555e-05, + "loss": 2.7055, + "step": 35019 + }, + { + "epoch": 1.6304443978862584, + "grad_norm": 0.36440300084720234, + "learning_rate": 5.113486315246375e-05, + "loss": 2.7358, + "step": 35020 + }, + { + "epoch": 1.6304909560723515, + "grad_norm": 0.3377108087815688, + "learning_rate": 5.1132155112061254e-05, + "loss": 2.6221, + "step": 35021 + }, + { + "epoch": 1.6305375142584446, + "grad_norm": 0.38296888588044714, + "learning_rate": 5.112944706833599e-05, + "loss": 2.7032, + "step": 35022 + }, + { + "epoch": 1.6305840724445375, + "grad_norm": 0.37910326470484046, + "learning_rate": 5.112673902129592e-05, + "loss": 2.7859, + "step": 35023 + }, + { + "epoch": 1.6306306306306306, + "grad_norm": 0.36611450153136194, + "learning_rate": 5.1124030970948964e-05, + "loss": 2.6294, + "step": 35024 + }, + { + "epoch": 1.6306771888167237, + "grad_norm": 0.3691052816027855, + "learning_rate": 5.1121322917303114e-05, + "loss": 2.7471, + "step": 35025 + }, + { + "epoch": 1.6307237470028166, + "grad_norm": 0.36041034175990255, + "learning_rate": 5.111861486036626e-05, + "loss": 2.6912, + "step": 35026 + }, + { + "epoch": 1.6307703051889098, + "grad_norm": 0.3679259808754542, + "learning_rate": 5.111590680014642e-05, + "loss": 2.7165, + "step": 35027 + }, + { + "epoch": 1.6308168633750029, + "grad_norm": 0.364869019329272, + "learning_rate": 5.111319873665147e-05, + "loss": 2.7188, + "step": 35028 + }, + { + "epoch": 1.630863421561096, + "grad_norm": 0.33341769005405525, + "learning_rate": 5.11104906698894e-05, + "loss": 2.8292, + "step": 35029 + }, + { + "epoch": 1.630909979747189, + "grad_norm": 0.3340364449534357, + "learning_rate": 5.110778259986815e-05, + "loss": 2.6596, + "step": 35030 + }, + { + "epoch": 1.6309565379332822, + "grad_norm": 0.3308206436024897, + 
"learning_rate": 5.110507452659566e-05, + "loss": 2.7156, + "step": 35031 + }, + { + "epoch": 1.6310030961193753, + "grad_norm": 0.3325278446731832, + "learning_rate": 5.110236645007991e-05, + "loss": 2.8307, + "step": 35032 + }, + { + "epoch": 1.6310496543054682, + "grad_norm": 0.3268196876964141, + "learning_rate": 5.1099658370328795e-05, + "loss": 2.7937, + "step": 35033 + }, + { + "epoch": 1.6310962124915613, + "grad_norm": 0.3187508948050998, + "learning_rate": 5.1096950287350295e-05, + "loss": 2.6138, + "step": 35034 + }, + { + "epoch": 1.6311427706776545, + "grad_norm": 0.3252359091207033, + "learning_rate": 5.1094242201152366e-05, + "loss": 2.7468, + "step": 35035 + }, + { + "epoch": 1.6311893288637473, + "grad_norm": 0.3532226495582838, + "learning_rate": 5.109153411174295e-05, + "loss": 2.7255, + "step": 35036 + }, + { + "epoch": 1.6312358870498405, + "grad_norm": 0.35243226291915114, + "learning_rate": 5.1088826019129964e-05, + "loss": 2.6671, + "step": 35037 + }, + { + "epoch": 1.6312824452359336, + "grad_norm": 0.32865822557692354, + "learning_rate": 5.108611792332141e-05, + "loss": 2.7646, + "step": 35038 + }, + { + "epoch": 1.6313290034220267, + "grad_norm": 0.3509032574211358, + "learning_rate": 5.108340982432519e-05, + "loss": 2.6631, + "step": 35039 + }, + { + "epoch": 1.6313755616081198, + "grad_norm": 0.3146420179249406, + "learning_rate": 5.108070172214926e-05, + "loss": 2.7595, + "step": 35040 + }, + { + "epoch": 1.631422119794213, + "grad_norm": 0.381238407434141, + "learning_rate": 5.10779936168016e-05, + "loss": 2.7522, + "step": 35041 + }, + { + "epoch": 1.631468677980306, + "grad_norm": 0.31040104957437986, + "learning_rate": 5.1075285508290114e-05, + "loss": 2.6091, + "step": 35042 + }, + { + "epoch": 1.631515236166399, + "grad_norm": 0.370160035262191, + "learning_rate": 5.107257739662278e-05, + "loss": 2.7797, + "step": 35043 + }, + { + "epoch": 1.631561794352492, + "grad_norm": 0.32010179248929427, + "learning_rate": 5.1069869281807535e-05, + "loss": 2.7657, + "step": 35044 + }, + { + "epoch": 1.631608352538585, + "grad_norm": 0.3356828012590023, + "learning_rate": 5.106716116385234e-05, + "loss": 2.6935, + "step": 35045 + }, + { + "epoch": 1.631654910724678, + "grad_norm": 0.30614955285337075, + "learning_rate": 5.106445304276511e-05, + "loss": 2.7534, + "step": 35046 + }, + { + "epoch": 1.6317014689107712, + "grad_norm": 0.31147297918776, + "learning_rate": 5.1061744918553834e-05, + "loss": 2.7632, + "step": 35047 + }, + { + "epoch": 1.6317480270968643, + "grad_norm": 0.31229065448548843, + "learning_rate": 5.105903679122643e-05, + "loss": 2.6989, + "step": 35048 + }, + { + "epoch": 1.6317945852829574, + "grad_norm": 0.3149915662084044, + "learning_rate": 5.1056328660790864e-05, + "loss": 2.643, + "step": 35049 + }, + { + "epoch": 1.6318411434690505, + "grad_norm": 0.30304652787791947, + "learning_rate": 5.105362052725509e-05, + "loss": 2.7424, + "step": 35050 + }, + { + "epoch": 1.6318877016551436, + "grad_norm": 0.366353678702387, + "learning_rate": 5.105091239062703e-05, + "loss": 2.8354, + "step": 35051 + }, + { + "epoch": 1.6319342598412367, + "grad_norm": 0.3168464446746363, + "learning_rate": 5.104820425091465e-05, + "loss": 2.68, + "step": 35052 + }, + { + "epoch": 1.6319808180273296, + "grad_norm": 0.372034831859903, + "learning_rate": 5.104549610812588e-05, + "loss": 2.7816, + "step": 35053 + }, + { + "epoch": 1.6320273762134228, + "grad_norm": 0.3180359107471083, + "learning_rate": 5.104278796226869e-05, + "loss": 2.6372, + "step": 35054 + }, + { 
+ "epoch": 1.6320739343995156, + "grad_norm": 0.2923767810183055, + "learning_rate": 5.104007981335103e-05, + "loss": 2.8047, + "step": 35055 + }, + { + "epoch": 1.6321204925856088, + "grad_norm": 0.35138956508906904, + "learning_rate": 5.1037371661380816e-05, + "loss": 2.7393, + "step": 35056 + }, + { + "epoch": 1.6321670507717019, + "grad_norm": 0.3447842344822237, + "learning_rate": 5.103466350636603e-05, + "loss": 2.6985, + "step": 35057 + }, + { + "epoch": 1.632213608957795, + "grad_norm": 0.32214817167311915, + "learning_rate": 5.103195534831461e-05, + "loss": 2.6148, + "step": 35058 + }, + { + "epoch": 1.632260167143888, + "grad_norm": 0.34455899722610195, + "learning_rate": 5.102924718723451e-05, + "loss": 2.754, + "step": 35059 + }, + { + "epoch": 1.6323067253299812, + "grad_norm": 0.3359523773335598, + "learning_rate": 5.102653902313366e-05, + "loss": 2.6498, + "step": 35060 + }, + { + "epoch": 1.6323532835160743, + "grad_norm": 0.3418928997458868, + "learning_rate": 5.1023830856020014e-05, + "loss": 2.7343, + "step": 35061 + }, + { + "epoch": 1.6323998417021672, + "grad_norm": 0.34333340871003853, + "learning_rate": 5.1021122685901536e-05, + "loss": 2.7803, + "step": 35062 + }, + { + "epoch": 1.6324463998882603, + "grad_norm": 0.3530500182453162, + "learning_rate": 5.1018414512786175e-05, + "loss": 2.6286, + "step": 35063 + }, + { + "epoch": 1.6324929580743535, + "grad_norm": 0.3261991131868478, + "learning_rate": 5.101570633668186e-05, + "loss": 2.7752, + "step": 35064 + }, + { + "epoch": 1.6325395162604464, + "grad_norm": 0.3223045067963907, + "learning_rate": 5.101299815759653e-05, + "loss": 2.6338, + "step": 35065 + }, + { + "epoch": 1.6325860744465395, + "grad_norm": 0.37937118920215435, + "learning_rate": 5.1010289975538164e-05, + "loss": 2.7325, + "step": 35066 + }, + { + "epoch": 1.6326326326326326, + "grad_norm": 0.3335533022431772, + "learning_rate": 5.100758179051469e-05, + "loss": 2.7939, + "step": 35067 + }, + { + "epoch": 1.6326791908187257, + "grad_norm": 0.3497033285795785, + "learning_rate": 5.100487360253408e-05, + "loss": 2.6358, + "step": 35068 + }, + { + "epoch": 1.6327257490048188, + "grad_norm": 0.36319824739247863, + "learning_rate": 5.1002165411604245e-05, + "loss": 2.7759, + "step": 35069 + }, + { + "epoch": 1.632772307190912, + "grad_norm": 0.35153868874071587, + "learning_rate": 5.099945721773316e-05, + "loss": 2.7812, + "step": 35070 + }, + { + "epoch": 1.632818865377005, + "grad_norm": 0.34267596269206374, + "learning_rate": 5.099674902092876e-05, + "loss": 2.6588, + "step": 35071 + }, + { + "epoch": 1.632865423563098, + "grad_norm": 0.35886164002818655, + "learning_rate": 5.099404082119902e-05, + "loss": 2.6981, + "step": 35072 + }, + { + "epoch": 1.632911981749191, + "grad_norm": 0.36038483530038434, + "learning_rate": 5.099133261855185e-05, + "loss": 2.7785, + "step": 35073 + }, + { + "epoch": 1.6329585399352842, + "grad_norm": 0.351524722982815, + "learning_rate": 5.0988624412995214e-05, + "loss": 2.7173, + "step": 35074 + }, + { + "epoch": 1.633005098121377, + "grad_norm": 0.3863564456964689, + "learning_rate": 5.0985916204537064e-05, + "loss": 2.7578, + "step": 35075 + }, + { + "epoch": 1.6330516563074702, + "grad_norm": 0.3314950281985513, + "learning_rate": 5.098320799318537e-05, + "loss": 2.6753, + "step": 35076 + }, + { + "epoch": 1.6330982144935633, + "grad_norm": 0.39139467291966235, + "learning_rate": 5.098049977894804e-05, + "loss": 2.775, + "step": 35077 + }, + { + "epoch": 1.6331447726796564, + "grad_norm": 0.34948522432301726, + 
"learning_rate": 5.097779156183303e-05, + "loss": 2.702, + "step": 35078 + }, + { + "epoch": 1.6331913308657495, + "grad_norm": 0.3643123345044998, + "learning_rate": 5.097508334184831e-05, + "loss": 2.6032, + "step": 35079 + }, + { + "epoch": 1.6332378890518426, + "grad_norm": 0.34366092962852074, + "learning_rate": 5.097237511900181e-05, + "loss": 2.7033, + "step": 35080 + }, + { + "epoch": 1.6332844472379358, + "grad_norm": 0.35199291076414085, + "learning_rate": 5.0969666893301507e-05, + "loss": 2.7256, + "step": 35081 + }, + { + "epoch": 1.6333310054240286, + "grad_norm": 0.3435364544177628, + "learning_rate": 5.0966958664755306e-05, + "loss": 2.7204, + "step": 35082 + }, + { + "epoch": 1.6333775636101218, + "grad_norm": 0.39386998597894957, + "learning_rate": 5.096425043337117e-05, + "loss": 2.7122, + "step": 35083 + }, + { + "epoch": 1.6334241217962147, + "grad_norm": 0.35520528890121217, + "learning_rate": 5.096154219915706e-05, + "loss": 2.6636, + "step": 35084 + }, + { + "epoch": 1.6334706799823078, + "grad_norm": 0.3475414877269512, + "learning_rate": 5.095883396212092e-05, + "loss": 2.7266, + "step": 35085 + }, + { + "epoch": 1.6335172381684009, + "grad_norm": 0.40201506714420376, + "learning_rate": 5.0956125722270706e-05, + "loss": 2.7562, + "step": 35086 + }, + { + "epoch": 1.633563796354494, + "grad_norm": 0.37227080523989686, + "learning_rate": 5.095341747961435e-05, + "loss": 2.689, + "step": 35087 + }, + { + "epoch": 1.6336103545405871, + "grad_norm": 0.3667587547511473, + "learning_rate": 5.0950709234159796e-05, + "loss": 2.7219, + "step": 35088 + }, + { + "epoch": 1.6336569127266802, + "grad_norm": 0.3482485559695422, + "learning_rate": 5.094800098591503e-05, + "loss": 2.6884, + "step": 35089 + }, + { + "epoch": 1.6337034709127733, + "grad_norm": 0.3640024021962404, + "learning_rate": 5.0945292734887973e-05, + "loss": 2.6675, + "step": 35090 + }, + { + "epoch": 1.6337500290988665, + "grad_norm": 0.3471291123984873, + "learning_rate": 5.094258448108655e-05, + "loss": 2.7161, + "step": 35091 + }, + { + "epoch": 1.6337965872849594, + "grad_norm": 0.36450898546133564, + "learning_rate": 5.0939876224518745e-05, + "loss": 2.8032, + "step": 35092 + }, + { + "epoch": 1.6338431454710525, + "grad_norm": 0.3308567426440817, + "learning_rate": 5.09371679651925e-05, + "loss": 2.6412, + "step": 35093 + }, + { + "epoch": 1.6338897036571454, + "grad_norm": 0.3341384349541725, + "learning_rate": 5.0934459703115754e-05, + "loss": 2.7851, + "step": 35094 + }, + { + "epoch": 1.6339362618432385, + "grad_norm": 0.34000835401352697, + "learning_rate": 5.093175143829647e-05, + "loss": 2.7042, + "step": 35095 + }, + { + "epoch": 1.6339828200293316, + "grad_norm": 0.36443689751505626, + "learning_rate": 5.0929043170742575e-05, + "loss": 2.7127, + "step": 35096 + }, + { + "epoch": 1.6340293782154247, + "grad_norm": 0.34282560091374537, + "learning_rate": 5.092633490046204e-05, + "loss": 2.5988, + "step": 35097 + }, + { + "epoch": 1.6340759364015178, + "grad_norm": 0.3399866187549115, + "learning_rate": 5.0923626627462796e-05, + "loss": 2.636, + "step": 35098 + }, + { + "epoch": 1.634122494587611, + "grad_norm": 0.37463728441388094, + "learning_rate": 5.092091835175281e-05, + "loss": 2.7063, + "step": 35099 + }, + { + "epoch": 1.634169052773704, + "grad_norm": 0.3129216535147238, + "learning_rate": 5.091821007334e-05, + "loss": 2.6949, + "step": 35100 + }, + { + "epoch": 1.6342156109597972, + "grad_norm": 0.36888259235393156, + "learning_rate": 5.091550179223235e-05, + "loss": 2.7081, + "step": 
35101 + }, + { + "epoch": 1.63426216914589, + "grad_norm": 0.3651101971117379, + "learning_rate": 5.0912793508437794e-05, + "loss": 2.694, + "step": 35102 + }, + { + "epoch": 1.6343087273319832, + "grad_norm": 0.37164903313316444, + "learning_rate": 5.0910085221964263e-05, + "loss": 2.6699, + "step": 35103 + }, + { + "epoch": 1.634355285518076, + "grad_norm": 0.3545063170154536, + "learning_rate": 5.090737693281975e-05, + "loss": 2.7995, + "step": 35104 + }, + { + "epoch": 1.6344018437041692, + "grad_norm": 0.33930327148411077, + "learning_rate": 5.0904668641012146e-05, + "loss": 2.7921, + "step": 35105 + }, + { + "epoch": 1.6344484018902623, + "grad_norm": 0.3751739831961053, + "learning_rate": 5.0901960346549434e-05, + "loss": 2.7197, + "step": 35106 + }, + { + "epoch": 1.6344949600763554, + "grad_norm": 0.3403684899433881, + "learning_rate": 5.089925204943956e-05, + "loss": 2.6712, + "step": 35107 + }, + { + "epoch": 1.6345415182624485, + "grad_norm": 0.3419232117161119, + "learning_rate": 5.0896543749690485e-05, + "loss": 2.6965, + "step": 35108 + }, + { + "epoch": 1.6345880764485416, + "grad_norm": 0.35580111564197786, + "learning_rate": 5.089383544731012e-05, + "loss": 2.767, + "step": 35109 + }, + { + "epoch": 1.6346346346346348, + "grad_norm": 0.3146453255272215, + "learning_rate": 5.0891127142306446e-05, + "loss": 2.6206, + "step": 35110 + }, + { + "epoch": 1.6346811928207277, + "grad_norm": 0.3784607947841757, + "learning_rate": 5.08884188346874e-05, + "loss": 2.7468, + "step": 35111 + }, + { + "epoch": 1.6347277510068208, + "grad_norm": 0.33446566842092057, + "learning_rate": 5.088571052446093e-05, + "loss": 2.785, + "step": 35112 + }, + { + "epoch": 1.6347743091929139, + "grad_norm": 0.3565327060758602, + "learning_rate": 5.0883002211635e-05, + "loss": 2.6964, + "step": 35113 + }, + { + "epoch": 1.6348208673790068, + "grad_norm": 0.34097139131268384, + "learning_rate": 5.088029389621754e-05, + "loss": 2.7497, + "step": 35114 + }, + { + "epoch": 1.6348674255651, + "grad_norm": 0.3368632081828321, + "learning_rate": 5.0877585578216506e-05, + "loss": 2.6242, + "step": 35115 + }, + { + "epoch": 1.634913983751193, + "grad_norm": 0.37542318134758323, + "learning_rate": 5.087487725763983e-05, + "loss": 2.6434, + "step": 35116 + }, + { + "epoch": 1.6349605419372861, + "grad_norm": 0.3240291378610548, + "learning_rate": 5.087216893449549e-05, + "loss": 2.6786, + "step": 35117 + }, + { + "epoch": 1.6350071001233792, + "grad_norm": 0.3513237891685407, + "learning_rate": 5.086946060879141e-05, + "loss": 2.6755, + "step": 35118 + }, + { + "epoch": 1.6350536583094724, + "grad_norm": 0.3262935923106284, + "learning_rate": 5.086675228053556e-05, + "loss": 2.7299, + "step": 35119 + }, + { + "epoch": 1.6351002164955655, + "grad_norm": 0.36113723333662695, + "learning_rate": 5.0864043949735865e-05, + "loss": 2.7216, + "step": 35120 + }, + { + "epoch": 1.6351467746816584, + "grad_norm": 0.3170150232448835, + "learning_rate": 5.0861335616400285e-05, + "loss": 2.802, + "step": 35121 + }, + { + "epoch": 1.6351933328677515, + "grad_norm": 0.33605464815063546, + "learning_rate": 5.0858627280536785e-05, + "loss": 2.7484, + "step": 35122 + }, + { + "epoch": 1.6352398910538446, + "grad_norm": 0.357711507077037, + "learning_rate": 5.085591894215328e-05, + "loss": 2.6674, + "step": 35123 + }, + { + "epoch": 1.6352864492399375, + "grad_norm": 0.3315379222138054, + "learning_rate": 5.085321060125775e-05, + "loss": 2.7019, + "step": 35124 + }, + { + "epoch": 1.6353330074260306, + "grad_norm": 
0.3423685532023457, + "learning_rate": 5.085050225785812e-05, + "loss": 2.6789, + "step": 35125 + }, + { + "epoch": 1.6353795656121237, + "grad_norm": 0.37154166426969715, + "learning_rate": 5.084779391196237e-05, + "loss": 2.7673, + "step": 35126 + }, + { + "epoch": 1.6354261237982168, + "grad_norm": 0.33482963980245983, + "learning_rate": 5.084508556357841e-05, + "loss": 2.7375, + "step": 35127 + }, + { + "epoch": 1.63547268198431, + "grad_norm": 0.33981282608621644, + "learning_rate": 5.084237721271422e-05, + "loss": 2.6714, + "step": 35128 + }, + { + "epoch": 1.635519240170403, + "grad_norm": 0.36801270520599516, + "learning_rate": 5.0839668859377734e-05, + "loss": 2.7392, + "step": 35129 + }, + { + "epoch": 1.6355657983564962, + "grad_norm": 0.33069314888022167, + "learning_rate": 5.0836960503576895e-05, + "loss": 2.6397, + "step": 35130 + }, + { + "epoch": 1.635612356542589, + "grad_norm": 0.36378167750523005, + "learning_rate": 5.083425214531966e-05, + "loss": 2.7528, + "step": 35131 + }, + { + "epoch": 1.6356589147286822, + "grad_norm": 0.34131529511949166, + "learning_rate": 5.083154378461398e-05, + "loss": 2.7335, + "step": 35132 + }, + { + "epoch": 1.635705472914775, + "grad_norm": 0.3462381584682461, + "learning_rate": 5.0828835421467805e-05, + "loss": 2.8598, + "step": 35133 + }, + { + "epoch": 1.6357520311008682, + "grad_norm": 0.3470143460764346, + "learning_rate": 5.082612705588906e-05, + "loss": 2.7883, + "step": 35134 + }, + { + "epoch": 1.6357985892869613, + "grad_norm": 0.3584131249722576, + "learning_rate": 5.082341868788574e-05, + "loss": 2.6983, + "step": 35135 + }, + { + "epoch": 1.6358451474730544, + "grad_norm": 0.3365276687658071, + "learning_rate": 5.082071031746576e-05, + "loss": 2.6857, + "step": 35136 + }, + { + "epoch": 1.6358917056591475, + "grad_norm": 0.3458589424540189, + "learning_rate": 5.081800194463706e-05, + "loss": 2.6857, + "step": 35137 + }, + { + "epoch": 1.6359382638452407, + "grad_norm": 0.3824369548465753, + "learning_rate": 5.081529356940762e-05, + "loss": 2.7398, + "step": 35138 + }, + { + "epoch": 1.6359848220313338, + "grad_norm": 0.3507565712484068, + "learning_rate": 5.081258519178538e-05, + "loss": 2.6599, + "step": 35139 + }, + { + "epoch": 1.636031380217427, + "grad_norm": 0.34777999826391975, + "learning_rate": 5.080987681177827e-05, + "loss": 2.7148, + "step": 35140 + }, + { + "epoch": 1.6360779384035198, + "grad_norm": 0.34236216263957353, + "learning_rate": 5.080716842939426e-05, + "loss": 2.7228, + "step": 35141 + }, + { + "epoch": 1.636124496589613, + "grad_norm": 0.39269345651960263, + "learning_rate": 5.080446004464128e-05, + "loss": 2.7024, + "step": 35142 + }, + { + "epoch": 1.6361710547757058, + "grad_norm": 0.366185301529234, + "learning_rate": 5.080175165752729e-05, + "loss": 2.7587, + "step": 35143 + }, + { + "epoch": 1.636217612961799, + "grad_norm": 0.36372234732933423, + "learning_rate": 5.0799043268060256e-05, + "loss": 2.7914, + "step": 35144 + }, + { + "epoch": 1.636264171147892, + "grad_norm": 0.4139222141292611, + "learning_rate": 5.079633487624809e-05, + "loss": 2.646, + "step": 35145 + }, + { + "epoch": 1.6363107293339851, + "grad_norm": 0.3541117924354985, + "learning_rate": 5.0793626482098755e-05, + "loss": 2.6787, + "step": 35146 + }, + { + "epoch": 1.6363572875200783, + "grad_norm": 0.3665358514649411, + "learning_rate": 5.079091808562021e-05, + "loss": 2.7105, + "step": 35147 + }, + { + "epoch": 1.6364038457061714, + "grad_norm": 0.37338733560977894, + "learning_rate": 5.07882096868204e-05, + "loss": 
2.6941, + "step": 35148 + }, + { + "epoch": 1.6364504038922645, + "grad_norm": 0.33031808567323395, + "learning_rate": 5.078550128570728e-05, + "loss": 2.8269, + "step": 35149 + }, + { + "epoch": 1.6364969620783574, + "grad_norm": 0.38200445199329125, + "learning_rate": 5.078279288228878e-05, + "loss": 2.8045, + "step": 35150 + }, + { + "epoch": 1.6365435202644505, + "grad_norm": 0.353765493415524, + "learning_rate": 5.0780084476572864e-05, + "loss": 2.6741, + "step": 35151 + }, + { + "epoch": 1.6365900784505436, + "grad_norm": 0.3535623384418547, + "learning_rate": 5.077737606856749e-05, + "loss": 2.6715, + "step": 35152 + }, + { + "epoch": 1.6366366366366365, + "grad_norm": 0.36344240515083137, + "learning_rate": 5.077466765828057e-05, + "loss": 2.7753, + "step": 35153 + }, + { + "epoch": 1.6366831948227296, + "grad_norm": 0.361770811219591, + "learning_rate": 5.07719592457201e-05, + "loss": 2.7462, + "step": 35154 + }, + { + "epoch": 1.6367297530088227, + "grad_norm": 0.36817580453069515, + "learning_rate": 5.0769250830893987e-05, + "loss": 2.7037, + "step": 35155 + }, + { + "epoch": 1.6367763111949158, + "grad_norm": 0.34003585225654265, + "learning_rate": 5.0766542413810205e-05, + "loss": 2.7415, + "step": 35156 + }, + { + "epoch": 1.636822869381009, + "grad_norm": 0.357477722011908, + "learning_rate": 5.07638339944767e-05, + "loss": 2.6188, + "step": 35157 + }, + { + "epoch": 1.636869427567102, + "grad_norm": 0.34817193818802705, + "learning_rate": 5.076112557290141e-05, + "loss": 2.7205, + "step": 35158 + }, + { + "epoch": 1.6369159857531952, + "grad_norm": 0.3338683688198271, + "learning_rate": 5.07584171490923e-05, + "loss": 2.7113, + "step": 35159 + }, + { + "epoch": 1.636962543939288, + "grad_norm": 0.37477625105668405, + "learning_rate": 5.07557087230573e-05, + "loss": 2.6208, + "step": 35160 + }, + { + "epoch": 1.6370091021253812, + "grad_norm": 0.32078770362651643, + "learning_rate": 5.0753000294804376e-05, + "loss": 2.7168, + "step": 35161 + }, + { + "epoch": 1.6370556603114743, + "grad_norm": 0.34989668747858615, + "learning_rate": 5.075029186434148e-05, + "loss": 2.7641, + "step": 35162 + }, + { + "epoch": 1.6371022184975672, + "grad_norm": 0.35282218777916935, + "learning_rate": 5.074758343167654e-05, + "loss": 2.803, + "step": 35163 + }, + { + "epoch": 1.6371487766836603, + "grad_norm": 0.3426638655263942, + "learning_rate": 5.07448749968175e-05, + "loss": 2.72, + "step": 35164 + }, + { + "epoch": 1.6371953348697534, + "grad_norm": 0.319709782957228, + "learning_rate": 5.0742166559772366e-05, + "loss": 2.6883, + "step": 35165 + }, + { + "epoch": 1.6372418930558466, + "grad_norm": 0.3320234200600411, + "learning_rate": 5.0739458120549014e-05, + "loss": 2.6542, + "step": 35166 + }, + { + "epoch": 1.6372884512419397, + "grad_norm": 0.3329827161364489, + "learning_rate": 5.0736749679155434e-05, + "loss": 2.6319, + "step": 35167 + }, + { + "epoch": 1.6373350094280328, + "grad_norm": 0.3195296743210132, + "learning_rate": 5.073404123559957e-05, + "loss": 2.7842, + "step": 35168 + }, + { + "epoch": 1.637381567614126, + "grad_norm": 0.3110979671315173, + "learning_rate": 5.073133278988936e-05, + "loss": 2.7303, + "step": 35169 + }, + { + "epoch": 1.6374281258002188, + "grad_norm": 0.35686929271105233, + "learning_rate": 5.072862434203276e-05, + "loss": 2.7107, + "step": 35170 + }, + { + "epoch": 1.637474683986312, + "grad_norm": 0.3602328192578336, + "learning_rate": 5.0725915892037735e-05, + "loss": 2.7819, + "step": 35171 + }, + { + "epoch": 1.6375212421724048, + "grad_norm": 
0.31686005845858567, + "learning_rate": 5.07232074399122e-05, + "loss": 2.6482, + "step": 35172 + }, + { + "epoch": 1.637567800358498, + "grad_norm": 0.3466780637778219, + "learning_rate": 5.072049898566413e-05, + "loss": 2.7634, + "step": 35173 + }, + { + "epoch": 1.637614358544591, + "grad_norm": 0.34659553130124104, + "learning_rate": 5.071779052930146e-05, + "loss": 2.6968, + "step": 35174 + }, + { + "epoch": 1.6376609167306841, + "grad_norm": 0.3271938713297857, + "learning_rate": 5.071508207083216e-05, + "loss": 2.7693, + "step": 35175 + }, + { + "epoch": 1.6377074749167773, + "grad_norm": 0.37057908982908255, + "learning_rate": 5.071237361026415e-05, + "loss": 2.7674, + "step": 35176 + }, + { + "epoch": 1.6377540331028704, + "grad_norm": 0.3274166168443304, + "learning_rate": 5.07096651476054e-05, + "loss": 2.8216, + "step": 35177 + }, + { + "epoch": 1.6378005912889635, + "grad_norm": 0.3487304431818784, + "learning_rate": 5.070695668286386e-05, + "loss": 2.7134, + "step": 35178 + }, + { + "epoch": 1.6378471494750566, + "grad_norm": 0.3447498941879235, + "learning_rate": 5.0704248216047455e-05, + "loss": 2.6773, + "step": 35179 + }, + { + "epoch": 1.6378937076611495, + "grad_norm": 0.3633801659533813, + "learning_rate": 5.0701539747164174e-05, + "loss": 2.751, + "step": 35180 + }, + { + "epoch": 1.6379402658472426, + "grad_norm": 0.35366213732156354, + "learning_rate": 5.0698831276221925e-05, + "loss": 2.6638, + "step": 35181 + }, + { + "epoch": 1.6379868240333355, + "grad_norm": 0.32909883088930786, + "learning_rate": 5.0696122803228677e-05, + "loss": 2.7057, + "step": 35182 + }, + { + "epoch": 1.6380333822194286, + "grad_norm": 0.3704039319177734, + "learning_rate": 5.069341432819238e-05, + "loss": 2.6542, + "step": 35183 + }, + { + "epoch": 1.6380799404055217, + "grad_norm": 0.3647016701118676, + "learning_rate": 5.0690705851120965e-05, + "loss": 2.748, + "step": 35184 + }, + { + "epoch": 1.6381264985916149, + "grad_norm": 0.36975370931637475, + "learning_rate": 5.0687997372022425e-05, + "loss": 2.7538, + "step": 35185 + }, + { + "epoch": 1.638173056777708, + "grad_norm": 0.34890730094146477, + "learning_rate": 5.068528889090466e-05, + "loss": 2.7787, + "step": 35186 + }, + { + "epoch": 1.638219614963801, + "grad_norm": 0.346574346278397, + "learning_rate": 5.068258040777565e-05, + "loss": 2.6364, + "step": 35187 + }, + { + "epoch": 1.6382661731498942, + "grad_norm": 0.3622513225804279, + "learning_rate": 5.067987192264332e-05, + "loss": 2.7617, + "step": 35188 + }, + { + "epoch": 1.6383127313359873, + "grad_norm": 0.3222438262437979, + "learning_rate": 5.067716343551565e-05, + "loss": 2.6208, + "step": 35189 + }, + { + "epoch": 1.6383592895220802, + "grad_norm": 0.3410463522341411, + "learning_rate": 5.067445494640056e-05, + "loss": 2.725, + "step": 35190 + }, + { + "epoch": 1.6384058477081733, + "grad_norm": 0.335547120548892, + "learning_rate": 5.0671746455306024e-05, + "loss": 2.6796, + "step": 35191 + }, + { + "epoch": 1.6384524058942662, + "grad_norm": 0.33355193634709723, + "learning_rate": 5.066903796223997e-05, + "loss": 2.6902, + "step": 35192 + }, + { + "epoch": 1.6384989640803593, + "grad_norm": 0.3373828627748154, + "learning_rate": 5.066632946721035e-05, + "loss": 2.6818, + "step": 35193 + }, + { + "epoch": 1.6385455222664524, + "grad_norm": 0.3693155153403349, + "learning_rate": 5.0663620970225136e-05, + "loss": 2.8073, + "step": 35194 + }, + { + "epoch": 1.6385920804525456, + "grad_norm": 0.36543064429557776, + "learning_rate": 5.066091247129224e-05, + "loss": 
2.7166, + "step": 35195 + }, + { + "epoch": 1.6386386386386387, + "grad_norm": 0.34075221991540666, + "learning_rate": 5.065820397041964e-05, + "loss": 2.7479, + "step": 35196 + }, + { + "epoch": 1.6386851968247318, + "grad_norm": 0.3852193616622557, + "learning_rate": 5.065549546761527e-05, + "loss": 2.7122, + "step": 35197 + }, + { + "epoch": 1.638731755010825, + "grad_norm": 0.38885260709343716, + "learning_rate": 5.06527869628871e-05, + "loss": 2.7886, + "step": 35198 + }, + { + "epoch": 1.6387783131969178, + "grad_norm": 0.35289356681691497, + "learning_rate": 5.065007845624305e-05, + "loss": 2.6726, + "step": 35199 + }, + { + "epoch": 1.638824871383011, + "grad_norm": 0.3891185046492888, + "learning_rate": 5.064736994769109e-05, + "loss": 2.7994, + "step": 35200 + }, + { + "epoch": 1.638871429569104, + "grad_norm": 0.3440675738990122, + "learning_rate": 5.064466143723916e-05, + "loss": 2.7341, + "step": 35201 + }, + { + "epoch": 1.638917987755197, + "grad_norm": 0.3983989512862629, + "learning_rate": 5.064195292489521e-05, + "loss": 2.6514, + "step": 35202 + }, + { + "epoch": 1.63896454594129, + "grad_norm": 0.37234905314806255, + "learning_rate": 5.0639244410667196e-05, + "loss": 2.7062, + "step": 35203 + }, + { + "epoch": 1.6390111041273832, + "grad_norm": 0.37286828584514187, + "learning_rate": 5.0636535894563074e-05, + "loss": 2.7242, + "step": 35204 + }, + { + "epoch": 1.6390576623134763, + "grad_norm": 0.381774643900906, + "learning_rate": 5.063382737659077e-05, + "loss": 2.7078, + "step": 35205 + }, + { + "epoch": 1.6391042204995694, + "grad_norm": 0.3723475579430699, + "learning_rate": 5.063111885675824e-05, + "loss": 2.7076, + "step": 35206 + }, + { + "epoch": 1.6391507786856625, + "grad_norm": 0.36553194765225105, + "learning_rate": 5.062841033507345e-05, + "loss": 2.7499, + "step": 35207 + }, + { + "epoch": 1.6391973368717556, + "grad_norm": 0.35325135546259423, + "learning_rate": 5.062570181154432e-05, + "loss": 2.7015, + "step": 35208 + }, + { + "epoch": 1.6392438950578485, + "grad_norm": 0.3714171180886371, + "learning_rate": 5.062299328617883e-05, + "loss": 2.6841, + "step": 35209 + }, + { + "epoch": 1.6392904532439416, + "grad_norm": 0.36145583504424017, + "learning_rate": 5.062028475898491e-05, + "loss": 2.5269, + "step": 35210 + }, + { + "epoch": 1.6393370114300347, + "grad_norm": 0.3354716063410524, + "learning_rate": 5.061757622997052e-05, + "loss": 2.6778, + "step": 35211 + }, + { + "epoch": 1.6393835696161276, + "grad_norm": 0.3639451022472891, + "learning_rate": 5.061486769914361e-05, + "loss": 2.6614, + "step": 35212 + }, + { + "epoch": 1.6394301278022207, + "grad_norm": 0.33678712207624617, + "learning_rate": 5.061215916651212e-05, + "loss": 2.7756, + "step": 35213 + }, + { + "epoch": 1.6394766859883139, + "grad_norm": 0.38049256591907316, + "learning_rate": 5.0609450632083985e-05, + "loss": 2.6885, + "step": 35214 + }, + { + "epoch": 1.639523244174407, + "grad_norm": 0.32526108328868875, + "learning_rate": 5.060674209586719e-05, + "loss": 2.7489, + "step": 35215 + }, + { + "epoch": 1.6395698023605, + "grad_norm": 0.3540686812927996, + "learning_rate": 5.060403355786968e-05, + "loss": 2.7196, + "step": 35216 + }, + { + "epoch": 1.6396163605465932, + "grad_norm": 0.3552815708311303, + "learning_rate": 5.0601325018099374e-05, + "loss": 2.6994, + "step": 35217 + }, + { + "epoch": 1.6396629187326863, + "grad_norm": 0.3400310099232221, + "learning_rate": 5.059861647656424e-05, + "loss": 2.6944, + "step": 35218 + }, + { + "epoch": 1.6397094769187792, + "grad_norm": 
0.3637343554853395, + "learning_rate": 5.0595907933272225e-05, + "loss": 2.7103, + "step": 35219 + }, + { + "epoch": 1.6397560351048723, + "grad_norm": 0.3358262421988012, + "learning_rate": 5.059319938823128e-05, + "loss": 2.6244, + "step": 35220 + }, + { + "epoch": 1.6398025932909652, + "grad_norm": 0.3569004306403584, + "learning_rate": 5.0590490841449356e-05, + "loss": 2.6188, + "step": 35221 + }, + { + "epoch": 1.6398491514770583, + "grad_norm": 0.3333682550926889, + "learning_rate": 5.058778229293439e-05, + "loss": 2.7456, + "step": 35222 + }, + { + "epoch": 1.6398957096631515, + "grad_norm": 0.3390796476612363, + "learning_rate": 5.0585073742694355e-05, + "loss": 2.7482, + "step": 35223 + }, + { + "epoch": 1.6399422678492446, + "grad_norm": 0.3373653041900395, + "learning_rate": 5.0582365190737166e-05, + "loss": 2.6413, + "step": 35224 + }, + { + "epoch": 1.6399888260353377, + "grad_norm": 0.3611563008768594, + "learning_rate": 5.057965663707081e-05, + "loss": 2.7707, + "step": 35225 + }, + { + "epoch": 1.6400353842214308, + "grad_norm": 0.32900294498847366, + "learning_rate": 5.0576948081703224e-05, + "loss": 2.6447, + "step": 35226 + }, + { + "epoch": 1.640081942407524, + "grad_norm": 0.3452277220527861, + "learning_rate": 5.057423952464233e-05, + "loss": 2.7387, + "step": 35227 + }, + { + "epoch": 1.640128500593617, + "grad_norm": 0.32278404468474325, + "learning_rate": 5.057153096589611e-05, + "loss": 2.7265, + "step": 35228 + }, + { + "epoch": 1.64017505877971, + "grad_norm": 0.33964713692707393, + "learning_rate": 5.056882240547251e-05, + "loss": 2.7275, + "step": 35229 + }, + { + "epoch": 1.640221616965803, + "grad_norm": 0.34590420026540947, + "learning_rate": 5.056611384337948e-05, + "loss": 2.7385, + "step": 35230 + }, + { + "epoch": 1.640268175151896, + "grad_norm": 0.36916142157779064, + "learning_rate": 5.056340527962494e-05, + "loss": 2.6956, + "step": 35231 + }, + { + "epoch": 1.640314733337989, + "grad_norm": 0.36662074733618055, + "learning_rate": 5.056069671421687e-05, + "loss": 2.7671, + "step": 35232 + }, + { + "epoch": 1.6403612915240822, + "grad_norm": 0.330048967559732, + "learning_rate": 5.055798814716321e-05, + "loss": 2.6195, + "step": 35233 + }, + { + "epoch": 1.6404078497101753, + "grad_norm": 0.3834682799820968, + "learning_rate": 5.055527957847191e-05, + "loss": 2.6516, + "step": 35234 + }, + { + "epoch": 1.6404544078962684, + "grad_norm": 0.3611302647146548, + "learning_rate": 5.0552571008150925e-05, + "loss": 2.6875, + "step": 35235 + }, + { + "epoch": 1.6405009660823615, + "grad_norm": 0.3367925738116388, + "learning_rate": 5.054986243620818e-05, + "loss": 2.6225, + "step": 35236 + }, + { + "epoch": 1.6405475242684546, + "grad_norm": 0.324684658976845, + "learning_rate": 5.0547153862651667e-05, + "loss": 2.7166, + "step": 35237 + }, + { + "epoch": 1.6405940824545475, + "grad_norm": 0.3455757067952923, + "learning_rate": 5.05444452874893e-05, + "loss": 2.6665, + "step": 35238 + }, + { + "epoch": 1.6406406406406406, + "grad_norm": 0.31731736134663535, + "learning_rate": 5.054173671072904e-05, + "loss": 2.768, + "step": 35239 + }, + { + "epoch": 1.6406871988267338, + "grad_norm": 0.35906744785137107, + "learning_rate": 5.053902813237883e-05, + "loss": 2.7087, + "step": 35240 + }, + { + "epoch": 1.6407337570128266, + "grad_norm": 0.34698088520110726, + "learning_rate": 5.053631955244663e-05, + "loss": 2.7045, + "step": 35241 + }, + { + "epoch": 1.6407803151989198, + "grad_norm": 0.3263047535434111, + "learning_rate": 5.053361097094039e-05, + "loss": 
2.6636, + "step": 35242 + }, + { + "epoch": 1.6408268733850129, + "grad_norm": 0.35114896240244825, + "learning_rate": 5.053090238786806e-05, + "loss": 2.7277, + "step": 35243 + }, + { + "epoch": 1.640873431571106, + "grad_norm": 0.31324539253418276, + "learning_rate": 5.0528193803237565e-05, + "loss": 2.617, + "step": 35244 + }, + { + "epoch": 1.640919989757199, + "grad_norm": 0.3255411611233058, + "learning_rate": 5.052548521705689e-05, + "loss": 2.7209, + "step": 35245 + }, + { + "epoch": 1.6409665479432922, + "grad_norm": 0.3301025202769904, + "learning_rate": 5.052277662933396e-05, + "loss": 2.6817, + "step": 35246 + }, + { + "epoch": 1.6410131061293853, + "grad_norm": 0.3091308000297407, + "learning_rate": 5.052006804007672e-05, + "loss": 2.7981, + "step": 35247 + }, + { + "epoch": 1.6410596643154782, + "grad_norm": 0.3334667561067847, + "learning_rate": 5.0517359449293156e-05, + "loss": 2.7348, + "step": 35248 + }, + { + "epoch": 1.6411062225015713, + "grad_norm": 0.331377782611269, + "learning_rate": 5.051465085699118e-05, + "loss": 2.6562, + "step": 35249 + }, + { + "epoch": 1.6411527806876645, + "grad_norm": 0.3361636668025958, + "learning_rate": 5.051194226317875e-05, + "loss": 2.7757, + "step": 35250 + }, + { + "epoch": 1.6411993388737574, + "grad_norm": 0.328884607464186, + "learning_rate": 5.050923366786382e-05, + "loss": 2.7, + "step": 35251 + }, + { + "epoch": 1.6412458970598505, + "grad_norm": 0.34703722927745506, + "learning_rate": 5.050652507105436e-05, + "loss": 2.8012, + "step": 35252 + }, + { + "epoch": 1.6412924552459436, + "grad_norm": 0.32800431551894343, + "learning_rate": 5.050381647275827e-05, + "loss": 2.7486, + "step": 35253 + }, + { + "epoch": 1.6413390134320367, + "grad_norm": 0.3215155973076334, + "learning_rate": 5.0501107872983546e-05, + "loss": 2.8149, + "step": 35254 + }, + { + "epoch": 1.6413855716181298, + "grad_norm": 0.38002242797278124, + "learning_rate": 5.049839927173812e-05, + "loss": 2.7574, + "step": 35255 + }, + { + "epoch": 1.641432129804223, + "grad_norm": 0.34141994990930286, + "learning_rate": 5.0495690669029926e-05, + "loss": 2.7191, + "step": 35256 + }, + { + "epoch": 1.641478687990316, + "grad_norm": 0.3205252370229971, + "learning_rate": 5.049298206486696e-05, + "loss": 2.7164, + "step": 35257 + }, + { + "epoch": 1.641525246176409, + "grad_norm": 0.3568944327642701, + "learning_rate": 5.049027345925711e-05, + "loss": 2.6742, + "step": 35258 + }, + { + "epoch": 1.641571804362502, + "grad_norm": 0.35648084509101324, + "learning_rate": 5.0487564852208355e-05, + "loss": 2.6819, + "step": 35259 + }, + { + "epoch": 1.641618362548595, + "grad_norm": 0.35455065579475203, + "learning_rate": 5.048485624372866e-05, + "loss": 2.7155, + "step": 35260 + }, + { + "epoch": 1.641664920734688, + "grad_norm": 0.32002869143308316, + "learning_rate": 5.048214763382596e-05, + "loss": 2.6001, + "step": 35261 + }, + { + "epoch": 1.6417114789207812, + "grad_norm": 0.34462963501436367, + "learning_rate": 5.04794390225082e-05, + "loss": 2.6498, + "step": 35262 + }, + { + "epoch": 1.6417580371068743, + "grad_norm": 0.3451236021233387, + "learning_rate": 5.0476730409783325e-05, + "loss": 2.657, + "step": 35263 + }, + { + "epoch": 1.6418045952929674, + "grad_norm": 0.3248608316309439, + "learning_rate": 5.04740217956593e-05, + "loss": 2.6931, + "step": 35264 + }, + { + "epoch": 1.6418511534790605, + "grad_norm": 0.316358132112444, + "learning_rate": 5.047131318014406e-05, + "loss": 2.6069, + "step": 35265 + }, + { + "epoch": 1.6418977116651536, + "grad_norm": 
0.3478721523512107, + "learning_rate": 5.046860456324558e-05, + "loss": 2.7523, + "step": 35266 + }, + { + "epoch": 1.6419442698512468, + "grad_norm": 0.34229855255574143, + "learning_rate": 5.046589594497178e-05, + "loss": 2.7715, + "step": 35267 + }, + { + "epoch": 1.6419908280373396, + "grad_norm": 0.3656057301732069, + "learning_rate": 5.046318732533064e-05, + "loss": 2.6835, + "step": 35268 + }, + { + "epoch": 1.6420373862234328, + "grad_norm": 0.3365337616698414, + "learning_rate": 5.046047870433007e-05, + "loss": 2.7008, + "step": 35269 + }, + { + "epoch": 1.6420839444095257, + "grad_norm": 0.3413821139929147, + "learning_rate": 5.045777008197805e-05, + "loss": 2.6425, + "step": 35270 + }, + { + "epoch": 1.6421305025956188, + "grad_norm": 0.3194137440265165, + "learning_rate": 5.045506145828252e-05, + "loss": 2.6936, + "step": 35271 + }, + { + "epoch": 1.6421770607817119, + "grad_norm": 0.374510888610481, + "learning_rate": 5.0452352833251417e-05, + "loss": 2.7429, + "step": 35272 + }, + { + "epoch": 1.642223618967805, + "grad_norm": 0.34499361266827877, + "learning_rate": 5.044964420689271e-05, + "loss": 2.7907, + "step": 35273 + }, + { + "epoch": 1.6422701771538981, + "grad_norm": 0.3600095593740069, + "learning_rate": 5.044693557921434e-05, + "loss": 2.7057, + "step": 35274 + }, + { + "epoch": 1.6423167353399912, + "grad_norm": 0.3571502368295854, + "learning_rate": 5.044422695022426e-05, + "loss": 2.7555, + "step": 35275 + }, + { + "epoch": 1.6423632935260843, + "grad_norm": 0.3662963144491341, + "learning_rate": 5.044151831993041e-05, + "loss": 2.7145, + "step": 35276 + }, + { + "epoch": 1.6424098517121775, + "grad_norm": 0.33274158871581705, + "learning_rate": 5.043880968834076e-05, + "loss": 2.5751, + "step": 35277 + }, + { + "epoch": 1.6424564098982704, + "grad_norm": 0.3426366246039639, + "learning_rate": 5.0436101055463235e-05, + "loss": 2.8299, + "step": 35278 + }, + { + "epoch": 1.6425029680843635, + "grad_norm": 0.34386243117999205, + "learning_rate": 5.0433392421305815e-05, + "loss": 2.8058, + "step": 35279 + }, + { + "epoch": 1.6425495262704564, + "grad_norm": 0.3231821596886105, + "learning_rate": 5.043068378587642e-05, + "loss": 2.7554, + "step": 35280 + }, + { + "epoch": 1.6425960844565495, + "grad_norm": 0.3585407932198212, + "learning_rate": 5.042797514918302e-05, + "loss": 2.702, + "step": 35281 + }, + { + "epoch": 1.6426426426426426, + "grad_norm": 0.32329422578587425, + "learning_rate": 5.042526651123354e-05, + "loss": 2.6394, + "step": 35282 + }, + { + "epoch": 1.6426892008287357, + "grad_norm": 0.3739219600517987, + "learning_rate": 5.042255787203595e-05, + "loss": 2.7519, + "step": 35283 + }, + { + "epoch": 1.6427357590148288, + "grad_norm": 0.3704049168809692, + "learning_rate": 5.041984923159819e-05, + "loss": 2.8089, + "step": 35284 + }, + { + "epoch": 1.642782317200922, + "grad_norm": 0.339986371480922, + "learning_rate": 5.0417140589928224e-05, + "loss": 2.7032, + "step": 35285 + }, + { + "epoch": 1.642828875387015, + "grad_norm": 0.3760628180963718, + "learning_rate": 5.041443194703398e-05, + "loss": 2.7302, + "step": 35286 + }, + { + "epoch": 1.642875433573108, + "grad_norm": 0.32203469538768925, + "learning_rate": 5.041172330292342e-05, + "loss": 2.6519, + "step": 35287 + }, + { + "epoch": 1.642921991759201, + "grad_norm": 0.3590088606164979, + "learning_rate": 5.0409014657604494e-05, + "loss": 2.8656, + "step": 35288 + }, + { + "epoch": 1.6429685499452942, + "grad_norm": 0.32383666263249716, + "learning_rate": 5.0406306011085156e-05, + "loss": 
2.8021, + "step": 35289 + }, + { + "epoch": 1.643015108131387, + "grad_norm": 0.32792984760133115, + "learning_rate": 5.0403597363373345e-05, + "loss": 2.7008, + "step": 35290 + }, + { + "epoch": 1.6430616663174802, + "grad_norm": 0.34847649772879175, + "learning_rate": 5.040088871447702e-05, + "loss": 2.69, + "step": 35291 + }, + { + "epoch": 1.6431082245035733, + "grad_norm": 0.3507671344112471, + "learning_rate": 5.039818006440413e-05, + "loss": 2.6175, + "step": 35292 + }, + { + "epoch": 1.6431547826896664, + "grad_norm": 0.3187587983301094, + "learning_rate": 5.039547141316261e-05, + "loss": 2.7281, + "step": 35293 + }, + { + "epoch": 1.6432013408757595, + "grad_norm": 0.3595192701556295, + "learning_rate": 5.039276276076044e-05, + "loss": 2.6884, + "step": 35294 + }, + { + "epoch": 1.6432478990618526, + "grad_norm": 0.3356584434318985, + "learning_rate": 5.039005410720553e-05, + "loss": 2.6272, + "step": 35295 + }, + { + "epoch": 1.6432944572479458, + "grad_norm": 0.3423419273187967, + "learning_rate": 5.038734545250585e-05, + "loss": 2.5548, + "step": 35296 + }, + { + "epoch": 1.6433410154340387, + "grad_norm": 0.3422973239569346, + "learning_rate": 5.038463679666935e-05, + "loss": 2.7315, + "step": 35297 + }, + { + "epoch": 1.6433875736201318, + "grad_norm": 0.35290099240501815, + "learning_rate": 5.038192813970399e-05, + "loss": 2.7655, + "step": 35298 + }, + { + "epoch": 1.6434341318062249, + "grad_norm": 0.3593642603322198, + "learning_rate": 5.03792194816177e-05, + "loss": 2.7495, + "step": 35299 + }, + { + "epoch": 1.6434806899923178, + "grad_norm": 0.3445713454695128, + "learning_rate": 5.037651082241843e-05, + "loss": 2.6865, + "step": 35300 + }, + { + "epoch": 1.643527248178411, + "grad_norm": 0.35257323394348444, + "learning_rate": 5.0373802162114145e-05, + "loss": 2.6992, + "step": 35301 + }, + { + "epoch": 1.643573806364504, + "grad_norm": 0.3562268896641019, + "learning_rate": 5.03710935007128e-05, + "loss": 2.6935, + "step": 35302 + }, + { + "epoch": 1.6436203645505971, + "grad_norm": 0.34726812581751654, + "learning_rate": 5.0368384838222314e-05, + "loss": 2.697, + "step": 35303 + }, + { + "epoch": 1.6436669227366902, + "grad_norm": 0.32807315898682227, + "learning_rate": 5.0365676174650664e-05, + "loss": 2.6965, + "step": 35304 + }, + { + "epoch": 1.6437134809227834, + "grad_norm": 0.3730594349929756, + "learning_rate": 5.03629675100058e-05, + "loss": 2.6825, + "step": 35305 + }, + { + "epoch": 1.6437600391088765, + "grad_norm": 0.32861461088862837, + "learning_rate": 5.0360258844295647e-05, + "loss": 2.6975, + "step": 35306 + }, + { + "epoch": 1.6438065972949694, + "grad_norm": 0.35555772242771466, + "learning_rate": 5.0357550177528175e-05, + "loss": 2.5846, + "step": 35307 + }, + { + "epoch": 1.6438531554810625, + "grad_norm": 0.3412396454379778, + "learning_rate": 5.0354841509711326e-05, + "loss": 2.8491, + "step": 35308 + }, + { + "epoch": 1.6438997136671554, + "grad_norm": 0.36746355896373617, + "learning_rate": 5.035213284085305e-05, + "loss": 2.6648, + "step": 35309 + }, + { + "epoch": 1.6439462718532485, + "grad_norm": 0.34339426509899634, + "learning_rate": 5.03494241709613e-05, + "loss": 2.6884, + "step": 35310 + }, + { + "epoch": 1.6439928300393416, + "grad_norm": 0.371680242179798, + "learning_rate": 5.034671550004404e-05, + "loss": 2.7039, + "step": 35311 + }, + { + "epoch": 1.6440393882254347, + "grad_norm": 0.3807800423169848, + "learning_rate": 5.034400682810919e-05, + "loss": 2.6827, + "step": 35312 + }, + { + "epoch": 1.6440859464115278, + 
"grad_norm": 0.38312780990572776, + "learning_rate": 5.0341298155164706e-05, + "loss": 2.6822, + "step": 35313 + }, + { + "epoch": 1.644132504597621, + "grad_norm": 0.34561014275390234, + "learning_rate": 5.0338589481218556e-05, + "loss": 2.6559, + "step": 35314 + }, + { + "epoch": 1.644179062783714, + "grad_norm": 0.35799844675945636, + "learning_rate": 5.033588080627867e-05, + "loss": 2.7754, + "step": 35315 + }, + { + "epoch": 1.6442256209698072, + "grad_norm": 0.3751828836183097, + "learning_rate": 5.033317213035302e-05, + "loss": 2.7137, + "step": 35316 + }, + { + "epoch": 1.6442721791559, + "grad_norm": 0.37285045719680654, + "learning_rate": 5.033046345344954e-05, + "loss": 2.7493, + "step": 35317 + }, + { + "epoch": 1.6443187373419932, + "grad_norm": 0.34853428473564535, + "learning_rate": 5.032775477557619e-05, + "loss": 2.7799, + "step": 35318 + }, + { + "epoch": 1.644365295528086, + "grad_norm": 0.38575359847955687, + "learning_rate": 5.032504609674089e-05, + "loss": 2.7747, + "step": 35319 + }, + { + "epoch": 1.6444118537141792, + "grad_norm": 0.33282604272392885, + "learning_rate": 5.0322337416951635e-05, + "loss": 2.7716, + "step": 35320 + }, + { + "epoch": 1.6444584119002723, + "grad_norm": 0.35523124662415784, + "learning_rate": 5.031962873621634e-05, + "loss": 2.743, + "step": 35321 + }, + { + "epoch": 1.6445049700863654, + "grad_norm": 0.34806985593646067, + "learning_rate": 5.031692005454296e-05, + "loss": 2.6974, + "step": 35322 + }, + { + "epoch": 1.6445515282724585, + "grad_norm": 0.37419906377168993, + "learning_rate": 5.0314211371939455e-05, + "loss": 2.6709, + "step": 35323 + }, + { + "epoch": 1.6445980864585517, + "grad_norm": 0.3268370086971235, + "learning_rate": 5.031150268841377e-05, + "loss": 2.7879, + "step": 35324 + }, + { + "epoch": 1.6446446446446448, + "grad_norm": 0.3646679388982791, + "learning_rate": 5.030879400397387e-05, + "loss": 2.706, + "step": 35325 + }, + { + "epoch": 1.6446912028307377, + "grad_norm": 0.36340319786455855, + "learning_rate": 5.0306085318627674e-05, + "loss": 2.754, + "step": 35326 + }, + { + "epoch": 1.6447377610168308, + "grad_norm": 0.3610331024279118, + "learning_rate": 5.030337663238315e-05, + "loss": 2.6664, + "step": 35327 + }, + { + "epoch": 1.644784319202924, + "grad_norm": 0.3563190449909447, + "learning_rate": 5.0300667945248234e-05, + "loss": 2.7739, + "step": 35328 + }, + { + "epoch": 1.6448308773890168, + "grad_norm": 0.3707215499591006, + "learning_rate": 5.029795925723091e-05, + "loss": 2.7555, + "step": 35329 + }, + { + "epoch": 1.64487743557511, + "grad_norm": 0.36467431176877024, + "learning_rate": 5.029525056833909e-05, + "loss": 2.7267, + "step": 35330 + }, + { + "epoch": 1.644923993761203, + "grad_norm": 0.33407832549018196, + "learning_rate": 5.0292541878580756e-05, + "loss": 2.7393, + "step": 35331 + }, + { + "epoch": 1.6449705519472961, + "grad_norm": 0.35041330833122997, + "learning_rate": 5.028983318796382e-05, + "loss": 2.7601, + "step": 35332 + }, + { + "epoch": 1.6450171101333892, + "grad_norm": 0.3366034260676312, + "learning_rate": 5.0287124496496264e-05, + "loss": 2.7061, + "step": 35333 + }, + { + "epoch": 1.6450636683194824, + "grad_norm": 0.37267955328842534, + "learning_rate": 5.0284415804186026e-05, + "loss": 2.7227, + "step": 35334 + }, + { + "epoch": 1.6451102265055755, + "grad_norm": 0.3252820564206427, + "learning_rate": 5.028170711104104e-05, + "loss": 2.702, + "step": 35335 + }, + { + "epoch": 1.6451567846916684, + "grad_norm": 0.3658397448632811, + "learning_rate": 
5.027899841706929e-05, + "loss": 2.742, + "step": 35336 + }, + { + "epoch": 1.6452033428777615, + "grad_norm": 0.3514254095522438, + "learning_rate": 5.027628972227869e-05, + "loss": 2.7864, + "step": 35337 + }, + { + "epoch": 1.6452499010638546, + "grad_norm": 0.34961672641790154, + "learning_rate": 5.027358102667723e-05, + "loss": 2.836, + "step": 35338 + }, + { + "epoch": 1.6452964592499475, + "grad_norm": 0.33842479469211256, + "learning_rate": 5.027087233027282e-05, + "loss": 2.665, + "step": 35339 + }, + { + "epoch": 1.6453430174360406, + "grad_norm": 0.3478298500377565, + "learning_rate": 5.026816363307343e-05, + "loss": 2.7226, + "step": 35340 + }, + { + "epoch": 1.6453895756221337, + "grad_norm": 0.31509606476442603, + "learning_rate": 5.0265454935087e-05, + "loss": 2.6567, + "step": 35341 + }, + { + "epoch": 1.6454361338082268, + "grad_norm": 0.334352608094194, + "learning_rate": 5.026274623632149e-05, + "loss": 2.7488, + "step": 35342 + }, + { + "epoch": 1.64548269199432, + "grad_norm": 0.3315330954164717, + "learning_rate": 5.0260037536784854e-05, + "loss": 2.7386, + "step": 35343 + }, + { + "epoch": 1.645529250180413, + "grad_norm": 0.3505689927433146, + "learning_rate": 5.025732883648504e-05, + "loss": 2.7662, + "step": 35344 + }, + { + "epoch": 1.6455758083665062, + "grad_norm": 0.3226487967913452, + "learning_rate": 5.0254620135429977e-05, + "loss": 2.6369, + "step": 35345 + }, + { + "epoch": 1.645622366552599, + "grad_norm": 0.36393729397962904, + "learning_rate": 5.025191143362762e-05, + "loss": 2.5487, + "step": 35346 + }, + { + "epoch": 1.6456689247386922, + "grad_norm": 0.3375766333720733, + "learning_rate": 5.024920273108595e-05, + "loss": 2.644, + "step": 35347 + }, + { + "epoch": 1.645715482924785, + "grad_norm": 0.31558680324350236, + "learning_rate": 5.024649402781287e-05, + "loss": 2.6929, + "step": 35348 + }, + { + "epoch": 1.6457620411108782, + "grad_norm": 0.34195693217925555, + "learning_rate": 5.024378532381637e-05, + "loss": 2.7253, + "step": 35349 + }, + { + "epoch": 1.6458085992969713, + "grad_norm": 0.3358920991646324, + "learning_rate": 5.0241076619104374e-05, + "loss": 2.6356, + "step": 35350 + }, + { + "epoch": 1.6458551574830644, + "grad_norm": 0.3140114827471121, + "learning_rate": 5.023836791368486e-05, + "loss": 2.6592, + "step": 35351 + }, + { + "epoch": 1.6459017156691576, + "grad_norm": 0.3619230605999099, + "learning_rate": 5.0235659207565754e-05, + "loss": 2.7245, + "step": 35352 + }, + { + "epoch": 1.6459482738552507, + "grad_norm": 0.31618075161012726, + "learning_rate": 5.023295050075499e-05, + "loss": 2.7621, + "step": 35353 + }, + { + "epoch": 1.6459948320413438, + "grad_norm": 0.33208401766165396, + "learning_rate": 5.0230241793260556e-05, + "loss": 2.7215, + "step": 35354 + }, + { + "epoch": 1.646041390227437, + "grad_norm": 0.3596515629968618, + "learning_rate": 5.0227533085090385e-05, + "loss": 2.7733, + "step": 35355 + }, + { + "epoch": 1.6460879484135298, + "grad_norm": 0.3266552665438587, + "learning_rate": 5.0224824376252435e-05, + "loss": 2.7581, + "step": 35356 + }, + { + "epoch": 1.646134506599623, + "grad_norm": 0.3435405092497209, + "learning_rate": 5.0222115666754645e-05, + "loss": 2.7422, + "step": 35357 + }, + { + "epoch": 1.6461810647857158, + "grad_norm": 0.335813995776203, + "learning_rate": 5.0219406956604965e-05, + "loss": 2.6811, + "step": 35358 + }, + { + "epoch": 1.646227622971809, + "grad_norm": 0.3182998192829178, + "learning_rate": 5.021669824581133e-05, + "loss": 2.8123, + "step": 35359 + }, + { + "epoch": 
1.646274181157902, + "grad_norm": 0.3633690941036622, + "learning_rate": 5.021398953438172e-05, + "loss": 2.7429, + "step": 35360 + }, + { + "epoch": 1.6463207393439951, + "grad_norm": 0.33083180541409757, + "learning_rate": 5.021128082232408e-05, + "loss": 2.7192, + "step": 35361 + }, + { + "epoch": 1.6463672975300883, + "grad_norm": 0.39573005371081676, + "learning_rate": 5.0208572109646344e-05, + "loss": 2.7565, + "step": 35362 + }, + { + "epoch": 1.6464138557161814, + "grad_norm": 0.3672418251521825, + "learning_rate": 5.020586339635647e-05, + "loss": 2.6419, + "step": 35363 + }, + { + "epoch": 1.6464604139022745, + "grad_norm": 0.3352946848431495, + "learning_rate": 5.0203154682462396e-05, + "loss": 2.6849, + "step": 35364 + }, + { + "epoch": 1.6465069720883674, + "grad_norm": 0.3651695897032387, + "learning_rate": 5.020044596797211e-05, + "loss": 2.7341, + "step": 35365 + }, + { + "epoch": 1.6465535302744605, + "grad_norm": 0.3463326893494442, + "learning_rate": 5.019773725289351e-05, + "loss": 2.662, + "step": 35366 + }, + { + "epoch": 1.6466000884605536, + "grad_norm": 0.3418963624334734, + "learning_rate": 5.019502853723458e-05, + "loss": 2.7827, + "step": 35367 + }, + { + "epoch": 1.6466466466466465, + "grad_norm": 0.3423326992475215, + "learning_rate": 5.0192319821003255e-05, + "loss": 2.756, + "step": 35368 + }, + { + "epoch": 1.6466932048327396, + "grad_norm": 0.3719948469512607, + "learning_rate": 5.0189611104207504e-05, + "loss": 2.6489, + "step": 35369 + }, + { + "epoch": 1.6467397630188327, + "grad_norm": 0.3408024970077524, + "learning_rate": 5.018690238685526e-05, + "loss": 2.6687, + "step": 35370 + }, + { + "epoch": 1.6467863212049259, + "grad_norm": 0.3410589207481585, + "learning_rate": 5.018419366895446e-05, + "loss": 2.7348, + "step": 35371 + }, + { + "epoch": 1.646832879391019, + "grad_norm": 0.3592833277394983, + "learning_rate": 5.018148495051308e-05, + "loss": 2.7387, + "step": 35372 + }, + { + "epoch": 1.646879437577112, + "grad_norm": 0.3462318481481633, + "learning_rate": 5.017877623153906e-05, + "loss": 2.7307, + "step": 35373 + }, + { + "epoch": 1.6469259957632052, + "grad_norm": 0.336302565068869, + "learning_rate": 5.0176067512040346e-05, + "loss": 2.6558, + "step": 35374 + }, + { + "epoch": 1.646972553949298, + "grad_norm": 0.36099385358475294, + "learning_rate": 5.0173358792024894e-05, + "loss": 2.7266, + "step": 35375 + }, + { + "epoch": 1.6470191121353912, + "grad_norm": 0.3341472485800953, + "learning_rate": 5.017065007150065e-05, + "loss": 2.6888, + "step": 35376 + }, + { + "epoch": 1.6470656703214843, + "grad_norm": 0.3551554718437426, + "learning_rate": 5.0167941350475566e-05, + "loss": 2.7412, + "step": 35377 + }, + { + "epoch": 1.6471122285075772, + "grad_norm": 0.3689516752389712, + "learning_rate": 5.016523262895758e-05, + "loss": 2.7112, + "step": 35378 + }, + { + "epoch": 1.6471587866936703, + "grad_norm": 0.33460701887007066, + "learning_rate": 5.016252390695467e-05, + "loss": 2.7647, + "step": 35379 + }, + { + "epoch": 1.6472053448797634, + "grad_norm": 0.3382716649196273, + "learning_rate": 5.0159815184474765e-05, + "loss": 2.7587, + "step": 35380 + }, + { + "epoch": 1.6472519030658566, + "grad_norm": 0.32804476794232557, + "learning_rate": 5.015710646152581e-05, + "loss": 2.6448, + "step": 35381 + }, + { + "epoch": 1.6472984612519497, + "grad_norm": 0.3544878935256628, + "learning_rate": 5.015439773811578e-05, + "loss": 2.6857, + "step": 35382 + }, + { + "epoch": 1.6473450194380428, + "grad_norm": 0.35589279704266436, + "learning_rate": 
5.01516890142526e-05, + "loss": 2.7564, + "step": 35383 + }, + { + "epoch": 1.647391577624136, + "grad_norm": 0.366539238809786, + "learning_rate": 5.014898028994421e-05, + "loss": 2.6844, + "step": 35384 + }, + { + "epoch": 1.6474381358102288, + "grad_norm": 0.36019700585553527, + "learning_rate": 5.014627156519859e-05, + "loss": 2.7371, + "step": 35385 + }, + { + "epoch": 1.647484693996322, + "grad_norm": 0.3490488498033873, + "learning_rate": 5.014356284002367e-05, + "loss": 2.7274, + "step": 35386 + }, + { + "epoch": 1.6475312521824148, + "grad_norm": 0.39795812351120013, + "learning_rate": 5.0140854114427416e-05, + "loss": 2.7822, + "step": 35387 + }, + { + "epoch": 1.647577810368508, + "grad_norm": 0.34958410817499347, + "learning_rate": 5.0138145388417776e-05, + "loss": 2.6954, + "step": 35388 + }, + { + "epoch": 1.647624368554601, + "grad_norm": 0.37206322163136485, + "learning_rate": 5.0135436662002686e-05, + "loss": 2.6279, + "step": 35389 + }, + { + "epoch": 1.6476709267406942, + "grad_norm": 0.36287530797257156, + "learning_rate": 5.013272793519009e-05, + "loss": 2.6265, + "step": 35390 + }, + { + "epoch": 1.6477174849267873, + "grad_norm": 0.35307292122611844, + "learning_rate": 5.0130019207987966e-05, + "loss": 2.6612, + "step": 35391 + }, + { + "epoch": 1.6477640431128804, + "grad_norm": 0.3511803734723684, + "learning_rate": 5.0127310480404254e-05, + "loss": 2.6816, + "step": 35392 + }, + { + "epoch": 1.6478106012989735, + "grad_norm": 0.34161689827387803, + "learning_rate": 5.0124601752446886e-05, + "loss": 2.6574, + "step": 35393 + }, + { + "epoch": 1.6478571594850666, + "grad_norm": 0.3541165307474355, + "learning_rate": 5.012189302412382e-05, + "loss": 2.7192, + "step": 35394 + }, + { + "epoch": 1.6479037176711595, + "grad_norm": 0.3220352055273965, + "learning_rate": 5.011918429544302e-05, + "loss": 2.6215, + "step": 35395 + }, + { + "epoch": 1.6479502758572526, + "grad_norm": 0.35459014837363395, + "learning_rate": 5.011647556641242e-05, + "loss": 2.6567, + "step": 35396 + }, + { + "epoch": 1.6479968340433455, + "grad_norm": 0.328808972638108, + "learning_rate": 5.0113766837039986e-05, + "loss": 2.5558, + "step": 35397 + }, + { + "epoch": 1.6480433922294386, + "grad_norm": 0.3340439021012825, + "learning_rate": 5.011105810733365e-05, + "loss": 2.7282, + "step": 35398 + }, + { + "epoch": 1.6480899504155317, + "grad_norm": 0.3376623455528665, + "learning_rate": 5.010834937730137e-05, + "loss": 2.6996, + "step": 35399 + }, + { + "epoch": 1.6481365086016249, + "grad_norm": 0.3478762593613399, + "learning_rate": 5.010564064695109e-05, + "loss": 2.6891, + "step": 35400 + }, + { + "epoch": 1.648183066787718, + "grad_norm": 0.3084328935422546, + "learning_rate": 5.0102931916290775e-05, + "loss": 2.8182, + "step": 35401 + }, + { + "epoch": 1.648229624973811, + "grad_norm": 0.3596068604106914, + "learning_rate": 5.0100223185328364e-05, + "loss": 2.7147, + "step": 35402 + }, + { + "epoch": 1.6482761831599042, + "grad_norm": 0.2997067027258049, + "learning_rate": 5.009751445407179e-05, + "loss": 2.7748, + "step": 35403 + }, + { + "epoch": 1.6483227413459973, + "grad_norm": 0.34475949660337346, + "learning_rate": 5.009480572252904e-05, + "loss": 2.7273, + "step": 35404 + }, + { + "epoch": 1.6483692995320902, + "grad_norm": 0.3515380948136072, + "learning_rate": 5.0092096990708035e-05, + "loss": 2.6738, + "step": 35405 + }, + { + "epoch": 1.6484158577181833, + "grad_norm": 0.342297448108117, + "learning_rate": 5.0089388258616746e-05, + "loss": 2.6965, + "step": 35406 + }, + { + 
"epoch": 1.6484624159042762, + "grad_norm": 0.35306827476647673, + "learning_rate": 5.00866795262631e-05, + "loss": 2.5582, + "step": 35407 + }, + { + "epoch": 1.6485089740903693, + "grad_norm": 0.34310785189642284, + "learning_rate": 5.008397079365507e-05, + "loss": 2.693, + "step": 35408 + }, + { + "epoch": 1.6485555322764625, + "grad_norm": 0.3307318376502081, + "learning_rate": 5.008126206080058e-05, + "loss": 2.7003, + "step": 35409 + }, + { + "epoch": 1.6486020904625556, + "grad_norm": 0.3374486525160653, + "learning_rate": 5.007855332770761e-05, + "loss": 2.7081, + "step": 35410 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.349548085749209, + "learning_rate": 5.007584459438408e-05, + "loss": 2.8586, + "step": 35411 + }, + { + "epoch": 1.6486952068347418, + "grad_norm": 0.33577257218008627, + "learning_rate": 5.007313586083795e-05, + "loss": 2.6857, + "step": 35412 + }, + { + "epoch": 1.648741765020835, + "grad_norm": 0.341555046925508, + "learning_rate": 5.0070427127077176e-05, + "loss": 2.7482, + "step": 35413 + }, + { + "epoch": 1.6487883232069278, + "grad_norm": 0.36667797865157736, + "learning_rate": 5.006771839310971e-05, + "loss": 2.8099, + "step": 35414 + }, + { + "epoch": 1.648834881393021, + "grad_norm": 0.36247058498042856, + "learning_rate": 5.006500965894351e-05, + "loss": 2.7847, + "step": 35415 + }, + { + "epoch": 1.648881439579114, + "grad_norm": 0.3412116203866481, + "learning_rate": 5.0062300924586495e-05, + "loss": 2.7057, + "step": 35416 + }, + { + "epoch": 1.648927997765207, + "grad_norm": 0.39578833649718886, + "learning_rate": 5.005959219004664e-05, + "loss": 2.6729, + "step": 35417 + }, + { + "epoch": 1.6489745559513, + "grad_norm": 0.32565098741131476, + "learning_rate": 5.005688345533187e-05, + "loss": 2.6669, + "step": 35418 + }, + { + "epoch": 1.6490211141373932, + "grad_norm": 0.38921357194023165, + "learning_rate": 5.005417472045018e-05, + "loss": 2.7297, + "step": 35419 + }, + { + "epoch": 1.6490676723234863, + "grad_norm": 0.3402820066481171, + "learning_rate": 5.005146598540947e-05, + "loss": 2.7113, + "step": 35420 + }, + { + "epoch": 1.6491142305095794, + "grad_norm": 0.3757695081324369, + "learning_rate": 5.004875725021774e-05, + "loss": 2.7463, + "step": 35421 + }, + { + "epoch": 1.6491607886956725, + "grad_norm": 0.33832526369313454, + "learning_rate": 5.0046048514882895e-05, + "loss": 2.6798, + "step": 35422 + }, + { + "epoch": 1.6492073468817656, + "grad_norm": 0.3403332953358415, + "learning_rate": 5.004333977941289e-05, + "loss": 2.738, + "step": 35423 + }, + { + "epoch": 1.6492539050678585, + "grad_norm": 0.35511310583224365, + "learning_rate": 5.004063104381571e-05, + "loss": 2.6985, + "step": 35424 + }, + { + "epoch": 1.6493004632539516, + "grad_norm": 0.3318734429273951, + "learning_rate": 5.0037922308099274e-05, + "loss": 2.6993, + "step": 35425 + }, + { + "epoch": 1.6493470214400447, + "grad_norm": 0.3778462020239965, + "learning_rate": 5.003521357227153e-05, + "loss": 2.7283, + "step": 35426 + }, + { + "epoch": 1.6493935796261376, + "grad_norm": 0.34113311928263507, + "learning_rate": 5.003250483634044e-05, + "loss": 2.7429, + "step": 35427 + }, + { + "epoch": 1.6494401378122308, + "grad_norm": 0.36166703255134725, + "learning_rate": 5.0029796100313965e-05, + "loss": 2.6322, + "step": 35428 + }, + { + "epoch": 1.6494866959983239, + "grad_norm": 0.32186110007695207, + "learning_rate": 5.002708736420003e-05, + "loss": 2.6562, + "step": 35429 + }, + { + "epoch": 1.649533254184417, + "grad_norm": 0.35842054799187956, + 
"learning_rate": 5.002437862800659e-05, + "loss": 2.6789, + "step": 35430 + }, + { + "epoch": 1.64957981237051, + "grad_norm": 0.33403583274730764, + "learning_rate": 5.002166989174161e-05, + "loss": 2.713, + "step": 35431 + }, + { + "epoch": 1.6496263705566032, + "grad_norm": 0.3321750185824423, + "learning_rate": 5.001896115541303e-05, + "loss": 2.7507, + "step": 35432 + }, + { + "epoch": 1.6496729287426963, + "grad_norm": 0.3476776974982732, + "learning_rate": 5.001625241902881e-05, + "loss": 2.6237, + "step": 35433 + }, + { + "epoch": 1.6497194869287892, + "grad_norm": 0.3389562651559761, + "learning_rate": 5.0013543682596885e-05, + "loss": 2.7412, + "step": 35434 + }, + { + "epoch": 1.6497660451148823, + "grad_norm": 0.3655729229208266, + "learning_rate": 5.001083494612521e-05, + "loss": 2.747, + "step": 35435 + }, + { + "epoch": 1.6498126033009752, + "grad_norm": 0.3416287590849781, + "learning_rate": 5.000812620962173e-05, + "loss": 2.8305, + "step": 35436 + }, + { + "epoch": 1.6498591614870683, + "grad_norm": 0.366693165394653, + "learning_rate": 5.000541747309442e-05, + "loss": 2.8336, + "step": 35437 + }, + { + "epoch": 1.6499057196731615, + "grad_norm": 0.368920210970277, + "learning_rate": 5.0002708736551184e-05, + "loss": 2.7711, + "step": 35438 + }, + { + "epoch": 1.6499522778592546, + "grad_norm": 0.3615519968017808, + "learning_rate": 5e-05, + "loss": 2.6808, + "step": 35439 + }, + { + "epoch": 1.6499988360453477, + "grad_norm": 0.3568128824495376, + "learning_rate": 4.9997291263448834e-05, + "loss": 2.7565, + "step": 35440 + }, + { + "epoch": 1.6500453942314408, + "grad_norm": 0.33929456525400076, + "learning_rate": 4.9994582526905594e-05, + "loss": 2.8526, + "step": 35441 + }, + { + "epoch": 1.650091952417534, + "grad_norm": 0.3433821434527241, + "learning_rate": 4.999187379037828e-05, + "loss": 2.7353, + "step": 35442 + }, + { + "epoch": 1.650138510603627, + "grad_norm": 0.35961878946972825, + "learning_rate": 4.99891650538748e-05, + "loss": 2.756, + "step": 35443 + }, + { + "epoch": 1.65018506878972, + "grad_norm": 0.3492497812538368, + "learning_rate": 4.998645631740312e-05, + "loss": 2.7636, + "step": 35444 + }, + { + "epoch": 1.650231626975813, + "grad_norm": 0.33355253455455297, + "learning_rate": 4.99837475809712e-05, + "loss": 2.705, + "step": 35445 + }, + { + "epoch": 1.650278185161906, + "grad_norm": 0.34873022431279493, + "learning_rate": 4.998103884458697e-05, + "loss": 2.6514, + "step": 35446 + }, + { + "epoch": 1.650324743347999, + "grad_norm": 0.3263468547735754, + "learning_rate": 4.99783301082584e-05, + "loss": 2.6913, + "step": 35447 + }, + { + "epoch": 1.6503713015340922, + "grad_norm": 0.3626474850075452, + "learning_rate": 4.9975621371993405e-05, + "loss": 2.8035, + "step": 35448 + }, + { + "epoch": 1.6504178597201853, + "grad_norm": 0.356763213232846, + "learning_rate": 4.997291263579998e-05, + "loss": 2.6744, + "step": 35449 + }, + { + "epoch": 1.6504644179062784, + "grad_norm": 0.344116838028858, + "learning_rate": 4.997020389968606e-05, + "loss": 2.7423, + "step": 35450 + }, + { + "epoch": 1.6505109760923715, + "grad_norm": 0.3495189558180555, + "learning_rate": 4.9967495163659564e-05, + "loss": 2.6818, + "step": 35451 + }, + { + "epoch": 1.6505575342784646, + "grad_norm": 0.35895409521260446, + "learning_rate": 4.9964786427728485e-05, + "loss": 2.7366, + "step": 35452 + }, + { + "epoch": 1.6506040924645575, + "grad_norm": 0.3444098847675817, + "learning_rate": 4.996207769190076e-05, + "loss": 2.7008, + "step": 35453 + }, + { + "epoch": 
1.6506506506506506, + "grad_norm": 0.3464035894075358, + "learning_rate": 4.99593689561843e-05, + "loss": 2.6475, + "step": 35454 + }, + { + "epoch": 1.6506972088367438, + "grad_norm": 0.37292817996335675, + "learning_rate": 4.995666022058712e-05, + "loss": 2.7239, + "step": 35455 + }, + { + "epoch": 1.6507437670228366, + "grad_norm": 0.32781477842821044, + "learning_rate": 4.995395148511712e-05, + "loss": 2.6144, + "step": 35456 + }, + { + "epoch": 1.6507903252089298, + "grad_norm": 0.3591541685181718, + "learning_rate": 4.9951242749782274e-05, + "loss": 2.5883, + "step": 35457 + }, + { + "epoch": 1.6508368833950229, + "grad_norm": 0.3300462498265742, + "learning_rate": 4.994853401459054e-05, + "loss": 2.7092, + "step": 35458 + }, + { + "epoch": 1.650883441581116, + "grad_norm": 0.3210217761390132, + "learning_rate": 4.9945825279549835e-05, + "loss": 2.7061, + "step": 35459 + }, + { + "epoch": 1.650929999767209, + "grad_norm": 0.3669799736405722, + "learning_rate": 4.994311654466814e-05, + "loss": 2.74, + "step": 35460 + }, + { + "epoch": 1.6509765579533022, + "grad_norm": 0.3649803217394291, + "learning_rate": 4.994040780995337e-05, + "loss": 2.7199, + "step": 35461 + }, + { + "epoch": 1.6510231161393953, + "grad_norm": 0.3497250637967925, + "learning_rate": 4.993769907541352e-05, + "loss": 2.7479, + "step": 35462 + }, + { + "epoch": 1.6510696743254882, + "grad_norm": 0.37631246730876095, + "learning_rate": 4.9934990341056523e-05, + "loss": 2.7462, + "step": 35463 + }, + { + "epoch": 1.6511162325115813, + "grad_norm": 0.34336354025112026, + "learning_rate": 4.993228160689029e-05, + "loss": 2.851, + "step": 35464 + }, + { + "epoch": 1.6511627906976745, + "grad_norm": 0.36479567500766785, + "learning_rate": 4.992957287292283e-05, + "loss": 2.8335, + "step": 35465 + }, + { + "epoch": 1.6512093488837674, + "grad_norm": 0.3422858205127276, + "learning_rate": 4.992686413916205e-05, + "loss": 2.742, + "step": 35466 + }, + { + "epoch": 1.6512559070698605, + "grad_norm": 0.3636586819631555, + "learning_rate": 4.992415540561593e-05, + "loss": 2.6563, + "step": 35467 + }, + { + "epoch": 1.6513024652559536, + "grad_norm": 0.32153962196037345, + "learning_rate": 4.9921446672292414e-05, + "loss": 2.8024, + "step": 35468 + }, + { + "epoch": 1.6513490234420467, + "grad_norm": 0.3595828640769853, + "learning_rate": 4.991873793919942e-05, + "loss": 2.7618, + "step": 35469 + }, + { + "epoch": 1.6513955816281398, + "grad_norm": 0.3504047332982069, + "learning_rate": 4.9916029206344945e-05, + "loss": 2.6904, + "step": 35470 + }, + { + "epoch": 1.651442139814233, + "grad_norm": 0.3652817239458166, + "learning_rate": 4.991332047373691e-05, + "loss": 2.8126, + "step": 35471 + }, + { + "epoch": 1.651488698000326, + "grad_norm": 0.3300038838590217, + "learning_rate": 4.9910611741383266e-05, + "loss": 2.7605, + "step": 35472 + }, + { + "epoch": 1.651535256186419, + "grad_norm": 0.39347244546615323, + "learning_rate": 4.990790300929198e-05, + "loss": 2.7255, + "step": 35473 + }, + { + "epoch": 1.651581814372512, + "grad_norm": 0.3515230774183889, + "learning_rate": 4.990519427747096e-05, + "loss": 2.7908, + "step": 35474 + }, + { + "epoch": 1.651628372558605, + "grad_norm": 0.37175606318586196, + "learning_rate": 4.9902485545928216e-05, + "loss": 2.7818, + "step": 35475 + }, + { + "epoch": 1.651674930744698, + "grad_norm": 0.38428558229095083, + "learning_rate": 4.989977681467166e-05, + "loss": 2.6893, + "step": 35476 + }, + { + "epoch": 1.6517214889307912, + "grad_norm": 0.36094554764742465, + "learning_rate": 
4.989706808370923e-05, + "loss": 2.6457, + "step": 35477 + }, + { + "epoch": 1.6517680471168843, + "grad_norm": 0.3650112644913946, + "learning_rate": 4.989435935304892e-05, + "loss": 2.7509, + "step": 35478 + }, + { + "epoch": 1.6518146053029774, + "grad_norm": 0.3342886156574446, + "learning_rate": 4.9891650622698635e-05, + "loss": 2.734, + "step": 35479 + }, + { + "epoch": 1.6518611634890705, + "grad_norm": 0.3345223832587283, + "learning_rate": 4.988894189266636e-05, + "loss": 2.7449, + "step": 35480 + }, + { + "epoch": 1.6519077216751636, + "grad_norm": 0.339381589767051, + "learning_rate": 4.9886233162960025e-05, + "loss": 2.689, + "step": 35481 + }, + { + "epoch": 1.6519542798612568, + "grad_norm": 0.365513747776131, + "learning_rate": 4.988352443358758e-05, + "loss": 2.6381, + "step": 35482 + }, + { + "epoch": 1.6520008380473497, + "grad_norm": 0.33309735308191457, + "learning_rate": 4.9880815704556984e-05, + "loss": 2.7713, + "step": 35483 + }, + { + "epoch": 1.6520473962334428, + "grad_norm": 0.3543745021217987, + "learning_rate": 4.9878106975876175e-05, + "loss": 2.5892, + "step": 35484 + }, + { + "epoch": 1.6520939544195357, + "grad_norm": 0.3650497900406032, + "learning_rate": 4.987539824755313e-05, + "loss": 2.6441, + "step": 35485 + }, + { + "epoch": 1.6521405126056288, + "grad_norm": 0.3242646020340461, + "learning_rate": 4.987268951959577e-05, + "loss": 2.6509, + "step": 35486 + }, + { + "epoch": 1.652187070791722, + "grad_norm": 0.3913403433376814, + "learning_rate": 4.986998079201203e-05, + "loss": 2.7556, + "step": 35487 + }, + { + "epoch": 1.652233628977815, + "grad_norm": 0.31596730621552366, + "learning_rate": 4.986727206480991e-05, + "loss": 2.607, + "step": 35488 + }, + { + "epoch": 1.6522801871639081, + "grad_norm": 0.36665786431324854, + "learning_rate": 4.986456333799734e-05, + "loss": 2.7081, + "step": 35489 + }, + { + "epoch": 1.6523267453500012, + "grad_norm": 0.32585147318665464, + "learning_rate": 4.986185461158223e-05, + "loss": 2.801, + "step": 35490 + }, + { + "epoch": 1.6523733035360944, + "grad_norm": 0.3146673577248649, + "learning_rate": 4.985914588557259e-05, + "loss": 2.7426, + "step": 35491 + }, + { + "epoch": 1.6524198617221875, + "grad_norm": 0.3407141504495424, + "learning_rate": 4.985643715997632e-05, + "loss": 2.654, + "step": 35492 + }, + { + "epoch": 1.6524664199082804, + "grad_norm": 0.3394319557527517, + "learning_rate": 4.9853728434801416e-05, + "loss": 2.737, + "step": 35493 + }, + { + "epoch": 1.6525129780943735, + "grad_norm": 0.32451631876371045, + "learning_rate": 4.9851019710055807e-05, + "loss": 2.6409, + "step": 35494 + }, + { + "epoch": 1.6525595362804664, + "grad_norm": 0.31789941868479227, + "learning_rate": 4.9848310985747416e-05, + "loss": 2.7571, + "step": 35495 + }, + { + "epoch": 1.6526060944665595, + "grad_norm": 0.3571400369807405, + "learning_rate": 4.984560226188424e-05, + "loss": 2.7175, + "step": 35496 + }, + { + "epoch": 1.6526526526526526, + "grad_norm": 0.3426842713994648, + "learning_rate": 4.984289353847419e-05, + "loss": 2.7979, + "step": 35497 + }, + { + "epoch": 1.6526992108387457, + "grad_norm": 0.3344943561373813, + "learning_rate": 4.984018481552525e-05, + "loss": 2.6336, + "step": 35498 + }, + { + "epoch": 1.6527457690248388, + "grad_norm": 0.3611225120425777, + "learning_rate": 4.983747609304535e-05, + "loss": 2.663, + "step": 35499 + }, + { + "epoch": 1.652792327210932, + "grad_norm": 0.33252597590274585, + "learning_rate": 4.983476737104242e-05, + "loss": 2.7723, + "step": 35500 + }, + { + "epoch": 
1.652838885397025, + "grad_norm": 0.3638523803544161, + "learning_rate": 4.9832058649524446e-05, + "loss": 2.6955, + "step": 35501 + }, + { + "epoch": 1.652885443583118, + "grad_norm": 0.34370158024572334, + "learning_rate": 4.9829349928499344e-05, + "loss": 2.6637, + "step": 35502 + }, + { + "epoch": 1.652932001769211, + "grad_norm": 0.353080924757812, + "learning_rate": 4.982664120797511e-05, + "loss": 2.6944, + "step": 35503 + }, + { + "epoch": 1.6529785599553042, + "grad_norm": 0.34956524269424, + "learning_rate": 4.9823932487959665e-05, + "loss": 2.6984, + "step": 35504 + }, + { + "epoch": 1.653025118141397, + "grad_norm": 0.32996787005031564, + "learning_rate": 4.9821223768460944e-05, + "loss": 2.7268, + "step": 35505 + }, + { + "epoch": 1.6530716763274902, + "grad_norm": 0.33705940502485926, + "learning_rate": 4.981851504948693e-05, + "loss": 2.633, + "step": 35506 + }, + { + "epoch": 1.6531182345135833, + "grad_norm": 0.33961588289701083, + "learning_rate": 4.9815806331045556e-05, + "loss": 2.6647, + "step": 35507 + }, + { + "epoch": 1.6531647926996764, + "grad_norm": 0.31536533610224615, + "learning_rate": 4.981309761314475e-05, + "loss": 2.6226, + "step": 35508 + }, + { + "epoch": 1.6532113508857695, + "grad_norm": 0.33255110497573265, + "learning_rate": 4.981038889579251e-05, + "loss": 2.5979, + "step": 35509 + }, + { + "epoch": 1.6532579090718627, + "grad_norm": 0.36429236908051654, + "learning_rate": 4.980768017899674e-05, + "loss": 2.7537, + "step": 35510 + }, + { + "epoch": 1.6533044672579558, + "grad_norm": 0.33556105776833445, + "learning_rate": 4.9804971462765435e-05, + "loss": 2.7542, + "step": 35511 + }, + { + "epoch": 1.6533510254440487, + "grad_norm": 0.3540622410852008, + "learning_rate": 4.980226274710651e-05, + "loss": 2.7729, + "step": 35512 + }, + { + "epoch": 1.6533975836301418, + "grad_norm": 0.3127599264825399, + "learning_rate": 4.9799554032027906e-05, + "loss": 2.6799, + "step": 35513 + }, + { + "epoch": 1.653444141816235, + "grad_norm": 0.33479288252697725, + "learning_rate": 4.979684531753761e-05, + "loss": 2.7717, + "step": 35514 + }, + { + "epoch": 1.6534907000023278, + "grad_norm": 0.36314972656697575, + "learning_rate": 4.979413660364353e-05, + "loss": 2.5729, + "step": 35515 + }, + { + "epoch": 1.653537258188421, + "grad_norm": 0.34466281819013456, + "learning_rate": 4.979142789035367e-05, + "loss": 2.7662, + "step": 35516 + }, + { + "epoch": 1.653583816374514, + "grad_norm": 0.39833573608675826, + "learning_rate": 4.978871917767594e-05, + "loss": 2.6616, + "step": 35517 + }, + { + "epoch": 1.6536303745606071, + "grad_norm": 0.34825248831342154, + "learning_rate": 4.9786010465618275e-05, + "loss": 2.7522, + "step": 35518 + }, + { + "epoch": 1.6536769327467002, + "grad_norm": 0.3794009070403413, + "learning_rate": 4.9783301754188674e-05, + "loss": 2.5995, + "step": 35519 + }, + { + "epoch": 1.6537234909327934, + "grad_norm": 0.35690570707066943, + "learning_rate": 4.978059304339505e-05, + "loss": 2.7511, + "step": 35520 + }, + { + "epoch": 1.6537700491188865, + "grad_norm": 0.3469298255334566, + "learning_rate": 4.977788433324536e-05, + "loss": 2.6901, + "step": 35521 + }, + { + "epoch": 1.6538166073049794, + "grad_norm": 0.3605013130358051, + "learning_rate": 4.977517562374758e-05, + "loss": 2.7807, + "step": 35522 + }, + { + "epoch": 1.6538631654910725, + "grad_norm": 0.30376909738553354, + "learning_rate": 4.977246691490961e-05, + "loss": 2.508, + "step": 35523 + }, + { + "epoch": 1.6539097236771654, + "grad_norm": 0.358756229157362, + 
"learning_rate": 4.976975820673945e-05, + "loss": 2.7409, + "step": 35524 + }, + { + "epoch": 1.6539562818632585, + "grad_norm": 0.33638265715970234, + "learning_rate": 4.976704949924502e-05, + "loss": 2.7458, + "step": 35525 + }, + { + "epoch": 1.6540028400493516, + "grad_norm": 0.31667633083418684, + "learning_rate": 4.9764340792434264e-05, + "loss": 2.7336, + "step": 35526 + }, + { + "epoch": 1.6540493982354447, + "grad_norm": 0.36899951172440015, + "learning_rate": 4.976163208631516e-05, + "loss": 2.6605, + "step": 35527 + }, + { + "epoch": 1.6540959564215378, + "grad_norm": 0.349547928963164, + "learning_rate": 4.975892338089562e-05, + "loss": 2.7352, + "step": 35528 + }, + { + "epoch": 1.654142514607631, + "grad_norm": 0.32633997030081074, + "learning_rate": 4.975621467618364e-05, + "loss": 2.6666, + "step": 35529 + }, + { + "epoch": 1.654189072793724, + "grad_norm": 0.37480265693015513, + "learning_rate": 4.9753505972187145e-05, + "loss": 2.6534, + "step": 35530 + }, + { + "epoch": 1.6542356309798172, + "grad_norm": 0.36334362295132305, + "learning_rate": 4.9750797268914065e-05, + "loss": 2.614, + "step": 35531 + }, + { + "epoch": 1.65428218916591, + "grad_norm": 0.35285656418741174, + "learning_rate": 4.974808856637239e-05, + "loss": 2.7721, + "step": 35532 + }, + { + "epoch": 1.6543287473520032, + "grad_norm": 0.355216303902282, + "learning_rate": 4.974537986457004e-05, + "loss": 2.7196, + "step": 35533 + }, + { + "epoch": 1.654375305538096, + "grad_norm": 0.3125120592836062, + "learning_rate": 4.9742671163514975e-05, + "loss": 2.6605, + "step": 35534 + }, + { + "epoch": 1.6544218637241892, + "grad_norm": 0.34550560165199823, + "learning_rate": 4.973996246321516e-05, + "loss": 2.6269, + "step": 35535 + }, + { + "epoch": 1.6544684219102823, + "grad_norm": 0.35799439572773945, + "learning_rate": 4.973725376367851e-05, + "loss": 2.8223, + "step": 35536 + }, + { + "epoch": 1.6545149800963754, + "grad_norm": 0.3377105258584982, + "learning_rate": 4.9734545064913006e-05, + "loss": 2.6781, + "step": 35537 + }, + { + "epoch": 1.6545615382824685, + "grad_norm": 0.3781772761121045, + "learning_rate": 4.973183636692657e-05, + "loss": 2.7669, + "step": 35538 + }, + { + "epoch": 1.6546080964685617, + "grad_norm": 0.36300819238603016, + "learning_rate": 4.972912766972719e-05, + "loss": 2.6952, + "step": 35539 + }, + { + "epoch": 1.6546546546546548, + "grad_norm": 0.354077539824545, + "learning_rate": 4.972641897332279e-05, + "loss": 2.8141, + "step": 35540 + }, + { + "epoch": 1.6547012128407477, + "grad_norm": 0.36023919500457763, + "learning_rate": 4.97237102777213e-05, + "loss": 2.7636, + "step": 35541 + }, + { + "epoch": 1.6547477710268408, + "grad_norm": 0.3698873339127286, + "learning_rate": 4.972100158293072e-05, + "loss": 2.7192, + "step": 35542 + }, + { + "epoch": 1.654794329212934, + "grad_norm": 0.3367604770097153, + "learning_rate": 4.971829288895897e-05, + "loss": 2.6882, + "step": 35543 + }, + { + "epoch": 1.6548408873990268, + "grad_norm": 0.3712396541422612, + "learning_rate": 4.9715584195813985e-05, + "loss": 2.7, + "step": 35544 + }, + { + "epoch": 1.65488744558512, + "grad_norm": 0.37422697058073473, + "learning_rate": 4.9712875503503754e-05, + "loss": 2.7748, + "step": 35545 + }, + { + "epoch": 1.654934003771213, + "grad_norm": 0.343472737911686, + "learning_rate": 4.971016681203619e-05, + "loss": 2.8197, + "step": 35546 + }, + { + "epoch": 1.6549805619573061, + "grad_norm": 0.35567600271451094, + "learning_rate": 4.970745812141926e-05, + "loss": 2.5949, + "step": 35547 + }, 
+ { + "epoch": 1.6550271201433993, + "grad_norm": 0.3348023177514698, + "learning_rate": 4.970474943166092e-05, + "loss": 2.5985, + "step": 35548 + }, + { + "epoch": 1.6550736783294924, + "grad_norm": 0.34473663420568945, + "learning_rate": 4.97020407427691e-05, + "loss": 2.7559, + "step": 35549 + }, + { + "epoch": 1.6551202365155855, + "grad_norm": 0.34705801985683915, + "learning_rate": 4.969933205475177e-05, + "loss": 2.6769, + "step": 35550 + }, + { + "epoch": 1.6551667947016784, + "grad_norm": 0.3318088057698893, + "learning_rate": 4.969662336761686e-05, + "loss": 2.7103, + "step": 35551 + }, + { + "epoch": 1.6552133528877715, + "grad_norm": 0.34599131647029724, + "learning_rate": 4.9693914681372344e-05, + "loss": 2.779, + "step": 35552 + }, + { + "epoch": 1.6552599110738646, + "grad_norm": 0.34193468752583994, + "learning_rate": 4.969120599602616e-05, + "loss": 2.7282, + "step": 35553 + }, + { + "epoch": 1.6553064692599575, + "grad_norm": 0.33936653458517746, + "learning_rate": 4.968849731158623e-05, + "loss": 2.7074, + "step": 35554 + }, + { + "epoch": 1.6553530274460506, + "grad_norm": 0.3336255496201527, + "learning_rate": 4.9685788628060556e-05, + "loss": 2.6203, + "step": 35555 + }, + { + "epoch": 1.6553995856321437, + "grad_norm": 0.3214515950111635, + "learning_rate": 4.968307994545704e-05, + "loss": 2.6956, + "step": 35556 + }, + { + "epoch": 1.6554461438182368, + "grad_norm": 0.3480421920999034, + "learning_rate": 4.968037126378367e-05, + "loss": 2.6811, + "step": 35557 + }, + { + "epoch": 1.65549270200433, + "grad_norm": 0.34179061176066305, + "learning_rate": 4.967766258304838e-05, + "loss": 2.7219, + "step": 35558 + }, + { + "epoch": 1.655539260190423, + "grad_norm": 0.33030268257008655, + "learning_rate": 4.9674953903259116e-05, + "loss": 2.7547, + "step": 35559 + }, + { + "epoch": 1.6555858183765162, + "grad_norm": 0.3265094420263703, + "learning_rate": 4.9672245224423824e-05, + "loss": 2.6245, + "step": 35560 + }, + { + "epoch": 1.655632376562609, + "grad_norm": 0.3454451075100612, + "learning_rate": 4.966953654655047e-05, + "loss": 2.8714, + "step": 35561 + }, + { + "epoch": 1.6556789347487022, + "grad_norm": 0.32872951486809515, + "learning_rate": 4.966682786964699e-05, + "loss": 2.7668, + "step": 35562 + }, + { + "epoch": 1.655725492934795, + "grad_norm": 0.34432302696384753, + "learning_rate": 4.966411919372134e-05, + "loss": 2.6985, + "step": 35563 + }, + { + "epoch": 1.6557720511208882, + "grad_norm": 0.37383833925280013, + "learning_rate": 4.966141051878145e-05, + "loss": 2.7771, + "step": 35564 + }, + { + "epoch": 1.6558186093069813, + "grad_norm": 0.33445356869193993, + "learning_rate": 4.9658701844835306e-05, + "loss": 2.7678, + "step": 35565 + }, + { + "epoch": 1.6558651674930744, + "grad_norm": 0.37211849105963596, + "learning_rate": 4.9655993171890837e-05, + "loss": 2.6868, + "step": 35566 + }, + { + "epoch": 1.6559117256791676, + "grad_norm": 0.36587494114020186, + "learning_rate": 4.965328449995597e-05, + "loss": 2.7456, + "step": 35567 + }, + { + "epoch": 1.6559582838652607, + "grad_norm": 0.33254381041440284, + "learning_rate": 4.965057582903871e-05, + "loss": 2.6867, + "step": 35568 + }, + { + "epoch": 1.6560048420513538, + "grad_norm": 0.35236232832522213, + "learning_rate": 4.9647867159146946e-05, + "loss": 2.6479, + "step": 35569 + }, + { + "epoch": 1.656051400237447, + "grad_norm": 0.3492480374484889, + "learning_rate": 4.964515849028868e-05, + "loss": 2.6352, + "step": 35570 + }, + { + "epoch": 1.6560979584235398, + "grad_norm": 
0.358573945509011, + "learning_rate": 4.964244982247184e-05, + "loss": 2.6055, + "step": 35571 + }, + { + "epoch": 1.656144516609633, + "grad_norm": 0.34237110742829624, + "learning_rate": 4.9639741155704365e-05, + "loss": 2.6986, + "step": 35572 + }, + { + "epoch": 1.6561910747957258, + "grad_norm": 0.34514877295280866, + "learning_rate": 4.9637032489994215e-05, + "loss": 2.7481, + "step": 35573 + }, + { + "epoch": 1.656237632981819, + "grad_norm": 0.3675029722438288, + "learning_rate": 4.9634323825349334e-05, + "loss": 2.6683, + "step": 35574 + }, + { + "epoch": 1.656284191167912, + "grad_norm": 0.341203646311484, + "learning_rate": 4.963161516177769e-05, + "loss": 2.743, + "step": 35575 + }, + { + "epoch": 1.6563307493540051, + "grad_norm": 0.3538989264031724, + "learning_rate": 4.962890649928722e-05, + "loss": 2.6591, + "step": 35576 + }, + { + "epoch": 1.6563773075400983, + "grad_norm": 0.33687711141319454, + "learning_rate": 4.962619783788585e-05, + "loss": 2.6866, + "step": 35577 + }, + { + "epoch": 1.6564238657261914, + "grad_norm": 0.35253499215966466, + "learning_rate": 4.9623489177581576e-05, + "loss": 2.6894, + "step": 35578 + }, + { + "epoch": 1.6564704239122845, + "grad_norm": 0.32659783050199503, + "learning_rate": 4.9620780518382324e-05, + "loss": 2.706, + "step": 35579 + }, + { + "epoch": 1.6565169820983776, + "grad_norm": 0.3202685170099192, + "learning_rate": 4.9618071860296025e-05, + "loss": 2.6233, + "step": 35580 + }, + { + "epoch": 1.6565635402844705, + "grad_norm": 0.3561576053989282, + "learning_rate": 4.9615363203330664e-05, + "loss": 2.645, + "step": 35581 + }, + { + "epoch": 1.6566100984705636, + "grad_norm": 0.3229627524100912, + "learning_rate": 4.961265454749415e-05, + "loss": 2.7084, + "step": 35582 + }, + { + "epoch": 1.6566566566566565, + "grad_norm": 0.3496913978734253, + "learning_rate": 4.960994589279448e-05, + "loss": 2.7174, + "step": 35583 + }, + { + "epoch": 1.6567032148427496, + "grad_norm": 0.35883133120607097, + "learning_rate": 4.960723723923959e-05, + "loss": 2.7252, + "step": 35584 + }, + { + "epoch": 1.6567497730288427, + "grad_norm": 0.35359009765321103, + "learning_rate": 4.96045285868374e-05, + "loss": 2.7258, + "step": 35585 + }, + { + "epoch": 1.6567963312149359, + "grad_norm": 0.35830491840750206, + "learning_rate": 4.9601819935595875e-05, + "loss": 2.8113, + "step": 35586 + }, + { + "epoch": 1.656842889401029, + "grad_norm": 0.3885734948540093, + "learning_rate": 4.959911128552298e-05, + "loss": 2.7267, + "step": 35587 + }, + { + "epoch": 1.656889447587122, + "grad_norm": 0.3391570739808556, + "learning_rate": 4.959640263662666e-05, + "loss": 2.7706, + "step": 35588 + }, + { + "epoch": 1.6569360057732152, + "grad_norm": 0.34938818410937966, + "learning_rate": 4.9593693988914856e-05, + "loss": 2.7422, + "step": 35589 + }, + { + "epoch": 1.656982563959308, + "grad_norm": 0.37202075421105785, + "learning_rate": 4.959098534239551e-05, + "loss": 2.6871, + "step": 35590 + }, + { + "epoch": 1.6570291221454012, + "grad_norm": 0.33598606515184654, + "learning_rate": 4.9588276697076594e-05, + "loss": 2.7661, + "step": 35591 + }, + { + "epoch": 1.6570756803314943, + "grad_norm": 0.3538399032224721, + "learning_rate": 4.958556805296602e-05, + "loss": 2.6049, + "step": 35592 + }, + { + "epoch": 1.6571222385175872, + "grad_norm": 0.33522786695755064, + "learning_rate": 4.9582859410071795e-05, + "loss": 2.6855, + "step": 35593 + }, + { + "epoch": 1.6571687967036803, + "grad_norm": 0.35927729180054074, + "learning_rate": 4.958015076840183e-05, + 
"loss": 2.6831, + "step": 35594 + }, + { + "epoch": 1.6572153548897735, + "grad_norm": 0.39086984684197906, + "learning_rate": 4.957744212796406e-05, + "loss": 2.7204, + "step": 35595 + }, + { + "epoch": 1.6572619130758666, + "grad_norm": 0.34510229949995985, + "learning_rate": 4.957473348876647e-05, + "loss": 2.8137, + "step": 35596 + }, + { + "epoch": 1.6573084712619597, + "grad_norm": 0.37333152646151285, + "learning_rate": 4.957202485081701e-05, + "loss": 2.5763, + "step": 35597 + }, + { + "epoch": 1.6573550294480528, + "grad_norm": 0.3679573190274556, + "learning_rate": 4.95693162141236e-05, + "loss": 2.8272, + "step": 35598 + }, + { + "epoch": 1.657401587634146, + "grad_norm": 0.3340049861933349, + "learning_rate": 4.95666075786942e-05, + "loss": 2.7584, + "step": 35599 + }, + { + "epoch": 1.6574481458202388, + "grad_norm": 0.37296443966000137, + "learning_rate": 4.9563898944536756e-05, + "loss": 2.6466, + "step": 35600 + }, + { + "epoch": 1.657494704006332, + "grad_norm": 0.33060500515181507, + "learning_rate": 4.956119031165925e-05, + "loss": 2.7258, + "step": 35601 + }, + { + "epoch": 1.657541262192425, + "grad_norm": 0.3636431439304514, + "learning_rate": 4.95584816800696e-05, + "loss": 2.6794, + "step": 35602 + }, + { + "epoch": 1.657587820378518, + "grad_norm": 0.39198441927582617, + "learning_rate": 4.9555773049775746e-05, + "loss": 2.724, + "step": 35603 + }, + { + "epoch": 1.657634378564611, + "grad_norm": 0.3868439755444672, + "learning_rate": 4.955306442078568e-05, + "loss": 2.6951, + "step": 35604 + }, + { + "epoch": 1.6576809367507042, + "grad_norm": 0.35593500035722975, + "learning_rate": 4.9550355793107295e-05, + "loss": 2.7188, + "step": 35605 + }, + { + "epoch": 1.6577274949367973, + "grad_norm": 0.37048774187222827, + "learning_rate": 4.9547647166748595e-05, + "loss": 2.6869, + "step": 35606 + }, + { + "epoch": 1.6577740531228904, + "grad_norm": 0.3443869522317663, + "learning_rate": 4.954493854171751e-05, + "loss": 2.5778, + "step": 35607 + }, + { + "epoch": 1.6578206113089835, + "grad_norm": 0.32579658365638237, + "learning_rate": 4.954222991802196e-05, + "loss": 2.6786, + "step": 35608 + }, + { + "epoch": 1.6578671694950766, + "grad_norm": 0.3932664241943268, + "learning_rate": 4.953952129566994e-05, + "loss": 2.7419, + "step": 35609 + }, + { + "epoch": 1.6579137276811695, + "grad_norm": 0.33334138616622583, + "learning_rate": 4.9536812674669375e-05, + "loss": 2.7977, + "step": 35610 + }, + { + "epoch": 1.6579602858672626, + "grad_norm": 0.37281735358966994, + "learning_rate": 4.953410405502822e-05, + "loss": 2.6432, + "step": 35611 + }, + { + "epoch": 1.6580068440533555, + "grad_norm": 0.36222156910905495, + "learning_rate": 4.9531395436754424e-05, + "loss": 2.823, + "step": 35612 + }, + { + "epoch": 1.6580534022394486, + "grad_norm": 0.3624484962106616, + "learning_rate": 4.9528686819855936e-05, + "loss": 2.7464, + "step": 35613 + }, + { + "epoch": 1.6580999604255418, + "grad_norm": 0.3496061304864194, + "learning_rate": 4.9525978204340715e-05, + "loss": 2.7298, + "step": 35614 + }, + { + "epoch": 1.6581465186116349, + "grad_norm": 0.3638666847379001, + "learning_rate": 4.95232695902167e-05, + "loss": 2.7785, + "step": 35615 + }, + { + "epoch": 1.658193076797728, + "grad_norm": 0.3402594994925672, + "learning_rate": 4.9520560977491815e-05, + "loss": 2.7129, + "step": 35616 + }, + { + "epoch": 1.658239634983821, + "grad_norm": 0.3535083530999462, + "learning_rate": 4.951785236617406e-05, + "loss": 2.6998, + "step": 35617 + }, + { + "epoch": 1.6582861931699142, + 
"grad_norm": 0.32333644778752696, + "learning_rate": 4.951514375627134e-05, + "loss": 2.7641, + "step": 35618 + }, + { + "epoch": 1.6583327513560073, + "grad_norm": 0.37322298551906513, + "learning_rate": 4.951243514779165e-05, + "loss": 2.6401, + "step": 35619 + }, + { + "epoch": 1.6583793095421002, + "grad_norm": 0.3072011336964035, + "learning_rate": 4.9509726540742915e-05, + "loss": 2.5978, + "step": 35620 + }, + { + "epoch": 1.6584258677281933, + "grad_norm": 0.36835992045636906, + "learning_rate": 4.950701793513306e-05, + "loss": 2.7866, + "step": 35621 + }, + { + "epoch": 1.6584724259142862, + "grad_norm": 0.34402643017115075, + "learning_rate": 4.950430933097008e-05, + "loss": 2.7449, + "step": 35622 + }, + { + "epoch": 1.6585189841003793, + "grad_norm": 0.352459493836803, + "learning_rate": 4.950160072826189e-05, + "loss": 2.7704, + "step": 35623 + }, + { + "epoch": 1.6585655422864725, + "grad_norm": 0.3780927091362402, + "learning_rate": 4.949889212701647e-05, + "loss": 2.7915, + "step": 35624 + }, + { + "epoch": 1.6586121004725656, + "grad_norm": 0.3446983349676696, + "learning_rate": 4.949618352724174e-05, + "loss": 2.6828, + "step": 35625 + }, + { + "epoch": 1.6586586586586587, + "grad_norm": 0.36396271461971824, + "learning_rate": 4.949347492894566e-05, + "loss": 2.6391, + "step": 35626 + }, + { + "epoch": 1.6587052168447518, + "grad_norm": 0.34552040639725945, + "learning_rate": 4.949076633213619e-05, + "loss": 2.7143, + "step": 35627 + }, + { + "epoch": 1.658751775030845, + "grad_norm": 0.32687894305419046, + "learning_rate": 4.9488057736821256e-05, + "loss": 2.8117, + "step": 35628 + }, + { + "epoch": 1.6587983332169378, + "grad_norm": 0.3633712475404354, + "learning_rate": 4.9485349143008834e-05, + "loss": 2.5856, + "step": 35629 + }, + { + "epoch": 1.658844891403031, + "grad_norm": 0.3524967781986347, + "learning_rate": 4.948264055070687e-05, + "loss": 2.7314, + "step": 35630 + }, + { + "epoch": 1.658891449589124, + "grad_norm": 0.37203358686305277, + "learning_rate": 4.9479931959923274e-05, + "loss": 2.6877, + "step": 35631 + }, + { + "epoch": 1.658938007775217, + "grad_norm": 0.35262414594551555, + "learning_rate": 4.947722337066606e-05, + "loss": 2.6768, + "step": 35632 + }, + { + "epoch": 1.65898456596131, + "grad_norm": 0.3342375799769418, + "learning_rate": 4.9474514782943135e-05, + "loss": 2.6791, + "step": 35633 + }, + { + "epoch": 1.6590311241474032, + "grad_norm": 0.31787305621837597, + "learning_rate": 4.947180619676244e-05, + "loss": 2.7879, + "step": 35634 + }, + { + "epoch": 1.6590776823334963, + "grad_norm": 0.3307395429020956, + "learning_rate": 4.9469097612131957e-05, + "loss": 2.7971, + "step": 35635 + }, + { + "epoch": 1.6591242405195894, + "grad_norm": 0.327789333247891, + "learning_rate": 4.9466389029059615e-05, + "loss": 2.736, + "step": 35636 + }, + { + "epoch": 1.6591707987056825, + "grad_norm": 0.3295834679294459, + "learning_rate": 4.946368044755338e-05, + "loss": 2.6614, + "step": 35637 + }, + { + "epoch": 1.6592173568917756, + "grad_norm": 0.32678966314242347, + "learning_rate": 4.946097186762118e-05, + "loss": 2.7025, + "step": 35638 + }, + { + "epoch": 1.6592639150778685, + "grad_norm": 0.34085237756893305, + "learning_rate": 4.945826328927097e-05, + "loss": 2.723, + "step": 35639 + }, + { + "epoch": 1.6593104732639616, + "grad_norm": 0.34185741209874443, + "learning_rate": 4.945555471251072e-05, + "loss": 2.8269, + "step": 35640 + }, + { + "epoch": 1.6593570314500548, + "grad_norm": 0.34314017426820126, + "learning_rate": 
4.945284613734834e-05, + "loss": 2.8521, + "step": 35641 + }, + { + "epoch": 1.6594035896361476, + "grad_norm": 0.3543702286638345, + "learning_rate": 4.945013756379182e-05, + "loss": 2.8114, + "step": 35642 + }, + { + "epoch": 1.6594501478222408, + "grad_norm": 0.3108510951113623, + "learning_rate": 4.94474289918491e-05, + "loss": 2.6553, + "step": 35643 + }, + { + "epoch": 1.6594967060083339, + "grad_norm": 0.33447215548595927, + "learning_rate": 4.9444720421528093e-05, + "loss": 2.686, + "step": 35644 + }, + { + "epoch": 1.659543264194427, + "grad_norm": 0.3268730637899752, + "learning_rate": 4.94420118528368e-05, + "loss": 2.653, + "step": 35645 + }, + { + "epoch": 1.65958982238052, + "grad_norm": 0.3323797833053287, + "learning_rate": 4.943930328578313e-05, + "loss": 2.7528, + "step": 35646 + }, + { + "epoch": 1.6596363805666132, + "grad_norm": 0.32426732732750513, + "learning_rate": 4.9436594720375064e-05, + "loss": 2.7755, + "step": 35647 + }, + { + "epoch": 1.6596829387527063, + "grad_norm": 0.34122309722529165, + "learning_rate": 4.943388615662054e-05, + "loss": 2.8246, + "step": 35648 + }, + { + "epoch": 1.6597294969387992, + "grad_norm": 0.3634111911383923, + "learning_rate": 4.943117759452749e-05, + "loss": 2.6355, + "step": 35649 + }, + { + "epoch": 1.6597760551248923, + "grad_norm": 0.3392756971928366, + "learning_rate": 4.9428469034103896e-05, + "loss": 2.6643, + "step": 35650 + }, + { + "epoch": 1.6598226133109852, + "grad_norm": 0.35057585157155724, + "learning_rate": 4.9425760475357676e-05, + "loss": 2.7704, + "step": 35651 + }, + { + "epoch": 1.6598691714970784, + "grad_norm": 0.3489480237538877, + "learning_rate": 4.9423051918296794e-05, + "loss": 2.7299, + "step": 35652 + }, + { + "epoch": 1.6599157296831715, + "grad_norm": 0.34054253046042116, + "learning_rate": 4.9420343362929206e-05, + "loss": 2.7178, + "step": 35653 + }, + { + "epoch": 1.6599622878692646, + "grad_norm": 0.36877639905832976, + "learning_rate": 4.941763480926283e-05, + "loss": 2.7786, + "step": 35654 + }, + { + "epoch": 1.6600088460553577, + "grad_norm": 0.3228409910676632, + "learning_rate": 4.941492625730566e-05, + "loss": 2.5905, + "step": 35655 + }, + { + "epoch": 1.6600554042414508, + "grad_norm": 0.3981509115792854, + "learning_rate": 4.9412217707065626e-05, + "loss": 2.6462, + "step": 35656 + }, + { + "epoch": 1.660101962427544, + "grad_norm": 0.3428190559925136, + "learning_rate": 4.9409509158550656e-05, + "loss": 2.7228, + "step": 35657 + }, + { + "epoch": 1.660148520613637, + "grad_norm": 0.3536244286363275, + "learning_rate": 4.9406800611768736e-05, + "loss": 2.5806, + "step": 35658 + }, + { + "epoch": 1.66019507879973, + "grad_norm": 0.38455674102347553, + "learning_rate": 4.940409206672777e-05, + "loss": 2.6676, + "step": 35659 + }, + { + "epoch": 1.660241636985823, + "grad_norm": 0.3252663100214731, + "learning_rate": 4.940138352343577e-05, + "loss": 2.7737, + "step": 35660 + }, + { + "epoch": 1.660288195171916, + "grad_norm": 0.38360005858603885, + "learning_rate": 4.9398674981900644e-05, + "loss": 2.7828, + "step": 35661 + }, + { + "epoch": 1.660334753358009, + "grad_norm": 0.3373212963277393, + "learning_rate": 4.9395966442130334e-05, + "loss": 2.7756, + "step": 35662 + }, + { + "epoch": 1.6603813115441022, + "grad_norm": 0.35281355025389277, + "learning_rate": 4.9393257904132816e-05, + "loss": 2.7255, + "step": 35663 + }, + { + "epoch": 1.6604278697301953, + "grad_norm": 0.36076965478636286, + "learning_rate": 4.9390549367916006e-05, + "loss": 2.7279, + "step": 35664 + }, + { + 
"epoch": 1.6604744279162884, + "grad_norm": 0.35621520019664776, + "learning_rate": 4.93878408334879e-05, + "loss": 2.6192, + "step": 35665 + }, + { + "epoch": 1.6605209861023815, + "grad_norm": 0.33918230163520924, + "learning_rate": 4.938513230085641e-05, + "loss": 2.6619, + "step": 35666 + }, + { + "epoch": 1.6605675442884746, + "grad_norm": 0.3515279395271154, + "learning_rate": 4.938242377002948e-05, + "loss": 2.6974, + "step": 35667 + }, + { + "epoch": 1.6606141024745678, + "grad_norm": 0.35552793385932985, + "learning_rate": 4.937971524101509e-05, + "loss": 2.6634, + "step": 35668 + }, + { + "epoch": 1.6606606606606606, + "grad_norm": 0.3182238624301738, + "learning_rate": 4.937700671382119e-05, + "loss": 2.7247, + "step": 35669 + }, + { + "epoch": 1.6607072188467538, + "grad_norm": 0.3732281692882112, + "learning_rate": 4.937429818845568e-05, + "loss": 2.588, + "step": 35670 + }, + { + "epoch": 1.6607537770328467, + "grad_norm": 0.32955198210759806, + "learning_rate": 4.937158966492657e-05, + "loss": 2.6487, + "step": 35671 + }, + { + "epoch": 1.6608003352189398, + "grad_norm": 0.3546713388810956, + "learning_rate": 4.9368881143241764e-05, + "loss": 2.7046, + "step": 35672 + }, + { + "epoch": 1.6608468934050329, + "grad_norm": 0.3558584536353323, + "learning_rate": 4.936617262340925e-05, + "loss": 2.6055, + "step": 35673 + }, + { + "epoch": 1.660893451591126, + "grad_norm": 0.3498111379203619, + "learning_rate": 4.9363464105436944e-05, + "loss": 2.6664, + "step": 35674 + }, + { + "epoch": 1.6609400097772191, + "grad_norm": 0.3512052532315867, + "learning_rate": 4.9360755589332816e-05, + "loss": 2.7314, + "step": 35675 + }, + { + "epoch": 1.6609865679633122, + "grad_norm": 0.3229413594755435, + "learning_rate": 4.9358047075104805e-05, + "loss": 2.7801, + "step": 35676 + }, + { + "epoch": 1.6610331261494053, + "grad_norm": 0.3377618251279372, + "learning_rate": 4.9355338562760845e-05, + "loss": 2.7124, + "step": 35677 + }, + { + "epoch": 1.6610796843354982, + "grad_norm": 0.32322895212156444, + "learning_rate": 4.935263005230892e-05, + "loss": 2.7559, + "step": 35678 + }, + { + "epoch": 1.6611262425215914, + "grad_norm": 0.3399285535012068, + "learning_rate": 4.934992154375697e-05, + "loss": 2.7666, + "step": 35679 + }, + { + "epoch": 1.6611728007076845, + "grad_norm": 0.33446298647123107, + "learning_rate": 4.934721303711291e-05, + "loss": 2.7166, + "step": 35680 + }, + { + "epoch": 1.6612193588937774, + "grad_norm": 0.347193539816355, + "learning_rate": 4.934450453238474e-05, + "loss": 2.7443, + "step": 35681 + }, + { + "epoch": 1.6612659170798705, + "grad_norm": 0.3210229693485957, + "learning_rate": 4.934179602958036e-05, + "loss": 2.7415, + "step": 35682 + }, + { + "epoch": 1.6613124752659636, + "grad_norm": 0.33390571091424553, + "learning_rate": 4.933908752870777e-05, + "loss": 2.6672, + "step": 35683 + }, + { + "epoch": 1.6613590334520567, + "grad_norm": 0.30952880145014117, + "learning_rate": 4.933637902977489e-05, + "loss": 2.6341, + "step": 35684 + }, + { + "epoch": 1.6614055916381498, + "grad_norm": 0.3632592235943974, + "learning_rate": 4.933367053278965e-05, + "loss": 2.7419, + "step": 35685 + }, + { + "epoch": 1.661452149824243, + "grad_norm": 0.3305898239395641, + "learning_rate": 4.933096203776004e-05, + "loss": 2.685, + "step": 35686 + }, + { + "epoch": 1.661498708010336, + "grad_norm": 0.3743936084809693, + "learning_rate": 4.9328253544693994e-05, + "loss": 2.7665, + "step": 35687 + }, + { + "epoch": 1.661545266196429, + "grad_norm": 0.3470360349254745, + 
"learning_rate": 4.932554505359945e-05, + "loss": 2.7865, + "step": 35688 + }, + { + "epoch": 1.661591824382522, + "grad_norm": 0.3476693117123875, + "learning_rate": 4.932283656448437e-05, + "loss": 2.7477, + "step": 35689 + }, + { + "epoch": 1.6616383825686152, + "grad_norm": 0.33898730626676427, + "learning_rate": 4.932012807735668e-05, + "loss": 2.7336, + "step": 35690 + }, + { + "epoch": 1.661684940754708, + "grad_norm": 0.33598015970611694, + "learning_rate": 4.9317419592224364e-05, + "loss": 2.7359, + "step": 35691 + }, + { + "epoch": 1.6617314989408012, + "grad_norm": 0.3218279428930874, + "learning_rate": 4.931471110909536e-05, + "loss": 2.8065, + "step": 35692 + }, + { + "epoch": 1.6617780571268943, + "grad_norm": 0.3170657116650952, + "learning_rate": 4.9312002627977586e-05, + "loss": 2.7161, + "step": 35693 + }, + { + "epoch": 1.6618246153129874, + "grad_norm": 0.33067622491245247, + "learning_rate": 4.930929414887904e-05, + "loss": 2.5639, + "step": 35694 + }, + { + "epoch": 1.6618711734990805, + "grad_norm": 0.3126922071139109, + "learning_rate": 4.9306585671807627e-05, + "loss": 2.7123, + "step": 35695 + }, + { + "epoch": 1.6619177316851736, + "grad_norm": 0.33080057166427507, + "learning_rate": 4.9303877196771335e-05, + "loss": 2.6804, + "step": 35696 + }, + { + "epoch": 1.6619642898712668, + "grad_norm": 0.31848093427768664, + "learning_rate": 4.9301168723778094e-05, + "loss": 2.7545, + "step": 35697 + }, + { + "epoch": 1.6620108480573597, + "grad_norm": 0.34163010298929675, + "learning_rate": 4.929846025283584e-05, + "loss": 2.7942, + "step": 35698 + }, + { + "epoch": 1.6620574062434528, + "grad_norm": 0.3174071717027076, + "learning_rate": 4.929575178395255e-05, + "loss": 2.6175, + "step": 35699 + }, + { + "epoch": 1.6621039644295457, + "grad_norm": 0.3311478614230288, + "learning_rate": 4.929304331713615e-05, + "loss": 2.7335, + "step": 35700 + }, + { + "epoch": 1.6621505226156388, + "grad_norm": 0.3420415369575259, + "learning_rate": 4.929033485239461e-05, + "loss": 2.7638, + "step": 35701 + }, + { + "epoch": 1.662197080801732, + "grad_norm": 0.33268884024523476, + "learning_rate": 4.928762638973587e-05, + "loss": 2.7579, + "step": 35702 + }, + { + "epoch": 1.662243638987825, + "grad_norm": 0.33560578603790925, + "learning_rate": 4.928491792916785e-05, + "loss": 2.6272, + "step": 35703 + }, + { + "epoch": 1.6622901971739181, + "grad_norm": 0.33521335084086007, + "learning_rate": 4.9282209470698545e-05, + "loss": 2.5824, + "step": 35704 + }, + { + "epoch": 1.6623367553600112, + "grad_norm": 0.340756414508739, + "learning_rate": 4.9279501014335895e-05, + "loss": 2.6872, + "step": 35705 + }, + { + "epoch": 1.6623833135461044, + "grad_norm": 0.3102574240927347, + "learning_rate": 4.927679256008781e-05, + "loss": 2.6927, + "step": 35706 + }, + { + "epoch": 1.6624298717321975, + "grad_norm": 0.3537178091756127, + "learning_rate": 4.927408410796229e-05, + "loss": 2.7409, + "step": 35707 + }, + { + "epoch": 1.6624764299182904, + "grad_norm": 0.3310983278503951, + "learning_rate": 4.927137565796724e-05, + "loss": 2.6972, + "step": 35708 + }, + { + "epoch": 1.6625229881043835, + "grad_norm": 0.35431204157306256, + "learning_rate": 4.926866721011065e-05, + "loss": 2.8126, + "step": 35709 + }, + { + "epoch": 1.6625695462904764, + "grad_norm": 0.3325002831152043, + "learning_rate": 4.926595876440045e-05, + "loss": 2.6981, + "step": 35710 + }, + { + "epoch": 1.6626161044765695, + "grad_norm": 0.34492786204600984, + "learning_rate": 4.926325032084457e-05, + "loss": 2.7037, + "step": 
35711 + }, + { + "epoch": 1.6626626626626626, + "grad_norm": 0.3351746631692087, + "learning_rate": 4.9260541879451e-05, + "loss": 2.6968, + "step": 35712 + }, + { + "epoch": 1.6627092208487557, + "grad_norm": 0.3323430556578963, + "learning_rate": 4.925783344022765e-05, + "loss": 2.7686, + "step": 35713 + }, + { + "epoch": 1.6627557790348488, + "grad_norm": 0.37755224570563906, + "learning_rate": 4.92551250031825e-05, + "loss": 2.7004, + "step": 35714 + }, + { + "epoch": 1.662802337220942, + "grad_norm": 0.35012749205149296, + "learning_rate": 4.925241656832349e-05, + "loss": 2.6861, + "step": 35715 + }, + { + "epoch": 1.662848895407035, + "grad_norm": 0.3526384705333667, + "learning_rate": 4.924970813565853e-05, + "loss": 2.6771, + "step": 35716 + }, + { + "epoch": 1.662895453593128, + "grad_norm": 0.33087849136883846, + "learning_rate": 4.9246999705195636e-05, + "loss": 2.7581, + "step": 35717 + }, + { + "epoch": 1.662942011779221, + "grad_norm": 0.36970105681095305, + "learning_rate": 4.92442912769427e-05, + "loss": 2.6806, + "step": 35718 + }, + { + "epoch": 1.6629885699653142, + "grad_norm": 0.3419251940230561, + "learning_rate": 4.9241582850907713e-05, + "loss": 2.6404, + "step": 35719 + }, + { + "epoch": 1.663035128151407, + "grad_norm": 0.36406692050090067, + "learning_rate": 4.9238874427098606e-05, + "loss": 2.7446, + "step": 35720 + }, + { + "epoch": 1.6630816863375002, + "grad_norm": 0.34226366187495794, + "learning_rate": 4.923616600552331e-05, + "loss": 2.6992, + "step": 35721 + }, + { + "epoch": 1.6631282445235933, + "grad_norm": 0.35846768887164976, + "learning_rate": 4.9233457586189806e-05, + "loss": 2.7756, + "step": 35722 + }, + { + "epoch": 1.6631748027096864, + "grad_norm": 0.3129433481855812, + "learning_rate": 4.923074916910603e-05, + "loss": 2.6051, + "step": 35723 + }, + { + "epoch": 1.6632213608957795, + "grad_norm": 0.34630311239899436, + "learning_rate": 4.922804075427991e-05, + "loss": 2.7007, + "step": 35724 + }, + { + "epoch": 1.6632679190818727, + "grad_norm": 0.3269664944698136, + "learning_rate": 4.9225332341719436e-05, + "loss": 2.733, + "step": 35725 + }, + { + "epoch": 1.6633144772679658, + "grad_norm": 0.3335285278052552, + "learning_rate": 4.9222623931432525e-05, + "loss": 2.7268, + "step": 35726 + }, + { + "epoch": 1.6633610354540587, + "grad_norm": 0.3628730707458068, + "learning_rate": 4.9219915523427154e-05, + "loss": 2.7276, + "step": 35727 + }, + { + "epoch": 1.6634075936401518, + "grad_norm": 0.32836959714118547, + "learning_rate": 4.921720711771124e-05, + "loss": 2.6928, + "step": 35728 + }, + { + "epoch": 1.663454151826245, + "grad_norm": 0.3462546697609402, + "learning_rate": 4.9214498714292736e-05, + "loss": 2.7718, + "step": 35729 + }, + { + "epoch": 1.6635007100123378, + "grad_norm": 0.31737169184765307, + "learning_rate": 4.921179031317961e-05, + "loss": 2.7177, + "step": 35730 + }, + { + "epoch": 1.663547268198431, + "grad_norm": 0.3569517508355766, + "learning_rate": 4.920908191437979e-05, + "loss": 2.7896, + "step": 35731 + }, + { + "epoch": 1.663593826384524, + "grad_norm": 0.3235030459876748, + "learning_rate": 4.9206373517901257e-05, + "loss": 2.7318, + "step": 35732 + }, + { + "epoch": 1.6636403845706171, + "grad_norm": 0.334768302505172, + "learning_rate": 4.920366512375194e-05, + "loss": 2.6443, + "step": 35733 + }, + { + "epoch": 1.6636869427567103, + "grad_norm": 0.3456398351719065, + "learning_rate": 4.920095673193976e-05, + "loss": 2.667, + "step": 35734 + }, + { + "epoch": 1.6637335009428034, + "grad_norm": 
0.3202803754093801, + "learning_rate": 4.919824834247272e-05, + "loss": 2.7226, + "step": 35735 + }, + { + "epoch": 1.6637800591288965, + "grad_norm": 0.3596661880739978, + "learning_rate": 4.919553995535873e-05, + "loss": 2.7983, + "step": 35736 + }, + { + "epoch": 1.6638266173149894, + "grad_norm": 0.3373191378134523, + "learning_rate": 4.919283157060575e-05, + "loss": 2.662, + "step": 35737 + }, + { + "epoch": 1.6638731755010825, + "grad_norm": 0.36773851032692567, + "learning_rate": 4.9190123188221744e-05, + "loss": 2.7032, + "step": 35738 + }, + { + "epoch": 1.6639197336871754, + "grad_norm": 0.31165239739822087, + "learning_rate": 4.9187414808214634e-05, + "loss": 2.8072, + "step": 35739 + }, + { + "epoch": 1.6639662918732685, + "grad_norm": 0.36799697811748916, + "learning_rate": 4.918470643059239e-05, + "loss": 2.7564, + "step": 35740 + }, + { + "epoch": 1.6640128500593616, + "grad_norm": 0.3419886807347294, + "learning_rate": 4.918199805536296e-05, + "loss": 2.5708, + "step": 35741 + }, + { + "epoch": 1.6640594082454547, + "grad_norm": 0.37401899666730015, + "learning_rate": 4.917928968253426e-05, + "loss": 2.7232, + "step": 35742 + }, + { + "epoch": 1.6641059664315478, + "grad_norm": 0.3585825650832806, + "learning_rate": 4.917658131211428e-05, + "loss": 2.7237, + "step": 35743 + }, + { + "epoch": 1.664152524617641, + "grad_norm": 0.34167551400125734, + "learning_rate": 4.917387294411093e-05, + "loss": 2.7158, + "step": 35744 + }, + { + "epoch": 1.664199082803734, + "grad_norm": 0.38877489402597165, + "learning_rate": 4.917116457853221e-05, + "loss": 2.7168, + "step": 35745 + }, + { + "epoch": 1.6642456409898272, + "grad_norm": 0.328819470654591, + "learning_rate": 4.916845621538604e-05, + "loss": 2.6605, + "step": 35746 + }, + { + "epoch": 1.66429219917592, + "grad_norm": 0.3545725777773183, + "learning_rate": 4.9165747854680345e-05, + "loss": 2.7018, + "step": 35747 + }, + { + "epoch": 1.6643387573620132, + "grad_norm": 0.39174149557983634, + "learning_rate": 4.916303949642312e-05, + "loss": 2.7368, + "step": 35748 + }, + { + "epoch": 1.664385315548106, + "grad_norm": 0.34628156597774234, + "learning_rate": 4.916033114062228e-05, + "loss": 2.6486, + "step": 35749 + }, + { + "epoch": 1.6644318737341992, + "grad_norm": 0.3605087789014139, + "learning_rate": 4.915762278728578e-05, + "loss": 2.7573, + "step": 35750 + }, + { + "epoch": 1.6644784319202923, + "grad_norm": 0.34460901860908455, + "learning_rate": 4.91549144364216e-05, + "loss": 2.7433, + "step": 35751 + }, + { + "epoch": 1.6645249901063854, + "grad_norm": 0.33519405939999175, + "learning_rate": 4.915220608803764e-05, + "loss": 2.6449, + "step": 35752 + }, + { + "epoch": 1.6645715482924786, + "grad_norm": 0.3716564004830183, + "learning_rate": 4.914949774214189e-05, + "loss": 2.6982, + "step": 35753 + }, + { + "epoch": 1.6646181064785717, + "grad_norm": 0.3590512119788766, + "learning_rate": 4.914678939874225e-05, + "loss": 2.7736, + "step": 35754 + }, + { + "epoch": 1.6646646646646648, + "grad_norm": 0.3519206670715098, + "learning_rate": 4.914408105784672e-05, + "loss": 2.7691, + "step": 35755 + }, + { + "epoch": 1.664711222850758, + "grad_norm": 0.3632095948557017, + "learning_rate": 4.914137271946324e-05, + "loss": 2.6821, + "step": 35756 + }, + { + "epoch": 1.6647577810368508, + "grad_norm": 0.35566334968021895, + "learning_rate": 4.913866438359971e-05, + "loss": 2.748, + "step": 35757 + }, + { + "epoch": 1.664804339222944, + "grad_norm": 0.3677957002458633, + "learning_rate": 4.913595605026415e-05, + "loss": 
2.5969, + "step": 35758 + }, + { + "epoch": 1.6648508974090368, + "grad_norm": 0.3492274641310363, + "learning_rate": 4.913324771946445e-05, + "loss": 2.6921, + "step": 35759 + }, + { + "epoch": 1.66489745559513, + "grad_norm": 0.36481830977336294, + "learning_rate": 4.91305393912086e-05, + "loss": 2.6179, + "step": 35760 + }, + { + "epoch": 1.664944013781223, + "grad_norm": 0.3226207328151106, + "learning_rate": 4.912783106550453e-05, + "loss": 2.6934, + "step": 35761 + }, + { + "epoch": 1.6649905719673161, + "grad_norm": 0.3795265763580958, + "learning_rate": 4.912512274236018e-05, + "loss": 2.63, + "step": 35762 + }, + { + "epoch": 1.6650371301534093, + "grad_norm": 0.3490176037988313, + "learning_rate": 4.912241442178351e-05, + "loss": 2.6008, + "step": 35763 + }, + { + "epoch": 1.6650836883395024, + "grad_norm": 0.36696278763789836, + "learning_rate": 4.911970610378248e-05, + "loss": 2.7102, + "step": 35764 + }, + { + "epoch": 1.6651302465255955, + "grad_norm": 0.3521655877251425, + "learning_rate": 4.911699778836501e-05, + "loss": 2.7452, + "step": 35765 + }, + { + "epoch": 1.6651768047116884, + "grad_norm": 0.3395641665518631, + "learning_rate": 4.9114289475539084e-05, + "loss": 2.7433, + "step": 35766 + }, + { + "epoch": 1.6652233628977815, + "grad_norm": 0.3527759730075369, + "learning_rate": 4.9111581165312595e-05, + "loss": 2.7124, + "step": 35767 + }, + { + "epoch": 1.6652699210838746, + "grad_norm": 0.33559064767615837, + "learning_rate": 4.910887285769356e-05, + "loss": 2.7081, + "step": 35768 + }, + { + "epoch": 1.6653164792699675, + "grad_norm": 0.35177640250305797, + "learning_rate": 4.9106164552689895e-05, + "loss": 2.6173, + "step": 35769 + }, + { + "epoch": 1.6653630374560606, + "grad_norm": 0.33862719190028684, + "learning_rate": 4.9103456250309527e-05, + "loss": 2.6862, + "step": 35770 + }, + { + "epoch": 1.6654095956421537, + "grad_norm": 0.3815738058195683, + "learning_rate": 4.910074795056045e-05, + "loss": 2.5833, + "step": 35771 + }, + { + "epoch": 1.6654561538282469, + "grad_norm": 0.3561266482297989, + "learning_rate": 4.9098039653450564e-05, + "loss": 2.7159, + "step": 35772 + }, + { + "epoch": 1.66550271201434, + "grad_norm": 0.331152464534023, + "learning_rate": 4.909533135898786e-05, + "loss": 2.8099, + "step": 35773 + }, + { + "epoch": 1.665549270200433, + "grad_norm": 0.3536647406511558, + "learning_rate": 4.909262306718028e-05, + "loss": 2.7164, + "step": 35774 + }, + { + "epoch": 1.6655958283865262, + "grad_norm": 0.3644191561163222, + "learning_rate": 4.908991477803574e-05, + "loss": 2.7521, + "step": 35775 + }, + { + "epoch": 1.665642386572619, + "grad_norm": 0.3468468529911115, + "learning_rate": 4.908720649156222e-05, + "loss": 2.68, + "step": 35776 + }, + { + "epoch": 1.6656889447587122, + "grad_norm": 0.3374726360042424, + "learning_rate": 4.9084498207767656e-05, + "loss": 2.6353, + "step": 35777 + }, + { + "epoch": 1.6657355029448053, + "grad_norm": 0.36040715941270085, + "learning_rate": 4.9081789926660006e-05, + "loss": 2.6707, + "step": 35778 + }, + { + "epoch": 1.6657820611308982, + "grad_norm": 0.3412641858446954, + "learning_rate": 4.9079081648247214e-05, + "loss": 2.6335, + "step": 35779 + }, + { + "epoch": 1.6658286193169913, + "grad_norm": 0.32816403877588984, + "learning_rate": 4.907637337253721e-05, + "loss": 2.7085, + "step": 35780 + }, + { + "epoch": 1.6658751775030844, + "grad_norm": 0.348904100201234, + "learning_rate": 4.9073665099537974e-05, + "loss": 2.7874, + "step": 35781 + }, + { + "epoch": 1.6659217356891776, + "grad_norm": 
0.3406617424705871, + "learning_rate": 4.9070956829257444e-05, + "loss": 2.6954, + "step": 35782 + }, + { + "epoch": 1.6659682938752707, + "grad_norm": 0.31232520256100355, + "learning_rate": 4.906824856170354e-05, + "loss": 2.7652, + "step": 35783 + }, + { + "epoch": 1.6660148520613638, + "grad_norm": 0.3675718680739121, + "learning_rate": 4.906554029688426e-05, + "loss": 2.7179, + "step": 35784 + }, + { + "epoch": 1.666061410247457, + "grad_norm": 0.34630511755013726, + "learning_rate": 4.90628320348075e-05, + "loss": 2.6564, + "step": 35785 + }, + { + "epoch": 1.6661079684335498, + "grad_norm": 0.31198134865496346, + "learning_rate": 4.906012377548126e-05, + "loss": 2.7574, + "step": 35786 + }, + { + "epoch": 1.666154526619643, + "grad_norm": 0.35885692685268755, + "learning_rate": 4.905741551891347e-05, + "loss": 2.6903, + "step": 35787 + }, + { + "epoch": 1.6662010848057358, + "grad_norm": 0.3515645698433248, + "learning_rate": 4.9054707265112045e-05, + "loss": 2.6779, + "step": 35788 + }, + { + "epoch": 1.666247642991829, + "grad_norm": 0.30881611337468434, + "learning_rate": 4.905199901408498e-05, + "loss": 2.7406, + "step": 35789 + }, + { + "epoch": 1.666294201177922, + "grad_norm": 0.36773281527303786, + "learning_rate": 4.9049290765840195e-05, + "loss": 2.7451, + "step": 35790 + }, + { + "epoch": 1.6663407593640152, + "grad_norm": 0.32419421597813, + "learning_rate": 4.9046582520385665e-05, + "loss": 2.7461, + "step": 35791 + }, + { + "epoch": 1.6663873175501083, + "grad_norm": 0.3269360562211503, + "learning_rate": 4.904387427772931e-05, + "loss": 2.7756, + "step": 35792 + }, + { + "epoch": 1.6664338757362014, + "grad_norm": 0.3141338082988476, + "learning_rate": 4.9041166037879084e-05, + "loss": 2.7111, + "step": 35793 + }, + { + "epoch": 1.6664804339222945, + "grad_norm": 0.327518764823051, + "learning_rate": 4.903845780084295e-05, + "loss": 2.8233, + "step": 35794 + }, + { + "epoch": 1.6665269921083876, + "grad_norm": 0.32880092118929, + "learning_rate": 4.903574956662883e-05, + "loss": 2.752, + "step": 35795 + }, + { + "epoch": 1.6665735502944805, + "grad_norm": 0.32018251057290176, + "learning_rate": 4.903304133524471e-05, + "loss": 2.7633, + "step": 35796 + }, + { + "epoch": 1.6666201084805736, + "grad_norm": 0.3432412821742426, + "learning_rate": 4.9030333106698525e-05, + "loss": 2.738, + "step": 35797 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.3388064649695635, + "learning_rate": 4.9027624880998186e-05, + "loss": 2.6362, + "step": 35798 + }, + { + "epoch": 1.6667132248527596, + "grad_norm": 0.3308570587404061, + "learning_rate": 4.90249166581517e-05, + "loss": 2.7488, + "step": 35799 + }, + { + "epoch": 1.6667597830388527, + "grad_norm": 0.3671911357491961, + "learning_rate": 4.902220843816698e-05, + "loss": 2.7664, + "step": 35800 + }, + { + "epoch": 1.6668063412249459, + "grad_norm": 0.3489231155637028, + "learning_rate": 4.901950022105197e-05, + "loss": 2.7262, + "step": 35801 + }, + { + "epoch": 1.666852899411039, + "grad_norm": 0.3787746648140246, + "learning_rate": 4.901679200681465e-05, + "loss": 2.7465, + "step": 35802 + }, + { + "epoch": 1.666899457597132, + "grad_norm": 0.38966268004513777, + "learning_rate": 4.9014083795462934e-05, + "loss": 2.7206, + "step": 35803 + }, + { + "epoch": 1.6669460157832252, + "grad_norm": 0.3430150586397831, + "learning_rate": 4.90113755870048e-05, + "loss": 2.6582, + "step": 35804 + }, + { + "epoch": 1.666992573969318, + "grad_norm": 0.3760566811778267, + "learning_rate": 4.9008667381448176e-05, + "loss": 2.6576, + 
"step": 35805 + }, + { + "epoch": 1.6670391321554112, + "grad_norm": 0.35849057868913364, + "learning_rate": 4.9005959178801e-05, + "loss": 2.6711, + "step": 35806 + }, + { + "epoch": 1.6670856903415043, + "grad_norm": 0.34271469800426746, + "learning_rate": 4.900325097907125e-05, + "loss": 2.6368, + "step": 35807 + }, + { + "epoch": 1.6671322485275972, + "grad_norm": 0.35158676097167935, + "learning_rate": 4.9000542782266844e-05, + "loss": 2.7695, + "step": 35808 + }, + { + "epoch": 1.6671788067136903, + "grad_norm": 0.3150337599064752, + "learning_rate": 4.899783458839576e-05, + "loss": 2.7259, + "step": 35809 + }, + { + "epoch": 1.6672253648997835, + "grad_norm": 0.36002969136586055, + "learning_rate": 4.899512639746595e-05, + "loss": 2.7215, + "step": 35810 + }, + { + "epoch": 1.6672719230858766, + "grad_norm": 0.3442096785965365, + "learning_rate": 4.8992418209485307e-05, + "loss": 2.7083, + "step": 35811 + }, + { + "epoch": 1.6673184812719697, + "grad_norm": 0.30375659190409837, + "learning_rate": 4.898971002446185e-05, + "loss": 2.6924, + "step": 35812 + }, + { + "epoch": 1.6673650394580628, + "grad_norm": 0.36129928517034016, + "learning_rate": 4.898700184240347e-05, + "loss": 2.8211, + "step": 35813 + }, + { + "epoch": 1.667411597644156, + "grad_norm": 0.3094079137551929, + "learning_rate": 4.8984293663318154e-05, + "loss": 2.6483, + "step": 35814 + }, + { + "epoch": 1.6674581558302488, + "grad_norm": 0.3386853634766773, + "learning_rate": 4.898158548721384e-05, + "loss": 2.6191, + "step": 35815 + }, + { + "epoch": 1.667504714016342, + "grad_norm": 0.3253164143074562, + "learning_rate": 4.897887731409846e-05, + "loss": 2.8716, + "step": 35816 + }, + { + "epoch": 1.667551272202435, + "grad_norm": 0.356896232771157, + "learning_rate": 4.897616914397999e-05, + "loss": 2.735, + "step": 35817 + }, + { + "epoch": 1.667597830388528, + "grad_norm": 0.3477774179750563, + "learning_rate": 4.8973460976866366e-05, + "loss": 2.7691, + "step": 35818 + }, + { + "epoch": 1.667644388574621, + "grad_norm": 0.3580604895671486, + "learning_rate": 4.89707528127655e-05, + "loss": 2.7839, + "step": 35819 + }, + { + "epoch": 1.6676909467607142, + "grad_norm": 0.33462615977858395, + "learning_rate": 4.89680446516854e-05, + "loss": 2.6881, + "step": 35820 + }, + { + "epoch": 1.6677375049468073, + "grad_norm": 0.33596212338364234, + "learning_rate": 4.8965336493633965e-05, + "loss": 2.7561, + "step": 35821 + }, + { + "epoch": 1.6677840631329004, + "grad_norm": 0.3358740799493757, + "learning_rate": 4.8962628338619196e-05, + "loss": 2.7612, + "step": 35822 + }, + { + "epoch": 1.6678306213189935, + "grad_norm": 0.3246017582326426, + "learning_rate": 4.8959920186649e-05, + "loss": 2.7683, + "step": 35823 + }, + { + "epoch": 1.6678771795050866, + "grad_norm": 0.3283123414301555, + "learning_rate": 4.895721203773131e-05, + "loss": 2.7641, + "step": 35824 + }, + { + "epoch": 1.6679237376911795, + "grad_norm": 0.32069578235858326, + "learning_rate": 4.8954503891874135e-05, + "loss": 2.732, + "step": 35825 + }, + { + "epoch": 1.6679702958772726, + "grad_norm": 0.32730209158906787, + "learning_rate": 4.895179574908537e-05, + "loss": 2.5066, + "step": 35826 + }, + { + "epoch": 1.6680168540633655, + "grad_norm": 0.3165140180054598, + "learning_rate": 4.8949087609372985e-05, + "loss": 2.726, + "step": 35827 + }, + { + "epoch": 1.6680634122494586, + "grad_norm": 0.3465657298340844, + "learning_rate": 4.894637947274493e-05, + "loss": 2.8192, + "step": 35828 + }, + { + "epoch": 1.6681099704355518, + "grad_norm": 
0.34797071602773627, + "learning_rate": 4.894367133920914e-05, + "loss": 2.6993, + "step": 35829 + }, + { + "epoch": 1.6681565286216449, + "grad_norm": 0.3459738765265244, + "learning_rate": 4.8940963208773574e-05, + "loss": 2.6977, + "step": 35830 + }, + { + "epoch": 1.668203086807738, + "grad_norm": 0.34921388893509675, + "learning_rate": 4.8938255081446164e-05, + "loss": 2.6276, + "step": 35831 + }, + { + "epoch": 1.668249644993831, + "grad_norm": 0.3229087661809725, + "learning_rate": 4.893554695723489e-05, + "loss": 2.6605, + "step": 35832 + }, + { + "epoch": 1.6682962031799242, + "grad_norm": 0.3355346551915962, + "learning_rate": 4.8932838836147684e-05, + "loss": 2.7207, + "step": 35833 + }, + { + "epoch": 1.6683427613660173, + "grad_norm": 0.34460994874836026, + "learning_rate": 4.893013071819246e-05, + "loss": 2.8122, + "step": 35834 + }, + { + "epoch": 1.6683893195521102, + "grad_norm": 0.33218167471338095, + "learning_rate": 4.892742260337723e-05, + "loss": 2.7829, + "step": 35835 + }, + { + "epoch": 1.6684358777382033, + "grad_norm": 0.34191060326132183, + "learning_rate": 4.89247144917099e-05, + "loss": 2.7505, + "step": 35836 + }, + { + "epoch": 1.6684824359242962, + "grad_norm": 0.327354219814195, + "learning_rate": 4.892200638319841e-05, + "loss": 2.7356, + "step": 35837 + }, + { + "epoch": 1.6685289941103894, + "grad_norm": 0.32636717387540776, + "learning_rate": 4.8919298277850746e-05, + "loss": 2.724, + "step": 35838 + }, + { + "epoch": 1.6685755522964825, + "grad_norm": 0.3378633208723594, + "learning_rate": 4.891659017567483e-05, + "loss": 2.7941, + "step": 35839 + }, + { + "epoch": 1.6686221104825756, + "grad_norm": 0.3432010750710023, + "learning_rate": 4.89138820766786e-05, + "loss": 2.604, + "step": 35840 + }, + { + "epoch": 1.6686686686686687, + "grad_norm": 0.35475344931428265, + "learning_rate": 4.891117398087004e-05, + "loss": 2.672, + "step": 35841 + }, + { + "epoch": 1.6687152268547618, + "grad_norm": 0.35007802353763895, + "learning_rate": 4.8908465888257064e-05, + "loss": 2.6956, + "step": 35842 + }, + { + "epoch": 1.668761785040855, + "grad_norm": 0.3337194527536629, + "learning_rate": 4.8905757798847646e-05, + "loss": 2.7456, + "step": 35843 + }, + { + "epoch": 1.668808343226948, + "grad_norm": 0.34637778822017634, + "learning_rate": 4.89030497126497e-05, + "loss": 2.6429, + "step": 35844 + }, + { + "epoch": 1.668854901413041, + "grad_norm": 0.34385954539282115, + "learning_rate": 4.890034162967122e-05, + "loss": 2.7592, + "step": 35845 + }, + { + "epoch": 1.668901459599134, + "grad_norm": 0.3230145019375523, + "learning_rate": 4.8897633549920116e-05, + "loss": 2.7419, + "step": 35846 + }, + { + "epoch": 1.668948017785227, + "grad_norm": 0.34247291036597743, + "learning_rate": 4.889492547340434e-05, + "loss": 2.7218, + "step": 35847 + }, + { + "epoch": 1.66899457597132, + "grad_norm": 0.3548338526666539, + "learning_rate": 4.8892217400131865e-05, + "loss": 2.6789, + "step": 35848 + }, + { + "epoch": 1.6690411341574132, + "grad_norm": 0.32898632106036085, + "learning_rate": 4.88895093301106e-05, + "loss": 2.7841, + "step": 35849 + }, + { + "epoch": 1.6690876923435063, + "grad_norm": 0.3430998604522258, + "learning_rate": 4.8886801263348544e-05, + "loss": 2.7279, + "step": 35850 + }, + { + "epoch": 1.6691342505295994, + "grad_norm": 0.3573478859781498, + "learning_rate": 4.888409319985361e-05, + "loss": 2.6653, + "step": 35851 + }, + { + "epoch": 1.6691808087156925, + "grad_norm": 0.33897932096726263, + "learning_rate": 4.888138513963374e-05, + "loss": 
2.7225, + "step": 35852 + }, + { + "epoch": 1.6692273669017856, + "grad_norm": 0.3495853331298373, + "learning_rate": 4.88786770826969e-05, + "loss": 2.6954, + "step": 35853 + }, + { + "epoch": 1.6692739250878785, + "grad_norm": 0.34287290146747507, + "learning_rate": 4.887596902905104e-05, + "loss": 2.6719, + "step": 35854 + }, + { + "epoch": 1.6693204832739716, + "grad_norm": 0.3835016087805847, + "learning_rate": 4.887326097870409e-05, + "loss": 2.6738, + "step": 35855 + }, + { + "epoch": 1.6693670414600648, + "grad_norm": 0.33662968715481595, + "learning_rate": 4.887055293166402e-05, + "loss": 2.6821, + "step": 35856 + }, + { + "epoch": 1.6694135996461577, + "grad_norm": 0.3571832894379884, + "learning_rate": 4.8867844887938744e-05, + "loss": 2.7512, + "step": 35857 + }, + { + "epoch": 1.6694601578322508, + "grad_norm": 0.35048759948927644, + "learning_rate": 4.8865136847536253e-05, + "loss": 2.7021, + "step": 35858 + }, + { + "epoch": 1.6695067160183439, + "grad_norm": 0.38439446670865457, + "learning_rate": 4.886242881046447e-05, + "loss": 2.7505, + "step": 35859 + }, + { + "epoch": 1.669553274204437, + "grad_norm": 0.336462029177694, + "learning_rate": 4.885972077673133e-05, + "loss": 2.7712, + "step": 35860 + }, + { + "epoch": 1.6695998323905301, + "grad_norm": 0.34911407491428237, + "learning_rate": 4.885701274634482e-05, + "loss": 2.7962, + "step": 35861 + }, + { + "epoch": 1.6696463905766232, + "grad_norm": 0.3616807073403557, + "learning_rate": 4.8854304719312835e-05, + "loss": 2.6494, + "step": 35862 + }, + { + "epoch": 1.6696929487627163, + "grad_norm": 0.3205241022963002, + "learning_rate": 4.8851596695643386e-05, + "loss": 2.7247, + "step": 35863 + }, + { + "epoch": 1.6697395069488092, + "grad_norm": 0.35888802889910015, + "learning_rate": 4.8848888675344376e-05, + "loss": 2.68, + "step": 35864 + }, + { + "epoch": 1.6697860651349024, + "grad_norm": 0.34246391439620233, + "learning_rate": 4.884618065842376e-05, + "loss": 2.6903, + "step": 35865 + }, + { + "epoch": 1.6698326233209955, + "grad_norm": 0.3577709211748492, + "learning_rate": 4.8843472644889494e-05, + "loss": 2.777, + "step": 35866 + }, + { + "epoch": 1.6698791815070884, + "grad_norm": 0.35861437615761804, + "learning_rate": 4.884076463474951e-05, + "loss": 2.7696, + "step": 35867 + }, + { + "epoch": 1.6699257396931815, + "grad_norm": 0.3645750250243551, + "learning_rate": 4.883805662801179e-05, + "loss": 2.6457, + "step": 35868 + }, + { + "epoch": 1.6699722978792746, + "grad_norm": 0.3620388885999497, + "learning_rate": 4.883534862468426e-05, + "loss": 2.5581, + "step": 35869 + }, + { + "epoch": 1.6700188560653677, + "grad_norm": 0.35368612667743315, + "learning_rate": 4.8832640624774836e-05, + "loss": 2.616, + "step": 35870 + }, + { + "epoch": 1.6700654142514608, + "grad_norm": 0.3742973810574425, + "learning_rate": 4.882993262829152e-05, + "loss": 2.7795, + "step": 35871 + }, + { + "epoch": 1.670111972437554, + "grad_norm": 0.3529747245781013, + "learning_rate": 4.882722463524224e-05, + "loss": 2.6198, + "step": 35872 + }, + { + "epoch": 1.670158530623647, + "grad_norm": 0.3474509142606853, + "learning_rate": 4.8824516645634916e-05, + "loss": 2.6071, + "step": 35873 + }, + { + "epoch": 1.67020508880974, + "grad_norm": 0.363885067926642, + "learning_rate": 4.882180865947754e-05, + "loss": 2.6294, + "step": 35874 + }, + { + "epoch": 1.670251646995833, + "grad_norm": 0.34872470798721994, + "learning_rate": 4.8819100676778025e-05, + "loss": 2.7958, + "step": 35875 + }, + { + "epoch": 1.670298205181926, + 
"grad_norm": 0.3331428333478588, + "learning_rate": 4.881639269754435e-05, + "loss": 2.7063, + "step": 35876 + }, + { + "epoch": 1.670344763368019, + "grad_norm": 0.3300253442567229, + "learning_rate": 4.881368472178445e-05, + "loss": 2.6408, + "step": 35877 + }, + { + "epoch": 1.6703913215541122, + "grad_norm": 0.3255909059202709, + "learning_rate": 4.881097674950625e-05, + "loss": 2.7183, + "step": 35878 + }, + { + "epoch": 1.6704378797402053, + "grad_norm": 0.36565673300571366, + "learning_rate": 4.880826878071772e-05, + "loss": 2.6925, + "step": 35879 + }, + { + "epoch": 1.6704844379262984, + "grad_norm": 0.36411638748130076, + "learning_rate": 4.8805560815426804e-05, + "loss": 2.8054, + "step": 35880 + }, + { + "epoch": 1.6705309961123915, + "grad_norm": 0.32274218614975814, + "learning_rate": 4.880285285364146e-05, + "loss": 2.759, + "step": 35881 + }, + { + "epoch": 1.6705775542984846, + "grad_norm": 0.3431927324835457, + "learning_rate": 4.880014489536963e-05, + "loss": 2.6474, + "step": 35882 + }, + { + "epoch": 1.6706241124845778, + "grad_norm": 0.3213474089829231, + "learning_rate": 4.8797436940619225e-05, + "loss": 2.698, + "step": 35883 + }, + { + "epoch": 1.6706706706706707, + "grad_norm": 0.3369235484242674, + "learning_rate": 4.879472898939825e-05, + "loss": 2.7815, + "step": 35884 + }, + { + "epoch": 1.6707172288567638, + "grad_norm": 0.35167322891659086, + "learning_rate": 4.879202104171461e-05, + "loss": 2.7648, + "step": 35885 + }, + { + "epoch": 1.6707637870428567, + "grad_norm": 0.37484911022553674, + "learning_rate": 4.8789313097576284e-05, + "loss": 2.6738, + "step": 35886 + }, + { + "epoch": 1.6708103452289498, + "grad_norm": 0.33166177409699377, + "learning_rate": 4.8786605156991206e-05, + "loss": 2.7654, + "step": 35887 + }, + { + "epoch": 1.670856903415043, + "grad_norm": 0.36301236181818725, + "learning_rate": 4.87838972199673e-05, + "loss": 2.7082, + "step": 35888 + }, + { + "epoch": 1.670903461601136, + "grad_norm": 0.36914999508390217, + "learning_rate": 4.8781189286512563e-05, + "loss": 2.6694, + "step": 35889 + }, + { + "epoch": 1.6709500197872291, + "grad_norm": 0.36487795091298986, + "learning_rate": 4.877848135663491e-05, + "loss": 2.6967, + "step": 35890 + }, + { + "epoch": 1.6709965779733222, + "grad_norm": 0.3371126961702511, + "learning_rate": 4.8775773430342286e-05, + "loss": 2.6615, + "step": 35891 + }, + { + "epoch": 1.6710431361594154, + "grad_norm": 0.3712639153592178, + "learning_rate": 4.877306550764265e-05, + "loss": 2.7153, + "step": 35892 + }, + { + "epoch": 1.6710896943455082, + "grad_norm": 0.36097603484361923, + "learning_rate": 4.8770357588543935e-05, + "loss": 2.7816, + "step": 35893 + }, + { + "epoch": 1.6711362525316014, + "grad_norm": 0.361512083003997, + "learning_rate": 4.876764967305412e-05, + "loss": 2.7095, + "step": 35894 + }, + { + "epoch": 1.6711828107176945, + "grad_norm": 0.36057411116399096, + "learning_rate": 4.876494176118113e-05, + "loss": 2.8136, + "step": 35895 + }, + { + "epoch": 1.6712293689037874, + "grad_norm": 0.36626606296453007, + "learning_rate": 4.8762233852932887e-05, + "loss": 2.7664, + "step": 35896 + }, + { + "epoch": 1.6712759270898805, + "grad_norm": 0.3848492243607224, + "learning_rate": 4.8759525948317394e-05, + "loss": 2.6629, + "step": 35897 + }, + { + "epoch": 1.6713224852759736, + "grad_norm": 0.35949756031109226, + "learning_rate": 4.875681804734254e-05, + "loss": 2.6967, + "step": 35898 + }, + { + "epoch": 1.6713690434620667, + "grad_norm": 0.3573101654862402, + "learning_rate": 
4.875411015001633e-05, + "loss": 2.5908, + "step": 35899 + }, + { + "epoch": 1.6714156016481598, + "grad_norm": 0.3614964065203811, + "learning_rate": 4.875140225634668e-05, + "loss": 2.7017, + "step": 35900 + }, + { + "epoch": 1.671462159834253, + "grad_norm": 0.34361459081532436, + "learning_rate": 4.874869436634153e-05, + "loss": 2.7828, + "step": 35901 + }, + { + "epoch": 1.671508718020346, + "grad_norm": 0.35753518057396966, + "learning_rate": 4.8745986480008854e-05, + "loss": 2.6098, + "step": 35902 + }, + { + "epoch": 1.671555276206439, + "grad_norm": 0.36941941944487733, + "learning_rate": 4.8743278597356574e-05, + "loss": 2.8233, + "step": 35903 + }, + { + "epoch": 1.671601834392532, + "grad_norm": 0.37565030411266226, + "learning_rate": 4.8740570718392654e-05, + "loss": 2.6774, + "step": 35904 + }, + { + "epoch": 1.6716483925786252, + "grad_norm": 0.35382848152535923, + "learning_rate": 4.8737862843125024e-05, + "loss": 2.7334, + "step": 35905 + }, + { + "epoch": 1.671694950764718, + "grad_norm": 0.3558453108914876, + "learning_rate": 4.8735154971561645e-05, + "loss": 2.6518, + "step": 35906 + }, + { + "epoch": 1.6717415089508112, + "grad_norm": 0.34897415610251753, + "learning_rate": 4.873244710371048e-05, + "loss": 2.691, + "step": 35907 + }, + { + "epoch": 1.6717880671369043, + "grad_norm": 0.38336970875629295, + "learning_rate": 4.872973923957945e-05, + "loss": 2.7271, + "step": 35908 + }, + { + "epoch": 1.6718346253229974, + "grad_norm": 0.3535619337564183, + "learning_rate": 4.872703137917649e-05, + "loss": 2.6733, + "step": 35909 + }, + { + "epoch": 1.6718811835090905, + "grad_norm": 0.3573092055431732, + "learning_rate": 4.872432352250959e-05, + "loss": 2.7015, + "step": 35910 + }, + { + "epoch": 1.6719277416951837, + "grad_norm": 0.36675813028651677, + "learning_rate": 4.872161566958665e-05, + "loss": 2.7275, + "step": 35911 + }, + { + "epoch": 1.6719742998812768, + "grad_norm": 0.34631899496389473, + "learning_rate": 4.8718907820415667e-05, + "loss": 2.6167, + "step": 35912 + }, + { + "epoch": 1.6720208580673697, + "grad_norm": 0.34527463181585627, + "learning_rate": 4.871619997500456e-05, + "loss": 2.7153, + "step": 35913 + }, + { + "epoch": 1.6720674162534628, + "grad_norm": 0.35173880138437397, + "learning_rate": 4.8713492133361263e-05, + "loss": 2.6574, + "step": 35914 + }, + { + "epoch": 1.6721139744395557, + "grad_norm": 0.3570646716972537, + "learning_rate": 4.8710784295493757e-05, + "loss": 2.6995, + "step": 35915 + }, + { + "epoch": 1.6721605326256488, + "grad_norm": 0.3565912358444352, + "learning_rate": 4.870807646140996e-05, + "loss": 2.6784, + "step": 35916 + }, + { + "epoch": 1.672207090811742, + "grad_norm": 0.35703040536737635, + "learning_rate": 4.870536863111784e-05, + "loss": 2.7099, + "step": 35917 + }, + { + "epoch": 1.672253648997835, + "grad_norm": 0.3504341299129847, + "learning_rate": 4.8702660804625336e-05, + "loss": 2.7041, + "step": 35918 + }, + { + "epoch": 1.6723002071839281, + "grad_norm": 0.33685142415415464, + "learning_rate": 4.8699952981940386e-05, + "loss": 2.8713, + "step": 35919 + }, + { + "epoch": 1.6723467653700212, + "grad_norm": 0.3452763101499685, + "learning_rate": 4.869724516307096e-05, + "loss": 2.7295, + "step": 35920 + }, + { + "epoch": 1.6723933235561144, + "grad_norm": 0.34491283382640037, + "learning_rate": 4.869453734802497e-05, + "loss": 2.7662, + "step": 35921 + }, + { + "epoch": 1.6724398817422075, + "grad_norm": 0.33529176643492936, + "learning_rate": 4.86918295368104e-05, + "loss": 2.6182, + "step": 35922 + }, + { 
+ "epoch": 1.6724864399283004, + "grad_norm": 0.3254625806643418, + "learning_rate": 4.868912172943519e-05, + "loss": 2.6252, + "step": 35923 + }, + { + "epoch": 1.6725329981143935, + "grad_norm": 0.330436557461146, + "learning_rate": 4.868641392590725e-05, + "loss": 2.7278, + "step": 35924 + }, + { + "epoch": 1.6725795563004864, + "grad_norm": 0.32783629928450514, + "learning_rate": 4.868370612623458e-05, + "loss": 2.8035, + "step": 35925 + }, + { + "epoch": 1.6726261144865795, + "grad_norm": 0.35162817648622546, + "learning_rate": 4.8680998330425105e-05, + "loss": 2.653, + "step": 35926 + }, + { + "epoch": 1.6726726726726726, + "grad_norm": 0.3117833520321631, + "learning_rate": 4.867829053848674e-05, + "loss": 2.6897, + "step": 35927 + }, + { + "epoch": 1.6727192308587657, + "grad_norm": 0.3391702339523403, + "learning_rate": 4.8675582750427494e-05, + "loss": 2.6233, + "step": 35928 + }, + { + "epoch": 1.6727657890448588, + "grad_norm": 0.3333488410107088, + "learning_rate": 4.867287496625527e-05, + "loss": 2.7617, + "step": 35929 + }, + { + "epoch": 1.672812347230952, + "grad_norm": 0.3358592747851472, + "learning_rate": 4.8670167185978034e-05, + "loss": 2.7197, + "step": 35930 + }, + { + "epoch": 1.672858905417045, + "grad_norm": 0.3264065722565172, + "learning_rate": 4.8667459409603716e-05, + "loss": 2.7237, + "step": 35931 + }, + { + "epoch": 1.6729054636031382, + "grad_norm": 0.35178407723932525, + "learning_rate": 4.866475163714027e-05, + "loss": 2.7963, + "step": 35932 + }, + { + "epoch": 1.672952021789231, + "grad_norm": 0.33334306790104945, + "learning_rate": 4.866204386859566e-05, + "loss": 2.6307, + "step": 35933 + }, + { + "epoch": 1.6729985799753242, + "grad_norm": 0.34226601685200114, + "learning_rate": 4.86593361039778e-05, + "loss": 2.7281, + "step": 35934 + }, + { + "epoch": 1.673045138161417, + "grad_norm": 0.38772517788433586, + "learning_rate": 4.865662834329468e-05, + "loss": 2.6403, + "step": 35935 + }, + { + "epoch": 1.6730916963475102, + "grad_norm": 0.3323112558318227, + "learning_rate": 4.865392058655422e-05, + "loss": 2.829, + "step": 35936 + }, + { + "epoch": 1.6731382545336033, + "grad_norm": 0.3714597340354187, + "learning_rate": 4.865121283376435e-05, + "loss": 2.7272, + "step": 35937 + }, + { + "epoch": 1.6731848127196964, + "grad_norm": 0.3560659215094553, + "learning_rate": 4.864850508493306e-05, + "loss": 2.6988, + "step": 35938 + }, + { + "epoch": 1.6732313709057896, + "grad_norm": 0.35426384230286845, + "learning_rate": 4.864579734006825e-05, + "loss": 2.697, + "step": 35939 + }, + { + "epoch": 1.6732779290918827, + "grad_norm": 0.3565866208581951, + "learning_rate": 4.864308959917792e-05, + "loss": 2.7182, + "step": 35940 + }, + { + "epoch": 1.6733244872779758, + "grad_norm": 0.3512330815596307, + "learning_rate": 4.8640381862269976e-05, + "loss": 2.6677, + "step": 35941 + }, + { + "epoch": 1.6733710454640687, + "grad_norm": 0.34321242701339344, + "learning_rate": 4.863767412935237e-05, + "loss": 2.8014, + "step": 35942 + }, + { + "epoch": 1.6734176036501618, + "grad_norm": 0.3682272229424748, + "learning_rate": 4.8634966400433076e-05, + "loss": 2.7148, + "step": 35943 + }, + { + "epoch": 1.673464161836255, + "grad_norm": 0.3395134366130587, + "learning_rate": 4.863225867552e-05, + "loss": 2.6986, + "step": 35944 + }, + { + "epoch": 1.6735107200223478, + "grad_norm": 0.3541567695085619, + "learning_rate": 4.8629550954621117e-05, + "loss": 2.6936, + "step": 35945 + }, + { + "epoch": 1.673557278208441, + "grad_norm": 0.32431178870230315, + 
"learning_rate": 4.862684323774438e-05, + "loss": 2.65, + "step": 35946 + }, + { + "epoch": 1.673603836394534, + "grad_norm": 0.3531202843122795, + "learning_rate": 4.8624135524897695e-05, + "loss": 2.7449, + "step": 35947 + }, + { + "epoch": 1.6736503945806271, + "grad_norm": 0.3202882541566841, + "learning_rate": 4.862142781608906e-05, + "loss": 2.693, + "step": 35948 + }, + { + "epoch": 1.6736969527667203, + "grad_norm": 0.36076950237018063, + "learning_rate": 4.86187201113264e-05, + "loss": 2.7381, + "step": 35949 + }, + { + "epoch": 1.6737435109528134, + "grad_norm": 0.3259603392777987, + "learning_rate": 4.8616012410617635e-05, + "loss": 2.6523, + "step": 35950 + }, + { + "epoch": 1.6737900691389065, + "grad_norm": 0.3612317931775824, + "learning_rate": 4.861330471397076e-05, + "loss": 2.7092, + "step": 35951 + }, + { + "epoch": 1.6738366273249994, + "grad_norm": 0.33615629836227173, + "learning_rate": 4.861059702139368e-05, + "loss": 2.6529, + "step": 35952 + }, + { + "epoch": 1.6738831855110925, + "grad_norm": 0.3440499917325127, + "learning_rate": 4.860788933289438e-05, + "loss": 2.7026, + "step": 35953 + }, + { + "epoch": 1.6739297436971856, + "grad_norm": 0.36192146804602215, + "learning_rate": 4.860518164848079e-05, + "loss": 2.7273, + "step": 35954 + }, + { + "epoch": 1.6739763018832785, + "grad_norm": 0.34372081631865237, + "learning_rate": 4.860247396816084e-05, + "loss": 2.7661, + "step": 35955 + }, + { + "epoch": 1.6740228600693716, + "grad_norm": 0.33370230193761585, + "learning_rate": 4.8599766291942504e-05, + "loss": 2.7284, + "step": 35956 + }, + { + "epoch": 1.6740694182554647, + "grad_norm": 0.3643098495824891, + "learning_rate": 4.8597058619833694e-05, + "loss": 2.6716, + "step": 35957 + }, + { + "epoch": 1.6741159764415579, + "grad_norm": 0.34418597410973, + "learning_rate": 4.8594350951842405e-05, + "loss": 2.6918, + "step": 35958 + }, + { + "epoch": 1.674162534627651, + "grad_norm": 0.34580262759040953, + "learning_rate": 4.8591643287976554e-05, + "loss": 2.7507, + "step": 35959 + }, + { + "epoch": 1.674209092813744, + "grad_norm": 0.3425670868364383, + "learning_rate": 4.858893562824407e-05, + "loss": 2.6858, + "step": 35960 + }, + { + "epoch": 1.6742556509998372, + "grad_norm": 0.35688102776922864, + "learning_rate": 4.8586227972652934e-05, + "loss": 2.7443, + "step": 35961 + }, + { + "epoch": 1.67430220918593, + "grad_norm": 0.366996798300741, + "learning_rate": 4.858352032121109e-05, + "loss": 2.7435, + "step": 35962 + }, + { + "epoch": 1.6743487673720232, + "grad_norm": 0.3739230845927425, + "learning_rate": 4.858081267392645e-05, + "loss": 2.6955, + "step": 35963 + }, + { + "epoch": 1.674395325558116, + "grad_norm": 0.39198224902775913, + "learning_rate": 4.857810503080701e-05, + "loss": 2.7324, + "step": 35964 + }, + { + "epoch": 1.6744418837442092, + "grad_norm": 0.3704860558140927, + "learning_rate": 4.857539739186066e-05, + "loss": 2.673, + "step": 35965 + }, + { + "epoch": 1.6744884419303023, + "grad_norm": 0.32885038094902036, + "learning_rate": 4.857268975709541e-05, + "loss": 2.7287, + "step": 35966 + }, + { + "epoch": 1.6745350001163954, + "grad_norm": 0.3834393587271252, + "learning_rate": 4.856998212651917e-05, + "loss": 2.7207, + "step": 35967 + }, + { + "epoch": 1.6745815583024886, + "grad_norm": 0.33903755098049954, + "learning_rate": 4.856727450013989e-05, + "loss": 2.6465, + "step": 35968 + }, + { + "epoch": 1.6746281164885817, + "grad_norm": 0.31278686339971196, + "learning_rate": 4.856456687796552e-05, + "loss": 2.657, + "step": 35969 + }, 
+ { + "epoch": 1.6746746746746748, + "grad_norm": 0.3494826007278388, + "learning_rate": 4.856185926000398e-05, + "loss": 2.6439, + "step": 35970 + }, + { + "epoch": 1.674721232860768, + "grad_norm": 0.3571737907322638, + "learning_rate": 4.855915164626327e-05, + "loss": 2.7574, + "step": 35971 + }, + { + "epoch": 1.6747677910468608, + "grad_norm": 0.3349501519997622, + "learning_rate": 4.8556444036751305e-05, + "loss": 2.6547, + "step": 35972 + }, + { + "epoch": 1.674814349232954, + "grad_norm": 0.33790204042489874, + "learning_rate": 4.855373643147601e-05, + "loss": 2.716, + "step": 35973 + }, + { + "epoch": 1.6748609074190468, + "grad_norm": 0.35732729031437316, + "learning_rate": 4.8551028830445386e-05, + "loss": 2.7465, + "step": 35974 + }, + { + "epoch": 1.67490746560514, + "grad_norm": 0.3386319332600552, + "learning_rate": 4.854832123366732e-05, + "loss": 2.6801, + "step": 35975 + }, + { + "epoch": 1.674954023791233, + "grad_norm": 0.3490873774843622, + "learning_rate": 4.854561364114981e-05, + "loss": 2.6945, + "step": 35976 + }, + { + "epoch": 1.6750005819773262, + "grad_norm": 0.39020218233233306, + "learning_rate": 4.854290605290078e-05, + "loss": 2.7399, + "step": 35977 + }, + { + "epoch": 1.6750471401634193, + "grad_norm": 0.304551357055907, + "learning_rate": 4.854019846892817e-05, + "loss": 2.703, + "step": 35978 + }, + { + "epoch": 1.6750936983495124, + "grad_norm": 0.38417236173162944, + "learning_rate": 4.853749088923994e-05, + "loss": 2.6979, + "step": 35979 + }, + { + "epoch": 1.6751402565356055, + "grad_norm": 0.3612659187437835, + "learning_rate": 4.853478331384403e-05, + "loss": 2.7123, + "step": 35980 + }, + { + "epoch": 1.6751868147216984, + "grad_norm": 0.3583633148371532, + "learning_rate": 4.8532075742748386e-05, + "loss": 2.8875, + "step": 35981 + }, + { + "epoch": 1.6752333729077915, + "grad_norm": 0.3818807088440135, + "learning_rate": 4.852936817596096e-05, + "loss": 2.6799, + "step": 35982 + }, + { + "epoch": 1.6752799310938846, + "grad_norm": 0.36151610814899904, + "learning_rate": 4.8526660613489675e-05, + "loss": 2.7464, + "step": 35983 + }, + { + "epoch": 1.6753264892799775, + "grad_norm": 0.33605507234568516, + "learning_rate": 4.852395305534251e-05, + "loss": 2.8036, + "step": 35984 + }, + { + "epoch": 1.6753730474660706, + "grad_norm": 0.3576375121704674, + "learning_rate": 4.8521245501527404e-05, + "loss": 2.7318, + "step": 35985 + }, + { + "epoch": 1.6754196056521637, + "grad_norm": 0.35736973975514236, + "learning_rate": 4.8518537952052274e-05, + "loss": 2.6902, + "step": 35986 + }, + { + "epoch": 1.6754661638382569, + "grad_norm": 0.3512979811121797, + "learning_rate": 4.851583040692511e-05, + "loss": 2.8376, + "step": 35987 + }, + { + "epoch": 1.67551272202435, + "grad_norm": 0.3330123720751299, + "learning_rate": 4.8513122866153816e-05, + "loss": 2.7781, + "step": 35988 + }, + { + "epoch": 1.675559280210443, + "grad_norm": 0.3372965216362396, + "learning_rate": 4.851041532974638e-05, + "loss": 2.6504, + "step": 35989 + }, + { + "epoch": 1.6756058383965362, + "grad_norm": 0.34458550263124216, + "learning_rate": 4.850770779771073e-05, + "loss": 2.635, + "step": 35990 + }, + { + "epoch": 1.675652396582629, + "grad_norm": 0.34948717685558534, + "learning_rate": 4.850500027005478e-05, + "loss": 2.7949, + "step": 35991 + }, + { + "epoch": 1.6756989547687222, + "grad_norm": 0.33629919411002135, + "learning_rate": 4.850229274678654e-05, + "loss": 2.7524, + "step": 35992 + }, + { + "epoch": 1.6757455129548153, + "grad_norm": 0.3780741788664842, + 
"learning_rate": 4.849958522791391e-05, + "loss": 2.6853, + "step": 35993 + }, + { + "epoch": 1.6757920711409082, + "grad_norm": 0.33769722222888066, + "learning_rate": 4.8496877713444864e-05, + "loss": 2.6689, + "step": 35994 + }, + { + "epoch": 1.6758386293270013, + "grad_norm": 0.3224182235957296, + "learning_rate": 4.849417020338733e-05, + "loss": 2.774, + "step": 35995 + }, + { + "epoch": 1.6758851875130945, + "grad_norm": 0.3850928092668305, + "learning_rate": 4.8491462697749236e-05, + "loss": 2.7176, + "step": 35996 + }, + { + "epoch": 1.6759317456991876, + "grad_norm": 0.35146399227597547, + "learning_rate": 4.848875519653857e-05, + "loss": 2.6656, + "step": 35997 + }, + { + "epoch": 1.6759783038852807, + "grad_norm": 0.3675286879744215, + "learning_rate": 4.8486047699763264e-05, + "loss": 2.7807, + "step": 35998 + }, + { + "epoch": 1.6760248620713738, + "grad_norm": 0.33373094069674697, + "learning_rate": 4.848334020743123e-05, + "loss": 2.7817, + "step": 35999 + }, + { + "epoch": 1.676071420257467, + "grad_norm": 0.3507532419220162, + "learning_rate": 4.8480632719550476e-05, + "loss": 2.6064, + "step": 36000 + }, + { + "epoch": 1.6761179784435598, + "grad_norm": 0.34870194139845684, + "learning_rate": 4.8477925236128886e-05, + "loss": 2.6166, + "step": 36001 + }, + { + "epoch": 1.676164536629653, + "grad_norm": 0.3375823070677284, + "learning_rate": 4.8475217757174454e-05, + "loss": 2.7503, + "step": 36002 + }, + { + "epoch": 1.6762110948157458, + "grad_norm": 0.37576588283760165, + "learning_rate": 4.847251028269511e-05, + "loss": 2.7567, + "step": 36003 + }, + { + "epoch": 1.676257653001839, + "grad_norm": 0.3554835975231598, + "learning_rate": 4.846980281269878e-05, + "loss": 2.715, + "step": 36004 + }, + { + "epoch": 1.676304211187932, + "grad_norm": 0.3639245649801108, + "learning_rate": 4.846709534719345e-05, + "loss": 2.6715, + "step": 36005 + }, + { + "epoch": 1.6763507693740252, + "grad_norm": 0.3801232192406714, + "learning_rate": 4.8464387886187026e-05, + "loss": 2.6933, + "step": 36006 + }, + { + "epoch": 1.6763973275601183, + "grad_norm": 0.35621408613354266, + "learning_rate": 4.846168042968749e-05, + "loss": 2.7044, + "step": 36007 + }, + { + "epoch": 1.6764438857462114, + "grad_norm": 0.35271967099297036, + "learning_rate": 4.845897297770276e-05, + "loss": 2.6782, + "step": 36008 + }, + { + "epoch": 1.6764904439323045, + "grad_norm": 0.354931918340241, + "learning_rate": 4.8456265530240785e-05, + "loss": 2.7997, + "step": 36009 + }, + { + "epoch": 1.6765370021183976, + "grad_norm": 0.3458554552374824, + "learning_rate": 4.845355808730954e-05, + "loss": 2.7614, + "step": 36010 + }, + { + "epoch": 1.6765835603044905, + "grad_norm": 0.36525782262289963, + "learning_rate": 4.845085064891692e-05, + "loss": 2.7071, + "step": 36011 + }, + { + "epoch": 1.6766301184905836, + "grad_norm": 0.37363174577660746, + "learning_rate": 4.844814321507092e-05, + "loss": 2.7524, + "step": 36012 + }, + { + "epoch": 1.6766766766766765, + "grad_norm": 0.3480791237748936, + "learning_rate": 4.8445435785779466e-05, + "loss": 2.7196, + "step": 36013 + }, + { + "epoch": 1.6767232348627696, + "grad_norm": 0.3901430237221658, + "learning_rate": 4.844272836105049e-05, + "loss": 2.7887, + "step": 36014 + }, + { + "epoch": 1.6767697930488628, + "grad_norm": 0.3630537005329721, + "learning_rate": 4.8440020940891975e-05, + "loss": 2.7524, + "step": 36015 + }, + { + "epoch": 1.6768163512349559, + "grad_norm": 0.4043530021465563, + "learning_rate": 4.843731352531184e-05, + "loss": 2.7749, + "step": 
36016 + }, + { + "epoch": 1.676862909421049, + "grad_norm": 0.353589708085267, + "learning_rate": 4.843460611431802e-05, + "loss": 2.7722, + "step": 36017 + }, + { + "epoch": 1.676909467607142, + "grad_norm": 0.3935719424176588, + "learning_rate": 4.84318987079185e-05, + "loss": 2.7494, + "step": 36018 + }, + { + "epoch": 1.6769560257932352, + "grad_norm": 0.369930742006973, + "learning_rate": 4.8429191306121186e-05, + "loss": 2.8397, + "step": 36019 + }, + { + "epoch": 1.6770025839793283, + "grad_norm": 0.32450105796309103, + "learning_rate": 4.842648390893405e-05, + "loss": 2.6704, + "step": 36020 + }, + { + "epoch": 1.6770491421654212, + "grad_norm": 0.3597473079610293, + "learning_rate": 4.842377651636503e-05, + "loss": 2.6448, + "step": 36021 + }, + { + "epoch": 1.6770957003515143, + "grad_norm": 0.31564686770810924, + "learning_rate": 4.842106912842205e-05, + "loss": 2.7704, + "step": 36022 + }, + { + "epoch": 1.6771422585376072, + "grad_norm": 0.32674788109683783, + "learning_rate": 4.84183617451131e-05, + "loss": 2.8194, + "step": 36023 + }, + { + "epoch": 1.6771888167237003, + "grad_norm": 0.32848902781581835, + "learning_rate": 4.841565436644608e-05, + "loss": 2.7264, + "step": 36024 + }, + { + "epoch": 1.6772353749097935, + "grad_norm": 0.33946080934842326, + "learning_rate": 4.841294699242898e-05, + "loss": 2.6915, + "step": 36025 + }, + { + "epoch": 1.6772819330958866, + "grad_norm": 0.31957115644484746, + "learning_rate": 4.8410239623069724e-05, + "loss": 2.7716, + "step": 36026 + }, + { + "epoch": 1.6773284912819797, + "grad_norm": 0.32788644257876703, + "learning_rate": 4.8407532258376234e-05, + "loss": 2.714, + "step": 36027 + }, + { + "epoch": 1.6773750494680728, + "grad_norm": 0.3719399522669354, + "learning_rate": 4.840482489835651e-05, + "loss": 2.8254, + "step": 36028 + }, + { + "epoch": 1.677421607654166, + "grad_norm": 0.3021364111511569, + "learning_rate": 4.840211754301845e-05, + "loss": 2.6948, + "step": 36029 + }, + { + "epoch": 1.6774681658402588, + "grad_norm": 0.3571124071260735, + "learning_rate": 4.8399410192370015e-05, + "loss": 2.7524, + "step": 36030 + }, + { + "epoch": 1.677514724026352, + "grad_norm": 0.34152685223601503, + "learning_rate": 4.839670284641917e-05, + "loss": 2.7771, + "step": 36031 + }, + { + "epoch": 1.677561282212445, + "grad_norm": 0.32936803167599155, + "learning_rate": 4.839399550517383e-05, + "loss": 2.7069, + "step": 36032 + }, + { + "epoch": 1.677607840398538, + "grad_norm": 0.3255595605719826, + "learning_rate": 4.839128816864197e-05, + "loss": 2.7361, + "step": 36033 + }, + { + "epoch": 1.677654398584631, + "grad_norm": 0.3352696855153003, + "learning_rate": 4.838858083683152e-05, + "loss": 2.7344, + "step": 36034 + }, + { + "epoch": 1.6777009567707242, + "grad_norm": 0.3577477649785517, + "learning_rate": 4.83858735097504e-05, + "loss": 2.7135, + "step": 36035 + }, + { + "epoch": 1.6777475149568173, + "grad_norm": 0.3397957401707201, + "learning_rate": 4.8383166187406604e-05, + "loss": 2.6762, + "step": 36036 + }, + { + "epoch": 1.6777940731429104, + "grad_norm": 0.35136281922489987, + "learning_rate": 4.838045886980804e-05, + "loss": 2.842, + "step": 36037 + }, + { + "epoch": 1.6778406313290035, + "grad_norm": 0.34742574987341585, + "learning_rate": 4.837775155696269e-05, + "loss": 2.7464, + "step": 36038 + }, + { + "epoch": 1.6778871895150966, + "grad_norm": 0.3518742277727642, + "learning_rate": 4.837504424887847e-05, + "loss": 2.7236, + "step": 36039 + }, + { + "epoch": 1.6779337477011895, + "grad_norm": 
0.3537841525633333, + "learning_rate": 4.8372336945563326e-05, + "loss": 2.8117, + "step": 36040 + }, + { + "epoch": 1.6779803058872826, + "grad_norm": 0.30908041342039605, + "learning_rate": 4.836962964702523e-05, + "loss": 2.716, + "step": 36041 + }, + { + "epoch": 1.6780268640733755, + "grad_norm": 0.3565055578728649, + "learning_rate": 4.836692235327209e-05, + "loss": 2.8203, + "step": 36042 + }, + { + "epoch": 1.6780734222594686, + "grad_norm": 0.3039587616242439, + "learning_rate": 4.8364215064311886e-05, + "loss": 2.7323, + "step": 36043 + }, + { + "epoch": 1.6781199804455618, + "grad_norm": 0.3477765425172662, + "learning_rate": 4.8361507780152554e-05, + "loss": 2.7067, + "step": 36044 + }, + { + "epoch": 1.6781665386316549, + "grad_norm": 0.3326576770609981, + "learning_rate": 4.8358800500802025e-05, + "loss": 2.7852, + "step": 36045 + }, + { + "epoch": 1.678213096817748, + "grad_norm": 0.33147961758310945, + "learning_rate": 4.835609322626827e-05, + "loss": 2.6813, + "step": 36046 + }, + { + "epoch": 1.678259655003841, + "grad_norm": 0.37864625564730325, + "learning_rate": 4.835338595655919e-05, + "loss": 2.677, + "step": 36047 + }, + { + "epoch": 1.6783062131899342, + "grad_norm": 0.3485782825405373, + "learning_rate": 4.8350678691682786e-05, + "loss": 2.7781, + "step": 36048 + }, + { + "epoch": 1.6783527713760273, + "grad_norm": 0.35674600441072485, + "learning_rate": 4.834797143164698e-05, + "loss": 2.7781, + "step": 36049 + }, + { + "epoch": 1.6783993295621202, + "grad_norm": 0.37905808865324664, + "learning_rate": 4.8345264176459685e-05, + "loss": 2.7675, + "step": 36050 + }, + { + "epoch": 1.6784458877482133, + "grad_norm": 0.3294611564607655, + "learning_rate": 4.8342556926128904e-05, + "loss": 2.7898, + "step": 36051 + }, + { + "epoch": 1.6784924459343062, + "grad_norm": 0.33095140714612414, + "learning_rate": 4.833984968066256e-05, + "loss": 2.7944, + "step": 36052 + }, + { + "epoch": 1.6785390041203994, + "grad_norm": 0.3534391553905056, + "learning_rate": 4.8337142440068565e-05, + "loss": 2.7431, + "step": 36053 + }, + { + "epoch": 1.6785855623064925, + "grad_norm": 0.32309773818300114, + "learning_rate": 4.8334435204354916e-05, + "loss": 2.6775, + "step": 36054 + }, + { + "epoch": 1.6786321204925856, + "grad_norm": 0.33773719304825195, + "learning_rate": 4.833172797352953e-05, + "loss": 2.7482, + "step": 36055 + }, + { + "epoch": 1.6786786786786787, + "grad_norm": 0.33328365897415013, + "learning_rate": 4.832902074760035e-05, + "loss": 2.6606, + "step": 36056 + }, + { + "epoch": 1.6787252368647718, + "grad_norm": 0.33554317300136954, + "learning_rate": 4.8326313526575346e-05, + "loss": 2.6896, + "step": 36057 + }, + { + "epoch": 1.678771795050865, + "grad_norm": 0.3264881526286667, + "learning_rate": 4.832360631046243e-05, + "loss": 2.7365, + "step": 36058 + }, + { + "epoch": 1.678818353236958, + "grad_norm": 0.32117231859666484, + "learning_rate": 4.8320899099269575e-05, + "loss": 2.735, + "step": 36059 + }, + { + "epoch": 1.678864911423051, + "grad_norm": 0.3200486081320515, + "learning_rate": 4.831819189300471e-05, + "loss": 2.7807, + "step": 36060 + }, + { + "epoch": 1.678911469609144, + "grad_norm": 0.33502731926678125, + "learning_rate": 4.8315484691675786e-05, + "loss": 2.7547, + "step": 36061 + }, + { + "epoch": 1.678958027795237, + "grad_norm": 0.3388244778097329, + "learning_rate": 4.831277749529076e-05, + "loss": 2.7619, + "step": 36062 + }, + { + "epoch": 1.67900458598133, + "grad_norm": 0.32989104502073147, + "learning_rate": 4.831007030385755e-05, + 
"loss": 2.697, + "step": 36063 + }, + { + "epoch": 1.6790511441674232, + "grad_norm": 0.34324699583536133, + "learning_rate": 4.8307363117384126e-05, + "loss": 2.8686, + "step": 36064 + }, + { + "epoch": 1.6790977023535163, + "grad_norm": 0.3553140043707152, + "learning_rate": 4.8304655935878405e-05, + "loss": 2.5932, + "step": 36065 + }, + { + "epoch": 1.6791442605396094, + "grad_norm": 0.32247530707384675, + "learning_rate": 4.830194875934838e-05, + "loss": 2.6651, + "step": 36066 + }, + { + "epoch": 1.6791908187257025, + "grad_norm": 0.364997889940857, + "learning_rate": 4.829924158780197e-05, + "loss": 2.7465, + "step": 36067 + }, + { + "epoch": 1.6792373769117956, + "grad_norm": 0.3344523045303188, + "learning_rate": 4.82965344212471e-05, + "loss": 2.6577, + "step": 36068 + }, + { + "epoch": 1.6792839350978885, + "grad_norm": 0.3648983673415866, + "learning_rate": 4.829382725969173e-05, + "loss": 2.8334, + "step": 36069 + }, + { + "epoch": 1.6793304932839817, + "grad_norm": 0.3796350870237639, + "learning_rate": 4.8291120103143816e-05, + "loss": 2.7347, + "step": 36070 + }, + { + "epoch": 1.6793770514700748, + "grad_norm": 0.3519413885521074, + "learning_rate": 4.828841295161131e-05, + "loss": 2.7178, + "step": 36071 + }, + { + "epoch": 1.6794236096561677, + "grad_norm": 0.36763828381386215, + "learning_rate": 4.828570580510214e-05, + "loss": 2.7271, + "step": 36072 + }, + { + "epoch": 1.6794701678422608, + "grad_norm": 0.3523649294181242, + "learning_rate": 4.828299866362423e-05, + "loss": 2.7785, + "step": 36073 + }, + { + "epoch": 1.679516726028354, + "grad_norm": 0.36761186417599095, + "learning_rate": 4.8280291527185576e-05, + "loss": 2.7382, + "step": 36074 + }, + { + "epoch": 1.679563284214447, + "grad_norm": 0.35905330928952023, + "learning_rate": 4.82775843957941e-05, + "loss": 2.776, + "step": 36075 + }, + { + "epoch": 1.6796098424005401, + "grad_norm": 0.36210151565785786, + "learning_rate": 4.8274877269457706e-05, + "loss": 2.7479, + "step": 36076 + }, + { + "epoch": 1.6796564005866332, + "grad_norm": 0.3646599801893506, + "learning_rate": 4.8272170148184416e-05, + "loss": 2.7536, + "step": 36077 + }, + { + "epoch": 1.6797029587727264, + "grad_norm": 0.38595556427047056, + "learning_rate": 4.826946303198211e-05, + "loss": 2.6193, + "step": 36078 + }, + { + "epoch": 1.6797495169588192, + "grad_norm": 0.38228769797233475, + "learning_rate": 4.8266755920858774e-05, + "loss": 2.7099, + "step": 36079 + }, + { + "epoch": 1.6797960751449124, + "grad_norm": 0.3249413159384473, + "learning_rate": 4.8264048814822346e-05, + "loss": 2.6481, + "step": 36080 + }, + { + "epoch": 1.6798426333310055, + "grad_norm": 0.3522611706722174, + "learning_rate": 4.8261341713880754e-05, + "loss": 2.649, + "step": 36081 + }, + { + "epoch": 1.6798891915170984, + "grad_norm": 0.3351358217066712, + "learning_rate": 4.8258634618041946e-05, + "loss": 2.702, + "step": 36082 + }, + { + "epoch": 1.6799357497031915, + "grad_norm": 0.3158767969307338, + "learning_rate": 4.8255927527313884e-05, + "loss": 2.6812, + "step": 36083 + }, + { + "epoch": 1.6799823078892846, + "grad_norm": 0.3602720158143054, + "learning_rate": 4.82532204417045e-05, + "loss": 2.722, + "step": 36084 + }, + { + "epoch": 1.6800288660753777, + "grad_norm": 0.32310046381990926, + "learning_rate": 4.825051336122176e-05, + "loss": 2.6704, + "step": 36085 + }, + { + "epoch": 1.6800754242614708, + "grad_norm": 0.3558731958816697, + "learning_rate": 4.824780628587355e-05, + "loss": 2.7846, + "step": 36086 + }, + { + "epoch": 1.680121982447564, + 
"grad_norm": 0.34730472140639806, + "learning_rate": 4.824509921566789e-05, + "loss": 2.6534, + "step": 36087 + }, + { + "epoch": 1.680168540633657, + "grad_norm": 0.3529932858730215, + "learning_rate": 4.8242392150612665e-05, + "loss": 2.751, + "step": 36088 + }, + { + "epoch": 1.68021509881975, + "grad_norm": 0.3342445143277043, + "learning_rate": 4.823968509071587e-05, + "loss": 2.7541, + "step": 36089 + }, + { + "epoch": 1.680261657005843, + "grad_norm": 0.3558788244762961, + "learning_rate": 4.823697803598543e-05, + "loss": 2.8202, + "step": 36090 + }, + { + "epoch": 1.680308215191936, + "grad_norm": 0.339788924266732, + "learning_rate": 4.823427098642925e-05, + "loss": 2.7069, + "step": 36091 + }, + { + "epoch": 1.680354773378029, + "grad_norm": 0.35781855105970223, + "learning_rate": 4.823156394205534e-05, + "loss": 2.7024, + "step": 36092 + }, + { + "epoch": 1.6804013315641222, + "grad_norm": 0.33386965318498274, + "learning_rate": 4.822885690287162e-05, + "loss": 2.6453, + "step": 36093 + }, + { + "epoch": 1.6804478897502153, + "grad_norm": 0.33947060343690044, + "learning_rate": 4.8226149868886014e-05, + "loss": 2.6697, + "step": 36094 + }, + { + "epoch": 1.6804944479363084, + "grad_norm": 0.34106494508675284, + "learning_rate": 4.8223442840106486e-05, + "loss": 2.7455, + "step": 36095 + }, + { + "epoch": 1.6805410061224015, + "grad_norm": 0.34452937398183725, + "learning_rate": 4.822073581654098e-05, + "loss": 2.7023, + "step": 36096 + }, + { + "epoch": 1.6805875643084947, + "grad_norm": 0.33506312722132947, + "learning_rate": 4.821802879819745e-05, + "loss": 2.6935, + "step": 36097 + }, + { + "epoch": 1.6806341224945878, + "grad_norm": 0.3485943115180008, + "learning_rate": 4.821532178508382e-05, + "loss": 2.6747, + "step": 36098 + }, + { + "epoch": 1.6806806806806807, + "grad_norm": 0.34233303457172043, + "learning_rate": 4.821261477720803e-05, + "loss": 2.6995, + "step": 36099 + }, + { + "epoch": 1.6807272388667738, + "grad_norm": 0.33014960888747397, + "learning_rate": 4.820990777457807e-05, + "loss": 2.6542, + "step": 36100 + }, + { + "epoch": 1.6807737970528667, + "grad_norm": 0.363827204035066, + "learning_rate": 4.820720077720182e-05, + "loss": 2.7025, + "step": 36101 + }, + { + "epoch": 1.6808203552389598, + "grad_norm": 0.33697634131008, + "learning_rate": 4.820449378508728e-05, + "loss": 2.7012, + "step": 36102 + }, + { + "epoch": 1.680866913425053, + "grad_norm": 0.3370678979990235, + "learning_rate": 4.8201786798242384e-05, + "loss": 2.8215, + "step": 36103 + }, + { + "epoch": 1.680913471611146, + "grad_norm": 0.33705493544629694, + "learning_rate": 4.819907981667504e-05, + "loss": 2.7694, + "step": 36104 + }, + { + "epoch": 1.6809600297972391, + "grad_norm": 0.34544002540804486, + "learning_rate": 4.819637284039324e-05, + "loss": 2.5892, + "step": 36105 + }, + { + "epoch": 1.6810065879833322, + "grad_norm": 0.35244343606635054, + "learning_rate": 4.81936658694049e-05, + "loss": 2.7675, + "step": 36106 + }, + { + "epoch": 1.6810531461694254, + "grad_norm": 0.31850109169409235, + "learning_rate": 4.819095890371798e-05, + "loss": 2.6984, + "step": 36107 + }, + { + "epoch": 1.6810997043555183, + "grad_norm": 0.3362503529711092, + "learning_rate": 4.8188251943340404e-05, + "loss": 2.7309, + "step": 36108 + }, + { + "epoch": 1.6811462625416114, + "grad_norm": 0.34375494705145215, + "learning_rate": 4.818554498828014e-05, + "loss": 2.7328, + "step": 36109 + }, + { + "epoch": 1.6811928207277045, + "grad_norm": 0.3469599852013658, + "learning_rate": 4.818283803854513e-05, + 
"loss": 2.6338, + "step": 36110 + }, + { + "epoch": 1.6812393789137974, + "grad_norm": 0.3515971749450523, + "learning_rate": 4.81801310941433e-05, + "loss": 2.5953, + "step": 36111 + }, + { + "epoch": 1.6812859370998905, + "grad_norm": 0.3415211295336968, + "learning_rate": 4.8177424155082594e-05, + "loss": 2.7475, + "step": 36112 + }, + { + "epoch": 1.6813324952859836, + "grad_norm": 0.33838651208285847, + "learning_rate": 4.8174717221370995e-05, + "loss": 2.7073, + "step": 36113 + }, + { + "epoch": 1.6813790534720767, + "grad_norm": 0.34093753029024443, + "learning_rate": 4.81720102930164e-05, + "loss": 2.8453, + "step": 36114 + }, + { + "epoch": 1.6814256116581698, + "grad_norm": 0.3450554768732111, + "learning_rate": 4.8169303370026785e-05, + "loss": 2.6965, + "step": 36115 + }, + { + "epoch": 1.681472169844263, + "grad_norm": 0.36943405910076277, + "learning_rate": 4.816659645241009e-05, + "loss": 2.6863, + "step": 36116 + }, + { + "epoch": 1.681518728030356, + "grad_norm": 0.32949306270858725, + "learning_rate": 4.8163889540174227e-05, + "loss": 2.7769, + "step": 36117 + }, + { + "epoch": 1.681565286216449, + "grad_norm": 0.37498650987785503, + "learning_rate": 4.81611826333272e-05, + "loss": 2.779, + "step": 36118 + }, + { + "epoch": 1.681611844402542, + "grad_norm": 0.36530096386400246, + "learning_rate": 4.81584757318769e-05, + "loss": 2.7136, + "step": 36119 + }, + { + "epoch": 1.6816584025886352, + "grad_norm": 0.32092647839044963, + "learning_rate": 4.815576883583129e-05, + "loss": 2.7771, + "step": 36120 + }, + { + "epoch": 1.681704960774728, + "grad_norm": 0.33456024809222573, + "learning_rate": 4.815306194519834e-05, + "loss": 2.6319, + "step": 36121 + }, + { + "epoch": 1.6817515189608212, + "grad_norm": 0.3339968053744026, + "learning_rate": 4.8150355059985954e-05, + "loss": 2.7118, + "step": 36122 + }, + { + "epoch": 1.6817980771469143, + "grad_norm": 0.3645201309752742, + "learning_rate": 4.81476481802021e-05, + "loss": 2.6723, + "step": 36123 + }, + { + "epoch": 1.6818446353330074, + "grad_norm": 0.3478163078919844, + "learning_rate": 4.8144941305854706e-05, + "loss": 2.7725, + "step": 36124 + }, + { + "epoch": 1.6818911935191005, + "grad_norm": 0.3565055086297756, + "learning_rate": 4.814223443695174e-05, + "loss": 2.7504, + "step": 36125 + }, + { + "epoch": 1.6819377517051937, + "grad_norm": 0.37730629610287675, + "learning_rate": 4.8139527573501136e-05, + "loss": 2.6973, + "step": 36126 + }, + { + "epoch": 1.6819843098912868, + "grad_norm": 0.333785359898574, + "learning_rate": 4.813682071551081e-05, + "loss": 2.7108, + "step": 36127 + }, + { + "epoch": 1.6820308680773797, + "grad_norm": 0.35363712714569434, + "learning_rate": 4.813411386298876e-05, + "loss": 2.7269, + "step": 36128 + }, + { + "epoch": 1.6820774262634728, + "grad_norm": 0.3467570423391029, + "learning_rate": 4.813140701594289e-05, + "loss": 2.7473, + "step": 36129 + }, + { + "epoch": 1.6821239844495657, + "grad_norm": 0.3285375911690284, + "learning_rate": 4.8128700174381145e-05, + "loss": 2.6892, + "step": 36130 + }, + { + "epoch": 1.6821705426356588, + "grad_norm": 0.32958706660958503, + "learning_rate": 4.81259933383115e-05, + "loss": 2.8299, + "step": 36131 + }, + { + "epoch": 1.682217100821752, + "grad_norm": 0.358313099881871, + "learning_rate": 4.812328650774187e-05, + "loss": 2.7372, + "step": 36132 + }, + { + "epoch": 1.682263659007845, + "grad_norm": 0.33817272565760625, + "learning_rate": 4.812057968268021e-05, + "loss": 2.6213, + "step": 36133 + }, + { + "epoch": 1.6823102171939381, + 
"grad_norm": 0.3370724555868319, + "learning_rate": 4.8117872863134467e-05, + "loss": 2.7509, + "step": 36134 + }, + { + "epoch": 1.6823567753800313, + "grad_norm": 0.35257244663436943, + "learning_rate": 4.811516604911257e-05, + "loss": 2.733, + "step": 36135 + }, + { + "epoch": 1.6824033335661244, + "grad_norm": 0.33012026802452893, + "learning_rate": 4.8112459240622496e-05, + "loss": 2.5744, + "step": 36136 + }, + { + "epoch": 1.6824498917522175, + "grad_norm": 0.3350085941498786, + "learning_rate": 4.810975243767214e-05, + "loss": 2.7457, + "step": 36137 + }, + { + "epoch": 1.6824964499383104, + "grad_norm": 0.33370646135052817, + "learning_rate": 4.8107045640269496e-05, + "loss": 2.7428, + "step": 36138 + }, + { + "epoch": 1.6825430081244035, + "grad_norm": 0.3392819557599077, + "learning_rate": 4.810433884842249e-05, + "loss": 2.5723, + "step": 36139 + }, + { + "epoch": 1.6825895663104964, + "grad_norm": 0.34117500170629195, + "learning_rate": 4.810163206213904e-05, + "loss": 2.8117, + "step": 36140 + }, + { + "epoch": 1.6826361244965895, + "grad_norm": 0.3449024164934935, + "learning_rate": 4.809892528142714e-05, + "loss": 2.7245, + "step": 36141 + }, + { + "epoch": 1.6826826826826826, + "grad_norm": 0.31447898485886844, + "learning_rate": 4.809621850629468e-05, + "loss": 2.8353, + "step": 36142 + }, + { + "epoch": 1.6827292408687757, + "grad_norm": 0.3519590517082118, + "learning_rate": 4.809351173674965e-05, + "loss": 2.7676, + "step": 36143 + }, + { + "epoch": 1.6827757990548688, + "grad_norm": 0.3387458842532935, + "learning_rate": 4.8090804972799974e-05, + "loss": 2.7645, + "step": 36144 + }, + { + "epoch": 1.682822357240962, + "grad_norm": 0.3171846729672474, + "learning_rate": 4.808809821445359e-05, + "loss": 2.7161, + "step": 36145 + }, + { + "epoch": 1.682868915427055, + "grad_norm": 0.3426111484399397, + "learning_rate": 4.8085391461718446e-05, + "loss": 2.7046, + "step": 36146 + }, + { + "epoch": 1.6829154736131482, + "grad_norm": 0.3175797386698613, + "learning_rate": 4.8082684714602506e-05, + "loss": 2.7417, + "step": 36147 + }, + { + "epoch": 1.682962031799241, + "grad_norm": 0.34363032399890975, + "learning_rate": 4.807997797311369e-05, + "loss": 2.7214, + "step": 36148 + }, + { + "epoch": 1.6830085899853342, + "grad_norm": 0.33951953085091197, + "learning_rate": 4.8077271237259955e-05, + "loss": 2.7814, + "step": 36149 + }, + { + "epoch": 1.683055148171427, + "grad_norm": 0.31083050510605953, + "learning_rate": 4.807456450704921e-05, + "loss": 2.7938, + "step": 36150 + }, + { + "epoch": 1.6831017063575202, + "grad_norm": 0.34891977832010196, + "learning_rate": 4.807185778248946e-05, + "loss": 2.6797, + "step": 36151 + }, + { + "epoch": 1.6831482645436133, + "grad_norm": 0.3551526539140847, + "learning_rate": 4.8069151063588616e-05, + "loss": 2.6509, + "step": 36152 + }, + { + "epoch": 1.6831948227297064, + "grad_norm": 0.3477723039683153, + "learning_rate": 4.80664443503546e-05, + "loss": 2.6804, + "step": 36153 + }, + { + "epoch": 1.6832413809157996, + "grad_norm": 0.33125448645592287, + "learning_rate": 4.80637376427954e-05, + "loss": 2.6677, + "step": 36154 + }, + { + "epoch": 1.6832879391018927, + "grad_norm": 0.356441136781509, + "learning_rate": 4.806103094091892e-05, + "loss": 2.7319, + "step": 36155 + }, + { + "epoch": 1.6833344972879858, + "grad_norm": 0.3667958023964378, + "learning_rate": 4.805832424473314e-05, + "loss": 2.8169, + "step": 36156 + }, + { + "epoch": 1.6833810554740787, + "grad_norm": 0.3401570859144432, + "learning_rate": 
4.8055617554245995e-05, + "loss": 2.7941, + "step": 36157 + }, + { + "epoch": 1.6834276136601718, + "grad_norm": 0.3843647480685749, + "learning_rate": 4.8052910869465405e-05, + "loss": 2.6176, + "step": 36158 + }, + { + "epoch": 1.683474171846265, + "grad_norm": 0.3355648298843321, + "learning_rate": 4.805020419039933e-05, + "loss": 2.632, + "step": 36159 + }, + { + "epoch": 1.6835207300323578, + "grad_norm": 0.3811772323020964, + "learning_rate": 4.804749751705572e-05, + "loss": 2.6477, + "step": 36160 + }, + { + "epoch": 1.683567288218451, + "grad_norm": 0.3669361882436169, + "learning_rate": 4.8044790849442516e-05, + "loss": 2.7445, + "step": 36161 + }, + { + "epoch": 1.683613846404544, + "grad_norm": 0.335428406129951, + "learning_rate": 4.804208418756766e-05, + "loss": 2.6956, + "step": 36162 + }, + { + "epoch": 1.6836604045906371, + "grad_norm": 0.33848647308245994, + "learning_rate": 4.803937753143907e-05, + "loss": 2.6598, + "step": 36163 + }, + { + "epoch": 1.6837069627767303, + "grad_norm": 0.31864737459998543, + "learning_rate": 4.8036670881064744e-05, + "loss": 2.688, + "step": 36164 + }, + { + "epoch": 1.6837535209628234, + "grad_norm": 0.3541939206472093, + "learning_rate": 4.803396423645259e-05, + "loss": 2.7669, + "step": 36165 + }, + { + "epoch": 1.6838000791489165, + "grad_norm": 0.32398354316328515, + "learning_rate": 4.803125759761053e-05, + "loss": 2.6903, + "step": 36166 + }, + { + "epoch": 1.6838466373350094, + "grad_norm": 0.3437461082628572, + "learning_rate": 4.8028550964546564e-05, + "loss": 2.698, + "step": 36167 + }, + { + "epoch": 1.6838931955211025, + "grad_norm": 0.366351181800141, + "learning_rate": 4.802584433726858e-05, + "loss": 2.7662, + "step": 36168 + }, + { + "epoch": 1.6839397537071956, + "grad_norm": 0.3274986667013939, + "learning_rate": 4.8023137715784574e-05, + "loss": 2.761, + "step": 36169 + }, + { + "epoch": 1.6839863118932885, + "grad_norm": 0.36243658468980056, + "learning_rate": 4.8020431100102466e-05, + "loss": 2.7393, + "step": 36170 + }, + { + "epoch": 1.6840328700793816, + "grad_norm": 0.3303750026088441, + "learning_rate": 4.801772449023018e-05, + "loss": 2.645, + "step": 36171 + }, + { + "epoch": 1.6840794282654747, + "grad_norm": 0.3320270036520758, + "learning_rate": 4.801501788617568e-05, + "loss": 2.7566, + "step": 36172 + }, + { + "epoch": 1.6841259864515679, + "grad_norm": 0.3274932325348705, + "learning_rate": 4.801231128794691e-05, + "loss": 2.6239, + "step": 36173 + }, + { + "epoch": 1.684172544637661, + "grad_norm": 0.36980533128491705, + "learning_rate": 4.8009604695551824e-05, + "loss": 2.7279, + "step": 36174 + }, + { + "epoch": 1.684219102823754, + "grad_norm": 0.3314265002703123, + "learning_rate": 4.8006898108998345e-05, + "loss": 2.6047, + "step": 36175 + }, + { + "epoch": 1.6842656610098472, + "grad_norm": 0.3326170573736657, + "learning_rate": 4.800419152829441e-05, + "loss": 2.7223, + "step": 36176 + }, + { + "epoch": 1.68431221919594, + "grad_norm": 0.3753637099258873, + "learning_rate": 4.800148495344799e-05, + "loss": 2.7176, + "step": 36177 + }, + { + "epoch": 1.6843587773820332, + "grad_norm": 0.33306485602873825, + "learning_rate": 4.799877838446699e-05, + "loss": 2.639, + "step": 36178 + }, + { + "epoch": 1.684405335568126, + "grad_norm": 0.35734705791906335, + "learning_rate": 4.7996071821359406e-05, + "loss": 2.7623, + "step": 36179 + }, + { + "epoch": 1.6844518937542192, + "grad_norm": 0.35182569098692135, + "learning_rate": 4.7993365264133155e-05, + "loss": 2.7692, + "step": 36180 + }, + { + "epoch": 
1.6844984519403123, + "grad_norm": 0.3681263349841914, + "learning_rate": 4.7990658712796155e-05, + "loss": 2.846, + "step": 36181 + }, + { + "epoch": 1.6845450101264055, + "grad_norm": 0.37369045756711555, + "learning_rate": 4.79879521673564e-05, + "loss": 2.7089, + "step": 36182 + }, + { + "epoch": 1.6845915683124986, + "grad_norm": 0.32197179309296253, + "learning_rate": 4.79852456278218e-05, + "loss": 2.7954, + "step": 36183 + }, + { + "epoch": 1.6846381264985917, + "grad_norm": 0.35318307610186217, + "learning_rate": 4.79825390942003e-05, + "loss": 2.7277, + "step": 36184 + }, + { + "epoch": 1.6846846846846848, + "grad_norm": 0.3251039981384805, + "learning_rate": 4.797983256649985e-05, + "loss": 2.6926, + "step": 36185 + }, + { + "epoch": 1.684731242870778, + "grad_norm": 0.3425873477877032, + "learning_rate": 4.797712604472839e-05, + "loss": 2.6734, + "step": 36186 + }, + { + "epoch": 1.6847778010568708, + "grad_norm": 0.3387416308579504, + "learning_rate": 4.7974419528893875e-05, + "loss": 2.6961, + "step": 36187 + }, + { + "epoch": 1.684824359242964, + "grad_norm": 0.3507035481749156, + "learning_rate": 4.797171301900424e-05, + "loss": 2.7298, + "step": 36188 + }, + { + "epoch": 1.6848709174290568, + "grad_norm": 0.3108503024215524, + "learning_rate": 4.7969006515067404e-05, + "loss": 2.7204, + "step": 36189 + }, + { + "epoch": 1.68491747561515, + "grad_norm": 0.32640774812611617, + "learning_rate": 4.796630001709136e-05, + "loss": 2.6794, + "step": 36190 + }, + { + "epoch": 1.684964033801243, + "grad_norm": 0.333426489789492, + "learning_rate": 4.7963593525084e-05, + "loss": 2.7839, + "step": 36191 + }, + { + "epoch": 1.6850105919873362, + "grad_norm": 0.34405900221261493, + "learning_rate": 4.7960887039053316e-05, + "loss": 2.6321, + "step": 36192 + }, + { + "epoch": 1.6850571501734293, + "grad_norm": 0.34852744440951605, + "learning_rate": 4.795818055900722e-05, + "loss": 2.8546, + "step": 36193 + }, + { + "epoch": 1.6851037083595224, + "grad_norm": 0.33037563132923253, + "learning_rate": 4.795547408495366e-05, + "loss": 2.6905, + "step": 36194 + }, + { + "epoch": 1.6851502665456155, + "grad_norm": 0.32628372477116174, + "learning_rate": 4.795276761690058e-05, + "loss": 2.6982, + "step": 36195 + }, + { + "epoch": 1.6851968247317084, + "grad_norm": 0.3350778233112773, + "learning_rate": 4.795006115485593e-05, + "loss": 2.6238, + "step": 36196 + }, + { + "epoch": 1.6852433829178015, + "grad_norm": 0.32203948704296836, + "learning_rate": 4.794735469882766e-05, + "loss": 2.6906, + "step": 36197 + }, + { + "epoch": 1.6852899411038946, + "grad_norm": 0.3279059501141478, + "learning_rate": 4.794464824882369e-05, + "loss": 2.7803, + "step": 36198 + }, + { + "epoch": 1.6853364992899875, + "grad_norm": 0.34609716083999237, + "learning_rate": 4.794194180485197e-05, + "loss": 2.7402, + "step": 36199 + }, + { + "epoch": 1.6853830574760806, + "grad_norm": 0.33520555960030757, + "learning_rate": 4.793923536692046e-05, + "loss": 2.6779, + "step": 36200 + }, + { + "epoch": 1.6854296156621738, + "grad_norm": 0.30825899842683574, + "learning_rate": 4.793652893503709e-05, + "loss": 2.7352, + "step": 36201 + }, + { + "epoch": 1.6854761738482669, + "grad_norm": 0.33529257752401403, + "learning_rate": 4.7933822509209786e-05, + "loss": 2.717, + "step": 36202 + }, + { + "epoch": 1.68552273203436, + "grad_norm": 0.35418158997714627, + "learning_rate": 4.7931116089446526e-05, + "loss": 2.6384, + "step": 36203 + }, + { + "epoch": 1.685569290220453, + "grad_norm": 0.32950421269507324, + "learning_rate": 
4.792840967575522e-05, + "loss": 2.6164, + "step": 36204 + }, + { + "epoch": 1.6856158484065462, + "grad_norm": 0.34952958886026175, + "learning_rate": 4.792570326814385e-05, + "loss": 2.7421, + "step": 36205 + }, + { + "epoch": 1.685662406592639, + "grad_norm": 0.34035415899217936, + "learning_rate": 4.792299686662034e-05, + "loss": 2.5736, + "step": 36206 + }, + { + "epoch": 1.6857089647787322, + "grad_norm": 0.32995626436337117, + "learning_rate": 4.792029047119259e-05, + "loss": 2.7378, + "step": 36207 + }, + { + "epoch": 1.6857555229648253, + "grad_norm": 0.3738534618580107, + "learning_rate": 4.791758408186862e-05, + "loss": 2.6577, + "step": 36208 + }, + { + "epoch": 1.6858020811509182, + "grad_norm": 0.3345338718354756, + "learning_rate": 4.791487769865632e-05, + "loss": 2.6571, + "step": 36209 + }, + { + "epoch": 1.6858486393370113, + "grad_norm": 0.36288602444597085, + "learning_rate": 4.791217132156366e-05, + "loss": 2.679, + "step": 36210 + }, + { + "epoch": 1.6858951975231045, + "grad_norm": 0.35927228154260726, + "learning_rate": 4.790946495059856e-05, + "loss": 2.72, + "step": 36211 + }, + { + "epoch": 1.6859417557091976, + "grad_norm": 0.3247093118123271, + "learning_rate": 4.790675858576898e-05, + "loss": 2.7611, + "step": 36212 + }, + { + "epoch": 1.6859883138952907, + "grad_norm": 0.38454373557374893, + "learning_rate": 4.790405222708286e-05, + "loss": 2.6885, + "step": 36213 + }, + { + "epoch": 1.6860348720813838, + "grad_norm": 0.33051120730712613, + "learning_rate": 4.790134587454812e-05, + "loss": 2.8137, + "step": 36214 + }, + { + "epoch": 1.686081430267477, + "grad_norm": 0.38452680043934995, + "learning_rate": 4.789863952817275e-05, + "loss": 2.7964, + "step": 36215 + }, + { + "epoch": 1.6861279884535698, + "grad_norm": 0.3564679284407259, + "learning_rate": 4.789593318796466e-05, + "loss": 2.6608, + "step": 36216 + }, + { + "epoch": 1.686174546639663, + "grad_norm": 0.3530840421839587, + "learning_rate": 4.789322685393178e-05, + "loss": 2.7475, + "step": 36217 + }, + { + "epoch": 1.6862211048257558, + "grad_norm": 0.3665513089936766, + "learning_rate": 4.7890520526082094e-05, + "loss": 2.6459, + "step": 36218 + }, + { + "epoch": 1.686267663011849, + "grad_norm": 0.3917359858781599, + "learning_rate": 4.7887814204423525e-05, + "loss": 2.6706, + "step": 36219 + }, + { + "epoch": 1.686314221197942, + "grad_norm": 0.36952868117366294, + "learning_rate": 4.7885107888963984e-05, + "loss": 2.6989, + "step": 36220 + }, + { + "epoch": 1.6863607793840352, + "grad_norm": 0.3941376463042639, + "learning_rate": 4.7882401579711474e-05, + "loss": 2.6379, + "step": 36221 + }, + { + "epoch": 1.6864073375701283, + "grad_norm": 0.36626518712035144, + "learning_rate": 4.787969527667389e-05, + "loss": 2.6339, + "step": 36222 + }, + { + "epoch": 1.6864538957562214, + "grad_norm": 0.3559682549128625, + "learning_rate": 4.78769889798592e-05, + "loss": 2.7233, + "step": 36223 + }, + { + "epoch": 1.6865004539423145, + "grad_norm": 0.4108378924900862, + "learning_rate": 4.787428268927533e-05, + "loss": 2.6938, + "step": 36224 + }, + { + "epoch": 1.6865470121284076, + "grad_norm": 0.33718907938155374, + "learning_rate": 4.787157640493024e-05, + "loss": 2.7169, + "step": 36225 + }, + { + "epoch": 1.6865935703145005, + "grad_norm": 0.4191390149480946, + "learning_rate": 4.7868870126831864e-05, + "loss": 2.7231, + "step": 36226 + }, + { + "epoch": 1.6866401285005936, + "grad_norm": 0.33002763474079577, + "learning_rate": 4.7866163854988116e-05, + "loss": 2.7611, + "step": 36227 + }, + { + 
"epoch": 1.6866866866866865, + "grad_norm": 0.3957010230238921, + "learning_rate": 4.7863457589407005e-05, + "loss": 2.6969, + "step": 36228 + }, + { + "epoch": 1.6867332448727796, + "grad_norm": 0.3776566873823977, + "learning_rate": 4.786075133009642e-05, + "loss": 2.7578, + "step": 36229 + }, + { + "epoch": 1.6867798030588728, + "grad_norm": 0.4022816862207817, + "learning_rate": 4.785804507706431e-05, + "loss": 2.6771, + "step": 36230 + }, + { + "epoch": 1.6868263612449659, + "grad_norm": 0.33265545894768395, + "learning_rate": 4.785533883031864e-05, + "loss": 2.699, + "step": 36231 + }, + { + "epoch": 1.686872919431059, + "grad_norm": 0.36027674604267806, + "learning_rate": 4.785263258986732e-05, + "loss": 2.6275, + "step": 36232 + }, + { + "epoch": 1.686919477617152, + "grad_norm": 0.3656048395027552, + "learning_rate": 4.784992635571833e-05, + "loss": 2.6473, + "step": 36233 + }, + { + "epoch": 1.6869660358032452, + "grad_norm": 0.33386505803465716, + "learning_rate": 4.7847220127879606e-05, + "loss": 2.7324, + "step": 36234 + }, + { + "epoch": 1.6870125939893383, + "grad_norm": 0.3268647345714013, + "learning_rate": 4.7844513906359055e-05, + "loss": 2.5765, + "step": 36235 + }, + { + "epoch": 1.6870591521754312, + "grad_norm": 0.32292966991754896, + "learning_rate": 4.784180769116466e-05, + "loss": 2.8225, + "step": 36236 + }, + { + "epoch": 1.6871057103615243, + "grad_norm": 0.35703019409800296, + "learning_rate": 4.783910148230434e-05, + "loss": 2.699, + "step": 36237 + }, + { + "epoch": 1.6871522685476172, + "grad_norm": 0.3390909989647843, + "learning_rate": 4.783639527978604e-05, + "loss": 2.8236, + "step": 36238 + }, + { + "epoch": 1.6871988267337104, + "grad_norm": 0.33420140913950286, + "learning_rate": 4.7833689083617716e-05, + "loss": 2.6757, + "step": 36239 + }, + { + "epoch": 1.6872453849198035, + "grad_norm": 0.38194554856525886, + "learning_rate": 4.783098289380728e-05, + "loss": 2.81, + "step": 36240 + }, + { + "epoch": 1.6872919431058966, + "grad_norm": 0.30428240092771897, + "learning_rate": 4.7828276710362716e-05, + "loss": 2.7817, + "step": 36241 + }, + { + "epoch": 1.6873385012919897, + "grad_norm": 0.3651721242580357, + "learning_rate": 4.782557053329195e-05, + "loss": 2.6793, + "step": 36242 + }, + { + "epoch": 1.6873850594780828, + "grad_norm": 0.32891125903947716, + "learning_rate": 4.78228643626029e-05, + "loss": 2.7464, + "step": 36243 + }, + { + "epoch": 1.687431617664176, + "grad_norm": 0.32148040385702675, + "learning_rate": 4.7820158198303544e-05, + "loss": 2.7672, + "step": 36244 + }, + { + "epoch": 1.6874781758502688, + "grad_norm": 0.32951220417958804, + "learning_rate": 4.781745204040179e-05, + "loss": 2.6389, + "step": 36245 + }, + { + "epoch": 1.687524734036362, + "grad_norm": 0.34247825005167304, + "learning_rate": 4.781474588890562e-05, + "loss": 2.676, + "step": 36246 + }, + { + "epoch": 1.687571292222455, + "grad_norm": 0.3314896694425466, + "learning_rate": 4.7812039743822954e-05, + "loss": 2.7071, + "step": 36247 + }, + { + "epoch": 1.687617850408548, + "grad_norm": 0.33362689824171504, + "learning_rate": 4.7809333605161734e-05, + "loss": 2.7522, + "step": 36248 + }, + { + "epoch": 1.687664408594641, + "grad_norm": 0.35428927793141246, + "learning_rate": 4.780662747292991e-05, + "loss": 2.6433, + "step": 36249 + }, + { + "epoch": 1.6877109667807342, + "grad_norm": 0.32204502877102054, + "learning_rate": 4.780392134713539e-05, + "loss": 2.7436, + "step": 36250 + }, + { + "epoch": 1.6877575249668273, + "grad_norm": 0.37564231180713487, + 
"learning_rate": 4.7801215227786174e-05, + "loss": 2.8079, + "step": 36251 + }, + { + "epoch": 1.6878040831529204, + "grad_norm": 0.3381364909667293, + "learning_rate": 4.779850911489017e-05, + "loss": 2.8191, + "step": 36252 + }, + { + "epoch": 1.6878506413390135, + "grad_norm": 0.3541848160304308, + "learning_rate": 4.77958030084553e-05, + "loss": 2.6994, + "step": 36253 + }, + { + "epoch": 1.6878971995251066, + "grad_norm": 0.36494726659666804, + "learning_rate": 4.779309690848956e-05, + "loss": 2.7459, + "step": 36254 + }, + { + "epoch": 1.6879437577111995, + "grad_norm": 0.3551312521856877, + "learning_rate": 4.779039081500086e-05, + "loss": 2.7014, + "step": 36255 + }, + { + "epoch": 1.6879903158972926, + "grad_norm": 0.3314552907340034, + "learning_rate": 4.778768472799713e-05, + "loss": 2.6621, + "step": 36256 + }, + { + "epoch": 1.6880368740833858, + "grad_norm": 0.3225745044612284, + "learning_rate": 4.778497864748634e-05, + "loss": 2.6455, + "step": 36257 + }, + { + "epoch": 1.6880834322694787, + "grad_norm": 0.3389771521393575, + "learning_rate": 4.77822725734764e-05, + "loss": 2.676, + "step": 36258 + }, + { + "epoch": 1.6881299904555718, + "grad_norm": 0.33368798926883775, + "learning_rate": 4.77795665059753e-05, + "loss": 2.6875, + "step": 36259 + }, + { + "epoch": 1.6881765486416649, + "grad_norm": 0.3429316202645428, + "learning_rate": 4.7776860444990945e-05, + "loss": 2.6834, + "step": 36260 + }, + { + "epoch": 1.688223106827758, + "grad_norm": 0.33230840518172894, + "learning_rate": 4.777415439053128e-05, + "loss": 2.7046, + "step": 36261 + }, + { + "epoch": 1.6882696650138511, + "grad_norm": 0.34936198392726586, + "learning_rate": 4.777144834260426e-05, + "loss": 2.7848, + "step": 36262 + }, + { + "epoch": 1.6883162231999442, + "grad_norm": 0.34898356463871466, + "learning_rate": 4.77687423012178e-05, + "loss": 2.6504, + "step": 36263 + }, + { + "epoch": 1.6883627813860373, + "grad_norm": 0.34536873070417373, + "learning_rate": 4.776603626637988e-05, + "loss": 2.7889, + "step": 36264 + }, + { + "epoch": 1.6884093395721302, + "grad_norm": 0.35077623197452373, + "learning_rate": 4.7763330238098426e-05, + "loss": 2.762, + "step": 36265 + }, + { + "epoch": 1.6884558977582234, + "grad_norm": 0.32875992490273764, + "learning_rate": 4.776062421638135e-05, + "loss": 2.7086, + "step": 36266 + }, + { + "epoch": 1.6885024559443162, + "grad_norm": 0.3337095525952645, + "learning_rate": 4.775791820123666e-05, + "loss": 2.6622, + "step": 36267 + }, + { + "epoch": 1.6885490141304094, + "grad_norm": 0.3411314851583906, + "learning_rate": 4.775521219267222e-05, + "loss": 2.6248, + "step": 36268 + }, + { + "epoch": 1.6885955723165025, + "grad_norm": 0.33615857849023956, + "learning_rate": 4.775250619069604e-05, + "loss": 2.7876, + "step": 36269 + }, + { + "epoch": 1.6886421305025956, + "grad_norm": 0.3285787996584527, + "learning_rate": 4.774980019531603e-05, + "loss": 2.6811, + "step": 36270 + }, + { + "epoch": 1.6886886886886887, + "grad_norm": 0.3692396881847222, + "learning_rate": 4.774709420654011e-05, + "loss": 2.8064, + "step": 36271 + }, + { + "epoch": 1.6887352468747818, + "grad_norm": 0.3125671419693838, + "learning_rate": 4.774438822437628e-05, + "loss": 2.6604, + "step": 36272 + }, + { + "epoch": 1.688781805060875, + "grad_norm": 0.34415954158359385, + "learning_rate": 4.7741682248832443e-05, + "loss": 2.7448, + "step": 36273 + }, + { + "epoch": 1.688828363246968, + "grad_norm": 0.313619569969994, + "learning_rate": 4.773897627991653e-05, + "loss": 2.6546, + "step": 36274 + 
}, + { + "epoch": 1.688874921433061, + "grad_norm": 0.34681716391412676, + "learning_rate": 4.773627031763652e-05, + "loss": 2.5932, + "step": 36275 + }, + { + "epoch": 1.688921479619154, + "grad_norm": 0.3501935954767549, + "learning_rate": 4.77335643620003e-05, + "loss": 2.7308, + "step": 36276 + }, + { + "epoch": 1.688968037805247, + "grad_norm": 0.34486362493645306, + "learning_rate": 4.773085841301587e-05, + "loss": 2.7242, + "step": 36277 + }, + { + "epoch": 1.68901459599134, + "grad_norm": 0.36241033171667403, + "learning_rate": 4.772815247069115e-05, + "loss": 2.6217, + "step": 36278 + }, + { + "epoch": 1.6890611541774332, + "grad_norm": 0.33454756505743594, + "learning_rate": 4.7725446535034055e-05, + "loss": 2.6697, + "step": 36279 + }, + { + "epoch": 1.6891077123635263, + "grad_norm": 0.3897488735567906, + "learning_rate": 4.7722740606052574e-05, + "loss": 2.7421, + "step": 36280 + }, + { + "epoch": 1.6891542705496194, + "grad_norm": 0.3673270302129093, + "learning_rate": 4.7720034683754604e-05, + "loss": 2.7666, + "step": 36281 + }, + { + "epoch": 1.6892008287357125, + "grad_norm": 0.35212237056086276, + "learning_rate": 4.7717328768148126e-05, + "loss": 2.7215, + "step": 36282 + }, + { + "epoch": 1.6892473869218056, + "grad_norm": 0.3410053574538082, + "learning_rate": 4.771462285924107e-05, + "loss": 2.6996, + "step": 36283 + }, + { + "epoch": 1.6892939451078985, + "grad_norm": 0.3494587993862536, + "learning_rate": 4.771191695704134e-05, + "loss": 2.8173, + "step": 36284 + }, + { + "epoch": 1.6893405032939917, + "grad_norm": 0.3139453756530462, + "learning_rate": 4.770921106155694e-05, + "loss": 2.6783, + "step": 36285 + }, + { + "epoch": 1.6893870614800848, + "grad_norm": 0.3748251637275564, + "learning_rate": 4.770650517279576e-05, + "loss": 2.7254, + "step": 36286 + }, + { + "epoch": 1.6894336196661777, + "grad_norm": 0.33216903299997647, + "learning_rate": 4.7703799290765777e-05, + "loss": 2.768, + "step": 36287 + }, + { + "epoch": 1.6894801778522708, + "grad_norm": 0.3429766579292048, + "learning_rate": 4.770109341547491e-05, + "loss": 2.6918, + "step": 36288 + }, + { + "epoch": 1.689526736038364, + "grad_norm": 0.37907040230013006, + "learning_rate": 4.7698387546931093e-05, + "loss": 2.7192, + "step": 36289 + }, + { + "epoch": 1.689573294224457, + "grad_norm": 0.361231879096363, + "learning_rate": 4.7695681685142304e-05, + "loss": 2.6319, + "step": 36290 + }, + { + "epoch": 1.6896198524105501, + "grad_norm": 0.3167778505196357, + "learning_rate": 4.7692975830116456e-05, + "loss": 2.7007, + "step": 36291 + }, + { + "epoch": 1.6896664105966432, + "grad_norm": 0.3812011627727669, + "learning_rate": 4.769026998186148e-05, + "loss": 2.7448, + "step": 36292 + }, + { + "epoch": 1.6897129687827364, + "grad_norm": 0.34797177571769805, + "learning_rate": 4.768756414038535e-05, + "loss": 2.6889, + "step": 36293 + }, + { + "epoch": 1.6897595269688293, + "grad_norm": 0.31999030593806593, + "learning_rate": 4.768485830569598e-05, + "loss": 2.7023, + "step": 36294 + }, + { + "epoch": 1.6898060851549224, + "grad_norm": 0.33562623452319945, + "learning_rate": 4.768215247780134e-05, + "loss": 2.733, + "step": 36295 + }, + { + "epoch": 1.6898526433410155, + "grad_norm": 0.322381161781356, + "learning_rate": 4.767944665670935e-05, + "loss": 2.6901, + "step": 36296 + }, + { + "epoch": 1.6898992015271084, + "grad_norm": 0.32139593625103413, + "learning_rate": 4.7676740842427934e-05, + "loss": 2.6363, + "step": 36297 + }, + { + "epoch": 1.6899457597132015, + "grad_norm": 0.34930211734596694, 
+ "learning_rate": 4.767403503496508e-05, + "loss": 2.6661, + "step": 36298 + }, + { + "epoch": 1.6899923178992946, + "grad_norm": 0.34342830479375946, + "learning_rate": 4.7671329234328695e-05, + "loss": 2.7352, + "step": 36299 + }, + { + "epoch": 1.6900388760853877, + "grad_norm": 0.39287535652174643, + "learning_rate": 4.766862344052674e-05, + "loss": 2.7169, + "step": 36300 + }, + { + "epoch": 1.6900854342714808, + "grad_norm": 0.3413166855057422, + "learning_rate": 4.766591765356714e-05, + "loss": 2.6129, + "step": 36301 + }, + { + "epoch": 1.690131992457574, + "grad_norm": 0.35062020419876266, + "learning_rate": 4.766321187345782e-05, + "loss": 2.7237, + "step": 36302 + }, + { + "epoch": 1.690178550643667, + "grad_norm": 0.3482959033375554, + "learning_rate": 4.766050610020677e-05, + "loss": 2.7125, + "step": 36303 + }, + { + "epoch": 1.69022510882976, + "grad_norm": 0.3127324159722489, + "learning_rate": 4.765780033382188e-05, + "loss": 2.7344, + "step": 36304 + }, + { + "epoch": 1.690271667015853, + "grad_norm": 0.3293249533009291, + "learning_rate": 4.7655094574311134e-05, + "loss": 2.7101, + "step": 36305 + }, + { + "epoch": 1.690318225201946, + "grad_norm": 0.36686480302383206, + "learning_rate": 4.7652388821682456e-05, + "loss": 2.7483, + "step": 36306 + }, + { + "epoch": 1.690364783388039, + "grad_norm": 0.3697922400827674, + "learning_rate": 4.764968307594377e-05, + "loss": 2.7471, + "step": 36307 + }, + { + "epoch": 1.6904113415741322, + "grad_norm": 0.32308681456045596, + "learning_rate": 4.764697733710305e-05, + "loss": 2.7264, + "step": 36308 + }, + { + "epoch": 1.6904578997602253, + "grad_norm": 0.3115673061754206, + "learning_rate": 4.764427160516822e-05, + "loss": 2.5294, + "step": 36309 + }, + { + "epoch": 1.6905044579463184, + "grad_norm": 0.36752781746181457, + "learning_rate": 4.764156588014719e-05, + "loss": 2.7274, + "step": 36310 + }, + { + "epoch": 1.6905510161324115, + "grad_norm": 0.3341467566930523, + "learning_rate": 4.7638860162047966e-05, + "loss": 2.764, + "step": 36311 + }, + { + "epoch": 1.6905975743185047, + "grad_norm": 0.3646112173203075, + "learning_rate": 4.763615445087844e-05, + "loss": 2.6844, + "step": 36312 + }, + { + "epoch": 1.6906441325045978, + "grad_norm": 0.3450654515412319, + "learning_rate": 4.763344874664658e-05, + "loss": 2.6208, + "step": 36313 + }, + { + "epoch": 1.6906906906906907, + "grad_norm": 0.35419332611427734, + "learning_rate": 4.763074304936031e-05, + "loss": 2.7561, + "step": 36314 + }, + { + "epoch": 1.6907372488767838, + "grad_norm": 0.3744851512394888, + "learning_rate": 4.762803735902756e-05, + "loss": 2.7744, + "step": 36315 + }, + { + "epoch": 1.6907838070628767, + "grad_norm": 0.3448250851291114, + "learning_rate": 4.7625331675656306e-05, + "loss": 2.5915, + "step": 36316 + }, + { + "epoch": 1.6908303652489698, + "grad_norm": 0.36398693621901185, + "learning_rate": 4.7622625999254447e-05, + "loss": 2.7471, + "step": 36317 + }, + { + "epoch": 1.690876923435063, + "grad_norm": 0.3779826458814435, + "learning_rate": 4.7619920329829963e-05, + "loss": 2.6279, + "step": 36318 + }, + { + "epoch": 1.690923481621156, + "grad_norm": 0.3433451777388754, + "learning_rate": 4.7617214667390785e-05, + "loss": 2.7383, + "step": 36319 + }, + { + "epoch": 1.6909700398072491, + "grad_norm": 0.3780051247479559, + "learning_rate": 4.7614509011944826e-05, + "loss": 2.7499, + "step": 36320 + }, + { + "epoch": 1.6910165979933423, + "grad_norm": 0.33367474067518577, + "learning_rate": 4.761180336350007e-05, + "loss": 2.803, + "step": 
36321 + }, + { + "epoch": 1.6910631561794354, + "grad_norm": 0.3562416822539732, + "learning_rate": 4.760909772206442e-05, + "loss": 2.6748, + "step": 36322 + }, + { + "epoch": 1.6911097143655285, + "grad_norm": 0.3326125444115185, + "learning_rate": 4.760639208764584e-05, + "loss": 2.6757, + "step": 36323 + }, + { + "epoch": 1.6911562725516214, + "grad_norm": 0.36356028865417844, + "learning_rate": 4.760368646025227e-05, + "loss": 2.6803, + "step": 36324 + }, + { + "epoch": 1.6912028307377145, + "grad_norm": 0.33752292028348235, + "learning_rate": 4.760098083989163e-05, + "loss": 2.6013, + "step": 36325 + }, + { + "epoch": 1.6912493889238074, + "grad_norm": 0.328943746171898, + "learning_rate": 4.7598275226571894e-05, + "loss": 2.6522, + "step": 36326 + }, + { + "epoch": 1.6912959471099005, + "grad_norm": 0.36758081646985385, + "learning_rate": 4.759556962030098e-05, + "loss": 2.8527, + "step": 36327 + }, + { + "epoch": 1.6913425052959936, + "grad_norm": 0.350122979953301, + "learning_rate": 4.75928640210868e-05, + "loss": 2.7326, + "step": 36328 + }, + { + "epoch": 1.6913890634820867, + "grad_norm": 0.33849670904794993, + "learning_rate": 4.7590158428937365e-05, + "loss": 2.6462, + "step": 36329 + }, + { + "epoch": 1.6914356216681798, + "grad_norm": 0.3871020765573645, + "learning_rate": 4.7587452843860545e-05, + "loss": 2.6794, + "step": 36330 + }, + { + "epoch": 1.691482179854273, + "grad_norm": 0.31984853904269217, + "learning_rate": 4.758474726586434e-05, + "loss": 2.724, + "step": 36331 + }, + { + "epoch": 1.691528738040366, + "grad_norm": 0.3863829997556555, + "learning_rate": 4.7582041694956664e-05, + "loss": 2.5981, + "step": 36332 + }, + { + "epoch": 1.691575296226459, + "grad_norm": 0.36616430088389357, + "learning_rate": 4.757933613114543e-05, + "loss": 2.7167, + "step": 36333 + }, + { + "epoch": 1.691621854412552, + "grad_norm": 0.3999606695754717, + "learning_rate": 4.757663057443863e-05, + "loss": 2.7224, + "step": 36334 + }, + { + "epoch": 1.6916684125986452, + "grad_norm": 0.3868897669171608, + "learning_rate": 4.757392502484418e-05, + "loss": 2.7536, + "step": 36335 + }, + { + "epoch": 1.691714970784738, + "grad_norm": 0.34525513176168315, + "learning_rate": 4.757121948237001e-05, + "loss": 2.6437, + "step": 36336 + }, + { + "epoch": 1.6917615289708312, + "grad_norm": 0.3453894398544558, + "learning_rate": 4.756851394702409e-05, + "loss": 2.7644, + "step": 36337 + }, + { + "epoch": 1.6918080871569243, + "grad_norm": 0.39778227358282225, + "learning_rate": 4.756580841881432e-05, + "loss": 2.7478, + "step": 36338 + }, + { + "epoch": 1.6918546453430174, + "grad_norm": 0.3213698538994705, + "learning_rate": 4.7563102897748677e-05, + "loss": 2.7428, + "step": 36339 + }, + { + "epoch": 1.6919012035291106, + "grad_norm": 0.36735227364658407, + "learning_rate": 4.756039738383507e-05, + "loss": 2.683, + "step": 36340 + }, + { + "epoch": 1.6919477617152037, + "grad_norm": 0.34619222706428526, + "learning_rate": 4.755769187708148e-05, + "loss": 2.829, + "step": 36341 + }, + { + "epoch": 1.6919943199012968, + "grad_norm": 0.36189469100558197, + "learning_rate": 4.755498637749582e-05, + "loss": 2.7543, + "step": 36342 + }, + { + "epoch": 1.6920408780873897, + "grad_norm": 0.3418474997189474, + "learning_rate": 4.755228088508601e-05, + "loss": 2.7237, + "step": 36343 + }, + { + "epoch": 1.6920874362734828, + "grad_norm": 0.3519950307272664, + "learning_rate": 4.754957539986004e-05, + "loss": 2.8131, + "step": 36344 + }, + { + "epoch": 1.692133994459576, + "grad_norm": 
0.3566243369632754, + "learning_rate": 4.754686992182582e-05, + "loss": 2.6607, + "step": 36345 + }, + { + "epoch": 1.6921805526456688, + "grad_norm": 0.34430968863261757, + "learning_rate": 4.7544164450991274e-05, + "loss": 2.7426, + "step": 36346 + }, + { + "epoch": 1.692227110831762, + "grad_norm": 0.3606674022051219, + "learning_rate": 4.754145898736439e-05, + "loss": 2.8549, + "step": 36347 + }, + { + "epoch": 1.692273669017855, + "grad_norm": 0.33786773692366273, + "learning_rate": 4.7538753530953066e-05, + "loss": 2.6146, + "step": 36348 + }, + { + "epoch": 1.6923202272039481, + "grad_norm": 0.36042884110267476, + "learning_rate": 4.753604808176526e-05, + "loss": 2.6483, + "step": 36349 + }, + { + "epoch": 1.6923667853900413, + "grad_norm": 0.3521435993569975, + "learning_rate": 4.753334263980893e-05, + "loss": 2.6411, + "step": 36350 + }, + { + "epoch": 1.6924133435761344, + "grad_norm": 0.36475577808721044, + "learning_rate": 4.753063720509197e-05, + "loss": 2.7165, + "step": 36351 + }, + { + "epoch": 1.6924599017622275, + "grad_norm": 0.3494845132457029, + "learning_rate": 4.7527931777622366e-05, + "loss": 2.7096, + "step": 36352 + }, + { + "epoch": 1.6925064599483204, + "grad_norm": 0.3485595745341235, + "learning_rate": 4.752522635740801e-05, + "loss": 2.6081, + "step": 36353 + }, + { + "epoch": 1.6925530181344135, + "grad_norm": 0.3558926816429133, + "learning_rate": 4.75225209444569e-05, + "loss": 2.7865, + "step": 36354 + }, + { + "epoch": 1.6925995763205064, + "grad_norm": 0.36947784858565835, + "learning_rate": 4.751981553877695e-05, + "loss": 2.6604, + "step": 36355 + }, + { + "epoch": 1.6926461345065995, + "grad_norm": 0.33126936678142255, + "learning_rate": 4.751711014037607e-05, + "loss": 2.7247, + "step": 36356 + }, + { + "epoch": 1.6926926926926926, + "grad_norm": 0.3349986506783644, + "learning_rate": 4.751440474926224e-05, + "loss": 2.741, + "step": 36357 + }, + { + "epoch": 1.6927392508787857, + "grad_norm": 0.3585291350821402, + "learning_rate": 4.751169936544338e-05, + "loss": 2.6933, + "step": 36358 + }, + { + "epoch": 1.6927858090648789, + "grad_norm": 0.3431695136762583, + "learning_rate": 4.7508993988927455e-05, + "loss": 2.8274, + "step": 36359 + }, + { + "epoch": 1.692832367250972, + "grad_norm": 0.34131439810051345, + "learning_rate": 4.750628861972238e-05, + "loss": 2.7647, + "step": 36360 + }, + { + "epoch": 1.692878925437065, + "grad_norm": 0.3269276447797499, + "learning_rate": 4.75035832578361e-05, + "loss": 2.7244, + "step": 36361 + }, + { + "epoch": 1.6929254836231582, + "grad_norm": 0.3205532748878386, + "learning_rate": 4.750087790327655e-05, + "loss": 2.6912, + "step": 36362 + }, + { + "epoch": 1.692972041809251, + "grad_norm": 0.3435407034522621, + "learning_rate": 4.7498172556051694e-05, + "loss": 2.5995, + "step": 36363 + }, + { + "epoch": 1.6930185999953442, + "grad_norm": 0.3309595090005424, + "learning_rate": 4.7495467216169445e-05, + "loss": 2.7648, + "step": 36364 + }, + { + "epoch": 1.693065158181437, + "grad_norm": 0.33758154294352477, + "learning_rate": 4.749276188363776e-05, + "loss": 2.7096, + "step": 36365 + }, + { + "epoch": 1.6931117163675302, + "grad_norm": 0.3173026090193787, + "learning_rate": 4.749005655846455e-05, + "loss": 2.6676, + "step": 36366 + }, + { + "epoch": 1.6931582745536233, + "grad_norm": 0.32280177010183586, + "learning_rate": 4.74873512406578e-05, + "loss": 2.6708, + "step": 36367 + }, + { + "epoch": 1.6932048327397164, + "grad_norm": 0.3281706864667505, + "learning_rate": 4.7484645930225426e-05, + "loss": 
2.5938, + "step": 36368 + }, + { + "epoch": 1.6932513909258096, + "grad_norm": 0.37077093622565355, + "learning_rate": 4.748194062717535e-05, + "loss": 2.6693, + "step": 36369 + }, + { + "epoch": 1.6932979491119027, + "grad_norm": 0.3573972148144116, + "learning_rate": 4.747923533151555e-05, + "loss": 2.7736, + "step": 36370 + }, + { + "epoch": 1.6933445072979958, + "grad_norm": 0.32616976559663885, + "learning_rate": 4.747653004325392e-05, + "loss": 2.6435, + "step": 36371 + }, + { + "epoch": 1.6933910654840887, + "grad_norm": 0.32618231889642507, + "learning_rate": 4.747382476239845e-05, + "loss": 2.6604, + "step": 36372 + }, + { + "epoch": 1.6934376236701818, + "grad_norm": 0.3509016024506032, + "learning_rate": 4.747111948895705e-05, + "loss": 2.7245, + "step": 36373 + }, + { + "epoch": 1.693484181856275, + "grad_norm": 0.33418311355000313, + "learning_rate": 4.746841422293767e-05, + "loss": 2.8297, + "step": 36374 + }, + { + "epoch": 1.6935307400423678, + "grad_norm": 0.32655910096763224, + "learning_rate": 4.7465708964348234e-05, + "loss": 2.7663, + "step": 36375 + }, + { + "epoch": 1.693577298228461, + "grad_norm": 0.36641178997283175, + "learning_rate": 4.746300371319669e-05, + "loss": 2.7334, + "step": 36376 + }, + { + "epoch": 1.693623856414554, + "grad_norm": 0.32469689789958495, + "learning_rate": 4.7460298469490994e-05, + "loss": 2.8036, + "step": 36377 + }, + { + "epoch": 1.6936704146006472, + "grad_norm": 0.3525748068002319, + "learning_rate": 4.745759323323907e-05, + "loss": 2.5809, + "step": 36378 + }, + { + "epoch": 1.6937169727867403, + "grad_norm": 0.31021262857616494, + "learning_rate": 4.7454888004448847e-05, + "loss": 2.6962, + "step": 36379 + }, + { + "epoch": 1.6937635309728334, + "grad_norm": 0.3631391449869452, + "learning_rate": 4.745218278312829e-05, + "loss": 2.6039, + "step": 36380 + }, + { + "epoch": 1.6938100891589265, + "grad_norm": 0.35852146770347226, + "learning_rate": 4.7449477569285305e-05, + "loss": 2.6965, + "step": 36381 + }, + { + "epoch": 1.6938566473450194, + "grad_norm": 0.35433669888685837, + "learning_rate": 4.744677236292787e-05, + "loss": 2.6271, + "step": 36382 + }, + { + "epoch": 1.6939032055311125, + "grad_norm": 0.39418399192639236, + "learning_rate": 4.744406716406392e-05, + "loss": 2.7658, + "step": 36383 + }, + { + "epoch": 1.6939497637172056, + "grad_norm": 0.3813358075179397, + "learning_rate": 4.744136197270135e-05, + "loss": 2.6872, + "step": 36384 + }, + { + "epoch": 1.6939963219032985, + "grad_norm": 0.33281311487940596, + "learning_rate": 4.743865678884814e-05, + "loss": 2.6705, + "step": 36385 + }, + { + "epoch": 1.6940428800893916, + "grad_norm": 0.38309251293025326, + "learning_rate": 4.743595161251224e-05, + "loss": 2.7025, + "step": 36386 + }, + { + "epoch": 1.6940894382754847, + "grad_norm": 0.338455073762667, + "learning_rate": 4.743324644370155e-05, + "loss": 2.5915, + "step": 36387 + }, + { + "epoch": 1.6941359964615779, + "grad_norm": 0.3639369568373956, + "learning_rate": 4.743054128242403e-05, + "loss": 2.7508, + "step": 36388 + }, + { + "epoch": 1.694182554647671, + "grad_norm": 0.3542285979539974, + "learning_rate": 4.742783612868762e-05, + "loss": 2.7629, + "step": 36389 + }, + { + "epoch": 1.694229112833764, + "grad_norm": 0.3531349481901291, + "learning_rate": 4.7425130982500265e-05, + "loss": 2.6418, + "step": 36390 + }, + { + "epoch": 1.6942756710198572, + "grad_norm": 0.3890764091324383, + "learning_rate": 4.74224258438699e-05, + "loss": 2.6986, + "step": 36391 + }, + { + "epoch": 1.69432222920595, + 
"grad_norm": 0.33189278197156247, + "learning_rate": 4.741972071280444e-05, + "loss": 2.6605, + "step": 36392 + }, + { + "epoch": 1.6943687873920432, + "grad_norm": 0.38221926246278487, + "learning_rate": 4.741701558931186e-05, + "loss": 2.7062, + "step": 36393 + }, + { + "epoch": 1.6944153455781361, + "grad_norm": 0.3451639151537495, + "learning_rate": 4.741431047340007e-05, + "loss": 2.6968, + "step": 36394 + }, + { + "epoch": 1.6944619037642292, + "grad_norm": 0.33954983252058657, + "learning_rate": 4.741160536507705e-05, + "loss": 2.6777, + "step": 36395 + }, + { + "epoch": 1.6945084619503223, + "grad_norm": 0.3584283448599525, + "learning_rate": 4.74089002643507e-05, + "loss": 2.7252, + "step": 36396 + }, + { + "epoch": 1.6945550201364155, + "grad_norm": 0.3511605312653937, + "learning_rate": 4.7406195171228964e-05, + "loss": 2.7962, + "step": 36397 + }, + { + "epoch": 1.6946015783225086, + "grad_norm": 0.36346440826672977, + "learning_rate": 4.74034900857198e-05, + "loss": 2.7701, + "step": 36398 + }, + { + "epoch": 1.6946481365086017, + "grad_norm": 0.3398798636703613, + "learning_rate": 4.740078500783113e-05, + "loss": 2.6856, + "step": 36399 + }, + { + "epoch": 1.6946946946946948, + "grad_norm": 0.3231190801532838, + "learning_rate": 4.7398079937570915e-05, + "loss": 2.6672, + "step": 36400 + }, + { + "epoch": 1.694741252880788, + "grad_norm": 0.35606683466509553, + "learning_rate": 4.7395374874947066e-05, + "loss": 2.7716, + "step": 36401 + }, + { + "epoch": 1.6947878110668808, + "grad_norm": 0.3424701281378116, + "learning_rate": 4.7392669819967534e-05, + "loss": 2.6254, + "step": 36402 + }, + { + "epoch": 1.694834369252974, + "grad_norm": 0.34340236816161024, + "learning_rate": 4.738996477264027e-05, + "loss": 2.7058, + "step": 36403 + }, + { + "epoch": 1.6948809274390668, + "grad_norm": 0.34530629073701113, + "learning_rate": 4.7387259732973206e-05, + "loss": 2.7142, + "step": 36404 + }, + { + "epoch": 1.69492748562516, + "grad_norm": 0.3366420038519359, + "learning_rate": 4.7384554700974246e-05, + "loss": 2.7393, + "step": 36405 + }, + { + "epoch": 1.694974043811253, + "grad_norm": 0.346934532524178, + "learning_rate": 4.738184967665139e-05, + "loss": 2.7616, + "step": 36406 + }, + { + "epoch": 1.6950206019973462, + "grad_norm": 0.35534337921018766, + "learning_rate": 4.737914466001253e-05, + "loss": 2.7031, + "step": 36407 + }, + { + "epoch": 1.6950671601834393, + "grad_norm": 0.32954241393858613, + "learning_rate": 4.737643965106564e-05, + "loss": 2.6424, + "step": 36408 + }, + { + "epoch": 1.6951137183695324, + "grad_norm": 0.36348341917866467, + "learning_rate": 4.7373734649818635e-05, + "loss": 2.6926, + "step": 36409 + }, + { + "epoch": 1.6951602765556255, + "grad_norm": 0.3167409749289413, + "learning_rate": 4.7371029656279444e-05, + "loss": 2.7775, + "step": 36410 + }, + { + "epoch": 1.6952068347417186, + "grad_norm": 0.3811372378638028, + "learning_rate": 4.736832467045604e-05, + "loss": 2.6669, + "step": 36411 + }, + { + "epoch": 1.6952533929278115, + "grad_norm": 0.3417944298849965, + "learning_rate": 4.736561969235633e-05, + "loss": 2.7836, + "step": 36412 + }, + { + "epoch": 1.6952999511139046, + "grad_norm": 0.36476565304458924, + "learning_rate": 4.736291472198829e-05, + "loss": 2.6697, + "step": 36413 + }, + { + "epoch": 1.6953465092999975, + "grad_norm": 0.34782185996504394, + "learning_rate": 4.736020975935981e-05, + "loss": 2.7909, + "step": 36414 + }, + { + "epoch": 1.6953930674860906, + "grad_norm": 0.34952252637775505, + "learning_rate": 
4.7357504804478865e-05, + "loss": 2.7063, + "step": 36415 + }, + { + "epoch": 1.6954396256721838, + "grad_norm": 0.34520065434778, + "learning_rate": 4.7354799857353386e-05, + "loss": 2.5966, + "step": 36416 + }, + { + "epoch": 1.6954861838582769, + "grad_norm": 0.37073995534020954, + "learning_rate": 4.7352094917991296e-05, + "loss": 2.6787, + "step": 36417 + }, + { + "epoch": 1.69553274204437, + "grad_norm": 0.34371840314787877, + "learning_rate": 4.734938998640056e-05, + "loss": 2.8204, + "step": 36418 + }, + { + "epoch": 1.695579300230463, + "grad_norm": 0.363427374201397, + "learning_rate": 4.734668506258911e-05, + "loss": 2.6838, + "step": 36419 + }, + { + "epoch": 1.6956258584165562, + "grad_norm": 0.3296479202954505, + "learning_rate": 4.734398014656485e-05, + "loss": 2.7996, + "step": 36420 + }, + { + "epoch": 1.6956724166026491, + "grad_norm": 0.3394661628325978, + "learning_rate": 4.734127523833578e-05, + "loss": 2.6306, + "step": 36421 + }, + { + "epoch": 1.6957189747887422, + "grad_norm": 0.36070067477632123, + "learning_rate": 4.7338570337909794e-05, + "loss": 2.6516, + "step": 36422 + }, + { + "epoch": 1.6957655329748353, + "grad_norm": 0.3346277715862563, + "learning_rate": 4.733586544529482e-05, + "loss": 2.6698, + "step": 36423 + }, + { + "epoch": 1.6958120911609282, + "grad_norm": 0.3284190106426461, + "learning_rate": 4.7333160560498845e-05, + "loss": 2.7396, + "step": 36424 + }, + { + "epoch": 1.6958586493470214, + "grad_norm": 0.3720965100714627, + "learning_rate": 4.7330455683529765e-05, + "loss": 2.6397, + "step": 36425 + }, + { + "epoch": 1.6959052075331145, + "grad_norm": 0.3339338343292678, + "learning_rate": 4.732775081439555e-05, + "loss": 2.7343, + "step": 36426 + }, + { + "epoch": 1.6959517657192076, + "grad_norm": 0.3694078204496823, + "learning_rate": 4.732504595310411e-05, + "loss": 2.749, + "step": 36427 + }, + { + "epoch": 1.6959983239053007, + "grad_norm": 0.3574467183306856, + "learning_rate": 4.73223410996634e-05, + "loss": 2.7268, + "step": 36428 + }, + { + "epoch": 1.6960448820913938, + "grad_norm": 0.3734449436868997, + "learning_rate": 4.731963625408136e-05, + "loss": 2.7276, + "step": 36429 + }, + { + "epoch": 1.696091440277487, + "grad_norm": 0.34405309222337327, + "learning_rate": 4.731693141636591e-05, + "loss": 2.6953, + "step": 36430 + }, + { + "epoch": 1.6961379984635798, + "grad_norm": 0.389897898600837, + "learning_rate": 4.7314226586525016e-05, + "loss": 2.7341, + "step": 36431 + }, + { + "epoch": 1.696184556649673, + "grad_norm": 0.3490471526542194, + "learning_rate": 4.7311521764566606e-05, + "loss": 2.7092, + "step": 36432 + }, + { + "epoch": 1.696231114835766, + "grad_norm": 0.34586853003468515, + "learning_rate": 4.730881695049859e-05, + "loss": 2.7874, + "step": 36433 + }, + { + "epoch": 1.696277673021859, + "grad_norm": 0.3756467507235101, + "learning_rate": 4.7306112144328956e-05, + "loss": 2.6539, + "step": 36434 + }, + { + "epoch": 1.696324231207952, + "grad_norm": 0.35780381287314855, + "learning_rate": 4.73034073460656e-05, + "loss": 2.6789, + "step": 36435 + }, + { + "epoch": 1.6963707893940452, + "grad_norm": 0.3388610400943797, + "learning_rate": 4.730070255571649e-05, + "loss": 2.5651, + "step": 36436 + }, + { + "epoch": 1.6964173475801383, + "grad_norm": 0.356477897709764, + "learning_rate": 4.729799777328956e-05, + "loss": 2.651, + "step": 36437 + }, + { + "epoch": 1.6964639057662314, + "grad_norm": 0.3549278966408273, + "learning_rate": 4.729529299879272e-05, + "loss": 2.7829, + "step": 36438 + }, + { + "epoch": 
1.6965104639523245, + "grad_norm": 0.33784381738314884, + "learning_rate": 4.729258823223395e-05, + "loss": 2.7634, + "step": 36439 + }, + { + "epoch": 1.6965570221384176, + "grad_norm": 0.37187808646988296, + "learning_rate": 4.728988347362115e-05, + "loss": 2.7322, + "step": 36440 + }, + { + "epoch": 1.6966035803245105, + "grad_norm": 0.3484626348113358, + "learning_rate": 4.7287178722962275e-05, + "loss": 2.7337, + "step": 36441 + }, + { + "epoch": 1.6966501385106036, + "grad_norm": 0.3235238832701556, + "learning_rate": 4.7284473980265284e-05, + "loss": 2.7404, + "step": 36442 + }, + { + "epoch": 1.6966966966966965, + "grad_norm": 0.37635852056525304, + "learning_rate": 4.728176924553806e-05, + "loss": 2.7604, + "step": 36443 + }, + { + "epoch": 1.6967432548827897, + "grad_norm": 0.3074356400288852, + "learning_rate": 4.72790645187886e-05, + "loss": 2.7821, + "step": 36444 + }, + { + "epoch": 1.6967898130688828, + "grad_norm": 0.33832074531001455, + "learning_rate": 4.727635980002482e-05, + "loss": 2.5998, + "step": 36445 + }, + { + "epoch": 1.6968363712549759, + "grad_norm": 0.325741537470085, + "learning_rate": 4.727365508925463e-05, + "loss": 2.6661, + "step": 36446 + }, + { + "epoch": 1.696882929441069, + "grad_norm": 0.3371274220519982, + "learning_rate": 4.727095038648602e-05, + "loss": 2.764, + "step": 36447 + }, + { + "epoch": 1.6969294876271621, + "grad_norm": 0.37085354838857565, + "learning_rate": 4.726824569172688e-05, + "loss": 2.7008, + "step": 36448 + }, + { + "epoch": 1.6969760458132552, + "grad_norm": 0.34563818518890527, + "learning_rate": 4.726554100498519e-05, + "loss": 2.7278, + "step": 36449 + }, + { + "epoch": 1.6970226039993483, + "grad_norm": 0.36703439132591287, + "learning_rate": 4.726283632626887e-05, + "loss": 2.6785, + "step": 36450 + }, + { + "epoch": 1.6970691621854412, + "grad_norm": 0.35507748308443415, + "learning_rate": 4.726013165558584e-05, + "loss": 2.6965, + "step": 36451 + }, + { + "epoch": 1.6971157203715344, + "grad_norm": 0.36516958090119817, + "learning_rate": 4.725742699294405e-05, + "loss": 2.7323, + "step": 36452 + }, + { + "epoch": 1.6971622785576272, + "grad_norm": 0.36765238533773376, + "learning_rate": 4.725472233835145e-05, + "loss": 2.5726, + "step": 36453 + }, + { + "epoch": 1.6972088367437204, + "grad_norm": 0.36265705261015585, + "learning_rate": 4.725201769181598e-05, + "loss": 2.7108, + "step": 36454 + }, + { + "epoch": 1.6972553949298135, + "grad_norm": 0.34862439909786197, + "learning_rate": 4.724931305334557e-05, + "loss": 2.6864, + "step": 36455 + }, + { + "epoch": 1.6973019531159066, + "grad_norm": 0.37999451846006344, + "learning_rate": 4.7246608422948126e-05, + "loss": 2.5373, + "step": 36456 + }, + { + "epoch": 1.6973485113019997, + "grad_norm": 0.3196642589477683, + "learning_rate": 4.724390380063164e-05, + "loss": 2.7119, + "step": 36457 + }, + { + "epoch": 1.6973950694880928, + "grad_norm": 0.3613074641832618, + "learning_rate": 4.724119918640402e-05, + "loss": 2.7639, + "step": 36458 + }, + { + "epoch": 1.697441627674186, + "grad_norm": 0.33974845460564124, + "learning_rate": 4.72384945802732e-05, + "loss": 2.6942, + "step": 36459 + }, + { + "epoch": 1.6974881858602788, + "grad_norm": 0.3269192380301571, + "learning_rate": 4.723578998224714e-05, + "loss": 2.7857, + "step": 36460 + }, + { + "epoch": 1.697534744046372, + "grad_norm": 0.34307672179968063, + "learning_rate": 4.7233085392333734e-05, + "loss": 2.7446, + "step": 36461 + }, + { + "epoch": 1.697581302232465, + "grad_norm": 0.37277921859019414, + 
"learning_rate": 4.723038081054098e-05, + "loss": 2.7351, + "step": 36462 + }, + { + "epoch": 1.697627860418558, + "grad_norm": 0.36935972378130677, + "learning_rate": 4.722767623687679e-05, + "loss": 2.8201, + "step": 36463 + }, + { + "epoch": 1.697674418604651, + "grad_norm": 0.352693644181654, + "learning_rate": 4.7224971671349075e-05, + "loss": 2.7189, + "step": 36464 + }, + { + "epoch": 1.6977209767907442, + "grad_norm": 0.3826421049884738, + "learning_rate": 4.722226711396581e-05, + "loss": 2.7688, + "step": 36465 + }, + { + "epoch": 1.6977675349768373, + "grad_norm": 0.33501161213802055, + "learning_rate": 4.7219562564734905e-05, + "loss": 2.6487, + "step": 36466 + }, + { + "epoch": 1.6978140931629304, + "grad_norm": 0.3764024927294277, + "learning_rate": 4.721685802366432e-05, + "loss": 2.5565, + "step": 36467 + }, + { + "epoch": 1.6978606513490235, + "grad_norm": 0.3563931035062556, + "learning_rate": 4.721415349076199e-05, + "loss": 2.7577, + "step": 36468 + }, + { + "epoch": 1.6979072095351166, + "grad_norm": 0.36067118948600574, + "learning_rate": 4.721144896603582e-05, + "loss": 2.7282, + "step": 36469 + }, + { + "epoch": 1.6979537677212095, + "grad_norm": 0.38781724776544835, + "learning_rate": 4.720874444949379e-05, + "loss": 2.6884, + "step": 36470 + }, + { + "epoch": 1.6980003259073027, + "grad_norm": 0.3329535898417498, + "learning_rate": 4.7206039941143805e-05, + "loss": 2.6642, + "step": 36471 + }, + { + "epoch": 1.6980468840933958, + "grad_norm": 0.3923462713062208, + "learning_rate": 4.720333544099383e-05, + "loss": 2.7578, + "step": 36472 + }, + { + "epoch": 1.6980934422794887, + "grad_norm": 0.3546614624986283, + "learning_rate": 4.72006309490518e-05, + "loss": 2.7519, + "step": 36473 + }, + { + "epoch": 1.6981400004655818, + "grad_norm": 0.3569713535281381, + "learning_rate": 4.719792646532562e-05, + "loss": 2.7244, + "step": 36474 + }, + { + "epoch": 1.698186558651675, + "grad_norm": 0.3511077727951181, + "learning_rate": 4.719522198982327e-05, + "loss": 2.7107, + "step": 36475 + }, + { + "epoch": 1.698233116837768, + "grad_norm": 0.3589748168649829, + "learning_rate": 4.719251752255266e-05, + "loss": 2.6355, + "step": 36476 + }, + { + "epoch": 1.6982796750238611, + "grad_norm": 0.3659193695782746, + "learning_rate": 4.7189813063521734e-05, + "loss": 2.6871, + "step": 36477 + }, + { + "epoch": 1.6983262332099542, + "grad_norm": 0.35729911360639083, + "learning_rate": 4.7187108612738426e-05, + "loss": 2.635, + "step": 36478 + }, + { + "epoch": 1.6983727913960474, + "grad_norm": 0.3724851423700152, + "learning_rate": 4.718440417021068e-05, + "loss": 2.7306, + "step": 36479 + }, + { + "epoch": 1.6984193495821402, + "grad_norm": 0.3771758661723972, + "learning_rate": 4.718169973594643e-05, + "loss": 2.6856, + "step": 36480 + }, + { + "epoch": 1.6984659077682334, + "grad_norm": 0.381949212066314, + "learning_rate": 4.7178995309953624e-05, + "loss": 2.6968, + "step": 36481 + }, + { + "epoch": 1.6985124659543263, + "grad_norm": 0.35164507833441166, + "learning_rate": 4.717629089224016e-05, + "loss": 2.6576, + "step": 36482 + }, + { + "epoch": 1.6985590241404194, + "grad_norm": 0.3803142820035351, + "learning_rate": 4.717358648281404e-05, + "loss": 2.711, + "step": 36483 + }, + { + "epoch": 1.6986055823265125, + "grad_norm": 0.38808051722130277, + "learning_rate": 4.717088208168313e-05, + "loss": 2.6988, + "step": 36484 + }, + { + "epoch": 1.6986521405126056, + "grad_norm": 0.3824055456220575, + "learning_rate": 4.716817768885543e-05, + "loss": 2.8282, + "step": 36485 + 
}, + { + "epoch": 1.6986986986986987, + "grad_norm": 0.3273036257607873, + "learning_rate": 4.716547330433885e-05, + "loss": 2.646, + "step": 36486 + }, + { + "epoch": 1.6987452568847918, + "grad_norm": 0.33875740258801457, + "learning_rate": 4.71627689281413e-05, + "loss": 2.6711, + "step": 36487 + }, + { + "epoch": 1.698791815070885, + "grad_norm": 0.36106616203844927, + "learning_rate": 4.716006456027077e-05, + "loss": 2.7717, + "step": 36488 + }, + { + "epoch": 1.698838373256978, + "grad_norm": 0.33078721876936196, + "learning_rate": 4.7157360200735154e-05, + "loss": 2.6664, + "step": 36489 + }, + { + "epoch": 1.698884931443071, + "grad_norm": 0.3637377413059287, + "learning_rate": 4.715465584954242e-05, + "loss": 2.7322, + "step": 36490 + }, + { + "epoch": 1.698931489629164, + "grad_norm": 0.33823906141091675, + "learning_rate": 4.715195150670048e-05, + "loss": 2.7535, + "step": 36491 + }, + { + "epoch": 1.698978047815257, + "grad_norm": 0.3332422179581189, + "learning_rate": 4.714924717221728e-05, + "loss": 2.7119, + "step": 36492 + }, + { + "epoch": 1.69902460600135, + "grad_norm": 0.36567094244086523, + "learning_rate": 4.714654284610077e-05, + "loss": 2.6976, + "step": 36493 + }, + { + "epoch": 1.6990711641874432, + "grad_norm": 0.3377151999030871, + "learning_rate": 4.714383852835888e-05, + "loss": 2.7299, + "step": 36494 + }, + { + "epoch": 1.6991177223735363, + "grad_norm": 0.3467459475877765, + "learning_rate": 4.7141134218999516e-05, + "loss": 2.7105, + "step": 36495 + }, + { + "epoch": 1.6991642805596294, + "grad_norm": 0.3361495093301043, + "learning_rate": 4.713842991803067e-05, + "loss": 2.7647, + "step": 36496 + }, + { + "epoch": 1.6992108387457225, + "grad_norm": 0.3553628251023064, + "learning_rate": 4.713572562546022e-05, + "loss": 2.6487, + "step": 36497 + }, + { + "epoch": 1.6992573969318157, + "grad_norm": 0.37649761428098405, + "learning_rate": 4.713302134129616e-05, + "loss": 2.8383, + "step": 36498 + }, + { + "epoch": 1.6993039551179088, + "grad_norm": 0.3474371967210477, + "learning_rate": 4.7130317065546396e-05, + "loss": 2.7712, + "step": 36499 + }, + { + "epoch": 1.6993505133040017, + "grad_norm": 0.37253001934148866, + "learning_rate": 4.712761279821885e-05, + "loss": 2.6804, + "step": 36500 + }, + { + "epoch": 1.6993970714900948, + "grad_norm": 0.33514852827496333, + "learning_rate": 4.71249085393215e-05, + "loss": 2.7254, + "step": 36501 + }, + { + "epoch": 1.6994436296761877, + "grad_norm": 0.357324805647203, + "learning_rate": 4.712220428886225e-05, + "loss": 2.6687, + "step": 36502 + }, + { + "epoch": 1.6994901878622808, + "grad_norm": 0.3385788692919674, + "learning_rate": 4.7119500046849055e-05, + "loss": 2.7156, + "step": 36503 + }, + { + "epoch": 1.699536746048374, + "grad_norm": 0.3535663086233055, + "learning_rate": 4.7116795813289834e-05, + "loss": 2.7456, + "step": 36504 + }, + { + "epoch": 1.699583304234467, + "grad_norm": 0.32539489607368677, + "learning_rate": 4.711409158819253e-05, + "loss": 2.8525, + "step": 36505 + }, + { + "epoch": 1.6996298624205601, + "grad_norm": 0.3522244331811147, + "learning_rate": 4.71113873715651e-05, + "loss": 2.7001, + "step": 36506 + }, + { + "epoch": 1.6996764206066532, + "grad_norm": 0.35362945558372877, + "learning_rate": 4.7108683163415436e-05, + "loss": 2.7302, + "step": 36507 + }, + { + "epoch": 1.6997229787927464, + "grad_norm": 0.34086424745257826, + "learning_rate": 4.7105978963751525e-05, + "loss": 2.7083, + "step": 36508 + }, + { + "epoch": 1.6997695369788393, + "grad_norm": 0.37656912578647517, + 
"learning_rate": 4.710327477258128e-05, + "loss": 2.6932, + "step": 36509 + }, + { + "epoch": 1.6998160951649324, + "grad_norm": 0.34360831327035574, + "learning_rate": 4.710057058991262e-05, + "loss": 2.7135, + "step": 36510 + }, + { + "epoch": 1.6998626533510255, + "grad_norm": 0.3742298279084828, + "learning_rate": 4.7097866415753514e-05, + "loss": 2.6874, + "step": 36511 + }, + { + "epoch": 1.6999092115371184, + "grad_norm": 0.35872477775304157, + "learning_rate": 4.7095162250111894e-05, + "loss": 2.6903, + "step": 36512 + }, + { + "epoch": 1.6999557697232115, + "grad_norm": 0.346513974259814, + "learning_rate": 4.709245809299566e-05, + "loss": 2.6864, + "step": 36513 + }, + { + "epoch": 1.7000023279093046, + "grad_norm": 0.3423818923083405, + "learning_rate": 4.70897539444128e-05, + "loss": 2.6786, + "step": 36514 + }, + { + "epoch": 1.7000488860953977, + "grad_norm": 0.34054164454443125, + "learning_rate": 4.708704980437121e-05, + "loss": 2.7686, + "step": 36515 + }, + { + "epoch": 1.7000954442814908, + "grad_norm": 0.41423798074005247, + "learning_rate": 4.7084345672878856e-05, + "loss": 2.7192, + "step": 36516 + }, + { + "epoch": 1.700142002467584, + "grad_norm": 0.33750165490031475, + "learning_rate": 4.708164154994365e-05, + "loss": 2.7462, + "step": 36517 + }, + { + "epoch": 1.700188560653677, + "grad_norm": 0.3927284134888701, + "learning_rate": 4.7078937435573536e-05, + "loss": 2.7274, + "step": 36518 + }, + { + "epoch": 1.70023511883977, + "grad_norm": 0.38587814071358756, + "learning_rate": 4.707623332977646e-05, + "loss": 2.7105, + "step": 36519 + }, + { + "epoch": 1.700281677025863, + "grad_norm": 0.3516873530805949, + "learning_rate": 4.707352923256033e-05, + "loss": 2.6696, + "step": 36520 + }, + { + "epoch": 1.7003282352119562, + "grad_norm": 0.3720575636937467, + "learning_rate": 4.7070825143933136e-05, + "loss": 2.7457, + "step": 36521 + }, + { + "epoch": 1.700374793398049, + "grad_norm": 0.33452754465201756, + "learning_rate": 4.706812106390277e-05, + "loss": 2.7245, + "step": 36522 + }, + { + "epoch": 1.7004213515841422, + "grad_norm": 0.3846836596291703, + "learning_rate": 4.706541699247716e-05, + "loss": 2.8117, + "step": 36523 + }, + { + "epoch": 1.7004679097702353, + "grad_norm": 0.3784336290124047, + "learning_rate": 4.706271292966429e-05, + "loss": 2.7304, + "step": 36524 + }, + { + "epoch": 1.7005144679563284, + "grad_norm": 0.33906548095440797, + "learning_rate": 4.706000887547204e-05, + "loss": 2.7517, + "step": 36525 + }, + { + "epoch": 1.7005610261424216, + "grad_norm": 0.34857636294848776, + "learning_rate": 4.70573048299084e-05, + "loss": 2.6501, + "step": 36526 + }, + { + "epoch": 1.7006075843285147, + "grad_norm": 0.3528264189802008, + "learning_rate": 4.7054600792981276e-05, + "loss": 2.725, + "step": 36527 + }, + { + "epoch": 1.7006541425146078, + "grad_norm": 0.3271011864073143, + "learning_rate": 4.7051896764698605e-05, + "loss": 2.6464, + "step": 36528 + }, + { + "epoch": 1.7007007007007007, + "grad_norm": 0.34530761683725436, + "learning_rate": 4.7049192745068336e-05, + "loss": 2.6669, + "step": 36529 + }, + { + "epoch": 1.7007472588867938, + "grad_norm": 0.40237654729330924, + "learning_rate": 4.7046488734098375e-05, + "loss": 2.6295, + "step": 36530 + }, + { + "epoch": 1.7007938170728867, + "grad_norm": 0.33595758636394685, + "learning_rate": 4.7043784731796684e-05, + "loss": 2.7614, + "step": 36531 + }, + { + "epoch": 1.7008403752589798, + "grad_norm": 0.38747371800259994, + "learning_rate": 4.7041080738171214e-05, + "loss": 2.7195, + "step": 
36532 + }, + { + "epoch": 1.700886933445073, + "grad_norm": 0.3452116645795281, + "learning_rate": 4.703837675322984e-05, + "loss": 2.6635, + "step": 36533 + }, + { + "epoch": 1.700933491631166, + "grad_norm": 0.34424062338567407, + "learning_rate": 4.7035672776980576e-05, + "loss": 2.5743, + "step": 36534 + }, + { + "epoch": 1.7009800498172591, + "grad_norm": 0.35225137360798525, + "learning_rate": 4.703296880943131e-05, + "loss": 2.7897, + "step": 36535 + }, + { + "epoch": 1.7010266080033523, + "grad_norm": 0.3604328753154089, + "learning_rate": 4.7030264850589975e-05, + "loss": 2.7035, + "step": 36536 + }, + { + "epoch": 1.7010731661894454, + "grad_norm": 0.34651224690019794, + "learning_rate": 4.702756090046454e-05, + "loss": 2.7501, + "step": 36537 + }, + { + "epoch": 1.7011197243755385, + "grad_norm": 0.32916586092615524, + "learning_rate": 4.7024856959062904e-05, + "loss": 2.7305, + "step": 36538 + }, + { + "epoch": 1.7011662825616314, + "grad_norm": 0.341019116138901, + "learning_rate": 4.702215302639303e-05, + "loss": 2.729, + "step": 36539 + }, + { + "epoch": 1.7012128407477245, + "grad_norm": 0.30912135923908635, + "learning_rate": 4.701944910246285e-05, + "loss": 2.6361, + "step": 36540 + }, + { + "epoch": 1.7012593989338174, + "grad_norm": 0.35078304790593434, + "learning_rate": 4.701674518728028e-05, + "loss": 2.6724, + "step": 36541 + }, + { + "epoch": 1.7013059571199105, + "grad_norm": 0.3196676373699454, + "learning_rate": 4.701404128085328e-05, + "loss": 2.673, + "step": 36542 + }, + { + "epoch": 1.7013525153060036, + "grad_norm": 0.33178147139862785, + "learning_rate": 4.701133738318976e-05, + "loss": 2.7506, + "step": 36543 + }, + { + "epoch": 1.7013990734920967, + "grad_norm": 0.3572306401727829, + "learning_rate": 4.7008633494297684e-05, + "loss": 2.8396, + "step": 36544 + }, + { + "epoch": 1.7014456316781899, + "grad_norm": 0.33316436166197677, + "learning_rate": 4.7005929614184975e-05, + "loss": 2.747, + "step": 36545 + }, + { + "epoch": 1.701492189864283, + "grad_norm": 0.35519467518829306, + "learning_rate": 4.700322574285955e-05, + "loss": 2.6747, + "step": 36546 + }, + { + "epoch": 1.701538748050376, + "grad_norm": 0.37006174914172735, + "learning_rate": 4.7000521880329374e-05, + "loss": 2.725, + "step": 36547 + }, + { + "epoch": 1.701585306236469, + "grad_norm": 0.34612504088076096, + "learning_rate": 4.699781802660238e-05, + "loss": 2.7155, + "step": 36548 + }, + { + "epoch": 1.701631864422562, + "grad_norm": 0.36894949431191754, + "learning_rate": 4.699511418168647e-05, + "loss": 2.6382, + "step": 36549 + }, + { + "epoch": 1.7016784226086552, + "grad_norm": 0.3316029055095795, + "learning_rate": 4.699241034558963e-05, + "loss": 2.6496, + "step": 36550 + }, + { + "epoch": 1.701724980794748, + "grad_norm": 0.33554365223995597, + "learning_rate": 4.698970651831974e-05, + "loss": 2.7506, + "step": 36551 + }, + { + "epoch": 1.7017715389808412, + "grad_norm": 0.3355151520763143, + "learning_rate": 4.698700269988479e-05, + "loss": 2.7134, + "step": 36552 + }, + { + "epoch": 1.7018180971669343, + "grad_norm": 0.3149994272268237, + "learning_rate": 4.6984298890292686e-05, + "loss": 2.598, + "step": 36553 + }, + { + "epoch": 1.7018646553530274, + "grad_norm": 0.3725494579299867, + "learning_rate": 4.698159508955137e-05, + "loss": 2.722, + "step": 36554 + }, + { + "epoch": 1.7019112135391206, + "grad_norm": 0.31144482051109756, + "learning_rate": 4.697889129766877e-05, + "loss": 2.6229, + "step": 36555 + }, + { + "epoch": 1.7019577717252137, + "grad_norm": 
0.3415463520054106, + "learning_rate": 4.697618751465281e-05, + "loss": 2.7496, + "step": 36556 + }, + { + "epoch": 1.7020043299113068, + "grad_norm": 0.36530823253669503, + "learning_rate": 4.697348374051147e-05, + "loss": 2.667, + "step": 36557 + }, + { + "epoch": 1.7020508880973997, + "grad_norm": 0.34031413141113376, + "learning_rate": 4.697077997525265e-05, + "loss": 2.7304, + "step": 36558 + }, + { + "epoch": 1.7020974462834928, + "grad_norm": 0.35669597269845993, + "learning_rate": 4.6968076218884274e-05, + "loss": 2.6198, + "step": 36559 + }, + { + "epoch": 1.702144004469586, + "grad_norm": 0.31802563825259733, + "learning_rate": 4.6965372471414316e-05, + "loss": 2.8057, + "step": 36560 + }, + { + "epoch": 1.7021905626556788, + "grad_norm": 0.36694739021859574, + "learning_rate": 4.6962668732850666e-05, + "loss": 2.768, + "step": 36561 + }, + { + "epoch": 1.702237120841772, + "grad_norm": 0.37192935347846157, + "learning_rate": 4.695996500320131e-05, + "loss": 2.7086, + "step": 36562 + }, + { + "epoch": 1.702283679027865, + "grad_norm": 0.33965576773454925, + "learning_rate": 4.6957261282474156e-05, + "loss": 2.65, + "step": 36563 + }, + { + "epoch": 1.7023302372139582, + "grad_norm": 0.34451812873439636, + "learning_rate": 4.695455757067712e-05, + "loss": 2.5998, + "step": 36564 + }, + { + "epoch": 1.7023767954000513, + "grad_norm": 0.3437036524936662, + "learning_rate": 4.695185386781817e-05, + "loss": 2.658, + "step": 36565 + }, + { + "epoch": 1.7024233535861444, + "grad_norm": 0.36134311401713654, + "learning_rate": 4.694915017390524e-05, + "loss": 2.63, + "step": 36566 + }, + { + "epoch": 1.7024699117722375, + "grad_norm": 0.34003370965479784, + "learning_rate": 4.6946446488946246e-05, + "loss": 2.6782, + "step": 36567 + }, + { + "epoch": 1.7025164699583304, + "grad_norm": 0.3229279920105642, + "learning_rate": 4.694374281294913e-05, + "loss": 2.7052, + "step": 36568 + }, + { + "epoch": 1.7025630281444235, + "grad_norm": 0.35952493465148133, + "learning_rate": 4.694103914592181e-05, + "loss": 2.6975, + "step": 36569 + }, + { + "epoch": 1.7026095863305164, + "grad_norm": 0.3352062705152344, + "learning_rate": 4.693833548787226e-05, + "loss": 2.6191, + "step": 36570 + }, + { + "epoch": 1.7026561445166095, + "grad_norm": 0.3728386979586681, + "learning_rate": 4.69356318388084e-05, + "loss": 2.7364, + "step": 36571 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 0.33059769423324536, + "learning_rate": 4.693292819873813e-05, + "loss": 2.6624, + "step": 36572 + }, + { + "epoch": 1.7027492608887957, + "grad_norm": 0.3606550075347242, + "learning_rate": 4.693022456766944e-05, + "loss": 2.7139, + "step": 36573 + }, + { + "epoch": 1.7027958190748889, + "grad_norm": 0.35083411029314565, + "learning_rate": 4.692752094561022e-05, + "loss": 2.6518, + "step": 36574 + }, + { + "epoch": 1.702842377260982, + "grad_norm": 0.35002088517130986, + "learning_rate": 4.6924817332568433e-05, + "loss": 2.8061, + "step": 36575 + }, + { + "epoch": 1.702888935447075, + "grad_norm": 0.3488434358944323, + "learning_rate": 4.692211372855201e-05, + "loss": 2.8281, + "step": 36576 + }, + { + "epoch": 1.7029354936331682, + "grad_norm": 0.3299577282166199, + "learning_rate": 4.691941013356886e-05, + "loss": 2.5939, + "step": 36577 + }, + { + "epoch": 1.702982051819261, + "grad_norm": 0.3471457672449005, + "learning_rate": 4.691670654762696e-05, + "loss": 2.6423, + "step": 36578 + }, + { + "epoch": 1.7030286100053542, + "grad_norm": 0.3723704268096984, + "learning_rate": 4.6914002970734205e-05, + "loss": 
2.7182, + "step": 36579 + }, + { + "epoch": 1.703075168191447, + "grad_norm": 0.3232257045272165, + "learning_rate": 4.691129940289857e-05, + "loss": 2.5941, + "step": 36580 + }, + { + "epoch": 1.7031217263775402, + "grad_norm": 0.37742355043005565, + "learning_rate": 4.6908595844127956e-05, + "loss": 2.755, + "step": 36581 + }, + { + "epoch": 1.7031682845636333, + "grad_norm": 0.3416149234678435, + "learning_rate": 4.690589229443029e-05, + "loss": 2.7175, + "step": 36582 + }, + { + "epoch": 1.7032148427497265, + "grad_norm": 0.3798845724496082, + "learning_rate": 4.690318875381355e-05, + "loss": 2.8288, + "step": 36583 + }, + { + "epoch": 1.7032614009358196, + "grad_norm": 0.37052194429265733, + "learning_rate": 4.690048522228564e-05, + "loss": 2.792, + "step": 36584 + }, + { + "epoch": 1.7033079591219127, + "grad_norm": 0.351884555505831, + "learning_rate": 4.689778169985448e-05, + "loss": 2.8055, + "step": 36585 + }, + { + "epoch": 1.7033545173080058, + "grad_norm": 0.3295819339747961, + "learning_rate": 4.689507818652805e-05, + "loss": 2.6602, + "step": 36586 + }, + { + "epoch": 1.703401075494099, + "grad_norm": 0.35230620216312863, + "learning_rate": 4.6892374682314245e-05, + "loss": 2.7268, + "step": 36587 + }, + { + "epoch": 1.7034476336801918, + "grad_norm": 0.3334477152754097, + "learning_rate": 4.688967118722102e-05, + "loss": 2.7074, + "step": 36588 + }, + { + "epoch": 1.703494191866285, + "grad_norm": 0.35101007268504275, + "learning_rate": 4.688696770125632e-05, + "loss": 2.6829, + "step": 36589 + }, + { + "epoch": 1.7035407500523778, + "grad_norm": 0.33603027460639373, + "learning_rate": 4.688426422442803e-05, + "loss": 2.5926, + "step": 36590 + }, + { + "epoch": 1.703587308238471, + "grad_norm": 0.34959381623219393, + "learning_rate": 4.688156075674414e-05, + "loss": 2.6497, + "step": 36591 + }, + { + "epoch": 1.703633866424564, + "grad_norm": 0.32557647970930725, + "learning_rate": 4.687885729821256e-05, + "loss": 2.7731, + "step": 36592 + }, + { + "epoch": 1.7036804246106572, + "grad_norm": 0.3720932070035537, + "learning_rate": 4.687615384884123e-05, + "loss": 2.722, + "step": 36593 + }, + { + "epoch": 1.7037269827967503, + "grad_norm": 0.36062306726286036, + "learning_rate": 4.687345040863808e-05, + "loss": 2.7527, + "step": 36594 + }, + { + "epoch": 1.7037735409828434, + "grad_norm": 0.34733145937063226, + "learning_rate": 4.687074697761102e-05, + "loss": 2.7305, + "step": 36595 + }, + { + "epoch": 1.7038200991689365, + "grad_norm": 0.3252633952314846, + "learning_rate": 4.6868043555768034e-05, + "loss": 2.6898, + "step": 36596 + }, + { + "epoch": 1.7038666573550294, + "grad_norm": 0.32217161104251035, + "learning_rate": 4.6865340143117014e-05, + "loss": 2.7287, + "step": 36597 + }, + { + "epoch": 1.7039132155411225, + "grad_norm": 0.35503979506788597, + "learning_rate": 4.686263673966593e-05, + "loss": 2.6204, + "step": 36598 + }, + { + "epoch": 1.7039597737272156, + "grad_norm": 0.34299742633250574, + "learning_rate": 4.6859933345422704e-05, + "loss": 2.716, + "step": 36599 + }, + { + "epoch": 1.7040063319133085, + "grad_norm": 0.34719935950013014, + "learning_rate": 4.685722996039524e-05, + "loss": 2.738, + "step": 36600 + }, + { + "epoch": 1.7040528900994016, + "grad_norm": 0.3398247255969187, + "learning_rate": 4.685452658459151e-05, + "loss": 2.768, + "step": 36601 + }, + { + "epoch": 1.7040994482854948, + "grad_norm": 0.3513149952990714, + "learning_rate": 4.6851823218019444e-05, + "loss": 2.7641, + "step": 36602 + }, + { + "epoch": 1.7041460064715879, + 
"grad_norm": 0.34921047157495255, + "learning_rate": 4.684911986068694e-05, + "loss": 2.7407, + "step": 36603 + }, + { + "epoch": 1.704192564657681, + "grad_norm": 0.3401087280728531, + "learning_rate": 4.684641651260199e-05, + "loss": 2.7506, + "step": 36604 + }, + { + "epoch": 1.704239122843774, + "grad_norm": 0.3568705498479217, + "learning_rate": 4.684371317377248e-05, + "loss": 2.6781, + "step": 36605 + }, + { + "epoch": 1.7042856810298672, + "grad_norm": 0.3478460499807456, + "learning_rate": 4.684100984420636e-05, + "loss": 2.7775, + "step": 36606 + }, + { + "epoch": 1.70433223921596, + "grad_norm": 0.37599298805117337, + "learning_rate": 4.683830652391158e-05, + "loss": 2.7058, + "step": 36607 + }, + { + "epoch": 1.7043787974020532, + "grad_norm": 0.3686987693529625, + "learning_rate": 4.683560321289603e-05, + "loss": 2.7766, + "step": 36608 + }, + { + "epoch": 1.7044253555881463, + "grad_norm": 0.3557694762785227, + "learning_rate": 4.68328999111677e-05, + "loss": 2.7059, + "step": 36609 + }, + { + "epoch": 1.7044719137742392, + "grad_norm": 0.38167310498774754, + "learning_rate": 4.683019661873447e-05, + "loss": 2.7249, + "step": 36610 + }, + { + "epoch": 1.7045184719603323, + "grad_norm": 0.35110794835052794, + "learning_rate": 4.682749333560432e-05, + "loss": 2.7548, + "step": 36611 + }, + { + "epoch": 1.7045650301464255, + "grad_norm": 0.35473780270448907, + "learning_rate": 4.682479006178517e-05, + "loss": 2.6811, + "step": 36612 + }, + { + "epoch": 1.7046115883325186, + "grad_norm": 0.3518123477837184, + "learning_rate": 4.682208679728492e-05, + "loss": 2.7003, + "step": 36613 + }, + { + "epoch": 1.7046581465186117, + "grad_norm": 0.36397640808567877, + "learning_rate": 4.6819383542111564e-05, + "loss": 2.6944, + "step": 36614 + }, + { + "epoch": 1.7047047047047048, + "grad_norm": 0.33623342898564934, + "learning_rate": 4.681668029627299e-05, + "loss": 2.7604, + "step": 36615 + }, + { + "epoch": 1.704751262890798, + "grad_norm": 0.38742393715098755, + "learning_rate": 4.681397705977714e-05, + "loss": 2.6026, + "step": 36616 + }, + { + "epoch": 1.7047978210768908, + "grad_norm": 0.32591129567838917, + "learning_rate": 4.6811273832631974e-05, + "loss": 2.6885, + "step": 36617 + }, + { + "epoch": 1.704844379262984, + "grad_norm": 0.3464628279427139, + "learning_rate": 4.680857061484539e-05, + "loss": 2.7141, + "step": 36618 + }, + { + "epoch": 1.7048909374490768, + "grad_norm": 0.3634205323386155, + "learning_rate": 4.680586740642534e-05, + "loss": 2.7062, + "step": 36619 + }, + { + "epoch": 1.70493749563517, + "grad_norm": 0.34524058537444186, + "learning_rate": 4.680316420737976e-05, + "loss": 2.6824, + "step": 36620 + }, + { + "epoch": 1.704984053821263, + "grad_norm": 0.3406283190325297, + "learning_rate": 4.680046101771656e-05, + "loss": 2.7118, + "step": 36621 + }, + { + "epoch": 1.7050306120073562, + "grad_norm": 0.34622000912304485, + "learning_rate": 4.679775783744371e-05, + "loss": 2.7038, + "step": 36622 + }, + { + "epoch": 1.7050771701934493, + "grad_norm": 0.34390091428379416, + "learning_rate": 4.6795054666569104e-05, + "loss": 2.7326, + "step": 36623 + }, + { + "epoch": 1.7051237283795424, + "grad_norm": 0.34107165586311455, + "learning_rate": 4.679235150510072e-05, + "loss": 2.6473, + "step": 36624 + }, + { + "epoch": 1.7051702865656355, + "grad_norm": 0.3841443579938512, + "learning_rate": 4.6789648353046464e-05, + "loss": 2.7103, + "step": 36625 + }, + { + "epoch": 1.7052168447517286, + "grad_norm": 0.39196117417151266, + "learning_rate": 
4.6786945210414254e-05, + "loss": 2.7189, + "step": 36626 + }, + { + "epoch": 1.7052634029378215, + "grad_norm": 0.3429799320656156, + "learning_rate": 4.678424207721207e-05, + "loss": 2.7084, + "step": 36627 + }, + { + "epoch": 1.7053099611239146, + "grad_norm": 0.3717287170117243, + "learning_rate": 4.67815389534478e-05, + "loss": 2.6949, + "step": 36628 + }, + { + "epoch": 1.7053565193100075, + "grad_norm": 0.3643694126349031, + "learning_rate": 4.67788358391294e-05, + "loss": 2.7098, + "step": 36629 + }, + { + "epoch": 1.7054030774961007, + "grad_norm": 0.35365852304894774, + "learning_rate": 4.677613273426482e-05, + "loss": 2.7467, + "step": 36630 + }, + { + "epoch": 1.7054496356821938, + "grad_norm": 0.3697074040792606, + "learning_rate": 4.677342963886195e-05, + "loss": 2.7573, + "step": 36631 + }, + { + "epoch": 1.7054961938682869, + "grad_norm": 0.3664408780894327, + "learning_rate": 4.6770726552928756e-05, + "loss": 2.7244, + "step": 36632 + }, + { + "epoch": 1.70554275205438, + "grad_norm": 0.3847656320338837, + "learning_rate": 4.676802347647314e-05, + "loss": 2.6866, + "step": 36633 + }, + { + "epoch": 1.705589310240473, + "grad_norm": 0.3501073597305436, + "learning_rate": 4.6765320409503085e-05, + "loss": 2.739, + "step": 36634 + }, + { + "epoch": 1.7056358684265662, + "grad_norm": 0.37108523071214317, + "learning_rate": 4.676261735202649e-05, + "loss": 2.6402, + "step": 36635 + }, + { + "epoch": 1.7056824266126591, + "grad_norm": 0.3548921836869728, + "learning_rate": 4.6759914304051276e-05, + "loss": 2.5941, + "step": 36636 + }, + { + "epoch": 1.7057289847987522, + "grad_norm": 0.3351530519682237, + "learning_rate": 4.6757211265585416e-05, + "loss": 2.7307, + "step": 36637 + }, + { + "epoch": 1.7057755429848453, + "grad_norm": 0.33585995149508663, + "learning_rate": 4.675450823663682e-05, + "loss": 2.6341, + "step": 36638 + }, + { + "epoch": 1.7058221011709382, + "grad_norm": 0.3222855329352846, + "learning_rate": 4.67518052172134e-05, + "loss": 2.6361, + "step": 36639 + }, + { + "epoch": 1.7058686593570314, + "grad_norm": 0.3002301713921012, + "learning_rate": 4.674910220732314e-05, + "loss": 2.6025, + "step": 36640 + }, + { + "epoch": 1.7059152175431245, + "grad_norm": 0.33754333667805025, + "learning_rate": 4.6746399206973924e-05, + "loss": 2.5512, + "step": 36641 + }, + { + "epoch": 1.7059617757292176, + "grad_norm": 0.31392301405426787, + "learning_rate": 4.674369621617371e-05, + "loss": 2.7671, + "step": 36642 + }, + { + "epoch": 1.7060083339153107, + "grad_norm": 0.3335552169293738, + "learning_rate": 4.6740993234930435e-05, + "loss": 2.7461, + "step": 36643 + }, + { + "epoch": 1.7060548921014038, + "grad_norm": 0.31836590354915434, + "learning_rate": 4.673829026325202e-05, + "loss": 2.7657, + "step": 36644 + }, + { + "epoch": 1.706101450287497, + "grad_norm": 0.33497201380084574, + "learning_rate": 4.673558730114641e-05, + "loss": 2.6808, + "step": 36645 + }, + { + "epoch": 1.7061480084735898, + "grad_norm": 0.34228976415792345, + "learning_rate": 4.6732884348621506e-05, + "loss": 2.8206, + "step": 36646 + }, + { + "epoch": 1.706194566659683, + "grad_norm": 0.3251695433103149, + "learning_rate": 4.673018140568529e-05, + "loss": 2.7586, + "step": 36647 + }, + { + "epoch": 1.706241124845776, + "grad_norm": 0.3418827770350642, + "learning_rate": 4.672747847234566e-05, + "loss": 2.7649, + "step": 36648 + }, + { + "epoch": 1.706287683031869, + "grad_norm": 0.3496103410047729, + "learning_rate": 4.672477554861054e-05, + "loss": 2.6921, + "step": 36649 + }, + { + "epoch": 
1.706334241217962, + "grad_norm": 0.3354867855083307, + "learning_rate": 4.6722072634487906e-05, + "loss": 2.7686, + "step": 36650 + }, + { + "epoch": 1.7063807994040552, + "grad_norm": 0.33727955155995304, + "learning_rate": 4.6719369729985646e-05, + "loss": 2.6552, + "step": 36651 + }, + { + "epoch": 1.7064273575901483, + "grad_norm": 0.383766156649044, + "learning_rate": 4.671666683511173e-05, + "loss": 2.7311, + "step": 36652 + }, + { + "epoch": 1.7064739157762414, + "grad_norm": 0.33280105704362567, + "learning_rate": 4.6713963949874076e-05, + "loss": 2.7681, + "step": 36653 + }, + { + "epoch": 1.7065204739623345, + "grad_norm": 0.34065610378044986, + "learning_rate": 4.67112610742806e-05, + "loss": 2.6423, + "step": 36654 + }, + { + "epoch": 1.7065670321484276, + "grad_norm": 0.35041986098161376, + "learning_rate": 4.670855820833925e-05, + "loss": 2.7813, + "step": 36655 + }, + { + "epoch": 1.7066135903345205, + "grad_norm": 0.3846015995207869, + "learning_rate": 4.6705855352057975e-05, + "loss": 2.7589, + "step": 36656 + }, + { + "epoch": 1.7066601485206137, + "grad_norm": 0.3443402751626288, + "learning_rate": 4.670315250544468e-05, + "loss": 2.6594, + "step": 36657 + }, + { + "epoch": 1.7067067067067065, + "grad_norm": 0.32951844751030734, + "learning_rate": 4.670044966850731e-05, + "loss": 2.6678, + "step": 36658 + }, + { + "epoch": 1.7067532648927997, + "grad_norm": 0.3595434649647183, + "learning_rate": 4.669774684125377e-05, + "loss": 2.7273, + "step": 36659 + }, + { + "epoch": 1.7067998230788928, + "grad_norm": 0.3403279865814548, + "learning_rate": 4.6695044023692047e-05, + "loss": 2.8201, + "step": 36660 + }, + { + "epoch": 1.706846381264986, + "grad_norm": 0.3594271346360347, + "learning_rate": 4.6692341215830045e-05, + "loss": 2.7203, + "step": 36661 + }, + { + "epoch": 1.706892939451079, + "grad_norm": 0.3516795354739231, + "learning_rate": 4.668963841767567e-05, + "loss": 2.6229, + "step": 36662 + }, + { + "epoch": 1.7069394976371721, + "grad_norm": 0.36711259707021215, + "learning_rate": 4.66869356292369e-05, + "loss": 2.7462, + "step": 36663 + }, + { + "epoch": 1.7069860558232652, + "grad_norm": 0.3378827780043492, + "learning_rate": 4.668423285052163e-05, + "loss": 2.6551, + "step": 36664 + }, + { + "epoch": 1.7070326140093584, + "grad_norm": 0.35910075920509177, + "learning_rate": 4.668153008153783e-05, + "loss": 2.6615, + "step": 36665 + }, + { + "epoch": 1.7070791721954512, + "grad_norm": 0.3735597612144113, + "learning_rate": 4.667882732229341e-05, + "loss": 2.618, + "step": 36666 + }, + { + "epoch": 1.7071257303815444, + "grad_norm": 0.33504951529679383, + "learning_rate": 4.66761245727963e-05, + "loss": 2.7285, + "step": 36667 + }, + { + "epoch": 1.7071722885676373, + "grad_norm": 0.38698384150139753, + "learning_rate": 4.6673421833054434e-05, + "loss": 2.6817, + "step": 36668 + }, + { + "epoch": 1.7072188467537304, + "grad_norm": 0.3535223062682243, + "learning_rate": 4.667071910307574e-05, + "loss": 2.7763, + "step": 36669 + }, + { + "epoch": 1.7072654049398235, + "grad_norm": 0.3541824036817134, + "learning_rate": 4.666801638286818e-05, + "loss": 2.7681, + "step": 36670 + }, + { + "epoch": 1.7073119631259166, + "grad_norm": 0.3904768258163181, + "learning_rate": 4.666531367243966e-05, + "loss": 2.752, + "step": 36671 + }, + { + "epoch": 1.7073585213120097, + "grad_norm": 0.3394297128328245, + "learning_rate": 4.666261097179809e-05, + "loss": 2.6877, + "step": 36672 + }, + { + "epoch": 1.7074050794981028, + "grad_norm": 0.38680113197831795, + "learning_rate": 
4.6659908280951456e-05, + "loss": 2.6635, + "step": 36673 + }, + { + "epoch": 1.707451637684196, + "grad_norm": 0.3657659162388055, + "learning_rate": 4.665720559990766e-05, + "loss": 2.6725, + "step": 36674 + }, + { + "epoch": 1.707498195870289, + "grad_norm": 0.37136635055454975, + "learning_rate": 4.665450292867461e-05, + "loss": 2.6935, + "step": 36675 + }, + { + "epoch": 1.707544754056382, + "grad_norm": 0.3578145576568039, + "learning_rate": 4.6651800267260294e-05, + "loss": 2.6263, + "step": 36676 + }, + { + "epoch": 1.707591312242475, + "grad_norm": 0.35796732161719647, + "learning_rate": 4.6649097615672585e-05, + "loss": 2.6053, + "step": 36677 + }, + { + "epoch": 1.707637870428568, + "grad_norm": 0.41302802349387713, + "learning_rate": 4.664639497391947e-05, + "loss": 2.7668, + "step": 36678 + }, + { + "epoch": 1.707684428614661, + "grad_norm": 0.35668939464267474, + "learning_rate": 4.664369234200885e-05, + "loss": 2.7438, + "step": 36679 + }, + { + "epoch": 1.7077309868007542, + "grad_norm": 0.35228402539959963, + "learning_rate": 4.6640989719948657e-05, + "loss": 2.7126, + "step": 36680 + }, + { + "epoch": 1.7077775449868473, + "grad_norm": 0.3811420327624957, + "learning_rate": 4.663828710774683e-05, + "loss": 2.6793, + "step": 36681 + }, + { + "epoch": 1.7078241031729404, + "grad_norm": 0.353693348464717, + "learning_rate": 4.663558450541129e-05, + "loss": 2.7962, + "step": 36682 + }, + { + "epoch": 1.7078706613590335, + "grad_norm": 0.3684019611350237, + "learning_rate": 4.663288191295e-05, + "loss": 2.6562, + "step": 36683 + }, + { + "epoch": 1.7079172195451267, + "grad_norm": 0.37347846775448146, + "learning_rate": 4.6630179330370865e-05, + "loss": 2.7641, + "step": 36684 + }, + { + "epoch": 1.7079637777312195, + "grad_norm": 0.33742128201186045, + "learning_rate": 4.66274767576818e-05, + "loss": 2.7203, + "step": 36685 + }, + { + "epoch": 1.7080103359173127, + "grad_norm": 0.35157495474330747, + "learning_rate": 4.662477419489078e-05, + "loss": 2.6522, + "step": 36686 + }, + { + "epoch": 1.7080568941034058, + "grad_norm": 0.39772966888646394, + "learning_rate": 4.662207164200569e-05, + "loss": 2.6588, + "step": 36687 + }, + { + "epoch": 1.7081034522894987, + "grad_norm": 0.3628376130762841, + "learning_rate": 4.6619369099034514e-05, + "loss": 2.7988, + "step": 36688 + }, + { + "epoch": 1.7081500104755918, + "grad_norm": 0.38986283534073984, + "learning_rate": 4.6616666565985156e-05, + "loss": 2.711, + "step": 36689 + }, + { + "epoch": 1.708196568661685, + "grad_norm": 0.37906440705983246, + "learning_rate": 4.6613964042865525e-05, + "loss": 2.7325, + "step": 36690 + }, + { + "epoch": 1.708243126847778, + "grad_norm": 0.342955823527468, + "learning_rate": 4.66112615296836e-05, + "loss": 2.7466, + "step": 36691 + }, + { + "epoch": 1.7082896850338711, + "grad_norm": 0.388312781645973, + "learning_rate": 4.6608559026447276e-05, + "loss": 2.6978, + "step": 36692 + }, + { + "epoch": 1.7083362432199642, + "grad_norm": 0.3877715347059382, + "learning_rate": 4.660585653316451e-05, + "loss": 2.739, + "step": 36693 + }, + { + "epoch": 1.7083828014060574, + "grad_norm": 0.3726435569781357, + "learning_rate": 4.660315404984321e-05, + "loss": 2.7004, + "step": 36694 + }, + { + "epoch": 1.7084293595921503, + "grad_norm": 0.38350293182460216, + "learning_rate": 4.660045157649131e-05, + "loss": 2.6463, + "step": 36695 + }, + { + "epoch": 1.7084759177782434, + "grad_norm": 0.3400273456406101, + "learning_rate": 4.659774911311677e-05, + "loss": 2.7515, + "step": 36696 + }, + { + "epoch": 
1.7085224759643365, + "grad_norm": 0.35307026894849464, + "learning_rate": 4.65950466597275e-05, + "loss": 2.7244, + "step": 36697 + }, + { + "epoch": 1.7085690341504294, + "grad_norm": 0.34635636766001643, + "learning_rate": 4.6592344216331406e-05, + "loss": 2.7077, + "step": 36698 + }, + { + "epoch": 1.7086155923365225, + "grad_norm": 0.34602244705742174, + "learning_rate": 4.658964178293647e-05, + "loss": 2.5794, + "step": 36699 + }, + { + "epoch": 1.7086621505226156, + "grad_norm": 0.3479093619472296, + "learning_rate": 4.658693935955057e-05, + "loss": 2.6574, + "step": 36700 + }, + { + "epoch": 1.7087087087087087, + "grad_norm": 0.3308093513894369, + "learning_rate": 4.65842369461817e-05, + "loss": 2.5922, + "step": 36701 + }, + { + "epoch": 1.7087552668948018, + "grad_norm": 0.3572968608540384, + "learning_rate": 4.6581534542837755e-05, + "loss": 2.7763, + "step": 36702 + }, + { + "epoch": 1.708801825080895, + "grad_norm": 0.37012465881681944, + "learning_rate": 4.6578832149526655e-05, + "loss": 2.7513, + "step": 36703 + }, + { + "epoch": 1.708848383266988, + "grad_norm": 0.35651551412087185, + "learning_rate": 4.657612976625636e-05, + "loss": 2.5818, + "step": 36704 + }, + { + "epoch": 1.708894941453081, + "grad_norm": 0.3443278290781834, + "learning_rate": 4.657342739303477e-05, + "loss": 2.6941, + "step": 36705 + }, + { + "epoch": 1.708941499639174, + "grad_norm": 0.3310338725056034, + "learning_rate": 4.657072502986985e-05, + "loss": 2.8132, + "step": 36706 + }, + { + "epoch": 1.708988057825267, + "grad_norm": 0.3369190671850985, + "learning_rate": 4.6568022676769505e-05, + "loss": 2.7151, + "step": 36707 + }, + { + "epoch": 1.70903461601136, + "grad_norm": 0.3760908814440183, + "learning_rate": 4.656532033374167e-05, + "loss": 2.6861, + "step": 36708 + }, + { + "epoch": 1.7090811741974532, + "grad_norm": 0.345488384095611, + "learning_rate": 4.6562618000794296e-05, + "loss": 2.6716, + "step": 36709 + }, + { + "epoch": 1.7091277323835463, + "grad_norm": 0.37906546479331416, + "learning_rate": 4.6559915677935276e-05, + "loss": 2.6766, + "step": 36710 + }, + { + "epoch": 1.7091742905696394, + "grad_norm": 0.3739716133336596, + "learning_rate": 4.655721336517259e-05, + "loss": 2.7477, + "step": 36711 + }, + { + "epoch": 1.7092208487557325, + "grad_norm": 0.3489049198082616, + "learning_rate": 4.655451106251414e-05, + "loss": 2.7382, + "step": 36712 + }, + { + "epoch": 1.7092674069418257, + "grad_norm": 0.3678097096550102, + "learning_rate": 4.655180876996784e-05, + "loss": 2.7068, + "step": 36713 + }, + { + "epoch": 1.7093139651279188, + "grad_norm": 0.36059371901826365, + "learning_rate": 4.654910648754166e-05, + "loss": 2.7373, + "step": 36714 + }, + { + "epoch": 1.7093605233140117, + "grad_norm": 0.3703546893931321, + "learning_rate": 4.6546404215243516e-05, + "loss": 2.6557, + "step": 36715 + }, + { + "epoch": 1.7094070815001048, + "grad_norm": 0.34081799326657414, + "learning_rate": 4.654370195308131e-05, + "loss": 2.6088, + "step": 36716 + }, + { + "epoch": 1.7094536396861977, + "grad_norm": 0.3435538259758444, + "learning_rate": 4.654099970106303e-05, + "loss": 2.5612, + "step": 36717 + }, + { + "epoch": 1.7095001978722908, + "grad_norm": 0.3731294153905059, + "learning_rate": 4.653829745919656e-05, + "loss": 2.707, + "step": 36718 + }, + { + "epoch": 1.709546756058384, + "grad_norm": 0.3512552659698506, + "learning_rate": 4.6535595227489856e-05, + "loss": 2.6606, + "step": 36719 + }, + { + "epoch": 1.709593314244477, + "grad_norm": 0.33987417292703676, + "learning_rate": 
4.6532893005950824e-05, + "loss": 2.7348, + "step": 36720 + }, + { + "epoch": 1.7096398724305701, + "grad_norm": 0.36784274834508185, + "learning_rate": 4.653019079458741e-05, + "loss": 2.7369, + "step": 36721 + }, + { + "epoch": 1.7096864306166633, + "grad_norm": 0.37248093734417687, + "learning_rate": 4.652748859340755e-05, + "loss": 2.8061, + "step": 36722 + }, + { + "epoch": 1.7097329888027564, + "grad_norm": 0.36947229391699365, + "learning_rate": 4.652478640241916e-05, + "loss": 2.7555, + "step": 36723 + }, + { + "epoch": 1.7097795469888493, + "grad_norm": 0.36193166313506225, + "learning_rate": 4.65220842216302e-05, + "loss": 2.7568, + "step": 36724 + }, + { + "epoch": 1.7098261051749424, + "grad_norm": 0.34966032155725124, + "learning_rate": 4.651938205104857e-05, + "loss": 2.7648, + "step": 36725 + }, + { + "epoch": 1.7098726633610355, + "grad_norm": 0.34559964423494755, + "learning_rate": 4.65166798906822e-05, + "loss": 2.626, + "step": 36726 + }, + { + "epoch": 1.7099192215471284, + "grad_norm": 0.3652085771225027, + "learning_rate": 4.6513977740539045e-05, + "loss": 2.7957, + "step": 36727 + }, + { + "epoch": 1.7099657797332215, + "grad_norm": 0.3374445408382926, + "learning_rate": 4.6511275600627005e-05, + "loss": 2.6896, + "step": 36728 + }, + { + "epoch": 1.7100123379193146, + "grad_norm": 0.3325031577450747, + "learning_rate": 4.650857347095404e-05, + "loss": 2.5871, + "step": 36729 + }, + { + "epoch": 1.7100588961054077, + "grad_norm": 0.33340574654691985, + "learning_rate": 4.650587135152808e-05, + "loss": 2.7087, + "step": 36730 + }, + { + "epoch": 1.7101054542915008, + "grad_norm": 0.3436556962059204, + "learning_rate": 4.6503169242357025e-05, + "loss": 2.5917, + "step": 36731 + }, + { + "epoch": 1.710152012477594, + "grad_norm": 0.34894372190582157, + "learning_rate": 4.6500467143448834e-05, + "loss": 2.6561, + "step": 36732 + }, + { + "epoch": 1.710198570663687, + "grad_norm": 0.3449155389498308, + "learning_rate": 4.649776505481142e-05, + "loss": 2.7231, + "step": 36733 + }, + { + "epoch": 1.71024512884978, + "grad_norm": 0.3392073769325807, + "learning_rate": 4.649506297645272e-05, + "loss": 2.7308, + "step": 36734 + }, + { + "epoch": 1.710291687035873, + "grad_norm": 0.32444097605795147, + "learning_rate": 4.649236090838068e-05, + "loss": 2.7474, + "step": 36735 + }, + { + "epoch": 1.7103382452219662, + "grad_norm": 0.35854554403316896, + "learning_rate": 4.6489658850603186e-05, + "loss": 2.7117, + "step": 36736 + }, + { + "epoch": 1.710384803408059, + "grad_norm": 0.3289881581953553, + "learning_rate": 4.6486956803128225e-05, + "loss": 2.7498, + "step": 36737 + }, + { + "epoch": 1.7104313615941522, + "grad_norm": 0.34191728708263236, + "learning_rate": 4.6484254765963694e-05, + "loss": 2.7523, + "step": 36738 + }, + { + "epoch": 1.7104779197802453, + "grad_norm": 0.348680138110205, + "learning_rate": 4.648155273911751e-05, + "loss": 2.6936, + "step": 36739 + }, + { + "epoch": 1.7105244779663384, + "grad_norm": 0.3695351498087945, + "learning_rate": 4.6478850722597644e-05, + "loss": 2.6199, + "step": 36740 + }, + { + "epoch": 1.7105710361524316, + "grad_norm": 0.3605060093039233, + "learning_rate": 4.647614871641198e-05, + "loss": 2.6828, + "step": 36741 + }, + { + "epoch": 1.7106175943385247, + "grad_norm": 0.35003592509267295, + "learning_rate": 4.6473446720568496e-05, + "loss": 2.6917, + "step": 36742 + }, + { + "epoch": 1.7106641525246178, + "grad_norm": 0.39789938571009076, + "learning_rate": 4.64707447350751e-05, + "loss": 2.7, + "step": 36743 + }, + { + 
"epoch": 1.7107107107107107, + "grad_norm": 0.3213493729291192, + "learning_rate": 4.64680427599397e-05, + "loss": 2.5979, + "step": 36744 + }, + { + "epoch": 1.7107572688968038, + "grad_norm": 0.3354025846839349, + "learning_rate": 4.646534079517028e-05, + "loss": 2.6529, + "step": 36745 + }, + { + "epoch": 1.7108038270828967, + "grad_norm": 0.3453154543977607, + "learning_rate": 4.646263884077469e-05, + "loss": 2.6434, + "step": 36746 + }, + { + "epoch": 1.7108503852689898, + "grad_norm": 0.351544205652535, + "learning_rate": 4.645993689676094e-05, + "loss": 2.6728, + "step": 36747 + }, + { + "epoch": 1.710896943455083, + "grad_norm": 0.33952339250348607, + "learning_rate": 4.645723496313693e-05, + "loss": 2.7904, + "step": 36748 + }, + { + "epoch": 1.710943501641176, + "grad_norm": 0.3718223699197557, + "learning_rate": 4.645453303991056e-05, + "loss": 2.7355, + "step": 36749 + }, + { + "epoch": 1.7109900598272691, + "grad_norm": 0.36964432915987344, + "learning_rate": 4.6451831127089805e-05, + "loss": 2.6607, + "step": 36750 + }, + { + "epoch": 1.7110366180133623, + "grad_norm": 0.33888250105810463, + "learning_rate": 4.644912922468258e-05, + "loss": 2.736, + "step": 36751 + }, + { + "epoch": 1.7110831761994554, + "grad_norm": 0.34338600795147295, + "learning_rate": 4.644642733269679e-05, + "loss": 2.7557, + "step": 36752 + }, + { + "epoch": 1.7111297343855485, + "grad_norm": 0.3542350646465967, + "learning_rate": 4.64437254511404e-05, + "loss": 2.6734, + "step": 36753 + }, + { + "epoch": 1.7111762925716414, + "grad_norm": 0.36326049824770273, + "learning_rate": 4.6441023580021316e-05, + "loss": 2.7664, + "step": 36754 + }, + { + "epoch": 1.7112228507577345, + "grad_norm": 0.37353676151902987, + "learning_rate": 4.643832171934749e-05, + "loss": 2.692, + "step": 36755 + }, + { + "epoch": 1.7112694089438274, + "grad_norm": 0.36045496720782627, + "learning_rate": 4.643561986912684e-05, + "loss": 2.7558, + "step": 36756 + }, + { + "epoch": 1.7113159671299205, + "grad_norm": 0.3448503987651229, + "learning_rate": 4.6432918029367286e-05, + "loss": 2.6896, + "step": 36757 + }, + { + "epoch": 1.7113625253160136, + "grad_norm": 0.34007009403145433, + "learning_rate": 4.643021620007678e-05, + "loss": 2.7312, + "step": 36758 + }, + { + "epoch": 1.7114090835021067, + "grad_norm": 0.3553579434946406, + "learning_rate": 4.6427514381263205e-05, + "loss": 2.691, + "step": 36759 + }, + { + "epoch": 1.7114556416881999, + "grad_norm": 0.3392283988454679, + "learning_rate": 4.642481257293456e-05, + "loss": 2.7936, + "step": 36760 + }, + { + "epoch": 1.711502199874293, + "grad_norm": 0.3367929068686914, + "learning_rate": 4.6422110775098726e-05, + "loss": 2.7025, + "step": 36761 + }, + { + "epoch": 1.711548758060386, + "grad_norm": 0.3826941042664919, + "learning_rate": 4.641940898776362e-05, + "loss": 2.7375, + "step": 36762 + }, + { + "epoch": 1.7115953162464792, + "grad_norm": 0.3321509490183871, + "learning_rate": 4.641670721093723e-05, + "loss": 2.6986, + "step": 36763 + }, + { + "epoch": 1.711641874432572, + "grad_norm": 0.3388249670009654, + "learning_rate": 4.641400544462742e-05, + "loss": 2.6545, + "step": 36764 + }, + { + "epoch": 1.7116884326186652, + "grad_norm": 0.316338909818725, + "learning_rate": 4.641130368884218e-05, + "loss": 2.6191, + "step": 36765 + }, + { + "epoch": 1.711734990804758, + "grad_norm": 0.2997811817637088, + "learning_rate": 4.64086019435894e-05, + "loss": 2.7009, + "step": 36766 + }, + { + "epoch": 1.7117815489908512, + "grad_norm": 0.32498283028812525, + 
"learning_rate": 4.6405900208877006e-05, + "loss": 2.7542, + "step": 36767 + }, + { + "epoch": 1.7118281071769443, + "grad_norm": 0.31381343311682863, + "learning_rate": 4.640319848471296e-05, + "loss": 2.6447, + "step": 36768 + }, + { + "epoch": 1.7118746653630375, + "grad_norm": 0.3094584160389329, + "learning_rate": 4.640049677110517e-05, + "loss": 2.6415, + "step": 36769 + }, + { + "epoch": 1.7119212235491306, + "grad_norm": 0.33376902218315774, + "learning_rate": 4.639779506806157e-05, + "loss": 2.7292, + "step": 36770 + }, + { + "epoch": 1.7119677817352237, + "grad_norm": 0.3273705655044542, + "learning_rate": 4.639509337559008e-05, + "loss": 2.7916, + "step": 36771 + }, + { + "epoch": 1.7120143399213168, + "grad_norm": 0.34182849039592217, + "learning_rate": 4.639239169369862e-05, + "loss": 2.6506, + "step": 36772 + }, + { + "epoch": 1.7120608981074097, + "grad_norm": 0.3253406528137657, + "learning_rate": 4.638969002239516e-05, + "loss": 2.7881, + "step": 36773 + }, + { + "epoch": 1.7121074562935028, + "grad_norm": 0.38132484602476446, + "learning_rate": 4.63869883616876e-05, + "loss": 2.7273, + "step": 36774 + }, + { + "epoch": 1.712154014479596, + "grad_norm": 0.35024408392871825, + "learning_rate": 4.638428671158385e-05, + "loss": 2.7485, + "step": 36775 + }, + { + "epoch": 1.7122005726656888, + "grad_norm": 0.37415107535115943, + "learning_rate": 4.638158507209189e-05, + "loss": 2.7137, + "step": 36776 + }, + { + "epoch": 1.712247130851782, + "grad_norm": 0.35152514477378066, + "learning_rate": 4.63788834432196e-05, + "loss": 2.6406, + "step": 36777 + }, + { + "epoch": 1.712293689037875, + "grad_norm": 0.36381601216452525, + "learning_rate": 4.637618182497495e-05, + "loss": 2.7442, + "step": 36778 + }, + { + "epoch": 1.7123402472239682, + "grad_norm": 0.3744441201902485, + "learning_rate": 4.637348021736584e-05, + "loss": 2.7014, + "step": 36779 + }, + { + "epoch": 1.7123868054100613, + "grad_norm": 0.35443499180829047, + "learning_rate": 4.63707786204002e-05, + "loss": 2.7795, + "step": 36780 + }, + { + "epoch": 1.7124333635961544, + "grad_norm": 0.33429224659613604, + "learning_rate": 4.6368077034085986e-05, + "loss": 2.6792, + "step": 36781 + }, + { + "epoch": 1.7124799217822475, + "grad_norm": 0.3483702375096209, + "learning_rate": 4.636537545843109e-05, + "loss": 2.658, + "step": 36782 + }, + { + "epoch": 1.7125264799683404, + "grad_norm": 0.34512144616388984, + "learning_rate": 4.636267389344348e-05, + "loss": 2.6989, + "step": 36783 + }, + { + "epoch": 1.7125730381544335, + "grad_norm": 0.3426518771856673, + "learning_rate": 4.635997233913104e-05, + "loss": 2.7653, + "step": 36784 + }, + { + "epoch": 1.7126195963405264, + "grad_norm": 0.36090594493049133, + "learning_rate": 4.6357270795501735e-05, + "loss": 2.6344, + "step": 36785 + }, + { + "epoch": 1.7126661545266195, + "grad_norm": 0.3558779807858449, + "learning_rate": 4.635456926256348e-05, + "loss": 2.5899, + "step": 36786 + }, + { + "epoch": 1.7127127127127126, + "grad_norm": 0.36172897668480036, + "learning_rate": 4.635186774032422e-05, + "loss": 2.6974, + "step": 36787 + }, + { + "epoch": 1.7127592708988058, + "grad_norm": 0.35017854809442517, + "learning_rate": 4.634916622879184e-05, + "loss": 2.7352, + "step": 36788 + }, + { + "epoch": 1.7128058290848989, + "grad_norm": 0.3660867203723885, + "learning_rate": 4.634646472797432e-05, + "loss": 2.6915, + "step": 36789 + }, + { + "epoch": 1.712852387270992, + "grad_norm": 0.33098302906706834, + "learning_rate": 4.634376323787954e-05, + "loss": 2.7231, + "step": 
36790 + }, + { + "epoch": 1.712898945457085, + "grad_norm": 0.3902798598490921, + "learning_rate": 4.634106175851548e-05, + "loss": 2.703, + "step": 36791 + }, + { + "epoch": 1.7129455036431782, + "grad_norm": 0.3228038421905305, + "learning_rate": 4.633836028989004e-05, + "loss": 2.7162, + "step": 36792 + }, + { + "epoch": 1.712992061829271, + "grad_norm": 0.3643147898301946, + "learning_rate": 4.633565883201114e-05, + "loss": 2.5951, + "step": 36793 + }, + { + "epoch": 1.7130386200153642, + "grad_norm": 0.3621320878641325, + "learning_rate": 4.633295738488673e-05, + "loss": 2.8122, + "step": 36794 + }, + { + "epoch": 1.7130851782014571, + "grad_norm": 0.36044244212371407, + "learning_rate": 4.633025594852472e-05, + "loss": 2.6792, + "step": 36795 + }, + { + "epoch": 1.7131317363875502, + "grad_norm": 0.37611488233758605, + "learning_rate": 4.6327554522933064e-05, + "loss": 2.686, + "step": 36796 + }, + { + "epoch": 1.7131782945736433, + "grad_norm": 0.3529683022150085, + "learning_rate": 4.632485310811966e-05, + "loss": 2.8, + "step": 36797 + }, + { + "epoch": 1.7132248527597365, + "grad_norm": 0.37317544558335813, + "learning_rate": 4.6322151704092445e-05, + "loss": 2.6991, + "step": 36798 + }, + { + "epoch": 1.7132714109458296, + "grad_norm": 0.38351532359219237, + "learning_rate": 4.631945031085937e-05, + "loss": 2.7634, + "step": 36799 + }, + { + "epoch": 1.7133179691319227, + "grad_norm": 0.35386945723265534, + "learning_rate": 4.631674892842832e-05, + "loss": 2.6406, + "step": 36800 + }, + { + "epoch": 1.7133645273180158, + "grad_norm": 0.3726903215087587, + "learning_rate": 4.631404755680728e-05, + "loss": 2.7394, + "step": 36801 + }, + { + "epoch": 1.713411085504109, + "grad_norm": 0.3718220405848517, + "learning_rate": 4.631134619600413e-05, + "loss": 2.7435, + "step": 36802 + }, + { + "epoch": 1.7134576436902018, + "grad_norm": 0.3672566096484382, + "learning_rate": 4.6308644846026814e-05, + "loss": 2.6411, + "step": 36803 + }, + { + "epoch": 1.713504201876295, + "grad_norm": 0.35505423110704326, + "learning_rate": 4.630594350688327e-05, + "loss": 2.6836, + "step": 36804 + }, + { + "epoch": 1.7135507600623878, + "grad_norm": 0.36684980355006314, + "learning_rate": 4.630324217858142e-05, + "loss": 2.7424, + "step": 36805 + }, + { + "epoch": 1.713597318248481, + "grad_norm": 0.36788605935507135, + "learning_rate": 4.630054086112917e-05, + "loss": 2.6812, + "step": 36806 + }, + { + "epoch": 1.713643876434574, + "grad_norm": 0.36614836273271867, + "learning_rate": 4.629783955453449e-05, + "loss": 2.6954, + "step": 36807 + }, + { + "epoch": 1.7136904346206672, + "grad_norm": 0.38408162315382194, + "learning_rate": 4.629513825880527e-05, + "loss": 2.5844, + "step": 36808 + }, + { + "epoch": 1.7137369928067603, + "grad_norm": 0.3310007892185989, + "learning_rate": 4.629243697394948e-05, + "loss": 2.7607, + "step": 36809 + }, + { + "epoch": 1.7137835509928534, + "grad_norm": 0.36541020355640197, + "learning_rate": 4.6289735699975e-05, + "loss": 2.658, + "step": 36810 + }, + { + "epoch": 1.7138301091789465, + "grad_norm": 0.3503822534784462, + "learning_rate": 4.6287034436889774e-05, + "loss": 2.8367, + "step": 36811 + }, + { + "epoch": 1.7138766673650394, + "grad_norm": 0.3529493225043648, + "learning_rate": 4.628433318470176e-05, + "loss": 2.6065, + "step": 36812 + }, + { + "epoch": 1.7139232255511325, + "grad_norm": 0.3341883739954478, + "learning_rate": 4.628163194341884e-05, + "loss": 2.747, + "step": 36813 + }, + { + "epoch": 1.7139697837372256, + "grad_norm": 0.3153979312679853, 
+ "learning_rate": 4.6278930713048975e-05, + "loss": 2.6711, + "step": 36814 + }, + { + "epoch": 1.7140163419233185, + "grad_norm": 0.35685189084041824, + "learning_rate": 4.627622949360009e-05, + "loss": 2.6288, + "step": 36815 + }, + { + "epoch": 1.7140629001094116, + "grad_norm": 0.32075227990880595, + "learning_rate": 4.627352828508008e-05, + "loss": 2.6506, + "step": 36816 + }, + { + "epoch": 1.7141094582955048, + "grad_norm": 0.3170278285432341, + "learning_rate": 4.6270827087496916e-05, + "loss": 2.7405, + "step": 36817 + }, + { + "epoch": 1.7141560164815979, + "grad_norm": 0.34990500220748805, + "learning_rate": 4.626812590085849e-05, + "loss": 2.7152, + "step": 36818 + }, + { + "epoch": 1.714202574667691, + "grad_norm": 0.33997575573885946, + "learning_rate": 4.626542472517276e-05, + "loss": 2.6202, + "step": 36819 + }, + { + "epoch": 1.714249132853784, + "grad_norm": 0.3402271449437184, + "learning_rate": 4.6262723560447654e-05, + "loss": 2.7164, + "step": 36820 + }, + { + "epoch": 1.7142956910398772, + "grad_norm": 0.3471388091031764, + "learning_rate": 4.6260022406691066e-05, + "loss": 2.6934, + "step": 36821 + }, + { + "epoch": 1.7143422492259701, + "grad_norm": 0.3278518804943012, + "learning_rate": 4.6257321263910956e-05, + "loss": 2.6315, + "step": 36822 + }, + { + "epoch": 1.7143888074120632, + "grad_norm": 0.3376102038343574, + "learning_rate": 4.6254620132115236e-05, + "loss": 2.6696, + "step": 36823 + }, + { + "epoch": 1.7144353655981563, + "grad_norm": 0.3676368430894442, + "learning_rate": 4.625191901131183e-05, + "loss": 2.7786, + "step": 36824 + }, + { + "epoch": 1.7144819237842492, + "grad_norm": 0.33825543532552166, + "learning_rate": 4.6249217901508676e-05, + "loss": 2.7591, + "step": 36825 + }, + { + "epoch": 1.7145284819703424, + "grad_norm": 0.370177855788857, + "learning_rate": 4.624651680271369e-05, + "loss": 2.7002, + "step": 36826 + }, + { + "epoch": 1.7145750401564355, + "grad_norm": 0.3405517654573057, + "learning_rate": 4.6243815714934826e-05, + "loss": 2.7233, + "step": 36827 + }, + { + "epoch": 1.7146215983425286, + "grad_norm": 0.36597258958569573, + "learning_rate": 4.624111463817999e-05, + "loss": 2.7506, + "step": 36828 + }, + { + "epoch": 1.7146681565286217, + "grad_norm": 0.35343756938112186, + "learning_rate": 4.623841357245709e-05, + "loss": 2.6327, + "step": 36829 + }, + { + "epoch": 1.7147147147147148, + "grad_norm": 0.30473278323218445, + "learning_rate": 4.62357125177741e-05, + "loss": 2.7314, + "step": 36830 + }, + { + "epoch": 1.714761272900808, + "grad_norm": 0.34535009046843324, + "learning_rate": 4.6233011474138896e-05, + "loss": 2.7767, + "step": 36831 + }, + { + "epoch": 1.7148078310869008, + "grad_norm": 0.3425915894861634, + "learning_rate": 4.623031044155946e-05, + "loss": 2.6479, + "step": 36832 + }, + { + "epoch": 1.714854389272994, + "grad_norm": 0.34730387680566527, + "learning_rate": 4.622760942004369e-05, + "loss": 2.6291, + "step": 36833 + }, + { + "epoch": 1.7149009474590868, + "grad_norm": 0.3780223244776547, + "learning_rate": 4.62249084095995e-05, + "loss": 2.734, + "step": 36834 + }, + { + "epoch": 1.71494750564518, + "grad_norm": 0.3384351357418813, + "learning_rate": 4.6222207410234846e-05, + "loss": 2.7178, + "step": 36835 + }, + { + "epoch": 1.714994063831273, + "grad_norm": 0.3642990887077081, + "learning_rate": 4.621950642195762e-05, + "loss": 2.71, + "step": 36836 + }, + { + "epoch": 1.7150406220173662, + "grad_norm": 0.37979115945843034, + "learning_rate": 4.621680544477579e-05, + "loss": 2.7043, + "step": 
36837 + }, + { + "epoch": 1.7150871802034593, + "grad_norm": 0.32732209849924077, + "learning_rate": 4.621410447869726e-05, + "loss": 2.7442, + "step": 36838 + }, + { + "epoch": 1.7151337383895524, + "grad_norm": 0.36832776409068363, + "learning_rate": 4.621140352372995e-05, + "loss": 2.7319, + "step": 36839 + }, + { + "epoch": 1.7151802965756455, + "grad_norm": 0.34853389891024233, + "learning_rate": 4.620870257988181e-05, + "loss": 2.7853, + "step": 36840 + }, + { + "epoch": 1.7152268547617386, + "grad_norm": 0.3537929314776061, + "learning_rate": 4.620600164716076e-05, + "loss": 2.6975, + "step": 36841 + }, + { + "epoch": 1.7152734129478315, + "grad_norm": 0.343038776674267, + "learning_rate": 4.620330072557469e-05, + "loss": 2.6377, + "step": 36842 + }, + { + "epoch": 1.7153199711339246, + "grad_norm": 0.32198563560804494, + "learning_rate": 4.620059981513158e-05, + "loss": 2.7594, + "step": 36843 + }, + { + "epoch": 1.7153665293200175, + "grad_norm": 0.3444252415820524, + "learning_rate": 4.6197898915839326e-05, + "loss": 2.6728, + "step": 36844 + }, + { + "epoch": 1.7154130875061107, + "grad_norm": 0.3376305187410388, + "learning_rate": 4.6195198027705874e-05, + "loss": 2.762, + "step": 36845 + }, + { + "epoch": 1.7154596456922038, + "grad_norm": 0.33967486125840224, + "learning_rate": 4.619249715073915e-05, + "loss": 2.7481, + "step": 36846 + }, + { + "epoch": 1.7155062038782969, + "grad_norm": 0.38517386290719335, + "learning_rate": 4.618979628494706e-05, + "loss": 2.6891, + "step": 36847 + }, + { + "epoch": 1.71555276206439, + "grad_norm": 0.33986513747530483, + "learning_rate": 4.618709543033755e-05, + "loss": 2.6757, + "step": 36848 + }, + { + "epoch": 1.7155993202504831, + "grad_norm": 0.340786272984228, + "learning_rate": 4.6184394586918516e-05, + "loss": 2.6075, + "step": 36849 + }, + { + "epoch": 1.7156458784365762, + "grad_norm": 0.3669586532555418, + "learning_rate": 4.618169375469793e-05, + "loss": 2.5734, + "step": 36850 + }, + { + "epoch": 1.7156924366226691, + "grad_norm": 0.32373548996261553, + "learning_rate": 4.6178992933683704e-05, + "loss": 2.6754, + "step": 36851 + }, + { + "epoch": 1.7157389948087622, + "grad_norm": 0.3322467448416985, + "learning_rate": 4.617629212388373e-05, + "loss": 2.7614, + "step": 36852 + }, + { + "epoch": 1.7157855529948554, + "grad_norm": 0.34426203144423034, + "learning_rate": 4.617359132530598e-05, + "loss": 2.7262, + "step": 36853 + }, + { + "epoch": 1.7158321111809482, + "grad_norm": 0.3293773695488318, + "learning_rate": 4.6170890537958344e-05, + "loss": 2.7028, + "step": 36854 + }, + { + "epoch": 1.7158786693670414, + "grad_norm": 0.31216193505343515, + "learning_rate": 4.616818976184879e-05, + "loss": 2.6303, + "step": 36855 + }, + { + "epoch": 1.7159252275531345, + "grad_norm": 0.34618387814718343, + "learning_rate": 4.616548899698522e-05, + "loss": 2.6647, + "step": 36856 + }, + { + "epoch": 1.7159717857392276, + "grad_norm": 0.32938092244794864, + "learning_rate": 4.616278824337554e-05, + "loss": 2.7366, + "step": 36857 + }, + { + "epoch": 1.7160183439253207, + "grad_norm": 0.3466286740922145, + "learning_rate": 4.616008750102773e-05, + "loss": 2.7144, + "step": 36858 + }, + { + "epoch": 1.7160649021114138, + "grad_norm": 0.3552719539983845, + "learning_rate": 4.615738676994967e-05, + "loss": 2.8028, + "step": 36859 + }, + { + "epoch": 1.716111460297507, + "grad_norm": 0.35416044394883417, + "learning_rate": 4.61546860501493e-05, + "loss": 2.686, + "step": 36860 + }, + { + "epoch": 1.7161580184835998, + "grad_norm": 
0.30816785492763515, + "learning_rate": 4.615198534163455e-05, + "loss": 2.6505, + "step": 36861 + }, + { + "epoch": 1.716204576669693, + "grad_norm": 0.3472117471404314, + "learning_rate": 4.6149284644413334e-05, + "loss": 2.8524, + "step": 36862 + }, + { + "epoch": 1.716251134855786, + "grad_norm": 0.33682635621501383, + "learning_rate": 4.61465839584936e-05, + "loss": 2.6114, + "step": 36863 + }, + { + "epoch": 1.716297693041879, + "grad_norm": 0.3379894626428653, + "learning_rate": 4.6143883283883266e-05, + "loss": 2.7589, + "step": 36864 + }, + { + "epoch": 1.716344251227972, + "grad_norm": 0.35951809281563574, + "learning_rate": 4.614118262059023e-05, + "loss": 2.8096, + "step": 36865 + }, + { + "epoch": 1.7163908094140652, + "grad_norm": 0.3369616998835759, + "learning_rate": 4.6138481968622474e-05, + "loss": 2.7373, + "step": 36866 + }, + { + "epoch": 1.7164373676001583, + "grad_norm": 0.36220056086849134, + "learning_rate": 4.613578132798786e-05, + "loss": 2.7088, + "step": 36867 + }, + { + "epoch": 1.7164839257862514, + "grad_norm": 0.32382696315173853, + "learning_rate": 4.613308069869438e-05, + "loss": 2.7087, + "step": 36868 + }, + { + "epoch": 1.7165304839723445, + "grad_norm": 0.3616540668787997, + "learning_rate": 4.6130380080749914e-05, + "loss": 2.6575, + "step": 36869 + }, + { + "epoch": 1.7165770421584376, + "grad_norm": 0.33989253605279174, + "learning_rate": 4.612767947416239e-05, + "loss": 2.6448, + "step": 36870 + }, + { + "epoch": 1.7166236003445305, + "grad_norm": 0.3359017017685255, + "learning_rate": 4.612497887893976e-05, + "loss": 2.7974, + "step": 36871 + }, + { + "epoch": 1.7166701585306237, + "grad_norm": 0.3400188723480034, + "learning_rate": 4.6122278295089925e-05, + "loss": 2.6538, + "step": 36872 + }, + { + "epoch": 1.7167167167167166, + "grad_norm": 0.36292676569558935, + "learning_rate": 4.611957772262084e-05, + "loss": 2.7058, + "step": 36873 + }, + { + "epoch": 1.7167632749028097, + "grad_norm": 0.3161142160008049, + "learning_rate": 4.611687716154041e-05, + "loss": 2.6546, + "step": 36874 + }, + { + "epoch": 1.7168098330889028, + "grad_norm": 0.3270184389483767, + "learning_rate": 4.611417661185653e-05, + "loss": 2.6236, + "step": 36875 + }, + { + "epoch": 1.716856391274996, + "grad_norm": 0.3464885521388322, + "learning_rate": 4.6111476073577187e-05, + "loss": 2.7691, + "step": 36876 + }, + { + "epoch": 1.716902949461089, + "grad_norm": 0.3364685275519721, + "learning_rate": 4.610877554671028e-05, + "loss": 2.6899, + "step": 36877 + }, + { + "epoch": 1.7169495076471821, + "grad_norm": 0.3367217494694596, + "learning_rate": 4.6106075031263716e-05, + "loss": 2.7314, + "step": 36878 + }, + { + "epoch": 1.7169960658332752, + "grad_norm": 0.37322450419265263, + "learning_rate": 4.6103374527245455e-05, + "loss": 2.6836, + "step": 36879 + }, + { + "epoch": 1.7170426240193684, + "grad_norm": 0.32135940786545975, + "learning_rate": 4.610067403466338e-05, + "loss": 2.7249, + "step": 36880 + }, + { + "epoch": 1.7170891822054613, + "grad_norm": 0.38495112118041985, + "learning_rate": 4.609797355352548e-05, + "loss": 2.7024, + "step": 36881 + }, + { + "epoch": 1.7171357403915544, + "grad_norm": 0.3174394723155404, + "learning_rate": 4.6095273083839624e-05, + "loss": 2.728, + "step": 36882 + }, + { + "epoch": 1.7171822985776473, + "grad_norm": 0.34131205973476675, + "learning_rate": 4.609257262561375e-05, + "loss": 2.7785, + "step": 36883 + }, + { + "epoch": 1.7172288567637404, + "grad_norm": 0.34746571144307575, + "learning_rate": 4.6089872178855795e-05, + 
"loss": 2.7222, + "step": 36884 + }, + { + "epoch": 1.7172754149498335, + "grad_norm": 0.3350582456882977, + "learning_rate": 4.608717174357369e-05, + "loss": 2.7203, + "step": 36885 + }, + { + "epoch": 1.7173219731359266, + "grad_norm": 0.3299937663499579, + "learning_rate": 4.6084471319775344e-05, + "loss": 2.7488, + "step": 36886 + }, + { + "epoch": 1.7173685313220197, + "grad_norm": 0.35367205088680403, + "learning_rate": 4.6081770907468695e-05, + "loss": 2.6613, + "step": 36887 + }, + { + "epoch": 1.7174150895081128, + "grad_norm": 0.3095739047587234, + "learning_rate": 4.607907050666164e-05, + "loss": 2.6532, + "step": 36888 + }, + { + "epoch": 1.717461647694206, + "grad_norm": 0.33608354756745246, + "learning_rate": 4.607637011736216e-05, + "loss": 2.6693, + "step": 36889 + }, + { + "epoch": 1.717508205880299, + "grad_norm": 0.3452676224417913, + "learning_rate": 4.6073669739578105e-05, + "loss": 2.6444, + "step": 36890 + }, + { + "epoch": 1.717554764066392, + "grad_norm": 0.3310915208199341, + "learning_rate": 4.607096937331748e-05, + "loss": 2.797, + "step": 36891 + }, + { + "epoch": 1.717601322252485, + "grad_norm": 0.3393327886398083, + "learning_rate": 4.606826901858817e-05, + "loss": 2.6367, + "step": 36892 + }, + { + "epoch": 1.717647880438578, + "grad_norm": 0.3622272122685903, + "learning_rate": 4.606556867539807e-05, + "loss": 2.7555, + "step": 36893 + }, + { + "epoch": 1.717694438624671, + "grad_norm": 0.3348077765494324, + "learning_rate": 4.606286834375517e-05, + "loss": 2.6407, + "step": 36894 + }, + { + "epoch": 1.7177409968107642, + "grad_norm": 0.33037200863993027, + "learning_rate": 4.606016802366736e-05, + "loss": 2.6797, + "step": 36895 + }, + { + "epoch": 1.7177875549968573, + "grad_norm": 0.3138335027579448, + "learning_rate": 4.605746771514255e-05, + "loss": 2.67, + "step": 36896 + }, + { + "epoch": 1.7178341131829504, + "grad_norm": 0.3501379440126594, + "learning_rate": 4.605476741818871e-05, + "loss": 2.7515, + "step": 36897 + }, + { + "epoch": 1.7178806713690435, + "grad_norm": 0.34218776542290763, + "learning_rate": 4.6052067132813725e-05, + "loss": 2.7365, + "step": 36898 + }, + { + "epoch": 1.7179272295551367, + "grad_norm": 0.32848232636132524, + "learning_rate": 4.6049366859025545e-05, + "loss": 2.7199, + "step": 36899 + }, + { + "epoch": 1.7179737877412296, + "grad_norm": 0.36892254107165806, + "learning_rate": 4.604666659683208e-05, + "loss": 2.75, + "step": 36900 + }, + { + "epoch": 1.7180203459273227, + "grad_norm": 0.3748565642567542, + "learning_rate": 4.604396634624124e-05, + "loss": 2.6696, + "step": 36901 + }, + { + "epoch": 1.7180669041134158, + "grad_norm": 0.35357788694277126, + "learning_rate": 4.604126610726099e-05, + "loss": 2.7573, + "step": 36902 + }, + { + "epoch": 1.7181134622995087, + "grad_norm": 0.35617799484819485, + "learning_rate": 4.603856587989921e-05, + "loss": 2.6794, + "step": 36903 + }, + { + "epoch": 1.7181600204856018, + "grad_norm": 0.34741122715378553, + "learning_rate": 4.6035865664163875e-05, + "loss": 2.6261, + "step": 36904 + }, + { + "epoch": 1.718206578671695, + "grad_norm": 0.3655441790904049, + "learning_rate": 4.6033165460062885e-05, + "loss": 2.69, + "step": 36905 + }, + { + "epoch": 1.718253136857788, + "grad_norm": 0.38100493160105275, + "learning_rate": 4.603046526760413e-05, + "loss": 2.5782, + "step": 36906 + }, + { + "epoch": 1.7182996950438811, + "grad_norm": 0.3280427573402395, + "learning_rate": 4.6027765086795606e-05, + "loss": 2.6995, + "step": 36907 + }, + { + "epoch": 1.7183462532299743, + 
"grad_norm": 0.3438943969335042, + "learning_rate": 4.602506491764518e-05, + "loss": 2.6804, + "step": 36908 + }, + { + "epoch": 1.7183928114160674, + "grad_norm": 0.3570769965891033, + "learning_rate": 4.60223647601608e-05, + "loss": 2.7115, + "step": 36909 + }, + { + "epoch": 1.7184393696021603, + "grad_norm": 0.3332036641430399, + "learning_rate": 4.601966461435039e-05, + "loss": 2.6744, + "step": 36910 + }, + { + "epoch": 1.7184859277882534, + "grad_norm": 0.3798521855213512, + "learning_rate": 4.601696448022187e-05, + "loss": 2.7726, + "step": 36911 + }, + { + "epoch": 1.7185324859743465, + "grad_norm": 0.34393645487865593, + "learning_rate": 4.6014264357783186e-05, + "loss": 2.7806, + "step": 36912 + }, + { + "epoch": 1.7185790441604394, + "grad_norm": 0.350090926379541, + "learning_rate": 4.601156424704223e-05, + "loss": 2.6687, + "step": 36913 + }, + { + "epoch": 1.7186256023465325, + "grad_norm": 0.36731732486517155, + "learning_rate": 4.600886414800692e-05, + "loss": 2.8449, + "step": 36914 + }, + { + "epoch": 1.7186721605326256, + "grad_norm": 0.347873692409011, + "learning_rate": 4.600616406068522e-05, + "loss": 2.7363, + "step": 36915 + }, + { + "epoch": 1.7187187187187187, + "grad_norm": 0.3539343841732139, + "learning_rate": 4.600346398508502e-05, + "loss": 2.7377, + "step": 36916 + }, + { + "epoch": 1.7187652769048118, + "grad_norm": 0.36754781118036456, + "learning_rate": 4.6000763921214284e-05, + "loss": 2.8023, + "step": 36917 + }, + { + "epoch": 1.718811835090905, + "grad_norm": 0.31980357875267573, + "learning_rate": 4.5998063869080905e-05, + "loss": 2.6502, + "step": 36918 + }, + { + "epoch": 1.718858393276998, + "grad_norm": 0.37132434207620774, + "learning_rate": 4.5995363828692796e-05, + "loss": 2.629, + "step": 36919 + }, + { + "epoch": 1.718904951463091, + "grad_norm": 0.37544407785193695, + "learning_rate": 4.599266380005792e-05, + "loss": 2.7687, + "step": 36920 + }, + { + "epoch": 1.718951509649184, + "grad_norm": 0.31575699969860666, + "learning_rate": 4.598996378318417e-05, + "loss": 2.7921, + "step": 36921 + }, + { + "epoch": 1.718998067835277, + "grad_norm": 0.38232140525520103, + "learning_rate": 4.598726377807948e-05, + "loss": 2.7863, + "step": 36922 + }, + { + "epoch": 1.71904462602137, + "grad_norm": 0.355658699690592, + "learning_rate": 4.5984563784751786e-05, + "loss": 2.7703, + "step": 36923 + }, + { + "epoch": 1.7190911842074632, + "grad_norm": 0.3502092887756885, + "learning_rate": 4.5981863803208994e-05, + "loss": 2.6308, + "step": 36924 + }, + { + "epoch": 1.7191377423935563, + "grad_norm": 0.3285439429764731, + "learning_rate": 4.597916383345904e-05, + "loss": 2.6794, + "step": 36925 + }, + { + "epoch": 1.7191843005796494, + "grad_norm": 0.353132662119133, + "learning_rate": 4.597646387550983e-05, + "loss": 2.7672, + "step": 36926 + }, + { + "epoch": 1.7192308587657426, + "grad_norm": 0.36128386401412194, + "learning_rate": 4.597376392936932e-05, + "loss": 2.6263, + "step": 36927 + }, + { + "epoch": 1.7192774169518357, + "grad_norm": 0.36468108781409, + "learning_rate": 4.597106399504542e-05, + "loss": 2.6807, + "step": 36928 + }, + { + "epoch": 1.7193239751379288, + "grad_norm": 0.33824562174529016, + "learning_rate": 4.596836407254603e-05, + "loss": 2.6266, + "step": 36929 + }, + { + "epoch": 1.7193705333240217, + "grad_norm": 0.32814324766503156, + "learning_rate": 4.596566416187911e-05, + "loss": 2.7001, + "step": 36930 + }, + { + "epoch": 1.7194170915101148, + "grad_norm": 0.3362302818179109, + "learning_rate": 4.596296426305257e-05, + 
"loss": 2.6596, + "step": 36931 + }, + { + "epoch": 1.7194636496962077, + "grad_norm": 0.37635245623942737, + "learning_rate": 4.596026437607431e-05, + "loss": 2.7455, + "step": 36932 + }, + { + "epoch": 1.7195102078823008, + "grad_norm": 0.3277665958036429, + "learning_rate": 4.59575645009523e-05, + "loss": 2.6413, + "step": 36933 + }, + { + "epoch": 1.719556766068394, + "grad_norm": 0.3728826968387831, + "learning_rate": 4.5954864637694424e-05, + "loss": 2.7686, + "step": 36934 + }, + { + "epoch": 1.719603324254487, + "grad_norm": 0.3143873608922591, + "learning_rate": 4.5952164786308625e-05, + "loss": 2.6654, + "step": 36935 + }, + { + "epoch": 1.7196498824405801, + "grad_norm": 0.38191345155810374, + "learning_rate": 4.5949464946802836e-05, + "loss": 2.7356, + "step": 36936 + }, + { + "epoch": 1.7196964406266733, + "grad_norm": 0.3263315053384355, + "learning_rate": 4.594676511918495e-05, + "loss": 2.6783, + "step": 36937 + }, + { + "epoch": 1.7197429988127664, + "grad_norm": 0.34964618418764776, + "learning_rate": 4.594406530346293e-05, + "loss": 2.581, + "step": 36938 + }, + { + "epoch": 1.7197895569988593, + "grad_norm": 0.3495460071933593, + "learning_rate": 4.5941365499644655e-05, + "loss": 2.7756, + "step": 36939 + }, + { + "epoch": 1.7198361151849524, + "grad_norm": 0.3440186664856668, + "learning_rate": 4.593866570773809e-05, + "loss": 2.7321, + "step": 36940 + }, + { + "epoch": 1.7198826733710455, + "grad_norm": 0.3333668229303275, + "learning_rate": 4.593596592775115e-05, + "loss": 2.7426, + "step": 36941 + }, + { + "epoch": 1.7199292315571384, + "grad_norm": 0.3479612852985873, + "learning_rate": 4.593326615969172e-05, + "loss": 2.7822, + "step": 36942 + }, + { + "epoch": 1.7199757897432315, + "grad_norm": 0.35062528562881107, + "learning_rate": 4.593056640356777e-05, + "loss": 2.6524, + "step": 36943 + }, + { + "epoch": 1.7200223479293246, + "grad_norm": 0.31601356547014964, + "learning_rate": 4.592786665938719e-05, + "loss": 2.7715, + "step": 36944 + }, + { + "epoch": 1.7200689061154177, + "grad_norm": 0.3451157637213875, + "learning_rate": 4.5925166927157947e-05, + "loss": 2.6588, + "step": 36945 + }, + { + "epoch": 1.7201154643015109, + "grad_norm": 0.3462359388267691, + "learning_rate": 4.592246720688793e-05, + "loss": 2.6238, + "step": 36946 + }, + { + "epoch": 1.720162022487604, + "grad_norm": 0.3616223667290833, + "learning_rate": 4.5919767498585063e-05, + "loss": 2.7097, + "step": 36947 + }, + { + "epoch": 1.720208580673697, + "grad_norm": 0.32662826737727735, + "learning_rate": 4.591706780225728e-05, + "loss": 2.8042, + "step": 36948 + }, + { + "epoch": 1.72025513885979, + "grad_norm": 0.3770280125492903, + "learning_rate": 4.591436811791251e-05, + "loss": 2.7372, + "step": 36949 + }, + { + "epoch": 1.720301697045883, + "grad_norm": 0.3435391812908843, + "learning_rate": 4.591166844555865e-05, + "loss": 2.7223, + "step": 36950 + }, + { + "epoch": 1.7203482552319762, + "grad_norm": 0.3356866723579572, + "learning_rate": 4.590896878520365e-05, + "loss": 2.7481, + "step": 36951 + }, + { + "epoch": 1.720394813418069, + "grad_norm": 0.3572670183005689, + "learning_rate": 4.590626913685541e-05, + "loss": 2.718, + "step": 36952 + }, + { + "epoch": 1.7204413716041622, + "grad_norm": 0.33135591406001846, + "learning_rate": 4.5903569500521886e-05, + "loss": 2.5887, + "step": 36953 + }, + { + "epoch": 1.7204879297902553, + "grad_norm": 0.351600203357403, + "learning_rate": 4.5900869876210984e-05, + "loss": 2.7559, + "step": 36954 + }, + { + "epoch": 1.7205344879763484, + 
"grad_norm": 0.33495010003333314, + "learning_rate": 4.58981702639306e-05, + "loss": 2.6472, + "step": 36955 + }, + { + "epoch": 1.7205810461624416, + "grad_norm": 0.3474100776960715, + "learning_rate": 4.589547066368871e-05, + "loss": 2.7447, + "step": 36956 + }, + { + "epoch": 1.7206276043485347, + "grad_norm": 0.3279936525181351, + "learning_rate": 4.5892771075493176e-05, + "loss": 2.7565, + "step": 36957 + }, + { + "epoch": 1.7206741625346278, + "grad_norm": 0.33492880554863724, + "learning_rate": 4.589007149935198e-05, + "loss": 2.708, + "step": 36958 + }, + { + "epoch": 1.7207207207207207, + "grad_norm": 0.34910367064428593, + "learning_rate": 4.588737193527303e-05, + "loss": 2.7075, + "step": 36959 + }, + { + "epoch": 1.7207672789068138, + "grad_norm": 0.2995532610897422, + "learning_rate": 4.5884672383264216e-05, + "loss": 2.7823, + "step": 36960 + }, + { + "epoch": 1.7208138370929067, + "grad_norm": 0.3374010655658376, + "learning_rate": 4.588197284333349e-05, + "loss": 2.7266, + "step": 36961 + }, + { + "epoch": 1.7208603952789998, + "grad_norm": 0.333215233405695, + "learning_rate": 4.587927331548876e-05, + "loss": 2.6657, + "step": 36962 + }, + { + "epoch": 1.720906953465093, + "grad_norm": 0.3201259698930995, + "learning_rate": 4.587657379973797e-05, + "loss": 2.672, + "step": 36963 + }, + { + "epoch": 1.720953511651186, + "grad_norm": 0.3332166171612414, + "learning_rate": 4.587387429608904e-05, + "loss": 2.6431, + "step": 36964 + }, + { + "epoch": 1.7210000698372792, + "grad_norm": 0.333481386032764, + "learning_rate": 4.5871174804549845e-05, + "loss": 2.6593, + "step": 36965 + }, + { + "epoch": 1.7210466280233723, + "grad_norm": 0.3541105752452632, + "learning_rate": 4.5868475325128374e-05, + "loss": 2.7988, + "step": 36966 + }, + { + "epoch": 1.7210931862094654, + "grad_norm": 0.3333851455886247, + "learning_rate": 4.5865775857832525e-05, + "loss": 2.7911, + "step": 36967 + }, + { + "epoch": 1.7211397443955585, + "grad_norm": 0.3877061981526244, + "learning_rate": 4.5863076402670185e-05, + "loss": 2.7772, + "step": 36968 + }, + { + "epoch": 1.7211863025816514, + "grad_norm": 0.33321832641728416, + "learning_rate": 4.586037695964934e-05, + "loss": 2.7244, + "step": 36969 + }, + { + "epoch": 1.7212328607677445, + "grad_norm": 0.38643536657702654, + "learning_rate": 4.585767752877785e-05, + "loss": 2.6871, + "step": 36970 + }, + { + "epoch": 1.7212794189538374, + "grad_norm": 0.37040986887326005, + "learning_rate": 4.58549781100637e-05, + "loss": 2.7879, + "step": 36971 + }, + { + "epoch": 1.7213259771399305, + "grad_norm": 0.3557294392473763, + "learning_rate": 4.5852278703514776e-05, + "loss": 2.6493, + "step": 36972 + }, + { + "epoch": 1.7213725353260236, + "grad_norm": 0.3627681074268938, + "learning_rate": 4.584957930913899e-05, + "loss": 2.7254, + "step": 36973 + }, + { + "epoch": 1.7214190935121167, + "grad_norm": 0.3612579904428721, + "learning_rate": 4.5846879926944284e-05, + "loss": 2.726, + "step": 36974 + }, + { + "epoch": 1.7214656516982099, + "grad_norm": 0.34777836110891314, + "learning_rate": 4.584418055693858e-05, + "loss": 2.6268, + "step": 36975 + }, + { + "epoch": 1.721512209884303, + "grad_norm": 0.34464392619640244, + "learning_rate": 4.58414811991298e-05, + "loss": 2.6635, + "step": 36976 + }, + { + "epoch": 1.721558768070396, + "grad_norm": 0.3453027996600104, + "learning_rate": 4.583878185352587e-05, + "loss": 2.7262, + "step": 36977 + }, + { + "epoch": 1.7216053262564892, + "grad_norm": 0.37933522451983553, + "learning_rate": 4.583608252013468e-05, + 
"loss": 2.6281, + "step": 36978 + }, + { + "epoch": 1.721651884442582, + "grad_norm": 0.31277953979752693, + "learning_rate": 4.5833383198964205e-05, + "loss": 2.6603, + "step": 36979 + }, + { + "epoch": 1.7216984426286752, + "grad_norm": 0.38792783092952665, + "learning_rate": 4.583068389002231e-05, + "loss": 2.7293, + "step": 36980 + }, + { + "epoch": 1.7217450008147681, + "grad_norm": 0.331916701518022, + "learning_rate": 4.582798459331697e-05, + "loss": 2.754, + "step": 36981 + }, + { + "epoch": 1.7217915590008612, + "grad_norm": 0.34377610387180935, + "learning_rate": 4.5825285308856086e-05, + "loss": 2.627, + "step": 36982 + }, + { + "epoch": 1.7218381171869543, + "grad_norm": 0.3203455428025546, + "learning_rate": 4.582258603664756e-05, + "loss": 2.7434, + "step": 36983 + }, + { + "epoch": 1.7218846753730475, + "grad_norm": 0.36202343897379535, + "learning_rate": 4.581988677669935e-05, + "loss": 2.7897, + "step": 36984 + }, + { + "epoch": 1.7219312335591406, + "grad_norm": 0.37466233456216075, + "learning_rate": 4.5817187529019364e-05, + "loss": 2.807, + "step": 36985 + }, + { + "epoch": 1.7219777917452337, + "grad_norm": 0.3453671087004539, + "learning_rate": 4.58144882936155e-05, + "loss": 2.6839, + "step": 36986 + }, + { + "epoch": 1.7220243499313268, + "grad_norm": 0.3650419971908205, + "learning_rate": 4.581178907049571e-05, + "loss": 2.7023, + "step": 36987 + }, + { + "epoch": 1.7220709081174197, + "grad_norm": 0.33496124762916984, + "learning_rate": 4.5809089859667905e-05, + "loss": 2.7253, + "step": 36988 + }, + { + "epoch": 1.7221174663035128, + "grad_norm": 0.37175346420472205, + "learning_rate": 4.580639066114002e-05, + "loss": 2.8089, + "step": 36989 + }, + { + "epoch": 1.722164024489606, + "grad_norm": 0.35261284919933145, + "learning_rate": 4.580369147491996e-05, + "loss": 2.7006, + "step": 36990 + }, + { + "epoch": 1.7222105826756988, + "grad_norm": 0.34583104878637155, + "learning_rate": 4.580099230101563e-05, + "loss": 2.7353, + "step": 36991 + }, + { + "epoch": 1.722257140861792, + "grad_norm": 0.35583252006166294, + "learning_rate": 4.5798293139435005e-05, + "loss": 2.7673, + "step": 36992 + }, + { + "epoch": 1.722303699047885, + "grad_norm": 0.3215283009394573, + "learning_rate": 4.579559399018595e-05, + "loss": 2.7415, + "step": 36993 + }, + { + "epoch": 1.7223502572339782, + "grad_norm": 0.3582428754958267, + "learning_rate": 4.579289485327643e-05, + "loss": 2.68, + "step": 36994 + }, + { + "epoch": 1.7223968154200713, + "grad_norm": 0.3220584976837258, + "learning_rate": 4.579019572871435e-05, + "loss": 2.6271, + "step": 36995 + }, + { + "epoch": 1.7224433736061644, + "grad_norm": 0.348719661815697, + "learning_rate": 4.5787496616507615e-05, + "loss": 2.6913, + "step": 36996 + }, + { + "epoch": 1.7224899317922575, + "grad_norm": 0.3555417045901668, + "learning_rate": 4.578479751666418e-05, + "loss": 2.6715, + "step": 36997 + }, + { + "epoch": 1.7225364899783504, + "grad_norm": 0.33574570375771023, + "learning_rate": 4.578209842919193e-05, + "loss": 2.7943, + "step": 36998 + }, + { + "epoch": 1.7225830481644435, + "grad_norm": 0.37354003750217607, + "learning_rate": 4.577939935409883e-05, + "loss": 2.7602, + "step": 36999 + }, + { + "epoch": 1.7226296063505366, + "grad_norm": 0.3324545638609644, + "learning_rate": 4.577670029139276e-05, + "loss": 2.7673, + "step": 37000 + }, + { + "epoch": 1.7226761645366295, + "grad_norm": 0.33403567276566243, + "learning_rate": 4.577400124108166e-05, + "loss": 2.6834, + "step": 37001 + }, + { + "epoch": 1.7227227227227226, + 
"grad_norm": 0.3384259993374938, + "learning_rate": 4.577130220317346e-05, + "loss": 2.6795, + "step": 37002 + }, + { + "epoch": 1.7227692809088158, + "grad_norm": 0.35048134883916415, + "learning_rate": 4.576860317767605e-05, + "loss": 2.7116, + "step": 37003 + }, + { + "epoch": 1.7228158390949089, + "grad_norm": 0.3502610207154113, + "learning_rate": 4.576590416459739e-05, + "loss": 2.6352, + "step": 37004 + }, + { + "epoch": 1.722862397281002, + "grad_norm": 0.3270883018756052, + "learning_rate": 4.576320516394539e-05, + "loss": 2.5846, + "step": 37005 + }, + { + "epoch": 1.722908955467095, + "grad_norm": 0.3493402843534652, + "learning_rate": 4.576050617572795e-05, + "loss": 2.7271, + "step": 37006 + }, + { + "epoch": 1.7229555136531882, + "grad_norm": 0.34502135590615846, + "learning_rate": 4.575780719995302e-05, + "loss": 2.6859, + "step": 37007 + }, + { + "epoch": 1.7230020718392811, + "grad_norm": 0.3368597155938459, + "learning_rate": 4.575510823662851e-05, + "loss": 2.7068, + "step": 37008 + }, + { + "epoch": 1.7230486300253742, + "grad_norm": 0.3848608224449942, + "learning_rate": 4.575240928576231e-05, + "loss": 2.6443, + "step": 37009 + }, + { + "epoch": 1.7230951882114671, + "grad_norm": 0.35966177057646365, + "learning_rate": 4.57497103473624e-05, + "loss": 2.822, + "step": 37010 + }, + { + "epoch": 1.7231417463975602, + "grad_norm": 0.32595727659348717, + "learning_rate": 4.574701142143666e-05, + "loss": 2.6467, + "step": 37011 + }, + { + "epoch": 1.7231883045836534, + "grad_norm": 0.34970593386060095, + "learning_rate": 4.574431250799303e-05, + "loss": 2.7495, + "step": 37012 + }, + { + "epoch": 1.7232348627697465, + "grad_norm": 0.3441388488344302, + "learning_rate": 4.574161360703942e-05, + "loss": 2.7139, + "step": 37013 + }, + { + "epoch": 1.7232814209558396, + "grad_norm": 0.32624962042694483, + "learning_rate": 4.573891471858375e-05, + "loss": 2.6672, + "step": 37014 + }, + { + "epoch": 1.7233279791419327, + "grad_norm": 0.3712420549750675, + "learning_rate": 4.573621584263395e-05, + "loss": 2.7207, + "step": 37015 + }, + { + "epoch": 1.7233745373280258, + "grad_norm": 0.3752555645228223, + "learning_rate": 4.573351697919792e-05, + "loss": 2.6874, + "step": 37016 + }, + { + "epoch": 1.723421095514119, + "grad_norm": 0.3267972266647831, + "learning_rate": 4.573081812828362e-05, + "loss": 2.7586, + "step": 37017 + }, + { + "epoch": 1.7234676537002118, + "grad_norm": 0.37284549358306707, + "learning_rate": 4.572811928989895e-05, + "loss": 2.6967, + "step": 37018 + }, + { + "epoch": 1.723514211886305, + "grad_norm": 0.35800266737574254, + "learning_rate": 4.572542046405181e-05, + "loss": 2.6862, + "step": 37019 + }, + { + "epoch": 1.7235607700723978, + "grad_norm": 0.3426480265264949, + "learning_rate": 4.572272165075015e-05, + "loss": 2.6584, + "step": 37020 + }, + { + "epoch": 1.723607328258491, + "grad_norm": 0.3435565352055728, + "learning_rate": 4.572002285000186e-05, + "loss": 2.6967, + "step": 37021 + }, + { + "epoch": 1.723653886444584, + "grad_norm": 0.3828725178133396, + "learning_rate": 4.5717324061814915e-05, + "loss": 2.6051, + "step": 37022 + }, + { + "epoch": 1.7237004446306772, + "grad_norm": 0.3373152755410945, + "learning_rate": 4.571462528619719e-05, + "loss": 2.7111, + "step": 37023 + }, + { + "epoch": 1.7237470028167703, + "grad_norm": 0.347275562335734, + "learning_rate": 4.5711926523156615e-05, + "loss": 2.6187, + "step": 37024 + }, + { + "epoch": 1.7237935610028634, + "grad_norm": 0.3420739740746436, + "learning_rate": 4.5709227772701124e-05, + 
"loss": 2.7829, + "step": 37025 + }, + { + "epoch": 1.7238401191889565, + "grad_norm": 0.3400898094300173, + "learning_rate": 4.5706529034838614e-05, + "loss": 2.7521, + "step": 37026 + }, + { + "epoch": 1.7238866773750494, + "grad_norm": 0.3457628875247061, + "learning_rate": 4.570383030957702e-05, + "loss": 2.7328, + "step": 37027 + }, + { + "epoch": 1.7239332355611425, + "grad_norm": 0.3420226723408199, + "learning_rate": 4.5701131596924264e-05, + "loss": 2.6838, + "step": 37028 + }, + { + "epoch": 1.7239797937472356, + "grad_norm": 0.3295035780856635, + "learning_rate": 4.5698432896888246e-05, + "loss": 2.6002, + "step": 37029 + }, + { + "epoch": 1.7240263519333285, + "grad_norm": 0.33708815805453346, + "learning_rate": 4.569573420947692e-05, + "loss": 2.6486, + "step": 37030 + }, + { + "epoch": 1.7240729101194217, + "grad_norm": 0.34349465246207306, + "learning_rate": 4.56930355346982e-05, + "loss": 2.7085, + "step": 37031 + }, + { + "epoch": 1.7241194683055148, + "grad_norm": 0.3541978657105293, + "learning_rate": 4.5690336872559976e-05, + "loss": 2.6715, + "step": 37032 + }, + { + "epoch": 1.7241660264916079, + "grad_norm": 0.3241239036786194, + "learning_rate": 4.56876382230702e-05, + "loss": 2.7942, + "step": 37033 + }, + { + "epoch": 1.724212584677701, + "grad_norm": 0.3197672250187855, + "learning_rate": 4.568493958623677e-05, + "loss": 2.6258, + "step": 37034 + }, + { + "epoch": 1.7242591428637941, + "grad_norm": 0.32859273586288346, + "learning_rate": 4.568224096206763e-05, + "loss": 2.652, + "step": 37035 + }, + { + "epoch": 1.7243057010498872, + "grad_norm": 0.318294023324593, + "learning_rate": 4.567954235057069e-05, + "loss": 2.705, + "step": 37036 + }, + { + "epoch": 1.7243522592359801, + "grad_norm": 0.31444150401977633, + "learning_rate": 4.5676843751753864e-05, + "loss": 2.7011, + "step": 37037 + }, + { + "epoch": 1.7243988174220732, + "grad_norm": 0.3510857909958992, + "learning_rate": 4.5674145165625075e-05, + "loss": 2.7096, + "step": 37038 + }, + { + "epoch": 1.7244453756081664, + "grad_norm": 0.34178374581361337, + "learning_rate": 4.5671446592192226e-05, + "loss": 2.744, + "step": 37039 + }, + { + "epoch": 1.7244919337942592, + "grad_norm": 0.3339625804598305, + "learning_rate": 4.566874803146328e-05, + "loss": 2.6994, + "step": 37040 + }, + { + "epoch": 1.7245384919803524, + "grad_norm": 0.35158607549891596, + "learning_rate": 4.566604948344613e-05, + "loss": 2.7305, + "step": 37041 + }, + { + "epoch": 1.7245850501664455, + "grad_norm": 0.34497784947213345, + "learning_rate": 4.5663350948148675e-05, + "loss": 2.7005, + "step": 37042 + }, + { + "epoch": 1.7246316083525386, + "grad_norm": 0.36753992011222525, + "learning_rate": 4.566065242557888e-05, + "loss": 2.7623, + "step": 37043 + }, + { + "epoch": 1.7246781665386317, + "grad_norm": 0.3277071916140035, + "learning_rate": 4.5657953915744647e-05, + "loss": 2.6086, + "step": 37044 + }, + { + "epoch": 1.7247247247247248, + "grad_norm": 0.36341309977598035, + "learning_rate": 4.565525541865386e-05, + "loss": 2.6977, + "step": 37045 + }, + { + "epoch": 1.724771282910818, + "grad_norm": 0.3471915071932715, + "learning_rate": 4.56525569343145e-05, + "loss": 2.6571, + "step": 37046 + }, + { + "epoch": 1.7248178410969108, + "grad_norm": 0.3378949402250586, + "learning_rate": 4.564985846273443e-05, + "loss": 2.6526, + "step": 37047 + }, + { + "epoch": 1.724864399283004, + "grad_norm": 0.3444602787171093, + "learning_rate": 4.564716000392162e-05, + "loss": 2.7032, + "step": 37048 + }, + { + "epoch": 1.7249109574690968, + 
"grad_norm": 0.34580812298980834, + "learning_rate": 4.564446155788397e-05, + "loss": 2.7643, + "step": 37049 + }, + { + "epoch": 1.72495751565519, + "grad_norm": 0.33958547664858874, + "learning_rate": 4.5641763124629385e-05, + "loss": 2.6905, + "step": 37050 + }, + { + "epoch": 1.725004073841283, + "grad_norm": 0.3686209808192838, + "learning_rate": 4.56390647041658e-05, + "loss": 2.6381, + "step": 37051 + }, + { + "epoch": 1.7250506320273762, + "grad_norm": 0.36996542125707155, + "learning_rate": 4.5636366296501114e-05, + "loss": 2.7058, + "step": 37052 + }, + { + "epoch": 1.7250971902134693, + "grad_norm": 0.3451105107677692, + "learning_rate": 4.563366790164329e-05, + "loss": 2.7822, + "step": 37053 + }, + { + "epoch": 1.7251437483995624, + "grad_norm": 0.3272076419949148, + "learning_rate": 4.563096951960021e-05, + "loss": 2.6511, + "step": 37054 + }, + { + "epoch": 1.7251903065856555, + "grad_norm": 0.35489501052791816, + "learning_rate": 4.562827115037979e-05, + "loss": 2.7603, + "step": 37055 + }, + { + "epoch": 1.7252368647717486, + "grad_norm": 0.34024568345238604, + "learning_rate": 4.5625572793989985e-05, + "loss": 2.7004, + "step": 37056 + }, + { + "epoch": 1.7252834229578415, + "grad_norm": 0.31518653801255064, + "learning_rate": 4.5622874450438666e-05, + "loss": 2.7158, + "step": 37057 + }, + { + "epoch": 1.7253299811439347, + "grad_norm": 0.3413590821294117, + "learning_rate": 4.56201761197338e-05, + "loss": 2.6989, + "step": 37058 + }, + { + "epoch": 1.7253765393300275, + "grad_norm": 0.3284326298807117, + "learning_rate": 4.56174778018833e-05, + "loss": 2.7309, + "step": 37059 + }, + { + "epoch": 1.7254230975161207, + "grad_norm": 0.32876621019835217, + "learning_rate": 4.561477949689503e-05, + "loss": 2.6212, + "step": 37060 + }, + { + "epoch": 1.7254696557022138, + "grad_norm": 0.34480173928585944, + "learning_rate": 4.561208120477698e-05, + "loss": 2.5836, + "step": 37061 + }, + { + "epoch": 1.725516213888307, + "grad_norm": 0.3233836558850301, + "learning_rate": 4.560938292553704e-05, + "loss": 2.6121, + "step": 37062 + }, + { + "epoch": 1.7255627720744, + "grad_norm": 0.33542759572885783, + "learning_rate": 4.560668465918312e-05, + "loss": 2.7617, + "step": 37063 + }, + { + "epoch": 1.7256093302604931, + "grad_norm": 0.3436049282394028, + "learning_rate": 4.560398640572315e-05, + "loss": 2.6602, + "step": 37064 + }, + { + "epoch": 1.7256558884465862, + "grad_norm": 0.31421092994948874, + "learning_rate": 4.5601288165165035e-05, + "loss": 2.7372, + "step": 37065 + }, + { + "epoch": 1.7257024466326794, + "grad_norm": 0.32925668664383706, + "learning_rate": 4.559858993751673e-05, + "loss": 2.7265, + "step": 37066 + }, + { + "epoch": 1.7257490048187722, + "grad_norm": 0.3397528994696559, + "learning_rate": 4.559589172278612e-05, + "loss": 2.7443, + "step": 37067 + }, + { + "epoch": 1.7257955630048654, + "grad_norm": 0.33377358807913016, + "learning_rate": 4.5593193520981116e-05, + "loss": 2.7871, + "step": 37068 + }, + { + "epoch": 1.7258421211909583, + "grad_norm": 0.32021142196312763, + "learning_rate": 4.5590495332109675e-05, + "loss": 2.7367, + "step": 37069 + }, + { + "epoch": 1.7258886793770514, + "grad_norm": 0.3465808921932067, + "learning_rate": 4.558779715617968e-05, + "loss": 2.7421, + "step": 37070 + }, + { + "epoch": 1.7259352375631445, + "grad_norm": 0.33896717124730535, + "learning_rate": 4.558509899319908e-05, + "loss": 2.6255, + "step": 37071 + }, + { + "epoch": 1.7259817957492376, + "grad_norm": 0.34144746869679854, + "learning_rate": 
4.558240084317578e-05, + "loss": 2.6921, + "step": 37072 + }, + { + "epoch": 1.7260283539353307, + "grad_norm": 0.35684774429826244, + "learning_rate": 4.5579702706117685e-05, + "loss": 2.7008, + "step": 37073 + }, + { + "epoch": 1.7260749121214238, + "grad_norm": 0.3551963429373394, + "learning_rate": 4.557700458203274e-05, + "loss": 2.6765, + "step": 37074 + }, + { + "epoch": 1.726121470307517, + "grad_norm": 0.33137127949420253, + "learning_rate": 4.557430647092884e-05, + "loss": 2.7282, + "step": 37075 + }, + { + "epoch": 1.7261680284936098, + "grad_norm": 0.38123242923284767, + "learning_rate": 4.557160837281393e-05, + "loss": 2.7173, + "step": 37076 + }, + { + "epoch": 1.726214586679703, + "grad_norm": 0.37186239705902524, + "learning_rate": 4.556891028769592e-05, + "loss": 2.7584, + "step": 37077 + }, + { + "epoch": 1.726261144865796, + "grad_norm": 0.35450550166598677, + "learning_rate": 4.556621221558268e-05, + "loss": 2.612, + "step": 37078 + }, + { + "epoch": 1.726307703051889, + "grad_norm": 0.387424081273018, + "learning_rate": 4.5563514156482205e-05, + "loss": 2.6906, + "step": 37079 + }, + { + "epoch": 1.726354261237982, + "grad_norm": 0.3825807996568402, + "learning_rate": 4.556081611040238e-05, + "loss": 2.7917, + "step": 37080 + }, + { + "epoch": 1.7264008194240752, + "grad_norm": 0.3794275211876564, + "learning_rate": 4.55581180773511e-05, + "loss": 2.6707, + "step": 37081 + }, + { + "epoch": 1.7264473776101683, + "grad_norm": 0.37197160489994957, + "learning_rate": 4.5555420057336327e-05, + "loss": 2.7502, + "step": 37082 + }, + { + "epoch": 1.7264939357962614, + "grad_norm": 0.35769470505251244, + "learning_rate": 4.555272205036593e-05, + "loss": 2.7545, + "step": 37083 + }, + { + "epoch": 1.7265404939823545, + "grad_norm": 0.3808582057257372, + "learning_rate": 4.555002405644788e-05, + "loss": 2.7219, + "step": 37084 + }, + { + "epoch": 1.7265870521684477, + "grad_norm": 0.3608830552530549, + "learning_rate": 4.554732607559008e-05, + "loss": 2.6794, + "step": 37085 + }, + { + "epoch": 1.7266336103545405, + "grad_norm": 0.35867181628794065, + "learning_rate": 4.554462810780041e-05, + "loss": 2.6424, + "step": 37086 + }, + { + "epoch": 1.7266801685406337, + "grad_norm": 0.3785802787808954, + "learning_rate": 4.554193015308684e-05, + "loss": 2.6669, + "step": 37087 + }, + { + "epoch": 1.7267267267267268, + "grad_norm": 0.3422086625912418, + "learning_rate": 4.5539232211457256e-05, + "loss": 2.6657, + "step": 37088 + }, + { + "epoch": 1.7267732849128197, + "grad_norm": 0.37568035272024425, + "learning_rate": 4.5536534282919596e-05, + "loss": 2.6463, + "step": 37089 + }, + { + "epoch": 1.7268198430989128, + "grad_norm": 0.3903473833585893, + "learning_rate": 4.553383636748177e-05, + "loss": 2.772, + "step": 37090 + }, + { + "epoch": 1.726866401285006, + "grad_norm": 0.37955887106617564, + "learning_rate": 4.5531138465151666e-05, + "loss": 2.8011, + "step": 37091 + }, + { + "epoch": 1.726912959471099, + "grad_norm": 0.36969452108367357, + "learning_rate": 4.552844057593726e-05, + "loss": 2.6929, + "step": 37092 + }, + { + "epoch": 1.7269595176571921, + "grad_norm": 0.3518520873487083, + "learning_rate": 4.5525742699846416e-05, + "loss": 2.6966, + "step": 37093 + }, + { + "epoch": 1.7270060758432852, + "grad_norm": 0.37738983089922695, + "learning_rate": 4.5523044836887097e-05, + "loss": 2.7478, + "step": 37094 + }, + { + "epoch": 1.7270526340293784, + "grad_norm": 0.3826213398528338, + "learning_rate": 4.55203469870672e-05, + "loss": 2.6536, + "step": 37095 + }, + { + 
"epoch": 1.7270991922154713, + "grad_norm": 0.3589689919811563, + "learning_rate": 4.551764915039462e-05, + "loss": 2.7432, + "step": 37096 + }, + { + "epoch": 1.7271457504015644, + "grad_norm": 0.38278873961003207, + "learning_rate": 4.5514951326877326e-05, + "loss": 2.7241, + "step": 37097 + }, + { + "epoch": 1.7271923085876573, + "grad_norm": 0.37924579266744674, + "learning_rate": 4.5512253516523205e-05, + "loss": 2.7419, + "step": 37098 + }, + { + "epoch": 1.7272388667737504, + "grad_norm": 0.34823824427909494, + "learning_rate": 4.5509555719340154e-05, + "loss": 2.8394, + "step": 37099 + }, + { + "epoch": 1.7272854249598435, + "grad_norm": 0.3728938430835943, + "learning_rate": 4.550685793533614e-05, + "loss": 2.753, + "step": 37100 + }, + { + "epoch": 1.7273319831459366, + "grad_norm": 0.33448723453338974, + "learning_rate": 4.550416016451904e-05, + "loss": 2.6166, + "step": 37101 + }, + { + "epoch": 1.7273785413320297, + "grad_norm": 0.3448074690823034, + "learning_rate": 4.55014624068968e-05, + "loss": 2.7183, + "step": 37102 + }, + { + "epoch": 1.7274250995181228, + "grad_norm": 0.33889764140299494, + "learning_rate": 4.549876466247733e-05, + "loss": 2.6922, + "step": 37103 + }, + { + "epoch": 1.727471657704216, + "grad_norm": 0.34350626191620637, + "learning_rate": 4.549606693126851e-05, + "loss": 2.7449, + "step": 37104 + }, + { + "epoch": 1.727518215890309, + "grad_norm": 0.3704804776096425, + "learning_rate": 4.549336921327832e-05, + "loss": 2.6777, + "step": 37105 + }, + { + "epoch": 1.727564774076402, + "grad_norm": 0.36748919246263545, + "learning_rate": 4.549067150851462e-05, + "loss": 2.7495, + "step": 37106 + }, + { + "epoch": 1.727611332262495, + "grad_norm": 0.35344615990785544, + "learning_rate": 4.548797381698538e-05, + "loss": 2.7644, + "step": 37107 + }, + { + "epoch": 1.727657890448588, + "grad_norm": 0.34441679436902223, + "learning_rate": 4.54852761386985e-05, + "loss": 2.7032, + "step": 37108 + }, + { + "epoch": 1.727704448634681, + "grad_norm": 0.34719021841080877, + "learning_rate": 4.548257847366185e-05, + "loss": 2.7676, + "step": 37109 + }, + { + "epoch": 1.7277510068207742, + "grad_norm": 0.36896086313580784, + "learning_rate": 4.5479880821883424e-05, + "loss": 2.7284, + "step": 37110 + }, + { + "epoch": 1.7277975650068673, + "grad_norm": 0.35003976692127625, + "learning_rate": 4.5477183183371085e-05, + "loss": 2.6986, + "step": 37111 + }, + { + "epoch": 1.7278441231929604, + "grad_norm": 0.3372944454201469, + "learning_rate": 4.547448555813277e-05, + "loss": 2.6991, + "step": 37112 + }, + { + "epoch": 1.7278906813790536, + "grad_norm": 0.41060236095771746, + "learning_rate": 4.5471787946176404e-05, + "loss": 2.7666, + "step": 37113 + }, + { + "epoch": 1.7279372395651467, + "grad_norm": 0.34227701865542154, + "learning_rate": 4.546909034750988e-05, + "loss": 2.6433, + "step": 37114 + }, + { + "epoch": 1.7279837977512396, + "grad_norm": 0.36079966269289204, + "learning_rate": 4.5466392762141144e-05, + "loss": 2.7209, + "step": 37115 + }, + { + "epoch": 1.7280303559373327, + "grad_norm": 0.36963269448698205, + "learning_rate": 4.5463695190078086e-05, + "loss": 2.6885, + "step": 37116 + }, + { + "epoch": 1.7280769141234258, + "grad_norm": 0.38278552549274925, + "learning_rate": 4.546099763132864e-05, + "loss": 2.8578, + "step": 37117 + }, + { + "epoch": 1.7281234723095187, + "grad_norm": 0.35256312195167233, + "learning_rate": 4.5458300085900726e-05, + "loss": 2.7142, + "step": 37118 + }, + { + "epoch": 1.7281700304956118, + "grad_norm": 
0.39802400828336487, + "learning_rate": 4.5455602553802234e-05, + "loss": 2.7869, + "step": 37119 + }, + { + "epoch": 1.728216588681705, + "grad_norm": 0.35727807004566675, + "learning_rate": 4.545290503504112e-05, + "loss": 2.6319, + "step": 37120 + }, + { + "epoch": 1.728263146867798, + "grad_norm": 0.3442205727653462, + "learning_rate": 4.545020752962528e-05, + "loss": 2.6652, + "step": 37121 + }, + { + "epoch": 1.7283097050538911, + "grad_norm": 0.35590477852059665, + "learning_rate": 4.544751003756262e-05, + "loss": 2.7554, + "step": 37122 + }, + { + "epoch": 1.7283562632399843, + "grad_norm": 0.32076899553963323, + "learning_rate": 4.5444812558861085e-05, + "loss": 2.6777, + "step": 37123 + }, + { + "epoch": 1.7284028214260774, + "grad_norm": 0.35088678761801917, + "learning_rate": 4.544211509352856e-05, + "loss": 2.7618, + "step": 37124 + }, + { + "epoch": 1.7284493796121703, + "grad_norm": 0.3135125293567603, + "learning_rate": 4.543941764157299e-05, + "loss": 2.7642, + "step": 37125 + }, + { + "epoch": 1.7284959377982634, + "grad_norm": 0.36760776704246007, + "learning_rate": 4.543672020300228e-05, + "loss": 2.7484, + "step": 37126 + }, + { + "epoch": 1.7285424959843565, + "grad_norm": 0.3696519028311147, + "learning_rate": 4.5434022777824344e-05, + "loss": 2.6133, + "step": 37127 + }, + { + "epoch": 1.7285890541704494, + "grad_norm": 0.3185652297897084, + "learning_rate": 4.5431325366047106e-05, + "loss": 2.6366, + "step": 37128 + }, + { + "epoch": 1.7286356123565425, + "grad_norm": 0.36696793113455334, + "learning_rate": 4.542862796767846e-05, + "loss": 2.7042, + "step": 37129 + }, + { + "epoch": 1.7286821705426356, + "grad_norm": 0.3274347583414082, + "learning_rate": 4.542593058272636e-05, + "loss": 2.7517, + "step": 37130 + }, + { + "epoch": 1.7287287287287287, + "grad_norm": 0.3390388193497083, + "learning_rate": 4.5423233211198707e-05, + "loss": 2.6757, + "step": 37131 + }, + { + "epoch": 1.7287752869148219, + "grad_norm": 0.3403224848615291, + "learning_rate": 4.5420535853103394e-05, + "loss": 2.7148, + "step": 37132 + }, + { + "epoch": 1.728821845100915, + "grad_norm": 0.31969368235142576, + "learning_rate": 4.541783850844837e-05, + "loss": 2.6689, + "step": 37133 + }, + { + "epoch": 1.728868403287008, + "grad_norm": 0.3471866240150556, + "learning_rate": 4.5415141177241546e-05, + "loss": 2.723, + "step": 37134 + }, + { + "epoch": 1.728914961473101, + "grad_norm": 0.33439878940481377, + "learning_rate": 4.54124438594908e-05, + "loss": 2.6263, + "step": 37135 + }, + { + "epoch": 1.728961519659194, + "grad_norm": 0.334438597712412, + "learning_rate": 4.540974655520411e-05, + "loss": 2.7127, + "step": 37136 + }, + { + "epoch": 1.729008077845287, + "grad_norm": 0.3424504796082074, + "learning_rate": 4.540704926438934e-05, + "loss": 2.6866, + "step": 37137 + }, + { + "epoch": 1.72905463603138, + "grad_norm": 0.3296004593346636, + "learning_rate": 4.540435198705445e-05, + "loss": 2.8012, + "step": 37138 + }, + { + "epoch": 1.7291011942174732, + "grad_norm": 0.3375957390804433, + "learning_rate": 4.540165472320732e-05, + "loss": 2.7244, + "step": 37139 + }, + { + "epoch": 1.7291477524035663, + "grad_norm": 0.3139336665295106, + "learning_rate": 4.5398957472855885e-05, + "loss": 2.6909, + "step": 37140 + }, + { + "epoch": 1.7291943105896594, + "grad_norm": 0.3300936615532505, + "learning_rate": 4.539626023600806e-05, + "loss": 2.6226, + "step": 37141 + }, + { + "epoch": 1.7292408687757526, + "grad_norm": 0.35378229405600287, + "learning_rate": 4.539356301267174e-05, + "loss": 
2.7236, + "step": 37142 + }, + { + "epoch": 1.7292874269618457, + "grad_norm": 0.32720717202173955, + "learning_rate": 4.539086580285488e-05, + "loss": 2.6901, + "step": 37143 + }, + { + "epoch": 1.7293339851479388, + "grad_norm": 0.32378757343225806, + "learning_rate": 4.5388168606565375e-05, + "loss": 2.7064, + "step": 37144 + }, + { + "epoch": 1.7293805433340317, + "grad_norm": 0.36699280655253624, + "learning_rate": 4.5385471423811114e-05, + "loss": 2.687, + "step": 37145 + }, + { + "epoch": 1.7294271015201248, + "grad_norm": 0.3461906263718424, + "learning_rate": 4.538277425460007e-05, + "loss": 2.6699, + "step": 37146 + }, + { + "epoch": 1.7294736597062177, + "grad_norm": 0.3169967633333277, + "learning_rate": 4.53800770989401e-05, + "loss": 2.7614, + "step": 37147 + }, + { + "epoch": 1.7295202178923108, + "grad_norm": 0.3836170953594575, + "learning_rate": 4.5377379956839174e-05, + "loss": 2.7096, + "step": 37148 + }, + { + "epoch": 1.729566776078404, + "grad_norm": 0.3214164212891892, + "learning_rate": 4.537468282830518e-05, + "loss": 2.829, + "step": 37149 + }, + { + "epoch": 1.729613334264497, + "grad_norm": 0.32222787652472895, + "learning_rate": 4.537198571334601e-05, + "loss": 2.6683, + "step": 37150 + }, + { + "epoch": 1.7296598924505902, + "grad_norm": 0.37372538669048555, + "learning_rate": 4.536928861196964e-05, + "loss": 2.8247, + "step": 37151 + }, + { + "epoch": 1.7297064506366833, + "grad_norm": 0.34260959665959395, + "learning_rate": 4.536659152418394e-05, + "loss": 2.644, + "step": 37152 + }, + { + "epoch": 1.7297530088227764, + "grad_norm": 0.35816695159044065, + "learning_rate": 4.5363894449996836e-05, + "loss": 2.7257, + "step": 37153 + }, + { + "epoch": 1.7297995670088695, + "grad_norm": 0.3329114374407185, + "learning_rate": 4.536119738941625e-05, + "loss": 2.6661, + "step": 37154 + }, + { + "epoch": 1.7298461251949624, + "grad_norm": 0.3623810428494106, + "learning_rate": 4.535850034245007e-05, + "loss": 2.724, + "step": 37155 + }, + { + "epoch": 1.7298926833810555, + "grad_norm": 0.3468752744878167, + "learning_rate": 4.535580330910626e-05, + "loss": 2.6809, + "step": 37156 + }, + { + "epoch": 1.7299392415671484, + "grad_norm": 0.3716494861898914, + "learning_rate": 4.535310628939271e-05, + "loss": 2.828, + "step": 37157 + }, + { + "epoch": 1.7299857997532415, + "grad_norm": 0.34595065127405833, + "learning_rate": 4.535040928331731e-05, + "loss": 2.5748, + "step": 37158 + }, + { + "epoch": 1.7300323579393346, + "grad_norm": 0.32098811330140775, + "learning_rate": 4.534771229088803e-05, + "loss": 2.6431, + "step": 37159 + }, + { + "epoch": 1.7300789161254277, + "grad_norm": 0.3352989568195151, + "learning_rate": 4.534501531211272e-05, + "loss": 2.6404, + "step": 37160 + }, + { + "epoch": 1.7301254743115209, + "grad_norm": 0.3179992716225628, + "learning_rate": 4.534231834699937e-05, + "loss": 2.7751, + "step": 37161 + }, + { + "epoch": 1.730172032497614, + "grad_norm": 0.33792278044962454, + "learning_rate": 4.533962139555584e-05, + "loss": 2.7664, + "step": 37162 + }, + { + "epoch": 1.730218590683707, + "grad_norm": 0.3344212858687157, + "learning_rate": 4.533692445779005e-05, + "loss": 2.7302, + "step": 37163 + }, + { + "epoch": 1.7302651488698, + "grad_norm": 0.32863322265978817, + "learning_rate": 4.533422753370995e-05, + "loss": 2.8141, + "step": 37164 + }, + { + "epoch": 1.730311707055893, + "grad_norm": 0.3709458022748458, + "learning_rate": 4.533153062332342e-05, + "loss": 2.6548, + "step": 37165 + }, + { + "epoch": 1.7303582652419862, + "grad_norm": 
0.3651427716495355, + "learning_rate": 4.53288337266384e-05, + "loss": 2.7406, + "step": 37166 + }, + { + "epoch": 1.730404823428079, + "grad_norm": 0.3397494494404077, + "learning_rate": 4.532613684366279e-05, + "loss": 2.7656, + "step": 37167 + }, + { + "epoch": 1.7304513816141722, + "grad_norm": 0.3542124429240726, + "learning_rate": 4.5323439974404484e-05, + "loss": 2.7982, + "step": 37168 + }, + { + "epoch": 1.7304979398002653, + "grad_norm": 0.3470443019509269, + "learning_rate": 4.532074311887145e-05, + "loss": 2.7451, + "step": 37169 + }, + { + "epoch": 1.7305444979863585, + "grad_norm": 0.36745951360670553, + "learning_rate": 4.5318046277071566e-05, + "loss": 2.6842, + "step": 37170 + }, + { + "epoch": 1.7305910561724516, + "grad_norm": 0.3498963114205361, + "learning_rate": 4.531534944901274e-05, + "loss": 2.7675, + "step": 37171 + }, + { + "epoch": 1.7306376143585447, + "grad_norm": 0.34778685763277506, + "learning_rate": 4.531265263470292e-05, + "loss": 2.7638, + "step": 37172 + }, + { + "epoch": 1.7306841725446378, + "grad_norm": 0.33722331426065694, + "learning_rate": 4.530995583414998e-05, + "loss": 2.7687, + "step": 37173 + }, + { + "epoch": 1.7307307307307307, + "grad_norm": 0.3806165990203655, + "learning_rate": 4.530725904736188e-05, + "loss": 2.6919, + "step": 37174 + }, + { + "epoch": 1.7307772889168238, + "grad_norm": 0.355612691465757, + "learning_rate": 4.530456227434652e-05, + "loss": 2.6129, + "step": 37175 + }, + { + "epoch": 1.730823847102917, + "grad_norm": 0.35406939290054484, + "learning_rate": 4.5301865515111776e-05, + "loss": 2.6439, + "step": 37176 + }, + { + "epoch": 1.7308704052890098, + "grad_norm": 0.400490699923393, + "learning_rate": 4.5299168769665616e-05, + "loss": 2.7786, + "step": 37177 + }, + { + "epoch": 1.730916963475103, + "grad_norm": 0.34950212904572636, + "learning_rate": 4.529647203801592e-05, + "loss": 2.7798, + "step": 37178 + }, + { + "epoch": 1.730963521661196, + "grad_norm": 0.405789296901993, + "learning_rate": 4.5293775320170636e-05, + "loss": 2.6544, + "step": 37179 + }, + { + "epoch": 1.7310100798472892, + "grad_norm": 0.32867316199346597, + "learning_rate": 4.529107861613765e-05, + "loss": 2.6342, + "step": 37180 + }, + { + "epoch": 1.7310566380333823, + "grad_norm": 0.43357440949996834, + "learning_rate": 4.5288381925924864e-05, + "loss": 2.728, + "step": 37181 + }, + { + "epoch": 1.7311031962194754, + "grad_norm": 0.3395931732159289, + "learning_rate": 4.528568524954024e-05, + "loss": 2.8222, + "step": 37182 + }, + { + "epoch": 1.7311497544055685, + "grad_norm": 0.3793147093032865, + "learning_rate": 4.528298858699164e-05, + "loss": 2.6051, + "step": 37183 + }, + { + "epoch": 1.7311963125916614, + "grad_norm": 0.41226012365787745, + "learning_rate": 4.528029193828702e-05, + "loss": 2.5497, + "step": 37184 + }, + { + "epoch": 1.7312428707777545, + "grad_norm": 0.3601441599398475, + "learning_rate": 4.527759530343428e-05, + "loss": 2.6957, + "step": 37185 + }, + { + "epoch": 1.7312894289638474, + "grad_norm": 0.4387601799994525, + "learning_rate": 4.527489868244131e-05, + "loss": 2.6857, + "step": 37186 + }, + { + "epoch": 1.7313359871499405, + "grad_norm": 0.3557473460081934, + "learning_rate": 4.527220207531608e-05, + "loss": 2.6681, + "step": 37187 + }, + { + "epoch": 1.7313825453360336, + "grad_norm": 0.37840486824730163, + "learning_rate": 4.526950548206646e-05, + "loss": 2.7419, + "step": 37188 + }, + { + "epoch": 1.7314291035221268, + "grad_norm": 0.38155246961849476, + "learning_rate": 4.526680890270036e-05, + "loss": 
2.7643, + "step": 37189 + }, + { + "epoch": 1.7314756617082199, + "grad_norm": 0.38460314439921095, + "learning_rate": 4.526411233722573e-05, + "loss": 2.6459, + "step": 37190 + }, + { + "epoch": 1.731522219894313, + "grad_norm": 0.36402655055231653, + "learning_rate": 4.526141578565045e-05, + "loss": 2.7323, + "step": 37191 + }, + { + "epoch": 1.731568778080406, + "grad_norm": 0.37319644145733755, + "learning_rate": 4.5258719247982454e-05, + "loss": 2.6329, + "step": 37192 + }, + { + "epoch": 1.7316153362664992, + "grad_norm": 0.35973759643015324, + "learning_rate": 4.525602272422966e-05, + "loss": 2.7504, + "step": 37193 + }, + { + "epoch": 1.731661894452592, + "grad_norm": 0.34233981826806215, + "learning_rate": 4.525332621439994e-05, + "loss": 2.7453, + "step": 37194 + }, + { + "epoch": 1.7317084526386852, + "grad_norm": 0.3846310584369131, + "learning_rate": 4.525062971850127e-05, + "loss": 2.7598, + "step": 37195 + }, + { + "epoch": 1.7317550108247781, + "grad_norm": 0.3590970351387127, + "learning_rate": 4.524793323654151e-05, + "loss": 2.7426, + "step": 37196 + }, + { + "epoch": 1.7318015690108712, + "grad_norm": 0.35754142862489685, + "learning_rate": 4.524523676852862e-05, + "loss": 2.7579, + "step": 37197 + }, + { + "epoch": 1.7318481271969643, + "grad_norm": 0.36208549683169616, + "learning_rate": 4.524254031447049e-05, + "loss": 2.7472, + "step": 37198 + }, + { + "epoch": 1.7318946853830575, + "grad_norm": 0.3390161492888389, + "learning_rate": 4.523984387437501e-05, + "loss": 2.7421, + "step": 37199 + }, + { + "epoch": 1.7319412435691506, + "grad_norm": 0.34707038262006773, + "learning_rate": 4.523714744825015e-05, + "loss": 2.6352, + "step": 37200 + }, + { + "epoch": 1.7319878017552437, + "grad_norm": 0.35484878223090166, + "learning_rate": 4.523445103610378e-05, + "loss": 2.6359, + "step": 37201 + }, + { + "epoch": 1.7320343599413368, + "grad_norm": 0.3138606490843042, + "learning_rate": 4.523175463794382e-05, + "loss": 2.652, + "step": 37202 + }, + { + "epoch": 1.7320809181274297, + "grad_norm": 0.33183405535871197, + "learning_rate": 4.522905825377821e-05, + "loss": 2.6368, + "step": 37203 + }, + { + "epoch": 1.7321274763135228, + "grad_norm": 0.3504184489822996, + "learning_rate": 4.522636188361483e-05, + "loss": 2.7373, + "step": 37204 + }, + { + "epoch": 1.732174034499616, + "grad_norm": 0.3603459063524131, + "learning_rate": 4.522366552746161e-05, + "loss": 2.6391, + "step": 37205 + }, + { + "epoch": 1.7322205926857088, + "grad_norm": 0.3554845247731186, + "learning_rate": 4.522096918532647e-05, + "loss": 2.6896, + "step": 37206 + }, + { + "epoch": 1.732267150871802, + "grad_norm": 0.36313775336882104, + "learning_rate": 4.521827285721729e-05, + "loss": 2.6526, + "step": 37207 + }, + { + "epoch": 1.732313709057895, + "grad_norm": 0.35230385191924546, + "learning_rate": 4.521557654314204e-05, + "loss": 2.6888, + "step": 37208 + }, + { + "epoch": 1.7323602672439882, + "grad_norm": 0.3418759878530487, + "learning_rate": 4.521288024310856e-05, + "loss": 2.7163, + "step": 37209 + }, + { + "epoch": 1.7324068254300813, + "grad_norm": 0.3691485472427831, + "learning_rate": 4.521018395712484e-05, + "loss": 2.7318, + "step": 37210 + }, + { + "epoch": 1.7324533836161744, + "grad_norm": 0.34499564153703155, + "learning_rate": 4.520748768519876e-05, + "loss": 2.8156, + "step": 37211 + }, + { + "epoch": 1.7324999418022675, + "grad_norm": 0.3630241406096388, + "learning_rate": 4.52047914273382e-05, + "loss": 2.7153, + "step": 37212 + }, + { + "epoch": 1.7325464999883604, + 
"grad_norm": 0.3793251772427536, + "learning_rate": 4.520209518355114e-05, + "loss": 2.8373, + "step": 37213 + }, + { + "epoch": 1.7325930581744535, + "grad_norm": 0.3361724503250574, + "learning_rate": 4.5199398953845436e-05, + "loss": 2.6278, + "step": 37214 + }, + { + "epoch": 1.7326396163605466, + "grad_norm": 0.35855338671880527, + "learning_rate": 4.519670273822902e-05, + "loss": 2.6371, + "step": 37215 + }, + { + "epoch": 1.7326861745466395, + "grad_norm": 0.34965644472208446, + "learning_rate": 4.519400653670983e-05, + "loss": 2.7822, + "step": 37216 + }, + { + "epoch": 1.7327327327327327, + "grad_norm": 0.33499096259831795, + "learning_rate": 4.519131034929574e-05, + "loss": 2.7355, + "step": 37217 + }, + { + "epoch": 1.7327792909188258, + "grad_norm": 0.3478406586542125, + "learning_rate": 4.518861417599469e-05, + "loss": 2.7603, + "step": 37218 + }, + { + "epoch": 1.7328258491049189, + "grad_norm": 0.3820422823249019, + "learning_rate": 4.5185918016814566e-05, + "loss": 2.7268, + "step": 37219 + }, + { + "epoch": 1.732872407291012, + "grad_norm": 0.34221260139618487, + "learning_rate": 4.518322187176332e-05, + "loss": 2.7376, + "step": 37220 + }, + { + "epoch": 1.732918965477105, + "grad_norm": 0.34658358528739824, + "learning_rate": 4.5180525740848837e-05, + "loss": 2.7603, + "step": 37221 + }, + { + "epoch": 1.7329655236631982, + "grad_norm": 0.338507690993851, + "learning_rate": 4.517782962407902e-05, + "loss": 2.7564, + "step": 37222 + }, + { + "epoch": 1.7330120818492911, + "grad_norm": 0.35089989081886747, + "learning_rate": 4.517513352146182e-05, + "loss": 2.7193, + "step": 37223 + }, + { + "epoch": 1.7330586400353842, + "grad_norm": 0.34064470490919385, + "learning_rate": 4.5172437433005136e-05, + "loss": 2.7292, + "step": 37224 + }, + { + "epoch": 1.7331051982214771, + "grad_norm": 0.3387430120265863, + "learning_rate": 4.5169741358716835e-05, + "loss": 2.6763, + "step": 37225 + }, + { + "epoch": 1.7331517564075702, + "grad_norm": 0.3525580276320419, + "learning_rate": 4.516704529860489e-05, + "loss": 2.6877, + "step": 37226 + }, + { + "epoch": 1.7331983145936634, + "grad_norm": 0.32597037580083776, + "learning_rate": 4.516434925267719e-05, + "loss": 2.7537, + "step": 37227 + }, + { + "epoch": 1.7332448727797565, + "grad_norm": 0.3578028592472573, + "learning_rate": 4.516165322094165e-05, + "loss": 2.754, + "step": 37228 + }, + { + "epoch": 1.7332914309658496, + "grad_norm": 0.31766355306575733, + "learning_rate": 4.515895720340618e-05, + "loss": 2.6183, + "step": 37229 + }, + { + "epoch": 1.7333379891519427, + "grad_norm": 0.3476938062456409, + "learning_rate": 4.515626120007869e-05, + "loss": 2.6776, + "step": 37230 + }, + { + "epoch": 1.7333845473380358, + "grad_norm": 0.3324281778191648, + "learning_rate": 4.5153565210967106e-05, + "loss": 2.7687, + "step": 37231 + }, + { + "epoch": 1.733431105524129, + "grad_norm": 0.3405598813256576, + "learning_rate": 4.515086923607931e-05, + "loss": 2.6844, + "step": 37232 + }, + { + "epoch": 1.7334776637102218, + "grad_norm": 0.3442420565718501, + "learning_rate": 4.5148173275423254e-05, + "loss": 2.6935, + "step": 37233 + }, + { + "epoch": 1.733524221896315, + "grad_norm": 0.3559035741077616, + "learning_rate": 4.5145477329006835e-05, + "loss": 2.7735, + "step": 37234 + }, + { + "epoch": 1.7335707800824078, + "grad_norm": 0.32882342413175264, + "learning_rate": 4.514278139683794e-05, + "loss": 2.6378, + "step": 37235 + }, + { + "epoch": 1.733617338268501, + "grad_norm": 0.35932444851258466, + "learning_rate": 
4.514008547892452e-05, + "loss": 2.8317, + "step": 37236 + }, + { + "epoch": 1.733663896454594, + "grad_norm": 0.33810364025543027, + "learning_rate": 4.513738957527445e-05, + "loss": 2.7266, + "step": 37237 + }, + { + "epoch": 1.7337104546406872, + "grad_norm": 0.3252908952871178, + "learning_rate": 4.513469368589569e-05, + "loss": 2.6477, + "step": 37238 + }, + { + "epoch": 1.7337570128267803, + "grad_norm": 0.37494037989114026, + "learning_rate": 4.5131997810796125e-05, + "loss": 2.698, + "step": 37239 + }, + { + "epoch": 1.7338035710128734, + "grad_norm": 0.35754853720717505, + "learning_rate": 4.512930194998365e-05, + "loss": 2.7015, + "step": 37240 + }, + { + "epoch": 1.7338501291989665, + "grad_norm": 0.33170485505854097, + "learning_rate": 4.51266061034662e-05, + "loss": 2.7815, + "step": 37241 + }, + { + "epoch": 1.7338966873850596, + "grad_norm": 0.4000311543933365, + "learning_rate": 4.512391027125168e-05, + "loss": 2.6959, + "step": 37242 + }, + { + "epoch": 1.7339432455711525, + "grad_norm": 0.3281888744065142, + "learning_rate": 4.5121214453348e-05, + "loss": 2.6846, + "step": 37243 + }, + { + "epoch": 1.7339898037572457, + "grad_norm": 0.4029372956321939, + "learning_rate": 4.511851864976309e-05, + "loss": 2.6432, + "step": 37244 + }, + { + "epoch": 1.7340363619433385, + "grad_norm": 0.34736238302200495, + "learning_rate": 4.511582286050482e-05, + "loss": 2.6725, + "step": 37245 + }, + { + "epoch": 1.7340829201294317, + "grad_norm": 0.3591650659426487, + "learning_rate": 4.511312708558115e-05, + "loss": 2.7681, + "step": 37246 + }, + { + "epoch": 1.7341294783155248, + "grad_norm": 0.35243614692589137, + "learning_rate": 4.511043132499997e-05, + "loss": 2.6908, + "step": 37247 + }, + { + "epoch": 1.734176036501618, + "grad_norm": 0.32834137674677033, + "learning_rate": 4.5107735578769174e-05, + "loss": 2.7621, + "step": 37248 + }, + { + "epoch": 1.734222594687711, + "grad_norm": 0.3565444578561718, + "learning_rate": 4.510503984689671e-05, + "loss": 2.7182, + "step": 37249 + }, + { + "epoch": 1.7342691528738041, + "grad_norm": 0.35923363588467505, + "learning_rate": 4.510234412939045e-05, + "loss": 2.6667, + "step": 37250 + }, + { + "epoch": 1.7343157110598972, + "grad_norm": 0.3339228567256529, + "learning_rate": 4.5099648426258355e-05, + "loss": 2.7177, + "step": 37251 + }, + { + "epoch": 1.7343622692459901, + "grad_norm": 0.3852946140309357, + "learning_rate": 4.50969527375083e-05, + "loss": 2.7123, + "step": 37252 + }, + { + "epoch": 1.7344088274320832, + "grad_norm": 0.37978528643329595, + "learning_rate": 4.5094257063148206e-05, + "loss": 2.8053, + "step": 37253 + }, + { + "epoch": 1.7344553856181764, + "grad_norm": 0.36247239208589194, + "learning_rate": 4.5091561403185976e-05, + "loss": 2.6616, + "step": 37254 + }, + { + "epoch": 1.7345019438042693, + "grad_norm": 0.35079165535322165, + "learning_rate": 4.508886575762953e-05, + "loss": 2.6587, + "step": 37255 + }, + { + "epoch": 1.7345485019903624, + "grad_norm": 0.33082080764525995, + "learning_rate": 4.508617012648679e-05, + "loss": 2.7108, + "step": 37256 + }, + { + "epoch": 1.7345950601764555, + "grad_norm": 0.3257505037629854, + "learning_rate": 4.508347450976567e-05, + "loss": 2.6073, + "step": 37257 + }, + { + "epoch": 1.7346416183625486, + "grad_norm": 0.3635639229802125, + "learning_rate": 4.508077890747403e-05, + "loss": 2.7461, + "step": 37258 + }, + { + "epoch": 1.7346881765486417, + "grad_norm": 0.33794635049109684, + "learning_rate": 4.5078083319619844e-05, + "loss": 2.6973, + "step": 37259 + }, + { + 
"epoch": 1.7347347347347348, + "grad_norm": 0.34890462379423076, + "learning_rate": 4.5075387746211e-05, + "loss": 2.656, + "step": 37260 + }, + { + "epoch": 1.734781292920828, + "grad_norm": 0.3628958395416019, + "learning_rate": 4.507269218725539e-05, + "loss": 2.7138, + "step": 37261 + }, + { + "epoch": 1.7348278511069208, + "grad_norm": 0.3581261266002427, + "learning_rate": 4.506999664276096e-05, + "loss": 2.6663, + "step": 37262 + }, + { + "epoch": 1.734874409293014, + "grad_norm": 0.3549109731008409, + "learning_rate": 4.506730111273558e-05, + "loss": 2.6398, + "step": 37263 + }, + { + "epoch": 1.734920967479107, + "grad_norm": 0.37474203107124193, + "learning_rate": 4.506460559718721e-05, + "loss": 2.7001, + "step": 37264 + }, + { + "epoch": 1.7349675256652, + "grad_norm": 0.34427737384364826, + "learning_rate": 4.506191009612374e-05, + "loss": 2.6641, + "step": 37265 + }, + { + "epoch": 1.735014083851293, + "grad_norm": 0.3682965617320853, + "learning_rate": 4.5059214609553066e-05, + "loss": 2.7035, + "step": 37266 + }, + { + "epoch": 1.7350606420373862, + "grad_norm": 0.34310040893743504, + "learning_rate": 4.5056519137483114e-05, + "loss": 2.7042, + "step": 37267 + }, + { + "epoch": 1.7351072002234793, + "grad_norm": 0.3903424553292118, + "learning_rate": 4.505382367992178e-05, + "loss": 2.7673, + "step": 37268 + }, + { + "epoch": 1.7351537584095724, + "grad_norm": 0.3515806724687109, + "learning_rate": 4.5051128236877e-05, + "loss": 2.6406, + "step": 37269 + }, + { + "epoch": 1.7352003165956655, + "grad_norm": 0.34944062440344503, + "learning_rate": 4.504843280835668e-05, + "loss": 2.7249, + "step": 37270 + }, + { + "epoch": 1.7352468747817587, + "grad_norm": 0.40300492008730593, + "learning_rate": 4.504573739436869e-05, + "loss": 2.7823, + "step": 37271 + }, + { + "epoch": 1.7352934329678515, + "grad_norm": 0.3553690677307265, + "learning_rate": 4.5043041994921e-05, + "loss": 2.6925, + "step": 37272 + }, + { + "epoch": 1.7353399911539447, + "grad_norm": 0.39195267592057975, + "learning_rate": 4.504034661002147e-05, + "loss": 2.6299, + "step": 37273 + }, + { + "epoch": 1.7353865493400376, + "grad_norm": 0.33255842643185735, + "learning_rate": 4.503765123967806e-05, + "loss": 2.6903, + "step": 37274 + }, + { + "epoch": 1.7354331075261307, + "grad_norm": 0.3624942310326949, + "learning_rate": 4.503495588389864e-05, + "loss": 2.6051, + "step": 37275 + }, + { + "epoch": 1.7354796657122238, + "grad_norm": 0.3743589737709107, + "learning_rate": 4.503226054269113e-05, + "loss": 2.7305, + "step": 37276 + }, + { + "epoch": 1.735526223898317, + "grad_norm": 0.35976528734994256, + "learning_rate": 4.5029565216063466e-05, + "loss": 2.6138, + "step": 37277 + }, + { + "epoch": 1.73557278208441, + "grad_norm": 0.3866330470054222, + "learning_rate": 4.502686990402353e-05, + "loss": 2.7288, + "step": 37278 + }, + { + "epoch": 1.7356193402705031, + "grad_norm": 0.3379721806042434, + "learning_rate": 4.502417460657923e-05, + "loss": 2.5558, + "step": 37279 + }, + { + "epoch": 1.7356658984565962, + "grad_norm": 0.35879589602387185, + "learning_rate": 4.50214793237385e-05, + "loss": 2.5662, + "step": 37280 + }, + { + "epoch": 1.7357124566426894, + "grad_norm": 0.3543238974876992, + "learning_rate": 4.501878405550922e-05, + "loss": 2.6134, + "step": 37281 + }, + { + "epoch": 1.7357590148287823, + "grad_norm": 0.34955802469334507, + "learning_rate": 4.5016088801899344e-05, + "loss": 2.8413, + "step": 37282 + }, + { + "epoch": 1.7358055730148754, + "grad_norm": 0.3554738398977, + "learning_rate": 
4.501339356291675e-05, + "loss": 2.7428, + "step": 37283 + }, + { + "epoch": 1.7358521312009683, + "grad_norm": 0.33889822590912394, + "learning_rate": 4.5010698338569335e-05, + "loss": 2.7035, + "step": 37284 + }, + { + "epoch": 1.7358986893870614, + "grad_norm": 0.33574292049317705, + "learning_rate": 4.5008003128865046e-05, + "loss": 2.7348, + "step": 37285 + }, + { + "epoch": 1.7359452475731545, + "grad_norm": 0.34629140972569666, + "learning_rate": 4.500530793381176e-05, + "loss": 2.7103, + "step": 37286 + }, + { + "epoch": 1.7359918057592476, + "grad_norm": 0.3500797622866434, + "learning_rate": 4.5002612753417424e-05, + "loss": 2.6311, + "step": 37287 + }, + { + "epoch": 1.7360383639453407, + "grad_norm": 0.32062760091407805, + "learning_rate": 4.499991758768992e-05, + "loss": 2.7161, + "step": 37288 + }, + { + "epoch": 1.7360849221314338, + "grad_norm": 0.3459003543064939, + "learning_rate": 4.499722243663715e-05, + "loss": 2.7291, + "step": 37289 + }, + { + "epoch": 1.736131480317527, + "grad_norm": 0.334358983646502, + "learning_rate": 4.4994527300267066e-05, + "loss": 2.6781, + "step": 37290 + }, + { + "epoch": 1.7361780385036198, + "grad_norm": 0.3170090538945276, + "learning_rate": 4.499183217858753e-05, + "loss": 2.5608, + "step": 37291 + }, + { + "epoch": 1.736224596689713, + "grad_norm": 0.3226454258841511, + "learning_rate": 4.498913707160649e-05, + "loss": 2.7653, + "step": 37292 + }, + { + "epoch": 1.736271154875806, + "grad_norm": 0.3428312983093529, + "learning_rate": 4.498644197933182e-05, + "loss": 2.7411, + "step": 37293 + }, + { + "epoch": 1.736317713061899, + "grad_norm": 0.3486230578287563, + "learning_rate": 4.498374690177146e-05, + "loss": 2.6953, + "step": 37294 + }, + { + "epoch": 1.736364271247992, + "grad_norm": 0.3274117951907128, + "learning_rate": 4.4981051838933315e-05, + "loss": 2.6497, + "step": 37295 + }, + { + "epoch": 1.7364108294340852, + "grad_norm": 0.3255122974120321, + "learning_rate": 4.497835679082529e-05, + "loss": 2.6418, + "step": 37296 + }, + { + "epoch": 1.7364573876201783, + "grad_norm": 0.345899069839691, + "learning_rate": 4.4975661757455266e-05, + "loss": 2.6866, + "step": 37297 + }, + { + "epoch": 1.7365039458062714, + "grad_norm": 0.33690345280015915, + "learning_rate": 4.497296673883121e-05, + "loss": 2.7407, + "step": 37298 + }, + { + "epoch": 1.7365505039923645, + "grad_norm": 0.3486190585522568, + "learning_rate": 4.4970271734960974e-05, + "loss": 2.7862, + "step": 37299 + }, + { + "epoch": 1.7365970621784577, + "grad_norm": 0.3402221359281184, + "learning_rate": 4.496757674585252e-05, + "loss": 2.874, + "step": 37300 + }, + { + "epoch": 1.7366436203645506, + "grad_norm": 0.35603111444878627, + "learning_rate": 4.496488177151373e-05, + "loss": 2.8089, + "step": 37301 + }, + { + "epoch": 1.7366901785506437, + "grad_norm": 0.3655696131273743, + "learning_rate": 4.496218681195249e-05, + "loss": 2.7322, + "step": 37302 + }, + { + "epoch": 1.7367367367367368, + "grad_norm": 0.4014940608962838, + "learning_rate": 4.4959491867176764e-05, + "loss": 2.8079, + "step": 37303 + }, + { + "epoch": 1.7367832949228297, + "grad_norm": 0.3272947559399964, + "learning_rate": 4.495679693719442e-05, + "loss": 2.7292, + "step": 37304 + }, + { + "epoch": 1.7368298531089228, + "grad_norm": 0.3532655321223976, + "learning_rate": 4.4954102022013386e-05, + "loss": 2.6611, + "step": 37305 + }, + { + "epoch": 1.736876411295016, + "grad_norm": 0.3509662600897463, + "learning_rate": 4.4951407121641565e-05, + "loss": 2.6584, + "step": 37306 + }, + { + 
"epoch": 1.736922969481109, + "grad_norm": 0.339405341309299, + "learning_rate": 4.494871223608686e-05, + "loss": 2.7237, + "step": 37307 + }, + { + "epoch": 1.7369695276672021, + "grad_norm": 0.33910360573254583, + "learning_rate": 4.49460173653572e-05, + "loss": 2.6562, + "step": 37308 + }, + { + "epoch": 1.7370160858532953, + "grad_norm": 0.32653552876719066, + "learning_rate": 4.4943322509460455e-05, + "loss": 2.8101, + "step": 37309 + }, + { + "epoch": 1.7370626440393884, + "grad_norm": 0.3630428782778871, + "learning_rate": 4.494062766840458e-05, + "loss": 2.6602, + "step": 37310 + }, + { + "epoch": 1.7371092022254813, + "grad_norm": 0.32993216384047563, + "learning_rate": 4.493793284219747e-05, + "loss": 2.7402, + "step": 37311 + }, + { + "epoch": 1.7371557604115744, + "grad_norm": 0.35877390138126564, + "learning_rate": 4.493523803084701e-05, + "loss": 2.6558, + "step": 37312 + }, + { + "epoch": 1.7372023185976673, + "grad_norm": 0.36582789363936696, + "learning_rate": 4.493254323436114e-05, + "loss": 2.7433, + "step": 37313 + }, + { + "epoch": 1.7372488767837604, + "grad_norm": 0.33368014096357557, + "learning_rate": 4.492984845274774e-05, + "loss": 2.6535, + "step": 37314 + }, + { + "epoch": 1.7372954349698535, + "grad_norm": 0.36581422654337103, + "learning_rate": 4.492715368601476e-05, + "loss": 2.6878, + "step": 37315 + }, + { + "epoch": 1.7373419931559466, + "grad_norm": 0.37867724232660954, + "learning_rate": 4.4924458934170084e-05, + "loss": 2.6958, + "step": 37316 + }, + { + "epoch": 1.7373885513420397, + "grad_norm": 0.3318620763357377, + "learning_rate": 4.49217641972216e-05, + "loss": 2.713, + "step": 37317 + }, + { + "epoch": 1.7374351095281328, + "grad_norm": 0.3492262182024522, + "learning_rate": 4.491906947517726e-05, + "loss": 2.7501, + "step": 37318 + }, + { + "epoch": 1.737481667714226, + "grad_norm": 0.3425496650365099, + "learning_rate": 4.491637476804494e-05, + "loss": 2.6944, + "step": 37319 + }, + { + "epoch": 1.737528225900319, + "grad_norm": 0.3423065515445958, + "learning_rate": 4.491368007583255e-05, + "loss": 2.762, + "step": 37320 + }, + { + "epoch": 1.737574784086412, + "grad_norm": 0.31791529039238914, + "learning_rate": 4.491098539854802e-05, + "loss": 2.7497, + "step": 37321 + }, + { + "epoch": 1.737621342272505, + "grad_norm": 0.3706799445721565, + "learning_rate": 4.490829073619923e-05, + "loss": 2.6322, + "step": 37322 + }, + { + "epoch": 1.737667900458598, + "grad_norm": 0.35510862480800803, + "learning_rate": 4.4905596088794127e-05, + "loss": 2.651, + "step": 37323 + }, + { + "epoch": 1.737714458644691, + "grad_norm": 0.3409406171029702, + "learning_rate": 4.4902901456340596e-05, + "loss": 2.7169, + "step": 37324 + }, + { + "epoch": 1.7377610168307842, + "grad_norm": 0.3446324879288298, + "learning_rate": 4.490020683884653e-05, + "loss": 2.8099, + "step": 37325 + }, + { + "epoch": 1.7378075750168773, + "grad_norm": 0.35330258868139364, + "learning_rate": 4.489751223631987e-05, + "loss": 2.7359, + "step": 37326 + }, + { + "epoch": 1.7378541332029704, + "grad_norm": 0.352465548776403, + "learning_rate": 4.489481764876848e-05, + "loss": 2.7292, + "step": 37327 + }, + { + "epoch": 1.7379006913890636, + "grad_norm": 0.34206651227464807, + "learning_rate": 4.489212307620033e-05, + "loss": 2.6927, + "step": 37328 + }, + { + "epoch": 1.7379472495751567, + "grad_norm": 0.350550010992214, + "learning_rate": 4.488942851862329e-05, + "loss": 2.6214, + "step": 37329 + }, + { + "epoch": 1.7379938077612498, + "grad_norm": 0.3337743916241986, + 
"learning_rate": 4.488673397604527e-05, + "loss": 2.6632, + "step": 37330 + }, + { + "epoch": 1.7380403659473427, + "grad_norm": 0.34573198457447835, + "learning_rate": 4.488403944847419e-05, + "loss": 2.6692, + "step": 37331 + }, + { + "epoch": 1.7380869241334358, + "grad_norm": 0.3590745891142941, + "learning_rate": 4.488134493591793e-05, + "loss": 2.6365, + "step": 37332 + }, + { + "epoch": 1.7381334823195287, + "grad_norm": 0.35566696688300575, + "learning_rate": 4.4878650438384434e-05, + "loss": 2.7016, + "step": 37333 + }, + { + "epoch": 1.7381800405056218, + "grad_norm": 0.32932531453327546, + "learning_rate": 4.48759559558816e-05, + "loss": 2.6433, + "step": 37334 + }, + { + "epoch": 1.738226598691715, + "grad_norm": 0.3524949825286743, + "learning_rate": 4.4873261488417306e-05, + "loss": 2.6552, + "step": 37335 + }, + { + "epoch": 1.738273156877808, + "grad_norm": 0.3622272738264577, + "learning_rate": 4.48705670359995e-05, + "loss": 2.7126, + "step": 37336 + }, + { + "epoch": 1.7383197150639011, + "grad_norm": 0.3611149264207209, + "learning_rate": 4.4867872598636085e-05, + "loss": 2.7336, + "step": 37337 + }, + { + "epoch": 1.7383662732499943, + "grad_norm": 0.35190649487664033, + "learning_rate": 4.486517817633493e-05, + "loss": 2.7108, + "step": 37338 + }, + { + "epoch": 1.7384128314360874, + "grad_norm": 0.3365543123687851, + "learning_rate": 4.4862483769104005e-05, + "loss": 2.7341, + "step": 37339 + }, + { + "epoch": 1.7384593896221803, + "grad_norm": 0.3411586455401577, + "learning_rate": 4.485978937695115e-05, + "loss": 2.5992, + "step": 37340 + }, + { + "epoch": 1.7385059478082734, + "grad_norm": 0.3598910036455584, + "learning_rate": 4.485709499988433e-05, + "loss": 2.7912, + "step": 37341 + }, + { + "epoch": 1.7385525059943665, + "grad_norm": 0.3278938372174562, + "learning_rate": 4.485440063791143e-05, + "loss": 2.7143, + "step": 37342 + }, + { + "epoch": 1.7385990641804594, + "grad_norm": 0.3686150487725057, + "learning_rate": 4.4851706291040344e-05, + "loss": 2.6781, + "step": 37343 + }, + { + "epoch": 1.7386456223665525, + "grad_norm": 0.3794155077564657, + "learning_rate": 4.4849011959279016e-05, + "loss": 2.8163, + "step": 37344 + }, + { + "epoch": 1.7386921805526456, + "grad_norm": 0.3495281693940602, + "learning_rate": 4.4846317642635296e-05, + "loss": 2.675, + "step": 37345 + }, + { + "epoch": 1.7387387387387387, + "grad_norm": 0.3400482011797637, + "learning_rate": 4.484362334111715e-05, + "loss": 2.6869, + "step": 37346 + }, + { + "epoch": 1.7387852969248319, + "grad_norm": 0.3595474944650664, + "learning_rate": 4.484092905473246e-05, + "loss": 2.7189, + "step": 37347 + }, + { + "epoch": 1.738831855110925, + "grad_norm": 0.3617027122381667, + "learning_rate": 4.483823478348912e-05, + "loss": 2.7405, + "step": 37348 + }, + { + "epoch": 1.738878413297018, + "grad_norm": 0.3212566284938763, + "learning_rate": 4.483554052739507e-05, + "loss": 2.6926, + "step": 37349 + }, + { + "epoch": 1.738924971483111, + "grad_norm": 0.3564853494195832, + "learning_rate": 4.483284628645818e-05, + "loss": 2.7132, + "step": 37350 + }, + { + "epoch": 1.738971529669204, + "grad_norm": 0.3753483467543651, + "learning_rate": 4.48301520606864e-05, + "loss": 2.6186, + "step": 37351 + }, + { + "epoch": 1.7390180878552972, + "grad_norm": 0.3091244010489275, + "learning_rate": 4.4827457850087605e-05, + "loss": 2.684, + "step": 37352 + }, + { + "epoch": 1.73906464604139, + "grad_norm": 0.3479710993458747, + "learning_rate": 4.48247636546697e-05, + "loss": 2.6712, + "step": 37353 + }, + { 
+ "epoch": 1.7391112042274832, + "grad_norm": 0.34978505482336364, + "learning_rate": 4.482206947444062e-05, + "loss": 2.7732, + "step": 37354 + }, + { + "epoch": 1.7391577624135763, + "grad_norm": 0.34328704875907434, + "learning_rate": 4.481937530940826e-05, + "loss": 2.7232, + "step": 37355 + }, + { + "epoch": 1.7392043205996695, + "grad_norm": 0.33832603735776995, + "learning_rate": 4.481668115958051e-05, + "loss": 2.7442, + "step": 37356 + }, + { + "epoch": 1.7392508787857626, + "grad_norm": 0.33041488922185835, + "learning_rate": 4.48139870249653e-05, + "loss": 2.7063, + "step": 37357 + }, + { + "epoch": 1.7392974369718557, + "grad_norm": 0.3487503453995519, + "learning_rate": 4.481129290557051e-05, + "loss": 2.7172, + "step": 37358 + }, + { + "epoch": 1.7393439951579488, + "grad_norm": 0.34428267107297644, + "learning_rate": 4.480859880140408e-05, + "loss": 2.7271, + "step": 37359 + }, + { + "epoch": 1.7393905533440417, + "grad_norm": 0.3681030084212188, + "learning_rate": 4.48059047124739e-05, + "loss": 2.8236, + "step": 37360 + }, + { + "epoch": 1.7394371115301348, + "grad_norm": 0.3588274544584741, + "learning_rate": 4.480321063878786e-05, + "loss": 2.787, + "step": 37361 + }, + { + "epoch": 1.7394836697162277, + "grad_norm": 0.34897587057402496, + "learning_rate": 4.480051658035391e-05, + "loss": 2.614, + "step": 37362 + }, + { + "epoch": 1.7395302279023208, + "grad_norm": 0.35142463173608735, + "learning_rate": 4.4797822537179906e-05, + "loss": 2.6907, + "step": 37363 + }, + { + "epoch": 1.739576786088414, + "grad_norm": 0.3683536623574875, + "learning_rate": 4.479512850927381e-05, + "loss": 2.7141, + "step": 37364 + }, + { + "epoch": 1.739623344274507, + "grad_norm": 0.3543398850771794, + "learning_rate": 4.479243449664348e-05, + "loss": 2.6802, + "step": 37365 + }, + { + "epoch": 1.7396699024606002, + "grad_norm": 0.3374856807869994, + "learning_rate": 4.478974049929684e-05, + "loss": 2.5667, + "step": 37366 + }, + { + "epoch": 1.7397164606466933, + "grad_norm": 0.3779166405137107, + "learning_rate": 4.47870465172418e-05, + "loss": 2.8077, + "step": 37367 + }, + { + "epoch": 1.7397630188327864, + "grad_norm": 0.32532001701695806, + "learning_rate": 4.478435255048627e-05, + "loss": 2.7061, + "step": 37368 + }, + { + "epoch": 1.7398095770188795, + "grad_norm": 0.36677751779857315, + "learning_rate": 4.4781658599038164e-05, + "loss": 2.7639, + "step": 37369 + }, + { + "epoch": 1.7398561352049724, + "grad_norm": 0.3442715631652495, + "learning_rate": 4.4778964662905367e-05, + "loss": 2.7433, + "step": 37370 + }, + { + "epoch": 1.7399026933910655, + "grad_norm": 0.35701765757271553, + "learning_rate": 4.4776270742095775e-05, + "loss": 2.6031, + "step": 37371 + }, + { + "epoch": 1.7399492515771584, + "grad_norm": 0.3346965661245476, + "learning_rate": 4.477357683661734e-05, + "loss": 2.6445, + "step": 37372 + }, + { + "epoch": 1.7399958097632515, + "grad_norm": 0.34660067571920233, + "learning_rate": 4.4770882946477935e-05, + "loss": 2.7581, + "step": 37373 + }, + { + "epoch": 1.7400423679493446, + "grad_norm": 0.3750608639017803, + "learning_rate": 4.476818907168545e-05, + "loss": 2.7822, + "step": 37374 + }, + { + "epoch": 1.7400889261354378, + "grad_norm": 0.3247678515903862, + "learning_rate": 4.4765495212247845e-05, + "loss": 2.6398, + "step": 37375 + }, + { + "epoch": 1.7401354843215309, + "grad_norm": 0.3490491148694031, + "learning_rate": 4.476280136817297e-05, + "loss": 2.7165, + "step": 37376 + }, + { + "epoch": 1.740182042507624, + "grad_norm": 0.33805211823713965, + 
"learning_rate": 4.4760107539468777e-05, + "loss": 2.6848, + "step": 37377 + }, + { + "epoch": 1.740228600693717, + "grad_norm": 0.3632267648519539, + "learning_rate": 4.475741372614315e-05, + "loss": 2.7326, + "step": 37378 + }, + { + "epoch": 1.74027515887981, + "grad_norm": 0.37840539765068487, + "learning_rate": 4.4754719928203985e-05, + "loss": 2.7112, + "step": 37379 + }, + { + "epoch": 1.740321717065903, + "grad_norm": 0.35726210001518766, + "learning_rate": 4.475202614565921e-05, + "loss": 2.5808, + "step": 37380 + }, + { + "epoch": 1.7403682752519962, + "grad_norm": 0.3346535089732025, + "learning_rate": 4.474933237851672e-05, + "loss": 2.7905, + "step": 37381 + }, + { + "epoch": 1.7404148334380891, + "grad_norm": 0.3401651053643821, + "learning_rate": 4.474663862678443e-05, + "loss": 2.6769, + "step": 37382 + }, + { + "epoch": 1.7404613916241822, + "grad_norm": 0.3390448904106737, + "learning_rate": 4.474394489047024e-05, + "loss": 2.7766, + "step": 37383 + }, + { + "epoch": 1.7405079498102753, + "grad_norm": 0.3374988736043829, + "learning_rate": 4.474125116958203e-05, + "loss": 2.6651, + "step": 37384 + }, + { + "epoch": 1.7405545079963685, + "grad_norm": 0.35308141470567445, + "learning_rate": 4.4738557464127754e-05, + "loss": 2.6755, + "step": 37385 + }, + { + "epoch": 1.7406010661824616, + "grad_norm": 0.34912824702821743, + "learning_rate": 4.4735863774115274e-05, + "loss": 2.5588, + "step": 37386 + }, + { + "epoch": 1.7406476243685547, + "grad_norm": 0.3572027537737432, + "learning_rate": 4.4733170099552526e-05, + "loss": 2.6952, + "step": 37387 + }, + { + "epoch": 1.7406941825546478, + "grad_norm": 0.3592632173044963, + "learning_rate": 4.473047644044741e-05, + "loss": 2.7731, + "step": 37388 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.3473005606849219, + "learning_rate": 4.472778279680781e-05, + "loss": 2.6352, + "step": 37389 + }, + { + "epoch": 1.7407872989268338, + "grad_norm": 0.3312117230233297, + "learning_rate": 4.4725089168641664e-05, + "loss": 2.6443, + "step": 37390 + }, + { + "epoch": 1.740833857112927, + "grad_norm": 0.38436076135039815, + "learning_rate": 4.472239555595686e-05, + "loss": 2.759, + "step": 37391 + }, + { + "epoch": 1.7408804152990198, + "grad_norm": 0.34403802238296033, + "learning_rate": 4.4719701958761297e-05, + "loss": 2.6786, + "step": 37392 + }, + { + "epoch": 1.740926973485113, + "grad_norm": 0.3677262645170397, + "learning_rate": 4.47170083770629e-05, + "loss": 2.6682, + "step": 37393 + }, + { + "epoch": 1.740973531671206, + "grad_norm": 0.36657970638682635, + "learning_rate": 4.471431481086955e-05, + "loss": 2.645, + "step": 37394 + }, + { + "epoch": 1.7410200898572992, + "grad_norm": 0.354741608070795, + "learning_rate": 4.471162126018918e-05, + "loss": 2.6403, + "step": 37395 + }, + { + "epoch": 1.7410666480433923, + "grad_norm": 0.3794704742811509, + "learning_rate": 4.470892772502968e-05, + "loss": 2.7368, + "step": 37396 + }, + { + "epoch": 1.7411132062294854, + "grad_norm": 0.3412422696077817, + "learning_rate": 4.4706234205398926e-05, + "loss": 2.7068, + "step": 37397 + }, + { + "epoch": 1.7411597644155785, + "grad_norm": 0.36128467678071596, + "learning_rate": 4.4703540701304885e-05, + "loss": 2.6613, + "step": 37398 + }, + { + "epoch": 1.7412063226016714, + "grad_norm": 0.34302159784098774, + "learning_rate": 4.4700847212755404e-05, + "loss": 2.7046, + "step": 37399 + }, + { + "epoch": 1.7412528807877645, + "grad_norm": 0.3892044856400493, + "learning_rate": 4.4698153739758444e-05, + "loss": 2.7749, + "step": 
37400 + }, + { + "epoch": 1.7412994389738574, + "grad_norm": 0.36812534371135835, + "learning_rate": 4.469546028232187e-05, + "loss": 2.6167, + "step": 37401 + }, + { + "epoch": 1.7413459971599505, + "grad_norm": 0.38001330425141366, + "learning_rate": 4.469276684045358e-05, + "loss": 2.6092, + "step": 37402 + }, + { + "epoch": 1.7413925553460436, + "grad_norm": 0.3819833069663032, + "learning_rate": 4.4690073414161524e-05, + "loss": 2.6384, + "step": 37403 + }, + { + "epoch": 1.7414391135321368, + "grad_norm": 0.35215803621692154, + "learning_rate": 4.468738000345356e-05, + "loss": 2.6409, + "step": 37404 + }, + { + "epoch": 1.7414856717182299, + "grad_norm": 0.3881119690541356, + "learning_rate": 4.468468660833761e-05, + "loss": 2.7643, + "step": 37405 + }, + { + "epoch": 1.741532229904323, + "grad_norm": 0.36971149254800867, + "learning_rate": 4.46819932288216e-05, + "loss": 2.7038, + "step": 37406 + }, + { + "epoch": 1.741578788090416, + "grad_norm": 0.35270001180786864, + "learning_rate": 4.46792998649134e-05, + "loss": 2.6331, + "step": 37407 + }, + { + "epoch": 1.7416253462765092, + "grad_norm": 0.36483142023324233, + "learning_rate": 4.467660651662095e-05, + "loss": 2.7131, + "step": 37408 + }, + { + "epoch": 1.7416719044626021, + "grad_norm": 0.34560677094242165, + "learning_rate": 4.4673913183952125e-05, + "loss": 2.7563, + "step": 37409 + }, + { + "epoch": 1.7417184626486952, + "grad_norm": 0.3425363136425669, + "learning_rate": 4.467121986691482e-05, + "loss": 2.7797, + "step": 37410 + }, + { + "epoch": 1.7417650208347881, + "grad_norm": 0.3486240893849663, + "learning_rate": 4.4668526565516986e-05, + "loss": 2.7201, + "step": 37411 + }, + { + "epoch": 1.7418115790208812, + "grad_norm": 0.35205985979357346, + "learning_rate": 4.466583327976648e-05, + "loss": 2.6714, + "step": 37412 + }, + { + "epoch": 1.7418581372069744, + "grad_norm": 0.363233747443028, + "learning_rate": 4.4663140009671245e-05, + "loss": 2.7085, + "step": 37413 + }, + { + "epoch": 1.7419046953930675, + "grad_norm": 0.32118177271178106, + "learning_rate": 4.466044675523917e-05, + "loss": 2.6493, + "step": 37414 + }, + { + "epoch": 1.7419512535791606, + "grad_norm": 0.37492293470138, + "learning_rate": 4.465775351647813e-05, + "loss": 2.6204, + "step": 37415 + }, + { + "epoch": 1.7419978117652537, + "grad_norm": 0.3362669795765757, + "learning_rate": 4.465506029339609e-05, + "loss": 2.7875, + "step": 37416 + }, + { + "epoch": 1.7420443699513468, + "grad_norm": 0.3835178306700363, + "learning_rate": 4.4652367086000904e-05, + "loss": 2.7436, + "step": 37417 + }, + { + "epoch": 1.74209092813744, + "grad_norm": 0.3403302924250965, + "learning_rate": 4.4649673894300496e-05, + "loss": 2.6234, + "step": 37418 + }, + { + "epoch": 1.7421374863235328, + "grad_norm": 0.3567507677728739, + "learning_rate": 4.4646980718302784e-05, + "loss": 2.7478, + "step": 37419 + }, + { + "epoch": 1.742184044509626, + "grad_norm": 0.3804445960360396, + "learning_rate": 4.464428755801564e-05, + "loss": 2.6411, + "step": 37420 + }, + { + "epoch": 1.7422306026957188, + "grad_norm": 0.34585882730650597, + "learning_rate": 4.464159441344699e-05, + "loss": 2.7438, + "step": 37421 + }, + { + "epoch": 1.742277160881812, + "grad_norm": 0.3421511314926062, + "learning_rate": 4.463890128460472e-05, + "loss": 2.6962, + "step": 37422 + }, + { + "epoch": 1.742323719067905, + "grad_norm": 0.35562595316947226, + "learning_rate": 4.463620817149676e-05, + "loss": 2.7577, + "step": 37423 + }, + { + "epoch": 1.7423702772539982, + "grad_norm": 
0.3561717491711451, + "learning_rate": 4.463351507413101e-05, + "loss": 2.631, + "step": 37424 + }, + { + "epoch": 1.7424168354400913, + "grad_norm": 0.3339089597797313, + "learning_rate": 4.463082199251534e-05, + "loss": 2.7407, + "step": 37425 + }, + { + "epoch": 1.7424633936261844, + "grad_norm": 0.3645238375467966, + "learning_rate": 4.4628128926657694e-05, + "loss": 2.7339, + "step": 37426 + }, + { + "epoch": 1.7425099518122775, + "grad_norm": 0.3328515227602771, + "learning_rate": 4.462543587656596e-05, + "loss": 2.7466, + "step": 37427 + }, + { + "epoch": 1.7425565099983704, + "grad_norm": 0.3298146484092527, + "learning_rate": 4.462274284224803e-05, + "loss": 2.595, + "step": 37428 + }, + { + "epoch": 1.7426030681844635, + "grad_norm": 0.34505324843176266, + "learning_rate": 4.462004982371184e-05, + "loss": 2.681, + "step": 37429 + }, + { + "epoch": 1.7426496263705566, + "grad_norm": 0.3483947312964613, + "learning_rate": 4.4617356820965255e-05, + "loss": 2.6287, + "step": 37430 + }, + { + "epoch": 1.7426961845566495, + "grad_norm": 0.3523761185586454, + "learning_rate": 4.46146638340162e-05, + "loss": 2.7056, + "step": 37431 + }, + { + "epoch": 1.7427427427427427, + "grad_norm": 0.3511827749839883, + "learning_rate": 4.4611970862872596e-05, + "loss": 2.5996, + "step": 37432 + }, + { + "epoch": 1.7427893009288358, + "grad_norm": 0.34484394825628933, + "learning_rate": 4.46092779075423e-05, + "loss": 2.6752, + "step": 37433 + }, + { + "epoch": 1.7428358591149289, + "grad_norm": 0.3421306227982653, + "learning_rate": 4.460658496803327e-05, + "loss": 2.6894, + "step": 37434 + }, + { + "epoch": 1.742882417301022, + "grad_norm": 0.3469835583751741, + "learning_rate": 4.460389204435335e-05, + "loss": 2.6911, + "step": 37435 + }, + { + "epoch": 1.7429289754871151, + "grad_norm": 0.32832898414019945, + "learning_rate": 4.46011991365105e-05, + "loss": 2.6278, + "step": 37436 + }, + { + "epoch": 1.7429755336732082, + "grad_norm": 0.34838205276979817, + "learning_rate": 4.4598506244512594e-05, + "loss": 2.6862, + "step": 37437 + }, + { + "epoch": 1.7430220918593011, + "grad_norm": 0.3576424546990102, + "learning_rate": 4.459581336836752e-05, + "loss": 2.7406, + "step": 37438 + }, + { + "epoch": 1.7430686500453942, + "grad_norm": 0.32526217683595965, + "learning_rate": 4.4593120508083226e-05, + "loss": 2.6687, + "step": 37439 + }, + { + "epoch": 1.7431152082314874, + "grad_norm": 0.34609798385902035, + "learning_rate": 4.4590427663667566e-05, + "loss": 2.6525, + "step": 37440 + }, + { + "epoch": 1.7431617664175802, + "grad_norm": 0.3515398008587483, + "learning_rate": 4.4587734835128495e-05, + "loss": 2.734, + "step": 37441 + }, + { + "epoch": 1.7432083246036734, + "grad_norm": 0.3353244854191576, + "learning_rate": 4.458504202247388e-05, + "loss": 2.6484, + "step": 37442 + }, + { + "epoch": 1.7432548827897665, + "grad_norm": 0.3201618239826137, + "learning_rate": 4.458234922571164e-05, + "loss": 2.6392, + "step": 37443 + }, + { + "epoch": 1.7433014409758596, + "grad_norm": 0.357619578630569, + "learning_rate": 4.457965644484965e-05, + "loss": 2.6844, + "step": 37444 + }, + { + "epoch": 1.7433479991619527, + "grad_norm": 0.3529675317016233, + "learning_rate": 4.457696367989585e-05, + "loss": 2.7637, + "step": 37445 + }, + { + "epoch": 1.7433945573480458, + "grad_norm": 0.35764038037592877, + "learning_rate": 4.457427093085812e-05, + "loss": 2.7059, + "step": 37446 + }, + { + "epoch": 1.743441115534139, + "grad_norm": 0.36895250185821526, + "learning_rate": 4.4571578197744394e-05, + "loss": 
2.7635, + "step": 37447 + }, + { + "epoch": 1.7434876737202318, + "grad_norm": 0.3311372929966283, + "learning_rate": 4.456888548056251e-05, + "loss": 2.6315, + "step": 37448 + }, + { + "epoch": 1.743534231906325, + "grad_norm": 0.3723710971192181, + "learning_rate": 4.456619277932044e-05, + "loss": 2.6338, + "step": 37449 + }, + { + "epoch": 1.7435807900924178, + "grad_norm": 0.35648819691226247, + "learning_rate": 4.456350009402606e-05, + "loss": 2.7664, + "step": 37450 + }, + { + "epoch": 1.743627348278511, + "grad_norm": 0.33179828670414985, + "learning_rate": 4.456080742468725e-05, + "loss": 2.7105, + "step": 37451 + }, + { + "epoch": 1.743673906464604, + "grad_norm": 0.3489602233148863, + "learning_rate": 4.455811477131196e-05, + "loss": 2.6577, + "step": 37452 + }, + { + "epoch": 1.7437204646506972, + "grad_norm": 0.332052597237936, + "learning_rate": 4.4555422133908046e-05, + "loss": 2.7935, + "step": 37453 + }, + { + "epoch": 1.7437670228367903, + "grad_norm": 0.32310981373559833, + "learning_rate": 4.4552729512483445e-05, + "loss": 2.6666, + "step": 37454 + }, + { + "epoch": 1.7438135810228834, + "grad_norm": 0.3090979164712127, + "learning_rate": 4.455003690704605e-05, + "loss": 2.612, + "step": 37455 + }, + { + "epoch": 1.7438601392089765, + "grad_norm": 0.34893371160332937, + "learning_rate": 4.4547344317603745e-05, + "loss": 2.6295, + "step": 37456 + }, + { + "epoch": 1.7439066973950696, + "grad_norm": 0.2931747681498924, + "learning_rate": 4.4544651744164455e-05, + "loss": 2.6007, + "step": 37457 + }, + { + "epoch": 1.7439532555811625, + "grad_norm": 0.35279390063938004, + "learning_rate": 4.454195918673607e-05, + "loss": 2.7881, + "step": 37458 + }, + { + "epoch": 1.7439998137672557, + "grad_norm": 0.35387480662005893, + "learning_rate": 4.453926664532651e-05, + "loss": 2.7545, + "step": 37459 + }, + { + "epoch": 1.7440463719533486, + "grad_norm": 0.34526566114347534, + "learning_rate": 4.4536574119943665e-05, + "loss": 2.7368, + "step": 37460 + }, + { + "epoch": 1.7440929301394417, + "grad_norm": 0.32449491396962954, + "learning_rate": 4.453388161059541e-05, + "loss": 2.5754, + "step": 37461 + }, + { + "epoch": 1.7441394883255348, + "grad_norm": 0.3594007575802098, + "learning_rate": 4.4531189117289704e-05, + "loss": 2.6603, + "step": 37462 + }, + { + "epoch": 1.744186046511628, + "grad_norm": 0.36165368191103403, + "learning_rate": 4.4528496640034414e-05, + "loss": 2.6221, + "step": 37463 + }, + { + "epoch": 1.744232604697721, + "grad_norm": 0.3470910639101669, + "learning_rate": 4.4525804178837425e-05, + "loss": 2.6889, + "step": 37464 + }, + { + "epoch": 1.7442791628838141, + "grad_norm": 0.34092617957012533, + "learning_rate": 4.452311173370668e-05, + "loss": 2.6459, + "step": 37465 + }, + { + "epoch": 1.7443257210699072, + "grad_norm": 0.36408978455671576, + "learning_rate": 4.452041930465004e-05, + "loss": 2.7851, + "step": 37466 + }, + { + "epoch": 1.7443722792560001, + "grad_norm": 0.3467019788892773, + "learning_rate": 4.451772689167546e-05, + "loss": 2.707, + "step": 37467 + }, + { + "epoch": 1.7444188374420933, + "grad_norm": 0.34219732342482373, + "learning_rate": 4.451503449479081e-05, + "loss": 2.7227, + "step": 37468 + }, + { + "epoch": 1.7444653956281864, + "grad_norm": 0.3480782915914767, + "learning_rate": 4.451234211400398e-05, + "loss": 2.6409, + "step": 37469 + }, + { + "epoch": 1.7445119538142793, + "grad_norm": 0.36254749116981183, + "learning_rate": 4.4509649749322885e-05, + "loss": 2.6283, + "step": 37470 + }, + { + "epoch": 1.7445585120003724, + 
"grad_norm": 0.33226216209186693, + "learning_rate": 4.450695740075542e-05, + "loss": 2.6383, + "step": 37471 + }, + { + "epoch": 1.7446050701864655, + "grad_norm": 0.3433875643791677, + "learning_rate": 4.450426506830951e-05, + "loss": 2.6321, + "step": 37472 + }, + { + "epoch": 1.7446516283725586, + "grad_norm": 0.33818797639588505, + "learning_rate": 4.450157275199305e-05, + "loss": 2.7161, + "step": 37473 + }, + { + "epoch": 1.7446981865586517, + "grad_norm": 0.35231052036508675, + "learning_rate": 4.44988804518139e-05, + "loss": 2.7181, + "step": 37474 + }, + { + "epoch": 1.7447447447447448, + "grad_norm": 0.34417694046211766, + "learning_rate": 4.4496188167780025e-05, + "loss": 2.7826, + "step": 37475 + }, + { + "epoch": 1.744791302930838, + "grad_norm": 0.3380395757793008, + "learning_rate": 4.4493495899899265e-05, + "loss": 2.5846, + "step": 37476 + }, + { + "epoch": 1.7448378611169308, + "grad_norm": 0.3483805083331694, + "learning_rate": 4.4490803648179575e-05, + "loss": 2.651, + "step": 37477 + }, + { + "epoch": 1.744884419303024, + "grad_norm": 0.372850990736481, + "learning_rate": 4.4488111412628836e-05, + "loss": 2.6827, + "step": 37478 + }, + { + "epoch": 1.744930977489117, + "grad_norm": 0.3301123179793668, + "learning_rate": 4.448541919325492e-05, + "loss": 2.6562, + "step": 37479 + }, + { + "epoch": 1.74497753567521, + "grad_norm": 0.3636582898527091, + "learning_rate": 4.448272699006579e-05, + "loss": 2.7464, + "step": 37480 + }, + { + "epoch": 1.745024093861303, + "grad_norm": 0.3413537272315443, + "learning_rate": 4.4480034803069295e-05, + "loss": 2.6021, + "step": 37481 + }, + { + "epoch": 1.7450706520473962, + "grad_norm": 0.3352799298802745, + "learning_rate": 4.4477342632273344e-05, + "loss": 2.6582, + "step": 37482 + }, + { + "epoch": 1.7451172102334893, + "grad_norm": 0.33547172662207486, + "learning_rate": 4.447465047768587e-05, + "loss": 2.6806, + "step": 37483 + }, + { + "epoch": 1.7451637684195824, + "grad_norm": 0.33425182907551304, + "learning_rate": 4.447195833931474e-05, + "loss": 2.6099, + "step": 37484 + }, + { + "epoch": 1.7452103266056755, + "grad_norm": 0.33114256224641797, + "learning_rate": 4.4469266217167875e-05, + "loss": 2.7383, + "step": 37485 + }, + { + "epoch": 1.7452568847917687, + "grad_norm": 0.32903678238654194, + "learning_rate": 4.446657411125317e-05, + "loss": 2.7395, + "step": 37486 + }, + { + "epoch": 1.7453034429778616, + "grad_norm": 0.34458205281679005, + "learning_rate": 4.446388202157851e-05, + "loss": 2.6079, + "step": 37487 + }, + { + "epoch": 1.7453500011639547, + "grad_norm": 0.3361140644404297, + "learning_rate": 4.4461189948151826e-05, + "loss": 2.6922, + "step": 37488 + }, + { + "epoch": 1.7453965593500476, + "grad_norm": 0.32370305977360686, + "learning_rate": 4.445849789098098e-05, + "loss": 2.599, + "step": 37489 + }, + { + "epoch": 1.7454431175361407, + "grad_norm": 0.3619440537167449, + "learning_rate": 4.445580585007392e-05, + "loss": 2.725, + "step": 37490 + }, + { + "epoch": 1.7454896757222338, + "grad_norm": 0.34641794469870235, + "learning_rate": 4.445311382543853e-05, + "loss": 2.6848, + "step": 37491 + }, + { + "epoch": 1.745536233908327, + "grad_norm": 0.31716919670203186, + "learning_rate": 4.445042181708268e-05, + "loss": 2.7235, + "step": 37492 + }, + { + "epoch": 1.74558279209442, + "grad_norm": 0.3620676026744676, + "learning_rate": 4.444772982501431e-05, + "loss": 2.6927, + "step": 37493 + }, + { + "epoch": 1.7456293502805131, + "grad_norm": 0.36011478097141597, + "learning_rate": 4.44450378492413e-05, 
+ "loss": 2.7595, + "step": 37494 + }, + { + "epoch": 1.7456759084666063, + "grad_norm": 0.32898963043195845, + "learning_rate": 4.4442345889771556e-05, + "loss": 2.7566, + "step": 37495 + }, + { + "epoch": 1.7457224666526994, + "grad_norm": 0.3225417754739413, + "learning_rate": 4.443965394661299e-05, + "loss": 2.7121, + "step": 37496 + }, + { + "epoch": 1.7457690248387923, + "grad_norm": 0.3266104867270254, + "learning_rate": 4.443696201977349e-05, + "loss": 2.6933, + "step": 37497 + }, + { + "epoch": 1.7458155830248854, + "grad_norm": 0.3430172668519312, + "learning_rate": 4.4434270109260955e-05, + "loss": 2.7454, + "step": 37498 + }, + { + "epoch": 1.7458621412109783, + "grad_norm": 0.34673926091067014, + "learning_rate": 4.44315782150833e-05, + "loss": 2.5758, + "step": 37499 + }, + { + "epoch": 1.7459086993970714, + "grad_norm": 0.3306057565831139, + "learning_rate": 4.442888633724838e-05, + "loss": 2.6199, + "step": 37500 + }, + { + "epoch": 1.7459552575831645, + "grad_norm": 0.33371413321865834, + "learning_rate": 4.4426194475764165e-05, + "loss": 2.7233, + "step": 37501 + }, + { + "epoch": 1.7460018157692576, + "grad_norm": 0.32722122561923855, + "learning_rate": 4.442350263063849e-05, + "loss": 2.7485, + "step": 37502 + }, + { + "epoch": 1.7460483739553507, + "grad_norm": 0.34589614321622664, + "learning_rate": 4.442081080187931e-05, + "loss": 2.7254, + "step": 37503 + }, + { + "epoch": 1.7460949321414438, + "grad_norm": 0.322621465676566, + "learning_rate": 4.44181189894945e-05, + "loss": 2.6575, + "step": 37504 + }, + { + "epoch": 1.746141490327537, + "grad_norm": 0.3450031601614838, + "learning_rate": 4.441542719349194e-05, + "loss": 2.6438, + "step": 37505 + }, + { + "epoch": 1.7461880485136299, + "grad_norm": 0.30938865981885344, + "learning_rate": 4.4412735413879575e-05, + "loss": 2.7, + "step": 37506 + }, + { + "epoch": 1.746234606699723, + "grad_norm": 0.34248776633949096, + "learning_rate": 4.4410043650665266e-05, + "loss": 2.6825, + "step": 37507 + }, + { + "epoch": 1.746281164885816, + "grad_norm": 0.3480401051330556, + "learning_rate": 4.440735190385693e-05, + "loss": 2.7181, + "step": 37508 + }, + { + "epoch": 1.746327723071909, + "grad_norm": 0.35335592769372565, + "learning_rate": 4.440466017346248e-05, + "loss": 2.7483, + "step": 37509 + }, + { + "epoch": 1.746374281258002, + "grad_norm": 0.3597209173763366, + "learning_rate": 4.4401968459489785e-05, + "loss": 2.7256, + "step": 37510 + }, + { + "epoch": 1.7464208394440952, + "grad_norm": 0.33347329654062213, + "learning_rate": 4.4399276761946775e-05, + "loss": 2.7093, + "step": 37511 + }, + { + "epoch": 1.7464673976301883, + "grad_norm": 0.3430951277827052, + "learning_rate": 4.4396585080841306e-05, + "loss": 2.6753, + "step": 37512 + }, + { + "epoch": 1.7465139558162814, + "grad_norm": 0.3522771971922408, + "learning_rate": 4.4393893416181334e-05, + "loss": 2.692, + "step": 37513 + }, + { + "epoch": 1.7465605140023746, + "grad_norm": 0.3616175363414354, + "learning_rate": 4.439120176797474e-05, + "loss": 2.732, + "step": 37514 + }, + { + "epoch": 1.7466070721884677, + "grad_norm": 0.3490475100681515, + "learning_rate": 4.438851013622939e-05, + "loss": 2.714, + "step": 37515 + }, + { + "epoch": 1.7466536303745606, + "grad_norm": 0.35497766787943413, + "learning_rate": 4.438581852095323e-05, + "loss": 2.6641, + "step": 37516 + }, + { + "epoch": 1.7467001885606537, + "grad_norm": 0.34246186792987865, + "learning_rate": 4.438312692215414e-05, + "loss": 2.6926, + "step": 37517 + }, + { + "epoch": 1.7467467467467468, + 
"grad_norm": 0.36821534718928023, + "learning_rate": 4.4380435339839995e-05, + "loss": 2.6571, + "step": 37518 + }, + { + "epoch": 1.7467933049328397, + "grad_norm": 0.3729833574682004, + "learning_rate": 4.4377743774018746e-05, + "loss": 2.7243, + "step": 37519 + }, + { + "epoch": 1.7468398631189328, + "grad_norm": 0.36700924386332917, + "learning_rate": 4.437505222469825e-05, + "loss": 2.6964, + "step": 37520 + }, + { + "epoch": 1.746886421305026, + "grad_norm": 0.3642066443908317, + "learning_rate": 4.4372360691886414e-05, + "loss": 2.7944, + "step": 37521 + }, + { + "epoch": 1.746932979491119, + "grad_norm": 0.36882728235040674, + "learning_rate": 4.4369669175591175e-05, + "loss": 2.6829, + "step": 37522 + }, + { + "epoch": 1.7469795376772121, + "grad_norm": 0.3327120340471214, + "learning_rate": 4.4366977675820376e-05, + "loss": 2.7483, + "step": 37523 + }, + { + "epoch": 1.7470260958633053, + "grad_norm": 0.3857457518154846, + "learning_rate": 4.436428619258196e-05, + "loss": 2.8258, + "step": 37524 + }, + { + "epoch": 1.7470726540493984, + "grad_norm": 0.34075130600430326, + "learning_rate": 4.436159472588378e-05, + "loss": 2.7311, + "step": 37525 + }, + { + "epoch": 1.7471192122354913, + "grad_norm": 0.3333946909644436, + "learning_rate": 4.435890327573379e-05, + "loss": 2.7229, + "step": 37526 + }, + { + "epoch": 1.7471657704215844, + "grad_norm": 0.3377680525224479, + "learning_rate": 4.435621184213987e-05, + "loss": 2.7346, + "step": 37527 + }, + { + "epoch": 1.7472123286076773, + "grad_norm": 0.34521137751929937, + "learning_rate": 4.435352042510989e-05, + "loss": 2.7319, + "step": 37528 + }, + { + "epoch": 1.7472588867937704, + "grad_norm": 0.3578419265531606, + "learning_rate": 4.435082902465178e-05, + "loss": 2.6664, + "step": 37529 + }, + { + "epoch": 1.7473054449798635, + "grad_norm": 0.3392925279071156, + "learning_rate": 4.4348137640773416e-05, + "loss": 2.7815, + "step": 37530 + }, + { + "epoch": 1.7473520031659566, + "grad_norm": 0.36559942767077475, + "learning_rate": 4.434544627348274e-05, + "loss": 2.6808, + "step": 37531 + }, + { + "epoch": 1.7473985613520497, + "grad_norm": 0.37383533092507926, + "learning_rate": 4.434275492278761e-05, + "loss": 2.6127, + "step": 37532 + }, + { + "epoch": 1.7474451195381429, + "grad_norm": 0.37381012505761174, + "learning_rate": 4.434006358869593e-05, + "loss": 2.6786, + "step": 37533 + }, + { + "epoch": 1.747491677724236, + "grad_norm": 0.3561210582797261, + "learning_rate": 4.433737227121561e-05, + "loss": 2.6795, + "step": 37534 + }, + { + "epoch": 1.747538235910329, + "grad_norm": 0.38633865986734783, + "learning_rate": 4.4334680970354554e-05, + "loss": 2.8035, + "step": 37535 + }, + { + "epoch": 1.747584794096422, + "grad_norm": 0.3520506658931967, + "learning_rate": 4.433198968612064e-05, + "loss": 2.5985, + "step": 37536 + }, + { + "epoch": 1.747631352282515, + "grad_norm": 0.3352765113337158, + "learning_rate": 4.432929841852179e-05, + "loss": 2.5957, + "step": 37537 + }, + { + "epoch": 1.747677910468608, + "grad_norm": 0.36260139844999445, + "learning_rate": 4.432660716756587e-05, + "loss": 2.6231, + "step": 37538 + }, + { + "epoch": 1.747724468654701, + "grad_norm": 0.31471332664048973, + "learning_rate": 4.4323915933260805e-05, + "loss": 2.753, + "step": 37539 + }, + { + "epoch": 1.7477710268407942, + "grad_norm": 0.33542709301402046, + "learning_rate": 4.432122471561451e-05, + "loss": 2.6946, + "step": 37540 + }, + { + "epoch": 1.7478175850268873, + "grad_norm": 0.35829847446493646, + "learning_rate": 
4.4318533514634826e-05, + "loss": 2.7707, + "step": 37541 + }, + { + "epoch": 1.7478641432129804, + "grad_norm": 0.3358161428980995, + "learning_rate": 4.431584233032971e-05, + "loss": 2.7187, + "step": 37542 + }, + { + "epoch": 1.7479107013990736, + "grad_norm": 0.3891871615993786, + "learning_rate": 4.431315116270702e-05, + "loss": 2.6259, + "step": 37543 + }, + { + "epoch": 1.7479572595851667, + "grad_norm": 0.32213102520004794, + "learning_rate": 4.4310460011774686e-05, + "loss": 2.6673, + "step": 37544 + }, + { + "epoch": 1.7480038177712598, + "grad_norm": 0.3738990063349916, + "learning_rate": 4.4307768877540596e-05, + "loss": 2.6823, + "step": 37545 + }, + { + "epoch": 1.7480503759573527, + "grad_norm": 0.33390681280607465, + "learning_rate": 4.4305077760012625e-05, + "loss": 2.6569, + "step": 37546 + }, + { + "epoch": 1.7480969341434458, + "grad_norm": 0.32964045665562786, + "learning_rate": 4.4302386659198695e-05, + "loss": 2.6876, + "step": 37547 + }, + { + "epoch": 1.7481434923295387, + "grad_norm": 0.35405028562328783, + "learning_rate": 4.4299695575106706e-05, + "loss": 2.7495, + "step": 37548 + }, + { + "epoch": 1.7481900505156318, + "grad_norm": 0.347570394707593, + "learning_rate": 4.4297004507744546e-05, + "loss": 2.6975, + "step": 37549 + }, + { + "epoch": 1.748236608701725, + "grad_norm": 0.3386365872464802, + "learning_rate": 4.429431345712013e-05, + "loss": 2.6546, + "step": 37550 + }, + { + "epoch": 1.748283166887818, + "grad_norm": 0.3309543774586695, + "learning_rate": 4.429162242324131e-05, + "loss": 2.6687, + "step": 37551 + }, + { + "epoch": 1.7483297250739112, + "grad_norm": 0.34777685174180023, + "learning_rate": 4.4288931406116045e-05, + "loss": 2.8061, + "step": 37552 + }, + { + "epoch": 1.7483762832600043, + "grad_norm": 0.33921003303449154, + "learning_rate": 4.42862404057522e-05, + "loss": 2.6477, + "step": 37553 + }, + { + "epoch": 1.7484228414460974, + "grad_norm": 0.3236520656703832, + "learning_rate": 4.4283549422157654e-05, + "loss": 2.6632, + "step": 37554 + }, + { + "epoch": 1.7484693996321903, + "grad_norm": 0.35802531962882156, + "learning_rate": 4.428085845534035e-05, + "loss": 2.7366, + "step": 37555 + }, + { + "epoch": 1.7485159578182834, + "grad_norm": 0.3300284251507014, + "learning_rate": 4.4278167505308146e-05, + "loss": 2.7095, + "step": 37556 + }, + { + "epoch": 1.7485625160043765, + "grad_norm": 0.3225870906466049, + "learning_rate": 4.427547657206897e-05, + "loss": 2.7198, + "step": 37557 + }, + { + "epoch": 1.7486090741904694, + "grad_norm": 0.33865684534830404, + "learning_rate": 4.427278565563071e-05, + "loss": 2.7104, + "step": 37558 + }, + { + "epoch": 1.7486556323765625, + "grad_norm": 0.3437042003030704, + "learning_rate": 4.4270094756001244e-05, + "loss": 2.7841, + "step": 37559 + }, + { + "epoch": 1.7487021905626556, + "grad_norm": 0.32478834943216495, + "learning_rate": 4.4267403873188495e-05, + "loss": 2.6403, + "step": 37560 + }, + { + "epoch": 1.7487487487487487, + "grad_norm": 0.3272177369590893, + "learning_rate": 4.426471300720034e-05, + "loss": 2.6693, + "step": 37561 + }, + { + "epoch": 1.7487953069348419, + "grad_norm": 0.33137012817102185, + "learning_rate": 4.426202215804471e-05, + "loss": 2.7909, + "step": 37562 + }, + { + "epoch": 1.748841865120935, + "grad_norm": 0.3304578522440437, + "learning_rate": 4.425933132572947e-05, + "loss": 2.7263, + "step": 37563 + }, + { + "epoch": 1.748888423307028, + "grad_norm": 0.34806826290835685, + "learning_rate": 4.425664051026251e-05, + "loss": 2.6653, + "step": 37564 + }, + 
{ + "epoch": 1.748934981493121, + "grad_norm": 0.31816027625951193, + "learning_rate": 4.425394971165177e-05, + "loss": 2.6839, + "step": 37565 + }, + { + "epoch": 1.748981539679214, + "grad_norm": 0.3280314948671601, + "learning_rate": 4.425125892990509e-05, + "loss": 2.6504, + "step": 37566 + }, + { + "epoch": 1.7490280978653072, + "grad_norm": 0.3287163805518034, + "learning_rate": 4.4248568165030425e-05, + "loss": 2.6632, + "step": 37567 + }, + { + "epoch": 1.7490746560514001, + "grad_norm": 0.33084310782801163, + "learning_rate": 4.424587741703565e-05, + "loss": 2.5795, + "step": 37568 + }, + { + "epoch": 1.7491212142374932, + "grad_norm": 0.3462701122668852, + "learning_rate": 4.424318668592863e-05, + "loss": 2.6707, + "step": 37569 + }, + { + "epoch": 1.7491677724235863, + "grad_norm": 0.31970549668153236, + "learning_rate": 4.424049597171732e-05, + "loss": 2.7397, + "step": 37570 + }, + { + "epoch": 1.7492143306096795, + "grad_norm": 0.32396112853828507, + "learning_rate": 4.4237805274409576e-05, + "loss": 2.7196, + "step": 37571 + }, + { + "epoch": 1.7492608887957726, + "grad_norm": 0.337952415179876, + "learning_rate": 4.42351145940133e-05, + "loss": 2.8058, + "step": 37572 + }, + { + "epoch": 1.7493074469818657, + "grad_norm": 0.3278352242186308, + "learning_rate": 4.423242393053639e-05, + "loss": 2.6869, + "step": 37573 + }, + { + "epoch": 1.7493540051679588, + "grad_norm": 0.3270409724808908, + "learning_rate": 4.422973328398676e-05, + "loss": 2.7015, + "step": 37574 + }, + { + "epoch": 1.7494005633540517, + "grad_norm": 0.336255467333197, + "learning_rate": 4.422704265437229e-05, + "loss": 2.7461, + "step": 37575 + }, + { + "epoch": 1.7494471215401448, + "grad_norm": 0.32561611186619777, + "learning_rate": 4.422435204170089e-05, + "loss": 2.7151, + "step": 37576 + }, + { + "epoch": 1.7494936797262377, + "grad_norm": 0.3472005595593498, + "learning_rate": 4.422166144598043e-05, + "loss": 2.7013, + "step": 37577 + }, + { + "epoch": 1.7495402379123308, + "grad_norm": 0.33322248208677135, + "learning_rate": 4.4218970867218836e-05, + "loss": 2.6756, + "step": 37578 + }, + { + "epoch": 1.749586796098424, + "grad_norm": 0.35989347825468476, + "learning_rate": 4.421628030542397e-05, + "loss": 2.7091, + "step": 37579 + }, + { + "epoch": 1.749633354284517, + "grad_norm": 0.36668663242036026, + "learning_rate": 4.421358976060379e-05, + "loss": 2.8429, + "step": 37580 + }, + { + "epoch": 1.7496799124706102, + "grad_norm": 0.319740238609637, + "learning_rate": 4.421089923276614e-05, + "loss": 2.6729, + "step": 37581 + }, + { + "epoch": 1.7497264706567033, + "grad_norm": 0.36806028402116625, + "learning_rate": 4.4208208721918907e-05, + "loss": 2.7287, + "step": 37582 + }, + { + "epoch": 1.7497730288427964, + "grad_norm": 0.33843060030390115, + "learning_rate": 4.420551822807004e-05, + "loss": 2.6438, + "step": 37583 + }, + { + "epoch": 1.7498195870288895, + "grad_norm": 0.3496676932605984, + "learning_rate": 4.420282775122739e-05, + "loss": 2.669, + "step": 37584 + }, + { + "epoch": 1.7498661452149824, + "grad_norm": 0.377099030193018, + "learning_rate": 4.420013729139889e-05, + "loss": 2.6565, + "step": 37585 + }, + { + "epoch": 1.7499127034010755, + "grad_norm": 0.36025918209874147, + "learning_rate": 4.419744684859239e-05, + "loss": 2.6961, + "step": 37586 + }, + { + "epoch": 1.7499592615871684, + "grad_norm": 0.35084509765034166, + "learning_rate": 4.419475642281583e-05, + "loss": 2.6567, + "step": 37587 + }, + { + "epoch": 1.7500058197732615, + "grad_norm": 0.3347826422176578, + 
"learning_rate": 4.4192066014077085e-05, + "loss": 2.6329, + "step": 37588 + }, + { + "epoch": 1.7500523779593546, + "grad_norm": 0.3613479108096374, + "learning_rate": 4.4189375622384054e-05, + "loss": 2.6849, + "step": 37589 + }, + { + "epoch": 1.7500989361454478, + "grad_norm": 0.3553330536142525, + "learning_rate": 4.418668524774462e-05, + "loss": 2.6742, + "step": 37590 + }, + { + "epoch": 1.7501454943315409, + "grad_norm": 0.36083731062413693, + "learning_rate": 4.418399489016671e-05, + "loss": 2.7791, + "step": 37591 + }, + { + "epoch": 1.750192052517634, + "grad_norm": 0.40206822272513093, + "learning_rate": 4.418130454965817e-05, + "loss": 2.6376, + "step": 37592 + }, + { + "epoch": 1.750238610703727, + "grad_norm": 0.34742103904054794, + "learning_rate": 4.417861422622696e-05, + "loss": 2.7042, + "step": 37593 + }, + { + "epoch": 1.75028516888982, + "grad_norm": 0.38844854346689106, + "learning_rate": 4.4175923919880947e-05, + "loss": 2.7674, + "step": 37594 + }, + { + "epoch": 1.7503317270759131, + "grad_norm": 0.3863297889487294, + "learning_rate": 4.4173233630628e-05, + "loss": 2.707, + "step": 37595 + }, + { + "epoch": 1.7503782852620062, + "grad_norm": 0.3431446447421542, + "learning_rate": 4.417054335847605e-05, + "loss": 2.7481, + "step": 37596 + }, + { + "epoch": 1.7504248434480991, + "grad_norm": 0.40104595063174053, + "learning_rate": 4.416785310343298e-05, + "loss": 2.745, + "step": 37597 + }, + { + "epoch": 1.7504714016341922, + "grad_norm": 0.3845799137336287, + "learning_rate": 4.416516286550669e-05, + "loss": 2.6297, + "step": 37598 + }, + { + "epoch": 1.7505179598202854, + "grad_norm": 0.3743088370695412, + "learning_rate": 4.416247264470507e-05, + "loss": 2.7477, + "step": 37599 + }, + { + "epoch": 1.7505645180063785, + "grad_norm": 0.34477260574682056, + "learning_rate": 4.415978244103601e-05, + "loss": 2.6369, + "step": 37600 + }, + { + "epoch": 1.7506110761924716, + "grad_norm": 0.37731127673867254, + "learning_rate": 4.415709225450743e-05, + "loss": 2.6504, + "step": 37601 + }, + { + "epoch": 1.7506576343785647, + "grad_norm": 0.36734833363571395, + "learning_rate": 4.415440208512719e-05, + "loss": 2.7246, + "step": 37602 + }, + { + "epoch": 1.7507041925646578, + "grad_norm": 0.3253742945139576, + "learning_rate": 4.415171193290322e-05, + "loss": 2.6524, + "step": 37603 + }, + { + "epoch": 1.7507507507507507, + "grad_norm": 0.3779742275888543, + "learning_rate": 4.41490217978434e-05, + "loss": 2.6593, + "step": 37604 + }, + { + "epoch": 1.7507973089368438, + "grad_norm": 0.33417345591833747, + "learning_rate": 4.4146331679955596e-05, + "loss": 2.6977, + "step": 37605 + }, + { + "epoch": 1.750843867122937, + "grad_norm": 0.37464307918957396, + "learning_rate": 4.4143641579247764e-05, + "loss": 2.7747, + "step": 37606 + }, + { + "epoch": 1.7508904253090298, + "grad_norm": 0.37914563590536204, + "learning_rate": 4.414095149572776e-05, + "loss": 2.7313, + "step": 37607 + }, + { + "epoch": 1.750936983495123, + "grad_norm": 0.3495812928872721, + "learning_rate": 4.413826142940347e-05, + "loss": 2.6386, + "step": 37608 + }, + { + "epoch": 1.750983541681216, + "grad_norm": 0.36758559649830874, + "learning_rate": 4.4135571380282827e-05, + "loss": 2.6537, + "step": 37609 + }, + { + "epoch": 1.7510300998673092, + "grad_norm": 0.3631560754130112, + "learning_rate": 4.413288134837369e-05, + "loss": 2.6712, + "step": 37610 + }, + { + "epoch": 1.7510766580534023, + "grad_norm": 0.33824065169188017, + "learning_rate": 4.4130191333683976e-05, + "loss": 2.7245, + "step": 37611 
+ }, + { + "epoch": 1.7511232162394954, + "grad_norm": 0.3629676077413458, + "learning_rate": 4.4127501336221566e-05, + "loss": 2.6631, + "step": 37612 + }, + { + "epoch": 1.7511697744255885, + "grad_norm": 0.32533148625819513, + "learning_rate": 4.4124811355994366e-05, + "loss": 2.7972, + "step": 37613 + }, + { + "epoch": 1.7512163326116814, + "grad_norm": 0.34376960186703226, + "learning_rate": 4.412212139301027e-05, + "loss": 2.6553, + "step": 37614 + }, + { + "epoch": 1.7512628907977745, + "grad_norm": 0.34488074158298354, + "learning_rate": 4.411943144727715e-05, + "loss": 2.8005, + "step": 37615 + }, + { + "epoch": 1.7513094489838674, + "grad_norm": 0.32247534762599794, + "learning_rate": 4.411674151880293e-05, + "loss": 2.7067, + "step": 37616 + }, + { + "epoch": 1.7513560071699605, + "grad_norm": 0.3691731994314979, + "learning_rate": 4.411405160759551e-05, + "loss": 2.5996, + "step": 37617 + }, + { + "epoch": 1.7514025653560537, + "grad_norm": 0.3438127209546885, + "learning_rate": 4.4111361713662744e-05, + "loss": 2.6711, + "step": 37618 + }, + { + "epoch": 1.7514491235421468, + "grad_norm": 0.35524193177480823, + "learning_rate": 4.410867183701257e-05, + "loss": 2.6898, + "step": 37619 + }, + { + "epoch": 1.7514956817282399, + "grad_norm": 0.3518288912801912, + "learning_rate": 4.410598197765283e-05, + "loss": 2.7401, + "step": 37620 + }, + { + "epoch": 1.751542239914333, + "grad_norm": 0.38791755940424477, + "learning_rate": 4.410329213559149e-05, + "loss": 2.719, + "step": 37621 + }, + { + "epoch": 1.7515887981004261, + "grad_norm": 0.33673678740150925, + "learning_rate": 4.41006023108364e-05, + "loss": 2.6737, + "step": 37622 + }, + { + "epoch": 1.7516353562865192, + "grad_norm": 0.3615670247764178, + "learning_rate": 4.409791250339546e-05, + "loss": 2.7288, + "step": 37623 + }, + { + "epoch": 1.7516819144726121, + "grad_norm": 0.36127790742000643, + "learning_rate": 4.409522271327656e-05, + "loss": 2.6672, + "step": 37624 + }, + { + "epoch": 1.7517284726587052, + "grad_norm": 0.33503854775812003, + "learning_rate": 4.409253294048758e-05, + "loss": 2.7145, + "step": 37625 + }, + { + "epoch": 1.7517750308447981, + "grad_norm": 0.3897959523825527, + "learning_rate": 4.408984318503647e-05, + "loss": 2.6782, + "step": 37626 + }, + { + "epoch": 1.7518215890308912, + "grad_norm": 0.3404118755370986, + "learning_rate": 4.408715344693108e-05, + "loss": 2.7104, + "step": 37627 + }, + { + "epoch": 1.7518681472169844, + "grad_norm": 0.3617164200540338, + "learning_rate": 4.408446372617929e-05, + "loss": 2.7341, + "step": 37628 + }, + { + "epoch": 1.7519147054030775, + "grad_norm": 0.3514337874405459, + "learning_rate": 4.408177402278903e-05, + "loss": 2.7609, + "step": 37629 + }, + { + "epoch": 1.7519612635891706, + "grad_norm": 0.3700706388616589, + "learning_rate": 4.4079084336768196e-05, + "loss": 2.7196, + "step": 37630 + }, + { + "epoch": 1.7520078217752637, + "grad_norm": 0.3826141539900825, + "learning_rate": 4.4076394668124634e-05, + "loss": 2.5667, + "step": 37631 + }, + { + "epoch": 1.7520543799613568, + "grad_norm": 0.34053297129332727, + "learning_rate": 4.4073705016866294e-05, + "loss": 2.7487, + "step": 37632 + }, + { + "epoch": 1.75210093814745, + "grad_norm": 0.35989481255308736, + "learning_rate": 4.4071015383001024e-05, + "loss": 2.695, + "step": 37633 + }, + { + "epoch": 1.7521474963335428, + "grad_norm": 0.342714980701706, + "learning_rate": 4.4068325766536765e-05, + "loss": 2.6157, + "step": 37634 + }, + { + "epoch": 1.752194054519636, + "grad_norm": 
0.35103255440516673, + "learning_rate": 4.4065636167481384e-05, + "loss": 2.7321, + "step": 37635 + }, + { + "epoch": 1.7522406127057288, + "grad_norm": 0.3639367165698308, + "learning_rate": 4.406294658584276e-05, + "loss": 2.6225, + "step": 37636 + }, + { + "epoch": 1.752287170891822, + "grad_norm": 0.3229738021143282, + "learning_rate": 4.406025702162882e-05, + "loss": 2.6615, + "step": 37637 + }, + { + "epoch": 1.752333729077915, + "grad_norm": 0.34593897186991796, + "learning_rate": 4.4057567474847415e-05, + "loss": 2.6648, + "step": 37638 + }, + { + "epoch": 1.7523802872640082, + "grad_norm": 0.3396913017584557, + "learning_rate": 4.405487794550649e-05, + "loss": 2.6857, + "step": 37639 + }, + { + "epoch": 1.7524268454501013, + "grad_norm": 0.33980586080788, + "learning_rate": 4.4052188433613905e-05, + "loss": 2.6965, + "step": 37640 + }, + { + "epoch": 1.7524734036361944, + "grad_norm": 0.35858074658969075, + "learning_rate": 4.404949893917755e-05, + "loss": 2.7212, + "step": 37641 + }, + { + "epoch": 1.7525199618222875, + "grad_norm": 0.3287129811040223, + "learning_rate": 4.404680946220534e-05, + "loss": 2.7167, + "step": 37642 + }, + { + "epoch": 1.7525665200083804, + "grad_norm": 0.3169033007828666, + "learning_rate": 4.404412000270514e-05, + "loss": 2.6934, + "step": 37643 + }, + { + "epoch": 1.7526130781944735, + "grad_norm": 0.3507779280152818, + "learning_rate": 4.4041430560684884e-05, + "loss": 2.843, + "step": 37644 + }, + { + "epoch": 1.7526596363805667, + "grad_norm": 0.3511618226304693, + "learning_rate": 4.403874113615244e-05, + "loss": 2.7198, + "step": 37645 + }, + { + "epoch": 1.7527061945666595, + "grad_norm": 0.3270998855187455, + "learning_rate": 4.403605172911569e-05, + "loss": 2.6909, + "step": 37646 + }, + { + "epoch": 1.7527527527527527, + "grad_norm": 0.32499761753008294, + "learning_rate": 4.4033362339582556e-05, + "loss": 2.6753, + "step": 37647 + }, + { + "epoch": 1.7527993109388458, + "grad_norm": 0.33072055993463034, + "learning_rate": 4.403067296756091e-05, + "loss": 2.6764, + "step": 37648 + }, + { + "epoch": 1.752845869124939, + "grad_norm": 0.33985961227406947, + "learning_rate": 4.4027983613058655e-05, + "loss": 2.7428, + "step": 37649 + }, + { + "epoch": 1.752892427311032, + "grad_norm": 0.31841539418029824, + "learning_rate": 4.402529427608368e-05, + "loss": 2.6862, + "step": 37650 + }, + { + "epoch": 1.7529389854971251, + "grad_norm": 0.31267996725305713, + "learning_rate": 4.4022604956643854e-05, + "loss": 2.6737, + "step": 37651 + }, + { + "epoch": 1.7529855436832182, + "grad_norm": 0.3405627740827259, + "learning_rate": 4.401991565474712e-05, + "loss": 2.6479, + "step": 37652 + }, + { + "epoch": 1.7530321018693111, + "grad_norm": 0.35870364402024985, + "learning_rate": 4.401722637040134e-05, + "loss": 2.7988, + "step": 37653 + }, + { + "epoch": 1.7530786600554042, + "grad_norm": 0.325181204059446, + "learning_rate": 4.4014537103614394e-05, + "loss": 2.7659, + "step": 37654 + }, + { + "epoch": 1.7531252182414974, + "grad_norm": 0.34445895407743843, + "learning_rate": 4.401184785439422e-05, + "loss": 2.6275, + "step": 37655 + }, + { + "epoch": 1.7531717764275903, + "grad_norm": 0.35661844782909746, + "learning_rate": 4.400915862274865e-05, + "loss": 2.6823, + "step": 37656 + }, + { + "epoch": 1.7532183346136834, + "grad_norm": 0.350137470294199, + "learning_rate": 4.400646940868563e-05, + "loss": 2.7026, + "step": 37657 + }, + { + "epoch": 1.7532648927997765, + "grad_norm": 0.3544886136909977, + "learning_rate": 4.400378021221304e-05, + "loss": 
2.664, + "step": 37658 + }, + { + "epoch": 1.7533114509858696, + "grad_norm": 0.3490747371681191, + "learning_rate": 4.400109103333874e-05, + "loss": 2.6547, + "step": 37659 + }, + { + "epoch": 1.7533580091719627, + "grad_norm": 0.3753954334660017, + "learning_rate": 4.399840187207066e-05, + "loss": 2.6773, + "step": 37660 + }, + { + "epoch": 1.7534045673580558, + "grad_norm": 0.37765051606768724, + "learning_rate": 4.399571272841668e-05, + "loss": 2.7844, + "step": 37661 + }, + { + "epoch": 1.753451125544149, + "grad_norm": 0.3656267989265957, + "learning_rate": 4.39930236023847e-05, + "loss": 2.6338, + "step": 37662 + }, + { + "epoch": 1.7534976837302418, + "grad_norm": 0.3603049909477973, + "learning_rate": 4.39903344939826e-05, + "loss": 2.6947, + "step": 37663 + }, + { + "epoch": 1.753544241916335, + "grad_norm": 0.36790267326738274, + "learning_rate": 4.398764540321826e-05, + "loss": 2.7396, + "step": 37664 + }, + { + "epoch": 1.7535908001024278, + "grad_norm": 0.3438988026268202, + "learning_rate": 4.39849563300996e-05, + "loss": 2.6863, + "step": 37665 + }, + { + "epoch": 1.753637358288521, + "grad_norm": 0.3196053970700654, + "learning_rate": 4.3982267274634506e-05, + "loss": 2.695, + "step": 37666 + }, + { + "epoch": 1.753683916474614, + "grad_norm": 0.38948768588954363, + "learning_rate": 4.397957823683085e-05, + "loss": 2.7573, + "step": 37667 + }, + { + "epoch": 1.7537304746607072, + "grad_norm": 0.31539715138096475, + "learning_rate": 4.397688921669655e-05, + "loss": 2.688, + "step": 37668 + }, + { + "epoch": 1.7537770328468003, + "grad_norm": 0.36855879916010104, + "learning_rate": 4.397420021423947e-05, + "loss": 2.7703, + "step": 37669 + }, + { + "epoch": 1.7538235910328934, + "grad_norm": 0.36555347635621754, + "learning_rate": 4.397151122946754e-05, + "loss": 2.7409, + "step": 37670 + }, + { + "epoch": 1.7538701492189865, + "grad_norm": 0.33813927015112, + "learning_rate": 4.3968822262388634e-05, + "loss": 2.594, + "step": 37671 + }, + { + "epoch": 1.7539167074050797, + "grad_norm": 0.3416435329841899, + "learning_rate": 4.396613331301061e-05, + "loss": 2.7377, + "step": 37672 + }, + { + "epoch": 1.7539632655911725, + "grad_norm": 0.3595363206325501, + "learning_rate": 4.396344438134142e-05, + "loss": 2.6728, + "step": 37673 + }, + { + "epoch": 1.7540098237772657, + "grad_norm": 0.32834433895963777, + "learning_rate": 4.3960755467388916e-05, + "loss": 2.6834, + "step": 37674 + }, + { + "epoch": 1.7540563819633586, + "grad_norm": 0.3492238880429223, + "learning_rate": 4.395806657116101e-05, + "loss": 2.6447, + "step": 37675 + }, + { + "epoch": 1.7541029401494517, + "grad_norm": 0.32729326392599783, + "learning_rate": 4.3955377692665577e-05, + "loss": 2.7195, + "step": 37676 + }, + { + "epoch": 1.7541494983355448, + "grad_norm": 0.3310520114216023, + "learning_rate": 4.39526888319105e-05, + "loss": 2.674, + "step": 37677 + }, + { + "epoch": 1.754196056521638, + "grad_norm": 0.32577896261324396, + "learning_rate": 4.3949999988903703e-05, + "loss": 2.711, + "step": 37678 + }, + { + "epoch": 1.754242614707731, + "grad_norm": 0.3598247208613066, + "learning_rate": 4.3947311163653035e-05, + "loss": 2.6234, + "step": 37679 + }, + { + "epoch": 1.7542891728938241, + "grad_norm": 0.3422368255934784, + "learning_rate": 4.3944622356166444e-05, + "loss": 2.7356, + "step": 37680 + }, + { + "epoch": 1.7543357310799172, + "grad_norm": 0.3339771360216982, + "learning_rate": 4.394193356645178e-05, + "loss": 2.6961, + "step": 37681 + }, + { + "epoch": 1.7543822892660101, + "grad_norm": 
0.34238585552565487, + "learning_rate": 4.393924479451693e-05, + "loss": 2.6374, + "step": 37682 + }, + { + "epoch": 1.7544288474521033, + "grad_norm": 0.3258229463945923, + "learning_rate": 4.393655604036982e-05, + "loss": 2.6361, + "step": 37683 + }, + { + "epoch": 1.7544754056381964, + "grad_norm": 0.33204656909145225, + "learning_rate": 4.393386730401832e-05, + "loss": 2.6692, + "step": 37684 + }, + { + "epoch": 1.7545219638242893, + "grad_norm": 0.34155809061545406, + "learning_rate": 4.3931178585470295e-05, + "loss": 2.6599, + "step": 37685 + }, + { + "epoch": 1.7545685220103824, + "grad_norm": 0.3399467525618978, + "learning_rate": 4.3928489884733694e-05, + "loss": 2.7468, + "step": 37686 + }, + { + "epoch": 1.7546150801964755, + "grad_norm": 0.3638872113334877, + "learning_rate": 4.3925801201816355e-05, + "loss": 2.6957, + "step": 37687 + }, + { + "epoch": 1.7546616383825686, + "grad_norm": 0.3516831648002837, + "learning_rate": 4.392311253672621e-05, + "loss": 2.7405, + "step": 37688 + }, + { + "epoch": 1.7547081965686617, + "grad_norm": 0.32884332072682904, + "learning_rate": 4.392042388947113e-05, + "loss": 2.6787, + "step": 37689 + }, + { + "epoch": 1.7547547547547548, + "grad_norm": 0.33138737943112806, + "learning_rate": 4.391773526005899e-05, + "loss": 2.7473, + "step": 37690 + }, + { + "epoch": 1.754801312940848, + "grad_norm": 0.3417836691738968, + "learning_rate": 4.39150466484977e-05, + "loss": 2.7956, + "step": 37691 + }, + { + "epoch": 1.7548478711269409, + "grad_norm": 0.33931538732093597, + "learning_rate": 4.3912358054795146e-05, + "loss": 2.6141, + "step": 37692 + }, + { + "epoch": 1.754894429313034, + "grad_norm": 0.3430006616978074, + "learning_rate": 4.390966947895924e-05, + "loss": 2.6713, + "step": 37693 + }, + { + "epoch": 1.754940987499127, + "grad_norm": 0.32975010589654147, + "learning_rate": 4.3906980920997846e-05, + "loss": 2.8185, + "step": 37694 + }, + { + "epoch": 1.75498754568522, + "grad_norm": 0.36693200234222567, + "learning_rate": 4.390429238091885e-05, + "loss": 2.7038, + "step": 37695 + }, + { + "epoch": 1.755034103871313, + "grad_norm": 0.3619213064740796, + "learning_rate": 4.3901603858730176e-05, + "loss": 2.7314, + "step": 37696 + }, + { + "epoch": 1.7550806620574062, + "grad_norm": 0.3276564994414606, + "learning_rate": 4.389891535443968e-05, + "loss": 2.5924, + "step": 37697 + }, + { + "epoch": 1.7551272202434993, + "grad_norm": 0.37659403446371564, + "learning_rate": 4.389622686805527e-05, + "loss": 2.6655, + "step": 37698 + }, + { + "epoch": 1.7551737784295924, + "grad_norm": 0.3419760574443077, + "learning_rate": 4.389353839958484e-05, + "loss": 2.6462, + "step": 37699 + }, + { + "epoch": 1.7552203366156856, + "grad_norm": 0.37194186690037456, + "learning_rate": 4.3890849949036264e-05, + "loss": 2.7872, + "step": 37700 + }, + { + "epoch": 1.7552668948017787, + "grad_norm": 0.3731050606243997, + "learning_rate": 4.3888161516417446e-05, + "loss": 2.7426, + "step": 37701 + }, + { + "epoch": 1.7553134529878716, + "grad_norm": 0.3526438272383293, + "learning_rate": 4.388547310173628e-05, + "loss": 2.7652, + "step": 37702 + }, + { + "epoch": 1.7553600111739647, + "grad_norm": 0.33621024623892515, + "learning_rate": 4.388278470500062e-05, + "loss": 2.7731, + "step": 37703 + }, + { + "epoch": 1.7554065693600576, + "grad_norm": 0.35282992486581244, + "learning_rate": 4.388009632621841e-05, + "loss": 2.6712, + "step": 37704 + }, + { + "epoch": 1.7554531275461507, + "grad_norm": 0.3459818830551703, + "learning_rate": 4.3877407965397494e-05, + 
"loss": 2.7474, + "step": 37705 + }, + { + "epoch": 1.7554996857322438, + "grad_norm": 0.3535181805881593, + "learning_rate": 4.38747196225458e-05, + "loss": 2.7392, + "step": 37706 + }, + { + "epoch": 1.755546243918337, + "grad_norm": 0.3657395258598791, + "learning_rate": 4.3872031297671204e-05, + "loss": 2.7732, + "step": 37707 + }, + { + "epoch": 1.75559280210443, + "grad_norm": 0.3312994794127914, + "learning_rate": 4.386934299078157e-05, + "loss": 2.5894, + "step": 37708 + }, + { + "epoch": 1.7556393602905231, + "grad_norm": 0.35700909143748594, + "learning_rate": 4.386665470188484e-05, + "loss": 2.7342, + "step": 37709 + }, + { + "epoch": 1.7556859184766163, + "grad_norm": 0.3570571933075806, + "learning_rate": 4.386396643098885e-05, + "loss": 2.6851, + "step": 37710 + }, + { + "epoch": 1.7557324766627094, + "grad_norm": 0.3367987644955747, + "learning_rate": 4.386127817810152e-05, + "loss": 2.6608, + "step": 37711 + }, + { + "epoch": 1.7557790348488023, + "grad_norm": 0.3599887273497339, + "learning_rate": 4.385858994323074e-05, + "loss": 2.7214, + "step": 37712 + }, + { + "epoch": 1.7558255930348954, + "grad_norm": 0.3636333583226674, + "learning_rate": 4.3855901726384384e-05, + "loss": 2.6717, + "step": 37713 + }, + { + "epoch": 1.7558721512209883, + "grad_norm": 0.3327809171127451, + "learning_rate": 4.385321352757037e-05, + "loss": 2.6571, + "step": 37714 + }, + { + "epoch": 1.7559187094070814, + "grad_norm": 0.3798839458515109, + "learning_rate": 4.385052534679654e-05, + "loss": 2.7583, + "step": 37715 + }, + { + "epoch": 1.7559652675931745, + "grad_norm": 0.31602425352461333, + "learning_rate": 4.3847837184070836e-05, + "loss": 2.7705, + "step": 37716 + }, + { + "epoch": 1.7560118257792676, + "grad_norm": 0.3595618376324411, + "learning_rate": 4.3845149039401125e-05, + "loss": 2.7469, + "step": 37717 + }, + { + "epoch": 1.7560583839653607, + "grad_norm": 0.3559563156880667, + "learning_rate": 4.384246091279527e-05, + "loss": 2.7239, + "step": 37718 + }, + { + "epoch": 1.7561049421514539, + "grad_norm": 0.37126144827415636, + "learning_rate": 4.383977280426121e-05, + "loss": 2.7241, + "step": 37719 + }, + { + "epoch": 1.756151500337547, + "grad_norm": 0.3407570389639392, + "learning_rate": 4.383708471380681e-05, + "loss": 2.6012, + "step": 37720 + }, + { + "epoch": 1.75619805852364, + "grad_norm": 0.33684240093234136, + "learning_rate": 4.3834396641439935e-05, + "loss": 2.6929, + "step": 37721 + }, + { + "epoch": 1.756244616709733, + "grad_norm": 0.3519164466871772, + "learning_rate": 4.3831708587168515e-05, + "loss": 2.6741, + "step": 37722 + }, + { + "epoch": 1.756291174895826, + "grad_norm": 0.32226617330505203, + "learning_rate": 4.382902055100042e-05, + "loss": 2.7797, + "step": 37723 + }, + { + "epoch": 1.756337733081919, + "grad_norm": 0.3337623303892558, + "learning_rate": 4.382633253294354e-05, + "loss": 2.6434, + "step": 37724 + }, + { + "epoch": 1.756384291268012, + "grad_norm": 0.3301922331516323, + "learning_rate": 4.382364453300578e-05, + "loss": 2.7017, + "step": 37725 + }, + { + "epoch": 1.7564308494541052, + "grad_norm": 0.3168050268471546, + "learning_rate": 4.3820956551195e-05, + "loss": 2.7144, + "step": 37726 + }, + { + "epoch": 1.7564774076401983, + "grad_norm": 0.319271834701351, + "learning_rate": 4.381826858751912e-05, + "loss": 2.6961, + "step": 37727 + }, + { + "epoch": 1.7565239658262914, + "grad_norm": 0.3499881616900051, + "learning_rate": 4.381558064198598e-05, + "loss": 2.6363, + "step": 37728 + }, + { + "epoch": 1.7565705240123846, + 
"grad_norm": 0.34369532849055706, + "learning_rate": 4.3812892714603534e-05, + "loss": 2.6557, + "step": 37729 + }, + { + "epoch": 1.7566170821984777, + "grad_norm": 0.3122256330218823, + "learning_rate": 4.3810204805379635e-05, + "loss": 2.6454, + "step": 37730 + }, + { + "epoch": 1.7566636403845706, + "grad_norm": 0.338795706120096, + "learning_rate": 4.3807516914322147e-05, + "loss": 2.6525, + "step": 37731 + }, + { + "epoch": 1.7567101985706637, + "grad_norm": 0.3624552995911141, + "learning_rate": 4.380482904143902e-05, + "loss": 2.6714, + "step": 37732 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.3037987900243924, + "learning_rate": 4.380214118673808e-05, + "loss": 2.6325, + "step": 37733 + }, + { + "epoch": 1.7568033149428497, + "grad_norm": 0.3434891835274271, + "learning_rate": 4.3799453350227264e-05, + "loss": 2.7399, + "step": 37734 + }, + { + "epoch": 1.7568498731289428, + "grad_norm": 0.33594358257482054, + "learning_rate": 4.379676553191445e-05, + "loss": 2.7409, + "step": 37735 + }, + { + "epoch": 1.756896431315036, + "grad_norm": 0.3386859229659219, + "learning_rate": 4.37940777318075e-05, + "loss": 2.7167, + "step": 37736 + }, + { + "epoch": 1.756942989501129, + "grad_norm": 0.34413429830004705, + "learning_rate": 4.379138994991433e-05, + "loss": 2.7715, + "step": 37737 + }, + { + "epoch": 1.7569895476872222, + "grad_norm": 0.33658287029070566, + "learning_rate": 4.378870218624282e-05, + "loss": 2.7712, + "step": 37738 + }, + { + "epoch": 1.7570361058733153, + "grad_norm": 0.33494957523393243, + "learning_rate": 4.378601444080085e-05, + "loss": 2.6635, + "step": 37739 + }, + { + "epoch": 1.7570826640594084, + "grad_norm": 0.3353876227288792, + "learning_rate": 4.3783326713596326e-05, + "loss": 2.6205, + "step": 37740 + }, + { + "epoch": 1.7571292222455013, + "grad_norm": 0.328127830465545, + "learning_rate": 4.3780639004637104e-05, + "loss": 2.6685, + "step": 37741 + }, + { + "epoch": 1.7571757804315944, + "grad_norm": 0.3459293237426527, + "learning_rate": 4.377795131393112e-05, + "loss": 2.6582, + "step": 37742 + }, + { + "epoch": 1.7572223386176875, + "grad_norm": 0.30752740682817664, + "learning_rate": 4.3775263641486226e-05, + "loss": 2.6784, + "step": 37743 + }, + { + "epoch": 1.7572688968037804, + "grad_norm": 0.33438088044644215, + "learning_rate": 4.3772575987310304e-05, + "loss": 2.5664, + "step": 37744 + }, + { + "epoch": 1.7573154549898735, + "grad_norm": 0.3344253069470272, + "learning_rate": 4.376988835141127e-05, + "loss": 2.6976, + "step": 37745 + }, + { + "epoch": 1.7573620131759666, + "grad_norm": 0.35483163637716264, + "learning_rate": 4.376720073379699e-05, + "loss": 2.6095, + "step": 37746 + }, + { + "epoch": 1.7574085713620597, + "grad_norm": 0.341953702136793, + "learning_rate": 4.376451313447539e-05, + "loss": 2.6884, + "step": 37747 + }, + { + "epoch": 1.7574551295481529, + "grad_norm": 0.34537927312060196, + "learning_rate": 4.3761825553454316e-05, + "loss": 2.671, + "step": 37748 + }, + { + "epoch": 1.757501687734246, + "grad_norm": 0.358469235683567, + "learning_rate": 4.375913799074166e-05, + "loss": 2.6648, + "step": 37749 + }, + { + "epoch": 1.757548245920339, + "grad_norm": 0.3573794619302658, + "learning_rate": 4.375645044634532e-05, + "loss": 2.7523, + "step": 37750 + }, + { + "epoch": 1.757594804106432, + "grad_norm": 0.37279722481650573, + "learning_rate": 4.3753762920273186e-05, + "loss": 2.6855, + "step": 37751 + }, + { + "epoch": 1.757641362292525, + "grad_norm": 0.38125104510064384, + "learning_rate": 
4.375107541253316e-05, + "loss": 2.7194, + "step": 37752 + }, + { + "epoch": 1.757687920478618, + "grad_norm": 0.36683103646869014, + "learning_rate": 4.3748387923133105e-05, + "loss": 2.7897, + "step": 37753 + }, + { + "epoch": 1.757734478664711, + "grad_norm": 0.3386611015483836, + "learning_rate": 4.374570045208089e-05, + "loss": 2.7325, + "step": 37754 + }, + { + "epoch": 1.7577810368508042, + "grad_norm": 0.3865290437667049, + "learning_rate": 4.374301299938445e-05, + "loss": 2.7658, + "step": 37755 + }, + { + "epoch": 1.7578275950368973, + "grad_norm": 0.3674582020583081, + "learning_rate": 4.3740325565051654e-05, + "loss": 2.813, + "step": 37756 + }, + { + "epoch": 1.7578741532229905, + "grad_norm": 0.3531500520176441, + "learning_rate": 4.3737638149090365e-05, + "loss": 2.7406, + "step": 37757 + }, + { + "epoch": 1.7579207114090836, + "grad_norm": 0.3867878811228379, + "learning_rate": 4.373495075150851e-05, + "loss": 2.6627, + "step": 37758 + }, + { + "epoch": 1.7579672695951767, + "grad_norm": 0.34772959791426056, + "learning_rate": 4.373226337231394e-05, + "loss": 2.6525, + "step": 37759 + }, + { + "epoch": 1.7580138277812698, + "grad_norm": 0.3432551272761641, + "learning_rate": 4.3729576011514576e-05, + "loss": 2.7072, + "step": 37760 + }, + { + "epoch": 1.7580603859673627, + "grad_norm": 0.3455222626909806, + "learning_rate": 4.3726888669118295e-05, + "loss": 2.6814, + "step": 37761 + }, + { + "epoch": 1.7581069441534558, + "grad_norm": 0.3844842446292056, + "learning_rate": 4.3724201345132966e-05, + "loss": 2.7165, + "step": 37762 + }, + { + "epoch": 1.7581535023395487, + "grad_norm": 0.34759140680755163, + "learning_rate": 4.372151403956648e-05, + "loss": 2.797, + "step": 37763 + }, + { + "epoch": 1.7582000605256418, + "grad_norm": 0.37534153298749084, + "learning_rate": 4.3718826752426736e-05, + "loss": 2.5606, + "step": 37764 + }, + { + "epoch": 1.758246618711735, + "grad_norm": 0.3956667996740431, + "learning_rate": 4.371613948372163e-05, + "loss": 2.6852, + "step": 37765 + }, + { + "epoch": 1.758293176897828, + "grad_norm": 0.31153282047624736, + "learning_rate": 4.3713452233459034e-05, + "loss": 2.7644, + "step": 37766 + }, + { + "epoch": 1.7583397350839212, + "grad_norm": 0.38936011324086733, + "learning_rate": 4.3710765001646814e-05, + "loss": 2.713, + "step": 37767 + }, + { + "epoch": 1.7583862932700143, + "grad_norm": 0.3372639087832728, + "learning_rate": 4.37080777882929e-05, + "loss": 2.6759, + "step": 37768 + }, + { + "epoch": 1.7584328514561074, + "grad_norm": 0.33837978153224946, + "learning_rate": 4.3705390593405136e-05, + "loss": 2.6262, + "step": 37769 + }, + { + "epoch": 1.7584794096422003, + "grad_norm": 0.34838996340121076, + "learning_rate": 4.370270341699145e-05, + "loss": 2.6591, + "step": 37770 + }, + { + "epoch": 1.7585259678282934, + "grad_norm": 0.3553491866207921, + "learning_rate": 4.3700016259059716e-05, + "loss": 2.6294, + "step": 37771 + }, + { + "epoch": 1.7585725260143865, + "grad_norm": 0.36552234979944676, + "learning_rate": 4.369732911961779e-05, + "loss": 2.7707, + "step": 37772 + }, + { + "epoch": 1.7586190842004794, + "grad_norm": 0.33469450303594844, + "learning_rate": 4.36946419986736e-05, + "loss": 2.6095, + "step": 37773 + }, + { + "epoch": 1.7586656423865725, + "grad_norm": 0.37382761681517057, + "learning_rate": 4.369195489623502e-05, + "loss": 2.7315, + "step": 37774 + }, + { + "epoch": 1.7587122005726656, + "grad_norm": 0.30941851549766997, + "learning_rate": 4.3689267812309914e-05, + "loss": 2.6725, + "step": 37775 + }, + { 
+ "epoch": 1.7587587587587588, + "grad_norm": 0.3613081527581835, + "learning_rate": 4.3686580746906194e-05, + "loss": 2.7167, + "step": 37776 + }, + { + "epoch": 1.7588053169448519, + "grad_norm": 0.36809194982312615, + "learning_rate": 4.3683893700031726e-05, + "loss": 2.77, + "step": 37777 + }, + { + "epoch": 1.758851875130945, + "grad_norm": 0.35296204264096703, + "learning_rate": 4.3681206671694424e-05, + "loss": 2.6297, + "step": 37778 + }, + { + "epoch": 1.758898433317038, + "grad_norm": 0.34924501037375716, + "learning_rate": 4.3678519661902164e-05, + "loss": 2.8021, + "step": 37779 + }, + { + "epoch": 1.758944991503131, + "grad_norm": 0.3901316805832971, + "learning_rate": 4.367583267066279e-05, + "loss": 2.7134, + "step": 37780 + }, + { + "epoch": 1.758991549689224, + "grad_norm": 0.369511078648573, + "learning_rate": 4.3673145697984256e-05, + "loss": 2.6941, + "step": 37781 + }, + { + "epoch": 1.7590381078753172, + "grad_norm": 0.35654661215728134, + "learning_rate": 4.367045874387439e-05, + "loss": 2.7592, + "step": 37782 + }, + { + "epoch": 1.7590846660614101, + "grad_norm": 0.3735062604722047, + "learning_rate": 4.366777180834113e-05, + "loss": 2.7089, + "step": 37783 + }, + { + "epoch": 1.7591312242475032, + "grad_norm": 0.37957515535427594, + "learning_rate": 4.366508489139234e-05, + "loss": 2.696, + "step": 37784 + }, + { + "epoch": 1.7591777824335963, + "grad_norm": 0.33177808533406145, + "learning_rate": 4.366239799303587e-05, + "loss": 2.8127, + "step": 37785 + }, + { + "epoch": 1.7592243406196895, + "grad_norm": 0.377950091206511, + "learning_rate": 4.365971111327967e-05, + "loss": 2.6792, + "step": 37786 + }, + { + "epoch": 1.7592708988057826, + "grad_norm": 0.3517441919314909, + "learning_rate": 4.365702425213157e-05, + "loss": 2.713, + "step": 37787 + }, + { + "epoch": 1.7593174569918757, + "grad_norm": 0.35731740844281806, + "learning_rate": 4.365433740959949e-05, + "loss": 2.7551, + "step": 37788 + }, + { + "epoch": 1.7593640151779688, + "grad_norm": 0.3611900458141722, + "learning_rate": 4.36516505856913e-05, + "loss": 2.6367, + "step": 37789 + }, + { + "epoch": 1.7594105733640617, + "grad_norm": 0.3490281755064222, + "learning_rate": 4.36489637804149e-05, + "loss": 2.6452, + "step": 37790 + }, + { + "epoch": 1.7594571315501548, + "grad_norm": 0.350200682967089, + "learning_rate": 4.364627699377816e-05, + "loss": 2.6413, + "step": 37791 + }, + { + "epoch": 1.7595036897362477, + "grad_norm": 0.35779563714945845, + "learning_rate": 4.3643590225788974e-05, + "loss": 2.6328, + "step": 37792 + }, + { + "epoch": 1.7595502479223408, + "grad_norm": 0.333710006601137, + "learning_rate": 4.3640903476455205e-05, + "loss": 2.6063, + "step": 37793 + }, + { + "epoch": 1.759596806108434, + "grad_norm": 0.36482577904899965, + "learning_rate": 4.363821674578479e-05, + "loss": 2.7301, + "step": 37794 + }, + { + "epoch": 1.759643364294527, + "grad_norm": 0.348052287351161, + "learning_rate": 4.3635530033785543e-05, + "loss": 2.6919, + "step": 37795 + }, + { + "epoch": 1.7596899224806202, + "grad_norm": 0.3621953490227457, + "learning_rate": 4.3632843340465426e-05, + "loss": 2.7537, + "step": 37796 + }, + { + "epoch": 1.7597364806667133, + "grad_norm": 0.32888076849352954, + "learning_rate": 4.363015666583227e-05, + "loss": 2.6329, + "step": 37797 + }, + { + "epoch": 1.7597830388528064, + "grad_norm": 0.3486032612376599, + "learning_rate": 4.362747000989397e-05, + "loss": 2.6661, + "step": 37798 + }, + { + "epoch": 1.7598295970388995, + "grad_norm": 0.376083912566584, + 
"learning_rate": 4.362478337265843e-05, + "loss": 2.7333, + "step": 37799 + }, + { + "epoch": 1.7598761552249924, + "grad_norm": 0.3321165065799831, + "learning_rate": 4.362209675413351e-05, + "loss": 2.7584, + "step": 37800 + }, + { + "epoch": 1.7599227134110855, + "grad_norm": 0.37198633494992567, + "learning_rate": 4.361941015432712e-05, + "loss": 2.7553, + "step": 37801 + }, + { + "epoch": 1.7599692715971784, + "grad_norm": 0.362909050887307, + "learning_rate": 4.361672357324713e-05, + "loss": 2.696, + "step": 37802 + }, + { + "epoch": 1.7600158297832715, + "grad_norm": 0.3713963616095059, + "learning_rate": 4.361403701090142e-05, + "loss": 2.7742, + "step": 37803 + }, + { + "epoch": 1.7600623879693647, + "grad_norm": 0.3234609917793311, + "learning_rate": 4.36113504672979e-05, + "loss": 2.5675, + "step": 37804 + }, + { + "epoch": 1.7601089461554578, + "grad_norm": 0.32341059847646325, + "learning_rate": 4.360866394244441e-05, + "loss": 2.7498, + "step": 37805 + }, + { + "epoch": 1.7601555043415509, + "grad_norm": 0.34455034633351583, + "learning_rate": 4.360597743634888e-05, + "loss": 2.7385, + "step": 37806 + }, + { + "epoch": 1.760202062527644, + "grad_norm": 0.34410863629628075, + "learning_rate": 4.360329094901918e-05, + "loss": 2.7097, + "step": 37807 + }, + { + "epoch": 1.7602486207137371, + "grad_norm": 0.3308800456851471, + "learning_rate": 4.360060448046318e-05, + "loss": 2.7139, + "step": 37808 + }, + { + "epoch": 1.7602951788998302, + "grad_norm": 0.3352182172676579, + "learning_rate": 4.359791803068878e-05, + "loss": 2.6689, + "step": 37809 + }, + { + "epoch": 1.7603417370859231, + "grad_norm": 0.32863336170249324, + "learning_rate": 4.359523159970387e-05, + "loss": 2.7416, + "step": 37810 + }, + { + "epoch": 1.7603882952720162, + "grad_norm": 0.3223830268087589, + "learning_rate": 4.35925451875163e-05, + "loss": 2.6308, + "step": 37811 + }, + { + "epoch": 1.7604348534581091, + "grad_norm": 0.31571750649564617, + "learning_rate": 4.3589858794134e-05, + "loss": 2.685, + "step": 37812 + }, + { + "epoch": 1.7604814116442022, + "grad_norm": 0.35149116331053115, + "learning_rate": 4.358717241956482e-05, + "loss": 2.658, + "step": 37813 + }, + { + "epoch": 1.7605279698302954, + "grad_norm": 0.32252881721674376, + "learning_rate": 4.358448606381665e-05, + "loss": 2.7656, + "step": 37814 + }, + { + "epoch": 1.7605745280163885, + "grad_norm": 0.35555628933365474, + "learning_rate": 4.358179972689741e-05, + "loss": 2.6691, + "step": 37815 + }, + { + "epoch": 1.7606210862024816, + "grad_norm": 0.3398095023171239, + "learning_rate": 4.357911340881493e-05, + "loss": 2.716, + "step": 37816 + }, + { + "epoch": 1.7606676443885747, + "grad_norm": 0.3490230542567991, + "learning_rate": 4.357642710957714e-05, + "loss": 2.6653, + "step": 37817 + }, + { + "epoch": 1.7607142025746678, + "grad_norm": 0.34077135803231395, + "learning_rate": 4.357374082919188e-05, + "loss": 2.7573, + "step": 37818 + }, + { + "epoch": 1.7607607607607607, + "grad_norm": 0.33415303745937863, + "learning_rate": 4.3571054567667076e-05, + "loss": 2.6746, + "step": 37819 + }, + { + "epoch": 1.7608073189468538, + "grad_norm": 0.3523509953792938, + "learning_rate": 4.3568368325010594e-05, + "loss": 2.7443, + "step": 37820 + }, + { + "epoch": 1.760853877132947, + "grad_norm": 0.3414753316925351, + "learning_rate": 4.3565682101230296e-05, + "loss": 2.7532, + "step": 37821 + }, + { + "epoch": 1.7609004353190398, + "grad_norm": 0.3692165154702744, + "learning_rate": 4.356299589633411e-05, + "loss": 2.6821, + "step": 37822 + }, 
+ { + "epoch": 1.760946993505133, + "grad_norm": 0.36165085722101326, + "learning_rate": 4.3560309710329875e-05, + "loss": 2.7414, + "step": 37823 + }, + { + "epoch": 1.760993551691226, + "grad_norm": 0.32682549385928394, + "learning_rate": 4.355762354322552e-05, + "loss": 2.523, + "step": 37824 + }, + { + "epoch": 1.7610401098773192, + "grad_norm": 0.3767291531290631, + "learning_rate": 4.35549373950289e-05, + "loss": 2.8018, + "step": 37825 + }, + { + "epoch": 1.7610866680634123, + "grad_norm": 0.31794774731374326, + "learning_rate": 4.355225126574789e-05, + "loss": 2.6536, + "step": 37826 + }, + { + "epoch": 1.7611332262495054, + "grad_norm": 0.33287714052020956, + "learning_rate": 4.35495651553904e-05, + "loss": 2.7301, + "step": 37827 + }, + { + "epoch": 1.7611797844355985, + "grad_norm": 0.34212902260929257, + "learning_rate": 4.35468790639643e-05, + "loss": 2.6461, + "step": 37828 + }, + { + "epoch": 1.7612263426216914, + "grad_norm": 0.3293209474415623, + "learning_rate": 4.354419299147747e-05, + "loss": 2.7346, + "step": 37829 + }, + { + "epoch": 1.7612729008077845, + "grad_norm": 0.37618022841964976, + "learning_rate": 4.354150693793781e-05, + "loss": 2.7097, + "step": 37830 + }, + { + "epoch": 1.7613194589938777, + "grad_norm": 0.37142471267467986, + "learning_rate": 4.353882090335316e-05, + "loss": 2.7728, + "step": 37831 + }, + { + "epoch": 1.7613660171799705, + "grad_norm": 0.33177538833213127, + "learning_rate": 4.353613488773147e-05, + "loss": 2.6958, + "step": 37832 + }, + { + "epoch": 1.7614125753660637, + "grad_norm": 0.3867158602866127, + "learning_rate": 4.3533448891080575e-05, + "loss": 2.7181, + "step": 37833 + }, + { + "epoch": 1.7614591335521568, + "grad_norm": 0.3479235070943195, + "learning_rate": 4.353076291340835e-05, + "loss": 2.6608, + "step": 37834 + }, + { + "epoch": 1.76150569173825, + "grad_norm": 0.3483658183684888, + "learning_rate": 4.3528076954722726e-05, + "loss": 2.7581, + "step": 37835 + }, + { + "epoch": 1.761552249924343, + "grad_norm": 0.34507651522271793, + "learning_rate": 4.352539101503154e-05, + "loss": 2.678, + "step": 37836 + }, + { + "epoch": 1.7615988081104361, + "grad_norm": 0.3308798541048817, + "learning_rate": 4.35227050943427e-05, + "loss": 2.6463, + "step": 37837 + }, + { + "epoch": 1.7616453662965292, + "grad_norm": 0.313217582391508, + "learning_rate": 4.35200191926641e-05, + "loss": 2.7006, + "step": 37838 + }, + { + "epoch": 1.7616919244826221, + "grad_norm": 0.3451884156959, + "learning_rate": 4.3517333310003584e-05, + "loss": 2.802, + "step": 37839 + }, + { + "epoch": 1.7617384826687152, + "grad_norm": 0.3447825409620114, + "learning_rate": 4.351464744636905e-05, + "loss": 2.8545, + "step": 37840 + }, + { + "epoch": 1.7617850408548081, + "grad_norm": 0.3629378699053213, + "learning_rate": 4.35119616017684e-05, + "loss": 2.7544, + "step": 37841 + }, + { + "epoch": 1.7618315990409013, + "grad_norm": 0.3228285533881649, + "learning_rate": 4.3509275776209506e-05, + "loss": 2.7542, + "step": 37842 + }, + { + "epoch": 1.7618781572269944, + "grad_norm": 0.39199211626797664, + "learning_rate": 4.350658996970025e-05, + "loss": 2.6511, + "step": 37843 + }, + { + "epoch": 1.7619247154130875, + "grad_norm": 0.3320548234483703, + "learning_rate": 4.3503904182248496e-05, + "loss": 2.6791, + "step": 37844 + }, + { + "epoch": 1.7619712735991806, + "grad_norm": 0.38682132220017373, + "learning_rate": 4.350121841386216e-05, + "loss": 2.6706, + "step": 37845 + }, + { + "epoch": 1.7620178317852737, + "grad_norm": 0.35500294876648264, + 
"learning_rate": 4.349853266454911e-05, + "loss": 2.7227, + "step": 37846 + }, + { + "epoch": 1.7620643899713668, + "grad_norm": 0.34577385546452927, + "learning_rate": 4.34958469343172e-05, + "loss": 2.6841, + "step": 37847 + }, + { + "epoch": 1.76211094815746, + "grad_norm": 0.36728473992711674, + "learning_rate": 4.349316122317437e-05, + "loss": 2.7603, + "step": 37848 + }, + { + "epoch": 1.7621575063435528, + "grad_norm": 0.3885074506408954, + "learning_rate": 4.3490475531128436e-05, + "loss": 2.7341, + "step": 37849 + }, + { + "epoch": 1.762204064529646, + "grad_norm": 0.33523657831379566, + "learning_rate": 4.3487789858187345e-05, + "loss": 2.7067, + "step": 37850 + }, + { + "epoch": 1.7622506227157388, + "grad_norm": 0.4067452479276045, + "learning_rate": 4.348510420435895e-05, + "loss": 2.6511, + "step": 37851 + }, + { + "epoch": 1.762297180901832, + "grad_norm": 0.35628056811484526, + "learning_rate": 4.3482418569651116e-05, + "loss": 2.7513, + "step": 37852 + }, + { + "epoch": 1.762343739087925, + "grad_norm": 0.3926480298419958, + "learning_rate": 4.3479732954071746e-05, + "loss": 2.7499, + "step": 37853 + }, + { + "epoch": 1.7623902972740182, + "grad_norm": 0.34131230425169157, + "learning_rate": 4.347704735762872e-05, + "loss": 2.7899, + "step": 37854 + }, + { + "epoch": 1.7624368554601113, + "grad_norm": 0.3302329883255565, + "learning_rate": 4.347436178032992e-05, + "loss": 2.6533, + "step": 37855 + }, + { + "epoch": 1.7624834136462044, + "grad_norm": 0.33522563826544627, + "learning_rate": 4.3471676222183234e-05, + "loss": 2.7615, + "step": 37856 + }, + { + "epoch": 1.7625299718322975, + "grad_norm": 0.35135813163048124, + "learning_rate": 4.34689906831965e-05, + "loss": 2.5988, + "step": 37857 + }, + { + "epoch": 1.7625765300183904, + "grad_norm": 0.33003978847389487, + "learning_rate": 4.3466305163377674e-05, + "loss": 2.7143, + "step": 37858 + }, + { + "epoch": 1.7626230882044835, + "grad_norm": 0.33759029242326943, + "learning_rate": 4.346361966273457e-05, + "loss": 2.7183, + "step": 37859 + }, + { + "epoch": 1.7626696463905767, + "grad_norm": 0.3793451286179183, + "learning_rate": 4.3460934181275116e-05, + "loss": 2.7417, + "step": 37860 + }, + { + "epoch": 1.7627162045766696, + "grad_norm": 0.34139006723339865, + "learning_rate": 4.345824871900718e-05, + "loss": 2.8026, + "step": 37861 + }, + { + "epoch": 1.7627627627627627, + "grad_norm": 0.4000707560608703, + "learning_rate": 4.345556327593862e-05, + "loss": 2.7025, + "step": 37862 + }, + { + "epoch": 1.7628093209488558, + "grad_norm": 0.31019541552777136, + "learning_rate": 4.345287785207734e-05, + "loss": 2.6422, + "step": 37863 + }, + { + "epoch": 1.762855879134949, + "grad_norm": 0.38286387280088874, + "learning_rate": 4.345019244743124e-05, + "loss": 2.7434, + "step": 37864 + }, + { + "epoch": 1.762902437321042, + "grad_norm": 0.34777415596434746, + "learning_rate": 4.3447507062008164e-05, + "loss": 2.7192, + "step": 37865 + }, + { + "epoch": 1.7629489955071351, + "grad_norm": 0.3527688849513789, + "learning_rate": 4.344482169581601e-05, + "loss": 2.6408, + "step": 37866 + }, + { + "epoch": 1.7629955536932282, + "grad_norm": 0.3640251748203933, + "learning_rate": 4.3442136348862656e-05, + "loss": 2.7484, + "step": 37867 + }, + { + "epoch": 1.7630421118793211, + "grad_norm": 0.34025154647507494, + "learning_rate": 4.3439451021156e-05, + "loss": 2.7501, + "step": 37868 + }, + { + "epoch": 1.7630886700654143, + "grad_norm": 0.3636343516185975, + "learning_rate": 4.3436765712703905e-05, + "loss": 2.6059, + "step": 
37869 + }, + { + "epoch": 1.7631352282515074, + "grad_norm": 0.3317908114201571, + "learning_rate": 4.3434080423514234e-05, + "loss": 2.7985, + "step": 37870 + }, + { + "epoch": 1.7631817864376003, + "grad_norm": 0.3341660527474715, + "learning_rate": 4.343139515359491e-05, + "loss": 2.6997, + "step": 37871 + }, + { + "epoch": 1.7632283446236934, + "grad_norm": 0.3492977903571787, + "learning_rate": 4.342870990295378e-05, + "loss": 2.6542, + "step": 37872 + }, + { + "epoch": 1.7632749028097865, + "grad_norm": 0.3510141384425959, + "learning_rate": 4.3426024671598756e-05, + "loss": 2.7215, + "step": 37873 + }, + { + "epoch": 1.7633214609958796, + "grad_norm": 0.3288044769303101, + "learning_rate": 4.3423339459537696e-05, + "loss": 2.7348, + "step": 37874 + }, + { + "epoch": 1.7633680191819727, + "grad_norm": 0.3273315929732395, + "learning_rate": 4.342065426677847e-05, + "loss": 2.668, + "step": 37875 + }, + { + "epoch": 1.7634145773680658, + "grad_norm": 0.33346161930547513, + "learning_rate": 4.3417969093329e-05, + "loss": 2.5742, + "step": 37876 + }, + { + "epoch": 1.763461135554159, + "grad_norm": 0.3397342732756012, + "learning_rate": 4.341528393919713e-05, + "loss": 2.7195, + "step": 37877 + }, + { + "epoch": 1.7635076937402518, + "grad_norm": 0.3571487796356561, + "learning_rate": 4.341259880439077e-05, + "loss": 2.733, + "step": 37878 + }, + { + "epoch": 1.763554251926345, + "grad_norm": 0.34077094641187095, + "learning_rate": 4.340991368891776e-05, + "loss": 2.7149, + "step": 37879 + }, + { + "epoch": 1.7636008101124379, + "grad_norm": 0.33579311957060193, + "learning_rate": 4.3407228592786017e-05, + "loss": 2.7238, + "step": 37880 + }, + { + "epoch": 1.763647368298531, + "grad_norm": 0.3500845702616562, + "learning_rate": 4.340454351600341e-05, + "loss": 2.6273, + "step": 37881 + }, + { + "epoch": 1.763693926484624, + "grad_norm": 0.3171415567246016, + "learning_rate": 4.3401858458577824e-05, + "loss": 2.7176, + "step": 37882 + }, + { + "epoch": 1.7637404846707172, + "grad_norm": 0.3200193818540419, + "learning_rate": 4.3399173420517105e-05, + "loss": 2.7476, + "step": 37883 + }, + { + "epoch": 1.7637870428568103, + "grad_norm": 0.33636080402261387, + "learning_rate": 4.339648840182919e-05, + "loss": 2.698, + "step": 37884 + }, + { + "epoch": 1.7638336010429034, + "grad_norm": 0.31305710605407816, + "learning_rate": 4.339380340252191e-05, + "loss": 2.6553, + "step": 37885 + }, + { + "epoch": 1.7638801592289965, + "grad_norm": 0.32445170072933754, + "learning_rate": 4.339111842260318e-05, + "loss": 2.6787, + "step": 37886 + }, + { + "epoch": 1.7639267174150897, + "grad_norm": 0.3320999149231581, + "learning_rate": 4.338843346208087e-05, + "loss": 2.7113, + "step": 37887 + }, + { + "epoch": 1.7639732756011826, + "grad_norm": 0.34873476769051004, + "learning_rate": 4.3385748520962835e-05, + "loss": 2.707, + "step": 37888 + }, + { + "epoch": 1.7640198337872757, + "grad_norm": 0.3142621842723929, + "learning_rate": 4.3383063599257e-05, + "loss": 2.693, + "step": 37889 + }, + { + "epoch": 1.7640663919733686, + "grad_norm": 0.34250997029185776, + "learning_rate": 4.338037869697121e-05, + "loss": 2.682, + "step": 37890 + }, + { + "epoch": 1.7641129501594617, + "grad_norm": 0.3099665843257165, + "learning_rate": 4.337769381411336e-05, + "loss": 2.5358, + "step": 37891 + }, + { + "epoch": 1.7641595083455548, + "grad_norm": 0.3293704680860498, + "learning_rate": 4.337500895069133e-05, + "loss": 2.6706, + "step": 37892 + }, + { + "epoch": 1.764206066531648, + "grad_norm": 0.3286893270029923, 
+ "learning_rate": 4.337232410671298e-05, + "loss": 2.7883, + "step": 37893 + }, + { + "epoch": 1.764252624717741, + "grad_norm": 0.3428816131725317, + "learning_rate": 4.336963928218623e-05, + "loss": 2.6689, + "step": 37894 + }, + { + "epoch": 1.7642991829038341, + "grad_norm": 0.3486752932178388, + "learning_rate": 4.33669544771189e-05, + "loss": 2.7284, + "step": 37895 + }, + { + "epoch": 1.7643457410899273, + "grad_norm": 0.34018812963072514, + "learning_rate": 4.336426969151894e-05, + "loss": 2.6485, + "step": 37896 + }, + { + "epoch": 1.7643922992760204, + "grad_norm": 0.34638968485024496, + "learning_rate": 4.336158492539419e-05, + "loss": 2.6783, + "step": 37897 + }, + { + "epoch": 1.7644388574621133, + "grad_norm": 0.32979097073099706, + "learning_rate": 4.3358900178752516e-05, + "loss": 2.8165, + "step": 37898 + }, + { + "epoch": 1.7644854156482064, + "grad_norm": 0.37688281986735817, + "learning_rate": 4.3356215451601834e-05, + "loss": 2.7306, + "step": 37899 + }, + { + "epoch": 1.7645319738342993, + "grad_norm": 0.3237033052330472, + "learning_rate": 4.3353530743950006e-05, + "loss": 2.7332, + "step": 37900 + }, + { + "epoch": 1.7645785320203924, + "grad_norm": 0.33949506021827885, + "learning_rate": 4.3350846055804886e-05, + "loss": 2.6876, + "step": 37901 + }, + { + "epoch": 1.7646250902064855, + "grad_norm": 0.35879573385606606, + "learning_rate": 4.33481613871744e-05, + "loss": 2.7348, + "step": 37902 + }, + { + "epoch": 1.7646716483925786, + "grad_norm": 0.336458397696163, + "learning_rate": 4.3345476738066395e-05, + "loss": 2.7106, + "step": 37903 + }, + { + "epoch": 1.7647182065786717, + "grad_norm": 0.3363854924449707, + "learning_rate": 4.334279210848877e-05, + "loss": 2.7239, + "step": 37904 + }, + { + "epoch": 1.7647647647647648, + "grad_norm": 0.3420858133227519, + "learning_rate": 4.334010749844939e-05, + "loss": 2.7761, + "step": 37905 + }, + { + "epoch": 1.764811322950858, + "grad_norm": 0.33654284060715567, + "learning_rate": 4.3337422907956134e-05, + "loss": 2.7589, + "step": 37906 + }, + { + "epoch": 1.7648578811369509, + "grad_norm": 0.3426572508315944, + "learning_rate": 4.333473833701689e-05, + "loss": 2.7592, + "step": 37907 + }, + { + "epoch": 1.764904439323044, + "grad_norm": 0.34080840158644615, + "learning_rate": 4.333205378563952e-05, + "loss": 2.7261, + "step": 37908 + }, + { + "epoch": 1.764950997509137, + "grad_norm": 0.33419368882879613, + "learning_rate": 4.332936925383193e-05, + "loss": 2.8033, + "step": 37909 + }, + { + "epoch": 1.76499755569523, + "grad_norm": 0.34258531036405016, + "learning_rate": 4.3326684741601986e-05, + "loss": 2.8133, + "step": 37910 + }, + { + "epoch": 1.765044113881323, + "grad_norm": 0.35978081056781225, + "learning_rate": 4.3324000248957546e-05, + "loss": 2.8422, + "step": 37911 + }, + { + "epoch": 1.7650906720674162, + "grad_norm": 0.3449934819237233, + "learning_rate": 4.332131577590653e-05, + "loss": 2.7143, + "step": 37912 + }, + { + "epoch": 1.7651372302535093, + "grad_norm": 0.3522307535328033, + "learning_rate": 4.331863132245677e-05, + "loss": 2.7051, + "step": 37913 + }, + { + "epoch": 1.7651837884396024, + "grad_norm": 0.38693997522682894, + "learning_rate": 4.331594688861619e-05, + "loss": 2.803, + "step": 37914 + }, + { + "epoch": 1.7652303466256956, + "grad_norm": 0.3214957991857291, + "learning_rate": 4.331326247439265e-05, + "loss": 2.7355, + "step": 37915 + }, + { + "epoch": 1.7652769048117887, + "grad_norm": 0.3516925869721422, + "learning_rate": 4.331057807979401e-05, + "loss": 2.8396, + "step": 
37916 + }, + { + "epoch": 1.7653234629978816, + "grad_norm": 0.3524312679633818, + "learning_rate": 4.330789370482818e-05, + "loss": 2.6807, + "step": 37917 + }, + { + "epoch": 1.7653700211839747, + "grad_norm": 0.3488162692013384, + "learning_rate": 4.330520934950301e-05, + "loss": 2.5595, + "step": 37918 + }, + { + "epoch": 1.7654165793700678, + "grad_norm": 0.3269124338849702, + "learning_rate": 4.330252501382639e-05, + "loss": 2.7242, + "step": 37919 + }, + { + "epoch": 1.7654631375561607, + "grad_norm": 0.3626769176286615, + "learning_rate": 4.329984069780622e-05, + "loss": 2.7591, + "step": 37920 + }, + { + "epoch": 1.7655096957422538, + "grad_norm": 0.3244536973236076, + "learning_rate": 4.3297156401450326e-05, + "loss": 2.6994, + "step": 37921 + }, + { + "epoch": 1.765556253928347, + "grad_norm": 0.32732089017343935, + "learning_rate": 4.329447212476663e-05, + "loss": 2.7164, + "step": 37922 + }, + { + "epoch": 1.76560281211444, + "grad_norm": 0.3449412732684035, + "learning_rate": 4.329178786776301e-05, + "loss": 2.7776, + "step": 37923 + }, + { + "epoch": 1.7656493703005331, + "grad_norm": 0.3464292516974302, + "learning_rate": 4.328910363044731e-05, + "loss": 2.6598, + "step": 37924 + }, + { + "epoch": 1.7656959284866263, + "grad_norm": 0.34151887048692614, + "learning_rate": 4.328641941282745e-05, + "loss": 2.77, + "step": 37925 + }, + { + "epoch": 1.7657424866727194, + "grad_norm": 0.37513943136275535, + "learning_rate": 4.328373521491126e-05, + "loss": 2.654, + "step": 37926 + }, + { + "epoch": 1.7657890448588123, + "grad_norm": 0.3447812719929398, + "learning_rate": 4.3281051036706675e-05, + "loss": 2.6374, + "step": 37927 + }, + { + "epoch": 1.7658356030449054, + "grad_norm": 0.3353895925479863, + "learning_rate": 4.327836687822154e-05, + "loss": 2.625, + "step": 37928 + }, + { + "epoch": 1.7658821612309983, + "grad_norm": 0.32567111434130697, + "learning_rate": 4.327568273946373e-05, + "loss": 2.6712, + "step": 37929 + }, + { + "epoch": 1.7659287194170914, + "grad_norm": 0.33474588633491653, + "learning_rate": 4.3272998620441136e-05, + "loss": 2.7308, + "step": 37930 + }, + { + "epoch": 1.7659752776031845, + "grad_norm": 0.34001924615299994, + "learning_rate": 4.32703145211616e-05, + "loss": 2.6338, + "step": 37931 + }, + { + "epoch": 1.7660218357892776, + "grad_norm": 0.34934097619291293, + "learning_rate": 4.326763044163306e-05, + "loss": 2.6983, + "step": 37932 + }, + { + "epoch": 1.7660683939753707, + "grad_norm": 0.337435085373636, + "learning_rate": 4.3264946381863356e-05, + "loss": 2.7409, + "step": 37933 + }, + { + "epoch": 1.7661149521614639, + "grad_norm": 0.3510255736102298, + "learning_rate": 4.326226234186035e-05, + "loss": 2.7156, + "step": 37934 + }, + { + "epoch": 1.766161510347557, + "grad_norm": 0.3421841182737125, + "learning_rate": 4.3259578321631964e-05, + "loss": 2.6688, + "step": 37935 + }, + { + "epoch": 1.76620806853365, + "grad_norm": 0.36402216755655675, + "learning_rate": 4.325689432118603e-05, + "loss": 2.6016, + "step": 37936 + }, + { + "epoch": 1.766254626719743, + "grad_norm": 0.3352665432002283, + "learning_rate": 4.3254210340530464e-05, + "loss": 2.6383, + "step": 37937 + }, + { + "epoch": 1.766301184905836, + "grad_norm": 0.34140961722009044, + "learning_rate": 4.3251526379673135e-05, + "loss": 2.7641, + "step": 37938 + }, + { + "epoch": 1.766347743091929, + "grad_norm": 0.35126585146925043, + "learning_rate": 4.324884243862188e-05, + "loss": 2.7189, + "step": 37939 + }, + { + "epoch": 1.766394301278022, + "grad_norm": 
0.34389497287549115, + "learning_rate": 4.3246158517384636e-05, + "loss": 2.7607, + "step": 37940 + }, + { + "epoch": 1.7664408594641152, + "grad_norm": 0.3720383531276955, + "learning_rate": 4.3243474615969246e-05, + "loss": 2.7502, + "step": 37941 + }, + { + "epoch": 1.7664874176502083, + "grad_norm": 0.3787388470228126, + "learning_rate": 4.324079073438358e-05, + "loss": 2.7225, + "step": 37942 + }, + { + "epoch": 1.7665339758363015, + "grad_norm": 0.3905807117731004, + "learning_rate": 4.323810687263554e-05, + "loss": 2.6755, + "step": 37943 + }, + { + "epoch": 1.7665805340223946, + "grad_norm": 0.3734424451845361, + "learning_rate": 4.323542303073298e-05, + "loss": 2.7717, + "step": 37944 + }, + { + "epoch": 1.7666270922084877, + "grad_norm": 0.3681501104514503, + "learning_rate": 4.3232739208683796e-05, + "loss": 2.7169, + "step": 37945 + }, + { + "epoch": 1.7666736503945806, + "grad_norm": 0.3749925182871275, + "learning_rate": 4.3230055406495856e-05, + "loss": 2.66, + "step": 37946 + }, + { + "epoch": 1.7667202085806737, + "grad_norm": 0.3676848747091984, + "learning_rate": 4.322737162417701e-05, + "loss": 2.6822, + "step": 37947 + }, + { + "epoch": 1.7667667667667668, + "grad_norm": 0.35225274948856417, + "learning_rate": 4.3224687861735194e-05, + "loss": 2.701, + "step": 37948 + }, + { + "epoch": 1.7668133249528597, + "grad_norm": 0.3625361957690716, + "learning_rate": 4.3222004119178225e-05, + "loss": 2.8005, + "step": 37949 + }, + { + "epoch": 1.7668598831389528, + "grad_norm": 0.3301932034416261, + "learning_rate": 4.3219320396514035e-05, + "loss": 2.6291, + "step": 37950 + }, + { + "epoch": 1.766906441325046, + "grad_norm": 0.34339561760036186, + "learning_rate": 4.3216636693750464e-05, + "loss": 2.6572, + "step": 37951 + }, + { + "epoch": 1.766952999511139, + "grad_norm": 0.37918657613539014, + "learning_rate": 4.321395301089537e-05, + "loss": 2.7804, + "step": 37952 + }, + { + "epoch": 1.7669995576972322, + "grad_norm": 0.35892371374981025, + "learning_rate": 4.321126934795669e-05, + "loss": 2.6637, + "step": 37953 + }, + { + "epoch": 1.7670461158833253, + "grad_norm": 0.39205577426981525, + "learning_rate": 4.3208585704942246e-05, + "loss": 2.8139, + "step": 37954 + }, + { + "epoch": 1.7670926740694184, + "grad_norm": 0.3418561645274368, + "learning_rate": 4.320590208185995e-05, + "loss": 2.8316, + "step": 37955 + }, + { + "epoch": 1.7671392322555113, + "grad_norm": 0.3578289912232781, + "learning_rate": 4.3203218478717654e-05, + "loss": 2.7257, + "step": 37956 + }, + { + "epoch": 1.7671857904416044, + "grad_norm": 0.3330389106228491, + "learning_rate": 4.3200534895523226e-05, + "loss": 2.7074, + "step": 37957 + }, + { + "epoch": 1.7672323486276975, + "grad_norm": 0.34463008822664526, + "learning_rate": 4.3197851332284575e-05, + "loss": 2.7321, + "step": 37958 + }, + { + "epoch": 1.7672789068137904, + "grad_norm": 0.33965750042316306, + "learning_rate": 4.319516778900957e-05, + "loss": 2.6349, + "step": 37959 + }, + { + "epoch": 1.7673254649998835, + "grad_norm": 0.36483573212948617, + "learning_rate": 4.319248426570604e-05, + "loss": 2.7365, + "step": 37960 + }, + { + "epoch": 1.7673720231859766, + "grad_norm": 0.3407556830015465, + "learning_rate": 4.3189800762381936e-05, + "loss": 2.6376, + "step": 37961 + }, + { + "epoch": 1.7674185813720698, + "grad_norm": 0.3398980241586359, + "learning_rate": 4.318711727904507e-05, + "loss": 2.6918, + "step": 37962 + }, + { + "epoch": 1.7674651395581629, + "grad_norm": 0.3379464544724739, + "learning_rate": 4.318443381570336e-05, + 
"loss": 2.6539, + "step": 37963 + }, + { + "epoch": 1.767511697744256, + "grad_norm": 0.328413328663264, + "learning_rate": 4.318175037236467e-05, + "loss": 2.7483, + "step": 37964 + }, + { + "epoch": 1.767558255930349, + "grad_norm": 0.3322056357048774, + "learning_rate": 4.3179066949036844e-05, + "loss": 2.7887, + "step": 37965 + }, + { + "epoch": 1.767604814116442, + "grad_norm": 0.3415510435290319, + "learning_rate": 4.317638354572781e-05, + "loss": 2.8007, + "step": 37966 + }, + { + "epoch": 1.767651372302535, + "grad_norm": 0.3434841871866204, + "learning_rate": 4.3173700162445405e-05, + "loss": 2.6755, + "step": 37967 + }, + { + "epoch": 1.767697930488628, + "grad_norm": 0.3119279288183675, + "learning_rate": 4.3171016799197535e-05, + "loss": 2.6647, + "step": 37968 + }, + { + "epoch": 1.7677444886747211, + "grad_norm": 0.3646165398709238, + "learning_rate": 4.316833345599205e-05, + "loss": 2.6958, + "step": 37969 + }, + { + "epoch": 1.7677910468608142, + "grad_norm": 0.33777788409563514, + "learning_rate": 4.316565013283682e-05, + "loss": 2.739, + "step": 37970 + }, + { + "epoch": 1.7678376050469073, + "grad_norm": 0.35730475320811117, + "learning_rate": 4.316296682973975e-05, + "loss": 2.7822, + "step": 37971 + }, + { + "epoch": 1.7678841632330005, + "grad_norm": 0.33815589236700216, + "learning_rate": 4.3160283546708675e-05, + "loss": 2.728, + "step": 37972 + }, + { + "epoch": 1.7679307214190936, + "grad_norm": 0.34684486973810924, + "learning_rate": 4.315760028375151e-05, + "loss": 2.7209, + "step": 37973 + }, + { + "epoch": 1.7679772796051867, + "grad_norm": 0.33928791793558283, + "learning_rate": 4.315491704087613e-05, + "loss": 2.6857, + "step": 37974 + }, + { + "epoch": 1.7680238377912798, + "grad_norm": 0.34983088214927943, + "learning_rate": 4.315223381809036e-05, + "loss": 2.7408, + "step": 37975 + }, + { + "epoch": 1.7680703959773727, + "grad_norm": 0.3354365552735078, + "learning_rate": 4.314955061540213e-05, + "loss": 2.6528, + "step": 37976 + }, + { + "epoch": 1.7681169541634658, + "grad_norm": 0.33907182336820874, + "learning_rate": 4.31468674328193e-05, + "loss": 2.7442, + "step": 37977 + }, + { + "epoch": 1.7681635123495587, + "grad_norm": 0.37048159944350856, + "learning_rate": 4.314418427034972e-05, + "loss": 2.7752, + "step": 37978 + }, + { + "epoch": 1.7682100705356518, + "grad_norm": 0.3613627473869381, + "learning_rate": 4.3141501128001305e-05, + "loss": 2.6203, + "step": 37979 + }, + { + "epoch": 1.768256628721745, + "grad_norm": 0.32122738961546154, + "learning_rate": 4.31388180057819e-05, + "loss": 2.7436, + "step": 37980 + }, + { + "epoch": 1.768303186907838, + "grad_norm": 0.35330344281861864, + "learning_rate": 4.3136134903699385e-05, + "loss": 2.7134, + "step": 37981 + }, + { + "epoch": 1.7683497450939312, + "grad_norm": 0.35786862348736764, + "learning_rate": 4.313345182176165e-05, + "loss": 2.7462, + "step": 37982 + }, + { + "epoch": 1.7683963032800243, + "grad_norm": 0.3410904580263241, + "learning_rate": 4.313076875997653e-05, + "loss": 2.7871, + "step": 37983 + }, + { + "epoch": 1.7684428614661174, + "grad_norm": 0.3483509388837474, + "learning_rate": 4.312808571835195e-05, + "loss": 2.6637, + "step": 37984 + }, + { + "epoch": 1.7684894196522105, + "grad_norm": 0.33033487787965926, + "learning_rate": 4.312540269689574e-05, + "loss": 2.6467, + "step": 37985 + }, + { + "epoch": 1.7685359778383034, + "grad_norm": 0.35648006604699184, + "learning_rate": 4.312271969561582e-05, + "loss": 2.7584, + "step": 37986 + }, + { + "epoch": 1.7685825360243965, + 
"grad_norm": 0.3318630120273703, + "learning_rate": 4.312003671452004e-05, + "loss": 2.6816, + "step": 37987 + }, + { + "epoch": 1.7686290942104894, + "grad_norm": 0.31672472531438295, + "learning_rate": 4.3117353753616245e-05, + "loss": 2.6908, + "step": 37988 + }, + { + "epoch": 1.7686756523965825, + "grad_norm": 0.3454229041547648, + "learning_rate": 4.311467081291237e-05, + "loss": 2.72, + "step": 37989 + }, + { + "epoch": 1.7687222105826756, + "grad_norm": 0.30540009339044877, + "learning_rate": 4.311198789241624e-05, + "loss": 2.6522, + "step": 37990 + }, + { + "epoch": 1.7687687687687688, + "grad_norm": 0.32429431478587856, + "learning_rate": 4.310930499213575e-05, + "loss": 2.6804, + "step": 37991 + }, + { + "epoch": 1.7688153269548619, + "grad_norm": 0.3272318139764786, + "learning_rate": 4.310662211207879e-05, + "loss": 2.669, + "step": 37992 + }, + { + "epoch": 1.768861885140955, + "grad_norm": 0.32922861484908594, + "learning_rate": 4.310393925225319e-05, + "loss": 2.7716, + "step": 37993 + }, + { + "epoch": 1.768908443327048, + "grad_norm": 0.3223294006523891, + "learning_rate": 4.310125641266687e-05, + "loss": 2.7267, + "step": 37994 + }, + { + "epoch": 1.768955001513141, + "grad_norm": 0.3394371660422107, + "learning_rate": 4.3098573593327674e-05, + "loss": 2.7151, + "step": 37995 + }, + { + "epoch": 1.7690015596992341, + "grad_norm": 0.32264961890145394, + "learning_rate": 4.309589079424347e-05, + "loss": 2.6599, + "step": 37996 + }, + { + "epoch": 1.7690481178853272, + "grad_norm": 0.34054347528293, + "learning_rate": 4.3093208015422164e-05, + "loss": 2.6021, + "step": 37997 + }, + { + "epoch": 1.7690946760714201, + "grad_norm": 0.31437487516643525, + "learning_rate": 4.3090525256871596e-05, + "loss": 2.6832, + "step": 37998 + }, + { + "epoch": 1.7691412342575132, + "grad_norm": 0.364572904083631, + "learning_rate": 4.308784251859967e-05, + "loss": 2.5798, + "step": 37999 + }, + { + "epoch": 1.7691877924436064, + "grad_norm": 0.36106579664565525, + "learning_rate": 4.308515980061425e-05, + "loss": 2.5165, + "step": 38000 + }, + { + "epoch": 1.7692343506296995, + "grad_norm": 0.35585471639836486, + "learning_rate": 4.308247710292318e-05, + "loss": 2.5926, + "step": 38001 + }, + { + "epoch": 1.7692809088157926, + "grad_norm": 0.3890110414624839, + "learning_rate": 4.307979442553438e-05, + "loss": 2.6445, + "step": 38002 + }, + { + "epoch": 1.7693274670018857, + "grad_norm": 0.35452342293897965, + "learning_rate": 4.307711176845569e-05, + "loss": 2.6329, + "step": 38003 + }, + { + "epoch": 1.7693740251879788, + "grad_norm": 0.38598015043518236, + "learning_rate": 4.3074429131695004e-05, + "loss": 2.62, + "step": 38004 + }, + { + "epoch": 1.7694205833740717, + "grad_norm": 0.3457556332551757, + "learning_rate": 4.3071746515260186e-05, + "loss": 2.7108, + "step": 38005 + }, + { + "epoch": 1.7694671415601648, + "grad_norm": 0.36971510313952, + "learning_rate": 4.30690639191591e-05, + "loss": 2.7636, + "step": 38006 + }, + { + "epoch": 1.769513699746258, + "grad_norm": 0.3463956610398886, + "learning_rate": 4.3066381343399645e-05, + "loss": 2.6838, + "step": 38007 + }, + { + "epoch": 1.7695602579323508, + "grad_norm": 0.3492179153756775, + "learning_rate": 4.306369878798966e-05, + "loss": 2.6256, + "step": 38008 + }, + { + "epoch": 1.769606816118444, + "grad_norm": 0.33981721526579545, + "learning_rate": 4.3061016252937045e-05, + "loss": 2.6474, + "step": 38009 + }, + { + "epoch": 1.769653374304537, + "grad_norm": 0.3396266281323958, + "learning_rate": 4.3058333738249676e-05, + 
"loss": 2.6689, + "step": 38010 + }, + { + "epoch": 1.7696999324906302, + "grad_norm": 0.33544917489074916, + "learning_rate": 4.305565124393538e-05, + "loss": 2.8076, + "step": 38011 + }, + { + "epoch": 1.7697464906767233, + "grad_norm": 0.3608024096661801, + "learning_rate": 4.305296877000209e-05, + "loss": 2.7525, + "step": 38012 + }, + { + "epoch": 1.7697930488628164, + "grad_norm": 0.3554956446739909, + "learning_rate": 4.3050286316457654e-05, + "loss": 2.6793, + "step": 38013 + }, + { + "epoch": 1.7698396070489095, + "grad_norm": 0.3202972886447859, + "learning_rate": 4.304760388330992e-05, + "loss": 2.6151, + "step": 38014 + }, + { + "epoch": 1.7698861652350024, + "grad_norm": 0.3286446354344958, + "learning_rate": 4.3044921470566805e-05, + "loss": 2.7198, + "step": 38015 + }, + { + "epoch": 1.7699327234210955, + "grad_norm": 0.33712249627599433, + "learning_rate": 4.304223907823615e-05, + "loss": 2.8477, + "step": 38016 + }, + { + "epoch": 1.7699792816071884, + "grad_norm": 0.3341167814189885, + "learning_rate": 4.303955670632584e-05, + "loss": 2.7289, + "step": 38017 + }, + { + "epoch": 1.7700258397932815, + "grad_norm": 0.3345640715825183, + "learning_rate": 4.3036874354843755e-05, + "loss": 2.7347, + "step": 38018 + }, + { + "epoch": 1.7700723979793747, + "grad_norm": 0.3422339780642405, + "learning_rate": 4.303419202379774e-05, + "loss": 2.7077, + "step": 38019 + }, + { + "epoch": 1.7701189561654678, + "grad_norm": 0.34843277136717765, + "learning_rate": 4.3031509713195705e-05, + "loss": 2.7126, + "step": 38020 + }, + { + "epoch": 1.7701655143515609, + "grad_norm": 0.35350459976860377, + "learning_rate": 4.302882742304548e-05, + "loss": 2.7385, + "step": 38021 + }, + { + "epoch": 1.770212072537654, + "grad_norm": 0.3280661113800942, + "learning_rate": 4.302614515335498e-05, + "loss": 2.5199, + "step": 38022 + }, + { + "epoch": 1.7702586307237471, + "grad_norm": 0.36134526280316176, + "learning_rate": 4.302346290413205e-05, + "loss": 2.7403, + "step": 38023 + }, + { + "epoch": 1.7703051889098402, + "grad_norm": 0.3544518605578286, + "learning_rate": 4.302078067538456e-05, + "loss": 2.7212, + "step": 38024 + }, + { + "epoch": 1.7703517470959331, + "grad_norm": 0.336131257590018, + "learning_rate": 4.3018098467120404e-05, + "loss": 2.6358, + "step": 38025 + }, + { + "epoch": 1.7703983052820262, + "grad_norm": 0.3408126120849638, + "learning_rate": 4.301541627934742e-05, + "loss": 2.6821, + "step": 38026 + }, + { + "epoch": 1.7704448634681191, + "grad_norm": 0.33432593911380365, + "learning_rate": 4.301273411207353e-05, + "loss": 2.6261, + "step": 38027 + }, + { + "epoch": 1.7704914216542122, + "grad_norm": 0.33363603308885453, + "learning_rate": 4.3010051965306566e-05, + "loss": 2.6577, + "step": 38028 + }, + { + "epoch": 1.7705379798403054, + "grad_norm": 0.3275803488495721, + "learning_rate": 4.3007369839054405e-05, + "loss": 2.7667, + "step": 38029 + }, + { + "epoch": 1.7705845380263985, + "grad_norm": 0.3525222205795181, + "learning_rate": 4.300468773332493e-05, + "loss": 2.6921, + "step": 38030 + }, + { + "epoch": 1.7706310962124916, + "grad_norm": 0.35102147717495197, + "learning_rate": 4.300200564812601e-05, + "loss": 2.7759, + "step": 38031 + }, + { + "epoch": 1.7706776543985847, + "grad_norm": 0.34316469987250037, + "learning_rate": 4.29993235834655e-05, + "loss": 2.6568, + "step": 38032 + }, + { + "epoch": 1.7707242125846778, + "grad_norm": 0.38012051968394067, + "learning_rate": 4.299664153935131e-05, + "loss": 2.6834, + "step": 38033 + }, + { + "epoch": 
1.7707707707707707, + "grad_norm": 0.3507534479222576, + "learning_rate": 4.299395951579126e-05, + "loss": 2.706, + "step": 38034 + }, + { + "epoch": 1.7708173289568638, + "grad_norm": 0.3356341594015676, + "learning_rate": 4.299127751279327e-05, + "loss": 2.5888, + "step": 38035 + }, + { + "epoch": 1.770863887142957, + "grad_norm": 0.36221441189011505, + "learning_rate": 4.298859553036519e-05, + "loss": 2.721, + "step": 38036 + }, + { + "epoch": 1.7709104453290498, + "grad_norm": 0.35268010006561046, + "learning_rate": 4.298591356851487e-05, + "loss": 2.6497, + "step": 38037 + }, + { + "epoch": 1.770957003515143, + "grad_norm": 0.3074881622617025, + "learning_rate": 4.298323162725022e-05, + "loss": 2.5451, + "step": 38038 + }, + { + "epoch": 1.771003561701236, + "grad_norm": 0.35474310003283277, + "learning_rate": 4.2980549706579076e-05, + "loss": 2.7113, + "step": 38039 + }, + { + "epoch": 1.7710501198873292, + "grad_norm": 0.3260423385450327, + "learning_rate": 4.297786780650934e-05, + "loss": 2.8152, + "step": 38040 + }, + { + "epoch": 1.7710966780734223, + "grad_norm": 0.35920733279370765, + "learning_rate": 4.2975185927048884e-05, + "loss": 2.7568, + "step": 38041 + }, + { + "epoch": 1.7711432362595154, + "grad_norm": 0.3356589489786736, + "learning_rate": 4.2972504068205546e-05, + "loss": 2.6812, + "step": 38042 + }, + { + "epoch": 1.7711897944456085, + "grad_norm": 0.3757298484455062, + "learning_rate": 4.296982222998722e-05, + "loss": 2.7524, + "step": 38043 + }, + { + "epoch": 1.7712363526317014, + "grad_norm": 0.34409541004347693, + "learning_rate": 4.2967140412401775e-05, + "loss": 2.7134, + "step": 38044 + }, + { + "epoch": 1.7712829108177945, + "grad_norm": 0.37393356100094133, + "learning_rate": 4.2964458615457084e-05, + "loss": 2.6459, + "step": 38045 + }, + { + "epoch": 1.7713294690038877, + "grad_norm": 0.34192425264985143, + "learning_rate": 4.2961776839161016e-05, + "loss": 2.7137, + "step": 38046 + }, + { + "epoch": 1.7713760271899806, + "grad_norm": 0.3671552673154744, + "learning_rate": 4.295909508352142e-05, + "loss": 2.7417, + "step": 38047 + }, + { + "epoch": 1.7714225853760737, + "grad_norm": 0.32845235191872024, + "learning_rate": 4.295641334854621e-05, + "loss": 2.8855, + "step": 38048 + }, + { + "epoch": 1.7714691435621668, + "grad_norm": 0.35296030237563575, + "learning_rate": 4.2953731634243234e-05, + "loss": 2.5732, + "step": 38049 + }, + { + "epoch": 1.77151570174826, + "grad_norm": 0.3465430392090641, + "learning_rate": 4.2951049940620333e-05, + "loss": 2.6377, + "step": 38050 + }, + { + "epoch": 1.771562259934353, + "grad_norm": 0.37317452947306956, + "learning_rate": 4.2948368267685426e-05, + "loss": 2.6531, + "step": 38051 + }, + { + "epoch": 1.7716088181204461, + "grad_norm": 0.32097655620972926, + "learning_rate": 4.294568661544635e-05, + "loss": 2.6065, + "step": 38052 + }, + { + "epoch": 1.7716553763065392, + "grad_norm": 0.3737270450689531, + "learning_rate": 4.294300498391101e-05, + "loss": 2.6552, + "step": 38053 + }, + { + "epoch": 1.7717019344926321, + "grad_norm": 0.38362329337460305, + "learning_rate": 4.294032337308725e-05, + "loss": 2.6746, + "step": 38054 + }, + { + "epoch": 1.7717484926787253, + "grad_norm": 0.3396317562193964, + "learning_rate": 4.293764178298294e-05, + "loss": 2.703, + "step": 38055 + }, + { + "epoch": 1.7717950508648181, + "grad_norm": 0.369845717828988, + "learning_rate": 4.293496021360595e-05, + "loss": 2.7051, + "step": 38056 + }, + { + "epoch": 1.7718416090509113, + "grad_norm": 0.3331079905104515, + 
"learning_rate": 4.293227866496416e-05, + "loss": 2.6622, + "step": 38057 + }, + { + "epoch": 1.7718881672370044, + "grad_norm": 0.36657565999026415, + "learning_rate": 4.292959713706544e-05, + "loss": 2.6939, + "step": 38058 + }, + { + "epoch": 1.7719347254230975, + "grad_norm": 0.33679383133440544, + "learning_rate": 4.2926915629917666e-05, + "loss": 2.6169, + "step": 38059 + }, + { + "epoch": 1.7719812836091906, + "grad_norm": 0.3117717218956891, + "learning_rate": 4.292423414352867e-05, + "loss": 2.7713, + "step": 38060 + }, + { + "epoch": 1.7720278417952837, + "grad_norm": 0.34594906485010335, + "learning_rate": 4.2921552677906375e-05, + "loss": 2.6598, + "step": 38061 + }, + { + "epoch": 1.7720743999813768, + "grad_norm": 0.35527444349468484, + "learning_rate": 4.29188712330586e-05, + "loss": 2.74, + "step": 38062 + }, + { + "epoch": 1.77212095816747, + "grad_norm": 0.32331942387095464, + "learning_rate": 4.2916189808993264e-05, + "loss": 2.6848, + "step": 38063 + }, + { + "epoch": 1.7721675163535628, + "grad_norm": 0.34582357163874144, + "learning_rate": 4.291350840571821e-05, + "loss": 2.6862, + "step": 38064 + }, + { + "epoch": 1.772214074539656, + "grad_norm": 0.3275312977805747, + "learning_rate": 4.291082702324129e-05, + "loss": 2.591, + "step": 38065 + }, + { + "epoch": 1.7722606327257489, + "grad_norm": 0.35916491001054474, + "learning_rate": 4.290814566157042e-05, + "loss": 2.7186, + "step": 38066 + }, + { + "epoch": 1.772307190911842, + "grad_norm": 0.3389343629730495, + "learning_rate": 4.290546432071344e-05, + "loss": 2.7443, + "step": 38067 + }, + { + "epoch": 1.772353749097935, + "grad_norm": 0.35425940470946543, + "learning_rate": 4.290278300067821e-05, + "loss": 2.6543, + "step": 38068 + }, + { + "epoch": 1.7724003072840282, + "grad_norm": 0.35217358703091817, + "learning_rate": 4.2900101701472614e-05, + "loss": 2.6405, + "step": 38069 + }, + { + "epoch": 1.7724468654701213, + "grad_norm": 0.35451542837786226, + "learning_rate": 4.2897420423104526e-05, + "loss": 2.7262, + "step": 38070 + }, + { + "epoch": 1.7724934236562144, + "grad_norm": 0.3285720120885355, + "learning_rate": 4.289473916558182e-05, + "loss": 2.7746, + "step": 38071 + }, + { + "epoch": 1.7725399818423075, + "grad_norm": 0.3856349375250355, + "learning_rate": 4.289205792891235e-05, + "loss": 2.7807, + "step": 38072 + }, + { + "epoch": 1.7725865400284007, + "grad_norm": 0.36549185380528976, + "learning_rate": 4.288937671310397e-05, + "loss": 2.6478, + "step": 38073 + }, + { + "epoch": 1.7726330982144936, + "grad_norm": 0.35425348908929716, + "learning_rate": 4.288669551816459e-05, + "loss": 2.6852, + "step": 38074 + }, + { + "epoch": 1.7726796564005867, + "grad_norm": 0.33515830057672336, + "learning_rate": 4.288401434410203e-05, + "loss": 2.5976, + "step": 38075 + }, + { + "epoch": 1.7727262145866796, + "grad_norm": 0.32858451839561076, + "learning_rate": 4.288133319092422e-05, + "loss": 2.6719, + "step": 38076 + }, + { + "epoch": 1.7727727727727727, + "grad_norm": 0.32124213769639093, + "learning_rate": 4.287865205863899e-05, + "loss": 2.6764, + "step": 38077 + }, + { + "epoch": 1.7728193309588658, + "grad_norm": 0.32368326046872287, + "learning_rate": 4.287597094725418e-05, + "loss": 2.6869, + "step": 38078 + }, + { + "epoch": 1.772865889144959, + "grad_norm": 0.35672250000622774, + "learning_rate": 4.2873289856777734e-05, + "loss": 2.6263, + "step": 38079 + }, + { + "epoch": 1.772912447331052, + "grad_norm": 0.3338459551753139, + "learning_rate": 4.2870608787217464e-05, + "loss": 2.7466, + "step": 
38080 + }, + { + "epoch": 1.7729590055171451, + "grad_norm": 0.35867767231006925, + "learning_rate": 4.2867927738581265e-05, + "loss": 2.6755, + "step": 38081 + }, + { + "epoch": 1.7730055637032383, + "grad_norm": 0.35929861652828615, + "learning_rate": 4.2865246710876985e-05, + "loss": 2.6417, + "step": 38082 + }, + { + "epoch": 1.7730521218893311, + "grad_norm": 0.35494301022587305, + "learning_rate": 4.2862565704112505e-05, + "loss": 2.677, + "step": 38083 + }, + { + "epoch": 1.7730986800754243, + "grad_norm": 0.34988207654467773, + "learning_rate": 4.28598847182957e-05, + "loss": 2.6652, + "step": 38084 + }, + { + "epoch": 1.7731452382615174, + "grad_norm": 0.34579487818360205, + "learning_rate": 4.2857203753434426e-05, + "loss": 2.7374, + "step": 38085 + }, + { + "epoch": 1.7731917964476103, + "grad_norm": 0.3342333106605901, + "learning_rate": 4.285452280953654e-05, + "loss": 2.7121, + "step": 38086 + }, + { + "epoch": 1.7732383546337034, + "grad_norm": 0.37520252139441623, + "learning_rate": 4.2851841886609946e-05, + "loss": 2.7732, + "step": 38087 + }, + { + "epoch": 1.7732849128197965, + "grad_norm": 0.3309812871467807, + "learning_rate": 4.2849160984662466e-05, + "loss": 2.7952, + "step": 38088 + }, + { + "epoch": 1.7733314710058896, + "grad_norm": 0.37347905185259134, + "learning_rate": 4.284648010370202e-05, + "loss": 2.6943, + "step": 38089 + }, + { + "epoch": 1.7733780291919827, + "grad_norm": 0.3750886874647674, + "learning_rate": 4.284379924373645e-05, + "loss": 2.6911, + "step": 38090 + }, + { + "epoch": 1.7734245873780758, + "grad_norm": 0.3360063447128884, + "learning_rate": 4.284111840477361e-05, + "loss": 2.789, + "step": 38091 + }, + { + "epoch": 1.773471145564169, + "grad_norm": 0.39360644872304174, + "learning_rate": 4.283843758682139e-05, + "loss": 2.6141, + "step": 38092 + }, + { + "epoch": 1.7735177037502619, + "grad_norm": 0.36446811701716025, + "learning_rate": 4.2835756789887646e-05, + "loss": 2.6666, + "step": 38093 + }, + { + "epoch": 1.773564261936355, + "grad_norm": 0.3933412496302285, + "learning_rate": 4.2833076013980254e-05, + "loss": 2.6912, + "step": 38094 + }, + { + "epoch": 1.773610820122448, + "grad_norm": 0.36633258789099243, + "learning_rate": 4.2830395259107076e-05, + "loss": 2.7331, + "step": 38095 + }, + { + "epoch": 1.773657378308541, + "grad_norm": 0.3751577046159941, + "learning_rate": 4.282771452527598e-05, + "loss": 2.7113, + "step": 38096 + }, + { + "epoch": 1.773703936494634, + "grad_norm": 0.3681504200352804, + "learning_rate": 4.282503381249484e-05, + "loss": 2.6455, + "step": 38097 + }, + { + "epoch": 1.7737504946807272, + "grad_norm": 0.33659638393684704, + "learning_rate": 4.28223531207715e-05, + "loss": 2.7153, + "step": 38098 + }, + { + "epoch": 1.7737970528668203, + "grad_norm": 0.4194071824127817, + "learning_rate": 4.2819672450113873e-05, + "loss": 2.6144, + "step": 38099 + }, + { + "epoch": 1.7738436110529134, + "grad_norm": 0.3509037637592641, + "learning_rate": 4.281699180052979e-05, + "loss": 2.7315, + "step": 38100 + }, + { + "epoch": 1.7738901692390066, + "grad_norm": 0.39309533527356355, + "learning_rate": 4.2814311172027124e-05, + "loss": 2.6393, + "step": 38101 + }, + { + "epoch": 1.7739367274250997, + "grad_norm": 0.3317828942115599, + "learning_rate": 4.2811630564613756e-05, + "loss": 2.7695, + "step": 38102 + }, + { + "epoch": 1.7739832856111926, + "grad_norm": 0.348232247008758, + "learning_rate": 4.280894997829754e-05, + "loss": 2.6635, + "step": 38103 + }, + { + "epoch": 1.7740298437972857, + "grad_norm": 
0.3503156130286741, + "learning_rate": 4.280626941308634e-05, + "loss": 2.7345, + "step": 38104 + }, + { + "epoch": 1.7740764019833786, + "grad_norm": 0.3487343518495946, + "learning_rate": 4.280358886898804e-05, + "loss": 2.6786, + "step": 38105 + }, + { + "epoch": 1.7741229601694717, + "grad_norm": 0.3631086850487129, + "learning_rate": 4.280090834601049e-05, + "loss": 2.6924, + "step": 38106 + }, + { + "epoch": 1.7741695183555648, + "grad_norm": 0.3265400200552329, + "learning_rate": 4.2798227844161573e-05, + "loss": 2.645, + "step": 38107 + }, + { + "epoch": 1.774216076541658, + "grad_norm": 0.3578117427331477, + "learning_rate": 4.279554736344915e-05, + "loss": 2.7167, + "step": 38108 + }, + { + "epoch": 1.774262634727751, + "grad_norm": 0.34095693642881814, + "learning_rate": 4.279286690388107e-05, + "loss": 2.7265, + "step": 38109 + }, + { + "epoch": 1.7743091929138441, + "grad_norm": 0.3554942155815515, + "learning_rate": 4.279018646546523e-05, + "loss": 2.6151, + "step": 38110 + }, + { + "epoch": 1.7743557510999373, + "grad_norm": 0.3744006243166911, + "learning_rate": 4.278750604820946e-05, + "loss": 2.7726, + "step": 38111 + }, + { + "epoch": 1.7744023092860304, + "grad_norm": 0.3681538353242271, + "learning_rate": 4.278482565212168e-05, + "loss": 2.5938, + "step": 38112 + }, + { + "epoch": 1.7744488674721233, + "grad_norm": 0.38027007668132157, + "learning_rate": 4.278214527720972e-05, + "loss": 2.686, + "step": 38113 + }, + { + "epoch": 1.7744954256582164, + "grad_norm": 0.38865760339925354, + "learning_rate": 4.277946492348143e-05, + "loss": 2.729, + "step": 38114 + }, + { + "epoch": 1.7745419838443093, + "grad_norm": 0.38213346207816284, + "learning_rate": 4.277678459094472e-05, + "loss": 2.6639, + "step": 38115 + }, + { + "epoch": 1.7745885420304024, + "grad_norm": 0.36234681940899005, + "learning_rate": 4.277410427960742e-05, + "loss": 2.6307, + "step": 38116 + }, + { + "epoch": 1.7746351002164955, + "grad_norm": 0.3827447213144955, + "learning_rate": 4.2771423989477426e-05, + "loss": 2.6328, + "step": 38117 + }, + { + "epoch": 1.7746816584025886, + "grad_norm": 0.3908758607325976, + "learning_rate": 4.27687437205626e-05, + "loss": 2.5665, + "step": 38118 + }, + { + "epoch": 1.7747282165886817, + "grad_norm": 0.33258326997277593, + "learning_rate": 4.2766063472870775e-05, + "loss": 2.6804, + "step": 38119 + }, + { + "epoch": 1.7747747747747749, + "grad_norm": 0.3565405290151055, + "learning_rate": 4.276338324640986e-05, + "loss": 2.656, + "step": 38120 + }, + { + "epoch": 1.774821332960868, + "grad_norm": 0.3754073041841266, + "learning_rate": 4.276070304118769e-05, + "loss": 2.599, + "step": 38121 + }, + { + "epoch": 1.7748678911469609, + "grad_norm": 0.33601593604377106, + "learning_rate": 4.275802285721214e-05, + "loss": 2.7273, + "step": 38122 + }, + { + "epoch": 1.774914449333054, + "grad_norm": 0.39320805863245817, + "learning_rate": 4.27553426944911e-05, + "loss": 2.7831, + "step": 38123 + }, + { + "epoch": 1.774961007519147, + "grad_norm": 0.36935266676115847, + "learning_rate": 4.275266255303238e-05, + "loss": 2.7016, + "step": 38124 + }, + { + "epoch": 1.77500756570524, + "grad_norm": 0.3515046477951902, + "learning_rate": 4.274998243284392e-05, + "loss": 2.7334, + "step": 38125 + }, + { + "epoch": 1.775054123891333, + "grad_norm": 0.3594217615190484, + "learning_rate": 4.274730233393354e-05, + "loss": 2.7224, + "step": 38126 + }, + { + "epoch": 1.7751006820774262, + "grad_norm": 0.3636328012506191, + "learning_rate": 4.2744622256309084e-05, + "loss": 2.7646, + 
"step": 38127 + }, + { + "epoch": 1.7751472402635193, + "grad_norm": 0.3733176575148366, + "learning_rate": 4.2741942199978475e-05, + "loss": 2.643, + "step": 38128 + }, + { + "epoch": 1.7751937984496124, + "grad_norm": 0.3237619199174248, + "learning_rate": 4.273926216494953e-05, + "loss": 2.6327, + "step": 38129 + }, + { + "epoch": 1.7752403566357056, + "grad_norm": 0.35300401139316767, + "learning_rate": 4.2736582151230154e-05, + "loss": 2.6981, + "step": 38130 + }, + { + "epoch": 1.7752869148217987, + "grad_norm": 0.35389874765465285, + "learning_rate": 4.273390215882819e-05, + "loss": 2.5716, + "step": 38131 + }, + { + "epoch": 1.7753334730078916, + "grad_norm": 0.34351359693179245, + "learning_rate": 4.273122218775151e-05, + "loss": 2.7673, + "step": 38132 + }, + { + "epoch": 1.7753800311939847, + "grad_norm": 0.3455687995686763, + "learning_rate": 4.272854223800796e-05, + "loss": 2.6626, + "step": 38133 + }, + { + "epoch": 1.7754265893800778, + "grad_norm": 0.34576273403506225, + "learning_rate": 4.2725862309605436e-05, + "loss": 2.808, + "step": 38134 + }, + { + "epoch": 1.7754731475661707, + "grad_norm": 0.33676755440561557, + "learning_rate": 4.2723182402551794e-05, + "loss": 2.7011, + "step": 38135 + }, + { + "epoch": 1.7755197057522638, + "grad_norm": 0.3274671860516829, + "learning_rate": 4.2720502516854894e-05, + "loss": 2.6826, + "step": 38136 + }, + { + "epoch": 1.775566263938357, + "grad_norm": 0.3532467321663999, + "learning_rate": 4.271782265252258e-05, + "loss": 2.6902, + "step": 38137 + }, + { + "epoch": 1.77561282212445, + "grad_norm": 0.34006038138844147, + "learning_rate": 4.2715142809562766e-05, + "loss": 2.7761, + "step": 38138 + }, + { + "epoch": 1.7756593803105432, + "grad_norm": 0.38395710231150965, + "learning_rate": 4.271246298798328e-05, + "loss": 2.8627, + "step": 38139 + }, + { + "epoch": 1.7757059384966363, + "grad_norm": 0.3401643557195866, + "learning_rate": 4.2709783187791984e-05, + "loss": 2.6757, + "step": 38140 + }, + { + "epoch": 1.7757524966827294, + "grad_norm": 0.35690218783750344, + "learning_rate": 4.2707103408996775e-05, + "loss": 2.7457, + "step": 38141 + }, + { + "epoch": 1.7757990548688223, + "grad_norm": 0.3739111534529981, + "learning_rate": 4.270442365160548e-05, + "loss": 2.652, + "step": 38142 + }, + { + "epoch": 1.7758456130549154, + "grad_norm": 0.3365813172566251, + "learning_rate": 4.2701743915626e-05, + "loss": 2.6224, + "step": 38143 + }, + { + "epoch": 1.7758921712410083, + "grad_norm": 0.33662829602505157, + "learning_rate": 4.269906420106619e-05, + "loss": 2.7955, + "step": 38144 + }, + { + "epoch": 1.7759387294271014, + "grad_norm": 0.3596256437366767, + "learning_rate": 4.269638450793389e-05, + "loss": 2.694, + "step": 38145 + }, + { + "epoch": 1.7759852876131945, + "grad_norm": 0.34039848283552643, + "learning_rate": 4.269370483623698e-05, + "loss": 2.6955, + "step": 38146 + }, + { + "epoch": 1.7760318457992876, + "grad_norm": 0.33148460334129043, + "learning_rate": 4.269102518598333e-05, + "loss": 2.654, + "step": 38147 + }, + { + "epoch": 1.7760784039853807, + "grad_norm": 0.34435128995803593, + "learning_rate": 4.268834555718081e-05, + "loss": 2.6577, + "step": 38148 + }, + { + "epoch": 1.7761249621714739, + "grad_norm": 0.34376142085012806, + "learning_rate": 4.268566594983728e-05, + "loss": 2.7376, + "step": 38149 + }, + { + "epoch": 1.776171520357567, + "grad_norm": 0.34124128221643896, + "learning_rate": 4.2682986363960584e-05, + "loss": 2.7356, + "step": 38150 + }, + { + "epoch": 1.77621807854366, + "grad_norm": 
0.36680391993501427, + "learning_rate": 4.268030679955861e-05, + "loss": 2.6398, + "step": 38151 + }, + { + "epoch": 1.776264636729753, + "grad_norm": 0.32858173931124174, + "learning_rate": 4.26776272566392e-05, + "loss": 2.7196, + "step": 38152 + }, + { + "epoch": 1.776311194915846, + "grad_norm": 0.3637298584448939, + "learning_rate": 4.2674947735210255e-05, + "loss": 2.8216, + "step": 38153 + }, + { + "epoch": 1.776357753101939, + "grad_norm": 0.3688145639937907, + "learning_rate": 4.267226823527961e-05, + "loss": 2.7108, + "step": 38154 + }, + { + "epoch": 1.7764043112880321, + "grad_norm": 0.34839422121203034, + "learning_rate": 4.266958875685512e-05, + "loss": 2.6569, + "step": 38155 + }, + { + "epoch": 1.7764508694741252, + "grad_norm": 0.36681390681506226, + "learning_rate": 4.266690929994469e-05, + "loss": 2.6968, + "step": 38156 + }, + { + "epoch": 1.7764974276602183, + "grad_norm": 0.3673745018349859, + "learning_rate": 4.266422986455615e-05, + "loss": 2.6769, + "step": 38157 + }, + { + "epoch": 1.7765439858463115, + "grad_norm": 0.3372323645284596, + "learning_rate": 4.266155045069736e-05, + "loss": 2.7849, + "step": 38158 + }, + { + "epoch": 1.7765905440324046, + "grad_norm": 0.35604424225344533, + "learning_rate": 4.26588710583762e-05, + "loss": 2.6134, + "step": 38159 + }, + { + "epoch": 1.7766371022184977, + "grad_norm": 0.34776832089985876, + "learning_rate": 4.265619168760054e-05, + "loss": 2.7587, + "step": 38160 + }, + { + "epoch": 1.7766836604045908, + "grad_norm": 0.3606196743149715, + "learning_rate": 4.265351233837824e-05, + "loss": 2.7928, + "step": 38161 + }, + { + "epoch": 1.7767302185906837, + "grad_norm": 0.3427910381699684, + "learning_rate": 4.265083301071715e-05, + "loss": 2.744, + "step": 38162 + }, + { + "epoch": 1.7767767767767768, + "grad_norm": 0.3059254664394702, + "learning_rate": 4.264815370462513e-05, + "loss": 2.8022, + "step": 38163 + }, + { + "epoch": 1.7768233349628697, + "grad_norm": 0.37368229596392016, + "learning_rate": 4.2645474420110065e-05, + "loss": 2.5999, + "step": 38164 + }, + { + "epoch": 1.7768698931489628, + "grad_norm": 0.33452058457018513, + "learning_rate": 4.2642795157179795e-05, + "loss": 2.7418, + "step": 38165 + }, + { + "epoch": 1.776916451335056, + "grad_norm": 0.3303697336034993, + "learning_rate": 4.264011591584221e-05, + "loss": 2.7688, + "step": 38166 + }, + { + "epoch": 1.776963009521149, + "grad_norm": 0.3371053652374097, + "learning_rate": 4.2637436696105165e-05, + "loss": 2.6992, + "step": 38167 + }, + { + "epoch": 1.7770095677072422, + "grad_norm": 0.33156077123779326, + "learning_rate": 4.2634757497976493e-05, + "loss": 2.6964, + "step": 38168 + }, + { + "epoch": 1.7770561258933353, + "grad_norm": 0.3462068310361118, + "learning_rate": 4.263207832146411e-05, + "loss": 2.6969, + "step": 38169 + }, + { + "epoch": 1.7771026840794284, + "grad_norm": 0.34580150310825203, + "learning_rate": 4.262939916657584e-05, + "loss": 2.6535, + "step": 38170 + }, + { + "epoch": 1.7771492422655213, + "grad_norm": 0.33770232861413324, + "learning_rate": 4.262672003331957e-05, + "loss": 2.6983, + "step": 38171 + }, + { + "epoch": 1.7771958004516144, + "grad_norm": 0.3481377578225294, + "learning_rate": 4.262404092170313e-05, + "loss": 2.6764, + "step": 38172 + }, + { + "epoch": 1.7772423586377075, + "grad_norm": 0.3189451648956998, + "learning_rate": 4.2621361831734406e-05, + "loss": 2.6476, + "step": 38173 + }, + { + "epoch": 1.7772889168238004, + "grad_norm": 0.359187330771433, + "learning_rate": 4.261868276342127e-05, + "loss": 
2.7423, + "step": 38174 + }, + { + "epoch": 1.7773354750098935, + "grad_norm": 0.3360627457554468, + "learning_rate": 4.2616003716771574e-05, + "loss": 2.6811, + "step": 38175 + }, + { + "epoch": 1.7773820331959866, + "grad_norm": 0.3425884097646129, + "learning_rate": 4.261332469179316e-05, + "loss": 2.6246, + "step": 38176 + }, + { + "epoch": 1.7774285913820798, + "grad_norm": 0.3284412483311731, + "learning_rate": 4.261064568849393e-05, + "loss": 2.5622, + "step": 38177 + }, + { + "epoch": 1.7774751495681729, + "grad_norm": 0.38638437450877594, + "learning_rate": 4.26079667068817e-05, + "loss": 2.6053, + "step": 38178 + }, + { + "epoch": 1.777521707754266, + "grad_norm": 0.3283199874314988, + "learning_rate": 4.260528774696439e-05, + "loss": 2.6698, + "step": 38179 + }, + { + "epoch": 1.777568265940359, + "grad_norm": 0.38576603361883494, + "learning_rate": 4.260260880874983e-05, + "loss": 2.6662, + "step": 38180 + }, + { + "epoch": 1.777614824126452, + "grad_norm": 0.34612094744623456, + "learning_rate": 4.259992989224586e-05, + "loss": 2.6764, + "step": 38181 + }, + { + "epoch": 1.7776613823125451, + "grad_norm": 0.33391180759282707, + "learning_rate": 4.2597250997460384e-05, + "loss": 2.7235, + "step": 38182 + }, + { + "epoch": 1.7777079404986382, + "grad_norm": 0.3365400210913339, + "learning_rate": 4.2594572124401246e-05, + "loss": 2.6492, + "step": 38183 + }, + { + "epoch": 1.7777544986847311, + "grad_norm": 0.34701409217391327, + "learning_rate": 4.2591893273076316e-05, + "loss": 2.7263, + "step": 38184 + }, + { + "epoch": 1.7778010568708242, + "grad_norm": 0.31629477864469135, + "learning_rate": 4.2589214443493434e-05, + "loss": 2.6907, + "step": 38185 + }, + { + "epoch": 1.7778476150569174, + "grad_norm": 0.3783439393504346, + "learning_rate": 4.258653563566048e-05, + "loss": 2.7354, + "step": 38186 + }, + { + "epoch": 1.7778941732430105, + "grad_norm": 0.3403503566462009, + "learning_rate": 4.258385684958533e-05, + "loss": 2.6373, + "step": 38187 + }, + { + "epoch": 1.7779407314291036, + "grad_norm": 0.33403883134822815, + "learning_rate": 4.2581178085275806e-05, + "loss": 2.6755, + "step": 38188 + }, + { + "epoch": 1.7779872896151967, + "grad_norm": 0.3756417779763921, + "learning_rate": 4.2578499342739817e-05, + "loss": 2.6427, + "step": 38189 + }, + { + "epoch": 1.7780338478012898, + "grad_norm": 0.3221249020844435, + "learning_rate": 4.257582062198519e-05, + "loss": 2.573, + "step": 38190 + }, + { + "epoch": 1.7780804059873827, + "grad_norm": 0.34489164289849866, + "learning_rate": 4.257314192301979e-05, + "loss": 2.6939, + "step": 38191 + }, + { + "epoch": 1.7781269641734758, + "grad_norm": 0.3267874776711945, + "learning_rate": 4.2570463245851496e-05, + "loss": 2.7173, + "step": 38192 + }, + { + "epoch": 1.7781735223595687, + "grad_norm": 0.32738527616009505, + "learning_rate": 4.256778459048817e-05, + "loss": 2.6084, + "step": 38193 + }, + { + "epoch": 1.7782200805456618, + "grad_norm": 0.3490210465860399, + "learning_rate": 4.256510595693764e-05, + "loss": 2.7889, + "step": 38194 + }, + { + "epoch": 1.778266638731755, + "grad_norm": 0.3316164770438164, + "learning_rate": 4.256242734520781e-05, + "loss": 2.7206, + "step": 38195 + }, + { + "epoch": 1.778313196917848, + "grad_norm": 0.3536547779909007, + "learning_rate": 4.255974875530652e-05, + "loss": 2.7445, + "step": 38196 + }, + { + "epoch": 1.7783597551039412, + "grad_norm": 0.3081019739036264, + "learning_rate": 4.255707018724163e-05, + "loss": 2.5956, + "step": 38197 + }, + { + "epoch": 1.7784063132900343, + 
"grad_norm": 0.3304845963618207, + "learning_rate": 4.2554391641021005e-05, + "loss": 2.6475, + "step": 38198 + }, + { + "epoch": 1.7784528714761274, + "grad_norm": 0.3380088686158211, + "learning_rate": 4.2551713116652514e-05, + "loss": 2.6599, + "step": 38199 + }, + { + "epoch": 1.7784994296622205, + "grad_norm": 0.3404969720364084, + "learning_rate": 4.254903461414401e-05, + "loss": 2.7191, + "step": 38200 + }, + { + "epoch": 1.7785459878483134, + "grad_norm": 0.3548865047872326, + "learning_rate": 4.254635613350334e-05, + "loss": 2.6901, + "step": 38201 + }, + { + "epoch": 1.7785925460344065, + "grad_norm": 0.3155796487930674, + "learning_rate": 4.2543677674738403e-05, + "loss": 2.7138, + "step": 38202 + }, + { + "epoch": 1.7786391042204994, + "grad_norm": 0.3514696320218696, + "learning_rate": 4.254099923785703e-05, + "loss": 2.7407, + "step": 38203 + }, + { + "epoch": 1.7786856624065925, + "grad_norm": 0.3441778472019231, + "learning_rate": 4.253832082286707e-05, + "loss": 2.7125, + "step": 38204 + }, + { + "epoch": 1.7787322205926857, + "grad_norm": 0.34049078107835834, + "learning_rate": 4.2535642429776435e-05, + "loss": 2.647, + "step": 38205 + }, + { + "epoch": 1.7787787787787788, + "grad_norm": 0.34932173185541093, + "learning_rate": 4.2532964058592924e-05, + "loss": 2.6309, + "step": 38206 + }, + { + "epoch": 1.7788253369648719, + "grad_norm": 0.3480506595351697, + "learning_rate": 4.2530285709324446e-05, + "loss": 2.6317, + "step": 38207 + }, + { + "epoch": 1.778871895150965, + "grad_norm": 0.34146644316084096, + "learning_rate": 4.2527607381978854e-05, + "loss": 2.7303, + "step": 38208 + }, + { + "epoch": 1.7789184533370581, + "grad_norm": 0.31685737090432115, + "learning_rate": 4.252492907656398e-05, + "loss": 2.7406, + "step": 38209 + }, + { + "epoch": 1.778965011523151, + "grad_norm": 0.3882724525945799, + "learning_rate": 4.252225079308772e-05, + "loss": 2.6469, + "step": 38210 + }, + { + "epoch": 1.7790115697092441, + "grad_norm": 0.3357035674599209, + "learning_rate": 4.251957253155789e-05, + "loss": 2.7204, + "step": 38211 + }, + { + "epoch": 1.7790581278953372, + "grad_norm": 0.34994408866226273, + "learning_rate": 4.25168942919824e-05, + "loss": 2.7484, + "step": 38212 + }, + { + "epoch": 1.7791046860814301, + "grad_norm": 0.3218798229644836, + "learning_rate": 4.2514216074369084e-05, + "loss": 2.7172, + "step": 38213 + }, + { + "epoch": 1.7791512442675232, + "grad_norm": 0.33239700150633333, + "learning_rate": 4.2511537878725785e-05, + "loss": 2.7951, + "step": 38214 + }, + { + "epoch": 1.7791978024536164, + "grad_norm": 0.3283512162716026, + "learning_rate": 4.250885970506041e-05, + "loss": 2.6793, + "step": 38215 + }, + { + "epoch": 1.7792443606397095, + "grad_norm": 0.32400194494674484, + "learning_rate": 4.25061815533808e-05, + "loss": 2.7234, + "step": 38216 + }, + { + "epoch": 1.7792909188258026, + "grad_norm": 0.3274700569446154, + "learning_rate": 4.2503503423694776e-05, + "loss": 2.7555, + "step": 38217 + }, + { + "epoch": 1.7793374770118957, + "grad_norm": 0.3098035089989659, + "learning_rate": 4.250082531601026e-05, + "loss": 2.7471, + "step": 38218 + }, + { + "epoch": 1.7793840351979888, + "grad_norm": 0.3396192255336051, + "learning_rate": 4.249814723033505e-05, + "loss": 2.6791, + "step": 38219 + }, + { + "epoch": 1.7794305933840817, + "grad_norm": 0.34053454478514356, + "learning_rate": 4.249546916667708e-05, + "loss": 2.6984, + "step": 38220 + }, + { + "epoch": 1.7794771515701748, + "grad_norm": 0.3460126332116327, + "learning_rate": 
4.249279112504415e-05, + "loss": 2.6781, + "step": 38221 + }, + { + "epoch": 1.779523709756268, + "grad_norm": 0.34492345148013587, + "learning_rate": 4.249011310544414e-05, + "loss": 2.7676, + "step": 38222 + }, + { + "epoch": 1.7795702679423608, + "grad_norm": 0.3410131020757196, + "learning_rate": 4.248743510788492e-05, + "loss": 2.6587, + "step": 38223 + }, + { + "epoch": 1.779616826128454, + "grad_norm": 0.37505814755920414, + "learning_rate": 4.24847571323743e-05, + "loss": 2.6567, + "step": 38224 + }, + { + "epoch": 1.779663384314547, + "grad_norm": 0.3251739467935294, + "learning_rate": 4.248207917892021e-05, + "loss": 2.7973, + "step": 38225 + }, + { + "epoch": 1.7797099425006402, + "grad_norm": 0.35102173922262236, + "learning_rate": 4.247940124753048e-05, + "loss": 2.7214, + "step": 38226 + }, + { + "epoch": 1.7797565006867333, + "grad_norm": 0.3548446821037543, + "learning_rate": 4.247672333821294e-05, + "loss": 2.7849, + "step": 38227 + }, + { + "epoch": 1.7798030588728264, + "grad_norm": 0.35934876593895065, + "learning_rate": 4.24740454509755e-05, + "loss": 2.7914, + "step": 38228 + }, + { + "epoch": 1.7798496170589195, + "grad_norm": 0.34218768479819806, + "learning_rate": 4.2471367585825995e-05, + "loss": 2.663, + "step": 38229 + }, + { + "epoch": 1.7798961752450124, + "grad_norm": 0.344707604053262, + "learning_rate": 4.2468689742772264e-05, + "loss": 2.6683, + "step": 38230 + }, + { + "epoch": 1.7799427334311055, + "grad_norm": 0.3592871127215715, + "learning_rate": 4.246601192182221e-05, + "loss": 2.5927, + "step": 38231 + }, + { + "epoch": 1.7799892916171984, + "grad_norm": 0.32850799061146585, + "learning_rate": 4.246333412298364e-05, + "loss": 2.7199, + "step": 38232 + }, + { + "epoch": 1.7800358498032915, + "grad_norm": 0.35973271405012197, + "learning_rate": 4.2460656346264465e-05, + "loss": 2.7757, + "step": 38233 + }, + { + "epoch": 1.7800824079893847, + "grad_norm": 0.34299681883925554, + "learning_rate": 4.245797859167252e-05, + "loss": 2.6982, + "step": 38234 + }, + { + "epoch": 1.7801289661754778, + "grad_norm": 0.36277317542038756, + "learning_rate": 4.2455300859215666e-05, + "loss": 2.6142, + "step": 38235 + }, + { + "epoch": 1.780175524361571, + "grad_norm": 0.35036187322587975, + "learning_rate": 4.245262314890176e-05, + "loss": 2.6058, + "step": 38236 + }, + { + "epoch": 1.780222082547664, + "grad_norm": 0.34222281380304975, + "learning_rate": 4.2449945460738637e-05, + "loss": 2.5577, + "step": 38237 + }, + { + "epoch": 1.7802686407337571, + "grad_norm": 0.3390955319479467, + "learning_rate": 4.244726779473421e-05, + "loss": 2.6375, + "step": 38238 + }, + { + "epoch": 1.7803151989198502, + "grad_norm": 0.3606014861332944, + "learning_rate": 4.244459015089631e-05, + "loss": 2.6923, + "step": 38239 + }, + { + "epoch": 1.7803617571059431, + "grad_norm": 0.3065692526577642, + "learning_rate": 4.244191252923276e-05, + "loss": 2.6738, + "step": 38240 + }, + { + "epoch": 1.7804083152920362, + "grad_norm": 0.3397355039223334, + "learning_rate": 4.243923492975148e-05, + "loss": 2.6696, + "step": 38241 + }, + { + "epoch": 1.7804548734781291, + "grad_norm": 0.3411478929005035, + "learning_rate": 4.2436557352460284e-05, + "loss": 2.7505, + "step": 38242 + }, + { + "epoch": 1.7805014316642223, + "grad_norm": 0.3045863130253806, + "learning_rate": 4.2433879797367065e-05, + "loss": 2.7172, + "step": 38243 + }, + { + "epoch": 1.7805479898503154, + "grad_norm": 0.3469599071139092, + "learning_rate": 4.243120226447966e-05, + "loss": 2.7598, + "step": 38244 + }, + { + 
"epoch": 1.7805945480364085, + "grad_norm": 0.3298388663058621, + "learning_rate": 4.2428524753805914e-05, + "loss": 2.7973, + "step": 38245 + }, + { + "epoch": 1.7806411062225016, + "grad_norm": 0.35270643390060724, + "learning_rate": 4.242584726535372e-05, + "loss": 2.6636, + "step": 38246 + }, + { + "epoch": 1.7806876644085947, + "grad_norm": 0.33739892325070275, + "learning_rate": 4.2423169799130916e-05, + "loss": 2.7532, + "step": 38247 + }, + { + "epoch": 1.7807342225946878, + "grad_norm": 0.3384892899529214, + "learning_rate": 4.242049235514536e-05, + "loss": 2.638, + "step": 38248 + }, + { + "epoch": 1.7807807807807807, + "grad_norm": 0.33730630209168466, + "learning_rate": 4.241781493340492e-05, + "loss": 2.657, + "step": 38249 + }, + { + "epoch": 1.7808273389668738, + "grad_norm": 0.32100767609163605, + "learning_rate": 4.241513753391742e-05, + "loss": 2.7849, + "step": 38250 + }, + { + "epoch": 1.780873897152967, + "grad_norm": 0.3371811366791398, + "learning_rate": 4.241246015669077e-05, + "loss": 2.6425, + "step": 38251 + }, + { + "epoch": 1.7809204553390598, + "grad_norm": 0.3403148173507206, + "learning_rate": 4.24097828017328e-05, + "loss": 2.7002, + "step": 38252 + }, + { + "epoch": 1.780967013525153, + "grad_norm": 0.3215735453621765, + "learning_rate": 4.240710546905135e-05, + "loss": 2.7431, + "step": 38253 + }, + { + "epoch": 1.781013571711246, + "grad_norm": 0.3574788361839946, + "learning_rate": 4.240442815865432e-05, + "loss": 2.6598, + "step": 38254 + }, + { + "epoch": 1.7810601298973392, + "grad_norm": 0.31879088426100766, + "learning_rate": 4.240175087054952e-05, + "loss": 2.6962, + "step": 38255 + }, + { + "epoch": 1.7811066880834323, + "grad_norm": 0.3320532184971392, + "learning_rate": 4.2399073604744856e-05, + "loss": 2.7896, + "step": 38256 + }, + { + "epoch": 1.7811532462695254, + "grad_norm": 0.37013625277112033, + "learning_rate": 4.2396396361248156e-05, + "loss": 2.66, + "step": 38257 + }, + { + "epoch": 1.7811998044556185, + "grad_norm": 0.3175138406779677, + "learning_rate": 4.2393719140067264e-05, + "loss": 2.6406, + "step": 38258 + }, + { + "epoch": 1.7812463626417114, + "grad_norm": 0.3321612543163793, + "learning_rate": 4.239104194121009e-05, + "loss": 2.669, + "step": 38259 + }, + { + "epoch": 1.7812929208278045, + "grad_norm": 0.3179009208338168, + "learning_rate": 4.2388364764684435e-05, + "loss": 2.6999, + "step": 38260 + }, + { + "epoch": 1.7813394790138977, + "grad_norm": 0.34468568012380413, + "learning_rate": 4.23856876104982e-05, + "loss": 2.7197, + "step": 38261 + }, + { + "epoch": 1.7813860371999906, + "grad_norm": 0.3450764968420649, + "learning_rate": 4.238301047865921e-05, + "loss": 2.79, + "step": 38262 + }, + { + "epoch": 1.7814325953860837, + "grad_norm": 0.3546278906172485, + "learning_rate": 4.238033336917532e-05, + "loss": 2.6924, + "step": 38263 + }, + { + "epoch": 1.7814791535721768, + "grad_norm": 0.3342985499003474, + "learning_rate": 4.237765628205442e-05, + "loss": 2.701, + "step": 38264 + }, + { + "epoch": 1.78152571175827, + "grad_norm": 0.34224735697789954, + "learning_rate": 4.237497921730433e-05, + "loss": 2.6436, + "step": 38265 + }, + { + "epoch": 1.781572269944363, + "grad_norm": 0.3383399973803347, + "learning_rate": 4.2372302174932944e-05, + "loss": 2.733, + "step": 38266 + }, + { + "epoch": 1.7816188281304561, + "grad_norm": 0.35503712330493997, + "learning_rate": 4.23696251549481e-05, + "loss": 2.5475, + "step": 38267 + }, + { + "epoch": 1.7816653863165492, + "grad_norm": 0.32276347752213563, + "learning_rate": 
4.2366948157357634e-05, + "loss": 2.7594, + "step": 38268 + }, + { + "epoch": 1.7817119445026421, + "grad_norm": 0.32745489034752956, + "learning_rate": 4.236427118216944e-05, + "loss": 2.6953, + "step": 38269 + }, + { + "epoch": 1.7817585026887353, + "grad_norm": 0.3640067898651073, + "learning_rate": 4.2361594229391364e-05, + "loss": 2.6216, + "step": 38270 + }, + { + "epoch": 1.7818050608748282, + "grad_norm": 0.3393688077154832, + "learning_rate": 4.235891729903123e-05, + "loss": 2.7762, + "step": 38271 + }, + { + "epoch": 1.7818516190609213, + "grad_norm": 0.365796063563611, + "learning_rate": 4.235624039109695e-05, + "loss": 2.7058, + "step": 38272 + }, + { + "epoch": 1.7818981772470144, + "grad_norm": 0.3324340809326777, + "learning_rate": 4.235356350559633e-05, + "loss": 2.6382, + "step": 38273 + }, + { + "epoch": 1.7819447354331075, + "grad_norm": 0.34847975110184143, + "learning_rate": 4.235088664253726e-05, + "loss": 2.774, + "step": 38274 + }, + { + "epoch": 1.7819912936192006, + "grad_norm": 0.30278480818355197, + "learning_rate": 4.234820980192759e-05, + "loss": 2.5909, + "step": 38275 + }, + { + "epoch": 1.7820378518052937, + "grad_norm": 0.33359866022453305, + "learning_rate": 4.234553298377515e-05, + "loss": 2.6783, + "step": 38276 + }, + { + "epoch": 1.7820844099913868, + "grad_norm": 0.31469034849795147, + "learning_rate": 4.234285618808783e-05, + "loss": 2.7453, + "step": 38277 + }, + { + "epoch": 1.78213096817748, + "grad_norm": 0.3337666575097491, + "learning_rate": 4.234017941487346e-05, + "loss": 2.8303, + "step": 38278 + }, + { + "epoch": 1.7821775263635729, + "grad_norm": 0.33206369137122255, + "learning_rate": 4.2337502664139924e-05, + "loss": 2.6644, + "step": 38279 + }, + { + "epoch": 1.782224084549666, + "grad_norm": 0.32410560280476386, + "learning_rate": 4.233482593589507e-05, + "loss": 2.7163, + "step": 38280 + }, + { + "epoch": 1.7822706427357589, + "grad_norm": 0.338915649008694, + "learning_rate": 4.2332149230146714e-05, + "loss": 2.7411, + "step": 38281 + }, + { + "epoch": 1.782317200921852, + "grad_norm": 0.3408118271918126, + "learning_rate": 4.232947254690278e-05, + "loss": 2.6062, + "step": 38282 + }, + { + "epoch": 1.782363759107945, + "grad_norm": 0.3251930098671104, + "learning_rate": 4.232679588617107e-05, + "loss": 2.6699, + "step": 38283 + }, + { + "epoch": 1.7824103172940382, + "grad_norm": 0.3177348326002107, + "learning_rate": 4.232411924795945e-05, + "loss": 2.6459, + "step": 38284 + }, + { + "epoch": 1.7824568754801313, + "grad_norm": 0.3686878882541338, + "learning_rate": 4.2321442632275806e-05, + "loss": 2.764, + "step": 38285 + }, + { + "epoch": 1.7825034336662244, + "grad_norm": 0.331183516308994, + "learning_rate": 4.2318766039127954e-05, + "loss": 2.7329, + "step": 38286 + }, + { + "epoch": 1.7825499918523176, + "grad_norm": 0.3382261204185281, + "learning_rate": 4.231608946852378e-05, + "loss": 2.6041, + "step": 38287 + }, + { + "epoch": 1.7825965500384107, + "grad_norm": 0.33468492860817245, + "learning_rate": 4.231341292047114e-05, + "loss": 2.7403, + "step": 38288 + }, + { + "epoch": 1.7826431082245036, + "grad_norm": 0.3624508245164628, + "learning_rate": 4.231073639497783e-05, + "loss": 2.705, + "step": 38289 + }, + { + "epoch": 1.7826896664105967, + "grad_norm": 0.3754257881806466, + "learning_rate": 4.230805989205179e-05, + "loss": 2.7641, + "step": 38290 + }, + { + "epoch": 1.7827362245966896, + "grad_norm": 0.3074852936684319, + "learning_rate": 4.230538341170081e-05, + "loss": 2.6634, + "step": 38291 + }, + { + "epoch": 
1.7827827827827827, + "grad_norm": 0.3556126033403907, + "learning_rate": 4.230270695393279e-05, + "loss": 2.7446, + "step": 38292 + }, + { + "epoch": 1.7828293409688758, + "grad_norm": 0.3378380958064712, + "learning_rate": 4.230003051875557e-05, + "loss": 2.7584, + "step": 38293 + }, + { + "epoch": 1.782875899154969, + "grad_norm": 0.3118974802507106, + "learning_rate": 4.2297354106176976e-05, + "loss": 2.5979, + "step": 38294 + }, + { + "epoch": 1.782922457341062, + "grad_norm": 0.35898321689675483, + "learning_rate": 4.2294677716204925e-05, + "loss": 2.6193, + "step": 38295 + }, + { + "epoch": 1.7829690155271551, + "grad_norm": 0.3303264307796354, + "learning_rate": 4.229200134884721e-05, + "loss": 2.7555, + "step": 38296 + }, + { + "epoch": 1.7830155737132483, + "grad_norm": 0.3572026156718417, + "learning_rate": 4.228932500411171e-05, + "loss": 2.6761, + "step": 38297 + }, + { + "epoch": 1.7830621318993412, + "grad_norm": 0.3295343304125385, + "learning_rate": 4.2286648682006305e-05, + "loss": 2.7205, + "step": 38298 + }, + { + "epoch": 1.7831086900854343, + "grad_norm": 0.36653807812516737, + "learning_rate": 4.228397238253881e-05, + "loss": 2.7101, + "step": 38299 + }, + { + "epoch": 1.7831552482715274, + "grad_norm": 0.34254158664133105, + "learning_rate": 4.22812961057171e-05, + "loss": 2.5813, + "step": 38300 + }, + { + "epoch": 1.7832018064576203, + "grad_norm": 0.3369220655590574, + "learning_rate": 4.227861985154901e-05, + "loss": 2.5765, + "step": 38301 + }, + { + "epoch": 1.7832483646437134, + "grad_norm": 0.3476861892796272, + "learning_rate": 4.227594362004243e-05, + "loss": 2.6321, + "step": 38302 + }, + { + "epoch": 1.7832949228298065, + "grad_norm": 0.3412180249130635, + "learning_rate": 4.22732674112052e-05, + "loss": 2.706, + "step": 38303 + }, + { + "epoch": 1.7833414810158996, + "grad_norm": 0.33203131856455254, + "learning_rate": 4.2270591225045145e-05, + "loss": 2.7389, + "step": 38304 + }, + { + "epoch": 1.7833880392019927, + "grad_norm": 0.3841895463398229, + "learning_rate": 4.226791506157016e-05, + "loss": 2.7388, + "step": 38305 + }, + { + "epoch": 1.7834345973880859, + "grad_norm": 0.3332899119314825, + "learning_rate": 4.226523892078808e-05, + "loss": 2.6893, + "step": 38306 + }, + { + "epoch": 1.783481155574179, + "grad_norm": 0.3937858955848856, + "learning_rate": 4.2262562802706744e-05, + "loss": 2.7683, + "step": 38307 + }, + { + "epoch": 1.7835277137602719, + "grad_norm": 0.35930111596845266, + "learning_rate": 4.225988670733405e-05, + "loss": 2.6636, + "step": 38308 + }, + { + "epoch": 1.783574271946365, + "grad_norm": 0.3513560517273336, + "learning_rate": 4.225721063467781e-05, + "loss": 2.7746, + "step": 38309 + }, + { + "epoch": 1.783620830132458, + "grad_norm": 0.3590361065682632, + "learning_rate": 4.2254534584745895e-05, + "loss": 2.6749, + "step": 38310 + }, + { + "epoch": 1.783667388318551, + "grad_norm": 0.3531981825571487, + "learning_rate": 4.2251858557546176e-05, + "loss": 2.7237, + "step": 38311 + }, + { + "epoch": 1.783713946504644, + "grad_norm": 0.3557099940799723, + "learning_rate": 4.2249182553086466e-05, + "loss": 2.6743, + "step": 38312 + }, + { + "epoch": 1.7837605046907372, + "grad_norm": 0.3467145040001295, + "learning_rate": 4.224650657137466e-05, + "loss": 2.7261, + "step": 38313 + }, + { + "epoch": 1.7838070628768303, + "grad_norm": 0.3576573808040202, + "learning_rate": 4.224383061241857e-05, + "loss": 2.7034, + "step": 38314 + }, + { + "epoch": 1.7838536210629234, + "grad_norm": 0.3485160593458203, + "learning_rate": 
4.2241154676226096e-05, + "loss": 2.7405, + "step": 38315 + }, + { + "epoch": 1.7839001792490166, + "grad_norm": 0.3385013491013612, + "learning_rate": 4.223847876280507e-05, + "loss": 2.6475, + "step": 38316 + }, + { + "epoch": 1.7839467374351097, + "grad_norm": 0.37283061522053385, + "learning_rate": 4.2235802872163316e-05, + "loss": 2.6952, + "step": 38317 + }, + { + "epoch": 1.7839932956212026, + "grad_norm": 0.3591441040250337, + "learning_rate": 4.223312700430874e-05, + "loss": 2.6277, + "step": 38318 + }, + { + "epoch": 1.7840398538072957, + "grad_norm": 0.33612289509372756, + "learning_rate": 4.2230451159249154e-05, + "loss": 2.6321, + "step": 38319 + }, + { + "epoch": 1.7840864119933886, + "grad_norm": 0.34045071340350097, + "learning_rate": 4.2227775336992445e-05, + "loss": 2.72, + "step": 38320 + }, + { + "epoch": 1.7841329701794817, + "grad_norm": 0.3492660040966864, + "learning_rate": 4.2225099537546455e-05, + "loss": 2.6439, + "step": 38321 + }, + { + "epoch": 1.7841795283655748, + "grad_norm": 0.34940981746967104, + "learning_rate": 4.222242376091902e-05, + "loss": 2.5986, + "step": 38322 + }, + { + "epoch": 1.784226086551668, + "grad_norm": 0.35063852688963226, + "learning_rate": 4.2219748007118e-05, + "loss": 2.731, + "step": 38323 + }, + { + "epoch": 1.784272644737761, + "grad_norm": 0.35373101049734357, + "learning_rate": 4.2217072276151256e-05, + "loss": 2.6625, + "step": 38324 + }, + { + "epoch": 1.7843192029238542, + "grad_norm": 0.3518468262093003, + "learning_rate": 4.221439656802664e-05, + "loss": 2.6554, + "step": 38325 + }, + { + "epoch": 1.7843657611099473, + "grad_norm": 0.3558669656277812, + "learning_rate": 4.221172088275202e-05, + "loss": 2.6011, + "step": 38326 + }, + { + "epoch": 1.7844123192960404, + "grad_norm": 0.3546325049437616, + "learning_rate": 4.2209045220335196e-05, + "loss": 2.688, + "step": 38327 + }, + { + "epoch": 1.7844588774821333, + "grad_norm": 0.3421831906320773, + "learning_rate": 4.220636958078409e-05, + "loss": 2.6483, + "step": 38328 + }, + { + "epoch": 1.7845054356682264, + "grad_norm": 0.36750591014953315, + "learning_rate": 4.220369396410651e-05, + "loss": 2.7189, + "step": 38329 + }, + { + "epoch": 1.7845519938543193, + "grad_norm": 0.3606466644953576, + "learning_rate": 4.220101837031031e-05, + "loss": 2.7497, + "step": 38330 + }, + { + "epoch": 1.7845985520404124, + "grad_norm": 0.3307586213701193, + "learning_rate": 4.219834279940337e-05, + "loss": 2.7175, + "step": 38331 + }, + { + "epoch": 1.7846451102265055, + "grad_norm": 0.3384097217360398, + "learning_rate": 4.219566725139349e-05, + "loss": 2.6223, + "step": 38332 + }, + { + "epoch": 1.7846916684125986, + "grad_norm": 0.32983054962068153, + "learning_rate": 4.21929917262886e-05, + "loss": 2.6044, + "step": 38333 + }, + { + "epoch": 1.7847382265986917, + "grad_norm": 0.3464078949578781, + "learning_rate": 4.21903162240965e-05, + "loss": 2.7319, + "step": 38334 + }, + { + "epoch": 1.7847847847847849, + "grad_norm": 0.3391614326416548, + "learning_rate": 4.2187640744825044e-05, + "loss": 2.7902, + "step": 38335 + }, + { + "epoch": 1.784831342970878, + "grad_norm": 0.35139599889333845, + "learning_rate": 4.218496528848209e-05, + "loss": 2.7825, + "step": 38336 + }, + { + "epoch": 1.7848779011569709, + "grad_norm": 0.3605142361115919, + "learning_rate": 4.21822898550755e-05, + "loss": 2.7746, + "step": 38337 + }, + { + "epoch": 1.784924459343064, + "grad_norm": 0.33979249997315586, + "learning_rate": 4.217961444461313e-05, + "loss": 2.7416, + "step": 38338 + }, + { + "epoch": 
1.784971017529157, + "grad_norm": 0.3584404853278636, + "learning_rate": 4.217693905710281e-05, + "loss": 2.5931, + "step": 38339 + }, + { + "epoch": 1.78501757571525, + "grad_norm": 0.38752698233341387, + "learning_rate": 4.2174263692552385e-05, + "loss": 2.74, + "step": 38340 + }, + { + "epoch": 1.785064133901343, + "grad_norm": 0.3232133550182496, + "learning_rate": 4.217158835096975e-05, + "loss": 2.6231, + "step": 38341 + }, + { + "epoch": 1.7851106920874362, + "grad_norm": 0.3820027560339286, + "learning_rate": 4.2168913032362734e-05, + "loss": 2.6294, + "step": 38342 + }, + { + "epoch": 1.7851572502735293, + "grad_norm": 0.3384417992574428, + "learning_rate": 4.216623773673916e-05, + "loss": 2.705, + "step": 38343 + }, + { + "epoch": 1.7852038084596225, + "grad_norm": 0.3347276500739178, + "learning_rate": 4.216356246410693e-05, + "loss": 2.6638, + "step": 38344 + }, + { + "epoch": 1.7852503666457156, + "grad_norm": 0.35070298977144865, + "learning_rate": 4.2160887214473854e-05, + "loss": 2.7217, + "step": 38345 + }, + { + "epoch": 1.7852969248318087, + "grad_norm": 0.3371021827523817, + "learning_rate": 4.215821198784782e-05, + "loss": 2.6586, + "step": 38346 + }, + { + "epoch": 1.7853434830179016, + "grad_norm": 0.31118304951801495, + "learning_rate": 4.215553678423666e-05, + "loss": 2.7982, + "step": 38347 + }, + { + "epoch": 1.7853900412039947, + "grad_norm": 0.3609772080016832, + "learning_rate": 4.215286160364822e-05, + "loss": 2.6614, + "step": 38348 + }, + { + "epoch": 1.7854365993900878, + "grad_norm": 0.33802161094142763, + "learning_rate": 4.215018644609036e-05, + "loss": 2.7148, + "step": 38349 + }, + { + "epoch": 1.7854831575761807, + "grad_norm": 0.35343182177993937, + "learning_rate": 4.214751131157092e-05, + "loss": 2.6564, + "step": 38350 + }, + { + "epoch": 1.7855297157622738, + "grad_norm": 0.3402944400062251, + "learning_rate": 4.214483620009778e-05, + "loss": 2.755, + "step": 38351 + }, + { + "epoch": 1.785576273948367, + "grad_norm": 0.32577955420431837, + "learning_rate": 4.214216111167877e-05, + "loss": 2.6721, + "step": 38352 + }, + { + "epoch": 1.78562283213446, + "grad_norm": 0.34464664219959334, + "learning_rate": 4.213948604632172e-05, + "loss": 2.6738, + "step": 38353 + }, + { + "epoch": 1.7856693903205532, + "grad_norm": 0.34812037076350744, + "learning_rate": 4.213681100403453e-05, + "loss": 2.6718, + "step": 38354 + }, + { + "epoch": 1.7857159485066463, + "grad_norm": 0.33201496715813866, + "learning_rate": 4.2134135984825e-05, + "loss": 2.5921, + "step": 38355 + }, + { + "epoch": 1.7857625066927394, + "grad_norm": 0.34641834700387425, + "learning_rate": 4.213146098870102e-05, + "loss": 2.6628, + "step": 38356 + }, + { + "epoch": 1.7858090648788323, + "grad_norm": 0.34507764652203166, + "learning_rate": 4.212878601567043e-05, + "loss": 2.7298, + "step": 38357 + }, + { + "epoch": 1.7858556230649254, + "grad_norm": 0.3657038518112757, + "learning_rate": 4.2126111065741066e-05, + "loss": 2.6963, + "step": 38358 + }, + { + "epoch": 1.7859021812510183, + "grad_norm": 0.3726642544871236, + "learning_rate": 4.2123436138920805e-05, + "loss": 2.7121, + "step": 38359 + }, + { + "epoch": 1.7859487394371114, + "grad_norm": 0.32003320693136295, + "learning_rate": 4.2120761235217483e-05, + "loss": 2.5807, + "step": 38360 + }, + { + "epoch": 1.7859952976232045, + "grad_norm": 0.3330612140752799, + "learning_rate": 4.211808635463894e-05, + "loss": 2.6957, + "step": 38361 + }, + { + "epoch": 1.7860418558092976, + "grad_norm": 0.3654446457142757, + "learning_rate": 
4.2115411497193036e-05, + "loss": 2.6911, + "step": 38362 + }, + { + "epoch": 1.7860884139953908, + "grad_norm": 0.3255742543662318, + "learning_rate": 4.2112736662887625e-05, + "loss": 2.7583, + "step": 38363 + }, + { + "epoch": 1.7861349721814839, + "grad_norm": 0.33360602680289236, + "learning_rate": 4.211006185173057e-05, + "loss": 2.7425, + "step": 38364 + }, + { + "epoch": 1.786181530367577, + "grad_norm": 0.33655162240827224, + "learning_rate": 4.2107387063729696e-05, + "loss": 2.7125, + "step": 38365 + }, + { + "epoch": 1.78622808855367, + "grad_norm": 0.3334649154246511, + "learning_rate": 4.2104712298892844e-05, + "loss": 2.6817, + "step": 38366 + }, + { + "epoch": 1.786274646739763, + "grad_norm": 0.33745114000737125, + "learning_rate": 4.2102037557227915e-05, + "loss": 2.6745, + "step": 38367 + }, + { + "epoch": 1.786321204925856, + "grad_norm": 0.3334322561080336, + "learning_rate": 4.20993628387427e-05, + "loss": 2.7108, + "step": 38368 + }, + { + "epoch": 1.786367763111949, + "grad_norm": 0.33847428491991716, + "learning_rate": 4.209668814344509e-05, + "loss": 2.7752, + "step": 38369 + }, + { + "epoch": 1.7864143212980421, + "grad_norm": 0.34168929806475207, + "learning_rate": 4.209401347134293e-05, + "loss": 2.5871, + "step": 38370 + }, + { + "epoch": 1.7864608794841352, + "grad_norm": 0.3517527580991689, + "learning_rate": 4.209133882244404e-05, + "loss": 2.8032, + "step": 38371 + }, + { + "epoch": 1.7865074376702283, + "grad_norm": 0.34589512115099347, + "learning_rate": 4.208866419675631e-05, + "loss": 2.8651, + "step": 38372 + }, + { + "epoch": 1.7865539958563215, + "grad_norm": 0.33426572384030145, + "learning_rate": 4.2085989594287565e-05, + "loss": 2.7778, + "step": 38373 + }, + { + "epoch": 1.7866005540424146, + "grad_norm": 0.36942284099943296, + "learning_rate": 4.208331501504567e-05, + "loss": 2.7052, + "step": 38374 + }, + { + "epoch": 1.7866471122285077, + "grad_norm": 0.3758431160579236, + "learning_rate": 4.208064045903845e-05, + "loss": 2.6161, + "step": 38375 + }, + { + "epoch": 1.7866936704146008, + "grad_norm": 0.3521784025640858, + "learning_rate": 4.207796592627378e-05, + "loss": 2.7007, + "step": 38376 + }, + { + "epoch": 1.7867402286006937, + "grad_norm": 0.3457170595265809, + "learning_rate": 4.20752914167595e-05, + "loss": 2.686, + "step": 38377 + }, + { + "epoch": 1.7867867867867868, + "grad_norm": 0.38520516050098935, + "learning_rate": 4.207261693050346e-05, + "loss": 2.7219, + "step": 38378 + }, + { + "epoch": 1.7868333449728797, + "grad_norm": 0.34392702170435524, + "learning_rate": 4.206994246751349e-05, + "loss": 2.5906, + "step": 38379 + }, + { + "epoch": 1.7868799031589728, + "grad_norm": 0.36656111560407684, + "learning_rate": 4.206726802779748e-05, + "loss": 2.6397, + "step": 38380 + }, + { + "epoch": 1.786926461345066, + "grad_norm": 0.3642389359909052, + "learning_rate": 4.206459361136323e-05, + "loss": 2.6202, + "step": 38381 + }, + { + "epoch": 1.786973019531159, + "grad_norm": 0.3626042451450216, + "learning_rate": 4.206191921821865e-05, + "loss": 2.6517, + "step": 38382 + }, + { + "epoch": 1.7870195777172522, + "grad_norm": 0.3777321786017761, + "learning_rate": 4.205924484837154e-05, + "loss": 2.6717, + "step": 38383 + }, + { + "epoch": 1.7870661359033453, + "grad_norm": 0.33882864979599436, + "learning_rate": 4.205657050182975e-05, + "loss": 2.6852, + "step": 38384 + }, + { + "epoch": 1.7871126940894384, + "grad_norm": 0.3844019595902459, + "learning_rate": 4.205389617860117e-05, + "loss": 2.7351, + "step": 38385 + }, + { + 
"epoch": 1.7871592522755313, + "grad_norm": 0.354081932857883, + "learning_rate": 4.20512218786936e-05, + "loss": 2.731, + "step": 38386 + }, + { + "epoch": 1.7872058104616244, + "grad_norm": 0.3465854027080188, + "learning_rate": 4.204854760211493e-05, + "loss": 2.6752, + "step": 38387 + }, + { + "epoch": 1.7872523686477175, + "grad_norm": 0.33183751388637456, + "learning_rate": 4.204587334887298e-05, + "loss": 2.7418, + "step": 38388 + }, + { + "epoch": 1.7872989268338104, + "grad_norm": 0.34727861906688945, + "learning_rate": 4.20431991189756e-05, + "loss": 2.6758, + "step": 38389 + }, + { + "epoch": 1.7873454850199035, + "grad_norm": 0.3520991672942371, + "learning_rate": 4.204052491243066e-05, + "loss": 2.6859, + "step": 38390 + }, + { + "epoch": 1.7873920432059967, + "grad_norm": 0.3609280992831758, + "learning_rate": 4.203785072924598e-05, + "loss": 2.8203, + "step": 38391 + }, + { + "epoch": 1.7874386013920898, + "grad_norm": 0.3319029825735682, + "learning_rate": 4.203517656942943e-05, + "loss": 2.6456, + "step": 38392 + }, + { + "epoch": 1.7874851595781829, + "grad_norm": 0.3783954833925592, + "learning_rate": 4.203250243298886e-05, + "loss": 2.7485, + "step": 38393 + }, + { + "epoch": 1.787531717764276, + "grad_norm": 0.35738806615547586, + "learning_rate": 4.2029828319932095e-05, + "loss": 2.7659, + "step": 38394 + }, + { + "epoch": 1.7875782759503691, + "grad_norm": 0.3252274714290703, + "learning_rate": 4.2027154230267015e-05, + "loss": 2.7409, + "step": 38395 + }, + { + "epoch": 1.787624834136462, + "grad_norm": 0.34619835065394916, + "learning_rate": 4.202448016400146e-05, + "loss": 2.6603, + "step": 38396 + }, + { + "epoch": 1.7876713923225551, + "grad_norm": 0.37108055498276493, + "learning_rate": 4.2021806121143236e-05, + "loss": 2.6772, + "step": 38397 + }, + { + "epoch": 1.7877179505086482, + "grad_norm": 0.3427824673518944, + "learning_rate": 4.201913210170025e-05, + "loss": 2.6933, + "step": 38398 + }, + { + "epoch": 1.7877645086947411, + "grad_norm": 0.339114032463928, + "learning_rate": 4.2016458105680315e-05, + "loss": 2.6518, + "step": 38399 + }, + { + "epoch": 1.7878110668808342, + "grad_norm": 0.3425906670507134, + "learning_rate": 4.20137841330913e-05, + "loss": 2.5867, + "step": 38400 + }, + { + "epoch": 1.7878576250669274, + "grad_norm": 0.335848136646513, + "learning_rate": 4.201111018394104e-05, + "loss": 2.7627, + "step": 38401 + }, + { + "epoch": 1.7879041832530205, + "grad_norm": 0.34991808836421134, + "learning_rate": 4.200843625823737e-05, + "loss": 2.5831, + "step": 38402 + }, + { + "epoch": 1.7879507414391136, + "grad_norm": 0.3649823268557827, + "learning_rate": 4.2005762355988174e-05, + "loss": 2.663, + "step": 38403 + }, + { + "epoch": 1.7879972996252067, + "grad_norm": 0.3477959269941866, + "learning_rate": 4.200308847720125e-05, + "loss": 2.8133, + "step": 38404 + }, + { + "epoch": 1.7880438578112998, + "grad_norm": 0.348310574647134, + "learning_rate": 4.20004146218845e-05, + "loss": 2.5931, + "step": 38405 + }, + { + "epoch": 1.7880904159973927, + "grad_norm": 0.3546387872595906, + "learning_rate": 4.1997740790045745e-05, + "loss": 2.7969, + "step": 38406 + }, + { + "epoch": 1.7881369741834858, + "grad_norm": 0.31867066861184007, + "learning_rate": 4.19950669816928e-05, + "loss": 2.672, + "step": 38407 + }, + { + "epoch": 1.7881835323695787, + "grad_norm": 0.31724989050397384, + "learning_rate": 4.199239319683358e-05, + "loss": 2.6022, + "step": 38408 + }, + { + "epoch": 1.7882300905556718, + "grad_norm": 0.3445342962265978, + 
"learning_rate": 4.198971943547587e-05, + "loss": 2.5969, + "step": 38409 + }, + { + "epoch": 1.788276648741765, + "grad_norm": 0.37012327687294927, + "learning_rate": 4.198704569762756e-05, + "loss": 2.6752, + "step": 38410 + }, + { + "epoch": 1.788323206927858, + "grad_norm": 0.3299236835580956, + "learning_rate": 4.198437198329649e-05, + "loss": 2.6546, + "step": 38411 + }, + { + "epoch": 1.7883697651139512, + "grad_norm": 0.36398774509720744, + "learning_rate": 4.1981698292490485e-05, + "loss": 2.7178, + "step": 38412 + }, + { + "epoch": 1.7884163233000443, + "grad_norm": 0.33569920762615346, + "learning_rate": 4.197902462521742e-05, + "loss": 2.7809, + "step": 38413 + }, + { + "epoch": 1.7884628814861374, + "grad_norm": 0.35996136378256083, + "learning_rate": 4.1976350981485106e-05, + "loss": 2.7423, + "step": 38414 + }, + { + "epoch": 1.7885094396722305, + "grad_norm": 0.33748600682576274, + "learning_rate": 4.197367736130142e-05, + "loss": 2.6974, + "step": 38415 + }, + { + "epoch": 1.7885559978583234, + "grad_norm": 0.3560723005862565, + "learning_rate": 4.197100376467421e-05, + "loss": 2.5643, + "step": 38416 + }, + { + "epoch": 1.7886025560444165, + "grad_norm": 0.36129696145965756, + "learning_rate": 4.196833019161129e-05, + "loss": 2.7257, + "step": 38417 + }, + { + "epoch": 1.7886491142305094, + "grad_norm": 0.3245147358013637, + "learning_rate": 4.1965656642120556e-05, + "loss": 2.6682, + "step": 38418 + }, + { + "epoch": 1.7886956724166025, + "grad_norm": 0.374971604403981, + "learning_rate": 4.196298311620982e-05, + "loss": 2.8132, + "step": 38419 + }, + { + "epoch": 1.7887422306026957, + "grad_norm": 0.3233470141253383, + "learning_rate": 4.196030961388692e-05, + "loss": 2.6901, + "step": 38420 + }, + { + "epoch": 1.7887887887887888, + "grad_norm": 0.33613685938971666, + "learning_rate": 4.195763613515974e-05, + "loss": 2.8125, + "step": 38421 + }, + { + "epoch": 1.788835346974882, + "grad_norm": 0.3581992441095404, + "learning_rate": 4.195496268003609e-05, + "loss": 2.6912, + "step": 38422 + }, + { + "epoch": 1.788881905160975, + "grad_norm": 0.33705094482211884, + "learning_rate": 4.195228924852385e-05, + "loss": 2.6312, + "step": 38423 + }, + { + "epoch": 1.7889284633470681, + "grad_norm": 0.3269795364170924, + "learning_rate": 4.194961584063084e-05, + "loss": 2.678, + "step": 38424 + }, + { + "epoch": 1.788975021533161, + "grad_norm": 0.36696741605426986, + "learning_rate": 4.194694245636491e-05, + "loss": 2.6724, + "step": 38425 + }, + { + "epoch": 1.7890215797192541, + "grad_norm": 0.3308248514632998, + "learning_rate": 4.1944269095733926e-05, + "loss": 2.6377, + "step": 38426 + }, + { + "epoch": 1.7890681379053472, + "grad_norm": 0.37906331373902413, + "learning_rate": 4.194159575874569e-05, + "loss": 2.6746, + "step": 38427 + }, + { + "epoch": 1.7891146960914401, + "grad_norm": 0.33727130211810585, + "learning_rate": 4.1938922445408094e-05, + "loss": 2.7439, + "step": 38428 + }, + { + "epoch": 1.7891612542775333, + "grad_norm": 0.3605202202976526, + "learning_rate": 4.193624915572898e-05, + "loss": 2.7148, + "step": 38429 + }, + { + "epoch": 1.7892078124636264, + "grad_norm": 0.3725597300890753, + "learning_rate": 4.193357588971615e-05, + "loss": 2.79, + "step": 38430 + }, + { + "epoch": 1.7892543706497195, + "grad_norm": 0.3505621126869109, + "learning_rate": 4.1930902647377504e-05, + "loss": 2.7866, + "step": 38431 + }, + { + "epoch": 1.7893009288358126, + "grad_norm": 0.3425073948747459, + "learning_rate": 4.192822942872086e-05, + "loss": 2.7069, + "step": 38432 
+ }, + { + "epoch": 1.7893474870219057, + "grad_norm": 0.40213864294121193, + "learning_rate": 4.1925556233754046e-05, + "loss": 2.6634, + "step": 38433 + }, + { + "epoch": 1.7893940452079988, + "grad_norm": 0.3364023930221673, + "learning_rate": 4.1922883062484955e-05, + "loss": 2.7044, + "step": 38434 + }, + { + "epoch": 1.7894406033940917, + "grad_norm": 0.36187795719355276, + "learning_rate": 4.192020991492138e-05, + "loss": 2.7229, + "step": 38435 + }, + { + "epoch": 1.7894871615801848, + "grad_norm": 0.38114591689040395, + "learning_rate": 4.191753679107122e-05, + "loss": 2.7238, + "step": 38436 + }, + { + "epoch": 1.789533719766278, + "grad_norm": 0.3535823560970029, + "learning_rate": 4.191486369094229e-05, + "loss": 2.7057, + "step": 38437 + }, + { + "epoch": 1.7895802779523708, + "grad_norm": 0.35034928335324333, + "learning_rate": 4.191219061454243e-05, + "loss": 2.6483, + "step": 38438 + }, + { + "epoch": 1.789626836138464, + "grad_norm": 0.3348856199603827, + "learning_rate": 4.1909517561879504e-05, + "loss": 2.8222, + "step": 38439 + }, + { + "epoch": 1.789673394324557, + "grad_norm": 0.3737756102900953, + "learning_rate": 4.190684453296133e-05, + "loss": 2.6987, + "step": 38440 + }, + { + "epoch": 1.7897199525106502, + "grad_norm": 0.33354534760742094, + "learning_rate": 4.190417152779578e-05, + "loss": 2.6686, + "step": 38441 + }, + { + "epoch": 1.7897665106967433, + "grad_norm": 0.36160184409681345, + "learning_rate": 4.190149854639069e-05, + "loss": 2.6952, + "step": 38442 + }, + { + "epoch": 1.7898130688828364, + "grad_norm": 0.35983584358833054, + "learning_rate": 4.1898825588753894e-05, + "loss": 2.7971, + "step": 38443 + }, + { + "epoch": 1.7898596270689295, + "grad_norm": 0.34017360799401086, + "learning_rate": 4.189615265489326e-05, + "loss": 2.7625, + "step": 38444 + }, + { + "epoch": 1.7899061852550224, + "grad_norm": 0.3301497723261276, + "learning_rate": 4.18934797448166e-05, + "loss": 2.5731, + "step": 38445 + }, + { + "epoch": 1.7899527434411155, + "grad_norm": 0.3507133917387227, + "learning_rate": 4.18908068585318e-05, + "loss": 2.7117, + "step": 38446 + }, + { + "epoch": 1.7899993016272084, + "grad_norm": 0.34100448331287314, + "learning_rate": 4.188813399604668e-05, + "loss": 2.7096, + "step": 38447 + }, + { + "epoch": 1.7900458598133016, + "grad_norm": 0.3573412106658111, + "learning_rate": 4.188546115736907e-05, + "loss": 2.7515, + "step": 38448 + }, + { + "epoch": 1.7900924179993947, + "grad_norm": 0.35534176511444926, + "learning_rate": 4.188278834250686e-05, + "loss": 2.7262, + "step": 38449 + }, + { + "epoch": 1.7901389761854878, + "grad_norm": 0.35666875275834226, + "learning_rate": 4.1880115551467854e-05, + "loss": 2.6941, + "step": 38450 + }, + { + "epoch": 1.790185534371581, + "grad_norm": 0.32713673681969563, + "learning_rate": 4.18774427842599e-05, + "loss": 2.672, + "step": 38451 + }, + { + "epoch": 1.790232092557674, + "grad_norm": 0.3743581801655627, + "learning_rate": 4.187477004089087e-05, + "loss": 2.6359, + "step": 38452 + }, + { + "epoch": 1.7902786507437671, + "grad_norm": 0.39312609929211717, + "learning_rate": 4.187209732136856e-05, + "loss": 2.7889, + "step": 38453 + }, + { + "epoch": 1.7903252089298602, + "grad_norm": 0.3340077293618495, + "learning_rate": 4.186942462570087e-05, + "loss": 2.6269, + "step": 38454 + }, + { + "epoch": 1.7903717671159531, + "grad_norm": 0.3615320094986834, + "learning_rate": 4.1866751953895616e-05, + "loss": 2.7655, + "step": 38455 + }, + { + "epoch": 1.7904183253020463, + "grad_norm": 
0.34035308586858587, + "learning_rate": 4.186407930596063e-05, + "loss": 2.7948, + "step": 38456 + }, + { + "epoch": 1.7904648834881391, + "grad_norm": 0.3226012202997249, + "learning_rate": 4.1861406681903774e-05, + "loss": 2.7766, + "step": 38457 + }, + { + "epoch": 1.7905114416742323, + "grad_norm": 0.3455948304025634, + "learning_rate": 4.185873408173288e-05, + "loss": 2.6593, + "step": 38458 + }, + { + "epoch": 1.7905579998603254, + "grad_norm": 0.33079584774265863, + "learning_rate": 4.185606150545581e-05, + "loss": 2.641, + "step": 38459 + }, + { + "epoch": 1.7906045580464185, + "grad_norm": 0.3261404909698132, + "learning_rate": 4.185338895308041e-05, + "loss": 2.6999, + "step": 38460 + }, + { + "epoch": 1.7906511162325116, + "grad_norm": 0.3448066556714911, + "learning_rate": 4.185071642461448e-05, + "loss": 2.7613, + "step": 38461 + }, + { + "epoch": 1.7906976744186047, + "grad_norm": 0.3478487108801387, + "learning_rate": 4.184804392006592e-05, + "loss": 2.6374, + "step": 38462 + }, + { + "epoch": 1.7907442326046978, + "grad_norm": 0.35205523660417165, + "learning_rate": 4.1845371439442534e-05, + "loss": 2.7202, + "step": 38463 + }, + { + "epoch": 1.790790790790791, + "grad_norm": 0.3691780411202908, + "learning_rate": 4.1842698982752196e-05, + "loss": 2.7954, + "step": 38464 + }, + { + "epoch": 1.7908373489768838, + "grad_norm": 0.34020736125183837, + "learning_rate": 4.1840026550002715e-05, + "loss": 2.7656, + "step": 38465 + }, + { + "epoch": 1.790883907162977, + "grad_norm": 0.3274619562350002, + "learning_rate": 4.183735414120196e-05, + "loss": 2.6792, + "step": 38466 + }, + { + "epoch": 1.7909304653490699, + "grad_norm": 0.32822395024613954, + "learning_rate": 4.183468175635777e-05, + "loss": 2.5859, + "step": 38467 + }, + { + "epoch": 1.790977023535163, + "grad_norm": 0.34778000672616993, + "learning_rate": 4.183200939547799e-05, + "loss": 2.7237, + "step": 38468 + }, + { + "epoch": 1.791023581721256, + "grad_norm": 0.3380739127977365, + "learning_rate": 4.182933705857043e-05, + "loss": 2.6724, + "step": 38469 + }, + { + "epoch": 1.7910701399073492, + "grad_norm": 0.37447607507673725, + "learning_rate": 4.182666474564299e-05, + "loss": 2.718, + "step": 38470 + }, + { + "epoch": 1.7911166980934423, + "grad_norm": 0.3057017740851618, + "learning_rate": 4.182399245670345e-05, + "loss": 2.7526, + "step": 38471 + }, + { + "epoch": 1.7911632562795354, + "grad_norm": 0.35157744976665956, + "learning_rate": 4.182132019175972e-05, + "loss": 2.603, + "step": 38472 + }, + { + "epoch": 1.7912098144656285, + "grad_norm": 0.35477269670559175, + "learning_rate": 4.1818647950819605e-05, + "loss": 2.7003, + "step": 38473 + }, + { + "epoch": 1.7912563726517214, + "grad_norm": 0.32355294649388766, + "learning_rate": 4.1815975733890934e-05, + "loss": 2.7332, + "step": 38474 + }, + { + "epoch": 1.7913029308378146, + "grad_norm": 0.347037897128516, + "learning_rate": 4.181330354098158e-05, + "loss": 2.682, + "step": 38475 + }, + { + "epoch": 1.7913494890239077, + "grad_norm": 0.3532146912919632, + "learning_rate": 4.181063137209937e-05, + "loss": 2.7077, + "step": 38476 + }, + { + "epoch": 1.7913960472100006, + "grad_norm": 0.32022790464625517, + "learning_rate": 4.180795922725217e-05, + "loss": 2.6254, + "step": 38477 + }, + { + "epoch": 1.7914426053960937, + "grad_norm": 0.3382842478677976, + "learning_rate": 4.180528710644778e-05, + "loss": 2.6985, + "step": 38478 + }, + { + "epoch": 1.7914891635821868, + "grad_norm": 0.3348569821442602, + "learning_rate": 4.180261500969407e-05, + "loss": 
2.6903, + "step": 38479 + }, + { + "epoch": 1.79153572176828, + "grad_norm": 0.35284418496273234, + "learning_rate": 4.179994293699888e-05, + "loss": 2.6548, + "step": 38480 + }, + { + "epoch": 1.791582279954373, + "grad_norm": 0.32695877898084635, + "learning_rate": 4.179727088837004e-05, + "loss": 2.7092, + "step": 38481 + }, + { + "epoch": 1.7916288381404661, + "grad_norm": 0.3347661632090915, + "learning_rate": 4.179459886381542e-05, + "loss": 2.7078, + "step": 38482 + }, + { + "epoch": 1.7916753963265593, + "grad_norm": 0.35356366487456853, + "learning_rate": 4.179192686334283e-05, + "loss": 2.7254, + "step": 38483 + }, + { + "epoch": 1.7917219545126521, + "grad_norm": 0.3282697360003029, + "learning_rate": 4.178925488696012e-05, + "loss": 2.8044, + "step": 38484 + }, + { + "epoch": 1.7917685126987453, + "grad_norm": 0.363057988448906, + "learning_rate": 4.1786582934675156e-05, + "loss": 2.6435, + "step": 38485 + }, + { + "epoch": 1.7918150708848384, + "grad_norm": 0.3611358484330559, + "learning_rate": 4.178391100649576e-05, + "loss": 2.7453, + "step": 38486 + }, + { + "epoch": 1.7918616290709313, + "grad_norm": 0.3787534677310973, + "learning_rate": 4.178123910242976e-05, + "loss": 2.6113, + "step": 38487 + }, + { + "epoch": 1.7919081872570244, + "grad_norm": 0.36581870373697833, + "learning_rate": 4.1778567222485025e-05, + "loss": 2.7538, + "step": 38488 + }, + { + "epoch": 1.7919547454431175, + "grad_norm": 0.3350927287491413, + "learning_rate": 4.177589536666938e-05, + "loss": 2.6379, + "step": 38489 + }, + { + "epoch": 1.7920013036292106, + "grad_norm": 0.35408647534239585, + "learning_rate": 4.177322353499068e-05, + "loss": 2.7882, + "step": 38490 + }, + { + "epoch": 1.7920478618153037, + "grad_norm": 0.35290857461946107, + "learning_rate": 4.177055172745676e-05, + "loss": 2.8391, + "step": 38491 + }, + { + "epoch": 1.7920944200013968, + "grad_norm": 0.3282559865409061, + "learning_rate": 4.176787994407545e-05, + "loss": 2.6675, + "step": 38492 + }, + { + "epoch": 1.79214097818749, + "grad_norm": 0.3271066301014648, + "learning_rate": 4.176520818485461e-05, + "loss": 2.6708, + "step": 38493 + }, + { + "epoch": 1.7921875363735829, + "grad_norm": 0.3207176092448632, + "learning_rate": 4.176253644980205e-05, + "loss": 2.737, + "step": 38494 + }, + { + "epoch": 1.792234094559676, + "grad_norm": 0.3485396106315706, + "learning_rate": 4.1759864738925666e-05, + "loss": 2.6556, + "step": 38495 + }, + { + "epoch": 1.7922806527457689, + "grad_norm": 0.3316556151527003, + "learning_rate": 4.175719305223326e-05, + "loss": 2.6756, + "step": 38496 + }, + { + "epoch": 1.792327210931862, + "grad_norm": 0.35005660798124366, + "learning_rate": 4.175452138973266e-05, + "loss": 2.6768, + "step": 38497 + }, + { + "epoch": 1.792373769117955, + "grad_norm": 0.3446551464796352, + "learning_rate": 4.175184975143174e-05, + "loss": 2.769, + "step": 38498 + }, + { + "epoch": 1.7924203273040482, + "grad_norm": 0.3347934716157808, + "learning_rate": 4.174917813733832e-05, + "loss": 2.6505, + "step": 38499 + }, + { + "epoch": 1.7924668854901413, + "grad_norm": 0.33239225729105903, + "learning_rate": 4.1746506547460264e-05, + "loss": 2.7255, + "step": 38500 + }, + { + "epoch": 1.7925134436762344, + "grad_norm": 0.3377744860350927, + "learning_rate": 4.17438349818054e-05, + "loss": 2.6453, + "step": 38501 + }, + { + "epoch": 1.7925600018623276, + "grad_norm": 0.33297031506528785, + "learning_rate": 4.174116344038156e-05, + "loss": 2.7732, + "step": 38502 + }, + { + "epoch": 1.7926065600484207, + "grad_norm": 
0.36693419084068574, + "learning_rate": 4.17384919231966e-05, + "loss": 2.6692, + "step": 38503 + }, + { + "epoch": 1.7926531182345136, + "grad_norm": 0.3723423022981707, + "learning_rate": 4.173582043025834e-05, + "loss": 2.7046, + "step": 38504 + }, + { + "epoch": 1.7926996764206067, + "grad_norm": 0.37700916129606904, + "learning_rate": 4.173314896157463e-05, + "loss": 2.7277, + "step": 38505 + }, + { + "epoch": 1.7927462346066996, + "grad_norm": 0.3900117578910398, + "learning_rate": 4.173047751715333e-05, + "loss": 2.7009, + "step": 38506 + }, + { + "epoch": 1.7927927927927927, + "grad_norm": 0.32982714587768264, + "learning_rate": 4.1727806097002234e-05, + "loss": 2.6937, + "step": 38507 + }, + { + "epoch": 1.7928393509788858, + "grad_norm": 0.3675207324967482, + "learning_rate": 4.172513470112924e-05, + "loss": 2.7017, + "step": 38508 + }, + { + "epoch": 1.792885909164979, + "grad_norm": 0.3439946608177379, + "learning_rate": 4.1722463329542164e-05, + "loss": 2.6818, + "step": 38509 + }, + { + "epoch": 1.792932467351072, + "grad_norm": 0.3353373394186117, + "learning_rate": 4.171979198224881e-05, + "loss": 2.7257, + "step": 38510 + }, + { + "epoch": 1.7929790255371651, + "grad_norm": 0.3379484766345567, + "learning_rate": 4.171712065925708e-05, + "loss": 2.6607, + "step": 38511 + }, + { + "epoch": 1.7930255837232583, + "grad_norm": 0.3751113428388445, + "learning_rate": 4.1714449360574765e-05, + "loss": 2.7557, + "step": 38512 + }, + { + "epoch": 1.7930721419093512, + "grad_norm": 0.3458129321195661, + "learning_rate": 4.171177808620975e-05, + "loss": 2.6275, + "step": 38513 + }, + { + "epoch": 1.7931187000954443, + "grad_norm": 0.34042078678171617, + "learning_rate": 4.1709106836169844e-05, + "loss": 2.7197, + "step": 38514 + }, + { + "epoch": 1.7931652582815374, + "grad_norm": 0.33725922844926953, + "learning_rate": 4.170643561046288e-05, + "loss": 2.6881, + "step": 38515 + }, + { + "epoch": 1.7932118164676303, + "grad_norm": 0.33418679306182275, + "learning_rate": 4.170376440909672e-05, + "loss": 2.6771, + "step": 38516 + }, + { + "epoch": 1.7932583746537234, + "grad_norm": 0.3692895869510094, + "learning_rate": 4.1701093232079174e-05, + "loss": 2.8028, + "step": 38517 + }, + { + "epoch": 1.7933049328398165, + "grad_norm": 0.34661764376772014, + "learning_rate": 4.169842207941813e-05, + "loss": 2.6854, + "step": 38518 + }, + { + "epoch": 1.7933514910259096, + "grad_norm": 0.378085814497135, + "learning_rate": 4.1695750951121395e-05, + "loss": 2.7786, + "step": 38519 + }, + { + "epoch": 1.7933980492120027, + "grad_norm": 0.35736933014929917, + "learning_rate": 4.169307984719679e-05, + "loss": 2.651, + "step": 38520 + }, + { + "epoch": 1.7934446073980959, + "grad_norm": 0.3848402198673128, + "learning_rate": 4.1690408767652196e-05, + "loss": 2.7195, + "step": 38521 + }, + { + "epoch": 1.793491165584189, + "grad_norm": 0.32891342391717737, + "learning_rate": 4.1687737712495437e-05, + "loss": 2.7725, + "step": 38522 + }, + { + "epoch": 1.7935377237702819, + "grad_norm": 0.35721280121481724, + "learning_rate": 4.1685066681734324e-05, + "loss": 2.6689, + "step": 38523 + }, + { + "epoch": 1.793584281956375, + "grad_norm": 0.3602124480874804, + "learning_rate": 4.1682395675376746e-05, + "loss": 2.7714, + "step": 38524 + }, + { + "epoch": 1.793630840142468, + "grad_norm": 0.32433898843702114, + "learning_rate": 4.16797246934305e-05, + "loss": 2.6465, + "step": 38525 + }, + { + "epoch": 1.793677398328561, + "grad_norm": 0.34438477397732925, + "learning_rate": 4.167705373590346e-05, + 
"loss": 2.6878, + "step": 38526 + }, + { + "epoch": 1.793723956514654, + "grad_norm": 0.3401720749686713, + "learning_rate": 4.167438280280344e-05, + "loss": 2.7281, + "step": 38527 + }, + { + "epoch": 1.7937705147007472, + "grad_norm": 0.352389503023483, + "learning_rate": 4.1671711894138285e-05, + "loss": 2.7093, + "step": 38528 + }, + { + "epoch": 1.7938170728868403, + "grad_norm": 0.3229362332976652, + "learning_rate": 4.1669041009915845e-05, + "loss": 2.6658, + "step": 38529 + }, + { + "epoch": 1.7938636310729335, + "grad_norm": 0.3501310796178739, + "learning_rate": 4.1666370150143925e-05, + "loss": 2.6954, + "step": 38530 + }, + { + "epoch": 1.7939101892590266, + "grad_norm": 0.3120879123863135, + "learning_rate": 4.1663699314830406e-05, + "loss": 2.7032, + "step": 38531 + }, + { + "epoch": 1.7939567474451197, + "grad_norm": 0.32179524893964834, + "learning_rate": 4.1661028503983116e-05, + "loss": 2.6802, + "step": 38532 + }, + { + "epoch": 1.7940033056312126, + "grad_norm": 0.34437827101881185, + "learning_rate": 4.165835771760986e-05, + "loss": 2.7036, + "step": 38533 + }, + { + "epoch": 1.7940498638173057, + "grad_norm": 0.32953873086505, + "learning_rate": 4.165568695571852e-05, + "loss": 2.6665, + "step": 38534 + }, + { + "epoch": 1.7940964220033986, + "grad_norm": 0.35313508501669855, + "learning_rate": 4.16530162183169e-05, + "loss": 2.6165, + "step": 38535 + }, + { + "epoch": 1.7941429801894917, + "grad_norm": 0.3416168046236932, + "learning_rate": 4.165034550541288e-05, + "loss": 2.6318, + "step": 38536 + }, + { + "epoch": 1.7941895383755848, + "grad_norm": 0.33878835424397574, + "learning_rate": 4.164767481701427e-05, + "loss": 2.7338, + "step": 38537 + }, + { + "epoch": 1.794236096561678, + "grad_norm": 0.34911929712345896, + "learning_rate": 4.164500415312889e-05, + "loss": 2.6704, + "step": 38538 + }, + { + "epoch": 1.794282654747771, + "grad_norm": 0.33295437978438247, + "learning_rate": 4.164233351376462e-05, + "loss": 2.7712, + "step": 38539 + }, + { + "epoch": 1.7943292129338642, + "grad_norm": 0.322596174162471, + "learning_rate": 4.163966289892928e-05, + "loss": 2.5965, + "step": 38540 + }, + { + "epoch": 1.7943757711199573, + "grad_norm": 0.33684456060094153, + "learning_rate": 4.163699230863069e-05, + "loss": 2.7286, + "step": 38541 + }, + { + "epoch": 1.7944223293060504, + "grad_norm": 0.3474807506123543, + "learning_rate": 4.1634321742876714e-05, + "loss": 2.7797, + "step": 38542 + }, + { + "epoch": 1.7944688874921433, + "grad_norm": 0.3244968148472646, + "learning_rate": 4.1631651201675166e-05, + "loss": 2.6656, + "step": 38543 + }, + { + "epoch": 1.7945154456782364, + "grad_norm": 0.3501981193325471, + "learning_rate": 4.1628980685033914e-05, + "loss": 2.6748, + "step": 38544 + }, + { + "epoch": 1.7945620038643293, + "grad_norm": 0.342073542430209, + "learning_rate": 4.1626310192960786e-05, + "loss": 2.8068, + "step": 38545 + }, + { + "epoch": 1.7946085620504224, + "grad_norm": 0.3351816816750404, + "learning_rate": 4.162363972546357e-05, + "loss": 2.6425, + "step": 38546 + }, + { + "epoch": 1.7946551202365155, + "grad_norm": 0.3323183322029008, + "learning_rate": 4.162096928255019e-05, + "loss": 2.7108, + "step": 38547 + }, + { + "epoch": 1.7947016784226086, + "grad_norm": 0.32287929684600203, + "learning_rate": 4.161829886422841e-05, + "loss": 2.6102, + "step": 38548 + }, + { + "epoch": 1.7947482366087018, + "grad_norm": 0.3479650943273897, + "learning_rate": 4.161562847050612e-05, + "loss": 2.7072, + "step": 38549 + }, + { + "epoch": 1.7947947947947949, + 
"grad_norm": 0.34975049192241564, + "learning_rate": 4.161295810139113e-05, + "loss": 2.6638, + "step": 38550 + }, + { + "epoch": 1.794841352980888, + "grad_norm": 0.3370387396200614, + "learning_rate": 4.161028775689126e-05, + "loss": 2.7311, + "step": 38551 + }, + { + "epoch": 1.794887911166981, + "grad_norm": 0.33487330893670864, + "learning_rate": 4.16076174370144e-05, + "loss": 2.7, + "step": 38552 + }, + { + "epoch": 1.794934469353074, + "grad_norm": 0.3336165223743103, + "learning_rate": 4.160494714176834e-05, + "loss": 2.5851, + "step": 38553 + }, + { + "epoch": 1.794981027539167, + "grad_norm": 0.3578932877445937, + "learning_rate": 4.1602276871160946e-05, + "loss": 2.6422, + "step": 38554 + }, + { + "epoch": 1.79502758572526, + "grad_norm": 0.3583708035488811, + "learning_rate": 4.1599606625200037e-05, + "loss": 2.6684, + "step": 38555 + }, + { + "epoch": 1.7950741439113531, + "grad_norm": 0.3554339012877601, + "learning_rate": 4.159693640389344e-05, + "loss": 2.7252, + "step": 38556 + }, + { + "epoch": 1.7951207020974462, + "grad_norm": 0.3455150251397663, + "learning_rate": 4.159426620724902e-05, + "loss": 2.7264, + "step": 38557 + }, + { + "epoch": 1.7951672602835393, + "grad_norm": 0.3419104013426173, + "learning_rate": 4.1591596035274585e-05, + "loss": 2.6908, + "step": 38558 + }, + { + "epoch": 1.7952138184696325, + "grad_norm": 0.34129363192234435, + "learning_rate": 4.158892588797801e-05, + "loss": 2.7602, + "step": 38559 + }, + { + "epoch": 1.7952603766557256, + "grad_norm": 0.3243479102889379, + "learning_rate": 4.158625576536711e-05, + "loss": 2.8021, + "step": 38560 + }, + { + "epoch": 1.7953069348418187, + "grad_norm": 0.3426987488175975, + "learning_rate": 4.158358566744969e-05, + "loss": 2.7459, + "step": 38561 + }, + { + "epoch": 1.7953534930279116, + "grad_norm": 0.34941339714984493, + "learning_rate": 4.158091559423364e-05, + "loss": 2.7934, + "step": 38562 + }, + { + "epoch": 1.7954000512140047, + "grad_norm": 0.3219176524380731, + "learning_rate": 4.157824554572678e-05, + "loss": 2.5877, + "step": 38563 + }, + { + "epoch": 1.7954466094000978, + "grad_norm": 0.33707659993039696, + "learning_rate": 4.157557552193691e-05, + "loss": 2.7289, + "step": 38564 + }, + { + "epoch": 1.7954931675861907, + "grad_norm": 0.32883104795879525, + "learning_rate": 4.1572905522871924e-05, + "loss": 2.7569, + "step": 38565 + }, + { + "epoch": 1.7955397257722838, + "grad_norm": 0.3530623229742146, + "learning_rate": 4.1570235548539614e-05, + "loss": 2.621, + "step": 38566 + }, + { + "epoch": 1.795586283958377, + "grad_norm": 0.37036836681412105, + "learning_rate": 4.156756559894785e-05, + "loss": 2.7857, + "step": 38567 + }, + { + "epoch": 1.79563284214447, + "grad_norm": 0.37329503820110477, + "learning_rate": 4.156489567410445e-05, + "loss": 2.7491, + "step": 38568 + }, + { + "epoch": 1.7956794003305632, + "grad_norm": 0.34876572057232463, + "learning_rate": 4.1562225774017225e-05, + "loss": 2.641, + "step": 38569 + }, + { + "epoch": 1.7957259585166563, + "grad_norm": 0.383035749475878, + "learning_rate": 4.155955589869406e-05, + "loss": 2.7648, + "step": 38570 + }, + { + "epoch": 1.7957725167027494, + "grad_norm": 0.3289292247397293, + "learning_rate": 4.155688604814274e-05, + "loss": 2.556, + "step": 38571 + }, + { + "epoch": 1.7958190748888423, + "grad_norm": 0.3266712777280807, + "learning_rate": 4.155421622237115e-05, + "loss": 2.6677, + "step": 38572 + }, + { + "epoch": 1.7958656330749354, + "grad_norm": 0.3417373505402613, + "learning_rate": 4.155154642138711e-05, + 
"loss": 2.6252, + "step": 38573 + }, + { + "epoch": 1.7959121912610285, + "grad_norm": 0.3242595668413963, + "learning_rate": 4.154887664519842e-05, + "loss": 2.691, + "step": 38574 + }, + { + "epoch": 1.7959587494471214, + "grad_norm": 0.3204281800957035, + "learning_rate": 4.154620689381297e-05, + "loss": 2.6672, + "step": 38575 + }, + { + "epoch": 1.7960053076332145, + "grad_norm": 0.34769244223259277, + "learning_rate": 4.154353716723856e-05, + "loss": 2.7271, + "step": 38576 + }, + { + "epoch": 1.7960518658193076, + "grad_norm": 0.3805068714967184, + "learning_rate": 4.1540867465483036e-05, + "loss": 2.7001, + "step": 38577 + }, + { + "epoch": 1.7960984240054008, + "grad_norm": 0.31152065205873664, + "learning_rate": 4.153819778855424e-05, + "loss": 2.7221, + "step": 38578 + }, + { + "epoch": 1.7961449821914939, + "grad_norm": 0.3368197906479925, + "learning_rate": 4.153552813645999e-05, + "loss": 2.7473, + "step": 38579 + }, + { + "epoch": 1.796191540377587, + "grad_norm": 0.38425282755002266, + "learning_rate": 4.1532858509208136e-05, + "loss": 2.6989, + "step": 38580 + }, + { + "epoch": 1.79623809856368, + "grad_norm": 0.3420886373449334, + "learning_rate": 4.153018890680652e-05, + "loss": 2.7883, + "step": 38581 + }, + { + "epoch": 1.796284656749773, + "grad_norm": 0.34862154729602834, + "learning_rate": 4.152751932926294e-05, + "loss": 2.7204, + "step": 38582 + }, + { + "epoch": 1.7963312149358661, + "grad_norm": 0.33570219364107584, + "learning_rate": 4.152484977658527e-05, + "loss": 2.7274, + "step": 38583 + }, + { + "epoch": 1.796377773121959, + "grad_norm": 0.3543608236282328, + "learning_rate": 4.1522180248781314e-05, + "loss": 2.6887, + "step": 38584 + }, + { + "epoch": 1.7964243313080521, + "grad_norm": 0.3529512472144658, + "learning_rate": 4.151951074585895e-05, + "loss": 2.7043, + "step": 38585 + }, + { + "epoch": 1.7964708894941452, + "grad_norm": 0.32166686639279496, + "learning_rate": 4.151684126782598e-05, + "loss": 2.6965, + "step": 38586 + }, + { + "epoch": 1.7965174476802384, + "grad_norm": 0.35942842722473123, + "learning_rate": 4.151417181469023e-05, + "loss": 2.6081, + "step": 38587 + }, + { + "epoch": 1.7965640058663315, + "grad_norm": 0.3400792618390598, + "learning_rate": 4.1511502386459564e-05, + "loss": 2.6495, + "step": 38588 + }, + { + "epoch": 1.7966105640524246, + "grad_norm": 0.3511163852409012, + "learning_rate": 4.1508832983141796e-05, + "loss": 2.6682, + "step": 38589 + }, + { + "epoch": 1.7966571222385177, + "grad_norm": 0.34693019195320335, + "learning_rate": 4.150616360474476e-05, + "loss": 2.779, + "step": 38590 + }, + { + "epoch": 1.7967036804246108, + "grad_norm": 0.36904445426534327, + "learning_rate": 4.150349425127632e-05, + "loss": 2.7018, + "step": 38591 + }, + { + "epoch": 1.7967502386107037, + "grad_norm": 0.36244867071038384, + "learning_rate": 4.150082492274427e-05, + "loss": 2.6971, + "step": 38592 + }, + { + "epoch": 1.7967967967967968, + "grad_norm": 0.3417991581501647, + "learning_rate": 4.149815561915648e-05, + "loss": 2.6618, + "step": 38593 + }, + { + "epoch": 1.7968433549828897, + "grad_norm": 0.3229448272865504, + "learning_rate": 4.149548634052073e-05, + "loss": 2.7019, + "step": 38594 + }, + { + "epoch": 1.7968899131689828, + "grad_norm": 0.3559782888951089, + "learning_rate": 4.1492817086844916e-05, + "loss": 2.7373, + "step": 38595 + }, + { + "epoch": 1.796936471355076, + "grad_norm": 0.3574025816602399, + "learning_rate": 4.149014785813685e-05, + "loss": 2.6343, + "step": 38596 + }, + { + "epoch": 1.796983029541169, + 
"grad_norm": 0.3324058356024502, + "learning_rate": 4.148747865440434e-05, + "loss": 2.6814, + "step": 38597 + }, + { + "epoch": 1.7970295877272622, + "grad_norm": 0.3794439559345801, + "learning_rate": 4.148480947565526e-05, + "loss": 2.7126, + "step": 38598 + }, + { + "epoch": 1.7970761459133553, + "grad_norm": 0.3433162971680842, + "learning_rate": 4.148214032189742e-05, + "loss": 2.6505, + "step": 38599 + }, + { + "epoch": 1.7971227040994484, + "grad_norm": 0.36336548736969426, + "learning_rate": 4.1479471193138646e-05, + "loss": 2.707, + "step": 38600 + }, + { + "epoch": 1.7971692622855413, + "grad_norm": 0.3416167674341796, + "learning_rate": 4.14768020893868e-05, + "loss": 2.7283, + "step": 38601 + }, + { + "epoch": 1.7972158204716344, + "grad_norm": 0.34296671187273986, + "learning_rate": 4.14741330106497e-05, + "loss": 2.7462, + "step": 38602 + }, + { + "epoch": 1.7972623786577275, + "grad_norm": 0.36052861149250637, + "learning_rate": 4.147146395693517e-05, + "loss": 2.6831, + "step": 38603 + }, + { + "epoch": 1.7973089368438204, + "grad_norm": 0.346518483237694, + "learning_rate": 4.146879492825106e-05, + "loss": 2.6446, + "step": 38604 + }, + { + "epoch": 1.7973554950299135, + "grad_norm": 0.36956870187581403, + "learning_rate": 4.14661259246052e-05, + "loss": 2.6391, + "step": 38605 + }, + { + "epoch": 1.7974020532160067, + "grad_norm": 0.3653758702531392, + "learning_rate": 4.146345694600543e-05, + "loss": 2.6301, + "step": 38606 + }, + { + "epoch": 1.7974486114020998, + "grad_norm": 0.3547595442191998, + "learning_rate": 4.146078799245954e-05, + "loss": 2.7835, + "step": 38607 + }, + { + "epoch": 1.7974951695881929, + "grad_norm": 0.41132520232992426, + "learning_rate": 4.1458119063975423e-05, + "loss": 2.7334, + "step": 38608 + }, + { + "epoch": 1.797541727774286, + "grad_norm": 0.33324678332680235, + "learning_rate": 4.145545016056088e-05, + "loss": 2.6758, + "step": 38609 + }, + { + "epoch": 1.7975882859603791, + "grad_norm": 0.3711623952787463, + "learning_rate": 4.1452781282223726e-05, + "loss": 2.643, + "step": 38610 + }, + { + "epoch": 1.797634844146472, + "grad_norm": 0.4107299142227123, + "learning_rate": 4.145011242897185e-05, + "loss": 2.7207, + "step": 38611 + }, + { + "epoch": 1.7976814023325651, + "grad_norm": 0.34061189139178444, + "learning_rate": 4.1447443600813016e-05, + "loss": 2.6728, + "step": 38612 + }, + { + "epoch": 1.7977279605186582, + "grad_norm": 0.34884913657636857, + "learning_rate": 4.144477479775513e-05, + "loss": 2.6744, + "step": 38613 + }, + { + "epoch": 1.7977745187047511, + "grad_norm": 0.36707719427269125, + "learning_rate": 4.144210601980598e-05, + "loss": 2.7315, + "step": 38614 + }, + { + "epoch": 1.7978210768908442, + "grad_norm": 0.3677685310794141, + "learning_rate": 4.143943726697339e-05, + "loss": 2.7108, + "step": 38615 + }, + { + "epoch": 1.7978676350769374, + "grad_norm": 0.35637114709642714, + "learning_rate": 4.1436768539265215e-05, + "loss": 2.8154, + "step": 38616 + }, + { + "epoch": 1.7979141932630305, + "grad_norm": 0.3763690413271911, + "learning_rate": 4.143409983668929e-05, + "loss": 2.7182, + "step": 38617 + }, + { + "epoch": 1.7979607514491236, + "grad_norm": 0.3504015354337608, + "learning_rate": 4.143143115925342e-05, + "loss": 2.6405, + "step": 38618 + }, + { + "epoch": 1.7980073096352167, + "grad_norm": 0.3702726470745319, + "learning_rate": 4.142876250696548e-05, + "loss": 2.7287, + "step": 38619 + }, + { + "epoch": 1.7980538678213098, + "grad_norm": 0.38065055918908347, + "learning_rate": 
4.142609387983324e-05, + "loss": 2.5859, + "step": 38620 + }, + { + "epoch": 1.7981004260074027, + "grad_norm": 0.31982412119585485, + "learning_rate": 4.14234252778646e-05, + "loss": 2.6616, + "step": 38621 + }, + { + "epoch": 1.7981469841934958, + "grad_norm": 0.34942836283062484, + "learning_rate": 4.1420756701067366e-05, + "loss": 2.7481, + "step": 38622 + }, + { + "epoch": 1.7981935423795887, + "grad_norm": 0.3438315008408821, + "learning_rate": 4.141808814944934e-05, + "loss": 2.7144, + "step": 38623 + }, + { + "epoch": 1.7982401005656818, + "grad_norm": 0.3617312836290681, + "learning_rate": 4.1415419623018396e-05, + "loss": 2.7078, + "step": 38624 + }, + { + "epoch": 1.798286658751775, + "grad_norm": 0.3159082406391151, + "learning_rate": 4.1412751121782335e-05, + "loss": 2.649, + "step": 38625 + }, + { + "epoch": 1.798333216937868, + "grad_norm": 0.33268059853016035, + "learning_rate": 4.141008264574903e-05, + "loss": 2.6831, + "step": 38626 + }, + { + "epoch": 1.7983797751239612, + "grad_norm": 0.3494080253344542, + "learning_rate": 4.1407414194926273e-05, + "loss": 2.7739, + "step": 38627 + }, + { + "epoch": 1.7984263333100543, + "grad_norm": 0.3206276622036745, + "learning_rate": 4.1404745769321906e-05, + "loss": 2.7035, + "step": 38628 + }, + { + "epoch": 1.7984728914961474, + "grad_norm": 0.3380019979612051, + "learning_rate": 4.140207736894376e-05, + "loss": 2.6147, + "step": 38629 + }, + { + "epoch": 1.7985194496822405, + "grad_norm": 0.34833411261656166, + "learning_rate": 4.139940899379967e-05, + "loss": 2.7658, + "step": 38630 + }, + { + "epoch": 1.7985660078683334, + "grad_norm": 0.3461054437130313, + "learning_rate": 4.139674064389748e-05, + "loss": 2.7558, + "step": 38631 + }, + { + "epoch": 1.7986125660544265, + "grad_norm": 0.33632585909113477, + "learning_rate": 4.139407231924501e-05, + "loss": 2.7449, + "step": 38632 + }, + { + "epoch": 1.7986591242405194, + "grad_norm": 0.3205385446860243, + "learning_rate": 4.1391404019850064e-05, + "loss": 2.698, + "step": 38633 + }, + { + "epoch": 1.7987056824266126, + "grad_norm": 0.32569456712992084, + "learning_rate": 4.138873574572053e-05, + "loss": 2.5975, + "step": 38634 + }, + { + "epoch": 1.7987522406127057, + "grad_norm": 0.37063379719749917, + "learning_rate": 4.13860674968642e-05, + "loss": 2.8314, + "step": 38635 + }, + { + "epoch": 1.7987987987987988, + "grad_norm": 0.33468835499184457, + "learning_rate": 4.138339927328889e-05, + "loss": 2.6202, + "step": 38636 + }, + { + "epoch": 1.798845356984892, + "grad_norm": 0.3295903062566117, + "learning_rate": 4.1380731075002484e-05, + "loss": 2.7859, + "step": 38637 + }, + { + "epoch": 1.798891915170985, + "grad_norm": 0.33432977666525526, + "learning_rate": 4.137806290201276e-05, + "loss": 2.6262, + "step": 38638 + }, + { + "epoch": 1.7989384733570781, + "grad_norm": 0.3514029132027016, + "learning_rate": 4.137539475432759e-05, + "loss": 2.7435, + "step": 38639 + }, + { + "epoch": 1.7989850315431712, + "grad_norm": 0.34366395889138146, + "learning_rate": 4.1372726631954784e-05, + "loss": 2.7458, + "step": 38640 + }, + { + "epoch": 1.7990315897292641, + "grad_norm": 0.3408163029597521, + "learning_rate": 4.137005853490217e-05, + "loss": 2.6482, + "step": 38641 + }, + { + "epoch": 1.7990781479153573, + "grad_norm": 0.35418717075853656, + "learning_rate": 4.136739046317758e-05, + "loss": 2.7665, + "step": 38642 + }, + { + "epoch": 1.7991247061014501, + "grad_norm": 0.3334608427115495, + "learning_rate": 4.136472241678885e-05, + "loss": 2.7588, + "step": 38643 + }, + { + 
"epoch": 1.7991712642875433, + "grad_norm": 0.34301957991946963, + "learning_rate": 4.1362054395743824e-05, + "loss": 2.6774, + "step": 38644 + }, + { + "epoch": 1.7992178224736364, + "grad_norm": 0.34367604401037877, + "learning_rate": 4.135938640005031e-05, + "loss": 2.7186, + "step": 38645 + }, + { + "epoch": 1.7992643806597295, + "grad_norm": 0.350577039075641, + "learning_rate": 4.135671842971613e-05, + "loss": 2.7445, + "step": 38646 + }, + { + "epoch": 1.7993109388458226, + "grad_norm": 0.31897421339524723, + "learning_rate": 4.1354050484749144e-05, + "loss": 2.6125, + "step": 38647 + }, + { + "epoch": 1.7993574970319157, + "grad_norm": 0.3684670343235225, + "learning_rate": 4.1351382565157156e-05, + "loss": 2.7403, + "step": 38648 + }, + { + "epoch": 1.7994040552180088, + "grad_norm": 0.34428765997290967, + "learning_rate": 4.134871467094802e-05, + "loss": 2.6847, + "step": 38649 + }, + { + "epoch": 1.7994506134041017, + "grad_norm": 0.3661136395469568, + "learning_rate": 4.134604680212956e-05, + "loss": 2.7185, + "step": 38650 + }, + { + "epoch": 1.7994971715901948, + "grad_norm": 0.34551683166773406, + "learning_rate": 4.134337895870958e-05, + "loss": 2.7042, + "step": 38651 + }, + { + "epoch": 1.799543729776288, + "grad_norm": 0.3415564400148854, + "learning_rate": 4.134071114069595e-05, + "loss": 2.6443, + "step": 38652 + }, + { + "epoch": 1.7995902879623809, + "grad_norm": 0.3613702724165966, + "learning_rate": 4.133804334809648e-05, + "loss": 2.7455, + "step": 38653 + }, + { + "epoch": 1.799636846148474, + "grad_norm": 0.34558607565373434, + "learning_rate": 4.133537558091899e-05, + "loss": 2.6127, + "step": 38654 + }, + { + "epoch": 1.799683404334567, + "grad_norm": 0.34251646627599797, + "learning_rate": 4.133270783917131e-05, + "loss": 2.5811, + "step": 38655 + }, + { + "epoch": 1.7997299625206602, + "grad_norm": 0.34832308198836065, + "learning_rate": 4.133004012286129e-05, + "loss": 2.5897, + "step": 38656 + }, + { + "epoch": 1.7997765207067533, + "grad_norm": 0.34187946443836686, + "learning_rate": 4.132737243199676e-05, + "loss": 2.6946, + "step": 38657 + }, + { + "epoch": 1.7998230788928464, + "grad_norm": 0.33393015938750936, + "learning_rate": 4.1324704766585524e-05, + "loss": 2.6926, + "step": 38658 + }, + { + "epoch": 1.7998696370789395, + "grad_norm": 0.34334977838646774, + "learning_rate": 4.132203712663541e-05, + "loss": 2.6719, + "step": 38659 + }, + { + "epoch": 1.7999161952650324, + "grad_norm": 0.35829204459326763, + "learning_rate": 4.131936951215429e-05, + "loss": 2.5626, + "step": 38660 + }, + { + "epoch": 1.7999627534511256, + "grad_norm": 0.35845071763681174, + "learning_rate": 4.1316701923149936e-05, + "loss": 2.7601, + "step": 38661 + }, + { + "epoch": 1.8000093116372187, + "grad_norm": 0.3503273024818964, + "learning_rate": 4.131403435963023e-05, + "loss": 2.7141, + "step": 38662 + }, + { + "epoch": 1.8000558698233116, + "grad_norm": 0.3547478018142213, + "learning_rate": 4.1311366821602985e-05, + "loss": 2.7231, + "step": 38663 + }, + { + "epoch": 1.8001024280094047, + "grad_norm": 0.36100098394974317, + "learning_rate": 4.130869930907599e-05, + "loss": 2.6686, + "step": 38664 + }, + { + "epoch": 1.8001489861954978, + "grad_norm": 0.3424044422633051, + "learning_rate": 4.1306031822057136e-05, + "loss": 2.695, + "step": 38665 + }, + { + "epoch": 1.800195544381591, + "grad_norm": 0.34985455256802167, + "learning_rate": 4.13033643605542e-05, + "loss": 2.7486, + "step": 38666 + }, + { + "epoch": 1.800242102567684, + "grad_norm": 0.3594887771377898, + 
"learning_rate": 4.130069692457506e-05, + "loss": 2.8048, + "step": 38667 + }, + { + "epoch": 1.8002886607537771, + "grad_norm": 0.3408765198808437, + "learning_rate": 4.129802951412749e-05, + "loss": 2.8313, + "step": 38668 + }, + { + "epoch": 1.8003352189398703, + "grad_norm": 0.3513773844820193, + "learning_rate": 4.129536212921935e-05, + "loss": 2.6822, + "step": 38669 + }, + { + "epoch": 1.8003817771259631, + "grad_norm": 0.30557633174658244, + "learning_rate": 4.129269476985848e-05, + "loss": 2.6208, + "step": 38670 + }, + { + "epoch": 1.8004283353120563, + "grad_norm": 0.31298429861248334, + "learning_rate": 4.129002743605269e-05, + "loss": 2.689, + "step": 38671 + }, + { + "epoch": 1.8004748934981492, + "grad_norm": 0.358985059440423, + "learning_rate": 4.1287360127809795e-05, + "loss": 2.6603, + "step": 38672 + }, + { + "epoch": 1.8005214516842423, + "grad_norm": 0.33321326438629867, + "learning_rate": 4.128469284513766e-05, + "loss": 2.7217, + "step": 38673 + }, + { + "epoch": 1.8005680098703354, + "grad_norm": 0.32700138238299253, + "learning_rate": 4.128202558804407e-05, + "loss": 2.6414, + "step": 38674 + }, + { + "epoch": 1.8006145680564285, + "grad_norm": 0.3382405056210557, + "learning_rate": 4.12793583565369e-05, + "loss": 2.7545, + "step": 38675 + }, + { + "epoch": 1.8006611262425216, + "grad_norm": 0.36491278239526437, + "learning_rate": 4.1276691150623945e-05, + "loss": 2.6584, + "step": 38676 + }, + { + "epoch": 1.8007076844286147, + "grad_norm": 0.3278470799954173, + "learning_rate": 4.127402397031303e-05, + "loss": 2.8517, + "step": 38677 + }, + { + "epoch": 1.8007542426147078, + "grad_norm": 0.3222867741646513, + "learning_rate": 4.127135681561201e-05, + "loss": 2.7436, + "step": 38678 + }, + { + "epoch": 1.800800800800801, + "grad_norm": 0.32253623309026463, + "learning_rate": 4.126868968652869e-05, + "loss": 2.7292, + "step": 38679 + }, + { + "epoch": 1.8008473589868939, + "grad_norm": 0.32637934159834436, + "learning_rate": 4.126602258307092e-05, + "loss": 2.6435, + "step": 38680 + }, + { + "epoch": 1.800893917172987, + "grad_norm": 0.3226241191022385, + "learning_rate": 4.12633555052465e-05, + "loss": 2.6891, + "step": 38681 + }, + { + "epoch": 1.8009404753590799, + "grad_norm": 0.3328956117437457, + "learning_rate": 4.126068845306327e-05, + "loss": 2.8441, + "step": 38682 + }, + { + "epoch": 1.800987033545173, + "grad_norm": 0.3503050128780207, + "learning_rate": 4.125802142652907e-05, + "loss": 2.7266, + "step": 38683 + }, + { + "epoch": 1.801033591731266, + "grad_norm": 0.32900877815074864, + "learning_rate": 4.1255354425651694e-05, + "loss": 2.7346, + "step": 38684 + }, + { + "epoch": 1.8010801499173592, + "grad_norm": 0.35062109630215904, + "learning_rate": 4.1252687450439006e-05, + "loss": 2.6875, + "step": 38685 + }, + { + "epoch": 1.8011267081034523, + "grad_norm": 0.33930493360842723, + "learning_rate": 4.125002050089883e-05, + "loss": 2.6789, + "step": 38686 + }, + { + "epoch": 1.8011732662895454, + "grad_norm": 0.33468232480066173, + "learning_rate": 4.124735357703895e-05, + "loss": 2.686, + "step": 38687 + }, + { + "epoch": 1.8012198244756386, + "grad_norm": 0.3443916169960441, + "learning_rate": 4.124468667886726e-05, + "loss": 2.6351, + "step": 38688 + }, + { + "epoch": 1.8012663826617314, + "grad_norm": 0.3568308571017444, + "learning_rate": 4.124201980639154e-05, + "loss": 2.6848, + "step": 38689 + }, + { + "epoch": 1.8013129408478246, + "grad_norm": 0.3329558366706986, + "learning_rate": 4.123935295961961e-05, + "loss": 2.7333, + "step": 38690 + 
}, + { + "epoch": 1.8013594990339177, + "grad_norm": 0.3795859517388841, + "learning_rate": 4.1236686138559336e-05, + "loss": 2.7675, + "step": 38691 + }, + { + "epoch": 1.8014060572200106, + "grad_norm": 0.32107494691987093, + "learning_rate": 4.123401934321852e-05, + "loss": 2.6777, + "step": 38692 + }, + { + "epoch": 1.8014526154061037, + "grad_norm": 0.3391215578744703, + "learning_rate": 4.1231352573604994e-05, + "loss": 2.7316, + "step": 38693 + }, + { + "epoch": 1.8014991735921968, + "grad_norm": 0.35075478123152176, + "learning_rate": 4.1228685829726586e-05, + "loss": 2.6135, + "step": 38694 + }, + { + "epoch": 1.80154573177829, + "grad_norm": 0.3375876432823044, + "learning_rate": 4.1226019111591116e-05, + "loss": 2.6427, + "step": 38695 + }, + { + "epoch": 1.801592289964383, + "grad_norm": 0.3391433436259079, + "learning_rate": 4.122335241920643e-05, + "loss": 2.6664, + "step": 38696 + }, + { + "epoch": 1.8016388481504761, + "grad_norm": 0.3200148974638541, + "learning_rate": 4.122068575258031e-05, + "loss": 2.6741, + "step": 38697 + }, + { + "epoch": 1.8016854063365693, + "grad_norm": 0.3629343860932827, + "learning_rate": 4.121801911172064e-05, + "loss": 2.7459, + "step": 38698 + }, + { + "epoch": 1.8017319645226622, + "grad_norm": 0.3661661255710923, + "learning_rate": 4.121535249663522e-05, + "loss": 2.7067, + "step": 38699 + }, + { + "epoch": 1.8017785227087553, + "grad_norm": 0.3363983690485917, + "learning_rate": 4.121268590733185e-05, + "loss": 2.7038, + "step": 38700 + }, + { + "epoch": 1.8018250808948484, + "grad_norm": 0.34345482746987704, + "learning_rate": 4.1210019343818404e-05, + "loss": 2.6887, + "step": 38701 + }, + { + "epoch": 1.8018716390809413, + "grad_norm": 0.3401480166628749, + "learning_rate": 4.120735280610266e-05, + "loss": 2.7522, + "step": 38702 + }, + { + "epoch": 1.8019181972670344, + "grad_norm": 0.3529264278574365, + "learning_rate": 4.12046862941925e-05, + "loss": 2.672, + "step": 38703 + }, + { + "epoch": 1.8019647554531275, + "grad_norm": 0.3487281761244056, + "learning_rate": 4.120201980809571e-05, + "loss": 2.6744, + "step": 38704 + }, + { + "epoch": 1.8020113136392206, + "grad_norm": 0.3441056647490279, + "learning_rate": 4.1199353347820116e-05, + "loss": 2.6679, + "step": 38705 + }, + { + "epoch": 1.8020578718253137, + "grad_norm": 0.3680430022080864, + "learning_rate": 4.1196686913373564e-05, + "loss": 2.7279, + "step": 38706 + }, + { + "epoch": 1.8021044300114069, + "grad_norm": 0.34082171338406186, + "learning_rate": 4.119402050476386e-05, + "loss": 2.7297, + "step": 38707 + }, + { + "epoch": 1.8021509881975, + "grad_norm": 0.34423089760980763, + "learning_rate": 4.119135412199884e-05, + "loss": 2.723, + "step": 38708 + }, + { + "epoch": 1.8021975463835929, + "grad_norm": 0.32343007578595734, + "learning_rate": 4.118868776508633e-05, + "loss": 2.6993, + "step": 38709 + }, + { + "epoch": 1.802244104569686, + "grad_norm": 0.35312644374134416, + "learning_rate": 4.118602143403414e-05, + "loss": 2.658, + "step": 38710 + }, + { + "epoch": 1.8022906627557789, + "grad_norm": 0.3302388640305007, + "learning_rate": 4.118335512885013e-05, + "loss": 2.6419, + "step": 38711 + }, + { + "epoch": 1.802337220941872, + "grad_norm": 0.3345949138190023, + "learning_rate": 4.118068884954211e-05, + "loss": 2.6686, + "step": 38712 + }, + { + "epoch": 1.802383779127965, + "grad_norm": 0.35345548280161126, + "learning_rate": 4.117802259611786e-05, + "loss": 2.7204, + "step": 38713 + }, + { + "epoch": 1.8024303373140582, + "grad_norm": 0.33894443294728155, + 
"learning_rate": 4.1175356368585274e-05, + "loss": 2.6981, + "step": 38714 + }, + { + "epoch": 1.8024768955001513, + "grad_norm": 0.359292873148614, + "learning_rate": 4.117269016695213e-05, + "loss": 2.6559, + "step": 38715 + }, + { + "epoch": 1.8025234536862444, + "grad_norm": 0.35074424213106586, + "learning_rate": 4.117002399122629e-05, + "loss": 2.6824, + "step": 38716 + }, + { + "epoch": 1.8025700118723376, + "grad_norm": 0.358810922712057, + "learning_rate": 4.1167357841415564e-05, + "loss": 2.7348, + "step": 38717 + }, + { + "epoch": 1.8026165700584307, + "grad_norm": 0.35816474117322517, + "learning_rate": 4.116469171752776e-05, + "loss": 2.7269, + "step": 38718 + }, + { + "epoch": 1.8026631282445236, + "grad_norm": 0.3679824311353968, + "learning_rate": 4.116202561957072e-05, + "loss": 2.7344, + "step": 38719 + }, + { + "epoch": 1.8027096864306167, + "grad_norm": 0.35163309834773526, + "learning_rate": 4.115935954755225e-05, + "loss": 2.764, + "step": 38720 + }, + { + "epoch": 1.8027562446167096, + "grad_norm": 0.3719479524823415, + "learning_rate": 4.1156693501480204e-05, + "loss": 2.6425, + "step": 38721 + }, + { + "epoch": 1.8028028028028027, + "grad_norm": 0.340447571012591, + "learning_rate": 4.11540274813624e-05, + "loss": 2.7055, + "step": 38722 + }, + { + "epoch": 1.8028493609888958, + "grad_norm": 0.3821270737719364, + "learning_rate": 4.115136148720662e-05, + "loss": 2.7106, + "step": 38723 + }, + { + "epoch": 1.802895919174989, + "grad_norm": 0.32296978879063404, + "learning_rate": 4.1148695519020754e-05, + "loss": 2.5704, + "step": 38724 + }, + { + "epoch": 1.802942477361082, + "grad_norm": 0.3474501865642728, + "learning_rate": 4.114602957681259e-05, + "loss": 2.5999, + "step": 38725 + }, + { + "epoch": 1.8029890355471752, + "grad_norm": 0.34263747766700214, + "learning_rate": 4.114336366058994e-05, + "loss": 2.6576, + "step": 38726 + }, + { + "epoch": 1.8030355937332683, + "grad_norm": 0.3236812549234348, + "learning_rate": 4.1140697770360666e-05, + "loss": 2.6768, + "step": 38727 + }, + { + "epoch": 1.8030821519193614, + "grad_norm": 0.33427692428852906, + "learning_rate": 4.1138031906132546e-05, + "loss": 2.6245, + "step": 38728 + }, + { + "epoch": 1.8031287101054543, + "grad_norm": 0.319255471251694, + "learning_rate": 4.113536606791345e-05, + "loss": 2.7354, + "step": 38729 + }, + { + "epoch": 1.8031752682915474, + "grad_norm": 0.3452946275164182, + "learning_rate": 4.113270025571118e-05, + "loss": 2.609, + "step": 38730 + }, + { + "epoch": 1.8032218264776403, + "grad_norm": 0.334026804527284, + "learning_rate": 4.113003446953356e-05, + "loss": 2.765, + "step": 38731 + }, + { + "epoch": 1.8032683846637334, + "grad_norm": 0.3463667668987888, + "learning_rate": 4.1127368709388414e-05, + "loss": 2.7358, + "step": 38732 + }, + { + "epoch": 1.8033149428498265, + "grad_norm": 0.33971592626915653, + "learning_rate": 4.112470297528355e-05, + "loss": 2.6287, + "step": 38733 + }, + { + "epoch": 1.8033615010359196, + "grad_norm": 0.3158187677367427, + "learning_rate": 4.112203726722683e-05, + "loss": 2.6707, + "step": 38734 + }, + { + "epoch": 1.8034080592220127, + "grad_norm": 0.3218032894163683, + "learning_rate": 4.111937158522606e-05, + "loss": 2.7597, + "step": 38735 + }, + { + "epoch": 1.8034546174081059, + "grad_norm": 0.3559250740339391, + "learning_rate": 4.1116705929289034e-05, + "loss": 2.6622, + "step": 38736 + }, + { + "epoch": 1.803501175594199, + "grad_norm": 0.3407835568854891, + "learning_rate": 4.111404029942362e-05, + "loss": 2.7406, + "step": 38737 + }, 
+ { + "epoch": 1.8035477337802919, + "grad_norm": 0.33450616968567376, + "learning_rate": 4.111137469563761e-05, + "loss": 2.6231, + "step": 38738 + }, + { + "epoch": 1.803594291966385, + "grad_norm": 0.33150543578356867, + "learning_rate": 4.1108709117938855e-05, + "loss": 2.671, + "step": 38739 + }, + { + "epoch": 1.803640850152478, + "grad_norm": 0.32126259540265545, + "learning_rate": 4.110604356633516e-05, + "loss": 2.4777, + "step": 38740 + }, + { + "epoch": 1.803687408338571, + "grad_norm": 0.3480474127679224, + "learning_rate": 4.110337804083434e-05, + "loss": 2.6778, + "step": 38741 + }, + { + "epoch": 1.8037339665246641, + "grad_norm": 0.32715549678911343, + "learning_rate": 4.110071254144423e-05, + "loss": 2.6184, + "step": 38742 + }, + { + "epoch": 1.8037805247107572, + "grad_norm": 0.3295479021529072, + "learning_rate": 4.109804706817267e-05, + "loss": 2.6198, + "step": 38743 + }, + { + "epoch": 1.8038270828968503, + "grad_norm": 0.3548991551392478, + "learning_rate": 4.109538162102745e-05, + "loss": 2.7243, + "step": 38744 + }, + { + "epoch": 1.8038736410829435, + "grad_norm": 0.32904701071064196, + "learning_rate": 4.109271620001642e-05, + "loss": 2.6829, + "step": 38745 + }, + { + "epoch": 1.8039201992690366, + "grad_norm": 0.347413281561974, + "learning_rate": 4.109005080514736e-05, + "loss": 2.7844, + "step": 38746 + }, + { + "epoch": 1.8039667574551297, + "grad_norm": 0.3520490459652975, + "learning_rate": 4.108738543642815e-05, + "loss": 2.7398, + "step": 38747 + }, + { + "epoch": 1.8040133156412226, + "grad_norm": 0.3349170932329025, + "learning_rate": 4.108472009386659e-05, + "loss": 2.7104, + "step": 38748 + }, + { + "epoch": 1.8040598738273157, + "grad_norm": 0.32417734192010805, + "learning_rate": 4.1082054777470466e-05, + "loss": 2.6673, + "step": 38749 + }, + { + "epoch": 1.8041064320134088, + "grad_norm": 0.34281278862394216, + "learning_rate": 4.107938948724767e-05, + "loss": 2.7576, + "step": 38750 + }, + { + "epoch": 1.8041529901995017, + "grad_norm": 0.33192888104584756, + "learning_rate": 4.107672422320595e-05, + "loss": 2.7285, + "step": 38751 + }, + { + "epoch": 1.8041995483855948, + "grad_norm": 0.3259712546979593, + "learning_rate": 4.1074058985353196e-05, + "loss": 2.7388, + "step": 38752 + }, + { + "epoch": 1.804246106571688, + "grad_norm": 0.35441581366433084, + "learning_rate": 4.1071393773697196e-05, + "loss": 2.6652, + "step": 38753 + }, + { + "epoch": 1.804292664757781, + "grad_norm": 0.3046474266689073, + "learning_rate": 4.106872858824576e-05, + "loss": 2.6276, + "step": 38754 + }, + { + "epoch": 1.8043392229438742, + "grad_norm": 0.33852374726455303, + "learning_rate": 4.106606342900674e-05, + "loss": 2.7191, + "step": 38755 + }, + { + "epoch": 1.8043857811299673, + "grad_norm": 0.32584043964153553, + "learning_rate": 4.106339829598794e-05, + "loss": 2.6255, + "step": 38756 + }, + { + "epoch": 1.8044323393160604, + "grad_norm": 0.3433584929371455, + "learning_rate": 4.1060733189197184e-05, + "loss": 2.754, + "step": 38757 + }, + { + "epoch": 1.8044788975021533, + "grad_norm": 0.3312872890475871, + "learning_rate": 4.1058068108642314e-05, + "loss": 2.7229, + "step": 38758 + }, + { + "epoch": 1.8045254556882464, + "grad_norm": 0.35047199581749583, + "learning_rate": 4.10554030543311e-05, + "loss": 2.6725, + "step": 38759 + }, + { + "epoch": 1.8045720138743393, + "grad_norm": 0.33910292007928966, + "learning_rate": 4.1052738026271416e-05, + "loss": 2.7283, + "step": 38760 + }, + { + "epoch": 1.8046185720604324, + "grad_norm": 0.35862927901457536, 
+ "learning_rate": 4.105007302447107e-05, + "loss": 2.6105, + "step": 38761 + }, + { + "epoch": 1.8046651302465255, + "grad_norm": 0.35580959687493935, + "learning_rate": 4.1047408048937855e-05, + "loss": 2.6933, + "step": 38762 + }, + { + "epoch": 1.8047116884326186, + "grad_norm": 0.3190779655071627, + "learning_rate": 4.104474309967963e-05, + "loss": 2.5922, + "step": 38763 + }, + { + "epoch": 1.8047582466187118, + "grad_norm": 0.35483007310591413, + "learning_rate": 4.104207817670419e-05, + "loss": 2.6296, + "step": 38764 + }, + { + "epoch": 1.8048048048048049, + "grad_norm": 0.3562814349479105, + "learning_rate": 4.103941328001938e-05, + "loss": 2.7093, + "step": 38765 + }, + { + "epoch": 1.804851362990898, + "grad_norm": 0.3207274539745593, + "learning_rate": 4.103674840963302e-05, + "loss": 2.5866, + "step": 38766 + }, + { + "epoch": 1.804897921176991, + "grad_norm": 0.36071629534863753, + "learning_rate": 4.1034083565552895e-05, + "loss": 2.5542, + "step": 38767 + }, + { + "epoch": 1.804944479363084, + "grad_norm": 0.3766611234167527, + "learning_rate": 4.1031418747786873e-05, + "loss": 2.759, + "step": 38768 + }, + { + "epoch": 1.8049910375491771, + "grad_norm": 0.334384838345163, + "learning_rate": 4.102875395634273e-05, + "loss": 2.6634, + "step": 38769 + }, + { + "epoch": 1.80503759573527, + "grad_norm": 0.35287634691288, + "learning_rate": 4.102608919122835e-05, + "loss": 2.5651, + "step": 38770 + }, + { + "epoch": 1.8050841539213631, + "grad_norm": 0.3458111268044893, + "learning_rate": 4.102342445245149e-05, + "loss": 2.6695, + "step": 38771 + }, + { + "epoch": 1.8051307121074562, + "grad_norm": 0.33001903752209544, + "learning_rate": 4.102075974001998e-05, + "loss": 2.773, + "step": 38772 + }, + { + "epoch": 1.8051772702935494, + "grad_norm": 0.36324593178436554, + "learning_rate": 4.101809505394167e-05, + "loss": 2.7543, + "step": 38773 + }, + { + "epoch": 1.8052238284796425, + "grad_norm": 0.327573126470936, + "learning_rate": 4.1015430394224356e-05, + "loss": 2.6634, + "step": 38774 + }, + { + "epoch": 1.8052703866657356, + "grad_norm": 0.3196884215695783, + "learning_rate": 4.1012765760875885e-05, + "loss": 2.6664, + "step": 38775 + }, + { + "epoch": 1.8053169448518287, + "grad_norm": 0.3392666681884208, + "learning_rate": 4.101010115390407e-05, + "loss": 2.6905, + "step": 38776 + }, + { + "epoch": 1.8053635030379216, + "grad_norm": 0.3408092199790123, + "learning_rate": 4.1007436573316685e-05, + "loss": 2.6565, + "step": 38777 + }, + { + "epoch": 1.8054100612240147, + "grad_norm": 0.36921744316171706, + "learning_rate": 4.100477201912162e-05, + "loss": 2.7394, + "step": 38778 + }, + { + "epoch": 1.8054566194101078, + "grad_norm": 0.3305487295814414, + "learning_rate": 4.100210749132666e-05, + "loss": 2.7519, + "step": 38779 + }, + { + "epoch": 1.8055031775962007, + "grad_norm": 0.3374925077404596, + "learning_rate": 4.099944298993961e-05, + "loss": 2.637, + "step": 38780 + }, + { + "epoch": 1.8055497357822938, + "grad_norm": 0.34106723403468137, + "learning_rate": 4.0996778514968324e-05, + "loss": 2.6706, + "step": 38781 + }, + { + "epoch": 1.805596293968387, + "grad_norm": 0.31767917249323224, + "learning_rate": 4.0994114066420596e-05, + "loss": 2.7973, + "step": 38782 + }, + { + "epoch": 1.80564285215448, + "grad_norm": 0.3715170467798684, + "learning_rate": 4.0991449644304274e-05, + "loss": 2.6608, + "step": 38783 + }, + { + "epoch": 1.8056894103405732, + "grad_norm": 0.3370169396491089, + "learning_rate": 4.098878524862716e-05, + "loss": 2.7166, + "step": 38784 + 
}, + { + "epoch": 1.8057359685266663, + "grad_norm": 0.3271647353703679, + "learning_rate": 4.098612087939704e-05, + "loss": 2.613, + "step": 38785 + }, + { + "epoch": 1.8057825267127594, + "grad_norm": 0.3377701434523164, + "learning_rate": 4.09834565366218e-05, + "loss": 2.6455, + "step": 38786 + }, + { + "epoch": 1.8058290848988523, + "grad_norm": 0.3401218537720924, + "learning_rate": 4.0980792220309194e-05, + "loss": 2.7634, + "step": 38787 + }, + { + "epoch": 1.8058756430849454, + "grad_norm": 0.3138754619198864, + "learning_rate": 4.09781279304671e-05, + "loss": 2.6726, + "step": 38788 + }, + { + "epoch": 1.8059222012710385, + "grad_norm": 0.3588900449744648, + "learning_rate": 4.097546366710332e-05, + "loss": 2.7336, + "step": 38789 + }, + { + "epoch": 1.8059687594571314, + "grad_norm": 0.3414562389235136, + "learning_rate": 4.097279943022564e-05, + "loss": 2.6473, + "step": 38790 + }, + { + "epoch": 1.8060153176432245, + "grad_norm": 0.306124360292022, + "learning_rate": 4.097013521984192e-05, + "loss": 2.6472, + "step": 38791 + }, + { + "epoch": 1.8060618758293177, + "grad_norm": 0.3205480640518892, + "learning_rate": 4.096747103595996e-05, + "loss": 2.7642, + "step": 38792 + }, + { + "epoch": 1.8061084340154108, + "grad_norm": 0.35316371939880853, + "learning_rate": 4.0964806878587575e-05, + "loss": 2.663, + "step": 38793 + }, + { + "epoch": 1.8061549922015039, + "grad_norm": 0.32287763879082626, + "learning_rate": 4.0962142747732605e-05, + "loss": 2.6646, + "step": 38794 + }, + { + "epoch": 1.806201550387597, + "grad_norm": 0.3270480401727339, + "learning_rate": 4.095947864340285e-05, + "loss": 2.7085, + "step": 38795 + }, + { + "epoch": 1.8062481085736901, + "grad_norm": 0.3477012061526711, + "learning_rate": 4.095681456560614e-05, + "loss": 2.6406, + "step": 38796 + }, + { + "epoch": 1.806294666759783, + "grad_norm": 0.3513902346244724, + "learning_rate": 4.095415051435027e-05, + "loss": 2.6878, + "step": 38797 + }, + { + "epoch": 1.8063412249458761, + "grad_norm": 0.31881090575200294, + "learning_rate": 4.095148648964309e-05, + "loss": 2.7466, + "step": 38798 + }, + { + "epoch": 1.806387783131969, + "grad_norm": 0.3258075970493666, + "learning_rate": 4.0948822491492416e-05, + "loss": 2.7511, + "step": 38799 + }, + { + "epoch": 1.8064343413180621, + "grad_norm": 0.3472949678984111, + "learning_rate": 4.094615851990603e-05, + "loss": 2.6814, + "step": 38800 + }, + { + "epoch": 1.8064808995041552, + "grad_norm": 0.3163326193949557, + "learning_rate": 4.094349457489181e-05, + "loss": 2.7216, + "step": 38801 + }, + { + "epoch": 1.8065274576902484, + "grad_norm": 0.36705726858065296, + "learning_rate": 4.094083065645752e-05, + "loss": 2.6881, + "step": 38802 + }, + { + "epoch": 1.8065740158763415, + "grad_norm": 0.3793052783070659, + "learning_rate": 4.093816676461099e-05, + "loss": 2.6625, + "step": 38803 + }, + { + "epoch": 1.8066205740624346, + "grad_norm": 0.3134072452978688, + "learning_rate": 4.0935502899360075e-05, + "loss": 2.7898, + "step": 38804 + }, + { + "epoch": 1.8066671322485277, + "grad_norm": 0.41748386871453963, + "learning_rate": 4.0932839060712546e-05, + "loss": 2.6432, + "step": 38805 + }, + { + "epoch": 1.8067136904346208, + "grad_norm": 0.33763895466847876, + "learning_rate": 4.0930175248676244e-05, + "loss": 2.532, + "step": 38806 + }, + { + "epoch": 1.8067602486207137, + "grad_norm": 0.32705157197910184, + "learning_rate": 4.0927511463258996e-05, + "loss": 2.63, + "step": 38807 + }, + { + "epoch": 1.8068068068068068, + "grad_norm": 0.3578107423364514, + 
"learning_rate": 4.0924847704468595e-05, + "loss": 2.7177, + "step": 38808 + }, + { + "epoch": 1.8068533649928997, + "grad_norm": 0.3530649052484095, + "learning_rate": 4.092218397231288e-05, + "loss": 2.617, + "step": 38809 + }, + { + "epoch": 1.8068999231789928, + "grad_norm": 0.33002068371464777, + "learning_rate": 4.0919520266799645e-05, + "loss": 2.6616, + "step": 38810 + }, + { + "epoch": 1.806946481365086, + "grad_norm": 0.3902011873423421, + "learning_rate": 4.0916856587936744e-05, + "loss": 2.687, + "step": 38811 + }, + { + "epoch": 1.806993039551179, + "grad_norm": 0.35411817779323973, + "learning_rate": 4.091419293573197e-05, + "loss": 2.7757, + "step": 38812 + }, + { + "epoch": 1.8070395977372722, + "grad_norm": 0.3688216874182142, + "learning_rate": 4.091152931019313e-05, + "loss": 2.7657, + "step": 38813 + }, + { + "epoch": 1.8070861559233653, + "grad_norm": 0.34515162456612425, + "learning_rate": 4.090886571132807e-05, + "loss": 2.7093, + "step": 38814 + }, + { + "epoch": 1.8071327141094584, + "grad_norm": 0.35466879011772784, + "learning_rate": 4.09062021391446e-05, + "loss": 2.6302, + "step": 38815 + }, + { + "epoch": 1.8071792722955515, + "grad_norm": 0.389523318836823, + "learning_rate": 4.0903538593650505e-05, + "loss": 2.6874, + "step": 38816 + }, + { + "epoch": 1.8072258304816444, + "grad_norm": 0.33614510296088385, + "learning_rate": 4.090087507485365e-05, + "loss": 2.7066, + "step": 38817 + }, + { + "epoch": 1.8072723886677375, + "grad_norm": 0.3892388897338362, + "learning_rate": 4.0898211582761816e-05, + "loss": 2.6328, + "step": 38818 + }, + { + "epoch": 1.8073189468538304, + "grad_norm": 0.32404438538013625, + "learning_rate": 4.089554811738284e-05, + "loss": 2.7294, + "step": 38819 + }, + { + "epoch": 1.8073655050399235, + "grad_norm": 0.34473213721299834, + "learning_rate": 4.0892884678724545e-05, + "loss": 2.7574, + "step": 38820 + }, + { + "epoch": 1.8074120632260167, + "grad_norm": 0.35241762766706936, + "learning_rate": 4.0890221266794725e-05, + "loss": 2.56, + "step": 38821 + }, + { + "epoch": 1.8074586214121098, + "grad_norm": 0.33161162730262383, + "learning_rate": 4.0887557881601216e-05, + "loss": 2.6651, + "step": 38822 + }, + { + "epoch": 1.807505179598203, + "grad_norm": 0.36828029203121776, + "learning_rate": 4.088489452315181e-05, + "loss": 2.7032, + "step": 38823 + }, + { + "epoch": 1.807551737784296, + "grad_norm": 0.33051002034284915, + "learning_rate": 4.0882231191454353e-05, + "loss": 2.7041, + "step": 38824 + }, + { + "epoch": 1.8075982959703891, + "grad_norm": 0.3355843858764818, + "learning_rate": 4.087956788651665e-05, + "loss": 2.7925, + "step": 38825 + }, + { + "epoch": 1.807644854156482, + "grad_norm": 0.35735567493924386, + "learning_rate": 4.087690460834651e-05, + "loss": 2.6675, + "step": 38826 + }, + { + "epoch": 1.8076914123425751, + "grad_norm": 0.331135313383058, + "learning_rate": 4.087424135695177e-05, + "loss": 2.7022, + "step": 38827 + }, + { + "epoch": 1.8077379705286682, + "grad_norm": 0.3688236570321101, + "learning_rate": 4.087157813234021e-05, + "loss": 2.6639, + "step": 38828 + }, + { + "epoch": 1.8077845287147611, + "grad_norm": 0.35252143192901, + "learning_rate": 4.086891493451969e-05, + "loss": 2.7093, + "step": 38829 + }, + { + "epoch": 1.8078310869008543, + "grad_norm": 0.3986754976751077, + "learning_rate": 4.0866251763498e-05, + "loss": 2.7005, + "step": 38830 + }, + { + "epoch": 1.8078776450869474, + "grad_norm": 0.33092820946030743, + "learning_rate": 4.0863588619282945e-05, + "loss": 2.6868, + "step": 38831 
+ }, + { + "epoch": 1.8079242032730405, + "grad_norm": 0.34586682481902964, + "learning_rate": 4.086092550188237e-05, + "loss": 2.6702, + "step": 38832 + }, + { + "epoch": 1.8079707614591336, + "grad_norm": 0.3352972466883483, + "learning_rate": 4.085826241130409e-05, + "loss": 2.5252, + "step": 38833 + }, + { + "epoch": 1.8080173196452267, + "grad_norm": 0.35743624791343487, + "learning_rate": 4.08555993475559e-05, + "loss": 2.6134, + "step": 38834 + }, + { + "epoch": 1.8080638778313198, + "grad_norm": 0.3541295131086382, + "learning_rate": 4.085293631064563e-05, + "loss": 2.6966, + "step": 38835 + }, + { + "epoch": 1.8081104360174127, + "grad_norm": 0.34499609343474974, + "learning_rate": 4.085027330058107e-05, + "loss": 2.6919, + "step": 38836 + }, + { + "epoch": 1.8081569942035058, + "grad_norm": 0.35924629757562715, + "learning_rate": 4.084761031737008e-05, + "loss": 2.6001, + "step": 38837 + }, + { + "epoch": 1.808203552389599, + "grad_norm": 0.367775420410362, + "learning_rate": 4.0844947361020446e-05, + "loss": 2.7704, + "step": 38838 + }, + { + "epoch": 1.8082501105756918, + "grad_norm": 0.3320807040718589, + "learning_rate": 4.084228443153997e-05, + "loss": 2.7672, + "step": 38839 + }, + { + "epoch": 1.808296668761785, + "grad_norm": 0.413015910343144, + "learning_rate": 4.083962152893651e-05, + "loss": 2.6838, + "step": 38840 + }, + { + "epoch": 1.808343226947878, + "grad_norm": 0.32841696469693116, + "learning_rate": 4.083695865321784e-05, + "loss": 2.7446, + "step": 38841 + }, + { + "epoch": 1.8083897851339712, + "grad_norm": 0.3781554192194099, + "learning_rate": 4.0834295804391814e-05, + "loss": 2.6726, + "step": 38842 + }, + { + "epoch": 1.8084363433200643, + "grad_norm": 0.35546695872272244, + "learning_rate": 4.083163298246623e-05, + "loss": 2.6123, + "step": 38843 + }, + { + "epoch": 1.8084829015061574, + "grad_norm": 0.37922641862248413, + "learning_rate": 4.082897018744887e-05, + "loss": 2.7269, + "step": 38844 + }, + { + "epoch": 1.8085294596922505, + "grad_norm": 0.39171917745336143, + "learning_rate": 4.0826307419347606e-05, + "loss": 2.7116, + "step": 38845 + }, + { + "epoch": 1.8085760178783434, + "grad_norm": 0.36771249006506274, + "learning_rate": 4.0823644678170214e-05, + "loss": 2.7356, + "step": 38846 + }, + { + "epoch": 1.8086225760644365, + "grad_norm": 0.36545944019129817, + "learning_rate": 4.082098196392453e-05, + "loss": 2.7337, + "step": 38847 + }, + { + "epoch": 1.8086691342505294, + "grad_norm": 0.3721156020343247, + "learning_rate": 4.081831927661836e-05, + "loss": 2.6782, + "step": 38848 + }, + { + "epoch": 1.8087156924366226, + "grad_norm": 0.3153297148869219, + "learning_rate": 4.0815656616259494e-05, + "loss": 2.6713, + "step": 38849 + }, + { + "epoch": 1.8087622506227157, + "grad_norm": 0.3856803513692698, + "learning_rate": 4.0812993982855796e-05, + "loss": 2.6676, + "step": 38850 + }, + { + "epoch": 1.8088088088088088, + "grad_norm": 0.347099757482274, + "learning_rate": 4.0810331376415054e-05, + "loss": 2.7556, + "step": 38851 + }, + { + "epoch": 1.808855366994902, + "grad_norm": 0.347546340691568, + "learning_rate": 4.080766879694507e-05, + "loss": 2.7991, + "step": 38852 + }, + { + "epoch": 1.808901925180995, + "grad_norm": 0.395912926504639, + "learning_rate": 4.0805006244453684e-05, + "loss": 2.5302, + "step": 38853 + }, + { + "epoch": 1.8089484833670881, + "grad_norm": 0.34325107996077153, + "learning_rate": 4.080234371894869e-05, + "loss": 2.6386, + "step": 38854 + }, + { + "epoch": 1.8089950415531812, + "grad_norm": 
0.3311983914514653, + "learning_rate": 4.0799681220437915e-05, + "loss": 2.6572, + "step": 38855 + }, + { + "epoch": 1.8090415997392741, + "grad_norm": 0.3582748081464505, + "learning_rate": 4.079701874892918e-05, + "loss": 2.7135, + "step": 38856 + }, + { + "epoch": 1.8090881579253673, + "grad_norm": 0.3403607009882459, + "learning_rate": 4.079435630443027e-05, + "loss": 2.6863, + "step": 38857 + }, + { + "epoch": 1.8091347161114602, + "grad_norm": 0.3379006815448616, + "learning_rate": 4.0791693886949035e-05, + "loss": 2.7688, + "step": 38858 + }, + { + "epoch": 1.8091812742975533, + "grad_norm": 0.33200964979892517, + "learning_rate": 4.078903149649326e-05, + "loss": 2.5638, + "step": 38859 + }, + { + "epoch": 1.8092278324836464, + "grad_norm": 0.338936800365937, + "learning_rate": 4.078636913307078e-05, + "loss": 2.7577, + "step": 38860 + }, + { + "epoch": 1.8092743906697395, + "grad_norm": 0.35136511098981, + "learning_rate": 4.07837067966894e-05, + "loss": 2.542, + "step": 38861 + }, + { + "epoch": 1.8093209488558326, + "grad_norm": 0.3321155479714679, + "learning_rate": 4.078104448735691e-05, + "loss": 2.773, + "step": 38862 + }, + { + "epoch": 1.8093675070419257, + "grad_norm": 0.3662398589251514, + "learning_rate": 4.0778382205081164e-05, + "loss": 2.7682, + "step": 38863 + }, + { + "epoch": 1.8094140652280188, + "grad_norm": 0.3345628490935635, + "learning_rate": 4.077571994986994e-05, + "loss": 2.711, + "step": 38864 + }, + { + "epoch": 1.8094606234141117, + "grad_norm": 0.36939878260292197, + "learning_rate": 4.077305772173109e-05, + "loss": 2.7006, + "step": 38865 + }, + { + "epoch": 1.8095071816002049, + "grad_norm": 0.3691030260438024, + "learning_rate": 4.077039552067241e-05, + "loss": 2.6663, + "step": 38866 + }, + { + "epoch": 1.809553739786298, + "grad_norm": 0.31885704074510757, + "learning_rate": 4.076773334670169e-05, + "loss": 2.6427, + "step": 38867 + }, + { + "epoch": 1.8096002979723909, + "grad_norm": 0.41063966810327096, + "learning_rate": 4.0765071199826776e-05, + "loss": 2.7666, + "step": 38868 + }, + { + "epoch": 1.809646856158484, + "grad_norm": 0.3637258558176958, + "learning_rate": 4.076240908005546e-05, + "loss": 2.7084, + "step": 38869 + }, + { + "epoch": 1.809693414344577, + "grad_norm": 0.35895533018696885, + "learning_rate": 4.075974698739557e-05, + "loss": 2.7059, + "step": 38870 + }, + { + "epoch": 1.8097399725306702, + "grad_norm": 0.3851326340594565, + "learning_rate": 4.075708492185492e-05, + "loss": 2.7038, + "step": 38871 + }, + { + "epoch": 1.8097865307167633, + "grad_norm": 0.33481716385390803, + "learning_rate": 4.07544228834413e-05, + "loss": 2.6983, + "step": 38872 + }, + { + "epoch": 1.8098330889028564, + "grad_norm": 0.33391005293488274, + "learning_rate": 4.0751760872162554e-05, + "loss": 2.6567, + "step": 38873 + }, + { + "epoch": 1.8098796470889496, + "grad_norm": 0.35432544414848277, + "learning_rate": 4.0749098888026476e-05, + "loss": 2.7443, + "step": 38874 + }, + { + "epoch": 1.8099262052750424, + "grad_norm": 0.33446831385437775, + "learning_rate": 4.074643693104086e-05, + "loss": 2.7266, + "step": 38875 + }, + { + "epoch": 1.8099727634611356, + "grad_norm": 0.3613445224104598, + "learning_rate": 4.074377500121356e-05, + "loss": 2.6275, + "step": 38876 + }, + { + "epoch": 1.8100193216472287, + "grad_norm": 0.3407684485882496, + "learning_rate": 4.0741113098552355e-05, + "loss": 2.7807, + "step": 38877 + }, + { + "epoch": 1.8100658798333216, + "grad_norm": 0.3672902471447179, + "learning_rate": 4.0738451223065085e-05, + "loss": 
2.7616, + "step": 38878 + }, + { + "epoch": 1.8101124380194147, + "grad_norm": 0.3466768110246867, + "learning_rate": 4.073578937475955e-05, + "loss": 2.6111, + "step": 38879 + }, + { + "epoch": 1.8101589962055078, + "grad_norm": 0.32805593597757415, + "learning_rate": 4.0733127553643535e-05, + "loss": 2.6236, + "step": 38880 + }, + { + "epoch": 1.810205554391601, + "grad_norm": 0.322800158857597, + "learning_rate": 4.07304657597249e-05, + "loss": 2.8477, + "step": 38881 + }, + { + "epoch": 1.810252112577694, + "grad_norm": 0.34780654332577193, + "learning_rate": 4.0727803993011434e-05, + "loss": 2.7853, + "step": 38882 + }, + { + "epoch": 1.8102986707637871, + "grad_norm": 0.356276995151732, + "learning_rate": 4.072514225351094e-05, + "loss": 2.7262, + "step": 38883 + }, + { + "epoch": 1.8103452289498803, + "grad_norm": 0.33098356626209846, + "learning_rate": 4.072248054123125e-05, + "loss": 2.6748, + "step": 38884 + }, + { + "epoch": 1.8103917871359732, + "grad_norm": 0.30422561192172454, + "learning_rate": 4.0719818856180156e-05, + "loss": 2.6324, + "step": 38885 + }, + { + "epoch": 1.8104383453220663, + "grad_norm": 0.33542046597417663, + "learning_rate": 4.071715719836549e-05, + "loss": 2.7049, + "step": 38886 + }, + { + "epoch": 1.8104849035081592, + "grad_norm": 0.3625349885449704, + "learning_rate": 4.071449556779504e-05, + "loss": 2.7162, + "step": 38887 + }, + { + "epoch": 1.8105314616942523, + "grad_norm": 0.34023920643650996, + "learning_rate": 4.071183396447664e-05, + "loss": 2.6823, + "step": 38888 + }, + { + "epoch": 1.8105780198803454, + "grad_norm": 0.32144834291490904, + "learning_rate": 4.07091723884181e-05, + "loss": 2.6451, + "step": 38889 + }, + { + "epoch": 1.8106245780664385, + "grad_norm": 0.33965564219624206, + "learning_rate": 4.0706510839627204e-05, + "loss": 2.621, + "step": 38890 + }, + { + "epoch": 1.8106711362525316, + "grad_norm": 0.35533244610148734, + "learning_rate": 4.0703849318111804e-05, + "loss": 2.6676, + "step": 38891 + }, + { + "epoch": 1.8107176944386247, + "grad_norm": 0.35007822277392153, + "learning_rate": 4.070118782387968e-05, + "loss": 2.8612, + "step": 38892 + }, + { + "epoch": 1.8107642526247179, + "grad_norm": 0.35508619830939614, + "learning_rate": 4.069852635693864e-05, + "loss": 2.7022, + "step": 38893 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.34884717489394085, + "learning_rate": 4.069586491729653e-05, + "loss": 2.683, + "step": 38894 + }, + { + "epoch": 1.8108573689969039, + "grad_norm": 0.35402291631420174, + "learning_rate": 4.069320350496113e-05, + "loss": 2.7085, + "step": 38895 + }, + { + "epoch": 1.810903927182997, + "grad_norm": 0.3432282686312159, + "learning_rate": 4.069054211994026e-05, + "loss": 2.69, + "step": 38896 + }, + { + "epoch": 1.8109504853690899, + "grad_norm": 0.3419469805303767, + "learning_rate": 4.068788076224175e-05, + "loss": 2.6544, + "step": 38897 + }, + { + "epoch": 1.810997043555183, + "grad_norm": 0.33764401541277167, + "learning_rate": 4.068521943187337e-05, + "loss": 2.5771, + "step": 38898 + }, + { + "epoch": 1.811043601741276, + "grad_norm": 0.33524934637093184, + "learning_rate": 4.068255812884297e-05, + "loss": 2.7281, + "step": 38899 + }, + { + "epoch": 1.8110901599273692, + "grad_norm": 0.3404515836767772, + "learning_rate": 4.0679896853158316e-05, + "loss": 2.7205, + "step": 38900 + }, + { + "epoch": 1.8111367181134623, + "grad_norm": 0.359741464507169, + "learning_rate": 4.067723560482727e-05, + "loss": 2.6529, + "step": 38901 + }, + { + "epoch": 1.8111832762995554, + 
"grad_norm": 0.3243646744917432, + "learning_rate": 4.0674574383857626e-05, + "loss": 2.6225, + "step": 38902 + }, + { + "epoch": 1.8112298344856486, + "grad_norm": 0.3883209106812247, + "learning_rate": 4.067191319025716e-05, + "loss": 2.6956, + "step": 38903 + }, + { + "epoch": 1.8112763926717417, + "grad_norm": 0.34892521323825815, + "learning_rate": 4.0669252024033736e-05, + "loss": 2.78, + "step": 38904 + }, + { + "epoch": 1.8113229508578346, + "grad_norm": 0.33016804269017286, + "learning_rate": 4.0666590885195116e-05, + "loss": 2.6915, + "step": 38905 + }, + { + "epoch": 1.8113695090439277, + "grad_norm": 0.3651495065036368, + "learning_rate": 4.066392977374915e-05, + "loss": 2.7088, + "step": 38906 + }, + { + "epoch": 1.8114160672300206, + "grad_norm": 0.33321625283626, + "learning_rate": 4.066126868970363e-05, + "loss": 2.6695, + "step": 38907 + }, + { + "epoch": 1.8114626254161137, + "grad_norm": 0.3362643522134732, + "learning_rate": 4.0658607633066356e-05, + "loss": 2.6817, + "step": 38908 + }, + { + "epoch": 1.8115091836022068, + "grad_norm": 0.365112508002451, + "learning_rate": 4.065594660384515e-05, + "loss": 2.7443, + "step": 38909 + }, + { + "epoch": 1.8115557417883, + "grad_norm": 0.35602455410906475, + "learning_rate": 4.0653285602047835e-05, + "loss": 2.6289, + "step": 38910 + }, + { + "epoch": 1.811602299974393, + "grad_norm": 0.3754400949361551, + "learning_rate": 4.065062462768219e-05, + "loss": 2.7147, + "step": 38911 + }, + { + "epoch": 1.8116488581604862, + "grad_norm": 0.33766046072284817, + "learning_rate": 4.064796368075605e-05, + "loss": 2.687, + "step": 38912 + }, + { + "epoch": 1.8116954163465793, + "grad_norm": 0.3871166322930066, + "learning_rate": 4.06453027612772e-05, + "loss": 2.6595, + "step": 38913 + }, + { + "epoch": 1.8117419745326722, + "grad_norm": 0.3869953613756066, + "learning_rate": 4.064264186925348e-05, + "loss": 2.748, + "step": 38914 + }, + { + "epoch": 1.8117885327187653, + "grad_norm": 0.3818083889684815, + "learning_rate": 4.063998100469268e-05, + "loss": 2.6295, + "step": 38915 + }, + { + "epoch": 1.8118350909048584, + "grad_norm": 0.34533920377041166, + "learning_rate": 4.0637320167602605e-05, + "loss": 2.6876, + "step": 38916 + }, + { + "epoch": 1.8118816490909513, + "grad_norm": 0.34875241041787974, + "learning_rate": 4.063465935799109e-05, + "loss": 2.6923, + "step": 38917 + }, + { + "epoch": 1.8119282072770444, + "grad_norm": 0.33858583842565365, + "learning_rate": 4.06319985758659e-05, + "loss": 2.6286, + "step": 38918 + }, + { + "epoch": 1.8119747654631375, + "grad_norm": 0.3516459193246981, + "learning_rate": 4.06293378212349e-05, + "loss": 2.673, + "step": 38919 + }, + { + "epoch": 1.8120213236492306, + "grad_norm": 0.32728986973138263, + "learning_rate": 4.062667709410587e-05, + "loss": 2.5561, + "step": 38920 + }, + { + "epoch": 1.8120678818353237, + "grad_norm": 0.33874148231735546, + "learning_rate": 4.06240163944866e-05, + "loss": 2.6924, + "step": 38921 + }, + { + "epoch": 1.8121144400214169, + "grad_norm": 0.35978691179474503, + "learning_rate": 4.062135572238492e-05, + "loss": 2.7109, + "step": 38922 + }, + { + "epoch": 1.81216099820751, + "grad_norm": 0.3604971483977961, + "learning_rate": 4.061869507780864e-05, + "loss": 2.6638, + "step": 38923 + }, + { + "epoch": 1.8122075563936029, + "grad_norm": 0.3386542697631576, + "learning_rate": 4.0616034460765586e-05, + "loss": 2.6615, + "step": 38924 + }, + { + "epoch": 1.812254114579696, + "grad_norm": 0.34068446238430294, + "learning_rate": 4.061337387126354e-05, + 
"loss": 2.5964, + "step": 38925 + }, + { + "epoch": 1.812300672765789, + "grad_norm": 0.3350942881959181, + "learning_rate": 4.0610713309310286e-05, + "loss": 2.666, + "step": 38926 + }, + { + "epoch": 1.812347230951882, + "grad_norm": 0.34712513508499554, + "learning_rate": 4.06080527749137e-05, + "loss": 2.8187, + "step": 38927 + }, + { + "epoch": 1.812393789137975, + "grad_norm": 0.35806370258599735, + "learning_rate": 4.0605392268081544e-05, + "loss": 2.6373, + "step": 38928 + }, + { + "epoch": 1.8124403473240682, + "grad_norm": 0.3483798435738056, + "learning_rate": 4.060273178882161e-05, + "loss": 2.6817, + "step": 38929 + }, + { + "epoch": 1.8124869055101613, + "grad_norm": 0.3118098909873005, + "learning_rate": 4.0600071337141764e-05, + "loss": 2.6901, + "step": 38930 + }, + { + "epoch": 1.8125334636962545, + "grad_norm": 0.4036733287677606, + "learning_rate": 4.059741091304975e-05, + "loss": 2.6555, + "step": 38931 + }, + { + "epoch": 1.8125800218823476, + "grad_norm": 0.36023774913790885, + "learning_rate": 4.059475051655345e-05, + "loss": 2.6881, + "step": 38932 + }, + { + "epoch": 1.8126265800684407, + "grad_norm": 0.3414936690604972, + "learning_rate": 4.059209014766061e-05, + "loss": 2.6411, + "step": 38933 + }, + { + "epoch": 1.8126731382545336, + "grad_norm": 0.4087359617797181, + "learning_rate": 4.058942980637906e-05, + "loss": 2.6561, + "step": 38934 + }, + { + "epoch": 1.8127196964406267, + "grad_norm": 0.32190326818017256, + "learning_rate": 4.05867694927166e-05, + "loss": 2.6516, + "step": 38935 + }, + { + "epoch": 1.8127662546267196, + "grad_norm": 0.34010666373803283, + "learning_rate": 4.058410920668105e-05, + "loss": 2.6618, + "step": 38936 + }, + { + "epoch": 1.8128128128128127, + "grad_norm": 0.37101651688124315, + "learning_rate": 4.058144894828022e-05, + "loss": 2.6708, + "step": 38937 + }, + { + "epoch": 1.8128593709989058, + "grad_norm": 0.3256279992524662, + "learning_rate": 4.057878871752191e-05, + "loss": 2.6455, + "step": 38938 + }, + { + "epoch": 1.812905929184999, + "grad_norm": 0.3534518442526089, + "learning_rate": 4.05761285144139e-05, + "loss": 2.6991, + "step": 38939 + }, + { + "epoch": 1.812952487371092, + "grad_norm": 0.3503559249912625, + "learning_rate": 4.057346833896405e-05, + "loss": 2.6657, + "step": 38940 + }, + { + "epoch": 1.8129990455571852, + "grad_norm": 0.35353111496450446, + "learning_rate": 4.0570808191180123e-05, + "loss": 2.663, + "step": 38941 + }, + { + "epoch": 1.8130456037432783, + "grad_norm": 0.34651009873604427, + "learning_rate": 4.056814807106996e-05, + "loss": 2.6049, + "step": 38942 + }, + { + "epoch": 1.8130921619293714, + "grad_norm": 0.3152296355488541, + "learning_rate": 4.0565487978641356e-05, + "loss": 2.7963, + "step": 38943 + }, + { + "epoch": 1.8131387201154643, + "grad_norm": 0.3798766110696621, + "learning_rate": 4.05628279139021e-05, + "loss": 2.6392, + "step": 38944 + }, + { + "epoch": 1.8131852783015574, + "grad_norm": 0.3196910992942392, + "learning_rate": 4.056016787686003e-05, + "loss": 2.7456, + "step": 38945 + }, + { + "epoch": 1.8132318364876503, + "grad_norm": 0.34121631246249423, + "learning_rate": 4.055750786752294e-05, + "loss": 2.7141, + "step": 38946 + }, + { + "epoch": 1.8132783946737434, + "grad_norm": 0.3471985597261328, + "learning_rate": 4.0554847885898616e-05, + "loss": 2.7856, + "step": 38947 + }, + { + "epoch": 1.8133249528598365, + "grad_norm": 0.34814961822763024, + "learning_rate": 4.055218793199489e-05, + "loss": 2.7316, + "step": 38948 + }, + { + "epoch": 1.8133715110459296, + 
"grad_norm": 0.34195759626529854, + "learning_rate": 4.054952800581956e-05, + "loss": 2.8022, + "step": 38949 + }, + { + "epoch": 1.8134180692320228, + "grad_norm": 0.34167524484892337, + "learning_rate": 4.054686810738045e-05, + "loss": 2.5475, + "step": 38950 + }, + { + "epoch": 1.8134646274181159, + "grad_norm": 0.32460222001566336, + "learning_rate": 4.0544208236685345e-05, + "loss": 2.6238, + "step": 38951 + }, + { + "epoch": 1.813511185604209, + "grad_norm": 0.3700825138381542, + "learning_rate": 4.0541548393742035e-05, + "loss": 2.5799, + "step": 38952 + }, + { + "epoch": 1.8135577437903019, + "grad_norm": 0.31843552828749044, + "learning_rate": 4.053888857855837e-05, + "loss": 2.6998, + "step": 38953 + }, + { + "epoch": 1.813604301976395, + "grad_norm": 0.34294699691335284, + "learning_rate": 4.053622879114212e-05, + "loss": 2.7105, + "step": 38954 + }, + { + "epoch": 1.813650860162488, + "grad_norm": 0.33395114652590774, + "learning_rate": 4.0533569031501124e-05, + "loss": 2.6208, + "step": 38955 + }, + { + "epoch": 1.813697418348581, + "grad_norm": 0.356249723227736, + "learning_rate": 4.053090929964316e-05, + "loss": 2.6049, + "step": 38956 + }, + { + "epoch": 1.8137439765346741, + "grad_norm": 0.3272832273942646, + "learning_rate": 4.052824959557604e-05, + "loss": 2.655, + "step": 38957 + }, + { + "epoch": 1.8137905347207672, + "grad_norm": 0.37301889263482557, + "learning_rate": 4.0525589919307586e-05, + "loss": 2.6884, + "step": 38958 + }, + { + "epoch": 1.8138370929068603, + "grad_norm": 0.3571727598036592, + "learning_rate": 4.052293027084558e-05, + "loss": 2.692, + "step": 38959 + }, + { + "epoch": 1.8138836510929535, + "grad_norm": 0.35290554352814246, + "learning_rate": 4.0520270650197856e-05, + "loss": 2.6316, + "step": 38960 + }, + { + "epoch": 1.8139302092790466, + "grad_norm": 0.3575360752169247, + "learning_rate": 4.0517611057372194e-05, + "loss": 2.8028, + "step": 38961 + }, + { + "epoch": 1.8139767674651397, + "grad_norm": 0.37307055384576315, + "learning_rate": 4.051495149237641e-05, + "loss": 2.6541, + "step": 38962 + }, + { + "epoch": 1.8140233256512326, + "grad_norm": 0.3642255620508803, + "learning_rate": 4.051229195521831e-05, + "loss": 2.6719, + "step": 38963 + }, + { + "epoch": 1.8140698838373257, + "grad_norm": 0.34519452791788485, + "learning_rate": 4.050963244590571e-05, + "loss": 2.6628, + "step": 38964 + }, + { + "epoch": 1.8141164420234188, + "grad_norm": 0.3691441410817026, + "learning_rate": 4.050697296444638e-05, + "loss": 2.6945, + "step": 38965 + }, + { + "epoch": 1.8141630002095117, + "grad_norm": 0.3604236669252662, + "learning_rate": 4.0504313510848166e-05, + "loss": 2.5972, + "step": 38966 + }, + { + "epoch": 1.8142095583956048, + "grad_norm": 0.3546978621650094, + "learning_rate": 4.0501654085118835e-05, + "loss": 2.6856, + "step": 38967 + }, + { + "epoch": 1.814256116581698, + "grad_norm": 0.3577392767996026, + "learning_rate": 4.049899468726623e-05, + "loss": 2.6646, + "step": 38968 + }, + { + "epoch": 1.814302674767791, + "grad_norm": 0.3362404287187111, + "learning_rate": 4.049633531729815e-05, + "loss": 2.744, + "step": 38969 + }, + { + "epoch": 1.8143492329538842, + "grad_norm": 0.372784624848169, + "learning_rate": 4.0493675975222364e-05, + "loss": 2.7645, + "step": 38970 + }, + { + "epoch": 1.8143957911399773, + "grad_norm": 0.3542575325593819, + "learning_rate": 4.0491016661046724e-05, + "loss": 2.7121, + "step": 38971 + }, + { + "epoch": 1.8144423493260704, + "grad_norm": 0.3282260214993383, + "learning_rate": 
4.048835737477901e-05, + "loss": 2.5963, + "step": 38972 + }, + { + "epoch": 1.8144889075121633, + "grad_norm": 0.37794038704332605, + "learning_rate": 4.048569811642703e-05, + "loss": 2.7693, + "step": 38973 + }, + { + "epoch": 1.8145354656982564, + "grad_norm": 0.3749348987266149, + "learning_rate": 4.048303888599858e-05, + "loss": 2.7285, + "step": 38974 + }, + { + "epoch": 1.8145820238843493, + "grad_norm": 0.38369406280722373, + "learning_rate": 4.0480379683501475e-05, + "loss": 2.8351, + "step": 38975 + }, + { + "epoch": 1.8146285820704424, + "grad_norm": 0.3792656669582637, + "learning_rate": 4.047772050894353e-05, + "loss": 2.6594, + "step": 38976 + }, + { + "epoch": 1.8146751402565355, + "grad_norm": 0.37053644720661777, + "learning_rate": 4.0475061362332514e-05, + "loss": 2.7499, + "step": 38977 + }, + { + "epoch": 1.8147216984426287, + "grad_norm": 0.3358141860704548, + "learning_rate": 4.047240224367627e-05, + "loss": 2.6692, + "step": 38978 + }, + { + "epoch": 1.8147682566287218, + "grad_norm": 0.3533695969023047, + "learning_rate": 4.0469743152982594e-05, + "loss": 2.7495, + "step": 38979 + }, + { + "epoch": 1.8148148148148149, + "grad_norm": 0.3742800732429658, + "learning_rate": 4.046708409025926e-05, + "loss": 2.5598, + "step": 38980 + }, + { + "epoch": 1.814861373000908, + "grad_norm": 0.344891797416244, + "learning_rate": 4.046442505551412e-05, + "loss": 2.7452, + "step": 38981 + }, + { + "epoch": 1.8149079311870011, + "grad_norm": 0.3529685862977182, + "learning_rate": 4.046176604875495e-05, + "loss": 2.7667, + "step": 38982 + }, + { + "epoch": 1.814954489373094, + "grad_norm": 0.3613905235491128, + "learning_rate": 4.045910706998953e-05, + "loss": 2.7066, + "step": 38983 + }, + { + "epoch": 1.8150010475591871, + "grad_norm": 0.3331699687005352, + "learning_rate": 4.045644811922572e-05, + "loss": 2.6566, + "step": 38984 + }, + { + "epoch": 1.81504760574528, + "grad_norm": 0.35157605075950094, + "learning_rate": 4.045378919647128e-05, + "loss": 2.6351, + "step": 38985 + }, + { + "epoch": 1.8150941639313731, + "grad_norm": 0.3661326139199406, + "learning_rate": 4.045113030173404e-05, + "loss": 2.6851, + "step": 38986 + }, + { + "epoch": 1.8151407221174662, + "grad_norm": 0.34090367932697574, + "learning_rate": 4.044847143502178e-05, + "loss": 2.6921, + "step": 38987 + }, + { + "epoch": 1.8151872803035594, + "grad_norm": 0.36367631137551065, + "learning_rate": 4.044581259634232e-05, + "loss": 2.6244, + "step": 38988 + }, + { + "epoch": 1.8152338384896525, + "grad_norm": 0.31384238895192784, + "learning_rate": 4.044315378570347e-05, + "loss": 2.572, + "step": 38989 + }, + { + "epoch": 1.8152803966757456, + "grad_norm": 0.34397777705802274, + "learning_rate": 4.0440495003113e-05, + "loss": 2.7541, + "step": 38990 + }, + { + "epoch": 1.8153269548618387, + "grad_norm": 0.325547385436237, + "learning_rate": 4.043783624857875e-05, + "loss": 2.6496, + "step": 38991 + }, + { + "epoch": 1.8153735130479316, + "grad_norm": 0.3483989509388879, + "learning_rate": 4.043517752210851e-05, + "loss": 2.642, + "step": 38992 + }, + { + "epoch": 1.8154200712340247, + "grad_norm": 0.32744156403349967, + "learning_rate": 4.043251882371006e-05, + "loss": 2.7281, + "step": 38993 + }, + { + "epoch": 1.8154666294201178, + "grad_norm": 0.3599706864784047, + "learning_rate": 4.042986015339126e-05, + "loss": 2.6979, + "step": 38994 + }, + { + "epoch": 1.8155131876062107, + "grad_norm": 0.344399290904582, + "learning_rate": 4.042720151115984e-05, + "loss": 2.7024, + "step": 38995 + }, + { + "epoch": 
1.8155597457923038, + "grad_norm": 0.35561278016594855, + "learning_rate": 4.042454289702367e-05, + "loss": 2.6434, + "step": 38996 + }, + { + "epoch": 1.815606303978397, + "grad_norm": 0.34411786479767853, + "learning_rate": 4.042188431099052e-05, + "loss": 2.6431, + "step": 38997 + }, + { + "epoch": 1.81565286216449, + "grad_norm": 0.33813715214848133, + "learning_rate": 4.041922575306818e-05, + "loss": 2.6442, + "step": 38998 + }, + { + "epoch": 1.8156994203505832, + "grad_norm": 0.3112936757667804, + "learning_rate": 4.0416567223264487e-05, + "loss": 2.6813, + "step": 38999 + }, + { + "epoch": 1.8157459785366763, + "grad_norm": 0.3366394433971916, + "learning_rate": 4.041390872158721e-05, + "loss": 2.6135, + "step": 39000 + }, + { + "epoch": 1.8157925367227694, + "grad_norm": 0.3451380806629587, + "learning_rate": 4.041125024804417e-05, + "loss": 2.6632, + "step": 39001 + }, + { + "epoch": 1.8158390949088623, + "grad_norm": 0.34009546021269615, + "learning_rate": 4.040859180264318e-05, + "loss": 2.7216, + "step": 39002 + }, + { + "epoch": 1.8158856530949554, + "grad_norm": 0.3516318079356653, + "learning_rate": 4.0405933385392004e-05, + "loss": 2.6322, + "step": 39003 + }, + { + "epoch": 1.8159322112810485, + "grad_norm": 0.32582256728559245, + "learning_rate": 4.0403274996298485e-05, + "loss": 2.6756, + "step": 39004 + }, + { + "epoch": 1.8159787694671414, + "grad_norm": 0.3141665231596749, + "learning_rate": 4.0400616635370416e-05, + "loss": 2.6496, + "step": 39005 + }, + { + "epoch": 1.8160253276532345, + "grad_norm": 0.3406364873592392, + "learning_rate": 4.039795830261556e-05, + "loss": 2.6829, + "step": 39006 + }, + { + "epoch": 1.8160718858393277, + "grad_norm": 0.32520247026935506, + "learning_rate": 4.039529999804178e-05, + "loss": 2.6376, + "step": 39007 + }, + { + "epoch": 1.8161184440254208, + "grad_norm": 0.33348419969275167, + "learning_rate": 4.039264172165682e-05, + "loss": 2.6329, + "step": 39008 + }, + { + "epoch": 1.816165002211514, + "grad_norm": 0.3393573158919137, + "learning_rate": 4.038998347346854e-05, + "loss": 2.712, + "step": 39009 + }, + { + "epoch": 1.816211560397607, + "grad_norm": 0.35014000495438874, + "learning_rate": 4.0387325253484707e-05, + "loss": 2.6291, + "step": 39010 + }, + { + "epoch": 1.8162581185837001, + "grad_norm": 0.356831638001036, + "learning_rate": 4.0384667061713116e-05, + "loss": 2.7673, + "step": 39011 + }, + { + "epoch": 1.816304676769793, + "grad_norm": 0.3363118191813924, + "learning_rate": 4.0382008898161586e-05, + "loss": 2.7172, + "step": 39012 + }, + { + "epoch": 1.8163512349558861, + "grad_norm": 0.33012992984940787, + "learning_rate": 4.03793507628379e-05, + "loss": 2.6422, + "step": 39013 + }, + { + "epoch": 1.816397793141979, + "grad_norm": 0.34855987535024885, + "learning_rate": 4.037669265574989e-05, + "loss": 2.618, + "step": 39014 + }, + { + "epoch": 1.8164443513280721, + "grad_norm": 0.35383480228974384, + "learning_rate": 4.037403457690534e-05, + "loss": 2.7558, + "step": 39015 + }, + { + "epoch": 1.8164909095141653, + "grad_norm": 0.33407740923609436, + "learning_rate": 4.0371376526312025e-05, + "loss": 2.6561, + "step": 39016 + }, + { + "epoch": 1.8165374677002584, + "grad_norm": 0.32393415409682874, + "learning_rate": 4.03687185039778e-05, + "loss": 2.6627, + "step": 39017 + }, + { + "epoch": 1.8165840258863515, + "grad_norm": 0.3568163327876261, + "learning_rate": 4.036606050991043e-05, + "loss": 2.7034, + "step": 39018 + }, + { + "epoch": 1.8166305840724446, + "grad_norm": 0.3477190104646451, + 
"learning_rate": 4.03634025441177e-05, + "loss": 2.7042, + "step": 39019 + }, + { + "epoch": 1.8166771422585377, + "grad_norm": 0.34084893767884084, + "learning_rate": 4.0360744606607465e-05, + "loss": 2.7481, + "step": 39020 + }, + { + "epoch": 1.8167237004446308, + "grad_norm": 0.3734277132042335, + "learning_rate": 4.0358086697387464e-05, + "loss": 2.5762, + "step": 39021 + }, + { + "epoch": 1.8167702586307237, + "grad_norm": 0.3113834812722344, + "learning_rate": 4.0355428816465554e-05, + "loss": 2.6512, + "step": 39022 + }, + { + "epoch": 1.8168168168168168, + "grad_norm": 0.37081324770494134, + "learning_rate": 4.035277096384952e-05, + "loss": 2.7786, + "step": 39023 + }, + { + "epoch": 1.8168633750029097, + "grad_norm": 0.34451278306993555, + "learning_rate": 4.035011313954713e-05, + "loss": 2.6351, + "step": 39024 + }, + { + "epoch": 1.8169099331890028, + "grad_norm": 0.34347388451733585, + "learning_rate": 4.034745534356622e-05, + "loss": 2.8111, + "step": 39025 + }, + { + "epoch": 1.816956491375096, + "grad_norm": 0.40494440448636365, + "learning_rate": 4.034479757591456e-05, + "loss": 2.763, + "step": 39026 + }, + { + "epoch": 1.817003049561189, + "grad_norm": 0.3773027339516002, + "learning_rate": 4.0342139836599976e-05, + "loss": 2.8178, + "step": 39027 + }, + { + "epoch": 1.8170496077472822, + "grad_norm": 0.4240095427678085, + "learning_rate": 4.033948212563028e-05, + "loss": 2.7159, + "step": 39028 + }, + { + "epoch": 1.8170961659333753, + "grad_norm": 0.3297166865302894, + "learning_rate": 4.033682444301322e-05, + "loss": 2.7687, + "step": 39029 + }, + { + "epoch": 1.8171427241194684, + "grad_norm": 0.4110225617786652, + "learning_rate": 4.033416678875665e-05, + "loss": 2.6809, + "step": 39030 + }, + { + "epoch": 1.8171892823055615, + "grad_norm": 0.3513701006114964, + "learning_rate": 4.033150916286833e-05, + "loss": 2.7376, + "step": 39031 + }, + { + "epoch": 1.8172358404916544, + "grad_norm": 0.3847110658014229, + "learning_rate": 4.03288515653561e-05, + "loss": 2.6584, + "step": 39032 + }, + { + "epoch": 1.8172823986777475, + "grad_norm": 0.349864572368053, + "learning_rate": 4.0326193996227726e-05, + "loss": 2.7291, + "step": 39033 + }, + { + "epoch": 1.8173289568638404, + "grad_norm": 0.346909843617485, + "learning_rate": 4.032353645549101e-05, + "loss": 2.7535, + "step": 39034 + }, + { + "epoch": 1.8173755150499336, + "grad_norm": 0.370617222139525, + "learning_rate": 4.032087894315378e-05, + "loss": 2.6965, + "step": 39035 + }, + { + "epoch": 1.8174220732360267, + "grad_norm": 0.34352166940094153, + "learning_rate": 4.031822145922382e-05, + "loss": 2.7319, + "step": 39036 + }, + { + "epoch": 1.8174686314221198, + "grad_norm": 0.3325983614920337, + "learning_rate": 4.031556400370891e-05, + "loss": 2.7211, + "step": 39037 + }, + { + "epoch": 1.817515189608213, + "grad_norm": 0.3538196519061783, + "learning_rate": 4.031290657661688e-05, + "loss": 2.7034, + "step": 39038 + }, + { + "epoch": 1.817561747794306, + "grad_norm": 0.3415708448190787, + "learning_rate": 4.0310249177955496e-05, + "loss": 2.7295, + "step": 39039 + }, + { + "epoch": 1.8176083059803991, + "grad_norm": 0.37228234873784616, + "learning_rate": 4.030759180773259e-05, + "loss": 2.7176, + "step": 39040 + }, + { + "epoch": 1.817654864166492, + "grad_norm": 0.3268339172234628, + "learning_rate": 4.030493446595595e-05, + "loss": 2.6159, + "step": 39041 + }, + { + "epoch": 1.8177014223525851, + "grad_norm": 0.3654138920225801, + "learning_rate": 4.030227715263335e-05, + "loss": 2.6389, + "step": 39042 + }, 
+ { + "epoch": 1.8177479805386783, + "grad_norm": 0.3438797370262102, + "learning_rate": 4.029961986777264e-05, + "loss": 2.7112, + "step": 39043 + }, + { + "epoch": 1.8177945387247711, + "grad_norm": 0.3411628032935952, + "learning_rate": 4.0296962611381564e-05, + "loss": 2.6796, + "step": 39044 + }, + { + "epoch": 1.8178410969108643, + "grad_norm": 0.3444204801755501, + "learning_rate": 4.029430538346796e-05, + "loss": 2.7056, + "step": 39045 + }, + { + "epoch": 1.8178876550969574, + "grad_norm": 0.3444020739429561, + "learning_rate": 4.029164818403962e-05, + "loss": 2.6659, + "step": 39046 + }, + { + "epoch": 1.8179342132830505, + "grad_norm": 0.33676425299037543, + "learning_rate": 4.028899101310431e-05, + "loss": 2.6468, + "step": 39047 + }, + { + "epoch": 1.8179807714691436, + "grad_norm": 0.3640848542792416, + "learning_rate": 4.028633387066989e-05, + "loss": 2.7078, + "step": 39048 + }, + { + "epoch": 1.8180273296552367, + "grad_norm": 0.3367577803562579, + "learning_rate": 4.0283676756744103e-05, + "loss": 2.5675, + "step": 39049 + }, + { + "epoch": 1.8180738878413298, + "grad_norm": 0.34702636300410555, + "learning_rate": 4.028101967133478e-05, + "loss": 2.812, + "step": 39050 + }, + { + "epoch": 1.8181204460274227, + "grad_norm": 0.32542370701505774, + "learning_rate": 4.027836261444971e-05, + "loss": 2.6537, + "step": 39051 + }, + { + "epoch": 1.8181670042135158, + "grad_norm": 0.31335125524166035, + "learning_rate": 4.027570558609666e-05, + "loss": 2.681, + "step": 39052 + }, + { + "epoch": 1.818213562399609, + "grad_norm": 0.354872610600334, + "learning_rate": 4.027304858628348e-05, + "loss": 2.7446, + "step": 39053 + }, + { + "epoch": 1.8182601205857019, + "grad_norm": 0.3263905821759143, + "learning_rate": 4.0270391615017947e-05, + "loss": 2.7674, + "step": 39054 + }, + { + "epoch": 1.818306678771795, + "grad_norm": 0.32255514357572246, + "learning_rate": 4.0267734672307825e-05, + "loss": 2.7553, + "step": 39055 + }, + { + "epoch": 1.818353236957888, + "grad_norm": 0.3521768857028497, + "learning_rate": 4.0265077758160975e-05, + "loss": 2.7192, + "step": 39056 + }, + { + "epoch": 1.8183997951439812, + "grad_norm": 0.3181422724503436, + "learning_rate": 4.026242087258514e-05, + "loss": 2.622, + "step": 39057 + }, + { + "epoch": 1.8184463533300743, + "grad_norm": 0.3296627964896135, + "learning_rate": 4.0259764015588156e-05, + "loss": 2.6908, + "step": 39058 + }, + { + "epoch": 1.8184929115161674, + "grad_norm": 0.3361020050648942, + "learning_rate": 4.025710718717781e-05, + "loss": 2.7942, + "step": 39059 + }, + { + "epoch": 1.8185394697022605, + "grad_norm": 0.3273960820652241, + "learning_rate": 4.025445038736187e-05, + "loss": 2.7168, + "step": 39060 + }, + { + "epoch": 1.8185860278883534, + "grad_norm": 0.3496086522076831, + "learning_rate": 4.025179361614818e-05, + "loss": 2.7808, + "step": 39061 + }, + { + "epoch": 1.8186325860744466, + "grad_norm": 0.34017916015159255, + "learning_rate": 4.024913687354451e-05, + "loss": 2.6888, + "step": 39062 + }, + { + "epoch": 1.8186791442605394, + "grad_norm": 0.3265666238604303, + "learning_rate": 4.0246480159558666e-05, + "loss": 2.7609, + "step": 39063 + }, + { + "epoch": 1.8187257024466326, + "grad_norm": 0.34559819679211723, + "learning_rate": 4.0243823474198445e-05, + "loss": 2.7464, + "step": 39064 + }, + { + "epoch": 1.8187722606327257, + "grad_norm": 0.34539964881522006, + "learning_rate": 4.024116681747162e-05, + "loss": 2.6821, + "step": 39065 + }, + { + "epoch": 1.8188188188188188, + "grad_norm": 0.33484482377489116, + 
"learning_rate": 4.023851018938603e-05, + "loss": 2.7083, + "step": 39066 + }, + { + "epoch": 1.818865377004912, + "grad_norm": 0.3590801461734623, + "learning_rate": 4.0235853589949425e-05, + "loss": 2.5598, + "step": 39067 + }, + { + "epoch": 1.818911935191005, + "grad_norm": 0.3237835143834004, + "learning_rate": 4.0233197019169655e-05, + "loss": 2.8042, + "step": 39068 + }, + { + "epoch": 1.8189584933770981, + "grad_norm": 0.3318405618247762, + "learning_rate": 4.023054047705449e-05, + "loss": 2.6202, + "step": 39069 + }, + { + "epoch": 1.8190050515631913, + "grad_norm": 0.3696884233665877, + "learning_rate": 4.02278839636117e-05, + "loss": 2.7788, + "step": 39070 + }, + { + "epoch": 1.8190516097492841, + "grad_norm": 0.3531976592825895, + "learning_rate": 4.0225227478849134e-05, + "loss": 2.7245, + "step": 39071 + }, + { + "epoch": 1.8190981679353773, + "grad_norm": 0.3466341337893477, + "learning_rate": 4.022257102277456e-05, + "loss": 2.6282, + "step": 39072 + }, + { + "epoch": 1.8191447261214702, + "grad_norm": 0.33672632356844223, + "learning_rate": 4.021991459539576e-05, + "loss": 2.8046, + "step": 39073 + }, + { + "epoch": 1.8191912843075633, + "grad_norm": 0.34625974334522286, + "learning_rate": 4.0217258196720567e-05, + "loss": 2.7593, + "step": 39074 + }, + { + "epoch": 1.8192378424936564, + "grad_norm": 0.3420259264308634, + "learning_rate": 4.0214601826756736e-05, + "loss": 2.6808, + "step": 39075 + }, + { + "epoch": 1.8192844006797495, + "grad_norm": 0.36247860273382915, + "learning_rate": 4.0211945485512106e-05, + "loss": 2.7108, + "step": 39076 + }, + { + "epoch": 1.8193309588658426, + "grad_norm": 0.3384983228488143, + "learning_rate": 4.020928917299445e-05, + "loss": 2.692, + "step": 39077 + }, + { + "epoch": 1.8193775170519357, + "grad_norm": 0.34809867143121775, + "learning_rate": 4.0206632889211546e-05, + "loss": 2.7886, + "step": 39078 + }, + { + "epoch": 1.8194240752380288, + "grad_norm": 0.37116110091169, + "learning_rate": 4.020397663417123e-05, + "loss": 2.7381, + "step": 39079 + }, + { + "epoch": 1.8194706334241217, + "grad_norm": 0.3561839486316698, + "learning_rate": 4.020132040788125e-05, + "loss": 2.7488, + "step": 39080 + }, + { + "epoch": 1.8195171916102149, + "grad_norm": 0.37128817440131207, + "learning_rate": 4.019866421034946e-05, + "loss": 2.6687, + "step": 39081 + }, + { + "epoch": 1.819563749796308, + "grad_norm": 0.34470472555609416, + "learning_rate": 4.019600804158361e-05, + "loss": 2.535, + "step": 39082 + }, + { + "epoch": 1.8196103079824009, + "grad_norm": 0.3617742688008521, + "learning_rate": 4.0193351901591505e-05, + "loss": 2.6161, + "step": 39083 + }, + { + "epoch": 1.819656866168494, + "grad_norm": 0.34950267188250944, + "learning_rate": 4.019069579038096e-05, + "loss": 2.7025, + "step": 39084 + }, + { + "epoch": 1.819703424354587, + "grad_norm": 0.31739673279165637, + "learning_rate": 4.018803970795975e-05, + "loss": 2.6209, + "step": 39085 + }, + { + "epoch": 1.8197499825406802, + "grad_norm": 0.3324353982940154, + "learning_rate": 4.018538365433567e-05, + "loss": 2.677, + "step": 39086 + }, + { + "epoch": 1.8197965407267733, + "grad_norm": 0.3586324785789919, + "learning_rate": 4.0182727629516544e-05, + "loss": 2.6116, + "step": 39087 + }, + { + "epoch": 1.8198430989128664, + "grad_norm": 0.32359192109939217, + "learning_rate": 4.018007163351012e-05, + "loss": 2.7335, + "step": 39088 + }, + { + "epoch": 1.8198896570989596, + "grad_norm": 0.3635744990149814, + "learning_rate": 4.017741566632424e-05, + "loss": 2.7446, + "step": 39089 
+ }, + { + "epoch": 1.8199362152850524, + "grad_norm": 0.35984566812301266, + "learning_rate": 4.017475972796668e-05, + "loss": 2.7806, + "step": 39090 + }, + { + "epoch": 1.8199827734711456, + "grad_norm": 0.33416217605973897, + "learning_rate": 4.01721038184452e-05, + "loss": 2.7048, + "step": 39091 + }, + { + "epoch": 1.8200293316572387, + "grad_norm": 0.3750698502265007, + "learning_rate": 4.0169447937767656e-05, + "loss": 2.787, + "step": 39092 + }, + { + "epoch": 1.8200758898433316, + "grad_norm": 0.3376119853930884, + "learning_rate": 4.016679208594179e-05, + "loss": 2.7204, + "step": 39093 + }, + { + "epoch": 1.8201224480294247, + "grad_norm": 0.3078783618294364, + "learning_rate": 4.0164136262975435e-05, + "loss": 2.6602, + "step": 39094 + }, + { + "epoch": 1.8201690062155178, + "grad_norm": 0.3514803778833112, + "learning_rate": 4.0161480468876386e-05, + "loss": 2.7252, + "step": 39095 + }, + { + "epoch": 1.820215564401611, + "grad_norm": 0.3211779979375702, + "learning_rate": 4.015882470365239e-05, + "loss": 2.7032, + "step": 39096 + }, + { + "epoch": 1.820262122587704, + "grad_norm": 0.33865022602374045, + "learning_rate": 4.01561689673113e-05, + "loss": 2.7266, + "step": 39097 + }, + { + "epoch": 1.8203086807737971, + "grad_norm": 0.3391345818804545, + "learning_rate": 4.0153513259860875e-05, + "loss": 2.6727, + "step": 39098 + }, + { + "epoch": 1.8203552389598903, + "grad_norm": 0.34101506941937254, + "learning_rate": 4.015085758130891e-05, + "loss": 2.7, + "step": 39099 + }, + { + "epoch": 1.8204017971459832, + "grad_norm": 0.3698409939839099, + "learning_rate": 4.014820193166323e-05, + "loss": 2.7199, + "step": 39100 + }, + { + "epoch": 1.8204483553320763, + "grad_norm": 0.32103834905116485, + "learning_rate": 4.014554631093159e-05, + "loss": 2.6828, + "step": 39101 + }, + { + "epoch": 1.8204949135181692, + "grad_norm": 0.35125796918334035, + "learning_rate": 4.014289071912181e-05, + "loss": 2.7404, + "step": 39102 + }, + { + "epoch": 1.8205414717042623, + "grad_norm": 0.3345244986527221, + "learning_rate": 4.014023515624166e-05, + "loss": 2.6844, + "step": 39103 + }, + { + "epoch": 1.8205880298903554, + "grad_norm": 0.32611060748380694, + "learning_rate": 4.013757962229896e-05, + "loss": 2.6081, + "step": 39104 + }, + { + "epoch": 1.8206345880764485, + "grad_norm": 0.3269481064623237, + "learning_rate": 4.01349241173015e-05, + "loss": 2.613, + "step": 39105 + }, + { + "epoch": 1.8206811462625416, + "grad_norm": 0.33506194009301354, + "learning_rate": 4.0132268641257034e-05, + "loss": 2.703, + "step": 39106 + }, + { + "epoch": 1.8207277044486347, + "grad_norm": 0.3466554329238275, + "learning_rate": 4.012961319417342e-05, + "loss": 2.6612, + "step": 39107 + }, + { + "epoch": 1.8207742626347279, + "grad_norm": 0.348015195778008, + "learning_rate": 4.0126957776058416e-05, + "loss": 2.7372, + "step": 39108 + }, + { + "epoch": 1.820820820820821, + "grad_norm": 0.33977399320976653, + "learning_rate": 4.0124302386919795e-05, + "loss": 2.6459, + "step": 39109 + }, + { + "epoch": 1.8208673790069139, + "grad_norm": 0.3307505175454709, + "learning_rate": 4.0121647026765395e-05, + "loss": 2.5975, + "step": 39110 + }, + { + "epoch": 1.820913937193007, + "grad_norm": 0.356393562677462, + "learning_rate": 4.011899169560298e-05, + "loss": 2.5951, + "step": 39111 + }, + { + "epoch": 1.8209604953790999, + "grad_norm": 0.3371605225505243, + "learning_rate": 4.0116336393440345e-05, + "loss": 2.7103, + "step": 39112 + }, + { + "epoch": 1.821007053565193, + "grad_norm": 0.3319677730974217, + 
"learning_rate": 4.011368112028531e-05, + "loss": 2.6628, + "step": 39113 + }, + { + "epoch": 1.821053611751286, + "grad_norm": 0.34201166702783314, + "learning_rate": 4.011102587614562e-05, + "loss": 2.7064, + "step": 39114 + }, + { + "epoch": 1.8211001699373792, + "grad_norm": 0.3271877988672466, + "learning_rate": 4.010837066102912e-05, + "loss": 2.6354, + "step": 39115 + }, + { + "epoch": 1.8211467281234723, + "grad_norm": 0.36066622758240136, + "learning_rate": 4.010571547494355e-05, + "loss": 2.7141, + "step": 39116 + }, + { + "epoch": 1.8211932863095655, + "grad_norm": 0.3471032921399173, + "learning_rate": 4.0103060317896754e-05, + "loss": 2.7285, + "step": 39117 + }, + { + "epoch": 1.8212398444956586, + "grad_norm": 0.3252788387963352, + "learning_rate": 4.01004051898965e-05, + "loss": 2.7532, + "step": 39118 + }, + { + "epoch": 1.8212864026817517, + "grad_norm": 0.37216214773505024, + "learning_rate": 4.009775009095056e-05, + "loss": 2.7357, + "step": 39119 + }, + { + "epoch": 1.8213329608678446, + "grad_norm": 0.32634652135758474, + "learning_rate": 4.009509502106677e-05, + "loss": 2.6821, + "step": 39120 + }, + { + "epoch": 1.8213795190539377, + "grad_norm": 0.3411864366271654, + "learning_rate": 4.009243998025288e-05, + "loss": 2.8594, + "step": 39121 + }, + { + "epoch": 1.8214260772400306, + "grad_norm": 0.34580678852488783, + "learning_rate": 4.008978496851672e-05, + "loss": 2.682, + "step": 39122 + }, + { + "epoch": 1.8214726354261237, + "grad_norm": 0.35027845494197946, + "learning_rate": 4.008712998586607e-05, + "loss": 2.7357, + "step": 39123 + }, + { + "epoch": 1.8215191936122168, + "grad_norm": 0.34722749493908334, + "learning_rate": 4.008447503230871e-05, + "loss": 2.5961, + "step": 39124 + }, + { + "epoch": 1.82156575179831, + "grad_norm": 0.3421073179726571, + "learning_rate": 4.008182010785242e-05, + "loss": 2.5793, + "step": 39125 + }, + { + "epoch": 1.821612309984403, + "grad_norm": 0.34738590891172916, + "learning_rate": 4.007916521250504e-05, + "loss": 2.596, + "step": 39126 + }, + { + "epoch": 1.8216588681704962, + "grad_norm": 0.3595423414727762, + "learning_rate": 4.0076510346274315e-05, + "loss": 2.7062, + "step": 39127 + }, + { + "epoch": 1.8217054263565893, + "grad_norm": 0.37091321726400034, + "learning_rate": 4.0073855509168065e-05, + "loss": 2.6843, + "step": 39128 + }, + { + "epoch": 1.8217519845426822, + "grad_norm": 0.3230081765139664, + "learning_rate": 4.007120070119406e-05, + "loss": 2.6566, + "step": 39129 + }, + { + "epoch": 1.8217985427287753, + "grad_norm": 0.34451322514326943, + "learning_rate": 4.006854592236011e-05, + "loss": 2.6861, + "step": 39130 + }, + { + "epoch": 1.8218451009148684, + "grad_norm": 0.35780967088214555, + "learning_rate": 4.006589117267401e-05, + "loss": 2.7144, + "step": 39131 + }, + { + "epoch": 1.8218916591009613, + "grad_norm": 0.33531108355261824, + "learning_rate": 4.006323645214351e-05, + "loss": 2.652, + "step": 39132 + }, + { + "epoch": 1.8219382172870544, + "grad_norm": 0.33348649732604263, + "learning_rate": 4.0060581760776454e-05, + "loss": 2.6862, + "step": 39133 + }, + { + "epoch": 1.8219847754731475, + "grad_norm": 0.3354001383054106, + "learning_rate": 4.00579270985806e-05, + "loss": 2.6434, + "step": 39134 + }, + { + "epoch": 1.8220313336592406, + "grad_norm": 0.3497720635518541, + "learning_rate": 4.0055272465563764e-05, + "loss": 2.8087, + "step": 39135 + }, + { + "epoch": 1.8220778918453338, + "grad_norm": 0.3424000843482664, + "learning_rate": 4.005261786173372e-05, + "loss": 2.7044, + "step": 
39136 + }, + { + "epoch": 1.8221244500314269, + "grad_norm": 0.3300141855553816, + "learning_rate": 4.0049963287098256e-05, + "loss": 2.7258, + "step": 39137 + }, + { + "epoch": 1.82217100821752, + "grad_norm": 0.337336211802267, + "learning_rate": 4.004730874166517e-05, + "loss": 2.6958, + "step": 39138 + }, + { + "epoch": 1.8222175664036129, + "grad_norm": 0.33877939940882235, + "learning_rate": 4.004465422544226e-05, + "loss": 2.7429, + "step": 39139 + }, + { + "epoch": 1.822264124589706, + "grad_norm": 0.3231579750451102, + "learning_rate": 4.004199973843731e-05, + "loss": 2.547, + "step": 39140 + }, + { + "epoch": 1.822310682775799, + "grad_norm": 0.3249828549265773, + "learning_rate": 4.003934528065812e-05, + "loss": 2.8015, + "step": 39141 + }, + { + "epoch": 1.822357240961892, + "grad_norm": 0.3519311392927675, + "learning_rate": 4.0036690852112436e-05, + "loss": 2.5961, + "step": 39142 + }, + { + "epoch": 1.8224037991479851, + "grad_norm": 0.35079243424753415, + "learning_rate": 4.003403645280811e-05, + "loss": 2.7019, + "step": 39143 + }, + { + "epoch": 1.8224503573340782, + "grad_norm": 0.3128926015889935, + "learning_rate": 4.00313820827529e-05, + "loss": 2.7493, + "step": 39144 + }, + { + "epoch": 1.8224969155201713, + "grad_norm": 0.3369378868550136, + "learning_rate": 4.0028727741954586e-05, + "loss": 2.5939, + "step": 39145 + }, + { + "epoch": 1.8225434737062645, + "grad_norm": 0.35045245632951316, + "learning_rate": 4.0026073430421e-05, + "loss": 2.7592, + "step": 39146 + }, + { + "epoch": 1.8225900318923576, + "grad_norm": 0.337275906499449, + "learning_rate": 4.002341914815987e-05, + "loss": 2.6553, + "step": 39147 + }, + { + "epoch": 1.8226365900784507, + "grad_norm": 0.3453987282494586, + "learning_rate": 4.0020764895179055e-05, + "loss": 2.6796, + "step": 39148 + }, + { + "epoch": 1.8226831482645436, + "grad_norm": 0.3600473058543932, + "learning_rate": 4.001811067148631e-05, + "loss": 2.7481, + "step": 39149 + }, + { + "epoch": 1.8227297064506367, + "grad_norm": 0.37477417165804083, + "learning_rate": 4.0015456477089406e-05, + "loss": 2.6464, + "step": 39150 + }, + { + "epoch": 1.8227762646367296, + "grad_norm": 0.34765003790806737, + "learning_rate": 4.0012802311996166e-05, + "loss": 2.757, + "step": 39151 + }, + { + "epoch": 1.8228228228228227, + "grad_norm": 0.3555401625560665, + "learning_rate": 4.0010148176214366e-05, + "loss": 2.6561, + "step": 39152 + }, + { + "epoch": 1.8228693810089158, + "grad_norm": 0.36769231664392177, + "learning_rate": 4.0007494069751804e-05, + "loss": 2.7037, + "step": 39153 + }, + { + "epoch": 1.822915939195009, + "grad_norm": 0.33363153351055164, + "learning_rate": 4.000483999261626e-05, + "loss": 2.7204, + "step": 39154 + }, + { + "epoch": 1.822962497381102, + "grad_norm": 0.35218633484084777, + "learning_rate": 4.0002185944815515e-05, + "loss": 2.6646, + "step": 39155 + }, + { + "epoch": 1.8230090555671952, + "grad_norm": 0.37507307534180623, + "learning_rate": 3.999953192635738e-05, + "loss": 2.7268, + "step": 39156 + }, + { + "epoch": 1.8230556137532883, + "grad_norm": 0.3437665791534529, + "learning_rate": 3.9996877937249614e-05, + "loss": 2.7914, + "step": 39157 + }, + { + "epoch": 1.8231021719393814, + "grad_norm": 0.3795582239066493, + "learning_rate": 3.999422397750005e-05, + "loss": 2.6879, + "step": 39158 + }, + { + "epoch": 1.8231487301254743, + "grad_norm": 0.3518575003222161, + "learning_rate": 3.9991570047116455e-05, + "loss": 2.7199, + "step": 39159 + }, + { + "epoch": 1.8231952883115674, + "grad_norm": 
0.3719559380739906, + "learning_rate": 3.998891614610659e-05, + "loss": 2.7286, + "step": 39160 + }, + { + "epoch": 1.8232418464976603, + "grad_norm": 0.3562090194131059, + "learning_rate": 3.998626227447829e-05, + "loss": 2.6487, + "step": 39161 + }, + { + "epoch": 1.8232884046837534, + "grad_norm": 0.3855542733383581, + "learning_rate": 3.998360843223933e-05, + "loss": 2.7113, + "step": 39162 + }, + { + "epoch": 1.8233349628698465, + "grad_norm": 0.36256584858150326, + "learning_rate": 3.998095461939747e-05, + "loss": 2.7784, + "step": 39163 + }, + { + "epoch": 1.8233815210559396, + "grad_norm": 0.3623964114521585, + "learning_rate": 3.9978300835960536e-05, + "loss": 2.6831, + "step": 39164 + }, + { + "epoch": 1.8234280792420328, + "grad_norm": 0.32619405397978246, + "learning_rate": 3.9975647081936295e-05, + "loss": 2.6888, + "step": 39165 + }, + { + "epoch": 1.8234746374281259, + "grad_norm": 0.35775707154924763, + "learning_rate": 3.997299335733256e-05, + "loss": 2.8134, + "step": 39166 + }, + { + "epoch": 1.823521195614219, + "grad_norm": 0.3671969145097161, + "learning_rate": 3.997033966215709e-05, + "loss": 2.6409, + "step": 39167 + }, + { + "epoch": 1.8235677538003119, + "grad_norm": 0.33967957264331766, + "learning_rate": 3.996768599641767e-05, + "loss": 2.7307, + "step": 39168 + }, + { + "epoch": 1.823614311986405, + "grad_norm": 0.34001223554814547, + "learning_rate": 3.9965032360122124e-05, + "loss": 2.7518, + "step": 39169 + }, + { + "epoch": 1.8236608701724981, + "grad_norm": 0.346883695840879, + "learning_rate": 3.996237875327819e-05, + "loss": 2.7355, + "step": 39170 + }, + { + "epoch": 1.823707428358591, + "grad_norm": 0.3773454062242528, + "learning_rate": 3.995972517589372e-05, + "loss": 2.7049, + "step": 39171 + }, + { + "epoch": 1.8237539865446841, + "grad_norm": 0.35177809654893966, + "learning_rate": 3.995707162797646e-05, + "loss": 2.6396, + "step": 39172 + }, + { + "epoch": 1.8238005447307772, + "grad_norm": 0.34615394404017136, + "learning_rate": 3.995441810953419e-05, + "loss": 2.6115, + "step": 39173 + }, + { + "epoch": 1.8238471029168704, + "grad_norm": 0.3649546426058515, + "learning_rate": 3.9951764620574725e-05, + "loss": 2.7183, + "step": 39174 + }, + { + "epoch": 1.8238936611029635, + "grad_norm": 0.33418226917352234, + "learning_rate": 3.994911116110584e-05, + "loss": 2.6389, + "step": 39175 + }, + { + "epoch": 1.8239402192890566, + "grad_norm": 0.31297413654167155, + "learning_rate": 3.9946457731135316e-05, + "loss": 2.6649, + "step": 39176 + }, + { + "epoch": 1.8239867774751497, + "grad_norm": 0.3549544451675294, + "learning_rate": 3.994380433067096e-05, + "loss": 2.6917, + "step": 39177 + }, + { + "epoch": 1.8240333356612426, + "grad_norm": 0.36819936108972146, + "learning_rate": 3.994115095972054e-05, + "loss": 2.7007, + "step": 39178 + }, + { + "epoch": 1.8240798938473357, + "grad_norm": 0.34576051861083745, + "learning_rate": 3.993849761829186e-05, + "loss": 2.7486, + "step": 39179 + }, + { + "epoch": 1.8241264520334288, + "grad_norm": 0.3365047143788199, + "learning_rate": 3.99358443063927e-05, + "loss": 2.6871, + "step": 39180 + }, + { + "epoch": 1.8241730102195217, + "grad_norm": 0.3649871220514212, + "learning_rate": 3.993319102403083e-05, + "loss": 2.7252, + "step": 39181 + }, + { + "epoch": 1.8242195684056148, + "grad_norm": 0.3314003783656383, + "learning_rate": 3.993053777121406e-05, + "loss": 2.6725, + "step": 39182 + }, + { + "epoch": 1.824266126591708, + "grad_norm": 0.3808741103033066, + "learning_rate": 3.992788454795016e-05, + "loss": 
2.7642, + "step": 39183 + }, + { + "epoch": 1.824312684777801, + "grad_norm": 0.345965570395555, + "learning_rate": 3.992523135424694e-05, + "loss": 2.7039, + "step": 39184 + }, + { + "epoch": 1.8243592429638942, + "grad_norm": 0.3643855049192177, + "learning_rate": 3.992257819011218e-05, + "loss": 2.7568, + "step": 39185 + }, + { + "epoch": 1.8244058011499873, + "grad_norm": 0.3550994527722007, + "learning_rate": 3.991992505555363e-05, + "loss": 2.7639, + "step": 39186 + }, + { + "epoch": 1.8244523593360804, + "grad_norm": 0.3358793362968387, + "learning_rate": 3.991727195057913e-05, + "loss": 2.7715, + "step": 39187 + }, + { + "epoch": 1.8244989175221733, + "grad_norm": 0.40445774118679206, + "learning_rate": 3.991461887519644e-05, + "loss": 2.7146, + "step": 39188 + }, + { + "epoch": 1.8245454757082664, + "grad_norm": 0.3517040243228812, + "learning_rate": 3.9911965829413345e-05, + "loss": 2.7, + "step": 39189 + }, + { + "epoch": 1.8245920338943593, + "grad_norm": 0.36637798450433773, + "learning_rate": 3.990931281323765e-05, + "loss": 2.7522, + "step": 39190 + }, + { + "epoch": 1.8246385920804524, + "grad_norm": 0.36037447635969777, + "learning_rate": 3.9906659826677116e-05, + "loss": 2.6375, + "step": 39191 + }, + { + "epoch": 1.8246851502665455, + "grad_norm": 0.34962780490568934, + "learning_rate": 3.9904006869739555e-05, + "loss": 2.7184, + "step": 39192 + }, + { + "epoch": 1.8247317084526387, + "grad_norm": 0.339227809489307, + "learning_rate": 3.9901353942432705e-05, + "loss": 2.653, + "step": 39193 + }, + { + "epoch": 1.8247782666387318, + "grad_norm": 0.3280694669018669, + "learning_rate": 3.989870104476442e-05, + "loss": 2.7097, + "step": 39194 + }, + { + "epoch": 1.8248248248248249, + "grad_norm": 0.3248305878720445, + "learning_rate": 3.9896048176742454e-05, + "loss": 2.6574, + "step": 39195 + }, + { + "epoch": 1.824871383010918, + "grad_norm": 0.35856885677321054, + "learning_rate": 3.9893395338374564e-05, + "loss": 2.7655, + "step": 39196 + }, + { + "epoch": 1.8249179411970111, + "grad_norm": 0.3392588472616286, + "learning_rate": 3.989074252966858e-05, + "loss": 2.5822, + "step": 39197 + }, + { + "epoch": 1.824964499383104, + "grad_norm": 0.34133095973929445, + "learning_rate": 3.9888089750632255e-05, + "loss": 2.7257, + "step": 39198 + }, + { + "epoch": 1.8250110575691971, + "grad_norm": 0.3467027377367229, + "learning_rate": 3.988543700127341e-05, + "loss": 2.6741, + "step": 39199 + }, + { + "epoch": 1.82505761575529, + "grad_norm": 0.33126130888892247, + "learning_rate": 3.988278428159982e-05, + "loss": 2.6749, + "step": 39200 + }, + { + "epoch": 1.8251041739413831, + "grad_norm": 0.3488762202913251, + "learning_rate": 3.988013159161924e-05, + "loss": 2.6403, + "step": 39201 + }, + { + "epoch": 1.8251507321274762, + "grad_norm": 0.3265329491567653, + "learning_rate": 3.987747893133948e-05, + "loss": 2.7171, + "step": 39202 + }, + { + "epoch": 1.8251972903135694, + "grad_norm": 0.3565675421879056, + "learning_rate": 3.987482630076833e-05, + "loss": 2.5774, + "step": 39203 + }, + { + "epoch": 1.8252438484996625, + "grad_norm": 0.34461998418945616, + "learning_rate": 3.9872173699913564e-05, + "loss": 2.74, + "step": 39204 + }, + { + "epoch": 1.8252904066857556, + "grad_norm": 0.34941541623772965, + "learning_rate": 3.986952112878297e-05, + "loss": 2.7126, + "step": 39205 + }, + { + "epoch": 1.8253369648718487, + "grad_norm": 0.32568444699440335, + "learning_rate": 3.986686858738432e-05, + "loss": 2.6471, + "step": 39206 + }, + { + "epoch": 1.8253835230579418, + 
"grad_norm": 0.36509631108165236, + "learning_rate": 3.986421607572544e-05, + "loss": 2.701, + "step": 39207 + }, + { + "epoch": 1.8254300812440347, + "grad_norm": 0.34970154946941756, + "learning_rate": 3.986156359381408e-05, + "loss": 2.642, + "step": 39208 + }, + { + "epoch": 1.8254766394301278, + "grad_norm": 0.34825128896686225, + "learning_rate": 3.985891114165801e-05, + "loss": 2.7358, + "step": 39209 + }, + { + "epoch": 1.8255231976162207, + "grad_norm": 0.36037240466538306, + "learning_rate": 3.985625871926505e-05, + "loss": 2.5847, + "step": 39210 + }, + { + "epoch": 1.8255697558023138, + "grad_norm": 0.3513109446474308, + "learning_rate": 3.985360632664296e-05, + "loss": 2.7058, + "step": 39211 + }, + { + "epoch": 1.825616313988407, + "grad_norm": 0.35388530340978236, + "learning_rate": 3.9850953963799556e-05, + "loss": 2.5753, + "step": 39212 + }, + { + "epoch": 1.8256628721745, + "grad_norm": 0.3320404089562906, + "learning_rate": 3.98483016307426e-05, + "loss": 2.6519, + "step": 39213 + }, + { + "epoch": 1.8257094303605932, + "grad_norm": 0.3856144301533713, + "learning_rate": 3.9845649327479876e-05, + "loss": 2.7061, + "step": 39214 + }, + { + "epoch": 1.8257559885466863, + "grad_norm": 0.3257697455784377, + "learning_rate": 3.9842997054019157e-05, + "loss": 2.6921, + "step": 39215 + }, + { + "epoch": 1.8258025467327794, + "grad_norm": 0.3575569606765225, + "learning_rate": 3.984034481036825e-05, + "loss": 2.715, + "step": 39216 + }, + { + "epoch": 1.8258491049188723, + "grad_norm": 0.33566927606322805, + "learning_rate": 3.9837692596534945e-05, + "loss": 2.6613, + "step": 39217 + }, + { + "epoch": 1.8258956631049654, + "grad_norm": 0.3374472608002715, + "learning_rate": 3.9835040412527004e-05, + "loss": 2.7273, + "step": 39218 + }, + { + "epoch": 1.8259422212910585, + "grad_norm": 0.35639420511162123, + "learning_rate": 3.98323882583522e-05, + "loss": 2.5843, + "step": 39219 + }, + { + "epoch": 1.8259887794771514, + "grad_norm": 0.33410433284222335, + "learning_rate": 3.9829736134018354e-05, + "loss": 2.5158, + "step": 39220 + }, + { + "epoch": 1.8260353376632446, + "grad_norm": 0.34788270106868785, + "learning_rate": 3.982708403953323e-05, + "loss": 2.6684, + "step": 39221 + }, + { + "epoch": 1.8260818958493377, + "grad_norm": 0.3400917059833533, + "learning_rate": 3.9824431974904594e-05, + "loss": 2.693, + "step": 39222 + }, + { + "epoch": 1.8261284540354308, + "grad_norm": 0.3451006458804188, + "learning_rate": 3.982177994014026e-05, + "loss": 2.8327, + "step": 39223 + }, + { + "epoch": 1.826175012221524, + "grad_norm": 0.33753382153438777, + "learning_rate": 3.9819127935247987e-05, + "loss": 2.714, + "step": 39224 + }, + { + "epoch": 1.826221570407617, + "grad_norm": 0.3280743669590602, + "learning_rate": 3.9816475960235586e-05, + "loss": 2.6951, + "step": 39225 + }, + { + "epoch": 1.8262681285937101, + "grad_norm": 0.33491695409422684, + "learning_rate": 3.981382401511083e-05, + "loss": 2.6717, + "step": 39226 + }, + { + "epoch": 1.826314686779803, + "grad_norm": 0.3209706064939433, + "learning_rate": 3.981117209988149e-05, + "loss": 2.6812, + "step": 39227 + }, + { + "epoch": 1.8263612449658961, + "grad_norm": 0.3556285151625934, + "learning_rate": 3.980852021455535e-05, + "loss": 2.7137, + "step": 39228 + }, + { + "epoch": 1.8264078031519893, + "grad_norm": 0.3631837713205242, + "learning_rate": 3.98058683591402e-05, + "loss": 2.7182, + "step": 39229 + }, + { + "epoch": 1.8264543613380821, + "grad_norm": 0.3500985516788221, + "learning_rate": 3.980321653364383e-05, + 
"loss": 2.7798, + "step": 39230 + }, + { + "epoch": 1.8265009195241753, + "grad_norm": 0.38387932458888446, + "learning_rate": 3.980056473807402e-05, + "loss": 2.682, + "step": 39231 + }, + { + "epoch": 1.8265474777102684, + "grad_norm": 0.36314013955823443, + "learning_rate": 3.979791297243852e-05, + "loss": 2.6692, + "step": 39232 + }, + { + "epoch": 1.8265940358963615, + "grad_norm": 0.3447279709458056, + "learning_rate": 3.979526123674516e-05, + "loss": 2.7018, + "step": 39233 + }, + { + "epoch": 1.8266405940824546, + "grad_norm": 0.37376301101678133, + "learning_rate": 3.979260953100169e-05, + "loss": 2.6567, + "step": 39234 + }, + { + "epoch": 1.8266871522685477, + "grad_norm": 0.359504197160229, + "learning_rate": 3.978995785521593e-05, + "loss": 2.715, + "step": 39235 + }, + { + "epoch": 1.8267337104546408, + "grad_norm": 0.39014928731254994, + "learning_rate": 3.9787306209395626e-05, + "loss": 2.8185, + "step": 39236 + }, + { + "epoch": 1.8267802686407337, + "grad_norm": 0.36811563186753177, + "learning_rate": 3.978465459354856e-05, + "loss": 2.6945, + "step": 39237 + }, + { + "epoch": 1.8268268268268268, + "grad_norm": 0.36009738876186664, + "learning_rate": 3.978200300768254e-05, + "loss": 2.6895, + "step": 39238 + }, + { + "epoch": 1.8268733850129197, + "grad_norm": 0.3663414551746072, + "learning_rate": 3.977935145180534e-05, + "loss": 2.8705, + "step": 39239 + }, + { + "epoch": 1.8269199431990129, + "grad_norm": 0.3428335736051654, + "learning_rate": 3.977669992592473e-05, + "loss": 2.6587, + "step": 39240 + }, + { + "epoch": 1.826966501385106, + "grad_norm": 0.36364275311271954, + "learning_rate": 3.977404843004849e-05, + "loss": 2.7895, + "step": 39241 + }, + { + "epoch": 1.827013059571199, + "grad_norm": 0.3586980479915782, + "learning_rate": 3.977139696418441e-05, + "loss": 2.6403, + "step": 39242 + }, + { + "epoch": 1.8270596177572922, + "grad_norm": 0.36114656456743405, + "learning_rate": 3.976874552834029e-05, + "loss": 2.687, + "step": 39243 + }, + { + "epoch": 1.8271061759433853, + "grad_norm": 0.33119131079146713, + "learning_rate": 3.976609412252389e-05, + "loss": 2.6844, + "step": 39244 + }, + { + "epoch": 1.8271527341294784, + "grad_norm": 0.3369555722441225, + "learning_rate": 3.9763442746742973e-05, + "loss": 2.6654, + "step": 39245 + }, + { + "epoch": 1.8271992923155715, + "grad_norm": 0.32646020419669103, + "learning_rate": 3.976079140100537e-05, + "loss": 2.6832, + "step": 39246 + }, + { + "epoch": 1.8272458505016644, + "grad_norm": 0.3587629737673379, + "learning_rate": 3.975814008531881e-05, + "loss": 2.7606, + "step": 39247 + }, + { + "epoch": 1.8272924086877576, + "grad_norm": 0.3244128703155401, + "learning_rate": 3.975548879969112e-05, + "loss": 2.6611, + "step": 39248 + }, + { + "epoch": 1.8273389668738504, + "grad_norm": 0.3538208610594636, + "learning_rate": 3.975283754413006e-05, + "loss": 2.6759, + "step": 39249 + }, + { + "epoch": 1.8273855250599436, + "grad_norm": 0.3532291287595003, + "learning_rate": 3.975018631864339e-05, + "loss": 2.7558, + "step": 39250 + }, + { + "epoch": 1.8274320832460367, + "grad_norm": 0.32637857805394305, + "learning_rate": 3.9747535123238944e-05, + "loss": 2.7242, + "step": 39251 + }, + { + "epoch": 1.8274786414321298, + "grad_norm": 0.32998389379471493, + "learning_rate": 3.9744883957924454e-05, + "loss": 2.7294, + "step": 39252 + }, + { + "epoch": 1.827525199618223, + "grad_norm": 0.34316161825832225, + "learning_rate": 3.9742232822707734e-05, + "loss": 2.7741, + "step": 39253 + }, + { + "epoch": 1.827571757804316, 
+ "grad_norm": 0.34120233938926553, + "learning_rate": 3.973958171759654e-05, + "loss": 2.6679, + "step": 39254 + }, + { + "epoch": 1.8276183159904091, + "grad_norm": 0.32649203882715394, + "learning_rate": 3.9736930642598656e-05, + "loss": 2.7524, + "step": 39255 + }, + { + "epoch": 1.827664874176502, + "grad_norm": 0.3480519076089458, + "learning_rate": 3.9734279597721883e-05, + "loss": 2.7396, + "step": 39256 + }, + { + "epoch": 1.8277114323625951, + "grad_norm": 0.3183361809549942, + "learning_rate": 3.973162858297399e-05, + "loss": 2.5981, + "step": 39257 + }, + { + "epoch": 1.8277579905486883, + "grad_norm": 0.34108271503669085, + "learning_rate": 3.972897759836274e-05, + "loss": 2.7242, + "step": 39258 + }, + { + "epoch": 1.8278045487347812, + "grad_norm": 0.34564496957420876, + "learning_rate": 3.972632664389594e-05, + "loss": 2.5305, + "step": 39259 + }, + { + "epoch": 1.8278511069208743, + "grad_norm": 0.33660537589495476, + "learning_rate": 3.972367571958134e-05, + "loss": 2.7455, + "step": 39260 + }, + { + "epoch": 1.8278976651069674, + "grad_norm": 0.358850196534513, + "learning_rate": 3.9721024825426763e-05, + "loss": 2.7875, + "step": 39261 + }, + { + "epoch": 1.8279442232930605, + "grad_norm": 0.3496854223538681, + "learning_rate": 3.971837396143996e-05, + "loss": 2.7222, + "step": 39262 + }, + { + "epoch": 1.8279907814791536, + "grad_norm": 0.36368993436702246, + "learning_rate": 3.97157231276287e-05, + "loss": 2.619, + "step": 39263 + }, + { + "epoch": 1.8280373396652467, + "grad_norm": 0.29359360373921145, + "learning_rate": 3.97130723240008e-05, + "loss": 2.7007, + "step": 39264 + }, + { + "epoch": 1.8280838978513398, + "grad_norm": 0.35044599239101903, + "learning_rate": 3.971042155056401e-05, + "loss": 2.7457, + "step": 39265 + }, + { + "epoch": 1.8281304560374327, + "grad_norm": 0.34092408280787095, + "learning_rate": 3.9707770807326134e-05, + "loss": 2.6139, + "step": 39266 + }, + { + "epoch": 1.8281770142235259, + "grad_norm": 0.3060706062625054, + "learning_rate": 3.970512009429492e-05, + "loss": 2.6738, + "step": 39267 + }, + { + "epoch": 1.828223572409619, + "grad_norm": 0.3275339615433882, + "learning_rate": 3.970246941147817e-05, + "loss": 2.749, + "step": 39268 + }, + { + "epoch": 1.8282701305957119, + "grad_norm": 0.3715321693618493, + "learning_rate": 3.969981875888366e-05, + "loss": 2.6795, + "step": 39269 + }, + { + "epoch": 1.828316688781805, + "grad_norm": 0.33116745713110834, + "learning_rate": 3.969716813651915e-05, + "loss": 2.6516, + "step": 39270 + }, + { + "epoch": 1.828363246967898, + "grad_norm": 0.3210166851999411, + "learning_rate": 3.969451754439246e-05, + "loss": 2.6808, + "step": 39271 + }, + { + "epoch": 1.8284098051539912, + "grad_norm": 0.36882320894493803, + "learning_rate": 3.969186698251134e-05, + "loss": 2.7372, + "step": 39272 + }, + { + "epoch": 1.8284563633400843, + "grad_norm": 0.33206013462433026, + "learning_rate": 3.968921645088356e-05, + "loss": 2.6636, + "step": 39273 + }, + { + "epoch": 1.8285029215261774, + "grad_norm": 0.31550888597925786, + "learning_rate": 3.968656594951693e-05, + "loss": 2.5894, + "step": 39274 + }, + { + "epoch": 1.8285494797122706, + "grad_norm": 0.36541878969412783, + "learning_rate": 3.9683915478419214e-05, + "loss": 2.7276, + "step": 39275 + }, + { + "epoch": 1.8285960378983634, + "grad_norm": 0.3436061117119086, + "learning_rate": 3.968126503759817e-05, + "loss": 2.6832, + "step": 39276 + }, + { + "epoch": 1.8286425960844566, + "grad_norm": 0.3602743402417452, + "learning_rate": 
3.9678614627061616e-05, + "loss": 2.7094, + "step": 39277 + }, + { + "epoch": 1.8286891542705495, + "grad_norm": 0.3421896586342097, + "learning_rate": 3.96759642468173e-05, + "loss": 2.7426, + "step": 39278 + }, + { + "epoch": 1.8287357124566426, + "grad_norm": 0.359056865395715, + "learning_rate": 3.967331389687303e-05, + "loss": 2.7236, + "step": 39279 + }, + { + "epoch": 1.8287822706427357, + "grad_norm": 0.34867065271266834, + "learning_rate": 3.9670663577236546e-05, + "loss": 2.7339, + "step": 39280 + }, + { + "epoch": 1.8288288288288288, + "grad_norm": 0.3801423879471233, + "learning_rate": 3.9668013287915656e-05, + "loss": 2.744, + "step": 39281 + }, + { + "epoch": 1.828875387014922, + "grad_norm": 0.34297474907011716, + "learning_rate": 3.966536302891813e-05, + "loss": 2.6773, + "step": 39282 + }, + { + "epoch": 1.828921945201015, + "grad_norm": 0.3651632529610204, + "learning_rate": 3.966271280025173e-05, + "loss": 2.6269, + "step": 39283 + }, + { + "epoch": 1.8289685033871081, + "grad_norm": 0.3643667211269489, + "learning_rate": 3.966006260192428e-05, + "loss": 2.6941, + "step": 39284 + }, + { + "epoch": 1.8290150615732013, + "grad_norm": 0.32579343844867253, + "learning_rate": 3.965741243394351e-05, + "loss": 2.6561, + "step": 39285 + }, + { + "epoch": 1.8290616197592942, + "grad_norm": 0.4071922051062081, + "learning_rate": 3.96547622963172e-05, + "loss": 2.6917, + "step": 39286 + }, + { + "epoch": 1.8291081779453873, + "grad_norm": 0.3544159336849096, + "learning_rate": 3.965211218905317e-05, + "loss": 2.7249, + "step": 39287 + }, + { + "epoch": 1.8291547361314802, + "grad_norm": 0.3722626699448856, + "learning_rate": 3.964946211215914e-05, + "loss": 2.5347, + "step": 39288 + }, + { + "epoch": 1.8292012943175733, + "grad_norm": 0.32242330643517136, + "learning_rate": 3.9646812065642954e-05, + "loss": 2.7481, + "step": 39289 + }, + { + "epoch": 1.8292478525036664, + "grad_norm": 0.3482573427929309, + "learning_rate": 3.964416204951235e-05, + "loss": 2.6584, + "step": 39290 + }, + { + "epoch": 1.8292944106897595, + "grad_norm": 0.33837102922806245, + "learning_rate": 3.9641512063775085e-05, + "loss": 2.6885, + "step": 39291 + }, + { + "epoch": 1.8293409688758526, + "grad_norm": 0.3214833187049498, + "learning_rate": 3.963886210843899e-05, + "loss": 2.7289, + "step": 39292 + }, + { + "epoch": 1.8293875270619457, + "grad_norm": 0.35516883400313765, + "learning_rate": 3.96362121835118e-05, + "loss": 2.7848, + "step": 39293 + }, + { + "epoch": 1.8294340852480389, + "grad_norm": 0.3432865206372072, + "learning_rate": 3.96335622890013e-05, + "loss": 2.6271, + "step": 39294 + }, + { + "epoch": 1.829480643434132, + "grad_norm": 0.36004337422805566, + "learning_rate": 3.963091242491529e-05, + "loss": 2.7854, + "step": 39295 + }, + { + "epoch": 1.8295272016202249, + "grad_norm": 0.34196360014620675, + "learning_rate": 3.962826259126151e-05, + "loss": 2.7633, + "step": 39296 + }, + { + "epoch": 1.829573759806318, + "grad_norm": 0.3695733660564055, + "learning_rate": 3.962561278804777e-05, + "loss": 2.6644, + "step": 39297 + }, + { + "epoch": 1.8296203179924109, + "grad_norm": 0.34825360909635583, + "learning_rate": 3.9622963015281845e-05, + "loss": 2.718, + "step": 39298 + }, + { + "epoch": 1.829666876178504, + "grad_norm": 0.34267159144534065, + "learning_rate": 3.962031327297147e-05, + "loss": 2.683, + "step": 39299 + }, + { + "epoch": 1.829713434364597, + "grad_norm": 0.3506003771119283, + "learning_rate": 3.961766356112448e-05, + "loss": 2.6547, + "step": 39300 + }, + { + "epoch": 
1.8297599925506902, + "grad_norm": 0.35185458463716185, + "learning_rate": 3.96150138797486e-05, + "loss": 2.7855, + "step": 39301 + }, + { + "epoch": 1.8298065507367833, + "grad_norm": 0.3338498490320436, + "learning_rate": 3.961236422885166e-05, + "loss": 2.6373, + "step": 39302 + }, + { + "epoch": 1.8298531089228764, + "grad_norm": 0.3833952497779986, + "learning_rate": 3.96097146084414e-05, + "loss": 2.7087, + "step": 39303 + }, + { + "epoch": 1.8298996671089696, + "grad_norm": 0.3404155659616806, + "learning_rate": 3.96070650185256e-05, + "loss": 2.6877, + "step": 39304 + }, + { + "epoch": 1.8299462252950625, + "grad_norm": 0.35870165145581606, + "learning_rate": 3.960441545911204e-05, + "loss": 2.7567, + "step": 39305 + }, + { + "epoch": 1.8299927834811556, + "grad_norm": 0.3252893647853847, + "learning_rate": 3.960176593020849e-05, + "loss": 2.5826, + "step": 39306 + }, + { + "epoch": 1.8300393416672487, + "grad_norm": 0.3650878446858087, + "learning_rate": 3.9599116431822745e-05, + "loss": 2.5735, + "step": 39307 + }, + { + "epoch": 1.8300858998533416, + "grad_norm": 0.3562083148285584, + "learning_rate": 3.959646696396257e-05, + "loss": 2.7262, + "step": 39308 + }, + { + "epoch": 1.8301324580394347, + "grad_norm": 0.3356434985641615, + "learning_rate": 3.9593817526635715e-05, + "loss": 2.6596, + "step": 39309 + }, + { + "epoch": 1.8301790162255278, + "grad_norm": 0.38644930195980826, + "learning_rate": 3.9591168119850006e-05, + "loss": 2.7096, + "step": 39310 + }, + { + "epoch": 1.830225574411621, + "grad_norm": 0.3765376125791935, + "learning_rate": 3.958851874361319e-05, + "loss": 2.6518, + "step": 39311 + }, + { + "epoch": 1.830272132597714, + "grad_norm": 0.3575867711923747, + "learning_rate": 3.958586939793303e-05, + "loss": 2.6538, + "step": 39312 + }, + { + "epoch": 1.8303186907838072, + "grad_norm": 0.37147899573224313, + "learning_rate": 3.958322008281734e-05, + "loss": 2.6773, + "step": 39313 + }, + { + "epoch": 1.8303652489699003, + "grad_norm": 0.34547957033210785, + "learning_rate": 3.9580570798273844e-05, + "loss": 2.7035, + "step": 39314 + }, + { + "epoch": 1.8304118071559932, + "grad_norm": 0.3360387876801448, + "learning_rate": 3.957792154431037e-05, + "loss": 2.6071, + "step": 39315 + }, + { + "epoch": 1.8304583653420863, + "grad_norm": 0.3367642113238434, + "learning_rate": 3.957527232093466e-05, + "loss": 2.6479, + "step": 39316 + }, + { + "epoch": 1.8305049235281794, + "grad_norm": 0.35260643280002957, + "learning_rate": 3.9572623128154503e-05, + "loss": 2.6075, + "step": 39317 + }, + { + "epoch": 1.8305514817142723, + "grad_norm": 0.3200454443067425, + "learning_rate": 3.956997396597767e-05, + "loss": 2.7752, + "step": 39318 + }, + { + "epoch": 1.8305980399003654, + "grad_norm": 0.34027300226811236, + "learning_rate": 3.956732483441192e-05, + "loss": 2.6869, + "step": 39319 + }, + { + "epoch": 1.8306445980864585, + "grad_norm": 0.3348836673518318, + "learning_rate": 3.9564675733465076e-05, + "loss": 2.6427, + "step": 39320 + }, + { + "epoch": 1.8306911562725516, + "grad_norm": 0.3428788723482668, + "learning_rate": 3.956202666314486e-05, + "loss": 2.5901, + "step": 39321 + }, + { + "epoch": 1.8307377144586447, + "grad_norm": 0.328281008081011, + "learning_rate": 3.955937762345906e-05, + "loss": 2.762, + "step": 39322 + }, + { + "epoch": 1.8307842726447379, + "grad_norm": 0.33581125252682814, + "learning_rate": 3.955672861441547e-05, + "loss": 2.6191, + "step": 39323 + }, + { + "epoch": 1.830830830830831, + "grad_norm": 0.30818651608441067, + "learning_rate": 
3.955407963602184e-05, + "loss": 2.7849, + "step": 39324 + }, + { + "epoch": 1.8308773890169239, + "grad_norm": 0.34253071218809134, + "learning_rate": 3.955143068828598e-05, + "loss": 2.6717, + "step": 39325 + }, + { + "epoch": 1.830923947203017, + "grad_norm": 0.3485952910449998, + "learning_rate": 3.954878177121563e-05, + "loss": 2.651, + "step": 39326 + }, + { + "epoch": 1.8309705053891099, + "grad_norm": 0.3414158714213637, + "learning_rate": 3.9546132884818563e-05, + "loss": 2.8304, + "step": 39327 + }, + { + "epoch": 1.831017063575203, + "grad_norm": 0.3676511395133296, + "learning_rate": 3.954348402910258e-05, + "loss": 2.7132, + "step": 39328 + }, + { + "epoch": 1.8310636217612961, + "grad_norm": 0.3617014919911672, + "learning_rate": 3.954083520407545e-05, + "loss": 2.6189, + "step": 39329 + }, + { + "epoch": 1.8311101799473892, + "grad_norm": 0.37349917934856286, + "learning_rate": 3.9538186409744926e-05, + "loss": 2.72, + "step": 39330 + }, + { + "epoch": 1.8311567381334823, + "grad_norm": 0.3334509047502471, + "learning_rate": 3.95355376461188e-05, + "loss": 2.5818, + "step": 39331 + }, + { + "epoch": 1.8312032963195755, + "grad_norm": 0.392154768940244, + "learning_rate": 3.953288891320483e-05, + "loss": 2.7625, + "step": 39332 + }, + { + "epoch": 1.8312498545056686, + "grad_norm": 0.3395460094059534, + "learning_rate": 3.953024021101081e-05, + "loss": 2.7576, + "step": 39333 + }, + { + "epoch": 1.8312964126917617, + "grad_norm": 0.3566887529340764, + "learning_rate": 3.95275915395445e-05, + "loss": 2.7037, + "step": 39334 + }, + { + "epoch": 1.8313429708778546, + "grad_norm": 0.3552817291915462, + "learning_rate": 3.952494289881367e-05, + "loss": 2.7143, + "step": 39335 + }, + { + "epoch": 1.8313895290639477, + "grad_norm": 0.3454358262373611, + "learning_rate": 3.952229428882611e-05, + "loss": 2.788, + "step": 39336 + }, + { + "epoch": 1.8314360872500406, + "grad_norm": 0.3384058375179801, + "learning_rate": 3.951964570958957e-05, + "loss": 2.7563, + "step": 39337 + }, + { + "epoch": 1.8314826454361337, + "grad_norm": 0.3564191495994044, + "learning_rate": 3.951699716111186e-05, + "loss": 2.693, + "step": 39338 + }, + { + "epoch": 1.8315292036222268, + "grad_norm": 0.4065284518837676, + "learning_rate": 3.9514348643400724e-05, + "loss": 2.6682, + "step": 39339 + }, + { + "epoch": 1.83157576180832, + "grad_norm": 0.37220255888675613, + "learning_rate": 3.951170015646393e-05, + "loss": 2.7415, + "step": 39340 + }, + { + "epoch": 1.831622319994413, + "grad_norm": 0.3818572891553113, + "learning_rate": 3.950905170030927e-05, + "loss": 2.8055, + "step": 39341 + }, + { + "epoch": 1.8316688781805062, + "grad_norm": 0.33917274811694753, + "learning_rate": 3.95064032749445e-05, + "loss": 2.6839, + "step": 39342 + }, + { + "epoch": 1.8317154363665993, + "grad_norm": 0.34242094501484743, + "learning_rate": 3.950375488037742e-05, + "loss": 2.6566, + "step": 39343 + }, + { + "epoch": 1.8317619945526922, + "grad_norm": 0.3348976739508407, + "learning_rate": 3.950110651661579e-05, + "loss": 2.5746, + "step": 39344 + }, + { + "epoch": 1.8318085527387853, + "grad_norm": 0.3240100324669809, + "learning_rate": 3.949845818366735e-05, + "loss": 2.5859, + "step": 39345 + }, + { + "epoch": 1.8318551109248784, + "grad_norm": 0.3184556308514759, + "learning_rate": 3.9495809881539924e-05, + "loss": 2.7022, + "step": 39346 + }, + { + "epoch": 1.8319016691109713, + "grad_norm": 0.3637523976095441, + "learning_rate": 3.9493161610241265e-05, + "loss": 2.6196, + "step": 39347 + }, + { + "epoch": 
1.8319482272970644, + "grad_norm": 0.3304132205348639, + "learning_rate": 3.949051336977912e-05, + "loss": 2.5995, + "step": 39348 + }, + { + "epoch": 1.8319947854831575, + "grad_norm": 0.35964718172456, + "learning_rate": 3.948786516016129e-05, + "loss": 2.6757, + "step": 39349 + }, + { + "epoch": 1.8320413436692506, + "grad_norm": 0.34167233503865513, + "learning_rate": 3.948521698139554e-05, + "loss": 2.7596, + "step": 39350 + }, + { + "epoch": 1.8320879018553438, + "grad_norm": 0.32481226174124955, + "learning_rate": 3.948256883348964e-05, + "loss": 2.6639, + "step": 39351 + }, + { + "epoch": 1.8321344600414369, + "grad_norm": 0.35132562837036874, + "learning_rate": 3.947992071645138e-05, + "loss": 2.6583, + "step": 39352 + }, + { + "epoch": 1.83218101822753, + "grad_norm": 0.3224623885104118, + "learning_rate": 3.947727263028849e-05, + "loss": 2.7075, + "step": 39353 + }, + { + "epoch": 1.8322275764136229, + "grad_norm": 0.32311899355855334, + "learning_rate": 3.947462457500879e-05, + "loss": 2.6352, + "step": 39354 + }, + { + "epoch": 1.832274134599716, + "grad_norm": 0.3360263172945512, + "learning_rate": 3.947197655062003e-05, + "loss": 2.7651, + "step": 39355 + }, + { + "epoch": 1.8323206927858091, + "grad_norm": 0.353255252897528, + "learning_rate": 3.946932855712998e-05, + "loss": 2.7247, + "step": 39356 + }, + { + "epoch": 1.832367250971902, + "grad_norm": 0.34795292569169645, + "learning_rate": 3.946668059454641e-05, + "loss": 2.6964, + "step": 39357 + }, + { + "epoch": 1.8324138091579951, + "grad_norm": 0.3279548446368134, + "learning_rate": 3.946403266287708e-05, + "loss": 2.7608, + "step": 39358 + }, + { + "epoch": 1.8324603673440882, + "grad_norm": 0.37709756974019887, + "learning_rate": 3.94613847621298e-05, + "loss": 2.6239, + "step": 39359 + }, + { + "epoch": 1.8325069255301814, + "grad_norm": 0.3231883595720689, + "learning_rate": 3.9458736892312294e-05, + "loss": 2.6482, + "step": 39360 + }, + { + "epoch": 1.8325534837162745, + "grad_norm": 0.32207509906744863, + "learning_rate": 3.9456089053432376e-05, + "loss": 2.7036, + "step": 39361 + }, + { + "epoch": 1.8326000419023676, + "grad_norm": 0.33229294793492803, + "learning_rate": 3.9453441245497805e-05, + "loss": 2.6508, + "step": 39362 + }, + { + "epoch": 1.8326466000884607, + "grad_norm": 0.35367629073719475, + "learning_rate": 3.945079346851631e-05, + "loss": 2.6548, + "step": 39363 + }, + { + "epoch": 1.8326931582745536, + "grad_norm": 0.3133920243514137, + "learning_rate": 3.944814572249573e-05, + "loss": 2.5771, + "step": 39364 + }, + { + "epoch": 1.8327397164606467, + "grad_norm": 0.3656125592413855, + "learning_rate": 3.94454980074438e-05, + "loss": 2.7207, + "step": 39365 + }, + { + "epoch": 1.8327862746467396, + "grad_norm": 0.37253863670493187, + "learning_rate": 3.944285032336827e-05, + "loss": 2.7449, + "step": 39366 + }, + { + "epoch": 1.8328328328328327, + "grad_norm": 0.3298003796699391, + "learning_rate": 3.944020267027695e-05, + "loss": 2.6381, + "step": 39367 + }, + { + "epoch": 1.8328793910189258, + "grad_norm": 0.4061656109939261, + "learning_rate": 3.9437555048177594e-05, + "loss": 2.6133, + "step": 39368 + }, + { + "epoch": 1.832925949205019, + "grad_norm": 0.35044988988289194, + "learning_rate": 3.943490745707798e-05, + "loss": 2.7025, + "step": 39369 + }, + { + "epoch": 1.832972507391112, + "grad_norm": 0.35338505400180775, + "learning_rate": 3.9432259896985875e-05, + "loss": 2.6637, + "step": 39370 + }, + { + "epoch": 1.8330190655772052, + "grad_norm": 0.3714770643940901, + "learning_rate": 
3.942961236790902e-05, + "loss": 2.7271, + "step": 39371 + }, + { + "epoch": 1.8330656237632983, + "grad_norm": 0.3596254718473239, + "learning_rate": 3.9426964869855236e-05, + "loss": 2.6758, + "step": 39372 + }, + { + "epoch": 1.8331121819493914, + "grad_norm": 0.34658270918789424, + "learning_rate": 3.942431740283224e-05, + "loss": 2.6849, + "step": 39373 + }, + { + "epoch": 1.8331587401354843, + "grad_norm": 0.33111381336341855, + "learning_rate": 3.942166996684786e-05, + "loss": 2.7488, + "step": 39374 + }, + { + "epoch": 1.8332052983215774, + "grad_norm": 0.34348270427105687, + "learning_rate": 3.941902256190984e-05, + "loss": 2.7559, + "step": 39375 + }, + { + "epoch": 1.8332518565076703, + "grad_norm": 0.3657774449281826, + "learning_rate": 3.941637518802592e-05, + "loss": 2.7691, + "step": 39376 + }, + { + "epoch": 1.8332984146937634, + "grad_norm": 0.326388459287018, + "learning_rate": 3.941372784520392e-05, + "loss": 2.6422, + "step": 39377 + }, + { + "epoch": 1.8333449728798565, + "grad_norm": 0.3307336380270776, + "learning_rate": 3.941108053345157e-05, + "loss": 2.6229, + "step": 39378 + }, + { + "epoch": 1.8333915310659497, + "grad_norm": 0.32070399454668774, + "learning_rate": 3.940843325277666e-05, + "loss": 2.6871, + "step": 39379 + }, + { + "epoch": 1.8334380892520428, + "grad_norm": 0.33956806533366873, + "learning_rate": 3.940578600318696e-05, + "loss": 2.5789, + "step": 39380 + }, + { + "epoch": 1.8334846474381359, + "grad_norm": 0.3326867023328007, + "learning_rate": 3.940313878469023e-05, + "loss": 2.7113, + "step": 39381 + }, + { + "epoch": 1.833531205624229, + "grad_norm": 0.34067697081256376, + "learning_rate": 3.9400491597294246e-05, + "loss": 2.684, + "step": 39382 + }, + { + "epoch": 1.8335777638103221, + "grad_norm": 0.3162118179613967, + "learning_rate": 3.939784444100678e-05, + "loss": 2.5797, + "step": 39383 + }, + { + "epoch": 1.833624321996415, + "grad_norm": 0.3697701445822127, + "learning_rate": 3.939519731583558e-05, + "loss": 2.6277, + "step": 39384 + }, + { + "epoch": 1.8336708801825081, + "grad_norm": 0.34727844456114687, + "learning_rate": 3.939255022178845e-05, + "loss": 2.6627, + "step": 39385 + }, + { + "epoch": 1.833717438368601, + "grad_norm": 0.36629416653138436, + "learning_rate": 3.9389903158873107e-05, + "loss": 2.7225, + "step": 39386 + }, + { + "epoch": 1.8337639965546941, + "grad_norm": 0.34504617401391185, + "learning_rate": 3.938725612709739e-05, + "loss": 2.6174, + "step": 39387 + }, + { + "epoch": 1.8338105547407872, + "grad_norm": 0.34035825169612793, + "learning_rate": 3.9384609126469026e-05, + "loss": 2.7491, + "step": 39388 + }, + { + "epoch": 1.8338571129268804, + "grad_norm": 0.3488749094731244, + "learning_rate": 3.938196215699576e-05, + "loss": 2.6998, + "step": 39389 + }, + { + "epoch": 1.8339036711129735, + "grad_norm": 0.3461565153741331, + "learning_rate": 3.937931521868542e-05, + "loss": 2.6994, + "step": 39390 + }, + { + "epoch": 1.8339502292990666, + "grad_norm": 0.3086681009046588, + "learning_rate": 3.937666831154573e-05, + "loss": 2.6084, + "step": 39391 + }, + { + "epoch": 1.8339967874851597, + "grad_norm": 0.34134292537769273, + "learning_rate": 3.937402143558447e-05, + "loss": 2.7222, + "step": 39392 + }, + { + "epoch": 1.8340433456712526, + "grad_norm": 0.3472633067847367, + "learning_rate": 3.937137459080942e-05, + "loss": 2.7414, + "step": 39393 + }, + { + "epoch": 1.8340899038573457, + "grad_norm": 0.331170175112681, + "learning_rate": 3.9368727777228334e-05, + "loss": 2.6851, + "step": 39394 + }, + { + 
"epoch": 1.8341364620434388, + "grad_norm": 0.37587053606971493, + "learning_rate": 3.9366080994848986e-05, + "loss": 2.661, + "step": 39395 + }, + { + "epoch": 1.8341830202295317, + "grad_norm": 0.34160668175584946, + "learning_rate": 3.9363434243679124e-05, + "loss": 2.7419, + "step": 39396 + }, + { + "epoch": 1.8342295784156248, + "grad_norm": 0.37678705332053564, + "learning_rate": 3.936078752372656e-05, + "loss": 2.5757, + "step": 39397 + }, + { + "epoch": 1.834276136601718, + "grad_norm": 0.34302409256365746, + "learning_rate": 3.9358140834999036e-05, + "loss": 2.6699, + "step": 39398 + }, + { + "epoch": 1.834322694787811, + "grad_norm": 0.35559424827311176, + "learning_rate": 3.9355494177504295e-05, + "loss": 2.7458, + "step": 39399 + }, + { + "epoch": 1.8343692529739042, + "grad_norm": 0.3787474413919533, + "learning_rate": 3.9352847551250156e-05, + "loss": 2.6495, + "step": 39400 + }, + { + "epoch": 1.8344158111599973, + "grad_norm": 0.37773672626124505, + "learning_rate": 3.9350200956244356e-05, + "loss": 2.7645, + "step": 39401 + }, + { + "epoch": 1.8344623693460904, + "grad_norm": 0.39735429733438377, + "learning_rate": 3.934755439249465e-05, + "loss": 2.7099, + "step": 39402 + }, + { + "epoch": 1.8345089275321833, + "grad_norm": 0.3591841686095985, + "learning_rate": 3.934490786000884e-05, + "loss": 2.745, + "step": 39403 + }, + { + "epoch": 1.8345554857182764, + "grad_norm": 0.38702229749911243, + "learning_rate": 3.9342261358794663e-05, + "loss": 2.6134, + "step": 39404 + }, + { + "epoch": 1.8346020439043695, + "grad_norm": 0.3713841669666268, + "learning_rate": 3.9339614888859905e-05, + "loss": 2.7139, + "step": 39405 + }, + { + "epoch": 1.8346486020904624, + "grad_norm": 0.344734749179946, + "learning_rate": 3.9336968450212335e-05, + "loss": 2.7978, + "step": 39406 + }, + { + "epoch": 1.8346951602765555, + "grad_norm": 0.3432624484617663, + "learning_rate": 3.93343220428597e-05, + "loss": 2.6273, + "step": 39407 + }, + { + "epoch": 1.8347417184626487, + "grad_norm": 0.35608841173917777, + "learning_rate": 3.9331675666809794e-05, + "loss": 2.7012, + "step": 39408 + }, + { + "epoch": 1.8347882766487418, + "grad_norm": 0.3309214941095165, + "learning_rate": 3.932902932207034e-05, + "loss": 2.5956, + "step": 39409 + }, + { + "epoch": 1.834834834834835, + "grad_norm": 0.36064609940221004, + "learning_rate": 3.9326383008649165e-05, + "loss": 2.6359, + "step": 39410 + }, + { + "epoch": 1.834881393020928, + "grad_norm": 0.3455261184818011, + "learning_rate": 3.9323736726554e-05, + "loss": 2.6805, + "step": 39411 + }, + { + "epoch": 1.8349279512070211, + "grad_norm": 0.3690216724646627, + "learning_rate": 3.9321090475792594e-05, + "loss": 2.6568, + "step": 39412 + }, + { + "epoch": 1.834974509393114, + "grad_norm": 0.3435130718171887, + "learning_rate": 3.9318444256372756e-05, + "loss": 2.8615, + "step": 39413 + }, + { + "epoch": 1.8350210675792071, + "grad_norm": 0.3367173539186166, + "learning_rate": 3.931579806830221e-05, + "loss": 2.7062, + "step": 39414 + }, + { + "epoch": 1.8350676257653, + "grad_norm": 0.35385585711838297, + "learning_rate": 3.9313151911588765e-05, + "loss": 2.62, + "step": 39415 + }, + { + "epoch": 1.8351141839513931, + "grad_norm": 0.34724973514440916, + "learning_rate": 3.931050578624017e-05, + "loss": 2.7386, + "step": 39416 + }, + { + "epoch": 1.8351607421374863, + "grad_norm": 0.325957997735817, + "learning_rate": 3.9307859692264176e-05, + "loss": 2.7037, + "step": 39417 + }, + { + "epoch": 1.8352073003235794, + "grad_norm": 0.3197356096789675, + 
"learning_rate": 3.930521362966856e-05, + "loss": 2.6343, + "step": 39418 + }, + { + "epoch": 1.8352538585096725, + "grad_norm": 0.336628423650841, + "learning_rate": 3.9302567598461095e-05, + "loss": 2.6939, + "step": 39419 + }, + { + "epoch": 1.8353004166957656, + "grad_norm": 0.3331156607888358, + "learning_rate": 3.929992159864954e-05, + "loss": 2.5812, + "step": 39420 + }, + { + "epoch": 1.8353469748818587, + "grad_norm": 0.3265379082145714, + "learning_rate": 3.929727563024166e-05, + "loss": 2.7456, + "step": 39421 + }, + { + "epoch": 1.8353935330679518, + "grad_norm": 0.32342097424163935, + "learning_rate": 3.92946296932452e-05, + "loss": 2.7668, + "step": 39422 + }, + { + "epoch": 1.8354400912540447, + "grad_norm": 0.340858164059341, + "learning_rate": 3.9291983787667975e-05, + "loss": 2.7827, + "step": 39423 + }, + { + "epoch": 1.8354866494401378, + "grad_norm": 0.32476010760693275, + "learning_rate": 3.928933791351772e-05, + "loss": 2.6976, + "step": 39424 + }, + { + "epoch": 1.8355332076262307, + "grad_norm": 0.3406015029826341, + "learning_rate": 3.9286692070802186e-05, + "loss": 2.6732, + "step": 39425 + }, + { + "epoch": 1.8355797658123238, + "grad_norm": 0.3408632323221613, + "learning_rate": 3.928404625952917e-05, + "loss": 2.8038, + "step": 39426 + }, + { + "epoch": 1.835626323998417, + "grad_norm": 0.3185767187994903, + "learning_rate": 3.928140047970641e-05, + "loss": 2.7108, + "step": 39427 + }, + { + "epoch": 1.83567288218451, + "grad_norm": 0.37334608417688275, + "learning_rate": 3.927875473134169e-05, + "loss": 2.6876, + "step": 39428 + }, + { + "epoch": 1.8357194403706032, + "grad_norm": 0.3478744995668384, + "learning_rate": 3.9276109014442785e-05, + "loss": 2.606, + "step": 39429 + }, + { + "epoch": 1.8357659985566963, + "grad_norm": 0.32028252651294137, + "learning_rate": 3.927346332901742e-05, + "loss": 2.7172, + "step": 39430 + }, + { + "epoch": 1.8358125567427894, + "grad_norm": 0.3357165960539245, + "learning_rate": 3.92708176750734e-05, + "loss": 2.6021, + "step": 39431 + }, + { + "epoch": 1.8358591149288823, + "grad_norm": 0.32895309079213386, + "learning_rate": 3.926817205261846e-05, + "loss": 2.7532, + "step": 39432 + }, + { + "epoch": 1.8359056731149754, + "grad_norm": 0.3229244139343254, + "learning_rate": 3.926552646166038e-05, + "loss": 2.7408, + "step": 39433 + }, + { + "epoch": 1.8359522313010685, + "grad_norm": 0.34508425464197934, + "learning_rate": 3.9262880902206944e-05, + "loss": 2.7105, + "step": 39434 + }, + { + "epoch": 1.8359987894871614, + "grad_norm": 0.34780076420414824, + "learning_rate": 3.926023537426586e-05, + "loss": 2.7099, + "step": 39435 + }, + { + "epoch": 1.8360453476732546, + "grad_norm": 0.38622739702524084, + "learning_rate": 3.9257589877844955e-05, + "loss": 2.808, + "step": 39436 + }, + { + "epoch": 1.8360919058593477, + "grad_norm": 0.35753738906886573, + "learning_rate": 3.9254944412951956e-05, + "loss": 2.7658, + "step": 39437 + }, + { + "epoch": 1.8361384640454408, + "grad_norm": 0.33867520627232, + "learning_rate": 3.925229897959462e-05, + "loss": 2.6356, + "step": 39438 + }, + { + "epoch": 1.836185022231534, + "grad_norm": 0.36676400270382975, + "learning_rate": 3.924965357778075e-05, + "loss": 2.7182, + "step": 39439 + }, + { + "epoch": 1.836231580417627, + "grad_norm": 0.3495131223664492, + "learning_rate": 3.9247008207518065e-05, + "loss": 2.6415, + "step": 39440 + }, + { + "epoch": 1.8362781386037201, + "grad_norm": 0.35112226790732853, + "learning_rate": 3.9244362868814375e-05, + "loss": 2.596, + "step": 39441 + 
}, + { + "epoch": 1.836324696789813, + "grad_norm": 0.34127497986829297, + "learning_rate": 3.924171756167742e-05, + "loss": 2.7313, + "step": 39442 + }, + { + "epoch": 1.8363712549759061, + "grad_norm": 0.344575068589317, + "learning_rate": 3.9239072286114955e-05, + "loss": 2.7098, + "step": 39443 + }, + { + "epoch": 1.8364178131619993, + "grad_norm": 0.34840115912281616, + "learning_rate": 3.923642704213475e-05, + "loss": 2.6128, + "step": 39444 + }, + { + "epoch": 1.8364643713480922, + "grad_norm": 0.334373463327686, + "learning_rate": 3.923378182974457e-05, + "loss": 2.6422, + "step": 39445 + }, + { + "epoch": 1.8365109295341853, + "grad_norm": 0.3445876571973375, + "learning_rate": 3.9231136648952195e-05, + "loss": 2.601, + "step": 39446 + }, + { + "epoch": 1.8365574877202784, + "grad_norm": 0.3492955377977065, + "learning_rate": 3.922849149976537e-05, + "loss": 2.6281, + "step": 39447 + }, + { + "epoch": 1.8366040459063715, + "grad_norm": 0.3297859894670464, + "learning_rate": 3.9225846382191836e-05, + "loss": 2.6797, + "step": 39448 + }, + { + "epoch": 1.8366506040924646, + "grad_norm": 0.3374390334632785, + "learning_rate": 3.922320129623941e-05, + "loss": 2.7222, + "step": 39449 + }, + { + "epoch": 1.8366971622785577, + "grad_norm": 0.3407100009992261, + "learning_rate": 3.92205562419158e-05, + "loss": 2.6849, + "step": 39450 + }, + { + "epoch": 1.8367437204646508, + "grad_norm": 0.3211839367752048, + "learning_rate": 3.921791121922882e-05, + "loss": 2.7889, + "step": 39451 + }, + { + "epoch": 1.8367902786507437, + "grad_norm": 0.32977403720526854, + "learning_rate": 3.921526622818621e-05, + "loss": 2.7123, + "step": 39452 + }, + { + "epoch": 1.8368368368368369, + "grad_norm": 0.33765576458468133, + "learning_rate": 3.92126212687957e-05, + "loss": 2.6863, + "step": 39453 + }, + { + "epoch": 1.8368833950229297, + "grad_norm": 0.3148701688771152, + "learning_rate": 3.920997634106512e-05, + "loss": 2.7099, + "step": 39454 + }, + { + "epoch": 1.8369299532090229, + "grad_norm": 0.2961323526975729, + "learning_rate": 3.920733144500219e-05, + "loss": 2.6478, + "step": 39455 + }, + { + "epoch": 1.836976511395116, + "grad_norm": 0.32740851712878666, + "learning_rate": 3.920468658061466e-05, + "loss": 2.6754, + "step": 39456 + }, + { + "epoch": 1.837023069581209, + "grad_norm": 0.3243837857489388, + "learning_rate": 3.920204174791032e-05, + "loss": 2.7131, + "step": 39457 + }, + { + "epoch": 1.8370696277673022, + "grad_norm": 0.31788947826675545, + "learning_rate": 3.9199396946896924e-05, + "loss": 2.5842, + "step": 39458 + }, + { + "epoch": 1.8371161859533953, + "grad_norm": 0.33272494894215804, + "learning_rate": 3.919675217758224e-05, + "loss": 2.7752, + "step": 39459 + }, + { + "epoch": 1.8371627441394884, + "grad_norm": 0.3527225746641922, + "learning_rate": 3.919410743997402e-05, + "loss": 2.6567, + "step": 39460 + }, + { + "epoch": 1.8372093023255816, + "grad_norm": 0.32231174484024167, + "learning_rate": 3.919146273408003e-05, + "loss": 2.6721, + "step": 39461 + }, + { + "epoch": 1.8372558605116744, + "grad_norm": 0.34129318045550106, + "learning_rate": 3.9188818059908034e-05, + "loss": 2.7297, + "step": 39462 + }, + { + "epoch": 1.8373024186977676, + "grad_norm": 0.33855689980249737, + "learning_rate": 3.918617341746578e-05, + "loss": 2.7181, + "step": 39463 + }, + { + "epoch": 1.8373489768838605, + "grad_norm": 0.32566763580015806, + "learning_rate": 3.918352880676105e-05, + "loss": 2.6676, + "step": 39464 + }, + { + "epoch": 1.8373955350699536, + "grad_norm": 0.3666635566913995, 
+ "learning_rate": 3.918088422780161e-05, + "loss": 2.7529, + "step": 39465 + }, + { + "epoch": 1.8374420932560467, + "grad_norm": 0.3371826834293913, + "learning_rate": 3.917823968059518e-05, + "loss": 2.6527, + "step": 39466 + }, + { + "epoch": 1.8374886514421398, + "grad_norm": 0.34964735104794237, + "learning_rate": 3.9175595165149576e-05, + "loss": 2.699, + "step": 39467 + }, + { + "epoch": 1.837535209628233, + "grad_norm": 0.31710801552548, + "learning_rate": 3.917295068147252e-05, + "loss": 2.6342, + "step": 39468 + }, + { + "epoch": 1.837581767814326, + "grad_norm": 0.3506280011889373, + "learning_rate": 3.9170306229571804e-05, + "loss": 2.7925, + "step": 39469 + }, + { + "epoch": 1.8376283260004191, + "grad_norm": 0.3572363344711252, + "learning_rate": 3.916766180945516e-05, + "loss": 2.7406, + "step": 39470 + }, + { + "epoch": 1.8376748841865123, + "grad_norm": 0.2902380210504402, + "learning_rate": 3.9165017421130356e-05, + "loss": 2.6442, + "step": 39471 + }, + { + "epoch": 1.8377214423726052, + "grad_norm": 0.36169397595580866, + "learning_rate": 3.916237306460517e-05, + "loss": 2.7436, + "step": 39472 + }, + { + "epoch": 1.8377680005586983, + "grad_norm": 0.3718802611169561, + "learning_rate": 3.915972873988736e-05, + "loss": 2.7215, + "step": 39473 + }, + { + "epoch": 1.8378145587447912, + "grad_norm": 0.3260747785498571, + "learning_rate": 3.915708444698465e-05, + "loss": 2.7559, + "step": 39474 + }, + { + "epoch": 1.8378611169308843, + "grad_norm": 0.35211216279125335, + "learning_rate": 3.915444018590484e-05, + "loss": 2.6868, + "step": 39475 + }, + { + "epoch": 1.8379076751169774, + "grad_norm": 0.36472078769605776, + "learning_rate": 3.915179595665567e-05, + "loss": 2.6132, + "step": 39476 + }, + { + "epoch": 1.8379542333030705, + "grad_norm": 0.33941536240729064, + "learning_rate": 3.9149151759244935e-05, + "loss": 2.7413, + "step": 39477 + }, + { + "epoch": 1.8380007914891636, + "grad_norm": 0.35003855109382076, + "learning_rate": 3.914650759368037e-05, + "loss": 2.6465, + "step": 39478 + }, + { + "epoch": 1.8380473496752567, + "grad_norm": 0.3391303379299571, + "learning_rate": 3.914386345996971e-05, + "loss": 2.6962, + "step": 39479 + }, + { + "epoch": 1.8380939078613499, + "grad_norm": 0.3439868388675763, + "learning_rate": 3.914121935812076e-05, + "loss": 2.6443, + "step": 39480 + }, + { + "epoch": 1.8381404660474427, + "grad_norm": 0.36510005972504145, + "learning_rate": 3.913857528814125e-05, + "loss": 2.7258, + "step": 39481 + }, + { + "epoch": 1.8381870242335359, + "grad_norm": 0.34925883178295236, + "learning_rate": 3.913593125003897e-05, + "loss": 2.6842, + "step": 39482 + }, + { + "epoch": 1.838233582419629, + "grad_norm": 0.3837582147788781, + "learning_rate": 3.913328724382165e-05, + "loss": 2.6639, + "step": 39483 + }, + { + "epoch": 1.8382801406057219, + "grad_norm": 0.3581982768052511, + "learning_rate": 3.9130643269497056e-05, + "loss": 2.7366, + "step": 39484 + }, + { + "epoch": 1.838326698791815, + "grad_norm": 0.37029635239905717, + "learning_rate": 3.912799932707297e-05, + "loss": 2.6156, + "step": 39485 + }, + { + "epoch": 1.838373256977908, + "grad_norm": 0.35865370568004623, + "learning_rate": 3.912535541655711e-05, + "loss": 2.7651, + "step": 39486 + }, + { + "epoch": 1.8384198151640012, + "grad_norm": 0.36781529047545913, + "learning_rate": 3.912271153795728e-05, + "loss": 2.6616, + "step": 39487 + }, + { + "epoch": 1.8384663733500943, + "grad_norm": 0.3912259579301614, + "learning_rate": 3.912006769128122e-05, + "loss": 2.5706, + "step": 
39488 + }, + { + "epoch": 1.8385129315361874, + "grad_norm": 0.3436146925163071, + "learning_rate": 3.9117423876536666e-05, + "loss": 2.6358, + "step": 39489 + }, + { + "epoch": 1.8385594897222806, + "grad_norm": 0.3952796142611629, + "learning_rate": 3.911478009373143e-05, + "loss": 2.73, + "step": 39490 + }, + { + "epoch": 1.8386060479083735, + "grad_norm": 0.34620931358910495, + "learning_rate": 3.911213634287324e-05, + "loss": 2.7322, + "step": 39491 + }, + { + "epoch": 1.8386526060944666, + "grad_norm": 0.38262330253960114, + "learning_rate": 3.9109492623969836e-05, + "loss": 2.7282, + "step": 39492 + }, + { + "epoch": 1.8386991642805597, + "grad_norm": 0.33172975629329093, + "learning_rate": 3.910684893702902e-05, + "loss": 2.6259, + "step": 39493 + }, + { + "epoch": 1.8387457224666526, + "grad_norm": 0.3399322145088255, + "learning_rate": 3.9104205282058506e-05, + "loss": 2.7115, + "step": 39494 + }, + { + "epoch": 1.8387922806527457, + "grad_norm": 0.33118650916798725, + "learning_rate": 3.910156165906609e-05, + "loss": 2.6026, + "step": 39495 + }, + { + "epoch": 1.8388388388388388, + "grad_norm": 0.33823674221319305, + "learning_rate": 3.9098918068059526e-05, + "loss": 2.7053, + "step": 39496 + }, + { + "epoch": 1.838885397024932, + "grad_norm": 0.34896418534318424, + "learning_rate": 3.909627450904655e-05, + "loss": 2.7218, + "step": 39497 + }, + { + "epoch": 1.838931955211025, + "grad_norm": 0.3523472831450062, + "learning_rate": 3.9093630982034946e-05, + "loss": 2.7615, + "step": 39498 + }, + { + "epoch": 1.8389785133971182, + "grad_norm": 0.3411828380578539, + "learning_rate": 3.909098748703244e-05, + "loss": 2.7416, + "step": 39499 + }, + { + "epoch": 1.8390250715832113, + "grad_norm": 0.39194432584622824, + "learning_rate": 3.908834402404683e-05, + "loss": 2.7487, + "step": 39500 + }, + { + "epoch": 1.8390716297693042, + "grad_norm": 0.3472227414505629, + "learning_rate": 3.9085700593085854e-05, + "loss": 2.5638, + "step": 39501 + }, + { + "epoch": 1.8391181879553973, + "grad_norm": 0.370098848683963, + "learning_rate": 3.908305719415725e-05, + "loss": 2.7294, + "step": 39502 + }, + { + "epoch": 1.8391647461414902, + "grad_norm": 0.3790695044292949, + "learning_rate": 3.9080413827268824e-05, + "loss": 2.6939, + "step": 39503 + }, + { + "epoch": 1.8392113043275833, + "grad_norm": 0.3544019345514821, + "learning_rate": 3.907777049242828e-05, + "loss": 2.7264, + "step": 39504 + }, + { + "epoch": 1.8392578625136764, + "grad_norm": 0.36468818911415873, + "learning_rate": 3.9075127189643435e-05, + "loss": 2.724, + "step": 39505 + }, + { + "epoch": 1.8393044206997695, + "grad_norm": 0.3830576408551329, + "learning_rate": 3.907248391892201e-05, + "loss": 2.6578, + "step": 39506 + }, + { + "epoch": 1.8393509788858626, + "grad_norm": 0.3393187681988284, + "learning_rate": 3.906984068027175e-05, + "loss": 2.7375, + "step": 39507 + }, + { + "epoch": 1.8393975370719557, + "grad_norm": 0.33579070449695475, + "learning_rate": 3.906719747370044e-05, + "loss": 2.7302, + "step": 39508 + }, + { + "epoch": 1.8394440952580489, + "grad_norm": 0.33562309724428435, + "learning_rate": 3.9064554299215825e-05, + "loss": 2.705, + "step": 39509 + }, + { + "epoch": 1.839490653444142, + "grad_norm": 0.3429541509342684, + "learning_rate": 3.906191115682568e-05, + "loss": 2.693, + "step": 39510 + }, + { + "epoch": 1.8395372116302349, + "grad_norm": 0.32444194305908775, + "learning_rate": 3.9059268046537745e-05, + "loss": 2.6763, + "step": 39511 + }, + { + "epoch": 1.839583769816328, + "grad_norm": 
0.35851513007665736, + "learning_rate": 3.905662496835976e-05, + "loss": 2.673, + "step": 39512 + }, + { + "epoch": 1.8396303280024209, + "grad_norm": 0.36047686239850324, + "learning_rate": 3.905398192229952e-05, + "loss": 2.8109, + "step": 39513 + }, + { + "epoch": 1.839676886188514, + "grad_norm": 0.33655911109231473, + "learning_rate": 3.9051338908364774e-05, + "loss": 2.7834, + "step": 39514 + }, + { + "epoch": 1.839723444374607, + "grad_norm": 0.36413182261350485, + "learning_rate": 3.904869592656324e-05, + "loss": 2.6565, + "step": 39515 + }, + { + "epoch": 1.8397700025607002, + "grad_norm": 0.364314677086928, + "learning_rate": 3.904605297690272e-05, + "loss": 2.8191, + "step": 39516 + }, + { + "epoch": 1.8398165607467933, + "grad_norm": 0.3234736709871942, + "learning_rate": 3.904341005939095e-05, + "loss": 2.6465, + "step": 39517 + }, + { + "epoch": 1.8398631189328865, + "grad_norm": 0.3617010184599836, + "learning_rate": 3.90407671740357e-05, + "loss": 2.6768, + "step": 39518 + }, + { + "epoch": 1.8399096771189796, + "grad_norm": 0.38063893811881566, + "learning_rate": 3.9038124320844724e-05, + "loss": 2.6669, + "step": 39519 + }, + { + "epoch": 1.8399562353050725, + "grad_norm": 0.34649507235173804, + "learning_rate": 3.9035481499825764e-05, + "loss": 2.7363, + "step": 39520 + }, + { + "epoch": 1.8400027934911656, + "grad_norm": 0.34584086402095193, + "learning_rate": 3.903283871098659e-05, + "loss": 2.7294, + "step": 39521 + }, + { + "epoch": 1.8400493516772587, + "grad_norm": 0.3465581588873986, + "learning_rate": 3.903019595433495e-05, + "loss": 2.7128, + "step": 39522 + }, + { + "epoch": 1.8400959098633516, + "grad_norm": 0.32206924039609325, + "learning_rate": 3.902755322987861e-05, + "loss": 2.6278, + "step": 39523 + }, + { + "epoch": 1.8401424680494447, + "grad_norm": 0.3577137328598371, + "learning_rate": 3.9024910537625324e-05, + "loss": 2.6905, + "step": 39524 + }, + { + "epoch": 1.8401890262355378, + "grad_norm": 0.3180874503424674, + "learning_rate": 3.902226787758283e-05, + "loss": 2.728, + "step": 39525 + }, + { + "epoch": 1.840235584421631, + "grad_norm": 0.33400260533308285, + "learning_rate": 3.901962524975891e-05, + "loss": 2.7175, + "step": 39526 + }, + { + "epoch": 1.840282142607724, + "grad_norm": 0.34296313210903573, + "learning_rate": 3.901698265416128e-05, + "loss": 2.7902, + "step": 39527 + }, + { + "epoch": 1.8403287007938172, + "grad_norm": 0.3331282966597422, + "learning_rate": 3.901434009079776e-05, + "loss": 2.6076, + "step": 39528 + }, + { + "epoch": 1.8403752589799103, + "grad_norm": 0.3356293008137273, + "learning_rate": 3.901169755967606e-05, + "loss": 2.6689, + "step": 39529 + }, + { + "epoch": 1.8404218171660032, + "grad_norm": 0.31195872154776166, + "learning_rate": 3.900905506080393e-05, + "loss": 2.6749, + "step": 39530 + }, + { + "epoch": 1.8404683753520963, + "grad_norm": 0.341894018241255, + "learning_rate": 3.900641259418916e-05, + "loss": 2.6554, + "step": 39531 + }, + { + "epoch": 1.8405149335381894, + "grad_norm": 0.33626609963831355, + "learning_rate": 3.9003770159839487e-05, + "loss": 2.7087, + "step": 39532 + }, + { + "epoch": 1.8405614917242823, + "grad_norm": 0.313843368505913, + "learning_rate": 3.9001127757762656e-05, + "loss": 2.7385, + "step": 39533 + }, + { + "epoch": 1.8406080499103754, + "grad_norm": 0.3751540262786983, + "learning_rate": 3.899848538796643e-05, + "loss": 2.7492, + "step": 39534 + }, + { + "epoch": 1.8406546080964685, + "grad_norm": 0.3364459293574359, + "learning_rate": 3.899584305045856e-05, + "loss": 
2.5701, + "step": 39535 + }, + { + "epoch": 1.8407011662825616, + "grad_norm": 0.3435358081747104, + "learning_rate": 3.899320074524682e-05, + "loss": 2.6079, + "step": 39536 + }, + { + "epoch": 1.8407477244686548, + "grad_norm": 0.32388782575282116, + "learning_rate": 3.8990558472338955e-05, + "loss": 2.7718, + "step": 39537 + }, + { + "epoch": 1.8407942826547479, + "grad_norm": 0.36961270705400706, + "learning_rate": 3.8987916231742695e-05, + "loss": 2.7461, + "step": 39538 + }, + { + "epoch": 1.840840840840841, + "grad_norm": 0.34319164463448554, + "learning_rate": 3.898527402346583e-05, + "loss": 2.6043, + "step": 39539 + }, + { + "epoch": 1.8408873990269339, + "grad_norm": 0.3464155468993484, + "learning_rate": 3.898263184751609e-05, + "loss": 2.7535, + "step": 39540 + }, + { + "epoch": 1.840933957213027, + "grad_norm": 0.37396494291708565, + "learning_rate": 3.897998970390125e-05, + "loss": 2.7236, + "step": 39541 + }, + { + "epoch": 1.8409805153991199, + "grad_norm": 0.32439812265062223, + "learning_rate": 3.897734759262907e-05, + "loss": 2.7864, + "step": 39542 + }, + { + "epoch": 1.841027073585213, + "grad_norm": 0.35661959295446066, + "learning_rate": 3.897470551370725e-05, + "loss": 2.6299, + "step": 39543 + }, + { + "epoch": 1.8410736317713061, + "grad_norm": 0.31713602638706173, + "learning_rate": 3.8972063467143615e-05, + "loss": 2.5856, + "step": 39544 + }, + { + "epoch": 1.8411201899573992, + "grad_norm": 0.3319358508521736, + "learning_rate": 3.896942145294588e-05, + "loss": 2.6227, + "step": 39545 + }, + { + "epoch": 1.8411667481434923, + "grad_norm": 0.30931811248798335, + "learning_rate": 3.896677947112181e-05, + "loss": 2.6449, + "step": 39546 + }, + { + "epoch": 1.8412133063295855, + "grad_norm": 0.3323667423142534, + "learning_rate": 3.8964137521679145e-05, + "loss": 2.7291, + "step": 39547 + }, + { + "epoch": 1.8412598645156786, + "grad_norm": 0.3347384949088726, + "learning_rate": 3.8961495604625646e-05, + "loss": 2.7147, + "step": 39548 + }, + { + "epoch": 1.8413064227017717, + "grad_norm": 0.3400884400723004, + "learning_rate": 3.895885371996908e-05, + "loss": 2.6156, + "step": 39549 + }, + { + "epoch": 1.8413529808878646, + "grad_norm": 0.3161336795253607, + "learning_rate": 3.89562118677172e-05, + "loss": 2.6335, + "step": 39550 + }, + { + "epoch": 1.8413995390739577, + "grad_norm": 0.34925019754497333, + "learning_rate": 3.895357004787772e-05, + "loss": 2.6918, + "step": 39551 + }, + { + "epoch": 1.8414460972600506, + "grad_norm": 0.3392537537914354, + "learning_rate": 3.8950928260458454e-05, + "loss": 2.693, + "step": 39552 + }, + { + "epoch": 1.8414926554461437, + "grad_norm": 0.35922392897646716, + "learning_rate": 3.8948286505467104e-05, + "loss": 2.7, + "step": 39553 + }, + { + "epoch": 1.8415392136322368, + "grad_norm": 0.34110868483362544, + "learning_rate": 3.894564478291146e-05, + "loss": 2.8044, + "step": 39554 + }, + { + "epoch": 1.84158577181833, + "grad_norm": 0.34955487910492483, + "learning_rate": 3.894300309279926e-05, + "loss": 2.8506, + "step": 39555 + }, + { + "epoch": 1.841632330004423, + "grad_norm": 0.37860442959120644, + "learning_rate": 3.894036143513823e-05, + "loss": 2.6285, + "step": 39556 + }, + { + "epoch": 1.8416788881905162, + "grad_norm": 0.3474816712935656, + "learning_rate": 3.893771980993618e-05, + "loss": 2.649, + "step": 39557 + }, + { + "epoch": 1.8417254463766093, + "grad_norm": 0.34805266781154137, + "learning_rate": 3.8935078217200816e-05, + "loss": 2.6401, + "step": 39558 + }, + { + "epoch": 1.8417720045627024, + 
"grad_norm": 0.3409203453613699, + "learning_rate": 3.893243665693992e-05, + "loss": 2.7031, + "step": 39559 + }, + { + "epoch": 1.8418185627487953, + "grad_norm": 0.360182097037884, + "learning_rate": 3.892979512916122e-05, + "loss": 2.6751, + "step": 39560 + }, + { + "epoch": 1.8418651209348884, + "grad_norm": 0.37058126813731335, + "learning_rate": 3.8927153633872485e-05, + "loss": 2.719, + "step": 39561 + }, + { + "epoch": 1.8419116791209813, + "grad_norm": 0.3478320257329472, + "learning_rate": 3.8924512171081474e-05, + "loss": 2.6545, + "step": 39562 + }, + { + "epoch": 1.8419582373070744, + "grad_norm": 0.35089941526961255, + "learning_rate": 3.89218707407959e-05, + "loss": 2.7434, + "step": 39563 + }, + { + "epoch": 1.8420047954931675, + "grad_norm": 0.33556488994920947, + "learning_rate": 3.891922934302357e-05, + "loss": 2.7024, + "step": 39564 + }, + { + "epoch": 1.8420513536792607, + "grad_norm": 0.3426097821534381, + "learning_rate": 3.891658797777221e-05, + "loss": 2.6983, + "step": 39565 + }, + { + "epoch": 1.8420979118653538, + "grad_norm": 0.36183933640264027, + "learning_rate": 3.891394664504955e-05, + "loss": 2.6663, + "step": 39566 + }, + { + "epoch": 1.8421444700514469, + "grad_norm": 0.3149123747161609, + "learning_rate": 3.8911305344863395e-05, + "loss": 2.6981, + "step": 39567 + }, + { + "epoch": 1.84219102823754, + "grad_norm": 0.3271780554717914, + "learning_rate": 3.890866407722145e-05, + "loss": 2.6423, + "step": 39568 + }, + { + "epoch": 1.842237586423633, + "grad_norm": 0.35882336801328923, + "learning_rate": 3.8906022842131476e-05, + "loss": 2.6893, + "step": 39569 + }, + { + "epoch": 1.842284144609726, + "grad_norm": 0.32366427723198155, + "learning_rate": 3.8903381639601245e-05, + "loss": 2.7672, + "step": 39570 + }, + { + "epoch": 1.8423307027958191, + "grad_norm": 0.3309708687424797, + "learning_rate": 3.890074046963848e-05, + "loss": 2.7295, + "step": 39571 + }, + { + "epoch": 1.842377260981912, + "grad_norm": 0.3310810846102981, + "learning_rate": 3.889809933225097e-05, + "loss": 2.617, + "step": 39572 + }, + { + "epoch": 1.8424238191680051, + "grad_norm": 0.3293873039242962, + "learning_rate": 3.889545822744644e-05, + "loss": 2.6559, + "step": 39573 + }, + { + "epoch": 1.8424703773540982, + "grad_norm": 0.3582045290516933, + "learning_rate": 3.889281715523263e-05, + "loss": 2.7371, + "step": 39574 + }, + { + "epoch": 1.8425169355401914, + "grad_norm": 0.3450527985702004, + "learning_rate": 3.8890176115617324e-05, + "loss": 2.7797, + "step": 39575 + }, + { + "epoch": 1.8425634937262845, + "grad_norm": 0.3408716609720184, + "learning_rate": 3.888753510860824e-05, + "loss": 2.7366, + "step": 39576 + }, + { + "epoch": 1.8426100519123776, + "grad_norm": 0.35319510944126364, + "learning_rate": 3.888489413421316e-05, + "loss": 2.6482, + "step": 39577 + }, + { + "epoch": 1.8426566100984707, + "grad_norm": 0.328547778899684, + "learning_rate": 3.8882253192439824e-05, + "loss": 2.7553, + "step": 39578 + }, + { + "epoch": 1.8427031682845636, + "grad_norm": 0.3666537573446847, + "learning_rate": 3.8879612283295955e-05, + "loss": 2.689, + "step": 39579 + }, + { + "epoch": 1.8427497264706567, + "grad_norm": 0.343634329036784, + "learning_rate": 3.8876971406789356e-05, + "loss": 2.5942, + "step": 39580 + }, + { + "epoch": 1.8427962846567498, + "grad_norm": 0.37146190935412504, + "learning_rate": 3.8874330562927726e-05, + "loss": 2.6884, + "step": 39581 + }, + { + "epoch": 1.8428428428428427, + "grad_norm": 0.35537570837292176, + "learning_rate": 
3.8871689751718856e-05, + "loss": 2.663, + "step": 39582 + }, + { + "epoch": 1.8428894010289358, + "grad_norm": 0.3481978051808015, + "learning_rate": 3.8869048973170486e-05, + "loss": 2.7093, + "step": 39583 + }, + { + "epoch": 1.842935959215029, + "grad_norm": 0.3622384737474609, + "learning_rate": 3.8866408227290354e-05, + "loss": 2.6459, + "step": 39584 + }, + { + "epoch": 1.842982517401122, + "grad_norm": 0.32851519246807065, + "learning_rate": 3.886376751408622e-05, + "loss": 2.7341, + "step": 39585 + }, + { + "epoch": 1.8430290755872152, + "grad_norm": 0.33883056827145064, + "learning_rate": 3.8861126833565826e-05, + "loss": 2.7238, + "step": 39586 + }, + { + "epoch": 1.8430756337733083, + "grad_norm": 0.33589448246969683, + "learning_rate": 3.885848618573692e-05, + "loss": 2.7166, + "step": 39587 + }, + { + "epoch": 1.8431221919594014, + "grad_norm": 0.35593653652552704, + "learning_rate": 3.885584557060727e-05, + "loss": 2.6958, + "step": 39588 + }, + { + "epoch": 1.8431687501454943, + "grad_norm": 0.34092260755107656, + "learning_rate": 3.8853204988184605e-05, + "loss": 2.6839, + "step": 39589 + }, + { + "epoch": 1.8432153083315874, + "grad_norm": 0.3452782394895553, + "learning_rate": 3.88505644384767e-05, + "loss": 2.7872, + "step": 39590 + }, + { + "epoch": 1.8432618665176803, + "grad_norm": 0.3484688350542997, + "learning_rate": 3.884792392149129e-05, + "loss": 2.6732, + "step": 39591 + }, + { + "epoch": 1.8433084247037734, + "grad_norm": 0.31487816257258167, + "learning_rate": 3.88452834372361e-05, + "loss": 2.757, + "step": 39592 + }, + { + "epoch": 1.8433549828898665, + "grad_norm": 0.3611890641982965, + "learning_rate": 3.8842642985718934e-05, + "loss": 2.5954, + "step": 39593 + }, + { + "epoch": 1.8434015410759597, + "grad_norm": 0.3256150185344596, + "learning_rate": 3.884000256694749e-05, + "loss": 2.6693, + "step": 39594 + }, + { + "epoch": 1.8434480992620528, + "grad_norm": 0.33470858558115485, + "learning_rate": 3.883736218092955e-05, + "loss": 2.6516, + "step": 39595 + }, + { + "epoch": 1.843494657448146, + "grad_norm": 0.3479400329617354, + "learning_rate": 3.883472182767286e-05, + "loss": 2.6409, + "step": 39596 + }, + { + "epoch": 1.843541215634239, + "grad_norm": 0.35290015353222726, + "learning_rate": 3.8832081507185147e-05, + "loss": 2.6888, + "step": 39597 + }, + { + "epoch": 1.8435877738203321, + "grad_norm": 0.3773973706513931, + "learning_rate": 3.882944121947419e-05, + "loss": 2.7092, + "step": 39598 + }, + { + "epoch": 1.843634332006425, + "grad_norm": 0.3620422742680515, + "learning_rate": 3.8826800964547695e-05, + "loss": 2.6932, + "step": 39599 + }, + { + "epoch": 1.8436808901925181, + "grad_norm": 0.3445025273303021, + "learning_rate": 3.882416074241346e-05, + "loss": 2.6673, + "step": 39600 + }, + { + "epoch": 1.843727448378611, + "grad_norm": 0.4094561285780981, + "learning_rate": 3.882152055307922e-05, + "loss": 2.7336, + "step": 39601 + }, + { + "epoch": 1.8437740065647041, + "grad_norm": 0.37019631031941935, + "learning_rate": 3.8818880396552685e-05, + "loss": 2.6807, + "step": 39602 + }, + { + "epoch": 1.8438205647507973, + "grad_norm": 0.3798079019588608, + "learning_rate": 3.8816240272841664e-05, + "loss": 2.6214, + "step": 39603 + }, + { + "epoch": 1.8438671229368904, + "grad_norm": 0.3652666498696346, + "learning_rate": 3.8813600181953875e-05, + "loss": 2.7034, + "step": 39604 + }, + { + "epoch": 1.8439136811229835, + "grad_norm": 0.3686706033831073, + "learning_rate": 3.881096012389704e-05, + "loss": 2.6587, + "step": 39605 + }, + { + 
"epoch": 1.8439602393090766, + "grad_norm": 0.3598010620753735, + "learning_rate": 3.8808320098678966e-05, + "loss": 2.7475, + "step": 39606 + }, + { + "epoch": 1.8440067974951697, + "grad_norm": 0.33960271124494895, + "learning_rate": 3.880568010630734e-05, + "loss": 2.7127, + "step": 39607 + }, + { + "epoch": 1.8440533556812626, + "grad_norm": 0.3212542369778514, + "learning_rate": 3.880304014678996e-05, + "loss": 2.7157, + "step": 39608 + }, + { + "epoch": 1.8440999138673557, + "grad_norm": 0.32849611481662083, + "learning_rate": 3.880040022013456e-05, + "loss": 2.7208, + "step": 39609 + }, + { + "epoch": 1.8441464720534488, + "grad_norm": 0.39512989863530557, + "learning_rate": 3.879776032634887e-05, + "loss": 2.6297, + "step": 39610 + }, + { + "epoch": 1.8441930302395417, + "grad_norm": 0.3224487111224073, + "learning_rate": 3.879512046544066e-05, + "loss": 2.699, + "step": 39611 + }, + { + "epoch": 1.8442395884256348, + "grad_norm": 0.3645820531708765, + "learning_rate": 3.8792480637417647e-05, + "loss": 2.5389, + "step": 39612 + }, + { + "epoch": 1.844286146611728, + "grad_norm": 0.33819217741871677, + "learning_rate": 3.878984084228762e-05, + "loss": 2.6841, + "step": 39613 + }, + { + "epoch": 1.844332704797821, + "grad_norm": 0.3440928155058279, + "learning_rate": 3.87872010800583e-05, + "loss": 2.6891, + "step": 39614 + }, + { + "epoch": 1.8443792629839142, + "grad_norm": 0.3791790178322201, + "learning_rate": 3.8784561350737425e-05, + "loss": 2.672, + "step": 39615 + }, + { + "epoch": 1.8444258211700073, + "grad_norm": 0.3577167637059684, + "learning_rate": 3.878192165433278e-05, + "loss": 2.634, + "step": 39616 + }, + { + "epoch": 1.8444723793561004, + "grad_norm": 0.33398670507831085, + "learning_rate": 3.8779281990852065e-05, + "loss": 2.7507, + "step": 39617 + }, + { + "epoch": 1.8445189375421933, + "grad_norm": 0.35612687448092967, + "learning_rate": 3.8776642360303075e-05, + "loss": 2.6056, + "step": 39618 + }, + { + "epoch": 1.8445654957282864, + "grad_norm": 0.3232808668528173, + "learning_rate": 3.8774002762693535e-05, + "loss": 2.6894, + "step": 39619 + }, + { + "epoch": 1.8446120539143795, + "grad_norm": 0.3452540190359104, + "learning_rate": 3.877136319803117e-05, + "loss": 2.6595, + "step": 39620 + }, + { + "epoch": 1.8446586121004724, + "grad_norm": 0.3282658425640382, + "learning_rate": 3.8768723666323764e-05, + "loss": 2.6207, + "step": 39621 + }, + { + "epoch": 1.8447051702865656, + "grad_norm": 0.32383562130028903, + "learning_rate": 3.8766084167579056e-05, + "loss": 2.7142, + "step": 39622 + }, + { + "epoch": 1.8447517284726587, + "grad_norm": 0.3583404439387491, + "learning_rate": 3.876344470180476e-05, + "loss": 2.5915, + "step": 39623 + }, + { + "epoch": 1.8447982866587518, + "grad_norm": 0.324528964621336, + "learning_rate": 3.876080526900867e-05, + "loss": 2.7513, + "step": 39624 + }, + { + "epoch": 1.844844844844845, + "grad_norm": 0.3172104851045555, + "learning_rate": 3.875816586919848e-05, + "loss": 2.735, + "step": 39625 + }, + { + "epoch": 1.844891403030938, + "grad_norm": 0.34746593981515866, + "learning_rate": 3.875552650238199e-05, + "loss": 2.6805, + "step": 39626 + }, + { + "epoch": 1.8449379612170311, + "grad_norm": 0.33555484851462164, + "learning_rate": 3.875288716856693e-05, + "loss": 2.646, + "step": 39627 + }, + { + "epoch": 1.844984519403124, + "grad_norm": 0.3311860171765142, + "learning_rate": 3.875024786776101e-05, + "loss": 2.612, + "step": 39628 + }, + { + "epoch": 1.8450310775892171, + "grad_norm": 0.3277214670010545, + 
"learning_rate": 3.874760859997203e-05, + "loss": 2.6844, + "step": 39629 + }, + { + "epoch": 1.84507763577531, + "grad_norm": 0.3559513909468214, + "learning_rate": 3.874496936520768e-05, + "loss": 2.7267, + "step": 39630 + }, + { + "epoch": 1.8451241939614031, + "grad_norm": 0.3244973674510793, + "learning_rate": 3.874233016347576e-05, + "loss": 2.7655, + "step": 39631 + }, + { + "epoch": 1.8451707521474963, + "grad_norm": 0.3446850349927057, + "learning_rate": 3.8739690994784e-05, + "loss": 2.7604, + "step": 39632 + }, + { + "epoch": 1.8452173103335894, + "grad_norm": 0.31613578577147977, + "learning_rate": 3.87370518591401e-05, + "loss": 2.6377, + "step": 39633 + }, + { + "epoch": 1.8452638685196825, + "grad_norm": 0.3438249021932431, + "learning_rate": 3.873441275655188e-05, + "loss": 2.6864, + "step": 39634 + }, + { + "epoch": 1.8453104267057756, + "grad_norm": 0.3538275731151744, + "learning_rate": 3.873177368702703e-05, + "loss": 2.6293, + "step": 39635 + }, + { + "epoch": 1.8453569848918687, + "grad_norm": 0.32125126523134084, + "learning_rate": 3.8729134650573326e-05, + "loss": 2.6537, + "step": 39636 + }, + { + "epoch": 1.8454035430779618, + "grad_norm": 0.3478573480844714, + "learning_rate": 3.8726495647198504e-05, + "loss": 2.6239, + "step": 39637 + }, + { + "epoch": 1.8454501012640547, + "grad_norm": 0.37887385810232815, + "learning_rate": 3.8723856676910286e-05, + "loss": 2.7346, + "step": 39638 + }, + { + "epoch": 1.8454966594501478, + "grad_norm": 0.3314953479626156, + "learning_rate": 3.872121773971645e-05, + "loss": 2.7199, + "step": 39639 + }, + { + "epoch": 1.8455432176362407, + "grad_norm": 0.3664541402608314, + "learning_rate": 3.871857883562474e-05, + "loss": 2.6933, + "step": 39640 + }, + { + "epoch": 1.8455897758223339, + "grad_norm": 0.3544993594976234, + "learning_rate": 3.871593996464287e-05, + "loss": 2.6853, + "step": 39641 + }, + { + "epoch": 1.845636334008427, + "grad_norm": 0.35591020352997355, + "learning_rate": 3.8713301126778616e-05, + "loss": 2.7261, + "step": 39642 + }, + { + "epoch": 1.84568289219452, + "grad_norm": 0.32652913712058657, + "learning_rate": 3.8710662322039695e-05, + "loss": 2.6662, + "step": 39643 + }, + { + "epoch": 1.8457294503806132, + "grad_norm": 0.37436273989787416, + "learning_rate": 3.8708023550433884e-05, + "loss": 2.6753, + "step": 39644 + }, + { + "epoch": 1.8457760085667063, + "grad_norm": 0.3469409320479042, + "learning_rate": 3.870538481196892e-05, + "loss": 2.7156, + "step": 39645 + }, + { + "epoch": 1.8458225667527994, + "grad_norm": 0.35966784856207795, + "learning_rate": 3.870274610665251e-05, + "loss": 2.7117, + "step": 39646 + }, + { + "epoch": 1.8458691249388925, + "grad_norm": 0.37824241537262815, + "learning_rate": 3.8700107434492453e-05, + "loss": 2.6864, + "step": 39647 + }, + { + "epoch": 1.8459156831249854, + "grad_norm": 0.35424687501913404, + "learning_rate": 3.869746879549645e-05, + "loss": 2.6614, + "step": 39648 + }, + { + "epoch": 1.8459622413110786, + "grad_norm": 0.34103122956374404, + "learning_rate": 3.869483018967228e-05, + "loss": 2.7204, + "step": 39649 + }, + { + "epoch": 1.8460087994971714, + "grad_norm": 0.4008407046597296, + "learning_rate": 3.869219161702767e-05, + "loss": 2.7201, + "step": 39650 + }, + { + "epoch": 1.8460553576832646, + "grad_norm": 0.3530286857924864, + "learning_rate": 3.868955307757034e-05, + "loss": 2.7514, + "step": 39651 + }, + { + "epoch": 1.8461019158693577, + "grad_norm": 0.3589854604495112, + "learning_rate": 3.8686914571308086e-05, + "loss": 2.6885, + "step": 
39652 + }, + { + "epoch": 1.8461484740554508, + "grad_norm": 0.37485842292881727, + "learning_rate": 3.86842760982486e-05, + "loss": 2.6652, + "step": 39653 + }, + { + "epoch": 1.846195032241544, + "grad_norm": 0.3532702057792135, + "learning_rate": 3.868163765839966e-05, + "loss": 2.6007, + "step": 39654 + }, + { + "epoch": 1.846241590427637, + "grad_norm": 0.35276701368782226, + "learning_rate": 3.867899925176901e-05, + "loss": 2.6472, + "step": 39655 + }, + { + "epoch": 1.8462881486137301, + "grad_norm": 0.3625329688845432, + "learning_rate": 3.867636087836435e-05, + "loss": 2.6964, + "step": 39656 + }, + { + "epoch": 1.846334706799823, + "grad_norm": 0.34278293310946545, + "learning_rate": 3.867372253819348e-05, + "loss": 2.7022, + "step": 39657 + }, + { + "epoch": 1.8463812649859161, + "grad_norm": 0.3708650905725238, + "learning_rate": 3.8671084231264124e-05, + "loss": 2.66, + "step": 39658 + }, + { + "epoch": 1.8464278231720093, + "grad_norm": 0.3753353721686603, + "learning_rate": 3.866844595758399e-05, + "loss": 2.715, + "step": 39659 + }, + { + "epoch": 1.8464743813581022, + "grad_norm": 0.3446048001341552, + "learning_rate": 3.866580771716087e-05, + "loss": 2.7347, + "step": 39660 + }, + { + "epoch": 1.8465209395441953, + "grad_norm": 0.3848691894096269, + "learning_rate": 3.866316951000249e-05, + "loss": 2.6684, + "step": 39661 + }, + { + "epoch": 1.8465674977302884, + "grad_norm": 0.3502961681958196, + "learning_rate": 3.86605313361166e-05, + "loss": 2.6499, + "step": 39662 + }, + { + "epoch": 1.8466140559163815, + "grad_norm": 0.34027707691147036, + "learning_rate": 3.865789319551092e-05, + "loss": 2.6761, + "step": 39663 + }, + { + "epoch": 1.8466606141024746, + "grad_norm": 0.35920796116813963, + "learning_rate": 3.865525508819319e-05, + "loss": 2.7884, + "step": 39664 + }, + { + "epoch": 1.8467071722885677, + "grad_norm": 0.35147903218357224, + "learning_rate": 3.8652617014171196e-05, + "loss": 2.7314, + "step": 39665 + }, + { + "epoch": 1.8467537304746608, + "grad_norm": 0.32655969124377543, + "learning_rate": 3.8649978973452626e-05, + "loss": 2.5719, + "step": 39666 + }, + { + "epoch": 1.8468002886607537, + "grad_norm": 0.33091205599487594, + "learning_rate": 3.864734096604528e-05, + "loss": 2.6577, + "step": 39667 + }, + { + "epoch": 1.8468468468468469, + "grad_norm": 0.34380058173359734, + "learning_rate": 3.864470299195686e-05, + "loss": 2.641, + "step": 39668 + }, + { + "epoch": 1.84689340503294, + "grad_norm": 0.3206806816609198, + "learning_rate": 3.8642065051195096e-05, + "loss": 2.5574, + "step": 39669 + }, + { + "epoch": 1.8469399632190329, + "grad_norm": 0.3173735965656126, + "learning_rate": 3.863942714376778e-05, + "loss": 2.6303, + "step": 39670 + }, + { + "epoch": 1.846986521405126, + "grad_norm": 0.349287293699005, + "learning_rate": 3.863678926968262e-05, + "loss": 2.6594, + "step": 39671 + }, + { + "epoch": 1.847033079591219, + "grad_norm": 0.3281651192475297, + "learning_rate": 3.8634151428947354e-05, + "loss": 2.6724, + "step": 39672 + }, + { + "epoch": 1.8470796377773122, + "grad_norm": 0.3375263195139551, + "learning_rate": 3.863151362156975e-05, + "loss": 2.6103, + "step": 39673 + }, + { + "epoch": 1.8471261959634053, + "grad_norm": 0.34186577655536715, + "learning_rate": 3.8628875847557515e-05, + "loss": 2.6825, + "step": 39674 + }, + { + "epoch": 1.8471727541494984, + "grad_norm": 0.3705641356947086, + "learning_rate": 3.8626238106918425e-05, + "loss": 2.7924, + "step": 39675 + }, + { + "epoch": 1.8472193123355916, + "grad_norm": 
0.326541549660241, + "learning_rate": 3.862360039966021e-05, + "loss": 2.7492, + "step": 39676 + }, + { + "epoch": 1.8472658705216844, + "grad_norm": 0.3271857742181944, + "learning_rate": 3.862096272579058e-05, + "loss": 2.7348, + "step": 39677 + }, + { + "epoch": 1.8473124287077776, + "grad_norm": 0.32788315060265566, + "learning_rate": 3.8618325085317323e-05, + "loss": 2.7024, + "step": 39678 + }, + { + "epoch": 1.8473589868938705, + "grad_norm": 0.34781703030940125, + "learning_rate": 3.861568747824814e-05, + "loss": 2.692, + "step": 39679 + }, + { + "epoch": 1.8474055450799636, + "grad_norm": 0.3454660594068778, + "learning_rate": 3.861304990459082e-05, + "loss": 2.6863, + "step": 39680 + }, + { + "epoch": 1.8474521032660567, + "grad_norm": 0.3198240240264308, + "learning_rate": 3.861041236435308e-05, + "loss": 2.7053, + "step": 39681 + }, + { + "epoch": 1.8474986614521498, + "grad_norm": 0.3545493506877482, + "learning_rate": 3.860777485754262e-05, + "loss": 2.6558, + "step": 39682 + }, + { + "epoch": 1.847545219638243, + "grad_norm": 0.35195306526906095, + "learning_rate": 3.8605137384167255e-05, + "loss": 2.7803, + "step": 39683 + }, + { + "epoch": 1.847591777824336, + "grad_norm": 0.3453224521416497, + "learning_rate": 3.860249994423467e-05, + "loss": 2.6409, + "step": 39684 + }, + { + "epoch": 1.8476383360104291, + "grad_norm": 0.3511601490450848, + "learning_rate": 3.859986253775263e-05, + "loss": 2.7121, + "step": 39685 + }, + { + "epoch": 1.8476848941965223, + "grad_norm": 0.3483184560719133, + "learning_rate": 3.859722516472888e-05, + "loss": 2.7195, + "step": 39686 + }, + { + "epoch": 1.8477314523826152, + "grad_norm": 0.4253820421581424, + "learning_rate": 3.859458782517114e-05, + "loss": 2.6629, + "step": 39687 + }, + { + "epoch": 1.8477780105687083, + "grad_norm": 0.3363539017338121, + "learning_rate": 3.859195051908717e-05, + "loss": 2.7994, + "step": 39688 + }, + { + "epoch": 1.8478245687548012, + "grad_norm": 0.3506226276527526, + "learning_rate": 3.858931324648468e-05, + "loss": 2.7448, + "step": 39689 + }, + { + "epoch": 1.8478711269408943, + "grad_norm": 0.35324689885747135, + "learning_rate": 3.858667600737145e-05, + "loss": 2.7037, + "step": 39690 + }, + { + "epoch": 1.8479176851269874, + "grad_norm": 0.3578723223665469, + "learning_rate": 3.858403880175521e-05, + "loss": 2.7139, + "step": 39691 + }, + { + "epoch": 1.8479642433130805, + "grad_norm": 0.3851630380424576, + "learning_rate": 3.858140162964366e-05, + "loss": 2.7513, + "step": 39692 + }, + { + "epoch": 1.8480108014991736, + "grad_norm": 0.32638425052971626, + "learning_rate": 3.85787644910446e-05, + "loss": 2.7441, + "step": 39693 + }, + { + "epoch": 1.8480573596852667, + "grad_norm": 0.3801493110946649, + "learning_rate": 3.857612738596574e-05, + "loss": 2.7092, + "step": 39694 + }, + { + "epoch": 1.8481039178713599, + "grad_norm": 0.3619096108975443, + "learning_rate": 3.857349031441479e-05, + "loss": 2.7149, + "step": 39695 + }, + { + "epoch": 1.8481504760574528, + "grad_norm": 0.3438454459696448, + "learning_rate": 3.8570853276399555e-05, + "loss": 2.741, + "step": 39696 + }, + { + "epoch": 1.8481970342435459, + "grad_norm": 0.36331350980658794, + "learning_rate": 3.856821627192773e-05, + "loss": 2.6037, + "step": 39697 + }, + { + "epoch": 1.848243592429639, + "grad_norm": 0.36889945665267115, + "learning_rate": 3.8565579301007054e-05, + "loss": 2.6677, + "step": 39698 + }, + { + "epoch": 1.8482901506157319, + "grad_norm": 0.4203949968287188, + "learning_rate": 3.856294236364529e-05, + "loss": 
2.7543, + "step": 39699 + }, + { + "epoch": 1.848336708801825, + "grad_norm": 0.3702262672838942, + "learning_rate": 3.856030545985015e-05, + "loss": 2.6509, + "step": 39700 + }, + { + "epoch": 1.848383266987918, + "grad_norm": 0.39742864676532347, + "learning_rate": 3.8557668589629404e-05, + "loss": 2.712, + "step": 39701 + }, + { + "epoch": 1.8484298251740112, + "grad_norm": 0.34485661751799684, + "learning_rate": 3.8555031752990746e-05, + "loss": 2.6127, + "step": 39702 + }, + { + "epoch": 1.8484763833601043, + "grad_norm": 0.3481206798620822, + "learning_rate": 3.8552394949941966e-05, + "loss": 2.7009, + "step": 39703 + }, + { + "epoch": 1.8485229415461975, + "grad_norm": 0.3480942091929728, + "learning_rate": 3.854975818049078e-05, + "loss": 2.7038, + "step": 39704 + }, + { + "epoch": 1.8485694997322906, + "grad_norm": 0.3522069808913274, + "learning_rate": 3.8547121444644904e-05, + "loss": 2.7193, + "step": 39705 + }, + { + "epoch": 1.8486160579183835, + "grad_norm": 0.32702913614929463, + "learning_rate": 3.8544484742412126e-05, + "loss": 2.6828, + "step": 39706 + }, + { + "epoch": 1.8486626161044766, + "grad_norm": 0.3825893939089121, + "learning_rate": 3.8541848073800126e-05, + "loss": 2.6611, + "step": 39707 + }, + { + "epoch": 1.8487091742905697, + "grad_norm": 0.34349362712891207, + "learning_rate": 3.853921143881669e-05, + "loss": 2.7472, + "step": 39708 + }, + { + "epoch": 1.8487557324766626, + "grad_norm": 0.3676294709913216, + "learning_rate": 3.8536574837469554e-05, + "loss": 2.7582, + "step": 39709 + }, + { + "epoch": 1.8488022906627557, + "grad_norm": 0.3601111633642691, + "learning_rate": 3.8533938269766425e-05, + "loss": 2.7099, + "step": 39710 + }, + { + "epoch": 1.8488488488488488, + "grad_norm": 0.364778654984183, + "learning_rate": 3.8531301735715055e-05, + "loss": 2.6551, + "step": 39711 + }, + { + "epoch": 1.848895407034942, + "grad_norm": 0.33000045050074395, + "learning_rate": 3.852866523532319e-05, + "loss": 2.6343, + "step": 39712 + }, + { + "epoch": 1.848941965221035, + "grad_norm": 0.3614712148724316, + "learning_rate": 3.852602876859856e-05, + "loss": 2.7593, + "step": 39713 + }, + { + "epoch": 1.8489885234071282, + "grad_norm": 0.3575648364355102, + "learning_rate": 3.852339233554891e-05, + "loss": 2.7075, + "step": 39714 + }, + { + "epoch": 1.8490350815932213, + "grad_norm": 0.3431796145315896, + "learning_rate": 3.852075593618196e-05, + "loss": 2.6134, + "step": 39715 + }, + { + "epoch": 1.8490816397793142, + "grad_norm": 0.3955181781147646, + "learning_rate": 3.851811957050548e-05, + "loss": 2.5841, + "step": 39716 + }, + { + "epoch": 1.8491281979654073, + "grad_norm": 0.3534358386668616, + "learning_rate": 3.851548323852718e-05, + "loss": 2.7273, + "step": 39717 + }, + { + "epoch": 1.8491747561515002, + "grad_norm": 0.32532971929391374, + "learning_rate": 3.8512846940254785e-05, + "loss": 2.649, + "step": 39718 + }, + { + "epoch": 1.8492213143375933, + "grad_norm": 0.37431762023759985, + "learning_rate": 3.851021067569608e-05, + "loss": 2.7184, + "step": 39719 + }, + { + "epoch": 1.8492678725236864, + "grad_norm": 0.3474155645466589, + "learning_rate": 3.850757444485875e-05, + "loss": 2.7356, + "step": 39720 + }, + { + "epoch": 1.8493144307097795, + "grad_norm": 0.3302710978066518, + "learning_rate": 3.850493824775058e-05, + "loss": 2.722, + "step": 39721 + }, + { + "epoch": 1.8493609888958726, + "grad_norm": 0.36733637264845875, + "learning_rate": 3.8502302084379285e-05, + "loss": 2.667, + "step": 39722 + }, + { + "epoch": 1.8494075470819658, + 
"grad_norm": 0.3283562728683976, + "learning_rate": 3.849966595475259e-05, + "loss": 2.6745, + "step": 39723 + }, + { + "epoch": 1.8494541052680589, + "grad_norm": 0.34827011623021864, + "learning_rate": 3.849702985887824e-05, + "loss": 2.7085, + "step": 39724 + }, + { + "epoch": 1.849500663454152, + "grad_norm": 0.34673726072226474, + "learning_rate": 3.849439379676398e-05, + "loss": 2.753, + "step": 39725 + }, + { + "epoch": 1.8495472216402449, + "grad_norm": 0.36375159009782426, + "learning_rate": 3.849175776841754e-05, + "loss": 2.756, + "step": 39726 + }, + { + "epoch": 1.849593779826338, + "grad_norm": 0.3375563797855751, + "learning_rate": 3.8489121773846666e-05, + "loss": 2.6758, + "step": 39727 + }, + { + "epoch": 1.8496403380124309, + "grad_norm": 0.34993694756979077, + "learning_rate": 3.8486485813059056e-05, + "loss": 2.6986, + "step": 39728 + }, + { + "epoch": 1.849686896198524, + "grad_norm": 0.33436633278920536, + "learning_rate": 3.8483849886062504e-05, + "loss": 2.6706, + "step": 39729 + }, + { + "epoch": 1.8497334543846171, + "grad_norm": 0.3331683999414127, + "learning_rate": 3.8481213992864715e-05, + "loss": 2.6908, + "step": 39730 + }, + { + "epoch": 1.8497800125707102, + "grad_norm": 0.34250583574259863, + "learning_rate": 3.847857813347341e-05, + "loss": 2.6277, + "step": 39731 + }, + { + "epoch": 1.8498265707568033, + "grad_norm": 0.331973155778008, + "learning_rate": 3.8475942307896365e-05, + "loss": 2.5959, + "step": 39732 + }, + { + "epoch": 1.8498731289428965, + "grad_norm": 0.3365731888892836, + "learning_rate": 3.8473306516141274e-05, + "loss": 2.668, + "step": 39733 + }, + { + "epoch": 1.8499196871289896, + "grad_norm": 0.3370441407944435, + "learning_rate": 3.847067075821591e-05, + "loss": 2.6841, + "step": 39734 + }, + { + "epoch": 1.8499662453150825, + "grad_norm": 0.3461996237896407, + "learning_rate": 3.8468035034127995e-05, + "loss": 2.6716, + "step": 39735 + }, + { + "epoch": 1.8500128035011756, + "grad_norm": 0.35130884544857266, + "learning_rate": 3.846539934388524e-05, + "loss": 2.6565, + "step": 39736 + }, + { + "epoch": 1.8500593616872687, + "grad_norm": 0.3280033980192651, + "learning_rate": 3.8462763687495416e-05, + "loss": 2.6377, + "step": 39737 + }, + { + "epoch": 1.8501059198733616, + "grad_norm": 0.3318787818493612, + "learning_rate": 3.846012806496624e-05, + "loss": 2.6385, + "step": 39738 + }, + { + "epoch": 1.8501524780594547, + "grad_norm": 0.3519132868906608, + "learning_rate": 3.845749247630546e-05, + "loss": 2.668, + "step": 39739 + }, + { + "epoch": 1.8501990362455478, + "grad_norm": 0.31910847107446094, + "learning_rate": 3.84548569215208e-05, + "loss": 2.6581, + "step": 39740 + }, + { + "epoch": 1.850245594431641, + "grad_norm": 0.2995304195477218, + "learning_rate": 3.8452221400619975e-05, + "loss": 2.6179, + "step": 39741 + }, + { + "epoch": 1.850292152617734, + "grad_norm": 0.3319659526986009, + "learning_rate": 3.8449585913610764e-05, + "loss": 2.5451, + "step": 39742 + }, + { + "epoch": 1.8503387108038272, + "grad_norm": 0.3241447013330481, + "learning_rate": 3.844695046050086e-05, + "loss": 2.6791, + "step": 39743 + }, + { + "epoch": 1.8503852689899203, + "grad_norm": 0.34697631740247387, + "learning_rate": 3.844431504129805e-05, + "loss": 2.698, + "step": 39744 + }, + { + "epoch": 1.8504318271760132, + "grad_norm": 0.3348108353709951, + "learning_rate": 3.844167965601003e-05, + "loss": 2.6981, + "step": 39745 + }, + { + "epoch": 1.8504783853621063, + "grad_norm": 0.31683718813021094, + "learning_rate": 
3.843904430464451e-05, + "loss": 2.7315, + "step": 39746 + }, + { + "epoch": 1.8505249435481994, + "grad_norm": 0.33411936547814314, + "learning_rate": 3.843640898720929e-05, + "loss": 2.6722, + "step": 39747 + }, + { + "epoch": 1.8505715017342923, + "grad_norm": 0.3301970063375134, + "learning_rate": 3.843377370371207e-05, + "loss": 2.6602, + "step": 39748 + }, + { + "epoch": 1.8506180599203854, + "grad_norm": 0.35505985497508114, + "learning_rate": 3.843113845416056e-05, + "loss": 2.7154, + "step": 39749 + }, + { + "epoch": 1.8506646181064785, + "grad_norm": 0.32235432765123456, + "learning_rate": 3.8428503238562535e-05, + "loss": 2.6273, + "step": 39750 + }, + { + "epoch": 1.8507111762925716, + "grad_norm": 0.3707166770651841, + "learning_rate": 3.842586805692571e-05, + "loss": 2.766, + "step": 39751 + }, + { + "epoch": 1.8507577344786648, + "grad_norm": 0.33366005341539007, + "learning_rate": 3.8423232909257827e-05, + "loss": 2.5821, + "step": 39752 + }, + { + "epoch": 1.8508042926647579, + "grad_norm": 0.32768562004509794, + "learning_rate": 3.8420597795566624e-05, + "loss": 2.7338, + "step": 39753 + }, + { + "epoch": 1.850850850850851, + "grad_norm": 0.35532018816922634, + "learning_rate": 3.84179627158598e-05, + "loss": 2.5963, + "step": 39754 + }, + { + "epoch": 1.8508974090369439, + "grad_norm": 0.35115263832403587, + "learning_rate": 3.841532767014513e-05, + "loss": 2.7517, + "step": 39755 + }, + { + "epoch": 1.850943967223037, + "grad_norm": 0.32244261248056066, + "learning_rate": 3.841269265843032e-05, + "loss": 2.6496, + "step": 39756 + }, + { + "epoch": 1.85099052540913, + "grad_norm": 0.3593775051774948, + "learning_rate": 3.841005768072312e-05, + "loss": 2.6562, + "step": 39757 + }, + { + "epoch": 1.851037083595223, + "grad_norm": 0.3825997469508224, + "learning_rate": 3.840742273703127e-05, + "loss": 2.6273, + "step": 39758 + }, + { + "epoch": 1.8510836417813161, + "grad_norm": 0.3626773781061181, + "learning_rate": 3.840478782736248e-05, + "loss": 2.6671, + "step": 39759 + }, + { + "epoch": 1.8511301999674092, + "grad_norm": 0.3423220260805525, + "learning_rate": 3.840215295172451e-05, + "loss": 2.7271, + "step": 39760 + }, + { + "epoch": 1.8511767581535024, + "grad_norm": 0.3742379054171614, + "learning_rate": 3.839951811012506e-05, + "loss": 2.7158, + "step": 39761 + }, + { + "epoch": 1.8512233163395955, + "grad_norm": 0.3399759279408071, + "learning_rate": 3.8396883302571904e-05, + "loss": 2.7215, + "step": 39762 + }, + { + "epoch": 1.8512698745256886, + "grad_norm": 0.33067197416455957, + "learning_rate": 3.839424852907273e-05, + "loss": 2.6374, + "step": 39763 + }, + { + "epoch": 1.8513164327117817, + "grad_norm": 0.35852671378378387, + "learning_rate": 3.83916137896353e-05, + "loss": 2.6372, + "step": 39764 + }, + { + "epoch": 1.8513629908978746, + "grad_norm": 0.34265527926912626, + "learning_rate": 3.8388979084267354e-05, + "loss": 2.708, + "step": 39765 + }, + { + "epoch": 1.8514095490839677, + "grad_norm": 0.3394570102925302, + "learning_rate": 3.838634441297661e-05, + "loss": 2.5625, + "step": 39766 + }, + { + "epoch": 1.8514561072700606, + "grad_norm": 0.33666855691059905, + "learning_rate": 3.838370977577077e-05, + "loss": 2.6943, + "step": 39767 + }, + { + "epoch": 1.8515026654561537, + "grad_norm": 0.3313875120059189, + "learning_rate": 3.838107517265762e-05, + "loss": 2.7608, + "step": 39768 + }, + { + "epoch": 1.8515492236422468, + "grad_norm": 0.34660511468773414, + "learning_rate": 3.8378440603644855e-05, + "loss": 2.7005, + "step": 39769 + }, + { + 
"epoch": 1.85159578182834, + "grad_norm": 0.33880458297206445, + "learning_rate": 3.8375806068740236e-05, + "loss": 2.6807, + "step": 39770 + }, + { + "epoch": 1.851642340014433, + "grad_norm": 0.34591798406527996, + "learning_rate": 3.8373171567951496e-05, + "loss": 2.7824, + "step": 39771 + }, + { + "epoch": 1.8516888982005262, + "grad_norm": 0.3716150688738432, + "learning_rate": 3.837053710128632e-05, + "loss": 2.7922, + "step": 39772 + }, + { + "epoch": 1.8517354563866193, + "grad_norm": 0.32599766159366467, + "learning_rate": 3.836790266875249e-05, + "loss": 2.6919, + "step": 39773 + }, + { + "epoch": 1.8517820145727124, + "grad_norm": 0.35894805964758, + "learning_rate": 3.8365268270357713e-05, + "loss": 2.6346, + "step": 39774 + }, + { + "epoch": 1.8518285727588053, + "grad_norm": 0.33550826364559744, + "learning_rate": 3.836263390610974e-05, + "loss": 2.6771, + "step": 39775 + }, + { + "epoch": 1.8518751309448984, + "grad_norm": 0.31581304308607094, + "learning_rate": 3.835999957601628e-05, + "loss": 2.7802, + "step": 39776 + }, + { + "epoch": 1.8519216891309913, + "grad_norm": 0.39573902069119166, + "learning_rate": 3.835736528008507e-05, + "loss": 2.7182, + "step": 39777 + }, + { + "epoch": 1.8519682473170844, + "grad_norm": 0.3365422761197842, + "learning_rate": 3.8354731018323856e-05, + "loss": 2.6513, + "step": 39778 + }, + { + "epoch": 1.8520148055031775, + "grad_norm": 0.3669091110860149, + "learning_rate": 3.835209679074034e-05, + "loss": 2.6974, + "step": 39779 + }, + { + "epoch": 1.8520613636892707, + "grad_norm": 0.35506275276937205, + "learning_rate": 3.8349462597342304e-05, + "loss": 2.6963, + "step": 39780 + }, + { + "epoch": 1.8521079218753638, + "grad_norm": 0.3254711492557505, + "learning_rate": 3.8346828438137435e-05, + "loss": 2.6445, + "step": 39781 + }, + { + "epoch": 1.8521544800614569, + "grad_norm": 0.35654590215082244, + "learning_rate": 3.834419431313345e-05, + "loss": 2.6298, + "step": 39782 + }, + { + "epoch": 1.85220103824755, + "grad_norm": 0.36153948138422326, + "learning_rate": 3.834156022233814e-05, + "loss": 2.7412, + "step": 39783 + }, + { + "epoch": 1.852247596433643, + "grad_norm": 0.3665524378846252, + "learning_rate": 3.8338926165759195e-05, + "loss": 2.6812, + "step": 39784 + }, + { + "epoch": 1.852294154619736, + "grad_norm": 0.3581545037691786, + "learning_rate": 3.833629214340434e-05, + "loss": 2.7137, + "step": 39785 + }, + { + "epoch": 1.8523407128058291, + "grad_norm": 0.3182108132202504, + "learning_rate": 3.833365815528133e-05, + "loss": 2.6754, + "step": 39786 + }, + { + "epoch": 1.852387270991922, + "grad_norm": 0.35150088717987743, + "learning_rate": 3.833102420139788e-05, + "loss": 2.6533, + "step": 39787 + }, + { + "epoch": 1.8524338291780151, + "grad_norm": 0.3257381884874515, + "learning_rate": 3.832839028176173e-05, + "loss": 2.7125, + "step": 39788 + }, + { + "epoch": 1.8524803873641082, + "grad_norm": 0.3322012874886816, + "learning_rate": 3.8325756396380605e-05, + "loss": 2.6902, + "step": 39789 + }, + { + "epoch": 1.8525269455502014, + "grad_norm": 0.31199329004323717, + "learning_rate": 3.8323122545262226e-05, + "loss": 2.7166, + "step": 39790 + }, + { + "epoch": 1.8525735037362945, + "grad_norm": 0.30856815192919596, + "learning_rate": 3.832048872841434e-05, + "loss": 2.7051, + "step": 39791 + }, + { + "epoch": 1.8526200619223876, + "grad_norm": 0.36009623483347225, + "learning_rate": 3.831785494584465e-05, + "loss": 2.681, + "step": 39792 + }, + { + "epoch": 1.8526666201084807, + "grad_norm": 0.358212061999165, + 
"learning_rate": 3.831522119756093e-05, + "loss": 2.7339, + "step": 39793 + }, + { + "epoch": 1.8527131782945736, + "grad_norm": 0.3153878475205626, + "learning_rate": 3.831258748357088e-05, + "loss": 2.6057, + "step": 39794 + }, + { + "epoch": 1.8527597364806667, + "grad_norm": 0.3556515075551125, + "learning_rate": 3.830995380388222e-05, + "loss": 2.6383, + "step": 39795 + }, + { + "epoch": 1.8528062946667598, + "grad_norm": 0.35176081959933897, + "learning_rate": 3.83073201585027e-05, + "loss": 2.7495, + "step": 39796 + }, + { + "epoch": 1.8528528528528527, + "grad_norm": 0.3403327189284872, + "learning_rate": 3.830468654744004e-05, + "loss": 2.6721, + "step": 39797 + }, + { + "epoch": 1.8528994110389458, + "grad_norm": 0.3426236667222747, + "learning_rate": 3.830205297070198e-05, + "loss": 2.7469, + "step": 39798 + }, + { + "epoch": 1.852945969225039, + "grad_norm": 0.3155855259374961, + "learning_rate": 3.829941942829625e-05, + "loss": 2.7067, + "step": 39799 + }, + { + "epoch": 1.852992527411132, + "grad_norm": 0.3238887345447974, + "learning_rate": 3.829678592023056e-05, + "loss": 2.6364, + "step": 39800 + }, + { + "epoch": 1.8530390855972252, + "grad_norm": 0.32662178119222357, + "learning_rate": 3.829415244651266e-05, + "loss": 2.5506, + "step": 39801 + }, + { + "epoch": 1.8530856437833183, + "grad_norm": 0.33462672588734954, + "learning_rate": 3.8291519007150255e-05, + "loss": 2.6875, + "step": 39802 + }, + { + "epoch": 1.8531322019694114, + "grad_norm": 0.33487198327690315, + "learning_rate": 3.828888560215109e-05, + "loss": 2.7074, + "step": 39803 + }, + { + "epoch": 1.8531787601555043, + "grad_norm": 0.37899245100118295, + "learning_rate": 3.828625223152291e-05, + "loss": 2.6898, + "step": 39804 + }, + { + "epoch": 1.8532253183415974, + "grad_norm": 0.32736455756317817, + "learning_rate": 3.82836188952734e-05, + "loss": 2.7212, + "step": 39805 + }, + { + "epoch": 1.8532718765276903, + "grad_norm": 0.3351449547682207, + "learning_rate": 3.828098559341033e-05, + "loss": 2.6384, + "step": 39806 + }, + { + "epoch": 1.8533184347137834, + "grad_norm": 0.32412893081724226, + "learning_rate": 3.827835232594143e-05, + "loss": 2.6908, + "step": 39807 + }, + { + "epoch": 1.8533649928998766, + "grad_norm": 0.3646575970982723, + "learning_rate": 3.8275719092874366e-05, + "loss": 2.7467, + "step": 39808 + }, + { + "epoch": 1.8534115510859697, + "grad_norm": 0.3407769792644999, + "learning_rate": 3.827308589421695e-05, + "loss": 2.7975, + "step": 39809 + }, + { + "epoch": 1.8534581092720628, + "grad_norm": 0.3485506189823258, + "learning_rate": 3.8270452729976844e-05, + "loss": 2.5333, + "step": 39810 + }, + { + "epoch": 1.853504667458156, + "grad_norm": 0.33252379900971313, + "learning_rate": 3.8267819600161825e-05, + "loss": 2.8525, + "step": 39811 + }, + { + "epoch": 1.853551225644249, + "grad_norm": 0.34123793232700084, + "learning_rate": 3.826518650477959e-05, + "loss": 2.7205, + "step": 39812 + }, + { + "epoch": 1.8535977838303421, + "grad_norm": 0.33775739365545154, + "learning_rate": 3.8262553443837885e-05, + "loss": 2.6012, + "step": 39813 + }, + { + "epoch": 1.853644342016435, + "grad_norm": 0.34543455089100217, + "learning_rate": 3.825992041734442e-05, + "loss": 2.8157, + "step": 39814 + }, + { + "epoch": 1.8536909002025281, + "grad_norm": 0.34649288250724763, + "learning_rate": 3.8257287425306933e-05, + "loss": 2.6641, + "step": 39815 + }, + { + "epoch": 1.853737458388621, + "grad_norm": 0.3585768865066088, + "learning_rate": 3.825465446773316e-05, + "loss": 2.77, + "step": 39816 
+ }, + { + "epoch": 1.8537840165747141, + "grad_norm": 0.3405693002559091, + "learning_rate": 3.825202154463082e-05, + "loss": 2.6866, + "step": 39817 + }, + { + "epoch": 1.8538305747608073, + "grad_norm": 0.35876151150230806, + "learning_rate": 3.824938865600761e-05, + "loss": 2.708, + "step": 39818 + }, + { + "epoch": 1.8538771329469004, + "grad_norm": 0.31883278919573316, + "learning_rate": 3.824675580187132e-05, + "loss": 2.7707, + "step": 39819 + }, + { + "epoch": 1.8539236911329935, + "grad_norm": 0.35742650405817705, + "learning_rate": 3.824412298222961e-05, + "loss": 2.6871, + "step": 39820 + }, + { + "epoch": 1.8539702493190866, + "grad_norm": 0.36236481187925496, + "learning_rate": 3.824149019709028e-05, + "loss": 2.6261, + "step": 39821 + }, + { + "epoch": 1.8540168075051797, + "grad_norm": 0.3435231836966362, + "learning_rate": 3.823885744646101e-05, + "loss": 2.7727, + "step": 39822 + }, + { + "epoch": 1.8540633656912726, + "grad_norm": 0.32621074457574106, + "learning_rate": 3.823622473034951e-05, + "loss": 2.7005, + "step": 39823 + }, + { + "epoch": 1.8541099238773657, + "grad_norm": 0.3534697115857538, + "learning_rate": 3.823359204876355e-05, + "loss": 2.6208, + "step": 39824 + }, + { + "epoch": 1.8541564820634588, + "grad_norm": 0.34159939785744786, + "learning_rate": 3.8230959401710845e-05, + "loss": 2.6841, + "step": 39825 + }, + { + "epoch": 1.8542030402495517, + "grad_norm": 0.31480032444303563, + "learning_rate": 3.8228326789199095e-05, + "loss": 2.7193, + "step": 39826 + }, + { + "epoch": 1.8542495984356449, + "grad_norm": 0.3217793834859715, + "learning_rate": 3.822569421123606e-05, + "loss": 2.7357, + "step": 39827 + }, + { + "epoch": 1.854296156621738, + "grad_norm": 0.35639577435581926, + "learning_rate": 3.822306166782944e-05, + "loss": 2.6422, + "step": 39828 + }, + { + "epoch": 1.854342714807831, + "grad_norm": 0.35550878755838033, + "learning_rate": 3.822042915898699e-05, + "loss": 2.5977, + "step": 39829 + }, + { + "epoch": 1.8543892729939242, + "grad_norm": 0.3238598335202929, + "learning_rate": 3.8217796684716424e-05, + "loss": 2.6353, + "step": 39830 + }, + { + "epoch": 1.8544358311800173, + "grad_norm": 0.38194471132847935, + "learning_rate": 3.821516424502544e-05, + "loss": 2.7175, + "step": 39831 + }, + { + "epoch": 1.8544823893661104, + "grad_norm": 0.3290638126538416, + "learning_rate": 3.8212531839921806e-05, + "loss": 2.7661, + "step": 39832 + }, + { + "epoch": 1.8545289475522033, + "grad_norm": 0.3365936828517351, + "learning_rate": 3.820989946941321e-05, + "loss": 2.6333, + "step": 39833 + }, + { + "epoch": 1.8545755057382964, + "grad_norm": 0.31489085115063625, + "learning_rate": 3.820726713350742e-05, + "loss": 2.6379, + "step": 39834 + }, + { + "epoch": 1.8546220639243896, + "grad_norm": 0.33306120829640823, + "learning_rate": 3.820463483221213e-05, + "loss": 2.688, + "step": 39835 + }, + { + "epoch": 1.8546686221104824, + "grad_norm": 0.3268569586919153, + "learning_rate": 3.820200256553507e-05, + "loss": 2.7254, + "step": 39836 + }, + { + "epoch": 1.8547151802965756, + "grad_norm": 0.3307118311815735, + "learning_rate": 3.8199370333483976e-05, + "loss": 2.6329, + "step": 39837 + }, + { + "epoch": 1.8547617384826687, + "grad_norm": 0.36087506854230866, + "learning_rate": 3.819673813606657e-05, + "loss": 2.7449, + "step": 39838 + }, + { + "epoch": 1.8548082966687618, + "grad_norm": 0.3168268783775129, + "learning_rate": 3.8194105973290575e-05, + "loss": 2.7564, + "step": 39839 + }, + { + "epoch": 1.854854854854855, + "grad_norm": 
0.37155711004729247, + "learning_rate": 3.81914738451637e-05, + "loss": 2.7236, + "step": 39840 + }, + { + "epoch": 1.854901413040948, + "grad_norm": 0.35968920043390235, + "learning_rate": 3.8188841751693705e-05, + "loss": 2.6524, + "step": 39841 + }, + { + "epoch": 1.8549479712270411, + "grad_norm": 0.3334162751195264, + "learning_rate": 3.8186209692888294e-05, + "loss": 2.7207, + "step": 39842 + }, + { + "epoch": 1.854994529413134, + "grad_norm": 0.3286261553671142, + "learning_rate": 3.81835776687552e-05, + "loss": 2.6911, + "step": 39843 + }, + { + "epoch": 1.8550410875992271, + "grad_norm": 0.33889559141522396, + "learning_rate": 3.8180945679302116e-05, + "loss": 2.6339, + "step": 39844 + }, + { + "epoch": 1.85508764578532, + "grad_norm": 0.34236467189220005, + "learning_rate": 3.817831372453681e-05, + "loss": 2.7554, + "step": 39845 + }, + { + "epoch": 1.8551342039714132, + "grad_norm": 0.32011803448671383, + "learning_rate": 3.8175681804466976e-05, + "loss": 2.7292, + "step": 39846 + }, + { + "epoch": 1.8551807621575063, + "grad_norm": 0.33854530448562614, + "learning_rate": 3.817304991910037e-05, + "loss": 2.6447, + "step": 39847 + }, + { + "epoch": 1.8552273203435994, + "grad_norm": 0.3203430682097409, + "learning_rate": 3.817041806844469e-05, + "loss": 2.6909, + "step": 39848 + }, + { + "epoch": 1.8552738785296925, + "grad_norm": 0.3323277474998165, + "learning_rate": 3.8167786252507656e-05, + "loss": 2.6972, + "step": 39849 + }, + { + "epoch": 1.8553204367157856, + "grad_norm": 0.3235545398216831, + "learning_rate": 3.816515447129702e-05, + "loss": 2.7118, + "step": 39850 + }, + { + "epoch": 1.8553669949018787, + "grad_norm": 0.33771865226052367, + "learning_rate": 3.816252272482048e-05, + "loss": 2.7809, + "step": 39851 + }, + { + "epoch": 1.8554135530879718, + "grad_norm": 0.3522337608864623, + "learning_rate": 3.8159891013085774e-05, + "loss": 2.8175, + "step": 39852 + }, + { + "epoch": 1.8554601112740647, + "grad_norm": 0.35785973916547337, + "learning_rate": 3.8157259336100616e-05, + "loss": 2.6924, + "step": 39853 + }, + { + "epoch": 1.8555066694601579, + "grad_norm": 0.3333123027014532, + "learning_rate": 3.815462769387274e-05, + "loss": 2.7045, + "step": 39854 + }, + { + "epoch": 1.8555532276462507, + "grad_norm": 0.35525290142211585, + "learning_rate": 3.815199608640987e-05, + "loss": 2.6763, + "step": 39855 + }, + { + "epoch": 1.8555997858323439, + "grad_norm": 0.35482913813307065, + "learning_rate": 3.81493645137197e-05, + "loss": 2.6702, + "step": 39856 + }, + { + "epoch": 1.855646344018437, + "grad_norm": 0.37430211518404016, + "learning_rate": 3.814673297581001e-05, + "loss": 2.5983, + "step": 39857 + }, + { + "epoch": 1.85569290220453, + "grad_norm": 0.35843716084450894, + "learning_rate": 3.814410147268847e-05, + "loss": 2.7287, + "step": 39858 + }, + { + "epoch": 1.8557394603906232, + "grad_norm": 0.3509378328619828, + "learning_rate": 3.814147000436282e-05, + "loss": 2.6991, + "step": 39859 + }, + { + "epoch": 1.8557860185767163, + "grad_norm": 0.3492457506254095, + "learning_rate": 3.8138838570840804e-05, + "loss": 2.6999, + "step": 39860 + }, + { + "epoch": 1.8558325767628094, + "grad_norm": 0.36668863056892514, + "learning_rate": 3.8136207172130134e-05, + "loss": 2.734, + "step": 39861 + }, + { + "epoch": 1.8558791349489026, + "grad_norm": 0.31726572423676896, + "learning_rate": 3.81335758082385e-05, + "loss": 2.6162, + "step": 39862 + }, + { + "epoch": 1.8559256931349954, + "grad_norm": 0.3471412392859138, + "learning_rate": 3.813094447917367e-05, + 
"loss": 2.6919, + "step": 39863 + }, + { + "epoch": 1.8559722513210886, + "grad_norm": 0.3587326617627051, + "learning_rate": 3.812831318494335e-05, + "loss": 2.6493, + "step": 39864 + }, + { + "epoch": 1.8560188095071815, + "grad_norm": 0.3176699660648066, + "learning_rate": 3.812568192555526e-05, + "loss": 2.6355, + "step": 39865 + }, + { + "epoch": 1.8560653676932746, + "grad_norm": 0.3091621602784366, + "learning_rate": 3.812305070101712e-05, + "loss": 2.6996, + "step": 39866 + }, + { + "epoch": 1.8561119258793677, + "grad_norm": 0.349081913385262, + "learning_rate": 3.812041951133666e-05, + "loss": 2.7868, + "step": 39867 + }, + { + "epoch": 1.8561584840654608, + "grad_norm": 0.3498082082577866, + "learning_rate": 3.81177883565216e-05, + "loss": 2.773, + "step": 39868 + }, + { + "epoch": 1.856205042251554, + "grad_norm": 0.336980668326184, + "learning_rate": 3.811515723657964e-05, + "loss": 2.6822, + "step": 39869 + }, + { + "epoch": 1.856251600437647, + "grad_norm": 0.35211029884804557, + "learning_rate": 3.8112526151518544e-05, + "loss": 2.676, + "step": 39870 + }, + { + "epoch": 1.8562981586237401, + "grad_norm": 0.3564832702519039, + "learning_rate": 3.8109895101346014e-05, + "loss": 2.6227, + "step": 39871 + }, + { + "epoch": 1.856344716809833, + "grad_norm": 0.3124771559197532, + "learning_rate": 3.810726408606975e-05, + "loss": 2.7227, + "step": 39872 + }, + { + "epoch": 1.8563912749959262, + "grad_norm": 0.35152373484636196, + "learning_rate": 3.810463310569752e-05, + "loss": 2.524, + "step": 39873 + }, + { + "epoch": 1.8564378331820193, + "grad_norm": 0.3598373743343571, + "learning_rate": 3.810200216023699e-05, + "loss": 2.6176, + "step": 39874 + }, + { + "epoch": 1.8564843913681122, + "grad_norm": 0.3294872435446298, + "learning_rate": 3.809937124969594e-05, + "loss": 2.6984, + "step": 39875 + }, + { + "epoch": 1.8565309495542053, + "grad_norm": 0.37626844605315446, + "learning_rate": 3.809674037408206e-05, + "loss": 2.6465, + "step": 39876 + }, + { + "epoch": 1.8565775077402984, + "grad_norm": 0.3551351018949651, + "learning_rate": 3.8094109533403064e-05, + "loss": 2.6857, + "step": 39877 + }, + { + "epoch": 1.8566240659263915, + "grad_norm": 0.3237537269836239, + "learning_rate": 3.809147872766669e-05, + "loss": 2.7232, + "step": 39878 + }, + { + "epoch": 1.8566706241124846, + "grad_norm": 0.3583592184473786, + "learning_rate": 3.808884795688065e-05, + "loss": 2.6672, + "step": 39879 + }, + { + "epoch": 1.8567171822985777, + "grad_norm": 0.3321316768531657, + "learning_rate": 3.808621722105266e-05, + "loss": 2.6659, + "step": 39880 + }, + { + "epoch": 1.8567637404846709, + "grad_norm": 0.34605805895589814, + "learning_rate": 3.808358652019046e-05, + "loss": 2.6708, + "step": 39881 + }, + { + "epoch": 1.8568102986707637, + "grad_norm": 0.3704379302739313, + "learning_rate": 3.808095585430175e-05, + "loss": 2.7322, + "step": 39882 + }, + { + "epoch": 1.8568568568568569, + "grad_norm": 0.3151614980764948, + "learning_rate": 3.8078325223394267e-05, + "loss": 2.6086, + "step": 39883 + }, + { + "epoch": 1.85690341504295, + "grad_norm": 0.3393274237012454, + "learning_rate": 3.8075694627475735e-05, + "loss": 2.6923, + "step": 39884 + }, + { + "epoch": 1.8569499732290429, + "grad_norm": 0.34968942748292486, + "learning_rate": 3.807306406655384e-05, + "loss": 2.6514, + "step": 39885 + }, + { + "epoch": 1.856996531415136, + "grad_norm": 0.3322763663611103, + "learning_rate": 3.807043354063634e-05, + "loss": 2.698, + "step": 39886 + }, + { + "epoch": 1.857043089601229, + 
"grad_norm": 0.3700923611708707, + "learning_rate": 3.8067803049730934e-05, + "loss": 2.8427, + "step": 39887 + }, + { + "epoch": 1.8570896477873222, + "grad_norm": 0.3650298986703999, + "learning_rate": 3.8065172593845366e-05, + "loss": 2.7882, + "step": 39888 + }, + { + "epoch": 1.8571362059734153, + "grad_norm": 0.3389993962316578, + "learning_rate": 3.806254217298734e-05, + "loss": 2.7472, + "step": 39889 + }, + { + "epoch": 1.8571827641595084, + "grad_norm": 0.36775398116226476, + "learning_rate": 3.805991178716457e-05, + "loss": 2.561, + "step": 39890 + }, + { + "epoch": 1.8572293223456016, + "grad_norm": 0.3266410296447352, + "learning_rate": 3.805728143638479e-05, + "loss": 2.5959, + "step": 39891 + }, + { + "epoch": 1.8572758805316945, + "grad_norm": 0.33446484240608615, + "learning_rate": 3.805465112065569e-05, + "loss": 2.6793, + "step": 39892 + }, + { + "epoch": 1.8573224387177876, + "grad_norm": 0.38609563149150167, + "learning_rate": 3.805202083998504e-05, + "loss": 2.6971, + "step": 39893 + }, + { + "epoch": 1.8573689969038805, + "grad_norm": 0.34170678712022456, + "learning_rate": 3.804939059438052e-05, + "loss": 2.5658, + "step": 39894 + }, + { + "epoch": 1.8574155550899736, + "grad_norm": 0.3521112675732111, + "learning_rate": 3.8046760383849846e-05, + "loss": 2.5968, + "step": 39895 + }, + { + "epoch": 1.8574621132760667, + "grad_norm": 0.3703161720289099, + "learning_rate": 3.8044130208400774e-05, + "loss": 2.6626, + "step": 39896 + }, + { + "epoch": 1.8575086714621598, + "grad_norm": 0.4026149603435217, + "learning_rate": 3.8041500068041e-05, + "loss": 2.7021, + "step": 39897 + }, + { + "epoch": 1.857555229648253, + "grad_norm": 0.3817989454322643, + "learning_rate": 3.803886996277823e-05, + "loss": 2.7019, + "step": 39898 + }, + { + "epoch": 1.857601787834346, + "grad_norm": 0.3641101169688159, + "learning_rate": 3.803623989262022e-05, + "loss": 2.711, + "step": 39899 + }, + { + "epoch": 1.8576483460204392, + "grad_norm": 0.3933875599508125, + "learning_rate": 3.803360985757464e-05, + "loss": 2.6312, + "step": 39900 + }, + { + "epoch": 1.8576949042065323, + "grad_norm": 0.35073750794137126, + "learning_rate": 3.8030979857649257e-05, + "loss": 2.7128, + "step": 39901 + }, + { + "epoch": 1.8577414623926252, + "grad_norm": 0.34635731363618677, + "learning_rate": 3.8028349892851775e-05, + "loss": 2.5508, + "step": 39902 + }, + { + "epoch": 1.8577880205787183, + "grad_norm": 0.40114290696271815, + "learning_rate": 3.8025719963189885e-05, + "loss": 2.7301, + "step": 39903 + }, + { + "epoch": 1.8578345787648112, + "grad_norm": 0.33444078764584634, + "learning_rate": 3.802309006867135e-05, + "loss": 2.7231, + "step": 39904 + }, + { + "epoch": 1.8578811369509043, + "grad_norm": 0.3733024598564045, + "learning_rate": 3.8020460209303844e-05, + "loss": 2.6761, + "step": 39905 + }, + { + "epoch": 1.8579276951369974, + "grad_norm": 0.37018119046870346, + "learning_rate": 3.801783038509512e-05, + "loss": 2.6864, + "step": 39906 + }, + { + "epoch": 1.8579742533230905, + "grad_norm": 0.3361549432076558, + "learning_rate": 3.801520059605289e-05, + "loss": 2.735, + "step": 39907 + }, + { + "epoch": 1.8580208115091836, + "grad_norm": 0.3500112360508072, + "learning_rate": 3.801257084218484e-05, + "loss": 2.7109, + "step": 39908 + }, + { + "epoch": 1.8580673696952767, + "grad_norm": 0.375233473303731, + "learning_rate": 3.800994112349874e-05, + "loss": 2.7385, + "step": 39909 + }, + { + "epoch": 1.8581139278813699, + "grad_norm": 0.3412725706188003, + "learning_rate": 
3.800731144000226e-05, + "loss": 2.6335, + "step": 39910 + }, + { + "epoch": 1.8581604860674628, + "grad_norm": 0.347699563762087, + "learning_rate": 3.800468179170315e-05, + "loss": 2.6698, + "step": 39911 + }, + { + "epoch": 1.8582070442535559, + "grad_norm": 0.3484411891380453, + "learning_rate": 3.800205217860912e-05, + "loss": 2.6942, + "step": 39912 + }, + { + "epoch": 1.858253602439649, + "grad_norm": 0.3465886865240326, + "learning_rate": 3.7999422600727876e-05, + "loss": 2.6925, + "step": 39913 + }, + { + "epoch": 1.8583001606257419, + "grad_norm": 0.3403192550041521, + "learning_rate": 3.799679305806716e-05, + "loss": 2.647, + "step": 39914 + }, + { + "epoch": 1.858346718811835, + "grad_norm": 0.35576016876065286, + "learning_rate": 3.7994163550634674e-05, + "loss": 2.6354, + "step": 39915 + }, + { + "epoch": 1.8583932769979281, + "grad_norm": 0.3416519799741192, + "learning_rate": 3.799153407843812e-05, + "loss": 2.6628, + "step": 39916 + }, + { + "epoch": 1.8584398351840212, + "grad_norm": 0.3422892146192616, + "learning_rate": 3.7988904641485247e-05, + "loss": 2.5417, + "step": 39917 + }, + { + "epoch": 1.8584863933701143, + "grad_norm": 0.3351332965244589, + "learning_rate": 3.798627523978373e-05, + "loss": 2.6367, + "step": 39918 + }, + { + "epoch": 1.8585329515562075, + "grad_norm": 0.3425205498769476, + "learning_rate": 3.798364587334133e-05, + "loss": 2.6196, + "step": 39919 + }, + { + "epoch": 1.8585795097423006, + "grad_norm": 0.3369531324288884, + "learning_rate": 3.798101654216575e-05, + "loss": 2.6946, + "step": 39920 + }, + { + "epoch": 1.8586260679283935, + "grad_norm": 0.35155586474190914, + "learning_rate": 3.797838724626468e-05, + "loss": 2.7048, + "step": 39921 + }, + { + "epoch": 1.8586726261144866, + "grad_norm": 0.3603522086668158, + "learning_rate": 3.797575798564588e-05, + "loss": 2.6731, + "step": 39922 + }, + { + "epoch": 1.8587191843005797, + "grad_norm": 0.3241153685713656, + "learning_rate": 3.797312876031703e-05, + "loss": 2.6615, + "step": 39923 + }, + { + "epoch": 1.8587657424866726, + "grad_norm": 0.32815605071692294, + "learning_rate": 3.797049957028588e-05, + "loss": 2.6772, + "step": 39924 + }, + { + "epoch": 1.8588123006727657, + "grad_norm": 0.34823543961525677, + "learning_rate": 3.796787041556013e-05, + "loss": 2.7273, + "step": 39925 + }, + { + "epoch": 1.8588588588588588, + "grad_norm": 0.326027611470838, + "learning_rate": 3.796524129614747e-05, + "loss": 2.692, + "step": 39926 + }, + { + "epoch": 1.858905417044952, + "grad_norm": 0.31683325443713073, + "learning_rate": 3.796261221205566e-05, + "loss": 2.6457, + "step": 39927 + }, + { + "epoch": 1.858951975231045, + "grad_norm": 0.36615367454019154, + "learning_rate": 3.795998316329239e-05, + "loss": 2.699, + "step": 39928 + }, + { + "epoch": 1.8589985334171382, + "grad_norm": 0.34848697932255135, + "learning_rate": 3.79573541498654e-05, + "loss": 2.7074, + "step": 39929 + }, + { + "epoch": 1.8590450916032313, + "grad_norm": 0.32780045497547944, + "learning_rate": 3.795472517178238e-05, + "loss": 2.7654, + "step": 39930 + }, + { + "epoch": 1.8590916497893242, + "grad_norm": 0.36148728235687977, + "learning_rate": 3.795209622905104e-05, + "loss": 2.7066, + "step": 39931 + }, + { + "epoch": 1.8591382079754173, + "grad_norm": 0.3605665440120751, + "learning_rate": 3.794946732167913e-05, + "loss": 2.7101, + "step": 39932 + }, + { + "epoch": 1.8591847661615102, + "grad_norm": 0.3462682707516011, + "learning_rate": 3.794683844967434e-05, + "loss": 2.6198, + "step": 39933 + }, + { + "epoch": 
1.8592313243476033, + "grad_norm": 0.3248504910900691, + "learning_rate": 3.794420961304438e-05, + "loss": 2.6488, + "step": 39934 + }, + { + "epoch": 1.8592778825336964, + "grad_norm": 0.3719165622965323, + "learning_rate": 3.794158081179699e-05, + "loss": 2.6979, + "step": 39935 + }, + { + "epoch": 1.8593244407197895, + "grad_norm": 0.3668743011455332, + "learning_rate": 3.7938952045939854e-05, + "loss": 2.6836, + "step": 39936 + }, + { + "epoch": 1.8593709989058826, + "grad_norm": 0.3631888469692432, + "learning_rate": 3.793632331548073e-05, + "loss": 2.7312, + "step": 39937 + }, + { + "epoch": 1.8594175570919758, + "grad_norm": 0.3539606780987455, + "learning_rate": 3.793369462042731e-05, + "loss": 2.5626, + "step": 39938 + }, + { + "epoch": 1.8594641152780689, + "grad_norm": 0.3195080352585967, + "learning_rate": 3.793106596078728e-05, + "loss": 2.699, + "step": 39939 + }, + { + "epoch": 1.859510673464162, + "grad_norm": 0.34414263811979523, + "learning_rate": 3.792843733656841e-05, + "loss": 2.7065, + "step": 39940 + }, + { + "epoch": 1.8595572316502549, + "grad_norm": 0.3284654708393193, + "learning_rate": 3.792580874777837e-05, + "loss": 2.5894, + "step": 39941 + }, + { + "epoch": 1.859603789836348, + "grad_norm": 0.33559807691447396, + "learning_rate": 3.792318019442491e-05, + "loss": 2.6818, + "step": 39942 + }, + { + "epoch": 1.859650348022441, + "grad_norm": 0.3255872723135538, + "learning_rate": 3.792055167651572e-05, + "loss": 2.6812, + "step": 39943 + }, + { + "epoch": 1.859696906208534, + "grad_norm": 0.34106489529622325, + "learning_rate": 3.79179231940585e-05, + "loss": 2.7164, + "step": 39944 + }, + { + "epoch": 1.8597434643946271, + "grad_norm": 0.3465678327660114, + "learning_rate": 3.7915294747061006e-05, + "loss": 2.6613, + "step": 39945 + }, + { + "epoch": 1.8597900225807202, + "grad_norm": 0.3348811580857283, + "learning_rate": 3.7912666335530913e-05, + "loss": 2.6515, + "step": 39946 + }, + { + "epoch": 1.8598365807668134, + "grad_norm": 0.33394861105933904, + "learning_rate": 3.791003795947598e-05, + "loss": 2.664, + "step": 39947 + }, + { + "epoch": 1.8598831389529065, + "grad_norm": 0.3807056550265084, + "learning_rate": 3.790740961890388e-05, + "loss": 2.6848, + "step": 39948 + }, + { + "epoch": 1.8599296971389996, + "grad_norm": 0.33542260803363966, + "learning_rate": 3.790478131382234e-05, + "loss": 2.7491, + "step": 39949 + }, + { + "epoch": 1.8599762553250927, + "grad_norm": 0.35970278193085437, + "learning_rate": 3.7902153044239076e-05, + "loss": 2.6114, + "step": 39950 + }, + { + "epoch": 1.8600228135111856, + "grad_norm": 0.3528158078522116, + "learning_rate": 3.789952481016181e-05, + "loss": 2.7043, + "step": 39951 + }, + { + "epoch": 1.8600693716972787, + "grad_norm": 0.35143676244343475, + "learning_rate": 3.7896896611598224e-05, + "loss": 2.5729, + "step": 39952 + }, + { + "epoch": 1.8601159298833716, + "grad_norm": 0.32270152563587423, + "learning_rate": 3.789426844855608e-05, + "loss": 2.7377, + "step": 39953 + }, + { + "epoch": 1.8601624880694647, + "grad_norm": 0.37883062654248467, + "learning_rate": 3.7891640321043055e-05, + "loss": 2.7497, + "step": 39954 + }, + { + "epoch": 1.8602090462555578, + "grad_norm": 0.33396884976685787, + "learning_rate": 3.7889012229066876e-05, + "loss": 2.7197, + "step": 39955 + }, + { + "epoch": 1.860255604441651, + "grad_norm": 0.3412965497371061, + "learning_rate": 3.7886384172635255e-05, + "loss": 2.7539, + "step": 39956 + }, + { + "epoch": 1.860302162627744, + "grad_norm": 0.3586787103256843, + 
"learning_rate": 3.788375615175589e-05, + "loss": 2.7359, + "step": 39957 + }, + { + "epoch": 1.8603487208138372, + "grad_norm": 0.31720993197285324, + "learning_rate": 3.788112816643652e-05, + "loss": 2.674, + "step": 39958 + }, + { + "epoch": 1.8603952789999303, + "grad_norm": 0.3538960202007725, + "learning_rate": 3.7878500216684824e-05, + "loss": 2.7095, + "step": 39959 + }, + { + "epoch": 1.8604418371860232, + "grad_norm": 0.33778714690802175, + "learning_rate": 3.787587230250856e-05, + "loss": 2.6334, + "step": 39960 + }, + { + "epoch": 1.8604883953721163, + "grad_norm": 0.3393921824212008, + "learning_rate": 3.7873244423915415e-05, + "loss": 2.7756, + "step": 39961 + }, + { + "epoch": 1.8605349535582094, + "grad_norm": 0.3469199423057818, + "learning_rate": 3.7870616580913085e-05, + "loss": 2.6442, + "step": 39962 + }, + { + "epoch": 1.8605815117443023, + "grad_norm": 0.341571638613506, + "learning_rate": 3.786798877350932e-05, + "loss": 2.6658, + "step": 39963 + }, + { + "epoch": 1.8606280699303954, + "grad_norm": 0.3688256959926574, + "learning_rate": 3.78653610017118e-05, + "loss": 2.6525, + "step": 39964 + }, + { + "epoch": 1.8606746281164885, + "grad_norm": 0.3142241071504983, + "learning_rate": 3.7862733265528244e-05, + "loss": 2.6773, + "step": 39965 + }, + { + "epoch": 1.8607211863025817, + "grad_norm": 0.34696207721226663, + "learning_rate": 3.786010556496639e-05, + "loss": 2.6857, + "step": 39966 + }, + { + "epoch": 1.8607677444886748, + "grad_norm": 0.34007913439137144, + "learning_rate": 3.785747790003392e-05, + "loss": 2.7527, + "step": 39967 + }, + { + "epoch": 1.8608143026747679, + "grad_norm": 0.33507598852334247, + "learning_rate": 3.785485027073856e-05, + "loss": 2.6194, + "step": 39968 + }, + { + "epoch": 1.860860860860861, + "grad_norm": 0.3330948211138609, + "learning_rate": 3.785222267708802e-05, + "loss": 2.7394, + "step": 39969 + }, + { + "epoch": 1.860907419046954, + "grad_norm": 0.3423484447943317, + "learning_rate": 3.784959511908999e-05, + "loss": 2.6914, + "step": 39970 + }, + { + "epoch": 1.860953977233047, + "grad_norm": 0.3419141208097424, + "learning_rate": 3.784696759675222e-05, + "loss": 2.7543, + "step": 39971 + }, + { + "epoch": 1.8610005354191401, + "grad_norm": 0.40366749372582234, + "learning_rate": 3.784434011008239e-05, + "loss": 2.7447, + "step": 39972 + }, + { + "epoch": 1.861047093605233, + "grad_norm": 0.33126901872576575, + "learning_rate": 3.7841712659088235e-05, + "loss": 2.7925, + "step": 39973 + }, + { + "epoch": 1.8610936517913261, + "grad_norm": 0.3374408390709984, + "learning_rate": 3.783908524377746e-05, + "loss": 2.7013, + "step": 39974 + }, + { + "epoch": 1.8611402099774192, + "grad_norm": 0.35609106220215336, + "learning_rate": 3.7836457864157756e-05, + "loss": 2.6818, + "step": 39975 + }, + { + "epoch": 1.8611867681635124, + "grad_norm": 0.33326612632849484, + "learning_rate": 3.783383052023686e-05, + "loss": 2.6298, + "step": 39976 + }, + { + "epoch": 1.8612333263496055, + "grad_norm": 0.35142720209707534, + "learning_rate": 3.783120321202247e-05, + "loss": 2.7436, + "step": 39977 + }, + { + "epoch": 1.8612798845356986, + "grad_norm": 0.33910860870387616, + "learning_rate": 3.78285759395223e-05, + "loss": 2.7999, + "step": 39978 + }, + { + "epoch": 1.8613264427217917, + "grad_norm": 0.33440970233227274, + "learning_rate": 3.782594870274407e-05, + "loss": 2.7008, + "step": 39979 + }, + { + "epoch": 1.8613730009078846, + "grad_norm": 0.3329734694817948, + "learning_rate": 3.782332150169547e-05, + "loss": 2.7269, + "step": 
39980 + }, + { + "epoch": 1.8614195590939777, + "grad_norm": 0.331130796907923, + "learning_rate": 3.782069433638423e-05, + "loss": 2.6891, + "step": 39981 + }, + { + "epoch": 1.8614661172800706, + "grad_norm": 0.3359701217772045, + "learning_rate": 3.781806720681803e-05, + "loss": 2.6892, + "step": 39982 + }, + { + "epoch": 1.8615126754661637, + "grad_norm": 0.3533903819543575, + "learning_rate": 3.781544011300463e-05, + "loss": 2.7544, + "step": 39983 + }, + { + "epoch": 1.8615592336522568, + "grad_norm": 0.3359517971860623, + "learning_rate": 3.781281305495171e-05, + "loss": 2.5947, + "step": 39984 + }, + { + "epoch": 1.86160579183835, + "grad_norm": 0.36204771730291113, + "learning_rate": 3.781018603266696e-05, + "loss": 2.5986, + "step": 39985 + }, + { + "epoch": 1.861652350024443, + "grad_norm": 0.34392351617667605, + "learning_rate": 3.7807559046158134e-05, + "loss": 2.5801, + "step": 39986 + }, + { + "epoch": 1.8616989082105362, + "grad_norm": 0.3411510437964011, + "learning_rate": 3.780493209543293e-05, + "loss": 2.7442, + "step": 39987 + }, + { + "epoch": 1.8617454663966293, + "grad_norm": 0.35786916224017107, + "learning_rate": 3.780230518049902e-05, + "loss": 2.7051, + "step": 39988 + }, + { + "epoch": 1.8617920245827224, + "grad_norm": 0.3470834616770875, + "learning_rate": 3.779967830136416e-05, + "loss": 2.6471, + "step": 39989 + }, + { + "epoch": 1.8618385827688153, + "grad_norm": 0.33849209939290764, + "learning_rate": 3.779705145803604e-05, + "loss": 2.6199, + "step": 39990 + }, + { + "epoch": 1.8618851409549084, + "grad_norm": 0.3463518136834512, + "learning_rate": 3.7794424650522365e-05, + "loss": 2.6589, + "step": 39991 + }, + { + "epoch": 1.8619316991410013, + "grad_norm": 0.35142251738554875, + "learning_rate": 3.779179787883087e-05, + "loss": 2.84, + "step": 39992 + }, + { + "epoch": 1.8619782573270944, + "grad_norm": 0.3240863237893583, + "learning_rate": 3.778917114296923e-05, + "loss": 2.7369, + "step": 39993 + }, + { + "epoch": 1.8620248155131875, + "grad_norm": 0.4224075676722785, + "learning_rate": 3.7786544442945175e-05, + "loss": 2.8073, + "step": 39994 + }, + { + "epoch": 1.8620713736992807, + "grad_norm": 0.3797946023705353, + "learning_rate": 3.778391777876639e-05, + "loss": 2.59, + "step": 39995 + }, + { + "epoch": 1.8621179318853738, + "grad_norm": 0.3653092452295801, + "learning_rate": 3.7781291150440626e-05, + "loss": 2.6714, + "step": 39996 + }, + { + "epoch": 1.862164490071467, + "grad_norm": 0.3508285329337678, + "learning_rate": 3.7778664557975574e-05, + "loss": 2.6902, + "step": 39997 + }, + { + "epoch": 1.86221104825756, + "grad_norm": 0.37675533226414626, + "learning_rate": 3.777603800137891e-05, + "loss": 2.7282, + "step": 39998 + }, + { + "epoch": 1.862257606443653, + "grad_norm": 0.3732058141546527, + "learning_rate": 3.77734114806584e-05, + "loss": 2.6346, + "step": 39999 + }, + { + "epoch": 1.862304164629746, + "grad_norm": 0.34670814900588703, + "learning_rate": 3.777078499582169e-05, + "loss": 2.5988, + "step": 40000 + }, + { + "epoch": 1.8623507228158391, + "grad_norm": 0.39470363386006607, + "learning_rate": 3.776815854687655e-05, + "loss": 2.7926, + "step": 40001 + }, + { + "epoch": 1.862397281001932, + "grad_norm": 0.37270673389555203, + "learning_rate": 3.776553213383065e-05, + "loss": 2.6336, + "step": 40002 + }, + { + "epoch": 1.8624438391880251, + "grad_norm": 0.37033909490035016, + "learning_rate": 3.776290575669171e-05, + "loss": 2.6911, + "step": 40003 + }, + { + "epoch": 1.8624903973741183, + "grad_norm": 0.39407643512898227, 
+ "learning_rate": 3.7760279415467425e-05, + "loss": 2.6626, + "step": 40004 + }, + { + "epoch": 1.8625369555602114, + "grad_norm": 0.34993821168618844, + "learning_rate": 3.775765311016553e-05, + "loss": 2.717, + "step": 40005 + }, + { + "epoch": 1.8625835137463045, + "grad_norm": 0.38028254973663644, + "learning_rate": 3.775502684079371e-05, + "loss": 2.6704, + "step": 40006 + }, + { + "epoch": 1.8626300719323976, + "grad_norm": 0.335000118732297, + "learning_rate": 3.7752400607359684e-05, + "loss": 2.6375, + "step": 40007 + }, + { + "epoch": 1.8626766301184907, + "grad_norm": 0.35715726645457624, + "learning_rate": 3.774977440987114e-05, + "loss": 2.6987, + "step": 40008 + }, + { + "epoch": 1.8627231883045836, + "grad_norm": 0.38459996759060117, + "learning_rate": 3.774714824833582e-05, + "loss": 2.699, + "step": 40009 + }, + { + "epoch": 1.8627697464906767, + "grad_norm": 0.3665226744972745, + "learning_rate": 3.774452212276141e-05, + "loss": 2.6545, + "step": 40010 + }, + { + "epoch": 1.8628163046767698, + "grad_norm": 0.35429813987638226, + "learning_rate": 3.77418960331556e-05, + "loss": 2.6986, + "step": 40011 + }, + { + "epoch": 1.8628628628628627, + "grad_norm": 0.35330787000738073, + "learning_rate": 3.773926997952614e-05, + "loss": 2.6857, + "step": 40012 + }, + { + "epoch": 1.8629094210489558, + "grad_norm": 0.4026944619220656, + "learning_rate": 3.77366439618807e-05, + "loss": 2.7468, + "step": 40013 + }, + { + "epoch": 1.862955979235049, + "grad_norm": 0.37302326238408395, + "learning_rate": 3.773401798022701e-05, + "loss": 2.6734, + "step": 40014 + }, + { + "epoch": 1.863002537421142, + "grad_norm": 0.330388922567385, + "learning_rate": 3.7731392034572775e-05, + "loss": 2.6958, + "step": 40015 + }, + { + "epoch": 1.8630490956072352, + "grad_norm": 0.3689031514540951, + "learning_rate": 3.772876612492569e-05, + "loss": 2.6694, + "step": 40016 + }, + { + "epoch": 1.8630956537933283, + "grad_norm": 0.3559711626725085, + "learning_rate": 3.772614025129346e-05, + "loss": 2.7058, + "step": 40017 + }, + { + "epoch": 1.8631422119794214, + "grad_norm": 0.33293363008498594, + "learning_rate": 3.77235144136838e-05, + "loss": 2.7145, + "step": 40018 + }, + { + "epoch": 1.8631887701655143, + "grad_norm": 0.38559851356360175, + "learning_rate": 3.772088861210443e-05, + "loss": 2.8265, + "step": 40019 + }, + { + "epoch": 1.8632353283516074, + "grad_norm": 0.3223556084367931, + "learning_rate": 3.7718262846563044e-05, + "loss": 2.7097, + "step": 40020 + }, + { + "epoch": 1.8632818865377003, + "grad_norm": 0.33626086283642004, + "learning_rate": 3.771563711706732e-05, + "loss": 2.725, + "step": 40021 + }, + { + "epoch": 1.8633284447237934, + "grad_norm": 0.3506880249647935, + "learning_rate": 3.7713011423625015e-05, + "loss": 2.773, + "step": 40022 + }, + { + "epoch": 1.8633750029098866, + "grad_norm": 0.35109791252052547, + "learning_rate": 3.771038576624381e-05, + "loss": 2.7865, + "step": 40023 + }, + { + "epoch": 1.8634215610959797, + "grad_norm": 0.3594885444124246, + "learning_rate": 3.770776014493139e-05, + "loss": 2.6614, + "step": 40024 + }, + { + "epoch": 1.8634681192820728, + "grad_norm": 0.34615786013366845, + "learning_rate": 3.770513455969551e-05, + "loss": 2.7386, + "step": 40025 + }, + { + "epoch": 1.863514677468166, + "grad_norm": 0.3303284199043618, + "learning_rate": 3.770250901054382e-05, + "loss": 2.6753, + "step": 40026 + }, + { + "epoch": 1.863561235654259, + "grad_norm": 0.35902838732208214, + "learning_rate": 3.769988349748408e-05, + "loss": 2.75, + "step": 40027 + 
}, + { + "epoch": 1.8636077938403521, + "grad_norm": 0.33314635374126317, + "learning_rate": 3.769725802052397e-05, + "loss": 2.6582, + "step": 40028 + }, + { + "epoch": 1.863654352026445, + "grad_norm": 0.3569329968704268, + "learning_rate": 3.769463257967118e-05, + "loss": 2.7403, + "step": 40029 + }, + { + "epoch": 1.8637009102125381, + "grad_norm": 0.3443691434271769, + "learning_rate": 3.769200717493344e-05, + "loss": 2.699, + "step": 40030 + }, + { + "epoch": 1.863747468398631, + "grad_norm": 0.32966918187183714, + "learning_rate": 3.768938180631844e-05, + "loss": 2.7126, + "step": 40031 + }, + { + "epoch": 1.8637940265847242, + "grad_norm": 0.37523158691788716, + "learning_rate": 3.768675647383391e-05, + "loss": 2.812, + "step": 40032 + }, + { + "epoch": 1.8638405847708173, + "grad_norm": 0.35790160735113014, + "learning_rate": 3.7684131177487526e-05, + "loss": 2.6274, + "step": 40033 + }, + { + "epoch": 1.8638871429569104, + "grad_norm": 0.3503410326113398, + "learning_rate": 3.768150591728699e-05, + "loss": 2.7071, + "step": 40034 + }, + { + "epoch": 1.8639337011430035, + "grad_norm": 0.35582794156749903, + "learning_rate": 3.7678880693240036e-05, + "loss": 2.6847, + "step": 40035 + }, + { + "epoch": 1.8639802593290966, + "grad_norm": 0.3472928144287813, + "learning_rate": 3.767625550535433e-05, + "loss": 2.6586, + "step": 40036 + }, + { + "epoch": 1.8640268175151897, + "grad_norm": 0.33553682720683586, + "learning_rate": 3.767363035363763e-05, + "loss": 2.7039, + "step": 40037 + }, + { + "epoch": 1.8640733757012828, + "grad_norm": 0.36476586111261683, + "learning_rate": 3.7671005238097607e-05, + "loss": 2.7571, + "step": 40038 + }, + { + "epoch": 1.8641199338873757, + "grad_norm": 0.3716675126466469, + "learning_rate": 3.766838015874194e-05, + "loss": 2.5388, + "step": 40039 + }, + { + "epoch": 1.8641664920734689, + "grad_norm": 0.3385769942550002, + "learning_rate": 3.7665755115578396e-05, + "loss": 2.7447, + "step": 40040 + }, + { + "epoch": 1.8642130502595617, + "grad_norm": 0.3867111009513227, + "learning_rate": 3.766313010861464e-05, + "loss": 2.6935, + "step": 40041 + }, + { + "epoch": 1.8642596084456549, + "grad_norm": 0.34953953912371283, + "learning_rate": 3.766050513785837e-05, + "loss": 2.7122, + "step": 40042 + }, + { + "epoch": 1.864306166631748, + "grad_norm": 0.3564359994157291, + "learning_rate": 3.7657880203317306e-05, + "loss": 2.6742, + "step": 40043 + }, + { + "epoch": 1.864352724817841, + "grad_norm": 0.3780809471335549, + "learning_rate": 3.765525530499915e-05, + "loss": 2.5904, + "step": 40044 + }, + { + "epoch": 1.8643992830039342, + "grad_norm": 0.3550001589057625, + "learning_rate": 3.7652630442911605e-05, + "loss": 2.6922, + "step": 40045 + }, + { + "epoch": 1.8644458411900273, + "grad_norm": 0.36827216297525456, + "learning_rate": 3.765000561706238e-05, + "loss": 2.6861, + "step": 40046 + }, + { + "epoch": 1.8644923993761204, + "grad_norm": 0.3496850567140822, + "learning_rate": 3.764738082745915e-05, + "loss": 2.6623, + "step": 40047 + }, + { + "epoch": 1.8645389575622133, + "grad_norm": 0.3694195361600622, + "learning_rate": 3.764475607410965e-05, + "loss": 2.7636, + "step": 40048 + }, + { + "epoch": 1.8645855157483064, + "grad_norm": 0.33176684930955286, + "learning_rate": 3.764213135702157e-05, + "loss": 2.731, + "step": 40049 + }, + { + "epoch": 1.8646320739343996, + "grad_norm": 0.3332227416368546, + "learning_rate": 3.763950667620263e-05, + "loss": 2.6908, + "step": 40050 + }, + { + "epoch": 1.8646786321204925, + "grad_norm": 
0.36233502770157733, + "learning_rate": 3.763688203166052e-05, + "loss": 2.7406, + "step": 40051 + }, + { + "epoch": 1.8647251903065856, + "grad_norm": 0.32562513123783304, + "learning_rate": 3.7634257423402926e-05, + "loss": 2.736, + "step": 40052 + }, + { + "epoch": 1.8647717484926787, + "grad_norm": 0.3422167420810814, + "learning_rate": 3.763163285143758e-05, + "loss": 2.7502, + "step": 40053 + }, + { + "epoch": 1.8648183066787718, + "grad_norm": 0.35824914428266696, + "learning_rate": 3.762900831577217e-05, + "loss": 2.7566, + "step": 40054 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 0.3220456432992139, + "learning_rate": 3.762638381641441e-05, + "loss": 2.5435, + "step": 40055 + }, + { + "epoch": 1.864911423050958, + "grad_norm": 0.3338711648140484, + "learning_rate": 3.762375935337199e-05, + "loss": 2.6979, + "step": 40056 + }, + { + "epoch": 1.8649579812370511, + "grad_norm": 0.34381160540671074, + "learning_rate": 3.762113492665261e-05, + "loss": 2.7074, + "step": 40057 + }, + { + "epoch": 1.865004539423144, + "grad_norm": 0.36075675477201, + "learning_rate": 3.761851053626399e-05, + "loss": 2.6867, + "step": 40058 + }, + { + "epoch": 1.8650510976092372, + "grad_norm": 0.3192229673929567, + "learning_rate": 3.7615886182213824e-05, + "loss": 2.6344, + "step": 40059 + }, + { + "epoch": 1.8650976557953303, + "grad_norm": 0.3720807387048201, + "learning_rate": 3.761326186450979e-05, + "loss": 2.7726, + "step": 40060 + }, + { + "epoch": 1.8651442139814232, + "grad_norm": 0.3443018694696074, + "learning_rate": 3.7610637583159636e-05, + "loss": 2.6799, + "step": 40061 + }, + { + "epoch": 1.8651907721675163, + "grad_norm": 0.37175679698292735, + "learning_rate": 3.760801333817102e-05, + "loss": 2.6267, + "step": 40062 + }, + { + "epoch": 1.8652373303536094, + "grad_norm": 0.3155070226638933, + "learning_rate": 3.7605389129551684e-05, + "loss": 2.5782, + "step": 40063 + }, + { + "epoch": 1.8652838885397025, + "grad_norm": 0.35614993251837845, + "learning_rate": 3.7602764957309305e-05, + "loss": 2.6001, + "step": 40064 + }, + { + "epoch": 1.8653304467257956, + "grad_norm": 0.35918091226340804, + "learning_rate": 3.7600140821451565e-05, + "loss": 2.6928, + "step": 40065 + }, + { + "epoch": 1.8653770049118887, + "grad_norm": 0.32860107329443206, + "learning_rate": 3.759751672198622e-05, + "loss": 2.6389, + "step": 40066 + }, + { + "epoch": 1.8654235630979819, + "grad_norm": 0.38092163544169266, + "learning_rate": 3.7594892658920936e-05, + "loss": 2.6336, + "step": 40067 + }, + { + "epoch": 1.8654701212840747, + "grad_norm": 0.3488079655519001, + "learning_rate": 3.7592268632263415e-05, + "loss": 2.6705, + "step": 40068 + }, + { + "epoch": 1.8655166794701679, + "grad_norm": 0.3164086979456833, + "learning_rate": 3.7589644642021366e-05, + "loss": 2.7204, + "step": 40069 + }, + { + "epoch": 1.8655632376562608, + "grad_norm": 0.4009388177779705, + "learning_rate": 3.758702068820248e-05, + "loss": 2.6913, + "step": 40070 + }, + { + "epoch": 1.8656097958423539, + "grad_norm": 0.3652060544606344, + "learning_rate": 3.758439677081448e-05, + "loss": 2.7362, + "step": 40071 + }, + { + "epoch": 1.865656354028447, + "grad_norm": 0.38537155799121886, + "learning_rate": 3.758177288986503e-05, + "loss": 2.7473, + "step": 40072 + }, + { + "epoch": 1.86570291221454, + "grad_norm": 0.3628151954385387, + "learning_rate": 3.7579149045361875e-05, + "loss": 2.7572, + "step": 40073 + }, + { + "epoch": 1.8657494704006332, + "grad_norm": 0.38572263334657736, + "learning_rate": 3.757652523731269e-05, + 
"loss": 2.6264, + "step": 40074 + }, + { + "epoch": 1.8657960285867263, + "grad_norm": 0.36824177754561616, + "learning_rate": 3.757390146572516e-05, + "loss": 2.6271, + "step": 40075 + }, + { + "epoch": 1.8658425867728194, + "grad_norm": 0.3422546046630821, + "learning_rate": 3.757127773060703e-05, + "loss": 2.6115, + "step": 40076 + }, + { + "epoch": 1.8658891449589126, + "grad_norm": 0.38110503579185984, + "learning_rate": 3.756865403196597e-05, + "loss": 2.7072, + "step": 40077 + }, + { + "epoch": 1.8659357031450055, + "grad_norm": 0.36986687662227175, + "learning_rate": 3.7566030369809666e-05, + "loss": 2.7191, + "step": 40078 + }, + { + "epoch": 1.8659822613310986, + "grad_norm": 0.37108749657771306, + "learning_rate": 3.7563406744145864e-05, + "loss": 2.6967, + "step": 40079 + }, + { + "epoch": 1.8660288195171915, + "grad_norm": 0.3623637305400633, + "learning_rate": 3.756078315498222e-05, + "loss": 2.5716, + "step": 40080 + }, + { + "epoch": 1.8660753777032846, + "grad_norm": 0.38015636402547315, + "learning_rate": 3.755815960232647e-05, + "loss": 2.8374, + "step": 40081 + }, + { + "epoch": 1.8661219358893777, + "grad_norm": 0.36981190441788186, + "learning_rate": 3.7555536086186275e-05, + "loss": 2.6267, + "step": 40082 + }, + { + "epoch": 1.8661684940754708, + "grad_norm": 0.3474951386475432, + "learning_rate": 3.755291260656937e-05, + "loss": 2.7219, + "step": 40083 + }, + { + "epoch": 1.866215052261564, + "grad_norm": 0.373849624026045, + "learning_rate": 3.755028916348344e-05, + "loss": 2.6566, + "step": 40084 + }, + { + "epoch": 1.866261610447657, + "grad_norm": 0.35052603386591186, + "learning_rate": 3.754766575693617e-05, + "loss": 2.6462, + "step": 40085 + }, + { + "epoch": 1.8663081686337502, + "grad_norm": 0.3523240615012963, + "learning_rate": 3.7545042386935284e-05, + "loss": 2.6832, + "step": 40086 + }, + { + "epoch": 1.866354726819843, + "grad_norm": 0.34342442297224834, + "learning_rate": 3.7542419053488484e-05, + "loss": 2.755, + "step": 40087 + }, + { + "epoch": 1.8664012850059362, + "grad_norm": 0.37507120850129616, + "learning_rate": 3.753979575660343e-05, + "loss": 2.6649, + "step": 40088 + }, + { + "epoch": 1.8664478431920293, + "grad_norm": 0.3353357978890886, + "learning_rate": 3.753717249628787e-05, + "loss": 2.6873, + "step": 40089 + }, + { + "epoch": 1.8664944013781222, + "grad_norm": 0.3140539519405675, + "learning_rate": 3.753454927254946e-05, + "loss": 2.7198, + "step": 40090 + }, + { + "epoch": 1.8665409595642153, + "grad_norm": 0.33756582317708617, + "learning_rate": 3.753192608539594e-05, + "loss": 2.7531, + "step": 40091 + }, + { + "epoch": 1.8665875177503084, + "grad_norm": 0.33596545897147473, + "learning_rate": 3.7529302934834985e-05, + "loss": 2.5867, + "step": 40092 + }, + { + "epoch": 1.8666340759364015, + "grad_norm": 0.32611146360820537, + "learning_rate": 3.75266798208743e-05, + "loss": 2.7746, + "step": 40093 + }, + { + "epoch": 1.8666806341224946, + "grad_norm": 0.34848859543716193, + "learning_rate": 3.752405674352158e-05, + "loss": 2.6881, + "step": 40094 + }, + { + "epoch": 1.8667271923085877, + "grad_norm": 0.3235670933375309, + "learning_rate": 3.752143370278452e-05, + "loss": 2.5483, + "step": 40095 + }, + { + "epoch": 1.8667737504946809, + "grad_norm": 0.34323805395346113, + "learning_rate": 3.7518810698670815e-05, + "loss": 2.5389, + "step": 40096 + }, + { + "epoch": 1.8668203086807738, + "grad_norm": 0.3352926257279921, + "learning_rate": 3.7516187731188184e-05, + "loss": 2.675, + "step": 40097 + }, + { + "epoch": 
1.8668668668668669, + "grad_norm": 0.31641448826115093, + "learning_rate": 3.751356480034429e-05, + "loss": 2.7648, + "step": 40098 + }, + { + "epoch": 1.86691342505296, + "grad_norm": 0.35964854858960743, + "learning_rate": 3.751094190614688e-05, + "loss": 2.7549, + "step": 40099 + }, + { + "epoch": 1.8669599832390529, + "grad_norm": 0.34131234393697807, + "learning_rate": 3.7508319048603625e-05, + "loss": 2.5054, + "step": 40100 + }, + { + "epoch": 1.867006541425146, + "grad_norm": 0.3393432831305495, + "learning_rate": 3.75056962277222e-05, + "loss": 2.6179, + "step": 40101 + }, + { + "epoch": 1.867053099611239, + "grad_norm": 0.30660558978300834, + "learning_rate": 3.7503073443510344e-05, + "loss": 2.6933, + "step": 40102 + }, + { + "epoch": 1.8670996577973322, + "grad_norm": 0.3747318827026148, + "learning_rate": 3.750045069597572e-05, + "loss": 2.6165, + "step": 40103 + }, + { + "epoch": 1.8671462159834253, + "grad_norm": 0.3461042396530613, + "learning_rate": 3.7497827985126056e-05, + "loss": 2.6015, + "step": 40104 + }, + { + "epoch": 1.8671927741695185, + "grad_norm": 0.34771132686311945, + "learning_rate": 3.749520531096904e-05, + "loss": 2.65, + "step": 40105 + }, + { + "epoch": 1.8672393323556116, + "grad_norm": 0.3672215482969322, + "learning_rate": 3.749258267351236e-05, + "loss": 2.5917, + "step": 40106 + }, + { + "epoch": 1.8672858905417045, + "grad_norm": 0.3332353744090487, + "learning_rate": 3.748996007276372e-05, + "loss": 2.7328, + "step": 40107 + }, + { + "epoch": 1.8673324487277976, + "grad_norm": 0.38319180137526254, + "learning_rate": 3.748733750873079e-05, + "loss": 2.6384, + "step": 40108 + }, + { + "epoch": 1.8673790069138905, + "grad_norm": 0.3396491206025762, + "learning_rate": 3.748471498142132e-05, + "loss": 2.7031, + "step": 40109 + }, + { + "epoch": 1.8674255650999836, + "grad_norm": 0.38660536371433135, + "learning_rate": 3.748209249084298e-05, + "loss": 2.775, + "step": 40110 + }, + { + "epoch": 1.8674721232860767, + "grad_norm": 0.324579048094503, + "learning_rate": 3.747947003700344e-05, + "loss": 2.6819, + "step": 40111 + }, + { + "epoch": 1.8675186814721698, + "grad_norm": 0.34558451766592174, + "learning_rate": 3.7476847619910436e-05, + "loss": 2.6681, + "step": 40112 + }, + { + "epoch": 1.867565239658263, + "grad_norm": 0.32896717872646125, + "learning_rate": 3.747422523957166e-05, + "loss": 2.699, + "step": 40113 + }, + { + "epoch": 1.867611797844356, + "grad_norm": 0.329563289064548, + "learning_rate": 3.747160289599478e-05, + "loss": 2.6407, + "step": 40114 + }, + { + "epoch": 1.8676583560304492, + "grad_norm": 0.34686796303233786, + "learning_rate": 3.746898058918753e-05, + "loss": 2.6527, + "step": 40115 + }, + { + "epoch": 1.8677049142165423, + "grad_norm": 0.35918512969442673, + "learning_rate": 3.746635831915756e-05, + "loss": 2.804, + "step": 40116 + }, + { + "epoch": 1.8677514724026352, + "grad_norm": 0.3243173279971292, + "learning_rate": 3.746373608591262e-05, + "loss": 2.6617, + "step": 40117 + }, + { + "epoch": 1.8677980305887283, + "grad_norm": 0.3258335238156904, + "learning_rate": 3.746111388946038e-05, + "loss": 2.6493, + "step": 40118 + }, + { + "epoch": 1.8678445887748212, + "grad_norm": 0.32667915834886235, + "learning_rate": 3.745849172980852e-05, + "loss": 2.6729, + "step": 40119 + }, + { + "epoch": 1.8678911469609143, + "grad_norm": 0.3181183175446461, + "learning_rate": 3.7455869606964765e-05, + "loss": 2.6656, + "step": 40120 + }, + { + "epoch": 1.8679377051470074, + "grad_norm": 0.32300680586403163, + "learning_rate": 
3.745324752093678e-05, + "loss": 2.62, + "step": 40121 + }, + { + "epoch": 1.8679842633331005, + "grad_norm": 0.3194488234438786, + "learning_rate": 3.745062547173229e-05, + "loss": 2.6006, + "step": 40122 + }, + { + "epoch": 1.8680308215191936, + "grad_norm": 0.3447297130437093, + "learning_rate": 3.744800345935899e-05, + "loss": 2.6439, + "step": 40123 + }, + { + "epoch": 1.8680773797052868, + "grad_norm": 0.3143102111937028, + "learning_rate": 3.7445381483824535e-05, + "loss": 2.6958, + "step": 40124 + }, + { + "epoch": 1.8681239378913799, + "grad_norm": 0.31534751270676903, + "learning_rate": 3.744275954513667e-05, + "loss": 2.6553, + "step": 40125 + }, + { + "epoch": 1.868170496077473, + "grad_norm": 0.32906130248264875, + "learning_rate": 3.744013764330304e-05, + "loss": 2.6802, + "step": 40126 + }, + { + "epoch": 1.8682170542635659, + "grad_norm": 0.346522941371805, + "learning_rate": 3.74375157783314e-05, + "loss": 2.8001, + "step": 40127 + }, + { + "epoch": 1.868263612449659, + "grad_norm": 0.33063557355943995, + "learning_rate": 3.743489395022941e-05, + "loss": 2.7668, + "step": 40128 + }, + { + "epoch": 1.8683101706357519, + "grad_norm": 0.3386220324785627, + "learning_rate": 3.743227215900475e-05, + "loss": 2.6269, + "step": 40129 + }, + { + "epoch": 1.868356728821845, + "grad_norm": 0.33683852667177405, + "learning_rate": 3.7429650404665154e-05, + "loss": 2.8455, + "step": 40130 + }, + { + "epoch": 1.8684032870079381, + "grad_norm": 0.36018728751908186, + "learning_rate": 3.742702868721828e-05, + "loss": 2.6808, + "step": 40131 + }, + { + "epoch": 1.8684498451940312, + "grad_norm": 0.3417852112180726, + "learning_rate": 3.7424407006671854e-05, + "loss": 2.6321, + "step": 40132 + }, + { + "epoch": 1.8684964033801243, + "grad_norm": 0.3479723750968906, + "learning_rate": 3.742178536303356e-05, + "loss": 2.6877, + "step": 40133 + }, + { + "epoch": 1.8685429615662175, + "grad_norm": 0.3667581417309262, + "learning_rate": 3.741916375631105e-05, + "loss": 2.7201, + "step": 40134 + }, + { + "epoch": 1.8685895197523106, + "grad_norm": 0.3544296289456799, + "learning_rate": 3.741654218651208e-05, + "loss": 2.786, + "step": 40135 + }, + { + "epoch": 1.8686360779384035, + "grad_norm": 0.3722189028047812, + "learning_rate": 3.7413920653644327e-05, + "loss": 2.7794, + "step": 40136 + }, + { + "epoch": 1.8686826361244966, + "grad_norm": 0.3512404065911269, + "learning_rate": 3.741129915771545e-05, + "loss": 2.6431, + "step": 40137 + }, + { + "epoch": 1.8687291943105897, + "grad_norm": 0.3417006487451274, + "learning_rate": 3.7408677698733196e-05, + "loss": 2.6872, + "step": 40138 + }, + { + "epoch": 1.8687757524966826, + "grad_norm": 0.33176611210299034, + "learning_rate": 3.740605627670521e-05, + "loss": 2.5827, + "step": 40139 + }, + { + "epoch": 1.8688223106827757, + "grad_norm": 0.34681292657409635, + "learning_rate": 3.740343489163923e-05, + "loss": 2.713, + "step": 40140 + }, + { + "epoch": 1.8688688688688688, + "grad_norm": 0.3391645306237309, + "learning_rate": 3.7400813543542924e-05, + "loss": 2.6932, + "step": 40141 + }, + { + "epoch": 1.868915427054962, + "grad_norm": 0.3281582660994791, + "learning_rate": 3.7398192232423966e-05, + "loss": 2.5703, + "step": 40142 + }, + { + "epoch": 1.868961985241055, + "grad_norm": 0.3552360871545507, + "learning_rate": 3.739557095829009e-05, + "loss": 2.7687, + "step": 40143 + }, + { + "epoch": 1.8690085434271482, + "grad_norm": 0.3642125295098693, + "learning_rate": 3.739294972114897e-05, + "loss": 2.6716, + "step": 40144 + }, + { + "epoch": 
1.8690551016132413, + "grad_norm": 0.3205117382281185, + "learning_rate": 3.739032852100831e-05, + "loss": 2.6204, + "step": 40145 + }, + { + "epoch": 1.8691016597993342, + "grad_norm": 0.36498082527118947, + "learning_rate": 3.7387707357875784e-05, + "loss": 2.5717, + "step": 40146 + }, + { + "epoch": 1.8691482179854273, + "grad_norm": 0.3351971577581597, + "learning_rate": 3.7385086231759094e-05, + "loss": 2.7146, + "step": 40147 + }, + { + "epoch": 1.8691947761715204, + "grad_norm": 0.33226097406291427, + "learning_rate": 3.738246514266594e-05, + "loss": 2.7397, + "step": 40148 + }, + { + "epoch": 1.8692413343576133, + "grad_norm": 0.35838944298998987, + "learning_rate": 3.737984409060399e-05, + "loss": 2.6648, + "step": 40149 + }, + { + "epoch": 1.8692878925437064, + "grad_norm": 0.35709240019036864, + "learning_rate": 3.737722307558097e-05, + "loss": 2.6645, + "step": 40150 + }, + { + "epoch": 1.8693344507297995, + "grad_norm": 0.34378393559682074, + "learning_rate": 3.737460209760456e-05, + "loss": 2.6641, + "step": 40151 + }, + { + "epoch": 1.8693810089158927, + "grad_norm": 0.33271811321082556, + "learning_rate": 3.7371981156682435e-05, + "loss": 2.684, + "step": 40152 + }, + { + "epoch": 1.8694275671019858, + "grad_norm": 0.32681781163652107, + "learning_rate": 3.736936025282232e-05, + "loss": 2.6212, + "step": 40153 + }, + { + "epoch": 1.8694741252880789, + "grad_norm": 0.33376395998788355, + "learning_rate": 3.736673938603188e-05, + "loss": 2.6582, + "step": 40154 + }, + { + "epoch": 1.869520683474172, + "grad_norm": 0.33268959685162836, + "learning_rate": 3.736411855631879e-05, + "loss": 2.632, + "step": 40155 + }, + { + "epoch": 1.869567241660265, + "grad_norm": 0.34435002691506555, + "learning_rate": 3.73614977636908e-05, + "loss": 2.6962, + "step": 40156 + }, + { + "epoch": 1.869613799846358, + "grad_norm": 0.3303035318989479, + "learning_rate": 3.7358877008155555e-05, + "loss": 2.7139, + "step": 40157 + }, + { + "epoch": 1.869660358032451, + "grad_norm": 0.36002630645141814, + "learning_rate": 3.735625628972077e-05, + "loss": 2.6097, + "step": 40158 + }, + { + "epoch": 1.869706916218544, + "grad_norm": 0.32509616404928054, + "learning_rate": 3.735363560839412e-05, + "loss": 2.6561, + "step": 40159 + }, + { + "epoch": 1.8697534744046371, + "grad_norm": 0.35409419341469567, + "learning_rate": 3.73510149641833e-05, + "loss": 2.684, + "step": 40160 + }, + { + "epoch": 1.8698000325907302, + "grad_norm": 0.36434280423072424, + "learning_rate": 3.734839435709602e-05, + "loss": 2.6784, + "step": 40161 + }, + { + "epoch": 1.8698465907768234, + "grad_norm": 0.32187688456949004, + "learning_rate": 3.734577378713993e-05, + "loss": 2.7505, + "step": 40162 + }, + { + "epoch": 1.8698931489629165, + "grad_norm": 0.35592087307689163, + "learning_rate": 3.734315325432277e-05, + "loss": 2.6801, + "step": 40163 + }, + { + "epoch": 1.8699397071490096, + "grad_norm": 0.33879978593161175, + "learning_rate": 3.7340532758652215e-05, + "loss": 2.6956, + "step": 40164 + }, + { + "epoch": 1.8699862653351027, + "grad_norm": 0.31803731763987325, + "learning_rate": 3.733791230013591e-05, + "loss": 2.7065, + "step": 40165 + }, + { + "epoch": 1.8700328235211956, + "grad_norm": 0.3485072080590512, + "learning_rate": 3.733529187878162e-05, + "loss": 2.711, + "step": 40166 + }, + { + "epoch": 1.8700793817072887, + "grad_norm": 0.3553817765954201, + "learning_rate": 3.7332671494596995e-05, + "loss": 2.7551, + "step": 40167 + }, + { + "epoch": 1.8701259398933816, + "grad_norm": 0.359485673107804, + 
"learning_rate": 3.733005114758972e-05, + "loss": 2.6847, + "step": 40168 + }, + { + "epoch": 1.8701724980794747, + "grad_norm": 0.3201133724947823, + "learning_rate": 3.732743083776751e-05, + "loss": 2.5754, + "step": 40169 + }, + { + "epoch": 1.8702190562655678, + "grad_norm": 0.35491790713557386, + "learning_rate": 3.732481056513804e-05, + "loss": 2.6744, + "step": 40170 + }, + { + "epoch": 1.870265614451661, + "grad_norm": 0.34839381244670187, + "learning_rate": 3.7322190329709004e-05, + "loss": 2.6811, + "step": 40171 + }, + { + "epoch": 1.870312172637754, + "grad_norm": 0.35523995888288473, + "learning_rate": 3.731957013148808e-05, + "loss": 2.7274, + "step": 40172 + }, + { + "epoch": 1.8703587308238472, + "grad_norm": 0.36475992713718625, + "learning_rate": 3.7316949970482974e-05, + "loss": 2.685, + "step": 40173 + }, + { + "epoch": 1.8704052890099403, + "grad_norm": 0.32788325168160015, + "learning_rate": 3.7314329846701367e-05, + "loss": 2.7904, + "step": 40174 + }, + { + "epoch": 1.8704518471960332, + "grad_norm": 0.3515445295256524, + "learning_rate": 3.731170976015095e-05, + "loss": 2.6441, + "step": 40175 + }, + { + "epoch": 1.8704984053821263, + "grad_norm": 0.3422551029362858, + "learning_rate": 3.730908971083942e-05, + "loss": 2.6299, + "step": 40176 + }, + { + "epoch": 1.8705449635682194, + "grad_norm": 0.34296484106942127, + "learning_rate": 3.730646969877447e-05, + "loss": 2.7101, + "step": 40177 + }, + { + "epoch": 1.8705915217543123, + "grad_norm": 0.3849874061893547, + "learning_rate": 3.730384972396376e-05, + "loss": 2.7313, + "step": 40178 + }, + { + "epoch": 1.8706380799404054, + "grad_norm": 0.3450142370917681, + "learning_rate": 3.7301229786415015e-05, + "loss": 2.6968, + "step": 40179 + }, + { + "epoch": 1.8706846381264985, + "grad_norm": 0.3444687280530566, + "learning_rate": 3.7298609886135905e-05, + "loss": 2.7269, + "step": 40180 + }, + { + "epoch": 1.8707311963125917, + "grad_norm": 0.369254466914411, + "learning_rate": 3.7295990023134126e-05, + "loss": 2.5915, + "step": 40181 + }, + { + "epoch": 1.8707777544986848, + "grad_norm": 0.35869124289224924, + "learning_rate": 3.729337019741737e-05, + "loss": 2.6515, + "step": 40182 + }, + { + "epoch": 1.870824312684778, + "grad_norm": 0.3676827988898491, + "learning_rate": 3.7290750408993317e-05, + "loss": 2.5967, + "step": 40183 + }, + { + "epoch": 1.870870870870871, + "grad_norm": 0.3333589266687916, + "learning_rate": 3.7288130657869655e-05, + "loss": 2.6979, + "step": 40184 + }, + { + "epoch": 1.870917429056964, + "grad_norm": 0.34778243529431563, + "learning_rate": 3.728551094405407e-05, + "loss": 2.7302, + "step": 40185 + }, + { + "epoch": 1.870963987243057, + "grad_norm": 0.39361346516731266, + "learning_rate": 3.728289126755428e-05, + "loss": 2.7278, + "step": 40186 + }, + { + "epoch": 1.8710105454291501, + "grad_norm": 0.33816755503843615, + "learning_rate": 3.728027162837794e-05, + "loss": 2.7558, + "step": 40187 + }, + { + "epoch": 1.871057103615243, + "grad_norm": 0.3506526847362003, + "learning_rate": 3.727765202653273e-05, + "loss": 2.6597, + "step": 40188 + }, + { + "epoch": 1.8711036618013361, + "grad_norm": 0.3367443179150763, + "learning_rate": 3.7275032462026385e-05, + "loss": 2.6449, + "step": 40189 + }, + { + "epoch": 1.8711502199874293, + "grad_norm": 0.36650741048345004, + "learning_rate": 3.727241293486657e-05, + "loss": 2.6859, + "step": 40190 + }, + { + "epoch": 1.8711967781735224, + "grad_norm": 0.32956068549741274, + "learning_rate": 3.726979344506094e-05, + "loss": 2.6194, + "step": 
40191 + }, + { + "epoch": 1.8712433363596155, + "grad_norm": 0.3413132106005024, + "learning_rate": 3.726717399261723e-05, + "loss": 2.7334, + "step": 40192 + }, + { + "epoch": 1.8712898945457086, + "grad_norm": 0.3446500876536249, + "learning_rate": 3.7264554577543096e-05, + "loss": 2.6403, + "step": 40193 + }, + { + "epoch": 1.8713364527318017, + "grad_norm": 0.3426303716176629, + "learning_rate": 3.726193519984626e-05, + "loss": 2.6358, + "step": 40194 + }, + { + "epoch": 1.8713830109178946, + "grad_norm": 0.33887663449930794, + "learning_rate": 3.725931585953439e-05, + "loss": 2.6294, + "step": 40195 + }, + { + "epoch": 1.8714295691039877, + "grad_norm": 0.36893209074326666, + "learning_rate": 3.7256696556615166e-05, + "loss": 2.6986, + "step": 40196 + }, + { + "epoch": 1.8714761272900806, + "grad_norm": 0.31739216895985967, + "learning_rate": 3.725407729109629e-05, + "loss": 2.5738, + "step": 40197 + }, + { + "epoch": 1.8715226854761737, + "grad_norm": 0.3342138853833717, + "learning_rate": 3.725145806298541e-05, + "loss": 2.7236, + "step": 40198 + }, + { + "epoch": 1.8715692436622668, + "grad_norm": 0.3332651552437744, + "learning_rate": 3.724883887229028e-05, + "loss": 2.653, + "step": 40199 + }, + { + "epoch": 1.87161580184836, + "grad_norm": 0.3346663979500692, + "learning_rate": 3.724621971901854e-05, + "loss": 2.6979, + "step": 40200 + }, + { + "epoch": 1.871662360034453, + "grad_norm": 0.34414751074895894, + "learning_rate": 3.724360060317788e-05, + "loss": 2.6956, + "step": 40201 + }, + { + "epoch": 1.8717089182205462, + "grad_norm": 0.33146138208718956, + "learning_rate": 3.724098152477601e-05, + "loss": 2.6891, + "step": 40202 + }, + { + "epoch": 1.8717554764066393, + "grad_norm": 0.3288709474581077, + "learning_rate": 3.723836248382058e-05, + "loss": 2.6667, + "step": 40203 + }, + { + "epoch": 1.8718020345927324, + "grad_norm": 0.3190195650134549, + "learning_rate": 3.723574348031932e-05, + "loss": 2.7229, + "step": 40204 + }, + { + "epoch": 1.8718485927788253, + "grad_norm": 0.34892056403001265, + "learning_rate": 3.7233124514279896e-05, + "loss": 2.7132, + "step": 40205 + }, + { + "epoch": 1.8718951509649184, + "grad_norm": 0.33762243892037763, + "learning_rate": 3.723050558570998e-05, + "loss": 2.713, + "step": 40206 + }, + { + "epoch": 1.8719417091510113, + "grad_norm": 0.3495545848410626, + "learning_rate": 3.7227886694617274e-05, + "loss": 2.5713, + "step": 40207 + }, + { + "epoch": 1.8719882673371044, + "grad_norm": 0.30812485572771137, + "learning_rate": 3.722526784100948e-05, + "loss": 2.673, + "step": 40208 + }, + { + "epoch": 1.8720348255231976, + "grad_norm": 0.3325317834787704, + "learning_rate": 3.722264902489425e-05, + "loss": 2.6292, + "step": 40209 + }, + { + "epoch": 1.8720813837092907, + "grad_norm": 0.35002460935496593, + "learning_rate": 3.722003024627929e-05, + "loss": 2.6407, + "step": 40210 + }, + { + "epoch": 1.8721279418953838, + "grad_norm": 0.33933277502049963, + "learning_rate": 3.721741150517227e-05, + "loss": 2.7562, + "step": 40211 + }, + { + "epoch": 1.872174500081477, + "grad_norm": 0.37164999507580093, + "learning_rate": 3.721479280158091e-05, + "loss": 2.734, + "step": 40212 + }, + { + "epoch": 1.87222105826757, + "grad_norm": 0.3155294776511871, + "learning_rate": 3.7212174135512876e-05, + "loss": 2.6783, + "step": 40213 + }, + { + "epoch": 1.8722676164536631, + "grad_norm": 0.3550420153697696, + "learning_rate": 3.720955550697582e-05, + "loss": 2.6844, + "step": 40214 + }, + { + "epoch": 1.872314174639756, + "grad_norm": 
0.34664027932930497, + "learning_rate": 3.720693691597749e-05, + "loss": 2.6757, + "step": 40215 + }, + { + "epoch": 1.8723607328258491, + "grad_norm": 0.3405382758227409, + "learning_rate": 3.720431836252551e-05, + "loss": 2.6825, + "step": 40216 + }, + { + "epoch": 1.872407291011942, + "grad_norm": 0.3153887420706196, + "learning_rate": 3.7201699846627615e-05, + "loss": 2.7139, + "step": 40217 + }, + { + "epoch": 1.8724538491980351, + "grad_norm": 0.34422152057166616, + "learning_rate": 3.719908136829148e-05, + "loss": 2.4822, + "step": 40218 + }, + { + "epoch": 1.8725004073841283, + "grad_norm": 0.32379223762245685, + "learning_rate": 3.719646292752475e-05, + "loss": 2.5962, + "step": 40219 + }, + { + "epoch": 1.8725469655702214, + "grad_norm": 0.30960340502893446, + "learning_rate": 3.719384452433516e-05, + "loss": 2.8262, + "step": 40220 + }, + { + "epoch": 1.8725935237563145, + "grad_norm": 0.31078580603322137, + "learning_rate": 3.719122615873038e-05, + "loss": 2.5842, + "step": 40221 + }, + { + "epoch": 1.8726400819424076, + "grad_norm": 0.34678047793173505, + "learning_rate": 3.718860783071808e-05, + "loss": 2.685, + "step": 40222 + }, + { + "epoch": 1.8726866401285007, + "grad_norm": 0.3494745120154088, + "learning_rate": 3.718598954030597e-05, + "loss": 2.6853, + "step": 40223 + }, + { + "epoch": 1.8727331983145936, + "grad_norm": 0.33840172059471363, + "learning_rate": 3.7183371287501686e-05, + "loss": 2.6146, + "step": 40224 + }, + { + "epoch": 1.8727797565006867, + "grad_norm": 0.35427649131192945, + "learning_rate": 3.718075307231297e-05, + "loss": 2.7776, + "step": 40225 + }, + { + "epoch": 1.8728263146867798, + "grad_norm": 0.3521571879364053, + "learning_rate": 3.7178134894747474e-05, + "loss": 2.7444, + "step": 40226 + }, + { + "epoch": 1.8728728728728727, + "grad_norm": 0.33288830985872814, + "learning_rate": 3.717551675481288e-05, + "loss": 2.6056, + "step": 40227 + }, + { + "epoch": 1.8729194310589659, + "grad_norm": 0.32342610157612217, + "learning_rate": 3.71728986525169e-05, + "loss": 2.5803, + "step": 40228 + }, + { + "epoch": 1.872965989245059, + "grad_norm": 0.3609520645345482, + "learning_rate": 3.717028058786717e-05, + "loss": 2.5875, + "step": 40229 + }, + { + "epoch": 1.873012547431152, + "grad_norm": 0.35061669309248505, + "learning_rate": 3.716766256087143e-05, + "loss": 2.6693, + "step": 40230 + }, + { + "epoch": 1.8730591056172452, + "grad_norm": 0.3167917656791728, + "learning_rate": 3.7165044571537325e-05, + "loss": 2.6983, + "step": 40231 + }, + { + "epoch": 1.8731056638033383, + "grad_norm": 0.34678129959133214, + "learning_rate": 3.7162426619872534e-05, + "loss": 2.68, + "step": 40232 + }, + { + "epoch": 1.8731522219894314, + "grad_norm": 0.3674198249701318, + "learning_rate": 3.7159808705884784e-05, + "loss": 2.6714, + "step": 40233 + }, + { + "epoch": 1.8731987801755243, + "grad_norm": 0.3181296929713874, + "learning_rate": 3.715719082958172e-05, + "loss": 2.727, + "step": 40234 + }, + { + "epoch": 1.8732453383616174, + "grad_norm": 0.3473070044859225, + "learning_rate": 3.715457299097104e-05, + "loss": 2.664, + "step": 40235 + }, + { + "epoch": 1.8732918965477106, + "grad_norm": 0.3353962879674779, + "learning_rate": 3.715195519006043e-05, + "loss": 2.6766, + "step": 40236 + }, + { + "epoch": 1.8733384547338034, + "grad_norm": 0.3626524152641527, + "learning_rate": 3.714933742685753e-05, + "loss": 2.7204, + "step": 40237 + }, + { + "epoch": 1.8733850129198966, + "grad_norm": 0.3297128562855754, + "learning_rate": 3.71467197013701e-05, + "loss": 
2.7333, + "step": 40238 + }, + { + "epoch": 1.8734315711059897, + "grad_norm": 0.3418430491855271, + "learning_rate": 3.714410201360575e-05, + "loss": 2.6778, + "step": 40239 + }, + { + "epoch": 1.8734781292920828, + "grad_norm": 0.35363578406257024, + "learning_rate": 3.714148436357221e-05, + "loss": 2.7541, + "step": 40240 + }, + { + "epoch": 1.873524687478176, + "grad_norm": 0.3306355148311193, + "learning_rate": 3.713886675127716e-05, + "loss": 2.6953, + "step": 40241 + }, + { + "epoch": 1.873571245664269, + "grad_norm": 0.33034039729670084, + "learning_rate": 3.713624917672823e-05, + "loss": 2.6578, + "step": 40242 + }, + { + "epoch": 1.8736178038503621, + "grad_norm": 0.3589134010976236, + "learning_rate": 3.713363163993317e-05, + "loss": 2.7978, + "step": 40243 + }, + { + "epoch": 1.873664362036455, + "grad_norm": 0.33343108959013, + "learning_rate": 3.713101414089965e-05, + "loss": 2.6481, + "step": 40244 + }, + { + "epoch": 1.8737109202225481, + "grad_norm": 0.3452556344244787, + "learning_rate": 3.7128396679635294e-05, + "loss": 2.754, + "step": 40245 + }, + { + "epoch": 1.873757478408641, + "grad_norm": 0.38075301096293257, + "learning_rate": 3.712577925614785e-05, + "loss": 2.676, + "step": 40246 + }, + { + "epoch": 1.8738040365947342, + "grad_norm": 0.3488589688141198, + "learning_rate": 3.712316187044497e-05, + "loss": 2.7115, + "step": 40247 + }, + { + "epoch": 1.8738505947808273, + "grad_norm": 0.3293166921402611, + "learning_rate": 3.712054452253435e-05, + "loss": 2.6351, + "step": 40248 + }, + { + "epoch": 1.8738971529669204, + "grad_norm": 0.35014373986616537, + "learning_rate": 3.7117927212423666e-05, + "loss": 2.7731, + "step": 40249 + }, + { + "epoch": 1.8739437111530135, + "grad_norm": 0.36248236807688283, + "learning_rate": 3.7115309940120565e-05, + "loss": 2.5644, + "step": 40250 + }, + { + "epoch": 1.8739902693391066, + "grad_norm": 0.3320556239236965, + "learning_rate": 3.711269270563279e-05, + "loss": 2.6739, + "step": 40251 + }, + { + "epoch": 1.8740368275251997, + "grad_norm": 0.35318278809450704, + "learning_rate": 3.711007550896797e-05, + "loss": 2.7455, + "step": 40252 + }, + { + "epoch": 1.8740833857112928, + "grad_norm": 0.33747059156466686, + "learning_rate": 3.710745835013383e-05, + "loss": 2.6563, + "step": 40253 + }, + { + "epoch": 1.8741299438973857, + "grad_norm": 0.3528446334061416, + "learning_rate": 3.7104841229138035e-05, + "loss": 2.7374, + "step": 40254 + }, + { + "epoch": 1.8741765020834789, + "grad_norm": 0.34813369625036, + "learning_rate": 3.710222414598823e-05, + "loss": 2.6812, + "step": 40255 + }, + { + "epoch": 1.8742230602695718, + "grad_norm": 0.3509282818004057, + "learning_rate": 3.709960710069215e-05, + "loss": 2.7059, + "step": 40256 + }, + { + "epoch": 1.8742696184556649, + "grad_norm": 0.3577272479530608, + "learning_rate": 3.709699009325745e-05, + "loss": 2.7135, + "step": 40257 + }, + { + "epoch": 1.874316176641758, + "grad_norm": 0.3846353182684124, + "learning_rate": 3.70943731236918e-05, + "loss": 2.7787, + "step": 40258 + }, + { + "epoch": 1.874362734827851, + "grad_norm": 0.371016181869279, + "learning_rate": 3.709175619200291e-05, + "loss": 2.7191, + "step": 40259 + }, + { + "epoch": 1.8744092930139442, + "grad_norm": 0.34388852331277014, + "learning_rate": 3.7089139298198434e-05, + "loss": 2.6705, + "step": 40260 + }, + { + "epoch": 1.8744558512000373, + "grad_norm": 0.37521252399768573, + "learning_rate": 3.708652244228607e-05, + "loss": 2.6474, + "step": 40261 + }, + { + "epoch": 1.8745024093861304, + "grad_norm": 
0.38754786626573473, + "learning_rate": 3.7083905624273493e-05, + "loss": 2.7391, + "step": 40262 + }, + { + "epoch": 1.8745489675722233, + "grad_norm": 0.35439922546270086, + "learning_rate": 3.708128884416836e-05, + "loss": 2.6716, + "step": 40263 + }, + { + "epoch": 1.8745955257583164, + "grad_norm": 0.37308062188063396, + "learning_rate": 3.707867210197839e-05, + "loss": 2.7845, + "step": 40264 + }, + { + "epoch": 1.8746420839444096, + "grad_norm": 0.35790215875381415, + "learning_rate": 3.707605539771123e-05, + "loss": 2.7677, + "step": 40265 + }, + { + "epoch": 1.8746886421305025, + "grad_norm": 0.3492979489539991, + "learning_rate": 3.707343873137459e-05, + "loss": 2.765, + "step": 40266 + }, + { + "epoch": 1.8747352003165956, + "grad_norm": 0.3508075051567763, + "learning_rate": 3.707082210297613e-05, + "loss": 2.6754, + "step": 40267 + }, + { + "epoch": 1.8747817585026887, + "grad_norm": 0.3758745672605751, + "learning_rate": 3.7068205512523515e-05, + "loss": 2.6658, + "step": 40268 + }, + { + "epoch": 1.8748283166887818, + "grad_norm": 0.339185359594971, + "learning_rate": 3.7065588960024466e-05, + "loss": 2.6683, + "step": 40269 + }, + { + "epoch": 1.874874874874875, + "grad_norm": 0.3608037369827677, + "learning_rate": 3.706297244548663e-05, + "loss": 2.7374, + "step": 40270 + }, + { + "epoch": 1.874921433060968, + "grad_norm": 0.36065919183927925, + "learning_rate": 3.7060355968917694e-05, + "loss": 2.7794, + "step": 40271 + }, + { + "epoch": 1.8749679912470611, + "grad_norm": 0.3726315226024017, + "learning_rate": 3.705773953032534e-05, + "loss": 2.6942, + "step": 40272 + }, + { + "epoch": 1.875014549433154, + "grad_norm": 0.3526765861033131, + "learning_rate": 3.705512312971725e-05, + "loss": 2.7148, + "step": 40273 + }, + { + "epoch": 1.8750611076192472, + "grad_norm": 0.3503246759163583, + "learning_rate": 3.7052506767101105e-05, + "loss": 2.6454, + "step": 40274 + }, + { + "epoch": 1.8751076658053403, + "grad_norm": 0.37920188257605697, + "learning_rate": 3.704989044248455e-05, + "loss": 2.7179, + "step": 40275 + }, + { + "epoch": 1.8751542239914332, + "grad_norm": 0.33630801280853323, + "learning_rate": 3.704727415587531e-05, + "loss": 2.6987, + "step": 40276 + }, + { + "epoch": 1.8752007821775263, + "grad_norm": 0.3177595194647769, + "learning_rate": 3.7044657907281056e-05, + "loss": 2.6106, + "step": 40277 + }, + { + "epoch": 1.8752473403636194, + "grad_norm": 0.36775770280629627, + "learning_rate": 3.704204169670942e-05, + "loss": 2.6893, + "step": 40278 + }, + { + "epoch": 1.8752938985497125, + "grad_norm": 0.3186750585773782, + "learning_rate": 3.703942552416815e-05, + "loss": 2.6889, + "step": 40279 + }, + { + "epoch": 1.8753404567358056, + "grad_norm": 0.3698335923844017, + "learning_rate": 3.703680938966488e-05, + "loss": 2.7514, + "step": 40280 + }, + { + "epoch": 1.8753870149218987, + "grad_norm": 0.3583841253609263, + "learning_rate": 3.703419329320728e-05, + "loss": 2.7561, + "step": 40281 + }, + { + "epoch": 1.8754335731079919, + "grad_norm": 0.32386730494661026, + "learning_rate": 3.703157723480306e-05, + "loss": 2.7611, + "step": 40282 + }, + { + "epoch": 1.8754801312940848, + "grad_norm": 0.3578666962082243, + "learning_rate": 3.702896121445988e-05, + "loss": 2.7493, + "step": 40283 + }, + { + "epoch": 1.8755266894801779, + "grad_norm": 0.34137916193792234, + "learning_rate": 3.702634523218542e-05, + "loss": 2.6769, + "step": 40284 + }, + { + "epoch": 1.8755732476662708, + "grad_norm": 0.3412110613161454, + "learning_rate": 3.702372928798736e-05, + 
"loss": 2.6374, + "step": 40285 + }, + { + "epoch": 1.8756198058523639, + "grad_norm": 0.34309359371721077, + "learning_rate": 3.702111338187337e-05, + "loss": 2.7024, + "step": 40286 + }, + { + "epoch": 1.875666364038457, + "grad_norm": 0.34605706843244566, + "learning_rate": 3.701849751385116e-05, + "loss": 2.6632, + "step": 40287 + }, + { + "epoch": 1.87571292222455, + "grad_norm": 0.3500483886720172, + "learning_rate": 3.7015881683928344e-05, + "loss": 2.7287, + "step": 40288 + }, + { + "epoch": 1.8757594804106432, + "grad_norm": 0.33761056713152376, + "learning_rate": 3.701326589211266e-05, + "loss": 2.7022, + "step": 40289 + }, + { + "epoch": 1.8758060385967363, + "grad_norm": 0.33802242355568485, + "learning_rate": 3.701065013841176e-05, + "loss": 2.5663, + "step": 40290 + }, + { + "epoch": 1.8758525967828295, + "grad_norm": 0.32507804424954845, + "learning_rate": 3.7008034422833306e-05, + "loss": 2.771, + "step": 40291 + }, + { + "epoch": 1.8758991549689226, + "grad_norm": 0.3533630027193495, + "learning_rate": 3.7005418745385e-05, + "loss": 2.6983, + "step": 40292 + }, + { + "epoch": 1.8759457131550155, + "grad_norm": 0.3298133461011379, + "learning_rate": 3.7002803106074504e-05, + "loss": 2.706, + "step": 40293 + }, + { + "epoch": 1.8759922713411086, + "grad_norm": 0.3385439776619948, + "learning_rate": 3.700018750490951e-05, + "loss": 2.6668, + "step": 40294 + }, + { + "epoch": 1.8760388295272015, + "grad_norm": 0.3669907722718788, + "learning_rate": 3.699757194189769e-05, + "loss": 2.7103, + "step": 40295 + }, + { + "epoch": 1.8760853877132946, + "grad_norm": 0.34481468050331915, + "learning_rate": 3.6994956417046714e-05, + "loss": 2.6778, + "step": 40296 + }, + { + "epoch": 1.8761319458993877, + "grad_norm": 0.3455776102550966, + "learning_rate": 3.699234093036424e-05, + "loss": 2.638, + "step": 40297 + }, + { + "epoch": 1.8761785040854808, + "grad_norm": 0.3751803368105223, + "learning_rate": 3.6989725481857986e-05, + "loss": 2.7202, + "step": 40298 + }, + { + "epoch": 1.876225062271574, + "grad_norm": 0.3319364007336326, + "learning_rate": 3.69871100715356e-05, + "loss": 2.6322, + "step": 40299 + }, + { + "epoch": 1.876271620457667, + "grad_norm": 0.3397983151947278, + "learning_rate": 3.698449469940477e-05, + "loss": 2.7629, + "step": 40300 + }, + { + "epoch": 1.8763181786437602, + "grad_norm": 0.3677046924016248, + "learning_rate": 3.6981879365473136e-05, + "loss": 2.7683, + "step": 40301 + }, + { + "epoch": 1.8763647368298533, + "grad_norm": 0.3484177357793802, + "learning_rate": 3.6979264069748435e-05, + "loss": 2.6516, + "step": 40302 + }, + { + "epoch": 1.8764112950159462, + "grad_norm": 0.3305014531681469, + "learning_rate": 3.697664881223831e-05, + "loss": 2.5601, + "step": 40303 + }, + { + "epoch": 1.8764578532020393, + "grad_norm": 0.3723777246852591, + "learning_rate": 3.697403359295042e-05, + "loss": 2.6296, + "step": 40304 + }, + { + "epoch": 1.8765044113881322, + "grad_norm": 0.3457642458500045, + "learning_rate": 3.697141841189247e-05, + "loss": 2.6863, + "step": 40305 + }, + { + "epoch": 1.8765509695742253, + "grad_norm": 0.345253797613677, + "learning_rate": 3.6968803269072104e-05, + "loss": 2.6344, + "step": 40306 + }, + { + "epoch": 1.8765975277603184, + "grad_norm": 0.33292388177767845, + "learning_rate": 3.696618816449704e-05, + "loss": 2.6158, + "step": 40307 + }, + { + "epoch": 1.8766440859464115, + "grad_norm": 0.3304537269315336, + "learning_rate": 3.696357309817493e-05, + "loss": 2.7484, + "step": 40308 + }, + { + "epoch": 1.8766906441325046, + 
"grad_norm": 0.3441301714973754, + "learning_rate": 3.696095807011344e-05, + "loss": 2.6824, + "step": 40309 + }, + { + "epoch": 1.8767372023185978, + "grad_norm": 0.36121600204438215, + "learning_rate": 3.695834308032026e-05, + "loss": 2.7345, + "step": 40310 + }, + { + "epoch": 1.8767837605046909, + "grad_norm": 0.3380049687193229, + "learning_rate": 3.695572812880305e-05, + "loss": 2.7721, + "step": 40311 + }, + { + "epoch": 1.8768303186907838, + "grad_norm": 0.40194455819242453, + "learning_rate": 3.6953113215569505e-05, + "loss": 2.6504, + "step": 40312 + }, + { + "epoch": 1.8768768768768769, + "grad_norm": 0.37665212685299604, + "learning_rate": 3.695049834062728e-05, + "loss": 2.7098, + "step": 40313 + }, + { + "epoch": 1.87692343506297, + "grad_norm": 0.3446866751149455, + "learning_rate": 3.694788350398404e-05, + "loss": 2.5723, + "step": 40314 + }, + { + "epoch": 1.8769699932490629, + "grad_norm": 0.3671986125573335, + "learning_rate": 3.6945268705647494e-05, + "loss": 2.7022, + "step": 40315 + }, + { + "epoch": 1.877016551435156, + "grad_norm": 0.36306948339718803, + "learning_rate": 3.6942653945625305e-05, + "loss": 2.6837, + "step": 40316 + }, + { + "epoch": 1.8770631096212491, + "grad_norm": 0.3580267778035693, + "learning_rate": 3.6940039223925115e-05, + "loss": 2.6499, + "step": 40317 + }, + { + "epoch": 1.8771096678073422, + "grad_norm": 0.34910738537339503, + "learning_rate": 3.693742454055466e-05, + "loss": 2.6714, + "step": 40318 + }, + { + "epoch": 1.8771562259934353, + "grad_norm": 0.3582367434148055, + "learning_rate": 3.6934809895521533e-05, + "loss": 2.6163, + "step": 40319 + }, + { + "epoch": 1.8772027841795285, + "grad_norm": 0.35069310367109524, + "learning_rate": 3.693219528883348e-05, + "loss": 2.7557, + "step": 40320 + }, + { + "epoch": 1.8772493423656216, + "grad_norm": 0.33712376808199324, + "learning_rate": 3.692958072049815e-05, + "loss": 2.6486, + "step": 40321 + }, + { + "epoch": 1.8772959005517145, + "grad_norm": 0.3382212564998741, + "learning_rate": 3.69269661905232e-05, + "loss": 2.6313, + "step": 40322 + }, + { + "epoch": 1.8773424587378076, + "grad_norm": 0.34833275431260835, + "learning_rate": 3.692435169891632e-05, + "loss": 2.6877, + "step": 40323 + }, + { + "epoch": 1.8773890169239007, + "grad_norm": 0.353926201877428, + "learning_rate": 3.6921737245685176e-05, + "loss": 2.6403, + "step": 40324 + }, + { + "epoch": 1.8774355751099936, + "grad_norm": 0.33336074405440375, + "learning_rate": 3.691912283083746e-05, + "loss": 2.6239, + "step": 40325 + }, + { + "epoch": 1.8774821332960867, + "grad_norm": 0.3449165422798244, + "learning_rate": 3.691650845438083e-05, + "loss": 2.6716, + "step": 40326 + }, + { + "epoch": 1.8775286914821798, + "grad_norm": 0.3583681018458443, + "learning_rate": 3.691389411632293e-05, + "loss": 2.5941, + "step": 40327 + }, + { + "epoch": 1.877575249668273, + "grad_norm": 0.3408177909063913, + "learning_rate": 3.691127981667149e-05, + "loss": 2.6035, + "step": 40328 + }, + { + "epoch": 1.877621807854366, + "grad_norm": 0.36043380995423135, + "learning_rate": 3.690866555543412e-05, + "loss": 2.7482, + "step": 40329 + }, + { + "epoch": 1.8776683660404592, + "grad_norm": 0.3515202612310529, + "learning_rate": 3.6906051332618564e-05, + "loss": 2.6036, + "step": 40330 + }, + { + "epoch": 1.8777149242265523, + "grad_norm": 0.3337383153797354, + "learning_rate": 3.690343714823245e-05, + "loss": 2.7945, + "step": 40331 + }, + { + "epoch": 1.8777614824126452, + "grad_norm": 0.36243190130753444, + "learning_rate": 
3.690082300228344e-05, + "loss": 2.7091, + "step": 40332 + }, + { + "epoch": 1.8778080405987383, + "grad_norm": 0.32210148531427557, + "learning_rate": 3.689820889477924e-05, + "loss": 2.7026, + "step": 40333 + }, + { + "epoch": 1.8778545987848312, + "grad_norm": 0.38011990307598126, + "learning_rate": 3.689559482572751e-05, + "loss": 2.6849, + "step": 40334 + }, + { + "epoch": 1.8779011569709243, + "grad_norm": 0.33845470072152956, + "learning_rate": 3.689298079513591e-05, + "loss": 2.7345, + "step": 40335 + }, + { + "epoch": 1.8779477151570174, + "grad_norm": 0.34003907751626405, + "learning_rate": 3.689036680301211e-05, + "loss": 2.5888, + "step": 40336 + }, + { + "epoch": 1.8779942733431105, + "grad_norm": 0.3968179811539315, + "learning_rate": 3.68877528493638e-05, + "loss": 2.6662, + "step": 40337 + }, + { + "epoch": 1.8780408315292036, + "grad_norm": 0.32419213313598244, + "learning_rate": 3.6885138934198656e-05, + "loss": 2.6353, + "step": 40338 + }, + { + "epoch": 1.8780873897152968, + "grad_norm": 0.3368144058775332, + "learning_rate": 3.688252505752433e-05, + "loss": 2.7258, + "step": 40339 + }, + { + "epoch": 1.8781339479013899, + "grad_norm": 0.3378975465466682, + "learning_rate": 3.687991121934848e-05, + "loss": 2.6274, + "step": 40340 + }, + { + "epoch": 1.878180506087483, + "grad_norm": 0.3254875199038626, + "learning_rate": 3.687729741967883e-05, + "loss": 2.8005, + "step": 40341 + }, + { + "epoch": 1.8782270642735759, + "grad_norm": 0.3574580821041781, + "learning_rate": 3.687468365852299e-05, + "loss": 2.7742, + "step": 40342 + }, + { + "epoch": 1.878273622459669, + "grad_norm": 0.3554811335517112, + "learning_rate": 3.687206993588868e-05, + "loss": 2.6127, + "step": 40343 + }, + { + "epoch": 1.878320180645762, + "grad_norm": 0.3436609060885734, + "learning_rate": 3.6869456251783555e-05, + "loss": 2.8295, + "step": 40344 + }, + { + "epoch": 1.878366738831855, + "grad_norm": 0.3681906334419903, + "learning_rate": 3.686684260621526e-05, + "loss": 2.7598, + "step": 40345 + }, + { + "epoch": 1.8784132970179481, + "grad_norm": 0.3762811923200093, + "learning_rate": 3.686422899919151e-05, + "loss": 2.7642, + "step": 40346 + }, + { + "epoch": 1.8784598552040412, + "grad_norm": 0.37312782765732755, + "learning_rate": 3.686161543071994e-05, + "loss": 2.7418, + "step": 40347 + }, + { + "epoch": 1.8785064133901344, + "grad_norm": 0.35405505340537846, + "learning_rate": 3.6859001900808254e-05, + "loss": 2.7404, + "step": 40348 + }, + { + "epoch": 1.8785529715762275, + "grad_norm": 0.3649426553254693, + "learning_rate": 3.6856388409464084e-05, + "loss": 2.6439, + "step": 40349 + }, + { + "epoch": 1.8785995297623206, + "grad_norm": 0.34182063984276456, + "learning_rate": 3.6853774956695116e-05, + "loss": 2.6752, + "step": 40350 + }, + { + "epoch": 1.8786460879484135, + "grad_norm": 0.3380139948227304, + "learning_rate": 3.685116154250904e-05, + "loss": 2.6126, + "step": 40351 + }, + { + "epoch": 1.8786926461345066, + "grad_norm": 0.35510192716018524, + "learning_rate": 3.684854816691351e-05, + "loss": 2.6677, + "step": 40352 + }, + { + "epoch": 1.8787392043205997, + "grad_norm": 0.35646784858825653, + "learning_rate": 3.684593482991617e-05, + "loss": 2.6677, + "step": 40353 + }, + { + "epoch": 1.8787857625066926, + "grad_norm": 0.3452825751637269, + "learning_rate": 3.6843321531524746e-05, + "loss": 2.6751, + "step": 40354 + }, + { + "epoch": 1.8788323206927857, + "grad_norm": 0.34490787634494885, + "learning_rate": 3.684070827174685e-05, + "loss": 2.5841, + "step": 40355 + }, + { + 
"epoch": 1.8788788788788788, + "grad_norm": 0.3446458552329258, + "learning_rate": 3.68380950505902e-05, + "loss": 2.7296, + "step": 40356 + }, + { + "epoch": 1.878925437064972, + "grad_norm": 0.3287528212016884, + "learning_rate": 3.6835481868062435e-05, + "loss": 2.5477, + "step": 40357 + }, + { + "epoch": 1.878971995251065, + "grad_norm": 0.3360683116087413, + "learning_rate": 3.683286872417122e-05, + "loss": 2.5881, + "step": 40358 + }, + { + "epoch": 1.8790185534371582, + "grad_norm": 0.32894040458419777, + "learning_rate": 3.6830255618924266e-05, + "loss": 2.6928, + "step": 40359 + }, + { + "epoch": 1.8790651116232513, + "grad_norm": 0.34459353027296696, + "learning_rate": 3.68276425523292e-05, + "loss": 2.6645, + "step": 40360 + }, + { + "epoch": 1.8791116698093442, + "grad_norm": 0.33864445200155363, + "learning_rate": 3.682502952439371e-05, + "loss": 2.7603, + "step": 40361 + }, + { + "epoch": 1.8791582279954373, + "grad_norm": 0.3222856539339088, + "learning_rate": 3.682241653512546e-05, + "loss": 2.6365, + "step": 40362 + }, + { + "epoch": 1.8792047861815304, + "grad_norm": 0.34139478102678755, + "learning_rate": 3.681980358453211e-05, + "loss": 2.7219, + "step": 40363 + }, + { + "epoch": 1.8792513443676233, + "grad_norm": 0.3400973934156035, + "learning_rate": 3.681719067262135e-05, + "loss": 2.7965, + "step": 40364 + }, + { + "epoch": 1.8792979025537164, + "grad_norm": 0.33766952611811785, + "learning_rate": 3.681457779940082e-05, + "loss": 2.6954, + "step": 40365 + }, + { + "epoch": 1.8793444607398095, + "grad_norm": 0.3334104392921653, + "learning_rate": 3.681196496487822e-05, + "loss": 2.7327, + "step": 40366 + }, + { + "epoch": 1.8793910189259027, + "grad_norm": 0.35233657268333357, + "learning_rate": 3.6809352169061206e-05, + "loss": 2.687, + "step": 40367 + }, + { + "epoch": 1.8794375771119958, + "grad_norm": 0.3177088908805629, + "learning_rate": 3.680673941195742e-05, + "loss": 2.6264, + "step": 40368 + }, + { + "epoch": 1.8794841352980889, + "grad_norm": 0.3400453734406832, + "learning_rate": 3.6804126693574586e-05, + "loss": 2.7254, + "step": 40369 + }, + { + "epoch": 1.879530693484182, + "grad_norm": 0.3527045770926623, + "learning_rate": 3.680151401392034e-05, + "loss": 2.7123, + "step": 40370 + }, + { + "epoch": 1.879577251670275, + "grad_norm": 0.3165083980144565, + "learning_rate": 3.6798901373002316e-05, + "loss": 2.6949, + "step": 40371 + }, + { + "epoch": 1.879623809856368, + "grad_norm": 0.3247471879457507, + "learning_rate": 3.6796288770828245e-05, + "loss": 2.6905, + "step": 40372 + }, + { + "epoch": 1.879670368042461, + "grad_norm": 0.3282325824137625, + "learning_rate": 3.6793676207405755e-05, + "loss": 2.7761, + "step": 40373 + }, + { + "epoch": 1.879716926228554, + "grad_norm": 0.33446338511250784, + "learning_rate": 3.679106368274253e-05, + "loss": 2.7736, + "step": 40374 + }, + { + "epoch": 1.8797634844146471, + "grad_norm": 0.3271001232818071, + "learning_rate": 3.678845119684622e-05, + "loss": 2.74, + "step": 40375 + }, + { + "epoch": 1.8798100426007402, + "grad_norm": 0.3419559883333052, + "learning_rate": 3.678583874972451e-05, + "loss": 2.7137, + "step": 40376 + }, + { + "epoch": 1.8798566007868334, + "grad_norm": 0.3296704849048734, + "learning_rate": 3.678322634138508e-05, + "loss": 2.7167, + "step": 40377 + }, + { + "epoch": 1.8799031589729265, + "grad_norm": 0.31536220540318577, + "learning_rate": 3.678061397183554e-05, + "loss": 2.7211, + "step": 40378 + }, + { + "epoch": 1.8799497171590196, + "grad_norm": 0.36219922986252373, + 
"learning_rate": 3.6778001641083624e-05, + "loss": 2.6913, + "step": 40379 + }, + { + "epoch": 1.8799962753451127, + "grad_norm": 0.3305227190661842, + "learning_rate": 3.677538934913698e-05, + "loss": 2.6494, + "step": 40380 + }, + { + "epoch": 1.8800428335312056, + "grad_norm": 0.3366121142304823, + "learning_rate": 3.677277709600323e-05, + "loss": 2.7606, + "step": 40381 + }, + { + "epoch": 1.8800893917172987, + "grad_norm": 0.35881362660514105, + "learning_rate": 3.6770164881690104e-05, + "loss": 2.7031, + "step": 40382 + }, + { + "epoch": 1.8801359499033916, + "grad_norm": 0.31872263516111415, + "learning_rate": 3.676755270620522e-05, + "loss": 2.6554, + "step": 40383 + }, + { + "epoch": 1.8801825080894847, + "grad_norm": 0.32865033705123287, + "learning_rate": 3.6764940569556284e-05, + "loss": 2.6108, + "step": 40384 + }, + { + "epoch": 1.8802290662755778, + "grad_norm": 0.3439568107782536, + "learning_rate": 3.676232847175094e-05, + "loss": 2.659, + "step": 40385 + }, + { + "epoch": 1.880275624461671, + "grad_norm": 0.3320183479954205, + "learning_rate": 3.6759716412796855e-05, + "loss": 2.7769, + "step": 40386 + }, + { + "epoch": 1.880322182647764, + "grad_norm": 0.3389476588788196, + "learning_rate": 3.6757104392701705e-05, + "loss": 2.7313, + "step": 40387 + }, + { + "epoch": 1.8803687408338572, + "grad_norm": 0.32575545781364246, + "learning_rate": 3.675449241147314e-05, + "loss": 2.7299, + "step": 40388 + }, + { + "epoch": 1.8804152990199503, + "grad_norm": 0.362180811151009, + "learning_rate": 3.675188046911884e-05, + "loss": 2.6148, + "step": 40389 + }, + { + "epoch": 1.8804618572060434, + "grad_norm": 0.33950587988939696, + "learning_rate": 3.6749268565646464e-05, + "loss": 2.6288, + "step": 40390 + }, + { + "epoch": 1.8805084153921363, + "grad_norm": 0.3332539723077693, + "learning_rate": 3.674665670106366e-05, + "loss": 2.7377, + "step": 40391 + }, + { + "epoch": 1.8805549735782294, + "grad_norm": 0.36347144354474653, + "learning_rate": 3.6744044875378145e-05, + "loss": 2.7608, + "step": 40392 + }, + { + "epoch": 1.8806015317643223, + "grad_norm": 0.3469635984621745, + "learning_rate": 3.674143308859754e-05, + "loss": 2.6749, + "step": 40393 + }, + { + "epoch": 1.8806480899504154, + "grad_norm": 0.3404561871728822, + "learning_rate": 3.673882134072951e-05, + "loss": 2.652, + "step": 40394 + }, + { + "epoch": 1.8806946481365086, + "grad_norm": 0.3537161555940542, + "learning_rate": 3.673620963178176e-05, + "loss": 2.5355, + "step": 40395 + }, + { + "epoch": 1.8807412063226017, + "grad_norm": 0.35878366974369236, + "learning_rate": 3.673359796176189e-05, + "loss": 2.7155, + "step": 40396 + }, + { + "epoch": 1.8807877645086948, + "grad_norm": 0.3662013862208205, + "learning_rate": 3.673098633067763e-05, + "loss": 2.7983, + "step": 40397 + }, + { + "epoch": 1.880834322694788, + "grad_norm": 0.34123774531799334, + "learning_rate": 3.672837473853663e-05, + "loss": 2.6554, + "step": 40398 + }, + { + "epoch": 1.880880880880881, + "grad_norm": 0.37840656611717, + "learning_rate": 3.6725763185346515e-05, + "loss": 2.5839, + "step": 40399 + }, + { + "epoch": 1.880927439066974, + "grad_norm": 0.3285285018932093, + "learning_rate": 3.6723151671115e-05, + "loss": 2.8125, + "step": 40400 + }, + { + "epoch": 1.880973997253067, + "grad_norm": 0.34954007774932666, + "learning_rate": 3.6720540195849696e-05, + "loss": 2.5687, + "step": 40401 + }, + { + "epoch": 1.8810205554391601, + "grad_norm": 0.36351274634543784, + "learning_rate": 3.671792875955833e-05, + "loss": 2.6851, + "step": 40402 + 
}, + { + "epoch": 1.881067113625253, + "grad_norm": 0.33807060312549364, + "learning_rate": 3.6715317362248524e-05, + "loss": 2.7974, + "step": 40403 + }, + { + "epoch": 1.8811136718113461, + "grad_norm": 0.33963274694603074, + "learning_rate": 3.671270600392794e-05, + "loss": 2.6369, + "step": 40404 + }, + { + "epoch": 1.8811602299974393, + "grad_norm": 0.35966415485615255, + "learning_rate": 3.6710094684604266e-05, + "loss": 2.7672, + "step": 40405 + }, + { + "epoch": 1.8812067881835324, + "grad_norm": 0.3283748045282513, + "learning_rate": 3.670748340428517e-05, + "loss": 2.7019, + "step": 40406 + }, + { + "epoch": 1.8812533463696255, + "grad_norm": 0.35226221022824894, + "learning_rate": 3.6704872162978265e-05, + "loss": 2.7036, + "step": 40407 + }, + { + "epoch": 1.8812999045557186, + "grad_norm": 0.3320921268546012, + "learning_rate": 3.6702260960691284e-05, + "loss": 2.7429, + "step": 40408 + }, + { + "epoch": 1.8813464627418117, + "grad_norm": 0.3513270272939601, + "learning_rate": 3.669964979743182e-05, + "loss": 2.6685, + "step": 40409 + }, + { + "epoch": 1.8813930209279046, + "grad_norm": 0.35719593756896645, + "learning_rate": 3.6697038673207606e-05, + "loss": 2.6436, + "step": 40410 + }, + { + "epoch": 1.8814395791139977, + "grad_norm": 0.3352160047095148, + "learning_rate": 3.669442758802627e-05, + "loss": 2.7348, + "step": 40411 + }, + { + "epoch": 1.8814861373000906, + "grad_norm": 0.34608648004227516, + "learning_rate": 3.669181654189547e-05, + "loss": 2.7409, + "step": 40412 + }, + { + "epoch": 1.8815326954861837, + "grad_norm": 0.34397495883746154, + "learning_rate": 3.668920553482289e-05, + "loss": 2.6391, + "step": 40413 + }, + { + "epoch": 1.8815792536722769, + "grad_norm": 0.3263387545446677, + "learning_rate": 3.668659456681616e-05, + "loss": 2.724, + "step": 40414 + }, + { + "epoch": 1.88162581185837, + "grad_norm": 0.33892240644464744, + "learning_rate": 3.668398363788297e-05, + "loss": 2.6985, + "step": 40415 + }, + { + "epoch": 1.881672370044463, + "grad_norm": 0.33663331963641796, + "learning_rate": 3.6681372748031e-05, + "loss": 2.7549, + "step": 40416 + }, + { + "epoch": 1.8817189282305562, + "grad_norm": 0.325968249890662, + "learning_rate": 3.667876189726784e-05, + "loss": 2.6016, + "step": 40417 + }, + { + "epoch": 1.8817654864166493, + "grad_norm": 0.3750220982264454, + "learning_rate": 3.6676151085601243e-05, + "loss": 2.7761, + "step": 40418 + }, + { + "epoch": 1.8818120446027424, + "grad_norm": 0.34246556812363615, + "learning_rate": 3.66735403130388e-05, + "loss": 2.7193, + "step": 40419 + }, + { + "epoch": 1.8818586027888353, + "grad_norm": 0.32694655978413456, + "learning_rate": 3.667092957958823e-05, + "loss": 2.662, + "step": 40420 + }, + { + "epoch": 1.8819051609749284, + "grad_norm": 0.37844735297372617, + "learning_rate": 3.666831888525717e-05, + "loss": 2.591, + "step": 40421 + }, + { + "epoch": 1.8819517191610213, + "grad_norm": 0.3466698664399959, + "learning_rate": 3.666570823005325e-05, + "loss": 2.597, + "step": 40422 + }, + { + "epoch": 1.8819982773471144, + "grad_norm": 0.3591686541606921, + "learning_rate": 3.666309761398419e-05, + "loss": 2.706, + "step": 40423 + }, + { + "epoch": 1.8820448355332076, + "grad_norm": 0.34390090814925006, + "learning_rate": 3.666048703705762e-05, + "loss": 2.7489, + "step": 40424 + }, + { + "epoch": 1.8820913937193007, + "grad_norm": 0.4099449859803526, + "learning_rate": 3.66578764992812e-05, + "loss": 2.7493, + "step": 40425 + }, + { + "epoch": 1.8821379519053938, + "grad_norm": 0.35163413790618436, + 
"learning_rate": 3.665526600066261e-05, + "loss": 2.7354, + "step": 40426 + }, + { + "epoch": 1.882184510091487, + "grad_norm": 0.35887034593012945, + "learning_rate": 3.665265554120948e-05, + "loss": 2.7185, + "step": 40427 + }, + { + "epoch": 1.88223106827758, + "grad_norm": 0.33416382666792843, + "learning_rate": 3.6650045120929504e-05, + "loss": 2.6375, + "step": 40428 + }, + { + "epoch": 1.8822776264636731, + "grad_norm": 0.33943099264591065, + "learning_rate": 3.664743473983034e-05, + "loss": 2.6168, + "step": 40429 + }, + { + "epoch": 1.882324184649766, + "grad_norm": 0.350383620291667, + "learning_rate": 3.664482439791961e-05, + "loss": 2.7361, + "step": 40430 + }, + { + "epoch": 1.8823707428358591, + "grad_norm": 0.35101924150485353, + "learning_rate": 3.664221409520503e-05, + "loss": 2.7232, + "step": 40431 + }, + { + "epoch": 1.882417301021952, + "grad_norm": 0.32508106993943064, + "learning_rate": 3.663960383169421e-05, + "loss": 2.7438, + "step": 40432 + }, + { + "epoch": 1.8824638592080452, + "grad_norm": 0.35810892815709355, + "learning_rate": 3.663699360739487e-05, + "loss": 2.65, + "step": 40433 + }, + { + "epoch": 1.8825104173941383, + "grad_norm": 0.322621939498038, + "learning_rate": 3.6634383422314625e-05, + "loss": 2.7246, + "step": 40434 + }, + { + "epoch": 1.8825569755802314, + "grad_norm": 0.33077585156594286, + "learning_rate": 3.6631773276461135e-05, + "loss": 2.6261, + "step": 40435 + }, + { + "epoch": 1.8826035337663245, + "grad_norm": 0.3486369408015287, + "learning_rate": 3.662916316984208e-05, + "loss": 2.6618, + "step": 40436 + }, + { + "epoch": 1.8826500919524176, + "grad_norm": 0.33185989644558817, + "learning_rate": 3.662655310246512e-05, + "loss": 2.7265, + "step": 40437 + }, + { + "epoch": 1.8826966501385107, + "grad_norm": 0.3670997461951352, + "learning_rate": 3.662394307433791e-05, + "loss": 2.7968, + "step": 40438 + }, + { + "epoch": 1.8827432083246036, + "grad_norm": 0.3283469204997281, + "learning_rate": 3.662133308546811e-05, + "loss": 2.7759, + "step": 40439 + }, + { + "epoch": 1.8827897665106967, + "grad_norm": 0.36880712965967166, + "learning_rate": 3.6618723135863365e-05, + "loss": 2.6366, + "step": 40440 + }, + { + "epoch": 1.8828363246967899, + "grad_norm": 0.3237047125006897, + "learning_rate": 3.661611322553137e-05, + "loss": 2.6957, + "step": 40441 + }, + { + "epoch": 1.8828828828828827, + "grad_norm": 0.34821411072419656, + "learning_rate": 3.661350335447974e-05, + "loss": 2.6921, + "step": 40442 + }, + { + "epoch": 1.8829294410689759, + "grad_norm": 0.34905109546608304, + "learning_rate": 3.6610893522716174e-05, + "loss": 2.6485, + "step": 40443 + }, + { + "epoch": 1.882975999255069, + "grad_norm": 0.3562055616185024, + "learning_rate": 3.660828373024833e-05, + "loss": 2.7423, + "step": 40444 + }, + { + "epoch": 1.883022557441162, + "grad_norm": 0.3325743938286164, + "learning_rate": 3.660567397708382e-05, + "loss": 2.6339, + "step": 40445 + }, + { + "epoch": 1.8830691156272552, + "grad_norm": 0.344600089303188, + "learning_rate": 3.6603064263230366e-05, + "loss": 2.6531, + "step": 40446 + }, + { + "epoch": 1.8831156738133483, + "grad_norm": 0.32672474171944793, + "learning_rate": 3.660045458869559e-05, + "loss": 2.7884, + "step": 40447 + }, + { + "epoch": 1.8831622319994414, + "grad_norm": 0.3383809605589115, + "learning_rate": 3.659784495348715e-05, + "loss": 2.6235, + "step": 40448 + }, + { + "epoch": 1.8832087901855343, + "grad_norm": 0.3182461506406807, + "learning_rate": 3.6595235357612723e-05, + "loss": 2.5941, + "step": 40449 
+ }, + { + "epoch": 1.8832553483716274, + "grad_norm": 0.3212838971191618, + "learning_rate": 3.659262580107996e-05, + "loss": 2.7187, + "step": 40450 + }, + { + "epoch": 1.8833019065577206, + "grad_norm": 0.3198050889232586, + "learning_rate": 3.659001628389652e-05, + "loss": 2.658, + "step": 40451 + }, + { + "epoch": 1.8833484647438135, + "grad_norm": 0.3404080205857616, + "learning_rate": 3.658740680607007e-05, + "loss": 2.6953, + "step": 40452 + }, + { + "epoch": 1.8833950229299066, + "grad_norm": 0.34012781664235764, + "learning_rate": 3.6584797367608234e-05, + "loss": 2.7071, + "step": 40453 + }, + { + "epoch": 1.8834415811159997, + "grad_norm": 0.33604338598170214, + "learning_rate": 3.6582187968518724e-05, + "loss": 2.6697, + "step": 40454 + }, + { + "epoch": 1.8834881393020928, + "grad_norm": 0.3415676044316975, + "learning_rate": 3.6579578608809137e-05, + "loss": 2.8145, + "step": 40455 + }, + { + "epoch": 1.883534697488186, + "grad_norm": 0.31657397964086664, + "learning_rate": 3.65769692884872e-05, + "loss": 2.6145, + "step": 40456 + }, + { + "epoch": 1.883581255674279, + "grad_norm": 0.32915564893449206, + "learning_rate": 3.6574360007560525e-05, + "loss": 2.6349, + "step": 40457 + }, + { + "epoch": 1.8836278138603721, + "grad_norm": 0.33678098020302993, + "learning_rate": 3.657175076603677e-05, + "loss": 2.7199, + "step": 40458 + }, + { + "epoch": 1.883674372046465, + "grad_norm": 0.3516417173499426, + "learning_rate": 3.656914156392362e-05, + "loss": 2.738, + "step": 40459 + }, + { + "epoch": 1.8837209302325582, + "grad_norm": 0.34990555106405097, + "learning_rate": 3.6566532401228696e-05, + "loss": 2.6194, + "step": 40460 + }, + { + "epoch": 1.883767488418651, + "grad_norm": 0.32049303610027857, + "learning_rate": 3.656392327795969e-05, + "loss": 2.6055, + "step": 40461 + }, + { + "epoch": 1.8838140466047442, + "grad_norm": 0.33496399611016037, + "learning_rate": 3.656131419412425e-05, + "loss": 2.7543, + "step": 40462 + }, + { + "epoch": 1.8838606047908373, + "grad_norm": 0.34087422287478913, + "learning_rate": 3.6558705149730014e-05, + "loss": 2.6414, + "step": 40463 + }, + { + "epoch": 1.8839071629769304, + "grad_norm": 0.3502643331106492, + "learning_rate": 3.6556096144784666e-05, + "loss": 2.6316, + "step": 40464 + }, + { + "epoch": 1.8839537211630235, + "grad_norm": 0.3490117464059593, + "learning_rate": 3.655348717929586e-05, + "loss": 2.7264, + "step": 40465 + }, + { + "epoch": 1.8840002793491166, + "grad_norm": 0.3319873846387316, + "learning_rate": 3.6550878253271216e-05, + "loss": 2.7154, + "step": 40466 + }, + { + "epoch": 1.8840468375352097, + "grad_norm": 0.33592038582481654, + "learning_rate": 3.654826936671845e-05, + "loss": 2.6021, + "step": 40467 + }, + { + "epoch": 1.8840933957213029, + "grad_norm": 0.37847403411855757, + "learning_rate": 3.6545660519645154e-05, + "loss": 2.7347, + "step": 40468 + }, + { + "epoch": 1.8841399539073957, + "grad_norm": 0.3534941259921269, + "learning_rate": 3.654305171205904e-05, + "loss": 2.6855, + "step": 40469 + }, + { + "epoch": 1.8841865120934889, + "grad_norm": 0.3507636640058504, + "learning_rate": 3.654044294396776e-05, + "loss": 2.7437, + "step": 40470 + }, + { + "epoch": 1.8842330702795818, + "grad_norm": 0.39704944153947835, + "learning_rate": 3.653783421537891e-05, + "loss": 2.8238, + "step": 40471 + }, + { + "epoch": 1.8842796284656749, + "grad_norm": 0.348536186916746, + "learning_rate": 3.653522552630023e-05, + "loss": 2.7699, + "step": 40472 + }, + { + "epoch": 1.884326186651768, + "grad_norm": 
0.34941565138609687, + "learning_rate": 3.6532616876739315e-05, + "loss": 2.6735, + "step": 40473 + }, + { + "epoch": 1.884372744837861, + "grad_norm": 0.35388294693683797, + "learning_rate": 3.6530008266703854e-05, + "loss": 2.722, + "step": 40474 + }, + { + "epoch": 1.8844193030239542, + "grad_norm": 0.3720070680212392, + "learning_rate": 3.6527399696201495e-05, + "loss": 2.7226, + "step": 40475 + }, + { + "epoch": 1.8844658612100473, + "grad_norm": 0.34740360847816143, + "learning_rate": 3.6524791165239874e-05, + "loss": 2.6932, + "step": 40476 + }, + { + "epoch": 1.8845124193961404, + "grad_norm": 0.34761598218357687, + "learning_rate": 3.652218267382668e-05, + "loss": 2.723, + "step": 40477 + }, + { + "epoch": 1.8845589775822333, + "grad_norm": 0.3408628396437051, + "learning_rate": 3.6519574221969535e-05, + "loss": 2.7358, + "step": 40478 + }, + { + "epoch": 1.8846055357683265, + "grad_norm": 0.3994756800304533, + "learning_rate": 3.651696580967613e-05, + "loss": 2.7418, + "step": 40479 + }, + { + "epoch": 1.8846520939544196, + "grad_norm": 0.34797884452898836, + "learning_rate": 3.6514357436954095e-05, + "loss": 2.6494, + "step": 40480 + }, + { + "epoch": 1.8846986521405125, + "grad_norm": 0.3709488070407973, + "learning_rate": 3.6511749103811076e-05, + "loss": 2.6419, + "step": 40481 + }, + { + "epoch": 1.8847452103266056, + "grad_norm": 0.3572039752802131, + "learning_rate": 3.650914081025477e-05, + "loss": 2.7779, + "step": 40482 + }, + { + "epoch": 1.8847917685126987, + "grad_norm": 0.343715552214027, + "learning_rate": 3.6506532556292806e-05, + "loss": 2.7545, + "step": 40483 + }, + { + "epoch": 1.8848383266987918, + "grad_norm": 0.3447703545035596, + "learning_rate": 3.6503924341932815e-05, + "loss": 2.7368, + "step": 40484 + }, + { + "epoch": 1.884884884884885, + "grad_norm": 0.358877278832771, + "learning_rate": 3.65013161671825e-05, + "loss": 2.6383, + "step": 40485 + }, + { + "epoch": 1.884931443070978, + "grad_norm": 0.36223870008952785, + "learning_rate": 3.6498708032049476e-05, + "loss": 2.6581, + "step": 40486 + }, + { + "epoch": 1.8849780012570712, + "grad_norm": 0.3208618538144578, + "learning_rate": 3.6496099936541414e-05, + "loss": 2.5853, + "step": 40487 + }, + { + "epoch": 1.885024559443164, + "grad_norm": 0.3580986316314157, + "learning_rate": 3.6493491880665984e-05, + "loss": 2.7678, + "step": 40488 + }, + { + "epoch": 1.8850711176292572, + "grad_norm": 0.3502492123831603, + "learning_rate": 3.649088386443082e-05, + "loss": 2.7447, + "step": 40489 + }, + { + "epoch": 1.8851176758153503, + "grad_norm": 0.3473973267086735, + "learning_rate": 3.6488275887843583e-05, + "loss": 2.5754, + "step": 40490 + }, + { + "epoch": 1.8851642340014432, + "grad_norm": 0.36106159349034495, + "learning_rate": 3.648566795091191e-05, + "loss": 2.7164, + "step": 40491 + }, + { + "epoch": 1.8852107921875363, + "grad_norm": 0.38615369497167723, + "learning_rate": 3.648306005364348e-05, + "loss": 2.6561, + "step": 40492 + }, + { + "epoch": 1.8852573503736294, + "grad_norm": 0.36112661754754577, + "learning_rate": 3.6480452196045945e-05, + "loss": 2.6872, + "step": 40493 + }, + { + "epoch": 1.8853039085597225, + "grad_norm": 0.38997907727307674, + "learning_rate": 3.647784437812693e-05, + "loss": 2.7679, + "step": 40494 + }, + { + "epoch": 1.8853504667458156, + "grad_norm": 0.3586341096457321, + "learning_rate": 3.647523659989413e-05, + "loss": 2.6191, + "step": 40495 + }, + { + "epoch": 1.8853970249319087, + "grad_norm": 0.37227181204419085, + "learning_rate": 3.647262886135516e-05, + 
"loss": 2.7225, + "step": 40496 + }, + { + "epoch": 1.8854435831180019, + "grad_norm": 0.3605752847664106, + "learning_rate": 3.647002116251771e-05, + "loss": 2.7056, + "step": 40497 + }, + { + "epoch": 1.8854901413040948, + "grad_norm": 0.36188185935589623, + "learning_rate": 3.6467413503389415e-05, + "loss": 2.7096, + "step": 40498 + }, + { + "epoch": 1.8855366994901879, + "grad_norm": 0.36404097480376696, + "learning_rate": 3.646480588397791e-05, + "loss": 2.711, + "step": 40499 + }, + { + "epoch": 1.8855832576762808, + "grad_norm": 0.3915211161552138, + "learning_rate": 3.646219830429087e-05, + "loss": 2.6601, + "step": 40500 + }, + { + "epoch": 1.8856298158623739, + "grad_norm": 0.35682276658747586, + "learning_rate": 3.645959076433597e-05, + "loss": 2.707, + "step": 40501 + }, + { + "epoch": 1.885676374048467, + "grad_norm": 0.3559361487278472, + "learning_rate": 3.645698326412081e-05, + "loss": 2.6633, + "step": 40502 + }, + { + "epoch": 1.8857229322345601, + "grad_norm": 0.332976529576632, + "learning_rate": 3.645437580365308e-05, + "loss": 2.6445, + "step": 40503 + }, + { + "epoch": 1.8857694904206532, + "grad_norm": 0.35654495817889226, + "learning_rate": 3.645176838294041e-05, + "loss": 2.7105, + "step": 40504 + }, + { + "epoch": 1.8858160486067463, + "grad_norm": 0.3481127394841406, + "learning_rate": 3.644916100199048e-05, + "loss": 2.6649, + "step": 40505 + }, + { + "epoch": 1.8858626067928395, + "grad_norm": 0.32065085317083153, + "learning_rate": 3.6446553660810933e-05, + "loss": 2.717, + "step": 40506 + }, + { + "epoch": 1.8859091649789326, + "grad_norm": 0.3306483481973393, + "learning_rate": 3.644394635940939e-05, + "loss": 2.6433, + "step": 40507 + }, + { + "epoch": 1.8859557231650255, + "grad_norm": 0.35769427712078666, + "learning_rate": 3.644133909779356e-05, + "loss": 2.7558, + "step": 40508 + }, + { + "epoch": 1.8860022813511186, + "grad_norm": 0.33216889151856266, + "learning_rate": 3.6438731875971034e-05, + "loss": 2.8322, + "step": 40509 + }, + { + "epoch": 1.8860488395372115, + "grad_norm": 0.35412793696716505, + "learning_rate": 3.643612469394952e-05, + "loss": 2.6238, + "step": 40510 + }, + { + "epoch": 1.8860953977233046, + "grad_norm": 0.3527641012830822, + "learning_rate": 3.643351755173663e-05, + "loss": 2.6588, + "step": 40511 + }, + { + "epoch": 1.8861419559093977, + "grad_norm": 0.3233731351795658, + "learning_rate": 3.643091044934002e-05, + "loss": 2.6132, + "step": 40512 + }, + { + "epoch": 1.8861885140954908, + "grad_norm": 0.33835744533725054, + "learning_rate": 3.642830338676737e-05, + "loss": 2.649, + "step": 40513 + }, + { + "epoch": 1.886235072281584, + "grad_norm": 0.3496205414827846, + "learning_rate": 3.64256963640263e-05, + "loss": 2.8776, + "step": 40514 + }, + { + "epoch": 1.886281630467677, + "grad_norm": 0.3465325856128546, + "learning_rate": 3.6423089381124484e-05, + "loss": 2.6367, + "step": 40515 + }, + { + "epoch": 1.8863281886537702, + "grad_norm": 0.36466784535642655, + "learning_rate": 3.642048243806957e-05, + "loss": 2.764, + "step": 40516 + }, + { + "epoch": 1.8863747468398633, + "grad_norm": 0.32610455056967086, + "learning_rate": 3.641787553486918e-05, + "loss": 2.7151, + "step": 40517 + }, + { + "epoch": 1.8864213050259562, + "grad_norm": 0.35701691720386913, + "learning_rate": 3.6415268671531e-05, + "loss": 2.7352, + "step": 40518 + }, + { + "epoch": 1.8864678632120493, + "grad_norm": 0.3503831393660137, + "learning_rate": 3.641266184806268e-05, + "loss": 2.672, + "step": 40519 + }, + { + "epoch": 1.8865144213981422, + 
"grad_norm": 0.3240766418526337, + "learning_rate": 3.641005506447183e-05, + "loss": 2.5988, + "step": 40520 + }, + { + "epoch": 1.8865609795842353, + "grad_norm": 0.3313752489891136, + "learning_rate": 3.6407448320766144e-05, + "loss": 2.6334, + "step": 40521 + }, + { + "epoch": 1.8866075377703284, + "grad_norm": 0.33900840087954565, + "learning_rate": 3.640484161695325e-05, + "loss": 2.7866, + "step": 40522 + }, + { + "epoch": 1.8866540959564215, + "grad_norm": 0.3411128975339971, + "learning_rate": 3.640223495304082e-05, + "loss": 2.7492, + "step": 40523 + }, + { + "epoch": 1.8867006541425146, + "grad_norm": 0.33014540546714277, + "learning_rate": 3.6399628329036494e-05, + "loss": 2.7217, + "step": 40524 + }, + { + "epoch": 1.8867472123286078, + "grad_norm": 0.33415256286817446, + "learning_rate": 3.63970217449479e-05, + "loss": 2.7411, + "step": 40525 + }, + { + "epoch": 1.8867937705147009, + "grad_norm": 0.3247645531594623, + "learning_rate": 3.639441520078273e-05, + "loss": 2.7916, + "step": 40526 + }, + { + "epoch": 1.8868403287007938, + "grad_norm": 0.3506893243978588, + "learning_rate": 3.639180869654859e-05, + "loss": 2.7191, + "step": 40527 + }, + { + "epoch": 1.8868868868868869, + "grad_norm": 0.3142775925123235, + "learning_rate": 3.638920223225317e-05, + "loss": 2.5924, + "step": 40528 + }, + { + "epoch": 1.88693344507298, + "grad_norm": 0.3221000999604572, + "learning_rate": 3.638659580790409e-05, + "loss": 2.725, + "step": 40529 + }, + { + "epoch": 1.886980003259073, + "grad_norm": 0.3516765945374826, + "learning_rate": 3.6383989423509e-05, + "loss": 2.6291, + "step": 40530 + }, + { + "epoch": 1.887026561445166, + "grad_norm": 0.3165547461846608, + "learning_rate": 3.6381383079075585e-05, + "loss": 2.5937, + "step": 40531 + }, + { + "epoch": 1.8870731196312591, + "grad_norm": 0.31090702041921114, + "learning_rate": 3.637877677461143e-05, + "loss": 2.5925, + "step": 40532 + }, + { + "epoch": 1.8871196778173522, + "grad_norm": 0.33594754576986197, + "learning_rate": 3.637617051012426e-05, + "loss": 2.7384, + "step": 40533 + }, + { + "epoch": 1.8871662360034454, + "grad_norm": 0.31093006236497445, + "learning_rate": 3.637356428562168e-05, + "loss": 2.7652, + "step": 40534 + }, + { + "epoch": 1.8872127941895385, + "grad_norm": 0.32396538633086674, + "learning_rate": 3.637095810111133e-05, + "loss": 2.7913, + "step": 40535 + }, + { + "epoch": 1.8872593523756316, + "grad_norm": 0.35674052721025473, + "learning_rate": 3.6368351956600896e-05, + "loss": 2.738, + "step": 40536 + }, + { + "epoch": 1.8873059105617245, + "grad_norm": 0.3242499893575941, + "learning_rate": 3.6365745852098e-05, + "loss": 2.6809, + "step": 40537 + }, + { + "epoch": 1.8873524687478176, + "grad_norm": 0.34194261730602465, + "learning_rate": 3.636313978761028e-05, + "loss": 2.7436, + "step": 40538 + }, + { + "epoch": 1.8873990269339107, + "grad_norm": 0.3549954703869739, + "learning_rate": 3.636053376314542e-05, + "loss": 2.7387, + "step": 40539 + }, + { + "epoch": 1.8874455851200036, + "grad_norm": 0.3197823653017118, + "learning_rate": 3.635792777871104e-05, + "loss": 2.6832, + "step": 40540 + }, + { + "epoch": 1.8874921433060967, + "grad_norm": 0.3401684575782489, + "learning_rate": 3.6355321834314813e-05, + "loss": 2.7372, + "step": 40541 + }, + { + "epoch": 1.8875387014921898, + "grad_norm": 0.36080342168681384, + "learning_rate": 3.6352715929964374e-05, + "loss": 2.6913, + "step": 40542 + }, + { + "epoch": 1.887585259678283, + "grad_norm": 0.32265445687985045, + "learning_rate": 3.6350110065667344e-05, 
+ "loss": 2.6942, + "step": 40543 + }, + { + "epoch": 1.887631817864376, + "grad_norm": 0.3333827037348209, + "learning_rate": 3.634750424143141e-05, + "loss": 2.6487, + "step": 40544 + }, + { + "epoch": 1.8876783760504692, + "grad_norm": 0.3481071821214778, + "learning_rate": 3.634489845726419e-05, + "loss": 2.7789, + "step": 40545 + }, + { + "epoch": 1.8877249342365623, + "grad_norm": 0.34731074192749956, + "learning_rate": 3.6342292713173374e-05, + "loss": 2.7821, + "step": 40546 + }, + { + "epoch": 1.8877714924226552, + "grad_norm": 0.3242848274116809, + "learning_rate": 3.633968700916658e-05, + "loss": 2.7294, + "step": 40547 + }, + { + "epoch": 1.8878180506087483, + "grad_norm": 0.34519952351575584, + "learning_rate": 3.633708134525143e-05, + "loss": 2.7188, + "step": 40548 + }, + { + "epoch": 1.8878646087948412, + "grad_norm": 0.34701362521310336, + "learning_rate": 3.6334475721435634e-05, + "loss": 2.7073, + "step": 40549 + }, + { + "epoch": 1.8879111669809343, + "grad_norm": 0.34697094839754505, + "learning_rate": 3.6331870137726786e-05, + "loss": 2.7542, + "step": 40550 + }, + { + "epoch": 1.8879577251670274, + "grad_norm": 0.3550145862506334, + "learning_rate": 3.6329264594132565e-05, + "loss": 2.6988, + "step": 40551 + }, + { + "epoch": 1.8880042833531205, + "grad_norm": 0.3286337167077097, + "learning_rate": 3.63266590906606e-05, + "loss": 2.5972, + "step": 40552 + }, + { + "epoch": 1.8880508415392137, + "grad_norm": 0.3432134295246168, + "learning_rate": 3.632405362731854e-05, + "loss": 2.6164, + "step": 40553 + }, + { + "epoch": 1.8880973997253068, + "grad_norm": 0.34462949190744, + "learning_rate": 3.632144820411405e-05, + "loss": 2.5737, + "step": 40554 + }, + { + "epoch": 1.8881439579113999, + "grad_norm": 0.34165244146259705, + "learning_rate": 3.6318842821054755e-05, + "loss": 2.7516, + "step": 40555 + }, + { + "epoch": 1.888190516097493, + "grad_norm": 0.37070578458899367, + "learning_rate": 3.6316237478148294e-05, + "loss": 2.716, + "step": 40556 + }, + { + "epoch": 1.888237074283586, + "grad_norm": 0.3722075016468678, + "learning_rate": 3.631363217540235e-05, + "loss": 2.6371, + "step": 40557 + }, + { + "epoch": 1.888283632469679, + "grad_norm": 0.3409692494980651, + "learning_rate": 3.631102691282451e-05, + "loss": 2.5557, + "step": 40558 + }, + { + "epoch": 1.888330190655772, + "grad_norm": 0.34989294420973854, + "learning_rate": 3.6308421690422495e-05, + "loss": 2.6926, + "step": 40559 + }, + { + "epoch": 1.888376748841865, + "grad_norm": 0.33341535498623637, + "learning_rate": 3.63058165082039e-05, + "loss": 2.6632, + "step": 40560 + }, + { + "epoch": 1.8884233070279581, + "grad_norm": 0.3535383438533931, + "learning_rate": 3.630321136617638e-05, + "loss": 2.614, + "step": 40561 + }, + { + "epoch": 1.8884698652140512, + "grad_norm": 0.3590927968565828, + "learning_rate": 3.63006062643476e-05, + "loss": 2.7164, + "step": 40562 + }, + { + "epoch": 1.8885164234001444, + "grad_norm": 0.3655993899442727, + "learning_rate": 3.629800120272517e-05, + "loss": 2.7261, + "step": 40563 + }, + { + "epoch": 1.8885629815862375, + "grad_norm": 0.3274515261107618, + "learning_rate": 3.6295396181316774e-05, + "loss": 2.5595, + "step": 40564 + }, + { + "epoch": 1.8886095397723306, + "grad_norm": 0.3374882103176484, + "learning_rate": 3.629279120013004e-05, + "loss": 2.7368, + "step": 40565 + }, + { + "epoch": 1.8886560979584235, + "grad_norm": 0.34742785639428503, + "learning_rate": 3.629018625917261e-05, + "loss": 2.6985, + "step": 40566 + }, + { + "epoch": 1.8887026561445166, + 
"grad_norm": 0.35770412007656777, + "learning_rate": 3.6287581358452135e-05, + "loss": 2.7054, + "step": 40567 + }, + { + "epoch": 1.8887492143306097, + "grad_norm": 0.3427977652634499, + "learning_rate": 3.628497649797624e-05, + "loss": 2.7193, + "step": 40568 + }, + { + "epoch": 1.8887957725167026, + "grad_norm": 0.3679495025820802, + "learning_rate": 3.628237167775261e-05, + "loss": 2.7404, + "step": 40569 + }, + { + "epoch": 1.8888423307027957, + "grad_norm": 0.3459765194238892, + "learning_rate": 3.627976689778887e-05, + "loss": 2.7003, + "step": 40570 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.34824996293004834, + "learning_rate": 3.627716215809263e-05, + "loss": 2.6544, + "step": 40571 + }, + { + "epoch": 1.888935447074982, + "grad_norm": 0.32494323390557794, + "learning_rate": 3.6274557458671596e-05, + "loss": 2.7305, + "step": 40572 + }, + { + "epoch": 1.888982005261075, + "grad_norm": 0.3623806196727473, + "learning_rate": 3.627195279953339e-05, + "loss": 2.6595, + "step": 40573 + }, + { + "epoch": 1.8890285634471682, + "grad_norm": 0.36559730024808534, + "learning_rate": 3.626934818068562e-05, + "loss": 2.6887, + "step": 40574 + }, + { + "epoch": 1.8890751216332613, + "grad_norm": 0.33747966316500144, + "learning_rate": 3.6266743602135983e-05, + "loss": 2.7346, + "step": 40575 + }, + { + "epoch": 1.8891216798193542, + "grad_norm": 0.3514111899658078, + "learning_rate": 3.6264139063892086e-05, + "loss": 2.7476, + "step": 40576 + }, + { + "epoch": 1.8891682380054473, + "grad_norm": 0.34062190384704705, + "learning_rate": 3.6261534565961596e-05, + "loss": 2.7311, + "step": 40577 + }, + { + "epoch": 1.8892147961915404, + "grad_norm": 0.3545089415111388, + "learning_rate": 3.6258930108352165e-05, + "loss": 2.7167, + "step": 40578 + }, + { + "epoch": 1.8892613543776333, + "grad_norm": 0.33547456288336247, + "learning_rate": 3.625632569107139e-05, + "loss": 2.6252, + "step": 40579 + }, + { + "epoch": 1.8893079125637264, + "grad_norm": 0.3468382216183221, + "learning_rate": 3.625372131412697e-05, + "loss": 2.569, + "step": 40580 + }, + { + "epoch": 1.8893544707498195, + "grad_norm": 0.34822851871888033, + "learning_rate": 3.62511169775265e-05, + "loss": 2.6235, + "step": 40581 + }, + { + "epoch": 1.8894010289359127, + "grad_norm": 0.3352261868668096, + "learning_rate": 3.624851268127767e-05, + "loss": 2.7051, + "step": 40582 + }, + { + "epoch": 1.8894475871220058, + "grad_norm": 0.2985797335913747, + "learning_rate": 3.62459084253881e-05, + "loss": 2.6311, + "step": 40583 + }, + { + "epoch": 1.889494145308099, + "grad_norm": 0.3701160639545653, + "learning_rate": 3.6243304209865406e-05, + "loss": 2.6319, + "step": 40584 + }, + { + "epoch": 1.889540703494192, + "grad_norm": 0.31584913906614553, + "learning_rate": 3.624070003471729e-05, + "loss": 2.6502, + "step": 40585 + }, + { + "epoch": 1.889587261680285, + "grad_norm": 0.3482796637170143, + "learning_rate": 3.623809589995133e-05, + "loss": 2.6688, + "step": 40586 + }, + { + "epoch": 1.889633819866378, + "grad_norm": 0.328010509178824, + "learning_rate": 3.6235491805575236e-05, + "loss": 2.6937, + "step": 40587 + }, + { + "epoch": 1.889680378052471, + "grad_norm": 0.36607547108260097, + "learning_rate": 3.6232887751596615e-05, + "loss": 2.7115, + "step": 40588 + }, + { + "epoch": 1.889726936238564, + "grad_norm": 0.3402770320012861, + "learning_rate": 3.62302837380231e-05, + "loss": 2.673, + "step": 40589 + }, + { + "epoch": 1.8897734944246571, + "grad_norm": 0.3390251906077765, + "learning_rate": 3.622767976486234e-05, + 
"loss": 2.634, + "step": 40590 + }, + { + "epoch": 1.8898200526107503, + "grad_norm": 0.31712053608056157, + "learning_rate": 3.6225075832122e-05, + "loss": 2.6482, + "step": 40591 + }, + { + "epoch": 1.8898666107968434, + "grad_norm": 0.33840069826145824, + "learning_rate": 3.6222471939809694e-05, + "loss": 2.6785, + "step": 40592 + }, + { + "epoch": 1.8899131689829365, + "grad_norm": 0.344234867689878, + "learning_rate": 3.621986808793309e-05, + "loss": 2.5957, + "step": 40593 + }, + { + "epoch": 1.8899597271690296, + "grad_norm": 0.3389951221964907, + "learning_rate": 3.621726427649978e-05, + "loss": 2.6314, + "step": 40594 + }, + { + "epoch": 1.8900062853551227, + "grad_norm": 0.36608838781822806, + "learning_rate": 3.621466050551748e-05, + "loss": 2.7215, + "step": 40595 + }, + { + "epoch": 1.8900528435412156, + "grad_norm": 0.3555894147431336, + "learning_rate": 3.621205677499378e-05, + "loss": 2.6796, + "step": 40596 + }, + { + "epoch": 1.8900994017273087, + "grad_norm": 0.34608085958760393, + "learning_rate": 3.620945308493633e-05, + "loss": 2.702, + "step": 40597 + }, + { + "epoch": 1.8901459599134016, + "grad_norm": 0.3780531691677781, + "learning_rate": 3.6206849435352784e-05, + "loss": 2.6704, + "step": 40598 + }, + { + "epoch": 1.8901925180994947, + "grad_norm": 0.3672956554430257, + "learning_rate": 3.620424582625076e-05, + "loss": 2.7142, + "step": 40599 + }, + { + "epoch": 1.8902390762855878, + "grad_norm": 0.3836265213072514, + "learning_rate": 3.6201642257637936e-05, + "loss": 2.6757, + "step": 40600 + }, + { + "epoch": 1.890285634471681, + "grad_norm": 0.34604137011492453, + "learning_rate": 3.6199038729521926e-05, + "loss": 2.8089, + "step": 40601 + }, + { + "epoch": 1.890332192657774, + "grad_norm": 0.38655504510167155, + "learning_rate": 3.6196435241910374e-05, + "loss": 2.7967, + "step": 40602 + }, + { + "epoch": 1.8903787508438672, + "grad_norm": 0.36523506272183937, + "learning_rate": 3.619383179481093e-05, + "loss": 2.8087, + "step": 40603 + }, + { + "epoch": 1.8904253090299603, + "grad_norm": 0.37947954023444386, + "learning_rate": 3.619122838823122e-05, + "loss": 2.7423, + "step": 40604 + }, + { + "epoch": 1.8904718672160534, + "grad_norm": 0.36815651474067906, + "learning_rate": 3.618862502217891e-05, + "loss": 2.727, + "step": 40605 + }, + { + "epoch": 1.8905184254021463, + "grad_norm": 0.330657315420915, + "learning_rate": 3.618602169666162e-05, + "loss": 2.6093, + "step": 40606 + }, + { + "epoch": 1.8905649835882394, + "grad_norm": 0.35262766279662544, + "learning_rate": 3.618341841168697e-05, + "loss": 2.6104, + "step": 40607 + }, + { + "epoch": 1.8906115417743323, + "grad_norm": 0.33346838532399226, + "learning_rate": 3.618081516726266e-05, + "loss": 2.6866, + "step": 40608 + }, + { + "epoch": 1.8906580999604254, + "grad_norm": 0.36900682032781407, + "learning_rate": 3.617821196339629e-05, + "loss": 2.6525, + "step": 40609 + }, + { + "epoch": 1.8907046581465186, + "grad_norm": 0.3283120808008002, + "learning_rate": 3.617560880009549e-05, + "loss": 2.8384, + "step": 40610 + }, + { + "epoch": 1.8907512163326117, + "grad_norm": 0.35538470661908095, + "learning_rate": 3.617300567736793e-05, + "loss": 2.642, + "step": 40611 + }, + { + "epoch": 1.8907977745187048, + "grad_norm": 0.3600818578385945, + "learning_rate": 3.617040259522122e-05, + "loss": 2.6676, + "step": 40612 + }, + { + "epoch": 1.890844332704798, + "grad_norm": 0.37148131107336774, + "learning_rate": 3.616779955366303e-05, + "loss": 2.7237, + "step": 40613 + }, + { + "epoch": 1.890890890890891, + 
"grad_norm": 0.3531520152253067, + "learning_rate": 3.6165196552701e-05, + "loss": 2.6343, + "step": 40614 + }, + { + "epoch": 1.890937449076984, + "grad_norm": 0.34923500139397834, + "learning_rate": 3.616259359234273e-05, + "loss": 2.7316, + "step": 40615 + }, + { + "epoch": 1.890984007263077, + "grad_norm": 0.35388235872625956, + "learning_rate": 3.615999067259589e-05, + "loss": 2.8528, + "step": 40616 + }, + { + "epoch": 1.8910305654491701, + "grad_norm": 0.3362324180612428, + "learning_rate": 3.615738779346811e-05, + "loss": 2.7152, + "step": 40617 + }, + { + "epoch": 1.891077123635263, + "grad_norm": 0.30555484996433024, + "learning_rate": 3.6154784954967047e-05, + "loss": 2.7936, + "step": 40618 + }, + { + "epoch": 1.8911236818213562, + "grad_norm": 0.3359983071545085, + "learning_rate": 3.615218215710033e-05, + "loss": 2.6234, + "step": 40619 + }, + { + "epoch": 1.8911702400074493, + "grad_norm": 0.35553787363087225, + "learning_rate": 3.614957939987557e-05, + "loss": 2.6283, + "step": 40620 + }, + { + "epoch": 1.8912167981935424, + "grad_norm": 0.3329614580814262, + "learning_rate": 3.614697668330045e-05, + "loss": 2.6799, + "step": 40621 + }, + { + "epoch": 1.8912633563796355, + "grad_norm": 0.2969416794222879, + "learning_rate": 3.6144374007382564e-05, + "loss": 2.7339, + "step": 40622 + }, + { + "epoch": 1.8913099145657286, + "grad_norm": 0.35797511583293506, + "learning_rate": 3.614177137212961e-05, + "loss": 2.6977, + "step": 40623 + }, + { + "epoch": 1.8913564727518217, + "grad_norm": 0.3439064030373464, + "learning_rate": 3.6139168777549175e-05, + "loss": 2.7797, + "step": 40624 + }, + { + "epoch": 1.8914030309379146, + "grad_norm": 0.346755595720977, + "learning_rate": 3.61365662236489e-05, + "loss": 2.6709, + "step": 40625 + }, + { + "epoch": 1.8914495891240077, + "grad_norm": 0.3314238910871861, + "learning_rate": 3.613396371043646e-05, + "loss": 2.5921, + "step": 40626 + }, + { + "epoch": 1.8914961473101009, + "grad_norm": 0.34155810826510485, + "learning_rate": 3.613136123791947e-05, + "loss": 2.7065, + "step": 40627 + }, + { + "epoch": 1.8915427054961937, + "grad_norm": 0.3756086388476721, + "learning_rate": 3.612875880610556e-05, + "loss": 2.6056, + "step": 40628 + }, + { + "epoch": 1.8915892636822869, + "grad_norm": 0.3458313824905684, + "learning_rate": 3.612615641500237e-05, + "loss": 2.7437, + "step": 40629 + }, + { + "epoch": 1.89163582186838, + "grad_norm": 0.3528785559043017, + "learning_rate": 3.612355406461755e-05, + "loss": 2.8136, + "step": 40630 + }, + { + "epoch": 1.891682380054473, + "grad_norm": 0.3423180005997504, + "learning_rate": 3.6120951754958746e-05, + "loss": 2.6657, + "step": 40631 + }, + { + "epoch": 1.8917289382405662, + "grad_norm": 0.3534290193376758, + "learning_rate": 3.611834948603357e-05, + "loss": 2.738, + "step": 40632 + }, + { + "epoch": 1.8917754964266593, + "grad_norm": 0.3392469319916552, + "learning_rate": 3.6115747257849656e-05, + "loss": 2.7659, + "step": 40633 + }, + { + "epoch": 1.8918220546127524, + "grad_norm": 0.3410279146065146, + "learning_rate": 3.611314507041467e-05, + "loss": 2.7222, + "step": 40634 + }, + { + "epoch": 1.8918686127988453, + "grad_norm": 0.3361560863800575, + "learning_rate": 3.611054292373623e-05, + "loss": 2.7818, + "step": 40635 + }, + { + "epoch": 1.8919151709849384, + "grad_norm": 0.3365012910052657, + "learning_rate": 3.610794081782198e-05, + "loss": 2.6013, + "step": 40636 + }, + { + "epoch": 1.8919617291710313, + "grad_norm": 0.34869785565977834, + "learning_rate": 3.610533875267956e-05, + 
"loss": 2.678, + "step": 40637 + }, + { + "epoch": 1.8920082873571245, + "grad_norm": 0.32913737814420335, + "learning_rate": 3.610273672831659e-05, + "loss": 2.6037, + "step": 40638 + }, + { + "epoch": 1.8920548455432176, + "grad_norm": 0.3322859134549668, + "learning_rate": 3.610013474474074e-05, + "loss": 2.7508, + "step": 40639 + }, + { + "epoch": 1.8921014037293107, + "grad_norm": 0.33131867950594124, + "learning_rate": 3.6097532801959606e-05, + "loss": 2.6454, + "step": 40640 + }, + { + "epoch": 1.8921479619154038, + "grad_norm": 0.3578323199145232, + "learning_rate": 3.609493089998086e-05, + "loss": 2.7158, + "step": 40641 + }, + { + "epoch": 1.892194520101497, + "grad_norm": 0.3357261346245583, + "learning_rate": 3.6092329038812106e-05, + "loss": 2.7136, + "step": 40642 + }, + { + "epoch": 1.89224107828759, + "grad_norm": 0.3681490606980522, + "learning_rate": 3.6089727218461e-05, + "loss": 2.6871, + "step": 40643 + }, + { + "epoch": 1.8922876364736831, + "grad_norm": 0.37416685188533777, + "learning_rate": 3.608712543893519e-05, + "loss": 2.718, + "step": 40644 + }, + { + "epoch": 1.892334194659776, + "grad_norm": 0.3442130755667113, + "learning_rate": 3.6084523700242286e-05, + "loss": 2.6603, + "step": 40645 + }, + { + "epoch": 1.8923807528458692, + "grad_norm": 0.3531627031311889, + "learning_rate": 3.608192200238991e-05, + "loss": 2.6473, + "step": 40646 + }, + { + "epoch": 1.892427311031962, + "grad_norm": 0.33400739026840043, + "learning_rate": 3.607932034538576e-05, + "loss": 2.6194, + "step": 40647 + }, + { + "epoch": 1.8924738692180552, + "grad_norm": 0.37997427273655826, + "learning_rate": 3.60767187292374e-05, + "loss": 2.6414, + "step": 40648 + }, + { + "epoch": 1.8925204274041483, + "grad_norm": 0.3704935326257561, + "learning_rate": 3.607411715395252e-05, + "loss": 2.6167, + "step": 40649 + }, + { + "epoch": 1.8925669855902414, + "grad_norm": 0.33520032831956675, + "learning_rate": 3.607151561953873e-05, + "loss": 2.793, + "step": 40650 + }, + { + "epoch": 1.8926135437763345, + "grad_norm": 0.34378216060912564, + "learning_rate": 3.606891412600366e-05, + "loss": 2.5127, + "step": 40651 + }, + { + "epoch": 1.8926601019624276, + "grad_norm": 0.3443190911884869, + "learning_rate": 3.606631267335497e-05, + "loss": 2.624, + "step": 40652 + }, + { + "epoch": 1.8927066601485207, + "grad_norm": 0.31820444985064134, + "learning_rate": 3.6063711261600266e-05, + "loss": 2.6933, + "step": 40653 + }, + { + "epoch": 1.8927532183346136, + "grad_norm": 0.33162645546529546, + "learning_rate": 3.606110989074721e-05, + "loss": 2.6631, + "step": 40654 + }, + { + "epoch": 1.8927997765207067, + "grad_norm": 0.3537278474990584, + "learning_rate": 3.6058508560803413e-05, + "loss": 2.6836, + "step": 40655 + }, + { + "epoch": 1.8928463347067999, + "grad_norm": 0.3545278690672635, + "learning_rate": 3.6055907271776526e-05, + "loss": 2.7606, + "step": 40656 + }, + { + "epoch": 1.8928928928928928, + "grad_norm": 0.3367274469252669, + "learning_rate": 3.6053306023674184e-05, + "loss": 2.753, + "step": 40657 + }, + { + "epoch": 1.8929394510789859, + "grad_norm": 0.3409824645985675, + "learning_rate": 3.605070481650399e-05, + "loss": 2.6862, + "step": 40658 + }, + { + "epoch": 1.892986009265079, + "grad_norm": 0.3599606312436836, + "learning_rate": 3.6048103650273624e-05, + "loss": 2.736, + "step": 40659 + }, + { + "epoch": 1.893032567451172, + "grad_norm": 0.3420151730951336, + "learning_rate": 3.604550252499071e-05, + "loss": 2.6774, + "step": 40660 + }, + { + "epoch": 1.8930791256372652, + 
"grad_norm": 0.34572775291763475, + "learning_rate": 3.6042901440662836e-05, + "loss": 2.7623, + "step": 40661 + }, + { + "epoch": 1.8931256838233583, + "grad_norm": 0.3243053335149584, + "learning_rate": 3.60403003972977e-05, + "loss": 2.6358, + "step": 40662 + }, + { + "epoch": 1.8931722420094514, + "grad_norm": 0.3659009357669688, + "learning_rate": 3.60376993949029e-05, + "loss": 2.68, + "step": 40663 + }, + { + "epoch": 1.8932188001955443, + "grad_norm": 0.33270661147884295, + "learning_rate": 3.6035098433486056e-05, + "loss": 2.6065, + "step": 40664 + }, + { + "epoch": 1.8932653583816375, + "grad_norm": 0.3256243631909391, + "learning_rate": 3.603249751305485e-05, + "loss": 2.6068, + "step": 40665 + }, + { + "epoch": 1.8933119165677306, + "grad_norm": 0.36715464975362017, + "learning_rate": 3.602989663361688e-05, + "loss": 2.7341, + "step": 40666 + }, + { + "epoch": 1.8933584747538235, + "grad_norm": 0.3367971240724452, + "learning_rate": 3.602729579517979e-05, + "loss": 2.7003, + "step": 40667 + }, + { + "epoch": 1.8934050329399166, + "grad_norm": 0.3567435162655724, + "learning_rate": 3.602469499775121e-05, + "loss": 2.6771, + "step": 40668 + }, + { + "epoch": 1.8934515911260097, + "grad_norm": 0.34624662357738006, + "learning_rate": 3.602209424133876e-05, + "loss": 2.636, + "step": 40669 + }, + { + "epoch": 1.8934981493121028, + "grad_norm": 0.3581734066470268, + "learning_rate": 3.60194935259501e-05, + "loss": 2.7518, + "step": 40670 + }, + { + "epoch": 1.893544707498196, + "grad_norm": 0.3903616102736864, + "learning_rate": 3.601689285159283e-05, + "loss": 2.6915, + "step": 40671 + }, + { + "epoch": 1.893591265684289, + "grad_norm": 0.350169725796701, + "learning_rate": 3.6014292218274634e-05, + "loss": 2.6316, + "step": 40672 + }, + { + "epoch": 1.8936378238703822, + "grad_norm": 0.3729481136461467, + "learning_rate": 3.6011691626003096e-05, + "loss": 2.8024, + "step": 40673 + }, + { + "epoch": 1.893684382056475, + "grad_norm": 0.34357941811356724, + "learning_rate": 3.600909107478585e-05, + "loss": 2.6554, + "step": 40674 + }, + { + "epoch": 1.8937309402425682, + "grad_norm": 0.35950980222742474, + "learning_rate": 3.600649056463057e-05, + "loss": 2.518, + "step": 40675 + }, + { + "epoch": 1.893777498428661, + "grad_norm": 0.34153490245340085, + "learning_rate": 3.600389009554484e-05, + "loss": 2.7037, + "step": 40676 + }, + { + "epoch": 1.8938240566147542, + "grad_norm": 0.348167532058808, + "learning_rate": 3.6001289667536325e-05, + "loss": 2.6766, + "step": 40677 + }, + { + "epoch": 1.8938706148008473, + "grad_norm": 0.3376481437363661, + "learning_rate": 3.599868928061265e-05, + "loss": 2.6043, + "step": 40678 + }, + { + "epoch": 1.8939171729869404, + "grad_norm": 0.33241964528708323, + "learning_rate": 3.5996088934781433e-05, + "loss": 2.6872, + "step": 40679 + }, + { + "epoch": 1.8939637311730335, + "grad_norm": 0.3486637254308327, + "learning_rate": 3.599348863005033e-05, + "loss": 2.6415, + "step": 40680 + }, + { + "epoch": 1.8940102893591266, + "grad_norm": 0.34679900746588926, + "learning_rate": 3.5990888366426936e-05, + "loss": 2.6341, + "step": 40681 + }, + { + "epoch": 1.8940568475452197, + "grad_norm": 0.33693047052390507, + "learning_rate": 3.598828814391891e-05, + "loss": 2.7009, + "step": 40682 + }, + { + "epoch": 1.8941034057313129, + "grad_norm": 0.34303043839647945, + "learning_rate": 3.59856879625339e-05, + "loss": 2.6787, + "step": 40683 + }, + { + "epoch": 1.8941499639174058, + "grad_norm": 0.3346712978164355, + "learning_rate": 3.598308782227948e-05, + 
"loss": 2.6696, + "step": 40684 + }, + { + "epoch": 1.8941965221034989, + "grad_norm": 0.33346330662962737, + "learning_rate": 3.598048772316335e-05, + "loss": 2.73, + "step": 40685 + }, + { + "epoch": 1.8942430802895918, + "grad_norm": 0.3483512294021911, + "learning_rate": 3.597788766519309e-05, + "loss": 2.6898, + "step": 40686 + }, + { + "epoch": 1.8942896384756849, + "grad_norm": 0.35565888487555863, + "learning_rate": 3.5975287648376344e-05, + "loss": 2.8899, + "step": 40687 + }, + { + "epoch": 1.894336196661778, + "grad_norm": 0.346018065154992, + "learning_rate": 3.5972687672720756e-05, + "loss": 2.8174, + "step": 40688 + }, + { + "epoch": 1.894382754847871, + "grad_norm": 0.34472495967880384, + "learning_rate": 3.597008773823393e-05, + "loss": 2.6392, + "step": 40689 + }, + { + "epoch": 1.8944293130339642, + "grad_norm": 0.34272238700641355, + "learning_rate": 3.596748784492354e-05, + "loss": 2.7561, + "step": 40690 + }, + { + "epoch": 1.8944758712200573, + "grad_norm": 0.349128346686599, + "learning_rate": 3.596488799279718e-05, + "loss": 2.7202, + "step": 40691 + }, + { + "epoch": 1.8945224294061505, + "grad_norm": 0.3360164924294841, + "learning_rate": 3.5962288181862494e-05, + "loss": 2.5334, + "step": 40692 + }, + { + "epoch": 1.8945689875922436, + "grad_norm": 0.3504169779570655, + "learning_rate": 3.5959688412127114e-05, + "loss": 2.7529, + "step": 40693 + }, + { + "epoch": 1.8946155457783365, + "grad_norm": 0.35015420692756344, + "learning_rate": 3.595708868359865e-05, + "loss": 2.7687, + "step": 40694 + }, + { + "epoch": 1.8946621039644296, + "grad_norm": 0.34551785053986916, + "learning_rate": 3.595448899628476e-05, + "loss": 2.7035, + "step": 40695 + }, + { + "epoch": 1.8947086621505225, + "grad_norm": 0.35568289170115647, + "learning_rate": 3.595188935019306e-05, + "loss": 2.6277, + "step": 40696 + }, + { + "epoch": 1.8947552203366156, + "grad_norm": 0.3698249889333738, + "learning_rate": 3.594928974533116e-05, + "loss": 2.7194, + "step": 40697 + }, + { + "epoch": 1.8948017785227087, + "grad_norm": 0.3206398022531692, + "learning_rate": 3.594669018170674e-05, + "loss": 2.6523, + "step": 40698 + }, + { + "epoch": 1.8948483367088018, + "grad_norm": 0.3497259002477192, + "learning_rate": 3.594409065932739e-05, + "loss": 2.7416, + "step": 40699 + }, + { + "epoch": 1.894894894894895, + "grad_norm": 0.33133410975591804, + "learning_rate": 3.594149117820074e-05, + "loss": 2.6661, + "step": 40700 + }, + { + "epoch": 1.894941453080988, + "grad_norm": 0.34322354153147594, + "learning_rate": 3.5938891738334433e-05, + "loss": 2.7942, + "step": 40701 + }, + { + "epoch": 1.8949880112670812, + "grad_norm": 0.322591532771225, + "learning_rate": 3.593629233973609e-05, + "loss": 2.6674, + "step": 40702 + }, + { + "epoch": 1.895034569453174, + "grad_norm": 0.3255909610283944, + "learning_rate": 3.593369298241335e-05, + "loss": 2.6759, + "step": 40703 + }, + { + "epoch": 1.8950811276392672, + "grad_norm": 0.3264029998987992, + "learning_rate": 3.593109366637384e-05, + "loss": 2.7117, + "step": 40704 + }, + { + "epoch": 1.8951276858253603, + "grad_norm": 0.3419561955392785, + "learning_rate": 3.592849439162518e-05, + "loss": 2.7302, + "step": 40705 + }, + { + "epoch": 1.8951742440114532, + "grad_norm": 0.3237398484549363, + "learning_rate": 3.592589515817501e-05, + "loss": 2.6133, + "step": 40706 + }, + { + "epoch": 1.8952208021975463, + "grad_norm": 0.34574931595667074, + "learning_rate": 3.592329596603092e-05, + "loss": 2.6856, + "step": 40707 + }, + { + "epoch": 1.8952673603836394, + 
"grad_norm": 0.3414726230436912, + "learning_rate": 3.592069681520061e-05, + "loss": 2.6823, + "step": 40708 + }, + { + "epoch": 1.8953139185697325, + "grad_norm": 0.3383834988102094, + "learning_rate": 3.591809770569165e-05, + "loss": 2.7254, + "step": 40709 + }, + { + "epoch": 1.8953604767558256, + "grad_norm": 0.3498928163105356, + "learning_rate": 3.591549863751168e-05, + "loss": 2.6604, + "step": 40710 + }, + { + "epoch": 1.8954070349419188, + "grad_norm": 0.3773386034694238, + "learning_rate": 3.591289961066834e-05, + "loss": 2.6824, + "step": 40711 + }, + { + "epoch": 1.8954535931280119, + "grad_norm": 0.33137548713766607, + "learning_rate": 3.5910300625169235e-05, + "loss": 2.6685, + "step": 40712 + }, + { + "epoch": 1.8955001513141048, + "grad_norm": 0.3620600553545282, + "learning_rate": 3.590770168102203e-05, + "loss": 2.6749, + "step": 40713 + }, + { + "epoch": 1.8955467095001979, + "grad_norm": 0.35082435525166866, + "learning_rate": 3.590510277823434e-05, + "loss": 2.6388, + "step": 40714 + }, + { + "epoch": 1.895593267686291, + "grad_norm": 0.3579045443553147, + "learning_rate": 3.5902503916813756e-05, + "loss": 2.5802, + "step": 40715 + }, + { + "epoch": 1.8956398258723839, + "grad_norm": 0.32290764432871055, + "learning_rate": 3.589990509676796e-05, + "loss": 2.6446, + "step": 40716 + }, + { + "epoch": 1.895686384058477, + "grad_norm": 0.359566512960624, + "learning_rate": 3.5897306318104554e-05, + "loss": 2.8025, + "step": 40717 + }, + { + "epoch": 1.8957329422445701, + "grad_norm": 0.3588796448933529, + "learning_rate": 3.589470758083115e-05, + "loss": 2.6373, + "step": 40718 + }, + { + "epoch": 1.8957795004306632, + "grad_norm": 0.3633075468694286, + "learning_rate": 3.58921088849554e-05, + "loss": 2.5411, + "step": 40719 + }, + { + "epoch": 1.8958260586167563, + "grad_norm": 0.346180722028793, + "learning_rate": 3.58895102304849e-05, + "loss": 2.6184, + "step": 40720 + }, + { + "epoch": 1.8958726168028495, + "grad_norm": 0.344367593505896, + "learning_rate": 3.588691161742732e-05, + "loss": 2.7907, + "step": 40721 + }, + { + "epoch": 1.8959191749889426, + "grad_norm": 0.3569482288097586, + "learning_rate": 3.588431304579027e-05, + "loss": 2.6936, + "step": 40722 + }, + { + "epoch": 1.8959657331750355, + "grad_norm": 0.35577665868298974, + "learning_rate": 3.588171451558134e-05, + "loss": 2.6694, + "step": 40723 + }, + { + "epoch": 1.8960122913611286, + "grad_norm": 0.3732589537412554, + "learning_rate": 3.587911602680821e-05, + "loss": 2.5745, + "step": 40724 + }, + { + "epoch": 1.8960588495472215, + "grad_norm": 0.354490973542847, + "learning_rate": 3.587651757947846e-05, + "loss": 2.6491, + "step": 40725 + }, + { + "epoch": 1.8961054077333146, + "grad_norm": 0.3470846844459074, + "learning_rate": 3.587391917359977e-05, + "loss": 2.7527, + "step": 40726 + }, + { + "epoch": 1.8961519659194077, + "grad_norm": 0.33006221252094237, + "learning_rate": 3.5871320809179734e-05, + "loss": 2.7435, + "step": 40727 + }, + { + "epoch": 1.8961985241055008, + "grad_norm": 0.35209396224820577, + "learning_rate": 3.5868722486225946e-05, + "loss": 2.6529, + "step": 40728 + }, + { + "epoch": 1.896245082291594, + "grad_norm": 0.3460078191997204, + "learning_rate": 3.5866124204746096e-05, + "loss": 2.6256, + "step": 40729 + }, + { + "epoch": 1.896291640477687, + "grad_norm": 0.36712370228252666, + "learning_rate": 3.586352596474777e-05, + "loss": 2.7197, + "step": 40730 + }, + { + "epoch": 1.8963381986637802, + "grad_norm": 0.36220642407852877, + "learning_rate": 3.586092776623861e-05, + 
"loss": 2.7339, + "step": 40731 + }, + { + "epoch": 1.8963847568498733, + "grad_norm": 0.3527851878214767, + "learning_rate": 3.585832960922624e-05, + "loss": 2.6571, + "step": 40732 + }, + { + "epoch": 1.8964313150359662, + "grad_norm": 0.3361672848139568, + "learning_rate": 3.585573149371825e-05, + "loss": 2.7263, + "step": 40733 + }, + { + "epoch": 1.8964778732220593, + "grad_norm": 0.3251634545603648, + "learning_rate": 3.5853133419722316e-05, + "loss": 2.6581, + "step": 40734 + }, + { + "epoch": 1.8965244314081522, + "grad_norm": 0.3459394304397763, + "learning_rate": 3.585053538724604e-05, + "loss": 2.7268, + "step": 40735 + }, + { + "epoch": 1.8965709895942453, + "grad_norm": 0.33090467613074936, + "learning_rate": 3.584793739629704e-05, + "loss": 2.7853, + "step": 40736 + }, + { + "epoch": 1.8966175477803384, + "grad_norm": 0.35227159083072446, + "learning_rate": 3.584533944688296e-05, + "loss": 2.6635, + "step": 40737 + }, + { + "epoch": 1.8966641059664315, + "grad_norm": 0.3500747400927913, + "learning_rate": 3.5842741539011395e-05, + "loss": 2.675, + "step": 40738 + }, + { + "epoch": 1.8967106641525247, + "grad_norm": 0.347165098237499, + "learning_rate": 3.5840143672690006e-05, + "loss": 2.6757, + "step": 40739 + }, + { + "epoch": 1.8967572223386178, + "grad_norm": 0.33574928015294436, + "learning_rate": 3.583754584792641e-05, + "loss": 2.6737, + "step": 40740 + }, + { + "epoch": 1.8968037805247109, + "grad_norm": 0.3840097445164431, + "learning_rate": 3.5834948064728194e-05, + "loss": 2.6294, + "step": 40741 + }, + { + "epoch": 1.8968503387108038, + "grad_norm": 0.3579530292883193, + "learning_rate": 3.583235032310304e-05, + "loss": 2.6124, + "step": 40742 + }, + { + "epoch": 1.896896896896897, + "grad_norm": 0.3407238653877618, + "learning_rate": 3.582975262305852e-05, + "loss": 2.7617, + "step": 40743 + }, + { + "epoch": 1.89694345508299, + "grad_norm": 0.3519996435026504, + "learning_rate": 3.5827154964602295e-05, + "loss": 2.7459, + "step": 40744 + }, + { + "epoch": 1.896990013269083, + "grad_norm": 0.33699397185343216, + "learning_rate": 3.582455734774197e-05, + "loss": 2.6369, + "step": 40745 + }, + { + "epoch": 1.897036571455176, + "grad_norm": 0.3314299689163938, + "learning_rate": 3.5821959772485155e-05, + "loss": 2.7059, + "step": 40746 + }, + { + "epoch": 1.8970831296412691, + "grad_norm": 0.3539961066173841, + "learning_rate": 3.5819362238839514e-05, + "loss": 2.6967, + "step": 40747 + }, + { + "epoch": 1.8971296878273622, + "grad_norm": 0.3346514914104208, + "learning_rate": 3.5816764746812626e-05, + "loss": 2.6566, + "step": 40748 + }, + { + "epoch": 1.8971762460134554, + "grad_norm": 0.3432529027713358, + "learning_rate": 3.581416729641216e-05, + "loss": 2.5975, + "step": 40749 + }, + { + "epoch": 1.8972228041995485, + "grad_norm": 0.3490316382548863, + "learning_rate": 3.5811569887645714e-05, + "loss": 2.6488, + "step": 40750 + }, + { + "epoch": 1.8972693623856416, + "grad_norm": 0.3661359105806338, + "learning_rate": 3.580897252052088e-05, + "loss": 2.7417, + "step": 40751 + }, + { + "epoch": 1.8973159205717345, + "grad_norm": 0.3593287769851833, + "learning_rate": 3.580637519504535e-05, + "loss": 2.7165, + "step": 40752 + }, + { + "epoch": 1.8973624787578276, + "grad_norm": 0.3169250137306855, + "learning_rate": 3.5803777911226696e-05, + "loss": 2.7159, + "step": 40753 + }, + { + "epoch": 1.8974090369439207, + "grad_norm": 0.35985852214249947, + "learning_rate": 3.5801180669072556e-05, + "loss": 2.7731, + "step": 40754 + }, + { + "epoch": 1.8974555951300136, 
+ "grad_norm": 0.3726854473202565, + "learning_rate": 3.579858346859055e-05, + "loss": 2.6887, + "step": 40755 + }, + { + "epoch": 1.8975021533161067, + "grad_norm": 0.37951528046375976, + "learning_rate": 3.579598630978831e-05, + "loss": 2.5959, + "step": 40756 + }, + { + "epoch": 1.8975487115021998, + "grad_norm": 0.31646596361383444, + "learning_rate": 3.579338919267345e-05, + "loss": 2.6392, + "step": 40757 + }, + { + "epoch": 1.897595269688293, + "grad_norm": 0.3745545301078114, + "learning_rate": 3.57907921172536e-05, + "loss": 2.6139, + "step": 40758 + }, + { + "epoch": 1.897641827874386, + "grad_norm": 0.35550437162599124, + "learning_rate": 3.5788195083536345e-05, + "loss": 2.5454, + "step": 40759 + }, + { + "epoch": 1.8976883860604792, + "grad_norm": 0.3322215534334648, + "learning_rate": 3.5785598091529363e-05, + "loss": 2.6, + "step": 40760 + }, + { + "epoch": 1.8977349442465723, + "grad_norm": 0.3587550815373771, + "learning_rate": 3.578300114124023e-05, + "loss": 2.7344, + "step": 40761 + }, + { + "epoch": 1.8977815024326652, + "grad_norm": 0.3357001498407788, + "learning_rate": 3.5780404232676615e-05, + "loss": 2.5851, + "step": 40762 + }, + { + "epoch": 1.8978280606187583, + "grad_norm": 0.3754370685826843, + "learning_rate": 3.577780736584611e-05, + "loss": 2.7153, + "step": 40763 + }, + { + "epoch": 1.8978746188048512, + "grad_norm": 0.32892124294990666, + "learning_rate": 3.577521054075631e-05, + "loss": 2.6413, + "step": 40764 + }, + { + "epoch": 1.8979211769909443, + "grad_norm": 0.3402320592352531, + "learning_rate": 3.577261375741489e-05, + "loss": 2.6302, + "step": 40765 + }, + { + "epoch": 1.8979677351770374, + "grad_norm": 0.3297991167336031, + "learning_rate": 3.577001701582944e-05, + "loss": 2.7186, + "step": 40766 + }, + { + "epoch": 1.8980142933631305, + "grad_norm": 0.3596373563318127, + "learning_rate": 3.576742031600758e-05, + "loss": 2.6562, + "step": 40767 + }, + { + "epoch": 1.8980608515492237, + "grad_norm": 0.3294652617922885, + "learning_rate": 3.576482365795696e-05, + "loss": 2.5869, + "step": 40768 + }, + { + "epoch": 1.8981074097353168, + "grad_norm": 0.34176070133611836, + "learning_rate": 3.576222704168517e-05, + "loss": 2.676, + "step": 40769 + }, + { + "epoch": 1.89815396792141, + "grad_norm": 0.35271874104594747, + "learning_rate": 3.5759630467199844e-05, + "loss": 2.6769, + "step": 40770 + }, + { + "epoch": 1.898200526107503, + "grad_norm": 0.34068378290772183, + "learning_rate": 3.575703393450858e-05, + "loss": 2.7513, + "step": 40771 + }, + { + "epoch": 1.898247084293596, + "grad_norm": 0.37972536723719436, + "learning_rate": 3.5754437443619045e-05, + "loss": 2.7442, + "step": 40772 + }, + { + "epoch": 1.898293642479689, + "grad_norm": 0.34016505992540286, + "learning_rate": 3.575184099453883e-05, + "loss": 2.7064, + "step": 40773 + }, + { + "epoch": 1.898340200665782, + "grad_norm": 0.35693575538527295, + "learning_rate": 3.5749244587275536e-05, + "loss": 2.7761, + "step": 40774 + }, + { + "epoch": 1.898386758851875, + "grad_norm": 0.33960467744106854, + "learning_rate": 3.574664822183682e-05, + "loss": 2.7159, + "step": 40775 + }, + { + "epoch": 1.8984333170379681, + "grad_norm": 0.3282692301206995, + "learning_rate": 3.5744051898230303e-05, + "loss": 2.7025, + "step": 40776 + }, + { + "epoch": 1.8984798752240613, + "grad_norm": 0.35518730580439095, + "learning_rate": 3.574145561646355e-05, + "loss": 2.7236, + "step": 40777 + }, + { + "epoch": 1.8985264334101544, + "grad_norm": 0.35814967767570527, + "learning_rate": 
3.5738859376544254e-05, + "loss": 2.7566, + "step": 40778 + }, + { + "epoch": 1.8985729915962475, + "grad_norm": 0.33160691339796267, + "learning_rate": 3.573626317847999e-05, + "loss": 2.6955, + "step": 40779 + }, + { + "epoch": 1.8986195497823406, + "grad_norm": 0.34562179470888543, + "learning_rate": 3.5733667022278386e-05, + "loss": 2.5869, + "step": 40780 + }, + { + "epoch": 1.8986661079684337, + "grad_norm": 0.34293021819049574, + "learning_rate": 3.573107090794707e-05, + "loss": 2.5756, + "step": 40781 + }, + { + "epoch": 1.8987126661545266, + "grad_norm": 0.33546866368994077, + "learning_rate": 3.572847483549364e-05, + "loss": 2.7075, + "step": 40782 + }, + { + "epoch": 1.8987592243406197, + "grad_norm": 0.3498758957933425, + "learning_rate": 3.572587880492575e-05, + "loss": 2.6929, + "step": 40783 + }, + { + "epoch": 1.8988057825267126, + "grad_norm": 0.3671776923250018, + "learning_rate": 3.572328281625098e-05, + "loss": 2.6064, + "step": 40784 + }, + { + "epoch": 1.8988523407128057, + "grad_norm": 0.37191683597599845, + "learning_rate": 3.572068686947698e-05, + "loss": 2.697, + "step": 40785 + }, + { + "epoch": 1.8988988988988988, + "grad_norm": 0.3496682411775524, + "learning_rate": 3.571809096461137e-05, + "loss": 2.7128, + "step": 40786 + }, + { + "epoch": 1.898945457084992, + "grad_norm": 0.3616305180542738, + "learning_rate": 3.571549510166173e-05, + "loss": 2.6298, + "step": 40787 + }, + { + "epoch": 1.898992015271085, + "grad_norm": 0.3748142870025902, + "learning_rate": 3.571289928063572e-05, + "loss": 2.5905, + "step": 40788 + }, + { + "epoch": 1.8990385734571782, + "grad_norm": 0.34660571496231407, + "learning_rate": 3.571030350154093e-05, + "loss": 2.6225, + "step": 40789 + }, + { + "epoch": 1.8990851316432713, + "grad_norm": 0.3716764978136382, + "learning_rate": 3.5707707764385005e-05, + "loss": 2.6973, + "step": 40790 + }, + { + "epoch": 1.8991316898293642, + "grad_norm": 0.38405055853917464, + "learning_rate": 3.570511206917556e-05, + "loss": 2.8079, + "step": 40791 + }, + { + "epoch": 1.8991782480154573, + "grad_norm": 0.3648963278532367, + "learning_rate": 3.5702516415920183e-05, + "loss": 2.6395, + "step": 40792 + }, + { + "epoch": 1.8992248062015504, + "grad_norm": 0.37060432521899855, + "learning_rate": 3.5699920804626515e-05, + "loss": 2.6872, + "step": 40793 + }, + { + "epoch": 1.8992713643876433, + "grad_norm": 0.38897666764853395, + "learning_rate": 3.5697325235302185e-05, + "loss": 2.7803, + "step": 40794 + }, + { + "epoch": 1.8993179225737364, + "grad_norm": 0.33355961254388644, + "learning_rate": 3.5694729707954774e-05, + "loss": 2.6414, + "step": 40795 + }, + { + "epoch": 1.8993644807598296, + "grad_norm": 0.3881104963033271, + "learning_rate": 3.569213422259195e-05, + "loss": 2.7826, + "step": 40796 + }, + { + "epoch": 1.8994110389459227, + "grad_norm": 0.42805024368863953, + "learning_rate": 3.5689538779221267e-05, + "loss": 2.6969, + "step": 40797 + }, + { + "epoch": 1.8994575971320158, + "grad_norm": 0.3314015133208006, + "learning_rate": 3.56869433778504e-05, + "loss": 2.6819, + "step": 40798 + }, + { + "epoch": 1.899504155318109, + "grad_norm": 0.3712044082149172, + "learning_rate": 3.5684348018486946e-05, + "loss": 2.5774, + "step": 40799 + }, + { + "epoch": 1.899550713504202, + "grad_norm": 0.3495493935777391, + "learning_rate": 3.5681752701138505e-05, + "loss": 2.8103, + "step": 40800 + }, + { + "epoch": 1.899597271690295, + "grad_norm": 0.33748069431055255, + "learning_rate": 3.5679157425812724e-05, + "loss": 2.7035, + "step": 40801 + }, + { 
+ "epoch": 1.899643829876388, + "grad_norm": 0.3788988543955315, + "learning_rate": 3.5676562192517185e-05, + "loss": 2.6389, + "step": 40802 + }, + { + "epoch": 1.8996903880624811, + "grad_norm": 0.3273978089889272, + "learning_rate": 3.5673967001259545e-05, + "loss": 2.6541, + "step": 40803 + }, + { + "epoch": 1.899736946248574, + "grad_norm": 0.35444999284884154, + "learning_rate": 3.56713718520474e-05, + "loss": 2.6707, + "step": 40804 + }, + { + "epoch": 1.8997835044346671, + "grad_norm": 0.3604050328367495, + "learning_rate": 3.5668776744888364e-05, + "loss": 2.542, + "step": 40805 + }, + { + "epoch": 1.8998300626207603, + "grad_norm": 0.34020376436465616, + "learning_rate": 3.566618167979005e-05, + "loss": 2.6763, + "step": 40806 + }, + { + "epoch": 1.8998766208068534, + "grad_norm": 0.33414492465664614, + "learning_rate": 3.5663586656760084e-05, + "loss": 2.7185, + "step": 40807 + }, + { + "epoch": 1.8999231789929465, + "grad_norm": 0.38453071630329044, + "learning_rate": 3.566099167580609e-05, + "loss": 2.7236, + "step": 40808 + }, + { + "epoch": 1.8999697371790396, + "grad_norm": 0.3783151576987155, + "learning_rate": 3.565839673693566e-05, + "loss": 2.7227, + "step": 40809 + }, + { + "epoch": 1.9000162953651327, + "grad_norm": 0.39296990894421074, + "learning_rate": 3.5655801840156416e-05, + "loss": 2.6782, + "step": 40810 + }, + { + "epoch": 1.9000628535512256, + "grad_norm": 0.3918387846984009, + "learning_rate": 3.565320698547599e-05, + "loss": 2.7326, + "step": 40811 + }, + { + "epoch": 1.9001094117373187, + "grad_norm": 0.3813206859511489, + "learning_rate": 3.5650612172902e-05, + "loss": 2.6682, + "step": 40812 + }, + { + "epoch": 1.9001559699234116, + "grad_norm": 0.33316886463725276, + "learning_rate": 3.5648017402442024e-05, + "loss": 2.5863, + "step": 40813 + }, + { + "epoch": 1.9002025281095047, + "grad_norm": 0.385951207487757, + "learning_rate": 3.564542267410372e-05, + "loss": 2.7761, + "step": 40814 + }, + { + "epoch": 1.9002490862955979, + "grad_norm": 0.38652096723198726, + "learning_rate": 3.564282798789468e-05, + "loss": 2.7547, + "step": 40815 + }, + { + "epoch": 1.900295644481691, + "grad_norm": 0.3713077929284404, + "learning_rate": 3.5640233343822533e-05, + "loss": 2.6497, + "step": 40816 + }, + { + "epoch": 1.900342202667784, + "grad_norm": 0.3836911148959517, + "learning_rate": 3.5637638741894884e-05, + "loss": 2.6776, + "step": 40817 + }, + { + "epoch": 1.9003887608538772, + "grad_norm": 0.3595258435845598, + "learning_rate": 3.563504418211935e-05, + "loss": 2.5903, + "step": 40818 + }, + { + "epoch": 1.9004353190399703, + "grad_norm": 0.4058164679434547, + "learning_rate": 3.563244966450354e-05, + "loss": 2.6356, + "step": 40819 + }, + { + "epoch": 1.9004818772260634, + "grad_norm": 0.3444854328082668, + "learning_rate": 3.5629855189055077e-05, + "loss": 2.6871, + "step": 40820 + }, + { + "epoch": 1.9005284354121563, + "grad_norm": 0.3752838536434986, + "learning_rate": 3.562726075578158e-05, + "loss": 2.715, + "step": 40821 + }, + { + "epoch": 1.9005749935982494, + "grad_norm": 0.38285133457742515, + "learning_rate": 3.5624666364690664e-05, + "loss": 2.6808, + "step": 40822 + }, + { + "epoch": 1.9006215517843423, + "grad_norm": 0.335912141941229, + "learning_rate": 3.562207201578991e-05, + "loss": 2.6392, + "step": 40823 + }, + { + "epoch": 1.9006681099704354, + "grad_norm": 0.38159123203620177, + "learning_rate": 3.561947770908698e-05, + "loss": 2.666, + "step": 40824 + }, + { + "epoch": 1.9007146681565286, + "grad_norm": 0.34594986678411216, + 
"learning_rate": 3.561688344458944e-05, + "loss": 2.6306, + "step": 40825 + }, + { + "epoch": 1.9007612263426217, + "grad_norm": 0.35628257232152555, + "learning_rate": 3.561428922230496e-05, + "loss": 2.7982, + "step": 40826 + }, + { + "epoch": 1.9008077845287148, + "grad_norm": 0.3609139021442685, + "learning_rate": 3.561169504224112e-05, + "loss": 2.6259, + "step": 40827 + }, + { + "epoch": 1.900854342714808, + "grad_norm": 0.329779259226043, + "learning_rate": 3.5609100904405515e-05, + "loss": 2.6958, + "step": 40828 + }, + { + "epoch": 1.900900900900901, + "grad_norm": 0.3846809490954695, + "learning_rate": 3.56065068088058e-05, + "loss": 2.6965, + "step": 40829 + }, + { + "epoch": 1.900947459086994, + "grad_norm": 0.35317588108245807, + "learning_rate": 3.5603912755449576e-05, + "loss": 2.6142, + "step": 40830 + }, + { + "epoch": 1.900994017273087, + "grad_norm": 0.3354006770625262, + "learning_rate": 3.5601318744344436e-05, + "loss": 2.6701, + "step": 40831 + }, + { + "epoch": 1.9010405754591801, + "grad_norm": 0.33278255338349155, + "learning_rate": 3.559872477549801e-05, + "loss": 2.6913, + "step": 40832 + }, + { + "epoch": 1.901087133645273, + "grad_norm": 0.32901373391122224, + "learning_rate": 3.5596130848917906e-05, + "loss": 2.6345, + "step": 40833 + }, + { + "epoch": 1.9011336918313662, + "grad_norm": 0.3457783178442397, + "learning_rate": 3.559353696461175e-05, + "loss": 2.7134, + "step": 40834 + }, + { + "epoch": 1.9011802500174593, + "grad_norm": 0.3592192667067147, + "learning_rate": 3.559094312258714e-05, + "loss": 2.6859, + "step": 40835 + }, + { + "epoch": 1.9012268082035524, + "grad_norm": 0.31145476963916413, + "learning_rate": 3.558834932285168e-05, + "loss": 2.6463, + "step": 40836 + }, + { + "epoch": 1.9012733663896455, + "grad_norm": 0.3399096316602162, + "learning_rate": 3.5585755565413e-05, + "loss": 2.7141, + "step": 40837 + }, + { + "epoch": 1.9013199245757386, + "grad_norm": 0.3334906663239976, + "learning_rate": 3.55831618502787e-05, + "loss": 2.552, + "step": 40838 + }, + { + "epoch": 1.9013664827618317, + "grad_norm": 0.3192666217006775, + "learning_rate": 3.558056817745642e-05, + "loss": 2.5694, + "step": 40839 + }, + { + "epoch": 1.9014130409479246, + "grad_norm": 0.3428696179484379, + "learning_rate": 3.5577974546953744e-05, + "loss": 2.6956, + "step": 40840 + }, + { + "epoch": 1.9014595991340177, + "grad_norm": 0.33867799434931184, + "learning_rate": 3.557538095877828e-05, + "loss": 2.649, + "step": 40841 + }, + { + "epoch": 1.9015061573201109, + "grad_norm": 0.31319468679336404, + "learning_rate": 3.5572787412937666e-05, + "loss": 2.6506, + "step": 40842 + }, + { + "epoch": 1.9015527155062038, + "grad_norm": 0.3057030548646413, + "learning_rate": 3.55701939094395e-05, + "loss": 2.735, + "step": 40843 + }, + { + "epoch": 1.9015992736922969, + "grad_norm": 0.3517546560836318, + "learning_rate": 3.556760044829138e-05, + "loss": 2.7545, + "step": 40844 + }, + { + "epoch": 1.90164583187839, + "grad_norm": 0.3192163979163412, + "learning_rate": 3.5565007029500946e-05, + "loss": 2.702, + "step": 40845 + }, + { + "epoch": 1.901692390064483, + "grad_norm": 0.32177207870428653, + "learning_rate": 3.5562413653075785e-05, + "loss": 2.5642, + "step": 40846 + }, + { + "epoch": 1.9017389482505762, + "grad_norm": 0.3585164982450559, + "learning_rate": 3.555982031902353e-05, + "loss": 2.6638, + "step": 40847 + }, + { + "epoch": 1.9017855064366693, + "grad_norm": 0.3244649719769144, + "learning_rate": 3.555722702735177e-05, + "loss": 2.6299, + "step": 40848 + }, + { 
+ "epoch": 1.9018320646227624, + "grad_norm": 0.3141957286170393, + "learning_rate": 3.555463377806811e-05, + "loss": 2.6098, + "step": 40849 + }, + { + "epoch": 1.9018786228088553, + "grad_norm": 0.3679797309450637, + "learning_rate": 3.55520405711802e-05, + "loss": 2.6703, + "step": 40850 + }, + { + "epoch": 1.9019251809949484, + "grad_norm": 0.3606485155577534, + "learning_rate": 3.554944740669561e-05, + "loss": 2.7838, + "step": 40851 + }, + { + "epoch": 1.9019717391810413, + "grad_norm": 0.38349226830893224, + "learning_rate": 3.554685428462199e-05, + "loss": 2.7163, + "step": 40852 + }, + { + "epoch": 1.9020182973671345, + "grad_norm": 0.3442381316799233, + "learning_rate": 3.554426120496692e-05, + "loss": 2.7236, + "step": 40853 + }, + { + "epoch": 1.9020648555532276, + "grad_norm": 0.32159903917484467, + "learning_rate": 3.5541668167738e-05, + "loss": 2.7864, + "step": 40854 + }, + { + "epoch": 1.9021114137393207, + "grad_norm": 0.31824336803029984, + "learning_rate": 3.553907517294288e-05, + "loss": 2.6686, + "step": 40855 + }, + { + "epoch": 1.9021579719254138, + "grad_norm": 0.36873025083972694, + "learning_rate": 3.553648222058915e-05, + "loss": 2.608, + "step": 40856 + }, + { + "epoch": 1.902204530111507, + "grad_norm": 0.35242023235320563, + "learning_rate": 3.553388931068441e-05, + "loss": 2.6142, + "step": 40857 + }, + { + "epoch": 1.9022510882976, + "grad_norm": 0.3525251838060047, + "learning_rate": 3.5531296443236294e-05, + "loss": 2.6264, + "step": 40858 + }, + { + "epoch": 1.9022976464836931, + "grad_norm": 0.3505849639988462, + "learning_rate": 3.552870361825238e-05, + "loss": 2.6865, + "step": 40859 + }, + { + "epoch": 1.902344204669786, + "grad_norm": 0.3574012341892984, + "learning_rate": 3.552611083574031e-05, + "loss": 2.6437, + "step": 40860 + }, + { + "epoch": 1.9023907628558792, + "grad_norm": 0.35174998769907306, + "learning_rate": 3.5523518095707664e-05, + "loss": 2.6868, + "step": 40861 + }, + { + "epoch": 1.902437321041972, + "grad_norm": 0.34600579396241066, + "learning_rate": 3.552092539816208e-05, + "loss": 2.6699, + "step": 40862 + }, + { + "epoch": 1.9024838792280652, + "grad_norm": 0.3456869262204719, + "learning_rate": 3.551833274311116e-05, + "loss": 2.6858, + "step": 40863 + }, + { + "epoch": 1.9025304374141583, + "grad_norm": 0.33826470495935446, + "learning_rate": 3.551574013056248e-05, + "loss": 2.6457, + "step": 40864 + }, + { + "epoch": 1.9025769956002514, + "grad_norm": 0.33576959610506063, + "learning_rate": 3.55131475605237e-05, + "loss": 2.6295, + "step": 40865 + }, + { + "epoch": 1.9026235537863445, + "grad_norm": 0.3577650090610923, + "learning_rate": 3.5510555033002405e-05, + "loss": 2.7242, + "step": 40866 + }, + { + "epoch": 1.9026701119724376, + "grad_norm": 0.36738563760072185, + "learning_rate": 3.5507962548006175e-05, + "loss": 2.7272, + "step": 40867 + }, + { + "epoch": 1.9027166701585307, + "grad_norm": 0.33748959162852465, + "learning_rate": 3.5505370105542675e-05, + "loss": 2.6536, + "step": 40868 + }, + { + "epoch": 1.9027632283446239, + "grad_norm": 0.35242029871705405, + "learning_rate": 3.5502777705619474e-05, + "loss": 2.7127, + "step": 40869 + }, + { + "epoch": 1.9028097865307168, + "grad_norm": 0.36231935214919864, + "learning_rate": 3.5500185348244195e-05, + "loss": 2.7058, + "step": 40870 + }, + { + "epoch": 1.9028563447168099, + "grad_norm": 0.3633691482185415, + "learning_rate": 3.5497593033424447e-05, + "loss": 2.6702, + "step": 40871 + }, + { + "epoch": 1.9029029029029028, + "grad_norm": 0.3517640741284848, + 
"learning_rate": 3.5495000761167826e-05, + "loss": 2.6239, + "step": 40872 + }, + { + "epoch": 1.9029494610889959, + "grad_norm": 0.37334616240916196, + "learning_rate": 3.5492408531481964e-05, + "loss": 2.6995, + "step": 40873 + }, + { + "epoch": 1.902996019275089, + "grad_norm": 0.38973373688483404, + "learning_rate": 3.548981634437443e-05, + "loss": 2.7023, + "step": 40874 + }, + { + "epoch": 1.903042577461182, + "grad_norm": 0.3271131253787962, + "learning_rate": 3.548722419985288e-05, + "loss": 2.6827, + "step": 40875 + }, + { + "epoch": 1.9030891356472752, + "grad_norm": 0.3837614436356353, + "learning_rate": 3.548463209792489e-05, + "loss": 2.694, + "step": 40876 + }, + { + "epoch": 1.9031356938333683, + "grad_norm": 0.3335106703043783, + "learning_rate": 3.548204003859806e-05, + "loss": 2.7465, + "step": 40877 + }, + { + "epoch": 1.9031822520194615, + "grad_norm": 0.3717261380337671, + "learning_rate": 3.547944802188003e-05, + "loss": 2.7119, + "step": 40878 + }, + { + "epoch": 1.9032288102055543, + "grad_norm": 0.359650071762108, + "learning_rate": 3.5476856047778376e-05, + "loss": 2.5757, + "step": 40879 + }, + { + "epoch": 1.9032753683916475, + "grad_norm": 0.3259559422953688, + "learning_rate": 3.547426411630073e-05, + "loss": 2.6896, + "step": 40880 + }, + { + "epoch": 1.9033219265777406, + "grad_norm": 0.32649892516030316, + "learning_rate": 3.547167222745469e-05, + "loss": 2.7807, + "step": 40881 + }, + { + "epoch": 1.9033684847638335, + "grad_norm": 0.3756344875227046, + "learning_rate": 3.546908038124785e-05, + "loss": 2.6864, + "step": 40882 + }, + { + "epoch": 1.9034150429499266, + "grad_norm": 0.3281685589907961, + "learning_rate": 3.546648857768783e-05, + "loss": 2.672, + "step": 40883 + }, + { + "epoch": 1.9034616011360197, + "grad_norm": 0.33500636662933914, + "learning_rate": 3.5463896816782236e-05, + "loss": 2.7182, + "step": 40884 + }, + { + "epoch": 1.9035081593221128, + "grad_norm": 0.350653445986832, + "learning_rate": 3.5461305098538676e-05, + "loss": 2.6618, + "step": 40885 + }, + { + "epoch": 1.903554717508206, + "grad_norm": 0.3394421306139646, + "learning_rate": 3.545871342296476e-05, + "loss": 2.6813, + "step": 40886 + }, + { + "epoch": 1.903601275694299, + "grad_norm": 0.3724484507376874, + "learning_rate": 3.545612179006806e-05, + "loss": 2.8178, + "step": 40887 + }, + { + "epoch": 1.9036478338803922, + "grad_norm": 0.35261985407806756, + "learning_rate": 3.5453530199856235e-05, + "loss": 2.6833, + "step": 40888 + }, + { + "epoch": 1.903694392066485, + "grad_norm": 0.33027450481033505, + "learning_rate": 3.545093865233687e-05, + "loss": 2.6739, + "step": 40889 + }, + { + "epoch": 1.9037409502525782, + "grad_norm": 0.34895388912996894, + "learning_rate": 3.5448347147517536e-05, + "loss": 2.6497, + "step": 40890 + }, + { + "epoch": 1.9037875084386713, + "grad_norm": 0.3617186007165681, + "learning_rate": 3.5445755685405904e-05, + "loss": 2.6743, + "step": 40891 + }, + { + "epoch": 1.9038340666247642, + "grad_norm": 0.33436761486921657, + "learning_rate": 3.544316426600951e-05, + "loss": 2.7036, + "step": 40892 + }, + { + "epoch": 1.9038806248108573, + "grad_norm": 0.3305587877600185, + "learning_rate": 3.544057288933602e-05, + "loss": 2.6612, + "step": 40893 + }, + { + "epoch": 1.9039271829969504, + "grad_norm": 0.32381048691733905, + "learning_rate": 3.5437981555393015e-05, + "loss": 2.708, + "step": 40894 + }, + { + "epoch": 1.9039737411830435, + "grad_norm": 0.3642161361130554, + "learning_rate": 3.5435390264188095e-05, + "loss": 2.6515, + "step": 
40895 + }, + { + "epoch": 1.9040202993691366, + "grad_norm": 0.3353625627799985, + "learning_rate": 3.543279901572887e-05, + "loss": 2.7414, + "step": 40896 + }, + { + "epoch": 1.9040668575552298, + "grad_norm": 0.34199249828788103, + "learning_rate": 3.543020781002294e-05, + "loss": 2.6306, + "step": 40897 + }, + { + "epoch": 1.9041134157413229, + "grad_norm": 0.3418356865987015, + "learning_rate": 3.542761664707792e-05, + "loss": 2.617, + "step": 40898 + }, + { + "epoch": 1.9041599739274158, + "grad_norm": 0.34584499734689644, + "learning_rate": 3.5425025526901414e-05, + "loss": 2.7079, + "step": 40899 + }, + { + "epoch": 1.9042065321135089, + "grad_norm": 0.35935907362418645, + "learning_rate": 3.5422434449501e-05, + "loss": 2.7356, + "step": 40900 + }, + { + "epoch": 1.9042530902996018, + "grad_norm": 0.3294129493059677, + "learning_rate": 3.541984341488433e-05, + "loss": 2.599, + "step": 40901 + }, + { + "epoch": 1.9042996484856949, + "grad_norm": 0.3418168502449861, + "learning_rate": 3.541725242305898e-05, + "loss": 2.6885, + "step": 40902 + }, + { + "epoch": 1.904346206671788, + "grad_norm": 0.35642383601483646, + "learning_rate": 3.541466147403254e-05, + "loss": 2.7053, + "step": 40903 + }, + { + "epoch": 1.9043927648578811, + "grad_norm": 0.36077230594088955, + "learning_rate": 3.5412070567812647e-05, + "loss": 2.6438, + "step": 40904 + }, + { + "epoch": 1.9044393230439742, + "grad_norm": 0.3189656106550288, + "learning_rate": 3.540947970440687e-05, + "loss": 2.6884, + "step": 40905 + }, + { + "epoch": 1.9044858812300673, + "grad_norm": 0.3280941552384905, + "learning_rate": 3.5406888883822856e-05, + "loss": 2.52, + "step": 40906 + }, + { + "epoch": 1.9045324394161605, + "grad_norm": 0.340810536966383, + "learning_rate": 3.540429810606819e-05, + "loss": 2.7383, + "step": 40907 + }, + { + "epoch": 1.9045789976022536, + "grad_norm": 0.3481733790609574, + "learning_rate": 3.5401707371150447e-05, + "loss": 2.6911, + "step": 40908 + }, + { + "epoch": 1.9046255557883465, + "grad_norm": 0.3388821513641098, + "learning_rate": 3.539911667907727e-05, + "loss": 2.6211, + "step": 40909 + }, + { + "epoch": 1.9046721139744396, + "grad_norm": 0.34760443203961483, + "learning_rate": 3.5396526029856234e-05, + "loss": 2.6842, + "step": 40910 + }, + { + "epoch": 1.9047186721605325, + "grad_norm": 0.3424844671617849, + "learning_rate": 3.5393935423494975e-05, + "loss": 2.6807, + "step": 40911 + }, + { + "epoch": 1.9047652303466256, + "grad_norm": 0.3834790248753113, + "learning_rate": 3.539134486000108e-05, + "loss": 2.7111, + "step": 40912 + }, + { + "epoch": 1.9048117885327187, + "grad_norm": 0.35022344263305166, + "learning_rate": 3.538875433938212e-05, + "loss": 2.7282, + "step": 40913 + }, + { + "epoch": 1.9048583467188118, + "grad_norm": 0.3390935833956596, + "learning_rate": 3.538616386164575e-05, + "loss": 2.7807, + "step": 40914 + }, + { + "epoch": 1.904904904904905, + "grad_norm": 0.3690355309231568, + "learning_rate": 3.538357342679952e-05, + "loss": 2.6639, + "step": 40915 + }, + { + "epoch": 1.904951463090998, + "grad_norm": 0.3196805964616735, + "learning_rate": 3.5380983034851095e-05, + "loss": 2.6857, + "step": 40916 + }, + { + "epoch": 1.9049980212770912, + "grad_norm": 0.3558386613961843, + "learning_rate": 3.537839268580804e-05, + "loss": 2.7704, + "step": 40917 + }, + { + "epoch": 1.905044579463184, + "grad_norm": 0.34626387680281995, + "learning_rate": 3.5375802379677944e-05, + "loss": 2.6499, + "step": 40918 + }, + { + "epoch": 1.9050911376492772, + "grad_norm": 
0.32632717050820953, + "learning_rate": 3.537321211646844e-05, + "loss": 2.6055, + "step": 40919 + }, + { + "epoch": 1.9051376958353703, + "grad_norm": 0.34923068700188387, + "learning_rate": 3.537062189618713e-05, + "loss": 2.7583, + "step": 40920 + }, + { + "epoch": 1.9051842540214632, + "grad_norm": 0.35754727502844763, + "learning_rate": 3.5368031718841574e-05, + "loss": 2.7285, + "step": 40921 + }, + { + "epoch": 1.9052308122075563, + "grad_norm": 0.33252898556720223, + "learning_rate": 3.5365441584439416e-05, + "loss": 2.662, + "step": 40922 + }, + { + "epoch": 1.9052773703936494, + "grad_norm": 0.330020064189696, + "learning_rate": 3.5362851492988245e-05, + "loss": 2.7072, + "step": 40923 + }, + { + "epoch": 1.9053239285797425, + "grad_norm": 0.337258031648703, + "learning_rate": 3.536026144449568e-05, + "loss": 2.7152, + "step": 40924 + }, + { + "epoch": 1.9053704867658356, + "grad_norm": 0.3201074609101664, + "learning_rate": 3.53576714389693e-05, + "loss": 2.6698, + "step": 40925 + }, + { + "epoch": 1.9054170449519288, + "grad_norm": 0.3513037826842112, + "learning_rate": 3.535508147641669e-05, + "loss": 2.6446, + "step": 40926 + }, + { + "epoch": 1.9054636031380219, + "grad_norm": 0.3198122318708776, + "learning_rate": 3.535249155684549e-05, + "loss": 2.7995, + "step": 40927 + }, + { + "epoch": 1.9055101613241148, + "grad_norm": 0.3422412919860772, + "learning_rate": 3.534990168026327e-05, + "loss": 2.8316, + "step": 40928 + }, + { + "epoch": 1.9055567195102079, + "grad_norm": 0.3515270462682023, + "learning_rate": 3.534731184667766e-05, + "loss": 2.6753, + "step": 40929 + }, + { + "epoch": 1.905603277696301, + "grad_norm": 0.3440060515632991, + "learning_rate": 3.534472205609625e-05, + "loss": 2.7231, + "step": 40930 + }, + { + "epoch": 1.905649835882394, + "grad_norm": 0.37680441344696175, + "learning_rate": 3.534213230852661e-05, + "loss": 2.6668, + "step": 40931 + }, + { + "epoch": 1.905696394068487, + "grad_norm": 0.3395963836763471, + "learning_rate": 3.533954260397639e-05, + "loss": 2.6502, + "step": 40932 + }, + { + "epoch": 1.9057429522545801, + "grad_norm": 0.37483714517042144, + "learning_rate": 3.533695294245316e-05, + "loss": 2.6653, + "step": 40933 + }, + { + "epoch": 1.9057895104406732, + "grad_norm": 0.34067166592288944, + "learning_rate": 3.5334363323964546e-05, + "loss": 2.6727, + "step": 40934 + }, + { + "epoch": 1.9058360686267664, + "grad_norm": 0.34998438863056497, + "learning_rate": 3.5331773748518115e-05, + "loss": 2.6261, + "step": 40935 + }, + { + "epoch": 1.9058826268128595, + "grad_norm": 0.3914265663466515, + "learning_rate": 3.532918421612148e-05, + "loss": 2.7227, + "step": 40936 + }, + { + "epoch": 1.9059291849989526, + "grad_norm": 0.3706205143297463, + "learning_rate": 3.532659472678226e-05, + "loss": 2.7644, + "step": 40937 + }, + { + "epoch": 1.9059757431850455, + "grad_norm": 0.34080450234813714, + "learning_rate": 3.5324005280508035e-05, + "loss": 2.6449, + "step": 40938 + }, + { + "epoch": 1.9060223013711386, + "grad_norm": 0.35695368016525975, + "learning_rate": 3.5321415877306384e-05, + "loss": 2.6997, + "step": 40939 + }, + { + "epoch": 1.9060688595572315, + "grad_norm": 0.40903185269568565, + "learning_rate": 3.5318826517184964e-05, + "loss": 2.6972, + "step": 40940 + }, + { + "epoch": 1.9061154177433246, + "grad_norm": 0.3263835132111783, + "learning_rate": 3.531623720015131e-05, + "loss": 2.6229, + "step": 40941 + }, + { + "epoch": 1.9061619759294177, + "grad_norm": 0.3475789846581307, + "learning_rate": 3.531364792621308e-05, + 
"loss": 2.6622, + "step": 40942 + }, + { + "epoch": 1.9062085341155108, + "grad_norm": 0.3476869151663631, + "learning_rate": 3.5311058695377844e-05, + "loss": 2.7477, + "step": 40943 + }, + { + "epoch": 1.906255092301604, + "grad_norm": 0.346943541358573, + "learning_rate": 3.530846950765318e-05, + "loss": 2.6657, + "step": 40944 + }, + { + "epoch": 1.906301650487697, + "grad_norm": 0.3248101269625356, + "learning_rate": 3.530588036304674e-05, + "loss": 2.5941, + "step": 40945 + }, + { + "epoch": 1.9063482086737902, + "grad_norm": 0.34086561495760825, + "learning_rate": 3.530329126156607e-05, + "loss": 2.6527, + "step": 40946 + }, + { + "epoch": 1.9063947668598833, + "grad_norm": 0.33269694124187327, + "learning_rate": 3.5300702203218814e-05, + "loss": 2.6985, + "step": 40947 + }, + { + "epoch": 1.9064413250459762, + "grad_norm": 0.31956947150693904, + "learning_rate": 3.529811318801254e-05, + "loss": 2.6921, + "step": 40948 + }, + { + "epoch": 1.9064878832320693, + "grad_norm": 0.3511686310163955, + "learning_rate": 3.5295524215954845e-05, + "loss": 2.6822, + "step": 40949 + }, + { + "epoch": 1.9065344414181622, + "grad_norm": 0.3392644401418246, + "learning_rate": 3.529293528705335e-05, + "loss": 2.65, + "step": 40950 + }, + { + "epoch": 1.9065809996042553, + "grad_norm": 0.32631470031670984, + "learning_rate": 3.529034640131563e-05, + "loss": 2.6773, + "step": 40951 + }, + { + "epoch": 1.9066275577903484, + "grad_norm": 0.35697535104576084, + "learning_rate": 3.528775755874931e-05, + "loss": 2.6448, + "step": 40952 + }, + { + "epoch": 1.9066741159764415, + "grad_norm": 0.36679485893844965, + "learning_rate": 3.5285168759361966e-05, + "loss": 2.7069, + "step": 40953 + }, + { + "epoch": 1.9067206741625347, + "grad_norm": 0.3330188931594746, + "learning_rate": 3.5282580003161187e-05, + "loss": 2.6496, + "step": 40954 + }, + { + "epoch": 1.9067672323486278, + "grad_norm": 0.36630870669958593, + "learning_rate": 3.527999129015461e-05, + "loss": 2.5793, + "step": 40955 + }, + { + "epoch": 1.9068137905347209, + "grad_norm": 0.3500306748593658, + "learning_rate": 3.5277402620349794e-05, + "loss": 2.7813, + "step": 40956 + }, + { + "epoch": 1.906860348720814, + "grad_norm": 0.35853713800603354, + "learning_rate": 3.5274813993754346e-05, + "loss": 2.6747, + "step": 40957 + }, + { + "epoch": 1.906906906906907, + "grad_norm": 0.3546945636289683, + "learning_rate": 3.527222541037588e-05, + "loss": 2.6539, + "step": 40958 + }, + { + "epoch": 1.906953465093, + "grad_norm": 0.34744284469402764, + "learning_rate": 3.5269636870221976e-05, + "loss": 2.6202, + "step": 40959 + }, + { + "epoch": 1.907000023279093, + "grad_norm": 0.35485271698703563, + "learning_rate": 3.526704837330025e-05, + "loss": 2.6938, + "step": 40960 + }, + { + "epoch": 1.907046581465186, + "grad_norm": 0.3366044366345972, + "learning_rate": 3.5264459919618265e-05, + "loss": 2.7814, + "step": 40961 + }, + { + "epoch": 1.9070931396512791, + "grad_norm": 0.31233724973848564, + "learning_rate": 3.526187150918365e-05, + "loss": 2.7298, + "step": 40962 + }, + { + "epoch": 1.9071396978373722, + "grad_norm": 0.38259896048835884, + "learning_rate": 3.525928314200399e-05, + "loss": 2.7269, + "step": 40963 + }, + { + "epoch": 1.9071862560234654, + "grad_norm": 0.33183919308075205, + "learning_rate": 3.5256694818086864e-05, + "loss": 2.6467, + "step": 40964 + }, + { + "epoch": 1.9072328142095585, + "grad_norm": 0.334978264828759, + "learning_rate": 3.525410653743991e-05, + "loss": 2.6901, + "step": 40965 + }, + { + "epoch": 1.9072793723956516, 
+ "grad_norm": 0.331892332237207, + "learning_rate": 3.52515183000707e-05, + "loss": 2.6844, + "step": 40966 + }, + { + "epoch": 1.9073259305817445, + "grad_norm": 0.344176002356002, + "learning_rate": 3.524893010598681e-05, + "loss": 2.818, + "step": 40967 + }, + { + "epoch": 1.9073724887678376, + "grad_norm": 0.3451069418526316, + "learning_rate": 3.524634195519588e-05, + "loss": 2.6058, + "step": 40968 + }, + { + "epoch": 1.9074190469539307, + "grad_norm": 0.31681286977852474, + "learning_rate": 3.524375384770546e-05, + "loss": 2.6532, + "step": 40969 + }, + { + "epoch": 1.9074656051400236, + "grad_norm": 0.3482924986289704, + "learning_rate": 3.524116578352319e-05, + "loss": 2.6391, + "step": 40970 + }, + { + "epoch": 1.9075121633261167, + "grad_norm": 0.35351953361277644, + "learning_rate": 3.5238577762656637e-05, + "loss": 2.5854, + "step": 40971 + }, + { + "epoch": 1.9075587215122098, + "grad_norm": 0.3149307309013424, + "learning_rate": 3.52359897851134e-05, + "loss": 2.7258, + "step": 40972 + }, + { + "epoch": 1.907605279698303, + "grad_norm": 0.34294456799389234, + "learning_rate": 3.523340185090109e-05, + "loss": 2.6874, + "step": 40973 + }, + { + "epoch": 1.907651837884396, + "grad_norm": 0.32191028707605357, + "learning_rate": 3.5230813960027275e-05, + "loss": 2.716, + "step": 40974 + }, + { + "epoch": 1.9076983960704892, + "grad_norm": 0.3466356914730134, + "learning_rate": 3.5228226112499576e-05, + "loss": 2.6298, + "step": 40975 + }, + { + "epoch": 1.9077449542565823, + "grad_norm": 0.3519836015869408, + "learning_rate": 3.5225638308325584e-05, + "loss": 2.608, + "step": 40976 + }, + { + "epoch": 1.9077915124426752, + "grad_norm": 0.3354836038466345, + "learning_rate": 3.5223050547512856e-05, + "loss": 2.6097, + "step": 40977 + }, + { + "epoch": 1.9078380706287683, + "grad_norm": 0.32726900004737974, + "learning_rate": 3.5220462830069055e-05, + "loss": 2.5492, + "step": 40978 + }, + { + "epoch": 1.9078846288148614, + "grad_norm": 0.3427329007186841, + "learning_rate": 3.5217875156001736e-05, + "loss": 2.7063, + "step": 40979 + }, + { + "epoch": 1.9079311870009543, + "grad_norm": 0.3381201302175249, + "learning_rate": 3.5215287525318476e-05, + "loss": 2.6439, + "step": 40980 + }, + { + "epoch": 1.9079777451870474, + "grad_norm": 0.34648590626763975, + "learning_rate": 3.5212699938026905e-05, + "loss": 2.7338, + "step": 40981 + }, + { + "epoch": 1.9080243033731406, + "grad_norm": 0.3288814757778229, + "learning_rate": 3.5210112394134584e-05, + "loss": 2.6866, + "step": 40982 + }, + { + "epoch": 1.9080708615592337, + "grad_norm": 0.3249309805614163, + "learning_rate": 3.520752489364916e-05, + "loss": 2.7712, + "step": 40983 + }, + { + "epoch": 1.9081174197453268, + "grad_norm": 0.3361665484406508, + "learning_rate": 3.5204937436578176e-05, + "loss": 2.6061, + "step": 40984 + }, + { + "epoch": 1.90816397793142, + "grad_norm": 0.31906870027156575, + "learning_rate": 3.520235002292924e-05, + "loss": 2.7687, + "step": 40985 + }, + { + "epoch": 1.908210536117513, + "grad_norm": 0.3314596344501383, + "learning_rate": 3.519976265270995e-05, + "loss": 2.5825, + "step": 40986 + }, + { + "epoch": 1.908257094303606, + "grad_norm": 0.3318977837629484, + "learning_rate": 3.51971753259279e-05, + "loss": 2.6249, + "step": 40987 + }, + { + "epoch": 1.908303652489699, + "grad_norm": 0.32523196396325693, + "learning_rate": 3.5194588042590684e-05, + "loss": 2.6451, + "step": 40988 + }, + { + "epoch": 1.908350210675792, + "grad_norm": 0.3257933019216348, + "learning_rate": 3.51920008027059e-05, + 
"loss": 2.651, + "step": 40989 + }, + { + "epoch": 1.908396768861885, + "grad_norm": 0.3881398544565282, + "learning_rate": 3.518941360628111e-05, + "loss": 2.6684, + "step": 40990 + }, + { + "epoch": 1.9084433270479781, + "grad_norm": 0.3281791245282309, + "learning_rate": 3.518682645332396e-05, + "loss": 2.6593, + "step": 40991 + }, + { + "epoch": 1.9084898852340713, + "grad_norm": 0.3701521172847672, + "learning_rate": 3.518423934384201e-05, + "loss": 2.6326, + "step": 40992 + }, + { + "epoch": 1.9085364434201644, + "grad_norm": 0.36633295403018784, + "learning_rate": 3.518165227784284e-05, + "loss": 2.6669, + "step": 40993 + }, + { + "epoch": 1.9085830016062575, + "grad_norm": 0.36763277858494187, + "learning_rate": 3.517906525533408e-05, + "loss": 2.6725, + "step": 40994 + }, + { + "epoch": 1.9086295597923506, + "grad_norm": 0.3441005450650911, + "learning_rate": 3.517647827632328e-05, + "loss": 2.6643, + "step": 40995 + }, + { + "epoch": 1.9086761179784437, + "grad_norm": 0.35472943986932515, + "learning_rate": 3.517389134081808e-05, + "loss": 2.742, + "step": 40996 + }, + { + "epoch": 1.9087226761645366, + "grad_norm": 0.36206790647042214, + "learning_rate": 3.517130444882604e-05, + "loss": 2.6684, + "step": 40997 + }, + { + "epoch": 1.9087692343506297, + "grad_norm": 0.39106165646031527, + "learning_rate": 3.516871760035476e-05, + "loss": 2.6608, + "step": 40998 + }, + { + "epoch": 1.9088157925367226, + "grad_norm": 0.37223217109156964, + "learning_rate": 3.516613079541184e-05, + "loss": 2.6146, + "step": 40999 + }, + { + "epoch": 1.9088623507228157, + "grad_norm": 0.33927821350540455, + "learning_rate": 3.5163544034004846e-05, + "loss": 2.7379, + "step": 41000 + }, + { + "epoch": 1.9089089089089089, + "grad_norm": 0.3799182630377173, + "learning_rate": 3.516095731614141e-05, + "loss": 2.6519, + "step": 41001 + }, + { + "epoch": 1.908955467095002, + "grad_norm": 0.3319829303036212, + "learning_rate": 3.5158370641829095e-05, + "loss": 2.7501, + "step": 41002 + }, + { + "epoch": 1.909002025281095, + "grad_norm": 0.36893121256067035, + "learning_rate": 3.515578401107549e-05, + "loss": 2.7736, + "step": 41003 + }, + { + "epoch": 1.9090485834671882, + "grad_norm": 0.37009634720472734, + "learning_rate": 3.5153197423888205e-05, + "loss": 2.7195, + "step": 41004 + }, + { + "epoch": 1.9090951416532813, + "grad_norm": 0.33663876219548017, + "learning_rate": 3.515061088027481e-05, + "loss": 2.7472, + "step": 41005 + }, + { + "epoch": 1.9091416998393742, + "grad_norm": 0.39110097579691394, + "learning_rate": 3.5148024380242924e-05, + "loss": 2.7111, + "step": 41006 + }, + { + "epoch": 1.9091882580254673, + "grad_norm": 0.3340174521339328, + "learning_rate": 3.514543792380013e-05, + "loss": 2.6631, + "step": 41007 + }, + { + "epoch": 1.9092348162115604, + "grad_norm": 0.3436360290806096, + "learning_rate": 3.514285151095399e-05, + "loss": 2.6449, + "step": 41008 + }, + { + "epoch": 1.9092813743976533, + "grad_norm": 0.356547399105994, + "learning_rate": 3.514026514171214e-05, + "loss": 2.6573, + "step": 41009 + }, + { + "epoch": 1.9093279325837464, + "grad_norm": 0.3360313777057006, + "learning_rate": 3.513767881608214e-05, + "loss": 2.5959, + "step": 41010 + }, + { + "epoch": 1.9093744907698396, + "grad_norm": 0.33173757126716785, + "learning_rate": 3.5135092534071586e-05, + "loss": 2.7048, + "step": 41011 + }, + { + "epoch": 1.9094210489559327, + "grad_norm": 0.32451294363164057, + "learning_rate": 3.513250629568808e-05, + "loss": 2.7122, + "step": 41012 + }, + { + "epoch": 
1.9094676071420258, + "grad_norm": 0.37459053848712154, + "learning_rate": 3.512992010093919e-05, + "loss": 2.6203, + "step": 41013 + }, + { + "epoch": 1.909514165328119, + "grad_norm": 0.3325614041492625, + "learning_rate": 3.512733394983253e-05, + "loss": 2.7133, + "step": 41014 + }, + { + "epoch": 1.909560723514212, + "grad_norm": 0.34672678721294925, + "learning_rate": 3.512474784237567e-05, + "loss": 2.6635, + "step": 41015 + }, + { + "epoch": 1.909607281700305, + "grad_norm": 0.33867155844889923, + "learning_rate": 3.51221617785762e-05, + "loss": 2.7112, + "step": 41016 + }, + { + "epoch": 1.909653839886398, + "grad_norm": 0.33773853590718556, + "learning_rate": 3.5119575758441744e-05, + "loss": 2.6783, + "step": 41017 + }, + { + "epoch": 1.9097003980724911, + "grad_norm": 0.320851152728488, + "learning_rate": 3.511698978197985e-05, + "loss": 2.6741, + "step": 41018 + }, + { + "epoch": 1.909746956258584, + "grad_norm": 0.350409516404562, + "learning_rate": 3.511440384919813e-05, + "loss": 2.6567, + "step": 41019 + }, + { + "epoch": 1.9097935144446772, + "grad_norm": 0.32696089376420673, + "learning_rate": 3.511181796010418e-05, + "loss": 2.7113, + "step": 41020 + }, + { + "epoch": 1.9098400726307703, + "grad_norm": 0.3528452180997692, + "learning_rate": 3.510923211470555e-05, + "loss": 2.7791, + "step": 41021 + }, + { + "epoch": 1.9098866308168634, + "grad_norm": 0.3324040905314481, + "learning_rate": 3.5106646313009885e-05, + "loss": 2.5549, + "step": 41022 + }, + { + "epoch": 1.9099331890029565, + "grad_norm": 0.3457615859597717, + "learning_rate": 3.5104060555024734e-05, + "loss": 2.7491, + "step": 41023 + }, + { + "epoch": 1.9099797471890496, + "grad_norm": 0.3429420475895876, + "learning_rate": 3.51014748407577e-05, + "loss": 2.6701, + "step": 41024 + }, + { + "epoch": 1.9100263053751427, + "grad_norm": 0.35029016816728803, + "learning_rate": 3.5098889170216374e-05, + "loss": 2.7457, + "step": 41025 + }, + { + "epoch": 1.9100728635612356, + "grad_norm": 0.3270044532522017, + "learning_rate": 3.509630354340832e-05, + "loss": 2.6727, + "step": 41026 + }, + { + "epoch": 1.9101194217473287, + "grad_norm": 0.3393387538228068, + "learning_rate": 3.509371796034117e-05, + "loss": 2.6814, + "step": 41027 + }, + { + "epoch": 1.9101659799334216, + "grad_norm": 0.32609114300404834, + "learning_rate": 3.509113242102249e-05, + "loss": 2.7003, + "step": 41028 + }, + { + "epoch": 1.9102125381195147, + "grad_norm": 0.34605725863824954, + "learning_rate": 3.5088546925459843e-05, + "loss": 2.6555, + "step": 41029 + }, + { + "epoch": 1.9102590963056079, + "grad_norm": 0.3495744241488884, + "learning_rate": 3.508596147366086e-05, + "loss": 2.6742, + "step": 41030 + }, + { + "epoch": 1.910305654491701, + "grad_norm": 0.34826859040774966, + "learning_rate": 3.508337606563309e-05, + "loss": 2.687, + "step": 41031 + }, + { + "epoch": 1.910352212677794, + "grad_norm": 0.3720037927629425, + "learning_rate": 3.508079070138417e-05, + "loss": 2.6876, + "step": 41032 + }, + { + "epoch": 1.9103987708638872, + "grad_norm": 0.34541630189709926, + "learning_rate": 3.507820538092166e-05, + "loss": 2.7221, + "step": 41033 + }, + { + "epoch": 1.9104453290499803, + "grad_norm": 0.34283759030583116, + "learning_rate": 3.507562010425312e-05, + "loss": 2.6944, + "step": 41034 + }, + { + "epoch": 1.9104918872360734, + "grad_norm": 0.3835309918494372, + "learning_rate": 3.507303487138619e-05, + "loss": 2.707, + "step": 41035 + }, + { + "epoch": 1.9105384454221663, + "grad_norm": 0.32171130992476726, + "learning_rate": 
3.507044968232842e-05, + "loss": 2.6634, + "step": 41036 + }, + { + "epoch": 1.9105850036082594, + "grad_norm": 0.3511499631466384, + "learning_rate": 3.5067864537087415e-05, + "loss": 2.7324, + "step": 41037 + }, + { + "epoch": 1.9106315617943523, + "grad_norm": 0.3599992079018346, + "learning_rate": 3.506527943567076e-05, + "loss": 2.6997, + "step": 41038 + }, + { + "epoch": 1.9106781199804455, + "grad_norm": 0.34513376701502707, + "learning_rate": 3.506269437808602e-05, + "loss": 2.682, + "step": 41039 + }, + { + "epoch": 1.9107246781665386, + "grad_norm": 0.3267472716752802, + "learning_rate": 3.506010936434082e-05, + "loss": 2.7082, + "step": 41040 + }, + { + "epoch": 1.9107712363526317, + "grad_norm": 0.3712083932614787, + "learning_rate": 3.5057524394442705e-05, + "loss": 2.7206, + "step": 41041 + }, + { + "epoch": 1.9108177945387248, + "grad_norm": 0.3287485137929086, + "learning_rate": 3.505493946839931e-05, + "loss": 2.6665, + "step": 41042 + }, + { + "epoch": 1.910864352724818, + "grad_norm": 0.3426768344628438, + "learning_rate": 3.5052354586218185e-05, + "loss": 2.6658, + "step": 41043 + }, + { + "epoch": 1.910910910910911, + "grad_norm": 0.34633859898420266, + "learning_rate": 3.504976974790691e-05, + "loss": 2.6054, + "step": 41044 + }, + { + "epoch": 1.9109574690970041, + "grad_norm": 0.3444817220232462, + "learning_rate": 3.504718495347311e-05, + "loss": 2.7161, + "step": 41045 + }, + { + "epoch": 1.911004027283097, + "grad_norm": 0.34260903672298754, + "learning_rate": 3.5044600202924354e-05, + "loss": 2.6816, + "step": 41046 + }, + { + "epoch": 1.9110505854691902, + "grad_norm": 0.3237541410915766, + "learning_rate": 3.5042015496268196e-05, + "loss": 2.6179, + "step": 41047 + }, + { + "epoch": 1.911097143655283, + "grad_norm": 0.35029382176497526, + "learning_rate": 3.503943083351228e-05, + "loss": 2.7737, + "step": 41048 + }, + { + "epoch": 1.9111437018413762, + "grad_norm": 0.30969470211221123, + "learning_rate": 3.503684621466413e-05, + "loss": 2.7366, + "step": 41049 + }, + { + "epoch": 1.9111902600274693, + "grad_norm": 0.49448825796262713, + "learning_rate": 3.503426163973139e-05, + "loss": 2.6925, + "step": 41050 + }, + { + "epoch": 1.9112368182135624, + "grad_norm": 0.33048071489857883, + "learning_rate": 3.5031677108721614e-05, + "loss": 2.7105, + "step": 41051 + }, + { + "epoch": 1.9112833763996555, + "grad_norm": 0.3316372381064882, + "learning_rate": 3.502909262164237e-05, + "loss": 2.6262, + "step": 41052 + }, + { + "epoch": 1.9113299345857486, + "grad_norm": 0.3525068413434912, + "learning_rate": 3.502650817850128e-05, + "loss": 2.758, + "step": 41053 + }, + { + "epoch": 1.9113764927718417, + "grad_norm": 0.3565683608898263, + "learning_rate": 3.502392377930589e-05, + "loss": 2.7218, + "step": 41054 + }, + { + "epoch": 1.9114230509579346, + "grad_norm": 0.31097988499687296, + "learning_rate": 3.5021339424063836e-05, + "loss": 2.6012, + "step": 41055 + }, + { + "epoch": 1.9114696091440277, + "grad_norm": 0.34903116768186787, + "learning_rate": 3.5018755112782675e-05, + "loss": 2.7418, + "step": 41056 + }, + { + "epoch": 1.9115161673301209, + "grad_norm": 0.34595721134922425, + "learning_rate": 3.501617084546998e-05, + "loss": 2.6492, + "step": 41057 + }, + { + "epoch": 1.9115627255162138, + "grad_norm": 0.35399066584147504, + "learning_rate": 3.5013586622133355e-05, + "loss": 2.7074, + "step": 41058 + }, + { + "epoch": 1.9116092837023069, + "grad_norm": 0.3414818991772085, + "learning_rate": 3.5011002442780374e-05, + "loss": 2.6241, + "step": 41059 + }, + { 
+ "epoch": 1.9116558418884, + "grad_norm": 0.35722959933827936, + "learning_rate": 3.500841830741861e-05, + "loss": 2.5622, + "step": 41060 + }, + { + "epoch": 1.911702400074493, + "grad_norm": 0.348073889128164, + "learning_rate": 3.500583421605569e-05, + "loss": 2.739, + "step": 41061 + }, + { + "epoch": 1.9117489582605862, + "grad_norm": 0.33394647737837074, + "learning_rate": 3.500325016869915e-05, + "loss": 2.6776, + "step": 41062 + }, + { + "epoch": 1.9117955164466793, + "grad_norm": 0.3478785137082989, + "learning_rate": 3.50006661653566e-05, + "loss": 2.6861, + "step": 41063 + }, + { + "epoch": 1.9118420746327724, + "grad_norm": 0.3586000010874225, + "learning_rate": 3.4998082206035605e-05, + "loss": 2.7562, + "step": 41064 + }, + { + "epoch": 1.9118886328188653, + "grad_norm": 0.3284660669481802, + "learning_rate": 3.4995498290743776e-05, + "loss": 2.7308, + "step": 41065 + }, + { + "epoch": 1.9119351910049585, + "grad_norm": 0.3660776642338505, + "learning_rate": 3.499291441948868e-05, + "loss": 2.6766, + "step": 41066 + }, + { + "epoch": 1.9119817491910516, + "grad_norm": 0.34351141676891905, + "learning_rate": 3.4990330592277886e-05, + "loss": 2.6092, + "step": 41067 + }, + { + "epoch": 1.9120283073771445, + "grad_norm": 0.344895439144377, + "learning_rate": 3.498774680911902e-05, + "loss": 2.7128, + "step": 41068 + }, + { + "epoch": 1.9120748655632376, + "grad_norm": 0.3546669957986558, + "learning_rate": 3.498516307001962e-05, + "loss": 2.6073, + "step": 41069 + }, + { + "epoch": 1.9121214237493307, + "grad_norm": 0.3342756920642345, + "learning_rate": 3.498257937498728e-05, + "loss": 2.627, + "step": 41070 + }, + { + "epoch": 1.9121679819354238, + "grad_norm": 0.3475603694426101, + "learning_rate": 3.49799957240296e-05, + "loss": 2.617, + "step": 41071 + }, + { + "epoch": 1.912214540121517, + "grad_norm": 0.3594812203202838, + "learning_rate": 3.497741211715415e-05, + "loss": 2.6463, + "step": 41072 + }, + { + "epoch": 1.91226109830761, + "grad_norm": 0.346687456450282, + "learning_rate": 3.497482855436852e-05, + "loss": 2.6044, + "step": 41073 + }, + { + "epoch": 1.9123076564937032, + "grad_norm": 0.39435040950125755, + "learning_rate": 3.4972245035680294e-05, + "loss": 2.6554, + "step": 41074 + }, + { + "epoch": 1.912354214679796, + "grad_norm": 0.33527273036585536, + "learning_rate": 3.496966156109703e-05, + "loss": 2.5677, + "step": 41075 + }, + { + "epoch": 1.9124007728658892, + "grad_norm": 0.3442265891743755, + "learning_rate": 3.496707813062635e-05, + "loss": 2.7415, + "step": 41076 + }, + { + "epoch": 1.912447331051982, + "grad_norm": 0.3744634353228082, + "learning_rate": 3.496449474427579e-05, + "loss": 2.5414, + "step": 41077 + }, + { + "epoch": 1.9124938892380752, + "grad_norm": 0.33535788659419963, + "learning_rate": 3.496191140205298e-05, + "loss": 2.6999, + "step": 41078 + }, + { + "epoch": 1.9125404474241683, + "grad_norm": 0.3199485644960498, + "learning_rate": 3.495932810396548e-05, + "loss": 2.6476, + "step": 41079 + }, + { + "epoch": 1.9125870056102614, + "grad_norm": 0.34647933097648553, + "learning_rate": 3.4956744850020836e-05, + "loss": 2.722, + "step": 41080 + }, + { + "epoch": 1.9126335637963545, + "grad_norm": 0.3315880730582965, + "learning_rate": 3.49541616402267e-05, + "loss": 2.6045, + "step": 41081 + }, + { + "epoch": 1.9126801219824476, + "grad_norm": 0.32548247887840015, + "learning_rate": 3.495157847459059e-05, + "loss": 2.654, + "step": 41082 + }, + { + "epoch": 1.9127266801685407, + "grad_norm": 0.3546388985922169, + "learning_rate": 
3.494899535312014e-05, + "loss": 2.6854, + "step": 41083 + }, + { + "epoch": 1.9127732383546339, + "grad_norm": 0.33317091370270574, + "learning_rate": 3.4946412275822904e-05, + "loss": 2.6975, + "step": 41084 + }, + { + "epoch": 1.9128197965407268, + "grad_norm": 0.33924818010879887, + "learning_rate": 3.494382924270645e-05, + "loss": 2.6202, + "step": 41085 + }, + { + "epoch": 1.9128663547268199, + "grad_norm": 0.34803539157434576, + "learning_rate": 3.494124625377838e-05, + "loss": 2.7444, + "step": 41086 + }, + { + "epoch": 1.9129129129129128, + "grad_norm": 0.34519778242387544, + "learning_rate": 3.493866330904627e-05, + "loss": 2.6783, + "step": 41087 + }, + { + "epoch": 1.9129594710990059, + "grad_norm": 0.324570668152233, + "learning_rate": 3.493608040851769e-05, + "loss": 2.6731, + "step": 41088 + }, + { + "epoch": 1.913006029285099, + "grad_norm": 0.3521201229925794, + "learning_rate": 3.493349755220025e-05, + "loss": 2.6928, + "step": 41089 + }, + { + "epoch": 1.9130525874711921, + "grad_norm": 0.3717561709198939, + "learning_rate": 3.493091474010148e-05, + "loss": 2.5543, + "step": 41090 + }, + { + "epoch": 1.9130991456572852, + "grad_norm": 0.3325760437029159, + "learning_rate": 3.4928331972229025e-05, + "loss": 2.6824, + "step": 41091 + }, + { + "epoch": 1.9131457038433783, + "grad_norm": 0.3232417305793341, + "learning_rate": 3.4925749248590425e-05, + "loss": 2.6662, + "step": 41092 + }, + { + "epoch": 1.9131922620294715, + "grad_norm": 0.33705963043851433, + "learning_rate": 3.492316656919323e-05, + "loss": 2.7175, + "step": 41093 + }, + { + "epoch": 1.9132388202155644, + "grad_norm": 0.3587913155143064, + "learning_rate": 3.492058393404509e-05, + "loss": 2.5826, + "step": 41094 + }, + { + "epoch": 1.9132853784016575, + "grad_norm": 0.3248721160642347, + "learning_rate": 3.4918001343153534e-05, + "loss": 2.7626, + "step": 41095 + }, + { + "epoch": 1.9133319365877506, + "grad_norm": 0.3511557336149404, + "learning_rate": 3.491541879652617e-05, + "loss": 2.7256, + "step": 41096 + }, + { + "epoch": 1.9133784947738435, + "grad_norm": 0.35377562748004293, + "learning_rate": 3.491283629417057e-05, + "loss": 2.7281, + "step": 41097 + }, + { + "epoch": 1.9134250529599366, + "grad_norm": 0.31447651274218236, + "learning_rate": 3.49102538360943e-05, + "loss": 2.7762, + "step": 41098 + }, + { + "epoch": 1.9134716111460297, + "grad_norm": 0.36802952737888883, + "learning_rate": 3.4907671422304944e-05, + "loss": 2.643, + "step": 41099 + }, + { + "epoch": 1.9135181693321228, + "grad_norm": 0.33507816632621057, + "learning_rate": 3.4905089052810094e-05, + "loss": 2.6304, + "step": 41100 + }, + { + "epoch": 1.913564727518216, + "grad_norm": 0.3948601714824849, + "learning_rate": 3.4902506727617324e-05, + "loss": 2.6331, + "step": 41101 + }, + { + "epoch": 1.913611285704309, + "grad_norm": 0.3434829393891515, + "learning_rate": 3.489992444673421e-05, + "loss": 2.6997, + "step": 41102 + }, + { + "epoch": 1.9136578438904022, + "grad_norm": 0.3846134974722357, + "learning_rate": 3.4897342210168305e-05, + "loss": 2.7783, + "step": 41103 + }, + { + "epoch": 1.913704402076495, + "grad_norm": 0.3741762749057406, + "learning_rate": 3.489476001792724e-05, + "loss": 2.4914, + "step": 41104 + }, + { + "epoch": 1.9137509602625882, + "grad_norm": 0.37570846050082096, + "learning_rate": 3.4892177870018563e-05, + "loss": 2.7107, + "step": 41105 + }, + { + "epoch": 1.9137975184486813, + "grad_norm": 0.3502061376456196, + "learning_rate": 3.488959576644984e-05, + "loss": 2.8097, + "step": 41106 + }, + { + 
"epoch": 1.9138440766347742, + "grad_norm": 0.3638236800313658, + "learning_rate": 3.488701370722868e-05, + "loss": 2.6864, + "step": 41107 + }, + { + "epoch": 1.9138906348208673, + "grad_norm": 0.3431293643960119, + "learning_rate": 3.488443169236263e-05, + "loss": 2.644, + "step": 41108 + }, + { + "epoch": 1.9139371930069604, + "grad_norm": 0.3802391567244408, + "learning_rate": 3.48818497218593e-05, + "loss": 2.6902, + "step": 41109 + }, + { + "epoch": 1.9139837511930535, + "grad_norm": 0.3468184355153013, + "learning_rate": 3.487926779572626e-05, + "loss": 2.669, + "step": 41110 + }, + { + "epoch": 1.9140303093791466, + "grad_norm": 0.3443825707337161, + "learning_rate": 3.487668591397105e-05, + "loss": 2.6847, + "step": 41111 + }, + { + "epoch": 1.9140768675652398, + "grad_norm": 0.3935554431271281, + "learning_rate": 3.487410407660129e-05, + "loss": 2.6712, + "step": 41112 + }, + { + "epoch": 1.9141234257513329, + "grad_norm": 0.3527262503445123, + "learning_rate": 3.4871522283624544e-05, + "loss": 2.6234, + "step": 41113 + }, + { + "epoch": 1.9141699839374258, + "grad_norm": 0.35257049628362225, + "learning_rate": 3.48689405350484e-05, + "loss": 2.5437, + "step": 41114 + }, + { + "epoch": 1.9142165421235189, + "grad_norm": 0.36397958264424196, + "learning_rate": 3.486635883088043e-05, + "loss": 2.6933, + "step": 41115 + }, + { + "epoch": 1.9142631003096118, + "grad_norm": 0.3539926750490881, + "learning_rate": 3.4863777171128174e-05, + "loss": 2.6569, + "step": 41116 + }, + { + "epoch": 1.914309658495705, + "grad_norm": 0.36302450232005307, + "learning_rate": 3.486119555579927e-05, + "loss": 2.5201, + "step": 41117 + }, + { + "epoch": 1.914356216681798, + "grad_norm": 0.31527263462052024, + "learning_rate": 3.485861398490125e-05, + "loss": 2.683, + "step": 41118 + }, + { + "epoch": 1.9144027748678911, + "grad_norm": 0.3574900040334849, + "learning_rate": 3.485603245844171e-05, + "loss": 2.695, + "step": 41119 + }, + { + "epoch": 1.9144493330539842, + "grad_norm": 0.32197678840017085, + "learning_rate": 3.485345097642824e-05, + "loss": 2.7581, + "step": 41120 + }, + { + "epoch": 1.9144958912400774, + "grad_norm": 0.3526256369339279, + "learning_rate": 3.4850869538868365e-05, + "loss": 2.8086, + "step": 41121 + }, + { + "epoch": 1.9145424494261705, + "grad_norm": 0.34944528777537154, + "learning_rate": 3.4848288145769725e-05, + "loss": 2.5747, + "step": 41122 + }, + { + "epoch": 1.9145890076122636, + "grad_norm": 0.3531486284651685, + "learning_rate": 3.484570679713986e-05, + "loss": 2.6629, + "step": 41123 + }, + { + "epoch": 1.9146355657983565, + "grad_norm": 0.34399329432491943, + "learning_rate": 3.484312549298634e-05, + "loss": 2.6797, + "step": 41124 + }, + { + "epoch": 1.9146821239844496, + "grad_norm": 0.3480102782399932, + "learning_rate": 3.4840544233316763e-05, + "loss": 2.7137, + "step": 41125 + }, + { + "epoch": 1.9147286821705425, + "grad_norm": 0.36040527104400044, + "learning_rate": 3.483796301813869e-05, + "loss": 2.7228, + "step": 41126 + }, + { + "epoch": 1.9147752403566356, + "grad_norm": 0.34472315606074116, + "learning_rate": 3.483538184745971e-05, + "loss": 2.7066, + "step": 41127 + }, + { + "epoch": 1.9148217985427287, + "grad_norm": 0.34705791661907376, + "learning_rate": 3.48328007212874e-05, + "loss": 2.6539, + "step": 41128 + }, + { + "epoch": 1.9148683567288218, + "grad_norm": 0.36386542969741376, + "learning_rate": 3.48302196396293e-05, + "loss": 2.6876, + "step": 41129 + }, + { + "epoch": 1.914914914914915, + "grad_norm": 0.332013477136072, + 
"learning_rate": 3.482763860249303e-05, + "loss": 2.6547, + "step": 41130 + }, + { + "epoch": 1.914961473101008, + "grad_norm": 0.35362496541865196, + "learning_rate": 3.4825057609886126e-05, + "loss": 2.6524, + "step": 41131 + }, + { + "epoch": 1.9150080312871012, + "grad_norm": 0.35197523442247397, + "learning_rate": 3.48224766618162e-05, + "loss": 2.7315, + "step": 41132 + }, + { + "epoch": 1.9150545894731943, + "grad_norm": 0.3506262243466994, + "learning_rate": 3.481989575829081e-05, + "loss": 2.7493, + "step": 41133 + }, + { + "epoch": 1.9151011476592872, + "grad_norm": 0.3757770154316391, + "learning_rate": 3.4817314899317514e-05, + "loss": 2.7013, + "step": 41134 + }, + { + "epoch": 1.9151477058453803, + "grad_norm": 0.34201781612766863, + "learning_rate": 3.481473408490392e-05, + "loss": 2.7049, + "step": 41135 + }, + { + "epoch": 1.9151942640314732, + "grad_norm": 0.34138578964326105, + "learning_rate": 3.481215331505758e-05, + "loss": 2.6301, + "step": 41136 + }, + { + "epoch": 1.9152408222175663, + "grad_norm": 0.3695710929185929, + "learning_rate": 3.480957258978609e-05, + "loss": 2.6455, + "step": 41137 + }, + { + "epoch": 1.9152873804036594, + "grad_norm": 0.3451671347196884, + "learning_rate": 3.4806991909096985e-05, + "loss": 2.6435, + "step": 41138 + }, + { + "epoch": 1.9153339385897525, + "grad_norm": 0.36234125980242116, + "learning_rate": 3.4804411272997864e-05, + "loss": 2.7523, + "step": 41139 + }, + { + "epoch": 1.9153804967758457, + "grad_norm": 0.3895018787098583, + "learning_rate": 3.4801830681496314e-05, + "loss": 2.6454, + "step": 41140 + }, + { + "epoch": 1.9154270549619388, + "grad_norm": 0.3673375780263906, + "learning_rate": 3.479925013459989e-05, + "loss": 2.6772, + "step": 41141 + }, + { + "epoch": 1.9154736131480319, + "grad_norm": 0.3635966266617671, + "learning_rate": 3.479666963231615e-05, + "loss": 2.7491, + "step": 41142 + }, + { + "epoch": 1.9155201713341248, + "grad_norm": 0.392517899401742, + "learning_rate": 3.479408917465271e-05, + "loss": 2.6954, + "step": 41143 + }, + { + "epoch": 1.915566729520218, + "grad_norm": 0.3622837678358068, + "learning_rate": 3.47915087616171e-05, + "loss": 2.7387, + "step": 41144 + }, + { + "epoch": 1.915613287706311, + "grad_norm": 0.3481211728905596, + "learning_rate": 3.4788928393216924e-05, + "loss": 2.6481, + "step": 41145 + }, + { + "epoch": 1.915659845892404, + "grad_norm": 0.3789939187753253, + "learning_rate": 3.478634806945975e-05, + "loss": 2.7299, + "step": 41146 + }, + { + "epoch": 1.915706404078497, + "grad_norm": 0.3536456370266118, + "learning_rate": 3.478376779035313e-05, + "loss": 2.7638, + "step": 41147 + }, + { + "epoch": 1.9157529622645901, + "grad_norm": 0.3696189535434539, + "learning_rate": 3.478118755590467e-05, + "loss": 2.638, + "step": 41148 + }, + { + "epoch": 1.9157995204506832, + "grad_norm": 0.35338630025781675, + "learning_rate": 3.477860736612191e-05, + "loss": 2.6891, + "step": 41149 + }, + { + "epoch": 1.9158460786367764, + "grad_norm": 0.3552470325580024, + "learning_rate": 3.477602722101245e-05, + "loss": 2.5807, + "step": 41150 + }, + { + "epoch": 1.9158926368228695, + "grad_norm": 0.33269353337284646, + "learning_rate": 3.4773447120583844e-05, + "loss": 2.7055, + "step": 41151 + }, + { + "epoch": 1.9159391950089626, + "grad_norm": 0.345898638913954, + "learning_rate": 3.477086706484367e-05, + "loss": 2.7153, + "step": 41152 + }, + { + "epoch": 1.9159857531950555, + "grad_norm": 0.37262177796843315, + "learning_rate": 3.4768287053799516e-05, + "loss": 2.6802, + "step": 41153 + 
}, + { + "epoch": 1.9160323113811486, + "grad_norm": 0.34890536286682566, + "learning_rate": 3.476570708745891e-05, + "loss": 2.6697, + "step": 41154 + }, + { + "epoch": 1.9160788695672415, + "grad_norm": 0.31081901712885557, + "learning_rate": 3.476312716582948e-05, + "loss": 2.6184, + "step": 41155 + }, + { + "epoch": 1.9161254277533346, + "grad_norm": 0.37987349429020106, + "learning_rate": 3.4760547288918766e-05, + "loss": 2.7259, + "step": 41156 + }, + { + "epoch": 1.9161719859394277, + "grad_norm": 0.362630336946172, + "learning_rate": 3.4757967456734326e-05, + "loss": 2.7055, + "step": 41157 + }, + { + "epoch": 1.9162185441255208, + "grad_norm": 0.34853330501620294, + "learning_rate": 3.475538766928377e-05, + "loss": 2.5686, + "step": 41158 + }, + { + "epoch": 1.916265102311614, + "grad_norm": 0.3751355903764624, + "learning_rate": 3.475280792657465e-05, + "loss": 2.6777, + "step": 41159 + }, + { + "epoch": 1.916311660497707, + "grad_norm": 0.3164944325248091, + "learning_rate": 3.475022822861451e-05, + "loss": 2.6459, + "step": 41160 + }, + { + "epoch": 1.9163582186838002, + "grad_norm": 0.3702662326395544, + "learning_rate": 3.474764857541097e-05, + "loss": 2.7001, + "step": 41161 + }, + { + "epoch": 1.9164047768698933, + "grad_norm": 0.3377783878836793, + "learning_rate": 3.474506896697158e-05, + "loss": 2.7203, + "step": 41162 + }, + { + "epoch": 1.9164513350559862, + "grad_norm": 0.3933943798695507, + "learning_rate": 3.474248940330391e-05, + "loss": 2.6838, + "step": 41163 + }, + { + "epoch": 1.9164978932420793, + "grad_norm": 0.33834859426322345, + "learning_rate": 3.473990988441552e-05, + "loss": 2.5697, + "step": 41164 + }, + { + "epoch": 1.9165444514281722, + "grad_norm": 0.326468417618414, + "learning_rate": 3.4737330410313994e-05, + "loss": 2.6579, + "step": 41165 + }, + { + "epoch": 1.9165910096142653, + "grad_norm": 0.3675029819291159, + "learning_rate": 3.473475098100691e-05, + "loss": 2.7413, + "step": 41166 + }, + { + "epoch": 1.9166375678003584, + "grad_norm": 0.32699245144278855, + "learning_rate": 3.4732171596501804e-05, + "loss": 2.6684, + "step": 41167 + }, + { + "epoch": 1.9166841259864515, + "grad_norm": 0.33778672301482976, + "learning_rate": 3.4729592256806293e-05, + "loss": 2.6519, + "step": 41168 + }, + { + "epoch": 1.9167306841725447, + "grad_norm": 0.3667663199922211, + "learning_rate": 3.472701296192793e-05, + "loss": 2.6739, + "step": 41169 + }, + { + "epoch": 1.9167772423586378, + "grad_norm": 0.35410333817545775, + "learning_rate": 3.472443371187425e-05, + "loss": 2.73, + "step": 41170 + }, + { + "epoch": 1.916823800544731, + "grad_norm": 0.33189584211218515, + "learning_rate": 3.472185450665287e-05, + "loss": 2.6576, + "step": 41171 + }, + { + "epoch": 1.916870358730824, + "grad_norm": 0.32622772552822166, + "learning_rate": 3.471927534627133e-05, + "loss": 2.59, + "step": 41172 + }, + { + "epoch": 1.916916916916917, + "grad_norm": 0.33294685652636913, + "learning_rate": 3.471669623073723e-05, + "loss": 2.6756, + "step": 41173 + }, + { + "epoch": 1.91696347510301, + "grad_norm": 0.33231756739060136, + "learning_rate": 3.4714117160058124e-05, + "loss": 2.668, + "step": 41174 + }, + { + "epoch": 1.917010033289103, + "grad_norm": 0.3307291188351362, + "learning_rate": 3.471153813424156e-05, + "loss": 2.6342, + "step": 41175 + }, + { + "epoch": 1.917056591475196, + "grad_norm": 0.3456186711474883, + "learning_rate": 3.4708959153295125e-05, + "loss": 2.7542, + "step": 41176 + }, + { + "epoch": 1.9171031496612891, + "grad_norm": 0.35533480156911534, + 
"learning_rate": 3.4706380217226406e-05, + "loss": 2.5227, + "step": 41177 + }, + { + "epoch": 1.9171497078473823, + "grad_norm": 0.3475759671954587, + "learning_rate": 3.4703801326042945e-05, + "loss": 2.6082, + "step": 41178 + }, + { + "epoch": 1.9171962660334754, + "grad_norm": 0.3505506895528727, + "learning_rate": 3.4701222479752325e-05, + "loss": 2.714, + "step": 41179 + }, + { + "epoch": 1.9172428242195685, + "grad_norm": 0.34448482332584074, + "learning_rate": 3.469864367836209e-05, + "loss": 2.5528, + "step": 41180 + }, + { + "epoch": 1.9172893824056616, + "grad_norm": 0.34054194196842474, + "learning_rate": 3.469606492187986e-05, + "loss": 2.7195, + "step": 41181 + }, + { + "epoch": 1.9173359405917545, + "grad_norm": 0.3528525668757614, + "learning_rate": 3.4693486210313155e-05, + "loss": 2.6117, + "step": 41182 + }, + { + "epoch": 1.9173824987778476, + "grad_norm": 0.3331738079897981, + "learning_rate": 3.469090754366955e-05, + "loss": 2.5822, + "step": 41183 + }, + { + "epoch": 1.9174290569639407, + "grad_norm": 0.34565390718478956, + "learning_rate": 3.468832892195664e-05, + "loss": 2.6895, + "step": 41184 + }, + { + "epoch": 1.9174756151500336, + "grad_norm": 0.3274287386072303, + "learning_rate": 3.4685750345181954e-05, + "loss": 2.6732, + "step": 41185 + }, + { + "epoch": 1.9175221733361267, + "grad_norm": 0.3521998041425819, + "learning_rate": 3.46831718133531e-05, + "loss": 2.6263, + "step": 41186 + }, + { + "epoch": 1.9175687315222198, + "grad_norm": 0.3640979728644587, + "learning_rate": 3.468059332647764e-05, + "loss": 2.6305, + "step": 41187 + }, + { + "epoch": 1.917615289708313, + "grad_norm": 0.32442036449971745, + "learning_rate": 3.46780148845631e-05, + "loss": 2.7579, + "step": 41188 + }, + { + "epoch": 1.917661847894406, + "grad_norm": 0.3525268536443604, + "learning_rate": 3.4675436487617097e-05, + "loss": 2.6476, + "step": 41189 + }, + { + "epoch": 1.9177084060804992, + "grad_norm": 0.3453685050446245, + "learning_rate": 3.4672858135647166e-05, + "loss": 2.7065, + "step": 41190 + }, + { + "epoch": 1.9177549642665923, + "grad_norm": 0.33666677312723253, + "learning_rate": 3.46702798286609e-05, + "loss": 2.7238, + "step": 41191 + }, + { + "epoch": 1.9178015224526852, + "grad_norm": 0.3540160913102141, + "learning_rate": 3.466770156666586e-05, + "loss": 2.6054, + "step": 41192 + }, + { + "epoch": 1.9178480806387783, + "grad_norm": 0.320225988596674, + "learning_rate": 3.466512334966957e-05, + "loss": 2.6144, + "step": 41193 + }, + { + "epoch": 1.9178946388248714, + "grad_norm": 0.3541093007595951, + "learning_rate": 3.466254517767966e-05, + "loss": 2.685, + "step": 41194 + }, + { + "epoch": 1.9179411970109643, + "grad_norm": 0.35153141775754865, + "learning_rate": 3.465996705070367e-05, + "loss": 2.7707, + "step": 41195 + }, + { + "epoch": 1.9179877551970574, + "grad_norm": 0.3924639152046334, + "learning_rate": 3.465738896874914e-05, + "loss": 2.6347, + "step": 41196 + }, + { + "epoch": 1.9180343133831506, + "grad_norm": 0.3206147752159443, + "learning_rate": 3.465481093182369e-05, + "loss": 2.6318, + "step": 41197 + }, + { + "epoch": 1.9180808715692437, + "grad_norm": 0.35128776038774934, + "learning_rate": 3.465223293993483e-05, + "loss": 2.6628, + "step": 41198 + }, + { + "epoch": 1.9181274297553368, + "grad_norm": 0.3548912526069816, + "learning_rate": 3.464965499309018e-05, + "loss": 2.6791, + "step": 41199 + }, + { + "epoch": 1.91817398794143, + "grad_norm": 0.33269668861416485, + "learning_rate": 3.4647077091297276e-05, + "loss": 2.5507, + "step": 41200 + 
}, + { + "epoch": 1.918220546127523, + "grad_norm": 0.33309400841539333, + "learning_rate": 3.4644499234563685e-05, + "loss": 2.7716, + "step": 41201 + }, + { + "epoch": 1.918267104313616, + "grad_norm": 0.3711216638663209, + "learning_rate": 3.464192142289697e-05, + "loss": 2.6012, + "step": 41202 + }, + { + "epoch": 1.918313662499709, + "grad_norm": 0.3341998188852846, + "learning_rate": 3.46393436563047e-05, + "loss": 2.6495, + "step": 41203 + }, + { + "epoch": 1.918360220685802, + "grad_norm": 0.3321694899167139, + "learning_rate": 3.463676593479446e-05, + "loss": 2.5534, + "step": 41204 + }, + { + "epoch": 1.918406778871895, + "grad_norm": 0.35970037195819154, + "learning_rate": 3.46341882583738e-05, + "loss": 2.6298, + "step": 41205 + }, + { + "epoch": 1.9184533370579882, + "grad_norm": 0.35607065280277345, + "learning_rate": 3.463161062705026e-05, + "loss": 2.6527, + "step": 41206 + }, + { + "epoch": 1.9184998952440813, + "grad_norm": 0.3466311827749832, + "learning_rate": 3.462903304083145e-05, + "loss": 2.6272, + "step": 41207 + }, + { + "epoch": 1.9185464534301744, + "grad_norm": 0.33364929055602355, + "learning_rate": 3.462645549972489e-05, + "loss": 2.6881, + "step": 41208 + }, + { + "epoch": 1.9185930116162675, + "grad_norm": 0.3482550517555594, + "learning_rate": 3.462387800373819e-05, + "loss": 2.6748, + "step": 41209 + }, + { + "epoch": 1.9186395698023606, + "grad_norm": 0.3361053709280776, + "learning_rate": 3.4621300552878896e-05, + "loss": 2.6828, + "step": 41210 + }, + { + "epoch": 1.9186861279884537, + "grad_norm": 0.35121231077834253, + "learning_rate": 3.461872314715455e-05, + "loss": 2.6846, + "step": 41211 + }, + { + "epoch": 1.9187326861745466, + "grad_norm": 0.3653445186449221, + "learning_rate": 3.461614578657275e-05, + "loss": 2.7362, + "step": 41212 + }, + { + "epoch": 1.9187792443606397, + "grad_norm": 0.3370658982404433, + "learning_rate": 3.461356847114105e-05, + "loss": 2.6128, + "step": 41213 + }, + { + "epoch": 1.9188258025467326, + "grad_norm": 0.351639599951002, + "learning_rate": 3.4610991200867006e-05, + "loss": 2.6611, + "step": 41214 + }, + { + "epoch": 1.9188723607328257, + "grad_norm": 0.31897072280459254, + "learning_rate": 3.460841397575818e-05, + "loss": 2.6828, + "step": 41215 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 0.3589958456539062, + "learning_rate": 3.460583679582215e-05, + "loss": 2.6913, + "step": 41216 + }, + { + "epoch": 1.918965477105012, + "grad_norm": 0.32482089935526104, + "learning_rate": 3.460325966106647e-05, + "loss": 2.696, + "step": 41217 + }, + { + "epoch": 1.919012035291105, + "grad_norm": 0.35498844364052967, + "learning_rate": 3.460068257149871e-05, + "loss": 2.6712, + "step": 41218 + }, + { + "epoch": 1.9190585934771982, + "grad_norm": 0.34010224488021534, + "learning_rate": 3.459810552712642e-05, + "loss": 2.6498, + "step": 41219 + }, + { + "epoch": 1.9191051516632913, + "grad_norm": 0.329554908095054, + "learning_rate": 3.459552852795719e-05, + "loss": 2.6541, + "step": 41220 + }, + { + "epoch": 1.9191517098493842, + "grad_norm": 0.34890817198870855, + "learning_rate": 3.4592951573998536e-05, + "loss": 2.6315, + "step": 41221 + }, + { + "epoch": 1.9191982680354773, + "grad_norm": 0.34194830733703313, + "learning_rate": 3.459037466525808e-05, + "loss": 2.6958, + "step": 41222 + }, + { + "epoch": 1.9192448262215704, + "grad_norm": 0.33855316676497066, + "learning_rate": 3.458779780174335e-05, + "loss": 2.6958, + "step": 41223 + }, + { + "epoch": 1.9192913844076633, + "grad_norm": 0.3315139762215775, + 
"learning_rate": 3.45852209834619e-05, + "loss": 2.5657, + "step": 41224 + }, + { + "epoch": 1.9193379425937565, + "grad_norm": 0.3292135179023393, + "learning_rate": 3.458264421042133e-05, + "loss": 2.6732, + "step": 41225 + }, + { + "epoch": 1.9193845007798496, + "grad_norm": 0.3536058470034319, + "learning_rate": 3.458006748262916e-05, + "loss": 2.6873, + "step": 41226 + }, + { + "epoch": 1.9194310589659427, + "grad_norm": 0.33934176329855503, + "learning_rate": 3.4577490800092995e-05, + "loss": 2.7207, + "step": 41227 + }, + { + "epoch": 1.9194776171520358, + "grad_norm": 0.33700679958331425, + "learning_rate": 3.457491416282036e-05, + "loss": 2.6233, + "step": 41228 + }, + { + "epoch": 1.919524175338129, + "grad_norm": 0.31480745784726394, + "learning_rate": 3.457233757081883e-05, + "loss": 2.5431, + "step": 41229 + }, + { + "epoch": 1.919570733524222, + "grad_norm": 0.33755991856781764, + "learning_rate": 3.456976102409598e-05, + "loss": 2.6754, + "step": 41230 + }, + { + "epoch": 1.919617291710315, + "grad_norm": 0.3406372717272767, + "learning_rate": 3.456718452265937e-05, + "loss": 2.7112, + "step": 41231 + }, + { + "epoch": 1.919663849896408, + "grad_norm": 0.3203952664049278, + "learning_rate": 3.456460806651652e-05, + "loss": 2.6767, + "step": 41232 + }, + { + "epoch": 1.9197104080825012, + "grad_norm": 0.34621389836622574, + "learning_rate": 3.4562031655675055e-05, + "loss": 2.7237, + "step": 41233 + }, + { + "epoch": 1.919756966268594, + "grad_norm": 0.35729124913617305, + "learning_rate": 3.455945529014248e-05, + "loss": 2.7146, + "step": 41234 + }, + { + "epoch": 1.9198035244546872, + "grad_norm": 0.33891769600647687, + "learning_rate": 3.4556878969926404e-05, + "loss": 2.6974, + "step": 41235 + }, + { + "epoch": 1.9198500826407803, + "grad_norm": 0.32346382759732356, + "learning_rate": 3.455430269503437e-05, + "loss": 2.6979, + "step": 41236 + }, + { + "epoch": 1.9198966408268734, + "grad_norm": 0.34177289021305945, + "learning_rate": 3.455172646547391e-05, + "loss": 2.518, + "step": 41237 + }, + { + "epoch": 1.9199431990129665, + "grad_norm": 0.32806010274310077, + "learning_rate": 3.4549150281252636e-05, + "loss": 2.7296, + "step": 41238 + }, + { + "epoch": 1.9199897571990596, + "grad_norm": 0.3205432768154242, + "learning_rate": 3.454657414237807e-05, + "loss": 2.7217, + "step": 41239 + }, + { + "epoch": 1.9200363153851527, + "grad_norm": 0.32160315394210115, + "learning_rate": 3.45439980488578e-05, + "loss": 2.6747, + "step": 41240 + }, + { + "epoch": 1.9200828735712456, + "grad_norm": 0.33032589586080235, + "learning_rate": 3.454142200069936e-05, + "loss": 2.7748, + "step": 41241 + }, + { + "epoch": 1.9201294317573387, + "grad_norm": 0.36030957875447145, + "learning_rate": 3.4538845997910324e-05, + "loss": 2.8054, + "step": 41242 + }, + { + "epoch": 1.9201759899434316, + "grad_norm": 0.3536016785889557, + "learning_rate": 3.4536270040498266e-05, + "loss": 2.6687, + "step": 41243 + }, + { + "epoch": 1.9202225481295248, + "grad_norm": 0.31962166460543656, + "learning_rate": 3.453369412847071e-05, + "loss": 2.6551, + "step": 41244 + }, + { + "epoch": 1.9202691063156179, + "grad_norm": 0.3273087821674943, + "learning_rate": 3.4531118261835246e-05, + "loss": 2.6865, + "step": 41245 + }, + { + "epoch": 1.920315664501711, + "grad_norm": 0.3594844749061053, + "learning_rate": 3.452854244059944e-05, + "loss": 2.707, + "step": 41246 + }, + { + "epoch": 1.920362222687804, + "grad_norm": 0.3305308912631807, + "learning_rate": 3.452596666477081e-05, + "loss": 2.6842, + "step": 
41247 + }, + { + "epoch": 1.9204087808738972, + "grad_norm": 0.36858000859594886, + "learning_rate": 3.452339093435698e-05, + "loss": 2.5893, + "step": 41248 + }, + { + "epoch": 1.9204553390599903, + "grad_norm": 0.33742759438040576, + "learning_rate": 3.4520815249365454e-05, + "loss": 2.5812, + "step": 41249 + }, + { + "epoch": 1.9205018972460834, + "grad_norm": 0.3303901723627315, + "learning_rate": 3.45182396098038e-05, + "loss": 2.7583, + "step": 41250 + }, + { + "epoch": 1.9205484554321763, + "grad_norm": 0.36013810354395814, + "learning_rate": 3.4515664015679603e-05, + "loss": 2.5723, + "step": 41251 + }, + { + "epoch": 1.9205950136182695, + "grad_norm": 0.33254737102404897, + "learning_rate": 3.45130884670004e-05, + "loss": 2.6583, + "step": 41252 + }, + { + "epoch": 1.9206415718043623, + "grad_norm": 0.33798607474107517, + "learning_rate": 3.4510512963773766e-05, + "loss": 2.6373, + "step": 41253 + }, + { + "epoch": 1.9206881299904555, + "grad_norm": 0.3346960887905403, + "learning_rate": 3.450793750600724e-05, + "loss": 2.6352, + "step": 41254 + }, + { + "epoch": 1.9207346881765486, + "grad_norm": 0.32754082083724834, + "learning_rate": 3.4505362093708386e-05, + "loss": 2.5791, + "step": 41255 + }, + { + "epoch": 1.9207812463626417, + "grad_norm": 0.34267109019791325, + "learning_rate": 3.4502786726884786e-05, + "loss": 2.7082, + "step": 41256 + }, + { + "epoch": 1.9208278045487348, + "grad_norm": 0.3473670552075932, + "learning_rate": 3.4500211405543955e-05, + "loss": 2.7988, + "step": 41257 + }, + { + "epoch": 1.920874362734828, + "grad_norm": 0.3404371731852857, + "learning_rate": 3.44976361296935e-05, + "loss": 2.6044, + "step": 41258 + }, + { + "epoch": 1.920920920920921, + "grad_norm": 0.3476257006709923, + "learning_rate": 3.449506089934095e-05, + "loss": 2.7197, + "step": 41259 + }, + { + "epoch": 1.9209674791070142, + "grad_norm": 0.36597244684873803, + "learning_rate": 3.449248571449385e-05, + "loss": 2.6407, + "step": 41260 + }, + { + "epoch": 1.921014037293107, + "grad_norm": 0.3475430423972443, + "learning_rate": 3.4489910575159804e-05, + "loss": 2.6781, + "step": 41261 + }, + { + "epoch": 1.9210605954792002, + "grad_norm": 0.34430494601183237, + "learning_rate": 3.448733548134632e-05, + "loss": 2.7052, + "step": 41262 + }, + { + "epoch": 1.921107153665293, + "grad_norm": 0.32228016779446533, + "learning_rate": 3.448476043306099e-05, + "loss": 2.7519, + "step": 41263 + }, + { + "epoch": 1.9211537118513862, + "grad_norm": 0.3597484088834701, + "learning_rate": 3.4482185430311364e-05, + "loss": 2.7222, + "step": 41264 + }, + { + "epoch": 1.9212002700374793, + "grad_norm": 0.35119590062859474, + "learning_rate": 3.447961047310499e-05, + "loss": 2.5809, + "step": 41265 + }, + { + "epoch": 1.9212468282235724, + "grad_norm": 0.33526394908302276, + "learning_rate": 3.447703556144943e-05, + "loss": 2.8075, + "step": 41266 + }, + { + "epoch": 1.9212933864096655, + "grad_norm": 0.3513215793580907, + "learning_rate": 3.4474460695352244e-05, + "loss": 2.7637, + "step": 41267 + }, + { + "epoch": 1.9213399445957586, + "grad_norm": 0.34319452063994277, + "learning_rate": 3.447188587482098e-05, + "loss": 2.6222, + "step": 41268 + }, + { + "epoch": 1.9213865027818517, + "grad_norm": 0.33403421290190144, + "learning_rate": 3.446931109986321e-05, + "loss": 2.5382, + "step": 41269 + }, + { + "epoch": 1.9214330609679446, + "grad_norm": 0.3608740329690148, + "learning_rate": 3.446673637048646e-05, + "loss": 2.4685, + "step": 41270 + }, + { + "epoch": 1.9214796191540378, + "grad_norm": 
0.3196756503165924, + "learning_rate": 3.446416168669834e-05, + "loss": 2.7185, + "step": 41271 + }, + { + "epoch": 1.9215261773401309, + "grad_norm": 0.3441076664317337, + "learning_rate": 3.446158704850636e-05, + "loss": 2.7403, + "step": 41272 + }, + { + "epoch": 1.9215727355262238, + "grad_norm": 0.34878241512872127, + "learning_rate": 3.445901245591808e-05, + "loss": 2.7442, + "step": 41273 + }, + { + "epoch": 1.9216192937123169, + "grad_norm": 0.3390705831978666, + "learning_rate": 3.445643790894109e-05, + "loss": 2.6211, + "step": 41274 + }, + { + "epoch": 1.92166585189841, + "grad_norm": 0.36006213198339154, + "learning_rate": 3.44538634075829e-05, + "loss": 2.7667, + "step": 41275 + }, + { + "epoch": 1.921712410084503, + "grad_norm": 0.34916932996528055, + "learning_rate": 3.44512889518511e-05, + "loss": 2.7337, + "step": 41276 + }, + { + "epoch": 1.9217589682705962, + "grad_norm": 0.3729178923221883, + "learning_rate": 3.4448714541753246e-05, + "loss": 2.7057, + "step": 41277 + }, + { + "epoch": 1.9218055264566893, + "grad_norm": 0.3484564680828309, + "learning_rate": 3.444614017729687e-05, + "loss": 2.6658, + "step": 41278 + }, + { + "epoch": 1.9218520846427825, + "grad_norm": 0.3611091459560116, + "learning_rate": 3.444356585848956e-05, + "loss": 2.7158, + "step": 41279 + }, + { + "epoch": 1.9218986428288753, + "grad_norm": 0.3791360830361255, + "learning_rate": 3.444099158533881e-05, + "loss": 2.7163, + "step": 41280 + }, + { + "epoch": 1.9219452010149685, + "grad_norm": 0.3424504989257716, + "learning_rate": 3.4438417357852256e-05, + "loss": 2.6387, + "step": 41281 + }, + { + "epoch": 1.9219917592010616, + "grad_norm": 0.3655471333309597, + "learning_rate": 3.4435843176037405e-05, + "loss": 2.643, + "step": 41282 + }, + { + "epoch": 1.9220383173871545, + "grad_norm": 0.3695686125735732, + "learning_rate": 3.443326903990181e-05, + "loss": 2.7686, + "step": 41283 + }, + { + "epoch": 1.9220848755732476, + "grad_norm": 0.3339165890995723, + "learning_rate": 3.4430694949453046e-05, + "loss": 2.6741, + "step": 41284 + }, + { + "epoch": 1.9221314337593407, + "grad_norm": 0.3687233332707929, + "learning_rate": 3.442812090469866e-05, + "loss": 2.7137, + "step": 41285 + }, + { + "epoch": 1.9221779919454338, + "grad_norm": 0.3569218582467346, + "learning_rate": 3.442554690564619e-05, + "loss": 2.6332, + "step": 41286 + }, + { + "epoch": 1.922224550131527, + "grad_norm": 0.33663623314454477, + "learning_rate": 3.442297295230322e-05, + "loss": 2.7046, + "step": 41287 + }, + { + "epoch": 1.92227110831762, + "grad_norm": 0.35277398192586906, + "learning_rate": 3.442039904467727e-05, + "loss": 2.586, + "step": 41288 + }, + { + "epoch": 1.9223176665037132, + "grad_norm": 0.3244609028924637, + "learning_rate": 3.441782518277594e-05, + "loss": 2.6446, + "step": 41289 + }, + { + "epoch": 1.922364224689806, + "grad_norm": 0.3489859718024076, + "learning_rate": 3.441525136660675e-05, + "loss": 2.678, + "step": 41290 + }, + { + "epoch": 1.9224107828758992, + "grad_norm": 0.32746556996995135, + "learning_rate": 3.4412677596177256e-05, + "loss": 2.6438, + "step": 41291 + }, + { + "epoch": 1.922457341061992, + "grad_norm": 0.3353058992819048, + "learning_rate": 3.441010387149503e-05, + "loss": 2.6821, + "step": 41292 + }, + { + "epoch": 1.9225038992480852, + "grad_norm": 0.3551323580966271, + "learning_rate": 3.4407530192567595e-05, + "loss": 2.7293, + "step": 41293 + }, + { + "epoch": 1.9225504574341783, + "grad_norm": 0.33230808732740796, + "learning_rate": 3.4404956559402534e-05, + "loss": 2.6962, 
+ "step": 41294 + }, + { + "epoch": 1.9225970156202714, + "grad_norm": 0.35214621293038606, + "learning_rate": 3.44023829720074e-05, + "loss": 2.6204, + "step": 41295 + }, + { + "epoch": 1.9226435738063645, + "grad_norm": 0.336488841931516, + "learning_rate": 3.4399809430389706e-05, + "loss": 2.5515, + "step": 41296 + }, + { + "epoch": 1.9226901319924576, + "grad_norm": 0.3507686326702963, + "learning_rate": 3.439723593455706e-05, + "loss": 2.7249, + "step": 41297 + }, + { + "epoch": 1.9227366901785508, + "grad_norm": 0.35500180082377436, + "learning_rate": 3.439466248451696e-05, + "loss": 2.5974, + "step": 41298 + }, + { + "epoch": 1.9227832483646439, + "grad_norm": 0.3527920123013262, + "learning_rate": 3.439208908027702e-05, + "loss": 2.7098, + "step": 41299 + }, + { + "epoch": 1.9228298065507368, + "grad_norm": 0.3785506212315415, + "learning_rate": 3.438951572184476e-05, + "loss": 2.6019, + "step": 41300 + }, + { + "epoch": 1.9228763647368299, + "grad_norm": 0.34184423377085743, + "learning_rate": 3.438694240922771e-05, + "loss": 2.71, + "step": 41301 + }, + { + "epoch": 1.9229229229229228, + "grad_norm": 0.36639632764540847, + "learning_rate": 3.438436914243346e-05, + "loss": 2.8227, + "step": 41302 + }, + { + "epoch": 1.9229694811090159, + "grad_norm": 0.365898403141356, + "learning_rate": 3.4381795921469555e-05, + "loss": 2.704, + "step": 41303 + }, + { + "epoch": 1.923016039295109, + "grad_norm": 0.3406658889118356, + "learning_rate": 3.437922274634353e-05, + "loss": 2.7318, + "step": 41304 + }, + { + "epoch": 1.9230625974812021, + "grad_norm": 0.3224224865399092, + "learning_rate": 3.437664961706296e-05, + "loss": 2.7087, + "step": 41305 + }, + { + "epoch": 1.9231091556672952, + "grad_norm": 0.35023376062840567, + "learning_rate": 3.437407653363536e-05, + "loss": 2.6327, + "step": 41306 + }, + { + "epoch": 1.9231557138533883, + "grad_norm": 0.3506665559448654, + "learning_rate": 3.4371503496068325e-05, + "loss": 2.6585, + "step": 41307 + }, + { + "epoch": 1.9232022720394815, + "grad_norm": 0.3401644040887757, + "learning_rate": 3.436893050436939e-05, + "loss": 2.7055, + "step": 41308 + }, + { + "epoch": 1.9232488302255744, + "grad_norm": 0.3454082548657268, + "learning_rate": 3.436635755854608e-05, + "loss": 2.6548, + "step": 41309 + }, + { + "epoch": 1.9232953884116675, + "grad_norm": 0.3298812648372774, + "learning_rate": 3.4363784658605997e-05, + "loss": 2.7588, + "step": 41310 + }, + { + "epoch": 1.9233419465977606, + "grad_norm": 0.34484286275690806, + "learning_rate": 3.436121180455664e-05, + "loss": 2.712, + "step": 41311 + }, + { + "epoch": 1.9233885047838535, + "grad_norm": 0.34204021245856103, + "learning_rate": 3.435863899640561e-05, + "loss": 2.6059, + "step": 41312 + }, + { + "epoch": 1.9234350629699466, + "grad_norm": 0.3249005109996846, + "learning_rate": 3.435606623416043e-05, + "loss": 2.6506, + "step": 41313 + }, + { + "epoch": 1.9234816211560397, + "grad_norm": 0.3504231580721906, + "learning_rate": 3.435349351782864e-05, + "loss": 2.7367, + "step": 41314 + }, + { + "epoch": 1.9235281793421328, + "grad_norm": 0.32808494182246173, + "learning_rate": 3.4350920847417824e-05, + "loss": 2.5922, + "step": 41315 + }, + { + "epoch": 1.923574737528226, + "grad_norm": 0.3390234436414399, + "learning_rate": 3.43483482229355e-05, + "loss": 2.6219, + "step": 41316 + }, + { + "epoch": 1.923621295714319, + "grad_norm": 0.36614376658541775, + "learning_rate": 3.4345775644389236e-05, + "loss": 2.7738, + "step": 41317 + }, + { + "epoch": 1.9236678539004122, + "grad_norm": 
0.3155381845749628, + "learning_rate": 3.434320311178659e-05, + "loss": 2.7152, + "step": 41318 + }, + { + "epoch": 1.923714412086505, + "grad_norm": 0.35294298434935406, + "learning_rate": 3.434063062513507e-05, + "loss": 2.7294, + "step": 41319 + }, + { + "epoch": 1.9237609702725982, + "grad_norm": 0.33018087407637153, + "learning_rate": 3.433805818444229e-05, + "loss": 2.7356, + "step": 41320 + }, + { + "epoch": 1.9238075284586913, + "grad_norm": 0.3185395044096706, + "learning_rate": 3.4335485789715755e-05, + "loss": 2.639, + "step": 41321 + }, + { + "epoch": 1.9238540866447842, + "grad_norm": 0.32533728900405773, + "learning_rate": 3.433291344096301e-05, + "loss": 2.634, + "step": 41322 + }, + { + "epoch": 1.9239006448308773, + "grad_norm": 0.3386081073959315, + "learning_rate": 3.4330341138191634e-05, + "loss": 2.6379, + "step": 41323 + }, + { + "epoch": 1.9239472030169704, + "grad_norm": 0.3417140239983017, + "learning_rate": 3.432776888140915e-05, + "loss": 2.6861, + "step": 41324 + }, + { + "epoch": 1.9239937612030635, + "grad_norm": 0.3392925691925834, + "learning_rate": 3.432519667062315e-05, + "loss": 2.5986, + "step": 41325 + }, + { + "epoch": 1.9240403193891567, + "grad_norm": 0.34594205826208013, + "learning_rate": 3.432262450584114e-05, + "loss": 2.7065, + "step": 41326 + }, + { + "epoch": 1.9240868775752498, + "grad_norm": 0.34929182006462317, + "learning_rate": 3.4320052387070664e-05, + "loss": 2.7264, + "step": 41327 + }, + { + "epoch": 1.9241334357613429, + "grad_norm": 0.3540864292252756, + "learning_rate": 3.431748031431931e-05, + "loss": 2.6584, + "step": 41328 + }, + { + "epoch": 1.9241799939474358, + "grad_norm": 0.3298847220380254, + "learning_rate": 3.4314908287594595e-05, + "loss": 2.6078, + "step": 41329 + }, + { + "epoch": 1.924226552133529, + "grad_norm": 0.3509765976122606, + "learning_rate": 3.431233630690409e-05, + "loss": 2.7359, + "step": 41330 + }, + { + "epoch": 1.9242731103196218, + "grad_norm": 0.33274846408329484, + "learning_rate": 3.430976437225534e-05, + "loss": 2.6443, + "step": 41331 + }, + { + "epoch": 1.924319668505715, + "grad_norm": 0.35134672710054426, + "learning_rate": 3.4307192483655854e-05, + "loss": 2.5939, + "step": 41332 + }, + { + "epoch": 1.924366226691808, + "grad_norm": 0.32936973008728676, + "learning_rate": 3.430462064111324e-05, + "loss": 2.6737, + "step": 41333 + }, + { + "epoch": 1.9244127848779011, + "grad_norm": 0.33330053936895765, + "learning_rate": 3.430204884463499e-05, + "loss": 2.7268, + "step": 41334 + }, + { + "epoch": 1.9244593430639942, + "grad_norm": 0.3445186654278969, + "learning_rate": 3.429947709422871e-05, + "loss": 2.6727, + "step": 41335 + }, + { + "epoch": 1.9245059012500874, + "grad_norm": 0.33407619184811543, + "learning_rate": 3.429690538990191e-05, + "loss": 2.6779, + "step": 41336 + }, + { + "epoch": 1.9245524594361805, + "grad_norm": 0.3390178553261229, + "learning_rate": 3.429433373166213e-05, + "loss": 2.6671, + "step": 41337 + }, + { + "epoch": 1.9245990176222736, + "grad_norm": 0.3448495874210543, + "learning_rate": 3.429176211951696e-05, + "loss": 2.7418, + "step": 41338 + }, + { + "epoch": 1.9246455758083665, + "grad_norm": 0.3725685720090914, + "learning_rate": 3.428919055347391e-05, + "loss": 2.7112, + "step": 41339 + }, + { + "epoch": 1.9246921339944596, + "grad_norm": 0.3275048053751785, + "learning_rate": 3.428661903354052e-05, + "loss": 2.6796, + "step": 41340 + }, + { + "epoch": 1.9247386921805525, + "grad_norm": 0.3208595585121837, + "learning_rate": 3.4284047559724373e-05, + "loss": 
2.5823, + "step": 41341 + }, + { + "epoch": 1.9247852503666456, + "grad_norm": 0.35623475759452755, + "learning_rate": 3.4281476132033e-05, + "loss": 2.5878, + "step": 41342 + }, + { + "epoch": 1.9248318085527387, + "grad_norm": 0.35944715570698504, + "learning_rate": 3.427890475047394e-05, + "loss": 2.5952, + "step": 41343 + }, + { + "epoch": 1.9248783667388318, + "grad_norm": 0.33120262146919194, + "learning_rate": 3.4276333415054756e-05, + "loss": 2.6568, + "step": 41344 + }, + { + "epoch": 1.924924924924925, + "grad_norm": 0.3683399517585205, + "learning_rate": 3.427376212578296e-05, + "loss": 2.7132, + "step": 41345 + }, + { + "epoch": 1.924971483111018, + "grad_norm": 0.350040464575029, + "learning_rate": 3.427119088266615e-05, + "loss": 2.733, + "step": 41346 + }, + { + "epoch": 1.9250180412971112, + "grad_norm": 0.34306470956675, + "learning_rate": 3.4268619685711814e-05, + "loss": 2.6748, + "step": 41347 + }, + { + "epoch": 1.9250645994832043, + "grad_norm": 0.3477324199257721, + "learning_rate": 3.4266048534927556e-05, + "loss": 2.675, + "step": 41348 + }, + { + "epoch": 1.9251111576692972, + "grad_norm": 0.3215762933811011, + "learning_rate": 3.42634774303209e-05, + "loss": 2.6715, + "step": 41349 + }, + { + "epoch": 1.9251577158553903, + "grad_norm": 0.36313871355966176, + "learning_rate": 3.426090637189936e-05, + "loss": 2.6718, + "step": 41350 + }, + { + "epoch": 1.9252042740414832, + "grad_norm": 0.33758894238350345, + "learning_rate": 3.425833535967053e-05, + "loss": 2.6522, + "step": 41351 + }, + { + "epoch": 1.9252508322275763, + "grad_norm": 0.332823609963314, + "learning_rate": 3.4255764393641926e-05, + "loss": 2.6697, + "step": 41352 + }, + { + "epoch": 1.9252973904136694, + "grad_norm": 0.32281492262451095, + "learning_rate": 3.4253193473821096e-05, + "loss": 2.5405, + "step": 41353 + }, + { + "epoch": 1.9253439485997625, + "grad_norm": 0.32527841686154807, + "learning_rate": 3.425062260021562e-05, + "loss": 2.7248, + "step": 41354 + }, + { + "epoch": 1.9253905067858557, + "grad_norm": 0.33071778291323756, + "learning_rate": 3.424805177283299e-05, + "loss": 2.6799, + "step": 41355 + }, + { + "epoch": 1.9254370649719488, + "grad_norm": 0.3222816365730862, + "learning_rate": 3.424548099168079e-05, + "loss": 2.5666, + "step": 41356 + }, + { + "epoch": 1.925483623158042, + "grad_norm": 0.33767348081208987, + "learning_rate": 3.424291025676655e-05, + "loss": 2.6154, + "step": 41357 + }, + { + "epoch": 1.9255301813441348, + "grad_norm": 0.3330741511062825, + "learning_rate": 3.42403395680978e-05, + "loss": 2.6996, + "step": 41358 + }, + { + "epoch": 1.925576739530228, + "grad_norm": 0.3386690693297191, + "learning_rate": 3.423776892568211e-05, + "loss": 2.6858, + "step": 41359 + }, + { + "epoch": 1.925623297716321, + "grad_norm": 0.33959402678784667, + "learning_rate": 3.423519832952701e-05, + "loss": 2.6281, + "step": 41360 + }, + { + "epoch": 1.925669855902414, + "grad_norm": 0.3360214990050407, + "learning_rate": 3.423262777964006e-05, + "loss": 2.6415, + "step": 41361 + }, + { + "epoch": 1.925716414088507, + "grad_norm": 0.32908561421619553, + "learning_rate": 3.4230057276028805e-05, + "loss": 2.7627, + "step": 41362 + }, + { + "epoch": 1.9257629722746001, + "grad_norm": 0.34776422114828637, + "learning_rate": 3.4227486818700744e-05, + "loss": 2.6526, + "step": 41363 + }, + { + "epoch": 1.9258095304606933, + "grad_norm": 0.323838897087549, + "learning_rate": 3.422491640766348e-05, + "loss": 2.641, + "step": 41364 + }, + { + "epoch": 1.9258560886467864, + "grad_norm": 
0.3515777116708571, + "learning_rate": 3.4222346042924525e-05, + "loss": 2.5777, + "step": 41365 + }, + { + "epoch": 1.9259026468328795, + "grad_norm": 0.35576336025286637, + "learning_rate": 3.421977572449143e-05, + "loss": 2.6367, + "step": 41366 + }, + { + "epoch": 1.9259492050189726, + "grad_norm": 0.3402390665896022, + "learning_rate": 3.421720545237175e-05, + "loss": 2.7993, + "step": 41367 + }, + { + "epoch": 1.9259957632050655, + "grad_norm": 0.3734245542911887, + "learning_rate": 3.4214635226573004e-05, + "loss": 2.6863, + "step": 41368 + }, + { + "epoch": 1.9260423213911586, + "grad_norm": 0.33489487136359364, + "learning_rate": 3.421206504710276e-05, + "loss": 2.7109, + "step": 41369 + }, + { + "epoch": 1.9260888795772517, + "grad_norm": 0.35528222367247664, + "learning_rate": 3.420949491396853e-05, + "loss": 2.7301, + "step": 41370 + }, + { + "epoch": 1.9261354377633446, + "grad_norm": 0.3067577483539231, + "learning_rate": 3.42069248271779e-05, + "loss": 2.7163, + "step": 41371 + }, + { + "epoch": 1.9261819959494377, + "grad_norm": 0.33894441000071146, + "learning_rate": 3.420435478673838e-05, + "loss": 2.6577, + "step": 41372 + }, + { + "epoch": 1.9262285541355308, + "grad_norm": 0.34768870827230197, + "learning_rate": 3.420178479265751e-05, + "loss": 2.6639, + "step": 41373 + }, + { + "epoch": 1.926275112321624, + "grad_norm": 0.3242680480042704, + "learning_rate": 3.4199214844942864e-05, + "loss": 2.6583, + "step": 41374 + }, + { + "epoch": 1.926321670507717, + "grad_norm": 0.35566514019196377, + "learning_rate": 3.419664494360194e-05, + "loss": 2.6967, + "step": 41375 + }, + { + "epoch": 1.9263682286938102, + "grad_norm": 0.33147215007107733, + "learning_rate": 3.4194075088642333e-05, + "loss": 2.7583, + "step": 41376 + }, + { + "epoch": 1.9264147868799033, + "grad_norm": 0.3371539668755688, + "learning_rate": 3.419150528007156e-05, + "loss": 2.7359, + "step": 41377 + }, + { + "epoch": 1.9264613450659962, + "grad_norm": 0.3241496755739574, + "learning_rate": 3.418893551789715e-05, + "loss": 2.7259, + "step": 41378 + }, + { + "epoch": 1.9265079032520893, + "grad_norm": 0.34437021198194345, + "learning_rate": 3.4186365802126655e-05, + "loss": 2.634, + "step": 41379 + }, + { + "epoch": 1.9265544614381822, + "grad_norm": 0.351860482104841, + "learning_rate": 3.418379613276763e-05, + "loss": 2.7617, + "step": 41380 + }, + { + "epoch": 1.9266010196242753, + "grad_norm": 0.3359565668838262, + "learning_rate": 3.418122650982759e-05, + "loss": 2.7474, + "step": 41381 + }, + { + "epoch": 1.9266475778103684, + "grad_norm": 0.35151824125725323, + "learning_rate": 3.417865693331411e-05, + "loss": 2.7016, + "step": 41382 + }, + { + "epoch": 1.9266941359964616, + "grad_norm": 0.32675628292756176, + "learning_rate": 3.417608740323469e-05, + "loss": 2.7239, + "step": 41383 + }, + { + "epoch": 1.9267406941825547, + "grad_norm": 0.33732969361570553, + "learning_rate": 3.417351791959691e-05, + "loss": 2.6762, + "step": 41384 + }, + { + "epoch": 1.9267872523686478, + "grad_norm": 0.3351968367567898, + "learning_rate": 3.41709484824083e-05, + "loss": 2.6563, + "step": 41385 + }, + { + "epoch": 1.926833810554741, + "grad_norm": 0.3466240402647762, + "learning_rate": 3.4168379091676374e-05, + "loss": 2.641, + "step": 41386 + }, + { + "epoch": 1.926880368740834, + "grad_norm": 0.3236093896191121, + "learning_rate": 3.4165809747408715e-05, + "loss": 2.6647, + "step": 41387 + }, + { + "epoch": 1.926926926926927, + "grad_norm": 0.3405777319161208, + "learning_rate": 3.4163240449612835e-05, + "loss": 
2.7405, + "step": 41388 + }, + { + "epoch": 1.92697348511302, + "grad_norm": 0.32295121821803263, + "learning_rate": 3.416067119829629e-05, + "loss": 2.6062, + "step": 41389 + }, + { + "epoch": 1.927020043299113, + "grad_norm": 0.32691119343356534, + "learning_rate": 3.415810199346662e-05, + "loss": 2.7407, + "step": 41390 + }, + { + "epoch": 1.927066601485206, + "grad_norm": 0.4125747148201218, + "learning_rate": 3.415553283513135e-05, + "loss": 2.715, + "step": 41391 + }, + { + "epoch": 1.9271131596712991, + "grad_norm": 0.34412978451806453, + "learning_rate": 3.415296372329804e-05, + "loss": 2.6895, + "step": 41392 + }, + { + "epoch": 1.9271597178573923, + "grad_norm": 0.36379252606604373, + "learning_rate": 3.4150394657974205e-05, + "loss": 2.6642, + "step": 41393 + }, + { + "epoch": 1.9272062760434854, + "grad_norm": 0.33185970631707995, + "learning_rate": 3.414782563916742e-05, + "loss": 2.6562, + "step": 41394 + }, + { + "epoch": 1.9272528342295785, + "grad_norm": 0.33217723936169063, + "learning_rate": 3.4145256666885193e-05, + "loss": 2.6612, + "step": 41395 + }, + { + "epoch": 1.9272993924156716, + "grad_norm": 0.32239789498569654, + "learning_rate": 3.414268774113507e-05, + "loss": 2.7669, + "step": 41396 + }, + { + "epoch": 1.9273459506017645, + "grad_norm": 0.34314435374726093, + "learning_rate": 3.414011886192461e-05, + "loss": 2.6603, + "step": 41397 + }, + { + "epoch": 1.9273925087878576, + "grad_norm": 0.3299021580945497, + "learning_rate": 3.413755002926134e-05, + "loss": 2.6217, + "step": 41398 + }, + { + "epoch": 1.9274390669739507, + "grad_norm": 0.3367680060742178, + "learning_rate": 3.413498124315277e-05, + "loss": 2.5517, + "step": 41399 + }, + { + "epoch": 1.9274856251600436, + "grad_norm": 0.3398577778924224, + "learning_rate": 3.4132412503606507e-05, + "loss": 2.7698, + "step": 41400 + }, + { + "epoch": 1.9275321833461367, + "grad_norm": 0.3273085170262142, + "learning_rate": 3.4129843810630016e-05, + "loss": 2.5407, + "step": 41401 + }, + { + "epoch": 1.9275787415322299, + "grad_norm": 0.3311431639679697, + "learning_rate": 3.4127275164230885e-05, + "loss": 2.6888, + "step": 41402 + }, + { + "epoch": 1.927625299718323, + "grad_norm": 0.3162848688828676, + "learning_rate": 3.412470656441664e-05, + "loss": 2.5926, + "step": 41403 + }, + { + "epoch": 1.927671857904416, + "grad_norm": 0.3680176115392433, + "learning_rate": 3.4122138011194815e-05, + "loss": 2.7161, + "step": 41404 + }, + { + "epoch": 1.9277184160905092, + "grad_norm": 0.3374607997918929, + "learning_rate": 3.4119569504572946e-05, + "loss": 2.66, + "step": 41405 + }, + { + "epoch": 1.9277649742766023, + "grad_norm": 0.3422109539743839, + "learning_rate": 3.4117001044558576e-05, + "loss": 2.7346, + "step": 41406 + }, + { + "epoch": 1.9278115324626952, + "grad_norm": 0.3338128917125678, + "learning_rate": 3.4114432631159256e-05, + "loss": 2.7034, + "step": 41407 + }, + { + "epoch": 1.9278580906487883, + "grad_norm": 0.33214713633476756, + "learning_rate": 3.4111864264382506e-05, + "loss": 2.6404, + "step": 41408 + }, + { + "epoch": 1.9279046488348814, + "grad_norm": 0.3280625135465927, + "learning_rate": 3.4109295944235855e-05, + "loss": 2.7163, + "step": 41409 + }, + { + "epoch": 1.9279512070209743, + "grad_norm": 0.3217804729676312, + "learning_rate": 3.410672767072687e-05, + "loss": 2.5931, + "step": 41410 + }, + { + "epoch": 1.9279977652070674, + "grad_norm": 0.31397197110059816, + "learning_rate": 3.410415944386306e-05, + "loss": 2.7557, + "step": 41411 + }, + { + "epoch": 1.9280443233931606, + 
"grad_norm": 0.4002796008357874, + "learning_rate": 3.410159126365199e-05, + "loss": 2.6344, + "step": 41412 + }, + { + "epoch": 1.9280908815792537, + "grad_norm": 0.35769990703741594, + "learning_rate": 3.409902313010119e-05, + "loss": 2.8141, + "step": 41413 + }, + { + "epoch": 1.9281374397653468, + "grad_norm": 0.3283423034642008, + "learning_rate": 3.409645504321816e-05, + "loss": 2.708, + "step": 41414 + }, + { + "epoch": 1.92818399795144, + "grad_norm": 0.34653055787675285, + "learning_rate": 3.409388700301049e-05, + "loss": 2.7718, + "step": 41415 + }, + { + "epoch": 1.928230556137533, + "grad_norm": 0.3647659866896163, + "learning_rate": 3.409131900948571e-05, + "loss": 2.6136, + "step": 41416 + }, + { + "epoch": 1.928277114323626, + "grad_norm": 0.38408683661067, + "learning_rate": 3.408875106265131e-05, + "loss": 2.694, + "step": 41417 + }, + { + "epoch": 1.928323672509719, + "grad_norm": 0.34735705565811514, + "learning_rate": 3.408618316251487e-05, + "loss": 2.6585, + "step": 41418 + }, + { + "epoch": 1.928370230695812, + "grad_norm": 0.3700947824802193, + "learning_rate": 3.4083615309083916e-05, + "loss": 2.6301, + "step": 41419 + }, + { + "epoch": 1.928416788881905, + "grad_norm": 0.34244472119835595, + "learning_rate": 3.4081047502365995e-05, + "loss": 2.6586, + "step": 41420 + }, + { + "epoch": 1.9284633470679982, + "grad_norm": 0.3441043199696506, + "learning_rate": 3.407847974236862e-05, + "loss": 2.6698, + "step": 41421 + }, + { + "epoch": 1.9285099052540913, + "grad_norm": 0.34866131097435576, + "learning_rate": 3.407591202909932e-05, + "loss": 2.6315, + "step": 41422 + }, + { + "epoch": 1.9285564634401844, + "grad_norm": 0.34446357238933983, + "learning_rate": 3.407334436256568e-05, + "loss": 2.7713, + "step": 41423 + }, + { + "epoch": 1.9286030216262775, + "grad_norm": 0.35992900477932144, + "learning_rate": 3.407077674277518e-05, + "loss": 2.6753, + "step": 41424 + }, + { + "epoch": 1.9286495798123706, + "grad_norm": 0.31790594170248365, + "learning_rate": 3.40682091697354e-05, + "loss": 2.7164, + "step": 41425 + }, + { + "epoch": 1.9286961379984637, + "grad_norm": 0.33337543627808786, + "learning_rate": 3.406564164345386e-05, + "loss": 2.5823, + "step": 41426 + }, + { + "epoch": 1.9287426961845566, + "grad_norm": 0.3409717726488009, + "learning_rate": 3.4063074163938055e-05, + "loss": 2.6488, + "step": 41427 + }, + { + "epoch": 1.9287892543706497, + "grad_norm": 0.34833430794025466, + "learning_rate": 3.4060506731195595e-05, + "loss": 2.7336, + "step": 41428 + }, + { + "epoch": 1.9288358125567426, + "grad_norm": 0.3360176474596236, + "learning_rate": 3.405793934523397e-05, + "loss": 2.5979, + "step": 41429 + }, + { + "epoch": 1.9288823707428358, + "grad_norm": 0.3642318547603225, + "learning_rate": 3.405537200606072e-05, + "loss": 2.6486, + "step": 41430 + }, + { + "epoch": 1.9289289289289289, + "grad_norm": 0.3350995226015634, + "learning_rate": 3.405280471368338e-05, + "loss": 2.7172, + "step": 41431 + }, + { + "epoch": 1.928975487115022, + "grad_norm": 0.32950986323595777, + "learning_rate": 3.405023746810949e-05, + "loss": 2.6351, + "step": 41432 + }, + { + "epoch": 1.929022045301115, + "grad_norm": 0.35919984890395273, + "learning_rate": 3.4047670269346584e-05, + "loss": 2.7079, + "step": 41433 + }, + { + "epoch": 1.9290686034872082, + "grad_norm": 0.3497178420682759, + "learning_rate": 3.4045103117402194e-05, + "loss": 2.674, + "step": 41434 + }, + { + "epoch": 1.9291151616733013, + "grad_norm": 0.33241776434956805, + "learning_rate": 3.404253601228384e-05, + 
"loss": 2.7373, + "step": 41435 + }, + { + "epoch": 1.9291617198593944, + "grad_norm": 0.3552452274634747, + "learning_rate": 3.403996895399909e-05, + "loss": 2.7502, + "step": 41436 + }, + { + "epoch": 1.9292082780454873, + "grad_norm": 0.37927141916023754, + "learning_rate": 3.403740194255544e-05, + "loss": 2.7399, + "step": 41437 + }, + { + "epoch": 1.9292548362315804, + "grad_norm": 0.35090213917644286, + "learning_rate": 3.403483497796045e-05, + "loss": 2.7619, + "step": 41438 + }, + { + "epoch": 1.9293013944176733, + "grad_norm": 0.34231253730148187, + "learning_rate": 3.403226806022166e-05, + "loss": 2.6214, + "step": 41439 + }, + { + "epoch": 1.9293479526037665, + "grad_norm": 0.34423506649285657, + "learning_rate": 3.4029701189346565e-05, + "loss": 2.5888, + "step": 41440 + }, + { + "epoch": 1.9293945107898596, + "grad_norm": 0.32834605841748155, + "learning_rate": 3.402713436534275e-05, + "loss": 2.5964, + "step": 41441 + }, + { + "epoch": 1.9294410689759527, + "grad_norm": 0.3174268834350126, + "learning_rate": 3.40245675882177e-05, + "loss": 2.5059, + "step": 41442 + }, + { + "epoch": 1.9294876271620458, + "grad_norm": 0.3340424377067843, + "learning_rate": 3.402200085797899e-05, + "loss": 2.7197, + "step": 41443 + }, + { + "epoch": 1.929534185348139, + "grad_norm": 0.327816519120679, + "learning_rate": 3.4019434174634126e-05, + "loss": 2.5815, + "step": 41444 + }, + { + "epoch": 1.929580743534232, + "grad_norm": 0.34591105148171686, + "learning_rate": 3.401686753819064e-05, + "loss": 2.77, + "step": 41445 + }, + { + "epoch": 1.929627301720325, + "grad_norm": 0.3776203651894425, + "learning_rate": 3.401430094865609e-05, + "loss": 2.629, + "step": 41446 + }, + { + "epoch": 1.929673859906418, + "grad_norm": 0.31852102697061385, + "learning_rate": 3.401173440603797e-05, + "loss": 2.6763, + "step": 41447 + }, + { + "epoch": 1.9297204180925112, + "grad_norm": 0.3603932069481113, + "learning_rate": 3.4009167910343845e-05, + "loss": 2.5561, + "step": 41448 + }, + { + "epoch": 1.929766976278604, + "grad_norm": 0.35106792372383167, + "learning_rate": 3.400660146158124e-05, + "loss": 2.6745, + "step": 41449 + }, + { + "epoch": 1.9298135344646972, + "grad_norm": 0.34554908454922073, + "learning_rate": 3.400403505975768e-05, + "loss": 2.6825, + "step": 41450 + }, + { + "epoch": 1.9298600926507903, + "grad_norm": 0.3424626165559791, + "learning_rate": 3.40014687048807e-05, + "loss": 2.6608, + "step": 41451 + }, + { + "epoch": 1.9299066508368834, + "grad_norm": 0.3891823760130817, + "learning_rate": 3.3998902396957847e-05, + "loss": 2.634, + "step": 41452 + }, + { + "epoch": 1.9299532090229765, + "grad_norm": 0.34181578633785703, + "learning_rate": 3.399633613599661e-05, + "loss": 2.7564, + "step": 41453 + }, + { + "epoch": 1.9299997672090696, + "grad_norm": 0.34117114181329017, + "learning_rate": 3.399376992200458e-05, + "loss": 2.6565, + "step": 41454 + }, + { + "epoch": 1.9300463253951627, + "grad_norm": 0.34887903715469065, + "learning_rate": 3.399120375498924e-05, + "loss": 2.6696, + "step": 41455 + }, + { + "epoch": 1.9300928835812556, + "grad_norm": 0.33246141423410736, + "learning_rate": 3.3988637634958154e-05, + "loss": 2.678, + "step": 41456 + }, + { + "epoch": 1.9301394417673488, + "grad_norm": 0.3655574935751307, + "learning_rate": 3.398607156191883e-05, + "loss": 2.6737, + "step": 41457 + }, + { + "epoch": 1.9301859999534419, + "grad_norm": 0.3172809690736779, + "learning_rate": 3.3983505535878814e-05, + "loss": 2.6625, + "step": 41458 + }, + { + "epoch": 1.9302325581395348, + 
"grad_norm": 0.31058305817874354, + "learning_rate": 3.3980939556845624e-05, + "loss": 2.6237, + "step": 41459 + }, + { + "epoch": 1.9302791163256279, + "grad_norm": 0.3746997613431013, + "learning_rate": 3.3978373624826795e-05, + "loss": 2.7013, + "step": 41460 + }, + { + "epoch": 1.930325674511721, + "grad_norm": 0.34370684045418093, + "learning_rate": 3.3975807739829876e-05, + "loss": 2.6804, + "step": 41461 + }, + { + "epoch": 1.930372232697814, + "grad_norm": 0.34828144762464985, + "learning_rate": 3.397324190186237e-05, + "loss": 2.6871, + "step": 41462 + }, + { + "epoch": 1.9304187908839072, + "grad_norm": 0.3835771890736616, + "learning_rate": 3.397067611093182e-05, + "loss": 2.6998, + "step": 41463 + }, + { + "epoch": 1.9304653490700003, + "grad_norm": 0.36371778195126964, + "learning_rate": 3.396811036704577e-05, + "loss": 2.7414, + "step": 41464 + }, + { + "epoch": 1.9305119072560935, + "grad_norm": 0.37410543236206034, + "learning_rate": 3.3965544670211715e-05, + "loss": 2.6384, + "step": 41465 + }, + { + "epoch": 1.9305584654421863, + "grad_norm": 0.31710464534116034, + "learning_rate": 3.396297902043723e-05, + "loss": 2.5696, + "step": 41466 + }, + { + "epoch": 1.9306050236282795, + "grad_norm": 0.3719737823347938, + "learning_rate": 3.3960413417729805e-05, + "loss": 2.7651, + "step": 41467 + }, + { + "epoch": 1.9306515818143724, + "grad_norm": 0.35993313887104605, + "learning_rate": 3.395784786209699e-05, + "loss": 2.7499, + "step": 41468 + }, + { + "epoch": 1.9306981400004655, + "grad_norm": 0.3309073679931598, + "learning_rate": 3.395528235354632e-05, + "loss": 2.6776, + "step": 41469 + }, + { + "epoch": 1.9307446981865586, + "grad_norm": 0.37382973530818947, + "learning_rate": 3.395271689208531e-05, + "loss": 2.6985, + "step": 41470 + }, + { + "epoch": 1.9307912563726517, + "grad_norm": 0.34848648113123354, + "learning_rate": 3.395015147772149e-05, + "loss": 2.672, + "step": 41471 + }, + { + "epoch": 1.9308378145587448, + "grad_norm": 0.31549865020447604, + "learning_rate": 3.39475861104624e-05, + "loss": 2.5168, + "step": 41472 + }, + { + "epoch": 1.930884372744838, + "grad_norm": 0.36821085979509277, + "learning_rate": 3.394502079031555e-05, + "loss": 2.6801, + "step": 41473 + }, + { + "epoch": 1.930930930930931, + "grad_norm": 0.34708328997840243, + "learning_rate": 3.394245551728851e-05, + "loss": 2.617, + "step": 41474 + }, + { + "epoch": 1.9309774891170242, + "grad_norm": 0.345879006982483, + "learning_rate": 3.393989029138876e-05, + "loss": 2.7664, + "step": 41475 + }, + { + "epoch": 1.931024047303117, + "grad_norm": 0.3727871271001847, + "learning_rate": 3.3937325112623845e-05, + "loss": 2.7767, + "step": 41476 + }, + { + "epoch": 1.9310706054892102, + "grad_norm": 0.3406543716156609, + "learning_rate": 3.393475998100132e-05, + "loss": 2.6739, + "step": 41477 + }, + { + "epoch": 1.931117163675303, + "grad_norm": 0.3489167328627834, + "learning_rate": 3.3932194896528664e-05, + "loss": 2.6452, + "step": 41478 + }, + { + "epoch": 1.9311637218613962, + "grad_norm": 0.3471167696510527, + "learning_rate": 3.392962985921346e-05, + "loss": 2.6488, + "step": 41479 + }, + { + "epoch": 1.9312102800474893, + "grad_norm": 0.3530083782598829, + "learning_rate": 3.39270648690632e-05, + "loss": 2.6238, + "step": 41480 + }, + { + "epoch": 1.9312568382335824, + "grad_norm": 0.34539035614149105, + "learning_rate": 3.3924499926085415e-05, + "loss": 2.7057, + "step": 41481 + }, + { + "epoch": 1.9313033964196755, + "grad_norm": 0.3366577916740831, + "learning_rate": 
3.3921935030287656e-05, + "loss": 2.6494, + "step": 41482 + }, + { + "epoch": 1.9313499546057686, + "grad_norm": 0.3402047362014167, + "learning_rate": 3.3919370181677404e-05, + "loss": 2.6272, + "step": 41483 + }, + { + "epoch": 1.9313965127918618, + "grad_norm": 0.34434268196433326, + "learning_rate": 3.391680538026224e-05, + "loss": 2.7449, + "step": 41484 + }, + { + "epoch": 1.9314430709779546, + "grad_norm": 0.34097338368485414, + "learning_rate": 3.391424062604967e-05, + "loss": 2.6779, + "step": 41485 + }, + { + "epoch": 1.9314896291640478, + "grad_norm": 0.3575177587363543, + "learning_rate": 3.3911675919047195e-05, + "loss": 2.5852, + "step": 41486 + }, + { + "epoch": 1.9315361873501409, + "grad_norm": 0.35792568410481773, + "learning_rate": 3.3909111259262386e-05, + "loss": 2.7283, + "step": 41487 + }, + { + "epoch": 1.9315827455362338, + "grad_norm": 0.33838478945838996, + "learning_rate": 3.3906546646702754e-05, + "loss": 2.7288, + "step": 41488 + }, + { + "epoch": 1.9316293037223269, + "grad_norm": 0.339082109527785, + "learning_rate": 3.39039820813758e-05, + "loss": 2.5853, + "step": 41489 + }, + { + "epoch": 1.93167586190842, + "grad_norm": 0.3716336080859577, + "learning_rate": 3.3901417563289094e-05, + "loss": 2.6989, + "step": 41490 + }, + { + "epoch": 1.9317224200945131, + "grad_norm": 0.35443598252829966, + "learning_rate": 3.389885309245012e-05, + "loss": 2.6149, + "step": 41491 + }, + { + "epoch": 1.9317689782806062, + "grad_norm": 0.34464737076203855, + "learning_rate": 3.389628866886645e-05, + "loss": 2.6984, + "step": 41492 + }, + { + "epoch": 1.9318155364666993, + "grad_norm": 0.3139352644268065, + "learning_rate": 3.389372429254558e-05, + "loss": 2.5685, + "step": 41493 + }, + { + "epoch": 1.9318620946527925, + "grad_norm": 0.31981787933158246, + "learning_rate": 3.389115996349504e-05, + "loss": 2.75, + "step": 41494 + }, + { + "epoch": 1.9319086528388854, + "grad_norm": 0.37825411402154446, + "learning_rate": 3.3888595681722364e-05, + "loss": 2.678, + "step": 41495 + }, + { + "epoch": 1.9319552110249785, + "grad_norm": 0.34542932896778655, + "learning_rate": 3.388603144723505e-05, + "loss": 2.7361, + "step": 41496 + }, + { + "epoch": 1.9320017692110716, + "grad_norm": 0.3775886796131332, + "learning_rate": 3.388346726004067e-05, + "loss": 2.6889, + "step": 41497 + }, + { + "epoch": 1.9320483273971645, + "grad_norm": 0.33881638224038746, + "learning_rate": 3.388090312014672e-05, + "loss": 2.7085, + "step": 41498 + }, + { + "epoch": 1.9320948855832576, + "grad_norm": 0.36951246799597837, + "learning_rate": 3.387833902756072e-05, + "loss": 2.6872, + "step": 41499 + }, + { + "epoch": 1.9321414437693507, + "grad_norm": 0.35094013752574715, + "learning_rate": 3.387577498229022e-05, + "loss": 2.5939, + "step": 41500 + }, + { + "epoch": 1.9321880019554438, + "grad_norm": 0.31994512470899705, + "learning_rate": 3.387321098434272e-05, + "loss": 2.6726, + "step": 41501 + }, + { + "epoch": 1.932234560141537, + "grad_norm": 0.3785866999247606, + "learning_rate": 3.3870647033725765e-05, + "loss": 2.6359, + "step": 41502 + }, + { + "epoch": 1.93228111832763, + "grad_norm": 0.3695758001078713, + "learning_rate": 3.3868083130446884e-05, + "loss": 2.69, + "step": 41503 + }, + { + "epoch": 1.9323276765137232, + "grad_norm": 0.3522722083077619, + "learning_rate": 3.386551927451356e-05, + "loss": 2.6442, + "step": 41504 + }, + { + "epoch": 1.932374234699816, + "grad_norm": 0.34099373735870797, + "learning_rate": 3.386295546593338e-05, + "loss": 2.5857, + "step": 41505 + }, + { + 
"epoch": 1.9324207928859092, + "grad_norm": 0.33856481724391535, + "learning_rate": 3.386039170471382e-05, + "loss": 2.691, + "step": 41506 + }, + { + "epoch": 1.932467351072002, + "grad_norm": 0.39485613357374855, + "learning_rate": 3.385782799086242e-05, + "loss": 2.6756, + "step": 41507 + }, + { + "epoch": 1.9325139092580952, + "grad_norm": 0.36855390462879484, + "learning_rate": 3.3855264324386706e-05, + "loss": 2.7612, + "step": 41508 + }, + { + "epoch": 1.9325604674441883, + "grad_norm": 0.3494741795555562, + "learning_rate": 3.3852700705294196e-05, + "loss": 2.6806, + "step": 41509 + }, + { + "epoch": 1.9326070256302814, + "grad_norm": 0.4104537121467008, + "learning_rate": 3.385013713359243e-05, + "loss": 2.671, + "step": 41510 + }, + { + "epoch": 1.9326535838163745, + "grad_norm": 0.39308897880863597, + "learning_rate": 3.3847573609288916e-05, + "loss": 2.5708, + "step": 41511 + }, + { + "epoch": 1.9327001420024676, + "grad_norm": 0.34803868596160487, + "learning_rate": 3.384501013239117e-05, + "loss": 2.6855, + "step": 41512 + }, + { + "epoch": 1.9327467001885608, + "grad_norm": 0.33440441247387653, + "learning_rate": 3.3842446702906746e-05, + "loss": 2.5916, + "step": 41513 + }, + { + "epoch": 1.9327932583746539, + "grad_norm": 0.35023634053118635, + "learning_rate": 3.3839883320843125e-05, + "loss": 2.7182, + "step": 41514 + }, + { + "epoch": 1.9328398165607468, + "grad_norm": 0.34253654814182394, + "learning_rate": 3.3837319986207874e-05, + "loss": 2.5489, + "step": 41515 + }, + { + "epoch": 1.9328863747468399, + "grad_norm": 0.3607942591750471, + "learning_rate": 3.383475669900851e-05, + "loss": 2.672, + "step": 41516 + }, + { + "epoch": 1.9329329329329328, + "grad_norm": 0.3734379877555448, + "learning_rate": 3.38321934592525e-05, + "loss": 2.7553, + "step": 41517 + }, + { + "epoch": 1.932979491119026, + "grad_norm": 0.3461021478472932, + "learning_rate": 3.382963026694745e-05, + "loss": 2.6691, + "step": 41518 + }, + { + "epoch": 1.933026049305119, + "grad_norm": 0.38003064150679, + "learning_rate": 3.382706712210082e-05, + "loss": 2.6433, + "step": 41519 + }, + { + "epoch": 1.9330726074912121, + "grad_norm": 0.3441049250957966, + "learning_rate": 3.3824504024720174e-05, + "loss": 2.7397, + "step": 41520 + }, + { + "epoch": 1.9331191656773052, + "grad_norm": 0.36845015456341346, + "learning_rate": 3.3821940974812995e-05, + "loss": 2.6931, + "step": 41521 + }, + { + "epoch": 1.9331657238633984, + "grad_norm": 0.3679337107104606, + "learning_rate": 3.381937797238683e-05, + "loss": 2.6734, + "step": 41522 + }, + { + "epoch": 1.9332122820494915, + "grad_norm": 0.36861444492391315, + "learning_rate": 3.38168150174492e-05, + "loss": 2.6476, + "step": 41523 + }, + { + "epoch": 1.9332588402355846, + "grad_norm": 0.3383080694710112, + "learning_rate": 3.381425211000764e-05, + "loss": 2.6774, + "step": 41524 + }, + { + "epoch": 1.9333053984216775, + "grad_norm": 0.3619563178064202, + "learning_rate": 3.381168925006962e-05, + "loss": 2.6696, + "step": 41525 + }, + { + "epoch": 1.9333519566077706, + "grad_norm": 0.36263345480246595, + "learning_rate": 3.3809126437642726e-05, + "loss": 2.7809, + "step": 41526 + }, + { + "epoch": 1.9333985147938635, + "grad_norm": 0.34173760350927945, + "learning_rate": 3.3806563672734425e-05, + "loss": 2.6822, + "step": 41527 + }, + { + "epoch": 1.9334450729799566, + "grad_norm": 0.36261403424027605, + "learning_rate": 3.380400095535229e-05, + "loss": 2.632, + "step": 41528 + }, + { + "epoch": 1.9334916311660497, + "grad_norm": 0.3429681131458122, + 
"learning_rate": 3.3801438285503814e-05, + "loss": 2.6244, + "step": 41529 + }, + { + "epoch": 1.9335381893521428, + "grad_norm": 0.37501115425933257, + "learning_rate": 3.379887566319649e-05, + "loss": 2.7464, + "step": 41530 + }, + { + "epoch": 1.933584747538236, + "grad_norm": 0.3706900595820638, + "learning_rate": 3.3796313088437906e-05, + "loss": 2.6601, + "step": 41531 + }, + { + "epoch": 1.933631305724329, + "grad_norm": 0.3752704626703437, + "learning_rate": 3.379375056123553e-05, + "loss": 2.7145, + "step": 41532 + }, + { + "epoch": 1.9336778639104222, + "grad_norm": 0.37754584984920764, + "learning_rate": 3.3791188081596916e-05, + "loss": 2.7317, + "step": 41533 + }, + { + "epoch": 1.933724422096515, + "grad_norm": 0.3563781417752615, + "learning_rate": 3.378862564952955e-05, + "loss": 2.7673, + "step": 41534 + }, + { + "epoch": 1.9337709802826082, + "grad_norm": 0.3801091485973223, + "learning_rate": 3.378606326504098e-05, + "loss": 2.6675, + "step": 41535 + }, + { + "epoch": 1.9338175384687013, + "grad_norm": 0.36477048536874224, + "learning_rate": 3.378350092813871e-05, + "loss": 2.711, + "step": 41536 + }, + { + "epoch": 1.9338640966547942, + "grad_norm": 0.32805430664845453, + "learning_rate": 3.378093863883026e-05, + "loss": 2.602, + "step": 41537 + }, + { + "epoch": 1.9339106548408873, + "grad_norm": 0.3931391292219686, + "learning_rate": 3.377837639712318e-05, + "loss": 2.7092, + "step": 41538 + }, + { + "epoch": 1.9339572130269804, + "grad_norm": 0.3431021995527149, + "learning_rate": 3.3775814203024955e-05, + "loss": 2.692, + "step": 41539 + }, + { + "epoch": 1.9340037712130735, + "grad_norm": 0.37795543781073154, + "learning_rate": 3.37732520565431e-05, + "loss": 2.7062, + "step": 41540 + }, + { + "epoch": 1.9340503293991667, + "grad_norm": 0.3481158708910147, + "learning_rate": 3.377068995768518e-05, + "loss": 2.6815, + "step": 41541 + }, + { + "epoch": 1.9340968875852598, + "grad_norm": 0.3675671486446819, + "learning_rate": 3.376812790645868e-05, + "loss": 2.6526, + "step": 41542 + }, + { + "epoch": 1.9341434457713529, + "grad_norm": 0.35743401206432607, + "learning_rate": 3.376556590287111e-05, + "loss": 2.6869, + "step": 41543 + }, + { + "epoch": 1.9341900039574458, + "grad_norm": 0.3416834838653284, + "learning_rate": 3.3763003946930024e-05, + "loss": 2.6552, + "step": 41544 + }, + { + "epoch": 1.934236562143539, + "grad_norm": 0.37586293677807936, + "learning_rate": 3.376044203864291e-05, + "loss": 2.5699, + "step": 41545 + }, + { + "epoch": 1.934283120329632, + "grad_norm": 0.34248808717377144, + "learning_rate": 3.375788017801731e-05, + "loss": 2.6641, + "step": 41546 + }, + { + "epoch": 1.934329678515725, + "grad_norm": 0.33416214259916666, + "learning_rate": 3.375531836506072e-05, + "loss": 2.6923, + "step": 41547 + }, + { + "epoch": 1.934376236701818, + "grad_norm": 0.3767348568030046, + "learning_rate": 3.375275659978067e-05, + "loss": 2.6342, + "step": 41548 + }, + { + "epoch": 1.9344227948879111, + "grad_norm": 0.3491123294771516, + "learning_rate": 3.3750194882184684e-05, + "loss": 2.5669, + "step": 41549 + }, + { + "epoch": 1.9344693530740042, + "grad_norm": 0.3329273276298135, + "learning_rate": 3.374763321228026e-05, + "loss": 2.695, + "step": 41550 + }, + { + "epoch": 1.9345159112600974, + "grad_norm": 0.3761228452049514, + "learning_rate": 3.374507159007495e-05, + "loss": 2.5968, + "step": 41551 + }, + { + "epoch": 1.9345624694461905, + "grad_norm": 0.3351280729604353, + "learning_rate": 3.3742510015576257e-05, + "loss": 2.644, + "step": 41552 + }, 
+ { + "epoch": 1.9346090276322836, + "grad_norm": 0.3425726322550689, + "learning_rate": 3.3739948488791675e-05, + "loss": 2.7383, + "step": 41553 + }, + { + "epoch": 1.9346555858183765, + "grad_norm": 0.33153563767631883, + "learning_rate": 3.373738700972876e-05, + "loss": 2.6816, + "step": 41554 + }, + { + "epoch": 1.9347021440044696, + "grad_norm": 0.3461090309330836, + "learning_rate": 3.3734825578394994e-05, + "loss": 2.7494, + "step": 41555 + }, + { + "epoch": 1.9347487021905625, + "grad_norm": 0.35391382091649004, + "learning_rate": 3.373226419479793e-05, + "loss": 2.8333, + "step": 41556 + }, + { + "epoch": 1.9347952603766556, + "grad_norm": 0.3225537992245324, + "learning_rate": 3.3729702858945065e-05, + "loss": 2.7065, + "step": 41557 + }, + { + "epoch": 1.9348418185627487, + "grad_norm": 0.35490479162859484, + "learning_rate": 3.372714157084392e-05, + "loss": 2.7483, + "step": 41558 + }, + { + "epoch": 1.9348883767488418, + "grad_norm": 0.33851673758015355, + "learning_rate": 3.372458033050201e-05, + "loss": 2.6528, + "step": 41559 + }, + { + "epoch": 1.934934934934935, + "grad_norm": 0.3748888200422334, + "learning_rate": 3.372201913792685e-05, + "loss": 2.7397, + "step": 41560 + }, + { + "epoch": 1.934981493121028, + "grad_norm": 0.3266952459604101, + "learning_rate": 3.3719457993125955e-05, + "loss": 2.6556, + "step": 41561 + }, + { + "epoch": 1.9350280513071212, + "grad_norm": 0.3381728908771586, + "learning_rate": 3.371689689610686e-05, + "loss": 2.6404, + "step": 41562 + }, + { + "epoch": 1.9350746094932143, + "grad_norm": 0.3535435248074913, + "learning_rate": 3.371433584687705e-05, + "loss": 2.6609, + "step": 41563 + }, + { + "epoch": 1.9351211676793072, + "grad_norm": 0.40429888241954237, + "learning_rate": 3.3711774845444076e-05, + "loss": 2.6068, + "step": 41564 + }, + { + "epoch": 1.9351677258654003, + "grad_norm": 0.3390054886543984, + "learning_rate": 3.3709213891815445e-05, + "loss": 2.6959, + "step": 41565 + }, + { + "epoch": 1.9352142840514932, + "grad_norm": 0.3590283291448424, + "learning_rate": 3.3706652985998645e-05, + "loss": 2.7168, + "step": 41566 + }, + { + "epoch": 1.9352608422375863, + "grad_norm": 0.3848959351000656, + "learning_rate": 3.370409212800123e-05, + "loss": 2.7905, + "step": 41567 + }, + { + "epoch": 1.9353074004236794, + "grad_norm": 0.38215790209331274, + "learning_rate": 3.3701531317830676e-05, + "loss": 2.7551, + "step": 41568 + }, + { + "epoch": 1.9353539586097726, + "grad_norm": 0.35159770559104564, + "learning_rate": 3.369897055549455e-05, + "loss": 2.8467, + "step": 41569 + }, + { + "epoch": 1.9354005167958657, + "grad_norm": 0.384828938425859, + "learning_rate": 3.3696409841000334e-05, + "loss": 2.6857, + "step": 41570 + }, + { + "epoch": 1.9354470749819588, + "grad_norm": 0.35996339123900084, + "learning_rate": 3.369384917435554e-05, + "loss": 2.7301, + "step": 41571 + }, + { + "epoch": 1.935493633168052, + "grad_norm": 0.3357733734073255, + "learning_rate": 3.3691288555567705e-05, + "loss": 2.6377, + "step": 41572 + }, + { + "epoch": 1.9355401913541448, + "grad_norm": 0.3787975494590248, + "learning_rate": 3.36887279846443e-05, + "loss": 2.7597, + "step": 41573 + }, + { + "epoch": 1.935586749540238, + "grad_norm": 0.3699246858133705, + "learning_rate": 3.36861674615929e-05, + "loss": 2.7004, + "step": 41574 + }, + { + "epoch": 1.935633307726331, + "grad_norm": 0.4063633104318805, + "learning_rate": 3.3683606986420994e-05, + "loss": 2.6101, + "step": 41575 + }, + { + "epoch": 1.935679865912424, + "grad_norm": 0.334265713424078, + 
"learning_rate": 3.368104655913607e-05, + "loss": 2.7147, + "step": 41576 + }, + { + "epoch": 1.935726424098517, + "grad_norm": 0.37687995926596085, + "learning_rate": 3.367848617974569e-05, + "loss": 2.7286, + "step": 41577 + }, + { + "epoch": 1.9357729822846101, + "grad_norm": 0.3712248031052249, + "learning_rate": 3.367592584825734e-05, + "loss": 2.7013, + "step": 41578 + }, + { + "epoch": 1.9358195404707033, + "grad_norm": 0.34197502313935557, + "learning_rate": 3.3673365564678525e-05, + "loss": 2.6667, + "step": 41579 + }, + { + "epoch": 1.9358660986567964, + "grad_norm": 0.33065723864251073, + "learning_rate": 3.367080532901679e-05, + "loss": 2.6826, + "step": 41580 + }, + { + "epoch": 1.9359126568428895, + "grad_norm": 0.3668593015131223, + "learning_rate": 3.3668245141279614e-05, + "loss": 2.6202, + "step": 41581 + }, + { + "epoch": 1.9359592150289826, + "grad_norm": 0.3622865520186364, + "learning_rate": 3.366568500147455e-05, + "loss": 2.7168, + "step": 41582 + }, + { + "epoch": 1.9360057732150755, + "grad_norm": 0.3459722814174565, + "learning_rate": 3.366312490960909e-05, + "loss": 2.581, + "step": 41583 + }, + { + "epoch": 1.9360523314011686, + "grad_norm": 0.3792507765525562, + "learning_rate": 3.366056486569074e-05, + "loss": 2.745, + "step": 41584 + }, + { + "epoch": 1.9360988895872617, + "grad_norm": 0.349426963912306, + "learning_rate": 3.365800486972703e-05, + "loss": 2.7297, + "step": 41585 + }, + { + "epoch": 1.9361454477733546, + "grad_norm": 0.3766172114312657, + "learning_rate": 3.3655444921725454e-05, + "loss": 2.6975, + "step": 41586 + }, + { + "epoch": 1.9361920059594477, + "grad_norm": 0.35403815516062087, + "learning_rate": 3.365288502169355e-05, + "loss": 2.6616, + "step": 41587 + }, + { + "epoch": 1.9362385641455409, + "grad_norm": 0.35396122583225376, + "learning_rate": 3.365032516963882e-05, + "loss": 2.7543, + "step": 41588 + }, + { + "epoch": 1.936285122331634, + "grad_norm": 0.3692434234985628, + "learning_rate": 3.364776536556875e-05, + "loss": 2.6195, + "step": 41589 + }, + { + "epoch": 1.936331680517727, + "grad_norm": 0.33809277311452707, + "learning_rate": 3.364520560949091e-05, + "loss": 2.7192, + "step": 41590 + }, + { + "epoch": 1.9363782387038202, + "grad_norm": 0.34060086740773565, + "learning_rate": 3.3642645901412746e-05, + "loss": 2.7351, + "step": 41591 + }, + { + "epoch": 1.9364247968899133, + "grad_norm": 0.3573409204978158, + "learning_rate": 3.364008624134183e-05, + "loss": 2.7862, + "step": 41592 + }, + { + "epoch": 1.9364713550760062, + "grad_norm": 0.3262361708899886, + "learning_rate": 3.363752662928565e-05, + "loss": 2.7021, + "step": 41593 + }, + { + "epoch": 1.9365179132620993, + "grad_norm": 0.3468576900676379, + "learning_rate": 3.36349670652517e-05, + "loss": 2.6483, + "step": 41594 + }, + { + "epoch": 1.9365644714481922, + "grad_norm": 0.3522110919677698, + "learning_rate": 3.363240754924752e-05, + "loss": 2.7222, + "step": 41595 + }, + { + "epoch": 1.9366110296342853, + "grad_norm": 0.331195047145813, + "learning_rate": 3.3629848081280623e-05, + "loss": 2.6862, + "step": 41596 + }, + { + "epoch": 1.9366575878203784, + "grad_norm": 0.33172796059194315, + "learning_rate": 3.362728866135849e-05, + "loss": 2.6382, + "step": 41597 + }, + { + "epoch": 1.9367041460064716, + "grad_norm": 0.36672896570371905, + "learning_rate": 3.3624729289488664e-05, + "loss": 2.6759, + "step": 41598 + }, + { + "epoch": 1.9367507041925647, + "grad_norm": 0.3623946473159874, + "learning_rate": 3.362216996567863e-05, + "loss": 2.701, + "step": 41599 + 
}, + { + "epoch": 1.9367972623786578, + "grad_norm": 0.3427917451837026, + "learning_rate": 3.3619610689935935e-05, + "loss": 2.6851, + "step": 41600 + }, + { + "epoch": 1.936843820564751, + "grad_norm": 0.3627041602623949, + "learning_rate": 3.3617051462268054e-05, + "loss": 2.7119, + "step": 41601 + }, + { + "epoch": 1.936890378750844, + "grad_norm": 0.3179712826364087, + "learning_rate": 3.361449228268251e-05, + "loss": 2.7293, + "step": 41602 + }, + { + "epoch": 1.936936936936937, + "grad_norm": 0.3589428900093105, + "learning_rate": 3.361193315118682e-05, + "loss": 2.6475, + "step": 41603 + }, + { + "epoch": 1.93698349512303, + "grad_norm": 0.3392515810712476, + "learning_rate": 3.360937406778849e-05, + "loss": 2.7727, + "step": 41604 + }, + { + "epoch": 1.937030053309123, + "grad_norm": 0.32731466973893325, + "learning_rate": 3.360681503249504e-05, + "loss": 2.6388, + "step": 41605 + }, + { + "epoch": 1.937076611495216, + "grad_norm": 0.35615173267847516, + "learning_rate": 3.360425604531398e-05, + "loss": 2.6759, + "step": 41606 + }, + { + "epoch": 1.9371231696813092, + "grad_norm": 0.3196622493012896, + "learning_rate": 3.360169710625278e-05, + "loss": 2.6797, + "step": 41607 + }, + { + "epoch": 1.9371697278674023, + "grad_norm": 0.33460066897985696, + "learning_rate": 3.359913821531902e-05, + "loss": 2.6415, + "step": 41608 + }, + { + "epoch": 1.9372162860534954, + "grad_norm": 0.3513495475047448, + "learning_rate": 3.3596579372520156e-05, + "loss": 2.7764, + "step": 41609 + }, + { + "epoch": 1.9372628442395885, + "grad_norm": 0.3062687761482343, + "learning_rate": 3.359402057786373e-05, + "loss": 2.5928, + "step": 41610 + }, + { + "epoch": 1.9373094024256816, + "grad_norm": 0.35613058804293146, + "learning_rate": 3.359146183135723e-05, + "loss": 2.7423, + "step": 41611 + }, + { + "epoch": 1.9373559606117747, + "grad_norm": 0.3754627646928014, + "learning_rate": 3.358890313300816e-05, + "loss": 2.695, + "step": 41612 + }, + { + "epoch": 1.9374025187978676, + "grad_norm": 0.3235233158512656, + "learning_rate": 3.358634448282406e-05, + "loss": 2.6729, + "step": 41613 + }, + { + "epoch": 1.9374490769839607, + "grad_norm": 0.3607272766655578, + "learning_rate": 3.358378588081242e-05, + "loss": 2.7123, + "step": 41614 + }, + { + "epoch": 1.9374956351700536, + "grad_norm": 0.3563494748434077, + "learning_rate": 3.3581227326980735e-05, + "loss": 2.5386, + "step": 41615 + }, + { + "epoch": 1.9375421933561467, + "grad_norm": 0.31387132546937674, + "learning_rate": 3.3578668821336546e-05, + "loss": 2.7266, + "step": 41616 + }, + { + "epoch": 1.9375887515422399, + "grad_norm": 0.3250497079778723, + "learning_rate": 3.3576110363887325e-05, + "loss": 2.694, + "step": 41617 + }, + { + "epoch": 1.937635309728333, + "grad_norm": 0.33654537226660186, + "learning_rate": 3.357355195464062e-05, + "loss": 2.6867, + "step": 41618 + }, + { + "epoch": 1.937681867914426, + "grad_norm": 0.3395108406883249, + "learning_rate": 3.357099359360393e-05, + "loss": 2.7161, + "step": 41619 + }, + { + "epoch": 1.9377284261005192, + "grad_norm": 0.3649049494265111, + "learning_rate": 3.356843528078474e-05, + "loss": 2.7381, + "step": 41620 + }, + { + "epoch": 1.9377749842866123, + "grad_norm": 0.34892179779151405, + "learning_rate": 3.356587701619058e-05, + "loss": 2.6688, + "step": 41621 + }, + { + "epoch": 1.9378215424727052, + "grad_norm": 0.36004955529940236, + "learning_rate": 3.3563318799828947e-05, + "loss": 2.6962, + "step": 41622 + }, + { + "epoch": 1.9378681006587983, + "grad_norm": 0.3547816029175226, + 
"learning_rate": 3.356076063170737e-05, + "loss": 2.6401, + "step": 41623 + }, + { + "epoch": 1.9379146588448914, + "grad_norm": 0.3552221203634397, + "learning_rate": 3.355820251183333e-05, + "loss": 2.7265, + "step": 41624 + }, + { + "epoch": 1.9379612170309843, + "grad_norm": 0.35241842590579014, + "learning_rate": 3.3555644440214334e-05, + "loss": 2.6644, + "step": 41625 + }, + { + "epoch": 1.9380077752170775, + "grad_norm": 0.3283836482073807, + "learning_rate": 3.355308641685792e-05, + "loss": 2.5785, + "step": 41626 + }, + { + "epoch": 1.9380543334031706, + "grad_norm": 0.3352725089919519, + "learning_rate": 3.355052844177156e-05, + "loss": 2.6846, + "step": 41627 + }, + { + "epoch": 1.9381008915892637, + "grad_norm": 0.3353656108940724, + "learning_rate": 3.354797051496279e-05, + "loss": 2.7429, + "step": 41628 + }, + { + "epoch": 1.9381474497753568, + "grad_norm": 0.3527822294572362, + "learning_rate": 3.354541263643911e-05, + "loss": 2.6808, + "step": 41629 + }, + { + "epoch": 1.93819400796145, + "grad_norm": 0.33483840745842725, + "learning_rate": 3.3542854806208e-05, + "loss": 2.6764, + "step": 41630 + }, + { + "epoch": 1.938240566147543, + "grad_norm": 0.3678690745375146, + "learning_rate": 3.3540297024277014e-05, + "loss": 2.6522, + "step": 41631 + }, + { + "epoch": 1.938287124333636, + "grad_norm": 0.35396718228705615, + "learning_rate": 3.353773929065364e-05, + "loss": 2.5626, + "step": 41632 + }, + { + "epoch": 1.938333682519729, + "grad_norm": 0.3812302586461861, + "learning_rate": 3.353518160534534e-05, + "loss": 2.621, + "step": 41633 + }, + { + "epoch": 1.9383802407058222, + "grad_norm": 0.3566862990012634, + "learning_rate": 3.35326239683597e-05, + "loss": 2.5978, + "step": 41634 + }, + { + "epoch": 1.938426798891915, + "grad_norm": 0.32748191048252423, + "learning_rate": 3.3530066379704175e-05, + "loss": 2.6066, + "step": 41635 + }, + { + "epoch": 1.9384733570780082, + "grad_norm": 0.35698689294466696, + "learning_rate": 3.352750883938628e-05, + "loss": 2.7164, + "step": 41636 + }, + { + "epoch": 1.9385199152641013, + "grad_norm": 0.358117324083193, + "learning_rate": 3.352495134741353e-05, + "loss": 2.6494, + "step": 41637 + }, + { + "epoch": 1.9385664734501944, + "grad_norm": 0.34393070364202594, + "learning_rate": 3.352239390379341e-05, + "loss": 2.7353, + "step": 41638 + }, + { + "epoch": 1.9386130316362875, + "grad_norm": 0.3963496233431869, + "learning_rate": 3.351983650853345e-05, + "loss": 2.6492, + "step": 41639 + }, + { + "epoch": 1.9386595898223806, + "grad_norm": 0.3434102196354962, + "learning_rate": 3.3517279161641135e-05, + "loss": 2.7429, + "step": 41640 + }, + { + "epoch": 1.9387061480084737, + "grad_norm": 0.35937438907124636, + "learning_rate": 3.3514721863124e-05, + "loss": 2.7518, + "step": 41641 + }, + { + "epoch": 1.9387527061945666, + "grad_norm": 0.33920410723112365, + "learning_rate": 3.3512164612989533e-05, + "loss": 2.7494, + "step": 41642 + }, + { + "epoch": 1.9387992643806597, + "grad_norm": 0.37718138191631867, + "learning_rate": 3.350960741124522e-05, + "loss": 2.7298, + "step": 41643 + }, + { + "epoch": 1.9388458225667526, + "grad_norm": 0.3400340644919031, + "learning_rate": 3.35070502578986e-05, + "loss": 2.6483, + "step": 41644 + }, + { + "epoch": 1.9388923807528458, + "grad_norm": 0.3606769623027294, + "learning_rate": 3.3504493152957154e-05, + "loss": 2.6764, + "step": 41645 + }, + { + "epoch": 1.9389389389389389, + "grad_norm": 0.31273382782009207, + "learning_rate": 3.3501936096428396e-05, + "loss": 2.7686, + "step": 41646 + }, 
+ { + "epoch": 1.938985497125032, + "grad_norm": 0.3504250817390612, + "learning_rate": 3.3499379088319836e-05, + "loss": 2.676, + "step": 41647 + }, + { + "epoch": 1.939032055311125, + "grad_norm": 0.3127755369043031, + "learning_rate": 3.349682212863897e-05, + "loss": 2.7117, + "step": 41648 + }, + { + "epoch": 1.9390786134972182, + "grad_norm": 0.3298884121219892, + "learning_rate": 3.3494265217393316e-05, + "loss": 2.5901, + "step": 41649 + }, + { + "epoch": 1.9391251716833113, + "grad_norm": 0.3328367054661489, + "learning_rate": 3.3491708354590365e-05, + "loss": 2.6213, + "step": 41650 + }, + { + "epoch": 1.9391717298694044, + "grad_norm": 0.3362683838852364, + "learning_rate": 3.34891515402376e-05, + "loss": 2.674, + "step": 41651 + }, + { + "epoch": 1.9392182880554973, + "grad_norm": 0.33256466628621656, + "learning_rate": 3.3486594774342574e-05, + "loss": 2.633, + "step": 41652 + }, + { + "epoch": 1.9392648462415905, + "grad_norm": 0.33368246020957587, + "learning_rate": 3.348403805691274e-05, + "loss": 2.6993, + "step": 41653 + }, + { + "epoch": 1.9393114044276833, + "grad_norm": 0.3540133835351269, + "learning_rate": 3.348148138795565e-05, + "loss": 2.6497, + "step": 41654 + }, + { + "epoch": 1.9393579626137765, + "grad_norm": 0.3366849579904424, + "learning_rate": 3.347892476747878e-05, + "loss": 2.7142, + "step": 41655 + }, + { + "epoch": 1.9394045207998696, + "grad_norm": 0.34579351576385825, + "learning_rate": 3.347636819548963e-05, + "loss": 2.639, + "step": 41656 + }, + { + "epoch": 1.9394510789859627, + "grad_norm": 0.33948796342324955, + "learning_rate": 3.347381167199571e-05, + "loss": 2.7276, + "step": 41657 + }, + { + "epoch": 1.9394976371720558, + "grad_norm": 0.3392370881264768, + "learning_rate": 3.347125519700454e-05, + "loss": 2.7216, + "step": 41658 + }, + { + "epoch": 1.939544195358149, + "grad_norm": 0.3488748750560971, + "learning_rate": 3.346869877052359e-05, + "loss": 2.6872, + "step": 41659 + }, + { + "epoch": 1.939590753544242, + "grad_norm": 0.3376010384507092, + "learning_rate": 3.34661423925604e-05, + "loss": 2.6309, + "step": 41660 + }, + { + "epoch": 1.939637311730335, + "grad_norm": 0.3377486206382957, + "learning_rate": 3.3463586063122445e-05, + "loss": 2.6409, + "step": 41661 + }, + { + "epoch": 1.939683869916428, + "grad_norm": 0.31879629300999063, + "learning_rate": 3.346102978221724e-05, + "loss": 2.7082, + "step": 41662 + }, + { + "epoch": 1.9397304281025212, + "grad_norm": 0.3116632159040519, + "learning_rate": 3.345847354985226e-05, + "loss": 2.6473, + "step": 41663 + }, + { + "epoch": 1.939776986288614, + "grad_norm": 0.3379554745144248, + "learning_rate": 3.3455917366035055e-05, + "loss": 2.6301, + "step": 41664 + }, + { + "epoch": 1.9398235444747072, + "grad_norm": 0.3319988302005858, + "learning_rate": 3.3453361230773104e-05, + "loss": 2.7068, + "step": 41665 + }, + { + "epoch": 1.9398701026608003, + "grad_norm": 0.31878436076909905, + "learning_rate": 3.345080514407388e-05, + "loss": 2.7938, + "step": 41666 + }, + { + "epoch": 1.9399166608468934, + "grad_norm": 0.3377608096806504, + "learning_rate": 3.344824910594494e-05, + "loss": 2.718, + "step": 41667 + }, + { + "epoch": 1.9399632190329865, + "grad_norm": 0.35847553319863545, + "learning_rate": 3.3445693116393763e-05, + "loss": 2.6676, + "step": 41668 + }, + { + "epoch": 1.9400097772190796, + "grad_norm": 0.3290257701334381, + "learning_rate": 3.344313717542782e-05, + "loss": 2.6869, + "step": 41669 + }, + { + "epoch": 1.9400563354051727, + "grad_norm": 0.33348742090684513, + 
"learning_rate": 3.344058128305465e-05, + "loss": 2.7204, + "step": 41670 + }, + { + "epoch": 1.9401028935912656, + "grad_norm": 0.3522457073350182, + "learning_rate": 3.3438025439281745e-05, + "loss": 2.6462, + "step": 41671 + }, + { + "epoch": 1.9401494517773588, + "grad_norm": 0.36268102013120795, + "learning_rate": 3.34354696441166e-05, + "loss": 2.7256, + "step": 41672 + }, + { + "epoch": 1.9401960099634519, + "grad_norm": 0.3250160899991565, + "learning_rate": 3.343291389756673e-05, + "loss": 2.6676, + "step": 41673 + }, + { + "epoch": 1.9402425681495448, + "grad_norm": 0.3371149148738349, + "learning_rate": 3.343035819963961e-05, + "loss": 2.7388, + "step": 41674 + }, + { + "epoch": 1.9402891263356379, + "grad_norm": 0.3462002267435115, + "learning_rate": 3.3427802550342775e-05, + "loss": 2.752, + "step": 41675 + }, + { + "epoch": 1.940335684521731, + "grad_norm": 0.32697653568455315, + "learning_rate": 3.342524694968368e-05, + "loss": 2.6416, + "step": 41676 + }, + { + "epoch": 1.9403822427078241, + "grad_norm": 0.345431742045594, + "learning_rate": 3.3422691397669876e-05, + "loss": 2.6263, + "step": 41677 + }, + { + "epoch": 1.9404288008939172, + "grad_norm": 0.3576125239988517, + "learning_rate": 3.3420135894308835e-05, + "loss": 2.7294, + "step": 41678 + }, + { + "epoch": 1.9404753590800103, + "grad_norm": 0.3545011656263629, + "learning_rate": 3.341758043960805e-05, + "loss": 2.6881, + "step": 41679 + }, + { + "epoch": 1.9405219172661035, + "grad_norm": 0.38584025031276986, + "learning_rate": 3.3415025033575044e-05, + "loss": 2.7498, + "step": 41680 + }, + { + "epoch": 1.9405684754521964, + "grad_norm": 0.3344860194760703, + "learning_rate": 3.3412469676217296e-05, + "loss": 2.6275, + "step": 41681 + }, + { + "epoch": 1.9406150336382895, + "grad_norm": 0.35841360357639607, + "learning_rate": 3.3409914367542326e-05, + "loss": 2.6807, + "step": 41682 + }, + { + "epoch": 1.9406615918243824, + "grad_norm": 0.32454724920338074, + "learning_rate": 3.340735910755762e-05, + "loss": 2.7428, + "step": 41683 + }, + { + "epoch": 1.9407081500104755, + "grad_norm": 0.3564647054337364, + "learning_rate": 3.340480389627068e-05, + "loss": 2.6974, + "step": 41684 + }, + { + "epoch": 1.9407547081965686, + "grad_norm": 0.3295191526531437, + "learning_rate": 3.3402248733689003e-05, + "loss": 2.6481, + "step": 41685 + }, + { + "epoch": 1.9408012663826617, + "grad_norm": 0.34785158442837605, + "learning_rate": 3.339969361982009e-05, + "loss": 2.7049, + "step": 41686 + }, + { + "epoch": 1.9408478245687548, + "grad_norm": 0.35023251412877915, + "learning_rate": 3.3397138554671445e-05, + "loss": 2.616, + "step": 41687 + }, + { + "epoch": 1.940894382754848, + "grad_norm": 0.32832813433375263, + "learning_rate": 3.339458353825058e-05, + "loss": 2.7292, + "step": 41688 + }, + { + "epoch": 1.940940940940941, + "grad_norm": 0.33493981102267295, + "learning_rate": 3.3392028570564935e-05, + "loss": 2.7379, + "step": 41689 + }, + { + "epoch": 1.9409874991270342, + "grad_norm": 0.3820075611955095, + "learning_rate": 3.338947365162208e-05, + "loss": 2.702, + "step": 41690 + }, + { + "epoch": 1.941034057313127, + "grad_norm": 0.32375927745388916, + "learning_rate": 3.3386918781429486e-05, + "loss": 2.6644, + "step": 41691 + }, + { + "epoch": 1.9410806154992202, + "grad_norm": 0.33746390444351176, + "learning_rate": 3.338436395999463e-05, + "loss": 2.7583, + "step": 41692 + }, + { + "epoch": 1.941127173685313, + "grad_norm": 0.3633510516725625, + "learning_rate": 3.338180918732504e-05, + "loss": 2.709, + "step": 
41693 + }, + { + "epoch": 1.9411737318714062, + "grad_norm": 0.33147871783288824, + "learning_rate": 3.3379254463428187e-05, + "loss": 2.5947, + "step": 41694 + }, + { + "epoch": 1.9412202900574993, + "grad_norm": 0.35336145108287587, + "learning_rate": 3.337669978831161e-05, + "loss": 2.6632, + "step": 41695 + }, + { + "epoch": 1.9412668482435924, + "grad_norm": 0.33683656508177723, + "learning_rate": 3.337414516198277e-05, + "loss": 2.6061, + "step": 41696 + }, + { + "epoch": 1.9413134064296855, + "grad_norm": 0.3257860514571081, + "learning_rate": 3.337159058444917e-05, + "loss": 2.6648, + "step": 41697 + }, + { + "epoch": 1.9413599646157786, + "grad_norm": 0.3497327598015021, + "learning_rate": 3.3369036055718326e-05, + "loss": 2.6849, + "step": 41698 + }, + { + "epoch": 1.9414065228018718, + "grad_norm": 0.3339197958900645, + "learning_rate": 3.336648157579771e-05, + "loss": 2.7538, + "step": 41699 + }, + { + "epoch": 1.9414530809879649, + "grad_norm": 0.32134638626544926, + "learning_rate": 3.3363927144694845e-05, + "loss": 2.7789, + "step": 41700 + }, + { + "epoch": 1.9414996391740578, + "grad_norm": 0.35624533725184687, + "learning_rate": 3.336137276241722e-05, + "loss": 2.6186, + "step": 41701 + }, + { + "epoch": 1.9415461973601509, + "grad_norm": 0.3476735466204119, + "learning_rate": 3.335881842897229e-05, + "loss": 2.6598, + "step": 41702 + }, + { + "epoch": 1.9415927555462438, + "grad_norm": 0.3261740321082571, + "learning_rate": 3.3356264144367624e-05, + "loss": 2.6182, + "step": 41703 + }, + { + "epoch": 1.941639313732337, + "grad_norm": 0.3556803142270701, + "learning_rate": 3.335370990861067e-05, + "loss": 2.7894, + "step": 41704 + }, + { + "epoch": 1.94168587191843, + "grad_norm": 0.32749046254564773, + "learning_rate": 3.335115572170894e-05, + "loss": 2.6991, + "step": 41705 + }, + { + "epoch": 1.9417324301045231, + "grad_norm": 0.3438452988973415, + "learning_rate": 3.334860158366994e-05, + "loss": 2.6269, + "step": 41706 + }, + { + "epoch": 1.9417789882906162, + "grad_norm": 0.3379128307741635, + "learning_rate": 3.334604749450112e-05, + "loss": 2.5473, + "step": 41707 + }, + { + "epoch": 1.9418255464767094, + "grad_norm": 0.3512835746955505, + "learning_rate": 3.334349345421004e-05, + "loss": 2.6796, + "step": 41708 + }, + { + "epoch": 1.9418721046628025, + "grad_norm": 0.3335213014231684, + "learning_rate": 3.3340939462804164e-05, + "loss": 2.7343, + "step": 41709 + }, + { + "epoch": 1.9419186628488954, + "grad_norm": 0.35920300425533086, + "learning_rate": 3.333838552029098e-05, + "loss": 2.6448, + "step": 41710 + }, + { + "epoch": 1.9419652210349885, + "grad_norm": 0.3591771262995622, + "learning_rate": 3.3335831626677995e-05, + "loss": 2.664, + "step": 41711 + }, + { + "epoch": 1.9420117792210816, + "grad_norm": 0.3354137564169435, + "learning_rate": 3.33332777819727e-05, + "loss": 2.7247, + "step": 41712 + }, + { + "epoch": 1.9420583374071745, + "grad_norm": 0.330964370943677, + "learning_rate": 3.33307239861826e-05, + "loss": 2.6347, + "step": 41713 + }, + { + "epoch": 1.9421048955932676, + "grad_norm": 0.33029611903880285, + "learning_rate": 3.332817023931519e-05, + "loss": 2.7078, + "step": 41714 + }, + { + "epoch": 1.9421514537793607, + "grad_norm": 0.34391775450209144, + "learning_rate": 3.332561654137793e-05, + "loss": 2.621, + "step": 41715 + }, + { + "epoch": 1.9421980119654538, + "grad_norm": 0.33344585041503466, + "learning_rate": 3.3323062892378364e-05, + "loss": 2.5861, + "step": 41716 + }, + { + "epoch": 1.942244570151547, + "grad_norm": 
0.3400290874409788, + "learning_rate": 3.332050929232396e-05, + "loss": 2.6155, + "step": 41717 + }, + { + "epoch": 1.94229112833764, + "grad_norm": 0.34369091052775413, + "learning_rate": 3.331795574122222e-05, + "loss": 2.578, + "step": 41718 + }, + { + "epoch": 1.9423376865237332, + "grad_norm": 0.3050389109933761, + "learning_rate": 3.331540223908064e-05, + "loss": 2.6307, + "step": 41719 + }, + { + "epoch": 1.942384244709826, + "grad_norm": 0.33505990847075073, + "learning_rate": 3.3312848785906695e-05, + "loss": 2.5893, + "step": 41720 + }, + { + "epoch": 1.9424308028959192, + "grad_norm": 0.33359100671202235, + "learning_rate": 3.3310295381707915e-05, + "loss": 2.6952, + "step": 41721 + }, + { + "epoch": 1.9424773610820123, + "grad_norm": 0.3168198354029548, + "learning_rate": 3.3307742026491756e-05, + "loss": 2.5639, + "step": 41722 + }, + { + "epoch": 1.9425239192681052, + "grad_norm": 0.36258410255466156, + "learning_rate": 3.330518872026575e-05, + "loss": 2.6205, + "step": 41723 + }, + { + "epoch": 1.9425704774541983, + "grad_norm": 0.3715267334529913, + "learning_rate": 3.3302635463037355e-05, + "loss": 2.6017, + "step": 41724 + }, + { + "epoch": 1.9426170356402914, + "grad_norm": 0.36593284449335445, + "learning_rate": 3.330008225481408e-05, + "loss": 2.7186, + "step": 41725 + }, + { + "epoch": 1.9426635938263845, + "grad_norm": 0.38127031332500433, + "learning_rate": 3.329752909560343e-05, + "loss": 2.6753, + "step": 41726 + }, + { + "epoch": 1.9427101520124777, + "grad_norm": 0.35900577781654675, + "learning_rate": 3.3294975985412886e-05, + "loss": 2.6686, + "step": 41727 + }, + { + "epoch": 1.9427567101985708, + "grad_norm": 0.3588863735850575, + "learning_rate": 3.329242292424992e-05, + "loss": 2.6036, + "step": 41728 + }, + { + "epoch": 1.9428032683846639, + "grad_norm": 0.3550683140479245, + "learning_rate": 3.3289869912122066e-05, + "loss": 2.6757, + "step": 41729 + }, + { + "epoch": 1.9428498265707568, + "grad_norm": 0.3258771640398643, + "learning_rate": 3.328731694903678e-05, + "loss": 2.7037, + "step": 41730 + }, + { + "epoch": 1.94289638475685, + "grad_norm": 0.3313150349734989, + "learning_rate": 3.3284764035001595e-05, + "loss": 2.7572, + "step": 41731 + }, + { + "epoch": 1.9429429429429428, + "grad_norm": 0.37497128444040084, + "learning_rate": 3.328221117002398e-05, + "loss": 2.658, + "step": 41732 + }, + { + "epoch": 1.942989501129036, + "grad_norm": 0.33372958837915884, + "learning_rate": 3.3279658354111405e-05, + "loss": 2.6512, + "step": 41733 + }, + { + "epoch": 1.943036059315129, + "grad_norm": 0.34648588005137426, + "learning_rate": 3.327710558727141e-05, + "loss": 2.5767, + "step": 41734 + }, + { + "epoch": 1.9430826175012221, + "grad_norm": 0.3462471205405025, + "learning_rate": 3.3274552869511443e-05, + "loss": 2.7901, + "step": 41735 + }, + { + "epoch": 1.9431291756873152, + "grad_norm": 0.32391784292455494, + "learning_rate": 3.327200020083903e-05, + "loss": 2.6939, + "step": 41736 + }, + { + "epoch": 1.9431757338734084, + "grad_norm": 0.3326038123582247, + "learning_rate": 3.326944758126164e-05, + "loss": 2.6314, + "step": 41737 + }, + { + "epoch": 1.9432222920595015, + "grad_norm": 0.3185446037169646, + "learning_rate": 3.326689501078678e-05, + "loss": 2.7153, + "step": 41738 + }, + { + "epoch": 1.9432688502455946, + "grad_norm": 0.3246513381800631, + "learning_rate": 3.326434248942193e-05, + "loss": 2.7127, + "step": 41739 + }, + { + "epoch": 1.9433154084316875, + "grad_norm": 0.34196018276237833, + "learning_rate": 3.3261790017174575e-05, + 
"loss": 2.6663, + "step": 41740 + }, + { + "epoch": 1.9433619666177806, + "grad_norm": 0.3361569410973458, + "learning_rate": 3.325923759405224e-05, + "loss": 2.748, + "step": 41741 + }, + { + "epoch": 1.9434085248038735, + "grad_norm": 0.36047402852434396, + "learning_rate": 3.325668522006239e-05, + "loss": 2.7091, + "step": 41742 + }, + { + "epoch": 1.9434550829899666, + "grad_norm": 0.3208806545844773, + "learning_rate": 3.32541328952125e-05, + "loss": 2.7387, + "step": 41743 + }, + { + "epoch": 1.9435016411760597, + "grad_norm": 0.32287647136494385, + "learning_rate": 3.32515806195101e-05, + "loss": 2.7297, + "step": 41744 + }, + { + "epoch": 1.9435481993621528, + "grad_norm": 0.34271973241858905, + "learning_rate": 3.324902839296266e-05, + "loss": 2.7781, + "step": 41745 + }, + { + "epoch": 1.943594757548246, + "grad_norm": 0.3594744105425866, + "learning_rate": 3.3246476215577646e-05, + "loss": 2.7344, + "step": 41746 + }, + { + "epoch": 1.943641315734339, + "grad_norm": 0.35335900823368793, + "learning_rate": 3.324392408736261e-05, + "loss": 2.7184, + "step": 41747 + }, + { + "epoch": 1.9436878739204322, + "grad_norm": 0.34445347344048655, + "learning_rate": 3.324137200832498e-05, + "loss": 2.6627, + "step": 41748 + }, + { + "epoch": 1.943734432106525, + "grad_norm": 0.36498238056018023, + "learning_rate": 3.323881997847229e-05, + "loss": 2.6447, + "step": 41749 + }, + { + "epoch": 1.9437809902926182, + "grad_norm": 0.3393624035570148, + "learning_rate": 3.3236267997812e-05, + "loss": 2.7204, + "step": 41750 + }, + { + "epoch": 1.9438275484787113, + "grad_norm": 0.34166363320462856, + "learning_rate": 3.323371606635161e-05, + "loss": 2.7264, + "step": 41751 + }, + { + "epoch": 1.9438741066648042, + "grad_norm": 0.3569363944509391, + "learning_rate": 3.323116418409863e-05, + "loss": 2.702, + "step": 41752 + }, + { + "epoch": 1.9439206648508973, + "grad_norm": 0.32479997118198717, + "learning_rate": 3.32286123510605e-05, + "loss": 2.6907, + "step": 41753 + }, + { + "epoch": 1.9439672230369904, + "grad_norm": 0.3419305753414468, + "learning_rate": 3.3226060567244765e-05, + "loss": 2.6862, + "step": 41754 + }, + { + "epoch": 1.9440137812230835, + "grad_norm": 0.3441653191920897, + "learning_rate": 3.322350883265889e-05, + "loss": 2.7436, + "step": 41755 + }, + { + "epoch": 1.9440603394091767, + "grad_norm": 0.34971067450657234, + "learning_rate": 3.322095714731035e-05, + "loss": 2.7431, + "step": 41756 + }, + { + "epoch": 1.9441068975952698, + "grad_norm": 0.31926267281659604, + "learning_rate": 3.321840551120665e-05, + "loss": 2.645, + "step": 41757 + }, + { + "epoch": 1.944153455781363, + "grad_norm": 0.3423065655761254, + "learning_rate": 3.321585392435527e-05, + "loss": 2.6684, + "step": 41758 + }, + { + "epoch": 1.9442000139674558, + "grad_norm": 0.3548144398502692, + "learning_rate": 3.321330238676372e-05, + "loss": 2.7085, + "step": 41759 + }, + { + "epoch": 1.944246572153549, + "grad_norm": 0.3260086463599725, + "learning_rate": 3.321075089843948e-05, + "loss": 2.6733, + "step": 41760 + }, + { + "epoch": 1.944293130339642, + "grad_norm": 0.33444465509870996, + "learning_rate": 3.320819945939002e-05, + "loss": 2.6138, + "step": 41761 + }, + { + "epoch": 1.944339688525735, + "grad_norm": 0.3556313882241609, + "learning_rate": 3.320564806962285e-05, + "loss": 2.7026, + "step": 41762 + }, + { + "epoch": 1.944386246711828, + "grad_norm": 0.34597077716431707, + "learning_rate": 3.320309672914544e-05, + "loss": 2.6044, + "step": 41763 + }, + { + "epoch": 1.9444328048979211, + 
"grad_norm": 0.36619348411334396, + "learning_rate": 3.320054543796528e-05, + "loss": 2.7247, + "step": 41764 + }, + { + "epoch": 1.9444793630840143, + "grad_norm": 0.3605836912700573, + "learning_rate": 3.319799419608988e-05, + "loss": 2.6256, + "step": 41765 + }, + { + "epoch": 1.9445259212701074, + "grad_norm": 0.33653639352512277, + "learning_rate": 3.3195443003526694e-05, + "loss": 2.5736, + "step": 41766 + }, + { + "epoch": 1.9445724794562005, + "grad_norm": 0.3478484712952023, + "learning_rate": 3.319289186028324e-05, + "loss": 2.7103, + "step": 41767 + }, + { + "epoch": 1.9446190376422936, + "grad_norm": 0.37277223307792867, + "learning_rate": 3.3190340766367e-05, + "loss": 2.5743, + "step": 41768 + }, + { + "epoch": 1.9446655958283865, + "grad_norm": 0.35003752107782066, + "learning_rate": 3.318778972178542e-05, + "loss": 2.6866, + "step": 41769 + }, + { + "epoch": 1.9447121540144796, + "grad_norm": 0.353320948809301, + "learning_rate": 3.318523872654606e-05, + "loss": 2.8077, + "step": 41770 + }, + { + "epoch": 1.9447587122005725, + "grad_norm": 0.34794002378692285, + "learning_rate": 3.3182687780656336e-05, + "loss": 2.6575, + "step": 41771 + }, + { + "epoch": 1.9448052703866656, + "grad_norm": 0.3464306802893091, + "learning_rate": 3.318013688412379e-05, + "loss": 2.6582, + "step": 41772 + }, + { + "epoch": 1.9448518285727587, + "grad_norm": 0.34240094443557223, + "learning_rate": 3.3177586036955886e-05, + "loss": 2.693, + "step": 41773 + }, + { + "epoch": 1.9448983867588518, + "grad_norm": 0.33682882000630016, + "learning_rate": 3.3175035239160094e-05, + "loss": 2.6735, + "step": 41774 + }, + { + "epoch": 1.944944944944945, + "grad_norm": 0.3410603234304615, + "learning_rate": 3.3172484490743924e-05, + "loss": 2.7168, + "step": 41775 + }, + { + "epoch": 1.944991503131038, + "grad_norm": 0.3649112884163902, + "learning_rate": 3.3169933791714844e-05, + "loss": 2.7242, + "step": 41776 + }, + { + "epoch": 1.9450380613171312, + "grad_norm": 0.3457069134846273, + "learning_rate": 3.316738314208037e-05, + "loss": 2.741, + "step": 41777 + }, + { + "epoch": 1.9450846195032243, + "grad_norm": 0.4026947693146589, + "learning_rate": 3.3164832541847965e-05, + "loss": 2.7115, + "step": 41778 + }, + { + "epoch": 1.9451311776893172, + "grad_norm": 0.33459616429664707, + "learning_rate": 3.316228199102509e-05, + "loss": 2.7025, + "step": 41779 + }, + { + "epoch": 1.9451777358754103, + "grad_norm": 0.35580021144619467, + "learning_rate": 3.3159731489619296e-05, + "loss": 2.6412, + "step": 41780 + }, + { + "epoch": 1.9452242940615032, + "grad_norm": 0.35167315477153965, + "learning_rate": 3.315718103763802e-05, + "loss": 2.657, + "step": 41781 + }, + { + "epoch": 1.9452708522475963, + "grad_norm": 0.34277813109902167, + "learning_rate": 3.3154630635088734e-05, + "loss": 2.6383, + "step": 41782 + }, + { + "epoch": 1.9453174104336894, + "grad_norm": 0.31041286892502984, + "learning_rate": 3.315208028197898e-05, + "loss": 2.7139, + "step": 41783 + }, + { + "epoch": 1.9453639686197826, + "grad_norm": 0.36301458177511403, + "learning_rate": 3.314952997831618e-05, + "loss": 2.7429, + "step": 41784 + }, + { + "epoch": 1.9454105268058757, + "grad_norm": 0.372759283054177, + "learning_rate": 3.3146979724107876e-05, + "loss": 2.6966, + "step": 41785 + }, + { + "epoch": 1.9454570849919688, + "grad_norm": 0.33267512060513077, + "learning_rate": 3.314442951936152e-05, + "loss": 2.6247, + "step": 41786 + }, + { + "epoch": 1.945503643178062, + "grad_norm": 0.32379467457929395, + "learning_rate": 
3.3141879364084594e-05, + "loss": 2.7172, + "step": 41787 + }, + { + "epoch": 1.945550201364155, + "grad_norm": 0.34363874021765495, + "learning_rate": 3.313932925828461e-05, + "loss": 2.7425, + "step": 41788 + }, + { + "epoch": 1.945596759550248, + "grad_norm": 0.3303939175510282, + "learning_rate": 3.3136779201969e-05, + "loss": 2.6711, + "step": 41789 + }, + { + "epoch": 1.945643317736341, + "grad_norm": 0.3178546984398427, + "learning_rate": 3.313422919514531e-05, + "loss": 2.6321, + "step": 41790 + }, + { + "epoch": 1.945689875922434, + "grad_norm": 0.3400496512824438, + "learning_rate": 3.3131679237821e-05, + "loss": 2.6702, + "step": 41791 + }, + { + "epoch": 1.945736434108527, + "grad_norm": 0.35734015460063395, + "learning_rate": 3.3129129330003515e-05, + "loss": 2.658, + "step": 41792 + }, + { + "epoch": 1.9457829922946202, + "grad_norm": 0.3641422709167089, + "learning_rate": 3.312657947170041e-05, + "loss": 2.6695, + "step": 41793 + }, + { + "epoch": 1.9458295504807133, + "grad_norm": 0.3485743850679731, + "learning_rate": 3.31240296629191e-05, + "loss": 2.6501, + "step": 41794 + }, + { + "epoch": 1.9458761086668064, + "grad_norm": 0.38542077882525616, + "learning_rate": 3.312147990366713e-05, + "loss": 2.7537, + "step": 41795 + }, + { + "epoch": 1.9459226668528995, + "grad_norm": 0.3383529920475845, + "learning_rate": 3.3118930193951945e-05, + "loss": 2.653, + "step": 41796 + }, + { + "epoch": 1.9459692250389926, + "grad_norm": 0.3472123086705402, + "learning_rate": 3.311638053378102e-05, + "loss": 2.6951, + "step": 41797 + }, + { + "epoch": 1.9460157832250855, + "grad_norm": 0.36252889556339873, + "learning_rate": 3.311383092316187e-05, + "loss": 2.6581, + "step": 41798 + }, + { + "epoch": 1.9460623414111786, + "grad_norm": 0.3196284107131827, + "learning_rate": 3.3111281362101974e-05, + "loss": 2.6251, + "step": 41799 + }, + { + "epoch": 1.9461088995972717, + "grad_norm": 0.34043281431795414, + "learning_rate": 3.310873185060879e-05, + "loss": 2.631, + "step": 41800 + }, + { + "epoch": 1.9461554577833646, + "grad_norm": 0.34884676294377315, + "learning_rate": 3.310618238868982e-05, + "loss": 2.6147, + "step": 41801 + }, + { + "epoch": 1.9462020159694577, + "grad_norm": 0.34127720124576033, + "learning_rate": 3.310363297635252e-05, + "loss": 2.5956, + "step": 41802 + }, + { + "epoch": 1.9462485741555509, + "grad_norm": 0.3504258711022369, + "learning_rate": 3.310108361360441e-05, + "loss": 2.6237, + "step": 41803 + }, + { + "epoch": 1.946295132341644, + "grad_norm": 0.3424682914451927, + "learning_rate": 3.309853430045297e-05, + "loss": 2.6809, + "step": 41804 + }, + { + "epoch": 1.946341690527737, + "grad_norm": 0.33466711191167786, + "learning_rate": 3.309598503690563e-05, + "loss": 2.6101, + "step": 41805 + }, + { + "epoch": 1.9463882487138302, + "grad_norm": 0.33674359156966105, + "learning_rate": 3.3093435822969934e-05, + "loss": 2.6297, + "step": 41806 + }, + { + "epoch": 1.9464348068999233, + "grad_norm": 0.35071399085215726, + "learning_rate": 3.309088665865332e-05, + "loss": 2.653, + "step": 41807 + }, + { + "epoch": 1.9464813650860162, + "grad_norm": 0.3327160290893935, + "learning_rate": 3.3088337543963306e-05, + "loss": 2.7066, + "step": 41808 + }, + { + "epoch": 1.9465279232721093, + "grad_norm": 0.3615436827455347, + "learning_rate": 3.3085788478907354e-05, + "loss": 2.7123, + "step": 41809 + }, + { + "epoch": 1.9465744814582024, + "grad_norm": 0.34848011013662605, + "learning_rate": 3.308323946349293e-05, + "loss": 2.6389, + "step": 41810 + }, + { + "epoch": 
1.9466210396442953, + "grad_norm": 0.32816139269695854, + "learning_rate": 3.3080690497727554e-05, + "loss": 2.6597, + "step": 41811 + }, + { + "epoch": 1.9466675978303885, + "grad_norm": 0.358069984131348, + "learning_rate": 3.307814158161866e-05, + "loss": 2.7745, + "step": 41812 + }, + { + "epoch": 1.9467141560164816, + "grad_norm": 0.352535887067181, + "learning_rate": 3.3075592715173774e-05, + "loss": 2.6062, + "step": 41813 + }, + { + "epoch": 1.9467607142025747, + "grad_norm": 0.34585940417662453, + "learning_rate": 3.307304389840036e-05, + "loss": 2.7024, + "step": 41814 + }, + { + "epoch": 1.9468072723886678, + "grad_norm": 0.3205272132825525, + "learning_rate": 3.307049513130587e-05, + "loss": 2.6417, + "step": 41815 + }, + { + "epoch": 1.946853830574761, + "grad_norm": 0.3560730697169051, + "learning_rate": 3.306794641389783e-05, + "loss": 2.6898, + "step": 41816 + }, + { + "epoch": 1.946900388760854, + "grad_norm": 0.34586740036758223, + "learning_rate": 3.3065397746183696e-05, + "loss": 2.8188, + "step": 41817 + }, + { + "epoch": 1.946946946946947, + "grad_norm": 0.34823349665320735, + "learning_rate": 3.306284912817094e-05, + "loss": 2.659, + "step": 41818 + }, + { + "epoch": 1.94699350513304, + "grad_norm": 0.350415076118158, + "learning_rate": 3.3060300559867066e-05, + "loss": 2.7221, + "step": 41819 + }, + { + "epoch": 1.947040063319133, + "grad_norm": 0.35473219764795483, + "learning_rate": 3.3057752041279516e-05, + "loss": 2.6828, + "step": 41820 + }, + { + "epoch": 1.947086621505226, + "grad_norm": 0.35999964210463675, + "learning_rate": 3.305520357241582e-05, + "loss": 2.629, + "step": 41821 + }, + { + "epoch": 1.9471331796913192, + "grad_norm": 0.3582820131727842, + "learning_rate": 3.305265515328343e-05, + "loss": 2.736, + "step": 41822 + }, + { + "epoch": 1.9471797378774123, + "grad_norm": 0.3712025792605456, + "learning_rate": 3.305010678388981e-05, + "loss": 2.721, + "step": 41823 + }, + { + "epoch": 1.9472262960635054, + "grad_norm": 0.3501332043748796, + "learning_rate": 3.304755846424248e-05, + "loss": 2.637, + "step": 41824 + }, + { + "epoch": 1.9472728542495985, + "grad_norm": 0.3569260266196969, + "learning_rate": 3.3045010194348884e-05, + "loss": 2.6595, + "step": 41825 + }, + { + "epoch": 1.9473194124356916, + "grad_norm": 0.35517915245860315, + "learning_rate": 3.3042461974216524e-05, + "loss": 2.6504, + "step": 41826 + }, + { + "epoch": 1.9473659706217847, + "grad_norm": 0.3615013677222422, + "learning_rate": 3.303991380385285e-05, + "loss": 2.7631, + "step": 41827 + }, + { + "epoch": 1.9474125288078776, + "grad_norm": 0.3567333619923882, + "learning_rate": 3.303736568326536e-05, + "loss": 2.6124, + "step": 41828 + }, + { + "epoch": 1.9474590869939707, + "grad_norm": 0.3524364627643483, + "learning_rate": 3.3034817612461556e-05, + "loss": 2.6406, + "step": 41829 + }, + { + "epoch": 1.9475056451800636, + "grad_norm": 0.3300949819841265, + "learning_rate": 3.303226959144885e-05, + "loss": 2.7449, + "step": 41830 + }, + { + "epoch": 1.9475522033661568, + "grad_norm": 0.34356134046847103, + "learning_rate": 3.302972162023479e-05, + "loss": 2.6443, + "step": 41831 + }, + { + "epoch": 1.9475987615522499, + "grad_norm": 0.3718329409553814, + "learning_rate": 3.302717369882683e-05, + "loss": 2.6862, + "step": 41832 + }, + { + "epoch": 1.947645319738343, + "grad_norm": 0.361037041687118, + "learning_rate": 3.302462582723241e-05, + "loss": 2.6469, + "step": 41833 + }, + { + "epoch": 1.947691877924436, + "grad_norm": 0.324295694792552, + "learning_rate": 
3.3022078005459077e-05, + "loss": 2.6853, + "step": 41834 + }, + { + "epoch": 1.9477384361105292, + "grad_norm": 0.3643426262584698, + "learning_rate": 3.3019530233514264e-05, + "loss": 2.7419, + "step": 41835 + }, + { + "epoch": 1.9477849942966223, + "grad_norm": 0.34255124348109944, + "learning_rate": 3.3016982511405435e-05, + "loss": 2.6311, + "step": 41836 + }, + { + "epoch": 1.9478315524827152, + "grad_norm": 0.33147833480200967, + "learning_rate": 3.301443483914012e-05, + "loss": 2.6433, + "step": 41837 + }, + { + "epoch": 1.9478781106688083, + "grad_norm": 0.3988151615909268, + "learning_rate": 3.301188721672574e-05, + "loss": 2.6104, + "step": 41838 + }, + { + "epoch": 1.9479246688549015, + "grad_norm": 0.32509340294863853, + "learning_rate": 3.300933964416981e-05, + "loss": 2.6933, + "step": 41839 + }, + { + "epoch": 1.9479712270409943, + "grad_norm": 0.32124933069093775, + "learning_rate": 3.3006792121479794e-05, + "loss": 2.5488, + "step": 41840 + }, + { + "epoch": 1.9480177852270875, + "grad_norm": 0.38265986270317737, + "learning_rate": 3.300424464866316e-05, + "loss": 2.7168, + "step": 41841 + }, + { + "epoch": 1.9480643434131806, + "grad_norm": 0.36145242414341244, + "learning_rate": 3.3001697225727414e-05, + "loss": 2.6706, + "step": 41842 + }, + { + "epoch": 1.9481109015992737, + "grad_norm": 0.3112613726053341, + "learning_rate": 3.299914985267998e-05, + "loss": 2.6376, + "step": 41843 + }, + { + "epoch": 1.9481574597853668, + "grad_norm": 0.3557584276987667, + "learning_rate": 3.29966025295284e-05, + "loss": 2.7322, + "step": 41844 + }, + { + "epoch": 1.94820401797146, + "grad_norm": 0.3747538200130765, + "learning_rate": 3.2994055256280105e-05, + "loss": 2.6564, + "step": 41845 + }, + { + "epoch": 1.948250576157553, + "grad_norm": 0.32594701895925304, + "learning_rate": 3.299150803294256e-05, + "loss": 2.6495, + "step": 41846 + }, + { + "epoch": 1.948297134343646, + "grad_norm": 0.33710997520767033, + "learning_rate": 3.298896085952329e-05, + "loss": 2.6603, + "step": 41847 + }, + { + "epoch": 1.948343692529739, + "grad_norm": 0.34363055782200186, + "learning_rate": 3.2986413736029736e-05, + "loss": 2.5769, + "step": 41848 + }, + { + "epoch": 1.9483902507158322, + "grad_norm": 0.3492442719734197, + "learning_rate": 3.2983866662469376e-05, + "loss": 2.6849, + "step": 41849 + }, + { + "epoch": 1.948436808901925, + "grad_norm": 0.36074862836933824, + "learning_rate": 3.2981319638849705e-05, + "loss": 2.7019, + "step": 41850 + }, + { + "epoch": 1.9484833670880182, + "grad_norm": 0.3352972313980513, + "learning_rate": 3.297877266517817e-05, + "loss": 2.6463, + "step": 41851 + }, + { + "epoch": 1.9485299252741113, + "grad_norm": 0.3438769835400233, + "learning_rate": 3.297622574146228e-05, + "loss": 2.7042, + "step": 41852 + }, + { + "epoch": 1.9485764834602044, + "grad_norm": 0.3793742264063852, + "learning_rate": 3.297367886770947e-05, + "loss": 2.6631, + "step": 41853 + }, + { + "epoch": 1.9486230416462975, + "grad_norm": 0.3348815615743831, + "learning_rate": 3.297113204392724e-05, + "loss": 2.7055, + "step": 41854 + }, + { + "epoch": 1.9486695998323906, + "grad_norm": 0.3470190948676614, + "learning_rate": 3.296858527012306e-05, + "loss": 2.6847, + "step": 41855 + }, + { + "epoch": 1.9487161580184837, + "grad_norm": 0.3508977647985541, + "learning_rate": 3.29660385463044e-05, + "loss": 2.7454, + "step": 41856 + }, + { + "epoch": 1.9487627162045766, + "grad_norm": 0.34560273463091384, + "learning_rate": 3.296349187247875e-05, + "loss": 2.6215, + "step": 41857 + }, + { + 
"epoch": 1.9488092743906698, + "grad_norm": 0.3576211662923265, + "learning_rate": 3.296094524865358e-05, + "loss": 2.6577, + "step": 41858 + }, + { + "epoch": 1.9488558325767626, + "grad_norm": 0.36054779061264935, + "learning_rate": 3.295839867483633e-05, + "loss": 2.7484, + "step": 41859 + }, + { + "epoch": 1.9489023907628558, + "grad_norm": 0.37099628967476966, + "learning_rate": 3.295585215103452e-05, + "loss": 2.7211, + "step": 41860 + }, + { + "epoch": 1.9489489489489489, + "grad_norm": 0.3432334553390125, + "learning_rate": 3.29533056772556e-05, + "loss": 2.686, + "step": 41861 + }, + { + "epoch": 1.948995507135042, + "grad_norm": 0.34367661130728405, + "learning_rate": 3.2950759253507045e-05, + "loss": 2.6271, + "step": 41862 + }, + { + "epoch": 1.949042065321135, + "grad_norm": 0.3422805450948301, + "learning_rate": 3.294821287979634e-05, + "loss": 2.6672, + "step": 41863 + }, + { + "epoch": 1.9490886235072282, + "grad_norm": 0.3416269028612837, + "learning_rate": 3.2945666556130946e-05, + "loss": 2.6295, + "step": 41864 + }, + { + "epoch": 1.9491351816933213, + "grad_norm": 0.3478065180148054, + "learning_rate": 3.294312028251835e-05, + "loss": 2.7641, + "step": 41865 + }, + { + "epoch": 1.9491817398794145, + "grad_norm": 0.34501414236764827, + "learning_rate": 3.294057405896599e-05, + "loss": 2.7035, + "step": 41866 + }, + { + "epoch": 1.9492282980655073, + "grad_norm": 0.3329822206043594, + "learning_rate": 3.293802788548139e-05, + "loss": 2.7742, + "step": 41867 + }, + { + "epoch": 1.9492748562516005, + "grad_norm": 0.33553372859908903, + "learning_rate": 3.2935481762072e-05, + "loss": 2.7062, + "step": 41868 + }, + { + "epoch": 1.9493214144376934, + "grad_norm": 0.3525259712802023, + "learning_rate": 3.293293568874526e-05, + "loss": 2.5864, + "step": 41869 + }, + { + "epoch": 1.9493679726237865, + "grad_norm": 0.3416935030126376, + "learning_rate": 3.29303896655087e-05, + "loss": 2.6625, + "step": 41870 + }, + { + "epoch": 1.9494145308098796, + "grad_norm": 0.34230743052983525, + "learning_rate": 3.292784369236976e-05, + "loss": 2.6294, + "step": 41871 + }, + { + "epoch": 1.9494610889959727, + "grad_norm": 0.3141122242291743, + "learning_rate": 3.2925297769335896e-05, + "loss": 2.6445, + "step": 41872 + }, + { + "epoch": 1.9495076471820658, + "grad_norm": 0.3573304984154905, + "learning_rate": 3.2922751896414624e-05, + "loss": 2.738, + "step": 41873 + }, + { + "epoch": 1.949554205368159, + "grad_norm": 0.3674818731358665, + "learning_rate": 3.292020607361337e-05, + "loss": 2.6922, + "step": 41874 + }, + { + "epoch": 1.949600763554252, + "grad_norm": 0.3232429275444894, + "learning_rate": 3.2917660300939655e-05, + "loss": 2.7216, + "step": 41875 + }, + { + "epoch": 1.949647321740345, + "grad_norm": 0.35914417662203263, + "learning_rate": 3.291511457840092e-05, + "loss": 2.6457, + "step": 41876 + }, + { + "epoch": 1.949693879926438, + "grad_norm": 0.35426161490469743, + "learning_rate": 3.291256890600463e-05, + "loss": 2.6815, + "step": 41877 + }, + { + "epoch": 1.9497404381125312, + "grad_norm": 0.356877845031374, + "learning_rate": 3.291002328375828e-05, + "loss": 2.5927, + "step": 41878 + }, + { + "epoch": 1.949786996298624, + "grad_norm": 0.34928182091818505, + "learning_rate": 3.2907477711669296e-05, + "loss": 2.6534, + "step": 41879 + }, + { + "epoch": 1.9498335544847172, + "grad_norm": 0.37091775401221905, + "learning_rate": 3.290493218974522e-05, + "loss": 2.6545, + "step": 41880 + }, + { + "epoch": 1.9498801126708103, + "grad_norm": 0.3588435652668977, + 
"learning_rate": 3.2902386717993475e-05, + "loss": 2.6615, + "step": 41881 + }, + { + "epoch": 1.9499266708569034, + "grad_norm": 0.3565963849284363, + "learning_rate": 3.2899841296421516e-05, + "loss": 2.6357, + "step": 41882 + }, + { + "epoch": 1.9499732290429965, + "grad_norm": 0.35060580183709705, + "learning_rate": 3.289729592503686e-05, + "loss": 2.6342, + "step": 41883 + }, + { + "epoch": 1.9500197872290896, + "grad_norm": 0.34526507719787103, + "learning_rate": 3.289475060384694e-05, + "loss": 2.6504, + "step": 41884 + }, + { + "epoch": 1.9500663454151828, + "grad_norm": 0.3555324677513144, + "learning_rate": 3.289220533285926e-05, + "loss": 2.7444, + "step": 41885 + }, + { + "epoch": 1.9501129036012756, + "grad_norm": 0.3320387378686906, + "learning_rate": 3.288966011208128e-05, + "loss": 2.7334, + "step": 41886 + }, + { + "epoch": 1.9501594617873688, + "grad_norm": 0.3337319687563587, + "learning_rate": 3.2887114941520425e-05, + "loss": 2.6816, + "step": 41887 + }, + { + "epoch": 1.9502060199734619, + "grad_norm": 0.3459350048333241, + "learning_rate": 3.288456982118423e-05, + "loss": 2.6256, + "step": 41888 + }, + { + "epoch": 1.9502525781595548, + "grad_norm": 0.34197309862475417, + "learning_rate": 3.288202475108014e-05, + "loss": 2.6131, + "step": 41889 + }, + { + "epoch": 1.9502991363456479, + "grad_norm": 0.3563406056877101, + "learning_rate": 3.287947973121561e-05, + "loss": 2.7256, + "step": 41890 + }, + { + "epoch": 1.950345694531741, + "grad_norm": 0.3482811056719847, + "learning_rate": 3.2876934761598124e-05, + "loss": 2.6966, + "step": 41891 + }, + { + "epoch": 1.9503922527178341, + "grad_norm": 0.3409086324520351, + "learning_rate": 3.287438984223513e-05, + "loss": 2.6251, + "step": 41892 + }, + { + "epoch": 1.9504388109039272, + "grad_norm": 0.3542109153053912, + "learning_rate": 3.287184497313414e-05, + "loss": 2.5839, + "step": 41893 + }, + { + "epoch": 1.9504853690900203, + "grad_norm": 0.32408162809222635, + "learning_rate": 3.28693001543026e-05, + "loss": 2.6515, + "step": 41894 + }, + { + "epoch": 1.9505319272761135, + "grad_norm": 0.3265570123476873, + "learning_rate": 3.2866755385747944e-05, + "loss": 2.6732, + "step": 41895 + }, + { + "epoch": 1.9505784854622064, + "grad_norm": 0.3390846703618545, + "learning_rate": 3.28642106674777e-05, + "loss": 2.7213, + "step": 41896 + }, + { + "epoch": 1.9506250436482995, + "grad_norm": 0.33664403161890993, + "learning_rate": 3.2861665999499286e-05, + "loss": 2.6858, + "step": 41897 + }, + { + "epoch": 1.9506716018343924, + "grad_norm": 0.34667907704498163, + "learning_rate": 3.2859121381820216e-05, + "loss": 2.7212, + "step": 41898 + }, + { + "epoch": 1.9507181600204855, + "grad_norm": 0.34187679088770234, + "learning_rate": 3.2856576814447936e-05, + "loss": 2.6227, + "step": 41899 + }, + { + "epoch": 1.9507647182065786, + "grad_norm": 0.33694606875566163, + "learning_rate": 3.2854032297389895e-05, + "loss": 2.6724, + "step": 41900 + }, + { + "epoch": 1.9508112763926717, + "grad_norm": 0.343384371092155, + "learning_rate": 3.285148783065361e-05, + "loss": 2.6046, + "step": 41901 + }, + { + "epoch": 1.9508578345787648, + "grad_norm": 0.33252927906646296, + "learning_rate": 3.284894341424649e-05, + "loss": 2.5846, + "step": 41902 + }, + { + "epoch": 1.950904392764858, + "grad_norm": 0.3874068748475475, + "learning_rate": 3.284639904817605e-05, + "loss": 2.6645, + "step": 41903 + }, + { + "epoch": 1.950950950950951, + "grad_norm": 0.33308183422140686, + "learning_rate": 3.284385473244974e-05, + "loss": 2.6173, + "step": 
41904 + }, + { + "epoch": 1.9509975091370442, + "grad_norm": 0.338887874704255, + "learning_rate": 3.2841310467075e-05, + "loss": 2.7282, + "step": 41905 + }, + { + "epoch": 1.951044067323137, + "grad_norm": 0.3541341090472667, + "learning_rate": 3.2838766252059355e-05, + "loss": 2.6468, + "step": 41906 + }, + { + "epoch": 1.9510906255092302, + "grad_norm": 0.3255706411734931, + "learning_rate": 3.283622208741023e-05, + "loss": 2.7334, + "step": 41907 + }, + { + "epoch": 1.951137183695323, + "grad_norm": 0.3591181696645308, + "learning_rate": 3.283367797313509e-05, + "loss": 2.6834, + "step": 41908 + }, + { + "epoch": 1.9511837418814162, + "grad_norm": 0.3602548968664024, + "learning_rate": 3.283113390924143e-05, + "loss": 2.7965, + "step": 41909 + }, + { + "epoch": 1.9512303000675093, + "grad_norm": 0.34772358970763595, + "learning_rate": 3.282858989573668e-05, + "loss": 2.6671, + "step": 41910 + }, + { + "epoch": 1.9512768582536024, + "grad_norm": 0.3442384063570611, + "learning_rate": 3.282604593262835e-05, + "loss": 2.6723, + "step": 41911 + }, + { + "epoch": 1.9513234164396955, + "grad_norm": 0.34560240224104793, + "learning_rate": 3.282350201992388e-05, + "loss": 2.6316, + "step": 41912 + }, + { + "epoch": 1.9513699746257887, + "grad_norm": 0.3238186773508953, + "learning_rate": 3.282095815763072e-05, + "loss": 2.562, + "step": 41913 + }, + { + "epoch": 1.9514165328118818, + "grad_norm": 0.3377567800193786, + "learning_rate": 3.281841434575638e-05, + "loss": 2.642, + "step": 41914 + }, + { + "epoch": 1.9514630909979749, + "grad_norm": 0.3254889668722715, + "learning_rate": 3.281587058430828e-05, + "loss": 2.6626, + "step": 41915 + }, + { + "epoch": 1.9515096491840678, + "grad_norm": 0.337969547423718, + "learning_rate": 3.281332687329393e-05, + "loss": 2.6893, + "step": 41916 + }, + { + "epoch": 1.951556207370161, + "grad_norm": 0.3325971763797516, + "learning_rate": 3.2810783212720766e-05, + "loss": 2.7193, + "step": 41917 + }, + { + "epoch": 1.9516027655562538, + "grad_norm": 0.368906625444295, + "learning_rate": 3.280823960259624e-05, + "loss": 2.7255, + "step": 41918 + }, + { + "epoch": 1.951649323742347, + "grad_norm": 0.343267722825752, + "learning_rate": 3.280569604292786e-05, + "loss": 2.7299, + "step": 41919 + }, + { + "epoch": 1.95169588192844, + "grad_norm": 0.3311468465780496, + "learning_rate": 3.280315253372304e-05, + "loss": 2.7548, + "step": 41920 + }, + { + "epoch": 1.9517424401145331, + "grad_norm": 0.34499762293583386, + "learning_rate": 3.28006090749893e-05, + "loss": 2.6013, + "step": 41921 + }, + { + "epoch": 1.9517889983006262, + "grad_norm": 0.39079714599188614, + "learning_rate": 3.279806566673408e-05, + "loss": 2.7278, + "step": 41922 + }, + { + "epoch": 1.9518355564867194, + "grad_norm": 0.34756435748291165, + "learning_rate": 3.2795522308964824e-05, + "loss": 2.7819, + "step": 41923 + }, + { + "epoch": 1.9518821146728125, + "grad_norm": 0.32844906760698284, + "learning_rate": 3.279297900168903e-05, + "loss": 2.6425, + "step": 41924 + }, + { + "epoch": 1.9519286728589054, + "grad_norm": 0.3548545955945431, + "learning_rate": 3.2790435744914146e-05, + "loss": 2.6563, + "step": 41925 + }, + { + "epoch": 1.9519752310449985, + "grad_norm": 0.3445197685103955, + "learning_rate": 3.2787892538647614e-05, + "loss": 2.6655, + "step": 41926 + }, + { + "epoch": 1.9520217892310916, + "grad_norm": 0.3425702549455223, + "learning_rate": 3.278534938289695e-05, + "loss": 2.7485, + "step": 41927 + }, + { + "epoch": 1.9520683474171845, + "grad_norm": 0.3441556039652327, + 
"learning_rate": 3.2782806277669574e-05, + "loss": 2.7485, + "step": 41928 + }, + { + "epoch": 1.9521149056032776, + "grad_norm": 0.31983353082071264, + "learning_rate": 3.278026322297299e-05, + "loss": 2.7059, + "step": 41929 + }, + { + "epoch": 1.9521614637893707, + "grad_norm": 0.34638706789372475, + "learning_rate": 3.2777720218814615e-05, + "loss": 2.7802, + "step": 41930 + }, + { + "epoch": 1.9522080219754638, + "grad_norm": 0.34141169419307754, + "learning_rate": 3.277517726520193e-05, + "loss": 2.6269, + "step": 41931 + }, + { + "epoch": 1.952254580161557, + "grad_norm": 0.34478345871748023, + "learning_rate": 3.2772634362142416e-05, + "loss": 2.6639, + "step": 41932 + }, + { + "epoch": 1.95230113834765, + "grad_norm": 0.32873518858679, + "learning_rate": 3.2770091509643504e-05, + "loss": 2.7109, + "step": 41933 + }, + { + "epoch": 1.9523476965337432, + "grad_norm": 0.3342187867713565, + "learning_rate": 3.276754870771269e-05, + "loss": 2.7322, + "step": 41934 + }, + { + "epoch": 1.952394254719836, + "grad_norm": 0.3562756657840894, + "learning_rate": 3.276500595635744e-05, + "loss": 2.7261, + "step": 41935 + }, + { + "epoch": 1.9524408129059292, + "grad_norm": 0.33809469326354324, + "learning_rate": 3.276246325558516e-05, + "loss": 2.726, + "step": 41936 + }, + { + "epoch": 1.9524873710920223, + "grad_norm": 0.3642428398884698, + "learning_rate": 3.275992060540338e-05, + "loss": 2.7101, + "step": 41937 + }, + { + "epoch": 1.9525339292781152, + "grad_norm": 0.34026742512855296, + "learning_rate": 3.275737800581953e-05, + "loss": 2.7732, + "step": 41938 + }, + { + "epoch": 1.9525804874642083, + "grad_norm": 0.3392927318753887, + "learning_rate": 3.275483545684107e-05, + "loss": 2.7439, + "step": 41939 + }, + { + "epoch": 1.9526270456503014, + "grad_norm": 0.3560233383271537, + "learning_rate": 3.2752292958475486e-05, + "loss": 2.7181, + "step": 41940 + }, + { + "epoch": 1.9526736038363945, + "grad_norm": 0.3396025680132477, + "learning_rate": 3.274975051073021e-05, + "loss": 2.7609, + "step": 41941 + }, + { + "epoch": 1.9527201620224877, + "grad_norm": 0.34594455524923556, + "learning_rate": 3.2747208113612725e-05, + "loss": 2.7092, + "step": 41942 + }, + { + "epoch": 1.9527667202085808, + "grad_norm": 0.3350402404714215, + "learning_rate": 3.274466576713048e-05, + "loss": 2.6872, + "step": 41943 + }, + { + "epoch": 1.952813278394674, + "grad_norm": 0.34515298788804866, + "learning_rate": 3.274212347129093e-05, + "loss": 2.6469, + "step": 41944 + }, + { + "epoch": 1.9528598365807668, + "grad_norm": 0.32785416739114387, + "learning_rate": 3.273958122610157e-05, + "loss": 2.6369, + "step": 41945 + }, + { + "epoch": 1.95290639476686, + "grad_norm": 0.3481644070291356, + "learning_rate": 3.273703903156981e-05, + "loss": 2.6325, + "step": 41946 + }, + { + "epoch": 1.9529529529529528, + "grad_norm": 0.3336224369425109, + "learning_rate": 3.273449688770316e-05, + "loss": 2.6656, + "step": 41947 + }, + { + "epoch": 1.952999511139046, + "grad_norm": 0.34152879023297117, + "learning_rate": 3.2731954794509056e-05, + "loss": 2.6194, + "step": 41948 + }, + { + "epoch": 1.953046069325139, + "grad_norm": 0.35163038986469936, + "learning_rate": 3.2729412751994954e-05, + "loss": 2.6482, + "step": 41949 + }, + { + "epoch": 1.9530926275112321, + "grad_norm": 0.32889442203740826, + "learning_rate": 3.272687076016834e-05, + "loss": 2.6735, + "step": 41950 + }, + { + "epoch": 1.9531391856973253, + "grad_norm": 0.3526362560775501, + "learning_rate": 3.272432881903665e-05, + "loss": 2.5993, + "step": 41951 
+ }, + { + "epoch": 1.9531857438834184, + "grad_norm": 0.3268357135431264, + "learning_rate": 3.2721786928607354e-05, + "loss": 2.5944, + "step": 41952 + }, + { + "epoch": 1.9532323020695115, + "grad_norm": 0.31332603327721686, + "learning_rate": 3.2719245088887915e-05, + "loss": 2.5867, + "step": 41953 + }, + { + "epoch": 1.9532788602556046, + "grad_norm": 0.3357119379914945, + "learning_rate": 3.271670329988579e-05, + "loss": 2.6561, + "step": 41954 + }, + { + "epoch": 1.9533254184416975, + "grad_norm": 0.3283100707230217, + "learning_rate": 3.271416156160844e-05, + "loss": 2.6789, + "step": 41955 + }, + { + "epoch": 1.9533719766277906, + "grad_norm": 0.342480556038406, + "learning_rate": 3.2711619874063306e-05, + "loss": 2.7491, + "step": 41956 + }, + { + "epoch": 1.9534185348138835, + "grad_norm": 0.3475590542431612, + "learning_rate": 3.2709078237257874e-05, + "loss": 2.7835, + "step": 41957 + }, + { + "epoch": 1.9534650929999766, + "grad_norm": 0.3366014590514812, + "learning_rate": 3.27065366511996e-05, + "loss": 2.6719, + "step": 41958 + }, + { + "epoch": 1.9535116511860697, + "grad_norm": 0.3206814266767464, + "learning_rate": 3.2703995115895924e-05, + "loss": 2.645, + "step": 41959 + }, + { + "epoch": 1.9535582093721628, + "grad_norm": 0.32803558313510117, + "learning_rate": 3.270145363135434e-05, + "loss": 2.6532, + "step": 41960 + }, + { + "epoch": 1.953604767558256, + "grad_norm": 0.3137292748887691, + "learning_rate": 3.269891219758228e-05, + "loss": 2.7002, + "step": 41961 + }, + { + "epoch": 1.953651325744349, + "grad_norm": 0.3294422591122146, + "learning_rate": 3.2696370814587186e-05, + "loss": 2.6876, + "step": 41962 + }, + { + "epoch": 1.9536978839304422, + "grad_norm": 0.352940700099663, + "learning_rate": 3.269382948237656e-05, + "loss": 2.7414, + "step": 41963 + }, + { + "epoch": 1.953744442116535, + "grad_norm": 0.35132413005328794, + "learning_rate": 3.2691288200957825e-05, + "loss": 2.6627, + "step": 41964 + }, + { + "epoch": 1.9537910003026282, + "grad_norm": 0.34665019213285814, + "learning_rate": 3.2688746970338464e-05, + "loss": 2.6302, + "step": 41965 + }, + { + "epoch": 1.9538375584887213, + "grad_norm": 0.38119342101345527, + "learning_rate": 3.268620579052593e-05, + "loss": 2.6579, + "step": 41966 + }, + { + "epoch": 1.9538841166748142, + "grad_norm": 0.34307690762327253, + "learning_rate": 3.268366466152767e-05, + "loss": 2.6951, + "step": 41967 + }, + { + "epoch": 1.9539306748609073, + "grad_norm": 0.36065695870755743, + "learning_rate": 3.2681123583351155e-05, + "loss": 2.5487, + "step": 41968 + }, + { + "epoch": 1.9539772330470004, + "grad_norm": 0.35290798232459886, + "learning_rate": 3.267858255600381e-05, + "loss": 2.7035, + "step": 41969 + }, + { + "epoch": 1.9540237912330936, + "grad_norm": 0.3371091170786384, + "learning_rate": 3.267604157949315e-05, + "loss": 2.6062, + "step": 41970 + }, + { + "epoch": 1.9540703494191867, + "grad_norm": 0.3351321366064173, + "learning_rate": 3.2673500653826594e-05, + "loss": 2.8306, + "step": 41971 + }, + { + "epoch": 1.9541169076052798, + "grad_norm": 0.36024340447273917, + "learning_rate": 3.267095977901159e-05, + "loss": 2.6477, + "step": 41972 + }, + { + "epoch": 1.954163465791373, + "grad_norm": 0.33427364693216055, + "learning_rate": 3.266841895505563e-05, + "loss": 2.7574, + "step": 41973 + }, + { + "epoch": 1.9542100239774658, + "grad_norm": 0.3733021849653892, + "learning_rate": 3.266587818196614e-05, + "loss": 2.5978, + "step": 41974 + }, + { + "epoch": 1.954256582163559, + "grad_norm": 
0.36374260369719824, + "learning_rate": 3.2663337459750596e-05, + "loss": 2.6996, + "step": 41975 + }, + { + "epoch": 1.954303140349652, + "grad_norm": 0.3274194647047657, + "learning_rate": 3.266079678841646e-05, + "loss": 2.5476, + "step": 41976 + }, + { + "epoch": 1.954349698535745, + "grad_norm": 0.38447342748431157, + "learning_rate": 3.2658256167971165e-05, + "loss": 2.7575, + "step": 41977 + }, + { + "epoch": 1.954396256721838, + "grad_norm": 0.34674616963471083, + "learning_rate": 3.265571559842218e-05, + "loss": 2.6079, + "step": 41978 + }, + { + "epoch": 1.9544428149079311, + "grad_norm": 0.3746437560982764, + "learning_rate": 3.2653175079776964e-05, + "loss": 2.7501, + "step": 41979 + }, + { + "epoch": 1.9544893730940243, + "grad_norm": 0.3375174058332763, + "learning_rate": 3.2650634612042963e-05, + "loss": 2.6729, + "step": 41980 + }, + { + "epoch": 1.9545359312801174, + "grad_norm": 0.3204803039354387, + "learning_rate": 3.264809419522765e-05, + "loss": 2.5472, + "step": 41981 + }, + { + "epoch": 1.9545824894662105, + "grad_norm": 0.37187279216350766, + "learning_rate": 3.264555382933846e-05, + "loss": 2.7963, + "step": 41982 + }, + { + "epoch": 1.9546290476523036, + "grad_norm": 0.34284790731104764, + "learning_rate": 3.264301351438287e-05, + "loss": 2.7822, + "step": 41983 + }, + { + "epoch": 1.9546756058383965, + "grad_norm": 0.317219537266623, + "learning_rate": 3.264047325036833e-05, + "loss": 2.647, + "step": 41984 + }, + { + "epoch": 1.9547221640244896, + "grad_norm": 0.38039766984806417, + "learning_rate": 3.263793303730227e-05, + "loss": 2.6897, + "step": 41985 + }, + { + "epoch": 1.9547687222105825, + "grad_norm": 0.32989668405199213, + "learning_rate": 3.2635392875192186e-05, + "loss": 2.6707, + "step": 41986 + }, + { + "epoch": 1.9548152803966756, + "grad_norm": 0.33516683626002663, + "learning_rate": 3.2632852764045497e-05, + "loss": 2.6869, + "step": 41987 + }, + { + "epoch": 1.9548618385827687, + "grad_norm": 0.35073482314615073, + "learning_rate": 3.2630312703869686e-05, + "loss": 2.578, + "step": 41988 + }, + { + "epoch": 1.9549083967688619, + "grad_norm": 0.36446152858342196, + "learning_rate": 3.26277726946722e-05, + "loss": 2.723, + "step": 41989 + }, + { + "epoch": 1.954954954954955, + "grad_norm": 0.32334191460742806, + "learning_rate": 3.262523273646048e-05, + "loss": 2.6491, + "step": 41990 + }, + { + "epoch": 1.955001513141048, + "grad_norm": 0.3303056217860414, + "learning_rate": 3.262269282924199e-05, + "loss": 2.6489, + "step": 41991 + }, + { + "epoch": 1.9550480713271412, + "grad_norm": 0.357581118061673, + "learning_rate": 3.262015297302419e-05, + "loss": 2.5695, + "step": 41992 + }, + { + "epoch": 1.9550946295132343, + "grad_norm": 0.3338178705368492, + "learning_rate": 3.261761316781453e-05, + "loss": 2.7001, + "step": 41993 + }, + { + "epoch": 1.9551411876993272, + "grad_norm": 0.3275955535623622, + "learning_rate": 3.2615073413620464e-05, + "loss": 2.684, + "step": 41994 + }, + { + "epoch": 1.9551877458854203, + "grad_norm": 0.33530726657031046, + "learning_rate": 3.261253371044943e-05, + "loss": 2.5912, + "step": 41995 + }, + { + "epoch": 1.9552343040715132, + "grad_norm": 0.34271779226048016, + "learning_rate": 3.260999405830891e-05, + "loss": 2.7281, + "step": 41996 + }, + { + "epoch": 1.9552808622576063, + "grad_norm": 0.31538597252310796, + "learning_rate": 3.260745445720633e-05, + "loss": 2.6873, + "step": 41997 + }, + { + "epoch": 1.9553274204436994, + "grad_norm": 0.33542856704682467, + "learning_rate": 3.2604914907149174e-05, + 
"loss": 2.7126, + "step": 41998 + }, + { + "epoch": 1.9553739786297926, + "grad_norm": 0.3718966488723026, + "learning_rate": 3.260237540814488e-05, + "loss": 2.5973, + "step": 41999 + }, + { + "epoch": 1.9554205368158857, + "grad_norm": 0.322434669128924, + "learning_rate": 3.259983596020089e-05, + "loss": 2.7198, + "step": 42000 + }, + { + "epoch": 1.9554670950019788, + "grad_norm": 0.35595467487281357, + "learning_rate": 3.259729656332467e-05, + "loss": 2.6806, + "step": 42001 + }, + { + "epoch": 1.955513653188072, + "grad_norm": 0.3442366172849418, + "learning_rate": 3.259475721752368e-05, + "loss": 2.7949, + "step": 42002 + }, + { + "epoch": 1.955560211374165, + "grad_norm": 0.3117460964074, + "learning_rate": 3.259221792280535e-05, + "loss": 2.6666, + "step": 42003 + }, + { + "epoch": 1.955606769560258, + "grad_norm": 0.3726880875240601, + "learning_rate": 3.258967867917715e-05, + "loss": 2.6826, + "step": 42004 + }, + { + "epoch": 1.955653327746351, + "grad_norm": 0.3522454651776537, + "learning_rate": 3.258713948664652e-05, + "loss": 2.6649, + "step": 42005 + }, + { + "epoch": 1.955699885932444, + "grad_norm": 0.3463632311074277, + "learning_rate": 3.258460034522094e-05, + "loss": 2.6538, + "step": 42006 + }, + { + "epoch": 1.955746444118537, + "grad_norm": 0.36328398588739635, + "learning_rate": 3.258206125490784e-05, + "loss": 2.7356, + "step": 42007 + }, + { + "epoch": 1.9557930023046302, + "grad_norm": 0.3681329240618348, + "learning_rate": 3.257952221571465e-05, + "loss": 2.712, + "step": 42008 + }, + { + "epoch": 1.9558395604907233, + "grad_norm": 0.3206518392255064, + "learning_rate": 3.257698322764886e-05, + "loss": 2.6319, + "step": 42009 + }, + { + "epoch": 1.9558861186768164, + "grad_norm": 0.3469399482531537, + "learning_rate": 3.25744442907179e-05, + "loss": 2.8026, + "step": 42010 + }, + { + "epoch": 1.9559326768629095, + "grad_norm": 0.33656339167844473, + "learning_rate": 3.257190540492925e-05, + "loss": 2.6808, + "step": 42011 + }, + { + "epoch": 1.9559792350490026, + "grad_norm": 0.3141827127648209, + "learning_rate": 3.256936657029033e-05, + "loss": 2.7022, + "step": 42012 + }, + { + "epoch": 1.9560257932350955, + "grad_norm": 0.34575036841149553, + "learning_rate": 3.256682778680859e-05, + "loss": 2.64, + "step": 42013 + }, + { + "epoch": 1.9560723514211886, + "grad_norm": 0.3096212524766028, + "learning_rate": 3.2564289054491506e-05, + "loss": 2.6417, + "step": 42014 + }, + { + "epoch": 1.9561189096072817, + "grad_norm": 0.33061343618040023, + "learning_rate": 3.256175037334651e-05, + "loss": 2.6867, + "step": 42015 + }, + { + "epoch": 1.9561654677933746, + "grad_norm": 0.3395662831734089, + "learning_rate": 3.2559211743381066e-05, + "loss": 2.6156, + "step": 42016 + }, + { + "epoch": 1.9562120259794678, + "grad_norm": 0.31571461354912667, + "learning_rate": 3.255667316460261e-05, + "loss": 2.6238, + "step": 42017 + }, + { + "epoch": 1.9562585841655609, + "grad_norm": 0.3488296008620206, + "learning_rate": 3.255413463701861e-05, + "loss": 2.7529, + "step": 42018 + }, + { + "epoch": 1.956305142351654, + "grad_norm": 0.35782263418400106, + "learning_rate": 3.25515961606365e-05, + "loss": 2.7586, + "step": 42019 + }, + { + "epoch": 1.956351700537747, + "grad_norm": 0.33229720706588506, + "learning_rate": 3.254905773546374e-05, + "loss": 2.7289, + "step": 42020 + }, + { + "epoch": 1.9563982587238402, + "grad_norm": 0.3654026982930393, + "learning_rate": 3.254651936150776e-05, + "loss": 2.6287, + "step": 42021 + }, + { + "epoch": 1.9564448169099333, + "grad_norm": 
0.32597704967226626, + "learning_rate": 3.2543981038776044e-05, + "loss": 2.6702, + "step": 42022 + }, + { + "epoch": 1.9564913750960262, + "grad_norm": 0.38449239892186077, + "learning_rate": 3.254144276727601e-05, + "loss": 2.6196, + "step": 42023 + }, + { + "epoch": 1.9565379332821193, + "grad_norm": 0.35745966098524207, + "learning_rate": 3.2538904547015136e-05, + "loss": 2.7315, + "step": 42024 + }, + { + "epoch": 1.9565844914682125, + "grad_norm": 0.3446464791101177, + "learning_rate": 3.2536366378000856e-05, + "loss": 2.6046, + "step": 42025 + }, + { + "epoch": 1.9566310496543053, + "grad_norm": 0.3723498460511873, + "learning_rate": 3.2533828260240593e-05, + "loss": 2.6061, + "step": 42026 + }, + { + "epoch": 1.9566776078403985, + "grad_norm": 0.3175413568168753, + "learning_rate": 3.253129019374186e-05, + "loss": 2.6759, + "step": 42027 + }, + { + "epoch": 1.9567241660264916, + "grad_norm": 0.38339391442959825, + "learning_rate": 3.2528752178512046e-05, + "loss": 2.6246, + "step": 42028 + }, + { + "epoch": 1.9567707242125847, + "grad_norm": 0.337394803027706, + "learning_rate": 3.252621421455864e-05, + "loss": 2.6011, + "step": 42029 + }, + { + "epoch": 1.9568172823986778, + "grad_norm": 0.3340073237209419, + "learning_rate": 3.2523676301889064e-05, + "loss": 2.6653, + "step": 42030 + }, + { + "epoch": 1.956863840584771, + "grad_norm": 0.34228671199670024, + "learning_rate": 3.2521138440510773e-05, + "loss": 2.6931, + "step": 42031 + }, + { + "epoch": 1.956910398770864, + "grad_norm": 0.3319327984244562, + "learning_rate": 3.251860063043123e-05, + "loss": 2.6561, + "step": 42032 + }, + { + "epoch": 1.956956956956957, + "grad_norm": 0.3439677365378764, + "learning_rate": 3.2516062871657857e-05, + "loss": 2.6597, + "step": 42033 + }, + { + "epoch": 1.95700351514305, + "grad_norm": 0.3612191450347856, + "learning_rate": 3.251352516419813e-05, + "loss": 2.6869, + "step": 42034 + }, + { + "epoch": 1.957050073329143, + "grad_norm": 0.350215690078965, + "learning_rate": 3.251098750805949e-05, + "loss": 2.6847, + "step": 42035 + }, + { + "epoch": 1.957096631515236, + "grad_norm": 0.37234508026430224, + "learning_rate": 3.250844990324935e-05, + "loss": 2.6334, + "step": 42036 + }, + { + "epoch": 1.9571431897013292, + "grad_norm": 0.33397800425583374, + "learning_rate": 3.250591234977521e-05, + "loss": 2.6555, + "step": 42037 + }, + { + "epoch": 1.9571897478874223, + "grad_norm": 0.3446480390414362, + "learning_rate": 3.25033748476445e-05, + "loss": 2.7255, + "step": 42038 + }, + { + "epoch": 1.9572363060735154, + "grad_norm": 0.33059236574635315, + "learning_rate": 3.2500837396864636e-05, + "loss": 2.6685, + "step": 42039 + }, + { + "epoch": 1.9572828642596085, + "grad_norm": 0.3238510973616112, + "learning_rate": 3.2498299997443106e-05, + "loss": 2.7476, + "step": 42040 + }, + { + "epoch": 1.9573294224457016, + "grad_norm": 0.33648430753204106, + "learning_rate": 3.249576264938734e-05, + "loss": 2.6676, + "step": 42041 + }, + { + "epoch": 1.9573759806317947, + "grad_norm": 0.33961603593468953, + "learning_rate": 3.2493225352704795e-05, + "loss": 2.6708, + "step": 42042 + }, + { + "epoch": 1.9574225388178876, + "grad_norm": 0.3288144904216708, + "learning_rate": 3.249068810740289e-05, + "loss": 2.6384, + "step": 42043 + }, + { + "epoch": 1.9574690970039808, + "grad_norm": 0.33693158676321, + "learning_rate": 3.2488150913489094e-05, + "loss": 2.6112, + "step": 42044 + }, + { + "epoch": 1.9575156551900736, + "grad_norm": 0.3161995360472161, + "learning_rate": 3.248561377097086e-05, + "loss": 
2.6501, + "step": 42045 + }, + { + "epoch": 1.9575622133761668, + "grad_norm": 0.3309509604399782, + "learning_rate": 3.248307667985561e-05, + "loss": 2.7808, + "step": 42046 + }, + { + "epoch": 1.9576087715622599, + "grad_norm": 0.3637772243961273, + "learning_rate": 3.248053964015082e-05, + "loss": 2.6237, + "step": 42047 + }, + { + "epoch": 1.957655329748353, + "grad_norm": 0.31651396033685303, + "learning_rate": 3.247800265186392e-05, + "loss": 2.7029, + "step": 42048 + }, + { + "epoch": 1.957701887934446, + "grad_norm": 0.31410084390613624, + "learning_rate": 3.247546571500234e-05, + "loss": 2.7269, + "step": 42049 + }, + { + "epoch": 1.9577484461205392, + "grad_norm": 0.3433251146646953, + "learning_rate": 3.2472928829573554e-05, + "loss": 2.6701, + "step": 42050 + }, + { + "epoch": 1.9577950043066323, + "grad_norm": 0.37221719446825197, + "learning_rate": 3.247039199558498e-05, + "loss": 2.6839, + "step": 42051 + }, + { + "epoch": 1.9578415624927252, + "grad_norm": 0.32573235900467234, + "learning_rate": 3.24678552130441e-05, + "loss": 2.6706, + "step": 42052 + }, + { + "epoch": 1.9578881206788183, + "grad_norm": 0.33150558850990675, + "learning_rate": 3.246531848195834e-05, + "loss": 2.6989, + "step": 42053 + }, + { + "epoch": 1.9579346788649115, + "grad_norm": 0.34376634413057533, + "learning_rate": 3.2462781802335125e-05, + "loss": 2.6873, + "step": 42054 + }, + { + "epoch": 1.9579812370510044, + "grad_norm": 0.32038558334339085, + "learning_rate": 3.246024517418193e-05, + "loss": 2.6283, + "step": 42055 + }, + { + "epoch": 1.9580277952370975, + "grad_norm": 0.2988436801242212, + "learning_rate": 3.2457708597506177e-05, + "loss": 2.706, + "step": 42056 + }, + { + "epoch": 1.9580743534231906, + "grad_norm": 0.3534211797353632, + "learning_rate": 3.245517207231533e-05, + "loss": 2.7718, + "step": 42057 + }, + { + "epoch": 1.9581209116092837, + "grad_norm": 0.3517221548480908, + "learning_rate": 3.245263559861682e-05, + "loss": 2.7024, + "step": 42058 + }, + { + "epoch": 1.9581674697953768, + "grad_norm": 0.37656409825950965, + "learning_rate": 3.2450099176418084e-05, + "loss": 2.6893, + "step": 42059 + }, + { + "epoch": 1.95821402798147, + "grad_norm": 0.3136996305097796, + "learning_rate": 3.24475628057266e-05, + "loss": 2.6857, + "step": 42060 + }, + { + "epoch": 1.958260586167563, + "grad_norm": 0.3516151724860111, + "learning_rate": 3.244502648654979e-05, + "loss": 2.6669, + "step": 42061 + }, + { + "epoch": 1.958307144353656, + "grad_norm": 0.33282672221446696, + "learning_rate": 3.244249021889507e-05, + "loss": 2.6568, + "step": 42062 + }, + { + "epoch": 1.958353702539749, + "grad_norm": 0.33019540543283127, + "learning_rate": 3.2439954002769935e-05, + "loss": 2.6645, + "step": 42063 + }, + { + "epoch": 1.9584002607258422, + "grad_norm": 0.33267098430763425, + "learning_rate": 3.243741783818179e-05, + "loss": 2.6364, + "step": 42064 + }, + { + "epoch": 1.958446818911935, + "grad_norm": 0.3270801210853698, + "learning_rate": 3.243488172513811e-05, + "loss": 2.6511, + "step": 42065 + }, + { + "epoch": 1.9584933770980282, + "grad_norm": 0.3305704783986442, + "learning_rate": 3.2432345663646324e-05, + "loss": 2.7406, + "step": 42066 + }, + { + "epoch": 1.9585399352841213, + "grad_norm": 0.34290349057585623, + "learning_rate": 3.242980965371386e-05, + "loss": 2.7325, + "step": 42067 + }, + { + "epoch": 1.9585864934702144, + "grad_norm": 0.3304314099455814, + "learning_rate": 3.2427273695348184e-05, + "loss": 2.6574, + "step": 42068 + }, + { + "epoch": 1.9586330516563075, + 
"grad_norm": 0.324756442546607, + "learning_rate": 3.24247377885567e-05, + "loss": 2.7038, + "step": 42069 + }, + { + "epoch": 1.9586796098424006, + "grad_norm": 0.3203052638803867, + "learning_rate": 3.242220193334691e-05, + "loss": 2.7579, + "step": 42070 + }, + { + "epoch": 1.9587261680284938, + "grad_norm": 0.3309406772701158, + "learning_rate": 3.241966612972623e-05, + "loss": 2.6096, + "step": 42071 + }, + { + "epoch": 1.9587727262145866, + "grad_norm": 0.3087376915779084, + "learning_rate": 3.241713037770207e-05, + "loss": 2.6394, + "step": 42072 + }, + { + "epoch": 1.9588192844006798, + "grad_norm": 0.3215556416190062, + "learning_rate": 3.241459467728192e-05, + "loss": 2.5945, + "step": 42073 + }, + { + "epoch": 1.9588658425867727, + "grad_norm": 0.3277891495491036, + "learning_rate": 3.2412059028473204e-05, + "loss": 2.7315, + "step": 42074 + }, + { + "epoch": 1.9589124007728658, + "grad_norm": 0.32079997827627865, + "learning_rate": 3.240952343128334e-05, + "loss": 2.6618, + "step": 42075 + }, + { + "epoch": 1.9589589589589589, + "grad_norm": 0.3369135685795038, + "learning_rate": 3.2406987885719816e-05, + "loss": 2.8071, + "step": 42076 + }, + { + "epoch": 1.959005517145052, + "grad_norm": 0.339707813894763, + "learning_rate": 3.240445239179003e-05, + "loss": 2.6447, + "step": 42077 + }, + { + "epoch": 1.9590520753311451, + "grad_norm": 0.32745474110449263, + "learning_rate": 3.240191694950145e-05, + "loss": 2.6682, + "step": 42078 + }, + { + "epoch": 1.9590986335172382, + "grad_norm": 0.32854772430489415, + "learning_rate": 3.2399381558861525e-05, + "loss": 2.6684, + "step": 42079 + }, + { + "epoch": 1.9591451917033313, + "grad_norm": 0.3438272599661417, + "learning_rate": 3.2396846219877665e-05, + "loss": 2.6756, + "step": 42080 + }, + { + "epoch": 1.9591917498894245, + "grad_norm": 0.304079541196456, + "learning_rate": 3.239431093255734e-05, + "loss": 2.6812, + "step": 42081 + }, + { + "epoch": 1.9592383080755174, + "grad_norm": 0.36892222788290807, + "learning_rate": 3.2391775696907954e-05, + "loss": 2.6813, + "step": 42082 + }, + { + "epoch": 1.9592848662616105, + "grad_norm": 0.33267558293821164, + "learning_rate": 3.238924051293699e-05, + "loss": 2.6852, + "step": 42083 + }, + { + "epoch": 1.9593314244477034, + "grad_norm": 0.3170664324225176, + "learning_rate": 3.2386705380651876e-05, + "loss": 2.6346, + "step": 42084 + }, + { + "epoch": 1.9593779826337965, + "grad_norm": 0.33867945327838994, + "learning_rate": 3.2384170300060024e-05, + "loss": 2.6361, + "step": 42085 + }, + { + "epoch": 1.9594245408198896, + "grad_norm": 0.3437405795407793, + "learning_rate": 3.238163527116892e-05, + "loss": 2.7015, + "step": 42086 + }, + { + "epoch": 1.9594710990059827, + "grad_norm": 0.32828153216619166, + "learning_rate": 3.2379100293985955e-05, + "loss": 2.7109, + "step": 42087 + }, + { + "epoch": 1.9595176571920758, + "grad_norm": 0.34436825848787206, + "learning_rate": 3.237656536851862e-05, + "loss": 2.5801, + "step": 42088 + }, + { + "epoch": 1.959564215378169, + "grad_norm": 0.35741225775116736, + "learning_rate": 3.237403049477433e-05, + "loss": 2.6955, + "step": 42089 + }, + { + "epoch": 1.959610773564262, + "grad_norm": 0.35317498470638153, + "learning_rate": 3.23714956727605e-05, + "loss": 2.5317, + "step": 42090 + }, + { + "epoch": 1.9596573317503552, + "grad_norm": 0.3361274609112986, + "learning_rate": 3.2368960902484615e-05, + "loss": 2.6079, + "step": 42091 + }, + { + "epoch": 1.959703889936448, + "grad_norm": 0.33661917242024114, + "learning_rate": 
3.236642618395409e-05, + "loss": 2.7131, + "step": 42092 + }, + { + "epoch": 1.9597504481225412, + "grad_norm": 0.33274569998910797, + "learning_rate": 3.236389151717636e-05, + "loss": 2.6614, + "step": 42093 + }, + { + "epoch": 1.959797006308634, + "grad_norm": 0.31820030637813973, + "learning_rate": 3.236135690215888e-05, + "loss": 2.7222, + "step": 42094 + }, + { + "epoch": 1.9598435644947272, + "grad_norm": 0.3739933855789654, + "learning_rate": 3.2358822338909056e-05, + "loss": 2.6987, + "step": 42095 + }, + { + "epoch": 1.9598901226808203, + "grad_norm": 0.33299029895239923, + "learning_rate": 3.235628782743437e-05, + "loss": 2.6434, + "step": 42096 + }, + { + "epoch": 1.9599366808669134, + "grad_norm": 0.33908715449188015, + "learning_rate": 3.2353753367742254e-05, + "loss": 2.6824, + "step": 42097 + }, + { + "epoch": 1.9599832390530065, + "grad_norm": 0.35663284810623574, + "learning_rate": 3.23512189598401e-05, + "loss": 2.704, + "step": 42098 + }, + { + "epoch": 1.9600297972390996, + "grad_norm": 0.3598486148766914, + "learning_rate": 3.2348684603735405e-05, + "loss": 2.6628, + "step": 42099 + }, + { + "epoch": 1.9600763554251928, + "grad_norm": 0.35009974697528706, + "learning_rate": 3.234615029943556e-05, + "loss": 2.6976, + "step": 42100 + }, + { + "epoch": 1.9601229136112857, + "grad_norm": 0.3448533779192537, + "learning_rate": 3.234361604694805e-05, + "loss": 2.6947, + "step": 42101 + }, + { + "epoch": 1.9601694717973788, + "grad_norm": 0.34029381747400567, + "learning_rate": 3.234108184628028e-05, + "loss": 2.689, + "step": 42102 + }, + { + "epoch": 1.9602160299834719, + "grad_norm": 0.3399721863210755, + "learning_rate": 3.233854769743968e-05, + "loss": 2.7955, + "step": 42103 + }, + { + "epoch": 1.9602625881695648, + "grad_norm": 0.3542561661334123, + "learning_rate": 3.2336013600433724e-05, + "loss": 2.6924, + "step": 42104 + }, + { + "epoch": 1.960309146355658, + "grad_norm": 0.35298913613185234, + "learning_rate": 3.233347955526981e-05, + "loss": 2.7454, + "step": 42105 + }, + { + "epoch": 1.960355704541751, + "grad_norm": 0.3608745076186702, + "learning_rate": 3.233094556195541e-05, + "loss": 2.6653, + "step": 42106 + }, + { + "epoch": 1.9604022627278441, + "grad_norm": 0.3429064268799107, + "learning_rate": 3.232841162049795e-05, + "loss": 2.693, + "step": 42107 + }, + { + "epoch": 1.9604488209139372, + "grad_norm": 0.36931243794484503, + "learning_rate": 3.2325877730904824e-05, + "loss": 2.5664, + "step": 42108 + }, + { + "epoch": 1.9604953791000304, + "grad_norm": 0.35831454588827416, + "learning_rate": 3.232334389318353e-05, + "loss": 2.7168, + "step": 42109 + }, + { + "epoch": 1.9605419372861235, + "grad_norm": 0.3331817228863585, + "learning_rate": 3.2320810107341495e-05, + "loss": 2.7028, + "step": 42110 + }, + { + "epoch": 1.9605884954722164, + "grad_norm": 0.3446102120029218, + "learning_rate": 3.231827637338611e-05, + "loss": 2.6894, + "step": 42111 + }, + { + "epoch": 1.9606350536583095, + "grad_norm": 0.3449289203247526, + "learning_rate": 3.231574269132486e-05, + "loss": 2.7016, + "step": 42112 + }, + { + "epoch": 1.9606816118444026, + "grad_norm": 0.32232579330401306, + "learning_rate": 3.231320906116514e-05, + "loss": 2.719, + "step": 42113 + }, + { + "epoch": 1.9607281700304955, + "grad_norm": 0.35711791452820557, + "learning_rate": 3.231067548291444e-05, + "loss": 2.6451, + "step": 42114 + }, + { + "epoch": 1.9607747282165886, + "grad_norm": 0.3329159195654542, + "learning_rate": 3.230814195658016e-05, + "loss": 2.6952, + "step": 42115 + }, + { + 
"epoch": 1.9608212864026817, + "grad_norm": 0.32293841464210216, + "learning_rate": 3.230560848216972e-05, + "loss": 2.613, + "step": 42116 + }, + { + "epoch": 1.9608678445887748, + "grad_norm": 0.32920083814250894, + "learning_rate": 3.230307505969059e-05, + "loss": 2.6817, + "step": 42117 + }, + { + "epoch": 1.960914402774868, + "grad_norm": 0.3462761274330752, + "learning_rate": 3.2300541689150186e-05, + "loss": 2.6525, + "step": 42118 + }, + { + "epoch": 1.960960960960961, + "grad_norm": 0.33104930884762857, + "learning_rate": 3.2298008370555966e-05, + "loss": 2.6674, + "step": 42119 + }, + { + "epoch": 1.9610075191470542, + "grad_norm": 0.332869317179052, + "learning_rate": 3.229547510391534e-05, + "loss": 2.6777, + "step": 42120 + }, + { + "epoch": 1.961054077333147, + "grad_norm": 0.34930488017685557, + "learning_rate": 3.229294188923573e-05, + "loss": 2.7535, + "step": 42121 + }, + { + "epoch": 1.9611006355192402, + "grad_norm": 0.335657921141696, + "learning_rate": 3.2290408726524615e-05, + "loss": 2.6665, + "step": 42122 + }, + { + "epoch": 1.961147193705333, + "grad_norm": 0.3392440407751598, + "learning_rate": 3.228787561578938e-05, + "loss": 2.6698, + "step": 42123 + }, + { + "epoch": 1.9611937518914262, + "grad_norm": 0.35453552966655716, + "learning_rate": 3.228534255703751e-05, + "loss": 2.6403, + "step": 42124 + }, + { + "epoch": 1.9612403100775193, + "grad_norm": 0.33578914951535227, + "learning_rate": 3.228280955027641e-05, + "loss": 2.6558, + "step": 42125 + }, + { + "epoch": 1.9612868682636124, + "grad_norm": 0.3283809067822098, + "learning_rate": 3.2280276595513504e-05, + "loss": 2.6713, + "step": 42126 + }, + { + "epoch": 1.9613334264497055, + "grad_norm": 0.3351184260749644, + "learning_rate": 3.227774369275626e-05, + "loss": 2.7323, + "step": 42127 + }, + { + "epoch": 1.9613799846357987, + "grad_norm": 0.33461074870865065, + "learning_rate": 3.227521084201208e-05, + "loss": 2.5858, + "step": 42128 + }, + { + "epoch": 1.9614265428218918, + "grad_norm": 0.30827377934648753, + "learning_rate": 3.2272678043288405e-05, + "loss": 2.6863, + "step": 42129 + }, + { + "epoch": 1.9614731010079849, + "grad_norm": 0.32577688832306223, + "learning_rate": 3.227014529659269e-05, + "loss": 2.5485, + "step": 42130 + }, + { + "epoch": 1.9615196591940778, + "grad_norm": 0.32282767166602366, + "learning_rate": 3.226761260193234e-05, + "loss": 2.7046, + "step": 42131 + }, + { + "epoch": 1.961566217380171, + "grad_norm": 0.31810231328352945, + "learning_rate": 3.226507995931481e-05, + "loss": 2.6226, + "step": 42132 + }, + { + "epoch": 1.9616127755662638, + "grad_norm": 0.3053918792666095, + "learning_rate": 3.226254736874752e-05, + "loss": 2.7137, + "step": 42133 + }, + { + "epoch": 1.961659333752357, + "grad_norm": 0.30775204462651934, + "learning_rate": 3.2260014830237894e-05, + "loss": 2.6572, + "step": 42134 + }, + { + "epoch": 1.96170589193845, + "grad_norm": 0.3138345476129737, + "learning_rate": 3.2257482343793385e-05, + "loss": 2.6015, + "step": 42135 + }, + { + "epoch": 1.9617524501245431, + "grad_norm": 0.3038193808286005, + "learning_rate": 3.225494990942141e-05, + "loss": 2.6391, + "step": 42136 + }, + { + "epoch": 1.9617990083106362, + "grad_norm": 0.33125125068879496, + "learning_rate": 3.225241752712942e-05, + "loss": 2.6638, + "step": 42137 + }, + { + "epoch": 1.9618455664967294, + "grad_norm": 0.34766731585890265, + "learning_rate": 3.224988519692484e-05, + "loss": 2.8052, + "step": 42138 + }, + { + "epoch": 1.9618921246828225, + "grad_norm": 0.3083372101882529, + 
"learning_rate": 3.224735291881508e-05, + "loss": 2.7264, + "step": 42139 + }, + { + "epoch": 1.9619386828689154, + "grad_norm": 0.33028035556342356, + "learning_rate": 3.224482069280761e-05, + "loss": 2.6416, + "step": 42140 + }, + { + "epoch": 1.9619852410550085, + "grad_norm": 0.3383983062410893, + "learning_rate": 3.224228851890983e-05, + "loss": 2.6743, + "step": 42141 + }, + { + "epoch": 1.9620317992411016, + "grad_norm": 0.3318925208378611, + "learning_rate": 3.2239756397129184e-05, + "loss": 2.7232, + "step": 42142 + }, + { + "epoch": 1.9620783574271945, + "grad_norm": 0.3306713273252965, + "learning_rate": 3.223722432747311e-05, + "loss": 2.7949, + "step": 42143 + }, + { + "epoch": 1.9621249156132876, + "grad_norm": 0.3445988969145304, + "learning_rate": 3.223469230994903e-05, + "loss": 2.7665, + "step": 42144 + }, + { + "epoch": 1.9621714737993807, + "grad_norm": 0.35317236836484833, + "learning_rate": 3.223216034456439e-05, + "loss": 2.7016, + "step": 42145 + }, + { + "epoch": 1.9622180319854738, + "grad_norm": 0.31818597279605454, + "learning_rate": 3.2229628431326596e-05, + "loss": 2.6883, + "step": 42146 + }, + { + "epoch": 1.962264590171567, + "grad_norm": 0.33081056129343855, + "learning_rate": 3.222709657024308e-05, + "loss": 2.7006, + "step": 42147 + }, + { + "epoch": 1.96231114835766, + "grad_norm": 0.3498912007454324, + "learning_rate": 3.222456476132131e-05, + "loss": 2.6759, + "step": 42148 + }, + { + "epoch": 1.9623577065437532, + "grad_norm": 0.33900154331284016, + "learning_rate": 3.222203300456866e-05, + "loss": 2.7161, + "step": 42149 + }, + { + "epoch": 1.962404264729846, + "grad_norm": 0.31436888575607846, + "learning_rate": 3.221950129999262e-05, + "loss": 2.6769, + "step": 42150 + }, + { + "epoch": 1.9624508229159392, + "grad_norm": 0.33859800651076005, + "learning_rate": 3.221696964760058e-05, + "loss": 2.6662, + "step": 42151 + }, + { + "epoch": 1.9624973811020323, + "grad_norm": 0.32058752410503466, + "learning_rate": 3.2214438047399976e-05, + "loss": 2.8013, + "step": 42152 + }, + { + "epoch": 1.9625439392881252, + "grad_norm": 0.33610985754279904, + "learning_rate": 3.221190649939826e-05, + "loss": 2.6279, + "step": 42153 + }, + { + "epoch": 1.9625904974742183, + "grad_norm": 0.3282393493608583, + "learning_rate": 3.220937500360283e-05, + "loss": 2.6256, + "step": 42154 + }, + { + "epoch": 1.9626370556603114, + "grad_norm": 0.3453415137727591, + "learning_rate": 3.220684356002114e-05, + "loss": 2.7235, + "step": 42155 + }, + { + "epoch": 1.9626836138464046, + "grad_norm": 0.33043431907699944, + "learning_rate": 3.2204312168660616e-05, + "loss": 2.6755, + "step": 42156 + }, + { + "epoch": 1.9627301720324977, + "grad_norm": 0.33944739100239457, + "learning_rate": 3.220178082952867e-05, + "loss": 2.6475, + "step": 42157 + }, + { + "epoch": 1.9627767302185908, + "grad_norm": 0.362284112633112, + "learning_rate": 3.219924954263277e-05, + "loss": 2.6991, + "step": 42158 + }, + { + "epoch": 1.962823288404684, + "grad_norm": 0.332799210754694, + "learning_rate": 3.219671830798028e-05, + "loss": 2.6465, + "step": 42159 + }, + { + "epoch": 1.9628698465907768, + "grad_norm": 0.34214575373408257, + "learning_rate": 3.2194187125578694e-05, + "loss": 2.7468, + "step": 42160 + }, + { + "epoch": 1.96291640477687, + "grad_norm": 0.3573408130220357, + "learning_rate": 3.219165599543541e-05, + "loss": 2.7843, + "step": 42161 + }, + { + "epoch": 1.9629629629629628, + "grad_norm": 0.37804716550126205, + "learning_rate": 3.218912491755786e-05, + "loss": 2.7439, + "step": 
42162 + }, + { + "epoch": 1.963009521149056, + "grad_norm": 0.35246195300050837, + "learning_rate": 3.218659389195348e-05, + "loss": 2.634, + "step": 42163 + }, + { + "epoch": 1.963056079335149, + "grad_norm": 0.33592323132871454, + "learning_rate": 3.2184062918629686e-05, + "loss": 2.7218, + "step": 42164 + }, + { + "epoch": 1.9631026375212421, + "grad_norm": 0.34772634472861225, + "learning_rate": 3.21815319975939e-05, + "loss": 2.7434, + "step": 42165 + }, + { + "epoch": 1.9631491957073353, + "grad_norm": 0.3483212546819059, + "learning_rate": 3.217900112885358e-05, + "loss": 2.6829, + "step": 42166 + }, + { + "epoch": 1.9631957538934284, + "grad_norm": 0.33298386163351174, + "learning_rate": 3.2176470312416126e-05, + "loss": 2.7177, + "step": 42167 + }, + { + "epoch": 1.9632423120795215, + "grad_norm": 0.3360778991967327, + "learning_rate": 3.217393954828898e-05, + "loss": 2.611, + "step": 42168 + }, + { + "epoch": 1.9632888702656146, + "grad_norm": 0.3364105999542101, + "learning_rate": 3.2171408836479564e-05, + "loss": 2.6641, + "step": 42169 + }, + { + "epoch": 1.9633354284517075, + "grad_norm": 0.3799490542561223, + "learning_rate": 3.216887817699531e-05, + "loss": 2.676, + "step": 42170 + }, + { + "epoch": 1.9633819866378006, + "grad_norm": 0.3078050572389318, + "learning_rate": 3.2166347569843644e-05, + "loss": 2.6908, + "step": 42171 + }, + { + "epoch": 1.9634285448238935, + "grad_norm": 0.3414169745284918, + "learning_rate": 3.216381701503198e-05, + "loss": 2.4917, + "step": 42172 + }, + { + "epoch": 1.9634751030099866, + "grad_norm": 0.358120820625814, + "learning_rate": 3.2161286512567766e-05, + "loss": 2.7524, + "step": 42173 + }, + { + "epoch": 1.9635216611960797, + "grad_norm": 0.33400619538622817, + "learning_rate": 3.215875606245842e-05, + "loss": 2.7366, + "step": 42174 + }, + { + "epoch": 1.9635682193821729, + "grad_norm": 0.3336945513690874, + "learning_rate": 3.2156225664711346e-05, + "loss": 2.5984, + "step": 42175 + }, + { + "epoch": 1.963614777568266, + "grad_norm": 0.335821915260794, + "learning_rate": 3.215369531933402e-05, + "loss": 2.5879, + "step": 42176 + }, + { + "epoch": 1.963661335754359, + "grad_norm": 0.3336253736351921, + "learning_rate": 3.2151165026333816e-05, + "loss": 2.6365, + "step": 42177 + }, + { + "epoch": 1.9637078939404522, + "grad_norm": 0.33867166252146597, + "learning_rate": 3.21486347857182e-05, + "loss": 2.6705, + "step": 42178 + }, + { + "epoch": 1.9637544521265453, + "grad_norm": 0.3410509098558875, + "learning_rate": 3.2146104597494586e-05, + "loss": 2.6596, + "step": 42179 + }, + { + "epoch": 1.9638010103126382, + "grad_norm": 0.32700080062423387, + "learning_rate": 3.214357446167039e-05, + "loss": 2.739, + "step": 42180 + }, + { + "epoch": 1.9638475684987313, + "grad_norm": 0.33296634308648343, + "learning_rate": 3.2141044378253035e-05, + "loss": 2.7179, + "step": 42181 + }, + { + "epoch": 1.9638941266848242, + "grad_norm": 0.3368742202845442, + "learning_rate": 3.213851434724997e-05, + "loss": 2.6467, + "step": 42182 + }, + { + "epoch": 1.9639406848709173, + "grad_norm": 0.35073683870957173, + "learning_rate": 3.21359843686686e-05, + "loss": 2.6403, + "step": 42183 + }, + { + "epoch": 1.9639872430570104, + "grad_norm": 0.31840355410201815, + "learning_rate": 3.2133454442516365e-05, + "loss": 2.6788, + "step": 42184 + }, + { + "epoch": 1.9640338012431036, + "grad_norm": 0.3318902032897902, + "learning_rate": 3.213092456880065e-05, + "loss": 2.6442, + "step": 42185 + }, + { + "epoch": 1.9640803594291967, + "grad_norm": 
0.3476145708924042, + "learning_rate": 3.2128394747528944e-05, + "loss": 2.5708, + "step": 42186 + }, + { + "epoch": 1.9641269176152898, + "grad_norm": 0.3084510740611333, + "learning_rate": 3.212586497870863e-05, + "loss": 2.7149, + "step": 42187 + }, + { + "epoch": 1.964173475801383, + "grad_norm": 0.3258537147348345, + "learning_rate": 3.2123335262347124e-05, + "loss": 2.6988, + "step": 42188 + }, + { + "epoch": 1.9642200339874758, + "grad_norm": 0.34913191846367736, + "learning_rate": 3.212080559845189e-05, + "loss": 2.5936, + "step": 42189 + }, + { + "epoch": 1.964266592173569, + "grad_norm": 0.330406979175405, + "learning_rate": 3.211827598703031e-05, + "loss": 2.655, + "step": 42190 + }, + { + "epoch": 1.964313150359662, + "grad_norm": 0.3735686343030963, + "learning_rate": 3.211574642808984e-05, + "loss": 2.6936, + "step": 42191 + }, + { + "epoch": 1.964359708545755, + "grad_norm": 0.3309125734122887, + "learning_rate": 3.211321692163791e-05, + "loss": 2.7356, + "step": 42192 + }, + { + "epoch": 1.964406266731848, + "grad_norm": 0.35749667312345595, + "learning_rate": 3.21106874676819e-05, + "loss": 2.6836, + "step": 42193 + }, + { + "epoch": 1.9644528249179412, + "grad_norm": 0.34441655312577124, + "learning_rate": 3.2108158066229266e-05, + "loss": 2.709, + "step": 42194 + }, + { + "epoch": 1.9644993831040343, + "grad_norm": 0.3248077617995576, + "learning_rate": 3.2105628717287425e-05, + "loss": 2.6589, + "step": 42195 + }, + { + "epoch": 1.9645459412901274, + "grad_norm": 0.3434183174142599, + "learning_rate": 3.2103099420863815e-05, + "loss": 2.5685, + "step": 42196 + }, + { + "epoch": 1.9645924994762205, + "grad_norm": 0.3340092536480102, + "learning_rate": 3.210057017696584e-05, + "loss": 2.731, + "step": 42197 + }, + { + "epoch": 1.9646390576623136, + "grad_norm": 0.3418985627130289, + "learning_rate": 3.209804098560091e-05, + "loss": 2.6552, + "step": 42198 + }, + { + "epoch": 1.9646856158484065, + "grad_norm": 0.3477353724253075, + "learning_rate": 3.209551184677648e-05, + "loss": 2.6847, + "step": 42199 + }, + { + "epoch": 1.9647321740344996, + "grad_norm": 0.336896056911059, + "learning_rate": 3.209298276049997e-05, + "loss": 2.6848, + "step": 42200 + }, + { + "epoch": 1.9647787322205927, + "grad_norm": 0.35135371075466676, + "learning_rate": 3.2090453726778763e-05, + "loss": 2.6083, + "step": 42201 + }, + { + "epoch": 1.9648252904066856, + "grad_norm": 0.3396802094766783, + "learning_rate": 3.2087924745620336e-05, + "loss": 2.6372, + "step": 42202 + }, + { + "epoch": 1.9648718485927787, + "grad_norm": 0.3384362916217454, + "learning_rate": 3.208539581703206e-05, + "loss": 2.6974, + "step": 42203 + }, + { + "epoch": 1.9649184067788719, + "grad_norm": 0.35105313165937896, + "learning_rate": 3.208286694102141e-05, + "loss": 2.6969, + "step": 42204 + }, + { + "epoch": 1.964964964964965, + "grad_norm": 0.3356027689181883, + "learning_rate": 3.208033811759577e-05, + "loss": 2.6896, + "step": 42205 + }, + { + "epoch": 1.965011523151058, + "grad_norm": 0.338757927613077, + "learning_rate": 3.2077809346762554e-05, + "loss": 2.6131, + "step": 42206 + }, + { + "epoch": 1.9650580813371512, + "grad_norm": 0.37310664474434757, + "learning_rate": 3.2075280628529224e-05, + "loss": 2.7214, + "step": 42207 + }, + { + "epoch": 1.9651046395232443, + "grad_norm": 0.3361827037036784, + "learning_rate": 3.2072751962903166e-05, + "loss": 2.6086, + "step": 42208 + }, + { + "epoch": 1.9651511977093372, + "grad_norm": 0.31953020496133894, + "learning_rate": 3.2070223349891834e-05, + "loss": 
2.5991, + "step": 42209 + }, + { + "epoch": 1.9651977558954303, + "grad_norm": 0.36070838849211057, + "learning_rate": 3.206769478950262e-05, + "loss": 2.6997, + "step": 42210 + }, + { + "epoch": 1.9652443140815232, + "grad_norm": 0.33598466804745364, + "learning_rate": 3.2065166281742944e-05, + "loss": 2.7896, + "step": 42211 + }, + { + "epoch": 1.9652908722676163, + "grad_norm": 0.34336328937073524, + "learning_rate": 3.2062637826620254e-05, + "loss": 2.6261, + "step": 42212 + }, + { + "epoch": 1.9653374304537095, + "grad_norm": 0.3424289656685326, + "learning_rate": 3.2060109424141936e-05, + "loss": 2.6732, + "step": 42213 + }, + { + "epoch": 1.9653839886398026, + "grad_norm": 0.3392073123202384, + "learning_rate": 3.205758107431544e-05, + "loss": 2.647, + "step": 42214 + }, + { + "epoch": 1.9654305468258957, + "grad_norm": 0.3622712026561552, + "learning_rate": 3.2055052777148195e-05, + "loss": 2.6159, + "step": 42215 + }, + { + "epoch": 1.9654771050119888, + "grad_norm": 0.3538355131325781, + "learning_rate": 3.205252453264757e-05, + "loss": 2.6465, + "step": 42216 + }, + { + "epoch": 1.965523663198082, + "grad_norm": 0.33899611886854897, + "learning_rate": 3.204999634082104e-05, + "loss": 2.67, + "step": 42217 + }, + { + "epoch": 1.965570221384175, + "grad_norm": 0.33208135334388567, + "learning_rate": 3.2047468201676004e-05, + "loss": 2.6872, + "step": 42218 + }, + { + "epoch": 1.965616779570268, + "grad_norm": 0.35278006360177233, + "learning_rate": 3.204494011521986e-05, + "loss": 2.7882, + "step": 42219 + }, + { + "epoch": 1.965663337756361, + "grad_norm": 0.36919190816940817, + "learning_rate": 3.204241208146006e-05, + "loss": 2.7424, + "step": 42220 + }, + { + "epoch": 1.965709895942454, + "grad_norm": 0.32891680034316245, + "learning_rate": 3.203988410040401e-05, + "loss": 2.6906, + "step": 42221 + }, + { + "epoch": 1.965756454128547, + "grad_norm": 0.32809087440676327, + "learning_rate": 3.2037356172059136e-05, + "loss": 2.7387, + "step": 42222 + }, + { + "epoch": 1.9658030123146402, + "grad_norm": 0.30487747674026516, + "learning_rate": 3.203482829643285e-05, + "loss": 2.6636, + "step": 42223 + }, + { + "epoch": 1.9658495705007333, + "grad_norm": 0.3428521542089596, + "learning_rate": 3.2032300473532565e-05, + "loss": 2.6829, + "step": 42224 + }, + { + "epoch": 1.9658961286868264, + "grad_norm": 0.36192901839104147, + "learning_rate": 3.202977270336571e-05, + "loss": 2.785, + "step": 42225 + }, + { + "epoch": 1.9659426868729195, + "grad_norm": 0.32597646035603156, + "learning_rate": 3.2027244985939694e-05, + "loss": 2.6805, + "step": 42226 + }, + { + "epoch": 1.9659892450590126, + "grad_norm": 0.3399963210772801, + "learning_rate": 3.2024717321261963e-05, + "loss": 2.6893, + "step": 42227 + }, + { + "epoch": 1.9660358032451055, + "grad_norm": 0.3531893259347251, + "learning_rate": 3.202218970933992e-05, + "loss": 2.7547, + "step": 42228 + }, + { + "epoch": 1.9660823614311986, + "grad_norm": 0.35877853262818654, + "learning_rate": 3.201966215018095e-05, + "loss": 2.7288, + "step": 42229 + }, + { + "epoch": 1.9661289196172917, + "grad_norm": 0.3312435662499689, + "learning_rate": 3.201713464379252e-05, + "loss": 2.6494, + "step": 42230 + }, + { + "epoch": 1.9661754778033846, + "grad_norm": 0.3366395941553042, + "learning_rate": 3.201460719018202e-05, + "loss": 2.6148, + "step": 42231 + }, + { + "epoch": 1.9662220359894778, + "grad_norm": 0.35798105224334265, + "learning_rate": 3.201207978935688e-05, + "loss": 2.6851, + "step": 42232 + }, + { + "epoch": 1.9662685941755709, + 
"grad_norm": 0.3387594034123545, + "learning_rate": 3.200955244132452e-05, + "loss": 2.6912, + "step": 42233 + }, + { + "epoch": 1.966315152361664, + "grad_norm": 0.3382912642094143, + "learning_rate": 3.200702514609235e-05, + "loss": 2.7015, + "step": 42234 + }, + { + "epoch": 1.966361710547757, + "grad_norm": 0.3363363751577464, + "learning_rate": 3.2004497903667786e-05, + "loss": 2.6895, + "step": 42235 + }, + { + "epoch": 1.9664082687338502, + "grad_norm": 0.33473998574156266, + "learning_rate": 3.200197071405825e-05, + "loss": 2.6111, + "step": 42236 + }, + { + "epoch": 1.9664548269199433, + "grad_norm": 0.3399124044287454, + "learning_rate": 3.199944357727114e-05, + "loss": 2.6527, + "step": 42237 + }, + { + "epoch": 1.9665013851060362, + "grad_norm": 0.34407402479733645, + "learning_rate": 3.199691649331391e-05, + "loss": 2.6177, + "step": 42238 + }, + { + "epoch": 1.9665479432921293, + "grad_norm": 0.3315973787425082, + "learning_rate": 3.199438946219394e-05, + "loss": 2.6213, + "step": 42239 + }, + { + "epoch": 1.9665945014782225, + "grad_norm": 0.33088439113174184, + "learning_rate": 3.1991862483918675e-05, + "loss": 2.6574, + "step": 42240 + }, + { + "epoch": 1.9666410596643153, + "grad_norm": 0.3409102866199075, + "learning_rate": 3.198933555849553e-05, + "loss": 2.7258, + "step": 42241 + }, + { + "epoch": 1.9666876178504085, + "grad_norm": 0.3373525000731151, + "learning_rate": 3.198680868593188e-05, + "loss": 2.6888, + "step": 42242 + }, + { + "epoch": 1.9667341760365016, + "grad_norm": 0.38357845213196956, + "learning_rate": 3.19842818662352e-05, + "loss": 2.6847, + "step": 42243 + }, + { + "epoch": 1.9667807342225947, + "grad_norm": 0.35002566529900875, + "learning_rate": 3.198175509941286e-05, + "loss": 2.7536, + "step": 42244 + }, + { + "epoch": 1.9668272924086878, + "grad_norm": 0.31921326655900445, + "learning_rate": 3.19792283854723e-05, + "loss": 2.6911, + "step": 42245 + }, + { + "epoch": 1.966873850594781, + "grad_norm": 0.3723295270282467, + "learning_rate": 3.1976701724420935e-05, + "loss": 2.7714, + "step": 42246 + }, + { + "epoch": 1.966920408780874, + "grad_norm": 0.34120961495731444, + "learning_rate": 3.197417511626617e-05, + "loss": 2.7064, + "step": 42247 + }, + { + "epoch": 1.966966966966967, + "grad_norm": 0.372343239623106, + "learning_rate": 3.1971648561015436e-05, + "loss": 2.5943, + "step": 42248 + }, + { + "epoch": 1.96701352515306, + "grad_norm": 0.33636170976278895, + "learning_rate": 3.1969122058676106e-05, + "loss": 2.7851, + "step": 42249 + }, + { + "epoch": 1.967060083339153, + "grad_norm": 0.341200348235273, + "learning_rate": 3.1966595609255654e-05, + "loss": 2.6602, + "step": 42250 + }, + { + "epoch": 1.967106641525246, + "grad_norm": 0.3535861949226587, + "learning_rate": 3.196406921276147e-05, + "loss": 2.6386, + "step": 42251 + }, + { + "epoch": 1.9671531997113392, + "grad_norm": 0.3162621793084165, + "learning_rate": 3.196154286920094e-05, + "loss": 2.6814, + "step": 42252 + }, + { + "epoch": 1.9671997578974323, + "grad_norm": 0.34034853652592045, + "learning_rate": 3.195901657858153e-05, + "loss": 2.6895, + "step": 42253 + }, + { + "epoch": 1.9672463160835254, + "grad_norm": 0.32183380973104075, + "learning_rate": 3.1956490340910626e-05, + "loss": 2.7111, + "step": 42254 + }, + { + "epoch": 1.9672928742696185, + "grad_norm": 0.32500912733557935, + "learning_rate": 3.195396415619563e-05, + "loss": 2.5921, + "step": 42255 + }, + { + "epoch": 1.9673394324557116, + "grad_norm": 0.3505613896227563, + "learning_rate": 3.195143802444398e-05, + 
"loss": 2.7012, + "step": 42256 + }, + { + "epoch": 1.9673859906418047, + "grad_norm": 0.3497370217641022, + "learning_rate": 3.194891194566308e-05, + "loss": 2.7149, + "step": 42257 + }, + { + "epoch": 1.9674325488278976, + "grad_norm": 0.31453282941781996, + "learning_rate": 3.194638591986035e-05, + "loss": 2.7237, + "step": 42258 + }, + { + "epoch": 1.9674791070139908, + "grad_norm": 0.34318677023983296, + "learning_rate": 3.19438599470432e-05, + "loss": 2.6657, + "step": 42259 + }, + { + "epoch": 1.9675256652000837, + "grad_norm": 0.33365558184086447, + "learning_rate": 3.1941334027219036e-05, + "loss": 2.6754, + "step": 42260 + }, + { + "epoch": 1.9675722233861768, + "grad_norm": 0.3507258327597978, + "learning_rate": 3.1938808160395286e-05, + "loss": 2.7253, + "step": 42261 + }, + { + "epoch": 1.9676187815722699, + "grad_norm": 0.34932560838088356, + "learning_rate": 3.193628234657934e-05, + "loss": 2.7267, + "step": 42262 + }, + { + "epoch": 1.967665339758363, + "grad_norm": 0.3558764882647521, + "learning_rate": 3.1933756585778634e-05, + "loss": 2.726, + "step": 42263 + }, + { + "epoch": 1.9677118979444561, + "grad_norm": 0.36764612786142464, + "learning_rate": 3.1931230878000586e-05, + "loss": 2.5929, + "step": 42264 + }, + { + "epoch": 1.9677584561305492, + "grad_norm": 0.3220411790861454, + "learning_rate": 3.192870522325257e-05, + "loss": 2.6989, + "step": 42265 + }, + { + "epoch": 1.9678050143166423, + "grad_norm": 0.3630547888878761, + "learning_rate": 3.192617962154204e-05, + "loss": 2.751, + "step": 42266 + }, + { + "epoch": 1.9678515725027355, + "grad_norm": 0.34420725557320114, + "learning_rate": 3.192365407287637e-05, + "loss": 2.7696, + "step": 42267 + }, + { + "epoch": 1.9678981306888284, + "grad_norm": 0.3607634715407053, + "learning_rate": 3.192112857726302e-05, + "loss": 2.5635, + "step": 42268 + }, + { + "epoch": 1.9679446888749215, + "grad_norm": 0.31941034542902186, + "learning_rate": 3.191860313470938e-05, + "loss": 2.6826, + "step": 42269 + }, + { + "epoch": 1.9679912470610144, + "grad_norm": 0.35457097249780456, + "learning_rate": 3.191607774522284e-05, + "loss": 2.749, + "step": 42270 + }, + { + "epoch": 1.9680378052471075, + "grad_norm": 0.34719219949161667, + "learning_rate": 3.191355240881084e-05, + "loss": 2.7025, + "step": 42271 + }, + { + "epoch": 1.9680843634332006, + "grad_norm": 0.3465647552286705, + "learning_rate": 3.191102712548078e-05, + "loss": 2.6986, + "step": 42272 + }, + { + "epoch": 1.9681309216192937, + "grad_norm": 0.37504112341958284, + "learning_rate": 3.190850189524007e-05, + "loss": 2.6889, + "step": 42273 + }, + { + "epoch": 1.9681774798053868, + "grad_norm": 0.3256373776640988, + "learning_rate": 3.190597671809614e-05, + "loss": 2.8258, + "step": 42274 + }, + { + "epoch": 1.96822403799148, + "grad_norm": 0.36633015616424963, + "learning_rate": 3.1903451594056356e-05, + "loss": 2.6591, + "step": 42275 + }, + { + "epoch": 1.968270596177573, + "grad_norm": 0.3524353310012092, + "learning_rate": 3.190092652312818e-05, + "loss": 2.6638, + "step": 42276 + }, + { + "epoch": 1.968317154363666, + "grad_norm": 0.3406511600011333, + "learning_rate": 3.189840150531901e-05, + "loss": 2.7392, + "step": 42277 + }, + { + "epoch": 1.968363712549759, + "grad_norm": 0.40529348721209585, + "learning_rate": 3.189587654063623e-05, + "loss": 2.7192, + "step": 42278 + }, + { + "epoch": 1.9684102707358522, + "grad_norm": 0.34024961178123436, + "learning_rate": 3.189335162908728e-05, + "loss": 2.6654, + "step": 42279 + }, + { + "epoch": 1.968456828921945, + 
"grad_norm": 0.3562741588871598, + "learning_rate": 3.189082677067954e-05, + "loss": 2.598, + "step": 42280 + }, + { + "epoch": 1.9685033871080382, + "grad_norm": 0.3364416564938497, + "learning_rate": 3.188830196542045e-05, + "loss": 2.5281, + "step": 42281 + }, + { + "epoch": 1.9685499452941313, + "grad_norm": 0.34550270607857425, + "learning_rate": 3.1885777213317424e-05, + "loss": 2.6146, + "step": 42282 + }, + { + "epoch": 1.9685965034802244, + "grad_norm": 0.3604230933146541, + "learning_rate": 3.188325251437785e-05, + "loss": 2.7214, + "step": 42283 + }, + { + "epoch": 1.9686430616663175, + "grad_norm": 0.35055750256277957, + "learning_rate": 3.1880727868609135e-05, + "loss": 2.7514, + "step": 42284 + }, + { + "epoch": 1.9686896198524106, + "grad_norm": 0.3903727551747375, + "learning_rate": 3.18782032760187e-05, + "loss": 2.6365, + "step": 42285 + }, + { + "epoch": 1.9687361780385038, + "grad_norm": 0.3491218707537885, + "learning_rate": 3.1875678736613966e-05, + "loss": 2.6691, + "step": 42286 + }, + { + "epoch": 1.9687827362245967, + "grad_norm": 0.3490988659754399, + "learning_rate": 3.1873154250402325e-05, + "loss": 2.6173, + "step": 42287 + }, + { + "epoch": 1.9688292944106898, + "grad_norm": 0.357509303436164, + "learning_rate": 3.187062981739117e-05, + "loss": 2.7191, + "step": 42288 + }, + { + "epoch": 1.9688758525967829, + "grad_norm": 0.35943144194560894, + "learning_rate": 3.1868105437587955e-05, + "loss": 2.7093, + "step": 42289 + }, + { + "epoch": 1.9689224107828758, + "grad_norm": 0.35220059249356167, + "learning_rate": 3.186558111100007e-05, + "loss": 2.7257, + "step": 42290 + }, + { + "epoch": 1.968968968968969, + "grad_norm": 0.3621016797778309, + "learning_rate": 3.186305683763489e-05, + "loss": 2.6747, + "step": 42291 + }, + { + "epoch": 1.969015527155062, + "grad_norm": 0.3188090804697572, + "learning_rate": 3.186053261749987e-05, + "loss": 2.5832, + "step": 42292 + }, + { + "epoch": 1.9690620853411551, + "grad_norm": 0.3765894611884452, + "learning_rate": 3.1858008450602386e-05, + "loss": 2.7295, + "step": 42293 + }, + { + "epoch": 1.9691086435272482, + "grad_norm": 0.34129391020962196, + "learning_rate": 3.185548433694988e-05, + "loss": 2.5958, + "step": 42294 + }, + { + "epoch": 1.9691552017133414, + "grad_norm": 0.31126971727803737, + "learning_rate": 3.185296027654974e-05, + "loss": 2.6547, + "step": 42295 + }, + { + "epoch": 1.9692017598994345, + "grad_norm": 0.32694666202676653, + "learning_rate": 3.1850436269409353e-05, + "loss": 2.6851, + "step": 42296 + }, + { + "epoch": 1.9692483180855274, + "grad_norm": 0.3503427501901443, + "learning_rate": 3.1847912315536154e-05, + "loss": 2.7767, + "step": 42297 + }, + { + "epoch": 1.9692948762716205, + "grad_norm": 0.35301284348782214, + "learning_rate": 3.184538841493755e-05, + "loss": 2.7264, + "step": 42298 + }, + { + "epoch": 1.9693414344577134, + "grad_norm": 0.3384507530806412, + "learning_rate": 3.184286456762094e-05, + "loss": 2.6743, + "step": 42299 + }, + { + "epoch": 1.9693879926438065, + "grad_norm": 0.36022513646312315, + "learning_rate": 3.1840340773593744e-05, + "loss": 2.7262, + "step": 42300 + }, + { + "epoch": 1.9694345508298996, + "grad_norm": 0.3285444981103951, + "learning_rate": 3.1837817032863335e-05, + "loss": 2.578, + "step": 42301 + }, + { + "epoch": 1.9694811090159927, + "grad_norm": 0.3460401437260745, + "learning_rate": 3.183529334543717e-05, + "loss": 2.6856, + "step": 42302 + }, + { + "epoch": 1.9695276672020858, + "grad_norm": 0.33510409679740394, + "learning_rate": 
3.1832769711322595e-05, + "loss": 2.6985, + "step": 42303 + }, + { + "epoch": 1.969574225388179, + "grad_norm": 0.3373702864519779, + "learning_rate": 3.1830246130527084e-05, + "loss": 2.6869, + "step": 42304 + }, + { + "epoch": 1.969620783574272, + "grad_norm": 0.3828459009746438, + "learning_rate": 3.182772260305801e-05, + "loss": 2.6107, + "step": 42305 + }, + { + "epoch": 1.9696673417603652, + "grad_norm": 0.3356040119692918, + "learning_rate": 3.182519912892276e-05, + "loss": 2.6622, + "step": 42306 + }, + { + "epoch": 1.969713899946458, + "grad_norm": 0.3651898780789648, + "learning_rate": 3.182267570812877e-05, + "loss": 2.6968, + "step": 42307 + }, + { + "epoch": 1.9697604581325512, + "grad_norm": 0.380303014060782, + "learning_rate": 3.1820152340683434e-05, + "loss": 2.6021, + "step": 42308 + }, + { + "epoch": 1.969807016318644, + "grad_norm": 0.33187806122152425, + "learning_rate": 3.181762902659416e-05, + "loss": 2.6101, + "step": 42309 + }, + { + "epoch": 1.9698535745047372, + "grad_norm": 0.3433979466411264, + "learning_rate": 3.181510576586836e-05, + "loss": 2.6391, + "step": 42310 + }, + { + "epoch": 1.9699001326908303, + "grad_norm": 0.3903205449146013, + "learning_rate": 3.1812582558513425e-05, + "loss": 2.6404, + "step": 42311 + }, + { + "epoch": 1.9699466908769234, + "grad_norm": 0.33472735714208823, + "learning_rate": 3.181005940453678e-05, + "loss": 2.7383, + "step": 42312 + }, + { + "epoch": 1.9699932490630165, + "grad_norm": 0.33929781238725243, + "learning_rate": 3.1807536303945815e-05, + "loss": 2.6612, + "step": 42313 + }, + { + "epoch": 1.9700398072491097, + "grad_norm": 0.37367426731773945, + "learning_rate": 3.1805013256747924e-05, + "loss": 2.6966, + "step": 42314 + }, + { + "epoch": 1.9700863654352028, + "grad_norm": 0.3503955859586713, + "learning_rate": 3.180249026295054e-05, + "loss": 2.6678, + "step": 42315 + }, + { + "epoch": 1.9701329236212957, + "grad_norm": 0.3033795136285964, + "learning_rate": 3.179996732256104e-05, + "loss": 2.6008, + "step": 42316 + }, + { + "epoch": 1.9701794818073888, + "grad_norm": 0.37558350287578635, + "learning_rate": 3.179744443558686e-05, + "loss": 2.662, + "step": 42317 + }, + { + "epoch": 1.970226039993482, + "grad_norm": 0.34846300959213805, + "learning_rate": 3.179492160203539e-05, + "loss": 2.6167, + "step": 42318 + }, + { + "epoch": 1.9702725981795748, + "grad_norm": 0.33239294431486827, + "learning_rate": 3.179239882191401e-05, + "loss": 2.6223, + "step": 42319 + }, + { + "epoch": 1.970319156365668, + "grad_norm": 0.35885875529379846, + "learning_rate": 3.1789876095230166e-05, + "loss": 2.6354, + "step": 42320 + }, + { + "epoch": 1.970365714551761, + "grad_norm": 0.33204684297739706, + "learning_rate": 3.178735342199123e-05, + "loss": 2.7127, + "step": 42321 + }, + { + "epoch": 1.9704122727378541, + "grad_norm": 0.3254000183678851, + "learning_rate": 3.1784830802204626e-05, + "loss": 2.7315, + "step": 42322 + }, + { + "epoch": 1.9704588309239472, + "grad_norm": 0.36409780599756747, + "learning_rate": 3.178230823587774e-05, + "loss": 2.6953, + "step": 42323 + }, + { + "epoch": 1.9705053891100404, + "grad_norm": 0.34637327304097304, + "learning_rate": 3.177978572301798e-05, + "loss": 2.6196, + "step": 42324 + }, + { + "epoch": 1.9705519472961335, + "grad_norm": 0.30772654354017115, + "learning_rate": 3.177726326363278e-05, + "loss": 2.5691, + "step": 42325 + }, + { + "epoch": 1.9705985054822264, + "grad_norm": 0.3536474132773262, + "learning_rate": 3.177474085772949e-05, + "loss": 2.6452, + "step": 42326 + }, + { + 
"epoch": 1.9706450636683195, + "grad_norm": 0.34336078478355425, + "learning_rate": 3.177221850531555e-05, + "loss": 2.6984, + "step": 42327 + }, + { + "epoch": 1.9706916218544126, + "grad_norm": 0.3341352362005721, + "learning_rate": 3.1769696206398366e-05, + "loss": 2.7954, + "step": 42328 + }, + { + "epoch": 1.9707381800405055, + "grad_norm": 0.3354564738731846, + "learning_rate": 3.1767173960985295e-05, + "loss": 2.5707, + "step": 42329 + }, + { + "epoch": 1.9707847382265986, + "grad_norm": 0.34216311393143134, + "learning_rate": 3.176465176908381e-05, + "loss": 2.6724, + "step": 42330 + }, + { + "epoch": 1.9708312964126917, + "grad_norm": 0.35489292819402013, + "learning_rate": 3.176212963070126e-05, + "loss": 2.6469, + "step": 42331 + }, + { + "epoch": 1.9708778545987848, + "grad_norm": 0.3266004936250919, + "learning_rate": 3.175960754584505e-05, + "loss": 2.7711, + "step": 42332 + }, + { + "epoch": 1.970924412784878, + "grad_norm": 0.3294511479748344, + "learning_rate": 3.175708551452261e-05, + "loss": 2.7076, + "step": 42333 + }, + { + "epoch": 1.970970970970971, + "grad_norm": 0.33986841154737985, + "learning_rate": 3.175456353674132e-05, + "loss": 2.696, + "step": 42334 + }, + { + "epoch": 1.9710175291570642, + "grad_norm": 0.36807095118666755, + "learning_rate": 3.17520416125086e-05, + "loss": 2.6367, + "step": 42335 + }, + { + "epoch": 1.971064087343157, + "grad_norm": 0.34432064904397724, + "learning_rate": 3.1749519741831825e-05, + "loss": 2.684, + "step": 42336 + }, + { + "epoch": 1.9711106455292502, + "grad_norm": 0.34773115690479556, + "learning_rate": 3.1746997924718416e-05, + "loss": 2.7001, + "step": 42337 + }, + { + "epoch": 1.971157203715343, + "grad_norm": 0.3647808162638853, + "learning_rate": 3.1744476161175776e-05, + "loss": 2.6533, + "step": 42338 + }, + { + "epoch": 1.9712037619014362, + "grad_norm": 0.33359080063234836, + "learning_rate": 3.174195445121128e-05, + "loss": 2.736, + "step": 42339 + }, + { + "epoch": 1.9712503200875293, + "grad_norm": 0.36753786486743345, + "learning_rate": 3.173943279483238e-05, + "loss": 2.7512, + "step": 42340 + }, + { + "epoch": 1.9712968782736224, + "grad_norm": 0.36500192124037584, + "learning_rate": 3.173691119204643e-05, + "loss": 2.6372, + "step": 42341 + }, + { + "epoch": 1.9713434364597155, + "grad_norm": 0.3616188968937353, + "learning_rate": 3.1734389642860835e-05, + "loss": 2.7719, + "step": 42342 + }, + { + "epoch": 1.9713899946458087, + "grad_norm": 0.3567448314508344, + "learning_rate": 3.173186814728303e-05, + "loss": 2.7328, + "step": 42343 + }, + { + "epoch": 1.9714365528319018, + "grad_norm": 0.38268932585389104, + "learning_rate": 3.1729346705320365e-05, + "loss": 2.6873, + "step": 42344 + }, + { + "epoch": 1.971483111017995, + "grad_norm": 0.32227026008024334, + "learning_rate": 3.172682531698029e-05, + "loss": 2.7592, + "step": 42345 + }, + { + "epoch": 1.9715296692040878, + "grad_norm": 0.3652814323924298, + "learning_rate": 3.172430398227018e-05, + "loss": 2.6774, + "step": 42346 + }, + { + "epoch": 1.971576227390181, + "grad_norm": 0.32763404825033515, + "learning_rate": 3.172178270119743e-05, + "loss": 2.7244, + "step": 42347 + }, + { + "epoch": 1.9716227855762738, + "grad_norm": 0.3379201678972814, + "learning_rate": 3.1719261473769454e-05, + "loss": 2.7429, + "step": 42348 + }, + { + "epoch": 1.971669343762367, + "grad_norm": 0.3105032714211297, + "learning_rate": 3.171674029999364e-05, + "loss": 2.7017, + "step": 42349 + }, + { + "epoch": 1.97171590194846, + "grad_norm": 0.32357452433690403, + 
"learning_rate": 3.1714219179877395e-05, + "loss": 2.6452, + "step": 42350 + }, + { + "epoch": 1.9717624601345531, + "grad_norm": 0.3306045081223332, + "learning_rate": 3.171169811342811e-05, + "loss": 2.6833, + "step": 42351 + }, + { + "epoch": 1.9718090183206463, + "grad_norm": 0.33347646140022125, + "learning_rate": 3.1709177100653184e-05, + "loss": 2.7353, + "step": 42352 + }, + { + "epoch": 1.9718555765067394, + "grad_norm": 0.33495367786340663, + "learning_rate": 3.1706656141560035e-05, + "loss": 2.636, + "step": 42353 + }, + { + "epoch": 1.9719021346928325, + "grad_norm": 0.36460975810290097, + "learning_rate": 3.1704135236156055e-05, + "loss": 2.7746, + "step": 42354 + }, + { + "epoch": 1.9719486928789256, + "grad_norm": 0.3529234207660755, + "learning_rate": 3.170161438444861e-05, + "loss": 2.6934, + "step": 42355 + }, + { + "epoch": 1.9719952510650185, + "grad_norm": 0.3501654347807297, + "learning_rate": 3.169909358644514e-05, + "loss": 2.7375, + "step": 42356 + }, + { + "epoch": 1.9720418092511116, + "grad_norm": 0.36114759085901904, + "learning_rate": 3.1696572842153015e-05, + "loss": 2.6452, + "step": 42357 + }, + { + "epoch": 1.9720883674372045, + "grad_norm": 0.3329010213799461, + "learning_rate": 3.1694052151579664e-05, + "loss": 2.7129, + "step": 42358 + }, + { + "epoch": 1.9721349256232976, + "grad_norm": 0.38013920217613195, + "learning_rate": 3.169153151473247e-05, + "loss": 2.728, + "step": 42359 + }, + { + "epoch": 1.9721814838093907, + "grad_norm": 0.32014031550131716, + "learning_rate": 3.1689010931618804e-05, + "loss": 2.6093, + "step": 42360 + }, + { + "epoch": 1.9722280419954838, + "grad_norm": 0.31513623538646734, + "learning_rate": 3.16864904022461e-05, + "loss": 2.674, + "step": 42361 + }, + { + "epoch": 1.972274600181577, + "grad_norm": 0.34904316314713185, + "learning_rate": 3.1683969926621734e-05, + "loss": 2.7579, + "step": 42362 + }, + { + "epoch": 1.97232115836767, + "grad_norm": 0.3241314910994549, + "learning_rate": 3.1681449504753125e-05, + "loss": 2.7231, + "step": 42363 + }, + { + "epoch": 1.9723677165537632, + "grad_norm": 0.3306032117767775, + "learning_rate": 3.1678929136647657e-05, + "loss": 2.7449, + "step": 42364 + }, + { + "epoch": 1.972414274739856, + "grad_norm": 0.3697634050591422, + "learning_rate": 3.1676408822312706e-05, + "loss": 2.7098, + "step": 42365 + }, + { + "epoch": 1.9724608329259492, + "grad_norm": 0.3461976925424133, + "learning_rate": 3.1673888561755716e-05, + "loss": 2.6096, + "step": 42366 + }, + { + "epoch": 1.9725073911120423, + "grad_norm": 0.32495512941561644, + "learning_rate": 3.167136835498405e-05, + "loss": 2.6294, + "step": 42367 + }, + { + "epoch": 1.9725539492981352, + "grad_norm": 0.3511319188016261, + "learning_rate": 3.1668848202005095e-05, + "loss": 2.659, + "step": 42368 + }, + { + "epoch": 1.9726005074842283, + "grad_norm": 0.34027917611286074, + "learning_rate": 3.166632810282628e-05, + "loss": 2.6399, + "step": 42369 + }, + { + "epoch": 1.9726470656703214, + "grad_norm": 0.36869714141813464, + "learning_rate": 3.166380805745497e-05, + "loss": 2.6905, + "step": 42370 + }, + { + "epoch": 1.9726936238564146, + "grad_norm": 0.3479488806682403, + "learning_rate": 3.16612880658986e-05, + "loss": 2.6757, + "step": 42371 + }, + { + "epoch": 1.9727401820425077, + "grad_norm": 0.32049877479744954, + "learning_rate": 3.1658768128164537e-05, + "loss": 2.6134, + "step": 42372 + }, + { + "epoch": 1.9727867402286008, + "grad_norm": 0.3365461957857391, + "learning_rate": 3.165624824426019e-05, + "loss": 2.7531, + 
"step": 42373 + }, + { + "epoch": 1.972833298414694, + "grad_norm": 0.3368202053084901, + "learning_rate": 3.165372841419294e-05, + "loss": 2.6828, + "step": 42374 + }, + { + "epoch": 1.9728798566007868, + "grad_norm": 0.3455969672357203, + "learning_rate": 3.165120863797017e-05, + "loss": 2.7476, + "step": 42375 + }, + { + "epoch": 1.97292641478688, + "grad_norm": 0.3686311968619892, + "learning_rate": 3.164868891559932e-05, + "loss": 2.7278, + "step": 42376 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.3469279075143901, + "learning_rate": 3.1646169247087755e-05, + "loss": 2.6746, + "step": 42377 + }, + { + "epoch": 1.973019531159066, + "grad_norm": 0.38817142871227883, + "learning_rate": 3.164364963244286e-05, + "loss": 2.6705, + "step": 42378 + }, + { + "epoch": 1.973066089345159, + "grad_norm": 0.32559993388653297, + "learning_rate": 3.1641130071672065e-05, + "loss": 2.6578, + "step": 42379 + }, + { + "epoch": 1.9731126475312522, + "grad_norm": 0.34275399856001804, + "learning_rate": 3.1638610564782724e-05, + "loss": 2.6738, + "step": 42380 + }, + { + "epoch": 1.9731592057173453, + "grad_norm": 0.33284796372723624, + "learning_rate": 3.163609111178227e-05, + "loss": 2.7236, + "step": 42381 + }, + { + "epoch": 1.9732057639034384, + "grad_norm": 0.32944070915667856, + "learning_rate": 3.163357171267808e-05, + "loss": 2.6995, + "step": 42382 + }, + { + "epoch": 1.9732523220895315, + "grad_norm": 0.3442966629455603, + "learning_rate": 3.1631052367477524e-05, + "loss": 2.6919, + "step": 42383 + }, + { + "epoch": 1.9732988802756246, + "grad_norm": 0.36860791316421265, + "learning_rate": 3.1628533076188046e-05, + "loss": 2.6355, + "step": 42384 + }, + { + "epoch": 1.9733454384617175, + "grad_norm": 0.33860098269959954, + "learning_rate": 3.162601383881701e-05, + "loss": 2.6799, + "step": 42385 + }, + { + "epoch": 1.9733919966478106, + "grad_norm": 0.3206787413229438, + "learning_rate": 3.1623494655371804e-05, + "loss": 2.7202, + "step": 42386 + }, + { + "epoch": 1.9734385548339035, + "grad_norm": 0.33416868542290323, + "learning_rate": 3.162097552585984e-05, + "loss": 2.6368, + "step": 42387 + }, + { + "epoch": 1.9734851130199966, + "grad_norm": 0.3319715177119102, + "learning_rate": 3.161845645028848e-05, + "loss": 2.6705, + "step": 42388 + }, + { + "epoch": 1.9735316712060897, + "grad_norm": 0.3316556311225389, + "learning_rate": 3.1615937428665165e-05, + "loss": 2.6312, + "step": 42389 + }, + { + "epoch": 1.9735782293921829, + "grad_norm": 0.33327079894768813, + "learning_rate": 3.161341846099727e-05, + "loss": 2.6215, + "step": 42390 + }, + { + "epoch": 1.973624787578276, + "grad_norm": 0.3590704813241913, + "learning_rate": 3.161089954729214e-05, + "loss": 2.6655, + "step": 42391 + }, + { + "epoch": 1.973671345764369, + "grad_norm": 0.33267623020132486, + "learning_rate": 3.160838068755724e-05, + "loss": 2.6624, + "step": 42392 + }, + { + "epoch": 1.9737179039504622, + "grad_norm": 0.325210397878595, + "learning_rate": 3.160586188179991e-05, + "loss": 2.6549, + "step": 42393 + }, + { + "epoch": 1.9737644621365553, + "grad_norm": 0.34913012838744295, + "learning_rate": 3.160334313002759e-05, + "loss": 2.5945, + "step": 42394 + }, + { + "epoch": 1.9738110203226482, + "grad_norm": 0.343361322581584, + "learning_rate": 3.160082443224763e-05, + "loss": 2.553, + "step": 42395 + }, + { + "epoch": 1.9738575785087413, + "grad_norm": 0.3351794299164396, + "learning_rate": 3.159830578846743e-05, + "loss": 2.5961, + "step": 42396 + }, + { + "epoch": 1.9739041366948342, + "grad_norm": 
0.31952646313234834, + "learning_rate": 3.1595787198694405e-05, + "loss": 2.7019, + "step": 42397 + }, + { + "epoch": 1.9739506948809273, + "grad_norm": 0.34859192283947993, + "learning_rate": 3.159326866293593e-05, + "loss": 2.6435, + "step": 42398 + }, + { + "epoch": 1.9739972530670205, + "grad_norm": 0.3656639163849595, + "learning_rate": 3.1590750181199395e-05, + "loss": 2.7656, + "step": 42399 + }, + { + "epoch": 1.9740438112531136, + "grad_norm": 0.3320056416824077, + "learning_rate": 3.15882317534922e-05, + "loss": 2.7031, + "step": 42400 + }, + { + "epoch": 1.9740903694392067, + "grad_norm": 0.3401196960671389, + "learning_rate": 3.158571337982171e-05, + "loss": 2.647, + "step": 42401 + }, + { + "epoch": 1.9741369276252998, + "grad_norm": 0.3367911688155862, + "learning_rate": 3.158319506019536e-05, + "loss": 2.687, + "step": 42402 + }, + { + "epoch": 1.974183485811393, + "grad_norm": 0.3191833216570826, + "learning_rate": 3.1580676794620515e-05, + "loss": 2.6302, + "step": 42403 + }, + { + "epoch": 1.9742300439974858, + "grad_norm": 0.33739931404782947, + "learning_rate": 3.1578158583104556e-05, + "loss": 2.6448, + "step": 42404 + }, + { + "epoch": 1.974276602183579, + "grad_norm": 0.3412053487719934, + "learning_rate": 3.157564042565489e-05, + "loss": 2.7009, + "step": 42405 + }, + { + "epoch": 1.974323160369672, + "grad_norm": 0.35566840490575097, + "learning_rate": 3.1573122322278904e-05, + "loss": 2.8044, + "step": 42406 + }, + { + "epoch": 1.974369718555765, + "grad_norm": 0.3580673145223296, + "learning_rate": 3.1570604272984e-05, + "loss": 2.7621, + "step": 42407 + }, + { + "epoch": 1.974416276741858, + "grad_norm": 0.33237810149171604, + "learning_rate": 3.156808627777755e-05, + "loss": 2.8062, + "step": 42408 + }, + { + "epoch": 1.9744628349279512, + "grad_norm": 0.3411621410304701, + "learning_rate": 3.156556833666694e-05, + "loss": 2.6766, + "step": 42409 + }, + { + "epoch": 1.9745093931140443, + "grad_norm": 0.34079464710031, + "learning_rate": 3.156305044965959e-05, + "loss": 2.6619, + "step": 42410 + }, + { + "epoch": 1.9745559513001374, + "grad_norm": 0.34384270284882307, + "learning_rate": 3.1560532616762864e-05, + "loss": 2.7276, + "step": 42411 + }, + { + "epoch": 1.9746025094862305, + "grad_norm": 0.3351263043269675, + "learning_rate": 3.1558014837984165e-05, + "loss": 2.6323, + "step": 42412 + }, + { + "epoch": 1.9746490676723236, + "grad_norm": 0.3418308249118231, + "learning_rate": 3.1555497113330865e-05, + "loss": 2.707, + "step": 42413 + }, + { + "epoch": 1.9746956258584165, + "grad_norm": 0.33297681676438634, + "learning_rate": 3.155297944281036e-05, + "loss": 2.7942, + "step": 42414 + }, + { + "epoch": 1.9747421840445096, + "grad_norm": 0.3355202050344483, + "learning_rate": 3.155046182643006e-05, + "loss": 2.6251, + "step": 42415 + }, + { + "epoch": 1.9747887422306027, + "grad_norm": 0.34776165389495156, + "learning_rate": 3.1547944264197305e-05, + "loss": 2.7075, + "step": 42416 + }, + { + "epoch": 1.9748353004166956, + "grad_norm": 0.35157051859327804, + "learning_rate": 3.154542675611955e-05, + "loss": 2.684, + "step": 42417 + }, + { + "epoch": 1.9748818586027888, + "grad_norm": 0.33599194581073877, + "learning_rate": 3.154290930220414e-05, + "loss": 2.6278, + "step": 42418 + }, + { + "epoch": 1.9749284167888819, + "grad_norm": 0.32685908450523754, + "learning_rate": 3.1540391902458456e-05, + "loss": 2.6491, + "step": 42419 + }, + { + "epoch": 1.974974974974975, + "grad_norm": 0.3610483306474527, + "learning_rate": 3.1537874556889926e-05, + "loss": 
2.7805, + "step": 42420 + }, + { + "epoch": 1.975021533161068, + "grad_norm": 0.35094727328968656, + "learning_rate": 3.1535357265505905e-05, + "loss": 2.5747, + "step": 42421 + }, + { + "epoch": 1.9750680913471612, + "grad_norm": 0.339788771034173, + "learning_rate": 3.1532840028313784e-05, + "loss": 2.8034, + "step": 42422 + }, + { + "epoch": 1.9751146495332543, + "grad_norm": 0.36044045151892634, + "learning_rate": 3.153032284532097e-05, + "loss": 2.7614, + "step": 42423 + }, + { + "epoch": 1.9751612077193472, + "grad_norm": 0.34801954169692556, + "learning_rate": 3.152780571653483e-05, + "loss": 2.6064, + "step": 42424 + }, + { + "epoch": 1.9752077659054403, + "grad_norm": 0.3495070756993558, + "learning_rate": 3.152528864196277e-05, + "loss": 2.6822, + "step": 42425 + }, + { + "epoch": 1.9752543240915332, + "grad_norm": 0.3437402838052046, + "learning_rate": 3.1522771621612174e-05, + "loss": 2.7186, + "step": 42426 + }, + { + "epoch": 1.9753008822776263, + "grad_norm": 0.3341237470157015, + "learning_rate": 3.15202546554904e-05, + "loss": 2.6992, + "step": 42427 + }, + { + "epoch": 1.9753474404637195, + "grad_norm": 0.3516329727549677, + "learning_rate": 3.151773774360488e-05, + "loss": 2.7388, + "step": 42428 + }, + { + "epoch": 1.9753939986498126, + "grad_norm": 0.36837180721727525, + "learning_rate": 3.151522088596295e-05, + "loss": 2.7712, + "step": 42429 + }, + { + "epoch": 1.9754405568359057, + "grad_norm": 0.36159452124567315, + "learning_rate": 3.1512704082572056e-05, + "loss": 2.6744, + "step": 42430 + }, + { + "epoch": 1.9754871150219988, + "grad_norm": 0.3418103529071234, + "learning_rate": 3.151018733343955e-05, + "loss": 2.6185, + "step": 42431 + }, + { + "epoch": 1.975533673208092, + "grad_norm": 0.3560314192431567, + "learning_rate": 3.1507670638572794e-05, + "loss": 2.6028, + "step": 42432 + }, + { + "epoch": 1.975580231394185, + "grad_norm": 0.3421606143428804, + "learning_rate": 3.1505153997979236e-05, + "loss": 2.7022, + "step": 42433 + }, + { + "epoch": 1.975626789580278, + "grad_norm": 0.3502078700119911, + "learning_rate": 3.150263741166621e-05, + "loss": 2.6761, + "step": 42434 + }, + { + "epoch": 1.975673347766371, + "grad_norm": 0.3225759308161909, + "learning_rate": 3.150012087964113e-05, + "loss": 2.6962, + "step": 42435 + }, + { + "epoch": 1.975719905952464, + "grad_norm": 0.3251523670835603, + "learning_rate": 3.149760440191137e-05, + "loss": 2.6457, + "step": 42436 + }, + { + "epoch": 1.975766464138557, + "grad_norm": 0.36532526723025555, + "learning_rate": 3.149508797848432e-05, + "loss": 2.6469, + "step": 42437 + }, + { + "epoch": 1.9758130223246502, + "grad_norm": 0.33818219865179766, + "learning_rate": 3.149257160936737e-05, + "loss": 2.6569, + "step": 42438 + }, + { + "epoch": 1.9758595805107433, + "grad_norm": 0.3084663758481475, + "learning_rate": 3.14900552945679e-05, + "loss": 2.5959, + "step": 42439 + }, + { + "epoch": 1.9759061386968364, + "grad_norm": 0.3485725532608974, + "learning_rate": 3.148753903409327e-05, + "loss": 2.6784, + "step": 42440 + }, + { + "epoch": 1.9759526968829295, + "grad_norm": 0.317812908000279, + "learning_rate": 3.1485022827950916e-05, + "loss": 2.5782, + "step": 42441 + }, + { + "epoch": 1.9759992550690226, + "grad_norm": 0.33117021597133456, + "learning_rate": 3.148250667614817e-05, + "loss": 2.649, + "step": 42442 + }, + { + "epoch": 1.9760458132551157, + "grad_norm": 0.34193852258397345, + "learning_rate": 3.1479990578692465e-05, + "loss": 2.6674, + "step": 42443 + }, + { + "epoch": 1.9760923714412086, + 
"grad_norm": 0.3466439960036265, + "learning_rate": 3.1477474535591164e-05, + "loss": 2.7369, + "step": 42444 + }, + { + "epoch": 1.9761389296273018, + "grad_norm": 0.32399849463507996, + "learning_rate": 3.147495854685163e-05, + "loss": 2.7468, + "step": 42445 + }, + { + "epoch": 1.9761854878133946, + "grad_norm": 0.38819923713191645, + "learning_rate": 3.147244261248129e-05, + "loss": 2.6647, + "step": 42446 + }, + { + "epoch": 1.9762320459994878, + "grad_norm": 0.3136902412761001, + "learning_rate": 3.1469926732487495e-05, + "loss": 2.5333, + "step": 42447 + }, + { + "epoch": 1.9762786041855809, + "grad_norm": 0.3220691644355531, + "learning_rate": 3.146741090687764e-05, + "loss": 2.7712, + "step": 42448 + }, + { + "epoch": 1.976325162371674, + "grad_norm": 0.3602104503867682, + "learning_rate": 3.1464895135659113e-05, + "loss": 2.6848, + "step": 42449 + }, + { + "epoch": 1.976371720557767, + "grad_norm": 0.35355100892240526, + "learning_rate": 3.146237941883928e-05, + "loss": 2.7788, + "step": 42450 + }, + { + "epoch": 1.9764182787438602, + "grad_norm": 0.3460350741873387, + "learning_rate": 3.1459863756425554e-05, + "loss": 2.6375, + "step": 42451 + }, + { + "epoch": 1.9764648369299533, + "grad_norm": 0.3138359639722766, + "learning_rate": 3.145734814842528e-05, + "loss": 2.6745, + "step": 42452 + }, + { + "epoch": 1.9765113951160462, + "grad_norm": 0.35630718232856257, + "learning_rate": 3.1454832594845885e-05, + "loss": 2.7376, + "step": 42453 + }, + { + "epoch": 1.9765579533021393, + "grad_norm": 0.358974462880303, + "learning_rate": 3.145231709569472e-05, + "loss": 2.7021, + "step": 42454 + }, + { + "epoch": 1.9766045114882325, + "grad_norm": 0.33619974041036566, + "learning_rate": 3.144980165097916e-05, + "loss": 2.6517, + "step": 42455 + }, + { + "epoch": 1.9766510696743254, + "grad_norm": 0.3387522754085838, + "learning_rate": 3.144728626070663e-05, + "loss": 2.6172, + "step": 42456 + }, + { + "epoch": 1.9766976278604185, + "grad_norm": 0.365971563332737, + "learning_rate": 3.144477092488449e-05, + "loss": 2.7462, + "step": 42457 + }, + { + "epoch": 1.9767441860465116, + "grad_norm": 0.36184744396714885, + "learning_rate": 3.1442255643520094e-05, + "loss": 2.7874, + "step": 42458 + }, + { + "epoch": 1.9767907442326047, + "grad_norm": 0.3622568275763285, + "learning_rate": 3.143974041662087e-05, + "loss": 2.6493, + "step": 42459 + }, + { + "epoch": 1.9768373024186978, + "grad_norm": 0.374010558127115, + "learning_rate": 3.143722524419417e-05, + "loss": 2.762, + "step": 42460 + }, + { + "epoch": 1.976883860604791, + "grad_norm": 0.3449893699642221, + "learning_rate": 3.143471012624739e-05, + "loss": 2.5993, + "step": 42461 + }, + { + "epoch": 1.976930418790884, + "grad_norm": 0.3317702990650646, + "learning_rate": 3.14321950627879e-05, + "loss": 2.6922, + "step": 42462 + }, + { + "epoch": 1.976976976976977, + "grad_norm": 0.349998072915749, + "learning_rate": 3.1429680053823096e-05, + "loss": 2.7126, + "step": 42463 + }, + { + "epoch": 1.97702353516307, + "grad_norm": 0.36591997956374006, + "learning_rate": 3.142716509936035e-05, + "loss": 2.6529, + "step": 42464 + }, + { + "epoch": 1.9770700933491632, + "grad_norm": 0.3311491463307332, + "learning_rate": 3.1424650199407034e-05, + "loss": 2.6694, + "step": 42465 + }, + { + "epoch": 1.977116651535256, + "grad_norm": 0.34646518830033185, + "learning_rate": 3.1422135353970563e-05, + "loss": 2.6417, + "step": 42466 + }, + { + "epoch": 1.9771632097213492, + "grad_norm": 0.35558409518170137, + "learning_rate": 3.141962056305828e-05, + 
"loss": 2.6218, + "step": 42467 + }, + { + "epoch": 1.9772097679074423, + "grad_norm": 0.3509351087308187, + "learning_rate": 3.1417105826677575e-05, + "loss": 2.6089, + "step": 42468 + }, + { + "epoch": 1.9772563260935354, + "grad_norm": 0.3553200025035977, + "learning_rate": 3.1414591144835835e-05, + "loss": 2.6823, + "step": 42469 + }, + { + "epoch": 1.9773028842796285, + "grad_norm": 0.3874737183189078, + "learning_rate": 3.1412076517540436e-05, + "loss": 2.6666, + "step": 42470 + }, + { + "epoch": 1.9773494424657216, + "grad_norm": 0.36003342010478434, + "learning_rate": 3.140956194479877e-05, + "loss": 2.6076, + "step": 42471 + }, + { + "epoch": 1.9773960006518148, + "grad_norm": 0.36176944821321844, + "learning_rate": 3.140704742661822e-05, + "loss": 2.7723, + "step": 42472 + }, + { + "epoch": 1.9774425588379076, + "grad_norm": 0.3635015083194833, + "learning_rate": 3.140453296300614e-05, + "loss": 2.6372, + "step": 42473 + }, + { + "epoch": 1.9774891170240008, + "grad_norm": 0.33596243788019386, + "learning_rate": 3.140201855396991e-05, + "loss": 2.72, + "step": 42474 + }, + { + "epoch": 1.9775356752100937, + "grad_norm": 0.35299935289433165, + "learning_rate": 3.139950419951695e-05, + "loss": 2.6469, + "step": 42475 + }, + { + "epoch": 1.9775822333961868, + "grad_norm": 0.35945609233827674, + "learning_rate": 3.13969898996546e-05, + "loss": 2.7139, + "step": 42476 + }, + { + "epoch": 1.9776287915822799, + "grad_norm": 0.3603457722428538, + "learning_rate": 3.139447565439026e-05, + "loss": 2.6916, + "step": 42477 + }, + { + "epoch": 1.977675349768373, + "grad_norm": 0.3643385240557714, + "learning_rate": 3.139196146373128e-05, + "loss": 2.7598, + "step": 42478 + }, + { + "epoch": 1.9777219079544661, + "grad_norm": 0.3554425643806448, + "learning_rate": 3.138944732768508e-05, + "loss": 2.6729, + "step": 42479 + }, + { + "epoch": 1.9777684661405592, + "grad_norm": 0.3770117331209801, + "learning_rate": 3.138693324625902e-05, + "loss": 2.6757, + "step": 42480 + }, + { + "epoch": 1.9778150243266523, + "grad_norm": 0.3457829233991612, + "learning_rate": 3.138441921946045e-05, + "loss": 2.7147, + "step": 42481 + }, + { + "epoch": 1.9778615825127455, + "grad_norm": 0.34789616281784447, + "learning_rate": 3.1381905247296804e-05, + "loss": 2.678, + "step": 42482 + }, + { + "epoch": 1.9779081406988384, + "grad_norm": 0.3738227399507819, + "learning_rate": 3.137939132977541e-05, + "loss": 2.7241, + "step": 42483 + }, + { + "epoch": 1.9779546988849315, + "grad_norm": 0.3427626004362387, + "learning_rate": 3.137687746690369e-05, + "loss": 2.6973, + "step": 42484 + }, + { + "epoch": 1.9780012570710244, + "grad_norm": 0.37326754144607593, + "learning_rate": 3.1374363658689e-05, + "loss": 2.7487, + "step": 42485 + }, + { + "epoch": 1.9780478152571175, + "grad_norm": 0.3643034973762611, + "learning_rate": 3.137184990513871e-05, + "loss": 2.7041, + "step": 42486 + }, + { + "epoch": 1.9780943734432106, + "grad_norm": 0.35153947243690925, + "learning_rate": 3.136933620626019e-05, + "loss": 2.7154, + "step": 42487 + }, + { + "epoch": 1.9781409316293037, + "grad_norm": 0.40685626787308454, + "learning_rate": 3.136682256206085e-05, + "loss": 2.572, + "step": 42488 + }, + { + "epoch": 1.9781874898153968, + "grad_norm": 0.30665639132242417, + "learning_rate": 3.1364308972548054e-05, + "loss": 2.6297, + "step": 42489 + }, + { + "epoch": 1.97823404800149, + "grad_norm": 0.38678213814996776, + "learning_rate": 3.1361795437729166e-05, + "loss": 2.6633, + "step": 42490 + }, + { + "epoch": 1.978280606187583, + 
"grad_norm": 0.34251261274088435, + "learning_rate": 3.1359281957611565e-05, + "loss": 2.7821, + "step": 42491 + }, + { + "epoch": 1.978327164373676, + "grad_norm": 0.3314139680670004, + "learning_rate": 3.135676853220265e-05, + "loss": 2.7441, + "step": 42492 + }, + { + "epoch": 1.978373722559769, + "grad_norm": 0.34159735337003194, + "learning_rate": 3.1354255161509785e-05, + "loss": 2.7336, + "step": 42493 + }, + { + "epoch": 1.9784202807458622, + "grad_norm": 0.35675578675850445, + "learning_rate": 3.1351741845540325e-05, + "loss": 2.7506, + "step": 42494 + }, + { + "epoch": 1.978466838931955, + "grad_norm": 0.36717446446233754, + "learning_rate": 3.134922858430168e-05, + "loss": 2.7722, + "step": 42495 + }, + { + "epoch": 1.9785133971180482, + "grad_norm": 0.3573234345958742, + "learning_rate": 3.13467153778012e-05, + "loss": 2.7044, + "step": 42496 + }, + { + "epoch": 1.9785599553041413, + "grad_norm": 0.3549933024301, + "learning_rate": 3.134420222604629e-05, + "loss": 2.7327, + "step": 42497 + }, + { + "epoch": 1.9786065134902344, + "grad_norm": 0.33449768312377787, + "learning_rate": 3.1341689129044306e-05, + "loss": 2.7092, + "step": 42498 + }, + { + "epoch": 1.9786530716763275, + "grad_norm": 0.38647899945025177, + "learning_rate": 3.1339176086802616e-05, + "loss": 2.7231, + "step": 42499 + }, + { + "epoch": 1.9786996298624207, + "grad_norm": 0.33401728351190435, + "learning_rate": 3.1336663099328606e-05, + "loss": 2.6745, + "step": 42500 + }, + { + "epoch": 1.9787461880485138, + "grad_norm": 0.35210222183996515, + "learning_rate": 3.133415016662965e-05, + "loss": 2.6239, + "step": 42501 + }, + { + "epoch": 1.9787927462346067, + "grad_norm": 0.37216145418178526, + "learning_rate": 3.1331637288713136e-05, + "loss": 2.5963, + "step": 42502 + }, + { + "epoch": 1.9788393044206998, + "grad_norm": 0.3569218475765682, + "learning_rate": 3.132912446558643e-05, + "loss": 2.6725, + "step": 42503 + }, + { + "epoch": 1.978885862606793, + "grad_norm": 0.3525260647461342, + "learning_rate": 3.132661169725688e-05, + "loss": 2.7247, + "step": 42504 + }, + { + "epoch": 1.9789324207928858, + "grad_norm": 0.3504855940772454, + "learning_rate": 3.132409898373191e-05, + "loss": 2.5622, + "step": 42505 + }, + { + "epoch": 1.978978978978979, + "grad_norm": 0.33964592273808164, + "learning_rate": 3.132158632501885e-05, + "loss": 2.5702, + "step": 42506 + }, + { + "epoch": 1.979025537165072, + "grad_norm": 0.35880084577096094, + "learning_rate": 3.13190737211251e-05, + "loss": 2.7573, + "step": 42507 + }, + { + "epoch": 1.9790720953511651, + "grad_norm": 0.329824015900827, + "learning_rate": 3.1316561172058044e-05, + "loss": 2.7008, + "step": 42508 + }, + { + "epoch": 1.9791186535372582, + "grad_norm": 0.31460430355251673, + "learning_rate": 3.1314048677825014e-05, + "loss": 2.5916, + "step": 42509 + }, + { + "epoch": 1.9791652117233514, + "grad_norm": 0.34826784943471034, + "learning_rate": 3.131153623843344e-05, + "loss": 2.7641, + "step": 42510 + }, + { + "epoch": 1.9792117699094445, + "grad_norm": 0.35112411043600583, + "learning_rate": 3.130902385389065e-05, + "loss": 2.5348, + "step": 42511 + }, + { + "epoch": 1.9792583280955374, + "grad_norm": 0.3304619890511083, + "learning_rate": 3.130651152420403e-05, + "loss": 2.6048, + "step": 42512 + }, + { + "epoch": 1.9793048862816305, + "grad_norm": 0.3380445590442424, + "learning_rate": 3.1303999249380955e-05, + "loss": 2.6219, + "step": 42513 + }, + { + "epoch": 1.9793514444677234, + "grad_norm": 0.37588458230140115, + "learning_rate": 
3.1301487029428814e-05, + "loss": 2.6952, + "step": 42514 + }, + { + "epoch": 1.9793980026538165, + "grad_norm": 0.36328702533169877, + "learning_rate": 3.1298974864354956e-05, + "loss": 2.6708, + "step": 42515 + }, + { + "epoch": 1.9794445608399096, + "grad_norm": 0.33922337291287463, + "learning_rate": 3.129646275416678e-05, + "loss": 2.6832, + "step": 42516 + }, + { + "epoch": 1.9794911190260027, + "grad_norm": 0.34210211490608494, + "learning_rate": 3.129395069887162e-05, + "loss": 2.7256, + "step": 42517 + }, + { + "epoch": 1.9795376772120958, + "grad_norm": 0.35990812516132253, + "learning_rate": 3.129143869847688e-05, + "loss": 2.628, + "step": 42518 + }, + { + "epoch": 1.979584235398189, + "grad_norm": 0.37107169216910124, + "learning_rate": 3.1288926752989924e-05, + "loss": 2.6347, + "step": 42519 + }, + { + "epoch": 1.979630793584282, + "grad_norm": 0.31718782772016224, + "learning_rate": 3.1286414862418126e-05, + "loss": 2.5423, + "step": 42520 + }, + { + "epoch": 1.9796773517703752, + "grad_norm": 0.32767111809297333, + "learning_rate": 3.1283903026768866e-05, + "loss": 2.5254, + "step": 42521 + }, + { + "epoch": 1.979723909956468, + "grad_norm": 0.36278674305810216, + "learning_rate": 3.128139124604948e-05, + "loss": 2.6862, + "step": 42522 + }, + { + "epoch": 1.9797704681425612, + "grad_norm": 0.3632558252495857, + "learning_rate": 3.12788795202674e-05, + "loss": 2.6593, + "step": 42523 + }, + { + "epoch": 1.979817026328654, + "grad_norm": 0.37499860269959384, + "learning_rate": 3.127636784942994e-05, + "loss": 2.759, + "step": 42524 + }, + { + "epoch": 1.9798635845147472, + "grad_norm": 0.3588770529110853, + "learning_rate": 3.1273856233544505e-05, + "loss": 2.6037, + "step": 42525 + }, + { + "epoch": 1.9799101427008403, + "grad_norm": 0.37096980973842275, + "learning_rate": 3.127134467261847e-05, + "loss": 2.7163, + "step": 42526 + }, + { + "epoch": 1.9799567008869334, + "grad_norm": 0.3292620471018697, + "learning_rate": 3.126883316665918e-05, + "loss": 2.5825, + "step": 42527 + }, + { + "epoch": 1.9800032590730265, + "grad_norm": 0.3485019686629462, + "learning_rate": 3.126632171567403e-05, + "loss": 2.6489, + "step": 42528 + }, + { + "epoch": 1.9800498172591197, + "grad_norm": 0.34888551761599534, + "learning_rate": 3.1263810319670374e-05, + "loss": 2.7212, + "step": 42529 + }, + { + "epoch": 1.9800963754452128, + "grad_norm": 0.3616019882420528, + "learning_rate": 3.126129897865557e-05, + "loss": 2.7072, + "step": 42530 + }, + { + "epoch": 1.980142933631306, + "grad_norm": 0.3494348396155263, + "learning_rate": 3.125878769263703e-05, + "loss": 2.6833, + "step": 42531 + }, + { + "epoch": 1.9801894918173988, + "grad_norm": 0.33551463461332887, + "learning_rate": 3.125627646162208e-05, + "loss": 2.7162, + "step": 42532 + }, + { + "epoch": 1.980236050003492, + "grad_norm": 0.337562823288016, + "learning_rate": 3.1253765285618134e-05, + "loss": 2.6277, + "step": 42533 + }, + { + "epoch": 1.9802826081895848, + "grad_norm": 0.3378440933448553, + "learning_rate": 3.125125416463254e-05, + "loss": 2.6469, + "step": 42534 + }, + { + "epoch": 1.980329166375678, + "grad_norm": 0.31388531204758363, + "learning_rate": 3.124874309867265e-05, + "loss": 2.6272, + "step": 42535 + }, + { + "epoch": 1.980375724561771, + "grad_norm": 0.3332253608174351, + "learning_rate": 3.124623208774587e-05, + "loss": 2.7239, + "step": 42536 + }, + { + "epoch": 1.9804222827478641, + "grad_norm": 0.3480291476909231, + "learning_rate": 3.124372113185954e-05, + "loss": 2.6722, + "step": 42537 + }, + { + 
"epoch": 1.9804688409339573, + "grad_norm": 0.3355953823038032, + "learning_rate": 3.124121023102104e-05, + "loss": 2.644, + "step": 42538 + }, + { + "epoch": 1.9805153991200504, + "grad_norm": 0.33455472184346674, + "learning_rate": 3.1238699385237754e-05, + "loss": 2.7679, + "step": 42539 + }, + { + "epoch": 1.9805619573061435, + "grad_norm": 0.32540451699663825, + "learning_rate": 3.123618859451702e-05, + "loss": 2.5476, + "step": 42540 + }, + { + "epoch": 1.9806085154922364, + "grad_norm": 0.3745597971736504, + "learning_rate": 3.123367785886624e-05, + "loss": 2.6848, + "step": 42541 + }, + { + "epoch": 1.9806550736783295, + "grad_norm": 0.3900440477321678, + "learning_rate": 3.123116717829274e-05, + "loss": 2.7339, + "step": 42542 + }, + { + "epoch": 1.9807016318644226, + "grad_norm": 0.36583971939341164, + "learning_rate": 3.1228656552803945e-05, + "loss": 2.7009, + "step": 42543 + }, + { + "epoch": 1.9807481900505155, + "grad_norm": 0.35639737862235776, + "learning_rate": 3.122614598240719e-05, + "loss": 2.7348, + "step": 42544 + }, + { + "epoch": 1.9807947482366086, + "grad_norm": 0.333642350546161, + "learning_rate": 3.122363546710982e-05, + "loss": 2.786, + "step": 42545 + }, + { + "epoch": 1.9808413064227017, + "grad_norm": 0.34910721808305895, + "learning_rate": 3.122112500691926e-05, + "loss": 2.7081, + "step": 42546 + }, + { + "epoch": 1.9808878646087948, + "grad_norm": 0.3605750351546559, + "learning_rate": 3.1218614601842846e-05, + "loss": 2.6937, + "step": 42547 + }, + { + "epoch": 1.980934422794888, + "grad_norm": 0.3212870054719807, + "learning_rate": 3.1216104251887925e-05, + "loss": 2.6232, + "step": 42548 + }, + { + "epoch": 1.980980980980981, + "grad_norm": 0.3622637421028203, + "learning_rate": 3.121359395706191e-05, + "loss": 2.7541, + "step": 42549 + }, + { + "epoch": 1.9810275391670742, + "grad_norm": 0.3340199321164657, + "learning_rate": 3.1211083717372136e-05, + "loss": 2.7456, + "step": 42550 + }, + { + "epoch": 1.981074097353167, + "grad_norm": 0.35559210183197787, + "learning_rate": 3.120857353282597e-05, + "loss": 2.7253, + "step": 42551 + }, + { + "epoch": 1.9811206555392602, + "grad_norm": 0.3674106948636231, + "learning_rate": 3.120606340343081e-05, + "loss": 2.6233, + "step": 42552 + }, + { + "epoch": 1.9811672137253533, + "grad_norm": 0.33358788034533493, + "learning_rate": 3.1203553329193994e-05, + "loss": 2.6596, + "step": 42553 + }, + { + "epoch": 1.9812137719114462, + "grad_norm": 0.32837528972173285, + "learning_rate": 3.12010433101229e-05, + "loss": 2.5959, + "step": 42554 + }, + { + "epoch": 1.9812603300975393, + "grad_norm": 0.3716683556448038, + "learning_rate": 3.119853334622488e-05, + "loss": 2.7135, + "step": 42555 + }, + { + "epoch": 1.9813068882836324, + "grad_norm": 0.36368253960282365, + "learning_rate": 3.1196023437507324e-05, + "loss": 2.7288, + "step": 42556 + }, + { + "epoch": 1.9813534464697256, + "grad_norm": 0.33252904542186085, + "learning_rate": 3.1193513583977597e-05, + "loss": 2.7291, + "step": 42557 + }, + { + "epoch": 1.9814000046558187, + "grad_norm": 0.35597941661212124, + "learning_rate": 3.119100378564302e-05, + "loss": 2.6609, + "step": 42558 + }, + { + "epoch": 1.9814465628419118, + "grad_norm": 0.3343812525658604, + "learning_rate": 3.118849404251103e-05, + "loss": 2.6894, + "step": 42559 + }, + { + "epoch": 1.981493121028005, + "grad_norm": 0.3455194003525995, + "learning_rate": 3.118598435458893e-05, + "loss": 2.5929, + "step": 42560 + }, + { + "epoch": 1.9815396792140978, + "grad_norm": 0.3275358447091224, + 
"learning_rate": 3.1183474721884134e-05, + "loss": 2.5685, + "step": 42561 + }, + { + "epoch": 1.981586237400191, + "grad_norm": 0.3206594602950413, + "learning_rate": 3.1180965144403975e-05, + "loss": 2.6085, + "step": 42562 + }, + { + "epoch": 1.9816327955862838, + "grad_norm": 0.3293203942113693, + "learning_rate": 3.1178455622155834e-05, + "loss": 2.6606, + "step": 42563 + }, + { + "epoch": 1.981679353772377, + "grad_norm": 0.33772921127435107, + "learning_rate": 3.117594615514706e-05, + "loss": 2.7404, + "step": 42564 + }, + { + "epoch": 1.98172591195847, + "grad_norm": 0.34967595904415466, + "learning_rate": 3.117343674338505e-05, + "loss": 2.7329, + "step": 42565 + }, + { + "epoch": 1.9817724701445631, + "grad_norm": 0.31762116734167395, + "learning_rate": 3.117092738687713e-05, + "loss": 2.6459, + "step": 42566 + }, + { + "epoch": 1.9818190283306563, + "grad_norm": 0.33725072710754894, + "learning_rate": 3.11684180856307e-05, + "loss": 2.6922, + "step": 42567 + }, + { + "epoch": 1.9818655865167494, + "grad_norm": 0.35231005148672945, + "learning_rate": 3.116590883965308e-05, + "loss": 2.6465, + "step": 42568 + }, + { + "epoch": 1.9819121447028425, + "grad_norm": 0.33170278124336433, + "learning_rate": 3.116339964895169e-05, + "loss": 2.5692, + "step": 42569 + }, + { + "epoch": 1.9819587028889356, + "grad_norm": 0.3515143910851972, + "learning_rate": 3.1160890513533864e-05, + "loss": 2.6936, + "step": 42570 + }, + { + "epoch": 1.9820052610750285, + "grad_norm": 0.3331785756918974, + "learning_rate": 3.115838143340695e-05, + "loss": 2.6637, + "step": 42571 + }, + { + "epoch": 1.9820518192611216, + "grad_norm": 0.35066339310743133, + "learning_rate": 3.1155872408578344e-05, + "loss": 2.6142, + "step": 42572 + }, + { + "epoch": 1.9820983774472145, + "grad_norm": 0.3330161306832899, + "learning_rate": 3.115336343905538e-05, + "loss": 2.601, + "step": 42573 + }, + { + "epoch": 1.9821449356333076, + "grad_norm": 0.3354026977942831, + "learning_rate": 3.1150854524845454e-05, + "loss": 2.7121, + "step": 42574 + }, + { + "epoch": 1.9821914938194007, + "grad_norm": 0.3419959361683571, + "learning_rate": 3.1148345665955915e-05, + "loss": 2.7745, + "step": 42575 + }, + { + "epoch": 1.9822380520054939, + "grad_norm": 0.3403824783981389, + "learning_rate": 3.1145836862394114e-05, + "loss": 2.7641, + "step": 42576 + }, + { + "epoch": 1.982284610191587, + "grad_norm": 0.3297485656108009, + "learning_rate": 3.114332811416743e-05, + "loss": 2.6718, + "step": 42577 + }, + { + "epoch": 1.98233116837768, + "grad_norm": 0.33958074740478217, + "learning_rate": 3.114081942128322e-05, + "loss": 2.5265, + "step": 42578 + }, + { + "epoch": 1.9823777265637732, + "grad_norm": 0.34897813026316205, + "learning_rate": 3.1138310783748846e-05, + "loss": 2.6642, + "step": 42579 + }, + { + "epoch": 1.982424284749866, + "grad_norm": 0.35269868392553305, + "learning_rate": 3.113580220157168e-05, + "loss": 2.7046, + "step": 42580 + }, + { + "epoch": 1.9824708429359592, + "grad_norm": 0.33333955281643085, + "learning_rate": 3.113329367475906e-05, + "loss": 2.6811, + "step": 42581 + }, + { + "epoch": 1.9825174011220523, + "grad_norm": 0.35352027936918456, + "learning_rate": 3.1130785203318376e-05, + "loss": 2.6498, + "step": 42582 + }, + { + "epoch": 1.9825639593081452, + "grad_norm": 0.3283568489826034, + "learning_rate": 3.1128276787256984e-05, + "loss": 2.5773, + "step": 42583 + }, + { + "epoch": 1.9826105174942383, + "grad_norm": 0.33310841927945617, + "learning_rate": 3.1125768426582216e-05, + "loss": 2.7254, + 
"step": 42584 + }, + { + "epoch": 1.9826570756803314, + "grad_norm": 0.3398804717806921, + "learning_rate": 3.112326012130148e-05, + "loss": 2.6292, + "step": 42585 + }, + { + "epoch": 1.9827036338664246, + "grad_norm": 0.33694511257237614, + "learning_rate": 3.11207518714221e-05, + "loss": 2.7453, + "step": 42586 + }, + { + "epoch": 1.9827501920525177, + "grad_norm": 0.3360944156206392, + "learning_rate": 3.111824367695146e-05, + "loss": 2.6142, + "step": 42587 + }, + { + "epoch": 1.9827967502386108, + "grad_norm": 0.3536678802376738, + "learning_rate": 3.111573553789693e-05, + "loss": 2.6591, + "step": 42588 + }, + { + "epoch": 1.982843308424704, + "grad_norm": 0.3556642801109649, + "learning_rate": 3.1113227454265836e-05, + "loss": 2.7431, + "step": 42589 + }, + { + "epoch": 1.9828898666107968, + "grad_norm": 0.33280651358222807, + "learning_rate": 3.111071942606557e-05, + "loss": 2.6625, + "step": 42590 + }, + { + "epoch": 1.98293642479689, + "grad_norm": 0.3404975954698211, + "learning_rate": 3.110821145330347e-05, + "loss": 2.6522, + "step": 42591 + }, + { + "epoch": 1.982982982982983, + "grad_norm": 0.33471445869628835, + "learning_rate": 3.110570353598692e-05, + "loss": 2.5774, + "step": 42592 + }, + { + "epoch": 1.983029541169076, + "grad_norm": 0.34507718431044443, + "learning_rate": 3.110319567412328e-05, + "loss": 2.5401, + "step": 42593 + }, + { + "epoch": 1.983076099355169, + "grad_norm": 0.33142601082076406, + "learning_rate": 3.1100687867719867e-05, + "loss": 2.6364, + "step": 42594 + }, + { + "epoch": 1.9831226575412622, + "grad_norm": 0.3181486516162308, + "learning_rate": 3.109818011678409e-05, + "loss": 2.6935, + "step": 42595 + }, + { + "epoch": 1.9831692157273553, + "grad_norm": 0.361386822479596, + "learning_rate": 3.1095672421323286e-05, + "loss": 2.5885, + "step": 42596 + }, + { + "epoch": 1.9832157739134484, + "grad_norm": 0.33484716362809513, + "learning_rate": 3.109316478134483e-05, + "loss": 2.5833, + "step": 42597 + }, + { + "epoch": 1.9832623320995415, + "grad_norm": 0.3422535753758215, + "learning_rate": 3.1090657196856074e-05, + "loss": 2.6468, + "step": 42598 + }, + { + "epoch": 1.9833088902856346, + "grad_norm": 0.3421990316752083, + "learning_rate": 3.108814966786436e-05, + "loss": 2.6608, + "step": 42599 + }, + { + "epoch": 1.9833554484717275, + "grad_norm": 0.33933386176683894, + "learning_rate": 3.108564219437709e-05, + "loss": 2.5933, + "step": 42600 + }, + { + "epoch": 1.9834020066578206, + "grad_norm": 0.3633026794271839, + "learning_rate": 3.108313477640159e-05, + "loss": 2.6983, + "step": 42601 + }, + { + "epoch": 1.9834485648439135, + "grad_norm": 0.35586612054038186, + "learning_rate": 3.108062741394521e-05, + "loss": 2.6434, + "step": 42602 + }, + { + "epoch": 1.9834951230300066, + "grad_norm": 0.34314736630629017, + "learning_rate": 3.1078120107015326e-05, + "loss": 2.7218, + "step": 42603 + }, + { + "epoch": 1.9835416812160998, + "grad_norm": 0.31494933987749674, + "learning_rate": 3.1075612855619304e-05, + "loss": 2.7702, + "step": 42604 + }, + { + "epoch": 1.9835882394021929, + "grad_norm": 0.349295058819523, + "learning_rate": 3.10731056597645e-05, + "loss": 2.6953, + "step": 42605 + }, + { + "epoch": 1.983634797588286, + "grad_norm": 0.3499625489584115, + "learning_rate": 3.107059851945826e-05, + "loss": 2.7551, + "step": 42606 + }, + { + "epoch": 1.983681355774379, + "grad_norm": 0.34543010721970624, + "learning_rate": 3.106809143470794e-05, + "loss": 2.6807, + "step": 42607 + }, + { + "epoch": 1.9837279139604722, + "grad_norm": 
0.3584515865284674, + "learning_rate": 3.106558440552092e-05, + "loss": 2.5826, + "step": 42608 + }, + { + "epoch": 1.9837744721465653, + "grad_norm": 0.3261304627616747, + "learning_rate": 3.106307743190452e-05, + "loss": 2.6358, + "step": 42609 + }, + { + "epoch": 1.9838210303326582, + "grad_norm": 0.3442324493553181, + "learning_rate": 3.106057051386615e-05, + "loss": 2.6396, + "step": 42610 + }, + { + "epoch": 1.9838675885187513, + "grad_norm": 0.35147794303117674, + "learning_rate": 3.105806365141314e-05, + "loss": 2.6792, + "step": 42611 + }, + { + "epoch": 1.9839141467048442, + "grad_norm": 0.34165768531783014, + "learning_rate": 3.105555684455282e-05, + "loss": 2.6977, + "step": 42612 + }, + { + "epoch": 1.9839607048909373, + "grad_norm": 0.3533671552997257, + "learning_rate": 3.1053050093292594e-05, + "loss": 2.6215, + "step": 42613 + }, + { + "epoch": 1.9840072630770305, + "grad_norm": 0.3624274366775842, + "learning_rate": 3.1050543397639795e-05, + "loss": 2.7578, + "step": 42614 + }, + { + "epoch": 1.9840538212631236, + "grad_norm": 0.34071732474342686, + "learning_rate": 3.1048036757601796e-05, + "loss": 2.6314, + "step": 42615 + }, + { + "epoch": 1.9841003794492167, + "grad_norm": 0.33211484605404834, + "learning_rate": 3.104553017318593e-05, + "loss": 2.5881, + "step": 42616 + }, + { + "epoch": 1.9841469376353098, + "grad_norm": 0.3352342351171511, + "learning_rate": 3.104302364439956e-05, + "loss": 2.647, + "step": 42617 + }, + { + "epoch": 1.984193495821403, + "grad_norm": 0.32197621374642804, + "learning_rate": 3.104051717125006e-05, + "loss": 2.7114, + "step": 42618 + }, + { + "epoch": 1.9842400540074958, + "grad_norm": 0.3349047730428873, + "learning_rate": 3.103801075374476e-05, + "loss": 2.7175, + "step": 42619 + }, + { + "epoch": 1.984286612193589, + "grad_norm": 0.3439885829302152, + "learning_rate": 3.1035504391891044e-05, + "loss": 2.6414, + "step": 42620 + }, + { + "epoch": 1.984333170379682, + "grad_norm": 0.34992724657676477, + "learning_rate": 3.103299808569626e-05, + "loss": 2.6887, + "step": 42621 + }, + { + "epoch": 1.984379728565775, + "grad_norm": 0.3383326416724395, + "learning_rate": 3.103049183516773e-05, + "loss": 2.7095, + "step": 42622 + }, + { + "epoch": 1.984426286751868, + "grad_norm": 0.3235175329729265, + "learning_rate": 3.102798564031287e-05, + "loss": 2.7562, + "step": 42623 + }, + { + "epoch": 1.9844728449379612, + "grad_norm": 0.35875278272686295, + "learning_rate": 3.1025479501139e-05, + "loss": 2.6496, + "step": 42624 + }, + { + "epoch": 1.9845194031240543, + "grad_norm": 0.3471064183130308, + "learning_rate": 3.102297341765345e-05, + "loss": 2.7666, + "step": 42625 + }, + { + "epoch": 1.9845659613101474, + "grad_norm": 0.32546613880900876, + "learning_rate": 3.1020467389863626e-05, + "loss": 2.6048, + "step": 42626 + }, + { + "epoch": 1.9846125194962405, + "grad_norm": 0.3218657951571162, + "learning_rate": 3.101796141777685e-05, + "loss": 2.6054, + "step": 42627 + }, + { + "epoch": 1.9846590776823336, + "grad_norm": 0.33643754371527584, + "learning_rate": 3.10154555014005e-05, + "loss": 2.6634, + "step": 42628 + }, + { + "epoch": 1.9847056358684265, + "grad_norm": 0.34500117911400374, + "learning_rate": 3.101294964074191e-05, + "loss": 2.6219, + "step": 42629 + }, + { + "epoch": 1.9847521940545196, + "grad_norm": 0.36190307066130484, + "learning_rate": 3.101044383580844e-05, + "loss": 2.6451, + "step": 42630 + }, + { + "epoch": 1.9847987522406128, + "grad_norm": 0.3547054309710002, + "learning_rate": 3.1007938086607454e-05, + "loss": 
2.6501, + "step": 42631 + }, + { + "epoch": 1.9848453104267056, + "grad_norm": 0.33303986414738607, + "learning_rate": 3.1005432393146284e-05, + "loss": 2.6971, + "step": 42632 + }, + { + "epoch": 1.9848918686127988, + "grad_norm": 0.35786791305859456, + "learning_rate": 3.1002926755432316e-05, + "loss": 2.6148, + "step": 42633 + }, + { + "epoch": 1.9849384267988919, + "grad_norm": 0.3193378896439978, + "learning_rate": 3.100042117347288e-05, + "loss": 2.6908, + "step": 42634 + }, + { + "epoch": 1.984984984984985, + "grad_norm": 0.34576867334606093, + "learning_rate": 3.099791564727533e-05, + "loss": 2.705, + "step": 42635 + }, + { + "epoch": 1.985031543171078, + "grad_norm": 0.34654427009823546, + "learning_rate": 3.0995410176847034e-05, + "loss": 2.7044, + "step": 42636 + }, + { + "epoch": 1.9850781013571712, + "grad_norm": 0.3384852027750259, + "learning_rate": 3.099290476219532e-05, + "loss": 2.6967, + "step": 42637 + }, + { + "epoch": 1.9851246595432643, + "grad_norm": 0.314738238974341, + "learning_rate": 3.0990399403327583e-05, + "loss": 2.6973, + "step": 42638 + }, + { + "epoch": 1.9851712177293572, + "grad_norm": 0.3412851269047484, + "learning_rate": 3.0987894100251146e-05, + "loss": 2.6416, + "step": 42639 + }, + { + "epoch": 1.9852177759154503, + "grad_norm": 0.31449299862901897, + "learning_rate": 3.098538885297335e-05, + "loss": 2.7146, + "step": 42640 + }, + { + "epoch": 1.9852643341015432, + "grad_norm": 0.33451905876012633, + "learning_rate": 3.0982883661501586e-05, + "loss": 2.7424, + "step": 42641 + }, + { + "epoch": 1.9853108922876364, + "grad_norm": 0.3199409129292572, + "learning_rate": 3.098037852584317e-05, + "loss": 2.6843, + "step": 42642 + }, + { + "epoch": 1.9853574504737295, + "grad_norm": 0.3349746131492533, + "learning_rate": 3.097787344600547e-05, + "loss": 2.7046, + "step": 42643 + }, + { + "epoch": 1.9854040086598226, + "grad_norm": 0.3362071191977415, + "learning_rate": 3.097536842199585e-05, + "loss": 2.7364, + "step": 42644 + }, + { + "epoch": 1.9854505668459157, + "grad_norm": 0.32947040399495536, + "learning_rate": 3.097286345382163e-05, + "loss": 2.6606, + "step": 42645 + }, + { + "epoch": 1.9854971250320088, + "grad_norm": 0.331014814590429, + "learning_rate": 3.0970358541490195e-05, + "loss": 2.5223, + "step": 42646 + }, + { + "epoch": 1.985543683218102, + "grad_norm": 0.31513816553556806, + "learning_rate": 3.0967853685008884e-05, + "loss": 2.6763, + "step": 42647 + }, + { + "epoch": 1.985590241404195, + "grad_norm": 0.30745836213152755, + "learning_rate": 3.096534888438503e-05, + "loss": 2.6405, + "step": 42648 + }, + { + "epoch": 1.985636799590288, + "grad_norm": 0.3368963126910421, + "learning_rate": 3.0962844139626024e-05, + "loss": 2.7283, + "step": 42649 + }, + { + "epoch": 1.985683357776381, + "grad_norm": 0.33227639186725466, + "learning_rate": 3.096033945073917e-05, + "loss": 2.6641, + "step": 42650 + }, + { + "epoch": 1.985729915962474, + "grad_norm": 0.3358902635256517, + "learning_rate": 3.0957834817731865e-05, + "loss": 2.6818, + "step": 42651 + }, + { + "epoch": 1.985776474148567, + "grad_norm": 0.32860484137525015, + "learning_rate": 3.0955330240611445e-05, + "loss": 2.6737, + "step": 42652 + }, + { + "epoch": 1.9858230323346602, + "grad_norm": 0.34579716550087103, + "learning_rate": 3.095282571938524e-05, + "loss": 2.6552, + "step": 42653 + }, + { + "epoch": 1.9858695905207533, + "grad_norm": 0.3160915275414252, + "learning_rate": 3.095032125406062e-05, + "loss": 2.6139, + "step": 42654 + }, + { + "epoch": 1.9859161487068464, + 
"grad_norm": 0.33450351953443164, + "learning_rate": 3.0947816844644917e-05, + "loss": 2.6258, + "step": 42655 + }, + { + "epoch": 1.9859627068929395, + "grad_norm": 0.34545611702669177, + "learning_rate": 3.094531249114552e-05, + "loss": 2.6415, + "step": 42656 + }, + { + "epoch": 1.9860092650790326, + "grad_norm": 0.3471367035035473, + "learning_rate": 3.094280819356974e-05, + "loss": 2.6873, + "step": 42657 + }, + { + "epoch": 1.9860558232651258, + "grad_norm": 0.33967148143134873, + "learning_rate": 3.094030395192493e-05, + "loss": 2.664, + "step": 42658 + }, + { + "epoch": 1.9861023814512186, + "grad_norm": 0.36072721230186344, + "learning_rate": 3.0937799766218467e-05, + "loss": 2.6464, + "step": 42659 + }, + { + "epoch": 1.9861489396373118, + "grad_norm": 0.3462254864607228, + "learning_rate": 3.0935295636457685e-05, + "loss": 2.4253, + "step": 42660 + }, + { + "epoch": 1.9861954978234047, + "grad_norm": 0.31851413872785794, + "learning_rate": 3.093279156264991e-05, + "loss": 2.5974, + "step": 42661 + }, + { + "epoch": 1.9862420560094978, + "grad_norm": 0.33782007998293584, + "learning_rate": 3.093028754480253e-05, + "loss": 2.6099, + "step": 42662 + }, + { + "epoch": 1.9862886141955909, + "grad_norm": 0.35289367436685826, + "learning_rate": 3.092778358292286e-05, + "loss": 2.5955, + "step": 42663 + }, + { + "epoch": 1.986335172381684, + "grad_norm": 0.3491682346928652, + "learning_rate": 3.092527967701829e-05, + "loss": 2.6674, + "step": 42664 + }, + { + "epoch": 1.9863817305677771, + "grad_norm": 0.34435037228252496, + "learning_rate": 3.092277582709614e-05, + "loss": 2.6594, + "step": 42665 + }, + { + "epoch": 1.9864282887538702, + "grad_norm": 0.34769207901940274, + "learning_rate": 3.092027203316376e-05, + "loss": 2.6744, + "step": 42666 + }, + { + "epoch": 1.9864748469399633, + "grad_norm": 0.33686046726263186, + "learning_rate": 3.091776829522851e-05, + "loss": 2.6507, + "step": 42667 + }, + { + "epoch": 1.9865214051260562, + "grad_norm": 0.35128309509329597, + "learning_rate": 3.091526461329771e-05, + "loss": 2.7371, + "step": 42668 + }, + { + "epoch": 1.9865679633121494, + "grad_norm": 0.33382711735359866, + "learning_rate": 3.091276098737874e-05, + "loss": 2.7896, + "step": 42669 + }, + { + "epoch": 1.9866145214982425, + "grad_norm": 0.36190934466941693, + "learning_rate": 3.091025741747895e-05, + "loss": 2.5844, + "step": 42670 + }, + { + "epoch": 1.9866610796843354, + "grad_norm": 0.362369503995322, + "learning_rate": 3.090775390360564e-05, + "loss": 2.6844, + "step": 42671 + }, + { + "epoch": 1.9867076378704285, + "grad_norm": 0.3404877522271013, + "learning_rate": 3.090525044576622e-05, + "loss": 2.6182, + "step": 42672 + }, + { + "epoch": 1.9867541960565216, + "grad_norm": 0.3724269344069182, + "learning_rate": 3.090274704396799e-05, + "loss": 2.6615, + "step": 42673 + }, + { + "epoch": 1.9868007542426147, + "grad_norm": 0.3427775086162828, + "learning_rate": 3.0900243698218336e-05, + "loss": 2.6438, + "step": 42674 + }, + { + "epoch": 1.9868473124287078, + "grad_norm": 0.3243339980774307, + "learning_rate": 3.089774040852458e-05, + "loss": 2.7039, + "step": 42675 + }, + { + "epoch": 1.986893870614801, + "grad_norm": 0.34582926099983846, + "learning_rate": 3.089523717489405e-05, + "loss": 2.678, + "step": 42676 + }, + { + "epoch": 1.986940428800894, + "grad_norm": 0.3478947992593083, + "learning_rate": 3.0892733997334146e-05, + "loss": 2.7018, + "step": 42677 + }, + { + "epoch": 1.986986986986987, + "grad_norm": 0.31316361850861885, + "learning_rate": 
3.0890230875852186e-05, + "loss": 2.695, + "step": 42678 + }, + { + "epoch": 1.98703354517308, + "grad_norm": 0.3302101972528018, + "learning_rate": 3.08877278104555e-05, + "loss": 2.6917, + "step": 42679 + }, + { + "epoch": 1.9870801033591732, + "grad_norm": 0.32187557932256244, + "learning_rate": 3.0885224801151455e-05, + "loss": 2.7041, + "step": 42680 + }, + { + "epoch": 1.987126661545266, + "grad_norm": 0.3233860895156023, + "learning_rate": 3.088272184794738e-05, + "loss": 2.6666, + "step": 42681 + }, + { + "epoch": 1.9871732197313592, + "grad_norm": 0.31777555607099445, + "learning_rate": 3.088021895085066e-05, + "loss": 2.6738, + "step": 42682 + }, + { + "epoch": 1.9872197779174523, + "grad_norm": 0.3244953168793144, + "learning_rate": 3.0877716109868604e-05, + "loss": 2.6828, + "step": 42683 + }, + { + "epoch": 1.9872663361035454, + "grad_norm": 0.35337954493367557, + "learning_rate": 3.087521332500854e-05, + "loss": 2.5962, + "step": 42684 + }, + { + "epoch": 1.9873128942896385, + "grad_norm": 0.3456417598040945, + "learning_rate": 3.087271059627788e-05, + "loss": 2.7052, + "step": 42685 + }, + { + "epoch": 1.9873594524757316, + "grad_norm": 0.37657069453433567, + "learning_rate": 3.08702079236839e-05, + "loss": 2.6355, + "step": 42686 + }, + { + "epoch": 1.9874060106618248, + "grad_norm": 0.3523948903747657, + "learning_rate": 3.0867705307234e-05, + "loss": 2.6352, + "step": 42687 + }, + { + "epoch": 1.9874525688479177, + "grad_norm": 0.3728283290157171, + "learning_rate": 3.086520274693548e-05, + "loss": 2.6544, + "step": 42688 + }, + { + "epoch": 1.9874991270340108, + "grad_norm": 0.3448067463399794, + "learning_rate": 3.0862700242795704e-05, + "loss": 2.659, + "step": 42689 + }, + { + "epoch": 1.9875456852201037, + "grad_norm": 0.3697032960367559, + "learning_rate": 3.086019779482204e-05, + "loss": 2.6422, + "step": 42690 + }, + { + "epoch": 1.9875922434061968, + "grad_norm": 0.35321471951126565, + "learning_rate": 3.0857695403021794e-05, + "loss": 2.6093, + "step": 42691 + }, + { + "epoch": 1.98763880159229, + "grad_norm": 0.3326211460714312, + "learning_rate": 3.085519306740233e-05, + "loss": 2.6222, + "step": 42692 + }, + { + "epoch": 1.987685359778383, + "grad_norm": 0.36186106925087824, + "learning_rate": 3.0852690787970996e-05, + "loss": 2.6838, + "step": 42693 + }, + { + "epoch": 1.9877319179644761, + "grad_norm": 0.3543054422022693, + "learning_rate": 3.0850188564735106e-05, + "loss": 2.5926, + "step": 42694 + }, + { + "epoch": 1.9877784761505692, + "grad_norm": 0.33022770261606404, + "learning_rate": 3.084768639770205e-05, + "loss": 2.5231, + "step": 42695 + }, + { + "epoch": 1.9878250343366624, + "grad_norm": 0.3328170135405376, + "learning_rate": 3.084518428687914e-05, + "loss": 2.7169, + "step": 42696 + }, + { + "epoch": 1.9878715925227555, + "grad_norm": 0.3710163401070196, + "learning_rate": 3.084268223227371e-05, + "loss": 2.6964, + "step": 42697 + }, + { + "epoch": 1.9879181507088484, + "grad_norm": 0.31123354673575226, + "learning_rate": 3.084018023389315e-05, + "loss": 2.7519, + "step": 42698 + }, + { + "epoch": 1.9879647088949415, + "grad_norm": 0.3304887098383084, + "learning_rate": 3.0837678291744735e-05, + "loss": 2.6472, + "step": 42699 + }, + { + "epoch": 1.9880112670810344, + "grad_norm": 0.3407408547931625, + "learning_rate": 3.0835176405835874e-05, + "loss": 2.6533, + "step": 42700 + }, + { + "epoch": 1.9880578252671275, + "grad_norm": 0.3395401499708389, + "learning_rate": 3.0832674576173884e-05, + "loss": 2.6756, + "step": 42701 + }, + { + 
"epoch": 1.9881043834532206, + "grad_norm": 0.34315221213756464, + "learning_rate": 3.083017280276609e-05, + "loss": 2.6261, + "step": 42702 + }, + { + "epoch": 1.9881509416393137, + "grad_norm": 0.32941103549700324, + "learning_rate": 3.082767108561987e-05, + "loss": 2.7069, + "step": 42703 + }, + { + "epoch": 1.9881974998254068, + "grad_norm": 0.3637813969817, + "learning_rate": 3.082516942474253e-05, + "loss": 2.7024, + "step": 42704 + }, + { + "epoch": 1.9882440580115, + "grad_norm": 0.3261211475887895, + "learning_rate": 3.0822667820141435e-05, + "loss": 2.7767, + "step": 42705 + }, + { + "epoch": 1.988290616197593, + "grad_norm": 0.3235621052196935, + "learning_rate": 3.082016627182394e-05, + "loss": 2.7324, + "step": 42706 + }, + { + "epoch": 1.988337174383686, + "grad_norm": 0.3311977567187554, + "learning_rate": 3.081766477979733e-05, + "loss": 2.6954, + "step": 42707 + }, + { + "epoch": 1.988383732569779, + "grad_norm": 0.32542168152637685, + "learning_rate": 3.0815163344069006e-05, + "loss": 2.6968, + "step": 42708 + }, + { + "epoch": 1.9884302907558722, + "grad_norm": 0.3288811841714055, + "learning_rate": 3.0812661964646275e-05, + "loss": 2.7089, + "step": 42709 + }, + { + "epoch": 1.988476848941965, + "grad_norm": 0.36207403605770555, + "learning_rate": 3.08101606415365e-05, + "loss": 2.7205, + "step": 42710 + }, + { + "epoch": 1.9885234071280582, + "grad_norm": 0.3422382844315764, + "learning_rate": 3.080765937474702e-05, + "loss": 2.7207, + "step": 42711 + }, + { + "epoch": 1.9885699653141513, + "grad_norm": 0.33258265499454803, + "learning_rate": 3.080515816428515e-05, + "loss": 2.7223, + "step": 42712 + }, + { + "epoch": 1.9886165235002444, + "grad_norm": 0.3459031330058626, + "learning_rate": 3.080265701015827e-05, + "loss": 2.6762, + "step": 42713 + }, + { + "epoch": 1.9886630816863375, + "grad_norm": 0.32444874340506197, + "learning_rate": 3.080015591237369e-05, + "loss": 2.6796, + "step": 42714 + }, + { + "epoch": 1.9887096398724307, + "grad_norm": 0.3452055307509726, + "learning_rate": 3.079765487093875e-05, + "loss": 2.7599, + "step": 42715 + }, + { + "epoch": 1.9887561980585238, + "grad_norm": 0.33640040195795273, + "learning_rate": 3.079515388586083e-05, + "loss": 2.6495, + "step": 42716 + }, + { + "epoch": 1.9888027562446167, + "grad_norm": 0.32679468325055483, + "learning_rate": 3.079265295714722e-05, + "loss": 2.5872, + "step": 42717 + }, + { + "epoch": 1.9888493144307098, + "grad_norm": 0.3522887846304535, + "learning_rate": 3.079015208480529e-05, + "loss": 2.6004, + "step": 42718 + }, + { + "epoch": 1.988895872616803, + "grad_norm": 0.32185625235514487, + "learning_rate": 3.078765126884238e-05, + "loss": 2.7456, + "step": 42719 + }, + { + "epoch": 1.9889424308028958, + "grad_norm": 0.3405772060962624, + "learning_rate": 3.0785150509265794e-05, + "loss": 2.7212, + "step": 42720 + }, + { + "epoch": 1.988988988988989, + "grad_norm": 0.3490053703021574, + "learning_rate": 3.0782649806082916e-05, + "loss": 2.6612, + "step": 42721 + }, + { + "epoch": 1.989035547175082, + "grad_norm": 0.33681923538209396, + "learning_rate": 3.078014915930105e-05, + "loss": 2.6853, + "step": 42722 + }, + { + "epoch": 1.9890821053611751, + "grad_norm": 0.360047522468295, + "learning_rate": 3.0777648568927575e-05, + "loss": 2.6399, + "step": 42723 + }, + { + "epoch": 1.9891286635472682, + "grad_norm": 0.331636053010596, + "learning_rate": 3.0775148034969803e-05, + "loss": 2.6221, + "step": 42724 + }, + { + "epoch": 1.9891752217333614, + "grad_norm": 0.3342953558286327, + 
"learning_rate": 3.0772647557435065e-05, + "loss": 2.6937, + "step": 42725 + }, + { + "epoch": 1.9892217799194545, + "grad_norm": 0.32938754166795764, + "learning_rate": 3.0770147136330726e-05, + "loss": 2.7198, + "step": 42726 + }, + { + "epoch": 1.9892683381055474, + "grad_norm": 0.3289757203486773, + "learning_rate": 3.0767646771664104e-05, + "loss": 2.7601, + "step": 42727 + }, + { + "epoch": 1.9893148962916405, + "grad_norm": 0.3648447490344701, + "learning_rate": 3.076514646344254e-05, + "loss": 2.7814, + "step": 42728 + }, + { + "epoch": 1.9893614544777334, + "grad_norm": 0.3409749273761356, + "learning_rate": 3.076264621167339e-05, + "loss": 2.6422, + "step": 42729 + }, + { + "epoch": 1.9894080126638265, + "grad_norm": 0.3324336943489276, + "learning_rate": 3.076014601636397e-05, + "loss": 2.7398, + "step": 42730 + }, + { + "epoch": 1.9894545708499196, + "grad_norm": 0.32290617434473123, + "learning_rate": 3.075764587752163e-05, + "loss": 2.7197, + "step": 42731 + }, + { + "epoch": 1.9895011290360127, + "grad_norm": 0.34322894875301263, + "learning_rate": 3.075514579515371e-05, + "loss": 2.5859, + "step": 42732 + }, + { + "epoch": 1.9895476872221058, + "grad_norm": 0.31031744112066895, + "learning_rate": 3.075264576926752e-05, + "loss": 2.6751, + "step": 42733 + }, + { + "epoch": 1.989594245408199, + "grad_norm": 0.335030254009856, + "learning_rate": 3.0750145799870436e-05, + "loss": 2.6196, + "step": 42734 + }, + { + "epoch": 1.989640803594292, + "grad_norm": 0.34536846154730916, + "learning_rate": 3.074764588696976e-05, + "loss": 2.5441, + "step": 42735 + }, + { + "epoch": 1.9896873617803852, + "grad_norm": 0.3082059531127539, + "learning_rate": 3.074514603057286e-05, + "loss": 2.6638, + "step": 42736 + }, + { + "epoch": 1.989733919966478, + "grad_norm": 0.3206917711354777, + "learning_rate": 3.0742646230687064e-05, + "loss": 2.6259, + "step": 42737 + }, + { + "epoch": 1.9897804781525712, + "grad_norm": 0.3174918215447657, + "learning_rate": 3.074014648731969e-05, + "loss": 2.6244, + "step": 42738 + }, + { + "epoch": 1.989827036338664, + "grad_norm": 0.3487767166653464, + "learning_rate": 3.07376468004781e-05, + "loss": 2.5619, + "step": 42739 + }, + { + "epoch": 1.9898735945247572, + "grad_norm": 0.3299711583313531, + "learning_rate": 3.073514717016961e-05, + "loss": 2.66, + "step": 42740 + }, + { + "epoch": 1.9899201527108503, + "grad_norm": 0.34678572895930376, + "learning_rate": 3.073264759640156e-05, + "loss": 2.7442, + "step": 42741 + }, + { + "epoch": 1.9899667108969434, + "grad_norm": 0.4031439438685146, + "learning_rate": 3.0730148079181315e-05, + "loss": 2.7083, + "step": 42742 + }, + { + "epoch": 1.9900132690830366, + "grad_norm": 0.3130494509844395, + "learning_rate": 3.072764861851617e-05, + "loss": 2.6133, + "step": 42743 + }, + { + "epoch": 1.9900598272691297, + "grad_norm": 0.3588901582530038, + "learning_rate": 3.0725149214413484e-05, + "loss": 2.7416, + "step": 42744 + }, + { + "epoch": 1.9901063854552228, + "grad_norm": 0.36344950762208644, + "learning_rate": 3.072264986688057e-05, + "loss": 2.7247, + "step": 42745 + }, + { + "epoch": 1.990152943641316, + "grad_norm": 0.35250910227469584, + "learning_rate": 3.072015057592481e-05, + "loss": 2.6208, + "step": 42746 + }, + { + "epoch": 1.9901995018274088, + "grad_norm": 0.3569410307420234, + "learning_rate": 3.071765134155349e-05, + "loss": 2.5907, + "step": 42747 + }, + { + "epoch": 1.990246060013502, + "grad_norm": 0.35831294260275726, + "learning_rate": 3.071515216377395e-05, + "loss": 2.7447, + "step": 42748 + 
}, + { + "epoch": 1.9902926181995948, + "grad_norm": 0.3537361919415573, + "learning_rate": 3.0712653042593566e-05, + "loss": 2.7601, + "step": 42749 + }, + { + "epoch": 1.990339176385688, + "grad_norm": 0.34015805946748323, + "learning_rate": 3.071015397801964e-05, + "loss": 2.7472, + "step": 42750 + }, + { + "epoch": 1.990385734571781, + "grad_norm": 0.36279349332734895, + "learning_rate": 3.0707654970059494e-05, + "loss": 2.6381, + "step": 42751 + }, + { + "epoch": 1.9904322927578741, + "grad_norm": 0.3571011584005337, + "learning_rate": 3.070515601872049e-05, + "loss": 2.6774, + "step": 42752 + }, + { + "epoch": 1.9904788509439673, + "grad_norm": 0.3541161595242159, + "learning_rate": 3.0702657124009956e-05, + "loss": 2.758, + "step": 42753 + }, + { + "epoch": 1.9905254091300604, + "grad_norm": 0.3683053511033832, + "learning_rate": 3.0700158285935216e-05, + "loss": 2.7255, + "step": 42754 + }, + { + "epoch": 1.9905719673161535, + "grad_norm": 0.3575884888271493, + "learning_rate": 3.0697659504503615e-05, + "loss": 2.713, + "step": 42755 + }, + { + "epoch": 1.9906185255022464, + "grad_norm": 0.3316857170379012, + "learning_rate": 3.069516077972248e-05, + "loss": 2.6497, + "step": 42756 + }, + { + "epoch": 1.9906650836883395, + "grad_norm": 0.36151275540792094, + "learning_rate": 3.069266211159916e-05, + "loss": 2.6724, + "step": 42757 + }, + { + "epoch": 1.9907116418744326, + "grad_norm": 0.35997813553666985, + "learning_rate": 3.069016350014094e-05, + "loss": 2.6561, + "step": 42758 + }, + { + "epoch": 1.9907582000605255, + "grad_norm": 0.3311404381234186, + "learning_rate": 3.068766494535522e-05, + "loss": 2.6745, + "step": 42759 + }, + { + "epoch": 1.9908047582466186, + "grad_norm": 0.3377452766523189, + "learning_rate": 3.068516644724929e-05, + "loss": 2.6933, + "step": 42760 + }, + { + "epoch": 1.9908513164327117, + "grad_norm": 0.3570926581936647, + "learning_rate": 3.068266800583048e-05, + "loss": 2.7416, + "step": 42761 + }, + { + "epoch": 1.9908978746188049, + "grad_norm": 0.36256809661149064, + "learning_rate": 3.068016962110616e-05, + "loss": 2.7249, + "step": 42762 + }, + { + "epoch": 1.990944432804898, + "grad_norm": 0.36604119647102046, + "learning_rate": 3.067767129308362e-05, + "loss": 2.6759, + "step": 42763 + }, + { + "epoch": 1.990990990990991, + "grad_norm": 0.3360040037161077, + "learning_rate": 3.067517302177022e-05, + "loss": 2.5921, + "step": 42764 + }, + { + "epoch": 1.9910375491770842, + "grad_norm": 0.3542876214379931, + "learning_rate": 3.067267480717329e-05, + "loss": 2.7605, + "step": 42765 + }, + { + "epoch": 1.991084107363177, + "grad_norm": 0.36184668223110944, + "learning_rate": 3.067017664930015e-05, + "loss": 2.638, + "step": 42766 + }, + { + "epoch": 1.9911306655492702, + "grad_norm": 0.3420796340174919, + "learning_rate": 3.066767854815813e-05, + "loss": 2.6382, + "step": 42767 + }, + { + "epoch": 1.9911772237353633, + "grad_norm": 0.346976492722603, + "learning_rate": 3.066518050375458e-05, + "loss": 2.6846, + "step": 42768 + }, + { + "epoch": 1.9912237819214562, + "grad_norm": 0.35709552085270174, + "learning_rate": 3.0662682516096807e-05, + "loss": 2.6584, + "step": 42769 + }, + { + "epoch": 1.9912703401075493, + "grad_norm": 0.3714852772902967, + "learning_rate": 3.066018458519218e-05, + "loss": 2.6822, + "step": 42770 + }, + { + "epoch": 1.9913168982936424, + "grad_norm": 0.34852527243433384, + "learning_rate": 3.065768671104797e-05, + "loss": 2.6292, + "step": 42771 + }, + { + "epoch": 1.9913634564797356, + "grad_norm": 0.383141296925118, + 
"learning_rate": 3.065518889367157e-05, + "loss": 2.7237, + "step": 42772 + }, + { + "epoch": 1.9914100146658287, + "grad_norm": 0.32131800880410033, + "learning_rate": 3.0652691133070286e-05, + "loss": 2.6933, + "step": 42773 + }, + { + "epoch": 1.9914565728519218, + "grad_norm": 0.3589841278514395, + "learning_rate": 3.065019342925143e-05, + "loss": 2.7604, + "step": 42774 + }, + { + "epoch": 1.991503131038015, + "grad_norm": 0.3452381700361232, + "learning_rate": 3.0647695782222365e-05, + "loss": 2.6183, + "step": 42775 + }, + { + "epoch": 1.9915496892241078, + "grad_norm": 0.3015278418627331, + "learning_rate": 3.064519819199039e-05, + "loss": 2.6568, + "step": 42776 + }, + { + "epoch": 1.991596247410201, + "grad_norm": 0.3325366659042627, + "learning_rate": 3.064270065856287e-05, + "loss": 2.671, + "step": 42777 + }, + { + "epoch": 1.9916428055962938, + "grad_norm": 0.3504623305686139, + "learning_rate": 3.0640203181947116e-05, + "loss": 2.6627, + "step": 42778 + }, + { + "epoch": 1.991689363782387, + "grad_norm": 0.3427543077793007, + "learning_rate": 3.063770576215045e-05, + "loss": 2.668, + "step": 42779 + }, + { + "epoch": 1.99173592196848, + "grad_norm": 0.34704771810304824, + "learning_rate": 3.0635208399180204e-05, + "loss": 2.6423, + "step": 42780 + }, + { + "epoch": 1.9917824801545732, + "grad_norm": 0.34214336053337385, + "learning_rate": 3.063271109304372e-05, + "loss": 2.6558, + "step": 42781 + }, + { + "epoch": 1.9918290383406663, + "grad_norm": 0.35420974594207005, + "learning_rate": 3.063021384374833e-05, + "loss": 2.7968, + "step": 42782 + }, + { + "epoch": 1.9918755965267594, + "grad_norm": 0.35004585487654843, + "learning_rate": 3.0627716651301355e-05, + "loss": 2.6725, + "step": 42783 + }, + { + "epoch": 1.9919221547128525, + "grad_norm": 0.3321990325060073, + "learning_rate": 3.06252195157101e-05, + "loss": 2.7266, + "step": 42784 + }, + { + "epoch": 1.9919687128989456, + "grad_norm": 0.33084504373476736, + "learning_rate": 3.062272243698194e-05, + "loss": 2.5759, + "step": 42785 + }, + { + "epoch": 1.9920152710850385, + "grad_norm": 0.3223486045099737, + "learning_rate": 3.062022541512418e-05, + "loss": 2.5937, + "step": 42786 + }, + { + "epoch": 1.9920618292711316, + "grad_norm": 0.34736584171026874, + "learning_rate": 3.061772845014412e-05, + "loss": 2.7197, + "step": 42787 + }, + { + "epoch": 1.9921083874572245, + "grad_norm": 0.35357331003283093, + "learning_rate": 3.061523154204915e-05, + "loss": 2.6658, + "step": 42788 + }, + { + "epoch": 1.9921549456433176, + "grad_norm": 0.3310453264502542, + "learning_rate": 3.061273469084654e-05, + "loss": 2.6244, + "step": 42789 + }, + { + "epoch": 1.9922015038294107, + "grad_norm": 0.3307469761761244, + "learning_rate": 3.061023789654367e-05, + "loss": 2.6846, + "step": 42790 + }, + { + "epoch": 1.9922480620155039, + "grad_norm": 0.34371246080076034, + "learning_rate": 3.0607741159147835e-05, + "loss": 2.638, + "step": 42791 + }, + { + "epoch": 1.992294620201597, + "grad_norm": 0.3324676308778734, + "learning_rate": 3.060524447866636e-05, + "loss": 2.6907, + "step": 42792 + }, + { + "epoch": 1.99234117838769, + "grad_norm": 0.36172548384530234, + "learning_rate": 3.060274785510659e-05, + "loss": 2.7289, + "step": 42793 + }, + { + "epoch": 1.9923877365737832, + "grad_norm": 0.34111301294319957, + "learning_rate": 3.060025128847583e-05, + "loss": 2.8137, + "step": 42794 + }, + { + "epoch": 1.992434294759876, + "grad_norm": 0.3532116484115485, + "learning_rate": 3.059775477878144e-05, + "loss": 2.7529, + "step": 42795 + 
}, + { + "epoch": 1.9924808529459692, + "grad_norm": 0.3685741701765118, + "learning_rate": 3.059525832603073e-05, + "loss": 2.6534, + "step": 42796 + }, + { + "epoch": 1.9925274111320623, + "grad_norm": 0.35278968070280425, + "learning_rate": 3.059276193023099e-05, + "loss": 2.6675, + "step": 42797 + }, + { + "epoch": 1.9925739693181552, + "grad_norm": 0.34134637501735854, + "learning_rate": 3.0590265591389624e-05, + "loss": 2.7055, + "step": 42798 + }, + { + "epoch": 1.9926205275042483, + "grad_norm": 0.3652663770505718, + "learning_rate": 3.058776930951388e-05, + "loss": 2.7014, + "step": 42799 + }, + { + "epoch": 1.9926670856903415, + "grad_norm": 0.36608014726794197, + "learning_rate": 3.058527308461115e-05, + "loss": 2.796, + "step": 42800 + }, + { + "epoch": 1.9927136438764346, + "grad_norm": 0.37606313685617326, + "learning_rate": 3.0582776916688724e-05, + "loss": 2.7069, + "step": 42801 + }, + { + "epoch": 1.9927602020625277, + "grad_norm": 0.36356923944339214, + "learning_rate": 3.0580280805753916e-05, + "loss": 2.5933, + "step": 42802 + }, + { + "epoch": 1.9928067602486208, + "grad_norm": 0.37457497128524964, + "learning_rate": 3.05777847518141e-05, + "loss": 2.6487, + "step": 42803 + }, + { + "epoch": 1.992853318434714, + "grad_norm": 0.3137283115045979, + "learning_rate": 3.057528875487656e-05, + "loss": 2.7481, + "step": 42804 + }, + { + "epoch": 1.9928998766208068, + "grad_norm": 0.35707926720939376, + "learning_rate": 3.057279281494863e-05, + "loss": 2.5989, + "step": 42805 + }, + { + "epoch": 1.9929464348069, + "grad_norm": 0.3536424691436832, + "learning_rate": 3.057029693203764e-05, + "loss": 2.7255, + "step": 42806 + }, + { + "epoch": 1.992992992992993, + "grad_norm": 0.3448471686463394, + "learning_rate": 3.056780110615091e-05, + "loss": 2.6634, + "step": 42807 + }, + { + "epoch": 1.993039551179086, + "grad_norm": 0.3284733145059232, + "learning_rate": 3.056530533729578e-05, + "loss": 2.5465, + "step": 42808 + }, + { + "epoch": 1.993086109365179, + "grad_norm": 0.332316894263618, + "learning_rate": 3.056280962547957e-05, + "loss": 2.6446, + "step": 42809 + }, + { + "epoch": 1.9931326675512722, + "grad_norm": 0.3420568182041981, + "learning_rate": 3.056031397070958e-05, + "loss": 2.6946, + "step": 42810 + }, + { + "epoch": 1.9931792257373653, + "grad_norm": 0.34422444739821006, + "learning_rate": 3.0557818372993164e-05, + "loss": 2.6834, + "step": 42811 + }, + { + "epoch": 1.9932257839234584, + "grad_norm": 0.3385319811468565, + "learning_rate": 3.055532283233762e-05, + "loss": 2.6505, + "step": 42812 + }, + { + "epoch": 1.9932723421095515, + "grad_norm": 0.34401247635204735, + "learning_rate": 3.055282734875031e-05, + "loss": 2.6903, + "step": 42813 + }, + { + "epoch": 1.9933189002956446, + "grad_norm": 0.3461786831971517, + "learning_rate": 3.055033192223853e-05, + "loss": 2.695, + "step": 42814 + }, + { + "epoch": 1.9933654584817375, + "grad_norm": 0.34187655719909743, + "learning_rate": 3.054783655280959e-05, + "loss": 2.6115, + "step": 42815 + }, + { + "epoch": 1.9934120166678306, + "grad_norm": 0.3210144770914962, + "learning_rate": 3.054534124047086e-05, + "loss": 2.5696, + "step": 42816 + }, + { + "epoch": 1.9934585748539235, + "grad_norm": 0.3557170652593612, + "learning_rate": 3.054284598522962e-05, + "loss": 2.6421, + "step": 42817 + }, + { + "epoch": 1.9935051330400166, + "grad_norm": 0.3646686626597345, + "learning_rate": 3.054035078709323e-05, + "loss": 2.688, + "step": 42818 + }, + { + "epoch": 1.9935516912261098, + "grad_norm": 0.33623781631614585, + 
"learning_rate": 3.0537855646068974e-05, + "loss": 2.7352, + "step": 42819 + }, + { + "epoch": 1.9935982494122029, + "grad_norm": 0.35320015209329614, + "learning_rate": 3.0535360562164196e-05, + "loss": 2.7631, + "step": 42820 + }, + { + "epoch": 1.993644807598296, + "grad_norm": 0.3594045934814643, + "learning_rate": 3.0532865535386234e-05, + "loss": 2.5585, + "step": 42821 + }, + { + "epoch": 1.993691365784389, + "grad_norm": 0.34145311248641835, + "learning_rate": 3.0530370565742385e-05, + "loss": 2.6644, + "step": 42822 + }, + { + "epoch": 1.9937379239704822, + "grad_norm": 0.3403045000749703, + "learning_rate": 3.052787565323997e-05, + "loss": 2.67, + "step": 42823 + }, + { + "epoch": 1.9937844821565753, + "grad_norm": 0.3710460709854069, + "learning_rate": 3.052538079788633e-05, + "loss": 2.6487, + "step": 42824 + }, + { + "epoch": 1.9938310403426682, + "grad_norm": 0.3505849138813499, + "learning_rate": 3.052288599968877e-05, + "loss": 2.7528, + "step": 42825 + }, + { + "epoch": 1.9938775985287613, + "grad_norm": 0.3390932507271082, + "learning_rate": 3.052039125865463e-05, + "loss": 2.7128, + "step": 42826 + }, + { + "epoch": 1.9939241567148542, + "grad_norm": 0.37095281841460354, + "learning_rate": 3.051789657479123e-05, + "loss": 2.6501, + "step": 42827 + }, + { + "epoch": 1.9939707149009473, + "grad_norm": 0.3554059810578989, + "learning_rate": 3.051540194810586e-05, + "loss": 2.5612, + "step": 42828 + }, + { + "epoch": 1.9940172730870405, + "grad_norm": 0.32044084689966146, + "learning_rate": 3.051290737860589e-05, + "loss": 2.7102, + "step": 42829 + }, + { + "epoch": 1.9940638312731336, + "grad_norm": 0.32967770862112805, + "learning_rate": 3.0510412866298598e-05, + "loss": 2.6632, + "step": 42830 + }, + { + "epoch": 1.9941103894592267, + "grad_norm": 0.3716218139947691, + "learning_rate": 3.0507918411191344e-05, + "loss": 2.6422, + "step": 42831 + }, + { + "epoch": 1.9941569476453198, + "grad_norm": 0.3338317271191114, + "learning_rate": 3.0505424013291417e-05, + "loss": 2.6928, + "step": 42832 + }, + { + "epoch": 1.994203505831413, + "grad_norm": 0.3236885244131829, + "learning_rate": 3.050292967260615e-05, + "loss": 2.6518, + "step": 42833 + }, + { + "epoch": 1.994250064017506, + "grad_norm": 0.32030365046899284, + "learning_rate": 3.0500435389142866e-05, + "loss": 2.6687, + "step": 42834 + }, + { + "epoch": 1.994296622203599, + "grad_norm": 0.3076024271708831, + "learning_rate": 3.049794116290887e-05, + "loss": 2.6357, + "step": 42835 + }, + { + "epoch": 1.994343180389692, + "grad_norm": 0.315619206423197, + "learning_rate": 3.049544699391152e-05, + "loss": 2.5343, + "step": 42836 + }, + { + "epoch": 1.994389738575785, + "grad_norm": 0.29960057676999535, + "learning_rate": 3.049295288215811e-05, + "loss": 2.7226, + "step": 42837 + }, + { + "epoch": 1.994436296761878, + "grad_norm": 0.31627352139386333, + "learning_rate": 3.049045882765593e-05, + "loss": 2.725, + "step": 42838 + }, + { + "epoch": 1.9944828549479712, + "grad_norm": 0.3232169667423277, + "learning_rate": 3.0487964830412363e-05, + "loss": 2.6771, + "step": 42839 + }, + { + "epoch": 1.9945294131340643, + "grad_norm": 0.3327414459340318, + "learning_rate": 3.0485470890434698e-05, + "loss": 2.7409, + "step": 42840 + }, + { + "epoch": 1.9945759713201574, + "grad_norm": 0.3243515497819515, + "learning_rate": 3.048297700773023e-05, + "loss": 2.6916, + "step": 42841 + }, + { + "epoch": 1.9946225295062505, + "grad_norm": 0.32835508567532157, + "learning_rate": 3.048048318230632e-05, + "loss": 2.6924, + "step": 42842 
+ }, + { + "epoch": 1.9946690876923436, + "grad_norm": 0.3401308956410061, + "learning_rate": 3.0477989414170256e-05, + "loss": 2.7194, + "step": 42843 + }, + { + "epoch": 1.9947156458784365, + "grad_norm": 0.3617460319496915, + "learning_rate": 3.0475495703329382e-05, + "loss": 2.6998, + "step": 42844 + }, + { + "epoch": 1.9947622040645296, + "grad_norm": 0.34599990454897583, + "learning_rate": 3.0473002049791e-05, + "loss": 2.7716, + "step": 42845 + }, + { + "epoch": 1.9948087622506228, + "grad_norm": 0.3249492454683364, + "learning_rate": 3.047050845356243e-05, + "loss": 2.6256, + "step": 42846 + }, + { + "epoch": 1.9948553204367157, + "grad_norm": 0.3512433834812663, + "learning_rate": 3.0468014914651e-05, + "loss": 2.6918, + "step": 42847 + }, + { + "epoch": 1.9949018786228088, + "grad_norm": 0.3326552553674631, + "learning_rate": 3.0465521433064004e-05, + "loss": 2.8018, + "step": 42848 + }, + { + "epoch": 1.9949484368089019, + "grad_norm": 0.3511217675799185, + "learning_rate": 3.0463028008808803e-05, + "loss": 2.7143, + "step": 42849 + }, + { + "epoch": 1.994994994994995, + "grad_norm": 0.3528126259325959, + "learning_rate": 3.046053464189268e-05, + "loss": 2.6537, + "step": 42850 + }, + { + "epoch": 1.9950415531810881, + "grad_norm": 0.34159947277110886, + "learning_rate": 3.045804133232295e-05, + "loss": 2.6259, + "step": 42851 + }, + { + "epoch": 1.9950881113671812, + "grad_norm": 0.3490454495438222, + "learning_rate": 3.0455548080106967e-05, + "loss": 2.6351, + "step": 42852 + }, + { + "epoch": 1.9951346695532743, + "grad_norm": 0.3679595408960202, + "learning_rate": 3.0453054885251998e-05, + "loss": 2.6045, + "step": 42853 + }, + { + "epoch": 1.9951812277393672, + "grad_norm": 0.33181617491771814, + "learning_rate": 3.0450561747765412e-05, + "loss": 2.7103, + "step": 42854 + }, + { + "epoch": 1.9952277859254604, + "grad_norm": 0.3988323126211642, + "learning_rate": 3.0448068667654496e-05, + "loss": 2.5809, + "step": 42855 + }, + { + "epoch": 1.9952743441115535, + "grad_norm": 0.35240254436143803, + "learning_rate": 3.0445575644926566e-05, + "loss": 2.7404, + "step": 42856 + }, + { + "epoch": 1.9953209022976464, + "grad_norm": 0.3418896431652297, + "learning_rate": 3.044308267958894e-05, + "loss": 2.6192, + "step": 42857 + }, + { + "epoch": 1.9953674604837395, + "grad_norm": 0.363143312461622, + "learning_rate": 3.0440589771648958e-05, + "loss": 2.6435, + "step": 42858 + }, + { + "epoch": 1.9954140186698326, + "grad_norm": 0.34260155877461484, + "learning_rate": 3.04380969211139e-05, + "loss": 2.6252, + "step": 42859 + }, + { + "epoch": 1.9954605768559257, + "grad_norm": 0.35016907528175156, + "learning_rate": 3.0435604127991118e-05, + "loss": 2.6572, + "step": 42860 + }, + { + "epoch": 1.9955071350420188, + "grad_norm": 0.3753273409729867, + "learning_rate": 3.043311139228789e-05, + "loss": 2.6583, + "step": 42861 + }, + { + "epoch": 1.995553693228112, + "grad_norm": 0.3169431732473195, + "learning_rate": 3.0430618714011565e-05, + "loss": 2.5896, + "step": 42862 + }, + { + "epoch": 1.995600251414205, + "grad_norm": 0.36817928495109276, + "learning_rate": 3.0428126093169452e-05, + "loss": 2.6026, + "step": 42863 + }, + { + "epoch": 1.995646809600298, + "grad_norm": 0.36761942723344115, + "learning_rate": 3.0425633529768838e-05, + "loss": 2.6683, + "step": 42864 + }, + { + "epoch": 1.995693367786391, + "grad_norm": 0.32462572241787463, + "learning_rate": 3.042314102381708e-05, + "loss": 2.6823, + "step": 42865 + }, + { + "epoch": 1.995739925972484, + "grad_norm": 
0.386678253604561, + "learning_rate": 3.0420648575321464e-05, + "loss": 2.6627, + "step": 42866 + }, + { + "epoch": 1.995786484158577, + "grad_norm": 0.3445418765434977, + "learning_rate": 3.0418156184289327e-05, + "loss": 2.7527, + "step": 42867 + }, + { + "epoch": 1.9958330423446702, + "grad_norm": 0.34373230173124975, + "learning_rate": 3.0415663850727972e-05, + "loss": 2.7962, + "step": 42868 + }, + { + "epoch": 1.9958796005307633, + "grad_norm": 0.33086576538407036, + "learning_rate": 3.0413171574644705e-05, + "loss": 2.6038, + "step": 42869 + }, + { + "epoch": 1.9959261587168564, + "grad_norm": 0.3524721018872247, + "learning_rate": 3.0410679356046856e-05, + "loss": 2.6401, + "step": 42870 + }, + { + "epoch": 1.9959727169029495, + "grad_norm": 0.31861626456105, + "learning_rate": 3.0408187194941728e-05, + "loss": 2.6142, + "step": 42871 + }, + { + "epoch": 1.9960192750890426, + "grad_norm": 0.3518711570632137, + "learning_rate": 3.040569509133665e-05, + "loss": 2.6719, + "step": 42872 + }, + { + "epoch": 1.9960658332751358, + "grad_norm": 0.34169919887929606, + "learning_rate": 3.0403203045238927e-05, + "loss": 2.7173, + "step": 42873 + }, + { + "epoch": 1.9961123914612287, + "grad_norm": 0.33037269133108427, + "learning_rate": 3.040071105665585e-05, + "loss": 2.7123, + "step": 42874 + }, + { + "epoch": 1.9961589496473218, + "grad_norm": 0.3882999020339171, + "learning_rate": 3.0398219125594778e-05, + "loss": 2.6869, + "step": 42875 + }, + { + "epoch": 1.9962055078334147, + "grad_norm": 0.329368160952875, + "learning_rate": 3.0395727252063e-05, + "loss": 2.6642, + "step": 42876 + }, + { + "epoch": 1.9962520660195078, + "grad_norm": 0.3336709742113271, + "learning_rate": 3.0393235436067813e-05, + "loss": 2.7122, + "step": 42877 + }, + { + "epoch": 1.996298624205601, + "grad_norm": 0.3485992461825981, + "learning_rate": 3.0390743677616574e-05, + "loss": 2.6369, + "step": 42878 + }, + { + "epoch": 1.996345182391694, + "grad_norm": 0.33315037264129027, + "learning_rate": 3.0388251976716542e-05, + "loss": 2.5991, + "step": 42879 + }, + { + "epoch": 1.9963917405777871, + "grad_norm": 0.3187211280992143, + "learning_rate": 3.038576033337508e-05, + "loss": 2.6471, + "step": 42880 + }, + { + "epoch": 1.9964382987638802, + "grad_norm": 0.346122554290043, + "learning_rate": 3.0383268747599482e-05, + "loss": 2.6735, + "step": 42881 + }, + { + "epoch": 1.9964848569499734, + "grad_norm": 0.3335196137278477, + "learning_rate": 3.0380777219397044e-05, + "loss": 2.5939, + "step": 42882 + }, + { + "epoch": 1.9965314151360662, + "grad_norm": 0.3408805280343202, + "learning_rate": 3.037828574877509e-05, + "loss": 2.6697, + "step": 42883 + }, + { + "epoch": 1.9965779733221594, + "grad_norm": 0.34541363247023027, + "learning_rate": 3.037579433574094e-05, + "loss": 2.7273, + "step": 42884 + }, + { + "epoch": 1.9966245315082525, + "grad_norm": 0.3474703584992705, + "learning_rate": 3.037330298030191e-05, + "loss": 2.6725, + "step": 42885 + }, + { + "epoch": 1.9966710896943454, + "grad_norm": 0.3197722485444988, + "learning_rate": 3.0370811682465305e-05, + "loss": 2.5691, + "step": 42886 + }, + { + "epoch": 1.9967176478804385, + "grad_norm": 0.346245309636972, + "learning_rate": 3.0368320442238406e-05, + "loss": 2.5431, + "step": 42887 + }, + { + "epoch": 1.9967642060665316, + "grad_norm": 0.34098439123039254, + "learning_rate": 3.0365829259628574e-05, + "loss": 2.6547, + "step": 42888 + }, + { + "epoch": 1.9968107642526247, + "grad_norm": 0.3221499227105648, + "learning_rate": 3.0363338134643082e-05, + 
"loss": 2.6661, + "step": 42889 + }, + { + "epoch": 1.9968573224387178, + "grad_norm": 0.34969898060427523, + "learning_rate": 3.0360847067289278e-05, + "loss": 2.6582, + "step": 42890 + }, + { + "epoch": 1.996903880624811, + "grad_norm": 0.3469929949541951, + "learning_rate": 3.0358356057574456e-05, + "loss": 2.6146, + "step": 42891 + }, + { + "epoch": 1.996950438810904, + "grad_norm": 0.31134219644702643, + "learning_rate": 3.0355865105505898e-05, + "loss": 2.7593, + "step": 42892 + }, + { + "epoch": 1.996996996996997, + "grad_norm": 0.347463937684468, + "learning_rate": 3.0353374211090956e-05, + "loss": 2.6595, + "step": 42893 + }, + { + "epoch": 1.99704355518309, + "grad_norm": 0.3520655483862675, + "learning_rate": 3.0350883374336936e-05, + "loss": 2.7193, + "step": 42894 + }, + { + "epoch": 1.9970901133691832, + "grad_norm": 0.30900916280293905, + "learning_rate": 3.0348392595251123e-05, + "loss": 2.5764, + "step": 42895 + }, + { + "epoch": 1.997136671555276, + "grad_norm": 0.33710896890595726, + "learning_rate": 3.0345901873840842e-05, + "loss": 2.6926, + "step": 42896 + }, + { + "epoch": 1.9971832297413692, + "grad_norm": 0.3599725272916173, + "learning_rate": 3.0343411210113402e-05, + "loss": 2.6595, + "step": 42897 + }, + { + "epoch": 1.9972297879274623, + "grad_norm": 0.3571140252772106, + "learning_rate": 3.034092060407613e-05, + "loss": 2.7149, + "step": 42898 + }, + { + "epoch": 1.9972763461135554, + "grad_norm": 0.3446277442912386, + "learning_rate": 3.0338430055736312e-05, + "loss": 2.6233, + "step": 42899 + }, + { + "epoch": 1.9973229042996485, + "grad_norm": 0.32782280019183574, + "learning_rate": 3.0335939565101245e-05, + "loss": 2.7225, + "step": 42900 + }, + { + "epoch": 1.9973694624857417, + "grad_norm": 0.3337484656618482, + "learning_rate": 3.0333449132178283e-05, + "loss": 2.5687, + "step": 42901 + }, + { + "epoch": 1.9974160206718348, + "grad_norm": 0.3614656082965115, + "learning_rate": 3.033095875697468e-05, + "loss": 2.7928, + "step": 42902 + }, + { + "epoch": 1.9974625788579277, + "grad_norm": 0.3565614485293135, + "learning_rate": 3.0328468439497802e-05, + "loss": 2.6713, + "step": 42903 + }, + { + "epoch": 1.9975091370440208, + "grad_norm": 0.358846744392069, + "learning_rate": 3.0325978179754934e-05, + "loss": 2.6942, + "step": 42904 + }, + { + "epoch": 1.9975556952301137, + "grad_norm": 0.3501882838146334, + "learning_rate": 3.0323487977753363e-05, + "loss": 2.7098, + "step": 42905 + }, + { + "epoch": 1.9976022534162068, + "grad_norm": 0.3769101124136999, + "learning_rate": 3.0320997833500432e-05, + "loss": 2.5847, + "step": 42906 + }, + { + "epoch": 1.9976488116023, + "grad_norm": 0.3244938021149965, + "learning_rate": 3.0318507747003422e-05, + "loss": 2.6241, + "step": 42907 + }, + { + "epoch": 1.997695369788393, + "grad_norm": 0.3188743869817609, + "learning_rate": 3.0316017718269664e-05, + "loss": 2.7136, + "step": 42908 + }, + { + "epoch": 1.9977419279744861, + "grad_norm": 0.35853251449300044, + "learning_rate": 3.0313527747306447e-05, + "loss": 2.669, + "step": 42909 + }, + { + "epoch": 1.9977884861605792, + "grad_norm": 0.3181281080016674, + "learning_rate": 3.031103783412108e-05, + "loss": 2.6506, + "step": 42910 + }, + { + "epoch": 1.9978350443466724, + "grad_norm": 0.33515950558156105, + "learning_rate": 3.0308547978720897e-05, + "loss": 2.7021, + "step": 42911 + }, + { + "epoch": 1.9978816025327655, + "grad_norm": 0.3354940952509696, + "learning_rate": 3.0306058181113184e-05, + "loss": 2.6031, + "step": 42912 + }, + { + "epoch": 
1.9979281607188584, + "grad_norm": 0.34444304089893796, + "learning_rate": 3.0303568441305224e-05, + "loss": 2.6348, + "step": 42913 + }, + { + "epoch": 1.9979747189049515, + "grad_norm": 0.3473400180603433, + "learning_rate": 3.030107875930438e-05, + "loss": 2.8215, + "step": 42914 + }, + { + "epoch": 1.9980212770910444, + "grad_norm": 0.35596974320743985, + "learning_rate": 3.02985891351179e-05, + "loss": 2.6666, + "step": 42915 + }, + { + "epoch": 1.9980678352771375, + "grad_norm": 0.3360266732152526, + "learning_rate": 3.0296099568753144e-05, + "loss": 2.6109, + "step": 42916 + }, + { + "epoch": 1.9981143934632306, + "grad_norm": 0.34846383218990185, + "learning_rate": 3.0293610060217393e-05, + "loss": 2.7633, + "step": 42917 + }, + { + "epoch": 1.9981609516493237, + "grad_norm": 0.3484722242562139, + "learning_rate": 3.0291120609517933e-05, + "loss": 2.6665, + "step": 42918 + }, + { + "epoch": 1.9982075098354168, + "grad_norm": 0.37992309673760244, + "learning_rate": 3.0288631216662115e-05, + "loss": 2.6533, + "step": 42919 + }, + { + "epoch": 1.99825406802151, + "grad_norm": 0.33829648128350437, + "learning_rate": 3.0286141881657215e-05, + "loss": 2.7439, + "step": 42920 + }, + { + "epoch": 1.998300626207603, + "grad_norm": 0.3416534787799015, + "learning_rate": 3.028365260451055e-05, + "loss": 2.7986, + "step": 42921 + }, + { + "epoch": 1.9983471843936962, + "grad_norm": 0.37178386531337493, + "learning_rate": 3.0281163385229415e-05, + "loss": 2.7265, + "step": 42922 + }, + { + "epoch": 1.998393742579789, + "grad_norm": 0.3360863794588117, + "learning_rate": 3.0278674223821125e-05, + "loss": 2.7312, + "step": 42923 + }, + { + "epoch": 1.9984403007658822, + "grad_norm": 0.3398854826762975, + "learning_rate": 3.0276185120292992e-05, + "loss": 2.618, + "step": 42924 + }, + { + "epoch": 1.998486858951975, + "grad_norm": 0.36376452167145296, + "learning_rate": 3.027369607465229e-05, + "loss": 2.7128, + "step": 42925 + }, + { + "epoch": 1.9985334171380682, + "grad_norm": 0.32084574689477896, + "learning_rate": 3.027120708690637e-05, + "loss": 2.7384, + "step": 42926 + }, + { + "epoch": 1.9985799753241613, + "grad_norm": 0.3881915129853439, + "learning_rate": 3.0268718157062514e-05, + "loss": 2.647, + "step": 42927 + }, + { + "epoch": 1.9986265335102544, + "grad_norm": 0.36187927646607704, + "learning_rate": 3.026622928512801e-05, + "loss": 2.706, + "step": 42928 + }, + { + "epoch": 1.9986730916963475, + "grad_norm": 0.3370792407189423, + "learning_rate": 3.026374047111019e-05, + "loss": 2.6006, + "step": 42929 + }, + { + "epoch": 1.9987196498824407, + "grad_norm": 0.3504819362154101, + "learning_rate": 3.0261251715016337e-05, + "loss": 2.6579, + "step": 42930 + }, + { + "epoch": 1.9987662080685338, + "grad_norm": 0.3309849183738705, + "learning_rate": 3.0258763016853775e-05, + "loss": 2.7009, + "step": 42931 + }, + { + "epoch": 1.9988127662546267, + "grad_norm": 0.3420295171766601, + "learning_rate": 3.0256274376629808e-05, + "loss": 2.6297, + "step": 42932 + }, + { + "epoch": 1.9988593244407198, + "grad_norm": 0.3427389433120445, + "learning_rate": 3.025378579435172e-05, + "loss": 2.7011, + "step": 42933 + }, + { + "epoch": 1.998905882626813, + "grad_norm": 0.3195411204826225, + "learning_rate": 3.025129727002683e-05, + "loss": 2.6661, + "step": 42934 + }, + { + "epoch": 1.9989524408129058, + "grad_norm": 0.34872698223694193, + "learning_rate": 3.0248808803662427e-05, + "loss": 2.7115, + "step": 42935 + }, + { + "epoch": 1.998998998998999, + "grad_norm": 0.3266152240648211, + 
"learning_rate": 3.0246320395265824e-05, + "loss": 2.6106, + "step": 42936 + }, + { + "epoch": 1.999045557185092, + "grad_norm": 0.3310463416112038, + "learning_rate": 3.0243832044844337e-05, + "loss": 2.6863, + "step": 42937 + }, + { + "epoch": 1.9990921153711851, + "grad_norm": 0.35932591223239385, + "learning_rate": 3.0241343752405237e-05, + "loss": 2.5126, + "step": 42938 + }, + { + "epoch": 1.9991386735572783, + "grad_norm": 0.31090291142660953, + "learning_rate": 3.0238855517955866e-05, + "loss": 2.7275, + "step": 42939 + }, + { + "epoch": 1.9991852317433714, + "grad_norm": 0.3466843275999913, + "learning_rate": 3.0236367341503512e-05, + "loss": 2.6815, + "step": 42940 + }, + { + "epoch": 1.9992317899294645, + "grad_norm": 0.3149998801512059, + "learning_rate": 3.0233879223055445e-05, + "loss": 2.6368, + "step": 42941 + }, + { + "epoch": 1.9992783481155574, + "grad_norm": 0.35929016783309037, + "learning_rate": 3.0231391162619015e-05, + "loss": 2.6646, + "step": 42942 + }, + { + "epoch": 1.9993249063016505, + "grad_norm": 0.3248823305554163, + "learning_rate": 3.0228903160201492e-05, + "loss": 2.6875, + "step": 42943 + }, + { + "epoch": 1.9993714644877436, + "grad_norm": 0.3110165601197879, + "learning_rate": 3.0226415215810205e-05, + "loss": 2.7089, + "step": 42944 + }, + { + "epoch": 1.9994180226738365, + "grad_norm": 0.34446639462143813, + "learning_rate": 3.022392732945244e-05, + "loss": 2.5948, + "step": 42945 + }, + { + "epoch": 1.9994645808599296, + "grad_norm": 0.3215648727123139, + "learning_rate": 3.0221439501135495e-05, + "loss": 2.6163, + "step": 42946 + }, + { + "epoch": 1.9995111390460227, + "grad_norm": 0.3205927346780991, + "learning_rate": 3.0218951730866684e-05, + "loss": 2.691, + "step": 42947 + }, + { + "epoch": 1.9995576972321158, + "grad_norm": 0.33491283438974156, + "learning_rate": 3.0216464018653278e-05, + "loss": 2.7018, + "step": 42948 + }, + { + "epoch": 1.999604255418209, + "grad_norm": 0.33441987039679844, + "learning_rate": 3.0213976364502628e-05, + "loss": 2.654, + "step": 42949 + }, + { + "epoch": 1.999650813604302, + "grad_norm": 0.3166871312007112, + "learning_rate": 3.0211488768422002e-05, + "loss": 2.6267, + "step": 42950 + }, + { + "epoch": 1.9996973717903952, + "grad_norm": 0.3195059034433378, + "learning_rate": 3.020900123041869e-05, + "loss": 2.6679, + "step": 42951 + }, + { + "epoch": 1.999743929976488, + "grad_norm": 0.3294152718861176, + "learning_rate": 3.0206513750500033e-05, + "loss": 2.5661, + "step": 42952 + }, + { + "epoch": 1.9997904881625812, + "grad_norm": 0.3202827912283697, + "learning_rate": 3.0204026328673302e-05, + "loss": 2.6935, + "step": 42953 + }, + { + "epoch": 1.999837046348674, + "grad_norm": 0.3474210069879793, + "learning_rate": 3.0201538964945784e-05, + "loss": 2.7029, + "step": 42954 + }, + { + "epoch": 1.9998836045347672, + "grad_norm": 0.3142835017287072, + "learning_rate": 3.0199051659324818e-05, + "loss": 2.6675, + "step": 42955 + }, + { + "epoch": 1.9999301627208603, + "grad_norm": 0.3491937965859051, + "learning_rate": 3.0196564411817668e-05, + "loss": 2.593, + "step": 42956 + }, + { + "epoch": 1.9999767209069534, + "grad_norm": 0.3414091213679481, + "learning_rate": 3.0194077222431672e-05, + "loss": 2.7096, + "step": 42957 + }, + { + "epoch": 2.0, + "grad_norm": 0.5073350755328192, + "learning_rate": 3.0191590091174105e-05, + "loss": 2.7886, + "step": 42958 + }, + { + "epoch": 2.000046558186093, + "grad_norm": 0.3387908639704511, + "learning_rate": 3.0189103018052254e-05, + "loss": 2.6019, + "step": 42959 + 
}, + { + "epoch": 2.0000931163721862, + "grad_norm": 0.3674628948080134, + "learning_rate": 3.0186616003073453e-05, + "loss": 2.6896, + "step": 42960 + }, + { + "epoch": 2.0001396745582793, + "grad_norm": 0.36151706937131156, + "learning_rate": 3.0184129046244957e-05, + "loss": 2.7565, + "step": 42961 + }, + { + "epoch": 2.0001862327443725, + "grad_norm": 0.3468036496076416, + "learning_rate": 3.018164214757412e-05, + "loss": 2.7146, + "step": 42962 + }, + { + "epoch": 2.000232790930465, + "grad_norm": 0.37251819530085756, + "learning_rate": 3.0179155307068198e-05, + "loss": 2.6543, + "step": 42963 + }, + { + "epoch": 2.0002793491165582, + "grad_norm": 0.3452363819676303, + "learning_rate": 3.0176668524734486e-05, + "loss": 2.6867, + "step": 42964 + }, + { + "epoch": 2.0003259073026514, + "grad_norm": 0.3388338041449524, + "learning_rate": 3.0174181800580327e-05, + "loss": 2.5308, + "step": 42965 + }, + { + "epoch": 2.0003724654887445, + "grad_norm": 0.38210961539946464, + "learning_rate": 3.0171695134612964e-05, + "loss": 2.6178, + "step": 42966 + }, + { + "epoch": 2.0004190236748376, + "grad_norm": 0.35312135803367206, + "learning_rate": 3.0169208526839742e-05, + "loss": 2.6169, + "step": 42967 + }, + { + "epoch": 2.0004655818609307, + "grad_norm": 0.34282827711982383, + "learning_rate": 3.0166721977267944e-05, + "loss": 2.6125, + "step": 42968 + }, + { + "epoch": 2.000512140047024, + "grad_norm": 0.35837868306931436, + "learning_rate": 3.0164235485904835e-05, + "loss": 2.6119, + "step": 42969 + }, + { + "epoch": 2.000558698233117, + "grad_norm": 0.36382979520495595, + "learning_rate": 3.0161749052757766e-05, + "loss": 2.665, + "step": 42970 + }, + { + "epoch": 2.00060525641921, + "grad_norm": 0.36847609704518103, + "learning_rate": 3.015926267783401e-05, + "loss": 2.7436, + "step": 42971 + }, + { + "epoch": 2.000651814605303, + "grad_norm": 0.353223082754618, + "learning_rate": 3.015677636114085e-05, + "loss": 2.742, + "step": 42972 + }, + { + "epoch": 2.000698372791396, + "grad_norm": 0.39465484119028665, + "learning_rate": 3.015429010268561e-05, + "loss": 2.7823, + "step": 42973 + }, + { + "epoch": 2.000744930977489, + "grad_norm": 0.3525539042599903, + "learning_rate": 3.015180390247555e-05, + "loss": 2.6568, + "step": 42974 + }, + { + "epoch": 2.000791489163582, + "grad_norm": 0.3730670318362406, + "learning_rate": 3.0149317760518014e-05, + "loss": 2.7261, + "step": 42975 + }, + { + "epoch": 2.000838047349675, + "grad_norm": 0.33544892116027814, + "learning_rate": 3.014683167682027e-05, + "loss": 2.6346, + "step": 42976 + }, + { + "epoch": 2.0008846055357683, + "grad_norm": 0.3858043602659636, + "learning_rate": 3.0144345651389606e-05, + "loss": 2.6934, + "step": 42977 + }, + { + "epoch": 2.0009311637218614, + "grad_norm": 0.35094585740419365, + "learning_rate": 3.014185968423334e-05, + "loss": 2.722, + "step": 42978 + }, + { + "epoch": 2.0009777219079545, + "grad_norm": 0.3716979694009816, + "learning_rate": 3.0139373775358748e-05, + "loss": 2.7505, + "step": 42979 + }, + { + "epoch": 2.0010242800940476, + "grad_norm": 0.3619051463991748, + "learning_rate": 3.013688792477316e-05, + "loss": 2.7208, + "step": 42980 + }, + { + "epoch": 2.0010708382801408, + "grad_norm": 0.34700840460607135, + "learning_rate": 3.013440213248384e-05, + "loss": 2.5859, + "step": 42981 + }, + { + "epoch": 2.001117396466234, + "grad_norm": 0.3675333353138996, + "learning_rate": 3.0131916398498084e-05, + "loss": 2.7053, + "step": 42982 + }, + { + "epoch": 2.0011639546523265, + "grad_norm": 
0.34321228733942105, + "learning_rate": 3.0129430722823204e-05, + "loss": 2.6818, + "step": 42983 + }, + { + "epoch": 2.0012105128384197, + "grad_norm": 0.35958342678950767, + "learning_rate": 3.0126945105466486e-05, + "loss": 2.6287, + "step": 42984 + }, + { + "epoch": 2.001257071024513, + "grad_norm": 0.32681650951790303, + "learning_rate": 3.0124459546435236e-05, + "loss": 2.5858, + "step": 42985 + }, + { + "epoch": 2.001303629210606, + "grad_norm": 0.33121099612714927, + "learning_rate": 3.0121974045736733e-05, + "loss": 2.638, + "step": 42986 + }, + { + "epoch": 2.001350187396699, + "grad_norm": 0.35391994345611616, + "learning_rate": 3.0119488603378264e-05, + "loss": 2.6509, + "step": 42987 + }, + { + "epoch": 2.001396745582792, + "grad_norm": 0.36305987983246807, + "learning_rate": 3.0117003219367152e-05, + "loss": 2.7129, + "step": 42988 + }, + { + "epoch": 2.0014433037688852, + "grad_norm": 0.3394059442297399, + "learning_rate": 3.011451789371068e-05, + "loss": 2.6401, + "step": 42989 + }, + { + "epoch": 2.0014898619549784, + "grad_norm": 0.3623342903899079, + "learning_rate": 3.011203262641612e-05, + "loss": 2.7246, + "step": 42990 + }, + { + "epoch": 2.0015364201410715, + "grad_norm": 0.35132564757807444, + "learning_rate": 3.0109547417490802e-05, + "loss": 2.6804, + "step": 42991 + }, + { + "epoch": 2.0015829783271646, + "grad_norm": 0.32461278602050897, + "learning_rate": 3.0107062266941987e-05, + "loss": 2.655, + "step": 42992 + }, + { + "epoch": 2.0016295365132573, + "grad_norm": 0.3415475479636781, + "learning_rate": 3.0104577174777e-05, + "loss": 2.6804, + "step": 42993 + }, + { + "epoch": 2.0016760946993504, + "grad_norm": 0.3322922874420102, + "learning_rate": 3.0102092141003117e-05, + "loss": 2.6927, + "step": 42994 + }, + { + "epoch": 2.0017226528854435, + "grad_norm": 0.3498989878248602, + "learning_rate": 3.0099607165627618e-05, + "loss": 2.6166, + "step": 42995 + }, + { + "epoch": 2.0017692110715366, + "grad_norm": 0.31673791893994285, + "learning_rate": 3.009712224865783e-05, + "loss": 2.6131, + "step": 42996 + }, + { + "epoch": 2.0018157692576297, + "grad_norm": 0.34295216967481085, + "learning_rate": 3.009463739010102e-05, + "loss": 2.6808, + "step": 42997 + }, + { + "epoch": 2.001862327443723, + "grad_norm": 0.33926515904543936, + "learning_rate": 3.0092152589964494e-05, + "loss": 2.5553, + "step": 42998 + }, + { + "epoch": 2.001908885629816, + "grad_norm": 0.3408491778071219, + "learning_rate": 3.0089667848255542e-05, + "loss": 2.6713, + "step": 42999 + }, + { + "epoch": 2.001955443815909, + "grad_norm": 0.35369337458199773, + "learning_rate": 3.0087183164981438e-05, + "loss": 2.6124, + "step": 43000 + }, + { + "epoch": 2.002002002002002, + "grad_norm": 0.34832204602841876, + "learning_rate": 3.0084698540149503e-05, + "loss": 2.5834, + "step": 43001 + }, + { + "epoch": 2.002048560188095, + "grad_norm": 0.40071087650740117, + "learning_rate": 3.0082213973766993e-05, + "loss": 2.6172, + "step": 43002 + }, + { + "epoch": 2.002095118374188, + "grad_norm": 0.3588360040486331, + "learning_rate": 3.0079729465841245e-05, + "loss": 2.6793, + "step": 43003 + }, + { + "epoch": 2.002141676560281, + "grad_norm": 0.3581989379333875, + "learning_rate": 3.007724501637953e-05, + "loss": 2.701, + "step": 43004 + }, + { + "epoch": 2.002188234746374, + "grad_norm": 0.3558315664673213, + "learning_rate": 3.007476062538912e-05, + "loss": 2.695, + "step": 43005 + }, + { + "epoch": 2.0022347929324673, + "grad_norm": 0.3891710662663228, + "learning_rate": 3.0072276292877343e-05, + 
"loss": 2.6842, + "step": 43006 + }, + { + "epoch": 2.0022813511185604, + "grad_norm": 0.3455049925065672, + "learning_rate": 3.0069792018851473e-05, + "loss": 2.6339, + "step": 43007 + }, + { + "epoch": 2.0023279093046535, + "grad_norm": 0.3203619695698823, + "learning_rate": 3.0067307803318778e-05, + "loss": 2.6391, + "step": 43008 + }, + { + "epoch": 2.0023744674907467, + "grad_norm": 0.31712475018066, + "learning_rate": 3.006482364628659e-05, + "loss": 2.62, + "step": 43009 + }, + { + "epoch": 2.0024210256768398, + "grad_norm": 0.3288318620400979, + "learning_rate": 3.0062339547762175e-05, + "loss": 2.6513, + "step": 43010 + }, + { + "epoch": 2.002467583862933, + "grad_norm": 0.3443395682098466, + "learning_rate": 3.0059855507752833e-05, + "loss": 2.5906, + "step": 43011 + }, + { + "epoch": 2.0025141420490256, + "grad_norm": 0.33897475369781216, + "learning_rate": 3.0057371526265855e-05, + "loss": 2.6206, + "step": 43012 + }, + { + "epoch": 2.0025607002351187, + "grad_norm": 0.33349183735575716, + "learning_rate": 3.0054887603308507e-05, + "loss": 2.6877, + "step": 43013 + }, + { + "epoch": 2.002607258421212, + "grad_norm": 0.3587955769555941, + "learning_rate": 3.005240373888812e-05, + "loss": 2.635, + "step": 43014 + }, + { + "epoch": 2.002653816607305, + "grad_norm": 0.3425706269005357, + "learning_rate": 3.004991993301194e-05, + "loss": 2.6792, + "step": 43015 + }, + { + "epoch": 2.002700374793398, + "grad_norm": 0.3453109719158582, + "learning_rate": 3.00474361856873e-05, + "loss": 2.6121, + "step": 43016 + }, + { + "epoch": 2.002746932979491, + "grad_norm": 0.3556523194776631, + "learning_rate": 3.004495249692147e-05, + "loss": 2.7652, + "step": 43017 + }, + { + "epoch": 2.0027934911655842, + "grad_norm": 0.343133126349879, + "learning_rate": 3.0042468866721718e-05, + "loss": 2.5564, + "step": 43018 + }, + { + "epoch": 2.0028400493516774, + "grad_norm": 0.37876205720132705, + "learning_rate": 3.0039985295095375e-05, + "loss": 2.6303, + "step": 43019 + }, + { + "epoch": 2.0028866075377705, + "grad_norm": 0.34633264955069887, + "learning_rate": 3.0037501782049692e-05, + "loss": 2.7096, + "step": 43020 + }, + { + "epoch": 2.0029331657238636, + "grad_norm": 0.3285944399777105, + "learning_rate": 3.0035018327591983e-05, + "loss": 2.7378, + "step": 43021 + }, + { + "epoch": 2.0029797239099563, + "grad_norm": 0.355580950003468, + "learning_rate": 3.0032534931729537e-05, + "loss": 2.6631, + "step": 43022 + }, + { + "epoch": 2.0030262820960494, + "grad_norm": 0.33518460581204423, + "learning_rate": 3.003005159446962e-05, + "loss": 2.6763, + "step": 43023 + }, + { + "epoch": 2.0030728402821425, + "grad_norm": 0.34235538778776564, + "learning_rate": 3.0027568315819547e-05, + "loss": 2.6808, + "step": 43024 + }, + { + "epoch": 2.0031193984682356, + "grad_norm": 0.36445420827458486, + "learning_rate": 3.00250850957866e-05, + "loss": 2.6485, + "step": 43025 + }, + { + "epoch": 2.0031659566543287, + "grad_norm": 0.357391685539876, + "learning_rate": 3.002260193437803e-05, + "loss": 2.7599, + "step": 43026 + }, + { + "epoch": 2.003212514840422, + "grad_norm": 0.34155127606279245, + "learning_rate": 3.002011883160118e-05, + "loss": 2.6202, + "step": 43027 + }, + { + "epoch": 2.003259073026515, + "grad_norm": 0.33206392444621086, + "learning_rate": 3.001763578746329e-05, + "loss": 2.5252, + "step": 43028 + }, + { + "epoch": 2.003305631212608, + "grad_norm": 0.3624675961127311, + "learning_rate": 3.001515280197169e-05, + "loss": 2.6498, + "step": 43029 + }, + { + "epoch": 2.003352189398701, + 
"grad_norm": 0.3049403257292926, + "learning_rate": 3.0012669875133648e-05, + "loss": 2.483, + "step": 43030 + }, + { + "epoch": 2.0033987475847943, + "grad_norm": 0.3405991230436579, + "learning_rate": 3.0010187006956435e-05, + "loss": 2.673, + "step": 43031 + }, + { + "epoch": 2.003445305770887, + "grad_norm": 0.3638963778889513, + "learning_rate": 3.0007704197447373e-05, + "loss": 2.594, + "step": 43032 + }, + { + "epoch": 2.00349186395698, + "grad_norm": 0.3375198628671356, + "learning_rate": 3.000522144661372e-05, + "loss": 2.7459, + "step": 43033 + }, + { + "epoch": 2.003538422143073, + "grad_norm": 0.3304348006616843, + "learning_rate": 3.0002738754462767e-05, + "loss": 2.6284, + "step": 43034 + }, + { + "epoch": 2.0035849803291663, + "grad_norm": 0.3369132981580178, + "learning_rate": 3.000025612100182e-05, + "loss": 2.5243, + "step": 43035 + }, + { + "epoch": 2.0036315385152594, + "grad_norm": 0.34082882091709943, + "learning_rate": 2.999777354623814e-05, + "loss": 2.6392, + "step": 43036 + }, + { + "epoch": 2.0036780967013526, + "grad_norm": 0.3627278917545781, + "learning_rate": 2.9995291030179028e-05, + "loss": 2.6742, + "step": 43037 + }, + { + "epoch": 2.0037246548874457, + "grad_norm": 0.3226326837231767, + "learning_rate": 2.9992808572831754e-05, + "loss": 2.7143, + "step": 43038 + }, + { + "epoch": 2.003771213073539, + "grad_norm": 0.34313507103973184, + "learning_rate": 2.9990326174203636e-05, + "loss": 2.7041, + "step": 43039 + }, + { + "epoch": 2.003817771259632, + "grad_norm": 0.36169786548383703, + "learning_rate": 2.998784383430194e-05, + "loss": 2.5811, + "step": 43040 + }, + { + "epoch": 2.003864329445725, + "grad_norm": 0.37265092350411616, + "learning_rate": 2.9985361553133927e-05, + "loss": 2.6368, + "step": 43041 + }, + { + "epoch": 2.0039108876318177, + "grad_norm": 0.3689435324483946, + "learning_rate": 2.9982879330706924e-05, + "loss": 2.592, + "step": 43042 + }, + { + "epoch": 2.003957445817911, + "grad_norm": 0.37774150002275203, + "learning_rate": 2.9980397167028206e-05, + "loss": 2.7405, + "step": 43043 + }, + { + "epoch": 2.004004004004004, + "grad_norm": 0.3463217150494522, + "learning_rate": 2.9977915062105022e-05, + "loss": 2.6051, + "step": 43044 + }, + { + "epoch": 2.004050562190097, + "grad_norm": 0.35843380567509836, + "learning_rate": 2.997543301594471e-05, + "loss": 2.6577, + "step": 43045 + }, + { + "epoch": 2.00409712037619, + "grad_norm": 0.32438891638337336, + "learning_rate": 2.997295102855452e-05, + "loss": 2.5939, + "step": 43046 + }, + { + "epoch": 2.0041436785622833, + "grad_norm": 0.3391923581568109, + "learning_rate": 2.9970469099941738e-05, + "loss": 2.7078, + "step": 43047 + }, + { + "epoch": 2.0041902367483764, + "grad_norm": 0.36304212577499695, + "learning_rate": 2.996798723011367e-05, + "loss": 2.6559, + "step": 43048 + }, + { + "epoch": 2.0042367949344695, + "grad_norm": 0.34562831067987826, + "learning_rate": 2.9965505419077577e-05, + "loss": 2.6091, + "step": 43049 + }, + { + "epoch": 2.0042833531205626, + "grad_norm": 0.35221377765185435, + "learning_rate": 2.9963023666840763e-05, + "loss": 2.5679, + "step": 43050 + }, + { + "epoch": 2.0043299113066553, + "grad_norm": 0.3729511070520338, + "learning_rate": 2.9960541973410484e-05, + "loss": 2.6771, + "step": 43051 + }, + { + "epoch": 2.0043764694927484, + "grad_norm": 0.37597731482881946, + "learning_rate": 2.9958060338794054e-05, + "loss": 2.6437, + "step": 43052 + }, + { + "epoch": 2.0044230276788415, + "grad_norm": 0.326622153560947, + "learning_rate": 
2.9955578762998742e-05, + "loss": 2.636, + "step": 43053 + }, + { + "epoch": 2.0044695858649346, + "grad_norm": 0.3332475710779765, + "learning_rate": 2.9953097246031814e-05, + "loss": 2.6421, + "step": 43054 + }, + { + "epoch": 2.0045161440510277, + "grad_norm": 0.37083056842307915, + "learning_rate": 2.9950615787900593e-05, + "loss": 2.651, + "step": 43055 + }, + { + "epoch": 2.004562702237121, + "grad_norm": 0.3387595085823687, + "learning_rate": 2.9948134388612314e-05, + "loss": 2.6902, + "step": 43056 + }, + { + "epoch": 2.004609260423214, + "grad_norm": 0.3420141227236544, + "learning_rate": 2.9945653048174306e-05, + "loss": 2.6453, + "step": 43057 + }, + { + "epoch": 2.004655818609307, + "grad_norm": 0.3334093062281976, + "learning_rate": 2.9943171766593835e-05, + "loss": 2.5763, + "step": 43058 + }, + { + "epoch": 2.0047023767954, + "grad_norm": 0.37598732830715936, + "learning_rate": 2.994069054387817e-05, + "loss": 2.701, + "step": 43059 + }, + { + "epoch": 2.0047489349814933, + "grad_norm": 0.32904708308265296, + "learning_rate": 2.99382093800346e-05, + "loss": 2.6817, + "step": 43060 + }, + { + "epoch": 2.004795493167586, + "grad_norm": 0.3425683011303747, + "learning_rate": 2.993572827507041e-05, + "loss": 2.6102, + "step": 43061 + }, + { + "epoch": 2.004842051353679, + "grad_norm": 0.35029284270077016, + "learning_rate": 2.993324722899288e-05, + "loss": 2.6103, + "step": 43062 + }, + { + "epoch": 2.004888609539772, + "grad_norm": 0.32378129804942085, + "learning_rate": 2.9930766241809304e-05, + "loss": 2.5944, + "step": 43063 + }, + { + "epoch": 2.0049351677258653, + "grad_norm": 0.34079725086208223, + "learning_rate": 2.9928285313526926e-05, + "loss": 2.6522, + "step": 43064 + }, + { + "epoch": 2.0049817259119584, + "grad_norm": 0.33996232301214363, + "learning_rate": 2.9925804444153078e-05, + "loss": 2.6854, + "step": 43065 + }, + { + "epoch": 2.0050282840980516, + "grad_norm": 0.3336986689036364, + "learning_rate": 2.992332363369501e-05, + "loss": 2.7031, + "step": 43066 + }, + { + "epoch": 2.0050748422841447, + "grad_norm": 0.3355268567414085, + "learning_rate": 2.9920842882159995e-05, + "loss": 2.578, + "step": 43067 + }, + { + "epoch": 2.005121400470238, + "grad_norm": 0.33603492036200755, + "learning_rate": 2.991836218955535e-05, + "loss": 2.651, + "step": 43068 + }, + { + "epoch": 2.005167958656331, + "grad_norm": 0.33854528496062697, + "learning_rate": 2.991588155588831e-05, + "loss": 2.6854, + "step": 43069 + }, + { + "epoch": 2.005214516842424, + "grad_norm": 0.3417181700645419, + "learning_rate": 2.9913400981166195e-05, + "loss": 2.6117, + "step": 43070 + }, + { + "epoch": 2.0052610750285167, + "grad_norm": 0.3364506323470149, + "learning_rate": 2.9910920465396276e-05, + "loss": 2.6069, + "step": 43071 + }, + { + "epoch": 2.00530763321461, + "grad_norm": 0.3302142152125164, + "learning_rate": 2.9908440008585813e-05, + "loss": 2.6355, + "step": 43072 + }, + { + "epoch": 2.005354191400703, + "grad_norm": 0.349628396932987, + "learning_rate": 2.99059596107421e-05, + "loss": 2.733, + "step": 43073 + }, + { + "epoch": 2.005400749586796, + "grad_norm": 0.3295042953534296, + "learning_rate": 2.9903479271872414e-05, + "loss": 2.7249, + "step": 43074 + }, + { + "epoch": 2.005447307772889, + "grad_norm": 0.3393443489422822, + "learning_rate": 2.9900998991984054e-05, + "loss": 2.7032, + "step": 43075 + }, + { + "epoch": 2.0054938659589823, + "grad_norm": 0.34109178625701286, + "learning_rate": 2.989851877108427e-05, + "loss": 2.6871, + "step": 43076 + }, + { + "epoch": 
2.0055404241450754, + "grad_norm": 0.3718137189870711, + "learning_rate": 2.9896038609180343e-05, + "loss": 2.6811, + "step": 43077 + }, + { + "epoch": 2.0055869823311685, + "grad_norm": 0.33129608214806716, + "learning_rate": 2.9893558506279585e-05, + "loss": 2.6944, + "step": 43078 + }, + { + "epoch": 2.0056335405172616, + "grad_norm": 0.33952957038943804, + "learning_rate": 2.9891078462389244e-05, + "loss": 2.7494, + "step": 43079 + }, + { + "epoch": 2.0056800987033547, + "grad_norm": 0.3707798043791914, + "learning_rate": 2.9888598477516584e-05, + "loss": 2.707, + "step": 43080 + }, + { + "epoch": 2.0057266568894474, + "grad_norm": 0.3254932777886971, + "learning_rate": 2.9886118551668927e-05, + "loss": 2.5737, + "step": 43081 + }, + { + "epoch": 2.0057732150755405, + "grad_norm": 0.3575487899295451, + "learning_rate": 2.988363868485352e-05, + "loss": 2.6445, + "step": 43082 + }, + { + "epoch": 2.0058197732616336, + "grad_norm": 0.36983514735103545, + "learning_rate": 2.9881158877077663e-05, + "loss": 2.655, + "step": 43083 + }, + { + "epoch": 2.0058663314477267, + "grad_norm": 0.3206053505730269, + "learning_rate": 2.987867912834863e-05, + "loss": 2.6235, + "step": 43084 + }, + { + "epoch": 2.00591288963382, + "grad_norm": 0.36823145104126853, + "learning_rate": 2.9876199438673675e-05, + "loss": 2.687, + "step": 43085 + }, + { + "epoch": 2.005959447819913, + "grad_norm": 0.3242646440300871, + "learning_rate": 2.9873719808060098e-05, + "loss": 2.6425, + "step": 43086 + }, + { + "epoch": 2.006006006006006, + "grad_norm": 0.3558340530745034, + "learning_rate": 2.9871240236515162e-05, + "loss": 2.7582, + "step": 43087 + }, + { + "epoch": 2.006052564192099, + "grad_norm": 0.33904258954644073, + "learning_rate": 2.9868760724046164e-05, + "loss": 2.6141, + "step": 43088 + }, + { + "epoch": 2.0060991223781923, + "grad_norm": 0.3546543840628484, + "learning_rate": 2.986628127066038e-05, + "loss": 2.6794, + "step": 43089 + }, + { + "epoch": 2.006145680564285, + "grad_norm": 0.32321965093654137, + "learning_rate": 2.986380187636505e-05, + "loss": 2.6263, + "step": 43090 + }, + { + "epoch": 2.006192238750378, + "grad_norm": 0.33345584377177495, + "learning_rate": 2.986132254116749e-05, + "loss": 2.7338, + "step": 43091 + }, + { + "epoch": 2.0062387969364712, + "grad_norm": 0.33191973282227477, + "learning_rate": 2.9858843265074953e-05, + "loss": 2.6161, + "step": 43092 + }, + { + "epoch": 2.0062853551225643, + "grad_norm": 0.34312663550411177, + "learning_rate": 2.9856364048094754e-05, + "loss": 2.569, + "step": 43093 + }, + { + "epoch": 2.0063319133086575, + "grad_norm": 0.3331141613815124, + "learning_rate": 2.985388489023413e-05, + "loss": 2.6708, + "step": 43094 + }, + { + "epoch": 2.0063784714947506, + "grad_norm": 0.3270961579541365, + "learning_rate": 2.9851405791500353e-05, + "loss": 2.5848, + "step": 43095 + }, + { + "epoch": 2.0064250296808437, + "grad_norm": 0.36156189403372974, + "learning_rate": 2.9848926751900735e-05, + "loss": 2.6629, + "step": 43096 + }, + { + "epoch": 2.006471587866937, + "grad_norm": 0.32963702484133334, + "learning_rate": 2.9846447771442532e-05, + "loss": 2.6726, + "step": 43097 + }, + { + "epoch": 2.00651814605303, + "grad_norm": 0.3538143788801354, + "learning_rate": 2.984396885013301e-05, + "loss": 2.6965, + "step": 43098 + }, + { + "epoch": 2.006564704239123, + "grad_norm": 0.3579582140773325, + "learning_rate": 2.9841489987979455e-05, + "loss": 2.7405, + "step": 43099 + }, + { + "epoch": 2.0066112624252157, + "grad_norm": 0.3566514145821772, + 
"learning_rate": 2.983901118498914e-05, + "loss": 2.5553, + "step": 43100 + }, + { + "epoch": 2.006657820611309, + "grad_norm": 0.3749330526923154, + "learning_rate": 2.983653244116935e-05, + "loss": 2.7346, + "step": 43101 + }, + { + "epoch": 2.006704378797402, + "grad_norm": 0.3363414189579944, + "learning_rate": 2.9834053756527346e-05, + "loss": 2.6247, + "step": 43102 + }, + { + "epoch": 2.006750936983495, + "grad_norm": 0.4059508355592079, + "learning_rate": 2.98315751310704e-05, + "loss": 2.6324, + "step": 43103 + }, + { + "epoch": 2.006797495169588, + "grad_norm": 0.3427664133440079, + "learning_rate": 2.9829096564805802e-05, + "loss": 2.6633, + "step": 43104 + }, + { + "epoch": 2.0068440533556813, + "grad_norm": 0.3868296802374251, + "learning_rate": 2.982661805774081e-05, + "loss": 2.6811, + "step": 43105 + }, + { + "epoch": 2.0068906115417744, + "grad_norm": 0.36607458371919593, + "learning_rate": 2.9824139609882717e-05, + "loss": 2.5924, + "step": 43106 + }, + { + "epoch": 2.0069371697278675, + "grad_norm": 0.34705588397931864, + "learning_rate": 2.9821661221238785e-05, + "loss": 2.6481, + "step": 43107 + }, + { + "epoch": 2.0069837279139606, + "grad_norm": 0.3874468719278403, + "learning_rate": 2.981918289181628e-05, + "loss": 2.6807, + "step": 43108 + }, + { + "epoch": 2.0070302861000537, + "grad_norm": 0.34821152447180514, + "learning_rate": 2.9816704621622494e-05, + "loss": 2.6767, + "step": 43109 + }, + { + "epoch": 2.0070768442861464, + "grad_norm": 0.35257609022632985, + "learning_rate": 2.9814226410664688e-05, + "loss": 2.6489, + "step": 43110 + }, + { + "epoch": 2.0071234024722395, + "grad_norm": 0.3811993384118515, + "learning_rate": 2.9811748258950145e-05, + "loss": 2.6194, + "step": 43111 + }, + { + "epoch": 2.0071699606583326, + "grad_norm": 0.34939369725238145, + "learning_rate": 2.9809270166486126e-05, + "loss": 2.6543, + "step": 43112 + }, + { + "epoch": 2.0072165188444258, + "grad_norm": 0.3560805408189537, + "learning_rate": 2.9806792133279905e-05, + "loss": 2.6043, + "step": 43113 + }, + { + "epoch": 2.007263077030519, + "grad_norm": 0.36816851985896387, + "learning_rate": 2.9804314159338776e-05, + "loss": 2.5638, + "step": 43114 + }, + { + "epoch": 2.007309635216612, + "grad_norm": 0.3619990222554446, + "learning_rate": 2.9801836244669988e-05, + "loss": 2.596, + "step": 43115 + }, + { + "epoch": 2.007356193402705, + "grad_norm": 0.3294391913049618, + "learning_rate": 2.9799358389280808e-05, + "loss": 2.614, + "step": 43116 + }, + { + "epoch": 2.007402751588798, + "grad_norm": 0.38333457352663586, + "learning_rate": 2.979688059317854e-05, + "loss": 2.6157, + "step": 43117 + }, + { + "epoch": 2.0074493097748913, + "grad_norm": 0.33259021575212955, + "learning_rate": 2.9794402856370414e-05, + "loss": 2.6243, + "step": 43118 + }, + { + "epoch": 2.0074958679609844, + "grad_norm": 0.3679976664399873, + "learning_rate": 2.9791925178863743e-05, + "loss": 2.621, + "step": 43119 + }, + { + "epoch": 2.007542426147077, + "grad_norm": 0.3495539664093536, + "learning_rate": 2.9789447560665783e-05, + "loss": 2.6955, + "step": 43120 + }, + { + "epoch": 2.0075889843331702, + "grad_norm": 0.36572407119879524, + "learning_rate": 2.9786970001783786e-05, + "loss": 2.6585, + "step": 43121 + }, + { + "epoch": 2.0076355425192633, + "grad_norm": 0.3372873242045708, + "learning_rate": 2.978449250222506e-05, + "loss": 2.6779, + "step": 43122 + }, + { + "epoch": 2.0076821007053565, + "grad_norm": 0.34836614254794723, + "learning_rate": 2.978201506199685e-05, + "loss": 2.6489, + "step": 
43123 + }, + { + "epoch": 2.0077286588914496, + "grad_norm": 0.3524798224834485, + "learning_rate": 2.977953768110644e-05, + "loss": 2.641, + "step": 43124 + }, + { + "epoch": 2.0077752170775427, + "grad_norm": 0.36065468812182544, + "learning_rate": 2.9777060359561082e-05, + "loss": 2.6064, + "step": 43125 + }, + { + "epoch": 2.007821775263636, + "grad_norm": 0.3502649493357355, + "learning_rate": 2.9774583097368058e-05, + "loss": 2.6524, + "step": 43126 + }, + { + "epoch": 2.007868333449729, + "grad_norm": 0.34374080348329167, + "learning_rate": 2.9772105894534656e-05, + "loss": 2.6813, + "step": 43127 + }, + { + "epoch": 2.007914891635822, + "grad_norm": 0.36201523711451694, + "learning_rate": 2.9769628751068113e-05, + "loss": 2.5357, + "step": 43128 + }, + { + "epoch": 2.007961449821915, + "grad_norm": 0.3688261177157461, + "learning_rate": 2.976715166697573e-05, + "loss": 2.7016, + "step": 43129 + }, + { + "epoch": 2.008008008008008, + "grad_norm": 0.34795624647577794, + "learning_rate": 2.9764674642264767e-05, + "loss": 2.6478, + "step": 43130 + }, + { + "epoch": 2.008054566194101, + "grad_norm": 0.3815446271477247, + "learning_rate": 2.9762197676942473e-05, + "loss": 2.7549, + "step": 43131 + }, + { + "epoch": 2.008101124380194, + "grad_norm": 0.3550646892475117, + "learning_rate": 2.975972077101615e-05, + "loss": 2.6095, + "step": 43132 + }, + { + "epoch": 2.008147682566287, + "grad_norm": 0.3335950796061985, + "learning_rate": 2.975724392449305e-05, + "loss": 2.6552, + "step": 43133 + }, + { + "epoch": 2.0081942407523803, + "grad_norm": 0.3494877656216272, + "learning_rate": 2.9754767137380428e-05, + "loss": 2.6286, + "step": 43134 + }, + { + "epoch": 2.0082407989384734, + "grad_norm": 0.3664714326239409, + "learning_rate": 2.975229040968559e-05, + "loss": 2.6682, + "step": 43135 + }, + { + "epoch": 2.0082873571245665, + "grad_norm": 0.3362762120093173, + "learning_rate": 2.974981374141578e-05, + "loss": 2.6687, + "step": 43136 + }, + { + "epoch": 2.0083339153106596, + "grad_norm": 0.3389573874935685, + "learning_rate": 2.9747337132578278e-05, + "loss": 2.7714, + "step": 43137 + }, + { + "epoch": 2.0083804734967527, + "grad_norm": 0.3379366735636386, + "learning_rate": 2.9744860583180335e-05, + "loss": 2.6377, + "step": 43138 + }, + { + "epoch": 2.0084270316828454, + "grad_norm": 0.360204542872906, + "learning_rate": 2.974238409322923e-05, + "loss": 2.6424, + "step": 43139 + }, + { + "epoch": 2.0084735898689385, + "grad_norm": 0.33289303199688036, + "learning_rate": 2.9739907662732248e-05, + "loss": 2.6517, + "step": 43140 + }, + { + "epoch": 2.0085201480550317, + "grad_norm": 0.3494098627385464, + "learning_rate": 2.9737431291696614e-05, + "loss": 2.7554, + "step": 43141 + }, + { + "epoch": 2.0085667062411248, + "grad_norm": 0.3518973574210517, + "learning_rate": 2.9734954980129643e-05, + "loss": 2.6492, + "step": 43142 + }, + { + "epoch": 2.008613264427218, + "grad_norm": 0.33136033375725177, + "learning_rate": 2.973247872803858e-05, + "loss": 2.649, + "step": 43143 + }, + { + "epoch": 2.008659822613311, + "grad_norm": 0.34949573820276186, + "learning_rate": 2.973000253543069e-05, + "loss": 2.6103, + "step": 43144 + }, + { + "epoch": 2.008706380799404, + "grad_norm": 0.3396417641210082, + "learning_rate": 2.9727526402313254e-05, + "loss": 2.7093, + "step": 43145 + }, + { + "epoch": 2.0087529389854972, + "grad_norm": 0.3437563402693729, + "learning_rate": 2.972505032869351e-05, + "loss": 2.4894, + "step": 43146 + }, + { + "epoch": 2.0087994971715903, + "grad_norm": 
0.34596320882261633, + "learning_rate": 2.9722574314578765e-05, + "loss": 2.6924, + "step": 43147 + }, + { + "epoch": 2.0088460553576835, + "grad_norm": 0.3553349364795294, + "learning_rate": 2.9720098359976278e-05, + "loss": 2.6214, + "step": 43148 + }, + { + "epoch": 2.008892613543776, + "grad_norm": 0.3556109428563792, + "learning_rate": 2.971762246489328e-05, + "loss": 2.6519, + "step": 43149 + }, + { + "epoch": 2.0089391717298692, + "grad_norm": 0.33186235899371336, + "learning_rate": 2.9715146629337083e-05, + "loss": 2.6731, + "step": 43150 + }, + { + "epoch": 2.0089857299159624, + "grad_norm": 0.34640069205279406, + "learning_rate": 2.9712670853314915e-05, + "loss": 2.6198, + "step": 43151 + }, + { + "epoch": 2.0090322881020555, + "grad_norm": 0.33089777201588666, + "learning_rate": 2.971019513683406e-05, + "loss": 2.6936, + "step": 43152 + }, + { + "epoch": 2.0090788462881486, + "grad_norm": 0.3565118253083993, + "learning_rate": 2.9707719479901802e-05, + "loss": 2.6926, + "step": 43153 + }, + { + "epoch": 2.0091254044742417, + "grad_norm": 0.34752246453064606, + "learning_rate": 2.9705243882525358e-05, + "loss": 2.6364, + "step": 43154 + }, + { + "epoch": 2.009171962660335, + "grad_norm": 0.36491579052115397, + "learning_rate": 2.970276834471205e-05, + "loss": 2.6198, + "step": 43155 + }, + { + "epoch": 2.009218520846428, + "grad_norm": 0.34899802063838753, + "learning_rate": 2.9700292866469113e-05, + "loss": 2.7284, + "step": 43156 + }, + { + "epoch": 2.009265079032521, + "grad_norm": 0.32898002674049104, + "learning_rate": 2.96978174478038e-05, + "loss": 2.6828, + "step": 43157 + }, + { + "epoch": 2.009311637218614, + "grad_norm": 0.3608183563357523, + "learning_rate": 2.969534208872341e-05, + "loss": 2.6899, + "step": 43158 + }, + { + "epoch": 2.009358195404707, + "grad_norm": 0.34182456966655783, + "learning_rate": 2.9692866789235175e-05, + "loss": 2.6833, + "step": 43159 + }, + { + "epoch": 2.0094047535908, + "grad_norm": 0.37280897721793316, + "learning_rate": 2.9690391549346398e-05, + "loss": 2.6594, + "step": 43160 + }, + { + "epoch": 2.009451311776893, + "grad_norm": 0.3472489161168781, + "learning_rate": 2.968791636906431e-05, + "loss": 2.6283, + "step": 43161 + }, + { + "epoch": 2.009497869962986, + "grad_norm": 0.37288322011416003, + "learning_rate": 2.9685441248396184e-05, + "loss": 2.6857, + "step": 43162 + }, + { + "epoch": 2.0095444281490793, + "grad_norm": 0.3529255140366377, + "learning_rate": 2.9682966187349292e-05, + "loss": 2.6489, + "step": 43163 + }, + { + "epoch": 2.0095909863351724, + "grad_norm": 0.34666299947572415, + "learning_rate": 2.9680491185930874e-05, + "loss": 2.7099, + "step": 43164 + }, + { + "epoch": 2.0096375445212655, + "grad_norm": 0.35693631599397185, + "learning_rate": 2.967801624414823e-05, + "loss": 2.6646, + "step": 43165 + }, + { + "epoch": 2.0096841027073586, + "grad_norm": 0.34214337451958043, + "learning_rate": 2.967554136200862e-05, + "loss": 2.619, + "step": 43166 + }, + { + "epoch": 2.0097306608934518, + "grad_norm": 0.3932287881692015, + "learning_rate": 2.9673066539519258e-05, + "loss": 2.6624, + "step": 43167 + }, + { + "epoch": 2.009777219079545, + "grad_norm": 0.36709754809834194, + "learning_rate": 2.9670591776687473e-05, + "loss": 2.704, + "step": 43168 + }, + { + "epoch": 2.0098237772656375, + "grad_norm": 0.3451496748249405, + "learning_rate": 2.9668117073520496e-05, + "loss": 2.7169, + "step": 43169 + }, + { + "epoch": 2.0098703354517307, + "grad_norm": 0.34469041902682135, + "learning_rate": 2.9665642430025564e-05, + 
"loss": 2.6379, + "step": 43170 + }, + { + "epoch": 2.0099168936378238, + "grad_norm": 0.3388426728475451, + "learning_rate": 2.9663167846209998e-05, + "loss": 2.6764, + "step": 43171 + }, + { + "epoch": 2.009963451823917, + "grad_norm": 0.35328620016180595, + "learning_rate": 2.9660693322081008e-05, + "loss": 2.6352, + "step": 43172 + }, + { + "epoch": 2.01001001001001, + "grad_norm": 0.3260702418002444, + "learning_rate": 2.9658218857645897e-05, + "loss": 2.6809, + "step": 43173 + }, + { + "epoch": 2.010056568196103, + "grad_norm": 0.3533930406059981, + "learning_rate": 2.9655744452911917e-05, + "loss": 2.6653, + "step": 43174 + }, + { + "epoch": 2.0101031263821962, + "grad_norm": 0.34693705391191504, + "learning_rate": 2.9653270107886306e-05, + "loss": 2.6698, + "step": 43175 + }, + { + "epoch": 2.0101496845682894, + "grad_norm": 0.3381935461023334, + "learning_rate": 2.965079582257635e-05, + "loss": 2.7021, + "step": 43176 + }, + { + "epoch": 2.0101962427543825, + "grad_norm": 0.35058946084969045, + "learning_rate": 2.964832159698929e-05, + "loss": 2.609, + "step": 43177 + }, + { + "epoch": 2.010242800940475, + "grad_norm": 0.35303118945663087, + "learning_rate": 2.9645847431132423e-05, + "loss": 2.698, + "step": 43178 + }, + { + "epoch": 2.0102893591265683, + "grad_norm": 0.32011570885015334, + "learning_rate": 2.964337332501298e-05, + "loss": 2.6542, + "step": 43179 + }, + { + "epoch": 2.0103359173126614, + "grad_norm": 0.3497952515952043, + "learning_rate": 2.9640899278638213e-05, + "loss": 2.5939, + "step": 43180 + }, + { + "epoch": 2.0103824754987545, + "grad_norm": 0.34594097458845047, + "learning_rate": 2.9638425292015428e-05, + "loss": 2.6997, + "step": 43181 + }, + { + "epoch": 2.0104290336848476, + "grad_norm": 0.3392409378452926, + "learning_rate": 2.9635951365151833e-05, + "loss": 2.5119, + "step": 43182 + }, + { + "epoch": 2.0104755918709407, + "grad_norm": 0.33467876831696963, + "learning_rate": 2.9633477498054733e-05, + "loss": 2.6973, + "step": 43183 + }, + { + "epoch": 2.010522150057034, + "grad_norm": 0.32857374849559173, + "learning_rate": 2.9631003690731373e-05, + "loss": 2.7037, + "step": 43184 + }, + { + "epoch": 2.010568708243127, + "grad_norm": 0.34842829247044255, + "learning_rate": 2.9628529943188993e-05, + "loss": 2.6317, + "step": 43185 + }, + { + "epoch": 2.01061526642922, + "grad_norm": 0.3230878511429486, + "learning_rate": 2.9626056255434885e-05, + "loss": 2.6633, + "step": 43186 + }, + { + "epoch": 2.010661824615313, + "grad_norm": 0.3369091003382855, + "learning_rate": 2.9623582627476298e-05, + "loss": 2.7675, + "step": 43187 + }, + { + "epoch": 2.010708382801406, + "grad_norm": 0.34681776085287497, + "learning_rate": 2.9621109059320483e-05, + "loss": 2.7181, + "step": 43188 + }, + { + "epoch": 2.010754940987499, + "grad_norm": 0.3289289209158294, + "learning_rate": 2.9618635550974695e-05, + "loss": 2.6517, + "step": 43189 + }, + { + "epoch": 2.010801499173592, + "grad_norm": 0.3254866248155094, + "learning_rate": 2.961616210244621e-05, + "loss": 2.7325, + "step": 43190 + }, + { + "epoch": 2.010848057359685, + "grad_norm": 0.3239329259722717, + "learning_rate": 2.9613688713742292e-05, + "loss": 2.569, + "step": 43191 + }, + { + "epoch": 2.0108946155457783, + "grad_norm": 0.3361806749619757, + "learning_rate": 2.961121538487019e-05, + "loss": 2.6935, + "step": 43192 + }, + { + "epoch": 2.0109411737318714, + "grad_norm": 0.34084361201609753, + "learning_rate": 2.960874211583714e-05, + "loss": 2.5537, + "step": 43193 + }, + { + "epoch": 
2.0109877319179645, + "grad_norm": 0.36259922501085934, + "learning_rate": 2.960626890665044e-05, + "loss": 2.5816, + "step": 43194 + }, + { + "epoch": 2.0110342901040577, + "grad_norm": 0.3390103638797429, + "learning_rate": 2.9603795757317314e-05, + "loss": 2.64, + "step": 43195 + }, + { + "epoch": 2.0110808482901508, + "grad_norm": 0.3951649507826728, + "learning_rate": 2.960132266784506e-05, + "loss": 2.6759, + "step": 43196 + }, + { + "epoch": 2.011127406476244, + "grad_norm": 0.35368326129788247, + "learning_rate": 2.959884963824091e-05, + "loss": 2.6023, + "step": 43197 + }, + { + "epoch": 2.0111739646623366, + "grad_norm": 0.36030870760398853, + "learning_rate": 2.9596376668512115e-05, + "loss": 2.6989, + "step": 43198 + }, + { + "epoch": 2.0112205228484297, + "grad_norm": 0.3671739778157434, + "learning_rate": 2.9593903758665953e-05, + "loss": 2.6847, + "step": 43199 + }, + { + "epoch": 2.011267081034523, + "grad_norm": 0.3587219127303307, + "learning_rate": 2.9591430908709673e-05, + "loss": 2.6972, + "step": 43200 + }, + { + "epoch": 2.011313639220616, + "grad_norm": 0.3439909751747414, + "learning_rate": 2.9588958118650535e-05, + "loss": 2.6986, + "step": 43201 + }, + { + "epoch": 2.011360197406709, + "grad_norm": 0.34490155850869003, + "learning_rate": 2.958648538849579e-05, + "loss": 2.5286, + "step": 43202 + }, + { + "epoch": 2.011406755592802, + "grad_norm": 0.3328986385929658, + "learning_rate": 2.9584012718252697e-05, + "loss": 2.7566, + "step": 43203 + }, + { + "epoch": 2.0114533137788952, + "grad_norm": 0.33295376235529467, + "learning_rate": 2.958154010792853e-05, + "loss": 2.6398, + "step": 43204 + }, + { + "epoch": 2.0114998719649884, + "grad_norm": 0.301941450109125, + "learning_rate": 2.9579067557530522e-05, + "loss": 2.5804, + "step": 43205 + }, + { + "epoch": 2.0115464301510815, + "grad_norm": 0.3379930880055132, + "learning_rate": 2.9576595067065928e-05, + "loss": 2.655, + "step": 43206 + }, + { + "epoch": 2.0115929883371746, + "grad_norm": 0.3301991817618388, + "learning_rate": 2.9574122636542023e-05, + "loss": 2.5449, + "step": 43207 + }, + { + "epoch": 2.0116395465232673, + "grad_norm": 0.31961305285180325, + "learning_rate": 2.9571650265966045e-05, + "loss": 2.5876, + "step": 43208 + }, + { + "epoch": 2.0116861047093604, + "grad_norm": 0.3062532262764702, + "learning_rate": 2.9569177955345285e-05, + "loss": 2.6765, + "step": 43209 + }, + { + "epoch": 2.0117326628954535, + "grad_norm": 0.3127388594316381, + "learning_rate": 2.956670570468697e-05, + "loss": 2.6269, + "step": 43210 + }, + { + "epoch": 2.0117792210815466, + "grad_norm": 0.3094773150402099, + "learning_rate": 2.956423351399834e-05, + "loss": 2.6774, + "step": 43211 + }, + { + "epoch": 2.0118257792676397, + "grad_norm": 0.31340876046445876, + "learning_rate": 2.956176138328669e-05, + "loss": 2.6274, + "step": 43212 + }, + { + "epoch": 2.011872337453733, + "grad_norm": 0.3331052556790332, + "learning_rate": 2.9559289312559247e-05, + "loss": 2.6036, + "step": 43213 + }, + { + "epoch": 2.011918895639826, + "grad_norm": 0.3333840239339789, + "learning_rate": 2.9556817301823292e-05, + "loss": 2.7251, + "step": 43214 + }, + { + "epoch": 2.011965453825919, + "grad_norm": 0.33357007322796883, + "learning_rate": 2.9554345351086045e-05, + "loss": 2.6063, + "step": 43215 + }, + { + "epoch": 2.012012012012012, + "grad_norm": 0.3317076920380171, + "learning_rate": 2.9551873460354784e-05, + "loss": 2.5378, + "step": 43216 + }, + { + "epoch": 2.0120585701981053, + "grad_norm": 0.325842202591581, + 
"learning_rate": 2.9549401629636776e-05, + "loss": 2.8085, + "step": 43217 + }, + { + "epoch": 2.012105128384198, + "grad_norm": 0.32323685541046177, + "learning_rate": 2.954692985893923e-05, + "loss": 2.5785, + "step": 43218 + }, + { + "epoch": 2.012151686570291, + "grad_norm": 0.3372898450632223, + "learning_rate": 2.9544458148269455e-05, + "loss": 2.648, + "step": 43219 + }, + { + "epoch": 2.012198244756384, + "grad_norm": 0.3637484736166554, + "learning_rate": 2.9541986497634677e-05, + "loss": 2.6257, + "step": 43220 + }, + { + "epoch": 2.0122448029424773, + "grad_norm": 0.34616530382875466, + "learning_rate": 2.9539514907042144e-05, + "loss": 2.661, + "step": 43221 + }, + { + "epoch": 2.0122913611285704, + "grad_norm": 0.3515174026311419, + "learning_rate": 2.9537043376499128e-05, + "loss": 2.7103, + "step": 43222 + }, + { + "epoch": 2.0123379193146635, + "grad_norm": 0.3627193645667236, + "learning_rate": 2.9534571906012875e-05, + "loss": 2.5751, + "step": 43223 + }, + { + "epoch": 2.0123844775007567, + "grad_norm": 0.35636293928100304, + "learning_rate": 2.953210049559062e-05, + "loss": 2.6326, + "step": 43224 + }, + { + "epoch": 2.01243103568685, + "grad_norm": 0.3419422360348146, + "learning_rate": 2.952962914523965e-05, + "loss": 2.6104, + "step": 43225 + }, + { + "epoch": 2.012477593872943, + "grad_norm": 0.3651382572287109, + "learning_rate": 2.9527157854967193e-05, + "loss": 2.6135, + "step": 43226 + }, + { + "epoch": 2.0125241520590356, + "grad_norm": 0.33461031006703457, + "learning_rate": 2.952468662478052e-05, + "loss": 2.6968, + "step": 43227 + }, + { + "epoch": 2.0125707102451287, + "grad_norm": 0.3386470570260054, + "learning_rate": 2.9522215454686874e-05, + "loss": 2.6674, + "step": 43228 + }, + { + "epoch": 2.012617268431222, + "grad_norm": 0.3307964904242487, + "learning_rate": 2.95197443446935e-05, + "loss": 2.6099, + "step": 43229 + }, + { + "epoch": 2.012663826617315, + "grad_norm": 0.32784559185061257, + "learning_rate": 2.9517273294807668e-05, + "loss": 2.6345, + "step": 43230 + }, + { + "epoch": 2.012710384803408, + "grad_norm": 0.32727173435829665, + "learning_rate": 2.9514802305036605e-05, + "loss": 2.5918, + "step": 43231 + }, + { + "epoch": 2.012756942989501, + "grad_norm": 0.3340844842345573, + "learning_rate": 2.951233137538759e-05, + "loss": 2.6167, + "step": 43232 + }, + { + "epoch": 2.0128035011755943, + "grad_norm": 0.34194236554591073, + "learning_rate": 2.9509860505867877e-05, + "loss": 2.5827, + "step": 43233 + }, + { + "epoch": 2.0128500593616874, + "grad_norm": 0.3293631349144071, + "learning_rate": 2.9507389696484677e-05, + "loss": 2.5379, + "step": 43234 + }, + { + "epoch": 2.0128966175477805, + "grad_norm": 0.33510246285308265, + "learning_rate": 2.9504918947245285e-05, + "loss": 2.6098, + "step": 43235 + }, + { + "epoch": 2.0129431757338736, + "grad_norm": 0.35882049264765736, + "learning_rate": 2.9502448258156928e-05, + "loss": 2.6554, + "step": 43236 + }, + { + "epoch": 2.0129897339199663, + "grad_norm": 0.3221720951123868, + "learning_rate": 2.9499977629226884e-05, + "loss": 2.6017, + "step": 43237 + }, + { + "epoch": 2.0130362921060594, + "grad_norm": 0.3179418595847344, + "learning_rate": 2.949750706046238e-05, + "loss": 2.6688, + "step": 43238 + }, + { + "epoch": 2.0130828502921525, + "grad_norm": 0.3620761142180578, + "learning_rate": 2.9495036551870662e-05, + "loss": 2.5871, + "step": 43239 + }, + { + "epoch": 2.0131294084782456, + "grad_norm": 0.36607158167704096, + "learning_rate": 2.9492566103459006e-05, + "loss": 2.6597, + "step": 
43240 + }, + { + "epoch": 2.0131759666643387, + "grad_norm": 0.32885642475043936, + "learning_rate": 2.9490095715234635e-05, + "loss": 2.5441, + "step": 43241 + }, + { + "epoch": 2.013222524850432, + "grad_norm": 0.3617792060328111, + "learning_rate": 2.948762538720482e-05, + "loss": 2.6939, + "step": 43242 + }, + { + "epoch": 2.013269083036525, + "grad_norm": 0.33904066595270044, + "learning_rate": 2.9485155119376806e-05, + "loss": 2.7002, + "step": 43243 + }, + { + "epoch": 2.013315641222618, + "grad_norm": 0.3383388611293945, + "learning_rate": 2.948268491175783e-05, + "loss": 2.666, + "step": 43244 + }, + { + "epoch": 2.013362199408711, + "grad_norm": 0.35405037753210816, + "learning_rate": 2.9480214764355163e-05, + "loss": 2.5814, + "step": 43245 + }, + { + "epoch": 2.0134087575948043, + "grad_norm": 0.36661383915466655, + "learning_rate": 2.9477744677176046e-05, + "loss": 2.7653, + "step": 43246 + }, + { + "epoch": 2.013455315780897, + "grad_norm": 0.37721230152421337, + "learning_rate": 2.947527465022771e-05, + "loss": 2.7406, + "step": 43247 + }, + { + "epoch": 2.01350187396699, + "grad_norm": 0.40992443309189586, + "learning_rate": 2.9472804683517436e-05, + "loss": 2.6705, + "step": 43248 + }, + { + "epoch": 2.013548432153083, + "grad_norm": 0.3468487728704689, + "learning_rate": 2.9470334777052445e-05, + "loss": 2.5997, + "step": 43249 + }, + { + "epoch": 2.0135949903391763, + "grad_norm": 0.36530540010707735, + "learning_rate": 2.946786493084001e-05, + "loss": 2.7222, + "step": 43250 + }, + { + "epoch": 2.0136415485252694, + "grad_norm": 0.41616937182673003, + "learning_rate": 2.9465395144887375e-05, + "loss": 2.632, + "step": 43251 + }, + { + "epoch": 2.0136881067113626, + "grad_norm": 0.38397578914373187, + "learning_rate": 2.946292541920177e-05, + "loss": 2.7336, + "step": 43252 + }, + { + "epoch": 2.0137346648974557, + "grad_norm": 0.3532205734151905, + "learning_rate": 2.946045575379046e-05, + "loss": 2.6117, + "step": 43253 + }, + { + "epoch": 2.013781223083549, + "grad_norm": 0.3814802349341798, + "learning_rate": 2.9457986148660676e-05, + "loss": 2.639, + "step": 43254 + }, + { + "epoch": 2.013827781269642, + "grad_norm": 0.3778496157483357, + "learning_rate": 2.9455516603819698e-05, + "loss": 2.6273, + "step": 43255 + }, + { + "epoch": 2.013874339455735, + "grad_norm": 0.349431936546777, + "learning_rate": 2.9453047119274757e-05, + "loss": 2.5919, + "step": 43256 + }, + { + "epoch": 2.0139208976418277, + "grad_norm": 0.35445621727136356, + "learning_rate": 2.9450577695033077e-05, + "loss": 2.6, + "step": 43257 + }, + { + "epoch": 2.013967455827921, + "grad_norm": 0.38605005489009353, + "learning_rate": 2.9448108331101942e-05, + "loss": 2.6162, + "step": 43258 + }, + { + "epoch": 2.014014014014014, + "grad_norm": 0.3362671896427632, + "learning_rate": 2.944563902748857e-05, + "loss": 2.6214, + "step": 43259 + }, + { + "epoch": 2.014060572200107, + "grad_norm": 0.34719348369747766, + "learning_rate": 2.944316978420024e-05, + "loss": 2.6676, + "step": 43260 + }, + { + "epoch": 2.0141071303862, + "grad_norm": 0.38455185389416735, + "learning_rate": 2.9440700601244187e-05, + "loss": 2.593, + "step": 43261 + }, + { + "epoch": 2.0141536885722933, + "grad_norm": 0.33233319973449776, + "learning_rate": 2.9438231478627626e-05, + "loss": 2.5881, + "step": 43262 + }, + { + "epoch": 2.0142002467583864, + "grad_norm": 0.33625904782380567, + "learning_rate": 2.9435762416357852e-05, + "loss": 2.5322, + "step": 43263 + }, + { + "epoch": 2.0142468049444795, + "grad_norm": 
0.342659022924466, + "learning_rate": 2.9433293414442087e-05, + "loss": 2.6511, + "step": 43264 + }, + { + "epoch": 2.0142933631305726, + "grad_norm": 0.3137989147243522, + "learning_rate": 2.9430824472887576e-05, + "loss": 2.6201, + "step": 43265 + }, + { + "epoch": 2.0143399213166653, + "grad_norm": 0.35003473148683334, + "learning_rate": 2.942835559170157e-05, + "loss": 2.7319, + "step": 43266 + }, + { + "epoch": 2.0143864795027584, + "grad_norm": 0.32101603628539027, + "learning_rate": 2.9425886770891296e-05, + "loss": 2.5708, + "step": 43267 + }, + { + "epoch": 2.0144330376888515, + "grad_norm": 0.32780056200450663, + "learning_rate": 2.9423418010464045e-05, + "loss": 2.7151, + "step": 43268 + }, + { + "epoch": 2.0144795958749446, + "grad_norm": 0.32894339896335867, + "learning_rate": 2.9420949310427027e-05, + "loss": 2.5966, + "step": 43269 + }, + { + "epoch": 2.0145261540610377, + "grad_norm": 0.3487241140943314, + "learning_rate": 2.9418480670787487e-05, + "loss": 2.5994, + "step": 43270 + }, + { + "epoch": 2.014572712247131, + "grad_norm": 0.3472414193802593, + "learning_rate": 2.941601209155268e-05, + "loss": 2.7584, + "step": 43271 + }, + { + "epoch": 2.014619270433224, + "grad_norm": 0.3207476317279403, + "learning_rate": 2.941354357272984e-05, + "loss": 2.6569, + "step": 43272 + }, + { + "epoch": 2.014665828619317, + "grad_norm": 0.36347665272281715, + "learning_rate": 2.9411075114326248e-05, + "loss": 2.6594, + "step": 43273 + }, + { + "epoch": 2.01471238680541, + "grad_norm": 0.3515069034526502, + "learning_rate": 2.9408606716349107e-05, + "loss": 2.6402, + "step": 43274 + }, + { + "epoch": 2.0147589449915033, + "grad_norm": 0.36312896460586597, + "learning_rate": 2.940613837880567e-05, + "loss": 2.6748, + "step": 43275 + }, + { + "epoch": 2.014805503177596, + "grad_norm": 0.3322675054783444, + "learning_rate": 2.94036701017032e-05, + "loss": 2.5697, + "step": 43276 + }, + { + "epoch": 2.014852061363689, + "grad_norm": 0.3946902199604388, + "learning_rate": 2.9401201885048923e-05, + "loss": 2.6158, + "step": 43277 + }, + { + "epoch": 2.014898619549782, + "grad_norm": 0.3445907964360533, + "learning_rate": 2.9398733728850096e-05, + "loss": 2.6427, + "step": 43278 + }, + { + "epoch": 2.0149451777358753, + "grad_norm": 0.3636158481170865, + "learning_rate": 2.9396265633113963e-05, + "loss": 2.6231, + "step": 43279 + }, + { + "epoch": 2.0149917359219685, + "grad_norm": 0.37125463511236917, + "learning_rate": 2.9393797597847734e-05, + "loss": 2.6368, + "step": 43280 + }, + { + "epoch": 2.0150382941080616, + "grad_norm": 0.34239602503981653, + "learning_rate": 2.93913296230587e-05, + "loss": 2.6139, + "step": 43281 + }, + { + "epoch": 2.0150848522941547, + "grad_norm": 0.3501483292569995, + "learning_rate": 2.9388861708754082e-05, + "loss": 2.6338, + "step": 43282 + }, + { + "epoch": 2.015131410480248, + "grad_norm": 0.34845553801309964, + "learning_rate": 2.938639385494111e-05, + "loss": 2.6388, + "step": 43283 + }, + { + "epoch": 2.015177968666341, + "grad_norm": 0.36116823981828816, + "learning_rate": 2.9383926061627054e-05, + "loss": 2.7354, + "step": 43284 + }, + { + "epoch": 2.015224526852434, + "grad_norm": 0.3371711355726617, + "learning_rate": 2.9381458328819127e-05, + "loss": 2.6775, + "step": 43285 + }, + { + "epoch": 2.0152710850385267, + "grad_norm": 0.3485392893374931, + "learning_rate": 2.937899065652461e-05, + "loss": 2.635, + "step": 43286 + }, + { + "epoch": 2.01531764322462, + "grad_norm": 0.3404475287395345, + "learning_rate": 2.9376523044750724e-05, + "loss": 
2.6246, + "step": 43287 + }, + { + "epoch": 2.015364201410713, + "grad_norm": 0.3546405161385789, + "learning_rate": 2.9374055493504693e-05, + "loss": 2.702, + "step": 43288 + }, + { + "epoch": 2.015410759596806, + "grad_norm": 0.3469002980348078, + "learning_rate": 2.937158800279379e-05, + "loss": 2.653, + "step": 43289 + }, + { + "epoch": 2.015457317782899, + "grad_norm": 0.35107201552658673, + "learning_rate": 2.936912057262524e-05, + "loss": 2.6563, + "step": 43290 + }, + { + "epoch": 2.0155038759689923, + "grad_norm": 0.35801936892064795, + "learning_rate": 2.93666532030063e-05, + "loss": 2.6125, + "step": 43291 + }, + { + "epoch": 2.0155504341550854, + "grad_norm": 0.3221248576200373, + "learning_rate": 2.9364185893944195e-05, + "loss": 2.6738, + "step": 43292 + }, + { + "epoch": 2.0155969923411785, + "grad_norm": 0.3612623303180131, + "learning_rate": 2.9361718645446153e-05, + "loss": 2.677, + "step": 43293 + }, + { + "epoch": 2.0156435505272716, + "grad_norm": 0.34832613563878223, + "learning_rate": 2.9359251457519454e-05, + "loss": 2.6354, + "step": 43294 + }, + { + "epoch": 2.0156901087133647, + "grad_norm": 0.37239642429364506, + "learning_rate": 2.9356784330171294e-05, + "loss": 2.724, + "step": 43295 + }, + { + "epoch": 2.0157366668994574, + "grad_norm": 0.36795565413137166, + "learning_rate": 2.935431726340897e-05, + "loss": 2.7405, + "step": 43296 + }, + { + "epoch": 2.0157832250855505, + "grad_norm": 0.3460348663357976, + "learning_rate": 2.9351850257239677e-05, + "loss": 2.6661, + "step": 43297 + }, + { + "epoch": 2.0158297832716436, + "grad_norm": 0.3370243190318451, + "learning_rate": 2.934938331167066e-05, + "loss": 2.6608, + "step": 43298 + }, + { + "epoch": 2.0158763414577368, + "grad_norm": 0.3271025463199653, + "learning_rate": 2.934691642670918e-05, + "loss": 2.6747, + "step": 43299 + }, + { + "epoch": 2.01592289964383, + "grad_norm": 0.3518084604097546, + "learning_rate": 2.9344449602362473e-05, + "loss": 2.7502, + "step": 43300 + }, + { + "epoch": 2.015969457829923, + "grad_norm": 0.347508856361399, + "learning_rate": 2.9341982838637748e-05, + "loss": 2.7276, + "step": 43301 + }, + { + "epoch": 2.016016016016016, + "grad_norm": 0.3507377788839966, + "learning_rate": 2.933951613554229e-05, + "loss": 2.6119, + "step": 43302 + }, + { + "epoch": 2.016062574202109, + "grad_norm": 0.360968777706131, + "learning_rate": 2.93370494930833e-05, + "loss": 2.6414, + "step": 43303 + }, + { + "epoch": 2.0161091323882023, + "grad_norm": 0.3248259794936549, + "learning_rate": 2.9334582911268056e-05, + "loss": 2.6269, + "step": 43304 + }, + { + "epoch": 2.0161556905742954, + "grad_norm": 0.33870203172082464, + "learning_rate": 2.933211639010377e-05, + "loss": 2.6662, + "step": 43305 + }, + { + "epoch": 2.016202248760388, + "grad_norm": 0.3400638047537428, + "learning_rate": 2.9329649929597662e-05, + "loss": 2.6606, + "step": 43306 + }, + { + "epoch": 2.0162488069464812, + "grad_norm": 0.35436290368317663, + "learning_rate": 2.9327183529757017e-05, + "loss": 2.6884, + "step": 43307 + }, + { + "epoch": 2.0162953651325743, + "grad_norm": 0.34398855946206525, + "learning_rate": 2.9324717190589034e-05, + "loss": 2.7286, + "step": 43308 + }, + { + "epoch": 2.0163419233186675, + "grad_norm": 0.35388183229848047, + "learning_rate": 2.932225091210099e-05, + "loss": 2.6921, + "step": 43309 + }, + { + "epoch": 2.0163884815047606, + "grad_norm": 0.3491172020191709, + "learning_rate": 2.93197846943001e-05, + "loss": 2.6343, + "step": 43310 + }, + { + "epoch": 2.0164350396908537, + "grad_norm": 
0.34589811464092984, + "learning_rate": 2.9317318537193585e-05, + "loss": 2.6687, + "step": 43311 + }, + { + "epoch": 2.016481597876947, + "grad_norm": 0.3391180987191731, + "learning_rate": 2.931485244078872e-05, + "loss": 2.7085, + "step": 43312 + }, + { + "epoch": 2.01652815606304, + "grad_norm": 0.3372460360679688, + "learning_rate": 2.9312386405092723e-05, + "loss": 2.6863, + "step": 43313 + }, + { + "epoch": 2.016574714249133, + "grad_norm": 0.3558874449703755, + "learning_rate": 2.9309920430112824e-05, + "loss": 2.5993, + "step": 43314 + }, + { + "epoch": 2.0166212724352257, + "grad_norm": 0.3781496000531446, + "learning_rate": 2.9307454515856293e-05, + "loss": 2.6296, + "step": 43315 + }, + { + "epoch": 2.016667830621319, + "grad_norm": 0.34751510936428154, + "learning_rate": 2.9304988662330324e-05, + "loss": 2.5558, + "step": 43316 + }, + { + "epoch": 2.016714388807412, + "grad_norm": 0.3189157083315967, + "learning_rate": 2.9302522869542187e-05, + "loss": 2.5937, + "step": 43317 + }, + { + "epoch": 2.016760946993505, + "grad_norm": 0.36055281794732413, + "learning_rate": 2.9300057137499106e-05, + "loss": 2.5723, + "step": 43318 + }, + { + "epoch": 2.016807505179598, + "grad_norm": 0.3521410025972882, + "learning_rate": 2.9297591466208297e-05, + "loss": 2.6523, + "step": 43319 + }, + { + "epoch": 2.0168540633656913, + "grad_norm": 0.3173991180573997, + "learning_rate": 2.9295125855677042e-05, + "loss": 2.5525, + "step": 43320 + }, + { + "epoch": 2.0169006215517844, + "grad_norm": 0.3500905962129376, + "learning_rate": 2.9292660305912527e-05, + "loss": 2.486, + "step": 43321 + }, + { + "epoch": 2.0169471797378775, + "grad_norm": 0.3583621752184198, + "learning_rate": 2.9290194816922034e-05, + "loss": 2.6574, + "step": 43322 + }, + { + "epoch": 2.0169937379239706, + "grad_norm": 0.33739056488789915, + "learning_rate": 2.9287729388712788e-05, + "loss": 2.6661, + "step": 43323 + }, + { + "epoch": 2.0170402961100637, + "grad_norm": 0.3586926163276452, + "learning_rate": 2.9285264021291986e-05, + "loss": 2.7579, + "step": 43324 + }, + { + "epoch": 2.0170868542961564, + "grad_norm": 0.35619694081276954, + "learning_rate": 2.9282798714666914e-05, + "loss": 2.6846, + "step": 43325 + }, + { + "epoch": 2.0171334124822495, + "grad_norm": 0.32633342844623753, + "learning_rate": 2.9280333468844783e-05, + "loss": 2.5535, + "step": 43326 + }, + { + "epoch": 2.0171799706683426, + "grad_norm": 0.3468799566015526, + "learning_rate": 2.927786828383283e-05, + "loss": 2.691, + "step": 43327 + }, + { + "epoch": 2.0172265288544358, + "grad_norm": 0.3738004176402848, + "learning_rate": 2.9275403159638303e-05, + "loss": 2.7376, + "step": 43328 + }, + { + "epoch": 2.017273087040529, + "grad_norm": 0.3243142603103159, + "learning_rate": 2.927293809626841e-05, + "loss": 2.6556, + "step": 43329 + }, + { + "epoch": 2.017319645226622, + "grad_norm": 0.37201300832660095, + "learning_rate": 2.9270473093730416e-05, + "loss": 2.7084, + "step": 43330 + }, + { + "epoch": 2.017366203412715, + "grad_norm": 0.33479445967400556, + "learning_rate": 2.926800815203152e-05, + "loss": 2.5965, + "step": 43331 + }, + { + "epoch": 2.0174127615988082, + "grad_norm": 0.33089487500524567, + "learning_rate": 2.9265543271178998e-05, + "loss": 2.7699, + "step": 43332 + }, + { + "epoch": 2.0174593197849013, + "grad_norm": 0.35906723961417564, + "learning_rate": 2.9263078451180066e-05, + "loss": 2.5499, + "step": 43333 + }, + { + "epoch": 2.0175058779709945, + "grad_norm": 0.3356133573824586, + "learning_rate": 2.926061369204193e-05, + 
"loss": 2.711, + "step": 43334 + }, + { + "epoch": 2.017552436157087, + "grad_norm": 0.3537669430861685, + "learning_rate": 2.9258148993771876e-05, + "loss": 2.5834, + "step": 43335 + }, + { + "epoch": 2.0175989943431802, + "grad_norm": 0.3494809452205285, + "learning_rate": 2.925568435637711e-05, + "loss": 2.665, + "step": 43336 + }, + { + "epoch": 2.0176455525292734, + "grad_norm": 0.33847570640228075, + "learning_rate": 2.925321977986485e-05, + "loss": 2.7577, + "step": 43337 + }, + { + "epoch": 2.0176921107153665, + "grad_norm": 0.35373532192388296, + "learning_rate": 2.9250755264242358e-05, + "loss": 2.6403, + "step": 43338 + }, + { + "epoch": 2.0177386689014596, + "grad_norm": 0.33904357250075956, + "learning_rate": 2.9248290809516847e-05, + "loss": 2.6167, + "step": 43339 + }, + { + "epoch": 2.0177852270875527, + "grad_norm": 0.33170296247089387, + "learning_rate": 2.9245826415695566e-05, + "loss": 2.6124, + "step": 43340 + }, + { + "epoch": 2.017831785273646, + "grad_norm": 0.334631378279453, + "learning_rate": 2.9243362082785742e-05, + "loss": 2.609, + "step": 43341 + }, + { + "epoch": 2.017878343459739, + "grad_norm": 0.3377026799829215, + "learning_rate": 2.92408978107946e-05, + "loss": 2.6003, + "step": 43342 + }, + { + "epoch": 2.017924901645832, + "grad_norm": 0.3224423952309504, + "learning_rate": 2.923843359972938e-05, + "loss": 2.6282, + "step": 43343 + }, + { + "epoch": 2.017971459831925, + "grad_norm": 0.3724890444934258, + "learning_rate": 2.92359694495973e-05, + "loss": 2.626, + "step": 43344 + }, + { + "epoch": 2.018018018018018, + "grad_norm": 0.34108898565434825, + "learning_rate": 2.9233505360405622e-05, + "loss": 2.6601, + "step": 43345 + }, + { + "epoch": 2.018064576204111, + "grad_norm": 0.36111458910090793, + "learning_rate": 2.923104133216156e-05, + "loss": 2.6877, + "step": 43346 + }, + { + "epoch": 2.018111134390204, + "grad_norm": 0.36403100950442613, + "learning_rate": 2.9228577364872324e-05, + "loss": 2.658, + "step": 43347 + }, + { + "epoch": 2.018157692576297, + "grad_norm": 0.3276607148053394, + "learning_rate": 2.9226113458545186e-05, + "loss": 2.6968, + "step": 43348 + }, + { + "epoch": 2.0182042507623903, + "grad_norm": 0.3443728527856546, + "learning_rate": 2.9223649613187344e-05, + "loss": 2.6831, + "step": 43349 + }, + { + "epoch": 2.0182508089484834, + "grad_norm": 0.3375876738643364, + "learning_rate": 2.922118582880605e-05, + "loss": 2.6722, + "step": 43350 + }, + { + "epoch": 2.0182973671345765, + "grad_norm": 0.36188429753679585, + "learning_rate": 2.9218722105408546e-05, + "loss": 2.6249, + "step": 43351 + }, + { + "epoch": 2.0183439253206696, + "grad_norm": 0.3260543805765714, + "learning_rate": 2.9216258443002025e-05, + "loss": 2.6476, + "step": 43352 + }, + { + "epoch": 2.0183904835067628, + "grad_norm": 0.3287476738368449, + "learning_rate": 2.9213794841593744e-05, + "loss": 2.5533, + "step": 43353 + }, + { + "epoch": 2.0184370416928554, + "grad_norm": 0.3572520775526156, + "learning_rate": 2.921133130119093e-05, + "loss": 2.6886, + "step": 43354 + }, + { + "epoch": 2.0184835998789485, + "grad_norm": 0.3238804661971636, + "learning_rate": 2.9208867821800805e-05, + "loss": 2.6571, + "step": 43355 + }, + { + "epoch": 2.0185301580650417, + "grad_norm": 0.3249361366009446, + "learning_rate": 2.9206404403430615e-05, + "loss": 2.6981, + "step": 43356 + }, + { + "epoch": 2.0185767162511348, + "grad_norm": 0.3826164648705841, + "learning_rate": 2.9203941046087556e-05, + "loss": 2.6622, + "step": 43357 + }, + { + "epoch": 2.018623274437228, + 
"grad_norm": 0.3444915858595886, + "learning_rate": 2.9201477749778906e-05, + "loss": 2.7185, + "step": 43358 + }, + { + "epoch": 2.018669832623321, + "grad_norm": 0.3501721771312052, + "learning_rate": 2.919901451451187e-05, + "loss": 2.6101, + "step": 43359 + }, + { + "epoch": 2.018716390809414, + "grad_norm": 0.33425507325675313, + "learning_rate": 2.9196551340293654e-05, + "loss": 2.6979, + "step": 43360 + }, + { + "epoch": 2.0187629489955072, + "grad_norm": 0.35965228204453603, + "learning_rate": 2.9194088227131532e-05, + "loss": 2.5972, + "step": 43361 + }, + { + "epoch": 2.0188095071816003, + "grad_norm": 0.3402225320783176, + "learning_rate": 2.91916251750327e-05, + "loss": 2.6507, + "step": 43362 + }, + { + "epoch": 2.0188560653676935, + "grad_norm": 0.3273795177597856, + "learning_rate": 2.9189162184004404e-05, + "loss": 2.6611, + "step": 43363 + }, + { + "epoch": 2.018902623553786, + "grad_norm": 0.34207791164308565, + "learning_rate": 2.918669925405388e-05, + "loss": 2.6303, + "step": 43364 + }, + { + "epoch": 2.0189491817398793, + "grad_norm": 0.34191474909335684, + "learning_rate": 2.9184236385188303e-05, + "loss": 2.6798, + "step": 43365 + }, + { + "epoch": 2.0189957399259724, + "grad_norm": 0.3403203444832343, + "learning_rate": 2.9181773577414983e-05, + "loss": 2.6678, + "step": 43366 + }, + { + "epoch": 2.0190422981120655, + "grad_norm": 0.32330516155991984, + "learning_rate": 2.917931083074107e-05, + "loss": 2.6577, + "step": 43367 + }, + { + "epoch": 2.0190888562981586, + "grad_norm": 0.3420286162712086, + "learning_rate": 2.9176848145173864e-05, + "loss": 2.6496, + "step": 43368 + }, + { + "epoch": 2.0191354144842517, + "grad_norm": 0.3340952250331267, + "learning_rate": 2.9174385520720537e-05, + "loss": 2.7067, + "step": 43369 + }, + { + "epoch": 2.019181972670345, + "grad_norm": 0.33033363265691357, + "learning_rate": 2.917192295738834e-05, + "loss": 2.6629, + "step": 43370 + }, + { + "epoch": 2.019228530856438, + "grad_norm": 0.3089215856107824, + "learning_rate": 2.9169460455184494e-05, + "loss": 2.5528, + "step": 43371 + }, + { + "epoch": 2.019275089042531, + "grad_norm": 0.31887804187727337, + "learning_rate": 2.9166998014116253e-05, + "loss": 2.599, + "step": 43372 + }, + { + "epoch": 2.019321647228624, + "grad_norm": 0.34056610594647047, + "learning_rate": 2.9164535634190797e-05, + "loss": 2.6429, + "step": 43373 + }, + { + "epoch": 2.019368205414717, + "grad_norm": 0.33220130001814085, + "learning_rate": 2.9162073315415384e-05, + "loss": 2.5727, + "step": 43374 + }, + { + "epoch": 2.01941476360081, + "grad_norm": 0.35330059335456554, + "learning_rate": 2.915961105779722e-05, + "loss": 2.6711, + "step": 43375 + }, + { + "epoch": 2.019461321786903, + "grad_norm": 0.3258626689138199, + "learning_rate": 2.9157148861343552e-05, + "loss": 2.5938, + "step": 43376 + }, + { + "epoch": 2.019507879972996, + "grad_norm": 0.33956155960618795, + "learning_rate": 2.915468672606162e-05, + "loss": 2.5554, + "step": 43377 + }, + { + "epoch": 2.0195544381590893, + "grad_norm": 0.3239010377737244, + "learning_rate": 2.9152224651958587e-05, + "loss": 2.5931, + "step": 43378 + }, + { + "epoch": 2.0196009963451824, + "grad_norm": 0.3593371472663565, + "learning_rate": 2.914976263904176e-05, + "loss": 2.7355, + "step": 43379 + }, + { + "epoch": 2.0196475545312755, + "grad_norm": 0.3262155789961366, + "learning_rate": 2.914730068731829e-05, + "loss": 2.658, + "step": 43380 + }, + { + "epoch": 2.0196941127173687, + "grad_norm": 0.32177937646212035, + "learning_rate": 
2.914483879679547e-05, + "loss": 2.7338, + "step": 43381 + }, + { + "epoch": 2.0197406709034618, + "grad_norm": 0.3427959931594224, + "learning_rate": 2.914237696748048e-05, + "loss": 2.619, + "step": 43382 + }, + { + "epoch": 2.019787229089555, + "grad_norm": 0.33650020817837284, + "learning_rate": 2.9139915199380558e-05, + "loss": 2.6425, + "step": 43383 + }, + { + "epoch": 2.0198337872756476, + "grad_norm": 0.3167692980764141, + "learning_rate": 2.913745349250293e-05, + "loss": 2.5762, + "step": 43384 + }, + { + "epoch": 2.0198803454617407, + "grad_norm": 0.3503146040295164, + "learning_rate": 2.913499184685482e-05, + "loss": 2.7303, + "step": 43385 + }, + { + "epoch": 2.019926903647834, + "grad_norm": 0.31384962418817397, + "learning_rate": 2.913253026244347e-05, + "loss": 2.6125, + "step": 43386 + }, + { + "epoch": 2.019973461833927, + "grad_norm": 0.3403093096538607, + "learning_rate": 2.9130068739276072e-05, + "loss": 2.6785, + "step": 43387 + }, + { + "epoch": 2.02002002002002, + "grad_norm": 0.3516191510939275, + "learning_rate": 2.9127607277359862e-05, + "loss": 2.7691, + "step": 43388 + }, + { + "epoch": 2.020066578206113, + "grad_norm": 0.3223799860669109, + "learning_rate": 2.9125145876702075e-05, + "loss": 2.6512, + "step": 43389 + }, + { + "epoch": 2.0201131363922062, + "grad_norm": 0.35284437310052924, + "learning_rate": 2.912268453730994e-05, + "loss": 2.6275, + "step": 43390 + }, + { + "epoch": 2.0201596945782994, + "grad_norm": 0.30731017201548255, + "learning_rate": 2.9120223259190638e-05, + "loss": 2.5352, + "step": 43391 + }, + { + "epoch": 2.0202062527643925, + "grad_norm": 0.3306820774298442, + "learning_rate": 2.9117762042351458e-05, + "loss": 2.6888, + "step": 43392 + }, + { + "epoch": 2.0202528109504856, + "grad_norm": 0.3354249049382017, + "learning_rate": 2.911530088679955e-05, + "loss": 2.6888, + "step": 43393 + }, + { + "epoch": 2.0202993691365783, + "grad_norm": 0.32464828715578087, + "learning_rate": 2.9112839792542217e-05, + "loss": 2.754, + "step": 43394 + }, + { + "epoch": 2.0203459273226714, + "grad_norm": 0.33549359676623886, + "learning_rate": 2.911037875958662e-05, + "loss": 2.6689, + "step": 43395 + }, + { + "epoch": 2.0203924855087645, + "grad_norm": 0.31996520131122225, + "learning_rate": 2.9107917787940008e-05, + "loss": 2.6774, + "step": 43396 + }, + { + "epoch": 2.0204390436948576, + "grad_norm": 0.31936466161839955, + "learning_rate": 2.9105456877609594e-05, + "loss": 2.6113, + "step": 43397 + }, + { + "epoch": 2.0204856018809507, + "grad_norm": 0.3301358137802314, + "learning_rate": 2.9102996028602607e-05, + "loss": 2.6486, + "step": 43398 + }, + { + "epoch": 2.020532160067044, + "grad_norm": 0.33407647272873775, + "learning_rate": 2.910053524092629e-05, + "loss": 2.7038, + "step": 43399 + }, + { + "epoch": 2.020578718253137, + "grad_norm": 0.33973518301009364, + "learning_rate": 2.9098074514587824e-05, + "loss": 2.641, + "step": 43400 + }, + { + "epoch": 2.02062527643923, + "grad_norm": 0.3533019871270985, + "learning_rate": 2.9095613849594448e-05, + "loss": 2.7626, + "step": 43401 + }, + { + "epoch": 2.020671834625323, + "grad_norm": 0.3193024003295377, + "learning_rate": 2.9093153245953386e-05, + "loss": 2.6296, + "step": 43402 + }, + { + "epoch": 2.020718392811416, + "grad_norm": 0.35296205868117464, + "learning_rate": 2.9090692703671856e-05, + "loss": 2.6878, + "step": 43403 + }, + { + "epoch": 2.020764950997509, + "grad_norm": 0.32867720953132834, + "learning_rate": 2.9088232222757083e-05, + "loss": 2.5638, + "step": 43404 + }, + { + 
"epoch": 2.020811509183602, + "grad_norm": 0.5217068653158236, + "learning_rate": 2.9085771803216315e-05, + "loss": 2.75, + "step": 43405 + }, + { + "epoch": 2.020858067369695, + "grad_norm": 0.33279433231778494, + "learning_rate": 2.9083311445056704e-05, + "loss": 2.657, + "step": 43406 + }, + { + "epoch": 2.0209046255557883, + "grad_norm": 0.34450972682022785, + "learning_rate": 2.9080851148285553e-05, + "loss": 2.6628, + "step": 43407 + }, + { + "epoch": 2.0209511837418814, + "grad_norm": 0.37871266011622795, + "learning_rate": 2.9078390912910025e-05, + "loss": 2.6046, + "step": 43408 + }, + { + "epoch": 2.0209977419279745, + "grad_norm": 0.36257272263766005, + "learning_rate": 2.9075930738937353e-05, + "loss": 2.6752, + "step": 43409 + }, + { + "epoch": 2.0210443001140677, + "grad_norm": 0.3490147972233895, + "learning_rate": 2.9073470626374766e-05, + "loss": 2.6688, + "step": 43410 + }, + { + "epoch": 2.0210908583001608, + "grad_norm": 0.38284412711560417, + "learning_rate": 2.907101057522948e-05, + "loss": 2.6696, + "step": 43411 + }, + { + "epoch": 2.021137416486254, + "grad_norm": 0.3502352688595593, + "learning_rate": 2.9068550585508734e-05, + "loss": 2.6664, + "step": 43412 + }, + { + "epoch": 2.0211839746723466, + "grad_norm": 0.36606513568621735, + "learning_rate": 2.9066090657219713e-05, + "loss": 2.6885, + "step": 43413 + }, + { + "epoch": 2.0212305328584397, + "grad_norm": 0.37913722013629475, + "learning_rate": 2.9063630790369654e-05, + "loss": 2.7935, + "step": 43414 + }, + { + "epoch": 2.021277091044533, + "grad_norm": 0.3336783081530531, + "learning_rate": 2.906117098496578e-05, + "loss": 2.5905, + "step": 43415 + }, + { + "epoch": 2.021323649230626, + "grad_norm": 0.3532869641424576, + "learning_rate": 2.90587112410153e-05, + "loss": 2.593, + "step": 43416 + }, + { + "epoch": 2.021370207416719, + "grad_norm": 0.35354753439083664, + "learning_rate": 2.905625155852544e-05, + "loss": 2.6359, + "step": 43417 + }, + { + "epoch": 2.021416765602812, + "grad_norm": 0.3315813860384278, + "learning_rate": 2.9053791937503437e-05, + "loss": 2.6451, + "step": 43418 + }, + { + "epoch": 2.0214633237889053, + "grad_norm": 0.3325673348500475, + "learning_rate": 2.9051332377956457e-05, + "loss": 2.5857, + "step": 43419 + }, + { + "epoch": 2.0215098819749984, + "grad_norm": 0.3323984889583222, + "learning_rate": 2.9048872879891787e-05, + "loss": 2.5977, + "step": 43420 + }, + { + "epoch": 2.0215564401610915, + "grad_norm": 0.3385975476797825, + "learning_rate": 2.90464134433166e-05, + "loss": 2.6228, + "step": 43421 + }, + { + "epoch": 2.0216029983471846, + "grad_norm": 0.3485409783936878, + "learning_rate": 2.9043954068238115e-05, + "loss": 2.69, + "step": 43422 + }, + { + "epoch": 2.0216495565332773, + "grad_norm": 0.3567280496843613, + "learning_rate": 2.9041494754663567e-05, + "loss": 2.629, + "step": 43423 + }, + { + "epoch": 2.0216961147193704, + "grad_norm": 0.3512896032221733, + "learning_rate": 2.9039035502600166e-05, + "loss": 2.6437, + "step": 43424 + }, + { + "epoch": 2.0217426729054635, + "grad_norm": 0.3316825295844534, + "learning_rate": 2.903657631205514e-05, + "loss": 2.6414, + "step": 43425 + }, + { + "epoch": 2.0217892310915566, + "grad_norm": 0.32804878874035576, + "learning_rate": 2.9034117183035692e-05, + "loss": 2.6203, + "step": 43426 + }, + { + "epoch": 2.0218357892776497, + "grad_norm": 0.3311780397856152, + "learning_rate": 2.9031658115549037e-05, + "loss": 2.6405, + "step": 43427 + }, + { + "epoch": 2.021882347463743, + "grad_norm": 0.3534403179087017, + 
"learning_rate": 2.90291991096024e-05, + "loss": 2.6605, + "step": 43428 + }, + { + "epoch": 2.021928905649836, + "grad_norm": 0.3360888592309985, + "learning_rate": 2.9026740165203003e-05, + "loss": 2.6641, + "step": 43429 + }, + { + "epoch": 2.021975463835929, + "grad_norm": 0.3496788042131198, + "learning_rate": 2.902428128235805e-05, + "loss": 2.6535, + "step": 43430 + }, + { + "epoch": 2.022022022022022, + "grad_norm": 0.34067719545681696, + "learning_rate": 2.902182246107479e-05, + "loss": 2.6392, + "step": 43431 + }, + { + "epoch": 2.0220685802081153, + "grad_norm": 0.3249783016053469, + "learning_rate": 2.9019363701360374e-05, + "loss": 2.6, + "step": 43432 + }, + { + "epoch": 2.022115138394208, + "grad_norm": 0.33726325428086557, + "learning_rate": 2.9016905003222093e-05, + "loss": 2.6463, + "step": 43433 + }, + { + "epoch": 2.022161696580301, + "grad_norm": 0.3465450457430525, + "learning_rate": 2.9014446366667112e-05, + "loss": 2.7192, + "step": 43434 + }, + { + "epoch": 2.022208254766394, + "grad_norm": 0.33558868702199185, + "learning_rate": 2.9011987791702672e-05, + "loss": 2.6619, + "step": 43435 + }, + { + "epoch": 2.0222548129524873, + "grad_norm": 0.33804192187847665, + "learning_rate": 2.900952927833597e-05, + "loss": 2.7022, + "step": 43436 + }, + { + "epoch": 2.0223013711385804, + "grad_norm": 0.3420413644131086, + "learning_rate": 2.9007070826574235e-05, + "loss": 2.6807, + "step": 43437 + }, + { + "epoch": 2.0223479293246736, + "grad_norm": 0.33989873838568635, + "learning_rate": 2.90046124364247e-05, + "loss": 2.5691, + "step": 43438 + }, + { + "epoch": 2.0223944875107667, + "grad_norm": 0.35540193162244416, + "learning_rate": 2.900215410789452e-05, + "loss": 2.6105, + "step": 43439 + }, + { + "epoch": 2.02244104569686, + "grad_norm": 0.3358442108621005, + "learning_rate": 2.8999695840990982e-05, + "loss": 2.7577, + "step": 43440 + }, + { + "epoch": 2.022487603882953, + "grad_norm": 0.35346456319273983, + "learning_rate": 2.899723763572125e-05, + "loss": 2.7021, + "step": 43441 + }, + { + "epoch": 2.0225341620690456, + "grad_norm": 0.35422776608916556, + "learning_rate": 2.899477949209256e-05, + "loss": 2.5682, + "step": 43442 + }, + { + "epoch": 2.0225807202551387, + "grad_norm": 0.34660663407078196, + "learning_rate": 2.899232141011212e-05, + "loss": 2.6746, + "step": 43443 + }, + { + "epoch": 2.022627278441232, + "grad_norm": 0.3473288445577929, + "learning_rate": 2.8989863389787175e-05, + "loss": 2.6407, + "step": 43444 + }, + { + "epoch": 2.022673836627325, + "grad_norm": 0.33536073721717735, + "learning_rate": 2.8987405431124866e-05, + "loss": 2.7279, + "step": 43445 + }, + { + "epoch": 2.022720394813418, + "grad_norm": 0.3282805450586961, + "learning_rate": 2.8984947534132488e-05, + "loss": 2.6361, + "step": 43446 + }, + { + "epoch": 2.022766952999511, + "grad_norm": 0.3307164348203062, + "learning_rate": 2.8982489698817207e-05, + "loss": 2.633, + "step": 43447 + }, + { + "epoch": 2.0228135111856043, + "grad_norm": 0.3600510900936601, + "learning_rate": 2.8980031925186236e-05, + "loss": 2.7542, + "step": 43448 + }, + { + "epoch": 2.0228600693716974, + "grad_norm": 0.3775963128864178, + "learning_rate": 2.897757421324681e-05, + "loss": 2.6513, + "step": 43449 + }, + { + "epoch": 2.0229066275577905, + "grad_norm": 0.32088966011431713, + "learning_rate": 2.8975116563006134e-05, + "loss": 2.4752, + "step": 43450 + }, + { + "epoch": 2.0229531857438836, + "grad_norm": 0.34245849568305997, + "learning_rate": 2.8972658974471432e-05, + "loss": 2.6896, + "step": 43451 
+ }, + { + "epoch": 2.0229997439299763, + "grad_norm": 0.377258080239758, + "learning_rate": 2.897020144764987e-05, + "loss": 2.7369, + "step": 43452 + }, + { + "epoch": 2.0230463021160694, + "grad_norm": 0.3460787252909156, + "learning_rate": 2.8967743982548734e-05, + "loss": 2.6495, + "step": 43453 + }, + { + "epoch": 2.0230928603021625, + "grad_norm": 0.3392567429609421, + "learning_rate": 2.8965286579175173e-05, + "loss": 2.8029, + "step": 43454 + }, + { + "epoch": 2.0231394184882556, + "grad_norm": 0.32866050089016147, + "learning_rate": 2.8962829237536427e-05, + "loss": 2.6082, + "step": 43455 + }, + { + "epoch": 2.0231859766743487, + "grad_norm": 0.37325286209270864, + "learning_rate": 2.8960371957639704e-05, + "loss": 2.6824, + "step": 43456 + }, + { + "epoch": 2.023232534860442, + "grad_norm": 0.3444880334235475, + "learning_rate": 2.895791473949222e-05, + "loss": 2.656, + "step": 43457 + }, + { + "epoch": 2.023279093046535, + "grad_norm": 0.3304227527033486, + "learning_rate": 2.8955457583101174e-05, + "loss": 2.6498, + "step": 43458 + }, + { + "epoch": 2.023325651232628, + "grad_norm": 0.3632113101535083, + "learning_rate": 2.8953000488473813e-05, + "loss": 2.6969, + "step": 43459 + }, + { + "epoch": 2.023372209418721, + "grad_norm": 0.33702725405701744, + "learning_rate": 2.8950543455617306e-05, + "loss": 2.6756, + "step": 43460 + }, + { + "epoch": 2.0234187676048143, + "grad_norm": 0.3463186150626428, + "learning_rate": 2.8948086484538872e-05, + "loss": 2.7304, + "step": 43461 + }, + { + "epoch": 2.023465325790907, + "grad_norm": 0.3407707052829193, + "learning_rate": 2.8945629575245735e-05, + "loss": 2.6308, + "step": 43462 + }, + { + "epoch": 2.023511883977, + "grad_norm": 0.34430780759706453, + "learning_rate": 2.8943172727745105e-05, + "loss": 2.7116, + "step": 43463 + }, + { + "epoch": 2.023558442163093, + "grad_norm": 0.3391575066366512, + "learning_rate": 2.8940715942044206e-05, + "loss": 2.7281, + "step": 43464 + }, + { + "epoch": 2.0236050003491863, + "grad_norm": 0.35144059488464685, + "learning_rate": 2.8938259218150186e-05, + "loss": 2.6904, + "step": 43465 + }, + { + "epoch": 2.0236515585352794, + "grad_norm": 0.3581986062679858, + "learning_rate": 2.8935802556070347e-05, + "loss": 2.6702, + "step": 43466 + }, + { + "epoch": 2.0236981167213726, + "grad_norm": 0.32432005542994374, + "learning_rate": 2.893334595581183e-05, + "loss": 2.6369, + "step": 43467 + }, + { + "epoch": 2.0237446749074657, + "grad_norm": 0.35353522457756026, + "learning_rate": 2.8930889417381862e-05, + "loss": 2.73, + "step": 43468 + }, + { + "epoch": 2.023791233093559, + "grad_norm": 0.31522966224539917, + "learning_rate": 2.8928432940787663e-05, + "loss": 2.6518, + "step": 43469 + }, + { + "epoch": 2.023837791279652, + "grad_norm": 0.36658850879600363, + "learning_rate": 2.8925976526036437e-05, + "loss": 2.5659, + "step": 43470 + }, + { + "epoch": 2.023884349465745, + "grad_norm": 0.29599989979674846, + "learning_rate": 2.8923520173135388e-05, + "loss": 2.6569, + "step": 43471 + }, + { + "epoch": 2.0239309076518377, + "grad_norm": 0.31097484290296934, + "learning_rate": 2.8921063882091758e-05, + "loss": 2.6796, + "step": 43472 + }, + { + "epoch": 2.023977465837931, + "grad_norm": 0.34253109899122214, + "learning_rate": 2.891860765291271e-05, + "loss": 2.6389, + "step": 43473 + }, + { + "epoch": 2.024024024024024, + "grad_norm": 0.32019508622907056, + "learning_rate": 2.8916151485605468e-05, + "loss": 2.682, + "step": 43474 + }, + { + "epoch": 2.024070582210117, + "grad_norm": 
0.3120624512743874, + "learning_rate": 2.891369538017724e-05, + "loss": 2.7051, + "step": 43475 + }, + { + "epoch": 2.02411714039621, + "grad_norm": 0.3243836881557385, + "learning_rate": 2.891123933663525e-05, + "loss": 2.525, + "step": 43476 + }, + { + "epoch": 2.0241636985823033, + "grad_norm": 0.33887374451906854, + "learning_rate": 2.8908783354986707e-05, + "loss": 2.6868, + "step": 43477 + }, + { + "epoch": 2.0242102567683964, + "grad_norm": 0.3335055949427486, + "learning_rate": 2.8906327435238768e-05, + "loss": 2.5765, + "step": 43478 + }, + { + "epoch": 2.0242568149544895, + "grad_norm": 0.31119280458579923, + "learning_rate": 2.890387157739872e-05, + "loss": 2.7096, + "step": 43479 + }, + { + "epoch": 2.0243033731405826, + "grad_norm": 0.34123295796301334, + "learning_rate": 2.890141578147371e-05, + "loss": 2.6918, + "step": 43480 + }, + { + "epoch": 2.0243499313266753, + "grad_norm": 0.3143590747102938, + "learning_rate": 2.889896004747097e-05, + "loss": 2.599, + "step": 43481 + }, + { + "epoch": 2.0243964895127684, + "grad_norm": 0.3345504056335331, + "learning_rate": 2.8896504375397705e-05, + "loss": 2.695, + "step": 43482 + }, + { + "epoch": 2.0244430476988615, + "grad_norm": 0.3741812755111606, + "learning_rate": 2.889404876526111e-05, + "loss": 2.7272, + "step": 43483 + }, + { + "epoch": 2.0244896058849546, + "grad_norm": 0.32812055177320604, + "learning_rate": 2.8891593217068412e-05, + "loss": 2.5233, + "step": 43484 + }, + { + "epoch": 2.0245361640710478, + "grad_norm": 0.35106374322095424, + "learning_rate": 2.8889137730826822e-05, + "loss": 2.6894, + "step": 43485 + }, + { + "epoch": 2.024582722257141, + "grad_norm": 0.3450912975217175, + "learning_rate": 2.888668230654352e-05, + "loss": 2.6119, + "step": 43486 + }, + { + "epoch": 2.024629280443234, + "grad_norm": 0.3503946949277142, + "learning_rate": 2.8884226944225724e-05, + "loss": 2.6975, + "step": 43487 + }, + { + "epoch": 2.024675838629327, + "grad_norm": 0.3579460013536227, + "learning_rate": 2.8881771643880636e-05, + "loss": 2.6205, + "step": 43488 + }, + { + "epoch": 2.02472239681542, + "grad_norm": 0.35254818289076606, + "learning_rate": 2.8879316405515465e-05, + "loss": 2.6339, + "step": 43489 + }, + { + "epoch": 2.0247689550015133, + "grad_norm": 0.3301949608313029, + "learning_rate": 2.8876861229137453e-05, + "loss": 2.5703, + "step": 43490 + }, + { + "epoch": 2.024815513187606, + "grad_norm": 0.3364235388229631, + "learning_rate": 2.8874406114753725e-05, + "loss": 2.6185, + "step": 43491 + }, + { + "epoch": 2.024862071373699, + "grad_norm": 0.3412530304635183, + "learning_rate": 2.8871951062371573e-05, + "loss": 2.5906, + "step": 43492 + }, + { + "epoch": 2.0249086295597922, + "grad_norm": 0.34810661774272317, + "learning_rate": 2.8869496071998127e-05, + "loss": 2.6611, + "step": 43493 + }, + { + "epoch": 2.0249551877458853, + "grad_norm": 0.32918029523625236, + "learning_rate": 2.886704114364066e-05, + "loss": 2.7216, + "step": 43494 + }, + { + "epoch": 2.0250017459319785, + "grad_norm": 0.32997291708170634, + "learning_rate": 2.8864586277306332e-05, + "loss": 2.6676, + "step": 43495 + }, + { + "epoch": 2.0250483041180716, + "grad_norm": 0.3377513235224612, + "learning_rate": 2.8862131473002362e-05, + "loss": 2.6355, + "step": 43496 + }, + { + "epoch": 2.0250948623041647, + "grad_norm": 0.34411763361367814, + "learning_rate": 2.8859676730735952e-05, + "loss": 2.7603, + "step": 43497 + }, + { + "epoch": 2.025141420490258, + "grad_norm": 0.3115573443627925, + "learning_rate": 2.8857222050514327e-05, + 
"loss": 2.6974, + "step": 43498 + }, + { + "epoch": 2.025187978676351, + "grad_norm": 0.3471133590064644, + "learning_rate": 2.8854767432344654e-05, + "loss": 2.704, + "step": 43499 + }, + { + "epoch": 2.025234536862444, + "grad_norm": 0.3470486573019662, + "learning_rate": 2.8852312876234155e-05, + "loss": 2.8033, + "step": 43500 + }, + { + "epoch": 2.0252810950485367, + "grad_norm": 0.3346511534280311, + "learning_rate": 2.8849858382190032e-05, + "loss": 2.6422, + "step": 43501 + }, + { + "epoch": 2.02532765323463, + "grad_norm": 0.3331317538596529, + "learning_rate": 2.8847403950219493e-05, + "loss": 2.5886, + "step": 43502 + }, + { + "epoch": 2.025374211420723, + "grad_norm": 0.3366813965355712, + "learning_rate": 2.884494958032976e-05, + "loss": 2.7232, + "step": 43503 + }, + { + "epoch": 2.025420769606816, + "grad_norm": 0.3311606082935044, + "learning_rate": 2.884249527252797e-05, + "loss": 2.614, + "step": 43504 + }, + { + "epoch": 2.025467327792909, + "grad_norm": 0.3225398182047933, + "learning_rate": 2.8840041026821417e-05, + "loss": 2.6701, + "step": 43505 + }, + { + "epoch": 2.0255138859790023, + "grad_norm": 0.3311579115905751, + "learning_rate": 2.8837586843217225e-05, + "loss": 2.6368, + "step": 43506 + }, + { + "epoch": 2.0255604441650954, + "grad_norm": 0.3295376230526894, + "learning_rate": 2.8835132721722665e-05, + "loss": 2.6843, + "step": 43507 + }, + { + "epoch": 2.0256070023511885, + "grad_norm": 0.3234474755789754, + "learning_rate": 2.883267866234488e-05, + "loss": 2.501, + "step": 43508 + }, + { + "epoch": 2.0256535605372816, + "grad_norm": 0.3211629509136826, + "learning_rate": 2.8830224665091103e-05, + "loss": 2.5778, + "step": 43509 + }, + { + "epoch": 2.0257001187233747, + "grad_norm": 0.322056333274902, + "learning_rate": 2.8827770729968535e-05, + "loss": 2.6793, + "step": 43510 + }, + { + "epoch": 2.0257466769094674, + "grad_norm": 0.3216387359504056, + "learning_rate": 2.882531685698437e-05, + "loss": 2.6672, + "step": 43511 + }, + { + "epoch": 2.0257932350955605, + "grad_norm": 0.34702300272992187, + "learning_rate": 2.882286304614583e-05, + "loss": 2.6199, + "step": 43512 + }, + { + "epoch": 2.0258397932816536, + "grad_norm": 0.31447182108268945, + "learning_rate": 2.882040929746009e-05, + "loss": 2.6664, + "step": 43513 + }, + { + "epoch": 2.0258863514677468, + "grad_norm": 0.3651107744401995, + "learning_rate": 2.8817955610934356e-05, + "loss": 2.6816, + "step": 43514 + }, + { + "epoch": 2.02593290965384, + "grad_norm": 0.33339307885578395, + "learning_rate": 2.8815501986575833e-05, + "loss": 2.6192, + "step": 43515 + }, + { + "epoch": 2.025979467839933, + "grad_norm": 0.3023943662861858, + "learning_rate": 2.881304842439175e-05, + "loss": 2.536, + "step": 43516 + }, + { + "epoch": 2.026026026026026, + "grad_norm": 0.3242214284300283, + "learning_rate": 2.8810594924389246e-05, + "loss": 2.6459, + "step": 43517 + }, + { + "epoch": 2.026072584212119, + "grad_norm": 0.3477227535234448, + "learning_rate": 2.880814148657559e-05, + "loss": 2.6735, + "step": 43518 + }, + { + "epoch": 2.0261191423982123, + "grad_norm": 0.34323681439682013, + "learning_rate": 2.8805688110957913e-05, + "loss": 2.6937, + "step": 43519 + }, + { + "epoch": 2.0261657005843055, + "grad_norm": 0.3184281601442407, + "learning_rate": 2.8803234797543494e-05, + "loss": 2.5837, + "step": 43520 + }, + { + "epoch": 2.026212258770398, + "grad_norm": 0.37429594205590727, + "learning_rate": 2.880078154633947e-05, + "loss": 2.7039, + "step": 43521 + }, + { + "epoch": 2.0262588169564912, + 
"grad_norm": 0.3472031120823144, + "learning_rate": 2.879832835735307e-05, + "loss": 2.6748, + "step": 43522 + }, + { + "epoch": 2.0263053751425844, + "grad_norm": 0.3511988552752714, + "learning_rate": 2.8795875230591484e-05, + "loss": 2.729, + "step": 43523 + }, + { + "epoch": 2.0263519333286775, + "grad_norm": 0.37144511739727737, + "learning_rate": 2.879342216606192e-05, + "loss": 2.7358, + "step": 43524 + }, + { + "epoch": 2.0263984915147706, + "grad_norm": 0.35932551185951295, + "learning_rate": 2.8790969163771585e-05, + "loss": 2.7429, + "step": 43525 + }, + { + "epoch": 2.0264450497008637, + "grad_norm": 0.35415916199021824, + "learning_rate": 2.8788516223727647e-05, + "loss": 2.6308, + "step": 43526 + }, + { + "epoch": 2.026491607886957, + "grad_norm": 0.3234548994292287, + "learning_rate": 2.878606334593733e-05, + "loss": 2.6095, + "step": 43527 + }, + { + "epoch": 2.02653816607305, + "grad_norm": 0.3487376995314459, + "learning_rate": 2.8783610530407824e-05, + "loss": 2.6916, + "step": 43528 + }, + { + "epoch": 2.026584724259143, + "grad_norm": 0.35642248513907526, + "learning_rate": 2.878115777714633e-05, + "loss": 2.6527, + "step": 43529 + }, + { + "epoch": 2.0266312824452357, + "grad_norm": 0.35391700029281914, + "learning_rate": 2.8778705086160052e-05, + "loss": 2.6629, + "step": 43530 + }, + { + "epoch": 2.026677840631329, + "grad_norm": 0.34035242651931913, + "learning_rate": 2.87762524574562e-05, + "loss": 2.7344, + "step": 43531 + }, + { + "epoch": 2.026724398817422, + "grad_norm": 0.3516957291416167, + "learning_rate": 2.8773799891041912e-05, + "loss": 2.6692, + "step": 43532 + }, + { + "epoch": 2.026770957003515, + "grad_norm": 0.3371854611535028, + "learning_rate": 2.8771347386924473e-05, + "loss": 2.6649, + "step": 43533 + }, + { + "epoch": 2.026817515189608, + "grad_norm": 0.33147437010261244, + "learning_rate": 2.8768894945111023e-05, + "loss": 2.6893, + "step": 43534 + }, + { + "epoch": 2.0268640733757013, + "grad_norm": 0.35170738971903015, + "learning_rate": 2.8766442565608776e-05, + "loss": 2.5979, + "step": 43535 + }, + { + "epoch": 2.0269106315617944, + "grad_norm": 0.32418763243365445, + "learning_rate": 2.876399024842492e-05, + "loss": 2.5731, + "step": 43536 + }, + { + "epoch": 2.0269571897478875, + "grad_norm": 0.3527786905611506, + "learning_rate": 2.8761537993566666e-05, + "loss": 2.6912, + "step": 43537 + }, + { + "epoch": 2.0270037479339806, + "grad_norm": 0.32874220929000114, + "learning_rate": 2.8759085801041202e-05, + "loss": 2.6193, + "step": 43538 + }, + { + "epoch": 2.0270503061200738, + "grad_norm": 0.35494417724795957, + "learning_rate": 2.8756633670855748e-05, + "loss": 2.6902, + "step": 43539 + }, + { + "epoch": 2.0270968643061664, + "grad_norm": 0.3149940998861289, + "learning_rate": 2.8754181603017464e-05, + "loss": 2.6226, + "step": 43540 + }, + { + "epoch": 2.0271434224922595, + "grad_norm": 0.3335852468670475, + "learning_rate": 2.8751729597533566e-05, + "loss": 2.7158, + "step": 43541 + }, + { + "epoch": 2.0271899806783527, + "grad_norm": 0.34357409319835763, + "learning_rate": 2.8749277654411244e-05, + "loss": 2.6256, + "step": 43542 + }, + { + "epoch": 2.0272365388644458, + "grad_norm": 0.32952219168593705, + "learning_rate": 2.8746825773657704e-05, + "loss": 2.5879, + "step": 43543 + }, + { + "epoch": 2.027283097050539, + "grad_norm": 0.3258973911456913, + "learning_rate": 2.874437395528015e-05, + "loss": 2.6344, + "step": 43544 + }, + { + "epoch": 2.027329655236632, + "grad_norm": 0.3514770916582726, + "learning_rate": 
2.874192219928573e-05, + "loss": 2.68, + "step": 43545 + }, + { + "epoch": 2.027376213422725, + "grad_norm": 0.32980125523745574, + "learning_rate": 2.8739470505681708e-05, + "loss": 2.6215, + "step": 43546 + }, + { + "epoch": 2.0274227716088182, + "grad_norm": 0.32300051342547664, + "learning_rate": 2.8737018874475207e-05, + "loss": 2.5392, + "step": 43547 + }, + { + "epoch": 2.0274693297949113, + "grad_norm": 0.32355507179553017, + "learning_rate": 2.873456730567351e-05, + "loss": 2.5888, + "step": 43548 + }, + { + "epoch": 2.0275158879810045, + "grad_norm": 0.3411645919512494, + "learning_rate": 2.873211579928373e-05, + "loss": 2.6934, + "step": 43549 + }, + { + "epoch": 2.027562446167097, + "grad_norm": 0.3462200302368629, + "learning_rate": 2.87296643553131e-05, + "loss": 2.7551, + "step": 43550 + }, + { + "epoch": 2.0276090043531902, + "grad_norm": 0.3307882069020218, + "learning_rate": 2.8727212973768813e-05, + "loss": 2.7021, + "step": 43551 + }, + { + "epoch": 2.0276555625392834, + "grad_norm": 0.35866954693471376, + "learning_rate": 2.872476165465806e-05, + "loss": 2.5456, + "step": 43552 + }, + { + "epoch": 2.0277021207253765, + "grad_norm": 0.3321928841210946, + "learning_rate": 2.872231039798805e-05, + "loss": 2.5749, + "step": 43553 + }, + { + "epoch": 2.0277486789114696, + "grad_norm": 0.3215328255898086, + "learning_rate": 2.8719859203765953e-05, + "loss": 2.6776, + "step": 43554 + }, + { + "epoch": 2.0277952370975627, + "grad_norm": 0.33858456859982683, + "learning_rate": 2.8717408071998965e-05, + "loss": 2.689, + "step": 43555 + }, + { + "epoch": 2.027841795283656, + "grad_norm": 0.31807853238030853, + "learning_rate": 2.8714957002694288e-05, + "loss": 2.6279, + "step": 43556 + }, + { + "epoch": 2.027888353469749, + "grad_norm": 0.3233435238779276, + "learning_rate": 2.8712505995859133e-05, + "loss": 2.5164, + "step": 43557 + }, + { + "epoch": 2.027934911655842, + "grad_norm": 0.3427534768597869, + "learning_rate": 2.8710055051500638e-05, + "loss": 2.6652, + "step": 43558 + }, + { + "epoch": 2.027981469841935, + "grad_norm": 0.3437669669119359, + "learning_rate": 2.870760416962608e-05, + "loss": 2.5432, + "step": 43559 + }, + { + "epoch": 2.028028028028028, + "grad_norm": 0.31893346338601103, + "learning_rate": 2.8705153350242553e-05, + "loss": 2.6971, + "step": 43560 + }, + { + "epoch": 2.028074586214121, + "grad_norm": 0.33850414941165846, + "learning_rate": 2.8702702593357355e-05, + "loss": 2.6204, + "step": 43561 + }, + { + "epoch": 2.028121144400214, + "grad_norm": 0.33428414512506255, + "learning_rate": 2.8700251898977605e-05, + "loss": 2.6502, + "step": 43562 + }, + { + "epoch": 2.028167702586307, + "grad_norm": 0.3486673093626396, + "learning_rate": 2.8697801267110513e-05, + "loss": 2.6118, + "step": 43563 + }, + { + "epoch": 2.0282142607724003, + "grad_norm": 0.3489419014429903, + "learning_rate": 2.8695350697763278e-05, + "loss": 2.6695, + "step": 43564 + }, + { + "epoch": 2.0282608189584934, + "grad_norm": 0.32460867996565357, + "learning_rate": 2.8692900190943084e-05, + "loss": 2.5748, + "step": 43565 + }, + { + "epoch": 2.0283073771445865, + "grad_norm": 0.3709651034804969, + "learning_rate": 2.8690449746657154e-05, + "loss": 2.6632, + "step": 43566 + }, + { + "epoch": 2.0283539353306796, + "grad_norm": 0.3297550092769407, + "learning_rate": 2.8687999364912637e-05, + "loss": 2.6121, + "step": 43567 + }, + { + "epoch": 2.0284004935167728, + "grad_norm": 0.3582458503403832, + "learning_rate": 2.8685549045716742e-05, + "loss": 2.7158, + "step": 43568 + }, + { + 
"epoch": 2.028447051702866, + "grad_norm": 0.3317537678727199, + "learning_rate": 2.8683098789076657e-05, + "loss": 2.5276, + "step": 43569 + }, + { + "epoch": 2.0284936098889585, + "grad_norm": 0.3306926374396222, + "learning_rate": 2.868064859499958e-05, + "loss": 2.6852, + "step": 43570 + }, + { + "epoch": 2.0285401680750517, + "grad_norm": 0.34063670745744873, + "learning_rate": 2.86781984634927e-05, + "loss": 2.7244, + "step": 43571 + }, + { + "epoch": 2.028586726261145, + "grad_norm": 0.3777713522747221, + "learning_rate": 2.867574839456322e-05, + "loss": 2.7416, + "step": 43572 + }, + { + "epoch": 2.028633284447238, + "grad_norm": 0.3477179483712686, + "learning_rate": 2.8673298388218283e-05, + "loss": 2.6746, + "step": 43573 + }, + { + "epoch": 2.028679842633331, + "grad_norm": 0.35678650827775266, + "learning_rate": 2.8670848444465155e-05, + "loss": 2.69, + "step": 43574 + }, + { + "epoch": 2.028726400819424, + "grad_norm": 0.3389246278136816, + "learning_rate": 2.866839856331096e-05, + "loss": 2.6824, + "step": 43575 + }, + { + "epoch": 2.0287729590055172, + "grad_norm": 0.35128624977518064, + "learning_rate": 2.8665948744762922e-05, + "loss": 2.6624, + "step": 43576 + }, + { + "epoch": 2.0288195171916104, + "grad_norm": 0.35718461146537384, + "learning_rate": 2.866349898882822e-05, + "loss": 2.6967, + "step": 43577 + }, + { + "epoch": 2.0288660753777035, + "grad_norm": 0.3467607809267553, + "learning_rate": 2.8661049295514043e-05, + "loss": 2.6054, + "step": 43578 + }, + { + "epoch": 2.028912633563796, + "grad_norm": 0.3535721511277796, + "learning_rate": 2.8658599664827606e-05, + "loss": 2.5979, + "step": 43579 + }, + { + "epoch": 2.0289591917498893, + "grad_norm": 0.3467328070468846, + "learning_rate": 2.8656150096776057e-05, + "loss": 2.7355, + "step": 43580 + }, + { + "epoch": 2.0290057499359824, + "grad_norm": 0.3922320637514795, + "learning_rate": 2.8653700591366606e-05, + "loss": 2.7284, + "step": 43581 + }, + { + "epoch": 2.0290523081220755, + "grad_norm": 0.3304707749586845, + "learning_rate": 2.865125114860644e-05, + "loss": 2.5936, + "step": 43582 + }, + { + "epoch": 2.0290988663081686, + "grad_norm": 0.3580360369291391, + "learning_rate": 2.8648801768502743e-05, + "loss": 2.6361, + "step": 43583 + }, + { + "epoch": 2.0291454244942617, + "grad_norm": 0.3340392917267083, + "learning_rate": 2.864635245106272e-05, + "loss": 2.693, + "step": 43584 + }, + { + "epoch": 2.029191982680355, + "grad_norm": 0.32435119172729, + "learning_rate": 2.864390319629356e-05, + "loss": 2.5707, + "step": 43585 + }, + { + "epoch": 2.029238540866448, + "grad_norm": 0.34262480644790044, + "learning_rate": 2.8641454004202406e-05, + "loss": 2.5159, + "step": 43586 + }, + { + "epoch": 2.029285099052541, + "grad_norm": 0.3362234662915411, + "learning_rate": 2.8639004874796514e-05, + "loss": 2.6543, + "step": 43587 + }, + { + "epoch": 2.029331657238634, + "grad_norm": 0.3725856243387515, + "learning_rate": 2.863655580808302e-05, + "loss": 2.6497, + "step": 43588 + }, + { + "epoch": 2.029378215424727, + "grad_norm": 0.33955779221314353, + "learning_rate": 2.863410680406914e-05, + "loss": 2.7018, + "step": 43589 + }, + { + "epoch": 2.02942477361082, + "grad_norm": 0.34144678923627325, + "learning_rate": 2.863165786276204e-05, + "loss": 2.6762, + "step": 43590 + }, + { + "epoch": 2.029471331796913, + "grad_norm": 0.3689923846990723, + "learning_rate": 2.8629208984168922e-05, + "loss": 2.6537, + "step": 43591 + }, + { + "epoch": 2.029517889983006, + "grad_norm": 0.36385600962253745, + "learning_rate": 
2.862676016829699e-05, + "loss": 2.7049, + "step": 43592 + }, + { + "epoch": 2.0295644481690993, + "grad_norm": 0.341989137767627, + "learning_rate": 2.8624311415153392e-05, + "loss": 2.654, + "step": 43593 + }, + { + "epoch": 2.0296110063551924, + "grad_norm": 0.3394766209103831, + "learning_rate": 2.8621862724745334e-05, + "loss": 2.6197, + "step": 43594 + }, + { + "epoch": 2.0296575645412855, + "grad_norm": 0.3922574312553453, + "learning_rate": 2.8619414097080005e-05, + "loss": 2.6176, + "step": 43595 + }, + { + "epoch": 2.0297041227273787, + "grad_norm": 0.34794768586466845, + "learning_rate": 2.8616965532164585e-05, + "loss": 2.6328, + "step": 43596 + }, + { + "epoch": 2.0297506809134718, + "grad_norm": 0.3568198587331754, + "learning_rate": 2.8614517030006273e-05, + "loss": 2.6548, + "step": 43597 + }, + { + "epoch": 2.029797239099565, + "grad_norm": 0.35751766380646594, + "learning_rate": 2.8612068590612258e-05, + "loss": 2.6199, + "step": 43598 + }, + { + "epoch": 2.0298437972856576, + "grad_norm": 0.3556852377048653, + "learning_rate": 2.8609620213989673e-05, + "loss": 2.6961, + "step": 43599 + }, + { + "epoch": 2.0298903554717507, + "grad_norm": 0.3386976053884129, + "learning_rate": 2.860717190014579e-05, + "loss": 2.6065, + "step": 43600 + }, + { + "epoch": 2.029936913657844, + "grad_norm": 0.3740309622302551, + "learning_rate": 2.8604723649087728e-05, + "loss": 2.7128, + "step": 43601 + }, + { + "epoch": 2.029983471843937, + "grad_norm": 0.35427528000843145, + "learning_rate": 2.8602275460822702e-05, + "loss": 2.6753, + "step": 43602 + }, + { + "epoch": 2.03003003003003, + "grad_norm": 0.33207296298332334, + "learning_rate": 2.859982733535788e-05, + "loss": 2.5971, + "step": 43603 + }, + { + "epoch": 2.030076588216123, + "grad_norm": 0.35813212163144437, + "learning_rate": 2.8597379272700463e-05, + "loss": 2.671, + "step": 43604 + }, + { + "epoch": 2.0301231464022162, + "grad_norm": 0.35314856703538616, + "learning_rate": 2.8594931272857645e-05, + "loss": 2.586, + "step": 43605 + }, + { + "epoch": 2.0301697045883094, + "grad_norm": 0.317297917760618, + "learning_rate": 2.8592483335836557e-05, + "loss": 2.6102, + "step": 43606 + }, + { + "epoch": 2.0302162627744025, + "grad_norm": 0.3513093446889116, + "learning_rate": 2.859003546164446e-05, + "loss": 2.6516, + "step": 43607 + }, + { + "epoch": 2.0302628209604956, + "grad_norm": 0.3516937512305702, + "learning_rate": 2.8587587650288483e-05, + "loss": 2.6266, + "step": 43608 + }, + { + "epoch": 2.0303093791465883, + "grad_norm": 0.35578739991017866, + "learning_rate": 2.858513990177582e-05, + "loss": 2.6495, + "step": 43609 + }, + { + "epoch": 2.0303559373326814, + "grad_norm": 0.3284656492682513, + "learning_rate": 2.8582692216113665e-05, + "loss": 2.6156, + "step": 43610 + }, + { + "epoch": 2.0304024955187745, + "grad_norm": 0.33653971092846136, + "learning_rate": 2.8580244593309224e-05, + "loss": 2.5797, + "step": 43611 + }, + { + "epoch": 2.0304490537048676, + "grad_norm": 0.35015180281208674, + "learning_rate": 2.8577797033369606e-05, + "loss": 2.7301, + "step": 43612 + }, + { + "epoch": 2.0304956118909607, + "grad_norm": 0.3339987285761303, + "learning_rate": 2.8575349536302087e-05, + "loss": 2.7069, + "step": 43613 + }, + { + "epoch": 2.030542170077054, + "grad_norm": 0.34610080320116254, + "learning_rate": 2.8572902102113787e-05, + "loss": 2.5772, + "step": 43614 + }, + { + "epoch": 2.030588728263147, + "grad_norm": 0.33069541870001423, + "learning_rate": 2.8570454730811913e-05, + "loss": 2.5897, + "step": 43615 + }, + 
{ + "epoch": 2.03063528644924, + "grad_norm": 0.35163264041176007, + "learning_rate": 2.8568007422403632e-05, + "loss": 2.7487, + "step": 43616 + }, + { + "epoch": 2.030681844635333, + "grad_norm": 0.33933793082276337, + "learning_rate": 2.8565560176896143e-05, + "loss": 2.5713, + "step": 43617 + }, + { + "epoch": 2.030728402821426, + "grad_norm": 0.32676148860550824, + "learning_rate": 2.856311299429665e-05, + "loss": 2.6501, + "step": 43618 + }, + { + "epoch": 2.030774961007519, + "grad_norm": 0.3267460683211646, + "learning_rate": 2.856066587461226e-05, + "loss": 2.7076, + "step": 43619 + }, + { + "epoch": 2.030821519193612, + "grad_norm": 0.3518626447394363, + "learning_rate": 2.8558218817850246e-05, + "loss": 2.6304, + "step": 43620 + }, + { + "epoch": 2.030868077379705, + "grad_norm": 0.34283756575390517, + "learning_rate": 2.855577182401773e-05, + "loss": 2.7651, + "step": 43621 + }, + { + "epoch": 2.0309146355657983, + "grad_norm": 0.3422193640833323, + "learning_rate": 2.8553324893121906e-05, + "loss": 2.5975, + "step": 43622 + }, + { + "epoch": 2.0309611937518914, + "grad_norm": 0.32866124857894785, + "learning_rate": 2.8550878025169968e-05, + "loss": 2.708, + "step": 43623 + }, + { + "epoch": 2.0310077519379846, + "grad_norm": 0.3378938751965334, + "learning_rate": 2.854843122016908e-05, + "loss": 2.5747, + "step": 43624 + }, + { + "epoch": 2.0310543101240777, + "grad_norm": 0.3449665675817505, + "learning_rate": 2.8545984478126443e-05, + "loss": 2.6598, + "step": 43625 + }, + { + "epoch": 2.031100868310171, + "grad_norm": 0.3168840828357694, + "learning_rate": 2.8543537799049246e-05, + "loss": 2.5459, + "step": 43626 + }, + { + "epoch": 2.031147426496264, + "grad_norm": 0.3307290245462128, + "learning_rate": 2.8541091182944624e-05, + "loss": 2.6374, + "step": 43627 + }, + { + "epoch": 2.0311939846823566, + "grad_norm": 0.3326798921659726, + "learning_rate": 2.8538644629819793e-05, + "loss": 2.6479, + "step": 43628 + }, + { + "epoch": 2.0312405428684497, + "grad_norm": 0.33703203446036617, + "learning_rate": 2.8536198139681924e-05, + "loss": 2.6807, + "step": 43629 + }, + { + "epoch": 2.031287101054543, + "grad_norm": 0.35492238669101117, + "learning_rate": 2.8533751712538197e-05, + "loss": 2.6202, + "step": 43630 + }, + { + "epoch": 2.031333659240636, + "grad_norm": 0.3401508267768821, + "learning_rate": 2.8531305348395814e-05, + "loss": 2.7059, + "step": 43631 + }, + { + "epoch": 2.031380217426729, + "grad_norm": 0.3279379174660127, + "learning_rate": 2.8528859047261896e-05, + "loss": 2.5836, + "step": 43632 + }, + { + "epoch": 2.031426775612822, + "grad_norm": 0.35594758753552813, + "learning_rate": 2.8526412809143698e-05, + "loss": 2.5637, + "step": 43633 + }, + { + "epoch": 2.0314733337989153, + "grad_norm": 0.3408283741883915, + "learning_rate": 2.852396663404835e-05, + "loss": 2.6252, + "step": 43634 + }, + { + "epoch": 2.0315198919850084, + "grad_norm": 0.33875664294786006, + "learning_rate": 2.8521520521983043e-05, + "loss": 2.6403, + "step": 43635 + }, + { + "epoch": 2.0315664501711015, + "grad_norm": 0.3635934147520198, + "learning_rate": 2.8519074472954953e-05, + "loss": 2.6624, + "step": 43636 + }, + { + "epoch": 2.0316130083571946, + "grad_norm": 0.32995084152716236, + "learning_rate": 2.8516628486971263e-05, + "loss": 2.6356, + "step": 43637 + }, + { + "epoch": 2.0316595665432873, + "grad_norm": 0.35140308618138494, + "learning_rate": 2.851418256403916e-05, + "loss": 2.7007, + "step": 43638 + }, + { + "epoch": 2.0317061247293804, + "grad_norm": 0.3323407573499792, 
+ "learning_rate": 2.8511736704165826e-05, + "loss": 2.5947, + "step": 43639 + }, + { + "epoch": 2.0317526829154735, + "grad_norm": 0.3662580189365717, + "learning_rate": 2.850929090735841e-05, + "loss": 2.6782, + "step": 43640 + }, + { + "epoch": 2.0317992411015666, + "grad_norm": 0.3309776868329361, + "learning_rate": 2.8506845173624108e-05, + "loss": 2.7214, + "step": 43641 + }, + { + "epoch": 2.0318457992876597, + "grad_norm": 0.3922982636177724, + "learning_rate": 2.8504399502970097e-05, + "loss": 2.7152, + "step": 43642 + }, + { + "epoch": 2.031892357473753, + "grad_norm": 0.32024360915124833, + "learning_rate": 2.8501953895403554e-05, + "loss": 2.6068, + "step": 43643 + }, + { + "epoch": 2.031938915659846, + "grad_norm": 0.34487808775219936, + "learning_rate": 2.849950835093168e-05, + "loss": 2.6859, + "step": 43644 + }, + { + "epoch": 2.031985473845939, + "grad_norm": 0.36543274114565044, + "learning_rate": 2.8497062869561596e-05, + "loss": 2.7874, + "step": 43645 + }, + { + "epoch": 2.032032032032032, + "grad_norm": 0.37062344928028024, + "learning_rate": 2.8494617451300553e-05, + "loss": 2.7646, + "step": 43646 + }, + { + "epoch": 2.0320785902181253, + "grad_norm": 0.35221849410647343, + "learning_rate": 2.8492172096155666e-05, + "loss": 2.6019, + "step": 43647 + }, + { + "epoch": 2.032125148404218, + "grad_norm": 0.3236935154566202, + "learning_rate": 2.848972680413413e-05, + "loss": 2.6007, + "step": 43648 + }, + { + "epoch": 2.032171706590311, + "grad_norm": 0.33983054181702493, + "learning_rate": 2.8487281575243136e-05, + "loss": 2.56, + "step": 43649 + }, + { + "epoch": 2.032218264776404, + "grad_norm": 0.3551149162149088, + "learning_rate": 2.8484836409489846e-05, + "loss": 2.6181, + "step": 43650 + }, + { + "epoch": 2.0322648229624973, + "grad_norm": 0.3318595482485909, + "learning_rate": 2.8482391306881445e-05, + "loss": 2.6207, + "step": 43651 + }, + { + "epoch": 2.0323113811485904, + "grad_norm": 0.34521317270261825, + "learning_rate": 2.8479946267425124e-05, + "loss": 2.6977, + "step": 43652 + }, + { + "epoch": 2.0323579393346836, + "grad_norm": 0.34763108661755354, + "learning_rate": 2.8477501291128018e-05, + "loss": 2.6043, + "step": 43653 + }, + { + "epoch": 2.0324044975207767, + "grad_norm": 0.3502239585792682, + "learning_rate": 2.8475056377997327e-05, + "loss": 2.6854, + "step": 43654 + }, + { + "epoch": 2.03245105570687, + "grad_norm": 0.3389997784340154, + "learning_rate": 2.8472611528040227e-05, + "loss": 2.6269, + "step": 43655 + }, + { + "epoch": 2.032497613892963, + "grad_norm": 0.34994532035205733, + "learning_rate": 2.8470166741263893e-05, + "loss": 2.6526, + "step": 43656 + }, + { + "epoch": 2.0325441720790556, + "grad_norm": 0.3573628541519942, + "learning_rate": 2.8467722017675513e-05, + "loss": 2.6337, + "step": 43657 + }, + { + "epoch": 2.0325907302651487, + "grad_norm": 0.3479744275862124, + "learning_rate": 2.846527735728221e-05, + "loss": 2.6321, + "step": 43658 + }, + { + "epoch": 2.032637288451242, + "grad_norm": 0.34956025940563773, + "learning_rate": 2.846283276009124e-05, + "loss": 2.6626, + "step": 43659 + }, + { + "epoch": 2.032683846637335, + "grad_norm": 0.36624581015193464, + "learning_rate": 2.8460388226109692e-05, + "loss": 2.618, + "step": 43660 + }, + { + "epoch": 2.032730404823428, + "grad_norm": 0.31702709682851415, + "learning_rate": 2.8457943755344817e-05, + "loss": 2.5484, + "step": 43661 + }, + { + "epoch": 2.032776963009521, + "grad_norm": 0.38311383856138975, + "learning_rate": 2.845549934780374e-05, + "loss": 2.6817, + 
"step": 43662 + }, + { + "epoch": 2.0328235211956143, + "grad_norm": 0.3459253630864621, + "learning_rate": 2.8453055003493646e-05, + "loss": 2.6542, + "step": 43663 + }, + { + "epoch": 2.0328700793817074, + "grad_norm": 0.33647074683133293, + "learning_rate": 2.8450610722421718e-05, + "loss": 2.6537, + "step": 43664 + }, + { + "epoch": 2.0329166375678005, + "grad_norm": 0.34963423774904046, + "learning_rate": 2.8448166504595142e-05, + "loss": 2.6442, + "step": 43665 + }, + { + "epoch": 2.0329631957538936, + "grad_norm": 0.3559795318750734, + "learning_rate": 2.8445722350021052e-05, + "loss": 2.5894, + "step": 43666 + }, + { + "epoch": 2.0330097539399863, + "grad_norm": 0.32340815667352024, + "learning_rate": 2.8443278258706653e-05, + "loss": 2.6342, + "step": 43667 + }, + { + "epoch": 2.0330563121260794, + "grad_norm": 0.31919831963362655, + "learning_rate": 2.8440834230659096e-05, + "loss": 2.6292, + "step": 43668 + }, + { + "epoch": 2.0331028703121725, + "grad_norm": 0.3274824499921495, + "learning_rate": 2.843839026588557e-05, + "loss": 2.7417, + "step": 43669 + }, + { + "epoch": 2.0331494284982656, + "grad_norm": 0.3257379233191319, + "learning_rate": 2.8435946364393275e-05, + "loss": 2.6083, + "step": 43670 + }, + { + "epoch": 2.0331959866843587, + "grad_norm": 0.3119714662348644, + "learning_rate": 2.8433502526189298e-05, + "loss": 2.5477, + "step": 43671 + }, + { + "epoch": 2.033242544870452, + "grad_norm": 0.33502143941907764, + "learning_rate": 2.8431058751280915e-05, + "loss": 2.5744, + "step": 43672 + }, + { + "epoch": 2.033289103056545, + "grad_norm": 0.3537355394273192, + "learning_rate": 2.842861503967521e-05, + "loss": 2.6185, + "step": 43673 + }, + { + "epoch": 2.033335661242638, + "grad_norm": 0.3146903482000551, + "learning_rate": 2.842617139137943e-05, + "loss": 2.667, + "step": 43674 + }, + { + "epoch": 2.033382219428731, + "grad_norm": 0.3552746585272062, + "learning_rate": 2.842372780640069e-05, + "loss": 2.6398, + "step": 43675 + }, + { + "epoch": 2.0334287776148243, + "grad_norm": 0.35233266209849745, + "learning_rate": 2.842128428474619e-05, + "loss": 2.6572, + "step": 43676 + }, + { + "epoch": 2.033475335800917, + "grad_norm": 0.35025116973859866, + "learning_rate": 2.8418840826423093e-05, + "loss": 2.7002, + "step": 43677 + }, + { + "epoch": 2.03352189398701, + "grad_norm": 0.33076212217391593, + "learning_rate": 2.841639743143857e-05, + "loss": 2.575, + "step": 43678 + }, + { + "epoch": 2.0335684521731032, + "grad_norm": 0.33412332907894965, + "learning_rate": 2.8413954099799817e-05, + "loss": 2.6569, + "step": 43679 + }, + { + "epoch": 2.0336150103591963, + "grad_norm": 0.34174920662017005, + "learning_rate": 2.841151083151396e-05, + "loss": 2.6579, + "step": 43680 + }, + { + "epoch": 2.0336615685452895, + "grad_norm": 0.328389848598885, + "learning_rate": 2.8409067626588193e-05, + "loss": 2.5886, + "step": 43681 + }, + { + "epoch": 2.0337081267313826, + "grad_norm": 0.3392633429421721, + "learning_rate": 2.8406624485029686e-05, + "loss": 2.6316, + "step": 43682 + }, + { + "epoch": 2.0337546849174757, + "grad_norm": 0.33363276222710003, + "learning_rate": 2.8404181406845625e-05, + "loss": 2.722, + "step": 43683 + }, + { + "epoch": 2.033801243103569, + "grad_norm": 0.35110153211552675, + "learning_rate": 2.8401738392043125e-05, + "loss": 2.5874, + "step": 43684 + }, + { + "epoch": 2.033847801289662, + "grad_norm": 0.3435226218620232, + "learning_rate": 2.8399295440629437e-05, + "loss": 2.6338, + "step": 43685 + }, + { + "epoch": 2.033894359475755, + 
"grad_norm": 0.33518527267411885, + "learning_rate": 2.839685255261165e-05, + "loss": 2.7324, + "step": 43686 + }, + { + "epoch": 2.0339409176618477, + "grad_norm": 0.3674481325236193, + "learning_rate": 2.8394409727997013e-05, + "loss": 2.6383, + "step": 43687 + }, + { + "epoch": 2.033987475847941, + "grad_norm": 0.33143082017025244, + "learning_rate": 2.8391966966792627e-05, + "loss": 2.7386, + "step": 43688 + }, + { + "epoch": 2.034034034034034, + "grad_norm": 0.35396123670120894, + "learning_rate": 2.8389524269005696e-05, + "loss": 2.6508, + "step": 43689 + }, + { + "epoch": 2.034080592220127, + "grad_norm": 0.35068619658350836, + "learning_rate": 2.8387081634643386e-05, + "loss": 2.5371, + "step": 43690 + }, + { + "epoch": 2.03412715040622, + "grad_norm": 0.32885850416995477, + "learning_rate": 2.838463906371286e-05, + "loss": 2.586, + "step": 43691 + }, + { + "epoch": 2.0341737085923133, + "grad_norm": 0.31532893926184435, + "learning_rate": 2.83821965562213e-05, + "loss": 2.5729, + "step": 43692 + }, + { + "epoch": 2.0342202667784064, + "grad_norm": 0.35276618066044885, + "learning_rate": 2.837975411217585e-05, + "loss": 2.5852, + "step": 43693 + }, + { + "epoch": 2.0342668249644995, + "grad_norm": 0.3624847754031335, + "learning_rate": 2.8377311731583695e-05, + "loss": 2.6112, + "step": 43694 + }, + { + "epoch": 2.0343133831505926, + "grad_norm": 0.33064058473849434, + "learning_rate": 2.8374869414451998e-05, + "loss": 2.6464, + "step": 43695 + }, + { + "epoch": 2.0343599413366857, + "grad_norm": 0.3253355813669867, + "learning_rate": 2.8372427160787928e-05, + "loss": 2.5256, + "step": 43696 + }, + { + "epoch": 2.0344064995227784, + "grad_norm": 0.3384866124762785, + "learning_rate": 2.836998497059865e-05, + "loss": 2.6223, + "step": 43697 + }, + { + "epoch": 2.0344530577088715, + "grad_norm": 0.35706701995432105, + "learning_rate": 2.836754284389136e-05, + "loss": 2.6485, + "step": 43698 + }, + { + "epoch": 2.0344996158949646, + "grad_norm": 0.31265544607555784, + "learning_rate": 2.8365100780673158e-05, + "loss": 2.6618, + "step": 43699 + }, + { + "epoch": 2.0345461740810578, + "grad_norm": 0.3305764050591129, + "learning_rate": 2.836265878095129e-05, + "loss": 2.516, + "step": 43700 + }, + { + "epoch": 2.034592732267151, + "grad_norm": 0.33832620314636086, + "learning_rate": 2.8360216844732877e-05, + "loss": 2.63, + "step": 43701 + }, + { + "epoch": 2.034639290453244, + "grad_norm": 0.32010014971503126, + "learning_rate": 2.8357774972025087e-05, + "loss": 2.6011, + "step": 43702 + }, + { + "epoch": 2.034685848639337, + "grad_norm": 0.3554796754029103, + "learning_rate": 2.8355333162835097e-05, + "loss": 2.681, + "step": 43703 + }, + { + "epoch": 2.03473240682543, + "grad_norm": 0.34094610322433155, + "learning_rate": 2.835289141717008e-05, + "loss": 2.7348, + "step": 43704 + }, + { + "epoch": 2.0347789650115233, + "grad_norm": 0.32448328446427166, + "learning_rate": 2.8350449735037204e-05, + "loss": 2.5419, + "step": 43705 + }, + { + "epoch": 2.034825523197616, + "grad_norm": 0.36284742538742754, + "learning_rate": 2.8348008116443607e-05, + "loss": 2.6412, + "step": 43706 + }, + { + "epoch": 2.034872081383709, + "grad_norm": 0.35427726654417635, + "learning_rate": 2.8345566561396475e-05, + "loss": 2.6242, + "step": 43707 + }, + { + "epoch": 2.0349186395698022, + "grad_norm": 0.3581920940370833, + "learning_rate": 2.8343125069902974e-05, + "loss": 2.6881, + "step": 43708 + }, + { + "epoch": 2.0349651977558953, + "grad_norm": 0.36314014858252397, + "learning_rate": 
2.8340683641970267e-05, + "loss": 2.6336, + "step": 43709 + }, + { + "epoch": 2.0350117559419885, + "grad_norm": 0.3417974899664122, + "learning_rate": 2.8338242277605515e-05, + "loss": 2.6879, + "step": 43710 + }, + { + "epoch": 2.0350583141280816, + "grad_norm": 0.37405551005940874, + "learning_rate": 2.833580097681591e-05, + "loss": 2.7058, + "step": 43711 + }, + { + "epoch": 2.0351048723141747, + "grad_norm": 0.331195573234499, + "learning_rate": 2.8333359739608556e-05, + "loss": 2.5622, + "step": 43712 + }, + { + "epoch": 2.035151430500268, + "grad_norm": 0.3673881911037977, + "learning_rate": 2.8330918565990694e-05, + "loss": 2.7598, + "step": 43713 + }, + { + "epoch": 2.035197988686361, + "grad_norm": 0.3604909069611129, + "learning_rate": 2.832847745596944e-05, + "loss": 2.665, + "step": 43714 + }, + { + "epoch": 2.035244546872454, + "grad_norm": 0.3823173343244864, + "learning_rate": 2.8326036409551958e-05, + "loss": 2.7544, + "step": 43715 + }, + { + "epoch": 2.0352911050585467, + "grad_norm": 0.3549126110688647, + "learning_rate": 2.832359542674543e-05, + "loss": 2.6318, + "step": 43716 + }, + { + "epoch": 2.03533766324464, + "grad_norm": 0.35278607404691614, + "learning_rate": 2.8321154507557013e-05, + "loss": 2.6042, + "step": 43717 + }, + { + "epoch": 2.035384221430733, + "grad_norm": 0.3325483199605284, + "learning_rate": 2.831871365199389e-05, + "loss": 2.6557, + "step": 43718 + }, + { + "epoch": 2.035430779616826, + "grad_norm": 0.34044995655847615, + "learning_rate": 2.8316272860063186e-05, + "loss": 2.548, + "step": 43719 + }, + { + "epoch": 2.035477337802919, + "grad_norm": 0.35422356615644035, + "learning_rate": 2.831383213177209e-05, + "loss": 2.6485, + "step": 43720 + }, + { + "epoch": 2.0355238959890123, + "grad_norm": 0.3722769275721891, + "learning_rate": 2.8311391467127757e-05, + "loss": 2.7234, + "step": 43721 + }, + { + "epoch": 2.0355704541751054, + "grad_norm": 0.3286176653329321, + "learning_rate": 2.830895086613735e-05, + "loss": 2.5897, + "step": 43722 + }, + { + "epoch": 2.0356170123611985, + "grad_norm": 0.3570454617175687, + "learning_rate": 2.830651032880804e-05, + "loss": 2.6825, + "step": 43723 + }, + { + "epoch": 2.0356635705472916, + "grad_norm": 0.3420811835393791, + "learning_rate": 2.8304069855147002e-05, + "loss": 2.6089, + "step": 43724 + }, + { + "epoch": 2.0357101287333847, + "grad_norm": 0.3367900843111975, + "learning_rate": 2.830162944516135e-05, + "loss": 2.5149, + "step": 43725 + }, + { + "epoch": 2.0357566869194774, + "grad_norm": 0.36235600307035787, + "learning_rate": 2.8299189098858305e-05, + "loss": 2.6814, + "step": 43726 + }, + { + "epoch": 2.0358032451055705, + "grad_norm": 0.32860193698497403, + "learning_rate": 2.8296748816244988e-05, + "loss": 2.6453, + "step": 43727 + }, + { + "epoch": 2.0358498032916637, + "grad_norm": 0.3505919779637324, + "learning_rate": 2.829430859732858e-05, + "loss": 2.6502, + "step": 43728 + }, + { + "epoch": 2.0358963614777568, + "grad_norm": 0.3586888016236327, + "learning_rate": 2.829186844211623e-05, + "loss": 2.6112, + "step": 43729 + }, + { + "epoch": 2.03594291966385, + "grad_norm": 0.33492493883151436, + "learning_rate": 2.828942835061511e-05, + "loss": 2.6858, + "step": 43730 + }, + { + "epoch": 2.035989477849943, + "grad_norm": 0.3169818209898712, + "learning_rate": 2.8286988322832404e-05, + "loss": 2.5839, + "step": 43731 + }, + { + "epoch": 2.036036036036036, + "grad_norm": 0.33555609552134413, + "learning_rate": 2.8284548358775202e-05, + "loss": 2.6433, + "step": 43732 + }, + { + 
"epoch": 2.0360825942221292, + "grad_norm": 0.33689011692879917, + "learning_rate": 2.8282108458450752e-05, + "loss": 2.7619, + "step": 43733 + }, + { + "epoch": 2.0361291524082223, + "grad_norm": 0.33725221395436916, + "learning_rate": 2.8279668621866157e-05, + "loss": 2.7367, + "step": 43734 + }, + { + "epoch": 2.0361757105943155, + "grad_norm": 0.33750259962846446, + "learning_rate": 2.827722884902859e-05, + "loss": 2.5805, + "step": 43735 + }, + { + "epoch": 2.036222268780408, + "grad_norm": 0.3450031411071067, + "learning_rate": 2.8274789139945224e-05, + "loss": 2.6374, + "step": 43736 + }, + { + "epoch": 2.0362688269665012, + "grad_norm": 0.3458901814120178, + "learning_rate": 2.827234949462323e-05, + "loss": 2.5755, + "step": 43737 + }, + { + "epoch": 2.0363153851525944, + "grad_norm": 0.3313968356047966, + "learning_rate": 2.8269909913069702e-05, + "loss": 2.5168, + "step": 43738 + }, + { + "epoch": 2.0363619433386875, + "grad_norm": 0.3225393331810125, + "learning_rate": 2.8267470395291894e-05, + "loss": 2.7362, + "step": 43739 + }, + { + "epoch": 2.0364085015247806, + "grad_norm": 0.33166787402303394, + "learning_rate": 2.82650309412969e-05, + "loss": 2.4952, + "step": 43740 + }, + { + "epoch": 2.0364550597108737, + "grad_norm": 0.34974116870533395, + "learning_rate": 2.8262591551091895e-05, + "loss": 2.6124, + "step": 43741 + }, + { + "epoch": 2.036501617896967, + "grad_norm": 0.33033070103533063, + "learning_rate": 2.8260152224684044e-05, + "loss": 2.7171, + "step": 43742 + }, + { + "epoch": 2.03654817608306, + "grad_norm": 0.37011569504239117, + "learning_rate": 2.8257712962080508e-05, + "loss": 2.7014, + "step": 43743 + }, + { + "epoch": 2.036594734269153, + "grad_norm": 0.3502742544118235, + "learning_rate": 2.825527376328846e-05, + "loss": 2.5811, + "step": 43744 + }, + { + "epoch": 2.036641292455246, + "grad_norm": 0.3265159691912563, + "learning_rate": 2.8252834628314994e-05, + "loss": 2.6261, + "step": 43745 + }, + { + "epoch": 2.036687850641339, + "grad_norm": 0.33624872353424373, + "learning_rate": 2.8250395557167362e-05, + "loss": 2.6062, + "step": 43746 + }, + { + "epoch": 2.036734408827432, + "grad_norm": 0.323508545597573, + "learning_rate": 2.8247956549852662e-05, + "loss": 2.5717, + "step": 43747 + }, + { + "epoch": 2.036780967013525, + "grad_norm": 0.3437318202606753, + "learning_rate": 2.824551760637806e-05, + "loss": 2.6704, + "step": 43748 + }, + { + "epoch": 2.036827525199618, + "grad_norm": 0.33740664063653253, + "learning_rate": 2.8243078726750715e-05, + "loss": 2.6819, + "step": 43749 + }, + { + "epoch": 2.0368740833857113, + "grad_norm": 0.31028813419451934, + "learning_rate": 2.82406399109778e-05, + "loss": 2.4992, + "step": 43750 + }, + { + "epoch": 2.0369206415718044, + "grad_norm": 0.35565436714775694, + "learning_rate": 2.823820115906646e-05, + "loss": 2.7542, + "step": 43751 + }, + { + "epoch": 2.0369671997578975, + "grad_norm": 0.3320492418893884, + "learning_rate": 2.8235762471023876e-05, + "loss": 2.5838, + "step": 43752 + }, + { + "epoch": 2.0370137579439906, + "grad_norm": 0.3234587330401612, + "learning_rate": 2.8233323846857167e-05, + "loss": 2.6548, + "step": 43753 + }, + { + "epoch": 2.0370603161300838, + "grad_norm": 0.3400521047011782, + "learning_rate": 2.8230885286573506e-05, + "loss": 2.7126, + "step": 43754 + }, + { + "epoch": 2.0371068743161764, + "grad_norm": 0.33982438613556987, + "learning_rate": 2.8228446790180053e-05, + "loss": 2.6194, + "step": 43755 + }, + { + "epoch": 2.0371534325022695, + "grad_norm": 0.3099857779797754, + 
"learning_rate": 2.8226008357683964e-05, + "loss": 2.679, + "step": 43756 + }, + { + "epoch": 2.0371999906883627, + "grad_norm": 0.34931270846246637, + "learning_rate": 2.8223569989092414e-05, + "loss": 2.656, + "step": 43757 + }, + { + "epoch": 2.0372465488744558, + "grad_norm": 0.35064936631901306, + "learning_rate": 2.8221131684412506e-05, + "loss": 2.6457, + "step": 43758 + }, + { + "epoch": 2.037293107060549, + "grad_norm": 0.3172441014824578, + "learning_rate": 2.8218693443651462e-05, + "loss": 2.6598, + "step": 43759 + }, + { + "epoch": 2.037339665246642, + "grad_norm": 0.308409389383308, + "learning_rate": 2.8216255266816395e-05, + "loss": 2.6766, + "step": 43760 + }, + { + "epoch": 2.037386223432735, + "grad_norm": 0.34293207326369984, + "learning_rate": 2.821381715391447e-05, + "loss": 2.7166, + "step": 43761 + }, + { + "epoch": 2.0374327816188282, + "grad_norm": 0.34181703174529854, + "learning_rate": 2.821137910495284e-05, + "loss": 2.6388, + "step": 43762 + }, + { + "epoch": 2.0374793398049214, + "grad_norm": 0.32304163457516005, + "learning_rate": 2.820894111993867e-05, + "loss": 2.7334, + "step": 43763 + }, + { + "epoch": 2.0375258979910145, + "grad_norm": 0.34353001855506177, + "learning_rate": 2.8206503198879108e-05, + "loss": 2.6459, + "step": 43764 + }, + { + "epoch": 2.037572456177107, + "grad_norm": 0.33218629691360657, + "learning_rate": 2.820406534178134e-05, + "loss": 2.6766, + "step": 43765 + }, + { + "epoch": 2.0376190143632003, + "grad_norm": 0.3332348966181837, + "learning_rate": 2.8201627548652464e-05, + "loss": 2.6483, + "step": 43766 + }, + { + "epoch": 2.0376655725492934, + "grad_norm": 0.34102042754007705, + "learning_rate": 2.8199189819499673e-05, + "loss": 2.6407, + "step": 43767 + }, + { + "epoch": 2.0377121307353865, + "grad_norm": 0.36203422688683984, + "learning_rate": 2.8196752154330107e-05, + "loss": 2.6235, + "step": 43768 + }, + { + "epoch": 2.0377586889214796, + "grad_norm": 0.35762218654631744, + "learning_rate": 2.8194314553150926e-05, + "loss": 2.6655, + "step": 43769 + }, + { + "epoch": 2.0378052471075727, + "grad_norm": 0.3339581649611087, + "learning_rate": 2.8191877015969303e-05, + "loss": 2.6277, + "step": 43770 + }, + { + "epoch": 2.037851805293666, + "grad_norm": 0.3583320613797213, + "learning_rate": 2.8189439542792328e-05, + "loss": 2.6378, + "step": 43771 + }, + { + "epoch": 2.037898363479759, + "grad_norm": 0.33808991559076473, + "learning_rate": 2.818700213362725e-05, + "loss": 2.6598, + "step": 43772 + }, + { + "epoch": 2.037944921665852, + "grad_norm": 0.34770445560915797, + "learning_rate": 2.8184564788481137e-05, + "loss": 2.706, + "step": 43773 + }, + { + "epoch": 2.037991479851945, + "grad_norm": 0.3473099428271008, + "learning_rate": 2.8182127507361188e-05, + "loss": 2.7113, + "step": 43774 + }, + { + "epoch": 2.038038038038038, + "grad_norm": 0.30830105532539503, + "learning_rate": 2.8179690290274545e-05, + "loss": 2.5122, + "step": 43775 + }, + { + "epoch": 2.038084596224131, + "grad_norm": 0.36590596696087774, + "learning_rate": 2.817725313722836e-05, + "loss": 2.6538, + "step": 43776 + }, + { + "epoch": 2.038131154410224, + "grad_norm": 0.33894666711882454, + "learning_rate": 2.8174816048229792e-05, + "loss": 2.6411, + "step": 43777 + }, + { + "epoch": 2.038177712596317, + "grad_norm": 0.34882850988401, + "learning_rate": 2.8172379023286e-05, + "loss": 2.6559, + "step": 43778 + }, + { + "epoch": 2.0382242707824103, + "grad_norm": 0.35059705134856134, + "learning_rate": 2.816994206240411e-05, + "loss": 2.6567, + "step": 
43779 + }, + { + "epoch": 2.0382708289685034, + "grad_norm": 0.3577197711422501, + "learning_rate": 2.8167505165591283e-05, + "loss": 2.7556, + "step": 43780 + }, + { + "epoch": 2.0383173871545965, + "grad_norm": 0.3247628477380366, + "learning_rate": 2.8165068332854683e-05, + "loss": 2.5988, + "step": 43781 + }, + { + "epoch": 2.0383639453406897, + "grad_norm": 0.36270359152693044, + "learning_rate": 2.816263156420146e-05, + "loss": 2.6616, + "step": 43782 + }, + { + "epoch": 2.0384105035267828, + "grad_norm": 0.3382636222636246, + "learning_rate": 2.816019485963878e-05, + "loss": 2.6151, + "step": 43783 + }, + { + "epoch": 2.038457061712876, + "grad_norm": 0.3389008533364028, + "learning_rate": 2.815775821917373e-05, + "loss": 2.5826, + "step": 43784 + }, + { + "epoch": 2.0385036198989686, + "grad_norm": 0.35022214895119735, + "learning_rate": 2.8155321642813546e-05, + "loss": 2.6161, + "step": 43785 + }, + { + "epoch": 2.0385501780850617, + "grad_norm": 0.3436602947925449, + "learning_rate": 2.815288513056531e-05, + "loss": 2.5396, + "step": 43786 + }, + { + "epoch": 2.038596736271155, + "grad_norm": 0.3458788080563238, + "learning_rate": 2.8150448682436247e-05, + "loss": 2.6659, + "step": 43787 + }, + { + "epoch": 2.038643294457248, + "grad_norm": 0.3137105908084093, + "learning_rate": 2.8148012298433436e-05, + "loss": 2.5817, + "step": 43788 + }, + { + "epoch": 2.038689852643341, + "grad_norm": 0.34077979013950843, + "learning_rate": 2.814557597856406e-05, + "loss": 2.5573, + "step": 43789 + }, + { + "epoch": 2.038736410829434, + "grad_norm": 0.35484622611071825, + "learning_rate": 2.8143139722835266e-05, + "loss": 2.7505, + "step": 43790 + }, + { + "epoch": 2.0387829690155272, + "grad_norm": 0.3526544964619558, + "learning_rate": 2.814070353125422e-05, + "loss": 2.7106, + "step": 43791 + }, + { + "epoch": 2.0388295272016204, + "grad_norm": 0.378500785461503, + "learning_rate": 2.8138267403828034e-05, + "loss": 2.6854, + "step": 43792 + }, + { + "epoch": 2.0388760853877135, + "grad_norm": 0.358880942220719, + "learning_rate": 2.8135831340563885e-05, + "loss": 2.6753, + "step": 43793 + }, + { + "epoch": 2.038922643573806, + "grad_norm": 0.34449439830824313, + "learning_rate": 2.8133395341468916e-05, + "loss": 2.6605, + "step": 43794 + }, + { + "epoch": 2.0389692017598993, + "grad_norm": 0.3754939801536797, + "learning_rate": 2.813095940655027e-05, + "loss": 2.6792, + "step": 43795 + }, + { + "epoch": 2.0390157599459924, + "grad_norm": 0.36147534351585764, + "learning_rate": 2.8128523535815132e-05, + "loss": 2.7729, + "step": 43796 + }, + { + "epoch": 2.0390623181320855, + "grad_norm": 0.35288676382910866, + "learning_rate": 2.8126087729270583e-05, + "loss": 2.6453, + "step": 43797 + }, + { + "epoch": 2.0391088763181786, + "grad_norm": 0.3310764998770669, + "learning_rate": 2.812365198692385e-05, + "loss": 2.5038, + "step": 43798 + }, + { + "epoch": 2.0391554345042717, + "grad_norm": 0.34396454109349833, + "learning_rate": 2.8121216308782007e-05, + "loss": 2.6672, + "step": 43799 + }, + { + "epoch": 2.039201992690365, + "grad_norm": 0.37514895601719395, + "learning_rate": 2.811878069485227e-05, + "loss": 2.6259, + "step": 43800 + }, + { + "epoch": 2.039248550876458, + "grad_norm": 0.36001973620765787, + "learning_rate": 2.8116345145141742e-05, + "loss": 2.603, + "step": 43801 + }, + { + "epoch": 2.039295109062551, + "grad_norm": 0.351035991564147, + "learning_rate": 2.8113909659657577e-05, + "loss": 2.6295, + "step": 43802 + }, + { + "epoch": 2.039341667248644, + "grad_norm": 
0.3797220745590698, + "learning_rate": 2.8111474238406943e-05, + "loss": 2.6931, + "step": 43803 + }, + { + "epoch": 2.039388225434737, + "grad_norm": 0.33596741690765003, + "learning_rate": 2.8109038881396964e-05, + "loss": 2.7016, + "step": 43804 + }, + { + "epoch": 2.03943478362083, + "grad_norm": 0.3320064103451693, + "learning_rate": 2.8106603588634828e-05, + "loss": 2.6541, + "step": 43805 + }, + { + "epoch": 2.039481341806923, + "grad_norm": 0.37530425350621516, + "learning_rate": 2.8104168360127635e-05, + "loss": 2.6923, + "step": 43806 + }, + { + "epoch": 2.039527899993016, + "grad_norm": 0.32833128655758015, + "learning_rate": 2.8101733195882545e-05, + "loss": 2.6205, + "step": 43807 + }, + { + "epoch": 2.0395744581791093, + "grad_norm": 0.31495138322464034, + "learning_rate": 2.8099298095906713e-05, + "loss": 2.5674, + "step": 43808 + }, + { + "epoch": 2.0396210163652024, + "grad_norm": 0.31247171971270227, + "learning_rate": 2.8096863060207303e-05, + "loss": 2.6763, + "step": 43809 + }, + { + "epoch": 2.0396675745512955, + "grad_norm": 0.3367857691237341, + "learning_rate": 2.80944280887914e-05, + "loss": 2.6646, + "step": 43810 + }, + { + "epoch": 2.0397141327373887, + "grad_norm": 0.35611543512636756, + "learning_rate": 2.8091993181666238e-05, + "loss": 2.6463, + "step": 43811 + }, + { + "epoch": 2.039760690923482, + "grad_norm": 0.31943666803829013, + "learning_rate": 2.8089558338838872e-05, + "loss": 2.5797, + "step": 43812 + }, + { + "epoch": 2.039807249109575, + "grad_norm": 0.3450511062546588, + "learning_rate": 2.8087123560316537e-05, + "loss": 2.6117, + "step": 43813 + }, + { + "epoch": 2.0398538072956676, + "grad_norm": 0.3311406193356904, + "learning_rate": 2.808468884610631e-05, + "loss": 2.6887, + "step": 43814 + }, + { + "epoch": 2.0399003654817607, + "grad_norm": 0.3534095190691826, + "learning_rate": 2.8082254196215373e-05, + "loss": 2.6385, + "step": 43815 + }, + { + "epoch": 2.039946923667854, + "grad_norm": 0.32335454291254195, + "learning_rate": 2.8079819610650853e-05, + "loss": 2.5115, + "step": 43816 + }, + { + "epoch": 2.039993481853947, + "grad_norm": 0.3479617800836068, + "learning_rate": 2.8077385089419905e-05, + "loss": 2.7263, + "step": 43817 + }, + { + "epoch": 2.04004004004004, + "grad_norm": 0.3101912541691191, + "learning_rate": 2.8074950632529688e-05, + "loss": 2.6811, + "step": 43818 + }, + { + "epoch": 2.040086598226133, + "grad_norm": 0.3497602078795397, + "learning_rate": 2.8072516239987317e-05, + "loss": 2.7244, + "step": 43819 + }, + { + "epoch": 2.0401331564122263, + "grad_norm": 0.3582519489805035, + "learning_rate": 2.807008191179995e-05, + "loss": 2.6024, + "step": 43820 + }, + { + "epoch": 2.0401797145983194, + "grad_norm": 0.3197074727045117, + "learning_rate": 2.8067647647974727e-05, + "loss": 2.5829, + "step": 43821 + }, + { + "epoch": 2.0402262727844125, + "grad_norm": 0.32300953985451253, + "learning_rate": 2.80652134485188e-05, + "loss": 2.6564, + "step": 43822 + }, + { + "epoch": 2.0402728309705056, + "grad_norm": 0.3418567039893052, + "learning_rate": 2.8062779313439315e-05, + "loss": 2.5373, + "step": 43823 + }, + { + "epoch": 2.0403193891565983, + "grad_norm": 0.3458116577451212, + "learning_rate": 2.8060345242743423e-05, + "loss": 2.5216, + "step": 43824 + }, + { + "epoch": 2.0403659473426914, + "grad_norm": 0.33826023172158487, + "learning_rate": 2.8057911236438227e-05, + "loss": 2.7004, + "step": 43825 + }, + { + "epoch": 2.0404125055287845, + "grad_norm": 0.337990196502383, + "learning_rate": 2.805547729453093e-05, + 
"loss": 2.5613, + "step": 43826 + }, + { + "epoch": 2.0404590637148776, + "grad_norm": 0.33893060695583055, + "learning_rate": 2.805304341702863e-05, + "loss": 2.665, + "step": 43827 + }, + { + "epoch": 2.0405056219009707, + "grad_norm": 0.35559347939952296, + "learning_rate": 2.8050609603938483e-05, + "loss": 2.5353, + "step": 43828 + }, + { + "epoch": 2.040552180087064, + "grad_norm": 0.34161873016701216, + "learning_rate": 2.8048175855267637e-05, + "loss": 2.7208, + "step": 43829 + }, + { + "epoch": 2.040598738273157, + "grad_norm": 0.33257720254658485, + "learning_rate": 2.8045742171023233e-05, + "loss": 2.5906, + "step": 43830 + }, + { + "epoch": 2.04064529645925, + "grad_norm": 0.338712656638507, + "learning_rate": 2.804330855121243e-05, + "loss": 2.6539, + "step": 43831 + }, + { + "epoch": 2.040691854645343, + "grad_norm": 0.33127684567752813, + "learning_rate": 2.8040874995842338e-05, + "loss": 2.6507, + "step": 43832 + }, + { + "epoch": 2.040738412831436, + "grad_norm": 0.34892627853331437, + "learning_rate": 2.803844150492011e-05, + "loss": 2.6519, + "step": 43833 + }, + { + "epoch": 2.040784971017529, + "grad_norm": 0.36648963341089535, + "learning_rate": 2.8036008078452892e-05, + "loss": 2.6793, + "step": 43834 + }, + { + "epoch": 2.040831529203622, + "grad_norm": 0.3484697941712544, + "learning_rate": 2.803357471644783e-05, + "loss": 2.7073, + "step": 43835 + }, + { + "epoch": 2.040878087389715, + "grad_norm": 0.35681883147447874, + "learning_rate": 2.8031141418912056e-05, + "loss": 2.6867, + "step": 43836 + }, + { + "epoch": 2.0409246455758083, + "grad_norm": 0.34754475027539417, + "learning_rate": 2.8028708185852747e-05, + "loss": 2.7091, + "step": 43837 + }, + { + "epoch": 2.0409712037619014, + "grad_norm": 0.3469482647577364, + "learning_rate": 2.802627501727697e-05, + "loss": 2.7622, + "step": 43838 + }, + { + "epoch": 2.0410177619479946, + "grad_norm": 0.3673319596681539, + "learning_rate": 2.802384191319195e-05, + "loss": 2.6159, + "step": 43839 + }, + { + "epoch": 2.0410643201340877, + "grad_norm": 0.3842835866482044, + "learning_rate": 2.8021408873604748e-05, + "loss": 2.6741, + "step": 43840 + }, + { + "epoch": 2.041110878320181, + "grad_norm": 0.35391894821918346, + "learning_rate": 2.8018975898522588e-05, + "loss": 2.5847, + "step": 43841 + }, + { + "epoch": 2.041157436506274, + "grad_norm": 0.36422242946150735, + "learning_rate": 2.8016542987952545e-05, + "loss": 2.6085, + "step": 43842 + }, + { + "epoch": 2.0412039946923666, + "grad_norm": 0.3490316690421224, + "learning_rate": 2.8014110141901784e-05, + "loss": 2.6128, + "step": 43843 + }, + { + "epoch": 2.0412505528784597, + "grad_norm": 0.339574140207157, + "learning_rate": 2.801167736037747e-05, + "loss": 2.6539, + "step": 43844 + }, + { + "epoch": 2.041297111064553, + "grad_norm": 0.3667991401547959, + "learning_rate": 2.800924464338669e-05, + "loss": 2.744, + "step": 43845 + }, + { + "epoch": 2.041343669250646, + "grad_norm": 0.3556326023688573, + "learning_rate": 2.800681199093661e-05, + "loss": 2.6076, + "step": 43846 + }, + { + "epoch": 2.041390227436739, + "grad_norm": 0.36190732464299663, + "learning_rate": 2.8004379403034375e-05, + "loss": 2.5896, + "step": 43847 + }, + { + "epoch": 2.041436785622832, + "grad_norm": 0.3341822300818975, + "learning_rate": 2.8001946879687117e-05, + "loss": 2.5996, + "step": 43848 + }, + { + "epoch": 2.0414833438089253, + "grad_norm": 0.32198319868462344, + "learning_rate": 2.7999514420901978e-05, + "loss": 2.6162, + "step": 43849 + }, + { + "epoch": 2.0415299019950184, + 
"grad_norm": 0.3622651149140733, + "learning_rate": 2.7997082026686116e-05, + "loss": 2.5397, + "step": 43850 + }, + { + "epoch": 2.0415764601811115, + "grad_norm": 0.33811040598217823, + "learning_rate": 2.7994649697046616e-05, + "loss": 2.6409, + "step": 43851 + }, + { + "epoch": 2.0416230183672046, + "grad_norm": 0.3359398324880183, + "learning_rate": 2.7992217431990685e-05, + "loss": 2.5662, + "step": 43852 + }, + { + "epoch": 2.0416695765532973, + "grad_norm": 0.34577316627951615, + "learning_rate": 2.798978523152539e-05, + "loss": 2.7333, + "step": 43853 + }, + { + "epoch": 2.0417161347393904, + "grad_norm": 0.3218950923358267, + "learning_rate": 2.7987353095657942e-05, + "loss": 2.7374, + "step": 43854 + }, + { + "epoch": 2.0417626929254835, + "grad_norm": 0.34302929505904467, + "learning_rate": 2.7984921024395428e-05, + "loss": 2.6075, + "step": 43855 + }, + { + "epoch": 2.0418092511115766, + "grad_norm": 0.34357094881326905, + "learning_rate": 2.7982489017745007e-05, + "loss": 2.584, + "step": 43856 + }, + { + "epoch": 2.0418558092976697, + "grad_norm": 0.33068878155553033, + "learning_rate": 2.798005707571382e-05, + "loss": 2.7054, + "step": 43857 + }, + { + "epoch": 2.041902367483763, + "grad_norm": 0.321309592513189, + "learning_rate": 2.7977625198308966e-05, + "loss": 2.6512, + "step": 43858 + }, + { + "epoch": 2.041948925669856, + "grad_norm": 0.3612899662765619, + "learning_rate": 2.797519338553765e-05, + "loss": 2.6523, + "step": 43859 + }, + { + "epoch": 2.041995483855949, + "grad_norm": 0.32347154870729955, + "learning_rate": 2.797276163740695e-05, + "loss": 2.6243, + "step": 43860 + }, + { + "epoch": 2.042042042042042, + "grad_norm": 0.3370879265634289, + "learning_rate": 2.7970329953924024e-05, + "loss": 2.716, + "step": 43861 + }, + { + "epoch": 2.0420886002281353, + "grad_norm": 0.34967991128425, + "learning_rate": 2.796789833509601e-05, + "loss": 2.6534, + "step": 43862 + }, + { + "epoch": 2.042135158414228, + "grad_norm": 0.36440379478348073, + "learning_rate": 2.796546678093005e-05, + "loss": 2.6898, + "step": 43863 + }, + { + "epoch": 2.042181716600321, + "grad_norm": 0.3465139034240422, + "learning_rate": 2.7963035291433272e-05, + "loss": 2.6219, + "step": 43864 + }, + { + "epoch": 2.042228274786414, + "grad_norm": 0.34239768964662987, + "learning_rate": 2.7960603866612828e-05, + "loss": 2.7327, + "step": 43865 + }, + { + "epoch": 2.0422748329725073, + "grad_norm": 0.3544513817813224, + "learning_rate": 2.7958172506475798e-05, + "loss": 2.6631, + "step": 43866 + }, + { + "epoch": 2.0423213911586005, + "grad_norm": 0.3259212063199742, + "learning_rate": 2.795574121102941e-05, + "loss": 2.5917, + "step": 43867 + }, + { + "epoch": 2.0423679493446936, + "grad_norm": 0.32359964981303335, + "learning_rate": 2.7953309980280727e-05, + "loss": 2.6173, + "step": 43868 + }, + { + "epoch": 2.0424145075307867, + "grad_norm": 0.3241979335004569, + "learning_rate": 2.7950878814236902e-05, + "loss": 2.6577, + "step": 43869 + }, + { + "epoch": 2.04246106571688, + "grad_norm": 0.35512700494366994, + "learning_rate": 2.7948447712905084e-05, + "loss": 2.7135, + "step": 43870 + }, + { + "epoch": 2.042507623902973, + "grad_norm": 0.3299637578402983, + "learning_rate": 2.7946016676292396e-05, + "loss": 2.5881, + "step": 43871 + }, + { + "epoch": 2.042554182089066, + "grad_norm": 0.3313843919593753, + "learning_rate": 2.7943585704405996e-05, + "loss": 2.6898, + "step": 43872 + }, + { + "epoch": 2.0426007402751587, + "grad_norm": 0.3547072339467719, + "learning_rate": 
2.7941154797252977e-05, + "loss": 2.7031, + "step": 43873 + }, + { + "epoch": 2.042647298461252, + "grad_norm": 0.3353892186518813, + "learning_rate": 2.7938723954840496e-05, + "loss": 2.636, + "step": 43874 + }, + { + "epoch": 2.042693856647345, + "grad_norm": 0.32741941942824343, + "learning_rate": 2.7936293177175687e-05, + "loss": 2.5971, + "step": 43875 + }, + { + "epoch": 2.042740414833438, + "grad_norm": 0.3547518178582476, + "learning_rate": 2.7933862464265687e-05, + "loss": 2.5676, + "step": 43876 + }, + { + "epoch": 2.042786973019531, + "grad_norm": 0.3494274115587004, + "learning_rate": 2.7931431816117627e-05, + "loss": 2.7155, + "step": 43877 + }, + { + "epoch": 2.0428335312056243, + "grad_norm": 0.34724798527724166, + "learning_rate": 2.7929001232738656e-05, + "loss": 2.5892, + "step": 43878 + }, + { + "epoch": 2.0428800893917174, + "grad_norm": 0.3355809970940743, + "learning_rate": 2.7926570714135852e-05, + "loss": 2.598, + "step": 43879 + }, + { + "epoch": 2.0429266475778105, + "grad_norm": 0.3432433498060676, + "learning_rate": 2.7924140260316435e-05, + "loss": 2.6897, + "step": 43880 + }, + { + "epoch": 2.0429732057639036, + "grad_norm": 0.33877525633299466, + "learning_rate": 2.792170987128746e-05, + "loss": 2.6363, + "step": 43881 + }, + { + "epoch": 2.0430197639499963, + "grad_norm": 0.30016040907364583, + "learning_rate": 2.7919279547056098e-05, + "loss": 2.6659, + "step": 43882 + }, + { + "epoch": 2.0430663221360894, + "grad_norm": 0.355420402917058, + "learning_rate": 2.7916849287629476e-05, + "loss": 2.6351, + "step": 43883 + }, + { + "epoch": 2.0431128803221825, + "grad_norm": 0.34994129444985406, + "learning_rate": 2.791441909301473e-05, + "loss": 2.5246, + "step": 43884 + }, + { + "epoch": 2.0431594385082756, + "grad_norm": 0.3469213564921691, + "learning_rate": 2.7911988963219003e-05, + "loss": 2.6675, + "step": 43885 + }, + { + "epoch": 2.0432059966943688, + "grad_norm": 0.3551670320655428, + "learning_rate": 2.7909558898249387e-05, + "loss": 2.6856, + "step": 43886 + }, + { + "epoch": 2.043252554880462, + "grad_norm": 0.3266890564526531, + "learning_rate": 2.7907128898113048e-05, + "loss": 2.6641, + "step": 43887 + }, + { + "epoch": 2.043299113066555, + "grad_norm": 0.35102423042194364, + "learning_rate": 2.790469896281711e-05, + "loss": 2.6519, + "step": 43888 + }, + { + "epoch": 2.043345671252648, + "grad_norm": 0.38275693652515935, + "learning_rate": 2.79022690923687e-05, + "loss": 2.6971, + "step": 43889 + }, + { + "epoch": 2.043392229438741, + "grad_norm": 0.3569546191799255, + "learning_rate": 2.7899839286774954e-05, + "loss": 2.6424, + "step": 43890 + }, + { + "epoch": 2.0434387876248343, + "grad_norm": 0.3278847516209217, + "learning_rate": 2.7897409546043018e-05, + "loss": 2.6083, + "step": 43891 + }, + { + "epoch": 2.043485345810927, + "grad_norm": 0.3606027427957356, + "learning_rate": 2.789497987017997e-05, + "loss": 2.6132, + "step": 43892 + }, + { + "epoch": 2.04353190399702, + "grad_norm": 0.3383317694975863, + "learning_rate": 2.7892550259193025e-05, + "loss": 2.7103, + "step": 43893 + }, + { + "epoch": 2.0435784621831132, + "grad_norm": 0.33140731179671745, + "learning_rate": 2.789012071308924e-05, + "loss": 2.7101, + "step": 43894 + }, + { + "epoch": 2.0436250203692063, + "grad_norm": 0.3359589476226788, + "learning_rate": 2.788769123187578e-05, + "loss": 2.6523, + "step": 43895 + }, + { + "epoch": 2.0436715785552995, + "grad_norm": 0.35309424309809934, + "learning_rate": 2.788526181555976e-05, + "loss": 2.6747, + "step": 43896 + }, + { + 
"epoch": 2.0437181367413926, + "grad_norm": 0.33771496207051926, + "learning_rate": 2.7882832464148316e-05, + "loss": 2.6907, + "step": 43897 + }, + { + "epoch": 2.0437646949274857, + "grad_norm": 0.3375015401570586, + "learning_rate": 2.7880403177648606e-05, + "loss": 2.6049, + "step": 43898 + }, + { + "epoch": 2.043811253113579, + "grad_norm": 0.34984958642641967, + "learning_rate": 2.7877973956067693e-05, + "loss": 2.6537, + "step": 43899 + }, + { + "epoch": 2.043857811299672, + "grad_norm": 0.34909236057507564, + "learning_rate": 2.7875544799412783e-05, + "loss": 2.6576, + "step": 43900 + }, + { + "epoch": 2.043904369485765, + "grad_norm": 0.3247145393413665, + "learning_rate": 2.7873115707690956e-05, + "loss": 2.6816, + "step": 43901 + }, + { + "epoch": 2.0439509276718577, + "grad_norm": 0.37665345270135947, + "learning_rate": 2.7870686680909348e-05, + "loss": 2.6685, + "step": 43902 + }, + { + "epoch": 2.043997485857951, + "grad_norm": 0.3536223331323058, + "learning_rate": 2.7868257719075096e-05, + "loss": 2.596, + "step": 43903 + }, + { + "epoch": 2.044044044044044, + "grad_norm": 0.3399012972923138, + "learning_rate": 2.786582882219535e-05, + "loss": 2.7059, + "step": 43904 + }, + { + "epoch": 2.044090602230137, + "grad_norm": 0.3694502044832401, + "learning_rate": 2.786339999027717e-05, + "loss": 2.6649, + "step": 43905 + }, + { + "epoch": 2.04413716041623, + "grad_norm": 0.34237641904360894, + "learning_rate": 2.7860971223327776e-05, + "loss": 2.505, + "step": 43906 + }, + { + "epoch": 2.0441837186023233, + "grad_norm": 0.3614902308532262, + "learning_rate": 2.7858542521354226e-05, + "loss": 2.6902, + "step": 43907 + }, + { + "epoch": 2.0442302767884164, + "grad_norm": 0.3709382400425144, + "learning_rate": 2.7856113884363676e-05, + "loss": 2.685, + "step": 43908 + }, + { + "epoch": 2.0442768349745095, + "grad_norm": 0.3550982807618514, + "learning_rate": 2.7853685312363253e-05, + "loss": 2.5828, + "step": 43909 + }, + { + "epoch": 2.0443233931606026, + "grad_norm": 0.34965920271354867, + "learning_rate": 2.7851256805360077e-05, + "loss": 2.6072, + "step": 43910 + }, + { + "epoch": 2.0443699513466957, + "grad_norm": 0.34383109962462394, + "learning_rate": 2.78488283633613e-05, + "loss": 2.5675, + "step": 43911 + }, + { + "epoch": 2.0444165095327884, + "grad_norm": 0.35119272347759, + "learning_rate": 2.7846399986373996e-05, + "loss": 2.5927, + "step": 43912 + }, + { + "epoch": 2.0444630677188815, + "grad_norm": 0.3768647993154746, + "learning_rate": 2.7843971674405356e-05, + "loss": 2.7728, + "step": 43913 + }, + { + "epoch": 2.0445096259049746, + "grad_norm": 0.3277368680223933, + "learning_rate": 2.7841543427462457e-05, + "loss": 2.6286, + "step": 43914 + }, + { + "epoch": 2.0445561840910678, + "grad_norm": 0.36455679551091785, + "learning_rate": 2.783911524555245e-05, + "loss": 2.728, + "step": 43915 + }, + { + "epoch": 2.044602742277161, + "grad_norm": 0.34143467611012107, + "learning_rate": 2.783668712868246e-05, + "loss": 2.6484, + "step": 43916 + }, + { + "epoch": 2.044649300463254, + "grad_norm": 0.3409902574692739, + "learning_rate": 2.7834259076859604e-05, + "loss": 2.6314, + "step": 43917 + }, + { + "epoch": 2.044695858649347, + "grad_norm": 0.38088586830556115, + "learning_rate": 2.7831831090091015e-05, + "loss": 2.6693, + "step": 43918 + }, + { + "epoch": 2.0447424168354402, + "grad_norm": 0.3467891622026647, + "learning_rate": 2.7829403168383838e-05, + "loss": 2.6573, + "step": 43919 + }, + { + "epoch": 2.0447889750215333, + "grad_norm": 0.3599011108737539, + 
"learning_rate": 2.7826975311745158e-05, + "loss": 2.7201, + "step": 43920 + }, + { + "epoch": 2.0448355332076265, + "grad_norm": 0.3612480537411968, + "learning_rate": 2.782454752018212e-05, + "loss": 2.6953, + "step": 43921 + }, + { + "epoch": 2.044882091393719, + "grad_norm": 0.3829022967656032, + "learning_rate": 2.782211979370185e-05, + "loss": 2.6819, + "step": 43922 + }, + { + "epoch": 2.0449286495798122, + "grad_norm": 0.35022694116661995, + "learning_rate": 2.781969213231148e-05, + "loss": 2.6288, + "step": 43923 + }, + { + "epoch": 2.0449752077659054, + "grad_norm": 0.3354320853724431, + "learning_rate": 2.781726453601814e-05, + "loss": 2.6616, + "step": 43924 + }, + { + "epoch": 2.0450217659519985, + "grad_norm": 0.3518420266592868, + "learning_rate": 2.781483700482891e-05, + "loss": 2.6634, + "step": 43925 + }, + { + "epoch": 2.0450683241380916, + "grad_norm": 0.36846863938802615, + "learning_rate": 2.7812409538750982e-05, + "loss": 2.721, + "step": 43926 + }, + { + "epoch": 2.0451148823241847, + "grad_norm": 0.32130849510729376, + "learning_rate": 2.7809982137791434e-05, + "loss": 2.592, + "step": 43927 + }, + { + "epoch": 2.045161440510278, + "grad_norm": 0.3395434160661141, + "learning_rate": 2.7807554801957393e-05, + "loss": 2.6306, + "step": 43928 + }, + { + "epoch": 2.045207998696371, + "grad_norm": 0.34475341252201974, + "learning_rate": 2.7805127531256005e-05, + "loss": 2.5995, + "step": 43929 + }, + { + "epoch": 2.045254556882464, + "grad_norm": 0.34323307436570033, + "learning_rate": 2.7802700325694364e-05, + "loss": 2.4821, + "step": 43930 + }, + { + "epoch": 2.0453011150685567, + "grad_norm": 0.3360808987543574, + "learning_rate": 2.7800273185279623e-05, + "loss": 2.7196, + "step": 43931 + }, + { + "epoch": 2.04534767325465, + "grad_norm": 0.3155957854851907, + "learning_rate": 2.779784611001891e-05, + "loss": 2.5568, + "step": 43932 + }, + { + "epoch": 2.045394231440743, + "grad_norm": 0.36469222500185816, + "learning_rate": 2.7795419099919313e-05, + "loss": 2.7409, + "step": 43933 + }, + { + "epoch": 2.045440789626836, + "grad_norm": 0.34175100555678306, + "learning_rate": 2.7792992154987972e-05, + "loss": 2.6235, + "step": 43934 + }, + { + "epoch": 2.045487347812929, + "grad_norm": 0.3523618117591021, + "learning_rate": 2.7790565275232005e-05, + "loss": 2.7803, + "step": 43935 + }, + { + "epoch": 2.0455339059990223, + "grad_norm": 0.36085127857690147, + "learning_rate": 2.7788138460658553e-05, + "loss": 2.6738, + "step": 43936 + }, + { + "epoch": 2.0455804641851154, + "grad_norm": 0.32517921275179107, + "learning_rate": 2.7785711711274736e-05, + "loss": 2.5846, + "step": 43937 + }, + { + "epoch": 2.0456270223712085, + "grad_norm": 0.3481447212021125, + "learning_rate": 2.7783285027087637e-05, + "loss": 2.7263, + "step": 43938 + }, + { + "epoch": 2.0456735805573016, + "grad_norm": 0.35028015197068496, + "learning_rate": 2.7780858408104437e-05, + "loss": 2.7266, + "step": 43939 + }, + { + "epoch": 2.0457201387433948, + "grad_norm": 0.3367943983333198, + "learning_rate": 2.777843185433221e-05, + "loss": 2.7443, + "step": 43940 + }, + { + "epoch": 2.0457666969294874, + "grad_norm": 0.3196213824375661, + "learning_rate": 2.77760053657781e-05, + "loss": 2.561, + "step": 43941 + }, + { + "epoch": 2.0458132551155805, + "grad_norm": 0.3073355058788343, + "learning_rate": 2.777357894244923e-05, + "loss": 2.6361, + "step": 43942 + }, + { + "epoch": 2.0458598133016737, + "grad_norm": 0.3571446679801547, + "learning_rate": 2.777115258435271e-05, + "loss": 2.6981, + "step": 
43943 + }, + { + "epoch": 2.0459063714877668, + "grad_norm": 0.3148952268612475, + "learning_rate": 2.7768726291495667e-05, + "loss": 2.6263, + "step": 43944 + }, + { + "epoch": 2.04595292967386, + "grad_norm": 0.3085968317030877, + "learning_rate": 2.7766300063885243e-05, + "loss": 2.5735, + "step": 43945 + }, + { + "epoch": 2.045999487859953, + "grad_norm": 0.33766575764425066, + "learning_rate": 2.7763873901528516e-05, + "loss": 2.6119, + "step": 43946 + }, + { + "epoch": 2.046046046046046, + "grad_norm": 0.3568421185118413, + "learning_rate": 2.7761447804432628e-05, + "loss": 2.7073, + "step": 43947 + }, + { + "epoch": 2.0460926042321392, + "grad_norm": 0.2985657658536569, + "learning_rate": 2.7759021772604705e-05, + "loss": 2.6169, + "step": 43948 + }, + { + "epoch": 2.0461391624182323, + "grad_norm": 0.32475838998823436, + "learning_rate": 2.775659580605186e-05, + "loss": 2.6546, + "step": 43949 + }, + { + "epoch": 2.0461857206043255, + "grad_norm": 0.3286264177416291, + "learning_rate": 2.7754169904781236e-05, + "loss": 2.6104, + "step": 43950 + }, + { + "epoch": 2.046232278790418, + "grad_norm": 0.34744990783801094, + "learning_rate": 2.775174406879989e-05, + "loss": 2.7256, + "step": 43951 + }, + { + "epoch": 2.0462788369765113, + "grad_norm": 0.34089962869653095, + "learning_rate": 2.7749318298115023e-05, + "loss": 2.5975, + "step": 43952 + }, + { + "epoch": 2.0463253951626044, + "grad_norm": 0.3435359189712237, + "learning_rate": 2.7746892592733675e-05, + "loss": 2.7443, + "step": 43953 + }, + { + "epoch": 2.0463719533486975, + "grad_norm": 0.33186398855428556, + "learning_rate": 2.774446695266304e-05, + "loss": 2.531, + "step": 43954 + }, + { + "epoch": 2.0464185115347906, + "grad_norm": 0.33719126151102463, + "learning_rate": 2.774204137791019e-05, + "loss": 2.6176, + "step": 43955 + }, + { + "epoch": 2.0464650697208837, + "grad_norm": 0.3239495792998026, + "learning_rate": 2.773961586848225e-05, + "loss": 2.6853, + "step": 43956 + }, + { + "epoch": 2.046511627906977, + "grad_norm": 0.34673829197407885, + "learning_rate": 2.773719042438635e-05, + "loss": 2.7266, + "step": 43957 + }, + { + "epoch": 2.04655818609307, + "grad_norm": 0.33058844375344487, + "learning_rate": 2.773476504562962e-05, + "loss": 2.7154, + "step": 43958 + }, + { + "epoch": 2.046604744279163, + "grad_norm": 0.3648481289643309, + "learning_rate": 2.7732339732219138e-05, + "loss": 2.6334, + "step": 43959 + }, + { + "epoch": 2.046651302465256, + "grad_norm": 0.3370343887174261, + "learning_rate": 2.772991448416205e-05, + "loss": 2.604, + "step": 43960 + }, + { + "epoch": 2.046697860651349, + "grad_norm": 0.35611025425932236, + "learning_rate": 2.7727489301465472e-05, + "loss": 2.6603, + "step": 43961 + }, + { + "epoch": 2.046744418837442, + "grad_norm": 0.3334984617702228, + "learning_rate": 2.7725064184136507e-05, + "loss": 2.5849, + "step": 43962 + }, + { + "epoch": 2.046790977023535, + "grad_norm": 0.32312196934787385, + "learning_rate": 2.7722639132182315e-05, + "loss": 2.7099, + "step": 43963 + }, + { + "epoch": 2.046837535209628, + "grad_norm": 0.35605948503676726, + "learning_rate": 2.7720214145609935e-05, + "loss": 2.5848, + "step": 43964 + }, + { + "epoch": 2.0468840933957213, + "grad_norm": 0.3289684792243371, + "learning_rate": 2.771778922442658e-05, + "loss": 2.5179, + "step": 43965 + }, + { + "epoch": 2.0469306515818144, + "grad_norm": 0.3447366540346809, + "learning_rate": 2.7715364368639273e-05, + "loss": 2.671, + "step": 43966 + }, + { + "epoch": 2.0469772097679075, + "grad_norm": 
0.3505519336773647, + "learning_rate": 2.7712939578255214e-05, + "loss": 2.5921, + "step": 43967 + }, + { + "epoch": 2.0470237679540007, + "grad_norm": 0.36151808498433946, + "learning_rate": 2.7710514853281465e-05, + "loss": 2.6478, + "step": 43968 + }, + { + "epoch": 2.0470703261400938, + "grad_norm": 0.36516826474533204, + "learning_rate": 2.7708090193725155e-05, + "loss": 2.7187, + "step": 43969 + }, + { + "epoch": 2.0471168843261864, + "grad_norm": 0.3039672878452432, + "learning_rate": 2.7705665599593413e-05, + "loss": 2.6479, + "step": 43970 + }, + { + "epoch": 2.0471634425122796, + "grad_norm": 0.36150581141949545, + "learning_rate": 2.770324107089334e-05, + "loss": 2.5914, + "step": 43971 + }, + { + "epoch": 2.0472100006983727, + "grad_norm": 0.348209387655881, + "learning_rate": 2.770081660763208e-05, + "loss": 2.6878, + "step": 43972 + }, + { + "epoch": 2.047256558884466, + "grad_norm": 0.3111457150966719, + "learning_rate": 2.7698392209816702e-05, + "loss": 2.6365, + "step": 43973 + }, + { + "epoch": 2.047303117070559, + "grad_norm": 0.3564099766064138, + "learning_rate": 2.7695967877454353e-05, + "loss": 2.5147, + "step": 43974 + }, + { + "epoch": 2.047349675256652, + "grad_norm": 0.33771446572463815, + "learning_rate": 2.7693543610552132e-05, + "loss": 2.6383, + "step": 43975 + }, + { + "epoch": 2.047396233442745, + "grad_norm": 0.3311903288867899, + "learning_rate": 2.7691119409117193e-05, + "loss": 2.5315, + "step": 43976 + }, + { + "epoch": 2.0474427916288382, + "grad_norm": 0.3727792369792105, + "learning_rate": 2.7688695273156573e-05, + "loss": 2.6986, + "step": 43977 + }, + { + "epoch": 2.0474893498149314, + "grad_norm": 0.34320431647371485, + "learning_rate": 2.7686271202677478e-05, + "loss": 2.6281, + "step": 43978 + }, + { + "epoch": 2.0475359080010245, + "grad_norm": 0.3353715883491697, + "learning_rate": 2.7683847197686935e-05, + "loss": 2.7566, + "step": 43979 + }, + { + "epoch": 2.047582466187117, + "grad_norm": 0.3361732369983426, + "learning_rate": 2.7681423258192148e-05, + "loss": 2.5844, + "step": 43980 + }, + { + "epoch": 2.0476290243732103, + "grad_norm": 0.3450720575035919, + "learning_rate": 2.767899938420016e-05, + "loss": 2.6535, + "step": 43981 + }, + { + "epoch": 2.0476755825593034, + "grad_norm": 0.32517019423296256, + "learning_rate": 2.767657557571811e-05, + "loss": 2.697, + "step": 43982 + }, + { + "epoch": 2.0477221407453965, + "grad_norm": 0.3571348648038557, + "learning_rate": 2.7674151832753113e-05, + "loss": 2.5838, + "step": 43983 + }, + { + "epoch": 2.0477686989314896, + "grad_norm": 0.3489620197810268, + "learning_rate": 2.767172815531228e-05, + "loss": 2.6891, + "step": 43984 + }, + { + "epoch": 2.0478152571175827, + "grad_norm": 0.35098677569460685, + "learning_rate": 2.7669304543402746e-05, + "loss": 2.7156, + "step": 43985 + }, + { + "epoch": 2.047861815303676, + "grad_norm": 0.3421079150318588, + "learning_rate": 2.7666880997031586e-05, + "loss": 2.5727, + "step": 43986 + }, + { + "epoch": 2.047908373489769, + "grad_norm": 0.35365997668246546, + "learning_rate": 2.766445751620593e-05, + "loss": 2.5188, + "step": 43987 + }, + { + "epoch": 2.047954931675862, + "grad_norm": 0.3558273559573734, + "learning_rate": 2.7662034100932887e-05, + "loss": 2.657, + "step": 43988 + }, + { + "epoch": 2.048001489861955, + "grad_norm": 0.34134689213892583, + "learning_rate": 2.765961075121957e-05, + "loss": 2.7413, + "step": 43989 + }, + { + "epoch": 2.048048048048048, + "grad_norm": 0.35880951237884523, + "learning_rate": 2.7657187467073098e-05, + 
"loss": 2.5534, + "step": 43990 + }, + { + "epoch": 2.048094606234141, + "grad_norm": 0.3879961107527643, + "learning_rate": 2.7654764248500605e-05, + "loss": 2.6841, + "step": 43991 + }, + { + "epoch": 2.048141164420234, + "grad_norm": 0.3282160393765828, + "learning_rate": 2.7652341095509126e-05, + "loss": 2.5988, + "step": 43992 + }, + { + "epoch": 2.048187722606327, + "grad_norm": 0.35063393564410633, + "learning_rate": 2.764991800810588e-05, + "loss": 2.6954, + "step": 43993 + }, + { + "epoch": 2.0482342807924203, + "grad_norm": 0.38825243278262545, + "learning_rate": 2.764749498629789e-05, + "loss": 2.5941, + "step": 43994 + }, + { + "epoch": 2.0482808389785134, + "grad_norm": 0.32923510246592186, + "learning_rate": 2.76450720300923e-05, + "loss": 2.5925, + "step": 43995 + }, + { + "epoch": 2.0483273971646065, + "grad_norm": 0.3421540131201332, + "learning_rate": 2.764264913949623e-05, + "loss": 2.6965, + "step": 43996 + }, + { + "epoch": 2.0483739553506997, + "grad_norm": 0.332669443212921, + "learning_rate": 2.7640226314516782e-05, + "loss": 2.7217, + "step": 43997 + }, + { + "epoch": 2.0484205135367928, + "grad_norm": 0.331980632908249, + "learning_rate": 2.763780355516108e-05, + "loss": 2.6937, + "step": 43998 + }, + { + "epoch": 2.048467071722886, + "grad_norm": 0.3231935753113724, + "learning_rate": 2.76353808614362e-05, + "loss": 2.6679, + "step": 43999 + }, + { + "epoch": 2.0485136299089786, + "grad_norm": 0.33804590603364426, + "learning_rate": 2.7632958233349282e-05, + "loss": 2.5349, + "step": 44000 + }, + { + "epoch": 2.0485601880950717, + "grad_norm": 0.3293964784036065, + "learning_rate": 2.763053567090742e-05, + "loss": 2.5938, + "step": 44001 + }, + { + "epoch": 2.048606746281165, + "grad_norm": 0.31098811358909917, + "learning_rate": 2.7628113174117742e-05, + "loss": 2.5662, + "step": 44002 + }, + { + "epoch": 2.048653304467258, + "grad_norm": 0.344876650594812, + "learning_rate": 2.762569074298734e-05, + "loss": 2.5795, + "step": 44003 + }, + { + "epoch": 2.048699862653351, + "grad_norm": 0.33956955486455415, + "learning_rate": 2.7623268377523353e-05, + "loss": 2.6502, + "step": 44004 + }, + { + "epoch": 2.048746420839444, + "grad_norm": 0.32399428575688566, + "learning_rate": 2.762084607773283e-05, + "loss": 2.5948, + "step": 44005 + }, + { + "epoch": 2.0487929790255373, + "grad_norm": 0.32903264103371005, + "learning_rate": 2.7618423843622955e-05, + "loss": 2.7192, + "step": 44006 + }, + { + "epoch": 2.0488395372116304, + "grad_norm": 0.33510702078997334, + "learning_rate": 2.7616001675200788e-05, + "loss": 2.6117, + "step": 44007 + }, + { + "epoch": 2.0488860953977235, + "grad_norm": 0.33919250429486775, + "learning_rate": 2.7613579572473447e-05, + "loss": 2.7342, + "step": 44008 + }, + { + "epoch": 2.048932653583816, + "grad_norm": 0.34264225937205284, + "learning_rate": 2.761115753544804e-05, + "loss": 2.675, + "step": 44009 + }, + { + "epoch": 2.0489792117699093, + "grad_norm": 0.3429770963242578, + "learning_rate": 2.760873556413169e-05, + "loss": 2.5649, + "step": 44010 + }, + { + "epoch": 2.0490257699560024, + "grad_norm": 0.32883243765830467, + "learning_rate": 2.7606313658531507e-05, + "loss": 2.6167, + "step": 44011 + }, + { + "epoch": 2.0490723281420955, + "grad_norm": 0.32798620075621876, + "learning_rate": 2.760389181865457e-05, + "loss": 2.6016, + "step": 44012 + }, + { + "epoch": 2.0491188863281886, + "grad_norm": 0.3473012775339493, + "learning_rate": 2.7601470044508005e-05, + "loss": 2.63, + "step": 44013 + }, + { + "epoch": 2.0491654445142817, + 
"grad_norm": 0.3548117764130289, + "learning_rate": 2.7599048336098922e-05, + "loss": 2.6423, + "step": 44014 + }, + { + "epoch": 2.049212002700375, + "grad_norm": 0.32605859239475016, + "learning_rate": 2.759662669343442e-05, + "loss": 2.671, + "step": 44015 + }, + { + "epoch": 2.049258560886468, + "grad_norm": 0.3261758137993597, + "learning_rate": 2.759420511652161e-05, + "loss": 2.647, + "step": 44016 + }, + { + "epoch": 2.049305119072561, + "grad_norm": 0.35701716072320155, + "learning_rate": 2.7591783605367626e-05, + "loss": 2.6475, + "step": 44017 + }, + { + "epoch": 2.049351677258654, + "grad_norm": 0.32877319679467015, + "learning_rate": 2.7589362159979502e-05, + "loss": 2.6173, + "step": 44018 + }, + { + "epoch": 2.049398235444747, + "grad_norm": 0.3183641841591019, + "learning_rate": 2.7586940780364445e-05, + "loss": 2.7184, + "step": 44019 + }, + { + "epoch": 2.04944479363084, + "grad_norm": 0.33556388952323085, + "learning_rate": 2.758451946652948e-05, + "loss": 2.6698, + "step": 44020 + }, + { + "epoch": 2.049491351816933, + "grad_norm": 0.3306303163171991, + "learning_rate": 2.7582098218481743e-05, + "loss": 2.6768, + "step": 44021 + }, + { + "epoch": 2.049537910003026, + "grad_norm": 0.3305888347596164, + "learning_rate": 2.7579677036228334e-05, + "loss": 2.7165, + "step": 44022 + }, + { + "epoch": 2.0495844681891193, + "grad_norm": 0.3241041253744303, + "learning_rate": 2.7577255919776375e-05, + "loss": 2.6204, + "step": 44023 + }, + { + "epoch": 2.0496310263752124, + "grad_norm": 0.3274261968294505, + "learning_rate": 2.7574834869132977e-05, + "loss": 2.6908, + "step": 44024 + }, + { + "epoch": 2.0496775845613056, + "grad_norm": 0.32244185905614425, + "learning_rate": 2.7572413884305188e-05, + "loss": 2.7275, + "step": 44025 + }, + { + "epoch": 2.0497241427473987, + "grad_norm": 0.3301977248760319, + "learning_rate": 2.7569992965300197e-05, + "loss": 2.6871, + "step": 44026 + }, + { + "epoch": 2.049770700933492, + "grad_norm": 0.33831379786579746, + "learning_rate": 2.756757211212504e-05, + "loss": 2.7155, + "step": 44027 + }, + { + "epoch": 2.049817259119585, + "grad_norm": 0.3355028199437636, + "learning_rate": 2.7565151324786853e-05, + "loss": 2.5875, + "step": 44028 + }, + { + "epoch": 2.0498638173056776, + "grad_norm": 0.3390773336892797, + "learning_rate": 2.756273060329274e-05, + "loss": 2.682, + "step": 44029 + }, + { + "epoch": 2.0499103754917707, + "grad_norm": 0.354277102119923, + "learning_rate": 2.756030994764982e-05, + "loss": 2.6585, + "step": 44030 + }, + { + "epoch": 2.049956933677864, + "grad_norm": 0.35321221831296207, + "learning_rate": 2.7557889357865134e-05, + "loss": 2.6971, + "step": 44031 + }, + { + "epoch": 2.050003491863957, + "grad_norm": 0.33098853293415137, + "learning_rate": 2.755546883394588e-05, + "loss": 2.7142, + "step": 44032 + }, + { + "epoch": 2.05005005005005, + "grad_norm": 0.3855341891066119, + "learning_rate": 2.7553048375899093e-05, + "loss": 2.6479, + "step": 44033 + }, + { + "epoch": 2.050096608236143, + "grad_norm": 0.3872235650897969, + "learning_rate": 2.7550627983731892e-05, + "loss": 2.7652, + "step": 44034 + }, + { + "epoch": 2.0501431664222363, + "grad_norm": 0.35088163077473516, + "learning_rate": 2.7548207657451386e-05, + "loss": 2.6593, + "step": 44035 + }, + { + "epoch": 2.0501897246083294, + "grad_norm": 0.36281681797824855, + "learning_rate": 2.7545787397064686e-05, + "loss": 2.6147, + "step": 44036 + }, + { + "epoch": 2.0502362827944225, + "grad_norm": 0.3508078131132645, + "learning_rate": 2.7543367202578908e-05, 
+ "loss": 2.7347, + "step": 44037 + }, + { + "epoch": 2.0502828409805156, + "grad_norm": 0.3463608075637819, + "learning_rate": 2.754094707400109e-05, + "loss": 2.7033, + "step": 44038 + }, + { + "epoch": 2.0503293991666083, + "grad_norm": 0.353995644116195, + "learning_rate": 2.7538527011338415e-05, + "loss": 2.6762, + "step": 44039 + }, + { + "epoch": 2.0503759573527014, + "grad_norm": 0.3404283968743792, + "learning_rate": 2.753610701459794e-05, + "loss": 2.5774, + "step": 44040 + }, + { + "epoch": 2.0504225155387945, + "grad_norm": 0.33786679798618885, + "learning_rate": 2.7533687083786776e-05, + "loss": 2.5698, + "step": 44041 + }, + { + "epoch": 2.0504690737248876, + "grad_norm": 0.3502818262389777, + "learning_rate": 2.753126721891202e-05, + "loss": 2.5322, + "step": 44042 + }, + { + "epoch": 2.0505156319109807, + "grad_norm": 0.32852785254999245, + "learning_rate": 2.7528847419980787e-05, + "loss": 2.7269, + "step": 44043 + }, + { + "epoch": 2.050562190097074, + "grad_norm": 0.3426418107131664, + "learning_rate": 2.752642768700017e-05, + "loss": 2.6544, + "step": 44044 + }, + { + "epoch": 2.050608748283167, + "grad_norm": 0.3476104196643831, + "learning_rate": 2.7524008019977297e-05, + "loss": 2.6573, + "step": 44045 + }, + { + "epoch": 2.05065530646926, + "grad_norm": 0.36965916818795125, + "learning_rate": 2.752158841891923e-05, + "loss": 2.5356, + "step": 44046 + }, + { + "epoch": 2.050701864655353, + "grad_norm": 0.33547328382526476, + "learning_rate": 2.751916888383308e-05, + "loss": 2.5857, + "step": 44047 + }, + { + "epoch": 2.050748422841446, + "grad_norm": 0.3446993154630996, + "learning_rate": 2.7516749414725957e-05, + "loss": 2.5809, + "step": 44048 + }, + { + "epoch": 2.050794981027539, + "grad_norm": 0.35136817721089186, + "learning_rate": 2.7514330011604962e-05, + "loss": 2.637, + "step": 44049 + }, + { + "epoch": 2.050841539213632, + "grad_norm": 0.33242818340137104, + "learning_rate": 2.7511910674477205e-05, + "loss": 2.752, + "step": 44050 + }, + { + "epoch": 2.050888097399725, + "grad_norm": 0.3491882998332225, + "learning_rate": 2.750949140334974e-05, + "loss": 2.7058, + "step": 44051 + }, + { + "epoch": 2.0509346555858183, + "grad_norm": 0.3907311989441359, + "learning_rate": 2.7507072198229745e-05, + "loss": 2.7609, + "step": 44052 + }, + { + "epoch": 2.0509812137719114, + "grad_norm": 0.33200744885177763, + "learning_rate": 2.750465305912425e-05, + "loss": 2.7285, + "step": 44053 + }, + { + "epoch": 2.0510277719580046, + "grad_norm": 0.3623310460870707, + "learning_rate": 2.7502233986040388e-05, + "loss": 2.6239, + "step": 44054 + }, + { + "epoch": 2.0510743301440977, + "grad_norm": 0.3454590841354611, + "learning_rate": 2.749981497898525e-05, + "loss": 2.7148, + "step": 44055 + }, + { + "epoch": 2.051120888330191, + "grad_norm": 0.3393940407495136, + "learning_rate": 2.749739603796594e-05, + "loss": 2.6426, + "step": 44056 + }, + { + "epoch": 2.051167446516284, + "grad_norm": 0.3443978638984338, + "learning_rate": 2.7494977162989553e-05, + "loss": 2.4933, + "step": 44057 + }, + { + "epoch": 2.0512140047023766, + "grad_norm": 0.3535881389457657, + "learning_rate": 2.7492558354063214e-05, + "loss": 2.7119, + "step": 44058 + }, + { + "epoch": 2.0512605628884697, + "grad_norm": 0.33553072976830345, + "learning_rate": 2.7490139611193977e-05, + "loss": 2.6844, + "step": 44059 + }, + { + "epoch": 2.051307121074563, + "grad_norm": 0.30925180705991934, + "learning_rate": 2.7487720934388965e-05, + "loss": 2.5569, + "step": 44060 + }, + { + "epoch": 2.051353679260656, + 
"grad_norm": 0.35680195647431145, + "learning_rate": 2.748530232365527e-05, + "loss": 2.6938, + "step": 44061 + }, + { + "epoch": 2.051400237446749, + "grad_norm": 0.35389706551138456, + "learning_rate": 2.7482883779e-05, + "loss": 2.5509, + "step": 44062 + }, + { + "epoch": 2.051446795632842, + "grad_norm": 0.3405128866434118, + "learning_rate": 2.748046530043027e-05, + "loss": 2.6231, + "step": 44063 + }, + { + "epoch": 2.0514933538189353, + "grad_norm": 0.36067752002284575, + "learning_rate": 2.7478046887953108e-05, + "loss": 2.6212, + "step": 44064 + }, + { + "epoch": 2.0515399120050284, + "grad_norm": 0.345987603391014, + "learning_rate": 2.7475628541575704e-05, + "loss": 2.635, + "step": 44065 + }, + { + "epoch": 2.0515864701911215, + "grad_norm": 0.3072212252710331, + "learning_rate": 2.7473210261305087e-05, + "loss": 2.6264, + "step": 44066 + }, + { + "epoch": 2.0516330283772146, + "grad_norm": 0.35096772106464696, + "learning_rate": 2.7470792047148388e-05, + "loss": 2.7507, + "step": 44067 + }, + { + "epoch": 2.0516795865633073, + "grad_norm": 0.35305664769995687, + "learning_rate": 2.7468373899112682e-05, + "loss": 2.6394, + "step": 44068 + }, + { + "epoch": 2.0517261447494004, + "grad_norm": 0.3569936293329325, + "learning_rate": 2.7465955817205086e-05, + "loss": 2.7524, + "step": 44069 + }, + { + "epoch": 2.0517727029354935, + "grad_norm": 0.3473983876623909, + "learning_rate": 2.7463537801432688e-05, + "loss": 2.7393, + "step": 44070 + }, + { + "epoch": 2.0518192611215866, + "grad_norm": 0.39347010040242797, + "learning_rate": 2.746111985180261e-05, + "loss": 2.7046, + "step": 44071 + }, + { + "epoch": 2.0518658193076798, + "grad_norm": 0.345015732066706, + "learning_rate": 2.7458701968321897e-05, + "loss": 2.5227, + "step": 44072 + }, + { + "epoch": 2.051912377493773, + "grad_norm": 0.3459240225105561, + "learning_rate": 2.745628415099768e-05, + "loss": 2.6416, + "step": 44073 + }, + { + "epoch": 2.051958935679866, + "grad_norm": 0.368974862783574, + "learning_rate": 2.7453866399837046e-05, + "loss": 2.6068, + "step": 44074 + }, + { + "epoch": 2.052005493865959, + "grad_norm": 0.3605033968160624, + "learning_rate": 2.7451448714847094e-05, + "loss": 2.5901, + "step": 44075 + }, + { + "epoch": 2.052052052052052, + "grad_norm": 0.35725008692087196, + "learning_rate": 2.7449031096034938e-05, + "loss": 2.6209, + "step": 44076 + }, + { + "epoch": 2.0520986102381453, + "grad_norm": 0.3597130100692147, + "learning_rate": 2.7446613543407612e-05, + "loss": 2.704, + "step": 44077 + }, + { + "epoch": 2.052145168424238, + "grad_norm": 0.372199925996518, + "learning_rate": 2.7444196056972293e-05, + "loss": 2.641, + "step": 44078 + }, + { + "epoch": 2.052191726610331, + "grad_norm": 0.32377635740782795, + "learning_rate": 2.7441778636736003e-05, + "loss": 2.6438, + "step": 44079 + }, + { + "epoch": 2.0522382847964242, + "grad_norm": 0.3676233970060195, + "learning_rate": 2.7439361282705906e-05, + "loss": 2.7713, + "step": 44080 + }, + { + "epoch": 2.0522848429825173, + "grad_norm": 0.3789278542778408, + "learning_rate": 2.7436943994889042e-05, + "loss": 2.7132, + "step": 44081 + }, + { + "epoch": 2.0523314011686105, + "grad_norm": 0.34637799803469216, + "learning_rate": 2.7434526773292523e-05, + "loss": 2.5868, + "step": 44082 + }, + { + "epoch": 2.0523779593547036, + "grad_norm": 0.3486969891237054, + "learning_rate": 2.743210961792344e-05, + "loss": 2.5774, + "step": 44083 + }, + { + "epoch": 2.0524245175407967, + "grad_norm": 0.32445461732153674, + "learning_rate": 2.742969252878892e-05, 
+ "loss": 2.5593, + "step": 44084 + }, + { + "epoch": 2.05247107572689, + "grad_norm": 0.3383844378532062, + "learning_rate": 2.742727550589601e-05, + "loss": 2.7454, + "step": 44085 + }, + { + "epoch": 2.052517633912983, + "grad_norm": 0.3282032802788509, + "learning_rate": 2.742485854925182e-05, + "loss": 2.6709, + "step": 44086 + }, + { + "epoch": 2.052564192099076, + "grad_norm": 0.3179771961267259, + "learning_rate": 2.742244165886344e-05, + "loss": 2.703, + "step": 44087 + }, + { + "epoch": 2.0526107502851687, + "grad_norm": 0.32513078445153304, + "learning_rate": 2.742002483473798e-05, + "loss": 2.7009, + "step": 44088 + }, + { + "epoch": 2.052657308471262, + "grad_norm": 0.3530552790352539, + "learning_rate": 2.7417608076882533e-05, + "loss": 2.7213, + "step": 44089 + }, + { + "epoch": 2.052703866657355, + "grad_norm": 0.32838074671744993, + "learning_rate": 2.7415191385304144e-05, + "loss": 2.5951, + "step": 44090 + }, + { + "epoch": 2.052750424843448, + "grad_norm": 0.31960711672023384, + "learning_rate": 2.7412774760009983e-05, + "loss": 2.6292, + "step": 44091 + }, + { + "epoch": 2.052796983029541, + "grad_norm": 0.34397465640107083, + "learning_rate": 2.741035820100707e-05, + "loss": 2.6846, + "step": 44092 + }, + { + "epoch": 2.0528435412156343, + "grad_norm": 0.32823070858665504, + "learning_rate": 2.7407941708302565e-05, + "loss": 2.7125, + "step": 44093 + }, + { + "epoch": 2.0528900994017274, + "grad_norm": 0.34163482000470907, + "learning_rate": 2.7405525281903504e-05, + "loss": 2.6541, + "step": 44094 + }, + { + "epoch": 2.0529366575878205, + "grad_norm": 0.3551520946831087, + "learning_rate": 2.7403108921817e-05, + "loss": 2.7497, + "step": 44095 + }, + { + "epoch": 2.0529832157739136, + "grad_norm": 0.32888931939036786, + "learning_rate": 2.740069262805015e-05, + "loss": 2.6705, + "step": 44096 + }, + { + "epoch": 2.0530297739600067, + "grad_norm": 0.33285848680195956, + "learning_rate": 2.7398276400610036e-05, + "loss": 2.5354, + "step": 44097 + }, + { + "epoch": 2.0530763321460994, + "grad_norm": 0.3168708411484711, + "learning_rate": 2.739586023950378e-05, + "loss": 2.6388, + "step": 44098 + }, + { + "epoch": 2.0531228903321925, + "grad_norm": 0.33730815436705247, + "learning_rate": 2.7393444144738423e-05, + "loss": 2.7046, + "step": 44099 + }, + { + "epoch": 2.0531694485182856, + "grad_norm": 0.3201209986062097, + "learning_rate": 2.7391028116321075e-05, + "loss": 2.6205, + "step": 44100 + }, + { + "epoch": 2.0532160067043788, + "grad_norm": 0.32552015524269656, + "learning_rate": 2.7388612154258842e-05, + "loss": 2.6372, + "step": 44101 + }, + { + "epoch": 2.053262564890472, + "grad_norm": 0.35709594503090475, + "learning_rate": 2.7386196258558818e-05, + "loss": 2.7477, + "step": 44102 + }, + { + "epoch": 2.053309123076565, + "grad_norm": 0.3106405831527391, + "learning_rate": 2.7383780429228046e-05, + "loss": 2.6162, + "step": 44103 + }, + { + "epoch": 2.053355681262658, + "grad_norm": 0.32600978385715984, + "learning_rate": 2.7381364666273684e-05, + "loss": 2.5622, + "step": 44104 + }, + { + "epoch": 2.053402239448751, + "grad_norm": 0.37608574414863555, + "learning_rate": 2.737894896970275e-05, + "loss": 2.6947, + "step": 44105 + }, + { + "epoch": 2.0534487976348443, + "grad_norm": 0.31973276398920447, + "learning_rate": 2.7376533339522415e-05, + "loss": 2.6984, + "step": 44106 + }, + { + "epoch": 2.053495355820937, + "grad_norm": 0.37679482651145785, + "learning_rate": 2.7374117775739705e-05, + "loss": 2.6577, + "step": 44107 + }, + { + "epoch": 
2.05354191400703, + "grad_norm": 0.3561100572576403, + "learning_rate": 2.7371702278361728e-05, + "loss": 2.6722, + "step": 44108 + }, + { + "epoch": 2.0535884721931232, + "grad_norm": 0.34067950734221136, + "learning_rate": 2.736928684739558e-05, + "loss": 2.6527, + "step": 44109 + }, + { + "epoch": 2.0536350303792164, + "grad_norm": 0.3723704060091802, + "learning_rate": 2.7366871482848344e-05, + "loss": 2.6609, + "step": 44110 + }, + { + "epoch": 2.0536815885653095, + "grad_norm": 0.39721507304453724, + "learning_rate": 2.7364456184727127e-05, + "loss": 2.619, + "step": 44111 + }, + { + "epoch": 2.0537281467514026, + "grad_norm": 0.3622687927067133, + "learning_rate": 2.736204095303898e-05, + "loss": 2.6686, + "step": 44112 + }, + { + "epoch": 2.0537747049374957, + "grad_norm": 0.3364679595905565, + "learning_rate": 2.7359625787791022e-05, + "loss": 2.5387, + "step": 44113 + }, + { + "epoch": 2.053821263123589, + "grad_norm": 0.3514968024848888, + "learning_rate": 2.735721068899032e-05, + "loss": 2.659, + "step": 44114 + }, + { + "epoch": 2.053867821309682, + "grad_norm": 0.36154985905857245, + "learning_rate": 2.7354795656643984e-05, + "loss": 2.6442, + "step": 44115 + }, + { + "epoch": 2.053914379495775, + "grad_norm": 0.36254562981132143, + "learning_rate": 2.7352380690759083e-05, + "loss": 2.6776, + "step": 44116 + }, + { + "epoch": 2.0539609376818677, + "grad_norm": 0.331258765938367, + "learning_rate": 2.7349965791342742e-05, + "loss": 2.7082, + "step": 44117 + }, + { + "epoch": 2.054007495867961, + "grad_norm": 0.3652537576763866, + "learning_rate": 2.7347550958401973e-05, + "loss": 2.6356, + "step": 44118 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 0.33293660848879153, + "learning_rate": 2.7345136191943953e-05, + "loss": 2.6075, + "step": 44119 + }, + { + "epoch": 2.054100612240147, + "grad_norm": 0.3405308620718379, + "learning_rate": 2.734272149197571e-05, + "loss": 2.6343, + "step": 44120 + }, + { + "epoch": 2.05414717042624, + "grad_norm": 0.373187319931208, + "learning_rate": 2.7340306858504344e-05, + "loss": 2.5806, + "step": 44121 + }, + { + "epoch": 2.0541937286123333, + "grad_norm": 0.33042327005767685, + "learning_rate": 2.7337892291536942e-05, + "loss": 2.7115, + "step": 44122 + }, + { + "epoch": 2.0542402867984264, + "grad_norm": 0.35708497976632025, + "learning_rate": 2.73354777910806e-05, + "loss": 2.6051, + "step": 44123 + }, + { + "epoch": 2.0542868449845195, + "grad_norm": 0.34448224601879374, + "learning_rate": 2.7333063357142413e-05, + "loss": 2.6269, + "step": 44124 + }, + { + "epoch": 2.0543334031706126, + "grad_norm": 0.3417276076388386, + "learning_rate": 2.7330648989729434e-05, + "loss": 2.6668, + "step": 44125 + }, + { + "epoch": 2.0543799613567058, + "grad_norm": 0.35100377552948303, + "learning_rate": 2.7328234688848765e-05, + "loss": 2.6081, + "step": 44126 + }, + { + "epoch": 2.0544265195427984, + "grad_norm": 0.36971314790093834, + "learning_rate": 2.73258204545075e-05, + "loss": 2.6778, + "step": 44127 + }, + { + "epoch": 2.0544730777288915, + "grad_norm": 0.34261726960229766, + "learning_rate": 2.732340628671271e-05, + "loss": 2.617, + "step": 44128 + }, + { + "epoch": 2.0545196359149847, + "grad_norm": 0.36459782328853796, + "learning_rate": 2.7320992185471494e-05, + "loss": 2.603, + "step": 44129 + }, + { + "epoch": 2.0545661941010778, + "grad_norm": 0.3571285203470606, + "learning_rate": 2.7318578150790946e-05, + "loss": 2.6904, + "step": 44130 + }, + { + "epoch": 2.054612752287171, + "grad_norm": 0.367219797051187, + "learning_rate": 
2.7316164182678104e-05, + "loss": 2.6527, + "step": 44131 + }, + { + "epoch": 2.054659310473264, + "grad_norm": 0.33390815745333174, + "learning_rate": 2.7313750281140114e-05, + "loss": 2.6447, + "step": 44132 + }, + { + "epoch": 2.054705868659357, + "grad_norm": 0.33157425062112933, + "learning_rate": 2.7311336446184006e-05, + "loss": 2.5974, + "step": 44133 + }, + { + "epoch": 2.0547524268454502, + "grad_norm": 0.3503880706885605, + "learning_rate": 2.7308922677816922e-05, + "loss": 2.5879, + "step": 44134 + }, + { + "epoch": 2.0547989850315433, + "grad_norm": 0.3398756659125441, + "learning_rate": 2.7306508976045896e-05, + "loss": 2.6441, + "step": 44135 + }, + { + "epoch": 2.0548455432176365, + "grad_norm": 0.33963726014535844, + "learning_rate": 2.7304095340878033e-05, + "loss": 2.6873, + "step": 44136 + }, + { + "epoch": 2.054892101403729, + "grad_norm": 0.35724877821059625, + "learning_rate": 2.730168177232043e-05, + "loss": 2.6968, + "step": 44137 + }, + { + "epoch": 2.0549386595898222, + "grad_norm": 0.33149419658207874, + "learning_rate": 2.729926827038014e-05, + "loss": 2.7292, + "step": 44138 + }, + { + "epoch": 2.0549852177759154, + "grad_norm": 0.33286905193640837, + "learning_rate": 2.7296854835064257e-05, + "loss": 2.6827, + "step": 44139 + }, + { + "epoch": 2.0550317759620085, + "grad_norm": 0.3465486904187756, + "learning_rate": 2.729444146637987e-05, + "loss": 2.6473, + "step": 44140 + }, + { + "epoch": 2.0550783341481016, + "grad_norm": 0.3384892673155585, + "learning_rate": 2.729202816433406e-05, + "loss": 2.7222, + "step": 44141 + }, + { + "epoch": 2.0551248923341947, + "grad_norm": 0.33938138563159065, + "learning_rate": 2.7289614928933903e-05, + "loss": 2.5961, + "step": 44142 + }, + { + "epoch": 2.055171450520288, + "grad_norm": 0.35254270297337514, + "learning_rate": 2.728720176018652e-05, + "loss": 2.7547, + "step": 44143 + }, + { + "epoch": 2.055218008706381, + "grad_norm": 0.329998556097132, + "learning_rate": 2.728478865809892e-05, + "loss": 2.704, + "step": 44144 + }, + { + "epoch": 2.055264566892474, + "grad_norm": 0.36153228899266326, + "learning_rate": 2.728237562267827e-05, + "loss": 2.5884, + "step": 44145 + }, + { + "epoch": 2.0553111250785667, + "grad_norm": 0.3358796480766604, + "learning_rate": 2.7279962653931562e-05, + "loss": 2.633, + "step": 44146 + }, + { + "epoch": 2.05535768326466, + "grad_norm": 0.3494443026880322, + "learning_rate": 2.7277549751865965e-05, + "loss": 2.5939, + "step": 44147 + }, + { + "epoch": 2.055404241450753, + "grad_norm": 0.38263000624983146, + "learning_rate": 2.7275136916488508e-05, + "loss": 2.6399, + "step": 44148 + }, + { + "epoch": 2.055450799636846, + "grad_norm": 0.36440991476537293, + "learning_rate": 2.7272724147806283e-05, + "loss": 2.7354, + "step": 44149 + }, + { + "epoch": 2.055497357822939, + "grad_norm": 0.3529303731132466, + "learning_rate": 2.7270311445826392e-05, + "loss": 2.6344, + "step": 44150 + }, + { + "epoch": 2.0555439160090323, + "grad_norm": 0.35274154598461505, + "learning_rate": 2.7267898810555858e-05, + "loss": 2.6676, + "step": 44151 + }, + { + "epoch": 2.0555904741951254, + "grad_norm": 0.3559695700705048, + "learning_rate": 2.7265486242001836e-05, + "loss": 2.4815, + "step": 44152 + }, + { + "epoch": 2.0556370323812185, + "grad_norm": 0.36784814962300066, + "learning_rate": 2.726307374017136e-05, + "loss": 2.6706, + "step": 44153 + }, + { + "epoch": 2.0556835905673116, + "grad_norm": 0.359430970624395, + "learning_rate": 2.726066130507152e-05, + "loss": 2.6902, + "step": 44154 + }, + { + 
"epoch": 2.0557301487534048, + "grad_norm": 0.369949283803162, + "learning_rate": 2.7258248936709403e-05, + "loss": 2.7216, + "step": 44155 + }, + { + "epoch": 2.0557767069394974, + "grad_norm": 0.3433582770703031, + "learning_rate": 2.7255836635092103e-05, + "loss": 2.7549, + "step": 44156 + }, + { + "epoch": 2.0558232651255905, + "grad_norm": 0.34811244150825954, + "learning_rate": 2.7253424400226633e-05, + "loss": 2.7181, + "step": 44157 + }, + { + "epoch": 2.0558698233116837, + "grad_norm": 0.35994597762212244, + "learning_rate": 2.7251012232120177e-05, + "loss": 2.6752, + "step": 44158 + }, + { + "epoch": 2.055916381497777, + "grad_norm": 0.3495955920739653, + "learning_rate": 2.7248600130779707e-05, + "loss": 2.6675, + "step": 44159 + }, + { + "epoch": 2.05596293968387, + "grad_norm": 0.347048295903185, + "learning_rate": 2.72461880962124e-05, + "loss": 2.7082, + "step": 44160 + }, + { + "epoch": 2.056009497869963, + "grad_norm": 0.33747855860000026, + "learning_rate": 2.7243776128425264e-05, + "loss": 2.6515, + "step": 44161 + }, + { + "epoch": 2.056056056056056, + "grad_norm": 0.3442050471888204, + "learning_rate": 2.7241364227425403e-05, + "loss": 2.7903, + "step": 44162 + }, + { + "epoch": 2.0561026142421492, + "grad_norm": 0.33079300936224215, + "learning_rate": 2.7238952393219918e-05, + "loss": 2.6691, + "step": 44163 + }, + { + "epoch": 2.0561491724282424, + "grad_norm": 0.3233104900867585, + "learning_rate": 2.723654062581583e-05, + "loss": 2.7107, + "step": 44164 + }, + { + "epoch": 2.0561957306143355, + "grad_norm": 0.31523607643984225, + "learning_rate": 2.7234128925220293e-05, + "loss": 2.5906, + "step": 44165 + }, + { + "epoch": 2.056242288800428, + "grad_norm": 0.32167124808773073, + "learning_rate": 2.723171729144032e-05, + "loss": 2.6223, + "step": 44166 + }, + { + "epoch": 2.0562888469865213, + "grad_norm": 0.3263346904486286, + "learning_rate": 2.722930572448301e-05, + "loss": 2.6496, + "step": 44167 + }, + { + "epoch": 2.0563354051726144, + "grad_norm": 0.3327353293155517, + "learning_rate": 2.722689422435545e-05, + "loss": 2.6229, + "step": 44168 + }, + { + "epoch": 2.0563819633587075, + "grad_norm": 0.3408179995604945, + "learning_rate": 2.722448279106471e-05, + "loss": 2.7333, + "step": 44169 + }, + { + "epoch": 2.0564285215448006, + "grad_norm": 0.35061334997703897, + "learning_rate": 2.7222071424617873e-05, + "loss": 2.6392, + "step": 44170 + }, + { + "epoch": 2.0564750797308937, + "grad_norm": 0.32615191151749584, + "learning_rate": 2.721966012502203e-05, + "loss": 2.672, + "step": 44171 + }, + { + "epoch": 2.056521637916987, + "grad_norm": 0.33307814165143773, + "learning_rate": 2.7217248892284197e-05, + "loss": 2.6678, + "step": 44172 + }, + { + "epoch": 2.05656819610308, + "grad_norm": 0.33802921043669637, + "learning_rate": 2.721483772641154e-05, + "loss": 2.6396, + "step": 44173 + }, + { + "epoch": 2.056614754289173, + "grad_norm": 0.3371546439401645, + "learning_rate": 2.721242662741107e-05, + "loss": 2.5402, + "step": 44174 + }, + { + "epoch": 2.056661312475266, + "grad_norm": 0.3255580313477539, + "learning_rate": 2.7210015595289873e-05, + "loss": 2.596, + "step": 44175 + }, + { + "epoch": 2.056707870661359, + "grad_norm": 0.34447317208507133, + "learning_rate": 2.720760463005506e-05, + "loss": 2.68, + "step": 44176 + }, + { + "epoch": 2.056754428847452, + "grad_norm": 0.33819662722847255, + "learning_rate": 2.7205193731713653e-05, + "loss": 2.4389, + "step": 44177 + }, + { + "epoch": 2.056800987033545, + "grad_norm": 0.32739703578390855, + 
"learning_rate": 2.7202782900272784e-05, + "loss": 2.6635, + "step": 44178 + }, + { + "epoch": 2.056847545219638, + "grad_norm": 0.33596000602444565, + "learning_rate": 2.720037213573948e-05, + "loss": 2.6495, + "step": 44179 + }, + { + "epoch": 2.0568941034057313, + "grad_norm": 0.35016262185523384, + "learning_rate": 2.7197961438120846e-05, + "loss": 2.7946, + "step": 44180 + }, + { + "epoch": 2.0569406615918244, + "grad_norm": 0.3317777316040885, + "learning_rate": 2.7195550807423943e-05, + "loss": 2.6536, + "step": 44181 + }, + { + "epoch": 2.0569872197779175, + "grad_norm": 0.35351318550649363, + "learning_rate": 2.7193140243655856e-05, + "loss": 2.6324, + "step": 44182 + }, + { + "epoch": 2.0570337779640107, + "grad_norm": 0.3415454250812606, + "learning_rate": 2.719072974682365e-05, + "loss": 2.6116, + "step": 44183 + }, + { + "epoch": 2.0570803361501038, + "grad_norm": 0.3257536315887324, + "learning_rate": 2.718831931693443e-05, + "loss": 2.6331, + "step": 44184 + }, + { + "epoch": 2.0571268943361964, + "grad_norm": 0.35826655707490945, + "learning_rate": 2.7185908953995203e-05, + "loss": 2.6601, + "step": 44185 + }, + { + "epoch": 2.0571734525222896, + "grad_norm": 0.34434194202630214, + "learning_rate": 2.718349865801313e-05, + "loss": 2.7292, + "step": 44186 + }, + { + "epoch": 2.0572200107083827, + "grad_norm": 0.35007321883284365, + "learning_rate": 2.7181088428995215e-05, + "loss": 2.5966, + "step": 44187 + }, + { + "epoch": 2.057266568894476, + "grad_norm": 0.3458082058903389, + "learning_rate": 2.717867826694856e-05, + "loss": 2.6666, + "step": 44188 + }, + { + "epoch": 2.057313127080569, + "grad_norm": 0.3198955949398715, + "learning_rate": 2.7176268171880236e-05, + "loss": 2.6747, + "step": 44189 + }, + { + "epoch": 2.057359685266662, + "grad_norm": 0.3552379216886272, + "learning_rate": 2.7173858143797317e-05, + "loss": 2.6406, + "step": 44190 + }, + { + "epoch": 2.057406243452755, + "grad_norm": 0.34884788115905013, + "learning_rate": 2.7171448182706894e-05, + "loss": 2.6606, + "step": 44191 + }, + { + "epoch": 2.0574528016388482, + "grad_norm": 0.37469633865749385, + "learning_rate": 2.7169038288615978e-05, + "loss": 2.6721, + "step": 44192 + }, + { + "epoch": 2.0574993598249414, + "grad_norm": 0.3855298961010556, + "learning_rate": 2.7166628461531727e-05, + "loss": 2.6209, + "step": 44193 + }, + { + "epoch": 2.0575459180110345, + "grad_norm": 0.3183346499948437, + "learning_rate": 2.7164218701461163e-05, + "loss": 2.5989, + "step": 44194 + }, + { + "epoch": 2.057592476197127, + "grad_norm": 0.3818922448905036, + "learning_rate": 2.7161809008411355e-05, + "loss": 2.6559, + "step": 44195 + }, + { + "epoch": 2.0576390343832203, + "grad_norm": 0.3429231135224187, + "learning_rate": 2.7159399382389395e-05, + "loss": 2.6643, + "step": 44196 + }, + { + "epoch": 2.0576855925693134, + "grad_norm": 0.33770606288728783, + "learning_rate": 2.7156989823402362e-05, + "loss": 2.5775, + "step": 44197 + }, + { + "epoch": 2.0577321507554065, + "grad_norm": 0.35497814633626973, + "learning_rate": 2.7154580331457275e-05, + "loss": 2.6634, + "step": 44198 + }, + { + "epoch": 2.0577787089414996, + "grad_norm": 0.3270397635743806, + "learning_rate": 2.7152170906561286e-05, + "loss": 2.6365, + "step": 44199 + }, + { + "epoch": 2.0578252671275927, + "grad_norm": 0.35123104916449277, + "learning_rate": 2.7149761548721403e-05, + "loss": 2.6715, + "step": 44200 + }, + { + "epoch": 2.057871825313686, + "grad_norm": 0.3271601649017156, + "learning_rate": 2.714735225794472e-05, + "loss": 2.681, + 
"step": 44201 + }, + { + "epoch": 2.057918383499779, + "grad_norm": 0.3563691187548447, + "learning_rate": 2.71449430342383e-05, + "loss": 2.7848, + "step": 44202 + }, + { + "epoch": 2.057964941685872, + "grad_norm": 0.3548346327274888, + "learning_rate": 2.7142533877609234e-05, + "loss": 2.6679, + "step": 44203 + }, + { + "epoch": 2.058011499871965, + "grad_norm": 0.29673350615124017, + "learning_rate": 2.714012478806459e-05, + "loss": 2.6013, + "step": 44204 + }, + { + "epoch": 2.058058058058058, + "grad_norm": 0.3337378451268429, + "learning_rate": 2.7137715765611387e-05, + "loss": 2.6498, + "step": 44205 + }, + { + "epoch": 2.058104616244151, + "grad_norm": 0.3411635989397563, + "learning_rate": 2.7135306810256776e-05, + "loss": 2.6875, + "step": 44206 + }, + { + "epoch": 2.058151174430244, + "grad_norm": 0.33149596155801586, + "learning_rate": 2.713289792200777e-05, + "loss": 2.6581, + "step": 44207 + }, + { + "epoch": 2.058197732616337, + "grad_norm": 0.3228295121656196, + "learning_rate": 2.713048910087146e-05, + "loss": 2.6794, + "step": 44208 + }, + { + "epoch": 2.0582442908024303, + "grad_norm": 0.31405852492962033, + "learning_rate": 2.71280803468549e-05, + "loss": 2.6776, + "step": 44209 + }, + { + "epoch": 2.0582908489885234, + "grad_norm": 0.33954736325390983, + "learning_rate": 2.712567165996518e-05, + "loss": 2.6544, + "step": 44210 + }, + { + "epoch": 2.0583374071746166, + "grad_norm": 0.342681281720027, + "learning_rate": 2.7123263040209356e-05, + "loss": 2.7495, + "step": 44211 + }, + { + "epoch": 2.0583839653607097, + "grad_norm": 0.3425585602598208, + "learning_rate": 2.7120854487594527e-05, + "loss": 2.6182, + "step": 44212 + }, + { + "epoch": 2.058430523546803, + "grad_norm": 0.35152587436575794, + "learning_rate": 2.7118446002127706e-05, + "loss": 2.6307, + "step": 44213 + }, + { + "epoch": 2.058477081732896, + "grad_norm": 0.34943829620095546, + "learning_rate": 2.7116037583816002e-05, + "loss": 2.6969, + "step": 44214 + }, + { + "epoch": 2.0585236399189886, + "grad_norm": 0.33441535364633096, + "learning_rate": 2.711362923266647e-05, + "loss": 2.5473, + "step": 44215 + }, + { + "epoch": 2.0585701981050817, + "grad_norm": 0.3393495134155802, + "learning_rate": 2.7111220948686178e-05, + "loss": 2.6856, + "step": 44216 + }, + { + "epoch": 2.058616756291175, + "grad_norm": 0.33489409614288623, + "learning_rate": 2.7108812731882223e-05, + "loss": 2.6745, + "step": 44217 + }, + { + "epoch": 2.058663314477268, + "grad_norm": 0.3633202895622134, + "learning_rate": 2.71064045822616e-05, + "loss": 2.6553, + "step": 44218 + }, + { + "epoch": 2.058709872663361, + "grad_norm": 0.36307340585719555, + "learning_rate": 2.7103996499831475e-05, + "loss": 2.6393, + "step": 44219 + }, + { + "epoch": 2.058756430849454, + "grad_norm": 0.3592272632322282, + "learning_rate": 2.710158848459884e-05, + "loss": 2.6456, + "step": 44220 + }, + { + "epoch": 2.0588029890355473, + "grad_norm": 0.35299775672570216, + "learning_rate": 2.709918053657078e-05, + "loss": 2.6618, + "step": 44221 + }, + { + "epoch": 2.0588495472216404, + "grad_norm": 0.354694126509436, + "learning_rate": 2.7096772655754376e-05, + "loss": 2.7065, + "step": 44222 + }, + { + "epoch": 2.0588961054077335, + "grad_norm": 0.3654881963728919, + "learning_rate": 2.7094364842156684e-05, + "loss": 2.614, + "step": 44223 + }, + { + "epoch": 2.058942663593826, + "grad_norm": 0.3367368814856315, + "learning_rate": 2.7091957095784783e-05, + "loss": 2.7107, + "step": 44224 + }, + { + "epoch": 2.0589892217799193, + "grad_norm": 
0.34216240175241797, + "learning_rate": 2.708954941664574e-05, + "loss": 2.5762, + "step": 44225 + }, + { + "epoch": 2.0590357799660124, + "grad_norm": 0.36205772877049663, + "learning_rate": 2.7087141804746595e-05, + "loss": 2.6289, + "step": 44226 + }, + { + "epoch": 2.0590823381521055, + "grad_norm": 0.33299955690573396, + "learning_rate": 2.7084734260094434e-05, + "loss": 2.7116, + "step": 44227 + }, + { + "epoch": 2.0591288963381986, + "grad_norm": 0.35050401843287593, + "learning_rate": 2.708232678269632e-05, + "loss": 2.623, + "step": 44228 + }, + { + "epoch": 2.0591754545242917, + "grad_norm": 0.378957667584329, + "learning_rate": 2.707991937255932e-05, + "loss": 2.6096, + "step": 44229 + }, + { + "epoch": 2.059222012710385, + "grad_norm": 0.34716839830067475, + "learning_rate": 2.7077512029690523e-05, + "loss": 2.586, + "step": 44230 + }, + { + "epoch": 2.059268570896478, + "grad_norm": 0.3575629929274835, + "learning_rate": 2.7075104754096925e-05, + "loss": 2.6218, + "step": 44231 + }, + { + "epoch": 2.059315129082571, + "grad_norm": 0.36553990520321467, + "learning_rate": 2.7072697545785674e-05, + "loss": 2.6962, + "step": 44232 + }, + { + "epoch": 2.059361687268664, + "grad_norm": 0.32637299639694917, + "learning_rate": 2.7070290404763777e-05, + "loss": 2.5741, + "step": 44233 + }, + { + "epoch": 2.059408245454757, + "grad_norm": 0.3568538255829428, + "learning_rate": 2.7067883331038323e-05, + "loss": 2.666, + "step": 44234 + }, + { + "epoch": 2.05945480364085, + "grad_norm": 0.3330862975701795, + "learning_rate": 2.706547632461637e-05, + "loss": 2.5456, + "step": 44235 + }, + { + "epoch": 2.059501361826943, + "grad_norm": 0.3202178083666013, + "learning_rate": 2.706306938550498e-05, + "loss": 2.6966, + "step": 44236 + }, + { + "epoch": 2.059547920013036, + "grad_norm": 0.35199484449956564, + "learning_rate": 2.706066251371123e-05, + "loss": 2.5836, + "step": 44237 + }, + { + "epoch": 2.0595944781991293, + "grad_norm": 0.3289551397890015, + "learning_rate": 2.7058255709242193e-05, + "loss": 2.6356, + "step": 44238 + }, + { + "epoch": 2.0596410363852224, + "grad_norm": 0.33995934393661886, + "learning_rate": 2.705584897210489e-05, + "loss": 2.6833, + "step": 44239 + }, + { + "epoch": 2.0596875945713156, + "grad_norm": 0.33262259796434446, + "learning_rate": 2.705344230230642e-05, + "loss": 2.6642, + "step": 44240 + }, + { + "epoch": 2.0597341527574087, + "grad_norm": 0.3421712228850424, + "learning_rate": 2.705103569985383e-05, + "loss": 2.6805, + "step": 44241 + }, + { + "epoch": 2.059780710943502, + "grad_norm": 0.3264786711365509, + "learning_rate": 2.7048629164754186e-05, + "loss": 2.6648, + "step": 44242 + }, + { + "epoch": 2.059827269129595, + "grad_norm": 0.33968182169254, + "learning_rate": 2.7046222697014578e-05, + "loss": 2.6325, + "step": 44243 + }, + { + "epoch": 2.0598738273156876, + "grad_norm": 0.3546837512312534, + "learning_rate": 2.7043816296642004e-05, + "loss": 2.6813, + "step": 44244 + }, + { + "epoch": 2.0599203855017807, + "grad_norm": 0.3352555997104464, + "learning_rate": 2.704140996364361e-05, + "loss": 2.6848, + "step": 44245 + }, + { + "epoch": 2.059966943687874, + "grad_norm": 0.3366493082760864, + "learning_rate": 2.703900369802638e-05, + "loss": 2.6571, + "step": 44246 + }, + { + "epoch": 2.060013501873967, + "grad_norm": 0.3545989296154994, + "learning_rate": 2.7036597499797445e-05, + "loss": 2.7406, + "step": 44247 + }, + { + "epoch": 2.06006006006006, + "grad_norm": 0.3424273870321375, + "learning_rate": 2.7034191368963813e-05, + "loss": 2.6943, 
+ "step": 44248 + }, + { + "epoch": 2.060106618246153, + "grad_norm": 0.32110097750423633, + "learning_rate": 2.703178530553257e-05, + "loss": 2.5528, + "step": 44249 + }, + { + "epoch": 2.0601531764322463, + "grad_norm": 0.338392656991475, + "learning_rate": 2.702937930951077e-05, + "loss": 2.715, + "step": 44250 + }, + { + "epoch": 2.0601997346183394, + "grad_norm": 0.31593058421072906, + "learning_rate": 2.7026973380905505e-05, + "loss": 2.6606, + "step": 44251 + }, + { + "epoch": 2.0602462928044325, + "grad_norm": 0.32227338224811547, + "learning_rate": 2.7024567519723785e-05, + "loss": 2.6278, + "step": 44252 + }, + { + "epoch": 2.0602928509905256, + "grad_norm": 0.3284493845293678, + "learning_rate": 2.7022161725972705e-05, + "loss": 2.717, + "step": 44253 + }, + { + "epoch": 2.0603394091766183, + "grad_norm": 0.35395070583583893, + "learning_rate": 2.7019755999659308e-05, + "loss": 2.7107, + "step": 44254 + }, + { + "epoch": 2.0603859673627114, + "grad_norm": 0.31257653570583444, + "learning_rate": 2.7017350340790665e-05, + "loss": 2.6576, + "step": 44255 + }, + { + "epoch": 2.0604325255488045, + "grad_norm": 0.3333756103790864, + "learning_rate": 2.7014944749373855e-05, + "loss": 2.7403, + "step": 44256 + }, + { + "epoch": 2.0604790837348976, + "grad_norm": 0.32934049482182476, + "learning_rate": 2.7012539225415877e-05, + "loss": 2.5302, + "step": 44257 + }, + { + "epoch": 2.0605256419209907, + "grad_norm": 0.33336819405854695, + "learning_rate": 2.7010133768923873e-05, + "loss": 2.6079, + "step": 44258 + }, + { + "epoch": 2.060572200107084, + "grad_norm": 0.31620253828254674, + "learning_rate": 2.7007728379904816e-05, + "loss": 2.6245, + "step": 44259 + }, + { + "epoch": 2.060618758293177, + "grad_norm": 0.321124150706986, + "learning_rate": 2.7005323058365862e-05, + "loss": 2.6256, + "step": 44260 + }, + { + "epoch": 2.06066531647927, + "grad_norm": 0.3470469443824228, + "learning_rate": 2.7002917804313994e-05, + "loss": 2.738, + "step": 44261 + }, + { + "epoch": 2.060711874665363, + "grad_norm": 0.34652546227449166, + "learning_rate": 2.700051261775629e-05, + "loss": 2.6378, + "step": 44262 + }, + { + "epoch": 2.0607584328514563, + "grad_norm": 0.32629921060691025, + "learning_rate": 2.6998107498699827e-05, + "loss": 2.6153, + "step": 44263 + }, + { + "epoch": 2.060804991037549, + "grad_norm": 0.33252421116879616, + "learning_rate": 2.699570244715165e-05, + "loss": 2.6478, + "step": 44264 + }, + { + "epoch": 2.060851549223642, + "grad_norm": 0.334559650156115, + "learning_rate": 2.699329746311883e-05, + "loss": 2.7226, + "step": 44265 + }, + { + "epoch": 2.0608981074097352, + "grad_norm": 0.32741445878758646, + "learning_rate": 2.6990892546608404e-05, + "loss": 2.6438, + "step": 44266 + }, + { + "epoch": 2.0609446655958283, + "grad_norm": 0.3273375379239266, + "learning_rate": 2.698848769762744e-05, + "loss": 2.571, + "step": 44267 + }, + { + "epoch": 2.0609912237819215, + "grad_norm": 0.3634722937549949, + "learning_rate": 2.6986082916183e-05, + "loss": 2.6485, + "step": 44268 + }, + { + "epoch": 2.0610377819680146, + "grad_norm": 0.33270925325646106, + "learning_rate": 2.6983678202282158e-05, + "loss": 2.7627, + "step": 44269 + }, + { + "epoch": 2.0610843401541077, + "grad_norm": 0.3311666545447229, + "learning_rate": 2.6981273555931913e-05, + "loss": 2.6927, + "step": 44270 + }, + { + "epoch": 2.061130898340201, + "grad_norm": 0.3550404942480454, + "learning_rate": 2.6978868977139403e-05, + "loss": 2.6367, + "step": 44271 + }, + { + "epoch": 2.061177456526294, + "grad_norm": 
0.33201336582873026, + "learning_rate": 2.69764644659116e-05, + "loss": 2.6631, + "step": 44272 + }, + { + "epoch": 2.0612240147123866, + "grad_norm": 0.33738909658236615, + "learning_rate": 2.6974060022255656e-05, + "loss": 2.5626, + "step": 44273 + }, + { + "epoch": 2.0612705728984797, + "grad_norm": 0.3619105613842856, + "learning_rate": 2.6971655646178547e-05, + "loss": 2.6443, + "step": 44274 + }, + { + "epoch": 2.061317131084573, + "grad_norm": 0.3465534773597818, + "learning_rate": 2.6969251337687362e-05, + "loss": 2.7028, + "step": 44275 + }, + { + "epoch": 2.061363689270666, + "grad_norm": 0.33621351041965747, + "learning_rate": 2.6966847096789157e-05, + "loss": 2.6546, + "step": 44276 + }, + { + "epoch": 2.061410247456759, + "grad_norm": 0.3452022019874159, + "learning_rate": 2.6964442923490983e-05, + "loss": 2.5993, + "step": 44277 + }, + { + "epoch": 2.061456805642852, + "grad_norm": 0.34800423145520404, + "learning_rate": 2.6962038817799927e-05, + "loss": 2.6938, + "step": 44278 + }, + { + "epoch": 2.0615033638289453, + "grad_norm": 0.34105467067543316, + "learning_rate": 2.695963477972299e-05, + "loss": 2.6029, + "step": 44279 + }, + { + "epoch": 2.0615499220150384, + "grad_norm": 0.33473348559438615, + "learning_rate": 2.695723080926726e-05, + "loss": 2.6432, + "step": 44280 + }, + { + "epoch": 2.0615964802011315, + "grad_norm": 0.33385614721908224, + "learning_rate": 2.695482690643979e-05, + "loss": 2.5654, + "step": 44281 + }, + { + "epoch": 2.0616430383872246, + "grad_norm": 0.3503981171731376, + "learning_rate": 2.6952423071247623e-05, + "loss": 2.6742, + "step": 44282 + }, + { + "epoch": 2.0616895965733173, + "grad_norm": 0.3531865482695862, + "learning_rate": 2.6950019303697825e-05, + "loss": 2.5423, + "step": 44283 + }, + { + "epoch": 2.0617361547594104, + "grad_norm": 0.3892767786736871, + "learning_rate": 2.6947615603797482e-05, + "loss": 2.5403, + "step": 44284 + }, + { + "epoch": 2.0617827129455035, + "grad_norm": 0.3641021789279395, + "learning_rate": 2.6945211971553564e-05, + "loss": 2.6638, + "step": 44285 + }, + { + "epoch": 2.0618292711315966, + "grad_norm": 0.32903333624043735, + "learning_rate": 2.6942808406973223e-05, + "loss": 2.6485, + "step": 44286 + }, + { + "epoch": 2.0618758293176898, + "grad_norm": 0.3523511729186629, + "learning_rate": 2.6940404910063445e-05, + "loss": 2.6237, + "step": 44287 + }, + { + "epoch": 2.061922387503783, + "grad_norm": 0.36985954595413745, + "learning_rate": 2.693800148083131e-05, + "loss": 2.6338, + "step": 44288 + }, + { + "epoch": 2.061968945689876, + "grad_norm": 0.33864796656405594, + "learning_rate": 2.6935598119283867e-05, + "loss": 2.6864, + "step": 44289 + }, + { + "epoch": 2.062015503875969, + "grad_norm": 0.36022130856511825, + "learning_rate": 2.693319482542817e-05, + "loss": 2.6264, + "step": 44290 + }, + { + "epoch": 2.062062062062062, + "grad_norm": 0.3401766436750895, + "learning_rate": 2.69307915992713e-05, + "loss": 2.7314, + "step": 44291 + }, + { + "epoch": 2.0621086202481553, + "grad_norm": 0.3547342428896871, + "learning_rate": 2.692838844082025e-05, + "loss": 2.6471, + "step": 44292 + }, + { + "epoch": 2.062155178434248, + "grad_norm": 0.36821331030357457, + "learning_rate": 2.6925985350082124e-05, + "loss": 2.6528, + "step": 44293 + }, + { + "epoch": 2.062201736620341, + "grad_norm": 0.3510521676057873, + "learning_rate": 2.6923582327063946e-05, + "loss": 2.5813, + "step": 44294 + }, + { + "epoch": 2.0622482948064342, + "grad_norm": 0.3378069273304892, + "learning_rate": 2.6921179371772788e-05, + 
"loss": 2.5519, + "step": 44295 + }, + { + "epoch": 2.0622948529925273, + "grad_norm": 0.37074242086879694, + "learning_rate": 2.691877648421569e-05, + "loss": 2.7002, + "step": 44296 + }, + { + "epoch": 2.0623414111786205, + "grad_norm": 0.3673378637826128, + "learning_rate": 2.691637366439973e-05, + "loss": 2.641, + "step": 44297 + }, + { + "epoch": 2.0623879693647136, + "grad_norm": 0.32151336118211005, + "learning_rate": 2.6913970912331898e-05, + "loss": 2.6646, + "step": 44298 + }, + { + "epoch": 2.0624345275508067, + "grad_norm": 0.3504715923178545, + "learning_rate": 2.691156822801933e-05, + "loss": 2.6826, + "step": 44299 + }, + { + "epoch": 2.0624810857369, + "grad_norm": 0.365556759677035, + "learning_rate": 2.6909165611469013e-05, + "loss": 2.6972, + "step": 44300 + }, + { + "epoch": 2.062527643922993, + "grad_norm": 0.32297707694404465, + "learning_rate": 2.6906763062688024e-05, + "loss": 2.6692, + "step": 44301 + }, + { + "epoch": 2.062574202109086, + "grad_norm": 0.355939249834739, + "learning_rate": 2.6904360581683403e-05, + "loss": 2.7062, + "step": 44302 + }, + { + "epoch": 2.0626207602951787, + "grad_norm": 0.3568638874501915, + "learning_rate": 2.690195816846221e-05, + "loss": 2.5842, + "step": 44303 + }, + { + "epoch": 2.062667318481272, + "grad_norm": 0.3368225107444888, + "learning_rate": 2.689955582303152e-05, + "loss": 2.6279, + "step": 44304 + }, + { + "epoch": 2.062713876667365, + "grad_norm": 0.318646505804548, + "learning_rate": 2.6897153545398333e-05, + "loss": 2.6239, + "step": 44305 + }, + { + "epoch": 2.062760434853458, + "grad_norm": 0.3493009106679267, + "learning_rate": 2.6894751335569722e-05, + "loss": 2.5891, + "step": 44306 + }, + { + "epoch": 2.062806993039551, + "grad_norm": 0.3265174317274961, + "learning_rate": 2.6892349193552745e-05, + "loss": 2.5474, + "step": 44307 + }, + { + "epoch": 2.0628535512256443, + "grad_norm": 0.31217766221425086, + "learning_rate": 2.688994711935444e-05, + "loss": 2.7407, + "step": 44308 + }, + { + "epoch": 2.0629001094117374, + "grad_norm": 0.3429902536158538, + "learning_rate": 2.6887545112981872e-05, + "loss": 2.7336, + "step": 44309 + }, + { + "epoch": 2.0629466675978305, + "grad_norm": 0.36006436388624385, + "learning_rate": 2.6885143174442094e-05, + "loss": 2.7214, + "step": 44310 + }, + { + "epoch": 2.0629932257839236, + "grad_norm": 0.31779055524250865, + "learning_rate": 2.6882741303742117e-05, + "loss": 2.7143, + "step": 44311 + }, + { + "epoch": 2.0630397839700167, + "grad_norm": 0.3555296512130849, + "learning_rate": 2.6880339500889045e-05, + "loss": 2.5996, + "step": 44312 + }, + { + "epoch": 2.0630863421561094, + "grad_norm": 0.3218279729836979, + "learning_rate": 2.687793776588988e-05, + "loss": 2.6896, + "step": 44313 + }, + { + "epoch": 2.0631329003422025, + "grad_norm": 0.3405202690358799, + "learning_rate": 2.687553609875169e-05, + "loss": 2.7073, + "step": 44314 + }, + { + "epoch": 2.0631794585282957, + "grad_norm": 0.3533376096838315, + "learning_rate": 2.6873134499481527e-05, + "loss": 2.7362, + "step": 44315 + }, + { + "epoch": 2.0632260167143888, + "grad_norm": 0.32928858210690215, + "learning_rate": 2.687073296808643e-05, + "loss": 2.6215, + "step": 44316 + }, + { + "epoch": 2.063272574900482, + "grad_norm": 0.3263046082066344, + "learning_rate": 2.686833150457348e-05, + "loss": 2.7021, + "step": 44317 + }, + { + "epoch": 2.063319133086575, + "grad_norm": 0.33965352491082507, + "learning_rate": 2.6865930108949655e-05, + "loss": 2.645, + "step": 44318 + }, + { + "epoch": 2.063365691272668, + 
"grad_norm": 0.3277904447066412, + "learning_rate": 2.6863528781222087e-05, + "loss": 2.7103, + "step": 44319 + }, + { + "epoch": 2.0634122494587612, + "grad_norm": 0.3690637401307745, + "learning_rate": 2.686112752139776e-05, + "loss": 2.7207, + "step": 44320 + }, + { + "epoch": 2.0634588076448543, + "grad_norm": 0.32390813530852414, + "learning_rate": 2.6858726329483747e-05, + "loss": 2.5669, + "step": 44321 + }, + { + "epoch": 2.063505365830947, + "grad_norm": 0.3920072338835008, + "learning_rate": 2.6856325205487094e-05, + "loss": 2.7249, + "step": 44322 + }, + { + "epoch": 2.06355192401704, + "grad_norm": 0.377495701845619, + "learning_rate": 2.685392414941486e-05, + "loss": 2.6959, + "step": 44323 + }, + { + "epoch": 2.0635984822031332, + "grad_norm": 0.3579373617758958, + "learning_rate": 2.6851523161274045e-05, + "loss": 2.6163, + "step": 44324 + }, + { + "epoch": 2.0636450403892264, + "grad_norm": 0.36476760977745376, + "learning_rate": 2.6849122241071767e-05, + "loss": 2.5768, + "step": 44325 + }, + { + "epoch": 2.0636915985753195, + "grad_norm": 0.33997348351635337, + "learning_rate": 2.684672138881501e-05, + "loss": 2.7349, + "step": 44326 + }, + { + "epoch": 2.0637381567614126, + "grad_norm": 0.3656629139396835, + "learning_rate": 2.6844320604510854e-05, + "loss": 2.6702, + "step": 44327 + }, + { + "epoch": 2.0637847149475057, + "grad_norm": 0.35590322538983854, + "learning_rate": 2.6841919888166327e-05, + "loss": 2.7064, + "step": 44328 + }, + { + "epoch": 2.063831273133599, + "grad_norm": 0.33907192448424156, + "learning_rate": 2.683951923978849e-05, + "loss": 2.6826, + "step": 44329 + }, + { + "epoch": 2.063877831319692, + "grad_norm": 0.3215249668842526, + "learning_rate": 2.6837118659384396e-05, + "loss": 2.5939, + "step": 44330 + }, + { + "epoch": 2.063924389505785, + "grad_norm": 0.3452436018129841, + "learning_rate": 2.683471814696104e-05, + "loss": 2.7045, + "step": 44331 + }, + { + "epoch": 2.0639709476918777, + "grad_norm": 0.3383487006720731, + "learning_rate": 2.6832317702525546e-05, + "loss": 2.6294, + "step": 44332 + }, + { + "epoch": 2.064017505877971, + "grad_norm": 0.3139259190024529, + "learning_rate": 2.6829917326084898e-05, + "loss": 2.6234, + "step": 44333 + }, + { + "epoch": 2.064064064064064, + "grad_norm": 0.3469846668390645, + "learning_rate": 2.6827517017646153e-05, + "loss": 2.6334, + "step": 44334 + }, + { + "epoch": 2.064110622250157, + "grad_norm": 0.3421296639973495, + "learning_rate": 2.6825116777216364e-05, + "loss": 2.5951, + "step": 44335 + }, + { + "epoch": 2.06415718043625, + "grad_norm": 0.3143094219176405, + "learning_rate": 2.682271660480258e-05, + "loss": 2.6923, + "step": 44336 + }, + { + "epoch": 2.0642037386223433, + "grad_norm": 0.3335433555592004, + "learning_rate": 2.6820316500411834e-05, + "loss": 2.6009, + "step": 44337 + }, + { + "epoch": 2.0642502968084364, + "grad_norm": 0.34902264690579476, + "learning_rate": 2.68179164640512e-05, + "loss": 2.5368, + "step": 44338 + }, + { + "epoch": 2.0642968549945295, + "grad_norm": 0.32691338394941755, + "learning_rate": 2.6815516495727677e-05, + "loss": 2.593, + "step": 44339 + }, + { + "epoch": 2.0643434131806226, + "grad_norm": 0.318648738569532, + "learning_rate": 2.681311659544833e-05, + "loss": 2.6252, + "step": 44340 + }, + { + "epoch": 2.0643899713667158, + "grad_norm": 0.34472006878249184, + "learning_rate": 2.68107167632202e-05, + "loss": 2.7481, + "step": 44341 + }, + { + "epoch": 2.0644365295528084, + "grad_norm": 0.3351728235193986, + "learning_rate": 2.6808316999050332e-05, 
+ "loss": 2.6119, + "step": 44342 + }, + { + "epoch": 2.0644830877389015, + "grad_norm": 0.32350786280157917, + "learning_rate": 2.6805917302945782e-05, + "loss": 2.6741, + "step": 44343 + }, + { + "epoch": 2.0645296459249947, + "grad_norm": 0.36826640685000855, + "learning_rate": 2.6803517674913547e-05, + "loss": 2.7302, + "step": 44344 + }, + { + "epoch": 2.0645762041110878, + "grad_norm": 0.3501950787954401, + "learning_rate": 2.6801118114960744e-05, + "loss": 2.6571, + "step": 44345 + }, + { + "epoch": 2.064622762297181, + "grad_norm": 0.3430274247332717, + "learning_rate": 2.6798718623094343e-05, + "loss": 2.753, + "step": 44346 + }, + { + "epoch": 2.064669320483274, + "grad_norm": 0.359238482386802, + "learning_rate": 2.679631919932143e-05, + "loss": 2.6836, + "step": 44347 + }, + { + "epoch": 2.064715878669367, + "grad_norm": 0.33154856386400794, + "learning_rate": 2.679391984364903e-05, + "loss": 2.6169, + "step": 44348 + }, + { + "epoch": 2.0647624368554602, + "grad_norm": 0.34093764410683675, + "learning_rate": 2.679152055608418e-05, + "loss": 2.6393, + "step": 44349 + }, + { + "epoch": 2.0648089950415534, + "grad_norm": 0.32759417423879683, + "learning_rate": 2.6789121336633944e-05, + "loss": 2.7031, + "step": 44350 + }, + { + "epoch": 2.0648555532276465, + "grad_norm": 0.330240783374019, + "learning_rate": 2.678672218530537e-05, + "loss": 2.6668, + "step": 44351 + }, + { + "epoch": 2.064902111413739, + "grad_norm": 0.33082399060291, + "learning_rate": 2.6784323102105448e-05, + "loss": 2.6595, + "step": 44352 + }, + { + "epoch": 2.0649486695998323, + "grad_norm": 0.3347016722429531, + "learning_rate": 2.6781924087041256e-05, + "loss": 2.6672, + "step": 44353 + }, + { + "epoch": 2.0649952277859254, + "grad_norm": 0.36595072253865346, + "learning_rate": 2.677952514011983e-05, + "loss": 2.7682, + "step": 44354 + }, + { + "epoch": 2.0650417859720185, + "grad_norm": 0.34503313007205133, + "learning_rate": 2.6777126261348217e-05, + "loss": 2.6003, + "step": 44355 + }, + { + "epoch": 2.0650883441581116, + "grad_norm": 0.3521609588667324, + "learning_rate": 2.6774727450733462e-05, + "loss": 2.6553, + "step": 44356 + }, + { + "epoch": 2.0651349023442047, + "grad_norm": 0.341041839111465, + "learning_rate": 2.6772328708282558e-05, + "loss": 2.72, + "step": 44357 + }, + { + "epoch": 2.065181460530298, + "grad_norm": 0.33236795819655235, + "learning_rate": 2.6769930034002614e-05, + "loss": 2.7575, + "step": 44358 + }, + { + "epoch": 2.065228018716391, + "grad_norm": 0.3605919158377523, + "learning_rate": 2.6767531427900627e-05, + "loss": 2.5647, + "step": 44359 + }, + { + "epoch": 2.065274576902484, + "grad_norm": 0.3266774549856192, + "learning_rate": 2.6765132889983636e-05, + "loss": 2.5515, + "step": 44360 + }, + { + "epoch": 2.0653211350885767, + "grad_norm": 0.32611077107206893, + "learning_rate": 2.6762734420258694e-05, + "loss": 2.6248, + "step": 44361 + }, + { + "epoch": 2.06536769327467, + "grad_norm": 0.31372694842711824, + "learning_rate": 2.6760336018732846e-05, + "loss": 2.6555, + "step": 44362 + }, + { + "epoch": 2.065414251460763, + "grad_norm": 0.3329146123846981, + "learning_rate": 2.6757937685413116e-05, + "loss": 2.6555, + "step": 44363 + }, + { + "epoch": 2.065460809646856, + "grad_norm": 0.3565971925919999, + "learning_rate": 2.6755539420306563e-05, + "loss": 2.6732, + "step": 44364 + }, + { + "epoch": 2.065507367832949, + "grad_norm": 0.32681467201474584, + "learning_rate": 2.6753141223420202e-05, + "loss": 2.5614, + "step": 44365 + }, + { + "epoch": 
2.0655539260190423, + "grad_norm": 0.33814474130973493, + "learning_rate": 2.6750743094761078e-05, + "loss": 2.629, + "step": 44366 + }, + { + "epoch": 2.0656004842051354, + "grad_norm": 0.3486682181643848, + "learning_rate": 2.674834503433623e-05, + "loss": 2.5224, + "step": 44367 + }, + { + "epoch": 2.0656470423912285, + "grad_norm": 0.3405372140492798, + "learning_rate": 2.6745947042152696e-05, + "loss": 2.5118, + "step": 44368 + }, + { + "epoch": 2.0656936005773217, + "grad_norm": 0.3406803799738321, + "learning_rate": 2.674354911821755e-05, + "loss": 2.6615, + "step": 44369 + }, + { + "epoch": 2.0657401587634148, + "grad_norm": 0.3289978272336885, + "learning_rate": 2.6741151262537743e-05, + "loss": 2.6445, + "step": 44370 + }, + { + "epoch": 2.0657867169495074, + "grad_norm": 0.3516236731164743, + "learning_rate": 2.6738753475120408e-05, + "loss": 2.6153, + "step": 44371 + }, + { + "epoch": 2.0658332751356006, + "grad_norm": 0.3450591421498137, + "learning_rate": 2.67363557559725e-05, + "loss": 2.5588, + "step": 44372 + }, + { + "epoch": 2.0658798333216937, + "grad_norm": 0.333588180694002, + "learning_rate": 2.6733958105101143e-05, + "loss": 2.5847, + "step": 44373 + }, + { + "epoch": 2.065926391507787, + "grad_norm": 0.35988193779151006, + "learning_rate": 2.67315605225133e-05, + "loss": 2.7077, + "step": 44374 + }, + { + "epoch": 2.06597294969388, + "grad_norm": 0.34841248247475043, + "learning_rate": 2.6729163008216034e-05, + "loss": 2.767, + "step": 44375 + }, + { + "epoch": 2.066019507879973, + "grad_norm": 0.3440180816530608, + "learning_rate": 2.672676556221638e-05, + "loss": 2.7462, + "step": 44376 + }, + { + "epoch": 2.066066066066066, + "grad_norm": 0.3390404690315125, + "learning_rate": 2.6724368184521408e-05, + "loss": 2.7147, + "step": 44377 + }, + { + "epoch": 2.0661126242521592, + "grad_norm": 0.345439046408194, + "learning_rate": 2.672197087513809e-05, + "loss": 2.7058, + "step": 44378 + }, + { + "epoch": 2.0661591824382524, + "grad_norm": 0.3255208356729692, + "learning_rate": 2.6719573634073503e-05, + "loss": 2.682, + "step": 44379 + }, + { + "epoch": 2.0662057406243455, + "grad_norm": 0.3334076425085602, + "learning_rate": 2.671717646133467e-05, + "loss": 2.6623, + "step": 44380 + }, + { + "epoch": 2.066252298810438, + "grad_norm": 0.3302861024959155, + "learning_rate": 2.671477935692863e-05, + "loss": 2.6056, + "step": 44381 + }, + { + "epoch": 2.0662988569965313, + "grad_norm": 0.318991576215223, + "learning_rate": 2.671238232086244e-05, + "loss": 2.6569, + "step": 44382 + }, + { + "epoch": 2.0663454151826244, + "grad_norm": 0.32942107514236163, + "learning_rate": 2.6709985353143075e-05, + "loss": 2.6169, + "step": 44383 + }, + { + "epoch": 2.0663919733687175, + "grad_norm": 0.3455207290875889, + "learning_rate": 2.6707588453777643e-05, + "loss": 2.7023, + "step": 44384 + }, + { + "epoch": 2.0664385315548106, + "grad_norm": 0.34277980215175563, + "learning_rate": 2.6705191622773106e-05, + "loss": 2.7185, + "step": 44385 + }, + { + "epoch": 2.0664850897409037, + "grad_norm": 0.3290845144271087, + "learning_rate": 2.6702794860136576e-05, + "loss": 2.5336, + "step": 44386 + }, + { + "epoch": 2.066531647926997, + "grad_norm": 0.3376836051248634, + "learning_rate": 2.670039816587503e-05, + "loss": 2.663, + "step": 44387 + }, + { + "epoch": 2.06657820611309, + "grad_norm": 0.32708334287523827, + "learning_rate": 2.6698001539995522e-05, + "loss": 2.633, + "step": 44388 + }, + { + "epoch": 2.066624764299183, + "grad_norm": 0.3344968826232714, + "learning_rate": 
2.6695604982505086e-05, + "loss": 2.5836, + "step": 44389 + }, + { + "epoch": 2.066671322485276, + "grad_norm": 0.3355510254812262, + "learning_rate": 2.669320849341074e-05, + "loss": 2.6528, + "step": 44390 + }, + { + "epoch": 2.066717880671369, + "grad_norm": 0.3542643704451138, + "learning_rate": 2.6690812072719562e-05, + "loss": 2.6325, + "step": 44391 + }, + { + "epoch": 2.066764438857462, + "grad_norm": 0.356486431434172, + "learning_rate": 2.668841572043853e-05, + "loss": 2.6192, + "step": 44392 + }, + { + "epoch": 2.066810997043555, + "grad_norm": 0.3405209576980166, + "learning_rate": 2.6686019436574707e-05, + "loss": 2.6391, + "step": 44393 + }, + { + "epoch": 2.066857555229648, + "grad_norm": 0.3465834884680192, + "learning_rate": 2.6683623221135118e-05, + "loss": 2.6185, + "step": 44394 + }, + { + "epoch": 2.0669041134157413, + "grad_norm": 0.3715367955252418, + "learning_rate": 2.6681227074126813e-05, + "loss": 2.7616, + "step": 44395 + }, + { + "epoch": 2.0669506716018344, + "grad_norm": 0.3471560896602842, + "learning_rate": 2.667883099555678e-05, + "loss": 2.5685, + "step": 44396 + }, + { + "epoch": 2.0669972297879275, + "grad_norm": 0.37145818952373433, + "learning_rate": 2.6676434985432107e-05, + "loss": 2.7139, + "step": 44397 + }, + { + "epoch": 2.0670437879740207, + "grad_norm": 0.3806243299490612, + "learning_rate": 2.6674039043759768e-05, + "loss": 2.6196, + "step": 44398 + }, + { + "epoch": 2.067090346160114, + "grad_norm": 0.34087022916084797, + "learning_rate": 2.6671643170546867e-05, + "loss": 2.6992, + "step": 44399 + }, + { + "epoch": 2.0671369043462064, + "grad_norm": 0.3490003225657944, + "learning_rate": 2.666924736580037e-05, + "loss": 2.6733, + "step": 44400 + }, + { + "epoch": 2.0671834625322996, + "grad_norm": 0.36470997625178475, + "learning_rate": 2.6666851629527334e-05, + "loss": 2.6498, + "step": 44401 + }, + { + "epoch": 2.0672300207183927, + "grad_norm": 0.3661079911173608, + "learning_rate": 2.666445596173479e-05, + "loss": 2.7254, + "step": 44402 + }, + { + "epoch": 2.067276578904486, + "grad_norm": 0.3229122667628487, + "learning_rate": 2.6662060362429776e-05, + "loss": 2.6738, + "step": 44403 + }, + { + "epoch": 2.067323137090579, + "grad_norm": 0.33013160924540014, + "learning_rate": 2.665966483161933e-05, + "loss": 2.6082, + "step": 44404 + }, + { + "epoch": 2.067369695276672, + "grad_norm": 0.36711777932926026, + "learning_rate": 2.6657269369310446e-05, + "loss": 2.6759, + "step": 44405 + }, + { + "epoch": 2.067416253462765, + "grad_norm": 0.3452686814601476, + "learning_rate": 2.665487397551018e-05, + "loss": 2.645, + "step": 44406 + }, + { + "epoch": 2.0674628116488583, + "grad_norm": 0.36781234687791964, + "learning_rate": 2.665247865022556e-05, + "loss": 2.6139, + "step": 44407 + }, + { + "epoch": 2.0675093698349514, + "grad_norm": 0.34943276292854797, + "learning_rate": 2.665008339346361e-05, + "loss": 2.6486, + "step": 44408 + }, + { + "epoch": 2.0675559280210445, + "grad_norm": 0.36828684495502195, + "learning_rate": 2.664768820523137e-05, + "loss": 2.7051, + "step": 44409 + }, + { + "epoch": 2.067602486207137, + "grad_norm": 0.34786027302566047, + "learning_rate": 2.6645293085535882e-05, + "loss": 2.6, + "step": 44410 + }, + { + "epoch": 2.0676490443932303, + "grad_norm": 0.3387424474130491, + "learning_rate": 2.6642898034384123e-05, + "loss": 2.6199, + "step": 44411 + }, + { + "epoch": 2.0676956025793234, + "grad_norm": 0.3306067602694381, + "learning_rate": 2.6640503051783188e-05, + "loss": 2.4688, + "step": 44412 + }, + { + 
"epoch": 2.0677421607654165, + "grad_norm": 0.37120952413121194, + "learning_rate": 2.663810813774006e-05, + "loss": 2.5883, + "step": 44413 + }, + { + "epoch": 2.0677887189515096, + "grad_norm": 0.3582549991350404, + "learning_rate": 2.663571329226179e-05, + "loss": 2.699, + "step": 44414 + }, + { + "epoch": 2.0678352771376027, + "grad_norm": 0.3586340281222067, + "learning_rate": 2.6633318515355388e-05, + "loss": 2.6168, + "step": 44415 + }, + { + "epoch": 2.067881835323696, + "grad_norm": 0.3347099797377309, + "learning_rate": 2.6630923807027895e-05, + "loss": 2.6618, + "step": 44416 + }, + { + "epoch": 2.067928393509789, + "grad_norm": 0.36942787322065584, + "learning_rate": 2.6628529167286365e-05, + "loss": 2.6777, + "step": 44417 + }, + { + "epoch": 2.067974951695882, + "grad_norm": 0.3726253501805144, + "learning_rate": 2.6626134596137775e-05, + "loss": 2.6918, + "step": 44418 + }, + { + "epoch": 2.068021509881975, + "grad_norm": 0.31646186292855155, + "learning_rate": 2.6623740093589182e-05, + "loss": 2.6659, + "step": 44419 + }, + { + "epoch": 2.068068068068068, + "grad_norm": 0.38574328054100554, + "learning_rate": 2.6621345659647608e-05, + "loss": 2.6963, + "step": 44420 + }, + { + "epoch": 2.068114626254161, + "grad_norm": 0.34156107541233244, + "learning_rate": 2.6618951294320083e-05, + "loss": 2.7394, + "step": 44421 + }, + { + "epoch": 2.068161184440254, + "grad_norm": 0.32791607629324565, + "learning_rate": 2.6616556997613628e-05, + "loss": 2.5595, + "step": 44422 + }, + { + "epoch": 2.068207742626347, + "grad_norm": 0.33780935024244263, + "learning_rate": 2.661416276953529e-05, + "loss": 2.6529, + "step": 44423 + }, + { + "epoch": 2.0682543008124403, + "grad_norm": 0.31457570461102075, + "learning_rate": 2.6611768610092046e-05, + "loss": 2.7095, + "step": 44424 + }, + { + "epoch": 2.0683008589985334, + "grad_norm": 0.3485477912845422, + "learning_rate": 2.6609374519290996e-05, + "loss": 2.5392, + "step": 44425 + }, + { + "epoch": 2.0683474171846266, + "grad_norm": 0.3444562488225882, + "learning_rate": 2.660698049713909e-05, + "loss": 2.69, + "step": 44426 + }, + { + "epoch": 2.0683939753707197, + "grad_norm": 0.31263491217204326, + "learning_rate": 2.660458654364343e-05, + "loss": 2.6039, + "step": 44427 + }, + { + "epoch": 2.068440533556813, + "grad_norm": 0.3329474292945403, + "learning_rate": 2.6602192658810987e-05, + "loss": 2.597, + "step": 44428 + }, + { + "epoch": 2.068487091742906, + "grad_norm": 0.3619537180910601, + "learning_rate": 2.65997988426488e-05, + "loss": 2.6533, + "step": 44429 + }, + { + "epoch": 2.0685336499289986, + "grad_norm": 0.3407332029391978, + "learning_rate": 2.659740509516392e-05, + "loss": 2.657, + "step": 44430 + }, + { + "epoch": 2.0685802081150917, + "grad_norm": 0.34321622899243365, + "learning_rate": 2.6595011416363323e-05, + "loss": 2.6433, + "step": 44431 + }, + { + "epoch": 2.068626766301185, + "grad_norm": 0.35402653660016975, + "learning_rate": 2.659261780625407e-05, + "loss": 2.6143, + "step": 44432 + }, + { + "epoch": 2.068673324487278, + "grad_norm": 0.3482752158271818, + "learning_rate": 2.6590224264843178e-05, + "loss": 2.6275, + "step": 44433 + }, + { + "epoch": 2.068719882673371, + "grad_norm": 0.3334519190821962, + "learning_rate": 2.658783079213767e-05, + "loss": 2.7435, + "step": 44434 + }, + { + "epoch": 2.068766440859464, + "grad_norm": 0.3486792709029122, + "learning_rate": 2.6585437388144563e-05, + "loss": 2.6954, + "step": 44435 + }, + { + "epoch": 2.0688129990455573, + "grad_norm": 0.35758579675548297, + 
"learning_rate": 2.658304405287092e-05, + "loss": 2.7339, + "step": 44436 + }, + { + "epoch": 2.0688595572316504, + "grad_norm": 0.3213193004508703, + "learning_rate": 2.6580650786323695e-05, + "loss": 2.5605, + "step": 44437 + }, + { + "epoch": 2.0689061154177435, + "grad_norm": 0.3428794471600291, + "learning_rate": 2.6578257588509993e-05, + "loss": 2.6051, + "step": 44438 + }, + { + "epoch": 2.068952673603836, + "grad_norm": 0.3152615822966252, + "learning_rate": 2.6575864459436755e-05, + "loss": 2.6326, + "step": 44439 + }, + { + "epoch": 2.0689992317899293, + "grad_norm": 0.3225274085344769, + "learning_rate": 2.657347139911109e-05, + "loss": 2.659, + "step": 44440 + }, + { + "epoch": 2.0690457899760224, + "grad_norm": 0.36178736315397975, + "learning_rate": 2.6571078407539955e-05, + "loss": 2.6366, + "step": 44441 + }, + { + "epoch": 2.0690923481621155, + "grad_norm": 0.33448399925291644, + "learning_rate": 2.6568685484730404e-05, + "loss": 2.6243, + "step": 44442 + }, + { + "epoch": 2.0691389063482086, + "grad_norm": 0.34881577212497034, + "learning_rate": 2.656629263068946e-05, + "loss": 2.6097, + "step": 44443 + }, + { + "epoch": 2.0691854645343017, + "grad_norm": 0.30531855615387155, + "learning_rate": 2.6563899845424112e-05, + "loss": 2.6565, + "step": 44444 + }, + { + "epoch": 2.069232022720395, + "grad_norm": 0.33401549820582094, + "learning_rate": 2.6561507128941443e-05, + "loss": 2.6832, + "step": 44445 + }, + { + "epoch": 2.069278580906488, + "grad_norm": 0.35138660829600166, + "learning_rate": 2.655911448124842e-05, + "loss": 2.6303, + "step": 44446 + }, + { + "epoch": 2.069325139092581, + "grad_norm": 0.34240461643080006, + "learning_rate": 2.655672190235209e-05, + "loss": 2.6912, + "step": 44447 + }, + { + "epoch": 2.069371697278674, + "grad_norm": 0.3328144494849179, + "learning_rate": 2.655432939225947e-05, + "loss": 2.6178, + "step": 44448 + }, + { + "epoch": 2.0694182554647673, + "grad_norm": 0.32590571122764256, + "learning_rate": 2.6551936950977602e-05, + "loss": 2.5839, + "step": 44449 + }, + { + "epoch": 2.06946481365086, + "grad_norm": 0.36935694289481624, + "learning_rate": 2.654954457851345e-05, + "loss": 2.5848, + "step": 44450 + }, + { + "epoch": 2.069511371836953, + "grad_norm": 0.3273293705198791, + "learning_rate": 2.6547152274874115e-05, + "loss": 2.5899, + "step": 44451 + }, + { + "epoch": 2.069557930023046, + "grad_norm": 0.3310673369060086, + "learning_rate": 2.654476004006654e-05, + "loss": 2.6239, + "step": 44452 + }, + { + "epoch": 2.0696044882091393, + "grad_norm": 0.33371645028850133, + "learning_rate": 2.6542367874097817e-05, + "loss": 2.6958, + "step": 44453 + }, + { + "epoch": 2.0696510463952325, + "grad_norm": 0.34689844710955775, + "learning_rate": 2.6539975776974925e-05, + "loss": 2.6027, + "step": 44454 + }, + { + "epoch": 2.0696976045813256, + "grad_norm": 0.3182699637664782, + "learning_rate": 2.6537583748704882e-05, + "loss": 2.6013, + "step": 44455 + }, + { + "epoch": 2.0697441627674187, + "grad_norm": 0.323612770880693, + "learning_rate": 2.6535191789294745e-05, + "loss": 2.6438, + "step": 44456 + }, + { + "epoch": 2.069790720953512, + "grad_norm": 0.3402635388620273, + "learning_rate": 2.653279989875146e-05, + "loss": 2.6934, + "step": 44457 + }, + { + "epoch": 2.069837279139605, + "grad_norm": 0.34098559872139517, + "learning_rate": 2.6530408077082147e-05, + "loss": 2.659, + "step": 44458 + }, + { + "epoch": 2.0698838373256976, + "grad_norm": 0.3200792576587646, + "learning_rate": 2.6528016324293748e-05, + "loss": 2.6608, + "step": 
44459 + }, + { + "epoch": 2.0699303955117907, + "grad_norm": 0.3403476408420386, + "learning_rate": 2.6525624640393308e-05, + "loss": 2.6306, + "step": 44460 + }, + { + "epoch": 2.069976953697884, + "grad_norm": 0.35937304244188695, + "learning_rate": 2.652323302538785e-05, + "loss": 2.6569, + "step": 44461 + }, + { + "epoch": 2.070023511883977, + "grad_norm": 0.3057830464479563, + "learning_rate": 2.6520841479284385e-05, + "loss": 2.5564, + "step": 44462 + }, + { + "epoch": 2.07007007007007, + "grad_norm": 0.33515428855605883, + "learning_rate": 2.6518450002089946e-05, + "loss": 2.6912, + "step": 44463 + }, + { + "epoch": 2.070116628256163, + "grad_norm": 0.338607258460071, + "learning_rate": 2.6516058593811553e-05, + "loss": 2.6165, + "step": 44464 + }, + { + "epoch": 2.0701631864422563, + "grad_norm": 0.3313178451378926, + "learning_rate": 2.6513667254456175e-05, + "loss": 2.7931, + "step": 44465 + }, + { + "epoch": 2.0702097446283494, + "grad_norm": 0.3254170396074708, + "learning_rate": 2.6511275984030914e-05, + "loss": 2.7049, + "step": 44466 + }, + { + "epoch": 2.0702563028144425, + "grad_norm": 0.33797511740543396, + "learning_rate": 2.650888478254273e-05, + "loss": 2.7364, + "step": 44467 + }, + { + "epoch": 2.0703028610005356, + "grad_norm": 0.34006772334332525, + "learning_rate": 2.650649364999865e-05, + "loss": 2.5829, + "step": 44468 + }, + { + "epoch": 2.0703494191866283, + "grad_norm": 0.3621428748892842, + "learning_rate": 2.6504102586405712e-05, + "loss": 2.6763, + "step": 44469 + }, + { + "epoch": 2.0703959773727214, + "grad_norm": 0.3254804619882877, + "learning_rate": 2.6501711591770882e-05, + "loss": 2.6631, + "step": 44470 + }, + { + "epoch": 2.0704425355588145, + "grad_norm": 0.36356250969951237, + "learning_rate": 2.649932066610126e-05, + "loss": 2.6469, + "step": 44471 + }, + { + "epoch": 2.0704890937449076, + "grad_norm": 0.36583625194038993, + "learning_rate": 2.649692980940378e-05, + "loss": 2.6473, + "step": 44472 + }, + { + "epoch": 2.0705356519310008, + "grad_norm": 0.3410926926515617, + "learning_rate": 2.6494539021685506e-05, + "loss": 2.7607, + "step": 44473 + }, + { + "epoch": 2.070582210117094, + "grad_norm": 0.33935847139909464, + "learning_rate": 2.6492148302953444e-05, + "loss": 2.5845, + "step": 44474 + }, + { + "epoch": 2.070628768303187, + "grad_norm": 0.3497154853791868, + "learning_rate": 2.6489757653214615e-05, + "loss": 2.6747, + "step": 44475 + }, + { + "epoch": 2.07067532648928, + "grad_norm": 0.33949376715415275, + "learning_rate": 2.648736707247602e-05, + "loss": 2.6222, + "step": 44476 + }, + { + "epoch": 2.070721884675373, + "grad_norm": 0.3261522990829468, + "learning_rate": 2.6484976560744718e-05, + "loss": 2.644, + "step": 44477 + }, + { + "epoch": 2.0707684428614663, + "grad_norm": 0.3351047982320544, + "learning_rate": 2.6482586118027646e-05, + "loss": 2.6633, + "step": 44478 + }, + { + "epoch": 2.070815001047559, + "grad_norm": 0.32751746886539324, + "learning_rate": 2.648019574433191e-05, + "loss": 2.651, + "step": 44479 + }, + { + "epoch": 2.070861559233652, + "grad_norm": 0.3571711995749808, + "learning_rate": 2.647780543966446e-05, + "loss": 2.7006, + "step": 44480 + }, + { + "epoch": 2.0709081174197452, + "grad_norm": 0.3461812611583368, + "learning_rate": 2.647541520403234e-05, + "loss": 2.6846, + "step": 44481 + }, + { + "epoch": 2.0709546756058383, + "grad_norm": 0.3408578831699598, + "learning_rate": 2.6473025037442567e-05, + "loss": 2.6504, + "step": 44482 + }, + { + "epoch": 2.0710012337919315, + "grad_norm": 
0.3413126604303765, + "learning_rate": 2.6470634939902117e-05, + "loss": 2.6634, + "step": 44483 + }, + { + "epoch": 2.0710477919780246, + "grad_norm": 0.3328585073203623, + "learning_rate": 2.6468244911418072e-05, + "loss": 2.6565, + "step": 44484 + }, + { + "epoch": 2.0710943501641177, + "grad_norm": 0.3317867870336456, + "learning_rate": 2.646585495199737e-05, + "loss": 2.6072, + "step": 44485 + }, + { + "epoch": 2.071140908350211, + "grad_norm": 0.36538452146960787, + "learning_rate": 2.6463465061647108e-05, + "loss": 2.7206, + "step": 44486 + }, + { + "epoch": 2.071187466536304, + "grad_norm": 0.33110413696822194, + "learning_rate": 2.646107524037424e-05, + "loss": 2.6783, + "step": 44487 + }, + { + "epoch": 2.071234024722397, + "grad_norm": 0.3455270117577417, + "learning_rate": 2.6458685488185793e-05, + "loss": 2.6816, + "step": 44488 + }, + { + "epoch": 2.0712805829084897, + "grad_norm": 0.37126038010891976, + "learning_rate": 2.6456295805088783e-05, + "loss": 2.6851, + "step": 44489 + }, + { + "epoch": 2.071327141094583, + "grad_norm": 0.34359746737845004, + "learning_rate": 2.645390619109025e-05, + "loss": 2.6454, + "step": 44490 + }, + { + "epoch": 2.071373699280676, + "grad_norm": 0.3440711561200339, + "learning_rate": 2.645151664619714e-05, + "loss": 2.55, + "step": 44491 + }, + { + "epoch": 2.071420257466769, + "grad_norm": 0.3858195845844589, + "learning_rate": 2.6449127170416542e-05, + "loss": 2.6965, + "step": 44492 + }, + { + "epoch": 2.071466815652862, + "grad_norm": 0.38240430893616895, + "learning_rate": 2.6446737763755426e-05, + "loss": 2.7426, + "step": 44493 + }, + { + "epoch": 2.0715133738389553, + "grad_norm": 0.36748122668350597, + "learning_rate": 2.644434842622081e-05, + "loss": 2.6924, + "step": 44494 + }, + { + "epoch": 2.0715599320250484, + "grad_norm": 0.37252043944082913, + "learning_rate": 2.6441959157819727e-05, + "loss": 2.6462, + "step": 44495 + }, + { + "epoch": 2.0716064902111415, + "grad_norm": 0.37255932654489904, + "learning_rate": 2.643956995855914e-05, + "loss": 2.6442, + "step": 44496 + }, + { + "epoch": 2.0716530483972346, + "grad_norm": 0.36087440552278127, + "learning_rate": 2.6437180828446127e-05, + "loss": 2.696, + "step": 44497 + }, + { + "epoch": 2.0716996065833273, + "grad_norm": 0.361910379617601, + "learning_rate": 2.643479176748763e-05, + "loss": 2.6021, + "step": 44498 + }, + { + "epoch": 2.0717461647694204, + "grad_norm": 0.3815569671348212, + "learning_rate": 2.6432402775690747e-05, + "loss": 2.6795, + "step": 44499 + }, + { + "epoch": 2.0717927229555135, + "grad_norm": 0.3362067100463734, + "learning_rate": 2.6430013853062407e-05, + "loss": 2.6268, + "step": 44500 + }, + { + "epoch": 2.0718392811416066, + "grad_norm": 0.33958188355977154, + "learning_rate": 2.6427624999609656e-05, + "loss": 2.5658, + "step": 44501 + }, + { + "epoch": 2.0718858393276998, + "grad_norm": 0.34952310459125624, + "learning_rate": 2.642523621533951e-05, + "loss": 2.7416, + "step": 44502 + }, + { + "epoch": 2.071932397513793, + "grad_norm": 0.34862007383019944, + "learning_rate": 2.6422847500258972e-05, + "loss": 2.6005, + "step": 44503 + }, + { + "epoch": 2.071978955699886, + "grad_norm": 0.346416589165982, + "learning_rate": 2.6420458854375053e-05, + "loss": 2.6392, + "step": 44504 + }, + { + "epoch": 2.072025513885979, + "grad_norm": 0.3368117464603158, + "learning_rate": 2.641807027769478e-05, + "loss": 2.6396, + "step": 44505 + }, + { + "epoch": 2.0720720720720722, + "grad_norm": 0.3496040379646827, + "learning_rate": 2.641568177022513e-05, + 
"loss": 2.6007, + "step": 44506 + }, + { + "epoch": 2.0721186302581653, + "grad_norm": 0.34584465495480804, + "learning_rate": 2.641329333197313e-05, + "loss": 2.6283, + "step": 44507 + }, + { + "epoch": 2.072165188444258, + "grad_norm": 0.3533352343231293, + "learning_rate": 2.6410904962945814e-05, + "loss": 2.7242, + "step": 44508 + }, + { + "epoch": 2.072211746630351, + "grad_norm": 0.34296093694829216, + "learning_rate": 2.6408516663150124e-05, + "loss": 2.6135, + "step": 44509 + }, + { + "epoch": 2.0722583048164442, + "grad_norm": 0.41696462544635393, + "learning_rate": 2.6406128432593158e-05, + "loss": 2.6827, + "step": 44510 + }, + { + "epoch": 2.0723048630025374, + "grad_norm": 0.34431285131700273, + "learning_rate": 2.6403740271281836e-05, + "loss": 2.6909, + "step": 44511 + }, + { + "epoch": 2.0723514211886305, + "grad_norm": 0.3546739432762446, + "learning_rate": 2.640135217922325e-05, + "loss": 2.664, + "step": 44512 + }, + { + "epoch": 2.0723979793747236, + "grad_norm": 0.39220214653087926, + "learning_rate": 2.6398964156424357e-05, + "loss": 2.5543, + "step": 44513 + }, + { + "epoch": 2.0724445375608167, + "grad_norm": 0.35417972630162714, + "learning_rate": 2.6396576202892175e-05, + "loss": 2.637, + "step": 44514 + }, + { + "epoch": 2.07249109574691, + "grad_norm": 0.374800173659649, + "learning_rate": 2.6394188318633716e-05, + "loss": 2.7681, + "step": 44515 + }, + { + "epoch": 2.072537653933003, + "grad_norm": 0.37298994036449035, + "learning_rate": 2.639180050365599e-05, + "loss": 2.6149, + "step": 44516 + }, + { + "epoch": 2.072584212119096, + "grad_norm": 0.3528266797098586, + "learning_rate": 2.6389412757966004e-05, + "loss": 2.678, + "step": 44517 + }, + { + "epoch": 2.0726307703051887, + "grad_norm": 0.35741142570107093, + "learning_rate": 2.6387025081570782e-05, + "loss": 2.722, + "step": 44518 + }, + { + "epoch": 2.072677328491282, + "grad_norm": 0.3662041202600473, + "learning_rate": 2.6384637474477302e-05, + "loss": 2.6503, + "step": 44519 + }, + { + "epoch": 2.072723886677375, + "grad_norm": 0.32650840081367855, + "learning_rate": 2.638224993669258e-05, + "loss": 2.6048, + "step": 44520 + }, + { + "epoch": 2.072770444863468, + "grad_norm": 0.350967800670348, + "learning_rate": 2.6379862468223625e-05, + "loss": 2.6173, + "step": 44521 + }, + { + "epoch": 2.072817003049561, + "grad_norm": 0.3409265894424832, + "learning_rate": 2.6377475069077452e-05, + "loss": 2.556, + "step": 44522 + }, + { + "epoch": 2.0728635612356543, + "grad_norm": 0.32758003656515317, + "learning_rate": 2.6375087739261085e-05, + "loss": 2.6469, + "step": 44523 + }, + { + "epoch": 2.0729101194217474, + "grad_norm": 0.35082010735197955, + "learning_rate": 2.6372700478781466e-05, + "loss": 2.7722, + "step": 44524 + }, + { + "epoch": 2.0729566776078405, + "grad_norm": 0.37850251725024486, + "learning_rate": 2.6370313287645676e-05, + "loss": 2.6938, + "step": 44525 + }, + { + "epoch": 2.0730032357939336, + "grad_norm": 0.36604404336125085, + "learning_rate": 2.6367926165860672e-05, + "loss": 2.6969, + "step": 44526 + }, + { + "epoch": 2.0730497939800268, + "grad_norm": 0.38084675267096135, + "learning_rate": 2.636553911343348e-05, + "loss": 2.6346, + "step": 44527 + }, + { + "epoch": 2.0730963521661194, + "grad_norm": 0.3453135332518529, + "learning_rate": 2.63631521303711e-05, + "loss": 2.6215, + "step": 44528 + }, + { + "epoch": 2.0731429103522125, + "grad_norm": 0.35441129519815595, + "learning_rate": 2.636076521668054e-05, + "loss": 2.6325, + "step": 44529 + }, + { + "epoch": 
2.0731894685383057, + "grad_norm": 0.37969789398094056, + "learning_rate": 2.6358378372368807e-05, + "loss": 2.5918, + "step": 44530 + }, + { + "epoch": 2.0732360267243988, + "grad_norm": 0.3172394008601896, + "learning_rate": 2.635599159744292e-05, + "loss": 2.6337, + "step": 44531 + }, + { + "epoch": 2.073282584910492, + "grad_norm": 0.3886450954305131, + "learning_rate": 2.6353604891909856e-05, + "loss": 2.6223, + "step": 44532 + }, + { + "epoch": 2.073329143096585, + "grad_norm": 0.38876817704531896, + "learning_rate": 2.6351218255776623e-05, + "loss": 2.593, + "step": 44533 + }, + { + "epoch": 2.073375701282678, + "grad_norm": 0.336874680758991, + "learning_rate": 2.634883168905024e-05, + "loss": 2.6333, + "step": 44534 + }, + { + "epoch": 2.0734222594687712, + "grad_norm": 0.3867975268640403, + "learning_rate": 2.6346445191737707e-05, + "loss": 2.7591, + "step": 44535 + }, + { + "epoch": 2.0734688176548643, + "grad_norm": 0.34327434337435564, + "learning_rate": 2.6344058763846047e-05, + "loss": 2.5894, + "step": 44536 + }, + { + "epoch": 2.073515375840957, + "grad_norm": 0.35741975678190935, + "learning_rate": 2.6341672405382202e-05, + "loss": 2.5851, + "step": 44537 + }, + { + "epoch": 2.07356193402705, + "grad_norm": 0.3708770720317459, + "learning_rate": 2.6339286116353256e-05, + "loss": 2.6961, + "step": 44538 + }, + { + "epoch": 2.0736084922131433, + "grad_norm": 0.34394832943007586, + "learning_rate": 2.6336899896766137e-05, + "loss": 2.5623, + "step": 44539 + }, + { + "epoch": 2.0736550503992364, + "grad_norm": 0.34916602680453024, + "learning_rate": 2.6334513746627927e-05, + "loss": 2.7132, + "step": 44540 + }, + { + "epoch": 2.0737016085853295, + "grad_norm": 0.3256839308512072, + "learning_rate": 2.6332127665945562e-05, + "loss": 2.6489, + "step": 44541 + }, + { + "epoch": 2.0737481667714226, + "grad_norm": 0.3502874410351856, + "learning_rate": 2.6329741654726075e-05, + "loss": 2.6928, + "step": 44542 + }, + { + "epoch": 2.0737947249575157, + "grad_norm": 0.3446133030280431, + "learning_rate": 2.6327355712976466e-05, + "loss": 2.6575, + "step": 44543 + }, + { + "epoch": 2.073841283143609, + "grad_norm": 0.32783964472162386, + "learning_rate": 2.632496984070375e-05, + "loss": 2.6356, + "step": 44544 + }, + { + "epoch": 2.073887841329702, + "grad_norm": 0.36210669521913763, + "learning_rate": 2.63225840379149e-05, + "loss": 2.6712, + "step": 44545 + }, + { + "epoch": 2.073934399515795, + "grad_norm": 0.3324101406021063, + "learning_rate": 2.6320198304616928e-05, + "loss": 2.7294, + "step": 44546 + }, + { + "epoch": 2.0739809577018877, + "grad_norm": 0.3605825290419204, + "learning_rate": 2.6317812640816854e-05, + "loss": 2.6661, + "step": 44547 + }, + { + "epoch": 2.074027515887981, + "grad_norm": 0.33459691165659383, + "learning_rate": 2.631542704652165e-05, + "loss": 2.7321, + "step": 44548 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 0.36532943083553554, + "learning_rate": 2.6313041521738362e-05, + "loss": 2.6625, + "step": 44549 + }, + { + "epoch": 2.074120632260167, + "grad_norm": 0.32288494461785305, + "learning_rate": 2.6310656066473928e-05, + "loss": 2.5701, + "step": 44550 + }, + { + "epoch": 2.07416719044626, + "grad_norm": 0.3400780356576799, + "learning_rate": 2.6308270680735414e-05, + "loss": 2.6602, + "step": 44551 + }, + { + "epoch": 2.0742137486323533, + "grad_norm": 0.35741554386146873, + "learning_rate": 2.6305885364529757e-05, + "loss": 2.6782, + "step": 44552 + }, + { + "epoch": 2.0742603068184464, + "grad_norm": 0.3453024961832085, + 
"learning_rate": 2.6303500117864038e-05, + "loss": 2.6987, + "step": 44553 + }, + { + "epoch": 2.0743068650045395, + "grad_norm": 0.3623162449738465, + "learning_rate": 2.6301114940745186e-05, + "loss": 2.6495, + "step": 44554 + }, + { + "epoch": 2.0743534231906327, + "grad_norm": 0.3366895546099206, + "learning_rate": 2.6298729833180224e-05, + "loss": 2.5946, + "step": 44555 + }, + { + "epoch": 2.0743999813767258, + "grad_norm": 0.35473062019079205, + "learning_rate": 2.6296344795176165e-05, + "loss": 2.6957, + "step": 44556 + }, + { + "epoch": 2.0744465395628184, + "grad_norm": 0.34266219968518014, + "learning_rate": 2.629395982673999e-05, + "loss": 2.598, + "step": 44557 + }, + { + "epoch": 2.0744930977489116, + "grad_norm": 0.3319710628595824, + "learning_rate": 2.6291574927878725e-05, + "loss": 2.6388, + "step": 44558 + }, + { + "epoch": 2.0745396559350047, + "grad_norm": 0.3244312924331804, + "learning_rate": 2.6289190098599343e-05, + "loss": 2.6058, + "step": 44559 + }, + { + "epoch": 2.074586214121098, + "grad_norm": 0.32942459086796955, + "learning_rate": 2.6286805338908847e-05, + "loss": 2.6619, + "step": 44560 + }, + { + "epoch": 2.074632772307191, + "grad_norm": 0.3641084721641029, + "learning_rate": 2.6284420648814234e-05, + "loss": 2.6406, + "step": 44561 + }, + { + "epoch": 2.074679330493284, + "grad_norm": 0.3270939609229711, + "learning_rate": 2.6282036028322544e-05, + "loss": 2.6702, + "step": 44562 + }, + { + "epoch": 2.074725888679377, + "grad_norm": 0.34954131636155655, + "learning_rate": 2.6279651477440693e-05, + "loss": 2.6468, + "step": 44563 + }, + { + "epoch": 2.0747724468654702, + "grad_norm": 0.352495319064033, + "learning_rate": 2.6277266996175765e-05, + "loss": 2.6481, + "step": 44564 + }, + { + "epoch": 2.0748190050515634, + "grad_norm": 0.32688159918784526, + "learning_rate": 2.6274882584534688e-05, + "loss": 2.6619, + "step": 44565 + }, + { + "epoch": 2.0748655632376565, + "grad_norm": 0.33431037197793795, + "learning_rate": 2.627249824252453e-05, + "loss": 2.5741, + "step": 44566 + }, + { + "epoch": 2.074912121423749, + "grad_norm": 0.31233951147559075, + "learning_rate": 2.6270113970152232e-05, + "loss": 2.6272, + "step": 44567 + }, + { + "epoch": 2.0749586796098423, + "grad_norm": 0.33235621310910335, + "learning_rate": 2.626772976742481e-05, + "loss": 2.6102, + "step": 44568 + }, + { + "epoch": 2.0750052377959354, + "grad_norm": 0.32958857275033654, + "learning_rate": 2.626534563434927e-05, + "loss": 2.6225, + "step": 44569 + }, + { + "epoch": 2.0750517959820285, + "grad_norm": 0.3232405063345131, + "learning_rate": 2.62629615709326e-05, + "loss": 2.7488, + "step": 44570 + }, + { + "epoch": 2.0750983541681216, + "grad_norm": 0.3098958865539323, + "learning_rate": 2.626057757718181e-05, + "loss": 2.6569, + "step": 44571 + }, + { + "epoch": 2.0751449123542147, + "grad_norm": 0.29896901903746526, + "learning_rate": 2.6258193653103873e-05, + "loss": 2.5935, + "step": 44572 + }, + { + "epoch": 2.075191470540308, + "grad_norm": 0.32722309028579377, + "learning_rate": 2.62558097987058e-05, + "loss": 2.5864, + "step": 44573 + }, + { + "epoch": 2.075238028726401, + "grad_norm": 0.3500135427380325, + "learning_rate": 2.6253426013994586e-05, + "loss": 2.6552, + "step": 44574 + }, + { + "epoch": 2.075284586912494, + "grad_norm": 0.3139853607264227, + "learning_rate": 2.6251042298977224e-05, + "loss": 2.6344, + "step": 44575 + }, + { + "epoch": 2.0753311450985867, + "grad_norm": 0.32827778534440616, + "learning_rate": 2.624865865366072e-05, + "loss": 2.5995, + 
"step": 44576 + }, + { + "epoch": 2.07537770328468, + "grad_norm": 0.3150030373367588, + "learning_rate": 2.6246275078052075e-05, + "loss": 2.7155, + "step": 44577 + }, + { + "epoch": 2.075424261470773, + "grad_norm": 0.31545862774156586, + "learning_rate": 2.6243891572158237e-05, + "loss": 2.6717, + "step": 44578 + }, + { + "epoch": 2.075470819656866, + "grad_norm": 0.355045481266487, + "learning_rate": 2.624150813598627e-05, + "loss": 2.7566, + "step": 44579 + }, + { + "epoch": 2.075517377842959, + "grad_norm": 0.3354030976389315, + "learning_rate": 2.623912476954312e-05, + "loss": 2.6512, + "step": 44580 + }, + { + "epoch": 2.0755639360290523, + "grad_norm": 0.31738625885192295, + "learning_rate": 2.62367414728358e-05, + "loss": 2.6003, + "step": 44581 + }, + { + "epoch": 2.0756104942151454, + "grad_norm": 0.2977121993836166, + "learning_rate": 2.6234358245871305e-05, + "loss": 2.6207, + "step": 44582 + }, + { + "epoch": 2.0756570524012385, + "grad_norm": 0.33397519839659623, + "learning_rate": 2.6231975088656625e-05, + "loss": 2.6541, + "step": 44583 + }, + { + "epoch": 2.0757036105873317, + "grad_norm": 0.3357351101877417, + "learning_rate": 2.6229592001198777e-05, + "loss": 2.6619, + "step": 44584 + }, + { + "epoch": 2.0757501687734248, + "grad_norm": 0.3202498342351946, + "learning_rate": 2.6227208983504714e-05, + "loss": 2.603, + "step": 44585 + }, + { + "epoch": 2.0757967269595174, + "grad_norm": 0.3504518016827592, + "learning_rate": 2.622482603558145e-05, + "loss": 2.7083, + "step": 44586 + }, + { + "epoch": 2.0758432851456106, + "grad_norm": 0.3221876848398983, + "learning_rate": 2.6222443157435984e-05, + "loss": 2.6991, + "step": 44587 + }, + { + "epoch": 2.0758898433317037, + "grad_norm": 0.3441929976028805, + "learning_rate": 2.62200603490753e-05, + "loss": 2.7374, + "step": 44588 + }, + { + "epoch": 2.075936401517797, + "grad_norm": 0.3574790347233786, + "learning_rate": 2.6217677610506398e-05, + "loss": 2.6195, + "step": 44589 + }, + { + "epoch": 2.07598295970389, + "grad_norm": 0.35008063915229043, + "learning_rate": 2.6215294941736286e-05, + "loss": 2.653, + "step": 44590 + }, + { + "epoch": 2.076029517889983, + "grad_norm": 0.33989773703422055, + "learning_rate": 2.62129123427719e-05, + "loss": 2.6934, + "step": 44591 + }, + { + "epoch": 2.076076076076076, + "grad_norm": 0.3409020936186885, + "learning_rate": 2.6210529813620315e-05, + "loss": 2.7082, + "step": 44592 + }, + { + "epoch": 2.0761226342621693, + "grad_norm": 0.3506658363834652, + "learning_rate": 2.620814735428846e-05, + "loss": 2.6376, + "step": 44593 + }, + { + "epoch": 2.0761691924482624, + "grad_norm": 0.3391923572509185, + "learning_rate": 2.620576496478335e-05, + "loss": 2.655, + "step": 44594 + }, + { + "epoch": 2.0762157506343555, + "grad_norm": 0.34947371010696787, + "learning_rate": 2.620338264511198e-05, + "loss": 2.6432, + "step": 44595 + }, + { + "epoch": 2.076262308820448, + "grad_norm": 0.32787473088065583, + "learning_rate": 2.6201000395281327e-05, + "loss": 2.5873, + "step": 44596 + }, + { + "epoch": 2.0763088670065413, + "grad_norm": 0.32858025716966277, + "learning_rate": 2.619861821529842e-05, + "loss": 2.6254, + "step": 44597 + }, + { + "epoch": 2.0763554251926344, + "grad_norm": 0.3575730429793154, + "learning_rate": 2.6196236105170206e-05, + "loss": 2.6133, + "step": 44598 + }, + { + "epoch": 2.0764019833787275, + "grad_norm": 0.3321786520806616, + "learning_rate": 2.619385406490369e-05, + "loss": 2.6624, + "step": 44599 + }, + { + "epoch": 2.0764485415648206, + "grad_norm": 
0.32439788937474123, + "learning_rate": 2.619147209450587e-05, + "loss": 2.6305, + "step": 44600 + }, + { + "epoch": 2.0764950997509137, + "grad_norm": 0.3376206772257453, + "learning_rate": 2.6189090193983734e-05, + "loss": 2.7184, + "step": 44601 + }, + { + "epoch": 2.076541657937007, + "grad_norm": 0.34039272441687257, + "learning_rate": 2.618670836334427e-05, + "loss": 2.5879, + "step": 44602 + }, + { + "epoch": 2.0765882161231, + "grad_norm": 0.3322021009020047, + "learning_rate": 2.618432660259449e-05, + "loss": 2.6739, + "step": 44603 + }, + { + "epoch": 2.076634774309193, + "grad_norm": 0.361342961178352, + "learning_rate": 2.618194491174133e-05, + "loss": 2.6518, + "step": 44604 + }, + { + "epoch": 2.076681332495286, + "grad_norm": 0.3355440685156244, + "learning_rate": 2.6179563290791853e-05, + "loss": 2.7046, + "step": 44605 + }, + { + "epoch": 2.076727890681379, + "grad_norm": 0.35947986597401044, + "learning_rate": 2.6177181739752988e-05, + "loss": 2.6874, + "step": 44606 + }, + { + "epoch": 2.076774448867472, + "grad_norm": 0.33798079301175876, + "learning_rate": 2.6174800258631748e-05, + "loss": 2.5656, + "step": 44607 + }, + { + "epoch": 2.076821007053565, + "grad_norm": 0.34179931274479997, + "learning_rate": 2.617241884743512e-05, + "loss": 2.6386, + "step": 44608 + }, + { + "epoch": 2.076867565239658, + "grad_norm": 0.35721305431930467, + "learning_rate": 2.6170037506170104e-05, + "loss": 2.7275, + "step": 44609 + }, + { + "epoch": 2.0769141234257513, + "grad_norm": 0.3682776618579135, + "learning_rate": 2.6167656234843696e-05, + "loss": 2.7204, + "step": 44610 + }, + { + "epoch": 2.0769606816118444, + "grad_norm": 0.3233083905115786, + "learning_rate": 2.6165275033462826e-05, + "loss": 2.6387, + "step": 44611 + }, + { + "epoch": 2.0770072397979376, + "grad_norm": 0.34219786028232346, + "learning_rate": 2.6162893902034568e-05, + "loss": 2.6998, + "step": 44612 + }, + { + "epoch": 2.0770537979840307, + "grad_norm": 0.3476026782433178, + "learning_rate": 2.6160512840565848e-05, + "loss": 2.6316, + "step": 44613 + }, + { + "epoch": 2.077100356170124, + "grad_norm": 0.3360592501536425, + "learning_rate": 2.6158131849063676e-05, + "loss": 2.5816, + "step": 44614 + }, + { + "epoch": 2.0771469143562165, + "grad_norm": 0.36309313300557855, + "learning_rate": 2.6155750927535038e-05, + "loss": 2.6371, + "step": 44615 + }, + { + "epoch": 2.0771934725423096, + "grad_norm": 0.3369562761649273, + "learning_rate": 2.6153370075986937e-05, + "loss": 2.6484, + "step": 44616 + }, + { + "epoch": 2.0772400307284027, + "grad_norm": 0.3390370573998553, + "learning_rate": 2.615098929442631e-05, + "loss": 2.6287, + "step": 44617 + }, + { + "epoch": 2.077286588914496, + "grad_norm": 0.33844625610582063, + "learning_rate": 2.6148608582860224e-05, + "loss": 2.7583, + "step": 44618 + }, + { + "epoch": 2.077333147100589, + "grad_norm": 0.3262648391576185, + "learning_rate": 2.6146227941295593e-05, + "loss": 2.6333, + "step": 44619 + }, + { + "epoch": 2.077379705286682, + "grad_norm": 0.37564950213433695, + "learning_rate": 2.6143847369739437e-05, + "loss": 2.5868, + "step": 44620 + }, + { + "epoch": 2.077426263472775, + "grad_norm": 0.34884891421708536, + "learning_rate": 2.614146686819874e-05, + "loss": 2.6471, + "step": 44621 + }, + { + "epoch": 2.0774728216588683, + "grad_norm": 0.3578339069185393, + "learning_rate": 2.613908643668048e-05, + "loss": 2.6872, + "step": 44622 + }, + { + "epoch": 2.0775193798449614, + "grad_norm": 0.33268346663956, + "learning_rate": 2.613670607519168e-05, + "loss": 
2.6218, + "step": 44623 + }, + { + "epoch": 2.0775659380310545, + "grad_norm": 0.34492322163641026, + "learning_rate": 2.6134325783739254e-05, + "loss": 2.5652, + "step": 44624 + }, + { + "epoch": 2.0776124962171476, + "grad_norm": 0.3717884299513559, + "learning_rate": 2.6131945562330272e-05, + "loss": 2.679, + "step": 44625 + }, + { + "epoch": 2.0776590544032403, + "grad_norm": 0.3504494622160751, + "learning_rate": 2.6129565410971657e-05, + "loss": 2.705, + "step": 44626 + }, + { + "epoch": 2.0777056125893334, + "grad_norm": 0.36881375099306984, + "learning_rate": 2.6127185329670412e-05, + "loss": 2.6153, + "step": 44627 + }, + { + "epoch": 2.0777521707754265, + "grad_norm": 0.33861149707005667, + "learning_rate": 2.612480531843353e-05, + "loss": 2.655, + "step": 44628 + }, + { + "epoch": 2.0777987289615196, + "grad_norm": 0.3434738255964185, + "learning_rate": 2.6122425377267994e-05, + "loss": 2.5899, + "step": 44629 + }, + { + "epoch": 2.0778452871476127, + "grad_norm": 0.3273840329927241, + "learning_rate": 2.6120045506180784e-05, + "loss": 2.7233, + "step": 44630 + }, + { + "epoch": 2.077891845333706, + "grad_norm": 0.37018149332330214, + "learning_rate": 2.6117665705178906e-05, + "loss": 2.6887, + "step": 44631 + }, + { + "epoch": 2.077938403519799, + "grad_norm": 0.35237620869802994, + "learning_rate": 2.611528597426931e-05, + "loss": 2.7356, + "step": 44632 + }, + { + "epoch": 2.077984961705892, + "grad_norm": 0.321458300882852, + "learning_rate": 2.6112906313458994e-05, + "loss": 2.7563, + "step": 44633 + }, + { + "epoch": 2.078031519891985, + "grad_norm": 0.3471754629075443, + "learning_rate": 2.6110526722754953e-05, + "loss": 2.7052, + "step": 44634 + }, + { + "epoch": 2.078078078078078, + "grad_norm": 0.3229261012948346, + "learning_rate": 2.6108147202164156e-05, + "loss": 2.664, + "step": 44635 + }, + { + "epoch": 2.078124636264171, + "grad_norm": 0.3589290570050179, + "learning_rate": 2.6105767751693612e-05, + "loss": 2.6678, + "step": 44636 + }, + { + "epoch": 2.078171194450264, + "grad_norm": 0.3414375286148671, + "learning_rate": 2.6103388371350247e-05, + "loss": 2.6982, + "step": 44637 + }, + { + "epoch": 2.078217752636357, + "grad_norm": 0.334387132337363, + "learning_rate": 2.6101009061141123e-05, + "loss": 2.6467, + "step": 44638 + }, + { + "epoch": 2.0782643108224503, + "grad_norm": 0.348579304058968, + "learning_rate": 2.6098629821073163e-05, + "loss": 2.6453, + "step": 44639 + }, + { + "epoch": 2.0783108690085434, + "grad_norm": 0.347852308630755, + "learning_rate": 2.609625065115337e-05, + "loss": 2.5914, + "step": 44640 + }, + { + "epoch": 2.0783574271946366, + "grad_norm": 0.3353840855540775, + "learning_rate": 2.6093871551388726e-05, + "loss": 2.614, + "step": 44641 + }, + { + "epoch": 2.0784039853807297, + "grad_norm": 0.32181222762407363, + "learning_rate": 2.6091492521786215e-05, + "loss": 2.5774, + "step": 44642 + }, + { + "epoch": 2.078450543566823, + "grad_norm": 0.3304034118941013, + "learning_rate": 2.6089113562352817e-05, + "loss": 2.5748, + "step": 44643 + }, + { + "epoch": 2.078497101752916, + "grad_norm": 0.3389891832437359, + "learning_rate": 2.6086734673095536e-05, + "loss": 2.5113, + "step": 44644 + }, + { + "epoch": 2.0785436599390086, + "grad_norm": 0.34344990535252756, + "learning_rate": 2.6084355854021315e-05, + "loss": 2.6313, + "step": 44645 + }, + { + "epoch": 2.0785902181251017, + "grad_norm": 0.32411396273604964, + "learning_rate": 2.6081977105137144e-05, + "loss": 2.5229, + "step": 44646 + }, + { + "epoch": 2.078636776311195, + 
"grad_norm": 0.32301343068171307, + "learning_rate": 2.6079598426450025e-05, + "loss": 2.623, + "step": 44647 + }, + { + "epoch": 2.078683334497288, + "grad_norm": 0.32733011927727584, + "learning_rate": 2.6077219817966924e-05, + "loss": 2.5883, + "step": 44648 + }, + { + "epoch": 2.078729892683381, + "grad_norm": 0.32646671425450513, + "learning_rate": 2.6074841279694838e-05, + "loss": 2.6242, + "step": 44649 + }, + { + "epoch": 2.078776450869474, + "grad_norm": 0.33472088488239304, + "learning_rate": 2.6072462811640706e-05, + "loss": 2.6342, + "step": 44650 + }, + { + "epoch": 2.0788230090555673, + "grad_norm": 0.32433727149095665, + "learning_rate": 2.607008441381158e-05, + "loss": 2.6806, + "step": 44651 + }, + { + "epoch": 2.0788695672416604, + "grad_norm": 0.3192068457417257, + "learning_rate": 2.6067706086214372e-05, + "loss": 2.5996, + "step": 44652 + }, + { + "epoch": 2.0789161254277535, + "grad_norm": 0.3403075425722908, + "learning_rate": 2.606532782885609e-05, + "loss": 2.5574, + "step": 44653 + }, + { + "epoch": 2.0789626836138466, + "grad_norm": 0.3353560106511256, + "learning_rate": 2.6062949641743716e-05, + "loss": 2.6405, + "step": 44654 + }, + { + "epoch": 2.0790092417999393, + "grad_norm": 0.35431817050774683, + "learning_rate": 2.6060571524884227e-05, + "loss": 2.5322, + "step": 44655 + }, + { + "epoch": 2.0790557999860324, + "grad_norm": 0.3377476670678414, + "learning_rate": 2.60581934782846e-05, + "loss": 2.6785, + "step": 44656 + }, + { + "epoch": 2.0791023581721255, + "grad_norm": 0.32048478233595645, + "learning_rate": 2.6055815501951836e-05, + "loss": 2.5797, + "step": 44657 + }, + { + "epoch": 2.0791489163582186, + "grad_norm": 0.3506615085605543, + "learning_rate": 2.605343759589287e-05, + "loss": 2.7568, + "step": 44658 + }, + { + "epoch": 2.0791954745443118, + "grad_norm": 0.35569076475303607, + "learning_rate": 2.6051059760114715e-05, + "loss": 2.6578, + "step": 44659 + }, + { + "epoch": 2.079242032730405, + "grad_norm": 0.34094861417566175, + "learning_rate": 2.6048681994624337e-05, + "loss": 2.6656, + "step": 44660 + }, + { + "epoch": 2.079288590916498, + "grad_norm": 0.337399231885201, + "learning_rate": 2.6046304299428714e-05, + "loss": 2.7465, + "step": 44661 + }, + { + "epoch": 2.079335149102591, + "grad_norm": 0.3274967842271086, + "learning_rate": 2.604392667453485e-05, + "loss": 2.6732, + "step": 44662 + }, + { + "epoch": 2.079381707288684, + "grad_norm": 0.3499614897795218, + "learning_rate": 2.6041549119949666e-05, + "loss": 2.5768, + "step": 44663 + }, + { + "epoch": 2.0794282654747773, + "grad_norm": 0.3473173624727325, + "learning_rate": 2.603917163568021e-05, + "loss": 2.616, + "step": 44664 + }, + { + "epoch": 2.07947482366087, + "grad_norm": 0.3583377263051228, + "learning_rate": 2.6036794221733386e-05, + "loss": 2.7488, + "step": 44665 + }, + { + "epoch": 2.079521381846963, + "grad_norm": 0.33483500286382, + "learning_rate": 2.603441687811625e-05, + "loss": 2.6307, + "step": 44666 + }, + { + "epoch": 2.0795679400330562, + "grad_norm": 0.3330214603862901, + "learning_rate": 2.603203960483572e-05, + "loss": 2.6876, + "step": 44667 + }, + { + "epoch": 2.0796144982191493, + "grad_norm": 0.3485528320470635, + "learning_rate": 2.6029662401898792e-05, + "loss": 2.4822, + "step": 44668 + }, + { + "epoch": 2.0796610564052425, + "grad_norm": 0.34676757339565195, + "learning_rate": 2.602728526931244e-05, + "loss": 2.7113, + "step": 44669 + }, + { + "epoch": 2.0797076145913356, + "grad_norm": 0.32340307354519315, + "learning_rate": 
2.6024908207083666e-05, + "loss": 2.6838, + "step": 44670 + }, + { + "epoch": 2.0797541727774287, + "grad_norm": 0.38247615273111557, + "learning_rate": 2.6022531215219405e-05, + "loss": 2.6315, + "step": 44671 + }, + { + "epoch": 2.079800730963522, + "grad_norm": 0.34999821153308813, + "learning_rate": 2.6020154293726655e-05, + "loss": 2.6797, + "step": 44672 + }, + { + "epoch": 2.079847289149615, + "grad_norm": 0.3536963936954163, + "learning_rate": 2.601777744261239e-05, + "loss": 2.7044, + "step": 44673 + }, + { + "epoch": 2.0798938473357076, + "grad_norm": 0.34586517518320625, + "learning_rate": 2.601540066188358e-05, + "loss": 2.6755, + "step": 44674 + }, + { + "epoch": 2.0799404055218007, + "grad_norm": 0.3380905047753905, + "learning_rate": 2.6013023951547234e-05, + "loss": 2.6464, + "step": 44675 + }, + { + "epoch": 2.079986963707894, + "grad_norm": 0.33999476331297634, + "learning_rate": 2.6010647311610255e-05, + "loss": 2.6466, + "step": 44676 + }, + { + "epoch": 2.080033521893987, + "grad_norm": 0.35103131546001565, + "learning_rate": 2.6008270742079698e-05, + "loss": 2.6594, + "step": 44677 + }, + { + "epoch": 2.08008008008008, + "grad_norm": 0.32965465795620946, + "learning_rate": 2.600589424296247e-05, + "loss": 2.678, + "step": 44678 + }, + { + "epoch": 2.080126638266173, + "grad_norm": 0.3284718294770659, + "learning_rate": 2.6003517814265623e-05, + "loss": 2.6866, + "step": 44679 + }, + { + "epoch": 2.0801731964522663, + "grad_norm": 0.347957570833146, + "learning_rate": 2.600114145599606e-05, + "loss": 2.5593, + "step": 44680 + }, + { + "epoch": 2.0802197546383594, + "grad_norm": 0.3286238085999955, + "learning_rate": 2.5998765168160777e-05, + "loss": 2.5806, + "step": 44681 + }, + { + "epoch": 2.0802663128244525, + "grad_norm": 0.34179627938592894, + "learning_rate": 2.5996388950766766e-05, + "loss": 2.7607, + "step": 44682 + }, + { + "epoch": 2.0803128710105456, + "grad_norm": 0.310690715584304, + "learning_rate": 2.5994012803820977e-05, + "loss": 2.5857, + "step": 44683 + }, + { + "epoch": 2.0803594291966383, + "grad_norm": 0.33663836717344064, + "learning_rate": 2.5991636727330426e-05, + "loss": 2.6651, + "step": 44684 + }, + { + "epoch": 2.0804059873827314, + "grad_norm": 0.32319024058094104, + "learning_rate": 2.5989260721302032e-05, + "loss": 2.5869, + "step": 44685 + }, + { + "epoch": 2.0804525455688245, + "grad_norm": 0.3306764717142312, + "learning_rate": 2.5986884785742788e-05, + "loss": 2.6458, + "step": 44686 + }, + { + "epoch": 2.0804991037549176, + "grad_norm": 0.348385480228165, + "learning_rate": 2.5984508920659677e-05, + "loss": 2.6547, + "step": 44687 + }, + { + "epoch": 2.0805456619410108, + "grad_norm": 0.3233705662755279, + "learning_rate": 2.5982133126059682e-05, + "loss": 2.5756, + "step": 44688 + }, + { + "epoch": 2.080592220127104, + "grad_norm": 0.3444034944061645, + "learning_rate": 2.597975740194972e-05, + "loss": 2.6616, + "step": 44689 + }, + { + "epoch": 2.080638778313197, + "grad_norm": 0.3355786835989822, + "learning_rate": 2.597738174833685e-05, + "loss": 2.6655, + "step": 44690 + }, + { + "epoch": 2.08068533649929, + "grad_norm": 0.32015132089718246, + "learning_rate": 2.5975006165227957e-05, + "loss": 2.5576, + "step": 44691 + }, + { + "epoch": 2.080731894685383, + "grad_norm": 0.3493973455053189, + "learning_rate": 2.597263065263009e-05, + "loss": 2.5902, + "step": 44692 + }, + { + "epoch": 2.0807784528714763, + "grad_norm": 0.35985678900277634, + "learning_rate": 2.5970255210550166e-05, + "loss": 2.7275, + "step": 44693 + }, + { + 
"epoch": 2.080825011057569, + "grad_norm": 0.35296642438364356, + "learning_rate": 2.5967879838995172e-05, + "loss": 2.6617, + "step": 44694 + }, + { + "epoch": 2.080871569243662, + "grad_norm": 0.33423641140326027, + "learning_rate": 2.5965504537972083e-05, + "loss": 2.5954, + "step": 44695 + }, + { + "epoch": 2.0809181274297552, + "grad_norm": 0.3091485087719317, + "learning_rate": 2.5963129307487876e-05, + "loss": 2.6309, + "step": 44696 + }, + { + "epoch": 2.0809646856158484, + "grad_norm": 0.32334689569717673, + "learning_rate": 2.5960754147549536e-05, + "loss": 2.6259, + "step": 44697 + }, + { + "epoch": 2.0810112438019415, + "grad_norm": 0.358409129403731, + "learning_rate": 2.5958379058163995e-05, + "loss": 2.6771, + "step": 44698 + }, + { + "epoch": 2.0810578019880346, + "grad_norm": 0.3237063865963714, + "learning_rate": 2.5956004039338244e-05, + "loss": 2.5859, + "step": 44699 + }, + { + "epoch": 2.0811043601741277, + "grad_norm": 0.3332764916221847, + "learning_rate": 2.5953629091079245e-05, + "loss": 2.5776, + "step": 44700 + }, + { + "epoch": 2.081150918360221, + "grad_norm": 0.32438925761237297, + "learning_rate": 2.5951254213393984e-05, + "loss": 2.6412, + "step": 44701 + }, + { + "epoch": 2.081197476546314, + "grad_norm": 0.340460156398388, + "learning_rate": 2.5948879406289417e-05, + "loss": 2.6009, + "step": 44702 + }, + { + "epoch": 2.081244034732407, + "grad_norm": 0.31467432426230896, + "learning_rate": 2.5946504669772544e-05, + "loss": 2.641, + "step": 44703 + }, + { + "epoch": 2.0812905929184997, + "grad_norm": 0.3517625762435386, + "learning_rate": 2.5944130003850263e-05, + "loss": 2.7353, + "step": 44704 + }, + { + "epoch": 2.081337151104593, + "grad_norm": 0.32744884418121495, + "learning_rate": 2.5941755408529635e-05, + "loss": 2.7271, + "step": 44705 + }, + { + "epoch": 2.081383709290686, + "grad_norm": 0.32864559696169454, + "learning_rate": 2.5939380883817565e-05, + "loss": 2.522, + "step": 44706 + }, + { + "epoch": 2.081430267476779, + "grad_norm": 0.31888698890858913, + "learning_rate": 2.5937006429721043e-05, + "loss": 2.6316, + "step": 44707 + }, + { + "epoch": 2.081476825662872, + "grad_norm": 0.3189419447054601, + "learning_rate": 2.593463204624703e-05, + "loss": 2.6791, + "step": 44708 + }, + { + "epoch": 2.0815233838489653, + "grad_norm": 0.33297670358960557, + "learning_rate": 2.593225773340251e-05, + "loss": 2.7369, + "step": 44709 + }, + { + "epoch": 2.0815699420350584, + "grad_norm": 0.32176799665876993, + "learning_rate": 2.592988349119446e-05, + "loss": 2.5841, + "step": 44710 + }, + { + "epoch": 2.0816165002211515, + "grad_norm": 0.33561250681725646, + "learning_rate": 2.5927509319629807e-05, + "loss": 2.631, + "step": 44711 + }, + { + "epoch": 2.0816630584072446, + "grad_norm": 0.33448298133021764, + "learning_rate": 2.5925135218715544e-05, + "loss": 2.7584, + "step": 44712 + }, + { + "epoch": 2.0817096165933373, + "grad_norm": 0.3220351996465307, + "learning_rate": 2.5922761188458643e-05, + "loss": 2.629, + "step": 44713 + }, + { + "epoch": 2.0817561747794304, + "grad_norm": 0.3297866817469179, + "learning_rate": 2.592038722886606e-05, + "loss": 2.706, + "step": 44714 + }, + { + "epoch": 2.0818027329655235, + "grad_norm": 0.339215108632467, + "learning_rate": 2.5918013339944768e-05, + "loss": 2.5713, + "step": 44715 + }, + { + "epoch": 2.0818492911516167, + "grad_norm": 0.33417102622378053, + "learning_rate": 2.591563952170175e-05, + "loss": 2.6407, + "step": 44716 + }, + { + "epoch": 2.0818958493377098, + "grad_norm": 0.35419045334387644, + 
"learning_rate": 2.5913265774143925e-05, + "loss": 2.6912, + "step": 44717 + }, + { + "epoch": 2.081942407523803, + "grad_norm": 0.33202307172815365, + "learning_rate": 2.5910892097278333e-05, + "loss": 2.5546, + "step": 44718 + }, + { + "epoch": 2.081988965709896, + "grad_norm": 0.31428430281449804, + "learning_rate": 2.5908518491111855e-05, + "loss": 2.6732, + "step": 44719 + }, + { + "epoch": 2.082035523895989, + "grad_norm": 0.36670305252632807, + "learning_rate": 2.5906144955651546e-05, + "loss": 2.7726, + "step": 44720 + }, + { + "epoch": 2.0820820820820822, + "grad_norm": 0.3286052362641173, + "learning_rate": 2.5903771490904306e-05, + "loss": 2.5825, + "step": 44721 + }, + { + "epoch": 2.0821286402681753, + "grad_norm": 0.32093686241573144, + "learning_rate": 2.5901398096877124e-05, + "loss": 2.6379, + "step": 44722 + }, + { + "epoch": 2.082175198454268, + "grad_norm": 0.35609632222935794, + "learning_rate": 2.589902477357698e-05, + "loss": 2.7995, + "step": 44723 + }, + { + "epoch": 2.082221756640361, + "grad_norm": 0.3500597001507245, + "learning_rate": 2.5896651521010808e-05, + "loss": 2.6819, + "step": 44724 + }, + { + "epoch": 2.0822683148264542, + "grad_norm": 0.3158035291255412, + "learning_rate": 2.5894278339185586e-05, + "loss": 2.5194, + "step": 44725 + }, + { + "epoch": 2.0823148730125474, + "grad_norm": 0.35229744854502154, + "learning_rate": 2.5891905228108282e-05, + "loss": 2.6672, + "step": 44726 + }, + { + "epoch": 2.0823614311986405, + "grad_norm": 0.34889598448062875, + "learning_rate": 2.5889532187785864e-05, + "loss": 2.7094, + "step": 44727 + }, + { + "epoch": 2.0824079893847336, + "grad_norm": 0.35839196347103064, + "learning_rate": 2.5887159218225292e-05, + "loss": 2.7772, + "step": 44728 + }, + { + "epoch": 2.0824545475708267, + "grad_norm": 0.34734082069732675, + "learning_rate": 2.588478631943355e-05, + "loss": 2.7391, + "step": 44729 + }, + { + "epoch": 2.08250110575692, + "grad_norm": 0.3296580054514756, + "learning_rate": 2.5882413491417552e-05, + "loss": 2.6517, + "step": 44730 + }, + { + "epoch": 2.082547663943013, + "grad_norm": 0.3399818477821583, + "learning_rate": 2.588004073418433e-05, + "loss": 2.6924, + "step": 44731 + }, + { + "epoch": 2.082594222129106, + "grad_norm": 0.37840401216585395, + "learning_rate": 2.5877668047740773e-05, + "loss": 2.5185, + "step": 44732 + }, + { + "epoch": 2.0826407803151987, + "grad_norm": 0.31889388130575075, + "learning_rate": 2.587529543209392e-05, + "loss": 2.5862, + "step": 44733 + }, + { + "epoch": 2.082687338501292, + "grad_norm": 0.3384363879790449, + "learning_rate": 2.5872922887250683e-05, + "loss": 2.6409, + "step": 44734 + }, + { + "epoch": 2.082733896687385, + "grad_norm": 0.3486592225621081, + "learning_rate": 2.5870550413218043e-05, + "loss": 2.6136, + "step": 44735 + }, + { + "epoch": 2.082780454873478, + "grad_norm": 0.342816525910018, + "learning_rate": 2.5868178010002975e-05, + "loss": 2.6554, + "step": 44736 + }, + { + "epoch": 2.082827013059571, + "grad_norm": 0.32157332409969563, + "learning_rate": 2.5865805677612388e-05, + "loss": 2.5964, + "step": 44737 + }, + { + "epoch": 2.0828735712456643, + "grad_norm": 0.3304878264187811, + "learning_rate": 2.5863433416053318e-05, + "loss": 2.7255, + "step": 44738 + }, + { + "epoch": 2.0829201294317574, + "grad_norm": 0.33437229159289106, + "learning_rate": 2.5861061225332684e-05, + "loss": 2.5973, + "step": 44739 + }, + { + "epoch": 2.0829666876178505, + "grad_norm": 0.35280770589128924, + "learning_rate": 2.5858689105457445e-05, + "loss": 2.7377, + 
"step": 44740 + }, + { + "epoch": 2.0830132458039436, + "grad_norm": 0.33920314556294356, + "learning_rate": 2.5856317056434588e-05, + "loss": 2.6741, + "step": 44741 + }, + { + "epoch": 2.0830598039900368, + "grad_norm": 0.3150565422277991, + "learning_rate": 2.5853945078271075e-05, + "loss": 2.6108, + "step": 44742 + }, + { + "epoch": 2.0831063621761294, + "grad_norm": 0.3378351587311021, + "learning_rate": 2.585157317097382e-05, + "loss": 2.6098, + "step": 44743 + }, + { + "epoch": 2.0831529203622225, + "grad_norm": 0.33811902900833485, + "learning_rate": 2.584920133454985e-05, + "loss": 2.6224, + "step": 44744 + }, + { + "epoch": 2.0831994785483157, + "grad_norm": 0.331786841949608, + "learning_rate": 2.584682956900606e-05, + "loss": 2.5546, + "step": 44745 + }, + { + "epoch": 2.083246036734409, + "grad_norm": 0.3519601528063176, + "learning_rate": 2.584445787434948e-05, + "loss": 2.7202, + "step": 44746 + }, + { + "epoch": 2.083292594920502, + "grad_norm": 0.34378491692816504, + "learning_rate": 2.5842086250587026e-05, + "loss": 2.6396, + "step": 44747 + }, + { + "epoch": 2.083339153106595, + "grad_norm": 0.3537246671100222, + "learning_rate": 2.5839714697725665e-05, + "loss": 2.6377, + "step": 44748 + }, + { + "epoch": 2.083385711292688, + "grad_norm": 0.36141155069641806, + "learning_rate": 2.5837343215772382e-05, + "loss": 2.6759, + "step": 44749 + }, + { + "epoch": 2.0834322694787812, + "grad_norm": 0.3410078469948182, + "learning_rate": 2.583497180473407e-05, + "loss": 2.4941, + "step": 44750 + }, + { + "epoch": 2.0834788276648744, + "grad_norm": 0.35995651408067053, + "learning_rate": 2.5832600464617786e-05, + "loss": 2.7205, + "step": 44751 + }, + { + "epoch": 2.083525385850967, + "grad_norm": 0.3584959287958286, + "learning_rate": 2.583022919543041e-05, + "loss": 2.6614, + "step": 44752 + }, + { + "epoch": 2.08357194403706, + "grad_norm": 0.3349306951080318, + "learning_rate": 2.582785799717893e-05, + "loss": 2.6069, + "step": 44753 + }, + { + "epoch": 2.0836185022231533, + "grad_norm": 0.3943122708066332, + "learning_rate": 2.5825486869870308e-05, + "loss": 2.7154, + "step": 44754 + }, + { + "epoch": 2.0836650604092464, + "grad_norm": 0.33674447194514223, + "learning_rate": 2.58231158135115e-05, + "loss": 2.6455, + "step": 44755 + }, + { + "epoch": 2.0837116185953395, + "grad_norm": 0.3541686552876096, + "learning_rate": 2.5820744828109468e-05, + "loss": 2.6359, + "step": 44756 + }, + { + "epoch": 2.0837581767814326, + "grad_norm": 0.33560212007232965, + "learning_rate": 2.5818373913671184e-05, + "loss": 2.5725, + "step": 44757 + }, + { + "epoch": 2.0838047349675257, + "grad_norm": 0.3663488091378466, + "learning_rate": 2.5816003070203554e-05, + "loss": 2.683, + "step": 44758 + }, + { + "epoch": 2.083851293153619, + "grad_norm": 0.3507819810651512, + "learning_rate": 2.581363229771361e-05, + "loss": 2.6361, + "step": 44759 + }, + { + "epoch": 2.083897851339712, + "grad_norm": 0.35360314705199986, + "learning_rate": 2.581126159620825e-05, + "loss": 2.5565, + "step": 44760 + }, + { + "epoch": 2.083944409525805, + "grad_norm": 0.378776124571382, + "learning_rate": 2.580889096569446e-05, + "loss": 2.6178, + "step": 44761 + }, + { + "epoch": 2.0839909677118977, + "grad_norm": 0.3537626907997494, + "learning_rate": 2.580652040617921e-05, + "loss": 2.7174, + "step": 44762 + }, + { + "epoch": 2.084037525897991, + "grad_norm": 0.34405895942838693, + "learning_rate": 2.5804149917669396e-05, + "loss": 2.6777, + "step": 44763 + }, + { + "epoch": 2.084084084084084, + "grad_norm": 
0.3515858813725888, + "learning_rate": 2.5801779500172064e-05, + "loss": 2.6661, + "step": 44764 + }, + { + "epoch": 2.084130642270177, + "grad_norm": 0.3719279183774414, + "learning_rate": 2.5799409153694098e-05, + "loss": 2.6938, + "step": 44765 + }, + { + "epoch": 2.08417720045627, + "grad_norm": 0.3509790299158265, + "learning_rate": 2.5797038878242484e-05, + "loss": 2.6804, + "step": 44766 + }, + { + "epoch": 2.0842237586423633, + "grad_norm": 0.34429801667789856, + "learning_rate": 2.579466867382418e-05, + "loss": 2.5488, + "step": 44767 + }, + { + "epoch": 2.0842703168284564, + "grad_norm": 0.3175441587596933, + "learning_rate": 2.5792298540446135e-05, + "loss": 2.5764, + "step": 44768 + }, + { + "epoch": 2.0843168750145495, + "grad_norm": 0.37981983756608656, + "learning_rate": 2.578992847811531e-05, + "loss": 2.6324, + "step": 44769 + }, + { + "epoch": 2.0843634332006427, + "grad_norm": 0.34612563640839616, + "learning_rate": 2.578755848683868e-05, + "loss": 2.708, + "step": 44770 + }, + { + "epoch": 2.0844099913867358, + "grad_norm": 0.3418215815562173, + "learning_rate": 2.5785188566623143e-05, + "loss": 2.6584, + "step": 44771 + }, + { + "epoch": 2.0844565495728284, + "grad_norm": 0.33971691004236476, + "learning_rate": 2.5782818717475733e-05, + "loss": 2.6287, + "step": 44772 + }, + { + "epoch": 2.0845031077589216, + "grad_norm": 0.3406046750432444, + "learning_rate": 2.578044893940334e-05, + "loss": 2.5975, + "step": 44773 + }, + { + "epoch": 2.0845496659450147, + "grad_norm": 0.3506287118526518, + "learning_rate": 2.5778079232412944e-05, + "loss": 2.6629, + "step": 44774 + }, + { + "epoch": 2.084596224131108, + "grad_norm": 0.3546598660229565, + "learning_rate": 2.577570959651152e-05, + "loss": 2.6489, + "step": 44775 + }, + { + "epoch": 2.084642782317201, + "grad_norm": 0.36032364864118016, + "learning_rate": 2.577334003170596e-05, + "loss": 2.7057, + "step": 44776 + }, + { + "epoch": 2.084689340503294, + "grad_norm": 0.34874428077088754, + "learning_rate": 2.5770970538003302e-05, + "loss": 2.6221, + "step": 44777 + }, + { + "epoch": 2.084735898689387, + "grad_norm": 0.3575897834098419, + "learning_rate": 2.576860111541044e-05, + "loss": 2.7137, + "step": 44778 + }, + { + "epoch": 2.0847824568754802, + "grad_norm": 0.3586680042208159, + "learning_rate": 2.5766231763934345e-05, + "loss": 2.7249, + "step": 44779 + }, + { + "epoch": 2.0848290150615734, + "grad_norm": 0.3496438693590617, + "learning_rate": 2.5763862483581974e-05, + "loss": 2.657, + "step": 44780 + }, + { + "epoch": 2.0848755732476665, + "grad_norm": 0.4012999223353975, + "learning_rate": 2.5761493274360282e-05, + "loss": 2.7112, + "step": 44781 + }, + { + "epoch": 2.084922131433759, + "grad_norm": 0.34600940704240446, + "learning_rate": 2.5759124136276215e-05, + "loss": 2.735, + "step": 44782 + }, + { + "epoch": 2.0849686896198523, + "grad_norm": 0.36441713785595875, + "learning_rate": 2.5756755069336746e-05, + "loss": 2.6961, + "step": 44783 + }, + { + "epoch": 2.0850152478059454, + "grad_norm": 0.33927695893693727, + "learning_rate": 2.5754386073548776e-05, + "loss": 2.7321, + "step": 44784 + }, + { + "epoch": 2.0850618059920385, + "grad_norm": 0.3314219259168334, + "learning_rate": 2.575201714891934e-05, + "loss": 2.6951, + "step": 44785 + }, + { + "epoch": 2.0851083641781316, + "grad_norm": 0.35636443208627455, + "learning_rate": 2.574964829545532e-05, + "loss": 2.7444, + "step": 44786 + }, + { + "epoch": 2.0851549223642247, + "grad_norm": 0.3245645118790876, + "learning_rate": 2.5747279513163692e-05, + 
"loss": 2.5962, + "step": 44787 + }, + { + "epoch": 2.085201480550318, + "grad_norm": 0.3219098185560227, + "learning_rate": 2.5744910802051424e-05, + "loss": 2.5421, + "step": 44788 + }, + { + "epoch": 2.085248038736411, + "grad_norm": 0.33838594159487645, + "learning_rate": 2.5742542162125428e-05, + "loss": 2.637, + "step": 44789 + }, + { + "epoch": 2.085294596922504, + "grad_norm": 0.35103493074591574, + "learning_rate": 2.5740173593392713e-05, + "loss": 2.5938, + "step": 44790 + }, + { + "epoch": 2.0853411551085967, + "grad_norm": 0.3483117561619744, + "learning_rate": 2.573780509586016e-05, + "loss": 2.729, + "step": 44791 + }, + { + "epoch": 2.08538771329469, + "grad_norm": 0.33351909983475325, + "learning_rate": 2.5735436669534797e-05, + "loss": 2.7078, + "step": 44792 + }, + { + "epoch": 2.085434271480783, + "grad_norm": 0.34596735898636394, + "learning_rate": 2.5733068314423514e-05, + "loss": 2.6594, + "step": 44793 + }, + { + "epoch": 2.085480829666876, + "grad_norm": 0.3289454961352293, + "learning_rate": 2.5730700030533282e-05, + "loss": 2.6277, + "step": 44794 + }, + { + "epoch": 2.085527387852969, + "grad_norm": 0.3296491551936581, + "learning_rate": 2.5728331817871064e-05, + "loss": 2.6182, + "step": 44795 + }, + { + "epoch": 2.0855739460390623, + "grad_norm": 0.3440843662995334, + "learning_rate": 2.5725963676443798e-05, + "loss": 2.6918, + "step": 44796 + }, + { + "epoch": 2.0856205042251554, + "grad_norm": 0.33104414866248516, + "learning_rate": 2.5723595606258437e-05, + "loss": 2.6816, + "step": 44797 + }, + { + "epoch": 2.0856670624112486, + "grad_norm": 0.3395209680247699, + "learning_rate": 2.5721227607321953e-05, + "loss": 2.6713, + "step": 44798 + }, + { + "epoch": 2.0857136205973417, + "grad_norm": 0.37653720187538436, + "learning_rate": 2.571885967964125e-05, + "loss": 2.6772, + "step": 44799 + }, + { + "epoch": 2.085760178783435, + "grad_norm": 0.3344939750550293, + "learning_rate": 2.5716491823223304e-05, + "loss": 2.5408, + "step": 44800 + }, + { + "epoch": 2.0858067369695275, + "grad_norm": 0.3191043451833807, + "learning_rate": 2.5714124038075084e-05, + "loss": 2.6871, + "step": 44801 + }, + { + "epoch": 2.0858532951556206, + "grad_norm": 0.3278064098937628, + "learning_rate": 2.5711756324203473e-05, + "loss": 2.6521, + "step": 44802 + }, + { + "epoch": 2.0858998533417137, + "grad_norm": 0.3445339357565189, + "learning_rate": 2.570938868161551e-05, + "loss": 2.6017, + "step": 44803 + }, + { + "epoch": 2.085946411527807, + "grad_norm": 0.35994469188228206, + "learning_rate": 2.5707021110318053e-05, + "loss": 2.6536, + "step": 44804 + }, + { + "epoch": 2.0859929697139, + "grad_norm": 0.31608639599653926, + "learning_rate": 2.5704653610318134e-05, + "loss": 2.701, + "step": 44805 + }, + { + "epoch": 2.086039527899993, + "grad_norm": 0.35635417884811105, + "learning_rate": 2.570228618162265e-05, + "loss": 2.6281, + "step": 44806 + }, + { + "epoch": 2.086086086086086, + "grad_norm": 0.3589158723579689, + "learning_rate": 2.5699918824238556e-05, + "loss": 2.6758, + "step": 44807 + }, + { + "epoch": 2.0861326442721793, + "grad_norm": 0.35864768597569135, + "learning_rate": 2.5697551538172814e-05, + "loss": 2.6632, + "step": 44808 + }, + { + "epoch": 2.0861792024582724, + "grad_norm": 0.37178454216328416, + "learning_rate": 2.5695184323432352e-05, + "loss": 2.6812, + "step": 44809 + }, + { + "epoch": 2.0862257606443655, + "grad_norm": 0.3552665724881887, + "learning_rate": 2.5692817180024143e-05, + "loss": 2.6236, + "step": 44810 + }, + { + "epoch": 
2.086272318830458, + "grad_norm": 0.3379805860166707, + "learning_rate": 2.5690450107955132e-05, + "loss": 2.6152, + "step": 44811 + }, + { + "epoch": 2.0863188770165513, + "grad_norm": 0.3542087235920811, + "learning_rate": 2.5688083107232242e-05, + "loss": 2.5802, + "step": 44812 + }, + { + "epoch": 2.0863654352026444, + "grad_norm": 0.34917956602712635, + "learning_rate": 2.568571617786243e-05, + "loss": 2.6111, + "step": 44813 + }, + { + "epoch": 2.0864119933887375, + "grad_norm": 0.33549178280050923, + "learning_rate": 2.5683349319852644e-05, + "loss": 2.6828, + "step": 44814 + }, + { + "epoch": 2.0864585515748306, + "grad_norm": 0.364566708016196, + "learning_rate": 2.568098253320983e-05, + "loss": 2.5926, + "step": 44815 + }, + { + "epoch": 2.0865051097609237, + "grad_norm": 0.33695398582533453, + "learning_rate": 2.567861581794096e-05, + "loss": 2.5866, + "step": 44816 + }, + { + "epoch": 2.086551667947017, + "grad_norm": 0.34758078483531846, + "learning_rate": 2.5676249174052915e-05, + "loss": 2.6514, + "step": 44817 + }, + { + "epoch": 2.08659822613311, + "grad_norm": 0.37236611774184286, + "learning_rate": 2.5673882601552722e-05, + "loss": 2.7094, + "step": 44818 + }, + { + "epoch": 2.086644784319203, + "grad_norm": 0.34797314680307967, + "learning_rate": 2.567151610044727e-05, + "loss": 2.684, + "step": 44819 + }, + { + "epoch": 2.086691342505296, + "grad_norm": 0.35845827439271233, + "learning_rate": 2.566914967074352e-05, + "loss": 2.5287, + "step": 44820 + }, + { + "epoch": 2.086737900691389, + "grad_norm": 0.32524863172323265, + "learning_rate": 2.5666783312448418e-05, + "loss": 2.6337, + "step": 44821 + }, + { + "epoch": 2.086784458877482, + "grad_norm": 0.3369438462788384, + "learning_rate": 2.566441702556891e-05, + "loss": 2.5643, + "step": 44822 + }, + { + "epoch": 2.086831017063575, + "grad_norm": 0.33564266878597, + "learning_rate": 2.5662050810111943e-05, + "loss": 2.5973, + "step": 44823 + }, + { + "epoch": 2.086877575249668, + "grad_norm": 0.32723218189902387, + "learning_rate": 2.565968466608447e-05, + "loss": 2.7877, + "step": 44824 + }, + { + "epoch": 2.0869241334357613, + "grad_norm": 0.32286733380671034, + "learning_rate": 2.565731859349342e-05, + "loss": 2.6221, + "step": 44825 + }, + { + "epoch": 2.0869706916218544, + "grad_norm": 0.35783469995120876, + "learning_rate": 2.565495259234573e-05, + "loss": 2.7574, + "step": 44826 + }, + { + "epoch": 2.0870172498079476, + "grad_norm": 0.3395395315687064, + "learning_rate": 2.565258666264836e-05, + "loss": 2.6059, + "step": 44827 + }, + { + "epoch": 2.0870638079940407, + "grad_norm": 0.34404070525868047, + "learning_rate": 2.565022080440825e-05, + "loss": 2.6645, + "step": 44828 + }, + { + "epoch": 2.087110366180134, + "grad_norm": 0.3392367039180707, + "learning_rate": 2.5647855017632362e-05, + "loss": 2.5545, + "step": 44829 + }, + { + "epoch": 2.087156924366227, + "grad_norm": 0.3336764169948325, + "learning_rate": 2.5645489302327574e-05, + "loss": 2.6815, + "step": 44830 + }, + { + "epoch": 2.0872034825523196, + "grad_norm": 0.3700659802725642, + "learning_rate": 2.564312365850092e-05, + "loss": 2.718, + "step": 44831 + }, + { + "epoch": 2.0872500407384127, + "grad_norm": 0.34574149530404585, + "learning_rate": 2.5640758086159265e-05, + "loss": 2.6273, + "step": 44832 + }, + { + "epoch": 2.087296598924506, + "grad_norm": 0.35947632808760094, + "learning_rate": 2.5638392585309622e-05, + "loss": 2.6531, + "step": 44833 + }, + { + "epoch": 2.087343157110599, + "grad_norm": 0.3306732735886642, + "learning_rate": 
2.5636027155958875e-05, + "loss": 2.6849, + "step": 44834 + }, + { + "epoch": 2.087389715296692, + "grad_norm": 0.3523489845343673, + "learning_rate": 2.5633661798113984e-05, + "loss": 2.6821, + "step": 44835 + }, + { + "epoch": 2.087436273482785, + "grad_norm": 0.3629616571264361, + "learning_rate": 2.56312965117819e-05, + "loss": 2.5972, + "step": 44836 + }, + { + "epoch": 2.0874828316688783, + "grad_norm": 0.31518586209588095, + "learning_rate": 2.562893129696958e-05, + "loss": 2.6268, + "step": 44837 + }, + { + "epoch": 2.0875293898549714, + "grad_norm": 0.3906531094086222, + "learning_rate": 2.562656615368393e-05, + "loss": 2.6951, + "step": 44838 + }, + { + "epoch": 2.0875759480410645, + "grad_norm": 0.3388005248364818, + "learning_rate": 2.5624201081931898e-05, + "loss": 2.694, + "step": 44839 + }, + { + "epoch": 2.0876225062271576, + "grad_norm": 0.3564639020531452, + "learning_rate": 2.5621836081720442e-05, + "loss": 2.6391, + "step": 44840 + }, + { + "epoch": 2.0876690644132503, + "grad_norm": 0.34269677853290215, + "learning_rate": 2.561947115305649e-05, + "loss": 2.6503, + "step": 44841 + }, + { + "epoch": 2.0877156225993434, + "grad_norm": 0.3446987498413533, + "learning_rate": 2.5617106295947012e-05, + "loss": 2.7396, + "step": 44842 + }, + { + "epoch": 2.0877621807854365, + "grad_norm": 0.321132177257394, + "learning_rate": 2.5614741510398882e-05, + "loss": 2.6815, + "step": 44843 + }, + { + "epoch": 2.0878087389715296, + "grad_norm": 0.3365360735716692, + "learning_rate": 2.5612376796419124e-05, + "loss": 2.6836, + "step": 44844 + }, + { + "epoch": 2.0878552971576227, + "grad_norm": 0.3271436505401006, + "learning_rate": 2.56100121540146e-05, + "loss": 2.7613, + "step": 44845 + }, + { + "epoch": 2.087901855343716, + "grad_norm": 0.3382548446934504, + "learning_rate": 2.5607647583192322e-05, + "loss": 2.674, + "step": 44846 + }, + { + "epoch": 2.087948413529809, + "grad_norm": 0.3385665767247605, + "learning_rate": 2.560528308395918e-05, + "loss": 2.5703, + "step": 44847 + }, + { + "epoch": 2.087994971715902, + "grad_norm": 0.3199517844714633, + "learning_rate": 2.560291865632213e-05, + "loss": 2.668, + "step": 44848 + }, + { + "epoch": 2.088041529901995, + "grad_norm": 0.36109787785236525, + "learning_rate": 2.5600554300288106e-05, + "loss": 2.5731, + "step": 44849 + }, + { + "epoch": 2.088088088088088, + "grad_norm": 0.3506800159295374, + "learning_rate": 2.5598190015864054e-05, + "loss": 2.6003, + "step": 44850 + }, + { + "epoch": 2.088134646274181, + "grad_norm": 0.32833931115048665, + "learning_rate": 2.5595825803056926e-05, + "loss": 2.6917, + "step": 44851 + }, + { + "epoch": 2.088181204460274, + "grad_norm": 0.3476486046953137, + "learning_rate": 2.559346166187363e-05, + "loss": 2.6172, + "step": 44852 + }, + { + "epoch": 2.0882277626463672, + "grad_norm": 0.3280599397420236, + "learning_rate": 2.5591097592321122e-05, + "loss": 2.6938, + "step": 44853 + }, + { + "epoch": 2.0882743208324603, + "grad_norm": 0.32944684602714125, + "learning_rate": 2.5588733594406334e-05, + "loss": 2.5094, + "step": 44854 + }, + { + "epoch": 2.0883208790185535, + "grad_norm": 0.340917099365142, + "learning_rate": 2.558636966813623e-05, + "loss": 2.6142, + "step": 44855 + }, + { + "epoch": 2.0883674372046466, + "grad_norm": 0.32884312903516494, + "learning_rate": 2.5584005813517688e-05, + "loss": 2.6001, + "step": 44856 + }, + { + "epoch": 2.0884139953907397, + "grad_norm": 0.3511947215643931, + "learning_rate": 2.5581642030557717e-05, + "loss": 2.6378, + "step": 44857 + }, + { + "epoch": 
2.088460553576833, + "grad_norm": 0.35093387928556147, + "learning_rate": 2.5579278319263178e-05, + "loss": 2.718, + "step": 44858 + }, + { + "epoch": 2.088507111762926, + "grad_norm": 0.3381043428409842, + "learning_rate": 2.55769146796411e-05, + "loss": 2.6386, + "step": 44859 + }, + { + "epoch": 2.0885536699490186, + "grad_norm": 0.3353676487544342, + "learning_rate": 2.5574551111698353e-05, + "loss": 2.5954, + "step": 44860 + }, + { + "epoch": 2.0886002281351117, + "grad_norm": 0.3340111408049601, + "learning_rate": 2.5572187615441885e-05, + "loss": 2.6798, + "step": 44861 + }, + { + "epoch": 2.088646786321205, + "grad_norm": 0.3262595831217178, + "learning_rate": 2.5569824190878644e-05, + "loss": 2.6723, + "step": 44862 + }, + { + "epoch": 2.088693344507298, + "grad_norm": 0.3323568102866212, + "learning_rate": 2.5567460838015556e-05, + "loss": 2.6972, + "step": 44863 + }, + { + "epoch": 2.088739902693391, + "grad_norm": 0.34036804231085255, + "learning_rate": 2.556509755685959e-05, + "loss": 2.6084, + "step": 44864 + }, + { + "epoch": 2.088786460879484, + "grad_norm": 0.33061780371288296, + "learning_rate": 2.5562734347417632e-05, + "loss": 2.67, + "step": 44865 + }, + { + "epoch": 2.0888330190655773, + "grad_norm": 0.3233807376015318, + "learning_rate": 2.5560371209696644e-05, + "loss": 2.6556, + "step": 44866 + }, + { + "epoch": 2.0888795772516704, + "grad_norm": 0.31977384829838507, + "learning_rate": 2.555800814370356e-05, + "loss": 2.6593, + "step": 44867 + }, + { + "epoch": 2.0889261354377635, + "grad_norm": 0.328513066601706, + "learning_rate": 2.5555645149445305e-05, + "loss": 2.6161, + "step": 44868 + }, + { + "epoch": 2.0889726936238566, + "grad_norm": 0.32710342748175675, + "learning_rate": 2.5553282226928833e-05, + "loss": 2.746, + "step": 44869 + }, + { + "epoch": 2.0890192518099493, + "grad_norm": 0.3338475827279256, + "learning_rate": 2.5550919376161086e-05, + "loss": 2.6927, + "step": 44870 + }, + { + "epoch": 2.0890658099960424, + "grad_norm": 0.3381192238625134, + "learning_rate": 2.5548556597148942e-05, + "loss": 2.6636, + "step": 44871 + }, + { + "epoch": 2.0891123681821355, + "grad_norm": 0.3145098941047607, + "learning_rate": 2.5546193889899418e-05, + "loss": 2.6245, + "step": 44872 + }, + { + "epoch": 2.0891589263682286, + "grad_norm": 0.3171805798546829, + "learning_rate": 2.5543831254419382e-05, + "loss": 2.6467, + "step": 44873 + }, + { + "epoch": 2.0892054845543218, + "grad_norm": 0.33766786709822827, + "learning_rate": 2.5541468690715797e-05, + "loss": 2.7064, + "step": 44874 + }, + { + "epoch": 2.089252042740415, + "grad_norm": 0.3088368088429188, + "learning_rate": 2.5539106198795587e-05, + "loss": 2.6248, + "step": 44875 + }, + { + "epoch": 2.089298600926508, + "grad_norm": 0.32514994195851793, + "learning_rate": 2.5536743778665696e-05, + "loss": 2.5694, + "step": 44876 + }, + { + "epoch": 2.089345159112601, + "grad_norm": 0.34013855510595326, + "learning_rate": 2.5534381430333065e-05, + "loss": 2.6027, + "step": 44877 + }, + { + "epoch": 2.089391717298694, + "grad_norm": 0.3265683009067514, + "learning_rate": 2.55320191538046e-05, + "loss": 2.6407, + "step": 44878 + }, + { + "epoch": 2.0894382754847873, + "grad_norm": 0.3242296029008189, + "learning_rate": 2.5529656949087245e-05, + "loss": 2.6255, + "step": 44879 + }, + { + "epoch": 2.08948483367088, + "grad_norm": 0.31553076504025107, + "learning_rate": 2.552729481618793e-05, + "loss": 2.6981, + "step": 44880 + }, + { + "epoch": 2.089531391856973, + "grad_norm": 0.3389016127391784, + "learning_rate": 
2.55249327551136e-05, + "loss": 2.6049, + "step": 44881 + }, + { + "epoch": 2.0895779500430662, + "grad_norm": 0.31035522785028363, + "learning_rate": 2.5522570765871185e-05, + "loss": 2.7029, + "step": 44882 + }, + { + "epoch": 2.0896245082291593, + "grad_norm": 0.35542999564179, + "learning_rate": 2.552020884846763e-05, + "loss": 2.7228, + "step": 44883 + }, + { + "epoch": 2.0896710664152525, + "grad_norm": 0.3227389950107539, + "learning_rate": 2.5517847002909812e-05, + "loss": 2.6242, + "step": 44884 + }, + { + "epoch": 2.0897176246013456, + "grad_norm": 0.33096563045104943, + "learning_rate": 2.551548522920474e-05, + "loss": 2.6537, + "step": 44885 + }, + { + "epoch": 2.0897641827874387, + "grad_norm": 0.31257934825305783, + "learning_rate": 2.5513123527359285e-05, + "loss": 2.6693, + "step": 44886 + }, + { + "epoch": 2.089810740973532, + "grad_norm": 0.3629991908465195, + "learning_rate": 2.55107618973804e-05, + "loss": 2.729, + "step": 44887 + }, + { + "epoch": 2.089857299159625, + "grad_norm": 0.3592378199404814, + "learning_rate": 2.550840033927502e-05, + "loss": 2.7403, + "step": 44888 + }, + { + "epoch": 2.0899038573457176, + "grad_norm": 0.33798243945326445, + "learning_rate": 2.5506038853050063e-05, + "loss": 2.6521, + "step": 44889 + }, + { + "epoch": 2.0899504155318107, + "grad_norm": 0.3295596041267933, + "learning_rate": 2.5503677438712497e-05, + "loss": 2.7023, + "step": 44890 + }, + { + "epoch": 2.089996973717904, + "grad_norm": 0.3497371381025515, + "learning_rate": 2.55013160962692e-05, + "loss": 2.6031, + "step": 44891 + }, + { + "epoch": 2.090043531903997, + "grad_norm": 0.33987159571577324, + "learning_rate": 2.5498954825727133e-05, + "loss": 2.5968, + "step": 44892 + }, + { + "epoch": 2.09009009009009, + "grad_norm": 0.35553447077363726, + "learning_rate": 2.5496593627093212e-05, + "loss": 2.6004, + "step": 44893 + }, + { + "epoch": 2.090136648276183, + "grad_norm": 0.41641649325336666, + "learning_rate": 2.549423250037438e-05, + "loss": 2.6035, + "step": 44894 + }, + { + "epoch": 2.0901832064622763, + "grad_norm": 0.33518382652231665, + "learning_rate": 2.5491871445577565e-05, + "loss": 2.7877, + "step": 44895 + }, + { + "epoch": 2.0902297646483694, + "grad_norm": 0.3755406361696971, + "learning_rate": 2.548951046270971e-05, + "loss": 2.6161, + "step": 44896 + }, + { + "epoch": 2.0902763228344625, + "grad_norm": 0.3690877375063738, + "learning_rate": 2.5487149551777685e-05, + "loss": 2.613, + "step": 44897 + }, + { + "epoch": 2.0903228810205556, + "grad_norm": 0.35028176736144945, + "learning_rate": 2.54847887127885e-05, + "loss": 2.6388, + "step": 44898 + }, + { + "epoch": 2.0903694392066483, + "grad_norm": 0.3452870993626601, + "learning_rate": 2.5482427945749026e-05, + "loss": 2.528, + "step": 44899 + }, + { + "epoch": 2.0904159973927414, + "grad_norm": 0.36474211030515, + "learning_rate": 2.548006725066621e-05, + "loss": 2.6324, + "step": 44900 + }, + { + "epoch": 2.0904625555788345, + "grad_norm": 0.35446935745086466, + "learning_rate": 2.5477706627546983e-05, + "loss": 2.6412, + "step": 44901 + }, + { + "epoch": 2.0905091137649277, + "grad_norm": 0.3579988087185576, + "learning_rate": 2.547534607639827e-05, + "loss": 2.6195, + "step": 44902 + }, + { + "epoch": 2.0905556719510208, + "grad_norm": 0.36092577847120616, + "learning_rate": 2.547298559722702e-05, + "loss": 2.6824, + "step": 44903 + }, + { + "epoch": 2.090602230137114, + "grad_norm": 0.3350308110425702, + "learning_rate": 2.5470625190040103e-05, + "loss": 2.6264, + "step": 44904 + }, + { + "epoch": 
2.090648788323207, + "grad_norm": 0.40017181085748366, + "learning_rate": 2.5468264854844525e-05, + "loss": 2.6592, + "step": 44905 + }, + { + "epoch": 2.0906953465093, + "grad_norm": 0.371024338797199, + "learning_rate": 2.546590459164715e-05, + "loss": 2.6478, + "step": 44906 + }, + { + "epoch": 2.0907419046953932, + "grad_norm": 0.3850131284230404, + "learning_rate": 2.546354440045493e-05, + "loss": 2.709, + "step": 44907 + }, + { + "epoch": 2.0907884628814863, + "grad_norm": 0.3340486395106247, + "learning_rate": 2.5461184281274785e-05, + "loss": 2.5304, + "step": 44908 + }, + { + "epoch": 2.090835021067579, + "grad_norm": 0.3554055843862559, + "learning_rate": 2.5458824234113676e-05, + "loss": 2.6347, + "step": 44909 + }, + { + "epoch": 2.090881579253672, + "grad_norm": 0.37502229520562297, + "learning_rate": 2.545646425897845e-05, + "loss": 2.6872, + "step": 44910 + }, + { + "epoch": 2.0909281374397652, + "grad_norm": 0.3534585816421336, + "learning_rate": 2.545410435587613e-05, + "loss": 2.7169, + "step": 44911 + }, + { + "epoch": 2.0909746956258584, + "grad_norm": 0.3265726435603041, + "learning_rate": 2.545174452481357e-05, + "loss": 2.6869, + "step": 44912 + }, + { + "epoch": 2.0910212538119515, + "grad_norm": 0.3518891794735821, + "learning_rate": 2.5449384765797728e-05, + "loss": 2.6776, + "step": 44913 + }, + { + "epoch": 2.0910678119980446, + "grad_norm": 0.3533345930778077, + "learning_rate": 2.544702507883552e-05, + "loss": 2.6307, + "step": 44914 + }, + { + "epoch": 2.0911143701841377, + "grad_norm": 0.32845199013717225, + "learning_rate": 2.5444665463933875e-05, + "loss": 2.7506, + "step": 44915 + }, + { + "epoch": 2.091160928370231, + "grad_norm": 0.3460011009598038, + "learning_rate": 2.544230592109974e-05, + "loss": 2.6474, + "step": 44916 + }, + { + "epoch": 2.091207486556324, + "grad_norm": 0.3583428668155351, + "learning_rate": 2.5439946450339973e-05, + "loss": 2.6572, + "step": 44917 + }, + { + "epoch": 2.091254044742417, + "grad_norm": 0.3141861991431433, + "learning_rate": 2.543758705166159e-05, + "loss": 2.6672, + "step": 44918 + }, + { + "epoch": 2.0913006029285097, + "grad_norm": 0.33317916981782575, + "learning_rate": 2.5435227725071442e-05, + "loss": 2.7075, + "step": 44919 + }, + { + "epoch": 2.091347161114603, + "grad_norm": 0.33629470484571977, + "learning_rate": 2.5432868470576487e-05, + "loss": 2.7049, + "step": 44920 + }, + { + "epoch": 2.091393719300696, + "grad_norm": 0.3562890033698888, + "learning_rate": 2.5430509288183636e-05, + "loss": 2.6241, + "step": 44921 + }, + { + "epoch": 2.091440277486789, + "grad_norm": 0.34276542991223136, + "learning_rate": 2.5428150177899824e-05, + "loss": 2.5593, + "step": 44922 + }, + { + "epoch": 2.091486835672882, + "grad_norm": 0.3464297896694032, + "learning_rate": 2.5425791139731968e-05, + "loss": 2.6516, + "step": 44923 + }, + { + "epoch": 2.0915333938589753, + "grad_norm": 0.32446705514504354, + "learning_rate": 2.542343217368702e-05, + "loss": 2.6578, + "step": 44924 + }, + { + "epoch": 2.0915799520450684, + "grad_norm": 0.34237629537979986, + "learning_rate": 2.542107327977185e-05, + "loss": 2.6269, + "step": 44925 + }, + { + "epoch": 2.0916265102311615, + "grad_norm": 0.34406557636369095, + "learning_rate": 2.5418714457993408e-05, + "loss": 2.5562, + "step": 44926 + }, + { + "epoch": 2.0916730684172546, + "grad_norm": 0.364588286221348, + "learning_rate": 2.5416355708358624e-05, + "loss": 2.7235, + "step": 44927 + }, + { + "epoch": 2.0917196266033473, + "grad_norm": 0.31441024641531784, + "learning_rate": 
2.5413997030874415e-05, + "loss": 2.5893, + "step": 44928 + }, + { + "epoch": 2.0917661847894404, + "grad_norm": 0.32707057070531753, + "learning_rate": 2.5411638425547713e-05, + "loss": 2.591, + "step": 44929 + }, + { + "epoch": 2.0918127429755335, + "grad_norm": 0.3823831290108958, + "learning_rate": 2.5409279892385395e-05, + "loss": 2.7624, + "step": 44930 + }, + { + "epoch": 2.0918593011616267, + "grad_norm": 0.3317425871274062, + "learning_rate": 2.540692143139446e-05, + "loss": 2.6118, + "step": 44931 + }, + { + "epoch": 2.0919058593477198, + "grad_norm": 0.3345039678407746, + "learning_rate": 2.540456304258177e-05, + "loss": 2.5156, + "step": 44932 + }, + { + "epoch": 2.091952417533813, + "grad_norm": 0.3589335814729064, + "learning_rate": 2.540220472595426e-05, + "loss": 2.6392, + "step": 44933 + }, + { + "epoch": 2.091998975719906, + "grad_norm": 0.3462401275898182, + "learning_rate": 2.5399846481518857e-05, + "loss": 2.59, + "step": 44934 + }, + { + "epoch": 2.092045533905999, + "grad_norm": 0.35603074976713783, + "learning_rate": 2.539748830928248e-05, + "loss": 2.6182, + "step": 44935 + }, + { + "epoch": 2.0920920920920922, + "grad_norm": 0.31058344838172697, + "learning_rate": 2.5395130209252048e-05, + "loss": 2.6672, + "step": 44936 + }, + { + "epoch": 2.0921386502781854, + "grad_norm": 0.36853128054821316, + "learning_rate": 2.539277218143451e-05, + "loss": 2.6428, + "step": 44937 + }, + { + "epoch": 2.092185208464278, + "grad_norm": 0.34853838419871286, + "learning_rate": 2.5390414225836746e-05, + "loss": 2.6353, + "step": 44938 + }, + { + "epoch": 2.092231766650371, + "grad_norm": 0.34269799672298346, + "learning_rate": 2.538805634246568e-05, + "loss": 2.567, + "step": 44939 + }, + { + "epoch": 2.0922783248364643, + "grad_norm": 0.3747048038978959, + "learning_rate": 2.538569853132825e-05, + "loss": 2.677, + "step": 44940 + }, + { + "epoch": 2.0923248830225574, + "grad_norm": 0.3464533359906412, + "learning_rate": 2.538334079243137e-05, + "loss": 2.6224, + "step": 44941 + }, + { + "epoch": 2.0923714412086505, + "grad_norm": 0.3406740973988919, + "learning_rate": 2.5380983125781977e-05, + "loss": 2.5908, + "step": 44942 + }, + { + "epoch": 2.0924179993947436, + "grad_norm": 0.3687531075715349, + "learning_rate": 2.537862553138694e-05, + "loss": 2.7332, + "step": 44943 + }, + { + "epoch": 2.0924645575808367, + "grad_norm": 0.3488139465770893, + "learning_rate": 2.5376268009253244e-05, + "loss": 2.6013, + "step": 44944 + }, + { + "epoch": 2.09251111576693, + "grad_norm": 0.3733077945566672, + "learning_rate": 2.537391055938776e-05, + "loss": 2.7345, + "step": 44945 + }, + { + "epoch": 2.092557673953023, + "grad_norm": 0.33244858475472444, + "learning_rate": 2.5371553181797425e-05, + "loss": 2.6185, + "step": 44946 + }, + { + "epoch": 2.092604232139116, + "grad_norm": 0.3543976065974194, + "learning_rate": 2.536919587648915e-05, + "loss": 2.8102, + "step": 44947 + }, + { + "epoch": 2.0926507903252087, + "grad_norm": 0.3509638601506873, + "learning_rate": 2.536683864346986e-05, + "loss": 2.6726, + "step": 44948 + }, + { + "epoch": 2.092697348511302, + "grad_norm": 0.3243712644738884, + "learning_rate": 2.536448148274647e-05, + "loss": 2.6524, + "step": 44949 + }, + { + "epoch": 2.092743906697395, + "grad_norm": 0.35010417026766694, + "learning_rate": 2.536212439432592e-05, + "loss": 2.5658, + "step": 44950 + }, + { + "epoch": 2.092790464883488, + "grad_norm": 0.32957838618172686, + "learning_rate": 2.535976737821509e-05, + "loss": 2.6177, + "step": 44951 + }, + { + "epoch": 
2.092837023069581, + "grad_norm": 0.36545996442872697, + "learning_rate": 2.5357410434420914e-05, + "loss": 2.8181, + "step": 44952 + }, + { + "epoch": 2.0928835812556743, + "grad_norm": 0.3489030036686357, + "learning_rate": 2.535505356295031e-05, + "loss": 2.6833, + "step": 44953 + }, + { + "epoch": 2.0929301394417674, + "grad_norm": 0.3661303227243777, + "learning_rate": 2.5352696763810196e-05, + "loss": 2.6028, + "step": 44954 + }, + { + "epoch": 2.0929766976278605, + "grad_norm": 0.36672450489035147, + "learning_rate": 2.535034003700751e-05, + "loss": 2.6536, + "step": 44955 + }, + { + "epoch": 2.0930232558139537, + "grad_norm": 0.3447041462587149, + "learning_rate": 2.5347983382549107e-05, + "loss": 2.5923, + "step": 44956 + }, + { + "epoch": 2.0930698140000468, + "grad_norm": 0.36205188240377056, + "learning_rate": 2.5345626800441986e-05, + "loss": 2.6112, + "step": 44957 + }, + { + "epoch": 2.0931163721861394, + "grad_norm": 0.3727566236275148, + "learning_rate": 2.5343270290692976e-05, + "loss": 2.7768, + "step": 44958 + }, + { + "epoch": 2.0931629303722326, + "grad_norm": 0.3514549237858348, + "learning_rate": 2.5340913853309084e-05, + "loss": 2.6106, + "step": 44959 + }, + { + "epoch": 2.0932094885583257, + "grad_norm": 0.3770069849109338, + "learning_rate": 2.533855748829716e-05, + "loss": 2.6049, + "step": 44960 + }, + { + "epoch": 2.093256046744419, + "grad_norm": 0.3486797735236335, + "learning_rate": 2.533620119566414e-05, + "loss": 2.6504, + "step": 44961 + }, + { + "epoch": 2.093302604930512, + "grad_norm": 0.3266010730672514, + "learning_rate": 2.5333844975416937e-05, + "loss": 2.6979, + "step": 44962 + }, + { + "epoch": 2.093349163116605, + "grad_norm": 0.3688023544603877, + "learning_rate": 2.533148882756249e-05, + "loss": 2.684, + "step": 44963 + }, + { + "epoch": 2.093395721302698, + "grad_norm": 0.350602380613836, + "learning_rate": 2.5329132752107675e-05, + "loss": 2.5741, + "step": 44964 + }, + { + "epoch": 2.0934422794887912, + "grad_norm": 0.33895280834143315, + "learning_rate": 2.532677674905942e-05, + "loss": 2.6985, + "step": 44965 + }, + { + "epoch": 2.0934888376748844, + "grad_norm": 0.36561885950254297, + "learning_rate": 2.5324420818424656e-05, + "loss": 2.6424, + "step": 44966 + }, + { + "epoch": 2.093535395860977, + "grad_norm": 0.3763741167577687, + "learning_rate": 2.5322064960210272e-05, + "loss": 2.7143, + "step": 44967 + }, + { + "epoch": 2.09358195404707, + "grad_norm": 0.3648297712207631, + "learning_rate": 2.5319709174423222e-05, + "loss": 2.6565, + "step": 44968 + }, + { + "epoch": 2.0936285122331633, + "grad_norm": 0.3598245036404872, + "learning_rate": 2.5317353461070358e-05, + "loss": 2.6607, + "step": 44969 + }, + { + "epoch": 2.0936750704192564, + "grad_norm": 0.3680896576460689, + "learning_rate": 2.5314997820158665e-05, + "loss": 2.5898, + "step": 44970 + }, + { + "epoch": 2.0937216286053495, + "grad_norm": 0.34259816904204177, + "learning_rate": 2.5312642251694984e-05, + "loss": 2.5943, + "step": 44971 + }, + { + "epoch": 2.0937681867914426, + "grad_norm": 0.3764232083697342, + "learning_rate": 2.5310286755686308e-05, + "loss": 2.5458, + "step": 44972 + }, + { + "epoch": 2.0938147449775357, + "grad_norm": 0.3644659774969907, + "learning_rate": 2.5307931332139478e-05, + "loss": 2.7137, + "step": 44973 + }, + { + "epoch": 2.093861303163629, + "grad_norm": 0.3653586920448348, + "learning_rate": 2.5305575981061446e-05, + "loss": 2.7336, + "step": 44974 + }, + { + "epoch": 2.093907861349722, + "grad_norm": 0.3897949452710257, + 
"learning_rate": 2.5303220702459117e-05, + "loss": 2.577, + "step": 44975 + }, + { + "epoch": 2.093954419535815, + "grad_norm": 0.3663293227839355, + "learning_rate": 2.5300865496339397e-05, + "loss": 2.4694, + "step": 44976 + }, + { + "epoch": 2.0940009777219077, + "grad_norm": 0.3582505154438849, + "learning_rate": 2.5298510362709228e-05, + "loss": 2.6706, + "step": 44977 + }, + { + "epoch": 2.094047535908001, + "grad_norm": 0.35305972366917476, + "learning_rate": 2.5296155301575476e-05, + "loss": 2.6477, + "step": 44978 + }, + { + "epoch": 2.094094094094094, + "grad_norm": 0.34453479178145424, + "learning_rate": 2.5293800312945072e-05, + "loss": 2.5987, + "step": 44979 + }, + { + "epoch": 2.094140652280187, + "grad_norm": 0.33994509900324144, + "learning_rate": 2.529144539682493e-05, + "loss": 2.7358, + "step": 44980 + }, + { + "epoch": 2.09418721046628, + "grad_norm": 0.3548061437610862, + "learning_rate": 2.5289090553221983e-05, + "loss": 2.6642, + "step": 44981 + }, + { + "epoch": 2.0942337686523733, + "grad_norm": 0.34342563737218773, + "learning_rate": 2.528673578214308e-05, + "loss": 2.5988, + "step": 44982 + }, + { + "epoch": 2.0942803268384664, + "grad_norm": 0.3661316493141249, + "learning_rate": 2.528438108359521e-05, + "loss": 2.7204, + "step": 44983 + }, + { + "epoch": 2.0943268850245595, + "grad_norm": 0.3589052709069444, + "learning_rate": 2.528202645758521e-05, + "loss": 2.6994, + "step": 44984 + }, + { + "epoch": 2.0943734432106527, + "grad_norm": 0.3593597436717494, + "learning_rate": 2.5279671904120073e-05, + "loss": 2.6266, + "step": 44985 + }, + { + "epoch": 2.094420001396746, + "grad_norm": 0.3560246717958265, + "learning_rate": 2.527731742320664e-05, + "loss": 2.6077, + "step": 44986 + }, + { + "epoch": 2.0944665595828384, + "grad_norm": 0.3214520852024369, + "learning_rate": 2.5274963014851838e-05, + "loss": 2.5993, + "step": 44987 + }, + { + "epoch": 2.0945131177689316, + "grad_norm": 0.35343430619765154, + "learning_rate": 2.5272608679062588e-05, + "loss": 2.5934, + "step": 44988 + }, + { + "epoch": 2.0945596759550247, + "grad_norm": 0.33524376659873517, + "learning_rate": 2.5270254415845796e-05, + "loss": 2.6408, + "step": 44989 + }, + { + "epoch": 2.094606234141118, + "grad_norm": 0.35049330456646566, + "learning_rate": 2.526790022520839e-05, + "loss": 2.6354, + "step": 44990 + }, + { + "epoch": 2.094652792327211, + "grad_norm": 0.35305215650620886, + "learning_rate": 2.5265546107157247e-05, + "loss": 2.5837, + "step": 44991 + }, + { + "epoch": 2.094699350513304, + "grad_norm": 0.35379198747925944, + "learning_rate": 2.526319206169928e-05, + "loss": 2.6766, + "step": 44992 + }, + { + "epoch": 2.094745908699397, + "grad_norm": 0.3473827141242019, + "learning_rate": 2.526083808884141e-05, + "loss": 2.5402, + "step": 44993 + }, + { + "epoch": 2.0947924668854903, + "grad_norm": 0.36121134712685266, + "learning_rate": 2.525848418859055e-05, + "loss": 2.7115, + "step": 44994 + }, + { + "epoch": 2.0948390250715834, + "grad_norm": 0.3273709510976137, + "learning_rate": 2.525613036095359e-05, + "loss": 2.6947, + "step": 44995 + }, + { + "epoch": 2.0948855832576765, + "grad_norm": 0.3548598413474234, + "learning_rate": 2.5253776605937473e-05, + "loss": 2.7268, + "step": 44996 + }, + { + "epoch": 2.094932141443769, + "grad_norm": 0.3307619845080838, + "learning_rate": 2.5251422923549052e-05, + "loss": 2.6678, + "step": 44997 + }, + { + "epoch": 2.0949786996298623, + "grad_norm": 0.3582252661334945, + "learning_rate": 2.5249069313795292e-05, + "loss": 2.6918, + "step": 
44998 + }, + { + "epoch": 2.0950252578159554, + "grad_norm": 0.3338354420053068, + "learning_rate": 2.524671577668307e-05, + "loss": 2.5144, + "step": 44999 + }, + { + "epoch": 2.0950718160020485, + "grad_norm": 0.36079368716987165, + "learning_rate": 2.5244362312219293e-05, + "loss": 2.6973, + "step": 45000 + }, + { + "epoch": 2.0951183741881416, + "grad_norm": 0.3410440303846425, + "learning_rate": 2.524200892041087e-05, + "loss": 2.6089, + "step": 45001 + }, + { + "epoch": 2.0951649323742347, + "grad_norm": 0.3600300745230964, + "learning_rate": 2.5239655601264717e-05, + "loss": 2.6673, + "step": 45002 + }, + { + "epoch": 2.095211490560328, + "grad_norm": 0.3579894523891248, + "learning_rate": 2.523730235478775e-05, + "loss": 2.6396, + "step": 45003 + }, + { + "epoch": 2.095258048746421, + "grad_norm": 0.31918254132259616, + "learning_rate": 2.5234949180986843e-05, + "loss": 2.6255, + "step": 45004 + }, + { + "epoch": 2.095304606932514, + "grad_norm": 0.3279733520893515, + "learning_rate": 2.5232596079868924e-05, + "loss": 2.5667, + "step": 45005 + }, + { + "epoch": 2.0953511651186068, + "grad_norm": 0.36695101767737603, + "learning_rate": 2.5230243051440892e-05, + "loss": 2.6954, + "step": 45006 + }, + { + "epoch": 2.0953977233047, + "grad_norm": 0.34055163703801733, + "learning_rate": 2.522789009570966e-05, + "loss": 2.6012, + "step": 45007 + }, + { + "epoch": 2.095444281490793, + "grad_norm": 0.33489744146956507, + "learning_rate": 2.5225537212682125e-05, + "loss": 2.7394, + "step": 45008 + }, + { + "epoch": 2.095490839676886, + "grad_norm": 0.3498564104092674, + "learning_rate": 2.5223184402365223e-05, + "loss": 2.699, + "step": 45009 + }, + { + "epoch": 2.095537397862979, + "grad_norm": 0.36571315461809023, + "learning_rate": 2.522083166476579e-05, + "loss": 2.7004, + "step": 45010 + }, + { + "epoch": 2.0955839560490723, + "grad_norm": 0.3321007511662823, + "learning_rate": 2.5218478999890816e-05, + "loss": 2.6751, + "step": 45011 + }, + { + "epoch": 2.0956305142351654, + "grad_norm": 0.33640312903448877, + "learning_rate": 2.521612640774713e-05, + "loss": 2.7134, + "step": 45012 + }, + { + "epoch": 2.0956770724212586, + "grad_norm": 0.347576622944086, + "learning_rate": 2.5213773888341706e-05, + "loss": 2.5363, + "step": 45013 + }, + { + "epoch": 2.0957236306073517, + "grad_norm": 0.3198507991412714, + "learning_rate": 2.5211421441681395e-05, + "loss": 2.6125, + "step": 45014 + }, + { + "epoch": 2.095770188793445, + "grad_norm": 0.35264709904566177, + "learning_rate": 2.5209069067773116e-05, + "loss": 2.5673, + "step": 45015 + }, + { + "epoch": 2.095816746979538, + "grad_norm": 0.35705582969058497, + "learning_rate": 2.52067167666238e-05, + "loss": 2.6423, + "step": 45016 + }, + { + "epoch": 2.0958633051656306, + "grad_norm": 0.35189047187462025, + "learning_rate": 2.520436453824031e-05, + "loss": 2.6013, + "step": 45017 + }, + { + "epoch": 2.0959098633517237, + "grad_norm": 0.341163647012944, + "learning_rate": 2.5202012382629563e-05, + "loss": 2.5868, + "step": 45018 + }, + { + "epoch": 2.095956421537817, + "grad_norm": 0.3316075449321055, + "learning_rate": 2.5199660299798468e-05, + "loss": 2.6391, + "step": 45019 + }, + { + "epoch": 2.09600297972391, + "grad_norm": 0.3245448985296536, + "learning_rate": 2.519730828975393e-05, + "loss": 2.6755, + "step": 45020 + }, + { + "epoch": 2.096049537910003, + "grad_norm": 0.3362454616827733, + "learning_rate": 2.5194956352502846e-05, + "loss": 2.6038, + "step": 45021 + }, + { + "epoch": 2.096096096096096, + "grad_norm": 
0.3538398051157422, + "learning_rate": 2.5192604488052142e-05, + "loss": 2.6524, + "step": 45022 + }, + { + "epoch": 2.0961426542821893, + "grad_norm": 0.32741419566749946, + "learning_rate": 2.5190252696408657e-05, + "loss": 2.6274, + "step": 45023 + }, + { + "epoch": 2.0961892124682824, + "grad_norm": 0.3394338615625107, + "learning_rate": 2.5187900977579383e-05, + "loss": 2.725, + "step": 45024 + }, + { + "epoch": 2.0962357706543755, + "grad_norm": 0.3332577392522059, + "learning_rate": 2.5185549331571127e-05, + "loss": 2.5964, + "step": 45025 + }, + { + "epoch": 2.096282328840468, + "grad_norm": 0.3523346811902457, + "learning_rate": 2.5183197758390886e-05, + "loss": 2.5941, + "step": 45026 + }, + { + "epoch": 2.0963288870265613, + "grad_norm": 0.3366716064635683, + "learning_rate": 2.5180846258045488e-05, + "loss": 2.7342, + "step": 45027 + }, + { + "epoch": 2.0963754452126544, + "grad_norm": 0.31153214059960577, + "learning_rate": 2.5178494830541866e-05, + "loss": 2.6879, + "step": 45028 + }, + { + "epoch": 2.0964220033987475, + "grad_norm": 0.3179688616547383, + "learning_rate": 2.5176143475886937e-05, + "loss": 2.6419, + "step": 45029 + }, + { + "epoch": 2.0964685615848406, + "grad_norm": 0.32610110458236574, + "learning_rate": 2.517379219408754e-05, + "loss": 2.6776, + "step": 45030 + }, + { + "epoch": 2.0965151197709337, + "grad_norm": 0.3494803655263109, + "learning_rate": 2.517144098515065e-05, + "loss": 2.6101, + "step": 45031 + }, + { + "epoch": 2.096561677957027, + "grad_norm": 0.3483968204889561, + "learning_rate": 2.516908984908313e-05, + "loss": 2.5858, + "step": 45032 + }, + { + "epoch": 2.09660823614312, + "grad_norm": 0.33069289601463225, + "learning_rate": 2.5166738785891874e-05, + "loss": 2.5592, + "step": 45033 + }, + { + "epoch": 2.096654794329213, + "grad_norm": 0.3727129012112113, + "learning_rate": 2.51643877955838e-05, + "loss": 2.6345, + "step": 45034 + }, + { + "epoch": 2.096701352515306, + "grad_norm": 0.3526828327592696, + "learning_rate": 2.5162036878165812e-05, + "loss": 2.6473, + "step": 45035 + }, + { + "epoch": 2.096747910701399, + "grad_norm": 0.3485146888932203, + "learning_rate": 2.5159686033644763e-05, + "loss": 2.6341, + "step": 45036 + }, + { + "epoch": 2.096794468887492, + "grad_norm": 0.3517653950210935, + "learning_rate": 2.5157335262027626e-05, + "loss": 2.6212, + "step": 45037 + }, + { + "epoch": 2.096841027073585, + "grad_norm": 0.3551540559853116, + "learning_rate": 2.5154984563321225e-05, + "loss": 2.6352, + "step": 45038 + }, + { + "epoch": 2.096887585259678, + "grad_norm": 0.32685111717607357, + "learning_rate": 2.5152633937532543e-05, + "loss": 2.6706, + "step": 45039 + }, + { + "epoch": 2.0969341434457713, + "grad_norm": 0.37288653520981235, + "learning_rate": 2.5150283384668404e-05, + "loss": 2.6682, + "step": 45040 + }, + { + "epoch": 2.0969807016318645, + "grad_norm": 0.327387085025046, + "learning_rate": 2.514793290473574e-05, + "loss": 2.6256, + "step": 45041 + }, + { + "epoch": 2.0970272598179576, + "grad_norm": 0.3599607209511504, + "learning_rate": 2.514558249774146e-05, + "loss": 2.6594, + "step": 45042 + }, + { + "epoch": 2.0970738180040507, + "grad_norm": 0.3716518193101054, + "learning_rate": 2.5143232163692408e-05, + "loss": 2.614, + "step": 45043 + }, + { + "epoch": 2.097120376190144, + "grad_norm": 0.3316565209167103, + "learning_rate": 2.5140881902595564e-05, + "loss": 2.6941, + "step": 45044 + }, + { + "epoch": 2.097166934376237, + "grad_norm": 0.3541964872376711, + "learning_rate": 2.5138531714457757e-05, + "loss": 
2.6667, + "step": 45045 + }, + { + "epoch": 2.0972134925623296, + "grad_norm": 0.34334108281317993, + "learning_rate": 2.513618159928592e-05, + "loss": 2.7625, + "step": 45046 + }, + { + "epoch": 2.0972600507484227, + "grad_norm": 0.3513354389036456, + "learning_rate": 2.513383155708693e-05, + "loss": 2.6844, + "step": 45047 + }, + { + "epoch": 2.097306608934516, + "grad_norm": 0.38032381033713325, + "learning_rate": 2.51314815878677e-05, + "loss": 2.7471, + "step": 45048 + }, + { + "epoch": 2.097353167120609, + "grad_norm": 0.3274274459889697, + "learning_rate": 2.5129131691635122e-05, + "loss": 2.6562, + "step": 45049 + }, + { + "epoch": 2.097399725306702, + "grad_norm": 0.3255840015548852, + "learning_rate": 2.512678186839611e-05, + "loss": 2.6782, + "step": 45050 + }, + { + "epoch": 2.097446283492795, + "grad_norm": 0.39561134499157974, + "learning_rate": 2.5124432118157505e-05, + "loss": 2.6369, + "step": 45051 + }, + { + "epoch": 2.0974928416788883, + "grad_norm": 0.33664036367293043, + "learning_rate": 2.5122082440926276e-05, + "loss": 2.6364, + "step": 45052 + }, + { + "epoch": 2.0975393998649814, + "grad_norm": 0.3304043186133604, + "learning_rate": 2.511973283670927e-05, + "loss": 2.5684, + "step": 45053 + }, + { + "epoch": 2.0975859580510745, + "grad_norm": 0.37092846302339905, + "learning_rate": 2.5117383305513398e-05, + "loss": 2.6219, + "step": 45054 + }, + { + "epoch": 2.0976325162371676, + "grad_norm": 0.35032603608406276, + "learning_rate": 2.511503384734557e-05, + "loss": 2.6942, + "step": 45055 + }, + { + "epoch": 2.0976790744232603, + "grad_norm": 0.3388218497939131, + "learning_rate": 2.5112684462212625e-05, + "loss": 2.6806, + "step": 45056 + }, + { + "epoch": 2.0977256326093534, + "grad_norm": 0.3410574062374234, + "learning_rate": 2.5110335150121545e-05, + "loss": 2.7692, + "step": 45057 + }, + { + "epoch": 2.0977721907954465, + "grad_norm": 0.3176253492780104, + "learning_rate": 2.5107985911079158e-05, + "loss": 2.5827, + "step": 45058 + }, + { + "epoch": 2.0978187489815396, + "grad_norm": 0.33809599611308405, + "learning_rate": 2.5105636745092377e-05, + "loss": 2.6467, + "step": 45059 + }, + { + "epoch": 2.0978653071676328, + "grad_norm": 0.36058842543033554, + "learning_rate": 2.5103287652168093e-05, + "loss": 2.7344, + "step": 45060 + }, + { + "epoch": 2.097911865353726, + "grad_norm": 0.3470729982358643, + "learning_rate": 2.510093863231322e-05, + "loss": 2.5615, + "step": 45061 + }, + { + "epoch": 2.097958423539819, + "grad_norm": 0.36496345221671167, + "learning_rate": 2.5098589685534634e-05, + "loss": 2.6422, + "step": 45062 + }, + { + "epoch": 2.098004981725912, + "grad_norm": 0.34070437313657476, + "learning_rate": 2.5096240811839255e-05, + "loss": 2.7666, + "step": 45063 + }, + { + "epoch": 2.098051539912005, + "grad_norm": 0.3502822610437976, + "learning_rate": 2.5093892011233915e-05, + "loss": 2.5658, + "step": 45064 + }, + { + "epoch": 2.098098098098098, + "grad_norm": 0.3495864560076088, + "learning_rate": 2.5091543283725588e-05, + "loss": 2.6555, + "step": 45065 + }, + { + "epoch": 2.098144656284191, + "grad_norm": 0.321245335834397, + "learning_rate": 2.508919462932111e-05, + "loss": 2.7008, + "step": 45066 + }, + { + "epoch": 2.098191214470284, + "grad_norm": 0.3527445705497415, + "learning_rate": 2.508684604802739e-05, + "loss": 2.8167, + "step": 45067 + }, + { + "epoch": 2.0982377726563772, + "grad_norm": 0.3308849049862579, + "learning_rate": 2.5084497539851338e-05, + "loss": 2.6813, + "step": 45068 + }, + { + "epoch": 2.0982843308424703, + 
"grad_norm": 0.33419657343712744, + "learning_rate": 2.5082149104799798e-05, + "loss": 2.5948, + "step": 45069 + }, + { + "epoch": 2.0983308890285635, + "grad_norm": 0.3386413405064453, + "learning_rate": 2.5079800742879733e-05, + "loss": 2.7085, + "step": 45070 + }, + { + "epoch": 2.0983774472146566, + "grad_norm": 0.3231217831199605, + "learning_rate": 2.5077452454097972e-05, + "loss": 2.6445, + "step": 45071 + }, + { + "epoch": 2.0984240054007497, + "grad_norm": 0.33080174378948674, + "learning_rate": 2.5075104238461446e-05, + "loss": 2.7757, + "step": 45072 + }, + { + "epoch": 2.098470563586843, + "grad_norm": 0.32817818941658344, + "learning_rate": 2.5072756095977023e-05, + "loss": 2.701, + "step": 45073 + }, + { + "epoch": 2.098517121772936, + "grad_norm": 0.34991797583256895, + "learning_rate": 2.5070408026651605e-05, + "loss": 2.5358, + "step": 45074 + }, + { + "epoch": 2.0985636799590286, + "grad_norm": 0.31997844430361266, + "learning_rate": 2.506806003049209e-05, + "loss": 2.6296, + "step": 45075 + }, + { + "epoch": 2.0986102381451217, + "grad_norm": 0.35855257970362164, + "learning_rate": 2.506571210750538e-05, + "loss": 2.6911, + "step": 45076 + }, + { + "epoch": 2.098656796331215, + "grad_norm": 0.34087964055444153, + "learning_rate": 2.5063364257698308e-05, + "loss": 2.7078, + "step": 45077 + }, + { + "epoch": 2.098703354517308, + "grad_norm": 0.3398507343614992, + "learning_rate": 2.5061016481077837e-05, + "loss": 2.6674, + "step": 45078 + }, + { + "epoch": 2.098749912703401, + "grad_norm": 0.34853678932496324, + "learning_rate": 2.505866877765082e-05, + "loss": 2.6991, + "step": 45079 + }, + { + "epoch": 2.098796470889494, + "grad_norm": 0.3222282466347389, + "learning_rate": 2.5056321147424143e-05, + "loss": 2.7394, + "step": 45080 + }, + { + "epoch": 2.0988430290755873, + "grad_norm": 0.3462234823169735, + "learning_rate": 2.5053973590404733e-05, + "loss": 2.7083, + "step": 45081 + }, + { + "epoch": 2.0988895872616804, + "grad_norm": 0.3572462095186008, + "learning_rate": 2.505162610659941e-05, + "loss": 2.6788, + "step": 45082 + }, + { + "epoch": 2.0989361454477735, + "grad_norm": 0.3316413387681127, + "learning_rate": 2.5049278696015142e-05, + "loss": 2.6881, + "step": 45083 + }, + { + "epoch": 2.0989827036338666, + "grad_norm": 0.32674167507080876, + "learning_rate": 2.5046931358658748e-05, + "loss": 2.6979, + "step": 45084 + }, + { + "epoch": 2.0990292618199593, + "grad_norm": 0.33766616879164646, + "learning_rate": 2.5044584094537195e-05, + "loss": 2.6027, + "step": 45085 + }, + { + "epoch": 2.0990758200060524, + "grad_norm": 0.3342379472283776, + "learning_rate": 2.50422369036573e-05, + "loss": 2.7299, + "step": 45086 + }, + { + "epoch": 2.0991223781921455, + "grad_norm": 0.3276313288738628, + "learning_rate": 2.5039889786025982e-05, + "loss": 2.6667, + "step": 45087 + }, + { + "epoch": 2.0991689363782386, + "grad_norm": 0.34552677571781026, + "learning_rate": 2.503754274165014e-05, + "loss": 2.5593, + "step": 45088 + }, + { + "epoch": 2.0992154945643318, + "grad_norm": 0.365328230391382, + "learning_rate": 2.5035195770536657e-05, + "loss": 2.711, + "step": 45089 + }, + { + "epoch": 2.099262052750425, + "grad_norm": 0.32537165555724473, + "learning_rate": 2.5032848872692384e-05, + "loss": 2.6712, + "step": 45090 + }, + { + "epoch": 2.099308610936518, + "grad_norm": 0.3620333431946748, + "learning_rate": 2.503050204812428e-05, + "loss": 2.565, + "step": 45091 + }, + { + "epoch": 2.099355169122611, + "grad_norm": 0.33069641286880513, + "learning_rate": 
2.502815529683917e-05, + "loss": 2.5752, + "step": 45092 + }, + { + "epoch": 2.0994017273087042, + "grad_norm": 0.3426731139546019, + "learning_rate": 2.502580861884396e-05, + "loss": 2.6747, + "step": 45093 + }, + { + "epoch": 2.0994482854947973, + "grad_norm": 0.329191878701944, + "learning_rate": 2.502346201414557e-05, + "loss": 2.6769, + "step": 45094 + }, + { + "epoch": 2.09949484368089, + "grad_norm": 0.3503522367637224, + "learning_rate": 2.5021115482750813e-05, + "loss": 2.6148, + "step": 45095 + }, + { + "epoch": 2.099541401866983, + "grad_norm": 0.3371412867060853, + "learning_rate": 2.501876902466666e-05, + "loss": 2.7217, + "step": 45096 + }, + { + "epoch": 2.0995879600530762, + "grad_norm": 0.3198480987348551, + "learning_rate": 2.5016422639899924e-05, + "loss": 2.6196, + "step": 45097 + }, + { + "epoch": 2.0996345182391694, + "grad_norm": 0.3540468179998977, + "learning_rate": 2.501407632845756e-05, + "loss": 2.6513, + "step": 45098 + }, + { + "epoch": 2.0996810764252625, + "grad_norm": 0.34837270506236107, + "learning_rate": 2.501173009034641e-05, + "loss": 2.7651, + "step": 45099 + }, + { + "epoch": 2.0997276346113556, + "grad_norm": 0.3631613385464193, + "learning_rate": 2.500938392557336e-05, + "loss": 2.6365, + "step": 45100 + }, + { + "epoch": 2.0997741927974487, + "grad_norm": 0.3343740588331845, + "learning_rate": 2.5007037834145308e-05, + "loss": 2.6551, + "step": 45101 + }, + { + "epoch": 2.099820750983542, + "grad_norm": 0.31927917311812126, + "learning_rate": 2.5004691816069143e-05, + "loss": 2.6773, + "step": 45102 + }, + { + "epoch": 2.099867309169635, + "grad_norm": 0.3679408709144773, + "learning_rate": 2.500234587135174e-05, + "loss": 2.7056, + "step": 45103 + }, + { + "epoch": 2.0999138673557276, + "grad_norm": 0.32748861671942253, + "learning_rate": 2.500000000000001e-05, + "loss": 2.6323, + "step": 45104 + }, + { + "epoch": 2.0999604255418207, + "grad_norm": 0.35514351915166265, + "learning_rate": 2.4997654202020797e-05, + "loss": 2.6533, + "step": 45105 + }, + { + "epoch": 2.100006983727914, + "grad_norm": 0.3557655100070094, + "learning_rate": 2.499530847742101e-05, + "loss": 2.7054, + "step": 45106 + }, + { + "epoch": 2.100053541914007, + "grad_norm": 0.32149026198376474, + "learning_rate": 2.4992962826207526e-05, + "loss": 2.6347, + "step": 45107 + }, + { + "epoch": 2.1001001001001, + "grad_norm": 0.3334611525475221, + "learning_rate": 2.4990617248387232e-05, + "loss": 2.6462, + "step": 45108 + }, + { + "epoch": 2.100146658286193, + "grad_norm": 0.3362860394669008, + "learning_rate": 2.4988271743967035e-05, + "loss": 2.6399, + "step": 45109 + }, + { + "epoch": 2.1001932164722863, + "grad_norm": 0.3598234071223841, + "learning_rate": 2.4985926312953756e-05, + "loss": 2.7429, + "step": 45110 + }, + { + "epoch": 2.1002397746583794, + "grad_norm": 0.354314160549181, + "learning_rate": 2.4983580955354358e-05, + "loss": 2.5969, + "step": 45111 + }, + { + "epoch": 2.1002863328444725, + "grad_norm": 0.31148255778164613, + "learning_rate": 2.4981235671175663e-05, + "loss": 2.6873, + "step": 45112 + }, + { + "epoch": 2.1003328910305656, + "grad_norm": 0.33392368183762, + "learning_rate": 2.497889046042458e-05, + "loss": 2.6208, + "step": 45113 + }, + { + "epoch": 2.1003794492166583, + "grad_norm": 0.3529067567883037, + "learning_rate": 2.4976545323107986e-05, + "loss": 2.6642, + "step": 45114 + }, + { + "epoch": 2.1004260074027514, + "grad_norm": 0.33077043644444537, + "learning_rate": 2.4974200259232765e-05, + "loss": 2.6179, + "step": 45115 + }, + { + "epoch": 
2.1004725655888445, + "grad_norm": 0.3078238403848382, + "learning_rate": 2.4971855268805805e-05, + "loss": 2.6459, + "step": 45116 + }, + { + "epoch": 2.1005191237749377, + "grad_norm": 0.3297988278443952, + "learning_rate": 2.4969510351834003e-05, + "loss": 2.6274, + "step": 45117 + }, + { + "epoch": 2.1005656819610308, + "grad_norm": 0.3434946904444145, + "learning_rate": 2.4967165508324202e-05, + "loss": 2.6962, + "step": 45118 + }, + { + "epoch": 2.100612240147124, + "grad_norm": 0.31498917549159766, + "learning_rate": 2.4964820738283302e-05, + "loss": 2.7012, + "step": 45119 + }, + { + "epoch": 2.100658798333217, + "grad_norm": 0.32412452843531414, + "learning_rate": 2.496247604171819e-05, + "loss": 2.7126, + "step": 45120 + }, + { + "epoch": 2.10070535651931, + "grad_norm": 0.33230379492520884, + "learning_rate": 2.496013141863574e-05, + "loss": 2.6405, + "step": 45121 + }, + { + "epoch": 2.1007519147054032, + "grad_norm": 0.33688396901955286, + "learning_rate": 2.495778686904286e-05, + "loss": 2.6634, + "step": 45122 + }, + { + "epoch": 2.1007984728914963, + "grad_norm": 0.36217521646873096, + "learning_rate": 2.495544239294637e-05, + "loss": 2.6502, + "step": 45123 + }, + { + "epoch": 2.100845031077589, + "grad_norm": 0.3570409064681297, + "learning_rate": 2.4953097990353224e-05, + "loss": 2.6723, + "step": 45124 + }, + { + "epoch": 2.100891589263682, + "grad_norm": 0.3423911500180567, + "learning_rate": 2.4950753661270236e-05, + "loss": 2.703, + "step": 45125 + }, + { + "epoch": 2.1009381474497753, + "grad_norm": 0.35570697349814034, + "learning_rate": 2.4948409405704354e-05, + "loss": 2.6806, + "step": 45126 + }, + { + "epoch": 2.1009847056358684, + "grad_norm": 0.3366048624600819, + "learning_rate": 2.4946065223662403e-05, + "loss": 2.5376, + "step": 45127 + }, + { + "epoch": 2.1010312638219615, + "grad_norm": 0.3391048239730504, + "learning_rate": 2.494372111515128e-05, + "loss": 2.6202, + "step": 45128 + }, + { + "epoch": 2.1010778220080546, + "grad_norm": 0.3430566912779215, + "learning_rate": 2.494137708017787e-05, + "loss": 2.6189, + "step": 45129 + }, + { + "epoch": 2.1011243801941477, + "grad_norm": 0.32656997437976376, + "learning_rate": 2.4939033118749066e-05, + "loss": 2.715, + "step": 45130 + }, + { + "epoch": 2.101170938380241, + "grad_norm": 0.3199713855340992, + "learning_rate": 2.4936689230871713e-05, + "loss": 2.6171, + "step": 45131 + }, + { + "epoch": 2.101217496566334, + "grad_norm": 0.34028071551354655, + "learning_rate": 2.4934345416552707e-05, + "loss": 2.7185, + "step": 45132 + }, + { + "epoch": 2.101264054752427, + "grad_norm": 0.3545754279485253, + "learning_rate": 2.4932001675798932e-05, + "loss": 2.6537, + "step": 45133 + }, + { + "epoch": 2.1013106129385197, + "grad_norm": 0.3189299375560507, + "learning_rate": 2.4929658008617257e-05, + "loss": 2.7077, + "step": 45134 + }, + { + "epoch": 2.101357171124613, + "grad_norm": 0.3392553223759964, + "learning_rate": 2.492731441501459e-05, + "loss": 2.6883, + "step": 45135 + }, + { + "epoch": 2.101403729310706, + "grad_norm": 0.33375760939078114, + "learning_rate": 2.4924970894997747e-05, + "loss": 2.5816, + "step": 45136 + }, + { + "epoch": 2.101450287496799, + "grad_norm": 0.3461339694236508, + "learning_rate": 2.4922627448573678e-05, + "loss": 2.696, + "step": 45137 + }, + { + "epoch": 2.101496845682892, + "grad_norm": 0.3941539487877911, + "learning_rate": 2.492028407574919e-05, + "loss": 2.6231, + "step": 45138 + }, + { + "epoch": 2.1015434038689853, + "grad_norm": 0.32975051054480864, + 
"learning_rate": 2.4917940776531234e-05, + "loss": 2.5346, + "step": 45139 + }, + { + "epoch": 2.1015899620550784, + "grad_norm": 0.3717977165587847, + "learning_rate": 2.4915597550926635e-05, + "loss": 2.6959, + "step": 45140 + }, + { + "epoch": 2.1016365202411715, + "grad_norm": 0.3589184839834526, + "learning_rate": 2.4913254398942288e-05, + "loss": 2.6559, + "step": 45141 + }, + { + "epoch": 2.1016830784272647, + "grad_norm": 0.3196736459661409, + "learning_rate": 2.4910911320585063e-05, + "loss": 2.6873, + "step": 45142 + }, + { + "epoch": 2.1017296366133573, + "grad_norm": 0.3229421075030799, + "learning_rate": 2.4908568315861842e-05, + "loss": 2.7252, + "step": 45143 + }, + { + "epoch": 2.1017761947994504, + "grad_norm": 0.3572455569257107, + "learning_rate": 2.490622538477952e-05, + "loss": 2.6338, + "step": 45144 + }, + { + "epoch": 2.1018227529855436, + "grad_norm": 0.3497690485672353, + "learning_rate": 2.490388252734494e-05, + "loss": 2.5784, + "step": 45145 + }, + { + "epoch": 2.1018693111716367, + "grad_norm": 0.3408559515697434, + "learning_rate": 2.4901539743564988e-05, + "loss": 2.5989, + "step": 45146 + }, + { + "epoch": 2.10191586935773, + "grad_norm": 0.3449496968840054, + "learning_rate": 2.4899197033446548e-05, + "loss": 2.675, + "step": 45147 + }, + { + "epoch": 2.101962427543823, + "grad_norm": 0.35462465963615025, + "learning_rate": 2.4896854396996505e-05, + "loss": 2.6508, + "step": 45148 + }, + { + "epoch": 2.102008985729916, + "grad_norm": 0.33209457813588034, + "learning_rate": 2.4894511834221683e-05, + "loss": 2.6643, + "step": 45149 + }, + { + "epoch": 2.102055543916009, + "grad_norm": 0.3503871909238979, + "learning_rate": 2.489216934512904e-05, + "loss": 2.659, + "step": 45150 + }, + { + "epoch": 2.1021021021021022, + "grad_norm": 0.36868946765880267, + "learning_rate": 2.4889826929725367e-05, + "loss": 2.6187, + "step": 45151 + }, + { + "epoch": 2.1021486602881954, + "grad_norm": 0.34124456023602884, + "learning_rate": 2.4887484588017612e-05, + "loss": 2.662, + "step": 45152 + }, + { + "epoch": 2.102195218474288, + "grad_norm": 0.3550534856019812, + "learning_rate": 2.48851423200126e-05, + "loss": 2.6202, + "step": 45153 + }, + { + "epoch": 2.102241776660381, + "grad_norm": 0.34793472307865475, + "learning_rate": 2.488280012571722e-05, + "loss": 2.643, + "step": 45154 + }, + { + "epoch": 2.1022883348464743, + "grad_norm": 0.3710046715199044, + "learning_rate": 2.488045800513835e-05, + "loss": 2.7356, + "step": 45155 + }, + { + "epoch": 2.1023348930325674, + "grad_norm": 0.34117378757514394, + "learning_rate": 2.487811595828286e-05, + "loss": 2.6944, + "step": 45156 + }, + { + "epoch": 2.1023814512186605, + "grad_norm": 0.3359082213274879, + "learning_rate": 2.487577398515764e-05, + "loss": 2.623, + "step": 45157 + }, + { + "epoch": 2.1024280094047536, + "grad_norm": 0.34450547529709424, + "learning_rate": 2.4873432085769532e-05, + "loss": 2.7187, + "step": 45158 + }, + { + "epoch": 2.1024745675908467, + "grad_norm": 0.37281112573038355, + "learning_rate": 2.4871090260125423e-05, + "loss": 2.7007, + "step": 45159 + }, + { + "epoch": 2.10252112577694, + "grad_norm": 0.35709835460081557, + "learning_rate": 2.4868748508232192e-05, + "loss": 2.6346, + "step": 45160 + }, + { + "epoch": 2.102567683963033, + "grad_norm": 0.3295380486329058, + "learning_rate": 2.4866406830096706e-05, + "loss": 2.7169, + "step": 45161 + }, + { + "epoch": 2.102614242149126, + "grad_norm": 0.3435245709048211, + "learning_rate": 2.486406522572584e-05, + "loss": 2.544, + "step": 45162 + 
}, + { + "epoch": 2.1026608003352187, + "grad_norm": 0.37628811165170584, + "learning_rate": 2.4861723695126488e-05, + "loss": 2.6774, + "step": 45163 + }, + { + "epoch": 2.102707358521312, + "grad_norm": 0.34754785608306954, + "learning_rate": 2.485938223830546e-05, + "loss": 2.663, + "step": 45164 + }, + { + "epoch": 2.102753916707405, + "grad_norm": 0.33056950480762964, + "learning_rate": 2.4857040855269708e-05, + "loss": 2.6065, + "step": 45165 + }, + { + "epoch": 2.102800474893498, + "grad_norm": 0.33522453662213914, + "learning_rate": 2.4854699546026045e-05, + "loss": 2.547, + "step": 45166 + }, + { + "epoch": 2.102847033079591, + "grad_norm": 0.3592017761432481, + "learning_rate": 2.485235831058136e-05, + "loss": 2.6402, + "step": 45167 + }, + { + "epoch": 2.1028935912656843, + "grad_norm": 0.3342547936594698, + "learning_rate": 2.485001714894253e-05, + "loss": 2.6027, + "step": 45168 + }, + { + "epoch": 2.1029401494517774, + "grad_norm": 0.3352072559116607, + "learning_rate": 2.4847676061116416e-05, + "loss": 2.6585, + "step": 45169 + }, + { + "epoch": 2.1029867076378705, + "grad_norm": 0.35739037163542836, + "learning_rate": 2.4845335047109912e-05, + "loss": 2.6403, + "step": 45170 + }, + { + "epoch": 2.1030332658239637, + "grad_norm": 0.3060530915675934, + "learning_rate": 2.4842994106929862e-05, + "loss": 2.5712, + "step": 45171 + }, + { + "epoch": 2.1030798240100568, + "grad_norm": 0.3521589271649328, + "learning_rate": 2.484065324058314e-05, + "loss": 2.7507, + "step": 45172 + }, + { + "epoch": 2.1031263821961494, + "grad_norm": 0.3394035622853044, + "learning_rate": 2.4838312448076623e-05, + "loss": 2.6595, + "step": 45173 + }, + { + "epoch": 2.1031729403822426, + "grad_norm": 0.34202519864135744, + "learning_rate": 2.483597172941718e-05, + "loss": 2.6881, + "step": 45174 + }, + { + "epoch": 2.1032194985683357, + "grad_norm": 0.33922863988197643, + "learning_rate": 2.483363108461168e-05, + "loss": 2.6888, + "step": 45175 + }, + { + "epoch": 2.103266056754429, + "grad_norm": 0.3507718869243966, + "learning_rate": 2.483129051366701e-05, + "loss": 2.6922, + "step": 45176 + }, + { + "epoch": 2.103312614940522, + "grad_norm": 0.3191893199421474, + "learning_rate": 2.4828950016589993e-05, + "loss": 2.538, + "step": 45177 + }, + { + "epoch": 2.103359173126615, + "grad_norm": 0.3326522283081918, + "learning_rate": 2.4826609593387557e-05, + "loss": 2.6746, + "step": 45178 + }, + { + "epoch": 2.103405731312708, + "grad_norm": 0.3405932660666071, + "learning_rate": 2.4824269244066527e-05, + "loss": 2.7154, + "step": 45179 + }, + { + "epoch": 2.1034522894988013, + "grad_norm": 0.35288707416445947, + "learning_rate": 2.482192896863379e-05, + "loss": 2.6003, + "step": 45180 + }, + { + "epoch": 2.1034988476848944, + "grad_norm": 0.33676821997765, + "learning_rate": 2.48195887670962e-05, + "loss": 2.7176, + "step": 45181 + }, + { + "epoch": 2.103545405870987, + "grad_norm": 0.3558635952415615, + "learning_rate": 2.481724863946065e-05, + "loss": 2.6204, + "step": 45182 + }, + { + "epoch": 2.10359196405708, + "grad_norm": 0.3337732504088131, + "learning_rate": 2.4814908585734004e-05, + "loss": 2.6567, + "step": 45183 + }, + { + "epoch": 2.1036385222431733, + "grad_norm": 0.3816120114653601, + "learning_rate": 2.4812568605923104e-05, + "loss": 2.7199, + "step": 45184 + }, + { + "epoch": 2.1036850804292664, + "grad_norm": 0.3403233013681439, + "learning_rate": 2.4810228700034836e-05, + "loss": 2.6483, + "step": 45185 + }, + { + "epoch": 2.1037316386153595, + "grad_norm": 0.34193014465933796, + 
"learning_rate": 2.4807888868076062e-05, + "loss": 2.6418, + "step": 45186 + }, + { + "epoch": 2.1037781968014526, + "grad_norm": 0.33678371604058377, + "learning_rate": 2.4805549110053655e-05, + "loss": 2.7117, + "step": 45187 + }, + { + "epoch": 2.1038247549875457, + "grad_norm": 0.34351450050408555, + "learning_rate": 2.4803209425974476e-05, + "loss": 2.6529, + "step": 45188 + }, + { + "epoch": 2.103871313173639, + "grad_norm": 0.35716734069226463, + "learning_rate": 2.4800869815845418e-05, + "loss": 2.6282, + "step": 45189 + }, + { + "epoch": 2.103917871359732, + "grad_norm": 0.3451642431505931, + "learning_rate": 2.4798530279673283e-05, + "loss": 2.7145, + "step": 45190 + }, + { + "epoch": 2.103964429545825, + "grad_norm": 0.35075240977718575, + "learning_rate": 2.479619081746502e-05, + "loss": 2.5414, + "step": 45191 + }, + { + "epoch": 2.104010987731918, + "grad_norm": 0.3675485727427635, + "learning_rate": 2.4793851429227433e-05, + "loss": 2.6744, + "step": 45192 + }, + { + "epoch": 2.104057545918011, + "grad_norm": 0.35362733646082833, + "learning_rate": 2.4791512114967407e-05, + "loss": 2.7504, + "step": 45193 + }, + { + "epoch": 2.104104104104104, + "grad_norm": 0.3619063622399197, + "learning_rate": 2.4789172874691812e-05, + "loss": 2.6664, + "step": 45194 + }, + { + "epoch": 2.104150662290197, + "grad_norm": 0.3288372558837658, + "learning_rate": 2.478683370840751e-05, + "loss": 2.6123, + "step": 45195 + }, + { + "epoch": 2.10419722047629, + "grad_norm": 0.3510030138130677, + "learning_rate": 2.478449461612139e-05, + "loss": 2.6946, + "step": 45196 + }, + { + "epoch": 2.1042437786623833, + "grad_norm": 0.3311175355491761, + "learning_rate": 2.478215559784025e-05, + "loss": 2.6455, + "step": 45197 + }, + { + "epoch": 2.1042903368484764, + "grad_norm": 0.34916910381916644, + "learning_rate": 2.4779816653571042e-05, + "loss": 2.627, + "step": 45198 + }, + { + "epoch": 2.1043368950345696, + "grad_norm": 0.31683489715655294, + "learning_rate": 2.4777477783320564e-05, + "loss": 2.5897, + "step": 45199 + }, + { + "epoch": 2.1043834532206627, + "grad_norm": 0.3500858446409669, + "learning_rate": 2.477513898709571e-05, + "loss": 2.6065, + "step": 45200 + }, + { + "epoch": 2.104430011406756, + "grad_norm": 0.3584901250275675, + "learning_rate": 2.4772800264903328e-05, + "loss": 2.6045, + "step": 45201 + }, + { + "epoch": 2.1044765695928485, + "grad_norm": 0.3515805273798815, + "learning_rate": 2.477046161675032e-05, + "loss": 2.6363, + "step": 45202 + }, + { + "epoch": 2.1045231277789416, + "grad_norm": 0.3361614903623329, + "learning_rate": 2.476812304264348e-05, + "loss": 2.5828, + "step": 45203 + }, + { + "epoch": 2.1045696859650347, + "grad_norm": 0.3473018014101771, + "learning_rate": 2.4765784542589755e-05, + "loss": 2.5487, + "step": 45204 + }, + { + "epoch": 2.104616244151128, + "grad_norm": 0.3496546088720322, + "learning_rate": 2.476344611659594e-05, + "loss": 2.7066, + "step": 45205 + }, + { + "epoch": 2.104662802337221, + "grad_norm": 0.365165936959707, + "learning_rate": 2.476110776466893e-05, + "loss": 2.6158, + "step": 45206 + }, + { + "epoch": 2.104709360523314, + "grad_norm": 0.33154329438817753, + "learning_rate": 2.475876948681558e-05, + "loss": 2.589, + "step": 45207 + }, + { + "epoch": 2.104755918709407, + "grad_norm": 0.3656731298692746, + "learning_rate": 2.4756431283042758e-05, + "loss": 2.6806, + "step": 45208 + }, + { + "epoch": 2.1048024768955003, + "grad_norm": 0.33262200039554074, + "learning_rate": 2.475409315335734e-05, + "loss": 2.6327, + "step": 45209 + 
}, + { + "epoch": 2.1048490350815934, + "grad_norm": 0.343833947550231, + "learning_rate": 2.475175509776613e-05, + "loss": 2.6487, + "step": 45210 + }, + { + "epoch": 2.1048955932676865, + "grad_norm": 0.3667510569153934, + "learning_rate": 2.4749417116276076e-05, + "loss": 2.7673, + "step": 45211 + }, + { + "epoch": 2.104942151453779, + "grad_norm": 0.3487041913875717, + "learning_rate": 2.4747079208893976e-05, + "loss": 2.6388, + "step": 45212 + }, + { + "epoch": 2.1049887096398723, + "grad_norm": 0.3417901181717911, + "learning_rate": 2.4744741375626707e-05, + "loss": 2.6221, + "step": 45213 + }, + { + "epoch": 2.1050352678259654, + "grad_norm": 0.3444913415263482, + "learning_rate": 2.4742403616481137e-05, + "loss": 2.5761, + "step": 45214 + }, + { + "epoch": 2.1050818260120585, + "grad_norm": 0.3517869309961652, + "learning_rate": 2.474006593146412e-05, + "loss": 2.7023, + "step": 45215 + }, + { + "epoch": 2.1051283841981516, + "grad_norm": 0.3465686121630944, + "learning_rate": 2.473772832058252e-05, + "loss": 2.6163, + "step": 45216 + }, + { + "epoch": 2.1051749423842447, + "grad_norm": 0.3362380302963757, + "learning_rate": 2.4735390783843222e-05, + "loss": 2.6345, + "step": 45217 + }, + { + "epoch": 2.105221500570338, + "grad_norm": 0.35341528939759614, + "learning_rate": 2.473305332125304e-05, + "loss": 2.6027, + "step": 45218 + }, + { + "epoch": 2.105268058756431, + "grad_norm": 0.3586424432044665, + "learning_rate": 2.473071593281886e-05, + "loss": 2.6119, + "step": 45219 + }, + { + "epoch": 2.105314616942524, + "grad_norm": 0.3259049107560995, + "learning_rate": 2.4728378618547543e-05, + "loss": 2.6218, + "step": 45220 + }, + { + "epoch": 2.105361175128617, + "grad_norm": 0.3609553788972007, + "learning_rate": 2.472604137844594e-05, + "loss": 2.6312, + "step": 45221 + }, + { + "epoch": 2.10540773331471, + "grad_norm": 0.33496850796720884, + "learning_rate": 2.472370421252094e-05, + "loss": 2.5933, + "step": 45222 + }, + { + "epoch": 2.105454291500803, + "grad_norm": 0.352763151538974, + "learning_rate": 2.4721367120779332e-05, + "loss": 2.6732, + "step": 45223 + }, + { + "epoch": 2.105500849686896, + "grad_norm": 0.33209504041781623, + "learning_rate": 2.4719030103228065e-05, + "loss": 2.6717, + "step": 45224 + }, + { + "epoch": 2.105547407872989, + "grad_norm": 0.34997010435617, + "learning_rate": 2.4716693159873932e-05, + "loss": 2.6788, + "step": 45225 + }, + { + "epoch": 2.1055939660590823, + "grad_norm": 0.3385155051802127, + "learning_rate": 2.471435629072381e-05, + "loss": 2.6036, + "step": 45226 + }, + { + "epoch": 2.1056405242451754, + "grad_norm": 0.341604360157267, + "learning_rate": 2.471201949578457e-05, + "loss": 2.5406, + "step": 45227 + }, + { + "epoch": 2.1056870824312686, + "grad_norm": 0.33195655735178364, + "learning_rate": 2.4709682775063057e-05, + "loss": 2.66, + "step": 45228 + }, + { + "epoch": 2.1057336406173617, + "grad_norm": 0.3198075222103741, + "learning_rate": 2.4707346128566132e-05, + "loss": 2.6376, + "step": 45229 + }, + { + "epoch": 2.105780198803455, + "grad_norm": 0.33162534421254913, + "learning_rate": 2.4705009556300678e-05, + "loss": 2.5585, + "step": 45230 + }, + { + "epoch": 2.105826756989548, + "grad_norm": 0.35142455943145545, + "learning_rate": 2.4702673058273508e-05, + "loss": 2.6593, + "step": 45231 + }, + { + "epoch": 2.1058733151756406, + "grad_norm": 0.3258212397645377, + "learning_rate": 2.47003366344915e-05, + "loss": 2.6043, + "step": 45232 + }, + { + "epoch": 2.1059198733617337, + "grad_norm": 0.3674691564043227, + 
"learning_rate": 2.4698000284961507e-05, + "loss": 2.7122, + "step": 45233 + }, + { + "epoch": 2.105966431547827, + "grad_norm": 0.33485152491597386, + "learning_rate": 2.46956640096904e-05, + "loss": 2.7663, + "step": 45234 + }, + { + "epoch": 2.10601298973392, + "grad_norm": 0.3595056387468533, + "learning_rate": 2.4693327808685042e-05, + "loss": 2.6074, + "step": 45235 + }, + { + "epoch": 2.106059547920013, + "grad_norm": 0.3327961454386107, + "learning_rate": 2.469099168195223e-05, + "loss": 2.6582, + "step": 45236 + }, + { + "epoch": 2.106106106106106, + "grad_norm": 0.33753577301264537, + "learning_rate": 2.4688655629498908e-05, + "loss": 2.5907, + "step": 45237 + }, + { + "epoch": 2.1061526642921993, + "grad_norm": 0.33917124055347847, + "learning_rate": 2.468631965133187e-05, + "loss": 2.6901, + "step": 45238 + }, + { + "epoch": 2.1061992224782924, + "grad_norm": 0.3660791447814612, + "learning_rate": 2.4683983747457983e-05, + "loss": 2.5856, + "step": 45239 + }, + { + "epoch": 2.1062457806643855, + "grad_norm": 0.3404363757995464, + "learning_rate": 2.4681647917884115e-05, + "loss": 2.5346, + "step": 45240 + }, + { + "epoch": 2.106292338850478, + "grad_norm": 0.3273633103777549, + "learning_rate": 2.4679312162617114e-05, + "loss": 2.627, + "step": 45241 + }, + { + "epoch": 2.1063388970365713, + "grad_norm": 0.33766941407505136, + "learning_rate": 2.4676976481663837e-05, + "loss": 2.6533, + "step": 45242 + }, + { + "epoch": 2.1063854552226644, + "grad_norm": 0.3330182249772023, + "learning_rate": 2.4674640875031163e-05, + "loss": 2.6092, + "step": 45243 + }, + { + "epoch": 2.1064320134087575, + "grad_norm": 0.34751766543202417, + "learning_rate": 2.46723053427259e-05, + "loss": 2.7457, + "step": 45244 + }, + { + "epoch": 2.1064785715948506, + "grad_norm": 0.3525795114183976, + "learning_rate": 2.4669969884754924e-05, + "loss": 2.5856, + "step": 45245 + }, + { + "epoch": 2.1065251297809438, + "grad_norm": 0.3142842373720427, + "learning_rate": 2.4667634501125092e-05, + "loss": 2.6027, + "step": 45246 + }, + { + "epoch": 2.106571687967037, + "grad_norm": 0.32858087147395104, + "learning_rate": 2.4665299191843262e-05, + "loss": 2.5684, + "step": 45247 + }, + { + "epoch": 2.10661824615313, + "grad_norm": 0.3702409810315064, + "learning_rate": 2.4662963956916295e-05, + "loss": 2.7234, + "step": 45248 + }, + { + "epoch": 2.106664804339223, + "grad_norm": 0.3270183049288298, + "learning_rate": 2.4660628796350997e-05, + "loss": 2.6505, + "step": 45249 + }, + { + "epoch": 2.106711362525316, + "grad_norm": 0.34468411877253424, + "learning_rate": 2.46582937101543e-05, + "loss": 2.6594, + "step": 45250 + }, + { + "epoch": 2.106757920711409, + "grad_norm": 0.36176606290906216, + "learning_rate": 2.465595869833297e-05, + "loss": 2.6141, + "step": 45251 + }, + { + "epoch": 2.106804478897502, + "grad_norm": 0.34733777434685004, + "learning_rate": 2.4653623760893945e-05, + "loss": 2.6761, + "step": 45252 + }, + { + "epoch": 2.106851037083595, + "grad_norm": 0.32792524686088964, + "learning_rate": 2.465128889784402e-05, + "loss": 2.6085, + "step": 45253 + }, + { + "epoch": 2.1068975952696882, + "grad_norm": 0.3543844602918597, + "learning_rate": 2.4648954109190063e-05, + "loss": 2.7117, + "step": 45254 + }, + { + "epoch": 2.1069441534557813, + "grad_norm": 0.338279992087772, + "learning_rate": 2.464661939493893e-05, + "loss": 2.6848, + "step": 45255 + }, + { + "epoch": 2.1069907116418745, + "grad_norm": 0.3336953683309103, + "learning_rate": 2.4644284755097485e-05, + "loss": 2.6264, + "step": 45256 
+ }, + { + "epoch": 2.1070372698279676, + "grad_norm": 0.33939188427874906, + "learning_rate": 2.4641950189672553e-05, + "loss": 2.5923, + "step": 45257 + }, + { + "epoch": 2.1070838280140607, + "grad_norm": 0.3276842214558801, + "learning_rate": 2.4639615698670998e-05, + "loss": 2.5558, + "step": 45258 + }, + { + "epoch": 2.107130386200154, + "grad_norm": 0.3480573915898875, + "learning_rate": 2.4637281282099668e-05, + "loss": 2.7364, + "step": 45259 + }, + { + "epoch": 2.107176944386247, + "grad_norm": 0.33557775352331154, + "learning_rate": 2.4634946939965424e-05, + "loss": 2.5248, + "step": 45260 + }, + { + "epoch": 2.1072235025723396, + "grad_norm": 0.3511260790947617, + "learning_rate": 2.463261267227513e-05, + "loss": 2.699, + "step": 45261 + }, + { + "epoch": 2.1072700607584327, + "grad_norm": 0.33518505954804906, + "learning_rate": 2.463027847903558e-05, + "loss": 2.6497, + "step": 45262 + }, + { + "epoch": 2.107316618944526, + "grad_norm": 0.3391955243807427, + "learning_rate": 2.4627944360253696e-05, + "loss": 2.6333, + "step": 45263 + }, + { + "epoch": 2.107363177130619, + "grad_norm": 0.33559664619917007, + "learning_rate": 2.4625610315936267e-05, + "loss": 2.6733, + "step": 45264 + }, + { + "epoch": 2.107409735316712, + "grad_norm": 0.3302208334472406, + "learning_rate": 2.46232763460902e-05, + "loss": 2.6062, + "step": 45265 + }, + { + "epoch": 2.107456293502805, + "grad_norm": 0.32858703965299874, + "learning_rate": 2.4620942450722306e-05, + "loss": 2.568, + "step": 45266 + }, + { + "epoch": 2.1075028516888983, + "grad_norm": 0.35029583549019483, + "learning_rate": 2.4618608629839446e-05, + "loss": 2.7559, + "step": 45267 + }, + { + "epoch": 2.1075494098749914, + "grad_norm": 0.3476904054104544, + "learning_rate": 2.4616274883448465e-05, + "loss": 2.6345, + "step": 45268 + }, + { + "epoch": 2.1075959680610845, + "grad_norm": 0.32952841637951097, + "learning_rate": 2.461394121155622e-05, + "loss": 2.7192, + "step": 45269 + }, + { + "epoch": 2.1076425262471776, + "grad_norm": 0.3308388218832927, + "learning_rate": 2.4611607614169572e-05, + "loss": 2.7012, + "step": 45270 + }, + { + "epoch": 2.1076890844332703, + "grad_norm": 0.31300271422968506, + "learning_rate": 2.4609274091295336e-05, + "loss": 2.5926, + "step": 45271 + }, + { + "epoch": 2.1077356426193634, + "grad_norm": 0.338138835638261, + "learning_rate": 2.4606940642940378e-05, + "loss": 2.6433, + "step": 45272 + }, + { + "epoch": 2.1077822008054565, + "grad_norm": 0.33123016616477, + "learning_rate": 2.460460726911155e-05, + "loss": 2.6736, + "step": 45273 + }, + { + "epoch": 2.1078287589915496, + "grad_norm": 0.35179777062979484, + "learning_rate": 2.460227396981572e-05, + "loss": 2.6999, + "step": 45274 + }, + { + "epoch": 2.1078753171776428, + "grad_norm": 0.3361953094308311, + "learning_rate": 2.4599940745059675e-05, + "loss": 2.7286, + "step": 45275 + }, + { + "epoch": 2.107921875363736, + "grad_norm": 0.3159011248374907, + "learning_rate": 2.4597607594850336e-05, + "loss": 2.593, + "step": 45276 + }, + { + "epoch": 2.107968433549829, + "grad_norm": 0.355689692450154, + "learning_rate": 2.4595274519194476e-05, + "loss": 2.6572, + "step": 45277 + }, + { + "epoch": 2.108014991735922, + "grad_norm": 0.363020341942865, + "learning_rate": 2.459294151809902e-05, + "loss": 2.7166, + "step": 45278 + }, + { + "epoch": 2.108061549922015, + "grad_norm": 0.31968361157024566, + "learning_rate": 2.4590608591570768e-05, + "loss": 2.6836, + "step": 45279 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 0.33163204734459806, 
+ "learning_rate": 2.4588275739616567e-05, + "loss": 2.6973, + "step": 45280 + }, + { + "epoch": 2.108154666294201, + "grad_norm": 0.3496593408890798, + "learning_rate": 2.458594296224328e-05, + "loss": 2.6221, + "step": 45281 + }, + { + "epoch": 2.108201224480294, + "grad_norm": 0.36198647408778906, + "learning_rate": 2.458361025945774e-05, + "loss": 2.5643, + "step": 45282 + }, + { + "epoch": 2.1082477826663872, + "grad_norm": 0.3814008674746284, + "learning_rate": 2.4581277631266823e-05, + "loss": 2.6822, + "step": 45283 + }, + { + "epoch": 2.1082943408524804, + "grad_norm": 0.3432930596014478, + "learning_rate": 2.4578945077677335e-05, + "loss": 2.5914, + "step": 45284 + }, + { + "epoch": 2.1083408990385735, + "grad_norm": 0.37127475255216397, + "learning_rate": 2.4576612598696135e-05, + "loss": 2.6736, + "step": 45285 + }, + { + "epoch": 2.1083874572246666, + "grad_norm": 0.35283846300586047, + "learning_rate": 2.4574280194330075e-05, + "loss": 2.5955, + "step": 45286 + }, + { + "epoch": 2.1084340154107597, + "grad_norm": 0.34135257787392725, + "learning_rate": 2.4571947864585997e-05, + "loss": 2.7236, + "step": 45287 + }, + { + "epoch": 2.108480573596853, + "grad_norm": 0.3654915786689819, + "learning_rate": 2.4569615609470747e-05, + "loss": 2.6569, + "step": 45288 + }, + { + "epoch": 2.108527131782946, + "grad_norm": 0.3686092758144071, + "learning_rate": 2.4567283428991188e-05, + "loss": 2.6837, + "step": 45289 + }, + { + "epoch": 2.1085736899690386, + "grad_norm": 0.37712646726838434, + "learning_rate": 2.4564951323154107e-05, + "loss": 2.6024, + "step": 45290 + }, + { + "epoch": 2.1086202481551317, + "grad_norm": 0.34931493211194964, + "learning_rate": 2.4562619291966432e-05, + "loss": 2.5972, + "step": 45291 + }, + { + "epoch": 2.108666806341225, + "grad_norm": 0.364918694258348, + "learning_rate": 2.456028733543494e-05, + "loss": 2.6663, + "step": 45292 + }, + { + "epoch": 2.108713364527318, + "grad_norm": 0.3566598021478309, + "learning_rate": 2.45579554535665e-05, + "loss": 2.7173, + "step": 45293 + }, + { + "epoch": 2.108759922713411, + "grad_norm": 0.363725311782227, + "learning_rate": 2.4555623646367954e-05, + "loss": 2.6763, + "step": 45294 + }, + { + "epoch": 2.108806480899504, + "grad_norm": 0.34807905264633654, + "learning_rate": 2.4553291913846143e-05, + "loss": 2.6454, + "step": 45295 + }, + { + "epoch": 2.1088530390855973, + "grad_norm": 0.3684728332065873, + "learning_rate": 2.4550960256007932e-05, + "loss": 2.5668, + "step": 45296 + }, + { + "epoch": 2.1088995972716904, + "grad_norm": 0.3591238161804017, + "learning_rate": 2.454862867286013e-05, + "loss": 2.5761, + "step": 45297 + }, + { + "epoch": 2.1089461554577835, + "grad_norm": 0.3374957121194091, + "learning_rate": 2.4546297164409594e-05, + "loss": 2.6875, + "step": 45298 + }, + { + "epoch": 2.1089927136438766, + "grad_norm": 0.3610508539525218, + "learning_rate": 2.4543965730663172e-05, + "loss": 2.7187, + "step": 45299 + }, + { + "epoch": 2.1090392718299693, + "grad_norm": 0.35094833981546486, + "learning_rate": 2.4541634371627698e-05, + "loss": 2.6914, + "step": 45300 + }, + { + "epoch": 2.1090858300160624, + "grad_norm": 0.3273207960106233, + "learning_rate": 2.4539303087310017e-05, + "loss": 2.586, + "step": 45301 + }, + { + "epoch": 2.1091323882021555, + "grad_norm": 0.3325959818667824, + "learning_rate": 2.4536971877716997e-05, + "loss": 2.6605, + "step": 45302 + }, + { + "epoch": 2.1091789463882487, + "grad_norm": 0.33746709144234577, + "learning_rate": 2.453464074285542e-05, + "loss": 2.6981, + 
"step": 45303 + }, + { + "epoch": 2.1092255045743418, + "grad_norm": 0.3467128112792361, + "learning_rate": 2.4532309682732195e-05, + "loss": 2.6638, + "step": 45304 + }, + { + "epoch": 2.109272062760435, + "grad_norm": 0.35822149031614714, + "learning_rate": 2.45299786973541e-05, + "loss": 2.7734, + "step": 45305 + }, + { + "epoch": 2.109318620946528, + "grad_norm": 0.34526474938668644, + "learning_rate": 2.452764778672804e-05, + "loss": 2.6416, + "step": 45306 + }, + { + "epoch": 2.109365179132621, + "grad_norm": 0.3302112456704471, + "learning_rate": 2.452531695086081e-05, + "loss": 2.6657, + "step": 45307 + }, + { + "epoch": 2.1094117373187142, + "grad_norm": 0.3616813572151895, + "learning_rate": 2.4522986189759267e-05, + "loss": 2.634, + "step": 45308 + }, + { + "epoch": 2.1094582955048073, + "grad_norm": 0.3238057709267074, + "learning_rate": 2.4520655503430262e-05, + "loss": 2.6552, + "step": 45309 + }, + { + "epoch": 2.1095048536909, + "grad_norm": 0.35007716717326126, + "learning_rate": 2.4518324891880612e-05, + "loss": 2.7071, + "step": 45310 + }, + { + "epoch": 2.109551411876993, + "grad_norm": 0.32582534739992824, + "learning_rate": 2.451599435511716e-05, + "loss": 2.6479, + "step": 45311 + }, + { + "epoch": 2.1095979700630862, + "grad_norm": 0.3417622459087367, + "learning_rate": 2.451366389314676e-05, + "loss": 2.5325, + "step": 45312 + }, + { + "epoch": 2.1096445282491794, + "grad_norm": 0.3499938325418574, + "learning_rate": 2.4511333505976243e-05, + "loss": 2.6029, + "step": 45313 + }, + { + "epoch": 2.1096910864352725, + "grad_norm": 0.3191128552611995, + "learning_rate": 2.450900319361245e-05, + "loss": 2.5894, + "step": 45314 + }, + { + "epoch": 2.1097376446213656, + "grad_norm": 0.3565661422150764, + "learning_rate": 2.4506672956062243e-05, + "loss": 2.693, + "step": 45315 + }, + { + "epoch": 2.1097842028074587, + "grad_norm": 0.34587546251928064, + "learning_rate": 2.4504342793332404e-05, + "loss": 2.6198, + "step": 45316 + }, + { + "epoch": 2.109830760993552, + "grad_norm": 0.35905564421236164, + "learning_rate": 2.4502012705429838e-05, + "loss": 2.6522, + "step": 45317 + }, + { + "epoch": 2.109877319179645, + "grad_norm": 0.3343015398307169, + "learning_rate": 2.4499682692361315e-05, + "loss": 2.6114, + "step": 45318 + }, + { + "epoch": 2.1099238773657376, + "grad_norm": 0.3300252315994384, + "learning_rate": 2.4497352754133757e-05, + "loss": 2.69, + "step": 45319 + }, + { + "epoch": 2.1099704355518307, + "grad_norm": 0.37608078802610573, + "learning_rate": 2.4495022890753927e-05, + "loss": 2.7436, + "step": 45320 + }, + { + "epoch": 2.110016993737924, + "grad_norm": 0.3221181512230766, + "learning_rate": 2.44926931022287e-05, + "loss": 2.6309, + "step": 45321 + }, + { + "epoch": 2.110063551924017, + "grad_norm": 0.3487302476211435, + "learning_rate": 2.4490363388564923e-05, + "loss": 2.7669, + "step": 45322 + }, + { + "epoch": 2.11011011011011, + "grad_norm": 0.3404614332493026, + "learning_rate": 2.448803374976938e-05, + "loss": 2.64, + "step": 45323 + }, + { + "epoch": 2.110156668296203, + "grad_norm": 0.3603283485584449, + "learning_rate": 2.448570418584898e-05, + "loss": 2.7014, + "step": 45324 + }, + { + "epoch": 2.1102032264822963, + "grad_norm": 0.3436034570817972, + "learning_rate": 2.44833746968105e-05, + "loss": 2.6654, + "step": 45325 + }, + { + "epoch": 2.1102497846683894, + "grad_norm": 0.34259308844320496, + "learning_rate": 2.4481045282660815e-05, + "loss": 2.6566, + "step": 45326 + }, + { + "epoch": 2.1102963428544825, + "grad_norm": 
0.3381367591135624, + "learning_rate": 2.4478715943406737e-05, + "loss": 2.6427, + "step": 45327 + }, + { + "epoch": 2.1103429010405756, + "grad_norm": 0.3425042863091611, + "learning_rate": 2.4476386679055136e-05, + "loss": 2.7218, + "step": 45328 + }, + { + "epoch": 2.1103894592266683, + "grad_norm": 0.3198508092701782, + "learning_rate": 2.4474057489612788e-05, + "loss": 2.5974, + "step": 45329 + }, + { + "epoch": 2.1104360174127614, + "grad_norm": 0.3290436591192324, + "learning_rate": 2.4471728375086604e-05, + "loss": 2.6297, + "step": 45330 + }, + { + "epoch": 2.1104825755988545, + "grad_norm": 0.34830733051230384, + "learning_rate": 2.4469399335483344e-05, + "loss": 2.5947, + "step": 45331 + }, + { + "epoch": 2.1105291337849477, + "grad_norm": 0.31797289518687893, + "learning_rate": 2.446707037080992e-05, + "loss": 2.6525, + "step": 45332 + }, + { + "epoch": 2.110575691971041, + "grad_norm": 0.34810637062060135, + "learning_rate": 2.446474148107311e-05, + "loss": 2.721, + "step": 45333 + }, + { + "epoch": 2.110622250157134, + "grad_norm": 0.3561662296379779, + "learning_rate": 2.4462412666279766e-05, + "loss": 2.5838, + "step": 45334 + }, + { + "epoch": 2.110668808343227, + "grad_norm": 0.32516762886598466, + "learning_rate": 2.446008392643675e-05, + "loss": 2.6815, + "step": 45335 + }, + { + "epoch": 2.11071536652932, + "grad_norm": 0.3396684442986677, + "learning_rate": 2.445775526155083e-05, + "loss": 2.5995, + "step": 45336 + }, + { + "epoch": 2.1107619247154132, + "grad_norm": 0.36943302701949476, + "learning_rate": 2.4455426671628918e-05, + "loss": 2.6418, + "step": 45337 + }, + { + "epoch": 2.1108084829015064, + "grad_norm": 0.35718838430476524, + "learning_rate": 2.4453098156677794e-05, + "loss": 2.6183, + "step": 45338 + }, + { + "epoch": 2.110855041087599, + "grad_norm": 0.3556914996414109, + "learning_rate": 2.4450769716704313e-05, + "loss": 2.6713, + "step": 45339 + }, + { + "epoch": 2.110901599273692, + "grad_norm": 0.3832820688035906, + "learning_rate": 2.44484413517153e-05, + "loss": 2.6837, + "step": 45340 + }, + { + "epoch": 2.1109481574597853, + "grad_norm": 0.34849111157788015, + "learning_rate": 2.44461130617176e-05, + "loss": 2.5907, + "step": 45341 + }, + { + "epoch": 2.1109947156458784, + "grad_norm": 0.35700868954890896, + "learning_rate": 2.444378484671804e-05, + "loss": 2.6072, + "step": 45342 + }, + { + "epoch": 2.1110412738319715, + "grad_norm": 0.359781977783116, + "learning_rate": 2.444145670672347e-05, + "loss": 2.6451, + "step": 45343 + }, + { + "epoch": 2.1110878320180646, + "grad_norm": 0.36052464095660847, + "learning_rate": 2.4439128641740673e-05, + "loss": 2.6057, + "step": 45344 + }, + { + "epoch": 2.1111343902041577, + "grad_norm": 0.36466416674047397, + "learning_rate": 2.4436800651776555e-05, + "loss": 2.795, + "step": 45345 + }, + { + "epoch": 2.111180948390251, + "grad_norm": 0.37369529530839596, + "learning_rate": 2.4434472736837882e-05, + "loss": 2.662, + "step": 45346 + }, + { + "epoch": 2.111227506576344, + "grad_norm": 0.3412070316139905, + "learning_rate": 2.4432144896931512e-05, + "loss": 2.574, + "step": 45347 + }, + { + "epoch": 2.111274064762437, + "grad_norm": 0.36489807344172037, + "learning_rate": 2.4429817132064303e-05, + "loss": 2.7154, + "step": 45348 + }, + { + "epoch": 2.1113206229485297, + "grad_norm": 0.35072146669542553, + "learning_rate": 2.4427489442243023e-05, + "loss": 2.674, + "step": 45349 + }, + { + "epoch": 2.111367181134623, + "grad_norm": 0.3221789962640684, + "learning_rate": 2.4425161827474578e-05, + "loss": 
2.6123, + "step": 45350 + }, + { + "epoch": 2.111413739320716, + "grad_norm": 0.34577843088643545, + "learning_rate": 2.4422834287765745e-05, + "loss": 2.6526, + "step": 45351 + }, + { + "epoch": 2.111460297506809, + "grad_norm": 0.37140114676720193, + "learning_rate": 2.4420506823123373e-05, + "loss": 2.7137, + "step": 45352 + }, + { + "epoch": 2.111506855692902, + "grad_norm": 0.3753286370409817, + "learning_rate": 2.44181794335543e-05, + "loss": 2.712, + "step": 45353 + }, + { + "epoch": 2.1115534138789953, + "grad_norm": 0.33100745227216466, + "learning_rate": 2.441585211906534e-05, + "loss": 2.6718, + "step": 45354 + }, + { + "epoch": 2.1115999720650884, + "grad_norm": 0.3424934126946811, + "learning_rate": 2.441352487966334e-05, + "loss": 2.6164, + "step": 45355 + }, + { + "epoch": 2.1116465302511815, + "grad_norm": 0.35531064956453473, + "learning_rate": 2.4411197715355143e-05, + "loss": 2.6811, + "step": 45356 + }, + { + "epoch": 2.1116930884372747, + "grad_norm": 0.3201572183887127, + "learning_rate": 2.4408870626147518e-05, + "loss": 2.7107, + "step": 45357 + }, + { + "epoch": 2.1117396466233673, + "grad_norm": 0.34464323851984613, + "learning_rate": 2.4406543612047378e-05, + "loss": 2.644, + "step": 45358 + }, + { + "epoch": 2.1117862048094604, + "grad_norm": 0.352720653827917, + "learning_rate": 2.440421667306149e-05, + "loss": 2.5609, + "step": 45359 + }, + { + "epoch": 2.1118327629955536, + "grad_norm": 0.34312643901357315, + "learning_rate": 2.4401889809196705e-05, + "loss": 2.6735, + "step": 45360 + }, + { + "epoch": 2.1118793211816467, + "grad_norm": 0.3378926820559865, + "learning_rate": 2.439956302045987e-05, + "loss": 2.7056, + "step": 45361 + }, + { + "epoch": 2.11192587936774, + "grad_norm": 0.34149232162046045, + "learning_rate": 2.4397236306857756e-05, + "loss": 2.6634, + "step": 45362 + }, + { + "epoch": 2.111972437553833, + "grad_norm": 0.3429878240754457, + "learning_rate": 2.4394909668397277e-05, + "loss": 2.5916, + "step": 45363 + }, + { + "epoch": 2.112018995739926, + "grad_norm": 0.33881488655188524, + "learning_rate": 2.439258310508519e-05, + "loss": 2.6057, + "step": 45364 + }, + { + "epoch": 2.112065553926019, + "grad_norm": 0.3563553791349226, + "learning_rate": 2.439025661692835e-05, + "loss": 2.6665, + "step": 45365 + }, + { + "epoch": 2.1121121121121122, + "grad_norm": 0.3445339606572014, + "learning_rate": 2.438793020393358e-05, + "loss": 2.5404, + "step": 45366 + }, + { + "epoch": 2.1121586702982054, + "grad_norm": 0.3436537722225279, + "learning_rate": 2.4385603866107713e-05, + "loss": 2.7606, + "step": 45367 + }, + { + "epoch": 2.1122052284842985, + "grad_norm": 0.346443666343445, + "learning_rate": 2.4383277603457576e-05, + "loss": 2.7334, + "step": 45368 + }, + { + "epoch": 2.112251786670391, + "grad_norm": 0.3249002205989869, + "learning_rate": 2.4380951415990022e-05, + "loss": 2.6398, + "step": 45369 + }, + { + "epoch": 2.1122983448564843, + "grad_norm": 0.3150442903676375, + "learning_rate": 2.4378625303711804e-05, + "loss": 2.6074, + "step": 45370 + }, + { + "epoch": 2.1123449030425774, + "grad_norm": 0.3370900311457334, + "learning_rate": 2.4376299266629837e-05, + "loss": 2.6529, + "step": 45371 + }, + { + "epoch": 2.1123914612286705, + "grad_norm": 0.3440310057304695, + "learning_rate": 2.437397330475089e-05, + "loss": 2.7451, + "step": 45372 + }, + { + "epoch": 2.1124380194147636, + "grad_norm": 0.3318588029686595, + "learning_rate": 2.4371647418081806e-05, + "loss": 2.6295, + "step": 45373 + }, + { + "epoch": 2.1124845776008567, + 
"grad_norm": 0.3328972099310088, + "learning_rate": 2.4369321606629437e-05, + "loss": 2.6308, + "step": 45374 + }, + { + "epoch": 2.11253113578695, + "grad_norm": 0.34305791676472086, + "learning_rate": 2.436699587040054e-05, + "loss": 2.694, + "step": 45375 + }, + { + "epoch": 2.112577693973043, + "grad_norm": 0.32542319288528976, + "learning_rate": 2.436467020940202e-05, + "loss": 2.5361, + "step": 45376 + }, + { + "epoch": 2.112624252159136, + "grad_norm": 0.35502696238352066, + "learning_rate": 2.436234462364063e-05, + "loss": 2.5743, + "step": 45377 + }, + { + "epoch": 2.1126708103452287, + "grad_norm": 0.3319283544265989, + "learning_rate": 2.436001911312328e-05, + "loss": 2.6998, + "step": 45378 + }, + { + "epoch": 2.112717368531322, + "grad_norm": 0.33668149644046164, + "learning_rate": 2.435769367785672e-05, + "loss": 2.6064, + "step": 45379 + }, + { + "epoch": 2.112763926717415, + "grad_norm": 0.3514702515772261, + "learning_rate": 2.4355368317847805e-05, + "loss": 2.6305, + "step": 45380 + }, + { + "epoch": 2.112810484903508, + "grad_norm": 0.33947871140486735, + "learning_rate": 2.4353043033103355e-05, + "loss": 2.6807, + "step": 45381 + }, + { + "epoch": 2.112857043089601, + "grad_norm": 0.34133165807287524, + "learning_rate": 2.435071782363022e-05, + "loss": 2.5132, + "step": 45382 + }, + { + "epoch": 2.1129036012756943, + "grad_norm": 0.3144834508058391, + "learning_rate": 2.4348392689435163e-05, + "loss": 2.5267, + "step": 45383 + }, + { + "epoch": 2.1129501594617874, + "grad_norm": 0.3331776308162815, + "learning_rate": 2.4346067630525082e-05, + "loss": 2.6556, + "step": 45384 + }, + { + "epoch": 2.1129967176478806, + "grad_norm": 0.35567488909517664, + "learning_rate": 2.4343742646906747e-05, + "loss": 2.627, + "step": 45385 + }, + { + "epoch": 2.1130432758339737, + "grad_norm": 0.3444561555348423, + "learning_rate": 2.4341417738587e-05, + "loss": 2.6577, + "step": 45386 + }, + { + "epoch": 2.113089834020067, + "grad_norm": 0.33850297223456266, + "learning_rate": 2.4339092905572685e-05, + "loss": 2.7544, + "step": 45387 + }, + { + "epoch": 2.1131363922061595, + "grad_norm": 0.3777438539234038, + "learning_rate": 2.4336768147870563e-05, + "loss": 2.6683, + "step": 45388 + }, + { + "epoch": 2.1131829503922526, + "grad_norm": 0.34808114634336257, + "learning_rate": 2.433444346548754e-05, + "loss": 2.6412, + "step": 45389 + }, + { + "epoch": 2.1132295085783457, + "grad_norm": 0.35171638111629006, + "learning_rate": 2.4332118858430358e-05, + "loss": 2.6401, + "step": 45390 + }, + { + "epoch": 2.113276066764439, + "grad_norm": 0.3415048840288674, + "learning_rate": 2.432979432670592e-05, + "loss": 2.6975, + "step": 45391 + }, + { + "epoch": 2.113322624950532, + "grad_norm": 0.33586345002409096, + "learning_rate": 2.432746987032098e-05, + "loss": 2.652, + "step": 45392 + }, + { + "epoch": 2.113369183136625, + "grad_norm": 0.35904166558419304, + "learning_rate": 2.4325145489282387e-05, + "loss": 2.628, + "step": 45393 + }, + { + "epoch": 2.113415741322718, + "grad_norm": 0.3504039173195591, + "learning_rate": 2.4322821183596967e-05, + "loss": 2.7166, + "step": 45394 + }, + { + "epoch": 2.1134622995088113, + "grad_norm": 0.32894565427978806, + "learning_rate": 2.4320496953271537e-05, + "loss": 2.6411, + "step": 45395 + }, + { + "epoch": 2.1135088576949044, + "grad_norm": 0.3332175485742771, + "learning_rate": 2.4318172798312917e-05, + "loss": 2.6714, + "step": 45396 + }, + { + "epoch": 2.1135554158809975, + "grad_norm": 0.34168206328816925, + "learning_rate": 
2.431584871872795e-05, + "loss": 2.6687, + "step": 45397 + }, + { + "epoch": 2.11360197406709, + "grad_norm": 0.3702777658809571, + "learning_rate": 2.4313524714523417e-05, + "loss": 2.6353, + "step": 45398 + }, + { + "epoch": 2.1136485322531833, + "grad_norm": 0.3572789083989032, + "learning_rate": 2.4311200785706158e-05, + "loss": 2.6997, + "step": 45399 + }, + { + "epoch": 2.1136950904392764, + "grad_norm": 0.3456078412495188, + "learning_rate": 2.4308876932283013e-05, + "loss": 2.6494, + "step": 45400 + }, + { + "epoch": 2.1137416486253695, + "grad_norm": 0.3589164882169927, + "learning_rate": 2.4306553154260746e-05, + "loss": 2.6988, + "step": 45401 + }, + { + "epoch": 2.1137882068114626, + "grad_norm": 0.3496183433422459, + "learning_rate": 2.4304229451646248e-05, + "loss": 2.5831, + "step": 45402 + }, + { + "epoch": 2.1138347649975557, + "grad_norm": 0.337311596896086, + "learning_rate": 2.4301905824446274e-05, + "loss": 2.6944, + "step": 45403 + }, + { + "epoch": 2.113881323183649, + "grad_norm": 0.3516355031690022, + "learning_rate": 2.429958227266771e-05, + "loss": 2.6979, + "step": 45404 + }, + { + "epoch": 2.113927881369742, + "grad_norm": 0.33502534892638447, + "learning_rate": 2.4297258796317317e-05, + "loss": 2.6438, + "step": 45405 + }, + { + "epoch": 2.113974439555835, + "grad_norm": 0.3181940831432325, + "learning_rate": 2.429493539540194e-05, + "loss": 2.5027, + "step": 45406 + }, + { + "epoch": 2.114020997741928, + "grad_norm": 0.3508694163379411, + "learning_rate": 2.42926120699284e-05, + "loss": 2.5966, + "step": 45407 + }, + { + "epoch": 2.114067555928021, + "grad_norm": 0.3478852707427115, + "learning_rate": 2.4290288819903506e-05, + "loss": 2.5898, + "step": 45408 + }, + { + "epoch": 2.114114114114114, + "grad_norm": 0.3736392776229391, + "learning_rate": 2.4287965645334083e-05, + "loss": 2.6842, + "step": 45409 + }, + { + "epoch": 2.114160672300207, + "grad_norm": 0.32151679352777196, + "learning_rate": 2.4285642546226965e-05, + "loss": 2.6596, + "step": 45410 + }, + { + "epoch": 2.1142072304863, + "grad_norm": 0.3293725136892697, + "learning_rate": 2.4283319522588937e-05, + "loss": 2.6962, + "step": 45411 + }, + { + "epoch": 2.1142537886723933, + "grad_norm": 0.3276510924935806, + "learning_rate": 2.4280996574426827e-05, + "loss": 2.6527, + "step": 45412 + }, + { + "epoch": 2.1143003468584864, + "grad_norm": 0.3347250888102755, + "learning_rate": 2.427867370174747e-05, + "loss": 2.6463, + "step": 45413 + }, + { + "epoch": 2.1143469050445796, + "grad_norm": 0.32383689755721123, + "learning_rate": 2.427635090455766e-05, + "loss": 2.652, + "step": 45414 + }, + { + "epoch": 2.1143934632306727, + "grad_norm": 0.3210359137019672, + "learning_rate": 2.427402818286425e-05, + "loss": 2.6219, + "step": 45415 + }, + { + "epoch": 2.114440021416766, + "grad_norm": 0.33179955681090834, + "learning_rate": 2.4271705536673994e-05, + "loss": 2.711, + "step": 45416 + }, + { + "epoch": 2.1144865796028585, + "grad_norm": 0.3588620123536301, + "learning_rate": 2.4269382965993787e-05, + "loss": 2.6418, + "step": 45417 + }, + { + "epoch": 2.1145331377889516, + "grad_norm": 0.3320089327094801, + "learning_rate": 2.4267060470830366e-05, + "loss": 2.7264, + "step": 45418 + }, + { + "epoch": 2.1145796959750447, + "grad_norm": 0.34996904802279777, + "learning_rate": 2.426473805119062e-05, + "loss": 2.6233, + "step": 45419 + }, + { + "epoch": 2.114626254161138, + "grad_norm": 0.3503142928012027, + "learning_rate": 2.426241570708132e-05, + "loss": 2.6753, + "step": 45420 + }, + { + "epoch": 
2.114672812347231, + "grad_norm": 0.3151891830725461, + "learning_rate": 2.4260093438509295e-05, + "loss": 2.6006, + "step": 45421 + }, + { + "epoch": 2.114719370533324, + "grad_norm": 0.3645637560303957, + "learning_rate": 2.425777124548136e-05, + "loss": 2.6876, + "step": 45422 + }, + { + "epoch": 2.114765928719417, + "grad_norm": 0.3463839189335114, + "learning_rate": 2.425544912800435e-05, + "loss": 2.6005, + "step": 45423 + }, + { + "epoch": 2.1148124869055103, + "grad_norm": 0.33753108034521395, + "learning_rate": 2.4253127086085036e-05, + "loss": 2.6694, + "step": 45424 + }, + { + "epoch": 2.1148590450916034, + "grad_norm": 0.3424862968257953, + "learning_rate": 2.4250805119730264e-05, + "loss": 2.635, + "step": 45425 + }, + { + "epoch": 2.1149056032776965, + "grad_norm": 0.33251438728175015, + "learning_rate": 2.424848322894684e-05, + "loss": 2.7178, + "step": 45426 + }, + { + "epoch": 2.114952161463789, + "grad_norm": 0.33024801874177356, + "learning_rate": 2.4246161413741576e-05, + "loss": 2.7773, + "step": 45427 + }, + { + "epoch": 2.1149987196498823, + "grad_norm": 0.36229483416916997, + "learning_rate": 2.4243839674121315e-05, + "loss": 2.5499, + "step": 45428 + }, + { + "epoch": 2.1150452778359754, + "grad_norm": 0.3172305246286248, + "learning_rate": 2.4241518010092805e-05, + "loss": 2.5552, + "step": 45429 + }, + { + "epoch": 2.1150918360220685, + "grad_norm": 0.33067916583303636, + "learning_rate": 2.4239196421662948e-05, + "loss": 2.6448, + "step": 45430 + }, + { + "epoch": 2.1151383942081616, + "grad_norm": 0.3421477232462366, + "learning_rate": 2.423687490883847e-05, + "loss": 2.7236, + "step": 45431 + }, + { + "epoch": 2.1151849523942547, + "grad_norm": 0.31309229627474366, + "learning_rate": 2.423455347162626e-05, + "loss": 2.6524, + "step": 45432 + }, + { + "epoch": 2.115231510580348, + "grad_norm": 0.3434720445777553, + "learning_rate": 2.4232232110033082e-05, + "loss": 2.7307, + "step": 45433 + }, + { + "epoch": 2.115278068766441, + "grad_norm": 0.33766295441007377, + "learning_rate": 2.4229910824065765e-05, + "loss": 2.677, + "step": 45434 + }, + { + "epoch": 2.115324626952534, + "grad_norm": 0.3163307793969828, + "learning_rate": 2.4227589613731117e-05, + "loss": 2.6897, + "step": 45435 + }, + { + "epoch": 2.115371185138627, + "grad_norm": 0.31488743995538176, + "learning_rate": 2.4225268479035955e-05, + "loss": 2.6477, + "step": 45436 + }, + { + "epoch": 2.11541774332472, + "grad_norm": 0.3093143721548584, + "learning_rate": 2.4222947419987114e-05, + "loss": 2.6933, + "step": 45437 + }, + { + "epoch": 2.115464301510813, + "grad_norm": 0.35147293957165804, + "learning_rate": 2.422062643659136e-05, + "loss": 2.5578, + "step": 45438 + }, + { + "epoch": 2.115510859696906, + "grad_norm": 0.3493005897230988, + "learning_rate": 2.4218305528855533e-05, + "loss": 2.7196, + "step": 45439 + }, + { + "epoch": 2.1155574178829992, + "grad_norm": 0.32953187524161986, + "learning_rate": 2.4215984696786432e-05, + "loss": 2.5519, + "step": 45440 + }, + { + "epoch": 2.1156039760690923, + "grad_norm": 0.3356023433972888, + "learning_rate": 2.4213663940390903e-05, + "loss": 2.6393, + "step": 45441 + }, + { + "epoch": 2.1156505342551855, + "grad_norm": 0.34796168136934025, + "learning_rate": 2.4211343259675684e-05, + "loss": 2.6916, + "step": 45442 + }, + { + "epoch": 2.1156970924412786, + "grad_norm": 0.3388546978226409, + "learning_rate": 2.4209022654647674e-05, + "loss": 2.6202, + "step": 45443 + }, + { + "epoch": 2.1157436506273717, + "grad_norm": 0.32389315163978216, + 
"learning_rate": 2.42067021253136e-05, + "loss": 2.5756, + "step": 45444 + }, + { + "epoch": 2.115790208813465, + "grad_norm": 0.33359976805695607, + "learning_rate": 2.420438167168036e-05, + "loss": 2.5748, + "step": 45445 + }, + { + "epoch": 2.115836766999558, + "grad_norm": 0.3343790256761241, + "learning_rate": 2.4202061293754692e-05, + "loss": 2.6365, + "step": 45446 + }, + { + "epoch": 2.1158833251856506, + "grad_norm": 0.32362179124667, + "learning_rate": 2.4199740991543433e-05, + "loss": 2.7202, + "step": 45447 + }, + { + "epoch": 2.1159298833717437, + "grad_norm": 0.3444803881963142, + "learning_rate": 2.4197420765053397e-05, + "loss": 2.6978, + "step": 45448 + }, + { + "epoch": 2.115976441557837, + "grad_norm": 0.3300284689225022, + "learning_rate": 2.419510061429139e-05, + "loss": 2.5618, + "step": 45449 + }, + { + "epoch": 2.11602299974393, + "grad_norm": 0.36606935520882744, + "learning_rate": 2.4192780539264232e-05, + "loss": 2.71, + "step": 45450 + }, + { + "epoch": 2.116069557930023, + "grad_norm": 0.3510048656342544, + "learning_rate": 2.4190460539978705e-05, + "loss": 2.6991, + "step": 45451 + }, + { + "epoch": 2.116116116116116, + "grad_norm": 0.3384100822330212, + "learning_rate": 2.4188140616441633e-05, + "loss": 2.648, + "step": 45452 + }, + { + "epoch": 2.1161626743022093, + "grad_norm": 0.3377647945998462, + "learning_rate": 2.4185820768659827e-05, + "loss": 2.5942, + "step": 45453 + }, + { + "epoch": 2.1162092324883024, + "grad_norm": 0.36194672899524655, + "learning_rate": 2.4183500996640097e-05, + "loss": 2.7014, + "step": 45454 + }, + { + "epoch": 2.1162557906743955, + "grad_norm": 0.36275392977579846, + "learning_rate": 2.4181181300389243e-05, + "loss": 2.6704, + "step": 45455 + }, + { + "epoch": 2.116302348860488, + "grad_norm": 0.33028327614483627, + "learning_rate": 2.41788616799141e-05, + "loss": 2.5906, + "step": 45456 + }, + { + "epoch": 2.1163489070465813, + "grad_norm": 0.32904029532733486, + "learning_rate": 2.417654213522142e-05, + "loss": 2.702, + "step": 45457 + }, + { + "epoch": 2.1163954652326744, + "grad_norm": 0.35585019619930314, + "learning_rate": 2.4174222666318075e-05, + "loss": 2.7217, + "step": 45458 + }, + { + "epoch": 2.1164420234187675, + "grad_norm": 0.33244976479414956, + "learning_rate": 2.417190327321083e-05, + "loss": 2.6718, + "step": 45459 + }, + { + "epoch": 2.1164885816048606, + "grad_norm": 0.34171113017776733, + "learning_rate": 2.41695839559065e-05, + "loss": 2.6235, + "step": 45460 + }, + { + "epoch": 2.1165351397909538, + "grad_norm": 0.3620401182080814, + "learning_rate": 2.4167264714411897e-05, + "loss": 2.7176, + "step": 45461 + }, + { + "epoch": 2.116581697977047, + "grad_norm": 0.3408688885936453, + "learning_rate": 2.4164945548733825e-05, + "loss": 2.5643, + "step": 45462 + }, + { + "epoch": 2.11662825616314, + "grad_norm": 0.34092561698651586, + "learning_rate": 2.4162626458879116e-05, + "loss": 2.7277, + "step": 45463 + }, + { + "epoch": 2.116674814349233, + "grad_norm": 0.35338248923431215, + "learning_rate": 2.416030744485453e-05, + "loss": 2.7117, + "step": 45464 + }, + { + "epoch": 2.116721372535326, + "grad_norm": 0.37049782339607673, + "learning_rate": 2.41579885066669e-05, + "loss": 2.6766, + "step": 45465 + }, + { + "epoch": 2.116767930721419, + "grad_norm": 0.32346236699241343, + "learning_rate": 2.4155669644323026e-05, + "loss": 2.6387, + "step": 45466 + }, + { + "epoch": 2.116814488907512, + "grad_norm": 0.35145271943865636, + "learning_rate": 2.4153350857829715e-05, + "loss": 2.5273, + "step": 45467 + 
}, + { + "epoch": 2.116861047093605, + "grad_norm": 0.3639705743350712, + "learning_rate": 2.4151032147193774e-05, + "loss": 2.588, + "step": 45468 + }, + { + "epoch": 2.1169076052796982, + "grad_norm": 0.3353551164305789, + "learning_rate": 2.414871351242202e-05, + "loss": 2.607, + "step": 45469 + }, + { + "epoch": 2.1169541634657913, + "grad_norm": 0.34767586922811905, + "learning_rate": 2.4146394953521218e-05, + "loss": 2.6493, + "step": 45470 + }, + { + "epoch": 2.1170007216518845, + "grad_norm": 0.3535240279603734, + "learning_rate": 2.4144076470498233e-05, + "loss": 2.6693, + "step": 45471 + }, + { + "epoch": 2.1170472798379776, + "grad_norm": 0.3593691110761948, + "learning_rate": 2.4141758063359814e-05, + "loss": 2.641, + "step": 45472 + }, + { + "epoch": 2.1170938380240707, + "grad_norm": 0.36011912539542806, + "learning_rate": 2.413943973211279e-05, + "loss": 2.6108, + "step": 45473 + }, + { + "epoch": 2.117140396210164, + "grad_norm": 0.36780185630617707, + "learning_rate": 2.4137121476763964e-05, + "loss": 2.6802, + "step": 45474 + }, + { + "epoch": 2.117186954396257, + "grad_norm": 0.37028153523964585, + "learning_rate": 2.4134803297320135e-05, + "loss": 2.5776, + "step": 45475 + }, + { + "epoch": 2.1172335125823496, + "grad_norm": 0.35735511521140323, + "learning_rate": 2.4132485193788133e-05, + "loss": 2.6274, + "step": 45476 + }, + { + "epoch": 2.1172800707684427, + "grad_norm": 0.3446765290348752, + "learning_rate": 2.4130167166174723e-05, + "loss": 2.583, + "step": 45477 + }, + { + "epoch": 2.117326628954536, + "grad_norm": 0.35468040016572855, + "learning_rate": 2.4127849214486713e-05, + "loss": 2.6936, + "step": 45478 + }, + { + "epoch": 2.117373187140629, + "grad_norm": 0.3423844787896586, + "learning_rate": 2.4125531338730928e-05, + "loss": 2.598, + "step": 45479 + }, + { + "epoch": 2.117419745326722, + "grad_norm": 0.3407715154766071, + "learning_rate": 2.4123213538914154e-05, + "loss": 2.7361, + "step": 45480 + }, + { + "epoch": 2.117466303512815, + "grad_norm": 0.36479512653658697, + "learning_rate": 2.4120895815043197e-05, + "loss": 2.647, + "step": 45481 + }, + { + "epoch": 2.1175128616989083, + "grad_norm": 0.3816113326446019, + "learning_rate": 2.4118578167124882e-05, + "loss": 2.6667, + "step": 45482 + }, + { + "epoch": 2.1175594198850014, + "grad_norm": 0.3469700330789634, + "learning_rate": 2.411626059516595e-05, + "loss": 2.6573, + "step": 45483 + }, + { + "epoch": 2.1176059780710945, + "grad_norm": 0.32017025162450746, + "learning_rate": 2.4113943099173285e-05, + "loss": 2.5876, + "step": 45484 + }, + { + "epoch": 2.1176525362571876, + "grad_norm": 0.38195672258636976, + "learning_rate": 2.4111625679153626e-05, + "loss": 2.6897, + "step": 45485 + }, + { + "epoch": 2.1176990944432803, + "grad_norm": 0.36016299203524843, + "learning_rate": 2.41093083351138e-05, + "loss": 2.6884, + "step": 45486 + }, + { + "epoch": 2.1177456526293734, + "grad_norm": 0.3344260691337649, + "learning_rate": 2.4106991067060598e-05, + "loss": 2.5897, + "step": 45487 + }, + { + "epoch": 2.1177922108154665, + "grad_norm": 0.349461496417034, + "learning_rate": 2.4104673875000826e-05, + "loss": 2.6766, + "step": 45488 + }, + { + "epoch": 2.1178387690015597, + "grad_norm": 0.3384729765906971, + "learning_rate": 2.410235675894131e-05, + "loss": 2.6765, + "step": 45489 + }, + { + "epoch": 2.1178853271876528, + "grad_norm": 0.3438980426848568, + "learning_rate": 2.4100039718888778e-05, + "loss": 2.6461, + "step": 45490 + }, + { + "epoch": 2.117931885373746, + "grad_norm": 
0.3606757128630465, + "learning_rate": 2.4097722754850123e-05, + "loss": 2.6089, + "step": 45491 + }, + { + "epoch": 2.117978443559839, + "grad_norm": 0.3476804426052555, + "learning_rate": 2.409540586683208e-05, + "loss": 2.7091, + "step": 45492 + }, + { + "epoch": 2.118025001745932, + "grad_norm": 0.36024656523301335, + "learning_rate": 2.4093089054841468e-05, + "loss": 2.5915, + "step": 45493 + }, + { + "epoch": 2.1180715599320252, + "grad_norm": 0.3564782080770624, + "learning_rate": 2.4090772318885085e-05, + "loss": 2.6664, + "step": 45494 + }, + { + "epoch": 2.118118118118118, + "grad_norm": 0.3685398832967492, + "learning_rate": 2.4088455658969754e-05, + "loss": 2.6746, + "step": 45495 + }, + { + "epoch": 2.118164676304211, + "grad_norm": 0.3305381763627379, + "learning_rate": 2.4086139075102217e-05, + "loss": 2.622, + "step": 45496 + }, + { + "epoch": 2.118211234490304, + "grad_norm": 0.3524737451888983, + "learning_rate": 2.4083822567289344e-05, + "loss": 2.7028, + "step": 45497 + }, + { + "epoch": 2.1182577926763972, + "grad_norm": 0.36588570424836253, + "learning_rate": 2.4081506135537878e-05, + "loss": 2.6678, + "step": 45498 + }, + { + "epoch": 2.1183043508624904, + "grad_norm": 0.3471314548301177, + "learning_rate": 2.4079189779854644e-05, + "loss": 2.598, + "step": 45499 + }, + { + "epoch": 2.1183509090485835, + "grad_norm": 0.3310996746918236, + "learning_rate": 2.4076873500246433e-05, + "loss": 2.6283, + "step": 45500 + }, + { + "epoch": 2.1183974672346766, + "grad_norm": 0.34610490105237657, + "learning_rate": 2.407455729672004e-05, + "loss": 2.6679, + "step": 45501 + }, + { + "epoch": 2.1184440254207697, + "grad_norm": 0.3488551999295127, + "learning_rate": 2.4072241169282288e-05, + "loss": 2.5801, + "step": 45502 + }, + { + "epoch": 2.118490583606863, + "grad_norm": 0.3434019049209793, + "learning_rate": 2.406992511793992e-05, + "loss": 2.7286, + "step": 45503 + }, + { + "epoch": 2.118537141792956, + "grad_norm": 0.34365057955968314, + "learning_rate": 2.4067609142699798e-05, + "loss": 2.6465, + "step": 45504 + }, + { + "epoch": 2.1185836999790486, + "grad_norm": 0.3531605537378861, + "learning_rate": 2.4065293243568677e-05, + "loss": 2.7192, + "step": 45505 + }, + { + "epoch": 2.1186302581651417, + "grad_norm": 0.35303908441943566, + "learning_rate": 2.4062977420553356e-05, + "loss": 2.6491, + "step": 45506 + }, + { + "epoch": 2.118676816351235, + "grad_norm": 0.341521861092162, + "learning_rate": 2.406066167366065e-05, + "loss": 2.5959, + "step": 45507 + }, + { + "epoch": 2.118723374537328, + "grad_norm": 0.33122821699418126, + "learning_rate": 2.4058346002897343e-05, + "loss": 2.6085, + "step": 45508 + }, + { + "epoch": 2.118769932723421, + "grad_norm": 0.3452848725153947, + "learning_rate": 2.405603040827023e-05, + "loss": 2.622, + "step": 45509 + }, + { + "epoch": 2.118816490909514, + "grad_norm": 0.33724937611190153, + "learning_rate": 2.4053714889786137e-05, + "loss": 2.5633, + "step": 45510 + }, + { + "epoch": 2.1188630490956073, + "grad_norm": 0.3491598988641301, + "learning_rate": 2.4051399447451818e-05, + "loss": 2.7215, + "step": 45511 + }, + { + "epoch": 2.1189096072817004, + "grad_norm": 0.35832112999551524, + "learning_rate": 2.404908408127408e-05, + "loss": 2.6689, + "step": 45512 + }, + { + "epoch": 2.1189561654677935, + "grad_norm": 0.3334694175364569, + "learning_rate": 2.4046768791259728e-05, + "loss": 2.6285, + "step": 45513 + }, + { + "epoch": 2.1190027236538866, + "grad_norm": 0.3822061260234871, + "learning_rate": 2.4044453577415548e-05, + 
"loss": 2.7246, + "step": 45514 + }, + { + "epoch": 2.1190492818399793, + "grad_norm": 0.37521224460387476, + "learning_rate": 2.4042138439748358e-05, + "loss": 2.7991, + "step": 45515 + }, + { + "epoch": 2.1190958400260724, + "grad_norm": 0.3647224073169419, + "learning_rate": 2.4039823378264903e-05, + "loss": 2.6961, + "step": 45516 + }, + { + "epoch": 2.1191423982121655, + "grad_norm": 0.382945653426996, + "learning_rate": 2.4037508392972042e-05, + "loss": 2.7704, + "step": 45517 + }, + { + "epoch": 2.1191889563982587, + "grad_norm": 0.3328646437219495, + "learning_rate": 2.4035193483876517e-05, + "loss": 2.6871, + "step": 45518 + }, + { + "epoch": 2.1192355145843518, + "grad_norm": 0.38534116971435695, + "learning_rate": 2.403287865098514e-05, + "loss": 2.7611, + "step": 45519 + }, + { + "epoch": 2.119282072770445, + "grad_norm": 0.36417509093981404, + "learning_rate": 2.403056389430471e-05, + "loss": 2.7328, + "step": 45520 + }, + { + "epoch": 2.119328630956538, + "grad_norm": 0.32186288218940934, + "learning_rate": 2.4028249213842015e-05, + "loss": 2.7033, + "step": 45521 + }, + { + "epoch": 2.119375189142631, + "grad_norm": 0.3444846678348511, + "learning_rate": 2.4025934609603852e-05, + "loss": 2.7351, + "step": 45522 + }, + { + "epoch": 2.1194217473287242, + "grad_norm": 0.34482489000369987, + "learning_rate": 2.4023620081597025e-05, + "loss": 2.6906, + "step": 45523 + }, + { + "epoch": 2.1194683055148174, + "grad_norm": 0.3263300776687265, + "learning_rate": 2.40213056298283e-05, + "loss": 2.5341, + "step": 45524 + }, + { + "epoch": 2.11951486370091, + "grad_norm": 0.35837213639525894, + "learning_rate": 2.401899125430448e-05, + "loss": 2.6422, + "step": 45525 + }, + { + "epoch": 2.119561421887003, + "grad_norm": 0.39089978082301635, + "learning_rate": 2.4016676955032363e-05, + "loss": 2.6673, + "step": 45526 + }, + { + "epoch": 2.1196079800730963, + "grad_norm": 0.31848814859215, + "learning_rate": 2.4014362732018742e-05, + "loss": 2.5576, + "step": 45527 + }, + { + "epoch": 2.1196545382591894, + "grad_norm": 0.35322187440143676, + "learning_rate": 2.4012048585270423e-05, + "loss": 2.6756, + "step": 45528 + }, + { + "epoch": 2.1197010964452825, + "grad_norm": 0.3811874594765047, + "learning_rate": 2.400973451479414e-05, + "loss": 2.6568, + "step": 45529 + }, + { + "epoch": 2.1197476546313756, + "grad_norm": 0.30550532923295753, + "learning_rate": 2.4007420520596768e-05, + "loss": 2.5854, + "step": 45530 + }, + { + "epoch": 2.1197942128174687, + "grad_norm": 0.3298521868603529, + "learning_rate": 2.400510660268504e-05, + "loss": 2.6612, + "step": 45531 + }, + { + "epoch": 2.119840771003562, + "grad_norm": 0.38332721359728045, + "learning_rate": 2.4002792761065755e-05, + "loss": 2.6837, + "step": 45532 + }, + { + "epoch": 2.119887329189655, + "grad_norm": 0.3254062942847349, + "learning_rate": 2.4000478995745717e-05, + "loss": 2.5603, + "step": 45533 + }, + { + "epoch": 2.1199338873757476, + "grad_norm": 0.3292472928221606, + "learning_rate": 2.3998165306731712e-05, + "loss": 2.6957, + "step": 45534 + }, + { + "epoch": 2.1199804455618407, + "grad_norm": 0.3945335359751096, + "learning_rate": 2.399585169403053e-05, + "loss": 2.8567, + "step": 45535 + }, + { + "epoch": 2.120027003747934, + "grad_norm": 0.3410790979980799, + "learning_rate": 2.399353815764898e-05, + "loss": 2.5696, + "step": 45536 + }, + { + "epoch": 2.120073561934027, + "grad_norm": 0.35176671439080437, + "learning_rate": 2.399122469759382e-05, + "loss": 2.706, + "step": 45537 + }, + { + "epoch": 2.12012012012012, + 
"grad_norm": 0.3603257461325445, + "learning_rate": 2.3988911313871843e-05, + "loss": 2.7055, + "step": 45538 + }, + { + "epoch": 2.120166678306213, + "grad_norm": 0.33686836799490877, + "learning_rate": 2.398659800648985e-05, + "loss": 2.717, + "step": 45539 + }, + { + "epoch": 2.1202132364923063, + "grad_norm": 0.35655265823669585, + "learning_rate": 2.3984284775454634e-05, + "loss": 2.6708, + "step": 45540 + }, + { + "epoch": 2.1202597946783994, + "grad_norm": 0.3514464201901931, + "learning_rate": 2.3981971620772997e-05, + "loss": 2.6442, + "step": 45541 + }, + { + "epoch": 2.1203063528644925, + "grad_norm": 0.35282308004249596, + "learning_rate": 2.3979658542451672e-05, + "loss": 2.6572, + "step": 45542 + }, + { + "epoch": 2.1203529110505857, + "grad_norm": 0.34799018726316056, + "learning_rate": 2.3977345540497526e-05, + "loss": 2.618, + "step": 45543 + }, + { + "epoch": 2.1203994692366783, + "grad_norm": 0.3591448928604562, + "learning_rate": 2.3975032614917264e-05, + "loss": 2.7368, + "step": 45544 + }, + { + "epoch": 2.1204460274227714, + "grad_norm": 0.3303179105106016, + "learning_rate": 2.397271976571776e-05, + "loss": 2.5914, + "step": 45545 + }, + { + "epoch": 2.1204925856088646, + "grad_norm": 0.32668360221996223, + "learning_rate": 2.3970406992905736e-05, + "loss": 2.6087, + "step": 45546 + }, + { + "epoch": 2.1205391437949577, + "grad_norm": 0.3451916896103606, + "learning_rate": 2.396809429648801e-05, + "loss": 2.6897, + "step": 45547 + }, + { + "epoch": 2.120585701981051, + "grad_norm": 0.35537051816307574, + "learning_rate": 2.3965781676471355e-05, + "loss": 2.634, + "step": 45548 + }, + { + "epoch": 2.120632260167144, + "grad_norm": 0.35933069853163696, + "learning_rate": 2.3963469132862587e-05, + "loss": 2.6634, + "step": 45549 + }, + { + "epoch": 2.120678818353237, + "grad_norm": 0.32378342751403194, + "learning_rate": 2.3961156665668455e-05, + "loss": 2.7036, + "step": 45550 + }, + { + "epoch": 2.12072537653933, + "grad_norm": 0.34107605888051123, + "learning_rate": 2.3958844274895763e-05, + "loss": 2.6996, + "step": 45551 + }, + { + "epoch": 2.1207719347254232, + "grad_norm": 0.3449371142104153, + "learning_rate": 2.3956531960551298e-05, + "loss": 2.6036, + "step": 45552 + }, + { + "epoch": 2.1208184929115164, + "grad_norm": 0.3446007232106591, + "learning_rate": 2.3954219722641846e-05, + "loss": 2.7133, + "step": 45553 + }, + { + "epoch": 2.120865051097609, + "grad_norm": 0.33724392354425947, + "learning_rate": 2.3951907561174208e-05, + "loss": 2.6669, + "step": 45554 + }, + { + "epoch": 2.120911609283702, + "grad_norm": 0.33842715116771777, + "learning_rate": 2.3949595476155124e-05, + "loss": 2.6623, + "step": 45555 + }, + { + "epoch": 2.1209581674697953, + "grad_norm": 0.33544529315935084, + "learning_rate": 2.394728346759144e-05, + "loss": 2.6583, + "step": 45556 + }, + { + "epoch": 2.1210047256558884, + "grad_norm": 0.345893175819599, + "learning_rate": 2.3944971535489878e-05, + "loss": 2.6624, + "step": 45557 + }, + { + "epoch": 2.1210512838419815, + "grad_norm": 0.3415420806355894, + "learning_rate": 2.39426596798573e-05, + "loss": 2.6247, + "step": 45558 + }, + { + "epoch": 2.1210978420280746, + "grad_norm": 0.3692935241796306, + "learning_rate": 2.3940347900700427e-05, + "loss": 2.6845, + "step": 45559 + }, + { + "epoch": 2.1211444002141677, + "grad_norm": 0.33221216741198795, + "learning_rate": 2.3938036198026066e-05, + "loss": 2.6737, + "step": 45560 + }, + { + "epoch": 2.121190958400261, + "grad_norm": 0.35888497872601144, + "learning_rate": 
2.3935724571841e-05, + "loss": 2.6275, + "step": 45561 + }, + { + "epoch": 2.121237516586354, + "grad_norm": 0.37191614803088613, + "learning_rate": 2.3933413022152018e-05, + "loss": 2.7284, + "step": 45562 + }, + { + "epoch": 2.121284074772447, + "grad_norm": 0.3338434723419252, + "learning_rate": 2.3931101548965917e-05, + "loss": 2.6676, + "step": 45563 + }, + { + "epoch": 2.1213306329585397, + "grad_norm": 0.3480821598318295, + "learning_rate": 2.3928790152289442e-05, + "loss": 2.6255, + "step": 45564 + }, + { + "epoch": 2.121377191144633, + "grad_norm": 0.35560134229133783, + "learning_rate": 2.3926478832129408e-05, + "loss": 2.6023, + "step": 45565 + }, + { + "epoch": 2.121423749330726, + "grad_norm": 0.34224508171568624, + "learning_rate": 2.392416758849258e-05, + "loss": 2.7142, + "step": 45566 + }, + { + "epoch": 2.121470307516819, + "grad_norm": 0.3477909997877899, + "learning_rate": 2.392185642138578e-05, + "loss": 2.6432, + "step": 45567 + }, + { + "epoch": 2.121516865702912, + "grad_norm": 0.3428066374994638, + "learning_rate": 2.3919545330815717e-05, + "loss": 2.5733, + "step": 45568 + }, + { + "epoch": 2.1215634238890053, + "grad_norm": 0.34634333418824814, + "learning_rate": 2.391723431678926e-05, + "loss": 2.6499, + "step": 45569 + }, + { + "epoch": 2.1216099820750984, + "grad_norm": 0.36072107334048115, + "learning_rate": 2.3914923379313114e-05, + "loss": 2.7239, + "step": 45570 + }, + { + "epoch": 2.1216565402611915, + "grad_norm": 0.34420210803149315, + "learning_rate": 2.3912612518394133e-05, + "loss": 2.7756, + "step": 45571 + }, + { + "epoch": 2.1217030984472847, + "grad_norm": 0.34720358030737364, + "learning_rate": 2.3910301734039047e-05, + "loss": 2.6115, + "step": 45572 + }, + { + "epoch": 2.121749656633378, + "grad_norm": 0.34637765187701397, + "learning_rate": 2.3907991026254654e-05, + "loss": 2.6772, + "step": 45573 + }, + { + "epoch": 2.1217962148194704, + "grad_norm": 0.34118054395710845, + "learning_rate": 2.3905680395047743e-05, + "loss": 2.6443, + "step": 45574 + }, + { + "epoch": 2.1218427730055636, + "grad_norm": 0.37461829798619467, + "learning_rate": 2.390336984042508e-05, + "loss": 2.5741, + "step": 45575 + }, + { + "epoch": 2.1218893311916567, + "grad_norm": 0.35743576775505376, + "learning_rate": 2.390105936239348e-05, + "loss": 2.6259, + "step": 45576 + }, + { + "epoch": 2.12193588937775, + "grad_norm": 0.3515818982813086, + "learning_rate": 2.3898748960959678e-05, + "loss": 2.6651, + "step": 45577 + }, + { + "epoch": 2.121982447563843, + "grad_norm": 0.324634846650764, + "learning_rate": 2.389643863613048e-05, + "loss": 2.6067, + "step": 45578 + }, + { + "epoch": 2.122029005749936, + "grad_norm": 0.3788356944248073, + "learning_rate": 2.3894128387912655e-05, + "loss": 2.6926, + "step": 45579 + }, + { + "epoch": 2.122075563936029, + "grad_norm": 0.35236666027314295, + "learning_rate": 2.3891818216313e-05, + "loss": 2.6119, + "step": 45580 + }, + { + "epoch": 2.1221221221221223, + "grad_norm": 0.3297771285103996, + "learning_rate": 2.3889508121338284e-05, + "loss": 2.6644, + "step": 45581 + }, + { + "epoch": 2.1221686803082154, + "grad_norm": 0.3158683763983323, + "learning_rate": 2.3887198102995305e-05, + "loss": 2.6206, + "step": 45582 + }, + { + "epoch": 2.1222152384943085, + "grad_norm": 0.3461381606169058, + "learning_rate": 2.3884888161290797e-05, + "loss": 2.7043, + "step": 45583 + }, + { + "epoch": 2.122261796680401, + "grad_norm": 0.3345725342808736, + "learning_rate": 2.38825782962316e-05, + "loss": 2.6202, + "step": 45584 + }, + { + 
"epoch": 2.1223083548664943, + "grad_norm": 0.31953638662698797, + "learning_rate": 2.388026850782445e-05, + "loss": 2.683, + "step": 45585 + }, + { + "epoch": 2.1223549130525874, + "grad_norm": 0.34290430274149153, + "learning_rate": 2.3877958796076132e-05, + "loss": 2.6497, + "step": 45586 + }, + { + "epoch": 2.1224014712386805, + "grad_norm": 0.32231942751774556, + "learning_rate": 2.3875649160993434e-05, + "loss": 2.7022, + "step": 45587 + }, + { + "epoch": 2.1224480294247736, + "grad_norm": 0.35327557859316044, + "learning_rate": 2.3873339602583132e-05, + "loss": 2.6608, + "step": 45588 + }, + { + "epoch": 2.1224945876108667, + "grad_norm": 0.3374388297294709, + "learning_rate": 2.3871030120852023e-05, + "loss": 2.6733, + "step": 45589 + }, + { + "epoch": 2.12254114579696, + "grad_norm": 0.3221688207179119, + "learning_rate": 2.3868720715806848e-05, + "loss": 2.7188, + "step": 45590 + }, + { + "epoch": 2.122587703983053, + "grad_norm": 0.3511137211152269, + "learning_rate": 2.3866411387454408e-05, + "loss": 2.6666, + "step": 45591 + }, + { + "epoch": 2.122634262169146, + "grad_norm": 0.3144132385813491, + "learning_rate": 2.3864102135801468e-05, + "loss": 2.6473, + "step": 45592 + }, + { + "epoch": 2.1226808203552388, + "grad_norm": 0.3330445982539781, + "learning_rate": 2.3861792960854816e-05, + "loss": 2.6731, + "step": 45593 + }, + { + "epoch": 2.122727378541332, + "grad_norm": 0.3519202000843122, + "learning_rate": 2.3859483862621228e-05, + "loss": 2.722, + "step": 45594 + }, + { + "epoch": 2.122773936727425, + "grad_norm": 0.34369093160329284, + "learning_rate": 2.38571748411075e-05, + "loss": 2.6706, + "step": 45595 + }, + { + "epoch": 2.122820494913518, + "grad_norm": 0.3492315547867036, + "learning_rate": 2.385486589632035e-05, + "loss": 2.6578, + "step": 45596 + }, + { + "epoch": 2.122867053099611, + "grad_norm": 0.3388098936498572, + "learning_rate": 2.3852557028266626e-05, + "loss": 2.6822, + "step": 45597 + }, + { + "epoch": 2.1229136112857043, + "grad_norm": 0.3666241584626147, + "learning_rate": 2.3850248236953037e-05, + "loss": 2.6451, + "step": 45598 + }, + { + "epoch": 2.1229601694717974, + "grad_norm": 0.3418581366411817, + "learning_rate": 2.384793952238643e-05, + "loss": 2.6082, + "step": 45599 + }, + { + "epoch": 2.1230067276578906, + "grad_norm": 0.36244118779819484, + "learning_rate": 2.384563088457353e-05, + "loss": 2.5361, + "step": 45600 + }, + { + "epoch": 2.1230532858439837, + "grad_norm": 0.3300525076435881, + "learning_rate": 2.3843322323521117e-05, + "loss": 2.6816, + "step": 45601 + }, + { + "epoch": 2.123099844030077, + "grad_norm": 0.3649074104949646, + "learning_rate": 2.3841013839236005e-05, + "loss": 2.5668, + "step": 45602 + }, + { + "epoch": 2.1231464022161695, + "grad_norm": 0.3338160751249627, + "learning_rate": 2.3838705431724917e-05, + "loss": 2.5833, + "step": 45603 + }, + { + "epoch": 2.1231929604022626, + "grad_norm": 0.3128644948610751, + "learning_rate": 2.383639710099465e-05, + "loss": 2.6713, + "step": 45604 + }, + { + "epoch": 2.1232395185883557, + "grad_norm": 0.3554568228384631, + "learning_rate": 2.3834088847051978e-05, + "loss": 2.5975, + "step": 45605 + }, + { + "epoch": 2.123286076774449, + "grad_norm": 0.3605723754802891, + "learning_rate": 2.383178066990368e-05, + "loss": 2.5852, + "step": 45606 + }, + { + "epoch": 2.123332634960542, + "grad_norm": 0.33550815918609517, + "learning_rate": 2.3829472569556534e-05, + "loss": 2.7473, + "step": 45607 + }, + { + "epoch": 2.123379193146635, + "grad_norm": 0.3130003945201595, + 
"learning_rate": 2.3827164546017315e-05, + "loss": 2.6812, + "step": 45608 + }, + { + "epoch": 2.123425751332728, + "grad_norm": 0.3658046877182528, + "learning_rate": 2.382485659929276e-05, + "loss": 2.5912, + "step": 45609 + }, + { + "epoch": 2.1234723095188213, + "grad_norm": 0.34978348581271007, + "learning_rate": 2.382254872938971e-05, + "loss": 2.6244, + "step": 45610 + }, + { + "epoch": 2.1235188677049144, + "grad_norm": 0.32192875978324403, + "learning_rate": 2.3820240936314857e-05, + "loss": 2.8363, + "step": 45611 + }, + { + "epoch": 2.1235654258910075, + "grad_norm": 0.3629388651101852, + "learning_rate": 2.381793322007506e-05, + "loss": 2.737, + "step": 45612 + }, + { + "epoch": 2.1236119840771, + "grad_norm": 0.34433987912603925, + "learning_rate": 2.381562558067703e-05, + "loss": 2.5923, + "step": 45613 + }, + { + "epoch": 2.1236585422631933, + "grad_norm": 0.33203873218014224, + "learning_rate": 2.3813318018127563e-05, + "loss": 2.6606, + "step": 45614 + }, + { + "epoch": 2.1237051004492864, + "grad_norm": 0.3186902672058834, + "learning_rate": 2.3811010532433442e-05, + "loss": 2.7172, + "step": 45615 + }, + { + "epoch": 2.1237516586353795, + "grad_norm": 0.34980988532973745, + "learning_rate": 2.380870312360139e-05, + "loss": 2.6534, + "step": 45616 + }, + { + "epoch": 2.1237982168214726, + "grad_norm": 0.3405135240207731, + "learning_rate": 2.380639579163825e-05, + "loss": 2.7648, + "step": 45617 + }, + { + "epoch": 2.1238447750075657, + "grad_norm": 0.34238404305376224, + "learning_rate": 2.3804088536550735e-05, + "loss": 2.5878, + "step": 45618 + }, + { + "epoch": 2.123891333193659, + "grad_norm": 0.31317872374397987, + "learning_rate": 2.380178135834564e-05, + "loss": 2.6271, + "step": 45619 + }, + { + "epoch": 2.123937891379752, + "grad_norm": 0.3265436089808424, + "learning_rate": 2.3799474257029743e-05, + "loss": 2.6107, + "step": 45620 + }, + { + "epoch": 2.123984449565845, + "grad_norm": 0.3425867150347925, + "learning_rate": 2.3797167232609823e-05, + "loss": 2.6561, + "step": 45621 + }, + { + "epoch": 2.124031007751938, + "grad_norm": 0.3345323662189333, + "learning_rate": 2.3794860285092596e-05, + "loss": 2.669, + "step": 45622 + }, + { + "epoch": 2.124077565938031, + "grad_norm": 0.34546444440506596, + "learning_rate": 2.3792553414484915e-05, + "loss": 2.6751, + "step": 45623 + }, + { + "epoch": 2.124124124124124, + "grad_norm": 0.3483234990815072, + "learning_rate": 2.3790246620793466e-05, + "loss": 2.5703, + "step": 45624 + }, + { + "epoch": 2.124170682310217, + "grad_norm": 0.3350662291687017, + "learning_rate": 2.37879399040251e-05, + "loss": 2.6373, + "step": 45625 + }, + { + "epoch": 2.12421724049631, + "grad_norm": 0.3081378828573404, + "learning_rate": 2.378563326418653e-05, + "loss": 2.6296, + "step": 45626 + }, + { + "epoch": 2.1242637986824033, + "grad_norm": 0.3607843778176517, + "learning_rate": 2.378332670128454e-05, + "loss": 2.706, + "step": 45627 + }, + { + "epoch": 2.1243103568684965, + "grad_norm": 0.33923099745708124, + "learning_rate": 2.378102021532593e-05, + "loss": 2.6529, + "step": 45628 + }, + { + "epoch": 2.1243569150545896, + "grad_norm": 0.33125996426646065, + "learning_rate": 2.3778713806317397e-05, + "loss": 2.5954, + "step": 45629 + }, + { + "epoch": 2.1244034732406827, + "grad_norm": 0.34121243516166233, + "learning_rate": 2.37764074742658e-05, + "loss": 2.6265, + "step": 45630 + }, + { + "epoch": 2.124450031426776, + "grad_norm": 0.32566561997717053, + "learning_rate": 2.377410121917784e-05, + "loss": 2.6796, + "step": 45631 + 
}, + { + "epoch": 2.1244965896128685, + "grad_norm": 0.3233391202376915, + "learning_rate": 2.3771795041060313e-05, + "loss": 2.6522, + "step": 45632 + }, + { + "epoch": 2.1245431477989616, + "grad_norm": 0.33539698817973046, + "learning_rate": 2.3769488939919977e-05, + "loss": 2.6336, + "step": 45633 + }, + { + "epoch": 2.1245897059850547, + "grad_norm": 0.336574274804176, + "learning_rate": 2.3767182915763615e-05, + "loss": 2.6301, + "step": 45634 + }, + { + "epoch": 2.124636264171148, + "grad_norm": 0.31551657952971096, + "learning_rate": 2.3764876968597982e-05, + "loss": 2.6062, + "step": 45635 + }, + { + "epoch": 2.124682822357241, + "grad_norm": 0.35736469305466506, + "learning_rate": 2.376257109842987e-05, + "loss": 2.6215, + "step": 45636 + }, + { + "epoch": 2.124729380543334, + "grad_norm": 0.32479323882923233, + "learning_rate": 2.376026530526599e-05, + "loss": 2.5987, + "step": 45637 + }, + { + "epoch": 2.124775938729427, + "grad_norm": 0.3310879126702528, + "learning_rate": 2.3757959589113183e-05, + "loss": 2.5871, + "step": 45638 + }, + { + "epoch": 2.1248224969155203, + "grad_norm": 0.341134922079523, + "learning_rate": 2.3755653949978162e-05, + "loss": 2.6138, + "step": 45639 + }, + { + "epoch": 2.1248690551016134, + "grad_norm": 0.34903237966294454, + "learning_rate": 2.3753348387867706e-05, + "loss": 2.7563, + "step": 45640 + }, + { + "epoch": 2.1249156132877065, + "grad_norm": 0.34136764800675246, + "learning_rate": 2.375104290278861e-05, + "loss": 2.6779, + "step": 45641 + }, + { + "epoch": 2.124962171473799, + "grad_norm": 0.3179692499003411, + "learning_rate": 2.374873749474758e-05, + "loss": 2.6869, + "step": 45642 + }, + { + "epoch": 2.1250087296598923, + "grad_norm": 0.34085039842184767, + "learning_rate": 2.374643216375146e-05, + "loss": 2.6045, + "step": 45643 + }, + { + "epoch": 2.1250552878459854, + "grad_norm": 0.341202772860577, + "learning_rate": 2.374412690980695e-05, + "loss": 2.6245, + "step": 45644 + }, + { + "epoch": 2.1251018460320785, + "grad_norm": 0.3282936446032464, + "learning_rate": 2.3741821732920842e-05, + "loss": 2.69, + "step": 45645 + }, + { + "epoch": 2.1251484042181716, + "grad_norm": 0.34013566678399665, + "learning_rate": 2.3739516633099905e-05, + "loss": 2.6901, + "step": 45646 + }, + { + "epoch": 2.1251949624042648, + "grad_norm": 0.3553069392925834, + "learning_rate": 2.3737211610350895e-05, + "loss": 2.6843, + "step": 45647 + }, + { + "epoch": 2.125241520590358, + "grad_norm": 0.34864264375303805, + "learning_rate": 2.3734906664680582e-05, + "loss": 2.6617, + "step": 45648 + }, + { + "epoch": 2.125288078776451, + "grad_norm": 0.3355582806024296, + "learning_rate": 2.373260179609575e-05, + "loss": 2.6404, + "step": 45649 + }, + { + "epoch": 2.125334636962544, + "grad_norm": 0.3374556202915455, + "learning_rate": 2.373029700460311e-05, + "loss": 2.6428, + "step": 45650 + }, + { + "epoch": 2.125381195148637, + "grad_norm": 0.3473458176887811, + "learning_rate": 2.3727992290209495e-05, + "loss": 2.6149, + "step": 45651 + }, + { + "epoch": 2.12542775333473, + "grad_norm": 0.320763576537744, + "learning_rate": 2.3725687652921612e-05, + "loss": 2.6111, + "step": 45652 + }, + { + "epoch": 2.125474311520823, + "grad_norm": 0.34062440294231766, + "learning_rate": 2.372338309274625e-05, + "loss": 2.6289, + "step": 45653 + }, + { + "epoch": 2.125520869706916, + "grad_norm": 0.3459643278381194, + "learning_rate": 2.372107860969019e-05, + "loss": 2.6766, + "step": 45654 + }, + { + "epoch": 2.1255674278930092, + "grad_norm": 0.34696426327158403, + 
"learning_rate": 2.3718774203760135e-05, + "loss": 2.7289, + "step": 45655 + }, + { + "epoch": 2.1256139860791023, + "grad_norm": 0.35375778997228424, + "learning_rate": 2.371646987496293e-05, + "loss": 2.5681, + "step": 45656 + }, + { + "epoch": 2.1256605442651955, + "grad_norm": 0.3562586974142165, + "learning_rate": 2.371416562330528e-05, + "loss": 2.597, + "step": 45657 + }, + { + "epoch": 2.1257071024512886, + "grad_norm": 0.3734691067092099, + "learning_rate": 2.3711861448793955e-05, + "loss": 2.6426, + "step": 45658 + }, + { + "epoch": 2.1257536606373817, + "grad_norm": 0.3519243665671311, + "learning_rate": 2.3709557351435735e-05, + "loss": 2.7063, + "step": 45659 + }, + { + "epoch": 2.125800218823475, + "grad_norm": 0.329910234855678, + "learning_rate": 2.3707253331237368e-05, + "loss": 2.6915, + "step": 45660 + }, + { + "epoch": 2.125846777009568, + "grad_norm": 0.3445996733866987, + "learning_rate": 2.3704949388205626e-05, + "loss": 2.6407, + "step": 45661 + }, + { + "epoch": 2.1258933351956606, + "grad_norm": 0.33513682673793943, + "learning_rate": 2.3702645522347287e-05, + "loss": 2.5961, + "step": 45662 + }, + { + "epoch": 2.1259398933817537, + "grad_norm": 0.33206048083225376, + "learning_rate": 2.3700341733669052e-05, + "loss": 2.6148, + "step": 45663 + }, + { + "epoch": 2.125986451567847, + "grad_norm": 0.3514046971072601, + "learning_rate": 2.369803802217776e-05, + "loss": 2.6618, + "step": 45664 + }, + { + "epoch": 2.12603300975394, + "grad_norm": 0.3459893295447379, + "learning_rate": 2.369573438788012e-05, + "loss": 2.5764, + "step": 45665 + }, + { + "epoch": 2.126079567940033, + "grad_norm": 0.36787482881709144, + "learning_rate": 2.369343083078291e-05, + "loss": 2.5857, + "step": 45666 + }, + { + "epoch": 2.126126126126126, + "grad_norm": 0.32627302581587253, + "learning_rate": 2.36911273508929e-05, + "loss": 2.8051, + "step": 45667 + }, + { + "epoch": 2.1261726843122193, + "grad_norm": 0.33070371233166784, + "learning_rate": 2.368882394821681e-05, + "loss": 2.6906, + "step": 45668 + }, + { + "epoch": 2.1262192424983124, + "grad_norm": 0.38067556264465013, + "learning_rate": 2.368652062276146e-05, + "loss": 2.7598, + "step": 45669 + }, + { + "epoch": 2.1262658006844055, + "grad_norm": 0.34284211948586907, + "learning_rate": 2.3684217374533547e-05, + "loss": 2.7125, + "step": 45670 + }, + { + "epoch": 2.126312358870498, + "grad_norm": 0.3382430377448683, + "learning_rate": 2.3681914203539902e-05, + "loss": 2.6748, + "step": 45671 + }, + { + "epoch": 2.1263589170565913, + "grad_norm": 0.35628287213812004, + "learning_rate": 2.367961110978722e-05, + "loss": 2.6148, + "step": 45672 + }, + { + "epoch": 2.1264054752426844, + "grad_norm": 0.35407423004103705, + "learning_rate": 2.367730809328229e-05, + "loss": 2.6249, + "step": 45673 + }, + { + "epoch": 2.1264520334287775, + "grad_norm": 0.335697086273927, + "learning_rate": 2.367500515403187e-05, + "loss": 2.6037, + "step": 45674 + }, + { + "epoch": 2.1264985916148706, + "grad_norm": 0.3756360234560366, + "learning_rate": 2.3672702292042737e-05, + "loss": 2.6313, + "step": 45675 + }, + { + "epoch": 2.1265451498009638, + "grad_norm": 0.3535292062435732, + "learning_rate": 2.3670399507321585e-05, + "loss": 2.6487, + "step": 45676 + }, + { + "epoch": 2.126591707987057, + "grad_norm": 0.3403945016063516, + "learning_rate": 2.3668096799875256e-05, + "loss": 2.733, + "step": 45677 + }, + { + "epoch": 2.12663826617315, + "grad_norm": 0.32438698895319334, + "learning_rate": 2.3665794169710454e-05, + "loss": 2.5558, + "step": 45678 
+ }, + { + "epoch": 2.126684824359243, + "grad_norm": 0.34416758649699747, + "learning_rate": 2.366349161683395e-05, + "loss": 2.6026, + "step": 45679 + }, + { + "epoch": 2.1267313825453362, + "grad_norm": 0.3250359040932734, + "learning_rate": 2.3661189141252522e-05, + "loss": 2.5756, + "step": 45680 + }, + { + "epoch": 2.1267779407314293, + "grad_norm": 0.3617617972040615, + "learning_rate": 2.365888674297287e-05, + "loss": 2.6734, + "step": 45681 + }, + { + "epoch": 2.126824498917522, + "grad_norm": 0.34537702521887076, + "learning_rate": 2.3656584422001826e-05, + "loss": 2.7927, + "step": 45682 + }, + { + "epoch": 2.126871057103615, + "grad_norm": 0.3242708902801859, + "learning_rate": 2.3654282178346076e-05, + "loss": 2.664, + "step": 45683 + }, + { + "epoch": 2.1269176152897082, + "grad_norm": 0.3538612493472915, + "learning_rate": 2.3651980012012454e-05, + "loss": 2.7097, + "step": 45684 + }, + { + "epoch": 2.1269641734758014, + "grad_norm": 0.3354472098095561, + "learning_rate": 2.364967792300765e-05, + "loss": 2.7418, + "step": 45685 + }, + { + "epoch": 2.1270107316618945, + "grad_norm": 0.31677088996732583, + "learning_rate": 2.364737591133845e-05, + "loss": 2.6589, + "step": 45686 + }, + { + "epoch": 2.1270572898479876, + "grad_norm": 0.3351757001744203, + "learning_rate": 2.3645073977011604e-05, + "loss": 2.7194, + "step": 45687 + }, + { + "epoch": 2.1271038480340807, + "grad_norm": 0.342035583355571, + "learning_rate": 2.364277212003387e-05, + "loss": 2.666, + "step": 45688 + }, + { + "epoch": 2.127150406220174, + "grad_norm": 0.34302215707944994, + "learning_rate": 2.3640470340412e-05, + "loss": 2.6827, + "step": 45689 + }, + { + "epoch": 2.127196964406267, + "grad_norm": 0.3349962493866636, + "learning_rate": 2.3638168638152775e-05, + "loss": 2.6624, + "step": 45690 + }, + { + "epoch": 2.1272435225923596, + "grad_norm": 0.3467873895135268, + "learning_rate": 2.3635867013262908e-05, + "loss": 2.7337, + "step": 45691 + }, + { + "epoch": 2.1272900807784527, + "grad_norm": 0.3416413101993077, + "learning_rate": 2.3633565465749175e-05, + "loss": 2.613, + "step": 45692 + }, + { + "epoch": 2.127336638964546, + "grad_norm": 0.3662499492445382, + "learning_rate": 2.3631263995618346e-05, + "loss": 2.7207, + "step": 45693 + }, + { + "epoch": 2.127383197150639, + "grad_norm": 0.33639311154258267, + "learning_rate": 2.3628962602877126e-05, + "loss": 2.5871, + "step": 45694 + }, + { + "epoch": 2.127429755336732, + "grad_norm": 0.33322129461317274, + "learning_rate": 2.3626661287532336e-05, + "loss": 2.662, + "step": 45695 + }, + { + "epoch": 2.127476313522825, + "grad_norm": 0.37141077323621174, + "learning_rate": 2.362436004959066e-05, + "loss": 2.644, + "step": 45696 + }, + { + "epoch": 2.1275228717089183, + "grad_norm": 0.31562313628716837, + "learning_rate": 2.3622058889058928e-05, + "loss": 2.5085, + "step": 45697 + }, + { + "epoch": 2.1275694298950114, + "grad_norm": 0.33880340960962474, + "learning_rate": 2.3619757805943836e-05, + "loss": 2.5998, + "step": 45698 + }, + { + "epoch": 2.1276159880811045, + "grad_norm": 0.3597677312077657, + "learning_rate": 2.3617456800252156e-05, + "loss": 2.6445, + "step": 45699 + }, + { + "epoch": 2.1276625462671976, + "grad_norm": 0.30544798481413055, + "learning_rate": 2.361515587199064e-05, + "loss": 2.6501, + "step": 45700 + }, + { + "epoch": 2.1277091044532903, + "grad_norm": 0.3291741888052206, + "learning_rate": 2.3612855021166048e-05, + "loss": 2.5415, + "step": 45701 + }, + { + "epoch": 2.1277556626393834, + "grad_norm": 
0.3496655572160569, + "learning_rate": 2.3610554247785118e-05, + "loss": 2.5851, + "step": 45702 + }, + { + "epoch": 2.1278022208254765, + "grad_norm": 0.3405822357160058, + "learning_rate": 2.3608253551854637e-05, + "loss": 2.7089, + "step": 45703 + }, + { + "epoch": 2.1278487790115697, + "grad_norm": 0.31995385971010765, + "learning_rate": 2.3605952933381308e-05, + "loss": 2.5595, + "step": 45704 + }, + { + "epoch": 2.1278953371976628, + "grad_norm": 0.3333073610581819, + "learning_rate": 2.3603652392371907e-05, + "loss": 2.6727, + "step": 45705 + }, + { + "epoch": 2.127941895383756, + "grad_norm": 0.36098580635872685, + "learning_rate": 2.360135192883319e-05, + "loss": 2.6629, + "step": 45706 + }, + { + "epoch": 2.127988453569849, + "grad_norm": 0.3393718532653891, + "learning_rate": 2.35990515427719e-05, + "loss": 2.594, + "step": 45707 + }, + { + "epoch": 2.128035011755942, + "grad_norm": 0.3366858454808981, + "learning_rate": 2.3596751234194815e-05, + "loss": 2.7458, + "step": 45708 + }, + { + "epoch": 2.1280815699420352, + "grad_norm": 0.3338449221081633, + "learning_rate": 2.3594451003108626e-05, + "loss": 2.573, + "step": 45709 + }, + { + "epoch": 2.128128128128128, + "grad_norm": 0.35916148692389893, + "learning_rate": 2.3592150849520155e-05, + "loss": 2.6976, + "step": 45710 + }, + { + "epoch": 2.128174686314221, + "grad_norm": 0.3451476150341189, + "learning_rate": 2.3589850773436107e-05, + "loss": 2.6489, + "step": 45711 + }, + { + "epoch": 2.128221244500314, + "grad_norm": 0.3426301878806467, + "learning_rate": 2.3587550774863244e-05, + "loss": 2.598, + "step": 45712 + }, + { + "epoch": 2.1282678026864073, + "grad_norm": 0.37978788960527676, + "learning_rate": 2.3585250853808316e-05, + "loss": 2.7075, + "step": 45713 + }, + { + "epoch": 2.1283143608725004, + "grad_norm": 0.3298987499455675, + "learning_rate": 2.358295101027807e-05, + "loss": 2.5532, + "step": 45714 + }, + { + "epoch": 2.1283609190585935, + "grad_norm": 0.34252778476227036, + "learning_rate": 2.3580651244279266e-05, + "loss": 2.6328, + "step": 45715 + }, + { + "epoch": 2.1284074772446866, + "grad_norm": 0.35186637886623173, + "learning_rate": 2.3578351555818667e-05, + "loss": 2.6249, + "step": 45716 + }, + { + "epoch": 2.1284540354307797, + "grad_norm": 0.3285878071262159, + "learning_rate": 2.357605194490298e-05, + "loss": 2.6117, + "step": 45717 + }, + { + "epoch": 2.128500593616873, + "grad_norm": 0.35425192408122724, + "learning_rate": 2.357375241153898e-05, + "loss": 2.6395, + "step": 45718 + }, + { + "epoch": 2.128547151802966, + "grad_norm": 0.34118486090367356, + "learning_rate": 2.3571452955733415e-05, + "loss": 2.649, + "step": 45719 + }, + { + "epoch": 2.128593709989059, + "grad_norm": 0.31342661585740833, + "learning_rate": 2.356915357749303e-05, + "loss": 2.5432, + "step": 45720 + }, + { + "epoch": 2.1286402681751517, + "grad_norm": 0.3374552624546199, + "learning_rate": 2.3566854276824594e-05, + "loss": 2.5904, + "step": 45721 + }, + { + "epoch": 2.128686826361245, + "grad_norm": 0.3619145693187338, + "learning_rate": 2.35645550537348e-05, + "loss": 2.6941, + "step": 45722 + }, + { + "epoch": 2.128733384547338, + "grad_norm": 0.32581419217671526, + "learning_rate": 2.3562255908230468e-05, + "loss": 2.6084, + "step": 45723 + }, + { + "epoch": 2.128779942733431, + "grad_norm": 0.3213449975065255, + "learning_rate": 2.3559956840318274e-05, + "loss": 2.5916, + "step": 45724 + }, + { + "epoch": 2.128826500919524, + "grad_norm": 0.36476663822575417, + "learning_rate": 2.3557657850005042e-05, + "loss": 
2.6772, + "step": 45725 + }, + { + "epoch": 2.1288730591056173, + "grad_norm": 0.318638743171862, + "learning_rate": 2.3555358937297457e-05, + "loss": 2.7398, + "step": 45726 + }, + { + "epoch": 2.1289196172917104, + "grad_norm": 0.31990973173257126, + "learning_rate": 2.355306010220229e-05, + "loss": 2.6543, + "step": 45727 + }, + { + "epoch": 2.1289661754778035, + "grad_norm": 0.3311681652452666, + "learning_rate": 2.3550761344726287e-05, + "loss": 2.7932, + "step": 45728 + }, + { + "epoch": 2.1290127336638967, + "grad_norm": 0.3092396330763971, + "learning_rate": 2.354846266487619e-05, + "loss": 2.6585, + "step": 45729 + }, + { + "epoch": 2.1290592918499893, + "grad_norm": 0.3191903552957182, + "learning_rate": 2.354616406265877e-05, + "loss": 2.6274, + "step": 45730 + }, + { + "epoch": 2.1291058500360824, + "grad_norm": 0.32599645078923795, + "learning_rate": 2.354386553808074e-05, + "loss": 2.5876, + "step": 45731 + }, + { + "epoch": 2.1291524082221756, + "grad_norm": 0.3087566684601166, + "learning_rate": 2.3541567091148848e-05, + "loss": 2.5081, + "step": 45732 + }, + { + "epoch": 2.1291989664082687, + "grad_norm": 0.32920209118255683, + "learning_rate": 2.3539268721869856e-05, + "loss": 2.6211, + "step": 45733 + }, + { + "epoch": 2.129245524594362, + "grad_norm": 0.3346211335558057, + "learning_rate": 2.353697043025052e-05, + "loss": 2.6811, + "step": 45734 + }, + { + "epoch": 2.129292082780455, + "grad_norm": 0.3202167256450041, + "learning_rate": 2.353467221629753e-05, + "loss": 2.6324, + "step": 45735 + }, + { + "epoch": 2.129338640966548, + "grad_norm": 0.3323512677000684, + "learning_rate": 2.353237408001771e-05, + "loss": 2.5899, + "step": 45736 + }, + { + "epoch": 2.129385199152641, + "grad_norm": 0.3594294616881098, + "learning_rate": 2.3530076021417724e-05, + "loss": 2.7482, + "step": 45737 + }, + { + "epoch": 2.1294317573387342, + "grad_norm": 0.3274973511316574, + "learning_rate": 2.35277780405044e-05, + "loss": 2.6076, + "step": 45738 + }, + { + "epoch": 2.1294783155248274, + "grad_norm": 0.3323031799562519, + "learning_rate": 2.3525480137284416e-05, + "loss": 2.6326, + "step": 45739 + }, + { + "epoch": 2.12952487371092, + "grad_norm": 0.36137584676388407, + "learning_rate": 2.352318231176454e-05, + "loss": 2.7226, + "step": 45740 + }, + { + "epoch": 2.129571431897013, + "grad_norm": 0.33807481314418386, + "learning_rate": 2.3520884563951517e-05, + "loss": 2.644, + "step": 45741 + }, + { + "epoch": 2.1296179900831063, + "grad_norm": 0.339358334000865, + "learning_rate": 2.351858689385209e-05, + "loss": 2.618, + "step": 45742 + }, + { + "epoch": 2.1296645482691994, + "grad_norm": 0.3505109205957775, + "learning_rate": 2.3516289301473023e-05, + "loss": 2.6626, + "step": 45743 + }, + { + "epoch": 2.1297111064552925, + "grad_norm": 0.3358981154399181, + "learning_rate": 2.351399178682101e-05, + "loss": 2.6384, + "step": 45744 + }, + { + "epoch": 2.1297576646413856, + "grad_norm": 0.33199347988489786, + "learning_rate": 2.351169434990283e-05, + "loss": 2.4857, + "step": 45745 + }, + { + "epoch": 2.1298042228274787, + "grad_norm": 0.35319803569440633, + "learning_rate": 2.3509396990725214e-05, + "loss": 2.6886, + "step": 45746 + }, + { + "epoch": 2.129850781013572, + "grad_norm": 0.3453898345958492, + "learning_rate": 2.3507099709294912e-05, + "loss": 2.7518, + "step": 45747 + }, + { + "epoch": 2.129897339199665, + "grad_norm": 0.37919990865233355, + "learning_rate": 2.3504802505618657e-05, + "loss": 2.5793, + "step": 45748 + }, + { + "epoch": 2.1299438973857576, + 
"grad_norm": 0.34412866689313426, + "learning_rate": 2.3502505379703214e-05, + "loss": 2.5866, + "step": 45749 + }, + { + "epoch": 2.1299904555718507, + "grad_norm": 0.32433687819158546, + "learning_rate": 2.3500208331555273e-05, + "loss": 2.5501, + "step": 45750 + }, + { + "epoch": 2.130037013757944, + "grad_norm": 0.3861175955599521, + "learning_rate": 2.3497911361181653e-05, + "loss": 2.6851, + "step": 45751 + }, + { + "epoch": 2.130083571944037, + "grad_norm": 0.3612498944033396, + "learning_rate": 2.3495614468589022e-05, + "loss": 2.5868, + "step": 45752 + }, + { + "epoch": 2.13013013013013, + "grad_norm": 0.33541123057959216, + "learning_rate": 2.349331765378416e-05, + "loss": 2.606, + "step": 45753 + }, + { + "epoch": 2.130176688316223, + "grad_norm": 0.34269167376001786, + "learning_rate": 2.34910209167738e-05, + "loss": 2.6007, + "step": 45754 + }, + { + "epoch": 2.1302232465023163, + "grad_norm": 0.33229329382563444, + "learning_rate": 2.3488724257564677e-05, + "loss": 2.6738, + "step": 45755 + }, + { + "epoch": 2.1302698046884094, + "grad_norm": 0.3181052628435813, + "learning_rate": 2.348642767616356e-05, + "loss": 2.6863, + "step": 45756 + }, + { + "epoch": 2.1303163628745025, + "grad_norm": 0.3272466241583976, + "learning_rate": 2.3484131172577138e-05, + "loss": 2.7123, + "step": 45757 + }, + { + "epoch": 2.1303629210605957, + "grad_norm": 0.3595470989442849, + "learning_rate": 2.3481834746812188e-05, + "loss": 2.7015, + "step": 45758 + }, + { + "epoch": 2.1304094792466888, + "grad_norm": 0.3264966310507831, + "learning_rate": 2.3479538398875433e-05, + "loss": 2.5903, + "step": 45759 + }, + { + "epoch": 2.1304560374327814, + "grad_norm": 0.31913052098814015, + "learning_rate": 2.3477242128773624e-05, + "loss": 2.6456, + "step": 45760 + }, + { + "epoch": 2.1305025956188746, + "grad_norm": 0.32794887562127, + "learning_rate": 2.34749459365135e-05, + "loss": 2.7074, + "step": 45761 + }, + { + "epoch": 2.1305491538049677, + "grad_norm": 0.343039633610417, + "learning_rate": 2.347264982210181e-05, + "loss": 2.5665, + "step": 45762 + }, + { + "epoch": 2.130595711991061, + "grad_norm": 0.32677230149682956, + "learning_rate": 2.3470353785545246e-05, + "loss": 2.6945, + "step": 45763 + }, + { + "epoch": 2.130642270177154, + "grad_norm": 0.3332653239821054, + "learning_rate": 2.3468057826850614e-05, + "loss": 2.6933, + "step": 45764 + }, + { + "epoch": 2.130688828363247, + "grad_norm": 0.3243468053744382, + "learning_rate": 2.3465761946024602e-05, + "loss": 2.6453, + "step": 45765 + }, + { + "epoch": 2.13073538654934, + "grad_norm": 0.3426324722033856, + "learning_rate": 2.3463466143073965e-05, + "loss": 2.6347, + "step": 45766 + }, + { + "epoch": 2.1307819447354333, + "grad_norm": 0.3116697647690656, + "learning_rate": 2.346117041800544e-05, + "loss": 2.523, + "step": 45767 + }, + { + "epoch": 2.1308285029215264, + "grad_norm": 0.3289127006297617, + "learning_rate": 2.345887477082576e-05, + "loss": 2.646, + "step": 45768 + }, + { + "epoch": 2.130875061107619, + "grad_norm": 0.3331630706886812, + "learning_rate": 2.3456579201541696e-05, + "loss": 2.6228, + "step": 45769 + }, + { + "epoch": 2.130921619293712, + "grad_norm": 0.33892959101954767, + "learning_rate": 2.3454283710159935e-05, + "loss": 2.7453, + "step": 45770 + }, + { + "epoch": 2.1309681774798053, + "grad_norm": 0.3285006992086318, + "learning_rate": 2.3451988296687234e-05, + "loss": 2.6966, + "step": 45771 + }, + { + "epoch": 2.1310147356658984, + "grad_norm": 0.31978030024724535, + "learning_rate": 2.3449692961130333e-05, 
+ "loss": 2.6345, + "step": 45772 + }, + { + "epoch": 2.1310612938519915, + "grad_norm": 0.3450262543015428, + "learning_rate": 2.3447397703495967e-05, + "loss": 2.7166, + "step": 45773 + }, + { + "epoch": 2.1311078520380846, + "grad_norm": 0.3272752656815147, + "learning_rate": 2.3445102523790872e-05, + "loss": 2.6098, + "step": 45774 + }, + { + "epoch": 2.1311544102241777, + "grad_norm": 0.31925431732997545, + "learning_rate": 2.3442807422021807e-05, + "loss": 2.5375, + "step": 45775 + }, + { + "epoch": 2.131200968410271, + "grad_norm": 0.3208039442742291, + "learning_rate": 2.3440512398195447e-05, + "loss": 2.6153, + "step": 45776 + }, + { + "epoch": 2.131247526596364, + "grad_norm": 0.32932304596714396, + "learning_rate": 2.34382174523186e-05, + "loss": 2.6818, + "step": 45777 + }, + { + "epoch": 2.131294084782457, + "grad_norm": 0.3031572763393626, + "learning_rate": 2.343592258439795e-05, + "loss": 2.5285, + "step": 45778 + }, + { + "epoch": 2.1313406429685497, + "grad_norm": 0.3598901260490801, + "learning_rate": 2.3433627794440254e-05, + "loss": 2.7096, + "step": 45779 + }, + { + "epoch": 2.131387201154643, + "grad_norm": 0.3027134802958248, + "learning_rate": 2.3431333082452244e-05, + "loss": 2.6878, + "step": 45780 + }, + { + "epoch": 2.131433759340736, + "grad_norm": 0.32676046368948986, + "learning_rate": 2.3429038448440655e-05, + "loss": 2.6476, + "step": 45781 + }, + { + "epoch": 2.131480317526829, + "grad_norm": 0.34424808393095263, + "learning_rate": 2.3426743892412234e-05, + "loss": 2.632, + "step": 45782 + }, + { + "epoch": 2.131526875712922, + "grad_norm": 0.3365455176790271, + "learning_rate": 2.3424449414373667e-05, + "loss": 2.6458, + "step": 45783 + }, + { + "epoch": 2.1315734338990153, + "grad_norm": 0.3173974232874464, + "learning_rate": 2.3422155014331758e-05, + "loss": 2.6004, + "step": 45784 + }, + { + "epoch": 2.1316199920851084, + "grad_norm": 0.3378470081065185, + "learning_rate": 2.3419860692293187e-05, + "loss": 2.6951, + "step": 45785 + }, + { + "epoch": 2.1316665502712016, + "grad_norm": 0.34483244770331667, + "learning_rate": 2.341756644826471e-05, + "loss": 2.6692, + "step": 45786 + }, + { + "epoch": 2.1317131084572947, + "grad_norm": 0.3431439666792755, + "learning_rate": 2.3415272282253055e-05, + "loss": 2.6683, + "step": 45787 + }, + { + "epoch": 2.1317596666433873, + "grad_norm": 0.3265777863307726, + "learning_rate": 2.3412978194264974e-05, + "loss": 2.5904, + "step": 45788 + }, + { + "epoch": 2.1318062248294805, + "grad_norm": 0.3270555549220393, + "learning_rate": 2.3410684184307148e-05, + "loss": 2.716, + "step": 45789 + }, + { + "epoch": 2.1318527830155736, + "grad_norm": 0.31944096449101617, + "learning_rate": 2.340839025238638e-05, + "loss": 2.7022, + "step": 45790 + }, + { + "epoch": 2.1318993412016667, + "grad_norm": 0.3398777143739628, + "learning_rate": 2.340609639850935e-05, + "loss": 2.7159, + "step": 45791 + }, + { + "epoch": 2.13194589938776, + "grad_norm": 0.34713422387159487, + "learning_rate": 2.3403802622682803e-05, + "loss": 2.7004, + "step": 45792 + }, + { + "epoch": 2.131992457573853, + "grad_norm": 0.3367061516581328, + "learning_rate": 2.3401508924913478e-05, + "loss": 2.6857, + "step": 45793 + }, + { + "epoch": 2.132039015759946, + "grad_norm": 0.34584861210623535, + "learning_rate": 2.3399215305208104e-05, + "loss": 2.6008, + "step": 45794 + }, + { + "epoch": 2.132085573946039, + "grad_norm": 0.3289842710178249, + "learning_rate": 2.3396921763573433e-05, + "loss": 2.5442, + "step": 45795 + }, + { + "epoch": 
2.1321321321321323, + "grad_norm": 0.3743551124827685, + "learning_rate": 2.3394628300016137e-05, + "loss": 2.7485, + "step": 45796 + }, + { + "epoch": 2.1321786903182254, + "grad_norm": 0.3486928876867175, + "learning_rate": 2.3392334914543017e-05, + "loss": 2.6776, + "step": 45797 + }, + { + "epoch": 2.1322252485043185, + "grad_norm": 0.3296844467127885, + "learning_rate": 2.3390041607160762e-05, + "loss": 2.5533, + "step": 45798 + }, + { + "epoch": 2.132271806690411, + "grad_norm": 0.35801793198756815, + "learning_rate": 2.3387748377876113e-05, + "loss": 2.7007, + "step": 45799 + }, + { + "epoch": 2.1323183648765043, + "grad_norm": 0.3472471120174757, + "learning_rate": 2.3385455226695797e-05, + "loss": 2.6892, + "step": 45800 + }, + { + "epoch": 2.1323649230625974, + "grad_norm": 0.3473057571514469, + "learning_rate": 2.3383162153626555e-05, + "loss": 2.6557, + "step": 45801 + }, + { + "epoch": 2.1324114812486905, + "grad_norm": 0.3630622448343166, + "learning_rate": 2.33808691586751e-05, + "loss": 2.7236, + "step": 45802 + }, + { + "epoch": 2.1324580394347836, + "grad_norm": 0.3252001769546638, + "learning_rate": 2.3378576241848192e-05, + "loss": 2.6258, + "step": 45803 + }, + { + "epoch": 2.1325045976208767, + "grad_norm": 0.3360917388104713, + "learning_rate": 2.3376283403152526e-05, + "loss": 2.6744, + "step": 45804 + }, + { + "epoch": 2.13255115580697, + "grad_norm": 0.3562544711181632, + "learning_rate": 2.3373990642594847e-05, + "loss": 2.6426, + "step": 45805 + }, + { + "epoch": 2.132597713993063, + "grad_norm": 0.3145195216238206, + "learning_rate": 2.3371697960181875e-05, + "loss": 2.6743, + "step": 45806 + }, + { + "epoch": 2.132644272179156, + "grad_norm": 0.32547475443169366, + "learning_rate": 2.3369405355920355e-05, + "loss": 2.5406, + "step": 45807 + }, + { + "epoch": 2.1326908303652488, + "grad_norm": 0.34884237680169844, + "learning_rate": 2.3367112829817018e-05, + "loss": 2.6675, + "step": 45808 + }, + { + "epoch": 2.132737388551342, + "grad_norm": 0.356844343527583, + "learning_rate": 2.3364820381878548e-05, + "loss": 2.693, + "step": 45809 + }, + { + "epoch": 2.132783946737435, + "grad_norm": 0.3570793397862788, + "learning_rate": 2.336252801211174e-05, + "loss": 2.6275, + "step": 45810 + }, + { + "epoch": 2.132830504923528, + "grad_norm": 0.3480494405665998, + "learning_rate": 2.336023572052327e-05, + "loss": 2.6943, + "step": 45811 + }, + { + "epoch": 2.132877063109621, + "grad_norm": 0.3304872195626714, + "learning_rate": 2.335794350711989e-05, + "loss": 2.6399, + "step": 45812 + }, + { + "epoch": 2.1329236212957143, + "grad_norm": 0.3626969654227701, + "learning_rate": 2.335565137190831e-05, + "loss": 2.6947, + "step": 45813 + }, + { + "epoch": 2.1329701794818074, + "grad_norm": 0.3627363251415348, + "learning_rate": 2.335335931489528e-05, + "loss": 2.655, + "step": 45814 + }, + { + "epoch": 2.1330167376679006, + "grad_norm": 0.33409024080790073, + "learning_rate": 2.3351067336087505e-05, + "loss": 2.6054, + "step": 45815 + }, + { + "epoch": 2.1330632958539937, + "grad_norm": 0.34581651763603155, + "learning_rate": 2.3348775435491748e-05, + "loss": 2.5856, + "step": 45816 + }, + { + "epoch": 2.133109854040087, + "grad_norm": 0.3454613833176556, + "learning_rate": 2.334648361311469e-05, + "loss": 2.5816, + "step": 45817 + }, + { + "epoch": 2.1331564122261795, + "grad_norm": 0.34767219133127025, + "learning_rate": 2.3344191868963072e-05, + "loss": 2.7461, + "step": 45818 + }, + { + "epoch": 2.1332029704122726, + "grad_norm": 0.3295086194048547, + "learning_rate": 
2.3341900203043627e-05, + "loss": 2.59, + "step": 45819 + }, + { + "epoch": 2.1332495285983657, + "grad_norm": 0.3605350912060892, + "learning_rate": 2.3339608615363078e-05, + "loss": 2.6872, + "step": 45820 + }, + { + "epoch": 2.133296086784459, + "grad_norm": 0.33770212988151865, + "learning_rate": 2.3337317105928176e-05, + "loss": 2.6045, + "step": 45821 + }, + { + "epoch": 2.133342644970552, + "grad_norm": 0.33815365073901105, + "learning_rate": 2.3335025674745576e-05, + "loss": 2.5381, + "step": 45822 + }, + { + "epoch": 2.133389203156645, + "grad_norm": 0.3461169151482887, + "learning_rate": 2.3332734321822092e-05, + "loss": 2.5853, + "step": 45823 + }, + { + "epoch": 2.133435761342738, + "grad_norm": 0.3471447440413913, + "learning_rate": 2.3330443047164384e-05, + "loss": 2.6642, + "step": 45824 + }, + { + "epoch": 2.1334823195288313, + "grad_norm": 0.35470521429681556, + "learning_rate": 2.3328151850779202e-05, + "loss": 2.728, + "step": 45825 + }, + { + "epoch": 2.1335288777149244, + "grad_norm": 0.3385658680842685, + "learning_rate": 2.332586073267326e-05, + "loss": 2.6693, + "step": 45826 + }, + { + "epoch": 2.1335754359010175, + "grad_norm": 0.347017825345616, + "learning_rate": 2.3323569692853293e-05, + "loss": 2.7151, + "step": 45827 + }, + { + "epoch": 2.13362199408711, + "grad_norm": 0.33667116562721167, + "learning_rate": 2.3321278731326017e-05, + "loss": 2.5717, + "step": 45828 + }, + { + "epoch": 2.1336685522732033, + "grad_norm": 0.3569014888193704, + "learning_rate": 2.3318987848098185e-05, + "loss": 2.5839, + "step": 45829 + }, + { + "epoch": 2.1337151104592964, + "grad_norm": 0.34224794946396137, + "learning_rate": 2.3316697043176473e-05, + "loss": 2.6322, + "step": 45830 + }, + { + "epoch": 2.1337616686453895, + "grad_norm": 0.3622948605336053, + "learning_rate": 2.331440631656763e-05, + "loss": 2.5616, + "step": 45831 + }, + { + "epoch": 2.1338082268314826, + "grad_norm": 0.37077654427801343, + "learning_rate": 2.3312115668278368e-05, + "loss": 2.6871, + "step": 45832 + }, + { + "epoch": 2.1338547850175758, + "grad_norm": 0.3305900502443493, + "learning_rate": 2.330982509831542e-05, + "loss": 2.6784, + "step": 45833 + }, + { + "epoch": 2.133901343203669, + "grad_norm": 0.39878257793577493, + "learning_rate": 2.330753460668553e-05, + "loss": 2.7472, + "step": 45834 + }, + { + "epoch": 2.133947901389762, + "grad_norm": 0.3585106890574466, + "learning_rate": 2.3305244193395354e-05, + "loss": 2.7292, + "step": 45835 + }, + { + "epoch": 2.133994459575855, + "grad_norm": 0.3351156134004635, + "learning_rate": 2.3302953858451692e-05, + "loss": 2.6145, + "step": 45836 + }, + { + "epoch": 2.134041017761948, + "grad_norm": 0.3486802527156884, + "learning_rate": 2.3300663601861196e-05, + "loss": 2.6223, + "step": 45837 + }, + { + "epoch": 2.134087575948041, + "grad_norm": 0.34767611647104407, + "learning_rate": 2.329837342363066e-05, + "loss": 2.6154, + "step": 45838 + }, + { + "epoch": 2.134134134134134, + "grad_norm": 0.35099697660571527, + "learning_rate": 2.329608332376675e-05, + "loss": 2.678, + "step": 45839 + }, + { + "epoch": 2.134180692320227, + "grad_norm": 0.365256512371344, + "learning_rate": 2.32937933022762e-05, + "loss": 2.7486, + "step": 45840 + }, + { + "epoch": 2.1342272505063202, + "grad_norm": 0.363427777842137, + "learning_rate": 2.3291503359165738e-05, + "loss": 2.6758, + "step": 45841 + }, + { + "epoch": 2.1342738086924133, + "grad_norm": 0.3449251858657845, + "learning_rate": 2.3289213494442103e-05, + "loss": 2.6056, + "step": 45842 + }, + { + "epoch": 
2.1343203668785065, + "grad_norm": 0.3413511026821933, + "learning_rate": 2.3286923708111973e-05, + "loss": 2.5913, + "step": 45843 + }, + { + "epoch": 2.1343669250645996, + "grad_norm": 0.3330739535213985, + "learning_rate": 2.3284634000182092e-05, + "loss": 2.6547, + "step": 45844 + }, + { + "epoch": 2.1344134832506927, + "grad_norm": 0.35231681050543023, + "learning_rate": 2.3282344370659178e-05, + "loss": 2.6591, + "step": 45845 + }, + { + "epoch": 2.134460041436786, + "grad_norm": 0.3377239541945691, + "learning_rate": 2.3280054819549952e-05, + "loss": 2.5133, + "step": 45846 + }, + { + "epoch": 2.1345065996228785, + "grad_norm": 0.3669211851022344, + "learning_rate": 2.3277765346861147e-05, + "loss": 2.6197, + "step": 45847 + }, + { + "epoch": 2.1345531578089716, + "grad_norm": 0.3164901499846299, + "learning_rate": 2.3275475952599435e-05, + "loss": 2.6018, + "step": 45848 + }, + { + "epoch": 2.1345997159950647, + "grad_norm": 0.35113700140038795, + "learning_rate": 2.3273186636771605e-05, + "loss": 2.6028, + "step": 45849 + }, + { + "epoch": 2.134646274181158, + "grad_norm": 0.33008209333407795, + "learning_rate": 2.32708973993843e-05, + "loss": 2.6407, + "step": 45850 + }, + { + "epoch": 2.134692832367251, + "grad_norm": 0.3694182989008489, + "learning_rate": 2.3268608240444324e-05, + "loss": 2.7266, + "step": 45851 + }, + { + "epoch": 2.134739390553344, + "grad_norm": 0.34039113456833175, + "learning_rate": 2.3266319159958323e-05, + "loss": 2.6321, + "step": 45852 + }, + { + "epoch": 2.134785948739437, + "grad_norm": 0.34012546031474694, + "learning_rate": 2.3264030157933035e-05, + "loss": 2.5648, + "step": 45853 + }, + { + "epoch": 2.1348325069255303, + "grad_norm": 0.3651539958156236, + "learning_rate": 2.3261741234375196e-05, + "loss": 2.5985, + "step": 45854 + }, + { + "epoch": 2.1348790651116234, + "grad_norm": 0.3477430915626853, + "learning_rate": 2.325945238929151e-05, + "loss": 2.7348, + "step": 45855 + }, + { + "epoch": 2.1349256232977165, + "grad_norm": 0.33574448957750364, + "learning_rate": 2.325716362268871e-05, + "loss": 2.5856, + "step": 45856 + }, + { + "epoch": 2.134972181483809, + "grad_norm": 0.3294470891300098, + "learning_rate": 2.325487493457349e-05, + "loss": 2.6308, + "step": 45857 + }, + { + "epoch": 2.1350187396699023, + "grad_norm": 0.3542962798610267, + "learning_rate": 2.325258632495257e-05, + "loss": 2.6674, + "step": 45858 + }, + { + "epoch": 2.1350652978559954, + "grad_norm": 0.357353770844395, + "learning_rate": 2.3250297793832676e-05, + "loss": 2.6658, + "step": 45859 + }, + { + "epoch": 2.1351118560420885, + "grad_norm": 0.3331178160486559, + "learning_rate": 2.3248009341220544e-05, + "loss": 2.6699, + "step": 45860 + }, + { + "epoch": 2.1351584142281816, + "grad_norm": 0.32684689907256886, + "learning_rate": 2.3245720967122824e-05, + "loss": 2.6882, + "step": 45861 + }, + { + "epoch": 2.1352049724142748, + "grad_norm": 0.32042133555268465, + "learning_rate": 2.3243432671546324e-05, + "loss": 2.6246, + "step": 45862 + }, + { + "epoch": 2.135251530600368, + "grad_norm": 0.33749997450187913, + "learning_rate": 2.324114445449767e-05, + "loss": 2.5273, + "step": 45863 + }, + { + "epoch": 2.135298088786461, + "grad_norm": 0.3367818701180519, + "learning_rate": 2.323885631598366e-05, + "loss": 2.6318, + "step": 45864 + }, + { + "epoch": 2.135344646972554, + "grad_norm": 0.34070477518024117, + "learning_rate": 2.3236568256010954e-05, + "loss": 2.6309, + "step": 45865 + }, + { + "epoch": 2.135391205158647, + "grad_norm": 0.3238169188070697, + 
"learning_rate": 2.323428027458628e-05, + "loss": 2.5338, + "step": 45866 + }, + { + "epoch": 2.13543776334474, + "grad_norm": 0.35361699305160676, + "learning_rate": 2.323199237171636e-05, + "loss": 2.5941, + "step": 45867 + }, + { + "epoch": 2.135484321530833, + "grad_norm": 0.35245449042222726, + "learning_rate": 2.3229704547407903e-05, + "loss": 2.7487, + "step": 45868 + }, + { + "epoch": 2.135530879716926, + "grad_norm": 0.3426056447019484, + "learning_rate": 2.3227416801667646e-05, + "loss": 2.8076, + "step": 45869 + }, + { + "epoch": 2.1355774379030192, + "grad_norm": 0.34565388903001254, + "learning_rate": 2.322512913450226e-05, + "loss": 2.6833, + "step": 45870 + }, + { + "epoch": 2.1356239960891124, + "grad_norm": 0.3465032617814521, + "learning_rate": 2.3222841545918484e-05, + "loss": 2.6249, + "step": 45871 + }, + { + "epoch": 2.1356705542752055, + "grad_norm": 0.3426434616579157, + "learning_rate": 2.322055403592303e-05, + "loss": 2.6647, + "step": 45872 + }, + { + "epoch": 2.1357171124612986, + "grad_norm": 0.33892390901903807, + "learning_rate": 2.3218266604522615e-05, + "loss": 2.6992, + "step": 45873 + }, + { + "epoch": 2.1357636706473917, + "grad_norm": 0.34661033189934726, + "learning_rate": 2.3215979251723947e-05, + "loss": 2.643, + "step": 45874 + }, + { + "epoch": 2.135810228833485, + "grad_norm": 0.32967868477878476, + "learning_rate": 2.3213691977533765e-05, + "loss": 2.5771, + "step": 45875 + }, + { + "epoch": 2.135856787019578, + "grad_norm": 0.33270115959668617, + "learning_rate": 2.3211404781958713e-05, + "loss": 2.6903, + "step": 45876 + }, + { + "epoch": 2.1359033452056706, + "grad_norm": 0.347225495092358, + "learning_rate": 2.3209117665005585e-05, + "loss": 2.6292, + "step": 45877 + }, + { + "epoch": 2.1359499033917637, + "grad_norm": 0.3324640686112681, + "learning_rate": 2.3206830626681048e-05, + "loss": 2.6051, + "step": 45878 + }, + { + "epoch": 2.135996461577857, + "grad_norm": 0.3528468165925063, + "learning_rate": 2.3204543666991817e-05, + "loss": 2.6976, + "step": 45879 + }, + { + "epoch": 2.13604301976395, + "grad_norm": 0.3364100941028868, + "learning_rate": 2.320225678594461e-05, + "loss": 2.5537, + "step": 45880 + }, + { + "epoch": 2.136089577950043, + "grad_norm": 0.3547355983648039, + "learning_rate": 2.3199969983546144e-05, + "loss": 2.6541, + "step": 45881 + }, + { + "epoch": 2.136136136136136, + "grad_norm": 0.3222389217271049, + "learning_rate": 2.3197683259803138e-05, + "loss": 2.6234, + "step": 45882 + }, + { + "epoch": 2.1361826943222293, + "grad_norm": 0.3521486674195182, + "learning_rate": 2.319539661472228e-05, + "loss": 2.6028, + "step": 45883 + }, + { + "epoch": 2.1362292525083224, + "grad_norm": 0.36268306656011884, + "learning_rate": 2.3193110048310284e-05, + "loss": 2.6915, + "step": 45884 + }, + { + "epoch": 2.1362758106944155, + "grad_norm": 0.3399622658365157, + "learning_rate": 2.3190823560573877e-05, + "loss": 2.6827, + "step": 45885 + }, + { + "epoch": 2.136322368880508, + "grad_norm": 0.36828601300579705, + "learning_rate": 2.3188537151519755e-05, + "loss": 2.7404, + "step": 45886 + }, + { + "epoch": 2.1363689270666013, + "grad_norm": 0.335267041886652, + "learning_rate": 2.3186250821154642e-05, + "loss": 2.6295, + "step": 45887 + }, + { + "epoch": 2.1364154852526944, + "grad_norm": 0.3400008606351638, + "learning_rate": 2.3183964569485257e-05, + "loss": 2.6534, + "step": 45888 + }, + { + "epoch": 2.1364620434387875, + "grad_norm": 0.34667412202036957, + "learning_rate": 2.3181678396518252e-05, + "loss": 2.6936, + "step": 
45889 + }, + { + "epoch": 2.1365086016248807, + "grad_norm": 0.32425566124437966, + "learning_rate": 2.3179392302260423e-05, + "loss": 2.5425, + "step": 45890 + }, + { + "epoch": 2.1365551598109738, + "grad_norm": 0.3727733274327593, + "learning_rate": 2.3177106286718415e-05, + "loss": 2.7113, + "step": 45891 + }, + { + "epoch": 2.136601717997067, + "grad_norm": 0.3362992572504971, + "learning_rate": 2.3174820349898958e-05, + "loss": 2.703, + "step": 45892 + }, + { + "epoch": 2.13664827618316, + "grad_norm": 0.3180903287768888, + "learning_rate": 2.3172534491808758e-05, + "loss": 2.5333, + "step": 45893 + }, + { + "epoch": 2.136694834369253, + "grad_norm": 0.3608501887672868, + "learning_rate": 2.317024871245452e-05, + "loss": 2.7631, + "step": 45894 + }, + { + "epoch": 2.1367413925553462, + "grad_norm": 0.3364994872181515, + "learning_rate": 2.3167963011842992e-05, + "loss": 2.7594, + "step": 45895 + }, + { + "epoch": 2.1367879507414393, + "grad_norm": 0.35100638056359035, + "learning_rate": 2.316567738998082e-05, + "loss": 2.5768, + "step": 45896 + }, + { + "epoch": 2.136834508927532, + "grad_norm": 0.3299604687576388, + "learning_rate": 2.3163391846874743e-05, + "loss": 2.7341, + "step": 45897 + }, + { + "epoch": 2.136881067113625, + "grad_norm": 0.34396557898553876, + "learning_rate": 2.3161106382531474e-05, + "loss": 2.6876, + "step": 45898 + }, + { + "epoch": 2.1369276252997182, + "grad_norm": 0.3442450598272719, + "learning_rate": 2.315882099695771e-05, + "loss": 2.5634, + "step": 45899 + }, + { + "epoch": 2.1369741834858114, + "grad_norm": 0.3562543624139645, + "learning_rate": 2.315653569016016e-05, + "loss": 2.7451, + "step": 45900 + }, + { + "epoch": 2.1370207416719045, + "grad_norm": 0.3482102518501331, + "learning_rate": 2.315425046214555e-05, + "loss": 2.613, + "step": 45901 + }, + { + "epoch": 2.1370672998579976, + "grad_norm": 0.33343556502606353, + "learning_rate": 2.3151965312920532e-05, + "loss": 2.58, + "step": 45902 + }, + { + "epoch": 2.1371138580440907, + "grad_norm": 0.3337691279550813, + "learning_rate": 2.3149680242491894e-05, + "loss": 2.626, + "step": 45903 + }, + { + "epoch": 2.137160416230184, + "grad_norm": 0.3239229949905684, + "learning_rate": 2.3147395250866278e-05, + "loss": 2.5854, + "step": 45904 + }, + { + "epoch": 2.137206974416277, + "grad_norm": 0.339141807294715, + "learning_rate": 2.3145110338050413e-05, + "loss": 2.6669, + "step": 45905 + }, + { + "epoch": 2.1372535326023696, + "grad_norm": 0.3590670268376435, + "learning_rate": 2.3142825504051e-05, + "loss": 2.6487, + "step": 45906 + }, + { + "epoch": 2.1373000907884627, + "grad_norm": 0.31827754277561343, + "learning_rate": 2.314054074887475e-05, + "loss": 2.6701, + "step": 45907 + }, + { + "epoch": 2.137346648974556, + "grad_norm": 0.32916132723278985, + "learning_rate": 2.3138256072528387e-05, + "loss": 2.663, + "step": 45908 + }, + { + "epoch": 2.137393207160649, + "grad_norm": 0.35335237837378053, + "learning_rate": 2.3135971475018558e-05, + "loss": 2.6597, + "step": 45909 + }, + { + "epoch": 2.137439765346742, + "grad_norm": 0.3307179880169181, + "learning_rate": 2.313368695635204e-05, + "loss": 2.6085, + "step": 45910 + }, + { + "epoch": 2.137486323532835, + "grad_norm": 0.3158525212897993, + "learning_rate": 2.313140251653549e-05, + "loss": 2.5767, + "step": 45911 + }, + { + "epoch": 2.1375328817189283, + "grad_norm": 0.35024830726054634, + "learning_rate": 2.3129118155575623e-05, + "loss": 2.7636, + "step": 45912 + }, + { + "epoch": 2.1375794399050214, + "grad_norm": 
0.33361383777976156, + "learning_rate": 2.3126833873479153e-05, + "loss": 2.5597, + "step": 45913 + }, + { + "epoch": 2.1376259980911145, + "grad_norm": 0.3263721006513287, + "learning_rate": 2.3124549670252793e-05, + "loss": 2.5305, + "step": 45914 + }, + { + "epoch": 2.1376725562772076, + "grad_norm": 0.36424656061883953, + "learning_rate": 2.3122265545903195e-05, + "loss": 2.7433, + "step": 45915 + }, + { + "epoch": 2.1377191144633003, + "grad_norm": 0.31985034333994583, + "learning_rate": 2.311998150043713e-05, + "loss": 2.6433, + "step": 45916 + }, + { + "epoch": 2.1377656726493934, + "grad_norm": 0.36770171528626816, + "learning_rate": 2.3117697533861244e-05, + "loss": 2.6683, + "step": 45917 + }, + { + "epoch": 2.1378122308354865, + "grad_norm": 0.33826985308811314, + "learning_rate": 2.31154136461823e-05, + "loss": 2.6292, + "step": 45918 + }, + { + "epoch": 2.1378587890215797, + "grad_norm": 0.3223970370021736, + "learning_rate": 2.311312983740695e-05, + "loss": 2.6373, + "step": 45919 + }, + { + "epoch": 2.137905347207673, + "grad_norm": 0.34882344625803546, + "learning_rate": 2.3110846107541917e-05, + "loss": 2.6364, + "step": 45920 + }, + { + "epoch": 2.137951905393766, + "grad_norm": 0.33217357455271956, + "learning_rate": 2.3108562456593917e-05, + "loss": 2.6387, + "step": 45921 + }, + { + "epoch": 2.137998463579859, + "grad_norm": 0.33204941906445296, + "learning_rate": 2.3106278884569604e-05, + "loss": 2.6474, + "step": 45922 + }, + { + "epoch": 2.138045021765952, + "grad_norm": 0.3458346750059687, + "learning_rate": 2.3103995391475745e-05, + "loss": 2.7109, + "step": 45923 + }, + { + "epoch": 2.1380915799520452, + "grad_norm": 0.33799630036324013, + "learning_rate": 2.3101711977318998e-05, + "loss": 2.5451, + "step": 45924 + }, + { + "epoch": 2.138138138138138, + "grad_norm": 0.3167691464159624, + "learning_rate": 2.3099428642106076e-05, + "loss": 2.5613, + "step": 45925 + }, + { + "epoch": 2.138184696324231, + "grad_norm": 0.34268817669771656, + "learning_rate": 2.3097145385843676e-05, + "loss": 2.6277, + "step": 45926 + }, + { + "epoch": 2.138231254510324, + "grad_norm": 0.320045579893429, + "learning_rate": 2.3094862208538503e-05, + "loss": 2.6103, + "step": 45927 + }, + { + "epoch": 2.1382778126964173, + "grad_norm": 0.3478791236296619, + "learning_rate": 2.3092579110197264e-05, + "loss": 2.5495, + "step": 45928 + }, + { + "epoch": 2.1383243708825104, + "grad_norm": 0.31320132540640555, + "learning_rate": 2.309029609082667e-05, + "loss": 2.6083, + "step": 45929 + }, + { + "epoch": 2.1383709290686035, + "grad_norm": 0.36050944523067574, + "learning_rate": 2.308801315043337e-05, + "loss": 2.562, + "step": 45930 + }, + { + "epoch": 2.1384174872546966, + "grad_norm": 0.3598330591049049, + "learning_rate": 2.3085730289024132e-05, + "loss": 2.6877, + "step": 45931 + }, + { + "epoch": 2.1384640454407897, + "grad_norm": 0.3439283408234643, + "learning_rate": 2.3083447506605617e-05, + "loss": 2.6898, + "step": 45932 + }, + { + "epoch": 2.138510603626883, + "grad_norm": 0.35881600029779104, + "learning_rate": 2.3081164803184518e-05, + "loss": 2.7064, + "step": 45933 + }, + { + "epoch": 2.138557161812976, + "grad_norm": 0.3435506498774347, + "learning_rate": 2.3078882178767575e-05, + "loss": 2.6471, + "step": 45934 + }, + { + "epoch": 2.138603719999069, + "grad_norm": 0.35362471012334346, + "learning_rate": 2.3076599633361425e-05, + "loss": 2.6177, + "step": 45935 + }, + { + "epoch": 2.1386502781851617, + "grad_norm": 0.3543968401202911, + "learning_rate": 2.3074317166972837e-05, 
+ "loss": 2.6025, + "step": 45936 + }, + { + "epoch": 2.138696836371255, + "grad_norm": 0.30363783486655843, + "learning_rate": 2.3072034779608452e-05, + "loss": 2.5325, + "step": 45937 + }, + { + "epoch": 2.138743394557348, + "grad_norm": 0.3157004388400199, + "learning_rate": 2.3069752471275003e-05, + "loss": 2.6093, + "step": 45938 + }, + { + "epoch": 2.138789952743441, + "grad_norm": 0.3447932484121059, + "learning_rate": 2.3067470241979173e-05, + "loss": 2.7142, + "step": 45939 + }, + { + "epoch": 2.138836510929534, + "grad_norm": 0.34163774978402006, + "learning_rate": 2.306518809172766e-05, + "loss": 2.661, + "step": 45940 + }, + { + "epoch": 2.1388830691156273, + "grad_norm": 0.33657684667503407, + "learning_rate": 2.3062906020527168e-05, + "loss": 2.5949, + "step": 45941 + }, + { + "epoch": 2.1389296273017204, + "grad_norm": 0.35749192322411105, + "learning_rate": 2.3060624028384416e-05, + "loss": 2.6883, + "step": 45942 + }, + { + "epoch": 2.1389761854878135, + "grad_norm": 0.3467440706605874, + "learning_rate": 2.3058342115306037e-05, + "loss": 2.6545, + "step": 45943 + }, + { + "epoch": 2.1390227436739067, + "grad_norm": 0.33710592573994924, + "learning_rate": 2.3056060281298807e-05, + "loss": 2.6801, + "step": 45944 + }, + { + "epoch": 2.1390693018599993, + "grad_norm": 0.32384821473683206, + "learning_rate": 2.305377852636937e-05, + "loss": 2.6376, + "step": 45945 + }, + { + "epoch": 2.1391158600460924, + "grad_norm": 0.33861728752426185, + "learning_rate": 2.3051496850524434e-05, + "loss": 2.6394, + "step": 45946 + }, + { + "epoch": 2.1391624182321856, + "grad_norm": 0.34788721066300815, + "learning_rate": 2.3049215253770723e-05, + "loss": 2.5891, + "step": 45947 + }, + { + "epoch": 2.1392089764182787, + "grad_norm": 0.33452089846445865, + "learning_rate": 2.3046933736114872e-05, + "loss": 2.667, + "step": 45948 + }, + { + "epoch": 2.139255534604372, + "grad_norm": 0.30317884347798524, + "learning_rate": 2.3044652297563652e-05, + "loss": 2.6512, + "step": 45949 + }, + { + "epoch": 2.139302092790465, + "grad_norm": 0.33198099684165455, + "learning_rate": 2.3042370938123703e-05, + "loss": 2.6105, + "step": 45950 + }, + { + "epoch": 2.139348650976558, + "grad_norm": 0.34939460848925424, + "learning_rate": 2.3040089657801738e-05, + "loss": 2.6541, + "step": 45951 + }, + { + "epoch": 2.139395209162651, + "grad_norm": 0.30809699194141155, + "learning_rate": 2.3037808456604454e-05, + "loss": 2.5655, + "step": 45952 + }, + { + "epoch": 2.1394417673487442, + "grad_norm": 0.3467929854955116, + "learning_rate": 2.303552733453855e-05, + "loss": 2.6288, + "step": 45953 + }, + { + "epoch": 2.1394883255348374, + "grad_norm": 0.3325905006378381, + "learning_rate": 2.3033246291610717e-05, + "loss": 2.6914, + "step": 45954 + }, + { + "epoch": 2.13953488372093, + "grad_norm": 0.3331785709515488, + "learning_rate": 2.3030965327827664e-05, + "loss": 2.727, + "step": 45955 + }, + { + "epoch": 2.139581441907023, + "grad_norm": 0.3703185098552267, + "learning_rate": 2.302868444319603e-05, + "loss": 2.7499, + "step": 45956 + }, + { + "epoch": 2.1396280000931163, + "grad_norm": 0.33572510004373046, + "learning_rate": 2.3026403637722593e-05, + "loss": 2.777, + "step": 45957 + }, + { + "epoch": 2.1396745582792094, + "grad_norm": 0.31830145132969007, + "learning_rate": 2.3024122911413986e-05, + "loss": 2.5982, + "step": 45958 + }, + { + "epoch": 2.1397211164653025, + "grad_norm": 0.32817984119899585, + "learning_rate": 2.302184226427692e-05, + "loss": 2.6153, + "step": 45959 + }, + { + "epoch": 
2.1397676746513956, + "grad_norm": 0.38944430991376194, + "learning_rate": 2.301956169631811e-05, + "loss": 2.6855, + "step": 45960 + }, + { + "epoch": 2.1398142328374887, + "grad_norm": 0.32133280123206087, + "learning_rate": 2.3017281207544182e-05, + "loss": 2.6176, + "step": 45961 + }, + { + "epoch": 2.139860791023582, + "grad_norm": 0.3655768848154415, + "learning_rate": 2.3015000797961922e-05, + "loss": 2.6209, + "step": 45962 + }, + { + "epoch": 2.139907349209675, + "grad_norm": 0.3669825787694001, + "learning_rate": 2.3012720467577937e-05, + "loss": 2.5641, + "step": 45963 + }, + { + "epoch": 2.1399539073957676, + "grad_norm": 0.3532841863971293, + "learning_rate": 2.3010440216398993e-05, + "loss": 2.6207, + "step": 45964 + }, + { + "epoch": 2.1400004655818607, + "grad_norm": 0.34479917325005516, + "learning_rate": 2.300816004443173e-05, + "loss": 2.6804, + "step": 45965 + }, + { + "epoch": 2.140047023767954, + "grad_norm": 0.37169345815875204, + "learning_rate": 2.3005879951682864e-05, + "loss": 2.5987, + "step": 45966 + }, + { + "epoch": 2.140093581954047, + "grad_norm": 0.3651282237205347, + "learning_rate": 2.300359993815907e-05, + "loss": 2.6664, + "step": 45967 + }, + { + "epoch": 2.14014014014014, + "grad_norm": 0.3635321500007781, + "learning_rate": 2.300132000386708e-05, + "loss": 2.7401, + "step": 45968 + }, + { + "epoch": 2.140186698326233, + "grad_norm": 0.3428581629129427, + "learning_rate": 2.299904014881351e-05, + "loss": 2.6359, + "step": 45969 + }, + { + "epoch": 2.1402332565123263, + "grad_norm": 0.35393473406288606, + "learning_rate": 2.2996760373005138e-05, + "loss": 2.6422, + "step": 45970 + }, + { + "epoch": 2.1402798146984194, + "grad_norm": 0.35502411419665036, + "learning_rate": 2.29944806764486e-05, + "loss": 2.7992, + "step": 45971 + }, + { + "epoch": 2.1403263728845126, + "grad_norm": 0.3518350545292569, + "learning_rate": 2.2992201059150597e-05, + "loss": 2.6352, + "step": 45972 + }, + { + "epoch": 2.1403729310706057, + "grad_norm": 0.3278412410150744, + "learning_rate": 2.298992152111784e-05, + "loss": 2.6596, + "step": 45973 + }, + { + "epoch": 2.140419489256699, + "grad_norm": 0.3307623718047799, + "learning_rate": 2.2987642062356968e-05, + "loss": 2.6534, + "step": 45974 + }, + { + "epoch": 2.1404660474427915, + "grad_norm": 0.3570259191667372, + "learning_rate": 2.2985362682874738e-05, + "loss": 2.6372, + "step": 45975 + }, + { + "epoch": 2.1405126056288846, + "grad_norm": 0.3343521877916131, + "learning_rate": 2.2983083382677777e-05, + "loss": 2.6901, + "step": 45976 + }, + { + "epoch": 2.1405591638149777, + "grad_norm": 0.3502244103298527, + "learning_rate": 2.2980804161772833e-05, + "loss": 2.6435, + "step": 45977 + }, + { + "epoch": 2.140605722001071, + "grad_norm": 0.3634376280663227, + "learning_rate": 2.297852502016656e-05, + "loss": 2.7631, + "step": 45978 + }, + { + "epoch": 2.140652280187164, + "grad_norm": 0.32726461586035976, + "learning_rate": 2.2976245957865645e-05, + "loss": 2.6057, + "step": 45979 + }, + { + "epoch": 2.140698838373257, + "grad_norm": 0.37211902170508554, + "learning_rate": 2.2973966974876786e-05, + "loss": 2.6882, + "step": 45980 + }, + { + "epoch": 2.14074539655935, + "grad_norm": 0.31328855037965464, + "learning_rate": 2.2971688071206675e-05, + "loss": 2.6198, + "step": 45981 + }, + { + "epoch": 2.1407919547454433, + "grad_norm": 0.3343170892174258, + "learning_rate": 2.2969409246861995e-05, + "loss": 2.5644, + "step": 45982 + }, + { + "epoch": 2.1408385129315364, + "grad_norm": 0.33355211103515237, + 
"learning_rate": 2.2967130501849455e-05, + "loss": 2.681, + "step": 45983 + }, + { + "epoch": 2.140885071117629, + "grad_norm": 0.33224366223973517, + "learning_rate": 2.2964851836175706e-05, + "loss": 2.6878, + "step": 45984 + }, + { + "epoch": 2.140931629303722, + "grad_norm": 0.32939013755178076, + "learning_rate": 2.296257324984745e-05, + "loss": 2.6088, + "step": 45985 + }, + { + "epoch": 2.1409781874898153, + "grad_norm": 0.3357817858706221, + "learning_rate": 2.2960294742871403e-05, + "loss": 2.6835, + "step": 45986 + }, + { + "epoch": 2.1410247456759084, + "grad_norm": 0.33782461685496135, + "learning_rate": 2.2958016315254184e-05, + "loss": 2.627, + "step": 45987 + }, + { + "epoch": 2.1410713038620015, + "grad_norm": 0.3065059695325259, + "learning_rate": 2.295573796700256e-05, + "loss": 2.4181, + "step": 45988 + }, + { + "epoch": 2.1411178620480946, + "grad_norm": 0.33577923572796725, + "learning_rate": 2.2953459698123147e-05, + "loss": 2.5968, + "step": 45989 + }, + { + "epoch": 2.1411644202341877, + "grad_norm": 0.36486153120007364, + "learning_rate": 2.2951181508622705e-05, + "loss": 2.699, + "step": 45990 + }, + { + "epoch": 2.141210978420281, + "grad_norm": 0.34474834242757474, + "learning_rate": 2.294890339850786e-05, + "loss": 2.6854, + "step": 45991 + }, + { + "epoch": 2.141257536606374, + "grad_norm": 0.3151706805038502, + "learning_rate": 2.2946625367785314e-05, + "loss": 2.5464, + "step": 45992 + }, + { + "epoch": 2.141304094792467, + "grad_norm": 0.34182015706994273, + "learning_rate": 2.2944347416461758e-05, + "loss": 2.7071, + "step": 45993 + }, + { + "epoch": 2.1413506529785598, + "grad_norm": 0.34292145658833995, + "learning_rate": 2.2942069544543882e-05, + "loss": 2.6103, + "step": 45994 + }, + { + "epoch": 2.141397211164653, + "grad_norm": 0.34433878929848255, + "learning_rate": 2.2939791752038364e-05, + "loss": 2.6333, + "step": 45995 + }, + { + "epoch": 2.141443769350746, + "grad_norm": 0.33041756029210645, + "learning_rate": 2.2937514038951908e-05, + "loss": 2.6512, + "step": 45996 + }, + { + "epoch": 2.141490327536839, + "grad_norm": 0.3261139432333914, + "learning_rate": 2.2935236405291166e-05, + "loss": 2.5338, + "step": 45997 + }, + { + "epoch": 2.141536885722932, + "grad_norm": 0.33426140353626393, + "learning_rate": 2.2932958851062836e-05, + "loss": 2.6399, + "step": 45998 + }, + { + "epoch": 2.1415834439090253, + "grad_norm": 0.330028983729821, + "learning_rate": 2.2930681376273606e-05, + "loss": 2.6146, + "step": 45999 + }, + { + "epoch": 2.1416300020951184, + "grad_norm": 0.329659209380477, + "learning_rate": 2.2928403980930158e-05, + "loss": 2.7381, + "step": 46000 + }, + { + "epoch": 2.1416765602812116, + "grad_norm": 0.32670145964574165, + "learning_rate": 2.29261266650392e-05, + "loss": 2.6875, + "step": 46001 + }, + { + "epoch": 2.1417231184673047, + "grad_norm": 0.3479089230861325, + "learning_rate": 2.292384942860735e-05, + "loss": 2.5541, + "step": 46002 + }, + { + "epoch": 2.141769676653398, + "grad_norm": 0.32066959337240286, + "learning_rate": 2.2921572271641377e-05, + "loss": 2.6447, + "step": 46003 + }, + { + "epoch": 2.1418162348394905, + "grad_norm": 0.3292677371862423, + "learning_rate": 2.2919295194147906e-05, + "loss": 2.6112, + "step": 46004 + }, + { + "epoch": 2.1418627930255836, + "grad_norm": 0.3341596455655947, + "learning_rate": 2.2917018196133633e-05, + "loss": 2.6754, + "step": 46005 + }, + { + "epoch": 2.1419093512116767, + "grad_norm": 0.33229869329510225, + "learning_rate": 2.2914741277605238e-05, + "loss": 2.6757, + 
"step": 46006 + }, + { + "epoch": 2.14195590939777, + "grad_norm": 0.32956042822710463, + "learning_rate": 2.2912464438569414e-05, + "loss": 2.6897, + "step": 46007 + }, + { + "epoch": 2.142002467583863, + "grad_norm": 0.32443523480974173, + "learning_rate": 2.2910187679032836e-05, + "loss": 2.5724, + "step": 46008 + }, + { + "epoch": 2.142049025769956, + "grad_norm": 0.32840137388035756, + "learning_rate": 2.2907910999002207e-05, + "loss": 2.6545, + "step": 46009 + }, + { + "epoch": 2.142095583956049, + "grad_norm": 0.32786215610419, + "learning_rate": 2.2905634398484176e-05, + "loss": 2.6974, + "step": 46010 + }, + { + "epoch": 2.1421421421421423, + "grad_norm": 0.3160342908956952, + "learning_rate": 2.2903357877485436e-05, + "loss": 2.5704, + "step": 46011 + }, + { + "epoch": 2.1421887003282354, + "grad_norm": 0.33739855006448544, + "learning_rate": 2.290108143601267e-05, + "loss": 2.6345, + "step": 46012 + }, + { + "epoch": 2.1422352585143285, + "grad_norm": 0.34848620067953956, + "learning_rate": 2.289880507407256e-05, + "loss": 2.6743, + "step": 46013 + }, + { + "epoch": 2.142281816700421, + "grad_norm": 0.3627506215809822, + "learning_rate": 2.2896528791671806e-05, + "loss": 2.6506, + "step": 46014 + }, + { + "epoch": 2.1423283748865143, + "grad_norm": 0.3467371069667606, + "learning_rate": 2.2894252588817034e-05, + "loss": 2.6621, + "step": 46015 + }, + { + "epoch": 2.1423749330726074, + "grad_norm": 0.33211464185808515, + "learning_rate": 2.2891976465514998e-05, + "loss": 2.6466, + "step": 46016 + }, + { + "epoch": 2.1424214912587005, + "grad_norm": 0.3281842526066052, + "learning_rate": 2.2889700421772303e-05, + "loss": 2.6034, + "step": 46017 + }, + { + "epoch": 2.1424680494447936, + "grad_norm": 0.3433071624459797, + "learning_rate": 2.2887424457595702e-05, + "loss": 2.7037, + "step": 46018 + }, + { + "epoch": 2.1425146076308867, + "grad_norm": 0.34184725110977016, + "learning_rate": 2.2885148572991822e-05, + "loss": 2.7188, + "step": 46019 + }, + { + "epoch": 2.14256116581698, + "grad_norm": 0.33476566378006717, + "learning_rate": 2.2882872767967362e-05, + "loss": 2.6897, + "step": 46020 + }, + { + "epoch": 2.142607724003073, + "grad_norm": 0.35636102020319105, + "learning_rate": 2.2880597042528997e-05, + "loss": 2.5396, + "step": 46021 + }, + { + "epoch": 2.142654282189166, + "grad_norm": 0.37231031151210714, + "learning_rate": 2.2878321396683424e-05, + "loss": 2.5636, + "step": 46022 + }, + { + "epoch": 2.1427008403752588, + "grad_norm": 0.3410842967301977, + "learning_rate": 2.2876045830437287e-05, + "loss": 2.6896, + "step": 46023 + }, + { + "epoch": 2.142747398561352, + "grad_norm": 0.3446842439370471, + "learning_rate": 2.2873770343797286e-05, + "loss": 2.607, + "step": 46024 + }, + { + "epoch": 2.142793956747445, + "grad_norm": 0.3311854748134745, + "learning_rate": 2.2871494936770094e-05, + "loss": 2.611, + "step": 46025 + }, + { + "epoch": 2.142840514933538, + "grad_norm": 0.31833736042935856, + "learning_rate": 2.2869219609362397e-05, + "loss": 2.5554, + "step": 46026 + }, + { + "epoch": 2.1428870731196312, + "grad_norm": 0.3362424060510714, + "learning_rate": 2.2866944361580882e-05, + "loss": 2.5838, + "step": 46027 + }, + { + "epoch": 2.1429336313057243, + "grad_norm": 0.3210000455871318, + "learning_rate": 2.2864669193432174e-05, + "loss": 2.7126, + "step": 46028 + }, + { + "epoch": 2.1429801894918175, + "grad_norm": 0.3394175063599766, + "learning_rate": 2.286239410492303e-05, + "loss": 2.7296, + "step": 46029 + }, + { + "epoch": 2.1430267476779106, + 
"grad_norm": 0.3709704819382979, + "learning_rate": 2.2860119096060045e-05, + "loss": 2.6445, + "step": 46030 + }, + { + "epoch": 2.1430733058640037, + "grad_norm": 0.33600781697064264, + "learning_rate": 2.285784416684998e-05, + "loss": 2.7716, + "step": 46031 + }, + { + "epoch": 2.143119864050097, + "grad_norm": 0.34772447281102586, + "learning_rate": 2.285556931729944e-05, + "loss": 2.6623, + "step": 46032 + }, + { + "epoch": 2.1431664222361895, + "grad_norm": 0.3611955428809131, + "learning_rate": 2.2853294547415138e-05, + "loss": 2.6878, + "step": 46033 + }, + { + "epoch": 2.1432129804222826, + "grad_norm": 0.3157309231200437, + "learning_rate": 2.2851019857203737e-05, + "loss": 2.6363, + "step": 46034 + }, + { + "epoch": 2.1432595386083757, + "grad_norm": 0.3248190630641299, + "learning_rate": 2.2848745246671922e-05, + "loss": 2.6306, + "step": 46035 + }, + { + "epoch": 2.143306096794469, + "grad_norm": 0.3348364314843914, + "learning_rate": 2.284647071582638e-05, + "loss": 2.5634, + "step": 46036 + }, + { + "epoch": 2.143352654980562, + "grad_norm": 0.337615260606509, + "learning_rate": 2.2844196264673757e-05, + "loss": 2.6335, + "step": 46037 + }, + { + "epoch": 2.143399213166655, + "grad_norm": 0.3284654549886252, + "learning_rate": 2.2841921893220735e-05, + "loss": 2.6158, + "step": 46038 + }, + { + "epoch": 2.143445771352748, + "grad_norm": 0.3489877288110555, + "learning_rate": 2.2839647601474006e-05, + "loss": 2.7271, + "step": 46039 + }, + { + "epoch": 2.1434923295388413, + "grad_norm": 0.34222080753273554, + "learning_rate": 2.2837373389440232e-05, + "loss": 2.7362, + "step": 46040 + }, + { + "epoch": 2.1435388877249344, + "grad_norm": 0.35576956490087164, + "learning_rate": 2.2835099257126092e-05, + "loss": 2.6784, + "step": 46041 + }, + { + "epoch": 2.1435854459110275, + "grad_norm": 0.33505188722414686, + "learning_rate": 2.283282520453827e-05, + "loss": 2.7114, + "step": 46042 + }, + { + "epoch": 2.14363200409712, + "grad_norm": 0.3395751898049239, + "learning_rate": 2.28305512316834e-05, + "loss": 2.5278, + "step": 46043 + }, + { + "epoch": 2.1436785622832133, + "grad_norm": 0.34097280924083845, + "learning_rate": 2.2828277338568227e-05, + "loss": 2.7415, + "step": 46044 + }, + { + "epoch": 2.1437251204693064, + "grad_norm": 0.3254982561442567, + "learning_rate": 2.2826003525199356e-05, + "loss": 2.5744, + "step": 46045 + }, + { + "epoch": 2.1437716786553995, + "grad_norm": 0.3571561926246926, + "learning_rate": 2.2823729791583493e-05, + "loss": 2.6034, + "step": 46046 + }, + { + "epoch": 2.1438182368414926, + "grad_norm": 0.341977653212037, + "learning_rate": 2.2821456137727304e-05, + "loss": 2.6371, + "step": 46047 + }, + { + "epoch": 2.1438647950275858, + "grad_norm": 0.3478076240314166, + "learning_rate": 2.2819182563637466e-05, + "loss": 2.6365, + "step": 46048 + }, + { + "epoch": 2.143911353213679, + "grad_norm": 0.3466070038418656, + "learning_rate": 2.2816909069320664e-05, + "loss": 2.6905, + "step": 46049 + }, + { + "epoch": 2.143957911399772, + "grad_norm": 0.35682303393590115, + "learning_rate": 2.2814635654783535e-05, + "loss": 2.6353, + "step": 46050 + }, + { + "epoch": 2.144004469585865, + "grad_norm": 0.34542257984173497, + "learning_rate": 2.2812362320032776e-05, + "loss": 2.6357, + "step": 46051 + }, + { + "epoch": 2.144051027771958, + "grad_norm": 0.3437624411570193, + "learning_rate": 2.2810089065075058e-05, + "loss": 2.7291, + "step": 46052 + }, + { + "epoch": 2.144097585958051, + "grad_norm": 0.35762464812651534, + "learning_rate": 
2.2807815889917046e-05, + "loss": 2.6467, + "step": 46053 + }, + { + "epoch": 2.144144144144144, + "grad_norm": 0.346476866447088, + "learning_rate": 2.2805542794565414e-05, + "loss": 2.5423, + "step": 46054 + }, + { + "epoch": 2.144190702330237, + "grad_norm": 0.35102698770157487, + "learning_rate": 2.2803269779026858e-05, + "loss": 2.6844, + "step": 46055 + }, + { + "epoch": 2.1442372605163302, + "grad_norm": 0.35483196938512773, + "learning_rate": 2.2800996843307988e-05, + "loss": 2.7121, + "step": 46056 + }, + { + "epoch": 2.1442838187024233, + "grad_norm": 0.35594189226753176, + "learning_rate": 2.2798723987415545e-05, + "loss": 2.6919, + "step": 46057 + }, + { + "epoch": 2.1443303768885165, + "grad_norm": 0.3400771289317161, + "learning_rate": 2.279645121135615e-05, + "loss": 2.6134, + "step": 46058 + }, + { + "epoch": 2.1443769350746096, + "grad_norm": 0.35469862647861644, + "learning_rate": 2.2794178515136493e-05, + "loss": 2.6656, + "step": 46059 + }, + { + "epoch": 2.1444234932607027, + "grad_norm": 0.32865642759261826, + "learning_rate": 2.2791905898763243e-05, + "loss": 2.5126, + "step": 46060 + }, + { + "epoch": 2.144470051446796, + "grad_norm": 0.3300703921442524, + "learning_rate": 2.278963336224307e-05, + "loss": 2.6667, + "step": 46061 + }, + { + "epoch": 2.1445166096328885, + "grad_norm": 0.35671482473439736, + "learning_rate": 2.2787360905582655e-05, + "loss": 2.633, + "step": 46062 + }, + { + "epoch": 2.1445631678189816, + "grad_norm": 0.33337172365781226, + "learning_rate": 2.278508852878864e-05, + "loss": 2.6286, + "step": 46063 + }, + { + "epoch": 2.1446097260050747, + "grad_norm": 0.3560333474265518, + "learning_rate": 2.278281623186771e-05, + "loss": 2.6602, + "step": 46064 + }, + { + "epoch": 2.144656284191168, + "grad_norm": 0.35191236641869467, + "learning_rate": 2.2780544014826533e-05, + "loss": 2.5517, + "step": 46065 + }, + { + "epoch": 2.144702842377261, + "grad_norm": 0.3391975055272119, + "learning_rate": 2.2778271877671775e-05, + "loss": 2.7354, + "step": 46066 + }, + { + "epoch": 2.144749400563354, + "grad_norm": 0.31967817788059, + "learning_rate": 2.277599982041011e-05, + "loss": 2.6631, + "step": 46067 + }, + { + "epoch": 2.144795958749447, + "grad_norm": 0.3332855457898308, + "learning_rate": 2.2773727843048222e-05, + "loss": 2.625, + "step": 46068 + }, + { + "epoch": 2.1448425169355403, + "grad_norm": 0.3628638367827142, + "learning_rate": 2.2771455945592722e-05, + "loss": 2.6601, + "step": 46069 + }, + { + "epoch": 2.1448890751216334, + "grad_norm": 0.3402713561376841, + "learning_rate": 2.2769184128050353e-05, + "loss": 2.587, + "step": 46070 + }, + { + "epoch": 2.1449356333077265, + "grad_norm": 0.3351613625691321, + "learning_rate": 2.276691239042773e-05, + "loss": 2.6149, + "step": 46071 + }, + { + "epoch": 2.1449821914938196, + "grad_norm": 0.34662636788489026, + "learning_rate": 2.2764640732731535e-05, + "loss": 2.592, + "step": 46072 + }, + { + "epoch": 2.1450287496799123, + "grad_norm": 0.3464395952219697, + "learning_rate": 2.276236915496843e-05, + "loss": 2.6713, + "step": 46073 + }, + { + "epoch": 2.1450753078660054, + "grad_norm": 0.34610491431116225, + "learning_rate": 2.2760097657145095e-05, + "loss": 2.6868, + "step": 46074 + }, + { + "epoch": 2.1451218660520985, + "grad_norm": 0.36085572438406127, + "learning_rate": 2.275782623926821e-05, + "loss": 2.764, + "step": 46075 + }, + { + "epoch": 2.1451684242381917, + "grad_norm": 0.32934024251057903, + "learning_rate": 2.2755554901344378e-05, + "loss": 2.5469, + "step": 46076 + }, + { + 
"epoch": 2.1452149824242848, + "grad_norm": 0.3474927111871859, + "learning_rate": 2.2753283643380353e-05, + "loss": 2.5412, + "step": 46077 + }, + { + "epoch": 2.145261540610378, + "grad_norm": 0.35082534507117025, + "learning_rate": 2.2751012465382733e-05, + "loss": 2.7309, + "step": 46078 + }, + { + "epoch": 2.145308098796471, + "grad_norm": 0.3286707131139918, + "learning_rate": 2.2748741367358206e-05, + "loss": 2.6808, + "step": 46079 + }, + { + "epoch": 2.145354656982564, + "grad_norm": 0.3458220379812591, + "learning_rate": 2.2746470349313437e-05, + "loss": 2.6503, + "step": 46080 + }, + { + "epoch": 2.1454012151686572, + "grad_norm": 0.3589658034367122, + "learning_rate": 2.2744199411255118e-05, + "loss": 2.6606, + "step": 46081 + }, + { + "epoch": 2.14544777335475, + "grad_norm": 0.343578498403456, + "learning_rate": 2.2741928553189845e-05, + "loss": 2.5588, + "step": 46082 + }, + { + "epoch": 2.145494331540843, + "grad_norm": 0.3643288179474655, + "learning_rate": 2.2739657775124367e-05, + "loss": 2.7073, + "step": 46083 + }, + { + "epoch": 2.145540889726936, + "grad_norm": 0.356239461853518, + "learning_rate": 2.2737387077065285e-05, + "loss": 2.6547, + "step": 46084 + }, + { + "epoch": 2.1455874479130292, + "grad_norm": 0.3443173944625574, + "learning_rate": 2.273511645901929e-05, + "loss": 2.6533, + "step": 46085 + }, + { + "epoch": 2.1456340060991224, + "grad_norm": 0.37127012223034644, + "learning_rate": 2.2732845920993045e-05, + "loss": 2.6118, + "step": 46086 + }, + { + "epoch": 2.1456805642852155, + "grad_norm": 0.34609332044404767, + "learning_rate": 2.2730575462993204e-05, + "loss": 2.7138, + "step": 46087 + }, + { + "epoch": 2.1457271224713086, + "grad_norm": 0.36814635241423266, + "learning_rate": 2.2728305085026462e-05, + "loss": 2.7041, + "step": 46088 + }, + { + "epoch": 2.1457736806574017, + "grad_norm": 0.3259440222577033, + "learning_rate": 2.2726034787099415e-05, + "loss": 2.7374, + "step": 46089 + }, + { + "epoch": 2.145820238843495, + "grad_norm": 0.3326392847311994, + "learning_rate": 2.2723764569218815e-05, + "loss": 2.6735, + "step": 46090 + }, + { + "epoch": 2.145866797029588, + "grad_norm": 0.3458386286494724, + "learning_rate": 2.272149443139125e-05, + "loss": 2.6294, + "step": 46091 + }, + { + "epoch": 2.1459133552156806, + "grad_norm": 0.324336318380997, + "learning_rate": 2.2719224373623414e-05, + "loss": 2.6172, + "step": 46092 + }, + { + "epoch": 2.1459599134017737, + "grad_norm": 0.33025068903735716, + "learning_rate": 2.2716954395921974e-05, + "loss": 2.6808, + "step": 46093 + }, + { + "epoch": 2.146006471587867, + "grad_norm": 0.36907286416575646, + "learning_rate": 2.2714684498293576e-05, + "loss": 2.7559, + "step": 46094 + }, + { + "epoch": 2.14605302977396, + "grad_norm": 0.3774396377119542, + "learning_rate": 2.2712414680744897e-05, + "loss": 2.719, + "step": 46095 + }, + { + "epoch": 2.146099587960053, + "grad_norm": 0.3365129573852062, + "learning_rate": 2.2710144943282607e-05, + "loss": 2.5697, + "step": 46096 + }, + { + "epoch": 2.146146146146146, + "grad_norm": 0.3398338046161404, + "learning_rate": 2.2707875285913337e-05, + "loss": 2.5094, + "step": 46097 + }, + { + "epoch": 2.1461927043322393, + "grad_norm": 0.3576568548642396, + "learning_rate": 2.2705605708643768e-05, + "loss": 2.7065, + "step": 46098 + }, + { + "epoch": 2.1462392625183324, + "grad_norm": 0.34163174646880223, + "learning_rate": 2.2703336211480554e-05, + "loss": 2.5327, + "step": 46099 + }, + { + "epoch": 2.1462858207044255, + "grad_norm": 0.3304021811140884, + 
"learning_rate": 2.2701066794430354e-05, + "loss": 2.6277, + "step": 46100 + }, + { + "epoch": 2.146332378890518, + "grad_norm": 0.35437228281968336, + "learning_rate": 2.269879745749986e-05, + "loss": 2.6181, + "step": 46101 + }, + { + "epoch": 2.1463789370766113, + "grad_norm": 0.35089304238513463, + "learning_rate": 2.269652820069566e-05, + "loss": 2.7042, + "step": 46102 + }, + { + "epoch": 2.1464254952627044, + "grad_norm": 0.38892536004587286, + "learning_rate": 2.2694259024024506e-05, + "loss": 2.7365, + "step": 46103 + }, + { + "epoch": 2.1464720534487975, + "grad_norm": 0.345124211585409, + "learning_rate": 2.2691989927492985e-05, + "loss": 2.6911, + "step": 46104 + }, + { + "epoch": 2.1465186116348907, + "grad_norm": 0.34064173947810106, + "learning_rate": 2.2689720911107793e-05, + "loss": 2.6814, + "step": 46105 + }, + { + "epoch": 2.1465651698209838, + "grad_norm": 0.338319849560766, + "learning_rate": 2.2687451974875574e-05, + "loss": 2.6053, + "step": 46106 + }, + { + "epoch": 2.146611728007077, + "grad_norm": 0.3415990073359908, + "learning_rate": 2.268518311880299e-05, + "loss": 2.6601, + "step": 46107 + }, + { + "epoch": 2.14665828619317, + "grad_norm": 0.3342566857135904, + "learning_rate": 2.2682914342896705e-05, + "loss": 2.6869, + "step": 46108 + }, + { + "epoch": 2.146704844379263, + "grad_norm": 0.35139714903211094, + "learning_rate": 2.2680645647163395e-05, + "loss": 2.6001, + "step": 46109 + }, + { + "epoch": 2.1467514025653562, + "grad_norm": 0.34405212299585003, + "learning_rate": 2.2678377031609677e-05, + "loss": 2.6973, + "step": 46110 + }, + { + "epoch": 2.1467979607514494, + "grad_norm": 0.3493757991422803, + "learning_rate": 2.2676108496242238e-05, + "loss": 2.5706, + "step": 46111 + }, + { + "epoch": 2.146844518937542, + "grad_norm": 0.3359021312890616, + "learning_rate": 2.2673840041067724e-05, + "loss": 2.6205, + "step": 46112 + }, + { + "epoch": 2.146891077123635, + "grad_norm": 0.33589533401758864, + "learning_rate": 2.2671571666092794e-05, + "loss": 2.7236, + "step": 46113 + }, + { + "epoch": 2.1469376353097283, + "grad_norm": 0.3710192451675751, + "learning_rate": 2.266930337132413e-05, + "loss": 2.6619, + "step": 46114 + }, + { + "epoch": 2.1469841934958214, + "grad_norm": 0.3353577060441777, + "learning_rate": 2.2667035156768333e-05, + "loss": 2.6595, + "step": 46115 + }, + { + "epoch": 2.1470307516819145, + "grad_norm": 0.3508718701269698, + "learning_rate": 2.2664767022432133e-05, + "loss": 2.6555, + "step": 46116 + }, + { + "epoch": 2.1470773098680076, + "grad_norm": 0.3620820594615153, + "learning_rate": 2.2662498968322126e-05, + "loss": 2.7698, + "step": 46117 + }, + { + "epoch": 2.1471238680541007, + "grad_norm": 0.35020545440665485, + "learning_rate": 2.266023099444499e-05, + "loss": 2.7119, + "step": 46118 + }, + { + "epoch": 2.147170426240194, + "grad_norm": 0.34695554144988466, + "learning_rate": 2.2657963100807388e-05, + "loss": 2.7196, + "step": 46119 + }, + { + "epoch": 2.147216984426287, + "grad_norm": 0.3812446042526624, + "learning_rate": 2.265569528741597e-05, + "loss": 2.6618, + "step": 46120 + }, + { + "epoch": 2.1472635426123796, + "grad_norm": 0.3365206743599023, + "learning_rate": 2.265342755427739e-05, + "loss": 2.6573, + "step": 46121 + }, + { + "epoch": 2.1473101007984727, + "grad_norm": 0.33880470734760676, + "learning_rate": 2.265115990139833e-05, + "loss": 2.6157, + "step": 46122 + }, + { + "epoch": 2.147356658984566, + "grad_norm": 0.37328349035252617, + "learning_rate": 2.2648892328785394e-05, + "loss": 2.621, + "step": 
46123 + }, + { + "epoch": 2.147403217170659, + "grad_norm": 0.32584782323838607, + "learning_rate": 2.264662483644527e-05, + "loss": 2.5995, + "step": 46124 + }, + { + "epoch": 2.147449775356752, + "grad_norm": 0.3290881339573597, + "learning_rate": 2.2644357424384603e-05, + "loss": 2.6496, + "step": 46125 + }, + { + "epoch": 2.147496333542845, + "grad_norm": 0.3461621921718217, + "learning_rate": 2.2642090092610057e-05, + "loss": 2.6462, + "step": 46126 + }, + { + "epoch": 2.1475428917289383, + "grad_norm": 0.37139724381851347, + "learning_rate": 2.26398228411283e-05, + "loss": 2.6624, + "step": 46127 + }, + { + "epoch": 2.1475894499150314, + "grad_norm": 0.33276476978921765, + "learning_rate": 2.2637555669945926e-05, + "loss": 2.6287, + "step": 46128 + }, + { + "epoch": 2.1476360081011245, + "grad_norm": 0.3481357662158577, + "learning_rate": 2.2635288579069674e-05, + "loss": 2.7176, + "step": 46129 + }, + { + "epoch": 2.1476825662872177, + "grad_norm": 0.35484797956610814, + "learning_rate": 2.2633021568506114e-05, + "loss": 2.6422, + "step": 46130 + }, + { + "epoch": 2.1477291244733103, + "grad_norm": 0.3419669359257552, + "learning_rate": 2.2630754638261976e-05, + "loss": 2.6147, + "step": 46131 + }, + { + "epoch": 2.1477756826594034, + "grad_norm": 0.3358208099109887, + "learning_rate": 2.2628487788343853e-05, + "loss": 2.6988, + "step": 46132 + }, + { + "epoch": 2.1478222408454966, + "grad_norm": 0.34612460314849874, + "learning_rate": 2.262622101875843e-05, + "loss": 2.716, + "step": 46133 + }, + { + "epoch": 2.1478687990315897, + "grad_norm": 0.3553611230548715, + "learning_rate": 2.2623954329512347e-05, + "loss": 2.6516, + "step": 46134 + }, + { + "epoch": 2.147915357217683, + "grad_norm": 0.3468518844233562, + "learning_rate": 2.2621687720612282e-05, + "loss": 2.7362, + "step": 46135 + }, + { + "epoch": 2.147961915403776, + "grad_norm": 0.3451442900886326, + "learning_rate": 2.261942119206485e-05, + "loss": 2.6535, + "step": 46136 + }, + { + "epoch": 2.148008473589869, + "grad_norm": 0.3712141882880248, + "learning_rate": 2.2617154743876713e-05, + "loss": 2.6101, + "step": 46137 + }, + { + "epoch": 2.148055031775962, + "grad_norm": 0.3480413850414991, + "learning_rate": 2.2614888376054534e-05, + "loss": 2.5978, + "step": 46138 + }, + { + "epoch": 2.1481015899620552, + "grad_norm": 0.3656199399659397, + "learning_rate": 2.2612622088604957e-05, + "loss": 2.6778, + "step": 46139 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.3623531733859737, + "learning_rate": 2.2610355881534656e-05, + "loss": 2.62, + "step": 46140 + }, + { + "epoch": 2.148194706334241, + "grad_norm": 0.3462725608267318, + "learning_rate": 2.2608089754850224e-05, + "loss": 2.5568, + "step": 46141 + }, + { + "epoch": 2.148241264520334, + "grad_norm": 0.37029973509926095, + "learning_rate": 2.2605823708558387e-05, + "loss": 2.5592, + "step": 46142 + }, + { + "epoch": 2.1482878227064273, + "grad_norm": 0.32504527244517084, + "learning_rate": 2.2603557742665726e-05, + "loss": 2.5805, + "step": 46143 + }, + { + "epoch": 2.1483343808925204, + "grad_norm": 0.3386057957019201, + "learning_rate": 2.2601291857178957e-05, + "loss": 2.6003, + "step": 46144 + }, + { + "epoch": 2.1483809390786135, + "grad_norm": 0.36714328201143587, + "learning_rate": 2.2599026052104676e-05, + "loss": 2.687, + "step": 46145 + }, + { + "epoch": 2.1484274972647066, + "grad_norm": 0.3445139810690417, + "learning_rate": 2.2596760327449562e-05, + "loss": 2.5638, + "step": 46146 + }, + { + "epoch": 2.1484740554507997, + "grad_norm": 
0.33780169047138336, + "learning_rate": 2.259449468322025e-05, + "loss": 2.6522, + "step": 46147 + }, + { + "epoch": 2.148520613636893, + "grad_norm": 0.3471684922294482, + "learning_rate": 2.25922291194234e-05, + "loss": 2.6416, + "step": 46148 + }, + { + "epoch": 2.148567171822986, + "grad_norm": 0.3433574589869608, + "learning_rate": 2.2589963636065674e-05, + "loss": 2.5727, + "step": 46149 + }, + { + "epoch": 2.148613730009079, + "grad_norm": 0.3621711827615962, + "learning_rate": 2.2587698233153688e-05, + "loss": 2.666, + "step": 46150 + }, + { + "epoch": 2.1486602881951717, + "grad_norm": 0.35240161780977497, + "learning_rate": 2.2585432910694104e-05, + "loss": 2.6851, + "step": 46151 + }, + { + "epoch": 2.148706846381265, + "grad_norm": 0.31513851659565933, + "learning_rate": 2.2583167668693573e-05, + "loss": 2.5799, + "step": 46152 + }, + { + "epoch": 2.148753404567358, + "grad_norm": 0.35028809615022105, + "learning_rate": 2.258090250715877e-05, + "loss": 2.5719, + "step": 46153 + }, + { + "epoch": 2.148799962753451, + "grad_norm": 0.3427477609786298, + "learning_rate": 2.257863742609627e-05, + "loss": 2.5698, + "step": 46154 + }, + { + "epoch": 2.148846520939544, + "grad_norm": 0.3354821096671701, + "learning_rate": 2.2576372425512808e-05, + "loss": 2.6161, + "step": 46155 + }, + { + "epoch": 2.1488930791256373, + "grad_norm": 0.3359383218487118, + "learning_rate": 2.2574107505414955e-05, + "loss": 2.7004, + "step": 46156 + }, + { + "epoch": 2.1489396373117304, + "grad_norm": 0.33222567866392344, + "learning_rate": 2.2571842665809433e-05, + "loss": 2.6907, + "step": 46157 + }, + { + "epoch": 2.1489861954978235, + "grad_norm": 0.3402762078293047, + "learning_rate": 2.2569577906702832e-05, + "loss": 2.6835, + "step": 46158 + }, + { + "epoch": 2.1490327536839167, + "grad_norm": 0.33403623834063506, + "learning_rate": 2.2567313228101818e-05, + "loss": 2.6135, + "step": 46159 + }, + { + "epoch": 2.1490793118700093, + "grad_norm": 0.32134199944768255, + "learning_rate": 2.2565048630013036e-05, + "loss": 2.6538, + "step": 46160 + }, + { + "epoch": 2.1491258700561024, + "grad_norm": 0.35643335983499136, + "learning_rate": 2.256278411244313e-05, + "loss": 2.564, + "step": 46161 + }, + { + "epoch": 2.1491724282421956, + "grad_norm": 0.32279719521258765, + "learning_rate": 2.2560519675398774e-05, + "loss": 2.5876, + "step": 46162 + }, + { + "epoch": 2.1492189864282887, + "grad_norm": 0.3246485221466021, + "learning_rate": 2.2558255318886567e-05, + "loss": 2.5782, + "step": 46163 + }, + { + "epoch": 2.149265544614382, + "grad_norm": 0.3195052035784245, + "learning_rate": 2.2555991042913177e-05, + "loss": 2.5901, + "step": 46164 + }, + { + "epoch": 2.149312102800475, + "grad_norm": 0.34386567533119566, + "learning_rate": 2.255372684748525e-05, + "loss": 2.6711, + "step": 46165 + }, + { + "epoch": 2.149358660986568, + "grad_norm": 0.33628641196677295, + "learning_rate": 2.2551462732609437e-05, + "loss": 2.6778, + "step": 46166 + }, + { + "epoch": 2.149405219172661, + "grad_norm": 0.34122825950551994, + "learning_rate": 2.2549198698292368e-05, + "loss": 2.7112, + "step": 46167 + }, + { + "epoch": 2.1494517773587543, + "grad_norm": 0.36714559144049175, + "learning_rate": 2.254693474454072e-05, + "loss": 2.6442, + "step": 46168 + }, + { + "epoch": 2.1494983355448474, + "grad_norm": 0.3261924192799377, + "learning_rate": 2.2544670871361074e-05, + "loss": 2.6837, + "step": 46169 + }, + { + "epoch": 2.14954489373094, + "grad_norm": 0.3263400415269465, + "learning_rate": 2.254240707876015e-05, + 
"loss": 2.6789, + "step": 46170 + }, + { + "epoch": 2.149591451917033, + "grad_norm": 0.3477058675946172, + "learning_rate": 2.2540143366744543e-05, + "loss": 2.5772, + "step": 46171 + }, + { + "epoch": 2.1496380101031263, + "grad_norm": 0.33998729958796486, + "learning_rate": 2.2537879735320905e-05, + "loss": 2.6352, + "step": 46172 + }, + { + "epoch": 2.1496845682892194, + "grad_norm": 0.33196811938916415, + "learning_rate": 2.253561618449589e-05, + "loss": 2.6417, + "step": 46173 + }, + { + "epoch": 2.1497311264753125, + "grad_norm": 0.3314115424407007, + "learning_rate": 2.253335271427613e-05, + "loss": 2.6357, + "step": 46174 + }, + { + "epoch": 2.1497776846614056, + "grad_norm": 0.3459785843513995, + "learning_rate": 2.2531089324668296e-05, + "loss": 2.6993, + "step": 46175 + }, + { + "epoch": 2.1498242428474987, + "grad_norm": 0.3265107153486598, + "learning_rate": 2.2528826015678988e-05, + "loss": 2.5553, + "step": 46176 + }, + { + "epoch": 2.149870801033592, + "grad_norm": 0.33747868655756824, + "learning_rate": 2.252656278731487e-05, + "loss": 2.6266, + "step": 46177 + }, + { + "epoch": 2.149917359219685, + "grad_norm": 0.3581233125680696, + "learning_rate": 2.2524299639582586e-05, + "loss": 2.6257, + "step": 46178 + }, + { + "epoch": 2.1499639174057776, + "grad_norm": 0.3073599872855043, + "learning_rate": 2.2522036572488773e-05, + "loss": 2.6175, + "step": 46179 + }, + { + "epoch": 2.1500104755918708, + "grad_norm": 0.3405549349565831, + "learning_rate": 2.2519773586040077e-05, + "loss": 2.57, + "step": 46180 + }, + { + "epoch": 2.150057033777964, + "grad_norm": 0.39445676776974054, + "learning_rate": 2.2517510680243158e-05, + "loss": 2.7073, + "step": 46181 + }, + { + "epoch": 2.150103591964057, + "grad_norm": 0.3365035332398018, + "learning_rate": 2.2515247855104598e-05, + "loss": 2.6626, + "step": 46182 + }, + { + "epoch": 2.15015015015015, + "grad_norm": 0.3268075813581259, + "learning_rate": 2.2512985110631114e-05, + "loss": 2.6084, + "step": 46183 + }, + { + "epoch": 2.150196708336243, + "grad_norm": 0.3405375102585076, + "learning_rate": 2.2510722446829295e-05, + "loss": 2.5482, + "step": 46184 + }, + { + "epoch": 2.1502432665223363, + "grad_norm": 0.34898515460087043, + "learning_rate": 2.2508459863705796e-05, + "loss": 2.6613, + "step": 46185 + }, + { + "epoch": 2.1502898247084294, + "grad_norm": 0.32370634629587675, + "learning_rate": 2.2506197361267263e-05, + "loss": 2.575, + "step": 46186 + }, + { + "epoch": 2.1503363828945226, + "grad_norm": 0.3461649322208879, + "learning_rate": 2.250393493952033e-05, + "loss": 2.5526, + "step": 46187 + }, + { + "epoch": 2.1503829410806157, + "grad_norm": 0.3370855492145204, + "learning_rate": 2.2501672598471652e-05, + "loss": 2.6254, + "step": 46188 + }, + { + "epoch": 2.150429499266709, + "grad_norm": 0.35315933796916, + "learning_rate": 2.2499410338127842e-05, + "loss": 2.7085, + "step": 46189 + }, + { + "epoch": 2.1504760574528015, + "grad_norm": 0.36038311213280394, + "learning_rate": 2.249714815849555e-05, + "loss": 2.5704, + "step": 46190 + }, + { + "epoch": 2.1505226156388946, + "grad_norm": 0.34094084099041205, + "learning_rate": 2.2494886059581428e-05, + "loss": 2.6242, + "step": 46191 + }, + { + "epoch": 2.1505691738249877, + "grad_norm": 0.34143254240988363, + "learning_rate": 2.2492624041392096e-05, + "loss": 2.652, + "step": 46192 + }, + { + "epoch": 2.150615732011081, + "grad_norm": 0.33149747626810633, + "learning_rate": 2.2490362103934205e-05, + "loss": 2.7382, + "step": 46193 + }, + { + "epoch": 
2.150662290197174, + "grad_norm": 0.3536697238617444, + "learning_rate": 2.248810024721441e-05, + "loss": 2.6948, + "step": 46194 + }, + { + "epoch": 2.150708848383267, + "grad_norm": 0.33066344513826795, + "learning_rate": 2.2485838471239297e-05, + "loss": 2.6188, + "step": 46195 + }, + { + "epoch": 2.15075540656936, + "grad_norm": 0.33982392712502696, + "learning_rate": 2.248357677601557e-05, + "loss": 2.691, + "step": 46196 + }, + { + "epoch": 2.1508019647554533, + "grad_norm": 0.35436467536303917, + "learning_rate": 2.2481315161549815e-05, + "loss": 2.6826, + "step": 46197 + }, + { + "epoch": 2.1508485229415464, + "grad_norm": 0.3426217672076775, + "learning_rate": 2.2479053627848696e-05, + "loss": 2.6979, + "step": 46198 + }, + { + "epoch": 2.150895081127639, + "grad_norm": 0.34503686543214707, + "learning_rate": 2.2476792174918835e-05, + "loss": 2.5998, + "step": 46199 + }, + { + "epoch": 2.150941639313732, + "grad_norm": 0.34368569691522005, + "learning_rate": 2.247453080276688e-05, + "loss": 2.5648, + "step": 46200 + }, + { + "epoch": 2.1509881974998253, + "grad_norm": 0.34457262269467054, + "learning_rate": 2.2472269511399486e-05, + "loss": 2.6109, + "step": 46201 + }, + { + "epoch": 2.1510347556859184, + "grad_norm": 0.3381262654445576, + "learning_rate": 2.247000830082323e-05, + "loss": 2.5729, + "step": 46202 + }, + { + "epoch": 2.1510813138720115, + "grad_norm": 0.3599513307014455, + "learning_rate": 2.246774717104483e-05, + "loss": 2.7362, + "step": 46203 + }, + { + "epoch": 2.1511278720581046, + "grad_norm": 0.3439793625101182, + "learning_rate": 2.2465486122070857e-05, + "loss": 2.6174, + "step": 46204 + }, + { + "epoch": 2.1511744302441977, + "grad_norm": 0.30947164697624324, + "learning_rate": 2.2463225153907967e-05, + "loss": 2.5977, + "step": 46205 + }, + { + "epoch": 2.151220988430291, + "grad_norm": 0.3332061861818934, + "learning_rate": 2.2460964266562802e-05, + "loss": 2.7446, + "step": 46206 + }, + { + "epoch": 2.151267546616384, + "grad_norm": 0.3523642918293382, + "learning_rate": 2.245870346004201e-05, + "loss": 2.6311, + "step": 46207 + }, + { + "epoch": 2.151314104802477, + "grad_norm": 0.3418029406478133, + "learning_rate": 2.2456442734352173e-05, + "loss": 2.7026, + "step": 46208 + }, + { + "epoch": 2.1513606629885698, + "grad_norm": 0.3195633369679428, + "learning_rate": 2.24541820895e-05, + "loss": 2.5572, + "step": 46209 + }, + { + "epoch": 2.151407221174663, + "grad_norm": 0.32008233733424846, + "learning_rate": 2.245192152549207e-05, + "loss": 2.5581, + "step": 46210 + }, + { + "epoch": 2.151453779360756, + "grad_norm": 0.33812812139089304, + "learning_rate": 2.244966104233504e-05, + "loss": 2.6747, + "step": 46211 + }, + { + "epoch": 2.151500337546849, + "grad_norm": 0.3232226301749603, + "learning_rate": 2.244740064003554e-05, + "loss": 2.6753, + "step": 46212 + }, + { + "epoch": 2.151546895732942, + "grad_norm": 0.33421448625867833, + "learning_rate": 2.24451403186002e-05, + "loss": 2.734, + "step": 46213 + }, + { + "epoch": 2.1515934539190353, + "grad_norm": 0.32659945825738607, + "learning_rate": 2.2442880078035682e-05, + "loss": 2.5532, + "step": 46214 + }, + { + "epoch": 2.1516400121051285, + "grad_norm": 0.3425614339579111, + "learning_rate": 2.244061991834856e-05, + "loss": 2.6203, + "step": 46215 + }, + { + "epoch": 2.1516865702912216, + "grad_norm": 0.3575575398052037, + "learning_rate": 2.2438359839545535e-05, + "loss": 2.6069, + "step": 46216 + }, + { + "epoch": 2.1517331284773147, + "grad_norm": 0.3224633172173613, + "learning_rate": 
2.2436099841633194e-05, + "loss": 2.5687, + "step": 46217 + }, + { + "epoch": 2.151779686663408, + "grad_norm": 0.3365932810391959, + "learning_rate": 2.2433839924618182e-05, + "loss": 2.7006, + "step": 46218 + }, + { + "epoch": 2.1518262448495005, + "grad_norm": 0.3577586291410028, + "learning_rate": 2.2431580088507136e-05, + "loss": 2.5958, + "step": 46219 + }, + { + "epoch": 2.1518728030355936, + "grad_norm": 0.3594332322246137, + "learning_rate": 2.2429320333306686e-05, + "loss": 2.6516, + "step": 46220 + }, + { + "epoch": 2.1519193612216867, + "grad_norm": 0.33581770066715416, + "learning_rate": 2.2427060659023462e-05, + "loss": 2.7285, + "step": 46221 + }, + { + "epoch": 2.15196591940778, + "grad_norm": 0.31880874288717953, + "learning_rate": 2.2424801065664124e-05, + "loss": 2.635, + "step": 46222 + }, + { + "epoch": 2.152012477593873, + "grad_norm": 0.38244520088434786, + "learning_rate": 2.2422541553235256e-05, + "loss": 2.6886, + "step": 46223 + }, + { + "epoch": 2.152059035779966, + "grad_norm": 0.38852951949998227, + "learning_rate": 2.2420282121743514e-05, + "loss": 2.6467, + "step": 46224 + }, + { + "epoch": 2.152105593966059, + "grad_norm": 0.3262177746417927, + "learning_rate": 2.2418022771195524e-05, + "loss": 2.6334, + "step": 46225 + }, + { + "epoch": 2.1521521521521523, + "grad_norm": 0.3454839634141798, + "learning_rate": 2.241576350159792e-05, + "loss": 2.5937, + "step": 46226 + }, + { + "epoch": 2.1521987103382454, + "grad_norm": 0.35896576688166715, + "learning_rate": 2.241350431295735e-05, + "loss": 2.6259, + "step": 46227 + }, + { + "epoch": 2.1522452685243385, + "grad_norm": 0.3380997362690834, + "learning_rate": 2.241124520528039e-05, + "loss": 2.6188, + "step": 46228 + }, + { + "epoch": 2.152291826710431, + "grad_norm": 0.34846988964169184, + "learning_rate": 2.2408986178573743e-05, + "loss": 2.6296, + "step": 46229 + }, + { + "epoch": 2.1523383848965243, + "grad_norm": 0.33162560105610567, + "learning_rate": 2.2406727232843988e-05, + "loss": 2.6435, + "step": 46230 + }, + { + "epoch": 2.1523849430826174, + "grad_norm": 0.3229315792664388, + "learning_rate": 2.240446836809777e-05, + "loss": 2.6595, + "step": 46231 + }, + { + "epoch": 2.1524315012687105, + "grad_norm": 0.35947166539568237, + "learning_rate": 2.240220958434172e-05, + "loss": 2.6321, + "step": 46232 + }, + { + "epoch": 2.1524780594548036, + "grad_norm": 0.3458793524682695, + "learning_rate": 2.2399950881582465e-05, + "loss": 2.6548, + "step": 46233 + }, + { + "epoch": 2.1525246176408968, + "grad_norm": 0.3369554664553862, + "learning_rate": 2.2397692259826637e-05, + "loss": 2.6989, + "step": 46234 + }, + { + "epoch": 2.15257117582699, + "grad_norm": 0.3330315267254907, + "learning_rate": 2.2395433719080878e-05, + "loss": 2.5375, + "step": 46235 + }, + { + "epoch": 2.152617734013083, + "grad_norm": 0.3355632642644129, + "learning_rate": 2.2393175259351784e-05, + "loss": 2.6439, + "step": 46236 + }, + { + "epoch": 2.152664292199176, + "grad_norm": 0.3535061083027604, + "learning_rate": 2.2390916880646003e-05, + "loss": 2.6807, + "step": 46237 + }, + { + "epoch": 2.1527108503852688, + "grad_norm": 0.3351607559819148, + "learning_rate": 2.2388658582970158e-05, + "loss": 2.6226, + "step": 46238 + }, + { + "epoch": 2.152757408571362, + "grad_norm": 0.3551520618063849, + "learning_rate": 2.238640036633088e-05, + "loss": 2.7154, + "step": 46239 + }, + { + "epoch": 2.152803966757455, + "grad_norm": 0.3328153636127528, + "learning_rate": 2.2384142230734817e-05, + "loss": 2.5912, + "step": 46240 + }, + { + 
"epoch": 2.152850524943548, + "grad_norm": 0.3401587833460154, + "learning_rate": 2.2381884176188538e-05, + "loss": 2.6094, + "step": 46241 + }, + { + "epoch": 2.1528970831296412, + "grad_norm": 0.33971361445446085, + "learning_rate": 2.2379626202698745e-05, + "loss": 2.6069, + "step": 46242 + }, + { + "epoch": 2.1529436413157343, + "grad_norm": 0.3359114959337602, + "learning_rate": 2.2377368310272007e-05, + "loss": 2.5816, + "step": 46243 + }, + { + "epoch": 2.1529901995018275, + "grad_norm": 0.33449959466942486, + "learning_rate": 2.2375110498914975e-05, + "loss": 2.5725, + "step": 46244 + }, + { + "epoch": 2.1530367576879206, + "grad_norm": 0.32708731313904166, + "learning_rate": 2.2372852768634274e-05, + "loss": 2.5373, + "step": 46245 + }, + { + "epoch": 2.1530833158740137, + "grad_norm": 0.3339674390762826, + "learning_rate": 2.2370595119436523e-05, + "loss": 2.6543, + "step": 46246 + }, + { + "epoch": 2.153129874060107, + "grad_norm": 0.33708484373426667, + "learning_rate": 2.2368337551328357e-05, + "loss": 2.6062, + "step": 46247 + }, + { + "epoch": 2.1531764322462, + "grad_norm": 0.36210188787174263, + "learning_rate": 2.2366080064316412e-05, + "loss": 2.7541, + "step": 46248 + }, + { + "epoch": 2.1532229904322926, + "grad_norm": 0.3383798254995443, + "learning_rate": 2.236382265840727e-05, + "loss": 2.6459, + "step": 46249 + }, + { + "epoch": 2.1532695486183857, + "grad_norm": 0.3275414939699167, + "learning_rate": 2.236156533360762e-05, + "loss": 2.5707, + "step": 46250 + }, + { + "epoch": 2.153316106804479, + "grad_norm": 0.35152170792589654, + "learning_rate": 2.2359308089924036e-05, + "loss": 2.61, + "step": 46251 + }, + { + "epoch": 2.153362664990572, + "grad_norm": 0.35225239439826095, + "learning_rate": 2.2357050927363153e-05, + "loss": 2.6468, + "step": 46252 + }, + { + "epoch": 2.153409223176665, + "grad_norm": 0.3306815523599348, + "learning_rate": 2.2354793845931627e-05, + "loss": 2.7691, + "step": 46253 + }, + { + "epoch": 2.153455781362758, + "grad_norm": 0.36463733510483076, + "learning_rate": 2.235253684563602e-05, + "loss": 2.7049, + "step": 46254 + }, + { + "epoch": 2.1535023395488513, + "grad_norm": 0.3346450474235438, + "learning_rate": 2.235027992648303e-05, + "loss": 2.7194, + "step": 46255 + }, + { + "epoch": 2.1535488977349444, + "grad_norm": 0.32012743072460037, + "learning_rate": 2.23480230884792e-05, + "loss": 2.5853, + "step": 46256 + }, + { + "epoch": 2.1535954559210375, + "grad_norm": 0.33631392882036953, + "learning_rate": 2.2345766331631247e-05, + "loss": 2.6335, + "step": 46257 + }, + { + "epoch": 2.15364201410713, + "grad_norm": 0.3661477701706844, + "learning_rate": 2.234350965594572e-05, + "loss": 2.6117, + "step": 46258 + }, + { + "epoch": 2.1536885722932233, + "grad_norm": 0.3480602420169662, + "learning_rate": 2.2341253061429273e-05, + "loss": 2.6257, + "step": 46259 + }, + { + "epoch": 2.1537351304793164, + "grad_norm": 0.3152596536830261, + "learning_rate": 2.233899654808852e-05, + "loss": 2.5421, + "step": 46260 + }, + { + "epoch": 2.1537816886654095, + "grad_norm": 0.32541187398846594, + "learning_rate": 2.2336740115930105e-05, + "loss": 2.6447, + "step": 46261 + }, + { + "epoch": 2.1538282468515026, + "grad_norm": 0.3109727614572929, + "learning_rate": 2.2334483764960596e-05, + "loss": 2.642, + "step": 46262 + }, + { + "epoch": 2.1538748050375958, + "grad_norm": 0.3596896577732401, + "learning_rate": 2.2332227495186686e-05, + "loss": 2.6385, + "step": 46263 + }, + { + "epoch": 2.153921363223689, + "grad_norm": 0.3287346546775258, + 
"learning_rate": 2.2329971306614948e-05, + "loss": 2.6369, + "step": 46264 + }, + { + "epoch": 2.153967921409782, + "grad_norm": 0.3139079321731432, + "learning_rate": 2.232771519925202e-05, + "loss": 2.6759, + "step": 46265 + }, + { + "epoch": 2.154014479595875, + "grad_norm": 0.3322172343133846, + "learning_rate": 2.2325459173104535e-05, + "loss": 2.608, + "step": 46266 + }, + { + "epoch": 2.1540610377819682, + "grad_norm": 0.3285957574201155, + "learning_rate": 2.2323203228179063e-05, + "loss": 2.5835, + "step": 46267 + }, + { + "epoch": 2.154107595968061, + "grad_norm": 0.3269375581752395, + "learning_rate": 2.2320947364482303e-05, + "loss": 2.6653, + "step": 46268 + }, + { + "epoch": 2.154154154154154, + "grad_norm": 0.3444079126605401, + "learning_rate": 2.23186915820208e-05, + "loss": 2.598, + "step": 46269 + }, + { + "epoch": 2.154200712340247, + "grad_norm": 0.3340003238555253, + "learning_rate": 2.2316435880801245e-05, + "loss": 2.5624, + "step": 46270 + }, + { + "epoch": 2.1542472705263402, + "grad_norm": 0.3510926369883535, + "learning_rate": 2.231418026083021e-05, + "loss": 2.6424, + "step": 46271 + }, + { + "epoch": 2.1542938287124334, + "grad_norm": 0.31869256944891283, + "learning_rate": 2.231192472211432e-05, + "loss": 2.6406, + "step": 46272 + }, + { + "epoch": 2.1543403868985265, + "grad_norm": 0.337613769404335, + "learning_rate": 2.230966926466021e-05, + "loss": 2.6581, + "step": 46273 + }, + { + "epoch": 2.1543869450846196, + "grad_norm": 0.33122942863480376, + "learning_rate": 2.2307413888474486e-05, + "loss": 2.6499, + "step": 46274 + }, + { + "epoch": 2.1544335032707127, + "grad_norm": 0.3522724711328396, + "learning_rate": 2.230515859356378e-05, + "loss": 2.6153, + "step": 46275 + }, + { + "epoch": 2.154480061456806, + "grad_norm": 0.3355049503476095, + "learning_rate": 2.230290337993472e-05, + "loss": 2.6718, + "step": 46276 + }, + { + "epoch": 2.1545266196428985, + "grad_norm": 0.3383189509477266, + "learning_rate": 2.2300648247593896e-05, + "loss": 2.5235, + "step": 46277 + }, + { + "epoch": 2.1545731778289916, + "grad_norm": 0.34565478021702034, + "learning_rate": 2.2298393196547935e-05, + "loss": 2.6922, + "step": 46278 + }, + { + "epoch": 2.1546197360150847, + "grad_norm": 0.3352127716675308, + "learning_rate": 2.2296138226803482e-05, + "loss": 2.6454, + "step": 46279 + }, + { + "epoch": 2.154666294201178, + "grad_norm": 0.34159469587802205, + "learning_rate": 2.22938833383671e-05, + "loss": 2.6695, + "step": 46280 + }, + { + "epoch": 2.154712852387271, + "grad_norm": 0.35028292114037757, + "learning_rate": 2.2291628531245473e-05, + "loss": 2.7128, + "step": 46281 + }, + { + "epoch": 2.154759410573364, + "grad_norm": 0.3318166112894082, + "learning_rate": 2.228937380544515e-05, + "loss": 2.6523, + "step": 46282 + }, + { + "epoch": 2.154805968759457, + "grad_norm": 0.33724083428350626, + "learning_rate": 2.2287119160972825e-05, + "loss": 2.6932, + "step": 46283 + }, + { + "epoch": 2.1548525269455503, + "grad_norm": 0.33899698169718684, + "learning_rate": 2.228486459783506e-05, + "loss": 2.5752, + "step": 46284 + }, + { + "epoch": 2.1548990851316434, + "grad_norm": 0.35789427613124347, + "learning_rate": 2.2282610116038475e-05, + "loss": 2.6356, + "step": 46285 + }, + { + "epoch": 2.1549456433177365, + "grad_norm": 0.34546241840269537, + "learning_rate": 2.2280355715589708e-05, + "loss": 2.5557, + "step": 46286 + }, + { + "epoch": 2.1549922015038296, + "grad_norm": 0.36455715627567215, + "learning_rate": 2.2278101396495366e-05, + "loss": 2.6938, + "step": 
46287 + }, + { + "epoch": 2.1550387596899223, + "grad_norm": 0.3608153584323533, + "learning_rate": 2.2275847158762065e-05, + "loss": 2.6557, + "step": 46288 + }, + { + "epoch": 2.1550853178760154, + "grad_norm": 0.34139515080771515, + "learning_rate": 2.2273593002396436e-05, + "loss": 2.5379, + "step": 46289 + }, + { + "epoch": 2.1551318760621085, + "grad_norm": 0.3467731017068138, + "learning_rate": 2.227133892740506e-05, + "loss": 2.6638, + "step": 46290 + }, + { + "epoch": 2.1551784342482017, + "grad_norm": 0.3317768839058625, + "learning_rate": 2.2269084933794582e-05, + "loss": 2.6216, + "step": 46291 + }, + { + "epoch": 2.1552249924342948, + "grad_norm": 0.34502851514677024, + "learning_rate": 2.2266831021571598e-05, + "loss": 2.506, + "step": 46292 + }, + { + "epoch": 2.155271550620388, + "grad_norm": 0.3386156969885655, + "learning_rate": 2.2264577190742737e-05, + "loss": 2.7235, + "step": 46293 + }, + { + "epoch": 2.155318108806481, + "grad_norm": 0.4027144923145875, + "learning_rate": 2.226232344131463e-05, + "loss": 2.6664, + "step": 46294 + }, + { + "epoch": 2.155364666992574, + "grad_norm": 0.3456897907823278, + "learning_rate": 2.226006977329383e-05, + "loss": 2.6753, + "step": 46295 + }, + { + "epoch": 2.1554112251786672, + "grad_norm": 0.34703207242437756, + "learning_rate": 2.225781618668703e-05, + "loss": 2.7663, + "step": 46296 + }, + { + "epoch": 2.15545778336476, + "grad_norm": 0.34641334277808244, + "learning_rate": 2.2255562681500792e-05, + "loss": 2.6287, + "step": 46297 + }, + { + "epoch": 2.155504341550853, + "grad_norm": 0.3460753504948273, + "learning_rate": 2.225330925774174e-05, + "loss": 2.6099, + "step": 46298 + }, + { + "epoch": 2.155550899736946, + "grad_norm": 0.3174901848349662, + "learning_rate": 2.225105591541649e-05, + "loss": 2.6473, + "step": 46299 + }, + { + "epoch": 2.1555974579230393, + "grad_norm": 0.35582535154045486, + "learning_rate": 2.2248802654531664e-05, + "loss": 2.6903, + "step": 46300 + }, + { + "epoch": 2.1556440161091324, + "grad_norm": 0.31564938928396546, + "learning_rate": 2.2246549475093863e-05, + "loss": 2.6341, + "step": 46301 + }, + { + "epoch": 2.1556905742952255, + "grad_norm": 0.3490700072130724, + "learning_rate": 2.2244296377109726e-05, + "loss": 2.6815, + "step": 46302 + }, + { + "epoch": 2.1557371324813186, + "grad_norm": 0.36560793699256494, + "learning_rate": 2.2242043360585824e-05, + "loss": 2.6853, + "step": 46303 + }, + { + "epoch": 2.1557836906674117, + "grad_norm": 0.3373031152124098, + "learning_rate": 2.2239790425528796e-05, + "loss": 2.5897, + "step": 46304 + }, + { + "epoch": 2.155830248853505, + "grad_norm": 0.32429160979892885, + "learning_rate": 2.2237537571945245e-05, + "loss": 2.7368, + "step": 46305 + }, + { + "epoch": 2.155876807039598, + "grad_norm": 0.3221036011084776, + "learning_rate": 2.2235284799841784e-05, + "loss": 2.5295, + "step": 46306 + }, + { + "epoch": 2.1559233652256906, + "grad_norm": 0.3415260264860166, + "learning_rate": 2.2233032109225044e-05, + "loss": 2.6563, + "step": 46307 + }, + { + "epoch": 2.1559699234117837, + "grad_norm": 0.3071475174607429, + "learning_rate": 2.2230779500101584e-05, + "loss": 2.6836, + "step": 46308 + }, + { + "epoch": 2.156016481597877, + "grad_norm": 0.35135456588553404, + "learning_rate": 2.2228526972478086e-05, + "loss": 2.6565, + "step": 46309 + }, + { + "epoch": 2.15606303978397, + "grad_norm": 0.33724317321232705, + "learning_rate": 2.2226274526361096e-05, + "loss": 2.7356, + "step": 46310 + }, + { + "epoch": 2.156109597970063, + "grad_norm": 
0.35316988100923613, + "learning_rate": 2.2224022161757285e-05, + "loss": 2.6626, + "step": 46311 + }, + { + "epoch": 2.156156156156156, + "grad_norm": 0.34615262747186654, + "learning_rate": 2.222176987867321e-05, + "loss": 2.6964, + "step": 46312 + }, + { + "epoch": 2.1562027143422493, + "grad_norm": 0.326189385801832, + "learning_rate": 2.2219517677115508e-05, + "loss": 2.6333, + "step": 46313 + }, + { + "epoch": 2.1562492725283424, + "grad_norm": 0.34856274961746264, + "learning_rate": 2.2217265557090787e-05, + "loss": 2.7355, + "step": 46314 + }, + { + "epoch": 2.1562958307144355, + "grad_norm": 0.35256939818095695, + "learning_rate": 2.2215013518605672e-05, + "loss": 2.6124, + "step": 46315 + }, + { + "epoch": 2.156342388900528, + "grad_norm": 0.32066709579023794, + "learning_rate": 2.2212761561666733e-05, + "loss": 2.5778, + "step": 46316 + }, + { + "epoch": 2.1563889470866213, + "grad_norm": 0.32103959965120626, + "learning_rate": 2.2210509686280607e-05, + "loss": 2.6294, + "step": 46317 + }, + { + "epoch": 2.1564355052727144, + "grad_norm": 0.3160879445951851, + "learning_rate": 2.2208257892453894e-05, + "loss": 2.591, + "step": 46318 + }, + { + "epoch": 2.1564820634588076, + "grad_norm": 0.3360442730712545, + "learning_rate": 2.2206006180193205e-05, + "loss": 2.7088, + "step": 46319 + }, + { + "epoch": 2.1565286216449007, + "grad_norm": 0.32125126315566427, + "learning_rate": 2.2203754549505174e-05, + "loss": 2.6339, + "step": 46320 + }, + { + "epoch": 2.156575179830994, + "grad_norm": 0.33862499864118845, + "learning_rate": 2.220150300039634e-05, + "loss": 2.6588, + "step": 46321 + }, + { + "epoch": 2.156621738017087, + "grad_norm": 0.32046689940717227, + "learning_rate": 2.21992515328734e-05, + "loss": 2.6429, + "step": 46322 + }, + { + "epoch": 2.15666829620318, + "grad_norm": 0.34618440872238293, + "learning_rate": 2.2197000146942877e-05, + "loss": 2.7098, + "step": 46323 + }, + { + "epoch": 2.156714854389273, + "grad_norm": 0.33012975729970534, + "learning_rate": 2.2194748842611456e-05, + "loss": 2.5976, + "step": 46324 + }, + { + "epoch": 2.1567614125753662, + "grad_norm": 0.30568121221749384, + "learning_rate": 2.2192497619885694e-05, + "loss": 2.5699, + "step": 46325 + }, + { + "epoch": 2.1568079707614594, + "grad_norm": 0.34571133371815954, + "learning_rate": 2.219024647877221e-05, + "loss": 2.6898, + "step": 46326 + }, + { + "epoch": 2.156854528947552, + "grad_norm": 0.33524551168000877, + "learning_rate": 2.2187995419277607e-05, + "loss": 2.5994, + "step": 46327 + }, + { + "epoch": 2.156901087133645, + "grad_norm": 0.35596000794548965, + "learning_rate": 2.21857444414085e-05, + "loss": 2.6744, + "step": 46328 + }, + { + "epoch": 2.1569476453197383, + "grad_norm": 0.33595704860450737, + "learning_rate": 2.218349354517152e-05, + "loss": 2.6666, + "step": 46329 + }, + { + "epoch": 2.1569942035058314, + "grad_norm": 0.3504187705458031, + "learning_rate": 2.218124273057322e-05, + "loss": 2.6864, + "step": 46330 + }, + { + "epoch": 2.1570407616919245, + "grad_norm": 0.3682709945393093, + "learning_rate": 2.217899199762023e-05, + "loss": 2.6749, + "step": 46331 + }, + { + "epoch": 2.1570873198780176, + "grad_norm": 0.32445269167573937, + "learning_rate": 2.2176741346319164e-05, + "loss": 2.6797, + "step": 46332 + }, + { + "epoch": 2.1571338780641107, + "grad_norm": 0.3506257455897342, + "learning_rate": 2.217449077667663e-05, + "loss": 2.6168, + "step": 46333 + }, + { + "epoch": 2.157180436250204, + "grad_norm": 0.3629130325373767, + "learning_rate": 2.2172240288699197e-05, + 
"loss": 2.6143, + "step": 46334 + }, + { + "epoch": 2.157226994436297, + "grad_norm": 0.3369507338327057, + "learning_rate": 2.2169989882393533e-05, + "loss": 2.6359, + "step": 46335 + }, + { + "epoch": 2.1572735526223896, + "grad_norm": 0.34323126127797243, + "learning_rate": 2.2167739557766166e-05, + "loss": 2.6746, + "step": 46336 + }, + { + "epoch": 2.1573201108084827, + "grad_norm": 0.3360819492134632, + "learning_rate": 2.216548931482378e-05, + "loss": 2.6959, + "step": 46337 + }, + { + "epoch": 2.157366668994576, + "grad_norm": 0.3479528640007018, + "learning_rate": 2.2163239153572923e-05, + "loss": 2.6889, + "step": 46338 + }, + { + "epoch": 2.157413227180669, + "grad_norm": 0.3000623949466797, + "learning_rate": 2.2160989074020217e-05, + "loss": 2.495, + "step": 46339 + }, + { + "epoch": 2.157459785366762, + "grad_norm": 0.32401396535463384, + "learning_rate": 2.2158739076172262e-05, + "loss": 2.5548, + "step": 46340 + }, + { + "epoch": 2.157506343552855, + "grad_norm": 0.3333237348072396, + "learning_rate": 2.215648916003567e-05, + "loss": 2.6258, + "step": 46341 + }, + { + "epoch": 2.1575529017389483, + "grad_norm": 0.3203004069332036, + "learning_rate": 2.2154239325617055e-05, + "loss": 2.717, + "step": 46342 + }, + { + "epoch": 2.1575994599250414, + "grad_norm": 0.30723621361589365, + "learning_rate": 2.2151989572922986e-05, + "loss": 2.672, + "step": 46343 + }, + { + "epoch": 2.1576460181111345, + "grad_norm": 0.3059437289699608, + "learning_rate": 2.214973990196009e-05, + "loss": 2.6253, + "step": 46344 + }, + { + "epoch": 2.1576925762972277, + "grad_norm": 0.32620286624532513, + "learning_rate": 2.2147490312734952e-05, + "loss": 2.4831, + "step": 46345 + }, + { + "epoch": 2.1577391344833203, + "grad_norm": 0.30523000131538824, + "learning_rate": 2.214524080525419e-05, + "loss": 2.6121, + "step": 46346 + }, + { + "epoch": 2.1577856926694134, + "grad_norm": 0.3169549500568265, + "learning_rate": 2.2142991379524408e-05, + "loss": 2.7769, + "step": 46347 + }, + { + "epoch": 2.1578322508555066, + "grad_norm": 0.31803278936504764, + "learning_rate": 2.2140742035552214e-05, + "loss": 2.6899, + "step": 46348 + }, + { + "epoch": 2.1578788090415997, + "grad_norm": 0.31618366092956324, + "learning_rate": 2.2138492773344167e-05, + "loss": 2.5913, + "step": 46349 + }, + { + "epoch": 2.157925367227693, + "grad_norm": 0.320334503433739, + "learning_rate": 2.213624359290693e-05, + "loss": 2.7832, + "step": 46350 + }, + { + "epoch": 2.157971925413786, + "grad_norm": 0.34049885998850293, + "learning_rate": 2.2133994494247057e-05, + "loss": 2.674, + "step": 46351 + }, + { + "epoch": 2.158018483599879, + "grad_norm": 0.34339140684602937, + "learning_rate": 2.2131745477371164e-05, + "loss": 2.6444, + "step": 46352 + }, + { + "epoch": 2.158065041785972, + "grad_norm": 0.32038944418526016, + "learning_rate": 2.2129496542285856e-05, + "loss": 2.6494, + "step": 46353 + }, + { + "epoch": 2.1581115999720653, + "grad_norm": 0.3502237585793126, + "learning_rate": 2.2127247688997727e-05, + "loss": 2.6471, + "step": 46354 + }, + { + "epoch": 2.158158158158158, + "grad_norm": 0.36986990530812786, + "learning_rate": 2.2124998917513397e-05, + "loss": 2.6629, + "step": 46355 + }, + { + "epoch": 2.158204716344251, + "grad_norm": 0.3281909920581328, + "learning_rate": 2.212275022783943e-05, + "loss": 2.6557, + "step": 46356 + }, + { + "epoch": 2.158251274530344, + "grad_norm": 0.3516915365259821, + "learning_rate": 2.2120501619982448e-05, + "loss": 2.6229, + "step": 46357 + }, + { + "epoch": 
2.1582978327164373, + "grad_norm": 0.3279587097231825, + "learning_rate": 2.211825309394905e-05, + "loss": 2.6681, + "step": 46358 + }, + { + "epoch": 2.1583443909025304, + "grad_norm": 0.359161603622489, + "learning_rate": 2.2116004649745825e-05, + "loss": 2.6827, + "step": 46359 + }, + { + "epoch": 2.1583909490886235, + "grad_norm": 0.337687368213743, + "learning_rate": 2.2113756287379385e-05, + "loss": 2.6566, + "step": 46360 + }, + { + "epoch": 2.1584375072747166, + "grad_norm": 0.3276644057809911, + "learning_rate": 2.2111508006856334e-05, + "loss": 2.6296, + "step": 46361 + }, + { + "epoch": 2.1584840654608097, + "grad_norm": 0.3536556486976242, + "learning_rate": 2.210925980818323e-05, + "loss": 2.6486, + "step": 46362 + }, + { + "epoch": 2.158530623646903, + "grad_norm": 0.36495874729827044, + "learning_rate": 2.2107011691366737e-05, + "loss": 2.6961, + "step": 46363 + }, + { + "epoch": 2.158577181832996, + "grad_norm": 0.30671449331394696, + "learning_rate": 2.2104763656413398e-05, + "loss": 2.612, + "step": 46364 + }, + { + "epoch": 2.158623740019089, + "grad_norm": 0.36269335093927185, + "learning_rate": 2.2102515703329825e-05, + "loss": 2.6153, + "step": 46365 + }, + { + "epoch": 2.1586702982051817, + "grad_norm": 0.3518363015029348, + "learning_rate": 2.2100267832122622e-05, + "loss": 2.6629, + "step": 46366 + }, + { + "epoch": 2.158716856391275, + "grad_norm": 0.3350335605906772, + "learning_rate": 2.209802004279839e-05, + "loss": 2.7054, + "step": 46367 + }, + { + "epoch": 2.158763414577368, + "grad_norm": 0.31920263023823364, + "learning_rate": 2.2095772335363728e-05, + "loss": 2.6476, + "step": 46368 + }, + { + "epoch": 2.158809972763461, + "grad_norm": 0.35218528871663757, + "learning_rate": 2.2093524709825197e-05, + "loss": 2.736, + "step": 46369 + }, + { + "epoch": 2.158856530949554, + "grad_norm": 0.3314375559667897, + "learning_rate": 2.2091277166189457e-05, + "loss": 2.5732, + "step": 46370 + }, + { + "epoch": 2.1589030891356473, + "grad_norm": 0.3284611477412034, + "learning_rate": 2.208902970446305e-05, + "loss": 2.549, + "step": 46371 + }, + { + "epoch": 2.1589496473217404, + "grad_norm": 0.3144554809230903, + "learning_rate": 2.2086782324652595e-05, + "loss": 2.6125, + "step": 46372 + }, + { + "epoch": 2.1589962055078336, + "grad_norm": 0.3175476772351366, + "learning_rate": 2.2084535026764673e-05, + "loss": 2.6718, + "step": 46373 + }, + { + "epoch": 2.1590427636939267, + "grad_norm": 0.34613829772713056, + "learning_rate": 2.208228781080592e-05, + "loss": 2.6501, + "step": 46374 + }, + { + "epoch": 2.1590893218800193, + "grad_norm": 0.34096084604145294, + "learning_rate": 2.208004067678286e-05, + "loss": 2.6329, + "step": 46375 + }, + { + "epoch": 2.1591358800661125, + "grad_norm": 0.3424787364472527, + "learning_rate": 2.207779362470217e-05, + "loss": 2.709, + "step": 46376 + }, + { + "epoch": 2.1591824382522056, + "grad_norm": 0.321702564486935, + "learning_rate": 2.2075546654570383e-05, + "loss": 2.6318, + "step": 46377 + }, + { + "epoch": 2.1592289964382987, + "grad_norm": 0.34918019604496087, + "learning_rate": 2.2073299766394118e-05, + "loss": 2.5781, + "step": 46378 + }, + { + "epoch": 2.159275554624392, + "grad_norm": 0.3318349484857923, + "learning_rate": 2.2071052960179966e-05, + "loss": 2.5805, + "step": 46379 + }, + { + "epoch": 2.159322112810485, + "grad_norm": 0.33145659314934106, + "learning_rate": 2.2068806235934527e-05, + "loss": 2.5259, + "step": 46380 + }, + { + "epoch": 2.159368670996578, + "grad_norm": 0.35758750257192, + "learning_rate": 
2.2066559593664397e-05, + "loss": 2.6571, + "step": 46381 + }, + { + "epoch": 2.159415229182671, + "grad_norm": 0.3259781386603873, + "learning_rate": 2.2064313033376134e-05, + "loss": 2.4973, + "step": 46382 + }, + { + "epoch": 2.1594617873687643, + "grad_norm": 0.33382640444012757, + "learning_rate": 2.2062066555076395e-05, + "loss": 2.5713, + "step": 46383 + }, + { + "epoch": 2.1595083455548574, + "grad_norm": 0.330839927374177, + "learning_rate": 2.205982015877172e-05, + "loss": 2.5965, + "step": 46384 + }, + { + "epoch": 2.15955490374095, + "grad_norm": 0.31460004166604116, + "learning_rate": 2.205757384446872e-05, + "loss": 2.5817, + "step": 46385 + }, + { + "epoch": 2.159601461927043, + "grad_norm": 0.32679525536176235, + "learning_rate": 2.2055327612173993e-05, + "loss": 2.5487, + "step": 46386 + }, + { + "epoch": 2.1596480201131363, + "grad_norm": 0.3340286357335926, + "learning_rate": 2.2053081461894122e-05, + "loss": 2.6636, + "step": 46387 + }, + { + "epoch": 2.1596945782992294, + "grad_norm": 0.345491895017586, + "learning_rate": 2.20508353936357e-05, + "loss": 2.5867, + "step": 46388 + }, + { + "epoch": 2.1597411364853225, + "grad_norm": 0.3395917442408725, + "learning_rate": 2.2048589407405345e-05, + "loss": 2.6512, + "step": 46389 + }, + { + "epoch": 2.1597876946714156, + "grad_norm": 0.3254029243306158, + "learning_rate": 2.204634350320961e-05, + "loss": 2.535, + "step": 46390 + }, + { + "epoch": 2.1598342528575087, + "grad_norm": 0.3430861073844384, + "learning_rate": 2.20440976810551e-05, + "loss": 2.6349, + "step": 46391 + }, + { + "epoch": 2.159880811043602, + "grad_norm": 0.310719039988386, + "learning_rate": 2.204185194094841e-05, + "loss": 2.6477, + "step": 46392 + }, + { + "epoch": 2.159927369229695, + "grad_norm": 0.322342875731063, + "learning_rate": 2.203960628289613e-05, + "loss": 2.7395, + "step": 46393 + }, + { + "epoch": 2.159973927415788, + "grad_norm": 0.334286176885812, + "learning_rate": 2.203736070690487e-05, + "loss": 2.6295, + "step": 46394 + }, + { + "epoch": 2.1600204856018808, + "grad_norm": 0.3078502992801204, + "learning_rate": 2.2035115212981162e-05, + "loss": 2.5806, + "step": 46395 + }, + { + "epoch": 2.160067043787974, + "grad_norm": 0.332332601094843, + "learning_rate": 2.203286980113168e-05, + "loss": 2.6036, + "step": 46396 + }, + { + "epoch": 2.160113601974067, + "grad_norm": 0.342760454501872, + "learning_rate": 2.2030624471362943e-05, + "loss": 2.6685, + "step": 46397 + }, + { + "epoch": 2.16016016016016, + "grad_norm": 0.34748186912467277, + "learning_rate": 2.2028379223681574e-05, + "loss": 2.6417, + "step": 46398 + }, + { + "epoch": 2.160206718346253, + "grad_norm": 0.3391937061519568, + "learning_rate": 2.202613405809415e-05, + "loss": 2.6208, + "step": 46399 + }, + { + "epoch": 2.1602532765323463, + "grad_norm": 0.33287178419960245, + "learning_rate": 2.202388897460727e-05, + "loss": 2.6063, + "step": 46400 + }, + { + "epoch": 2.1602998347184394, + "grad_norm": 0.355008316708119, + "learning_rate": 2.2021643973227518e-05, + "loss": 2.6651, + "step": 46401 + }, + { + "epoch": 2.1603463929045326, + "grad_norm": 0.35098356364913524, + "learning_rate": 2.201939905396151e-05, + "loss": 2.6508, + "step": 46402 + }, + { + "epoch": 2.1603929510906257, + "grad_norm": 0.34379761468005393, + "learning_rate": 2.2017154216815783e-05, + "loss": 2.6342, + "step": 46403 + }, + { + "epoch": 2.160439509276719, + "grad_norm": 0.3497563358503125, + "learning_rate": 2.2014909461796955e-05, + "loss": 2.6481, + "step": 46404 + }, + { + "epoch": 
2.1604860674628115, + "grad_norm": 0.33280630326569616, + "learning_rate": 2.2012664788911608e-05, + "loss": 2.6126, + "step": 46405 + }, + { + "epoch": 2.1605326256489046, + "grad_norm": 0.333297030425151, + "learning_rate": 2.2010420198166336e-05, + "loss": 2.602, + "step": 46406 + }, + { + "epoch": 2.1605791838349977, + "grad_norm": 0.3358705326301049, + "learning_rate": 2.200817568956774e-05, + "loss": 2.6231, + "step": 46407 + }, + { + "epoch": 2.160625742021091, + "grad_norm": 0.37025737453657964, + "learning_rate": 2.2005931263122352e-05, + "loss": 2.7567, + "step": 46408 + }, + { + "epoch": 2.160672300207184, + "grad_norm": 0.3393710199033598, + "learning_rate": 2.200368691883684e-05, + "loss": 2.7202, + "step": 46409 + }, + { + "epoch": 2.160718858393277, + "grad_norm": 0.34151122056612715, + "learning_rate": 2.2001442656717723e-05, + "loss": 2.7585, + "step": 46410 + }, + { + "epoch": 2.16076541657937, + "grad_norm": 0.34840614476725484, + "learning_rate": 2.199919847677162e-05, + "loss": 2.5754, + "step": 46411 + }, + { + "epoch": 2.1608119747654633, + "grad_norm": 0.3331257787810084, + "learning_rate": 2.199695437900511e-05, + "loss": 2.5404, + "step": 46412 + }, + { + "epoch": 2.1608585329515564, + "grad_norm": 0.32483145837823635, + "learning_rate": 2.199471036342478e-05, + "loss": 2.7204, + "step": 46413 + }, + { + "epoch": 2.160905091137649, + "grad_norm": 0.3632653171061694, + "learning_rate": 2.199246643003721e-05, + "loss": 2.6082, + "step": 46414 + }, + { + "epoch": 2.160951649323742, + "grad_norm": 0.3365191468151203, + "learning_rate": 2.199022257884902e-05, + "loss": 2.667, + "step": 46415 + }, + { + "epoch": 2.1609982075098353, + "grad_norm": 0.354645099014715, + "learning_rate": 2.1987978809866744e-05, + "loss": 2.6795, + "step": 46416 + }, + { + "epoch": 2.1610447656959284, + "grad_norm": 0.3290658309810509, + "learning_rate": 2.198573512309699e-05, + "loss": 2.7211, + "step": 46417 + }, + { + "epoch": 2.1610913238820215, + "grad_norm": 0.3432673753048362, + "learning_rate": 2.1983491518546346e-05, + "loss": 2.6352, + "step": 46418 + }, + { + "epoch": 2.1611378820681146, + "grad_norm": 0.34770438742816845, + "learning_rate": 2.1981247996221393e-05, + "loss": 2.61, + "step": 46419 + }, + { + "epoch": 2.1611844402542078, + "grad_norm": 0.3607134362166324, + "learning_rate": 2.1979004556128736e-05, + "loss": 2.7254, + "step": 46420 + }, + { + "epoch": 2.161230998440301, + "grad_norm": 0.3223785171260541, + "learning_rate": 2.1976761198274898e-05, + "loss": 2.5734, + "step": 46421 + }, + { + "epoch": 2.161277556626394, + "grad_norm": 0.3502987208340169, + "learning_rate": 2.197451792266655e-05, + "loss": 2.6372, + "step": 46422 + }, + { + "epoch": 2.161324114812487, + "grad_norm": 0.3517139765279602, + "learning_rate": 2.197227472931019e-05, + "loss": 2.6294, + "step": 46423 + }, + { + "epoch": 2.1613706729985798, + "grad_norm": 0.32663726611147703, + "learning_rate": 2.1970031618212478e-05, + "loss": 2.6182, + "step": 46424 + }, + { + "epoch": 2.161417231184673, + "grad_norm": 0.3403110804669461, + "learning_rate": 2.1967788589379945e-05, + "loss": 2.6203, + "step": 46425 + }, + { + "epoch": 2.161463789370766, + "grad_norm": 0.330164382463492, + "learning_rate": 2.196554564281919e-05, + "loss": 2.6721, + "step": 46426 + }, + { + "epoch": 2.161510347556859, + "grad_norm": 0.29376888484545116, + "learning_rate": 2.1963302778536797e-05, + "loss": 2.6946, + "step": 46427 + }, + { + "epoch": 2.1615569057429522, + "grad_norm": 0.3310798938169788, + "learning_rate": 
2.196105999653937e-05, + "loss": 2.6764, + "step": 46428 + }, + { + "epoch": 2.1616034639290453, + "grad_norm": 0.33756754043279075, + "learning_rate": 2.1958817296833444e-05, + "loss": 2.6202, + "step": 46429 + }, + { + "epoch": 2.1616500221151385, + "grad_norm": 0.32074035960439856, + "learning_rate": 2.1956574679425633e-05, + "loss": 2.6938, + "step": 46430 + }, + { + "epoch": 2.1616965803012316, + "grad_norm": 0.3348928780422045, + "learning_rate": 2.195433214432251e-05, + "loss": 2.6622, + "step": 46431 + }, + { + "epoch": 2.1617431384873247, + "grad_norm": 0.3455882445224543, + "learning_rate": 2.1952089691530657e-05, + "loss": 2.6516, + "step": 46432 + }, + { + "epoch": 2.161789696673418, + "grad_norm": 0.33283332875006694, + "learning_rate": 2.194984732105668e-05, + "loss": 2.496, + "step": 46433 + }, + { + "epoch": 2.1618362548595105, + "grad_norm": 0.3528353029698138, + "learning_rate": 2.1947605032907098e-05, + "loss": 2.7163, + "step": 46434 + }, + { + "epoch": 2.1618828130456036, + "grad_norm": 0.34207513873827483, + "learning_rate": 2.194536282708857e-05, + "loss": 2.614, + "step": 46435 + }, + { + "epoch": 2.1619293712316967, + "grad_norm": 0.3229012373924739, + "learning_rate": 2.1943120703607607e-05, + "loss": 2.6074, + "step": 46436 + }, + { + "epoch": 2.16197592941779, + "grad_norm": 0.3305745735920983, + "learning_rate": 2.194087866247085e-05, + "loss": 2.5525, + "step": 46437 + }, + { + "epoch": 2.162022487603883, + "grad_norm": 0.3531655194280954, + "learning_rate": 2.1938636703684834e-05, + "loss": 2.7082, + "step": 46438 + }, + { + "epoch": 2.162069045789976, + "grad_norm": 0.3354475877493429, + "learning_rate": 2.193639482725615e-05, + "loss": 2.6682, + "step": 46439 + }, + { + "epoch": 2.162115603976069, + "grad_norm": 0.31177850732571244, + "learning_rate": 2.193415303319139e-05, + "loss": 2.5918, + "step": 46440 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.34120624795233256, + "learning_rate": 2.1931911321497124e-05, + "loss": 2.5987, + "step": 46441 + }, + { + "epoch": 2.1622087203482554, + "grad_norm": 0.38206277603855315, + "learning_rate": 2.192966969217995e-05, + "loss": 2.6937, + "step": 46442 + }, + { + "epoch": 2.1622552785343485, + "grad_norm": 0.329185963765254, + "learning_rate": 2.192742814524641e-05, + "loss": 2.6505, + "step": 46443 + }, + { + "epoch": 2.162301836720441, + "grad_norm": 0.33300599394332847, + "learning_rate": 2.1925186680703103e-05, + "loss": 2.5764, + "step": 46444 + }, + { + "epoch": 2.1623483949065343, + "grad_norm": 0.34014554266399566, + "learning_rate": 2.1922945298556606e-05, + "loss": 2.7141, + "step": 46445 + }, + { + "epoch": 2.1623949530926274, + "grad_norm": 0.34354150187996824, + "learning_rate": 2.1920703998813523e-05, + "loss": 2.5809, + "step": 46446 + }, + { + "epoch": 2.1624415112787205, + "grad_norm": 0.3462898037935257, + "learning_rate": 2.191846278148037e-05, + "loss": 2.6259, + "step": 46447 + }, + { + "epoch": 2.1624880694648136, + "grad_norm": 0.34915671545338733, + "learning_rate": 2.1916221646563796e-05, + "loss": 2.6816, + "step": 46448 + }, + { + "epoch": 2.1625346276509068, + "grad_norm": 0.3219634625530823, + "learning_rate": 2.1913980594070304e-05, + "loss": 2.6026, + "step": 46449 + }, + { + "epoch": 2.162581185837, + "grad_norm": 0.3225489903109412, + "learning_rate": 2.1911739624006556e-05, + "loss": 2.5909, + "step": 46450 + }, + { + "epoch": 2.162627744023093, + "grad_norm": 0.3293625102430659, + "learning_rate": 2.1909498736379062e-05, + "loss": 2.5893, + "step": 46451 + }, + { + 
"epoch": 2.162674302209186, + "grad_norm": 0.3305300491871442, + "learning_rate": 2.1907257931194424e-05, + "loss": 2.6864, + "step": 46452 + }, + { + "epoch": 2.1627208603952788, + "grad_norm": 0.34705296055148954, + "learning_rate": 2.1905017208459217e-05, + "loss": 2.6435, + "step": 46453 + }, + { + "epoch": 2.162767418581372, + "grad_norm": 0.33337365579506506, + "learning_rate": 2.1902776568180015e-05, + "loss": 2.7063, + "step": 46454 + }, + { + "epoch": 2.162813976767465, + "grad_norm": 0.32776841840339205, + "learning_rate": 2.1900536010363414e-05, + "loss": 2.4819, + "step": 46455 + }, + { + "epoch": 2.162860534953558, + "grad_norm": 0.34116973873492845, + "learning_rate": 2.1898295535015957e-05, + "loss": 2.6126, + "step": 46456 + }, + { + "epoch": 2.1629070931396512, + "grad_norm": 0.3581807682052103, + "learning_rate": 2.189605514214423e-05, + "loss": 2.6935, + "step": 46457 + }, + { + "epoch": 2.1629536513257444, + "grad_norm": 0.3668415258898277, + "learning_rate": 2.1893814831754816e-05, + "loss": 2.7161, + "step": 46458 + }, + { + "epoch": 2.1630002095118375, + "grad_norm": 0.3531724694432742, + "learning_rate": 2.1891574603854282e-05, + "loss": 2.672, + "step": 46459 + }, + { + "epoch": 2.1630467676979306, + "grad_norm": 0.338151703438184, + "learning_rate": 2.1889334458449214e-05, + "loss": 2.5801, + "step": 46460 + }, + { + "epoch": 2.1630933258840237, + "grad_norm": 0.3478228830319653, + "learning_rate": 2.1887094395546188e-05, + "loss": 2.6521, + "step": 46461 + }, + { + "epoch": 2.163139884070117, + "grad_norm": 0.33098633154686324, + "learning_rate": 2.188485441515174e-05, + "loss": 2.6838, + "step": 46462 + }, + { + "epoch": 2.16318644225621, + "grad_norm": 0.3336128819483491, + "learning_rate": 2.188261451727251e-05, + "loss": 2.6496, + "step": 46463 + }, + { + "epoch": 2.1632330004423026, + "grad_norm": 0.34959941470584227, + "learning_rate": 2.1880374701915018e-05, + "loss": 2.6165, + "step": 46464 + }, + { + "epoch": 2.1632795586283957, + "grad_norm": 0.32653086454429564, + "learning_rate": 2.187813496908586e-05, + "loss": 2.6323, + "step": 46465 + }, + { + "epoch": 2.163326116814489, + "grad_norm": 0.32642011826355294, + "learning_rate": 2.18758953187916e-05, + "loss": 2.6014, + "step": 46466 + }, + { + "epoch": 2.163372675000582, + "grad_norm": 0.341188577846106, + "learning_rate": 2.187365575103882e-05, + "loss": 2.6154, + "step": 46467 + }, + { + "epoch": 2.163419233186675, + "grad_norm": 0.34140501728641875, + "learning_rate": 2.1871416265834103e-05, + "loss": 2.6757, + "step": 46468 + }, + { + "epoch": 2.163465791372768, + "grad_norm": 0.35087646027950015, + "learning_rate": 2.1869176863184e-05, + "loss": 2.6522, + "step": 46469 + }, + { + "epoch": 2.1635123495588613, + "grad_norm": 0.3109498478919957, + "learning_rate": 2.1866937543095083e-05, + "loss": 2.6184, + "step": 46470 + }, + { + "epoch": 2.1635589077449544, + "grad_norm": 0.3141967044940461, + "learning_rate": 2.1864698305573934e-05, + "loss": 2.6109, + "step": 46471 + }, + { + "epoch": 2.1636054659310475, + "grad_norm": 0.38764160818353793, + "learning_rate": 2.1862459150627124e-05, + "loss": 2.5529, + "step": 46472 + }, + { + "epoch": 2.16365202411714, + "grad_norm": 0.3423876886492405, + "learning_rate": 2.1860220078261222e-05, + "loss": 2.6356, + "step": 46473 + }, + { + "epoch": 2.1636985823032333, + "grad_norm": 0.3194504375555187, + "learning_rate": 2.1857981088482825e-05, + "loss": 2.6624, + "step": 46474 + }, + { + "epoch": 2.1637451404893264, + "grad_norm": 0.34305947960100863, + 
"learning_rate": 2.1855742181298444e-05, + "loss": 2.6439, + "step": 46475 + }, + { + "epoch": 2.1637916986754195, + "grad_norm": 0.3393668009957475, + "learning_rate": 2.185350335671472e-05, + "loss": 2.6107, + "step": 46476 + }, + { + "epoch": 2.1638382568615127, + "grad_norm": 0.3106443963717258, + "learning_rate": 2.1851264614738177e-05, + "loss": 2.6688, + "step": 46477 + }, + { + "epoch": 2.1638848150476058, + "grad_norm": 0.3705184756784701, + "learning_rate": 2.1849025955375395e-05, + "loss": 2.5992, + "step": 46478 + }, + { + "epoch": 2.163931373233699, + "grad_norm": 0.33646618915330057, + "learning_rate": 2.184678737863295e-05, + "loss": 2.6322, + "step": 46479 + }, + { + "epoch": 2.163977931419792, + "grad_norm": 0.32704819833436327, + "learning_rate": 2.1844548884517413e-05, + "loss": 2.6319, + "step": 46480 + }, + { + "epoch": 2.164024489605885, + "grad_norm": 0.31268271628922745, + "learning_rate": 2.1842310473035365e-05, + "loss": 2.5957, + "step": 46481 + }, + { + "epoch": 2.1640710477919782, + "grad_norm": 0.3398538281210473, + "learning_rate": 2.1840072144193347e-05, + "loss": 2.7299, + "step": 46482 + }, + { + "epoch": 2.164117605978071, + "grad_norm": 0.3253497442824889, + "learning_rate": 2.183783389799794e-05, + "loss": 2.5948, + "step": 46483 + }, + { + "epoch": 2.164164164164164, + "grad_norm": 0.33087662506913434, + "learning_rate": 2.1835595734455717e-05, + "loss": 2.5745, + "step": 46484 + }, + { + "epoch": 2.164210722350257, + "grad_norm": 0.3356825848532744, + "learning_rate": 2.183335765357325e-05, + "loss": 2.6656, + "step": 46485 + }, + { + "epoch": 2.1642572805363502, + "grad_norm": 0.3407117953831825, + "learning_rate": 2.1831119655357096e-05, + "loss": 2.6248, + "step": 46486 + }, + { + "epoch": 2.1643038387224434, + "grad_norm": 0.3267797561080217, + "learning_rate": 2.1828881739813855e-05, + "loss": 2.5892, + "step": 46487 + }, + { + "epoch": 2.1643503969085365, + "grad_norm": 0.3351720822826544, + "learning_rate": 2.1826643906950035e-05, + "loss": 2.634, + "step": 46488 + }, + { + "epoch": 2.1643969550946296, + "grad_norm": 0.3317017316314874, + "learning_rate": 2.182440615677227e-05, + "loss": 2.5626, + "step": 46489 + }, + { + "epoch": 2.1644435132807227, + "grad_norm": 0.33281295838762925, + "learning_rate": 2.1822168489287083e-05, + "loss": 2.6633, + "step": 46490 + }, + { + "epoch": 2.164490071466816, + "grad_norm": 0.30169884510143175, + "learning_rate": 2.1819930904501056e-05, + "loss": 2.6496, + "step": 46491 + }, + { + "epoch": 2.1645366296529085, + "grad_norm": 0.3365706254576916, + "learning_rate": 2.1817693402420753e-05, + "loss": 2.6981, + "step": 46492 + }, + { + "epoch": 2.1645831878390016, + "grad_norm": 0.3462380107155162, + "learning_rate": 2.1815455983052745e-05, + "loss": 2.6096, + "step": 46493 + }, + { + "epoch": 2.1646297460250947, + "grad_norm": 0.32902118105669964, + "learning_rate": 2.1813218646403614e-05, + "loss": 2.6944, + "step": 46494 + }, + { + "epoch": 2.164676304211188, + "grad_norm": 0.33418290067736073, + "learning_rate": 2.1810981392479875e-05, + "loss": 2.7467, + "step": 46495 + }, + { + "epoch": 2.164722862397281, + "grad_norm": 0.33712416887608415, + "learning_rate": 2.1808744221288164e-05, + "loss": 2.7837, + "step": 46496 + }, + { + "epoch": 2.164769420583374, + "grad_norm": 0.3241727935316518, + "learning_rate": 2.180650713283499e-05, + "loss": 2.6751, + "step": 46497 + }, + { + "epoch": 2.164815978769467, + "grad_norm": 0.30988314533294636, + "learning_rate": 2.1804270127126945e-05, + "loss": 2.6603, + 
"step": 46498 + }, + { + "epoch": 2.1648625369555603, + "grad_norm": 0.3408405153043279, + "learning_rate": 2.180203320417059e-05, + "loss": 2.6161, + "step": 46499 + }, + { + "epoch": 2.1649090951416534, + "grad_norm": 0.31765930730278624, + "learning_rate": 2.1799796363972507e-05, + "loss": 2.6739, + "step": 46500 + }, + { + "epoch": 2.1649556533277465, + "grad_norm": 0.33558466643321533, + "learning_rate": 2.1797559606539203e-05, + "loss": 2.6782, + "step": 46501 + }, + { + "epoch": 2.1650022115138396, + "grad_norm": 0.29659519139755836, + "learning_rate": 2.1795322931877322e-05, + "loss": 2.5809, + "step": 46502 + }, + { + "epoch": 2.1650487696999323, + "grad_norm": 0.31760679429588834, + "learning_rate": 2.1793086339993375e-05, + "loss": 2.6254, + "step": 46503 + }, + { + "epoch": 2.1650953278860254, + "grad_norm": 0.333322459562949, + "learning_rate": 2.1790849830893933e-05, + "loss": 2.6825, + "step": 46504 + }, + { + "epoch": 2.1651418860721185, + "grad_norm": 0.31346765856392556, + "learning_rate": 2.1788613404585574e-05, + "loss": 2.6874, + "step": 46505 + }, + { + "epoch": 2.1651884442582117, + "grad_norm": 0.33340655838365635, + "learning_rate": 2.1786377061074852e-05, + "loss": 2.657, + "step": 46506 + }, + { + "epoch": 2.165235002444305, + "grad_norm": 0.34176667315574916, + "learning_rate": 2.1784140800368354e-05, + "loss": 2.6325, + "step": 46507 + }, + { + "epoch": 2.165281560630398, + "grad_norm": 0.33609298030496526, + "learning_rate": 2.1781904622472587e-05, + "loss": 2.5839, + "step": 46508 + }, + { + "epoch": 2.165328118816491, + "grad_norm": 0.31931145492326896, + "learning_rate": 2.1779668527394185e-05, + "loss": 2.6633, + "step": 46509 + }, + { + "epoch": 2.165374677002584, + "grad_norm": 0.32384162223374, + "learning_rate": 2.1777432515139655e-05, + "loss": 2.6264, + "step": 46510 + }, + { + "epoch": 2.1654212351886772, + "grad_norm": 0.3469751859843908, + "learning_rate": 2.1775196585715585e-05, + "loss": 2.7043, + "step": 46511 + }, + { + "epoch": 2.16546779337477, + "grad_norm": 0.32912259247883824, + "learning_rate": 2.1772960739128533e-05, + "loss": 2.6001, + "step": 46512 + }, + { + "epoch": 2.165514351560863, + "grad_norm": 0.3243999161195978, + "learning_rate": 2.1770724975385053e-05, + "loss": 2.6317, + "step": 46513 + }, + { + "epoch": 2.165560909746956, + "grad_norm": 0.3347178664808283, + "learning_rate": 2.1768489294491722e-05, + "loss": 2.6697, + "step": 46514 + }, + { + "epoch": 2.1656074679330493, + "grad_norm": 0.3312910906245436, + "learning_rate": 2.1766253696455107e-05, + "loss": 2.5614, + "step": 46515 + }, + { + "epoch": 2.1656540261191424, + "grad_norm": 0.33657262811100086, + "learning_rate": 2.1764018181281744e-05, + "loss": 2.6572, + "step": 46516 + }, + { + "epoch": 2.1657005843052355, + "grad_norm": 0.32949921231770274, + "learning_rate": 2.17617827489782e-05, + "loss": 2.6348, + "step": 46517 + }, + { + "epoch": 2.1657471424913286, + "grad_norm": 0.3313381487227984, + "learning_rate": 2.1759547399551044e-05, + "loss": 2.6223, + "step": 46518 + }, + { + "epoch": 2.1657937006774217, + "grad_norm": 0.3414947950213653, + "learning_rate": 2.1757312133006835e-05, + "loss": 2.6717, + "step": 46519 + }, + { + "epoch": 2.165840258863515, + "grad_norm": 0.3580456426986719, + "learning_rate": 2.1755076949352148e-05, + "loss": 2.647, + "step": 46520 + }, + { + "epoch": 2.165886817049608, + "grad_norm": 0.31412311194011233, + "learning_rate": 2.1752841848593492e-05, + "loss": 2.6931, + "step": 46521 + }, + { + "epoch": 2.1659333752357006, + 
"grad_norm": 0.32210612782208015, + "learning_rate": 2.1750606830737498e-05, + "loss": 2.6292, + "step": 46522 + }, + { + "epoch": 2.1659799334217937, + "grad_norm": 0.34644726794389435, + "learning_rate": 2.1748371895790675e-05, + "loss": 2.6836, + "step": 46523 + }, + { + "epoch": 2.166026491607887, + "grad_norm": 0.3445134278218758, + "learning_rate": 2.1746137043759595e-05, + "loss": 2.651, + "step": 46524 + }, + { + "epoch": 2.16607304979398, + "grad_norm": 0.32065484128113575, + "learning_rate": 2.174390227465082e-05, + "loss": 2.5376, + "step": 46525 + }, + { + "epoch": 2.166119607980073, + "grad_norm": 0.34596339243389845, + "learning_rate": 2.1741667588470904e-05, + "loss": 2.7346, + "step": 46526 + }, + { + "epoch": 2.166166166166166, + "grad_norm": 0.3436727798781059, + "learning_rate": 2.1739432985226415e-05, + "loss": 2.5402, + "step": 46527 + }, + { + "epoch": 2.1662127243522593, + "grad_norm": 0.3639141533184067, + "learning_rate": 2.1737198464923924e-05, + "loss": 2.7054, + "step": 46528 + }, + { + "epoch": 2.1662592825383524, + "grad_norm": 0.36821733069548557, + "learning_rate": 2.173496402756995e-05, + "loss": 2.5981, + "step": 46529 + }, + { + "epoch": 2.1663058407244455, + "grad_norm": 0.3857542639838654, + "learning_rate": 2.1732729673171072e-05, + "loss": 2.7277, + "step": 46530 + }, + { + "epoch": 2.166352398910538, + "grad_norm": 0.34846023344466753, + "learning_rate": 2.1730495401733854e-05, + "loss": 2.6272, + "step": 46531 + }, + { + "epoch": 2.1663989570966313, + "grad_norm": 0.3402830005513971, + "learning_rate": 2.1728261213264837e-05, + "loss": 2.6085, + "step": 46532 + }, + { + "epoch": 2.1664455152827244, + "grad_norm": 0.348099960017291, + "learning_rate": 2.1726027107770613e-05, + "loss": 2.6034, + "step": 46533 + }, + { + "epoch": 2.1664920734688176, + "grad_norm": 0.32880234497959465, + "learning_rate": 2.1723793085257676e-05, + "loss": 2.555, + "step": 46534 + }, + { + "epoch": 2.1665386316549107, + "grad_norm": 0.34550426493811437, + "learning_rate": 2.1721559145732656e-05, + "loss": 2.6994, + "step": 46535 + }, + { + "epoch": 2.166585189841004, + "grad_norm": 0.35435374575469575, + "learning_rate": 2.171932528920206e-05, + "loss": 2.6742, + "step": 46536 + }, + { + "epoch": 2.166631748027097, + "grad_norm": 0.3358233541230124, + "learning_rate": 2.1717091515672457e-05, + "loss": 2.6596, + "step": 46537 + }, + { + "epoch": 2.16667830621319, + "grad_norm": 0.32923873057335745, + "learning_rate": 2.1714857825150397e-05, + "loss": 2.7206, + "step": 46538 + }, + { + "epoch": 2.166724864399283, + "grad_norm": 0.364072670211289, + "learning_rate": 2.1712624217642448e-05, + "loss": 2.6554, + "step": 46539 + }, + { + "epoch": 2.1667714225853762, + "grad_norm": 0.34428900810419066, + "learning_rate": 2.171039069315516e-05, + "loss": 2.6931, + "step": 46540 + }, + { + "epoch": 2.1668179807714694, + "grad_norm": 0.37753099111046734, + "learning_rate": 2.1708157251695104e-05, + "loss": 2.7086, + "step": 46541 + }, + { + "epoch": 2.166864538957562, + "grad_norm": 0.33694689680134177, + "learning_rate": 2.1705923893268808e-05, + "loss": 2.5967, + "step": 46542 + }, + { + "epoch": 2.166911097143655, + "grad_norm": 0.3570502186936444, + "learning_rate": 2.1703690617882833e-05, + "loss": 2.5869, + "step": 46543 + }, + { + "epoch": 2.1669576553297483, + "grad_norm": 0.35105117964178323, + "learning_rate": 2.1701457425543736e-05, + "loss": 2.6821, + "step": 46544 + }, + { + "epoch": 2.1670042135158414, + "grad_norm": 0.3584706657195021, + "learning_rate": 
2.1699224316258077e-05, + "loss": 2.6746, + "step": 46545 + }, + { + "epoch": 2.1670507717019345, + "grad_norm": 0.3701783339968033, + "learning_rate": 2.1696991290032427e-05, + "loss": 2.6422, + "step": 46546 + }, + { + "epoch": 2.1670973298880276, + "grad_norm": 0.35935942479189376, + "learning_rate": 2.169475834687328e-05, + "loss": 2.7321, + "step": 46547 + }, + { + "epoch": 2.1671438880741207, + "grad_norm": 0.32312781158624276, + "learning_rate": 2.1692525486787264e-05, + "loss": 2.6115, + "step": 46548 + }, + { + "epoch": 2.167190446260214, + "grad_norm": 0.34269467059858616, + "learning_rate": 2.1690292709780863e-05, + "loss": 2.501, + "step": 46549 + }, + { + "epoch": 2.167237004446307, + "grad_norm": 0.3619139052993151, + "learning_rate": 2.16880600158607e-05, + "loss": 2.5694, + "step": 46550 + }, + { + "epoch": 2.1672835626323996, + "grad_norm": 0.34780775123260854, + "learning_rate": 2.168582740503327e-05, + "loss": 2.6899, + "step": 46551 + }, + { + "epoch": 2.1673301208184927, + "grad_norm": 0.38396167030749173, + "learning_rate": 2.1683594877305153e-05, + "loss": 2.5055, + "step": 46552 + }, + { + "epoch": 2.167376679004586, + "grad_norm": 0.32746542976177884, + "learning_rate": 2.168136243268289e-05, + "loss": 2.5944, + "step": 46553 + }, + { + "epoch": 2.167423237190679, + "grad_norm": 0.33517093226541295, + "learning_rate": 2.167913007117306e-05, + "loss": 2.6485, + "step": 46554 + }, + { + "epoch": 2.167469795376772, + "grad_norm": 0.33657851773803904, + "learning_rate": 2.167689779278217e-05, + "loss": 2.6114, + "step": 46555 + }, + { + "epoch": 2.167516353562865, + "grad_norm": 0.3383243263567344, + "learning_rate": 2.1674665597516803e-05, + "loss": 2.5899, + "step": 46556 + }, + { + "epoch": 2.1675629117489583, + "grad_norm": 0.33873645065032454, + "learning_rate": 2.1672433485383492e-05, + "loss": 2.6978, + "step": 46557 + }, + { + "epoch": 2.1676094699350514, + "grad_norm": 0.34847514916706523, + "learning_rate": 2.1670201456388807e-05, + "loss": 2.6001, + "step": 46558 + }, + { + "epoch": 2.1676560281211446, + "grad_norm": 0.3249450652456433, + "learning_rate": 2.1667969510539305e-05, + "loss": 2.5188, + "step": 46559 + }, + { + "epoch": 2.1677025863072377, + "grad_norm": 0.35461506310467233, + "learning_rate": 2.1665737647841484e-05, + "loss": 2.6835, + "step": 46560 + }, + { + "epoch": 2.1677491444933303, + "grad_norm": 0.3509533429352578, + "learning_rate": 2.1663505868301965e-05, + "loss": 2.6817, + "step": 46561 + }, + { + "epoch": 2.1677957026794235, + "grad_norm": 0.3371501070636465, + "learning_rate": 2.1661274171927232e-05, + "loss": 2.7177, + "step": 46562 + }, + { + "epoch": 2.1678422608655166, + "grad_norm": 0.32407063960287624, + "learning_rate": 2.1659042558723903e-05, + "loss": 2.6702, + "step": 46563 + }, + { + "epoch": 2.1678888190516097, + "grad_norm": 0.33775534407768615, + "learning_rate": 2.1656811028698475e-05, + "loss": 2.6453, + "step": 46564 + }, + { + "epoch": 2.167935377237703, + "grad_norm": 0.36242037924223425, + "learning_rate": 2.1654579581857516e-05, + "loss": 2.7125, + "step": 46565 + }, + { + "epoch": 2.167981935423796, + "grad_norm": 0.3400406001745087, + "learning_rate": 2.165234821820757e-05, + "loss": 2.6169, + "step": 46566 + }, + { + "epoch": 2.168028493609889, + "grad_norm": 0.30912868183389364, + "learning_rate": 2.165011693775519e-05, + "loss": 2.5571, + "step": 46567 + }, + { + "epoch": 2.168075051795982, + "grad_norm": 0.36100964419464543, + "learning_rate": 2.1647885740506942e-05, + "loss": 2.7066, + "step": 46568 + }, 
+ { + "epoch": 2.1681216099820753, + "grad_norm": 0.33811756758586725, + "learning_rate": 2.1645654626469335e-05, + "loss": 2.6825, + "step": 46569 + }, + { + "epoch": 2.1681681681681684, + "grad_norm": 0.3437386071424178, + "learning_rate": 2.1643423595648944e-05, + "loss": 2.5588, + "step": 46570 + }, + { + "epoch": 2.168214726354261, + "grad_norm": 0.3624954604125975, + "learning_rate": 2.164119264805231e-05, + "loss": 2.6308, + "step": 46571 + }, + { + "epoch": 2.168261284540354, + "grad_norm": 0.34796077208686427, + "learning_rate": 2.1638961783685997e-05, + "loss": 2.6819, + "step": 46572 + }, + { + "epoch": 2.1683078427264473, + "grad_norm": 0.35837273988528096, + "learning_rate": 2.1636731002556505e-05, + "loss": 2.6327, + "step": 46573 + }, + { + "epoch": 2.1683544009125404, + "grad_norm": 0.3499167047003165, + "learning_rate": 2.1634500304670447e-05, + "loss": 2.5941, + "step": 46574 + }, + { + "epoch": 2.1684009590986335, + "grad_norm": 0.3493075552535972, + "learning_rate": 2.16322696900343e-05, + "loss": 2.6517, + "step": 46575 + }, + { + "epoch": 2.1684475172847266, + "grad_norm": 0.35589955973163745, + "learning_rate": 2.163003915865468e-05, + "loss": 2.603, + "step": 46576 + }, + { + "epoch": 2.1684940754708197, + "grad_norm": 0.3702320771259898, + "learning_rate": 2.162780871053809e-05, + "loss": 2.6142, + "step": 46577 + }, + { + "epoch": 2.168540633656913, + "grad_norm": 0.3650716733417069, + "learning_rate": 2.1625578345691078e-05, + "loss": 2.5648, + "step": 46578 + }, + { + "epoch": 2.168587191843006, + "grad_norm": 0.34836906599782286, + "learning_rate": 2.1623348064120202e-05, + "loss": 2.5043, + "step": 46579 + }, + { + "epoch": 2.168633750029099, + "grad_norm": 0.3569006673105259, + "learning_rate": 2.1621117865832002e-05, + "loss": 2.6106, + "step": 46580 + }, + { + "epoch": 2.1686803082151918, + "grad_norm": 0.36085221514826155, + "learning_rate": 2.161888775083303e-05, + "loss": 2.5878, + "step": 46581 + }, + { + "epoch": 2.168726866401285, + "grad_norm": 0.36966965785015826, + "learning_rate": 2.161665771912984e-05, + "loss": 2.625, + "step": 46582 + }, + { + "epoch": 2.168773424587378, + "grad_norm": 0.33474150676177816, + "learning_rate": 2.1614427770728945e-05, + "loss": 2.5536, + "step": 46583 + }, + { + "epoch": 2.168819982773471, + "grad_norm": 0.3723678078786352, + "learning_rate": 2.1612197905636912e-05, + "loss": 2.7063, + "step": 46584 + }, + { + "epoch": 2.168866540959564, + "grad_norm": 0.3548359724247934, + "learning_rate": 2.160996812386028e-05, + "loss": 2.5816, + "step": 46585 + }, + { + "epoch": 2.1689130991456573, + "grad_norm": 0.336310186220471, + "learning_rate": 2.1607738425405595e-05, + "loss": 2.6134, + "step": 46586 + }, + { + "epoch": 2.1689596573317504, + "grad_norm": 0.3190137627069932, + "learning_rate": 2.1605508810279412e-05, + "loss": 2.6223, + "step": 46587 + }, + { + "epoch": 2.1690062155178436, + "grad_norm": 0.3545110549671159, + "learning_rate": 2.1603279278488237e-05, + "loss": 2.7962, + "step": 46588 + }, + { + "epoch": 2.1690527737039367, + "grad_norm": 0.3177930308182817, + "learning_rate": 2.160104983003867e-05, + "loss": 2.6142, + "step": 46589 + }, + { + "epoch": 2.1690993318900293, + "grad_norm": 0.33730025071821257, + "learning_rate": 2.1598820464937207e-05, + "loss": 2.6774, + "step": 46590 + }, + { + "epoch": 2.1691458900761225, + "grad_norm": 0.3093144347035781, + "learning_rate": 2.1596591183190405e-05, + "loss": 2.6397, + "step": 46591 + }, + { + "epoch": 2.1691924482622156, + "grad_norm": 0.32434457773034137, 
+ "learning_rate": 2.159436198480482e-05, + "loss": 2.6042, + "step": 46592 + }, + { + "epoch": 2.1692390064483087, + "grad_norm": 0.3244209697859643, + "learning_rate": 2.1592132869786973e-05, + "loss": 2.65, + "step": 46593 + }, + { + "epoch": 2.169285564634402, + "grad_norm": 0.2892048944722343, + "learning_rate": 2.158990383814342e-05, + "loss": 2.5727, + "step": 46594 + }, + { + "epoch": 2.169332122820495, + "grad_norm": 0.30927733197700336, + "learning_rate": 2.158767488988072e-05, + "loss": 2.6828, + "step": 46595 + }, + { + "epoch": 2.169378681006588, + "grad_norm": 0.2891983347387023, + "learning_rate": 2.158544602500538e-05, + "loss": 2.6606, + "step": 46596 + }, + { + "epoch": 2.169425239192681, + "grad_norm": 0.3136586875212778, + "learning_rate": 2.158321724352395e-05, + "loss": 2.6942, + "step": 46597 + }, + { + "epoch": 2.1694717973787743, + "grad_norm": 0.3240676296052826, + "learning_rate": 2.1580988545442986e-05, + "loss": 2.7668, + "step": 46598 + }, + { + "epoch": 2.1695183555648674, + "grad_norm": 0.303160316304938, + "learning_rate": 2.1578759930769017e-05, + "loss": 2.6459, + "step": 46599 + }, + { + "epoch": 2.16956491375096, + "grad_norm": 0.3380440315692263, + "learning_rate": 2.1576531399508603e-05, + "loss": 2.6786, + "step": 46600 + }, + { + "epoch": 2.169611471937053, + "grad_norm": 0.32943971858881, + "learning_rate": 2.1574302951668235e-05, + "loss": 2.583, + "step": 46601 + }, + { + "epoch": 2.1696580301231463, + "grad_norm": 0.31212654170188886, + "learning_rate": 2.1572074587254527e-05, + "loss": 2.6041, + "step": 46602 + }, + { + "epoch": 2.1697045883092394, + "grad_norm": 0.3356947206233073, + "learning_rate": 2.1569846306273938e-05, + "loss": 2.6158, + "step": 46603 + }, + { + "epoch": 2.1697511464953325, + "grad_norm": 0.30496533407393817, + "learning_rate": 2.156761810873309e-05, + "loss": 2.5343, + "step": 46604 + }, + { + "epoch": 2.1697977046814256, + "grad_norm": 0.3172932047111991, + "learning_rate": 2.156538999463846e-05, + "loss": 2.6689, + "step": 46605 + }, + { + "epoch": 2.1698442628675187, + "grad_norm": 0.33011840631952344, + "learning_rate": 2.156316196399661e-05, + "loss": 2.6516, + "step": 46606 + }, + { + "epoch": 2.169890821053612, + "grad_norm": 0.32071268942968045, + "learning_rate": 2.1560934016814077e-05, + "loss": 2.6309, + "step": 46607 + }, + { + "epoch": 2.169937379239705, + "grad_norm": 0.34559413597966526, + "learning_rate": 2.1558706153097418e-05, + "loss": 2.7064, + "step": 46608 + }, + { + "epoch": 2.169983937425798, + "grad_norm": 0.2977618739449228, + "learning_rate": 2.155647837285314e-05, + "loss": 2.5651, + "step": 46609 + }, + { + "epoch": 2.1700304956118908, + "grad_norm": 0.32657018588651704, + "learning_rate": 2.1554250676087795e-05, + "loss": 2.6072, + "step": 46610 + }, + { + "epoch": 2.170077053797984, + "grad_norm": 0.356548168687348, + "learning_rate": 2.1552023062807926e-05, + "loss": 2.6292, + "step": 46611 + }, + { + "epoch": 2.170123611984077, + "grad_norm": 0.3370085040772457, + "learning_rate": 2.154979553302006e-05, + "loss": 2.6254, + "step": 46612 + }, + { + "epoch": 2.17017017017017, + "grad_norm": 0.3361807747569018, + "learning_rate": 2.1547568086730762e-05, + "loss": 2.6077, + "step": 46613 + }, + { + "epoch": 2.1702167283562632, + "grad_norm": 0.33676348614070717, + "learning_rate": 2.154534072394651e-05, + "loss": 2.5184, + "step": 46614 + }, + { + "epoch": 2.1702632865423563, + "grad_norm": 0.32440661066784965, + "learning_rate": 2.154311344467392e-05, + "loss": 2.6408, + "step": 46615 + }, 
+ { + "epoch": 2.1703098447284495, + "grad_norm": 0.3270544180401602, + "learning_rate": 2.154088624891945e-05, + "loss": 2.6752, + "step": 46616 + }, + { + "epoch": 2.1703564029145426, + "grad_norm": 0.311863359315502, + "learning_rate": 2.1538659136689714e-05, + "loss": 2.6551, + "step": 46617 + }, + { + "epoch": 2.1704029611006357, + "grad_norm": 0.34758013222014345, + "learning_rate": 2.153643210799119e-05, + "loss": 2.6327, + "step": 46618 + }, + { + "epoch": 2.170449519286729, + "grad_norm": 0.3525542536532281, + "learning_rate": 2.1534205162830427e-05, + "loss": 2.7009, + "step": 46619 + }, + { + "epoch": 2.1704960774728215, + "grad_norm": 0.3264268198470157, + "learning_rate": 2.153197830121398e-05, + "loss": 2.6926, + "step": 46620 + }, + { + "epoch": 2.1705426356589146, + "grad_norm": 0.3514693504436514, + "learning_rate": 2.1529751523148363e-05, + "loss": 2.5913, + "step": 46621 + }, + { + "epoch": 2.1705891938450077, + "grad_norm": 0.35577396245753246, + "learning_rate": 2.1527524828640138e-05, + "loss": 2.7438, + "step": 46622 + }, + { + "epoch": 2.170635752031101, + "grad_norm": 0.33580510537430497, + "learning_rate": 2.1525298217695812e-05, + "loss": 2.6298, + "step": 46623 + }, + { + "epoch": 2.170682310217194, + "grad_norm": 0.31141715199625186, + "learning_rate": 2.1523071690321926e-05, + "loss": 2.6487, + "step": 46624 + }, + { + "epoch": 2.170728868403287, + "grad_norm": 0.32616094039063215, + "learning_rate": 2.1520845246525018e-05, + "loss": 2.738, + "step": 46625 + }, + { + "epoch": 2.17077542658938, + "grad_norm": 0.3285529426885765, + "learning_rate": 2.151861888631164e-05, + "loss": 2.711, + "step": 46626 + }, + { + "epoch": 2.1708219847754733, + "grad_norm": 0.320450516869747, + "learning_rate": 2.151639260968828e-05, + "loss": 2.6331, + "step": 46627 + }, + { + "epoch": 2.1708685429615664, + "grad_norm": 0.30540468074180266, + "learning_rate": 2.1514166416661536e-05, + "loss": 2.6192, + "step": 46628 + }, + { + "epoch": 2.170915101147659, + "grad_norm": 0.32231164233981374, + "learning_rate": 2.1511940307237867e-05, + "loss": 2.5797, + "step": 46629 + }, + { + "epoch": 2.170961659333752, + "grad_norm": 0.3409049379171026, + "learning_rate": 2.1509714281423888e-05, + "loss": 2.6392, + "step": 46630 + }, + { + "epoch": 2.1710082175198453, + "grad_norm": 0.35509871722044034, + "learning_rate": 2.1507488339226068e-05, + "loss": 2.7286, + "step": 46631 + }, + { + "epoch": 2.1710547757059384, + "grad_norm": 0.3308435160751835, + "learning_rate": 2.1505262480650962e-05, + "loss": 2.6191, + "step": 46632 + }, + { + "epoch": 2.1711013338920315, + "grad_norm": 0.3493836354810654, + "learning_rate": 2.150303670570511e-05, + "loss": 2.6084, + "step": 46633 + }, + { + "epoch": 2.1711478920781246, + "grad_norm": 0.34776524676718557, + "learning_rate": 2.1500811014395033e-05, + "loss": 2.8304, + "step": 46634 + }, + { + "epoch": 2.1711944502642178, + "grad_norm": 0.35244468579852845, + "learning_rate": 2.1498585406727285e-05, + "loss": 2.6464, + "step": 46635 + }, + { + "epoch": 2.171241008450311, + "grad_norm": 0.33905122157802786, + "learning_rate": 2.1496359882708367e-05, + "loss": 2.6073, + "step": 46636 + }, + { + "epoch": 2.171287566636404, + "grad_norm": 0.3496101617028127, + "learning_rate": 2.149413444234482e-05, + "loss": 2.7098, + "step": 46637 + }, + { + "epoch": 2.171334124822497, + "grad_norm": 0.34218904798374083, + "learning_rate": 2.1491909085643188e-05, + "loss": 2.5935, + "step": 46638 + }, + { + "epoch": 2.17138068300859, + "grad_norm": 0.3306366658998298, 
+ "learning_rate": 2.1489683812609984e-05, + "loss": 2.6679, + "step": 46639 + }, + { + "epoch": 2.171427241194683, + "grad_norm": 0.3465075150991771, + "learning_rate": 2.1487458623251754e-05, + "loss": 2.7403, + "step": 46640 + }, + { + "epoch": 2.171473799380776, + "grad_norm": 0.35941396613529386, + "learning_rate": 2.1485233517575043e-05, + "loss": 2.6216, + "step": 46641 + }, + { + "epoch": 2.171520357566869, + "grad_norm": 0.3453709630025343, + "learning_rate": 2.1483008495586326e-05, + "loss": 2.6484, + "step": 46642 + }, + { + "epoch": 2.1715669157529622, + "grad_norm": 0.32531781322226866, + "learning_rate": 2.1480783557292205e-05, + "loss": 2.6253, + "step": 46643 + }, + { + "epoch": 2.1716134739390553, + "grad_norm": 0.34404043901911263, + "learning_rate": 2.147855870269916e-05, + "loss": 2.6453, + "step": 46644 + }, + { + "epoch": 2.1716600321251485, + "grad_norm": 0.35025314398248947, + "learning_rate": 2.1476333931813732e-05, + "loss": 2.6542, + "step": 46645 + }, + { + "epoch": 2.1717065903112416, + "grad_norm": 0.3617053465889264, + "learning_rate": 2.1474109244642454e-05, + "loss": 2.6617, + "step": 46646 + }, + { + "epoch": 2.1717531484973347, + "grad_norm": 0.3510666474078045, + "learning_rate": 2.1471884641191852e-05, + "loss": 2.646, + "step": 46647 + }, + { + "epoch": 2.171799706683428, + "grad_norm": 0.33361122765755674, + "learning_rate": 2.146966012146848e-05, + "loss": 2.682, + "step": 46648 + }, + { + "epoch": 2.1718462648695205, + "grad_norm": 0.3621621088704248, + "learning_rate": 2.1467435685478827e-05, + "loss": 2.5946, + "step": 46649 + }, + { + "epoch": 2.1718928230556136, + "grad_norm": 0.3552119796535606, + "learning_rate": 2.146521133322944e-05, + "loss": 2.6669, + "step": 46650 + }, + { + "epoch": 2.1719393812417067, + "grad_norm": 0.3382177937298911, + "learning_rate": 2.1462987064726848e-05, + "loss": 2.6454, + "step": 46651 + }, + { + "epoch": 2.1719859394278, + "grad_norm": 0.3309699227842034, + "learning_rate": 2.146076287997757e-05, + "loss": 2.7354, + "step": 46652 + }, + { + "epoch": 2.172032497613893, + "grad_norm": 0.35670282377021967, + "learning_rate": 2.1458538778988146e-05, + "loss": 2.6542, + "step": 46653 + }, + { + "epoch": 2.172079055799986, + "grad_norm": 0.32900593341746354, + "learning_rate": 2.1456314761765117e-05, + "loss": 2.6771, + "step": 46654 + }, + { + "epoch": 2.172125613986079, + "grad_norm": 0.3227983992180473, + "learning_rate": 2.145409082831496e-05, + "loss": 2.5853, + "step": 46655 + }, + { + "epoch": 2.1721721721721723, + "grad_norm": 0.33940587164912533, + "learning_rate": 2.1451866978644263e-05, + "loss": 2.6594, + "step": 46656 + }, + { + "epoch": 2.1722187303582654, + "grad_norm": 0.33007453609997234, + "learning_rate": 2.1449643212759514e-05, + "loss": 2.5748, + "step": 46657 + }, + { + "epoch": 2.1722652885443585, + "grad_norm": 0.346747339647911, + "learning_rate": 2.144741953066724e-05, + "loss": 2.5877, + "step": 46658 + }, + { + "epoch": 2.172311846730451, + "grad_norm": 0.32852182439906197, + "learning_rate": 2.1445195932373986e-05, + "loss": 2.724, + "step": 46659 + }, + { + "epoch": 2.1723584049165443, + "grad_norm": 0.36324608789105045, + "learning_rate": 2.1442972417886264e-05, + "loss": 2.5865, + "step": 46660 + }, + { + "epoch": 2.1724049631026374, + "grad_norm": 0.3613924635220983, + "learning_rate": 2.1440748987210623e-05, + "loss": 2.6378, + "step": 46661 + }, + { + "epoch": 2.1724515212887305, + "grad_norm": 0.3412865939045509, + "learning_rate": 2.143852564035355e-05, + "loss": 2.6608, + 
"step": 46662 + }, + { + "epoch": 2.1724980794748237, + "grad_norm": 0.34257027581229954, + "learning_rate": 2.1436302377321593e-05, + "loss": 2.5888, + "step": 46663 + }, + { + "epoch": 2.1725446376609168, + "grad_norm": 0.3455108762375882, + "learning_rate": 2.1434079198121275e-05, + "loss": 2.5649, + "step": 46664 + }, + { + "epoch": 2.17259119584701, + "grad_norm": 0.3418900439989944, + "learning_rate": 2.1431856102759117e-05, + "loss": 2.6806, + "step": 46665 + }, + { + "epoch": 2.172637754033103, + "grad_norm": 0.34721769276126146, + "learning_rate": 2.1429633091241653e-05, + "loss": 2.6846, + "step": 46666 + }, + { + "epoch": 2.172684312219196, + "grad_norm": 0.3482868575615322, + "learning_rate": 2.1427410163575416e-05, + "loss": 2.6233, + "step": 46667 + }, + { + "epoch": 2.172730870405289, + "grad_norm": 0.33250777129322456, + "learning_rate": 2.1425187319766875e-05, + "loss": 2.574, + "step": 46668 + }, + { + "epoch": 2.172777428591382, + "grad_norm": 0.3373106009009961, + "learning_rate": 2.1422964559822638e-05, + "loss": 2.6259, + "step": 46669 + }, + { + "epoch": 2.172823986777475, + "grad_norm": 0.33571038791293967, + "learning_rate": 2.142074188374916e-05, + "loss": 2.5304, + "step": 46670 + }, + { + "epoch": 2.172870544963568, + "grad_norm": 0.33473432958351834, + "learning_rate": 2.1418519291552996e-05, + "loss": 2.5461, + "step": 46671 + }, + { + "epoch": 2.1729171031496612, + "grad_norm": 0.3272089145556989, + "learning_rate": 2.141629678324066e-05, + "loss": 2.7367, + "step": 46672 + }, + { + "epoch": 2.1729636613357544, + "grad_norm": 0.3250949094215258, + "learning_rate": 2.141407435881867e-05, + "loss": 2.6543, + "step": 46673 + }, + { + "epoch": 2.1730102195218475, + "grad_norm": 0.3159915886583856, + "learning_rate": 2.1411852018293583e-05, + "loss": 2.6175, + "step": 46674 + }, + { + "epoch": 2.1730567777079406, + "grad_norm": 0.3176000822802088, + "learning_rate": 2.1409629761671852e-05, + "loss": 2.6415, + "step": 46675 + }, + { + "epoch": 2.1731033358940337, + "grad_norm": 0.3345257981537934, + "learning_rate": 2.1407407588960077e-05, + "loss": 2.59, + "step": 46676 + }, + { + "epoch": 2.173149894080127, + "grad_norm": 0.33184268662442923, + "learning_rate": 2.1405185500164728e-05, + "loss": 2.651, + "step": 46677 + }, + { + "epoch": 2.17319645226622, + "grad_norm": 0.33929328179906226, + "learning_rate": 2.1402963495292345e-05, + "loss": 2.6794, + "step": 46678 + }, + { + "epoch": 2.1732430104523126, + "grad_norm": 0.33408136409092426, + "learning_rate": 2.1400741574349442e-05, + "loss": 2.5874, + "step": 46679 + }, + { + "epoch": 2.1732895686384057, + "grad_norm": 0.34643121652576897, + "learning_rate": 2.1398519737342547e-05, + "loss": 2.7116, + "step": 46680 + }, + { + "epoch": 2.173336126824499, + "grad_norm": 0.3585228337641491, + "learning_rate": 2.139629798427818e-05, + "loss": 2.6942, + "step": 46681 + }, + { + "epoch": 2.173382685010592, + "grad_norm": 0.32494991074887186, + "learning_rate": 2.1394076315162874e-05, + "loss": 2.6515, + "step": 46682 + }, + { + "epoch": 2.173429243196685, + "grad_norm": 0.3298342846304133, + "learning_rate": 2.1391854730003123e-05, + "loss": 2.5867, + "step": 46683 + }, + { + "epoch": 2.173475801382778, + "grad_norm": 0.35368909953531, + "learning_rate": 2.1389633228805455e-05, + "loss": 2.7226, + "step": 46684 + }, + { + "epoch": 2.1735223595688713, + "grad_norm": 0.33506221201049385, + "learning_rate": 2.1387411811576404e-05, + "loss": 2.6105, + "step": 46685 + }, + { + "epoch": 2.1735689177549644, + "grad_norm": 
0.35492700290662954, + "learning_rate": 2.138519047832247e-05, + "loss": 2.6606, + "step": 46686 + }, + { + "epoch": 2.1736154759410575, + "grad_norm": 0.32793375106480244, + "learning_rate": 2.1382969229050205e-05, + "loss": 2.6508, + "step": 46687 + }, + { + "epoch": 2.17366203412715, + "grad_norm": 0.36902807695314477, + "learning_rate": 2.138074806376607e-05, + "loss": 2.6626, + "step": 46688 + }, + { + "epoch": 2.1737085923132433, + "grad_norm": 0.317832174457281, + "learning_rate": 2.137852698247666e-05, + "loss": 2.5452, + "step": 46689 + }, + { + "epoch": 2.1737551504993364, + "grad_norm": 0.3470073135113433, + "learning_rate": 2.1376305985188427e-05, + "loss": 2.6572, + "step": 46690 + }, + { + "epoch": 2.1738017086854295, + "grad_norm": 0.3267413969788196, + "learning_rate": 2.1374085071907922e-05, + "loss": 2.6663, + "step": 46691 + }, + { + "epoch": 2.1738482668715227, + "grad_norm": 0.35016652279301597, + "learning_rate": 2.137186424264166e-05, + "loss": 2.6726, + "step": 46692 + }, + { + "epoch": 2.1738948250576158, + "grad_norm": 0.33019173007942026, + "learning_rate": 2.136964349739615e-05, + "loss": 2.5705, + "step": 46693 + }, + { + "epoch": 2.173941383243709, + "grad_norm": 0.33899719713812976, + "learning_rate": 2.136742283617792e-05, + "loss": 2.6013, + "step": 46694 + }, + { + "epoch": 2.173987941429802, + "grad_norm": 0.3662398750835828, + "learning_rate": 2.13652022589935e-05, + "loss": 2.681, + "step": 46695 + }, + { + "epoch": 2.174034499615895, + "grad_norm": 0.3403014708385702, + "learning_rate": 2.1362981765849373e-05, + "loss": 2.7141, + "step": 46696 + }, + { + "epoch": 2.1740810578019882, + "grad_norm": 0.3361048875472721, + "learning_rate": 2.1360761356752067e-05, + "loss": 2.6014, + "step": 46697 + }, + { + "epoch": 2.174127615988081, + "grad_norm": 0.3555470109897755, + "learning_rate": 2.1358541031708114e-05, + "loss": 2.611, + "step": 46698 + }, + { + "epoch": 2.174174174174174, + "grad_norm": 0.36751090501048483, + "learning_rate": 2.1356320790724015e-05, + "loss": 2.5894, + "step": 46699 + }, + { + "epoch": 2.174220732360267, + "grad_norm": 0.3407389598032371, + "learning_rate": 2.135410063380631e-05, + "loss": 2.602, + "step": 46700 + }, + { + "epoch": 2.1742672905463603, + "grad_norm": 0.3315084912003403, + "learning_rate": 2.1351880560961467e-05, + "loss": 2.5664, + "step": 46701 + }, + { + "epoch": 2.1743138487324534, + "grad_norm": 0.3279865624486664, + "learning_rate": 2.134966057219607e-05, + "loss": 2.5989, + "step": 46702 + }, + { + "epoch": 2.1743604069185465, + "grad_norm": 0.3368647540788749, + "learning_rate": 2.134744066751657e-05, + "loss": 2.6035, + "step": 46703 + }, + { + "epoch": 2.1744069651046396, + "grad_norm": 0.3233595102958008, + "learning_rate": 2.1345220846929516e-05, + "loss": 2.7384, + "step": 46704 + }, + { + "epoch": 2.1744535232907327, + "grad_norm": 0.3171540062844592, + "learning_rate": 2.1343001110441412e-05, + "loss": 2.6861, + "step": 46705 + }, + { + "epoch": 2.174500081476826, + "grad_norm": 0.33509915437841975, + "learning_rate": 2.1340781458058777e-05, + "loss": 2.6752, + "step": 46706 + }, + { + "epoch": 2.1745466396629185, + "grad_norm": 0.31442115224860023, + "learning_rate": 2.1338561889788127e-05, + "loss": 2.6444, + "step": 46707 + }, + { + "epoch": 2.1745931978490116, + "grad_norm": 0.3266592121026152, + "learning_rate": 2.133634240563599e-05, + "loss": 2.6205, + "step": 46708 + }, + { + "epoch": 2.1746397560351047, + "grad_norm": 0.35023595801667745, + "learning_rate": 2.1334123005608848e-05, + "loss": 
2.6364, + "step": 46709 + }, + { + "epoch": 2.174686314221198, + "grad_norm": 0.327848260129001, + "learning_rate": 2.133190368971323e-05, + "loss": 2.7285, + "step": 46710 + }, + { + "epoch": 2.174732872407291, + "grad_norm": 0.3114155383664102, + "learning_rate": 2.132968445795565e-05, + "loss": 2.6666, + "step": 46711 + }, + { + "epoch": 2.174779430593384, + "grad_norm": 0.3432447821872438, + "learning_rate": 2.132746531034262e-05, + "loss": 2.6144, + "step": 46712 + }, + { + "epoch": 2.174825988779477, + "grad_norm": 0.3307436329456461, + "learning_rate": 2.1325246246880674e-05, + "loss": 2.6517, + "step": 46713 + }, + { + "epoch": 2.1748725469655703, + "grad_norm": 0.32796725942727467, + "learning_rate": 2.1323027267576274e-05, + "loss": 2.5348, + "step": 46714 + }, + { + "epoch": 2.1749191051516634, + "grad_norm": 0.32841238835624664, + "learning_rate": 2.1320808372435995e-05, + "loss": 2.6873, + "step": 46715 + }, + { + "epoch": 2.1749656633377565, + "grad_norm": 0.3399913012911921, + "learning_rate": 2.1318589561466285e-05, + "loss": 2.5949, + "step": 46716 + }, + { + "epoch": 2.1750122215238497, + "grad_norm": 0.33081227853209433, + "learning_rate": 2.1316370834673723e-05, + "loss": 2.5134, + "step": 46717 + }, + { + "epoch": 2.1750587797099423, + "grad_norm": 0.35663053073334905, + "learning_rate": 2.1314152192064767e-05, + "loss": 2.5624, + "step": 46718 + }, + { + "epoch": 2.1751053378960354, + "grad_norm": 0.35934981915908726, + "learning_rate": 2.1311933633645953e-05, + "loss": 2.6392, + "step": 46719 + }, + { + "epoch": 2.1751518960821286, + "grad_norm": 0.33871628011734856, + "learning_rate": 2.1309715159423782e-05, + "loss": 2.6785, + "step": 46720 + }, + { + "epoch": 2.1751984542682217, + "grad_norm": 0.37128776948611525, + "learning_rate": 2.1307496769404788e-05, + "loss": 2.5974, + "step": 46721 + }, + { + "epoch": 2.175245012454315, + "grad_norm": 0.3209747386551036, + "learning_rate": 2.130527846359545e-05, + "loss": 2.6697, + "step": 46722 + }, + { + "epoch": 2.175291570640408, + "grad_norm": 0.30417138281641193, + "learning_rate": 2.1303060242002294e-05, + "loss": 2.5694, + "step": 46723 + }, + { + "epoch": 2.175338128826501, + "grad_norm": 0.3322182444773798, + "learning_rate": 2.130084210463183e-05, + "loss": 2.6648, + "step": 46724 + }, + { + "epoch": 2.175384687012594, + "grad_norm": 0.32912997919368875, + "learning_rate": 2.1298624051490562e-05, + "loss": 2.6664, + "step": 46725 + }, + { + "epoch": 2.1754312451986872, + "grad_norm": 0.3444334249950659, + "learning_rate": 2.1296406082585023e-05, + "loss": 2.6603, + "step": 46726 + }, + { + "epoch": 2.17547780338478, + "grad_norm": 0.34004195399288734, + "learning_rate": 2.1294188197921673e-05, + "loss": 2.6346, + "step": 46727 + }, + { + "epoch": 2.175524361570873, + "grad_norm": 0.3272397069442502, + "learning_rate": 2.1291970397507087e-05, + "loss": 2.5484, + "step": 46728 + }, + { + "epoch": 2.175570919756966, + "grad_norm": 0.35393098929808314, + "learning_rate": 2.12897526813477e-05, + "loss": 2.6746, + "step": 46729 + }, + { + "epoch": 2.1756174779430593, + "grad_norm": 0.3403775034992476, + "learning_rate": 2.1287535049450103e-05, + "loss": 2.6254, + "step": 46730 + }, + { + "epoch": 2.1756640361291524, + "grad_norm": 0.37729165175137763, + "learning_rate": 2.128531750182074e-05, + "loss": 2.7097, + "step": 46731 + }, + { + "epoch": 2.1757105943152455, + "grad_norm": 0.31687213921416124, + "learning_rate": 2.128310003846614e-05, + "loss": 2.6334, + "step": 46732 + }, + { + "epoch": 2.1757571525013386, + 
"grad_norm": 0.3558789539739526, + "learning_rate": 2.1280882659392814e-05, + "loss": 2.595, + "step": 46733 + }, + { + "epoch": 2.1758037106874317, + "grad_norm": 0.3434386026628077, + "learning_rate": 2.127866536460727e-05, + "loss": 2.6691, + "step": 46734 + }, + { + "epoch": 2.175850268873525, + "grad_norm": 0.35517433852921254, + "learning_rate": 2.127644815411603e-05, + "loss": 2.646, + "step": 46735 + }, + { + "epoch": 2.175896827059618, + "grad_norm": 0.3362704677689393, + "learning_rate": 2.1274231027925567e-05, + "loss": 2.6993, + "step": 46736 + }, + { + "epoch": 2.1759433852457106, + "grad_norm": 0.3162934271540501, + "learning_rate": 2.1272013986042405e-05, + "loss": 2.5912, + "step": 46737 + }, + { + "epoch": 2.1759899434318037, + "grad_norm": 0.3467532960542798, + "learning_rate": 2.1269797028473053e-05, + "loss": 2.6837, + "step": 46738 + }, + { + "epoch": 2.176036501617897, + "grad_norm": 0.3366729632703032, + "learning_rate": 2.126758015522403e-05, + "loss": 2.696, + "step": 46739 + }, + { + "epoch": 2.17608305980399, + "grad_norm": 0.34147006553038806, + "learning_rate": 2.12653633663018e-05, + "loss": 2.6962, + "step": 46740 + }, + { + "epoch": 2.176129617990083, + "grad_norm": 0.3298353036111561, + "learning_rate": 2.126314666171293e-05, + "loss": 2.5389, + "step": 46741 + }, + { + "epoch": 2.176176176176176, + "grad_norm": 0.3431224697453628, + "learning_rate": 2.1260930041463856e-05, + "loss": 2.653, + "step": 46742 + }, + { + "epoch": 2.1762227343622693, + "grad_norm": 0.32669849626119307, + "learning_rate": 2.125871350556116e-05, + "loss": 2.4629, + "step": 46743 + }, + { + "epoch": 2.1762692925483624, + "grad_norm": 0.34895937617308675, + "learning_rate": 2.1256497054011287e-05, + "loss": 2.5145, + "step": 46744 + }, + { + "epoch": 2.1763158507344555, + "grad_norm": 0.32205871798029984, + "learning_rate": 2.1254280686820764e-05, + "loss": 2.6968, + "step": 46745 + }, + { + "epoch": 2.1763624089205487, + "grad_norm": 0.33452475876430476, + "learning_rate": 2.1252064403996102e-05, + "loss": 2.6499, + "step": 46746 + }, + { + "epoch": 2.1764089671066413, + "grad_norm": 0.3164465768623099, + "learning_rate": 2.1249848205543792e-05, + "loss": 2.5792, + "step": 46747 + }, + { + "epoch": 2.1764555252927344, + "grad_norm": 0.33573818803476985, + "learning_rate": 2.1247632091470367e-05, + "loss": 2.5865, + "step": 46748 + }, + { + "epoch": 2.1765020834788276, + "grad_norm": 0.33830168992833576, + "learning_rate": 2.1245416061782287e-05, + "loss": 2.6577, + "step": 46749 + }, + { + "epoch": 2.1765486416649207, + "grad_norm": 0.3457658692192059, + "learning_rate": 2.124320011648608e-05, + "loss": 2.5482, + "step": 46750 + }, + { + "epoch": 2.176595199851014, + "grad_norm": 0.34672936526293374, + "learning_rate": 2.1240984255588246e-05, + "loss": 2.7852, + "step": 46751 + }, + { + "epoch": 2.176641758037107, + "grad_norm": 0.338203258194364, + "learning_rate": 2.1238768479095296e-05, + "loss": 2.6197, + "step": 46752 + }, + { + "epoch": 2.1766883162232, + "grad_norm": 0.33912041780729435, + "learning_rate": 2.123655278701372e-05, + "loss": 2.6451, + "step": 46753 + }, + { + "epoch": 2.176734874409293, + "grad_norm": 0.3193712894683742, + "learning_rate": 2.123433717935005e-05, + "loss": 2.5726, + "step": 46754 + }, + { + "epoch": 2.1767814325953863, + "grad_norm": 0.3240206233455494, + "learning_rate": 2.1232121656110725e-05, + "loss": 2.6101, + "step": 46755 + }, + { + "epoch": 2.1768279907814794, + "grad_norm": 0.32841125668329674, + "learning_rate": 2.122990621730233e-05, 
+ "loss": 2.6186, + "step": 46756 + }, + { + "epoch": 2.176874548967572, + "grad_norm": 0.3322383504408295, + "learning_rate": 2.122769086293131e-05, + "loss": 2.6082, + "step": 46757 + }, + { + "epoch": 2.176921107153665, + "grad_norm": 0.34601277784537926, + "learning_rate": 2.122547559300418e-05, + "loss": 2.7022, + "step": 46758 + }, + { + "epoch": 2.1769676653397583, + "grad_norm": 0.3569705554460614, + "learning_rate": 2.122326040752744e-05, + "loss": 2.5932, + "step": 46759 + }, + { + "epoch": 2.1770142235258514, + "grad_norm": 0.36148605213580415, + "learning_rate": 2.1221045306507602e-05, + "loss": 2.6897, + "step": 46760 + }, + { + "epoch": 2.1770607817119445, + "grad_norm": 0.3398806420722163, + "learning_rate": 2.1218830289951175e-05, + "loss": 2.6414, + "step": 46761 + }, + { + "epoch": 2.1771073398980376, + "grad_norm": 0.34330280703210014, + "learning_rate": 2.1216615357864634e-05, + "loss": 2.5786, + "step": 46762 + }, + { + "epoch": 2.1771538980841307, + "grad_norm": 0.35312537448034387, + "learning_rate": 2.1214400510254485e-05, + "loss": 2.7078, + "step": 46763 + }, + { + "epoch": 2.177200456270224, + "grad_norm": 0.3599063081972913, + "learning_rate": 2.1212185747127233e-05, + "loss": 2.6221, + "step": 46764 + }, + { + "epoch": 2.177247014456317, + "grad_norm": 0.36850257870819075, + "learning_rate": 2.1209971068489382e-05, + "loss": 2.697, + "step": 46765 + }, + { + "epoch": 2.1772935726424096, + "grad_norm": 0.33787253442648163, + "learning_rate": 2.1207756474347436e-05, + "loss": 2.7336, + "step": 46766 + }, + { + "epoch": 2.1773401308285028, + "grad_norm": 0.338666743837561, + "learning_rate": 2.12055419647079e-05, + "loss": 2.6017, + "step": 46767 + }, + { + "epoch": 2.177386689014596, + "grad_norm": 0.3864183206068609, + "learning_rate": 2.120332753957722e-05, + "loss": 2.739, + "step": 46768 + }, + { + "epoch": 2.177433247200689, + "grad_norm": 0.3335823168079484, + "learning_rate": 2.120111319896198e-05, + "loss": 2.641, + "step": 46769 + }, + { + "epoch": 2.177479805386782, + "grad_norm": 0.3437762716665271, + "learning_rate": 2.1198898942868616e-05, + "loss": 2.5795, + "step": 46770 + }, + { + "epoch": 2.177526363572875, + "grad_norm": 0.38934436364315284, + "learning_rate": 2.1196684771303648e-05, + "loss": 2.5717, + "step": 46771 + }, + { + "epoch": 2.1775729217589683, + "grad_norm": 0.3532070053442903, + "learning_rate": 2.1194470684273577e-05, + "loss": 2.6845, + "step": 46772 + }, + { + "epoch": 2.1776194799450614, + "grad_norm": 0.3417366398150557, + "learning_rate": 2.119225668178489e-05, + "loss": 2.6488, + "step": 46773 + }, + { + "epoch": 2.1776660381311546, + "grad_norm": 0.3619415384837274, + "learning_rate": 2.119004276384411e-05, + "loss": 2.6917, + "step": 46774 + }, + { + "epoch": 2.1777125963172477, + "grad_norm": 0.34720662697515076, + "learning_rate": 2.1187828930457697e-05, + "loss": 2.613, + "step": 46775 + }, + { + "epoch": 2.1777591545033403, + "grad_norm": 0.3353526064143048, + "learning_rate": 2.1185615181632172e-05, + "loss": 2.6714, + "step": 46776 + }, + { + "epoch": 2.1778057126894335, + "grad_norm": 0.3661395664343305, + "learning_rate": 2.1183401517374023e-05, + "loss": 2.6013, + "step": 46777 + }, + { + "epoch": 2.1778522708755266, + "grad_norm": 0.3327967487288538, + "learning_rate": 2.1181187937689756e-05, + "loss": 2.6197, + "step": 46778 + }, + { + "epoch": 2.1778988290616197, + "grad_norm": 0.3544410006593998, + "learning_rate": 2.117897444258586e-05, + "loss": 2.6872, + "step": 46779 + }, + { + "epoch": 2.177945387247713, 
+ "grad_norm": 0.3512157787305418, + "learning_rate": 2.1176761032068854e-05, + "loss": 2.5879, + "step": 46780 + }, + { + "epoch": 2.177991945433806, + "grad_norm": 0.3249056104584976, + "learning_rate": 2.1174547706145182e-05, + "loss": 2.6097, + "step": 46781 + }, + { + "epoch": 2.178038503619899, + "grad_norm": 0.34623080786958055, + "learning_rate": 2.1172334464821402e-05, + "loss": 2.6291, + "step": 46782 + }, + { + "epoch": 2.178085061805992, + "grad_norm": 0.37397265256685414, + "learning_rate": 2.117012130810397e-05, + "loss": 2.7163, + "step": 46783 + }, + { + "epoch": 2.1781316199920853, + "grad_norm": 0.3221002395989538, + "learning_rate": 2.1167908235999384e-05, + "loss": 2.6193, + "step": 46784 + }, + { + "epoch": 2.1781781781781784, + "grad_norm": 0.34956141480943576, + "learning_rate": 2.116569524851415e-05, + "loss": 2.5534, + "step": 46785 + }, + { + "epoch": 2.178224736364271, + "grad_norm": 0.3435161596457371, + "learning_rate": 2.116348234565476e-05, + "loss": 2.6823, + "step": 46786 + }, + { + "epoch": 2.178271294550364, + "grad_norm": 0.3656779467609673, + "learning_rate": 2.116126952742773e-05, + "loss": 2.76, + "step": 46787 + }, + { + "epoch": 2.1783178527364573, + "grad_norm": 0.3271776387545695, + "learning_rate": 2.1159056793839493e-05, + "loss": 2.6104, + "step": 46788 + }, + { + "epoch": 2.1783644109225504, + "grad_norm": 0.36744187134724166, + "learning_rate": 2.115684414489662e-05, + "loss": 2.7453, + "step": 46789 + }, + { + "epoch": 2.1784109691086435, + "grad_norm": 0.33261848007655925, + "learning_rate": 2.115463158060555e-05, + "loss": 2.6487, + "step": 46790 + }, + { + "epoch": 2.1784575272947366, + "grad_norm": 0.34477507494967047, + "learning_rate": 2.115241910097279e-05, + "loss": 2.6932, + "step": 46791 + }, + { + "epoch": 2.1785040854808297, + "grad_norm": 0.3302504338406735, + "learning_rate": 2.1150206706004844e-05, + "loss": 2.7148, + "step": 46792 + }, + { + "epoch": 2.178550643666923, + "grad_norm": 0.33581127335890815, + "learning_rate": 2.1147994395708214e-05, + "loss": 2.6081, + "step": 46793 + }, + { + "epoch": 2.178597201853016, + "grad_norm": 0.32821795043326585, + "learning_rate": 2.1145782170089346e-05, + "loss": 2.6458, + "step": 46794 + }, + { + "epoch": 2.178643760039109, + "grad_norm": 0.34181041277826474, + "learning_rate": 2.1143570029154796e-05, + "loss": 2.7247, + "step": 46795 + }, + { + "epoch": 2.1786903182252018, + "grad_norm": 0.36975592279549824, + "learning_rate": 2.1141357972911008e-05, + "loss": 2.7196, + "step": 46796 + }, + { + "epoch": 2.178736876411295, + "grad_norm": 0.34185663004796685, + "learning_rate": 2.1139146001364497e-05, + "loss": 2.5543, + "step": 46797 + }, + { + "epoch": 2.178783434597388, + "grad_norm": 0.34418328217633687, + "learning_rate": 2.1136934114521745e-05, + "loss": 2.6002, + "step": 46798 + }, + { + "epoch": 2.178829992783481, + "grad_norm": 0.35197743772028744, + "learning_rate": 2.1134722312389244e-05, + "loss": 2.6674, + "step": 46799 + }, + { + "epoch": 2.178876550969574, + "grad_norm": 0.33237783602049886, + "learning_rate": 2.113251059497351e-05, + "loss": 2.6087, + "step": 46800 + }, + { + "epoch": 2.1789231091556673, + "grad_norm": 0.36694495679882067, + "learning_rate": 2.113029896228098e-05, + "loss": 2.7269, + "step": 46801 + }, + { + "epoch": 2.1789696673417605, + "grad_norm": 0.32069617219570207, + "learning_rate": 2.1128087414318214e-05, + "loss": 2.6213, + "step": 46802 + }, + { + "epoch": 2.1790162255278536, + "grad_norm": 0.3341737886842025, + "learning_rate": 
2.1125875951091645e-05, + "loss": 2.645, + "step": 46803 + }, + { + "epoch": 2.1790627837139467, + "grad_norm": 0.3628223198080124, + "learning_rate": 2.1123664572607788e-05, + "loss": 2.7424, + "step": 46804 + }, + { + "epoch": 2.1791093419000394, + "grad_norm": 0.33385135920673714, + "learning_rate": 2.1121453278873132e-05, + "loss": 2.5896, + "step": 46805 + }, + { + "epoch": 2.1791559000861325, + "grad_norm": 0.33058373234157884, + "learning_rate": 2.1119242069894168e-05, + "loss": 2.614, + "step": 46806 + }, + { + "epoch": 2.1792024582722256, + "grad_norm": 0.35168437158152993, + "learning_rate": 2.111703094567738e-05, + "loss": 2.6129, + "step": 46807 + }, + { + "epoch": 2.1792490164583187, + "grad_norm": 0.3546612074649505, + "learning_rate": 2.1114819906229276e-05, + "loss": 2.667, + "step": 46808 + }, + { + "epoch": 2.179295574644412, + "grad_norm": 0.34458926758366976, + "learning_rate": 2.111260895155632e-05, + "loss": 2.6009, + "step": 46809 + }, + { + "epoch": 2.179342132830505, + "grad_norm": 0.32317747453090345, + "learning_rate": 2.1110398081665e-05, + "loss": 2.5034, + "step": 46810 + }, + { + "epoch": 2.179388691016598, + "grad_norm": 0.331973364293666, + "learning_rate": 2.110818729656182e-05, + "loss": 2.5699, + "step": 46811 + }, + { + "epoch": 2.179435249202691, + "grad_norm": 0.3389530307400496, + "learning_rate": 2.110597659625327e-05, + "loss": 2.5956, + "step": 46812 + }, + { + "epoch": 2.1794818073887843, + "grad_norm": 0.33842079115368684, + "learning_rate": 2.1103765980745843e-05, + "loss": 2.6684, + "step": 46813 + }, + { + "epoch": 2.1795283655748774, + "grad_norm": 0.3214672356713173, + "learning_rate": 2.1101555450045977e-05, + "loss": 2.6889, + "step": 46814 + }, + { + "epoch": 2.1795749237609705, + "grad_norm": 0.31020620812990174, + "learning_rate": 2.1099345004160242e-05, + "loss": 2.5946, + "step": 46815 + }, + { + "epoch": 2.179621481947063, + "grad_norm": 0.356381915559845, + "learning_rate": 2.1097134643095062e-05, + "loss": 2.681, + "step": 46816 + }, + { + "epoch": 2.1796680401331563, + "grad_norm": 0.3327114192602464, + "learning_rate": 2.1094924366856945e-05, + "loss": 2.6459, + "step": 46817 + }, + { + "epoch": 2.1797145983192494, + "grad_norm": 0.3617036726244922, + "learning_rate": 2.1092714175452376e-05, + "loss": 2.6794, + "step": 46818 + }, + { + "epoch": 2.1797611565053425, + "grad_norm": 0.32694795717601016, + "learning_rate": 2.1090504068887846e-05, + "loss": 2.6469, + "step": 46819 + }, + { + "epoch": 2.1798077146914356, + "grad_norm": 0.3169364064850771, + "learning_rate": 2.1088294047169838e-05, + "loss": 2.6855, + "step": 46820 + }, + { + "epoch": 2.1798542728775288, + "grad_norm": 0.3309349533151896, + "learning_rate": 2.1086084110304853e-05, + "loss": 2.6638, + "step": 46821 + }, + { + "epoch": 2.179900831063622, + "grad_norm": 0.33854418668123887, + "learning_rate": 2.1083874258299345e-05, + "loss": 2.5508, + "step": 46822 + }, + { + "epoch": 2.179947389249715, + "grad_norm": 0.3365268968700058, + "learning_rate": 2.1081664491159823e-05, + "loss": 2.6016, + "step": 46823 + }, + { + "epoch": 2.179993947435808, + "grad_norm": 0.3556147241405869, + "learning_rate": 2.1079454808892762e-05, + "loss": 2.6007, + "step": 46824 + }, + { + "epoch": 2.1800405056219008, + "grad_norm": 0.34290589918735065, + "learning_rate": 2.1077245211504654e-05, + "loss": 2.6092, + "step": 46825 + }, + { + "epoch": 2.180087063807994, + "grad_norm": 0.3290343598250739, + "learning_rate": 2.1075035699002e-05, + "loss": 2.6018, + "step": 46826 + }, + { + 
"epoch": 2.180133621994087, + "grad_norm": 0.3634147820541416, + "learning_rate": 2.1072826271391227e-05, + "loss": 2.5904, + "step": 46827 + }, + { + "epoch": 2.18018018018018, + "grad_norm": 0.3666287302561745, + "learning_rate": 2.10706169286789e-05, + "loss": 2.6495, + "step": 46828 + }, + { + "epoch": 2.1802267383662732, + "grad_norm": 0.34279008862417765, + "learning_rate": 2.1068407670871442e-05, + "loss": 2.7474, + "step": 46829 + }, + { + "epoch": 2.1802732965523663, + "grad_norm": 0.34477970034485467, + "learning_rate": 2.1066198497975355e-05, + "loss": 2.7794, + "step": 46830 + }, + { + "epoch": 2.1803198547384595, + "grad_norm": 0.3514237586255285, + "learning_rate": 2.1063989409997127e-05, + "loss": 2.6115, + "step": 46831 + }, + { + "epoch": 2.1803664129245526, + "grad_norm": 0.3557640710896232, + "learning_rate": 2.1061780406943238e-05, + "loss": 2.5563, + "step": 46832 + }, + { + "epoch": 2.1804129711106457, + "grad_norm": 0.3351413721983593, + "learning_rate": 2.1059571488820178e-05, + "loss": 2.6489, + "step": 46833 + }, + { + "epoch": 2.180459529296739, + "grad_norm": 0.3294199822635607, + "learning_rate": 2.1057362655634437e-05, + "loss": 2.5396, + "step": 46834 + }, + { + "epoch": 2.1805060874828315, + "grad_norm": 0.34774687238095064, + "learning_rate": 2.105515390739247e-05, + "loss": 2.5987, + "step": 46835 + }, + { + "epoch": 2.1805526456689246, + "grad_norm": 0.33879620036711133, + "learning_rate": 2.1052945244100773e-05, + "loss": 2.5283, + "step": 46836 + }, + { + "epoch": 2.1805992038550177, + "grad_norm": 0.34418619302181297, + "learning_rate": 2.105073666576583e-05, + "loss": 2.6016, + "step": 46837 + }, + { + "epoch": 2.180645762041111, + "grad_norm": 0.3536800980885437, + "learning_rate": 2.1048528172394122e-05, + "loss": 2.6836, + "step": 46838 + }, + { + "epoch": 2.180692320227204, + "grad_norm": 0.31366912034236394, + "learning_rate": 2.1046319763992155e-05, + "loss": 2.6431, + "step": 46839 + }, + { + "epoch": 2.180738878413297, + "grad_norm": 0.34262021475291043, + "learning_rate": 2.1044111440566343e-05, + "loss": 2.5828, + "step": 46840 + }, + { + "epoch": 2.18078543659939, + "grad_norm": 0.34090093821152756, + "learning_rate": 2.1041903202123255e-05, + "loss": 2.7326, + "step": 46841 + }, + { + "epoch": 2.1808319947854833, + "grad_norm": 0.3112299577776899, + "learning_rate": 2.1039695048669287e-05, + "loss": 2.6562, + "step": 46842 + }, + { + "epoch": 2.1808785529715764, + "grad_norm": 0.33208282407693146, + "learning_rate": 2.1037486980211e-05, + "loss": 2.6037, + "step": 46843 + }, + { + "epoch": 2.180925111157669, + "grad_norm": 0.34839844647564994, + "learning_rate": 2.1035278996754814e-05, + "loss": 2.716, + "step": 46844 + }, + { + "epoch": 2.180971669343762, + "grad_norm": 0.3186698094561797, + "learning_rate": 2.1033071098307234e-05, + "loss": 2.6658, + "step": 46845 + }, + { + "epoch": 2.1810182275298553, + "grad_norm": 0.316656750823675, + "learning_rate": 2.1030863284874735e-05, + "loss": 2.5936, + "step": 46846 + }, + { + "epoch": 2.1810647857159484, + "grad_norm": 0.35020284082790365, + "learning_rate": 2.1028655556463815e-05, + "loss": 2.5625, + "step": 46847 + }, + { + "epoch": 2.1811113439020415, + "grad_norm": 0.3456545051952611, + "learning_rate": 2.1026447913080925e-05, + "loss": 2.6117, + "step": 46848 + }, + { + "epoch": 2.1811579020881346, + "grad_norm": 0.3322498419648286, + "learning_rate": 2.102424035473255e-05, + "loss": 2.7594, + "step": 46849 + }, + { + "epoch": 2.1812044602742278, + "grad_norm": 0.32882707903125785, + 
"learning_rate": 2.102203288142518e-05, + "loss": 2.6309, + "step": 46850 + }, + { + "epoch": 2.181251018460321, + "grad_norm": 0.3414076040493215, + "learning_rate": 2.1019825493165286e-05, + "loss": 2.7369, + "step": 46851 + }, + { + "epoch": 2.181297576646414, + "grad_norm": 0.32640001817564024, + "learning_rate": 2.1017618189959366e-05, + "loss": 2.6965, + "step": 46852 + }, + { + "epoch": 2.181344134832507, + "grad_norm": 0.3239903533508494, + "learning_rate": 2.1015410971813842e-05, + "loss": 2.6108, + "step": 46853 + }, + { + "epoch": 2.1813906930186002, + "grad_norm": 0.3243005650807661, + "learning_rate": 2.1013203838735272e-05, + "loss": 2.6441, + "step": 46854 + }, + { + "epoch": 2.181437251204693, + "grad_norm": 0.36013150549357537, + "learning_rate": 2.101099679073006e-05, + "loss": 2.6309, + "step": 46855 + }, + { + "epoch": 2.181483809390786, + "grad_norm": 0.3360412381028763, + "learning_rate": 2.1008789827804748e-05, + "loss": 2.5585, + "step": 46856 + }, + { + "epoch": 2.181530367576879, + "grad_norm": 0.3433668852011834, + "learning_rate": 2.1006582949965764e-05, + "loss": 2.6476, + "step": 46857 + }, + { + "epoch": 2.1815769257629722, + "grad_norm": 0.3589135246910188, + "learning_rate": 2.10043761572196e-05, + "loss": 2.6821, + "step": 46858 + }, + { + "epoch": 2.1816234839490654, + "grad_norm": 0.375719124308749, + "learning_rate": 2.100216944957274e-05, + "loss": 2.6914, + "step": 46859 + }, + { + "epoch": 2.1816700421351585, + "grad_norm": 0.3450019996692927, + "learning_rate": 2.0999962827031655e-05, + "loss": 2.6317, + "step": 46860 + }, + { + "epoch": 2.1817166003212516, + "grad_norm": 0.33560704755737614, + "learning_rate": 2.099775628960284e-05, + "loss": 2.5849, + "step": 46861 + }, + { + "epoch": 2.1817631585073447, + "grad_norm": 0.3375976047509517, + "learning_rate": 2.0995549837292732e-05, + "loss": 2.6213, + "step": 46862 + }, + { + "epoch": 2.181809716693438, + "grad_norm": 0.3441071884053346, + "learning_rate": 2.0993343470107835e-05, + "loss": 2.6565, + "step": 46863 + }, + { + "epoch": 2.1818562748795305, + "grad_norm": 0.3469721831832292, + "learning_rate": 2.099113718805461e-05, + "loss": 2.5306, + "step": 46864 + }, + { + "epoch": 2.1819028330656236, + "grad_norm": 0.34616482548571154, + "learning_rate": 2.098893099113956e-05, + "loss": 2.656, + "step": 46865 + }, + { + "epoch": 2.1819493912517167, + "grad_norm": 0.34903734642705403, + "learning_rate": 2.0986724879369107e-05, + "loss": 2.7546, + "step": 46866 + }, + { + "epoch": 2.18199594943781, + "grad_norm": 0.3377807321538894, + "learning_rate": 2.098451885274979e-05, + "loss": 2.6266, + "step": 46867 + }, + { + "epoch": 2.182042507623903, + "grad_norm": 0.34643249132159354, + "learning_rate": 2.0982312911288016e-05, + "loss": 2.7394, + "step": 46868 + }, + { + "epoch": 2.182089065809996, + "grad_norm": 0.3360904421799173, + "learning_rate": 2.0980107054990333e-05, + "loss": 2.6853, + "step": 46869 + }, + { + "epoch": 2.182135623996089, + "grad_norm": 0.3191325163181532, + "learning_rate": 2.0977901283863154e-05, + "loss": 2.6118, + "step": 46870 + }, + { + "epoch": 2.1821821821821823, + "grad_norm": 0.3379579167680549, + "learning_rate": 2.097569559791298e-05, + "loss": 2.5491, + "step": 46871 + }, + { + "epoch": 2.1822287403682754, + "grad_norm": 0.32697017393440714, + "learning_rate": 2.097348999714628e-05, + "loss": 2.5829, + "step": 46872 + }, + { + "epoch": 2.1822752985543685, + "grad_norm": 0.33257585190113237, + "learning_rate": 2.0971284481569527e-05, + "loss": 2.6277, + "step": 46873 
+ }, + { + "epoch": 2.182321856740461, + "grad_norm": 0.34263197204159784, + "learning_rate": 2.096907905118921e-05, + "loss": 2.5736, + "step": 46874 + }, + { + "epoch": 2.1823684149265543, + "grad_norm": 0.33642597943745, + "learning_rate": 2.096687370601177e-05, + "loss": 2.6567, + "step": 46875 + }, + { + "epoch": 2.1824149731126474, + "grad_norm": 0.33991074743383476, + "learning_rate": 2.0964668446043696e-05, + "loss": 2.6865, + "step": 46876 + }, + { + "epoch": 2.1824615312987405, + "grad_norm": 0.3387398172408062, + "learning_rate": 2.096246327129146e-05, + "loss": 2.6702, + "step": 46877 + }, + { + "epoch": 2.1825080894848337, + "grad_norm": 0.35249888950739783, + "learning_rate": 2.0960258181761532e-05, + "loss": 2.6395, + "step": 46878 + }, + { + "epoch": 2.1825546476709268, + "grad_norm": 0.3337571679725793, + "learning_rate": 2.0958053177460385e-05, + "loss": 2.558, + "step": 46879 + }, + { + "epoch": 2.18260120585702, + "grad_norm": 0.36185983512725917, + "learning_rate": 2.095584825839451e-05, + "loss": 2.7741, + "step": 46880 + }, + { + "epoch": 2.182647764043113, + "grad_norm": 0.3492256963299809, + "learning_rate": 2.0953643424570325e-05, + "loss": 2.5992, + "step": 46881 + }, + { + "epoch": 2.182694322229206, + "grad_norm": 0.34647319078608435, + "learning_rate": 2.0951438675994367e-05, + "loss": 2.6072, + "step": 46882 + }, + { + "epoch": 2.182740880415299, + "grad_norm": 0.35698598319812086, + "learning_rate": 2.094923401267306e-05, + "loss": 2.6655, + "step": 46883 + }, + { + "epoch": 2.182787438601392, + "grad_norm": 0.3154315801969938, + "learning_rate": 2.0947029434612887e-05, + "loss": 2.6483, + "step": 46884 + }, + { + "epoch": 2.182833996787485, + "grad_norm": 0.3699841569734703, + "learning_rate": 2.0944824941820317e-05, + "loss": 2.6245, + "step": 46885 + }, + { + "epoch": 2.182880554973578, + "grad_norm": 0.35911843202041843, + "learning_rate": 2.0942620534301827e-05, + "loss": 2.671, + "step": 46886 + }, + { + "epoch": 2.1829271131596713, + "grad_norm": 0.33455253915012434, + "learning_rate": 2.09404162120639e-05, + "loss": 2.6039, + "step": 46887 + }, + { + "epoch": 2.1829736713457644, + "grad_norm": 0.35328939311873503, + "learning_rate": 2.0938211975112974e-05, + "loss": 2.6169, + "step": 46888 + }, + { + "epoch": 2.1830202295318575, + "grad_norm": 0.37759417024309977, + "learning_rate": 2.0936007823455534e-05, + "loss": 2.6621, + "step": 46889 + }, + { + "epoch": 2.1830667877179506, + "grad_norm": 0.3391126527556465, + "learning_rate": 2.0933803757098043e-05, + "loss": 2.6941, + "step": 46890 + }, + { + "epoch": 2.1831133459040437, + "grad_norm": 0.34527964599940136, + "learning_rate": 2.0931599776046974e-05, + "loss": 2.523, + "step": 46891 + }, + { + "epoch": 2.183159904090137, + "grad_norm": 0.3419047348792756, + "learning_rate": 2.09293958803088e-05, + "loss": 2.6431, + "step": 46892 + }, + { + "epoch": 2.18320646227623, + "grad_norm": 0.32938365550581433, + "learning_rate": 2.092719206989e-05, + "loss": 2.6879, + "step": 46893 + }, + { + "epoch": 2.1832530204623226, + "grad_norm": 0.3453572505710583, + "learning_rate": 2.0924988344796993e-05, + "loss": 2.7147, + "step": 46894 + }, + { + "epoch": 2.1832995786484157, + "grad_norm": 0.32505510591000997, + "learning_rate": 2.0922784705036312e-05, + "loss": 2.5983, + "step": 46895 + }, + { + "epoch": 2.183346136834509, + "grad_norm": 0.32527605596707565, + "learning_rate": 2.0920581150614366e-05, + "loss": 2.62, + "step": 46896 + }, + { + "epoch": 2.183392695020602, + "grad_norm": 0.3327586043306671, + 
"learning_rate": 2.0918377681537678e-05, + "loss": 2.6129, + "step": 46897 + }, + { + "epoch": 2.183439253206695, + "grad_norm": 0.3228532677104234, + "learning_rate": 2.0916174297812675e-05, + "loss": 2.5864, + "step": 46898 + }, + { + "epoch": 2.183485811392788, + "grad_norm": 0.3242593641110123, + "learning_rate": 2.091397099944583e-05, + "loss": 2.5954, + "step": 46899 + }, + { + "epoch": 2.1835323695788813, + "grad_norm": 0.3415688598855419, + "learning_rate": 2.0911767786443615e-05, + "loss": 2.6834, + "step": 46900 + }, + { + "epoch": 2.1835789277649744, + "grad_norm": 0.31265884314674, + "learning_rate": 2.0909564658812515e-05, + "loss": 2.6865, + "step": 46901 + }, + { + "epoch": 2.1836254859510675, + "grad_norm": 0.3383817091262871, + "learning_rate": 2.0907361616558957e-05, + "loss": 2.5996, + "step": 46902 + }, + { + "epoch": 2.18367204413716, + "grad_norm": 0.38041507802611846, + "learning_rate": 2.090515865968943e-05, + "loss": 2.7398, + "step": 46903 + }, + { + "epoch": 2.1837186023232533, + "grad_norm": 0.33338078758635914, + "learning_rate": 2.0902955788210392e-05, + "loss": 2.657, + "step": 46904 + }, + { + "epoch": 2.1837651605093464, + "grad_norm": 0.3259965606646268, + "learning_rate": 2.0900753002128316e-05, + "loss": 2.6493, + "step": 46905 + }, + { + "epoch": 2.1838117186954396, + "grad_norm": 0.3433262071638509, + "learning_rate": 2.089855030144968e-05, + "loss": 2.5672, + "step": 46906 + }, + { + "epoch": 2.1838582768815327, + "grad_norm": 0.36079000565963054, + "learning_rate": 2.0896347686180896e-05, + "loss": 2.5531, + "step": 46907 + }, + { + "epoch": 2.183904835067626, + "grad_norm": 0.32909009496454994, + "learning_rate": 2.0894145156328503e-05, + "loss": 2.5967, + "step": 46908 + }, + { + "epoch": 2.183951393253719, + "grad_norm": 0.3399752361161694, + "learning_rate": 2.0891942711898887e-05, + "loss": 2.5579, + "step": 46909 + }, + { + "epoch": 2.183997951439812, + "grad_norm": 0.33913438432770243, + "learning_rate": 2.0889740352898592e-05, + "loss": 2.6074, + "step": 46910 + }, + { + "epoch": 2.184044509625905, + "grad_norm": 0.3450006956430074, + "learning_rate": 2.0887538079334014e-05, + "loss": 2.6687, + "step": 46911 + }, + { + "epoch": 2.1840910678119982, + "grad_norm": 0.34351997874678586, + "learning_rate": 2.088533589121165e-05, + "loss": 2.668, + "step": 46912 + }, + { + "epoch": 2.184137625998091, + "grad_norm": 0.34603989360143056, + "learning_rate": 2.0883133788537957e-05, + "loss": 2.6886, + "step": 46913 + }, + { + "epoch": 2.184184184184184, + "grad_norm": 0.35523094435504815, + "learning_rate": 2.0880931771319396e-05, + "loss": 2.7535, + "step": 46914 + }, + { + "epoch": 2.184230742370277, + "grad_norm": 0.3693026481888649, + "learning_rate": 2.087872983956245e-05, + "loss": 2.7181, + "step": 46915 + }, + { + "epoch": 2.1842773005563703, + "grad_norm": 0.347368467010872, + "learning_rate": 2.0876527993273548e-05, + "loss": 2.6135, + "step": 46916 + }, + { + "epoch": 2.1843238587424634, + "grad_norm": 0.35264830551567034, + "learning_rate": 2.0874326232459162e-05, + "loss": 2.5935, + "step": 46917 + }, + { + "epoch": 2.1843704169285565, + "grad_norm": 0.3611890482562359, + "learning_rate": 2.0872124557125762e-05, + "loss": 2.6234, + "step": 46918 + }, + { + "epoch": 2.1844169751146496, + "grad_norm": 0.3116381918328515, + "learning_rate": 2.0869922967279827e-05, + "loss": 2.6399, + "step": 46919 + }, + { + "epoch": 2.1844635333007427, + "grad_norm": 0.342998907460392, + "learning_rate": 2.086772146292776e-05, + "loss": 2.6892, + "step": 
46920 + }, + { + "epoch": 2.184510091486836, + "grad_norm": 0.37773989001816555, + "learning_rate": 2.086552004407609e-05, + "loss": 2.6955, + "step": 46921 + }, + { + "epoch": 2.1845566496729285, + "grad_norm": 0.33982097221589747, + "learning_rate": 2.086331871073122e-05, + "loss": 2.7008, + "step": 46922 + }, + { + "epoch": 2.1846032078590216, + "grad_norm": 0.32883447275873123, + "learning_rate": 2.0861117462899676e-05, + "loss": 2.6366, + "step": 46923 + }, + { + "epoch": 2.1846497660451147, + "grad_norm": 0.3730791428571668, + "learning_rate": 2.085891630058786e-05, + "loss": 2.7448, + "step": 46924 + }, + { + "epoch": 2.184696324231208, + "grad_norm": 0.3498252360733357, + "learning_rate": 2.085671522380226e-05, + "loss": 2.7216, + "step": 46925 + }, + { + "epoch": 2.184742882417301, + "grad_norm": 0.32784202701595416, + "learning_rate": 2.085451423254932e-05, + "loss": 2.6239, + "step": 46926 + }, + { + "epoch": 2.184789440603394, + "grad_norm": 0.33940355589412685, + "learning_rate": 2.0852313326835515e-05, + "loss": 2.5901, + "step": 46927 + }, + { + "epoch": 2.184835998789487, + "grad_norm": 0.3345691357140959, + "learning_rate": 2.0850112506667318e-05, + "loss": 2.5828, + "step": 46928 + }, + { + "epoch": 2.1848825569755803, + "grad_norm": 0.35581011205687646, + "learning_rate": 2.084791177205115e-05, + "loss": 2.7097, + "step": 46929 + }, + { + "epoch": 2.1849291151616734, + "grad_norm": 0.33737453212849866, + "learning_rate": 2.084571112299349e-05, + "loss": 2.7865, + "step": 46930 + }, + { + "epoch": 2.1849756733477665, + "grad_norm": 0.34945642939344107, + "learning_rate": 2.0843510559500795e-05, + "loss": 2.6022, + "step": 46931 + }, + { + "epoch": 2.1850222315338597, + "grad_norm": 0.3499095201919474, + "learning_rate": 2.0841310081579522e-05, + "loss": 2.5219, + "step": 46932 + }, + { + "epoch": 2.1850687897199523, + "grad_norm": 0.3372788662976942, + "learning_rate": 2.083910968923613e-05, + "loss": 2.7352, + "step": 46933 + }, + { + "epoch": 2.1851153479060454, + "grad_norm": 0.3256401568063854, + "learning_rate": 2.08369093824771e-05, + "loss": 2.4817, + "step": 46934 + }, + { + "epoch": 2.1851619060921386, + "grad_norm": 0.34548885979123645, + "learning_rate": 2.083470916130883e-05, + "loss": 2.7247, + "step": 46935 + }, + { + "epoch": 2.1852084642782317, + "grad_norm": 0.32869136551228684, + "learning_rate": 2.0832509025737857e-05, + "loss": 2.5915, + "step": 46936 + }, + { + "epoch": 2.185255022464325, + "grad_norm": 0.3165689878435178, + "learning_rate": 2.083030897577057e-05, + "loss": 2.6604, + "step": 46937 + }, + { + "epoch": 2.185301580650418, + "grad_norm": 0.3378822519447244, + "learning_rate": 2.0828109011413453e-05, + "loss": 2.661, + "step": 46938 + }, + { + "epoch": 2.185348138836511, + "grad_norm": 0.34689159651854845, + "learning_rate": 2.0825909132672967e-05, + "loss": 2.7188, + "step": 46939 + }, + { + "epoch": 2.185394697022604, + "grad_norm": 0.3096309137913261, + "learning_rate": 2.0823709339555562e-05, + "loss": 2.6211, + "step": 46940 + }, + { + "epoch": 2.1854412552086973, + "grad_norm": 0.3385716956298576, + "learning_rate": 2.0821509632067713e-05, + "loss": 2.623, + "step": 46941 + }, + { + "epoch": 2.18548781339479, + "grad_norm": 0.3349563240161927, + "learning_rate": 2.0819310010215844e-05, + "loss": 2.6005, + "step": 46942 + }, + { + "epoch": 2.185534371580883, + "grad_norm": 0.3240654998091023, + "learning_rate": 2.0817110474006425e-05, + "loss": 2.6687, + "step": 46943 + }, + { + "epoch": 2.185580929766976, + "grad_norm": 
0.348654217069139, + "learning_rate": 2.0814911023445905e-05, + "loss": 2.6788, + "step": 46944 + }, + { + "epoch": 2.1856274879530693, + "grad_norm": 0.33169070466325545, + "learning_rate": 2.0812711658540752e-05, + "loss": 2.6914, + "step": 46945 + }, + { + "epoch": 2.1856740461391624, + "grad_norm": 0.3275318263813225, + "learning_rate": 2.0810512379297413e-05, + "loss": 2.6125, + "step": 46946 + }, + { + "epoch": 2.1857206043252555, + "grad_norm": 0.3390304461959599, + "learning_rate": 2.0808313185722363e-05, + "loss": 2.595, + "step": 46947 + }, + { + "epoch": 2.1857671625113486, + "grad_norm": 0.34204156862721685, + "learning_rate": 2.0806114077822005e-05, + "loss": 2.7022, + "step": 46948 + }, + { + "epoch": 2.1858137206974417, + "grad_norm": 0.3417894865310102, + "learning_rate": 2.080391505560285e-05, + "loss": 2.5792, + "step": 46949 + }, + { + "epoch": 2.185860278883535, + "grad_norm": 0.3565248319352198, + "learning_rate": 2.0801716119071323e-05, + "loss": 2.6509, + "step": 46950 + }, + { + "epoch": 2.185906837069628, + "grad_norm": 0.34684975015067204, + "learning_rate": 2.0799517268233877e-05, + "loss": 2.6753, + "step": 46951 + }, + { + "epoch": 2.1859533952557206, + "grad_norm": 0.33827844613265057, + "learning_rate": 2.0797318503096975e-05, + "loss": 2.5929, + "step": 46952 + }, + { + "epoch": 2.1859999534418137, + "grad_norm": 0.3526540089538499, + "learning_rate": 2.0795119823667064e-05, + "loss": 2.6482, + "step": 46953 + }, + { + "epoch": 2.186046511627907, + "grad_norm": 0.3519670767087806, + "learning_rate": 2.0792921229950613e-05, + "loss": 2.6553, + "step": 46954 + }, + { + "epoch": 2.186093069814, + "grad_norm": 0.3607482009254837, + "learning_rate": 2.079072272195405e-05, + "loss": 2.6691, + "step": 46955 + }, + { + "epoch": 2.186139628000093, + "grad_norm": 0.35189592179913454, + "learning_rate": 2.0788524299683833e-05, + "loss": 2.6437, + "step": 46956 + }, + { + "epoch": 2.186186186186186, + "grad_norm": 0.34232220136246944, + "learning_rate": 2.0786325963146426e-05, + "loss": 2.6198, + "step": 46957 + }, + { + "epoch": 2.1862327443722793, + "grad_norm": 0.34340828264583473, + "learning_rate": 2.0784127712348268e-05, + "loss": 2.6293, + "step": 46958 + }, + { + "epoch": 2.1862793025583724, + "grad_norm": 0.3287350268045105, + "learning_rate": 2.0781929547295815e-05, + "loss": 2.6176, + "step": 46959 + }, + { + "epoch": 2.1863258607444656, + "grad_norm": 0.3325918570037546, + "learning_rate": 2.0779731467995544e-05, + "loss": 2.6244, + "step": 46960 + }, + { + "epoch": 2.1863724189305587, + "grad_norm": 0.34512816550706177, + "learning_rate": 2.0777533474453843e-05, + "loss": 2.5796, + "step": 46961 + }, + { + "epoch": 2.1864189771166513, + "grad_norm": 0.323378548110227, + "learning_rate": 2.077533556667724e-05, + "loss": 2.5562, + "step": 46962 + }, + { + "epoch": 2.1864655353027445, + "grad_norm": 0.3487220757515364, + "learning_rate": 2.0773137744672126e-05, + "loss": 2.7539, + "step": 46963 + }, + { + "epoch": 2.1865120934888376, + "grad_norm": 0.31987659538704083, + "learning_rate": 2.0770940008444977e-05, + "loss": 2.7406, + "step": 46964 + }, + { + "epoch": 2.1865586516749307, + "grad_norm": 0.35352667647105784, + "learning_rate": 2.076874235800224e-05, + "loss": 2.5851, + "step": 46965 + }, + { + "epoch": 2.186605209861024, + "grad_norm": 0.35170101365610373, + "learning_rate": 2.0766544793350366e-05, + "loss": 2.7013, + "step": 46966 + }, + { + "epoch": 2.186651768047117, + "grad_norm": 0.3287011286774124, + "learning_rate": 2.0764347314495817e-05, + 
"loss": 2.6067, + "step": 46967 + }, + { + "epoch": 2.18669832623321, + "grad_norm": 0.33526934432609146, + "learning_rate": 2.0762149921444995e-05, + "loss": 2.7599, + "step": 46968 + }, + { + "epoch": 2.186744884419303, + "grad_norm": 0.33559809164554444, + "learning_rate": 2.075995261420442e-05, + "loss": 2.6015, + "step": 46969 + }, + { + "epoch": 2.1867914426053963, + "grad_norm": 0.32648342913281725, + "learning_rate": 2.075775539278048e-05, + "loss": 2.554, + "step": 46970 + }, + { + "epoch": 2.1868380007914894, + "grad_norm": 0.3414835143689586, + "learning_rate": 2.0755558257179654e-05, + "loss": 2.4842, + "step": 46971 + }, + { + "epoch": 2.186884558977582, + "grad_norm": 0.3259816792368996, + "learning_rate": 2.0753361207408377e-05, + "loss": 2.6523, + "step": 46972 + }, + { + "epoch": 2.186931117163675, + "grad_norm": 0.3200126732049063, + "learning_rate": 2.0751164243473126e-05, + "loss": 2.6172, + "step": 46973 + }, + { + "epoch": 2.1869776753497683, + "grad_norm": 0.35852244984099246, + "learning_rate": 2.074896736538029e-05, + "loss": 2.7288, + "step": 46974 + }, + { + "epoch": 2.1870242335358614, + "grad_norm": 0.3448380201603596, + "learning_rate": 2.0746770573136388e-05, + "loss": 2.6336, + "step": 46975 + }, + { + "epoch": 2.1870707917219545, + "grad_norm": 0.3318167826818264, + "learning_rate": 2.074457386674782e-05, + "loss": 2.6336, + "step": 46976 + }, + { + "epoch": 2.1871173499080476, + "grad_norm": 0.35403028230399386, + "learning_rate": 2.0742377246221033e-05, + "loss": 2.5853, + "step": 46977 + }, + { + "epoch": 2.1871639080941407, + "grad_norm": 0.34077747995764845, + "learning_rate": 2.0740180711562496e-05, + "loss": 2.6622, + "step": 46978 + }, + { + "epoch": 2.187210466280234, + "grad_norm": 0.3483141670788723, + "learning_rate": 2.0737984262778643e-05, + "loss": 2.6141, + "step": 46979 + }, + { + "epoch": 2.187257024466327, + "grad_norm": 0.31480293091444694, + "learning_rate": 2.073578789987594e-05, + "loss": 2.6066, + "step": 46980 + }, + { + "epoch": 2.1873035826524196, + "grad_norm": 0.3414079136383067, + "learning_rate": 2.0733591622860788e-05, + "loss": 2.711, + "step": 46981 + }, + { + "epoch": 2.1873501408385128, + "grad_norm": 0.33449235104659913, + "learning_rate": 2.0731395431739692e-05, + "loss": 2.6967, + "step": 46982 + }, + { + "epoch": 2.187396699024606, + "grad_norm": 0.3400612765996399, + "learning_rate": 2.0729199326519043e-05, + "loss": 2.686, + "step": 46983 + }, + { + "epoch": 2.187443257210699, + "grad_norm": 0.34514651814799124, + "learning_rate": 2.0727003307205323e-05, + "loss": 2.57, + "step": 46984 + }, + { + "epoch": 2.187489815396792, + "grad_norm": 0.3304000480165785, + "learning_rate": 2.0724807373804955e-05, + "loss": 2.6166, + "step": 46985 + }, + { + "epoch": 2.187536373582885, + "grad_norm": 0.3319489508991911, + "learning_rate": 2.0722611526324397e-05, + "loss": 2.6774, + "step": 46986 + }, + { + "epoch": 2.1875829317689783, + "grad_norm": 0.33200276399022477, + "learning_rate": 2.0720415764770083e-05, + "loss": 2.6686, + "step": 46987 + }, + { + "epoch": 2.1876294899550714, + "grad_norm": 0.3374396941562484, + "learning_rate": 2.0718220089148492e-05, + "loss": 2.6802, + "step": 46988 + }, + { + "epoch": 2.1876760481411646, + "grad_norm": 0.3389974246440472, + "learning_rate": 2.071602449946602e-05, + "loss": 2.6474, + "step": 46989 + }, + { + "epoch": 2.1877226063272577, + "grad_norm": 0.31319021127050256, + "learning_rate": 2.071382899572913e-05, + "loss": 2.7053, + "step": 46990 + }, + { + "epoch": 2.187769164513351, 
+ "grad_norm": 0.3204494844652491, + "learning_rate": 2.071163357794426e-05, + "loss": 2.5562, + "step": 46991 + }, + { + "epoch": 2.1878157226994435, + "grad_norm": 0.3331691545259406, + "learning_rate": 2.0709438246117867e-05, + "loss": 2.7503, + "step": 46992 + }, + { + "epoch": 2.1878622808855366, + "grad_norm": 0.3174028703318373, + "learning_rate": 2.070724300025641e-05, + "loss": 2.5251, + "step": 46993 + }, + { + "epoch": 2.1879088390716297, + "grad_norm": 0.3528812751442833, + "learning_rate": 2.0705047840366264e-05, + "loss": 2.6876, + "step": 46994 + }, + { + "epoch": 2.187955397257723, + "grad_norm": 0.3500836277354571, + "learning_rate": 2.0702852766453958e-05, + "loss": 2.645, + "step": 46995 + }, + { + "epoch": 2.188001955443816, + "grad_norm": 0.3278402652389026, + "learning_rate": 2.0700657778525873e-05, + "loss": 2.6518, + "step": 46996 + }, + { + "epoch": 2.188048513629909, + "grad_norm": 0.36999418491774255, + "learning_rate": 2.069846287658847e-05, + "loss": 2.6993, + "step": 46997 + }, + { + "epoch": 2.188095071816002, + "grad_norm": 0.324917787242811, + "learning_rate": 2.0696268060648195e-05, + "loss": 2.6644, + "step": 46998 + }, + { + "epoch": 2.1881416300020953, + "grad_norm": 0.3287034221697167, + "learning_rate": 2.0694073330711483e-05, + "loss": 2.7067, + "step": 46999 + }, + { + "epoch": 2.1881881881881884, + "grad_norm": 0.35717414043188506, + "learning_rate": 2.0691878686784777e-05, + "loss": 2.6872, + "step": 47000 + }, + { + "epoch": 2.188234746374281, + "grad_norm": 0.3282044212198277, + "learning_rate": 2.0689684128874544e-05, + "loss": 2.5532, + "step": 47001 + }, + { + "epoch": 2.188281304560374, + "grad_norm": 0.33884629410625067, + "learning_rate": 2.0687489656987175e-05, + "loss": 2.6518, + "step": 47002 + }, + { + "epoch": 2.1883278627464673, + "grad_norm": 0.3056708057224338, + "learning_rate": 2.068529527112914e-05, + "loss": 2.6366, + "step": 47003 + }, + { + "epoch": 2.1883744209325604, + "grad_norm": 0.33750595155987717, + "learning_rate": 2.0683100971306872e-05, + "loss": 2.5836, + "step": 47004 + }, + { + "epoch": 2.1884209791186535, + "grad_norm": 0.34104899728213717, + "learning_rate": 2.068090675752682e-05, + "loss": 2.7228, + "step": 47005 + }, + { + "epoch": 2.1884675373047466, + "grad_norm": 0.3400632570843204, + "learning_rate": 2.0678712629795432e-05, + "loss": 2.5919, + "step": 47006 + }, + { + "epoch": 2.1885140954908398, + "grad_norm": 0.3866437402855246, + "learning_rate": 2.0676518588119102e-05, + "loss": 2.7245, + "step": 47007 + }, + { + "epoch": 2.188560653676933, + "grad_norm": 0.342045050871769, + "learning_rate": 2.067432463250434e-05, + "loss": 2.6318, + "step": 47008 + }, + { + "epoch": 2.188607211863026, + "grad_norm": 0.35552405008410704, + "learning_rate": 2.0672130762957504e-05, + "loss": 2.6265, + "step": 47009 + }, + { + "epoch": 2.188653770049119, + "grad_norm": 0.3644014373596987, + "learning_rate": 2.066993697948511e-05, + "loss": 2.5374, + "step": 47010 + }, + { + "epoch": 2.1887003282352118, + "grad_norm": 0.3592767506373737, + "learning_rate": 2.0667743282093545e-05, + "loss": 2.6731, + "step": 47011 + }, + { + "epoch": 2.188746886421305, + "grad_norm": 0.3601816443908871, + "learning_rate": 2.0665549670789265e-05, + "loss": 2.7092, + "step": 47012 + }, + { + "epoch": 2.188793444607398, + "grad_norm": 0.3870639657330343, + "learning_rate": 2.0663356145578705e-05, + "loss": 2.6961, + "step": 47013 + }, + { + "epoch": 2.188840002793491, + "grad_norm": 0.3580092941019882, + "learning_rate": 
2.066116270646832e-05, + "loss": 2.629, + "step": 47014 + }, + { + "epoch": 2.1888865609795842, + "grad_norm": 0.38046652925071583, + "learning_rate": 2.065896935346452e-05, + "loss": 2.643, + "step": 47015 + }, + { + "epoch": 2.1889331191656773, + "grad_norm": 0.3585861110723084, + "learning_rate": 2.0656776086573753e-05, + "loss": 2.5308, + "step": 47016 + }, + { + "epoch": 2.1889796773517705, + "grad_norm": 0.3682420027933735, + "learning_rate": 2.0654582905802456e-05, + "loss": 2.6633, + "step": 47017 + }, + { + "epoch": 2.1890262355378636, + "grad_norm": 0.33638111181367875, + "learning_rate": 2.065238981115707e-05, + "loss": 2.5596, + "step": 47018 + }, + { + "epoch": 2.1890727937239567, + "grad_norm": 0.3998878520116728, + "learning_rate": 2.065019680264404e-05, + "loss": 2.6437, + "step": 47019 + }, + { + "epoch": 2.1891193519100494, + "grad_norm": 0.3584485405693443, + "learning_rate": 2.0648003880269762e-05, + "loss": 2.6418, + "step": 47020 + }, + { + "epoch": 2.1891659100961425, + "grad_norm": 0.3818383660073917, + "learning_rate": 2.0645811044040738e-05, + "loss": 2.6403, + "step": 47021 + }, + { + "epoch": 2.1892124682822356, + "grad_norm": 0.3743133160586022, + "learning_rate": 2.0643618293963322e-05, + "loss": 2.7272, + "step": 47022 + }, + { + "epoch": 2.1892590264683287, + "grad_norm": 0.340271428685984, + "learning_rate": 2.064142563004404e-05, + "loss": 2.6581, + "step": 47023 + }, + { + "epoch": 2.189305584654422, + "grad_norm": 0.3857456920562117, + "learning_rate": 2.0639233052289252e-05, + "loss": 2.7552, + "step": 47024 + }, + { + "epoch": 2.189352142840515, + "grad_norm": 0.3533844835282714, + "learning_rate": 2.0637040560705435e-05, + "loss": 2.5799, + "step": 47025 + }, + { + "epoch": 2.189398701026608, + "grad_norm": 0.3456668213926079, + "learning_rate": 2.0634848155299004e-05, + "loss": 2.6706, + "step": 47026 + }, + { + "epoch": 2.189445259212701, + "grad_norm": 0.3578879530416657, + "learning_rate": 2.0632655836076397e-05, + "loss": 2.6302, + "step": 47027 + }, + { + "epoch": 2.1894918173987943, + "grad_norm": 0.38045763184541764, + "learning_rate": 2.0630463603044074e-05, + "loss": 2.6778, + "step": 47028 + }, + { + "epoch": 2.1895383755848874, + "grad_norm": 0.37425109446428134, + "learning_rate": 2.062827145620843e-05, + "loss": 2.6666, + "step": 47029 + }, + { + "epoch": 2.1895849337709805, + "grad_norm": 0.3812016817709522, + "learning_rate": 2.062607939557592e-05, + "loss": 2.6425, + "step": 47030 + }, + { + "epoch": 2.189631491957073, + "grad_norm": 0.3346270019331126, + "learning_rate": 2.0623887421152966e-05, + "loss": 2.6512, + "step": 47031 + }, + { + "epoch": 2.1896780501431663, + "grad_norm": 0.3485195423716327, + "learning_rate": 2.0621695532946035e-05, + "loss": 2.6702, + "step": 47032 + }, + { + "epoch": 2.1897246083292594, + "grad_norm": 0.3321589174711899, + "learning_rate": 2.0619503730961488e-05, + "loss": 2.6515, + "step": 47033 + }, + { + "epoch": 2.1897711665153525, + "grad_norm": 0.3664352420271139, + "learning_rate": 2.0617312015205843e-05, + "loss": 2.6911, + "step": 47034 + }, + { + "epoch": 2.1898177247014456, + "grad_norm": 0.3436052415247404, + "learning_rate": 2.0615120385685455e-05, + "loss": 2.6093, + "step": 47035 + }, + { + "epoch": 2.1898642828875388, + "grad_norm": 0.3390036781436174, + "learning_rate": 2.061292884240683e-05, + "loss": 2.5625, + "step": 47036 + }, + { + "epoch": 2.189910841073632, + "grad_norm": 0.33699264018392305, + "learning_rate": 2.061073738537635e-05, + "loss": 2.6959, + "step": 47037 + }, + { + 
"epoch": 2.189957399259725, + "grad_norm": 0.3780551542860465, + "learning_rate": 2.0608546014600456e-05, + "loss": 2.7164, + "step": 47038 + }, + { + "epoch": 2.190003957445818, + "grad_norm": 0.33156891224667756, + "learning_rate": 2.0606354730085586e-05, + "loss": 2.6213, + "step": 47039 + }, + { + "epoch": 2.1900505156319108, + "grad_norm": 0.3480920737661487, + "learning_rate": 2.0604163531838168e-05, + "loss": 2.5605, + "step": 47040 + }, + { + "epoch": 2.190097073818004, + "grad_norm": 0.3256168698350828, + "learning_rate": 2.060197241986465e-05, + "loss": 2.64, + "step": 47041 + }, + { + "epoch": 2.190143632004097, + "grad_norm": 0.31976063002590177, + "learning_rate": 2.059978139417143e-05, + "loss": 2.6515, + "step": 47042 + }, + { + "epoch": 2.19019019019019, + "grad_norm": 0.32798712390229906, + "learning_rate": 2.059759045476496e-05, + "loss": 2.6256, + "step": 47043 + }, + { + "epoch": 2.1902367483762832, + "grad_norm": 0.3458577919445384, + "learning_rate": 2.059539960165166e-05, + "loss": 2.7338, + "step": 47044 + }, + { + "epoch": 2.1902833065623764, + "grad_norm": 0.33436119937290737, + "learning_rate": 2.059320883483797e-05, + "loss": 2.6742, + "step": 47045 + }, + { + "epoch": 2.1903298647484695, + "grad_norm": 0.3659762836842456, + "learning_rate": 2.059101815433031e-05, + "loss": 2.6618, + "step": 47046 + }, + { + "epoch": 2.1903764229345626, + "grad_norm": 0.33777251758780585, + "learning_rate": 2.058882756013513e-05, + "loss": 2.6181, + "step": 47047 + }, + { + "epoch": 2.1904229811206557, + "grad_norm": 0.3537464666815509, + "learning_rate": 2.0586637052258817e-05, + "loss": 2.6732, + "step": 47048 + }, + { + "epoch": 2.190469539306749, + "grad_norm": 0.3348790128381065, + "learning_rate": 2.0584446630707856e-05, + "loss": 2.736, + "step": 47049 + }, + { + "epoch": 2.1905160974928415, + "grad_norm": 0.3418545724523828, + "learning_rate": 2.0582256295488627e-05, + "loss": 2.7026, + "step": 47050 + }, + { + "epoch": 2.1905626556789346, + "grad_norm": 0.34948636212207956, + "learning_rate": 2.058006604660758e-05, + "loss": 2.5816, + "step": 47051 + }, + { + "epoch": 2.1906092138650277, + "grad_norm": 0.35003526380941524, + "learning_rate": 2.0577875884071147e-05, + "loss": 2.5784, + "step": 47052 + }, + { + "epoch": 2.190655772051121, + "grad_norm": 0.35050769505218454, + "learning_rate": 2.0575685807885743e-05, + "loss": 2.6765, + "step": 47053 + }, + { + "epoch": 2.190702330237214, + "grad_norm": 0.3440357573380518, + "learning_rate": 2.057349581805782e-05, + "loss": 2.624, + "step": 47054 + }, + { + "epoch": 2.190748888423307, + "grad_norm": 0.33684274232829853, + "learning_rate": 2.057130591459377e-05, + "loss": 2.6038, + "step": 47055 + }, + { + "epoch": 2.1907954466094, + "grad_norm": 0.32773273235245065, + "learning_rate": 2.056911609750004e-05, + "loss": 2.5889, + "step": 47056 + }, + { + "epoch": 2.1908420047954933, + "grad_norm": 0.33523210331129566, + "learning_rate": 2.0566926366783053e-05, + "loss": 2.5718, + "step": 47057 + }, + { + "epoch": 2.1908885629815864, + "grad_norm": 0.3326373654494077, + "learning_rate": 2.056473672244924e-05, + "loss": 2.5378, + "step": 47058 + }, + { + "epoch": 2.190935121167679, + "grad_norm": 0.33659969583094923, + "learning_rate": 2.056254716450502e-05, + "loss": 2.6479, + "step": 47059 + }, + { + "epoch": 2.190981679353772, + "grad_norm": 0.3350496007949598, + "learning_rate": 2.0560357692956844e-05, + "loss": 2.6896, + "step": 47060 + }, + { + "epoch": 2.1910282375398653, + "grad_norm": 0.3279514048196611, + 
"learning_rate": 2.0558168307811087e-05, + "loss": 2.6028, + "step": 47061 + }, + { + "epoch": 2.1910747957259584, + "grad_norm": 0.3234813635165705, + "learning_rate": 2.0555979009074237e-05, + "loss": 2.6858, + "step": 47062 + }, + { + "epoch": 2.1911213539120515, + "grad_norm": 0.3541186677177915, + "learning_rate": 2.055378979675267e-05, + "loss": 2.6771, + "step": 47063 + }, + { + "epoch": 2.1911679120981447, + "grad_norm": 0.3300296379882951, + "learning_rate": 2.0551600670852827e-05, + "loss": 2.7232, + "step": 47064 + }, + { + "epoch": 2.1912144702842378, + "grad_norm": 0.31673758020036746, + "learning_rate": 2.0549411631381137e-05, + "loss": 2.6159, + "step": 47065 + }, + { + "epoch": 2.191261028470331, + "grad_norm": 0.32130504852915903, + "learning_rate": 2.0547222678344024e-05, + "loss": 2.6449, + "step": 47066 + }, + { + "epoch": 2.191307586656424, + "grad_norm": 0.34436113596194223, + "learning_rate": 2.0545033811747934e-05, + "loss": 2.679, + "step": 47067 + }, + { + "epoch": 2.191354144842517, + "grad_norm": 0.3129587240982609, + "learning_rate": 2.054284503159924e-05, + "loss": 2.5815, + "step": 47068 + }, + { + "epoch": 2.1914007030286102, + "grad_norm": 0.34064565284511233, + "learning_rate": 2.0540656337904395e-05, + "loss": 2.6278, + "step": 47069 + }, + { + "epoch": 2.191447261214703, + "grad_norm": 0.31955225550059335, + "learning_rate": 2.0538467730669824e-05, + "loss": 2.6657, + "step": 47070 + }, + { + "epoch": 2.191493819400796, + "grad_norm": 0.32582542308691803, + "learning_rate": 2.0536279209901948e-05, + "loss": 2.6993, + "step": 47071 + }, + { + "epoch": 2.191540377586889, + "grad_norm": 0.3241683447112627, + "learning_rate": 2.053409077560719e-05, + "loss": 2.6887, + "step": 47072 + }, + { + "epoch": 2.1915869357729822, + "grad_norm": 0.31043176424924723, + "learning_rate": 2.0531902427791992e-05, + "loss": 2.6807, + "step": 47073 + }, + { + "epoch": 2.1916334939590754, + "grad_norm": 0.29457144825541887, + "learning_rate": 2.0529714166462722e-05, + "loss": 2.5907, + "step": 47074 + }, + { + "epoch": 2.1916800521451685, + "grad_norm": 0.33371137146370256, + "learning_rate": 2.0527525991625867e-05, + "loss": 2.6806, + "step": 47075 + }, + { + "epoch": 2.1917266103312616, + "grad_norm": 0.32519254327527763, + "learning_rate": 2.0525337903287806e-05, + "loss": 2.6651, + "step": 47076 + }, + { + "epoch": 2.1917731685173547, + "grad_norm": 0.33019083821159567, + "learning_rate": 2.0523149901454974e-05, + "loss": 2.6505, + "step": 47077 + }, + { + "epoch": 2.191819726703448, + "grad_norm": 0.3354796053380791, + "learning_rate": 2.0520961986133798e-05, + "loss": 2.6926, + "step": 47078 + }, + { + "epoch": 2.1918662848895405, + "grad_norm": 0.3382750610389489, + "learning_rate": 2.0518774157330688e-05, + "loss": 2.6287, + "step": 47079 + }, + { + "epoch": 2.1919128430756336, + "grad_norm": 0.32882757375716287, + "learning_rate": 2.051658641505209e-05, + "loss": 2.5147, + "step": 47080 + }, + { + "epoch": 2.1919594012617267, + "grad_norm": 0.3119447723162429, + "learning_rate": 2.0514398759304372e-05, + "loss": 2.545, + "step": 47081 + }, + { + "epoch": 2.19200595944782, + "grad_norm": 0.34003657101473733, + "learning_rate": 2.051221119009402e-05, + "loss": 2.6949, + "step": 47082 + }, + { + "epoch": 2.192052517633913, + "grad_norm": 0.3339205396172487, + "learning_rate": 2.051002370742741e-05, + "loss": 2.6394, + "step": 47083 + }, + { + "epoch": 2.192099075820006, + "grad_norm": 0.33291070679537155, + "learning_rate": 2.050783631131098e-05, + "loss": 2.6258, + 
"step": 47084 + }, + { + "epoch": 2.192145634006099, + "grad_norm": 0.3137460666956571, + "learning_rate": 2.050564900175114e-05, + "loss": 2.5761, + "step": 47085 + }, + { + "epoch": 2.1921921921921923, + "grad_norm": 0.3421350186531215, + "learning_rate": 2.050346177875433e-05, + "loss": 2.6456, + "step": 47086 + }, + { + "epoch": 2.1922387503782854, + "grad_norm": 0.34387359231881964, + "learning_rate": 2.0501274642326922e-05, + "loss": 2.7729, + "step": 47087 + }, + { + "epoch": 2.1922853085643785, + "grad_norm": 0.3432184084798567, + "learning_rate": 2.0499087592475402e-05, + "loss": 2.5432, + "step": 47088 + }, + { + "epoch": 2.192331866750471, + "grad_norm": 0.321609582936105, + "learning_rate": 2.0496900629206133e-05, + "loss": 2.6762, + "step": 47089 + }, + { + "epoch": 2.1923784249365643, + "grad_norm": 0.33864502309352057, + "learning_rate": 2.049471375252556e-05, + "loss": 2.7873, + "step": 47090 + }, + { + "epoch": 2.1924249831226574, + "grad_norm": 0.3580557883074979, + "learning_rate": 2.0492526962440084e-05, + "loss": 2.6635, + "step": 47091 + }, + { + "epoch": 2.1924715413087505, + "grad_norm": 0.3710135646696266, + "learning_rate": 2.049034025895614e-05, + "loss": 2.6955, + "step": 47092 + }, + { + "epoch": 2.1925180994948437, + "grad_norm": 0.323041631617846, + "learning_rate": 2.048815364208016e-05, + "loss": 2.6326, + "step": 47093 + }, + { + "epoch": 2.192564657680937, + "grad_norm": 0.3600048578052878, + "learning_rate": 2.0485967111818506e-05, + "loss": 2.7498, + "step": 47094 + }, + { + "epoch": 2.19261121586703, + "grad_norm": 0.33935168991427844, + "learning_rate": 2.0483780668177665e-05, + "loss": 2.6422, + "step": 47095 + }, + { + "epoch": 2.192657774053123, + "grad_norm": 0.3546740258197864, + "learning_rate": 2.0481594311164005e-05, + "loss": 2.7119, + "step": 47096 + }, + { + "epoch": 2.192704332239216, + "grad_norm": 0.37714152805553847, + "learning_rate": 2.0479408040783953e-05, + "loss": 2.6935, + "step": 47097 + }, + { + "epoch": 2.192750890425309, + "grad_norm": 0.35404099243818704, + "learning_rate": 2.0477221857043937e-05, + "loss": 2.6724, + "step": 47098 + }, + { + "epoch": 2.192797448611402, + "grad_norm": 0.35146604952944577, + "learning_rate": 2.0475035759950363e-05, + "loss": 2.7147, + "step": 47099 + }, + { + "epoch": 2.192844006797495, + "grad_norm": 0.3509786559043791, + "learning_rate": 2.047284974950965e-05, + "loss": 2.6456, + "step": 47100 + }, + { + "epoch": 2.192890564983588, + "grad_norm": 0.36504100802806877, + "learning_rate": 2.047066382572823e-05, + "loss": 2.6625, + "step": 47101 + }, + { + "epoch": 2.1929371231696813, + "grad_norm": 0.36174451227794435, + "learning_rate": 2.0468477988612484e-05, + "loss": 2.6094, + "step": 47102 + }, + { + "epoch": 2.1929836813557744, + "grad_norm": 0.3345966190167338, + "learning_rate": 2.046629223816885e-05, + "loss": 2.6331, + "step": 47103 + }, + { + "epoch": 2.1930302395418675, + "grad_norm": 0.3455962704408874, + "learning_rate": 2.0464106574403734e-05, + "loss": 2.7165, + "step": 47104 + }, + { + "epoch": 2.1930767977279606, + "grad_norm": 0.38688668676977855, + "learning_rate": 2.0461920997323558e-05, + "loss": 2.7268, + "step": 47105 + }, + { + "epoch": 2.1931233559140537, + "grad_norm": 0.36624804396954114, + "learning_rate": 2.045973550693475e-05, + "loss": 2.6031, + "step": 47106 + }, + { + "epoch": 2.193169914100147, + "grad_norm": 0.3320581990161544, + "learning_rate": 2.045755010324368e-05, + "loss": 2.7321, + "step": 47107 + }, + { + "epoch": 2.19321647228624, + "grad_norm": 
0.347432623225336, + "learning_rate": 2.045536478625682e-05, + "loss": 2.6665, + "step": 47108 + }, + { + "epoch": 2.1932630304723326, + "grad_norm": 0.3668910099535728, + "learning_rate": 2.0453179555980533e-05, + "loss": 2.5974, + "step": 47109 + }, + { + "epoch": 2.1933095886584257, + "grad_norm": 0.33421635697406105, + "learning_rate": 2.0450994412421255e-05, + "loss": 2.6822, + "step": 47110 + }, + { + "epoch": 2.193356146844519, + "grad_norm": 0.3497478891876702, + "learning_rate": 2.0448809355585398e-05, + "loss": 2.6465, + "step": 47111 + }, + { + "epoch": 2.193402705030612, + "grad_norm": 0.3640719595785058, + "learning_rate": 2.0446624385479372e-05, + "loss": 2.6015, + "step": 47112 + }, + { + "epoch": 2.193449263216705, + "grad_norm": 0.3485328533016291, + "learning_rate": 2.0444439502109598e-05, + "loss": 2.6763, + "step": 47113 + }, + { + "epoch": 2.193495821402798, + "grad_norm": 0.3323753399804563, + "learning_rate": 2.0442254705482493e-05, + "loss": 2.6114, + "step": 47114 + }, + { + "epoch": 2.1935423795888913, + "grad_norm": 0.3777448758132244, + "learning_rate": 2.044006999560445e-05, + "loss": 2.6987, + "step": 47115 + }, + { + "epoch": 2.1935889377749844, + "grad_norm": 0.32518205889334684, + "learning_rate": 2.043788537248188e-05, + "loss": 2.699, + "step": 47116 + }, + { + "epoch": 2.1936354959610775, + "grad_norm": 0.33700342022525753, + "learning_rate": 2.0435700836121206e-05, + "loss": 2.7211, + "step": 47117 + }, + { + "epoch": 2.19368205414717, + "grad_norm": 0.3398627070585952, + "learning_rate": 2.0433516386528846e-05, + "loss": 2.5796, + "step": 47118 + }, + { + "epoch": 2.1937286123332633, + "grad_norm": 0.34115478376231423, + "learning_rate": 2.0431332023711213e-05, + "loss": 2.5692, + "step": 47119 + }, + { + "epoch": 2.1937751705193564, + "grad_norm": 0.3488947101561999, + "learning_rate": 2.042914774767467e-05, + "loss": 2.6446, + "step": 47120 + }, + { + "epoch": 2.1938217287054496, + "grad_norm": 0.3255680068244465, + "learning_rate": 2.0426963558425703e-05, + "loss": 2.6723, + "step": 47121 + }, + { + "epoch": 2.1938682868915427, + "grad_norm": 0.3217255662116868, + "learning_rate": 2.0424779455970667e-05, + "loss": 2.5947, + "step": 47122 + }, + { + "epoch": 2.193914845077636, + "grad_norm": 0.3568087113471391, + "learning_rate": 2.042259544031599e-05, + "loss": 2.6795, + "step": 47123 + }, + { + "epoch": 2.193961403263729, + "grad_norm": 0.3308024003175569, + "learning_rate": 2.0420411511468084e-05, + "loss": 2.6169, + "step": 47124 + }, + { + "epoch": 2.194007961449822, + "grad_norm": 0.33529255315815953, + "learning_rate": 2.0418227669433355e-05, + "loss": 2.5247, + "step": 47125 + }, + { + "epoch": 2.194054519635915, + "grad_norm": 0.3237596842841824, + "learning_rate": 2.0416043914218208e-05, + "loss": 2.6487, + "step": 47126 + }, + { + "epoch": 2.1941010778220082, + "grad_norm": 0.3299101417817996, + "learning_rate": 2.0413860245829082e-05, + "loss": 2.6154, + "step": 47127 + }, + { + "epoch": 2.194147636008101, + "grad_norm": 0.3258410052630785, + "learning_rate": 2.041167666427234e-05, + "loss": 2.7019, + "step": 47128 + }, + { + "epoch": 2.194194194194194, + "grad_norm": 0.348259157218838, + "learning_rate": 2.0409493169554413e-05, + "loss": 2.6282, + "step": 47129 + }, + { + "epoch": 2.194240752380287, + "grad_norm": 0.36901619714099443, + "learning_rate": 2.0407309761681708e-05, + "loss": 2.72, + "step": 47130 + }, + { + "epoch": 2.1942873105663803, + "grad_norm": 0.31895616383858216, + "learning_rate": 2.0405126440660634e-05, + "loss": 
2.5916, + "step": 47131 + }, + { + "epoch": 2.1943338687524734, + "grad_norm": 0.350639709428938, + "learning_rate": 2.0402943206497615e-05, + "loss": 2.6322, + "step": 47132 + }, + { + "epoch": 2.1943804269385665, + "grad_norm": 0.3513864649923675, + "learning_rate": 2.0400760059199007e-05, + "loss": 2.6151, + "step": 47133 + }, + { + "epoch": 2.1944269851246596, + "grad_norm": 0.322780478181381, + "learning_rate": 2.039857699877129e-05, + "loss": 2.6322, + "step": 47134 + }, + { + "epoch": 2.1944735433107527, + "grad_norm": 0.38599735276367053, + "learning_rate": 2.0396394025220793e-05, + "loss": 2.6408, + "step": 47135 + }, + { + "epoch": 2.194520101496846, + "grad_norm": 0.3050797293594951, + "learning_rate": 2.0394211138554003e-05, + "loss": 2.5818, + "step": 47136 + }, + { + "epoch": 2.194566659682939, + "grad_norm": 0.3667446035390138, + "learning_rate": 2.039202833877727e-05, + "loss": 2.6719, + "step": 47137 + }, + { + "epoch": 2.1946132178690316, + "grad_norm": 0.32864140046339757, + "learning_rate": 2.0389845625897014e-05, + "loss": 2.6347, + "step": 47138 + }, + { + "epoch": 2.1946597760551247, + "grad_norm": 0.3390808637000527, + "learning_rate": 2.0387662999919644e-05, + "loss": 2.6808, + "step": 47139 + }, + { + "epoch": 2.194706334241218, + "grad_norm": 0.34949069108271497, + "learning_rate": 2.038548046085158e-05, + "loss": 2.6605, + "step": 47140 + }, + { + "epoch": 2.194752892427311, + "grad_norm": 0.3580294037837425, + "learning_rate": 2.0383298008699204e-05, + "loss": 2.7396, + "step": 47141 + }, + { + "epoch": 2.194799450613404, + "grad_norm": 0.3219837943077287, + "learning_rate": 2.0381115643468922e-05, + "loss": 2.6935, + "step": 47142 + }, + { + "epoch": 2.194846008799497, + "grad_norm": 0.3435749116534974, + "learning_rate": 2.037893336516715e-05, + "loss": 2.6447, + "step": 47143 + }, + { + "epoch": 2.1948925669855903, + "grad_norm": 0.3515066211939241, + "learning_rate": 2.0376751173800295e-05, + "loss": 2.6237, + "step": 47144 + }, + { + "epoch": 2.1949391251716834, + "grad_norm": 0.330610941779601, + "learning_rate": 2.037456906937477e-05, + "loss": 2.7117, + "step": 47145 + }, + { + "epoch": 2.1949856833577766, + "grad_norm": 0.3390185310310529, + "learning_rate": 2.0372387051896936e-05, + "loss": 2.6844, + "step": 47146 + }, + { + "epoch": 2.1950322415438697, + "grad_norm": 0.343632687581628, + "learning_rate": 2.037020512137326e-05, + "loss": 2.6746, + "step": 47147 + }, + { + "epoch": 2.1950787997299623, + "grad_norm": 0.3245245098628347, + "learning_rate": 2.0368023277810078e-05, + "loss": 2.5933, + "step": 47148 + }, + { + "epoch": 2.1951253579160555, + "grad_norm": 0.34831254549823387, + "learning_rate": 2.0365841521213863e-05, + "loss": 2.6447, + "step": 47149 + }, + { + "epoch": 2.1951719161021486, + "grad_norm": 0.334693577764482, + "learning_rate": 2.036365985159096e-05, + "loss": 2.5518, + "step": 47150 + }, + { + "epoch": 2.1952184742882417, + "grad_norm": 0.3268093437073197, + "learning_rate": 2.0361478268947802e-05, + "loss": 2.5642, + "step": 47151 + }, + { + "epoch": 2.195265032474335, + "grad_norm": 0.3217332749178197, + "learning_rate": 2.0359296773290786e-05, + "loss": 2.6163, + "step": 47152 + }, + { + "epoch": 2.195311590660428, + "grad_norm": 0.3360779410621864, + "learning_rate": 2.035711536462631e-05, + "loss": 2.7533, + "step": 47153 + }, + { + "epoch": 2.195358148846521, + "grad_norm": 0.3371914514762999, + "learning_rate": 2.0354934042960803e-05, + "loss": 2.6929, + "step": 47154 + }, + { + "epoch": 2.195404707032614, + 
"grad_norm": 0.35447599409940067, + "learning_rate": 2.0352752808300622e-05, + "loss": 2.5651, + "step": 47155 + }, + { + "epoch": 2.1954512652187073, + "grad_norm": 0.32403728200607607, + "learning_rate": 2.0350571660652196e-05, + "loss": 2.6265, + "step": 47156 + }, + { + "epoch": 2.1954978234048, + "grad_norm": 0.3232815630185088, + "learning_rate": 2.0348390600021917e-05, + "loss": 2.7424, + "step": 47157 + }, + { + "epoch": 2.195544381590893, + "grad_norm": 0.3413048713897545, + "learning_rate": 2.0346209626416206e-05, + "loss": 2.6809, + "step": 47158 + }, + { + "epoch": 2.195590939776986, + "grad_norm": 0.327939275365583, + "learning_rate": 2.034402873984142e-05, + "loss": 2.7127, + "step": 47159 + }, + { + "epoch": 2.1956374979630793, + "grad_norm": 0.3266008704978789, + "learning_rate": 2.0341847940304015e-05, + "loss": 2.6723, + "step": 47160 + }, + { + "epoch": 2.1956840561491724, + "grad_norm": 0.3657035888347345, + "learning_rate": 2.033966722781034e-05, + "loss": 2.683, + "step": 47161 + }, + { + "epoch": 2.1957306143352655, + "grad_norm": 0.3422517449829103, + "learning_rate": 2.0337486602366845e-05, + "loss": 2.6573, + "step": 47162 + }, + { + "epoch": 2.1957771725213586, + "grad_norm": 0.3370028130359562, + "learning_rate": 2.0335306063979893e-05, + "loss": 2.5818, + "step": 47163 + }, + { + "epoch": 2.1958237307074517, + "grad_norm": 0.36090894727575534, + "learning_rate": 2.033312561265589e-05, + "loss": 2.578, + "step": 47164 + }, + { + "epoch": 2.195870288893545, + "grad_norm": 0.33610715761981763, + "learning_rate": 2.033094524840124e-05, + "loss": 2.6218, + "step": 47165 + }, + { + "epoch": 2.195916847079638, + "grad_norm": 0.34215498382515985, + "learning_rate": 2.0328764971222345e-05, + "loss": 2.5734, + "step": 47166 + }, + { + "epoch": 2.1959634052657306, + "grad_norm": 0.3368540084736106, + "learning_rate": 2.0326584781125624e-05, + "loss": 2.6497, + "step": 47167 + }, + { + "epoch": 2.1960099634518238, + "grad_norm": 0.31178788294886256, + "learning_rate": 2.0324404678117433e-05, + "loss": 2.583, + "step": 47168 + }, + { + "epoch": 2.196056521637917, + "grad_norm": 0.33142630674941237, + "learning_rate": 2.0322224662204187e-05, + "loss": 2.6162, + "step": 47169 + }, + { + "epoch": 2.19610307982401, + "grad_norm": 0.345916787494461, + "learning_rate": 2.0320044733392292e-05, + "loss": 2.6113, + "step": 47170 + }, + { + "epoch": 2.196149638010103, + "grad_norm": 0.3109704105865629, + "learning_rate": 2.0317864891688137e-05, + "loss": 2.6368, + "step": 47171 + }, + { + "epoch": 2.196196196196196, + "grad_norm": 0.3280682712643884, + "learning_rate": 2.031568513709813e-05, + "loss": 2.6088, + "step": 47172 + }, + { + "epoch": 2.1962427543822893, + "grad_norm": 0.3517209428337601, + "learning_rate": 2.031350546962868e-05, + "loss": 2.6758, + "step": 47173 + }, + { + "epoch": 2.1962893125683824, + "grad_norm": 0.3361977278895947, + "learning_rate": 2.0311325889286127e-05, + "loss": 2.623, + "step": 47174 + }, + { + "epoch": 2.1963358707544756, + "grad_norm": 0.3417276233542595, + "learning_rate": 2.0309146396076944e-05, + "loss": 2.5894, + "step": 47175 + }, + { + "epoch": 2.1963824289405687, + "grad_norm": 0.3398593029014235, + "learning_rate": 2.030696699000748e-05, + "loss": 2.5565, + "step": 47176 + }, + { + "epoch": 2.1964289871266613, + "grad_norm": 0.35618063494328617, + "learning_rate": 2.0304787671084136e-05, + "loss": 2.7296, + "step": 47177 + }, + { + "epoch": 2.1964755453127545, + "grad_norm": 0.36267073802646005, + "learning_rate": 
2.0302608439313316e-05, + "loss": 2.5951, + "step": 47178 + }, + { + "epoch": 2.1965221034988476, + "grad_norm": 0.33789115117673385, + "learning_rate": 2.0300429294701416e-05, + "loss": 2.5638, + "step": 47179 + }, + { + "epoch": 2.1965686616849407, + "grad_norm": 0.3348546226851956, + "learning_rate": 2.0298250237254845e-05, + "loss": 2.6855, + "step": 47180 + }, + { + "epoch": 2.196615219871034, + "grad_norm": 0.3439750657676076, + "learning_rate": 2.029607126697997e-05, + "loss": 2.7349, + "step": 47181 + }, + { + "epoch": 2.196661778057127, + "grad_norm": 0.34632769323352014, + "learning_rate": 2.0293892383883196e-05, + "loss": 2.625, + "step": 47182 + }, + { + "epoch": 2.19670833624322, + "grad_norm": 0.3328685508005655, + "learning_rate": 2.0291713587970922e-05, + "loss": 2.6653, + "step": 47183 + }, + { + "epoch": 2.196754894429313, + "grad_norm": 0.3250787575674974, + "learning_rate": 2.0289534879249543e-05, + "loss": 2.5814, + "step": 47184 + }, + { + "epoch": 2.1968014526154063, + "grad_norm": 0.3561073112402082, + "learning_rate": 2.0287356257725452e-05, + "loss": 2.7294, + "step": 47185 + }, + { + "epoch": 2.1968480108014994, + "grad_norm": 0.3409822471988229, + "learning_rate": 2.0285177723405064e-05, + "loss": 2.6928, + "step": 47186 + }, + { + "epoch": 2.196894568987592, + "grad_norm": 0.3153030171445736, + "learning_rate": 2.0282999276294713e-05, + "loss": 2.6161, + "step": 47187 + }, + { + "epoch": 2.196941127173685, + "grad_norm": 0.3578448508625214, + "learning_rate": 2.0280820916400865e-05, + "loss": 2.6894, + "step": 47188 + }, + { + "epoch": 2.1969876853597783, + "grad_norm": 0.3148012254834487, + "learning_rate": 2.0278642643729845e-05, + "loss": 2.6723, + "step": 47189 + }, + { + "epoch": 2.1970342435458714, + "grad_norm": 0.3139017063327886, + "learning_rate": 2.0276464458288118e-05, + "loss": 2.6029, + "step": 47190 + }, + { + "epoch": 2.1970808017319645, + "grad_norm": 0.33594767035940715, + "learning_rate": 2.0274286360082022e-05, + "loss": 2.7226, + "step": 47191 + }, + { + "epoch": 2.1971273599180576, + "grad_norm": 0.32798240367872633, + "learning_rate": 2.027210834911797e-05, + "loss": 2.5759, + "step": 47192 + }, + { + "epoch": 2.1971739181041507, + "grad_norm": 0.35249057678740475, + "learning_rate": 2.026993042540236e-05, + "loss": 2.7168, + "step": 47193 + }, + { + "epoch": 2.197220476290244, + "grad_norm": 0.3331830701466685, + "learning_rate": 2.0267752588941562e-05, + "loss": 2.6902, + "step": 47194 + }, + { + "epoch": 2.197267034476337, + "grad_norm": 0.3254097530770889, + "learning_rate": 2.0265574839741986e-05, + "loss": 2.6738, + "step": 47195 + }, + { + "epoch": 2.1973135926624296, + "grad_norm": 0.3352520086285431, + "learning_rate": 2.0263397177810012e-05, + "loss": 2.6334, + "step": 47196 + }, + { + "epoch": 2.1973601508485228, + "grad_norm": 0.36017230212954554, + "learning_rate": 2.0261219603152043e-05, + "loss": 2.6288, + "step": 47197 + }, + { + "epoch": 2.197406709034616, + "grad_norm": 0.33052821658153914, + "learning_rate": 2.0259042115774464e-05, + "loss": 2.6182, + "step": 47198 + }, + { + "epoch": 2.197453267220709, + "grad_norm": 0.3419399502788302, + "learning_rate": 2.025686471568368e-05, + "loss": 2.6498, + "step": 47199 + }, + { + "epoch": 2.197499825406802, + "grad_norm": 0.3707862255405224, + "learning_rate": 2.0254687402886036e-05, + "loss": 2.6483, + "step": 47200 + }, + { + "epoch": 2.1975463835928952, + "grad_norm": 0.33009290090744914, + "learning_rate": 2.0252510177387985e-05, + "loss": 2.5673, + "step": 47201 + }, + { 
+ "epoch": 2.1975929417789883, + "grad_norm": 0.35445641820252827, + "learning_rate": 2.0250333039195855e-05, + "loss": 2.5899, + "step": 47202 + }, + { + "epoch": 2.1976394999650815, + "grad_norm": 0.34391096785419467, + "learning_rate": 2.0248155988316102e-05, + "loss": 2.6025, + "step": 47203 + }, + { + "epoch": 2.1976860581511746, + "grad_norm": 0.32898573513735696, + "learning_rate": 2.0245979024755065e-05, + "loss": 2.6258, + "step": 47204 + }, + { + "epoch": 2.1977326163372677, + "grad_norm": 0.3363312801957705, + "learning_rate": 2.0243802148519143e-05, + "loss": 2.6623, + "step": 47205 + }, + { + "epoch": 2.197779174523361, + "grad_norm": 0.33515302161643074, + "learning_rate": 2.0241625359614747e-05, + "loss": 2.6285, + "step": 47206 + }, + { + "epoch": 2.1978257327094535, + "grad_norm": 0.3442411293885459, + "learning_rate": 2.0239448658048217e-05, + "loss": 2.6444, + "step": 47207 + }, + { + "epoch": 2.1978722908955466, + "grad_norm": 0.3498317469899282, + "learning_rate": 2.0237272043826007e-05, + "loss": 2.6865, + "step": 47208 + }, + { + "epoch": 2.1979188490816397, + "grad_norm": 0.3362219184215805, + "learning_rate": 2.0235095516954455e-05, + "loss": 2.6804, + "step": 47209 + }, + { + "epoch": 2.197965407267733, + "grad_norm": 0.3317114294130668, + "learning_rate": 2.0232919077439966e-05, + "loss": 2.6935, + "step": 47210 + }, + { + "epoch": 2.198011965453826, + "grad_norm": 0.36178249624243364, + "learning_rate": 2.0230742725288926e-05, + "loss": 2.6181, + "step": 47211 + }, + { + "epoch": 2.198058523639919, + "grad_norm": 0.33771541680175055, + "learning_rate": 2.0228566460507736e-05, + "loss": 2.5665, + "step": 47212 + }, + { + "epoch": 2.198105081826012, + "grad_norm": 0.30689406762266536, + "learning_rate": 2.022639028310274e-05, + "loss": 2.6648, + "step": 47213 + }, + { + "epoch": 2.1981516400121053, + "grad_norm": 0.3544008290340202, + "learning_rate": 2.022421419308039e-05, + "loss": 2.6737, + "step": 47214 + }, + { + "epoch": 2.1981981981981984, + "grad_norm": 0.32492540558345717, + "learning_rate": 2.0222038190447008e-05, + "loss": 2.598, + "step": 47215 + }, + { + "epoch": 2.198244756384291, + "grad_norm": 0.3412963995964202, + "learning_rate": 2.021986227520904e-05, + "loss": 2.5845, + "step": 47216 + }, + { + "epoch": 2.198291314570384, + "grad_norm": 0.34436869545236976, + "learning_rate": 2.0217686447372824e-05, + "loss": 2.4573, + "step": 47217 + }, + { + "epoch": 2.1983378727564773, + "grad_norm": 0.3421964141837092, + "learning_rate": 2.0215510706944767e-05, + "loss": 2.7275, + "step": 47218 + }, + { + "epoch": 2.1983844309425704, + "grad_norm": 0.3334819783977025, + "learning_rate": 2.0213335053931266e-05, + "loss": 2.7015, + "step": 47219 + }, + { + "epoch": 2.1984309891286635, + "grad_norm": 0.3160182003050864, + "learning_rate": 2.021115948833866e-05, + "loss": 2.5841, + "step": 47220 + }, + { + "epoch": 2.1984775473147566, + "grad_norm": 0.3189008368837241, + "learning_rate": 2.02089840101734e-05, + "loss": 2.6595, + "step": 47221 + }, + { + "epoch": 2.1985241055008498, + "grad_norm": 0.33446106742306764, + "learning_rate": 2.0206808619441818e-05, + "loss": 2.6385, + "step": 47222 + }, + { + "epoch": 2.198570663686943, + "grad_norm": 0.32616510199499804, + "learning_rate": 2.0204633316150324e-05, + "loss": 2.5871, + "step": 47223 + }, + { + "epoch": 2.198617221873036, + "grad_norm": 0.3302494809755658, + "learning_rate": 2.020245810030529e-05, + "loss": 2.671, + "step": 47224 + }, + { + "epoch": 2.198663780059129, + "grad_norm": 0.31670519096669075, 
+ "learning_rate": 2.020028297191311e-05, + "loss": 2.591, + "step": 47225 + }, + { + "epoch": 2.1987103382452218, + "grad_norm": 0.31196174445540775, + "learning_rate": 2.0198107930980158e-05, + "loss": 2.6358, + "step": 47226 + }, + { + "epoch": 2.198756896431315, + "grad_norm": 0.3363223243764648, + "learning_rate": 2.0195932977512845e-05, + "loss": 2.6285, + "step": 47227 + }, + { + "epoch": 2.198803454617408, + "grad_norm": 0.3423821106027075, + "learning_rate": 2.0193758111517496e-05, + "loss": 2.6519, + "step": 47228 + }, + { + "epoch": 2.198850012803501, + "grad_norm": 0.32145455446394283, + "learning_rate": 2.0191583333000563e-05, + "loss": 2.6428, + "step": 47229 + }, + { + "epoch": 2.1988965709895942, + "grad_norm": 0.36399657718979017, + "learning_rate": 2.0189408641968384e-05, + "loss": 2.717, + "step": 47230 + }, + { + "epoch": 2.1989431291756873, + "grad_norm": 0.35164944245928287, + "learning_rate": 2.018723403842735e-05, + "loss": 2.5691, + "step": 47231 + }, + { + "epoch": 2.1989896873617805, + "grad_norm": 0.3428618538515027, + "learning_rate": 2.018505952238385e-05, + "loss": 2.6948, + "step": 47232 + }, + { + "epoch": 2.1990362455478736, + "grad_norm": 0.3222727352834739, + "learning_rate": 2.0182885093844263e-05, + "loss": 2.6004, + "step": 47233 + }, + { + "epoch": 2.1990828037339667, + "grad_norm": 0.33410556514420964, + "learning_rate": 2.0180710752814984e-05, + "loss": 2.5966, + "step": 47234 + }, + { + "epoch": 2.1991293619200594, + "grad_norm": 0.3249341486043896, + "learning_rate": 2.017853649930237e-05, + "loss": 2.554, + "step": 47235 + }, + { + "epoch": 2.1991759201061525, + "grad_norm": 0.32412807924362913, + "learning_rate": 2.017636233331281e-05, + "loss": 2.6555, + "step": 47236 + }, + { + "epoch": 2.1992224782922456, + "grad_norm": 0.33822971138220753, + "learning_rate": 2.017418825485269e-05, + "loss": 2.6955, + "step": 47237 + }, + { + "epoch": 2.1992690364783387, + "grad_norm": 0.3470295618812521, + "learning_rate": 2.0172014263928386e-05, + "loss": 2.5853, + "step": 47238 + }, + { + "epoch": 2.199315594664432, + "grad_norm": 0.3519632664960901, + "learning_rate": 2.0169840360546283e-05, + "loss": 2.6675, + "step": 47239 + }, + { + "epoch": 2.199362152850525, + "grad_norm": 0.3103591352999147, + "learning_rate": 2.0167666544712777e-05, + "loss": 2.6589, + "step": 47240 + }, + { + "epoch": 2.199408711036618, + "grad_norm": 0.3365490183022102, + "learning_rate": 2.0165492816434194e-05, + "loss": 2.5393, + "step": 47241 + }, + { + "epoch": 2.199455269222711, + "grad_norm": 0.3545930100531273, + "learning_rate": 2.0163319175716987e-05, + "loss": 2.6311, + "step": 47242 + }, + { + "epoch": 2.1995018274088043, + "grad_norm": 0.3420928211093859, + "learning_rate": 2.0161145622567484e-05, + "loss": 2.6774, + "step": 47243 + }, + { + "epoch": 2.1995483855948974, + "grad_norm": 0.34196596374710375, + "learning_rate": 2.015897215699208e-05, + "loss": 2.7117, + "step": 47244 + }, + { + "epoch": 2.1995949437809905, + "grad_norm": 0.3740938784506661, + "learning_rate": 2.0156798778997148e-05, + "loss": 2.653, + "step": 47245 + }, + { + "epoch": 2.199641501967083, + "grad_norm": 0.31722795149374927, + "learning_rate": 2.0154625488589075e-05, + "loss": 2.5629, + "step": 47246 + }, + { + "epoch": 2.1996880601531763, + "grad_norm": 0.3361516571885885, + "learning_rate": 2.015245228577425e-05, + "loss": 2.582, + "step": 47247 + }, + { + "epoch": 2.1997346183392694, + "grad_norm": 0.3544110185640003, + "learning_rate": 2.0150279170559024e-05, + "loss": 2.6619, + "step": 
47248 + }, + { + "epoch": 2.1997811765253625, + "grad_norm": 0.3153794470736512, + "learning_rate": 2.014810614294978e-05, + "loss": 2.5845, + "step": 47249 + }, + { + "epoch": 2.1998277347114557, + "grad_norm": 0.3354825674901346, + "learning_rate": 2.014593320295291e-05, + "loss": 2.7263, + "step": 47250 + }, + { + "epoch": 2.1998742928975488, + "grad_norm": 0.35581939430341614, + "learning_rate": 2.0143760350574786e-05, + "loss": 2.5843, + "step": 47251 + }, + { + "epoch": 2.199920851083642, + "grad_norm": 0.3330233203961209, + "learning_rate": 2.014158758582178e-05, + "loss": 2.6054, + "step": 47252 + }, + { + "epoch": 2.199967409269735, + "grad_norm": 0.35287564823436196, + "learning_rate": 2.0139414908700294e-05, + "loss": 2.6754, + "step": 47253 + }, + { + "epoch": 2.200013967455828, + "grad_norm": 0.33070747261761896, + "learning_rate": 2.0137242319216647e-05, + "loss": 2.6914, + "step": 47254 + }, + { + "epoch": 2.200060525641921, + "grad_norm": 0.3296463716828061, + "learning_rate": 2.0135069817377282e-05, + "loss": 2.7079, + "step": 47255 + }, + { + "epoch": 2.200107083828014, + "grad_norm": 0.33572612455360207, + "learning_rate": 2.0132897403188533e-05, + "loss": 2.5787, + "step": 47256 + }, + { + "epoch": 2.200153642014107, + "grad_norm": 0.3317870564111299, + "learning_rate": 2.0130725076656785e-05, + "loss": 2.7017, + "step": 47257 + }, + { + "epoch": 2.2002002002002, + "grad_norm": 0.3321087341489498, + "learning_rate": 2.0128552837788424e-05, + "loss": 2.5533, + "step": 47258 + }, + { + "epoch": 2.2002467583862932, + "grad_norm": 0.3398799831684787, + "learning_rate": 2.012638068658981e-05, + "loss": 2.5729, + "step": 47259 + }, + { + "epoch": 2.2002933165723864, + "grad_norm": 0.3228098164903968, + "learning_rate": 2.012420862306734e-05, + "loss": 2.5887, + "step": 47260 + }, + { + "epoch": 2.2003398747584795, + "grad_norm": 0.34834018109431286, + "learning_rate": 2.0122036647227348e-05, + "loss": 2.6618, + "step": 47261 + }, + { + "epoch": 2.2003864329445726, + "grad_norm": 0.33759775696868266, + "learning_rate": 2.011986475907627e-05, + "loss": 2.6527, + "step": 47262 + }, + { + "epoch": 2.2004329911306657, + "grad_norm": 0.34165387189682206, + "learning_rate": 2.011769295862042e-05, + "loss": 2.6976, + "step": 47263 + }, + { + "epoch": 2.200479549316759, + "grad_norm": 0.3412418458758476, + "learning_rate": 2.0115521245866202e-05, + "loss": 2.4986, + "step": 47264 + }, + { + "epoch": 2.2005261075028515, + "grad_norm": 0.3411089891063427, + "learning_rate": 2.011334962081998e-05, + "loss": 2.6428, + "step": 47265 + }, + { + "epoch": 2.2005726656889446, + "grad_norm": 0.3265668652090208, + "learning_rate": 2.011117808348816e-05, + "loss": 2.6965, + "step": 47266 + }, + { + "epoch": 2.2006192238750377, + "grad_norm": 0.317178208272984, + "learning_rate": 2.0109006633877048e-05, + "loss": 2.6388, + "step": 47267 + }, + { + "epoch": 2.200665782061131, + "grad_norm": 0.3411232254120808, + "learning_rate": 2.0106835271993092e-05, + "loss": 2.7084, + "step": 47268 + }, + { + "epoch": 2.200712340247224, + "grad_norm": 0.33552895136163885, + "learning_rate": 2.010466399784261e-05, + "loss": 2.6255, + "step": 47269 + }, + { + "epoch": 2.200758898433317, + "grad_norm": 0.32717189332290164, + "learning_rate": 2.0102492811431994e-05, + "loss": 2.6719, + "step": 47270 + }, + { + "epoch": 2.20080545661941, + "grad_norm": 0.3464508578496844, + "learning_rate": 2.0100321712767616e-05, + "loss": 2.7093, + "step": 47271 + }, + { + "epoch": 2.2008520148055033, + "grad_norm": 
0.3399028748487677, + "learning_rate": 2.009815070185585e-05, + "loss": 2.6087, + "step": 47272 + }, + { + "epoch": 2.2008985729915964, + "grad_norm": 0.32096852002566917, + "learning_rate": 2.0095979778703082e-05, + "loss": 2.6198, + "step": 47273 + }, + { + "epoch": 2.200945131177689, + "grad_norm": 0.3191803148763216, + "learning_rate": 2.0093808943315636e-05, + "loss": 2.7309, + "step": 47274 + }, + { + "epoch": 2.200991689363782, + "grad_norm": 0.32128047616825484, + "learning_rate": 2.0091638195699942e-05, + "loss": 2.6111, + "step": 47275 + }, + { + "epoch": 2.2010382475498753, + "grad_norm": 0.34456708638090267, + "learning_rate": 2.0089467535862334e-05, + "loss": 2.598, + "step": 47276 + }, + { + "epoch": 2.2010848057359684, + "grad_norm": 0.3202939992202284, + "learning_rate": 2.008729696380918e-05, + "loss": 2.6424, + "step": 47277 + }, + { + "epoch": 2.2011313639220615, + "grad_norm": 0.33698381786987536, + "learning_rate": 2.0085126479546873e-05, + "loss": 2.7454, + "step": 47278 + }, + { + "epoch": 2.2011779221081547, + "grad_norm": 0.3393562371261025, + "learning_rate": 2.0082956083081772e-05, + "loss": 2.593, + "step": 47279 + }, + { + "epoch": 2.2012244802942478, + "grad_norm": 0.3258689058640496, + "learning_rate": 2.0080785774420246e-05, + "loss": 2.57, + "step": 47280 + }, + { + "epoch": 2.201271038480341, + "grad_norm": 0.35750209802445543, + "learning_rate": 2.0078615553568685e-05, + "loss": 2.6442, + "step": 47281 + }, + { + "epoch": 2.201317596666434, + "grad_norm": 0.32667018993742003, + "learning_rate": 2.007644542053342e-05, + "loss": 2.5796, + "step": 47282 + }, + { + "epoch": 2.201364154852527, + "grad_norm": 0.3105998283149663, + "learning_rate": 2.0074275375320845e-05, + "loss": 2.6031, + "step": 47283 + }, + { + "epoch": 2.2014107130386202, + "grad_norm": 0.3403443100460273, + "learning_rate": 2.0072105417937316e-05, + "loss": 2.7258, + "step": 47284 + }, + { + "epoch": 2.201457271224713, + "grad_norm": 0.34603232332383754, + "learning_rate": 2.0069935548389213e-05, + "loss": 2.6068, + "step": 47285 + }, + { + "epoch": 2.201503829410806, + "grad_norm": 0.33143817532612374, + "learning_rate": 2.0067765766682916e-05, + "loss": 2.6286, + "step": 47286 + }, + { + "epoch": 2.201550387596899, + "grad_norm": 0.3262893314259791, + "learning_rate": 2.0065596072824744e-05, + "loss": 2.614, + "step": 47287 + }, + { + "epoch": 2.2015969457829923, + "grad_norm": 0.32711600657757317, + "learning_rate": 2.0063426466821134e-05, + "loss": 2.7182, + "step": 47288 + }, + { + "epoch": 2.2016435039690854, + "grad_norm": 0.3432047051331221, + "learning_rate": 2.0061256948678402e-05, + "loss": 2.5583, + "step": 47289 + }, + { + "epoch": 2.2016900621551785, + "grad_norm": 0.33296065965589333, + "learning_rate": 2.005908751840293e-05, + "loss": 2.5797, + "step": 47290 + }, + { + "epoch": 2.2017366203412716, + "grad_norm": 0.3114937787002897, + "learning_rate": 2.0056918176001087e-05, + "loss": 2.6098, + "step": 47291 + }, + { + "epoch": 2.2017831785273647, + "grad_norm": 0.33102940768463573, + "learning_rate": 2.005474892147924e-05, + "loss": 2.5973, + "step": 47292 + }, + { + "epoch": 2.201829736713458, + "grad_norm": 0.3578538737652022, + "learning_rate": 2.005257975484375e-05, + "loss": 2.6702, + "step": 47293 + }, + { + "epoch": 2.2018762948995505, + "grad_norm": 0.3208739307278845, + "learning_rate": 2.0050410676101012e-05, + "loss": 2.6655, + "step": 47294 + }, + { + "epoch": 2.2019228530856436, + "grad_norm": 0.32204647456599533, + "learning_rate": 2.0048241685257346e-05, + 
"loss": 2.6548, + "step": 47295 + }, + { + "epoch": 2.2019694112717367, + "grad_norm": 0.33451930054218243, + "learning_rate": 2.0046072782319143e-05, + "loss": 2.6831, + "step": 47296 + }, + { + "epoch": 2.20201596945783, + "grad_norm": 0.3414633041490492, + "learning_rate": 2.004390396729276e-05, + "loss": 2.6682, + "step": 47297 + }, + { + "epoch": 2.202062527643923, + "grad_norm": 0.329923866551363, + "learning_rate": 2.0041735240184572e-05, + "loss": 2.551, + "step": 47298 + }, + { + "epoch": 2.202109085830016, + "grad_norm": 0.3490612294141411, + "learning_rate": 2.003956660100096e-05, + "loss": 2.7266, + "step": 47299 + }, + { + "epoch": 2.202155644016109, + "grad_norm": 0.3106150236830524, + "learning_rate": 2.003739804974823e-05, + "loss": 2.5891, + "step": 47300 + }, + { + "epoch": 2.2022022022022023, + "grad_norm": 0.3370087651621332, + "learning_rate": 2.003522958643282e-05, + "loss": 2.5755, + "step": 47301 + }, + { + "epoch": 2.2022487603882954, + "grad_norm": 0.3380366890154149, + "learning_rate": 2.003306121106102e-05, + "loss": 2.6217, + "step": 47302 + }, + { + "epoch": 2.2022953185743885, + "grad_norm": 0.3193454518668396, + "learning_rate": 2.003089292363927e-05, + "loss": 2.7208, + "step": 47303 + }, + { + "epoch": 2.202341876760481, + "grad_norm": 0.3230683723610959, + "learning_rate": 2.0028724724173887e-05, + "loss": 2.6555, + "step": 47304 + }, + { + "epoch": 2.2023884349465743, + "grad_norm": 0.3295975387874839, + "learning_rate": 2.002655661267124e-05, + "loss": 2.7161, + "step": 47305 + }, + { + "epoch": 2.2024349931326674, + "grad_norm": 0.34047525870807366, + "learning_rate": 2.0024388589137695e-05, + "loss": 2.5962, + "step": 47306 + }, + { + "epoch": 2.2024815513187606, + "grad_norm": 0.3574531690890646, + "learning_rate": 2.0022220653579632e-05, + "loss": 2.5897, + "step": 47307 + }, + { + "epoch": 2.2025281095048537, + "grad_norm": 0.3302004535134593, + "learning_rate": 2.002005280600338e-05, + "loss": 2.5437, + "step": 47308 + }, + { + "epoch": 2.202574667690947, + "grad_norm": 0.3157376511901589, + "learning_rate": 2.0017885046415324e-05, + "loss": 2.6679, + "step": 47309 + }, + { + "epoch": 2.20262122587704, + "grad_norm": 0.35792514785773555, + "learning_rate": 2.0015717374821825e-05, + "loss": 2.6049, + "step": 47310 + }, + { + "epoch": 2.202667784063133, + "grad_norm": 0.3303734572049622, + "learning_rate": 2.0013549791229236e-05, + "loss": 2.6642, + "step": 47311 + }, + { + "epoch": 2.202714342249226, + "grad_norm": 0.3574720425471137, + "learning_rate": 2.001138229564394e-05, + "loss": 2.6994, + "step": 47312 + }, + { + "epoch": 2.2027609004353192, + "grad_norm": 0.35439305943278376, + "learning_rate": 2.000921488807225e-05, + "loss": 2.6173, + "step": 47313 + }, + { + "epoch": 2.202807458621412, + "grad_norm": 0.3461795662164334, + "learning_rate": 2.000704756852059e-05, + "loss": 2.6912, + "step": 47314 + }, + { + "epoch": 2.202854016807505, + "grad_norm": 0.3387811053893345, + "learning_rate": 2.0004880336995264e-05, + "loss": 2.6668, + "step": 47315 + }, + { + "epoch": 2.202900574993598, + "grad_norm": 0.3394562399295183, + "learning_rate": 2.000271319350269e-05, + "loss": 2.6013, + "step": 47316 + }, + { + "epoch": 2.2029471331796913, + "grad_norm": 0.3327656726491072, + "learning_rate": 2.000054613804918e-05, + "loss": 2.7022, + "step": 47317 + }, + { + "epoch": 2.2029936913657844, + "grad_norm": 0.3491078902560967, + "learning_rate": 1.9998379170641114e-05, + "loss": 2.5899, + "step": 47318 + }, + { + "epoch": 2.2030402495518775, + 
"grad_norm": 0.3703343524304948, + "learning_rate": 1.999621229128485e-05, + "loss": 2.5987, + "step": 47319 + }, + { + "epoch": 2.2030868077379706, + "grad_norm": 0.3716386526677805, + "learning_rate": 1.9994045499986742e-05, + "loss": 2.7315, + "step": 47320 + }, + { + "epoch": 2.2031333659240637, + "grad_norm": 0.3536063843374028, + "learning_rate": 1.9991878796753177e-05, + "loss": 2.5701, + "step": 47321 + }, + { + "epoch": 2.203179924110157, + "grad_norm": 0.3406740188279735, + "learning_rate": 1.998971218159047e-05, + "loss": 2.7468, + "step": 47322 + }, + { + "epoch": 2.20322648229625, + "grad_norm": 0.3463586326773231, + "learning_rate": 1.9987545654505002e-05, + "loss": 2.7082, + "step": 47323 + }, + { + "epoch": 2.2032730404823426, + "grad_norm": 0.37409114859203924, + "learning_rate": 1.998537921550313e-05, + "loss": 2.6813, + "step": 47324 + }, + { + "epoch": 2.2033195986684357, + "grad_norm": 0.33217894422827715, + "learning_rate": 1.9983212864591234e-05, + "loss": 2.7206, + "step": 47325 + }, + { + "epoch": 2.203366156854529, + "grad_norm": 0.34357908653198443, + "learning_rate": 1.9981046601775614e-05, + "loss": 2.6631, + "step": 47326 + }, + { + "epoch": 2.203412715040622, + "grad_norm": 0.34994739377574985, + "learning_rate": 1.99788804270627e-05, + "loss": 2.705, + "step": 47327 + }, + { + "epoch": 2.203459273226715, + "grad_norm": 0.3422193525052431, + "learning_rate": 1.997671434045878e-05, + "loss": 2.7128, + "step": 47328 + }, + { + "epoch": 2.203505831412808, + "grad_norm": 0.33539081209217525, + "learning_rate": 1.997454834197028e-05, + "loss": 2.6299, + "step": 47329 + }, + { + "epoch": 2.2035523895989013, + "grad_norm": 0.3296464028541768, + "learning_rate": 1.9972382431603503e-05, + "loss": 2.6394, + "step": 47330 + }, + { + "epoch": 2.2035989477849944, + "grad_norm": 0.3251888974951837, + "learning_rate": 1.9970216609364828e-05, + "loss": 2.6735, + "step": 47331 + }, + { + "epoch": 2.2036455059710875, + "grad_norm": 0.36061516204382044, + "learning_rate": 1.9968050875260608e-05, + "loss": 2.6943, + "step": 47332 + }, + { + "epoch": 2.20369206415718, + "grad_norm": 0.3412561938792763, + "learning_rate": 1.9965885229297203e-05, + "loss": 2.5788, + "step": 47333 + }, + { + "epoch": 2.2037386223432733, + "grad_norm": 0.3287821524406668, + "learning_rate": 1.996371967148098e-05, + "loss": 2.69, + "step": 47334 + }, + { + "epoch": 2.2037851805293664, + "grad_norm": 0.31945796813868593, + "learning_rate": 1.996155420181826e-05, + "loss": 2.6471, + "step": 47335 + }, + { + "epoch": 2.2038317387154596, + "grad_norm": 0.3099128361287441, + "learning_rate": 1.995938882031543e-05, + "loss": 2.6116, + "step": 47336 + }, + { + "epoch": 2.2038782969015527, + "grad_norm": 0.35161926302183005, + "learning_rate": 1.9957223526978823e-05, + "loss": 2.5944, + "step": 47337 + }, + { + "epoch": 2.203924855087646, + "grad_norm": 0.3112680882996612, + "learning_rate": 1.9955058321814813e-05, + "loss": 2.6092, + "step": 47338 + }, + { + "epoch": 2.203971413273739, + "grad_norm": 0.327373261089772, + "learning_rate": 1.9952893204829742e-05, + "loss": 2.6085, + "step": 47339 + }, + { + "epoch": 2.204017971459832, + "grad_norm": 0.31472738633088254, + "learning_rate": 1.9950728176029986e-05, + "loss": 2.595, + "step": 47340 + }, + { + "epoch": 2.204064529645925, + "grad_norm": 0.31556982803131983, + "learning_rate": 1.9948563235421853e-05, + "loss": 2.648, + "step": 47341 + }, + { + "epoch": 2.2041110878320183, + "grad_norm": 0.3047191276838225, + "learning_rate": 1.9946398383011755e-05, + 
"loss": 2.6632, + "step": 47342 + }, + { + "epoch": 2.204157646018111, + "grad_norm": 0.33132125464709467, + "learning_rate": 1.9944233618806006e-05, + "loss": 2.6875, + "step": 47343 + }, + { + "epoch": 2.204204204204204, + "grad_norm": 0.3080283032608417, + "learning_rate": 1.9942068942810967e-05, + "loss": 2.5859, + "step": 47344 + }, + { + "epoch": 2.204250762390297, + "grad_norm": 0.32737932016157995, + "learning_rate": 1.9939904355032995e-05, + "loss": 2.5461, + "step": 47345 + }, + { + "epoch": 2.2042973205763903, + "grad_norm": 0.33197498708009265, + "learning_rate": 1.9937739855478442e-05, + "loss": 2.6062, + "step": 47346 + }, + { + "epoch": 2.2043438787624834, + "grad_norm": 0.3137538758364467, + "learning_rate": 1.9935575444153677e-05, + "loss": 2.6335, + "step": 47347 + }, + { + "epoch": 2.2043904369485765, + "grad_norm": 0.31285136663839924, + "learning_rate": 1.993341112106502e-05, + "loss": 2.5997, + "step": 47348 + }, + { + "epoch": 2.2044369951346696, + "grad_norm": 0.34375337212752, + "learning_rate": 1.9931246886218842e-05, + "loss": 2.7084, + "step": 47349 + }, + { + "epoch": 2.2044835533207627, + "grad_norm": 0.36016113971845104, + "learning_rate": 1.9929082739621485e-05, + "loss": 2.6729, + "step": 47350 + }, + { + "epoch": 2.204530111506856, + "grad_norm": 0.3352255256181702, + "learning_rate": 1.9926918681279316e-05, + "loss": 2.6956, + "step": 47351 + }, + { + "epoch": 2.204576669692949, + "grad_norm": 0.33621716129127166, + "learning_rate": 1.992475471119867e-05, + "loss": 2.6877, + "step": 47352 + }, + { + "epoch": 2.2046232278790416, + "grad_norm": 0.3307996593615745, + "learning_rate": 1.9922590829385928e-05, + "loss": 2.5466, + "step": 47353 + }, + { + "epoch": 2.2046697860651348, + "grad_norm": 0.35706645655977776, + "learning_rate": 1.9920427035847383e-05, + "loss": 2.6419, + "step": 47354 + }, + { + "epoch": 2.204716344251228, + "grad_norm": 0.307088002716096, + "learning_rate": 1.991826333058946e-05, + "loss": 2.6754, + "step": 47355 + }, + { + "epoch": 2.204762902437321, + "grad_norm": 0.33070217189151846, + "learning_rate": 1.9916099713618446e-05, + "loss": 2.5905, + "step": 47356 + }, + { + "epoch": 2.204809460623414, + "grad_norm": 0.33488724884949644, + "learning_rate": 1.9913936184940724e-05, + "loss": 2.6853, + "step": 47357 + }, + { + "epoch": 2.204856018809507, + "grad_norm": 0.3397918496866386, + "learning_rate": 1.991177274456263e-05, + "loss": 2.6086, + "step": 47358 + }, + { + "epoch": 2.2049025769956003, + "grad_norm": 0.32349359033141567, + "learning_rate": 1.990960939249052e-05, + "loss": 2.6546, + "step": 47359 + }, + { + "epoch": 2.2049491351816934, + "grad_norm": 0.35090038348389646, + "learning_rate": 1.9907446128730757e-05, + "loss": 2.6384, + "step": 47360 + }, + { + "epoch": 2.2049956933677866, + "grad_norm": 0.37349893973921944, + "learning_rate": 1.990528295328966e-05, + "loss": 2.6916, + "step": 47361 + }, + { + "epoch": 2.2050422515538797, + "grad_norm": 0.3237459700828808, + "learning_rate": 1.9903119866173593e-05, + "loss": 2.7184, + "step": 47362 + }, + { + "epoch": 2.2050888097399723, + "grad_norm": 0.31294762234072554, + "learning_rate": 1.9900956867388905e-05, + "loss": 2.646, + "step": 47363 + }, + { + "epoch": 2.2051353679260655, + "grad_norm": 0.3372273375480639, + "learning_rate": 1.989879395694194e-05, + "loss": 2.6707, + "step": 47364 + }, + { + "epoch": 2.2051819261121586, + "grad_norm": 0.32258269379189, + "learning_rate": 1.989663113483905e-05, + "loss": 2.6934, + "step": 47365 + }, + { + "epoch": 
2.2052284842982517, + "grad_norm": 0.31106068884834853, + "learning_rate": 1.9894468401086604e-05, + "loss": 2.6645, + "step": 47366 + }, + { + "epoch": 2.205275042484345, + "grad_norm": 0.3304273727395837, + "learning_rate": 1.989230575569089e-05, + "loss": 2.6832, + "step": 47367 + }, + { + "epoch": 2.205321600670438, + "grad_norm": 0.3266449225152511, + "learning_rate": 1.9890143198658322e-05, + "loss": 2.6391, + "step": 47368 + }, + { + "epoch": 2.205368158856531, + "grad_norm": 0.33143035568789675, + "learning_rate": 1.9887980729995205e-05, + "loss": 2.6261, + "step": 47369 + }, + { + "epoch": 2.205414717042624, + "grad_norm": 0.30487085925978596, + "learning_rate": 1.9885818349707897e-05, + "loss": 2.7008, + "step": 47370 + }, + { + "epoch": 2.2054612752287173, + "grad_norm": 0.3320093729610417, + "learning_rate": 1.9883656057802745e-05, + "loss": 2.6153, + "step": 47371 + }, + { + "epoch": 2.20550783341481, + "grad_norm": 0.34887841055521635, + "learning_rate": 1.9881493854286097e-05, + "loss": 2.6676, + "step": 47372 + }, + { + "epoch": 2.205554391600903, + "grad_norm": 0.3348979297654137, + "learning_rate": 1.9879331739164314e-05, + "loss": 2.6475, + "step": 47373 + }, + { + "epoch": 2.205600949786996, + "grad_norm": 0.3326816005208172, + "learning_rate": 1.987716971244369e-05, + "loss": 2.716, + "step": 47374 + }, + { + "epoch": 2.2056475079730893, + "grad_norm": 0.3299758050964782, + "learning_rate": 1.9875007774130645e-05, + "loss": 2.5844, + "step": 47375 + }, + { + "epoch": 2.2056940661591824, + "grad_norm": 0.3456169062392907, + "learning_rate": 1.987284592423146e-05, + "loss": 2.6002, + "step": 47376 + }, + { + "epoch": 2.2057406243452755, + "grad_norm": 0.34325674630195013, + "learning_rate": 1.9870684162752508e-05, + "loss": 2.7054, + "step": 47377 + }, + { + "epoch": 2.2057871825313686, + "grad_norm": 0.3242937458671234, + "learning_rate": 1.9868522489700126e-05, + "loss": 2.5914, + "step": 47378 + }, + { + "epoch": 2.2058337407174617, + "grad_norm": 0.3285917270974737, + "learning_rate": 1.986636090508068e-05, + "loss": 2.6532, + "step": 47379 + }, + { + "epoch": 2.205880298903555, + "grad_norm": 0.3291306850897125, + "learning_rate": 1.9864199408900462e-05, + "loss": 2.6093, + "step": 47380 + }, + { + "epoch": 2.205926857089648, + "grad_norm": 0.3443843314468791, + "learning_rate": 1.9862038001165888e-05, + "loss": 2.6614, + "step": 47381 + }, + { + "epoch": 2.205973415275741, + "grad_norm": 0.3504255289447019, + "learning_rate": 1.9859876681883244e-05, + "loss": 2.6501, + "step": 47382 + }, + { + "epoch": 2.2060199734618338, + "grad_norm": 0.31000291416966613, + "learning_rate": 1.985771545105889e-05, + "loss": 2.5563, + "step": 47383 + }, + { + "epoch": 2.206066531647927, + "grad_norm": 0.32236720572151334, + "learning_rate": 1.9855554308699177e-05, + "loss": 2.6465, + "step": 47384 + }, + { + "epoch": 2.20611308983402, + "grad_norm": 0.33127162204779964, + "learning_rate": 1.9853393254810438e-05, + "loss": 2.5348, + "step": 47385 + }, + { + "epoch": 2.206159648020113, + "grad_norm": 0.37406227119165786, + "learning_rate": 1.9851232289399037e-05, + "loss": 2.6523, + "step": 47386 + }, + { + "epoch": 2.206206206206206, + "grad_norm": 0.34367573888633335, + "learning_rate": 1.9849071412471266e-05, + "loss": 2.7267, + "step": 47387 + }, + { + "epoch": 2.2062527643922993, + "grad_norm": 0.3325189290346317, + "learning_rate": 1.984691062403354e-05, + "loss": 2.5425, + "step": 47388 + }, + { + "epoch": 2.2062993225783925, + "grad_norm": 0.3460450191286901, + 
"learning_rate": 1.9844749924092133e-05, + "loss": 2.6779, + "step": 47389 + }, + { + "epoch": 2.2063458807644856, + "grad_norm": 0.3550781333729752, + "learning_rate": 1.984258931265342e-05, + "loss": 2.6568, + "step": 47390 + }, + { + "epoch": 2.2063924389505787, + "grad_norm": 0.3479472876791261, + "learning_rate": 1.9840428789723735e-05, + "loss": 2.6282, + "step": 47391 + }, + { + "epoch": 2.2064389971366714, + "grad_norm": 0.32394241926087125, + "learning_rate": 1.983826835530942e-05, + "loss": 2.6028, + "step": 47392 + }, + { + "epoch": 2.2064855553227645, + "grad_norm": 0.3619788651628683, + "learning_rate": 1.9836108009416814e-05, + "loss": 2.6622, + "step": 47393 + }, + { + "epoch": 2.2065321135088576, + "grad_norm": 0.32393717322995785, + "learning_rate": 1.9833947752052285e-05, + "loss": 2.603, + "step": 47394 + }, + { + "epoch": 2.2065786716949507, + "grad_norm": 0.3218788435816589, + "learning_rate": 1.9831787583222123e-05, + "loss": 2.666, + "step": 47395 + }, + { + "epoch": 2.206625229881044, + "grad_norm": 0.3740220770055984, + "learning_rate": 1.9829627502932695e-05, + "loss": 2.7106, + "step": 47396 + }, + { + "epoch": 2.206671788067137, + "grad_norm": 0.3459319826445791, + "learning_rate": 1.9827467511190332e-05, + "loss": 2.775, + "step": 47397 + }, + { + "epoch": 2.20671834625323, + "grad_norm": 0.31806231479718966, + "learning_rate": 1.9825307608001385e-05, + "loss": 2.6169, + "step": 47398 + }, + { + "epoch": 2.206764904439323, + "grad_norm": 0.3324876234439153, + "learning_rate": 1.982314779337221e-05, + "loss": 2.6135, + "step": 47399 + }, + { + "epoch": 2.2068114626254163, + "grad_norm": 0.32287226323018725, + "learning_rate": 1.9820988067309083e-05, + "loss": 2.5839, + "step": 47400 + }, + { + "epoch": 2.2068580208115094, + "grad_norm": 0.3299632803167406, + "learning_rate": 1.9818828429818413e-05, + "loss": 2.5783, + "step": 47401 + }, + { + "epoch": 2.206904578997602, + "grad_norm": 0.3229077629930723, + "learning_rate": 1.9816668880906498e-05, + "loss": 2.6183, + "step": 47402 + }, + { + "epoch": 2.206951137183695, + "grad_norm": 0.334191541668699, + "learning_rate": 1.9814509420579684e-05, + "loss": 2.6966, + "step": 47403 + }, + { + "epoch": 2.2069976953697883, + "grad_norm": 0.34885829605758006, + "learning_rate": 1.9812350048844308e-05, + "loss": 2.6728, + "step": 47404 + }, + { + "epoch": 2.2070442535558814, + "grad_norm": 0.33371085554784213, + "learning_rate": 1.9810190765706717e-05, + "loss": 2.6357, + "step": 47405 + }, + { + "epoch": 2.2070908117419745, + "grad_norm": 0.3277101611497819, + "learning_rate": 1.9808031571173235e-05, + "loss": 2.6298, + "step": 47406 + }, + { + "epoch": 2.2071373699280676, + "grad_norm": 0.37147322744451444, + "learning_rate": 1.980587246525023e-05, + "loss": 2.7123, + "step": 47407 + }, + { + "epoch": 2.2071839281141608, + "grad_norm": 0.33894492572080664, + "learning_rate": 1.980371344794399e-05, + "loss": 2.7099, + "step": 47408 + }, + { + "epoch": 2.207230486300254, + "grad_norm": 0.33317195896639096, + "learning_rate": 1.9801554519260885e-05, + "loss": 2.6336, + "step": 47409 + }, + { + "epoch": 2.207277044486347, + "grad_norm": 0.33482061545724723, + "learning_rate": 1.9799395679207235e-05, + "loss": 2.5512, + "step": 47410 + }, + { + "epoch": 2.2073236026724397, + "grad_norm": 0.34540699125522545, + "learning_rate": 1.9797236927789386e-05, + "loss": 2.691, + "step": 47411 + }, + { + "epoch": 2.2073701608585328, + "grad_norm": 0.35948646508575455, + "learning_rate": 1.9795078265013688e-05, + "loss": 2.7302, + 
"step": 47412 + }, + { + "epoch": 2.207416719044626, + "grad_norm": 0.32873381776481037, + "learning_rate": 1.9792919690886425e-05, + "loss": 2.6301, + "step": 47413 + }, + { + "epoch": 2.207463277230719, + "grad_norm": 0.33227275376257626, + "learning_rate": 1.9790761205413998e-05, + "loss": 2.5558, + "step": 47414 + }, + { + "epoch": 2.207509835416812, + "grad_norm": 0.35632333853718906, + "learning_rate": 1.97886028086027e-05, + "loss": 2.7058, + "step": 47415 + }, + { + "epoch": 2.2075563936029052, + "grad_norm": 0.3474540759591032, + "learning_rate": 1.9786444500458874e-05, + "loss": 2.6758, + "step": 47416 + }, + { + "epoch": 2.2076029517889983, + "grad_norm": 0.3423196272119641, + "learning_rate": 1.9784286280988857e-05, + "loss": 2.6049, + "step": 47417 + }, + { + "epoch": 2.2076495099750915, + "grad_norm": 0.3370662157626772, + "learning_rate": 1.978212815019898e-05, + "loss": 2.705, + "step": 47418 + }, + { + "epoch": 2.2076960681611846, + "grad_norm": 0.3250213638253799, + "learning_rate": 1.9779970108095585e-05, + "loss": 2.5775, + "step": 47419 + }, + { + "epoch": 2.2077426263472777, + "grad_norm": 0.35477852432159435, + "learning_rate": 1.9777812154685016e-05, + "loss": 2.586, + "step": 47420 + }, + { + "epoch": 2.207789184533371, + "grad_norm": 0.3479438566665743, + "learning_rate": 1.9775654289973573e-05, + "loss": 2.741, + "step": 47421 + }, + { + "epoch": 2.2078357427194635, + "grad_norm": 0.3284081967751697, + "learning_rate": 1.9773496513967604e-05, + "loss": 2.621, + "step": 47422 + }, + { + "epoch": 2.2078823009055566, + "grad_norm": 0.34557364177715644, + "learning_rate": 1.9771338826673452e-05, + "loss": 2.6012, + "step": 47423 + }, + { + "epoch": 2.2079288590916497, + "grad_norm": 0.34601998031104786, + "learning_rate": 1.976918122809744e-05, + "loss": 2.6586, + "step": 47424 + }, + { + "epoch": 2.207975417277743, + "grad_norm": 0.3238848484230384, + "learning_rate": 1.9767023718245913e-05, + "loss": 2.5977, + "step": 47425 + }, + { + "epoch": 2.208021975463836, + "grad_norm": 0.3310882186946962, + "learning_rate": 1.9764866297125166e-05, + "loss": 2.51, + "step": 47426 + }, + { + "epoch": 2.208068533649929, + "grad_norm": 0.3347554785732838, + "learning_rate": 1.9762708964741594e-05, + "loss": 2.5175, + "step": 47427 + }, + { + "epoch": 2.208115091836022, + "grad_norm": 0.3291017608488934, + "learning_rate": 1.976055172110145e-05, + "loss": 2.6074, + "step": 47428 + }, + { + "epoch": 2.2081616500221153, + "grad_norm": 0.31391162434698433, + "learning_rate": 1.9758394566211148e-05, + "loss": 2.616, + "step": 47429 + }, + { + "epoch": 2.2082082082082084, + "grad_norm": 0.3378102262711051, + "learning_rate": 1.9756237500076956e-05, + "loss": 2.5775, + "step": 47430 + }, + { + "epoch": 2.208254766394301, + "grad_norm": 0.32155908780449055, + "learning_rate": 1.9754080522705238e-05, + "loss": 2.6474, + "step": 47431 + }, + { + "epoch": 2.208301324580394, + "grad_norm": 0.32731625188367747, + "learning_rate": 1.9751923634102304e-05, + "loss": 2.6927, + "step": 47432 + }, + { + "epoch": 2.2083478827664873, + "grad_norm": 0.31959038009999113, + "learning_rate": 1.9749766834274512e-05, + "loss": 2.5928, + "step": 47433 + }, + { + "epoch": 2.2083944409525804, + "grad_norm": 0.3312209950352817, + "learning_rate": 1.974761012322816e-05, + "loss": 2.6579, + "step": 47434 + }, + { + "epoch": 2.2084409991386735, + "grad_norm": 0.3111760523884745, + "learning_rate": 1.9745453500969597e-05, + "loss": 2.5816, + "step": 47435 + }, + { + "epoch": 2.2084875573247666, + "grad_norm": 
0.32112942668199534, + "learning_rate": 1.9743296967505136e-05, + "loss": 2.6419, + "step": 47436 + }, + { + "epoch": 2.2085341155108598, + "grad_norm": 0.32883121226681383, + "learning_rate": 1.9741140522841128e-05, + "loss": 2.685, + "step": 47437 + }, + { + "epoch": 2.208580673696953, + "grad_norm": 0.3386273145081496, + "learning_rate": 1.9738984166983904e-05, + "loss": 2.6945, + "step": 47438 + }, + { + "epoch": 2.208627231883046, + "grad_norm": 0.33715382587614223, + "learning_rate": 1.973682789993974e-05, + "loss": 2.7484, + "step": 47439 + }, + { + "epoch": 2.208673790069139, + "grad_norm": 0.34445910277516145, + "learning_rate": 1.973467172171505e-05, + "loss": 2.5558, + "step": 47440 + }, + { + "epoch": 2.208720348255232, + "grad_norm": 0.33373292906381313, + "learning_rate": 1.9732515632316077e-05, + "loss": 2.6298, + "step": 47441 + }, + { + "epoch": 2.208766906441325, + "grad_norm": 0.319918929320408, + "learning_rate": 1.9730359631749225e-05, + "loss": 2.602, + "step": 47442 + }, + { + "epoch": 2.208813464627418, + "grad_norm": 0.3358708824567902, + "learning_rate": 1.9728203720020767e-05, + "loss": 2.6194, + "step": 47443 + }, + { + "epoch": 2.208860022813511, + "grad_norm": 0.3539810112963982, + "learning_rate": 1.9726047897137046e-05, + "loss": 2.7107, + "step": 47444 + }, + { + "epoch": 2.2089065809996042, + "grad_norm": 0.32678592610444884, + "learning_rate": 1.9723892163104397e-05, + "loss": 2.6302, + "step": 47445 + }, + { + "epoch": 2.2089531391856974, + "grad_norm": 0.3204867992497953, + "learning_rate": 1.972173651792914e-05, + "loss": 2.6441, + "step": 47446 + }, + { + "epoch": 2.2089996973717905, + "grad_norm": 0.3525526210613898, + "learning_rate": 1.9719580961617618e-05, + "loss": 2.7022, + "step": 47447 + }, + { + "epoch": 2.2090462555578836, + "grad_norm": 0.31915853778572895, + "learning_rate": 1.971742549417613e-05, + "loss": 2.6745, + "step": 47448 + }, + { + "epoch": 2.2090928137439767, + "grad_norm": 0.3165946976760662, + "learning_rate": 1.971527011561101e-05, + "loss": 2.5215, + "step": 47449 + }, + { + "epoch": 2.2091393719300694, + "grad_norm": 0.34430440355853736, + "learning_rate": 1.971311482592859e-05, + "loss": 2.7019, + "step": 47450 + }, + { + "epoch": 2.2091859301161625, + "grad_norm": 0.34095504923024783, + "learning_rate": 1.9710959625135217e-05, + "loss": 2.7027, + "step": 47451 + }, + { + "epoch": 2.2092324883022556, + "grad_norm": 0.3252851759486128, + "learning_rate": 1.9708804513237156e-05, + "loss": 2.6052, + "step": 47452 + }, + { + "epoch": 2.2092790464883487, + "grad_norm": 0.3461124141410244, + "learning_rate": 1.9706649490240802e-05, + "loss": 2.6584, + "step": 47453 + }, + { + "epoch": 2.209325604674442, + "grad_norm": 0.34482383541756284, + "learning_rate": 1.9704494556152415e-05, + "loss": 2.6476, + "step": 47454 + }, + { + "epoch": 2.209372162860535, + "grad_norm": 0.33661545097478784, + "learning_rate": 1.9702339710978385e-05, + "loss": 2.5994, + "step": 47455 + }, + { + "epoch": 2.209418721046628, + "grad_norm": 0.3233770960715636, + "learning_rate": 1.970018495472498e-05, + "loss": 2.6432, + "step": 47456 + }, + { + "epoch": 2.209465279232721, + "grad_norm": 0.3376807822742844, + "learning_rate": 1.9698030287398556e-05, + "loss": 2.5723, + "step": 47457 + }, + { + "epoch": 2.2095118374188143, + "grad_norm": 0.3271409776060337, + "learning_rate": 1.9695875709005422e-05, + "loss": 2.577, + "step": 47458 + }, + { + "epoch": 2.2095583956049074, + "grad_norm": 0.31220859678351026, + "learning_rate": 1.9693721219551914e-05, + 
"loss": 2.5706, + "step": 47459 + }, + { + "epoch": 2.2096049537910005, + "grad_norm": 0.3282909074387492, + "learning_rate": 1.969156681904436e-05, + "loss": 2.6835, + "step": 47460 + }, + { + "epoch": 2.209651511977093, + "grad_norm": 0.3561720189958656, + "learning_rate": 1.9689412507489058e-05, + "loss": 2.6986, + "step": 47461 + }, + { + "epoch": 2.2096980701631863, + "grad_norm": 0.32583499000369665, + "learning_rate": 1.968725828489234e-05, + "loss": 2.7139, + "step": 47462 + }, + { + "epoch": 2.2097446283492794, + "grad_norm": 0.34637531703042546, + "learning_rate": 1.9685104151260535e-05, + "loss": 2.7025, + "step": 47463 + }, + { + "epoch": 2.2097911865353725, + "grad_norm": 0.33160753540035315, + "learning_rate": 1.9682950106599968e-05, + "loss": 2.6651, + "step": 47464 + }, + { + "epoch": 2.2098377447214657, + "grad_norm": 0.37453954018828567, + "learning_rate": 1.9680796150916947e-05, + "loss": 2.6876, + "step": 47465 + }, + { + "epoch": 2.2098843029075588, + "grad_norm": 0.3302643905337935, + "learning_rate": 1.9678642284217823e-05, + "loss": 2.5988, + "step": 47466 + }, + { + "epoch": 2.209930861093652, + "grad_norm": 0.32190258323778964, + "learning_rate": 1.9676488506508868e-05, + "loss": 2.6218, + "step": 47467 + }, + { + "epoch": 2.209977419279745, + "grad_norm": 0.35135420357243174, + "learning_rate": 1.967433481779646e-05, + "loss": 2.5757, + "step": 47468 + }, + { + "epoch": 2.210023977465838, + "grad_norm": 0.33760660985315327, + "learning_rate": 1.9672181218086873e-05, + "loss": 2.6108, + "step": 47469 + }, + { + "epoch": 2.210070535651931, + "grad_norm": 0.3427990692946981, + "learning_rate": 1.967002770738645e-05, + "loss": 2.6275, + "step": 47470 + }, + { + "epoch": 2.210117093838024, + "grad_norm": 0.3592601674501539, + "learning_rate": 1.9667874285701503e-05, + "loss": 2.7508, + "step": 47471 + }, + { + "epoch": 2.210163652024117, + "grad_norm": 0.34155747490846855, + "learning_rate": 1.9665720953038364e-05, + "loss": 2.6649, + "step": 47472 + }, + { + "epoch": 2.21021021021021, + "grad_norm": 0.31705310425605837, + "learning_rate": 1.9663567709403357e-05, + "loss": 2.663, + "step": 47473 + }, + { + "epoch": 2.2102567683963033, + "grad_norm": 0.34853076097559343, + "learning_rate": 1.9661414554802775e-05, + "loss": 2.7425, + "step": 47474 + }, + { + "epoch": 2.2103033265823964, + "grad_norm": 0.31539184995967545, + "learning_rate": 1.9659261489242954e-05, + "loss": 2.6073, + "step": 47475 + }, + { + "epoch": 2.2103498847684895, + "grad_norm": 0.3315289390021457, + "learning_rate": 1.965710851273021e-05, + "loss": 2.6927, + "step": 47476 + }, + { + "epoch": 2.2103964429545826, + "grad_norm": 0.3317364870419098, + "learning_rate": 1.965495562527086e-05, + "loss": 2.6095, + "step": 47477 + }, + { + "epoch": 2.2104430011406757, + "grad_norm": 0.32833942182903114, + "learning_rate": 1.965280282687123e-05, + "loss": 2.62, + "step": 47478 + }, + { + "epoch": 2.210489559326769, + "grad_norm": 0.3138648460559724, + "learning_rate": 1.965065011753765e-05, + "loss": 2.5282, + "step": 47479 + }, + { + "epoch": 2.2105361175128615, + "grad_norm": 0.3347970267153968, + "learning_rate": 1.9648497497276392e-05, + "loss": 2.6214, + "step": 47480 + }, + { + "epoch": 2.2105826756989546, + "grad_norm": 0.36212911075501275, + "learning_rate": 1.9646344966093832e-05, + "loss": 2.7538, + "step": 47481 + }, + { + "epoch": 2.2106292338850477, + "grad_norm": 0.3064669225210776, + "learning_rate": 1.9644192523996223e-05, + "loss": 2.5894, + "step": 47482 + }, + { + "epoch": 
2.210675792071141, + "grad_norm": 0.3289235105565263, + "learning_rate": 1.964204017098996e-05, + "loss": 2.6902, + "step": 47483 + }, + { + "epoch": 2.210722350257234, + "grad_norm": 0.38667573421458956, + "learning_rate": 1.9639887907081294e-05, + "loss": 2.7248, + "step": 47484 + }, + { + "epoch": 2.210768908443327, + "grad_norm": 0.3171609635268697, + "learning_rate": 1.9637735732276574e-05, + "loss": 2.5851, + "step": 47485 + }, + { + "epoch": 2.21081546662942, + "grad_norm": 0.3316123359291926, + "learning_rate": 1.9635583646582118e-05, + "loss": 2.6708, + "step": 47486 + }, + { + "epoch": 2.2108620248155133, + "grad_norm": 0.35074111438143407, + "learning_rate": 1.963343165000422e-05, + "loss": 2.6487, + "step": 47487 + }, + { + "epoch": 2.2109085830016064, + "grad_norm": 0.3482121926877388, + "learning_rate": 1.963127974254921e-05, + "loss": 2.6478, + "step": 47488 + }, + { + "epoch": 2.2109551411876995, + "grad_norm": 0.32155172900125706, + "learning_rate": 1.9629127924223394e-05, + "loss": 2.6091, + "step": 47489 + }, + { + "epoch": 2.211001699373792, + "grad_norm": 0.3574094194723466, + "learning_rate": 1.96269761950331e-05, + "loss": 2.7014, + "step": 47490 + }, + { + "epoch": 2.2110482575598853, + "grad_norm": 0.33458380065553583, + "learning_rate": 1.9624824554984645e-05, + "loss": 2.611, + "step": 47491 + }, + { + "epoch": 2.2110948157459784, + "grad_norm": 0.33295902189390136, + "learning_rate": 1.9622673004084346e-05, + "loss": 2.6236, + "step": 47492 + }, + { + "epoch": 2.2111413739320716, + "grad_norm": 0.34019239269217094, + "learning_rate": 1.9620521542338476e-05, + "loss": 2.6736, + "step": 47493 + }, + { + "epoch": 2.2111879321181647, + "grad_norm": 0.32846839143304024, + "learning_rate": 1.961837016975342e-05, + "loss": 2.6906, + "step": 47494 + }, + { + "epoch": 2.211234490304258, + "grad_norm": 0.35291566286090936, + "learning_rate": 1.9616218886335418e-05, + "loss": 2.6746, + "step": 47495 + }, + { + "epoch": 2.211281048490351, + "grad_norm": 0.32820287767077627, + "learning_rate": 1.9614067692090854e-05, + "loss": 2.6876, + "step": 47496 + }, + { + "epoch": 2.211327606676444, + "grad_norm": 0.3287127195168121, + "learning_rate": 1.9611916587025997e-05, + "loss": 2.6915, + "step": 47497 + }, + { + "epoch": 2.211374164862537, + "grad_norm": 0.3344249424522124, + "learning_rate": 1.9609765571147165e-05, + "loss": 2.665, + "step": 47498 + }, + { + "epoch": 2.2114207230486302, + "grad_norm": 0.330405645934739, + "learning_rate": 1.960761464446069e-05, + "loss": 2.6503, + "step": 47499 + }, + { + "epoch": 2.211467281234723, + "grad_norm": 0.34006040406208726, + "learning_rate": 1.9605463806972845e-05, + "loss": 2.6359, + "step": 47500 + }, + { + "epoch": 2.211513839420816, + "grad_norm": 0.31957037326846516, + "learning_rate": 1.9603313058690003e-05, + "loss": 2.7286, + "step": 47501 + }, + { + "epoch": 2.211560397606909, + "grad_norm": 0.3713996604317442, + "learning_rate": 1.9601162399618422e-05, + "loss": 2.7042, + "step": 47502 + }, + { + "epoch": 2.2116069557930023, + "grad_norm": 0.34344757698100303, + "learning_rate": 1.9599011829764442e-05, + "loss": 2.5662, + "step": 47503 + }, + { + "epoch": 2.2116535139790954, + "grad_norm": 0.31480108765432896, + "learning_rate": 1.959686134913436e-05, + "loss": 2.6313, + "step": 47504 + }, + { + "epoch": 2.2117000721651885, + "grad_norm": 0.33542783301667506, + "learning_rate": 1.9594710957734514e-05, + "loss": 2.6621, + "step": 47505 + }, + { + "epoch": 2.2117466303512816, + "grad_norm": 0.35320990992877044, + 
"learning_rate": 1.9592560655571158e-05, + "loss": 2.5864, + "step": 47506 + }, + { + "epoch": 2.2117931885373747, + "grad_norm": 0.34019133708283855, + "learning_rate": 1.9590410442650682e-05, + "loss": 2.6968, + "step": 47507 + }, + { + "epoch": 2.211839746723468, + "grad_norm": 0.3493992115822036, + "learning_rate": 1.9588260318979314e-05, + "loss": 2.6832, + "step": 47508 + }, + { + "epoch": 2.2118863049095605, + "grad_norm": 0.3339948135046372, + "learning_rate": 1.9586110284563445e-05, + "loss": 2.5799, + "step": 47509 + }, + { + "epoch": 2.2119328630956536, + "grad_norm": 0.3165087585776766, + "learning_rate": 1.958396033940933e-05, + "loss": 2.6776, + "step": 47510 + }, + { + "epoch": 2.2119794212817467, + "grad_norm": 0.34811298149080105, + "learning_rate": 1.958181048352329e-05, + "loss": 2.6837, + "step": 47511 + }, + { + "epoch": 2.21202597946784, + "grad_norm": 0.34840815726596985, + "learning_rate": 1.9579660716911657e-05, + "loss": 2.7301, + "step": 47512 + }, + { + "epoch": 2.212072537653933, + "grad_norm": 0.32058715259182496, + "learning_rate": 1.957751103958069e-05, + "loss": 2.6899, + "step": 47513 + }, + { + "epoch": 2.212119095840026, + "grad_norm": 0.3162671897940496, + "learning_rate": 1.9575361451536773e-05, + "loss": 2.5956, + "step": 47514 + }, + { + "epoch": 2.212165654026119, + "grad_norm": 0.3383496202952749, + "learning_rate": 1.9573211952786147e-05, + "loss": 2.6053, + "step": 47515 + }, + { + "epoch": 2.2122122122122123, + "grad_norm": 0.37511378692300745, + "learning_rate": 1.9571062543335145e-05, + "loss": 2.6783, + "step": 47516 + }, + { + "epoch": 2.2122587703983054, + "grad_norm": 0.34060211788367256, + "learning_rate": 1.9568913223190082e-05, + "loss": 2.5657, + "step": 47517 + }, + { + "epoch": 2.2123053285843985, + "grad_norm": 0.3490959541320222, + "learning_rate": 1.9566763992357258e-05, + "loss": 2.6721, + "step": 47518 + }, + { + "epoch": 2.212351886770491, + "grad_norm": 0.34051447870326046, + "learning_rate": 1.9564614850842983e-05, + "loss": 2.7102, + "step": 47519 + }, + { + "epoch": 2.2123984449565843, + "grad_norm": 0.33382007152798704, + "learning_rate": 1.9562465798653585e-05, + "loss": 2.6757, + "step": 47520 + }, + { + "epoch": 2.2124450031426774, + "grad_norm": 0.32799649809376236, + "learning_rate": 1.9560316835795317e-05, + "loss": 2.6764, + "step": 47521 + }, + { + "epoch": 2.2124915613287706, + "grad_norm": 0.32182688256892206, + "learning_rate": 1.9558167962274553e-05, + "loss": 2.6945, + "step": 47522 + }, + { + "epoch": 2.2125381195148637, + "grad_norm": 0.31349585326675633, + "learning_rate": 1.955601917809755e-05, + "loss": 2.5598, + "step": 47523 + }, + { + "epoch": 2.212584677700957, + "grad_norm": 0.35901586960210446, + "learning_rate": 1.955387048327063e-05, + "loss": 2.6521, + "step": 47524 + }, + { + "epoch": 2.21263123588705, + "grad_norm": 0.3316891441210973, + "learning_rate": 1.9551721877800123e-05, + "loss": 2.7029, + "step": 47525 + }, + { + "epoch": 2.212677794073143, + "grad_norm": 0.32728997952701083, + "learning_rate": 1.954957336169227e-05, + "loss": 2.6834, + "step": 47526 + }, + { + "epoch": 2.212724352259236, + "grad_norm": 0.33286049777625604, + "learning_rate": 1.9547424934953457e-05, + "loss": 2.5962, + "step": 47527 + }, + { + "epoch": 2.2127709104453293, + "grad_norm": 0.346913178575649, + "learning_rate": 1.954527659758994e-05, + "loss": 2.6434, + "step": 47528 + }, + { + "epoch": 2.212817468631422, + "grad_norm": 0.3351540121582209, + "learning_rate": 1.954312834960803e-05, + "loss": 2.7681, + 
"step": 47529 + }, + { + "epoch": 2.212864026817515, + "grad_norm": 0.3303404151854055, + "learning_rate": 1.954098019101404e-05, + "loss": 2.6063, + "step": 47530 + }, + { + "epoch": 2.212910585003608, + "grad_norm": 0.3281889064130081, + "learning_rate": 1.9538832121814272e-05, + "loss": 2.6489, + "step": 47531 + }, + { + "epoch": 2.2129571431897013, + "grad_norm": 0.3298774952168594, + "learning_rate": 1.9536684142015026e-05, + "loss": 2.6794, + "step": 47532 + }, + { + "epoch": 2.2130037013757944, + "grad_norm": 0.3611184488239686, + "learning_rate": 1.9534536251622636e-05, + "loss": 2.7371, + "step": 47533 + }, + { + "epoch": 2.2130502595618875, + "grad_norm": 0.3408524400592368, + "learning_rate": 1.953238845064334e-05, + "loss": 2.6698, + "step": 47534 + }, + { + "epoch": 2.2130968177479806, + "grad_norm": 0.3430606767015318, + "learning_rate": 1.9530240739083522e-05, + "loss": 2.6531, + "step": 47535 + }, + { + "epoch": 2.2131433759340737, + "grad_norm": 0.3479477073374065, + "learning_rate": 1.9528093116949427e-05, + "loss": 2.6214, + "step": 47536 + }, + { + "epoch": 2.213189934120167, + "grad_norm": 0.3656722006064021, + "learning_rate": 1.9525945584247374e-05, + "loss": 2.6894, + "step": 47537 + }, + { + "epoch": 2.21323649230626, + "grad_norm": 0.3393547578740078, + "learning_rate": 1.952379814098369e-05, + "loss": 2.6197, + "step": 47538 + }, + { + "epoch": 2.2132830504923526, + "grad_norm": 0.36930684951377807, + "learning_rate": 1.9521650787164618e-05, + "loss": 2.6283, + "step": 47539 + }, + { + "epoch": 2.2133296086784457, + "grad_norm": 0.344894756200535, + "learning_rate": 1.9519503522796534e-05, + "loss": 2.6939, + "step": 47540 + }, + { + "epoch": 2.213376166864539, + "grad_norm": 0.3137975892469956, + "learning_rate": 1.951735634788569e-05, + "loss": 2.6211, + "step": 47541 + }, + { + "epoch": 2.213422725050632, + "grad_norm": 0.3327564729765618, + "learning_rate": 1.95152092624384e-05, + "loss": 2.7009, + "step": 47542 + }, + { + "epoch": 2.213469283236725, + "grad_norm": 0.3545142665108747, + "learning_rate": 1.9513062266460964e-05, + "loss": 2.641, + "step": 47543 + }, + { + "epoch": 2.213515841422818, + "grad_norm": 0.33537255868229293, + "learning_rate": 1.9510915359959693e-05, + "loss": 2.6082, + "step": 47544 + }, + { + "epoch": 2.2135623996089113, + "grad_norm": 0.3149163804733013, + "learning_rate": 1.9508768542940876e-05, + "loss": 2.5637, + "step": 47545 + }, + { + "epoch": 2.2136089577950044, + "grad_norm": 0.3469367180292126, + "learning_rate": 1.9506621815410837e-05, + "loss": 2.6001, + "step": 47546 + }, + { + "epoch": 2.2136555159810976, + "grad_norm": 0.32759346708853954, + "learning_rate": 1.9504475177375825e-05, + "loss": 2.5864, + "step": 47547 + }, + { + "epoch": 2.2137020741671902, + "grad_norm": 0.32001622037471256, + "learning_rate": 1.9502328628842215e-05, + "loss": 2.5775, + "step": 47548 + }, + { + "epoch": 2.2137486323532833, + "grad_norm": 0.33809045606867544, + "learning_rate": 1.950018216981624e-05, + "loss": 2.6051, + "step": 47549 + }, + { + "epoch": 2.2137951905393765, + "grad_norm": 0.31740602881411606, + "learning_rate": 1.949803580030423e-05, + "loss": 2.6438, + "step": 47550 + }, + { + "epoch": 2.2138417487254696, + "grad_norm": 0.33529935222266677, + "learning_rate": 1.94958895203125e-05, + "loss": 2.6855, + "step": 47551 + }, + { + "epoch": 2.2138883069115627, + "grad_norm": 0.3254002931560069, + "learning_rate": 1.9493743329847285e-05, + "loss": 2.588, + "step": 47552 + }, + { + "epoch": 2.213934865097656, + "grad_norm": 
0.3349621183256839, + "learning_rate": 1.949159722891497e-05, + "loss": 2.6249, + "step": 47553 + }, + { + "epoch": 2.213981423283749, + "grad_norm": 0.33780807208351293, + "learning_rate": 1.9489451217521777e-05, + "loss": 2.716, + "step": 47554 + }, + { + "epoch": 2.214027981469842, + "grad_norm": 0.34189492374374714, + "learning_rate": 1.9487305295674075e-05, + "loss": 2.5625, + "step": 47555 + }, + { + "epoch": 2.214074539655935, + "grad_norm": 0.3369769871551877, + "learning_rate": 1.9485159463378104e-05, + "loss": 2.6038, + "step": 47556 + }, + { + "epoch": 2.2141210978420283, + "grad_norm": 0.34909270569296325, + "learning_rate": 1.9483013720640187e-05, + "loss": 2.6397, + "step": 47557 + }, + { + "epoch": 2.2141676560281214, + "grad_norm": 0.3413949916368822, + "learning_rate": 1.9480868067466612e-05, + "loss": 2.6513, + "step": 47558 + }, + { + "epoch": 2.214214214214214, + "grad_norm": 0.3425448994819056, + "learning_rate": 1.9478722503863705e-05, + "loss": 2.7285, + "step": 47559 + }, + { + "epoch": 2.214260772400307, + "grad_norm": 0.3191315438453337, + "learning_rate": 1.9476577029837708e-05, + "loss": 2.5779, + "step": 47560 + }, + { + "epoch": 2.2143073305864003, + "grad_norm": 0.31817005593602876, + "learning_rate": 1.947443164539498e-05, + "loss": 2.6298, + "step": 47561 + }, + { + "epoch": 2.2143538887724934, + "grad_norm": 0.3442567741469806, + "learning_rate": 1.9472286350541775e-05, + "loss": 2.6667, + "step": 47562 + }, + { + "epoch": 2.2144004469585865, + "grad_norm": 0.34567572169064087, + "learning_rate": 1.94701411452844e-05, + "loss": 2.6086, + "step": 47563 + }, + { + "epoch": 2.2144470051446796, + "grad_norm": 0.32798655161080154, + "learning_rate": 1.9467996029629155e-05, + "loss": 2.5832, + "step": 47564 + }, + { + "epoch": 2.2144935633307727, + "grad_norm": 0.33725107290015144, + "learning_rate": 1.946585100358233e-05, + "loss": 2.6621, + "step": 47565 + }, + { + "epoch": 2.214540121516866, + "grad_norm": 0.3399934813810507, + "learning_rate": 1.9463706067150246e-05, + "loss": 2.6637, + "step": 47566 + }, + { + "epoch": 2.214586679702959, + "grad_norm": 0.3271616628951867, + "learning_rate": 1.9461561220339135e-05, + "loss": 2.6021, + "step": 47567 + }, + { + "epoch": 2.2146332378890516, + "grad_norm": 0.3226834834759137, + "learning_rate": 1.945941646315538e-05, + "loss": 2.5801, + "step": 47568 + }, + { + "epoch": 2.2146797960751448, + "grad_norm": 0.32778651144213333, + "learning_rate": 1.9457271795605198e-05, + "loss": 2.6907, + "step": 47569 + }, + { + "epoch": 2.214726354261238, + "grad_norm": 0.3341176968383535, + "learning_rate": 1.9455127217694925e-05, + "loss": 2.6752, + "step": 47570 + }, + { + "epoch": 2.214772912447331, + "grad_norm": 0.32178148075241464, + "learning_rate": 1.9452982729430834e-05, + "loss": 2.6924, + "step": 47571 + }, + { + "epoch": 2.214819470633424, + "grad_norm": 0.3276316038393204, + "learning_rate": 1.945083833081924e-05, + "loss": 2.669, + "step": 47572 + }, + { + "epoch": 2.214866028819517, + "grad_norm": 0.31311366913078664, + "learning_rate": 1.944869402186642e-05, + "loss": 2.6834, + "step": 47573 + }, + { + "epoch": 2.2149125870056103, + "grad_norm": 0.32606794391137517, + "learning_rate": 1.944654980257869e-05, + "loss": 2.6766, + "step": 47574 + }, + { + "epoch": 2.2149591451917034, + "grad_norm": 0.3381792775891835, + "learning_rate": 1.9444405672962313e-05, + "loss": 2.6265, + "step": 47575 + }, + { + "epoch": 2.2150057033777966, + "grad_norm": 0.3149256270510245, + "learning_rate": 1.944226163302359e-05, + 
"loss": 2.5208, + "step": 47576 + }, + { + "epoch": 2.2150522615638897, + "grad_norm": 0.33396142112807287, + "learning_rate": 1.944011768276882e-05, + "loss": 2.6119, + "step": 47577 + }, + { + "epoch": 2.2150988197499824, + "grad_norm": 0.3436538058851055, + "learning_rate": 1.9437973822204296e-05, + "loss": 2.7039, + "step": 47578 + }, + { + "epoch": 2.2151453779360755, + "grad_norm": 0.32717916825967525, + "learning_rate": 1.9435830051336322e-05, + "loss": 2.621, + "step": 47579 + }, + { + "epoch": 2.2151919361221686, + "grad_norm": 0.35179742453122265, + "learning_rate": 1.9433686370171143e-05, + "loss": 2.6455, + "step": 47580 + }, + { + "epoch": 2.2152384943082617, + "grad_norm": 0.3282979709791167, + "learning_rate": 1.9431542778715122e-05, + "loss": 2.5969, + "step": 47581 + }, + { + "epoch": 2.215285052494355, + "grad_norm": 0.33588874582407685, + "learning_rate": 1.9429399276974493e-05, + "loss": 2.5611, + "step": 47582 + }, + { + "epoch": 2.215331610680448, + "grad_norm": 0.3744461539830862, + "learning_rate": 1.9427255864955558e-05, + "loss": 2.6991, + "step": 47583 + }, + { + "epoch": 2.215378168866541, + "grad_norm": 0.34904250621719896, + "learning_rate": 1.9425112542664618e-05, + "loss": 2.6022, + "step": 47584 + }, + { + "epoch": 2.215424727052634, + "grad_norm": 0.3167668846096211, + "learning_rate": 1.9422969310107963e-05, + "loss": 2.7131, + "step": 47585 + }, + { + "epoch": 2.2154712852387273, + "grad_norm": 0.32304758607165834, + "learning_rate": 1.9420826167291884e-05, + "loss": 2.5952, + "step": 47586 + }, + { + "epoch": 2.21551784342482, + "grad_norm": 0.3492497835520806, + "learning_rate": 1.941868311422268e-05, + "loss": 2.7039, + "step": 47587 + }, + { + "epoch": 2.215564401610913, + "grad_norm": 0.3388317533412518, + "learning_rate": 1.9416540150906616e-05, + "loss": 2.5991, + "step": 47588 + }, + { + "epoch": 2.215610959797006, + "grad_norm": 0.33324999846104497, + "learning_rate": 1.9414397277349988e-05, + "loss": 2.6145, + "step": 47589 + }, + { + "epoch": 2.2156575179830993, + "grad_norm": 0.3368147280936816, + "learning_rate": 1.941225449355909e-05, + "loss": 2.716, + "step": 47590 + }, + { + "epoch": 2.2157040761691924, + "grad_norm": 0.3259760600255396, + "learning_rate": 1.9410111799540216e-05, + "loss": 2.679, + "step": 47591 + }, + { + "epoch": 2.2157506343552855, + "grad_norm": 0.3218563278500406, + "learning_rate": 1.9407969195299668e-05, + "loss": 2.6129, + "step": 47592 + }, + { + "epoch": 2.2157971925413786, + "grad_norm": 0.33121646342035305, + "learning_rate": 1.940582668084368e-05, + "loss": 2.6574, + "step": 47593 + }, + { + "epoch": 2.2158437507274718, + "grad_norm": 0.33839764488871216, + "learning_rate": 1.940368425617861e-05, + "loss": 2.7344, + "step": 47594 + }, + { + "epoch": 2.215890308913565, + "grad_norm": 0.3275319784134891, + "learning_rate": 1.9401541921310695e-05, + "loss": 2.6729, + "step": 47595 + }, + { + "epoch": 2.215936867099658, + "grad_norm": 0.3265831206992785, + "learning_rate": 1.9399399676246243e-05, + "loss": 2.6312, + "step": 47596 + }, + { + "epoch": 2.215983425285751, + "grad_norm": 0.34201327263416514, + "learning_rate": 1.9397257520991535e-05, + "loss": 2.6542, + "step": 47597 + }, + { + "epoch": 2.2160299834718438, + "grad_norm": 0.3448816795815512, + "learning_rate": 1.9395115455552863e-05, + "loss": 2.6445, + "step": 47598 + }, + { + "epoch": 2.216076541657937, + "grad_norm": 0.31654512240149724, + "learning_rate": 1.9392973479936517e-05, + "loss": 2.6031, + "step": 47599 + }, + { + "epoch": 
2.21612309984403, + "grad_norm": 0.33632881395786235, + "learning_rate": 1.939083159414879e-05, + "loss": 2.5652, + "step": 47600 + }, + { + "epoch": 2.216169658030123, + "grad_norm": 0.3232150350823436, + "learning_rate": 1.9388689798195937e-05, + "loss": 2.6588, + "step": 47601 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 0.32642733670888774, + "learning_rate": 1.9386548092084268e-05, + "loss": 2.5528, + "step": 47602 + }, + { + "epoch": 2.2162627744023093, + "grad_norm": 0.3095834160983855, + "learning_rate": 1.9384406475820067e-05, + "loss": 2.6216, + "step": 47603 + }, + { + "epoch": 2.2163093325884025, + "grad_norm": 0.33011390670920054, + "learning_rate": 1.9382264949409612e-05, + "loss": 2.5648, + "step": 47604 + }, + { + "epoch": 2.2163558907744956, + "grad_norm": 0.3339575686997656, + "learning_rate": 1.9380123512859212e-05, + "loss": 2.6757, + "step": 47605 + }, + { + "epoch": 2.2164024489605887, + "grad_norm": 0.3412628102607234, + "learning_rate": 1.9377982166175095e-05, + "loss": 2.6798, + "step": 47606 + }, + { + "epoch": 2.2164490071466814, + "grad_norm": 0.34242752397454024, + "learning_rate": 1.9375840909363623e-05, + "loss": 2.6529, + "step": 47607 + }, + { + "epoch": 2.2164955653327745, + "grad_norm": 0.32966003295604535, + "learning_rate": 1.9373699742431e-05, + "loss": 2.5995, + "step": 47608 + }, + { + "epoch": 2.2165421235188676, + "grad_norm": 0.33708288701859795, + "learning_rate": 1.9371558665383592e-05, + "loss": 2.6911, + "step": 47609 + }, + { + "epoch": 2.2165886817049607, + "grad_norm": 0.3425915091465931, + "learning_rate": 1.9369417678227624e-05, + "loss": 2.6353, + "step": 47610 + }, + { + "epoch": 2.216635239891054, + "grad_norm": 0.31390181969111647, + "learning_rate": 1.9367276780969396e-05, + "loss": 2.5611, + "step": 47611 + }, + { + "epoch": 2.216681798077147, + "grad_norm": 0.3194853708438998, + "learning_rate": 1.9365135973615195e-05, + "loss": 2.393, + "step": 47612 + }, + { + "epoch": 2.21672835626324, + "grad_norm": 0.31037296771557105, + "learning_rate": 1.9362995256171302e-05, + "loss": 2.6138, + "step": 47613 + }, + { + "epoch": 2.216774914449333, + "grad_norm": 0.32026392068570786, + "learning_rate": 1.9360854628644015e-05, + "loss": 2.6431, + "step": 47614 + }, + { + "epoch": 2.2168214726354263, + "grad_norm": 0.3262220116168703, + "learning_rate": 1.9358714091039582e-05, + "loss": 2.6941, + "step": 47615 + }, + { + "epoch": 2.2168680308215194, + "grad_norm": 0.3483495951080868, + "learning_rate": 1.9356573643364305e-05, + "loss": 2.6137, + "step": 47616 + }, + { + "epoch": 2.216914589007612, + "grad_norm": 0.3282257972348419, + "learning_rate": 1.935443328562447e-05, + "loss": 2.6803, + "step": 47617 + }, + { + "epoch": 2.216961147193705, + "grad_norm": 0.3337458953125953, + "learning_rate": 1.9352293017826366e-05, + "loss": 2.6795, + "step": 47618 + }, + { + "epoch": 2.2170077053797983, + "grad_norm": 0.3447632943983351, + "learning_rate": 1.9350152839976233e-05, + "loss": 2.6438, + "step": 47619 + }, + { + "epoch": 2.2170542635658914, + "grad_norm": 0.32067903580071366, + "learning_rate": 1.9348012752080413e-05, + "loss": 2.7161, + "step": 47620 + }, + { + "epoch": 2.2171008217519845, + "grad_norm": 0.325691590644772, + "learning_rate": 1.9345872754145123e-05, + "loss": 2.5699, + "step": 47621 + }, + { + "epoch": 2.2171473799380776, + "grad_norm": 0.3271340159055574, + "learning_rate": 1.934373284617671e-05, + "loss": 2.5637, + "step": 47622 + }, + { + "epoch": 2.2171939381241708, + "grad_norm": 0.32574492077256534, + 
"learning_rate": 1.9341593028181405e-05, + "loss": 2.5898, + "step": 47623 + }, + { + "epoch": 2.217240496310264, + "grad_norm": 0.3186081678089233, + "learning_rate": 1.93394533001655e-05, + "loss": 2.6161, + "step": 47624 + }, + { + "epoch": 2.217287054496357, + "grad_norm": 0.33831887334896654, + "learning_rate": 1.933731366213528e-05, + "loss": 2.6453, + "step": 47625 + }, + { + "epoch": 2.2173336126824497, + "grad_norm": 0.32451184461111154, + "learning_rate": 1.9335174114097025e-05, + "loss": 2.4815, + "step": 47626 + }, + { + "epoch": 2.2173801708685428, + "grad_norm": 0.3654242421983805, + "learning_rate": 1.9333034656057036e-05, + "loss": 2.7224, + "step": 47627 + }, + { + "epoch": 2.217426729054636, + "grad_norm": 0.32374941692895914, + "learning_rate": 1.9330895288021544e-05, + "loss": 2.5675, + "step": 47628 + }, + { + "epoch": 2.217473287240729, + "grad_norm": 0.3346151836724831, + "learning_rate": 1.9328756009996856e-05, + "loss": 2.7226, + "step": 47629 + }, + { + "epoch": 2.217519845426822, + "grad_norm": 0.33463561443867496, + "learning_rate": 1.9326616821989245e-05, + "loss": 2.6297, + "step": 47630 + }, + { + "epoch": 2.2175664036129152, + "grad_norm": 0.34864752369168467, + "learning_rate": 1.9324477724004998e-05, + "loss": 2.7279, + "step": 47631 + }, + { + "epoch": 2.2176129617990084, + "grad_norm": 0.3434424811681088, + "learning_rate": 1.932233871605038e-05, + "loss": 2.5289, + "step": 47632 + }, + { + "epoch": 2.2176595199851015, + "grad_norm": 0.33610649705911383, + "learning_rate": 1.9320199798131698e-05, + "loss": 2.6573, + "step": 47633 + }, + { + "epoch": 2.2177060781711946, + "grad_norm": 0.36508501522246467, + "learning_rate": 1.9318060970255168e-05, + "loss": 2.7706, + "step": 47634 + }, + { + "epoch": 2.2177526363572877, + "grad_norm": 0.33579438365793784, + "learning_rate": 1.9315922232427148e-05, + "loss": 2.5525, + "step": 47635 + }, + { + "epoch": 2.217799194543381, + "grad_norm": 0.32238952032383483, + "learning_rate": 1.9313783584653848e-05, + "loss": 2.6209, + "step": 47636 + }, + { + "epoch": 2.2178457527294735, + "grad_norm": 0.336958991835167, + "learning_rate": 1.9311645026941577e-05, + "loss": 2.6387, + "step": 47637 + }, + { + "epoch": 2.2178923109155666, + "grad_norm": 0.3246614748928633, + "learning_rate": 1.93095065592966e-05, + "loss": 2.5737, + "step": 47638 + }, + { + "epoch": 2.2179388691016597, + "grad_norm": 0.35730053943097145, + "learning_rate": 1.93073681817252e-05, + "loss": 2.5836, + "step": 47639 + }, + { + "epoch": 2.217985427287753, + "grad_norm": 0.34117379480585386, + "learning_rate": 1.9305229894233667e-05, + "loss": 2.7205, + "step": 47640 + }, + { + "epoch": 2.218031985473846, + "grad_norm": 0.3260047948857172, + "learning_rate": 1.930309169682824e-05, + "loss": 2.6515, + "step": 47641 + }, + { + "epoch": 2.218078543659939, + "grad_norm": 0.3617680190346844, + "learning_rate": 1.930095358951522e-05, + "loss": 2.6431, + "step": 47642 + }, + { + "epoch": 2.218125101846032, + "grad_norm": 0.33497628617949154, + "learning_rate": 1.929881557230087e-05, + "loss": 2.6873, + "step": 47643 + }, + { + "epoch": 2.2181716600321253, + "grad_norm": 0.31744880234945305, + "learning_rate": 1.929667764519148e-05, + "loss": 2.7107, + "step": 47644 + }, + { + "epoch": 2.2182182182182184, + "grad_norm": 0.34535533740376173, + "learning_rate": 1.929453980819331e-05, + "loss": 2.7572, + "step": 47645 + }, + { + "epoch": 2.218264776404311, + "grad_norm": 0.31995332107626934, + "learning_rate": 1.929240206131266e-05, + "loss": 2.6568, + "step": 
47646 + }, + { + "epoch": 2.218311334590404, + "grad_norm": 0.3506451019005625, + "learning_rate": 1.929026440455575e-05, + "loss": 2.6284, + "step": 47647 + }, + { + "epoch": 2.2183578927764973, + "grad_norm": 0.337192880881334, + "learning_rate": 1.9288126837928926e-05, + "loss": 2.7, + "step": 47648 + }, + { + "epoch": 2.2184044509625904, + "grad_norm": 0.36190500048904795, + "learning_rate": 1.9285989361438406e-05, + "loss": 2.654, + "step": 47649 + }, + { + "epoch": 2.2184510091486835, + "grad_norm": 0.30093135170406604, + "learning_rate": 1.9283851975090478e-05, + "loss": 2.526, + "step": 47650 + }, + { + "epoch": 2.2184975673347767, + "grad_norm": 0.3422268021525712, + "learning_rate": 1.9281714678891422e-05, + "loss": 2.5857, + "step": 47651 + }, + { + "epoch": 2.2185441255208698, + "grad_norm": 0.331221705426491, + "learning_rate": 1.9279577472847504e-05, + "loss": 2.6632, + "step": 47652 + }, + { + "epoch": 2.218590683706963, + "grad_norm": 0.31322047811967485, + "learning_rate": 1.9277440356965016e-05, + "loss": 2.6696, + "step": 47653 + }, + { + "epoch": 2.218637241893056, + "grad_norm": 0.3281532117478789, + "learning_rate": 1.92753033312502e-05, + "loss": 2.6471, + "step": 47654 + }, + { + "epoch": 2.218683800079149, + "grad_norm": 0.3141778833235742, + "learning_rate": 1.927316639570934e-05, + "loss": 2.6444, + "step": 47655 + }, + { + "epoch": 2.218730358265242, + "grad_norm": 0.329514027687069, + "learning_rate": 1.9271029550348712e-05, + "loss": 2.5262, + "step": 47656 + }, + { + "epoch": 2.218776916451335, + "grad_norm": 0.3241185727000297, + "learning_rate": 1.926889279517458e-05, + "loss": 2.6426, + "step": 47657 + }, + { + "epoch": 2.218823474637428, + "grad_norm": 0.3130578167910957, + "learning_rate": 1.9266756130193226e-05, + "loss": 2.5267, + "step": 47658 + }, + { + "epoch": 2.218870032823521, + "grad_norm": 0.32617811880345504, + "learning_rate": 1.926461955541093e-05, + "loss": 2.6523, + "step": 47659 + }, + { + "epoch": 2.2189165910096142, + "grad_norm": 0.32480351318442874, + "learning_rate": 1.9262483070833914e-05, + "loss": 2.5705, + "step": 47660 + }, + { + "epoch": 2.2189631491957074, + "grad_norm": 0.3278451716552784, + "learning_rate": 1.9260346676468516e-05, + "loss": 2.6351, + "step": 47661 + }, + { + "epoch": 2.2190097073818005, + "grad_norm": 0.33286837543024433, + "learning_rate": 1.925821037232095e-05, + "loss": 2.4776, + "step": 47662 + }, + { + "epoch": 2.2190562655678936, + "grad_norm": 0.3405867926186565, + "learning_rate": 1.9256074158397515e-05, + "loss": 2.6384, + "step": 47663 + }, + { + "epoch": 2.2191028237539867, + "grad_norm": 0.3369147876303221, + "learning_rate": 1.925393803470447e-05, + "loss": 2.5659, + "step": 47664 + }, + { + "epoch": 2.2191493819400794, + "grad_norm": 0.3183785339713968, + "learning_rate": 1.9251802001248086e-05, + "loss": 2.6437, + "step": 47665 + }, + { + "epoch": 2.2191959401261725, + "grad_norm": 0.31382898065465276, + "learning_rate": 1.924966605803466e-05, + "loss": 2.5807, + "step": 47666 + }, + { + "epoch": 2.2192424983122656, + "grad_norm": 0.34403142891107247, + "learning_rate": 1.924753020507039e-05, + "loss": 2.6348, + "step": 47667 + }, + { + "epoch": 2.2192890564983587, + "grad_norm": 0.3270701312401619, + "learning_rate": 1.924539444236163e-05, + "loss": 2.6147, + "step": 47668 + }, + { + "epoch": 2.219335614684452, + "grad_norm": 0.33658318698706763, + "learning_rate": 1.9243258769914586e-05, + "loss": 2.5831, + "step": 47669 + }, + { + "epoch": 2.219382172870545, + "grad_norm": 
0.33624910288035587, + "learning_rate": 1.924112318773555e-05, + "loss": 2.664, + "step": 47670 + }, + { + "epoch": 2.219428731056638, + "grad_norm": 0.32596273640953055, + "learning_rate": 1.923898769583079e-05, + "loss": 2.6803, + "step": 47671 + }, + { + "epoch": 2.219475289242731, + "grad_norm": 0.3284139198872438, + "learning_rate": 1.923685229420658e-05, + "loss": 2.5774, + "step": 47672 + }, + { + "epoch": 2.2195218474288243, + "grad_norm": 0.32368421019019233, + "learning_rate": 1.923471698286915e-05, + "loss": 2.6578, + "step": 47673 + }, + { + "epoch": 2.2195684056149174, + "grad_norm": 0.3228259404312456, + "learning_rate": 1.9232581761824832e-05, + "loss": 2.6822, + "step": 47674 + }, + { + "epoch": 2.2196149638010105, + "grad_norm": 0.34254848680027306, + "learning_rate": 1.9230446631079828e-05, + "loss": 2.6223, + "step": 47675 + }, + { + "epoch": 2.219661521987103, + "grad_norm": 0.3403887875622411, + "learning_rate": 1.9228311590640442e-05, + "loss": 2.6802, + "step": 47676 + }, + { + "epoch": 2.2197080801731963, + "grad_norm": 0.327767773645935, + "learning_rate": 1.922617664051292e-05, + "loss": 2.5399, + "step": 47677 + }, + { + "epoch": 2.2197546383592894, + "grad_norm": 0.34099860881554106, + "learning_rate": 1.9224041780703545e-05, + "loss": 2.5909, + "step": 47678 + }, + { + "epoch": 2.2198011965453825, + "grad_norm": 0.3442995497024233, + "learning_rate": 1.9221907011218594e-05, + "loss": 2.7289, + "step": 47679 + }, + { + "epoch": 2.2198477547314757, + "grad_norm": 0.3472125746731399, + "learning_rate": 1.9219772332064273e-05, + "loss": 2.702, + "step": 47680 + }, + { + "epoch": 2.219894312917569, + "grad_norm": 0.3463764668573161, + "learning_rate": 1.9217637743246923e-05, + "loss": 2.6872, + "step": 47681 + }, + { + "epoch": 2.219940871103662, + "grad_norm": 0.32918744462904087, + "learning_rate": 1.921550324477276e-05, + "loss": 2.584, + "step": 47682 + }, + { + "epoch": 2.219987429289755, + "grad_norm": 0.3492198238036438, + "learning_rate": 1.921336883664806e-05, + "loss": 2.7199, + "step": 47683 + }, + { + "epoch": 2.220033987475848, + "grad_norm": 0.33661562288140495, + "learning_rate": 1.921123451887909e-05, + "loss": 2.6471, + "step": 47684 + }, + { + "epoch": 2.220080545661941, + "grad_norm": 0.33008451771992064, + "learning_rate": 1.9209100291472117e-05, + "loss": 2.5713, + "step": 47685 + }, + { + "epoch": 2.220127103848034, + "grad_norm": 0.34207724518586097, + "learning_rate": 1.9206966154433396e-05, + "loss": 2.677, + "step": 47686 + }, + { + "epoch": 2.220173662034127, + "grad_norm": 0.3324481473127297, + "learning_rate": 1.9204832107769215e-05, + "loss": 2.6013, + "step": 47687 + }, + { + "epoch": 2.22022022022022, + "grad_norm": 0.34516488977963916, + "learning_rate": 1.9202698151485804e-05, + "loss": 2.6541, + "step": 47688 + }, + { + "epoch": 2.2202667784063133, + "grad_norm": 0.361669661016353, + "learning_rate": 1.9200564285589435e-05, + "loss": 2.692, + "step": 47689 + }, + { + "epoch": 2.2203133365924064, + "grad_norm": 0.326317388243007, + "learning_rate": 1.919843051008638e-05, + "loss": 2.6333, + "step": 47690 + }, + { + "epoch": 2.2203598947784995, + "grad_norm": 0.36537386731221827, + "learning_rate": 1.919629682498289e-05, + "loss": 2.6837, + "step": 47691 + }, + { + "epoch": 2.2204064529645926, + "grad_norm": 0.3419245626409265, + "learning_rate": 1.9194163230285255e-05, + "loss": 2.6532, + "step": 47692 + }, + { + "epoch": 2.2204530111506857, + "grad_norm": 0.316620770025533, + "learning_rate": 1.9192029725999682e-05, + "loss": 
2.6582, + "step": 47693 + }, + { + "epoch": 2.220499569336779, + "grad_norm": 0.34518332066751767, + "learning_rate": 1.9189896312132505e-05, + "loss": 2.6368, + "step": 47694 + }, + { + "epoch": 2.2205461275228715, + "grad_norm": 0.3521838034800442, + "learning_rate": 1.9187762988689926e-05, + "loss": 2.593, + "step": 47695 + }, + { + "epoch": 2.2205926857089646, + "grad_norm": 0.32300339313220916, + "learning_rate": 1.9185629755678225e-05, + "loss": 2.6904, + "step": 47696 + }, + { + "epoch": 2.2206392438950577, + "grad_norm": 0.3300134618618384, + "learning_rate": 1.918349661310367e-05, + "loss": 2.6328, + "step": 47697 + }, + { + "epoch": 2.220685802081151, + "grad_norm": 0.33747291005726926, + "learning_rate": 1.9181363560972514e-05, + "loss": 2.7043, + "step": 47698 + }, + { + "epoch": 2.220732360267244, + "grad_norm": 0.3207594380895358, + "learning_rate": 1.917923059929102e-05, + "loss": 2.5397, + "step": 47699 + }, + { + "epoch": 2.220778918453337, + "grad_norm": 0.3250794895507859, + "learning_rate": 1.9177097728065464e-05, + "loss": 2.6102, + "step": 47700 + }, + { + "epoch": 2.22082547663943, + "grad_norm": 0.33200237723949805, + "learning_rate": 1.917496494730207e-05, + "loss": 2.6583, + "step": 47701 + }, + { + "epoch": 2.2208720348255233, + "grad_norm": 0.3367126775059979, + "learning_rate": 1.9172832257007124e-05, + "loss": 2.6148, + "step": 47702 + }, + { + "epoch": 2.2209185930116164, + "grad_norm": 0.3290922622594891, + "learning_rate": 1.9170699657186873e-05, + "loss": 2.6196, + "step": 47703 + }, + { + "epoch": 2.2209651511977095, + "grad_norm": 0.35866222192836106, + "learning_rate": 1.9168567147847583e-05, + "loss": 2.6022, + "step": 47704 + }, + { + "epoch": 2.221011709383802, + "grad_norm": 0.3193472219399535, + "learning_rate": 1.9166434728995524e-05, + "loss": 2.5334, + "step": 47705 + }, + { + "epoch": 2.2210582675698953, + "grad_norm": 0.3272602936833424, + "learning_rate": 1.9164302400636912e-05, + "loss": 2.645, + "step": 47706 + }, + { + "epoch": 2.2211048257559884, + "grad_norm": 0.3233473962394866, + "learning_rate": 1.916217016277807e-05, + "loss": 2.6105, + "step": 47707 + }, + { + "epoch": 2.2211513839420816, + "grad_norm": 0.3431958696084907, + "learning_rate": 1.9160038015425202e-05, + "loss": 2.5592, + "step": 47708 + }, + { + "epoch": 2.2211979421281747, + "grad_norm": 0.33541651796625765, + "learning_rate": 1.9157905958584583e-05, + "loss": 2.7093, + "step": 47709 + }, + { + "epoch": 2.221244500314268, + "grad_norm": 0.33666675191947637, + "learning_rate": 1.9155773992262472e-05, + "loss": 2.6249, + "step": 47710 + }, + { + "epoch": 2.221291058500361, + "grad_norm": 0.32760162402338733, + "learning_rate": 1.9153642116465127e-05, + "loss": 2.5906, + "step": 47711 + }, + { + "epoch": 2.221337616686454, + "grad_norm": 0.3255912978417789, + "learning_rate": 1.9151510331198802e-05, + "loss": 2.5268, + "step": 47712 + }, + { + "epoch": 2.221384174872547, + "grad_norm": 0.3372469779294139, + "learning_rate": 1.9149378636469772e-05, + "loss": 2.7111, + "step": 47713 + }, + { + "epoch": 2.2214307330586403, + "grad_norm": 0.33141542607491065, + "learning_rate": 1.914724703228426e-05, + "loss": 2.7443, + "step": 47714 + }, + { + "epoch": 2.221477291244733, + "grad_norm": 0.34922049311596415, + "learning_rate": 1.9145115518648538e-05, + "loss": 2.6669, + "step": 47715 + }, + { + "epoch": 2.221523849430826, + "grad_norm": 0.32725881021007086, + "learning_rate": 1.914298409556886e-05, + "loss": 2.6686, + "step": 47716 + }, + { + "epoch": 2.221570407616919, + 
"grad_norm": 0.32602187087486256, + "learning_rate": 1.914085276305149e-05, + "loss": 2.6228, + "step": 47717 + }, + { + "epoch": 2.2216169658030123, + "grad_norm": 0.32941261907668784, + "learning_rate": 1.9138721521102693e-05, + "loss": 2.6748, + "step": 47718 + }, + { + "epoch": 2.2216635239891054, + "grad_norm": 0.3287316642219826, + "learning_rate": 1.913659036972867e-05, + "loss": 2.7523, + "step": 47719 + }, + { + "epoch": 2.2217100821751985, + "grad_norm": 0.3243359717558936, + "learning_rate": 1.9134459308935747e-05, + "loss": 2.6002, + "step": 47720 + }, + { + "epoch": 2.2217566403612916, + "grad_norm": 0.33695679218785135, + "learning_rate": 1.9132328338730115e-05, + "loss": 2.6297, + "step": 47721 + }, + { + "epoch": 2.2218031985473847, + "grad_norm": 0.36428063676790096, + "learning_rate": 1.913019745911809e-05, + "loss": 2.6063, + "step": 47722 + }, + { + "epoch": 2.221849756733478, + "grad_norm": 0.3352478456687085, + "learning_rate": 1.9128066670105875e-05, + "loss": 2.695, + "step": 47723 + }, + { + "epoch": 2.2218963149195705, + "grad_norm": 0.3575456459720786, + "learning_rate": 1.912593597169975e-05, + "loss": 2.693, + "step": 47724 + }, + { + "epoch": 2.2219428731056636, + "grad_norm": 0.31037904083284523, + "learning_rate": 1.9123805363905956e-05, + "loss": 2.5701, + "step": 47725 + }, + { + "epoch": 2.2219894312917567, + "grad_norm": 0.33288405478941163, + "learning_rate": 1.912167484673077e-05, + "loss": 2.5797, + "step": 47726 + }, + { + "epoch": 2.22203598947785, + "grad_norm": 0.3494649655803252, + "learning_rate": 1.9119544420180406e-05, + "loss": 2.712, + "step": 47727 + }, + { + "epoch": 2.222082547663943, + "grad_norm": 0.3376631611316465, + "learning_rate": 1.9117414084261143e-05, + "loss": 2.6085, + "step": 47728 + }, + { + "epoch": 2.222129105850036, + "grad_norm": 0.32857137507128814, + "learning_rate": 1.911528383897922e-05, + "loss": 2.6554, + "step": 47729 + }, + { + "epoch": 2.222175664036129, + "grad_norm": 0.362143757164952, + "learning_rate": 1.91131536843409e-05, + "loss": 2.7591, + "step": 47730 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3180881388612856, + "learning_rate": 1.9111023620352453e-05, + "loss": 2.6397, + "step": 47731 + }, + { + "epoch": 2.2222687804083154, + "grad_norm": 0.3592451783993841, + "learning_rate": 1.9108893647020066e-05, + "loss": 2.6769, + "step": 47732 + }, + { + "epoch": 2.2223153385944086, + "grad_norm": 0.337963651576502, + "learning_rate": 1.910676376435007e-05, + "loss": 2.5835, + "step": 47733 + }, + { + "epoch": 2.2223618967805017, + "grad_norm": 0.3426276784409824, + "learning_rate": 1.9104633972348646e-05, + "loss": 2.566, + "step": 47734 + }, + { + "epoch": 2.2224084549665943, + "grad_norm": 0.3207117936476323, + "learning_rate": 1.9102504271022113e-05, + "loss": 2.6797, + "step": 47735 + }, + { + "epoch": 2.2224550131526875, + "grad_norm": 0.34907736111531473, + "learning_rate": 1.9100374660376667e-05, + "loss": 2.6231, + "step": 47736 + }, + { + "epoch": 2.2225015713387806, + "grad_norm": 0.34242377092852294, + "learning_rate": 1.909824514041858e-05, + "loss": 2.6309, + "step": 47737 + }, + { + "epoch": 2.2225481295248737, + "grad_norm": 0.3205260540136862, + "learning_rate": 1.9096115711154093e-05, + "loss": 2.7198, + "step": 47738 + }, + { + "epoch": 2.222594687710967, + "grad_norm": 0.3214735216265862, + "learning_rate": 1.9093986372589467e-05, + "loss": 2.6237, + "step": 47739 + }, + { + "epoch": 2.22264124589706, + "grad_norm": 0.35438969545057625, + "learning_rate": 
1.909185712473096e-05, + "loss": 2.5189, + "step": 47740 + }, + { + "epoch": 2.222687804083153, + "grad_norm": 0.2966748058654626, + "learning_rate": 1.9089727967584796e-05, + "loss": 2.5597, + "step": 47741 + }, + { + "epoch": 2.222734362269246, + "grad_norm": 0.3463996652583281, + "learning_rate": 1.9087598901157232e-05, + "loss": 2.6133, + "step": 47742 + }, + { + "epoch": 2.2227809204553393, + "grad_norm": 0.35367348001286225, + "learning_rate": 1.9085469925454518e-05, + "loss": 2.6119, + "step": 47743 + }, + { + "epoch": 2.222827478641432, + "grad_norm": 0.3503623950030674, + "learning_rate": 1.9083341040482923e-05, + "loss": 2.6402, + "step": 47744 + }, + { + "epoch": 2.222874036827525, + "grad_norm": 0.34362002486666393, + "learning_rate": 1.908121224624864e-05, + "loss": 2.6297, + "step": 47745 + }, + { + "epoch": 2.222920595013618, + "grad_norm": 0.3472116754227947, + "learning_rate": 1.9079083542757993e-05, + "loss": 2.5941, + "step": 47746 + }, + { + "epoch": 2.2229671531997113, + "grad_norm": 0.34453706922812843, + "learning_rate": 1.907695493001715e-05, + "loss": 2.6909, + "step": 47747 + }, + { + "epoch": 2.2230137113858044, + "grad_norm": 0.33224119783523987, + "learning_rate": 1.907482640803243e-05, + "loss": 2.6139, + "step": 47748 + }, + { + "epoch": 2.2230602695718975, + "grad_norm": 0.3208384608463689, + "learning_rate": 1.9072697976810033e-05, + "loss": 2.5894, + "step": 47749 + }, + { + "epoch": 2.2231068277579906, + "grad_norm": 0.3686328038043271, + "learning_rate": 1.907056963635622e-05, + "loss": 2.6598, + "step": 47750 + }, + { + "epoch": 2.2231533859440837, + "grad_norm": 0.32053312484212226, + "learning_rate": 1.9068441386677243e-05, + "loss": 2.6264, + "step": 47751 + }, + { + "epoch": 2.223199944130177, + "grad_norm": 0.3298982999282002, + "learning_rate": 1.9066313227779343e-05, + "loss": 2.6449, + "step": 47752 + }, + { + "epoch": 2.22324650231627, + "grad_norm": 0.3337693505347565, + "learning_rate": 1.906418515966878e-05, + "loss": 2.6303, + "step": 47753 + }, + { + "epoch": 2.2232930605023626, + "grad_norm": 0.34167971979993184, + "learning_rate": 1.9062057182351768e-05, + "loss": 2.5244, + "step": 47754 + }, + { + "epoch": 2.2233396186884558, + "grad_norm": 0.3388488561204009, + "learning_rate": 1.905992929583457e-05, + "loss": 2.6224, + "step": 47755 + }, + { + "epoch": 2.223386176874549, + "grad_norm": 0.35907326915009663, + "learning_rate": 1.9057801500123433e-05, + "loss": 2.7081, + "step": 47756 + }, + { + "epoch": 2.223432735060642, + "grad_norm": 0.33158651058638694, + "learning_rate": 1.9055673795224598e-05, + "loss": 2.5968, + "step": 47757 + }, + { + "epoch": 2.223479293246735, + "grad_norm": 0.33367178631882316, + "learning_rate": 1.9053546181144317e-05, + "loss": 2.6806, + "step": 47758 + }, + { + "epoch": 2.223525851432828, + "grad_norm": 0.33412688113404865, + "learning_rate": 1.905141865788884e-05, + "loss": 2.5478, + "step": 47759 + }, + { + "epoch": 2.2235724096189213, + "grad_norm": 0.3304209682523474, + "learning_rate": 1.904929122546437e-05, + "loss": 2.7006, + "step": 47760 + }, + { + "epoch": 2.2236189678050144, + "grad_norm": 0.3320410998081292, + "learning_rate": 1.904716388387722e-05, + "loss": 2.6589, + "step": 47761 + }, + { + "epoch": 2.2236655259911076, + "grad_norm": 0.3295714156954548, + "learning_rate": 1.9045036633133567e-05, + "loss": 2.5969, + "step": 47762 + }, + { + "epoch": 2.2237120841772002, + "grad_norm": 0.34519468891314264, + "learning_rate": 1.904290947323969e-05, + "loss": 2.729, + "step": 47763 + }, + { + 
"epoch": 2.2237586423632933, + "grad_norm": 0.32549698249703257, + "learning_rate": 1.9040782404201825e-05, + "loss": 2.6471, + "step": 47764 + }, + { + "epoch": 2.2238052005493865, + "grad_norm": 0.3373784096331641, + "learning_rate": 1.9038655426026203e-05, + "loss": 2.6055, + "step": 47765 + }, + { + "epoch": 2.2238517587354796, + "grad_norm": 0.3347876206596866, + "learning_rate": 1.9036528538719107e-05, + "loss": 2.5463, + "step": 47766 + }, + { + "epoch": 2.2238983169215727, + "grad_norm": 0.33355292934090325, + "learning_rate": 1.9034401742286723e-05, + "loss": 2.6421, + "step": 47767 + }, + { + "epoch": 2.223944875107666, + "grad_norm": 0.33519866917895996, + "learning_rate": 1.903227503673532e-05, + "loss": 2.6118, + "step": 47768 + }, + { + "epoch": 2.223991433293759, + "grad_norm": 0.3306462078864695, + "learning_rate": 1.903014842207114e-05, + "loss": 2.6376, + "step": 47769 + }, + { + "epoch": 2.224037991479852, + "grad_norm": 0.3339067745078706, + "learning_rate": 1.902802189830043e-05, + "loss": 2.6269, + "step": 47770 + }, + { + "epoch": 2.224084549665945, + "grad_norm": 0.3279335066171271, + "learning_rate": 1.9025895465429415e-05, + "loss": 2.6265, + "step": 47771 + }, + { + "epoch": 2.2241311078520383, + "grad_norm": 0.3325027082885978, + "learning_rate": 1.902376912346437e-05, + "loss": 2.6173, + "step": 47772 + }, + { + "epoch": 2.2241776660381314, + "grad_norm": 0.3265177361262917, + "learning_rate": 1.9021642872411473e-05, + "loss": 2.6854, + "step": 47773 + }, + { + "epoch": 2.224224224224224, + "grad_norm": 0.3353685275881382, + "learning_rate": 1.9019516712277036e-05, + "loss": 2.6056, + "step": 47774 + }, + { + "epoch": 2.224270782410317, + "grad_norm": 0.33976814292504387, + "learning_rate": 1.9017390643067235e-05, + "loss": 2.6933, + "step": 47775 + }, + { + "epoch": 2.2243173405964103, + "grad_norm": 0.3420006355804737, + "learning_rate": 1.901526466478837e-05, + "loss": 2.7379, + "step": 47776 + }, + { + "epoch": 2.2243638987825034, + "grad_norm": 0.3409990020384644, + "learning_rate": 1.9013138777446633e-05, + "loss": 2.6748, + "step": 47777 + }, + { + "epoch": 2.2244104569685965, + "grad_norm": 0.35931524349165367, + "learning_rate": 1.9011012981048282e-05, + "loss": 2.726, + "step": 47778 + }, + { + "epoch": 2.2244570151546896, + "grad_norm": 0.32571347996101724, + "learning_rate": 1.9008887275599576e-05, + "loss": 2.6633, + "step": 47779 + }, + { + "epoch": 2.2245035733407827, + "grad_norm": 0.35884232507304525, + "learning_rate": 1.9006761661106714e-05, + "loss": 2.6831, + "step": 47780 + }, + { + "epoch": 2.224550131526876, + "grad_norm": 0.34378331030718623, + "learning_rate": 1.9004636137575948e-05, + "loss": 2.7324, + "step": 47781 + }, + { + "epoch": 2.224596689712969, + "grad_norm": 0.3244702358806411, + "learning_rate": 1.9002510705013525e-05, + "loss": 2.5945, + "step": 47782 + }, + { + "epoch": 2.2246432478990616, + "grad_norm": 0.31437566582181986, + "learning_rate": 1.9000385363425684e-05, + "loss": 2.5384, + "step": 47783 + }, + { + "epoch": 2.2246898060851548, + "grad_norm": 0.3356476060176654, + "learning_rate": 1.899826011281865e-05, + "loss": 2.726, + "step": 47784 + }, + { + "epoch": 2.224736364271248, + "grad_norm": 0.3763948493838632, + "learning_rate": 1.899613495319869e-05, + "loss": 2.7728, + "step": 47785 + }, + { + "epoch": 2.224782922457341, + "grad_norm": 0.3368581566978845, + "learning_rate": 1.8994009884571983e-05, + "loss": 2.6302, + "step": 47786 + }, + { + "epoch": 2.224829480643434, + "grad_norm": 0.34558374973988426, + 
"learning_rate": 1.899188490694484e-05, + "loss": 2.6867, + "step": 47787 + }, + { + "epoch": 2.2248760388295272, + "grad_norm": 0.38194383021778633, + "learning_rate": 1.898976002032342e-05, + "loss": 2.7834, + "step": 47788 + }, + { + "epoch": 2.2249225970156203, + "grad_norm": 0.32926742443802903, + "learning_rate": 1.898763522471404e-05, + "loss": 2.677, + "step": 47789 + }, + { + "epoch": 2.2249691552017135, + "grad_norm": 0.3646068186638806, + "learning_rate": 1.898551052012288e-05, + "loss": 2.5434, + "step": 47790 + }, + { + "epoch": 2.2250157133878066, + "grad_norm": 0.3478463804506645, + "learning_rate": 1.8983385906556183e-05, + "loss": 2.6875, + "step": 47791 + }, + { + "epoch": 2.2250622715738997, + "grad_norm": 0.31273788133072655, + "learning_rate": 1.8981261384020216e-05, + "loss": 2.6537, + "step": 47792 + }, + { + "epoch": 2.2251088297599924, + "grad_norm": 0.3463318592423029, + "learning_rate": 1.8979136952521154e-05, + "loss": 2.707, + "step": 47793 + }, + { + "epoch": 2.2251553879460855, + "grad_norm": 0.341552654419067, + "learning_rate": 1.8977012612065304e-05, + "loss": 2.6851, + "step": 47794 + }, + { + "epoch": 2.2252019461321786, + "grad_norm": 0.33011819566475625, + "learning_rate": 1.8974888362658844e-05, + "loss": 2.568, + "step": 47795 + }, + { + "epoch": 2.2252485043182717, + "grad_norm": 0.37037747956735073, + "learning_rate": 1.8972764204308037e-05, + "loss": 2.7236, + "step": 47796 + }, + { + "epoch": 2.225295062504365, + "grad_norm": 0.3638071618373541, + "learning_rate": 1.897064013701911e-05, + "loss": 2.6583, + "step": 47797 + }, + { + "epoch": 2.225341620690458, + "grad_norm": 0.3069162421791453, + "learning_rate": 1.8968516160798312e-05, + "loss": 2.6131, + "step": 47798 + }, + { + "epoch": 2.225388178876551, + "grad_norm": 0.3286204522201734, + "learning_rate": 1.8966392275651823e-05, + "loss": 2.7188, + "step": 47799 + }, + { + "epoch": 2.225434737062644, + "grad_norm": 0.334830500332511, + "learning_rate": 1.8964268481585956e-05, + "loss": 2.5706, + "step": 47800 + }, + { + "epoch": 2.2254812952487373, + "grad_norm": 0.3293547644444062, + "learning_rate": 1.8962144778606867e-05, + "loss": 2.6155, + "step": 47801 + }, + { + "epoch": 2.22552785343483, + "grad_norm": 0.320502644671382, + "learning_rate": 1.896002116672086e-05, + "loss": 2.6938, + "step": 47802 + }, + { + "epoch": 2.225574411620923, + "grad_norm": 0.3190481707031043, + "learning_rate": 1.895789764593412e-05, + "loss": 2.5692, + "step": 47803 + }, + { + "epoch": 2.225620969807016, + "grad_norm": 0.3631151405957175, + "learning_rate": 1.8955774216252886e-05, + "loss": 2.7034, + "step": 47804 + }, + { + "epoch": 2.2256675279931093, + "grad_norm": 0.32085950102477095, + "learning_rate": 1.8953650877683415e-05, + "loss": 2.5744, + "step": 47805 + }, + { + "epoch": 2.2257140861792024, + "grad_norm": 0.3374587068037358, + "learning_rate": 1.8951527630231887e-05, + "loss": 2.7494, + "step": 47806 + }, + { + "epoch": 2.2257606443652955, + "grad_norm": 0.3251690965143595, + "learning_rate": 1.8949404473904603e-05, + "loss": 2.7307, + "step": 47807 + }, + { + "epoch": 2.2258072025513886, + "grad_norm": 0.33525195769179145, + "learning_rate": 1.8947281408707734e-05, + "loss": 2.6688, + "step": 47808 + }, + { + "epoch": 2.2258537607374818, + "grad_norm": 0.3260479669450465, + "learning_rate": 1.894515843464754e-05, + "loss": 2.6354, + "step": 47809 + }, + { + "epoch": 2.225900318923575, + "grad_norm": 0.3500977776152594, + "learning_rate": 1.894303555173025e-05, + "loss": 2.6806, + "step": 47810 
+ }, + { + "epoch": 2.225946877109668, + "grad_norm": 0.33171271073931957, + "learning_rate": 1.8940912759962083e-05, + "loss": 2.5918, + "step": 47811 + }, + { + "epoch": 2.225993435295761, + "grad_norm": 0.3127238729643083, + "learning_rate": 1.8938790059349283e-05, + "loss": 2.6108, + "step": 47812 + }, + { + "epoch": 2.2260399934818538, + "grad_norm": 0.2943631972852174, + "learning_rate": 1.8936667449898088e-05, + "loss": 2.6839, + "step": 47813 + }, + { + "epoch": 2.226086551667947, + "grad_norm": 0.3454533538468143, + "learning_rate": 1.893454493161468e-05, + "loss": 2.6034, + "step": 47814 + }, + { + "epoch": 2.22613310985404, + "grad_norm": 0.3374986675769848, + "learning_rate": 1.8932422504505358e-05, + "loss": 2.673, + "step": 47815 + }, + { + "epoch": 2.226179668040133, + "grad_norm": 0.3485541143801581, + "learning_rate": 1.8930300168576298e-05, + "loss": 2.695, + "step": 47816 + }, + { + "epoch": 2.2262262262262262, + "grad_norm": 0.31765556516609555, + "learning_rate": 1.8928177923833746e-05, + "loss": 2.677, + "step": 47817 + }, + { + "epoch": 2.2262727844123193, + "grad_norm": 0.3504434162708529, + "learning_rate": 1.8926055770283944e-05, + "loss": 2.664, + "step": 47818 + }, + { + "epoch": 2.2263193425984125, + "grad_norm": 0.348950366777134, + "learning_rate": 1.892393370793308e-05, + "loss": 2.628, + "step": 47819 + }, + { + "epoch": 2.2263659007845056, + "grad_norm": 0.33025103064530514, + "learning_rate": 1.8921811736787437e-05, + "loss": 2.6856, + "step": 47820 + }, + { + "epoch": 2.2264124589705987, + "grad_norm": 0.3385984392184756, + "learning_rate": 1.89196898568532e-05, + "loss": 2.6268, + "step": 47821 + }, + { + "epoch": 2.2264590171566914, + "grad_norm": 0.32384585220529877, + "learning_rate": 1.891756806813661e-05, + "loss": 2.6642, + "step": 47822 + }, + { + "epoch": 2.2265055753427845, + "grad_norm": 0.33174329934456503, + "learning_rate": 1.8915446370643895e-05, + "loss": 2.5677, + "step": 47823 + }, + { + "epoch": 2.2265521335288776, + "grad_norm": 0.33293371983456826, + "learning_rate": 1.8913324764381285e-05, + "loss": 2.7621, + "step": 47824 + }, + { + "epoch": 2.2265986917149707, + "grad_norm": 0.3364586020556483, + "learning_rate": 1.8911203249355008e-05, + "loss": 2.673, + "step": 47825 + }, + { + "epoch": 2.226645249901064, + "grad_norm": 0.32946229739335303, + "learning_rate": 1.89090818255713e-05, + "loss": 2.6984, + "step": 47826 + }, + { + "epoch": 2.226691808087157, + "grad_norm": 0.35612188905835346, + "learning_rate": 1.8906960493036334e-05, + "loss": 2.6683, + "step": 47827 + }, + { + "epoch": 2.22673836627325, + "grad_norm": 0.3619095898817589, + "learning_rate": 1.890483925175642e-05, + "loss": 2.6623, + "step": 47828 + }, + { + "epoch": 2.226784924459343, + "grad_norm": 0.3201901763204935, + "learning_rate": 1.8902718101737714e-05, + "loss": 2.6297, + "step": 47829 + }, + { + "epoch": 2.2268314826454363, + "grad_norm": 0.3188199179048777, + "learning_rate": 1.8900597042986473e-05, + "loss": 2.5386, + "step": 47830 + }, + { + "epoch": 2.2268780408315294, + "grad_norm": 0.3278888121255281, + "learning_rate": 1.889847607550893e-05, + "loss": 2.5645, + "step": 47831 + }, + { + "epoch": 2.226924599017622, + "grad_norm": 0.36945440294968723, + "learning_rate": 1.889635519931126e-05, + "loss": 2.6165, + "step": 47832 + }, + { + "epoch": 2.226971157203715, + "grad_norm": 0.3327725909650876, + "learning_rate": 1.8894234414399766e-05, + "loss": 2.5878, + "step": 47833 + }, + { + "epoch": 2.2270177153898083, + "grad_norm": 0.35541236008143645, + 
"learning_rate": 1.8892113720780603e-05, + "loss": 2.6252, + "step": 47834 + }, + { + "epoch": 2.2270642735759014, + "grad_norm": 0.34919457532263903, + "learning_rate": 1.8889993118460026e-05, + "loss": 2.7376, + "step": 47835 + }, + { + "epoch": 2.2271108317619945, + "grad_norm": 0.3451638542853529, + "learning_rate": 1.8887872607444253e-05, + "loss": 2.6798, + "step": 47836 + }, + { + "epoch": 2.2271573899480877, + "grad_norm": 0.33078371898754017, + "learning_rate": 1.8885752187739503e-05, + "loss": 2.6072, + "step": 47837 + }, + { + "epoch": 2.2272039481341808, + "grad_norm": 0.34038140671888406, + "learning_rate": 1.8883631859352013e-05, + "loss": 2.7029, + "step": 47838 + }, + { + "epoch": 2.227250506320274, + "grad_norm": 0.3286589555848611, + "learning_rate": 1.888151162228801e-05, + "loss": 2.722, + "step": 47839 + }, + { + "epoch": 2.227297064506367, + "grad_norm": 0.3401259088230659, + "learning_rate": 1.8879391476553675e-05, + "loss": 2.5865, + "step": 47840 + }, + { + "epoch": 2.2273436226924597, + "grad_norm": 0.3416352019573657, + "learning_rate": 1.8877271422155295e-05, + "loss": 2.6214, + "step": 47841 + }, + { + "epoch": 2.227390180878553, + "grad_norm": 0.32515068383723594, + "learning_rate": 1.8875151459099038e-05, + "loss": 2.6927, + "step": 47842 + }, + { + "epoch": 2.227436739064646, + "grad_norm": 0.32812108093090064, + "learning_rate": 1.8873031587391144e-05, + "loss": 2.6256, + "step": 47843 + }, + { + "epoch": 2.227483297250739, + "grad_norm": 0.334553368847273, + "learning_rate": 1.8870911807037857e-05, + "loss": 2.6444, + "step": 47844 + }, + { + "epoch": 2.227529855436832, + "grad_norm": 0.3170018661347053, + "learning_rate": 1.8868792118045337e-05, + "loss": 2.5831, + "step": 47845 + }, + { + "epoch": 2.2275764136229252, + "grad_norm": 0.3182334556973675, + "learning_rate": 1.8866672520419882e-05, + "loss": 2.5484, + "step": 47846 + }, + { + "epoch": 2.2276229718090184, + "grad_norm": 0.33776184839922424, + "learning_rate": 1.886455301416764e-05, + "loss": 2.6135, + "step": 47847 + }, + { + "epoch": 2.2276695299951115, + "grad_norm": 0.354058804695787, + "learning_rate": 1.8862433599294903e-05, + "loss": 2.6728, + "step": 47848 + }, + { + "epoch": 2.2277160881812046, + "grad_norm": 0.3035707033759425, + "learning_rate": 1.8860314275807844e-05, + "loss": 2.6829, + "step": 47849 + }, + { + "epoch": 2.2277626463672977, + "grad_norm": 0.3187555625387573, + "learning_rate": 1.885819504371269e-05, + "loss": 2.6696, + "step": 47850 + }, + { + "epoch": 2.227809204553391, + "grad_norm": 0.3375894905521666, + "learning_rate": 1.8856075903015664e-05, + "loss": 2.6571, + "step": 47851 + }, + { + "epoch": 2.2278557627394835, + "grad_norm": 0.32486648485114716, + "learning_rate": 1.8853956853723002e-05, + "loss": 2.621, + "step": 47852 + }, + { + "epoch": 2.2279023209255766, + "grad_norm": 0.325751739230185, + "learning_rate": 1.885183789584088e-05, + "loss": 2.5731, + "step": 47853 + }, + { + "epoch": 2.2279488791116697, + "grad_norm": 0.3208291245287829, + "learning_rate": 1.884971902937558e-05, + "loss": 2.6398, + "step": 47854 + }, + { + "epoch": 2.227995437297763, + "grad_norm": 0.3153065304771477, + "learning_rate": 1.8847600254333263e-05, + "loss": 2.6334, + "step": 47855 + }, + { + "epoch": 2.228041995483856, + "grad_norm": 0.33434704650754016, + "learning_rate": 1.8845481570720174e-05, + "loss": 2.7104, + "step": 47856 + }, + { + "epoch": 2.228088553669949, + "grad_norm": 0.34625177107024, + "learning_rate": 1.8843362978542543e-05, + "loss": 2.6293, + "step": 
47857 + }, + { + "epoch": 2.228135111856042, + "grad_norm": 0.32636749340452487, + "learning_rate": 1.884124447780654e-05, + "loss": 2.6428, + "step": 47858 + }, + { + "epoch": 2.2281816700421353, + "grad_norm": 0.3174454497874862, + "learning_rate": 1.883912606851845e-05, + "loss": 2.7314, + "step": 47859 + }, + { + "epoch": 2.2282282282282284, + "grad_norm": 0.3325154541730981, + "learning_rate": 1.8837007750684422e-05, + "loss": 2.5788, + "step": 47860 + }, + { + "epoch": 2.228274786414321, + "grad_norm": 0.346668132088005, + "learning_rate": 1.8834889524310733e-05, + "loss": 2.7123, + "step": 47861 + }, + { + "epoch": 2.228321344600414, + "grad_norm": 0.32850570605585794, + "learning_rate": 1.8832771389403558e-05, + "loss": 2.5241, + "step": 47862 + }, + { + "epoch": 2.2283679027865073, + "grad_norm": 0.32371917099076, + "learning_rate": 1.8830653345969134e-05, + "loss": 2.6601, + "step": 47863 + }, + { + "epoch": 2.2284144609726004, + "grad_norm": 0.33202890737635576, + "learning_rate": 1.882853539401367e-05, + "loss": 2.6757, + "step": 47864 + }, + { + "epoch": 2.2284610191586935, + "grad_norm": 0.3463096118106095, + "learning_rate": 1.8826417533543377e-05, + "loss": 2.676, + "step": 47865 + }, + { + "epoch": 2.2285075773447867, + "grad_norm": 0.3381701119920177, + "learning_rate": 1.8824299764564484e-05, + "loss": 2.6884, + "step": 47866 + }, + { + "epoch": 2.2285541355308798, + "grad_norm": 0.3204850686656487, + "learning_rate": 1.882218208708322e-05, + "loss": 2.7087, + "step": 47867 + }, + { + "epoch": 2.228600693716973, + "grad_norm": 0.32493017003078145, + "learning_rate": 1.8820064501105757e-05, + "loss": 2.6537, + "step": 47868 + }, + { + "epoch": 2.228647251903066, + "grad_norm": 0.3438693666909577, + "learning_rate": 1.8817947006638333e-05, + "loss": 2.6433, + "step": 47869 + }, + { + "epoch": 2.228693810089159, + "grad_norm": 0.3465315757796763, + "learning_rate": 1.881582960368718e-05, + "loss": 2.7386, + "step": 47870 + }, + { + "epoch": 2.228740368275252, + "grad_norm": 0.3479032623364494, + "learning_rate": 1.881371229225846e-05, + "loss": 2.7187, + "step": 47871 + }, + { + "epoch": 2.228786926461345, + "grad_norm": 0.3324718776740631, + "learning_rate": 1.881159507235846e-05, + "loss": 2.5555, + "step": 47872 + }, + { + "epoch": 2.228833484647438, + "grad_norm": 0.3267398560830077, + "learning_rate": 1.880947794399332e-05, + "loss": 2.6156, + "step": 47873 + }, + { + "epoch": 2.228880042833531, + "grad_norm": 0.33652314888561013, + "learning_rate": 1.8807360907169325e-05, + "loss": 2.5827, + "step": 47874 + }, + { + "epoch": 2.2289266010196243, + "grad_norm": 0.35108633541962864, + "learning_rate": 1.8805243961892633e-05, + "loss": 2.6152, + "step": 47875 + }, + { + "epoch": 2.2289731592057174, + "grad_norm": 0.3335501562142863, + "learning_rate": 1.880312710816948e-05, + "loss": 2.7436, + "step": 47876 + }, + { + "epoch": 2.2290197173918105, + "grad_norm": 0.3458335925859015, + "learning_rate": 1.880101034600607e-05, + "loss": 2.6824, + "step": 47877 + }, + { + "epoch": 2.2290662755779036, + "grad_norm": 0.31694652905565596, + "learning_rate": 1.8798893675408624e-05, + "loss": 2.7277, + "step": 47878 + }, + { + "epoch": 2.2291128337639967, + "grad_norm": 0.323896396606731, + "learning_rate": 1.879677709638335e-05, + "loss": 2.595, + "step": 47879 + }, + { + "epoch": 2.22915939195009, + "grad_norm": 0.3462275874721312, + "learning_rate": 1.879466060893648e-05, + "loss": 2.6645, + "step": 47880 + }, + { + "epoch": 2.2292059501361825, + "grad_norm": 0.32216338197816397, 
+ "learning_rate": 1.879254421307419e-05, + "loss": 2.7229, + "step": 47881 + }, + { + "epoch": 2.2292525083222756, + "grad_norm": 0.3226523519169405, + "learning_rate": 1.879042790880271e-05, + "loss": 2.6859, + "step": 47882 + }, + { + "epoch": 2.2292990665083687, + "grad_norm": 0.30767743201254766, + "learning_rate": 1.8788311696128245e-05, + "loss": 2.5398, + "step": 47883 + }, + { + "epoch": 2.229345624694462, + "grad_norm": 0.3227111035112487, + "learning_rate": 1.8786195575057013e-05, + "loss": 2.6263, + "step": 47884 + }, + { + "epoch": 2.229392182880555, + "grad_norm": 0.3017659755548161, + "learning_rate": 1.878407954559524e-05, + "loss": 2.5978, + "step": 47885 + }, + { + "epoch": 2.229438741066648, + "grad_norm": 0.32996916043904134, + "learning_rate": 1.878196360774908e-05, + "loss": 2.6721, + "step": 47886 + }, + { + "epoch": 2.229485299252741, + "grad_norm": 0.3198448320024813, + "learning_rate": 1.877984776152482e-05, + "loss": 2.6056, + "step": 47887 + }, + { + "epoch": 2.2295318574388343, + "grad_norm": 0.3274325902323284, + "learning_rate": 1.8777732006928617e-05, + "loss": 2.6562, + "step": 47888 + }, + { + "epoch": 2.2295784156249274, + "grad_norm": 0.35835711566323464, + "learning_rate": 1.877561634396669e-05, + "loss": 2.6711, + "step": 47889 + }, + { + "epoch": 2.2296249738110205, + "grad_norm": 0.3362581974672775, + "learning_rate": 1.8773500772645253e-05, + "loss": 2.656, + "step": 47890 + }, + { + "epoch": 2.229671531997113, + "grad_norm": 0.3240452558544973, + "learning_rate": 1.8771385292970513e-05, + "loss": 2.571, + "step": 47891 + }, + { + "epoch": 2.2297180901832063, + "grad_norm": 0.3245252155458322, + "learning_rate": 1.8769269904948685e-05, + "loss": 2.6126, + "step": 47892 + }, + { + "epoch": 2.2297646483692994, + "grad_norm": 0.3123505924999629, + "learning_rate": 1.8767154608585986e-05, + "loss": 2.5628, + "step": 47893 + }, + { + "epoch": 2.2298112065553926, + "grad_norm": 0.32960133443148326, + "learning_rate": 1.8765039403888597e-05, + "loss": 2.5864, + "step": 47894 + }, + { + "epoch": 2.2298577647414857, + "grad_norm": 0.3410773125966433, + "learning_rate": 1.8762924290862737e-05, + "loss": 2.6383, + "step": 47895 + }, + { + "epoch": 2.229904322927579, + "grad_norm": 0.32446781814476827, + "learning_rate": 1.8760809269514618e-05, + "loss": 2.6741, + "step": 47896 + }, + { + "epoch": 2.229950881113672, + "grad_norm": 0.332666095792369, + "learning_rate": 1.8758694339850446e-05, + "loss": 2.7582, + "step": 47897 + }, + { + "epoch": 2.229997439299765, + "grad_norm": 0.33023078457639965, + "learning_rate": 1.875657950187645e-05, + "loss": 2.5248, + "step": 47898 + }, + { + "epoch": 2.230043997485858, + "grad_norm": 0.3308577879889059, + "learning_rate": 1.8754464755598776e-05, + "loss": 2.6808, + "step": 47899 + }, + { + "epoch": 2.230090555671951, + "grad_norm": 0.3436674035709616, + "learning_rate": 1.8752350101023702e-05, + "loss": 2.6077, + "step": 47900 + }, + { + "epoch": 2.230137113858044, + "grad_norm": 0.3392407807078295, + "learning_rate": 1.8750235538157373e-05, + "loss": 2.6626, + "step": 47901 + }, + { + "epoch": 2.230183672044137, + "grad_norm": 0.3384344426442504, + "learning_rate": 1.8748121067006058e-05, + "loss": 2.6567, + "step": 47902 + }, + { + "epoch": 2.23023023023023, + "grad_norm": 0.321425988380348, + "learning_rate": 1.874600668757591e-05, + "loss": 2.6306, + "step": 47903 + }, + { + "epoch": 2.2302767884163233, + "grad_norm": 0.32936013409147913, + "learning_rate": 1.874389239987315e-05, + "loss": 2.6416, + "step": 47904 + 
}, + { + "epoch": 2.2303233466024164, + "grad_norm": 0.3398914688902412, + "learning_rate": 1.8741778203903992e-05, + "loss": 2.6838, + "step": 47905 + }, + { + "epoch": 2.2303699047885095, + "grad_norm": 0.32881099546235476, + "learning_rate": 1.8739664099674654e-05, + "loss": 2.6005, + "step": 47906 + }, + { + "epoch": 2.2304164629746026, + "grad_norm": 0.333752194296748, + "learning_rate": 1.8737550087191303e-05, + "loss": 2.6018, + "step": 47907 + }, + { + "epoch": 2.2304630211606957, + "grad_norm": 0.33098672469762447, + "learning_rate": 1.8735436166460157e-05, + "loss": 2.6078, + "step": 47908 + }, + { + "epoch": 2.230509579346789, + "grad_norm": 0.32814545217563673, + "learning_rate": 1.873332233748743e-05, + "loss": 2.594, + "step": 47909 + }, + { + "epoch": 2.2305561375328815, + "grad_norm": 0.3530855252719858, + "learning_rate": 1.8731208600279327e-05, + "loss": 2.6928, + "step": 47910 + }, + { + "epoch": 2.2306026957189746, + "grad_norm": 0.33618136754944167, + "learning_rate": 1.8729094954842058e-05, + "loss": 2.6746, + "step": 47911 + }, + { + "epoch": 2.2306492539050677, + "grad_norm": 0.3320774493260462, + "learning_rate": 1.872698140118178e-05, + "loss": 2.6524, + "step": 47912 + }, + { + "epoch": 2.230695812091161, + "grad_norm": 0.3269078363702416, + "learning_rate": 1.8724867939304763e-05, + "loss": 2.5952, + "step": 47913 + }, + { + "epoch": 2.230742370277254, + "grad_norm": 0.35336849344568605, + "learning_rate": 1.872275456921715e-05, + "loss": 2.6437, + "step": 47914 + }, + { + "epoch": 2.230788928463347, + "grad_norm": 0.31768206297424517, + "learning_rate": 1.87206412909252e-05, + "loss": 2.6584, + "step": 47915 + }, + { + "epoch": 2.23083548664944, + "grad_norm": 0.3327565427777867, + "learning_rate": 1.8718528104435067e-05, + "loss": 2.5974, + "step": 47916 + }, + { + "epoch": 2.2308820448355333, + "grad_norm": 0.3529791892101421, + "learning_rate": 1.871641500975298e-05, + "loss": 2.6819, + "step": 47917 + }, + { + "epoch": 2.2309286030216264, + "grad_norm": 0.33110246700738055, + "learning_rate": 1.8714302006885124e-05, + "loss": 2.551, + "step": 47918 + }, + { + "epoch": 2.2309751612077195, + "grad_norm": 0.3416989605897568, + "learning_rate": 1.871218909583772e-05, + "loss": 2.551, + "step": 47919 + }, + { + "epoch": 2.231021719393812, + "grad_norm": 0.337134606146157, + "learning_rate": 1.8710076276616966e-05, + "loss": 2.6128, + "step": 47920 + }, + { + "epoch": 2.2310682775799053, + "grad_norm": 0.31471413184066943, + "learning_rate": 1.870796354922904e-05, + "loss": 2.5521, + "step": 47921 + }, + { + "epoch": 2.2311148357659984, + "grad_norm": 0.3636482892645752, + "learning_rate": 1.8705850913680162e-05, + "loss": 2.7546, + "step": 47922 + }, + { + "epoch": 2.2311613939520916, + "grad_norm": 0.3380605332181263, + "learning_rate": 1.870373836997653e-05, + "loss": 2.6023, + "step": 47923 + }, + { + "epoch": 2.2312079521381847, + "grad_norm": 0.3415735769883506, + "learning_rate": 1.8701625918124337e-05, + "loss": 2.6461, + "step": 47924 + }, + { + "epoch": 2.231254510324278, + "grad_norm": 0.3224060562035623, + "learning_rate": 1.8699513558129794e-05, + "loss": 2.6282, + "step": 47925 + }, + { + "epoch": 2.231301068510371, + "grad_norm": 0.3443321317090973, + "learning_rate": 1.8697401289999105e-05, + "loss": 2.5725, + "step": 47926 + }, + { + "epoch": 2.231347626696464, + "grad_norm": 0.33528048691692436, + "learning_rate": 1.8695289113738436e-05, + "loss": 2.5001, + "step": 47927 + }, + { + "epoch": 2.231394184882557, + "grad_norm": 0.34137207385313156, 
+ "learning_rate": 1.8693177029354035e-05, + "loss": 2.6721, + "step": 47928 + }, + { + "epoch": 2.2314407430686503, + "grad_norm": 0.33557201856234675, + "learning_rate": 1.869106503685206e-05, + "loss": 2.7104, + "step": 47929 + }, + { + "epoch": 2.231487301254743, + "grad_norm": 0.3408160666989938, + "learning_rate": 1.8688953136238722e-05, + "loss": 2.633, + "step": 47930 + }, + { + "epoch": 2.231533859440836, + "grad_norm": 0.32740295020068694, + "learning_rate": 1.8686841327520228e-05, + "loss": 2.5493, + "step": 47931 + }, + { + "epoch": 2.231580417626929, + "grad_norm": 0.33813891851902517, + "learning_rate": 1.8684729610702766e-05, + "loss": 2.5324, + "step": 47932 + }, + { + "epoch": 2.2316269758130223, + "grad_norm": 0.3337485739939465, + "learning_rate": 1.8682617985792556e-05, + "loss": 2.6269, + "step": 47933 + }, + { + "epoch": 2.2316735339991154, + "grad_norm": 0.3371429730136536, + "learning_rate": 1.8680506452795758e-05, + "loss": 2.7436, + "step": 47934 + }, + { + "epoch": 2.2317200921852085, + "grad_norm": 0.35207521203455044, + "learning_rate": 1.8678395011718593e-05, + "loss": 2.7659, + "step": 47935 + }, + { + "epoch": 2.2317666503713016, + "grad_norm": 0.3358093839694449, + "learning_rate": 1.8676283662567247e-05, + "loss": 2.6713, + "step": 47936 + }, + { + "epoch": 2.2318132085573947, + "grad_norm": 0.3328336666751472, + "learning_rate": 1.8674172405347923e-05, + "loss": 2.6522, + "step": 47937 + }, + { + "epoch": 2.231859766743488, + "grad_norm": 0.318665999334001, + "learning_rate": 1.8672061240066817e-05, + "loss": 2.5247, + "step": 47938 + }, + { + "epoch": 2.2319063249295805, + "grad_norm": 0.3342126707168384, + "learning_rate": 1.866995016673015e-05, + "loss": 2.5692, + "step": 47939 + }, + { + "epoch": 2.2319528831156736, + "grad_norm": 0.35149164266884186, + "learning_rate": 1.866783918534405e-05, + "loss": 2.6509, + "step": 47940 + }, + { + "epoch": 2.2319994413017668, + "grad_norm": 0.3126324650784148, + "learning_rate": 1.8665728295914792e-05, + "loss": 2.591, + "step": 47941 + }, + { + "epoch": 2.23204599948786, + "grad_norm": 0.34525388104819815, + "learning_rate": 1.8663617498448517e-05, + "loss": 2.6352, + "step": 47942 + }, + { + "epoch": 2.232092557673953, + "grad_norm": 0.3367837884668336, + "learning_rate": 1.866150679295144e-05, + "loss": 2.4917, + "step": 47943 + }, + { + "epoch": 2.232139115860046, + "grad_norm": 0.32593948517896254, + "learning_rate": 1.865939617942975e-05, + "loss": 2.6055, + "step": 47944 + }, + { + "epoch": 2.232185674046139, + "grad_norm": 0.3293592852288666, + "learning_rate": 1.8657285657889644e-05, + "loss": 2.5592, + "step": 47945 + }, + { + "epoch": 2.2322322322322323, + "grad_norm": 0.34384700272438506, + "learning_rate": 1.8655175228337333e-05, + "loss": 2.7083, + "step": 47946 + }, + { + "epoch": 2.2322787904183254, + "grad_norm": 0.3523434876645635, + "learning_rate": 1.865306489077898e-05, + "loss": 2.6131, + "step": 47947 + }, + { + "epoch": 2.2323253486044186, + "grad_norm": 0.3401641045603584, + "learning_rate": 1.8650954645220797e-05, + "loss": 2.5801, + "step": 47948 + }, + { + "epoch": 2.2323719067905117, + "grad_norm": 0.3444258005252268, + "learning_rate": 1.8648844491668972e-05, + "loss": 2.5948, + "step": 47949 + }, + { + "epoch": 2.2324184649766043, + "grad_norm": 0.3477250106296655, + "learning_rate": 1.86467344301297e-05, + "loss": 2.7319, + "step": 47950 + }, + { + "epoch": 2.2324650231626975, + "grad_norm": 0.30214918998306617, + "learning_rate": 1.8644624460609168e-05, + "loss": 2.511, + "step": 
47951 + }, + { + "epoch": 2.2325115813487906, + "grad_norm": 0.336228359171453, + "learning_rate": 1.8642514583113596e-05, + "loss": 2.7118, + "step": 47952 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 0.3392704595335537, + "learning_rate": 1.864040479764912e-05, + "loss": 2.6175, + "step": 47953 + }, + { + "epoch": 2.232604697720977, + "grad_norm": 0.2994035934731397, + "learning_rate": 1.8638295104221997e-05, + "loss": 2.6353, + "step": 47954 + }, + { + "epoch": 2.23265125590707, + "grad_norm": 0.3353881700261337, + "learning_rate": 1.863618550283837e-05, + "loss": 2.6589, + "step": 47955 + }, + { + "epoch": 2.232697814093163, + "grad_norm": 0.3247024218497839, + "learning_rate": 1.8634075993504456e-05, + "loss": 2.6553, + "step": 47956 + }, + { + "epoch": 2.232744372279256, + "grad_norm": 0.3222058724634986, + "learning_rate": 1.8631966576226435e-05, + "loss": 2.6597, + "step": 47957 + }, + { + "epoch": 2.2327909304653493, + "grad_norm": 0.33891081680029117, + "learning_rate": 1.86298572510105e-05, + "loss": 2.616, + "step": 47958 + }, + { + "epoch": 2.232837488651442, + "grad_norm": 0.32342175963854214, + "learning_rate": 1.8627748017862867e-05, + "loss": 2.6377, + "step": 47959 + }, + { + "epoch": 2.232884046837535, + "grad_norm": 0.3219293552490725, + "learning_rate": 1.8625638876789663e-05, + "loss": 2.6847, + "step": 47960 + }, + { + "epoch": 2.232930605023628, + "grad_norm": 0.3209423145751909, + "learning_rate": 1.8623529827797152e-05, + "loss": 2.6086, + "step": 47961 + }, + { + "epoch": 2.2329771632097213, + "grad_norm": 0.33756966956308365, + "learning_rate": 1.8621420870891476e-05, + "loss": 2.6789, + "step": 47962 + }, + { + "epoch": 2.2330237213958144, + "grad_norm": 0.3153855148711626, + "learning_rate": 1.8619312006078836e-05, + "loss": 2.6464, + "step": 47963 + }, + { + "epoch": 2.2330702795819075, + "grad_norm": 0.3200625832380075, + "learning_rate": 1.8617203233365427e-05, + "loss": 2.6753, + "step": 47964 + }, + { + "epoch": 2.2331168377680006, + "grad_norm": 0.34535382663108805, + "learning_rate": 1.861509455275745e-05, + "loss": 2.6714, + "step": 47965 + }, + { + "epoch": 2.2331633959540937, + "grad_norm": 0.3534685938497641, + "learning_rate": 1.8612985964261042e-05, + "loss": 2.6732, + "step": 47966 + }, + { + "epoch": 2.233209954140187, + "grad_norm": 0.3202714574176506, + "learning_rate": 1.861087746788247e-05, + "loss": 2.6382, + "step": 47967 + }, + { + "epoch": 2.23325651232628, + "grad_norm": 0.35459240050699653, + "learning_rate": 1.8608769063627857e-05, + "loss": 2.696, + "step": 47968 + }, + { + "epoch": 2.2333030705123726, + "grad_norm": 0.34948017682224775, + "learning_rate": 1.8606660751503413e-05, + "loss": 2.6998, + "step": 47969 + }, + { + "epoch": 2.2333496286984658, + "grad_norm": 0.3250359236339629, + "learning_rate": 1.860455253151533e-05, + "loss": 2.6202, + "step": 47970 + }, + { + "epoch": 2.233396186884559, + "grad_norm": 0.3358721907974288, + "learning_rate": 1.8602444403669787e-05, + "loss": 2.5689, + "step": 47971 + }, + { + "epoch": 2.233442745070652, + "grad_norm": 0.3426401562905722, + "learning_rate": 1.8600336367973004e-05, + "loss": 2.6218, + "step": 47972 + }, + { + "epoch": 2.233489303256745, + "grad_norm": 0.33847903553029995, + "learning_rate": 1.8598228424431097e-05, + "loss": 2.5755, + "step": 47973 + }, + { + "epoch": 2.233535861442838, + "grad_norm": 0.3473013381596226, + "learning_rate": 1.8596120573050336e-05, + "loss": 2.5961, + "step": 47974 + }, + { + "epoch": 2.2335824196289313, + "grad_norm": 
0.3248803146959683, + "learning_rate": 1.8594012813836843e-05, + "loss": 2.6804, + "step": 47975 + }, + { + "epoch": 2.2336289778150245, + "grad_norm": 0.3396326891100604, + "learning_rate": 1.8591905146796835e-05, + "loss": 2.6903, + "step": 47976 + }, + { + "epoch": 2.2336755360011176, + "grad_norm": 0.33060522396522907, + "learning_rate": 1.858979757193649e-05, + "loss": 2.6545, + "step": 47977 + }, + { + "epoch": 2.2337220941872102, + "grad_norm": 0.3404296017963053, + "learning_rate": 1.8587690089261993e-05, + "loss": 2.6505, + "step": 47978 + }, + { + "epoch": 2.2337686523733034, + "grad_norm": 0.3464604960565779, + "learning_rate": 1.858558269877953e-05, + "loss": 2.6267, + "step": 47979 + }, + { + "epoch": 2.2338152105593965, + "grad_norm": 0.34138930080848684, + "learning_rate": 1.8583475400495305e-05, + "loss": 2.6037, + "step": 47980 + }, + { + "epoch": 2.2338617687454896, + "grad_norm": 0.34464770214699925, + "learning_rate": 1.858136819441547e-05, + "loss": 2.7031, + "step": 47981 + }, + { + "epoch": 2.2339083269315827, + "grad_norm": 0.32221600822200125, + "learning_rate": 1.8579261080546223e-05, + "loss": 2.5683, + "step": 47982 + }, + { + "epoch": 2.233954885117676, + "grad_norm": 0.33138672615483417, + "learning_rate": 1.8577154058893747e-05, + "loss": 2.6207, + "step": 47983 + }, + { + "epoch": 2.234001443303769, + "grad_norm": 0.3547270517705144, + "learning_rate": 1.857504712946423e-05, + "loss": 2.5662, + "step": 47984 + }, + { + "epoch": 2.234048001489862, + "grad_norm": 0.34304918978816756, + "learning_rate": 1.857294029226387e-05, + "loss": 2.6588, + "step": 47985 + }, + { + "epoch": 2.234094559675955, + "grad_norm": 0.3373960353433683, + "learning_rate": 1.85708335472988e-05, + "loss": 2.6698, + "step": 47986 + }, + { + "epoch": 2.2341411178620483, + "grad_norm": 0.3430605570479044, + "learning_rate": 1.8568726894575272e-05, + "loss": 2.6658, + "step": 47987 + }, + { + "epoch": 2.2341876760481414, + "grad_norm": 0.3416877335070832, + "learning_rate": 1.8566620334099418e-05, + "loss": 2.6136, + "step": 47988 + }, + { + "epoch": 2.234234234234234, + "grad_norm": 0.34487289841498686, + "learning_rate": 1.8564513865877435e-05, + "loss": 2.687, + "step": 47989 + }, + { + "epoch": 2.234280792420327, + "grad_norm": 0.34875057239714646, + "learning_rate": 1.8562407489915508e-05, + "loss": 2.6519, + "step": 47990 + }, + { + "epoch": 2.2343273506064203, + "grad_norm": 0.34242452389547107, + "learning_rate": 1.8560301206219816e-05, + "loss": 2.7948, + "step": 47991 + }, + { + "epoch": 2.2343739087925134, + "grad_norm": 0.3688185115806247, + "learning_rate": 1.855819501479655e-05, + "loss": 2.6448, + "step": 47992 + }, + { + "epoch": 2.2344204669786065, + "grad_norm": 0.3302246484319477, + "learning_rate": 1.8556088915651893e-05, + "loss": 2.6521, + "step": 47993 + }, + { + "epoch": 2.2344670251646996, + "grad_norm": 0.3385182337553089, + "learning_rate": 1.8553982908792e-05, + "loss": 2.6616, + "step": 47994 + }, + { + "epoch": 2.2345135833507928, + "grad_norm": 0.33474991233210954, + "learning_rate": 1.855187699422307e-05, + "loss": 2.5867, + "step": 47995 + }, + { + "epoch": 2.234560141536886, + "grad_norm": 0.3414936273009811, + "learning_rate": 1.8549771171951287e-05, + "loss": 2.6976, + "step": 47996 + }, + { + "epoch": 2.234606699722979, + "grad_norm": 0.32171558827414803, + "learning_rate": 1.8547665441982825e-05, + "loss": 2.5802, + "step": 47997 + }, + { + "epoch": 2.2346532579090717, + "grad_norm": 0.3180269955965349, + "learning_rate": 1.854555980432388e-05, + 
"loss": 2.6421, + "step": 47998 + }, + { + "epoch": 2.2346998160951648, + "grad_norm": 0.30192320324975375, + "learning_rate": 1.8543454258980592e-05, + "loss": 2.6403, + "step": 47999 + }, + { + "epoch": 2.234746374281258, + "grad_norm": 0.3210162719214889, + "learning_rate": 1.854134880595919e-05, + "loss": 2.6211, + "step": 48000 + }, + { + "epoch": 2.234792932467351, + "grad_norm": 0.3377558680464458, + "learning_rate": 1.8539243445265825e-05, + "loss": 2.7302, + "step": 48001 + }, + { + "epoch": 2.234839490653444, + "grad_norm": 0.329296829542787, + "learning_rate": 1.853713817690667e-05, + "loss": 2.6914, + "step": 48002 + }, + { + "epoch": 2.2348860488395372, + "grad_norm": 0.29569056214166245, + "learning_rate": 1.8535033000887926e-05, + "loss": 2.5061, + "step": 48003 + }, + { + "epoch": 2.2349326070256303, + "grad_norm": 0.33660123209918263, + "learning_rate": 1.8532927917215753e-05, + "loss": 2.5602, + "step": 48004 + }, + { + "epoch": 2.2349791652117235, + "grad_norm": 0.34901632187235554, + "learning_rate": 1.8530822925896335e-05, + "loss": 2.6311, + "step": 48005 + }, + { + "epoch": 2.2350257233978166, + "grad_norm": 0.32412637085862084, + "learning_rate": 1.852871802693587e-05, + "loss": 2.6485, + "step": 48006 + }, + { + "epoch": 2.2350722815839097, + "grad_norm": 0.3260235009258859, + "learning_rate": 1.8526613220340505e-05, + "loss": 2.6183, + "step": 48007 + }, + { + "epoch": 2.2351188397700024, + "grad_norm": 0.36058663993997, + "learning_rate": 1.8524508506116427e-05, + "loss": 2.5953, + "step": 48008 + }, + { + "epoch": 2.2351653979560955, + "grad_norm": 0.34585507984890385, + "learning_rate": 1.8522403884269812e-05, + "loss": 2.5957, + "step": 48009 + }, + { + "epoch": 2.2352119561421886, + "grad_norm": 0.35954691922985305, + "learning_rate": 1.852029935480685e-05, + "loss": 2.633, + "step": 48010 + }, + { + "epoch": 2.2352585143282817, + "grad_norm": 0.40442342307634377, + "learning_rate": 1.8518194917733712e-05, + "loss": 2.6631, + "step": 48011 + }, + { + "epoch": 2.235305072514375, + "grad_norm": 0.3419146716406115, + "learning_rate": 1.8516090573056542e-05, + "loss": 2.6758, + "step": 48012 + }, + { + "epoch": 2.235351630700468, + "grad_norm": 0.3506289911587056, + "learning_rate": 1.8513986320781583e-05, + "loss": 2.7955, + "step": 48013 + }, + { + "epoch": 2.235398188886561, + "grad_norm": 0.355014052947507, + "learning_rate": 1.8511882160914934e-05, + "loss": 2.6453, + "step": 48014 + }, + { + "epoch": 2.235444747072654, + "grad_norm": 0.3581630987355637, + "learning_rate": 1.8509778093462842e-05, + "loss": 2.5766, + "step": 48015 + }, + { + "epoch": 2.2354913052587473, + "grad_norm": 0.3277219303601558, + "learning_rate": 1.8507674118431433e-05, + "loss": 2.5826, + "step": 48016 + }, + { + "epoch": 2.23553786344484, + "grad_norm": 0.36671872432669045, + "learning_rate": 1.85055702358269e-05, + "loss": 2.6798, + "step": 48017 + }, + { + "epoch": 2.235584421630933, + "grad_norm": 0.3593344596317215, + "learning_rate": 1.8503466445655415e-05, + "loss": 2.6929, + "step": 48018 + }, + { + "epoch": 2.235630979817026, + "grad_norm": 0.33448547410905943, + "learning_rate": 1.850136274792317e-05, + "loss": 2.5361, + "step": 48019 + }, + { + "epoch": 2.2356775380031193, + "grad_norm": 0.3057730115132649, + "learning_rate": 1.8499259142636305e-05, + "loss": 2.6049, + "step": 48020 + }, + { + "epoch": 2.2357240961892124, + "grad_norm": 0.3624493683850491, + "learning_rate": 1.8497155629801004e-05, + "loss": 2.6394, + "step": 48021 + }, + { + "epoch": 2.2357706543753055, 
+ "grad_norm": 0.3701031527966498, + "learning_rate": 1.849505220942346e-05, + "loss": 2.7714, + "step": 48022 + }, + { + "epoch": 2.2358172125613986, + "grad_norm": 0.3427800842301718, + "learning_rate": 1.8492948881509826e-05, + "loss": 2.6339, + "step": 48023 + }, + { + "epoch": 2.2358637707474918, + "grad_norm": 0.33023155163709766, + "learning_rate": 1.8490845646066302e-05, + "loss": 2.6101, + "step": 48024 + }, + { + "epoch": 2.235910328933585, + "grad_norm": 0.3506052181857618, + "learning_rate": 1.848874250309901e-05, + "loss": 2.6033, + "step": 48025 + }, + { + "epoch": 2.235956887119678, + "grad_norm": 0.3558040744644376, + "learning_rate": 1.8486639452614187e-05, + "loss": 2.6059, + "step": 48026 + }, + { + "epoch": 2.236003445305771, + "grad_norm": 0.32241889964908554, + "learning_rate": 1.8484536494617938e-05, + "loss": 2.648, + "step": 48027 + }, + { + "epoch": 2.236050003491864, + "grad_norm": 0.31847104629745066, + "learning_rate": 1.84824336291165e-05, + "loss": 2.7306, + "step": 48028 + }, + { + "epoch": 2.236096561677957, + "grad_norm": 0.3443140752034899, + "learning_rate": 1.8480330856116002e-05, + "loss": 2.7031, + "step": 48029 + }, + { + "epoch": 2.23614311986405, + "grad_norm": 0.34367523935266786, + "learning_rate": 1.8478228175622624e-05, + "loss": 2.6274, + "step": 48030 + }, + { + "epoch": 2.236189678050143, + "grad_norm": 0.325094068418664, + "learning_rate": 1.8476125587642547e-05, + "loss": 2.6619, + "step": 48031 + }, + { + "epoch": 2.2362362362362362, + "grad_norm": 0.3418941440937159, + "learning_rate": 1.847402309218193e-05, + "loss": 2.6734, + "step": 48032 + }, + { + "epoch": 2.2362827944223294, + "grad_norm": 0.3565190405310687, + "learning_rate": 1.847192068924697e-05, + "loss": 2.5798, + "step": 48033 + }, + { + "epoch": 2.2363293526084225, + "grad_norm": 0.3334299770598281, + "learning_rate": 1.84698183788438e-05, + "loss": 2.6909, + "step": 48034 + }, + { + "epoch": 2.2363759107945156, + "grad_norm": 0.34488192138873586, + "learning_rate": 1.8467716160978605e-05, + "loss": 2.6882, + "step": 48035 + }, + { + "epoch": 2.2364224689806087, + "grad_norm": 0.3417964586640142, + "learning_rate": 1.846561403565756e-05, + "loss": 2.6334, + "step": 48036 + }, + { + "epoch": 2.2364690271667014, + "grad_norm": 0.35236781145997453, + "learning_rate": 1.846351200288684e-05, + "loss": 2.5114, + "step": 48037 + }, + { + "epoch": 2.2365155853527945, + "grad_norm": 0.3184478184159169, + "learning_rate": 1.846141006267258e-05, + "loss": 2.5635, + "step": 48038 + }, + { + "epoch": 2.2365621435388876, + "grad_norm": 0.3063644307682048, + "learning_rate": 1.8459308215021004e-05, + "loss": 2.6631, + "step": 48039 + }, + { + "epoch": 2.2366087017249807, + "grad_norm": 0.31356546603595875, + "learning_rate": 1.8457206459938216e-05, + "loss": 2.5523, + "step": 48040 + }, + { + "epoch": 2.236655259911074, + "grad_norm": 0.3208687946972919, + "learning_rate": 1.8455104797430457e-05, + "loss": 2.6875, + "step": 48041 + }, + { + "epoch": 2.236701818097167, + "grad_norm": 0.31542928872192405, + "learning_rate": 1.845300322750384e-05, + "loss": 2.5217, + "step": 48042 + }, + { + "epoch": 2.23674837628326, + "grad_norm": 0.31323786327548314, + "learning_rate": 1.8450901750164544e-05, + "loss": 2.5999, + "step": 48043 + }, + { + "epoch": 2.236794934469353, + "grad_norm": 0.3281945493447721, + "learning_rate": 1.844880036541875e-05, + "loss": 2.5923, + "step": 48044 + }, + { + "epoch": 2.2368414926554463, + "grad_norm": 0.3428325297855788, + "learning_rate": 1.844669907327262e-05, 
+ "loss": 2.6294, + "step": 48045 + }, + { + "epoch": 2.2368880508415394, + "grad_norm": 0.3078545908855454, + "learning_rate": 1.8444597873732332e-05, + "loss": 2.6418, + "step": 48046 + }, + { + "epoch": 2.236934609027632, + "grad_norm": 0.34538679004383865, + "learning_rate": 1.844249676680403e-05, + "loss": 2.6229, + "step": 48047 + }, + { + "epoch": 2.236981167213725, + "grad_norm": 0.3131346905228978, + "learning_rate": 1.844039575249389e-05, + "loss": 2.6286, + "step": 48048 + }, + { + "epoch": 2.2370277253998183, + "grad_norm": 0.33520848608544845, + "learning_rate": 1.843829483080807e-05, + "loss": 2.6468, + "step": 48049 + }, + { + "epoch": 2.2370742835859114, + "grad_norm": 0.3301522158935233, + "learning_rate": 1.8436194001752753e-05, + "loss": 2.643, + "step": 48050 + }, + { + "epoch": 2.2371208417720045, + "grad_norm": 0.3074833471511932, + "learning_rate": 1.8434093265334095e-05, + "loss": 2.5816, + "step": 48051 + }, + { + "epoch": 2.2371673999580977, + "grad_norm": 0.3471972875016605, + "learning_rate": 1.8431992621558286e-05, + "loss": 2.7589, + "step": 48052 + }, + { + "epoch": 2.2372139581441908, + "grad_norm": 0.33263231132414806, + "learning_rate": 1.8429892070431426e-05, + "loss": 2.6652, + "step": 48053 + }, + { + "epoch": 2.237260516330284, + "grad_norm": 0.32109171128328085, + "learning_rate": 1.842779161195976e-05, + "loss": 2.6049, + "step": 48054 + }, + { + "epoch": 2.237307074516377, + "grad_norm": 0.3218270380825681, + "learning_rate": 1.84256912461494e-05, + "loss": 2.6365, + "step": 48055 + }, + { + "epoch": 2.23735363270247, + "grad_norm": 0.3309594476316371, + "learning_rate": 1.8423590973006527e-05, + "loss": 2.6669, + "step": 48056 + }, + { + "epoch": 2.237400190888563, + "grad_norm": 0.3019493663605647, + "learning_rate": 1.84214907925373e-05, + "loss": 2.5284, + "step": 48057 + }, + { + "epoch": 2.237446749074656, + "grad_norm": 0.3414537265281907, + "learning_rate": 1.8419390704747886e-05, + "loss": 2.5817, + "step": 48058 + }, + { + "epoch": 2.237493307260749, + "grad_norm": 0.29866767952624995, + "learning_rate": 1.8417290709644467e-05, + "loss": 2.5595, + "step": 48059 + }, + { + "epoch": 2.237539865446842, + "grad_norm": 0.3409512718650928, + "learning_rate": 1.8415190807233175e-05, + "loss": 2.7244, + "step": 48060 + }, + { + "epoch": 2.2375864236329353, + "grad_norm": 0.3189459701537567, + "learning_rate": 1.8413090997520183e-05, + "loss": 2.6961, + "step": 48061 + }, + { + "epoch": 2.2376329818190284, + "grad_norm": 0.3287840734052389, + "learning_rate": 1.841099128051166e-05, + "loss": 2.7244, + "step": 48062 + }, + { + "epoch": 2.2376795400051215, + "grad_norm": 0.3101209204699643, + "learning_rate": 1.8408891656213763e-05, + "loss": 2.5743, + "step": 48063 + }, + { + "epoch": 2.2377260981912146, + "grad_norm": 0.3400659821514857, + "learning_rate": 1.8406792124632654e-05, + "loss": 2.609, + "step": 48064 + }, + { + "epoch": 2.2377726563773077, + "grad_norm": 0.3224568400977676, + "learning_rate": 1.8404692685774515e-05, + "loss": 2.5773, + "step": 48065 + }, + { + "epoch": 2.237819214563401, + "grad_norm": 0.3509007376684748, + "learning_rate": 1.8402593339645463e-05, + "loss": 2.651, + "step": 48066 + }, + { + "epoch": 2.2378657727494935, + "grad_norm": 0.3185932813275813, + "learning_rate": 1.8400494086251713e-05, + "loss": 2.5849, + "step": 48067 + }, + { + "epoch": 2.2379123309355866, + "grad_norm": 0.3187646878275999, + "learning_rate": 1.839839492559937e-05, + "loss": 2.5464, + "step": 48068 + }, + { + "epoch": 2.2379588891216797, + 
"grad_norm": 0.372665041559385, + "learning_rate": 1.8396295857694658e-05, + "loss": 2.6216, + "step": 48069 + }, + { + "epoch": 2.238005447307773, + "grad_norm": 0.3260317337415718, + "learning_rate": 1.8394196882543685e-05, + "loss": 2.6006, + "step": 48070 + }, + { + "epoch": 2.238052005493866, + "grad_norm": 0.3406804890209618, + "learning_rate": 1.8392098000152636e-05, + "loss": 2.5286, + "step": 48071 + }, + { + "epoch": 2.238098563679959, + "grad_norm": 0.3440534491781651, + "learning_rate": 1.838999921052768e-05, + "loss": 2.6303, + "step": 48072 + }, + { + "epoch": 2.238145121866052, + "grad_norm": 0.349963227114989, + "learning_rate": 1.8387900513674942e-05, + "loss": 2.6359, + "step": 48073 + }, + { + "epoch": 2.2381916800521453, + "grad_norm": 0.3313729862910888, + "learning_rate": 1.8385801909600603e-05, + "loss": 2.7258, + "step": 48074 + }, + { + "epoch": 2.2382382382382384, + "grad_norm": 0.34464206751851656, + "learning_rate": 1.8383703398310826e-05, + "loss": 2.6682, + "step": 48075 + }, + { + "epoch": 2.238284796424331, + "grad_norm": 0.33513708320508395, + "learning_rate": 1.8381604979811756e-05, + "loss": 2.5845, + "step": 48076 + }, + { + "epoch": 2.238331354610424, + "grad_norm": 0.34154733755682637, + "learning_rate": 1.8379506654109568e-05, + "loss": 2.6322, + "step": 48077 + }, + { + "epoch": 2.2383779127965173, + "grad_norm": 0.31621763690999644, + "learning_rate": 1.8377408421210423e-05, + "loss": 2.6468, + "step": 48078 + }, + { + "epoch": 2.2384244709826104, + "grad_norm": 0.3468847912151523, + "learning_rate": 1.8375310281120437e-05, + "loss": 2.6926, + "step": 48079 + }, + { + "epoch": 2.2384710291687036, + "grad_norm": 0.32030887417363363, + "learning_rate": 1.837321223384584e-05, + "loss": 2.5926, + "step": 48080 + }, + { + "epoch": 2.2385175873547967, + "grad_norm": 0.3179774606415398, + "learning_rate": 1.837111427939271e-05, + "loss": 2.5679, + "step": 48081 + }, + { + "epoch": 2.23856414554089, + "grad_norm": 0.3280887892865355, + "learning_rate": 1.8369016417767282e-05, + "loss": 2.6161, + "step": 48082 + }, + { + "epoch": 2.238610703726983, + "grad_norm": 0.3279519787051231, + "learning_rate": 1.836691864897565e-05, + "loss": 2.6802, + "step": 48083 + }, + { + "epoch": 2.238657261913076, + "grad_norm": 0.31573978096850275, + "learning_rate": 1.8364820973024e-05, + "loss": 2.6426, + "step": 48084 + }, + { + "epoch": 2.238703820099169, + "grad_norm": 0.3370033084828569, + "learning_rate": 1.83627233899185e-05, + "loss": 2.7265, + "step": 48085 + }, + { + "epoch": 2.238750378285262, + "grad_norm": 0.3394133481555485, + "learning_rate": 1.8360625899665257e-05, + "loss": 2.6392, + "step": 48086 + }, + { + "epoch": 2.238796936471355, + "grad_norm": 0.3297735827319133, + "learning_rate": 1.8358528502270494e-05, + "loss": 2.6767, + "step": 48087 + }, + { + "epoch": 2.238843494657448, + "grad_norm": 0.34197343141135933, + "learning_rate": 1.8356431197740313e-05, + "loss": 2.6439, + "step": 48088 + }, + { + "epoch": 2.238890052843541, + "grad_norm": 0.33513603987220547, + "learning_rate": 1.8354333986080895e-05, + "loss": 2.6575, + "step": 48089 + }, + { + "epoch": 2.2389366110296343, + "grad_norm": 0.31759627917223693, + "learning_rate": 1.8352236867298382e-05, + "loss": 2.5948, + "step": 48090 + }, + { + "epoch": 2.2389831692157274, + "grad_norm": 0.33438409256507884, + "learning_rate": 1.835013984139896e-05, + "loss": 2.6397, + "step": 48091 + }, + { + "epoch": 2.2390297274018205, + "grad_norm": 0.32855810063759067, + "learning_rate": 1.834804290838872e-05, 
+ "loss": 2.6514, + "step": 48092 + }, + { + "epoch": 2.2390762855879136, + "grad_norm": 0.3171632347287238, + "learning_rate": 1.834594606827389e-05, + "loss": 2.5265, + "step": 48093 + }, + { + "epoch": 2.2391228437740067, + "grad_norm": 0.3316211346805319, + "learning_rate": 1.8343849321060557e-05, + "loss": 2.6447, + "step": 48094 + }, + { + "epoch": 2.2391694019601, + "grad_norm": 0.318149676637732, + "learning_rate": 1.8341752666754947e-05, + "loss": 2.586, + "step": 48095 + }, + { + "epoch": 2.2392159601461925, + "grad_norm": 0.33016221707672166, + "learning_rate": 1.833965610536315e-05, + "loss": 2.6061, + "step": 48096 + }, + { + "epoch": 2.2392625183322856, + "grad_norm": 0.328120724175743, + "learning_rate": 1.8337559636891344e-05, + "loss": 2.5664, + "step": 48097 + }, + { + "epoch": 2.2393090765183787, + "grad_norm": 0.3281626036857927, + "learning_rate": 1.8335463261345703e-05, + "loss": 2.72, + "step": 48098 + }, + { + "epoch": 2.239355634704472, + "grad_norm": 0.3196670830464186, + "learning_rate": 1.833336697873232e-05, + "loss": 2.6391, + "step": 48099 + }, + { + "epoch": 2.239402192890565, + "grad_norm": 0.33608737138359607, + "learning_rate": 1.833127078905742e-05, + "loss": 2.6493, + "step": 48100 + }, + { + "epoch": 2.239448751076658, + "grad_norm": 0.3220352896752732, + "learning_rate": 1.83291746923271e-05, + "loss": 2.5972, + "step": 48101 + }, + { + "epoch": 2.239495309262751, + "grad_norm": 0.3420265940535223, + "learning_rate": 1.832707868854754e-05, + "loss": 2.6852, + "step": 48102 + }, + { + "epoch": 2.2395418674488443, + "grad_norm": 0.3123056190976986, + "learning_rate": 1.8324982777724875e-05, + "loss": 2.6106, + "step": 48103 + }, + { + "epoch": 2.2395884256349374, + "grad_norm": 0.33747867935239423, + "learning_rate": 1.832288695986527e-05, + "loss": 2.6605, + "step": 48104 + }, + { + "epoch": 2.2396349838210305, + "grad_norm": 0.3314436629506711, + "learning_rate": 1.8320791234974872e-05, + "loss": 2.6421, + "step": 48105 + }, + { + "epoch": 2.239681542007123, + "grad_norm": 0.3361469562866871, + "learning_rate": 1.8318695603059843e-05, + "loss": 2.6888, + "step": 48106 + }, + { + "epoch": 2.2397281001932163, + "grad_norm": 0.32876461759268377, + "learning_rate": 1.831660006412629e-05, + "loss": 2.6957, + "step": 48107 + }, + { + "epoch": 2.2397746583793094, + "grad_norm": 0.3384282574782113, + "learning_rate": 1.8314504618180427e-05, + "loss": 2.5872, + "step": 48108 + }, + { + "epoch": 2.2398212165654026, + "grad_norm": 0.3335934795160237, + "learning_rate": 1.831240926522836e-05, + "loss": 2.6217, + "step": 48109 + }, + { + "epoch": 2.2398677747514957, + "grad_norm": 0.33843041582071953, + "learning_rate": 1.831031400527624e-05, + "loss": 2.6501, + "step": 48110 + }, + { + "epoch": 2.239914332937589, + "grad_norm": 0.31932767867991063, + "learning_rate": 1.8308218838330254e-05, + "loss": 2.6052, + "step": 48111 + }, + { + "epoch": 2.239960891123682, + "grad_norm": 0.3137108247199405, + "learning_rate": 1.830612376439648e-05, + "loss": 2.5807, + "step": 48112 + }, + { + "epoch": 2.240007449309775, + "grad_norm": 0.32452548769651685, + "learning_rate": 1.8304028783481152e-05, + "loss": 2.6951, + "step": 48113 + }, + { + "epoch": 2.240054007495868, + "grad_norm": 0.3267308290772013, + "learning_rate": 1.8301933895590363e-05, + "loss": 2.6991, + "step": 48114 + }, + { + "epoch": 2.240100565681961, + "grad_norm": 0.32381526038507547, + "learning_rate": 1.8299839100730267e-05, + "loss": 2.6952, + "step": 48115 + }, + { + "epoch": 2.240147123868054, + 
"grad_norm": 0.3560415209641508, + "learning_rate": 1.8297744398907025e-05, + "loss": 2.6084, + "step": 48116 + }, + { + "epoch": 2.240193682054147, + "grad_norm": 0.31608785815384766, + "learning_rate": 1.8295649790126772e-05, + "loss": 2.6751, + "step": 48117 + }, + { + "epoch": 2.24024024024024, + "grad_norm": 0.3138806765803052, + "learning_rate": 1.8293555274395673e-05, + "loss": 2.6684, + "step": 48118 + }, + { + "epoch": 2.2402867984263333, + "grad_norm": 0.31128703836518695, + "learning_rate": 1.829146085171988e-05, + "loss": 2.6006, + "step": 48119 + }, + { + "epoch": 2.2403333566124264, + "grad_norm": 0.36081655326800594, + "learning_rate": 1.828936652210549e-05, + "loss": 2.7223, + "step": 48120 + }, + { + "epoch": 2.2403799147985195, + "grad_norm": 0.3504227413869046, + "learning_rate": 1.828727228555872e-05, + "loss": 2.7196, + "step": 48121 + }, + { + "epoch": 2.2404264729846126, + "grad_norm": 0.33349858110745495, + "learning_rate": 1.8285178142085663e-05, + "loss": 2.6541, + "step": 48122 + }, + { + "epoch": 2.2404730311707057, + "grad_norm": 0.3548427930157008, + "learning_rate": 1.8283084091692486e-05, + "loss": 2.6642, + "step": 48123 + }, + { + "epoch": 2.240519589356799, + "grad_norm": 0.34493451019491905, + "learning_rate": 1.828099013438535e-05, + "loss": 2.6956, + "step": 48124 + }, + { + "epoch": 2.240566147542892, + "grad_norm": 0.36634487481684896, + "learning_rate": 1.827889627017035e-05, + "loss": 2.7093, + "step": 48125 + }, + { + "epoch": 2.2406127057289846, + "grad_norm": 0.37311093842862236, + "learning_rate": 1.8276802499053693e-05, + "loss": 2.7783, + "step": 48126 + }, + { + "epoch": 2.2406592639150777, + "grad_norm": 0.3615913384318843, + "learning_rate": 1.8274708821041482e-05, + "loss": 2.7284, + "step": 48127 + }, + { + "epoch": 2.240705822101171, + "grad_norm": 0.3423152589721896, + "learning_rate": 1.8272615236139873e-05, + "loss": 2.7348, + "step": 48128 + }, + { + "epoch": 2.240752380287264, + "grad_norm": 0.34354799827567106, + "learning_rate": 1.8270521744355012e-05, + "loss": 2.7689, + "step": 48129 + }, + { + "epoch": 2.240798938473357, + "grad_norm": 0.34498178502326393, + "learning_rate": 1.826842834569305e-05, + "loss": 2.6019, + "step": 48130 + }, + { + "epoch": 2.24084549665945, + "grad_norm": 0.3328574318726017, + "learning_rate": 1.826633504016012e-05, + "loss": 2.5686, + "step": 48131 + }, + { + "epoch": 2.2408920548455433, + "grad_norm": 0.33338435716048137, + "learning_rate": 1.826424182776239e-05, + "loss": 2.6987, + "step": 48132 + }, + { + "epoch": 2.2409386130316364, + "grad_norm": 0.3420869273706528, + "learning_rate": 1.826214870850595e-05, + "loss": 2.7217, + "step": 48133 + }, + { + "epoch": 2.2409851712177296, + "grad_norm": 0.36280327542157936, + "learning_rate": 1.8260055682397005e-05, + "loss": 2.5761, + "step": 48134 + }, + { + "epoch": 2.2410317294038222, + "grad_norm": 0.3546493575536954, + "learning_rate": 1.825796274944166e-05, + "loss": 2.6172, + "step": 48135 + }, + { + "epoch": 2.2410782875899153, + "grad_norm": 0.3372141451906364, + "learning_rate": 1.825586990964606e-05, + "loss": 2.676, + "step": 48136 + }, + { + "epoch": 2.2411248457760085, + "grad_norm": 0.334573113151678, + "learning_rate": 1.8253777163016378e-05, + "loss": 2.6007, + "step": 48137 + }, + { + "epoch": 2.2411714039621016, + "grad_norm": 0.34207981239349944, + "learning_rate": 1.8251684509558693e-05, + "loss": 2.684, + "step": 48138 + }, + { + "epoch": 2.2412179621481947, + "grad_norm": 0.3318302617482107, + "learning_rate": 
1.8249591949279227e-05, + "loss": 2.7022, + "step": 48139 + }, + { + "epoch": 2.241264520334288, + "grad_norm": 0.35272096951815635, + "learning_rate": 1.8247499482184037e-05, + "loss": 2.5959, + "step": 48140 + }, + { + "epoch": 2.241311078520381, + "grad_norm": 0.34970007251562346, + "learning_rate": 1.824540710827935e-05, + "loss": 2.6293, + "step": 48141 + }, + { + "epoch": 2.241357636706474, + "grad_norm": 0.3267772129763615, + "learning_rate": 1.8243314827571244e-05, + "loss": 2.5471, + "step": 48142 + }, + { + "epoch": 2.241404194892567, + "grad_norm": 0.3386622187880626, + "learning_rate": 1.8241222640065886e-05, + "loss": 2.5998, + "step": 48143 + }, + { + "epoch": 2.2414507530786603, + "grad_norm": 0.33001218336998295, + "learning_rate": 1.8239130545769405e-05, + "loss": 2.6131, + "step": 48144 + }, + { + "epoch": 2.241497311264753, + "grad_norm": 0.33160811168500975, + "learning_rate": 1.8237038544687968e-05, + "loss": 2.6147, + "step": 48145 + }, + { + "epoch": 2.241543869450846, + "grad_norm": 0.3447290961557979, + "learning_rate": 1.8234946636827654e-05, + "loss": 2.6753, + "step": 48146 + }, + { + "epoch": 2.241590427636939, + "grad_norm": 0.34038837861479865, + "learning_rate": 1.823285482219468e-05, + "loss": 2.6505, + "step": 48147 + }, + { + "epoch": 2.2416369858230323, + "grad_norm": 0.3419297140985861, + "learning_rate": 1.823076310079513e-05, + "loss": 2.6796, + "step": 48148 + }, + { + "epoch": 2.2416835440091254, + "grad_norm": 0.3392597663630583, + "learning_rate": 1.822867147263516e-05, + "loss": 2.6587, + "step": 48149 + }, + { + "epoch": 2.2417301021952185, + "grad_norm": 0.3303374288132085, + "learning_rate": 1.8226579937720928e-05, + "loss": 2.685, + "step": 48150 + }, + { + "epoch": 2.2417766603813116, + "grad_norm": 0.3480523518546308, + "learning_rate": 1.822448849605852e-05, + "loss": 2.5835, + "step": 48151 + }, + { + "epoch": 2.2418232185674047, + "grad_norm": 0.3266531535961069, + "learning_rate": 1.8222397147654135e-05, + "loss": 2.6688, + "step": 48152 + }, + { + "epoch": 2.241869776753498, + "grad_norm": 0.3371740741025133, + "learning_rate": 1.8220305892513856e-05, + "loss": 2.5982, + "step": 48153 + }, + { + "epoch": 2.2419163349395905, + "grad_norm": 0.3550790430054196, + "learning_rate": 1.8218214730643884e-05, + "loss": 2.7583, + "step": 48154 + }, + { + "epoch": 2.2419628931256836, + "grad_norm": 0.33682320909395547, + "learning_rate": 1.82161236620503e-05, + "loss": 2.6538, + "step": 48155 + }, + { + "epoch": 2.2420094513117768, + "grad_norm": 0.3212186955547014, + "learning_rate": 1.8214032686739263e-05, + "loss": 2.6305, + "step": 48156 + }, + { + "epoch": 2.24205600949787, + "grad_norm": 0.33085071981805325, + "learning_rate": 1.821194180471691e-05, + "loss": 2.6513, + "step": 48157 + }, + { + "epoch": 2.242102567683963, + "grad_norm": 0.3182321746944325, + "learning_rate": 1.8209851015989376e-05, + "loss": 2.6272, + "step": 48158 + }, + { + "epoch": 2.242149125870056, + "grad_norm": 0.35525365390106367, + "learning_rate": 1.8207760320562793e-05, + "loss": 2.6763, + "step": 48159 + }, + { + "epoch": 2.242195684056149, + "grad_norm": 0.3122737904516164, + "learning_rate": 1.8205669718443323e-05, + "loss": 2.517, + "step": 48160 + }, + { + "epoch": 2.2422422422422423, + "grad_norm": 0.33097255170093925, + "learning_rate": 1.8203579209637062e-05, + "loss": 2.6366, + "step": 48161 + }, + { + "epoch": 2.2422888004283354, + "grad_norm": 0.3351129252583236, + "learning_rate": 1.8201488794150162e-05, + "loss": 2.5868, + "step": 48162 + }, + { + 
"epoch": 2.2423353586144286, + "grad_norm": 0.33885186816275414, + "learning_rate": 1.8199398471988776e-05, + "loss": 2.5777, + "step": 48163 + }, + { + "epoch": 2.2423819168005217, + "grad_norm": 0.30593088652090716, + "learning_rate": 1.8197308243158996e-05, + "loss": 2.5971, + "step": 48164 + }, + { + "epoch": 2.2424284749866144, + "grad_norm": 0.3226640138061226, + "learning_rate": 1.8195218107667013e-05, + "loss": 2.6339, + "step": 48165 + }, + { + "epoch": 2.2424750331727075, + "grad_norm": 0.31947962160196497, + "learning_rate": 1.8193128065518893e-05, + "loss": 2.6414, + "step": 48166 + }, + { + "epoch": 2.2425215913588006, + "grad_norm": 0.35624984560751194, + "learning_rate": 1.8191038116720842e-05, + "loss": 2.6668, + "step": 48167 + }, + { + "epoch": 2.2425681495448937, + "grad_norm": 0.33143953455933534, + "learning_rate": 1.818894826127895e-05, + "loss": 2.5873, + "step": 48168 + }, + { + "epoch": 2.242614707730987, + "grad_norm": 0.32432970663783617, + "learning_rate": 1.818685849919936e-05, + "loss": 2.6773, + "step": 48169 + }, + { + "epoch": 2.24266126591708, + "grad_norm": 0.3261128621943916, + "learning_rate": 1.8184768830488203e-05, + "loss": 2.5294, + "step": 48170 + }, + { + "epoch": 2.242707824103173, + "grad_norm": 0.3053737326754024, + "learning_rate": 1.818267925515162e-05, + "loss": 2.5017, + "step": 48171 + }, + { + "epoch": 2.242754382289266, + "grad_norm": 0.32833706241781035, + "learning_rate": 1.8180589773195733e-05, + "loss": 2.5698, + "step": 48172 + }, + { + "epoch": 2.2428009404753593, + "grad_norm": 0.32949705933995505, + "learning_rate": 1.81785003846267e-05, + "loss": 2.5929, + "step": 48173 + }, + { + "epoch": 2.242847498661452, + "grad_norm": 0.33621942572397007, + "learning_rate": 1.8176411089450617e-05, + "loss": 2.7023, + "step": 48174 + }, + { + "epoch": 2.242894056847545, + "grad_norm": 0.3253027944069855, + "learning_rate": 1.817432188767363e-05, + "loss": 2.6214, + "step": 48175 + }, + { + "epoch": 2.242940615033638, + "grad_norm": 0.32592831057620525, + "learning_rate": 1.8172232779301875e-05, + "loss": 2.6329, + "step": 48176 + }, + { + "epoch": 2.2429871732197313, + "grad_norm": 0.3530692062009976, + "learning_rate": 1.817014376434148e-05, + "loss": 2.6715, + "step": 48177 + }, + { + "epoch": 2.2430337314058244, + "grad_norm": 0.3419819608249376, + "learning_rate": 1.8168054842798593e-05, + "loss": 2.5815, + "step": 48178 + }, + { + "epoch": 2.2430802895919175, + "grad_norm": 0.31041167562622834, + "learning_rate": 1.8165966014679297e-05, + "loss": 2.4866, + "step": 48179 + }, + { + "epoch": 2.2431268477780106, + "grad_norm": 0.3318530613356197, + "learning_rate": 1.8163877279989795e-05, + "loss": 2.6027, + "step": 48180 + }, + { + "epoch": 2.2431734059641038, + "grad_norm": 0.32258209364657003, + "learning_rate": 1.8161788638736148e-05, + "loss": 2.6221, + "step": 48181 + }, + { + "epoch": 2.243219964150197, + "grad_norm": 0.32939977136458815, + "learning_rate": 1.815970009092452e-05, + "loss": 2.7695, + "step": 48182 + }, + { + "epoch": 2.24326652233629, + "grad_norm": 0.33227045273726963, + "learning_rate": 1.815761163656104e-05, + "loss": 2.6577, + "step": 48183 + }, + { + "epoch": 2.2433130805223827, + "grad_norm": 0.3266010577637822, + "learning_rate": 1.8155523275651825e-05, + "loss": 2.4214, + "step": 48184 + }, + { + "epoch": 2.2433596387084758, + "grad_norm": 0.3616543587148674, + "learning_rate": 1.8153435008203016e-05, + "loss": 2.6695, + "step": 48185 + }, + { + "epoch": 2.243406196894569, + "grad_norm": 0.3318843990048146, + 
"learning_rate": 1.8151346834220757e-05, + "loss": 2.5774, + "step": 48186 + }, + { + "epoch": 2.243452755080662, + "grad_norm": 0.3149874223676241, + "learning_rate": 1.8149258753711134e-05, + "loss": 2.6106, + "step": 48187 + }, + { + "epoch": 2.243499313266755, + "grad_norm": 0.34502103730675665, + "learning_rate": 1.8147170766680304e-05, + "loss": 2.6337, + "step": 48188 + }, + { + "epoch": 2.2435458714528482, + "grad_norm": 0.3498922328684923, + "learning_rate": 1.814508287313439e-05, + "loss": 2.7035, + "step": 48189 + }, + { + "epoch": 2.2435924296389413, + "grad_norm": 0.3193413469837284, + "learning_rate": 1.8142995073079518e-05, + "loss": 2.6879, + "step": 48190 + }, + { + "epoch": 2.2436389878250345, + "grad_norm": 0.33286294983639053, + "learning_rate": 1.8140907366521833e-05, + "loss": 2.5877, + "step": 48191 + }, + { + "epoch": 2.2436855460111276, + "grad_norm": 0.3371758902098584, + "learning_rate": 1.8138819753467413e-05, + "loss": 2.5802, + "step": 48192 + }, + { + "epoch": 2.2437321041972202, + "grad_norm": 0.3208599286497088, + "learning_rate": 1.813673223392245e-05, + "loss": 2.5654, + "step": 48193 + }, + { + "epoch": 2.2437786623833134, + "grad_norm": 0.3284388694661243, + "learning_rate": 1.813464480789301e-05, + "loss": 2.5889, + "step": 48194 + }, + { + "epoch": 2.2438252205694065, + "grad_norm": 0.33362047644133025, + "learning_rate": 1.8132557475385277e-05, + "loss": 2.7597, + "step": 48195 + }, + { + "epoch": 2.2438717787554996, + "grad_norm": 0.3396801246663502, + "learning_rate": 1.8130470236405334e-05, + "loss": 2.6675, + "step": 48196 + }, + { + "epoch": 2.2439183369415927, + "grad_norm": 0.3229583538757126, + "learning_rate": 1.8128383090959322e-05, + "loss": 2.6433, + "step": 48197 + }, + { + "epoch": 2.243964895127686, + "grad_norm": 0.33347930939981685, + "learning_rate": 1.8126296039053364e-05, + "loss": 2.6449, + "step": 48198 + }, + { + "epoch": 2.244011453313779, + "grad_norm": 0.34411773373210885, + "learning_rate": 1.8124209080693606e-05, + "loss": 2.6718, + "step": 48199 + }, + { + "epoch": 2.244058011499872, + "grad_norm": 0.3262918944301729, + "learning_rate": 1.8122122215886134e-05, + "loss": 2.5788, + "step": 48200 + }, + { + "epoch": 2.244104569685965, + "grad_norm": 0.37508724962702583, + "learning_rate": 1.8120035444637096e-05, + "loss": 2.7069, + "step": 48201 + }, + { + "epoch": 2.2441511278720583, + "grad_norm": 0.4019355347353557, + "learning_rate": 1.811794876695261e-05, + "loss": 2.6902, + "step": 48202 + }, + { + "epoch": 2.2441976860581514, + "grad_norm": 0.3122795904492757, + "learning_rate": 1.8115862182838804e-05, + "loss": 2.6276, + "step": 48203 + }, + { + "epoch": 2.244244244244244, + "grad_norm": 0.3417165338530874, + "learning_rate": 1.811377569230182e-05, + "loss": 2.6146, + "step": 48204 + }, + { + "epoch": 2.244290802430337, + "grad_norm": 0.36049869462965506, + "learning_rate": 1.811168929534773e-05, + "loss": 2.559, + "step": 48205 + }, + { + "epoch": 2.2443373606164303, + "grad_norm": 0.34262903284634194, + "learning_rate": 1.8109602991982723e-05, + "loss": 2.6853, + "step": 48206 + }, + { + "epoch": 2.2443839188025234, + "grad_norm": 0.3009276384423548, + "learning_rate": 1.810751678221286e-05, + "loss": 2.6871, + "step": 48207 + }, + { + "epoch": 2.2444304769886165, + "grad_norm": 0.34276974807687466, + "learning_rate": 1.810543066604432e-05, + "loss": 2.5857, + "step": 48208 + }, + { + "epoch": 2.2444770351747096, + "grad_norm": 0.3449198729341727, + "learning_rate": 1.8103344643483182e-05, + "loss": 2.6582, + 
"step": 48209 + }, + { + "epoch": 2.2445235933608028, + "grad_norm": 0.3342300905502087, + "learning_rate": 1.810125871453559e-05, + "loss": 2.6171, + "step": 48210 + }, + { + "epoch": 2.244570151546896, + "grad_norm": 0.3585838825521331, + "learning_rate": 1.809917287920766e-05, + "loss": 2.6623, + "step": 48211 + }, + { + "epoch": 2.244616709732989, + "grad_norm": 0.3298341101123946, + "learning_rate": 1.8097087137505514e-05, + "loss": 2.6369, + "step": 48212 + }, + { + "epoch": 2.2446632679190817, + "grad_norm": 0.3397570381437335, + "learning_rate": 1.8095001489435286e-05, + "loss": 2.6545, + "step": 48213 + }, + { + "epoch": 2.2447098261051748, + "grad_norm": 0.3589605831977728, + "learning_rate": 1.8092915935003074e-05, + "loss": 2.6721, + "step": 48214 + }, + { + "epoch": 2.244756384291268, + "grad_norm": 0.33442630738344603, + "learning_rate": 1.8090830474215016e-05, + "loss": 2.5802, + "step": 48215 + }, + { + "epoch": 2.244802942477361, + "grad_norm": 0.3618868640100948, + "learning_rate": 1.8088745107077214e-05, + "loss": 2.6775, + "step": 48216 + }, + { + "epoch": 2.244849500663454, + "grad_norm": 0.3438776875196585, + "learning_rate": 1.8086659833595825e-05, + "loss": 2.6367, + "step": 48217 + }, + { + "epoch": 2.2448960588495472, + "grad_norm": 0.34945501020294184, + "learning_rate": 1.8084574653776914e-05, + "loss": 2.7782, + "step": 48218 + }, + { + "epoch": 2.2449426170356404, + "grad_norm": 0.3312074769027594, + "learning_rate": 1.8082489567626666e-05, + "loss": 2.6494, + "step": 48219 + }, + { + "epoch": 2.2449891752217335, + "grad_norm": 0.3298296145731227, + "learning_rate": 1.8080404575151128e-05, + "loss": 2.581, + "step": 48220 + }, + { + "epoch": 2.2450357334078266, + "grad_norm": 0.3491422669048606, + "learning_rate": 1.8078319676356498e-05, + "loss": 2.6016, + "step": 48221 + }, + { + "epoch": 2.2450822915939197, + "grad_norm": 0.3267314383184824, + "learning_rate": 1.807623487124883e-05, + "loss": 2.7064, + "step": 48222 + }, + { + "epoch": 2.2451288497800124, + "grad_norm": 0.33358015674443, + "learning_rate": 1.8074150159834274e-05, + "loss": 2.7095, + "step": 48223 + }, + { + "epoch": 2.2451754079661055, + "grad_norm": 0.35386161088354023, + "learning_rate": 1.8072065542118942e-05, + "loss": 2.698, + "step": 48224 + }, + { + "epoch": 2.2452219661521986, + "grad_norm": 0.33324320433225907, + "learning_rate": 1.806998101810895e-05, + "loss": 2.6516, + "step": 48225 + }, + { + "epoch": 2.2452685243382917, + "grad_norm": 0.34522465503116484, + "learning_rate": 1.8067896587810436e-05, + "loss": 2.6593, + "step": 48226 + }, + { + "epoch": 2.245315082524385, + "grad_norm": 0.3458576450168232, + "learning_rate": 1.806581225122948e-05, + "loss": 2.7275, + "step": 48227 + }, + { + "epoch": 2.245361640710478, + "grad_norm": 0.31073227865231107, + "learning_rate": 1.806372800837222e-05, + "loss": 2.5498, + "step": 48228 + }, + { + "epoch": 2.245408198896571, + "grad_norm": 0.3618960201364077, + "learning_rate": 1.8061643859244775e-05, + "loss": 2.6239, + "step": 48229 + }, + { + "epoch": 2.245454757082664, + "grad_norm": 0.3419410376203326, + "learning_rate": 1.8059559803853254e-05, + "loss": 2.6317, + "step": 48230 + }, + { + "epoch": 2.2455013152687573, + "grad_norm": 0.35476741513820736, + "learning_rate": 1.8057475842203785e-05, + "loss": 2.5621, + "step": 48231 + }, + { + "epoch": 2.2455478734548504, + "grad_norm": 0.34647917627338387, + "learning_rate": 1.805539197430249e-05, + "loss": 2.6361, + "step": 48232 + }, + { + "epoch": 2.245594431640943, + "grad_norm": 
0.3520437331192955, + "learning_rate": 1.8053308200155438e-05, + "loss": 2.7411, + "step": 48233 + }, + { + "epoch": 2.245640989827036, + "grad_norm": 0.3469805169051313, + "learning_rate": 1.8051224519768815e-05, + "loss": 2.5962, + "step": 48234 + }, + { + "epoch": 2.2456875480131293, + "grad_norm": 0.3308967886778708, + "learning_rate": 1.8049140933148685e-05, + "loss": 2.6723, + "step": 48235 + }, + { + "epoch": 2.2457341061992224, + "grad_norm": 0.3280514977656855, + "learning_rate": 1.8047057440301175e-05, + "loss": 2.5383, + "step": 48236 + }, + { + "epoch": 2.2457806643853155, + "grad_norm": 0.31967084311712807, + "learning_rate": 1.8044974041232405e-05, + "loss": 2.5246, + "step": 48237 + }, + { + "epoch": 2.2458272225714087, + "grad_norm": 0.3515631173383081, + "learning_rate": 1.8042890735948492e-05, + "loss": 2.5558, + "step": 48238 + }, + { + "epoch": 2.2458737807575018, + "grad_norm": 0.3243568375778216, + "learning_rate": 1.8040807524455555e-05, + "loss": 2.6406, + "step": 48239 + }, + { + "epoch": 2.245920338943595, + "grad_norm": 0.3178460787774303, + "learning_rate": 1.8038724406759688e-05, + "loss": 2.663, + "step": 48240 + }, + { + "epoch": 2.245966897129688, + "grad_norm": 0.3529902514519547, + "learning_rate": 1.8036641382867015e-05, + "loss": 2.6419, + "step": 48241 + }, + { + "epoch": 2.246013455315781, + "grad_norm": 0.3377299420607696, + "learning_rate": 1.803455845278365e-05, + "loss": 2.5963, + "step": 48242 + }, + { + "epoch": 2.246060013501874, + "grad_norm": 0.35896026391521396, + "learning_rate": 1.8032475616515708e-05, + "loss": 2.782, + "step": 48243 + }, + { + "epoch": 2.246106571687967, + "grad_norm": 0.33740427133074236, + "learning_rate": 1.80303928740693e-05, + "loss": 2.6882, + "step": 48244 + }, + { + "epoch": 2.24615312987406, + "grad_norm": 0.3429414764660365, + "learning_rate": 1.802831022545056e-05, + "loss": 2.7424, + "step": 48245 + }, + { + "epoch": 2.246199688060153, + "grad_norm": 0.3260370067677552, + "learning_rate": 1.802622767066554e-05, + "loss": 2.6213, + "step": 48246 + }, + { + "epoch": 2.2462462462462462, + "grad_norm": 0.3310534822276369, + "learning_rate": 1.8024145209720434e-05, + "loss": 2.4627, + "step": 48247 + }, + { + "epoch": 2.2462928044323394, + "grad_norm": 0.3291944854955884, + "learning_rate": 1.8022062842621285e-05, + "loss": 2.6052, + "step": 48248 + }, + { + "epoch": 2.2463393626184325, + "grad_norm": 0.3261840443962462, + "learning_rate": 1.8019980569374235e-05, + "loss": 2.6932, + "step": 48249 + }, + { + "epoch": 2.2463859208045256, + "grad_norm": 0.3144072566760245, + "learning_rate": 1.8017898389985394e-05, + "loss": 2.5924, + "step": 48250 + }, + { + "epoch": 2.2464324789906187, + "grad_norm": 0.3192320508975547, + "learning_rate": 1.801581630446087e-05, + "loss": 2.631, + "step": 48251 + }, + { + "epoch": 2.2464790371767114, + "grad_norm": 0.32103602528565983, + "learning_rate": 1.8013734312806784e-05, + "loss": 2.5426, + "step": 48252 + }, + { + "epoch": 2.2465255953628045, + "grad_norm": 0.3387283928343458, + "learning_rate": 1.801165241502921e-05, + "loss": 2.6563, + "step": 48253 + }, + { + "epoch": 2.2465721535488976, + "grad_norm": 0.33867910033225057, + "learning_rate": 1.8009570611134318e-05, + "loss": 2.6837, + "step": 48254 + }, + { + "epoch": 2.2466187117349907, + "grad_norm": 0.3006406807822453, + "learning_rate": 1.8007488901128168e-05, + "loss": 2.6142, + "step": 48255 + }, + { + "epoch": 2.246665269921084, + "grad_norm": 0.3695637258491557, + "learning_rate": 1.8005407285016884e-05, + "loss": 
2.7411, + "step": 48256 + }, + { + "epoch": 2.246711828107177, + "grad_norm": 0.34195284011941374, + "learning_rate": 1.800332576280658e-05, + "loss": 2.6076, + "step": 48257 + }, + { + "epoch": 2.24675838629327, + "grad_norm": 0.32091050393076676, + "learning_rate": 1.8001244334503377e-05, + "loss": 2.5749, + "step": 48258 + }, + { + "epoch": 2.246804944479363, + "grad_norm": 0.3412410567273214, + "learning_rate": 1.7999163000113334e-05, + "loss": 2.6865, + "step": 48259 + }, + { + "epoch": 2.2468515026654563, + "grad_norm": 0.31520398404602323, + "learning_rate": 1.7997081759642632e-05, + "loss": 2.6025, + "step": 48260 + }, + { + "epoch": 2.2468980608515494, + "grad_norm": 0.3110506158340704, + "learning_rate": 1.7995000613097328e-05, + "loss": 2.5667, + "step": 48261 + }, + { + "epoch": 2.246944619037642, + "grad_norm": 0.31946224760917075, + "learning_rate": 1.799291956048354e-05, + "loss": 2.5968, + "step": 48262 + }, + { + "epoch": 2.246991177223735, + "grad_norm": 0.32032708048174124, + "learning_rate": 1.799083860180738e-05, + "loss": 2.5511, + "step": 48263 + }, + { + "epoch": 2.2470377354098283, + "grad_norm": 0.33653626531756015, + "learning_rate": 1.7988757737074957e-05, + "loss": 2.7424, + "step": 48264 + }, + { + "epoch": 2.2470842935959214, + "grad_norm": 0.3281603414341052, + "learning_rate": 1.7986676966292394e-05, + "loss": 2.6922, + "step": 48265 + }, + { + "epoch": 2.2471308517820145, + "grad_norm": 0.34816467918502597, + "learning_rate": 1.7984596289465744e-05, + "loss": 2.6253, + "step": 48266 + }, + { + "epoch": 2.2471774099681077, + "grad_norm": 0.3287679830882944, + "learning_rate": 1.7982515706601184e-05, + "loss": 2.6054, + "step": 48267 + }, + { + "epoch": 2.247223968154201, + "grad_norm": 0.30469180917378663, + "learning_rate": 1.798043521770477e-05, + "loss": 2.6597, + "step": 48268 + }, + { + "epoch": 2.247270526340294, + "grad_norm": 0.3364099569831459, + "learning_rate": 1.7978354822782622e-05, + "loss": 2.5498, + "step": 48269 + }, + { + "epoch": 2.247317084526387, + "grad_norm": 0.34796389565337743, + "learning_rate": 1.7976274521840853e-05, + "loss": 2.7557, + "step": 48270 + }, + { + "epoch": 2.24736364271248, + "grad_norm": 0.35009548148298664, + "learning_rate": 1.797419431488556e-05, + "loss": 2.6843, + "step": 48271 + }, + { + "epoch": 2.247410200898573, + "grad_norm": 0.316346405320806, + "learning_rate": 1.797211420192285e-05, + "loss": 2.5955, + "step": 48272 + }, + { + "epoch": 2.247456759084666, + "grad_norm": 0.3587415244194187, + "learning_rate": 1.797003418295885e-05, + "loss": 2.66, + "step": 48273 + }, + { + "epoch": 2.247503317270759, + "grad_norm": 0.33064590521246995, + "learning_rate": 1.7967954257999624e-05, + "loss": 2.5199, + "step": 48274 + }, + { + "epoch": 2.247549875456852, + "grad_norm": 0.35752676359181107, + "learning_rate": 1.79658744270513e-05, + "loss": 2.5751, + "step": 48275 + }, + { + "epoch": 2.2475964336429453, + "grad_norm": 0.3186617069585651, + "learning_rate": 1.7963794690119978e-05, + "loss": 2.6601, + "step": 48276 + }, + { + "epoch": 2.2476429918290384, + "grad_norm": 0.3258345082963749, + "learning_rate": 1.7961715047211762e-05, + "loss": 2.6059, + "step": 48277 + }, + { + "epoch": 2.2476895500151315, + "grad_norm": 0.33250697712080485, + "learning_rate": 1.7959635498332772e-05, + "loss": 2.6256, + "step": 48278 + }, + { + "epoch": 2.2477361082012246, + "grad_norm": 0.35229877301091117, + "learning_rate": 1.795755604348906e-05, + "loss": 2.6329, + "step": 48279 + }, + { + "epoch": 2.2477826663873177, + 
"grad_norm": 0.3199525122769461, + "learning_rate": 1.79554766826868e-05, + "loss": 2.5295, + "step": 48280 + }, + { + "epoch": 2.247829224573411, + "grad_norm": 0.3292521516275416, + "learning_rate": 1.7953397415932044e-05, + "loss": 2.6664, + "step": 48281 + }, + { + "epoch": 2.2478757827595035, + "grad_norm": 0.3405664424314123, + "learning_rate": 1.795131824323091e-05, + "loss": 2.552, + "step": 48282 + }, + { + "epoch": 2.2479223409455966, + "grad_norm": 0.3203820494793497, + "learning_rate": 1.7949239164589494e-05, + "loss": 2.6375, + "step": 48283 + }, + { + "epoch": 2.2479688991316897, + "grad_norm": 0.34072472163398876, + "learning_rate": 1.7947160180013906e-05, + "loss": 2.6775, + "step": 48284 + }, + { + "epoch": 2.248015457317783, + "grad_norm": 0.33645571820453113, + "learning_rate": 1.7945081289510247e-05, + "loss": 2.6365, + "step": 48285 + }, + { + "epoch": 2.248062015503876, + "grad_norm": 0.34305062176983375, + "learning_rate": 1.794300249308463e-05, + "loss": 2.7144, + "step": 48286 + }, + { + "epoch": 2.248108573689969, + "grad_norm": 0.33546056705386534, + "learning_rate": 1.794092379074313e-05, + "loss": 2.6192, + "step": 48287 + }, + { + "epoch": 2.248155131876062, + "grad_norm": 0.3233220063634559, + "learning_rate": 1.7938845182491858e-05, + "loss": 2.6211, + "step": 48288 + }, + { + "epoch": 2.2482016900621553, + "grad_norm": 0.34417173586696986, + "learning_rate": 1.7936766668336918e-05, + "loss": 2.6778, + "step": 48289 + }, + { + "epoch": 2.2482482482482484, + "grad_norm": 0.3375542852673378, + "learning_rate": 1.793468824828441e-05, + "loss": 2.5525, + "step": 48290 + }, + { + "epoch": 2.248294806434341, + "grad_norm": 0.3195400739965421, + "learning_rate": 1.7932609922340447e-05, + "loss": 2.6548, + "step": 48291 + }, + { + "epoch": 2.248341364620434, + "grad_norm": 0.33556288189978806, + "learning_rate": 1.7930531690511083e-05, + "loss": 2.703, + "step": 48292 + }, + { + "epoch": 2.2483879228065273, + "grad_norm": 0.3580540294688811, + "learning_rate": 1.792845355280248e-05, + "loss": 2.6666, + "step": 48293 + }, + { + "epoch": 2.2484344809926204, + "grad_norm": 0.353459990600825, + "learning_rate": 1.7926375509220694e-05, + "loss": 2.6772, + "step": 48294 + }, + { + "epoch": 2.2484810391787136, + "grad_norm": 0.32449064346851886, + "learning_rate": 1.792429755977183e-05, + "loss": 2.5679, + "step": 48295 + }, + { + "epoch": 2.2485275973648067, + "grad_norm": 0.3408077339271353, + "learning_rate": 1.7922219704461996e-05, + "loss": 2.614, + "step": 48296 + }, + { + "epoch": 2.2485741555509, + "grad_norm": 0.32506822223252974, + "learning_rate": 1.7920141943297293e-05, + "loss": 2.5684, + "step": 48297 + }, + { + "epoch": 2.248620713736993, + "grad_norm": 0.32971994654739495, + "learning_rate": 1.7918064276283803e-05, + "loss": 2.7092, + "step": 48298 + }, + { + "epoch": 2.248667271923086, + "grad_norm": 0.33882163703483065, + "learning_rate": 1.7915986703427657e-05, + "loss": 2.6511, + "step": 48299 + }, + { + "epoch": 2.248713830109179, + "grad_norm": 0.32093548578241266, + "learning_rate": 1.791390922473491e-05, + "loss": 2.5804, + "step": 48300 + }, + { + "epoch": 2.2487603882952723, + "grad_norm": 0.3516386954721444, + "learning_rate": 1.791183184021168e-05, + "loss": 2.6613, + "step": 48301 + }, + { + "epoch": 2.248806946481365, + "grad_norm": 0.35299514638219714, + "learning_rate": 1.7909754549864055e-05, + "loss": 2.7266, + "step": 48302 + }, + { + "epoch": 2.248853504667458, + "grad_norm": 0.3086115081820844, + "learning_rate": 1.7907677353698148e-05, 
+ "loss": 2.6491, + "step": 48303 + }, + { + "epoch": 2.248900062853551, + "grad_norm": 0.3417041535872215, + "learning_rate": 1.7905600251720062e-05, + "loss": 2.6585, + "step": 48304 + }, + { + "epoch": 2.2489466210396443, + "grad_norm": 0.35548034685601065, + "learning_rate": 1.7903523243935838e-05, + "loss": 2.7005, + "step": 48305 + }, + { + "epoch": 2.2489931792257374, + "grad_norm": 0.32912396915674197, + "learning_rate": 1.7901446330351652e-05, + "loss": 2.648, + "step": 48306 + }, + { + "epoch": 2.2490397374118305, + "grad_norm": 0.33424172787258327, + "learning_rate": 1.789936951097352e-05, + "loss": 2.6911, + "step": 48307 + }, + { + "epoch": 2.2490862955979236, + "grad_norm": 0.31770086221489713, + "learning_rate": 1.7897292785807614e-05, + "loss": 2.6115, + "step": 48308 + }, + { + "epoch": 2.2491328537840167, + "grad_norm": 0.333536034819564, + "learning_rate": 1.7895216154859966e-05, + "loss": 2.5821, + "step": 48309 + }, + { + "epoch": 2.24917941197011, + "grad_norm": 0.32217411643229044, + "learning_rate": 1.7893139618136706e-05, + "loss": 2.6107, + "step": 48310 + }, + { + "epoch": 2.2492259701562025, + "grad_norm": 0.32897511457548395, + "learning_rate": 1.7891063175643914e-05, + "loss": 2.6314, + "step": 48311 + }, + { + "epoch": 2.2492725283422956, + "grad_norm": 0.3475474991374012, + "learning_rate": 1.7888986827387705e-05, + "loss": 2.6157, + "step": 48312 + }, + { + "epoch": 2.2493190865283887, + "grad_norm": 0.32920344108763594, + "learning_rate": 1.788691057337414e-05, + "loss": 2.668, + "step": 48313 + }, + { + "epoch": 2.249365644714482, + "grad_norm": 0.314109497603787, + "learning_rate": 1.7884834413609324e-05, + "loss": 2.5909, + "step": 48314 + }, + { + "epoch": 2.249412202900575, + "grad_norm": 0.32873222727741974, + "learning_rate": 1.788275834809936e-05, + "loss": 2.6507, + "step": 48315 + }, + { + "epoch": 2.249458761086668, + "grad_norm": 0.33210402503553216, + "learning_rate": 1.788068237685034e-05, + "loss": 2.639, + "step": 48316 + }, + { + "epoch": 2.249505319272761, + "grad_norm": 0.33669249968354636, + "learning_rate": 1.787860649986836e-05, + "loss": 2.6927, + "step": 48317 + }, + { + "epoch": 2.2495518774588543, + "grad_norm": 0.3500517872725554, + "learning_rate": 1.787653071715948e-05, + "loss": 2.6899, + "step": 48318 + }, + { + "epoch": 2.2495984356449474, + "grad_norm": 0.32602936691116374, + "learning_rate": 1.7874455028729846e-05, + "loss": 2.6618, + "step": 48319 + }, + { + "epoch": 2.2496449938310406, + "grad_norm": 0.32232436492814337, + "learning_rate": 1.787237943458549e-05, + "loss": 2.6619, + "step": 48320 + }, + { + "epoch": 2.2496915520171332, + "grad_norm": 0.3437287474640071, + "learning_rate": 1.787030393473257e-05, + "loss": 2.6247, + "step": 48321 + }, + { + "epoch": 2.2497381102032263, + "grad_norm": 0.338724134836838, + "learning_rate": 1.786822852917712e-05, + "loss": 2.7415, + "step": 48322 + }, + { + "epoch": 2.2497846683893195, + "grad_norm": 0.33215191988348314, + "learning_rate": 1.7866153217925263e-05, + "loss": 2.5637, + "step": 48323 + }, + { + "epoch": 2.2498312265754126, + "grad_norm": 0.3117368149507764, + "learning_rate": 1.7864078000983077e-05, + "loss": 2.5882, + "step": 48324 + }, + { + "epoch": 2.2498777847615057, + "grad_norm": 0.34749671408813326, + "learning_rate": 1.7862002878356654e-05, + "loss": 2.6368, + "step": 48325 + }, + { + "epoch": 2.249924342947599, + "grad_norm": 0.36472131749126696, + "learning_rate": 1.7859927850052103e-05, + "loss": 2.6254, + "step": 48326 + }, + { + "epoch": 
2.249970901133692, + "grad_norm": 0.33685878577310974, + "learning_rate": 1.7857852916075486e-05, + "loss": 2.6055, + "step": 48327 + }, + { + "epoch": 2.250017459319785, + "grad_norm": 0.34954377656564084, + "learning_rate": 1.7855778076432898e-05, + "loss": 2.6686, + "step": 48328 + }, + { + "epoch": 2.250064017505878, + "grad_norm": 0.3320953011118654, + "learning_rate": 1.785370333113044e-05, + "loss": 2.6867, + "step": 48329 + }, + { + "epoch": 2.250110575691971, + "grad_norm": 0.3292934948504622, + "learning_rate": 1.785162868017421e-05, + "loss": 2.657, + "step": 48330 + }, + { + "epoch": 2.250157133878064, + "grad_norm": 0.3413166647488859, + "learning_rate": 1.7849554123570245e-05, + "loss": 2.6613, + "step": 48331 + }, + { + "epoch": 2.250203692064157, + "grad_norm": 0.34134991543941207, + "learning_rate": 1.784747966132471e-05, + "loss": 2.6522, + "step": 48332 + }, + { + "epoch": 2.25025025025025, + "grad_norm": 0.33343685629311104, + "learning_rate": 1.7845405293443618e-05, + "loss": 2.5543, + "step": 48333 + }, + { + "epoch": 2.2502968084363433, + "grad_norm": 0.33311802470066276, + "learning_rate": 1.784333101993313e-05, + "loss": 2.6712, + "step": 48334 + }, + { + "epoch": 2.2503433666224364, + "grad_norm": 0.331548444174259, + "learning_rate": 1.784125684079928e-05, + "loss": 2.7353, + "step": 48335 + }, + { + "epoch": 2.2503899248085295, + "grad_norm": 0.316828407923944, + "learning_rate": 1.7839182756048172e-05, + "loss": 2.658, + "step": 48336 + }, + { + "epoch": 2.2504364829946226, + "grad_norm": 0.33072186539901127, + "learning_rate": 1.7837108765685896e-05, + "loss": 2.5652, + "step": 48337 + }, + { + "epoch": 2.2504830411807157, + "grad_norm": 0.36759145025727374, + "learning_rate": 1.7835034869718537e-05, + "loss": 2.6385, + "step": 48338 + }, + { + "epoch": 2.250529599366809, + "grad_norm": 0.3614049438182594, + "learning_rate": 1.78329610681522e-05, + "loss": 2.6861, + "step": 48339 + }, + { + "epoch": 2.250576157552902, + "grad_norm": 0.3377476405206838, + "learning_rate": 1.7830887360992933e-05, + "loss": 2.7211, + "step": 48340 + }, + { + "epoch": 2.2506227157389946, + "grad_norm": 0.3558996884832667, + "learning_rate": 1.7828813748246842e-05, + "loss": 2.5377, + "step": 48341 + }, + { + "epoch": 2.2506692739250878, + "grad_norm": 0.35302658341750753, + "learning_rate": 1.7826740229920015e-05, + "loss": 2.6528, + "step": 48342 + }, + { + "epoch": 2.250715832111181, + "grad_norm": 0.32811875538021096, + "learning_rate": 1.7824666806018537e-05, + "loss": 2.5655, + "step": 48343 + }, + { + "epoch": 2.250762390297274, + "grad_norm": 0.3643832528916006, + "learning_rate": 1.7822593476548487e-05, + "loss": 2.6965, + "step": 48344 + }, + { + "epoch": 2.250808948483367, + "grad_norm": 0.34825277964911605, + "learning_rate": 1.7820520241515983e-05, + "loss": 2.7374, + "step": 48345 + }, + { + "epoch": 2.25085550666946, + "grad_norm": 0.3321448428435349, + "learning_rate": 1.781844710092704e-05, + "loss": 2.7151, + "step": 48346 + }, + { + "epoch": 2.2509020648555533, + "grad_norm": 0.30929582300809394, + "learning_rate": 1.7816374054787816e-05, + "loss": 2.6502, + "step": 48347 + }, + { + "epoch": 2.2509486230416464, + "grad_norm": 0.3651110513242174, + "learning_rate": 1.7814301103104346e-05, + "loss": 2.6064, + "step": 48348 + }, + { + "epoch": 2.2509951812277396, + "grad_norm": 0.35685515393713696, + "learning_rate": 1.781222824588274e-05, + "loss": 2.5893, + "step": 48349 + }, + { + "epoch": 2.2510417394138322, + "grad_norm": 0.32690354869514837, + "learning_rate": 
1.7810155483129065e-05, + "loss": 2.6665, + "step": 48350 + }, + { + "epoch": 2.2510882975999253, + "grad_norm": 0.3365446401835002, + "learning_rate": 1.780808281484942e-05, + "loss": 2.6193, + "step": 48351 + }, + { + "epoch": 2.2511348557860185, + "grad_norm": 0.3497972217180604, + "learning_rate": 1.7806010241049892e-05, + "loss": 2.6605, + "step": 48352 + }, + { + "epoch": 2.2511814139721116, + "grad_norm": 0.331523661315135, + "learning_rate": 1.7803937761736535e-05, + "loss": 2.6761, + "step": 48353 + }, + { + "epoch": 2.2512279721582047, + "grad_norm": 0.3438272324390983, + "learning_rate": 1.780186537691545e-05, + "loss": 2.7698, + "step": 48354 + }, + { + "epoch": 2.251274530344298, + "grad_norm": 0.3330124010575486, + "learning_rate": 1.779979308659272e-05, + "loss": 2.6771, + "step": 48355 + }, + { + "epoch": 2.251321088530391, + "grad_norm": 0.34074040028962177, + "learning_rate": 1.7797720890774422e-05, + "loss": 2.5549, + "step": 48356 + }, + { + "epoch": 2.251367646716484, + "grad_norm": 0.3310073850477008, + "learning_rate": 1.7795648789466644e-05, + "loss": 2.6322, + "step": 48357 + }, + { + "epoch": 2.251414204902577, + "grad_norm": 0.3397972935481329, + "learning_rate": 1.7793576782675482e-05, + "loss": 2.6564, + "step": 48358 + }, + { + "epoch": 2.2514607630886703, + "grad_norm": 0.3486143721612522, + "learning_rate": 1.7791504870406962e-05, + "loss": 2.5461, + "step": 48359 + }, + { + "epoch": 2.2515073212747634, + "grad_norm": 0.36292370420996073, + "learning_rate": 1.778943305266723e-05, + "loss": 2.637, + "step": 48360 + }, + { + "epoch": 2.251553879460856, + "grad_norm": 0.3401394430812633, + "learning_rate": 1.7787361329462316e-05, + "loss": 2.6453, + "step": 48361 + }, + { + "epoch": 2.251600437646949, + "grad_norm": 0.3098295507721994, + "learning_rate": 1.778528970079835e-05, + "loss": 2.5457, + "step": 48362 + }, + { + "epoch": 2.2516469958330423, + "grad_norm": 0.35164353011079497, + "learning_rate": 1.778321816668137e-05, + "loss": 2.5322, + "step": 48363 + }, + { + "epoch": 2.2516935540191354, + "grad_norm": 0.36695508955917766, + "learning_rate": 1.7781146727117477e-05, + "loss": 2.6457, + "step": 48364 + }, + { + "epoch": 2.2517401122052285, + "grad_norm": 0.3410596551233309, + "learning_rate": 1.7779075382112753e-05, + "loss": 2.5322, + "step": 48365 + }, + { + "epoch": 2.2517866703913216, + "grad_norm": 0.33050189256814294, + "learning_rate": 1.7777004131673254e-05, + "loss": 2.6627, + "step": 48366 + }, + { + "epoch": 2.2518332285774147, + "grad_norm": 0.3414259515787848, + "learning_rate": 1.7774932975805075e-05, + "loss": 2.593, + "step": 48367 + }, + { + "epoch": 2.251879786763508, + "grad_norm": 0.3544718230603915, + "learning_rate": 1.7772861914514287e-05, + "loss": 2.6576, + "step": 48368 + }, + { + "epoch": 2.2519263449496005, + "grad_norm": 0.3449299336180158, + "learning_rate": 1.777079094780698e-05, + "loss": 2.6707, + "step": 48369 + }, + { + "epoch": 2.2519729031356936, + "grad_norm": 0.31475133238762054, + "learning_rate": 1.7768720075689227e-05, + "loss": 2.5838, + "step": 48370 + }, + { + "epoch": 2.2520194613217868, + "grad_norm": 0.3333143561090022, + "learning_rate": 1.776664929816712e-05, + "loss": 2.6307, + "step": 48371 + }, + { + "epoch": 2.25206601950788, + "grad_norm": 0.33535679577967464, + "learning_rate": 1.776457861524669e-05, + "loss": 2.5969, + "step": 48372 + }, + { + "epoch": 2.252112577693973, + "grad_norm": 0.334113478958792, + "learning_rate": 1.7762508026934076e-05, + "loss": 2.6106, + "step": 48373 + }, + { + 
"epoch": 2.252159135880066, + "grad_norm": 0.32528128811082757, + "learning_rate": 1.7760437533235296e-05, + "loss": 2.6437, + "step": 48374 + }, + { + "epoch": 2.2522056940661592, + "grad_norm": 0.351931156045808, + "learning_rate": 1.7758367134156482e-05, + "loss": 2.6009, + "step": 48375 + }, + { + "epoch": 2.2522522522522523, + "grad_norm": 0.380223000412542, + "learning_rate": 1.7756296829703677e-05, + "loss": 2.6069, + "step": 48376 + }, + { + "epoch": 2.2522988104383455, + "grad_norm": 0.358148002896652, + "learning_rate": 1.7754226619882958e-05, + "loss": 2.5605, + "step": 48377 + }, + { + "epoch": 2.2523453686244386, + "grad_norm": 0.3899879856538761, + "learning_rate": 1.775215650470043e-05, + "loss": 2.6893, + "step": 48378 + }, + { + "epoch": 2.2523919268105317, + "grad_norm": 0.3350164199497337, + "learning_rate": 1.7750086484162105e-05, + "loss": 2.5955, + "step": 48379 + }, + { + "epoch": 2.2524384849966244, + "grad_norm": 0.3617358895171935, + "learning_rate": 1.7748016558274134e-05, + "loss": 2.6683, + "step": 48380 + }, + { + "epoch": 2.2524850431827175, + "grad_norm": 0.3394195570304972, + "learning_rate": 1.7745946727042546e-05, + "loss": 2.6163, + "step": 48381 + }, + { + "epoch": 2.2525316013688106, + "grad_norm": 0.33340711227037106, + "learning_rate": 1.774387699047342e-05, + "loss": 2.6139, + "step": 48382 + }, + { + "epoch": 2.2525781595549037, + "grad_norm": 0.3242230197897861, + "learning_rate": 1.7741807348572837e-05, + "loss": 2.6497, + "step": 48383 + }, + { + "epoch": 2.252624717740997, + "grad_norm": 0.381848325597386, + "learning_rate": 1.7739737801346894e-05, + "loss": 2.6846, + "step": 48384 + }, + { + "epoch": 2.25267127592709, + "grad_norm": 0.3580156367472993, + "learning_rate": 1.7737668348801605e-05, + "loss": 2.7453, + "step": 48385 + }, + { + "epoch": 2.252717834113183, + "grad_norm": 0.3376243261418325, + "learning_rate": 1.7735598990943114e-05, + "loss": 2.7329, + "step": 48386 + }, + { + "epoch": 2.252764392299276, + "grad_norm": 0.3398019078836982, + "learning_rate": 1.7733529727777432e-05, + "loss": 2.5903, + "step": 48387 + }, + { + "epoch": 2.2528109504853693, + "grad_norm": 0.34567679288362174, + "learning_rate": 1.7731460559310694e-05, + "loss": 2.5969, + "step": 48388 + }, + { + "epoch": 2.252857508671462, + "grad_norm": 0.3291983177452127, + "learning_rate": 1.772939148554892e-05, + "loss": 2.652, + "step": 48389 + }, + { + "epoch": 2.252904066857555, + "grad_norm": 0.33210261558765675, + "learning_rate": 1.77273225064982e-05, + "loss": 2.5952, + "step": 48390 + }, + { + "epoch": 2.252950625043648, + "grad_norm": 0.35044817828121483, + "learning_rate": 1.7725253622164635e-05, + "loss": 2.6353, + "step": 48391 + }, + { + "epoch": 2.2529971832297413, + "grad_norm": 0.3587693988428661, + "learning_rate": 1.772318483255423e-05, + "loss": 2.7454, + "step": 48392 + }, + { + "epoch": 2.2530437414158344, + "grad_norm": 0.3437721646670298, + "learning_rate": 1.7721116137673133e-05, + "loss": 2.589, + "step": 48393 + }, + { + "epoch": 2.2530902996019275, + "grad_norm": 0.3314778005765173, + "learning_rate": 1.7719047537527365e-05, + "loss": 2.6399, + "step": 48394 + }, + { + "epoch": 2.2531368577880206, + "grad_norm": 0.33620904172073607, + "learning_rate": 1.7716979032123016e-05, + "loss": 2.6705, + "step": 48395 + }, + { + "epoch": 2.2531834159741138, + "grad_norm": 0.3507744266419259, + "learning_rate": 1.7714910621466145e-05, + "loss": 2.669, + "step": 48396 + }, + { + "epoch": 2.253229974160207, + "grad_norm": 0.3429150943838576, + 
"learning_rate": 1.771284230556284e-05, + "loss": 2.6479, + "step": 48397 + }, + { + "epoch": 2.2532765323463, + "grad_norm": 0.327013444709086, + "learning_rate": 1.771077408441915e-05, + "loss": 2.6219, + "step": 48398 + }, + { + "epoch": 2.253323090532393, + "grad_norm": 0.35324274992239674, + "learning_rate": 1.7708705958041185e-05, + "loss": 2.6035, + "step": 48399 + }, + { + "epoch": 2.2533696487184858, + "grad_norm": 0.33629868991575607, + "learning_rate": 1.7706637926434943e-05, + "loss": 2.5355, + "step": 48400 + }, + { + "epoch": 2.253416206904579, + "grad_norm": 0.31356927090344794, + "learning_rate": 1.7704569989606578e-05, + "loss": 2.5362, + "step": 48401 + }, + { + "epoch": 2.253462765090672, + "grad_norm": 0.34387949731042533, + "learning_rate": 1.77025021475621e-05, + "loss": 2.6753, + "step": 48402 + }, + { + "epoch": 2.253509323276765, + "grad_norm": 0.32454539022275686, + "learning_rate": 1.77004344003076e-05, + "loss": 2.5626, + "step": 48403 + }, + { + "epoch": 2.2535558814628582, + "grad_norm": 0.3361660690827304, + "learning_rate": 1.7698366747849153e-05, + "loss": 2.6444, + "step": 48404 + }, + { + "epoch": 2.2536024396489513, + "grad_norm": 0.333805360697406, + "learning_rate": 1.7696299190192787e-05, + "loss": 2.6132, + "step": 48405 + }, + { + "epoch": 2.2536489978350445, + "grad_norm": 0.33047989438983116, + "learning_rate": 1.7694231727344636e-05, + "loss": 2.593, + "step": 48406 + }, + { + "epoch": 2.2536955560211376, + "grad_norm": 0.3269550653096631, + "learning_rate": 1.7692164359310713e-05, + "loss": 2.6091, + "step": 48407 + }, + { + "epoch": 2.2537421142072303, + "grad_norm": 0.32210548652323007, + "learning_rate": 1.7690097086097108e-05, + "loss": 2.6969, + "step": 48408 + }, + { + "epoch": 2.2537886723933234, + "grad_norm": 0.3356833524899101, + "learning_rate": 1.768802990770988e-05, + "loss": 2.6731, + "step": 48409 + }, + { + "epoch": 2.2538352305794165, + "grad_norm": 0.3434061927170626, + "learning_rate": 1.7685962824155105e-05, + "loss": 2.6827, + "step": 48410 + }, + { + "epoch": 2.2538817887655096, + "grad_norm": 0.33413762996811125, + "learning_rate": 1.768389583543884e-05, + "loss": 2.6518, + "step": 48411 + }, + { + "epoch": 2.2539283469516027, + "grad_norm": 0.3219475924842295, + "learning_rate": 1.768182894156718e-05, + "loss": 2.595, + "step": 48412 + }, + { + "epoch": 2.253974905137696, + "grad_norm": 0.34759350701172537, + "learning_rate": 1.767976214254613e-05, + "loss": 2.6818, + "step": 48413 + }, + { + "epoch": 2.254021463323789, + "grad_norm": 0.3601744311775931, + "learning_rate": 1.767769543838183e-05, + "loss": 2.6282, + "step": 48414 + }, + { + "epoch": 2.254068021509882, + "grad_norm": 0.31252745962600986, + "learning_rate": 1.7675628829080295e-05, + "loss": 2.7069, + "step": 48415 + }, + { + "epoch": 2.254114579695975, + "grad_norm": 0.33311620410602794, + "learning_rate": 1.7673562314647602e-05, + "loss": 2.5826, + "step": 48416 + }, + { + "epoch": 2.2541611378820683, + "grad_norm": 0.3267078348610103, + "learning_rate": 1.7671495895089834e-05, + "loss": 2.5076, + "step": 48417 + }, + { + "epoch": 2.2542076960681614, + "grad_norm": 0.3577240371219841, + "learning_rate": 1.766942957041301e-05, + "loss": 2.6944, + "step": 48418 + }, + { + "epoch": 2.254254254254254, + "grad_norm": 0.32523502280995553, + "learning_rate": 1.766736334062326e-05, + "loss": 2.5631, + "step": 48419 + }, + { + "epoch": 2.254300812440347, + "grad_norm": 0.32006010994766704, + "learning_rate": 1.7665297205726593e-05, + "loss": 2.7112, + "step": 48420 
+ }, + { + "epoch": 2.2543473706264403, + "grad_norm": 0.328985981714521, + "learning_rate": 1.76632311657291e-05, + "loss": 2.6619, + "step": 48421 + }, + { + "epoch": 2.2543939288125334, + "grad_norm": 0.34097591283992096, + "learning_rate": 1.7661165220636833e-05, + "loss": 2.6759, + "step": 48422 + }, + { + "epoch": 2.2544404869986265, + "grad_norm": 0.33191480324003436, + "learning_rate": 1.7659099370455855e-05, + "loss": 2.544, + "step": 48423 + }, + { + "epoch": 2.2544870451847197, + "grad_norm": 0.3226639137756331, + "learning_rate": 1.765703361519224e-05, + "loss": 2.5949, + "step": 48424 + }, + { + "epoch": 2.2545336033708128, + "grad_norm": 0.3327332940463529, + "learning_rate": 1.7654967954852065e-05, + "loss": 2.6276, + "step": 48425 + }, + { + "epoch": 2.254580161556906, + "grad_norm": 0.32848777393471423, + "learning_rate": 1.7652902389441335e-05, + "loss": 2.7149, + "step": 48426 + }, + { + "epoch": 2.254626719742999, + "grad_norm": 0.3207370036448976, + "learning_rate": 1.765083691896618e-05, + "loss": 2.7311, + "step": 48427 + }, + { + "epoch": 2.2546732779290917, + "grad_norm": 0.34545815845918953, + "learning_rate": 1.764877154343262e-05, + "loss": 2.605, + "step": 48428 + }, + { + "epoch": 2.254719836115185, + "grad_norm": 0.3213515585017678, + "learning_rate": 1.7646706262846725e-05, + "loss": 2.6051, + "step": 48429 + }, + { + "epoch": 2.254766394301278, + "grad_norm": 0.31388834120625014, + "learning_rate": 1.764464107721458e-05, + "loss": 2.6485, + "step": 48430 + }, + { + "epoch": 2.254812952487371, + "grad_norm": 0.3292329309085705, + "learning_rate": 1.764257598654219e-05, + "loss": 2.5248, + "step": 48431 + }, + { + "epoch": 2.254859510673464, + "grad_norm": 0.32768342364251507, + "learning_rate": 1.7640510990835685e-05, + "loss": 2.6484, + "step": 48432 + }, + { + "epoch": 2.2549060688595572, + "grad_norm": 0.32499586043390855, + "learning_rate": 1.7638446090101056e-05, + "loss": 2.5397, + "step": 48433 + }, + { + "epoch": 2.2549526270456504, + "grad_norm": 0.345996166843926, + "learning_rate": 1.7636381284344438e-05, + "loss": 2.7305, + "step": 48434 + }, + { + "epoch": 2.2549991852317435, + "grad_norm": 0.3329981749715021, + "learning_rate": 1.7634316573571835e-05, + "loss": 2.5541, + "step": 48435 + }, + { + "epoch": 2.2550457434178366, + "grad_norm": 0.313094734316021, + "learning_rate": 1.7632251957789325e-05, + "loss": 2.6351, + "step": 48436 + }, + { + "epoch": 2.2550923016039297, + "grad_norm": 0.33333963692725194, + "learning_rate": 1.7630187437002967e-05, + "loss": 2.7473, + "step": 48437 + }, + { + "epoch": 2.255138859790023, + "grad_norm": 0.3320102248190193, + "learning_rate": 1.7628123011218834e-05, + "loss": 2.7017, + "step": 48438 + }, + { + "epoch": 2.2551854179761155, + "grad_norm": 0.30654070596992755, + "learning_rate": 1.7626058680442938e-05, + "loss": 2.7171, + "step": 48439 + }, + { + "epoch": 2.2552319761622086, + "grad_norm": 0.32214101625895725, + "learning_rate": 1.7623994444681407e-05, + "loss": 2.609, + "step": 48440 + }, + { + "epoch": 2.2552785343483017, + "grad_norm": 0.3096463400847555, + "learning_rate": 1.7621930303940243e-05, + "loss": 2.611, + "step": 48441 + }, + { + "epoch": 2.255325092534395, + "grad_norm": 0.31723737665094137, + "learning_rate": 1.761986625822552e-05, + "loss": 2.6789, + "step": 48442 + }, + { + "epoch": 2.255371650720488, + "grad_norm": 0.3126570694535348, + "learning_rate": 1.7617802307543323e-05, + "loss": 2.7472, + "step": 48443 + }, + { + "epoch": 2.255418208906581, + "grad_norm": 
0.34832656477856144, + "learning_rate": 1.761573845189965e-05, + "loss": 2.6817, + "step": 48444 + }, + { + "epoch": 2.255464767092674, + "grad_norm": 0.3443987915407261, + "learning_rate": 1.761367469130063e-05, + "loss": 2.6208, + "step": 48445 + }, + { + "epoch": 2.2555113252787673, + "grad_norm": 0.32664293161782465, + "learning_rate": 1.7611611025752244e-05, + "loss": 2.5835, + "step": 48446 + }, + { + "epoch": 2.25555788346486, + "grad_norm": 0.34314186600335866, + "learning_rate": 1.760954745526063e-05, + "loss": 2.7303, + "step": 48447 + }, + { + "epoch": 2.255604441650953, + "grad_norm": 0.3235676042020471, + "learning_rate": 1.7607483979831786e-05, + "loss": 2.6996, + "step": 48448 + }, + { + "epoch": 2.255650999837046, + "grad_norm": 0.3254234606982466, + "learning_rate": 1.7605420599471777e-05, + "loss": 2.6245, + "step": 48449 + }, + { + "epoch": 2.2556975580231393, + "grad_norm": 0.36376786874504996, + "learning_rate": 1.7603357314186676e-05, + "loss": 2.6314, + "step": 48450 + }, + { + "epoch": 2.2557441162092324, + "grad_norm": 0.31763155794462494, + "learning_rate": 1.7601294123982527e-05, + "loss": 2.6216, + "step": 48451 + }, + { + "epoch": 2.2557906743953255, + "grad_norm": 0.3100457697997176, + "learning_rate": 1.7599231028865386e-05, + "loss": 2.576, + "step": 48452 + }, + { + "epoch": 2.2558372325814187, + "grad_norm": 0.33450452140438236, + "learning_rate": 1.7597168028841327e-05, + "loss": 2.5751, + "step": 48453 + }, + { + "epoch": 2.2558837907675118, + "grad_norm": 0.3310623194172686, + "learning_rate": 1.759510512391637e-05, + "loss": 2.6855, + "step": 48454 + }, + { + "epoch": 2.255930348953605, + "grad_norm": 0.3419522145405879, + "learning_rate": 1.7593042314096592e-05, + "loss": 2.5918, + "step": 48455 + }, + { + "epoch": 2.255976907139698, + "grad_norm": 0.3190782638356795, + "learning_rate": 1.7590979599388056e-05, + "loss": 2.5971, + "step": 48456 + }, + { + "epoch": 2.256023465325791, + "grad_norm": 0.31373800744059777, + "learning_rate": 1.758891697979677e-05, + "loss": 2.629, + "step": 48457 + }, + { + "epoch": 2.256070023511884, + "grad_norm": 0.30874276606370404, + "learning_rate": 1.758685445532885e-05, + "loss": 2.5823, + "step": 48458 + }, + { + "epoch": 2.256116581697977, + "grad_norm": 0.2979098809651928, + "learning_rate": 1.7584792025990287e-05, + "loss": 2.5743, + "step": 48459 + }, + { + "epoch": 2.25616313988407, + "grad_norm": 0.3336876372538031, + "learning_rate": 1.7582729691787203e-05, + "loss": 2.6022, + "step": 48460 + }, + { + "epoch": 2.256209698070163, + "grad_norm": 0.3292993469232671, + "learning_rate": 1.758066745272559e-05, + "loss": 2.6754, + "step": 48461 + }, + { + "epoch": 2.2562562562562563, + "grad_norm": 0.33108662984631765, + "learning_rate": 1.757860530881153e-05, + "loss": 2.6754, + "step": 48462 + }, + { + "epoch": 2.2563028144423494, + "grad_norm": 0.32331511483193787, + "learning_rate": 1.7576543260051066e-05, + "loss": 2.6963, + "step": 48463 + }, + { + "epoch": 2.2563493726284425, + "grad_norm": 0.33643727126041084, + "learning_rate": 1.7574481306450253e-05, + "loss": 2.5354, + "step": 48464 + }, + { + "epoch": 2.2563959308145356, + "grad_norm": 0.3215020154521661, + "learning_rate": 1.757241944801515e-05, + "loss": 2.6032, + "step": 48465 + }, + { + "epoch": 2.2564424890006287, + "grad_norm": 0.32671276776920133, + "learning_rate": 1.757035768475181e-05, + "loss": 2.5824, + "step": 48466 + }, + { + "epoch": 2.2564890471867214, + "grad_norm": 0.32327588320525275, + "learning_rate": 1.756829601666626e-05, + 
"loss": 2.65, + "step": 48467 + }, + { + "epoch": 2.2565356053728145, + "grad_norm": 0.31969366645249575, + "learning_rate": 1.7566234443764567e-05, + "loss": 2.5753, + "step": 48468 + }, + { + "epoch": 2.2565821635589076, + "grad_norm": 0.3241784689983553, + "learning_rate": 1.7564172966052783e-05, + "loss": 2.6048, + "step": 48469 + }, + { + "epoch": 2.2566287217450007, + "grad_norm": 0.3228484461391799, + "learning_rate": 1.756211158353695e-05, + "loss": 2.6853, + "step": 48470 + }, + { + "epoch": 2.256675279931094, + "grad_norm": 0.34918408402409296, + "learning_rate": 1.7560050296223147e-05, + "loss": 2.6944, + "step": 48471 + }, + { + "epoch": 2.256721838117187, + "grad_norm": 0.3297196221915785, + "learning_rate": 1.755798910411736e-05, + "loss": 2.6124, + "step": 48472 + }, + { + "epoch": 2.25676839630328, + "grad_norm": 0.338676885394296, + "learning_rate": 1.755592800722572e-05, + "loss": 2.6256, + "step": 48473 + }, + { + "epoch": 2.256814954489373, + "grad_norm": 0.3245171490150375, + "learning_rate": 1.7553867005554213e-05, + "loss": 2.6068, + "step": 48474 + }, + { + "epoch": 2.2568615126754663, + "grad_norm": 0.3541852677062028, + "learning_rate": 1.7551806099108913e-05, + "loss": 2.661, + "step": 48475 + }, + { + "epoch": 2.2569080708615594, + "grad_norm": 0.33266934496854383, + "learning_rate": 1.7549745287895862e-05, + "loss": 2.5207, + "step": 48476 + }, + { + "epoch": 2.2569546290476525, + "grad_norm": 0.35970953615491974, + "learning_rate": 1.7547684571921113e-05, + "loss": 2.7165, + "step": 48477 + }, + { + "epoch": 2.257001187233745, + "grad_norm": 0.321608454413975, + "learning_rate": 1.7545623951190715e-05, + "loss": 2.6226, + "step": 48478 + }, + { + "epoch": 2.2570477454198383, + "grad_norm": 0.33532191258097693, + "learning_rate": 1.754356342571073e-05, + "loss": 2.7212, + "step": 48479 + }, + { + "epoch": 2.2570943036059314, + "grad_norm": 0.3456299347974103, + "learning_rate": 1.754150299548717e-05, + "loss": 2.5405, + "step": 48480 + }, + { + "epoch": 2.2571408617920246, + "grad_norm": 0.3174851856206381, + "learning_rate": 1.75394426605261e-05, + "loss": 2.5783, + "step": 48481 + }, + { + "epoch": 2.2571874199781177, + "grad_norm": 0.3405065445875843, + "learning_rate": 1.7537382420833575e-05, + "loss": 2.6719, + "step": 48482 + }, + { + "epoch": 2.257233978164211, + "grad_norm": 0.3239336580566782, + "learning_rate": 1.7535322276415628e-05, + "loss": 2.5415, + "step": 48483 + }, + { + "epoch": 2.257280536350304, + "grad_norm": 0.342938168207368, + "learning_rate": 1.753326222727833e-05, + "loss": 2.6259, + "step": 48484 + }, + { + "epoch": 2.257327094536397, + "grad_norm": 0.31089532941586634, + "learning_rate": 1.7531202273427678e-05, + "loss": 2.6673, + "step": 48485 + }, + { + "epoch": 2.2573736527224897, + "grad_norm": 0.332026600158078, + "learning_rate": 1.7529142414869774e-05, + "loss": 2.5817, + "step": 48486 + }, + { + "epoch": 2.257420210908583, + "grad_norm": 0.31184803304920533, + "learning_rate": 1.752708265161061e-05, + "loss": 2.5906, + "step": 48487 + }, + { + "epoch": 2.257466769094676, + "grad_norm": 0.3354156626362877, + "learning_rate": 1.7525022983656298e-05, + "loss": 2.7245, + "step": 48488 + }, + { + "epoch": 2.257513327280769, + "grad_norm": 0.31029897022254116, + "learning_rate": 1.752296341101282e-05, + "loss": 2.706, + "step": 48489 + }, + { + "epoch": 2.257559885466862, + "grad_norm": 0.3172965224639263, + "learning_rate": 1.7520903933686246e-05, + "loss": 2.6291, + "step": 48490 + }, + { + "epoch": 2.2576064436529553, + 
"grad_norm": 0.3402931834850167, + "learning_rate": 1.7518844551682624e-05, + "loss": 2.6817, + "step": 48491 + }, + { + "epoch": 2.2576530018390484, + "grad_norm": 0.30616007549436747, + "learning_rate": 1.7516785265008e-05, + "loss": 2.5085, + "step": 48492 + }, + { + "epoch": 2.2576995600251415, + "grad_norm": 0.30957610200018687, + "learning_rate": 1.75147260736684e-05, + "loss": 2.586, + "step": 48493 + }, + { + "epoch": 2.2577461182112346, + "grad_norm": 0.3033770522040709, + "learning_rate": 1.7512666977669883e-05, + "loss": 2.638, + "step": 48494 + }, + { + "epoch": 2.2577926763973277, + "grad_norm": 0.32444861952608783, + "learning_rate": 1.7510607977018485e-05, + "loss": 2.6903, + "step": 48495 + }, + { + "epoch": 2.257839234583421, + "grad_norm": 0.3099263143041983, + "learning_rate": 1.750854907172025e-05, + "loss": 2.6591, + "step": 48496 + }, + { + "epoch": 2.2578857927695135, + "grad_norm": 0.345944932561134, + "learning_rate": 1.750649026178124e-05, + "loss": 2.5943, + "step": 48497 + }, + { + "epoch": 2.2579323509556066, + "grad_norm": 0.31232041532012944, + "learning_rate": 1.750443154720745e-05, + "loss": 2.5941, + "step": 48498 + }, + { + "epoch": 2.2579789091416997, + "grad_norm": 0.33250892622582495, + "learning_rate": 1.7502372928004978e-05, + "loss": 2.6588, + "step": 48499 + }, + { + "epoch": 2.258025467327793, + "grad_norm": 0.3302221143969622, + "learning_rate": 1.7500314404179813e-05, + "loss": 2.5913, + "step": 48500 + }, + { + "epoch": 2.258072025513886, + "grad_norm": 0.321119827602257, + "learning_rate": 1.7498255975738054e-05, + "loss": 2.6738, + "step": 48501 + }, + { + "epoch": 2.258118583699979, + "grad_norm": 0.3425762528115343, + "learning_rate": 1.7496197642685695e-05, + "loss": 2.7121, + "step": 48502 + }, + { + "epoch": 2.258165141886072, + "grad_norm": 0.3467991641522403, + "learning_rate": 1.7494139405028796e-05, + "loss": 2.7097, + "step": 48503 + }, + { + "epoch": 2.2582117000721653, + "grad_norm": 0.3371370262235884, + "learning_rate": 1.7492081262773398e-05, + "loss": 2.7333, + "step": 48504 + }, + { + "epoch": 2.2582582582582584, + "grad_norm": 0.3199981274307577, + "learning_rate": 1.7490023215925534e-05, + "loss": 2.588, + "step": 48505 + }, + { + "epoch": 2.258304816444351, + "grad_norm": 0.36356822608492795, + "learning_rate": 1.7487965264491262e-05, + "loss": 2.6619, + "step": 48506 + }, + { + "epoch": 2.258351374630444, + "grad_norm": 0.373187654723449, + "learning_rate": 1.7485907408476598e-05, + "loss": 2.6079, + "step": 48507 + }, + { + "epoch": 2.2583979328165373, + "grad_norm": 0.3243735356179054, + "learning_rate": 1.7483849647887594e-05, + "loss": 2.6753, + "step": 48508 + }, + { + "epoch": 2.2584444910026304, + "grad_norm": 0.3254692535678309, + "learning_rate": 1.7481791982730283e-05, + "loss": 2.5762, + "step": 48509 + }, + { + "epoch": 2.2584910491887236, + "grad_norm": 0.3530260505037675, + "learning_rate": 1.747973441301073e-05, + "loss": 2.5811, + "step": 48510 + }, + { + "epoch": 2.2585376073748167, + "grad_norm": 0.33553518219442163, + "learning_rate": 1.7477676938734918e-05, + "loss": 2.6099, + "step": 48511 + }, + { + "epoch": 2.25858416556091, + "grad_norm": 0.3247325451772133, + "learning_rate": 1.7475619559908952e-05, + "loss": 2.6468, + "step": 48512 + }, + { + "epoch": 2.258630723747003, + "grad_norm": 0.3427425005491961, + "learning_rate": 1.7473562276538806e-05, + "loss": 2.639, + "step": 48513 + }, + { + "epoch": 2.258677281933096, + "grad_norm": 0.35391681450596524, + "learning_rate": 1.7471505088630584e-05, + 
"loss": 2.8201, + "step": 48514 + }, + { + "epoch": 2.258723840119189, + "grad_norm": 0.33170652250153476, + "learning_rate": 1.7469447996190273e-05, + "loss": 2.6262, + "step": 48515 + }, + { + "epoch": 2.2587703983052823, + "grad_norm": 0.33588356129982805, + "learning_rate": 1.7467390999223925e-05, + "loss": 2.6142, + "step": 48516 + }, + { + "epoch": 2.258816956491375, + "grad_norm": 0.31907772604226303, + "learning_rate": 1.7465334097737583e-05, + "loss": 2.5634, + "step": 48517 + }, + { + "epoch": 2.258863514677468, + "grad_norm": 0.34387188117983913, + "learning_rate": 1.746327729173728e-05, + "loss": 2.6116, + "step": 48518 + }, + { + "epoch": 2.258910072863561, + "grad_norm": 0.33927309654581056, + "learning_rate": 1.746122058122907e-05, + "loss": 2.6475, + "step": 48519 + }, + { + "epoch": 2.2589566310496543, + "grad_norm": 0.3242017637940052, + "learning_rate": 1.7459163966218956e-05, + "loss": 2.6166, + "step": 48520 + }, + { + "epoch": 2.2590031892357474, + "grad_norm": 0.3330929522037791, + "learning_rate": 1.7457107446712984e-05, + "loss": 2.6359, + "step": 48521 + }, + { + "epoch": 2.2590497474218405, + "grad_norm": 0.32550108529738686, + "learning_rate": 1.74550510227172e-05, + "loss": 2.5743, + "step": 48522 + }, + { + "epoch": 2.2590963056079336, + "grad_norm": 0.3461677387316004, + "learning_rate": 1.7452994694237633e-05, + "loss": 2.6252, + "step": 48523 + }, + { + "epoch": 2.2591428637940267, + "grad_norm": 0.340558154823924, + "learning_rate": 1.7450938461280326e-05, + "loss": 2.5988, + "step": 48524 + }, + { + "epoch": 2.2591894219801194, + "grad_norm": 0.33028609329792896, + "learning_rate": 1.7448882323851317e-05, + "loss": 2.6097, + "step": 48525 + }, + { + "epoch": 2.2592359801662125, + "grad_norm": 0.3284560954122865, + "learning_rate": 1.7446826281956602e-05, + "loss": 2.7031, + "step": 48526 + }, + { + "epoch": 2.2592825383523056, + "grad_norm": 0.3498210761521683, + "learning_rate": 1.7444770335602278e-05, + "loss": 2.6194, + "step": 48527 + }, + { + "epoch": 2.2593290965383988, + "grad_norm": 0.32720806763909566, + "learning_rate": 1.744271448479433e-05, + "loss": 2.662, + "step": 48528 + }, + { + "epoch": 2.259375654724492, + "grad_norm": 0.32714228873363765, + "learning_rate": 1.7440658729538807e-05, + "loss": 2.6286, + "step": 48529 + }, + { + "epoch": 2.259422212910585, + "grad_norm": 0.374080295709359, + "learning_rate": 1.743860306984174e-05, + "loss": 2.5575, + "step": 48530 + }, + { + "epoch": 2.259468771096678, + "grad_norm": 0.3036809166287605, + "learning_rate": 1.7436547505709173e-05, + "loss": 2.5896, + "step": 48531 + }, + { + "epoch": 2.259515329282771, + "grad_norm": 0.30431323230575646, + "learning_rate": 1.7434492037147137e-05, + "loss": 2.4948, + "step": 48532 + }, + { + "epoch": 2.2595618874688643, + "grad_norm": 0.36662594641275725, + "learning_rate": 1.743243666416165e-05, + "loss": 2.6411, + "step": 48533 + }, + { + "epoch": 2.2596084456549574, + "grad_norm": 0.3354410241416179, + "learning_rate": 1.7430381386758747e-05, + "loss": 2.5932, + "step": 48534 + }, + { + "epoch": 2.2596550038410506, + "grad_norm": 0.3277924456955461, + "learning_rate": 1.742832620494447e-05, + "loss": 2.5777, + "step": 48535 + }, + { + "epoch": 2.2597015620271432, + "grad_norm": 0.3688443684438066, + "learning_rate": 1.7426271118724848e-05, + "loss": 2.6812, + "step": 48536 + }, + { + "epoch": 2.2597481202132363, + "grad_norm": 0.3495314518855044, + "learning_rate": 1.742421612810591e-05, + "loss": 2.6629, + "step": 48537 + }, + { + "epoch": 
2.2597946783993295, + "grad_norm": 0.35593629104602786, + "learning_rate": 1.7422161233093703e-05, + "loss": 2.6728, + "step": 48538 + }, + { + "epoch": 2.2598412365854226, + "grad_norm": 0.33675473685848367, + "learning_rate": 1.742010643369421e-05, + "loss": 2.6733, + "step": 48539 + }, + { + "epoch": 2.2598877947715157, + "grad_norm": 0.3561424584510801, + "learning_rate": 1.741805172991353e-05, + "loss": 2.6559, + "step": 48540 + }, + { + "epoch": 2.259934352957609, + "grad_norm": 0.32622699439917846, + "learning_rate": 1.7415997121757645e-05, + "loss": 2.6287, + "step": 48541 + }, + { + "epoch": 2.259980911143702, + "grad_norm": 0.3332610591529751, + "learning_rate": 1.7413942609232592e-05, + "loss": 2.6493, + "step": 48542 + }, + { + "epoch": 2.260027469329795, + "grad_norm": 0.3629562922593649, + "learning_rate": 1.741188819234441e-05, + "loss": 2.6435, + "step": 48543 + }, + { + "epoch": 2.260074027515888, + "grad_norm": 0.35690410006731804, + "learning_rate": 1.7409833871099123e-05, + "loss": 2.6335, + "step": 48544 + }, + { + "epoch": 2.260120585701981, + "grad_norm": 0.33046497737190195, + "learning_rate": 1.7407779645502788e-05, + "loss": 2.5686, + "step": 48545 + }, + { + "epoch": 2.260167143888074, + "grad_norm": 0.33992177994988837, + "learning_rate": 1.7405725515561366e-05, + "loss": 2.6065, + "step": 48546 + }, + { + "epoch": 2.260213702074167, + "grad_norm": 0.3737362683324886, + "learning_rate": 1.7403671481280964e-05, + "loss": 2.694, + "step": 48547 + }, + { + "epoch": 2.26026026026026, + "grad_norm": 0.3153206032160162, + "learning_rate": 1.7401617542667564e-05, + "loss": 2.6697, + "step": 48548 + }, + { + "epoch": 2.2603068184463533, + "grad_norm": 0.34975203548781136, + "learning_rate": 1.73995636997272e-05, + "loss": 2.6884, + "step": 48549 + }, + { + "epoch": 2.2603533766324464, + "grad_norm": 0.3112828111965756, + "learning_rate": 1.739750995246591e-05, + "loss": 2.5449, + "step": 48550 + }, + { + "epoch": 2.2603999348185395, + "grad_norm": 0.3330819216786589, + "learning_rate": 1.7395456300889724e-05, + "loss": 2.6906, + "step": 48551 + }, + { + "epoch": 2.2604464930046326, + "grad_norm": 0.33793797208798687, + "learning_rate": 1.7393402745004634e-05, + "loss": 2.6707, + "step": 48552 + }, + { + "epoch": 2.2604930511907257, + "grad_norm": 0.32786408677289325, + "learning_rate": 1.7391349284816727e-05, + "loss": 2.6786, + "step": 48553 + }, + { + "epoch": 2.260539609376819, + "grad_norm": 0.32740384125932553, + "learning_rate": 1.7389295920331976e-05, + "loss": 2.6164, + "step": 48554 + }, + { + "epoch": 2.260586167562912, + "grad_norm": 0.35384972877951726, + "learning_rate": 1.7387242651556435e-05, + "loss": 2.7606, + "step": 48555 + }, + { + "epoch": 2.2606327257490046, + "grad_norm": 0.3581685953900202, + "learning_rate": 1.7385189478496123e-05, + "loss": 2.6711, + "step": 48556 + }, + { + "epoch": 2.2606792839350978, + "grad_norm": 0.33089612205228436, + "learning_rate": 1.738313640115707e-05, + "loss": 2.6675, + "step": 48557 + }, + { + "epoch": 2.260725842121191, + "grad_norm": 0.3266047826207086, + "learning_rate": 1.7381083419545308e-05, + "loss": 2.6271, + "step": 48558 + }, + { + "epoch": 2.260772400307284, + "grad_norm": 0.3496813623872091, + "learning_rate": 1.7379030533666823e-05, + "loss": 2.6225, + "step": 48559 + }, + { + "epoch": 2.260818958493377, + "grad_norm": 0.3776192092930128, + "learning_rate": 1.7376977743527707e-05, + "loss": 2.679, + "step": 48560 + }, + { + "epoch": 2.26086551667947, + "grad_norm": 0.33823645385971385, + 
"learning_rate": 1.7374925049133922e-05, + "loss": 2.5781, + "step": 48561 + }, + { + "epoch": 2.2609120748655633, + "grad_norm": 0.34376962475653483, + "learning_rate": 1.7372872450491523e-05, + "loss": 2.6646, + "step": 48562 + }, + { + "epoch": 2.2609586330516565, + "grad_norm": 0.3407520838612022, + "learning_rate": 1.737081994760653e-05, + "loss": 2.6295, + "step": 48563 + }, + { + "epoch": 2.2610051912377496, + "grad_norm": 0.3567155628416019, + "learning_rate": 1.7368767540484966e-05, + "loss": 2.8118, + "step": 48564 + }, + { + "epoch": 2.2610517494238422, + "grad_norm": 0.3370684563855808, + "learning_rate": 1.7366715229132853e-05, + "loss": 2.5952, + "step": 48565 + }, + { + "epoch": 2.2610983076099354, + "grad_norm": 0.32674271253813186, + "learning_rate": 1.7364663013556232e-05, + "loss": 2.6357, + "step": 48566 + }, + { + "epoch": 2.2611448657960285, + "grad_norm": 0.32589836766633473, + "learning_rate": 1.7362610893761088e-05, + "loss": 2.5981, + "step": 48567 + }, + { + "epoch": 2.2611914239821216, + "grad_norm": 0.3254809572318501, + "learning_rate": 1.7360558869753474e-05, + "loss": 2.6713, + "step": 48568 + }, + { + "epoch": 2.2612379821682147, + "grad_norm": 0.31403415401948276, + "learning_rate": 1.7358506941539405e-05, + "loss": 2.618, + "step": 48569 + }, + { + "epoch": 2.261284540354308, + "grad_norm": 0.3279243193020112, + "learning_rate": 1.7356455109124897e-05, + "loss": 2.6481, + "step": 48570 + }, + { + "epoch": 2.261331098540401, + "grad_norm": 0.31839821910081617, + "learning_rate": 1.7354403372516e-05, + "loss": 2.498, + "step": 48571 + }, + { + "epoch": 2.261377656726494, + "grad_norm": 0.3337202627320636, + "learning_rate": 1.735235173171867e-05, + "loss": 2.6564, + "step": 48572 + }, + { + "epoch": 2.261424214912587, + "grad_norm": 0.31364847849620237, + "learning_rate": 1.735030018673901e-05, + "loss": 2.612, + "step": 48573 + }, + { + "epoch": 2.2614707730986803, + "grad_norm": 0.3339598454992316, + "learning_rate": 1.734824873758299e-05, + "loss": 2.6249, + "step": 48574 + }, + { + "epoch": 2.2615173312847734, + "grad_norm": 0.32169423287683235, + "learning_rate": 1.7346197384256634e-05, + "loss": 2.554, + "step": 48575 + }, + { + "epoch": 2.261563889470866, + "grad_norm": 0.3431207007775045, + "learning_rate": 1.734414612676598e-05, + "loss": 2.64, + "step": 48576 + }, + { + "epoch": 2.261610447656959, + "grad_norm": 0.3184300759269518, + "learning_rate": 1.734209496511704e-05, + "loss": 2.5505, + "step": 48577 + }, + { + "epoch": 2.2616570058430523, + "grad_norm": 0.3350828577178345, + "learning_rate": 1.7340043899315827e-05, + "loss": 2.626, + "step": 48578 + }, + { + "epoch": 2.2617035640291454, + "grad_norm": 0.3397257142177882, + "learning_rate": 1.7337992929368392e-05, + "loss": 2.7192, + "step": 48579 + }, + { + "epoch": 2.2617501222152385, + "grad_norm": 0.3331835836086746, + "learning_rate": 1.7335942055280707e-05, + "loss": 2.6446, + "step": 48580 + }, + { + "epoch": 2.2617966804013316, + "grad_norm": 0.3467674216294025, + "learning_rate": 1.7333891277058818e-05, + "loss": 2.6983, + "step": 48581 + }, + { + "epoch": 2.2618432385874248, + "grad_norm": 0.331394770937486, + "learning_rate": 1.7331840594708736e-05, + "loss": 2.6161, + "step": 48582 + }, + { + "epoch": 2.261889796773518, + "grad_norm": 0.3172396699521421, + "learning_rate": 1.7329790008236485e-05, + "loss": 2.6166, + "step": 48583 + }, + { + "epoch": 2.2619363549596105, + "grad_norm": 0.3153519109136671, + "learning_rate": 1.73277395176481e-05, + "loss": 2.5759, + "step": 48584 + 
}, + { + "epoch": 2.2619829131457037, + "grad_norm": 0.3317703098680634, + "learning_rate": 1.7325689122949552e-05, + "loss": 2.5729, + "step": 48585 + }, + { + "epoch": 2.2620294713317968, + "grad_norm": 0.3408793974953303, + "learning_rate": 1.7323638824146915e-05, + "loss": 2.7822, + "step": 48586 + }, + { + "epoch": 2.26207602951789, + "grad_norm": 0.29938115678454197, + "learning_rate": 1.732158862124616e-05, + "loss": 2.7047, + "step": 48587 + }, + { + "epoch": 2.262122587703983, + "grad_norm": 0.30165073224853384, + "learning_rate": 1.7319538514253326e-05, + "loss": 2.6643, + "step": 48588 + }, + { + "epoch": 2.262169145890076, + "grad_norm": 0.3210599909435059, + "learning_rate": 1.731748850317443e-05, + "loss": 2.6199, + "step": 48589 + }, + { + "epoch": 2.2622157040761692, + "grad_norm": 0.32752314610934563, + "learning_rate": 1.731543858801548e-05, + "loss": 2.5828, + "step": 48590 + }, + { + "epoch": 2.2622622622622623, + "grad_norm": 0.31234462204949853, + "learning_rate": 1.7313388768782502e-05, + "loss": 2.6095, + "step": 48591 + }, + { + "epoch": 2.2623088204483555, + "grad_norm": 0.3411958339900599, + "learning_rate": 1.7311339045481524e-05, + "loss": 2.6501, + "step": 48592 + }, + { + "epoch": 2.2623553786344486, + "grad_norm": 0.31075785548620183, + "learning_rate": 1.7309289418118523e-05, + "loss": 2.5751, + "step": 48593 + }, + { + "epoch": 2.2624019368205417, + "grad_norm": 0.32368658661348076, + "learning_rate": 1.7307239886699543e-05, + "loss": 2.6243, + "step": 48594 + }, + { + "epoch": 2.2624484950066344, + "grad_norm": 0.3600141641300525, + "learning_rate": 1.730519045123059e-05, + "loss": 2.6824, + "step": 48595 + }, + { + "epoch": 2.2624950531927275, + "grad_norm": 0.32708408735743, + "learning_rate": 1.7303141111717685e-05, + "loss": 2.5419, + "step": 48596 + }, + { + "epoch": 2.2625416113788206, + "grad_norm": 0.3178013209747909, + "learning_rate": 1.7301091868166858e-05, + "loss": 2.606, + "step": 48597 + }, + { + "epoch": 2.2625881695649137, + "grad_norm": 0.32832769690742, + "learning_rate": 1.7299042720584067e-05, + "loss": 2.6964, + "step": 48598 + }, + { + "epoch": 2.262634727751007, + "grad_norm": 0.32601056125069716, + "learning_rate": 1.72969936689754e-05, + "loss": 2.5801, + "step": 48599 + }, + { + "epoch": 2.2626812859371, + "grad_norm": 0.32191412777095335, + "learning_rate": 1.72949447133468e-05, + "loss": 2.6275, + "step": 48600 + }, + { + "epoch": 2.262727844123193, + "grad_norm": 0.3144022597345195, + "learning_rate": 1.729289585370435e-05, + "loss": 2.5383, + "step": 48601 + }, + { + "epoch": 2.262774402309286, + "grad_norm": 0.3381783684355955, + "learning_rate": 1.7290847090054014e-05, + "loss": 2.6757, + "step": 48602 + }, + { + "epoch": 2.2628209604953793, + "grad_norm": 0.32668656683467706, + "learning_rate": 1.7288798422401813e-05, + "loss": 2.6491, + "step": 48603 + }, + { + "epoch": 2.262867518681472, + "grad_norm": 0.31372049303571825, + "learning_rate": 1.728674985075377e-05, + "loss": 2.6144, + "step": 48604 + }, + { + "epoch": 2.262914076867565, + "grad_norm": 0.3432710214444534, + "learning_rate": 1.7284701375115907e-05, + "loss": 2.7098, + "step": 48605 + }, + { + "epoch": 2.262960635053658, + "grad_norm": 0.34133152997531274, + "learning_rate": 1.728265299549421e-05, + "loss": 2.6694, + "step": 48606 + }, + { + "epoch": 2.2630071932397513, + "grad_norm": 0.3330417089211404, + "learning_rate": 1.7280604711894705e-05, + "loss": 2.5927, + "step": 48607 + }, + { + "epoch": 2.2630537514258444, + "grad_norm": 0.32991400495872236, + 
"learning_rate": 1.7278556524323397e-05, + "loss": 2.5227, + "step": 48608 + }, + { + "epoch": 2.2631003096119375, + "grad_norm": 0.32594386249312507, + "learning_rate": 1.7276508432786302e-05, + "loss": 2.6596, + "step": 48609 + }, + { + "epoch": 2.2631468677980306, + "grad_norm": 0.34149094147113424, + "learning_rate": 1.7274460437289445e-05, + "loss": 2.6597, + "step": 48610 + }, + { + "epoch": 2.2631934259841238, + "grad_norm": 0.32643973406871946, + "learning_rate": 1.72724125378388e-05, + "loss": 2.568, + "step": 48611 + }, + { + "epoch": 2.263239984170217, + "grad_norm": 0.3267160744035864, + "learning_rate": 1.7270364734440427e-05, + "loss": 2.5981, + "step": 48612 + }, + { + "epoch": 2.26328654235631, + "grad_norm": 0.3246673514483243, + "learning_rate": 1.7268317027100272e-05, + "loss": 2.675, + "step": 48613 + }, + { + "epoch": 2.263333100542403, + "grad_norm": 0.32619030703618324, + "learning_rate": 1.726626941582442e-05, + "loss": 2.7012, + "step": 48614 + }, + { + "epoch": 2.263379658728496, + "grad_norm": 0.3363170438339007, + "learning_rate": 1.726422190061882e-05, + "loss": 2.6461, + "step": 48615 + }, + { + "epoch": 2.263426216914589, + "grad_norm": 0.364322488322425, + "learning_rate": 1.7262174481489514e-05, + "loss": 2.7398, + "step": 48616 + }, + { + "epoch": 2.263472775100682, + "grad_norm": 0.3372853554434858, + "learning_rate": 1.726012715844249e-05, + "loss": 2.6103, + "step": 48617 + }, + { + "epoch": 2.263519333286775, + "grad_norm": 0.33711537732899305, + "learning_rate": 1.725807993148377e-05, + "loss": 2.7055, + "step": 48618 + }, + { + "epoch": 2.2635658914728682, + "grad_norm": 0.347583772602967, + "learning_rate": 1.725603280061938e-05, + "loss": 2.6606, + "step": 48619 + }, + { + "epoch": 2.2636124496589614, + "grad_norm": 0.34171040668547514, + "learning_rate": 1.7253985765855295e-05, + "loss": 2.7057, + "step": 48620 + }, + { + "epoch": 2.2636590078450545, + "grad_norm": 0.3400442853574243, + "learning_rate": 1.7251938827197527e-05, + "loss": 2.5854, + "step": 48621 + }, + { + "epoch": 2.2637055660311476, + "grad_norm": 0.33613861378570553, + "learning_rate": 1.72498919846521e-05, + "loss": 2.516, + "step": 48622 + }, + { + "epoch": 2.2637521242172403, + "grad_norm": 0.3426007955909677, + "learning_rate": 1.7247845238225026e-05, + "loss": 2.5963, + "step": 48623 + }, + { + "epoch": 2.2637986824033334, + "grad_norm": 0.33780904373049847, + "learning_rate": 1.7245798587922263e-05, + "loss": 2.7382, + "step": 48624 + }, + { + "epoch": 2.2638452405894265, + "grad_norm": 0.36564281613843597, + "learning_rate": 1.7243752033749893e-05, + "loss": 2.6464, + "step": 48625 + }, + { + "epoch": 2.2638917987755196, + "grad_norm": 0.3426105354965089, + "learning_rate": 1.7241705575713852e-05, + "loss": 2.6103, + "step": 48626 + }, + { + "epoch": 2.2639383569616127, + "grad_norm": 0.3302066114918767, + "learning_rate": 1.7239659213820204e-05, + "loss": 2.6775, + "step": 48627 + }, + { + "epoch": 2.263984915147706, + "grad_norm": 0.3500111968564776, + "learning_rate": 1.7237612948074917e-05, + "loss": 2.6095, + "step": 48628 + }, + { + "epoch": 2.264031473333799, + "grad_norm": 0.3278435239237071, + "learning_rate": 1.723556677848401e-05, + "loss": 2.6211, + "step": 48629 + }, + { + "epoch": 2.264078031519892, + "grad_norm": 0.33036612962363787, + "learning_rate": 1.723352070505348e-05, + "loss": 2.6309, + "step": 48630 + }, + { + "epoch": 2.264124589705985, + "grad_norm": 0.3363557000561513, + "learning_rate": 1.7231474727789345e-05, + "loss": 2.5152, + "step": 48631 + 
}, + { + "epoch": 2.2641711478920783, + "grad_norm": 0.36195683598822, + "learning_rate": 1.7229428846697615e-05, + "loss": 2.6908, + "step": 48632 + }, + { + "epoch": 2.2642177060781714, + "grad_norm": 0.32497486492885785, + "learning_rate": 1.7227383061784268e-05, + "loss": 2.5436, + "step": 48633 + }, + { + "epoch": 2.264264264264264, + "grad_norm": 0.3081417941944369, + "learning_rate": 1.7225337373055327e-05, + "loss": 2.7201, + "step": 48634 + }, + { + "epoch": 2.264310822450357, + "grad_norm": 0.3201261650490457, + "learning_rate": 1.722329178051679e-05, + "loss": 2.6572, + "step": 48635 + }, + { + "epoch": 2.2643573806364503, + "grad_norm": 0.3454784804161858, + "learning_rate": 1.7221246284174657e-05, + "loss": 2.6051, + "step": 48636 + }, + { + "epoch": 2.2644039388225434, + "grad_norm": 0.3450587734063593, + "learning_rate": 1.7219200884034947e-05, + "loss": 2.6262, + "step": 48637 + }, + { + "epoch": 2.2644504970086365, + "grad_norm": 0.3142589342754226, + "learning_rate": 1.7217155580103662e-05, + "loss": 2.6414, + "step": 48638 + }, + { + "epoch": 2.2644970551947297, + "grad_norm": 0.31451071975484207, + "learning_rate": 1.7215110372386768e-05, + "loss": 2.6669, + "step": 48639 + }, + { + "epoch": 2.2645436133808228, + "grad_norm": 0.33390721035482496, + "learning_rate": 1.7213065260890322e-05, + "loss": 2.6976, + "step": 48640 + }, + { + "epoch": 2.264590171566916, + "grad_norm": 0.3166880449099082, + "learning_rate": 1.7211020245620286e-05, + "loss": 2.6623, + "step": 48641 + }, + { + "epoch": 2.264636729753009, + "grad_norm": 0.3268054449156736, + "learning_rate": 1.7208975326582676e-05, + "loss": 2.6246, + "step": 48642 + }, + { + "epoch": 2.2646832879391017, + "grad_norm": 0.327618893923808, + "learning_rate": 1.7206930503783493e-05, + "loss": 2.637, + "step": 48643 + }, + { + "epoch": 2.264729846125195, + "grad_norm": 0.3318259195383322, + "learning_rate": 1.7204885777228736e-05, + "loss": 2.5673, + "step": 48644 + }, + { + "epoch": 2.264776404311288, + "grad_norm": 0.3342226912199783, + "learning_rate": 1.7202841146924427e-05, + "loss": 2.6879, + "step": 48645 + }, + { + "epoch": 2.264822962497381, + "grad_norm": 0.32688135177734445, + "learning_rate": 1.720079661287653e-05, + "loss": 2.6504, + "step": 48646 + }, + { + "epoch": 2.264869520683474, + "grad_norm": 0.32716513834430244, + "learning_rate": 1.7198752175091066e-05, + "loss": 2.6202, + "step": 48647 + }, + { + "epoch": 2.2649160788695673, + "grad_norm": 0.3329735321609526, + "learning_rate": 1.7196707833574034e-05, + "loss": 2.5563, + "step": 48648 + }, + { + "epoch": 2.2649626370556604, + "grad_norm": 0.33260187810430897, + "learning_rate": 1.7194663588331428e-05, + "loss": 2.6836, + "step": 48649 + }, + { + "epoch": 2.2650091952417535, + "grad_norm": 0.32658432698022644, + "learning_rate": 1.7192619439369255e-05, + "loss": 2.519, + "step": 48650 + }, + { + "epoch": 2.2650557534278466, + "grad_norm": 0.3253595847343038, + "learning_rate": 1.719057538669353e-05, + "loss": 2.6327, + "step": 48651 + }, + { + "epoch": 2.2651023116139397, + "grad_norm": 0.3653351212660097, + "learning_rate": 1.7188531430310194e-05, + "loss": 2.7742, + "step": 48652 + }, + { + "epoch": 2.265148869800033, + "grad_norm": 0.31777557784537286, + "learning_rate": 1.7186487570225328e-05, + "loss": 2.48, + "step": 48653 + }, + { + "epoch": 2.2651954279861255, + "grad_norm": 0.3111090810156759, + "learning_rate": 1.7184443806444848e-05, + "loss": 2.6112, + "step": 48654 + }, + { + "epoch": 2.2652419861722186, + "grad_norm": 
0.3521804802391168, + "learning_rate": 1.718240013897483e-05, + "loss": 2.6049, + "step": 48655 + }, + { + "epoch": 2.2652885443583117, + "grad_norm": 0.33190074795648317, + "learning_rate": 1.718035656782122e-05, + "loss": 2.6502, + "step": 48656 + }, + { + "epoch": 2.265335102544405, + "grad_norm": 0.31739696131071704, + "learning_rate": 1.7178313092990028e-05, + "loss": 2.5588, + "step": 48657 + }, + { + "epoch": 2.265381660730498, + "grad_norm": 0.322891846847023, + "learning_rate": 1.717626971448727e-05, + "loss": 2.5345, + "step": 48658 + }, + { + "epoch": 2.265428218916591, + "grad_norm": 0.3480325213862616, + "learning_rate": 1.7174226432318908e-05, + "loss": 2.6297, + "step": 48659 + }, + { + "epoch": 2.265474777102684, + "grad_norm": 0.3401242028439665, + "learning_rate": 1.717218324649096e-05, + "loss": 2.6512, + "step": 48660 + }, + { + "epoch": 2.2655213352887773, + "grad_norm": 0.3485239469728989, + "learning_rate": 1.7170140157009418e-05, + "loss": 2.6924, + "step": 48661 + }, + { + "epoch": 2.26556789347487, + "grad_norm": 0.33951065775420464, + "learning_rate": 1.716809716388028e-05, + "loss": 2.6309, + "step": 48662 + }, + { + "epoch": 2.265614451660963, + "grad_norm": 0.35009934495679756, + "learning_rate": 1.7166054267109545e-05, + "loss": 2.7554, + "step": 48663 + }, + { + "epoch": 2.265661009847056, + "grad_norm": 0.32664566409241785, + "learning_rate": 1.716401146670322e-05, + "loss": 2.657, + "step": 48664 + }, + { + "epoch": 2.2657075680331493, + "grad_norm": 0.34541518918432607, + "learning_rate": 1.7161968762667258e-05, + "loss": 2.6305, + "step": 48665 + }, + { + "epoch": 2.2657541262192424, + "grad_norm": 0.3681566035275505, + "learning_rate": 1.715992615500771e-05, + "loss": 2.7641, + "step": 48666 + }, + { + "epoch": 2.2658006844053356, + "grad_norm": 0.3322072353351828, + "learning_rate": 1.7157883643730516e-05, + "loss": 2.5834, + "step": 48667 + }, + { + "epoch": 2.2658472425914287, + "grad_norm": 0.36893510462606816, + "learning_rate": 1.7155841228841724e-05, + "loss": 2.6907, + "step": 48668 + }, + { + "epoch": 2.265893800777522, + "grad_norm": 0.36284438711715317, + "learning_rate": 1.715379891034729e-05, + "loss": 2.6766, + "step": 48669 + }, + { + "epoch": 2.265940358963615, + "grad_norm": 0.34944329220131415, + "learning_rate": 1.7151756688253223e-05, + "loss": 2.5785, + "step": 48670 + }, + { + "epoch": 2.265986917149708, + "grad_norm": 0.3429365841508864, + "learning_rate": 1.714971456256553e-05, + "loss": 2.7031, + "step": 48671 + }, + { + "epoch": 2.266033475335801, + "grad_norm": 0.32999004148021766, + "learning_rate": 1.714767253329015e-05, + "loss": 2.4815, + "step": 48672 + }, + { + "epoch": 2.266080033521894, + "grad_norm": 0.35602283743126156, + "learning_rate": 1.7145630600433148e-05, + "loss": 2.6536, + "step": 48673 + }, + { + "epoch": 2.266126591707987, + "grad_norm": 0.35955580354930666, + "learning_rate": 1.7143588764000473e-05, + "loss": 2.5875, + "step": 48674 + }, + { + "epoch": 2.26617314989408, + "grad_norm": 0.34746692245850525, + "learning_rate": 1.714154702399812e-05, + "loss": 2.6556, + "step": 48675 + }, + { + "epoch": 2.266219708080173, + "grad_norm": 0.3056508749537203, + "learning_rate": 1.7139505380432096e-05, + "loss": 2.6094, + "step": 48676 + }, + { + "epoch": 2.2662662662662663, + "grad_norm": 0.34607462767159053, + "learning_rate": 1.7137463833308403e-05, + "loss": 2.7307, + "step": 48677 + }, + { + "epoch": 2.2663128244523594, + "grad_norm": 0.33386467669871256, + "learning_rate": 1.713542238263298e-05, + "loss": 
2.636, + "step": 48678 + }, + { + "epoch": 2.2663593826384525, + "grad_norm": 0.3214898197081618, + "learning_rate": 1.7133381028411886e-05, + "loss": 2.6498, + "step": 48679 + }, + { + "epoch": 2.2664059408245456, + "grad_norm": 0.32771727866502537, + "learning_rate": 1.7131339770651056e-05, + "loss": 2.6513, + "step": 48680 + }, + { + "epoch": 2.2664524990106387, + "grad_norm": 0.32732406648234685, + "learning_rate": 1.7129298609356532e-05, + "loss": 2.6602, + "step": 48681 + }, + { + "epoch": 2.2664990571967314, + "grad_norm": 0.33264054451020886, + "learning_rate": 1.7127257544534263e-05, + "loss": 2.6708, + "step": 48682 + }, + { + "epoch": 2.2665456153828245, + "grad_norm": 0.3299365847692301, + "learning_rate": 1.7125216576190256e-05, + "loss": 2.6163, + "step": 48683 + }, + { + "epoch": 2.2665921735689176, + "grad_norm": 0.32985162014014124, + "learning_rate": 1.7123175704330512e-05, + "loss": 2.7609, + "step": 48684 + }, + { + "epoch": 2.2666387317550107, + "grad_norm": 0.3264864383057523, + "learning_rate": 1.712113492896098e-05, + "loss": 2.6524, + "step": 48685 + }, + { + "epoch": 2.266685289941104, + "grad_norm": 0.32552864468166504, + "learning_rate": 1.7119094250087708e-05, + "loss": 2.5241, + "step": 48686 + }, + { + "epoch": 2.266731848127197, + "grad_norm": 0.3478675634136948, + "learning_rate": 1.7117053667716642e-05, + "loss": 2.6972, + "step": 48687 + }, + { + "epoch": 2.26677840631329, + "grad_norm": 0.31927762850855274, + "learning_rate": 1.711501318185378e-05, + "loss": 2.6499, + "step": 48688 + }, + { + "epoch": 2.266824964499383, + "grad_norm": 0.33786383603667164, + "learning_rate": 1.7112972792505117e-05, + "loss": 2.6633, + "step": 48689 + }, + { + "epoch": 2.2668715226854763, + "grad_norm": 0.3325152654636554, + "learning_rate": 1.7110932499676635e-05, + "loss": 2.7542, + "step": 48690 + }, + { + "epoch": 2.2669180808715694, + "grad_norm": 0.3115060702747089, + "learning_rate": 1.710889230337433e-05, + "loss": 2.6365, + "step": 48691 + }, + { + "epoch": 2.2669646390576625, + "grad_norm": 0.32924405187774924, + "learning_rate": 1.7106852203604202e-05, + "loss": 2.629, + "step": 48692 + }, + { + "epoch": 2.267011197243755, + "grad_norm": 0.31537565891821084, + "learning_rate": 1.7104812200372188e-05, + "loss": 2.6569, + "step": 48693 + }, + { + "epoch": 2.2670577554298483, + "grad_norm": 0.3338649883691983, + "learning_rate": 1.7102772293684343e-05, + "loss": 2.6277, + "step": 48694 + }, + { + "epoch": 2.2671043136159414, + "grad_norm": 0.3270060579035138, + "learning_rate": 1.7100732483546605e-05, + "loss": 2.635, + "step": 48695 + }, + { + "epoch": 2.2671508718020346, + "grad_norm": 0.34306885454890074, + "learning_rate": 1.7098692769964974e-05, + "loss": 2.6861, + "step": 48696 + }, + { + "epoch": 2.2671974299881277, + "grad_norm": 0.3149151107522773, + "learning_rate": 1.7096653152945456e-05, + "loss": 2.5431, + "step": 48697 + }, + { + "epoch": 2.267243988174221, + "grad_norm": 0.32625214125021407, + "learning_rate": 1.709461363249399e-05, + "loss": 2.6421, + "step": 48698 + }, + { + "epoch": 2.267290546360314, + "grad_norm": 0.3439168385568482, + "learning_rate": 1.7092574208616617e-05, + "loss": 2.6195, + "step": 48699 + }, + { + "epoch": 2.267337104546407, + "grad_norm": 0.3189831641766033, + "learning_rate": 1.7090534881319287e-05, + "loss": 2.6518, + "step": 48700 + }, + { + "epoch": 2.2673836627324997, + "grad_norm": 0.3573727857441896, + "learning_rate": 1.7088495650607993e-05, + "loss": 2.7024, + "step": 48701 + }, + { + "epoch": 2.267430220918593, 
+ "grad_norm": 0.32825356834703373, + "learning_rate": 1.708645651648872e-05, + "loss": 2.6664, + "step": 48702 + }, + { + "epoch": 2.267476779104686, + "grad_norm": 0.31584685390159967, + "learning_rate": 1.7084417478967457e-05, + "loss": 2.689, + "step": 48703 + }, + { + "epoch": 2.267523337290779, + "grad_norm": 0.32770326955217294, + "learning_rate": 1.7082378538050182e-05, + "loss": 2.5936, + "step": 48704 + }, + { + "epoch": 2.267569895476872, + "grad_norm": 0.3140609293393919, + "learning_rate": 1.7080339693742904e-05, + "loss": 2.6198, + "step": 48705 + }, + { + "epoch": 2.2676164536629653, + "grad_norm": 0.30658796140673444, + "learning_rate": 1.707830094605155e-05, + "loss": 2.6294, + "step": 48706 + }, + { + "epoch": 2.2676630118490584, + "grad_norm": 0.3293827319524929, + "learning_rate": 1.7076262294982175e-05, + "loss": 2.4836, + "step": 48707 + }, + { + "epoch": 2.2677095700351515, + "grad_norm": 0.33177218205339476, + "learning_rate": 1.7074223740540705e-05, + "loss": 2.6868, + "step": 48708 + }, + { + "epoch": 2.2677561282212446, + "grad_norm": 0.3019065686669413, + "learning_rate": 1.7072185282733148e-05, + "loss": 2.6543, + "step": 48709 + }, + { + "epoch": 2.2678026864073377, + "grad_norm": 0.30361540066740006, + "learning_rate": 1.7070146921565504e-05, + "loss": 2.585, + "step": 48710 + }, + { + "epoch": 2.267849244593431, + "grad_norm": 0.33343182712010816, + "learning_rate": 1.7068108657043696e-05, + "loss": 2.6423, + "step": 48711 + }, + { + "epoch": 2.2678958027795235, + "grad_norm": 0.30697857228172165, + "learning_rate": 1.706607048917378e-05, + "loss": 2.5835, + "step": 48712 + }, + { + "epoch": 2.2679423609656166, + "grad_norm": 0.3328872992315989, + "learning_rate": 1.7064032417961685e-05, + "loss": 2.5938, + "step": 48713 + }, + { + "epoch": 2.2679889191517097, + "grad_norm": 0.33085693402582445, + "learning_rate": 1.706199444341341e-05, + "loss": 2.6223, + "step": 48714 + }, + { + "epoch": 2.268035477337803, + "grad_norm": 0.35053134652790835, + "learning_rate": 1.705995656553494e-05, + "loss": 2.7281, + "step": 48715 + }, + { + "epoch": 2.268082035523896, + "grad_norm": 0.34849170890504266, + "learning_rate": 1.705791878433225e-05, + "loss": 2.6525, + "step": 48716 + }, + { + "epoch": 2.268128593709989, + "grad_norm": 0.3314310206720834, + "learning_rate": 1.7055881099811322e-05, + "loss": 2.7402, + "step": 48717 + }, + { + "epoch": 2.268175151896082, + "grad_norm": 0.35042047898719, + "learning_rate": 1.7053843511978152e-05, + "loss": 2.5654, + "step": 48718 + }, + { + "epoch": 2.2682217100821753, + "grad_norm": 0.34292807276678106, + "learning_rate": 1.7051806020838672e-05, + "loss": 2.5451, + "step": 48719 + }, + { + "epoch": 2.2682682682682684, + "grad_norm": 0.32862507749166403, + "learning_rate": 1.7049768626398933e-05, + "loss": 2.6508, + "step": 48720 + }, + { + "epoch": 2.268314826454361, + "grad_norm": 0.3515643864198516, + "learning_rate": 1.704773132866486e-05, + "loss": 2.7024, + "step": 48721 + }, + { + "epoch": 2.2683613846404542, + "grad_norm": 0.33330349818135246, + "learning_rate": 1.7045694127642446e-05, + "loss": 2.7251, + "step": 48722 + }, + { + "epoch": 2.2684079428265473, + "grad_norm": 0.3146770986657389, + "learning_rate": 1.7043657023337696e-05, + "loss": 2.557, + "step": 48723 + }, + { + "epoch": 2.2684545010126405, + "grad_norm": 0.379514210266681, + "learning_rate": 1.7041620015756525e-05, + "loss": 2.7584, + "step": 48724 + }, + { + "epoch": 2.2685010591987336, + "grad_norm": 0.34081021095023106, + "learning_rate": 
1.703958310490499e-05, + "loss": 2.6655, + "step": 48725 + }, + { + "epoch": 2.2685476173848267, + "grad_norm": 0.3342659646360014, + "learning_rate": 1.7037546290788996e-05, + "loss": 2.6425, + "step": 48726 + }, + { + "epoch": 2.26859417557092, + "grad_norm": 0.3538033107595074, + "learning_rate": 1.7035509573414594e-05, + "loss": 2.6865, + "step": 48727 + }, + { + "epoch": 2.268640733757013, + "grad_norm": 0.34909833719355077, + "learning_rate": 1.70334729527877e-05, + "loss": 2.6724, + "step": 48728 + }, + { + "epoch": 2.268687291943106, + "grad_norm": 0.32709530592834374, + "learning_rate": 1.7031436428914323e-05, + "loss": 2.6125, + "step": 48729 + }, + { + "epoch": 2.268733850129199, + "grad_norm": 0.3436194450113923, + "learning_rate": 1.7029400001800426e-05, + "loss": 2.6004, + "step": 48730 + }, + { + "epoch": 2.2687804083152923, + "grad_norm": 0.3330302207189212, + "learning_rate": 1.7027363671452014e-05, + "loss": 2.6263, + "step": 48731 + }, + { + "epoch": 2.268826966501385, + "grad_norm": 0.33463733516319605, + "learning_rate": 1.7025327437875e-05, + "loss": 2.6885, + "step": 48732 + }, + { + "epoch": 2.268873524687478, + "grad_norm": 0.34333930154755743, + "learning_rate": 1.702329130107544e-05, + "loss": 2.7372, + "step": 48733 + }, + { + "epoch": 2.268920082873571, + "grad_norm": 0.34003167826445496, + "learning_rate": 1.7021255261059254e-05, + "loss": 2.6439, + "step": 48734 + }, + { + "epoch": 2.2689666410596643, + "grad_norm": 0.34914925824308984, + "learning_rate": 1.7019219317832437e-05, + "loss": 2.641, + "step": 48735 + }, + { + "epoch": 2.2690131992457574, + "grad_norm": 0.3290116891504371, + "learning_rate": 1.701718347140097e-05, + "loss": 2.5446, + "step": 48736 + }, + { + "epoch": 2.2690597574318505, + "grad_norm": 0.356361629081159, + "learning_rate": 1.7015147721770795e-05, + "loss": 2.6735, + "step": 48737 + }, + { + "epoch": 2.2691063156179436, + "grad_norm": 0.350931953808758, + "learning_rate": 1.7013112068947944e-05, + "loss": 2.6992, + "step": 48738 + }, + { + "epoch": 2.2691528738040367, + "grad_norm": 0.319955387125877, + "learning_rate": 1.701107651293833e-05, + "loss": 2.6158, + "step": 48739 + }, + { + "epoch": 2.26919943199013, + "grad_norm": 0.31222743407728615, + "learning_rate": 1.7009041053747982e-05, + "loss": 2.6733, + "step": 48740 + }, + { + "epoch": 2.2692459901762225, + "grad_norm": 0.32861010865844453, + "learning_rate": 1.700700569138284e-05, + "loss": 2.7542, + "step": 48741 + }, + { + "epoch": 2.2692925483623156, + "grad_norm": 0.3354881847147694, + "learning_rate": 1.7004970425848876e-05, + "loss": 2.6744, + "step": 48742 + }, + { + "epoch": 2.2693391065484088, + "grad_norm": 0.3179856012491291, + "learning_rate": 1.700293525715208e-05, + "loss": 2.6219, + "step": 48743 + }, + { + "epoch": 2.269385664734502, + "grad_norm": 0.3123402976191582, + "learning_rate": 1.7000900185298417e-05, + "loss": 2.6824, + "step": 48744 + }, + { + "epoch": 2.269432222920595, + "grad_norm": 0.32155806819941096, + "learning_rate": 1.6998865210293864e-05, + "loss": 2.5962, + "step": 48745 + }, + { + "epoch": 2.269478781106688, + "grad_norm": 0.33817264474175646, + "learning_rate": 1.69968303321444e-05, + "loss": 2.6875, + "step": 48746 + }, + { + "epoch": 2.269525339292781, + "grad_norm": 0.33722948546261805, + "learning_rate": 1.6994795550855973e-05, + "loss": 2.6398, + "step": 48747 + }, + { + "epoch": 2.2695718974788743, + "grad_norm": 0.3424567108651638, + "learning_rate": 1.6992760866434577e-05, + "loss": 2.6585, + "step": 48748 + }, + { + "epoch": 
2.2696184556649674, + "grad_norm": 0.33529314442927827, + "learning_rate": 1.699072627888618e-05, + "loss": 2.6301, + "step": 48749 + }, + { + "epoch": 2.2696650138510606, + "grad_norm": 0.33108151042052814, + "learning_rate": 1.6988691788216725e-05, + "loss": 2.7006, + "step": 48750 + }, + { + "epoch": 2.2697115720371537, + "grad_norm": 0.3551234173247556, + "learning_rate": 1.6986657394432242e-05, + "loss": 2.6714, + "step": 48751 + }, + { + "epoch": 2.2697581302232464, + "grad_norm": 0.3475550785944222, + "learning_rate": 1.6984623097538625e-05, + "loss": 2.5834, + "step": 48752 + }, + { + "epoch": 2.2698046884093395, + "grad_norm": 0.34994722190311756, + "learning_rate": 1.6982588897541928e-05, + "loss": 2.6789, + "step": 48753 + }, + { + "epoch": 2.2698512465954326, + "grad_norm": 0.3136335258653411, + "learning_rate": 1.698055479444806e-05, + "loss": 2.6601, + "step": 48754 + }, + { + "epoch": 2.2698978047815257, + "grad_norm": 0.3148652552602108, + "learning_rate": 1.6978520788263014e-05, + "loss": 2.6332, + "step": 48755 + }, + { + "epoch": 2.269944362967619, + "grad_norm": 0.325836980566097, + "learning_rate": 1.6976486878992755e-05, + "loss": 2.6234, + "step": 48756 + }, + { + "epoch": 2.269990921153712, + "grad_norm": 0.3207389637286229, + "learning_rate": 1.6974453066643254e-05, + "loss": 2.7079, + "step": 48757 + }, + { + "epoch": 2.270037479339805, + "grad_norm": 0.32620864721989934, + "learning_rate": 1.6972419351220476e-05, + "loss": 2.6301, + "step": 48758 + }, + { + "epoch": 2.270084037525898, + "grad_norm": 0.32083957673446495, + "learning_rate": 1.6970385732730417e-05, + "loss": 2.6493, + "step": 48759 + }, + { + "epoch": 2.270130595711991, + "grad_norm": 0.32952492086814755, + "learning_rate": 1.6968352211178995e-05, + "loss": 2.6444, + "step": 48760 + }, + { + "epoch": 2.270177153898084, + "grad_norm": 0.3221616981695405, + "learning_rate": 1.696631878657221e-05, + "loss": 2.592, + "step": 48761 + }, + { + "epoch": 2.270223712084177, + "grad_norm": 0.3248915727313237, + "learning_rate": 1.696428545891603e-05, + "loss": 2.5595, + "step": 48762 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 0.3306122206589481, + "learning_rate": 1.6962252228216413e-05, + "loss": 2.6113, + "step": 48763 + }, + { + "epoch": 2.2703168284563633, + "grad_norm": 0.34515792476475643, + "learning_rate": 1.6960219094479345e-05, + "loss": 2.6196, + "step": 48764 + }, + { + "epoch": 2.2703633866424564, + "grad_norm": 0.315777813115189, + "learning_rate": 1.695818605771075e-05, + "loss": 2.6435, + "step": 48765 + }, + { + "epoch": 2.2704099448285495, + "grad_norm": 0.33732025648432373, + "learning_rate": 1.6956153117916656e-05, + "loss": 2.5944, + "step": 48766 + }, + { + "epoch": 2.2704565030146426, + "grad_norm": 0.31599772507930707, + "learning_rate": 1.695412027510298e-05, + "loss": 2.6975, + "step": 48767 + }, + { + "epoch": 2.2705030612007358, + "grad_norm": 0.32422085699982983, + "learning_rate": 1.6952087529275707e-05, + "loss": 2.5616, + "step": 48768 + }, + { + "epoch": 2.270549619386829, + "grad_norm": 0.3190577542888656, + "learning_rate": 1.6950054880440797e-05, + "loss": 2.5719, + "step": 48769 + }, + { + "epoch": 2.270596177572922, + "grad_norm": 0.32462612441006244, + "learning_rate": 1.6948022328604224e-05, + "loss": 2.7047, + "step": 48770 + }, + { + "epoch": 2.2706427357590147, + "grad_norm": 0.3252858918294963, + "learning_rate": 1.694598987377195e-05, + "loss": 2.6456, + "step": 48771 + }, + { + "epoch": 2.2706892939451078, + "grad_norm": 0.3545574489392, + 
"learning_rate": 1.6943957515949955e-05, + "loss": 2.7201, + "step": 48772 + }, + { + "epoch": 2.270735852131201, + "grad_norm": 0.30463143728282793, + "learning_rate": 1.694192525514417e-05, + "loss": 2.6297, + "step": 48773 + }, + { + "epoch": 2.270782410317294, + "grad_norm": 0.342321905639416, + "learning_rate": 1.6939893091360575e-05, + "loss": 2.6777, + "step": 48774 + }, + { + "epoch": 2.270828968503387, + "grad_norm": 0.33403065801423454, + "learning_rate": 1.6937861024605144e-05, + "loss": 2.5432, + "step": 48775 + }, + { + "epoch": 2.2708755266894802, + "grad_norm": 0.33441031376380964, + "learning_rate": 1.693582905488383e-05, + "loss": 2.6931, + "step": 48776 + }, + { + "epoch": 2.2709220848755733, + "grad_norm": 0.312806988503789, + "learning_rate": 1.6933797182202617e-05, + "loss": 2.5587, + "step": 48777 + }, + { + "epoch": 2.2709686430616665, + "grad_norm": 0.31674257950411433, + "learning_rate": 1.693176540656742e-05, + "loss": 2.7099, + "step": 48778 + }, + { + "epoch": 2.2710152012477596, + "grad_norm": 0.3392338668751004, + "learning_rate": 1.6929733727984264e-05, + "loss": 2.5868, + "step": 48779 + }, + { + "epoch": 2.2710617594338522, + "grad_norm": 0.35085611871383765, + "learning_rate": 1.692770214645905e-05, + "loss": 2.6814, + "step": 48780 + }, + { + "epoch": 2.2711083176199454, + "grad_norm": 0.3273231784256629, + "learning_rate": 1.6925670661997805e-05, + "loss": 2.6781, + "step": 48781 + }, + { + "epoch": 2.2711548758060385, + "grad_norm": 0.33952780628784246, + "learning_rate": 1.692363927460644e-05, + "loss": 2.6614, + "step": 48782 + }, + { + "epoch": 2.2712014339921316, + "grad_norm": 0.3375929647347583, + "learning_rate": 1.6921607984290936e-05, + "loss": 2.572, + "step": 48783 + }, + { + "epoch": 2.2712479921782247, + "grad_norm": 0.3406099653284577, + "learning_rate": 1.6919576791057257e-05, + "loss": 2.6586, + "step": 48784 + }, + { + "epoch": 2.271294550364318, + "grad_norm": 0.33024404419078063, + "learning_rate": 1.6917545694911375e-05, + "loss": 2.6335, + "step": 48785 + }, + { + "epoch": 2.271341108550411, + "grad_norm": 0.34825549339965417, + "learning_rate": 1.691551469585922e-05, + "loss": 2.7044, + "step": 48786 + }, + { + "epoch": 2.271387666736504, + "grad_norm": 0.35150096773573847, + "learning_rate": 1.691348379390677e-05, + "loss": 2.5762, + "step": 48787 + }, + { + "epoch": 2.271434224922597, + "grad_norm": 0.3464255831521322, + "learning_rate": 1.691145298905999e-05, + "loss": 2.7206, + "step": 48788 + }, + { + "epoch": 2.2714807831086903, + "grad_norm": 0.34193198506596145, + "learning_rate": 1.6909422281324834e-05, + "loss": 2.7277, + "step": 48789 + }, + { + "epoch": 2.2715273412947834, + "grad_norm": 0.3461808391680439, + "learning_rate": 1.6907391670707273e-05, + "loss": 2.5658, + "step": 48790 + }, + { + "epoch": 2.271573899480876, + "grad_norm": 0.34984974139020125, + "learning_rate": 1.6905361157213234e-05, + "loss": 2.5295, + "step": 48791 + }, + { + "epoch": 2.271620457666969, + "grad_norm": 0.346272080117562, + "learning_rate": 1.6903330740848728e-05, + "loss": 2.5957, + "step": 48792 + }, + { + "epoch": 2.2716670158530623, + "grad_norm": 0.3245082004316335, + "learning_rate": 1.6901300421619652e-05, + "loss": 2.5623, + "step": 48793 + }, + { + "epoch": 2.2717135740391554, + "grad_norm": 0.3676608461003171, + "learning_rate": 1.689927019953203e-05, + "loss": 2.605, + "step": 48794 + }, + { + "epoch": 2.2717601322252485, + "grad_norm": 0.35192711156252693, + "learning_rate": 1.6897240074591775e-05, + "loss": 2.5828, + "step": 
48795 + }, + { + "epoch": 2.2718066904113416, + "grad_norm": 0.344052195030586, + "learning_rate": 1.689521004680486e-05, + "loss": 2.7029, + "step": 48796 + }, + { + "epoch": 2.2718532485974348, + "grad_norm": 0.3429574886769403, + "learning_rate": 1.689318011617724e-05, + "loss": 2.6018, + "step": 48797 + }, + { + "epoch": 2.271899806783528, + "grad_norm": 0.32885630388412257, + "learning_rate": 1.689115028271488e-05, + "loss": 2.6543, + "step": 48798 + }, + { + "epoch": 2.2719463649696205, + "grad_norm": 0.3492265796108997, + "learning_rate": 1.688912054642374e-05, + "loss": 2.5784, + "step": 48799 + }, + { + "epoch": 2.2719929231557137, + "grad_norm": 0.33730242486559814, + "learning_rate": 1.688709090730976e-05, + "loss": 2.6674, + "step": 48800 + }, + { + "epoch": 2.2720394813418068, + "grad_norm": 0.3280232046024803, + "learning_rate": 1.6885061365378906e-05, + "loss": 2.7316, + "step": 48801 + }, + { + "epoch": 2.2720860395279, + "grad_norm": 0.34877702091987184, + "learning_rate": 1.688303192063713e-05, + "loss": 2.5509, + "step": 48802 + }, + { + "epoch": 2.272132597713993, + "grad_norm": 0.3249749145799923, + "learning_rate": 1.6881002573090415e-05, + "loss": 2.5864, + "step": 48803 + }, + { + "epoch": 2.272179155900086, + "grad_norm": 0.35614879322476634, + "learning_rate": 1.6878973322744657e-05, + "loss": 2.6134, + "step": 48804 + }, + { + "epoch": 2.2722257140861792, + "grad_norm": 0.33589172165867465, + "learning_rate": 1.6876944169605884e-05, + "loss": 2.5327, + "step": 48805 + }, + { + "epoch": 2.2722722722722724, + "grad_norm": 0.3165257859917856, + "learning_rate": 1.6874915113679985e-05, + "loss": 2.6141, + "step": 48806 + }, + { + "epoch": 2.2723188304583655, + "grad_norm": 0.34452029560161784, + "learning_rate": 1.6872886154972973e-05, + "loss": 2.6792, + "step": 48807 + }, + { + "epoch": 2.2723653886444586, + "grad_norm": 0.32923026655848686, + "learning_rate": 1.6870857293490767e-05, + "loss": 2.6087, + "step": 48808 + }, + { + "epoch": 2.2724119468305517, + "grad_norm": 0.35499837437099896, + "learning_rate": 1.6868828529239328e-05, + "loss": 2.7441, + "step": 48809 + }, + { + "epoch": 2.2724585050166444, + "grad_norm": 0.34745480238712706, + "learning_rate": 1.6866799862224615e-05, + "loss": 2.622, + "step": 48810 + }, + { + "epoch": 2.2725050632027375, + "grad_norm": 0.3664215254939204, + "learning_rate": 1.686477129245258e-05, + "loss": 2.6693, + "step": 48811 + }, + { + "epoch": 2.2725516213888306, + "grad_norm": 0.33037276650799674, + "learning_rate": 1.6862742819929196e-05, + "loss": 2.5671, + "step": 48812 + }, + { + "epoch": 2.2725981795749237, + "grad_norm": 0.3468191391030965, + "learning_rate": 1.6860714444660375e-05, + "loss": 2.6678, + "step": 48813 + }, + { + "epoch": 2.272644737761017, + "grad_norm": 0.35576229393308706, + "learning_rate": 1.6858686166652093e-05, + "loss": 2.6988, + "step": 48814 + }, + { + "epoch": 2.27269129594711, + "grad_norm": 0.3601678627386215, + "learning_rate": 1.6856657985910307e-05, + "loss": 2.6243, + "step": 48815 + }, + { + "epoch": 2.272737854133203, + "grad_norm": 0.32883970680754737, + "learning_rate": 1.685462990244096e-05, + "loss": 2.5608, + "step": 48816 + }, + { + "epoch": 2.272784412319296, + "grad_norm": 0.3430657870126619, + "learning_rate": 1.685260191625001e-05, + "loss": 2.617, + "step": 48817 + }, + { + "epoch": 2.2728309705053893, + "grad_norm": 0.3320570385993098, + "learning_rate": 1.6850574027343423e-05, + "loss": 2.6992, + "step": 48818 + }, + { + "epoch": 2.272877528691482, + "grad_norm": 
0.3235460429561481, + "learning_rate": 1.684854623572711e-05, + "loss": 2.564, + "step": 48819 + }, + { + "epoch": 2.272924086877575, + "grad_norm": 0.33281457348787763, + "learning_rate": 1.6846518541407076e-05, + "loss": 2.6757, + "step": 48820 + }, + { + "epoch": 2.272970645063668, + "grad_norm": 0.31774111127042926, + "learning_rate": 1.6844490944389224e-05, + "loss": 2.6399, + "step": 48821 + }, + { + "epoch": 2.2730172032497613, + "grad_norm": 0.3258121521329574, + "learning_rate": 1.684246344467953e-05, + "loss": 2.708, + "step": 48822 + }, + { + "epoch": 2.2730637614358544, + "grad_norm": 0.3297920908496887, + "learning_rate": 1.6840436042283936e-05, + "loss": 2.6067, + "step": 48823 + }, + { + "epoch": 2.2731103196219475, + "grad_norm": 0.3409207943804537, + "learning_rate": 1.6838408737208393e-05, + "loss": 2.6014, + "step": 48824 + }, + { + "epoch": 2.2731568778080407, + "grad_norm": 0.3347857836515807, + "learning_rate": 1.6836381529458874e-05, + "loss": 2.7011, + "step": 48825 + }, + { + "epoch": 2.2732034359941338, + "grad_norm": 0.3226087174318999, + "learning_rate": 1.6834354419041294e-05, + "loss": 2.6218, + "step": 48826 + }, + { + "epoch": 2.273249994180227, + "grad_norm": 0.36262079830592386, + "learning_rate": 1.6832327405961613e-05, + "loss": 2.564, + "step": 48827 + }, + { + "epoch": 2.27329655236632, + "grad_norm": 0.3370469805878388, + "learning_rate": 1.683030049022578e-05, + "loss": 2.77, + "step": 48828 + }, + { + "epoch": 2.273343110552413, + "grad_norm": 0.3248726074931619, + "learning_rate": 1.682827367183975e-05, + "loss": 2.6703, + "step": 48829 + }, + { + "epoch": 2.273389668738506, + "grad_norm": 0.34085295951948225, + "learning_rate": 1.682624695080947e-05, + "loss": 2.6887, + "step": 48830 + }, + { + "epoch": 2.273436226924599, + "grad_norm": 0.36087297127142404, + "learning_rate": 1.6824220327140904e-05, + "loss": 2.5139, + "step": 48831 + }, + { + "epoch": 2.273482785110692, + "grad_norm": 0.3566774785109677, + "learning_rate": 1.6822193800839952e-05, + "loss": 2.671, + "step": 48832 + }, + { + "epoch": 2.273529343296785, + "grad_norm": 0.3260578723074551, + "learning_rate": 1.682016737191262e-05, + "loss": 2.6476, + "step": 48833 + }, + { + "epoch": 2.2735759014828782, + "grad_norm": 0.35194683599232135, + "learning_rate": 1.6818141040364816e-05, + "loss": 2.6118, + "step": 48834 + }, + { + "epoch": 2.2736224596689714, + "grad_norm": 0.3730302009624915, + "learning_rate": 1.6816114806202498e-05, + "loss": 2.7453, + "step": 48835 + }, + { + "epoch": 2.2736690178550645, + "grad_norm": 0.3410108269678484, + "learning_rate": 1.681408866943161e-05, + "loss": 2.5887, + "step": 48836 + }, + { + "epoch": 2.2737155760411576, + "grad_norm": 0.3533581088410184, + "learning_rate": 1.6812062630058106e-05, + "loss": 2.6198, + "step": 48837 + }, + { + "epoch": 2.2737621342272503, + "grad_norm": 0.34419563906485023, + "learning_rate": 1.681003668808795e-05, + "loss": 2.5827, + "step": 48838 + }, + { + "epoch": 2.2738086924133434, + "grad_norm": 0.33167064364852084, + "learning_rate": 1.6808010843527044e-05, + "loss": 2.6197, + "step": 48839 + }, + { + "epoch": 2.2738552505994365, + "grad_norm": 0.35080922574563117, + "learning_rate": 1.6805985096381355e-05, + "loss": 2.6132, + "step": 48840 + }, + { + "epoch": 2.2739018087855296, + "grad_norm": 0.3564976798572505, + "learning_rate": 1.680395944665683e-05, + "loss": 2.6595, + "step": 48841 + }, + { + "epoch": 2.2739483669716227, + "grad_norm": 0.3058510072620722, + "learning_rate": 1.680193389435942e-05, + "loss": 
2.6492, + "step": 48842 + }, + { + "epoch": 2.273994925157716, + "grad_norm": 0.33627841061619995, + "learning_rate": 1.6799908439495056e-05, + "loss": 2.6766, + "step": 48843 + }, + { + "epoch": 2.274041483343809, + "grad_norm": 0.3276394423904228, + "learning_rate": 1.679788308206971e-05, + "loss": 2.6664, + "step": 48844 + }, + { + "epoch": 2.274088041529902, + "grad_norm": 0.3239513456069063, + "learning_rate": 1.6795857822089272e-05, + "loss": 2.651, + "step": 48845 + }, + { + "epoch": 2.274134599715995, + "grad_norm": 0.31193444263723774, + "learning_rate": 1.6793832659559754e-05, + "loss": 2.6956, + "step": 48846 + }, + { + "epoch": 2.2741811579020883, + "grad_norm": 0.32899169595616934, + "learning_rate": 1.6791807594487048e-05, + "loss": 2.6137, + "step": 48847 + }, + { + "epoch": 2.2742277160881814, + "grad_norm": 0.3067593932675045, + "learning_rate": 1.6789782626877115e-05, + "loss": 2.5868, + "step": 48848 + }, + { + "epoch": 2.274274274274274, + "grad_norm": 0.3562852271348729, + "learning_rate": 1.67877577567359e-05, + "loss": 2.629, + "step": 48849 + }, + { + "epoch": 2.274320832460367, + "grad_norm": 0.32251919668472645, + "learning_rate": 1.6785732984069348e-05, + "loss": 2.6342, + "step": 48850 + }, + { + "epoch": 2.2743673906464603, + "grad_norm": 0.3301026338923746, + "learning_rate": 1.6783708308883407e-05, + "loss": 2.7164, + "step": 48851 + }, + { + "epoch": 2.2744139488325534, + "grad_norm": 0.32060556073809515, + "learning_rate": 1.678168373118398e-05, + "loss": 2.6724, + "step": 48852 + }, + { + "epoch": 2.2744605070186465, + "grad_norm": 0.3253075836971992, + "learning_rate": 1.677965925097707e-05, + "loss": 2.6666, + "step": 48853 + }, + { + "epoch": 2.2745070652047397, + "grad_norm": 0.3243257906776001, + "learning_rate": 1.677763486826857e-05, + "loss": 2.6692, + "step": 48854 + }, + { + "epoch": 2.274553623390833, + "grad_norm": 0.3290841121798479, + "learning_rate": 1.6775610583064443e-05, + "loss": 2.6625, + "step": 48855 + }, + { + "epoch": 2.274600181576926, + "grad_norm": 0.3079732023846484, + "learning_rate": 1.6773586395370622e-05, + "loss": 2.5333, + "step": 48856 + }, + { + "epoch": 2.274646739763019, + "grad_norm": 0.30914979003481985, + "learning_rate": 1.6771562305193052e-05, + "loss": 2.6808, + "step": 48857 + }, + { + "epoch": 2.2746932979491117, + "grad_norm": 0.3349799154320096, + "learning_rate": 1.6769538312537675e-05, + "loss": 2.6645, + "step": 48858 + }, + { + "epoch": 2.274739856135205, + "grad_norm": 0.3102131245480092, + "learning_rate": 1.6767514417410447e-05, + "loss": 2.6577, + "step": 48859 + }, + { + "epoch": 2.274786414321298, + "grad_norm": 0.31044938866155963, + "learning_rate": 1.676549061981727e-05, + "loss": 2.6887, + "step": 48860 + }, + { + "epoch": 2.274832972507391, + "grad_norm": 0.3195687176327915, + "learning_rate": 1.6763466919764102e-05, + "loss": 2.5364, + "step": 48861 + }, + { + "epoch": 2.274879530693484, + "grad_norm": 0.3374721671931077, + "learning_rate": 1.676144331725689e-05, + "loss": 2.6601, + "step": 48862 + }, + { + "epoch": 2.2749260888795773, + "grad_norm": 0.31642434763907545, + "learning_rate": 1.6759419812301564e-05, + "loss": 2.6972, + "step": 48863 + }, + { + "epoch": 2.2749726470656704, + "grad_norm": 0.31888575486377724, + "learning_rate": 1.6757396404904086e-05, + "loss": 2.6493, + "step": 48864 + }, + { + "epoch": 2.2750192052517635, + "grad_norm": 0.30670555958179946, + "learning_rate": 1.6755373095070336e-05, + "loss": 2.6626, + "step": 48865 + }, + { + "epoch": 2.2750657634378566, + 
"grad_norm": 0.32095854874323687, + "learning_rate": 1.675334988280633e-05, + "loss": 2.5561, + "step": 48866 + }, + { + "epoch": 2.2751123216239497, + "grad_norm": 0.3382155860870552, + "learning_rate": 1.6751326768117948e-05, + "loss": 2.6158, + "step": 48867 + }, + { + "epoch": 2.275158879810043, + "grad_norm": 0.32065250956117247, + "learning_rate": 1.674930375101114e-05, + "loss": 2.6374, + "step": 48868 + }, + { + "epoch": 2.2752054379961355, + "grad_norm": 0.331894788984275, + "learning_rate": 1.6747280831491858e-05, + "loss": 2.6572, + "step": 48869 + }, + { + "epoch": 2.2752519961822286, + "grad_norm": 0.336686180641425, + "learning_rate": 1.674525800956603e-05, + "loss": 2.6602, + "step": 48870 + }, + { + "epoch": 2.2752985543683217, + "grad_norm": 0.33342709484448585, + "learning_rate": 1.674323528523959e-05, + "loss": 2.7145, + "step": 48871 + }, + { + "epoch": 2.275345112554415, + "grad_norm": 0.3088553113689747, + "learning_rate": 1.6741212658518495e-05, + "loss": 2.6593, + "step": 48872 + }, + { + "epoch": 2.275391670740508, + "grad_norm": 0.3255603721875239, + "learning_rate": 1.673919012940865e-05, + "loss": 2.6237, + "step": 48873 + }, + { + "epoch": 2.275438228926601, + "grad_norm": 0.3304763746291425, + "learning_rate": 1.6737167697916005e-05, + "loss": 2.6809, + "step": 48874 + }, + { + "epoch": 2.275484787112694, + "grad_norm": 0.3276130802497817, + "learning_rate": 1.6735145364046495e-05, + "loss": 2.6861, + "step": 48875 + }, + { + "epoch": 2.2755313452987873, + "grad_norm": 0.3175712704572164, + "learning_rate": 1.673312312780606e-05, + "loss": 2.6462, + "step": 48876 + }, + { + "epoch": 2.27557790348488, + "grad_norm": 0.32339685050538997, + "learning_rate": 1.6731100989200644e-05, + "loss": 2.5829, + "step": 48877 + }, + { + "epoch": 2.275624461670973, + "grad_norm": 0.38392982867043707, + "learning_rate": 1.6729078948236137e-05, + "loss": 2.6629, + "step": 48878 + }, + { + "epoch": 2.275671019857066, + "grad_norm": 0.31770880876152346, + "learning_rate": 1.6727057004918534e-05, + "loss": 2.653, + "step": 48879 + }, + { + "epoch": 2.2757175780431593, + "grad_norm": 0.3348301399412496, + "learning_rate": 1.6725035159253733e-05, + "loss": 2.6018, + "step": 48880 + }, + { + "epoch": 2.2757641362292524, + "grad_norm": 0.33871380912942195, + "learning_rate": 1.6723013411247667e-05, + "loss": 2.6743, + "step": 48881 + }, + { + "epoch": 2.2758106944153456, + "grad_norm": 0.32575608268051076, + "learning_rate": 1.6720991760906285e-05, + "loss": 2.5881, + "step": 48882 + }, + { + "epoch": 2.2758572526014387, + "grad_norm": 0.3539333031086548, + "learning_rate": 1.6718970208235506e-05, + "loss": 2.6984, + "step": 48883 + }, + { + "epoch": 2.275903810787532, + "grad_norm": 0.3360041022546035, + "learning_rate": 1.671694875324128e-05, + "loss": 2.573, + "step": 48884 + }, + { + "epoch": 2.275950368973625, + "grad_norm": 0.3433708818970148, + "learning_rate": 1.671492739592954e-05, + "loss": 2.6102, + "step": 48885 + }, + { + "epoch": 2.275996927159718, + "grad_norm": 0.34111890757759206, + "learning_rate": 1.6712906136306196e-05, + "loss": 2.6925, + "step": 48886 + }, + { + "epoch": 2.276043485345811, + "grad_norm": 0.32842646596122477, + "learning_rate": 1.6710884974377194e-05, + "loss": 2.6065, + "step": 48887 + }, + { + "epoch": 2.276090043531904, + "grad_norm": 0.3235083353486063, + "learning_rate": 1.6708863910148466e-05, + "loss": 2.749, + "step": 48888 + }, + { + "epoch": 2.276136601717997, + "grad_norm": 0.3213176013515041, + "learning_rate": 1.6706842943625937e-05, 
+ "loss": 2.5956, + "step": 48889 + }, + { + "epoch": 2.27618315990409, + "grad_norm": 0.33843305818036346, + "learning_rate": 1.6704822074815564e-05, + "loss": 2.7237, + "step": 48890 + }, + { + "epoch": 2.276229718090183, + "grad_norm": 0.31004035299743055, + "learning_rate": 1.6702801303723224e-05, + "loss": 2.6895, + "step": 48891 + }, + { + "epoch": 2.2762762762762763, + "grad_norm": 0.3436200001898951, + "learning_rate": 1.6700780630354916e-05, + "loss": 2.6901, + "step": 48892 + }, + { + "epoch": 2.2763228344623694, + "grad_norm": 0.32068337394958746, + "learning_rate": 1.66987600547165e-05, + "loss": 2.6767, + "step": 48893 + }, + { + "epoch": 2.2763693926484625, + "grad_norm": 0.3345181339161667, + "learning_rate": 1.669673957681398e-05, + "loss": 2.6515, + "step": 48894 + }, + { + "epoch": 2.2764159508345556, + "grad_norm": 0.3112215883896486, + "learning_rate": 1.669471919665323e-05, + "loss": 2.6075, + "step": 48895 + }, + { + "epoch": 2.2764625090206487, + "grad_norm": 0.3235200628735817, + "learning_rate": 1.6692698914240202e-05, + "loss": 2.6215, + "step": 48896 + }, + { + "epoch": 2.2765090672067414, + "grad_norm": 0.3289699635680988, + "learning_rate": 1.6690678729580817e-05, + "loss": 2.5795, + "step": 48897 + }, + { + "epoch": 2.2765556253928345, + "grad_norm": 0.3309223061733202, + "learning_rate": 1.668865864268102e-05, + "loss": 2.6711, + "step": 48898 + }, + { + "epoch": 2.2766021835789276, + "grad_norm": 0.3245462541146684, + "learning_rate": 1.668663865354672e-05, + "loss": 2.7046, + "step": 48899 + }, + { + "epoch": 2.2766487417650207, + "grad_norm": 0.33704518042922704, + "learning_rate": 1.668461876218385e-05, + "loss": 2.7021, + "step": 48900 + }, + { + "epoch": 2.276695299951114, + "grad_norm": 0.3039872398724521, + "learning_rate": 1.6682598968598346e-05, + "loss": 2.5721, + "step": 48901 + }, + { + "epoch": 2.276741858137207, + "grad_norm": 0.3603411910811648, + "learning_rate": 1.6680579272796126e-05, + "loss": 2.7217, + "step": 48902 + }, + { + "epoch": 2.2767884163233, + "grad_norm": 0.3229321744130196, + "learning_rate": 1.6678559674783144e-05, + "loss": 2.6909, + "step": 48903 + }, + { + "epoch": 2.276834974509393, + "grad_norm": 0.30755774128551794, + "learning_rate": 1.667654017456527e-05, + "loss": 2.6729, + "step": 48904 + }, + { + "epoch": 2.2768815326954863, + "grad_norm": 0.3443814690822966, + "learning_rate": 1.6674520772148505e-05, + "loss": 2.5799, + "step": 48905 + }, + { + "epoch": 2.2769280908815794, + "grad_norm": 0.32226822899570434, + "learning_rate": 1.6672501467538703e-05, + "loss": 2.7023, + "step": 48906 + }, + { + "epoch": 2.2769746490676726, + "grad_norm": 0.3526822583449697, + "learning_rate": 1.667048226074186e-05, + "loss": 2.7133, + "step": 48907 + }, + { + "epoch": 2.2770212072537652, + "grad_norm": 0.3319484735527597, + "learning_rate": 1.6668463151763845e-05, + "loss": 2.7456, + "step": 48908 + }, + { + "epoch": 2.2770677654398583, + "grad_norm": 0.3329879028832446, + "learning_rate": 1.6666444140610614e-05, + "loss": 2.6122, + "step": 48909 + }, + { + "epoch": 2.2771143236259515, + "grad_norm": 0.3269444418940014, + "learning_rate": 1.6664425227288082e-05, + "loss": 2.6395, + "step": 48910 + }, + { + "epoch": 2.2771608818120446, + "grad_norm": 0.3414795385490859, + "learning_rate": 1.6662406411802185e-05, + "loss": 2.6527, + "step": 48911 + }, + { + "epoch": 2.2772074399981377, + "grad_norm": 0.32285181224479886, + "learning_rate": 1.6660387694158854e-05, + "loss": 2.6614, + "step": 48912 + }, + { + "epoch": 
2.277253998184231, + "grad_norm": 0.33540957860750226, + "learning_rate": 1.665836907436398e-05, + "loss": 2.6399, + "step": 48913 + }, + { + "epoch": 2.277300556370324, + "grad_norm": 0.33619140825167804, + "learning_rate": 1.665635055242351e-05, + "loss": 2.691, + "step": 48914 + }, + { + "epoch": 2.277347114556417, + "grad_norm": 0.334407958088852, + "learning_rate": 1.6654332128343366e-05, + "loss": 2.5617, + "step": 48915 + }, + { + "epoch": 2.27739367274251, + "grad_norm": 0.3425618023238129, + "learning_rate": 1.6652313802129486e-05, + "loss": 2.6528, + "step": 48916 + }, + { + "epoch": 2.277440230928603, + "grad_norm": 0.35574541460145004, + "learning_rate": 1.6650295573787754e-05, + "loss": 2.7293, + "step": 48917 + }, + { + "epoch": 2.277486789114696, + "grad_norm": 0.3168209279591858, + "learning_rate": 1.6648277443324144e-05, + "loss": 2.7342, + "step": 48918 + }, + { + "epoch": 2.277533347300789, + "grad_norm": 0.35276205906947944, + "learning_rate": 1.664625941074453e-05, + "loss": 2.5817, + "step": 48919 + }, + { + "epoch": 2.277579905486882, + "grad_norm": 0.3254096737698439, + "learning_rate": 1.6644241476054883e-05, + "loss": 2.6699, + "step": 48920 + }, + { + "epoch": 2.2776264636729753, + "grad_norm": 0.330518867358646, + "learning_rate": 1.664222363926109e-05, + "loss": 2.702, + "step": 48921 + }, + { + "epoch": 2.2776730218590684, + "grad_norm": 0.33204534478380976, + "learning_rate": 1.664020590036908e-05, + "loss": 2.6973, + "step": 48922 + }, + { + "epoch": 2.2777195800451615, + "grad_norm": 0.3136526959756339, + "learning_rate": 1.6638188259384785e-05, + "loss": 2.5897, + "step": 48923 + }, + { + "epoch": 2.2777661382312546, + "grad_norm": 0.3167021024089795, + "learning_rate": 1.6636170716314113e-05, + "loss": 2.6402, + "step": 48924 + }, + { + "epoch": 2.2778126964173477, + "grad_norm": 0.3284290019102636, + "learning_rate": 1.6634153271163018e-05, + "loss": 2.6602, + "step": 48925 + }, + { + "epoch": 2.277859254603441, + "grad_norm": 0.3227813084869969, + "learning_rate": 1.6632135923937374e-05, + "loss": 2.5937, + "step": 48926 + }, + { + "epoch": 2.277905812789534, + "grad_norm": 0.32983632459421613, + "learning_rate": 1.663011867464313e-05, + "loss": 2.7694, + "step": 48927 + }, + { + "epoch": 2.2779523709756266, + "grad_norm": 0.3428276372452041, + "learning_rate": 1.6628101523286194e-05, + "loss": 2.6135, + "step": 48928 + }, + { + "epoch": 2.2779989291617198, + "grad_norm": 0.3210476586680625, + "learning_rate": 1.6626084469872492e-05, + "loss": 2.6753, + "step": 48929 + }, + { + "epoch": 2.278045487347813, + "grad_norm": 0.31785824003308155, + "learning_rate": 1.662406751440795e-05, + "loss": 2.681, + "step": 48930 + }, + { + "epoch": 2.278092045533906, + "grad_norm": 0.33677494294403865, + "learning_rate": 1.662205065689849e-05, + "loss": 2.5572, + "step": 48931 + }, + { + "epoch": 2.278138603719999, + "grad_norm": 0.32952217344247803, + "learning_rate": 1.6620033897349995e-05, + "loss": 2.6477, + "step": 48932 + }, + { + "epoch": 2.278185161906092, + "grad_norm": 0.3320536164748549, + "learning_rate": 1.6618017235768445e-05, + "loss": 2.7099, + "step": 48933 + }, + { + "epoch": 2.2782317200921853, + "grad_norm": 0.33582007197180735, + "learning_rate": 1.661600067215971e-05, + "loss": 2.5991, + "step": 48934 + }, + { + "epoch": 2.2782782782782784, + "grad_norm": 0.3264925129585009, + "learning_rate": 1.6613984206529726e-05, + "loss": 2.6671, + "step": 48935 + }, + { + "epoch": 2.278324836464371, + "grad_norm": 0.31950440077228226, + "learning_rate": 
1.6611967838884408e-05, + "loss": 2.7308, + "step": 48936 + }, + { + "epoch": 2.2783713946504642, + "grad_norm": 0.34404572408961565, + "learning_rate": 1.660995156922967e-05, + "loss": 2.6507, + "step": 48937 + }, + { + "epoch": 2.2784179528365573, + "grad_norm": 0.3226055485332105, + "learning_rate": 1.6607935397571454e-05, + "loss": 2.6382, + "step": 48938 + }, + { + "epoch": 2.2784645110226505, + "grad_norm": 0.30889237707862477, + "learning_rate": 1.6605919323915643e-05, + "loss": 2.5124, + "step": 48939 + }, + { + "epoch": 2.2785110692087436, + "grad_norm": 0.3350600810406283, + "learning_rate": 1.6603903348268167e-05, + "loss": 2.6786, + "step": 48940 + }, + { + "epoch": 2.2785576273948367, + "grad_norm": 0.3206127462143856, + "learning_rate": 1.6601887470634947e-05, + "loss": 2.6476, + "step": 48941 + }, + { + "epoch": 2.27860418558093, + "grad_norm": 0.31783383725603015, + "learning_rate": 1.6599871691021897e-05, + "loss": 2.667, + "step": 48942 + }, + { + "epoch": 2.278650743767023, + "grad_norm": 0.34703649190595043, + "learning_rate": 1.6597856009434933e-05, + "loss": 2.5782, + "step": 48943 + }, + { + "epoch": 2.278697301953116, + "grad_norm": 0.3188609648352618, + "learning_rate": 1.6595840425879984e-05, + "loss": 2.7254, + "step": 48944 + }, + { + "epoch": 2.278743860139209, + "grad_norm": 0.3326960341294685, + "learning_rate": 1.6593824940362923e-05, + "loss": 2.8401, + "step": 48945 + }, + { + "epoch": 2.2787904183253023, + "grad_norm": 0.3385693474406302, + "learning_rate": 1.6591809552889725e-05, + "loss": 2.6104, + "step": 48946 + }, + { + "epoch": 2.278836976511395, + "grad_norm": 0.30748087531644, + "learning_rate": 1.6589794263466256e-05, + "loss": 2.6274, + "step": 48947 + }, + { + "epoch": 2.278883534697488, + "grad_norm": 0.3480636896117183, + "learning_rate": 1.658777907209846e-05, + "loss": 2.655, + "step": 48948 + }, + { + "epoch": 2.278930092883581, + "grad_norm": 0.32427080702202987, + "learning_rate": 1.658576397879223e-05, + "loss": 2.7286, + "step": 48949 + }, + { + "epoch": 2.2789766510696743, + "grad_norm": 0.34087444708164044, + "learning_rate": 1.6583748983553492e-05, + "loss": 2.5973, + "step": 48950 + }, + { + "epoch": 2.2790232092557674, + "grad_norm": 0.33202933837365217, + "learning_rate": 1.658173408638818e-05, + "loss": 2.5736, + "step": 48951 + }, + { + "epoch": 2.2790697674418605, + "grad_norm": 0.3059510368969798, + "learning_rate": 1.657971928730217e-05, + "loss": 2.6728, + "step": 48952 + }, + { + "epoch": 2.2791163256279536, + "grad_norm": 0.3156338154715352, + "learning_rate": 1.657770458630139e-05, + "loss": 2.6613, + "step": 48953 + }, + { + "epoch": 2.2791628838140467, + "grad_norm": 0.3173964291448871, + "learning_rate": 1.657568998339175e-05, + "loss": 2.6479, + "step": 48954 + }, + { + "epoch": 2.27920944200014, + "grad_norm": 0.3334324722884092, + "learning_rate": 1.657367547857917e-05, + "loss": 2.6997, + "step": 48955 + }, + { + "epoch": 2.2792560001862325, + "grad_norm": 0.3179548656367059, + "learning_rate": 1.657166107186956e-05, + "loss": 2.5985, + "step": 48956 + }, + { + "epoch": 2.2793025583723256, + "grad_norm": 0.31697223999615093, + "learning_rate": 1.6569646763268847e-05, + "loss": 2.6139, + "step": 48957 + }, + { + "epoch": 2.2793491165584188, + "grad_norm": 0.33966203002351947, + "learning_rate": 1.6567632552782898e-05, + "loss": 2.5982, + "step": 48958 + }, + { + "epoch": 2.279395674744512, + "grad_norm": 0.3222311882883694, + "learning_rate": 1.6565618440417687e-05, + "loss": 2.5942, + "step": 48959 + }, + { + 
"epoch": 2.279442232930605, + "grad_norm": 0.32923021136374464, + "learning_rate": 1.6563604426179053e-05, + "loss": 2.6842, + "step": 48960 + }, + { + "epoch": 2.279488791116698, + "grad_norm": 0.3196121078328467, + "learning_rate": 1.656159051007298e-05, + "loss": 2.7123, + "step": 48961 + }, + { + "epoch": 2.2795353493027912, + "grad_norm": 0.3439054035255074, + "learning_rate": 1.6559576692105333e-05, + "loss": 2.7357, + "step": 48962 + }, + { + "epoch": 2.2795819074888843, + "grad_norm": 0.3117225596371237, + "learning_rate": 1.655756297228203e-05, + "loss": 2.558, + "step": 48963 + }, + { + "epoch": 2.2796284656749775, + "grad_norm": 0.3234098174639511, + "learning_rate": 1.6555549350609002e-05, + "loss": 2.6718, + "step": 48964 + }, + { + "epoch": 2.2796750238610706, + "grad_norm": 0.3523363448513433, + "learning_rate": 1.655353582709211e-05, + "loss": 2.7007, + "step": 48965 + }, + { + "epoch": 2.2797215820471637, + "grad_norm": 0.3271252123528907, + "learning_rate": 1.6551522401737328e-05, + "loss": 2.6674, + "step": 48966 + }, + { + "epoch": 2.2797681402332564, + "grad_norm": 0.336005283448297, + "learning_rate": 1.6549509074550517e-05, + "loss": 2.6467, + "step": 48967 + }, + { + "epoch": 2.2798146984193495, + "grad_norm": 0.3430595089715493, + "learning_rate": 1.6547495845537602e-05, + "loss": 2.6667, + "step": 48968 + }, + { + "epoch": 2.2798612566054426, + "grad_norm": 0.3366737975618892, + "learning_rate": 1.6545482714704487e-05, + "loss": 2.6935, + "step": 48969 + }, + { + "epoch": 2.2799078147915357, + "grad_norm": 0.3578934094133608, + "learning_rate": 1.6543469682057106e-05, + "loss": 2.5362, + "step": 48970 + }, + { + "epoch": 2.279954372977629, + "grad_norm": 0.34335956151592384, + "learning_rate": 1.654145674760131e-05, + "loss": 2.6521, + "step": 48971 + }, + { + "epoch": 2.280000931163722, + "grad_norm": 0.32517866415146507, + "learning_rate": 1.653944391134307e-05, + "loss": 2.5856, + "step": 48972 + }, + { + "epoch": 2.280047489349815, + "grad_norm": 0.34608467352395206, + "learning_rate": 1.6537431173288243e-05, + "loss": 2.6272, + "step": 48973 + }, + { + "epoch": 2.280094047535908, + "grad_norm": 0.36223446658475444, + "learning_rate": 1.6535418533442787e-05, + "loss": 2.7133, + "step": 48974 + }, + { + "epoch": 2.280140605722001, + "grad_norm": 0.3408219065404578, + "learning_rate": 1.6533405991812562e-05, + "loss": 2.6036, + "step": 48975 + }, + { + "epoch": 2.280187163908094, + "grad_norm": 0.3366017073449786, + "learning_rate": 1.6531393548403496e-05, + "loss": 2.6445, + "step": 48976 + }, + { + "epoch": 2.280233722094187, + "grad_norm": 0.3535109024836665, + "learning_rate": 1.6529381203221506e-05, + "loss": 2.6825, + "step": 48977 + }, + { + "epoch": 2.28028028028028, + "grad_norm": 0.33539840017567935, + "learning_rate": 1.6527368956272455e-05, + "loss": 2.7444, + "step": 48978 + }, + { + "epoch": 2.2803268384663733, + "grad_norm": 0.3356996893045058, + "learning_rate": 1.6525356807562315e-05, + "loss": 2.5658, + "step": 48979 + }, + { + "epoch": 2.2803733966524664, + "grad_norm": 0.34488167012328097, + "learning_rate": 1.6523344757096932e-05, + "loss": 2.6946, + "step": 48980 + }, + { + "epoch": 2.2804199548385595, + "grad_norm": 0.3604114700912587, + "learning_rate": 1.6521332804882234e-05, + "loss": 2.6863, + "step": 48981 + }, + { + "epoch": 2.2804665130246526, + "grad_norm": 0.3300412705388323, + "learning_rate": 1.6519320950924134e-05, + "loss": 2.5468, + "step": 48982 + }, + { + "epoch": 2.2805130712107458, + "grad_norm": 0.33448413087484363, + 
"learning_rate": 1.651730919522852e-05, + "loss": 2.6356, + "step": 48983 + }, + { + "epoch": 2.280559629396839, + "grad_norm": 0.368168501591091, + "learning_rate": 1.6515297537801306e-05, + "loss": 2.7152, + "step": 48984 + }, + { + "epoch": 2.280606187582932, + "grad_norm": 0.3358473417076836, + "learning_rate": 1.6513285978648412e-05, + "loss": 2.5464, + "step": 48985 + }, + { + "epoch": 2.2806527457690247, + "grad_norm": 0.32496232265937763, + "learning_rate": 1.65112745177757e-05, + "loss": 2.6926, + "step": 48986 + }, + { + "epoch": 2.2806993039551178, + "grad_norm": 0.3468470074828332, + "learning_rate": 1.650926315518912e-05, + "loss": 2.6717, + "step": 48987 + }, + { + "epoch": 2.280745862141211, + "grad_norm": 0.354438453784687, + "learning_rate": 1.6507251890894542e-05, + "loss": 2.6179, + "step": 48988 + }, + { + "epoch": 2.280792420327304, + "grad_norm": 0.3356009580105488, + "learning_rate": 1.650524072489788e-05, + "loss": 2.5213, + "step": 48989 + }, + { + "epoch": 2.280838978513397, + "grad_norm": 0.34895324077782064, + "learning_rate": 1.6503229657205048e-05, + "loss": 2.6359, + "step": 48990 + }, + { + "epoch": 2.2808855366994902, + "grad_norm": 0.3454307518119803, + "learning_rate": 1.6501218687821914e-05, + "loss": 2.6328, + "step": 48991 + }, + { + "epoch": 2.2809320948855833, + "grad_norm": 0.3191488584954332, + "learning_rate": 1.6499207816754432e-05, + "loss": 2.5182, + "step": 48992 + }, + { + "epoch": 2.2809786530716765, + "grad_norm": 0.34977512241121556, + "learning_rate": 1.649719704400845e-05, + "loss": 2.6692, + "step": 48993 + }, + { + "epoch": 2.2810252112577696, + "grad_norm": 0.3475756433282102, + "learning_rate": 1.6495186369589903e-05, + "loss": 2.6294, + "step": 48994 + }, + { + "epoch": 2.2810717694438623, + "grad_norm": 0.32629732036333137, + "learning_rate": 1.6493175793504678e-05, + "loss": 2.6529, + "step": 48995 + }, + { + "epoch": 2.2811183276299554, + "grad_norm": 0.34325760217351436, + "learning_rate": 1.649116531575869e-05, + "loss": 2.6718, + "step": 48996 + }, + { + "epoch": 2.2811648858160485, + "grad_norm": 0.3461605355135265, + "learning_rate": 1.648915493635782e-05, + "loss": 2.6559, + "step": 48997 + }, + { + "epoch": 2.2812114440021416, + "grad_norm": 0.3330650803379174, + "learning_rate": 1.6487144655308002e-05, + "loss": 2.6198, + "step": 48998 + }, + { + "epoch": 2.2812580021882347, + "grad_norm": 0.32296517971303973, + "learning_rate": 1.6485134472615076e-05, + "loss": 2.5701, + "step": 48999 + }, + { + "epoch": 2.281304560374328, + "grad_norm": 0.34207261721093235, + "learning_rate": 1.6483124388285016e-05, + "loss": 2.6874, + "step": 49000 + }, + { + "epoch": 2.281351118560421, + "grad_norm": 0.3247219200986299, + "learning_rate": 1.648111440232366e-05, + "loss": 2.6914, + "step": 49001 + }, + { + "epoch": 2.281397676746514, + "grad_norm": 0.3438809669648412, + "learning_rate": 1.647910451473694e-05, + "loss": 2.5942, + "step": 49002 + }, + { + "epoch": 2.281444234932607, + "grad_norm": 0.34541755323430073, + "learning_rate": 1.6477094725530756e-05, + "loss": 2.6532, + "step": 49003 + }, + { + "epoch": 2.2814907931187003, + "grad_norm": 0.3036356195872225, + "learning_rate": 1.647508503471097e-05, + "loss": 2.6491, + "step": 49004 + }, + { + "epoch": 2.2815373513047934, + "grad_norm": 0.30983061585772015, + "learning_rate": 1.647307544228353e-05, + "loss": 2.5676, + "step": 49005 + }, + { + "epoch": 2.281583909490886, + "grad_norm": 0.33659333657475765, + "learning_rate": 1.64710659482543e-05, + "loss": 2.6739, + "step": 49006 
+ }, + { + "epoch": 2.281630467676979, + "grad_norm": 0.3371942840405006, + "learning_rate": 1.646905655262919e-05, + "loss": 2.5772, + "step": 49007 + }, + { + "epoch": 2.2816770258630723, + "grad_norm": 0.3388406951653877, + "learning_rate": 1.6467047255414093e-05, + "loss": 2.6265, + "step": 49008 + }, + { + "epoch": 2.2817235840491654, + "grad_norm": 0.34393109055090326, + "learning_rate": 1.6465038056614906e-05, + "loss": 2.7912, + "step": 49009 + }, + { + "epoch": 2.2817701422352585, + "grad_norm": 0.308371856761467, + "learning_rate": 1.6463028956237535e-05, + "loss": 2.6595, + "step": 49010 + }, + { + "epoch": 2.2818167004213517, + "grad_norm": 0.3407103648397465, + "learning_rate": 1.6461019954287883e-05, + "loss": 2.5507, + "step": 49011 + }, + { + "epoch": 2.2818632586074448, + "grad_norm": 0.331900836597294, + "learning_rate": 1.6459011050771798e-05, + "loss": 2.597, + "step": 49012 + }, + { + "epoch": 2.281909816793538, + "grad_norm": 0.3348675539317449, + "learning_rate": 1.6457002245695246e-05, + "loss": 2.6965, + "step": 49013 + }, + { + "epoch": 2.2819563749796306, + "grad_norm": 0.3107456740257757, + "learning_rate": 1.645499353906407e-05, + "loss": 2.5609, + "step": 49014 + }, + { + "epoch": 2.2820029331657237, + "grad_norm": 0.35155682887328166, + "learning_rate": 1.645298493088419e-05, + "loss": 2.6775, + "step": 49015 + }, + { + "epoch": 2.282049491351817, + "grad_norm": 0.33227466375109965, + "learning_rate": 1.645097642116151e-05, + "loss": 2.7187, + "step": 49016 + }, + { + "epoch": 2.28209604953791, + "grad_norm": 0.3155469525061529, + "learning_rate": 1.6448968009901873e-05, + "loss": 2.607, + "step": 49017 + }, + { + "epoch": 2.282142607724003, + "grad_norm": 0.3536127709723997, + "learning_rate": 1.6446959697111238e-05, + "loss": 2.6354, + "step": 49018 + }, + { + "epoch": 2.282189165910096, + "grad_norm": 0.3134513807223236, + "learning_rate": 1.644495148279544e-05, + "loss": 2.5743, + "step": 49019 + }, + { + "epoch": 2.2822357240961892, + "grad_norm": 0.31836525061028387, + "learning_rate": 1.644294336696044e-05, + "loss": 2.6403, + "step": 49020 + }, + { + "epoch": 2.2822822822822824, + "grad_norm": 0.3437413118349725, + "learning_rate": 1.644093534961208e-05, + "loss": 2.561, + "step": 49021 + }, + { + "epoch": 2.2823288404683755, + "grad_norm": 0.3250906296164132, + "learning_rate": 1.6438927430756264e-05, + "loss": 2.6226, + "step": 49022 + }, + { + "epoch": 2.2823753986544686, + "grad_norm": 0.32593728630644925, + "learning_rate": 1.6436919610398895e-05, + "loss": 2.6377, + "step": 49023 + }, + { + "epoch": 2.2824219568405617, + "grad_norm": 0.3339976972884876, + "learning_rate": 1.6434911888545867e-05, + "loss": 2.6571, + "step": 49024 + }, + { + "epoch": 2.2824685150266544, + "grad_norm": 0.31030525060564557, + "learning_rate": 1.6432904265203042e-05, + "loss": 2.6578, + "step": 49025 + }, + { + "epoch": 2.2825150732127475, + "grad_norm": 0.32887571632505347, + "learning_rate": 1.6430896740376366e-05, + "loss": 2.6073, + "step": 49026 + }, + { + "epoch": 2.2825616313988406, + "grad_norm": 0.33477911922081544, + "learning_rate": 1.6428889314071683e-05, + "loss": 2.5846, + "step": 49027 + }, + { + "epoch": 2.2826081895849337, + "grad_norm": 0.3373659326574888, + "learning_rate": 1.6426881986294906e-05, + "loss": 2.6096, + "step": 49028 + }, + { + "epoch": 2.282654747771027, + "grad_norm": 0.3521289153509881, + "learning_rate": 1.6424874757051937e-05, + "loss": 2.6246, + "step": 49029 + }, + { + "epoch": 2.28270130595712, + "grad_norm": 
0.3151673137967856, + "learning_rate": 1.642286762634862e-05, + "loss": 2.6244, + "step": 49030 + }, + { + "epoch": 2.282747864143213, + "grad_norm": 0.33260019762982007, + "learning_rate": 1.6420860594190913e-05, + "loss": 2.748, + "step": 49031 + }, + { + "epoch": 2.282794422329306, + "grad_norm": 0.35093520636051134, + "learning_rate": 1.641885366058464e-05, + "loss": 2.5698, + "step": 49032 + }, + { + "epoch": 2.2828409805153993, + "grad_norm": 0.35256255203370873, + "learning_rate": 1.6416846825535753e-05, + "loss": 2.6405, + "step": 49033 + }, + { + "epoch": 2.282887538701492, + "grad_norm": 0.3202133048379357, + "learning_rate": 1.64148400890501e-05, + "loss": 2.6595, + "step": 49034 + }, + { + "epoch": 2.282934096887585, + "grad_norm": 0.3023516718224824, + "learning_rate": 1.6412833451133573e-05, + "loss": 2.5854, + "step": 49035 + }, + { + "epoch": 2.282980655073678, + "grad_norm": 0.3432994989023989, + "learning_rate": 1.6410826911792082e-05, + "loss": 2.6016, + "step": 49036 + }, + { + "epoch": 2.2830272132597713, + "grad_norm": 0.34137363333360365, + "learning_rate": 1.64088204710315e-05, + "loss": 2.6794, + "step": 49037 + }, + { + "epoch": 2.2830737714458644, + "grad_norm": 0.3298917106763507, + "learning_rate": 1.640681412885772e-05, + "loss": 2.6714, + "step": 49038 + }, + { + "epoch": 2.2831203296319575, + "grad_norm": 0.3259031657635559, + "learning_rate": 1.6404807885276653e-05, + "loss": 2.6517, + "step": 49039 + }, + { + "epoch": 2.2831668878180507, + "grad_norm": 0.33790818797599703, + "learning_rate": 1.6402801740294142e-05, + "loss": 2.548, + "step": 49040 + }, + { + "epoch": 2.2832134460041438, + "grad_norm": 0.31344061916372185, + "learning_rate": 1.64007956939161e-05, + "loss": 2.6004, + "step": 49041 + }, + { + "epoch": 2.283260004190237, + "grad_norm": 0.3337667200049408, + "learning_rate": 1.639878974614843e-05, + "loss": 2.5894, + "step": 49042 + }, + { + "epoch": 2.28330656237633, + "grad_norm": 0.33512605960484265, + "learning_rate": 1.639678389699697e-05, + "loss": 2.5999, + "step": 49043 + }, + { + "epoch": 2.283353120562423, + "grad_norm": 0.3330227958519239, + "learning_rate": 1.639477814646767e-05, + "loss": 2.6061, + "step": 49044 + }, + { + "epoch": 2.283399678748516, + "grad_norm": 0.3344117318168844, + "learning_rate": 1.639277249456635e-05, + "loss": 2.6365, + "step": 49045 + }, + { + "epoch": 2.283446236934609, + "grad_norm": 0.343537845878836, + "learning_rate": 1.639076694129897e-05, + "loss": 2.6438, + "step": 49046 + }, + { + "epoch": 2.283492795120702, + "grad_norm": 0.3296963888388755, + "learning_rate": 1.6388761486671357e-05, + "loss": 2.7085, + "step": 49047 + }, + { + "epoch": 2.283539353306795, + "grad_norm": 0.3401445011913392, + "learning_rate": 1.6386756130689418e-05, + "loss": 2.61, + "step": 49048 + }, + { + "epoch": 2.2835859114928883, + "grad_norm": 0.3178742402003248, + "learning_rate": 1.6384750873359046e-05, + "loss": 2.5988, + "step": 49049 + }, + { + "epoch": 2.2836324696789814, + "grad_norm": 0.31753097776515066, + "learning_rate": 1.638274571468611e-05, + "loss": 2.647, + "step": 49050 + }, + { + "epoch": 2.2836790278650745, + "grad_norm": 0.3334615901463308, + "learning_rate": 1.6380740654676508e-05, + "loss": 2.8092, + "step": 49051 + }, + { + "epoch": 2.2837255860511676, + "grad_norm": 0.3325092654825946, + "learning_rate": 1.6378735693336134e-05, + "loss": 2.6897, + "step": 49052 + }, + { + "epoch": 2.2837721442372603, + "grad_norm": 0.34611032220520654, + "learning_rate": 1.637673083067084e-05, + "loss": 2.7395, + 
"step": 49053 + }, + { + "epoch": 2.2838187024233534, + "grad_norm": 0.3358902405898821, + "learning_rate": 1.637472606668653e-05, + "loss": 2.6218, + "step": 49054 + }, + { + "epoch": 2.2838652606094465, + "grad_norm": 0.33061574503243646, + "learning_rate": 1.637272140138909e-05, + "loss": 2.5554, + "step": 49055 + }, + { + "epoch": 2.2839118187955396, + "grad_norm": 0.354713439862882, + "learning_rate": 1.6370716834784393e-05, + "loss": 2.6786, + "step": 49056 + }, + { + "epoch": 2.2839583769816327, + "grad_norm": 0.32701735549101146, + "learning_rate": 1.6368712366878353e-05, + "loss": 2.6604, + "step": 49057 + }, + { + "epoch": 2.284004935167726, + "grad_norm": 0.3241209069488097, + "learning_rate": 1.6366707997676794e-05, + "loss": 2.6321, + "step": 49058 + }, + { + "epoch": 2.284051493353819, + "grad_norm": 0.3405506488016536, + "learning_rate": 1.6364703727185664e-05, + "loss": 2.6278, + "step": 49059 + }, + { + "epoch": 2.284098051539912, + "grad_norm": 0.34899378258107344, + "learning_rate": 1.63626995554108e-05, + "loss": 2.715, + "step": 49060 + }, + { + "epoch": 2.284144609726005, + "grad_norm": 0.3244535970154532, + "learning_rate": 1.63606954823581e-05, + "loss": 2.67, + "step": 49061 + }, + { + "epoch": 2.2841911679120983, + "grad_norm": 0.3178597107969026, + "learning_rate": 1.6358691508033447e-05, + "loss": 2.7292, + "step": 49062 + }, + { + "epoch": 2.2842377260981914, + "grad_norm": 0.3622465193639571, + "learning_rate": 1.6356687632442713e-05, + "loss": 2.6636, + "step": 49063 + }, + { + "epoch": 2.284284284284284, + "grad_norm": 0.3514506838844542, + "learning_rate": 1.635468385559179e-05, + "loss": 2.709, + "step": 49064 + }, + { + "epoch": 2.284330842470377, + "grad_norm": 0.3249816182069977, + "learning_rate": 1.6352680177486574e-05, + "loss": 2.5996, + "step": 49065 + }, + { + "epoch": 2.2843774006564703, + "grad_norm": 0.3282729502022651, + "learning_rate": 1.635067659813291e-05, + "loss": 2.7082, + "step": 49066 + }, + { + "epoch": 2.2844239588425634, + "grad_norm": 0.34919001796570903, + "learning_rate": 1.6348673117536688e-05, + "loss": 2.6257, + "step": 49067 + }, + { + "epoch": 2.2844705170286566, + "grad_norm": 0.33759061167278076, + "learning_rate": 1.6346669735703808e-05, + "loss": 2.7024, + "step": 49068 + }, + { + "epoch": 2.2845170752147497, + "grad_norm": 0.3318510369910034, + "learning_rate": 1.634466645264013e-05, + "loss": 2.6512, + "step": 49069 + }, + { + "epoch": 2.284563633400843, + "grad_norm": 0.3573048546391235, + "learning_rate": 1.6342663268351555e-05, + "loss": 2.6644, + "step": 49070 + }, + { + "epoch": 2.284610191586936, + "grad_norm": 0.3556136843540086, + "learning_rate": 1.6340660182843915e-05, + "loss": 2.7056, + "step": 49071 + }, + { + "epoch": 2.284656749773029, + "grad_norm": 0.3396682935853223, + "learning_rate": 1.6338657196123153e-05, + "loss": 2.6987, + "step": 49072 + }, + { + "epoch": 2.2847033079591217, + "grad_norm": 0.34990188624772756, + "learning_rate": 1.6336654308195086e-05, + "loss": 2.7004, + "step": 49073 + }, + { + "epoch": 2.284749866145215, + "grad_norm": 0.3580503882730843, + "learning_rate": 1.6334651519065657e-05, + "loss": 2.6466, + "step": 49074 + }, + { + "epoch": 2.284796424331308, + "grad_norm": 0.3254514319053027, + "learning_rate": 1.6332648828740686e-05, + "loss": 2.5385, + "step": 49075 + }, + { + "epoch": 2.284842982517401, + "grad_norm": 0.3252705307794945, + "learning_rate": 1.6330646237226083e-05, + "loss": 2.6066, + "step": 49076 + }, + { + "epoch": 2.284889540703494, + "grad_norm": 
0.31910532143419545, + "learning_rate": 1.632864374452771e-05, + "loss": 2.5353, + "step": 49077 + }, + { + "epoch": 2.2849360988895873, + "grad_norm": 0.35318878646900936, + "learning_rate": 1.6326641350651467e-05, + "loss": 2.652, + "step": 49078 + }, + { + "epoch": 2.2849826570756804, + "grad_norm": 0.35552169421630503, + "learning_rate": 1.6324639055603203e-05, + "loss": 2.6507, + "step": 49079 + }, + { + "epoch": 2.2850292152617735, + "grad_norm": 0.33856761349060244, + "learning_rate": 1.63226368593888e-05, + "loss": 2.6814, + "step": 49080 + }, + { + "epoch": 2.2850757734478666, + "grad_norm": 0.3347162060620102, + "learning_rate": 1.6320634762014144e-05, + "loss": 2.689, + "step": 49081 + }, + { + "epoch": 2.2851223316339597, + "grad_norm": 0.33341654390393954, + "learning_rate": 1.6318632763485105e-05, + "loss": 2.5896, + "step": 49082 + }, + { + "epoch": 2.285168889820053, + "grad_norm": 0.3265582676834084, + "learning_rate": 1.6316630863807575e-05, + "loss": 2.6169, + "step": 49083 + }, + { + "epoch": 2.2852154480061455, + "grad_norm": 0.31852959674462533, + "learning_rate": 1.6314629062987387e-05, + "loss": 2.6203, + "step": 49084 + }, + { + "epoch": 2.2852620061922386, + "grad_norm": 0.32367931704045305, + "learning_rate": 1.6312627361030468e-05, + "loss": 2.6275, + "step": 49085 + }, + { + "epoch": 2.2853085643783317, + "grad_norm": 0.3449770865651724, + "learning_rate": 1.6310625757942643e-05, + "loss": 2.6624, + "step": 49086 + }, + { + "epoch": 2.285355122564425, + "grad_norm": 0.33265930109433295, + "learning_rate": 1.6308624253729848e-05, + "loss": 2.7337, + "step": 49087 + }, + { + "epoch": 2.285401680750518, + "grad_norm": 0.32597056784718126, + "learning_rate": 1.63066228483979e-05, + "loss": 2.6974, + "step": 49088 + }, + { + "epoch": 2.285448238936611, + "grad_norm": 0.31759668639292715, + "learning_rate": 1.6304621541952697e-05, + "loss": 2.5759, + "step": 49089 + }, + { + "epoch": 2.285494797122704, + "grad_norm": 0.3289488891884307, + "learning_rate": 1.630262033440011e-05, + "loss": 2.6083, + "step": 49090 + }, + { + "epoch": 2.2855413553087973, + "grad_norm": 0.3162260763949139, + "learning_rate": 1.630061922574601e-05, + "loss": 2.5487, + "step": 49091 + }, + { + "epoch": 2.2855879134948904, + "grad_norm": 0.3300777958564674, + "learning_rate": 1.6298618215996298e-05, + "loss": 2.6815, + "step": 49092 + }, + { + "epoch": 2.285634471680983, + "grad_norm": 0.3360992501310287, + "learning_rate": 1.6296617305156798e-05, + "loss": 2.7435, + "step": 49093 + }, + { + "epoch": 2.285681029867076, + "grad_norm": 0.31887675579845703, + "learning_rate": 1.6294616493233406e-05, + "loss": 2.5887, + "step": 49094 + }, + { + "epoch": 2.2857275880531693, + "grad_norm": 0.32599928386390936, + "learning_rate": 1.629261578023199e-05, + "loss": 2.5402, + "step": 49095 + }, + { + "epoch": 2.2857741462392624, + "grad_norm": 0.329772971820992, + "learning_rate": 1.6290615166158452e-05, + "loss": 2.5905, + "step": 49096 + }, + { + "epoch": 2.2858207044253556, + "grad_norm": 0.320257584792865, + "learning_rate": 1.6288614651018597e-05, + "loss": 2.6374, + "step": 49097 + }, + { + "epoch": 2.2858672626114487, + "grad_norm": 0.31261690048194135, + "learning_rate": 1.628661423481837e-05, + "loss": 2.646, + "step": 49098 + }, + { + "epoch": 2.285913820797542, + "grad_norm": 0.33292845261182796, + "learning_rate": 1.628461391756358e-05, + "loss": 2.7049, + "step": 49099 + }, + { + "epoch": 2.285960378983635, + "grad_norm": 0.30674198032987227, + "learning_rate": 1.6282613699260156e-05, + 
"loss": 2.6062, + "step": 49100 + }, + { + "epoch": 2.286006937169728, + "grad_norm": 0.3110598918607411, + "learning_rate": 1.6280613579913928e-05, + "loss": 2.7537, + "step": 49101 + }, + { + "epoch": 2.286053495355821, + "grad_norm": 0.34075050225648035, + "learning_rate": 1.627861355953077e-05, + "loss": 2.5821, + "step": 49102 + }, + { + "epoch": 2.2861000535419143, + "grad_norm": 0.35264884262364093, + "learning_rate": 1.6276613638116565e-05, + "loss": 2.5858, + "step": 49103 + }, + { + "epoch": 2.286146611728007, + "grad_norm": 0.3157373271419005, + "learning_rate": 1.6274613815677174e-05, + "loss": 2.5701, + "step": 49104 + }, + { + "epoch": 2.2861931699141, + "grad_norm": 0.3295109686524306, + "learning_rate": 1.6272614092218485e-05, + "loss": 2.6205, + "step": 49105 + }, + { + "epoch": 2.286239728100193, + "grad_norm": 0.35560367477217725, + "learning_rate": 1.6270614467746337e-05, + "loss": 2.6441, + "step": 49106 + }, + { + "epoch": 2.2862862862862863, + "grad_norm": 0.3343275727912041, + "learning_rate": 1.6268614942266607e-05, + "loss": 2.6148, + "step": 49107 + }, + { + "epoch": 2.2863328444723794, + "grad_norm": 0.3381299663870384, + "learning_rate": 1.6266615515785177e-05, + "loss": 2.6379, + "step": 49108 + }, + { + "epoch": 2.2863794026584725, + "grad_norm": 0.3564758094778143, + "learning_rate": 1.6264616188307903e-05, + "loss": 2.64, + "step": 49109 + }, + { + "epoch": 2.2864259608445656, + "grad_norm": 0.3450271100363136, + "learning_rate": 1.6262616959840656e-05, + "loss": 2.6057, + "step": 49110 + }, + { + "epoch": 2.2864725190306587, + "grad_norm": 0.3140325793563231, + "learning_rate": 1.6260617830389324e-05, + "loss": 2.6497, + "step": 49111 + }, + { + "epoch": 2.2865190772167514, + "grad_norm": 0.347594965534006, + "learning_rate": 1.6258618799959718e-05, + "loss": 2.6577, + "step": 49112 + }, + { + "epoch": 2.2865656354028445, + "grad_norm": 0.3577894791483375, + "learning_rate": 1.6256619868557777e-05, + "loss": 2.6055, + "step": 49113 + }, + { + "epoch": 2.2866121935889376, + "grad_norm": 0.3317240129402655, + "learning_rate": 1.6254621036189316e-05, + "loss": 2.6709, + "step": 49114 + }, + { + "epoch": 2.2866587517750308, + "grad_norm": 0.3384026310458577, + "learning_rate": 1.6252622302860216e-05, + "loss": 2.6678, + "step": 49115 + }, + { + "epoch": 2.286705309961124, + "grad_norm": 0.35630870214946153, + "learning_rate": 1.625062366857635e-05, + "loss": 2.563, + "step": 49116 + }, + { + "epoch": 2.286751868147217, + "grad_norm": 0.33540979575820684, + "learning_rate": 1.624862513334357e-05, + "loss": 2.6208, + "step": 49117 + }, + { + "epoch": 2.28679842633331, + "grad_norm": 0.32512670128738086, + "learning_rate": 1.6246626697167773e-05, + "loss": 2.6319, + "step": 49118 + }, + { + "epoch": 2.286844984519403, + "grad_norm": 0.3613519077338674, + "learning_rate": 1.624462836005478e-05, + "loss": 2.625, + "step": 49119 + }, + { + "epoch": 2.2868915427054963, + "grad_norm": 0.36584153671711894, + "learning_rate": 1.6242630122010483e-05, + "loss": 2.6178, + "step": 49120 + }, + { + "epoch": 2.2869381008915894, + "grad_norm": 0.32953847994814844, + "learning_rate": 1.624063198304074e-05, + "loss": 2.6011, + "step": 49121 + }, + { + "epoch": 2.2869846590776826, + "grad_norm": 0.3581828704160662, + "learning_rate": 1.6238633943151415e-05, + "loss": 2.6961, + "step": 49122 + }, + { + "epoch": 2.2870312172637752, + "grad_norm": 0.3683963054878485, + "learning_rate": 1.623663600234837e-05, + "loss": 2.6701, + "step": 49123 + }, + { + "epoch": 2.2870777754498683, + 
"grad_norm": 0.34263988182125643, + "learning_rate": 1.6234638160637494e-05, + "loss": 2.5831, + "step": 49124 + }, + { + "epoch": 2.2871243336359615, + "grad_norm": 0.33296784611921654, + "learning_rate": 1.6232640418024597e-05, + "loss": 2.6126, + "step": 49125 + }, + { + "epoch": 2.2871708918220546, + "grad_norm": 0.36037122377072023, + "learning_rate": 1.6230642774515602e-05, + "loss": 2.7336, + "step": 49126 + }, + { + "epoch": 2.2872174500081477, + "grad_norm": 0.32952240042656555, + "learning_rate": 1.622864523011633e-05, + "loss": 2.5828, + "step": 49127 + }, + { + "epoch": 2.287264008194241, + "grad_norm": 0.3315207905234223, + "learning_rate": 1.6226647784832655e-05, + "loss": 2.6617, + "step": 49128 + }, + { + "epoch": 2.287310566380334, + "grad_norm": 0.32804050742466023, + "learning_rate": 1.6224650438670443e-05, + "loss": 2.5995, + "step": 49129 + }, + { + "epoch": 2.287357124566427, + "grad_norm": 0.3533588171189934, + "learning_rate": 1.622265319163555e-05, + "loss": 2.6604, + "step": 49130 + }, + { + "epoch": 2.28740368275252, + "grad_norm": 0.3429314364458346, + "learning_rate": 1.622065604373386e-05, + "loss": 2.6958, + "step": 49131 + }, + { + "epoch": 2.287450240938613, + "grad_norm": 0.30335351012495804, + "learning_rate": 1.62186589949712e-05, + "loss": 2.6458, + "step": 49132 + }, + { + "epoch": 2.287496799124706, + "grad_norm": 0.3313807203696827, + "learning_rate": 1.6216662045353454e-05, + "loss": 2.6564, + "step": 49133 + }, + { + "epoch": 2.287543357310799, + "grad_norm": 0.317723984977221, + "learning_rate": 1.6214665194886476e-05, + "loss": 2.6504, + "step": 49134 + }, + { + "epoch": 2.287589915496892, + "grad_norm": 0.3372773925034316, + "learning_rate": 1.6212668443576122e-05, + "loss": 2.7013, + "step": 49135 + }, + { + "epoch": 2.2876364736829853, + "grad_norm": 0.3204313197418301, + "learning_rate": 1.621067179142826e-05, + "loss": 2.634, + "step": 49136 + }, + { + "epoch": 2.2876830318690784, + "grad_norm": 0.32498870305871536, + "learning_rate": 1.6208675238448768e-05, + "loss": 2.6055, + "step": 49137 + }, + { + "epoch": 2.2877295900551715, + "grad_norm": 0.31125857213787544, + "learning_rate": 1.6206678784643443e-05, + "loss": 2.6974, + "step": 49138 + }, + { + "epoch": 2.2877761482412646, + "grad_norm": 0.3142188366770498, + "learning_rate": 1.6204682430018226e-05, + "loss": 2.6745, + "step": 49139 + }, + { + "epoch": 2.2878227064273577, + "grad_norm": 0.3438861215201424, + "learning_rate": 1.620268617457892e-05, + "loss": 2.7254, + "step": 49140 + }, + { + "epoch": 2.287869264613451, + "grad_norm": 0.320168746268624, + "learning_rate": 1.62006900183314e-05, + "loss": 2.5822, + "step": 49141 + }, + { + "epoch": 2.287915822799544, + "grad_norm": 0.3199540429834268, + "learning_rate": 1.6198693961281526e-05, + "loss": 2.5679, + "step": 49142 + }, + { + "epoch": 2.2879623809856366, + "grad_norm": 0.329509050138619, + "learning_rate": 1.619669800343515e-05, + "loss": 2.5668, + "step": 49143 + }, + { + "epoch": 2.2880089391717298, + "grad_norm": 0.3600537271416793, + "learning_rate": 1.6194702144798156e-05, + "loss": 2.7154, + "step": 49144 + }, + { + "epoch": 2.288055497357823, + "grad_norm": 0.32273259295226614, + "learning_rate": 1.6192706385376343e-05, + "loss": 2.7117, + "step": 49145 + }, + { + "epoch": 2.288102055543916, + "grad_norm": 0.31788472568757925, + "learning_rate": 1.6190710725175645e-05, + "loss": 2.7305, + "step": 49146 + }, + { + "epoch": 2.288148613730009, + "grad_norm": 0.32445571574141613, + "learning_rate": 1.618871516420186e-05, 
+ "loss": 2.7173, + "step": 49147 + }, + { + "epoch": 2.288195171916102, + "grad_norm": 0.32757321504704473, + "learning_rate": 1.6186719702460866e-05, + "loss": 2.7301, + "step": 49148 + }, + { + "epoch": 2.2882417301021953, + "grad_norm": 0.32908063474156496, + "learning_rate": 1.6184724339958523e-05, + "loss": 2.6305, + "step": 49149 + }, + { + "epoch": 2.2882882882882885, + "grad_norm": 0.3228046017979521, + "learning_rate": 1.6182729076700696e-05, + "loss": 2.6738, + "step": 49150 + }, + { + "epoch": 2.288334846474381, + "grad_norm": 0.314481135639012, + "learning_rate": 1.6180733912693196e-05, + "loss": 2.7046, + "step": 49151 + }, + { + "epoch": 2.2883814046604742, + "grad_norm": 0.3447024360235822, + "learning_rate": 1.6178738847941938e-05, + "loss": 2.7453, + "step": 49152 + }, + { + "epoch": 2.2884279628465674, + "grad_norm": 0.31223459986120455, + "learning_rate": 1.617674388245274e-05, + "loss": 2.5265, + "step": 49153 + }, + { + "epoch": 2.2884745210326605, + "grad_norm": 0.33042345673159446, + "learning_rate": 1.6174749016231462e-05, + "loss": 2.5993, + "step": 49154 + }, + { + "epoch": 2.2885210792187536, + "grad_norm": 0.3525018336025778, + "learning_rate": 1.6172754249283968e-05, + "loss": 2.6275, + "step": 49155 + }, + { + "epoch": 2.2885676374048467, + "grad_norm": 0.3165493439027034, + "learning_rate": 1.61707595816161e-05, + "loss": 2.6009, + "step": 49156 + }, + { + "epoch": 2.28861419559094, + "grad_norm": 0.33283136698008714, + "learning_rate": 1.616876501323374e-05, + "loss": 2.607, + "step": 49157 + }, + { + "epoch": 2.288660753777033, + "grad_norm": 0.36726326929788433, + "learning_rate": 1.616677054414269e-05, + "loss": 2.7363, + "step": 49158 + }, + { + "epoch": 2.288707311963126, + "grad_norm": 0.3371745602819325, + "learning_rate": 1.6164776174348867e-05, + "loss": 2.6752, + "step": 49159 + }, + { + "epoch": 2.288753870149219, + "grad_norm": 0.34065678613428685, + "learning_rate": 1.6162781903858075e-05, + "loss": 2.64, + "step": 49160 + }, + { + "epoch": 2.2888004283353123, + "grad_norm": 0.3434647452190576, + "learning_rate": 1.6160787732676185e-05, + "loss": 2.7198, + "step": 49161 + }, + { + "epoch": 2.288846986521405, + "grad_norm": 0.32685783042212363, + "learning_rate": 1.6158793660809053e-05, + "loss": 2.6296, + "step": 49162 + }, + { + "epoch": 2.288893544707498, + "grad_norm": 0.30688175735991163, + "learning_rate": 1.6156799688262525e-05, + "loss": 2.5469, + "step": 49163 + }, + { + "epoch": 2.288940102893591, + "grad_norm": 0.3178640956556998, + "learning_rate": 1.6154805815042458e-05, + "loss": 2.6502, + "step": 49164 + }, + { + "epoch": 2.2889866610796843, + "grad_norm": 0.3583706912051041, + "learning_rate": 1.6152812041154714e-05, + "loss": 2.6793, + "step": 49165 + }, + { + "epoch": 2.2890332192657774, + "grad_norm": 0.33000494210714365, + "learning_rate": 1.615081836660512e-05, + "loss": 2.5459, + "step": 49166 + }, + { + "epoch": 2.2890797774518705, + "grad_norm": 0.3625030523760243, + "learning_rate": 1.6148824791399535e-05, + "loss": 2.5912, + "step": 49167 + }, + { + "epoch": 2.2891263356379636, + "grad_norm": 0.33780466284185046, + "learning_rate": 1.614683131554382e-05, + "loss": 2.7377, + "step": 49168 + }, + { + "epoch": 2.2891728938240568, + "grad_norm": 0.33918243611282045, + "learning_rate": 1.6144837939043815e-05, + "loss": 2.6417, + "step": 49169 + }, + { + "epoch": 2.28921945201015, + "grad_norm": 0.3417221035256002, + "learning_rate": 1.6142844661905393e-05, + "loss": 2.6017, + "step": 49170 + }, + { + "epoch": 
2.2892660101962425, + "grad_norm": 0.3384489782698968, + "learning_rate": 1.6140851484134355e-05, + "loss": 2.6021, + "step": 49171 + }, + { + "epoch": 2.2893125683823357, + "grad_norm": 0.3330062825858356, + "learning_rate": 1.6138858405736612e-05, + "loss": 2.6456, + "step": 49172 + }, + { + "epoch": 2.2893591265684288, + "grad_norm": 0.34688075106114363, + "learning_rate": 1.6136865426717972e-05, + "loss": 2.6673, + "step": 49173 + }, + { + "epoch": 2.289405684754522, + "grad_norm": 0.33211465190703526, + "learning_rate": 1.613487254708429e-05, + "loss": 2.7181, + "step": 49174 + }, + { + "epoch": 2.289452242940615, + "grad_norm": 0.3397672088341365, + "learning_rate": 1.6132879766841423e-05, + "loss": 2.5819, + "step": 49175 + }, + { + "epoch": 2.289498801126708, + "grad_norm": 0.3282518064735498, + "learning_rate": 1.6130887085995223e-05, + "loss": 2.6266, + "step": 49176 + }, + { + "epoch": 2.2895453593128012, + "grad_norm": 0.3289383039166996, + "learning_rate": 1.6128894504551527e-05, + "loss": 2.5712, + "step": 49177 + }, + { + "epoch": 2.2895919174988943, + "grad_norm": 0.3422491578124328, + "learning_rate": 1.6126902022516204e-05, + "loss": 2.6287, + "step": 49178 + }, + { + "epoch": 2.2896384756849875, + "grad_norm": 0.3430076549436383, + "learning_rate": 1.6124909639895075e-05, + "loss": 2.6426, + "step": 49179 + }, + { + "epoch": 2.2896850338710806, + "grad_norm": 0.3016719349145123, + "learning_rate": 1.6122917356693995e-05, + "loss": 2.652, + "step": 49180 + }, + { + "epoch": 2.2897315920571737, + "grad_norm": 0.32812469313820375, + "learning_rate": 1.6120925172918817e-05, + "loss": 2.6583, + "step": 49181 + }, + { + "epoch": 2.2897781502432664, + "grad_norm": 0.32736767999184685, + "learning_rate": 1.611893308857539e-05, + "loss": 2.6682, + "step": 49182 + }, + { + "epoch": 2.2898247084293595, + "grad_norm": 0.3453526150754718, + "learning_rate": 1.6116941103669565e-05, + "loss": 2.6222, + "step": 49183 + }, + { + "epoch": 2.2898712666154526, + "grad_norm": 0.32472752894690693, + "learning_rate": 1.6114949218207147e-05, + "loss": 2.5928, + "step": 49184 + }, + { + "epoch": 2.2899178248015457, + "grad_norm": 0.3302707805289219, + "learning_rate": 1.6112957432194054e-05, + "loss": 2.5329, + "step": 49185 + }, + { + "epoch": 2.289964382987639, + "grad_norm": 0.32200223364629676, + "learning_rate": 1.611096574563606e-05, + "loss": 2.619, + "step": 49186 + }, + { + "epoch": 2.290010941173732, + "grad_norm": 0.3429041507906596, + "learning_rate": 1.610897415853907e-05, + "loss": 2.6603, + "step": 49187 + }, + { + "epoch": 2.290057499359825, + "grad_norm": 0.32961591767751447, + "learning_rate": 1.610698267090889e-05, + "loss": 2.5959, + "step": 49188 + }, + { + "epoch": 2.290104057545918, + "grad_norm": 0.32410938423211005, + "learning_rate": 1.6104991282751376e-05, + "loss": 2.603, + "step": 49189 + }, + { + "epoch": 2.290150615732011, + "grad_norm": 0.345649115322483, + "learning_rate": 1.610299999407237e-05, + "loss": 2.8376, + "step": 49190 + }, + { + "epoch": 2.290197173918104, + "grad_norm": 0.34049583473149675, + "learning_rate": 1.6101008804877736e-05, + "loss": 2.6133, + "step": 49191 + }, + { + "epoch": 2.290243732104197, + "grad_norm": 0.33641080080960517, + "learning_rate": 1.6099017715173288e-05, + "loss": 2.6444, + "step": 49192 + }, + { + "epoch": 2.29029029029029, + "grad_norm": 0.3338980906830645, + "learning_rate": 1.609702672496488e-05, + "loss": 2.6563, + "step": 49193 + }, + { + "epoch": 2.2903368484763833, + "grad_norm": 0.30886594952780844, + 
"learning_rate": 1.6095035834258366e-05, + "loss": 2.6131, + "step": 49194 + }, + { + "epoch": 2.2903834066624764, + "grad_norm": 0.3433783238640324, + "learning_rate": 1.6093045043059573e-05, + "loss": 2.6617, + "step": 49195 + }, + { + "epoch": 2.2904299648485695, + "grad_norm": 0.3342274265941722, + "learning_rate": 1.6091054351374373e-05, + "loss": 2.5055, + "step": 49196 + }, + { + "epoch": 2.2904765230346626, + "grad_norm": 0.32474089129079453, + "learning_rate": 1.6089063759208557e-05, + "loss": 2.5905, + "step": 49197 + }, + { + "epoch": 2.2905230812207558, + "grad_norm": 0.32052871972593927, + "learning_rate": 1.608707326656803e-05, + "loss": 2.6057, + "step": 49198 + }, + { + "epoch": 2.290569639406849, + "grad_norm": 0.31279676157815733, + "learning_rate": 1.6085082873458566e-05, + "loss": 2.5843, + "step": 49199 + }, + { + "epoch": 2.290616197592942, + "grad_norm": 0.35333330696554965, + "learning_rate": 1.6083092579886077e-05, + "loss": 2.6761, + "step": 49200 + }, + { + "epoch": 2.2906627557790347, + "grad_norm": 0.3238479193618447, + "learning_rate": 1.6081102385856355e-05, + "loss": 2.7069, + "step": 49201 + }, + { + "epoch": 2.290709313965128, + "grad_norm": 0.3382133053642898, + "learning_rate": 1.6079112291375254e-05, + "loss": 2.6538, + "step": 49202 + }, + { + "epoch": 2.290755872151221, + "grad_norm": 0.3217160547705732, + "learning_rate": 1.6077122296448615e-05, + "loss": 2.5524, + "step": 49203 + }, + { + "epoch": 2.290802430337314, + "grad_norm": 0.319008071216, + "learning_rate": 1.6075132401082283e-05, + "loss": 2.557, + "step": 49204 + }, + { + "epoch": 2.290848988523407, + "grad_norm": 0.3137872183976851, + "learning_rate": 1.6073142605282116e-05, + "loss": 2.6429, + "step": 49205 + }, + { + "epoch": 2.2908955467095002, + "grad_norm": 0.3137431264080425, + "learning_rate": 1.607115290905391e-05, + "loss": 2.6915, + "step": 49206 + }, + { + "epoch": 2.2909421048955934, + "grad_norm": 0.32701883289102773, + "learning_rate": 1.606916331240353e-05, + "loss": 2.6545, + "step": 49207 + }, + { + "epoch": 2.2909886630816865, + "grad_norm": 0.35424629141264247, + "learning_rate": 1.6067173815336812e-05, + "loss": 2.6433, + "step": 49208 + }, + { + "epoch": 2.2910352212677796, + "grad_norm": 0.3328524154791796, + "learning_rate": 1.6065184417859614e-05, + "loss": 2.6459, + "step": 49209 + }, + { + "epoch": 2.2910817794538723, + "grad_norm": 0.33265144484593595, + "learning_rate": 1.606319511997772e-05, + "loss": 2.6173, + "step": 49210 + }, + { + "epoch": 2.2911283376399654, + "grad_norm": 0.3703866250728622, + "learning_rate": 1.6061205921697043e-05, + "loss": 2.6086, + "step": 49211 + }, + { + "epoch": 2.2911748958260585, + "grad_norm": 0.32713794532518553, + "learning_rate": 1.6059216823023342e-05, + "loss": 2.6407, + "step": 49212 + }, + { + "epoch": 2.2912214540121516, + "grad_norm": 0.34769247532498126, + "learning_rate": 1.605722782396254e-05, + "loss": 2.6814, + "step": 49213 + }, + { + "epoch": 2.2912680121982447, + "grad_norm": 0.34601495122060616, + "learning_rate": 1.6055238924520405e-05, + "loss": 2.6764, + "step": 49214 + }, + { + "epoch": 2.291314570384338, + "grad_norm": 0.33063414807062974, + "learning_rate": 1.60532501247028e-05, + "loss": 2.7103, + "step": 49215 + }, + { + "epoch": 2.291361128570431, + "grad_norm": 0.3429208487098751, + "learning_rate": 1.605126142451557e-05, + "loss": 2.5753, + "step": 49216 + }, + { + "epoch": 2.291407686756524, + "grad_norm": 0.3471385854462711, + "learning_rate": 1.6049272823964533e-05, + "loss": 2.5191, + "step": 
49217 + }, + { + "epoch": 2.291454244942617, + "grad_norm": 0.33094191253547256, + "learning_rate": 1.604728432305556e-05, + "loss": 2.5812, + "step": 49218 + }, + { + "epoch": 2.2915008031287103, + "grad_norm": 0.36590580579315624, + "learning_rate": 1.604529592179444e-05, + "loss": 2.6419, + "step": 49219 + }, + { + "epoch": 2.2915473613148034, + "grad_norm": 0.3379983353072727, + "learning_rate": 1.6043307620187036e-05, + "loss": 2.6579, + "step": 49220 + }, + { + "epoch": 2.291593919500896, + "grad_norm": 0.3324367679905105, + "learning_rate": 1.6041319418239174e-05, + "loss": 2.6966, + "step": 49221 + }, + { + "epoch": 2.291640477686989, + "grad_norm": 0.3407061868447287, + "learning_rate": 1.6039331315956702e-05, + "loss": 2.5954, + "step": 49222 + }, + { + "epoch": 2.2916870358730823, + "grad_norm": 0.33543780825587866, + "learning_rate": 1.6037343313345443e-05, + "loss": 2.6655, + "step": 49223 + }, + { + "epoch": 2.2917335940591754, + "grad_norm": 0.3249305222085426, + "learning_rate": 1.603535541041125e-05, + "loss": 2.6582, + "step": 49224 + }, + { + "epoch": 2.2917801522452685, + "grad_norm": 0.3415933926494421, + "learning_rate": 1.603336760715991e-05, + "loss": 2.5802, + "step": 49225 + }, + { + "epoch": 2.2918267104313617, + "grad_norm": 0.3130127916251285, + "learning_rate": 1.6031379903597323e-05, + "loss": 2.6273, + "step": 49226 + }, + { + "epoch": 2.2918732686174548, + "grad_norm": 0.3170349038415283, + "learning_rate": 1.6029392299729274e-05, + "loss": 2.6128, + "step": 49227 + }, + { + "epoch": 2.291919826803548, + "grad_norm": 0.34828136374727475, + "learning_rate": 1.6027404795561614e-05, + "loss": 2.6592, + "step": 49228 + }, + { + "epoch": 2.2919663849896406, + "grad_norm": 0.33135155199340144, + "learning_rate": 1.602541739110017e-05, + "loss": 2.6423, + "step": 49229 + }, + { + "epoch": 2.2920129431757337, + "grad_norm": 0.33924900258784507, + "learning_rate": 1.6023430086350782e-05, + "loss": 2.6277, + "step": 49230 + }, + { + "epoch": 2.292059501361827, + "grad_norm": 0.3132890980985134, + "learning_rate": 1.6021442881319293e-05, + "loss": 2.6757, + "step": 49231 + }, + { + "epoch": 2.29210605954792, + "grad_norm": 0.34016195173250285, + "learning_rate": 1.6019455776011506e-05, + "loss": 2.6091, + "step": 49232 + }, + { + "epoch": 2.292152617734013, + "grad_norm": 0.3335513977333532, + "learning_rate": 1.601746877043327e-05, + "loss": 2.6074, + "step": 49233 + }, + { + "epoch": 2.292199175920106, + "grad_norm": 0.33706395458234784, + "learning_rate": 1.6015481864590416e-05, + "loss": 2.6435, + "step": 49234 + }, + { + "epoch": 2.2922457341061993, + "grad_norm": 0.3200565578100985, + "learning_rate": 1.601349505848877e-05, + "loss": 2.7089, + "step": 49235 + }, + { + "epoch": 2.2922922922922924, + "grad_norm": 0.33306216955959467, + "learning_rate": 1.6011508352134174e-05, + "loss": 2.6404, + "step": 49236 + }, + { + "epoch": 2.2923388504783855, + "grad_norm": 0.31509377141910994, + "learning_rate": 1.6009521745532465e-05, + "loss": 2.5854, + "step": 49237 + }, + { + "epoch": 2.2923854086644786, + "grad_norm": 0.3116557753751219, + "learning_rate": 1.600753523868943e-05, + "loss": 2.632, + "step": 49238 + }, + { + "epoch": 2.2924319668505717, + "grad_norm": 0.34323983293306537, + "learning_rate": 1.600554883161096e-05, + "loss": 2.6739, + "step": 49239 + }, + { + "epoch": 2.2924785250366644, + "grad_norm": 0.3308059197638708, + "learning_rate": 1.6003562524302847e-05, + "loss": 2.6327, + "step": 49240 + }, + { + "epoch": 2.2925250832227575, + "grad_norm": 
0.33692315302961173, + "learning_rate": 1.6001576316770922e-05, + "loss": 2.5791, + "step": 49241 + }, + { + "epoch": 2.2925716414088506, + "grad_norm": 0.3474174977156484, + "learning_rate": 1.599959020902102e-05, + "loss": 2.8188, + "step": 49242 + }, + { + "epoch": 2.2926181995949437, + "grad_norm": 0.3338277579933255, + "learning_rate": 1.5997604201058976e-05, + "loss": 2.5992, + "step": 49243 + }, + { + "epoch": 2.292664757781037, + "grad_norm": 0.31682354060974177, + "learning_rate": 1.5995618292890623e-05, + "loss": 2.5903, + "step": 49244 + }, + { + "epoch": 2.29271131596713, + "grad_norm": 0.3110862394702757, + "learning_rate": 1.599363248452177e-05, + "loss": 2.6876, + "step": 49245 + }, + { + "epoch": 2.292757874153223, + "grad_norm": 0.35812319546446497, + "learning_rate": 1.5991646775958254e-05, + "loss": 2.6153, + "step": 49246 + }, + { + "epoch": 2.292804432339316, + "grad_norm": 0.318254670771162, + "learning_rate": 1.5989661167205903e-05, + "loss": 2.6474, + "step": 49247 + }, + { + "epoch": 2.2928509905254093, + "grad_norm": 0.3331879294539427, + "learning_rate": 1.598767565827055e-05, + "loss": 2.6171, + "step": 49248 + }, + { + "epoch": 2.292897548711502, + "grad_norm": 0.32730442561448847, + "learning_rate": 1.5985690249158015e-05, + "loss": 2.5988, + "step": 49249 + }, + { + "epoch": 2.292944106897595, + "grad_norm": 0.31419602867292834, + "learning_rate": 1.5983704939874143e-05, + "loss": 2.6685, + "step": 49250 + }, + { + "epoch": 2.292990665083688, + "grad_norm": 0.3215114713020214, + "learning_rate": 1.5981719730424715e-05, + "loss": 2.694, + "step": 49251 + }, + { + "epoch": 2.2930372232697813, + "grad_norm": 0.3084419165067857, + "learning_rate": 1.5979734620815623e-05, + "loss": 2.6753, + "step": 49252 + }, + { + "epoch": 2.2930837814558744, + "grad_norm": 0.32977111793758934, + "learning_rate": 1.597774961105264e-05, + "loss": 2.6781, + "step": 49253 + }, + { + "epoch": 2.2931303396419676, + "grad_norm": 0.34438866844317517, + "learning_rate": 1.597576470114161e-05, + "loss": 2.684, + "step": 49254 + }, + { + "epoch": 2.2931768978280607, + "grad_norm": 0.3044531138474743, + "learning_rate": 1.597377989108836e-05, + "loss": 2.5303, + "step": 49255 + }, + { + "epoch": 2.293223456014154, + "grad_norm": 0.3427158494843098, + "learning_rate": 1.5971795180898707e-05, + "loss": 2.6284, + "step": 49256 + }, + { + "epoch": 2.293270014200247, + "grad_norm": 0.32244527717556376, + "learning_rate": 1.5969810570578508e-05, + "loss": 2.652, + "step": 49257 + }, + { + "epoch": 2.29331657238634, + "grad_norm": 0.3129461112198813, + "learning_rate": 1.596782606013353e-05, + "loss": 2.6644, + "step": 49258 + }, + { + "epoch": 2.293363130572433, + "grad_norm": 0.3264957471588115, + "learning_rate": 1.5965841649569656e-05, + "loss": 2.6584, + "step": 49259 + }, + { + "epoch": 2.293409688758526, + "grad_norm": 0.32991744946531776, + "learning_rate": 1.596385733889267e-05, + "loss": 2.7535, + "step": 49260 + }, + { + "epoch": 2.293456246944619, + "grad_norm": 0.32230216730396577, + "learning_rate": 1.5961873128108414e-05, + "loss": 2.6647, + "step": 49261 + }, + { + "epoch": 2.293502805130712, + "grad_norm": 0.3162009976002414, + "learning_rate": 1.59598890172227e-05, + "loss": 2.7036, + "step": 49262 + }, + { + "epoch": 2.293549363316805, + "grad_norm": 0.3244519174806333, + "learning_rate": 1.5957905006241375e-05, + "loss": 2.6805, + "step": 49263 + }, + { + "epoch": 2.2935959215028983, + "grad_norm": 0.32447002881625264, + "learning_rate": 1.5955921095170218e-05, + "loss": 
2.5391, + "step": 49264 + }, + { + "epoch": 2.2936424796889914, + "grad_norm": 0.30948717626035815, + "learning_rate": 1.595393728401511e-05, + "loss": 2.5911, + "step": 49265 + }, + { + "epoch": 2.2936890378750845, + "grad_norm": 0.30765960972689615, + "learning_rate": 1.595195357278182e-05, + "loss": 2.5868, + "step": 49266 + }, + { + "epoch": 2.2937355960611776, + "grad_norm": 0.310823323152868, + "learning_rate": 1.5949969961476196e-05, + "loss": 2.5754, + "step": 49267 + }, + { + "epoch": 2.2937821542472703, + "grad_norm": 0.3162271475798778, + "learning_rate": 1.5947986450104057e-05, + "loss": 2.5406, + "step": 49268 + }, + { + "epoch": 2.2938287124333634, + "grad_norm": 0.3175780816987668, + "learning_rate": 1.594600303867122e-05, + "loss": 2.5155, + "step": 49269 + }, + { + "epoch": 2.2938752706194565, + "grad_norm": 0.3295720799674656, + "learning_rate": 1.594401972718353e-05, + "loss": 2.6249, + "step": 49270 + }, + { + "epoch": 2.2939218288055496, + "grad_norm": 0.32441660072775164, + "learning_rate": 1.5942036515646752e-05, + "loss": 2.5458, + "step": 49271 + }, + { + "epoch": 2.2939683869916427, + "grad_norm": 0.32975748938100075, + "learning_rate": 1.5940053404066772e-05, + "loss": 2.5952, + "step": 49272 + }, + { + "epoch": 2.294014945177736, + "grad_norm": 0.3630813133193469, + "learning_rate": 1.5938070392449365e-05, + "loss": 2.6794, + "step": 49273 + }, + { + "epoch": 2.294061503363829, + "grad_norm": 0.33902958540833955, + "learning_rate": 1.5936087480800366e-05, + "loss": 2.6945, + "step": 49274 + }, + { + "epoch": 2.294108061549922, + "grad_norm": 0.31778614758579954, + "learning_rate": 1.5934104669125598e-05, + "loss": 2.75, + "step": 49275 + }, + { + "epoch": 2.294154619736015, + "grad_norm": 0.31607795767526575, + "learning_rate": 1.5932121957430875e-05, + "loss": 2.6426, + "step": 49276 + }, + { + "epoch": 2.2942011779221083, + "grad_norm": 0.3218332561052183, + "learning_rate": 1.5930139345722012e-05, + "loss": 2.6097, + "step": 49277 + }, + { + "epoch": 2.2942477361082014, + "grad_norm": 0.33307495552580024, + "learning_rate": 1.5928156834004853e-05, + "loss": 2.6043, + "step": 49278 + }, + { + "epoch": 2.294294294294294, + "grad_norm": 0.34167743593409355, + "learning_rate": 1.5926174422285168e-05, + "loss": 2.6371, + "step": 49279 + }, + { + "epoch": 2.294340852480387, + "grad_norm": 0.30833678196492453, + "learning_rate": 1.5924192110568835e-05, + "loss": 2.5373, + "step": 49280 + }, + { + "epoch": 2.2943874106664803, + "grad_norm": 0.3141295132137919, + "learning_rate": 1.5922209898861622e-05, + "loss": 2.5146, + "step": 49281 + }, + { + "epoch": 2.2944339688525734, + "grad_norm": 0.3260245924715619, + "learning_rate": 1.5920227787169366e-05, + "loss": 2.642, + "step": 49282 + }, + { + "epoch": 2.2944805270386666, + "grad_norm": 0.34694852154542977, + "learning_rate": 1.5918245775497904e-05, + "loss": 2.6171, + "step": 49283 + }, + { + "epoch": 2.2945270852247597, + "grad_norm": 0.35798426067134487, + "learning_rate": 1.5916263863853e-05, + "loss": 2.5954, + "step": 49284 + }, + { + "epoch": 2.294573643410853, + "grad_norm": 0.3572377786636479, + "learning_rate": 1.5914282052240535e-05, + "loss": 2.6892, + "step": 49285 + }, + { + "epoch": 2.294620201596946, + "grad_norm": 0.3136495783355408, + "learning_rate": 1.5912300340666274e-05, + "loss": 2.6074, + "step": 49286 + }, + { + "epoch": 2.294666759783039, + "grad_norm": 0.3075836457778912, + "learning_rate": 1.591031872913606e-05, + "loss": 2.6231, + "step": 49287 + }, + { + "epoch": 2.2947133179691317, + 
"grad_norm": 0.3357018152051148, + "learning_rate": 1.59083372176557e-05, + "loss": 2.7099, + "step": 49288 + }, + { + "epoch": 2.294759876155225, + "grad_norm": 0.35023812838049706, + "learning_rate": 1.590635580623101e-05, + "loss": 2.6589, + "step": 49289 + }, + { + "epoch": 2.294806434341318, + "grad_norm": 0.3315462206452318, + "learning_rate": 1.5904374494867807e-05, + "loss": 2.511, + "step": 49290 + }, + { + "epoch": 2.294852992527411, + "grad_norm": 0.3147615723740979, + "learning_rate": 1.5902393283571926e-05, + "loss": 2.5426, + "step": 49291 + }, + { + "epoch": 2.294899550713504, + "grad_norm": 0.3137758211787975, + "learning_rate": 1.5900412172349128e-05, + "loss": 2.6336, + "step": 49292 + }, + { + "epoch": 2.2949461088995973, + "grad_norm": 0.34700524968732116, + "learning_rate": 1.589843116120529e-05, + "loss": 2.6998, + "step": 49293 + }, + { + "epoch": 2.2949926670856904, + "grad_norm": 0.31819581459980617, + "learning_rate": 1.5896450250146183e-05, + "loss": 2.4943, + "step": 49294 + }, + { + "epoch": 2.2950392252717835, + "grad_norm": 0.330322609983849, + "learning_rate": 1.5894469439177635e-05, + "loss": 2.6304, + "step": 49295 + }, + { + "epoch": 2.2950857834578766, + "grad_norm": 0.329768051285136, + "learning_rate": 1.5892488728305472e-05, + "loss": 2.5421, + "step": 49296 + }, + { + "epoch": 2.2951323416439697, + "grad_norm": 0.3593204499112178, + "learning_rate": 1.5890508117535462e-05, + "loss": 2.6983, + "step": 49297 + }, + { + "epoch": 2.295178899830063, + "grad_norm": 0.323681062838388, + "learning_rate": 1.588852760687349e-05, + "loss": 2.5694, + "step": 49298 + }, + { + "epoch": 2.2952254580161555, + "grad_norm": 0.32826261888370944, + "learning_rate": 1.5886547196325306e-05, + "loss": 2.7104, + "step": 49299 + }, + { + "epoch": 2.2952720162022486, + "grad_norm": 0.33906847504736204, + "learning_rate": 1.5884566885896744e-05, + "loss": 2.5628, + "step": 49300 + }, + { + "epoch": 2.2953185743883417, + "grad_norm": 0.33557397656282095, + "learning_rate": 1.5882586675593626e-05, + "loss": 2.6564, + "step": 49301 + }, + { + "epoch": 2.295365132574435, + "grad_norm": 0.30502556580469753, + "learning_rate": 1.5880606565421745e-05, + "loss": 2.5858, + "step": 49302 + }, + { + "epoch": 2.295411690760528, + "grad_norm": 0.32026042744441147, + "learning_rate": 1.587862655538693e-05, + "loss": 2.5786, + "step": 49303 + }, + { + "epoch": 2.295458248946621, + "grad_norm": 0.3283773671598025, + "learning_rate": 1.5876646645494995e-05, + "loss": 2.6355, + "step": 49304 + }, + { + "epoch": 2.295504807132714, + "grad_norm": 0.31905172524797776, + "learning_rate": 1.587466683575171e-05, + "loss": 2.6427, + "step": 49305 + }, + { + "epoch": 2.2955513653188073, + "grad_norm": 0.30750694747046436, + "learning_rate": 1.587268712616295e-05, + "loss": 2.6361, + "step": 49306 + }, + { + "epoch": 2.2955979235049004, + "grad_norm": 0.31231268240573656, + "learning_rate": 1.587070751673447e-05, + "loss": 2.6923, + "step": 49307 + }, + { + "epoch": 2.295644481690993, + "grad_norm": 0.3294272905408369, + "learning_rate": 1.586872800747211e-05, + "loss": 2.7805, + "step": 49308 + }, + { + "epoch": 2.2956910398770862, + "grad_norm": 0.33386899585052926, + "learning_rate": 1.5866748598381682e-05, + "loss": 2.6749, + "step": 49309 + }, + { + "epoch": 2.2957375980631793, + "grad_norm": 0.31995391934090794, + "learning_rate": 1.586476928946895e-05, + "loss": 2.7051, + "step": 49310 + }, + { + "epoch": 2.2957841562492725, + "grad_norm": 0.3529400182012179, + "learning_rate": 
1.5862790080739793e-05, + "loss": 2.6333, + "step": 49311 + }, + { + "epoch": 2.2958307144353656, + "grad_norm": 0.3476374565907725, + "learning_rate": 1.586081097219995e-05, + "loss": 2.6719, + "step": 49312 + }, + { + "epoch": 2.2958772726214587, + "grad_norm": 0.3356723101916737, + "learning_rate": 1.5858831963855297e-05, + "loss": 2.6588, + "step": 49313 + }, + { + "epoch": 2.295923830807552, + "grad_norm": 0.32096285404418395, + "learning_rate": 1.5856853055711592e-05, + "loss": 2.6347, + "step": 49314 + }, + { + "epoch": 2.295970388993645, + "grad_norm": 0.36565737137160864, + "learning_rate": 1.585487424777466e-05, + "loss": 2.6435, + "step": 49315 + }, + { + "epoch": 2.296016947179738, + "grad_norm": 0.327296670670268, + "learning_rate": 1.5852895540050306e-05, + "loss": 2.6379, + "step": 49316 + }, + { + "epoch": 2.296063505365831, + "grad_norm": 0.317044414869981, + "learning_rate": 1.5850916932544357e-05, + "loss": 2.6135, + "step": 49317 + }, + { + "epoch": 2.2961100635519243, + "grad_norm": 0.3455106930104189, + "learning_rate": 1.584893842526257e-05, + "loss": 2.6401, + "step": 49318 + }, + { + "epoch": 2.296156621738017, + "grad_norm": 0.34098590894272995, + "learning_rate": 1.5846960018210815e-05, + "loss": 2.6227, + "step": 49319 + }, + { + "epoch": 2.29620317992411, + "grad_norm": 0.32053909919234536, + "learning_rate": 1.5844981711394857e-05, + "loss": 2.5912, + "step": 49320 + }, + { + "epoch": 2.296249738110203, + "grad_norm": 0.3377418166812117, + "learning_rate": 1.584300350482051e-05, + "loss": 2.5957, + "step": 49321 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.3321908034711919, + "learning_rate": 1.5841025398493597e-05, + "loss": 2.6364, + "step": 49322 + }, + { + "epoch": 2.2963428544823894, + "grad_norm": 0.3280695600864476, + "learning_rate": 1.5839047392419875e-05, + "loss": 2.7047, + "step": 49323 + }, + { + "epoch": 2.2963894126684825, + "grad_norm": 0.3255791111544036, + "learning_rate": 1.5837069486605222e-05, + "loss": 2.6314, + "step": 49324 + }, + { + "epoch": 2.2964359708545756, + "grad_norm": 0.3246008805230725, + "learning_rate": 1.583509168105537e-05, + "loss": 2.5521, + "step": 49325 + }, + { + "epoch": 2.2964825290406687, + "grad_norm": 0.340606657357718, + "learning_rate": 1.5833113975776187e-05, + "loss": 2.706, + "step": 49326 + }, + { + "epoch": 2.2965290872267614, + "grad_norm": 0.3524222150619406, + "learning_rate": 1.5831136370773436e-05, + "loss": 2.6531, + "step": 49327 + }, + { + "epoch": 2.2965756454128545, + "grad_norm": 0.3290912019474432, + "learning_rate": 1.5829158866052936e-05, + "loss": 2.6992, + "step": 49328 + }, + { + "epoch": 2.2966222035989476, + "grad_norm": 0.315821042393228, + "learning_rate": 1.5827181461620482e-05, + "loss": 2.6544, + "step": 49329 + }, + { + "epoch": 2.2966687617850408, + "grad_norm": 0.34335070197702405, + "learning_rate": 1.5825204157481887e-05, + "loss": 2.6733, + "step": 49330 + }, + { + "epoch": 2.296715319971134, + "grad_norm": 0.33187337802890526, + "learning_rate": 1.582322695364295e-05, + "loss": 2.5594, + "step": 49331 + }, + { + "epoch": 2.296761878157227, + "grad_norm": 0.3368384952278179, + "learning_rate": 1.5821249850109494e-05, + "loss": 2.5091, + "step": 49332 + }, + { + "epoch": 2.29680843634332, + "grad_norm": 0.33677032714985344, + "learning_rate": 1.5819272846887277e-05, + "loss": 2.5247, + "step": 49333 + }, + { + "epoch": 2.296854994529413, + "grad_norm": 0.3476018683844753, + "learning_rate": 1.5817295943982134e-05, + "loss": 2.7507, + "step": 49334 + }, + { + 
"epoch": 2.2969015527155063, + "grad_norm": 0.33559374112720597, + "learning_rate": 1.581531914139988e-05, + "loss": 2.7301, + "step": 49335 + }, + { + "epoch": 2.2969481109015994, + "grad_norm": 0.3515629675892371, + "learning_rate": 1.5813342439146262e-05, + "loss": 2.6061, + "step": 49336 + }, + { + "epoch": 2.2969946690876926, + "grad_norm": 0.32501839005876615, + "learning_rate": 1.5811365837227145e-05, + "loss": 2.5625, + "step": 49337 + }, + { + "epoch": 2.2970412272737852, + "grad_norm": 0.3592779257174328, + "learning_rate": 1.5809389335648265e-05, + "loss": 2.7031, + "step": 49338 + }, + { + "epoch": 2.2970877854598784, + "grad_norm": 0.3258565007629437, + "learning_rate": 1.58074129344155e-05, + "loss": 2.6007, + "step": 49339 + }, + { + "epoch": 2.2971343436459715, + "grad_norm": 0.33070973518934027, + "learning_rate": 1.5805436633534594e-05, + "loss": 2.6349, + "step": 49340 + }, + { + "epoch": 2.2971809018320646, + "grad_norm": 0.3184779672885175, + "learning_rate": 1.5803460433011353e-05, + "loss": 2.6553, + "step": 49341 + }, + { + "epoch": 2.2972274600181577, + "grad_norm": 0.34818078759290155, + "learning_rate": 1.5801484332851596e-05, + "loss": 2.6407, + "step": 49342 + }, + { + "epoch": 2.297274018204251, + "grad_norm": 0.3384476898112537, + "learning_rate": 1.579950833306111e-05, + "loss": 2.6668, + "step": 49343 + }, + { + "epoch": 2.297320576390344, + "grad_norm": 0.32499966712325834, + "learning_rate": 1.5797532433645697e-05, + "loss": 2.6319, + "step": 49344 + }, + { + "epoch": 2.297367134576437, + "grad_norm": 0.32438770984116616, + "learning_rate": 1.579555663461117e-05, + "loss": 2.5158, + "step": 49345 + }, + { + "epoch": 2.29741369276253, + "grad_norm": 0.32642714692999475, + "learning_rate": 1.5793580935963305e-05, + "loss": 2.5413, + "step": 49346 + }, + { + "epoch": 2.297460250948623, + "grad_norm": 0.3316092662059662, + "learning_rate": 1.5791605337707903e-05, + "loss": 2.5632, + "step": 49347 + }, + { + "epoch": 2.297506809134716, + "grad_norm": 0.3237927698209387, + "learning_rate": 1.5789629839850777e-05, + "loss": 2.7054, + "step": 49348 + }, + { + "epoch": 2.297553367320809, + "grad_norm": 0.3358166085749851, + "learning_rate": 1.5787654442397714e-05, + "loss": 2.714, + "step": 49349 + }, + { + "epoch": 2.297599925506902, + "grad_norm": 0.3397528581652955, + "learning_rate": 1.5785679145354526e-05, + "loss": 2.6584, + "step": 49350 + }, + { + "epoch": 2.2976464836929953, + "grad_norm": 0.3121907398202422, + "learning_rate": 1.5783703948726975e-05, + "loss": 2.5947, + "step": 49351 + }, + { + "epoch": 2.2976930418790884, + "grad_norm": 0.35810405926713307, + "learning_rate": 1.578172885252091e-05, + "loss": 2.6642, + "step": 49352 + }, + { + "epoch": 2.2977396000651815, + "grad_norm": 0.32619567895858476, + "learning_rate": 1.577975385674208e-05, + "loss": 2.6396, + "step": 49353 + }, + { + "epoch": 2.2977861582512746, + "grad_norm": 0.3224679347539796, + "learning_rate": 1.5777778961396307e-05, + "loss": 2.662, + "step": 49354 + }, + { + "epoch": 2.2978327164373678, + "grad_norm": 0.3169025443343541, + "learning_rate": 1.5775804166489377e-05, + "loss": 2.569, + "step": 49355 + }, + { + "epoch": 2.297879274623461, + "grad_norm": 0.35847266449631227, + "learning_rate": 1.5773829472027095e-05, + "loss": 2.6544, + "step": 49356 + }, + { + "epoch": 2.297925832809554, + "grad_norm": 0.3106723809117518, + "learning_rate": 1.5771854878015245e-05, + "loss": 2.6131, + "step": 49357 + }, + { + "epoch": 2.2979723909956467, + "grad_norm": 0.33223346049571945, + 
"learning_rate": 1.576988038445965e-05, + "loss": 2.6307, + "step": 49358 + }, + { + "epoch": 2.2980189491817398, + "grad_norm": 0.3332348448757594, + "learning_rate": 1.5767905991366073e-05, + "loss": 2.588, + "step": 49359 + }, + { + "epoch": 2.298065507367833, + "grad_norm": 0.3234059035957203, + "learning_rate": 1.576593169874031e-05, + "loss": 2.5683, + "step": 49360 + }, + { + "epoch": 2.298112065553926, + "grad_norm": 0.3248101478145614, + "learning_rate": 1.576395750658817e-05, + "loss": 2.7426, + "step": 49361 + }, + { + "epoch": 2.298158623740019, + "grad_norm": 0.3387697398574502, + "learning_rate": 1.5761983414915436e-05, + "loss": 2.6981, + "step": 49362 + }, + { + "epoch": 2.2982051819261122, + "grad_norm": 0.3470174780911286, + "learning_rate": 1.576000942372793e-05, + "loss": 2.6049, + "step": 49363 + }, + { + "epoch": 2.2982517401122053, + "grad_norm": 0.35974764620823957, + "learning_rate": 1.575803553303139e-05, + "loss": 2.6402, + "step": 49364 + }, + { + "epoch": 2.2982982982982985, + "grad_norm": 0.3184701130593125, + "learning_rate": 1.5756061742831672e-05, + "loss": 2.6068, + "step": 49365 + }, + { + "epoch": 2.298344856484391, + "grad_norm": 0.3176748478114469, + "learning_rate": 1.5754088053134508e-05, + "loss": 2.5631, + "step": 49366 + }, + { + "epoch": 2.2983914146704842, + "grad_norm": 0.3263252104056265, + "learning_rate": 1.5752114463945755e-05, + "loss": 2.5976, + "step": 49367 + }, + { + "epoch": 2.2984379728565774, + "grad_norm": 0.34751863248930587, + "learning_rate": 1.5750140975271153e-05, + "loss": 2.6254, + "step": 49368 + }, + { + "epoch": 2.2984845310426705, + "grad_norm": 0.34068228528645617, + "learning_rate": 1.574816758711652e-05, + "loss": 2.683, + "step": 49369 + }, + { + "epoch": 2.2985310892287636, + "grad_norm": 0.32426104364451724, + "learning_rate": 1.5746194299487633e-05, + "loss": 2.6017, + "step": 49370 + }, + { + "epoch": 2.2985776474148567, + "grad_norm": 0.32866021512506494, + "learning_rate": 1.5744221112390313e-05, + "loss": 2.6126, + "step": 49371 + }, + { + "epoch": 2.29862420560095, + "grad_norm": 0.3373408839243919, + "learning_rate": 1.5742248025830308e-05, + "loss": 2.6032, + "step": 49372 + }, + { + "epoch": 2.298670763787043, + "grad_norm": 0.3205662480597472, + "learning_rate": 1.5740275039813436e-05, + "loss": 2.6634, + "step": 49373 + }, + { + "epoch": 2.298717321973136, + "grad_norm": 0.35039966031452036, + "learning_rate": 1.5738302154345473e-05, + "loss": 2.5631, + "step": 49374 + }, + { + "epoch": 2.298763880159229, + "grad_norm": 0.326247427694166, + "learning_rate": 1.573632936943223e-05, + "loss": 2.5683, + "step": 49375 + }, + { + "epoch": 2.2988104383453223, + "grad_norm": 0.3189556975152677, + "learning_rate": 1.5734356685079488e-05, + "loss": 2.5613, + "step": 49376 + }, + { + "epoch": 2.298856996531415, + "grad_norm": 0.3454484433279459, + "learning_rate": 1.5732384101293006e-05, + "loss": 2.6552, + "step": 49377 + }, + { + "epoch": 2.298903554717508, + "grad_norm": 0.33466162222676465, + "learning_rate": 1.573041161807864e-05, + "loss": 2.6067, + "step": 49378 + }, + { + "epoch": 2.298950112903601, + "grad_norm": 0.3443809034028379, + "learning_rate": 1.5728439235442098e-05, + "loss": 2.6961, + "step": 49379 + }, + { + "epoch": 2.2989966710896943, + "grad_norm": 0.3260841539946658, + "learning_rate": 1.5726466953389246e-05, + "loss": 2.681, + "step": 49380 + }, + { + "epoch": 2.2990432292757874, + "grad_norm": 0.378643159212716, + "learning_rate": 1.5724494771925823e-05, + "loss": 2.569, + "step": 49381 + 
}, + { + "epoch": 2.2990897874618805, + "grad_norm": 0.3392809003184656, + "learning_rate": 1.5722522691057628e-05, + "loss": 2.6948, + "step": 49382 + }, + { + "epoch": 2.2991363456479736, + "grad_norm": 0.3601830125696287, + "learning_rate": 1.5720550710790455e-05, + "loss": 2.6885, + "step": 49383 + }, + { + "epoch": 2.2991829038340668, + "grad_norm": 0.3381841342917695, + "learning_rate": 1.571857883113009e-05, + "loss": 2.6084, + "step": 49384 + }, + { + "epoch": 2.29922946202016, + "grad_norm": 0.35535374851049767, + "learning_rate": 1.571660705208233e-05, + "loss": 2.6571, + "step": 49385 + }, + { + "epoch": 2.2992760202062525, + "grad_norm": 0.3411380973497218, + "learning_rate": 1.5714635373652938e-05, + "loss": 2.6311, + "step": 49386 + }, + { + "epoch": 2.2993225783923457, + "grad_norm": 0.3409802977372872, + "learning_rate": 1.571266379584771e-05, + "loss": 2.6385, + "step": 49387 + }, + { + "epoch": 2.2993691365784388, + "grad_norm": 0.3536158132178175, + "learning_rate": 1.5710692318672436e-05, + "loss": 2.676, + "step": 49388 + }, + { + "epoch": 2.299415694764532, + "grad_norm": 0.34226283628898896, + "learning_rate": 1.570872094213292e-05, + "loss": 2.5837, + "step": 49389 + }, + { + "epoch": 2.299462252950625, + "grad_norm": 0.32633490368267204, + "learning_rate": 1.57067496662349e-05, + "loss": 2.6853, + "step": 49390 + }, + { + "epoch": 2.299508811136718, + "grad_norm": 0.3613223017183822, + "learning_rate": 1.5704778490984222e-05, + "loss": 2.693, + "step": 49391 + }, + { + "epoch": 2.2995553693228112, + "grad_norm": 0.36663264582614397, + "learning_rate": 1.5702807416386605e-05, + "loss": 2.6463, + "step": 49392 + }, + { + "epoch": 2.2996019275089044, + "grad_norm": 0.3134454546500631, + "learning_rate": 1.570083644244791e-05, + "loss": 2.5567, + "step": 49393 + }, + { + "epoch": 2.2996484856949975, + "grad_norm": 0.3566186492690844, + "learning_rate": 1.5698865569173852e-05, + "loss": 2.6263, + "step": 49394 + }, + { + "epoch": 2.2996950438810906, + "grad_norm": 0.34720467209772765, + "learning_rate": 1.5696894796570254e-05, + "loss": 2.6415, + "step": 49395 + }, + { + "epoch": 2.2997416020671837, + "grad_norm": 0.36223802532400634, + "learning_rate": 1.569492412464289e-05, + "loss": 2.7196, + "step": 49396 + }, + { + "epoch": 2.2997881602532764, + "grad_norm": 0.33249094019529224, + "learning_rate": 1.569295355339754e-05, + "loss": 2.5925, + "step": 49397 + }, + { + "epoch": 2.2998347184393695, + "grad_norm": 0.3288345798570415, + "learning_rate": 1.569098308284001e-05, + "loss": 2.5933, + "step": 49398 + }, + { + "epoch": 2.2998812766254626, + "grad_norm": 0.34914138231172936, + "learning_rate": 1.568901271297605e-05, + "loss": 2.6512, + "step": 49399 + }, + { + "epoch": 2.2999278348115557, + "grad_norm": 0.3215998137989771, + "learning_rate": 1.5687042443811457e-05, + "loss": 2.5698, + "step": 49400 + }, + { + "epoch": 2.299974392997649, + "grad_norm": 0.34338545455973823, + "learning_rate": 1.5685072275352007e-05, + "loss": 2.6781, + "step": 49401 + }, + { + "epoch": 2.300020951183742, + "grad_norm": 0.33329344807303996, + "learning_rate": 1.5683102207603494e-05, + "loss": 2.592, + "step": 49402 + }, + { + "epoch": 2.300067509369835, + "grad_norm": 0.3219073045279618, + "learning_rate": 1.56811322405717e-05, + "loss": 2.5148, + "step": 49403 + }, + { + "epoch": 2.300114067555928, + "grad_norm": 0.3439505515392243, + "learning_rate": 1.5679162374262412e-05, + "loss": 2.6963, + "step": 49404 + }, + { + "epoch": 2.300160625742021, + "grad_norm": 0.3275521533157951, + 
"learning_rate": 1.5677192608681374e-05, + "loss": 2.5282, + "step": 49405 + }, + { + "epoch": 2.300207183928114, + "grad_norm": 0.3217019182973514, + "learning_rate": 1.5675222943834423e-05, + "loss": 2.5994, + "step": 49406 + }, + { + "epoch": 2.300253742114207, + "grad_norm": 0.33384854800329905, + "learning_rate": 1.5673253379727292e-05, + "loss": 2.586, + "step": 49407 + }, + { + "epoch": 2.3003003003003, + "grad_norm": 0.31871927438452285, + "learning_rate": 1.5671283916365786e-05, + "loss": 2.6924, + "step": 49408 + }, + { + "epoch": 2.3003468584863933, + "grad_norm": 0.3507528209918908, + "learning_rate": 1.566931455375568e-05, + "loss": 2.6816, + "step": 49409 + }, + { + "epoch": 2.3003934166724864, + "grad_norm": 0.3325380273602361, + "learning_rate": 1.566734529190275e-05, + "loss": 2.5915, + "step": 49410 + }, + { + "epoch": 2.3004399748585795, + "grad_norm": 0.3309579630251084, + "learning_rate": 1.56653761308128e-05, + "loss": 2.7625, + "step": 49411 + }, + { + "epoch": 2.3004865330446727, + "grad_norm": 0.34398599571496674, + "learning_rate": 1.566340707049157e-05, + "loss": 2.6869, + "step": 49412 + }, + { + "epoch": 2.3005330912307658, + "grad_norm": 0.32454680397014646, + "learning_rate": 1.5661438110944853e-05, + "loss": 2.5321, + "step": 49413 + }, + { + "epoch": 2.300579649416859, + "grad_norm": 0.3506102483535701, + "learning_rate": 1.565946925217844e-05, + "loss": 2.654, + "step": 49414 + }, + { + "epoch": 2.300626207602952, + "grad_norm": 0.3347124480378374, + "learning_rate": 1.5657500494198097e-05, + "loss": 2.6614, + "step": 49415 + }, + { + "epoch": 2.3006727657890447, + "grad_norm": 0.3551709592675429, + "learning_rate": 1.5655531837009608e-05, + "loss": 2.7053, + "step": 49416 + }, + { + "epoch": 2.300719323975138, + "grad_norm": 0.3269309412692896, + "learning_rate": 1.5653563280618767e-05, + "loss": 2.5731, + "step": 49417 + }, + { + "epoch": 2.300765882161231, + "grad_norm": 0.34674596533683827, + "learning_rate": 1.5651594825031295e-05, + "loss": 2.7295, + "step": 49418 + }, + { + "epoch": 2.300812440347324, + "grad_norm": 0.33047089573524463, + "learning_rate": 1.564962647025305e-05, + "loss": 2.7019, + "step": 49419 + }, + { + "epoch": 2.300858998533417, + "grad_norm": 0.31932191681941324, + "learning_rate": 1.5647658216289746e-05, + "loss": 2.6184, + "step": 49420 + }, + { + "epoch": 2.3009055567195102, + "grad_norm": 0.34097817737123637, + "learning_rate": 1.5645690063147178e-05, + "loss": 2.761, + "step": 49421 + }, + { + "epoch": 2.3009521149056034, + "grad_norm": 0.3215251738582989, + "learning_rate": 1.564372201083113e-05, + "loss": 2.689, + "step": 49422 + }, + { + "epoch": 2.3009986730916965, + "grad_norm": 0.3361795801692932, + "learning_rate": 1.5641754059347374e-05, + "loss": 2.6253, + "step": 49423 + }, + { + "epoch": 2.3010452312777896, + "grad_norm": 0.31745943732198795, + "learning_rate": 1.5639786208701702e-05, + "loss": 2.6166, + "step": 49424 + }, + { + "epoch": 2.3010917894638823, + "grad_norm": 0.3314500864190666, + "learning_rate": 1.563781845889985e-05, + "loss": 2.6229, + "step": 49425 + }, + { + "epoch": 2.3011383476499754, + "grad_norm": 0.3306054649108832, + "learning_rate": 1.5635850809947616e-05, + "loss": 2.6803, + "step": 49426 + }, + { + "epoch": 2.3011849058360685, + "grad_norm": 0.3322118695041565, + "learning_rate": 1.563388326185078e-05, + "loss": 2.7341, + "step": 49427 + }, + { + "epoch": 2.3012314640221616, + "grad_norm": 0.32536767149021323, + "learning_rate": 1.5631915814615112e-05, + "loss": 2.6968, + "step": 
49428 + }, + { + "epoch": 2.3012780222082547, + "grad_norm": 0.3197965735692218, + "learning_rate": 1.562994846824638e-05, + "loss": 2.6026, + "step": 49429 + }, + { + "epoch": 2.301324580394348, + "grad_norm": 0.3173126344299946, + "learning_rate": 1.562798122275038e-05, + "loss": 2.5782, + "step": 49430 + }, + { + "epoch": 2.301371138580441, + "grad_norm": 0.3247249506197678, + "learning_rate": 1.562601407813284e-05, + "loss": 2.6869, + "step": 49431 + }, + { + "epoch": 2.301417696766534, + "grad_norm": 0.3170664202774931, + "learning_rate": 1.562404703439959e-05, + "loss": 2.5826, + "step": 49432 + }, + { + "epoch": 2.301464254952627, + "grad_norm": 0.32565450583604694, + "learning_rate": 1.5622080091556363e-05, + "loss": 2.5863, + "step": 49433 + }, + { + "epoch": 2.3015108131387203, + "grad_norm": 0.3224392707956508, + "learning_rate": 1.5620113249608943e-05, + "loss": 2.662, + "step": 49434 + }, + { + "epoch": 2.3015573713248134, + "grad_norm": 0.3068298479108909, + "learning_rate": 1.5618146508563103e-05, + "loss": 2.5559, + "step": 49435 + }, + { + "epoch": 2.301603929510906, + "grad_norm": 0.32990723535669647, + "learning_rate": 1.5616179868424612e-05, + "loss": 2.7007, + "step": 49436 + }, + { + "epoch": 2.301650487696999, + "grad_norm": 0.31318668769933283, + "learning_rate": 1.5614213329199266e-05, + "loss": 2.6599, + "step": 49437 + }, + { + "epoch": 2.3016970458830923, + "grad_norm": 0.30880331687118956, + "learning_rate": 1.561224689089279e-05, + "loss": 2.6563, + "step": 49438 + }, + { + "epoch": 2.3017436040691854, + "grad_norm": 0.3321212001827773, + "learning_rate": 1.561028055351101e-05, + "loss": 2.6574, + "step": 49439 + }, + { + "epoch": 2.3017901622552785, + "grad_norm": 0.3080330988658093, + "learning_rate": 1.5608314317059648e-05, + "loss": 2.582, + "step": 49440 + }, + { + "epoch": 2.3018367204413717, + "grad_norm": 0.3177291196561298, + "learning_rate": 1.5606348181544495e-05, + "loss": 2.657, + "step": 49441 + }, + { + "epoch": 2.301883278627465, + "grad_norm": 0.3285940424903079, + "learning_rate": 1.5604382146971326e-05, + "loss": 2.5211, + "step": 49442 + }, + { + "epoch": 2.301929836813558, + "grad_norm": 0.2842447884411059, + "learning_rate": 1.560241621334592e-05, + "loss": 2.6328, + "step": 49443 + }, + { + "epoch": 2.3019763949996506, + "grad_norm": 0.31483512875164726, + "learning_rate": 1.5600450380674002e-05, + "loss": 2.6243, + "step": 49444 + }, + { + "epoch": 2.3020229531857437, + "grad_norm": 0.3480399367439839, + "learning_rate": 1.559848464896141e-05, + "loss": 2.5914, + "step": 49445 + }, + { + "epoch": 2.302069511371837, + "grad_norm": 0.3183479027312111, + "learning_rate": 1.5596519018213855e-05, + "loss": 2.6188, + "step": 49446 + }, + { + "epoch": 2.30211606955793, + "grad_norm": 0.31843888458550823, + "learning_rate": 1.559455348843713e-05, + "loss": 2.6724, + "step": 49447 + }, + { + "epoch": 2.302162627744023, + "grad_norm": 0.31822184152117206, + "learning_rate": 1.5592588059637e-05, + "loss": 2.5492, + "step": 49448 + }, + { + "epoch": 2.302209185930116, + "grad_norm": 0.3155860782949567, + "learning_rate": 1.559062273181923e-05, + "loss": 2.7265, + "step": 49449 + }, + { + "epoch": 2.3022557441162093, + "grad_norm": 0.30288493267166516, + "learning_rate": 1.5588657504989616e-05, + "loss": 2.6834, + "step": 49450 + }, + { + "epoch": 2.3023023023023024, + "grad_norm": 0.3318500364669103, + "learning_rate": 1.5586692379153868e-05, + "loss": 2.6271, + "step": 49451 + }, + { + "epoch": 2.3023488604883955, + "grad_norm": 
0.3182558688371032, + "learning_rate": 1.558472735431782e-05, + "loss": 2.6471, + "step": 49452 + }, + { + "epoch": 2.3023954186744886, + "grad_norm": 0.3159468901343128, + "learning_rate": 1.558276243048719e-05, + "loss": 2.7156, + "step": 49453 + }, + { + "epoch": 2.3024419768605817, + "grad_norm": 0.3224648316199772, + "learning_rate": 1.558079760766776e-05, + "loss": 2.625, + "step": 49454 + }, + { + "epoch": 2.3024885350466744, + "grad_norm": 0.30882786212547364, + "learning_rate": 1.5578832885865296e-05, + "loss": 2.4972, + "step": 49455 + }, + { + "epoch": 2.3025350932327675, + "grad_norm": 0.3356704335606663, + "learning_rate": 1.557686826508557e-05, + "loss": 2.6971, + "step": 49456 + }, + { + "epoch": 2.3025816514188606, + "grad_norm": 0.3267547790926055, + "learning_rate": 1.5574903745334347e-05, + "loss": 2.6386, + "step": 49457 + }, + { + "epoch": 2.3026282096049537, + "grad_norm": 0.3238420740774401, + "learning_rate": 1.5572939326617403e-05, + "loss": 2.575, + "step": 49458 + }, + { + "epoch": 2.302674767791047, + "grad_norm": 0.3321257457131657, + "learning_rate": 1.5570975008940476e-05, + "loss": 2.5585, + "step": 49459 + }, + { + "epoch": 2.30272132597714, + "grad_norm": 0.3043482311250491, + "learning_rate": 1.556901079230934e-05, + "loss": 2.5461, + "step": 49460 + }, + { + "epoch": 2.302767884163233, + "grad_norm": 0.32484819832329426, + "learning_rate": 1.5567046676729774e-05, + "loss": 2.6917, + "step": 49461 + }, + { + "epoch": 2.302814442349326, + "grad_norm": 0.32384837487078055, + "learning_rate": 1.5565082662207524e-05, + "loss": 2.6707, + "step": 49462 + }, + { + "epoch": 2.3028610005354193, + "grad_norm": 0.33932067406417454, + "learning_rate": 1.556311874874839e-05, + "loss": 2.6114, + "step": 49463 + }, + { + "epoch": 2.302907558721512, + "grad_norm": 0.33280064402738646, + "learning_rate": 1.5561154936358067e-05, + "loss": 2.6017, + "step": 49464 + }, + { + "epoch": 2.302954116907605, + "grad_norm": 0.31890588895868527, + "learning_rate": 1.55591912250424e-05, + "loss": 2.652, + "step": 49465 + }, + { + "epoch": 2.303000675093698, + "grad_norm": 0.3233355198087785, + "learning_rate": 1.5557227614807102e-05, + "loss": 2.5629, + "step": 49466 + }, + { + "epoch": 2.3030472332797913, + "grad_norm": 0.3480575054990761, + "learning_rate": 1.555526410565794e-05, + "loss": 2.6946, + "step": 49467 + }, + { + "epoch": 2.3030937914658844, + "grad_norm": 0.3406039965014307, + "learning_rate": 1.555330069760069e-05, + "loss": 2.6273, + "step": 49468 + }, + { + "epoch": 2.3031403496519776, + "grad_norm": 0.3218758279938199, + "learning_rate": 1.5551337390641108e-05, + "loss": 2.6691, + "step": 49469 + }, + { + "epoch": 2.3031869078380707, + "grad_norm": 0.31450256168320967, + "learning_rate": 1.5549374184784953e-05, + "loss": 2.5645, + "step": 49470 + }, + { + "epoch": 2.303233466024164, + "grad_norm": 0.3262569391136319, + "learning_rate": 1.554741108003801e-05, + "loss": 2.5416, + "step": 49471 + }, + { + "epoch": 2.303280024210257, + "grad_norm": 0.33606578509446655, + "learning_rate": 1.5545448076406004e-05, + "loss": 2.6243, + "step": 49472 + }, + { + "epoch": 2.30332658239635, + "grad_norm": 0.34958523945382053, + "learning_rate": 1.5543485173894712e-05, + "loss": 2.5844, + "step": 49473 + }, + { + "epoch": 2.303373140582443, + "grad_norm": 0.3249148611395782, + "learning_rate": 1.5541522372509905e-05, + "loss": 2.6068, + "step": 49474 + }, + { + "epoch": 2.303419698768536, + "grad_norm": 0.3214737959885426, + "learning_rate": 1.5539559672257325e-05, + "loss": 
2.7136, + "step": 49475 + }, + { + "epoch": 2.303466256954629, + "grad_norm": 0.30385920805462435, + "learning_rate": 1.5537597073142762e-05, + "loss": 2.5687, + "step": 49476 + }, + { + "epoch": 2.303512815140722, + "grad_norm": 0.3122117758554207, + "learning_rate": 1.5535634575171926e-05, + "loss": 2.6211, + "step": 49477 + }, + { + "epoch": 2.303559373326815, + "grad_norm": 0.3558019406761116, + "learning_rate": 1.5533672178350637e-05, + "loss": 2.733, + "step": 49478 + }, + { + "epoch": 2.3036059315129083, + "grad_norm": 0.3131541297805714, + "learning_rate": 1.553170988268459e-05, + "loss": 2.5861, + "step": 49479 + }, + { + "epoch": 2.3036524896990014, + "grad_norm": 0.30882065270921766, + "learning_rate": 1.552974768817962e-05, + "loss": 2.6553, + "step": 49480 + }, + { + "epoch": 2.3036990478850945, + "grad_norm": 0.322880625313385, + "learning_rate": 1.552778559484142e-05, + "loss": 2.5868, + "step": 49481 + }, + { + "epoch": 2.3037456060711876, + "grad_norm": 0.35259475315079086, + "learning_rate": 1.5525823602675776e-05, + "loss": 2.6536, + "step": 49482 + }, + { + "epoch": 2.3037921642572807, + "grad_norm": 0.3244567297939653, + "learning_rate": 1.552386171168844e-05, + "loss": 2.6854, + "step": 49483 + }, + { + "epoch": 2.3038387224433734, + "grad_norm": 0.32862134803093906, + "learning_rate": 1.5521899921885196e-05, + "loss": 2.6702, + "step": 49484 + }, + { + "epoch": 2.3038852806294665, + "grad_norm": 0.33109017294187537, + "learning_rate": 1.5519938233271763e-05, + "loss": 2.6353, + "step": 49485 + }, + { + "epoch": 2.3039318388155596, + "grad_norm": 0.3480408991639277, + "learning_rate": 1.551797664585391e-05, + "loss": 2.6268, + "step": 49486 + }, + { + "epoch": 2.3039783970016527, + "grad_norm": 0.3136086860470345, + "learning_rate": 1.55160151596374e-05, + "loss": 2.6265, + "step": 49487 + }, + { + "epoch": 2.304024955187746, + "grad_norm": 0.34321692067921716, + "learning_rate": 1.551405377462799e-05, + "loss": 2.6729, + "step": 49488 + }, + { + "epoch": 2.304071513373839, + "grad_norm": 0.33127931949868594, + "learning_rate": 1.5512092490831447e-05, + "loss": 2.6502, + "step": 49489 + }, + { + "epoch": 2.304118071559932, + "grad_norm": 0.33869466737475074, + "learning_rate": 1.5510131308253484e-05, + "loss": 2.6393, + "step": 49490 + }, + { + "epoch": 2.304164629746025, + "grad_norm": 0.3289160719276599, + "learning_rate": 1.550817022689992e-05, + "loss": 2.6186, + "step": 49491 + }, + { + "epoch": 2.3042111879321183, + "grad_norm": 0.3467209498202584, + "learning_rate": 1.5506209246776448e-05, + "loss": 2.6292, + "step": 49492 + }, + { + "epoch": 2.3042577461182114, + "grad_norm": 0.3295722884425199, + "learning_rate": 1.5504248367888884e-05, + "loss": 2.6925, + "step": 49493 + }, + { + "epoch": 2.3043043043043046, + "grad_norm": 0.33999352578085973, + "learning_rate": 1.5502287590242942e-05, + "loss": 2.6412, + "step": 49494 + }, + { + "epoch": 2.3043508624903972, + "grad_norm": 0.3521365427664697, + "learning_rate": 1.550032691384438e-05, + "loss": 2.7478, + "step": 49495 + }, + { + "epoch": 2.3043974206764903, + "grad_norm": 0.3393326458755914, + "learning_rate": 1.5498366338698962e-05, + "loss": 2.6125, + "step": 49496 + }, + { + "epoch": 2.3044439788625835, + "grad_norm": 0.344250507901089, + "learning_rate": 1.549640586481244e-05, + "loss": 2.6377, + "step": 49497 + }, + { + "epoch": 2.3044905370486766, + "grad_norm": 0.3285118100085208, + "learning_rate": 1.5494445492190583e-05, + "loss": 2.6063, + "step": 49498 + }, + { + "epoch": 2.3045370952347697, + 
"grad_norm": 0.3503135608483704, + "learning_rate": 1.5492485220839115e-05, + "loss": 2.6183, + "step": 49499 + }, + { + "epoch": 2.304583653420863, + "grad_norm": 0.3303214840242621, + "learning_rate": 1.5490525050763805e-05, + "loss": 2.6195, + "step": 49500 + }, + { + "epoch": 2.304630211606956, + "grad_norm": 0.33484533565885694, + "learning_rate": 1.54885649819704e-05, + "loss": 2.6036, + "step": 49501 + }, + { + "epoch": 2.304676769793049, + "grad_norm": 0.3583196018787593, + "learning_rate": 1.5486605014464672e-05, + "loss": 2.7002, + "step": 49502 + }, + { + "epoch": 2.3047233279791417, + "grad_norm": 0.327662713646684, + "learning_rate": 1.5484645148252332e-05, + "loss": 2.6343, + "step": 49503 + }, + { + "epoch": 2.304769886165235, + "grad_norm": 0.3280699714288199, + "learning_rate": 1.5482685383339186e-05, + "loss": 2.7022, + "step": 49504 + }, + { + "epoch": 2.304816444351328, + "grad_norm": 0.3268844113133327, + "learning_rate": 1.5480725719730926e-05, + "loss": 2.6057, + "step": 49505 + }, + { + "epoch": 2.304863002537421, + "grad_norm": 0.3277290748550582, + "learning_rate": 1.5478766157433367e-05, + "loss": 2.6029, + "step": 49506 + }, + { + "epoch": 2.304909560723514, + "grad_norm": 0.32053104204394556, + "learning_rate": 1.5476806696452218e-05, + "loss": 2.622, + "step": 49507 + }, + { + "epoch": 2.3049561189096073, + "grad_norm": 0.31843934902001086, + "learning_rate": 1.547484733679323e-05, + "loss": 2.6331, + "step": 49508 + }, + { + "epoch": 2.3050026770957004, + "grad_norm": 0.30874169508740773, + "learning_rate": 1.5472888078462172e-05, + "loss": 2.6149, + "step": 49509 + }, + { + "epoch": 2.3050492352817935, + "grad_norm": 0.3431895795104112, + "learning_rate": 1.5470928921464783e-05, + "loss": 2.7261, + "step": 49510 + }, + { + "epoch": 2.3050957934678866, + "grad_norm": 0.3204841915818419, + "learning_rate": 1.5468969865806832e-05, + "loss": 2.6563, + "step": 49511 + }, + { + "epoch": 2.3051423516539797, + "grad_norm": 0.32223375290694506, + "learning_rate": 1.546701091149404e-05, + "loss": 2.7024, + "step": 49512 + }, + { + "epoch": 2.305188909840073, + "grad_norm": 0.2968831332587701, + "learning_rate": 1.5465052058532166e-05, + "loss": 2.4847, + "step": 49513 + }, + { + "epoch": 2.3052354680261655, + "grad_norm": 0.3304798745195451, + "learning_rate": 1.5463093306926958e-05, + "loss": 2.6347, + "step": 49514 + }, + { + "epoch": 2.3052820262122586, + "grad_norm": 0.31025350006317004, + "learning_rate": 1.5461134656684173e-05, + "loss": 2.6155, + "step": 49515 + }, + { + "epoch": 2.3053285843983518, + "grad_norm": 0.30635295013322394, + "learning_rate": 1.5459176107809554e-05, + "loss": 2.6715, + "step": 49516 + }, + { + "epoch": 2.305375142584445, + "grad_norm": 0.3361315861745607, + "learning_rate": 1.545721766030887e-05, + "loss": 2.6958, + "step": 49517 + }, + { + "epoch": 2.305421700770538, + "grad_norm": 0.3279480025975402, + "learning_rate": 1.5455259314187816e-05, + "loss": 2.6225, + "step": 49518 + }, + { + "epoch": 2.305468258956631, + "grad_norm": 0.32767606808438604, + "learning_rate": 1.5453301069452198e-05, + "loss": 2.7038, + "step": 49519 + }, + { + "epoch": 2.305514817142724, + "grad_norm": 0.3122883596902673, + "learning_rate": 1.5451342926107725e-05, + "loss": 2.5992, + "step": 49520 + }, + { + "epoch": 2.3055613753288173, + "grad_norm": 0.3104201454723258, + "learning_rate": 1.544938488416016e-05, + "loss": 2.5625, + "step": 49521 + }, + { + "epoch": 2.3056079335149104, + "grad_norm": 0.33308495491728035, + "learning_rate": 
1.5447426943615244e-05, + "loss": 2.5956, + "step": 49522 + }, + { + "epoch": 2.305654491701003, + "grad_norm": 0.3301550309235021, + "learning_rate": 1.5445469104478727e-05, + "loss": 2.6504, + "step": 49523 + }, + { + "epoch": 2.3057010498870962, + "grad_norm": 0.3199923602889666, + "learning_rate": 1.5443511366756375e-05, + "loss": 2.7176, + "step": 49524 + }, + { + "epoch": 2.3057476080731893, + "grad_norm": 0.29832661729579735, + "learning_rate": 1.5441553730453888e-05, + "loss": 2.6023, + "step": 49525 + }, + { + "epoch": 2.3057941662592825, + "grad_norm": 0.3211790523179427, + "learning_rate": 1.5439596195577037e-05, + "loss": 2.5944, + "step": 49526 + }, + { + "epoch": 2.3058407244453756, + "grad_norm": 0.34754004425776186, + "learning_rate": 1.5437638762131572e-05, + "loss": 2.7124, + "step": 49527 + }, + { + "epoch": 2.3058872826314687, + "grad_norm": 0.3211894206108142, + "learning_rate": 1.5435681430123232e-05, + "loss": 2.5634, + "step": 49528 + }, + { + "epoch": 2.305933840817562, + "grad_norm": 0.3115073430973475, + "learning_rate": 1.5433724199557757e-05, + "loss": 2.5887, + "step": 49529 + }, + { + "epoch": 2.305980399003655, + "grad_norm": 0.32151596675924937, + "learning_rate": 1.5431767070440917e-05, + "loss": 2.6274, + "step": 49530 + }, + { + "epoch": 2.306026957189748, + "grad_norm": 0.3154422915649033, + "learning_rate": 1.5429810042778396e-05, + "loss": 2.6201, + "step": 49531 + }, + { + "epoch": 2.306073515375841, + "grad_norm": 0.31650638285842897, + "learning_rate": 1.542785311657602e-05, + "loss": 2.6363, + "step": 49532 + }, + { + "epoch": 2.3061200735619343, + "grad_norm": 0.3181670761716225, + "learning_rate": 1.5425896291839466e-05, + "loss": 2.7343, + "step": 49533 + }, + { + "epoch": 2.306166631748027, + "grad_norm": 0.31134432961615144, + "learning_rate": 1.5423939568574498e-05, + "loss": 2.6519, + "step": 49534 + }, + { + "epoch": 2.30621318993412, + "grad_norm": 0.31314195764743474, + "learning_rate": 1.542198294678687e-05, + "loss": 2.8088, + "step": 49535 + }, + { + "epoch": 2.306259748120213, + "grad_norm": 0.3220657685136509, + "learning_rate": 1.5420026426482304e-05, + "loss": 2.6782, + "step": 49536 + }, + { + "epoch": 2.3063063063063063, + "grad_norm": 0.3293085291297262, + "learning_rate": 1.5418070007666575e-05, + "loss": 2.5359, + "step": 49537 + }, + { + "epoch": 2.3063528644923994, + "grad_norm": 0.3312205260558896, + "learning_rate": 1.541611369034539e-05, + "loss": 2.6065, + "step": 49538 + }, + { + "epoch": 2.3063994226784925, + "grad_norm": 0.3041568248011135, + "learning_rate": 1.5414157474524504e-05, + "loss": 2.6028, + "step": 49539 + }, + { + "epoch": 2.3064459808645856, + "grad_norm": 0.3168039452331142, + "learning_rate": 1.5412201360209655e-05, + "loss": 2.5952, + "step": 49540 + }, + { + "epoch": 2.3064925390506787, + "grad_norm": 0.3218261077721823, + "learning_rate": 1.541024534740659e-05, + "loss": 2.5419, + "step": 49541 + }, + { + "epoch": 2.3065390972367714, + "grad_norm": 0.31816774526423985, + "learning_rate": 1.540828943612104e-05, + "loss": 2.6562, + "step": 49542 + }, + { + "epoch": 2.3065856554228645, + "grad_norm": 0.325158718744787, + "learning_rate": 1.5406333626358783e-05, + "loss": 2.6684, + "step": 49543 + }, + { + "epoch": 2.3066322136089576, + "grad_norm": 0.32972324230305533, + "learning_rate": 1.5404377918125485e-05, + "loss": 2.6398, + "step": 49544 + }, + { + "epoch": 2.3066787717950508, + "grad_norm": 0.324852546474162, + "learning_rate": 1.5402422311426968e-05, + "loss": 2.6599, + "step": 49545 + }, + { 
+ "epoch": 2.306725329981144, + "grad_norm": 0.3194223534114179, + "learning_rate": 1.5400466806268914e-05, + "loss": 2.6035, + "step": 49546 + }, + { + "epoch": 2.306771888167237, + "grad_norm": 0.34487795356994005, + "learning_rate": 1.5398511402657083e-05, + "loss": 2.5922, + "step": 49547 + }, + { + "epoch": 2.30681844635333, + "grad_norm": 0.33104858756209465, + "learning_rate": 1.5396556100597208e-05, + "loss": 2.6773, + "step": 49548 + }, + { + "epoch": 2.3068650045394232, + "grad_norm": 0.3251813618912042, + "learning_rate": 1.5394600900095035e-05, + "loss": 2.6651, + "step": 49549 + }, + { + "epoch": 2.3069115627255163, + "grad_norm": 0.3254527152531776, + "learning_rate": 1.539264580115632e-05, + "loss": 2.7089, + "step": 49550 + }, + { + "epoch": 2.3069581209116095, + "grad_norm": 0.31626120506863764, + "learning_rate": 1.539069080378674e-05, + "loss": 2.5528, + "step": 49551 + }, + { + "epoch": 2.3070046790977026, + "grad_norm": 0.3557771462420546, + "learning_rate": 1.538873590799211e-05, + "loss": 2.6283, + "step": 49552 + }, + { + "epoch": 2.3070512372837952, + "grad_norm": 0.33623422683890325, + "learning_rate": 1.538678111377811e-05, + "loss": 2.6364, + "step": 49553 + }, + { + "epoch": 2.3070977954698884, + "grad_norm": 0.33562322493917524, + "learning_rate": 1.53848264211505e-05, + "loss": 2.61, + "step": 49554 + }, + { + "epoch": 2.3071443536559815, + "grad_norm": 0.31109583187534096, + "learning_rate": 1.5382871830115015e-05, + "loss": 2.6302, + "step": 49555 + }, + { + "epoch": 2.3071909118420746, + "grad_norm": 0.33490168383548014, + "learning_rate": 1.5380917340677407e-05, + "loss": 2.5931, + "step": 49556 + }, + { + "epoch": 2.3072374700281677, + "grad_norm": 0.3256908543270799, + "learning_rate": 1.5378962952843364e-05, + "loss": 2.5648, + "step": 49557 + }, + { + "epoch": 2.307284028214261, + "grad_norm": 0.3414428689929562, + "learning_rate": 1.5377008666618687e-05, + "loss": 2.7171, + "step": 49558 + }, + { + "epoch": 2.307330586400354, + "grad_norm": 0.3594311426321454, + "learning_rate": 1.537505448200906e-05, + "loss": 2.6858, + "step": 49559 + }, + { + "epoch": 2.307377144586447, + "grad_norm": 0.3162032407647818, + "learning_rate": 1.5373100399020234e-05, + "loss": 2.6413, + "step": 49560 + }, + { + "epoch": 2.30742370277254, + "grad_norm": 0.3431050143976873, + "learning_rate": 1.5371146417657955e-05, + "loss": 2.6434, + "step": 49561 + }, + { + "epoch": 2.307470260958633, + "grad_norm": 0.3469940762778604, + "learning_rate": 1.5369192537927945e-05, + "loss": 2.712, + "step": 49562 + }, + { + "epoch": 2.307516819144726, + "grad_norm": 0.32510785369846357, + "learning_rate": 1.5367238759835962e-05, + "loss": 2.6815, + "step": 49563 + }, + { + "epoch": 2.307563377330819, + "grad_norm": 0.32505408062172203, + "learning_rate": 1.5365285083387682e-05, + "loss": 2.6717, + "step": 49564 + }, + { + "epoch": 2.307609935516912, + "grad_norm": 0.3389065428572489, + "learning_rate": 1.536333150858892e-05, + "loss": 2.6009, + "step": 49565 + }, + { + "epoch": 2.3076564937030053, + "grad_norm": 0.34283641446129454, + "learning_rate": 1.5361378035445345e-05, + "loss": 2.4966, + "step": 49566 + }, + { + "epoch": 2.3077030518890984, + "grad_norm": 0.3430800858182078, + "learning_rate": 1.5359424663962713e-05, + "loss": 2.6464, + "step": 49567 + }, + { + "epoch": 2.3077496100751915, + "grad_norm": 0.33433918681446056, + "learning_rate": 1.5357471394146762e-05, + "loss": 2.674, + "step": 49568 + }, + { + "epoch": 2.3077961682612846, + "grad_norm": 0.3522218490293299, + 
"learning_rate": 1.535551822600321e-05, + "loss": 2.7051, + "step": 49569 + }, + { + "epoch": 2.3078427264473778, + "grad_norm": 0.327324230143168, + "learning_rate": 1.5353565159537808e-05, + "loss": 2.7027, + "step": 49570 + }, + { + "epoch": 2.307889284633471, + "grad_norm": 0.33873286699440264, + "learning_rate": 1.5351612194756293e-05, + "loss": 2.6739, + "step": 49571 + }, + { + "epoch": 2.307935842819564, + "grad_norm": 0.3451744896362145, + "learning_rate": 1.5349659331664367e-05, + "loss": 2.6427, + "step": 49572 + }, + { + "epoch": 2.3079824010056567, + "grad_norm": 0.33006587795617215, + "learning_rate": 1.534770657026778e-05, + "loss": 2.5516, + "step": 49573 + }, + { + "epoch": 2.3080289591917498, + "grad_norm": 0.34354026602337295, + "learning_rate": 1.534575391057226e-05, + "loss": 2.5845, + "step": 49574 + }, + { + "epoch": 2.308075517377843, + "grad_norm": 0.338728396673767, + "learning_rate": 1.5343801352583532e-05, + "loss": 2.6358, + "step": 49575 + }, + { + "epoch": 2.308122075563936, + "grad_norm": 0.3152089560956133, + "learning_rate": 1.5341848896307355e-05, + "loss": 2.5827, + "step": 49576 + }, + { + "epoch": 2.308168633750029, + "grad_norm": 0.3746504109656819, + "learning_rate": 1.53398965417494e-05, + "loss": 2.6739, + "step": 49577 + }, + { + "epoch": 2.3082151919361222, + "grad_norm": 0.3223036872685851, + "learning_rate": 1.533794428891547e-05, + "loss": 2.646, + "step": 49578 + }, + { + "epoch": 2.3082617501222153, + "grad_norm": 0.34194457000908957, + "learning_rate": 1.533599213781124e-05, + "loss": 2.6544, + "step": 49579 + }, + { + "epoch": 2.3083083083083085, + "grad_norm": 0.3540558983700444, + "learning_rate": 1.533404008844246e-05, + "loss": 2.656, + "step": 49580 + }, + { + "epoch": 2.308354866494401, + "grad_norm": 0.3357215692531516, + "learning_rate": 1.5332088140814856e-05, + "loss": 2.7044, + "step": 49581 + }, + { + "epoch": 2.3084014246804943, + "grad_norm": 0.3268740745536959, + "learning_rate": 1.5330136294934154e-05, + "loss": 2.6232, + "step": 49582 + }, + { + "epoch": 2.3084479828665874, + "grad_norm": 0.34638422751767783, + "learning_rate": 1.532818455080609e-05, + "loss": 2.5082, + "step": 49583 + }, + { + "epoch": 2.3084945410526805, + "grad_norm": 0.34031705400772616, + "learning_rate": 1.5326232908436405e-05, + "loss": 2.5969, + "step": 49584 + }, + { + "epoch": 2.3085410992387736, + "grad_norm": 0.33950657682558205, + "learning_rate": 1.5324281367830796e-05, + "loss": 2.6236, + "step": 49585 + }, + { + "epoch": 2.3085876574248667, + "grad_norm": 0.350570668601026, + "learning_rate": 1.5322329928995006e-05, + "loss": 2.7467, + "step": 49586 + }, + { + "epoch": 2.30863421561096, + "grad_norm": 0.3682143366701113, + "learning_rate": 1.5320378591934758e-05, + "loss": 2.6317, + "step": 49587 + }, + { + "epoch": 2.308680773797053, + "grad_norm": 0.3219409357532334, + "learning_rate": 1.5318427356655785e-05, + "loss": 2.6397, + "step": 49588 + }, + { + "epoch": 2.308727331983146, + "grad_norm": 0.3234368520524359, + "learning_rate": 1.5316476223163823e-05, + "loss": 2.6809, + "step": 49589 + }, + { + "epoch": 2.308773890169239, + "grad_norm": 0.3286773979428885, + "learning_rate": 1.5314525191464556e-05, + "loss": 2.5626, + "step": 49590 + }, + { + "epoch": 2.3088204483553323, + "grad_norm": 0.321914383895656, + "learning_rate": 1.5312574261563773e-05, + "loss": 2.4686, + "step": 49591 + }, + { + "epoch": 2.308867006541425, + "grad_norm": 0.35118876304198265, + "learning_rate": 1.5310623433467154e-05, + "loss": 2.5782, + "step": 49592 + 
}, + { + "epoch": 2.308913564727518, + "grad_norm": 0.3445123815490278, + "learning_rate": 1.5308672707180433e-05, + "loss": 2.5822, + "step": 49593 + }, + { + "epoch": 2.308960122913611, + "grad_norm": 0.3399935299396646, + "learning_rate": 1.5306722082709346e-05, + "loss": 2.6496, + "step": 49594 + }, + { + "epoch": 2.3090066810997043, + "grad_norm": 0.36307045447034764, + "learning_rate": 1.5304771560059605e-05, + "loss": 2.547, + "step": 49595 + }, + { + "epoch": 2.3090532392857974, + "grad_norm": 0.3350832708281245, + "learning_rate": 1.5302821139236946e-05, + "loss": 2.5899, + "step": 49596 + }, + { + "epoch": 2.3090997974718905, + "grad_norm": 0.34222287498316994, + "learning_rate": 1.5300870820247105e-05, + "loss": 2.593, + "step": 49597 + }, + { + "epoch": 2.3091463556579837, + "grad_norm": 0.3457377265973925, + "learning_rate": 1.5298920603095768e-05, + "loss": 2.6792, + "step": 49598 + }, + { + "epoch": 2.3091929138440768, + "grad_norm": 0.3335138598791212, + "learning_rate": 1.5296970487788687e-05, + "loss": 2.6275, + "step": 49599 + }, + { + "epoch": 2.30923947203017, + "grad_norm": 0.31004756754967033, + "learning_rate": 1.5295020474331572e-05, + "loss": 2.5488, + "step": 49600 + }, + { + "epoch": 2.3092860302162626, + "grad_norm": 0.33814536151258856, + "learning_rate": 1.529307056273016e-05, + "loss": 2.5688, + "step": 49601 + }, + { + "epoch": 2.3093325884023557, + "grad_norm": 0.3354033625655128, + "learning_rate": 1.529112075299018e-05, + "loss": 2.635, + "step": 49602 + }, + { + "epoch": 2.309379146588449, + "grad_norm": 0.3250964829030865, + "learning_rate": 1.528917104511731e-05, + "loss": 2.7258, + "step": 49603 + }, + { + "epoch": 2.309425704774542, + "grad_norm": 0.330896665698896, + "learning_rate": 1.5287221439117332e-05, + "loss": 2.6663, + "step": 49604 + }, + { + "epoch": 2.309472262960635, + "grad_norm": 0.3250585054446213, + "learning_rate": 1.528527193499591e-05, + "loss": 2.6822, + "step": 49605 + }, + { + "epoch": 2.309518821146728, + "grad_norm": 0.31118650555890864, + "learning_rate": 1.5283322532758824e-05, + "loss": 2.5509, + "step": 49606 + }, + { + "epoch": 2.3095653793328212, + "grad_norm": 0.3181648813197167, + "learning_rate": 1.5281373232411755e-05, + "loss": 2.5748, + "step": 49607 + }, + { + "epoch": 2.3096119375189144, + "grad_norm": 0.32662094135777425, + "learning_rate": 1.527942403396043e-05, + "loss": 2.604, + "step": 49608 + }, + { + "epoch": 2.3096584957050075, + "grad_norm": 0.33028115390439855, + "learning_rate": 1.5277474937410586e-05, + "loss": 2.5831, + "step": 49609 + }, + { + "epoch": 2.3097050538911006, + "grad_norm": 0.3106470402633877, + "learning_rate": 1.5275525942767936e-05, + "loss": 2.6277, + "step": 49610 + }, + { + "epoch": 2.3097516120771937, + "grad_norm": 0.328254632874402, + "learning_rate": 1.527357705003817e-05, + "loss": 2.6799, + "step": 49611 + }, + { + "epoch": 2.3097981702632864, + "grad_norm": 0.33320547404088696, + "learning_rate": 1.5271628259227066e-05, + "loss": 2.6363, + "step": 49612 + }, + { + "epoch": 2.3098447284493795, + "grad_norm": 0.33219686764760636, + "learning_rate": 1.5269679570340294e-05, + "loss": 2.5894, + "step": 49613 + }, + { + "epoch": 2.3098912866354726, + "grad_norm": 0.3382349423219818, + "learning_rate": 1.526773098338359e-05, + "loss": 2.6315, + "step": 49614 + }, + { + "epoch": 2.3099378448215657, + "grad_norm": 0.3024432484224326, + "learning_rate": 1.526578249836269e-05, + "loss": 2.5479, + "step": 49615 + }, + { + "epoch": 2.309984403007659, + "grad_norm": 
0.31769904543414307, + "learning_rate": 1.5263834115283265e-05, + "loss": 2.6608, + "step": 49616 + }, + { + "epoch": 2.310030961193752, + "grad_norm": 0.34345935774660646, + "learning_rate": 1.52618858341511e-05, + "loss": 2.6593, + "step": 49617 + }, + { + "epoch": 2.310077519379845, + "grad_norm": 0.3235912411590664, + "learning_rate": 1.5259937654971846e-05, + "loss": 2.6157, + "step": 49618 + }, + { + "epoch": 2.310124077565938, + "grad_norm": 0.32908278023833637, + "learning_rate": 1.525798957775128e-05, + "loss": 2.6589, + "step": 49619 + }, + { + "epoch": 2.310170635752031, + "grad_norm": 0.32276643748192263, + "learning_rate": 1.525604160249508e-05, + "loss": 2.5769, + "step": 49620 + }, + { + "epoch": 2.310217193938124, + "grad_norm": 0.35427398179879643, + "learning_rate": 1.5254093729208973e-05, + "loss": 2.5277, + "step": 49621 + }, + { + "epoch": 2.310263752124217, + "grad_norm": 0.344579317099567, + "learning_rate": 1.5252145957898678e-05, + "loss": 2.6613, + "step": 49622 + }, + { + "epoch": 2.31031031031031, + "grad_norm": 0.33107467054797685, + "learning_rate": 1.5250198288569912e-05, + "loss": 2.5413, + "step": 49623 + }, + { + "epoch": 2.3103568684964033, + "grad_norm": 0.32843987029479754, + "learning_rate": 1.5248250721228386e-05, + "loss": 2.5411, + "step": 49624 + }, + { + "epoch": 2.3104034266824964, + "grad_norm": 0.31709703956512725, + "learning_rate": 1.524630325587984e-05, + "loss": 2.5868, + "step": 49625 + }, + { + "epoch": 2.3104499848685895, + "grad_norm": 0.3293954872849349, + "learning_rate": 1.5244355892529954e-05, + "loss": 2.6579, + "step": 49626 + }, + { + "epoch": 2.3104965430546827, + "grad_norm": 0.3148393768775073, + "learning_rate": 1.5242408631184463e-05, + "loss": 2.628, + "step": 49627 + }, + { + "epoch": 2.3105431012407758, + "grad_norm": 0.33542934971607896, + "learning_rate": 1.5240461471849088e-05, + "loss": 2.7292, + "step": 49628 + }, + { + "epoch": 2.310589659426869, + "grad_norm": 0.35941952299599006, + "learning_rate": 1.5238514414529508e-05, + "loss": 2.6185, + "step": 49629 + }, + { + "epoch": 2.310636217612962, + "grad_norm": 0.3228009481767816, + "learning_rate": 1.5236567459231493e-05, + "loss": 2.641, + "step": 49630 + }, + { + "epoch": 2.3106827757990547, + "grad_norm": 0.32674696197923614, + "learning_rate": 1.5234620605960697e-05, + "loss": 2.6191, + "step": 49631 + }, + { + "epoch": 2.310729333985148, + "grad_norm": 0.3125200426244177, + "learning_rate": 1.5232673854722895e-05, + "loss": 2.6118, + "step": 49632 + }, + { + "epoch": 2.310775892171241, + "grad_norm": 0.33712516933793796, + "learning_rate": 1.5230727205523754e-05, + "loss": 2.7064, + "step": 49633 + }, + { + "epoch": 2.310822450357334, + "grad_norm": 0.3179370419128509, + "learning_rate": 1.5228780658369002e-05, + "loss": 2.6519, + "step": 49634 + }, + { + "epoch": 2.310869008543427, + "grad_norm": 0.3181494728535783, + "learning_rate": 1.5226834213264357e-05, + "loss": 2.5479, + "step": 49635 + }, + { + "epoch": 2.3109155667295203, + "grad_norm": 0.3252039147145361, + "learning_rate": 1.5224887870215526e-05, + "loss": 2.6761, + "step": 49636 + }, + { + "epoch": 2.3109621249156134, + "grad_norm": 0.3220285834600698, + "learning_rate": 1.5222941629228222e-05, + "loss": 2.6405, + "step": 49637 + }, + { + "epoch": 2.3110086831017065, + "grad_norm": 0.356690536599752, + "learning_rate": 1.5220995490308176e-05, + "loss": 2.6521, + "step": 49638 + }, + { + "epoch": 2.3110552412877996, + "grad_norm": 0.3399040282623002, + "learning_rate": 1.5219049453461065e-05, + 
"loss": 2.5538, + "step": 49639 + }, + { + "epoch": 2.3111017994738923, + "grad_norm": 0.327603698129683, + "learning_rate": 1.5217103518692621e-05, + "loss": 2.5866, + "step": 49640 + }, + { + "epoch": 2.3111483576599854, + "grad_norm": 0.3448977998370443, + "learning_rate": 1.5215157686008547e-05, + "loss": 2.6851, + "step": 49641 + }, + { + "epoch": 2.3111949158460785, + "grad_norm": 0.33888465225185793, + "learning_rate": 1.5213211955414559e-05, + "loss": 2.6126, + "step": 49642 + }, + { + "epoch": 2.3112414740321716, + "grad_norm": 0.31466869487191906, + "learning_rate": 1.5211266326916384e-05, + "loss": 2.5796, + "step": 49643 + }, + { + "epoch": 2.3112880322182647, + "grad_norm": 0.33100080285618505, + "learning_rate": 1.5209320800519683e-05, + "loss": 2.71, + "step": 49644 + }, + { + "epoch": 2.311334590404358, + "grad_norm": 0.3408596880398031, + "learning_rate": 1.5207375376230227e-05, + "loss": 2.6387, + "step": 49645 + }, + { + "epoch": 2.311381148590451, + "grad_norm": 0.3342470669968483, + "learning_rate": 1.5205430054053683e-05, + "loss": 2.6897, + "step": 49646 + }, + { + "epoch": 2.311427706776544, + "grad_norm": 0.33032886575473674, + "learning_rate": 1.5203484833995774e-05, + "loss": 2.5366, + "step": 49647 + }, + { + "epoch": 2.311474264962637, + "grad_norm": 0.3454349841056238, + "learning_rate": 1.5201539716062213e-05, + "loss": 2.6772, + "step": 49648 + }, + { + "epoch": 2.3115208231487303, + "grad_norm": 0.34147251382126503, + "learning_rate": 1.5199594700258697e-05, + "loss": 2.6774, + "step": 49649 + }, + { + "epoch": 2.3115673813348234, + "grad_norm": 0.3401108016493899, + "learning_rate": 1.5197649786590945e-05, + "loss": 2.6381, + "step": 49650 + }, + { + "epoch": 2.311613939520916, + "grad_norm": 0.3364478855230176, + "learning_rate": 1.5195704975064685e-05, + "loss": 2.5335, + "step": 49651 + }, + { + "epoch": 2.311660497707009, + "grad_norm": 0.3368280707739338, + "learning_rate": 1.5193760265685574e-05, + "loss": 2.6524, + "step": 49652 + }, + { + "epoch": 2.3117070558931023, + "grad_norm": 0.340910450475493, + "learning_rate": 1.5191815658459358e-05, + "loss": 2.7184, + "step": 49653 + }, + { + "epoch": 2.3117536140791954, + "grad_norm": 0.32973400174538503, + "learning_rate": 1.5189871153391726e-05, + "loss": 2.5856, + "step": 49654 + }, + { + "epoch": 2.3118001722652886, + "grad_norm": 0.326299840535657, + "learning_rate": 1.5187926750488401e-05, + "loss": 2.5846, + "step": 49655 + }, + { + "epoch": 2.3118467304513817, + "grad_norm": 0.3273665917595844, + "learning_rate": 1.5185982449755094e-05, + "loss": 2.7005, + "step": 49656 + }, + { + "epoch": 2.311893288637475, + "grad_norm": 0.33318929963381894, + "learning_rate": 1.5184038251197463e-05, + "loss": 2.7443, + "step": 49657 + }, + { + "epoch": 2.311939846823568, + "grad_norm": 0.325183915491529, + "learning_rate": 1.5182094154821285e-05, + "loss": 2.6756, + "step": 49658 + }, + { + "epoch": 2.311986405009661, + "grad_norm": 0.3396691908538533, + "learning_rate": 1.5180150160632194e-05, + "loss": 2.6499, + "step": 49659 + }, + { + "epoch": 2.3120329631957537, + "grad_norm": 0.3225102925253211, + "learning_rate": 1.517820626863597e-05, + "loss": 2.5379, + "step": 49660 + }, + { + "epoch": 2.312079521381847, + "grad_norm": 0.33478903617169375, + "learning_rate": 1.5176262478838266e-05, + "loss": 2.6719, + "step": 49661 + }, + { + "epoch": 2.31212607956794, + "grad_norm": 0.3461469672946764, + "learning_rate": 1.5174318791244797e-05, + "loss": 2.5694, + "step": 49662 + }, + { + "epoch": 
2.312172637754033, + "grad_norm": 0.3644430897153997, + "learning_rate": 1.5172375205861267e-05, + "loss": 2.6882, + "step": 49663 + }, + { + "epoch": 2.312219195940126, + "grad_norm": 0.33237716524968663, + "learning_rate": 1.5170431722693408e-05, + "loss": 2.7089, + "step": 49664 + }, + { + "epoch": 2.3122657541262193, + "grad_norm": 0.30362396459616764, + "learning_rate": 1.5168488341746879e-05, + "loss": 2.6027, + "step": 49665 + }, + { + "epoch": 2.3123123123123124, + "grad_norm": 0.3540854925067638, + "learning_rate": 1.516654506302741e-05, + "loss": 2.5652, + "step": 49666 + }, + { + "epoch": 2.3123588704984055, + "grad_norm": 0.34349534693538436, + "learning_rate": 1.5164601886540697e-05, + "loss": 2.6754, + "step": 49667 + }, + { + "epoch": 2.3124054286844986, + "grad_norm": 0.3693399688561911, + "learning_rate": 1.5162658812292446e-05, + "loss": 2.7001, + "step": 49668 + }, + { + "epoch": 2.3124519868705917, + "grad_norm": 0.3307494624091241, + "learning_rate": 1.5160715840288376e-05, + "loss": 2.5372, + "step": 49669 + }, + { + "epoch": 2.312498545056685, + "grad_norm": 0.33189565771177226, + "learning_rate": 1.515877297053414e-05, + "loss": 2.6299, + "step": 49670 + }, + { + "epoch": 2.3125451032427775, + "grad_norm": 0.33142835142095267, + "learning_rate": 1.5156830203035504e-05, + "loss": 2.5626, + "step": 49671 + }, + { + "epoch": 2.3125916614288706, + "grad_norm": 0.3222332922961547, + "learning_rate": 1.5154887537798107e-05, + "loss": 2.576, + "step": 49672 + }, + { + "epoch": 2.3126382196149637, + "grad_norm": 0.32896608923479226, + "learning_rate": 1.5152944974827716e-05, + "loss": 2.6168, + "step": 49673 + }, + { + "epoch": 2.312684777801057, + "grad_norm": 0.3275696149037262, + "learning_rate": 1.5151002514129981e-05, + "loss": 2.5995, + "step": 49674 + }, + { + "epoch": 2.31273133598715, + "grad_norm": 0.34010475950580077, + "learning_rate": 1.514906015571062e-05, + "loss": 2.5838, + "step": 49675 + }, + { + "epoch": 2.312777894173243, + "grad_norm": 0.3286953719780408, + "learning_rate": 1.5147117899575335e-05, + "loss": 2.5478, + "step": 49676 + }, + { + "epoch": 2.312824452359336, + "grad_norm": 0.336762084511856, + "learning_rate": 1.5145175745729828e-05, + "loss": 2.626, + "step": 49677 + }, + { + "epoch": 2.3128710105454293, + "grad_norm": 0.3296240714940205, + "learning_rate": 1.5143233694179809e-05, + "loss": 2.5925, + "step": 49678 + }, + { + "epoch": 2.312917568731522, + "grad_norm": 0.33443255992608084, + "learning_rate": 1.5141291744930953e-05, + "loss": 2.6485, + "step": 49679 + }, + { + "epoch": 2.312964126917615, + "grad_norm": 0.3447776256278673, + "learning_rate": 1.5139349897988969e-05, + "loss": 2.6851, + "step": 49680 + }, + { + "epoch": 2.313010685103708, + "grad_norm": 0.3362514525814428, + "learning_rate": 1.5137408153359556e-05, + "loss": 2.6171, + "step": 49681 + }, + { + "epoch": 2.3130572432898013, + "grad_norm": 0.3391171240101274, + "learning_rate": 1.5135466511048435e-05, + "loss": 2.7455, + "step": 49682 + }, + { + "epoch": 2.3131038014758944, + "grad_norm": 0.3384123224641435, + "learning_rate": 1.5133524971061253e-05, + "loss": 2.6118, + "step": 49683 + }, + { + "epoch": 2.3131503596619876, + "grad_norm": 0.34700759521518354, + "learning_rate": 1.513158353340377e-05, + "loss": 2.7205, + "step": 49684 + }, + { + "epoch": 2.3131969178480807, + "grad_norm": 0.3533339297782784, + "learning_rate": 1.5129642198081623e-05, + "loss": 2.7495, + "step": 49685 + }, + { + "epoch": 2.313243476034174, + "grad_norm": 0.3357685670329473, + 
"learning_rate": 1.5127700965100572e-05, + "loss": 2.503, + "step": 49686 + }, + { + "epoch": 2.313290034220267, + "grad_norm": 0.3517423877698109, + "learning_rate": 1.512575983446627e-05, + "loss": 2.6444, + "step": 49687 + }, + { + "epoch": 2.31333659240636, + "grad_norm": 0.34719807581503986, + "learning_rate": 1.5123818806184426e-05, + "loss": 2.5926, + "step": 49688 + }, + { + "epoch": 2.313383150592453, + "grad_norm": 0.3268956695873262, + "learning_rate": 1.5121877880260737e-05, + "loss": 2.6971, + "step": 49689 + }, + { + "epoch": 2.313429708778546, + "grad_norm": 0.3110737204991931, + "learning_rate": 1.51199370567009e-05, + "loss": 2.6804, + "step": 49690 + }, + { + "epoch": 2.313476266964639, + "grad_norm": 0.35048440101035916, + "learning_rate": 1.5117996335510632e-05, + "loss": 2.6601, + "step": 49691 + }, + { + "epoch": 2.313522825150732, + "grad_norm": 0.33258894192574573, + "learning_rate": 1.5116055716695588e-05, + "loss": 2.6354, + "step": 49692 + }, + { + "epoch": 2.313569383336825, + "grad_norm": 0.34600117439331446, + "learning_rate": 1.511411520026148e-05, + "loss": 2.7264, + "step": 49693 + }, + { + "epoch": 2.3136159415229183, + "grad_norm": 0.32994054104090875, + "learning_rate": 1.5112174786214018e-05, + "loss": 2.6743, + "step": 49694 + }, + { + "epoch": 2.3136624997090114, + "grad_norm": 0.3325377087517949, + "learning_rate": 1.5110234474558877e-05, + "loss": 2.6649, + "step": 49695 + }, + { + "epoch": 2.3137090578951045, + "grad_norm": 0.3158073746935507, + "learning_rate": 1.5108294265301765e-05, + "loss": 2.6726, + "step": 49696 + }, + { + "epoch": 2.3137556160811976, + "grad_norm": 0.33625212256618503, + "learning_rate": 1.5106354158448383e-05, + "loss": 2.5392, + "step": 49697 + }, + { + "epoch": 2.3138021742672907, + "grad_norm": 0.3359198643173891, + "learning_rate": 1.5104414154004387e-05, + "loss": 2.5464, + "step": 49698 + }, + { + "epoch": 2.3138487324533834, + "grad_norm": 0.32445288232982744, + "learning_rate": 1.510247425197553e-05, + "loss": 2.5668, + "step": 49699 + }, + { + "epoch": 2.3138952906394765, + "grad_norm": 0.34744106773302125, + "learning_rate": 1.5100534452367454e-05, + "loss": 2.6045, + "step": 49700 + }, + { + "epoch": 2.3139418488255696, + "grad_norm": 0.3393698402833261, + "learning_rate": 1.5098594755185868e-05, + "loss": 2.5422, + "step": 49701 + }, + { + "epoch": 2.3139884070116628, + "grad_norm": 0.3046423906473824, + "learning_rate": 1.5096655160436474e-05, + "loss": 2.5912, + "step": 49702 + }, + { + "epoch": 2.314034965197756, + "grad_norm": 0.3466953085116127, + "learning_rate": 1.509471566812496e-05, + "loss": 2.6216, + "step": 49703 + }, + { + "epoch": 2.314081523383849, + "grad_norm": 0.33561997889051176, + "learning_rate": 1.5092776278257027e-05, + "loss": 2.6073, + "step": 49704 + }, + { + "epoch": 2.314128081569942, + "grad_norm": 0.329326612925802, + "learning_rate": 1.5090836990838342e-05, + "loss": 2.6834, + "step": 49705 + }, + { + "epoch": 2.314174639756035, + "grad_norm": 0.32770087399097536, + "learning_rate": 1.5088897805874614e-05, + "loss": 2.6499, + "step": 49706 + }, + { + "epoch": 2.3142211979421283, + "grad_norm": 0.34396962402998954, + "learning_rate": 1.508695872337153e-05, + "loss": 2.643, + "step": 49707 + }, + { + "epoch": 2.3142677561282214, + "grad_norm": 0.33550417870950056, + "learning_rate": 1.5085019743334784e-05, + "loss": 2.597, + "step": 49708 + }, + { + "epoch": 2.3143143143143146, + "grad_norm": 0.32287999268981066, + "learning_rate": 1.5083080865770061e-05, + "loss": 2.56, + "step": 
49709 + }, + { + "epoch": 2.3143608725004072, + "grad_norm": 0.3340619635614316, + "learning_rate": 1.5081142090683077e-05, + "loss": 2.5558, + "step": 49710 + }, + { + "epoch": 2.3144074306865003, + "grad_norm": 0.3274281014038211, + "learning_rate": 1.5079203418079463e-05, + "loss": 2.6074, + "step": 49711 + }, + { + "epoch": 2.3144539888725935, + "grad_norm": 0.30799767323775784, + "learning_rate": 1.5077264847964984e-05, + "loss": 2.6287, + "step": 49712 + }, + { + "epoch": 2.3145005470586866, + "grad_norm": 0.32563236138902263, + "learning_rate": 1.5075326380345273e-05, + "loss": 2.6841, + "step": 49713 + }, + { + "epoch": 2.3145471052447797, + "grad_norm": 0.32615828189339996, + "learning_rate": 1.5073388015226042e-05, + "loss": 2.5053, + "step": 49714 + }, + { + "epoch": 2.314593663430873, + "grad_norm": 0.32860676011354933, + "learning_rate": 1.5071449752612977e-05, + "loss": 2.6586, + "step": 49715 + }, + { + "epoch": 2.314640221616966, + "grad_norm": 0.3464475418701396, + "learning_rate": 1.506951159251176e-05, + "loss": 2.6568, + "step": 49716 + }, + { + "epoch": 2.314686779803059, + "grad_norm": 0.32348404072849557, + "learning_rate": 1.5067573534928104e-05, + "loss": 2.5377, + "step": 49717 + }, + { + "epoch": 2.3147333379891517, + "grad_norm": 0.33136568426816826, + "learning_rate": 1.5065635579867666e-05, + "loss": 2.6308, + "step": 49718 + }, + { + "epoch": 2.314779896175245, + "grad_norm": 0.33883072758203353, + "learning_rate": 1.5063697727336141e-05, + "loss": 2.6191, + "step": 49719 + }, + { + "epoch": 2.314826454361338, + "grad_norm": 0.3128852954329573, + "learning_rate": 1.5061759977339223e-05, + "loss": 2.59, + "step": 49720 + }, + { + "epoch": 2.314873012547431, + "grad_norm": 0.31765347994051013, + "learning_rate": 1.50598223298826e-05, + "loss": 2.656, + "step": 49721 + }, + { + "epoch": 2.314919570733524, + "grad_norm": 0.34275110928983316, + "learning_rate": 1.5057884784971954e-05, + "loss": 2.6415, + "step": 49722 + }, + { + "epoch": 2.3149661289196173, + "grad_norm": 0.3192206176182316, + "learning_rate": 1.5055947342612985e-05, + "loss": 2.6298, + "step": 49723 + }, + { + "epoch": 2.3150126871057104, + "grad_norm": 0.34213677997275016, + "learning_rate": 1.5054010002811342e-05, + "loss": 2.6838, + "step": 49724 + }, + { + "epoch": 2.3150592452918035, + "grad_norm": 0.3258649809204537, + "learning_rate": 1.5052072765572767e-05, + "loss": 2.6806, + "step": 49725 + }, + { + "epoch": 2.3151058034778966, + "grad_norm": 0.3278087787560246, + "learning_rate": 1.5050135630902902e-05, + "loss": 2.5556, + "step": 49726 + }, + { + "epoch": 2.3151523616639897, + "grad_norm": 0.3414654849250772, + "learning_rate": 1.5048198598807446e-05, + "loss": 2.6789, + "step": 49727 + }, + { + "epoch": 2.315198919850083, + "grad_norm": 0.33115256556275285, + "learning_rate": 1.5046261669292077e-05, + "loss": 2.6713, + "step": 49728 + }, + { + "epoch": 2.3152454780361755, + "grad_norm": 0.3386520697055443, + "learning_rate": 1.5044324842362496e-05, + "loss": 2.7961, + "step": 49729 + }, + { + "epoch": 2.3152920362222686, + "grad_norm": 0.32077554370483913, + "learning_rate": 1.5042388118024386e-05, + "loss": 2.6432, + "step": 49730 + }, + { + "epoch": 2.3153385944083618, + "grad_norm": 0.3415410500961451, + "learning_rate": 1.5040451496283397e-05, + "loss": 2.6687, + "step": 49731 + }, + { + "epoch": 2.315385152594455, + "grad_norm": 0.34700609975274865, + "learning_rate": 1.5038514977145268e-05, + "loss": 2.6912, + "step": 49732 + }, + { + "epoch": 2.315431710780548, + "grad_norm": 
0.33724877623521626, + "learning_rate": 1.5036578560615633e-05, + "loss": 2.601, + "step": 49733 + }, + { + "epoch": 2.315478268966641, + "grad_norm": 0.3162000458227752, + "learning_rate": 1.5034642246700204e-05, + "loss": 2.557, + "step": 49734 + }, + { + "epoch": 2.315524827152734, + "grad_norm": 0.3208663826173557, + "learning_rate": 1.5032706035404648e-05, + "loss": 2.6017, + "step": 49735 + }, + { + "epoch": 2.3155713853388273, + "grad_norm": 0.3229101485534111, + "learning_rate": 1.5030769926734672e-05, + "loss": 2.593, + "step": 49736 + }, + { + "epoch": 2.3156179435249205, + "grad_norm": 0.32986622648303493, + "learning_rate": 1.5028833920695912e-05, + "loss": 2.7553, + "step": 49737 + }, + { + "epoch": 2.315664501711013, + "grad_norm": 0.3510468406131342, + "learning_rate": 1.502689801729411e-05, + "loss": 2.7426, + "step": 49738 + }, + { + "epoch": 2.3157110598971062, + "grad_norm": 0.30703739179769396, + "learning_rate": 1.5024962216534905e-05, + "loss": 2.5461, + "step": 49739 + }, + { + "epoch": 2.3157576180831994, + "grad_norm": 0.3420971021377712, + "learning_rate": 1.5023026518423982e-05, + "loss": 2.7137, + "step": 49740 + }, + { + "epoch": 2.3158041762692925, + "grad_norm": 0.30711354764848686, + "learning_rate": 1.5021090922967035e-05, + "loss": 2.5442, + "step": 49741 + }, + { + "epoch": 2.3158507344553856, + "grad_norm": 0.34938029106763524, + "learning_rate": 1.501915543016974e-05, + "loss": 2.6817, + "step": 49742 + }, + { + "epoch": 2.3158972926414787, + "grad_norm": 0.3340825945521515, + "learning_rate": 1.5017220040037794e-05, + "loss": 2.5758, + "step": 49743 + }, + { + "epoch": 2.315943850827572, + "grad_norm": 0.3211440816362109, + "learning_rate": 1.501528475257683e-05, + "loss": 2.6153, + "step": 49744 + }, + { + "epoch": 2.315990409013665, + "grad_norm": 0.3342539197614867, + "learning_rate": 1.501334956779259e-05, + "loss": 2.6547, + "step": 49745 + }, + { + "epoch": 2.316036967199758, + "grad_norm": 0.32720003602797915, + "learning_rate": 1.5011414485690706e-05, + "loss": 2.6975, + "step": 49746 + }, + { + "epoch": 2.316083525385851, + "grad_norm": 0.3204046604409768, + "learning_rate": 1.500947950627687e-05, + "loss": 2.7242, + "step": 49747 + }, + { + "epoch": 2.3161300835719443, + "grad_norm": 0.333378655044065, + "learning_rate": 1.5007544629556769e-05, + "loss": 2.7505, + "step": 49748 + }, + { + "epoch": 2.316176641758037, + "grad_norm": 0.3356615214380442, + "learning_rate": 1.5005609855536074e-05, + "loss": 2.6493, + "step": 49749 + }, + { + "epoch": 2.31622319994413, + "grad_norm": 0.3382349772169988, + "learning_rate": 1.5003675184220467e-05, + "loss": 2.7334, + "step": 49750 + }, + { + "epoch": 2.316269758130223, + "grad_norm": 0.3237242416314937, + "learning_rate": 1.5001740615615645e-05, + "loss": 2.624, + "step": 49751 + }, + { + "epoch": 2.3163163163163163, + "grad_norm": 0.33568293491174434, + "learning_rate": 1.4999806149727247e-05, + "loss": 2.543, + "step": 49752 + }, + { + "epoch": 2.3163628745024094, + "grad_norm": 0.32740653389410934, + "learning_rate": 1.4997871786560968e-05, + "loss": 2.6355, + "step": 49753 + }, + { + "epoch": 2.3164094326885025, + "grad_norm": 0.3257599867955206, + "learning_rate": 1.4995937526122495e-05, + "loss": 2.6171, + "step": 49754 + }, + { + "epoch": 2.3164559908745956, + "grad_norm": 0.3164299304029617, + "learning_rate": 1.4994003368417493e-05, + "loss": 2.6293, + "step": 49755 + }, + { + "epoch": 2.3165025490606888, + "grad_norm": 0.3260498219191931, + "learning_rate": 1.499206931345165e-05, + "loss": 
2.6742, + "step": 49756 + }, + { + "epoch": 2.3165491072467814, + "grad_norm": 0.3086169450305964, + "learning_rate": 1.4990135361230612e-05, + "loss": 2.583, + "step": 49757 + }, + { + "epoch": 2.3165956654328745, + "grad_norm": 0.3423714009987276, + "learning_rate": 1.4988201511760108e-05, + "loss": 2.7413, + "step": 49758 + }, + { + "epoch": 2.3166422236189677, + "grad_norm": 0.3172245302463272, + "learning_rate": 1.498626776504577e-05, + "loss": 2.7445, + "step": 49759 + }, + { + "epoch": 2.3166887818050608, + "grad_norm": 0.31849166083186775, + "learning_rate": 1.498433412109328e-05, + "loss": 2.6542, + "step": 49760 + }, + { + "epoch": 2.316735339991154, + "grad_norm": 0.3250464717695063, + "learning_rate": 1.4982400579908323e-05, + "loss": 2.5818, + "step": 49761 + }, + { + "epoch": 2.316781898177247, + "grad_norm": 0.3462989683979246, + "learning_rate": 1.4980467141496574e-05, + "loss": 2.5847, + "step": 49762 + }, + { + "epoch": 2.31682845636334, + "grad_norm": 0.32377151447544544, + "learning_rate": 1.4978533805863698e-05, + "loss": 2.6341, + "step": 49763 + }, + { + "epoch": 2.3168750145494332, + "grad_norm": 0.3547780797948919, + "learning_rate": 1.4976600573015398e-05, + "loss": 2.6456, + "step": 49764 + }, + { + "epoch": 2.3169215727355263, + "grad_norm": 0.3377710089483897, + "learning_rate": 1.4974667442957302e-05, + "loss": 2.6346, + "step": 49765 + }, + { + "epoch": 2.3169681309216195, + "grad_norm": 0.3248333208622409, + "learning_rate": 1.4972734415695111e-05, + "loss": 2.5896, + "step": 49766 + }, + { + "epoch": 2.3170146891077126, + "grad_norm": 0.3213156610498621, + "learning_rate": 1.4970801491234493e-05, + "loss": 2.6173, + "step": 49767 + }, + { + "epoch": 2.3170612472938052, + "grad_norm": 0.33317691358228374, + "learning_rate": 1.4968868669581121e-05, + "loss": 2.6306, + "step": 49768 + }, + { + "epoch": 2.3171078054798984, + "grad_norm": 0.3067685156590733, + "learning_rate": 1.4966935950740685e-05, + "loss": 2.5759, + "step": 49769 + }, + { + "epoch": 2.3171543636659915, + "grad_norm": 0.3426634995736079, + "learning_rate": 1.4965003334718803e-05, + "loss": 2.6848, + "step": 49770 + }, + { + "epoch": 2.3172009218520846, + "grad_norm": 0.32703588520966076, + "learning_rate": 1.4963070821521225e-05, + "loss": 2.571, + "step": 49771 + }, + { + "epoch": 2.3172474800381777, + "grad_norm": 0.30846069822374517, + "learning_rate": 1.4961138411153564e-05, + "loss": 2.5145, + "step": 49772 + }, + { + "epoch": 2.317294038224271, + "grad_norm": 0.36767834837878194, + "learning_rate": 1.4959206103621503e-05, + "loss": 2.7077, + "step": 49773 + }, + { + "epoch": 2.317340596410364, + "grad_norm": 0.3443528397559284, + "learning_rate": 1.495727389893073e-05, + "loss": 2.6777, + "step": 49774 + }, + { + "epoch": 2.317387154596457, + "grad_norm": 0.3197236169888874, + "learning_rate": 1.4955341797086902e-05, + "loss": 2.6537, + "step": 49775 + }, + { + "epoch": 2.31743371278255, + "grad_norm": 0.31785408511157576, + "learning_rate": 1.4953409798095691e-05, + "loss": 2.6068, + "step": 49776 + }, + { + "epoch": 2.317480270968643, + "grad_norm": 0.3298093936027294, + "learning_rate": 1.4951477901962784e-05, + "loss": 2.6213, + "step": 49777 + }, + { + "epoch": 2.317526829154736, + "grad_norm": 0.32850972762014924, + "learning_rate": 1.494954610869383e-05, + "loss": 2.5483, + "step": 49778 + }, + { + "epoch": 2.317573387340829, + "grad_norm": 0.33024487062026514, + "learning_rate": 1.4947614418294498e-05, + "loss": 2.5901, + "step": 49779 + }, + { + "epoch": 2.317619945526922, + 
"grad_norm": 0.33578992219290377, + "learning_rate": 1.4945682830770463e-05, + "loss": 2.658, + "step": 49780 + }, + { + "epoch": 2.3176665037130153, + "grad_norm": 0.34191998244050226, + "learning_rate": 1.49437513461274e-05, + "loss": 2.6096, + "step": 49781 + }, + { + "epoch": 2.3177130618991084, + "grad_norm": 0.3210456580986047, + "learning_rate": 1.4941819964370985e-05, + "loss": 2.6159, + "step": 49782 + }, + { + "epoch": 2.3177596200852015, + "grad_norm": 0.33526256165485274, + "learning_rate": 1.4939888685506847e-05, + "loss": 2.6561, + "step": 49783 + }, + { + "epoch": 2.3178061782712946, + "grad_norm": 0.3202314145745357, + "learning_rate": 1.4937957509540712e-05, + "loss": 2.6394, + "step": 49784 + }, + { + "epoch": 2.3178527364573878, + "grad_norm": 0.34175382904278795, + "learning_rate": 1.4936026436478184e-05, + "loss": 2.5892, + "step": 49785 + }, + { + "epoch": 2.317899294643481, + "grad_norm": 0.3271460782511574, + "learning_rate": 1.4934095466324998e-05, + "loss": 2.6659, + "step": 49786 + }, + { + "epoch": 2.317945852829574, + "grad_norm": 0.316146262733098, + "learning_rate": 1.4932164599086768e-05, + "loss": 2.6252, + "step": 49787 + }, + { + "epoch": 2.3179924110156667, + "grad_norm": 0.3071083779218375, + "learning_rate": 1.4930233834769186e-05, + "loss": 2.5845, + "step": 49788 + }, + { + "epoch": 2.31803896920176, + "grad_norm": 0.3379059044697699, + "learning_rate": 1.4928303173377906e-05, + "loss": 2.6297, + "step": 49789 + }, + { + "epoch": 2.318085527387853, + "grad_norm": 0.33794707705285976, + "learning_rate": 1.4926372614918605e-05, + "loss": 2.5886, + "step": 49790 + }, + { + "epoch": 2.318132085573946, + "grad_norm": 0.3118425415060953, + "learning_rate": 1.4924442159396961e-05, + "loss": 2.6112, + "step": 49791 + }, + { + "epoch": 2.318178643760039, + "grad_norm": 0.3215723716948926, + "learning_rate": 1.4922511806818606e-05, + "loss": 2.7146, + "step": 49792 + }, + { + "epoch": 2.3182252019461322, + "grad_norm": 0.3202628569161439, + "learning_rate": 1.4920581557189222e-05, + "loss": 2.604, + "step": 49793 + }, + { + "epoch": 2.3182717601322254, + "grad_norm": 0.31857987042582186, + "learning_rate": 1.4918651410514478e-05, + "loss": 2.6785, + "step": 49794 + }, + { + "epoch": 2.3183183183183185, + "grad_norm": 0.32531948075888806, + "learning_rate": 1.4916721366800052e-05, + "loss": 2.5916, + "step": 49795 + }, + { + "epoch": 2.318364876504411, + "grad_norm": 0.31072939229827556, + "learning_rate": 1.491479142605156e-05, + "loss": 2.7574, + "step": 49796 + }, + { + "epoch": 2.3184114346905043, + "grad_norm": 0.32056200094192683, + "learning_rate": 1.4912861588274735e-05, + "loss": 2.6516, + "step": 49797 + }, + { + "epoch": 2.3184579928765974, + "grad_norm": 0.34344418654365777, + "learning_rate": 1.4910931853475163e-05, + "loss": 2.645, + "step": 49798 + }, + { + "epoch": 2.3185045510626905, + "grad_norm": 0.3190061082196437, + "learning_rate": 1.4909002221658591e-05, + "loss": 2.5215, + "step": 49799 + }, + { + "epoch": 2.3185511092487836, + "grad_norm": 0.3448454122307639, + "learning_rate": 1.4907072692830614e-05, + "loss": 2.6658, + "step": 49800 + }, + { + "epoch": 2.3185976674348767, + "grad_norm": 0.30852465686556263, + "learning_rate": 1.490514326699693e-05, + "loss": 2.5905, + "step": 49801 + }, + { + "epoch": 2.31864422562097, + "grad_norm": 0.3393203468859926, + "learning_rate": 1.4903213944163192e-05, + "loss": 2.6796, + "step": 49802 + }, + { + "epoch": 2.318690783807063, + "grad_norm": 0.3028629067225795, + "learning_rate": 
1.4901284724335063e-05, + "loss": 2.5959, + "step": 49803 + }, + { + "epoch": 2.318737341993156, + "grad_norm": 0.31007668883641293, + "learning_rate": 1.4899355607518217e-05, + "loss": 2.5832, + "step": 49804 + }, + { + "epoch": 2.318783900179249, + "grad_norm": 0.3124709106863017, + "learning_rate": 1.4897426593718295e-05, + "loss": 2.5737, + "step": 49805 + }, + { + "epoch": 2.3188304583653423, + "grad_norm": 0.32900183921000037, + "learning_rate": 1.4895497682940963e-05, + "loss": 2.6328, + "step": 49806 + }, + { + "epoch": 2.318877016551435, + "grad_norm": 0.31232017388338895, + "learning_rate": 1.4893568875191888e-05, + "loss": 2.6206, + "step": 49807 + }, + { + "epoch": 2.318923574737528, + "grad_norm": 0.3261818974969936, + "learning_rate": 1.4891640170476728e-05, + "loss": 2.619, + "step": 49808 + }, + { + "epoch": 2.318970132923621, + "grad_norm": 0.31554779700072705, + "learning_rate": 1.4889711568801151e-05, + "loss": 2.5722, + "step": 49809 + }, + { + "epoch": 2.3190166911097143, + "grad_norm": 0.3241112148410544, + "learning_rate": 1.4887783070170823e-05, + "loss": 2.6123, + "step": 49810 + }, + { + "epoch": 2.3190632492958074, + "grad_norm": 0.3153814928069426, + "learning_rate": 1.488585467459136e-05, + "loss": 2.6477, + "step": 49811 + }, + { + "epoch": 2.3191098074819005, + "grad_norm": 0.34026352775280894, + "learning_rate": 1.4883926382068491e-05, + "loss": 2.6691, + "step": 49812 + }, + { + "epoch": 2.3191563656679937, + "grad_norm": 0.319728578162957, + "learning_rate": 1.4881998192607815e-05, + "loss": 2.6481, + "step": 49813 + }, + { + "epoch": 2.3192029238540868, + "grad_norm": 0.3263400649395429, + "learning_rate": 1.488007010621502e-05, + "loss": 2.6245, + "step": 49814 + }, + { + "epoch": 2.31924948204018, + "grad_norm": 0.3432723960609296, + "learning_rate": 1.487814212289576e-05, + "loss": 2.6822, + "step": 49815 + }, + { + "epoch": 2.3192960402262726, + "grad_norm": 0.3219367796497666, + "learning_rate": 1.487621424265569e-05, + "loss": 2.5964, + "step": 49816 + }, + { + "epoch": 2.3193425984123657, + "grad_norm": 0.336303758902616, + "learning_rate": 1.4874286465500486e-05, + "loss": 2.6681, + "step": 49817 + }, + { + "epoch": 2.319389156598459, + "grad_norm": 0.31311587454623874, + "learning_rate": 1.4872358791435782e-05, + "loss": 2.6258, + "step": 49818 + }, + { + "epoch": 2.319435714784552, + "grad_norm": 0.3488460730905568, + "learning_rate": 1.4870431220467235e-05, + "loss": 2.5855, + "step": 49819 + }, + { + "epoch": 2.319482272970645, + "grad_norm": 0.32748111614641895, + "learning_rate": 1.4868503752600515e-05, + "loss": 2.5984, + "step": 49820 + }, + { + "epoch": 2.319528831156738, + "grad_norm": 0.3343527595889753, + "learning_rate": 1.486657638784128e-05, + "loss": 2.725, + "step": 49821 + }, + { + "epoch": 2.3195753893428313, + "grad_norm": 0.3645144116589816, + "learning_rate": 1.4864649126195179e-05, + "loss": 2.6753, + "step": 49822 + }, + { + "epoch": 2.3196219475289244, + "grad_norm": 0.32751482645145974, + "learning_rate": 1.4862721967667886e-05, + "loss": 2.5555, + "step": 49823 + }, + { + "epoch": 2.3196685057150175, + "grad_norm": 0.33157047329865774, + "learning_rate": 1.4860794912265008e-05, + "loss": 2.6531, + "step": 49824 + }, + { + "epoch": 2.3197150639011106, + "grad_norm": 0.3285136784577327, + "learning_rate": 1.4858867959992272e-05, + "loss": 2.6293, + "step": 49825 + }, + { + "epoch": 2.3197616220872037, + "grad_norm": 0.31855347634901965, + "learning_rate": 1.4856941110855277e-05, + "loss": 2.6638, + "step": 49826 + }, + { + 
"epoch": 2.3198081802732964, + "grad_norm": 0.33161619342547904, + "learning_rate": 1.4855014364859704e-05, + "loss": 2.6293, + "step": 49827 + }, + { + "epoch": 2.3198547384593895, + "grad_norm": 0.356157054745045, + "learning_rate": 1.4853087722011194e-05, + "loss": 2.7124, + "step": 49828 + }, + { + "epoch": 2.3199012966454826, + "grad_norm": 0.32466402107342934, + "learning_rate": 1.4851161182315415e-05, + "loss": 2.7556, + "step": 49829 + }, + { + "epoch": 2.3199478548315757, + "grad_norm": 0.32870280784003886, + "learning_rate": 1.4849234745778024e-05, + "loss": 2.6337, + "step": 49830 + }, + { + "epoch": 2.319994413017669, + "grad_norm": 0.3318139614078585, + "learning_rate": 1.4847308412404653e-05, + "loss": 2.6798, + "step": 49831 + }, + { + "epoch": 2.320040971203762, + "grad_norm": 0.30648313363560525, + "learning_rate": 1.4845382182200972e-05, + "loss": 2.6064, + "step": 49832 + }, + { + "epoch": 2.320087529389855, + "grad_norm": 0.3357835243251541, + "learning_rate": 1.4843456055172627e-05, + "loss": 2.7129, + "step": 49833 + }, + { + "epoch": 2.320134087575948, + "grad_norm": 0.32494003004332206, + "learning_rate": 1.484153003132528e-05, + "loss": 2.6984, + "step": 49834 + }, + { + "epoch": 2.3201806457620413, + "grad_norm": 0.3231700957060417, + "learning_rate": 1.4839604110664574e-05, + "loss": 2.6539, + "step": 49835 + }, + { + "epoch": 2.320227203948134, + "grad_norm": 0.32821752179194164, + "learning_rate": 1.4837678293196184e-05, + "loss": 2.5959, + "step": 49836 + }, + { + "epoch": 2.320273762134227, + "grad_norm": 0.3516181085943948, + "learning_rate": 1.483575257892571e-05, + "loss": 2.6843, + "step": 49837 + }, + { + "epoch": 2.32032032032032, + "grad_norm": 0.31376860743657176, + "learning_rate": 1.4833826967858871e-05, + "loss": 2.62, + "step": 49838 + }, + { + "epoch": 2.3203668785064133, + "grad_norm": 0.3294841500057427, + "learning_rate": 1.4831901460001275e-05, + "loss": 2.6877, + "step": 49839 + }, + { + "epoch": 2.3204134366925064, + "grad_norm": 0.3118164072385419, + "learning_rate": 1.482997605535858e-05, + "loss": 2.5726, + "step": 49840 + }, + { + "epoch": 2.3204599948785996, + "grad_norm": 0.33499363963531176, + "learning_rate": 1.4828050753936435e-05, + "loss": 2.5498, + "step": 49841 + }, + { + "epoch": 2.3205065530646927, + "grad_norm": 0.3005998629055944, + "learning_rate": 1.48261255557405e-05, + "loss": 2.5656, + "step": 49842 + }, + { + "epoch": 2.320553111250786, + "grad_norm": 0.3016201877692375, + "learning_rate": 1.4824200460776438e-05, + "loss": 2.6544, + "step": 49843 + }, + { + "epoch": 2.320599669436879, + "grad_norm": 0.3407833048261889, + "learning_rate": 1.4822275469049851e-05, + "loss": 2.6475, + "step": 49844 + }, + { + "epoch": 2.320646227622972, + "grad_norm": 0.3522885069544518, + "learning_rate": 1.482035058056645e-05, + "loss": 2.6436, + "step": 49845 + }, + { + "epoch": 2.320692785809065, + "grad_norm": 0.3161534611225618, + "learning_rate": 1.4818425795331842e-05, + "loss": 2.5618, + "step": 49846 + }, + { + "epoch": 2.320739343995158, + "grad_norm": 0.31811340186027687, + "learning_rate": 1.4816501113351683e-05, + "loss": 2.6542, + "step": 49847 + }, + { + "epoch": 2.320785902181251, + "grad_norm": 0.33956011771902156, + "learning_rate": 1.481457653463163e-05, + "loss": 2.7147, + "step": 49848 + }, + { + "epoch": 2.320832460367344, + "grad_norm": 0.340722363502387, + "learning_rate": 1.481265205917734e-05, + "loss": 2.6724, + "step": 49849 + }, + { + "epoch": 2.320879018553437, + "grad_norm": 0.3108538670431992, + 
"learning_rate": 1.4810727686994424e-05, + "loss": 2.7188, + "step": 49850 + }, + { + "epoch": 2.3209255767395303, + "grad_norm": 0.31355610006439805, + "learning_rate": 1.4808803418088584e-05, + "loss": 2.5782, + "step": 49851 + }, + { + "epoch": 2.3209721349256234, + "grad_norm": 0.3381879137972681, + "learning_rate": 1.4806879252465427e-05, + "loss": 2.6196, + "step": 49852 + }, + { + "epoch": 2.3210186931117165, + "grad_norm": 0.3253882242804098, + "learning_rate": 1.4804955190130609e-05, + "loss": 2.6677, + "step": 49853 + }, + { + "epoch": 2.3210652512978096, + "grad_norm": 0.30753370001106156, + "learning_rate": 1.4803031231089782e-05, + "loss": 2.5769, + "step": 49854 + }, + { + "epoch": 2.3211118094839023, + "grad_norm": 0.3292462190024656, + "learning_rate": 1.4801107375348589e-05, + "loss": 2.6512, + "step": 49855 + }, + { + "epoch": 2.3211583676699954, + "grad_norm": 0.3312012275008529, + "learning_rate": 1.4799183622912694e-05, + "loss": 2.6285, + "step": 49856 + }, + { + "epoch": 2.3212049258560885, + "grad_norm": 0.34442862919202133, + "learning_rate": 1.4797259973787692e-05, + "loss": 2.6385, + "step": 49857 + }, + { + "epoch": 2.3212514840421816, + "grad_norm": 0.33845940226693955, + "learning_rate": 1.4795336427979301e-05, + "loss": 2.6678, + "step": 49858 + }, + { + "epoch": 2.3212980422282747, + "grad_norm": 0.32809411508927794, + "learning_rate": 1.4793412985493116e-05, + "loss": 2.6472, + "step": 49859 + }, + { + "epoch": 2.321344600414368, + "grad_norm": 0.35241854905980224, + "learning_rate": 1.4791489646334788e-05, + "loss": 2.6256, + "step": 49860 + }, + { + "epoch": 2.321391158600461, + "grad_norm": 0.35127821594711, + "learning_rate": 1.4789566410509975e-05, + "loss": 2.6721, + "step": 49861 + }, + { + "epoch": 2.321437716786554, + "grad_norm": 0.34164181856249487, + "learning_rate": 1.4787643278024311e-05, + "loss": 2.7327, + "step": 49862 + }, + { + "epoch": 2.321484274972647, + "grad_norm": 0.3221900502454643, + "learning_rate": 1.478572024888345e-05, + "loss": 2.5832, + "step": 49863 + }, + { + "epoch": 2.3215308331587403, + "grad_norm": 0.31002550201121254, + "learning_rate": 1.4783797323093046e-05, + "loss": 2.6344, + "step": 49864 + }, + { + "epoch": 2.3215773913448334, + "grad_norm": 0.3491646812307353, + "learning_rate": 1.478187450065871e-05, + "loss": 2.5849, + "step": 49865 + }, + { + "epoch": 2.321623949530926, + "grad_norm": 0.3481084384673818, + "learning_rate": 1.47799517815861e-05, + "loss": 2.6086, + "step": 49866 + }, + { + "epoch": 2.321670507717019, + "grad_norm": 0.33842106949105305, + "learning_rate": 1.4778029165880863e-05, + "loss": 2.772, + "step": 49867 + }, + { + "epoch": 2.3217170659031123, + "grad_norm": 0.3306666175551267, + "learning_rate": 1.4776106653548644e-05, + "loss": 2.7042, + "step": 49868 + }, + { + "epoch": 2.3217636240892054, + "grad_norm": 0.3210925978401093, + "learning_rate": 1.4774184244595097e-05, + "loss": 2.5097, + "step": 49869 + }, + { + "epoch": 2.3218101822752986, + "grad_norm": 0.3395132424069888, + "learning_rate": 1.4772261939025817e-05, + "loss": 2.5652, + "step": 49870 + }, + { + "epoch": 2.3218567404613917, + "grad_norm": 0.33229503767949, + "learning_rate": 1.4770339736846505e-05, + "loss": 2.582, + "step": 49871 + }, + { + "epoch": 2.321903298647485, + "grad_norm": 0.3201414724111579, + "learning_rate": 1.4768417638062764e-05, + "loss": 2.6186, + "step": 49872 + }, + { + "epoch": 2.321949856833578, + "grad_norm": 0.32084246837405317, + "learning_rate": 1.4766495642680245e-05, + "loss": 2.583, + 
"step": 49873 + }, + { + "epoch": 2.321996415019671, + "grad_norm": 0.3634055784110167, + "learning_rate": 1.4764573750704592e-05, + "loss": 2.638, + "step": 49874 + }, + { + "epoch": 2.3220429732057637, + "grad_norm": 0.31992355137339934, + "learning_rate": 1.4762651962141438e-05, + "loss": 2.6156, + "step": 49875 + }, + { + "epoch": 2.322089531391857, + "grad_norm": 0.332919300840597, + "learning_rate": 1.4760730276996438e-05, + "loss": 2.5853, + "step": 49876 + }, + { + "epoch": 2.32213608957795, + "grad_norm": 0.3629404602257742, + "learning_rate": 1.4758808695275233e-05, + "loss": 2.6568, + "step": 49877 + }, + { + "epoch": 2.322182647764043, + "grad_norm": 0.346646415246902, + "learning_rate": 1.4756887216983434e-05, + "loss": 2.6648, + "step": 49878 + }, + { + "epoch": 2.322229205950136, + "grad_norm": 0.33979073473601507, + "learning_rate": 1.4754965842126705e-05, + "loss": 2.7185, + "step": 49879 + }, + { + "epoch": 2.3222757641362293, + "grad_norm": 0.3362873923351807, + "learning_rate": 1.4753044570710677e-05, + "loss": 2.6879, + "step": 49880 + }, + { + "epoch": 2.3223223223223224, + "grad_norm": 0.3359796859481412, + "learning_rate": 1.4751123402740991e-05, + "loss": 2.602, + "step": 49881 + }, + { + "epoch": 2.3223688805084155, + "grad_norm": 0.30635535807114983, + "learning_rate": 1.47492023382233e-05, + "loss": 2.6999, + "step": 49882 + }, + { + "epoch": 2.3224154386945086, + "grad_norm": 0.34437444653640337, + "learning_rate": 1.4747281377163197e-05, + "loss": 2.5451, + "step": 49883 + }, + { + "epoch": 2.3224619968806017, + "grad_norm": 0.32163211040600526, + "learning_rate": 1.4745360519566381e-05, + "loss": 2.5554, + "step": 49884 + }, + { + "epoch": 2.322508555066695, + "grad_norm": 0.3246017232908862, + "learning_rate": 1.4743439765438443e-05, + "loss": 2.5769, + "step": 49885 + }, + { + "epoch": 2.3225551132527875, + "grad_norm": 0.3184957913951457, + "learning_rate": 1.4741519114785034e-05, + "loss": 2.6829, + "step": 49886 + }, + { + "epoch": 2.3226016714388806, + "grad_norm": 0.33543544337173925, + "learning_rate": 1.4739598567611796e-05, + "loss": 2.5907, + "step": 49887 + }, + { + "epoch": 2.3226482296249737, + "grad_norm": 0.3433776815025318, + "learning_rate": 1.4737678123924358e-05, + "loss": 2.6823, + "step": 49888 + }, + { + "epoch": 2.322694787811067, + "grad_norm": 0.31104256661754814, + "learning_rate": 1.4735757783728366e-05, + "loss": 2.5118, + "step": 49889 + }, + { + "epoch": 2.32274134599716, + "grad_norm": 0.3256369296579569, + "learning_rate": 1.4733837547029467e-05, + "loss": 2.6506, + "step": 49890 + }, + { + "epoch": 2.322787904183253, + "grad_norm": 0.34355221542394143, + "learning_rate": 1.4731917413833263e-05, + "loss": 2.6888, + "step": 49891 + }, + { + "epoch": 2.322834462369346, + "grad_norm": 0.3175599584150747, + "learning_rate": 1.4729997384145404e-05, + "loss": 2.7066, + "step": 49892 + }, + { + "epoch": 2.3228810205554393, + "grad_norm": 0.31294363568464734, + "learning_rate": 1.4728077457971529e-05, + "loss": 2.6326, + "step": 49893 + }, + { + "epoch": 2.322927578741532, + "grad_norm": 0.3228723388639694, + "learning_rate": 1.4726157635317273e-05, + "loss": 2.5764, + "step": 49894 + }, + { + "epoch": 2.322974136927625, + "grad_norm": 0.32855227226915135, + "learning_rate": 1.4724237916188288e-05, + "loss": 2.5795, + "step": 49895 + }, + { + "epoch": 2.3230206951137182, + "grad_norm": 0.3241400450197675, + "learning_rate": 1.4722318300590154e-05, + "loss": 2.6689, + "step": 49896 + }, + { + "epoch": 2.3230672532998113, + "grad_norm": 
0.3604115895705683, + "learning_rate": 1.4720398788528571e-05, + "loss": 2.6594, + "step": 49897 + }, + { + "epoch": 2.3231138114859045, + "grad_norm": 0.3118843181487444, + "learning_rate": 1.4718479380009115e-05, + "loss": 2.6844, + "step": 49898 + }, + { + "epoch": 2.3231603696719976, + "grad_norm": 0.3134081532761187, + "learning_rate": 1.4716560075037473e-05, + "loss": 2.6261, + "step": 49899 + }, + { + "epoch": 2.3232069278580907, + "grad_norm": 0.3192651265565366, + "learning_rate": 1.471464087361924e-05, + "loss": 2.6617, + "step": 49900 + }, + { + "epoch": 2.323253486044184, + "grad_norm": 0.3084637842933851, + "learning_rate": 1.4712721775760058e-05, + "loss": 2.5737, + "step": 49901 + }, + { + "epoch": 2.323300044230277, + "grad_norm": 0.30263529960073376, + "learning_rate": 1.471080278146556e-05, + "loss": 2.7833, + "step": 49902 + }, + { + "epoch": 2.32334660241637, + "grad_norm": 0.3225079088670756, + "learning_rate": 1.4708883890741393e-05, + "loss": 2.7602, + "step": 49903 + }, + { + "epoch": 2.323393160602463, + "grad_norm": 0.3304222601309089, + "learning_rate": 1.4706965103593162e-05, + "loss": 2.6606, + "step": 49904 + }, + { + "epoch": 2.323439718788556, + "grad_norm": 0.32338886051308385, + "learning_rate": 1.4705046420026513e-05, + "loss": 2.703, + "step": 49905 + }, + { + "epoch": 2.323486276974649, + "grad_norm": 0.34028180227997484, + "learning_rate": 1.470312784004707e-05, + "loss": 2.6114, + "step": 49906 + }, + { + "epoch": 2.323532835160742, + "grad_norm": 0.32653718960951644, + "learning_rate": 1.470120936366048e-05, + "loss": 2.5353, + "step": 49907 + }, + { + "epoch": 2.323579393346835, + "grad_norm": 0.33370285578599806, + "learning_rate": 1.4699290990872367e-05, + "loss": 2.6336, + "step": 49908 + }, + { + "epoch": 2.3236259515329283, + "grad_norm": 0.3310869041590161, + "learning_rate": 1.4697372721688335e-05, + "loss": 2.6871, + "step": 49909 + }, + { + "epoch": 2.3236725097190214, + "grad_norm": 0.3463299880374004, + "learning_rate": 1.4695454556114063e-05, + "loss": 2.6894, + "step": 49910 + }, + { + "epoch": 2.3237190679051145, + "grad_norm": 0.36025379785488754, + "learning_rate": 1.469353649415512e-05, + "loss": 2.6179, + "step": 49911 + }, + { + "epoch": 2.3237656260912076, + "grad_norm": 0.33437830025414494, + "learning_rate": 1.4691618535817204e-05, + "loss": 2.6544, + "step": 49912 + }, + { + "epoch": 2.3238121842773007, + "grad_norm": 0.33240963346297686, + "learning_rate": 1.4689700681105894e-05, + "loss": 2.7018, + "step": 49913 + }, + { + "epoch": 2.3238587424633934, + "grad_norm": 0.3149843281089434, + "learning_rate": 1.4687782930026834e-05, + "loss": 2.5628, + "step": 49914 + }, + { + "epoch": 2.3239053006494865, + "grad_norm": 0.3280974739788343, + "learning_rate": 1.4685865282585648e-05, + "loss": 2.5865, + "step": 49915 + }, + { + "epoch": 2.3239518588355796, + "grad_norm": 0.3333760549021668, + "learning_rate": 1.4683947738787973e-05, + "loss": 2.7073, + "step": 49916 + }, + { + "epoch": 2.3239984170216728, + "grad_norm": 0.32526630525417455, + "learning_rate": 1.468203029863945e-05, + "loss": 2.6227, + "step": 49917 + }, + { + "epoch": 2.324044975207766, + "grad_norm": 0.3027337101196876, + "learning_rate": 1.4680112962145665e-05, + "loss": 2.6633, + "step": 49918 + }, + { + "epoch": 2.324091533393859, + "grad_norm": 0.3122865009425497, + "learning_rate": 1.4678195729312277e-05, + "loss": 2.5546, + "step": 49919 + }, + { + "epoch": 2.324138091579952, + "grad_norm": 0.33172821910128486, + "learning_rate": 1.46762786001449e-05, + 
"loss": 2.6582, + "step": 49920 + }, + { + "epoch": 2.324184649766045, + "grad_norm": 0.33937419404161046, + "learning_rate": 1.467436157464918e-05, + "loss": 2.6642, + "step": 49921 + }, + { + "epoch": 2.3242312079521383, + "grad_norm": 0.31987819705433884, + "learning_rate": 1.4672444652830702e-05, + "loss": 2.724, + "step": 49922 + }, + { + "epoch": 2.3242777661382314, + "grad_norm": 0.3280878008943774, + "learning_rate": 1.4670527834695147e-05, + "loss": 2.6347, + "step": 49923 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.3596531847310656, + "learning_rate": 1.4668611120248077e-05, + "loss": 2.6805, + "step": 49924 + }, + { + "epoch": 2.3243708825104172, + "grad_norm": 0.3448172391449002, + "learning_rate": 1.4666694509495188e-05, + "loss": 2.686, + "step": 49925 + }, + { + "epoch": 2.3244174406965104, + "grad_norm": 0.3128479782972798, + "learning_rate": 1.4664778002442059e-05, + "loss": 2.6643, + "step": 49926 + }, + { + "epoch": 2.3244639988826035, + "grad_norm": 0.31543186624381747, + "learning_rate": 1.4662861599094318e-05, + "loss": 2.6099, + "step": 49927 + }, + { + "epoch": 2.3245105570686966, + "grad_norm": 0.3402198574051059, + "learning_rate": 1.4660945299457601e-05, + "loss": 2.7017, + "step": 49928 + }, + { + "epoch": 2.3245571152547897, + "grad_norm": 0.3414377893959447, + "learning_rate": 1.4659029103537525e-05, + "loss": 2.645, + "step": 49929 + }, + { + "epoch": 2.324603673440883, + "grad_norm": 0.3221263699051288, + "learning_rate": 1.4657113011339734e-05, + "loss": 2.7139, + "step": 49930 + }, + { + "epoch": 2.324650231626976, + "grad_norm": 0.33270553332240516, + "learning_rate": 1.4655197022869822e-05, + "loss": 2.6352, + "step": 49931 + }, + { + "epoch": 2.324696789813069, + "grad_norm": 0.3393273493164394, + "learning_rate": 1.4653281138133419e-05, + "loss": 2.693, + "step": 49932 + }, + { + "epoch": 2.3247433479991617, + "grad_norm": 0.33283025565373364, + "learning_rate": 1.4651365357136154e-05, + "loss": 2.5717, + "step": 49933 + }, + { + "epoch": 2.324789906185255, + "grad_norm": 0.3006598406638728, + "learning_rate": 1.4649449679883653e-05, + "loss": 2.5956, + "step": 49934 + }, + { + "epoch": 2.324836464371348, + "grad_norm": 0.321062994348697, + "learning_rate": 1.4647534106381532e-05, + "loss": 2.4914, + "step": 49935 + }, + { + "epoch": 2.324883022557441, + "grad_norm": 0.34403817397908787, + "learning_rate": 1.4645618636635433e-05, + "loss": 2.617, + "step": 49936 + }, + { + "epoch": 2.324929580743534, + "grad_norm": 0.3333856027404634, + "learning_rate": 1.4643703270650933e-05, + "loss": 2.7328, + "step": 49937 + }, + { + "epoch": 2.3249761389296273, + "grad_norm": 0.3186575366019016, + "learning_rate": 1.4641788008433705e-05, + "loss": 2.6633, + "step": 49938 + }, + { + "epoch": 2.3250226971157204, + "grad_norm": 0.32281036566768645, + "learning_rate": 1.4639872849989333e-05, + "loss": 2.6038, + "step": 49939 + }, + { + "epoch": 2.3250692553018135, + "grad_norm": 0.34962605552366066, + "learning_rate": 1.4637957795323448e-05, + "loss": 2.7189, + "step": 49940 + }, + { + "epoch": 2.3251158134879066, + "grad_norm": 0.3394281135929824, + "learning_rate": 1.4636042844441677e-05, + "loss": 2.601, + "step": 49941 + }, + { + "epoch": 2.3251623716739998, + "grad_norm": 0.3086579751767786, + "learning_rate": 1.4634127997349633e-05, + "loss": 2.5375, + "step": 49942 + }, + { + "epoch": 2.325208929860093, + "grad_norm": 0.3202713293557162, + "learning_rate": 1.463221325405294e-05, + "loss": 2.5986, + "step": 49943 + }, + { + "epoch": 
2.3252554880461855, + "grad_norm": 0.3493435424904661, + "learning_rate": 1.4630298614557236e-05, + "loss": 2.6563, + "step": 49944 + }, + { + "epoch": 2.3253020462322787, + "grad_norm": 0.32488812734904177, + "learning_rate": 1.4628384078868102e-05, + "loss": 2.5831, + "step": 49945 + }, + { + "epoch": 2.3253486044183718, + "grad_norm": 0.3237608904883717, + "learning_rate": 1.4626469646991175e-05, + "loss": 2.6492, + "step": 49946 + }, + { + "epoch": 2.325395162604465, + "grad_norm": 0.3181535731651945, + "learning_rate": 1.4624555318932081e-05, + "loss": 2.5898, + "step": 49947 + }, + { + "epoch": 2.325441720790558, + "grad_norm": 0.3151016805817011, + "learning_rate": 1.4622641094696427e-05, + "loss": 2.5526, + "step": 49948 + }, + { + "epoch": 2.325488278976651, + "grad_norm": 0.344948551329486, + "learning_rate": 1.462072697428985e-05, + "loss": 2.6556, + "step": 49949 + }, + { + "epoch": 2.3255348371627442, + "grad_norm": 0.3334253913104713, + "learning_rate": 1.4618812957717925e-05, + "loss": 2.599, + "step": 49950 + }, + { + "epoch": 2.3255813953488373, + "grad_norm": 0.3156443183589161, + "learning_rate": 1.4616899044986333e-05, + "loss": 2.6228, + "step": 49951 + }, + { + "epoch": 2.3256279535349305, + "grad_norm": 0.31434369677015755, + "learning_rate": 1.461498523610062e-05, + "loss": 2.649, + "step": 49952 + }, + { + "epoch": 2.325674511721023, + "grad_norm": 0.33202116787064934, + "learning_rate": 1.4613071531066474e-05, + "loss": 2.5004, + "step": 49953 + }, + { + "epoch": 2.3257210699071162, + "grad_norm": 0.3500767035647347, + "learning_rate": 1.4611157929889457e-05, + "loss": 2.6498, + "step": 49954 + }, + { + "epoch": 2.3257676280932094, + "grad_norm": 0.3198574500231312, + "learning_rate": 1.4609244432575209e-05, + "loss": 2.6255, + "step": 49955 + }, + { + "epoch": 2.3258141862793025, + "grad_norm": 0.339678694266933, + "learning_rate": 1.460733103912934e-05, + "loss": 2.67, + "step": 49956 + }, + { + "epoch": 2.3258607444653956, + "grad_norm": 0.31772303606745356, + "learning_rate": 1.4605417749557482e-05, + "loss": 2.6713, + "step": 49957 + }, + { + "epoch": 2.3259073026514887, + "grad_norm": 0.3291461799454653, + "learning_rate": 1.4603504563865223e-05, + "loss": 2.5752, + "step": 49958 + }, + { + "epoch": 2.325953860837582, + "grad_norm": 0.3380655549737732, + "learning_rate": 1.4601591482058191e-05, + "loss": 2.6483, + "step": 49959 + }, + { + "epoch": 2.326000419023675, + "grad_norm": 0.3354507960219197, + "learning_rate": 1.4599678504142001e-05, + "loss": 2.5531, + "step": 49960 + }, + { + "epoch": 2.326046977209768, + "grad_norm": 0.32789601839645066, + "learning_rate": 1.4597765630122267e-05, + "loss": 2.5777, + "step": 49961 + }, + { + "epoch": 2.326093535395861, + "grad_norm": 0.3304321602462965, + "learning_rate": 1.4595852860004615e-05, + "loss": 2.6232, + "step": 49962 + }, + { + "epoch": 2.3261400935819543, + "grad_norm": 0.3360916898724084, + "learning_rate": 1.459394019379462e-05, + "loss": 2.6526, + "step": 49963 + }, + { + "epoch": 2.326186651768047, + "grad_norm": 0.32821408746683445, + "learning_rate": 1.459202763149795e-05, + "loss": 2.5728, + "step": 49964 + }, + { + "epoch": 2.32623320995414, + "grad_norm": 0.3487246162750547, + "learning_rate": 1.4590115173120167e-05, + "loss": 2.6884, + "step": 49965 + }, + { + "epoch": 2.326279768140233, + "grad_norm": 0.3247740002223853, + "learning_rate": 1.458820281866693e-05, + "loss": 2.6781, + "step": 49966 + }, + { + "epoch": 2.3263263263263263, + "grad_norm": 0.34823020803294696, + "learning_rate": 
1.4586290568143818e-05, + "loss": 2.6579, + "step": 49967 + }, + { + "epoch": 2.3263728845124194, + "grad_norm": 0.3088116588768765, + "learning_rate": 1.4584378421556455e-05, + "loss": 2.6691, + "step": 49968 + }, + { + "epoch": 2.3264194426985125, + "grad_norm": 0.32529299322514277, + "learning_rate": 1.4582466378910448e-05, + "loss": 2.676, + "step": 49969 + }, + { + "epoch": 2.3264660008846056, + "grad_norm": 0.30740161025386437, + "learning_rate": 1.458055444021142e-05, + "loss": 2.5955, + "step": 49970 + }, + { + "epoch": 2.3265125590706988, + "grad_norm": 0.32651561735476703, + "learning_rate": 1.4578642605464981e-05, + "loss": 2.6673, + "step": 49971 + }, + { + "epoch": 2.3265591172567914, + "grad_norm": 0.31895074363527925, + "learning_rate": 1.457673087467673e-05, + "loss": 2.6027, + "step": 49972 + }, + { + "epoch": 2.3266056754428845, + "grad_norm": 0.3383241141366062, + "learning_rate": 1.4574819247852278e-05, + "loss": 2.6648, + "step": 49973 + }, + { + "epoch": 2.3266522336289777, + "grad_norm": 0.30939897322677923, + "learning_rate": 1.4572907724997248e-05, + "loss": 2.6586, + "step": 49974 + }, + { + "epoch": 2.3266987918150708, + "grad_norm": 0.30771130654358736, + "learning_rate": 1.4570996306117252e-05, + "loss": 2.5774, + "step": 49975 + }, + { + "epoch": 2.326745350001164, + "grad_norm": 0.3094863478798056, + "learning_rate": 1.4569084991217862e-05, + "loss": 2.6425, + "step": 49976 + }, + { + "epoch": 2.326791908187257, + "grad_norm": 0.34184035601477497, + "learning_rate": 1.456717378030475e-05, + "loss": 2.5706, + "step": 49977 + }, + { + "epoch": 2.32683846637335, + "grad_norm": 0.3132869796953672, + "learning_rate": 1.4565262673383462e-05, + "loss": 2.66, + "step": 49978 + }, + { + "epoch": 2.3268850245594432, + "grad_norm": 0.3363346937402326, + "learning_rate": 1.4563351670459669e-05, + "loss": 2.7068, + "step": 49979 + }, + { + "epoch": 2.3269315827455364, + "grad_norm": 0.32685850617547124, + "learning_rate": 1.456144077153893e-05, + "loss": 2.6879, + "step": 49980 + }, + { + "epoch": 2.3269781409316295, + "grad_norm": 0.3107442142758488, + "learning_rate": 1.4559529976626868e-05, + "loss": 2.64, + "step": 49981 + }, + { + "epoch": 2.3270246991177226, + "grad_norm": 0.32677065084306817, + "learning_rate": 1.4557619285729102e-05, + "loss": 2.6579, + "step": 49982 + }, + { + "epoch": 2.3270712573038153, + "grad_norm": 0.3189222507265499, + "learning_rate": 1.4555708698851222e-05, + "loss": 2.6092, + "step": 49983 + }, + { + "epoch": 2.3271178154899084, + "grad_norm": 0.32167014517965953, + "learning_rate": 1.455379821599887e-05, + "loss": 2.6089, + "step": 49984 + }, + { + "epoch": 2.3271643736760015, + "grad_norm": 0.2984772766431951, + "learning_rate": 1.4551887837177609e-05, + "loss": 2.606, + "step": 49985 + }, + { + "epoch": 2.3272109318620946, + "grad_norm": 0.335959053052558, + "learning_rate": 1.4549977562393063e-05, + "loss": 2.6275, + "step": 49986 + }, + { + "epoch": 2.3272574900481877, + "grad_norm": 0.34357500862215307, + "learning_rate": 1.4548067391650844e-05, + "loss": 2.7671, + "step": 49987 + }, + { + "epoch": 2.327304048234281, + "grad_norm": 0.3098888140869485, + "learning_rate": 1.454615732495655e-05, + "loss": 2.5739, + "step": 49988 + }, + { + "epoch": 2.327350606420374, + "grad_norm": 0.33128897886291436, + "learning_rate": 1.4544247362315794e-05, + "loss": 2.5916, + "step": 49989 + }, + { + "epoch": 2.327397164606467, + "grad_norm": 0.3317990551077226, + "learning_rate": 1.4542337503734193e-05, + "loss": 2.6065, + "step": 49990 + }, + { 
+ "epoch": 2.32744372279256, + "grad_norm": 0.33232168985430593, + "learning_rate": 1.4540427749217311e-05, + "loss": 2.6041, + "step": 49991 + }, + { + "epoch": 2.327490280978653, + "grad_norm": 0.3355809606839016, + "learning_rate": 1.4538518098770808e-05, + "loss": 2.6823, + "step": 49992 + }, + { + "epoch": 2.327536839164746, + "grad_norm": 0.31847691175090675, + "learning_rate": 1.4536608552400249e-05, + "loss": 2.6929, + "step": 49993 + }, + { + "epoch": 2.327583397350839, + "grad_norm": 0.3410381226898459, + "learning_rate": 1.4534699110111245e-05, + "loss": 2.5941, + "step": 49994 + }, + { + "epoch": 2.327629955536932, + "grad_norm": 0.33481447324925456, + "learning_rate": 1.4532789771909411e-05, + "loss": 2.6981, + "step": 49995 + }, + { + "epoch": 2.3276765137230253, + "grad_norm": 0.32182055947192373, + "learning_rate": 1.453088053780034e-05, + "loss": 2.6178, + "step": 49996 + }, + { + "epoch": 2.3277230719091184, + "grad_norm": 0.3293408511047828, + "learning_rate": 1.4528971407789654e-05, + "loss": 2.6677, + "step": 49997 + }, + { + "epoch": 2.3277696300952115, + "grad_norm": 0.3396422474630734, + "learning_rate": 1.4527062381882934e-05, + "loss": 2.6394, + "step": 49998 + }, + { + "epoch": 2.3278161882813047, + "grad_norm": 0.323165909641071, + "learning_rate": 1.4525153460085789e-05, + "loss": 2.5709, + "step": 49999 + }, + { + "epoch": 2.3278627464673978, + "grad_norm": 0.3294169418168801, + "learning_rate": 1.4523244642403821e-05, + "loss": 2.6941, + "step": 50000 + }, + { + "epoch": 2.327909304653491, + "grad_norm": 0.3100883255066283, + "learning_rate": 1.4521335928842634e-05, + "loss": 2.6251, + "step": 50001 + }, + { + "epoch": 2.327955862839584, + "grad_norm": 0.3183124697062098, + "learning_rate": 1.4519427319407836e-05, + "loss": 2.6117, + "step": 50002 + }, + { + "epoch": 2.3280024210256767, + "grad_norm": 0.3124476862507778, + "learning_rate": 1.4517518814105035e-05, + "loss": 2.6658, + "step": 50003 + }, + { + "epoch": 2.32804897921177, + "grad_norm": 0.3497084983247242, + "learning_rate": 1.451561041293979e-05, + "loss": 2.655, + "step": 50004 + }, + { + "epoch": 2.328095537397863, + "grad_norm": 0.3194570919167293, + "learning_rate": 1.4513702115917765e-05, + "loss": 2.6364, + "step": 50005 + }, + { + "epoch": 2.328142095583956, + "grad_norm": 0.3324504878433116, + "learning_rate": 1.4511793923044515e-05, + "loss": 2.5535, + "step": 50006 + }, + { + "epoch": 2.328188653770049, + "grad_norm": 0.3346046453030334, + "learning_rate": 1.4509885834325648e-05, + "loss": 2.5669, + "step": 50007 + }, + { + "epoch": 2.3282352119561422, + "grad_norm": 0.30268609526448875, + "learning_rate": 1.4507977849766774e-05, + "loss": 2.6209, + "step": 50008 + }, + { + "epoch": 2.3282817701422354, + "grad_norm": 0.34109103389757417, + "learning_rate": 1.4506069969373487e-05, + "loss": 2.733, + "step": 50009 + }, + { + "epoch": 2.3283283283283285, + "grad_norm": 0.33797340609012316, + "learning_rate": 1.45041621931514e-05, + "loss": 2.7303, + "step": 50010 + }, + { + "epoch": 2.328374886514421, + "grad_norm": 0.3225685517665973, + "learning_rate": 1.450225452110609e-05, + "loss": 2.6828, + "step": 50011 + }, + { + "epoch": 2.3284214447005143, + "grad_norm": 0.33793626682552996, + "learning_rate": 1.4500346953243166e-05, + "loss": 2.6937, + "step": 50012 + }, + { + "epoch": 2.3284680028866074, + "grad_norm": 0.3268328800198124, + "learning_rate": 1.4498439489568221e-05, + "loss": 2.4581, + "step": 50013 + }, + { + "epoch": 2.3285145610727005, + "grad_norm": 0.33747079918647727, + 
"learning_rate": 1.4496532130086864e-05, + "loss": 2.7397, + "step": 50014 + }, + { + "epoch": 2.3285611192587936, + "grad_norm": 0.3215464755755718, + "learning_rate": 1.4494624874804685e-05, + "loss": 2.7502, + "step": 50015 + }, + { + "epoch": 2.3286076774448867, + "grad_norm": 0.3270868600968133, + "learning_rate": 1.44927177237273e-05, + "loss": 2.6989, + "step": 50016 + }, + { + "epoch": 2.32865423563098, + "grad_norm": 0.33041240078517836, + "learning_rate": 1.4490810676860256e-05, + "loss": 2.6916, + "step": 50017 + }, + { + "epoch": 2.328700793817073, + "grad_norm": 0.3351381445224009, + "learning_rate": 1.4488903734209218e-05, + "loss": 2.6761, + "step": 50018 + }, + { + "epoch": 2.328747352003166, + "grad_norm": 0.3382842965561169, + "learning_rate": 1.4486996895779737e-05, + "loss": 2.5917, + "step": 50019 + }, + { + "epoch": 2.328793910189259, + "grad_norm": 0.32322945366971356, + "learning_rate": 1.4485090161577419e-05, + "loss": 2.5957, + "step": 50020 + }, + { + "epoch": 2.3288404683753523, + "grad_norm": 0.315462092985303, + "learning_rate": 1.4483183531607863e-05, + "loss": 2.6107, + "step": 50021 + }, + { + "epoch": 2.328887026561445, + "grad_norm": 0.33576699919682435, + "learning_rate": 1.4481277005876664e-05, + "loss": 2.6769, + "step": 50022 + }, + { + "epoch": 2.328933584747538, + "grad_norm": 0.3397997875260012, + "learning_rate": 1.4479370584389435e-05, + "loss": 2.6717, + "step": 50023 + }, + { + "epoch": 2.328980142933631, + "grad_norm": 0.3238781911187559, + "learning_rate": 1.447746426715172e-05, + "loss": 2.6329, + "step": 50024 + }, + { + "epoch": 2.3290267011197243, + "grad_norm": 0.35147882530950025, + "learning_rate": 1.4475558054169181e-05, + "loss": 2.7039, + "step": 50025 + }, + { + "epoch": 2.3290732593058174, + "grad_norm": 0.3335445335644033, + "learning_rate": 1.4473651945447363e-05, + "loss": 2.5325, + "step": 50026 + }, + { + "epoch": 2.3291198174919105, + "grad_norm": 0.3342603728716061, + "learning_rate": 1.4471745940991876e-05, + "loss": 2.7075, + "step": 50027 + }, + { + "epoch": 2.3291663756780037, + "grad_norm": 0.3099114891060124, + "learning_rate": 1.4469840040808314e-05, + "loss": 2.637, + "step": 50028 + }, + { + "epoch": 2.329212933864097, + "grad_norm": 0.36163084350321767, + "learning_rate": 1.4467934244902288e-05, + "loss": 2.7629, + "step": 50029 + }, + { + "epoch": 2.32925949205019, + "grad_norm": 0.3242259550705726, + "learning_rate": 1.4466028553279343e-05, + "loss": 2.5757, + "step": 50030 + }, + { + "epoch": 2.3293060502362826, + "grad_norm": 0.3324048529529534, + "learning_rate": 1.4464122965945137e-05, + "loss": 2.6356, + "step": 50031 + }, + { + "epoch": 2.3293526084223757, + "grad_norm": 0.326262836718621, + "learning_rate": 1.446221748290521e-05, + "loss": 2.6587, + "step": 50032 + }, + { + "epoch": 2.329399166608469, + "grad_norm": 0.3441807326661819, + "learning_rate": 1.4460312104165175e-05, + "loss": 2.6727, + "step": 50033 + }, + { + "epoch": 2.329445724794562, + "grad_norm": 0.3498325157360418, + "learning_rate": 1.4458406829730619e-05, + "loss": 2.7635, + "step": 50034 + }, + { + "epoch": 2.329492282980655, + "grad_norm": 0.33897418781161953, + "learning_rate": 1.4456501659607147e-05, + "loss": 2.6504, + "step": 50035 + }, + { + "epoch": 2.329538841166748, + "grad_norm": 0.3352122781874524, + "learning_rate": 1.4454596593800352e-05, + "loss": 2.6112, + "step": 50036 + }, + { + "epoch": 2.3295853993528413, + "grad_norm": 0.32904075812351496, + "learning_rate": 1.4452691632315779e-05, + "loss": 2.6072, + "step": 
50037 + }, + { + "epoch": 2.3296319575389344, + "grad_norm": 0.3473485643534871, + "learning_rate": 1.4450786775159087e-05, + "loss": 2.5974, + "step": 50038 + }, + { + "epoch": 2.3296785157250275, + "grad_norm": 0.3400089484969412, + "learning_rate": 1.4448882022335819e-05, + "loss": 2.7377, + "step": 50039 + }, + { + "epoch": 2.3297250739111206, + "grad_norm": 0.32463085790621365, + "learning_rate": 1.4446977373851584e-05, + "loss": 2.6143, + "step": 50040 + }, + { + "epoch": 2.3297716320972137, + "grad_norm": 0.34488720473486234, + "learning_rate": 1.4445072829711959e-05, + "loss": 2.6174, + "step": 50041 + }, + { + "epoch": 2.3298181902833064, + "grad_norm": 0.3374631192851742, + "learning_rate": 1.444316838992255e-05, + "loss": 2.5902, + "step": 50042 + }, + { + "epoch": 2.3298647484693995, + "grad_norm": 0.3692118951314359, + "learning_rate": 1.4441264054488935e-05, + "loss": 2.7331, + "step": 50043 + }, + { + "epoch": 2.3299113066554926, + "grad_norm": 0.319565915214757, + "learning_rate": 1.4439359823416726e-05, + "loss": 2.5907, + "step": 50044 + }, + { + "epoch": 2.3299578648415857, + "grad_norm": 0.3247955545570594, + "learning_rate": 1.4437455696711471e-05, + "loss": 2.6471, + "step": 50045 + }, + { + "epoch": 2.330004423027679, + "grad_norm": 0.32347606218792185, + "learning_rate": 1.4435551674378788e-05, + "loss": 2.6863, + "step": 50046 + }, + { + "epoch": 2.330050981213772, + "grad_norm": 0.33179525978787505, + "learning_rate": 1.4433647756424251e-05, + "loss": 2.6812, + "step": 50047 + }, + { + "epoch": 2.330097539399865, + "grad_norm": 0.3296479308135795, + "learning_rate": 1.4431743942853453e-05, + "loss": 2.6606, + "step": 50048 + }, + { + "epoch": 2.330144097585958, + "grad_norm": 0.33212039246054276, + "learning_rate": 1.4429840233672004e-05, + "loss": 2.7129, + "step": 50049 + }, + { + "epoch": 2.3301906557720513, + "grad_norm": 0.3180278450536806, + "learning_rate": 1.4427936628885436e-05, + "loss": 2.7105, + "step": 50050 + }, + { + "epoch": 2.330237213958144, + "grad_norm": 0.3199562836354059, + "learning_rate": 1.44260331284994e-05, + "loss": 2.6801, + "step": 50051 + }, + { + "epoch": 2.330283772144237, + "grad_norm": 0.3222554118223409, + "learning_rate": 1.4424129732519437e-05, + "loss": 2.6744, + "step": 50052 + }, + { + "epoch": 2.33033033033033, + "grad_norm": 0.33480977767349146, + "learning_rate": 1.4422226440951148e-05, + "loss": 2.5931, + "step": 50053 + }, + { + "epoch": 2.3303768885164233, + "grad_norm": 0.3169990503163881, + "learning_rate": 1.4420323253800122e-05, + "loss": 2.6469, + "step": 50054 + }, + { + "epoch": 2.3304234467025164, + "grad_norm": 0.323715224586403, + "learning_rate": 1.441842017107194e-05, + "loss": 2.601, + "step": 50055 + }, + { + "epoch": 2.3304700048886096, + "grad_norm": 0.32031351546370684, + "learning_rate": 1.4416517192772188e-05, + "loss": 2.6662, + "step": 50056 + }, + { + "epoch": 2.3305165630747027, + "grad_norm": 0.3579499645344909, + "learning_rate": 1.4414614318906467e-05, + "loss": 2.6602, + "step": 50057 + }, + { + "epoch": 2.330563121260796, + "grad_norm": 0.3262972010036469, + "learning_rate": 1.4412711549480334e-05, + "loss": 2.621, + "step": 50058 + }, + { + "epoch": 2.330609679446889, + "grad_norm": 0.3372973769910365, + "learning_rate": 1.4410808884499388e-05, + "loss": 2.6385, + "step": 50059 + }, + { + "epoch": 2.330656237632982, + "grad_norm": 0.34197328514846126, + "learning_rate": 1.4408906323969213e-05, + "loss": 2.6488, + "step": 50060 + }, + { + "epoch": 2.330702795819075, + "grad_norm": 
0.3423193175245286, + "learning_rate": 1.4407003867895385e-05, + "loss": 2.5657, + "step": 50061 + }, + { + "epoch": 2.330749354005168, + "grad_norm": 0.33283764606455113, + "learning_rate": 1.4405101516283515e-05, + "loss": 2.5616, + "step": 50062 + }, + { + "epoch": 2.330795912191261, + "grad_norm": 0.3324093853415508, + "learning_rate": 1.4403199269139134e-05, + "loss": 2.5418, + "step": 50063 + }, + { + "epoch": 2.330842470377354, + "grad_norm": 0.3669513600552409, + "learning_rate": 1.4401297126467883e-05, + "loss": 2.6753, + "step": 50064 + }, + { + "epoch": 2.330889028563447, + "grad_norm": 0.33477853283579573, + "learning_rate": 1.4399395088275308e-05, + "loss": 2.6038, + "step": 50065 + }, + { + "epoch": 2.3309355867495403, + "grad_norm": 0.3324874761683795, + "learning_rate": 1.4397493154566999e-05, + "loss": 2.5897, + "step": 50066 + }, + { + "epoch": 2.3309821449356334, + "grad_norm": 0.331629809803058, + "learning_rate": 1.439559132534854e-05, + "loss": 2.5752, + "step": 50067 + }, + { + "epoch": 2.3310287031217265, + "grad_norm": 0.33946773759249116, + "learning_rate": 1.4393689600625516e-05, + "loss": 2.6515, + "step": 50068 + }, + { + "epoch": 2.3310752613078196, + "grad_norm": 0.3350198865056265, + "learning_rate": 1.4391787980403499e-05, + "loss": 2.7021, + "step": 50069 + }, + { + "epoch": 2.3311218194939123, + "grad_norm": 0.336783057429319, + "learning_rate": 1.4389886464688097e-05, + "loss": 2.5364, + "step": 50070 + }, + { + "epoch": 2.3311683776800054, + "grad_norm": 0.3195916163925743, + "learning_rate": 1.4387985053484848e-05, + "loss": 2.6485, + "step": 50071 + }, + { + "epoch": 2.3312149358660985, + "grad_norm": 0.3111530096752747, + "learning_rate": 1.4386083746799362e-05, + "loss": 2.579, + "step": 50072 + }, + { + "epoch": 2.3312614940521916, + "grad_norm": 0.32379563206858053, + "learning_rate": 1.4384182544637209e-05, + "loss": 2.685, + "step": 50073 + }, + { + "epoch": 2.3313080522382847, + "grad_norm": 0.3163403440908907, + "learning_rate": 1.438228144700397e-05, + "loss": 2.5959, + "step": 50074 + }, + { + "epoch": 2.331354610424378, + "grad_norm": 0.33145233101109933, + "learning_rate": 1.4380380453905245e-05, + "loss": 2.6653, + "step": 50075 + }, + { + "epoch": 2.331401168610471, + "grad_norm": 0.33217991460543383, + "learning_rate": 1.437847956534656e-05, + "loss": 2.6268, + "step": 50076 + }, + { + "epoch": 2.331447726796564, + "grad_norm": 0.32321891477411535, + "learning_rate": 1.437657878133356e-05, + "loss": 2.6132, + "step": 50077 + }, + { + "epoch": 2.331494284982657, + "grad_norm": 0.337038589440647, + "learning_rate": 1.4374678101871758e-05, + "loss": 2.669, + "step": 50078 + }, + { + "epoch": 2.3315408431687503, + "grad_norm": 0.3576359570872759, + "learning_rate": 1.4372777526966802e-05, + "loss": 2.7273, + "step": 50079 + }, + { + "epoch": 2.3315874013548434, + "grad_norm": 0.34307213904002654, + "learning_rate": 1.4370877056624216e-05, + "loss": 2.7526, + "step": 50080 + }, + { + "epoch": 2.331633959540936, + "grad_norm": 0.3328992060456821, + "learning_rate": 1.4368976690849594e-05, + "loss": 2.5868, + "step": 50081 + }, + { + "epoch": 2.3316805177270292, + "grad_norm": 0.339694908672213, + "learning_rate": 1.4367076429648519e-05, + "loss": 2.6368, + "step": 50082 + }, + { + "epoch": 2.3317270759131223, + "grad_norm": 0.3367046612742082, + "learning_rate": 1.4365176273026582e-05, + "loss": 2.6721, + "step": 50083 + }, + { + "epoch": 2.3317736340992155, + "grad_norm": 0.34854261575418666, + "learning_rate": 1.4363276220989318e-05, + 
"loss": 2.6728, + "step": 50084 + }, + { + "epoch": 2.3318201922853086, + "grad_norm": 0.3088416928994783, + "learning_rate": 1.4361376273542332e-05, + "loss": 2.6162, + "step": 50085 + }, + { + "epoch": 2.3318667504714017, + "grad_norm": 0.3436980446632951, + "learning_rate": 1.4359476430691198e-05, + "loss": 2.6353, + "step": 50086 + }, + { + "epoch": 2.331913308657495, + "grad_norm": 0.3130233345487367, + "learning_rate": 1.4357576692441488e-05, + "loss": 2.6076, + "step": 50087 + }, + { + "epoch": 2.331959866843588, + "grad_norm": 0.3467933435999147, + "learning_rate": 1.4355677058798794e-05, + "loss": 2.6086, + "step": 50088 + }, + { + "epoch": 2.332006425029681, + "grad_norm": 0.3207680228119167, + "learning_rate": 1.4353777529768647e-05, + "loss": 2.5992, + "step": 50089 + }, + { + "epoch": 2.3320529832157737, + "grad_norm": 0.322089315566714, + "learning_rate": 1.4351878105356676e-05, + "loss": 2.7061, + "step": 50090 + }, + { + "epoch": 2.332099541401867, + "grad_norm": 0.33227751232116604, + "learning_rate": 1.4349978785568408e-05, + "loss": 2.5467, + "step": 50091 + }, + { + "epoch": 2.33214609958796, + "grad_norm": 0.3477181591021419, + "learning_rate": 1.4348079570409468e-05, + "loss": 2.5932, + "step": 50092 + }, + { + "epoch": 2.332192657774053, + "grad_norm": 0.31391824099983107, + "learning_rate": 1.4346180459885384e-05, + "loss": 2.6476, + "step": 50093 + }, + { + "epoch": 2.332239215960146, + "grad_norm": 0.31381596985822796, + "learning_rate": 1.434428145400175e-05, + "loss": 2.6085, + "step": 50094 + }, + { + "epoch": 2.3322857741462393, + "grad_norm": 0.3431999578645945, + "learning_rate": 1.434238255276414e-05, + "loss": 2.58, + "step": 50095 + }, + { + "epoch": 2.3323323323323324, + "grad_norm": 0.32515095241414516, + "learning_rate": 1.434048375617812e-05, + "loss": 2.6126, + "step": 50096 + }, + { + "epoch": 2.3323788905184255, + "grad_norm": 0.33612635391480045, + "learning_rate": 1.4338585064249283e-05, + "loss": 2.6601, + "step": 50097 + }, + { + "epoch": 2.3324254487045186, + "grad_norm": 0.3236062500366363, + "learning_rate": 1.4336686476983168e-05, + "loss": 2.6891, + "step": 50098 + }, + { + "epoch": 2.3324720068906117, + "grad_norm": 0.34328219494712914, + "learning_rate": 1.4334787994385374e-05, + "loss": 2.6441, + "step": 50099 + }, + { + "epoch": 2.332518565076705, + "grad_norm": 0.3118530356433617, + "learning_rate": 1.4332889616461454e-05, + "loss": 2.6858, + "step": 50100 + }, + { + "epoch": 2.3325651232627975, + "grad_norm": 0.32221839115287776, + "learning_rate": 1.4330991343216992e-05, + "loss": 2.5447, + "step": 50101 + }, + { + "epoch": 2.3326116814488906, + "grad_norm": 0.33488368835884524, + "learning_rate": 1.4329093174657559e-05, + "loss": 2.6943, + "step": 50102 + }, + { + "epoch": 2.3326582396349838, + "grad_norm": 0.3187198507267094, + "learning_rate": 1.4327195110788739e-05, + "loss": 2.5952, + "step": 50103 + }, + { + "epoch": 2.332704797821077, + "grad_norm": 0.3213068155387845, + "learning_rate": 1.4325297151616052e-05, + "loss": 2.7065, + "step": 50104 + }, + { + "epoch": 2.33275135600717, + "grad_norm": 0.3169454575883142, + "learning_rate": 1.4323399297145134e-05, + "loss": 2.6, + "step": 50105 + }, + { + "epoch": 2.332797914193263, + "grad_norm": 0.3419268771304896, + "learning_rate": 1.4321501547381516e-05, + "loss": 2.723, + "step": 50106 + }, + { + "epoch": 2.332844472379356, + "grad_norm": 0.3416485856255885, + "learning_rate": 1.4319603902330769e-05, + "loss": 2.5584, + "step": 50107 + }, + { + "epoch": 2.3328910305654493, + 
"grad_norm": 0.3588376936872809, + "learning_rate": 1.4317706361998473e-05, + "loss": 2.7199, + "step": 50108 + }, + { + "epoch": 2.332937588751542, + "grad_norm": 0.33539289534463046, + "learning_rate": 1.431580892639019e-05, + "loss": 2.6911, + "step": 50109 + }, + { + "epoch": 2.332984146937635, + "grad_norm": 0.3150238275629665, + "learning_rate": 1.4313911595511508e-05, + "loss": 2.5912, + "step": 50110 + }, + { + "epoch": 2.3330307051237282, + "grad_norm": 0.31644063517502147, + "learning_rate": 1.4312014369367971e-05, + "loss": 2.5631, + "step": 50111 + }, + { + "epoch": 2.3330772633098213, + "grad_norm": 0.32287287234520007, + "learning_rate": 1.4310117247965148e-05, + "loss": 2.6778, + "step": 50112 + }, + { + "epoch": 2.3331238214959145, + "grad_norm": 0.31851058548705824, + "learning_rate": 1.4308220231308622e-05, + "loss": 2.5612, + "step": 50113 + }, + { + "epoch": 2.3331703796820076, + "grad_norm": 0.31994498264706833, + "learning_rate": 1.4306323319403948e-05, + "loss": 2.6538, + "step": 50114 + }, + { + "epoch": 2.3332169378681007, + "grad_norm": 0.3115008046181425, + "learning_rate": 1.4304426512256703e-05, + "loss": 2.6019, + "step": 50115 + }, + { + "epoch": 2.333263496054194, + "grad_norm": 0.3270464598433469, + "learning_rate": 1.4302529809872462e-05, + "loss": 2.6392, + "step": 50116 + }, + { + "epoch": 2.333310054240287, + "grad_norm": 0.3488826070461071, + "learning_rate": 1.4300633212256748e-05, + "loss": 2.7066, + "step": 50117 + }, + { + "epoch": 2.33335661242638, + "grad_norm": 0.3366232147602653, + "learning_rate": 1.429873671941519e-05, + "loss": 2.5887, + "step": 50118 + }, + { + "epoch": 2.333403170612473, + "grad_norm": 0.31718088368992936, + "learning_rate": 1.4296840331353307e-05, + "loss": 2.674, + "step": 50119 + }, + { + "epoch": 2.333449728798566, + "grad_norm": 0.30605242433602775, + "learning_rate": 1.429494404807668e-05, + "loss": 2.4625, + "step": 50120 + }, + { + "epoch": 2.333496286984659, + "grad_norm": 0.34022777799444365, + "learning_rate": 1.4293047869590875e-05, + "loss": 2.5972, + "step": 50121 + }, + { + "epoch": 2.333542845170752, + "grad_norm": 0.3274217371331398, + "learning_rate": 1.4291151795901458e-05, + "loss": 2.6154, + "step": 50122 + }, + { + "epoch": 2.333589403356845, + "grad_norm": 0.3143471414495946, + "learning_rate": 1.4289255827014004e-05, + "loss": 2.5995, + "step": 50123 + }, + { + "epoch": 2.3336359615429383, + "grad_norm": 0.31056816338469556, + "learning_rate": 1.4287359962934055e-05, + "loss": 2.6081, + "step": 50124 + }, + { + "epoch": 2.3336825197290314, + "grad_norm": 0.34778400371871043, + "learning_rate": 1.4285464203667187e-05, + "loss": 2.6087, + "step": 50125 + }, + { + "epoch": 2.3337290779151245, + "grad_norm": 0.3134451783930969, + "learning_rate": 1.4283568549218956e-05, + "loss": 2.5336, + "step": 50126 + }, + { + "epoch": 2.3337756361012176, + "grad_norm": 0.3283788443205079, + "learning_rate": 1.4281672999594942e-05, + "loss": 2.6332, + "step": 50127 + }, + { + "epoch": 2.3338221942873107, + "grad_norm": 0.3269293967989736, + "learning_rate": 1.4279777554800694e-05, + "loss": 2.6552, + "step": 50128 + }, + { + "epoch": 2.3338687524734034, + "grad_norm": 0.3338604587002034, + "learning_rate": 1.4277882214841793e-05, + "loss": 2.5666, + "step": 50129 + }, + { + "epoch": 2.3339153106594965, + "grad_norm": 0.32926307634084023, + "learning_rate": 1.427598697972376e-05, + "loss": 2.6163, + "step": 50130 + }, + { + "epoch": 2.3339618688455896, + "grad_norm": 0.3088765171880185, + "learning_rate": 
1.427409184945222e-05, + "loss": 2.7057, + "step": 50131 + }, + { + "epoch": 2.3340084270316828, + "grad_norm": 0.3473171064489442, + "learning_rate": 1.427219682403268e-05, + "loss": 2.6732, + "step": 50132 + }, + { + "epoch": 2.334054985217776, + "grad_norm": 0.313006681226884, + "learning_rate": 1.4270301903470728e-05, + "loss": 2.6779, + "step": 50133 + }, + { + "epoch": 2.334101543403869, + "grad_norm": 0.3216002567034461, + "learning_rate": 1.4268407087771919e-05, + "loss": 2.6052, + "step": 50134 + }, + { + "epoch": 2.334148101589962, + "grad_norm": 0.31467288898030826, + "learning_rate": 1.4266512376941809e-05, + "loss": 2.7275, + "step": 50135 + }, + { + "epoch": 2.3341946597760552, + "grad_norm": 0.3563079769643052, + "learning_rate": 1.4264617770985982e-05, + "loss": 2.6402, + "step": 50136 + }, + { + "epoch": 2.3342412179621483, + "grad_norm": 0.3095732869400198, + "learning_rate": 1.4262723269909955e-05, + "loss": 2.6251, + "step": 50137 + }, + { + "epoch": 2.3342877761482415, + "grad_norm": 0.3213894635671078, + "learning_rate": 1.4260828873719344e-05, + "loss": 2.5739, + "step": 50138 + }, + { + "epoch": 2.3343343343343346, + "grad_norm": 0.33039734230215356, + "learning_rate": 1.4258934582419664e-05, + "loss": 2.6654, + "step": 50139 + }, + { + "epoch": 2.3343808925204272, + "grad_norm": 0.31810681771119903, + "learning_rate": 1.425704039601649e-05, + "loss": 2.6933, + "step": 50140 + }, + { + "epoch": 2.3344274507065204, + "grad_norm": 0.33148891788109625, + "learning_rate": 1.4255146314515377e-05, + "loss": 2.6514, + "step": 50141 + }, + { + "epoch": 2.3344740088926135, + "grad_norm": 0.33154645748196315, + "learning_rate": 1.4253252337921907e-05, + "loss": 2.6634, + "step": 50142 + }, + { + "epoch": 2.3345205670787066, + "grad_norm": 0.3159071432794183, + "learning_rate": 1.4251358466241589e-05, + "loss": 2.5946, + "step": 50143 + }, + { + "epoch": 2.3345671252647997, + "grad_norm": 0.3062463272568352, + "learning_rate": 1.4249464699480041e-05, + "loss": 2.6006, + "step": 50144 + }, + { + "epoch": 2.334613683450893, + "grad_norm": 0.3039528309606343, + "learning_rate": 1.4247571037642782e-05, + "loss": 2.5525, + "step": 50145 + }, + { + "epoch": 2.334660241636986, + "grad_norm": 0.31408423433781224, + "learning_rate": 1.4245677480735376e-05, + "loss": 2.6301, + "step": 50146 + }, + { + "epoch": 2.334706799823079, + "grad_norm": 0.3460789467099751, + "learning_rate": 1.4243784028763384e-05, + "loss": 2.6772, + "step": 50147 + }, + { + "epoch": 2.3347533580091717, + "grad_norm": 0.31462417112683183, + "learning_rate": 1.4241890681732361e-05, + "loss": 2.6613, + "step": 50148 + }, + { + "epoch": 2.334799916195265, + "grad_norm": 0.33392448878445447, + "learning_rate": 1.4239997439647883e-05, + "loss": 2.6393, + "step": 50149 + }, + { + "epoch": 2.334846474381358, + "grad_norm": 0.36275055536379697, + "learning_rate": 1.423810430251546e-05, + "loss": 2.6637, + "step": 50150 + }, + { + "epoch": 2.334893032567451, + "grad_norm": 0.33511489311308335, + "learning_rate": 1.423621127034071e-05, + "loss": 2.5792, + "step": 50151 + }, + { + "epoch": 2.334939590753544, + "grad_norm": 0.32783139726261, + "learning_rate": 1.4234318343129138e-05, + "loss": 2.6517, + "step": 50152 + }, + { + "epoch": 2.3349861489396373, + "grad_norm": 0.34597886793011867, + "learning_rate": 1.4232425520886322e-05, + "loss": 2.6944, + "step": 50153 + }, + { + "epoch": 2.3350327071257304, + "grad_norm": 0.34142771223184626, + "learning_rate": 1.4230532803617812e-05, + "loss": 2.6148, + "step": 50154 + }, + 
{ + "epoch": 2.3350792653118235, + "grad_norm": 0.3257926712251225, + "learning_rate": 1.4228640191329161e-05, + "loss": 2.6733, + "step": 50155 + }, + { + "epoch": 2.3351258234979166, + "grad_norm": 0.3366798292762975, + "learning_rate": 1.4226747684025931e-05, + "loss": 2.707, + "step": 50156 + }, + { + "epoch": 2.3351723816840098, + "grad_norm": 0.3330207674163447, + "learning_rate": 1.4224855281713684e-05, + "loss": 2.6501, + "step": 50157 + }, + { + "epoch": 2.335218939870103, + "grad_norm": 0.3375366271889483, + "learning_rate": 1.4222962984397953e-05, + "loss": 2.6074, + "step": 50158 + }, + { + "epoch": 2.3352654980561955, + "grad_norm": 0.33064474757566864, + "learning_rate": 1.4221070792084295e-05, + "loss": 2.6885, + "step": 50159 + }, + { + "epoch": 2.3353120562422887, + "grad_norm": 0.31475348313572027, + "learning_rate": 1.4219178704778269e-05, + "loss": 2.6624, + "step": 50160 + }, + { + "epoch": 2.3353586144283818, + "grad_norm": 0.33383430924718316, + "learning_rate": 1.4217286722485435e-05, + "loss": 2.6996, + "step": 50161 + }, + { + "epoch": 2.335405172614475, + "grad_norm": 0.3203651640824868, + "learning_rate": 1.421539484521135e-05, + "loss": 2.6676, + "step": 50162 + }, + { + "epoch": 2.335451730800568, + "grad_norm": 0.31517829736751574, + "learning_rate": 1.4213503072961526e-05, + "loss": 2.633, + "step": 50163 + }, + { + "epoch": 2.335498288986661, + "grad_norm": 0.34690091582428945, + "learning_rate": 1.421161140574157e-05, + "loss": 2.6423, + "step": 50164 + }, + { + "epoch": 2.3355448471727542, + "grad_norm": 0.33969776472205276, + "learning_rate": 1.4209719843557001e-05, + "loss": 2.6915, + "step": 50165 + }, + { + "epoch": 2.3355914053588474, + "grad_norm": 0.31984420656323914, + "learning_rate": 1.4207828386413374e-05, + "loss": 2.5371, + "step": 50166 + }, + { + "epoch": 2.3356379635449405, + "grad_norm": 0.33514661007727237, + "learning_rate": 1.4205937034316247e-05, + "loss": 2.6341, + "step": 50167 + }, + { + "epoch": 2.335684521731033, + "grad_norm": 0.3203375952463694, + "learning_rate": 1.420404578727117e-05, + "loss": 2.603, + "step": 50168 + }, + { + "epoch": 2.3357310799171263, + "grad_norm": 0.35526900742013157, + "learning_rate": 1.4202154645283689e-05, + "loss": 2.7247, + "step": 50169 + }, + { + "epoch": 2.3357776381032194, + "grad_norm": 0.33186246560766974, + "learning_rate": 1.4200263608359377e-05, + "loss": 2.5761, + "step": 50170 + }, + { + "epoch": 2.3358241962893125, + "grad_norm": 0.31632701024896975, + "learning_rate": 1.4198372676503746e-05, + "loss": 2.5937, + "step": 50171 + }, + { + "epoch": 2.3358707544754056, + "grad_norm": 0.31075377989667147, + "learning_rate": 1.4196481849722365e-05, + "loss": 2.6395, + "step": 50172 + }, + { + "epoch": 2.3359173126614987, + "grad_norm": 0.34320795373907587, + "learning_rate": 1.4194591128020778e-05, + "loss": 2.6336, + "step": 50173 + }, + { + "epoch": 2.335963870847592, + "grad_norm": 0.29849691493209596, + "learning_rate": 1.4192700511404545e-05, + "loss": 2.6173, + "step": 50174 + }, + { + "epoch": 2.336010429033685, + "grad_norm": 0.3412111528909817, + "learning_rate": 1.4190809999879224e-05, + "loss": 2.587, + "step": 50175 + }, + { + "epoch": 2.336056987219778, + "grad_norm": 0.3273318608930196, + "learning_rate": 1.4188919593450312e-05, + "loss": 2.6429, + "step": 50176 + }, + { + "epoch": 2.336103545405871, + "grad_norm": 0.31339564776443074, + "learning_rate": 1.4187029292123428e-05, + "loss": 2.6408, + "step": 50177 + }, + { + "epoch": 2.3361501035919643, + "grad_norm": 
0.3207860666200922, + "learning_rate": 1.4185139095904064e-05, + "loss": 2.6487, + "step": 50178 + }, + { + "epoch": 2.336196661778057, + "grad_norm": 0.3529930820764646, + "learning_rate": 1.4183249004797794e-05, + "loss": 2.721, + "step": 50179 + }, + { + "epoch": 2.33624321996415, + "grad_norm": 0.3307159404377648, + "learning_rate": 1.4181359018810158e-05, + "loss": 2.6216, + "step": 50180 + }, + { + "epoch": 2.336289778150243, + "grad_norm": 0.3210290721650858, + "learning_rate": 1.4179469137946706e-05, + "loss": 2.6112, + "step": 50181 + }, + { + "epoch": 2.3363363363363363, + "grad_norm": 0.32631580971805685, + "learning_rate": 1.4177579362212978e-05, + "loss": 2.6195, + "step": 50182 + }, + { + "epoch": 2.3363828945224294, + "grad_norm": 0.3408620957919041, + "learning_rate": 1.4175689691614542e-05, + "loss": 2.718, + "step": 50183 + }, + { + "epoch": 2.3364294527085225, + "grad_norm": 0.31201869014013295, + "learning_rate": 1.4173800126156916e-05, + "loss": 2.5748, + "step": 50184 + }, + { + "epoch": 2.3364760108946157, + "grad_norm": 0.3455844261751043, + "learning_rate": 1.4171910665845656e-05, + "loss": 2.6277, + "step": 50185 + }, + { + "epoch": 2.3365225690807088, + "grad_norm": 0.3686205743707177, + "learning_rate": 1.4170021310686304e-05, + "loss": 2.724, + "step": 50186 + }, + { + "epoch": 2.3365691272668014, + "grad_norm": 0.3246998063177924, + "learning_rate": 1.4168132060684418e-05, + "loss": 2.6113, + "step": 50187 + }, + { + "epoch": 2.3366156854528946, + "grad_norm": 0.3143160733544013, + "learning_rate": 1.4166242915845546e-05, + "loss": 2.6909, + "step": 50188 + }, + { + "epoch": 2.3366622436389877, + "grad_norm": 0.34667914125184063, + "learning_rate": 1.4164353876175184e-05, + "loss": 2.6159, + "step": 50189 + }, + { + "epoch": 2.336708801825081, + "grad_norm": 0.3219852273472904, + "learning_rate": 1.4162464941678954e-05, + "loss": 2.6096, + "step": 50190 + }, + { + "epoch": 2.336755360011174, + "grad_norm": 0.3301483563137837, + "learning_rate": 1.4160576112362323e-05, + "loss": 2.6233, + "step": 50191 + }, + { + "epoch": 2.336801918197267, + "grad_norm": 0.36805611884643996, + "learning_rate": 1.4158687388230902e-05, + "loss": 2.5829, + "step": 50192 + }, + { + "epoch": 2.33684847638336, + "grad_norm": 0.3495872691934668, + "learning_rate": 1.4156798769290186e-05, + "loss": 2.6588, + "step": 50193 + }, + { + "epoch": 2.3368950345694532, + "grad_norm": 0.3058904319726162, + "learning_rate": 1.4154910255545733e-05, + "loss": 2.6816, + "step": 50194 + }, + { + "epoch": 2.3369415927555464, + "grad_norm": 0.3281969440908359, + "learning_rate": 1.415302184700309e-05, + "loss": 2.5619, + "step": 50195 + }, + { + "epoch": 2.3369881509416395, + "grad_norm": 0.3492497100251574, + "learning_rate": 1.415113354366781e-05, + "loss": 2.6356, + "step": 50196 + }, + { + "epoch": 2.3370347091277326, + "grad_norm": 0.3739102426924745, + "learning_rate": 1.4149245345545408e-05, + "loss": 2.6811, + "step": 50197 + }, + { + "epoch": 2.3370812673138253, + "grad_norm": 0.3397239104172314, + "learning_rate": 1.4147357252641436e-05, + "loss": 2.6753, + "step": 50198 + }, + { + "epoch": 2.3371278254999184, + "grad_norm": 0.3367771518440384, + "learning_rate": 1.4145469264961442e-05, + "loss": 2.674, + "step": 50199 + }, + { + "epoch": 2.3371743836860115, + "grad_norm": 0.34608470964464927, + "learning_rate": 1.4143581382510962e-05, + "loss": 2.585, + "step": 50200 + }, + { + "epoch": 2.3372209418721046, + "grad_norm": 0.3569410758530518, + "learning_rate": 1.4141693605295553e-05, + 
"loss": 2.6254, + "step": 50201 + }, + { + "epoch": 2.3372675000581977, + "grad_norm": 0.323163508542376, + "learning_rate": 1.413980593332071e-05, + "loss": 2.5905, + "step": 50202 + }, + { + "epoch": 2.337314058244291, + "grad_norm": 0.33425602598481663, + "learning_rate": 1.4137918366592034e-05, + "loss": 2.5767, + "step": 50203 + }, + { + "epoch": 2.337360616430384, + "grad_norm": 0.3280820014826254, + "learning_rate": 1.4136030905115005e-05, + "loss": 2.6449, + "step": 50204 + }, + { + "epoch": 2.337407174616477, + "grad_norm": 0.3275950303785406, + "learning_rate": 1.4134143548895224e-05, + "loss": 2.6443, + "step": 50205 + }, + { + "epoch": 2.33745373280257, + "grad_norm": 0.3331464223819692, + "learning_rate": 1.413225629793818e-05, + "loss": 2.6326, + "step": 50206 + }, + { + "epoch": 2.337500290988663, + "grad_norm": 0.33974918402952214, + "learning_rate": 1.4130369152249429e-05, + "loss": 2.6507, + "step": 50207 + }, + { + "epoch": 2.337546849174756, + "grad_norm": 0.33934233673331693, + "learning_rate": 1.4128482111834512e-05, + "loss": 2.6265, + "step": 50208 + }, + { + "epoch": 2.337593407360849, + "grad_norm": 0.3267420014070676, + "learning_rate": 1.412659517669897e-05, + "loss": 2.6532, + "step": 50209 + }, + { + "epoch": 2.337639965546942, + "grad_norm": 0.33527113633971695, + "learning_rate": 1.4124708346848348e-05, + "loss": 2.6211, + "step": 50210 + }, + { + "epoch": 2.3376865237330353, + "grad_norm": 0.34333589791767105, + "learning_rate": 1.4122821622288158e-05, + "loss": 2.6552, + "step": 50211 + }, + { + "epoch": 2.3377330819191284, + "grad_norm": 0.33731532483806276, + "learning_rate": 1.4120935003023955e-05, + "loss": 2.7115, + "step": 50212 + }, + { + "epoch": 2.3377796401052215, + "grad_norm": 0.3331366303469862, + "learning_rate": 1.411904848906127e-05, + "loss": 2.6343, + "step": 50213 + }, + { + "epoch": 2.3378261982913147, + "grad_norm": 0.3471520312163432, + "learning_rate": 1.411716208040566e-05, + "loss": 2.6756, + "step": 50214 + }, + { + "epoch": 2.3378727564774078, + "grad_norm": 0.33134767621115074, + "learning_rate": 1.4115275777062615e-05, + "loss": 2.6235, + "step": 50215 + }, + { + "epoch": 2.337919314663501, + "grad_norm": 0.3262588471563501, + "learning_rate": 1.4113389579037728e-05, + "loss": 2.6638, + "step": 50216 + }, + { + "epoch": 2.337965872849594, + "grad_norm": 0.3179133777391196, + "learning_rate": 1.4111503486336474e-05, + "loss": 2.5431, + "step": 50217 + }, + { + "epoch": 2.3380124310356867, + "grad_norm": 0.3113577327568702, + "learning_rate": 1.4109617498964456e-05, + "loss": 2.5036, + "step": 50218 + }, + { + "epoch": 2.33805898922178, + "grad_norm": 0.33948295242552295, + "learning_rate": 1.410773161692716e-05, + "loss": 2.6175, + "step": 50219 + }, + { + "epoch": 2.338105547407873, + "grad_norm": 0.3203984859351475, + "learning_rate": 1.4105845840230137e-05, + "loss": 2.5704, + "step": 50220 + }, + { + "epoch": 2.338152105593966, + "grad_norm": 0.3278497123930107, + "learning_rate": 1.4103960168878916e-05, + "loss": 2.6118, + "step": 50221 + }, + { + "epoch": 2.338198663780059, + "grad_norm": 0.34858845952048556, + "learning_rate": 1.410207460287904e-05, + "loss": 2.6405, + "step": 50222 + }, + { + "epoch": 2.3382452219661523, + "grad_norm": 0.34660168682084785, + "learning_rate": 1.4100189142236054e-05, + "loss": 2.631, + "step": 50223 + }, + { + "epoch": 2.3382917801522454, + "grad_norm": 0.33754651134077246, + "learning_rate": 1.4098303786955458e-05, + "loss": 2.757, + "step": 50224 + }, + { + "epoch": 2.3383383383383385, + 
"grad_norm": 0.3189423974681564, + "learning_rate": 1.4096418537042799e-05, + "loss": 2.597, + "step": 50225 + }, + { + "epoch": 2.3383848965244316, + "grad_norm": 0.33319992866490145, + "learning_rate": 1.4094533392503623e-05, + "loss": 2.6674, + "step": 50226 + }, + { + "epoch": 2.3384314547105243, + "grad_norm": 0.30900068662717134, + "learning_rate": 1.4092648353343452e-05, + "loss": 2.5932, + "step": 50227 + }, + { + "epoch": 2.3384780128966174, + "grad_norm": 0.32038599561300163, + "learning_rate": 1.4090763419567816e-05, + "loss": 2.6227, + "step": 50228 + }, + { + "epoch": 2.3385245710827105, + "grad_norm": 0.31881791153714717, + "learning_rate": 1.408887859118227e-05, + "loss": 2.6737, + "step": 50229 + }, + { + "epoch": 2.3385711292688036, + "grad_norm": 0.3402581099688795, + "learning_rate": 1.40869938681923e-05, + "loss": 2.6524, + "step": 50230 + }, + { + "epoch": 2.3386176874548967, + "grad_norm": 0.3221997586483204, + "learning_rate": 1.408510925060349e-05, + "loss": 2.6894, + "step": 50231 + }, + { + "epoch": 2.33866424564099, + "grad_norm": 0.3122843769810998, + "learning_rate": 1.4083224738421336e-05, + "loss": 2.6674, + "step": 50232 + }, + { + "epoch": 2.338710803827083, + "grad_norm": 0.3310690334869011, + "learning_rate": 1.4081340331651378e-05, + "loss": 2.5905, + "step": 50233 + }, + { + "epoch": 2.338757362013176, + "grad_norm": 0.32903932089211874, + "learning_rate": 1.4079456030299144e-05, + "loss": 2.6292, + "step": 50234 + }, + { + "epoch": 2.338803920199269, + "grad_norm": 0.32335915411105115, + "learning_rate": 1.4077571834370168e-05, + "loss": 2.5899, + "step": 50235 + }, + { + "epoch": 2.3388504783853623, + "grad_norm": 0.3145032542605884, + "learning_rate": 1.4075687743869997e-05, + "loss": 2.6545, + "step": 50236 + }, + { + "epoch": 2.3388970365714554, + "grad_norm": 0.3348813494903708, + "learning_rate": 1.4073803758804133e-05, + "loss": 2.5911, + "step": 50237 + }, + { + "epoch": 2.338943594757548, + "grad_norm": 0.3077705564968293, + "learning_rate": 1.4071919879178109e-05, + "loss": 2.565, + "step": 50238 + }, + { + "epoch": 2.338990152943641, + "grad_norm": 0.3094427203928276, + "learning_rate": 1.4070036104997463e-05, + "loss": 2.6253, + "step": 50239 + }, + { + "epoch": 2.3390367111297343, + "grad_norm": 0.3403187956933639, + "learning_rate": 1.4068152436267717e-05, + "loss": 2.6311, + "step": 50240 + }, + { + "epoch": 2.3390832693158274, + "grad_norm": 0.3092438436608821, + "learning_rate": 1.4066268872994409e-05, + "loss": 2.6201, + "step": 50241 + }, + { + "epoch": 2.3391298275019206, + "grad_norm": 0.3330549313433636, + "learning_rate": 1.4064385415183073e-05, + "loss": 2.6586, + "step": 50242 + }, + { + "epoch": 2.3391763856880137, + "grad_norm": 0.3434145340200928, + "learning_rate": 1.4062502062839201e-05, + "loss": 2.656, + "step": 50243 + }, + { + "epoch": 2.339222943874107, + "grad_norm": 0.31454161968270894, + "learning_rate": 1.4060618815968375e-05, + "loss": 2.5373, + "step": 50244 + }, + { + "epoch": 2.3392695020602, + "grad_norm": 0.3562931622460251, + "learning_rate": 1.4058735674576057e-05, + "loss": 2.7063, + "step": 50245 + }, + { + "epoch": 2.3393160602462926, + "grad_norm": 0.330491307796747, + "learning_rate": 1.405685263866784e-05, + "loss": 2.6727, + "step": 50246 + }, + { + "epoch": 2.3393626184323857, + "grad_norm": 0.3258112711439851, + "learning_rate": 1.4054969708249205e-05, + "loss": 2.5398, + "step": 50247 + }, + { + "epoch": 2.339409176618479, + "grad_norm": 0.34215632267718604, + "learning_rate": 
1.4053086883325694e-05, + "loss": 2.602, + "step": 50248 + }, + { + "epoch": 2.339455734804572, + "grad_norm": 0.31080518651174943, + "learning_rate": 1.4051204163902848e-05, + "loss": 2.5308, + "step": 50249 + }, + { + "epoch": 2.339502292990665, + "grad_norm": 0.3315634952685533, + "learning_rate": 1.4049321549986155e-05, + "loss": 2.541, + "step": 50250 + }, + { + "epoch": 2.339548851176758, + "grad_norm": 0.32420291807609564, + "learning_rate": 1.4047439041581168e-05, + "loss": 2.7107, + "step": 50251 + }, + { + "epoch": 2.3395954093628513, + "grad_norm": 0.32474228866094096, + "learning_rate": 1.4045556638693403e-05, + "loss": 2.6219, + "step": 50252 + }, + { + "epoch": 2.3396419675489444, + "grad_norm": 0.3211407288741822, + "learning_rate": 1.4043674341328389e-05, + "loss": 2.7104, + "step": 50253 + }, + { + "epoch": 2.3396885257350375, + "grad_norm": 0.3162622744699478, + "learning_rate": 1.4041792149491645e-05, + "loss": 2.6144, + "step": 50254 + }, + { + "epoch": 2.3397350839211306, + "grad_norm": 0.31575950314721624, + "learning_rate": 1.4039910063188715e-05, + "loss": 2.7434, + "step": 50255 + }, + { + "epoch": 2.3397816421072237, + "grad_norm": 0.3124837620688592, + "learning_rate": 1.4038028082425076e-05, + "loss": 2.5978, + "step": 50256 + }, + { + "epoch": 2.3398282002933164, + "grad_norm": 0.3250152352201839, + "learning_rate": 1.4036146207206312e-05, + "loss": 2.6298, + "step": 50257 + }, + { + "epoch": 2.3398747584794095, + "grad_norm": 0.3291477298943594, + "learning_rate": 1.4034264437537886e-05, + "loss": 2.7626, + "step": 50258 + }, + { + "epoch": 2.3399213166655026, + "grad_norm": 0.3158586100430935, + "learning_rate": 1.4032382773425385e-05, + "loss": 2.7187, + "step": 50259 + }, + { + "epoch": 2.3399678748515957, + "grad_norm": 0.3362978256375999, + "learning_rate": 1.4030501214874276e-05, + "loss": 2.7618, + "step": 50260 + }, + { + "epoch": 2.340014433037689, + "grad_norm": 0.2897201085560261, + "learning_rate": 1.4028619761890105e-05, + "loss": 2.6226, + "step": 50261 + }, + { + "epoch": 2.340060991223782, + "grad_norm": 0.30110061995587906, + "learning_rate": 1.4026738414478401e-05, + "loss": 2.6785, + "step": 50262 + }, + { + "epoch": 2.340107549409875, + "grad_norm": 0.3139832084893766, + "learning_rate": 1.4024857172644652e-05, + "loss": 2.6038, + "step": 50263 + }, + { + "epoch": 2.340154107595968, + "grad_norm": 0.3171203134229688, + "learning_rate": 1.4022976036394431e-05, + "loss": 2.5625, + "step": 50264 + }, + { + "epoch": 2.3402006657820613, + "grad_norm": 0.30969098175761967, + "learning_rate": 1.402109500573322e-05, + "loss": 2.6838, + "step": 50265 + }, + { + "epoch": 2.340247223968154, + "grad_norm": 0.2980205679991549, + "learning_rate": 1.4019214080666549e-05, + "loss": 2.6015, + "step": 50266 + }, + { + "epoch": 2.340293782154247, + "grad_norm": 0.32617062290371396, + "learning_rate": 1.4017333261199934e-05, + "loss": 2.4576, + "step": 50267 + }, + { + "epoch": 2.34034034034034, + "grad_norm": 0.33308213621314664, + "learning_rate": 1.401545254733892e-05, + "loss": 2.65, + "step": 50268 + }, + { + "epoch": 2.3403868985264333, + "grad_norm": 0.338775652674004, + "learning_rate": 1.4013571939088976e-05, + "loss": 2.7765, + "step": 50269 + }, + { + "epoch": 2.3404334567125264, + "grad_norm": 0.3206080692928333, + "learning_rate": 1.4011691436455687e-05, + "loss": 2.6773, + "step": 50270 + }, + { + "epoch": 2.3404800148986196, + "grad_norm": 0.34196646027147065, + "learning_rate": 1.400981103944451e-05, + "loss": 2.6347, + "step": 50271 + }, + { + 
"epoch": 2.3405265730847127, + "grad_norm": 0.32821142838262224, + "learning_rate": 1.4007930748061015e-05, + "loss": 2.5833, + "step": 50272 + }, + { + "epoch": 2.340573131270806, + "grad_norm": 0.3277051504128492, + "learning_rate": 1.4006050562310685e-05, + "loss": 2.6036, + "step": 50273 + }, + { + "epoch": 2.340619689456899, + "grad_norm": 0.32535454240291056, + "learning_rate": 1.4004170482199052e-05, + "loss": 2.6525, + "step": 50274 + }, + { + "epoch": 2.340666247642992, + "grad_norm": 0.3345894876305331, + "learning_rate": 1.400229050773163e-05, + "loss": 2.6528, + "step": 50275 + }, + { + "epoch": 2.340712805829085, + "grad_norm": 0.3229087577449654, + "learning_rate": 1.4000410638913941e-05, + "loss": 2.5186, + "step": 50276 + }, + { + "epoch": 2.340759364015178, + "grad_norm": 0.34435220889769064, + "learning_rate": 1.3998530875751515e-05, + "loss": 2.5981, + "step": 50277 + }, + { + "epoch": 2.340805922201271, + "grad_norm": 0.3467932890349702, + "learning_rate": 1.399665121824984e-05, + "loss": 2.7555, + "step": 50278 + }, + { + "epoch": 2.340852480387364, + "grad_norm": 0.304052100893892, + "learning_rate": 1.3994771666414446e-05, + "loss": 2.6608, + "step": 50279 + }, + { + "epoch": 2.340899038573457, + "grad_norm": 0.329308161166863, + "learning_rate": 1.3992892220250848e-05, + "loss": 2.704, + "step": 50280 + }, + { + "epoch": 2.3409455967595503, + "grad_norm": 0.33014843893458024, + "learning_rate": 1.399101287976457e-05, + "loss": 2.6544, + "step": 50281 + }, + { + "epoch": 2.3409921549456434, + "grad_norm": 0.3619876993453179, + "learning_rate": 1.3989133644961117e-05, + "loss": 2.6043, + "step": 50282 + }, + { + "epoch": 2.3410387131317365, + "grad_norm": 0.3342290666851514, + "learning_rate": 1.3987254515846021e-05, + "loss": 2.7013, + "step": 50283 + }, + { + "epoch": 2.3410852713178296, + "grad_norm": 0.34560919863101114, + "learning_rate": 1.3985375492424762e-05, + "loss": 2.7995, + "step": 50284 + }, + { + "epoch": 2.3411318295039223, + "grad_norm": 0.32274255069416574, + "learning_rate": 1.3983496574702903e-05, + "loss": 2.6089, + "step": 50285 + }, + { + "epoch": 2.3411783876900154, + "grad_norm": 0.3187981325246581, + "learning_rate": 1.3981617762685923e-05, + "loss": 2.5546, + "step": 50286 + }, + { + "epoch": 2.3412249458761085, + "grad_norm": 0.3582742077381973, + "learning_rate": 1.3979739056379342e-05, + "loss": 2.598, + "step": 50287 + }, + { + "epoch": 2.3412715040622016, + "grad_norm": 0.33650486570745486, + "learning_rate": 1.3977860455788677e-05, + "loss": 2.6704, + "step": 50288 + }, + { + "epoch": 2.3413180622482948, + "grad_norm": 0.32164825434622996, + "learning_rate": 1.3975981960919449e-05, + "loss": 2.5934, + "step": 50289 + }, + { + "epoch": 2.341364620434388, + "grad_norm": 0.340901870764464, + "learning_rate": 1.3974103571777176e-05, + "loss": 2.5672, + "step": 50290 + }, + { + "epoch": 2.341411178620481, + "grad_norm": 0.35255121353428176, + "learning_rate": 1.3972225288367347e-05, + "loss": 2.647, + "step": 50291 + }, + { + "epoch": 2.341457736806574, + "grad_norm": 0.3310072026462479, + "learning_rate": 1.3970347110695486e-05, + "loss": 2.5632, + "step": 50292 + }, + { + "epoch": 2.341504294992667, + "grad_norm": 0.329259075292397, + "learning_rate": 1.3968469038767108e-05, + "loss": 2.5869, + "step": 50293 + }, + { + "epoch": 2.3415508531787603, + "grad_norm": 0.36006199447632053, + "learning_rate": 1.396659107258772e-05, + "loss": 2.5982, + "step": 50294 + }, + { + "epoch": 2.3415974113648534, + "grad_norm": 0.3371772206030505, + 
"learning_rate": 1.3964713212162838e-05, + "loss": 2.6463, + "step": 50295 + }, + { + "epoch": 2.341643969550946, + "grad_norm": 0.3168147413859908, + "learning_rate": 1.396283545749799e-05, + "loss": 2.583, + "step": 50296 + }, + { + "epoch": 2.3416905277370392, + "grad_norm": 0.40091955935384516, + "learning_rate": 1.396095780859864e-05, + "loss": 2.6245, + "step": 50297 + }, + { + "epoch": 2.3417370859231323, + "grad_norm": 0.33102836623985027, + "learning_rate": 1.3959080265470353e-05, + "loss": 2.6268, + "step": 50298 + }, + { + "epoch": 2.3417836441092255, + "grad_norm": 0.30993927231263585, + "learning_rate": 1.395720282811861e-05, + "loss": 2.5108, + "step": 50299 + }, + { + "epoch": 2.3418302022953186, + "grad_norm": 0.3560961355173089, + "learning_rate": 1.3955325496548915e-05, + "loss": 2.5647, + "step": 50300 + }, + { + "epoch": 2.3418767604814117, + "grad_norm": 0.34325322302504674, + "learning_rate": 1.3953448270766794e-05, + "loss": 2.6493, + "step": 50301 + }, + { + "epoch": 2.341923318667505, + "grad_norm": 0.31411662087139064, + "learning_rate": 1.3951571150777754e-05, + "loss": 2.5059, + "step": 50302 + }, + { + "epoch": 2.341969876853598, + "grad_norm": 0.35888861902309166, + "learning_rate": 1.394969413658731e-05, + "loss": 2.6537, + "step": 50303 + }, + { + "epoch": 2.342016435039691, + "grad_norm": 0.33514893639408694, + "learning_rate": 1.3947817228200954e-05, + "loss": 2.5396, + "step": 50304 + }, + { + "epoch": 2.3420629932257837, + "grad_norm": 0.3268264499643359, + "learning_rate": 1.3945940425624199e-05, + "loss": 2.6572, + "step": 50305 + }, + { + "epoch": 2.342109551411877, + "grad_norm": 0.2967343816679334, + "learning_rate": 1.3944063728862555e-05, + "loss": 2.5972, + "step": 50306 + }, + { + "epoch": 2.34215610959797, + "grad_norm": 0.31855959797845484, + "learning_rate": 1.3942187137921536e-05, + "loss": 2.5027, + "step": 50307 + }, + { + "epoch": 2.342202667784063, + "grad_norm": 0.3264493905674224, + "learning_rate": 1.3940310652806642e-05, + "loss": 2.507, + "step": 50308 + }, + { + "epoch": 2.342249225970156, + "grad_norm": 0.3346058598557715, + "learning_rate": 1.3938434273523398e-05, + "loss": 2.615, + "step": 50309 + }, + { + "epoch": 2.3422957841562493, + "grad_norm": 0.3100907851550388, + "learning_rate": 1.3936558000077271e-05, + "loss": 2.671, + "step": 50310 + }, + { + "epoch": 2.3423423423423424, + "grad_norm": 0.3215864406230643, + "learning_rate": 1.3934681832473823e-05, + "loss": 2.5553, + "step": 50311 + }, + { + "epoch": 2.3423889005284355, + "grad_norm": 0.33759267122628905, + "learning_rate": 1.393280577071851e-05, + "loss": 2.6725, + "step": 50312 + }, + { + "epoch": 2.3424354587145286, + "grad_norm": 0.33146647804334683, + "learning_rate": 1.3930929814816867e-05, + "loss": 2.6916, + "step": 50313 + }, + { + "epoch": 2.3424820169006217, + "grad_norm": 0.30025957912634005, + "learning_rate": 1.3929053964774386e-05, + "loss": 2.6064, + "step": 50314 + }, + { + "epoch": 2.342528575086715, + "grad_norm": 0.32074944399114536, + "learning_rate": 1.3927178220596581e-05, + "loss": 2.6403, + "step": 50315 + }, + { + "epoch": 2.3425751332728075, + "grad_norm": 0.335172488837203, + "learning_rate": 1.3925302582288968e-05, + "loss": 2.6443, + "step": 50316 + }, + { + "epoch": 2.3426216914589006, + "grad_norm": 0.32278625940217603, + "learning_rate": 1.3923427049857008e-05, + "loss": 2.5907, + "step": 50317 + }, + { + "epoch": 2.3426682496449938, + "grad_norm": 0.31871715592514377, + "learning_rate": 1.3921551623306262e-05, + "loss": 2.5737, + 
"step": 50318 + }, + { + "epoch": 2.342714807831087, + "grad_norm": 0.31946858013466933, + "learning_rate": 1.3919676302642198e-05, + "loss": 2.6155, + "step": 50319 + }, + { + "epoch": 2.34276136601718, + "grad_norm": 0.3206234170687451, + "learning_rate": 1.3917801087870324e-05, + "loss": 2.6788, + "step": 50320 + }, + { + "epoch": 2.342807924203273, + "grad_norm": 0.31287611305943974, + "learning_rate": 1.3915925978996152e-05, + "loss": 2.5732, + "step": 50321 + }, + { + "epoch": 2.342854482389366, + "grad_norm": 0.32315221768422725, + "learning_rate": 1.39140509760252e-05, + "loss": 2.6845, + "step": 50322 + }, + { + "epoch": 2.3429010405754593, + "grad_norm": 0.32746507185639295, + "learning_rate": 1.3912176078962923e-05, + "loss": 2.6415, + "step": 50323 + }, + { + "epoch": 2.342947598761552, + "grad_norm": 0.324838511066519, + "learning_rate": 1.3910301287814881e-05, + "loss": 2.6386, + "step": 50324 + }, + { + "epoch": 2.342994156947645, + "grad_norm": 0.3181382438976694, + "learning_rate": 1.3908426602586533e-05, + "loss": 2.6919, + "step": 50325 + }, + { + "epoch": 2.3430407151337382, + "grad_norm": 0.32079443086493353, + "learning_rate": 1.3906552023283403e-05, + "loss": 2.6146, + "step": 50326 + }, + { + "epoch": 2.3430872733198314, + "grad_norm": 0.3193719967294469, + "learning_rate": 1.3904677549910982e-05, + "loss": 2.723, + "step": 50327 + }, + { + "epoch": 2.3431338315059245, + "grad_norm": 0.3108440420029895, + "learning_rate": 1.3902803182474778e-05, + "loss": 2.6013, + "step": 50328 + }, + { + "epoch": 2.3431803896920176, + "grad_norm": 0.3174206810222711, + "learning_rate": 1.3900928920980305e-05, + "loss": 2.5258, + "step": 50329 + }, + { + "epoch": 2.3432269478781107, + "grad_norm": 0.3134361523180571, + "learning_rate": 1.3899054765433023e-05, + "loss": 2.6334, + "step": 50330 + }, + { + "epoch": 2.343273506064204, + "grad_norm": 0.32432269799150387, + "learning_rate": 1.3897180715838487e-05, + "loss": 2.6819, + "step": 50331 + }, + { + "epoch": 2.343320064250297, + "grad_norm": 0.33053107811665683, + "learning_rate": 1.3895306772202149e-05, + "loss": 2.7159, + "step": 50332 + }, + { + "epoch": 2.34336662243639, + "grad_norm": 0.346329092039672, + "learning_rate": 1.3893432934529532e-05, + "loss": 2.5511, + "step": 50333 + }, + { + "epoch": 2.343413180622483, + "grad_norm": 0.3055109174732491, + "learning_rate": 1.3891559202826133e-05, + "loss": 2.591, + "step": 50334 + }, + { + "epoch": 2.343459738808576, + "grad_norm": 0.3308991895528899, + "learning_rate": 1.388968557709745e-05, + "loss": 2.5798, + "step": 50335 + }, + { + "epoch": 2.343506296994669, + "grad_norm": 0.3105263092399334, + "learning_rate": 1.3887812057348981e-05, + "loss": 2.6229, + "step": 50336 + }, + { + "epoch": 2.343552855180762, + "grad_norm": 0.3320018525016317, + "learning_rate": 1.3885938643586244e-05, + "loss": 2.6377, + "step": 50337 + }, + { + "epoch": 2.343599413366855, + "grad_norm": 0.31667298901460195, + "learning_rate": 1.3884065335814705e-05, + "loss": 2.5972, + "step": 50338 + }, + { + "epoch": 2.3436459715529483, + "grad_norm": 0.3314785617417029, + "learning_rate": 1.3882192134039873e-05, + "loss": 2.4615, + "step": 50339 + }, + { + "epoch": 2.3436925297390414, + "grad_norm": 0.3034827088592239, + "learning_rate": 1.3880319038267248e-05, + "loss": 2.7185, + "step": 50340 + }, + { + "epoch": 2.3437390879251345, + "grad_norm": 0.32688114466265084, + "learning_rate": 1.387844604850233e-05, + "loss": 2.6332, + "step": 50341 + }, + { + "epoch": 2.3437856461112276, + "grad_norm": 
0.35852898692788526, + "learning_rate": 1.3876573164750629e-05, + "loss": 2.6594, + "step": 50342 + }, + { + "epoch": 2.3438322042973208, + "grad_norm": 0.3105350340394976, + "learning_rate": 1.3874700387017598e-05, + "loss": 2.6901, + "step": 50343 + }, + { + "epoch": 2.3438787624834134, + "grad_norm": 0.34870554395309533, + "learning_rate": 1.3872827715308795e-05, + "loss": 2.574, + "step": 50344 + }, + { + "epoch": 2.3439253206695065, + "grad_norm": 0.34741634257478515, + "learning_rate": 1.3870955149629661e-05, + "loss": 2.6379, + "step": 50345 + }, + { + "epoch": 2.3439718788555997, + "grad_norm": 0.33754896382398614, + "learning_rate": 1.3869082689985719e-05, + "loss": 2.5072, + "step": 50346 + }, + { + "epoch": 2.3440184370416928, + "grad_norm": 0.32567520069698336, + "learning_rate": 1.3867210336382458e-05, + "loss": 2.6824, + "step": 50347 + }, + { + "epoch": 2.344064995227786, + "grad_norm": 0.33712466861758184, + "learning_rate": 1.386533808882538e-05, + "loss": 2.6548, + "step": 50348 + }, + { + "epoch": 2.344111553413879, + "grad_norm": 0.3385082540398993, + "learning_rate": 1.386346594731997e-05, + "loss": 2.6149, + "step": 50349 + }, + { + "epoch": 2.344158111599972, + "grad_norm": 0.32798783018716676, + "learning_rate": 1.3861593911871746e-05, + "loss": 2.5945, + "step": 50350 + }, + { + "epoch": 2.3442046697860652, + "grad_norm": 0.3011993917773517, + "learning_rate": 1.3859721982486163e-05, + "loss": 2.5846, + "step": 50351 + }, + { + "epoch": 2.3442512279721583, + "grad_norm": 0.31362453853157096, + "learning_rate": 1.385785015916874e-05, + "loss": 2.7039, + "step": 50352 + }, + { + "epoch": 2.3442977861582515, + "grad_norm": 0.31629614672500184, + "learning_rate": 1.3855978441924967e-05, + "loss": 2.63, + "step": 50353 + }, + { + "epoch": 2.3443443443443446, + "grad_norm": 0.3311091301519921, + "learning_rate": 1.3854106830760333e-05, + "loss": 2.6714, + "step": 50354 + }, + { + "epoch": 2.3443909025304372, + "grad_norm": 0.3385264378649907, + "learning_rate": 1.3852235325680352e-05, + "loss": 2.625, + "step": 50355 + }, + { + "epoch": 2.3444374607165304, + "grad_norm": 0.316378113897182, + "learning_rate": 1.3850363926690468e-05, + "loss": 2.6827, + "step": 50356 + }, + { + "epoch": 2.3444840189026235, + "grad_norm": 0.3156641635547091, + "learning_rate": 1.3848492633796234e-05, + "loss": 2.6174, + "step": 50357 + }, + { + "epoch": 2.3445305770887166, + "grad_norm": 0.314219228006008, + "learning_rate": 1.38466214470031e-05, + "loss": 2.64, + "step": 50358 + }, + { + "epoch": 2.3445771352748097, + "grad_norm": 0.31222465152943435, + "learning_rate": 1.3844750366316567e-05, + "loss": 2.6565, + "step": 50359 + }, + { + "epoch": 2.344623693460903, + "grad_norm": 0.30444571405343773, + "learning_rate": 1.384287939174213e-05, + "loss": 2.5747, + "step": 50360 + }, + { + "epoch": 2.344670251646996, + "grad_norm": 0.32297343142363266, + "learning_rate": 1.3841008523285281e-05, + "loss": 2.7012, + "step": 50361 + }, + { + "epoch": 2.344716809833089, + "grad_norm": 0.3074332566252079, + "learning_rate": 1.383913776095151e-05, + "loss": 2.5535, + "step": 50362 + }, + { + "epoch": 2.3447633680191817, + "grad_norm": 0.314722571879349, + "learning_rate": 1.3837267104746327e-05, + "loss": 2.6647, + "step": 50363 + }, + { + "epoch": 2.344809926205275, + "grad_norm": 0.3091272578959535, + "learning_rate": 1.3835396554675179e-05, + "loss": 2.5901, + "step": 50364 + }, + { + "epoch": 2.344856484391368, + "grad_norm": 0.3191593548756862, + "learning_rate": 1.3833526110743583e-05, + 
"loss": 2.61, + "step": 50365 + }, + { + "epoch": 2.344903042577461, + "grad_norm": 0.3185202428193945, + "learning_rate": 1.3831655772957025e-05, + "loss": 2.5292, + "step": 50366 + }, + { + "epoch": 2.344949600763554, + "grad_norm": 0.33837734706411166, + "learning_rate": 1.3829785541320994e-05, + "loss": 2.6456, + "step": 50367 + }, + { + "epoch": 2.3449961589496473, + "grad_norm": 0.3246503711396824, + "learning_rate": 1.3827915415840998e-05, + "loss": 2.7135, + "step": 50368 + }, + { + "epoch": 2.3450427171357404, + "grad_norm": 0.32208782895691346, + "learning_rate": 1.382604539652247e-05, + "loss": 2.6506, + "step": 50369 + }, + { + "epoch": 2.3450892753218335, + "grad_norm": 0.3200669224065893, + "learning_rate": 1.3824175483370966e-05, + "loss": 2.6203, + "step": 50370 + }, + { + "epoch": 2.3451358335079266, + "grad_norm": 0.33437450987474726, + "learning_rate": 1.3822305676391918e-05, + "loss": 2.6415, + "step": 50371 + }, + { + "epoch": 2.3451823916940198, + "grad_norm": 0.3467987583076254, + "learning_rate": 1.3820435975590868e-05, + "loss": 2.5574, + "step": 50372 + }, + { + "epoch": 2.345228949880113, + "grad_norm": 0.3256662003419601, + "learning_rate": 1.3818566380973252e-05, + "loss": 2.6997, + "step": 50373 + }, + { + "epoch": 2.3452755080662055, + "grad_norm": 0.327644866995464, + "learning_rate": 1.3816696892544583e-05, + "loss": 2.6291, + "step": 50374 + }, + { + "epoch": 2.3453220662522987, + "grad_norm": 0.3374893901646031, + "learning_rate": 1.3814827510310346e-05, + "loss": 2.568, + "step": 50375 + }, + { + "epoch": 2.345368624438392, + "grad_norm": 0.35253672599424657, + "learning_rate": 1.3812958234276042e-05, + "loss": 2.705, + "step": 50376 + }, + { + "epoch": 2.345415182624485, + "grad_norm": 0.2978969604712444, + "learning_rate": 1.3811089064447124e-05, + "loss": 2.5851, + "step": 50377 + }, + { + "epoch": 2.345461740810578, + "grad_norm": 0.336067583253929, + "learning_rate": 1.3809220000829092e-05, + "loss": 2.6689, + "step": 50378 + }, + { + "epoch": 2.345508298996671, + "grad_norm": 0.33625731230288025, + "learning_rate": 1.3807351043427435e-05, + "loss": 2.7287, + "step": 50379 + }, + { + "epoch": 2.3455548571827642, + "grad_norm": 0.3165278532809412, + "learning_rate": 1.3805482192247637e-05, + "loss": 2.6464, + "step": 50380 + }, + { + "epoch": 2.3456014153688574, + "grad_norm": 0.3205998063280426, + "learning_rate": 1.3803613447295199e-05, + "loss": 2.5842, + "step": 50381 + }, + { + "epoch": 2.3456479735549505, + "grad_norm": 0.31989459355695404, + "learning_rate": 1.3801744808575562e-05, + "loss": 2.6316, + "step": 50382 + }, + { + "epoch": 2.345694531741043, + "grad_norm": 0.31613339172266336, + "learning_rate": 1.3799876276094264e-05, + "loss": 2.5498, + "step": 50383 + }, + { + "epoch": 2.3457410899271363, + "grad_norm": 0.3526607639190539, + "learning_rate": 1.3798007849856731e-05, + "loss": 2.6516, + "step": 50384 + }, + { + "epoch": 2.3457876481132294, + "grad_norm": 0.33656890878284135, + "learning_rate": 1.3796139529868512e-05, + "loss": 2.7283, + "step": 50385 + }, + { + "epoch": 2.3458342062993225, + "grad_norm": 0.32646349652537415, + "learning_rate": 1.3794271316135044e-05, + "loss": 2.6533, + "step": 50386 + }, + { + "epoch": 2.3458807644854156, + "grad_norm": 0.3257347893056321, + "learning_rate": 1.379240320866182e-05, + "loss": 2.6123, + "step": 50387 + }, + { + "epoch": 2.3459273226715087, + "grad_norm": 0.3348909804660802, + "learning_rate": 1.3790535207454325e-05, + "loss": 2.7374, + "step": 50388 + }, + { + "epoch": 
2.345973880857602, + "grad_norm": 0.3246481383761354, + "learning_rate": 1.378866731251804e-05, + "loss": 2.5584, + "step": 50389 + }, + { + "epoch": 2.346020439043695, + "grad_norm": 0.3352952119580463, + "learning_rate": 1.3786799523858469e-05, + "loss": 2.6594, + "step": 50390 + }, + { + "epoch": 2.346066997229788, + "grad_norm": 0.3156744361284264, + "learning_rate": 1.3784931841481052e-05, + "loss": 2.7882, + "step": 50391 + }, + { + "epoch": 2.346113555415881, + "grad_norm": 0.3377501081750828, + "learning_rate": 1.37830642653913e-05, + "loss": 2.6119, + "step": 50392 + }, + { + "epoch": 2.3461601136019743, + "grad_norm": 0.3304405028212175, + "learning_rate": 1.3781196795594681e-05, + "loss": 2.6117, + "step": 50393 + }, + { + "epoch": 2.346206671788067, + "grad_norm": 0.35868342195714464, + "learning_rate": 1.3779329432096699e-05, + "loss": 2.6886, + "step": 50394 + }, + { + "epoch": 2.34625322997416, + "grad_norm": 0.33185154083255614, + "learning_rate": 1.3777462174902784e-05, + "loss": 2.7204, + "step": 50395 + }, + { + "epoch": 2.346299788160253, + "grad_norm": 0.3208101505576318, + "learning_rate": 1.3775595024018484e-05, + "loss": 2.6595, + "step": 50396 + }, + { + "epoch": 2.3463463463463463, + "grad_norm": 0.33709537572342485, + "learning_rate": 1.3773727979449209e-05, + "loss": 2.6668, + "step": 50397 + }, + { + "epoch": 2.3463929045324394, + "grad_norm": 0.34522085979705086, + "learning_rate": 1.3771861041200507e-05, + "loss": 2.6809, + "step": 50398 + }, + { + "epoch": 2.3464394627185325, + "grad_norm": 0.3324916056978446, + "learning_rate": 1.3769994209277803e-05, + "loss": 2.7195, + "step": 50399 + }, + { + "epoch": 2.3464860209046257, + "grad_norm": 0.3190626300427839, + "learning_rate": 1.3768127483686604e-05, + "loss": 2.5429, + "step": 50400 + }, + { + "epoch": 2.3465325790907188, + "grad_norm": 0.32213688373405924, + "learning_rate": 1.3766260864432379e-05, + "loss": 2.634, + "step": 50401 + }, + { + "epoch": 2.346579137276812, + "grad_norm": 0.3208425533416937, + "learning_rate": 1.3764394351520604e-05, + "loss": 2.6045, + "step": 50402 + }, + { + "epoch": 2.3466256954629046, + "grad_norm": 0.3103458148457339, + "learning_rate": 1.3762527944956783e-05, + "loss": 2.55, + "step": 50403 + }, + { + "epoch": 2.3466722536489977, + "grad_norm": 0.3352055770423149, + "learning_rate": 1.3760661644746353e-05, + "loss": 2.6483, + "step": 50404 + }, + { + "epoch": 2.346718811835091, + "grad_norm": 0.3348262768786283, + "learning_rate": 1.3758795450894819e-05, + "loss": 2.7681, + "step": 50405 + }, + { + "epoch": 2.346765370021184, + "grad_norm": 0.32031746484228063, + "learning_rate": 1.3756929363407639e-05, + "loss": 2.6748, + "step": 50406 + }, + { + "epoch": 2.346811928207277, + "grad_norm": 0.31389790180715377, + "learning_rate": 1.3755063382290306e-05, + "loss": 2.5819, + "step": 50407 + }, + { + "epoch": 2.34685848639337, + "grad_norm": 0.3268532111671219, + "learning_rate": 1.3753197507548294e-05, + "loss": 2.64, + "step": 50408 + }, + { + "epoch": 2.3469050445794633, + "grad_norm": 0.318287254869434, + "learning_rate": 1.375133173918709e-05, + "loss": 2.6495, + "step": 50409 + }, + { + "epoch": 2.3469516027655564, + "grad_norm": 0.3438122421936705, + "learning_rate": 1.374946607721212e-05, + "loss": 2.4767, + "step": 50410 + }, + { + "epoch": 2.3469981609516495, + "grad_norm": 0.31781231460136455, + "learning_rate": 1.3747600521628934e-05, + "loss": 2.5828, + "step": 50411 + }, + { + "epoch": 2.3470447191377426, + "grad_norm": 0.33923237661757644, + "learning_rate": 
1.374573507244295e-05, + "loss": 2.7239, + "step": 50412 + }, + { + "epoch": 2.3470912773238357, + "grad_norm": 0.34082628825524697, + "learning_rate": 1.3743869729659664e-05, + "loss": 2.7405, + "step": 50413 + }, + { + "epoch": 2.3471378355099284, + "grad_norm": 0.32011582807592426, + "learning_rate": 1.3742004493284544e-05, + "loss": 2.5976, + "step": 50414 + }, + { + "epoch": 2.3471843936960215, + "grad_norm": 0.3195031636292936, + "learning_rate": 1.3740139363323074e-05, + "loss": 2.5227, + "step": 50415 + }, + { + "epoch": 2.3472309518821146, + "grad_norm": 0.31194872274309327, + "learning_rate": 1.3738274339780739e-05, + "loss": 2.5782, + "step": 50416 + }, + { + "epoch": 2.3472775100682077, + "grad_norm": 0.3184876158337981, + "learning_rate": 1.3736409422662977e-05, + "loss": 2.6734, + "step": 50417 + }, + { + "epoch": 2.347324068254301, + "grad_norm": 0.32549874283776487, + "learning_rate": 1.3734544611975281e-05, + "loss": 2.6984, + "step": 50418 + }, + { + "epoch": 2.347370626440394, + "grad_norm": 0.30918742337968663, + "learning_rate": 1.3732679907723123e-05, + "loss": 2.6191, + "step": 50419 + }, + { + "epoch": 2.347417184626487, + "grad_norm": 0.3036565349616975, + "learning_rate": 1.3730815309911976e-05, + "loss": 2.6671, + "step": 50420 + }, + { + "epoch": 2.34746374281258, + "grad_norm": 0.3192441913113859, + "learning_rate": 1.372895081854732e-05, + "loss": 2.6646, + "step": 50421 + }, + { + "epoch": 2.347510300998673, + "grad_norm": 0.3116947500131894, + "learning_rate": 1.3727086433634629e-05, + "loss": 2.6276, + "step": 50422 + }, + { + "epoch": 2.347556859184766, + "grad_norm": 0.32840273038198453, + "learning_rate": 1.3725222155179335e-05, + "loss": 2.5601, + "step": 50423 + }, + { + "epoch": 2.347603417370859, + "grad_norm": 0.31550671035155087, + "learning_rate": 1.3723357983186974e-05, + "loss": 2.6463, + "step": 50424 + }, + { + "epoch": 2.347649975556952, + "grad_norm": 0.32291332903373626, + "learning_rate": 1.3721493917662964e-05, + "loss": 2.6375, + "step": 50425 + }, + { + "epoch": 2.3476965337430453, + "grad_norm": 0.3065774473418036, + "learning_rate": 1.3719629958612802e-05, + "loss": 2.8099, + "step": 50426 + }, + { + "epoch": 2.3477430919291384, + "grad_norm": 0.3436069050715853, + "learning_rate": 1.3717766106041947e-05, + "loss": 2.604, + "step": 50427 + }, + { + "epoch": 2.3477896501152316, + "grad_norm": 0.3400794334767307, + "learning_rate": 1.3715902359955874e-05, + "loss": 2.6481, + "step": 50428 + }, + { + "epoch": 2.3478362083013247, + "grad_norm": 0.31399318083616634, + "learning_rate": 1.3714038720360073e-05, + "loss": 2.6169, + "step": 50429 + }, + { + "epoch": 2.347882766487418, + "grad_norm": 0.3105729038670545, + "learning_rate": 1.3712175187259957e-05, + "loss": 2.6303, + "step": 50430 + }, + { + "epoch": 2.347929324673511, + "grad_norm": 0.32197101901112124, + "learning_rate": 1.3710311760661065e-05, + "loss": 2.6191, + "step": 50431 + }, + { + "epoch": 2.347975882859604, + "grad_norm": 0.3287396081121783, + "learning_rate": 1.3708448440568816e-05, + "loss": 2.7227, + "step": 50432 + }, + { + "epoch": 2.3480224410456967, + "grad_norm": 0.3324725887094304, + "learning_rate": 1.37065852269887e-05, + "loss": 2.6653, + "step": 50433 + }, + { + "epoch": 2.34806899923179, + "grad_norm": 0.317893703211454, + "learning_rate": 1.3704722119926177e-05, + "loss": 2.7112, + "step": 50434 + }, + { + "epoch": 2.348115557417883, + "grad_norm": 0.325351832367792, + "learning_rate": 1.3702859119386734e-05, + "loss": 2.6481, + "step": 50435 + }, + { + 
"epoch": 2.348162115603976, + "grad_norm": 0.3140124972103149, + "learning_rate": 1.3700996225375795e-05, + "loss": 2.6089, + "step": 50436 + }, + { + "epoch": 2.348208673790069, + "grad_norm": 0.31384212970963704, + "learning_rate": 1.3699133437898887e-05, + "loss": 2.5221, + "step": 50437 + }, + { + "epoch": 2.3482552319761623, + "grad_norm": 0.3401237296577332, + "learning_rate": 1.3697270756961428e-05, + "loss": 2.6497, + "step": 50438 + }, + { + "epoch": 2.3483017901622554, + "grad_norm": 0.31889800755350384, + "learning_rate": 1.3695408182568903e-05, + "loss": 2.6014, + "step": 50439 + }, + { + "epoch": 2.3483483483483485, + "grad_norm": 0.3206161335788015, + "learning_rate": 1.369354571472678e-05, + "loss": 2.6154, + "step": 50440 + }, + { + "epoch": 2.3483949065344416, + "grad_norm": 0.32787252483063917, + "learning_rate": 1.3691683353440526e-05, + "loss": 2.6821, + "step": 50441 + }, + { + "epoch": 2.3484414647205343, + "grad_norm": 0.35536947640507976, + "learning_rate": 1.3689821098715616e-05, + "loss": 2.7133, + "step": 50442 + }, + { + "epoch": 2.3484880229066274, + "grad_norm": 0.33166485634533055, + "learning_rate": 1.3687958950557472e-05, + "loss": 2.5809, + "step": 50443 + }, + { + "epoch": 2.3485345810927205, + "grad_norm": 0.32429348339120156, + "learning_rate": 1.368609690897163e-05, + "loss": 2.6742, + "step": 50444 + }, + { + "epoch": 2.3485811392788136, + "grad_norm": 0.31225415597410117, + "learning_rate": 1.3684234973963494e-05, + "loss": 2.6331, + "step": 50445 + }, + { + "epoch": 2.3486276974649067, + "grad_norm": 0.3423885682464052, + "learning_rate": 1.3682373145538552e-05, + "loss": 2.6121, + "step": 50446 + }, + { + "epoch": 2.348674255651, + "grad_norm": 0.31682368087961643, + "learning_rate": 1.3680511423702264e-05, + "loss": 2.6258, + "step": 50447 + }, + { + "epoch": 2.348720813837093, + "grad_norm": 0.3121982506256341, + "learning_rate": 1.3678649808460097e-05, + "loss": 2.6286, + "step": 50448 + }, + { + "epoch": 2.348767372023186, + "grad_norm": 0.318628650119548, + "learning_rate": 1.3676788299817522e-05, + "loss": 2.6765, + "step": 50449 + }, + { + "epoch": 2.348813930209279, + "grad_norm": 0.3354420595504842, + "learning_rate": 1.3674926897780005e-05, + "loss": 2.5772, + "step": 50450 + }, + { + "epoch": 2.3488604883953723, + "grad_norm": 0.325961908976856, + "learning_rate": 1.367306560235298e-05, + "loss": 2.6024, + "step": 50451 + }, + { + "epoch": 2.3489070465814654, + "grad_norm": 0.3480273121755577, + "learning_rate": 1.3671204413541932e-05, + "loss": 2.7624, + "step": 50452 + }, + { + "epoch": 2.348953604767558, + "grad_norm": 0.32771859095439176, + "learning_rate": 1.3669343331352324e-05, + "loss": 2.6462, + "step": 50453 + }, + { + "epoch": 2.349000162953651, + "grad_norm": 0.32536216601689516, + "learning_rate": 1.3667482355789606e-05, + "loss": 2.6518, + "step": 50454 + }, + { + "epoch": 2.3490467211397443, + "grad_norm": 0.3211879265835045, + "learning_rate": 1.366562148685927e-05, + "loss": 2.7201, + "step": 50455 + }, + { + "epoch": 2.3490932793258374, + "grad_norm": 0.3227576319732214, + "learning_rate": 1.3663760724566727e-05, + "loss": 2.6237, + "step": 50456 + }, + { + "epoch": 2.3491398375119306, + "grad_norm": 0.33159628100910404, + "learning_rate": 1.3661900068917488e-05, + "loss": 2.5403, + "step": 50457 + }, + { + "epoch": 2.3491863956980237, + "grad_norm": 0.3179370493798998, + "learning_rate": 1.3660039519916985e-05, + "loss": 2.5987, + "step": 50458 + }, + { + "epoch": 2.349232953884117, + "grad_norm": 0.3393463805473462, + 
"learning_rate": 1.3658179077570683e-05, + "loss": 2.7223, + "step": 50459 + }, + { + "epoch": 2.34927951207021, + "grad_norm": 0.33282901337698917, + "learning_rate": 1.3656318741884045e-05, + "loss": 2.6922, + "step": 50460 + }, + { + "epoch": 2.3493260702563026, + "grad_norm": 0.332491739703438, + "learning_rate": 1.3654458512862528e-05, + "loss": 2.5808, + "step": 50461 + }, + { + "epoch": 2.3493726284423957, + "grad_norm": 0.3331854606350751, + "learning_rate": 1.3652598390511601e-05, + "loss": 2.7025, + "step": 50462 + }, + { + "epoch": 2.349419186628489, + "grad_norm": 0.33526718933691696, + "learning_rate": 1.365073837483673e-05, + "loss": 2.6188, + "step": 50463 + }, + { + "epoch": 2.349465744814582, + "grad_norm": 0.3099305639152852, + "learning_rate": 1.3648878465843346e-05, + "loss": 2.5637, + "step": 50464 + }, + { + "epoch": 2.349512303000675, + "grad_norm": 0.32612469678974904, + "learning_rate": 1.364701866353692e-05, + "loss": 2.6398, + "step": 50465 + }, + { + "epoch": 2.349558861186768, + "grad_norm": 0.3074984865727844, + "learning_rate": 1.364515896792291e-05, + "loss": 2.7021, + "step": 50466 + }, + { + "epoch": 2.3496054193728613, + "grad_norm": 0.3083587210901887, + "learning_rate": 1.3643299379006785e-05, + "loss": 2.6248, + "step": 50467 + }, + { + "epoch": 2.3496519775589544, + "grad_norm": 0.31972375610777, + "learning_rate": 1.3641439896794006e-05, + "loss": 2.5219, + "step": 50468 + }, + { + "epoch": 2.3496985357450475, + "grad_norm": 0.3286817392592836, + "learning_rate": 1.3639580521289986e-05, + "loss": 2.6745, + "step": 50469 + }, + { + "epoch": 2.3497450939311406, + "grad_norm": 0.3327837400963366, + "learning_rate": 1.3637721252500246e-05, + "loss": 2.5888, + "step": 50470 + }, + { + "epoch": 2.3497916521172337, + "grad_norm": 0.3314943164444089, + "learning_rate": 1.3635862090430202e-05, + "loss": 2.6382, + "step": 50471 + }, + { + "epoch": 2.3498382103033264, + "grad_norm": 0.3219256017153883, + "learning_rate": 1.3634003035085313e-05, + "loss": 2.6154, + "step": 50472 + }, + { + "epoch": 2.3498847684894195, + "grad_norm": 0.3453552745125632, + "learning_rate": 1.3632144086471049e-05, + "loss": 2.6303, + "step": 50473 + }, + { + "epoch": 2.3499313266755126, + "grad_norm": 0.3461666946271442, + "learning_rate": 1.3630285244592855e-05, + "loss": 2.6611, + "step": 50474 + }, + { + "epoch": 2.3499778848616057, + "grad_norm": 0.3249645529604533, + "learning_rate": 1.3628426509456193e-05, + "loss": 2.6655, + "step": 50475 + }, + { + "epoch": 2.350024443047699, + "grad_norm": 0.3264301291247675, + "learning_rate": 1.3626567881066532e-05, + "loss": 2.6501, + "step": 50476 + }, + { + "epoch": 2.350071001233792, + "grad_norm": 0.3259480208595075, + "learning_rate": 1.3624709359429295e-05, + "loss": 2.6357, + "step": 50477 + }, + { + "epoch": 2.350117559419885, + "grad_norm": 0.3279381450971314, + "learning_rate": 1.3622850944549948e-05, + "loss": 2.5735, + "step": 50478 + }, + { + "epoch": 2.350164117605978, + "grad_norm": 0.33175117954146083, + "learning_rate": 1.3620992636433955e-05, + "loss": 2.6778, + "step": 50479 + }, + { + "epoch": 2.3502106757920713, + "grad_norm": 0.3230124802538803, + "learning_rate": 1.3619134435086762e-05, + "loss": 2.6373, + "step": 50480 + }, + { + "epoch": 2.350257233978164, + "grad_norm": 0.3258992817785287, + "learning_rate": 1.361727634051384e-05, + "loss": 2.8082, + "step": 50481 + }, + { + "epoch": 2.350303792164257, + "grad_norm": 0.34006055728716206, + "learning_rate": 1.3615418352720599e-05, + "loss": 2.6192, + "step": 
50482 + }, + { + "epoch": 2.3503503503503502, + "grad_norm": 0.3213827270995548, + "learning_rate": 1.361356047171255e-05, + "loss": 2.5902, + "step": 50483 + }, + { + "epoch": 2.3503969085364433, + "grad_norm": 0.3250420491331605, + "learning_rate": 1.3611702697495089e-05, + "loss": 2.6897, + "step": 50484 + }, + { + "epoch": 2.3504434667225365, + "grad_norm": 0.33305613299150505, + "learning_rate": 1.360984503007372e-05, + "loss": 2.6254, + "step": 50485 + }, + { + "epoch": 2.3504900249086296, + "grad_norm": 0.3147124482924394, + "learning_rate": 1.3607987469453864e-05, + "loss": 2.5381, + "step": 50486 + }, + { + "epoch": 2.3505365830947227, + "grad_norm": 0.3237723555728311, + "learning_rate": 1.3606130015640972e-05, + "loss": 2.6265, + "step": 50487 + }, + { + "epoch": 2.350583141280816, + "grad_norm": 0.31795748130933604, + "learning_rate": 1.3604272668640505e-05, + "loss": 2.5672, + "step": 50488 + }, + { + "epoch": 2.350629699466909, + "grad_norm": 0.32472854770296233, + "learning_rate": 1.3602415428457931e-05, + "loss": 2.6295, + "step": 50489 + }, + { + "epoch": 2.350676257653002, + "grad_norm": 0.31510486230051427, + "learning_rate": 1.3600558295098664e-05, + "loss": 2.6111, + "step": 50490 + }, + { + "epoch": 2.350722815839095, + "grad_norm": 0.30848730525113455, + "learning_rate": 1.3598701268568171e-05, + "loss": 2.6107, + "step": 50491 + }, + { + "epoch": 2.350769374025188, + "grad_norm": 0.32998783510992397, + "learning_rate": 1.3596844348871907e-05, + "loss": 2.6364, + "step": 50492 + }, + { + "epoch": 2.350815932211281, + "grad_norm": 0.33726270986548096, + "learning_rate": 1.359498753601532e-05, + "loss": 2.631, + "step": 50493 + }, + { + "epoch": 2.350862490397374, + "grad_norm": 0.3247240670172854, + "learning_rate": 1.3593130830003869e-05, + "loss": 2.5803, + "step": 50494 + }, + { + "epoch": 2.350909048583467, + "grad_norm": 0.3270126276839053, + "learning_rate": 1.359127423084296e-05, + "loss": 2.6678, + "step": 50495 + }, + { + "epoch": 2.3509556067695603, + "grad_norm": 0.32825659900632675, + "learning_rate": 1.3589417738538107e-05, + "loss": 2.602, + "step": 50496 + }, + { + "epoch": 2.3510021649556534, + "grad_norm": 0.3345323059611368, + "learning_rate": 1.3587561353094697e-05, + "loss": 2.6118, + "step": 50497 + }, + { + "epoch": 2.3510487231417465, + "grad_norm": 0.3207650401597135, + "learning_rate": 1.358570507451824e-05, + "loss": 2.683, + "step": 50498 + }, + { + "epoch": 2.3510952813278396, + "grad_norm": 0.3265945513556006, + "learning_rate": 1.3583848902814127e-05, + "loss": 2.6286, + "step": 50499 + }, + { + "epoch": 2.3511418395139323, + "grad_norm": 0.34056212954135356, + "learning_rate": 1.358199283798784e-05, + "loss": 2.5826, + "step": 50500 + }, + { + "epoch": 2.3511883977000254, + "grad_norm": 0.33452367846882836, + "learning_rate": 1.3580136880044814e-05, + "loss": 2.6811, + "step": 50501 + }, + { + "epoch": 2.3512349558861185, + "grad_norm": 0.3413932550242443, + "learning_rate": 1.3578281028990492e-05, + "loss": 2.6544, + "step": 50502 + }, + { + "epoch": 2.3512815140722116, + "grad_norm": 0.30801210758760494, + "learning_rate": 1.3576425284830347e-05, + "loss": 2.5015, + "step": 50503 + }, + { + "epoch": 2.3513280722583048, + "grad_norm": 0.3415425324805864, + "learning_rate": 1.357456964756979e-05, + "loss": 2.6168, + "step": 50504 + }, + { + "epoch": 2.351374630444398, + "grad_norm": 0.342551957041276, + "learning_rate": 1.3572714117214286e-05, + "loss": 2.6365, + "step": 50505 + }, + { + "epoch": 2.351421188630491, + "grad_norm": 
0.31853706208024524, + "learning_rate": 1.3570858693769274e-05, + "loss": 2.6717, + "step": 50506 + }, + { + "epoch": 2.351467746816584, + "grad_norm": 0.30582476114753926, + "learning_rate": 1.3569003377240214e-05, + "loss": 2.5907, + "step": 50507 + }, + { + "epoch": 2.351514305002677, + "grad_norm": 0.30648631462177817, + "learning_rate": 1.3567148167632515e-05, + "loss": 2.4593, + "step": 50508 + }, + { + "epoch": 2.3515608631887703, + "grad_norm": 0.3326067228507788, + "learning_rate": 1.3565293064951673e-05, + "loss": 2.603, + "step": 50509 + }, + { + "epoch": 2.3516074213748634, + "grad_norm": 0.3093568513711665, + "learning_rate": 1.3563438069203083e-05, + "loss": 2.6367, + "step": 50510 + }, + { + "epoch": 2.351653979560956, + "grad_norm": 0.3127467694788042, + "learning_rate": 1.3561583180392235e-05, + "loss": 2.6506, + "step": 50511 + }, + { + "epoch": 2.3517005377470492, + "grad_norm": 0.31136439690952566, + "learning_rate": 1.3559728398524541e-05, + "loss": 2.6841, + "step": 50512 + }, + { + "epoch": 2.3517470959331424, + "grad_norm": 0.32292079313585437, + "learning_rate": 1.3557873723605451e-05, + "loss": 2.6176, + "step": 50513 + }, + { + "epoch": 2.3517936541192355, + "grad_norm": 0.3146505824195746, + "learning_rate": 1.3556019155640414e-05, + "loss": 2.6841, + "step": 50514 + }, + { + "epoch": 2.3518402123053286, + "grad_norm": 0.3264299375780631, + "learning_rate": 1.3554164694634869e-05, + "loss": 2.6674, + "step": 50515 + }, + { + "epoch": 2.3518867704914217, + "grad_norm": 0.31907612767864785, + "learning_rate": 1.3552310340594277e-05, + "loss": 2.6375, + "step": 50516 + }, + { + "epoch": 2.351933328677515, + "grad_norm": 0.3314264552914643, + "learning_rate": 1.3550456093524045e-05, + "loss": 2.6041, + "step": 50517 + }, + { + "epoch": 2.351979886863608, + "grad_norm": 0.31412121066075865, + "learning_rate": 1.3548601953429635e-05, + "loss": 2.5279, + "step": 50518 + }, + { + "epoch": 2.352026445049701, + "grad_norm": 0.3121950640372147, + "learning_rate": 1.3546747920316488e-05, + "loss": 2.743, + "step": 50519 + }, + { + "epoch": 2.3520730032357937, + "grad_norm": 0.3262698498115422, + "learning_rate": 1.3544893994190045e-05, + "loss": 2.6596, + "step": 50520 + }, + { + "epoch": 2.352119561421887, + "grad_norm": 0.3247707887581064, + "learning_rate": 1.3543040175055744e-05, + "loss": 2.5399, + "step": 50521 + }, + { + "epoch": 2.35216611960798, + "grad_norm": 0.3270932689941742, + "learning_rate": 1.3541186462919047e-05, + "loss": 2.6131, + "step": 50522 + }, + { + "epoch": 2.352212677794073, + "grad_norm": 0.31507389394168045, + "learning_rate": 1.3539332857785348e-05, + "loss": 2.5794, + "step": 50523 + }, + { + "epoch": 2.352259235980166, + "grad_norm": 0.3232402994191277, + "learning_rate": 1.3537479359660138e-05, + "loss": 2.7097, + "step": 50524 + }, + { + "epoch": 2.3523057941662593, + "grad_norm": 0.32184091692411787, + "learning_rate": 1.353562596854882e-05, + "loss": 2.6261, + "step": 50525 + }, + { + "epoch": 2.3523523523523524, + "grad_norm": 0.327801619952498, + "learning_rate": 1.3533772684456853e-05, + "loss": 2.5282, + "step": 50526 + }, + { + "epoch": 2.3523989105384455, + "grad_norm": 0.3054942658275614, + "learning_rate": 1.3531919507389668e-05, + "loss": 2.6181, + "step": 50527 + }, + { + "epoch": 2.3524454687245386, + "grad_norm": 0.3345747274792268, + "learning_rate": 1.3530066437352706e-05, + "loss": 2.7512, + "step": 50528 + }, + { + "epoch": 2.3524920269106318, + "grad_norm": 0.3096296718613336, + "learning_rate": 1.352821347435142e-05, + 
"loss": 2.6833, + "step": 50529 + }, + { + "epoch": 2.352538585096725, + "grad_norm": 0.34119327131194627, + "learning_rate": 1.352636061839122e-05, + "loss": 2.579, + "step": 50530 + }, + { + "epoch": 2.3525851432828175, + "grad_norm": 0.32222627895275224, + "learning_rate": 1.352450786947756e-05, + "loss": 2.7169, + "step": 50531 + }, + { + "epoch": 2.3526317014689107, + "grad_norm": 0.33411224957089014, + "learning_rate": 1.3522655227615876e-05, + "loss": 2.692, + "step": 50532 + }, + { + "epoch": 2.3526782596550038, + "grad_norm": 0.30499210502962015, + "learning_rate": 1.3520802692811602e-05, + "loss": 2.5523, + "step": 50533 + }, + { + "epoch": 2.352724817841097, + "grad_norm": 0.3168630168794329, + "learning_rate": 1.351895026507018e-05, + "loss": 2.6074, + "step": 50534 + }, + { + "epoch": 2.35277137602719, + "grad_norm": 0.32472579364013293, + "learning_rate": 1.3517097944397062e-05, + "loss": 2.619, + "step": 50535 + }, + { + "epoch": 2.352817934213283, + "grad_norm": 0.32490883129537707, + "learning_rate": 1.3515245730797639e-05, + "loss": 2.6294, + "step": 50536 + }, + { + "epoch": 2.3528644923993762, + "grad_norm": 0.34263151832267025, + "learning_rate": 1.3513393624277398e-05, + "loss": 2.6198, + "step": 50537 + }, + { + "epoch": 2.3529110505854693, + "grad_norm": 0.3365490940712891, + "learning_rate": 1.3511541624841727e-05, + "loss": 2.6576, + "step": 50538 + }, + { + "epoch": 2.352957608771562, + "grad_norm": 0.3264093406051988, + "learning_rate": 1.3509689732496117e-05, + "loss": 2.6768, + "step": 50539 + }, + { + "epoch": 2.353004166957655, + "grad_norm": 0.32971573575621616, + "learning_rate": 1.3507837947245955e-05, + "loss": 2.6437, + "step": 50540 + }, + { + "epoch": 2.3530507251437482, + "grad_norm": 0.3324987126389997, + "learning_rate": 1.3505986269096693e-05, + "loss": 2.5623, + "step": 50541 + }, + { + "epoch": 2.3530972833298414, + "grad_norm": 0.3257122620953922, + "learning_rate": 1.3504134698053789e-05, + "loss": 2.5913, + "step": 50542 + }, + { + "epoch": 2.3531438415159345, + "grad_norm": 0.3329551744694686, + "learning_rate": 1.3502283234122632e-05, + "loss": 2.5436, + "step": 50543 + }, + { + "epoch": 2.3531903997020276, + "grad_norm": 0.343038359246979, + "learning_rate": 1.3500431877308678e-05, + "loss": 2.6232, + "step": 50544 + }, + { + "epoch": 2.3532369578881207, + "grad_norm": 0.3178992832234233, + "learning_rate": 1.3498580627617363e-05, + "loss": 2.7402, + "step": 50545 + }, + { + "epoch": 2.353283516074214, + "grad_norm": 0.30467676413383543, + "learning_rate": 1.3496729485054115e-05, + "loss": 2.6239, + "step": 50546 + }, + { + "epoch": 2.353330074260307, + "grad_norm": 0.32898638324369056, + "learning_rate": 1.3494878449624377e-05, + "loss": 2.5626, + "step": 50547 + }, + { + "epoch": 2.3533766324464, + "grad_norm": 0.3511327446990102, + "learning_rate": 1.3493027521333578e-05, + "loss": 2.6297, + "step": 50548 + }, + { + "epoch": 2.353423190632493, + "grad_norm": 0.3026036824453292, + "learning_rate": 1.3491176700187124e-05, + "loss": 2.4845, + "step": 50549 + }, + { + "epoch": 2.353469748818586, + "grad_norm": 0.33953120072782494, + "learning_rate": 1.34893259861905e-05, + "loss": 2.6625, + "step": 50550 + }, + { + "epoch": 2.353516307004679, + "grad_norm": 0.33963233359764855, + "learning_rate": 1.348747537934908e-05, + "loss": 2.6091, + "step": 50551 + }, + { + "epoch": 2.353562865190772, + "grad_norm": 0.31695517077717733, + "learning_rate": 1.3485624879668347e-05, + "loss": 2.5629, + "step": 50552 + }, + { + "epoch": 2.353609423376865, 
+ "grad_norm": 0.3171474701253688, + "learning_rate": 1.3483774487153695e-05, + "loss": 2.5791, + "step": 50553 + }, + { + "epoch": 2.3536559815629583, + "grad_norm": 0.33088913441823087, + "learning_rate": 1.3481924201810564e-05, + "loss": 2.5816, + "step": 50554 + }, + { + "epoch": 2.3537025397490514, + "grad_norm": 0.33672392981545174, + "learning_rate": 1.3480074023644407e-05, + "loss": 2.6566, + "step": 50555 + }, + { + "epoch": 2.3537490979351445, + "grad_norm": 0.31606331575486984, + "learning_rate": 1.34782239526606e-05, + "loss": 2.6081, + "step": 50556 + }, + { + "epoch": 2.3537956561212376, + "grad_norm": 0.32150476875693373, + "learning_rate": 1.3476373988864643e-05, + "loss": 2.6293, + "step": 50557 + }, + { + "epoch": 2.3538422143073308, + "grad_norm": 0.34400112527668797, + "learning_rate": 1.3474524132261918e-05, + "loss": 2.5343, + "step": 50558 + }, + { + "epoch": 2.3538887724934234, + "grad_norm": 0.33536491952756087, + "learning_rate": 1.3472674382857858e-05, + "loss": 2.6057, + "step": 50559 + }, + { + "epoch": 2.3539353306795165, + "grad_norm": 0.3445897319227914, + "learning_rate": 1.3470824740657905e-05, + "loss": 2.6065, + "step": 50560 + }, + { + "epoch": 2.3539818888656097, + "grad_norm": 0.3420080917864928, + "learning_rate": 1.3468975205667495e-05, + "loss": 2.705, + "step": 50561 + }, + { + "epoch": 2.3540284470517028, + "grad_norm": 0.34166909142825297, + "learning_rate": 1.3467125777892015e-05, + "loss": 2.745, + "step": 50562 + }, + { + "epoch": 2.354075005237796, + "grad_norm": 0.3161740206284904, + "learning_rate": 1.3465276457336957e-05, + "loss": 2.6156, + "step": 50563 + }, + { + "epoch": 2.354121563423889, + "grad_norm": 0.32428814570024617, + "learning_rate": 1.3463427244007676e-05, + "loss": 2.7043, + "step": 50564 + }, + { + "epoch": 2.354168121609982, + "grad_norm": 0.3166573171442679, + "learning_rate": 1.3461578137909669e-05, + "loss": 2.624, + "step": 50565 + }, + { + "epoch": 2.3542146797960752, + "grad_norm": 0.3421263501348431, + "learning_rate": 1.3459729139048316e-05, + "loss": 2.8138, + "step": 50566 + }, + { + "epoch": 2.3542612379821684, + "grad_norm": 0.3227816395174091, + "learning_rate": 1.3457880247429055e-05, + "loss": 2.6754, + "step": 50567 + }, + { + "epoch": 2.3543077961682615, + "grad_norm": 0.3323099487419929, + "learning_rate": 1.3456031463057333e-05, + "loss": 2.6617, + "step": 50568 + }, + { + "epoch": 2.3543543543543546, + "grad_norm": 0.32295316980264377, + "learning_rate": 1.3454182785938529e-05, + "loss": 2.6192, + "step": 50569 + }, + { + "epoch": 2.3544009125404473, + "grad_norm": 0.3259392124575179, + "learning_rate": 1.3452334216078133e-05, + "loss": 2.602, + "step": 50570 + }, + { + "epoch": 2.3544474707265404, + "grad_norm": 0.35545925774649784, + "learning_rate": 1.3450485753481513e-05, + "loss": 2.656, + "step": 50571 + }, + { + "epoch": 2.3544940289126335, + "grad_norm": 0.3300867875115142, + "learning_rate": 1.3448637398154118e-05, + "loss": 2.7174, + "step": 50572 + }, + { + "epoch": 2.3545405870987266, + "grad_norm": 0.3269614569684885, + "learning_rate": 1.3446789150101375e-05, + "loss": 2.6304, + "step": 50573 + }, + { + "epoch": 2.3545871452848197, + "grad_norm": 0.3269349923243019, + "learning_rate": 1.3444941009328699e-05, + "loss": 2.6599, + "step": 50574 + }, + { + "epoch": 2.354633703470913, + "grad_norm": 0.32627601150437674, + "learning_rate": 1.3443092975841525e-05, + "loss": 2.5927, + "step": 50575 + }, + { + "epoch": 2.354680261657006, + "grad_norm": 0.32445865105414856, + "learning_rate": 
1.3441245049645285e-05, + "loss": 2.6664, + "step": 50576 + }, + { + "epoch": 2.354726819843099, + "grad_norm": 0.315490640194887, + "learning_rate": 1.343939723074536e-05, + "loss": 2.6832, + "step": 50577 + }, + { + "epoch": 2.354773378029192, + "grad_norm": 0.3129003311147928, + "learning_rate": 1.343754951914723e-05, + "loss": 2.7258, + "step": 50578 + }, + { + "epoch": 2.354819936215285, + "grad_norm": 0.31661280590891283, + "learning_rate": 1.3435701914856275e-05, + "loss": 2.6092, + "step": 50579 + }, + { + "epoch": 2.354866494401378, + "grad_norm": 0.31284825729904736, + "learning_rate": 1.3433854417877934e-05, + "loss": 2.5519, + "step": 50580 + }, + { + "epoch": 2.354913052587471, + "grad_norm": 0.32483946697406174, + "learning_rate": 1.3432007028217648e-05, + "loss": 2.66, + "step": 50581 + }, + { + "epoch": 2.354959610773564, + "grad_norm": 0.31962762379441595, + "learning_rate": 1.3430159745880783e-05, + "loss": 2.7001, + "step": 50582 + }, + { + "epoch": 2.3550061689596573, + "grad_norm": 0.3360746584073823, + "learning_rate": 1.3428312570872825e-05, + "loss": 2.6184, + "step": 50583 + }, + { + "epoch": 2.3550527271457504, + "grad_norm": 0.3282308976152574, + "learning_rate": 1.3426465503199154e-05, + "loss": 2.6786, + "step": 50584 + }, + { + "epoch": 2.3550992853318435, + "grad_norm": 0.31889233929099764, + "learning_rate": 1.3424618542865203e-05, + "loss": 2.6463, + "step": 50585 + }, + { + "epoch": 2.3551458435179367, + "grad_norm": 0.3148300520463595, + "learning_rate": 1.3422771689876396e-05, + "loss": 2.682, + "step": 50586 + }, + { + "epoch": 2.3551924017040298, + "grad_norm": 0.3480633292819187, + "learning_rate": 1.3420924944238145e-05, + "loss": 2.7093, + "step": 50587 + }, + { + "epoch": 2.355238959890123, + "grad_norm": 0.33596576735434425, + "learning_rate": 1.341907830595588e-05, + "loss": 2.7124, + "step": 50588 + }, + { + "epoch": 2.355285518076216, + "grad_norm": 0.3251526484598787, + "learning_rate": 1.3417231775035032e-05, + "loss": 2.627, + "step": 50589 + }, + { + "epoch": 2.3553320762623087, + "grad_norm": 0.33195659255868376, + "learning_rate": 1.341538535148097e-05, + "loss": 2.6355, + "step": 50590 + }, + { + "epoch": 2.355378634448402, + "grad_norm": 0.3477330673570046, + "learning_rate": 1.3413539035299183e-05, + "loss": 2.677, + "step": 50591 + }, + { + "epoch": 2.355425192634495, + "grad_norm": 0.31206321366086964, + "learning_rate": 1.3411692826495036e-05, + "loss": 2.6114, + "step": 50592 + }, + { + "epoch": 2.355471750820588, + "grad_norm": 0.3364375325037076, + "learning_rate": 1.3409846725073971e-05, + "loss": 2.7004, + "step": 50593 + }, + { + "epoch": 2.355518309006681, + "grad_norm": 0.3510100569220488, + "learning_rate": 1.3408000731041408e-05, + "loss": 2.5837, + "step": 50594 + }, + { + "epoch": 2.3555648671927742, + "grad_norm": 0.31615221479426014, + "learning_rate": 1.3406154844402735e-05, + "loss": 2.6612, + "step": 50595 + }, + { + "epoch": 2.3556114253788674, + "grad_norm": 0.3128910924527611, + "learning_rate": 1.3404309065163423e-05, + "loss": 2.6092, + "step": 50596 + }, + { + "epoch": 2.3556579835649605, + "grad_norm": 0.31423298899072777, + "learning_rate": 1.340246339332884e-05, + "loss": 2.6545, + "step": 50597 + }, + { + "epoch": 2.355704541751053, + "grad_norm": 0.3340706336555489, + "learning_rate": 1.3400617828904421e-05, + "loss": 2.6523, + "step": 50598 + }, + { + "epoch": 2.3557510999371463, + "grad_norm": 0.3287620968289293, + "learning_rate": 1.3398772371895585e-05, + "loss": 2.6693, + "step": 50599 + }, + { + 
"epoch": 2.3557976581232394, + "grad_norm": 0.3198576864282032, + "learning_rate": 1.3396927022307742e-05, + "loss": 2.6093, + "step": 50600 + }, + { + "epoch": 2.3558442163093325, + "grad_norm": 0.3354327471034061, + "learning_rate": 1.3395081780146318e-05, + "loss": 2.7069, + "step": 50601 + }, + { + "epoch": 2.3558907744954256, + "grad_norm": 0.3220629408689729, + "learning_rate": 1.3393236645416735e-05, + "loss": 2.6142, + "step": 50602 + }, + { + "epoch": 2.3559373326815187, + "grad_norm": 0.3246671490665397, + "learning_rate": 1.3391391618124366e-05, + "loss": 2.5663, + "step": 50603 + }, + { + "epoch": 2.355983890867612, + "grad_norm": 0.32075866676266956, + "learning_rate": 1.3389546698274685e-05, + "loss": 2.671, + "step": 50604 + }, + { + "epoch": 2.356030449053705, + "grad_norm": 0.3477840128769718, + "learning_rate": 1.3387701885873067e-05, + "loss": 2.6463, + "step": 50605 + }, + { + "epoch": 2.356077007239798, + "grad_norm": 0.3231507026644479, + "learning_rate": 1.3385857180924932e-05, + "loss": 2.6519, + "step": 50606 + }, + { + "epoch": 2.356123565425891, + "grad_norm": 0.30557817450921837, + "learning_rate": 1.3384012583435702e-05, + "loss": 2.5981, + "step": 50607 + }, + { + "epoch": 2.3561701236119843, + "grad_norm": 0.3156137700412127, + "learning_rate": 1.3382168093410791e-05, + "loss": 2.6749, + "step": 50608 + }, + { + "epoch": 2.356216681798077, + "grad_norm": 0.3322468773976802, + "learning_rate": 1.3380323710855619e-05, + "loss": 2.5225, + "step": 50609 + }, + { + "epoch": 2.35626323998417, + "grad_norm": 0.31403394623741293, + "learning_rate": 1.337847943577556e-05, + "loss": 2.5288, + "step": 50610 + }, + { + "epoch": 2.356309798170263, + "grad_norm": 0.322452894867221, + "learning_rate": 1.3376635268176085e-05, + "loss": 2.5713, + "step": 50611 + }, + { + "epoch": 2.3563563563563563, + "grad_norm": 0.3252611857500735, + "learning_rate": 1.3374791208062564e-05, + "loss": 2.5405, + "step": 50612 + }, + { + "epoch": 2.3564029145424494, + "grad_norm": 0.32614701377101885, + "learning_rate": 1.3372947255440426e-05, + "loss": 2.6501, + "step": 50613 + }, + { + "epoch": 2.3564494727285425, + "grad_norm": 0.33895392886453785, + "learning_rate": 1.3371103410315079e-05, + "loss": 2.6713, + "step": 50614 + }, + { + "epoch": 2.3564960309146357, + "grad_norm": 0.32705409109494543, + "learning_rate": 1.3369259672691942e-05, + "loss": 2.558, + "step": 50615 + }, + { + "epoch": 2.356542589100729, + "grad_norm": 0.3063272349223528, + "learning_rate": 1.3367416042576398e-05, + "loss": 2.6319, + "step": 50616 + }, + { + "epoch": 2.356589147286822, + "grad_norm": 0.31631321518351385, + "learning_rate": 1.3365572519973902e-05, + "loss": 2.6442, + "step": 50617 + }, + { + "epoch": 2.3566357054729146, + "grad_norm": 0.3126217829660279, + "learning_rate": 1.3363729104889833e-05, + "loss": 2.5806, + "step": 50618 + }, + { + "epoch": 2.3566822636590077, + "grad_norm": 0.3278131764744562, + "learning_rate": 1.3361885797329603e-05, + "loss": 2.6661, + "step": 50619 + }, + { + "epoch": 2.356728821845101, + "grad_norm": 0.3236532401706616, + "learning_rate": 1.3360042597298633e-05, + "loss": 2.5846, + "step": 50620 + }, + { + "epoch": 2.356775380031194, + "grad_norm": 0.34227424422762365, + "learning_rate": 1.3358199504802332e-05, + "loss": 2.6961, + "step": 50621 + }, + { + "epoch": 2.356821938217287, + "grad_norm": 0.3441431853673475, + "learning_rate": 1.3356356519846113e-05, + "loss": 2.5782, + "step": 50622 + }, + { + "epoch": 2.35686849640338, + "grad_norm": 0.3354097404071167, + 
"learning_rate": 1.3354513642435352e-05, + "loss": 2.6421, + "step": 50623 + }, + { + "epoch": 2.3569150545894733, + "grad_norm": 0.32787646823464134, + "learning_rate": 1.3352670872575512e-05, + "loss": 2.6383, + "step": 50624 + }, + { + "epoch": 2.3569616127755664, + "grad_norm": 0.32760107652669157, + "learning_rate": 1.335082821027196e-05, + "loss": 2.6154, + "step": 50625 + }, + { + "epoch": 2.3570081709616595, + "grad_norm": 0.3244839469016818, + "learning_rate": 1.334898565553011e-05, + "loss": 2.5921, + "step": 50626 + }, + { + "epoch": 2.3570547291477526, + "grad_norm": 0.3334561579220205, + "learning_rate": 1.3347143208355384e-05, + "loss": 2.7232, + "step": 50627 + }, + { + "epoch": 2.3571012873338457, + "grad_norm": 0.32087715877885153, + "learning_rate": 1.334530086875318e-05, + "loss": 2.6378, + "step": 50628 + }, + { + "epoch": 2.3571478455199384, + "grad_norm": 0.3279022766023704, + "learning_rate": 1.3343458636728905e-05, + "loss": 2.7791, + "step": 50629 + }, + { + "epoch": 2.3571944037060315, + "grad_norm": 0.3295917133336306, + "learning_rate": 1.3341616512287986e-05, + "loss": 2.6715, + "step": 50630 + }, + { + "epoch": 2.3572409618921246, + "grad_norm": 0.32235026112175696, + "learning_rate": 1.3339774495435791e-05, + "loss": 2.5453, + "step": 50631 + }, + { + "epoch": 2.3572875200782177, + "grad_norm": 0.3210001774226515, + "learning_rate": 1.3337932586177747e-05, + "loss": 2.5309, + "step": 50632 + }, + { + "epoch": 2.357334078264311, + "grad_norm": 0.31869590982675067, + "learning_rate": 1.3336090784519261e-05, + "loss": 2.6294, + "step": 50633 + }, + { + "epoch": 2.357380636450404, + "grad_norm": 0.34823011981856505, + "learning_rate": 1.3334249090465739e-05, + "loss": 2.7164, + "step": 50634 + }, + { + "epoch": 2.357427194636497, + "grad_norm": 0.32108897220473503, + "learning_rate": 1.3332407504022593e-05, + "loss": 2.5846, + "step": 50635 + }, + { + "epoch": 2.35747375282259, + "grad_norm": 0.33207892009690426, + "learning_rate": 1.3330566025195191e-05, + "loss": 2.7298, + "step": 50636 + }, + { + "epoch": 2.357520311008683, + "grad_norm": 0.34090676626757904, + "learning_rate": 1.3328724653989e-05, + "loss": 2.5948, + "step": 50637 + }, + { + "epoch": 2.357566869194776, + "grad_norm": 0.32635095814911114, + "learning_rate": 1.3326883390409367e-05, + "loss": 2.6858, + "step": 50638 + }, + { + "epoch": 2.357613427380869, + "grad_norm": 0.3317664566630975, + "learning_rate": 1.332504223446172e-05, + "loss": 2.7275, + "step": 50639 + }, + { + "epoch": 2.357659985566962, + "grad_norm": 0.3379033003600755, + "learning_rate": 1.3323201186151463e-05, + "loss": 2.5875, + "step": 50640 + }, + { + "epoch": 2.3577065437530553, + "grad_norm": 0.33747955005646296, + "learning_rate": 1.3321360245483999e-05, + "loss": 2.6479, + "step": 50641 + }, + { + "epoch": 2.3577531019391484, + "grad_norm": 0.3284662819026358, + "learning_rate": 1.3319519412464725e-05, + "loss": 2.61, + "step": 50642 + }, + { + "epoch": 2.3577996601252416, + "grad_norm": 0.33580539785600333, + "learning_rate": 1.3317678687099067e-05, + "loss": 2.6457, + "step": 50643 + }, + { + "epoch": 2.3578462183113347, + "grad_norm": 0.3226106531467442, + "learning_rate": 1.331583806939239e-05, + "loss": 2.5779, + "step": 50644 + }, + { + "epoch": 2.357892776497428, + "grad_norm": 0.3301230432167364, + "learning_rate": 1.3313997559350116e-05, + "loss": 2.6011, + "step": 50645 + }, + { + "epoch": 2.357939334683521, + "grad_norm": 0.3333931282572689, + "learning_rate": 1.3312157156977645e-05, + "loss": 2.635, + "step": 
50646 + }, + { + "epoch": 2.357985892869614, + "grad_norm": 0.3711130598383973, + "learning_rate": 1.3310316862280375e-05, + "loss": 2.6804, + "step": 50647 + }, + { + "epoch": 2.3580324510557067, + "grad_norm": 0.3435276530407376, + "learning_rate": 1.3308476675263731e-05, + "loss": 2.7137, + "step": 50648 + }, + { + "epoch": 2.3580790092418, + "grad_norm": 0.32058878008821645, + "learning_rate": 1.3306636595933058e-05, + "loss": 2.7203, + "step": 50649 + }, + { + "epoch": 2.358125567427893, + "grad_norm": 0.3586046607179893, + "learning_rate": 1.3304796624293825e-05, + "loss": 2.609, + "step": 50650 + }, + { + "epoch": 2.358172125613986, + "grad_norm": 0.3684235214711072, + "learning_rate": 1.330295676035138e-05, + "loss": 2.651, + "step": 50651 + }, + { + "epoch": 2.358218683800079, + "grad_norm": 0.3326699326733557, + "learning_rate": 1.3301117004111146e-05, + "loss": 2.5374, + "step": 50652 + }, + { + "epoch": 2.3582652419861723, + "grad_norm": 0.3383253161492358, + "learning_rate": 1.3299277355578521e-05, + "loss": 2.5512, + "step": 50653 + }, + { + "epoch": 2.3583118001722654, + "grad_norm": 0.34901736628035746, + "learning_rate": 1.3297437814758895e-05, + "loss": 2.5949, + "step": 50654 + }, + { + "epoch": 2.3583583583583585, + "grad_norm": 0.3198291033463002, + "learning_rate": 1.3295598381657676e-05, + "loss": 2.7019, + "step": 50655 + }, + { + "epoch": 2.3584049165444516, + "grad_norm": 0.3179427314116292, + "learning_rate": 1.3293759056280275e-05, + "loss": 2.6265, + "step": 50656 + }, + { + "epoch": 2.3584514747305443, + "grad_norm": 0.33199386949812487, + "learning_rate": 1.3291919838632055e-05, + "loss": 2.6906, + "step": 50657 + }, + { + "epoch": 2.3584980329166374, + "grad_norm": 0.3186467465501221, + "learning_rate": 1.329008072871844e-05, + "loss": 2.6418, + "step": 50658 + }, + { + "epoch": 2.3585445911027305, + "grad_norm": 0.31279035152102863, + "learning_rate": 1.328824172654482e-05, + "loss": 2.6284, + "step": 50659 + }, + { + "epoch": 2.3585911492888236, + "grad_norm": 0.30759608307863434, + "learning_rate": 1.3286402832116596e-05, + "loss": 2.5659, + "step": 50660 + }, + { + "epoch": 2.3586377074749167, + "grad_norm": 0.3217580812224147, + "learning_rate": 1.3284564045439174e-05, + "loss": 2.6856, + "step": 50661 + }, + { + "epoch": 2.35868426566101, + "grad_norm": 0.3180206541847883, + "learning_rate": 1.3282725366517911e-05, + "loss": 2.632, + "step": 50662 + }, + { + "epoch": 2.358730823847103, + "grad_norm": 0.3092150651615352, + "learning_rate": 1.3280886795358261e-05, + "loss": 2.6681, + "step": 50663 + }, + { + "epoch": 2.358777382033196, + "grad_norm": 0.3118457256440444, + "learning_rate": 1.327904833196556e-05, + "loss": 2.5764, + "step": 50664 + }, + { + "epoch": 2.358823940219289, + "grad_norm": 0.33113354496489095, + "learning_rate": 1.3277209976345268e-05, + "loss": 2.6259, + "step": 50665 + }, + { + "epoch": 2.3588704984053823, + "grad_norm": 0.309998290416899, + "learning_rate": 1.3275371728502728e-05, + "loss": 2.6701, + "step": 50666 + }, + { + "epoch": 2.3589170565914754, + "grad_norm": 0.318952291829625, + "learning_rate": 1.3273533588443354e-05, + "loss": 2.679, + "step": 50667 + }, + { + "epoch": 2.358963614777568, + "grad_norm": 0.3303788848128355, + "learning_rate": 1.3271695556172541e-05, + "loss": 2.6306, + "step": 50668 + }, + { + "epoch": 2.3590101729636612, + "grad_norm": 0.3279154350497064, + "learning_rate": 1.3269857631695704e-05, + "loss": 2.7306, + "step": 50669 + }, + { + "epoch": 2.3590567311497543, + "grad_norm": 
0.30472004626811766, + "learning_rate": 1.3268019815018196e-05, + "loss": 2.5708, + "step": 50670 + }, + { + "epoch": 2.3591032893358475, + "grad_norm": 0.3064633501893838, + "learning_rate": 1.326618210614543e-05, + "loss": 2.5686, + "step": 50671 + }, + { + "epoch": 2.3591498475219406, + "grad_norm": 0.31422003339491894, + "learning_rate": 1.3264344505082804e-05, + "loss": 2.6314, + "step": 50672 + }, + { + "epoch": 2.3591964057080337, + "grad_norm": 0.32031050675563133, + "learning_rate": 1.3262507011835706e-05, + "loss": 2.6784, + "step": 50673 + }, + { + "epoch": 2.359242963894127, + "grad_norm": 0.3224660248483302, + "learning_rate": 1.326066962640955e-05, + "loss": 2.6695, + "step": 50674 + }, + { + "epoch": 2.35928952208022, + "grad_norm": 0.327912100972378, + "learning_rate": 1.3258832348809674e-05, + "loss": 2.6562, + "step": 50675 + }, + { + "epoch": 2.3593360802663126, + "grad_norm": 0.3029590403103771, + "learning_rate": 1.3256995179041537e-05, + "loss": 2.5795, + "step": 50676 + }, + { + "epoch": 2.3593826384524057, + "grad_norm": 0.30611997869690866, + "learning_rate": 1.3255158117110467e-05, + "loss": 2.5684, + "step": 50677 + }, + { + "epoch": 2.359429196638499, + "grad_norm": 0.32223619915397966, + "learning_rate": 1.325332116302192e-05, + "loss": 2.5377, + "step": 50678 + }, + { + "epoch": 2.359475754824592, + "grad_norm": 0.3485674390120139, + "learning_rate": 1.3251484316781243e-05, + "loss": 2.6768, + "step": 50679 + }, + { + "epoch": 2.359522313010685, + "grad_norm": 0.3026752077606027, + "learning_rate": 1.3249647578393837e-05, + "loss": 2.6272, + "step": 50680 + }, + { + "epoch": 2.359568871196778, + "grad_norm": 0.3146300858067175, + "learning_rate": 1.3247810947865092e-05, + "loss": 2.6309, + "step": 50681 + }, + { + "epoch": 2.3596154293828713, + "grad_norm": 0.3058122633610952, + "learning_rate": 1.3245974425200403e-05, + "loss": 2.6481, + "step": 50682 + }, + { + "epoch": 2.3596619875689644, + "grad_norm": 0.3064341278411313, + "learning_rate": 1.3244138010405178e-05, + "loss": 2.6441, + "step": 50683 + }, + { + "epoch": 2.3597085457550575, + "grad_norm": 0.3004021584912023, + "learning_rate": 1.3242301703484766e-05, + "loss": 2.7195, + "step": 50684 + }, + { + "epoch": 2.3597551039411506, + "grad_norm": 0.3137257894683826, + "learning_rate": 1.3240465504444577e-05, + "loss": 2.7433, + "step": 50685 + }, + { + "epoch": 2.3598016621272437, + "grad_norm": 0.3190143242386903, + "learning_rate": 1.3238629413290004e-05, + "loss": 2.7145, + "step": 50686 + }, + { + "epoch": 2.3598482203133364, + "grad_norm": 0.3095828820699698, + "learning_rate": 1.3236793430026445e-05, + "loss": 2.6387, + "step": 50687 + }, + { + "epoch": 2.3598947784994295, + "grad_norm": 0.2983917492663373, + "learning_rate": 1.3234957554659244e-05, + "loss": 2.5796, + "step": 50688 + }, + { + "epoch": 2.3599413366855226, + "grad_norm": 0.3151167236797103, + "learning_rate": 1.3233121787193854e-05, + "loss": 2.6311, + "step": 50689 + }, + { + "epoch": 2.3599878948716158, + "grad_norm": 0.33974041271876326, + "learning_rate": 1.32312861276356e-05, + "loss": 2.6527, + "step": 50690 + }, + { + "epoch": 2.360034453057709, + "grad_norm": 0.34780906967738795, + "learning_rate": 1.3229450575989927e-05, + "loss": 2.589, + "step": 50691 + }, + { + "epoch": 2.360081011243802, + "grad_norm": 0.3294948154561418, + "learning_rate": 1.3227615132262178e-05, + "loss": 2.7249, + "step": 50692 + }, + { + "epoch": 2.360127569429895, + "grad_norm": 0.3349526544287613, + "learning_rate": 1.3225779796457754e-05, + 
"loss": 2.6749, + "step": 50693 + }, + { + "epoch": 2.360174127615988, + "grad_norm": 0.3166229164408603, + "learning_rate": 1.3223944568582048e-05, + "loss": 2.6, + "step": 50694 + }, + { + "epoch": 2.3602206858020813, + "grad_norm": 0.3460263661872206, + "learning_rate": 1.3222109448640441e-05, + "loss": 2.6183, + "step": 50695 + }, + { + "epoch": 2.360267243988174, + "grad_norm": 0.3237536794537063, + "learning_rate": 1.3220274436638335e-05, + "loss": 2.614, + "step": 50696 + }, + { + "epoch": 2.360313802174267, + "grad_norm": 0.3233455329929235, + "learning_rate": 1.3218439532581083e-05, + "loss": 2.6553, + "step": 50697 + }, + { + "epoch": 2.3603603603603602, + "grad_norm": 0.3301936339355439, + "learning_rate": 1.3216604736474086e-05, + "loss": 2.7248, + "step": 50698 + }, + { + "epoch": 2.3604069185464533, + "grad_norm": 0.3458079212959709, + "learning_rate": 1.3214770048322734e-05, + "loss": 2.5877, + "step": 50699 + }, + { + "epoch": 2.3604534767325465, + "grad_norm": 0.3418046620761944, + "learning_rate": 1.3212935468132404e-05, + "loss": 2.5428, + "step": 50700 + }, + { + "epoch": 2.3605000349186396, + "grad_norm": 0.3286214578891197, + "learning_rate": 1.3211100995908488e-05, + "loss": 2.7367, + "step": 50701 + }, + { + "epoch": 2.3605465931047327, + "grad_norm": 0.33656681398280064, + "learning_rate": 1.320926663165638e-05, + "loss": 2.7166, + "step": 50702 + }, + { + "epoch": 2.360593151290826, + "grad_norm": 0.3416043763926331, + "learning_rate": 1.3207432375381418e-05, + "loss": 2.6026, + "step": 50703 + }, + { + "epoch": 2.360639709476919, + "grad_norm": 0.3389982233738332, + "learning_rate": 1.320559822708905e-05, + "loss": 2.6853, + "step": 50704 + }, + { + "epoch": 2.360686267663012, + "grad_norm": 0.33336817130892415, + "learning_rate": 1.3203764186784613e-05, + "loss": 2.6338, + "step": 50705 + }, + { + "epoch": 2.360732825849105, + "grad_norm": 0.33243836573347385, + "learning_rate": 1.3201930254473499e-05, + "loss": 2.678, + "step": 50706 + }, + { + "epoch": 2.360779384035198, + "grad_norm": 0.3389525104559924, + "learning_rate": 1.3200096430161096e-05, + "loss": 2.6398, + "step": 50707 + }, + { + "epoch": 2.360825942221291, + "grad_norm": 0.34887600233639104, + "learning_rate": 1.3198262713852783e-05, + "loss": 2.6589, + "step": 50708 + }, + { + "epoch": 2.360872500407384, + "grad_norm": 0.366892541270342, + "learning_rate": 1.3196429105553958e-05, + "loss": 2.4981, + "step": 50709 + }, + { + "epoch": 2.360919058593477, + "grad_norm": 0.3477285057938658, + "learning_rate": 1.3194595605269976e-05, + "loss": 2.6125, + "step": 50710 + }, + { + "epoch": 2.3609656167795703, + "grad_norm": 0.3511505163335132, + "learning_rate": 1.3192762213006233e-05, + "loss": 2.697, + "step": 50711 + }, + { + "epoch": 2.3610121749656634, + "grad_norm": 0.32986649971769855, + "learning_rate": 1.3190928928768099e-05, + "loss": 2.51, + "step": 50712 + }, + { + "epoch": 2.3610587331517565, + "grad_norm": 0.3583296997197604, + "learning_rate": 1.3189095752560966e-05, + "loss": 2.6716, + "step": 50713 + }, + { + "epoch": 2.3611052913378496, + "grad_norm": 0.3448757846758267, + "learning_rate": 1.3187262684390212e-05, + "loss": 2.6362, + "step": 50714 + }, + { + "epoch": 2.3611518495239423, + "grad_norm": 0.3439187674122942, + "learning_rate": 1.3185429724261227e-05, + "loss": 2.7035, + "step": 50715 + }, + { + "epoch": 2.3611984077100354, + "grad_norm": 0.33538030188020823, + "learning_rate": 1.3183596872179349e-05, + "loss": 2.6312, + "step": 50716 + }, + { + "epoch": 2.3612449658961285, + 
"grad_norm": 0.3152902961717145, + "learning_rate": 1.318176412815002e-05, + "loss": 2.544, + "step": 50717 + }, + { + "epoch": 2.3612915240822216, + "grad_norm": 0.33253379120874893, + "learning_rate": 1.317993149217857e-05, + "loss": 2.5794, + "step": 50718 + }, + { + "epoch": 2.3613380822683148, + "grad_norm": 0.35479569191412547, + "learning_rate": 1.3178098964270396e-05, + "loss": 2.6505, + "step": 50719 + }, + { + "epoch": 2.361384640454408, + "grad_norm": 0.33171968176166533, + "learning_rate": 1.317626654443087e-05, + "loss": 2.6356, + "step": 50720 + }, + { + "epoch": 2.361431198640501, + "grad_norm": 0.3342109226908899, + "learning_rate": 1.3174434232665378e-05, + "loss": 2.6271, + "step": 50721 + }, + { + "epoch": 2.361477756826594, + "grad_norm": 0.3327313208413612, + "learning_rate": 1.3172602028979303e-05, + "loss": 2.7297, + "step": 50722 + }, + { + "epoch": 2.3615243150126872, + "grad_norm": 0.35146143703217203, + "learning_rate": 1.3170769933377986e-05, + "loss": 2.6451, + "step": 50723 + }, + { + "epoch": 2.3615708731987803, + "grad_norm": 0.3427404454193967, + "learning_rate": 1.316893794586686e-05, + "loss": 2.6148, + "step": 50724 + }, + { + "epoch": 2.3616174313848735, + "grad_norm": 0.30384656103799973, + "learning_rate": 1.3167106066451262e-05, + "loss": 2.5692, + "step": 50725 + }, + { + "epoch": 2.361663989570966, + "grad_norm": 0.33664434088904843, + "learning_rate": 1.3165274295136575e-05, + "loss": 2.6016, + "step": 50726 + }, + { + "epoch": 2.3617105477570592, + "grad_norm": 0.324447078098104, + "learning_rate": 1.316344263192818e-05, + "loss": 2.7469, + "step": 50727 + }, + { + "epoch": 2.3617571059431524, + "grad_norm": 0.3205406169226412, + "learning_rate": 1.3161611076831465e-05, + "loss": 2.5824, + "step": 50728 + }, + { + "epoch": 2.3618036641292455, + "grad_norm": 0.32909706484847673, + "learning_rate": 1.3159779629851765e-05, + "loss": 2.6096, + "step": 50729 + }, + { + "epoch": 2.3618502223153386, + "grad_norm": 0.3141804596339409, + "learning_rate": 1.315794829099451e-05, + "loss": 2.5985, + "step": 50730 + }, + { + "epoch": 2.3618967805014317, + "grad_norm": 0.31575905122034803, + "learning_rate": 1.3156117060265033e-05, + "loss": 2.6446, + "step": 50731 + }, + { + "epoch": 2.361943338687525, + "grad_norm": 0.324764483348375, + "learning_rate": 1.3154285937668726e-05, + "loss": 2.7036, + "step": 50732 + }, + { + "epoch": 2.361989896873618, + "grad_norm": 0.31782985887321846, + "learning_rate": 1.3152454923210954e-05, + "loss": 2.6811, + "step": 50733 + }, + { + "epoch": 2.362036455059711, + "grad_norm": 0.3190582782805924, + "learning_rate": 1.3150624016897101e-05, + "loss": 2.661, + "step": 50734 + }, + { + "epoch": 2.3620830132458037, + "grad_norm": 0.3068372711466648, + "learning_rate": 1.3148793218732553e-05, + "loss": 2.4998, + "step": 50735 + }, + { + "epoch": 2.362129571431897, + "grad_norm": 0.3453808827071877, + "learning_rate": 1.3146962528722634e-05, + "loss": 2.5688, + "step": 50736 + }, + { + "epoch": 2.36217612961799, + "grad_norm": 0.3300512319423787, + "learning_rate": 1.314513194687278e-05, + "loss": 2.479, + "step": 50737 + }, + { + "epoch": 2.362222687804083, + "grad_norm": 0.3143011011395027, + "learning_rate": 1.3143301473188313e-05, + "loss": 2.6003, + "step": 50738 + }, + { + "epoch": 2.362269245990176, + "grad_norm": 0.31771838240548445, + "learning_rate": 1.314147110767463e-05, + "loss": 2.617, + "step": 50739 + }, + { + "epoch": 2.3623158041762693, + "grad_norm": 0.3305797745924947, + "learning_rate": 
1.3139640850337099e-05, + "loss": 2.7469, + "step": 50740 + }, + { + "epoch": 2.3623623623623624, + "grad_norm": 0.3345379680339203, + "learning_rate": 1.3137810701181086e-05, + "loss": 2.6505, + "step": 50741 + }, + { + "epoch": 2.3624089205484555, + "grad_norm": 0.34917995047623124, + "learning_rate": 1.3135980660211972e-05, + "loss": 2.6259, + "step": 50742 + }, + { + "epoch": 2.3624554787345486, + "grad_norm": 0.33493196123714275, + "learning_rate": 1.3134150727435134e-05, + "loss": 2.6229, + "step": 50743 + }, + { + "epoch": 2.3625020369206418, + "grad_norm": 0.31347876610044856, + "learning_rate": 1.3132320902855921e-05, + "loss": 2.6861, + "step": 50744 + }, + { + "epoch": 2.362548595106735, + "grad_norm": 0.32391820163201807, + "learning_rate": 1.313049118647971e-05, + "loss": 2.5625, + "step": 50745 + }, + { + "epoch": 2.3625951532928275, + "grad_norm": 0.30975557942849624, + "learning_rate": 1.3128661578311879e-05, + "loss": 2.6886, + "step": 50746 + }, + { + "epoch": 2.3626417114789207, + "grad_norm": 0.3190252917201378, + "learning_rate": 1.3126832078357792e-05, + "loss": 2.554, + "step": 50747 + }, + { + "epoch": 2.3626882696650138, + "grad_norm": 0.3358451292472502, + "learning_rate": 1.3125002686622833e-05, + "loss": 2.6237, + "step": 50748 + }, + { + "epoch": 2.362734827851107, + "grad_norm": 0.3224267530738759, + "learning_rate": 1.3123173403112332e-05, + "loss": 2.6227, + "step": 50749 + }, + { + "epoch": 2.3627813860372, + "grad_norm": 0.34310069712951236, + "learning_rate": 1.3121344227831711e-05, + "loss": 2.6691, + "step": 50750 + }, + { + "epoch": 2.362827944223293, + "grad_norm": 0.3409380545701376, + "learning_rate": 1.31195151607863e-05, + "loss": 2.6359, + "step": 50751 + }, + { + "epoch": 2.3628745024093862, + "grad_norm": 0.3441028572228089, + "learning_rate": 1.3117686201981477e-05, + "loss": 2.7545, + "step": 50752 + }, + { + "epoch": 2.3629210605954794, + "grad_norm": 0.3530011938380414, + "learning_rate": 1.3115857351422611e-05, + "loss": 2.613, + "step": 50753 + }, + { + "epoch": 2.362967618781572, + "grad_norm": 0.32072818308701645, + "learning_rate": 1.311402860911507e-05, + "loss": 2.6885, + "step": 50754 + }, + { + "epoch": 2.363014176967665, + "grad_norm": 0.31402576393006043, + "learning_rate": 1.311219997506422e-05, + "loss": 2.6205, + "step": 50755 + }, + { + "epoch": 2.3630607351537583, + "grad_norm": 0.33737414405322663, + "learning_rate": 1.3110371449275443e-05, + "loss": 2.6144, + "step": 50756 + }, + { + "epoch": 2.3631072933398514, + "grad_norm": 0.33409704346234914, + "learning_rate": 1.3108543031754084e-05, + "loss": 2.6321, + "step": 50757 + }, + { + "epoch": 2.3631538515259445, + "grad_norm": 0.3333595978804233, + "learning_rate": 1.3106714722505508e-05, + "loss": 2.7461, + "step": 50758 + }, + { + "epoch": 2.3632004097120376, + "grad_norm": 0.3473923865808235, + "learning_rate": 1.3104886521535093e-05, + "loss": 2.7697, + "step": 50759 + }, + { + "epoch": 2.3632469678981307, + "grad_norm": 0.3288814108105175, + "learning_rate": 1.3103058428848202e-05, + "loss": 2.7002, + "step": 50760 + }, + { + "epoch": 2.363293526084224, + "grad_norm": 0.33725465192663817, + "learning_rate": 1.3101230444450213e-05, + "loss": 2.5699, + "step": 50761 + }, + { + "epoch": 2.363340084270317, + "grad_norm": 0.3112820252921711, + "learning_rate": 1.3099402568346452e-05, + "loss": 2.5812, + "step": 50762 + }, + { + "epoch": 2.36338664245641, + "grad_norm": 0.31134336824809145, + "learning_rate": 1.3097574800542333e-05, + "loss": 2.6183, + "step": 50763 + }, + 
{ + "epoch": 2.363433200642503, + "grad_norm": 0.31761634625616364, + "learning_rate": 1.3095747141043186e-05, + "loss": 2.5799, + "step": 50764 + }, + { + "epoch": 2.363479758828596, + "grad_norm": 0.3342495733934297, + "learning_rate": 1.3093919589854387e-05, + "loss": 2.7836, + "step": 50765 + }, + { + "epoch": 2.363526317014689, + "grad_norm": 0.3216442980572962, + "learning_rate": 1.3092092146981293e-05, + "loss": 2.758, + "step": 50766 + }, + { + "epoch": 2.363572875200782, + "grad_norm": 0.305913597698231, + "learning_rate": 1.3090264812429275e-05, + "loss": 2.6948, + "step": 50767 + }, + { + "epoch": 2.363619433386875, + "grad_norm": 0.31449809783031824, + "learning_rate": 1.3088437586203694e-05, + "loss": 2.7108, + "step": 50768 + }, + { + "epoch": 2.3636659915729683, + "grad_norm": 0.32757071711632574, + "learning_rate": 1.3086610468309924e-05, + "loss": 2.6769, + "step": 50769 + }, + { + "epoch": 2.3637125497590614, + "grad_norm": 0.31358535645831576, + "learning_rate": 1.3084783458753302e-05, + "loss": 2.5836, + "step": 50770 + }, + { + "epoch": 2.3637591079451545, + "grad_norm": 0.3177485297196828, + "learning_rate": 1.3082956557539205e-05, + "loss": 2.6159, + "step": 50771 + }, + { + "epoch": 2.3638056661312477, + "grad_norm": 0.31346402192300993, + "learning_rate": 1.3081129764672995e-05, + "loss": 2.6713, + "step": 50772 + }, + { + "epoch": 2.3638522243173408, + "grad_norm": 0.3279763228621105, + "learning_rate": 1.3079303080160032e-05, + "loss": 2.536, + "step": 50773 + }, + { + "epoch": 2.3638987825034334, + "grad_norm": 0.3140832928525712, + "learning_rate": 1.3077476504005687e-05, + "loss": 2.5996, + "step": 50774 + }, + { + "epoch": 2.3639453406895266, + "grad_norm": 0.30437752647353045, + "learning_rate": 1.3075650036215287e-05, + "loss": 2.693, + "step": 50775 + }, + { + "epoch": 2.3639918988756197, + "grad_norm": 0.31144146843399384, + "learning_rate": 1.3073823676794244e-05, + "loss": 2.556, + "step": 50776 + }, + { + "epoch": 2.364038457061713, + "grad_norm": 0.32744018487390547, + "learning_rate": 1.3071997425747861e-05, + "loss": 2.6859, + "step": 50777 + }, + { + "epoch": 2.364085015247806, + "grad_norm": 0.303446081927209, + "learning_rate": 1.3070171283081557e-05, + "loss": 2.6595, + "step": 50778 + }, + { + "epoch": 2.364131573433899, + "grad_norm": 0.3493207493180483, + "learning_rate": 1.3068345248800646e-05, + "loss": 2.7717, + "step": 50779 + }, + { + "epoch": 2.364178131619992, + "grad_norm": 0.3007381870684888, + "learning_rate": 1.3066519322910503e-05, + "loss": 2.5756, + "step": 50780 + }, + { + "epoch": 2.3642246898060852, + "grad_norm": 0.32586248191264466, + "learning_rate": 1.3064693505416492e-05, + "loss": 2.6392, + "step": 50781 + }, + { + "epoch": 2.3642712479921784, + "grad_norm": 0.34067101561674157, + "learning_rate": 1.306286779632398e-05, + "loss": 2.6957, + "step": 50782 + }, + { + "epoch": 2.3643178061782715, + "grad_norm": 0.3233966978359817, + "learning_rate": 1.3061042195638296e-05, + "loss": 2.619, + "step": 50783 + }, + { + "epoch": 2.3643643643643646, + "grad_norm": 0.3200810229624977, + "learning_rate": 1.3059216703364813e-05, + "loss": 2.5289, + "step": 50784 + }, + { + "epoch": 2.3644109225504573, + "grad_norm": 0.3459456296768499, + "learning_rate": 1.3057391319508888e-05, + "loss": 2.6853, + "step": 50785 + }, + { + "epoch": 2.3644574807365504, + "grad_norm": 0.32550189171752414, + "learning_rate": 1.3055566044075884e-05, + "loss": 2.6194, + "step": 50786 + }, + { + "epoch": 2.3645040389226435, + "grad_norm": 
0.32413851554993556, + "learning_rate": 1.305374087707117e-05, + "loss": 2.5406, + "step": 50787 + }, + { + "epoch": 2.3645505971087366, + "grad_norm": 0.3097316610725996, + "learning_rate": 1.3051915818500055e-05, + "loss": 2.6024, + "step": 50788 + }, + { + "epoch": 2.3645971552948297, + "grad_norm": 0.33162835275592223, + "learning_rate": 1.3050090868367954e-05, + "loss": 2.6261, + "step": 50789 + }, + { + "epoch": 2.364643713480923, + "grad_norm": 0.3313680865166175, + "learning_rate": 1.3048266026680168e-05, + "loss": 2.6785, + "step": 50790 + }, + { + "epoch": 2.364690271667016, + "grad_norm": 0.31656554711943957, + "learning_rate": 1.3046441293442113e-05, + "loss": 2.6698, + "step": 50791 + }, + { + "epoch": 2.364736829853109, + "grad_norm": 0.32659645230174666, + "learning_rate": 1.3044616668659094e-05, + "loss": 2.6195, + "step": 50792 + }, + { + "epoch": 2.364783388039202, + "grad_norm": 0.32181307013142185, + "learning_rate": 1.3042792152336487e-05, + "loss": 2.59, + "step": 50793 + }, + { + "epoch": 2.364829946225295, + "grad_norm": 0.31577107210721367, + "learning_rate": 1.3040967744479643e-05, + "loss": 2.7178, + "step": 50794 + }, + { + "epoch": 2.364876504411388, + "grad_norm": 0.30435652874427155, + "learning_rate": 1.3039143445093921e-05, + "loss": 2.5832, + "step": 50795 + }, + { + "epoch": 2.364923062597481, + "grad_norm": 0.33608335202393413, + "learning_rate": 1.3037319254184682e-05, + "loss": 2.6555, + "step": 50796 + }, + { + "epoch": 2.364969620783574, + "grad_norm": 0.30716389193042315, + "learning_rate": 1.3035495171757255e-05, + "loss": 2.5536, + "step": 50797 + }, + { + "epoch": 2.3650161789696673, + "grad_norm": 0.3218258856961499, + "learning_rate": 1.303367119781701e-05, + "loss": 2.6912, + "step": 50798 + }, + { + "epoch": 2.3650627371557604, + "grad_norm": 0.31709514142814177, + "learning_rate": 1.3031847332369296e-05, + "loss": 2.6667, + "step": 50799 + }, + { + "epoch": 2.3651092953418535, + "grad_norm": 0.3068646024448955, + "learning_rate": 1.3030023575419487e-05, + "loss": 2.6745, + "step": 50800 + }, + { + "epoch": 2.3651558535279467, + "grad_norm": 0.3230105621848924, + "learning_rate": 1.3028199926972883e-05, + "loss": 2.6011, + "step": 50801 + }, + { + "epoch": 2.3652024117140398, + "grad_norm": 0.3278999139599276, + "learning_rate": 1.30263763870349e-05, + "loss": 2.62, + "step": 50802 + }, + { + "epoch": 2.365248969900133, + "grad_norm": 0.3348057108799848, + "learning_rate": 1.302455295561083e-05, + "loss": 2.5754, + "step": 50803 + }, + { + "epoch": 2.365295528086226, + "grad_norm": 0.3005039726028371, + "learning_rate": 1.3022729632706087e-05, + "loss": 2.5791, + "step": 50804 + }, + { + "epoch": 2.3653420862723187, + "grad_norm": 0.31690364660030534, + "learning_rate": 1.3020906418325973e-05, + "loss": 2.5821, + "step": 50805 + }, + { + "epoch": 2.365388644458412, + "grad_norm": 0.3205696998995142, + "learning_rate": 1.3019083312475856e-05, + "loss": 2.5423, + "step": 50806 + }, + { + "epoch": 2.365435202644505, + "grad_norm": 0.3142198183399386, + "learning_rate": 1.3017260315161084e-05, + "loss": 2.6122, + "step": 50807 + }, + { + "epoch": 2.365481760830598, + "grad_norm": 0.3359345572442606, + "learning_rate": 1.3015437426387011e-05, + "loss": 2.7027, + "step": 50808 + }, + { + "epoch": 2.365528319016691, + "grad_norm": 0.30851663356561304, + "learning_rate": 1.3013614646159006e-05, + "loss": 2.7181, + "step": 50809 + }, + { + "epoch": 2.3655748772027843, + "grad_norm": 0.33217180837148186, + "learning_rate": 1.3011791974482373e-05, + 
"loss": 2.6406, + "step": 50810 + }, + { + "epoch": 2.3656214353888774, + "grad_norm": 0.309886083414623, + "learning_rate": 1.3009969411362494e-05, + "loss": 2.7202, + "step": 50811 + }, + { + "epoch": 2.3656679935749705, + "grad_norm": 0.31793123866601924, + "learning_rate": 1.3008146956804706e-05, + "loss": 2.5916, + "step": 50812 + }, + { + "epoch": 2.365714551761063, + "grad_norm": 0.3368616474311465, + "learning_rate": 1.300632461081437e-05, + "loss": 2.7005, + "step": 50813 + }, + { + "epoch": 2.3657611099471563, + "grad_norm": 0.32356235806043526, + "learning_rate": 1.300450237339682e-05, + "loss": 2.7499, + "step": 50814 + }, + { + "epoch": 2.3658076681332494, + "grad_norm": 0.34285331665473334, + "learning_rate": 1.3002680244557425e-05, + "loss": 2.6673, + "step": 50815 + }, + { + "epoch": 2.3658542263193425, + "grad_norm": 0.31572752214087435, + "learning_rate": 1.3000858224301493e-05, + "loss": 2.7006, + "step": 50816 + }, + { + "epoch": 2.3659007845054356, + "grad_norm": 0.3221161796123274, + "learning_rate": 1.2999036312634427e-05, + "loss": 2.5763, + "step": 50817 + }, + { + "epoch": 2.3659473426915287, + "grad_norm": 0.31430978925672104, + "learning_rate": 1.2997214509561528e-05, + "loss": 2.7071, + "step": 50818 + }, + { + "epoch": 2.365993900877622, + "grad_norm": 0.3130114062668492, + "learning_rate": 1.299539281508816e-05, + "loss": 2.6183, + "step": 50819 + }, + { + "epoch": 2.366040459063715, + "grad_norm": 0.32720271335482143, + "learning_rate": 1.2993571229219665e-05, + "loss": 2.629, + "step": 50820 + }, + { + "epoch": 2.366087017249808, + "grad_norm": 0.3190180168029783, + "learning_rate": 1.299174975196139e-05, + "loss": 2.7005, + "step": 50821 + }, + { + "epoch": 2.366133575435901, + "grad_norm": 0.353990368771764, + "learning_rate": 1.2989928383318706e-05, + "loss": 2.6673, + "step": 50822 + }, + { + "epoch": 2.3661801336219943, + "grad_norm": 0.32077423415514894, + "learning_rate": 1.2988107123296917e-05, + "loss": 2.5706, + "step": 50823 + }, + { + "epoch": 2.366226691808087, + "grad_norm": 0.3052710463559913, + "learning_rate": 1.2986285971901391e-05, + "loss": 2.6392, + "step": 50824 + }, + { + "epoch": 2.36627324999418, + "grad_norm": 0.34711382570823623, + "learning_rate": 1.298446492913747e-05, + "loss": 2.7194, + "step": 50825 + }, + { + "epoch": 2.366319808180273, + "grad_norm": 0.3563473982146459, + "learning_rate": 1.298264399501049e-05, + "loss": 2.6354, + "step": 50826 + }, + { + "epoch": 2.3663663663663663, + "grad_norm": 0.34516921021276087, + "learning_rate": 1.2980823169525808e-05, + "loss": 2.6067, + "step": 50827 + }, + { + "epoch": 2.3664129245524594, + "grad_norm": 0.3255665653899234, + "learning_rate": 1.2979002452688777e-05, + "loss": 2.4939, + "step": 50828 + }, + { + "epoch": 2.3664594827385526, + "grad_norm": 0.33254260642551414, + "learning_rate": 1.29771818445047e-05, + "loss": 2.6337, + "step": 50829 + }, + { + "epoch": 2.3665060409246457, + "grad_norm": 0.3217339604470249, + "learning_rate": 1.2975361344978975e-05, + "loss": 2.6953, + "step": 50830 + }, + { + "epoch": 2.366552599110739, + "grad_norm": 0.3219339008653792, + "learning_rate": 1.2973540954116887e-05, + "loss": 2.6694, + "step": 50831 + }, + { + "epoch": 2.366599157296832, + "grad_norm": 0.32646165458228416, + "learning_rate": 1.2971720671923837e-05, + "loss": 2.4921, + "step": 50832 + }, + { + "epoch": 2.3666457154829246, + "grad_norm": 0.31128129685716127, + "learning_rate": 1.2969900498405124e-05, + "loss": 2.6988, + "step": 50833 + }, + { + "epoch": 
2.3666922736690177, + "grad_norm": 0.3116545465419091, + "learning_rate": 1.2968080433566105e-05, + "loss": 2.5794, + "step": 50834 + }, + { + "epoch": 2.366738831855111, + "grad_norm": 0.3412343044252236, + "learning_rate": 1.296626047741214e-05, + "loss": 2.6308, + "step": 50835 + }, + { + "epoch": 2.366785390041204, + "grad_norm": 0.34031127165866026, + "learning_rate": 1.2964440629948532e-05, + "loss": 2.7106, + "step": 50836 + }, + { + "epoch": 2.366831948227297, + "grad_norm": 0.3521281284051118, + "learning_rate": 1.2962620891180643e-05, + "loss": 2.6974, + "step": 50837 + }, + { + "epoch": 2.36687850641339, + "grad_norm": 0.31141165958471423, + "learning_rate": 1.2960801261113814e-05, + "loss": 2.6454, + "step": 50838 + }, + { + "epoch": 2.3669250645994833, + "grad_norm": 0.3118402725530831, + "learning_rate": 1.2958981739753389e-05, + "loss": 2.6094, + "step": 50839 + }, + { + "epoch": 2.3669716227855764, + "grad_norm": 0.3611332952353601, + "learning_rate": 1.29571623271047e-05, + "loss": 2.6757, + "step": 50840 + }, + { + "epoch": 2.3670181809716695, + "grad_norm": 0.3252468396943582, + "learning_rate": 1.29553430231731e-05, + "loss": 2.5507, + "step": 50841 + }, + { + "epoch": 2.3670647391577626, + "grad_norm": 0.31623006398671716, + "learning_rate": 1.2953523827963898e-05, + "loss": 2.615, + "step": 50842 + }, + { + "epoch": 2.3671112973438557, + "grad_norm": 0.31579038573113155, + "learning_rate": 1.2951704741482479e-05, + "loss": 2.5945, + "step": 50843 + }, + { + "epoch": 2.3671578555299484, + "grad_norm": 0.3448668403887783, + "learning_rate": 1.2949885763734126e-05, + "loss": 2.6804, + "step": 50844 + }, + { + "epoch": 2.3672044137160415, + "grad_norm": 0.334345822941312, + "learning_rate": 1.2948066894724237e-05, + "loss": 2.7342, + "step": 50845 + }, + { + "epoch": 2.3672509719021346, + "grad_norm": 0.31512785707509094, + "learning_rate": 1.2946248134458111e-05, + "loss": 2.5936, + "step": 50846 + }, + { + "epoch": 2.3672975300882277, + "grad_norm": 0.31387788266057787, + "learning_rate": 1.2944429482941095e-05, + "loss": 2.5765, + "step": 50847 + }, + { + "epoch": 2.367344088274321, + "grad_norm": 0.33213942795553697, + "learning_rate": 1.294261094017854e-05, + "loss": 2.6847, + "step": 50848 + }, + { + "epoch": 2.367390646460414, + "grad_norm": 0.3420143778891804, + "learning_rate": 1.2940792506175742e-05, + "loss": 2.6178, + "step": 50849 + }, + { + "epoch": 2.367437204646507, + "grad_norm": 0.3022476631126476, + "learning_rate": 1.2938974180938096e-05, + "loss": 2.6097, + "step": 50850 + }, + { + "epoch": 2.3674837628326, + "grad_norm": 0.3035801171226063, + "learning_rate": 1.2937155964470898e-05, + "loss": 2.553, + "step": 50851 + }, + { + "epoch": 2.367530321018693, + "grad_norm": 0.30187992794341495, + "learning_rate": 1.2935337856779495e-05, + "loss": 2.6106, + "step": 50852 + }, + { + "epoch": 2.367576879204786, + "grad_norm": 0.3147785783599786, + "learning_rate": 1.2933519857869225e-05, + "loss": 2.5867, + "step": 50853 + }, + { + "epoch": 2.367623437390879, + "grad_norm": 0.3211694903558353, + "learning_rate": 1.2931701967745435e-05, + "loss": 2.6881, + "step": 50854 + }, + { + "epoch": 2.367669995576972, + "grad_norm": 0.3204663843534844, + "learning_rate": 1.2929884186413421e-05, + "loss": 2.6833, + "step": 50855 + }, + { + "epoch": 2.3677165537630653, + "grad_norm": 0.31612272801002866, + "learning_rate": 1.2928066513878573e-05, + "loss": 2.5831, + "step": 50856 + }, + { + "epoch": 2.3677631119491584, + "grad_norm": 0.3126585694733075, + "learning_rate": 
1.2926248950146164e-05, + "loss": 2.6066, + "step": 50857 + }, + { + "epoch": 2.3678096701352516, + "grad_norm": 0.3083468916957355, + "learning_rate": 1.2924431495221595e-05, + "loss": 2.6162, + "step": 50858 + }, + { + "epoch": 2.3678562283213447, + "grad_norm": 0.3355990680356161, + "learning_rate": 1.292261414911015e-05, + "loss": 2.5912, + "step": 50859 + }, + { + "epoch": 2.367902786507438, + "grad_norm": 0.3038092345769589, + "learning_rate": 1.2920796911817184e-05, + "loss": 2.6579, + "step": 50860 + }, + { + "epoch": 2.367949344693531, + "grad_norm": 0.3188645365177194, + "learning_rate": 1.291897978334804e-05, + "loss": 2.673, + "step": 50861 + }, + { + "epoch": 2.367995902879624, + "grad_norm": 0.30968194175083463, + "learning_rate": 1.2917162763708007e-05, + "loss": 2.5305, + "step": 50862 + }, + { + "epoch": 2.3680424610657167, + "grad_norm": 0.3320757126850319, + "learning_rate": 1.2915345852902477e-05, + "loss": 2.6512, + "step": 50863 + }, + { + "epoch": 2.36808901925181, + "grad_norm": 0.30051527956787494, + "learning_rate": 1.291352905093674e-05, + "loss": 2.6603, + "step": 50864 + }, + { + "epoch": 2.368135577437903, + "grad_norm": 0.32849064661084065, + "learning_rate": 1.2911712357816141e-05, + "loss": 2.715, + "step": 50865 + }, + { + "epoch": 2.368182135623996, + "grad_norm": 0.32149264235465314, + "learning_rate": 1.290989577354601e-05, + "loss": 2.6548, + "step": 50866 + }, + { + "epoch": 2.368228693810089, + "grad_norm": 0.3302063490573895, + "learning_rate": 1.2908079298131686e-05, + "loss": 2.6525, + "step": 50867 + }, + { + "epoch": 2.3682752519961823, + "grad_norm": 0.3137869948553694, + "learning_rate": 1.2906262931578494e-05, + "loss": 2.6313, + "step": 50868 + }, + { + "epoch": 2.3683218101822754, + "grad_norm": 0.3129012862093886, + "learning_rate": 1.290444667389178e-05, + "loss": 2.6734, + "step": 50869 + }, + { + "epoch": 2.3683683683683685, + "grad_norm": 0.31282913891187664, + "learning_rate": 1.2902630525076825e-05, + "loss": 2.5733, + "step": 50870 + }, + { + "epoch": 2.3684149265544616, + "grad_norm": 0.32972373390125476, + "learning_rate": 1.2900814485139029e-05, + "loss": 2.5887, + "step": 50871 + }, + { + "epoch": 2.3684614847405543, + "grad_norm": 0.32675804490636906, + "learning_rate": 1.2898998554083674e-05, + "loss": 2.6586, + "step": 50872 + }, + { + "epoch": 2.3685080429266474, + "grad_norm": 0.3316313669620992, + "learning_rate": 1.2897182731916103e-05, + "loss": 2.5958, + "step": 50873 + }, + { + "epoch": 2.3685546011127405, + "grad_norm": 0.3255449171529396, + "learning_rate": 1.2895367018641657e-05, + "loss": 2.7091, + "step": 50874 + }, + { + "epoch": 2.3686011592988336, + "grad_norm": 0.3267320799554261, + "learning_rate": 1.2893551414265626e-05, + "loss": 2.6912, + "step": 50875 + }, + { + "epoch": 2.3686477174849268, + "grad_norm": 0.3357952059374992, + "learning_rate": 1.2891735918793397e-05, + "loss": 2.6949, + "step": 50876 + }, + { + "epoch": 2.36869427567102, + "grad_norm": 0.3342294472124521, + "learning_rate": 1.2889920532230254e-05, + "loss": 2.5913, + "step": 50877 + }, + { + "epoch": 2.368740833857113, + "grad_norm": 0.3208220950844166, + "learning_rate": 1.2888105254581534e-05, + "loss": 2.6219, + "step": 50878 + }, + { + "epoch": 2.368787392043206, + "grad_norm": 0.3207977648989736, + "learning_rate": 1.2886290085852571e-05, + "loss": 2.5892, + "step": 50879 + }, + { + "epoch": 2.368833950229299, + "grad_norm": 0.3610585728984757, + "learning_rate": 1.2884475026048687e-05, + "loss": 2.6104, + "step": 50880 + }, + { + 
"epoch": 2.3688805084153923, + "grad_norm": 0.34613112426037534, + "learning_rate": 1.2882660075175218e-05, + "loss": 2.6819, + "step": 50881 + }, + { + "epoch": 2.3689270666014854, + "grad_norm": 0.32507826518821265, + "learning_rate": 1.2880845233237498e-05, + "loss": 2.5867, + "step": 50882 + }, + { + "epoch": 2.368973624787578, + "grad_norm": 0.3110254812808718, + "learning_rate": 1.2879030500240808e-05, + "loss": 2.6294, + "step": 50883 + }, + { + "epoch": 2.3690201829736712, + "grad_norm": 0.3397183854741537, + "learning_rate": 1.2877215876190536e-05, + "loss": 2.6099, + "step": 50884 + }, + { + "epoch": 2.3690667411597643, + "grad_norm": 0.33520310234448086, + "learning_rate": 1.2875401361091965e-05, + "loss": 2.5867, + "step": 50885 + }, + { + "epoch": 2.3691132993458575, + "grad_norm": 0.31119140651251465, + "learning_rate": 1.2873586954950433e-05, + "loss": 2.5497, + "step": 50886 + }, + { + "epoch": 2.3691598575319506, + "grad_norm": 0.327964192410714, + "learning_rate": 1.2871772657771287e-05, + "loss": 2.591, + "step": 50887 + }, + { + "epoch": 2.3692064157180437, + "grad_norm": 0.33082794571018637, + "learning_rate": 1.286995846955979e-05, + "loss": 2.5631, + "step": 50888 + }, + { + "epoch": 2.369252973904137, + "grad_norm": 0.3217047801041667, + "learning_rate": 1.2868144390321346e-05, + "loss": 2.6413, + "step": 50889 + }, + { + "epoch": 2.36929953209023, + "grad_norm": 0.31205794401290904, + "learning_rate": 1.2866330420061217e-05, + "loss": 2.6277, + "step": 50890 + }, + { + "epoch": 2.3693460902763226, + "grad_norm": 0.3405101543945682, + "learning_rate": 1.2864516558784756e-05, + "loss": 2.6405, + "step": 50891 + }, + { + "epoch": 2.3693926484624157, + "grad_norm": 0.32899600629965653, + "learning_rate": 1.286270280649728e-05, + "loss": 2.7234, + "step": 50892 + }, + { + "epoch": 2.369439206648509, + "grad_norm": 0.3242334292880341, + "learning_rate": 1.286088916320411e-05, + "loss": 2.6535, + "step": 50893 + }, + { + "epoch": 2.369485764834602, + "grad_norm": 0.32818944644230397, + "learning_rate": 1.2859075628910573e-05, + "loss": 2.7168, + "step": 50894 + }, + { + "epoch": 2.369532323020695, + "grad_norm": 0.33385885542716326, + "learning_rate": 1.2857262203622005e-05, + "loss": 2.6079, + "step": 50895 + }, + { + "epoch": 2.369578881206788, + "grad_norm": 0.32110722404544295, + "learning_rate": 1.2855448887343679e-05, + "loss": 2.6524, + "step": 50896 + }, + { + "epoch": 2.3696254393928813, + "grad_norm": 0.31915514316749305, + "learning_rate": 1.2853635680080989e-05, + "loss": 2.6288, + "step": 50897 + }, + { + "epoch": 2.3696719975789744, + "grad_norm": 0.3484659995260326, + "learning_rate": 1.2851822581839196e-05, + "loss": 2.7178, + "step": 50898 + }, + { + "epoch": 2.3697185557650675, + "grad_norm": 0.3011987637737336, + "learning_rate": 1.2850009592623652e-05, + "loss": 2.5307, + "step": 50899 + }, + { + "epoch": 2.3697651139511606, + "grad_norm": 0.33534409515456015, + "learning_rate": 1.284819671243968e-05, + "loss": 2.5579, + "step": 50900 + }, + { + "epoch": 2.3698116721372537, + "grad_norm": 0.31419676903777427, + "learning_rate": 1.284638394129256e-05, + "loss": 2.5893, + "step": 50901 + }, + { + "epoch": 2.3698582303233464, + "grad_norm": 0.32068372945475543, + "learning_rate": 1.2844571279187673e-05, + "loss": 2.6665, + "step": 50902 + }, + { + "epoch": 2.3699047885094395, + "grad_norm": 0.31797258322310784, + "learning_rate": 1.2842758726130283e-05, + "loss": 2.705, + "step": 50903 + }, + { + "epoch": 2.3699513466955326, + "grad_norm": 
0.31954297847007745, + "learning_rate": 1.284094628212576e-05, + "loss": 2.6252, + "step": 50904 + }, + { + "epoch": 2.3699979048816258, + "grad_norm": 0.33347886970124624, + "learning_rate": 1.2839133947179388e-05, + "loss": 2.6255, + "step": 50905 + }, + { + "epoch": 2.370044463067719, + "grad_norm": 0.3378304149505155, + "learning_rate": 1.2837321721296497e-05, + "loss": 2.6954, + "step": 50906 + }, + { + "epoch": 2.370091021253812, + "grad_norm": 0.31778561728755605, + "learning_rate": 1.2835509604482404e-05, + "loss": 2.5845, + "step": 50907 + }, + { + "epoch": 2.370137579439905, + "grad_norm": 0.3351169450351717, + "learning_rate": 1.2833697596742444e-05, + "loss": 2.7858, + "step": 50908 + }, + { + "epoch": 2.370184137625998, + "grad_norm": 0.35603817235210544, + "learning_rate": 1.283188569808189e-05, + "loss": 2.7317, + "step": 50909 + }, + { + "epoch": 2.3702306958120913, + "grad_norm": 0.33901478313007993, + "learning_rate": 1.2830073908506118e-05, + "loss": 2.6916, + "step": 50910 + }, + { + "epoch": 2.370277253998184, + "grad_norm": 0.3439358081793951, + "learning_rate": 1.2828262228020405e-05, + "loss": 2.6108, + "step": 50911 + }, + { + "epoch": 2.370323812184277, + "grad_norm": 0.3167564798133098, + "learning_rate": 1.282645065663008e-05, + "loss": 2.6789, + "step": 50912 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 0.3280255292613292, + "learning_rate": 1.2824639194340477e-05, + "loss": 2.5664, + "step": 50913 + }, + { + "epoch": 2.3704169285564634, + "grad_norm": 0.32094020861452754, + "learning_rate": 1.2822827841156859e-05, + "loss": 2.5979, + "step": 50914 + }, + { + "epoch": 2.3704634867425565, + "grad_norm": 0.3411496651356226, + "learning_rate": 1.2821016597084617e-05, + "loss": 2.6487, + "step": 50915 + }, + { + "epoch": 2.3705100449286496, + "grad_norm": 0.32397214179797407, + "learning_rate": 1.2819205462128992e-05, + "loss": 2.7111, + "step": 50916 + }, + { + "epoch": 2.3705566031147427, + "grad_norm": 0.3504235330737392, + "learning_rate": 1.281739443629537e-05, + "loss": 2.6337, + "step": 50917 + }, + { + "epoch": 2.370603161300836, + "grad_norm": 0.3359357848437592, + "learning_rate": 1.2815583519589014e-05, + "loss": 2.6432, + "step": 50918 + }, + { + "epoch": 2.370649719486929, + "grad_norm": 0.34065970955970076, + "learning_rate": 1.281377271201526e-05, + "loss": 2.6067, + "step": 50919 + }, + { + "epoch": 2.370696277673022, + "grad_norm": 0.34704379077526776, + "learning_rate": 1.281196201357942e-05, + "loss": 2.6978, + "step": 50920 + }, + { + "epoch": 2.370742835859115, + "grad_norm": 0.32091775739214007, + "learning_rate": 1.2810151424286803e-05, + "loss": 2.5582, + "step": 50921 + }, + { + "epoch": 2.370789394045208, + "grad_norm": 0.31316697936822624, + "learning_rate": 1.2808340944142732e-05, + "loss": 2.684, + "step": 50922 + }, + { + "epoch": 2.370835952231301, + "grad_norm": 0.3220454196567019, + "learning_rate": 1.280653057315253e-05, + "loss": 2.7156, + "step": 50923 + }, + { + "epoch": 2.370882510417394, + "grad_norm": 0.3373015721317536, + "learning_rate": 1.2804720311321484e-05, + "loss": 2.6148, + "step": 50924 + }, + { + "epoch": 2.370929068603487, + "grad_norm": 0.34997557216529296, + "learning_rate": 1.2802910158654919e-05, + "loss": 2.6482, + "step": 50925 + }, + { + "epoch": 2.3709756267895803, + "grad_norm": 0.32554976050051465, + "learning_rate": 1.2801100115158143e-05, + "loss": 2.7169, + "step": 50926 + }, + { + "epoch": 2.3710221849756734, + "grad_norm": 0.29698759299474803, + "learning_rate": 1.2799290180836481e-05, + 
"loss": 2.5684, + "step": 50927 + }, + { + "epoch": 2.3710687431617665, + "grad_norm": 0.32060264024948965, + "learning_rate": 1.2797480355695246e-05, + "loss": 2.5126, + "step": 50928 + }, + { + "epoch": 2.3711153013478596, + "grad_norm": 0.317854220764461, + "learning_rate": 1.2795670639739716e-05, + "loss": 2.6014, + "step": 50929 + }, + { + "epoch": 2.3711618595339523, + "grad_norm": 0.3291201946247483, + "learning_rate": 1.2793861032975251e-05, + "loss": 2.5959, + "step": 50930 + }, + { + "epoch": 2.3712084177200454, + "grad_norm": 0.32541869243099614, + "learning_rate": 1.2792051535407129e-05, + "loss": 2.5912, + "step": 50931 + }, + { + "epoch": 2.3712549759061385, + "grad_norm": 0.33681295391689364, + "learning_rate": 1.279024214704067e-05, + "loss": 2.7391, + "step": 50932 + }, + { + "epoch": 2.3713015340922317, + "grad_norm": 0.3195490151102727, + "learning_rate": 1.2788432867881185e-05, + "loss": 2.5773, + "step": 50933 + }, + { + "epoch": 2.3713480922783248, + "grad_norm": 0.32421422520542875, + "learning_rate": 1.278662369793398e-05, + "loss": 2.7114, + "step": 50934 + }, + { + "epoch": 2.371394650464418, + "grad_norm": 0.31792243887437527, + "learning_rate": 1.2784814637204373e-05, + "loss": 2.7196, + "step": 50935 + }, + { + "epoch": 2.371441208650511, + "grad_norm": 0.3295073613367456, + "learning_rate": 1.278300568569768e-05, + "loss": 2.674, + "step": 50936 + }, + { + "epoch": 2.371487766836604, + "grad_norm": 0.3204251835579367, + "learning_rate": 1.2781196843419186e-05, + "loss": 2.6177, + "step": 50937 + }, + { + "epoch": 2.3715343250226972, + "grad_norm": 0.3108488026294764, + "learning_rate": 1.2779388110374208e-05, + "loss": 2.6269, + "step": 50938 + }, + { + "epoch": 2.3715808832087903, + "grad_norm": 0.3152750684423997, + "learning_rate": 1.2777579486568065e-05, + "loss": 2.5515, + "step": 50939 + }, + { + "epoch": 2.3716274413948835, + "grad_norm": 0.3327595120311281, + "learning_rate": 1.2775770972006057e-05, + "loss": 2.6515, + "step": 50940 + }, + { + "epoch": 2.371673999580976, + "grad_norm": 0.3423326695299088, + "learning_rate": 1.277396256669351e-05, + "loss": 2.6308, + "step": 50941 + }, + { + "epoch": 2.3717205577670692, + "grad_norm": 0.327985068648728, + "learning_rate": 1.2772154270635684e-05, + "loss": 2.6595, + "step": 50942 + }, + { + "epoch": 2.3717671159531624, + "grad_norm": 0.34242441851593336, + "learning_rate": 1.2770346083837947e-05, + "loss": 2.7374, + "step": 50943 + }, + { + "epoch": 2.3718136741392555, + "grad_norm": 0.3178054832283538, + "learning_rate": 1.2768538006305564e-05, + "loss": 2.5268, + "step": 50944 + }, + { + "epoch": 2.3718602323253486, + "grad_norm": 0.33144320043664843, + "learning_rate": 1.2766730038043851e-05, + "loss": 2.689, + "step": 50945 + }, + { + "epoch": 2.3719067905114417, + "grad_norm": 0.30533166964296365, + "learning_rate": 1.276492217905812e-05, + "loss": 2.617, + "step": 50946 + }, + { + "epoch": 2.371953348697535, + "grad_norm": 0.3393571496427388, + "learning_rate": 1.2763114429353667e-05, + "loss": 2.649, + "step": 50947 + }, + { + "epoch": 2.371999906883628, + "grad_norm": 0.3294945352320232, + "learning_rate": 1.2761306788935812e-05, + "loss": 2.6888, + "step": 50948 + }, + { + "epoch": 2.372046465069721, + "grad_norm": 0.3252657498533756, + "learning_rate": 1.275949925780987e-05, + "loss": 2.6114, + "step": 50949 + }, + { + "epoch": 2.3720930232558137, + "grad_norm": 0.3340786520654137, + "learning_rate": 1.2757691835981106e-05, + "loss": 2.636, + "step": 50950 + }, + { + "epoch": 2.372139581441907, 
+ "grad_norm": 0.32490712817741785, + "learning_rate": 1.2755884523454852e-05, + "loss": 2.5187, + "step": 50951 + }, + { + "epoch": 2.372186139628, + "grad_norm": 0.33998197583302187, + "learning_rate": 1.2754077320236407e-05, + "loss": 2.602, + "step": 50952 + }, + { + "epoch": 2.372232697814093, + "grad_norm": 0.3044771656201689, + "learning_rate": 1.275227022633108e-05, + "loss": 2.6405, + "step": 50953 + }, + { + "epoch": 2.372279256000186, + "grad_norm": 0.3236919199354943, + "learning_rate": 1.2750463241744182e-05, + "loss": 2.6513, + "step": 50954 + }, + { + "epoch": 2.3723258141862793, + "grad_norm": 0.32647273610150035, + "learning_rate": 1.2748656366480971e-05, + "loss": 2.6028, + "step": 50955 + }, + { + "epoch": 2.3723723723723724, + "grad_norm": 0.32283157921891387, + "learning_rate": 1.2746849600546817e-05, + "loss": 2.603, + "step": 50956 + }, + { + "epoch": 2.3724189305584655, + "grad_norm": 0.3093794912440534, + "learning_rate": 1.2745042943946961e-05, + "loss": 2.5497, + "step": 50957 + }, + { + "epoch": 2.3724654887445586, + "grad_norm": 0.31244769129075894, + "learning_rate": 1.2743236396686765e-05, + "loss": 2.5639, + "step": 50958 + }, + { + "epoch": 2.3725120469306518, + "grad_norm": 0.3314502026602238, + "learning_rate": 1.2741429958771478e-05, + "loss": 2.6402, + "step": 50959 + }, + { + "epoch": 2.372558605116745, + "grad_norm": 0.3168029748526323, + "learning_rate": 1.2739623630206432e-05, + "loss": 2.6606, + "step": 50960 + }, + { + "epoch": 2.3726051633028375, + "grad_norm": 0.3152140003670378, + "learning_rate": 1.273781741099691e-05, + "loss": 2.6975, + "step": 50961 + }, + { + "epoch": 2.3726517214889307, + "grad_norm": 0.30722474498573477, + "learning_rate": 1.2736011301148248e-05, + "loss": 2.5887, + "step": 50962 + }, + { + "epoch": 2.372698279675024, + "grad_norm": 0.33274670231702164, + "learning_rate": 1.2734205300665703e-05, + "loss": 2.6188, + "step": 50963 + }, + { + "epoch": 2.372744837861117, + "grad_norm": 0.3289188747510943, + "learning_rate": 1.273239940955459e-05, + "loss": 2.6475, + "step": 50964 + }, + { + "epoch": 2.37279139604721, + "grad_norm": 0.32361923136250187, + "learning_rate": 1.2730593627820215e-05, + "loss": 2.6191, + "step": 50965 + }, + { + "epoch": 2.372837954233303, + "grad_norm": 0.33016642328296636, + "learning_rate": 1.272878795546788e-05, + "loss": 2.6284, + "step": 50966 + }, + { + "epoch": 2.3728845124193962, + "grad_norm": 0.3157522480646472, + "learning_rate": 1.272698239250289e-05, + "loss": 2.5918, + "step": 50967 + }, + { + "epoch": 2.3729310706054894, + "grad_norm": 0.30947997520612447, + "learning_rate": 1.272517693893051e-05, + "loss": 2.6748, + "step": 50968 + }, + { + "epoch": 2.3729776287915825, + "grad_norm": 0.31012086147741597, + "learning_rate": 1.272337159475609e-05, + "loss": 2.6059, + "step": 50969 + }, + { + "epoch": 2.373024186977675, + "grad_norm": 0.30215637182435173, + "learning_rate": 1.2721566359984876e-05, + "loss": 2.5914, + "step": 50970 + }, + { + "epoch": 2.3730707451637683, + "grad_norm": 0.31533399627128406, + "learning_rate": 1.2719761234622219e-05, + "loss": 2.5911, + "step": 50971 + }, + { + "epoch": 2.3731173033498614, + "grad_norm": 0.3381360071687936, + "learning_rate": 1.2717956218673377e-05, + "loss": 2.6662, + "step": 50972 + }, + { + "epoch": 2.3731638615359545, + "grad_norm": 0.3123910167600213, + "learning_rate": 1.271615131214366e-05, + "loss": 2.6354, + "step": 50973 + }, + { + "epoch": 2.3732104197220476, + "grad_norm": 0.3465033232752359, + "learning_rate": 
1.2714346515038362e-05, + "loss": 2.559, + "step": 50974 + }, + { + "epoch": 2.3732569779081407, + "grad_norm": 0.3118080659232129, + "learning_rate": 1.2712541827362783e-05, + "loss": 2.6278, + "step": 50975 + }, + { + "epoch": 2.373303536094234, + "grad_norm": 0.29498704451117863, + "learning_rate": 1.271073724912224e-05, + "loss": 2.6032, + "step": 50976 + }, + { + "epoch": 2.373350094280327, + "grad_norm": 0.33438234119406807, + "learning_rate": 1.2708932780321996e-05, + "loss": 2.6641, + "step": 50977 + }, + { + "epoch": 2.37339665246642, + "grad_norm": 0.35634094236214103, + "learning_rate": 1.2707128420967357e-05, + "loss": 2.6871, + "step": 50978 + }, + { + "epoch": 2.373443210652513, + "grad_norm": 0.3450468116978619, + "learning_rate": 1.2705324171063626e-05, + "loss": 2.6194, + "step": 50979 + }, + { + "epoch": 2.3734897688386063, + "grad_norm": 0.33080649719961003, + "learning_rate": 1.2703520030616107e-05, + "loss": 2.7235, + "step": 50980 + }, + { + "epoch": 2.373536327024699, + "grad_norm": 0.3203584576572688, + "learning_rate": 1.2701715999630048e-05, + "loss": 2.5972, + "step": 50981 + }, + { + "epoch": 2.373582885210792, + "grad_norm": 0.33047010788016673, + "learning_rate": 1.2699912078110815e-05, + "loss": 2.5821, + "step": 50982 + }, + { + "epoch": 2.373629443396885, + "grad_norm": 0.33282808302911365, + "learning_rate": 1.269810826606363e-05, + "loss": 2.668, + "step": 50983 + }, + { + "epoch": 2.3736760015829783, + "grad_norm": 0.3349640394523094, + "learning_rate": 1.2696304563493855e-05, + "loss": 2.6662, + "step": 50984 + }, + { + "epoch": 2.3737225597690714, + "grad_norm": 0.3272699675308575, + "learning_rate": 1.2694500970406737e-05, + "loss": 2.6831, + "step": 50985 + }, + { + "epoch": 2.3737691179551645, + "grad_norm": 0.33528252485061827, + "learning_rate": 1.2692697486807582e-05, + "loss": 2.6397, + "step": 50986 + }, + { + "epoch": 2.3738156761412577, + "grad_norm": 0.3204563752627302, + "learning_rate": 1.2690894112701685e-05, + "loss": 2.6398, + "step": 50987 + }, + { + "epoch": 2.3738622343273508, + "grad_norm": 0.3022310573713122, + "learning_rate": 1.2689090848094337e-05, + "loss": 2.6055, + "step": 50988 + }, + { + "epoch": 2.3739087925134434, + "grad_norm": 0.318832227191146, + "learning_rate": 1.2687287692990852e-05, + "loss": 2.5881, + "step": 50989 + }, + { + "epoch": 2.3739553506995366, + "grad_norm": 0.3194962167642856, + "learning_rate": 1.2685484647396479e-05, + "loss": 2.5467, + "step": 50990 + }, + { + "epoch": 2.3740019088856297, + "grad_norm": 0.33830717183939696, + "learning_rate": 1.2683681711316536e-05, + "loss": 2.6525, + "step": 50991 + }, + { + "epoch": 2.374048467071723, + "grad_norm": 0.34166543891991746, + "learning_rate": 1.268187888475631e-05, + "loss": 2.808, + "step": 50992 + }, + { + "epoch": 2.374095025257816, + "grad_norm": 0.33032457057931336, + "learning_rate": 1.2680076167721094e-05, + "loss": 2.5755, + "step": 50993 + }, + { + "epoch": 2.374141583443909, + "grad_norm": 0.3287217870676233, + "learning_rate": 1.267827356021618e-05, + "loss": 2.6497, + "step": 50994 + }, + { + "epoch": 2.374188141630002, + "grad_norm": 0.3321559372304986, + "learning_rate": 1.267647106224687e-05, + "loss": 2.6823, + "step": 50995 + }, + { + "epoch": 2.3742346998160953, + "grad_norm": 0.320767877608409, + "learning_rate": 1.2674668673818408e-05, + "loss": 2.6453, + "step": 50996 + }, + { + "epoch": 2.3742812580021884, + "grad_norm": 0.3376834525427938, + "learning_rate": 1.2672866394936144e-05, + "loss": 2.6732, + "step": 50997 + }, + { + 
"epoch": 2.3743278161882815, + "grad_norm": 0.3457315774042743, + "learning_rate": 1.2671064225605328e-05, + "loss": 2.6334, + "step": 50998 + }, + { + "epoch": 2.3743743743743746, + "grad_norm": 0.3100983966301157, + "learning_rate": 1.2669262165831264e-05, + "loss": 2.6031, + "step": 50999 + }, + { + "epoch": 2.3744209325604673, + "grad_norm": 0.3247418914884851, + "learning_rate": 1.2667460215619237e-05, + "loss": 2.6503, + "step": 51000 + }, + { + "epoch": 2.3744674907465604, + "grad_norm": 0.31728247000042326, + "learning_rate": 1.2665658374974531e-05, + "loss": 2.5589, + "step": 51001 + }, + { + "epoch": 2.3745140489326535, + "grad_norm": 0.32663229259051885, + "learning_rate": 1.2663856643902461e-05, + "loss": 2.5867, + "step": 51002 + }, + { + "epoch": 2.3745606071187466, + "grad_norm": 0.31317399586388117, + "learning_rate": 1.2662055022408276e-05, + "loss": 2.5287, + "step": 51003 + }, + { + "epoch": 2.3746071653048397, + "grad_norm": 0.31788151257875535, + "learning_rate": 1.2660253510497277e-05, + "loss": 2.6269, + "step": 51004 + }, + { + "epoch": 2.374653723490933, + "grad_norm": 0.3268851141320333, + "learning_rate": 1.2658452108174762e-05, + "loss": 2.6828, + "step": 51005 + }, + { + "epoch": 2.374700281677026, + "grad_norm": 0.3120584767573737, + "learning_rate": 1.2656650815446008e-05, + "loss": 2.6998, + "step": 51006 + }, + { + "epoch": 2.374746839863119, + "grad_norm": 0.31446473007585257, + "learning_rate": 1.2654849632316308e-05, + "loss": 2.6524, + "step": 51007 + }, + { + "epoch": 2.374793398049212, + "grad_norm": 0.3168312571320165, + "learning_rate": 1.265304855879096e-05, + "loss": 2.5987, + "step": 51008 + }, + { + "epoch": 2.374839956235305, + "grad_norm": 0.303346790933381, + "learning_rate": 1.2651247594875198e-05, + "loss": 2.5719, + "step": 51009 + }, + { + "epoch": 2.374886514421398, + "grad_norm": 0.3199237756821453, + "learning_rate": 1.2649446740574382e-05, + "loss": 2.5325, + "step": 51010 + }, + { + "epoch": 2.374933072607491, + "grad_norm": 0.31602957152282685, + "learning_rate": 1.2647645995893742e-05, + "loss": 2.6818, + "step": 51011 + }, + { + "epoch": 2.374979630793584, + "grad_norm": 0.3078933031835655, + "learning_rate": 1.2645845360838582e-05, + "loss": 2.494, + "step": 51012 + }, + { + "epoch": 2.3750261889796773, + "grad_norm": 0.34694802095233823, + "learning_rate": 1.2644044835414188e-05, + "loss": 2.7012, + "step": 51013 + }, + { + "epoch": 2.3750727471657704, + "grad_norm": 0.3083754396165844, + "learning_rate": 1.2642244419625837e-05, + "loss": 2.6236, + "step": 51014 + }, + { + "epoch": 2.3751193053518636, + "grad_norm": 0.3090788076412143, + "learning_rate": 1.2640444113478839e-05, + "loss": 2.7039, + "step": 51015 + }, + { + "epoch": 2.3751658635379567, + "grad_norm": 0.3403862936471813, + "learning_rate": 1.2638643916978438e-05, + "loss": 2.6634, + "step": 51016 + }, + { + "epoch": 2.37521242172405, + "grad_norm": 0.3181383756107454, + "learning_rate": 1.263684383012994e-05, + "loss": 2.7071, + "step": 51017 + }, + { + "epoch": 2.375258979910143, + "grad_norm": 0.3255440256379842, + "learning_rate": 1.2635043852938621e-05, + "loss": 2.6501, + "step": 51018 + }, + { + "epoch": 2.375305538096236, + "grad_norm": 0.331175069695677, + "learning_rate": 1.2633243985409771e-05, + "loss": 2.669, + "step": 51019 + }, + { + "epoch": 2.3753520962823287, + "grad_norm": 0.3487631977472435, + "learning_rate": 1.2631444227548667e-05, + "loss": 2.6599, + "step": 51020 + }, + { + "epoch": 2.375398654468422, + "grad_norm": 0.31324065689609404, + 
"learning_rate": 1.2629644579360606e-05, + "loss": 2.6016, + "step": 51021 + }, + { + "epoch": 2.375445212654515, + "grad_norm": 0.32621732219247485, + "learning_rate": 1.2627845040850828e-05, + "loss": 2.6507, + "step": 51022 + }, + { + "epoch": 2.375491770840608, + "grad_norm": 0.3281311068940186, + "learning_rate": 1.2626045612024673e-05, + "loss": 2.6733, + "step": 51023 + }, + { + "epoch": 2.375538329026701, + "grad_norm": 0.32152322238302194, + "learning_rate": 1.2624246292887375e-05, + "loss": 2.6049, + "step": 51024 + }, + { + "epoch": 2.3755848872127943, + "grad_norm": 0.3148054761903965, + "learning_rate": 1.2622447083444233e-05, + "loss": 2.6352, + "step": 51025 + }, + { + "epoch": 2.3756314453988874, + "grad_norm": 0.32398980681178546, + "learning_rate": 1.2620647983700528e-05, + "loss": 2.6045, + "step": 51026 + }, + { + "epoch": 2.3756780035849805, + "grad_norm": 0.314899308506825, + "learning_rate": 1.2618848993661536e-05, + "loss": 2.7049, + "step": 51027 + }, + { + "epoch": 2.375724561771073, + "grad_norm": 0.3207535885774133, + "learning_rate": 1.2617050113332557e-05, + "loss": 2.6419, + "step": 51028 + }, + { + "epoch": 2.3757711199571663, + "grad_norm": 0.3243971001700178, + "learning_rate": 1.2615251342718826e-05, + "loss": 2.5784, + "step": 51029 + }, + { + "epoch": 2.3758176781432594, + "grad_norm": 0.32083303125376783, + "learning_rate": 1.2613452681825677e-05, + "loss": 2.6807, + "step": 51030 + }, + { + "epoch": 2.3758642363293525, + "grad_norm": 0.32991252194128656, + "learning_rate": 1.2611654130658346e-05, + "loss": 2.6872, + "step": 51031 + }, + { + "epoch": 2.3759107945154456, + "grad_norm": 0.3140008022770216, + "learning_rate": 1.2609855689222122e-05, + "loss": 2.6307, + "step": 51032 + }, + { + "epoch": 2.3759573527015387, + "grad_norm": 0.3194908469945143, + "learning_rate": 1.260805735752229e-05, + "loss": 2.6092, + "step": 51033 + }, + { + "epoch": 2.376003910887632, + "grad_norm": 0.32524052070727694, + "learning_rate": 1.260625913556413e-05, + "loss": 2.69, + "step": 51034 + }, + { + "epoch": 2.376050469073725, + "grad_norm": 0.3083911401436559, + "learning_rate": 1.2604461023352915e-05, + "loss": 2.5805, + "step": 51035 + }, + { + "epoch": 2.376097027259818, + "grad_norm": 0.3354555250649117, + "learning_rate": 1.2602663020893934e-05, + "loss": 2.5989, + "step": 51036 + }, + { + "epoch": 2.376143585445911, + "grad_norm": 0.34231599448550015, + "learning_rate": 1.260086512819244e-05, + "loss": 2.7423, + "step": 51037 + }, + { + "epoch": 2.3761901436320043, + "grad_norm": 0.33235888922910667, + "learning_rate": 1.2599067345253723e-05, + "loss": 2.6763, + "step": 51038 + }, + { + "epoch": 2.376236701818097, + "grad_norm": 0.30151794417391914, + "learning_rate": 1.2597269672083061e-05, + "loss": 2.5535, + "step": 51039 + }, + { + "epoch": 2.37628326000419, + "grad_norm": 0.3118412706520157, + "learning_rate": 1.2595472108685725e-05, + "loss": 2.5934, + "step": 51040 + }, + { + "epoch": 2.376329818190283, + "grad_norm": 0.30144302228529235, + "learning_rate": 1.2593674655067006e-05, + "loss": 2.6305, + "step": 51041 + }, + { + "epoch": 2.3763763763763763, + "grad_norm": 0.3138140303938676, + "learning_rate": 1.2591877311232142e-05, + "loss": 2.6186, + "step": 51042 + }, + { + "epoch": 2.3764229345624694, + "grad_norm": 0.3230473048439604, + "learning_rate": 1.2590080077186455e-05, + "loss": 2.6539, + "step": 51043 + }, + { + "epoch": 2.3764694927485626, + "grad_norm": 0.31087091000668915, + "learning_rate": 1.2588282952935193e-05, + "loss": 2.7702, + 
"step": 51044 + }, + { + "epoch": 2.3765160509346557, + "grad_norm": 0.3018229592370384, + "learning_rate": 1.2586485938483628e-05, + "loss": 2.6684, + "step": 51045 + }, + { + "epoch": 2.376562609120749, + "grad_norm": 0.32002900953268587, + "learning_rate": 1.2584689033837038e-05, + "loss": 2.6809, + "step": 51046 + }, + { + "epoch": 2.376609167306842, + "grad_norm": 0.3267556937930211, + "learning_rate": 1.2582892239000704e-05, + "loss": 2.5851, + "step": 51047 + }, + { + "epoch": 2.3766557254929346, + "grad_norm": 0.33604407252809587, + "learning_rate": 1.2581095553979899e-05, + "loss": 2.6937, + "step": 51048 + }, + { + "epoch": 2.3767022836790277, + "grad_norm": 0.2999586989969662, + "learning_rate": 1.2579298978779903e-05, + "loss": 2.5901, + "step": 51049 + }, + { + "epoch": 2.376748841865121, + "grad_norm": 0.33513343209821544, + "learning_rate": 1.2577502513405965e-05, + "loss": 2.6714, + "step": 51050 + }, + { + "epoch": 2.376795400051214, + "grad_norm": 0.3328723159383763, + "learning_rate": 1.2575706157863365e-05, + "loss": 2.6395, + "step": 51051 + }, + { + "epoch": 2.376841958237307, + "grad_norm": 0.3285220626940136, + "learning_rate": 1.2573909912157383e-05, + "loss": 2.6332, + "step": 51052 + }, + { + "epoch": 2.3768885164234, + "grad_norm": 0.3047049058186312, + "learning_rate": 1.2572113776293292e-05, + "loss": 2.5843, + "step": 51053 + }, + { + "epoch": 2.3769350746094933, + "grad_norm": 0.3227907229694385, + "learning_rate": 1.2570317750276372e-05, + "loss": 2.6548, + "step": 51054 + }, + { + "epoch": 2.3769816327955864, + "grad_norm": 0.3106888818146099, + "learning_rate": 1.2568521834111852e-05, + "loss": 2.6793, + "step": 51055 + }, + { + "epoch": 2.3770281909816795, + "grad_norm": 0.31445667363990476, + "learning_rate": 1.256672602780506e-05, + "loss": 2.6455, + "step": 51056 + }, + { + "epoch": 2.3770747491677726, + "grad_norm": 0.3036740758456816, + "learning_rate": 1.256493033136123e-05, + "loss": 2.6997, + "step": 51057 + }, + { + "epoch": 2.3771213073538657, + "grad_norm": 0.3151485772527365, + "learning_rate": 1.2563134744785638e-05, + "loss": 2.5695, + "step": 51058 + }, + { + "epoch": 2.3771678655399584, + "grad_norm": 0.32891951470595554, + "learning_rate": 1.2561339268083556e-05, + "loss": 2.6675, + "step": 51059 + }, + { + "epoch": 2.3772144237260515, + "grad_norm": 0.3302138839276609, + "learning_rate": 1.2559543901260257e-05, + "loss": 2.5964, + "step": 51060 + }, + { + "epoch": 2.3772609819121446, + "grad_norm": 0.3493313541952214, + "learning_rate": 1.255774864432101e-05, + "loss": 2.6656, + "step": 51061 + }, + { + "epoch": 2.3773075400982377, + "grad_norm": 0.3062654777928202, + "learning_rate": 1.2555953497271095e-05, + "loss": 2.5666, + "step": 51062 + }, + { + "epoch": 2.377354098284331, + "grad_norm": 0.3100559138268132, + "learning_rate": 1.255415846011575e-05, + "loss": 2.6554, + "step": 51063 + }, + { + "epoch": 2.377400656470424, + "grad_norm": 0.34204203391628657, + "learning_rate": 1.2552363532860268e-05, + "loss": 2.6329, + "step": 51064 + }, + { + "epoch": 2.377447214656517, + "grad_norm": 0.3330278627717931, + "learning_rate": 1.2550568715509904e-05, + "loss": 2.5641, + "step": 51065 + }, + { + "epoch": 2.37749377284261, + "grad_norm": 0.33197022513593205, + "learning_rate": 1.2548774008069935e-05, + "loss": 2.5966, + "step": 51066 + }, + { + "epoch": 2.377540331028703, + "grad_norm": 0.3226061933985108, + "learning_rate": 1.2546979410545634e-05, + "loss": 2.6045, + "step": 51067 + }, + { + "epoch": 2.377586889214796, + "grad_norm": 
0.3123653059067355, + "learning_rate": 1.2545184922942232e-05, + "loss": 2.5609, + "step": 51068 + }, + { + "epoch": 2.377633447400889, + "grad_norm": 0.3215030616875394, + "learning_rate": 1.2543390545265054e-05, + "loss": 2.5866, + "step": 51069 + }, + { + "epoch": 2.3776800055869822, + "grad_norm": 0.3407836613195864, + "learning_rate": 1.2541596277519303e-05, + "loss": 2.7282, + "step": 51070 + }, + { + "epoch": 2.3777265637730753, + "grad_norm": 0.33446988385233295, + "learning_rate": 1.253980211971031e-05, + "loss": 2.5635, + "step": 51071 + }, + { + "epoch": 2.3777731219591685, + "grad_norm": 0.3169857087419738, + "learning_rate": 1.2538008071843288e-05, + "loss": 2.639, + "step": 51072 + }, + { + "epoch": 2.3778196801452616, + "grad_norm": 0.3270203078407291, + "learning_rate": 1.2536214133923524e-05, + "loss": 2.6328, + "step": 51073 + }, + { + "epoch": 2.3778662383313547, + "grad_norm": 0.32463286102142147, + "learning_rate": 1.2534420305956279e-05, + "loss": 2.5601, + "step": 51074 + }, + { + "epoch": 2.377912796517448, + "grad_norm": 0.3331776038952162, + "learning_rate": 1.2532626587946833e-05, + "loss": 2.6913, + "step": 51075 + }, + { + "epoch": 2.377959354703541, + "grad_norm": 0.32970449793001855, + "learning_rate": 1.2530832979900425e-05, + "loss": 2.6424, + "step": 51076 + }, + { + "epoch": 2.378005912889634, + "grad_norm": 0.352481850780566, + "learning_rate": 1.252903948182233e-05, + "loss": 2.6134, + "step": 51077 + }, + { + "epoch": 2.3780524710757267, + "grad_norm": 0.30773146405236573, + "learning_rate": 1.2527246093717814e-05, + "loss": 2.6049, + "step": 51078 + }, + { + "epoch": 2.37809902926182, + "grad_norm": 0.30423323785341116, + "learning_rate": 1.2525452815592137e-05, + "loss": 2.6093, + "step": 51079 + }, + { + "epoch": 2.378145587447913, + "grad_norm": 0.33242611392916777, + "learning_rate": 1.252365964745058e-05, + "loss": 2.5972, + "step": 51080 + }, + { + "epoch": 2.378192145634006, + "grad_norm": 0.3228039751244913, + "learning_rate": 1.2521866589298358e-05, + "loss": 2.6416, + "step": 51081 + }, + { + "epoch": 2.378238703820099, + "grad_norm": 0.35259899438321934, + "learning_rate": 1.25200736411408e-05, + "loss": 2.6443, + "step": 51082 + }, + { + "epoch": 2.3782852620061923, + "grad_norm": 0.3247320517878374, + "learning_rate": 1.25182808029831e-05, + "loss": 2.601, + "step": 51083 + }, + { + "epoch": 2.3783318201922854, + "grad_norm": 0.3083527715381683, + "learning_rate": 1.2516488074830585e-05, + "loss": 2.5317, + "step": 51084 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 0.33543976152161975, + "learning_rate": 1.251469545668847e-05, + "loss": 2.6356, + "step": 51085 + }, + { + "epoch": 2.3784249365644716, + "grad_norm": 0.3210931458600897, + "learning_rate": 1.2512902948562027e-05, + "loss": 2.6249, + "step": 51086 + }, + { + "epoch": 2.3784714947505643, + "grad_norm": 0.3454718475657705, + "learning_rate": 1.2511110550456523e-05, + "loss": 2.7627, + "step": 51087 + }, + { + "epoch": 2.3785180529366574, + "grad_norm": 0.31756569904142407, + "learning_rate": 1.2509318262377217e-05, + "loss": 2.6167, + "step": 51088 + }, + { + "epoch": 2.3785646111227505, + "grad_norm": 0.32709494641370973, + "learning_rate": 1.250752608432938e-05, + "loss": 2.6224, + "step": 51089 + }, + { + "epoch": 2.3786111693088436, + "grad_norm": 0.3298913350196352, + "learning_rate": 1.250573401631825e-05, + "loss": 2.6978, + "step": 51090 + }, + { + "epoch": 2.3786577274949368, + "grad_norm": 0.326804904192016, + "learning_rate": 1.2503942058349093e-05, + 
"loss": 2.6049, + "step": 51091 + }, + { + "epoch": 2.37870428568103, + "grad_norm": 0.32268269703427316, + "learning_rate": 1.2502150210427177e-05, + "loss": 2.6286, + "step": 51092 + }, + { + "epoch": 2.378750843867123, + "grad_norm": 0.3066094562672208, + "learning_rate": 1.2500358472557767e-05, + "loss": 2.5275, + "step": 51093 + }, + { + "epoch": 2.378797402053216, + "grad_norm": 0.32808973495635135, + "learning_rate": 1.249856684474608e-05, + "loss": 2.6348, + "step": 51094 + }, + { + "epoch": 2.378843960239309, + "grad_norm": 0.3436062798447855, + "learning_rate": 1.2496775326997439e-05, + "loss": 2.6429, + "step": 51095 + }, + { + "epoch": 2.3788905184254023, + "grad_norm": 0.3240136152419688, + "learning_rate": 1.2494983919317033e-05, + "loss": 2.6078, + "step": 51096 + }, + { + "epoch": 2.3789370766114954, + "grad_norm": 0.31619262702455814, + "learning_rate": 1.2493192621710187e-05, + "loss": 2.6377, + "step": 51097 + }, + { + "epoch": 2.378983634797588, + "grad_norm": 0.3847151948953289, + "learning_rate": 1.2491401434182109e-05, + "loss": 2.6881, + "step": 51098 + }, + { + "epoch": 2.3790301929836812, + "grad_norm": 0.32934074704483457, + "learning_rate": 1.2489610356738074e-05, + "loss": 2.6264, + "step": 51099 + }, + { + "epoch": 2.3790767511697744, + "grad_norm": 0.34060123902204564, + "learning_rate": 1.2487819389383337e-05, + "loss": 2.7591, + "step": 51100 + }, + { + "epoch": 2.3791233093558675, + "grad_norm": 0.3239041193613461, + "learning_rate": 1.2486028532123157e-05, + "loss": 2.5838, + "step": 51101 + }, + { + "epoch": 2.3791698675419606, + "grad_norm": 0.3267204855502385, + "learning_rate": 1.24842377849628e-05, + "loss": 2.6023, + "step": 51102 + }, + { + "epoch": 2.3792164257280537, + "grad_norm": 0.3399972276649435, + "learning_rate": 1.2482447147907495e-05, + "loss": 2.6263, + "step": 51103 + }, + { + "epoch": 2.379262983914147, + "grad_norm": 0.3256752960187919, + "learning_rate": 1.248065662096251e-05, + "loss": 2.6495, + "step": 51104 + }, + { + "epoch": 2.37930954210024, + "grad_norm": 0.3304599912466514, + "learning_rate": 1.24788662041331e-05, + "loss": 2.7075, + "step": 51105 + }, + { + "epoch": 2.3793561002863326, + "grad_norm": 0.34038326406239977, + "learning_rate": 1.2477075897424522e-05, + "loss": 2.6272, + "step": 51106 + }, + { + "epoch": 2.3794026584724257, + "grad_norm": 0.3217742299951792, + "learning_rate": 1.2475285700842031e-05, + "loss": 2.5791, + "step": 51107 + }, + { + "epoch": 2.379449216658519, + "grad_norm": 0.32455941016797546, + "learning_rate": 1.2473495614390896e-05, + "loss": 2.6104, + "step": 51108 + }, + { + "epoch": 2.379495774844612, + "grad_norm": 0.33729375320388383, + "learning_rate": 1.2471705638076326e-05, + "loss": 2.6907, + "step": 51109 + }, + { + "epoch": 2.379542333030705, + "grad_norm": 0.33190358557210664, + "learning_rate": 1.2469915771903629e-05, + "loss": 2.5333, + "step": 51110 + }, + { + "epoch": 2.379588891216798, + "grad_norm": 0.31651288224176477, + "learning_rate": 1.2468126015878018e-05, + "loss": 2.4881, + "step": 51111 + }, + { + "epoch": 2.3796354494028913, + "grad_norm": 0.36037991733209473, + "learning_rate": 1.246633637000476e-05, + "loss": 2.7293, + "step": 51112 + }, + { + "epoch": 2.3796820075889844, + "grad_norm": 0.3269372210226292, + "learning_rate": 1.2464546834289104e-05, + "loss": 2.6222, + "step": 51113 + }, + { + "epoch": 2.3797285657750775, + "grad_norm": 0.31719781914850437, + "learning_rate": 1.2462757408736309e-05, + "loss": 2.6303, + "step": 51114 + }, + { + "epoch": 
2.3797751239611706, + "grad_norm": 0.31517401570642356, + "learning_rate": 1.2460968093351639e-05, + "loss": 2.6776, + "step": 51115 + }, + { + "epoch": 2.3798216821472638, + "grad_norm": 0.3207487987574115, + "learning_rate": 1.245917888814031e-05, + "loss": 2.5895, + "step": 51116 + }, + { + "epoch": 2.3798682403333564, + "grad_norm": 0.33452040896922175, + "learning_rate": 1.2457389793107593e-05, + "loss": 2.5639, + "step": 51117 + }, + { + "epoch": 2.3799147985194495, + "grad_norm": 0.306252248014412, + "learning_rate": 1.2455600808258743e-05, + "loss": 2.5901, + "step": 51118 + }, + { + "epoch": 2.3799613567055427, + "grad_norm": 0.3186427933577017, + "learning_rate": 1.2453811933599002e-05, + "loss": 2.6176, + "step": 51119 + }, + { + "epoch": 2.3800079148916358, + "grad_norm": 0.3272306676853394, + "learning_rate": 1.2452023169133625e-05, + "loss": 2.6733, + "step": 51120 + }, + { + "epoch": 2.380054473077729, + "grad_norm": 0.3193326088002745, + "learning_rate": 1.2450234514867875e-05, + "loss": 2.6859, + "step": 51121 + }, + { + "epoch": 2.380101031263822, + "grad_norm": 0.30481343983854786, + "learning_rate": 1.244844597080696e-05, + "loss": 2.6715, + "step": 51122 + }, + { + "epoch": 2.380147589449915, + "grad_norm": 0.315743375455617, + "learning_rate": 1.2446657536956186e-05, + "loss": 2.5705, + "step": 51123 + }, + { + "epoch": 2.3801941476360082, + "grad_norm": 0.343118834294904, + "learning_rate": 1.2444869213320742e-05, + "loss": 2.6746, + "step": 51124 + }, + { + "epoch": 2.3802407058221013, + "grad_norm": 0.3162667117285285, + "learning_rate": 1.2443080999905938e-05, + "loss": 2.6252, + "step": 51125 + }, + { + "epoch": 2.380287264008194, + "grad_norm": 0.3323338016958978, + "learning_rate": 1.2441292896716971e-05, + "loss": 2.6313, + "step": 51126 + }, + { + "epoch": 2.380333822194287, + "grad_norm": 0.3146656420974136, + "learning_rate": 1.2439504903759114e-05, + "loss": 2.6346, + "step": 51127 + }, + { + "epoch": 2.3803803803803802, + "grad_norm": 0.3336035650917163, + "learning_rate": 1.2437717021037626e-05, + "loss": 2.6335, + "step": 51128 + }, + { + "epoch": 2.3804269385664734, + "grad_norm": 0.33063145951026696, + "learning_rate": 1.243592924855772e-05, + "loss": 2.4817, + "step": 51129 + }, + { + "epoch": 2.3804734967525665, + "grad_norm": 0.3274606442965324, + "learning_rate": 1.243414158632466e-05, + "loss": 2.6202, + "step": 51130 + }, + { + "epoch": 2.3805200549386596, + "grad_norm": 0.32322303776376626, + "learning_rate": 1.2432354034343697e-05, + "loss": 2.6154, + "step": 51131 + }, + { + "epoch": 2.3805666131247527, + "grad_norm": 0.3259529714888802, + "learning_rate": 1.2430566592620074e-05, + "loss": 2.6656, + "step": 51132 + }, + { + "epoch": 2.380613171310846, + "grad_norm": 0.32616942995516063, + "learning_rate": 1.2428779261159034e-05, + "loss": 2.6729, + "step": 51133 + }, + { + "epoch": 2.380659729496939, + "grad_norm": 0.34236457119625513, + "learning_rate": 1.242699203996584e-05, + "loss": 2.5871, + "step": 51134 + }, + { + "epoch": 2.380706287683032, + "grad_norm": 0.33257831953124223, + "learning_rate": 1.2425204929045698e-05, + "loss": 2.6662, + "step": 51135 + }, + { + "epoch": 2.380752845869125, + "grad_norm": 0.3355427742001755, + "learning_rate": 1.2423417928403903e-05, + "loss": 2.6317, + "step": 51136 + }, + { + "epoch": 2.380799404055218, + "grad_norm": 0.3253435435018575, + "learning_rate": 1.2421631038045645e-05, + "loss": 2.6766, + "step": 51137 + }, + { + "epoch": 2.380845962241311, + "grad_norm": 0.3073290065784306, + 
"learning_rate": 1.2419844257976232e-05, + "loss": 2.6599, + "step": 51138 + }, + { + "epoch": 2.380892520427404, + "grad_norm": 0.322540854272664, + "learning_rate": 1.2418057588200854e-05, + "loss": 2.6349, + "step": 51139 + }, + { + "epoch": 2.380939078613497, + "grad_norm": 0.3299617983117778, + "learning_rate": 1.2416271028724773e-05, + "loss": 2.6139, + "step": 51140 + }, + { + "epoch": 2.3809856367995903, + "grad_norm": 0.31639975081173, + "learning_rate": 1.2414484579553254e-05, + "loss": 2.5731, + "step": 51141 + }, + { + "epoch": 2.3810321949856834, + "grad_norm": 0.321651519121532, + "learning_rate": 1.241269824069149e-05, + "loss": 2.6204, + "step": 51142 + }, + { + "epoch": 2.3810787531717765, + "grad_norm": 0.3233531925855329, + "learning_rate": 1.2410912012144782e-05, + "loss": 2.6509, + "step": 51143 + }, + { + "epoch": 2.3811253113578696, + "grad_norm": 0.31525663076166294, + "learning_rate": 1.2409125893918328e-05, + "loss": 2.5372, + "step": 51144 + }, + { + "epoch": 2.3811718695439628, + "grad_norm": 0.3073407798899626, + "learning_rate": 1.2407339886017383e-05, + "loss": 2.6117, + "step": 51145 + }, + { + "epoch": 2.3812184277300554, + "grad_norm": 0.2881574185702689, + "learning_rate": 1.24055539884472e-05, + "loss": 2.6742, + "step": 51146 + }, + { + "epoch": 2.3812649859161485, + "grad_norm": 0.3466673462831834, + "learning_rate": 1.2403768201213023e-05, + "loss": 2.6773, + "step": 51147 + }, + { + "epoch": 2.3813115441022417, + "grad_norm": 0.31202187398305176, + "learning_rate": 1.2401982524320049e-05, + "loss": 2.5567, + "step": 51148 + }, + { + "epoch": 2.3813581022883348, + "grad_norm": 0.3202628796581995, + "learning_rate": 1.2400196957773585e-05, + "loss": 2.7159, + "step": 51149 + }, + { + "epoch": 2.381404660474428, + "grad_norm": 0.29627510139782776, + "learning_rate": 1.239841150157881e-05, + "loss": 2.6864, + "step": 51150 + }, + { + "epoch": 2.381451218660521, + "grad_norm": 0.3090172553607397, + "learning_rate": 1.2396626155741026e-05, + "loss": 2.6157, + "step": 51151 + }, + { + "epoch": 2.381497776846614, + "grad_norm": 0.3460313831269264, + "learning_rate": 1.2394840920265422e-05, + "loss": 2.7245, + "step": 51152 + }, + { + "epoch": 2.3815443350327072, + "grad_norm": 0.307785623767166, + "learning_rate": 1.2393055795157255e-05, + "loss": 2.6447, + "step": 51153 + }, + { + "epoch": 2.3815908932188004, + "grad_norm": 0.3275603870226663, + "learning_rate": 1.2391270780421777e-05, + "loss": 2.7112, + "step": 51154 + }, + { + "epoch": 2.3816374514048935, + "grad_norm": 0.33051435120854405, + "learning_rate": 1.2389485876064188e-05, + "loss": 2.5914, + "step": 51155 + }, + { + "epoch": 2.3816840095909866, + "grad_norm": 0.30368100555616134, + "learning_rate": 1.238770108208978e-05, + "loss": 2.7177, + "step": 51156 + }, + { + "epoch": 2.3817305677770793, + "grad_norm": 0.2993494691814269, + "learning_rate": 1.238591639850375e-05, + "loss": 2.591, + "step": 51157 + }, + { + "epoch": 2.3817771259631724, + "grad_norm": 0.31591701303765796, + "learning_rate": 1.2384131825311345e-05, + "loss": 2.7119, + "step": 51158 + }, + { + "epoch": 2.3818236841492655, + "grad_norm": 0.3215907282190044, + "learning_rate": 1.2382347362517815e-05, + "loss": 2.635, + "step": 51159 + }, + { + "epoch": 2.3818702423353586, + "grad_norm": 0.30181078872780975, + "learning_rate": 1.2380563010128388e-05, + "loss": 2.651, + "step": 51160 + }, + { + "epoch": 2.3819168005214517, + "grad_norm": 0.30490932413464855, + "learning_rate": 1.2378778768148297e-05, + "loss": 2.6181, + "step": 
51161 + }, + { + "epoch": 2.381963358707545, + "grad_norm": 0.32176603303380896, + "learning_rate": 1.2376994636582806e-05, + "loss": 2.4516, + "step": 51162 + }, + { + "epoch": 2.382009916893638, + "grad_norm": 0.3395143275519239, + "learning_rate": 1.2375210615437094e-05, + "loss": 2.6637, + "step": 51163 + }, + { + "epoch": 2.382056475079731, + "grad_norm": 0.3241432338568465, + "learning_rate": 1.2373426704716462e-05, + "loss": 2.7209, + "step": 51164 + }, + { + "epoch": 2.3821030332658237, + "grad_norm": 0.3118006857442211, + "learning_rate": 1.2371642904426106e-05, + "loss": 2.577, + "step": 51165 + }, + { + "epoch": 2.382149591451917, + "grad_norm": 0.34636216634614597, + "learning_rate": 1.2369859214571267e-05, + "loss": 2.6181, + "step": 51166 + }, + { + "epoch": 2.38219614963801, + "grad_norm": 0.32216374128351505, + "learning_rate": 1.2368075635157201e-05, + "loss": 2.6525, + "step": 51167 + }, + { + "epoch": 2.382242707824103, + "grad_norm": 0.3201180060035528, + "learning_rate": 1.2366292166189091e-05, + "loss": 2.5507, + "step": 51168 + }, + { + "epoch": 2.382289266010196, + "grad_norm": 0.3520276842251856, + "learning_rate": 1.2364508807672243e-05, + "loss": 2.6707, + "step": 51169 + }, + { + "epoch": 2.3823358241962893, + "grad_norm": 0.3144483963714162, + "learning_rate": 1.2362725559611832e-05, + "loss": 2.6089, + "step": 51170 + }, + { + "epoch": 2.3823823823823824, + "grad_norm": 0.3214187082564813, + "learning_rate": 1.2360942422013123e-05, + "loss": 2.5885, + "step": 51171 + }, + { + "epoch": 2.3824289405684755, + "grad_norm": 0.32103716201715066, + "learning_rate": 1.2359159394881332e-05, + "loss": 2.5781, + "step": 51172 + }, + { + "epoch": 2.3824754987545687, + "grad_norm": 0.3244463959000353, + "learning_rate": 1.2357376478221699e-05, + "loss": 2.6822, + "step": 51173 + }, + { + "epoch": 2.3825220569406618, + "grad_norm": 0.32912074838109057, + "learning_rate": 1.2355593672039462e-05, + "loss": 2.5727, + "step": 51174 + }, + { + "epoch": 2.382568615126755, + "grad_norm": 0.33359008387583516, + "learning_rate": 1.2353810976339863e-05, + "loss": 2.6735, + "step": 51175 + }, + { + "epoch": 2.3826151733128476, + "grad_norm": 0.3306310669095198, + "learning_rate": 1.235202839112809e-05, + "loss": 2.6759, + "step": 51176 + }, + { + "epoch": 2.3826617314989407, + "grad_norm": 0.31110979112389386, + "learning_rate": 1.235024591640943e-05, + "loss": 2.6885, + "step": 51177 + }, + { + "epoch": 2.382708289685034, + "grad_norm": 0.3151064284412917, + "learning_rate": 1.2348463552189082e-05, + "loss": 2.6329, + "step": 51178 + }, + { + "epoch": 2.382754847871127, + "grad_norm": 0.3090952697800967, + "learning_rate": 1.2346681298472279e-05, + "loss": 2.7375, + "step": 51179 + }, + { + "epoch": 2.38280140605722, + "grad_norm": 0.31087396351681135, + "learning_rate": 1.2344899155264271e-05, + "loss": 2.7629, + "step": 51180 + }, + { + "epoch": 2.382847964243313, + "grad_norm": 0.3217435949629996, + "learning_rate": 1.2343117122570252e-05, + "loss": 2.6585, + "step": 51181 + }, + { + "epoch": 2.3828945224294062, + "grad_norm": 0.3287355461005166, + "learning_rate": 1.2341335200395499e-05, + "loss": 2.6712, + "step": 51182 + }, + { + "epoch": 2.3829410806154994, + "grad_norm": 0.3289652468006716, + "learning_rate": 1.2339553388745201e-05, + "loss": 2.6782, + "step": 51183 + }, + { + "epoch": 2.3829876388015925, + "grad_norm": 0.31691546884528937, + "learning_rate": 1.2337771687624606e-05, + "loss": 2.5822, + "step": 51184 + }, + { + "epoch": 2.383034196987685, + "grad_norm": 
0.3052252744590274, + "learning_rate": 1.2335990097038942e-05, + "loss": 2.6146, + "step": 51185 + }, + { + "epoch": 2.3830807551737783, + "grad_norm": 0.3268507803475999, + "learning_rate": 1.2334208616993432e-05, + "loss": 2.705, + "step": 51186 + }, + { + "epoch": 2.3831273133598714, + "grad_norm": 0.3112066266603589, + "learning_rate": 1.2332427247493311e-05, + "loss": 2.6493, + "step": 51187 + }, + { + "epoch": 2.3831738715459645, + "grad_norm": 0.32451565068594385, + "learning_rate": 1.2330645988543821e-05, + "loss": 2.5802, + "step": 51188 + }, + { + "epoch": 2.3832204297320576, + "grad_norm": 0.32297772407292447, + "learning_rate": 1.2328864840150145e-05, + "loss": 2.6591, + "step": 51189 + }, + { + "epoch": 2.3832669879181507, + "grad_norm": 0.31494975235168293, + "learning_rate": 1.2327083802317567e-05, + "loss": 2.6459, + "step": 51190 + }, + { + "epoch": 2.383313546104244, + "grad_norm": 0.3099624525368449, + "learning_rate": 1.232530287505127e-05, + "loss": 2.626, + "step": 51191 + }, + { + "epoch": 2.383360104290337, + "grad_norm": 0.30585837596991766, + "learning_rate": 1.2323522058356501e-05, + "loss": 2.6029, + "step": 51192 + }, + { + "epoch": 2.38340666247643, + "grad_norm": 0.31325446969641796, + "learning_rate": 1.2321741352238497e-05, + "loss": 2.5813, + "step": 51193 + }, + { + "epoch": 2.383453220662523, + "grad_norm": 0.33567209485339466, + "learning_rate": 1.2319960756702443e-05, + "loss": 2.6938, + "step": 51194 + }, + { + "epoch": 2.3834997788486163, + "grad_norm": 0.30986347665498926, + "learning_rate": 1.2318180271753621e-05, + "loss": 2.6311, + "step": 51195 + }, + { + "epoch": 2.383546337034709, + "grad_norm": 0.34140085859363406, + "learning_rate": 1.2316399897397196e-05, + "loss": 2.7467, + "step": 51196 + }, + { + "epoch": 2.383592895220802, + "grad_norm": 0.31493467254957713, + "learning_rate": 1.2314619633638452e-05, + "loss": 2.5428, + "step": 51197 + }, + { + "epoch": 2.383639453406895, + "grad_norm": 0.3416339285733471, + "learning_rate": 1.2312839480482573e-05, + "loss": 2.6223, + "step": 51198 + }, + { + "epoch": 2.3836860115929883, + "grad_norm": 0.3456171455595553, + "learning_rate": 1.2311059437934802e-05, + "loss": 2.7179, + "step": 51199 + }, + { + "epoch": 2.3837325697790814, + "grad_norm": 0.31738070905993615, + "learning_rate": 1.2309279506000354e-05, + "loss": 2.6508, + "step": 51200 + }, + { + "epoch": 2.3837791279651745, + "grad_norm": 0.33728602883856534, + "learning_rate": 1.2307499684684476e-05, + "loss": 2.6374, + "step": 51201 + }, + { + "epoch": 2.3838256861512677, + "grad_norm": 0.33477624470852, + "learning_rate": 1.230571997399234e-05, + "loss": 2.5714, + "step": 51202 + }, + { + "epoch": 2.383872244337361, + "grad_norm": 0.3355878058889949, + "learning_rate": 1.2303940373929234e-05, + "loss": 2.6825, + "step": 51203 + }, + { + "epoch": 2.3839188025234535, + "grad_norm": 0.3269270159664592, + "learning_rate": 1.2302160884500336e-05, + "loss": 2.5742, + "step": 51204 + }, + { + "epoch": 2.3839653607095466, + "grad_norm": 0.3340125642920601, + "learning_rate": 1.2300381505710878e-05, + "loss": 2.702, + "step": 51205 + }, + { + "epoch": 2.3840119188956397, + "grad_norm": 0.35509178091209637, + "learning_rate": 1.2298602237566104e-05, + "loss": 2.6504, + "step": 51206 + }, + { + "epoch": 2.384058477081733, + "grad_norm": 0.3238761272950563, + "learning_rate": 1.2296823080071185e-05, + "loss": 2.4652, + "step": 51207 + }, + { + "epoch": 2.384105035267826, + "grad_norm": 0.3253991234818678, + "learning_rate": 1.2295044033231407e-05, + 
"loss": 2.5648, + "step": 51208 + }, + { + "epoch": 2.384151593453919, + "grad_norm": 0.3230287181044747, + "learning_rate": 1.2293265097051931e-05, + "loss": 2.746, + "step": 51209 + }, + { + "epoch": 2.384198151640012, + "grad_norm": 0.35713293607846713, + "learning_rate": 1.2291486271538039e-05, + "loss": 2.6849, + "step": 51210 + }, + { + "epoch": 2.3842447098261053, + "grad_norm": 0.3348996167122723, + "learning_rate": 1.2289707556694908e-05, + "loss": 2.599, + "step": 51211 + }, + { + "epoch": 2.3842912680121984, + "grad_norm": 0.3289462626902121, + "learning_rate": 1.2287928952527761e-05, + "loss": 2.6801, + "step": 51212 + }, + { + "epoch": 2.3843378261982915, + "grad_norm": 0.3287511144194036, + "learning_rate": 1.2286150459041834e-05, + "loss": 2.6184, + "step": 51213 + }, + { + "epoch": 2.3843843843843846, + "grad_norm": 0.33490464279002413, + "learning_rate": 1.2284372076242339e-05, + "loss": 2.6111, + "step": 51214 + }, + { + "epoch": 2.3844309425704773, + "grad_norm": 0.32357643213620685, + "learning_rate": 1.2282593804134495e-05, + "loss": 2.6008, + "step": 51215 + }, + { + "epoch": 2.3844775007565704, + "grad_norm": 0.3257704824587672, + "learning_rate": 1.2280815642723537e-05, + "loss": 2.5836, + "step": 51216 + }, + { + "epoch": 2.3845240589426635, + "grad_norm": 0.31742712570041653, + "learning_rate": 1.2279037592014652e-05, + "loss": 2.6702, + "step": 51217 + }, + { + "epoch": 2.3845706171287566, + "grad_norm": 0.33546637060547374, + "learning_rate": 1.2277259652013079e-05, + "loss": 2.6679, + "step": 51218 + }, + { + "epoch": 2.3846171753148497, + "grad_norm": 0.3091173466387326, + "learning_rate": 1.227548182272405e-05, + "loss": 2.586, + "step": 51219 + }, + { + "epoch": 2.384663733500943, + "grad_norm": 0.32755599096828425, + "learning_rate": 1.2273704104152733e-05, + "loss": 2.6663, + "step": 51220 + }, + { + "epoch": 2.384710291687036, + "grad_norm": 0.3523061183605163, + "learning_rate": 1.2271926496304403e-05, + "loss": 2.6423, + "step": 51221 + }, + { + "epoch": 2.384756849873129, + "grad_norm": 0.32469095185155644, + "learning_rate": 1.2270148999184228e-05, + "loss": 2.6105, + "step": 51222 + }, + { + "epoch": 2.384803408059222, + "grad_norm": 0.31341800455096175, + "learning_rate": 1.2268371612797475e-05, + "loss": 2.5748, + "step": 51223 + }, + { + "epoch": 2.384849966245315, + "grad_norm": 0.3300247750168922, + "learning_rate": 1.2266594337149322e-05, + "loss": 2.597, + "step": 51224 + }, + { + "epoch": 2.384896524431408, + "grad_norm": 0.3310471028792169, + "learning_rate": 1.2264817172244997e-05, + "loss": 2.6254, + "step": 51225 + }, + { + "epoch": 2.384943082617501, + "grad_norm": 0.3452736496515238, + "learning_rate": 1.2263040118089714e-05, + "loss": 2.6157, + "step": 51226 + }, + { + "epoch": 2.384989640803594, + "grad_norm": 0.3104210250314136, + "learning_rate": 1.2261263174688691e-05, + "loss": 2.6353, + "step": 51227 + }, + { + "epoch": 2.3850361989896873, + "grad_norm": 0.3183915519808085, + "learning_rate": 1.2259486342047144e-05, + "loss": 2.6118, + "step": 51228 + }, + { + "epoch": 2.3850827571757804, + "grad_norm": 0.3415111547636108, + "learning_rate": 1.22577096201703e-05, + "loss": 2.5833, + "step": 51229 + }, + { + "epoch": 2.3851293153618736, + "grad_norm": 0.3462225349568584, + "learning_rate": 1.2255933009063347e-05, + "loss": 2.7128, + "step": 51230 + }, + { + "epoch": 2.3851758735479667, + "grad_norm": 0.3150414789537342, + "learning_rate": 1.2254156508731512e-05, + "loss": 2.5993, + "step": 51231 + }, + { + "epoch": 
2.38522243173406, + "grad_norm": 0.3287344752348597, + "learning_rate": 1.2252380119180007e-05, + "loss": 2.5584, + "step": 51232 + }, + { + "epoch": 2.385268989920153, + "grad_norm": 0.33964458773838113, + "learning_rate": 1.225060384041405e-05, + "loss": 2.6409, + "step": 51233 + }, + { + "epoch": 2.385315548106246, + "grad_norm": 0.32915355820551373, + "learning_rate": 1.2248827672438868e-05, + "loss": 2.6349, + "step": 51234 + }, + { + "epoch": 2.3853621062923387, + "grad_norm": 0.32371063564245534, + "learning_rate": 1.2247051615259624e-05, + "loss": 2.6373, + "step": 51235 + }, + { + "epoch": 2.385408664478432, + "grad_norm": 0.3366430509931205, + "learning_rate": 1.2245275668881596e-05, + "loss": 2.5214, + "step": 51236 + }, + { + "epoch": 2.385455222664525, + "grad_norm": 0.3517665285667521, + "learning_rate": 1.2243499833309946e-05, + "loss": 2.6037, + "step": 51237 + }, + { + "epoch": 2.385501780850618, + "grad_norm": 0.3136767416644816, + "learning_rate": 1.2241724108549907e-05, + "loss": 2.5338, + "step": 51238 + }, + { + "epoch": 2.385548339036711, + "grad_norm": 0.31821228202539276, + "learning_rate": 1.223994849460669e-05, + "loss": 2.7564, + "step": 51239 + }, + { + "epoch": 2.3855948972228043, + "grad_norm": 0.3394502056495219, + "learning_rate": 1.2238172991485503e-05, + "loss": 2.6593, + "step": 51240 + }, + { + "epoch": 2.3856414554088974, + "grad_norm": 0.3256001521167139, + "learning_rate": 1.2236397599191556e-05, + "loss": 2.5527, + "step": 51241 + }, + { + "epoch": 2.3856880135949905, + "grad_norm": 0.2963896708998471, + "learning_rate": 1.2234622317730082e-05, + "loss": 2.6277, + "step": 51242 + }, + { + "epoch": 2.385734571781083, + "grad_norm": 0.34196795518551415, + "learning_rate": 1.2232847147106253e-05, + "loss": 2.5896, + "step": 51243 + }, + { + "epoch": 2.3857811299671763, + "grad_norm": 0.3070081161108051, + "learning_rate": 1.2231072087325296e-05, + "loss": 2.5778, + "step": 51244 + }, + { + "epoch": 2.3858276881532694, + "grad_norm": 0.33201551109541394, + "learning_rate": 1.222929713839242e-05, + "loss": 2.6904, + "step": 51245 + }, + { + "epoch": 2.3858742463393625, + "grad_norm": 0.3138732455129525, + "learning_rate": 1.2227522300312837e-05, + "loss": 2.5439, + "step": 51246 + }, + { + "epoch": 2.3859208045254556, + "grad_norm": 0.3154233193903909, + "learning_rate": 1.2225747573091773e-05, + "loss": 2.7282, + "step": 51247 + }, + { + "epoch": 2.3859673627115487, + "grad_norm": 0.3087550172149653, + "learning_rate": 1.222397295673438e-05, + "loss": 2.5753, + "step": 51248 + }, + { + "epoch": 2.386013920897642, + "grad_norm": 0.31938202565441437, + "learning_rate": 1.2222198451245941e-05, + "loss": 2.6968, + "step": 51249 + }, + { + "epoch": 2.386060479083735, + "grad_norm": 0.32503516638351476, + "learning_rate": 1.2220424056631603e-05, + "loss": 2.5839, + "step": 51250 + }, + { + "epoch": 2.386107037269828, + "grad_norm": 0.3106397265915949, + "learning_rate": 1.2218649772896623e-05, + "loss": 2.6363, + "step": 51251 + }, + { + "epoch": 2.386153595455921, + "grad_norm": 0.32508571630886707, + "learning_rate": 1.2216875600046168e-05, + "loss": 2.666, + "step": 51252 + }, + { + "epoch": 2.3862001536420143, + "grad_norm": 0.3193191491930202, + "learning_rate": 1.2215101538085461e-05, + "loss": 2.5813, + "step": 51253 + }, + { + "epoch": 2.386246711828107, + "grad_norm": 0.33383393712662063, + "learning_rate": 1.2213327587019707e-05, + "loss": 2.5982, + "step": 51254 + }, + { + "epoch": 2.3862932700142, + "grad_norm": 0.3276462549351813, + 
"learning_rate": 1.2211553746854132e-05, + "loss": 2.6787, + "step": 51255 + }, + { + "epoch": 2.3863398282002932, + "grad_norm": 0.3299693671244791, + "learning_rate": 1.2209780017593908e-05, + "loss": 2.6083, + "step": 51256 + }, + { + "epoch": 2.3863863863863863, + "grad_norm": 0.3217160758635624, + "learning_rate": 1.2208006399244259e-05, + "loss": 2.6922, + "step": 51257 + }, + { + "epoch": 2.3864329445724795, + "grad_norm": 0.3279325351338928, + "learning_rate": 1.2206232891810388e-05, + "loss": 2.6928, + "step": 51258 + }, + { + "epoch": 2.3864795027585726, + "grad_norm": 0.3187236190502484, + "learning_rate": 1.2204459495297494e-05, + "loss": 2.691, + "step": 51259 + }, + { + "epoch": 2.3865260609446657, + "grad_norm": 0.3096810020860444, + "learning_rate": 1.2202686209710807e-05, + "loss": 2.5811, + "step": 51260 + }, + { + "epoch": 2.386572619130759, + "grad_norm": 0.30602720270681116, + "learning_rate": 1.2200913035055483e-05, + "loss": 2.5809, + "step": 51261 + }, + { + "epoch": 2.386619177316852, + "grad_norm": 0.3152734692911068, + "learning_rate": 1.2199139971336787e-05, + "loss": 2.6677, + "step": 51262 + }, + { + "epoch": 2.3866657355029446, + "grad_norm": 0.2941989971782951, + "learning_rate": 1.2197367018559858e-05, + "loss": 2.546, + "step": 51263 + }, + { + "epoch": 2.3867122936890377, + "grad_norm": 0.31498366317119325, + "learning_rate": 1.2195594176729963e-05, + "loss": 2.5888, + "step": 51264 + }, + { + "epoch": 2.386758851875131, + "grad_norm": 0.3372319981472138, + "learning_rate": 1.219382144585226e-05, + "loss": 2.6458, + "step": 51265 + }, + { + "epoch": 2.386805410061224, + "grad_norm": 0.3113004265899531, + "learning_rate": 1.2192048825931968e-05, + "loss": 2.6342, + "step": 51266 + }, + { + "epoch": 2.386851968247317, + "grad_norm": 0.29081148025850445, + "learning_rate": 1.2190276316974286e-05, + "loss": 2.6372, + "step": 51267 + }, + { + "epoch": 2.38689852643341, + "grad_norm": 0.3231493286085821, + "learning_rate": 1.2188503918984417e-05, + "loss": 2.6882, + "step": 51268 + }, + { + "epoch": 2.3869450846195033, + "grad_norm": 0.3323124534534692, + "learning_rate": 1.2186731631967585e-05, + "loss": 2.5537, + "step": 51269 + }, + { + "epoch": 2.3869916428055964, + "grad_norm": 0.310671402393029, + "learning_rate": 1.2184959455928952e-05, + "loss": 2.6514, + "step": 51270 + }, + { + "epoch": 2.3870382009916895, + "grad_norm": 0.30252849764116657, + "learning_rate": 1.2183187390873735e-05, + "loss": 2.5879, + "step": 51271 + }, + { + "epoch": 2.3870847591777826, + "grad_norm": 0.30867468409372356, + "learning_rate": 1.2181415436807142e-05, + "loss": 2.6174, + "step": 51272 + }, + { + "epoch": 2.3871313173638757, + "grad_norm": 0.2969443186646864, + "learning_rate": 1.217964359373438e-05, + "loss": 2.6123, + "step": 51273 + }, + { + "epoch": 2.3871778755499684, + "grad_norm": 0.3231173213231847, + "learning_rate": 1.2177871861660606e-05, + "loss": 2.682, + "step": 51274 + }, + { + "epoch": 2.3872244337360615, + "grad_norm": 0.32506177200595193, + "learning_rate": 1.2176100240591088e-05, + "loss": 2.7041, + "step": 51275 + }, + { + "epoch": 2.3872709919221546, + "grad_norm": 0.29238079957010593, + "learning_rate": 1.217432873053096e-05, + "loss": 2.587, + "step": 51276 + }, + { + "epoch": 2.3873175501082478, + "grad_norm": 0.3287280379180989, + "learning_rate": 1.2172557331485473e-05, + "loss": 2.5762, + "step": 51277 + }, + { + "epoch": 2.387364108294341, + "grad_norm": 0.31250666856427867, + "learning_rate": 1.2170786043459797e-05, + "loss": 2.6514, + 
"step": 51278 + }, + { + "epoch": 2.387410666480434, + "grad_norm": 0.3273643500964347, + "learning_rate": 1.2169014866459133e-05, + "loss": 2.6371, + "step": 51279 + }, + { + "epoch": 2.387457224666527, + "grad_norm": 0.3285175971168998, + "learning_rate": 1.2167243800488682e-05, + "loss": 2.6852, + "step": 51280 + }, + { + "epoch": 2.38750378285262, + "grad_norm": 0.30971549542306775, + "learning_rate": 1.2165472845553644e-05, + "loss": 2.616, + "step": 51281 + }, + { + "epoch": 2.387550341038713, + "grad_norm": 0.3261653052386019, + "learning_rate": 1.2163702001659233e-05, + "loss": 2.625, + "step": 51282 + }, + { + "epoch": 2.387596899224806, + "grad_norm": 0.33050586627085005, + "learning_rate": 1.2161931268810611e-05, + "loss": 2.6629, + "step": 51283 + }, + { + "epoch": 2.387643457410899, + "grad_norm": 0.3143926914774678, + "learning_rate": 1.2160160647012996e-05, + "loss": 2.6738, + "step": 51284 + }, + { + "epoch": 2.3876900155969922, + "grad_norm": 0.3304175619983524, + "learning_rate": 1.2158390136271585e-05, + "loss": 2.5987, + "step": 51285 + }, + { + "epoch": 2.3877365737830853, + "grad_norm": 0.31273190879664103, + "learning_rate": 1.2156619736591562e-05, + "loss": 2.6853, + "step": 51286 + }, + { + "epoch": 2.3877831319691785, + "grad_norm": 0.3092792153742128, + "learning_rate": 1.215484944797814e-05, + "loss": 2.6182, + "step": 51287 + }, + { + "epoch": 2.3878296901552716, + "grad_norm": 0.3170277773367165, + "learning_rate": 1.2153079270436519e-05, + "loss": 2.6898, + "step": 51288 + }, + { + "epoch": 2.3878762483413647, + "grad_norm": 0.30684379306367743, + "learning_rate": 1.215130920397185e-05, + "loss": 2.5575, + "step": 51289 + }, + { + "epoch": 2.387922806527458, + "grad_norm": 0.3442964364893454, + "learning_rate": 1.2149539248589392e-05, + "loss": 2.581, + "step": 51290 + }, + { + "epoch": 2.387969364713551, + "grad_norm": 0.30591710277215717, + "learning_rate": 1.214776940429429e-05, + "loss": 2.6684, + "step": 51291 + }, + { + "epoch": 2.388015922899644, + "grad_norm": 0.30762190460724625, + "learning_rate": 1.2145999671091757e-05, + "loss": 2.6923, + "step": 51292 + }, + { + "epoch": 2.3880624810857367, + "grad_norm": 0.3312810570831098, + "learning_rate": 1.2144230048986987e-05, + "loss": 2.6675, + "step": 51293 + }, + { + "epoch": 2.38810903927183, + "grad_norm": 0.34946346034191217, + "learning_rate": 1.2142460537985167e-05, + "loss": 2.7158, + "step": 51294 + }, + { + "epoch": 2.388155597457923, + "grad_norm": 0.3134674941706328, + "learning_rate": 1.2140691138091515e-05, + "loss": 2.7131, + "step": 51295 + }, + { + "epoch": 2.388202155644016, + "grad_norm": 0.3193370007167193, + "learning_rate": 1.213892184931119e-05, + "loss": 2.6002, + "step": 51296 + }, + { + "epoch": 2.388248713830109, + "grad_norm": 0.3017702461453864, + "learning_rate": 1.2137152671649399e-05, + "loss": 2.6336, + "step": 51297 + }, + { + "epoch": 2.3882952720162023, + "grad_norm": 0.30863780386264766, + "learning_rate": 1.213538360511134e-05, + "loss": 2.6145, + "step": 51298 + }, + { + "epoch": 2.3883418302022954, + "grad_norm": 0.3320019715211217, + "learning_rate": 1.213361464970219e-05, + "loss": 2.6366, + "step": 51299 + }, + { + "epoch": 2.3883883883883885, + "grad_norm": 0.3135065094301463, + "learning_rate": 1.2131845805427156e-05, + "loss": 2.6601, + "step": 51300 + }, + { + "epoch": 2.3884349465744816, + "grad_norm": 0.31148613105744427, + "learning_rate": 1.2130077072291441e-05, + "loss": 2.6223, + "step": 51301 + }, + { + "epoch": 2.3884815047605743, + "grad_norm": 
0.29972910501708366, + "learning_rate": 1.2128308450300185e-05, + "loss": 2.6359, + "step": 51302 + }, + { + "epoch": 2.3885280629466674, + "grad_norm": 0.3184493735597144, + "learning_rate": 1.2126539939458643e-05, + "loss": 2.6444, + "step": 51303 + }, + { + "epoch": 2.3885746211327605, + "grad_norm": 0.3335392140502371, + "learning_rate": 1.2124771539771963e-05, + "loss": 2.6172, + "step": 51304 + }, + { + "epoch": 2.3886211793188536, + "grad_norm": 0.3427005984242259, + "learning_rate": 1.2123003251245341e-05, + "loss": 2.5769, + "step": 51305 + }, + { + "epoch": 2.3886677375049468, + "grad_norm": 0.3164167125530186, + "learning_rate": 1.212123507388398e-05, + "loss": 2.6184, + "step": 51306 + }, + { + "epoch": 2.38871429569104, + "grad_norm": 0.30892508071800207, + "learning_rate": 1.2119467007693058e-05, + "loss": 2.6852, + "step": 51307 + }, + { + "epoch": 2.388760853877133, + "grad_norm": 0.31549306945874334, + "learning_rate": 1.2117699052677783e-05, + "loss": 2.539, + "step": 51308 + }, + { + "epoch": 2.388807412063226, + "grad_norm": 0.33473220392497594, + "learning_rate": 1.2115931208843317e-05, + "loss": 2.6305, + "step": 51309 + }, + { + "epoch": 2.3888539702493192, + "grad_norm": 0.32524226235723247, + "learning_rate": 1.2114163476194863e-05, + "loss": 2.669, + "step": 51310 + }, + { + "epoch": 2.3889005284354123, + "grad_norm": 0.3187343183366732, + "learning_rate": 1.2112395854737602e-05, + "loss": 2.5073, + "step": 51311 + }, + { + "epoch": 2.3889470866215055, + "grad_norm": 0.298223639553626, + "learning_rate": 1.2110628344476726e-05, + "loss": 2.6457, + "step": 51312 + }, + { + "epoch": 2.388993644807598, + "grad_norm": 0.3167221278053682, + "learning_rate": 1.2108860945417427e-05, + "loss": 2.6539, + "step": 51313 + }, + { + "epoch": 2.3890402029936912, + "grad_norm": 0.3317185319968656, + "learning_rate": 1.21070936575649e-05, + "loss": 2.6894, + "step": 51314 + }, + { + "epoch": 2.3890867611797844, + "grad_norm": 0.2974272878085977, + "learning_rate": 1.2105326480924295e-05, + "loss": 2.6748, + "step": 51315 + }, + { + "epoch": 2.3891333193658775, + "grad_norm": 0.3274147460267856, + "learning_rate": 1.2103559415500848e-05, + "loss": 2.6511, + "step": 51316 + }, + { + "epoch": 2.3891798775519706, + "grad_norm": 0.30423109746489335, + "learning_rate": 1.210179246129971e-05, + "loss": 2.5615, + "step": 51317 + }, + { + "epoch": 2.3892264357380637, + "grad_norm": 0.3291291102097964, + "learning_rate": 1.2100025618326072e-05, + "loss": 2.6561, + "step": 51318 + }, + { + "epoch": 2.389272993924157, + "grad_norm": 0.3136522443692044, + "learning_rate": 1.209825888658513e-05, + "loss": 2.6044, + "step": 51319 + }, + { + "epoch": 2.38931955211025, + "grad_norm": 0.3239501854377503, + "learning_rate": 1.2096492266082066e-05, + "loss": 2.6697, + "step": 51320 + }, + { + "epoch": 2.389366110296343, + "grad_norm": 0.31779110701887064, + "learning_rate": 1.2094725756822073e-05, + "loss": 2.6001, + "step": 51321 + }, + { + "epoch": 2.3894126684824357, + "grad_norm": 0.31647147926182806, + "learning_rate": 1.2092959358810296e-05, + "loss": 2.6668, + "step": 51322 + }, + { + "epoch": 2.389459226668529, + "grad_norm": 0.3317850323550015, + "learning_rate": 1.2091193072051981e-05, + "loss": 2.7348, + "step": 51323 + }, + { + "epoch": 2.389505784854622, + "grad_norm": 0.3085803883432502, + "learning_rate": 1.2089426896552265e-05, + "loss": 2.6172, + "step": 51324 + }, + { + "epoch": 2.389552343040715, + "grad_norm": 0.3255615106654891, + "learning_rate": 1.2087660832316344e-05, + 
"loss": 2.6652, + "step": 51325 + }, + { + "epoch": 2.389598901226808, + "grad_norm": 0.31483076273853666, + "learning_rate": 1.2085894879349408e-05, + "loss": 2.6467, + "step": 51326 + }, + { + "epoch": 2.3896454594129013, + "grad_norm": 0.31474419676785004, + "learning_rate": 1.2084129037656645e-05, + "loss": 2.6506, + "step": 51327 + }, + { + "epoch": 2.3896920175989944, + "grad_norm": 0.32857267346447105, + "learning_rate": 1.2082363307243199e-05, + "loss": 2.6281, + "step": 51328 + }, + { + "epoch": 2.3897385757850875, + "grad_norm": 0.33612198309990954, + "learning_rate": 1.2080597688114309e-05, + "loss": 2.6113, + "step": 51329 + }, + { + "epoch": 2.3897851339711806, + "grad_norm": 0.32422360227677227, + "learning_rate": 1.2078832180275118e-05, + "loss": 2.5908, + "step": 51330 + }, + { + "epoch": 2.3898316921572738, + "grad_norm": 0.3268884181567952, + "learning_rate": 1.2077066783730823e-05, + "loss": 2.6857, + "step": 51331 + }, + { + "epoch": 2.3898782503433664, + "grad_norm": 0.3150829548812088, + "learning_rate": 1.2075301498486591e-05, + "loss": 2.6542, + "step": 51332 + }, + { + "epoch": 2.3899248085294595, + "grad_norm": 0.3129414778352943, + "learning_rate": 1.2073536324547618e-05, + "loss": 2.6144, + "step": 51333 + }, + { + "epoch": 2.3899713667155527, + "grad_norm": 0.3330917815989138, + "learning_rate": 1.2071771261919095e-05, + "loss": 2.5983, + "step": 51334 + }, + { + "epoch": 2.3900179249016458, + "grad_norm": 0.33619782827453826, + "learning_rate": 1.2070006310606158e-05, + "loss": 2.6926, + "step": 51335 + }, + { + "epoch": 2.390064483087739, + "grad_norm": 0.31098739524672725, + "learning_rate": 1.2068241470614044e-05, + "loss": 2.6874, + "step": 51336 + }, + { + "epoch": 2.390111041273832, + "grad_norm": 0.30538234239119216, + "learning_rate": 1.2066476741947892e-05, + "loss": 2.5081, + "step": 51337 + }, + { + "epoch": 2.390157599459925, + "grad_norm": 0.3220246883514325, + "learning_rate": 1.2064712124612886e-05, + "loss": 2.5655, + "step": 51338 + }, + { + "epoch": 2.3902041576460182, + "grad_norm": 0.3270797471512887, + "learning_rate": 1.2062947618614217e-05, + "loss": 2.6843, + "step": 51339 + }, + { + "epoch": 2.3902507158321114, + "grad_norm": 0.33618357984513375, + "learning_rate": 1.2061183223957062e-05, + "loss": 2.7009, + "step": 51340 + }, + { + "epoch": 2.390297274018204, + "grad_norm": 0.33328493849461843, + "learning_rate": 1.2059418940646594e-05, + "loss": 2.6527, + "step": 51341 + }, + { + "epoch": 2.390343832204297, + "grad_norm": 0.34363731441935536, + "learning_rate": 1.205765476868801e-05, + "loss": 2.6055, + "step": 51342 + }, + { + "epoch": 2.3903903903903903, + "grad_norm": 0.3284559596882162, + "learning_rate": 1.2055890708086454e-05, + "loss": 2.6771, + "step": 51343 + }, + { + "epoch": 2.3904369485764834, + "grad_norm": 0.3349299326977157, + "learning_rate": 1.2054126758847124e-05, + "loss": 2.6127, + "step": 51344 + }, + { + "epoch": 2.3904835067625765, + "grad_norm": 0.31621955961502474, + "learning_rate": 1.2052362920975191e-05, + "loss": 2.6185, + "step": 51345 + }, + { + "epoch": 2.3905300649486696, + "grad_norm": 0.3097386450502885, + "learning_rate": 1.2050599194475837e-05, + "loss": 2.5585, + "step": 51346 + }, + { + "epoch": 2.3905766231347627, + "grad_norm": 0.3399402689767664, + "learning_rate": 1.2048835579354245e-05, + "loss": 2.6444, + "step": 51347 + }, + { + "epoch": 2.390623181320856, + "grad_norm": 0.33851147958912137, + "learning_rate": 1.2047072075615556e-05, + "loss": 2.6483, + "step": 51348 + }, + { + "epoch": 
2.390669739506949, + "grad_norm": 0.33471993149352675, + "learning_rate": 1.2045308683265005e-05, + "loss": 2.7663, + "step": 51349 + }, + { + "epoch": 2.390716297693042, + "grad_norm": 0.3001051147245617, + "learning_rate": 1.2043545402307716e-05, + "loss": 2.5033, + "step": 51350 + }, + { + "epoch": 2.390762855879135, + "grad_norm": 0.32240917243351763, + "learning_rate": 1.2041782232748877e-05, + "loss": 2.6148, + "step": 51351 + }, + { + "epoch": 2.390809414065228, + "grad_norm": 0.3133903670179401, + "learning_rate": 1.204001917459367e-05, + "loss": 2.6417, + "step": 51352 + }, + { + "epoch": 2.390855972251321, + "grad_norm": 0.32395377674786646, + "learning_rate": 1.2038256227847273e-05, + "loss": 2.6501, + "step": 51353 + }, + { + "epoch": 2.390902530437414, + "grad_norm": 0.31945356070162334, + "learning_rate": 1.2036493392514847e-05, + "loss": 2.579, + "step": 51354 + }, + { + "epoch": 2.390949088623507, + "grad_norm": 0.2996200541579251, + "learning_rate": 1.203473066860159e-05, + "loss": 2.6409, + "step": 51355 + }, + { + "epoch": 2.3909956468096003, + "grad_norm": 0.3263172271773704, + "learning_rate": 1.2032968056112642e-05, + "loss": 2.6847, + "step": 51356 + }, + { + "epoch": 2.3910422049956934, + "grad_norm": 0.3282498722763939, + "learning_rate": 1.2031205555053199e-05, + "loss": 2.7474, + "step": 51357 + }, + { + "epoch": 2.3910887631817865, + "grad_norm": 0.32977892279517296, + "learning_rate": 1.2029443165428423e-05, + "loss": 2.6543, + "step": 51358 + }, + { + "epoch": 2.3911353213678797, + "grad_norm": 0.29591099501362506, + "learning_rate": 1.2027680887243487e-05, + "loss": 2.5953, + "step": 51359 + }, + { + "epoch": 2.3911818795539728, + "grad_norm": 0.29482177639447005, + "learning_rate": 1.2025918720503588e-05, + "loss": 2.5576, + "step": 51360 + }, + { + "epoch": 2.3912284377400654, + "grad_norm": 0.317940627858883, + "learning_rate": 1.2024156665213843e-05, + "loss": 2.6183, + "step": 51361 + }, + { + "epoch": 2.3912749959261586, + "grad_norm": 0.3470498369089462, + "learning_rate": 1.2022394721379481e-05, + "loss": 2.666, + "step": 51362 + }, + { + "epoch": 2.3913215541122517, + "grad_norm": 0.3396542880367157, + "learning_rate": 1.202063288900563e-05, + "loss": 2.6715, + "step": 51363 + }, + { + "epoch": 2.391368112298345, + "grad_norm": 0.3249202724192012, + "learning_rate": 1.2018871168097507e-05, + "loss": 2.6372, + "step": 51364 + }, + { + "epoch": 2.391414670484438, + "grad_norm": 0.3506698233881331, + "learning_rate": 1.2017109558660233e-05, + "loss": 2.6642, + "step": 51365 + }, + { + "epoch": 2.391461228670531, + "grad_norm": 0.32817735614042837, + "learning_rate": 1.201534806069901e-05, + "loss": 2.5815, + "step": 51366 + }, + { + "epoch": 2.391507786856624, + "grad_norm": 0.3048334093104192, + "learning_rate": 1.2013586674218991e-05, + "loss": 2.6182, + "step": 51367 + }, + { + "epoch": 2.3915543450427172, + "grad_norm": 0.3264065243580575, + "learning_rate": 1.2011825399225374e-05, + "loss": 2.5843, + "step": 51368 + }, + { + "epoch": 2.3916009032288104, + "grad_norm": 0.3385788051225246, + "learning_rate": 1.201006423572329e-05, + "loss": 2.5959, + "step": 51369 + }, + { + "epoch": 2.3916474614149035, + "grad_norm": 0.31995099338430055, + "learning_rate": 1.2008303183717922e-05, + "loss": 2.5695, + "step": 51370 + }, + { + "epoch": 2.3916940196009966, + "grad_norm": 0.3143737560908059, + "learning_rate": 1.2006542243214441e-05, + "loss": 2.5882, + "step": 51371 + }, + { + "epoch": 2.3917405777870893, + "grad_norm": 0.34091886787243625, + 
"learning_rate": 1.2004781414218019e-05, + "loss": 2.6604, + "step": 51372 + }, + { + "epoch": 2.3917871359731824, + "grad_norm": 0.34227399281117254, + "learning_rate": 1.2003020696733835e-05, + "loss": 2.6188, + "step": 51373 + }, + { + "epoch": 2.3918336941592755, + "grad_norm": 0.32883659848656477, + "learning_rate": 1.2001260090767014e-05, + "loss": 2.5176, + "step": 51374 + }, + { + "epoch": 2.3918802523453686, + "grad_norm": 0.3205926609712903, + "learning_rate": 1.1999499596322777e-05, + "loss": 2.566, + "step": 51375 + }, + { + "epoch": 2.3919268105314617, + "grad_norm": 0.31590806375183217, + "learning_rate": 1.1997739213406235e-05, + "loss": 2.5539, + "step": 51376 + }, + { + "epoch": 2.391973368717555, + "grad_norm": 0.3126499576715284, + "learning_rate": 1.1995978942022618e-05, + "loss": 2.6738, + "step": 51377 + }, + { + "epoch": 2.392019926903648, + "grad_norm": 0.31296300896224755, + "learning_rate": 1.1994218782177041e-05, + "loss": 2.6906, + "step": 51378 + }, + { + "epoch": 2.392066485089741, + "grad_norm": 0.32961922504240726, + "learning_rate": 1.199245873387469e-05, + "loss": 2.6889, + "step": 51379 + }, + { + "epoch": 2.3921130432758337, + "grad_norm": 0.3337410708454449, + "learning_rate": 1.1990698797120726e-05, + "loss": 2.6395, + "step": 51380 + }, + { + "epoch": 2.392159601461927, + "grad_norm": 0.3346579999435878, + "learning_rate": 1.1988938971920315e-05, + "loss": 2.629, + "step": 51381 + }, + { + "epoch": 2.39220615964802, + "grad_norm": 0.3353212517144815, + "learning_rate": 1.1987179258278642e-05, + "loss": 2.6216, + "step": 51382 + }, + { + "epoch": 2.392252717834113, + "grad_norm": 0.3354637231547665, + "learning_rate": 1.1985419656200836e-05, + "loss": 2.6104, + "step": 51383 + }, + { + "epoch": 2.392299276020206, + "grad_norm": 0.34968668630435096, + "learning_rate": 1.198366016569208e-05, + "loss": 2.6737, + "step": 51384 + }, + { + "epoch": 2.3923458342062993, + "grad_norm": 0.3328365532094515, + "learning_rate": 1.1981900786757533e-05, + "loss": 2.617, + "step": 51385 + }, + { + "epoch": 2.3923923923923924, + "grad_norm": 0.34694206635578956, + "learning_rate": 1.1980141519402376e-05, + "loss": 2.7034, + "step": 51386 + }, + { + "epoch": 2.3924389505784855, + "grad_norm": 0.32536460181513654, + "learning_rate": 1.1978382363631735e-05, + "loss": 2.6987, + "step": 51387 + }, + { + "epoch": 2.3924855087645787, + "grad_norm": 0.33547852666047007, + "learning_rate": 1.197662331945082e-05, + "loss": 2.6448, + "step": 51388 + }, + { + "epoch": 2.3925320669506718, + "grad_norm": 0.3236140504972807, + "learning_rate": 1.1974864386864742e-05, + "loss": 2.5805, + "step": 51389 + }, + { + "epoch": 2.392578625136765, + "grad_norm": 0.31657574963471286, + "learning_rate": 1.1973105565878717e-05, + "loss": 2.5412, + "step": 51390 + }, + { + "epoch": 2.3926251833228576, + "grad_norm": 0.31557992193082, + "learning_rate": 1.1971346856497872e-05, + "loss": 2.5846, + "step": 51391 + }, + { + "epoch": 2.3926717415089507, + "grad_norm": 0.3312972978798853, + "learning_rate": 1.1969588258727376e-05, + "loss": 2.6611, + "step": 51392 + }, + { + "epoch": 2.392718299695044, + "grad_norm": 0.31546202322026146, + "learning_rate": 1.1967829772572387e-05, + "loss": 2.585, + "step": 51393 + }, + { + "epoch": 2.392764857881137, + "grad_norm": 0.3223860430293879, + "learning_rate": 1.1966071398038076e-05, + "loss": 2.5656, + "step": 51394 + }, + { + "epoch": 2.39281141606723, + "grad_norm": 0.34796155996891104, + "learning_rate": 1.1964313135129613e-05, + "loss": 2.62, + "step": 
51395 + }, + { + "epoch": 2.392857974253323, + "grad_norm": 0.3410051093697219, + "learning_rate": 1.1962554983852125e-05, + "loss": 2.6383, + "step": 51396 + }, + { + "epoch": 2.3929045324394163, + "grad_norm": 0.3166890046002745, + "learning_rate": 1.1960796944210794e-05, + "loss": 2.6931, + "step": 51397 + }, + { + "epoch": 2.3929510906255094, + "grad_norm": 0.32158641109899627, + "learning_rate": 1.1959039016210778e-05, + "loss": 2.6917, + "step": 51398 + }, + { + "epoch": 2.3929976488116025, + "grad_norm": 0.31874048119865456, + "learning_rate": 1.1957281199857234e-05, + "loss": 2.6432, + "step": 51399 + }, + { + "epoch": 2.393044206997695, + "grad_norm": 0.33205917636191806, + "learning_rate": 1.1955523495155319e-05, + "loss": 2.6657, + "step": 51400 + }, + { + "epoch": 2.3930907651837883, + "grad_norm": 0.31060868133099423, + "learning_rate": 1.1953765902110209e-05, + "loss": 2.4832, + "step": 51401 + }, + { + "epoch": 2.3931373233698814, + "grad_norm": 0.33565240576203875, + "learning_rate": 1.1952008420727023e-05, + "loss": 2.5898, + "step": 51402 + }, + { + "epoch": 2.3931838815559745, + "grad_norm": 0.3348030251969981, + "learning_rate": 1.195025105101097e-05, + "loss": 2.6443, + "step": 51403 + }, + { + "epoch": 2.3932304397420676, + "grad_norm": 0.3136920072855391, + "learning_rate": 1.194849379296717e-05, + "loss": 2.6409, + "step": 51404 + }, + { + "epoch": 2.3932769979281607, + "grad_norm": 0.30493704354790635, + "learning_rate": 1.1946736646600787e-05, + "loss": 2.6166, + "step": 51405 + }, + { + "epoch": 2.393323556114254, + "grad_norm": 0.32435278588460886, + "learning_rate": 1.1944979611916985e-05, + "loss": 2.6241, + "step": 51406 + }, + { + "epoch": 2.393370114300347, + "grad_norm": 0.32309949579047503, + "learning_rate": 1.1943222688920918e-05, + "loss": 2.6787, + "step": 51407 + }, + { + "epoch": 2.39341667248644, + "grad_norm": 0.3010244586515728, + "learning_rate": 1.1941465877617764e-05, + "loss": 2.5489, + "step": 51408 + }, + { + "epoch": 2.393463230672533, + "grad_norm": 0.32536824882992893, + "learning_rate": 1.1939709178012637e-05, + "loss": 2.5974, + "step": 51409 + }, + { + "epoch": 2.3935097888586263, + "grad_norm": 0.3215250155542131, + "learning_rate": 1.1937952590110717e-05, + "loss": 2.6244, + "step": 51410 + }, + { + "epoch": 2.393556347044719, + "grad_norm": 0.33234573439458537, + "learning_rate": 1.1936196113917153e-05, + "loss": 2.5962, + "step": 51411 + }, + { + "epoch": 2.393602905230812, + "grad_norm": 0.3344866078378723, + "learning_rate": 1.193443974943711e-05, + "loss": 2.6102, + "step": 51412 + }, + { + "epoch": 2.393649463416905, + "grad_norm": 0.3324878023145684, + "learning_rate": 1.1932683496675728e-05, + "loss": 2.6387, + "step": 51413 + }, + { + "epoch": 2.3936960216029983, + "grad_norm": 0.3182811040706568, + "learning_rate": 1.1930927355638189e-05, + "loss": 2.5086, + "step": 51414 + }, + { + "epoch": 2.3937425797890914, + "grad_norm": 0.3383707555689061, + "learning_rate": 1.1929171326329602e-05, + "loss": 2.6604, + "step": 51415 + }, + { + "epoch": 2.3937891379751846, + "grad_norm": 0.3255943492917084, + "learning_rate": 1.1927415408755171e-05, + "loss": 2.5879, + "step": 51416 + }, + { + "epoch": 2.3938356961612777, + "grad_norm": 0.35194689299320553, + "learning_rate": 1.1925659602919998e-05, + "loss": 2.7397, + "step": 51417 + }, + { + "epoch": 2.393882254347371, + "grad_norm": 0.3228263892420072, + "learning_rate": 1.1923903908829287e-05, + "loss": 2.5837, + "step": 51418 + }, + { + "epoch": 2.3939288125334635, + "grad_norm": 
0.31006344688818344, + "learning_rate": 1.1922148326488159e-05, + "loss": 2.6765, + "step": 51419 + }, + { + "epoch": 2.3939753707195566, + "grad_norm": 0.322792742146812, + "learning_rate": 1.1920392855901768e-05, + "loss": 2.6067, + "step": 51420 + }, + { + "epoch": 2.3940219289056497, + "grad_norm": 0.359061777384805, + "learning_rate": 1.191863749707529e-05, + "loss": 2.6736, + "step": 51421 + }, + { + "epoch": 2.394068487091743, + "grad_norm": 0.31662774614051403, + "learning_rate": 1.191688225001385e-05, + "loss": 2.5431, + "step": 51422 + }, + { + "epoch": 2.394115045277836, + "grad_norm": 0.294402305779888, + "learning_rate": 1.1915127114722602e-05, + "loss": 2.5979, + "step": 51423 + }, + { + "epoch": 2.394161603463929, + "grad_norm": 0.31894016346225296, + "learning_rate": 1.1913372091206709e-05, + "loss": 2.6923, + "step": 51424 + }, + { + "epoch": 2.394208161650022, + "grad_norm": 0.31296150958642605, + "learning_rate": 1.1911617179471312e-05, + "loss": 2.6528, + "step": 51425 + }, + { + "epoch": 2.3942547198361153, + "grad_norm": 0.32902524093677804, + "learning_rate": 1.1909862379521569e-05, + "loss": 2.6948, + "step": 51426 + }, + { + "epoch": 2.3943012780222084, + "grad_norm": 0.3073062531872966, + "learning_rate": 1.1908107691362642e-05, + "loss": 2.6048, + "step": 51427 + }, + { + "epoch": 2.3943478362083015, + "grad_norm": 0.3548769459256006, + "learning_rate": 1.1906353114999635e-05, + "loss": 2.5392, + "step": 51428 + }, + { + "epoch": 2.3943943943943946, + "grad_norm": 0.3075153557255205, + "learning_rate": 1.190459865043776e-05, + "loss": 2.6826, + "step": 51429 + }, + { + "epoch": 2.3944409525804873, + "grad_norm": 0.31705117759685547, + "learning_rate": 1.1902844297682104e-05, + "loss": 2.6717, + "step": 51430 + }, + { + "epoch": 2.3944875107665804, + "grad_norm": 0.30929511410579424, + "learning_rate": 1.190109005673788e-05, + "loss": 2.7216, + "step": 51431 + }, + { + "epoch": 2.3945340689526735, + "grad_norm": 0.32181807602248225, + "learning_rate": 1.1899335927610184e-05, + "loss": 2.7252, + "step": 51432 + }, + { + "epoch": 2.3945806271387666, + "grad_norm": 0.3151761255501715, + "learning_rate": 1.1897581910304185e-05, + "loss": 2.7139, + "step": 51433 + }, + { + "epoch": 2.3946271853248597, + "grad_norm": 0.3003260854400038, + "learning_rate": 1.1895828004825044e-05, + "loss": 2.608, + "step": 51434 + }, + { + "epoch": 2.394673743510953, + "grad_norm": 0.3096417770133619, + "learning_rate": 1.1894074211177864e-05, + "loss": 2.6767, + "step": 51435 + }, + { + "epoch": 2.394720301697046, + "grad_norm": 0.32887375193817625, + "learning_rate": 1.1892320529367851e-05, + "loss": 2.549, + "step": 51436 + }, + { + "epoch": 2.394766859883139, + "grad_norm": 0.3055577021303797, + "learning_rate": 1.1890566959400112e-05, + "loss": 2.6252, + "step": 51437 + }, + { + "epoch": 2.394813418069232, + "grad_norm": 0.31235205646082714, + "learning_rate": 1.18888135012798e-05, + "loss": 2.6432, + "step": 51438 + }, + { + "epoch": 2.394859976255325, + "grad_norm": 0.31702273817382604, + "learning_rate": 1.188706015501207e-05, + "loss": 2.5642, + "step": 51439 + }, + { + "epoch": 2.394906534441418, + "grad_norm": 0.3198814628260282, + "learning_rate": 1.1885306920602074e-05, + "loss": 2.7033, + "step": 51440 + }, + { + "epoch": 2.394953092627511, + "grad_norm": 0.33906465694842297, + "learning_rate": 1.1883553798054925e-05, + "loss": 2.6393, + "step": 51441 + }, + { + "epoch": 2.394999650813604, + "grad_norm": 0.314012812789888, + "learning_rate": 1.1881800787375812e-05, + 
"loss": 2.5763, + "step": 51442 + }, + { + "epoch": 2.3950462089996973, + "grad_norm": 0.33204882017414783, + "learning_rate": 1.1880047888569834e-05, + "loss": 2.7126, + "step": 51443 + }, + { + "epoch": 2.3950927671857904, + "grad_norm": 0.3047385666491145, + "learning_rate": 1.1878295101642184e-05, + "loss": 2.6787, + "step": 51444 + }, + { + "epoch": 2.3951393253718836, + "grad_norm": 0.3130827921089433, + "learning_rate": 1.1876542426597975e-05, + "loss": 2.6148, + "step": 51445 + }, + { + "epoch": 2.3951858835579767, + "grad_norm": 0.32575352114719136, + "learning_rate": 1.1874789863442354e-05, + "loss": 2.7086, + "step": 51446 + }, + { + "epoch": 2.39523244174407, + "grad_norm": 0.30596621916508066, + "learning_rate": 1.1873037412180482e-05, + "loss": 2.4441, + "step": 51447 + }, + { + "epoch": 2.395278999930163, + "grad_norm": 0.3290879128842448, + "learning_rate": 1.1871285072817462e-05, + "loss": 2.5938, + "step": 51448 + }, + { + "epoch": 2.395325558116256, + "grad_norm": 0.3102510302336442, + "learning_rate": 1.1869532845358495e-05, + "loss": 2.6159, + "step": 51449 + }, + { + "epoch": 2.3953721163023487, + "grad_norm": 0.31147626640850323, + "learning_rate": 1.1867780729808675e-05, + "loss": 2.551, + "step": 51450 + }, + { + "epoch": 2.395418674488442, + "grad_norm": 0.30989051464377, + "learning_rate": 1.1866028726173161e-05, + "loss": 2.6201, + "step": 51451 + }, + { + "epoch": 2.395465232674535, + "grad_norm": 0.3021354450162027, + "learning_rate": 1.18642768344571e-05, + "loss": 2.6759, + "step": 51452 + }, + { + "epoch": 2.395511790860628, + "grad_norm": 0.3021803763900126, + "learning_rate": 1.1862525054665625e-05, + "loss": 2.5526, + "step": 51453 + }, + { + "epoch": 2.395558349046721, + "grad_norm": 0.3114924753025101, + "learning_rate": 1.1860773386803887e-05, + "loss": 2.7114, + "step": 51454 + }, + { + "epoch": 2.3956049072328143, + "grad_norm": 0.32745297007921387, + "learning_rate": 1.1859021830877032e-05, + "loss": 2.6978, + "step": 51455 + }, + { + "epoch": 2.3956514654189074, + "grad_norm": 0.3215086229044568, + "learning_rate": 1.1857270386890163e-05, + "loss": 2.6595, + "step": 51456 + }, + { + "epoch": 2.3956980236050005, + "grad_norm": 0.3001601535161303, + "learning_rate": 1.1855519054848474e-05, + "loss": 2.5479, + "step": 51457 + }, + { + "epoch": 2.395744581791093, + "grad_norm": 0.3152790698104052, + "learning_rate": 1.1853767834757068e-05, + "loss": 2.5771, + "step": 51458 + }, + { + "epoch": 2.3957911399771863, + "grad_norm": 0.32587122849085376, + "learning_rate": 1.1852016726621096e-05, + "loss": 2.7493, + "step": 51459 + }, + { + "epoch": 2.3958376981632794, + "grad_norm": 0.31384133038542067, + "learning_rate": 1.185026573044571e-05, + "loss": 2.5681, + "step": 51460 + }, + { + "epoch": 2.3958842563493725, + "grad_norm": 0.31518340560196095, + "learning_rate": 1.1848514846236003e-05, + "loss": 2.6015, + "step": 51461 + }, + { + "epoch": 2.3959308145354656, + "grad_norm": 0.30435836505202446, + "learning_rate": 1.1846764073997179e-05, + "loss": 2.4752, + "step": 51462 + }, + { + "epoch": 2.3959773727215588, + "grad_norm": 0.3185821821716385, + "learning_rate": 1.1845013413734329e-05, + "loss": 2.5359, + "step": 51463 + }, + { + "epoch": 2.396023930907652, + "grad_norm": 0.31656561132679234, + "learning_rate": 1.1843262865452604e-05, + "loss": 2.7157, + "step": 51464 + }, + { + "epoch": 2.396070489093745, + "grad_norm": 0.3382953894461821, + "learning_rate": 1.1841512429157142e-05, + "loss": 2.6012, + "step": 51465 + }, + { + "epoch": 
2.396117047279838, + "grad_norm": 0.33021046175235735, + "learning_rate": 1.1839762104853086e-05, + "loss": 2.6937, + "step": 51466 + }, + { + "epoch": 2.396163605465931, + "grad_norm": 0.3371431072318668, + "learning_rate": 1.183801189254557e-05, + "loss": 2.62, + "step": 51467 + }, + { + "epoch": 2.3962101636520243, + "grad_norm": 0.3175578342908248, + "learning_rate": 1.1836261792239738e-05, + "loss": 2.5824, + "step": 51468 + }, + { + "epoch": 2.396256721838117, + "grad_norm": 0.3273793622615083, + "learning_rate": 1.1834511803940695e-05, + "loss": 2.6131, + "step": 51469 + }, + { + "epoch": 2.39630328002421, + "grad_norm": 0.33307921830060433, + "learning_rate": 1.1832761927653624e-05, + "loss": 2.5753, + "step": 51470 + }, + { + "epoch": 2.3963498382103032, + "grad_norm": 0.3321259962951649, + "learning_rate": 1.1831012163383626e-05, + "loss": 2.6248, + "step": 51471 + }, + { + "epoch": 2.3963963963963963, + "grad_norm": 0.32362282698588835, + "learning_rate": 1.1829262511135847e-05, + "loss": 2.5967, + "step": 51472 + }, + { + "epoch": 2.3964429545824895, + "grad_norm": 0.3180560585442157, + "learning_rate": 1.1827512970915433e-05, + "loss": 2.5416, + "step": 51473 + }, + { + "epoch": 2.3964895127685826, + "grad_norm": 0.34263343779344807, + "learning_rate": 1.182576354272748e-05, + "loss": 2.6929, + "step": 51474 + }, + { + "epoch": 2.3965360709546757, + "grad_norm": 0.3425486202297813, + "learning_rate": 1.1824014226577179e-05, + "loss": 2.6731, + "step": 51475 + }, + { + "epoch": 2.396582629140769, + "grad_norm": 0.32299978370318033, + "learning_rate": 1.182226502246962e-05, + "loss": 2.6797, + "step": 51476 + }, + { + "epoch": 2.396629187326862, + "grad_norm": 0.32326667791413394, + "learning_rate": 1.1820515930409953e-05, + "loss": 2.7235, + "step": 51477 + }, + { + "epoch": 2.3966757455129546, + "grad_norm": 0.29992671506886615, + "learning_rate": 1.1818766950403315e-05, + "loss": 2.6219, + "step": 51478 + }, + { + "epoch": 2.3967223036990477, + "grad_norm": 0.3156667515137296, + "learning_rate": 1.1817018082454829e-05, + "loss": 2.5486, + "step": 51479 + }, + { + "epoch": 2.396768861885141, + "grad_norm": 0.32634640462620657, + "learning_rate": 1.1815269326569633e-05, + "loss": 2.631, + "step": 51480 + }, + { + "epoch": 2.396815420071234, + "grad_norm": 0.32842957061495803, + "learning_rate": 1.1813520682752877e-05, + "loss": 2.491, + "step": 51481 + }, + { + "epoch": 2.396861978257327, + "grad_norm": 0.303824551540554, + "learning_rate": 1.1811772151009648e-05, + "loss": 2.6558, + "step": 51482 + }, + { + "epoch": 2.39690853644342, + "grad_norm": 0.29044749634352873, + "learning_rate": 1.1810023731345133e-05, + "loss": 2.5845, + "step": 51483 + }, + { + "epoch": 2.3969550946295133, + "grad_norm": 0.3413223532005198, + "learning_rate": 1.180827542376442e-05, + "loss": 2.6218, + "step": 51484 + }, + { + "epoch": 2.3970016528156064, + "grad_norm": 0.3204042606168585, + "learning_rate": 1.1806527228272657e-05, + "loss": 2.6436, + "step": 51485 + }, + { + "epoch": 2.3970482110016995, + "grad_norm": 0.316253170566204, + "learning_rate": 1.1804779144874989e-05, + "loss": 2.6691, + "step": 51486 + }, + { + "epoch": 2.3970947691877926, + "grad_norm": 0.3233625150372295, + "learning_rate": 1.18030311735765e-05, + "loss": 2.6176, + "step": 51487 + }, + { + "epoch": 2.3971413273738857, + "grad_norm": 0.3140578147447349, + "learning_rate": 1.1801283314382384e-05, + "loss": 2.6375, + "step": 51488 + }, + { + "epoch": 2.3971878855599784, + "grad_norm": 0.30310978836697255, + 
"learning_rate": 1.179953556729771e-05, + "loss": 2.5968, + "step": 51489 + }, + { + "epoch": 2.3972344437460715, + "grad_norm": 0.3138426550267204, + "learning_rate": 1.179778793232766e-05, + "loss": 2.5687, + "step": 51490 + }, + { + "epoch": 2.3972810019321646, + "grad_norm": 0.31473409058872925, + "learning_rate": 1.1796040409477327e-05, + "loss": 2.5946, + "step": 51491 + }, + { + "epoch": 2.3973275601182578, + "grad_norm": 0.28836449502353195, + "learning_rate": 1.1794292998751849e-05, + "loss": 2.5698, + "step": 51492 + }, + { + "epoch": 2.397374118304351, + "grad_norm": 0.3069997603831814, + "learning_rate": 1.179254570015636e-05, + "loss": 2.6581, + "step": 51493 + }, + { + "epoch": 2.397420676490444, + "grad_norm": 0.31068827300432966, + "learning_rate": 1.1790798513696e-05, + "loss": 2.6416, + "step": 51494 + }, + { + "epoch": 2.397467234676537, + "grad_norm": 0.3113427366657313, + "learning_rate": 1.1789051439375854e-05, + "loss": 2.5929, + "step": 51495 + }, + { + "epoch": 2.39751379286263, + "grad_norm": 0.3046615167844393, + "learning_rate": 1.1787304477201106e-05, + "loss": 2.5311, + "step": 51496 + }, + { + "epoch": 2.397560351048723, + "grad_norm": 0.30271151116567885, + "learning_rate": 1.178555762717684e-05, + "loss": 2.6303, + "step": 51497 + }, + { + "epoch": 2.397606909234816, + "grad_norm": 0.2964097562275954, + "learning_rate": 1.1783810889308195e-05, + "loss": 2.67, + "step": 51498 + }, + { + "epoch": 2.397653467420909, + "grad_norm": 0.31174576220793937, + "learning_rate": 1.1782064263600318e-05, + "loss": 2.6201, + "step": 51499 + }, + { + "epoch": 2.3977000256070022, + "grad_norm": 0.3002595270057868, + "learning_rate": 1.1780317750058284e-05, + "loss": 2.5798, + "step": 51500 + }, + { + "epoch": 2.3977465837930954, + "grad_norm": 0.30141565652955005, + "learning_rate": 1.1778571348687284e-05, + "loss": 2.6336, + "step": 51501 + }, + { + "epoch": 2.3977931419791885, + "grad_norm": 0.32661156244601, + "learning_rate": 1.1776825059492386e-05, + "loss": 2.5545, + "step": 51502 + }, + { + "epoch": 2.3978397001652816, + "grad_norm": 0.3260358211133219, + "learning_rate": 1.1775078882478769e-05, + "loss": 2.6161, + "step": 51503 + }, + { + "epoch": 2.3978862583513747, + "grad_norm": 0.30466234909613243, + "learning_rate": 1.1773332817651511e-05, + "loss": 2.5796, + "step": 51504 + }, + { + "epoch": 2.397932816537468, + "grad_norm": 0.3187442193957155, + "learning_rate": 1.1771586865015754e-05, + "loss": 2.5357, + "step": 51505 + }, + { + "epoch": 2.397979374723561, + "grad_norm": 0.33105148943981766, + "learning_rate": 1.1769841024576628e-05, + "loss": 2.6388, + "step": 51506 + }, + { + "epoch": 2.398025932909654, + "grad_norm": 0.32778574928742393, + "learning_rate": 1.1768095296339248e-05, + "loss": 2.7241, + "step": 51507 + }, + { + "epoch": 2.3980724910957467, + "grad_norm": 0.3222635265863738, + "learning_rate": 1.1766349680308747e-05, + "loss": 2.7086, + "step": 51508 + }, + { + "epoch": 2.39811904928184, + "grad_norm": 0.32204068803536556, + "learning_rate": 1.1764604176490257e-05, + "loss": 2.6227, + "step": 51509 + }, + { + "epoch": 2.398165607467933, + "grad_norm": 0.33117985940688266, + "learning_rate": 1.176285878488887e-05, + "loss": 2.6897, + "step": 51510 + }, + { + "epoch": 2.398212165654026, + "grad_norm": 0.32394408007638, + "learning_rate": 1.1761113505509724e-05, + "loss": 2.6932, + "step": 51511 + }, + { + "epoch": 2.398258723840119, + "grad_norm": 0.3360965896068381, + "learning_rate": 1.1759368338357962e-05, + "loss": 2.7053, + "step": 51512 + 
}, + { + "epoch": 2.3983052820262123, + "grad_norm": 0.3520563972655796, + "learning_rate": 1.1757623283438651e-05, + "loss": 2.615, + "step": 51513 + }, + { + "epoch": 2.3983518402123054, + "grad_norm": 0.3091316727177162, + "learning_rate": 1.1755878340756983e-05, + "loss": 2.6827, + "step": 51514 + }, + { + "epoch": 2.3983983983983985, + "grad_norm": 0.3301701990855155, + "learning_rate": 1.1754133510318016e-05, + "loss": 2.5656, + "step": 51515 + }, + { + "epoch": 2.3984449565844916, + "grad_norm": 0.35365565419724804, + "learning_rate": 1.1752388792126923e-05, + "loss": 2.6414, + "step": 51516 + }, + { + "epoch": 2.3984915147705843, + "grad_norm": 0.3137059779164127, + "learning_rate": 1.1750644186188792e-05, + "loss": 2.5099, + "step": 51517 + }, + { + "epoch": 2.3985380729566774, + "grad_norm": 0.31541279011997975, + "learning_rate": 1.1748899692508746e-05, + "loss": 2.6565, + "step": 51518 + }, + { + "epoch": 2.3985846311427705, + "grad_norm": 0.32162489142170225, + "learning_rate": 1.1747155311091917e-05, + "loss": 2.5547, + "step": 51519 + }, + { + "epoch": 2.3986311893288637, + "grad_norm": 0.32829927174772, + "learning_rate": 1.1745411041943416e-05, + "loss": 2.6419, + "step": 51520 + }, + { + "epoch": 2.3986777475149568, + "grad_norm": 0.31211476171573044, + "learning_rate": 1.1743666885068361e-05, + "loss": 2.6119, + "step": 51521 + }, + { + "epoch": 2.39872430570105, + "grad_norm": 0.3157198805281611, + "learning_rate": 1.1741922840471896e-05, + "loss": 2.6596, + "step": 51522 + }, + { + "epoch": 2.398770863887143, + "grad_norm": 0.3126471526338853, + "learning_rate": 1.17401789081591e-05, + "loss": 2.6549, + "step": 51523 + }, + { + "epoch": 2.398817422073236, + "grad_norm": 0.33844442469742525, + "learning_rate": 1.1738435088135113e-05, + "loss": 2.6206, + "step": 51524 + }, + { + "epoch": 2.3988639802593292, + "grad_norm": 0.3158371541102027, + "learning_rate": 1.1736691380405046e-05, + "loss": 2.7108, + "step": 51525 + }, + { + "epoch": 2.3989105384454223, + "grad_norm": 0.3228711243079514, + "learning_rate": 1.1734947784974022e-05, + "loss": 2.7834, + "step": 51526 + }, + { + "epoch": 2.3989570966315155, + "grad_norm": 0.3183673856549131, + "learning_rate": 1.1733204301847168e-05, + "loss": 2.5381, + "step": 51527 + }, + { + "epoch": 2.399003654817608, + "grad_norm": 0.30733140250230195, + "learning_rate": 1.1731460931029564e-05, + "loss": 2.5688, + "step": 51528 + }, + { + "epoch": 2.3990502130037012, + "grad_norm": 0.32100087923496134, + "learning_rate": 1.172971767252638e-05, + "loss": 2.6562, + "step": 51529 + }, + { + "epoch": 2.3990967711897944, + "grad_norm": 0.3104890481141306, + "learning_rate": 1.172797452634269e-05, + "loss": 2.6363, + "step": 51530 + }, + { + "epoch": 2.3991433293758875, + "grad_norm": 0.3114123489941215, + "learning_rate": 1.1726231492483625e-05, + "loss": 2.6156, + "step": 51531 + }, + { + "epoch": 2.3991898875619806, + "grad_norm": 0.32437704794076255, + "learning_rate": 1.1724488570954301e-05, + "loss": 2.656, + "step": 51532 + }, + { + "epoch": 2.3992364457480737, + "grad_norm": 0.31992496297533607, + "learning_rate": 1.1722745761759828e-05, + "loss": 2.5151, + "step": 51533 + }, + { + "epoch": 2.399283003934167, + "grad_norm": 0.3114267627728025, + "learning_rate": 1.1721003064905328e-05, + "loss": 2.5782, + "step": 51534 + }, + { + "epoch": 2.39932956212026, + "grad_norm": 0.30798867213061715, + "learning_rate": 1.1719260480395927e-05, + "loss": 2.6124, + "step": 51535 + }, + { + "epoch": 2.399376120306353, + "grad_norm": 
0.31596763929451466, + "learning_rate": 1.1717518008236711e-05, + "loss": 2.6246, + "step": 51536 + }, + { + "epoch": 2.3994226784924457, + "grad_norm": 0.30012376357532294, + "learning_rate": 1.1715775648432809e-05, + "loss": 2.6553, + "step": 51537 + }, + { + "epoch": 2.399469236678539, + "grad_norm": 0.3218889686343569, + "learning_rate": 1.1714033400989333e-05, + "loss": 2.6509, + "step": 51538 + }, + { + "epoch": 2.399515794864632, + "grad_norm": 0.3192950083624976, + "learning_rate": 1.1712291265911396e-05, + "loss": 2.594, + "step": 51539 + }, + { + "epoch": 2.399562353050725, + "grad_norm": 0.30304470347206247, + "learning_rate": 1.1710549243204128e-05, + "loss": 2.5411, + "step": 51540 + }, + { + "epoch": 2.399608911236818, + "grad_norm": 0.3006429315566748, + "learning_rate": 1.1708807332872601e-05, + "loss": 2.5608, + "step": 51541 + }, + { + "epoch": 2.3996554694229113, + "grad_norm": 0.30628310659612407, + "learning_rate": 1.1707065534921973e-05, + "loss": 2.6432, + "step": 51542 + }, + { + "epoch": 2.3997020276090044, + "grad_norm": 0.31972876469861616, + "learning_rate": 1.1705323849357313e-05, + "loss": 2.6223, + "step": 51543 + }, + { + "epoch": 2.3997485857950975, + "grad_norm": 0.3091245440667364, + "learning_rate": 1.1703582276183778e-05, + "loss": 2.6041, + "step": 51544 + }, + { + "epoch": 2.3997951439811906, + "grad_norm": 0.3142976431835867, + "learning_rate": 1.1701840815406444e-05, + "loss": 2.6636, + "step": 51545 + }, + { + "epoch": 2.3998417021672838, + "grad_norm": 0.3151391986352103, + "learning_rate": 1.1700099467030434e-05, + "loss": 2.5529, + "step": 51546 + }, + { + "epoch": 2.399888260353377, + "grad_norm": 0.3223528251973554, + "learning_rate": 1.1698358231060864e-05, + "loss": 2.6141, + "step": 51547 + }, + { + "epoch": 2.3999348185394695, + "grad_norm": 0.314983420225666, + "learning_rate": 1.1696617107502849e-05, + "loss": 2.6307, + "step": 51548 + }, + { + "epoch": 2.3999813767255627, + "grad_norm": 0.31031111645500187, + "learning_rate": 1.1694876096361473e-05, + "loss": 2.6933, + "step": 51549 + }, + { + "epoch": 2.400027934911656, + "grad_norm": 0.31291258824604307, + "learning_rate": 1.1693135197641864e-05, + "loss": 2.59, + "step": 51550 + }, + { + "epoch": 2.400074493097749, + "grad_norm": 0.31574349570750615, + "learning_rate": 1.1691394411349133e-05, + "loss": 2.5545, + "step": 51551 + }, + { + "epoch": 2.400121051283842, + "grad_norm": 0.3291750816666458, + "learning_rate": 1.1689653737488383e-05, + "loss": 2.6672, + "step": 51552 + }, + { + "epoch": 2.400167609469935, + "grad_norm": 0.3260865517465668, + "learning_rate": 1.1687913176064735e-05, + "loss": 2.5449, + "step": 51553 + }, + { + "epoch": 2.4002141676560282, + "grad_norm": 0.3222975276826512, + "learning_rate": 1.1686172727083262e-05, + "loss": 2.6703, + "step": 51554 + }, + { + "epoch": 2.4002607258421214, + "grad_norm": 0.33539747253515895, + "learning_rate": 1.1684432390549127e-05, + "loss": 2.6365, + "step": 51555 + }, + { + "epoch": 2.400307284028214, + "grad_norm": 0.3349432625534646, + "learning_rate": 1.1682692166467379e-05, + "loss": 2.6505, + "step": 51556 + }, + { + "epoch": 2.400353842214307, + "grad_norm": 0.33377363606970223, + "learning_rate": 1.168095205484318e-05, + "loss": 2.6515, + "step": 51557 + }, + { + "epoch": 2.4004004004004003, + "grad_norm": 0.3242807548291276, + "learning_rate": 1.1679212055681604e-05, + "loss": 2.6734, + "step": 51558 + }, + { + "epoch": 2.4004469585864934, + "grad_norm": 0.3236297054612881, + "learning_rate": 1.1677472168987763e-05, + 
"loss": 2.5224, + "step": 51559 + }, + { + "epoch": 2.4004935167725865, + "grad_norm": 0.33454648791297764, + "learning_rate": 1.1675732394766764e-05, + "loss": 2.7675, + "step": 51560 + }, + { + "epoch": 2.4005400749586796, + "grad_norm": 0.30439667328778963, + "learning_rate": 1.1673992733023714e-05, + "loss": 2.6956, + "step": 51561 + }, + { + "epoch": 2.4005866331447727, + "grad_norm": 0.3091263792540896, + "learning_rate": 1.1672253183763737e-05, + "loss": 2.7244, + "step": 51562 + }, + { + "epoch": 2.400633191330866, + "grad_norm": 0.33977301735723925, + "learning_rate": 1.1670513746991907e-05, + "loss": 2.7121, + "step": 51563 + }, + { + "epoch": 2.400679749516959, + "grad_norm": 0.3193346626168103, + "learning_rate": 1.1668774422713335e-05, + "loss": 2.593, + "step": 51564 + }, + { + "epoch": 2.400726307703052, + "grad_norm": 0.3218974447040283, + "learning_rate": 1.1667035210933141e-05, + "loss": 2.6399, + "step": 51565 + }, + { + "epoch": 2.400772865889145, + "grad_norm": 0.31740838177502184, + "learning_rate": 1.1665296111656438e-05, + "loss": 2.5875, + "step": 51566 + }, + { + "epoch": 2.400819424075238, + "grad_norm": 0.303503998387533, + "learning_rate": 1.1663557124888286e-05, + "loss": 2.5113, + "step": 51567 + }, + { + "epoch": 2.400865982261331, + "grad_norm": 0.31951520292829894, + "learning_rate": 1.1661818250633839e-05, + "loss": 2.6566, + "step": 51568 + }, + { + "epoch": 2.400912540447424, + "grad_norm": 0.30469110231788443, + "learning_rate": 1.1660079488898157e-05, + "loss": 2.5823, + "step": 51569 + }, + { + "epoch": 2.400959098633517, + "grad_norm": 0.3131911689057891, + "learning_rate": 1.1658340839686387e-05, + "loss": 2.5741, + "step": 51570 + }, + { + "epoch": 2.4010056568196103, + "grad_norm": 0.31556476851752774, + "learning_rate": 1.1656602303003599e-05, + "loss": 2.6719, + "step": 51571 + }, + { + "epoch": 2.4010522150057034, + "grad_norm": 0.3241550690913588, + "learning_rate": 1.1654863878854904e-05, + "loss": 2.7019, + "step": 51572 + }, + { + "epoch": 2.4010987731917965, + "grad_norm": 0.2915653473224565, + "learning_rate": 1.165312556724541e-05, + "loss": 2.5834, + "step": 51573 + }, + { + "epoch": 2.4011453313778897, + "grad_norm": 0.31215541664807445, + "learning_rate": 1.165138736818021e-05, + "loss": 2.507, + "step": 51574 + }, + { + "epoch": 2.4011918895639828, + "grad_norm": 0.31903758321698605, + "learning_rate": 1.1649649281664427e-05, + "loss": 2.5792, + "step": 51575 + }, + { + "epoch": 2.4012384477500754, + "grad_norm": 0.3056308496840115, + "learning_rate": 1.164791130770313e-05, + "loss": 2.6003, + "step": 51576 + }, + { + "epoch": 2.4012850059361686, + "grad_norm": 0.3137279012030382, + "learning_rate": 1.1646173446301434e-05, + "loss": 2.6705, + "step": 51577 + }, + { + "epoch": 2.4013315641222617, + "grad_norm": 0.310959176720362, + "learning_rate": 1.164443569746444e-05, + "loss": 2.614, + "step": 51578 + }, + { + "epoch": 2.401378122308355, + "grad_norm": 0.3117077528065538, + "learning_rate": 1.1642698061197249e-05, + "loss": 2.5843, + "step": 51579 + }, + { + "epoch": 2.401424680494448, + "grad_norm": 0.31647899938673124, + "learning_rate": 1.1640960537504963e-05, + "loss": 2.6099, + "step": 51580 + }, + { + "epoch": 2.401471238680541, + "grad_norm": 0.3112334656018892, + "learning_rate": 1.163922312639269e-05, + "loss": 2.638, + "step": 51581 + }, + { + "epoch": 2.401517796866634, + "grad_norm": 0.3065177588012199, + "learning_rate": 1.163748582786549e-05, + "loss": 2.6835, + "step": 51582 + }, + { + "epoch": 2.4015643550527273, 
+ "grad_norm": 0.3284519557283217, + "learning_rate": 1.1635748641928518e-05, + "loss": 2.6367, + "step": 51583 + }, + { + "epoch": 2.4016109132388204, + "grad_norm": 0.31966647269036014, + "learning_rate": 1.1634011568586833e-05, + "loss": 2.5871, + "step": 51584 + }, + { + "epoch": 2.4016574714249135, + "grad_norm": 0.3195560166839694, + "learning_rate": 1.163227460784554e-05, + "loss": 2.6567, + "step": 51585 + }, + { + "epoch": 2.4017040296110066, + "grad_norm": 0.32736855904194245, + "learning_rate": 1.1630537759709747e-05, + "loss": 2.6177, + "step": 51586 + }, + { + "epoch": 2.4017505877970993, + "grad_norm": 0.3070190743888526, + "learning_rate": 1.1628801024184538e-05, + "loss": 2.6736, + "step": 51587 + }, + { + "epoch": 2.4017971459831924, + "grad_norm": 0.3250461475780048, + "learning_rate": 1.162706440127504e-05, + "loss": 2.6461, + "step": 51588 + }, + { + "epoch": 2.4018437041692855, + "grad_norm": 0.35342659634805873, + "learning_rate": 1.162532789098631e-05, + "loss": 2.6555, + "step": 51589 + }, + { + "epoch": 2.4018902623553786, + "grad_norm": 0.31819081922784326, + "learning_rate": 1.162359149332346e-05, + "loss": 2.7011, + "step": 51590 + }, + { + "epoch": 2.4019368205414717, + "grad_norm": 0.30852170269264206, + "learning_rate": 1.1621855208291588e-05, + "loss": 2.6372, + "step": 51591 + }, + { + "epoch": 2.401983378727565, + "grad_norm": 0.3285477948994253, + "learning_rate": 1.1620119035895794e-05, + "loss": 2.6125, + "step": 51592 + }, + { + "epoch": 2.402029936913658, + "grad_norm": 0.3192988310093521, + "learning_rate": 1.1618382976141168e-05, + "loss": 2.5935, + "step": 51593 + }, + { + "epoch": 2.402076495099751, + "grad_norm": 0.33959242051414235, + "learning_rate": 1.1616647029032818e-05, + "loss": 2.6494, + "step": 51594 + }, + { + "epoch": 2.4021230532858437, + "grad_norm": 0.32319256514426997, + "learning_rate": 1.1614911194575805e-05, + "loss": 2.6556, + "step": 51595 + }, + { + "epoch": 2.402169611471937, + "grad_norm": 0.31368993650939436, + "learning_rate": 1.1613175472775267e-05, + "loss": 2.6958, + "step": 51596 + }, + { + "epoch": 2.40221616965803, + "grad_norm": 0.32525647436738664, + "learning_rate": 1.161143986363627e-05, + "loss": 2.6472, + "step": 51597 + }, + { + "epoch": 2.402262727844123, + "grad_norm": 0.3219227616078956, + "learning_rate": 1.1609704367163915e-05, + "loss": 2.6043, + "step": 51598 + }, + { + "epoch": 2.402309286030216, + "grad_norm": 0.34532011794293455, + "learning_rate": 1.1607968983363287e-05, + "loss": 2.6496, + "step": 51599 + }, + { + "epoch": 2.4023558442163093, + "grad_norm": 0.30789456349462463, + "learning_rate": 1.1606233712239495e-05, + "loss": 2.7057, + "step": 51600 + }, + { + "epoch": 2.4024024024024024, + "grad_norm": 0.33901657928156426, + "learning_rate": 1.1604498553797639e-05, + "loss": 2.6566, + "step": 51601 + }, + { + "epoch": 2.4024489605884956, + "grad_norm": 0.32201510486116924, + "learning_rate": 1.160276350804278e-05, + "loss": 2.6962, + "step": 51602 + }, + { + "epoch": 2.4024955187745887, + "grad_norm": 0.3294206251740831, + "learning_rate": 1.1601028574980033e-05, + "loss": 2.6451, + "step": 51603 + }, + { + "epoch": 2.402542076960682, + "grad_norm": 0.32146731483784236, + "learning_rate": 1.1599293754614477e-05, + "loss": 2.6274, + "step": 51604 + }, + { + "epoch": 2.402588635146775, + "grad_norm": 0.31283026669917663, + "learning_rate": 1.1597559046951218e-05, + "loss": 2.6395, + "step": 51605 + }, + { + "epoch": 2.4026351933328676, + "grad_norm": 0.31824439953894496, + "learning_rate": 
1.1595824451995335e-05, + "loss": 2.6523, + "step": 51606 + }, + { + "epoch": 2.4026817515189607, + "grad_norm": 0.32088172039312474, + "learning_rate": 1.1594089969751937e-05, + "loss": 2.6579, + "step": 51607 + }, + { + "epoch": 2.402728309705054, + "grad_norm": 0.33122189767863885, + "learning_rate": 1.1592355600226074e-05, + "loss": 2.6165, + "step": 51608 + }, + { + "epoch": 2.402774867891147, + "grad_norm": 0.31389795120776215, + "learning_rate": 1.1590621343422891e-05, + "loss": 2.6445, + "step": 51609 + }, + { + "epoch": 2.40282142607724, + "grad_norm": 0.3153743983264365, + "learning_rate": 1.1588887199347442e-05, + "loss": 2.5691, + "step": 51610 + }, + { + "epoch": 2.402867984263333, + "grad_norm": 0.33135949403073095, + "learning_rate": 1.1587153168004816e-05, + "loss": 2.6659, + "step": 51611 + }, + { + "epoch": 2.4029145424494263, + "grad_norm": 0.3513320391502947, + "learning_rate": 1.158541924940012e-05, + "loss": 2.6093, + "step": 51612 + }, + { + "epoch": 2.4029611006355194, + "grad_norm": 0.33492504313711086, + "learning_rate": 1.1583685443538428e-05, + "loss": 2.7543, + "step": 51613 + }, + { + "epoch": 2.4030076588216125, + "grad_norm": 0.307148052769452, + "learning_rate": 1.1581951750424851e-05, + "loss": 2.6403, + "step": 51614 + }, + { + "epoch": 2.403054217007705, + "grad_norm": 0.31649689771513756, + "learning_rate": 1.1580218170064433e-05, + "loss": 2.6961, + "step": 51615 + }, + { + "epoch": 2.4031007751937983, + "grad_norm": 0.3160868263667592, + "learning_rate": 1.1578484702462316e-05, + "loss": 2.6401, + "step": 51616 + }, + { + "epoch": 2.4031473333798914, + "grad_norm": 0.3128039771652516, + "learning_rate": 1.157675134762355e-05, + "loss": 2.5708, + "step": 51617 + }, + { + "epoch": 2.4031938915659845, + "grad_norm": 0.31156942388659065, + "learning_rate": 1.1575018105553232e-05, + "loss": 2.5571, + "step": 51618 + }, + { + "epoch": 2.4032404497520776, + "grad_norm": 0.3381276969465815, + "learning_rate": 1.1573284976256449e-05, + "loss": 2.5641, + "step": 51619 + }, + { + "epoch": 2.4032870079381707, + "grad_norm": 0.3218176442631708, + "learning_rate": 1.1571551959738303e-05, + "loss": 2.6867, + "step": 51620 + }, + { + "epoch": 2.403333566124264, + "grad_norm": 0.2972323497019401, + "learning_rate": 1.1569819056003839e-05, + "loss": 2.5851, + "step": 51621 + }, + { + "epoch": 2.403380124310357, + "grad_norm": 0.3158280618292182, + "learning_rate": 1.1568086265058197e-05, + "loss": 2.6413, + "step": 51622 + }, + { + "epoch": 2.40342668249645, + "grad_norm": 0.32775499106697153, + "learning_rate": 1.1566353586906425e-05, + "loss": 2.5965, + "step": 51623 + }, + { + "epoch": 2.403473240682543, + "grad_norm": 0.3137390566166224, + "learning_rate": 1.1564621021553617e-05, + "loss": 2.6252, + "step": 51624 + }, + { + "epoch": 2.4035197988686363, + "grad_norm": 0.3377839449041374, + "learning_rate": 1.156288856900486e-05, + "loss": 2.6889, + "step": 51625 + }, + { + "epoch": 2.403566357054729, + "grad_norm": 0.3178364323228444, + "learning_rate": 1.156115622926524e-05, + "loss": 2.6416, + "step": 51626 + }, + { + "epoch": 2.403612915240822, + "grad_norm": 0.3439544033172301, + "learning_rate": 1.1559424002339853e-05, + "loss": 2.6561, + "step": 51627 + }, + { + "epoch": 2.403659473426915, + "grad_norm": 0.31938541992853514, + "learning_rate": 1.1557691888233734e-05, + "loss": 2.7472, + "step": 51628 + }, + { + "epoch": 2.4037060316130083, + "grad_norm": 0.3150952131997946, + "learning_rate": 1.1555959886952038e-05, + "loss": 2.6303, + "step": 51629 + }, + { + 
"epoch": 2.4037525897991014, + "grad_norm": 0.32061212345409873, + "learning_rate": 1.15542279984998e-05, + "loss": 2.5706, + "step": 51630 + }, + { + "epoch": 2.4037991479851946, + "grad_norm": 0.3254204516909163, + "learning_rate": 1.155249622288211e-05, + "loss": 2.6291, + "step": 51631 + }, + { + "epoch": 2.4038457061712877, + "grad_norm": 0.3060329654871379, + "learning_rate": 1.1550764560104061e-05, + "loss": 2.5735, + "step": 51632 + }, + { + "epoch": 2.403892264357381, + "grad_norm": 0.3254622873556181, + "learning_rate": 1.1549033010170724e-05, + "loss": 2.7082, + "step": 51633 + }, + { + "epoch": 2.4039388225434735, + "grad_norm": 0.3398425604469984, + "learning_rate": 1.1547301573087188e-05, + "loss": 2.5852, + "step": 51634 + }, + { + "epoch": 2.4039853807295666, + "grad_norm": 0.3270965725753805, + "learning_rate": 1.154557024885855e-05, + "loss": 2.6906, + "step": 51635 + }, + { + "epoch": 2.4040319389156597, + "grad_norm": 0.3311964221354921, + "learning_rate": 1.1543839037489862e-05, + "loss": 2.7206, + "step": 51636 + }, + { + "epoch": 2.404078497101753, + "grad_norm": 0.33056187800214004, + "learning_rate": 1.1542107938986219e-05, + "loss": 2.6632, + "step": 51637 + }, + { + "epoch": 2.404125055287846, + "grad_norm": 0.3203570199711878, + "learning_rate": 1.1540376953352694e-05, + "loss": 2.5994, + "step": 51638 + }, + { + "epoch": 2.404171613473939, + "grad_norm": 0.3174751981991304, + "learning_rate": 1.153864608059438e-05, + "loss": 2.6154, + "step": 51639 + }, + { + "epoch": 2.404218171660032, + "grad_norm": 0.32701046298841174, + "learning_rate": 1.1536915320716363e-05, + "loss": 2.7893, + "step": 51640 + }, + { + "epoch": 2.4042647298461253, + "grad_norm": 0.30888548133255855, + "learning_rate": 1.1535184673723681e-05, + "loss": 2.5485, + "step": 51641 + }, + { + "epoch": 2.4043112880322184, + "grad_norm": 0.3341246727050179, + "learning_rate": 1.1533454139621468e-05, + "loss": 2.6065, + "step": 51642 + }, + { + "epoch": 2.4043578462183115, + "grad_norm": 0.3222858364069977, + "learning_rate": 1.1531723718414767e-05, + "loss": 2.6593, + "step": 51643 + }, + { + "epoch": 2.4044044044044046, + "grad_norm": 0.3001512401501986, + "learning_rate": 1.152999341010867e-05, + "loss": 2.6271, + "step": 51644 + }, + { + "epoch": 2.4044509625904973, + "grad_norm": 0.33167511421404505, + "learning_rate": 1.1528263214708246e-05, + "loss": 2.719, + "step": 51645 + }, + { + "epoch": 2.4044975207765904, + "grad_norm": 0.33072429730638525, + "learning_rate": 1.1526533132218586e-05, + "loss": 2.6661, + "step": 51646 + }, + { + "epoch": 2.4045440789626835, + "grad_norm": 0.34297836036135704, + "learning_rate": 1.1524803162644754e-05, + "loss": 2.6882, + "step": 51647 + }, + { + "epoch": 2.4045906371487766, + "grad_norm": 0.33570257845957885, + "learning_rate": 1.1523073305991854e-05, + "loss": 2.6949, + "step": 51648 + }, + { + "epoch": 2.4046371953348697, + "grad_norm": 0.3109759142269981, + "learning_rate": 1.1521343562264924e-05, + "loss": 2.6157, + "step": 51649 + }, + { + "epoch": 2.404683753520963, + "grad_norm": 0.3317858743716701, + "learning_rate": 1.1519613931469064e-05, + "loss": 2.6157, + "step": 51650 + }, + { + "epoch": 2.404730311707056, + "grad_norm": 0.34244522395179167, + "learning_rate": 1.1517884413609347e-05, + "loss": 2.7248, + "step": 51651 + }, + { + "epoch": 2.404776869893149, + "grad_norm": 0.3202294222814523, + "learning_rate": 1.1516155008690848e-05, + "loss": 2.693, + "step": 51652 + }, + { + "epoch": 2.404823428079242, + "grad_norm": 0.3246151398788921, + 
"learning_rate": 1.1514425716718652e-05, + "loss": 2.627, + "step": 51653 + }, + { + "epoch": 2.404869986265335, + "grad_norm": 0.3167182634166136, + "learning_rate": 1.1512696537697804e-05, + "loss": 2.5802, + "step": 51654 + }, + { + "epoch": 2.404916544451428, + "grad_norm": 0.3320596922857945, + "learning_rate": 1.1510967471633422e-05, + "loss": 2.5883, + "step": 51655 + }, + { + "epoch": 2.404963102637521, + "grad_norm": 0.3375351271852105, + "learning_rate": 1.1509238518530535e-05, + "loss": 2.6766, + "step": 51656 + }, + { + "epoch": 2.4050096608236142, + "grad_norm": 0.3408380750019431, + "learning_rate": 1.1507509678394263e-05, + "loss": 2.5631, + "step": 51657 + }, + { + "epoch": 2.4050562190097073, + "grad_norm": 0.3387590958126004, + "learning_rate": 1.150578095122965e-05, + "loss": 2.6102, + "step": 51658 + }, + { + "epoch": 2.4051027771958005, + "grad_norm": 0.33504384528154196, + "learning_rate": 1.1504052337041776e-05, + "loss": 2.5954, + "step": 51659 + }, + { + "epoch": 2.4051493353818936, + "grad_norm": 0.3097600598964035, + "learning_rate": 1.1502323835835715e-05, + "loss": 2.5892, + "step": 51660 + }, + { + "epoch": 2.4051958935679867, + "grad_norm": 0.3424261389936734, + "learning_rate": 1.1500595447616552e-05, + "loss": 2.6842, + "step": 51661 + }, + { + "epoch": 2.40524245175408, + "grad_norm": 0.3202532378126225, + "learning_rate": 1.1498867172389339e-05, + "loss": 2.6397, + "step": 51662 + }, + { + "epoch": 2.405289009940173, + "grad_norm": 0.3155189333740799, + "learning_rate": 1.1497139010159158e-05, + "loss": 2.6752, + "step": 51663 + }, + { + "epoch": 2.405335568126266, + "grad_norm": 0.3093696395284515, + "learning_rate": 1.149541096093108e-05, + "loss": 2.5949, + "step": 51664 + }, + { + "epoch": 2.4053821263123587, + "grad_norm": 0.3394422931359689, + "learning_rate": 1.1493683024710179e-05, + "loss": 2.6417, + "step": 51665 + }, + { + "epoch": 2.405428684498452, + "grad_norm": 0.3025071173080621, + "learning_rate": 1.1491955201501537e-05, + "loss": 2.6175, + "step": 51666 + }, + { + "epoch": 2.405475242684545, + "grad_norm": 0.3136241137827249, + "learning_rate": 1.149022749131018e-05, + "loss": 2.694, + "step": 51667 + }, + { + "epoch": 2.405521800870638, + "grad_norm": 0.3177364724967634, + "learning_rate": 1.1488499894141246e-05, + "loss": 2.553, + "step": 51668 + }, + { + "epoch": 2.405568359056731, + "grad_norm": 0.3004387419912448, + "learning_rate": 1.148677240999974e-05, + "loss": 2.5862, + "step": 51669 + }, + { + "epoch": 2.4056149172428243, + "grad_norm": 0.3100745020909453, + "learning_rate": 1.148504503889079e-05, + "loss": 2.6194, + "step": 51670 + }, + { + "epoch": 2.4056614754289174, + "grad_norm": 0.33135968225739293, + "learning_rate": 1.1483317780819425e-05, + "loss": 2.6067, + "step": 51671 + }, + { + "epoch": 2.4057080336150105, + "grad_norm": 0.30740806716134367, + "learning_rate": 1.148159063579073e-05, + "loss": 2.6445, + "step": 51672 + }, + { + "epoch": 2.405754591801103, + "grad_norm": 0.2979200385808445, + "learning_rate": 1.1479863603809771e-05, + "loss": 2.5513, + "step": 51673 + }, + { + "epoch": 2.4058011499871963, + "grad_norm": 0.3278220583794259, + "learning_rate": 1.1478136684881618e-05, + "loss": 2.6534, + "step": 51674 + }, + { + "epoch": 2.4058477081732894, + "grad_norm": 0.31936353066913375, + "learning_rate": 1.1476409879011352e-05, + "loss": 2.6175, + "step": 51675 + }, + { + "epoch": 2.4058942663593825, + "grad_norm": 0.3158097301678302, + "learning_rate": 1.1474683186204016e-05, + "loss": 2.6103, + "step": 51676 
+ }, + { + "epoch": 2.4059408245454756, + "grad_norm": 0.30155085385178104, + "learning_rate": 1.1472956606464685e-05, + "loss": 2.6965, + "step": 51677 + }, + { + "epoch": 2.4059873827315688, + "grad_norm": 0.30444555874053086, + "learning_rate": 1.1471230139798434e-05, + "loss": 2.6114, + "step": 51678 + }, + { + "epoch": 2.406033940917662, + "grad_norm": 0.31382674245965175, + "learning_rate": 1.146950378621034e-05, + "loss": 2.6879, + "step": 51679 + }, + { + "epoch": 2.406080499103755, + "grad_norm": 0.3145641672993787, + "learning_rate": 1.1467777545705428e-05, + "loss": 2.6226, + "step": 51680 + }, + { + "epoch": 2.406127057289848, + "grad_norm": 0.32884921228810476, + "learning_rate": 1.146605141828882e-05, + "loss": 2.6715, + "step": 51681 + }, + { + "epoch": 2.406173615475941, + "grad_norm": 0.3097224088164527, + "learning_rate": 1.1464325403965527e-05, + "loss": 2.6272, + "step": 51682 + }, + { + "epoch": 2.4062201736620343, + "grad_norm": 0.32622431300152893, + "learning_rate": 1.1462599502740673e-05, + "loss": 2.6572, + "step": 51683 + }, + { + "epoch": 2.406266731848127, + "grad_norm": 0.3175556595794286, + "learning_rate": 1.1460873714619275e-05, + "loss": 2.642, + "step": 51684 + }, + { + "epoch": 2.40631329003422, + "grad_norm": 0.32630568838093854, + "learning_rate": 1.1459148039606422e-05, + "loss": 2.6051, + "step": 51685 + }, + { + "epoch": 2.4063598482203132, + "grad_norm": 0.3273295564862115, + "learning_rate": 1.145742247770717e-05, + "loss": 2.5664, + "step": 51686 + }, + { + "epoch": 2.4064064064064064, + "grad_norm": 0.30216046107849354, + "learning_rate": 1.1455697028926581e-05, + "loss": 2.6173, + "step": 51687 + }, + { + "epoch": 2.4064529645924995, + "grad_norm": 0.3244737260336071, + "learning_rate": 1.1453971693269743e-05, + "loss": 2.6854, + "step": 51688 + }, + { + "epoch": 2.4064995227785926, + "grad_norm": 0.3327735246838853, + "learning_rate": 1.1452246470741684e-05, + "loss": 2.6642, + "step": 51689 + }, + { + "epoch": 2.4065460809646857, + "grad_norm": 0.31493695216843137, + "learning_rate": 1.1450521361347488e-05, + "loss": 2.5932, + "step": 51690 + }, + { + "epoch": 2.406592639150779, + "grad_norm": 0.3203583196787557, + "learning_rate": 1.1448796365092212e-05, + "loss": 2.5102, + "step": 51691 + }, + { + "epoch": 2.406639197336872, + "grad_norm": 0.32006836292157387, + "learning_rate": 1.144707148198092e-05, + "loss": 2.6636, + "step": 51692 + }, + { + "epoch": 2.4066857555229646, + "grad_norm": 0.3220430625998221, + "learning_rate": 1.1445346712018673e-05, + "loss": 2.5938, + "step": 51693 + }, + { + "epoch": 2.4067323137090577, + "grad_norm": 0.31154773449465456, + "learning_rate": 1.1443622055210551e-05, + "loss": 2.704, + "step": 51694 + }, + { + "epoch": 2.406778871895151, + "grad_norm": 0.3181641283631945, + "learning_rate": 1.1441897511561572e-05, + "loss": 2.6101, + "step": 51695 + }, + { + "epoch": 2.406825430081244, + "grad_norm": 0.32530224171107264, + "learning_rate": 1.1440173081076854e-05, + "loss": 2.6689, + "step": 51696 + }, + { + "epoch": 2.406871988267337, + "grad_norm": 0.3140293835375991, + "learning_rate": 1.1438448763761416e-05, + "loss": 2.6285, + "step": 51697 + }, + { + "epoch": 2.40691854645343, + "grad_norm": 0.3242561908328176, + "learning_rate": 1.1436724559620332e-05, + "loss": 2.6305, + "step": 51698 + }, + { + "epoch": 2.4069651046395233, + "grad_norm": 0.32440027735175153, + "learning_rate": 1.1435000468658662e-05, + "loss": 2.6606, + "step": 51699 + }, + { + "epoch": 2.4070116628256164, + "grad_norm": 
0.3194998392618488, + "learning_rate": 1.1433276490881462e-05, + "loss": 2.4973, + "step": 51700 + }, + { + "epoch": 2.4070582210117095, + "grad_norm": 0.32641724797623, + "learning_rate": 1.1431552626293818e-05, + "loss": 2.6002, + "step": 51701 + }, + { + "epoch": 2.4071047791978026, + "grad_norm": 0.31266552294767613, + "learning_rate": 1.1429828874900745e-05, + "loss": 2.6458, + "step": 51702 + }, + { + "epoch": 2.4071513373838958, + "grad_norm": 0.3024243318846073, + "learning_rate": 1.1428105236707332e-05, + "loss": 2.6382, + "step": 51703 + }, + { + "epoch": 2.4071978955699884, + "grad_norm": 0.31018065325222943, + "learning_rate": 1.1426381711718625e-05, + "loss": 2.5809, + "step": 51704 + }, + { + "epoch": 2.4072444537560815, + "grad_norm": 0.338761566162338, + "learning_rate": 1.1424658299939689e-05, + "loss": 2.562, + "step": 51705 + }, + { + "epoch": 2.4072910119421747, + "grad_norm": 0.32467748069922225, + "learning_rate": 1.142293500137558e-05, + "loss": 2.6925, + "step": 51706 + }, + { + "epoch": 2.4073375701282678, + "grad_norm": 0.3449516316571224, + "learning_rate": 1.1421211816031374e-05, + "loss": 2.6284, + "step": 51707 + }, + { + "epoch": 2.407384128314361, + "grad_norm": 0.3330453022503346, + "learning_rate": 1.1419488743912077e-05, + "loss": 2.7421, + "step": 51708 + }, + { + "epoch": 2.407430686500454, + "grad_norm": 0.3272578095893339, + "learning_rate": 1.141776578502281e-05, + "loss": 2.6604, + "step": 51709 + }, + { + "epoch": 2.407477244686547, + "grad_norm": 0.32451720001663237, + "learning_rate": 1.1416042939368572e-05, + "loss": 2.5843, + "step": 51710 + }, + { + "epoch": 2.4075238028726402, + "grad_norm": 0.33973607758047714, + "learning_rate": 1.1414320206954476e-05, + "loss": 2.6492, + "step": 51711 + }, + { + "epoch": 2.4075703610587333, + "grad_norm": 0.3196428480179186, + "learning_rate": 1.1412597587785534e-05, + "loss": 2.6879, + "step": 51712 + }, + { + "epoch": 2.407616919244826, + "grad_norm": 0.32600969567708366, + "learning_rate": 1.1410875081866811e-05, + "loss": 2.6524, + "step": 51713 + }, + { + "epoch": 2.407663477430919, + "grad_norm": 0.33113623873213116, + "learning_rate": 1.140915268920339e-05, + "loss": 2.7136, + "step": 51714 + }, + { + "epoch": 2.4077100356170122, + "grad_norm": 0.3221022629771361, + "learning_rate": 1.1407430409800285e-05, + "loss": 2.6097, + "step": 51715 + }, + { + "epoch": 2.4077565938031054, + "grad_norm": 0.30284010477584067, + "learning_rate": 1.1405708243662571e-05, + "loss": 2.6524, + "step": 51716 + }, + { + "epoch": 2.4078031519891985, + "grad_norm": 0.3324676853320791, + "learning_rate": 1.1403986190795308e-05, + "loss": 2.7624, + "step": 51717 + }, + { + "epoch": 2.4078497101752916, + "grad_norm": 0.33527615543151373, + "learning_rate": 1.1402264251203537e-05, + "loss": 2.6534, + "step": 51718 + }, + { + "epoch": 2.4078962683613847, + "grad_norm": 0.31488191385375286, + "learning_rate": 1.1400542424892313e-05, + "loss": 2.6397, + "step": 51719 + }, + { + "epoch": 2.407942826547478, + "grad_norm": 0.30611130808935094, + "learning_rate": 1.1398820711866714e-05, + "loss": 2.6473, + "step": 51720 + }, + { + "epoch": 2.407989384733571, + "grad_norm": 0.3149229119462649, + "learning_rate": 1.1397099112131748e-05, + "loss": 2.6525, + "step": 51721 + }, + { + "epoch": 2.408035942919664, + "grad_norm": 0.3268389848664183, + "learning_rate": 1.1395377625692516e-05, + "loss": 2.6851, + "step": 51722 + }, + { + "epoch": 2.408082501105757, + "grad_norm": 0.3359105411053975, + "learning_rate": 1.1393656252554019e-05, 
+ "loss": 2.7292, + "step": 51723 + }, + { + "epoch": 2.40812905929185, + "grad_norm": 0.3436196504975297, + "learning_rate": 1.139193499272137e-05, + "loss": 2.7048, + "step": 51724 + }, + { + "epoch": 2.408175617477943, + "grad_norm": 0.32198907833660895, + "learning_rate": 1.139021384619957e-05, + "loss": 2.6315, + "step": 51725 + }, + { + "epoch": 2.408222175664036, + "grad_norm": 0.337020048952786, + "learning_rate": 1.1388492812993696e-05, + "loss": 2.5072, + "step": 51726 + }, + { + "epoch": 2.408268733850129, + "grad_norm": 0.32780600715560326, + "learning_rate": 1.13867718931088e-05, + "loss": 2.7008, + "step": 51727 + }, + { + "epoch": 2.4083152920362223, + "grad_norm": 0.3264853249714589, + "learning_rate": 1.1385051086549897e-05, + "loss": 2.6097, + "step": 51728 + }, + { + "epoch": 2.4083618502223154, + "grad_norm": 0.3442863522154772, + "learning_rate": 1.1383330393322089e-05, + "loss": 2.6392, + "step": 51729 + }, + { + "epoch": 2.4084084084084085, + "grad_norm": 0.3743897512308077, + "learning_rate": 1.1381609813430394e-05, + "loss": 2.7284, + "step": 51730 + }, + { + "epoch": 2.4084549665945016, + "grad_norm": 0.299105778653678, + "learning_rate": 1.137988934687987e-05, + "loss": 2.6221, + "step": 51731 + }, + { + "epoch": 2.4085015247805943, + "grad_norm": 0.31703823051749935, + "learning_rate": 1.1378168993675558e-05, + "loss": 2.6601, + "step": 51732 + }, + { + "epoch": 2.4085480829666874, + "grad_norm": 0.318090073863437, + "learning_rate": 1.1376448753822538e-05, + "loss": 2.5647, + "step": 51733 + }, + { + "epoch": 2.4085946411527805, + "grad_norm": 0.3324278942370975, + "learning_rate": 1.1374728627325804e-05, + "loss": 2.6447, + "step": 51734 + }, + { + "epoch": 2.4086411993388737, + "grad_norm": 0.34380111941814806, + "learning_rate": 1.1373008614190468e-05, + "loss": 2.6804, + "step": 51735 + }, + { + "epoch": 2.4086877575249668, + "grad_norm": 0.3049774078306484, + "learning_rate": 1.1371288714421513e-05, + "loss": 2.5424, + "step": 51736 + }, + { + "epoch": 2.40873431571106, + "grad_norm": 0.3168101152578023, + "learning_rate": 1.1369568928024048e-05, + "loss": 2.5558, + "step": 51737 + }, + { + "epoch": 2.408780873897153, + "grad_norm": 0.3282488761885081, + "learning_rate": 1.1367849255003083e-05, + "loss": 2.6218, + "step": 51738 + }, + { + "epoch": 2.408827432083246, + "grad_norm": 0.32416998665350166, + "learning_rate": 1.136612969536367e-05, + "loss": 2.6299, + "step": 51739 + }, + { + "epoch": 2.4088739902693392, + "grad_norm": 0.331453455980941, + "learning_rate": 1.1364410249110873e-05, + "loss": 2.7761, + "step": 51740 + }, + { + "epoch": 2.4089205484554324, + "grad_norm": 0.32279690010014644, + "learning_rate": 1.1362690916249701e-05, + "loss": 2.5461, + "step": 51741 + }, + { + "epoch": 2.4089671066415255, + "grad_norm": 0.32419691887010677, + "learning_rate": 1.1360971696785255e-05, + "loss": 2.6698, + "step": 51742 + }, + { + "epoch": 2.409013664827618, + "grad_norm": 0.3276028200851003, + "learning_rate": 1.1359252590722536e-05, + "loss": 2.6519, + "step": 51743 + }, + { + "epoch": 2.4090602230137113, + "grad_norm": 0.32714129100230377, + "learning_rate": 1.1357533598066599e-05, + "loss": 2.6707, + "step": 51744 + }, + { + "epoch": 2.4091067811998044, + "grad_norm": 0.3245751070541045, + "learning_rate": 1.1355814718822499e-05, + "loss": 2.6486, + "step": 51745 + }, + { + "epoch": 2.4091533393858975, + "grad_norm": 0.315614387743508, + "learning_rate": 1.1354095952995275e-05, + "loss": 2.6248, + "step": 51746 + }, + { + "epoch": 
2.4091998975719906, + "grad_norm": 0.3094620711566234, + "learning_rate": 1.1352377300589972e-05, + "loss": 2.595, + "step": 51747 + }, + { + "epoch": 2.4092464557580837, + "grad_norm": 0.3293082468175449, + "learning_rate": 1.1350658761611643e-05, + "loss": 2.7116, + "step": 51748 + }, + { + "epoch": 2.409293013944177, + "grad_norm": 0.3159901832532776, + "learning_rate": 1.13489403360653e-05, + "loss": 2.644, + "step": 51749 + }, + { + "epoch": 2.40933957213027, + "grad_norm": 0.33610973836013663, + "learning_rate": 1.1347222023956039e-05, + "loss": 2.6998, + "step": 51750 + }, + { + "epoch": 2.409386130316363, + "grad_norm": 0.3062187853872428, + "learning_rate": 1.1345503825288856e-05, + "loss": 2.5371, + "step": 51751 + }, + { + "epoch": 2.4094326885024557, + "grad_norm": 0.3254619909333174, + "learning_rate": 1.1343785740068812e-05, + "loss": 2.5584, + "step": 51752 + }, + { + "epoch": 2.409479246688549, + "grad_norm": 0.3185253626720661, + "learning_rate": 1.1342067768300963e-05, + "loss": 2.5892, + "step": 51753 + }, + { + "epoch": 2.409525804874642, + "grad_norm": 0.3336938676171113, + "learning_rate": 1.1340349909990305e-05, + "loss": 2.6613, + "step": 51754 + }, + { + "epoch": 2.409572363060735, + "grad_norm": 0.31917728958849495, + "learning_rate": 1.1338632165141944e-05, + "loss": 2.5672, + "step": 51755 + }, + { + "epoch": 2.409618921246828, + "grad_norm": 0.3125900178037714, + "learning_rate": 1.1336914533760873e-05, + "loss": 2.6166, + "step": 51756 + }, + { + "epoch": 2.4096654794329213, + "grad_norm": 0.31755559121429017, + "learning_rate": 1.133519701585215e-05, + "loss": 2.537, + "step": 51757 + }, + { + "epoch": 2.4097120376190144, + "grad_norm": 0.3192968563453491, + "learning_rate": 1.1333479611420816e-05, + "loss": 2.6732, + "step": 51758 + }, + { + "epoch": 2.4097585958051075, + "grad_norm": 0.31575823837059364, + "learning_rate": 1.1331762320471905e-05, + "loss": 2.5521, + "step": 51759 + }, + { + "epoch": 2.4098051539912007, + "grad_norm": 0.33233655450605687, + "learning_rate": 1.1330045143010471e-05, + "loss": 2.7464, + "step": 51760 + }, + { + "epoch": 2.4098517121772938, + "grad_norm": 0.3159476970853733, + "learning_rate": 1.132832807904155e-05, + "loss": 2.6709, + "step": 51761 + }, + { + "epoch": 2.409898270363387, + "grad_norm": 0.323432372820127, + "learning_rate": 1.1326611128570153e-05, + "loss": 2.6279, + "step": 51762 + }, + { + "epoch": 2.4099448285494796, + "grad_norm": 0.31535624368029974, + "learning_rate": 1.1324894291601367e-05, + "loss": 2.6538, + "step": 51763 + }, + { + "epoch": 2.4099913867355727, + "grad_norm": 0.3189654097189501, + "learning_rate": 1.1323177568140198e-05, + "loss": 2.5787, + "step": 51764 + }, + { + "epoch": 2.410037944921666, + "grad_norm": 0.32023999111871443, + "learning_rate": 1.1321460958191687e-05, + "loss": 2.572, + "step": 51765 + }, + { + "epoch": 2.410084503107759, + "grad_norm": 0.32999011148775764, + "learning_rate": 1.131974446176089e-05, + "loss": 2.6657, + "step": 51766 + }, + { + "epoch": 2.410131061293852, + "grad_norm": 0.32437861415461705, + "learning_rate": 1.131802807885281e-05, + "loss": 2.6701, + "step": 51767 + }, + { + "epoch": 2.410177619479945, + "grad_norm": 0.321002187370579, + "learning_rate": 1.131631180947253e-05, + "loss": 2.6884, + "step": 51768 + }, + { + "epoch": 2.4102241776660382, + "grad_norm": 0.3277413182533912, + "learning_rate": 1.1314595653625054e-05, + "loss": 2.6556, + "step": 51769 + }, + { + "epoch": 2.4102707358521314, + "grad_norm": 0.3319700045950765, + "learning_rate": 
1.1312879611315425e-05, + "loss": 2.5599, + "step": 51770 + }, + { + "epoch": 2.410317294038224, + "grad_norm": 0.32775011121060826, + "learning_rate": 1.1311163682548686e-05, + "loss": 2.7353, + "step": 51771 + }, + { + "epoch": 2.410363852224317, + "grad_norm": 0.3139698080168562, + "learning_rate": 1.130944786732987e-05, + "loss": 2.6158, + "step": 51772 + }, + { + "epoch": 2.4104104104104103, + "grad_norm": 0.3390065197794668, + "learning_rate": 1.130773216566401e-05, + "loss": 2.6672, + "step": 51773 + }, + { + "epoch": 2.4104569685965034, + "grad_norm": 0.32387589249005133, + "learning_rate": 1.1306016577556161e-05, + "loss": 2.7149, + "step": 51774 + }, + { + "epoch": 2.4105035267825965, + "grad_norm": 0.3363979532597405, + "learning_rate": 1.1304301103011311e-05, + "loss": 2.614, + "step": 51775 + }, + { + "epoch": 2.4105500849686896, + "grad_norm": 0.33582853185163075, + "learning_rate": 1.1302585742034555e-05, + "loss": 2.6607, + "step": 51776 + }, + { + "epoch": 2.4105966431547827, + "grad_norm": 0.33681325028734693, + "learning_rate": 1.1300870494630883e-05, + "loss": 2.6588, + "step": 51777 + }, + { + "epoch": 2.410643201340876, + "grad_norm": 0.3189603218886894, + "learning_rate": 1.1299155360805341e-05, + "loss": 2.5911, + "step": 51778 + }, + { + "epoch": 2.410689759526969, + "grad_norm": 0.34407393156010835, + "learning_rate": 1.1297440340562987e-05, + "loss": 2.6419, + "step": 51779 + }, + { + "epoch": 2.410736317713062, + "grad_norm": 0.3691013259308283, + "learning_rate": 1.12957254339088e-05, + "loss": 2.717, + "step": 51780 + }, + { + "epoch": 2.410782875899155, + "grad_norm": 0.3368154666522108, + "learning_rate": 1.1294010640847868e-05, + "loss": 2.5908, + "step": 51781 + }, + { + "epoch": 2.410829434085248, + "grad_norm": 0.32009902516867855, + "learning_rate": 1.129229596138518e-05, + "loss": 2.5956, + "step": 51782 + }, + { + "epoch": 2.410875992271341, + "grad_norm": 0.3350195449127763, + "learning_rate": 1.1290581395525817e-05, + "loss": 2.6989, + "step": 51783 + }, + { + "epoch": 2.410922550457434, + "grad_norm": 0.3344051281341854, + "learning_rate": 1.1288866943274772e-05, + "loss": 2.5054, + "step": 51784 + }, + { + "epoch": 2.410969108643527, + "grad_norm": 0.31726571164003187, + "learning_rate": 1.1287152604637086e-05, + "loss": 2.5909, + "step": 51785 + }, + { + "epoch": 2.4110156668296203, + "grad_norm": 0.30280497068717876, + "learning_rate": 1.1285438379617796e-05, + "loss": 2.5773, + "step": 51786 + }, + { + "epoch": 2.4110622250157134, + "grad_norm": 0.3465953597807738, + "learning_rate": 1.128372426822194e-05, + "loss": 2.6072, + "step": 51787 + }, + { + "epoch": 2.4111087832018065, + "grad_norm": 0.333975090123446, + "learning_rate": 1.1282010270454519e-05, + "loss": 2.6102, + "step": 51788 + }, + { + "epoch": 2.4111553413878997, + "grad_norm": 0.3063086090501863, + "learning_rate": 1.1280296386320604e-05, + "loss": 2.5577, + "step": 51789 + }, + { + "epoch": 2.411201899573993, + "grad_norm": 0.3177663098696069, + "learning_rate": 1.1278582615825189e-05, + "loss": 2.5385, + "step": 51790 + }, + { + "epoch": 2.4112484577600855, + "grad_norm": 0.30224492102783945, + "learning_rate": 1.1276868958973325e-05, + "loss": 2.5544, + "step": 51791 + }, + { + "epoch": 2.4112950159461786, + "grad_norm": 0.32389881116285957, + "learning_rate": 1.127515541577005e-05, + "loss": 2.7711, + "step": 51792 + }, + { + "epoch": 2.4113415741322717, + "grad_norm": 0.32174052068163306, + "learning_rate": 1.1273441986220345e-05, + "loss": 2.6201, + "step": 51793 + }, + { + 
"epoch": 2.411388132318365, + "grad_norm": 0.3429352539200032, + "learning_rate": 1.1271728670329307e-05, + "loss": 2.6629, + "step": 51794 + }, + { + "epoch": 2.411434690504458, + "grad_norm": 0.3323372107913026, + "learning_rate": 1.1270015468101896e-05, + "loss": 2.6155, + "step": 51795 + }, + { + "epoch": 2.411481248690551, + "grad_norm": 0.3430428985576767, + "learning_rate": 1.1268302379543211e-05, + "loss": 2.6657, + "step": 51796 + }, + { + "epoch": 2.411527806876644, + "grad_norm": 0.329869606561227, + "learning_rate": 1.1266589404658223e-05, + "loss": 2.6117, + "step": 51797 + }, + { + "epoch": 2.4115743650627373, + "grad_norm": 0.3176466356449042, + "learning_rate": 1.1264876543451974e-05, + "loss": 2.546, + "step": 51798 + }, + { + "epoch": 2.4116209232488304, + "grad_norm": 0.3453171204649802, + "learning_rate": 1.1263163795929504e-05, + "loss": 2.5891, + "step": 51799 + }, + { + "epoch": 2.4116674814349235, + "grad_norm": 0.32215084011607803, + "learning_rate": 1.1261451162095833e-05, + "loss": 2.6221, + "step": 51800 + }, + { + "epoch": 2.4117140396210166, + "grad_norm": 0.3304180525140993, + "learning_rate": 1.125973864195598e-05, + "loss": 2.5733, + "step": 51801 + }, + { + "epoch": 2.4117605978071093, + "grad_norm": 0.33911962564382997, + "learning_rate": 1.1258026235514996e-05, + "loss": 2.5468, + "step": 51802 + }, + { + "epoch": 2.4118071559932024, + "grad_norm": 0.3334383652301619, + "learning_rate": 1.1256313942777874e-05, + "loss": 2.6077, + "step": 51803 + }, + { + "epoch": 2.4118537141792955, + "grad_norm": 0.31555732594026, + "learning_rate": 1.125460176374965e-05, + "loss": 2.581, + "step": 51804 + }, + { + "epoch": 2.4119002723653886, + "grad_norm": 0.3164536344065371, + "learning_rate": 1.125288969843537e-05, + "loss": 2.6511, + "step": 51805 + }, + { + "epoch": 2.4119468305514817, + "grad_norm": 0.3067480561743066, + "learning_rate": 1.1251177746840014e-05, + "loss": 2.556, + "step": 51806 + }, + { + "epoch": 2.411993388737575, + "grad_norm": 0.3398239639932347, + "learning_rate": 1.124946590896866e-05, + "loss": 2.6523, + "step": 51807 + }, + { + "epoch": 2.412039946923668, + "grad_norm": 0.336375359734307, + "learning_rate": 1.124775418482628e-05, + "loss": 2.6662, + "step": 51808 + }, + { + "epoch": 2.412086505109761, + "grad_norm": 0.3300422450012603, + "learning_rate": 1.1246042574417948e-05, + "loss": 2.7, + "step": 51809 + }, + { + "epoch": 2.4121330632958538, + "grad_norm": 0.3192721258711908, + "learning_rate": 1.1244331077748654e-05, + "loss": 2.6057, + "step": 51810 + }, + { + "epoch": 2.412179621481947, + "grad_norm": 0.32256109388348697, + "learning_rate": 1.1242619694823426e-05, + "loss": 2.5781, + "step": 51811 + }, + { + "epoch": 2.41222617966804, + "grad_norm": 0.32103228263028694, + "learning_rate": 1.1240908425647295e-05, + "loss": 2.6817, + "step": 51812 + }, + { + "epoch": 2.412272737854133, + "grad_norm": 0.3191066466525062, + "learning_rate": 1.1239197270225276e-05, + "loss": 2.5725, + "step": 51813 + }, + { + "epoch": 2.412319296040226, + "grad_norm": 0.31963624723741973, + "learning_rate": 1.1237486228562393e-05, + "loss": 2.5991, + "step": 51814 + }, + { + "epoch": 2.4123658542263193, + "grad_norm": 0.30078806240382566, + "learning_rate": 1.1235775300663687e-05, + "loss": 2.6509, + "step": 51815 + }, + { + "epoch": 2.4124124124124124, + "grad_norm": 0.3257218476718431, + "learning_rate": 1.1234064486534152e-05, + "loss": 2.6329, + "step": 51816 + }, + { + "epoch": 2.4124589705985056, + "grad_norm": 0.33229413558669546, + 
"learning_rate": 1.1232353786178811e-05, + "loss": 2.6321, + "step": 51817 + }, + { + "epoch": 2.4125055287845987, + "grad_norm": 0.30245529490346296, + "learning_rate": 1.1230643199602692e-05, + "loss": 2.5772, + "step": 51818 + }, + { + "epoch": 2.412552086970692, + "grad_norm": 0.29955440646092857, + "learning_rate": 1.122893272681082e-05, + "loss": 2.653, + "step": 51819 + }, + { + "epoch": 2.412598645156785, + "grad_norm": 0.3268700733685978, + "learning_rate": 1.1227222367808221e-05, + "loss": 2.5512, + "step": 51820 + }, + { + "epoch": 2.4126452033428776, + "grad_norm": 0.35442443817081887, + "learning_rate": 1.122551212259988e-05, + "loss": 2.7129, + "step": 51821 + }, + { + "epoch": 2.4126917615289707, + "grad_norm": 0.32127851366492516, + "learning_rate": 1.1223801991190869e-05, + "loss": 2.5647, + "step": 51822 + }, + { + "epoch": 2.412738319715064, + "grad_norm": 0.3188986891264653, + "learning_rate": 1.1222091973586162e-05, + "loss": 2.6206, + "step": 51823 + }, + { + "epoch": 2.412784877901157, + "grad_norm": 0.3043284350780641, + "learning_rate": 1.1220382069790796e-05, + "loss": 2.7229, + "step": 51824 + }, + { + "epoch": 2.41283143608725, + "grad_norm": 0.3239309989492379, + "learning_rate": 1.1218672279809789e-05, + "loss": 2.6111, + "step": 51825 + }, + { + "epoch": 2.412877994273343, + "grad_norm": 0.370529049889814, + "learning_rate": 1.1216962603648163e-05, + "loss": 2.4595, + "step": 51826 + }, + { + "epoch": 2.4129245524594363, + "grad_norm": 0.35047075515716464, + "learning_rate": 1.1215253041310924e-05, + "loss": 2.6189, + "step": 51827 + }, + { + "epoch": 2.4129711106455294, + "grad_norm": 0.3015487297006072, + "learning_rate": 1.1213543592803111e-05, + "loss": 2.5993, + "step": 51828 + }, + { + "epoch": 2.4130176688316225, + "grad_norm": 0.3427428518955854, + "learning_rate": 1.1211834258129716e-05, + "loss": 2.6411, + "step": 51829 + }, + { + "epoch": 2.413064227017715, + "grad_norm": 0.316232973624385, + "learning_rate": 1.121012503729576e-05, + "loss": 2.6467, + "step": 51830 + }, + { + "epoch": 2.4131107852038083, + "grad_norm": 0.3296755369979846, + "learning_rate": 1.1208415930306271e-05, + "loss": 2.6312, + "step": 51831 + }, + { + "epoch": 2.4131573433899014, + "grad_norm": 0.34041956057983, + "learning_rate": 1.120670693716626e-05, + "loss": 2.6266, + "step": 51832 + }, + { + "epoch": 2.4132039015759945, + "grad_norm": 0.3114239555666707, + "learning_rate": 1.1204998057880756e-05, + "loss": 2.5752, + "step": 51833 + }, + { + "epoch": 2.4132504597620876, + "grad_norm": 0.3073553494996616, + "learning_rate": 1.1203289292454727e-05, + "loss": 2.6218, + "step": 51834 + }, + { + "epoch": 2.4132970179481807, + "grad_norm": 0.33432614446044534, + "learning_rate": 1.1201580640893256e-05, + "loss": 2.749, + "step": 51835 + }, + { + "epoch": 2.413343576134274, + "grad_norm": 0.3347470392589417, + "learning_rate": 1.1199872103201298e-05, + "loss": 2.6603, + "step": 51836 + }, + { + "epoch": 2.413390134320367, + "grad_norm": 0.3113222768693041, + "learning_rate": 1.1198163679383917e-05, + "loss": 2.5388, + "step": 51837 + }, + { + "epoch": 2.41343669250646, + "grad_norm": 0.326636758780466, + "learning_rate": 1.1196455369446085e-05, + "loss": 2.6267, + "step": 51838 + }, + { + "epoch": 2.413483250692553, + "grad_norm": 0.31678356362388016, + "learning_rate": 1.1194747173392838e-05, + "loss": 2.4978, + "step": 51839 + }, + { + "epoch": 2.4135298088786463, + "grad_norm": 0.32698691208348307, + "learning_rate": 1.1193039091229185e-05, + "loss": 2.68, + "step": 51840 + 
}, + { + "epoch": 2.413576367064739, + "grad_norm": 0.30803952811453794, + "learning_rate": 1.1191331122960153e-05, + "loss": 2.6482, + "step": 51841 + }, + { + "epoch": 2.413622925250832, + "grad_norm": 0.311050735157153, + "learning_rate": 1.1189623268590727e-05, + "loss": 2.603, + "step": 51842 + }, + { + "epoch": 2.4136694834369252, + "grad_norm": 0.32809534406056773, + "learning_rate": 1.1187915528125931e-05, + "loss": 2.5432, + "step": 51843 + }, + { + "epoch": 2.4137160416230183, + "grad_norm": 0.3320626309957156, + "learning_rate": 1.1186207901570784e-05, + "loss": 2.6625, + "step": 51844 + }, + { + "epoch": 2.4137625998091115, + "grad_norm": 0.3169681827666114, + "learning_rate": 1.1184500388930286e-05, + "loss": 2.6014, + "step": 51845 + }, + { + "epoch": 2.4138091579952046, + "grad_norm": 0.33858162690684834, + "learning_rate": 1.1182792990209473e-05, + "loss": 2.6758, + "step": 51846 + }, + { + "epoch": 2.4138557161812977, + "grad_norm": 0.3370322477160252, + "learning_rate": 1.1181085705413314e-05, + "loss": 2.6364, + "step": 51847 + }, + { + "epoch": 2.413902274367391, + "grad_norm": 0.3253333549787608, + "learning_rate": 1.1179378534546869e-05, + "loss": 2.7112, + "step": 51848 + }, + { + "epoch": 2.4139488325534835, + "grad_norm": 0.3163729631223942, + "learning_rate": 1.1177671477615093e-05, + "loss": 2.6822, + "step": 51849 + }, + { + "epoch": 2.4139953907395766, + "grad_norm": 0.3249998045831272, + "learning_rate": 1.1175964534623057e-05, + "loss": 2.6548, + "step": 51850 + }, + { + "epoch": 2.4140419489256697, + "grad_norm": 0.3317970712172964, + "learning_rate": 1.1174257705575731e-05, + "loss": 2.5476, + "step": 51851 + }, + { + "epoch": 2.414088507111763, + "grad_norm": 0.3148811351439173, + "learning_rate": 1.1172550990478126e-05, + "loss": 2.642, + "step": 51852 + }, + { + "epoch": 2.414135065297856, + "grad_norm": 0.3164036053162178, + "learning_rate": 1.117084438933526e-05, + "loss": 2.7097, + "step": 51853 + }, + { + "epoch": 2.414181623483949, + "grad_norm": 0.29962842153729197, + "learning_rate": 1.1169137902152143e-05, + "loss": 2.6676, + "step": 51854 + }, + { + "epoch": 2.414228181670042, + "grad_norm": 0.3307669167127376, + "learning_rate": 1.1167431528933797e-05, + "loss": 2.6925, + "step": 51855 + }, + { + "epoch": 2.4142747398561353, + "grad_norm": 0.2920694160949356, + "learning_rate": 1.1165725269685196e-05, + "loss": 2.5541, + "step": 51856 + }, + { + "epoch": 2.4143212980422284, + "grad_norm": 0.33070681968163534, + "learning_rate": 1.1164019124411368e-05, + "loss": 2.6383, + "step": 51857 + }, + { + "epoch": 2.4143678562283215, + "grad_norm": 0.3076698589739538, + "learning_rate": 1.1162313093117316e-05, + "loss": 2.552, + "step": 51858 + }, + { + "epoch": 2.4144144144144146, + "grad_norm": 0.32533718438058784, + "learning_rate": 1.1160607175808063e-05, + "loss": 2.7407, + "step": 51859 + }, + { + "epoch": 2.4144609726005073, + "grad_norm": 0.3184489666139271, + "learning_rate": 1.1158901372488573e-05, + "loss": 2.6063, + "step": 51860 + }, + { + "epoch": 2.4145075307866004, + "grad_norm": 0.30677970942038996, + "learning_rate": 1.1157195683163907e-05, + "loss": 2.657, + "step": 51861 + }, + { + "epoch": 2.4145540889726935, + "grad_norm": 0.3234017870386247, + "learning_rate": 1.1155490107839023e-05, + "loss": 2.6897, + "step": 51862 + }, + { + "epoch": 2.4146006471587866, + "grad_norm": 0.33670035697475176, + "learning_rate": 1.1153784646518972e-05, + "loss": 2.6268, + "step": 51863 + }, + { + "epoch": 2.4146472053448798, + "grad_norm": 
0.3066507731657643, + "learning_rate": 1.1152079299208722e-05, + "loss": 2.5981, + "step": 51864 + }, + { + "epoch": 2.414693763530973, + "grad_norm": 0.31588979866243094, + "learning_rate": 1.1150374065913293e-05, + "loss": 2.5993, + "step": 51865 + }, + { + "epoch": 2.414740321717066, + "grad_norm": 0.30785817959317674, + "learning_rate": 1.114866894663769e-05, + "loss": 2.5433, + "step": 51866 + }, + { + "epoch": 2.414786879903159, + "grad_norm": 0.32070748998325266, + "learning_rate": 1.1146963941386918e-05, + "loss": 2.6257, + "step": 51867 + }, + { + "epoch": 2.414833438089252, + "grad_norm": 0.3101600539522322, + "learning_rate": 1.114525905016599e-05, + "loss": 2.5581, + "step": 51868 + }, + { + "epoch": 2.414879996275345, + "grad_norm": 0.31556444521673865, + "learning_rate": 1.114355427297989e-05, + "loss": 2.6471, + "step": 51869 + }, + { + "epoch": 2.414926554461438, + "grad_norm": 0.30629793711196746, + "learning_rate": 1.1141849609833622e-05, + "loss": 2.6506, + "step": 51870 + }, + { + "epoch": 2.414973112647531, + "grad_norm": 0.32139249595916175, + "learning_rate": 1.1140145060732205e-05, + "loss": 2.7417, + "step": 51871 + }, + { + "epoch": 2.4150196708336242, + "grad_norm": 0.33773366978158076, + "learning_rate": 1.113844062568063e-05, + "loss": 2.4777, + "step": 51872 + }, + { + "epoch": 2.4150662290197173, + "grad_norm": 0.31004100092040054, + "learning_rate": 1.1136736304683899e-05, + "loss": 2.6874, + "step": 51873 + }, + { + "epoch": 2.4151127872058105, + "grad_norm": 0.31528162694393724, + "learning_rate": 1.113503209774704e-05, + "loss": 2.6898, + "step": 51874 + }, + { + "epoch": 2.4151593453919036, + "grad_norm": 0.36299061671430577, + "learning_rate": 1.1133328004874999e-05, + "loss": 2.7161, + "step": 51875 + }, + { + "epoch": 2.4152059035779967, + "grad_norm": 0.3313726858446306, + "learning_rate": 1.113162402607284e-05, + "loss": 2.6553, + "step": 51876 + }, + { + "epoch": 2.41525246176409, + "grad_norm": 0.31944795616405053, + "learning_rate": 1.1129920161345526e-05, + "loss": 2.5767, + "step": 51877 + }, + { + "epoch": 2.415299019950183, + "grad_norm": 0.31600183553726086, + "learning_rate": 1.1128216410698062e-05, + "loss": 2.6984, + "step": 51878 + }, + { + "epoch": 2.415345578136276, + "grad_norm": 0.29415635036156695, + "learning_rate": 1.1126512774135456e-05, + "loss": 2.5229, + "step": 51879 + }, + { + "epoch": 2.4153921363223687, + "grad_norm": 0.3315556533509333, + "learning_rate": 1.1124809251662705e-05, + "loss": 2.6703, + "step": 51880 + }, + { + "epoch": 2.415438694508462, + "grad_norm": 0.3223280402607388, + "learning_rate": 1.1123105843284825e-05, + "loss": 2.6044, + "step": 51881 + }, + { + "epoch": 2.415485252694555, + "grad_norm": 0.3137613518218214, + "learning_rate": 1.1121402549006782e-05, + "loss": 2.6113, + "step": 51882 + }, + { + "epoch": 2.415531810880648, + "grad_norm": 0.3054637659517204, + "learning_rate": 1.1119699368833592e-05, + "loss": 2.6331, + "step": 51883 + }, + { + "epoch": 2.415578369066741, + "grad_norm": 0.3335691721899106, + "learning_rate": 1.1117996302770257e-05, + "loss": 2.716, + "step": 51884 + }, + { + "epoch": 2.4156249272528343, + "grad_norm": 0.32917470563549056, + "learning_rate": 1.1116293350821771e-05, + "loss": 2.71, + "step": 51885 + }, + { + "epoch": 2.4156714854389274, + "grad_norm": 0.30679172791418047, + "learning_rate": 1.1114590512993134e-05, + "loss": 2.5607, + "step": 51886 + }, + { + "epoch": 2.4157180436250205, + "grad_norm": 0.317990688419889, + "learning_rate": 1.1112887789289355e-05, + 
"loss": 2.5227, + "step": 51887 + }, + { + "epoch": 2.4157646018111136, + "grad_norm": 0.33785568243100206, + "learning_rate": 1.1111185179715394e-05, + "loss": 2.6623, + "step": 51888 + }, + { + "epoch": 2.4158111599972063, + "grad_norm": 0.3273707302420694, + "learning_rate": 1.1109482684276295e-05, + "loss": 2.6191, + "step": 51889 + }, + { + "epoch": 2.4158577181832994, + "grad_norm": 0.30820488223900755, + "learning_rate": 1.1107780302977028e-05, + "loss": 2.5383, + "step": 51890 + }, + { + "epoch": 2.4159042763693925, + "grad_norm": 0.31347253064454705, + "learning_rate": 1.1106078035822587e-05, + "loss": 2.691, + "step": 51891 + }, + { + "epoch": 2.4159508345554856, + "grad_norm": 0.3141895427263565, + "learning_rate": 1.110437588281798e-05, + "loss": 2.6388, + "step": 51892 + }, + { + "epoch": 2.4159973927415788, + "grad_norm": 0.31523585047728103, + "learning_rate": 1.1102673843968198e-05, + "loss": 2.6144, + "step": 51893 + }, + { + "epoch": 2.416043950927672, + "grad_norm": 0.31525805226868964, + "learning_rate": 1.1100971919278247e-05, + "loss": 2.6609, + "step": 51894 + }, + { + "epoch": 2.416090509113765, + "grad_norm": 0.3136781477806683, + "learning_rate": 1.1099270108753101e-05, + "loss": 2.6423, + "step": 51895 + }, + { + "epoch": 2.416137067299858, + "grad_norm": 0.30835063313571703, + "learning_rate": 1.1097568412397763e-05, + "loss": 2.6054, + "step": 51896 + }, + { + "epoch": 2.4161836254859512, + "grad_norm": 0.3371161136629367, + "learning_rate": 1.1095866830217228e-05, + "loss": 2.67, + "step": 51897 + }, + { + "epoch": 2.4162301836720443, + "grad_norm": 0.32705101070793485, + "learning_rate": 1.1094165362216491e-05, + "loss": 2.6063, + "step": 51898 + }, + { + "epoch": 2.4162767418581375, + "grad_norm": 0.3129519424512899, + "learning_rate": 1.1092464008400543e-05, + "loss": 2.6348, + "step": 51899 + }, + { + "epoch": 2.41632330004423, + "grad_norm": 0.325433955511543, + "learning_rate": 1.10907627687744e-05, + "loss": 2.6418, + "step": 51900 + }, + { + "epoch": 2.4163698582303232, + "grad_norm": 0.32004116899673324, + "learning_rate": 1.1089061643343008e-05, + "loss": 2.6162, + "step": 51901 + }, + { + "epoch": 2.4164164164164164, + "grad_norm": 0.33731997645112627, + "learning_rate": 1.108736063211141e-05, + "loss": 2.6337, + "step": 51902 + }, + { + "epoch": 2.4164629746025095, + "grad_norm": 0.31836362298756854, + "learning_rate": 1.1085659735084564e-05, + "loss": 2.6027, + "step": 51903 + }, + { + "epoch": 2.4165095327886026, + "grad_norm": 0.30674013986948767, + "learning_rate": 1.108395895226747e-05, + "loss": 2.5869, + "step": 51904 + }, + { + "epoch": 2.4165560909746957, + "grad_norm": 0.3195116300035005, + "learning_rate": 1.1082258283665126e-05, + "loss": 2.6998, + "step": 51905 + }, + { + "epoch": 2.416602649160789, + "grad_norm": 0.3284641095280534, + "learning_rate": 1.1080557729282514e-05, + "loss": 2.6463, + "step": 51906 + }, + { + "epoch": 2.416649207346882, + "grad_norm": 0.3226593294695591, + "learning_rate": 1.107885728912465e-05, + "loss": 2.6258, + "step": 51907 + }, + { + "epoch": 2.4166957655329746, + "grad_norm": 0.3287063033781532, + "learning_rate": 1.1077156963196478e-05, + "loss": 2.5755, + "step": 51908 + }, + { + "epoch": 2.4167423237190677, + "grad_norm": 0.33232684298748566, + "learning_rate": 1.1075456751503044e-05, + "loss": 2.6191, + "step": 51909 + }, + { + "epoch": 2.416788881905161, + "grad_norm": 0.32976558289539804, + "learning_rate": 1.1073756654049294e-05, + "loss": 2.5179, + "step": 51910 + }, + { + "epoch": 
2.416835440091254, + "grad_norm": 0.3341224201090946, + "learning_rate": 1.1072056670840231e-05, + "loss": 2.6311, + "step": 51911 + }, + { + "epoch": 2.416881998277347, + "grad_norm": 0.32687496718569387, + "learning_rate": 1.107035680188085e-05, + "loss": 2.6655, + "step": 51912 + }, + { + "epoch": 2.41692855646344, + "grad_norm": 0.3296031875769998, + "learning_rate": 1.106865704717615e-05, + "loss": 2.6668, + "step": 51913 + }, + { + "epoch": 2.4169751146495333, + "grad_norm": 0.3422085073396692, + "learning_rate": 1.1066957406731076e-05, + "loss": 2.6525, + "step": 51914 + }, + { + "epoch": 2.4170216728356264, + "grad_norm": 0.3414651938852668, + "learning_rate": 1.1065257880550678e-05, + "loss": 2.6788, + "step": 51915 + }, + { + "epoch": 2.4170682310217195, + "grad_norm": 0.3217024328768376, + "learning_rate": 1.1063558468639894e-05, + "loss": 2.6456, + "step": 51916 + }, + { + "epoch": 2.4171147892078126, + "grad_norm": 0.32671143734686225, + "learning_rate": 1.1061859171003736e-05, + "loss": 2.5666, + "step": 51917 + }, + { + "epoch": 2.4171613473939058, + "grad_norm": 0.34985912827810617, + "learning_rate": 1.1060159987647178e-05, + "loss": 2.5548, + "step": 51918 + }, + { + "epoch": 2.4172079055799984, + "grad_norm": 0.30909936865074017, + "learning_rate": 1.105846091857522e-05, + "loss": 2.641, + "step": 51919 + }, + { + "epoch": 2.4172544637660915, + "grad_norm": 0.3395846819804467, + "learning_rate": 1.1056761963792855e-05, + "loss": 2.6377, + "step": 51920 + }, + { + "epoch": 2.4173010219521847, + "grad_norm": 0.32742708475814325, + "learning_rate": 1.1055063123305031e-05, + "loss": 2.7347, + "step": 51921 + }, + { + "epoch": 2.4173475801382778, + "grad_norm": 0.3199403212697326, + "learning_rate": 1.1053364397116784e-05, + "loss": 2.68, + "step": 51922 + }, + { + "epoch": 2.417394138324371, + "grad_norm": 0.33694798363685213, + "learning_rate": 1.1051665785233061e-05, + "loss": 2.6632, + "step": 51923 + }, + { + "epoch": 2.417440696510464, + "grad_norm": 0.31721183507766104, + "learning_rate": 1.104996728765887e-05, + "loss": 2.6343, + "step": 51924 + }, + { + "epoch": 2.417487254696557, + "grad_norm": 0.32032664626187474, + "learning_rate": 1.1048268904399178e-05, + "loss": 2.6554, + "step": 51925 + }, + { + "epoch": 2.4175338128826502, + "grad_norm": 0.3145400569687976, + "learning_rate": 1.1046570635458986e-05, + "loss": 2.5692, + "step": 51926 + }, + { + "epoch": 2.4175803710687434, + "grad_norm": 0.32708279352101605, + "learning_rate": 1.1044872480843272e-05, + "loss": 2.5646, + "step": 51927 + }, + { + "epoch": 2.417626929254836, + "grad_norm": 0.3137146266550892, + "learning_rate": 1.104317444055703e-05, + "loss": 2.5166, + "step": 51928 + }, + { + "epoch": 2.417673487440929, + "grad_norm": 0.3318165056447899, + "learning_rate": 1.104147651460522e-05, + "loss": 2.6636, + "step": 51929 + }, + { + "epoch": 2.4177200456270223, + "grad_norm": 0.303985476323575, + "learning_rate": 1.1039778702992837e-05, + "loss": 2.6408, + "step": 51930 + }, + { + "epoch": 2.4177666038131154, + "grad_norm": 0.33057725633266355, + "learning_rate": 1.1038081005724865e-05, + "loss": 2.6904, + "step": 51931 + }, + { + "epoch": 2.4178131619992085, + "grad_norm": 0.3229922646821535, + "learning_rate": 1.103638342280629e-05, + "loss": 2.7375, + "step": 51932 + }, + { + "epoch": 2.4178597201853016, + "grad_norm": 0.3209312741601272, + "learning_rate": 1.1034685954242102e-05, + "loss": 2.6309, + "step": 51933 + }, + { + "epoch": 2.4179062783713947, + "grad_norm": 0.309092688497213, + 
"learning_rate": 1.1032988600037247e-05, + "loss": 2.6013, + "step": 51934 + }, + { + "epoch": 2.417952836557488, + "grad_norm": 0.3089866250565297, + "learning_rate": 1.103129136019676e-05, + "loss": 2.6545, + "step": 51935 + }, + { + "epoch": 2.417999394743581, + "grad_norm": 0.30237204608326096, + "learning_rate": 1.1029594234725576e-05, + "loss": 2.5334, + "step": 51936 + }, + { + "epoch": 2.418045952929674, + "grad_norm": 0.3332050566897317, + "learning_rate": 1.1027897223628697e-05, + "loss": 2.7066, + "step": 51937 + }, + { + "epoch": 2.418092511115767, + "grad_norm": 0.3056941073813497, + "learning_rate": 1.1026200326911095e-05, + "loss": 2.5981, + "step": 51938 + }, + { + "epoch": 2.41813906930186, + "grad_norm": 0.29682921385073824, + "learning_rate": 1.1024503544577764e-05, + "loss": 2.5073, + "step": 51939 + }, + { + "epoch": 2.418185627487953, + "grad_norm": 0.3256088389816154, + "learning_rate": 1.1022806876633668e-05, + "loss": 2.6008, + "step": 51940 + }, + { + "epoch": 2.418232185674046, + "grad_norm": 0.3211055548176805, + "learning_rate": 1.102111032308381e-05, + "loss": 2.6082, + "step": 51941 + }, + { + "epoch": 2.418278743860139, + "grad_norm": 0.3132958148141632, + "learning_rate": 1.1019413883933144e-05, + "loss": 2.5694, + "step": 51942 + }, + { + "epoch": 2.4183253020462323, + "grad_norm": 0.3303295949662712, + "learning_rate": 1.1017717559186652e-05, + "loss": 2.677, + "step": 51943 + }, + { + "epoch": 2.4183718602323254, + "grad_norm": 0.3199439900959963, + "learning_rate": 1.101602134884932e-05, + "loss": 2.6646, + "step": 51944 + }, + { + "epoch": 2.4184184184184185, + "grad_norm": 0.29836544812177973, + "learning_rate": 1.1014325252926127e-05, + "loss": 2.6355, + "step": 51945 + }, + { + "epoch": 2.4184649766045117, + "grad_norm": 0.31473093664488727, + "learning_rate": 1.101262927142206e-05, + "loss": 2.5711, + "step": 51946 + }, + { + "epoch": 2.4185115347906043, + "grad_norm": 0.3196124438197751, + "learning_rate": 1.101093340434206e-05, + "loss": 2.6159, + "step": 51947 + }, + { + "epoch": 2.4185580929766974, + "grad_norm": 0.29685852349024905, + "learning_rate": 1.1009237651691152e-05, + "loss": 2.59, + "step": 51948 + }, + { + "epoch": 2.4186046511627906, + "grad_norm": 0.3242444878173436, + "learning_rate": 1.100754201347428e-05, + "loss": 2.6881, + "step": 51949 + }, + { + "epoch": 2.4186512093488837, + "grad_norm": 0.3017079893139503, + "learning_rate": 1.1005846489696431e-05, + "loss": 2.5177, + "step": 51950 + }, + { + "epoch": 2.418697767534977, + "grad_norm": 0.33370113270163154, + "learning_rate": 1.100415108036258e-05, + "loss": 2.6321, + "step": 51951 + }, + { + "epoch": 2.41874432572107, + "grad_norm": 0.32376981381440645, + "learning_rate": 1.1002455785477705e-05, + "loss": 2.6449, + "step": 51952 + }, + { + "epoch": 2.418790883907163, + "grad_norm": 0.3005146217916263, + "learning_rate": 1.1000760605046783e-05, + "loss": 2.6069, + "step": 51953 + }, + { + "epoch": 2.418837442093256, + "grad_norm": 0.3144183249734102, + "learning_rate": 1.0999065539074794e-05, + "loss": 2.5875, + "step": 51954 + }, + { + "epoch": 2.4188840002793492, + "grad_norm": 0.3194589573122468, + "learning_rate": 1.0997370587566696e-05, + "loss": 2.6181, + "step": 51955 + }, + { + "epoch": 2.4189305584654424, + "grad_norm": 0.298838200996796, + "learning_rate": 1.0995675750527474e-05, + "loss": 2.4381, + "step": 51956 + }, + { + "epoch": 2.4189771166515355, + "grad_norm": 0.31449154386318895, + "learning_rate": 1.0993981027962097e-05, + "loss": 2.6109, + "step": 51957 
+ }, + { + "epoch": 2.419023674837628, + "grad_norm": 0.2861847540970535, + "learning_rate": 1.0992286419875546e-05, + "loss": 2.6159, + "step": 51958 + }, + { + "epoch": 2.4190702330237213, + "grad_norm": 0.31995331037932595, + "learning_rate": 1.0990591926272803e-05, + "loss": 2.7613, + "step": 51959 + }, + { + "epoch": 2.4191167912098144, + "grad_norm": 0.31747448792996047, + "learning_rate": 1.0988897547158806e-05, + "loss": 2.5733, + "step": 51960 + }, + { + "epoch": 2.4191633493959075, + "grad_norm": 0.3101098956660464, + "learning_rate": 1.0987203282538572e-05, + "loss": 2.4955, + "step": 51961 + }, + { + "epoch": 2.4192099075820006, + "grad_norm": 0.32323005832920765, + "learning_rate": 1.0985509132417033e-05, + "loss": 2.643, + "step": 51962 + }, + { + "epoch": 2.4192564657680937, + "grad_norm": 0.30804004734004475, + "learning_rate": 1.0983815096799206e-05, + "loss": 2.6729, + "step": 51963 + }, + { + "epoch": 2.419303023954187, + "grad_norm": 0.32134802757582437, + "learning_rate": 1.0982121175690019e-05, + "loss": 2.6999, + "step": 51964 + }, + { + "epoch": 2.41934958214028, + "grad_norm": 0.3273814861869692, + "learning_rate": 1.0980427369094471e-05, + "loss": 2.5717, + "step": 51965 + }, + { + "epoch": 2.419396140326373, + "grad_norm": 0.3289841633757436, + "learning_rate": 1.0978733677017516e-05, + "loss": 2.5908, + "step": 51966 + }, + { + "epoch": 2.4194426985124657, + "grad_norm": 0.30105121175965693, + "learning_rate": 1.0977040099464143e-05, + "loss": 2.5856, + "step": 51967 + }, + { + "epoch": 2.419489256698559, + "grad_norm": 0.31564383847201194, + "learning_rate": 1.097534663643932e-05, + "loss": 2.5458, + "step": 51968 + }, + { + "epoch": 2.419535814884652, + "grad_norm": 0.30709424011994896, + "learning_rate": 1.0973653287947994e-05, + "loss": 2.6036, + "step": 51969 + }, + { + "epoch": 2.419582373070745, + "grad_norm": 0.3075213176775778, + "learning_rate": 1.0971960053995157e-05, + "loss": 2.5834, + "step": 51970 + }, + { + "epoch": 2.419628931256838, + "grad_norm": 0.3056615271013455, + "learning_rate": 1.0970266934585772e-05, + "loss": 2.6127, + "step": 51971 + }, + { + "epoch": 2.4196754894429313, + "grad_norm": 0.31839054958907675, + "learning_rate": 1.0968573929724818e-05, + "loss": 2.7058, + "step": 51972 + }, + { + "epoch": 2.4197220476290244, + "grad_norm": 0.3189354118052596, + "learning_rate": 1.0966881039417232e-05, + "loss": 2.6301, + "step": 51973 + }, + { + "epoch": 2.4197686058151175, + "grad_norm": 0.30405027687680264, + "learning_rate": 1.0965188263668024e-05, + "loss": 2.6121, + "step": 51974 + }, + { + "epoch": 2.4198151640012107, + "grad_norm": 0.3292247848456439, + "learning_rate": 1.0963495602482122e-05, + "loss": 2.7021, + "step": 51975 + }, + { + "epoch": 2.4198617221873038, + "grad_norm": 0.34077083415337067, + "learning_rate": 1.0961803055864544e-05, + "loss": 2.7324, + "step": 51976 + }, + { + "epoch": 2.419908280373397, + "grad_norm": 0.31295277768121876, + "learning_rate": 1.0960110623820207e-05, + "loss": 2.6812, + "step": 51977 + }, + { + "epoch": 2.4199548385594896, + "grad_norm": 0.31269000521486107, + "learning_rate": 1.0958418306354106e-05, + "loss": 2.6144, + "step": 51978 + }, + { + "epoch": 2.4200013967455827, + "grad_norm": 0.3319715957404281, + "learning_rate": 1.09567261034712e-05, + "loss": 2.6989, + "step": 51979 + }, + { + "epoch": 2.420047954931676, + "grad_norm": 0.3364884778926202, + "learning_rate": 1.0955034015176453e-05, + "loss": 2.72, + "step": 51980 + }, + { + "epoch": 2.420094513117769, + "grad_norm": 
0.30972755734584073, + "learning_rate": 1.0953342041474852e-05, + "loss": 2.5922, + "step": 51981 + }, + { + "epoch": 2.420141071303862, + "grad_norm": 0.3329397792715254, + "learning_rate": 1.0951650182371332e-05, + "loss": 2.7226, + "step": 51982 + }, + { + "epoch": 2.420187629489955, + "grad_norm": 0.32417688449673904, + "learning_rate": 1.0949958437870867e-05, + "loss": 2.571, + "step": 51983 + }, + { + "epoch": 2.4202341876760483, + "grad_norm": 0.3370550505641255, + "learning_rate": 1.094826680797843e-05, + "loss": 2.7723, + "step": 51984 + }, + { + "epoch": 2.4202807458621414, + "grad_norm": 0.32295289015621026, + "learning_rate": 1.094657529269898e-05, + "loss": 2.5885, + "step": 51985 + }, + { + "epoch": 2.420327304048234, + "grad_norm": 0.34225704835556064, + "learning_rate": 1.094488389203749e-05, + "loss": 2.6364, + "step": 51986 + }, + { + "epoch": 2.420373862234327, + "grad_norm": 0.31751400273741526, + "learning_rate": 1.0943192605998926e-05, + "loss": 2.6175, + "step": 51987 + }, + { + "epoch": 2.4204204204204203, + "grad_norm": 0.32189811726270906, + "learning_rate": 1.0941501434588219e-05, + "loss": 2.5907, + "step": 51988 + }, + { + "epoch": 2.4204669786065134, + "grad_norm": 0.34810583112363447, + "learning_rate": 1.0939810377810388e-05, + "loss": 2.7595, + "step": 51989 + }, + { + "epoch": 2.4205135367926065, + "grad_norm": 0.30140200024924085, + "learning_rate": 1.0938119435670346e-05, + "loss": 2.5054, + "step": 51990 + }, + { + "epoch": 2.4205600949786996, + "grad_norm": 0.3363455466654243, + "learning_rate": 1.0936428608173077e-05, + "loss": 2.6369, + "step": 51991 + }, + { + "epoch": 2.4206066531647927, + "grad_norm": 0.3100056877246581, + "learning_rate": 1.0934737895323544e-05, + "loss": 2.6509, + "step": 51992 + }, + { + "epoch": 2.420653211350886, + "grad_norm": 0.346127714692694, + "learning_rate": 1.0933047297126704e-05, + "loss": 2.6861, + "step": 51993 + }, + { + "epoch": 2.420699769536979, + "grad_norm": 0.3325389601269502, + "learning_rate": 1.093135681358754e-05, + "loss": 2.6466, + "step": 51994 + }, + { + "epoch": 2.420746327723072, + "grad_norm": 0.31741526971234474, + "learning_rate": 1.0929666444710973e-05, + "loss": 2.6045, + "step": 51995 + }, + { + "epoch": 2.420792885909165, + "grad_norm": 0.3323218379208915, + "learning_rate": 1.0927976190501993e-05, + "loss": 2.6198, + "step": 51996 + }, + { + "epoch": 2.420839444095258, + "grad_norm": 0.32193220545358886, + "learning_rate": 1.0926286050965551e-05, + "loss": 2.6385, + "step": 51997 + }, + { + "epoch": 2.420886002281351, + "grad_norm": 0.3182687366692228, + "learning_rate": 1.0924596026106609e-05, + "loss": 2.5544, + "step": 51998 + }, + { + "epoch": 2.420932560467444, + "grad_norm": 0.3021223248584053, + "learning_rate": 1.0922906115930131e-05, + "loss": 2.5108, + "step": 51999 + }, + { + "epoch": 2.420979118653537, + "grad_norm": 0.32183680405543325, + "learning_rate": 1.0921216320441085e-05, + "loss": 2.6404, + "step": 52000 + }, + { + "epoch": 2.4210256768396303, + "grad_norm": 0.33064078866334323, + "learning_rate": 1.091952663964439e-05, + "loss": 2.6331, + "step": 52001 + }, + { + "epoch": 2.4210722350257234, + "grad_norm": 0.32142281877379864, + "learning_rate": 1.0917837073545067e-05, + "loss": 2.6067, + "step": 52002 + }, + { + "epoch": 2.4211187932118166, + "grad_norm": 0.31791713099680957, + "learning_rate": 1.091614762214801e-05, + "loss": 2.6032, + "step": 52003 + }, + { + "epoch": 2.4211653513979097, + "grad_norm": 0.33276385330255204, + "learning_rate": 1.091445828545824e-05, + 
"loss": 2.6913, + "step": 52004 + }, + { + "epoch": 2.421211909584003, + "grad_norm": 0.31513785367022074, + "learning_rate": 1.0912769063480666e-05, + "loss": 2.6471, + "step": 52005 + }, + { + "epoch": 2.4212584677700955, + "grad_norm": 0.3067602010703911, + "learning_rate": 1.0911079956220272e-05, + "loss": 2.5872, + "step": 52006 + }, + { + "epoch": 2.4213050259561886, + "grad_norm": 0.3402460431344627, + "learning_rate": 1.0909390963682015e-05, + "loss": 2.7018, + "step": 52007 + }, + { + "epoch": 2.4213515841422817, + "grad_norm": 0.32704347618666557, + "learning_rate": 1.090770208587083e-05, + "loss": 2.6469, + "step": 52008 + }, + { + "epoch": 2.421398142328375, + "grad_norm": 0.32931105491191215, + "learning_rate": 1.0906013322791692e-05, + "loss": 2.7095, + "step": 52009 + }, + { + "epoch": 2.421444700514468, + "grad_norm": 0.3474249585032301, + "learning_rate": 1.0904324674449551e-05, + "loss": 2.6731, + "step": 52010 + }, + { + "epoch": 2.421491258700561, + "grad_norm": 0.3223974781317913, + "learning_rate": 1.0902636140849365e-05, + "loss": 2.5978, + "step": 52011 + }, + { + "epoch": 2.421537816886654, + "grad_norm": 0.31611025138634513, + "learning_rate": 1.0900947721996091e-05, + "loss": 2.5483, + "step": 52012 + }, + { + "epoch": 2.4215843750727473, + "grad_norm": 0.34692245535723903, + "learning_rate": 1.0899259417894698e-05, + "loss": 2.6051, + "step": 52013 + }, + { + "epoch": 2.4216309332588404, + "grad_norm": 0.3249323329199229, + "learning_rate": 1.0897571228550097e-05, + "loss": 2.6485, + "step": 52014 + }, + { + "epoch": 2.4216774914449335, + "grad_norm": 0.3263931644471148, + "learning_rate": 1.0895883153967302e-05, + "loss": 2.5031, + "step": 52015 + }, + { + "epoch": 2.4217240496310266, + "grad_norm": 0.3240533734234195, + "learning_rate": 1.0894195194151213e-05, + "loss": 2.707, + "step": 52016 + }, + { + "epoch": 2.4217706078171193, + "grad_norm": 0.31110631800244753, + "learning_rate": 1.0892507349106828e-05, + "loss": 2.5597, + "step": 52017 + }, + { + "epoch": 2.4218171660032124, + "grad_norm": 0.3415333873441714, + "learning_rate": 1.0890819618839076e-05, + "loss": 2.5982, + "step": 52018 + }, + { + "epoch": 2.4218637241893055, + "grad_norm": 0.3400631745433034, + "learning_rate": 1.0889132003352908e-05, + "loss": 2.6457, + "step": 52019 + }, + { + "epoch": 2.4219102823753986, + "grad_norm": 0.34807073576263076, + "learning_rate": 1.0887444502653304e-05, + "loss": 2.6279, + "step": 52020 + }, + { + "epoch": 2.4219568405614917, + "grad_norm": 0.31916983551810657, + "learning_rate": 1.0885757116745166e-05, + "loss": 2.6583, + "step": 52021 + }, + { + "epoch": 2.422003398747585, + "grad_norm": 0.32634647119446586, + "learning_rate": 1.088406984563351e-05, + "loss": 2.6945, + "step": 52022 + }, + { + "epoch": 2.422049956933678, + "grad_norm": 0.3278426895723482, + "learning_rate": 1.0882382689323234e-05, + "loss": 2.5969, + "step": 52023 + }, + { + "epoch": 2.422096515119771, + "grad_norm": 0.3330497484870537, + "learning_rate": 1.0880695647819316e-05, + "loss": 2.7449, + "step": 52024 + }, + { + "epoch": 2.4221430733058638, + "grad_norm": 0.3148556535137966, + "learning_rate": 1.0879008721126704e-05, + "loss": 2.6155, + "step": 52025 + }, + { + "epoch": 2.422189631491957, + "grad_norm": 0.31724816713669357, + "learning_rate": 1.0877321909250354e-05, + "loss": 2.733, + "step": 52026 + }, + { + "epoch": 2.42223618967805, + "grad_norm": 0.351387102636118, + "learning_rate": 1.087563521219519e-05, + "loss": 2.6608, + "step": 52027 + }, + { + "epoch": 
2.422282747864143, + "grad_norm": 0.3266757643497708, + "learning_rate": 1.0873948629966208e-05, + "loss": 2.6154, + "step": 52028 + }, + { + "epoch": 2.422329306050236, + "grad_norm": 0.30439117142649297, + "learning_rate": 1.0872262162568303e-05, + "loss": 2.5787, + "step": 52029 + }, + { + "epoch": 2.4223758642363293, + "grad_norm": 0.3322727073412116, + "learning_rate": 1.0870575810006483e-05, + "loss": 2.7423, + "step": 52030 + }, + { + "epoch": 2.4224224224224224, + "grad_norm": 0.31183577832365145, + "learning_rate": 1.0868889572285651e-05, + "loss": 2.5655, + "step": 52031 + }, + { + "epoch": 2.4224689806085156, + "grad_norm": 0.311063300017925, + "learning_rate": 1.0867203449410774e-05, + "loss": 2.6419, + "step": 52032 + }, + { + "epoch": 2.4225155387946087, + "grad_norm": 0.3304924651705816, + "learning_rate": 1.0865517441386818e-05, + "loss": 2.6951, + "step": 52033 + }, + { + "epoch": 2.422562096980702, + "grad_norm": 0.3152862767820689, + "learning_rate": 1.0863831548218677e-05, + "loss": 2.6297, + "step": 52034 + }, + { + "epoch": 2.422608655166795, + "grad_norm": 0.3101038335719915, + "learning_rate": 1.0862145769911369e-05, + "loss": 2.5791, + "step": 52035 + }, + { + "epoch": 2.4226552133528876, + "grad_norm": 0.3187237147597497, + "learning_rate": 1.0860460106469794e-05, + "loss": 2.5866, + "step": 52036 + }, + { + "epoch": 2.4227017715389807, + "grad_norm": 0.307055962478368, + "learning_rate": 1.085877455789891e-05, + "loss": 2.6802, + "step": 52037 + }, + { + "epoch": 2.422748329725074, + "grad_norm": 0.34883021927227786, + "learning_rate": 1.0857089124203668e-05, + "loss": 2.6218, + "step": 52038 + }, + { + "epoch": 2.422794887911167, + "grad_norm": 0.32176848092521576, + "learning_rate": 1.0855403805389014e-05, + "loss": 2.6651, + "step": 52039 + }, + { + "epoch": 2.42284144609726, + "grad_norm": 0.32278918806330276, + "learning_rate": 1.085371860145989e-05, + "loss": 2.5404, + "step": 52040 + }, + { + "epoch": 2.422888004283353, + "grad_norm": 0.3236445269455972, + "learning_rate": 1.0852033512421261e-05, + "loss": 2.6501, + "step": 52041 + }, + { + "epoch": 2.4229345624694463, + "grad_norm": 0.31052894318543, + "learning_rate": 1.0850348538278032e-05, + "loss": 2.649, + "step": 52042 + }, + { + "epoch": 2.4229811206555394, + "grad_norm": 0.32659117714434255, + "learning_rate": 1.0848663679035198e-05, + "loss": 2.5726, + "step": 52043 + }, + { + "epoch": 2.4230276788416325, + "grad_norm": 0.319225016008728, + "learning_rate": 1.0846978934697665e-05, + "loss": 2.5515, + "step": 52044 + }, + { + "epoch": 2.423074237027725, + "grad_norm": 0.3198143793124511, + "learning_rate": 1.0845294305270393e-05, + "loss": 2.5424, + "step": 52045 + }, + { + "epoch": 2.4231207952138183, + "grad_norm": 0.3027001239236059, + "learning_rate": 1.0843609790758335e-05, + "loss": 2.6644, + "step": 52046 + }, + { + "epoch": 2.4231673533999114, + "grad_norm": 0.31879904438478196, + "learning_rate": 1.0841925391166402e-05, + "loss": 2.5798, + "step": 52047 + }, + { + "epoch": 2.4232139115860045, + "grad_norm": 0.3189030264024187, + "learning_rate": 1.0840241106499587e-05, + "loss": 2.7479, + "step": 52048 + }, + { + "epoch": 2.4232604697720976, + "grad_norm": 0.3373931434336312, + "learning_rate": 1.0838556936762789e-05, + "loss": 2.7521, + "step": 52049 + }, + { + "epoch": 2.4233070279581908, + "grad_norm": 0.32559155333729567, + "learning_rate": 1.0836872881960974e-05, + "loss": 2.6348, + "step": 52050 + }, + { + "epoch": 2.423353586144284, + "grad_norm": 0.31619410298167, + 
"learning_rate": 1.0835188942099073e-05, + "loss": 2.7108, + "step": 52051 + }, + { + "epoch": 2.423400144330377, + "grad_norm": 0.33662113232937413, + "learning_rate": 1.0833505117182035e-05, + "loss": 2.6433, + "step": 52052 + }, + { + "epoch": 2.42344670251647, + "grad_norm": 0.31163739777151295, + "learning_rate": 1.0831821407214804e-05, + "loss": 2.6717, + "step": 52053 + }, + { + "epoch": 2.423493260702563, + "grad_norm": 0.30823887783317433, + "learning_rate": 1.0830137812202335e-05, + "loss": 2.5133, + "step": 52054 + }, + { + "epoch": 2.4235398188886563, + "grad_norm": 0.3370779490878816, + "learning_rate": 1.0828454332149519e-05, + "loss": 2.5976, + "step": 52055 + }, + { + "epoch": 2.423586377074749, + "grad_norm": 0.33149820295121674, + "learning_rate": 1.0826770967061362e-05, + "loss": 2.6154, + "step": 52056 + }, + { + "epoch": 2.423632935260842, + "grad_norm": 0.3137621868977411, + "learning_rate": 1.0825087716942756e-05, + "loss": 2.6192, + "step": 52057 + }, + { + "epoch": 2.4236794934469352, + "grad_norm": 0.3054270300091899, + "learning_rate": 1.0823404581798657e-05, + "loss": 2.647, + "step": 52058 + }, + { + "epoch": 2.4237260516330283, + "grad_norm": 0.32751067553563007, + "learning_rate": 1.0821721561634024e-05, + "loss": 2.6889, + "step": 52059 + }, + { + "epoch": 2.4237726098191215, + "grad_norm": 0.3212785043932954, + "learning_rate": 1.0820038656453752e-05, + "loss": 2.6278, + "step": 52060 + }, + { + "epoch": 2.4238191680052146, + "grad_norm": 0.32301858346735574, + "learning_rate": 1.0818355866262831e-05, + "loss": 2.6624, + "step": 52061 + }, + { + "epoch": 2.4238657261913077, + "grad_norm": 0.3174802411272405, + "learning_rate": 1.0816673191066163e-05, + "loss": 2.5811, + "step": 52062 + }, + { + "epoch": 2.423912284377401, + "grad_norm": 0.305771470139602, + "learning_rate": 1.0814990630868699e-05, + "loss": 2.661, + "step": 52063 + }, + { + "epoch": 2.4239588425634935, + "grad_norm": 0.3342029571296594, + "learning_rate": 1.0813308185675376e-05, + "loss": 2.5784, + "step": 52064 + }, + { + "epoch": 2.4240054007495866, + "grad_norm": 0.34377494222768107, + "learning_rate": 1.081162585549113e-05, + "loss": 2.6891, + "step": 52065 + }, + { + "epoch": 2.4240519589356797, + "grad_norm": 0.31037297825701016, + "learning_rate": 1.0809943640320908e-05, + "loss": 2.5744, + "step": 52066 + }, + { + "epoch": 2.424098517121773, + "grad_norm": 0.3159749800929692, + "learning_rate": 1.080826154016965e-05, + "loss": 2.6333, + "step": 52067 + }, + { + "epoch": 2.424145075307866, + "grad_norm": 0.3213050153430026, + "learning_rate": 1.0806579555042257e-05, + "loss": 2.6451, + "step": 52068 + }, + { + "epoch": 2.424191633493959, + "grad_norm": 0.3304570550357493, + "learning_rate": 1.0804897684943715e-05, + "loss": 2.6593, + "step": 52069 + }, + { + "epoch": 2.424238191680052, + "grad_norm": 0.32808208592123084, + "learning_rate": 1.0803215929878929e-05, + "loss": 2.5521, + "step": 52070 + }, + { + "epoch": 2.4242847498661453, + "grad_norm": 0.32370251851203286, + "learning_rate": 1.0801534289852844e-05, + "loss": 2.678, + "step": 52071 + }, + { + "epoch": 2.4243313080522384, + "grad_norm": 0.31753771592471874, + "learning_rate": 1.0799852764870406e-05, + "loss": 2.678, + "step": 52072 + }, + { + "epoch": 2.4243778662383315, + "grad_norm": 0.341920251905573, + "learning_rate": 1.0798171354936509e-05, + "loss": 2.6139, + "step": 52073 + }, + { + "epoch": 2.4244244244244246, + "grad_norm": 0.3056662332378984, + "learning_rate": 1.0796490060056142e-05, + "loss": 2.588, + 
"step": 52074 + }, + { + "epoch": 2.4244709826105173, + "grad_norm": 0.3214921383662438, + "learning_rate": 1.0794808880234191e-05, + "loss": 2.6176, + "step": 52075 + }, + { + "epoch": 2.4245175407966104, + "grad_norm": 0.3117760097464053, + "learning_rate": 1.079312781547564e-05, + "loss": 2.6444, + "step": 52076 + }, + { + "epoch": 2.4245640989827035, + "grad_norm": 0.31370588693807905, + "learning_rate": 1.0791446865785387e-05, + "loss": 2.7101, + "step": 52077 + }, + { + "epoch": 2.4246106571687966, + "grad_norm": 0.311182575405765, + "learning_rate": 1.078976603116837e-05, + "loss": 2.5956, + "step": 52078 + }, + { + "epoch": 2.4246572153548898, + "grad_norm": 0.3101645637061151, + "learning_rate": 1.0788085311629525e-05, + "loss": 2.6699, + "step": 52079 + }, + { + "epoch": 2.424703773540983, + "grad_norm": 0.32483380811271184, + "learning_rate": 1.0786404707173803e-05, + "loss": 2.6908, + "step": 52080 + }, + { + "epoch": 2.424750331727076, + "grad_norm": 0.31232398411648854, + "learning_rate": 1.0784724217806092e-05, + "loss": 2.6349, + "step": 52081 + }, + { + "epoch": 2.424796889913169, + "grad_norm": 0.30489742338745357, + "learning_rate": 1.0783043843531382e-05, + "loss": 2.6977, + "step": 52082 + }, + { + "epoch": 2.424843448099262, + "grad_norm": 0.32306968411258313, + "learning_rate": 1.078136358435456e-05, + "loss": 2.5754, + "step": 52083 + }, + { + "epoch": 2.424890006285355, + "grad_norm": 0.3065934921559018, + "learning_rate": 1.077968344028057e-05, + "loss": 2.5143, + "step": 52084 + }, + { + "epoch": 2.424936564471448, + "grad_norm": 0.30751798447488154, + "learning_rate": 1.0778003411314363e-05, + "loss": 2.4732, + "step": 52085 + }, + { + "epoch": 2.424983122657541, + "grad_norm": 0.304847987839334, + "learning_rate": 1.0776323497460822e-05, + "loss": 2.6261, + "step": 52086 + }, + { + "epoch": 2.4250296808436342, + "grad_norm": 0.31514277004206104, + "learning_rate": 1.0774643698724935e-05, + "loss": 2.6188, + "step": 52087 + }, + { + "epoch": 2.4250762390297274, + "grad_norm": 0.29349350701262616, + "learning_rate": 1.0772964015111576e-05, + "loss": 2.6119, + "step": 52088 + }, + { + "epoch": 2.4251227972158205, + "grad_norm": 0.31685825897865316, + "learning_rate": 1.0771284446625729e-05, + "loss": 2.6431, + "step": 52089 + }, + { + "epoch": 2.4251693554019136, + "grad_norm": 0.31554465484170646, + "learning_rate": 1.0769604993272286e-05, + "loss": 2.6857, + "step": 52090 + }, + { + "epoch": 2.4252159135880067, + "grad_norm": 0.33017393447309146, + "learning_rate": 1.0767925655056188e-05, + "loss": 2.6221, + "step": 52091 + }, + { + "epoch": 2.4252624717741, + "grad_norm": 0.2866800342167554, + "learning_rate": 1.0766246431982362e-05, + "loss": 2.4975, + "step": 52092 + }, + { + "epoch": 2.425309029960193, + "grad_norm": 0.312055471527912, + "learning_rate": 1.076456732405574e-05, + "loss": 2.6339, + "step": 52093 + }, + { + "epoch": 2.425355588146286, + "grad_norm": 0.3025028473904327, + "learning_rate": 1.0762888331281245e-05, + "loss": 2.7361, + "step": 52094 + }, + { + "epoch": 2.4254021463323787, + "grad_norm": 0.3196935922853253, + "learning_rate": 1.0761209453663824e-05, + "loss": 2.6246, + "step": 52095 + }, + { + "epoch": 2.425448704518472, + "grad_norm": 0.30980734603358023, + "learning_rate": 1.075953069120837e-05, + "loss": 2.7179, + "step": 52096 + }, + { + "epoch": 2.425495262704565, + "grad_norm": 0.3172419456528197, + "learning_rate": 1.075785204391983e-05, + "loss": 2.7384, + "step": 52097 + }, + { + "epoch": 2.425541820890658, + "grad_norm": 
0.3131747662323118, + "learning_rate": 1.0756173511803141e-05, + "loss": 2.5859, + "step": 52098 + }, + { + "epoch": 2.425588379076751, + "grad_norm": 0.3295507678692245, + "learning_rate": 1.0754495094863187e-05, + "loss": 2.6675, + "step": 52099 + }, + { + "epoch": 2.4256349372628443, + "grad_norm": 0.32390778346151344, + "learning_rate": 1.0752816793104953e-05, + "loss": 2.5611, + "step": 52100 + }, + { + "epoch": 2.4256814954489374, + "grad_norm": 0.3243038998475223, + "learning_rate": 1.0751138606533307e-05, + "loss": 2.6985, + "step": 52101 + }, + { + "epoch": 2.4257280536350305, + "grad_norm": 0.3226755197244874, + "learning_rate": 1.0749460535153232e-05, + "loss": 2.6032, + "step": 52102 + }, + { + "epoch": 2.4257746118211236, + "grad_norm": 0.3121024442758975, + "learning_rate": 1.0747782578969611e-05, + "loss": 2.645, + "step": 52103 + }, + { + "epoch": 2.4258211700072163, + "grad_norm": 0.3128333972491102, + "learning_rate": 1.0746104737987379e-05, + "loss": 2.6323, + "step": 52104 + }, + { + "epoch": 2.4258677281933094, + "grad_norm": 0.3357187589789824, + "learning_rate": 1.0744427012211461e-05, + "loss": 2.582, + "step": 52105 + }, + { + "epoch": 2.4259142863794025, + "grad_norm": 0.32647337432211837, + "learning_rate": 1.0742749401646785e-05, + "loss": 2.6764, + "step": 52106 + }, + { + "epoch": 2.4259608445654957, + "grad_norm": 0.31087422712730617, + "learning_rate": 1.0741071906298272e-05, + "loss": 2.5438, + "step": 52107 + }, + { + "epoch": 2.4260074027515888, + "grad_norm": 0.3469940976947467, + "learning_rate": 1.0739394526170854e-05, + "loss": 2.6443, + "step": 52108 + }, + { + "epoch": 2.426053960937682, + "grad_norm": 0.3414947120578864, + "learning_rate": 1.0737717261269437e-05, + "loss": 2.51, + "step": 52109 + }, + { + "epoch": 2.426100519123775, + "grad_norm": 0.3330612172252512, + "learning_rate": 1.073604011159895e-05, + "loss": 2.7128, + "step": 52110 + }, + { + "epoch": 2.426147077309868, + "grad_norm": 0.3364106457498104, + "learning_rate": 1.0734363077164317e-05, + "loss": 2.6866, + "step": 52111 + }, + { + "epoch": 2.4261936354959612, + "grad_norm": 0.3324068561225904, + "learning_rate": 1.0732686157970462e-05, + "loss": 2.6764, + "step": 52112 + }, + { + "epoch": 2.4262401936820543, + "grad_norm": 0.32468962026097037, + "learning_rate": 1.0731009354022315e-05, + "loss": 2.6162, + "step": 52113 + }, + { + "epoch": 2.4262867518681475, + "grad_norm": 0.3158315724753586, + "learning_rate": 1.0729332665324764e-05, + "loss": 2.528, + "step": 52114 + }, + { + "epoch": 2.42633331005424, + "grad_norm": 0.32098277310123385, + "learning_rate": 1.0727656091882777e-05, + "loss": 2.6923, + "step": 52115 + }, + { + "epoch": 2.4263798682403332, + "grad_norm": 0.29976616201537043, + "learning_rate": 1.0725979633701238e-05, + "loss": 2.5976, + "step": 52116 + }, + { + "epoch": 2.4264264264264264, + "grad_norm": 0.336610390322823, + "learning_rate": 1.0724303290785076e-05, + "loss": 2.7228, + "step": 52117 + }, + { + "epoch": 2.4264729846125195, + "grad_norm": 0.32238447280598154, + "learning_rate": 1.0722627063139218e-05, + "loss": 2.6555, + "step": 52118 + }, + { + "epoch": 2.4265195427986126, + "grad_norm": 0.29525271405800246, + "learning_rate": 1.0720950950768582e-05, + "loss": 2.635, + "step": 52119 + }, + { + "epoch": 2.4265661009847057, + "grad_norm": 0.3174620886176617, + "learning_rate": 1.0719274953678083e-05, + "loss": 2.5398, + "step": 52120 + }, + { + "epoch": 2.426612659170799, + "grad_norm": 0.33123900139314394, + "learning_rate": 1.0717599071872653e-05, + 
"loss": 2.6456, + "step": 52121 + }, + { + "epoch": 2.426659217356892, + "grad_norm": 0.3126637770346401, + "learning_rate": 1.071592330535719e-05, + "loss": 2.6646, + "step": 52122 + }, + { + "epoch": 2.4267057755429846, + "grad_norm": 0.31116481441839766, + "learning_rate": 1.0714247654136621e-05, + "loss": 2.6988, + "step": 52123 + }, + { + "epoch": 2.4267523337290777, + "grad_norm": 0.3280587774777979, + "learning_rate": 1.0712572118215863e-05, + "loss": 2.6563, + "step": 52124 + }, + { + "epoch": 2.426798891915171, + "grad_norm": 0.3293879865888098, + "learning_rate": 1.0710896697599837e-05, + "loss": 2.592, + "step": 52125 + }, + { + "epoch": 2.426845450101264, + "grad_norm": 0.30377896069638854, + "learning_rate": 1.0709221392293472e-05, + "loss": 2.5677, + "step": 52126 + }, + { + "epoch": 2.426892008287357, + "grad_norm": 0.324745252146022, + "learning_rate": 1.0707546202301649e-05, + "loss": 2.7169, + "step": 52127 + }, + { + "epoch": 2.42693856647345, + "grad_norm": 0.32740045109749566, + "learning_rate": 1.070587112762933e-05, + "loss": 2.6095, + "step": 52128 + }, + { + "epoch": 2.4269851246595433, + "grad_norm": 0.32863159600996633, + "learning_rate": 1.0704196168281378e-05, + "loss": 2.6569, + "step": 52129 + }, + { + "epoch": 2.4270316828456364, + "grad_norm": 0.31246933046148045, + "learning_rate": 1.0702521324262771e-05, + "loss": 2.5828, + "step": 52130 + }, + { + "epoch": 2.4270782410317295, + "grad_norm": 0.3487879351319498, + "learning_rate": 1.0700846595578373e-05, + "loss": 2.708, + "step": 52131 + }, + { + "epoch": 2.4271247992178226, + "grad_norm": 0.3372181932585271, + "learning_rate": 1.0699171982233126e-05, + "loss": 2.6345, + "step": 52132 + }, + { + "epoch": 2.4271713574039158, + "grad_norm": 0.3245025562385784, + "learning_rate": 1.0697497484231933e-05, + "loss": 2.637, + "step": 52133 + }, + { + "epoch": 2.4272179155900084, + "grad_norm": 0.32921394608638616, + "learning_rate": 1.0695823101579728e-05, + "loss": 2.6654, + "step": 52134 + }, + { + "epoch": 2.4272644737761015, + "grad_norm": 0.3168658286792658, + "learning_rate": 1.0694148834281393e-05, + "loss": 2.6063, + "step": 52135 + }, + { + "epoch": 2.4273110319621947, + "grad_norm": 0.3383296915176533, + "learning_rate": 1.0692474682341863e-05, + "loss": 2.6938, + "step": 52136 + }, + { + "epoch": 2.427357590148288, + "grad_norm": 0.3076041459135722, + "learning_rate": 1.0690800645766046e-05, + "loss": 2.4983, + "step": 52137 + }, + { + "epoch": 2.427404148334381, + "grad_norm": 0.33776458557207034, + "learning_rate": 1.0689126724558856e-05, + "loss": 2.6218, + "step": 52138 + }, + { + "epoch": 2.427450706520474, + "grad_norm": 0.31703705880626204, + "learning_rate": 1.0687452918725222e-05, + "loss": 2.6256, + "step": 52139 + }, + { + "epoch": 2.427497264706567, + "grad_norm": 0.34464390575223025, + "learning_rate": 1.068577922827001e-05, + "loss": 2.6521, + "step": 52140 + }, + { + "epoch": 2.4275438228926602, + "grad_norm": 0.33306793801981366, + "learning_rate": 1.068410565319819e-05, + "loss": 2.6447, + "step": 52141 + }, + { + "epoch": 2.4275903810787534, + "grad_norm": 0.3140579039919164, + "learning_rate": 1.0682432193514625e-05, + "loss": 2.597, + "step": 52142 + }, + { + "epoch": 2.427636939264846, + "grad_norm": 0.3230358495599561, + "learning_rate": 1.068075884922427e-05, + "loss": 2.6734, + "step": 52143 + }, + { + "epoch": 2.427683497450939, + "grad_norm": 0.32082322375793143, + "learning_rate": 1.0679085620332002e-05, + "loss": 2.6955, + "step": 52144 + }, + { + "epoch": 
2.4277300556370323, + "grad_norm": 0.3042780178706775, + "learning_rate": 1.0677412506842743e-05, + "loss": 2.6445, + "step": 52145 + }, + { + "epoch": 2.4277766138231254, + "grad_norm": 0.3165226379654488, + "learning_rate": 1.0675739508761407e-05, + "loss": 2.6509, + "step": 52146 + }, + { + "epoch": 2.4278231720092185, + "grad_norm": 0.3175939169112798, + "learning_rate": 1.0674066626092899e-05, + "loss": 2.6375, + "step": 52147 + }, + { + "epoch": 2.4278697301953116, + "grad_norm": 0.3071973346061272, + "learning_rate": 1.067239385884215e-05, + "loss": 2.595, + "step": 52148 + }, + { + "epoch": 2.4279162883814047, + "grad_norm": 0.3107674329255352, + "learning_rate": 1.0670721207014034e-05, + "loss": 2.5779, + "step": 52149 + }, + { + "epoch": 2.427962846567498, + "grad_norm": 0.3344926215001956, + "learning_rate": 1.0669048670613474e-05, + "loss": 2.7024, + "step": 52150 + }, + { + "epoch": 2.428009404753591, + "grad_norm": 0.3229605573549676, + "learning_rate": 1.0667376249645383e-05, + "loss": 2.6712, + "step": 52151 + }, + { + "epoch": 2.428055962939684, + "grad_norm": 0.3446678939481438, + "learning_rate": 1.066570394411468e-05, + "loss": 2.6823, + "step": 52152 + }, + { + "epoch": 2.428102521125777, + "grad_norm": 0.3064969023924344, + "learning_rate": 1.0664031754026237e-05, + "loss": 2.5839, + "step": 52153 + }, + { + "epoch": 2.42814907931187, + "grad_norm": 0.31788976224152404, + "learning_rate": 1.066235967938501e-05, + "loss": 2.5414, + "step": 52154 + }, + { + "epoch": 2.428195637497963, + "grad_norm": 0.31456651693585896, + "learning_rate": 1.0660687720195856e-05, + "loss": 2.6626, + "step": 52155 + }, + { + "epoch": 2.428242195684056, + "grad_norm": 0.32948878257279113, + "learning_rate": 1.0659015876463736e-05, + "loss": 2.6325, + "step": 52156 + }, + { + "epoch": 2.428288753870149, + "grad_norm": 0.30528382258680103, + "learning_rate": 1.065734414819351e-05, + "loss": 2.5667, + "step": 52157 + }, + { + "epoch": 2.4283353120562423, + "grad_norm": 0.3234166991283847, + "learning_rate": 1.065567253539011e-05, + "loss": 2.5563, + "step": 52158 + }, + { + "epoch": 2.4283818702423354, + "grad_norm": 0.3215985088282893, + "learning_rate": 1.0654001038058431e-05, + "loss": 2.5846, + "step": 52159 + }, + { + "epoch": 2.4284284284284285, + "grad_norm": 0.3412805794112801, + "learning_rate": 1.065232965620338e-05, + "loss": 2.5614, + "step": 52160 + }, + { + "epoch": 2.4284749866145217, + "grad_norm": 0.338981679617571, + "learning_rate": 1.0650658389829887e-05, + "loss": 2.6489, + "step": 52161 + }, + { + "epoch": 2.4285215448006143, + "grad_norm": 0.3465258809898832, + "learning_rate": 1.0648987238942814e-05, + "loss": 2.6779, + "step": 52162 + }, + { + "epoch": 2.4285681029867074, + "grad_norm": 0.32899228228258304, + "learning_rate": 1.0647316203547092e-05, + "loss": 2.6434, + "step": 52163 + }, + { + "epoch": 2.4286146611728006, + "grad_norm": 0.3089510108285097, + "learning_rate": 1.0645645283647615e-05, + "loss": 2.5831, + "step": 52164 + }, + { + "epoch": 2.4286612193588937, + "grad_norm": 0.31647768676117227, + "learning_rate": 1.0643974479249297e-05, + "loss": 2.6374, + "step": 52165 + }, + { + "epoch": 2.428707777544987, + "grad_norm": 0.3194426366607025, + "learning_rate": 1.0642303790357034e-05, + "loss": 2.6272, + "step": 52166 + }, + { + "epoch": 2.42875433573108, + "grad_norm": 0.3118872793127245, + "learning_rate": 1.0640633216975743e-05, + "loss": 2.6204, + "step": 52167 + }, + { + "epoch": 2.428800893917173, + "grad_norm": 0.31314650734030414, + 
"learning_rate": 1.0638962759110294e-05, + "loss": 2.7238, + "step": 52168 + }, + { + "epoch": 2.428847452103266, + "grad_norm": 0.3330237853960573, + "learning_rate": 1.0637292416765637e-05, + "loss": 2.6314, + "step": 52169 + }, + { + "epoch": 2.4288940102893593, + "grad_norm": 0.31336906106199747, + "learning_rate": 1.0635622189946637e-05, + "loss": 2.69, + "step": 52170 + }, + { + "epoch": 2.4289405684754524, + "grad_norm": 0.31585175796451015, + "learning_rate": 1.0633952078658204e-05, + "loss": 2.5834, + "step": 52171 + }, + { + "epoch": 2.4289871266615455, + "grad_norm": 0.3149503239903841, + "learning_rate": 1.0632282082905243e-05, + "loss": 2.7425, + "step": 52172 + }, + { + "epoch": 2.429033684847638, + "grad_norm": 0.30320854253031737, + "learning_rate": 1.0630612202692657e-05, + "loss": 2.5814, + "step": 52173 + }, + { + "epoch": 2.4290802430337313, + "grad_norm": 0.3219468567478164, + "learning_rate": 1.0628942438025363e-05, + "loss": 2.5832, + "step": 52174 + }, + { + "epoch": 2.4291268012198244, + "grad_norm": 0.314149383596222, + "learning_rate": 1.0627272788908228e-05, + "loss": 2.5634, + "step": 52175 + }, + { + "epoch": 2.4291733594059175, + "grad_norm": 0.3007544882805387, + "learning_rate": 1.0625603255346166e-05, + "loss": 2.6301, + "step": 52176 + }, + { + "epoch": 2.4292199175920106, + "grad_norm": 0.325537451162334, + "learning_rate": 1.062393383734408e-05, + "loss": 2.6917, + "step": 52177 + }, + { + "epoch": 2.4292664757781037, + "grad_norm": 0.3274097113783427, + "learning_rate": 1.0622264534906873e-05, + "loss": 2.6738, + "step": 52178 + }, + { + "epoch": 2.429313033964197, + "grad_norm": 0.341427859040427, + "learning_rate": 1.0620595348039435e-05, + "loss": 2.5767, + "step": 52179 + }, + { + "epoch": 2.42935959215029, + "grad_norm": 0.3007229807421651, + "learning_rate": 1.0618926276746688e-05, + "loss": 2.5689, + "step": 52180 + }, + { + "epoch": 2.429406150336383, + "grad_norm": 0.3245756675566038, + "learning_rate": 1.0617257321033486e-05, + "loss": 2.6076, + "step": 52181 + }, + { + "epoch": 2.4294527085224757, + "grad_norm": 0.3246027494952313, + "learning_rate": 1.0615588480904776e-05, + "loss": 2.6798, + "step": 52182 + }, + { + "epoch": 2.429499266708569, + "grad_norm": 0.328066620278083, + "learning_rate": 1.0613919756365426e-05, + "loss": 2.5741, + "step": 52183 + }, + { + "epoch": 2.429545824894662, + "grad_norm": 0.3114794001378945, + "learning_rate": 1.0612251147420338e-05, + "loss": 2.7514, + "step": 52184 + }, + { + "epoch": 2.429592383080755, + "grad_norm": 0.31461016433044087, + "learning_rate": 1.0610582654074414e-05, + "loss": 2.6033, + "step": 52185 + }, + { + "epoch": 2.429638941266848, + "grad_norm": 0.3121436361831237, + "learning_rate": 1.0608914276332548e-05, + "loss": 2.6578, + "step": 52186 + }, + { + "epoch": 2.4296854994529413, + "grad_norm": 0.3295234172679407, + "learning_rate": 1.0607246014199652e-05, + "loss": 2.5626, + "step": 52187 + }, + { + "epoch": 2.4297320576390344, + "grad_norm": 0.3564508741475425, + "learning_rate": 1.0605577867680594e-05, + "loss": 2.5787, + "step": 52188 + }, + { + "epoch": 2.4297786158251276, + "grad_norm": 0.33317420289647026, + "learning_rate": 1.0603909836780284e-05, + "loss": 2.6203, + "step": 52189 + }, + { + "epoch": 2.4298251740112207, + "grad_norm": 0.3111323732411502, + "learning_rate": 1.060224192150362e-05, + "loss": 2.6084, + "step": 52190 + }, + { + "epoch": 2.429871732197314, + "grad_norm": 0.33940427124119477, + "learning_rate": 1.0600574121855495e-05, + "loss": 2.699, + "step": 
52191 + }, + { + "epoch": 2.429918290383407, + "grad_norm": 0.35327242779304724, + "learning_rate": 1.05989064378408e-05, + "loss": 2.7631, + "step": 52192 + }, + { + "epoch": 2.4299648485694996, + "grad_norm": 0.3270906325181477, + "learning_rate": 1.0597238869464448e-05, + "loss": 2.6393, + "step": 52193 + }, + { + "epoch": 2.4300114067555927, + "grad_norm": 0.3300838467176391, + "learning_rate": 1.0595571416731292e-05, + "loss": 2.5302, + "step": 52194 + }, + { + "epoch": 2.430057964941686, + "grad_norm": 0.32268463324413166, + "learning_rate": 1.0593904079646277e-05, + "loss": 2.637, + "step": 52195 + }, + { + "epoch": 2.430104523127779, + "grad_norm": 0.33676870442260276, + "learning_rate": 1.0592236858214255e-05, + "loss": 2.6628, + "step": 52196 + }, + { + "epoch": 2.430151081313872, + "grad_norm": 0.32305294251072836, + "learning_rate": 1.0590569752440143e-05, + "loss": 2.6456, + "step": 52197 + }, + { + "epoch": 2.430197639499965, + "grad_norm": 0.3341523843912296, + "learning_rate": 1.0588902762328822e-05, + "loss": 2.6917, + "step": 52198 + }, + { + "epoch": 2.4302441976860583, + "grad_norm": 0.3260177590402063, + "learning_rate": 1.0587235887885189e-05, + "loss": 2.6805, + "step": 52199 + }, + { + "epoch": 2.4302907558721514, + "grad_norm": 0.318980073953196, + "learning_rate": 1.0585569129114148e-05, + "loss": 2.5494, + "step": 52200 + }, + { + "epoch": 2.430337314058244, + "grad_norm": 0.31659821886226086, + "learning_rate": 1.0583902486020552e-05, + "loss": 2.6545, + "step": 52201 + }, + { + "epoch": 2.430383872244337, + "grad_norm": 0.3213592357772792, + "learning_rate": 1.0582235958609349e-05, + "loss": 2.6342, + "step": 52202 + }, + { + "epoch": 2.4304304304304303, + "grad_norm": 0.31929111576104247, + "learning_rate": 1.0580569546885383e-05, + "loss": 2.6408, + "step": 52203 + }, + { + "epoch": 2.4304769886165234, + "grad_norm": 0.3187862274920592, + "learning_rate": 1.0578903250853567e-05, + "loss": 2.5755, + "step": 52204 + }, + { + "epoch": 2.4305235468026165, + "grad_norm": 0.2946753644758263, + "learning_rate": 1.057723707051878e-05, + "loss": 2.6495, + "step": 52205 + }, + { + "epoch": 2.4305701049887096, + "grad_norm": 0.33108679795438006, + "learning_rate": 1.0575571005885937e-05, + "loss": 2.6773, + "step": 52206 + }, + { + "epoch": 2.4306166631748027, + "grad_norm": 0.31401297601123307, + "learning_rate": 1.0573905056959882e-05, + "loss": 2.5649, + "step": 52207 + }, + { + "epoch": 2.430663221360896, + "grad_norm": 0.33489110793462884, + "learning_rate": 1.0572239223745556e-05, + "loss": 2.6573, + "step": 52208 + }, + { + "epoch": 2.430709779546989, + "grad_norm": 0.3195098879938254, + "learning_rate": 1.057057350624781e-05, + "loss": 2.6168, + "step": 52209 + }, + { + "epoch": 2.430756337733082, + "grad_norm": 0.32098933115368095, + "learning_rate": 1.0568907904471548e-05, + "loss": 2.6775, + "step": 52210 + }, + { + "epoch": 2.430802895919175, + "grad_norm": 0.34375637227806166, + "learning_rate": 1.0567242418421658e-05, + "loss": 2.5779, + "step": 52211 + }, + { + "epoch": 2.430849454105268, + "grad_norm": 0.3489734444891859, + "learning_rate": 1.0565577048103021e-05, + "loss": 2.5833, + "step": 52212 + }, + { + "epoch": 2.430896012291361, + "grad_norm": 0.33236113842447723, + "learning_rate": 1.0563911793520547e-05, + "loss": 2.5852, + "step": 52213 + }, + { + "epoch": 2.430942570477454, + "grad_norm": 0.3251897625601736, + "learning_rate": 1.0562246654679081e-05, + "loss": 2.5331, + "step": 52214 + }, + { + "epoch": 2.430989128663547, + "grad_norm": 
0.3559185394448073, + "learning_rate": 1.0560581631583566e-05, + "loss": 2.5983, + "step": 52215 + }, + { + "epoch": 2.4310356868496403, + "grad_norm": 0.3226733767549674, + "learning_rate": 1.0558916724238837e-05, + "loss": 2.7423, + "step": 52216 + }, + { + "epoch": 2.4310822450357334, + "grad_norm": 0.3220202275576454, + "learning_rate": 1.0557251932649808e-05, + "loss": 2.6219, + "step": 52217 + }, + { + "epoch": 2.4311288032218266, + "grad_norm": 0.3331148033750785, + "learning_rate": 1.0555587256821359e-05, + "loss": 2.6651, + "step": 52218 + }, + { + "epoch": 2.4311753614079197, + "grad_norm": 0.30954391144106946, + "learning_rate": 1.0553922696758373e-05, + "loss": 2.6385, + "step": 52219 + }, + { + "epoch": 2.431221919594013, + "grad_norm": 0.3281269296794527, + "learning_rate": 1.0552258252465735e-05, + "loss": 2.6659, + "step": 52220 + }, + { + "epoch": 2.4312684777801055, + "grad_norm": 0.3450110644762499, + "learning_rate": 1.055059392394835e-05, + "loss": 2.5459, + "step": 52221 + }, + { + "epoch": 2.4313150359661986, + "grad_norm": 0.3136383408135314, + "learning_rate": 1.054892971121107e-05, + "loss": 2.5799, + "step": 52222 + }, + { + "epoch": 2.4313615941522917, + "grad_norm": 0.31034640847408085, + "learning_rate": 1.0547265614258794e-05, + "loss": 2.6427, + "step": 52223 + }, + { + "epoch": 2.431408152338385, + "grad_norm": 0.3085160245377549, + "learning_rate": 1.0545601633096413e-05, + "loss": 2.6039, + "step": 52224 + }, + { + "epoch": 2.431454710524478, + "grad_norm": 0.3155193728458987, + "learning_rate": 1.05439377677288e-05, + "loss": 2.6181, + "step": 52225 + }, + { + "epoch": 2.431501268710571, + "grad_norm": 0.3303847695228604, + "learning_rate": 1.0542274018160852e-05, + "loss": 2.6954, + "step": 52226 + }, + { + "epoch": 2.431547826896664, + "grad_norm": 0.3022554305043069, + "learning_rate": 1.054061038439742e-05, + "loss": 2.576, + "step": 52227 + }, + { + "epoch": 2.4315943850827573, + "grad_norm": 0.32547573361951354, + "learning_rate": 1.0538946866443438e-05, + "loss": 2.72, + "step": 52228 + }, + { + "epoch": 2.4316409432688504, + "grad_norm": 0.3209935824597515, + "learning_rate": 1.0537283464303743e-05, + "loss": 2.6799, + "step": 52229 + }, + { + "epoch": 2.4316875014549435, + "grad_norm": 0.3234641780303601, + "learning_rate": 1.0535620177983236e-05, + "loss": 2.6444, + "step": 52230 + }, + { + "epoch": 2.4317340596410366, + "grad_norm": 0.3249575894256843, + "learning_rate": 1.0533957007486794e-05, + "loss": 2.6905, + "step": 52231 + }, + { + "epoch": 2.4317806178271293, + "grad_norm": 0.3171827436114067, + "learning_rate": 1.05322939528193e-05, + "loss": 2.5815, + "step": 52232 + }, + { + "epoch": 2.4318271760132224, + "grad_norm": 0.331895230991268, + "learning_rate": 1.053063101398563e-05, + "loss": 2.6716, + "step": 52233 + }, + { + "epoch": 2.4318737341993155, + "grad_norm": 0.2987674437942012, + "learning_rate": 1.0528968190990691e-05, + "loss": 2.6508, + "step": 52234 + }, + { + "epoch": 2.4319202923854086, + "grad_norm": 0.31081728831118965, + "learning_rate": 1.0527305483839328e-05, + "loss": 2.6605, + "step": 52235 + }, + { + "epoch": 2.4319668505715017, + "grad_norm": 0.3187848837418406, + "learning_rate": 1.0525642892536431e-05, + "loss": 2.644, + "step": 52236 + }, + { + "epoch": 2.432013408757595, + "grad_norm": 0.33799629929976255, + "learning_rate": 1.0523980417086882e-05, + "loss": 2.6525, + "step": 52237 + }, + { + "epoch": 2.432059966943688, + "grad_norm": 0.30867819272758323, + "learning_rate": 1.0522318057495567e-05, + "loss": 
2.5615, + "step": 52238 + }, + { + "epoch": 2.432106525129781, + "grad_norm": 0.3198344199896319, + "learning_rate": 1.0520655813767366e-05, + "loss": 2.5808, + "step": 52239 + }, + { + "epoch": 2.4321530833158738, + "grad_norm": 0.342406611169899, + "learning_rate": 1.0518993685907131e-05, + "loss": 2.7165, + "step": 52240 + }, + { + "epoch": 2.432199641501967, + "grad_norm": 0.314989293253449, + "learning_rate": 1.0517331673919784e-05, + "loss": 2.563, + "step": 52241 + }, + { + "epoch": 2.43224619968806, + "grad_norm": 0.3250828745561565, + "learning_rate": 1.051566977781016e-05, + "loss": 2.5896, + "step": 52242 + }, + { + "epoch": 2.432292757874153, + "grad_norm": 0.32496821433614315, + "learning_rate": 1.0514007997583164e-05, + "loss": 2.6359, + "step": 52243 + }, + { + "epoch": 2.4323393160602462, + "grad_norm": 0.31470866808804965, + "learning_rate": 1.0512346333243656e-05, + "loss": 2.6851, + "step": 52244 + }, + { + "epoch": 2.4323858742463393, + "grad_norm": 0.32297675464918185, + "learning_rate": 1.0510684784796526e-05, + "loss": 2.7238, + "step": 52245 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.3129713331248175, + "learning_rate": 1.0509023352246649e-05, + "loss": 2.5654, + "step": 52246 + }, + { + "epoch": 2.4324789906185256, + "grad_norm": 0.3020676704880406, + "learning_rate": 1.0507362035598906e-05, + "loss": 2.6164, + "step": 52247 + }, + { + "epoch": 2.4325255488046187, + "grad_norm": 0.3353698872918597, + "learning_rate": 1.0505700834858157e-05, + "loss": 2.6769, + "step": 52248 + }, + { + "epoch": 2.432572106990712, + "grad_norm": 0.3139648495880702, + "learning_rate": 1.0504039750029277e-05, + "loss": 2.544, + "step": 52249 + }, + { + "epoch": 2.432618665176805, + "grad_norm": 0.3335034900882001, + "learning_rate": 1.0502378781117156e-05, + "loss": 2.6257, + "step": 52250 + }, + { + "epoch": 2.4326652233628976, + "grad_norm": 0.3149955770117471, + "learning_rate": 1.0500717928126658e-05, + "loss": 2.6305, + "step": 52251 + }, + { + "epoch": 2.4327117815489907, + "grad_norm": 0.3164949411698862, + "learning_rate": 1.0499057191062677e-05, + "loss": 2.5525, + "step": 52252 + }, + { + "epoch": 2.432758339735084, + "grad_norm": 0.3072932457246791, + "learning_rate": 1.0497396569930047e-05, + "loss": 2.5439, + "step": 52253 + }, + { + "epoch": 2.432804897921177, + "grad_norm": 0.31222123541299535, + "learning_rate": 1.049573606473369e-05, + "loss": 2.5954, + "step": 52254 + }, + { + "epoch": 2.43285145610727, + "grad_norm": 0.31294399593776423, + "learning_rate": 1.0494075675478431e-05, + "loss": 2.5981, + "step": 52255 + }, + { + "epoch": 2.432898014293363, + "grad_norm": 0.2919636499235366, + "learning_rate": 1.0492415402169193e-05, + "loss": 2.6594, + "step": 52256 + }, + { + "epoch": 2.4329445724794563, + "grad_norm": 0.31505167333069845, + "learning_rate": 1.0490755244810812e-05, + "loss": 2.6643, + "step": 52257 + }, + { + "epoch": 2.4329911306655494, + "grad_norm": 0.30199920243677597, + "learning_rate": 1.0489095203408173e-05, + "loss": 2.646, + "step": 52258 + }, + { + "epoch": 2.4330376888516425, + "grad_norm": 0.318094567515756, + "learning_rate": 1.0487435277966145e-05, + "loss": 2.6411, + "step": 52259 + }, + { + "epoch": 2.433084247037735, + "grad_norm": 0.31421167862606214, + "learning_rate": 1.0485775468489617e-05, + "loss": 2.6642, + "step": 52260 + }, + { + "epoch": 2.4331308052238283, + "grad_norm": 0.309114392652937, + "learning_rate": 1.0484115774983428e-05, + "loss": 2.6092, + "step": 52261 + }, + { + "epoch": 2.4331773634099214, + 
"grad_norm": 0.33115448152546945, + "learning_rate": 1.0482456197452467e-05, + "loss": 2.6732, + "step": 52262 + }, + { + "epoch": 2.4332239215960145, + "grad_norm": 0.31194826859873515, + "learning_rate": 1.0480796735901605e-05, + "loss": 2.6541, + "step": 52263 + }, + { + "epoch": 2.4332704797821076, + "grad_norm": 0.32670836670774245, + "learning_rate": 1.0479137390335714e-05, + "loss": 2.5659, + "step": 52264 + }, + { + "epoch": 2.4333170379682008, + "grad_norm": 0.3142003262352111, + "learning_rate": 1.0477478160759675e-05, + "loss": 2.6374, + "step": 52265 + }, + { + "epoch": 2.433363596154294, + "grad_norm": 0.2992952610858374, + "learning_rate": 1.047581904717831e-05, + "loss": 2.7082, + "step": 52266 + }, + { + "epoch": 2.433410154340387, + "grad_norm": 0.3486853024694681, + "learning_rate": 1.0474160049596555e-05, + "loss": 2.6417, + "step": 52267 + }, + { + "epoch": 2.43345671252648, + "grad_norm": 0.33565914782430384, + "learning_rate": 1.0472501168019217e-05, + "loss": 2.6538, + "step": 52268 + }, + { + "epoch": 2.433503270712573, + "grad_norm": 0.31608849409539946, + "learning_rate": 1.0470842402451225e-05, + "loss": 2.6249, + "step": 52269 + }, + { + "epoch": 2.4335498288986663, + "grad_norm": 0.3085149395744854, + "learning_rate": 1.0469183752897398e-05, + "loss": 2.5156, + "step": 52270 + }, + { + "epoch": 2.433596387084759, + "grad_norm": 0.3286171785341199, + "learning_rate": 1.0467525219362624e-05, + "loss": 2.6082, + "step": 52271 + }, + { + "epoch": 2.433642945270852, + "grad_norm": 0.3233056627917413, + "learning_rate": 1.046586680185177e-05, + "loss": 2.578, + "step": 52272 + }, + { + "epoch": 2.4336895034569452, + "grad_norm": 0.3354591910864385, + "learning_rate": 1.0464208500369698e-05, + "loss": 2.6138, + "step": 52273 + }, + { + "epoch": 2.4337360616430384, + "grad_norm": 0.32695722602444927, + "learning_rate": 1.0462550314921299e-05, + "loss": 2.7051, + "step": 52274 + }, + { + "epoch": 2.4337826198291315, + "grad_norm": 0.32116632401818535, + "learning_rate": 1.0460892245511405e-05, + "loss": 2.6892, + "step": 52275 + }, + { + "epoch": 2.4338291780152246, + "grad_norm": 0.31178398494442855, + "learning_rate": 1.0459234292144894e-05, + "loss": 2.537, + "step": 52276 + }, + { + "epoch": 2.4338757362013177, + "grad_norm": 0.31759724319516586, + "learning_rate": 1.0457576454826634e-05, + "loss": 2.6139, + "step": 52277 + }, + { + "epoch": 2.433922294387411, + "grad_norm": 0.3351669952125128, + "learning_rate": 1.0455918733561493e-05, + "loss": 2.5484, + "step": 52278 + }, + { + "epoch": 2.433968852573504, + "grad_norm": 0.3307390733942636, + "learning_rate": 1.0454261128354337e-05, + "loss": 2.5408, + "step": 52279 + }, + { + "epoch": 2.4340154107595966, + "grad_norm": 0.3169309199320564, + "learning_rate": 1.045260363921004e-05, + "loss": 2.5996, + "step": 52280 + }, + { + "epoch": 2.4340619689456897, + "grad_norm": 0.3148972001948969, + "learning_rate": 1.0450946266133426e-05, + "loss": 2.6363, + "step": 52281 + }, + { + "epoch": 2.434108527131783, + "grad_norm": 0.31855033351988055, + "learning_rate": 1.0449289009129421e-05, + "loss": 2.7195, + "step": 52282 + }, + { + "epoch": 2.434155085317876, + "grad_norm": 0.33924480711516747, + "learning_rate": 1.044763186820284e-05, + "loss": 2.6558, + "step": 52283 + }, + { + "epoch": 2.434201643503969, + "grad_norm": 0.33228107061059375, + "learning_rate": 1.0445974843358564e-05, + "loss": 2.6189, + "step": 52284 + }, + { + "epoch": 2.434248201690062, + "grad_norm": 0.33730500598405716, + "learning_rate": 
1.0444317934601455e-05, + "loss": 2.5276, + "step": 52285 + }, + { + "epoch": 2.4342947598761553, + "grad_norm": 0.31912454486046404, + "learning_rate": 1.0442661141936372e-05, + "loss": 2.652, + "step": 52286 + }, + { + "epoch": 2.4343413180622484, + "grad_norm": 0.3285543420231971, + "learning_rate": 1.0441004465368199e-05, + "loss": 2.5301, + "step": 52287 + }, + { + "epoch": 2.4343878762483415, + "grad_norm": 0.32380949097112083, + "learning_rate": 1.043934790490177e-05, + "loss": 2.6286, + "step": 52288 + }, + { + "epoch": 2.4344344344344346, + "grad_norm": 0.32765539226848406, + "learning_rate": 1.0437691460541955e-05, + "loss": 2.6038, + "step": 52289 + }, + { + "epoch": 2.4344809926205278, + "grad_norm": 0.3068488357498169, + "learning_rate": 1.0436035132293614e-05, + "loss": 2.6352, + "step": 52290 + }, + { + "epoch": 2.4345275508066204, + "grad_norm": 0.3476331515747433, + "learning_rate": 1.0434378920161614e-05, + "loss": 2.6833, + "step": 52291 + }, + { + "epoch": 2.4345741089927135, + "grad_norm": 0.31784021850812366, + "learning_rate": 1.0432722824150814e-05, + "loss": 2.633, + "step": 52292 + }, + { + "epoch": 2.4346206671788067, + "grad_norm": 0.30225725500131856, + "learning_rate": 1.0431066844266092e-05, + "loss": 2.6247, + "step": 52293 + }, + { + "epoch": 2.4346672253648998, + "grad_norm": 0.31413901081335244, + "learning_rate": 1.0429410980512261e-05, + "loss": 2.6063, + "step": 52294 + }, + { + "epoch": 2.434713783550993, + "grad_norm": 0.32320464760361867, + "learning_rate": 1.0427755232894237e-05, + "loss": 2.6246, + "step": 52295 + }, + { + "epoch": 2.434760341737086, + "grad_norm": 0.32849216128388764, + "learning_rate": 1.0426099601416839e-05, + "loss": 2.643, + "step": 52296 + }, + { + "epoch": 2.434806899923179, + "grad_norm": 0.3222626891773479, + "learning_rate": 1.042444408608494e-05, + "loss": 2.5913, + "step": 52297 + }, + { + "epoch": 2.4348534581092722, + "grad_norm": 0.3063598862099304, + "learning_rate": 1.0422788686903402e-05, + "loss": 2.5856, + "step": 52298 + }, + { + "epoch": 2.434900016295365, + "grad_norm": 0.30729347576512844, + "learning_rate": 1.0421133403877076e-05, + "loss": 2.4781, + "step": 52299 + }, + { + "epoch": 2.434946574481458, + "grad_norm": 0.3140168986553099, + "learning_rate": 1.0419478237010837e-05, + "loss": 2.5551, + "step": 52300 + }, + { + "epoch": 2.434993132667551, + "grad_norm": 0.337469899152465, + "learning_rate": 1.0417823186309516e-05, + "loss": 2.7674, + "step": 52301 + }, + { + "epoch": 2.4350396908536442, + "grad_norm": 0.3093074054718139, + "learning_rate": 1.0416168251777992e-05, + "loss": 2.6304, + "step": 52302 + }, + { + "epoch": 2.4350862490397374, + "grad_norm": 0.31324180164889964, + "learning_rate": 1.0414513433421108e-05, + "loss": 2.6388, + "step": 52303 + }, + { + "epoch": 2.4351328072258305, + "grad_norm": 0.30358935953507504, + "learning_rate": 1.041285873124373e-05, + "loss": 2.5551, + "step": 52304 + }, + { + "epoch": 2.4351793654119236, + "grad_norm": 0.32174727414381366, + "learning_rate": 1.0411204145250708e-05, + "loss": 2.6471, + "step": 52305 + }, + { + "epoch": 2.4352259235980167, + "grad_norm": 0.3183607666670617, + "learning_rate": 1.0409549675446922e-05, + "loss": 2.6739, + "step": 52306 + }, + { + "epoch": 2.43527248178411, + "grad_norm": 0.3010870619509564, + "learning_rate": 1.0407895321837174e-05, + "loss": 2.607, + "step": 52307 + }, + { + "epoch": 2.435319039970203, + "grad_norm": 0.31779542745884787, + "learning_rate": 1.0406241084426382e-05, + "loss": 2.634, + "step": 52308 + }, 
+ { + "epoch": 2.435365598156296, + "grad_norm": 0.32348929542831567, + "learning_rate": 1.040458696321936e-05, + "loss": 2.7016, + "step": 52309 + }, + { + "epoch": 2.4354121563423887, + "grad_norm": 0.32148210956893053, + "learning_rate": 1.0402932958220973e-05, + "loss": 2.4879, + "step": 52310 + }, + { + "epoch": 2.435458714528482, + "grad_norm": 0.30160471162937025, + "learning_rate": 1.0401279069436075e-05, + "loss": 2.5127, + "step": 52311 + }, + { + "epoch": 2.435505272714575, + "grad_norm": 0.3196456323815655, + "learning_rate": 1.0399625296869525e-05, + "loss": 2.5474, + "step": 52312 + }, + { + "epoch": 2.435551830900668, + "grad_norm": 0.3191771360477721, + "learning_rate": 1.0397971640526183e-05, + "loss": 2.7614, + "step": 52313 + }, + { + "epoch": 2.435598389086761, + "grad_norm": 0.3165492389136185, + "learning_rate": 1.0396318100410868e-05, + "loss": 2.5363, + "step": 52314 + }, + { + "epoch": 2.4356449472728543, + "grad_norm": 0.33131451356540914, + "learning_rate": 1.0394664676528487e-05, + "loss": 2.6907, + "step": 52315 + }, + { + "epoch": 2.4356915054589474, + "grad_norm": 0.3111809658304286, + "learning_rate": 1.0393011368883848e-05, + "loss": 2.5499, + "step": 52316 + }, + { + "epoch": 2.4357380636450405, + "grad_norm": 0.30645528606679184, + "learning_rate": 1.0391358177481819e-05, + "loss": 2.6287, + "step": 52317 + }, + { + "epoch": 2.4357846218311336, + "grad_norm": 0.3296593345307167, + "learning_rate": 1.0389705102327247e-05, + "loss": 2.6687, + "step": 52318 + }, + { + "epoch": 2.4358311800172263, + "grad_norm": 0.31905171476726846, + "learning_rate": 1.038805214342501e-05, + "loss": 2.6362, + "step": 52319 + }, + { + "epoch": 2.4358777382033194, + "grad_norm": 0.3083560814652395, + "learning_rate": 1.0386399300779904e-05, + "loss": 2.6098, + "step": 52320 + }, + { + "epoch": 2.4359242963894125, + "grad_norm": 0.3189195775997369, + "learning_rate": 1.0384746574396841e-05, + "loss": 2.6643, + "step": 52321 + }, + { + "epoch": 2.4359708545755057, + "grad_norm": 0.31066091374116733, + "learning_rate": 1.0383093964280621e-05, + "loss": 2.5713, + "step": 52322 + }, + { + "epoch": 2.4360174127615988, + "grad_norm": 0.3183454142546794, + "learning_rate": 1.0381441470436143e-05, + "loss": 2.7052, + "step": 52323 + }, + { + "epoch": 2.436063970947692, + "grad_norm": 0.3251358891530977, + "learning_rate": 1.0379789092868215e-05, + "loss": 2.7031, + "step": 52324 + }, + { + "epoch": 2.436110529133785, + "grad_norm": 0.3061601926277244, + "learning_rate": 1.037813683158171e-05, + "loss": 2.6089, + "step": 52325 + }, + { + "epoch": 2.436157087319878, + "grad_norm": 0.3225150121445651, + "learning_rate": 1.0376484686581472e-05, + "loss": 2.6524, + "step": 52326 + }, + { + "epoch": 2.4362036455059712, + "grad_norm": 0.3385977991889222, + "learning_rate": 1.037483265787233e-05, + "loss": 2.6641, + "step": 52327 + }, + { + "epoch": 2.4362502036920644, + "grad_norm": 0.3105766337470286, + "learning_rate": 1.0373180745459177e-05, + "loss": 2.5674, + "step": 52328 + }, + { + "epoch": 2.4362967618781575, + "grad_norm": 0.30159446169819054, + "learning_rate": 1.0371528949346815e-05, + "loss": 2.567, + "step": 52329 + }, + { + "epoch": 2.43634332006425, + "grad_norm": 0.3084281140782087, + "learning_rate": 1.0369877269540118e-05, + "loss": 2.5916, + "step": 52330 + }, + { + "epoch": 2.4363898782503433, + "grad_norm": 0.3160343409072535, + "learning_rate": 1.0368225706043922e-05, + "loss": 2.6177, + "step": 52331 + }, + { + "epoch": 2.4364364364364364, + "grad_norm": 
0.31353401039544504, + "learning_rate": 1.0366574258863081e-05, + "loss": 2.6537, + "step": 52332 + }, + { + "epoch": 2.4364829946225295, + "grad_norm": 0.303349284057736, + "learning_rate": 1.0364922928002441e-05, + "loss": 2.5813, + "step": 52333 + }, + { + "epoch": 2.4365295528086226, + "grad_norm": 0.30855805532662023, + "learning_rate": 1.036327171346686e-05, + "loss": 2.5905, + "step": 52334 + }, + { + "epoch": 2.4365761109947157, + "grad_norm": 0.3108651374355597, + "learning_rate": 1.0361620615261147e-05, + "loss": 2.6103, + "step": 52335 + }, + { + "epoch": 2.436622669180809, + "grad_norm": 0.32370724191184885, + "learning_rate": 1.0359969633390194e-05, + "loss": 2.6095, + "step": 52336 + }, + { + "epoch": 2.436669227366902, + "grad_norm": 0.3138809822303729, + "learning_rate": 1.0358318767858816e-05, + "loss": 2.6194, + "step": 52337 + }, + { + "epoch": 2.4367157855529946, + "grad_norm": 0.3206784248420451, + "learning_rate": 1.0356668018671867e-05, + "loss": 2.5503, + "step": 52338 + }, + { + "epoch": 2.4367623437390877, + "grad_norm": 0.29906839888548514, + "learning_rate": 1.0355017385834204e-05, + "loss": 2.6617, + "step": 52339 + }, + { + "epoch": 2.436808901925181, + "grad_norm": 0.3580003817183839, + "learning_rate": 1.0353366869350634e-05, + "loss": 2.6623, + "step": 52340 + }, + { + "epoch": 2.436855460111274, + "grad_norm": 0.3237133845412519, + "learning_rate": 1.0351716469226053e-05, + "loss": 2.6967, + "step": 52341 + }, + { + "epoch": 2.436902018297367, + "grad_norm": 0.3351986420787995, + "learning_rate": 1.0350066185465263e-05, + "loss": 2.5935, + "step": 52342 + }, + { + "epoch": 2.43694857648346, + "grad_norm": 0.3136223078570483, + "learning_rate": 1.0348416018073121e-05, + "loss": 2.6588, + "step": 52343 + }, + { + "epoch": 2.4369951346695533, + "grad_norm": 0.30522892432768217, + "learning_rate": 1.0346765967054473e-05, + "loss": 2.5725, + "step": 52344 + }, + { + "epoch": 2.4370416928556464, + "grad_norm": 0.32282662096834347, + "learning_rate": 1.034511603241416e-05, + "loss": 2.5421, + "step": 52345 + }, + { + "epoch": 2.4370882510417395, + "grad_norm": 0.34283569561375293, + "learning_rate": 1.0343466214157028e-05, + "loss": 2.6713, + "step": 52346 + }, + { + "epoch": 2.4371348092278327, + "grad_norm": 0.30126701969196945, + "learning_rate": 1.0341816512287927e-05, + "loss": 2.4914, + "step": 52347 + }, + { + "epoch": 2.4371813674139258, + "grad_norm": 0.31782257942409337, + "learning_rate": 1.0340166926811662e-05, + "loss": 2.6132, + "step": 52348 + }, + { + "epoch": 2.4372279256000184, + "grad_norm": 0.3253188200626422, + "learning_rate": 1.0338517457733126e-05, + "loss": 2.6981, + "step": 52349 + }, + { + "epoch": 2.4372744837861116, + "grad_norm": 0.3498295213423171, + "learning_rate": 1.0336868105057119e-05, + "loss": 2.6025, + "step": 52350 + }, + { + "epoch": 2.4373210419722047, + "grad_norm": 0.3475371806433195, + "learning_rate": 1.0335218868788499e-05, + "loss": 2.7201, + "step": 52351 + }, + { + "epoch": 2.437367600158298, + "grad_norm": 0.315370508695471, + "learning_rate": 1.0333569748932114e-05, + "loss": 2.6405, + "step": 52352 + }, + { + "epoch": 2.437414158344391, + "grad_norm": 0.3548122819538202, + "learning_rate": 1.0331920745492774e-05, + "loss": 2.5961, + "step": 52353 + }, + { + "epoch": 2.437460716530484, + "grad_norm": 0.3133420744976956, + "learning_rate": 1.033027185847536e-05, + "loss": 2.623, + "step": 52354 + }, + { + "epoch": 2.437507274716577, + "grad_norm": 0.3278739806864988, + "learning_rate": 1.0328623087884675e-05, + 
"loss": 2.647, + "step": 52355 + }, + { + "epoch": 2.4375538329026702, + "grad_norm": 0.32878235082586577, + "learning_rate": 1.0326974433725579e-05, + "loss": 2.6084, + "step": 52356 + }, + { + "epoch": 2.4376003910887634, + "grad_norm": 0.3302584496842006, + "learning_rate": 1.0325325896002896e-05, + "loss": 2.7206, + "step": 52357 + }, + { + "epoch": 2.437646949274856, + "grad_norm": 0.3212885700066435, + "learning_rate": 1.0323677474721483e-05, + "loss": 2.6013, + "step": 52358 + }, + { + "epoch": 2.437693507460949, + "grad_norm": 0.3174297299575078, + "learning_rate": 1.032202916988616e-05, + "loss": 2.5856, + "step": 52359 + }, + { + "epoch": 2.4377400656470423, + "grad_norm": 0.3581165568297285, + "learning_rate": 1.0320380981501786e-05, + "loss": 2.6533, + "step": 52360 + }, + { + "epoch": 2.4377866238331354, + "grad_norm": 0.3129059082668361, + "learning_rate": 1.0318732909573165e-05, + "loss": 2.6378, + "step": 52361 + }, + { + "epoch": 2.4378331820192285, + "grad_norm": 0.32826847617091537, + "learning_rate": 1.0317084954105177e-05, + "loss": 2.6806, + "step": 52362 + }, + { + "epoch": 2.4378797402053216, + "grad_norm": 0.32914790641808434, + "learning_rate": 1.0315437115102622e-05, + "loss": 2.7064, + "step": 52363 + }, + { + "epoch": 2.4379262983914147, + "grad_norm": 0.3085159093877171, + "learning_rate": 1.0313789392570344e-05, + "loss": 2.5835, + "step": 52364 + }, + { + "epoch": 2.437972856577508, + "grad_norm": 0.31175952754979447, + "learning_rate": 1.0312141786513202e-05, + "loss": 2.6382, + "step": 52365 + }, + { + "epoch": 2.438019414763601, + "grad_norm": 0.33575469977537886, + "learning_rate": 1.0310494296935991e-05, + "loss": 2.7794, + "step": 52366 + }, + { + "epoch": 2.438065972949694, + "grad_norm": 0.3153575614817945, + "learning_rate": 1.030884692384359e-05, + "loss": 2.6087, + "step": 52367 + }, + { + "epoch": 2.438112531135787, + "grad_norm": 0.3060946350743209, + "learning_rate": 1.0307199667240791e-05, + "loss": 2.6474, + "step": 52368 + }, + { + "epoch": 2.43815908932188, + "grad_norm": 0.30076871064022703, + "learning_rate": 1.0305552527132473e-05, + "loss": 2.6599, + "step": 52369 + }, + { + "epoch": 2.438205647507973, + "grad_norm": 0.3439790520343698, + "learning_rate": 1.0303905503523437e-05, + "loss": 2.7043, + "step": 52370 + }, + { + "epoch": 2.438252205694066, + "grad_norm": 0.3379302460152657, + "learning_rate": 1.0302258596418529e-05, + "loss": 2.6523, + "step": 52371 + }, + { + "epoch": 2.438298763880159, + "grad_norm": 0.3319604598478971, + "learning_rate": 1.0300611805822574e-05, + "loss": 2.691, + "step": 52372 + }, + { + "epoch": 2.4383453220662523, + "grad_norm": 0.3114721007948238, + "learning_rate": 1.0298965131740435e-05, + "loss": 2.5475, + "step": 52373 + }, + { + "epoch": 2.4383918802523454, + "grad_norm": 0.3290340528371921, + "learning_rate": 1.029731857417689e-05, + "loss": 2.5815, + "step": 52374 + }, + { + "epoch": 2.4384384384384385, + "grad_norm": 0.33329945887527834, + "learning_rate": 1.029567213313683e-05, + "loss": 2.6038, + "step": 52375 + }, + { + "epoch": 2.4384849966245317, + "grad_norm": 0.3152699808078076, + "learning_rate": 1.0294025808625052e-05, + "loss": 2.6797, + "step": 52376 + }, + { + "epoch": 2.4385315548106243, + "grad_norm": 0.3326261848372849, + "learning_rate": 1.029237960064639e-05, + "loss": 2.5655, + "step": 52377 + }, + { + "epoch": 2.4385781129967175, + "grad_norm": 0.32255071823130094, + "learning_rate": 1.0290733509205703e-05, + "loss": 2.5426, + "step": 52378 + }, + { + "epoch": 
2.4386246711828106, + "grad_norm": 0.3028908289633801, + "learning_rate": 1.0289087534307767e-05, + "loss": 2.6529, + "step": 52379 + }, + { + "epoch": 2.4386712293689037, + "grad_norm": 0.3007174359893809, + "learning_rate": 1.0287441675957477e-05, + "loss": 2.6073, + "step": 52380 + }, + { + "epoch": 2.438717787554997, + "grad_norm": 0.3429804066316847, + "learning_rate": 1.0285795934159609e-05, + "loss": 2.7346, + "step": 52381 + }, + { + "epoch": 2.43876434574109, + "grad_norm": 0.34465891278070915, + "learning_rate": 1.0284150308919039e-05, + "loss": 2.578, + "step": 52382 + }, + { + "epoch": 2.438810903927183, + "grad_norm": 0.31206351885892236, + "learning_rate": 1.0282504800240567e-05, + "loss": 2.6989, + "step": 52383 + }, + { + "epoch": 2.438857462113276, + "grad_norm": 0.3332309300052854, + "learning_rate": 1.0280859408129029e-05, + "loss": 2.6392, + "step": 52384 + }, + { + "epoch": 2.4389040202993693, + "grad_norm": 0.3480022295244738, + "learning_rate": 1.0279214132589255e-05, + "loss": 2.646, + "step": 52385 + }, + { + "epoch": 2.4389505784854624, + "grad_norm": 0.3247201867918729, + "learning_rate": 1.0277568973626072e-05, + "loss": 2.6366, + "step": 52386 + }, + { + "epoch": 2.4389971366715555, + "grad_norm": 0.30326232180180757, + "learning_rate": 1.0275923931244313e-05, + "loss": 2.5667, + "step": 52387 + }, + { + "epoch": 2.439043694857648, + "grad_norm": 0.3033101027976748, + "learning_rate": 1.0274279005448817e-05, + "loss": 2.5601, + "step": 52388 + }, + { + "epoch": 2.4390902530437413, + "grad_norm": 0.3079054346805986, + "learning_rate": 1.0272634196244385e-05, + "loss": 2.5316, + "step": 52389 + }, + { + "epoch": 2.4391368112298344, + "grad_norm": 0.327574943184036, + "learning_rate": 1.0270989503635858e-05, + "loss": 2.6896, + "step": 52390 + }, + { + "epoch": 2.4391833694159275, + "grad_norm": 0.33537865847077436, + "learning_rate": 1.0269344927628077e-05, + "loss": 2.7377, + "step": 52391 + }, + { + "epoch": 2.4392299276020206, + "grad_norm": 0.3209791317991088, + "learning_rate": 1.0267700468225827e-05, + "loss": 2.7239, + "step": 52392 + }, + { + "epoch": 2.4392764857881137, + "grad_norm": 0.3153623366176564, + "learning_rate": 1.0266056125433986e-05, + "loss": 2.6038, + "step": 52393 + }, + { + "epoch": 2.439323043974207, + "grad_norm": 0.3268377869441684, + "learning_rate": 1.0264411899257338e-05, + "loss": 2.5666, + "step": 52394 + }, + { + "epoch": 2.4393696021603, + "grad_norm": 0.29700489541282116, + "learning_rate": 1.0262767789700744e-05, + "loss": 2.5632, + "step": 52395 + }, + { + "epoch": 2.439416160346393, + "grad_norm": 0.31248333352314933, + "learning_rate": 1.0261123796769001e-05, + "loss": 2.759, + "step": 52396 + }, + { + "epoch": 2.4394627185324858, + "grad_norm": 0.30465242438606954, + "learning_rate": 1.0259479920466946e-05, + "loss": 2.6241, + "step": 52397 + }, + { + "epoch": 2.439509276718579, + "grad_norm": 0.3229846725729603, + "learning_rate": 1.02578361607994e-05, + "loss": 2.7279, + "step": 52398 + }, + { + "epoch": 2.439555834904672, + "grad_norm": 0.3053597548606377, + "learning_rate": 1.025619251777119e-05, + "loss": 2.6912, + "step": 52399 + }, + { + "epoch": 2.439602393090765, + "grad_norm": 0.31550939387115956, + "learning_rate": 1.0254548991387143e-05, + "loss": 2.6857, + "step": 52400 + }, + { + "epoch": 2.439648951276858, + "grad_norm": 0.31105626328092706, + "learning_rate": 1.0252905581652089e-05, + "loss": 2.6029, + "step": 52401 + }, + { + "epoch": 2.4396955094629513, + "grad_norm": 0.31446682720855484, + 
"learning_rate": 1.0251262288570829e-05, + "loss": 2.6627, + "step": 52402 + }, + { + "epoch": 2.4397420676490444, + "grad_norm": 0.3204313466882193, + "learning_rate": 1.02496191121482e-05, + "loss": 2.6765, + "step": 52403 + }, + { + "epoch": 2.4397886258351376, + "grad_norm": 0.3277873717459507, + "learning_rate": 1.0247976052389018e-05, + "loss": 2.5365, + "step": 52404 + }, + { + "epoch": 2.4398351840212307, + "grad_norm": 0.31107136306577104, + "learning_rate": 1.0246333109298117e-05, + "loss": 2.6927, + "step": 52405 + }, + { + "epoch": 2.439881742207324, + "grad_norm": 0.3149724829210734, + "learning_rate": 1.024469028288032e-05, + "loss": 2.5853, + "step": 52406 + }, + { + "epoch": 2.439928300393417, + "grad_norm": 0.31671717652723347, + "learning_rate": 1.0243047573140412e-05, + "loss": 2.7105, + "step": 52407 + }, + { + "epoch": 2.4399748585795096, + "grad_norm": 0.3335066811599814, + "learning_rate": 1.0241404980083274e-05, + "loss": 2.6546, + "step": 52408 + }, + { + "epoch": 2.4400214167656027, + "grad_norm": 0.3404614932873063, + "learning_rate": 1.0239762503713684e-05, + "loss": 2.7222, + "step": 52409 + }, + { + "epoch": 2.440067974951696, + "grad_norm": 0.30812604788828346, + "learning_rate": 1.0238120144036468e-05, + "loss": 2.6072, + "step": 52410 + }, + { + "epoch": 2.440114533137789, + "grad_norm": 0.3190195288685833, + "learning_rate": 1.0236477901056458e-05, + "loss": 2.6089, + "step": 52411 + }, + { + "epoch": 2.440161091323882, + "grad_norm": 0.3076277111705461, + "learning_rate": 1.0234835774778462e-05, + "loss": 2.6086, + "step": 52412 + }, + { + "epoch": 2.440207649509975, + "grad_norm": 0.3177724393878541, + "learning_rate": 1.0233193765207311e-05, + "loss": 2.5754, + "step": 52413 + }, + { + "epoch": 2.4402542076960683, + "grad_norm": 0.3213116103421446, + "learning_rate": 1.0231551872347828e-05, + "loss": 2.5976, + "step": 52414 + }, + { + "epoch": 2.4403007658821614, + "grad_norm": 0.3219020887346471, + "learning_rate": 1.0229910096204809e-05, + "loss": 2.7063, + "step": 52415 + }, + { + "epoch": 2.440347324068254, + "grad_norm": 0.3252776076087144, + "learning_rate": 1.0228268436783089e-05, + "loss": 2.7034, + "step": 52416 + }, + { + "epoch": 2.440393882254347, + "grad_norm": 0.3112530238077882, + "learning_rate": 1.0226626894087482e-05, + "loss": 2.5448, + "step": 52417 + }, + { + "epoch": 2.4404404404404403, + "grad_norm": 0.32245932421014756, + "learning_rate": 1.0224985468122806e-05, + "loss": 2.6287, + "step": 52418 + }, + { + "epoch": 2.4404869986265334, + "grad_norm": 0.33974422761615425, + "learning_rate": 1.0223344158893894e-05, + "loss": 2.6584, + "step": 52419 + }, + { + "epoch": 2.4405335568126265, + "grad_norm": 0.32124435955289043, + "learning_rate": 1.0221702966405517e-05, + "loss": 2.6336, + "step": 52420 + }, + { + "epoch": 2.4405801149987196, + "grad_norm": 0.3278123740044364, + "learning_rate": 1.0220061890662553e-05, + "loss": 2.5057, + "step": 52421 + }, + { + "epoch": 2.4406266731848127, + "grad_norm": 0.3218189849826322, + "learning_rate": 1.0218420931669765e-05, + "loss": 2.6038, + "step": 52422 + }, + { + "epoch": 2.440673231370906, + "grad_norm": 0.32940405701955233, + "learning_rate": 1.0216780089432016e-05, + "loss": 2.5964, + "step": 52423 + }, + { + "epoch": 2.440719789556999, + "grad_norm": 0.3353489717384253, + "learning_rate": 1.0215139363954079e-05, + "loss": 2.6083, + "step": 52424 + }, + { + "epoch": 2.440766347743092, + "grad_norm": 0.30660995227520266, + "learning_rate": 1.0213498755240797e-05, + "loss": 2.7107, + 
"step": 52425 + }, + { + "epoch": 2.440812905929185, + "grad_norm": 0.3204966153827866, + "learning_rate": 1.0211858263296974e-05, + "loss": 2.5847, + "step": 52426 + }, + { + "epoch": 2.440859464115278, + "grad_norm": 0.32811922969169616, + "learning_rate": 1.0210217888127438e-05, + "loss": 2.6538, + "step": 52427 + }, + { + "epoch": 2.440906022301371, + "grad_norm": 0.3209543892131233, + "learning_rate": 1.0208577629736981e-05, + "loss": 2.6124, + "step": 52428 + }, + { + "epoch": 2.440952580487464, + "grad_norm": 0.31643997337891966, + "learning_rate": 1.020693748813043e-05, + "loss": 2.59, + "step": 52429 + }, + { + "epoch": 2.4409991386735572, + "grad_norm": 0.3151176756611383, + "learning_rate": 1.0205297463312596e-05, + "loss": 2.629, + "step": 52430 + }, + { + "epoch": 2.4410456968596503, + "grad_norm": 0.2939543311745186, + "learning_rate": 1.0203657555288292e-05, + "loss": 2.5921, + "step": 52431 + }, + { + "epoch": 2.4410922550457435, + "grad_norm": 0.30858655960712394, + "learning_rate": 1.020201776406235e-05, + "loss": 2.702, + "step": 52432 + }, + { + "epoch": 2.4411388132318366, + "grad_norm": 0.3019822593576075, + "learning_rate": 1.0200378089639535e-05, + "loss": 2.6016, + "step": 52433 + }, + { + "epoch": 2.4411853714179297, + "grad_norm": 0.3166026179788777, + "learning_rate": 1.0198738532024715e-05, + "loss": 2.5915, + "step": 52434 + }, + { + "epoch": 2.441231929604023, + "grad_norm": 0.3113004364989316, + "learning_rate": 1.0197099091222657e-05, + "loss": 2.6666, + "step": 52435 + }, + { + "epoch": 2.4412784877901155, + "grad_norm": 0.3142843643792488, + "learning_rate": 1.0195459767238207e-05, + "loss": 2.6301, + "step": 52436 + }, + { + "epoch": 2.4413250459762086, + "grad_norm": 0.3090899333247174, + "learning_rate": 1.0193820560076156e-05, + "loss": 2.5504, + "step": 52437 + }, + { + "epoch": 2.4413716041623017, + "grad_norm": 0.3088703247930971, + "learning_rate": 1.0192181469741318e-05, + "loss": 2.6208, + "step": 52438 + }, + { + "epoch": 2.441418162348395, + "grad_norm": 0.29850063256418763, + "learning_rate": 1.019054249623851e-05, + "loss": 2.5829, + "step": 52439 + }, + { + "epoch": 2.441464720534488, + "grad_norm": 0.31948716887367645, + "learning_rate": 1.0188903639572528e-05, + "loss": 2.6961, + "step": 52440 + }, + { + "epoch": 2.441511278720581, + "grad_norm": 0.31664341828446974, + "learning_rate": 1.0187264899748212e-05, + "loss": 2.5828, + "step": 52441 + }, + { + "epoch": 2.441557836906674, + "grad_norm": 0.3022105730174536, + "learning_rate": 1.0185626276770339e-05, + "loss": 2.5939, + "step": 52442 + }, + { + "epoch": 2.4416043950927673, + "grad_norm": 0.3079524319117921, + "learning_rate": 1.0183987770643727e-05, + "loss": 2.7041, + "step": 52443 + }, + { + "epoch": 2.4416509532788604, + "grad_norm": 0.32032669217473897, + "learning_rate": 1.0182349381373191e-05, + "loss": 2.6369, + "step": 52444 + }, + { + "epoch": 2.4416975114649535, + "grad_norm": 0.32289764383455694, + "learning_rate": 1.0180711108963547e-05, + "loss": 2.6847, + "step": 52445 + }, + { + "epoch": 2.4417440696510466, + "grad_norm": 0.3043546371105192, + "learning_rate": 1.0179072953419566e-05, + "loss": 2.7276, + "step": 52446 + }, + { + "epoch": 2.4417906278371393, + "grad_norm": 0.3052042848031193, + "learning_rate": 1.0177434914746114e-05, + "loss": 2.7489, + "step": 52447 + }, + { + "epoch": 2.4418371860232324, + "grad_norm": 0.3146661934219695, + "learning_rate": 1.017579699294794e-05, + "loss": 2.6274, + "step": 52448 + }, + { + "epoch": 2.4418837442093255, + 
"grad_norm": 0.31709905625079376, + "learning_rate": 1.0174159188029902e-05, + "loss": 2.6127, + "step": 52449 + }, + { + "epoch": 2.4419303023954186, + "grad_norm": 0.31172097844105917, + "learning_rate": 1.0172521499996773e-05, + "loss": 2.6262, + "step": 52450 + }, + { + "epoch": 2.4419768605815118, + "grad_norm": 0.30393291720868343, + "learning_rate": 1.0170883928853365e-05, + "loss": 2.7467, + "step": 52451 + }, + { + "epoch": 2.442023418767605, + "grad_norm": 0.31384964857913844, + "learning_rate": 1.0169246474604493e-05, + "loss": 2.7044, + "step": 52452 + }, + { + "epoch": 2.442069976953698, + "grad_norm": 0.34892223357832386, + "learning_rate": 1.0167609137254957e-05, + "loss": 2.6502, + "step": 52453 + }, + { + "epoch": 2.442116535139791, + "grad_norm": 0.31287138344607257, + "learning_rate": 1.016597191680958e-05, + "loss": 2.5522, + "step": 52454 + }, + { + "epoch": 2.442163093325884, + "grad_norm": 0.3393202821790699, + "learning_rate": 1.0164334813273134e-05, + "loss": 2.7488, + "step": 52455 + }, + { + "epoch": 2.442209651511977, + "grad_norm": 0.31047742019770924, + "learning_rate": 1.0162697826650441e-05, + "loss": 2.6218, + "step": 52456 + }, + { + "epoch": 2.44225620969807, + "grad_norm": 0.32178220087927967, + "learning_rate": 1.0161060956946311e-05, + "loss": 2.6341, + "step": 52457 + }, + { + "epoch": 2.442302767884163, + "grad_norm": 0.3101667621370338, + "learning_rate": 1.0159424204165535e-05, + "loss": 2.6815, + "step": 52458 + }, + { + "epoch": 2.4423493260702562, + "grad_norm": 0.34461187720296355, + "learning_rate": 1.0157787568312931e-05, + "loss": 2.7157, + "step": 52459 + }, + { + "epoch": 2.4423958842563493, + "grad_norm": 0.3145370003678157, + "learning_rate": 1.0156151049393304e-05, + "loss": 2.6061, + "step": 52460 + }, + { + "epoch": 2.4424424424424425, + "grad_norm": 0.31689517160455627, + "learning_rate": 1.0154514647411423e-05, + "loss": 2.5519, + "step": 52461 + }, + { + "epoch": 2.4424890006285356, + "grad_norm": 0.3244042818942612, + "learning_rate": 1.0152878362372142e-05, + "loss": 2.6487, + "step": 52462 + }, + { + "epoch": 2.4425355588146287, + "grad_norm": 0.33401758507169504, + "learning_rate": 1.0151242194280225e-05, + "loss": 2.7403, + "step": 52463 + }, + { + "epoch": 2.442582117000722, + "grad_norm": 0.31540185708512924, + "learning_rate": 1.0149606143140484e-05, + "loss": 2.6554, + "step": 52464 + }, + { + "epoch": 2.442628675186815, + "grad_norm": 0.32927511859694997, + "learning_rate": 1.0147970208957724e-05, + "loss": 2.5626, + "step": 52465 + }, + { + "epoch": 2.442675233372908, + "grad_norm": 0.30295616935443326, + "learning_rate": 1.014633439173675e-05, + "loss": 2.6859, + "step": 52466 + }, + { + "epoch": 2.4427217915590007, + "grad_norm": 0.2976022864585729, + "learning_rate": 1.0144698691482368e-05, + "loss": 2.5949, + "step": 52467 + }, + { + "epoch": 2.442768349745094, + "grad_norm": 0.32608799802763394, + "learning_rate": 1.0143063108199352e-05, + "loss": 2.6888, + "step": 52468 + }, + { + "epoch": 2.442814907931187, + "grad_norm": 0.3159874521022797, + "learning_rate": 1.0141427641892526e-05, + "loss": 2.6649, + "step": 52469 + }, + { + "epoch": 2.44286146611728, + "grad_norm": 0.345498684176638, + "learning_rate": 1.013979229256668e-05, + "loss": 2.617, + "step": 52470 + }, + { + "epoch": 2.442908024303373, + "grad_norm": 0.3355779687481144, + "learning_rate": 1.0138157060226615e-05, + "loss": 2.6702, + "step": 52471 + }, + { + "epoch": 2.4429545824894663, + "grad_norm": 0.322637204052522, + "learning_rate": 
1.0136521944877136e-05, + "loss": 2.5984, + "step": 52472 + }, + { + "epoch": 2.4430011406755594, + "grad_norm": 0.31950728376999205, + "learning_rate": 1.0134886946523047e-05, + "loss": 2.6305, + "step": 52473 + }, + { + "epoch": 2.4430476988616525, + "grad_norm": 0.3251017176932153, + "learning_rate": 1.0133252065169113e-05, + "loss": 2.5686, + "step": 52474 + }, + { + "epoch": 2.443094257047745, + "grad_norm": 0.3181702048312814, + "learning_rate": 1.0131617300820185e-05, + "loss": 2.5377, + "step": 52475 + }, + { + "epoch": 2.4431408152338383, + "grad_norm": 0.30730737012788045, + "learning_rate": 1.0129982653481013e-05, + "loss": 2.6566, + "step": 52476 + }, + { + "epoch": 2.4431873734199314, + "grad_norm": 0.31995967043329243, + "learning_rate": 1.0128348123156423e-05, + "loss": 2.4714, + "step": 52477 + }, + { + "epoch": 2.4432339316060245, + "grad_norm": 0.3137688289601528, + "learning_rate": 1.0126713709851194e-05, + "loss": 2.5847, + "step": 52478 + }, + { + "epoch": 2.4432804897921176, + "grad_norm": 0.30596696896661985, + "learning_rate": 1.0125079413570138e-05, + "loss": 2.5794, + "step": 52479 + }, + { + "epoch": 2.4433270479782108, + "grad_norm": 0.31414776919067905, + "learning_rate": 1.0123445234318058e-05, + "loss": 2.7508, + "step": 52480 + }, + { + "epoch": 2.443373606164304, + "grad_norm": 0.3040653655723986, + "learning_rate": 1.0121811172099721e-05, + "loss": 2.6725, + "step": 52481 + }, + { + "epoch": 2.443420164350397, + "grad_norm": 0.32109856516523066, + "learning_rate": 1.0120177226919948e-05, + "loss": 2.54, + "step": 52482 + }, + { + "epoch": 2.44346672253649, + "grad_norm": 0.34391084066435584, + "learning_rate": 1.011854339878352e-05, + "loss": 2.6676, + "step": 52483 + }, + { + "epoch": 2.4435132807225832, + "grad_norm": 0.2974354830533227, + "learning_rate": 1.0116909687695236e-05, + "loss": 2.528, + "step": 52484 + }, + { + "epoch": 2.4435598389086763, + "grad_norm": 0.30632416955324426, + "learning_rate": 1.0115276093659898e-05, + "loss": 2.5348, + "step": 52485 + }, + { + "epoch": 2.443606397094769, + "grad_norm": 0.3210919500715954, + "learning_rate": 1.0113642616682306e-05, + "loss": 2.7221, + "step": 52486 + }, + { + "epoch": 2.443652955280862, + "grad_norm": 0.3548983735421584, + "learning_rate": 1.0112009256767213e-05, + "loss": 2.7365, + "step": 52487 + }, + { + "epoch": 2.4436995134669552, + "grad_norm": 0.32234746643103335, + "learning_rate": 1.0110376013919475e-05, + "loss": 2.6415, + "step": 52488 + }, + { + "epoch": 2.4437460716530484, + "grad_norm": 0.3529426571092114, + "learning_rate": 1.010874288814384e-05, + "loss": 2.7186, + "step": 52489 + }, + { + "epoch": 2.4437926298391415, + "grad_norm": 0.3322414792473214, + "learning_rate": 1.0107109879445115e-05, + "loss": 2.6146, + "step": 52490 + }, + { + "epoch": 2.4438391880252346, + "grad_norm": 0.3482123459239242, + "learning_rate": 1.0105476987828088e-05, + "loss": 2.6599, + "step": 52491 + }, + { + "epoch": 2.4438857462113277, + "grad_norm": 0.28316176691338724, + "learning_rate": 1.0103844213297563e-05, + "loss": 2.5982, + "step": 52492 + }, + { + "epoch": 2.443932304397421, + "grad_norm": 0.30477999001117895, + "learning_rate": 1.0102211555858332e-05, + "loss": 2.5725, + "step": 52493 + }, + { + "epoch": 2.443978862583514, + "grad_norm": 0.3150924146409346, + "learning_rate": 1.0100579015515155e-05, + "loss": 2.6475, + "step": 52494 + }, + { + "epoch": 2.4440254207696066, + "grad_norm": 0.3535112140246204, + "learning_rate": 1.0098946592272879e-05, + "loss": 2.6687, + "step": 52495 + 
}, + { + "epoch": 2.4440719789556997, + "grad_norm": 0.31679138079833646, + "learning_rate": 1.0097314286136245e-05, + "loss": 2.6241, + "step": 52496 + }, + { + "epoch": 2.444118537141793, + "grad_norm": 0.3110048890160364, + "learning_rate": 1.009568209711006e-05, + "loss": 2.5716, + "step": 52497 + }, + { + "epoch": 2.444165095327886, + "grad_norm": 0.3236869590066057, + "learning_rate": 1.009405002519912e-05, + "loss": 2.5816, + "step": 52498 + }, + { + "epoch": 2.444211653513979, + "grad_norm": 0.3588179479134441, + "learning_rate": 1.0092418070408227e-05, + "loss": 2.6853, + "step": 52499 + }, + { + "epoch": 2.444258211700072, + "grad_norm": 0.3166926954971121, + "learning_rate": 1.0090786232742128e-05, + "loss": 2.623, + "step": 52500 + }, + { + "epoch": 2.4443047698861653, + "grad_norm": 0.30850927697187225, + "learning_rate": 1.0089154512205667e-05, + "loss": 2.6018, + "step": 52501 + }, + { + "epoch": 2.4443513280722584, + "grad_norm": 0.3286860576754635, + "learning_rate": 1.0087522908803587e-05, + "loss": 2.6538, + "step": 52502 + }, + { + "epoch": 2.4443978862583515, + "grad_norm": 0.3181748962748872, + "learning_rate": 1.00858914225407e-05, + "loss": 2.5666, + "step": 52503 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.340612637218356, + "learning_rate": 1.008426005342179e-05, + "loss": 2.7117, + "step": 52504 + }, + { + "epoch": 2.4444910026305378, + "grad_norm": 0.3129767894129077, + "learning_rate": 1.0082628801451643e-05, + "loss": 2.6794, + "step": 52505 + }, + { + "epoch": 2.4445375608166304, + "grad_norm": 0.30849867635422623, + "learning_rate": 1.008099766663506e-05, + "loss": 2.6093, + "step": 52506 + }, + { + "epoch": 2.4445841190027235, + "grad_norm": 0.3417419531119257, + "learning_rate": 1.0079366648976796e-05, + "loss": 2.6987, + "step": 52507 + }, + { + "epoch": 2.4446306771888167, + "grad_norm": 0.32536580212617194, + "learning_rate": 1.0077735748481676e-05, + "loss": 2.6432, + "step": 52508 + }, + { + "epoch": 2.4446772353749098, + "grad_norm": 0.35664188473926667, + "learning_rate": 1.0076104965154459e-05, + "loss": 2.6522, + "step": 52509 + }, + { + "epoch": 2.444723793561003, + "grad_norm": 0.32728530528166605, + "learning_rate": 1.0074474298999942e-05, + "loss": 2.5901, + "step": 52510 + }, + { + "epoch": 2.444770351747096, + "grad_norm": 0.2993015785067541, + "learning_rate": 1.007284375002291e-05, + "loss": 2.6634, + "step": 52511 + }, + { + "epoch": 2.444816909933189, + "grad_norm": 0.32223710162234276, + "learning_rate": 1.0071213318228145e-05, + "loss": 2.5467, + "step": 52512 + }, + { + "epoch": 2.4448634681192822, + "grad_norm": 0.32754758984685933, + "learning_rate": 1.0069583003620437e-05, + "loss": 2.6168, + "step": 52513 + }, + { + "epoch": 2.444910026305375, + "grad_norm": 0.32142561236188694, + "learning_rate": 1.0067952806204578e-05, + "loss": 2.5786, + "step": 52514 + }, + { + "epoch": 2.444956584491468, + "grad_norm": 0.30620625406136415, + "learning_rate": 1.0066322725985338e-05, + "loss": 2.6737, + "step": 52515 + }, + { + "epoch": 2.445003142677561, + "grad_norm": 0.3136488100986446, + "learning_rate": 1.0064692762967504e-05, + "loss": 2.5364, + "step": 52516 + }, + { + "epoch": 2.4450497008636543, + "grad_norm": 0.30202348753548763, + "learning_rate": 1.0063062917155857e-05, + "loss": 2.6234, + "step": 52517 + }, + { + "epoch": 2.4450962590497474, + "grad_norm": 0.3146758215823704, + "learning_rate": 1.006143318855519e-05, + "loss": 2.6775, + "step": 52518 + }, + { + "epoch": 2.4451428172358405, + "grad_norm": 
0.3195829065299091, + "learning_rate": 1.0059803577170296e-05, + "loss": 2.6547, + "step": 52519 + }, + { + "epoch": 2.4451893754219336, + "grad_norm": 0.3407662972136292, + "learning_rate": 1.0058174083005917e-05, + "loss": 2.6464, + "step": 52520 + }, + { + "epoch": 2.4452359336080267, + "grad_norm": 0.3111169329752333, + "learning_rate": 1.005654470606689e-05, + "loss": 2.5829, + "step": 52521 + }, + { + "epoch": 2.44528249179412, + "grad_norm": 0.32923125145824517, + "learning_rate": 1.0054915446357954e-05, + "loss": 2.7168, + "step": 52522 + }, + { + "epoch": 2.445329049980213, + "grad_norm": 0.29248073635067673, + "learning_rate": 1.0053286303883908e-05, + "loss": 2.6366, + "step": 52523 + }, + { + "epoch": 2.445375608166306, + "grad_norm": 0.3175017153859777, + "learning_rate": 1.005165727864953e-05, + "loss": 2.6209, + "step": 52524 + }, + { + "epoch": 2.4454221663523987, + "grad_norm": 0.3323494112007647, + "learning_rate": 1.0050028370659597e-05, + "loss": 2.6745, + "step": 52525 + }, + { + "epoch": 2.445468724538492, + "grad_norm": 0.3285719838774402, + "learning_rate": 1.0048399579918904e-05, + "loss": 2.6643, + "step": 52526 + }, + { + "epoch": 2.445515282724585, + "grad_norm": 0.31631365123511646, + "learning_rate": 1.0046770906432228e-05, + "loss": 2.6498, + "step": 52527 + }, + { + "epoch": 2.445561840910678, + "grad_norm": 0.3035666144462163, + "learning_rate": 1.0045142350204333e-05, + "loss": 2.651, + "step": 52528 + }, + { + "epoch": 2.445608399096771, + "grad_norm": 0.3077453659807395, + "learning_rate": 1.004351391124001e-05, + "loss": 2.6681, + "step": 52529 + }, + { + "epoch": 2.4456549572828643, + "grad_norm": 0.3071125732417471, + "learning_rate": 1.0041885589544037e-05, + "loss": 2.6117, + "step": 52530 + }, + { + "epoch": 2.4457015154689574, + "grad_norm": 0.31204604448430695, + "learning_rate": 1.0040257385121194e-05, + "loss": 2.6332, + "step": 52531 + }, + { + "epoch": 2.4457480736550505, + "grad_norm": 0.3131288241708133, + "learning_rate": 1.003862929797627e-05, + "loss": 2.6402, + "step": 52532 + }, + { + "epoch": 2.4457946318411437, + "grad_norm": 0.30768084097351195, + "learning_rate": 1.003700132811401e-05, + "loss": 2.6166, + "step": 52533 + }, + { + "epoch": 2.4458411900272363, + "grad_norm": 0.3000753689418332, + "learning_rate": 1.003537347553924e-05, + "loss": 2.6162, + "step": 52534 + }, + { + "epoch": 2.4458877482133294, + "grad_norm": 0.3141996812913888, + "learning_rate": 1.0033745740256696e-05, + "loss": 2.5468, + "step": 52535 + }, + { + "epoch": 2.4459343063994226, + "grad_norm": 0.32521885042958104, + "learning_rate": 1.0032118122271167e-05, + "loss": 2.5168, + "step": 52536 + }, + { + "epoch": 2.4459808645855157, + "grad_norm": 0.32851250144328636, + "learning_rate": 1.003049062158744e-05, + "loss": 2.6711, + "step": 52537 + }, + { + "epoch": 2.446027422771609, + "grad_norm": 0.29879802786780324, + "learning_rate": 1.0028863238210285e-05, + "loss": 2.6049, + "step": 52538 + }, + { + "epoch": 2.446073980957702, + "grad_norm": 0.31003020449993807, + "learning_rate": 1.0027235972144472e-05, + "loss": 2.612, + "step": 52539 + }, + { + "epoch": 2.446120539143795, + "grad_norm": 0.31828049659351887, + "learning_rate": 1.0025608823394805e-05, + "loss": 2.6461, + "step": 52540 + }, + { + "epoch": 2.446167097329888, + "grad_norm": 0.3275748359278255, + "learning_rate": 1.002398179196602e-05, + "loss": 2.6163, + "step": 52541 + }, + { + "epoch": 2.4462136555159812, + "grad_norm": 0.31072272929552974, + "learning_rate": 1.002235487786291e-05, + 
"loss": 2.5134, + "step": 52542 + }, + { + "epoch": 2.4462602137020744, + "grad_norm": 0.30924770775577515, + "learning_rate": 1.0020728081090252e-05, + "loss": 2.5767, + "step": 52543 + }, + { + "epoch": 2.4463067718881675, + "grad_norm": 0.2981966108067105, + "learning_rate": 1.001910140165282e-05, + "loss": 2.6424, + "step": 52544 + }, + { + "epoch": 2.44635333007426, + "grad_norm": 0.31408958390970426, + "learning_rate": 1.0017474839555397e-05, + "loss": 2.5896, + "step": 52545 + }, + { + "epoch": 2.4463998882603533, + "grad_norm": 0.32558872115822324, + "learning_rate": 1.0015848394802719e-05, + "loss": 2.5555, + "step": 52546 + }, + { + "epoch": 2.4464464464464464, + "grad_norm": 0.30684481580648265, + "learning_rate": 1.0014222067399614e-05, + "loss": 2.5637, + "step": 52547 + }, + { + "epoch": 2.4464930046325395, + "grad_norm": 0.31248577309097453, + "learning_rate": 1.0012595857350804e-05, + "loss": 2.6601, + "step": 52548 + }, + { + "epoch": 2.4465395628186326, + "grad_norm": 0.3065626138031122, + "learning_rate": 1.001096976466111e-05, + "loss": 2.604, + "step": 52549 + }, + { + "epoch": 2.4465861210047257, + "grad_norm": 0.30156546023582836, + "learning_rate": 1.0009343789335269e-05, + "loss": 2.5879, + "step": 52550 + }, + { + "epoch": 2.446632679190819, + "grad_norm": 0.31223772198470723, + "learning_rate": 1.0007717931378058e-05, + "loss": 2.68, + "step": 52551 + }, + { + "epoch": 2.446679237376912, + "grad_norm": 0.30320263598268543, + "learning_rate": 1.0006092190794263e-05, + "loss": 2.6459, + "step": 52552 + }, + { + "epoch": 2.4467257955630046, + "grad_norm": 0.3391251925674943, + "learning_rate": 1.0004466567588656e-05, + "loss": 2.6652, + "step": 52553 + }, + { + "epoch": 2.4467723537490977, + "grad_norm": 0.29747212064217476, + "learning_rate": 1.0002841061765989e-05, + "loss": 2.6933, + "step": 52554 + }, + { + "epoch": 2.446818911935191, + "grad_norm": 0.31886707936691483, + "learning_rate": 1.0001215673331038e-05, + "loss": 2.6895, + "step": 52555 + }, + { + "epoch": 2.446865470121284, + "grad_norm": 0.3312910012662688, + "learning_rate": 9.999590402288584e-06, + "loss": 2.681, + "step": 52556 + }, + { + "epoch": 2.446912028307377, + "grad_norm": 0.3035600369463434, + "learning_rate": 9.997965248643387e-06, + "loss": 2.5456, + "step": 52557 + }, + { + "epoch": 2.44695858649347, + "grad_norm": 0.2966323067689744, + "learning_rate": 9.996340212400241e-06, + "loss": 2.6462, + "step": 52558 + }, + { + "epoch": 2.4470051446795633, + "grad_norm": 0.3329439640965112, + "learning_rate": 9.994715293563861e-06, + "loss": 2.6372, + "step": 52559 + }, + { + "epoch": 2.4470517028656564, + "grad_norm": 0.31068376211804044, + "learning_rate": 9.993090492139085e-06, + "loss": 2.5799, + "step": 52560 + }, + { + "epoch": 2.4470982610517495, + "grad_norm": 0.3321725449510988, + "learning_rate": 9.991465808130613e-06, + "loss": 2.6597, + "step": 52561 + }, + { + "epoch": 2.4471448192378427, + "grad_norm": 0.31755893050960426, + "learning_rate": 9.989841241543279e-06, + "loss": 2.6593, + "step": 52562 + }, + { + "epoch": 2.4471913774239358, + "grad_norm": 0.3105617441837296, + "learning_rate": 9.988216792381805e-06, + "loss": 2.5676, + "step": 52563 + }, + { + "epoch": 2.4472379356100284, + "grad_norm": 0.34632550842466875, + "learning_rate": 9.986592460650972e-06, + "loss": 2.6722, + "step": 52564 + }, + { + "epoch": 2.4472844937961216, + "grad_norm": 0.3210660698035634, + "learning_rate": 9.984968246355547e-06, + "loss": 2.4571, + "step": 52565 + }, + { + "epoch": 
2.4473310519822147, + "grad_norm": 0.29942581343782493, + "learning_rate": 9.9833441495003e-06, + "loss": 2.5336, + "step": 52566 + }, + { + "epoch": 2.447377610168308, + "grad_norm": 0.30351715426456666, + "learning_rate": 9.981720170090009e-06, + "loss": 2.5908, + "step": 52567 + }, + { + "epoch": 2.447424168354401, + "grad_norm": 0.3156255345703299, + "learning_rate": 9.98009630812941e-06, + "loss": 2.5977, + "step": 52568 + }, + { + "epoch": 2.447470726540494, + "grad_norm": 0.30722017644644695, + "learning_rate": 9.978472563623292e-06, + "loss": 2.6417, + "step": 52569 + }, + { + "epoch": 2.447517284726587, + "grad_norm": 0.3294139106181668, + "learning_rate": 9.97684893657641e-06, + "loss": 2.5709, + "step": 52570 + }, + { + "epoch": 2.4475638429126803, + "grad_norm": 0.3094072483753288, + "learning_rate": 9.975225426993545e-06, + "loss": 2.5971, + "step": 52571 + }, + { + "epoch": 2.4476104010987734, + "grad_norm": 0.31632746597366657, + "learning_rate": 9.973602034879425e-06, + "loss": 2.5455, + "step": 52572 + }, + { + "epoch": 2.447656959284866, + "grad_norm": 0.3199624396848425, + "learning_rate": 9.97197876023887e-06, + "loss": 2.56, + "step": 52573 + }, + { + "epoch": 2.447703517470959, + "grad_norm": 0.3166356659835929, + "learning_rate": 9.970355603076586e-06, + "loss": 2.7301, + "step": 52574 + }, + { + "epoch": 2.4477500756570523, + "grad_norm": 0.3110038072243888, + "learning_rate": 9.968732563397388e-06, + "loss": 2.5515, + "step": 52575 + }, + { + "epoch": 2.4477966338431454, + "grad_norm": 0.32846720308689187, + "learning_rate": 9.967109641206007e-06, + "loss": 2.6931, + "step": 52576 + }, + { + "epoch": 2.4478431920292385, + "grad_norm": 0.29574239723648543, + "learning_rate": 9.965486836507216e-06, + "loss": 2.5948, + "step": 52577 + }, + { + "epoch": 2.4478897502153316, + "grad_norm": 0.3062781180151991, + "learning_rate": 9.963864149305774e-06, + "loss": 2.6335, + "step": 52578 + }, + { + "epoch": 2.4479363084014247, + "grad_norm": 0.32272839086473704, + "learning_rate": 9.962241579606446e-06, + "loss": 2.6012, + "step": 52579 + }, + { + "epoch": 2.447982866587518, + "grad_norm": 0.3153529373410492, + "learning_rate": 9.960619127414006e-06, + "loss": 2.6429, + "step": 52580 + }, + { + "epoch": 2.448029424773611, + "grad_norm": 0.31686116638846445, + "learning_rate": 9.958996792733193e-06, + "loss": 2.5947, + "step": 52581 + }, + { + "epoch": 2.448075982959704, + "grad_norm": 0.311982064269527, + "learning_rate": 9.957374575568784e-06, + "loss": 2.662, + "step": 52582 + }, + { + "epoch": 2.448122541145797, + "grad_norm": 0.3188599713051155, + "learning_rate": 9.955752475925534e-06, + "loss": 2.6797, + "step": 52583 + }, + { + "epoch": 2.44816909933189, + "grad_norm": 0.30075089332985633, + "learning_rate": 9.9541304938082e-06, + "loss": 2.6015, + "step": 52584 + }, + { + "epoch": 2.448215657517983, + "grad_norm": 0.2929734460002792, + "learning_rate": 9.952508629221552e-06, + "loss": 2.5933, + "step": 52585 + }, + { + "epoch": 2.448262215704076, + "grad_norm": 0.30313011498458126, + "learning_rate": 9.950886882170357e-06, + "loss": 2.5426, + "step": 52586 + }, + { + "epoch": 2.448308773890169, + "grad_norm": 0.3140167920096079, + "learning_rate": 9.94926525265934e-06, + "loss": 2.734, + "step": 52587 + }, + { + "epoch": 2.4483553320762623, + "grad_norm": 0.30867776699193683, + "learning_rate": 9.947643740693313e-06, + "loss": 2.6265, + "step": 52588 + }, + { + "epoch": 2.4484018902623554, + "grad_norm": 0.3157734162384649, + "learning_rate": 9.946022346276984e-06, + 
"loss": 2.5783, + "step": 52589 + }, + { + "epoch": 2.4484484484484486, + "grad_norm": 0.3122693586843611, + "learning_rate": 9.944401069415138e-06, + "loss": 2.5778, + "step": 52590 + }, + { + "epoch": 2.4484950066345417, + "grad_norm": 0.3310800211760599, + "learning_rate": 9.942779910112533e-06, + "loss": 2.5849, + "step": 52591 + }, + { + "epoch": 2.4485415648206343, + "grad_norm": 0.30590607794333086, + "learning_rate": 9.941158868373918e-06, + "loss": 2.6968, + "step": 52592 + }, + { + "epoch": 2.4485881230067275, + "grad_norm": 0.3092178535394499, + "learning_rate": 9.939537944204069e-06, + "loss": 2.5933, + "step": 52593 + }, + { + "epoch": 2.4486346811928206, + "grad_norm": 0.3187950303764496, + "learning_rate": 9.937917137607716e-06, + "loss": 2.6624, + "step": 52594 + }, + { + "epoch": 2.4486812393789137, + "grad_norm": 0.31140324035983696, + "learning_rate": 9.936296448589627e-06, + "loss": 2.604, + "step": 52595 + }, + { + "epoch": 2.448727797565007, + "grad_norm": 0.30701667822957696, + "learning_rate": 9.934675877154565e-06, + "loss": 2.504, + "step": 52596 + }, + { + "epoch": 2.4487743557511, + "grad_norm": 0.3091112294198698, + "learning_rate": 9.93305542330728e-06, + "loss": 2.5716, + "step": 52597 + }, + { + "epoch": 2.448820913937193, + "grad_norm": 0.34471551369614895, + "learning_rate": 9.931435087052532e-06, + "loss": 2.5958, + "step": 52598 + }, + { + "epoch": 2.448867472123286, + "grad_norm": 0.32828174463572174, + "learning_rate": 9.929814868395087e-06, + "loss": 2.5191, + "step": 52599 + }, + { + "epoch": 2.4489140303093793, + "grad_norm": 0.2925597370127761, + "learning_rate": 9.92819476733966e-06, + "loss": 2.6364, + "step": 52600 + }, + { + "epoch": 2.4489605884954724, + "grad_norm": 0.34838475314502976, + "learning_rate": 9.926574783891062e-06, + "loss": 2.6992, + "step": 52601 + }, + { + "epoch": 2.4490071466815655, + "grad_norm": 0.32224636838976006, + "learning_rate": 9.924954918054009e-06, + "loss": 2.5756, + "step": 52602 + }, + { + "epoch": 2.449053704867658, + "grad_norm": 0.31489873480434616, + "learning_rate": 9.923335169833258e-06, + "loss": 2.6008, + "step": 52603 + }, + { + "epoch": 2.4491002630537513, + "grad_norm": 0.32167594712137826, + "learning_rate": 9.921715539233573e-06, + "loss": 2.6123, + "step": 52604 + }, + { + "epoch": 2.4491468212398444, + "grad_norm": 0.3044240861819683, + "learning_rate": 9.920096026259706e-06, + "loss": 2.6653, + "step": 52605 + }, + { + "epoch": 2.4491933794259375, + "grad_norm": 0.3057806162299226, + "learning_rate": 9.918476630916424e-06, + "loss": 2.6127, + "step": 52606 + }, + { + "epoch": 2.4492399376120306, + "grad_norm": 0.3289160814127268, + "learning_rate": 9.916857353208432e-06, + "loss": 2.6367, + "step": 52607 + }, + { + "epoch": 2.4492864957981237, + "grad_norm": 0.30091986309305696, + "learning_rate": 9.915238193140548e-06, + "loss": 2.7435, + "step": 52608 + }, + { + "epoch": 2.449333053984217, + "grad_norm": 0.3380688257662011, + "learning_rate": 9.913619150717468e-06, + "loss": 2.6796, + "step": 52609 + }, + { + "epoch": 2.44937961217031, + "grad_norm": 0.31415162196552915, + "learning_rate": 9.912000225943973e-06, + "loss": 2.5781, + "step": 52610 + }, + { + "epoch": 2.449426170356403, + "grad_norm": 0.3133306801439764, + "learning_rate": 9.91038141882481e-06, + "loss": 2.6011, + "step": 52611 + }, + { + "epoch": 2.4494727285424958, + "grad_norm": 0.319854063914269, + "learning_rate": 9.90876272936474e-06, + "loss": 2.6754, + "step": 52612 + }, + { + "epoch": 2.449519286728589, + "grad_norm": 
0.31653798700367336, + "learning_rate": 9.907144157568471e-06, + "loss": 2.6238, + "step": 52613 + }, + { + "epoch": 2.449565844914682, + "grad_norm": 0.3161397970515648, + "learning_rate": 9.905525703440815e-06, + "loss": 2.6596, + "step": 52614 + }, + { + "epoch": 2.449612403100775, + "grad_norm": 0.3161264808743182, + "learning_rate": 9.903907366986475e-06, + "loss": 2.6409, + "step": 52615 + }, + { + "epoch": 2.449658961286868, + "grad_norm": 0.32908017318128924, + "learning_rate": 9.90228914821022e-06, + "loss": 2.6597, + "step": 52616 + }, + { + "epoch": 2.4497055194729613, + "grad_norm": 0.2985486319844541, + "learning_rate": 9.900671047116794e-06, + "loss": 2.5857, + "step": 52617 + }, + { + "epoch": 2.4497520776590545, + "grad_norm": 0.315043961602871, + "learning_rate": 9.89905306371095e-06, + "loss": 2.6365, + "step": 52618 + }, + { + "epoch": 2.4497986358451476, + "grad_norm": 0.3020899416110476, + "learning_rate": 9.897435197997445e-06, + "loss": 2.6516, + "step": 52619 + }, + { + "epoch": 2.4498451940312407, + "grad_norm": 0.32329555323986936, + "learning_rate": 9.895817449980992e-06, + "loss": 2.7073, + "step": 52620 + }, + { + "epoch": 2.449891752217334, + "grad_norm": 0.31970901174035093, + "learning_rate": 9.894199819666388e-06, + "loss": 2.637, + "step": 52621 + }, + { + "epoch": 2.449938310403427, + "grad_norm": 0.3056439823139069, + "learning_rate": 9.892582307058346e-06, + "loss": 2.6582, + "step": 52622 + }, + { + "epoch": 2.4499848685895196, + "grad_norm": 0.31170442188247127, + "learning_rate": 9.890964912161621e-06, + "loss": 2.6473, + "step": 52623 + }, + { + "epoch": 2.4500314267756127, + "grad_norm": 0.3161246050287606, + "learning_rate": 9.889347634980967e-06, + "loss": 2.6263, + "step": 52624 + }, + { + "epoch": 2.450077984961706, + "grad_norm": 0.2980179212585536, + "learning_rate": 9.887730475521123e-06, + "loss": 2.6216, + "step": 52625 + }, + { + "epoch": 2.450124543147799, + "grad_norm": 0.29989472131591666, + "learning_rate": 9.886113433786842e-06, + "loss": 2.6303, + "step": 52626 + }, + { + "epoch": 2.450171101333892, + "grad_norm": 0.3034632577083357, + "learning_rate": 9.884496509782875e-06, + "loss": 2.7388, + "step": 52627 + }, + { + "epoch": 2.450217659519985, + "grad_norm": 0.30216651092740815, + "learning_rate": 9.882879703513947e-06, + "loss": 2.5629, + "step": 52628 + }, + { + "epoch": 2.4502642177060783, + "grad_norm": 0.3089151323282083, + "learning_rate": 9.881263014984814e-06, + "loss": 2.7385, + "step": 52629 + }, + { + "epoch": 2.4503107758921714, + "grad_norm": 0.2976480329056887, + "learning_rate": 9.879646444200224e-06, + "loss": 2.652, + "step": 52630 + }, + { + "epoch": 2.4503573340782645, + "grad_norm": 0.31186471912514774, + "learning_rate": 9.878029991164916e-06, + "loss": 2.5588, + "step": 52631 + }, + { + "epoch": 2.450403892264357, + "grad_norm": 0.3149879350517874, + "learning_rate": 9.876413655883654e-06, + "loss": 2.6723, + "step": 52632 + }, + { + "epoch": 2.4504504504504503, + "grad_norm": 0.306146901806569, + "learning_rate": 9.874797438361133e-06, + "loss": 2.5645, + "step": 52633 + }, + { + "epoch": 2.4504970086365434, + "grad_norm": 0.34367178993008346, + "learning_rate": 9.873181338602161e-06, + "loss": 2.6682, + "step": 52634 + }, + { + "epoch": 2.4505435668226365, + "grad_norm": 0.3344609116862457, + "learning_rate": 9.871565356611435e-06, + "loss": 2.6848, + "step": 52635 + }, + { + "epoch": 2.4505901250087296, + "grad_norm": 0.3106931965721526, + "learning_rate": 9.869949492393709e-06, + "loss": 2.6307, + 
"step": 52636 + }, + { + "epoch": 2.4506366831948228, + "grad_norm": 0.3309042348230973, + "learning_rate": 9.868333745953728e-06, + "loss": 2.776, + "step": 52637 + }, + { + "epoch": 2.450683241380916, + "grad_norm": 0.3309487220087698, + "learning_rate": 9.86671811729623e-06, + "loss": 2.6191, + "step": 52638 + }, + { + "epoch": 2.450729799567009, + "grad_norm": 0.30443507363300115, + "learning_rate": 9.86510260642597e-06, + "loss": 2.5898, + "step": 52639 + }, + { + "epoch": 2.450776357753102, + "grad_norm": 0.3274256469709344, + "learning_rate": 9.863487213347683e-06, + "loss": 2.6826, + "step": 52640 + }, + { + "epoch": 2.450822915939195, + "grad_norm": 0.33736978842243626, + "learning_rate": 9.861871938066102e-06, + "loss": 2.6291, + "step": 52641 + }, + { + "epoch": 2.4508694741252883, + "grad_norm": 0.33609257769228457, + "learning_rate": 9.86025678058597e-06, + "loss": 2.7643, + "step": 52642 + }, + { + "epoch": 2.450916032311381, + "grad_norm": 0.30934451338034985, + "learning_rate": 9.858641740912028e-06, + "loss": 2.7568, + "step": 52643 + }, + { + "epoch": 2.450962590497474, + "grad_norm": 0.32117747689712567, + "learning_rate": 9.85702681904902e-06, + "loss": 2.6101, + "step": 52644 + }, + { + "epoch": 2.4510091486835672, + "grad_norm": 0.3656206138463865, + "learning_rate": 9.855412015001692e-06, + "loss": 2.5949, + "step": 52645 + }, + { + "epoch": 2.4510557068696603, + "grad_norm": 0.32942214674456, + "learning_rate": 9.853797328774755e-06, + "loss": 2.6155, + "step": 52646 + }, + { + "epoch": 2.4511022650557535, + "grad_norm": 0.3059139299397558, + "learning_rate": 9.85218276037299e-06, + "loss": 2.6947, + "step": 52647 + }, + { + "epoch": 2.4511488232418466, + "grad_norm": 0.32742239399000245, + "learning_rate": 9.8505683098011e-06, + "loss": 2.6646, + "step": 52648 + }, + { + "epoch": 2.4511953814279397, + "grad_norm": 0.30947045764462505, + "learning_rate": 9.848953977063841e-06, + "loss": 2.5852, + "step": 52649 + }, + { + "epoch": 2.451241939614033, + "grad_norm": 0.3445154665494412, + "learning_rate": 9.847339762165941e-06, + "loss": 2.6919, + "step": 52650 + }, + { + "epoch": 2.4512884978001255, + "grad_norm": 0.3355323621708516, + "learning_rate": 9.845725665112149e-06, + "loss": 2.7437, + "step": 52651 + }, + { + "epoch": 2.4513350559862186, + "grad_norm": 0.3315998869752747, + "learning_rate": 9.844111685907192e-06, + "loss": 2.6345, + "step": 52652 + }, + { + "epoch": 2.4513816141723117, + "grad_norm": 0.3313244413137799, + "learning_rate": 9.842497824555825e-06, + "loss": 2.7049, + "step": 52653 + }, + { + "epoch": 2.451428172358405, + "grad_norm": 0.32734096615887076, + "learning_rate": 9.840884081062746e-06, + "loss": 2.6011, + "step": 52654 + }, + { + "epoch": 2.451474730544498, + "grad_norm": 0.33528671639943514, + "learning_rate": 9.839270455432742e-06, + "loss": 2.67, + "step": 52655 + }, + { + "epoch": 2.451521288730591, + "grad_norm": 0.3458270064285376, + "learning_rate": 9.83765694767051e-06, + "loss": 2.7254, + "step": 52656 + }, + { + "epoch": 2.451567846916684, + "grad_norm": 0.3038915197779938, + "learning_rate": 9.836043557780794e-06, + "loss": 2.5742, + "step": 52657 + }, + { + "epoch": 2.4516144051027773, + "grad_norm": 0.31893182397540626, + "learning_rate": 9.834430285768348e-06, + "loss": 2.6034, + "step": 52658 + }, + { + "epoch": 2.4516609632888704, + "grad_norm": 0.35212511907416005, + "learning_rate": 9.832817131637866e-06, + "loss": 2.7228, + "step": 52659 + }, + { + "epoch": 2.4517075214749635, + "grad_norm": 0.3358508051626342, + 
"learning_rate": 9.831204095394137e-06, + "loss": 2.617, + "step": 52660 + }, + { + "epoch": 2.4517540796610566, + "grad_norm": 0.3175756031793816, + "learning_rate": 9.829591177041841e-06, + "loss": 2.6119, + "step": 52661 + }, + { + "epoch": 2.4518006378471493, + "grad_norm": 0.324120473026244, + "learning_rate": 9.82797837658576e-06, + "loss": 2.5502, + "step": 52662 + }, + { + "epoch": 2.4518471960332424, + "grad_norm": 0.3368625336082429, + "learning_rate": 9.826365694030598e-06, + "loss": 2.6573, + "step": 52663 + }, + { + "epoch": 2.4518937542193355, + "grad_norm": 0.34320621069105356, + "learning_rate": 9.824753129381087e-06, + "loss": 2.5803, + "step": 52664 + }, + { + "epoch": 2.4519403124054286, + "grad_norm": 0.32717049472771453, + "learning_rate": 9.823140682641974e-06, + "loss": 2.6243, + "step": 52665 + }, + { + "epoch": 2.4519868705915218, + "grad_norm": 0.3263502662583659, + "learning_rate": 9.821528353817994e-06, + "loss": 2.5474, + "step": 52666 + }, + { + "epoch": 2.452033428777615, + "grad_norm": 0.33293092967844923, + "learning_rate": 9.819916142913848e-06, + "loss": 2.6365, + "step": 52667 + }, + { + "epoch": 2.452079986963708, + "grad_norm": 0.32436277919150175, + "learning_rate": 9.818304049934313e-06, + "loss": 2.5168, + "step": 52668 + }, + { + "epoch": 2.452126545149801, + "grad_norm": 0.3351590551988201, + "learning_rate": 9.816692074884082e-06, + "loss": 2.6895, + "step": 52669 + }, + { + "epoch": 2.452173103335894, + "grad_norm": 0.3156533100682618, + "learning_rate": 9.815080217767902e-06, + "loss": 2.6138, + "step": 52670 + }, + { + "epoch": 2.452219661521987, + "grad_norm": 0.31429984140996936, + "learning_rate": 9.813468478590521e-06, + "loss": 2.6017, + "step": 52671 + }, + { + "epoch": 2.45226621970808, + "grad_norm": 0.3479093114002568, + "learning_rate": 9.811856857356622e-06, + "loss": 2.6945, + "step": 52672 + }, + { + "epoch": 2.452312777894173, + "grad_norm": 0.31154758447134856, + "learning_rate": 9.810245354070984e-06, + "loss": 2.615, + "step": 52673 + }, + { + "epoch": 2.4523593360802662, + "grad_norm": 0.31026590551819827, + "learning_rate": 9.808633968738295e-06, + "loss": 2.6204, + "step": 52674 + }, + { + "epoch": 2.4524058942663594, + "grad_norm": 0.33658219264064765, + "learning_rate": 9.807022701363333e-06, + "loss": 2.6604, + "step": 52675 + }, + { + "epoch": 2.4524524524524525, + "grad_norm": 0.31374222980950683, + "learning_rate": 9.805411551950788e-06, + "loss": 2.6153, + "step": 52676 + }, + { + "epoch": 2.4524990106385456, + "grad_norm": 0.33900577661938885, + "learning_rate": 9.803800520505401e-06, + "loss": 2.6164, + "step": 52677 + }, + { + "epoch": 2.4525455688246387, + "grad_norm": 0.29759822351713944, + "learning_rate": 9.802189607031897e-06, + "loss": 2.5613, + "step": 52678 + }, + { + "epoch": 2.452592127010732, + "grad_norm": 0.33924592607724124, + "learning_rate": 9.800578811535005e-06, + "loss": 2.5136, + "step": 52679 + }, + { + "epoch": 2.452638685196825, + "grad_norm": 0.3275906976115431, + "learning_rate": 9.798968134019453e-06, + "loss": 2.6147, + "step": 52680 + }, + { + "epoch": 2.452685243382918, + "grad_norm": 0.30804362083231984, + "learning_rate": 9.797357574489984e-06, + "loss": 2.6701, + "step": 52681 + }, + { + "epoch": 2.4527318015690107, + "grad_norm": 0.3089263499233842, + "learning_rate": 9.7957471329513e-06, + "loss": 2.657, + "step": 52682 + }, + { + "epoch": 2.452778359755104, + "grad_norm": 0.3015250451945413, + "learning_rate": 9.794136809408133e-06, + "loss": 2.6776, + "step": 52683 + }, + { + 
"epoch": 2.452824917941197, + "grad_norm": 0.3412565938393772, + "learning_rate": 9.792526603865227e-06, + "loss": 2.5994, + "step": 52684 + }, + { + "epoch": 2.45287147612729, + "grad_norm": 0.3232747973425524, + "learning_rate": 9.79091651632727e-06, + "loss": 2.5856, + "step": 52685 + }, + { + "epoch": 2.452918034313383, + "grad_norm": 0.29303415323906173, + "learning_rate": 9.789306546799037e-06, + "loss": 2.667, + "step": 52686 + }, + { + "epoch": 2.4529645924994763, + "grad_norm": 0.31712382067093176, + "learning_rate": 9.787696695285203e-06, + "loss": 2.6046, + "step": 52687 + }, + { + "epoch": 2.4530111506855694, + "grad_norm": 0.32058224080216524, + "learning_rate": 9.78608696179054e-06, + "loss": 2.6388, + "step": 52688 + }, + { + "epoch": 2.4530577088716625, + "grad_norm": 0.3231904041816789, + "learning_rate": 9.784477346319742e-06, + "loss": 2.6455, + "step": 52689 + }, + { + "epoch": 2.453104267057755, + "grad_norm": 0.31790000273301866, + "learning_rate": 9.782867848877537e-06, + "loss": 2.666, + "step": 52690 + }, + { + "epoch": 2.4531508252438483, + "grad_norm": 0.3227152416870595, + "learning_rate": 9.781258469468657e-06, + "loss": 2.5635, + "step": 52691 + }, + { + "epoch": 2.4531973834299414, + "grad_norm": 0.3273889060638276, + "learning_rate": 9.779649208097819e-06, + "loss": 2.6133, + "step": 52692 + }, + { + "epoch": 2.4532439416160345, + "grad_norm": 0.31030249383703196, + "learning_rate": 9.778040064769744e-06, + "loss": 2.6599, + "step": 52693 + }, + { + "epoch": 2.4532904998021277, + "grad_norm": 0.31410105651486725, + "learning_rate": 9.776431039489176e-06, + "loss": 2.6512, + "step": 52694 + }, + { + "epoch": 2.4533370579882208, + "grad_norm": 0.3190655825017879, + "learning_rate": 9.774822132260808e-06, + "loss": 2.5778, + "step": 52695 + }, + { + "epoch": 2.453383616174314, + "grad_norm": 0.3273673104438028, + "learning_rate": 9.773213343089376e-06, + "loss": 2.6228, + "step": 52696 + }, + { + "epoch": 2.453430174360407, + "grad_norm": 0.31134966200203956, + "learning_rate": 9.771604671979595e-06, + "loss": 2.6291, + "step": 52697 + }, + { + "epoch": 2.4534767325465, + "grad_norm": 0.30726607288839014, + "learning_rate": 9.769996118936193e-06, + "loss": 2.5225, + "step": 52698 + }, + { + "epoch": 2.4535232907325932, + "grad_norm": 0.30852419316775165, + "learning_rate": 9.768387683963903e-06, + "loss": 2.5296, + "step": 52699 + }, + { + "epoch": 2.4535698489186863, + "grad_norm": 0.32214709274903786, + "learning_rate": 9.766779367067403e-06, + "loss": 2.5479, + "step": 52700 + }, + { + "epoch": 2.453616407104779, + "grad_norm": 0.3134043606279179, + "learning_rate": 9.765171168251469e-06, + "loss": 2.6132, + "step": 52701 + }, + { + "epoch": 2.453662965290872, + "grad_norm": 0.3408538201783164, + "learning_rate": 9.763563087520783e-06, + "loss": 2.6465, + "step": 52702 + }, + { + "epoch": 2.4537095234769652, + "grad_norm": 0.3288466542806085, + "learning_rate": 9.761955124880074e-06, + "loss": 2.723, + "step": 52703 + }, + { + "epoch": 2.4537560816630584, + "grad_norm": 0.3351094578055994, + "learning_rate": 9.760347280334064e-06, + "loss": 2.6165, + "step": 52704 + }, + { + "epoch": 2.4538026398491515, + "grad_norm": 0.3183437869255926, + "learning_rate": 9.758739553887469e-06, + "loss": 2.5579, + "step": 52705 + }, + { + "epoch": 2.4538491980352446, + "grad_norm": 0.31600502263003644, + "learning_rate": 9.757131945545012e-06, + "loss": 2.6866, + "step": 52706 + }, + { + "epoch": 2.4538957562213377, + "grad_norm": 0.3139338593558438, + "learning_rate": 
9.755524455311416e-06, + "loss": 2.5514, + "step": 52707 + }, + { + "epoch": 2.453942314407431, + "grad_norm": 0.3467433598466275, + "learning_rate": 9.753917083191383e-06, + "loss": 2.6448, + "step": 52708 + }, + { + "epoch": 2.453988872593524, + "grad_norm": 0.32462240711537277, + "learning_rate": 9.752309829189632e-06, + "loss": 2.5206, + "step": 52709 + }, + { + "epoch": 2.4540354307796166, + "grad_norm": 0.30311742631337096, + "learning_rate": 9.750702693310892e-06, + "loss": 2.6159, + "step": 52710 + }, + { + "epoch": 2.4540819889657097, + "grad_norm": 0.32062569946055797, + "learning_rate": 9.749095675559871e-06, + "loss": 2.6849, + "step": 52711 + }, + { + "epoch": 2.454128547151803, + "grad_norm": 0.3115931181776837, + "learning_rate": 9.747488775941304e-06, + "loss": 2.5847, + "step": 52712 + }, + { + "epoch": 2.454175105337896, + "grad_norm": 0.336022924201079, + "learning_rate": 9.745881994459866e-06, + "loss": 2.6902, + "step": 52713 + }, + { + "epoch": 2.454221663523989, + "grad_norm": 0.32992041424626695, + "learning_rate": 9.744275331120323e-06, + "loss": 2.6798, + "step": 52714 + }, + { + "epoch": 2.454268221710082, + "grad_norm": 0.32036061578808644, + "learning_rate": 9.742668785927345e-06, + "loss": 2.6678, + "step": 52715 + }, + { + "epoch": 2.4543147798961753, + "grad_norm": 0.32256842130656876, + "learning_rate": 9.741062358885688e-06, + "loss": 2.5547, + "step": 52716 + }, + { + "epoch": 2.4543613380822684, + "grad_norm": 0.3169941946988084, + "learning_rate": 9.739456050000035e-06, + "loss": 2.6201, + "step": 52717 + }, + { + "epoch": 2.4544078962683615, + "grad_norm": 0.3058041099681314, + "learning_rate": 9.737849859275117e-06, + "loss": 2.5569, + "step": 52718 + }, + { + "epoch": 2.4544544544544546, + "grad_norm": 0.3299327503587289, + "learning_rate": 9.736243786715637e-06, + "loss": 2.6009, + "step": 52719 + }, + { + "epoch": 2.4545010126405478, + "grad_norm": 0.31959115726187476, + "learning_rate": 9.734637832326332e-06, + "loss": 2.5467, + "step": 52720 + }, + { + "epoch": 2.4545475708266404, + "grad_norm": 0.31720422711083707, + "learning_rate": 9.73303199611188e-06, + "loss": 2.6559, + "step": 52721 + }, + { + "epoch": 2.4545941290127335, + "grad_norm": 0.3198650072715619, + "learning_rate": 9.731426278077016e-06, + "loss": 2.7196, + "step": 52722 + }, + { + "epoch": 2.4546406871988267, + "grad_norm": 0.3288906605519569, + "learning_rate": 9.729820678226448e-06, + "loss": 2.6319, + "step": 52723 + }, + { + "epoch": 2.45468724538492, + "grad_norm": 0.33859073676576373, + "learning_rate": 9.728215196564889e-06, + "loss": 2.702, + "step": 52724 + }, + { + "epoch": 2.454733803571013, + "grad_norm": 0.3080650324362352, + "learning_rate": 9.726609833097067e-06, + "loss": 2.591, + "step": 52725 + }, + { + "epoch": 2.454780361757106, + "grad_norm": 0.31622801789758, + "learning_rate": 9.725004587827647e-06, + "loss": 2.6101, + "step": 52726 + }, + { + "epoch": 2.454826919943199, + "grad_norm": 0.3324129419393613, + "learning_rate": 9.723399460761401e-06, + "loss": 2.5634, + "step": 52727 + }, + { + "epoch": 2.4548734781292922, + "grad_norm": 0.36461073348813955, + "learning_rate": 9.721794451902976e-06, + "loss": 2.7081, + "step": 52728 + }, + { + "epoch": 2.454920036315385, + "grad_norm": 0.3182844776337137, + "learning_rate": 9.720189561257154e-06, + "loss": 2.7152, + "step": 52729 + }, + { + "epoch": 2.454966594501478, + "grad_norm": 0.31939511641048823, + "learning_rate": 9.718584788828584e-06, + "loss": 2.6336, + "step": 52730 + }, + { + "epoch": 
2.455013152687571, + "grad_norm": 0.32717829238208035, + "learning_rate": 9.716980134622006e-06, + "loss": 2.6664, + "step": 52731 + }, + { + "epoch": 2.4550597108736643, + "grad_norm": 0.33309713585928225, + "learning_rate": 9.715375598642118e-06, + "loss": 2.5813, + "step": 52732 + }, + { + "epoch": 2.4551062690597574, + "grad_norm": 0.32028717571949494, + "learning_rate": 9.713771180893638e-06, + "loss": 2.5528, + "step": 52733 + }, + { + "epoch": 2.4551528272458505, + "grad_norm": 0.33786873686154173, + "learning_rate": 9.712166881381279e-06, + "loss": 2.5483, + "step": 52734 + }, + { + "epoch": 2.4551993854319436, + "grad_norm": 0.3216734484963155, + "learning_rate": 9.710562700109732e-06, + "loss": 2.6814, + "step": 52735 + }, + { + "epoch": 2.4552459436180367, + "grad_norm": 0.31433949769301817, + "learning_rate": 9.708958637083715e-06, + "loss": 2.6088, + "step": 52736 + }, + { + "epoch": 2.45529250180413, + "grad_norm": 0.3199540930618623, + "learning_rate": 9.70735469230793e-06, + "loss": 2.6901, + "step": 52737 + }, + { + "epoch": 2.455339059990223, + "grad_norm": 0.33019443206319515, + "learning_rate": 9.705750865787105e-06, + "loss": 2.5135, + "step": 52738 + }, + { + "epoch": 2.455385618176316, + "grad_norm": 0.3054772165053175, + "learning_rate": 9.704147157525912e-06, + "loss": 2.6117, + "step": 52739 + }, + { + "epoch": 2.4554321763624087, + "grad_norm": 0.3181057878335852, + "learning_rate": 9.702543567529093e-06, + "loss": 2.5993, + "step": 52740 + }, + { + "epoch": 2.455478734548502, + "grad_norm": 0.32150459035942663, + "learning_rate": 9.70094009580132e-06, + "loss": 2.6249, + "step": 52741 + }, + { + "epoch": 2.455525292734595, + "grad_norm": 0.3253201821634501, + "learning_rate": 9.699336742347343e-06, + "loss": 2.5559, + "step": 52742 + }, + { + "epoch": 2.455571850920688, + "grad_norm": 0.30072726665180877, + "learning_rate": 9.697733507171825e-06, + "loss": 2.5821, + "step": 52743 + }, + { + "epoch": 2.455618409106781, + "grad_norm": 0.32524799856534337, + "learning_rate": 9.69613039027949e-06, + "loss": 2.6279, + "step": 52744 + }, + { + "epoch": 2.4556649672928743, + "grad_norm": 0.3110724829041595, + "learning_rate": 9.694527391675045e-06, + "loss": 2.6044, + "step": 52745 + }, + { + "epoch": 2.4557115254789674, + "grad_norm": 0.317420298088127, + "learning_rate": 9.692924511363188e-06, + "loss": 2.6612, + "step": 52746 + }, + { + "epoch": 2.4557580836650605, + "grad_norm": 0.31663807710897107, + "learning_rate": 9.691321749348641e-06, + "loss": 2.5057, + "step": 52747 + }, + { + "epoch": 2.4558046418511537, + "grad_norm": 0.29712384112660395, + "learning_rate": 9.689719105636081e-06, + "loss": 2.5801, + "step": 52748 + }, + { + "epoch": 2.4558512000372463, + "grad_norm": 0.31313575362303964, + "learning_rate": 9.688116580230222e-06, + "loss": 2.6525, + "step": 52749 + }, + { + "epoch": 2.4558977582233394, + "grad_norm": 0.3017045803047875, + "learning_rate": 9.686514173135768e-06, + "loss": 2.7074, + "step": 52750 + }, + { + "epoch": 2.4559443164094326, + "grad_norm": 0.3045647046231872, + "learning_rate": 9.684911884357428e-06, + "loss": 2.5824, + "step": 52751 + }, + { + "epoch": 2.4559908745955257, + "grad_norm": 0.32553426163892457, + "learning_rate": 9.683309713899902e-06, + "loss": 2.6855, + "step": 52752 + }, + { + "epoch": 2.456037432781619, + "grad_norm": 0.31735138321153744, + "learning_rate": 9.681707661767897e-06, + "loss": 2.6925, + "step": 52753 + }, + { + "epoch": 2.456083990967712, + "grad_norm": 0.3215106483034149, + "learning_rate": 
9.680105727966083e-06, + "loss": 2.5562, + "step": 52754 + }, + { + "epoch": 2.456130549153805, + "grad_norm": 0.32154270882230934, + "learning_rate": 9.678503912499214e-06, + "loss": 2.6566, + "step": 52755 + }, + { + "epoch": 2.456177107339898, + "grad_norm": 0.3044459980261171, + "learning_rate": 9.676902215371953e-06, + "loss": 2.6383, + "step": 52756 + }, + { + "epoch": 2.4562236655259913, + "grad_norm": 0.3060646196873655, + "learning_rate": 9.675300636589008e-06, + "loss": 2.6401, + "step": 52757 + }, + { + "epoch": 2.4562702237120844, + "grad_norm": 0.32132101836716054, + "learning_rate": 9.673699176155088e-06, + "loss": 2.6117, + "step": 52758 + }, + { + "epoch": 2.4563167818981775, + "grad_norm": 0.3099630820916277, + "learning_rate": 9.672097834074883e-06, + "loss": 2.684, + "step": 52759 + }, + { + "epoch": 2.45636334008427, + "grad_norm": 0.315196424916465, + "learning_rate": 9.670496610353108e-06, + "loss": 2.5953, + "step": 52760 + }, + { + "epoch": 2.4564098982703633, + "grad_norm": 0.29756887948692606, + "learning_rate": 9.668895504994446e-06, + "loss": 2.6368, + "step": 52761 + }, + { + "epoch": 2.4564564564564564, + "grad_norm": 0.3130087161383935, + "learning_rate": 9.6672945180036e-06, + "loss": 2.6245, + "step": 52762 + }, + { + "epoch": 2.4565030146425495, + "grad_norm": 0.3274058836785312, + "learning_rate": 9.665693649385272e-06, + "loss": 2.6567, + "step": 52763 + }, + { + "epoch": 2.4565495728286426, + "grad_norm": 0.3091389305636614, + "learning_rate": 9.664092899144157e-06, + "loss": 2.6266, + "step": 52764 + }, + { + "epoch": 2.4565961310147357, + "grad_norm": 0.32147290044025867, + "learning_rate": 9.662492267284956e-06, + "loss": 2.6472, + "step": 52765 + }, + { + "epoch": 2.456642689200829, + "grad_norm": 0.30386463686757176, + "learning_rate": 9.660891753812385e-06, + "loss": 2.4971, + "step": 52766 + }, + { + "epoch": 2.456689247386922, + "grad_norm": 0.3263631423796029, + "learning_rate": 9.659291358731087e-06, + "loss": 2.591, + "step": 52767 + }, + { + "epoch": 2.4567358055730146, + "grad_norm": 0.325580386491495, + "learning_rate": 9.657691082045828e-06, + "loss": 2.591, + "step": 52768 + }, + { + "epoch": 2.4567823637591077, + "grad_norm": 0.3144955417166044, + "learning_rate": 9.656090923761257e-06, + "loss": 2.5177, + "step": 52769 + }, + { + "epoch": 2.456828921945201, + "grad_norm": 0.30056452514914406, + "learning_rate": 9.654490883882084e-06, + "loss": 2.6424, + "step": 52770 + }, + { + "epoch": 2.456875480131294, + "grad_norm": 0.3387235420280923, + "learning_rate": 9.652890962413002e-06, + "loss": 2.6522, + "step": 52771 + }, + { + "epoch": 2.456922038317387, + "grad_norm": 0.33650083440759615, + "learning_rate": 9.651291159358716e-06, + "loss": 2.7361, + "step": 52772 + }, + { + "epoch": 2.45696859650348, + "grad_norm": 0.31649060991089595, + "learning_rate": 9.649691474723927e-06, + "loss": 2.6783, + "step": 52773 + }, + { + "epoch": 2.4570151546895733, + "grad_norm": 0.3198078945221666, + "learning_rate": 9.648091908513301e-06, + "loss": 2.7144, + "step": 52774 + }, + { + "epoch": 2.4570617128756664, + "grad_norm": 0.32406914250303825, + "learning_rate": 9.646492460731554e-06, + "loss": 2.5971, + "step": 52775 + }, + { + "epoch": 2.4571082710617596, + "grad_norm": 0.29343705971244216, + "learning_rate": 9.644893131383376e-06, + "loss": 2.6307, + "step": 52776 + }, + { + "epoch": 2.4571548292478527, + "grad_norm": 0.3162180775435305, + "learning_rate": 9.643293920473462e-06, + "loss": 2.5995, + "step": 52777 + }, + { + "epoch": 
2.457201387433946, + "grad_norm": 0.3245292370580904, + "learning_rate": 9.641694828006503e-06, + "loss": 2.6681, + "step": 52778 + }, + { + "epoch": 2.4572479456200385, + "grad_norm": 0.30082361896690774, + "learning_rate": 9.640095853987207e-06, + "loss": 2.5586, + "step": 52779 + }, + { + "epoch": 2.4572945038061316, + "grad_norm": 0.3113013371393774, + "learning_rate": 9.638496998420227e-06, + "loss": 2.6894, + "step": 52780 + }, + { + "epoch": 2.4573410619922247, + "grad_norm": 0.3169644837255126, + "learning_rate": 9.636898261310307e-06, + "loss": 2.4968, + "step": 52781 + }, + { + "epoch": 2.457387620178318, + "grad_norm": 0.33207783046780087, + "learning_rate": 9.635299642662105e-06, + "loss": 2.7025, + "step": 52782 + }, + { + "epoch": 2.457434178364411, + "grad_norm": 0.3045024133364991, + "learning_rate": 9.633701142480317e-06, + "loss": 2.6447, + "step": 52783 + }, + { + "epoch": 2.457480736550504, + "grad_norm": 0.3029209924747023, + "learning_rate": 9.63210276076964e-06, + "loss": 2.6302, + "step": 52784 + }, + { + "epoch": 2.457527294736597, + "grad_norm": 0.29673678263226383, + "learning_rate": 9.630504497534771e-06, + "loss": 2.5566, + "step": 52785 + }, + { + "epoch": 2.4575738529226903, + "grad_norm": 0.3124587312157692, + "learning_rate": 9.628906352780398e-06, + "loss": 2.6574, + "step": 52786 + }, + { + "epoch": 2.4576204111087834, + "grad_norm": 0.32759206928227597, + "learning_rate": 9.627308326511186e-06, + "loss": 2.5882, + "step": 52787 + }, + { + "epoch": 2.457666969294876, + "grad_norm": 0.3104862621869308, + "learning_rate": 9.625710418731877e-06, + "loss": 2.612, + "step": 52788 + }, + { + "epoch": 2.457713527480969, + "grad_norm": 0.29719150229599633, + "learning_rate": 9.624112629447113e-06, + "loss": 2.5658, + "step": 52789 + }, + { + "epoch": 2.4577600856670623, + "grad_norm": 0.29249327250709073, + "learning_rate": 9.6225149586616e-06, + "loss": 2.6442, + "step": 52790 + }, + { + "epoch": 2.4578066438531554, + "grad_norm": 0.323027357832987, + "learning_rate": 9.620917406380026e-06, + "loss": 2.5557, + "step": 52791 + }, + { + "epoch": 2.4578532020392485, + "grad_norm": 0.3253393745332562, + "learning_rate": 9.6193199726071e-06, + "loss": 2.6725, + "step": 52792 + }, + { + "epoch": 2.4578997602253416, + "grad_norm": 0.30047076117027693, + "learning_rate": 9.61772265734746e-06, + "loss": 2.6426, + "step": 52793 + }, + { + "epoch": 2.4579463184114347, + "grad_norm": 0.30370430493331485, + "learning_rate": 9.616125460605857e-06, + "loss": 2.6439, + "step": 52794 + }, + { + "epoch": 2.457992876597528, + "grad_norm": 0.3187305825773, + "learning_rate": 9.614528382386933e-06, + "loss": 2.5711, + "step": 52795 + }, + { + "epoch": 2.458039434783621, + "grad_norm": 0.2932621022939223, + "learning_rate": 9.612931422695388e-06, + "loss": 2.5135, + "step": 52796 + }, + { + "epoch": 2.458085992969714, + "grad_norm": 0.3137046714935828, + "learning_rate": 9.611334581535908e-06, + "loss": 2.6361, + "step": 52797 + }, + { + "epoch": 2.458132551155807, + "grad_norm": 0.3295091699534256, + "learning_rate": 9.609737858913187e-06, + "loss": 2.7027, + "step": 52798 + }, + { + "epoch": 2.4581791093419, + "grad_norm": 0.30762615009837396, + "learning_rate": 9.608141254831915e-06, + "loss": 2.6469, + "step": 52799 + }, + { + "epoch": 2.458225667527993, + "grad_norm": 0.2999058290253995, + "learning_rate": 9.60654476929675e-06, + "loss": 2.5289, + "step": 52800 + }, + { + "epoch": 2.458272225714086, + "grad_norm": 0.32818698023696813, + "learning_rate": 9.604948402312413e-06, + 
"loss": 2.6418, + "step": 52801 + }, + { + "epoch": 2.458318783900179, + "grad_norm": 0.3147176096454744, + "learning_rate": 9.603352153883566e-06, + "loss": 2.5941, + "step": 52802 + }, + { + "epoch": 2.4583653420862723, + "grad_norm": 0.33308849930768475, + "learning_rate": 9.601756024014902e-06, + "loss": 2.7522, + "step": 52803 + }, + { + "epoch": 2.4584119002723654, + "grad_norm": 0.3072727158651858, + "learning_rate": 9.600160012711102e-06, + "loss": 2.5719, + "step": 52804 + }, + { + "epoch": 2.4584584584584586, + "grad_norm": 0.33283293344701176, + "learning_rate": 9.59856411997685e-06, + "loss": 2.6595, + "step": 52805 + }, + { + "epoch": 2.4585050166445517, + "grad_norm": 0.324968857289164, + "learning_rate": 9.596968345816838e-06, + "loss": 2.5676, + "step": 52806 + }, + { + "epoch": 2.4585515748306443, + "grad_norm": 0.3152972233252218, + "learning_rate": 9.595372690235749e-06, + "loss": 2.6314, + "step": 52807 + }, + { + "epoch": 2.4585981330167375, + "grad_norm": 0.30563110124904613, + "learning_rate": 9.593777153238253e-06, + "loss": 2.6014, + "step": 52808 + }, + { + "epoch": 2.4586446912028306, + "grad_norm": 0.32379316605176656, + "learning_rate": 9.592181734829036e-06, + "loss": 2.6022, + "step": 52809 + }, + { + "epoch": 2.4586912493889237, + "grad_norm": 0.31954223054179814, + "learning_rate": 9.590586435012789e-06, + "loss": 2.5975, + "step": 52810 + }, + { + "epoch": 2.458737807575017, + "grad_norm": 0.3083333148544265, + "learning_rate": 9.588991253794188e-06, + "loss": 2.5672, + "step": 52811 + }, + { + "epoch": 2.45878436576111, + "grad_norm": 0.32137654574370067, + "learning_rate": 9.587396191177933e-06, + "loss": 2.7483, + "step": 52812 + }, + { + "epoch": 2.458830923947203, + "grad_norm": 0.31404343723118194, + "learning_rate": 9.585801247168658e-06, + "loss": 2.5033, + "step": 52813 + }, + { + "epoch": 2.458877482133296, + "grad_norm": 0.30676673215667605, + "learning_rate": 9.584206421771103e-06, + "loss": 2.6008, + "step": 52814 + }, + { + "epoch": 2.4589240403193893, + "grad_norm": 0.32394812299666237, + "learning_rate": 9.582611714989908e-06, + "loss": 2.5649, + "step": 52815 + }, + { + "epoch": 2.4589705985054824, + "grad_norm": 0.30510455583192936, + "learning_rate": 9.581017126829767e-06, + "loss": 2.5584, + "step": 52816 + }, + { + "epoch": 2.4590171566915755, + "grad_norm": 0.32730802030214556, + "learning_rate": 9.579422657295357e-06, + "loss": 2.6382, + "step": 52817 + }, + { + "epoch": 2.459063714877668, + "grad_norm": 0.32389647315953085, + "learning_rate": 9.577828306391356e-06, + "loss": 2.6877, + "step": 52818 + }, + { + "epoch": 2.4591102730637613, + "grad_norm": 0.3388672194732395, + "learning_rate": 9.576234074122453e-06, + "loss": 2.7047, + "step": 52819 + }, + { + "epoch": 2.4591568312498544, + "grad_norm": 0.3115659249630116, + "learning_rate": 9.574639960493325e-06, + "loss": 2.6268, + "step": 52820 + }, + { + "epoch": 2.4592033894359475, + "grad_norm": 0.3191077885963322, + "learning_rate": 9.573045965508643e-06, + "loss": 2.6558, + "step": 52821 + }, + { + "epoch": 2.4592499476220406, + "grad_norm": 0.30998746802749294, + "learning_rate": 9.571452089173083e-06, + "loss": 2.6512, + "step": 52822 + }, + { + "epoch": 2.4592965058081337, + "grad_norm": 0.3155554624921249, + "learning_rate": 9.569858331491328e-06, + "loss": 2.6211, + "step": 52823 + }, + { + "epoch": 2.459343063994227, + "grad_norm": 0.3129641043508535, + "learning_rate": 9.56826469246806e-06, + "loss": 2.6154, + "step": 52824 + }, + { + "epoch": 2.45938962218032, + 
"grad_norm": 0.32146899493740627, + "learning_rate": 9.56667117210796e-06, + "loss": 2.6317, + "step": 52825 + }, + { + "epoch": 2.459436180366413, + "grad_norm": 0.3274237243078247, + "learning_rate": 9.565077770415676e-06, + "loss": 2.5713, + "step": 52826 + }, + { + "epoch": 2.4594827385525058, + "grad_norm": 0.3164241197766018, + "learning_rate": 9.563484487395925e-06, + "loss": 2.6892, + "step": 52827 + }, + { + "epoch": 2.459529296738599, + "grad_norm": 0.31420207550523016, + "learning_rate": 9.561891323053351e-06, + "loss": 2.687, + "step": 52828 + }, + { + "epoch": 2.459575854924692, + "grad_norm": 0.3099534665763848, + "learning_rate": 9.560298277392648e-06, + "loss": 2.5727, + "step": 52829 + }, + { + "epoch": 2.459622413110785, + "grad_norm": 0.3104748300200334, + "learning_rate": 9.558705350418485e-06, + "loss": 2.6268, + "step": 52830 + }, + { + "epoch": 2.4596689712968782, + "grad_norm": 0.3101661405749623, + "learning_rate": 9.557112542135532e-06, + "loss": 2.6849, + "step": 52831 + }, + { + "epoch": 2.4597155294829713, + "grad_norm": 0.3124479924952097, + "learning_rate": 9.555519852548472e-06, + "loss": 2.5566, + "step": 52832 + }, + { + "epoch": 2.4597620876690645, + "grad_norm": 0.3109326703146611, + "learning_rate": 9.55392728166199e-06, + "loss": 2.5897, + "step": 52833 + }, + { + "epoch": 2.4598086458551576, + "grad_norm": 0.2916173682542347, + "learning_rate": 9.552334829480735e-06, + "loss": 2.6604, + "step": 52834 + }, + { + "epoch": 2.4598552040412507, + "grad_norm": 0.30784254828764457, + "learning_rate": 9.55074249600939e-06, + "loss": 2.5368, + "step": 52835 + }, + { + "epoch": 2.459901762227344, + "grad_norm": 0.32417897308014176, + "learning_rate": 9.549150281252633e-06, + "loss": 2.5683, + "step": 52836 + }, + { + "epoch": 2.459948320413437, + "grad_norm": 0.30084873611359725, + "learning_rate": 9.547558185215133e-06, + "loss": 2.6159, + "step": 52837 + }, + { + "epoch": 2.4599948785995296, + "grad_norm": 0.3265753558234955, + "learning_rate": 9.54596620790158e-06, + "loss": 2.6564, + "step": 52838 + }, + { + "epoch": 2.4600414367856227, + "grad_norm": 0.31137063933369635, + "learning_rate": 9.544374349316604e-06, + "loss": 2.6451, + "step": 52839 + }, + { + "epoch": 2.460087994971716, + "grad_norm": 0.31327565597649787, + "learning_rate": 9.542782609464929e-06, + "loss": 2.5705, + "step": 52840 + }, + { + "epoch": 2.460134553157809, + "grad_norm": 0.30947336069092224, + "learning_rate": 9.541190988351172e-06, + "loss": 2.6269, + "step": 52841 + }, + { + "epoch": 2.460181111343902, + "grad_norm": 0.3216208659201434, + "learning_rate": 9.539599485980066e-06, + "loss": 2.7483, + "step": 52842 + }, + { + "epoch": 2.460227669529995, + "grad_norm": 0.3361869485510553, + "learning_rate": 9.538008102356233e-06, + "loss": 2.6263, + "step": 52843 + }, + { + "epoch": 2.4602742277160883, + "grad_norm": 0.3046227017118355, + "learning_rate": 9.536416837484358e-06, + "loss": 2.6123, + "step": 52844 + }, + { + "epoch": 2.4603207859021814, + "grad_norm": 0.29004679773639097, + "learning_rate": 9.53482569136912e-06, + "loss": 2.5976, + "step": 52845 + }, + { + "epoch": 2.4603673440882745, + "grad_norm": 0.3107800052729983, + "learning_rate": 9.533234664015189e-06, + "loss": 2.6933, + "step": 52846 + }, + { + "epoch": 2.460413902274367, + "grad_norm": 0.31731391270560966, + "learning_rate": 9.531643755427216e-06, + "loss": 2.5753, + "step": 52847 + }, + { + "epoch": 2.4604604604604603, + "grad_norm": 0.32684116164026045, + "learning_rate": 9.530052965609881e-06, + "loss": 
2.6812, + "step": 52848 + }, + { + "epoch": 2.4605070186465534, + "grad_norm": 0.29488633863146624, + "learning_rate": 9.528462294567859e-06, + "loss": 2.635, + "step": 52849 + }, + { + "epoch": 2.4605535768326465, + "grad_norm": 0.3050846149105211, + "learning_rate": 9.526871742305805e-06, + "loss": 2.6166, + "step": 52850 + }, + { + "epoch": 2.4606001350187396, + "grad_norm": 0.29590148967172974, + "learning_rate": 9.525281308828416e-06, + "loss": 2.5784, + "step": 52851 + }, + { + "epoch": 2.4606466932048328, + "grad_norm": 0.31483282041052874, + "learning_rate": 9.523690994140304e-06, + "loss": 2.6392, + "step": 52852 + }, + { + "epoch": 2.460693251390926, + "grad_norm": 0.3097957920254616, + "learning_rate": 9.522100798246202e-06, + "loss": 2.55, + "step": 52853 + }, + { + "epoch": 2.460739809577019, + "grad_norm": 0.32267541870805616, + "learning_rate": 9.520510721150721e-06, + "loss": 2.6107, + "step": 52854 + }, + { + "epoch": 2.460786367763112, + "grad_norm": 0.3045570750326922, + "learning_rate": 9.51892076285858e-06, + "loss": 2.6745, + "step": 52855 + }, + { + "epoch": 2.460832925949205, + "grad_norm": 0.3255219649733767, + "learning_rate": 9.517330923374407e-06, + "loss": 2.7032, + "step": 52856 + }, + { + "epoch": 2.4608794841352983, + "grad_norm": 0.3196072048347699, + "learning_rate": 9.515741202702877e-06, + "loss": 2.6265, + "step": 52857 + }, + { + "epoch": 2.460926042321391, + "grad_norm": 0.3263605161921759, + "learning_rate": 9.51415160084866e-06, + "loss": 2.6223, + "step": 52858 + }, + { + "epoch": 2.460972600507484, + "grad_norm": 0.30604479043757044, + "learning_rate": 9.512562117816415e-06, + "loss": 2.6173, + "step": 52859 + }, + { + "epoch": 2.4610191586935772, + "grad_norm": 0.3176175866636013, + "learning_rate": 9.51097275361083e-06, + "loss": 2.6242, + "step": 52860 + }, + { + "epoch": 2.4610657168796704, + "grad_norm": 0.28997267964680157, + "learning_rate": 9.509383508236535e-06, + "loss": 2.6064, + "step": 52861 + }, + { + "epoch": 2.4611122750657635, + "grad_norm": 0.3092861777992983, + "learning_rate": 9.507794381698209e-06, + "loss": 2.6915, + "step": 52862 + }, + { + "epoch": 2.4611588332518566, + "grad_norm": 0.32644891676469656, + "learning_rate": 9.506205374000521e-06, + "loss": 2.5435, + "step": 52863 + }, + { + "epoch": 2.4612053914379497, + "grad_norm": 0.31565441048099646, + "learning_rate": 9.504616485148144e-06, + "loss": 2.6672, + "step": 52864 + }, + { + "epoch": 2.461251949624043, + "grad_norm": 0.3262495781939369, + "learning_rate": 9.503027715145697e-06, + "loss": 2.5837, + "step": 52865 + }, + { + "epoch": 2.4612985078101355, + "grad_norm": 0.32209956428072395, + "learning_rate": 9.501439063997902e-06, + "loss": 2.6824, + "step": 52866 + }, + { + "epoch": 2.4613450659962286, + "grad_norm": 0.327366484975428, + "learning_rate": 9.49985053170937e-06, + "loss": 2.6455, + "step": 52867 + }, + { + "epoch": 2.4613916241823217, + "grad_norm": 0.3295479720810261, + "learning_rate": 9.498262118284812e-06, + "loss": 2.6786, + "step": 52868 + }, + { + "epoch": 2.461438182368415, + "grad_norm": 0.32090341996782856, + "learning_rate": 9.496673823728853e-06, + "loss": 2.7154, + "step": 52869 + }, + { + "epoch": 2.461484740554508, + "grad_norm": 0.3446938532057578, + "learning_rate": 9.495085648046165e-06, + "loss": 2.6866, + "step": 52870 + }, + { + "epoch": 2.461531298740601, + "grad_norm": 0.31634914005418596, + "learning_rate": 9.493497591241408e-06, + "loss": 2.6249, + "step": 52871 + }, + { + "epoch": 2.461577856926694, + "grad_norm": 
0.3182669441639284, + "learning_rate": 9.491909653319247e-06, + "loss": 2.6682, + "step": 52872 + }, + { + "epoch": 2.4616244151127873, + "grad_norm": 0.31780646372122134, + "learning_rate": 9.490321834284355e-06, + "loss": 2.566, + "step": 52873 + }, + { + "epoch": 2.4616709732988804, + "grad_norm": 0.32973681124614157, + "learning_rate": 9.488734134141358e-06, + "loss": 2.637, + "step": 52874 + }, + { + "epoch": 2.4617175314849735, + "grad_norm": 0.33800809677354426, + "learning_rate": 9.487146552894942e-06, + "loss": 2.6454, + "step": 52875 + }, + { + "epoch": 2.4617640896710666, + "grad_norm": 0.3311218402688254, + "learning_rate": 9.485559090549756e-06, + "loss": 2.6555, + "step": 52876 + }, + { + "epoch": 2.4618106478571593, + "grad_norm": 0.32230697561141275, + "learning_rate": 9.483971747110464e-06, + "loss": 2.7211, + "step": 52877 + }, + { + "epoch": 2.4618572060432524, + "grad_norm": 0.33151752859141065, + "learning_rate": 9.482384522581727e-06, + "loss": 2.6683, + "step": 52878 + }, + { + "epoch": 2.4619037642293455, + "grad_norm": 0.3248041411153819, + "learning_rate": 9.480797416968207e-06, + "loss": 2.4483, + "step": 52879 + }, + { + "epoch": 2.4619503224154387, + "grad_norm": 0.3092580476275032, + "learning_rate": 9.479210430274532e-06, + "loss": 2.635, + "step": 52880 + }, + { + "epoch": 2.4619968806015318, + "grad_norm": 0.303440269748221, + "learning_rate": 9.477623562505405e-06, + "loss": 2.7045, + "step": 52881 + }, + { + "epoch": 2.462043438787625, + "grad_norm": 0.3317110453304936, + "learning_rate": 9.47603681366545e-06, + "loss": 2.6861, + "step": 52882 + }, + { + "epoch": 2.462089996973718, + "grad_norm": 0.3259789012877557, + "learning_rate": 9.474450183759336e-06, + "loss": 2.5818, + "step": 52883 + }, + { + "epoch": 2.462136555159811, + "grad_norm": 0.3095213693269985, + "learning_rate": 9.472863672791722e-06, + "loss": 2.5001, + "step": 52884 + }, + { + "epoch": 2.4621831133459042, + "grad_norm": 0.31008173374683257, + "learning_rate": 9.471277280767255e-06, + "loss": 2.7013, + "step": 52885 + }, + { + "epoch": 2.462229671531997, + "grad_norm": 0.30306615855058894, + "learning_rate": 9.469691007690612e-06, + "loss": 2.5952, + "step": 52886 + }, + { + "epoch": 2.46227622971809, + "grad_norm": 0.3032399979009675, + "learning_rate": 9.468104853566417e-06, + "loss": 2.5483, + "step": 52887 + }, + { + "epoch": 2.462322787904183, + "grad_norm": 0.30434739119536885, + "learning_rate": 9.46651881839934e-06, + "loss": 2.679, + "step": 52888 + }, + { + "epoch": 2.4623693460902762, + "grad_norm": 0.2965312615193264, + "learning_rate": 9.46493290219404e-06, + "loss": 2.6647, + "step": 52889 + }, + { + "epoch": 2.4624159042763694, + "grad_norm": 0.30004755549230283, + "learning_rate": 9.463347104955166e-06, + "loss": 2.5788, + "step": 52890 + }, + { + "epoch": 2.4624624624624625, + "grad_norm": 0.3214053121121204, + "learning_rate": 9.461761426687382e-06, + "loss": 2.5611, + "step": 52891 + }, + { + "epoch": 2.4625090206485556, + "grad_norm": 0.31058367126576053, + "learning_rate": 9.460175867395343e-06, + "loss": 2.5932, + "step": 52892 + }, + { + "epoch": 2.4625555788346487, + "grad_norm": 0.323515001513528, + "learning_rate": 9.458590427083664e-06, + "loss": 2.5931, + "step": 52893 + }, + { + "epoch": 2.462602137020742, + "grad_norm": 0.31452848785300214, + "learning_rate": 9.45700510575706e-06, + "loss": 2.5705, + "step": 52894 + }, + { + "epoch": 2.462648695206835, + "grad_norm": 0.32722080997219455, + "learning_rate": 9.455419903420137e-06, + "loss": 2.6259, + 
"step": 52895 + }, + { + "epoch": 2.462695253392928, + "grad_norm": 0.3235364730222518, + "learning_rate": 9.453834820077563e-06, + "loss": 2.6488, + "step": 52896 + }, + { + "epoch": 2.4627418115790207, + "grad_norm": 0.315831045013213, + "learning_rate": 9.452249855733986e-06, + "loss": 2.5979, + "step": 52897 + }, + { + "epoch": 2.462788369765114, + "grad_norm": 0.3118172120507303, + "learning_rate": 9.450665010394062e-06, + "loss": 2.7102, + "step": 52898 + }, + { + "epoch": 2.462834927951207, + "grad_norm": 0.3337963157996623, + "learning_rate": 9.449080284062456e-06, + "loss": 2.5163, + "step": 52899 + }, + { + "epoch": 2.4628814861373, + "grad_norm": 0.31003842650007696, + "learning_rate": 9.447495676743779e-06, + "loss": 2.5644, + "step": 52900 + }, + { + "epoch": 2.462928044323393, + "grad_norm": 0.3207629443520272, + "learning_rate": 9.44591118844273e-06, + "loss": 2.5667, + "step": 52901 + }, + { + "epoch": 2.4629746025094863, + "grad_norm": 0.3217412810016832, + "learning_rate": 9.444326819163928e-06, + "loss": 2.6617, + "step": 52902 + }, + { + "epoch": 2.4630211606955794, + "grad_norm": 0.30126376197349347, + "learning_rate": 9.442742568912027e-06, + "loss": 2.7059, + "step": 52903 + }, + { + "epoch": 2.4630677188816725, + "grad_norm": 0.32741682791792437, + "learning_rate": 9.441158437691682e-06, + "loss": 2.6387, + "step": 52904 + }, + { + "epoch": 2.463114277067765, + "grad_norm": 0.3339145211145955, + "learning_rate": 9.439574425507557e-06, + "loss": 2.5393, + "step": 52905 + }, + { + "epoch": 2.4631608352538583, + "grad_norm": 0.3269576292256326, + "learning_rate": 9.437990532364256e-06, + "loss": 2.6691, + "step": 52906 + }, + { + "epoch": 2.4632073934399514, + "grad_norm": 0.3035680774906629, + "learning_rate": 9.436406758266486e-06, + "loss": 2.6018, + "step": 52907 + }, + { + "epoch": 2.4632539516260445, + "grad_norm": 0.30875040416484495, + "learning_rate": 9.434823103218848e-06, + "loss": 2.7159, + "step": 52908 + }, + { + "epoch": 2.4633005098121377, + "grad_norm": 0.3357278896762289, + "learning_rate": 9.43323956722601e-06, + "loss": 2.5942, + "step": 52909 + }, + { + "epoch": 2.4633470679982308, + "grad_norm": 0.33148487753729006, + "learning_rate": 9.431656150292623e-06, + "loss": 2.6492, + "step": 52910 + }, + { + "epoch": 2.463393626184324, + "grad_norm": 0.3107770807482172, + "learning_rate": 9.430072852423322e-06, + "loss": 2.6574, + "step": 52911 + }, + { + "epoch": 2.463440184370417, + "grad_norm": 0.30865394070579055, + "learning_rate": 9.428489673622776e-06, + "loss": 2.6394, + "step": 52912 + }, + { + "epoch": 2.46348674255651, + "grad_norm": 0.3379249291878449, + "learning_rate": 9.426906613895586e-06, + "loss": 2.6976, + "step": 52913 + }, + { + "epoch": 2.4635333007426032, + "grad_norm": 0.3254641087376709, + "learning_rate": 9.425323673246461e-06, + "loss": 2.7294, + "step": 52914 + }, + { + "epoch": 2.4635798589286964, + "grad_norm": 0.3242486009290451, + "learning_rate": 9.423740851679996e-06, + "loss": 2.6711, + "step": 52915 + }, + { + "epoch": 2.463626417114789, + "grad_norm": 0.3227976688819615, + "learning_rate": 9.422158149200855e-06, + "loss": 2.7584, + "step": 52916 + }, + { + "epoch": 2.463672975300882, + "grad_norm": 0.2900597055136981, + "learning_rate": 9.420575565813683e-06, + "loss": 2.5663, + "step": 52917 + }, + { + "epoch": 2.4637195334869753, + "grad_norm": 0.32200929407560136, + "learning_rate": 9.418993101523121e-06, + "loss": 2.6836, + "step": 52918 + }, + { + "epoch": 2.4637660916730684, + "grad_norm": 0.32566706879773377, + 
"learning_rate": 9.417410756333816e-06, + "loss": 2.6397, + "step": 52919 + }, + { + "epoch": 2.4638126498591615, + "grad_norm": 0.3274991972991926, + "learning_rate": 9.415828530250432e-06, + "loss": 2.6412, + "step": 52920 + }, + { + "epoch": 2.4638592080452546, + "grad_norm": 0.32361009008537905, + "learning_rate": 9.414246423277573e-06, + "loss": 2.5601, + "step": 52921 + }, + { + "epoch": 2.4639057662313477, + "grad_norm": 0.29973058417321774, + "learning_rate": 9.41266443541991e-06, + "loss": 2.5047, + "step": 52922 + }, + { + "epoch": 2.463952324417441, + "grad_norm": 0.3069332244866574, + "learning_rate": 9.411082566682072e-06, + "loss": 2.5385, + "step": 52923 + }, + { + "epoch": 2.463998882603534, + "grad_norm": 0.30930056488482083, + "learning_rate": 9.409500817068711e-06, + "loss": 2.4834, + "step": 52924 + }, + { + "epoch": 2.4640454407896266, + "grad_norm": 0.34280930154158895, + "learning_rate": 9.407919186584474e-06, + "loss": 2.6045, + "step": 52925 + }, + { + "epoch": 2.4640919989757197, + "grad_norm": 0.343054417324888, + "learning_rate": 9.406337675233979e-06, + "loss": 2.6565, + "step": 52926 + }, + { + "epoch": 2.464138557161813, + "grad_norm": 0.32957762922940237, + "learning_rate": 9.404756283021898e-06, + "loss": 2.6295, + "step": 52927 + }, + { + "epoch": 2.464185115347906, + "grad_norm": 0.30703693922664993, + "learning_rate": 9.403175009952852e-06, + "loss": 2.5393, + "step": 52928 + }, + { + "epoch": 2.464231673533999, + "grad_norm": 0.319365979823022, + "learning_rate": 9.401593856031487e-06, + "loss": 2.6334, + "step": 52929 + }, + { + "epoch": 2.464278231720092, + "grad_norm": 0.3310541331308874, + "learning_rate": 9.400012821262443e-06, + "loss": 2.6672, + "step": 52930 + }, + { + "epoch": 2.4643247899061853, + "grad_norm": 0.3284632807097067, + "learning_rate": 9.398431905650368e-06, + "loss": 2.6289, + "step": 52931 + }, + { + "epoch": 2.4643713480922784, + "grad_norm": 0.3335477982952159, + "learning_rate": 9.39685110919989e-06, + "loss": 2.5544, + "step": 52932 + }, + { + "epoch": 2.4644179062783715, + "grad_norm": 0.3181686190064848, + "learning_rate": 9.395270431915665e-06, + "loss": 2.6928, + "step": 52933 + }, + { + "epoch": 2.4644644644644647, + "grad_norm": 0.31653287968167676, + "learning_rate": 9.393689873802308e-06, + "loss": 2.5981, + "step": 52934 + }, + { + "epoch": 2.4645110226505578, + "grad_norm": 0.3079304047448663, + "learning_rate": 9.392109434864476e-06, + "loss": 2.5878, + "step": 52935 + }, + { + "epoch": 2.4645575808366504, + "grad_norm": 0.30156828876589964, + "learning_rate": 9.390529115106799e-06, + "loss": 2.6244, + "step": 52936 + }, + { + "epoch": 2.4646041390227436, + "grad_norm": 0.3083571807009285, + "learning_rate": 9.388948914533918e-06, + "loss": 2.6488, + "step": 52937 + }, + { + "epoch": 2.4646506972088367, + "grad_norm": 0.3207763619631722, + "learning_rate": 9.387368833150484e-06, + "loss": 2.6696, + "step": 52938 + }, + { + "epoch": 2.46469725539493, + "grad_norm": 0.3250489374802601, + "learning_rate": 9.385788870961099e-06, + "loss": 2.6315, + "step": 52939 + }, + { + "epoch": 2.464743813581023, + "grad_norm": 0.31355738319595317, + "learning_rate": 9.384209027970448e-06, + "loss": 2.5503, + "step": 52940 + }, + { + "epoch": 2.464790371767116, + "grad_norm": 0.3098540162054344, + "learning_rate": 9.382629304183128e-06, + "loss": 2.6276, + "step": 52941 + }, + { + "epoch": 2.464836929953209, + "grad_norm": 0.32747098541779474, + "learning_rate": 9.381049699603784e-06, + "loss": 2.675, + "step": 52942 + }, + { + 
"epoch": 2.4648834881393022, + "grad_norm": 0.31661153373742784, + "learning_rate": 9.379470214237062e-06, + "loss": 2.6313, + "step": 52943 + }, + { + "epoch": 2.464930046325395, + "grad_norm": 0.30388499527738605, + "learning_rate": 9.377890848087595e-06, + "loss": 2.6007, + "step": 52944 + }, + { + "epoch": 2.464976604511488, + "grad_norm": 0.31150965148365767, + "learning_rate": 9.376311601160015e-06, + "loss": 2.6228, + "step": 52945 + }, + { + "epoch": 2.465023162697581, + "grad_norm": 0.30882665607152515, + "learning_rate": 9.374732473458969e-06, + "loss": 2.4367, + "step": 52946 + }, + { + "epoch": 2.4650697208836743, + "grad_norm": 0.32020236486303255, + "learning_rate": 9.37315346498907e-06, + "loss": 2.6369, + "step": 52947 + }, + { + "epoch": 2.4651162790697674, + "grad_norm": 0.31330878547490565, + "learning_rate": 9.371574575754966e-06, + "loss": 2.6077, + "step": 52948 + }, + { + "epoch": 2.4651628372558605, + "grad_norm": 0.28658748744062346, + "learning_rate": 9.369995805761284e-06, + "loss": 2.5903, + "step": 52949 + }, + { + "epoch": 2.4652093954419536, + "grad_norm": 0.29619114532768137, + "learning_rate": 9.36841715501266e-06, + "loss": 2.6345, + "step": 52950 + }, + { + "epoch": 2.4652559536280467, + "grad_norm": 0.29801536505804604, + "learning_rate": 9.366838623513747e-06, + "loss": 2.6431, + "step": 52951 + }, + { + "epoch": 2.46530251181414, + "grad_norm": 0.2998450917249656, + "learning_rate": 9.365260211269133e-06, + "loss": 2.6259, + "step": 52952 + }, + { + "epoch": 2.465349070000233, + "grad_norm": 0.3236809188850429, + "learning_rate": 9.363681918283502e-06, + "loss": 2.7065, + "step": 52953 + }, + { + "epoch": 2.465395628186326, + "grad_norm": 0.3122416544695357, + "learning_rate": 9.362103744561428e-06, + "loss": 2.6134, + "step": 52954 + }, + { + "epoch": 2.4654421863724187, + "grad_norm": 0.30289722829177756, + "learning_rate": 9.360525690107613e-06, + "loss": 2.6623, + "step": 52955 + }, + { + "epoch": 2.465488744558512, + "grad_norm": 0.33482157889438674, + "learning_rate": 9.358947754926633e-06, + "loss": 2.5989, + "step": 52956 + }, + { + "epoch": 2.465535302744605, + "grad_norm": 0.31233672701994547, + "learning_rate": 9.357369939023136e-06, + "loss": 2.6669, + "step": 52957 + }, + { + "epoch": 2.465581860930698, + "grad_norm": 0.3177174818000905, + "learning_rate": 9.355792242401757e-06, + "loss": 2.6139, + "step": 52958 + }, + { + "epoch": 2.465628419116791, + "grad_norm": 0.33613687843281265, + "learning_rate": 9.35421466506714e-06, + "loss": 2.683, + "step": 52959 + }, + { + "epoch": 2.4656749773028843, + "grad_norm": 0.3060662061920493, + "learning_rate": 9.352637207023879e-06, + "loss": 2.5463, + "step": 52960 + }, + { + "epoch": 2.4657215354889774, + "grad_norm": 0.30972818772809046, + "learning_rate": 9.351059868276629e-06, + "loss": 2.6026, + "step": 52961 + }, + { + "epoch": 2.4657680936750705, + "grad_norm": 0.3102436711128587, + "learning_rate": 9.34948264883001e-06, + "loss": 2.6667, + "step": 52962 + }, + { + "epoch": 2.4658146518611637, + "grad_norm": 0.31103012214156583, + "learning_rate": 9.347905548688657e-06, + "loss": 2.6427, + "step": 52963 + }, + { + "epoch": 2.4658612100472563, + "grad_norm": 0.31117156449257344, + "learning_rate": 9.346328567857204e-06, + "loss": 2.7297, + "step": 52964 + }, + { + "epoch": 2.4659077682333495, + "grad_norm": 0.2980562637708364, + "learning_rate": 9.344751706340255e-06, + "loss": 2.626, + "step": 52965 + }, + { + "epoch": 2.4659543264194426, + "grad_norm": 0.31360380410796185, + 
"learning_rate": 9.34317496414247e-06, + "loss": 2.5083, + "step": 52966 + }, + { + "epoch": 2.4660008846055357, + "grad_norm": 0.30241294240403943, + "learning_rate": 9.341598341268443e-06, + "loss": 2.5451, + "step": 52967 + }, + { + "epoch": 2.466047442791629, + "grad_norm": 0.31200564281900317, + "learning_rate": 9.34002183772284e-06, + "loss": 2.704, + "step": 52968 + }, + { + "epoch": 2.466094000977722, + "grad_norm": 0.31611946378776595, + "learning_rate": 9.338445453510258e-06, + "loss": 2.7399, + "step": 52969 + }, + { + "epoch": 2.466140559163815, + "grad_norm": 0.31643446102358436, + "learning_rate": 9.336869188635334e-06, + "loss": 2.6208, + "step": 52970 + }, + { + "epoch": 2.466187117349908, + "grad_norm": 0.3074072454182272, + "learning_rate": 9.335293043102689e-06, + "loss": 2.5742, + "step": 52971 + }, + { + "epoch": 2.4662336755360013, + "grad_norm": 0.30664083130689834, + "learning_rate": 9.333717016916954e-06, + "loss": 2.62, + "step": 52972 + }, + { + "epoch": 2.4662802337220944, + "grad_norm": 0.32692766938037876, + "learning_rate": 9.332141110082766e-06, + "loss": 2.7275, + "step": 52973 + }, + { + "epoch": 2.4663267919081875, + "grad_norm": 0.3321878491221676, + "learning_rate": 9.330565322604728e-06, + "loss": 2.6656, + "step": 52974 + }, + { + "epoch": 2.46637335009428, + "grad_norm": 0.3169398258030926, + "learning_rate": 9.328989654487469e-06, + "loss": 2.6938, + "step": 52975 + }, + { + "epoch": 2.4664199082803733, + "grad_norm": 0.30720616163684605, + "learning_rate": 9.32741410573562e-06, + "loss": 2.5833, + "step": 52976 + }, + { + "epoch": 2.4664664664664664, + "grad_norm": 0.3233392937915924, + "learning_rate": 9.325838676353822e-06, + "loss": 2.7477, + "step": 52977 + }, + { + "epoch": 2.4665130246525595, + "grad_norm": 0.33979577757877294, + "learning_rate": 9.324263366346648e-06, + "loss": 2.6022, + "step": 52978 + }, + { + "epoch": 2.4665595828386526, + "grad_norm": 0.3103960568192918, + "learning_rate": 9.32268817571878e-06, + "loss": 2.5454, + "step": 52979 + }, + { + "epoch": 2.4666061410247457, + "grad_norm": 0.3139507867660953, + "learning_rate": 9.321113104474794e-06, + "loss": 2.5686, + "step": 52980 + }, + { + "epoch": 2.466652699210839, + "grad_norm": 0.3280608191536015, + "learning_rate": 9.319538152619355e-06, + "loss": 2.5498, + "step": 52981 + }, + { + "epoch": 2.466699257396932, + "grad_norm": 0.33109190441848396, + "learning_rate": 9.317963320157053e-06, + "loss": 2.6591, + "step": 52982 + }, + { + "epoch": 2.4667458155830246, + "grad_norm": 0.3239053605512331, + "learning_rate": 9.316388607092518e-06, + "loss": 2.584, + "step": 52983 + }, + { + "epoch": 2.4667923737691178, + "grad_norm": 0.29819516135096846, + "learning_rate": 9.314814013430372e-06, + "loss": 2.5502, + "step": 52984 + }, + { + "epoch": 2.466838931955211, + "grad_norm": 0.3514928536183958, + "learning_rate": 9.313239539175244e-06, + "loss": 2.7798, + "step": 52985 + }, + { + "epoch": 2.466885490141304, + "grad_norm": 0.3083948499895824, + "learning_rate": 9.311665184331752e-06, + "loss": 2.6668, + "step": 52986 + }, + { + "epoch": 2.466932048327397, + "grad_norm": 0.3132935425752705, + "learning_rate": 9.310090948904521e-06, + "loss": 2.5455, + "step": 52987 + }, + { + "epoch": 2.46697860651349, + "grad_norm": 0.297968624497885, + "learning_rate": 9.308516832898157e-06, + "loss": 2.5135, + "step": 52988 + }, + { + "epoch": 2.4670251646995833, + "grad_norm": 0.3194504289959888, + "learning_rate": 9.306942836317285e-06, + "loss": 2.5954, + "step": 52989 + }, + { + "epoch": 
2.4670717228856764, + "grad_norm": 0.36571924163453046, + "learning_rate": 9.305368959166521e-06, + "loss": 2.5733, + "step": 52990 + }, + { + "epoch": 2.4671182810717696, + "grad_norm": 0.301060200174112, + "learning_rate": 9.3037952014505e-06, + "loss": 2.7111, + "step": 52991 + }, + { + "epoch": 2.4671648392578627, + "grad_norm": 0.31585920203295753, + "learning_rate": 9.302221563173836e-06, + "loss": 2.5391, + "step": 52992 + }, + { + "epoch": 2.467211397443956, + "grad_norm": 0.3507210261982781, + "learning_rate": 9.300648044341121e-06, + "loss": 2.6474, + "step": 52993 + }, + { + "epoch": 2.4672579556300485, + "grad_norm": 0.3133322808486124, + "learning_rate": 9.299074644957018e-06, + "loss": 2.6704, + "step": 52994 + }, + { + "epoch": 2.4673045138161416, + "grad_norm": 0.31210434246786173, + "learning_rate": 9.297501365026112e-06, + "loss": 2.5212, + "step": 52995 + }, + { + "epoch": 2.4673510720022347, + "grad_norm": 0.30919266717849186, + "learning_rate": 9.295928204553022e-06, + "loss": 2.5807, + "step": 52996 + }, + { + "epoch": 2.467397630188328, + "grad_norm": 0.32401784792408567, + "learning_rate": 9.29435516354238e-06, + "loss": 2.6253, + "step": 52997 + }, + { + "epoch": 2.467444188374421, + "grad_norm": 0.3125192041767857, + "learning_rate": 9.292782241998793e-06, + "loss": 2.6842, + "step": 52998 + }, + { + "epoch": 2.467490746560514, + "grad_norm": 0.29802111201028053, + "learning_rate": 9.29120943992688e-06, + "loss": 2.6736, + "step": 52999 + }, + { + "epoch": 2.467537304746607, + "grad_norm": 0.31435065087484426, + "learning_rate": 9.289636757331272e-06, + "loss": 2.6667, + "step": 53000 + }, + { + "epoch": 2.4675838629327003, + "grad_norm": 0.31136746941820265, + "learning_rate": 9.28806419421655e-06, + "loss": 2.5936, + "step": 53001 + }, + { + "epoch": 2.4676304211187934, + "grad_norm": 0.33625923035009203, + "learning_rate": 9.286491750587355e-06, + "loss": 2.7292, + "step": 53002 + }, + { + "epoch": 2.467676979304886, + "grad_norm": 0.29960258615427604, + "learning_rate": 9.28491942644829e-06, + "loss": 2.6756, + "step": 53003 + }, + { + "epoch": 2.467723537490979, + "grad_norm": 0.31657178087715937, + "learning_rate": 9.283347221803985e-06, + "loss": 2.5578, + "step": 53004 + }, + { + "epoch": 2.4677700956770723, + "grad_norm": 0.2995757071005387, + "learning_rate": 9.281775136659049e-06, + "loss": 2.6143, + "step": 53005 + }, + { + "epoch": 2.4678166538631654, + "grad_norm": 0.32853888449686214, + "learning_rate": 9.280203171018064e-06, + "loss": 2.5764, + "step": 53006 + }, + { + "epoch": 2.4678632120492585, + "grad_norm": 0.3010200208599782, + "learning_rate": 9.278631324885705e-06, + "loss": 2.6015, + "step": 53007 + }, + { + "epoch": 2.4679097702353516, + "grad_norm": 0.29652254237268577, + "learning_rate": 9.277059598266519e-06, + "loss": 2.5625, + "step": 53008 + }, + { + "epoch": 2.4679563284214447, + "grad_norm": 0.3028674948930221, + "learning_rate": 9.275487991165182e-06, + "loss": 2.6542, + "step": 53009 + }, + { + "epoch": 2.468002886607538, + "grad_norm": 0.3193430600011624, + "learning_rate": 9.27391650358626e-06, + "loss": 2.6853, + "step": 53010 + }, + { + "epoch": 2.468049444793631, + "grad_norm": 0.32342775492939885, + "learning_rate": 9.272345135534383e-06, + "loss": 2.5995, + "step": 53011 + }, + { + "epoch": 2.468096002979724, + "grad_norm": 0.30985215773876223, + "learning_rate": 9.270773887014157e-06, + "loss": 2.5942, + "step": 53012 + }, + { + "epoch": 2.468142561165817, + "grad_norm": 0.30187317214886233, + "learning_rate": 
9.269202758030215e-06, + "loss": 2.6071, + "step": 53013 + }, + { + "epoch": 2.46818911935191, + "grad_norm": 0.31632811952431117, + "learning_rate": 9.26763174858713e-06, + "loss": 2.6183, + "step": 53014 + }, + { + "epoch": 2.468235677538003, + "grad_norm": 0.3002675313980568, + "learning_rate": 9.26606085868954e-06, + "loss": 2.6736, + "step": 53015 + }, + { + "epoch": 2.468282235724096, + "grad_norm": 0.3158407761663312, + "learning_rate": 9.264490088342048e-06, + "loss": 2.6354, + "step": 53016 + }, + { + "epoch": 2.4683287939101892, + "grad_norm": 0.30971968987750004, + "learning_rate": 9.262919437549261e-06, + "loss": 2.6246, + "step": 53017 + }, + { + "epoch": 2.4683753520962823, + "grad_norm": 0.3101872514086155, + "learning_rate": 9.261348906315803e-06, + "loss": 2.5741, + "step": 53018 + }, + { + "epoch": 2.4684219102823755, + "grad_norm": 0.29343929386098083, + "learning_rate": 9.259778494646254e-06, + "loss": 2.6079, + "step": 53019 + }, + { + "epoch": 2.4684684684684686, + "grad_norm": 0.30300402031758217, + "learning_rate": 9.25820820254526e-06, + "loss": 2.6984, + "step": 53020 + }, + { + "epoch": 2.4685150266545617, + "grad_norm": 0.2992761159321603, + "learning_rate": 9.25663803001739e-06, + "loss": 2.6378, + "step": 53021 + }, + { + "epoch": 2.468561584840655, + "grad_norm": 0.32339087345154177, + "learning_rate": 9.2550679770673e-06, + "loss": 2.6971, + "step": 53022 + }, + { + "epoch": 2.4686081430267475, + "grad_norm": 0.3403072306724585, + "learning_rate": 9.253498043699555e-06, + "loss": 2.6714, + "step": 53023 + }, + { + "epoch": 2.4686547012128406, + "grad_norm": 0.31029192043143095, + "learning_rate": 9.251928229918783e-06, + "loss": 2.6123, + "step": 53024 + }, + { + "epoch": 2.4687012593989337, + "grad_norm": 0.29853059310631896, + "learning_rate": 9.250358535729586e-06, + "loss": 2.648, + "step": 53025 + }, + { + "epoch": 2.468747817585027, + "grad_norm": 0.3374061580256817, + "learning_rate": 9.248788961136567e-06, + "loss": 2.7308, + "step": 53026 + }, + { + "epoch": 2.46879437577112, + "grad_norm": 0.31860879885277643, + "learning_rate": 9.247219506144355e-06, + "loss": 2.632, + "step": 53027 + }, + { + "epoch": 2.468840933957213, + "grad_norm": 0.30722818268792523, + "learning_rate": 9.245650170757531e-06, + "loss": 2.5929, + "step": 53028 + }, + { + "epoch": 2.468887492143306, + "grad_norm": 0.3043859444889832, + "learning_rate": 9.2440809549807e-06, + "loss": 2.597, + "step": 53029 + }, + { + "epoch": 2.4689340503293993, + "grad_norm": 0.3249410435816619, + "learning_rate": 9.24251185881848e-06, + "loss": 2.6119, + "step": 53030 + }, + { + "epoch": 2.4689806085154924, + "grad_norm": 0.3005422819775587, + "learning_rate": 9.24094288227549e-06, + "loss": 2.7332, + "step": 53031 + }, + { + "epoch": 2.4690271667015855, + "grad_norm": 0.30306474415809437, + "learning_rate": 9.239374025356284e-06, + "loss": 2.5972, + "step": 53032 + }, + { + "epoch": 2.4690737248876786, + "grad_norm": 0.30460181455917995, + "learning_rate": 9.237805288065526e-06, + "loss": 2.6274, + "step": 53033 + }, + { + "epoch": 2.4691202830737713, + "grad_norm": 0.3016355306936556, + "learning_rate": 9.236236670407772e-06, + "loss": 2.6095, + "step": 53034 + }, + { + "epoch": 2.4691668412598644, + "grad_norm": 0.31456471819924203, + "learning_rate": 9.234668172387673e-06, + "loss": 2.6051, + "step": 53035 + }, + { + "epoch": 2.4692133994459575, + "grad_norm": 0.29509537030972777, + "learning_rate": 9.233099794009791e-06, + "loss": 2.5549, + "step": 53036 + }, + { + "epoch": 
2.4692599576320506, + "grad_norm": 0.31878867095907976, + "learning_rate": 9.231531535278749e-06, + "loss": 2.6684, + "step": 53037 + }, + { + "epoch": 2.4693065158181438, + "grad_norm": 0.3179413895658837, + "learning_rate": 9.229963396199138e-06, + "loss": 2.6124, + "step": 53038 + }, + { + "epoch": 2.469353074004237, + "grad_norm": 0.3037055994785079, + "learning_rate": 9.228395376775572e-06, + "loss": 2.6041, + "step": 53039 + }, + { + "epoch": 2.46939963219033, + "grad_norm": 0.3326567239088783, + "learning_rate": 9.226827477012662e-06, + "loss": 2.5508, + "step": 53040 + }, + { + "epoch": 2.469446190376423, + "grad_norm": 0.31513986320265625, + "learning_rate": 9.225259696914984e-06, + "loss": 2.6015, + "step": 53041 + }, + { + "epoch": 2.4694927485625158, + "grad_norm": 0.3260883993633467, + "learning_rate": 9.223692036487153e-06, + "loss": 2.6235, + "step": 53042 + }, + { + "epoch": 2.469539306748609, + "grad_norm": 0.3256302503242026, + "learning_rate": 9.22212449573377e-06, + "loss": 2.6304, + "step": 53043 + }, + { + "epoch": 2.469585864934702, + "grad_norm": 0.3113905039390815, + "learning_rate": 9.22055707465943e-06, + "loss": 2.6391, + "step": 53044 + }, + { + "epoch": 2.469632423120795, + "grad_norm": 0.3194175073054603, + "learning_rate": 9.218989773268743e-06, + "loss": 2.6681, + "step": 53045 + }, + { + "epoch": 2.4696789813068882, + "grad_norm": 0.31695331714062736, + "learning_rate": 9.21742259156631e-06, + "loss": 2.5583, + "step": 53046 + }, + { + "epoch": 2.4697255394929813, + "grad_norm": 0.32621674952928775, + "learning_rate": 9.215855529556699e-06, + "loss": 2.6835, + "step": 53047 + }, + { + "epoch": 2.4697720976790745, + "grad_norm": 0.306456074487793, + "learning_rate": 9.214288587244562e-06, + "loss": 2.6198, + "step": 53048 + }, + { + "epoch": 2.4698186558651676, + "grad_norm": 0.2932368301366092, + "learning_rate": 9.212721764634458e-06, + "loss": 2.6115, + "step": 53049 + }, + { + "epoch": 2.4698652140512607, + "grad_norm": 0.31385552596034066, + "learning_rate": 9.211155061730997e-06, + "loss": 2.6868, + "step": 53050 + }, + { + "epoch": 2.469911772237354, + "grad_norm": 0.32481677784273255, + "learning_rate": 9.209588478538778e-06, + "loss": 2.5914, + "step": 53051 + }, + { + "epoch": 2.469958330423447, + "grad_norm": 0.31243725231100083, + "learning_rate": 9.208022015062395e-06, + "loss": 2.5898, + "step": 53052 + }, + { + "epoch": 2.4700048886095396, + "grad_norm": 0.3227237235300471, + "learning_rate": 9.206455671306468e-06, + "loss": 2.6687, + "step": 53053 + }, + { + "epoch": 2.4700514467956327, + "grad_norm": 0.3171991578841345, + "learning_rate": 9.204889447275555e-06, + "loss": 2.5963, + "step": 53054 + }, + { + "epoch": 2.470098004981726, + "grad_norm": 0.31060072612145245, + "learning_rate": 9.203323342974274e-06, + "loss": 2.6703, + "step": 53055 + }, + { + "epoch": 2.470144563167819, + "grad_norm": 0.3182806557588518, + "learning_rate": 9.20175735840722e-06, + "loss": 2.5386, + "step": 53056 + }, + { + "epoch": 2.470191121353912, + "grad_norm": 0.32343410653487065, + "learning_rate": 9.200191493578991e-06, + "loss": 2.6606, + "step": 53057 + }, + { + "epoch": 2.470237679540005, + "grad_norm": 0.33968890675236446, + "learning_rate": 9.198625748494183e-06, + "loss": 2.7158, + "step": 53058 + }, + { + "epoch": 2.4702842377260983, + "grad_norm": 0.29994619356852936, + "learning_rate": 9.197060123157397e-06, + "loss": 2.5633, + "step": 53059 + }, + { + "epoch": 2.4703307959121914, + "grad_norm": 0.3038924437705181, + "learning_rate": 
9.195494617573196e-06, + "loss": 2.5275, + "step": 53060 + }, + { + "epoch": 2.4703773540982845, + "grad_norm": 0.3143742763378941, + "learning_rate": 9.193929231746223e-06, + "loss": 2.6879, + "step": 53061 + }, + { + "epoch": 2.470423912284377, + "grad_norm": 0.29643227453496157, + "learning_rate": 9.192363965681034e-06, + "loss": 2.7236, + "step": 53062 + }, + { + "epoch": 2.4704704704704703, + "grad_norm": 0.31205597542567637, + "learning_rate": 9.190798819382234e-06, + "loss": 2.5822, + "step": 53063 + }, + { + "epoch": 2.4705170286565634, + "grad_norm": 0.3065736155242549, + "learning_rate": 9.189233792854423e-06, + "loss": 2.6172, + "step": 53064 + }, + { + "epoch": 2.4705635868426565, + "grad_norm": 0.32172256804319843, + "learning_rate": 9.18766888610219e-06, + "loss": 2.5061, + "step": 53065 + }, + { + "epoch": 2.4706101450287496, + "grad_norm": 0.3468270022452032, + "learning_rate": 9.186104099130144e-06, + "loss": 2.6467, + "step": 53066 + }, + { + "epoch": 2.4706567032148428, + "grad_norm": 0.3229374652883336, + "learning_rate": 9.184539431942845e-06, + "loss": 2.6325, + "step": 53067 + }, + { + "epoch": 2.470703261400936, + "grad_norm": 0.31082121399806556, + "learning_rate": 9.182974884544903e-06, + "loss": 2.6908, + "step": 53068 + }, + { + "epoch": 2.470749819587029, + "grad_norm": 0.3121186746421622, + "learning_rate": 9.18141045694091e-06, + "loss": 2.5464, + "step": 53069 + }, + { + "epoch": 2.470796377773122, + "grad_norm": 0.31800449959701244, + "learning_rate": 9.179846149135452e-06, + "loss": 2.7196, + "step": 53070 + }, + { + "epoch": 2.4708429359592152, + "grad_norm": 0.31004354155502767, + "learning_rate": 9.178281961133128e-06, + "loss": 2.5928, + "step": 53071 + }, + { + "epoch": 2.4708894941453083, + "grad_norm": 0.3190634862537192, + "learning_rate": 9.176717892938536e-06, + "loss": 2.6882, + "step": 53072 + }, + { + "epoch": 2.470936052331401, + "grad_norm": 0.3178113360599625, + "learning_rate": 9.175153944556226e-06, + "loss": 2.6395, + "step": 53073 + }, + { + "epoch": 2.470982610517494, + "grad_norm": 0.32149897229614566, + "learning_rate": 9.17359011599085e-06, + "loss": 2.5839, + "step": 53074 + }, + { + "epoch": 2.4710291687035872, + "grad_norm": 0.31797861891408746, + "learning_rate": 9.172026407246947e-06, + "loss": 2.5596, + "step": 53075 + }, + { + "epoch": 2.4710757268896804, + "grad_norm": 0.3077685095964339, + "learning_rate": 9.170462818329122e-06, + "loss": 2.6536, + "step": 53076 + }, + { + "epoch": 2.4711222850757735, + "grad_norm": 0.3244344562649532, + "learning_rate": 9.168899349241972e-06, + "loss": 2.6412, + "step": 53077 + }, + { + "epoch": 2.4711688432618666, + "grad_norm": 0.2995708138320664, + "learning_rate": 9.16733599999008e-06, + "loss": 2.5816, + "step": 53078 + }, + { + "epoch": 2.4712154014479597, + "grad_norm": 0.3019924348441022, + "learning_rate": 9.165772770578045e-06, + "loss": 2.6307, + "step": 53079 + }, + { + "epoch": 2.471261959634053, + "grad_norm": 0.3043939039945886, + "learning_rate": 9.164209661010415e-06, + "loss": 2.6445, + "step": 53080 + }, + { + "epoch": 2.4713085178201455, + "grad_norm": 0.32252617362994823, + "learning_rate": 9.162646671291836e-06, + "loss": 2.6637, + "step": 53081 + }, + { + "epoch": 2.4713550760062386, + "grad_norm": 0.30903189520197516, + "learning_rate": 9.161083801426857e-06, + "loss": 2.5945, + "step": 53082 + }, + { + "epoch": 2.4714016341923317, + "grad_norm": 0.31533878345272853, + "learning_rate": 9.159521051420067e-06, + "loss": 2.545, + "step": 53083 + }, + { + "epoch": 
2.471448192378425, + "grad_norm": 0.3024288075243915, + "learning_rate": 9.157958421276064e-06, + "loss": 2.6247, + "step": 53084 + }, + { + "epoch": 2.471494750564518, + "grad_norm": 0.30992551666791585, + "learning_rate": 9.156395910999438e-06, + "loss": 2.635, + "step": 53085 + }, + { + "epoch": 2.471541308750611, + "grad_norm": 0.3126297286145445, + "learning_rate": 9.154833520594741e-06, + "loss": 2.6336, + "step": 53086 + }, + { + "epoch": 2.471587866936704, + "grad_norm": 0.298256756990473, + "learning_rate": 9.153271250066608e-06, + "loss": 2.5446, + "step": 53087 + }, + { + "epoch": 2.4716344251227973, + "grad_norm": 0.3088908226229176, + "learning_rate": 9.151709099419591e-06, + "loss": 2.6472, + "step": 53088 + }, + { + "epoch": 2.4716809833088904, + "grad_norm": 0.2883182820714946, + "learning_rate": 9.150147068658283e-06, + "loss": 2.5493, + "step": 53089 + }, + { + "epoch": 2.4717275414949835, + "grad_norm": 0.32673163351845025, + "learning_rate": 9.14858515778727e-06, + "loss": 2.6525, + "step": 53090 + }, + { + "epoch": 2.4717740996810766, + "grad_norm": 0.3194391663804694, + "learning_rate": 9.14702336681113e-06, + "loss": 2.5765, + "step": 53091 + }, + { + "epoch": 2.4718206578671693, + "grad_norm": 0.29394619410561496, + "learning_rate": 9.14546169573447e-06, + "loss": 2.4878, + "step": 53092 + }, + { + "epoch": 2.4718672160532624, + "grad_norm": 0.30293882796544336, + "learning_rate": 9.14390014456183e-06, + "loss": 2.6057, + "step": 53093 + }, + { + "epoch": 2.4719137742393555, + "grad_norm": 0.30935233741416057, + "learning_rate": 9.142338713297837e-06, + "loss": 2.6152, + "step": 53094 + }, + { + "epoch": 2.4719603324254487, + "grad_norm": 0.326019778775354, + "learning_rate": 9.140777401947048e-06, + "loss": 2.6494, + "step": 53095 + }, + { + "epoch": 2.4720068906115418, + "grad_norm": 0.30739650587796796, + "learning_rate": 9.139216210514051e-06, + "loss": 2.629, + "step": 53096 + }, + { + "epoch": 2.472053448797635, + "grad_norm": 0.30903338557010934, + "learning_rate": 9.137655139003425e-06, + "loss": 2.6699, + "step": 53097 + }, + { + "epoch": 2.472100006983728, + "grad_norm": 0.31563790724207536, + "learning_rate": 9.136094187419759e-06, + "loss": 2.6632, + "step": 53098 + }, + { + "epoch": 2.472146565169821, + "grad_norm": 0.31627421021780466, + "learning_rate": 9.134533355767633e-06, + "loss": 2.6514, + "step": 53099 + }, + { + "epoch": 2.4721931233559142, + "grad_norm": 0.32100777558233534, + "learning_rate": 9.132972644051634e-06, + "loss": 2.6499, + "step": 53100 + }, + { + "epoch": 2.472239681542007, + "grad_norm": 0.31051128875077894, + "learning_rate": 9.131412052276322e-06, + "loss": 2.6018, + "step": 53101 + }, + { + "epoch": 2.4722862397281, + "grad_norm": 0.29878236857167384, + "learning_rate": 9.129851580446291e-06, + "loss": 2.6059, + "step": 53102 + }, + { + "epoch": 2.472332797914193, + "grad_norm": 0.30868487939020234, + "learning_rate": 9.12829122856612e-06, + "loss": 2.521, + "step": 53103 + }, + { + "epoch": 2.4723793561002863, + "grad_norm": 0.3304829541670731, + "learning_rate": 9.126730996640386e-06, + "loss": 2.6053, + "step": 53104 + }, + { + "epoch": 2.4724259142863794, + "grad_norm": 0.3071590311176969, + "learning_rate": 9.125170884673684e-06, + "loss": 2.6801, + "step": 53105 + }, + { + "epoch": 2.4724724724724725, + "grad_norm": 0.3140461255820479, + "learning_rate": 9.123610892670553e-06, + "loss": 2.6407, + "step": 53106 + }, + { + "epoch": 2.4725190306585656, + "grad_norm": 0.3071435047788367, + "learning_rate": 
9.12205102063562e-06, + "loss": 2.5042, + "step": 53107 + }, + { + "epoch": 2.4725655888446587, + "grad_norm": 0.303497370380869, + "learning_rate": 9.120491268573428e-06, + "loss": 2.4333, + "step": 53108 + }, + { + "epoch": 2.472612147030752, + "grad_norm": 0.31873889072660516, + "learning_rate": 9.11893163648857e-06, + "loss": 2.4893, + "step": 53109 + }, + { + "epoch": 2.472658705216845, + "grad_norm": 0.32711187170964867, + "learning_rate": 9.11737212438562e-06, + "loss": 2.5853, + "step": 53110 + }, + { + "epoch": 2.472705263402938, + "grad_norm": 0.30449700886200964, + "learning_rate": 9.115812732269152e-06, + "loss": 2.6754, + "step": 53111 + }, + { + "epoch": 2.4727518215890307, + "grad_norm": 0.31501554102903756, + "learning_rate": 9.114253460143745e-06, + "loss": 2.6415, + "step": 53112 + }, + { + "epoch": 2.472798379775124, + "grad_norm": 0.3310907018312611, + "learning_rate": 9.112694308013992e-06, + "loss": 2.6245, + "step": 53113 + }, + { + "epoch": 2.472844937961217, + "grad_norm": 0.3261948070631336, + "learning_rate": 9.111135275884442e-06, + "loss": 2.5755, + "step": 53114 + }, + { + "epoch": 2.47289149614731, + "grad_norm": 0.32425252402921767, + "learning_rate": 9.109576363759676e-06, + "loss": 2.6813, + "step": 53115 + }, + { + "epoch": 2.472938054333403, + "grad_norm": 0.323323363989614, + "learning_rate": 9.108017571644279e-06, + "loss": 2.5725, + "step": 53116 + }, + { + "epoch": 2.4729846125194963, + "grad_norm": 0.31413536456526975, + "learning_rate": 9.106458899542825e-06, + "loss": 2.5563, + "step": 53117 + }, + { + "epoch": 2.4730311707055894, + "grad_norm": 0.34505067787770294, + "learning_rate": 9.104900347459894e-06, + "loss": 2.694, + "step": 53118 + }, + { + "epoch": 2.4730777288916825, + "grad_norm": 0.2959964014015501, + "learning_rate": 9.103341915400026e-06, + "loss": 2.6302, + "step": 53119 + }, + { + "epoch": 2.473124287077775, + "grad_norm": 0.3065041922045318, + "learning_rate": 9.101783603367847e-06, + "loss": 2.5919, + "step": 53120 + }, + { + "epoch": 2.4731708452638683, + "grad_norm": 0.33386544442189153, + "learning_rate": 9.100225411367891e-06, + "loss": 2.5523, + "step": 53121 + }, + { + "epoch": 2.4732174034499614, + "grad_norm": 0.3258398425166303, + "learning_rate": 9.098667339404743e-06, + "loss": 2.6635, + "step": 53122 + }, + { + "epoch": 2.4732639616360546, + "grad_norm": 0.298237251587551, + "learning_rate": 9.097109387482976e-06, + "loss": 2.6117, + "step": 53123 + }, + { + "epoch": 2.4733105198221477, + "grad_norm": 0.3155637562909038, + "learning_rate": 9.095551555607168e-06, + "loss": 2.6322, + "step": 53124 + }, + { + "epoch": 2.473357078008241, + "grad_norm": 0.29770278822121493, + "learning_rate": 9.093993843781878e-06, + "loss": 2.5463, + "step": 53125 + }, + { + "epoch": 2.473403636194334, + "grad_norm": 0.33158417836593074, + "learning_rate": 9.092436252011705e-06, + "loss": 2.5758, + "step": 53126 + }, + { + "epoch": 2.473450194380427, + "grad_norm": 0.3238431982552719, + "learning_rate": 9.090878780301192e-06, + "loss": 2.6893, + "step": 53127 + }, + { + "epoch": 2.47349675256652, + "grad_norm": 0.3056204752648366, + "learning_rate": 9.089321428654913e-06, + "loss": 2.5957, + "step": 53128 + }, + { + "epoch": 2.4735433107526132, + "grad_norm": 0.30739734030939647, + "learning_rate": 9.087764197077448e-06, + "loss": 2.5958, + "step": 53129 + }, + { + "epoch": 2.4735898689387064, + "grad_norm": 0.29774732536792836, + "learning_rate": 9.086207085573367e-06, + "loss": 2.629, + "step": 53130 + }, + { + "epoch": 
2.473636427124799, + "grad_norm": 0.3064059370475846, + "learning_rate": 9.084650094147246e-06, + "loss": 2.6418, + "step": 53131 + }, + { + "epoch": 2.473682985310892, + "grad_norm": 0.312567049285319, + "learning_rate": 9.083093222803623e-06, + "loss": 2.5762, + "step": 53132 + }, + { + "epoch": 2.4737295434969853, + "grad_norm": 0.32680905768539253, + "learning_rate": 9.081536471547114e-06, + "loss": 2.6376, + "step": 53133 + }, + { + "epoch": 2.4737761016830784, + "grad_norm": 0.3271833124939273, + "learning_rate": 9.079979840382241e-06, + "loss": 2.6325, + "step": 53134 + }, + { + "epoch": 2.4738226598691715, + "grad_norm": 0.29331053682932506, + "learning_rate": 9.078423329313624e-06, + "loss": 2.6742, + "step": 53135 + }, + { + "epoch": 2.4738692180552646, + "grad_norm": 0.31897981143677717, + "learning_rate": 9.076866938345786e-06, + "loss": 2.624, + "step": 53136 + }, + { + "epoch": 2.4739157762413577, + "grad_norm": 0.3206278813046901, + "learning_rate": 9.075310667483311e-06, + "loss": 2.6859, + "step": 53137 + }, + { + "epoch": 2.473962334427451, + "grad_norm": 0.3356485670550971, + "learning_rate": 9.073754516730771e-06, + "loss": 2.6253, + "step": 53138 + }, + { + "epoch": 2.474008892613544, + "grad_norm": 0.307454365404001, + "learning_rate": 9.072198486092737e-06, + "loss": 2.6345, + "step": 53139 + }, + { + "epoch": 2.4740554507996366, + "grad_norm": 0.3140557614527073, + "learning_rate": 9.070642575573762e-06, + "loss": 2.5506, + "step": 53140 + }, + { + "epoch": 2.4741020089857297, + "grad_norm": 0.3407026808871439, + "learning_rate": 9.069086785178416e-06, + "loss": 2.6266, + "step": 53141 + }, + { + "epoch": 2.474148567171823, + "grad_norm": 0.3126300308347745, + "learning_rate": 9.067531114911265e-06, + "loss": 2.6084, + "step": 53142 + }, + { + "epoch": 2.474195125357916, + "grad_norm": 0.3071504327348101, + "learning_rate": 9.065975564776886e-06, + "loss": 2.6364, + "step": 53143 + }, + { + "epoch": 2.474241683544009, + "grad_norm": 0.2949828649560129, + "learning_rate": 9.064420134779839e-06, + "loss": 2.6403, + "step": 53144 + }, + { + "epoch": 2.474288241730102, + "grad_norm": 0.3281706378073463, + "learning_rate": 9.062864824924666e-06, + "loss": 2.7089, + "step": 53145 + }, + { + "epoch": 2.4743347999161953, + "grad_norm": 0.31367025559817036, + "learning_rate": 9.061309635215976e-06, + "loss": 2.5713, + "step": 53146 + }, + { + "epoch": 2.4743813581022884, + "grad_norm": 0.31651810336254804, + "learning_rate": 9.059754565658286e-06, + "loss": 2.653, + "step": 53147 + }, + { + "epoch": 2.4744279162883815, + "grad_norm": 0.31073956900713695, + "learning_rate": 9.058199616256208e-06, + "loss": 2.6358, + "step": 53148 + }, + { + "epoch": 2.4744744744744747, + "grad_norm": 0.3017927137338264, + "learning_rate": 9.056644787014262e-06, + "loss": 2.6255, + "step": 53149 + }, + { + "epoch": 2.4745210326605678, + "grad_norm": 0.3406155534248407, + "learning_rate": 9.055090077937035e-06, + "loss": 2.6793, + "step": 53150 + }, + { + "epoch": 2.4745675908466604, + "grad_norm": 0.30386746639483836, + "learning_rate": 9.053535489029086e-06, + "loss": 2.6184, + "step": 53151 + }, + { + "epoch": 2.4746141490327536, + "grad_norm": 0.32461207475337756, + "learning_rate": 9.051981020294976e-06, + "loss": 2.6172, + "step": 53152 + }, + { + "epoch": 2.4746607072188467, + "grad_norm": 0.3168262743527839, + "learning_rate": 9.050426671739276e-06, + "loss": 2.7148, + "step": 53153 + }, + { + "epoch": 2.47470726540494, + "grad_norm": 0.3196957801192002, + "learning_rate": 
9.048872443366529e-06, + "loss": 2.6081, + "step": 53154 + }, + { + "epoch": 2.474753823591033, + "grad_norm": 0.3058987088694504, + "learning_rate": 9.04731833518131e-06, + "loss": 2.664, + "step": 53155 + }, + { + "epoch": 2.474800381777126, + "grad_norm": 0.3034136468078525, + "learning_rate": 9.045764347188174e-06, + "loss": 2.566, + "step": 53156 + }, + { + "epoch": 2.474846939963219, + "grad_norm": 0.32759647786026896, + "learning_rate": 9.044210479391701e-06, + "loss": 2.6323, + "step": 53157 + }, + { + "epoch": 2.4748934981493123, + "grad_norm": 0.3177067847437423, + "learning_rate": 9.042656731796405e-06, + "loss": 2.5818, + "step": 53158 + }, + { + "epoch": 2.474940056335405, + "grad_norm": 0.3361459712921282, + "learning_rate": 9.041103104406906e-06, + "loss": 2.6692, + "step": 53159 + }, + { + "epoch": 2.474986614521498, + "grad_norm": 0.32926224436529733, + "learning_rate": 9.03954959722771e-06, + "loss": 2.5904, + "step": 53160 + }, + { + "epoch": 2.475033172707591, + "grad_norm": 0.333589819736593, + "learning_rate": 9.037996210263421e-06, + "loss": 2.5897, + "step": 53161 + }, + { + "epoch": 2.4750797308936843, + "grad_norm": 0.3047344791342487, + "learning_rate": 9.036442943518569e-06, + "loss": 2.6243, + "step": 53162 + }, + { + "epoch": 2.4751262890797774, + "grad_norm": 0.29533281877457174, + "learning_rate": 9.03488979699772e-06, + "loss": 2.6386, + "step": 53163 + }, + { + "epoch": 2.4751728472658705, + "grad_norm": 0.3195133347597412, + "learning_rate": 9.03333677070543e-06, + "loss": 2.6374, + "step": 53164 + }, + { + "epoch": 2.4752194054519636, + "grad_norm": 0.3319281364717663, + "learning_rate": 9.031783864646265e-06, + "loss": 2.7306, + "step": 53165 + }, + { + "epoch": 2.4752659636380567, + "grad_norm": 0.3083791181893417, + "learning_rate": 9.030231078824791e-06, + "loss": 2.5887, + "step": 53166 + }, + { + "epoch": 2.47531252182415, + "grad_norm": 0.3101441828827288, + "learning_rate": 9.028678413245539e-06, + "loss": 2.6376, + "step": 53167 + }, + { + "epoch": 2.475359080010243, + "grad_norm": 0.32358835381992235, + "learning_rate": 9.027125867913084e-06, + "loss": 2.783, + "step": 53168 + }, + { + "epoch": 2.475405638196336, + "grad_norm": 0.30499047530358026, + "learning_rate": 9.025573442831969e-06, + "loss": 2.5911, + "step": 53169 + }, + { + "epoch": 2.4754521963824287, + "grad_norm": 0.324896289854733, + "learning_rate": 9.024021138006766e-06, + "loss": 2.5914, + "step": 53170 + }, + { + "epoch": 2.475498754568522, + "grad_norm": 0.3030857968869162, + "learning_rate": 9.022468953442027e-06, + "loss": 2.6383, + "step": 53171 + }, + { + "epoch": 2.475545312754615, + "grad_norm": 0.32367316943244506, + "learning_rate": 9.020916889142312e-06, + "loss": 2.6433, + "step": 53172 + }, + { + "epoch": 2.475591870940708, + "grad_norm": 0.3119894887372612, + "learning_rate": 9.019364945112141e-06, + "loss": 2.5095, + "step": 53173 + }, + { + "epoch": 2.475638429126801, + "grad_norm": 0.28935331392234714, + "learning_rate": 9.017813121356127e-06, + "loss": 2.5684, + "step": 53174 + }, + { + "epoch": 2.4756849873128943, + "grad_norm": 0.30863108382545784, + "learning_rate": 9.01626141787878e-06, + "loss": 2.6458, + "step": 53175 + }, + { + "epoch": 2.4757315454989874, + "grad_norm": 0.3036094605503507, + "learning_rate": 9.014709834684665e-06, + "loss": 2.5618, + "step": 53176 + }, + { + "epoch": 2.4757781036850806, + "grad_norm": 0.32797730403739583, + "learning_rate": 9.013158371778341e-06, + "loss": 2.6521, + "step": 53177 + }, + { + "epoch": 2.4758246618711737, 
+ "grad_norm": 0.31085763546063355, + "learning_rate": 9.01160702916436e-06, + "loss": 2.647, + "step": 53178 + }, + { + "epoch": 2.4758712200572663, + "grad_norm": 0.324372137959814, + "learning_rate": 9.010055806847289e-06, + "loss": 2.6227, + "step": 53179 + }, + { + "epoch": 2.4759177782433595, + "grad_norm": 0.31032176567925135, + "learning_rate": 9.00850470483165e-06, + "loss": 2.516, + "step": 53180 + }, + { + "epoch": 2.4759643364294526, + "grad_norm": 0.30227023733457575, + "learning_rate": 9.006953723122014e-06, + "loss": 2.5838, + "step": 53181 + }, + { + "epoch": 2.4760108946155457, + "grad_norm": 0.3289626417427905, + "learning_rate": 9.005402861722929e-06, + "loss": 2.6649, + "step": 53182 + }, + { + "epoch": 2.476057452801639, + "grad_norm": 0.30133085705723966, + "learning_rate": 9.003852120638944e-06, + "loss": 2.5194, + "step": 53183 + }, + { + "epoch": 2.476104010987732, + "grad_norm": 0.3019199890204744, + "learning_rate": 9.002301499874621e-06, + "loss": 2.6283, + "step": 53184 + }, + { + "epoch": 2.476150569173825, + "grad_norm": 0.3065020803895346, + "learning_rate": 9.000750999434515e-06, + "loss": 2.6602, + "step": 53185 + }, + { + "epoch": 2.476197127359918, + "grad_norm": 0.31942969447825675, + "learning_rate": 8.999200619323139e-06, + "loss": 2.6252, + "step": 53186 + }, + { + "epoch": 2.4762436855460113, + "grad_norm": 0.31686865904853784, + "learning_rate": 8.997650359545095e-06, + "loss": 2.6591, + "step": 53187 + }, + { + "epoch": 2.4762902437321044, + "grad_norm": 0.31599658057483104, + "learning_rate": 8.996100220104898e-06, + "loss": 2.6332, + "step": 53188 + }, + { + "epoch": 2.4763368019181975, + "grad_norm": 0.3026776494447996, + "learning_rate": 8.994550201007102e-06, + "loss": 2.7352, + "step": 53189 + }, + { + "epoch": 2.47638336010429, + "grad_norm": 0.30902917226437815, + "learning_rate": 8.993000302256265e-06, + "loss": 2.6959, + "step": 53190 + }, + { + "epoch": 2.4764299182903833, + "grad_norm": 0.31085638096780893, + "learning_rate": 8.991450523856932e-06, + "loss": 2.7123, + "step": 53191 + }, + { + "epoch": 2.4764764764764764, + "grad_norm": 0.32555607797861935, + "learning_rate": 8.989900865813667e-06, + "loss": 2.639, + "step": 53192 + }, + { + "epoch": 2.4765230346625695, + "grad_norm": 0.2985960458925506, + "learning_rate": 8.988351328130983e-06, + "loss": 2.6023, + "step": 53193 + }, + { + "epoch": 2.4765695928486626, + "grad_norm": 0.30166609379907366, + "learning_rate": 8.986801910813452e-06, + "loss": 2.4916, + "step": 53194 + }, + { + "epoch": 2.4766161510347557, + "grad_norm": 0.30706954887362414, + "learning_rate": 8.98525261386562e-06, + "loss": 2.5919, + "step": 53195 + }, + { + "epoch": 2.476662709220849, + "grad_norm": 0.3665550936277224, + "learning_rate": 8.983703437292024e-06, + "loss": 2.6715, + "step": 53196 + }, + { + "epoch": 2.476709267406942, + "grad_norm": 0.3025381939028393, + "learning_rate": 8.982154381097224e-06, + "loss": 2.6369, + "step": 53197 + }, + { + "epoch": 2.476755825593035, + "grad_norm": 0.30813026130820814, + "learning_rate": 8.980605445285766e-06, + "loss": 2.6124, + "step": 53198 + }, + { + "epoch": 2.4768023837791278, + "grad_norm": 0.30663067797622995, + "learning_rate": 8.97905662986217e-06, + "loss": 2.6914, + "step": 53199 + }, + { + "epoch": 2.476848941965221, + "grad_norm": 0.2962064193211524, + "learning_rate": 8.977507934831025e-06, + "loss": 2.487, + "step": 53200 + }, + { + "epoch": 2.476895500151314, + "grad_norm": 0.3144275363022892, + "learning_rate": 8.975959360196845e-06, + "loss": 
2.6983, + "step": 53201 + }, + { + "epoch": 2.476942058337407, + "grad_norm": 0.3222797706422437, + "learning_rate": 8.974410905964176e-06, + "loss": 2.5695, + "step": 53202 + }, + { + "epoch": 2.4769886165235, + "grad_norm": 0.29720136772659583, + "learning_rate": 8.972862572137569e-06, + "loss": 2.5436, + "step": 53203 + }, + { + "epoch": 2.4770351747095933, + "grad_norm": 0.30757542319374354, + "learning_rate": 8.971314358721572e-06, + "loss": 2.7165, + "step": 53204 + }, + { + "epoch": 2.4770817328956865, + "grad_norm": 0.2958625631391692, + "learning_rate": 8.969766265720741e-06, + "loss": 2.6745, + "step": 53205 + }, + { + "epoch": 2.4771282910817796, + "grad_norm": 0.30453263205341397, + "learning_rate": 8.968218293139574e-06, + "loss": 2.7278, + "step": 53206 + }, + { + "epoch": 2.4771748492678727, + "grad_norm": 0.3248558040657661, + "learning_rate": 8.966670440982667e-06, + "loss": 2.7169, + "step": 53207 + }, + { + "epoch": 2.477221407453966, + "grad_norm": 0.31157296977405385, + "learning_rate": 8.965122709254531e-06, + "loss": 2.5849, + "step": 53208 + }, + { + "epoch": 2.477267965640059, + "grad_norm": 0.3049474383140592, + "learning_rate": 8.96357509795972e-06, + "loss": 2.651, + "step": 53209 + }, + { + "epoch": 2.4773145238261516, + "grad_norm": 0.3059355753059508, + "learning_rate": 8.96202760710277e-06, + "loss": 2.6899, + "step": 53210 + }, + { + "epoch": 2.4773610820122447, + "grad_norm": 0.3099582587968712, + "learning_rate": 8.960480236688223e-06, + "loss": 2.5804, + "step": 53211 + }, + { + "epoch": 2.477407640198338, + "grad_norm": 0.3148677225665583, + "learning_rate": 8.958932986720626e-06, + "loss": 2.623, + "step": 53212 + }, + { + "epoch": 2.477454198384431, + "grad_norm": 0.2994984134957566, + "learning_rate": 8.957385857204531e-06, + "loss": 2.5687, + "step": 53213 + }, + { + "epoch": 2.477500756570524, + "grad_norm": 0.3228093303176435, + "learning_rate": 8.955838848144448e-06, + "loss": 2.6646, + "step": 53214 + }, + { + "epoch": 2.477547314756617, + "grad_norm": 0.31755112231004545, + "learning_rate": 8.954291959544942e-06, + "loss": 2.6248, + "step": 53215 + }, + { + "epoch": 2.4775938729427103, + "grad_norm": 0.3150491161763163, + "learning_rate": 8.952745191410538e-06, + "loss": 2.6638, + "step": 53216 + }, + { + "epoch": 2.4776404311288034, + "grad_norm": 0.3038129120891179, + "learning_rate": 8.951198543745792e-06, + "loss": 2.5707, + "step": 53217 + }, + { + "epoch": 2.477686989314896, + "grad_norm": 0.32711038611309795, + "learning_rate": 8.949652016555237e-06, + "loss": 2.5944, + "step": 53218 + }, + { + "epoch": 2.477733547500989, + "grad_norm": 0.319438699637333, + "learning_rate": 8.948105609843394e-06, + "loss": 2.5832, + "step": 53219 + }, + { + "epoch": 2.4777801056870823, + "grad_norm": 0.3151681786801743, + "learning_rate": 8.946559323614834e-06, + "loss": 2.5827, + "step": 53220 + }, + { + "epoch": 2.4778266638731754, + "grad_norm": 0.3237285360666488, + "learning_rate": 8.945013157874065e-06, + "loss": 2.6068, + "step": 53221 + }, + { + "epoch": 2.4778732220592685, + "grad_norm": 0.30968339322115557, + "learning_rate": 8.943467112625642e-06, + "loss": 2.6059, + "step": 53222 + }, + { + "epoch": 2.4779197802453616, + "grad_norm": 0.3314357139937964, + "learning_rate": 8.941921187874097e-06, + "loss": 2.7336, + "step": 53223 + }, + { + "epoch": 2.4779663384314548, + "grad_norm": 0.32333499618766637, + "learning_rate": 8.940375383623973e-06, + "loss": 2.5354, + "step": 53224 + }, + { + "epoch": 2.478012896617548, + "grad_norm": 
0.33830735816282026, + "learning_rate": 8.938829699879796e-06, + "loss": 2.6892, + "step": 53225 + }, + { + "epoch": 2.478059454803641, + "grad_norm": 0.30541459605322757, + "learning_rate": 8.937284136646123e-06, + "loss": 2.6983, + "step": 53226 + }, + { + "epoch": 2.478106012989734, + "grad_norm": 0.29098981933591134, + "learning_rate": 8.935738693927458e-06, + "loss": 2.6182, + "step": 53227 + }, + { + "epoch": 2.478152571175827, + "grad_norm": 0.3055277969753974, + "learning_rate": 8.934193371728362e-06, + "loss": 2.6507, + "step": 53228 + }, + { + "epoch": 2.47819912936192, + "grad_norm": 0.3101037135063964, + "learning_rate": 8.93264817005336e-06, + "loss": 2.6424, + "step": 53229 + }, + { + "epoch": 2.478245687548013, + "grad_norm": 0.3171296236976661, + "learning_rate": 8.931103088906989e-06, + "loss": 2.7391, + "step": 53230 + }, + { + "epoch": 2.478292245734106, + "grad_norm": 0.31724663162556366, + "learning_rate": 8.929558128293802e-06, + "loss": 2.6184, + "step": 53231 + }, + { + "epoch": 2.4783388039201992, + "grad_norm": 0.3189731728407656, + "learning_rate": 8.928013288218284e-06, + "loss": 2.671, + "step": 53232 + }, + { + "epoch": 2.4783853621062923, + "grad_norm": 0.3352642738432185, + "learning_rate": 8.926468568685031e-06, + "loss": 2.5993, + "step": 53233 + }, + { + "epoch": 2.4784319202923855, + "grad_norm": 0.33212983460043005, + "learning_rate": 8.924923969698528e-06, + "loss": 2.5928, + "step": 53234 + }, + { + "epoch": 2.4784784784784786, + "grad_norm": 0.30543877183622226, + "learning_rate": 8.923379491263333e-06, + "loss": 2.5315, + "step": 53235 + }, + { + "epoch": 2.4785250366645717, + "grad_norm": 0.30644111369926613, + "learning_rate": 8.921835133383966e-06, + "loss": 2.6008, + "step": 53236 + }, + { + "epoch": 2.478571594850665, + "grad_norm": 0.3294583297133643, + "learning_rate": 8.92029089606497e-06, + "loss": 2.5317, + "step": 53237 + }, + { + "epoch": 2.4786181530367575, + "grad_norm": 0.32094282492929294, + "learning_rate": 8.918746779310877e-06, + "loss": 2.6577, + "step": 53238 + }, + { + "epoch": 2.4786647112228506, + "grad_norm": 0.30968936733969304, + "learning_rate": 8.917202783126221e-06, + "loss": 2.59, + "step": 53239 + }, + { + "epoch": 2.4787112694089437, + "grad_norm": 0.31436053581458123, + "learning_rate": 8.915658907515518e-06, + "loss": 2.664, + "step": 53240 + }, + { + "epoch": 2.478757827595037, + "grad_norm": 0.3251273027643609, + "learning_rate": 8.914115152483304e-06, + "loss": 2.5739, + "step": 53241 + }, + { + "epoch": 2.47880438578113, + "grad_norm": 0.3096558722297171, + "learning_rate": 8.912571518034118e-06, + "loss": 2.6279, + "step": 53242 + }, + { + "epoch": 2.478850943967223, + "grad_norm": 0.31309519614039893, + "learning_rate": 8.91102800417249e-06, + "loss": 2.6704, + "step": 53243 + }, + { + "epoch": 2.478897502153316, + "grad_norm": 0.3243284533479356, + "learning_rate": 8.909484610902956e-06, + "loss": 2.6752, + "step": 53244 + }, + { + "epoch": 2.4789440603394093, + "grad_norm": 0.3080280168330993, + "learning_rate": 8.907941338230014e-06, + "loss": 2.7124, + "step": 53245 + }, + { + "epoch": 2.4789906185255024, + "grad_norm": 0.2989120069948386, + "learning_rate": 8.906398186158238e-06, + "loss": 2.674, + "step": 53246 + }, + { + "epoch": 2.4790371767115955, + "grad_norm": 0.3048328383213105, + "learning_rate": 8.90485515469211e-06, + "loss": 2.6309, + "step": 53247 + }, + { + "epoch": 2.4790837348976886, + "grad_norm": 0.3280759019173638, + "learning_rate": 8.903312243836209e-06, + "loss": 2.5551, + "step": 
53248 + }, + { + "epoch": 2.4791302930837813, + "grad_norm": 0.31472283612398616, + "learning_rate": 8.901769453595027e-06, + "loss": 2.503, + "step": 53249 + }, + { + "epoch": 2.4791768512698744, + "grad_norm": 0.3072217079026237, + "learning_rate": 8.900226783973099e-06, + "loss": 2.5859, + "step": 53250 + }, + { + "epoch": 2.4792234094559675, + "grad_norm": 0.3211024468277292, + "learning_rate": 8.898684234974957e-06, + "loss": 2.6462, + "step": 53251 + }, + { + "epoch": 2.4792699676420606, + "grad_norm": 0.31698452177246766, + "learning_rate": 8.897141806605148e-06, + "loss": 2.6065, + "step": 53252 + }, + { + "epoch": 2.4793165258281538, + "grad_norm": 0.2939615540891626, + "learning_rate": 8.895599498868158e-06, + "loss": 2.5477, + "step": 53253 + }, + { + "epoch": 2.479363084014247, + "grad_norm": 0.3026786698563771, + "learning_rate": 8.894057311768534e-06, + "loss": 2.6253, + "step": 53254 + }, + { + "epoch": 2.47940964220034, + "grad_norm": 0.32710021518767807, + "learning_rate": 8.892515245310807e-06, + "loss": 2.622, + "step": 53255 + }, + { + "epoch": 2.479456200386433, + "grad_norm": 0.3276529134568733, + "learning_rate": 8.890973299499494e-06, + "loss": 2.7546, + "step": 53256 + }, + { + "epoch": 2.4795027585725258, + "grad_norm": 0.3191347238323107, + "learning_rate": 8.889431474339138e-06, + "loss": 2.6218, + "step": 53257 + }, + { + "epoch": 2.479549316758619, + "grad_norm": 0.2944442662834427, + "learning_rate": 8.887889769834224e-06, + "loss": 2.603, + "step": 53258 + }, + { + "epoch": 2.479595874944712, + "grad_norm": 0.3146257597668887, + "learning_rate": 8.886348185989334e-06, + "loss": 2.6417, + "step": 53259 + }, + { + "epoch": 2.479642433130805, + "grad_norm": 0.35305590315913193, + "learning_rate": 8.88480672280893e-06, + "loss": 2.7053, + "step": 53260 + }, + { + "epoch": 2.4796889913168982, + "grad_norm": 0.29768749629963775, + "learning_rate": 8.883265380297594e-06, + "loss": 2.6273, + "step": 53261 + }, + { + "epoch": 2.4797355495029914, + "grad_norm": 0.32894025986896896, + "learning_rate": 8.881724158459814e-06, + "loss": 2.5173, + "step": 53262 + }, + { + "epoch": 2.4797821076890845, + "grad_norm": 0.3357382014578553, + "learning_rate": 8.880183057300124e-06, + "loss": 2.6197, + "step": 53263 + }, + { + "epoch": 2.4798286658751776, + "grad_norm": 0.31746376850918484, + "learning_rate": 8.87864207682304e-06, + "loss": 2.5891, + "step": 53264 + }, + { + "epoch": 2.4798752240612707, + "grad_norm": 0.30033231343662614, + "learning_rate": 8.877101217033095e-06, + "loss": 2.596, + "step": 53265 + }, + { + "epoch": 2.479921782247364, + "grad_norm": 0.3165220190647548, + "learning_rate": 8.87556047793482e-06, + "loss": 2.7217, + "step": 53266 + }, + { + "epoch": 2.479968340433457, + "grad_norm": 0.3109696038468299, + "learning_rate": 8.874019859532706e-06, + "loss": 2.5862, + "step": 53267 + }, + { + "epoch": 2.4800148986195496, + "grad_norm": 0.3106297998022056, + "learning_rate": 8.872479361831294e-06, + "loss": 2.5113, + "step": 53268 + }, + { + "epoch": 2.4800614568056427, + "grad_norm": 0.30927170548402555, + "learning_rate": 8.870938984835103e-06, + "loss": 2.5443, + "step": 53269 + }, + { + "epoch": 2.480108014991736, + "grad_norm": 0.3387030182995157, + "learning_rate": 8.869398728548667e-06, + "loss": 2.6797, + "step": 53270 + }, + { + "epoch": 2.480154573177829, + "grad_norm": 0.3130492991572993, + "learning_rate": 8.867858592976469e-06, + "loss": 2.6543, + "step": 53271 + }, + { + "epoch": 2.480201131363922, + "grad_norm": 0.31871127495047413, + 
"learning_rate": 8.866318578123084e-06, + "loss": 2.5763, + "step": 53272 + }, + { + "epoch": 2.480247689550015, + "grad_norm": 0.29555007306118836, + "learning_rate": 8.86477868399297e-06, + "loss": 2.6486, + "step": 53273 + }, + { + "epoch": 2.4802942477361083, + "grad_norm": 0.3177918186622802, + "learning_rate": 8.863238910590704e-06, + "loss": 2.664, + "step": 53274 + }, + { + "epoch": 2.4803408059222014, + "grad_norm": 0.33952038810488827, + "learning_rate": 8.86169925792077e-06, + "loss": 2.6339, + "step": 53275 + }, + { + "epoch": 2.4803873641082945, + "grad_norm": 0.29884650993107037, + "learning_rate": 8.860159725987694e-06, + "loss": 2.5732, + "step": 53276 + }, + { + "epoch": 2.480433922294387, + "grad_norm": 0.3232535122958119, + "learning_rate": 8.858620314795995e-06, + "loss": 2.5809, + "step": 53277 + }, + { + "epoch": 2.4804804804804803, + "grad_norm": 0.3073796298075618, + "learning_rate": 8.857081024350195e-06, + "loss": 2.5492, + "step": 53278 + }, + { + "epoch": 2.4805270386665734, + "grad_norm": 0.3383987637856944, + "learning_rate": 8.855541854654815e-06, + "loss": 2.7022, + "step": 53279 + }, + { + "epoch": 2.4805735968526665, + "grad_norm": 0.29194946288481205, + "learning_rate": 8.854002805714362e-06, + "loss": 2.6131, + "step": 53280 + }, + { + "epoch": 2.4806201550387597, + "grad_norm": 0.2929877661511126, + "learning_rate": 8.852463877533351e-06, + "loss": 2.5356, + "step": 53281 + }, + { + "epoch": 2.4806667132248528, + "grad_norm": 0.3231895237304332, + "learning_rate": 8.850925070116305e-06, + "loss": 2.6647, + "step": 53282 + }, + { + "epoch": 2.480713271410946, + "grad_norm": 0.3197372183680651, + "learning_rate": 8.849386383467745e-06, + "loss": 2.6412, + "step": 53283 + }, + { + "epoch": 2.480759829597039, + "grad_norm": 0.3065312599060772, + "learning_rate": 8.847847817592181e-06, + "loss": 2.7148, + "step": 53284 + }, + { + "epoch": 2.480806387783132, + "grad_norm": 0.303790205032267, + "learning_rate": 8.846309372494138e-06, + "loss": 2.538, + "step": 53285 + }, + { + "epoch": 2.4808529459692252, + "grad_norm": 0.3015241690845708, + "learning_rate": 8.844771048178102e-06, + "loss": 2.5962, + "step": 53286 + }, + { + "epoch": 2.4808995041553183, + "grad_norm": 0.3299866388237442, + "learning_rate": 8.84323284464863e-06, + "loss": 2.6677, + "step": 53287 + }, + { + "epoch": 2.480946062341411, + "grad_norm": 0.33602788005437634, + "learning_rate": 8.841694761910202e-06, + "loss": 2.6446, + "step": 53288 + }, + { + "epoch": 2.480992620527504, + "grad_norm": 0.3198080137730576, + "learning_rate": 8.840156799967347e-06, + "loss": 2.5852, + "step": 53289 + }, + { + "epoch": 2.4810391787135972, + "grad_norm": 0.3121775424166456, + "learning_rate": 8.838618958824579e-06, + "loss": 2.5913, + "step": 53290 + }, + { + "epoch": 2.4810857368996904, + "grad_norm": 0.3239684275122909, + "learning_rate": 8.837081238486406e-06, + "loss": 2.6425, + "step": 53291 + }, + { + "epoch": 2.4811322950857835, + "grad_norm": 0.32904530865781334, + "learning_rate": 8.83554363895736e-06, + "loss": 2.5833, + "step": 53292 + }, + { + "epoch": 2.4811788532718766, + "grad_norm": 0.3367592789299918, + "learning_rate": 8.83400616024192e-06, + "loss": 2.7387, + "step": 53293 + }, + { + "epoch": 2.4812254114579697, + "grad_norm": 0.3353132159857694, + "learning_rate": 8.832468802344624e-06, + "loss": 2.6182, + "step": 53294 + }, + { + "epoch": 2.481271969644063, + "grad_norm": 0.34586533300741124, + "learning_rate": 8.830931565269974e-06, + "loss": 2.6343, + "step": 53295 + }, + { + 
"epoch": 2.4813185278301555, + "grad_norm": 0.2873489330765874, + "learning_rate": 8.829394449022476e-06, + "loss": 2.6369, + "step": 53296 + }, + { + "epoch": 2.4813650860162486, + "grad_norm": 0.30287709054424816, + "learning_rate": 8.827857453606658e-06, + "loss": 2.611, + "step": 53297 + }, + { + "epoch": 2.4814116442023417, + "grad_norm": 0.3271746041197807, + "learning_rate": 8.82632057902703e-06, + "loss": 2.5975, + "step": 53298 + }, + { + "epoch": 2.481458202388435, + "grad_norm": 0.32030349108649325, + "learning_rate": 8.824783825288074e-06, + "loss": 2.6443, + "step": 53299 + }, + { + "epoch": 2.481504760574528, + "grad_norm": 0.32589545525909114, + "learning_rate": 8.823247192394346e-06, + "loss": 2.5075, + "step": 53300 + }, + { + "epoch": 2.481551318760621, + "grad_norm": 0.306554838952528, + "learning_rate": 8.821710680350304e-06, + "loss": 2.5977, + "step": 53301 + }, + { + "epoch": 2.481597876946714, + "grad_norm": 0.3036525445693301, + "learning_rate": 8.82017428916051e-06, + "loss": 2.5966, + "step": 53302 + }, + { + "epoch": 2.4816444351328073, + "grad_norm": 0.32463458973201237, + "learning_rate": 8.818638018829434e-06, + "loss": 2.6979, + "step": 53303 + }, + { + "epoch": 2.4816909933189004, + "grad_norm": 0.30346199480502817, + "learning_rate": 8.8171018693616e-06, + "loss": 2.6223, + "step": 53304 + }, + { + "epoch": 2.4817375515049935, + "grad_norm": 0.3007255265487674, + "learning_rate": 8.815565840761514e-06, + "loss": 2.6393, + "step": 53305 + }, + { + "epoch": 2.4817841096910866, + "grad_norm": 0.33573096369728406, + "learning_rate": 8.814029933033697e-06, + "loss": 2.6537, + "step": 53306 + }, + { + "epoch": 2.4818306678771793, + "grad_norm": 0.34208786711262756, + "learning_rate": 8.812494146182631e-06, + "loss": 2.7205, + "step": 53307 + }, + { + "epoch": 2.4818772260632724, + "grad_norm": 0.29927027629321556, + "learning_rate": 8.810958480212838e-06, + "loss": 2.626, + "step": 53308 + }, + { + "epoch": 2.4819237842493655, + "grad_norm": 0.3222051041872024, + "learning_rate": 8.809422935128825e-06, + "loss": 2.6475, + "step": 53309 + }, + { + "epoch": 2.4819703424354587, + "grad_norm": 0.3342291215066766, + "learning_rate": 8.807887510935097e-06, + "loss": 2.6529, + "step": 53310 + }, + { + "epoch": 2.482016900621552, + "grad_norm": 0.31911082488616577, + "learning_rate": 8.806352207636176e-06, + "loss": 2.6601, + "step": 53311 + }, + { + "epoch": 2.482063458807645, + "grad_norm": 0.29674011523796323, + "learning_rate": 8.804817025236527e-06, + "loss": 2.6188, + "step": 53312 + }, + { + "epoch": 2.482110016993738, + "grad_norm": 0.30412217439794803, + "learning_rate": 8.803281963740707e-06, + "loss": 2.6566, + "step": 53313 + }, + { + "epoch": 2.482156575179831, + "grad_norm": 0.33251467866415774, + "learning_rate": 8.801747023153167e-06, + "loss": 2.6288, + "step": 53314 + }, + { + "epoch": 2.4822031333659242, + "grad_norm": 0.3091894964310065, + "learning_rate": 8.80021220347847e-06, + "loss": 2.569, + "step": 53315 + }, + { + "epoch": 2.482249691552017, + "grad_norm": 0.3233988635852699, + "learning_rate": 8.798677504721076e-06, + "loss": 2.6256, + "step": 53316 + }, + { + "epoch": 2.48229624973811, + "grad_norm": 0.33260301345567744, + "learning_rate": 8.797142926885504e-06, + "loss": 2.7152, + "step": 53317 + }, + { + "epoch": 2.482342807924203, + "grad_norm": 0.29608742252264014, + "learning_rate": 8.795608469976262e-06, + "loss": 2.5594, + "step": 53318 + }, + { + "epoch": 2.4823893661102963, + "grad_norm": 0.31346791379521394, + "learning_rate": 
8.794074133997847e-06, + "loss": 2.6629, + "step": 53319 + }, + { + "epoch": 2.4824359242963894, + "grad_norm": 0.3198273701177867, + "learning_rate": 8.792539918954778e-06, + "loss": 2.6999, + "step": 53320 + }, + { + "epoch": 2.4824824824824825, + "grad_norm": 0.3225948119625559, + "learning_rate": 8.791005824851534e-06, + "loss": 2.5789, + "step": 53321 + }, + { + "epoch": 2.4825290406685756, + "grad_norm": 0.31794231446104504, + "learning_rate": 8.789471851692626e-06, + "loss": 2.6456, + "step": 53322 + }, + { + "epoch": 2.4825755988546687, + "grad_norm": 0.3093352847058512, + "learning_rate": 8.78793799948256e-06, + "loss": 2.6477, + "step": 53323 + }, + { + "epoch": 2.482622157040762, + "grad_norm": 0.30959629703278857, + "learning_rate": 8.786404268225844e-06, + "loss": 2.6597, + "step": 53324 + }, + { + "epoch": 2.482668715226855, + "grad_norm": 0.3269837430617669, + "learning_rate": 8.784870657926952e-06, + "loss": 2.7013, + "step": 53325 + }, + { + "epoch": 2.482715273412948, + "grad_norm": 0.32864489414319314, + "learning_rate": 8.783337168590428e-06, + "loss": 2.6228, + "step": 53326 + }, + { + "epoch": 2.4827618315990407, + "grad_norm": 0.3066040110432438, + "learning_rate": 8.781803800220717e-06, + "loss": 2.6738, + "step": 53327 + }, + { + "epoch": 2.482808389785134, + "grad_norm": 0.2980665991527846, + "learning_rate": 8.780270552822385e-06, + "loss": 2.5439, + "step": 53328 + }, + { + "epoch": 2.482854947971227, + "grad_norm": 0.29916384203354973, + "learning_rate": 8.77873742639988e-06, + "loss": 2.6156, + "step": 53329 + }, + { + "epoch": 2.48290150615732, + "grad_norm": 0.31424430119561164, + "learning_rate": 8.777204420957725e-06, + "loss": 2.6983, + "step": 53330 + }, + { + "epoch": 2.482948064343413, + "grad_norm": 0.3082833611269835, + "learning_rate": 8.775671536500407e-06, + "loss": 2.6092, + "step": 53331 + }, + { + "epoch": 2.4829946225295063, + "grad_norm": 0.3331386496223336, + "learning_rate": 8.774138773032436e-06, + "loss": 2.7278, + "step": 53332 + }, + { + "epoch": 2.4830411807155994, + "grad_norm": 0.31584628129153836, + "learning_rate": 8.772606130558319e-06, + "loss": 2.6275, + "step": 53333 + }, + { + "epoch": 2.4830877389016925, + "grad_norm": 0.30474953800104987, + "learning_rate": 8.771073609082532e-06, + "loss": 2.6284, + "step": 53334 + }, + { + "epoch": 2.483134297087785, + "grad_norm": 0.3153169145260629, + "learning_rate": 8.769541208609577e-06, + "loss": 2.5885, + "step": 53335 + }, + { + "epoch": 2.4831808552738783, + "grad_norm": 0.31619372058615447, + "learning_rate": 8.768008929143955e-06, + "loss": 2.5774, + "step": 53336 + }, + { + "epoch": 2.4832274134599714, + "grad_norm": 0.3244316554119287, + "learning_rate": 8.766476770690174e-06, + "loss": 2.5938, + "step": 53337 + }, + { + "epoch": 2.4832739716460646, + "grad_norm": 0.292854989977006, + "learning_rate": 8.76494473325271e-06, + "loss": 2.5826, + "step": 53338 + }, + { + "epoch": 2.4833205298321577, + "grad_norm": 0.30343013909959654, + "learning_rate": 8.763412816836092e-06, + "loss": 2.6028, + "step": 53339 + }, + { + "epoch": 2.483367088018251, + "grad_norm": 0.33260184349642924, + "learning_rate": 8.761881021444768e-06, + "loss": 2.6883, + "step": 53340 + }, + { + "epoch": 2.483413646204344, + "grad_norm": 0.3242458289514836, + "learning_rate": 8.760349347083286e-06, + "loss": 2.7044, + "step": 53341 + }, + { + "epoch": 2.483460204390437, + "grad_norm": 0.30793093640783703, + "learning_rate": 8.758817793756103e-06, + "loss": 2.5828, + "step": 53342 + }, + { + "epoch": 
2.48350676257653, + "grad_norm": 0.32552044620873666, + "learning_rate": 8.75728636146772e-06, + "loss": 2.6996, + "step": 53343 + }, + { + "epoch": 2.4835533207626233, + "grad_norm": 0.3228998697816985, + "learning_rate": 8.755755050222647e-06, + "loss": 2.5913, + "step": 53344 + }, + { + "epoch": 2.4835998789487164, + "grad_norm": 0.3075100328509128, + "learning_rate": 8.754223860025368e-06, + "loss": 2.7008, + "step": 53345 + }, + { + "epoch": 2.483646437134809, + "grad_norm": 0.3190966502176064, + "learning_rate": 8.75269279088039e-06, + "loss": 2.6959, + "step": 53346 + }, + { + "epoch": 2.483692995320902, + "grad_norm": 0.34369375248441925, + "learning_rate": 8.751161842792182e-06, + "loss": 2.6736, + "step": 53347 + }, + { + "epoch": 2.4837395535069953, + "grad_norm": 0.330607955215026, + "learning_rate": 8.749631015765246e-06, + "loss": 2.7376, + "step": 53348 + }, + { + "epoch": 2.4837861116930884, + "grad_norm": 0.3323309049987989, + "learning_rate": 8.748100309804086e-06, + "loss": 2.6208, + "step": 53349 + }, + { + "epoch": 2.4838326698791815, + "grad_norm": 0.31581694043325986, + "learning_rate": 8.746569724913189e-06, + "loss": 2.5926, + "step": 53350 + }, + { + "epoch": 2.4838792280652746, + "grad_norm": 0.3170314492374689, + "learning_rate": 8.745039261097044e-06, + "loss": 2.5818, + "step": 53351 + }, + { + "epoch": 2.4839257862513677, + "grad_norm": 0.31216533355693304, + "learning_rate": 8.743508918360156e-06, + "loss": 2.5515, + "step": 53352 + }, + { + "epoch": 2.483972344437461, + "grad_norm": 0.3059225651627724, + "learning_rate": 8.741978696706976e-06, + "loss": 2.6918, + "step": 53353 + }, + { + "epoch": 2.484018902623554, + "grad_norm": 0.3250708450896611, + "learning_rate": 8.740448596142054e-06, + "loss": 2.6028, + "step": 53354 + }, + { + "epoch": 2.4840654608096466, + "grad_norm": 0.31352169335552454, + "learning_rate": 8.738918616669833e-06, + "loss": 2.5694, + "step": 53355 + }, + { + "epoch": 2.4841120189957397, + "grad_norm": 0.3187577379157974, + "learning_rate": 8.737388758294824e-06, + "loss": 2.6333, + "step": 53356 + }, + { + "epoch": 2.484158577181833, + "grad_norm": 0.295304010625022, + "learning_rate": 8.735859021021515e-06, + "loss": 2.6488, + "step": 53357 + }, + { + "epoch": 2.484205135367926, + "grad_norm": 0.298618371850506, + "learning_rate": 8.734329404854391e-06, + "loss": 2.6639, + "step": 53358 + }, + { + "epoch": 2.484251693554019, + "grad_norm": 0.3157489016403772, + "learning_rate": 8.732799909797961e-06, + "loss": 2.5728, + "step": 53359 + }, + { + "epoch": 2.484298251740112, + "grad_norm": 0.31677563173519874, + "learning_rate": 8.731270535856683e-06, + "loss": 2.547, + "step": 53360 + }, + { + "epoch": 2.4843448099262053, + "grad_norm": 0.30442712617755097, + "learning_rate": 8.729741283035064e-06, + "loss": 2.4875, + "step": 53361 + }, + { + "epoch": 2.4843913681122984, + "grad_norm": 0.2899243781233386, + "learning_rate": 8.72821215133759e-06, + "loss": 2.5973, + "step": 53362 + }, + { + "epoch": 2.4844379262983916, + "grad_norm": 0.31274595140463496, + "learning_rate": 8.726683140768743e-06, + "loss": 2.6735, + "step": 53363 + }, + { + "epoch": 2.4844844844844847, + "grad_norm": 0.2949519105509373, + "learning_rate": 8.725154251333012e-06, + "loss": 2.5576, + "step": 53364 + }, + { + "epoch": 2.484531042670578, + "grad_norm": 0.2944399248775764, + "learning_rate": 8.723625483034903e-06, + "loss": 2.5579, + "step": 53365 + }, + { + "epoch": 2.4845776008566705, + "grad_norm": 0.3081691374702429, + "learning_rate": 
8.722096835878863e-06, + "loss": 2.6174, + "step": 53366 + }, + { + "epoch": 2.4846241590427636, + "grad_norm": 0.3066997309533936, + "learning_rate": 8.720568309869426e-06, + "loss": 2.6447, + "step": 53367 + }, + { + "epoch": 2.4846707172288567, + "grad_norm": 0.30623162376908447, + "learning_rate": 8.719039905011045e-06, + "loss": 2.5735, + "step": 53368 + }, + { + "epoch": 2.48471727541495, + "grad_norm": 0.30128093623047003, + "learning_rate": 8.71751162130821e-06, + "loss": 2.7109, + "step": 53369 + }, + { + "epoch": 2.484763833601043, + "grad_norm": 0.3152969044289084, + "learning_rate": 8.715983458765414e-06, + "loss": 2.6411, + "step": 53370 + }, + { + "epoch": 2.484810391787136, + "grad_norm": 0.3125302195411451, + "learning_rate": 8.714455417387134e-06, + "loss": 2.5907, + "step": 53371 + }, + { + "epoch": 2.484856949973229, + "grad_norm": 0.3196504960507437, + "learning_rate": 8.71292749717788e-06, + "loss": 2.5859, + "step": 53372 + }, + { + "epoch": 2.4849035081593223, + "grad_norm": 0.32590701623430024, + "learning_rate": 8.711399698142087e-06, + "loss": 2.5818, + "step": 53373 + }, + { + "epoch": 2.4849500663454154, + "grad_norm": 0.32143154218354025, + "learning_rate": 8.709872020284299e-06, + "loss": 2.6096, + "step": 53374 + }, + { + "epoch": 2.484996624531508, + "grad_norm": 0.31669386233007224, + "learning_rate": 8.708344463608953e-06, + "loss": 2.6898, + "step": 53375 + }, + { + "epoch": 2.485043182717601, + "grad_norm": 0.3213770609920201, + "learning_rate": 8.706817028120544e-06, + "loss": 2.6936, + "step": 53376 + }, + { + "epoch": 2.4850897409036943, + "grad_norm": 0.31088456936538506, + "learning_rate": 8.705289713823561e-06, + "loss": 2.7443, + "step": 53377 + }, + { + "epoch": 2.4851362990897874, + "grad_norm": 0.2922310210719234, + "learning_rate": 8.703762520722502e-06, + "loss": 2.7114, + "step": 53378 + }, + { + "epoch": 2.4851828572758805, + "grad_norm": 0.3110606721601445, + "learning_rate": 8.7022354488218e-06, + "loss": 2.603, + "step": 53379 + }, + { + "epoch": 2.4852294154619736, + "grad_norm": 0.29925272436353084, + "learning_rate": 8.700708498125998e-06, + "loss": 2.6263, + "step": 53380 + }, + { + "epoch": 2.4852759736480667, + "grad_norm": 0.29421819040937225, + "learning_rate": 8.699181668639534e-06, + "loss": 2.5511, + "step": 53381 + }, + { + "epoch": 2.48532253183416, + "grad_norm": 0.30238634264319214, + "learning_rate": 8.697654960366901e-06, + "loss": 2.7241, + "step": 53382 + }, + { + "epoch": 2.485369090020253, + "grad_norm": 0.3050499198516324, + "learning_rate": 8.696128373312585e-06, + "loss": 2.5637, + "step": 53383 + }, + { + "epoch": 2.485415648206346, + "grad_norm": 0.3060101353333068, + "learning_rate": 8.69460190748106e-06, + "loss": 2.7119, + "step": 53384 + }, + { + "epoch": 2.485462206392439, + "grad_norm": 0.3062259460267987, + "learning_rate": 8.693075562876824e-06, + "loss": 2.64, + "step": 53385 + }, + { + "epoch": 2.485508764578532, + "grad_norm": 0.3178786729086915, + "learning_rate": 8.691549339504318e-06, + "loss": 2.6617, + "step": 53386 + }, + { + "epoch": 2.485555322764625, + "grad_norm": 0.2957234943882225, + "learning_rate": 8.690023237368067e-06, + "loss": 2.6615, + "step": 53387 + }, + { + "epoch": 2.485601880950718, + "grad_norm": 0.3182204825833208, + "learning_rate": 8.688497256472517e-06, + "loss": 2.6788, + "step": 53388 + }, + { + "epoch": 2.485648439136811, + "grad_norm": 0.3245067004057338, + "learning_rate": 8.686971396822158e-06, + "loss": 2.6, + "step": 53389 + }, + { + "epoch": 2.4856949973229043, + 
"grad_norm": 0.32970911105282114, + "learning_rate": 8.685445658421464e-06, + "loss": 2.7588, + "step": 53390 + }, + { + "epoch": 2.4857415555089974, + "grad_norm": 0.3091566186979522, + "learning_rate": 8.683920041274922e-06, + "loss": 2.6637, + "step": 53391 + }, + { + "epoch": 2.4857881136950906, + "grad_norm": 0.3074154145095344, + "learning_rate": 8.682394545387002e-06, + "loss": 2.5646, + "step": 53392 + }, + { + "epoch": 2.4858346718811837, + "grad_norm": 0.2956086957586616, + "learning_rate": 8.680869170762197e-06, + "loss": 2.5911, + "step": 53393 + }, + { + "epoch": 2.4858812300672763, + "grad_norm": 0.3125684439348742, + "learning_rate": 8.679343917404959e-06, + "loss": 2.6482, + "step": 53394 + }, + { + "epoch": 2.4859277882533695, + "grad_norm": 0.3288230341095448, + "learning_rate": 8.677818785319774e-06, + "loss": 2.7204, + "step": 53395 + }, + { + "epoch": 2.4859743464394626, + "grad_norm": 0.33431477932935777, + "learning_rate": 8.676293774511119e-06, + "loss": 2.6566, + "step": 53396 + }, + { + "epoch": 2.4860209046255557, + "grad_norm": 0.2991697715841832, + "learning_rate": 8.674768884983475e-06, + "loss": 2.4315, + "step": 53397 + }, + { + "epoch": 2.486067462811649, + "grad_norm": 0.3057796336612077, + "learning_rate": 8.673244116741319e-06, + "loss": 2.6714, + "step": 53398 + }, + { + "epoch": 2.486114020997742, + "grad_norm": 0.30981404125520284, + "learning_rate": 8.671719469789103e-06, + "loss": 2.637, + "step": 53399 + }, + { + "epoch": 2.486160579183835, + "grad_norm": 0.2973879495034288, + "learning_rate": 8.670194944131338e-06, + "loss": 2.5934, + "step": 53400 + }, + { + "epoch": 2.486207137369928, + "grad_norm": 0.31962428005670196, + "learning_rate": 8.668670539772466e-06, + "loss": 2.6342, + "step": 53401 + }, + { + "epoch": 2.4862536955560213, + "grad_norm": 0.32687877937567417, + "learning_rate": 8.66714625671698e-06, + "loss": 2.6758, + "step": 53402 + }, + { + "epoch": 2.4863002537421144, + "grad_norm": 0.2989896849812194, + "learning_rate": 8.665622094969345e-06, + "loss": 2.6954, + "step": 53403 + }, + { + "epoch": 2.4863468119282075, + "grad_norm": 0.33760607243644214, + "learning_rate": 8.664098054534036e-06, + "loss": 2.5576, + "step": 53404 + }, + { + "epoch": 2.4863933701143, + "grad_norm": 0.3133292376698701, + "learning_rate": 8.662574135415524e-06, + "loss": 2.6212, + "step": 53405 + }, + { + "epoch": 2.4864399283003933, + "grad_norm": 0.30016721723912787, + "learning_rate": 8.661050337618304e-06, + "loss": 2.6301, + "step": 53406 + }, + { + "epoch": 2.4864864864864864, + "grad_norm": 0.29600506556579514, + "learning_rate": 8.65952666114681e-06, + "loss": 2.6586, + "step": 53407 + }, + { + "epoch": 2.4865330446725795, + "grad_norm": 0.31170382166915406, + "learning_rate": 8.658003106005535e-06, + "loss": 2.5847, + "step": 53408 + }, + { + "epoch": 2.4865796028586726, + "grad_norm": 0.324111998970302, + "learning_rate": 8.656479672198947e-06, + "loss": 2.5885, + "step": 53409 + }, + { + "epoch": 2.4866261610447657, + "grad_norm": 0.3202936744896437, + "learning_rate": 8.65495635973152e-06, + "loss": 2.6766, + "step": 53410 + }, + { + "epoch": 2.486672719230859, + "grad_norm": 0.3352888138753493, + "learning_rate": 8.653433168607733e-06, + "loss": 2.6001, + "step": 53411 + }, + { + "epoch": 2.486719277416952, + "grad_norm": 0.31174300643234104, + "learning_rate": 8.651910098832028e-06, + "loss": 2.6395, + "step": 53412 + }, + { + "epoch": 2.486765835603045, + "grad_norm": 0.3060651986094641, + "learning_rate": 8.650387150408911e-06, + "loss": 
2.5952, + "step": 53413 + }, + { + "epoch": 2.4868123937891378, + "grad_norm": 0.3260835746265517, + "learning_rate": 8.648864323342826e-06, + "loss": 2.6202, + "step": 53414 + }, + { + "epoch": 2.486858951975231, + "grad_norm": 0.3301723485462877, + "learning_rate": 8.647341617638244e-06, + "loss": 2.6401, + "step": 53415 + }, + { + "epoch": 2.486905510161324, + "grad_norm": 0.31776391501262874, + "learning_rate": 8.645819033299647e-06, + "loss": 2.5863, + "step": 53416 + }, + { + "epoch": 2.486952068347417, + "grad_norm": 0.30530443244500355, + "learning_rate": 8.644296570331495e-06, + "loss": 2.5552, + "step": 53417 + }, + { + "epoch": 2.4869986265335102, + "grad_norm": 0.3138019081076938, + "learning_rate": 8.64277422873826e-06, + "loss": 2.5407, + "step": 53418 + }, + { + "epoch": 2.4870451847196033, + "grad_norm": 0.33936753066344943, + "learning_rate": 8.64125200852442e-06, + "loss": 2.7357, + "step": 53419 + }, + { + "epoch": 2.4870917429056965, + "grad_norm": 0.3263243417545661, + "learning_rate": 8.639729909694416e-06, + "loss": 2.5935, + "step": 53420 + }, + { + "epoch": 2.4871383010917896, + "grad_norm": 0.311864081930952, + "learning_rate": 8.63820793225273e-06, + "loss": 2.7528, + "step": 53421 + }, + { + "epoch": 2.4871848592778827, + "grad_norm": 0.3208753086309797, + "learning_rate": 8.636686076203832e-06, + "loss": 2.6186, + "step": 53422 + }, + { + "epoch": 2.487231417463976, + "grad_norm": 0.33086556196854, + "learning_rate": 8.635164341552182e-06, + "loss": 2.5845, + "step": 53423 + }, + { + "epoch": 2.487277975650069, + "grad_norm": 0.32321919857496806, + "learning_rate": 8.633642728302265e-06, + "loss": 2.6367, + "step": 53424 + }, + { + "epoch": 2.4873245338361616, + "grad_norm": 0.30190379140672136, + "learning_rate": 8.632121236458501e-06, + "loss": 2.5977, + "step": 53425 + }, + { + "epoch": 2.4873710920222547, + "grad_norm": 0.3234919463598553, + "learning_rate": 8.630599866025419e-06, + "loss": 2.5624, + "step": 53426 + }, + { + "epoch": 2.487417650208348, + "grad_norm": 0.3322380568071242, + "learning_rate": 8.629078617007418e-06, + "loss": 2.5411, + "step": 53427 + }, + { + "epoch": 2.487464208394441, + "grad_norm": 0.3204606088789852, + "learning_rate": 8.627557489409028e-06, + "loss": 2.5935, + "step": 53428 + }, + { + "epoch": 2.487510766580534, + "grad_norm": 0.3195632957241153, + "learning_rate": 8.626036483234662e-06, + "loss": 2.6819, + "step": 53429 + }, + { + "epoch": 2.487557324766627, + "grad_norm": 0.32822594482715695, + "learning_rate": 8.624515598488808e-06, + "loss": 2.4742, + "step": 53430 + }, + { + "epoch": 2.4876038829527203, + "grad_norm": 0.3249533630773996, + "learning_rate": 8.622994835175923e-06, + "loss": 2.6605, + "step": 53431 + }, + { + "epoch": 2.4876504411388134, + "grad_norm": 0.30488448882270514, + "learning_rate": 8.621474193300483e-06, + "loss": 2.6026, + "step": 53432 + }, + { + "epoch": 2.487696999324906, + "grad_norm": 0.3119089560338981, + "learning_rate": 8.619953672866926e-06, + "loss": 2.6863, + "step": 53433 + }, + { + "epoch": 2.487743557510999, + "grad_norm": 0.30571498260085234, + "learning_rate": 8.618433273879729e-06, + "loss": 2.5922, + "step": 53434 + }, + { + "epoch": 2.4877901156970923, + "grad_norm": 0.3159058093992656, + "learning_rate": 8.616912996343356e-06, + "loss": 2.6137, + "step": 53435 + }, + { + "epoch": 2.4878366738831854, + "grad_norm": 0.3032418002944663, + "learning_rate": 8.615392840262266e-06, + "loss": 2.6173, + "step": 53436 + }, + { + "epoch": 2.4878832320692785, + "grad_norm": 
0.3257793048044163, + "learning_rate": 8.613872805640933e-06, + "loss": 2.6511, + "step": 53437 + }, + { + "epoch": 2.4879297902553716, + "grad_norm": 0.29057515272470424, + "learning_rate": 8.612352892483777e-06, + "loss": 2.533, + "step": 53438 + }, + { + "epoch": 2.4879763484414648, + "grad_norm": 0.3172704128674229, + "learning_rate": 8.610833100795317e-06, + "loss": 2.5568, + "step": 53439 + }, + { + "epoch": 2.488022906627558, + "grad_norm": 0.3118211999231663, + "learning_rate": 8.609313430579962e-06, + "loss": 2.7144, + "step": 53440 + }, + { + "epoch": 2.488069464813651, + "grad_norm": 0.3021348943407572, + "learning_rate": 8.607793881842213e-06, + "loss": 2.5435, + "step": 53441 + }, + { + "epoch": 2.488116022999744, + "grad_norm": 0.29958327562687204, + "learning_rate": 8.606274454586504e-06, + "loss": 2.5392, + "step": 53442 + }, + { + "epoch": 2.488162581185837, + "grad_norm": 0.32094967685639675, + "learning_rate": 8.6047551488173e-06, + "loss": 2.6133, + "step": 53443 + }, + { + "epoch": 2.48820913937193, + "grad_norm": 0.3029112616265958, + "learning_rate": 8.603235964539058e-06, + "loss": 2.6296, + "step": 53444 + }, + { + "epoch": 2.488255697558023, + "grad_norm": 0.31023664545533985, + "learning_rate": 8.601716901756246e-06, + "loss": 2.6253, + "step": 53445 + }, + { + "epoch": 2.488302255744116, + "grad_norm": 0.3166758917784105, + "learning_rate": 8.600197960473327e-06, + "loss": 2.6609, + "step": 53446 + }, + { + "epoch": 2.4883488139302092, + "grad_norm": 0.2921345405762174, + "learning_rate": 8.598679140694733e-06, + "loss": 2.5643, + "step": 53447 + }, + { + "epoch": 2.4883953721163024, + "grad_norm": 0.3069024066187546, + "learning_rate": 8.59716044242494e-06, + "loss": 2.5572, + "step": 53448 + }, + { + "epoch": 2.4884419303023955, + "grad_norm": 0.3002065087127994, + "learning_rate": 8.595641865668402e-06, + "loss": 2.591, + "step": 53449 + }, + { + "epoch": 2.4884884884884886, + "grad_norm": 0.2967440492417936, + "learning_rate": 8.594123410429588e-06, + "loss": 2.5267, + "step": 53450 + }, + { + "epoch": 2.4885350466745817, + "grad_norm": 0.3402131849904161, + "learning_rate": 8.592605076712918e-06, + "loss": 2.7334, + "step": 53451 + }, + { + "epoch": 2.488581604860675, + "grad_norm": 0.2947768127126493, + "learning_rate": 8.591086864522897e-06, + "loss": 2.5507, + "step": 53452 + }, + { + "epoch": 2.4886281630467675, + "grad_norm": 0.3221553686837982, + "learning_rate": 8.589568773863931e-06, + "loss": 2.6562, + "step": 53453 + }, + { + "epoch": 2.4886747212328606, + "grad_norm": 0.2980874173484733, + "learning_rate": 8.588050804740528e-06, + "loss": 2.5637, + "step": 53454 + }, + { + "epoch": 2.4887212794189537, + "grad_norm": 0.32020767666460664, + "learning_rate": 8.5865329571571e-06, + "loss": 2.6629, + "step": 53455 + }, + { + "epoch": 2.488767837605047, + "grad_norm": 0.3251900384753907, + "learning_rate": 8.585015231118121e-06, + "loss": 2.5649, + "step": 53456 + }, + { + "epoch": 2.48881439579114, + "grad_norm": 0.2961560597234585, + "learning_rate": 8.583497626628046e-06, + "loss": 2.6623, + "step": 53457 + }, + { + "epoch": 2.488860953977233, + "grad_norm": 0.29805219386684845, + "learning_rate": 8.58198014369132e-06, + "loss": 2.6734, + "step": 53458 + }, + { + "epoch": 2.488907512163326, + "grad_norm": 0.31480698775518884, + "learning_rate": 8.580462782312415e-06, + "loss": 2.545, + "step": 53459 + }, + { + "epoch": 2.4889540703494193, + "grad_norm": 0.3028013175366788, + "learning_rate": 8.57894554249576e-06, + "loss": 2.706, + "step": 53460 + 
}, + { + "epoch": 2.4890006285355124, + "grad_norm": 0.3138855935236037, + "learning_rate": 8.577428424245815e-06, + "loss": 2.5056, + "step": 53461 + }, + { + "epoch": 2.4890471867216055, + "grad_norm": 0.3189261485690505, + "learning_rate": 8.57591142756704e-06, + "loss": 2.6087, + "step": 53462 + }, + { + "epoch": 2.4890937449076986, + "grad_norm": 0.3078249303488627, + "learning_rate": 8.574394552463888e-06, + "loss": 2.6389, + "step": 53463 + }, + { + "epoch": 2.4891403030937913, + "grad_norm": 0.31724561928562073, + "learning_rate": 8.572877798940798e-06, + "loss": 2.7052, + "step": 53464 + }, + { + "epoch": 2.4891868612798844, + "grad_norm": 0.32399271461722795, + "learning_rate": 8.571361167002251e-06, + "loss": 2.5448, + "step": 53465 + }, + { + "epoch": 2.4892334194659775, + "grad_norm": 0.3203468323248695, + "learning_rate": 8.569844656652647e-06, + "loss": 2.6817, + "step": 53466 + }, + { + "epoch": 2.4892799776520707, + "grad_norm": 0.3400239449725645, + "learning_rate": 8.568328267896498e-06, + "loss": 2.6739, + "step": 53467 + }, + { + "epoch": 2.4893265358381638, + "grad_norm": 0.3088301109232655, + "learning_rate": 8.566812000738206e-06, + "loss": 2.6694, + "step": 53468 + }, + { + "epoch": 2.489373094024257, + "grad_norm": 0.312640147557253, + "learning_rate": 8.565295855182243e-06, + "loss": 2.5604, + "step": 53469 + }, + { + "epoch": 2.48941965221035, + "grad_norm": 0.323995704909429, + "learning_rate": 8.563779831233054e-06, + "loss": 2.596, + "step": 53470 + }, + { + "epoch": 2.489466210396443, + "grad_norm": 0.3169576624787845, + "learning_rate": 8.562263928895087e-06, + "loss": 2.6606, + "step": 53471 + }, + { + "epoch": 2.489512768582536, + "grad_norm": 0.31505106826572477, + "learning_rate": 8.560748148172804e-06, + "loss": 2.6359, + "step": 53472 + }, + { + "epoch": 2.489559326768629, + "grad_norm": 0.31647891256891003, + "learning_rate": 8.559232489070634e-06, + "loss": 2.612, + "step": 53473 + }, + { + "epoch": 2.489605884954722, + "grad_norm": 0.32684821721013574, + "learning_rate": 8.557716951593032e-06, + "loss": 2.6555, + "step": 53474 + }, + { + "epoch": 2.489652443140815, + "grad_norm": 0.3407504219453307, + "learning_rate": 8.556201535744452e-06, + "loss": 2.6887, + "step": 53475 + }, + { + "epoch": 2.4896990013269082, + "grad_norm": 0.3120334787286719, + "learning_rate": 8.55468624152933e-06, + "loss": 2.6509, + "step": 53476 + }, + { + "epoch": 2.4897455595130014, + "grad_norm": 0.31492057649773014, + "learning_rate": 8.553171068952126e-06, + "loss": 2.6374, + "step": 53477 + }, + { + "epoch": 2.4897921176990945, + "grad_norm": 0.30344371781926, + "learning_rate": 8.551656018017295e-06, + "loss": 2.5314, + "step": 53478 + }, + { + "epoch": 2.4898386758851876, + "grad_norm": 0.3171616754154513, + "learning_rate": 8.550141088729246e-06, + "loss": 2.673, + "step": 53479 + }, + { + "epoch": 2.4898852340712807, + "grad_norm": 0.32629705793302377, + "learning_rate": 8.54862628109247e-06, + "loss": 2.5606, + "step": 53480 + }, + { + "epoch": 2.489931792257374, + "grad_norm": 0.341165783062538, + "learning_rate": 8.547111595111385e-06, + "loss": 2.6553, + "step": 53481 + }, + { + "epoch": 2.489978350443467, + "grad_norm": 0.3248929411091493, + "learning_rate": 8.545597030790437e-06, + "loss": 2.5429, + "step": 53482 + }, + { + "epoch": 2.4900249086295596, + "grad_norm": 0.31457493823594834, + "learning_rate": 8.544082588134084e-06, + "loss": 2.6024, + "step": 53483 + }, + { + "epoch": 2.4900714668156527, + "grad_norm": 0.31453130205178237, + "learning_rate": 
8.54256826714676e-06, + "loss": 2.4992, + "step": 53484 + }, + { + "epoch": 2.490118025001746, + "grad_norm": 0.3145585708133399, + "learning_rate": 8.541054067832927e-06, + "loss": 2.5533, + "step": 53485 + }, + { + "epoch": 2.490164583187839, + "grad_norm": 0.2984990132410541, + "learning_rate": 8.539539990197004e-06, + "loss": 2.5706, + "step": 53486 + }, + { + "epoch": 2.490211141373932, + "grad_norm": 0.33377992555786096, + "learning_rate": 8.538026034243446e-06, + "loss": 2.5026, + "step": 53487 + }, + { + "epoch": 2.490257699560025, + "grad_norm": 0.3231846086201637, + "learning_rate": 8.536512199976698e-06, + "loss": 2.5143, + "step": 53488 + }, + { + "epoch": 2.4903042577461183, + "grad_norm": 0.2970267530208598, + "learning_rate": 8.534998487401203e-06, + "loss": 2.6273, + "step": 53489 + }, + { + "epoch": 2.4903508159322114, + "grad_norm": 0.2909973386998463, + "learning_rate": 8.533484896521405e-06, + "loss": 2.6517, + "step": 53490 + }, + { + "epoch": 2.4903973741183045, + "grad_norm": 0.3116371392656794, + "learning_rate": 8.53197142734175e-06, + "loss": 2.5196, + "step": 53491 + }, + { + "epoch": 2.490443932304397, + "grad_norm": 0.3076447357476351, + "learning_rate": 8.530458079866654e-06, + "loss": 2.6272, + "step": 53492 + }, + { + "epoch": 2.4904904904904903, + "grad_norm": 0.3348795061361801, + "learning_rate": 8.528944854100596e-06, + "loss": 2.6501, + "step": 53493 + }, + { + "epoch": 2.4905370486765834, + "grad_norm": 0.3235321031932803, + "learning_rate": 8.527431750047992e-06, + "loss": 2.6543, + "step": 53494 + }, + { + "epoch": 2.4905836068626765, + "grad_norm": 0.30767836726815845, + "learning_rate": 8.525918767713292e-06, + "loss": 2.6274, + "step": 53495 + }, + { + "epoch": 2.4906301650487697, + "grad_norm": 0.36755480611332275, + "learning_rate": 8.52440590710093e-06, + "loss": 2.6992, + "step": 53496 + }, + { + "epoch": 2.4906767232348628, + "grad_norm": 0.3205612930989406, + "learning_rate": 8.522893168215351e-06, + "loss": 2.6124, + "step": 53497 + }, + { + "epoch": 2.490723281420956, + "grad_norm": 0.3133254398685073, + "learning_rate": 8.521380551061009e-06, + "loss": 2.5552, + "step": 53498 + }, + { + "epoch": 2.490769839607049, + "grad_norm": 0.3360657922560294, + "learning_rate": 8.519868055642304e-06, + "loss": 2.6262, + "step": 53499 + }, + { + "epoch": 2.490816397793142, + "grad_norm": 0.3043894502827726, + "learning_rate": 8.518355681963724e-06, + "loss": 2.6229, + "step": 53500 + }, + { + "epoch": 2.4908629559792352, + "grad_norm": 0.3026576310061795, + "learning_rate": 8.51684343002967e-06, + "loss": 2.4995, + "step": 53501 + }, + { + "epoch": 2.4909095141653284, + "grad_norm": 0.3170767481376455, + "learning_rate": 8.515331299844592e-06, + "loss": 2.6015, + "step": 53502 + }, + { + "epoch": 2.490956072351421, + "grad_norm": 0.29356439939036505, + "learning_rate": 8.513819291412934e-06, + "loss": 2.697, + "step": 53503 + }, + { + "epoch": 2.491002630537514, + "grad_norm": 0.32821593280495914, + "learning_rate": 8.51230740473914e-06, + "loss": 2.7216, + "step": 53504 + }, + { + "epoch": 2.4910491887236073, + "grad_norm": 0.3098648755562706, + "learning_rate": 8.510795639827612e-06, + "loss": 2.6656, + "step": 53505 + }, + { + "epoch": 2.4910957469097004, + "grad_norm": 0.300390135355818, + "learning_rate": 8.509283996682837e-06, + "loss": 2.5006, + "step": 53506 + }, + { + "epoch": 2.4911423050957935, + "grad_norm": 0.29709149895419196, + "learning_rate": 8.507772475309211e-06, + "loss": 2.5455, + "step": 53507 + }, + { + "epoch": 
2.4911888632818866, + "grad_norm": 0.29314944228795414, + "learning_rate": 8.506261075711186e-06, + "loss": 2.6479, + "step": 53508 + }, + { + "epoch": 2.4912354214679797, + "grad_norm": 0.3187490394125277, + "learning_rate": 8.504749797893197e-06, + "loss": 2.647, + "step": 53509 + }, + { + "epoch": 2.491281979654073, + "grad_norm": 0.3136320121346049, + "learning_rate": 8.503238641859684e-06, + "loss": 2.6083, + "step": 53510 + }, + { + "epoch": 2.4913285378401655, + "grad_norm": 0.3130212219653187, + "learning_rate": 8.501727607615084e-06, + "loss": 2.6288, + "step": 53511 + }, + { + "epoch": 2.4913750960262586, + "grad_norm": 0.3003276629421192, + "learning_rate": 8.500216695163805e-06, + "loss": 2.6032, + "step": 53512 + }, + { + "epoch": 2.4914216542123517, + "grad_norm": 0.33032260995223706, + "learning_rate": 8.498705904510324e-06, + "loss": 2.6411, + "step": 53513 + }, + { + "epoch": 2.491468212398445, + "grad_norm": 0.33605222680885616, + "learning_rate": 8.497195235659039e-06, + "loss": 2.7334, + "step": 53514 + }, + { + "epoch": 2.491514770584538, + "grad_norm": 0.3098656085087912, + "learning_rate": 8.495684688614396e-06, + "loss": 2.524, + "step": 53515 + }, + { + "epoch": 2.491561328770631, + "grad_norm": 0.3268609168326104, + "learning_rate": 8.494174263380832e-06, + "loss": 2.669, + "step": 53516 + }, + { + "epoch": 2.491607886956724, + "grad_norm": 0.3302103579917802, + "learning_rate": 8.492663959962776e-06, + "loss": 2.571, + "step": 53517 + }, + { + "epoch": 2.4916544451428173, + "grad_norm": 0.31653197043126513, + "learning_rate": 8.491153778364663e-06, + "loss": 2.6269, + "step": 53518 + }, + { + "epoch": 2.4917010033289104, + "grad_norm": 0.30335816548376543, + "learning_rate": 8.48964371859094e-06, + "loss": 2.5029, + "step": 53519 + }, + { + "epoch": 2.4917475615150035, + "grad_norm": 0.33013927419593186, + "learning_rate": 8.488133780646008e-06, + "loss": 2.7416, + "step": 53520 + }, + { + "epoch": 2.4917941197010967, + "grad_norm": 0.30854702169723724, + "learning_rate": 8.486623964534312e-06, + "loss": 2.7559, + "step": 53521 + }, + { + "epoch": 2.4918406778871893, + "grad_norm": 0.31026988707071834, + "learning_rate": 8.485114270260286e-06, + "loss": 2.6692, + "step": 53522 + }, + { + "epoch": 2.4918872360732824, + "grad_norm": 0.3086221225994034, + "learning_rate": 8.483604697828362e-06, + "loss": 2.6421, + "step": 53523 + }, + { + "epoch": 2.4919337942593756, + "grad_norm": 0.28920260597728964, + "learning_rate": 8.482095247242977e-06, + "loss": 2.6354, + "step": 53524 + }, + { + "epoch": 2.4919803524454687, + "grad_norm": 0.31065126491904854, + "learning_rate": 8.480585918508532e-06, + "loss": 2.6109, + "step": 53525 + }, + { + "epoch": 2.492026910631562, + "grad_norm": 0.305129349288676, + "learning_rate": 8.479076711629497e-06, + "loss": 2.4829, + "step": 53526 + }, + { + "epoch": 2.492073468817655, + "grad_norm": 0.3199745327673475, + "learning_rate": 8.477567626610273e-06, + "loss": 2.5786, + "step": 53527 + }, + { + "epoch": 2.492120027003748, + "grad_norm": 0.2993290711789461, + "learning_rate": 8.476058663455293e-06, + "loss": 2.6281, + "step": 53528 + }, + { + "epoch": 2.492166585189841, + "grad_norm": 0.3134784238678428, + "learning_rate": 8.474549822168992e-06, + "loss": 2.6079, + "step": 53529 + }, + { + "epoch": 2.4922131433759342, + "grad_norm": 0.3109417883334059, + "learning_rate": 8.473041102755797e-06, + "loss": 2.6981, + "step": 53530 + }, + { + "epoch": 2.492259701562027, + "grad_norm": 0.3193529348028723, + "learning_rate": 
8.471532505220136e-06, + "loss": 2.5623, + "step": 53531 + }, + { + "epoch": 2.49230625974812, + "grad_norm": 0.3181523501593153, + "learning_rate": 8.470024029566448e-06, + "loss": 2.6373, + "step": 53532 + }, + { + "epoch": 2.492352817934213, + "grad_norm": 0.3116777062513646, + "learning_rate": 8.468515675799133e-06, + "loss": 2.526, + "step": 53533 + }, + { + "epoch": 2.4923993761203063, + "grad_norm": 0.30959625917491806, + "learning_rate": 8.467007443922637e-06, + "loss": 2.6635, + "step": 53534 + }, + { + "epoch": 2.4924459343063994, + "grad_norm": 0.3212614053749531, + "learning_rate": 8.465499333941379e-06, + "loss": 2.5356, + "step": 53535 + }, + { + "epoch": 2.4924924924924925, + "grad_norm": 0.3120292272754324, + "learning_rate": 8.46399134585979e-06, + "loss": 2.7188, + "step": 53536 + }, + { + "epoch": 2.4925390506785856, + "grad_norm": 0.3161707060205121, + "learning_rate": 8.46248347968231e-06, + "loss": 2.6242, + "step": 53537 + }, + { + "epoch": 2.4925856088646787, + "grad_norm": 0.31109925003328154, + "learning_rate": 8.460975735413318e-06, + "loss": 2.6966, + "step": 53538 + }, + { + "epoch": 2.492632167050772, + "grad_norm": 0.3025533554009922, + "learning_rate": 8.4594681130573e-06, + "loss": 2.7266, + "step": 53539 + }, + { + "epoch": 2.492678725236865, + "grad_norm": 0.308437525423327, + "learning_rate": 8.45796061261862e-06, + "loss": 2.5844, + "step": 53540 + }, + { + "epoch": 2.492725283422958, + "grad_norm": 0.296899552991336, + "learning_rate": 8.456453234101764e-06, + "loss": 2.5588, + "step": 53541 + }, + { + "epoch": 2.4927718416090507, + "grad_norm": 0.31113400922068973, + "learning_rate": 8.454945977511108e-06, + "loss": 2.7581, + "step": 53542 + }, + { + "epoch": 2.492818399795144, + "grad_norm": 0.32394220996111633, + "learning_rate": 8.453438842851096e-06, + "loss": 2.5985, + "step": 53543 + }, + { + "epoch": 2.492864957981237, + "grad_norm": 0.3194976626979381, + "learning_rate": 8.451931830126147e-06, + "loss": 2.7187, + "step": 53544 + }, + { + "epoch": 2.49291151616733, + "grad_norm": 0.3106374804923341, + "learning_rate": 8.450424939340696e-06, + "loss": 2.6982, + "step": 53545 + }, + { + "epoch": 2.492958074353423, + "grad_norm": 0.3053802874929071, + "learning_rate": 8.448918170499142e-06, + "loss": 2.5879, + "step": 53546 + }, + { + "epoch": 2.4930046325395163, + "grad_norm": 0.346562186307744, + "learning_rate": 8.447411523605925e-06, + "loss": 2.7017, + "step": 53547 + }, + { + "epoch": 2.4930511907256094, + "grad_norm": 0.2784523068831817, + "learning_rate": 8.445904998665454e-06, + "loss": 2.597, + "step": 53548 + }, + { + "epoch": 2.4930977489117025, + "grad_norm": 0.3320382565023264, + "learning_rate": 8.444398595682162e-06, + "loss": 2.6926, + "step": 53549 + }, + { + "epoch": 2.493144307097795, + "grad_norm": 0.3252603831158921, + "learning_rate": 8.442892314660478e-06, + "loss": 2.6631, + "step": 53550 + }, + { + "epoch": 2.4931908652838883, + "grad_norm": 0.30148969084562616, + "learning_rate": 8.441386155604786e-06, + "loss": 2.5158, + "step": 53551 + }, + { + "epoch": 2.4932374234699815, + "grad_norm": 0.30938345004834533, + "learning_rate": 8.439880118519556e-06, + "loss": 2.6053, + "step": 53552 + }, + { + "epoch": 2.4932839816560746, + "grad_norm": 0.31465425629482435, + "learning_rate": 8.438374203409161e-06, + "loss": 2.6165, + "step": 53553 + }, + { + "epoch": 2.4933305398421677, + "grad_norm": 0.3233908280032882, + "learning_rate": 8.436868410278065e-06, + "loss": 2.6217, + "step": 53554 + }, + { + "epoch": 2.493377098028261, 
+ "grad_norm": 0.30218993761111423, + "learning_rate": 8.435362739130653e-06, + "loss": 2.6512, + "step": 53555 + }, + { + "epoch": 2.493423656214354, + "grad_norm": 0.2923135399766897, + "learning_rate": 8.433857189971362e-06, + "loss": 2.6973, + "step": 53556 + }, + { + "epoch": 2.493470214400447, + "grad_norm": 0.29785492184179585, + "learning_rate": 8.432351762804603e-06, + "loss": 2.5935, + "step": 53557 + }, + { + "epoch": 2.49351677258654, + "grad_norm": 0.33768984543905417, + "learning_rate": 8.43084645763479e-06, + "loss": 2.7298, + "step": 53558 + }, + { + "epoch": 2.4935633307726333, + "grad_norm": 0.32841335593761833, + "learning_rate": 8.429341274466367e-06, + "loss": 2.6811, + "step": 53559 + }, + { + "epoch": 2.4936098889587264, + "grad_norm": 0.30217805904182243, + "learning_rate": 8.427836213303714e-06, + "loss": 2.5685, + "step": 53560 + }, + { + "epoch": 2.493656447144819, + "grad_norm": 0.31829977273856175, + "learning_rate": 8.426331274151272e-06, + "loss": 2.5909, + "step": 53561 + }, + { + "epoch": 2.493703005330912, + "grad_norm": 0.29965684017193245, + "learning_rate": 8.424826457013446e-06, + "loss": 2.5988, + "step": 53562 + }, + { + "epoch": 2.4937495635170053, + "grad_norm": 0.3154421151796588, + "learning_rate": 8.423321761894676e-06, + "loss": 2.653, + "step": 53563 + }, + { + "epoch": 2.4937961217030984, + "grad_norm": 0.31831189722489145, + "learning_rate": 8.421817188799335e-06, + "loss": 2.7119, + "step": 53564 + }, + { + "epoch": 2.4938426798891915, + "grad_norm": 0.3112237683333707, + "learning_rate": 8.42031273773189e-06, + "loss": 2.5858, + "step": 53565 + }, + { + "epoch": 2.4938892380752846, + "grad_norm": 0.31437612524842723, + "learning_rate": 8.418808408696704e-06, + "loss": 2.7165, + "step": 53566 + }, + { + "epoch": 2.4939357962613777, + "grad_norm": 0.30483587358249037, + "learning_rate": 8.417304201698239e-06, + "loss": 2.6236, + "step": 53567 + }, + { + "epoch": 2.493982354447471, + "grad_norm": 0.33357468152391645, + "learning_rate": 8.415800116740884e-06, + "loss": 2.6619, + "step": 53568 + }, + { + "epoch": 2.494028912633564, + "grad_norm": 0.3190081506820759, + "learning_rate": 8.41429615382906e-06, + "loss": 2.6047, + "step": 53569 + }, + { + "epoch": 2.4940754708196566, + "grad_norm": 0.3013017200620498, + "learning_rate": 8.412792312967176e-06, + "loss": 2.5488, + "step": 53570 + }, + { + "epoch": 2.4941220290057498, + "grad_norm": 0.31866988155124276, + "learning_rate": 8.411288594159649e-06, + "loss": 2.5992, + "step": 53571 + }, + { + "epoch": 2.494168587191843, + "grad_norm": 0.30524317324399, + "learning_rate": 8.409784997410908e-06, + "loss": 2.5524, + "step": 53572 + }, + { + "epoch": 2.494215145377936, + "grad_norm": 0.3280110755604872, + "learning_rate": 8.408281522725336e-06, + "loss": 2.6717, + "step": 53573 + }, + { + "epoch": 2.494261703564029, + "grad_norm": 0.32147153745786466, + "learning_rate": 8.40677817010736e-06, + "loss": 2.6858, + "step": 53574 + }, + { + "epoch": 2.494308261750122, + "grad_norm": 0.3202122779778328, + "learning_rate": 8.405274939561398e-06, + "loss": 2.7281, + "step": 53575 + }, + { + "epoch": 2.4943548199362153, + "grad_norm": 0.317109422629783, + "learning_rate": 8.40377183109185e-06, + "loss": 2.6194, + "step": 53576 + }, + { + "epoch": 2.4944013781223084, + "grad_norm": 0.29548685009095854, + "learning_rate": 8.402268844703137e-06, + "loss": 2.5891, + "step": 53577 + }, + { + "epoch": 2.4944479363084016, + "grad_norm": 0.3172816954683675, + "learning_rate": 8.400765980399678e-06, + "loss": 
2.6346, + "step": 53578 + }, + { + "epoch": 2.4944944944944947, + "grad_norm": 0.32418949638328554, + "learning_rate": 8.399263238185844e-06, + "loss": 2.621, + "step": 53579 + }, + { + "epoch": 2.494541052680588, + "grad_norm": 0.29826098690603503, + "learning_rate": 8.397760618066103e-06, + "loss": 2.668, + "step": 53580 + }, + { + "epoch": 2.4945876108666805, + "grad_norm": 0.31745675070481727, + "learning_rate": 8.396258120044826e-06, + "loss": 2.6577, + "step": 53581 + }, + { + "epoch": 2.4946341690527736, + "grad_norm": 0.31962480381260167, + "learning_rate": 8.394755744126432e-06, + "loss": 2.6186, + "step": 53582 + }, + { + "epoch": 2.4946807272388667, + "grad_norm": 0.3200154531114622, + "learning_rate": 8.393253490315328e-06, + "loss": 2.6441, + "step": 53583 + }, + { + "epoch": 2.49472728542496, + "grad_norm": 0.3247512040900894, + "learning_rate": 8.391751358615929e-06, + "loss": 2.5824, + "step": 53584 + }, + { + "epoch": 2.494773843611053, + "grad_norm": 0.332963069898751, + "learning_rate": 8.390249349032647e-06, + "loss": 2.6144, + "step": 53585 + }, + { + "epoch": 2.494820401797146, + "grad_norm": 0.3358741851733687, + "learning_rate": 8.38874746156988e-06, + "loss": 2.7485, + "step": 53586 + }, + { + "epoch": 2.494866959983239, + "grad_norm": 0.3142332345311304, + "learning_rate": 8.387245696232032e-06, + "loss": 2.6706, + "step": 53587 + }, + { + "epoch": 2.4949135181693323, + "grad_norm": 0.33836509182038804, + "learning_rate": 8.385744053023526e-06, + "loss": 2.6054, + "step": 53588 + }, + { + "epoch": 2.4949600763554254, + "grad_norm": 0.3114605451811004, + "learning_rate": 8.384242531948754e-06, + "loss": 2.5914, + "step": 53589 + }, + { + "epoch": 2.495006634541518, + "grad_norm": 0.33330343580535376, + "learning_rate": 8.382741133012134e-06, + "loss": 2.6854, + "step": 53590 + }, + { + "epoch": 2.495053192727611, + "grad_norm": 0.3459791253793134, + "learning_rate": 8.381239856218081e-06, + "loss": 2.6548, + "step": 53591 + }, + { + "epoch": 2.4950997509137043, + "grad_norm": 0.3414015407812466, + "learning_rate": 8.379738701570966e-06, + "loss": 2.6995, + "step": 53592 + }, + { + "epoch": 2.4951463090997974, + "grad_norm": 0.3317074301799249, + "learning_rate": 8.37823766907524e-06, + "loss": 2.6679, + "step": 53593 + }, + { + "epoch": 2.4951928672858905, + "grad_norm": 0.33055017709028806, + "learning_rate": 8.376736758735266e-06, + "loss": 2.5941, + "step": 53594 + }, + { + "epoch": 2.4952394254719836, + "grad_norm": 0.3406128251076026, + "learning_rate": 8.375235970555484e-06, + "loss": 2.618, + "step": 53595 + }, + { + "epoch": 2.4952859836580767, + "grad_norm": 0.3397258373669712, + "learning_rate": 8.373735304540281e-06, + "loss": 2.5364, + "step": 53596 + }, + { + "epoch": 2.49533254184417, + "grad_norm": 0.33170775127970187, + "learning_rate": 8.37223476069406e-06, + "loss": 2.5625, + "step": 53597 + }, + { + "epoch": 2.495379100030263, + "grad_norm": 0.3334253530508522, + "learning_rate": 8.370734339021241e-06, + "loss": 2.6501, + "step": 53598 + }, + { + "epoch": 2.495425658216356, + "grad_norm": 0.32376286691491485, + "learning_rate": 8.369234039526202e-06, + "loss": 2.6497, + "step": 53599 + }, + { + "epoch": 2.495472216402449, + "grad_norm": 0.30142582070008395, + "learning_rate": 8.367733862213361e-06, + "loss": 2.6127, + "step": 53600 + }, + { + "epoch": 2.495518774588542, + "grad_norm": 0.32993924034162475, + "learning_rate": 8.36623380708712e-06, + "loss": 2.5394, + "step": 53601 + }, + { + "epoch": 2.495565332774635, + "grad_norm": 
0.33756647880537594, + "learning_rate": 8.364733874151881e-06, + "loss": 2.6555, + "step": 53602 + }, + { + "epoch": 2.495611890960728, + "grad_norm": 0.30267121510064293, + "learning_rate": 8.363234063412045e-06, + "loss": 2.542, + "step": 53603 + }, + { + "epoch": 2.4956584491468212, + "grad_norm": 0.3017862930388956, + "learning_rate": 8.361734374872032e-06, + "loss": 2.6839, + "step": 53604 + }, + { + "epoch": 2.4957050073329143, + "grad_norm": 0.32363409890106365, + "learning_rate": 8.360234808536199e-06, + "loss": 2.6137, + "step": 53605 + }, + { + "epoch": 2.4957515655190075, + "grad_norm": 0.3340871528659533, + "learning_rate": 8.358735364408998e-06, + "loss": 2.6109, + "step": 53606 + }, + { + "epoch": 2.4957981237051006, + "grad_norm": 0.3255767859817739, + "learning_rate": 8.357236042494781e-06, + "loss": 2.601, + "step": 53607 + }, + { + "epoch": 2.4958446818911937, + "grad_norm": 0.30999534551170405, + "learning_rate": 8.355736842798002e-06, + "loss": 2.6148, + "step": 53608 + }, + { + "epoch": 2.4958912400772864, + "grad_norm": 0.3046811934564805, + "learning_rate": 8.35423776532302e-06, + "loss": 2.6723, + "step": 53609 + }, + { + "epoch": 2.4959377982633795, + "grad_norm": 0.30125440999201647, + "learning_rate": 8.352738810074245e-06, + "loss": 2.6814, + "step": 53610 + }, + { + "epoch": 2.4959843564494726, + "grad_norm": 0.3143076848578034, + "learning_rate": 8.351239977056092e-06, + "loss": 2.5569, + "step": 53611 + }, + { + "epoch": 2.4960309146355657, + "grad_norm": 0.33457085486018395, + "learning_rate": 8.34974126627292e-06, + "loss": 2.7463, + "step": 53612 + }, + { + "epoch": 2.496077472821659, + "grad_norm": 0.29406572750117904, + "learning_rate": 8.348242677729185e-06, + "loss": 2.6764, + "step": 53613 + }, + { + "epoch": 2.496124031007752, + "grad_norm": 0.29958413906641684, + "learning_rate": 8.346744211429235e-06, + "loss": 2.6554, + "step": 53614 + }, + { + "epoch": 2.496170589193845, + "grad_norm": 0.3111635806061175, + "learning_rate": 8.345245867377494e-06, + "loss": 2.6379, + "step": 53615 + }, + { + "epoch": 2.496217147379938, + "grad_norm": 0.31153433847649803, + "learning_rate": 8.343747645578348e-06, + "loss": 2.5347, + "step": 53616 + }, + { + "epoch": 2.4962637055660313, + "grad_norm": 0.32441029843985636, + "learning_rate": 8.342249546036207e-06, + "loss": 2.6521, + "step": 53617 + }, + { + "epoch": 2.4963102637521244, + "grad_norm": 0.30805435226551486, + "learning_rate": 8.340751568755445e-06, + "loss": 2.6173, + "step": 53618 + }, + { + "epoch": 2.4963568219382175, + "grad_norm": 0.30587198647576885, + "learning_rate": 8.339253713740492e-06, + "loss": 2.6671, + "step": 53619 + }, + { + "epoch": 2.49640338012431, + "grad_norm": 0.32136459196470585, + "learning_rate": 8.337755980995699e-06, + "loss": 2.7558, + "step": 53620 + }, + { + "epoch": 2.4964499383104033, + "grad_norm": 0.29913629614737475, + "learning_rate": 8.336258370525513e-06, + "loss": 2.6922, + "step": 53621 + }, + { + "epoch": 2.4964964964964964, + "grad_norm": 0.3194144926955552, + "learning_rate": 8.334760882334291e-06, + "loss": 2.6224, + "step": 53622 + }, + { + "epoch": 2.4965430546825895, + "grad_norm": 0.32116808865448354, + "learning_rate": 8.333263516426448e-06, + "loss": 2.721, + "step": 53623 + }, + { + "epoch": 2.4965896128686826, + "grad_norm": 0.3050694084191939, + "learning_rate": 8.331766272806379e-06, + "loss": 2.6389, + "step": 53624 + }, + { + "epoch": 2.4966361710547758, + "grad_norm": 0.2963329366779503, + "learning_rate": 8.330269151478448e-06, + "loss": 
2.6258, + "step": 53625 + }, + { + "epoch": 2.496682729240869, + "grad_norm": 0.3042336845388938, + "learning_rate": 8.328772152447095e-06, + "loss": 2.6945, + "step": 53626 + }, + { + "epoch": 2.496729287426962, + "grad_norm": 0.30987305489346983, + "learning_rate": 8.327275275716679e-06, + "loss": 2.6192, + "step": 53627 + }, + { + "epoch": 2.496775845613055, + "grad_norm": 0.3081469487279532, + "learning_rate": 8.3257785212916e-06, + "loss": 2.613, + "step": 53628 + }, + { + "epoch": 2.4968224037991478, + "grad_norm": 0.313818362961025, + "learning_rate": 8.32428188917626e-06, + "loss": 2.6079, + "step": 53629 + }, + { + "epoch": 2.496868961985241, + "grad_norm": 0.31058994597567324, + "learning_rate": 8.322785379375047e-06, + "loss": 2.6013, + "step": 53630 + }, + { + "epoch": 2.496915520171334, + "grad_norm": 0.3012998149725772, + "learning_rate": 8.32128899189235e-06, + "loss": 2.7428, + "step": 53631 + }, + { + "epoch": 2.496962078357427, + "grad_norm": 0.30740489619759054, + "learning_rate": 8.319792726732579e-06, + "loss": 2.7428, + "step": 53632 + }, + { + "epoch": 2.4970086365435202, + "grad_norm": 0.2986163193158868, + "learning_rate": 8.318296583900087e-06, + "loss": 2.5484, + "step": 53633 + }, + { + "epoch": 2.4970551947296133, + "grad_norm": 0.32691084147779226, + "learning_rate": 8.316800563399308e-06, + "loss": 2.704, + "step": 53634 + }, + { + "epoch": 2.4971017529157065, + "grad_norm": 0.3297911994050633, + "learning_rate": 8.315304665234607e-06, + "loss": 2.6744, + "step": 53635 + }, + { + "epoch": 2.4971483111017996, + "grad_norm": 0.2953813718316853, + "learning_rate": 8.313808889410373e-06, + "loss": 2.6056, + "step": 53636 + }, + { + "epoch": 2.4971948692878927, + "grad_norm": 0.30713041527229357, + "learning_rate": 8.312313235931008e-06, + "loss": 2.6009, + "step": 53637 + }, + { + "epoch": 2.497241427473986, + "grad_norm": 0.3133935217026949, + "learning_rate": 8.310817704800894e-06, + "loss": 2.5676, + "step": 53638 + }, + { + "epoch": 2.497287985660079, + "grad_norm": 0.2940217133268912, + "learning_rate": 8.309322296024436e-06, + "loss": 2.5486, + "step": 53639 + }, + { + "epoch": 2.4973345438461716, + "grad_norm": 0.3161607453359718, + "learning_rate": 8.307827009606e-06, + "loss": 2.6549, + "step": 53640 + }, + { + "epoch": 2.4973811020322647, + "grad_norm": 0.28936278997063236, + "learning_rate": 8.30633184554998e-06, + "loss": 2.5437, + "step": 53641 + }, + { + "epoch": 2.497427660218358, + "grad_norm": 0.29837130271144746, + "learning_rate": 8.304836803860771e-06, + "loss": 2.6732, + "step": 53642 + }, + { + "epoch": 2.497474218404451, + "grad_norm": 0.3090864867680219, + "learning_rate": 8.303341884542759e-06, + "loss": 2.5692, + "step": 53643 + }, + { + "epoch": 2.497520776590544, + "grad_norm": 0.30774151690609824, + "learning_rate": 8.301847087600335e-06, + "loss": 2.5225, + "step": 53644 + }, + { + "epoch": 2.497567334776637, + "grad_norm": 0.31397585074300216, + "learning_rate": 8.300352413037888e-06, + "loss": 2.5732, + "step": 53645 + }, + { + "epoch": 2.4976138929627303, + "grad_norm": 0.3263216263239587, + "learning_rate": 8.298857860859776e-06, + "loss": 2.6669, + "step": 53646 + }, + { + "epoch": 2.4976604511488234, + "grad_norm": 0.3176326859000776, + "learning_rate": 8.297363431070432e-06, + "loss": 2.6536, + "step": 53647 + }, + { + "epoch": 2.497707009334916, + "grad_norm": 0.3191149963254687, + "learning_rate": 8.295869123674205e-06, + "loss": 2.6146, + "step": 53648 + }, + { + "epoch": 2.497753567521009, + "grad_norm": 
0.33152617597310324, + "learning_rate": 8.294374938675492e-06, + "loss": 2.6577, + "step": 53649 + }, + { + "epoch": 2.4978001257071023, + "grad_norm": 0.3212832193818704, + "learning_rate": 8.292880876078685e-06, + "loss": 2.6435, + "step": 53650 + }, + { + "epoch": 2.4978466838931954, + "grad_norm": 0.30629886198095324, + "learning_rate": 8.291386935888157e-06, + "loss": 2.659, + "step": 53651 + }, + { + "epoch": 2.4978932420792885, + "grad_norm": 0.31012803783941856, + "learning_rate": 8.289893118108317e-06, + "loss": 2.6618, + "step": 53652 + }, + { + "epoch": 2.4979398002653816, + "grad_norm": 0.3083515028826107, + "learning_rate": 8.288399422743514e-06, + "loss": 2.556, + "step": 53653 + }, + { + "epoch": 2.4979863584514748, + "grad_norm": 0.31002385589218906, + "learning_rate": 8.286905849798155e-06, + "loss": 2.6131, + "step": 53654 + }, + { + "epoch": 2.498032916637568, + "grad_norm": 0.3183645305330761, + "learning_rate": 8.285412399276615e-06, + "loss": 2.661, + "step": 53655 + }, + { + "epoch": 2.498079474823661, + "grad_norm": 0.3269029411611584, + "learning_rate": 8.283919071183283e-06, + "loss": 2.6022, + "step": 53656 + }, + { + "epoch": 2.498126033009754, + "grad_norm": 0.3108314688744345, + "learning_rate": 8.282425865522537e-06, + "loss": 2.6128, + "step": 53657 + }, + { + "epoch": 2.4981725911958472, + "grad_norm": 0.29727140730935203, + "learning_rate": 8.280932782298773e-06, + "loss": 2.5628, + "step": 53658 + }, + { + "epoch": 2.49821914938194, + "grad_norm": 0.3301943029433426, + "learning_rate": 8.279439821516338e-06, + "loss": 2.6162, + "step": 53659 + }, + { + "epoch": 2.498265707568033, + "grad_norm": 0.3284801990895026, + "learning_rate": 8.277946983179658e-06, + "loss": 2.6668, + "step": 53660 + }, + { + "epoch": 2.498312265754126, + "grad_norm": 0.29608952798188365, + "learning_rate": 8.276454267293082e-06, + "loss": 2.5994, + "step": 53661 + }, + { + "epoch": 2.4983588239402192, + "grad_norm": 0.3529171390718854, + "learning_rate": 8.27496167386101e-06, + "loss": 2.6325, + "step": 53662 + }, + { + "epoch": 2.4984053821263124, + "grad_norm": 0.3265116255005325, + "learning_rate": 8.273469202887806e-06, + "loss": 2.5527, + "step": 53663 + }, + { + "epoch": 2.4984519403124055, + "grad_norm": 0.322292695893729, + "learning_rate": 8.27197685437786e-06, + "loss": 2.6338, + "step": 53664 + }, + { + "epoch": 2.4984984984984986, + "grad_norm": 0.31586618356206553, + "learning_rate": 8.270484628335567e-06, + "loss": 2.5924, + "step": 53665 + }, + { + "epoch": 2.4985450566845917, + "grad_norm": 0.32769420774211805, + "learning_rate": 8.268992524765262e-06, + "loss": 2.6451, + "step": 53666 + }, + { + "epoch": 2.498591614870685, + "grad_norm": 0.29165014595585487, + "learning_rate": 8.26750054367138e-06, + "loss": 2.6435, + "step": 53667 + }, + { + "epoch": 2.4986381730567775, + "grad_norm": 0.3173795652043658, + "learning_rate": 8.266008685058263e-06, + "loss": 2.7086, + "step": 53668 + }, + { + "epoch": 2.4986847312428706, + "grad_norm": 0.3127258513865194, + "learning_rate": 8.264516948930295e-06, + "loss": 2.6044, + "step": 53669 + }, + { + "epoch": 2.4987312894289637, + "grad_norm": 0.3066124115226101, + "learning_rate": 8.263025335291857e-06, + "loss": 2.5946, + "step": 53670 + }, + { + "epoch": 2.498777847615057, + "grad_norm": 0.29192275443573823, + "learning_rate": 8.26153384414734e-06, + "loss": 2.6127, + "step": 53671 + }, + { + "epoch": 2.49882440580115, + "grad_norm": 0.31001497627861646, + "learning_rate": 8.26004247550109e-06, + "loss": 2.5724, + "step": 
53672 + }, + { + "epoch": 2.498870963987243, + "grad_norm": 0.33336101529444584, + "learning_rate": 8.258551229357525e-06, + "loss": 2.6699, + "step": 53673 + }, + { + "epoch": 2.498917522173336, + "grad_norm": 0.3166467391810441, + "learning_rate": 8.257060105720987e-06, + "loss": 2.6544, + "step": 53674 + }, + { + "epoch": 2.4989640803594293, + "grad_norm": 0.31104590306233076, + "learning_rate": 8.255569104595861e-06, + "loss": 2.6684, + "step": 53675 + }, + { + "epoch": 2.4990106385455224, + "grad_norm": 0.3055108520416916, + "learning_rate": 8.254078225986533e-06, + "loss": 2.5825, + "step": 53676 + }, + { + "epoch": 2.4990571967316155, + "grad_norm": 0.31049832903834645, + "learning_rate": 8.252587469897371e-06, + "loss": 2.6826, + "step": 53677 + }, + { + "epoch": 2.4991037549177086, + "grad_norm": 0.31925895611281446, + "learning_rate": 8.251096836332767e-06, + "loss": 2.5894, + "step": 53678 + }, + { + "epoch": 2.4991503131038013, + "grad_norm": 0.3378145913958969, + "learning_rate": 8.249606325297055e-06, + "loss": 2.6535, + "step": 53679 + }, + { + "epoch": 2.4991968712898944, + "grad_norm": 0.32567591864833323, + "learning_rate": 8.24811593679466e-06, + "loss": 2.6448, + "step": 53680 + }, + { + "epoch": 2.4992434294759875, + "grad_norm": 0.3347009818426703, + "learning_rate": 8.246625670829916e-06, + "loss": 2.6162, + "step": 53681 + }, + { + "epoch": 2.4992899876620807, + "grad_norm": 0.3078814827283176, + "learning_rate": 8.245135527407217e-06, + "loss": 2.6601, + "step": 53682 + }, + { + "epoch": 2.4993365458481738, + "grad_norm": 0.3188520230999094, + "learning_rate": 8.243645506530929e-06, + "loss": 2.7172, + "step": 53683 + }, + { + "epoch": 2.499383104034267, + "grad_norm": 0.3204755194766304, + "learning_rate": 8.242155608205427e-06, + "loss": 2.6871, + "step": 53684 + }, + { + "epoch": 2.49942966222036, + "grad_norm": 0.29377288968543097, + "learning_rate": 8.240665832435085e-06, + "loss": 2.5899, + "step": 53685 + }, + { + "epoch": 2.499476220406453, + "grad_norm": 0.32259826189470664, + "learning_rate": 8.239176179224284e-06, + "loss": 2.6734, + "step": 53686 + }, + { + "epoch": 2.499522778592546, + "grad_norm": 0.3213608461498126, + "learning_rate": 8.23768664857738e-06, + "loss": 2.6158, + "step": 53687 + }, + { + "epoch": 2.499569336778639, + "grad_norm": 0.31381011448876156, + "learning_rate": 8.23619724049875e-06, + "loss": 2.7019, + "step": 53688 + }, + { + "epoch": 2.499615894964732, + "grad_norm": 0.31106102109429473, + "learning_rate": 8.234707954992765e-06, + "loss": 2.6423, + "step": 53689 + }, + { + "epoch": 2.499662453150825, + "grad_norm": 0.30565568322196435, + "learning_rate": 8.233218792063796e-06, + "loss": 2.6208, + "step": 53690 + }, + { + "epoch": 2.4997090113369183, + "grad_norm": 0.31097928747244097, + "learning_rate": 8.231729751716234e-06, + "loss": 2.6501, + "step": 53691 + }, + { + "epoch": 2.4997555695230114, + "grad_norm": 0.2924161746534061, + "learning_rate": 8.2302408339544e-06, + "loss": 2.6083, + "step": 53692 + }, + { + "epoch": 2.4998021277091045, + "grad_norm": 0.3192687592667165, + "learning_rate": 8.228752038782717e-06, + "loss": 2.6669, + "step": 53693 + }, + { + "epoch": 2.4998486858951976, + "grad_norm": 0.32017661872162756, + "learning_rate": 8.227263366205524e-06, + "loss": 2.6453, + "step": 53694 + }, + { + "epoch": 2.4998952440812907, + "grad_norm": 0.31929735362081535, + "learning_rate": 8.2257748162272e-06, + "loss": 2.7129, + "step": 53695 + }, + { + "epoch": 2.499941802267384, + "grad_norm": 0.3100728575627786, + 
"learning_rate": 8.224286388852104e-06, + "loss": 2.6289, + "step": 53696 + }, + { + "epoch": 2.499988360453477, + "grad_norm": 0.31152098089377983, + "learning_rate": 8.222798084084615e-06, + "loss": 2.58, + "step": 53697 + }, + { + "epoch": 2.50003491863957, + "grad_norm": 0.30374544549288013, + "learning_rate": 8.221309901929103e-06, + "loss": 2.6498, + "step": 53698 + }, + { + "epoch": 2.5000814768256627, + "grad_norm": 0.305843781415432, + "learning_rate": 8.219821842389936e-06, + "loss": 2.6291, + "step": 53699 + }, + { + "epoch": 2.500128035011756, + "grad_norm": 0.3050082584714745, + "learning_rate": 8.218333905471466e-06, + "loss": 2.4912, + "step": 53700 + }, + { + "epoch": 2.500174593197849, + "grad_norm": 0.3318442968672426, + "learning_rate": 8.21684609117807e-06, + "loss": 2.7219, + "step": 53701 + }, + { + "epoch": 2.500221151383942, + "grad_norm": 0.29577069282825824, + "learning_rate": 8.215358399514112e-06, + "loss": 2.6279, + "step": 53702 + }, + { + "epoch": 2.500267709570035, + "grad_norm": 0.29182982026888127, + "learning_rate": 8.213870830483966e-06, + "loss": 2.5587, + "step": 53703 + }, + { + "epoch": 2.5003142677561283, + "grad_norm": 0.319176141996563, + "learning_rate": 8.212383384091999e-06, + "loss": 2.608, + "step": 53704 + }, + { + "epoch": 2.5003608259422214, + "grad_norm": 0.3061746371360118, + "learning_rate": 8.21089606034255e-06, + "loss": 2.4806, + "step": 53705 + }, + { + "epoch": 2.500407384128314, + "grad_norm": 0.2917091729544983, + "learning_rate": 8.209408859240025e-06, + "loss": 2.5912, + "step": 53706 + }, + { + "epoch": 2.500453942314407, + "grad_norm": 0.30669455991845673, + "learning_rate": 8.207921780788758e-06, + "loss": 2.6024, + "step": 53707 + }, + { + "epoch": 2.5005005005005003, + "grad_norm": 0.32189469861891096, + "learning_rate": 8.206434824993126e-06, + "loss": 2.5166, + "step": 53708 + }, + { + "epoch": 2.5005470586865934, + "grad_norm": 0.3076122570660065, + "learning_rate": 8.204947991857487e-06, + "loss": 2.6023, + "step": 53709 + }, + { + "epoch": 2.5005936168726866, + "grad_norm": 0.30008687241370036, + "learning_rate": 8.203461281386204e-06, + "loss": 2.6267, + "step": 53710 + }, + { + "epoch": 2.5006401750587797, + "grad_norm": 0.3096954924622054, + "learning_rate": 8.201974693583653e-06, + "loss": 2.6549, + "step": 53711 + }, + { + "epoch": 2.500686733244873, + "grad_norm": 0.3190933895036086, + "learning_rate": 8.200488228454196e-06, + "loss": 2.6635, + "step": 53712 + }, + { + "epoch": 2.500733291430966, + "grad_norm": 0.33105745658745706, + "learning_rate": 8.199001886002172e-06, + "loss": 2.6674, + "step": 53713 + }, + { + "epoch": 2.500779849617059, + "grad_norm": 0.31116581564862383, + "learning_rate": 8.197515666231964e-06, + "loss": 2.6623, + "step": 53714 + }, + { + "epoch": 2.500826407803152, + "grad_norm": 0.3173755055786146, + "learning_rate": 8.196029569147934e-06, + "loss": 2.6603, + "step": 53715 + }, + { + "epoch": 2.5008729659892452, + "grad_norm": 0.3123592854083672, + "learning_rate": 8.194543594754433e-06, + "loss": 2.6772, + "step": 53716 + }, + { + "epoch": 2.5009195241753384, + "grad_norm": 0.3295565090023804, + "learning_rate": 8.193057743055838e-06, + "loss": 2.6009, + "step": 53717 + }, + { + "epoch": 2.5009660823614315, + "grad_norm": 0.3036306170899399, + "learning_rate": 8.191572014056481e-06, + "loss": 2.6125, + "step": 53718 + }, + { + "epoch": 2.501012640547524, + "grad_norm": 0.30139419970643344, + "learning_rate": 8.190086407760761e-06, + "loss": 2.6497, + "step": 53719 + }, + { + 
"epoch": 2.5010591987336173, + "grad_norm": 0.3093527359296857, + "learning_rate": 8.188600924173002e-06, + "loss": 2.692, + "step": 53720 + }, + { + "epoch": 2.5011057569197104, + "grad_norm": 0.33081755048581835, + "learning_rate": 8.187115563297592e-06, + "loss": 2.5939, + "step": 53721 + }, + { + "epoch": 2.5011523151058035, + "grad_norm": 0.3079500172660287, + "learning_rate": 8.185630325138876e-06, + "loss": 2.5307, + "step": 53722 + }, + { + "epoch": 2.5011988732918966, + "grad_norm": 0.3131354436371807, + "learning_rate": 8.184145209701215e-06, + "loss": 2.5968, + "step": 53723 + }, + { + "epoch": 2.5012454314779897, + "grad_norm": 0.2910843653681612, + "learning_rate": 8.182660216988963e-06, + "loss": 2.6358, + "step": 53724 + }, + { + "epoch": 2.501291989664083, + "grad_norm": 0.31790281615786387, + "learning_rate": 8.181175347006504e-06, + "loss": 2.706, + "step": 53725 + }, + { + "epoch": 2.5013385478501755, + "grad_norm": 0.31206440537085517, + "learning_rate": 8.179690599758155e-06, + "loss": 2.7007, + "step": 53726 + }, + { + "epoch": 2.5013851060362686, + "grad_norm": 0.31645815707097213, + "learning_rate": 8.178205975248298e-06, + "loss": 2.6719, + "step": 53727 + }, + { + "epoch": 2.5014316642223617, + "grad_norm": 0.3232015746139401, + "learning_rate": 8.176721473481285e-06, + "loss": 2.5352, + "step": 53728 + }, + { + "epoch": 2.501478222408455, + "grad_norm": 0.32247011012532517, + "learning_rate": 8.175237094461474e-06, + "loss": 2.6018, + "step": 53729 + }, + { + "epoch": 2.501524780594548, + "grad_norm": 0.313797896696744, + "learning_rate": 8.173752838193239e-06, + "loss": 2.5791, + "step": 53730 + }, + { + "epoch": 2.501571338780641, + "grad_norm": 0.3018765420120997, + "learning_rate": 8.172268704680885e-06, + "loss": 2.5638, + "step": 53731 + }, + { + "epoch": 2.501617896966734, + "grad_norm": 0.3147472065324801, + "learning_rate": 8.170784693928834e-06, + "loss": 2.596, + "step": 53732 + }, + { + "epoch": 2.5016644551528273, + "grad_norm": 0.32840890854654275, + "learning_rate": 8.169300805941382e-06, + "loss": 2.7513, + "step": 53733 + }, + { + "epoch": 2.5017110133389204, + "grad_norm": 0.3090839814172821, + "learning_rate": 8.167817040722937e-06, + "loss": 2.5614, + "step": 53734 + }, + { + "epoch": 2.5017575715250135, + "grad_norm": 0.3130547185285241, + "learning_rate": 8.166333398277814e-06, + "loss": 2.5997, + "step": 53735 + }, + { + "epoch": 2.5018041297111067, + "grad_norm": 0.29643386468688127, + "learning_rate": 8.16484987861038e-06, + "loss": 2.4984, + "step": 53736 + }, + { + "epoch": 2.5018506878971998, + "grad_norm": 0.2923945131397664, + "learning_rate": 8.163366481724994e-06, + "loss": 2.7098, + "step": 53737 + }, + { + "epoch": 2.5018972460832924, + "grad_norm": 0.3286061777245109, + "learning_rate": 8.161883207626003e-06, + "loss": 2.6569, + "step": 53738 + }, + { + "epoch": 2.5019438042693856, + "grad_norm": 0.28926996397428134, + "learning_rate": 8.160400056317775e-06, + "loss": 2.5494, + "step": 53739 + }, + { + "epoch": 2.5019903624554787, + "grad_norm": 0.32784817654600806, + "learning_rate": 8.158917027804636e-06, + "loss": 2.7019, + "step": 53740 + }, + { + "epoch": 2.502036920641572, + "grad_norm": 0.31962361387123805, + "learning_rate": 8.157434122090957e-06, + "loss": 2.5965, + "step": 53741 + }, + { + "epoch": 2.502083478827665, + "grad_norm": 0.31706730735775673, + "learning_rate": 8.15595133918109e-06, + "loss": 2.6198, + "step": 53742 + }, + { + "epoch": 2.502130037013758, + "grad_norm": 0.3231686443969062, + "learning_rate": 
8.15446867907939e-06, + "loss": 2.6623, + "step": 53743 + }, + { + "epoch": 2.502176595199851, + "grad_norm": 0.3124449786758167, + "learning_rate": 8.152986141790181e-06, + "loss": 2.6565, + "step": 53744 + }, + { + "epoch": 2.502223153385944, + "grad_norm": 0.3226764739758155, + "learning_rate": 8.151503727317855e-06, + "loss": 2.5756, + "step": 53745 + }, + { + "epoch": 2.502269711572037, + "grad_norm": 0.3284132926171035, + "learning_rate": 8.150021435666721e-06, + "loss": 2.5922, + "step": 53746 + }, + { + "epoch": 2.50231626975813, + "grad_norm": 0.30366486562380784, + "learning_rate": 8.148539266841176e-06, + "loss": 2.6436, + "step": 53747 + }, + { + "epoch": 2.502362827944223, + "grad_norm": 0.28918125438009135, + "learning_rate": 8.147057220845533e-06, + "loss": 2.6098, + "step": 53748 + }, + { + "epoch": 2.5024093861303163, + "grad_norm": 0.31129411955513364, + "learning_rate": 8.145575297684154e-06, + "loss": 2.4457, + "step": 53749 + }, + { + "epoch": 2.5024559443164094, + "grad_norm": 0.3160666206301822, + "learning_rate": 8.144093497361388e-06, + "loss": 2.5645, + "step": 53750 + }, + { + "epoch": 2.5025025025025025, + "grad_norm": 0.31006235704962337, + "learning_rate": 8.142611819881585e-06, + "loss": 2.6358, + "step": 53751 + }, + { + "epoch": 2.5025490606885956, + "grad_norm": 0.29892031669864144, + "learning_rate": 8.141130265249109e-06, + "loss": 2.6086, + "step": 53752 + }, + { + "epoch": 2.5025956188746887, + "grad_norm": 0.2981621610366919, + "learning_rate": 8.139648833468272e-06, + "loss": 2.5337, + "step": 53753 + }, + { + "epoch": 2.502642177060782, + "grad_norm": 0.2966549944014462, + "learning_rate": 8.138167524543444e-06, + "loss": 2.7305, + "step": 53754 + }, + { + "epoch": 2.502688735246875, + "grad_norm": 0.30112667403576593, + "learning_rate": 8.136686338478971e-06, + "loss": 2.6347, + "step": 53755 + }, + { + "epoch": 2.502735293432968, + "grad_norm": 0.30803124045789104, + "learning_rate": 8.135205275279201e-06, + "loss": 2.643, + "step": 53756 + }, + { + "epoch": 2.502781851619061, + "grad_norm": 0.30227719560495875, + "learning_rate": 8.133724334948478e-06, + "loss": 2.5927, + "step": 53757 + }, + { + "epoch": 2.502828409805154, + "grad_norm": 0.3032362211741251, + "learning_rate": 8.132243517491156e-06, + "loss": 2.5742, + "step": 53758 + }, + { + "epoch": 2.502874967991247, + "grad_norm": 0.291507951834894, + "learning_rate": 8.13076282291156e-06, + "loss": 2.6038, + "step": 53759 + }, + { + "epoch": 2.50292152617734, + "grad_norm": 0.306970534025782, + "learning_rate": 8.129282251214066e-06, + "loss": 2.5831, + "step": 53760 + }, + { + "epoch": 2.502968084363433, + "grad_norm": 0.3058035050173619, + "learning_rate": 8.127801802402996e-06, + "loss": 2.5441, + "step": 53761 + }, + { + "epoch": 2.5030146425495263, + "grad_norm": 0.2952682340386328, + "learning_rate": 8.1263214764827e-06, + "loss": 2.6477, + "step": 53762 + }, + { + "epoch": 2.5030612007356194, + "grad_norm": 0.29489865582323743, + "learning_rate": 8.124841273457529e-06, + "loss": 2.5943, + "step": 53763 + }, + { + "epoch": 2.5031077589217126, + "grad_norm": 0.3082132943342659, + "learning_rate": 8.123361193331824e-06, + "loss": 2.6897, + "step": 53764 + }, + { + "epoch": 2.5031543171078052, + "grad_norm": 0.297903691164345, + "learning_rate": 8.121881236109935e-06, + "loss": 2.5932, + "step": 53765 + }, + { + "epoch": 2.5032008752938983, + "grad_norm": 0.31110880694615656, + "learning_rate": 8.120401401796191e-06, + "loss": 2.657, + "step": 53766 + }, + { + "epoch": 
2.5032474334799915, + "grad_norm": 0.3077454578724256, + "learning_rate": 8.118921690394943e-06, + "loss": 2.6121, + "step": 53767 + }, + { + "epoch": 2.5032939916660846, + "grad_norm": 0.3204283318755035, + "learning_rate": 8.117442101910533e-06, + "loss": 2.6767, + "step": 53768 + }, + { + "epoch": 2.5033405498521777, + "grad_norm": 0.3213020739967188, + "learning_rate": 8.115962636347308e-06, + "loss": 2.5557, + "step": 53769 + }, + { + "epoch": 2.503387108038271, + "grad_norm": 0.3084583564388067, + "learning_rate": 8.114483293709601e-06, + "loss": 2.609, + "step": 53770 + }, + { + "epoch": 2.503433666224364, + "grad_norm": 0.29994487870367337, + "learning_rate": 8.113004074001773e-06, + "loss": 2.5509, + "step": 53771 + }, + { + "epoch": 2.503480224410457, + "grad_norm": 0.3015055126521817, + "learning_rate": 8.111524977228124e-06, + "loss": 2.5915, + "step": 53772 + }, + { + "epoch": 2.50352678259655, + "grad_norm": 0.30732487912809053, + "learning_rate": 8.11004600339305e-06, + "loss": 2.6379, + "step": 53773 + }, + { + "epoch": 2.5035733407826433, + "grad_norm": 0.32305940010901946, + "learning_rate": 8.10856715250085e-06, + "loss": 2.675, + "step": 53774 + }, + { + "epoch": 2.5036198989687364, + "grad_norm": 0.29730062776990246, + "learning_rate": 8.10708842455588e-06, + "loss": 2.6998, + "step": 53775 + }, + { + "epoch": 2.5036664571548295, + "grad_norm": 0.33089768000901604, + "learning_rate": 8.105609819562482e-06, + "loss": 2.6571, + "step": 53776 + }, + { + "epoch": 2.503713015340922, + "grad_norm": 0.3137367073909361, + "learning_rate": 8.10413133752499e-06, + "loss": 2.593, + "step": 53777 + }, + { + "epoch": 2.5037595735270153, + "grad_norm": 0.343045113160751, + "learning_rate": 8.102652978447756e-06, + "loss": 2.6799, + "step": 53778 + }, + { + "epoch": 2.5038061317131084, + "grad_norm": 0.335710418771359, + "learning_rate": 8.101174742335094e-06, + "loss": 2.6997, + "step": 53779 + }, + { + "epoch": 2.5038526898992015, + "grad_norm": 0.3105494851368742, + "learning_rate": 8.099696629191356e-06, + "loss": 2.5191, + "step": 53780 + }, + { + "epoch": 2.5038992480852946, + "grad_norm": 0.3203767695881658, + "learning_rate": 8.098218639020882e-06, + "loss": 2.6036, + "step": 53781 + }, + { + "epoch": 2.5039458062713877, + "grad_norm": 0.32192412617527055, + "learning_rate": 8.09674077182801e-06, + "loss": 2.5709, + "step": 53782 + }, + { + "epoch": 2.503992364457481, + "grad_norm": 0.30837586422959556, + "learning_rate": 8.095263027617073e-06, + "loss": 2.7392, + "step": 53783 + }, + { + "epoch": 2.504038922643574, + "grad_norm": 0.33387939107377534, + "learning_rate": 8.09378540639243e-06, + "loss": 2.6801, + "step": 53784 + }, + { + "epoch": 2.5040854808296666, + "grad_norm": 0.3188368831008247, + "learning_rate": 8.092307908158365e-06, + "loss": 2.5792, + "step": 53785 + }, + { + "epoch": 2.5041320390157598, + "grad_norm": 0.3093913368452603, + "learning_rate": 8.090830532919275e-06, + "loss": 2.6202, + "step": 53786 + }, + { + "epoch": 2.504178597201853, + "grad_norm": 0.3170397296044403, + "learning_rate": 8.089353280679457e-06, + "loss": 2.6816, + "step": 53787 + }, + { + "epoch": 2.504225155387946, + "grad_norm": 0.32098147412804867, + "learning_rate": 8.087876151443257e-06, + "loss": 2.6218, + "step": 53788 + }, + { + "epoch": 2.504271713574039, + "grad_norm": 0.30956037003467785, + "learning_rate": 8.086399145215012e-06, + "loss": 2.5596, + "step": 53789 + }, + { + "epoch": 2.504318271760132, + "grad_norm": 0.3313566610068818, + "learning_rate": 
8.084922261999056e-06, + "loss": 2.6688, + "step": 53790 + }, + { + "epoch": 2.5043648299462253, + "grad_norm": 0.3266849676191171, + "learning_rate": 8.08344550179974e-06, + "loss": 2.551, + "step": 53791 + }, + { + "epoch": 2.5044113881323185, + "grad_norm": 0.32649332461501224, + "learning_rate": 8.081968864621353e-06, + "loss": 2.6494, + "step": 53792 + }, + { + "epoch": 2.5044579463184116, + "grad_norm": 0.31144209419795255, + "learning_rate": 8.080492350468283e-06, + "loss": 2.6542, + "step": 53793 + }, + { + "epoch": 2.5045045045045047, + "grad_norm": 0.3056943891653492, + "learning_rate": 8.079015959344826e-06, + "loss": 2.6311, + "step": 53794 + }, + { + "epoch": 2.504551062690598, + "grad_norm": 0.3008716534430967, + "learning_rate": 8.077539691255332e-06, + "loss": 2.5363, + "step": 53795 + }, + { + "epoch": 2.504597620876691, + "grad_norm": 0.312731070296292, + "learning_rate": 8.076063546204122e-06, + "loss": 2.6805, + "step": 53796 + }, + { + "epoch": 2.5046441790627836, + "grad_norm": 0.2982341153292289, + "learning_rate": 8.074587524195553e-06, + "loss": 2.5928, + "step": 53797 + }, + { + "epoch": 2.5046907372488767, + "grad_norm": 0.3016750890274297, + "learning_rate": 8.073111625233908e-06, + "loss": 2.5697, + "step": 53798 + }, + { + "epoch": 2.50473729543497, + "grad_norm": 0.3070597342743179, + "learning_rate": 8.071635849323583e-06, + "loss": 2.5709, + "step": 53799 + }, + { + "epoch": 2.504783853621063, + "grad_norm": 0.3256461252276009, + "learning_rate": 8.070160196468857e-06, + "loss": 2.6721, + "step": 53800 + }, + { + "epoch": 2.504830411807156, + "grad_norm": 0.3337861085398762, + "learning_rate": 8.068684666674082e-06, + "loss": 2.7773, + "step": 53801 + }, + { + "epoch": 2.504876969993249, + "grad_norm": 0.31683881779211975, + "learning_rate": 8.067209259943587e-06, + "loss": 2.6037, + "step": 53802 + }, + { + "epoch": 2.5049235281793423, + "grad_norm": 0.3101745318914675, + "learning_rate": 8.065733976281703e-06, + "loss": 2.5805, + "step": 53803 + }, + { + "epoch": 2.504970086365435, + "grad_norm": 0.3007746915683553, + "learning_rate": 8.06425881569277e-06, + "loss": 2.6165, + "step": 53804 + }, + { + "epoch": 2.505016644551528, + "grad_norm": 0.3207736419749035, + "learning_rate": 8.062783778181082e-06, + "loss": 2.5396, + "step": 53805 + }, + { + "epoch": 2.505063202737621, + "grad_norm": 0.3258477204771221, + "learning_rate": 8.061308863751017e-06, + "loss": 2.5137, + "step": 53806 + }, + { + "epoch": 2.5051097609237143, + "grad_norm": 0.3053563045656181, + "learning_rate": 8.059834072406863e-06, + "loss": 2.58, + "step": 53807 + }, + { + "epoch": 2.5051563191098074, + "grad_norm": 0.31536669352891966, + "learning_rate": 8.058359404152965e-06, + "loss": 2.6854, + "step": 53808 + }, + { + "epoch": 2.5052028772959005, + "grad_norm": 0.3151129681673339, + "learning_rate": 8.056884858993652e-06, + "loss": 2.5018, + "step": 53809 + }, + { + "epoch": 2.5052494354819936, + "grad_norm": 0.2979245094503337, + "learning_rate": 8.055410436933253e-06, + "loss": 2.5527, + "step": 53810 + }, + { + "epoch": 2.5052959936680868, + "grad_norm": 0.308186272262266, + "learning_rate": 8.053936137976086e-06, + "loss": 2.5791, + "step": 53811 + }, + { + "epoch": 2.50534255185418, + "grad_norm": 0.3171097733722316, + "learning_rate": 8.052461962126501e-06, + "loss": 2.6609, + "step": 53812 + }, + { + "epoch": 2.505389110040273, + "grad_norm": 0.3269782024287399, + "learning_rate": 8.050987909388796e-06, + "loss": 2.6553, + "step": 53813 + }, + { + "epoch": 2.505435668226366, + 
"grad_norm": 0.33118742805110735, + "learning_rate": 8.049513979767303e-06, + "loss": 2.633, + "step": 53814 + }, + { + "epoch": 2.505482226412459, + "grad_norm": 0.3222393715677394, + "learning_rate": 8.048040173266359e-06, + "loss": 2.7187, + "step": 53815 + }, + { + "epoch": 2.505528784598552, + "grad_norm": 0.30694099994377416, + "learning_rate": 8.046566489890284e-06, + "loss": 2.6707, + "step": 53816 + }, + { + "epoch": 2.505575342784645, + "grad_norm": 0.3118973480512338, + "learning_rate": 8.04509292964341e-06, + "loss": 2.6558, + "step": 53817 + }, + { + "epoch": 2.505621900970738, + "grad_norm": 0.2973950802679836, + "learning_rate": 8.043619492530035e-06, + "loss": 2.5675, + "step": 53818 + }, + { + "epoch": 2.5056684591568312, + "grad_norm": 0.30632729116416896, + "learning_rate": 8.042146178554528e-06, + "loss": 2.6186, + "step": 53819 + }, + { + "epoch": 2.5057150173429243, + "grad_norm": 0.31517687264683203, + "learning_rate": 8.040672987721177e-06, + "loss": 2.6138, + "step": 53820 + }, + { + "epoch": 2.5057615755290175, + "grad_norm": 0.3162527860140112, + "learning_rate": 8.039199920034312e-06, + "loss": 2.56, + "step": 53821 + }, + { + "epoch": 2.5058081337151106, + "grad_norm": 0.3176359916818247, + "learning_rate": 8.037726975498266e-06, + "loss": 2.4826, + "step": 53822 + }, + { + "epoch": 2.5058546919012037, + "grad_norm": 0.3089753227452863, + "learning_rate": 8.036254154117357e-06, + "loss": 2.6317, + "step": 53823 + }, + { + "epoch": 2.5059012500872964, + "grad_norm": 0.28522847989327427, + "learning_rate": 8.034781455895907e-06, + "loss": 2.5733, + "step": 53824 + }, + { + "epoch": 2.5059478082733895, + "grad_norm": 0.3141438855793186, + "learning_rate": 8.033308880838247e-06, + "loss": 2.6476, + "step": 53825 + }, + { + "epoch": 2.5059943664594826, + "grad_norm": 0.32231193602397773, + "learning_rate": 8.031836428948686e-06, + "loss": 2.6849, + "step": 53826 + }, + { + "epoch": 2.5060409246455757, + "grad_norm": 0.31268223043420046, + "learning_rate": 8.030364100231547e-06, + "loss": 2.645, + "step": 53827 + }, + { + "epoch": 2.506087482831669, + "grad_norm": 0.3046398117966668, + "learning_rate": 8.028891894691153e-06, + "loss": 2.5668, + "step": 53828 + }, + { + "epoch": 2.506134041017762, + "grad_norm": 0.32514452347071643, + "learning_rate": 8.027419812331833e-06, + "loss": 2.6505, + "step": 53829 + }, + { + "epoch": 2.506180599203855, + "grad_norm": 0.3130209690059638, + "learning_rate": 8.025947853157906e-06, + "loss": 2.6715, + "step": 53830 + }, + { + "epoch": 2.506227157389948, + "grad_norm": 0.3267171438880653, + "learning_rate": 8.024476017173666e-06, + "loss": 2.5922, + "step": 53831 + }, + { + "epoch": 2.5062737155760413, + "grad_norm": 0.32924603305413835, + "learning_rate": 8.02300430438348e-06, + "loss": 2.6462, + "step": 53832 + }, + { + "epoch": 2.5063202737621344, + "grad_norm": 0.29679928820941714, + "learning_rate": 8.021532714791624e-06, + "loss": 2.6785, + "step": 53833 + }, + { + "epoch": 2.5063668319482275, + "grad_norm": 0.3104710577699427, + "learning_rate": 8.020061248402439e-06, + "loss": 2.6198, + "step": 53834 + }, + { + "epoch": 2.5064133901343206, + "grad_norm": 0.3435097152835644, + "learning_rate": 8.018589905220236e-06, + "loss": 2.6504, + "step": 53835 + }, + { + "epoch": 2.5064599483204133, + "grad_norm": 0.3392244078781904, + "learning_rate": 8.017118685249337e-06, + "loss": 2.6436, + "step": 53836 + }, + { + "epoch": 2.5065065065065064, + "grad_norm": 0.3129177647298549, + "learning_rate": 8.015647588494052e-06, + "loss": 
2.7023, + "step": 53837 + }, + { + "epoch": 2.5065530646925995, + "grad_norm": 0.3072100662313301, + "learning_rate": 8.014176614958723e-06, + "loss": 2.6595, + "step": 53838 + }, + { + "epoch": 2.5065996228786926, + "grad_norm": 0.33995241966979617, + "learning_rate": 8.012705764647639e-06, + "loss": 2.6081, + "step": 53839 + }, + { + "epoch": 2.5066461810647858, + "grad_norm": 0.33134195692015483, + "learning_rate": 8.011235037565123e-06, + "loss": 2.6526, + "step": 53840 + }, + { + "epoch": 2.506692739250879, + "grad_norm": 0.3244116249647882, + "learning_rate": 8.0097644337155e-06, + "loss": 2.6759, + "step": 53841 + }, + { + "epoch": 2.506739297436972, + "grad_norm": 0.31225243413092235, + "learning_rate": 8.008293953103075e-06, + "loss": 2.6336, + "step": 53842 + }, + { + "epoch": 2.5067858556230647, + "grad_norm": 0.3149704375398158, + "learning_rate": 8.006823595732187e-06, + "loss": 2.586, + "step": 53843 + }, + { + "epoch": 2.506832413809158, + "grad_norm": 0.33603897728589, + "learning_rate": 8.00535336160711e-06, + "loss": 2.5463, + "step": 53844 + }, + { + "epoch": 2.506878971995251, + "grad_norm": 0.3150650517361612, + "learning_rate": 8.003883250732208e-06, + "loss": 2.6727, + "step": 53845 + }, + { + "epoch": 2.506925530181344, + "grad_norm": 0.3090112721897318, + "learning_rate": 8.002413263111746e-06, + "loss": 2.6654, + "step": 53846 + }, + { + "epoch": 2.506972088367437, + "grad_norm": 0.3028966339189727, + "learning_rate": 8.000943398750093e-06, + "loss": 2.5445, + "step": 53847 + }, + { + "epoch": 2.5070186465535302, + "grad_norm": 0.3137132714411829, + "learning_rate": 7.999473657651518e-06, + "loss": 2.5941, + "step": 53848 + }, + { + "epoch": 2.5070652047396234, + "grad_norm": 0.294340503607322, + "learning_rate": 7.998004039820345e-06, + "loss": 2.5463, + "step": 53849 + }, + { + "epoch": 2.5071117629257165, + "grad_norm": 0.3381101892763171, + "learning_rate": 7.996534545260897e-06, + "loss": 2.5945, + "step": 53850 + }, + { + "epoch": 2.5071583211118096, + "grad_norm": 0.3241350363048037, + "learning_rate": 7.995065173977484e-06, + "loss": 2.7071, + "step": 53851 + }, + { + "epoch": 2.5072048792979027, + "grad_norm": 0.3164707277489852, + "learning_rate": 7.993595925974428e-06, + "loss": 2.634, + "step": 53852 + }, + { + "epoch": 2.507251437483996, + "grad_norm": 0.28732695306463085, + "learning_rate": 7.992126801256011e-06, + "loss": 2.5723, + "step": 53853 + }, + { + "epoch": 2.507297995670089, + "grad_norm": 0.3134375334355811, + "learning_rate": 7.990657799826573e-06, + "loss": 2.6754, + "step": 53854 + }, + { + "epoch": 2.5073445538561816, + "grad_norm": 0.30290834133318617, + "learning_rate": 7.989188921690411e-06, + "loss": 2.6155, + "step": 53855 + }, + { + "epoch": 2.5073911120422747, + "grad_norm": 0.3058589035022487, + "learning_rate": 7.987720166851853e-06, + "loss": 2.6538, + "step": 53856 + }, + { + "epoch": 2.507437670228368, + "grad_norm": 0.3027568862320145, + "learning_rate": 7.986251535315171e-06, + "loss": 2.6233, + "step": 53857 + }, + { + "epoch": 2.507484228414461, + "grad_norm": 0.30230605395659516, + "learning_rate": 7.984783027084731e-06, + "loss": 2.6171, + "step": 53858 + }, + { + "epoch": 2.507530786600554, + "grad_norm": 0.3202837998609884, + "learning_rate": 7.983314642164785e-06, + "loss": 2.5795, + "step": 53859 + }, + { + "epoch": 2.507577344786647, + "grad_norm": 0.33881469687909255, + "learning_rate": 7.981846380559694e-06, + "loss": 2.7005, + "step": 53860 + }, + { + "epoch": 2.5076239029727403, + "grad_norm": 
0.33160704388170453, + "learning_rate": 7.98037824227374e-06, + "loss": 2.701, + "step": 53861 + }, + { + "epoch": 2.5076704611588334, + "grad_norm": 0.3019868471708962, + "learning_rate": 7.978910227311225e-06, + "loss": 2.6152, + "step": 53862 + }, + { + "epoch": 2.507717019344926, + "grad_norm": 0.3146786147777704, + "learning_rate": 7.977442335676478e-06, + "loss": 2.6599, + "step": 53863 + }, + { + "epoch": 2.507763577531019, + "grad_norm": 0.32076945096792364, + "learning_rate": 7.975974567373795e-06, + "loss": 2.6574, + "step": 53864 + }, + { + "epoch": 2.5078101357171123, + "grad_norm": 0.3149526517758228, + "learning_rate": 7.974506922407498e-06, + "loss": 2.5227, + "step": 53865 + }, + { + "epoch": 2.5078566939032054, + "grad_norm": 0.3171256020912553, + "learning_rate": 7.973039400781874e-06, + "loss": 2.6948, + "step": 53866 + }, + { + "epoch": 2.5079032520892985, + "grad_norm": 0.32994207617678484, + "learning_rate": 7.971572002501238e-06, + "loss": 2.6023, + "step": 53867 + }, + { + "epoch": 2.5079498102753917, + "grad_norm": 0.32437011084720263, + "learning_rate": 7.970104727569893e-06, + "loss": 2.6174, + "step": 53868 + }, + { + "epoch": 2.5079963684614848, + "grad_norm": 0.3419538076980502, + "learning_rate": 7.968637575992155e-06, + "loss": 2.6972, + "step": 53869 + }, + { + "epoch": 2.508042926647578, + "grad_norm": 0.31348770243445634, + "learning_rate": 7.967170547772324e-06, + "loss": 2.6836, + "step": 53870 + }, + { + "epoch": 2.508089484833671, + "grad_norm": 0.31672163672403564, + "learning_rate": 7.965703642914718e-06, + "loss": 2.5917, + "step": 53871 + }, + { + "epoch": 2.508136043019764, + "grad_norm": 0.32759780033389085, + "learning_rate": 7.964236861423603e-06, + "loss": 2.5772, + "step": 53872 + }, + { + "epoch": 2.5081826012058572, + "grad_norm": 0.3194259444033594, + "learning_rate": 7.962770203303338e-06, + "loss": 2.6716, + "step": 53873 + }, + { + "epoch": 2.5082291593919503, + "grad_norm": 0.32303704113038995, + "learning_rate": 7.96130366855819e-06, + "loss": 2.6337, + "step": 53874 + }, + { + "epoch": 2.508275717578043, + "grad_norm": 0.3383807226761916, + "learning_rate": 7.959837257192477e-06, + "loss": 2.5899, + "step": 53875 + }, + { + "epoch": 2.508322275764136, + "grad_norm": 0.31527740829696094, + "learning_rate": 7.958370969210499e-06, + "loss": 2.5597, + "step": 53876 + }, + { + "epoch": 2.5083688339502292, + "grad_norm": 0.3608504525146019, + "learning_rate": 7.956904804616555e-06, + "loss": 2.6483, + "step": 53877 + }, + { + "epoch": 2.5084153921363224, + "grad_norm": 0.311003264623258, + "learning_rate": 7.955438763414968e-06, + "loss": 2.6152, + "step": 53878 + }, + { + "epoch": 2.5084619503224155, + "grad_norm": 0.3507013553012291, + "learning_rate": 7.953972845610019e-06, + "loss": 2.5817, + "step": 53879 + }, + { + "epoch": 2.5085085085085086, + "grad_norm": 0.32552530252081957, + "learning_rate": 7.952507051206014e-06, + "loss": 2.5278, + "step": 53880 + }, + { + "epoch": 2.5085550666946017, + "grad_norm": 0.3178182775860198, + "learning_rate": 7.951041380207258e-06, + "loss": 2.6386, + "step": 53881 + }, + { + "epoch": 2.5086016248806944, + "grad_norm": 0.30001843993233673, + "learning_rate": 7.94957583261805e-06, + "loss": 2.5546, + "step": 53882 + }, + { + "epoch": 2.5086481830667875, + "grad_norm": 0.30733501421857146, + "learning_rate": 7.948110408442699e-06, + "loss": 2.6588, + "step": 53883 + }, + { + "epoch": 2.5086947412528806, + "grad_norm": 0.30774007767274286, + "learning_rate": 7.946645107685514e-06, + "loss": 2.6024, 
+ "step": 53884 + }, + { + "epoch": 2.5087412994389737, + "grad_norm": 0.31790799492738264, + "learning_rate": 7.945179930350754e-06, + "loss": 2.5294, + "step": 53885 + }, + { + "epoch": 2.508787857625067, + "grad_norm": 0.3048824654675715, + "learning_rate": 7.943714876442776e-06, + "loss": 2.5972, + "step": 53886 + }, + { + "epoch": 2.50883441581116, + "grad_norm": 0.31318559683402075, + "learning_rate": 7.942249945965824e-06, + "loss": 2.651, + "step": 53887 + }, + { + "epoch": 2.508880973997253, + "grad_norm": 0.30362431850079813, + "learning_rate": 7.94078513892425e-06, + "loss": 2.5852, + "step": 53888 + }, + { + "epoch": 2.508927532183346, + "grad_norm": 0.3129601332306355, + "learning_rate": 7.939320455322319e-06, + "loss": 2.5664, + "step": 53889 + }, + { + "epoch": 2.5089740903694393, + "grad_norm": 0.30655146478427725, + "learning_rate": 7.937855895164337e-06, + "loss": 2.575, + "step": 53890 + }, + { + "epoch": 2.5090206485555324, + "grad_norm": 0.30132723140022705, + "learning_rate": 7.936391458454617e-06, + "loss": 2.5607, + "step": 53891 + }, + { + "epoch": 2.5090672067416255, + "grad_norm": 0.33151716525641073, + "learning_rate": 7.934927145197429e-06, + "loss": 2.7478, + "step": 53892 + }, + { + "epoch": 2.5091137649277186, + "grad_norm": 0.31866694991295275, + "learning_rate": 7.933462955397092e-06, + "loss": 2.6306, + "step": 53893 + }, + { + "epoch": 2.5091603231138113, + "grad_norm": 0.31160550808043397, + "learning_rate": 7.93199888905789e-06, + "loss": 2.6185, + "step": 53894 + }, + { + "epoch": 2.5092068812999044, + "grad_norm": 0.3182413023652695, + "learning_rate": 7.930534946184131e-06, + "loss": 2.6421, + "step": 53895 + }, + { + "epoch": 2.5092534394859975, + "grad_norm": 0.2987471678042234, + "learning_rate": 7.929071126780108e-06, + "loss": 2.6034, + "step": 53896 + }, + { + "epoch": 2.5092999976720907, + "grad_norm": 0.3065545939302508, + "learning_rate": 7.927607430850126e-06, + "loss": 2.6489, + "step": 53897 + }, + { + "epoch": 2.509346555858184, + "grad_norm": 0.310110359059789, + "learning_rate": 7.926143858398443e-06, + "loss": 2.6289, + "step": 53898 + }, + { + "epoch": 2.509393114044277, + "grad_norm": 0.2966879393968979, + "learning_rate": 7.924680409429414e-06, + "loss": 2.5521, + "step": 53899 + }, + { + "epoch": 2.50943967223037, + "grad_norm": 0.3033658390991143, + "learning_rate": 7.923217083947276e-06, + "loss": 2.642, + "step": 53900 + }, + { + "epoch": 2.509486230416463, + "grad_norm": 0.30053189336682146, + "learning_rate": 7.921753881956379e-06, + "loss": 2.6119, + "step": 53901 + }, + { + "epoch": 2.509532788602556, + "grad_norm": 0.3173930506461256, + "learning_rate": 7.920290803460973e-06, + "loss": 2.6465, + "step": 53902 + }, + { + "epoch": 2.509579346788649, + "grad_norm": 0.31791904228777446, + "learning_rate": 7.918827848465365e-06, + "loss": 2.6309, + "step": 53903 + }, + { + "epoch": 2.509625904974742, + "grad_norm": 0.3042905106746833, + "learning_rate": 7.917365016973865e-06, + "loss": 2.6343, + "step": 53904 + }, + { + "epoch": 2.509672463160835, + "grad_norm": 0.3062779515166556, + "learning_rate": 7.915902308990737e-06, + "loss": 2.6319, + "step": 53905 + }, + { + "epoch": 2.5097190213469283, + "grad_norm": 0.29371774998863187, + "learning_rate": 7.914439724520306e-06, + "loss": 2.6178, + "step": 53906 + }, + { + "epoch": 2.5097655795330214, + "grad_norm": 0.31906232812421814, + "learning_rate": 7.912977263566835e-06, + "loss": 2.6698, + "step": 53907 + }, + { + "epoch": 2.5098121377191145, + "grad_norm": 
0.3222167601901447, + "learning_rate": 7.911514926134634e-06, + "loss": 2.6277, + "step": 53908 + }, + { + "epoch": 2.5098586959052076, + "grad_norm": 0.3283516831479961, + "learning_rate": 7.910052712227989e-06, + "loss": 2.5153, + "step": 53909 + }, + { + "epoch": 2.5099052540913007, + "grad_norm": 0.3221981371250845, + "learning_rate": 7.908590621851203e-06, + "loss": 2.642, + "step": 53910 + }, + { + "epoch": 2.509951812277394, + "grad_norm": 0.28987055601199735, + "learning_rate": 7.907128655008533e-06, + "loss": 2.5845, + "step": 53911 + }, + { + "epoch": 2.509998370463487, + "grad_norm": 0.3261901860656032, + "learning_rate": 7.905666811704321e-06, + "loss": 2.6015, + "step": 53912 + }, + { + "epoch": 2.51004492864958, + "grad_norm": 0.31792927304600105, + "learning_rate": 7.9042050919428e-06, + "loss": 2.6568, + "step": 53913 + }, + { + "epoch": 2.5100914868356727, + "grad_norm": 0.35073670434312154, + "learning_rate": 7.90274349572832e-06, + "loss": 2.6902, + "step": 53914 + }, + { + "epoch": 2.510138045021766, + "grad_norm": 0.30582631119142434, + "learning_rate": 7.901282023065126e-06, + "loss": 2.6287, + "step": 53915 + }, + { + "epoch": 2.510184603207859, + "grad_norm": 0.2938843561034341, + "learning_rate": 7.89982067395752e-06, + "loss": 2.7148, + "step": 53916 + }, + { + "epoch": 2.510231161393952, + "grad_norm": 0.31195769494010644, + "learning_rate": 7.89835944840981e-06, + "loss": 2.5555, + "step": 53917 + }, + { + "epoch": 2.510277719580045, + "grad_norm": 0.320374361788317, + "learning_rate": 7.896898346426235e-06, + "loss": 2.6427, + "step": 53918 + }, + { + "epoch": 2.5103242777661383, + "grad_norm": 0.3199825119128038, + "learning_rate": 7.895437368011143e-06, + "loss": 2.624, + "step": 53919 + }, + { + "epoch": 2.5103708359522314, + "grad_norm": 0.29207198549684876, + "learning_rate": 7.893976513168787e-06, + "loss": 2.53, + "step": 53920 + }, + { + "epoch": 2.510417394138324, + "grad_norm": 0.3251371917699533, + "learning_rate": 7.89251578190346e-06, + "loss": 2.662, + "step": 53921 + }, + { + "epoch": 2.510463952324417, + "grad_norm": 0.3122622121913828, + "learning_rate": 7.891055174219448e-06, + "loss": 2.7442, + "step": 53922 + }, + { + "epoch": 2.5105105105105103, + "grad_norm": 0.33310102892819354, + "learning_rate": 7.889594690121044e-06, + "loss": 2.7092, + "step": 53923 + }, + { + "epoch": 2.5105570686966034, + "grad_norm": 0.3099910001255441, + "learning_rate": 7.888134329612529e-06, + "loss": 2.6353, + "step": 53924 + }, + { + "epoch": 2.5106036268826966, + "grad_norm": 0.315871416255194, + "learning_rate": 7.886674092698199e-06, + "loss": 2.6001, + "step": 53925 + }, + { + "epoch": 2.5106501850687897, + "grad_norm": 0.3162565342925492, + "learning_rate": 7.885213979382317e-06, + "loss": 2.5889, + "step": 53926 + }, + { + "epoch": 2.510696743254883, + "grad_norm": 0.3134700729672839, + "learning_rate": 7.883753989669201e-06, + "loss": 2.5771, + "step": 53927 + }, + { + "epoch": 2.510743301440976, + "grad_norm": 0.31030903525230524, + "learning_rate": 7.882294123563106e-06, + "loss": 2.6314, + "step": 53928 + }, + { + "epoch": 2.510789859627069, + "grad_norm": 0.2959308636525586, + "learning_rate": 7.880834381068331e-06, + "loss": 2.6384, + "step": 53929 + }, + { + "epoch": 2.510836417813162, + "grad_norm": 0.31840680379079267, + "learning_rate": 7.879374762189162e-06, + "loss": 2.6809, + "step": 53930 + }, + { + "epoch": 2.5108829759992553, + "grad_norm": 0.3256629004402309, + "learning_rate": 7.877915266929864e-06, + "loss": 2.5753, + "step": 53931 + 
}, + { + "epoch": 2.5109295341853484, + "grad_norm": 0.3104267639320639, + "learning_rate": 7.876455895294754e-06, + "loss": 2.5236, + "step": 53932 + }, + { + "epoch": 2.5109760923714415, + "grad_norm": 0.29494527335055126, + "learning_rate": 7.874996647288085e-06, + "loss": 2.6214, + "step": 53933 + }, + { + "epoch": 2.511022650557534, + "grad_norm": 0.325011427058414, + "learning_rate": 7.873537522914153e-06, + "loss": 2.583, + "step": 53934 + }, + { + "epoch": 2.5110692087436273, + "grad_norm": 0.31542891027160613, + "learning_rate": 7.872078522177233e-06, + "loss": 2.6516, + "step": 53935 + }, + { + "epoch": 2.5111157669297204, + "grad_norm": 0.3426953810717319, + "learning_rate": 7.870619645081616e-06, + "loss": 2.7237, + "step": 53936 + }, + { + "epoch": 2.5111623251158135, + "grad_norm": 0.30676150424813586, + "learning_rate": 7.869160891631583e-06, + "loss": 2.5573, + "step": 53937 + }, + { + "epoch": 2.5112088833019066, + "grad_norm": 0.33180591856539593, + "learning_rate": 7.867702261831423e-06, + "loss": 2.6326, + "step": 53938 + }, + { + "epoch": 2.5112554414879997, + "grad_norm": 0.3266741364068259, + "learning_rate": 7.866243755685376e-06, + "loss": 2.6951, + "step": 53939 + }, + { + "epoch": 2.511301999674093, + "grad_norm": 0.33774874928394255, + "learning_rate": 7.86478537319778e-06, + "loss": 2.6765, + "step": 53940 + }, + { + "epoch": 2.5113485578601855, + "grad_norm": 0.30669846590819927, + "learning_rate": 7.863327114372875e-06, + "loss": 2.6758, + "step": 53941 + }, + { + "epoch": 2.5113951160462786, + "grad_norm": 0.3127238688660267, + "learning_rate": 7.861868979214959e-06, + "loss": 2.6951, + "step": 53942 + }, + { + "epoch": 2.5114416742323717, + "grad_norm": 0.3178943823905149, + "learning_rate": 7.860410967728316e-06, + "loss": 2.584, + "step": 53943 + }, + { + "epoch": 2.511488232418465, + "grad_norm": 0.33500748211915204, + "learning_rate": 7.858953079917186e-06, + "loss": 2.5746, + "step": 53944 + }, + { + "epoch": 2.511534790604558, + "grad_norm": 0.32207313517444663, + "learning_rate": 7.857495315785906e-06, + "loss": 2.69, + "step": 53945 + }, + { + "epoch": 2.511581348790651, + "grad_norm": 0.33064102337152707, + "learning_rate": 7.856037675338712e-06, + "loss": 2.7487, + "step": 53946 + }, + { + "epoch": 2.511627906976744, + "grad_norm": 0.3063334682661151, + "learning_rate": 7.8545801585799e-06, + "loss": 2.588, + "step": 53947 + }, + { + "epoch": 2.5116744651628373, + "grad_norm": 0.33747659635729366, + "learning_rate": 7.853122765513737e-06, + "loss": 2.6014, + "step": 53948 + }, + { + "epoch": 2.5117210233489304, + "grad_norm": 0.32476455002592647, + "learning_rate": 7.851665496144511e-06, + "loss": 2.6004, + "step": 53949 + }, + { + "epoch": 2.5117675815350236, + "grad_norm": 0.2977612533058303, + "learning_rate": 7.85020835047649e-06, + "loss": 2.5719, + "step": 53950 + }, + { + "epoch": 2.5118141397211167, + "grad_norm": 0.31538940962827483, + "learning_rate": 7.848751328513971e-06, + "loss": 2.7104, + "step": 53951 + }, + { + "epoch": 2.51186069790721, + "grad_norm": 0.312571920820597, + "learning_rate": 7.847294430261187e-06, + "loss": 2.6112, + "step": 53952 + }, + { + "epoch": 2.5119072560933025, + "grad_norm": 0.29826264921240153, + "learning_rate": 7.845837655722465e-06, + "loss": 2.6594, + "step": 53953 + }, + { + "epoch": 2.5119538142793956, + "grad_norm": 0.30789654086174256, + "learning_rate": 7.844381004902051e-06, + "loss": 2.6659, + "step": 53954 + }, + { + "epoch": 2.5120003724654887, + "grad_norm": 0.29627912239701226, + 
"learning_rate": 7.84292447780422e-06, + "loss": 2.5906, + "step": 53955 + }, + { + "epoch": 2.512046930651582, + "grad_norm": 0.3017408163195122, + "learning_rate": 7.841468074433268e-06, + "loss": 2.6163, + "step": 53956 + }, + { + "epoch": 2.512093488837675, + "grad_norm": 0.31559091728586974, + "learning_rate": 7.840011794793423e-06, + "loss": 2.679, + "step": 53957 + }, + { + "epoch": 2.512140047023768, + "grad_norm": 0.31091705889174986, + "learning_rate": 7.838555638889022e-06, + "loss": 2.7277, + "step": 53958 + }, + { + "epoch": 2.512186605209861, + "grad_norm": 0.3029865035860582, + "learning_rate": 7.83709960672428e-06, + "loss": 2.6199, + "step": 53959 + }, + { + "epoch": 2.512233163395954, + "grad_norm": 0.3155968983728392, + "learning_rate": 7.835643698303519e-06, + "loss": 2.7021, + "step": 53960 + }, + { + "epoch": 2.512279721582047, + "grad_norm": 0.311493309321625, + "learning_rate": 7.834187913630975e-06, + "loss": 2.6588, + "step": 53961 + }, + { + "epoch": 2.51232627976814, + "grad_norm": 0.3187225607999686, + "learning_rate": 7.83273225271094e-06, + "loss": 2.6494, + "step": 53962 + }, + { + "epoch": 2.512372837954233, + "grad_norm": 0.3189032593852703, + "learning_rate": 7.83127671554768e-06, + "loss": 2.5916, + "step": 53963 + }, + { + "epoch": 2.5124193961403263, + "grad_norm": 0.2969095724575917, + "learning_rate": 7.829821302145485e-06, + "loss": 2.6381, + "step": 53964 + }, + { + "epoch": 2.5124659543264194, + "grad_norm": 0.32207474644156475, + "learning_rate": 7.828366012508581e-06, + "loss": 2.6551, + "step": 53965 + }, + { + "epoch": 2.5125125125125125, + "grad_norm": 0.31823849563903017, + "learning_rate": 7.826910846641295e-06, + "loss": 2.5346, + "step": 53966 + }, + { + "epoch": 2.5125590706986056, + "grad_norm": 0.3287168442699343, + "learning_rate": 7.825455804547855e-06, + "loss": 2.6011, + "step": 53967 + }, + { + "epoch": 2.5126056288846987, + "grad_norm": 0.3154677556450993, + "learning_rate": 7.824000886232551e-06, + "loss": 2.5611, + "step": 53968 + }, + { + "epoch": 2.512652187070792, + "grad_norm": 0.29475626484457645, + "learning_rate": 7.82254609169965e-06, + "loss": 2.5388, + "step": 53969 + }, + { + "epoch": 2.512698745256885, + "grad_norm": 0.3291561460063933, + "learning_rate": 7.821091420953419e-06, + "loss": 2.6533, + "step": 53970 + }, + { + "epoch": 2.512745303442978, + "grad_norm": 0.30443476701696215, + "learning_rate": 7.819636873998144e-06, + "loss": 2.5805, + "step": 53971 + }, + { + "epoch": 2.512791861629071, + "grad_norm": 0.32747630034603176, + "learning_rate": 7.818182450838051e-06, + "loss": 2.5537, + "step": 53972 + }, + { + "epoch": 2.512838419815164, + "grad_norm": 0.32079313588471864, + "learning_rate": 7.816728151477464e-06, + "loss": 2.4669, + "step": 53973 + }, + { + "epoch": 2.512884978001257, + "grad_norm": 0.30934124487981085, + "learning_rate": 7.815273975920612e-06, + "loss": 2.7282, + "step": 53974 + }, + { + "epoch": 2.51293153618735, + "grad_norm": 0.31750266061401305, + "learning_rate": 7.813819924171772e-06, + "loss": 2.6063, + "step": 53975 + }, + { + "epoch": 2.512978094373443, + "grad_norm": 0.3179101170317614, + "learning_rate": 7.812365996235217e-06, + "loss": 2.627, + "step": 53976 + }, + { + "epoch": 2.5130246525595363, + "grad_norm": 0.3055659792459392, + "learning_rate": 7.81091219211521e-06, + "loss": 2.5733, + "step": 53977 + }, + { + "epoch": 2.5130712107456294, + "grad_norm": 0.31180713796566406, + "learning_rate": 7.80945851181602e-06, + "loss": 2.6425, + "step": 53978 + }, + { + "epoch": 
2.5131177689317226, + "grad_norm": 0.30613685658327494, + "learning_rate": 7.808004955341925e-06, + "loss": 2.6515, + "step": 53979 + }, + { + "epoch": 2.5131643271178152, + "grad_norm": 0.32059036114077993, + "learning_rate": 7.806551522697169e-06, + "loss": 2.6493, + "step": 53980 + }, + { + "epoch": 2.5132108853039083, + "grad_norm": 0.3188424984138678, + "learning_rate": 7.805098213886025e-06, + "loss": 2.6411, + "step": 53981 + }, + { + "epoch": 2.5132574434900015, + "grad_norm": 0.2950979476384588, + "learning_rate": 7.803645028912766e-06, + "loss": 2.5621, + "step": 53982 + }, + { + "epoch": 2.5133040016760946, + "grad_norm": 0.3192137408050583, + "learning_rate": 7.802191967781641e-06, + "loss": 2.6059, + "step": 53983 + }, + { + "epoch": 2.5133505598621877, + "grad_norm": 0.32768920994984685, + "learning_rate": 7.800739030496945e-06, + "loss": 2.6022, + "step": 53984 + }, + { + "epoch": 2.513397118048281, + "grad_norm": 0.33498667582825464, + "learning_rate": 7.799286217062901e-06, + "loss": 2.7256, + "step": 53985 + }, + { + "epoch": 2.513443676234374, + "grad_norm": 0.29510477239256877, + "learning_rate": 7.79783352748381e-06, + "loss": 2.6342, + "step": 53986 + }, + { + "epoch": 2.513490234420467, + "grad_norm": 0.3005380096359848, + "learning_rate": 7.796380961763916e-06, + "loss": 2.5814, + "step": 53987 + }, + { + "epoch": 2.51353679260656, + "grad_norm": 0.30578448517767814, + "learning_rate": 7.794928519907485e-06, + "loss": 2.6408, + "step": 53988 + }, + { + "epoch": 2.5135833507926533, + "grad_norm": 0.3046722325579396, + "learning_rate": 7.793476201918775e-06, + "loss": 2.6552, + "step": 53989 + }, + { + "epoch": 2.5136299089787464, + "grad_norm": 0.3259381928107474, + "learning_rate": 7.792024007802063e-06, + "loss": 2.7089, + "step": 53990 + }, + { + "epoch": 2.5136764671648395, + "grad_norm": 0.3297579139871575, + "learning_rate": 7.790571937561598e-06, + "loss": 2.6424, + "step": 53991 + }, + { + "epoch": 2.513723025350932, + "grad_norm": 0.3125059269378529, + "learning_rate": 7.789119991201655e-06, + "loss": 2.6634, + "step": 53992 + }, + { + "epoch": 2.5137695835370253, + "grad_norm": 0.31156285777293313, + "learning_rate": 7.787668168726475e-06, + "loss": 2.7042, + "step": 53993 + }, + { + "epoch": 2.5138161417231184, + "grad_norm": 0.3297462733736671, + "learning_rate": 7.786216470140334e-06, + "loss": 2.5852, + "step": 53994 + }, + { + "epoch": 2.5138626999092115, + "grad_norm": 0.3279495211657475, + "learning_rate": 7.784764895447489e-06, + "loss": 2.6543, + "step": 53995 + }, + { + "epoch": 2.5139092580953046, + "grad_norm": 0.3128464756481358, + "learning_rate": 7.783313444652196e-06, + "loss": 2.6058, + "step": 53996 + }, + { + "epoch": 2.5139558162813977, + "grad_norm": 0.30773188898545256, + "learning_rate": 7.781862117758737e-06, + "loss": 2.5139, + "step": 53997 + }, + { + "epoch": 2.514002374467491, + "grad_norm": 0.3337763126961074, + "learning_rate": 7.780410914771324e-06, + "loss": 2.7053, + "step": 53998 + }, + { + "epoch": 2.514048932653584, + "grad_norm": 0.30298971468611485, + "learning_rate": 7.778959835694272e-06, + "loss": 2.5565, + "step": 53999 + }, + { + "epoch": 2.5140954908396766, + "grad_norm": 0.3151100253492101, + "learning_rate": 7.777508880531803e-06, + "loss": 2.6172, + "step": 54000 + }, + { + "epoch": 2.5141420490257698, + "grad_norm": 0.32243564689546755, + "learning_rate": 7.776058049288182e-06, + "loss": 2.6615, + "step": 54001 + }, + { + "epoch": 2.514188607211863, + "grad_norm": 0.3097390195812791, + "learning_rate": 
7.774607341967671e-06, + "loss": 2.6184, + "step": 54002 + }, + { + "epoch": 2.514235165397956, + "grad_norm": 0.31281695102718, + "learning_rate": 7.773156758574528e-06, + "loss": 2.6781, + "step": 54003 + }, + { + "epoch": 2.514281723584049, + "grad_norm": 0.3108112934716836, + "learning_rate": 7.771706299113013e-06, + "loss": 2.6237, + "step": 54004 + }, + { + "epoch": 2.5143282817701422, + "grad_norm": 0.3312763263939134, + "learning_rate": 7.770255963587392e-06, + "loss": 2.5403, + "step": 54005 + }, + { + "epoch": 2.5143748399562353, + "grad_norm": 0.3114681164900929, + "learning_rate": 7.768805752001895e-06, + "loss": 2.551, + "step": 54006 + }, + { + "epoch": 2.5144213981423285, + "grad_norm": 0.3035357488617775, + "learning_rate": 7.767355664360793e-06, + "loss": 2.6997, + "step": 54007 + }, + { + "epoch": 2.5144679563284216, + "grad_norm": 0.31313984524436567, + "learning_rate": 7.76590570066834e-06, + "loss": 2.621, + "step": 54008 + }, + { + "epoch": 2.5145145145145147, + "grad_norm": 0.29772070952544466, + "learning_rate": 7.764455860928794e-06, + "loss": 2.5502, + "step": 54009 + }, + { + "epoch": 2.514561072700608, + "grad_norm": 0.3127436013333007, + "learning_rate": 7.763006145146424e-06, + "loss": 2.7321, + "step": 54010 + }, + { + "epoch": 2.514607630886701, + "grad_norm": 0.3014741103441947, + "learning_rate": 7.761556553325443e-06, + "loss": 2.6588, + "step": 54011 + }, + { + "epoch": 2.5146541890727936, + "grad_norm": 0.30398559687098015, + "learning_rate": 7.760107085470159e-06, + "loss": 2.6515, + "step": 54012 + }, + { + "epoch": 2.5147007472588867, + "grad_norm": 0.293948922061235, + "learning_rate": 7.758657741584774e-06, + "loss": 2.6249, + "step": 54013 + }, + { + "epoch": 2.51474730544498, + "grad_norm": 0.2952355931542062, + "learning_rate": 7.757208521673587e-06, + "loss": 2.614, + "step": 54014 + }, + { + "epoch": 2.514793863631073, + "grad_norm": 0.3178328527290078, + "learning_rate": 7.755759425740822e-06, + "loss": 2.64, + "step": 54015 + }, + { + "epoch": 2.514840421817166, + "grad_norm": 0.3191712795588295, + "learning_rate": 7.754310453790748e-06, + "loss": 2.6178, + "step": 54016 + }, + { + "epoch": 2.514886980003259, + "grad_norm": 0.30863645224051217, + "learning_rate": 7.7528616058276e-06, + "loss": 2.636, + "step": 54017 + }, + { + "epoch": 2.5149335381893523, + "grad_norm": 0.2975141013712824, + "learning_rate": 7.751412881855657e-06, + "loss": 2.5865, + "step": 54018 + }, + { + "epoch": 2.514980096375445, + "grad_norm": 0.30433232395709464, + "learning_rate": 7.749964281879146e-06, + "loss": 2.5452, + "step": 54019 + }, + { + "epoch": 2.515026654561538, + "grad_norm": 0.3239585410004511, + "learning_rate": 7.748515805902328e-06, + "loss": 2.6679, + "step": 54020 + }, + { + "epoch": 2.515073212747631, + "grad_norm": 0.30421836745013353, + "learning_rate": 7.747067453929451e-06, + "loss": 2.6588, + "step": 54021 + }, + { + "epoch": 2.5151197709337243, + "grad_norm": 0.29038819657297593, + "learning_rate": 7.745619225964768e-06, + "loss": 2.6363, + "step": 54022 + }, + { + "epoch": 2.5151663291198174, + "grad_norm": 0.3120624812310386, + "learning_rate": 7.744171122012545e-06, + "loss": 2.6431, + "step": 54023 + }, + { + "epoch": 2.5152128873059105, + "grad_norm": 0.320408595139664, + "learning_rate": 7.742723142076991e-06, + "loss": 2.643, + "step": 54024 + }, + { + "epoch": 2.5152594454920036, + "grad_norm": 0.3017656080682824, + "learning_rate": 7.741275286162404e-06, + "loss": 2.5644, + "step": 54025 + }, + { + "epoch": 2.5153060036780968, + 
"grad_norm": 0.3042123602031834, + "learning_rate": 7.739827554272987e-06, + "loss": 2.5818, + "step": 54026 + }, + { + "epoch": 2.51535256186419, + "grad_norm": 0.3176608508424297, + "learning_rate": 7.738379946413037e-06, + "loss": 2.6909, + "step": 54027 + }, + { + "epoch": 2.515399120050283, + "grad_norm": 0.3201646672148303, + "learning_rate": 7.736932462586766e-06, + "loss": 2.5318, + "step": 54028 + }, + { + "epoch": 2.515445678236376, + "grad_norm": 0.3194393637188744, + "learning_rate": 7.73548510279844e-06, + "loss": 2.6131, + "step": 54029 + }, + { + "epoch": 2.515492236422469, + "grad_norm": 0.3092547600164635, + "learning_rate": 7.734037867052297e-06, + "loss": 2.6465, + "step": 54030 + }, + { + "epoch": 2.515538794608562, + "grad_norm": 0.34506315271322946, + "learning_rate": 7.732590755352587e-06, + "loss": 2.7116, + "step": 54031 + }, + { + "epoch": 2.515585352794655, + "grad_norm": 0.31135043131663376, + "learning_rate": 7.731143767703574e-06, + "loss": 2.6305, + "step": 54032 + }, + { + "epoch": 2.515631910980748, + "grad_norm": 0.31340707497647036, + "learning_rate": 7.729696904109479e-06, + "loss": 2.5419, + "step": 54033 + }, + { + "epoch": 2.5156784691668412, + "grad_norm": 0.30788168661487325, + "learning_rate": 7.728250164574558e-06, + "loss": 2.4598, + "step": 54034 + }, + { + "epoch": 2.5157250273529344, + "grad_norm": 0.34510447445174497, + "learning_rate": 7.726803549103057e-06, + "loss": 2.6599, + "step": 54035 + }, + { + "epoch": 2.5157715855390275, + "grad_norm": 0.3419936035973144, + "learning_rate": 7.725357057699233e-06, + "loss": 2.6046, + "step": 54036 + }, + { + "epoch": 2.5158181437251206, + "grad_norm": 0.32098408097089004, + "learning_rate": 7.723910690367303e-06, + "loss": 2.5848, + "step": 54037 + }, + { + "epoch": 2.5158647019112137, + "grad_norm": 0.31695918750041724, + "learning_rate": 7.72246444711155e-06, + "loss": 2.5935, + "step": 54038 + }, + { + "epoch": 2.5159112600973064, + "grad_norm": 0.3258081917031734, + "learning_rate": 7.721018327936175e-06, + "loss": 2.7428, + "step": 54039 + }, + { + "epoch": 2.5159578182833995, + "grad_norm": 0.36303727255831936, + "learning_rate": 7.71957233284547e-06, + "loss": 2.6087, + "step": 54040 + }, + { + "epoch": 2.5160043764694926, + "grad_norm": 0.3224358912923117, + "learning_rate": 7.718126461843645e-06, + "loss": 2.7052, + "step": 54041 + }, + { + "epoch": 2.5160509346555857, + "grad_norm": 0.3139755770949766, + "learning_rate": 7.716680714934954e-06, + "loss": 2.8082, + "step": 54042 + }, + { + "epoch": 2.516097492841679, + "grad_norm": 0.3003472667184968, + "learning_rate": 7.715235092123635e-06, + "loss": 2.6954, + "step": 54043 + }, + { + "epoch": 2.516144051027772, + "grad_norm": 0.3257938484793083, + "learning_rate": 7.71378959341394e-06, + "loss": 2.6482, + "step": 54044 + }, + { + "epoch": 2.516190609213865, + "grad_norm": 0.3352809011121904, + "learning_rate": 7.71234421881012e-06, + "loss": 2.5388, + "step": 54045 + }, + { + "epoch": 2.516237167399958, + "grad_norm": 0.2883384269125829, + "learning_rate": 7.710898968316389e-06, + "loss": 2.582, + "step": 54046 + }, + { + "epoch": 2.5162837255860513, + "grad_norm": 0.29878966524372863, + "learning_rate": 7.709453841936998e-06, + "loss": 2.65, + "step": 54047 + }, + { + "epoch": 2.5163302837721444, + "grad_norm": 0.321352309773543, + "learning_rate": 7.7080088396762e-06, + "loss": 2.6475, + "step": 54048 + }, + { + "epoch": 2.5163768419582375, + "grad_norm": 0.3480767216126049, + "learning_rate": 7.706563961538226e-06, + "loss": 2.6621, + 
"step": 54049 + }, + { + "epoch": 2.5164234001443306, + "grad_norm": 0.30224083974099897, + "learning_rate": 7.705119207527323e-06, + "loss": 2.6355, + "step": 54050 + }, + { + "epoch": 2.5164699583304233, + "grad_norm": 0.3299094566895323, + "learning_rate": 7.70367457764774e-06, + "loss": 2.6515, + "step": 54051 + }, + { + "epoch": 2.5165165165165164, + "grad_norm": 0.33109821726398203, + "learning_rate": 7.702230071903683e-06, + "loss": 2.5622, + "step": 54052 + }, + { + "epoch": 2.5165630747026095, + "grad_norm": 0.3103108279342671, + "learning_rate": 7.700785690299434e-06, + "loss": 2.7087, + "step": 54053 + }, + { + "epoch": 2.5166096328887027, + "grad_norm": 0.3176888849822779, + "learning_rate": 7.699341432839203e-06, + "loss": 2.7153, + "step": 54054 + }, + { + "epoch": 2.5166561910747958, + "grad_norm": 0.29743941276519287, + "learning_rate": 7.697897299527235e-06, + "loss": 2.6267, + "step": 54055 + }, + { + "epoch": 2.516702749260889, + "grad_norm": 0.32310601724255333, + "learning_rate": 7.696453290367773e-06, + "loss": 2.6602, + "step": 54056 + }, + { + "epoch": 2.516749307446982, + "grad_norm": 0.32804840384712874, + "learning_rate": 7.69500940536505e-06, + "loss": 2.7621, + "step": 54057 + }, + { + "epoch": 2.5167958656330747, + "grad_norm": 0.3273435304007073, + "learning_rate": 7.69356564452332e-06, + "loss": 2.6159, + "step": 54058 + }, + { + "epoch": 2.516842423819168, + "grad_norm": 0.3037434729318889, + "learning_rate": 7.692122007846797e-06, + "loss": 2.6417, + "step": 54059 + }, + { + "epoch": 2.516888982005261, + "grad_norm": 0.32372103031401334, + "learning_rate": 7.690678495339727e-06, + "loss": 2.5844, + "step": 54060 + }, + { + "epoch": 2.516935540191354, + "grad_norm": 0.31684113721146606, + "learning_rate": 7.689235107006348e-06, + "loss": 2.5164, + "step": 54061 + }, + { + "epoch": 2.516982098377447, + "grad_norm": 0.3201793381087976, + "learning_rate": 7.687791842850895e-06, + "loss": 2.5642, + "step": 54062 + }, + { + "epoch": 2.5170286565635402, + "grad_norm": 0.31410338251453723, + "learning_rate": 7.686348702877605e-06, + "loss": 2.6887, + "step": 54063 + }, + { + "epoch": 2.5170752147496334, + "grad_norm": 0.30879438881047, + "learning_rate": 7.684905687090721e-06, + "loss": 2.6119, + "step": 54064 + }, + { + "epoch": 2.5171217729357265, + "grad_norm": 0.32020749623779976, + "learning_rate": 7.683462795494451e-06, + "loss": 2.6066, + "step": 54065 + }, + { + "epoch": 2.5171683311218196, + "grad_norm": 0.28782977675852217, + "learning_rate": 7.682020028093073e-06, + "loss": 2.666, + "step": 54066 + }, + { + "epoch": 2.5172148893079127, + "grad_norm": 0.3241649463969342, + "learning_rate": 7.68057738489078e-06, + "loss": 2.6774, + "step": 54067 + }, + { + "epoch": 2.517261447494006, + "grad_norm": 0.3275136314196952, + "learning_rate": 7.67913486589183e-06, + "loss": 2.581, + "step": 54068 + }, + { + "epoch": 2.517308005680099, + "grad_norm": 0.31640688287573326, + "learning_rate": 7.677692471100444e-06, + "loss": 2.6593, + "step": 54069 + }, + { + "epoch": 2.5173545638661916, + "grad_norm": 0.30479025604385834, + "learning_rate": 7.676250200520862e-06, + "loss": 2.7341, + "step": 54070 + }, + { + "epoch": 2.5174011220522847, + "grad_norm": 0.3042896876081372, + "learning_rate": 7.674808054157333e-06, + "loss": 2.5575, + "step": 54071 + }, + { + "epoch": 2.517447680238378, + "grad_norm": 0.3281755938683687, + "learning_rate": 7.673366032014056e-06, + "loss": 2.6137, + "step": 54072 + }, + { + "epoch": 2.517494238424471, + "grad_norm": 0.2971361387955615, 
+ "learning_rate": 7.671924134095283e-06, + "loss": 2.6366, + "step": 54073 + }, + { + "epoch": 2.517540796610564, + "grad_norm": 0.30548495077316434, + "learning_rate": 7.67048236040524e-06, + "loss": 2.5091, + "step": 54074 + }, + { + "epoch": 2.517587354796657, + "grad_norm": 0.31634113163900435, + "learning_rate": 7.669040710948161e-06, + "loss": 2.6181, + "step": 54075 + }, + { + "epoch": 2.5176339129827503, + "grad_norm": 0.32842372988095003, + "learning_rate": 7.66759918572828e-06, + "loss": 2.6688, + "step": 54076 + }, + { + "epoch": 2.5176804711688434, + "grad_norm": 0.30972092782619254, + "learning_rate": 7.666157784749834e-06, + "loss": 2.6167, + "step": 54077 + }, + { + "epoch": 2.517727029354936, + "grad_norm": 0.30812845459523913, + "learning_rate": 7.664716508017023e-06, + "loss": 2.5965, + "step": 54078 + }, + { + "epoch": 2.517773587541029, + "grad_norm": 0.31239683405330126, + "learning_rate": 7.663275355534116e-06, + "loss": 2.5944, + "step": 54079 + }, + { + "epoch": 2.5178201457271223, + "grad_norm": 0.28984655765753664, + "learning_rate": 7.661834327305318e-06, + "loss": 2.6452, + "step": 54080 + }, + { + "epoch": 2.5178667039132154, + "grad_norm": 0.3049414380434368, + "learning_rate": 7.660393423334866e-06, + "loss": 2.6036, + "step": 54081 + }, + { + "epoch": 2.5179132620993085, + "grad_norm": 0.31671627483630727, + "learning_rate": 7.658952643626988e-06, + "loss": 2.6057, + "step": 54082 + }, + { + "epoch": 2.5179598202854017, + "grad_norm": 0.31758568791681663, + "learning_rate": 7.657511988185912e-06, + "loss": 2.626, + "step": 54083 + }, + { + "epoch": 2.5180063784714948, + "grad_norm": 0.3183842231964279, + "learning_rate": 7.656071457015878e-06, + "loss": 2.6292, + "step": 54084 + }, + { + "epoch": 2.518052936657588, + "grad_norm": 0.3044527585890509, + "learning_rate": 7.65463105012108e-06, + "loss": 2.6511, + "step": 54085 + }, + { + "epoch": 2.518099494843681, + "grad_norm": 0.31659863343038697, + "learning_rate": 7.65319076750579e-06, + "loss": 2.7591, + "step": 54086 + }, + { + "epoch": 2.518146053029774, + "grad_norm": 0.30969420407740017, + "learning_rate": 7.651750609174202e-06, + "loss": 2.6204, + "step": 54087 + }, + { + "epoch": 2.5181926112158672, + "grad_norm": 0.306555389385004, + "learning_rate": 7.650310575130554e-06, + "loss": 2.6662, + "step": 54088 + }, + { + "epoch": 2.5182391694019604, + "grad_norm": 0.3039046598074226, + "learning_rate": 7.648870665379076e-06, + "loss": 2.7028, + "step": 54089 + }, + { + "epoch": 2.518285727588053, + "grad_norm": 0.3006501237335638, + "learning_rate": 7.647430879924e-06, + "loss": 2.4644, + "step": 54090 + }, + { + "epoch": 2.518332285774146, + "grad_norm": 0.3149489138606096, + "learning_rate": 7.645991218769516e-06, + "loss": 2.5619, + "step": 54091 + }, + { + "epoch": 2.5183788439602393, + "grad_norm": 0.30083112837139786, + "learning_rate": 7.644551681919899e-06, + "loss": 2.5207, + "step": 54092 + }, + { + "epoch": 2.5184254021463324, + "grad_norm": 0.3233429981271652, + "learning_rate": 7.643112269379343e-06, + "loss": 2.6588, + "step": 54093 + }, + { + "epoch": 2.5184719603324255, + "grad_norm": 0.31832245995612474, + "learning_rate": 7.641672981152076e-06, + "loss": 2.6883, + "step": 54094 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 0.33583629959589784, + "learning_rate": 7.640233817242326e-06, + "loss": 2.7125, + "step": 54095 + }, + { + "epoch": 2.5185650767046117, + "grad_norm": 0.30849246038964373, + "learning_rate": 7.638794777654317e-06, + "loss": 2.6946, + "step": 54096 + }, + { 
+ "epoch": 2.5186116348907044, + "grad_norm": 0.3080526246463769, + "learning_rate": 7.637355862392281e-06, + "loss": 2.6441, + "step": 54097 + }, + { + "epoch": 2.5186581930767975, + "grad_norm": 0.3113319508198158, + "learning_rate": 7.635917071460413e-06, + "loss": 2.5981, + "step": 54098 + }, + { + "epoch": 2.5187047512628906, + "grad_norm": 0.31898380708213764, + "learning_rate": 7.634478404862976e-06, + "loss": 2.6024, + "step": 54099 + }, + { + "epoch": 2.5187513094489837, + "grad_norm": 0.31211120210633503, + "learning_rate": 7.633039862604159e-06, + "loss": 2.7343, + "step": 54100 + }, + { + "epoch": 2.518797867635077, + "grad_norm": 0.31696461297965045, + "learning_rate": 7.631601444688197e-06, + "loss": 2.5715, + "step": 54101 + }, + { + "epoch": 2.51884442582117, + "grad_norm": 0.3327733360842622, + "learning_rate": 7.630163151119313e-06, + "loss": 2.684, + "step": 54102 + }, + { + "epoch": 2.518890984007263, + "grad_norm": 0.3205117137518061, + "learning_rate": 7.628724981901724e-06, + "loss": 2.6571, + "step": 54103 + }, + { + "epoch": 2.518937542193356, + "grad_norm": 0.31264569085162974, + "learning_rate": 7.6272869370396515e-06, + "loss": 2.6183, + "step": 54104 + }, + { + "epoch": 2.5189841003794493, + "grad_norm": 0.31702118537611584, + "learning_rate": 7.625849016537329e-06, + "loss": 2.6369, + "step": 54105 + }, + { + "epoch": 2.5190306585655424, + "grad_norm": 0.31937837475649955, + "learning_rate": 7.624411220398958e-06, + "loss": 2.635, + "step": 54106 + }, + { + "epoch": 2.5190772167516355, + "grad_norm": 0.3222671059791725, + "learning_rate": 7.622973548628759e-06, + "loss": 2.6907, + "step": 54107 + }, + { + "epoch": 2.5191237749377287, + "grad_norm": 0.3010490443345299, + "learning_rate": 7.621536001230961e-06, + "loss": 2.6539, + "step": 54108 + }, + { + "epoch": 2.5191703331238218, + "grad_norm": 0.3156431925440218, + "learning_rate": 7.620098578209784e-06, + "loss": 2.5978, + "step": 54109 + }, + { + "epoch": 2.5192168913099144, + "grad_norm": 0.3165241819134932, + "learning_rate": 7.618661279569444e-06, + "loss": 2.6476, + "step": 54110 + }, + { + "epoch": 2.5192634494960076, + "grad_norm": 0.2989486072531701, + "learning_rate": 7.617224105314141e-06, + "loss": 2.6389, + "step": 54111 + }, + { + "epoch": 2.5193100076821007, + "grad_norm": 0.3058369966121407, + "learning_rate": 7.615787055448137e-06, + "loss": 2.574, + "step": 54112 + }, + { + "epoch": 2.519356565868194, + "grad_norm": 0.3128748148294367, + "learning_rate": 7.614350129975606e-06, + "loss": 2.5772, + "step": 54113 + }, + { + "epoch": 2.519403124054287, + "grad_norm": 0.30522185332321783, + "learning_rate": 7.612913328900784e-06, + "loss": 2.6357, + "step": 54114 + }, + { + "epoch": 2.51944968224038, + "grad_norm": 0.2965295791195204, + "learning_rate": 7.6114766522278835e-06, + "loss": 2.6257, + "step": 54115 + }, + { + "epoch": 2.519496240426473, + "grad_norm": 0.31548047891985875, + "learning_rate": 7.610040099961124e-06, + "loss": 2.6846, + "step": 54116 + }, + { + "epoch": 2.519542798612566, + "grad_norm": 0.3053236058898806, + "learning_rate": 7.60860367210472e-06, + "loss": 2.6705, + "step": 54117 + }, + { + "epoch": 2.519589356798659, + "grad_norm": 0.30916594085720245, + "learning_rate": 7.607167368662899e-06, + "loss": 2.7249, + "step": 54118 + }, + { + "epoch": 2.519635914984752, + "grad_norm": 0.3041134590963414, + "learning_rate": 7.605731189639853e-06, + "loss": 2.6026, + "step": 54119 + }, + { + "epoch": 2.519682473170845, + "grad_norm": 0.3126795875274499, + "learning_rate": 
7.604295135039813e-06, + "loss": 2.6388, + "step": 54120 + }, + { + "epoch": 2.5197290313569383, + "grad_norm": 0.3194428110505534, + "learning_rate": 7.602859204866985e-06, + "loss": 2.4979, + "step": 54121 + }, + { + "epoch": 2.5197755895430314, + "grad_norm": 0.3107771100251792, + "learning_rate": 7.60142339912559e-06, + "loss": 2.7375, + "step": 54122 + }, + { + "epoch": 2.5198221477291245, + "grad_norm": 0.3110772731632174, + "learning_rate": 7.599987717819856e-06, + "loss": 2.6713, + "step": 54123 + }, + { + "epoch": 2.5198687059152176, + "grad_norm": 0.30067580888814427, + "learning_rate": 7.5985521609539536e-06, + "loss": 2.6356, + "step": 54124 + }, + { + "epoch": 2.5199152641013107, + "grad_norm": 0.32316112728722124, + "learning_rate": 7.5971167285321495e-06, + "loss": 2.6864, + "step": 54125 + }, + { + "epoch": 2.519961822287404, + "grad_norm": 0.3211967910149582, + "learning_rate": 7.59568142055862e-06, + "loss": 2.6418, + "step": 54126 + }, + { + "epoch": 2.520008380473497, + "grad_norm": 0.300824360250577, + "learning_rate": 7.594246237037584e-06, + "loss": 2.651, + "step": 54127 + }, + { + "epoch": 2.52005493865959, + "grad_norm": 0.3205976273937744, + "learning_rate": 7.592811177973258e-06, + "loss": 2.6566, + "step": 54128 + }, + { + "epoch": 2.5201014968456827, + "grad_norm": 0.3030778930975447, + "learning_rate": 7.591376243369858e-06, + "loss": 2.5698, + "step": 54129 + }, + { + "epoch": 2.520148055031776, + "grad_norm": 0.32037763656122575, + "learning_rate": 7.5899414332315845e-06, + "loss": 2.5955, + "step": 54130 + }, + { + "epoch": 2.520194613217869, + "grad_norm": 0.30377126548934036, + "learning_rate": 7.588506747562668e-06, + "loss": 2.7168, + "step": 54131 + }, + { + "epoch": 2.520241171403962, + "grad_norm": 0.294208396202074, + "learning_rate": 7.587072186367294e-06, + "loss": 2.5995, + "step": 54132 + }, + { + "epoch": 2.520287729590055, + "grad_norm": 0.3133351377045406, + "learning_rate": 7.585637749649688e-06, + "loss": 2.7051, + "step": 54133 + }, + { + "epoch": 2.5203342877761483, + "grad_norm": 0.3126037923432727, + "learning_rate": 7.584203437414056e-06, + "loss": 2.5551, + "step": 54134 + }, + { + "epoch": 2.5203808459622414, + "grad_norm": 0.31017833365976377, + "learning_rate": 7.582769249664606e-06, + "loss": 2.6221, + "step": 54135 + }, + { + "epoch": 2.520427404148334, + "grad_norm": 0.3099451238256433, + "learning_rate": 7.581335186405564e-06, + "loss": 2.6867, + "step": 54136 + }, + { + "epoch": 2.520473962334427, + "grad_norm": 0.2966984140959244, + "learning_rate": 7.579901247641097e-06, + "loss": 2.6037, + "step": 54137 + }, + { + "epoch": 2.5205205205205203, + "grad_norm": 0.295750684924286, + "learning_rate": 7.578467433375469e-06, + "loss": 2.5674, + "step": 54138 + }, + { + "epoch": 2.5205670787066135, + "grad_norm": 0.3074494577119586, + "learning_rate": 7.577033743612832e-06, + "loss": 2.618, + "step": 54139 + }, + { + "epoch": 2.5206136368927066, + "grad_norm": 0.29998985978516096, + "learning_rate": 7.57560017835745e-06, + "loss": 2.6309, + "step": 54140 + }, + { + "epoch": 2.5206601950787997, + "grad_norm": 0.3044491071211894, + "learning_rate": 7.574166737613486e-06, + "loss": 2.5573, + "step": 54141 + }, + { + "epoch": 2.520706753264893, + "grad_norm": 0.31976842017545487, + "learning_rate": 7.572733421385164e-06, + "loss": 2.6427, + "step": 54142 + }, + { + "epoch": 2.520753311450986, + "grad_norm": 0.29652560644420567, + "learning_rate": 7.571300229676692e-06, + "loss": 2.5502, + "step": 54143 + }, + { + "epoch": 
2.520799869637079, + "grad_norm": 0.30947143746504946, + "learning_rate": 7.569867162492283e-06, + "loss": 2.691, + "step": 54144 + }, + { + "epoch": 2.520846427823172, + "grad_norm": 0.305738706007298, + "learning_rate": 7.568434219836123e-06, + "loss": 2.5677, + "step": 54145 + }, + { + "epoch": 2.5208929860092653, + "grad_norm": 0.3250225049415577, + "learning_rate": 7.567001401712426e-06, + "loss": 2.6974, + "step": 54146 + }, + { + "epoch": 2.5209395441953584, + "grad_norm": 0.3021709522601654, + "learning_rate": 7.565568708125404e-06, + "loss": 2.5808, + "step": 54147 + }, + { + "epoch": 2.5209861023814515, + "grad_norm": 0.2980749774293179, + "learning_rate": 7.5641361390792535e-06, + "loss": 2.5886, + "step": 54148 + }, + { + "epoch": 2.521032660567544, + "grad_norm": 0.3111491109732597, + "learning_rate": 7.562703694578194e-06, + "loss": 2.658, + "step": 54149 + }, + { + "epoch": 2.5210792187536373, + "grad_norm": 0.31069432779519485, + "learning_rate": 7.5612713746264e-06, + "loss": 2.614, + "step": 54150 + }, + { + "epoch": 2.5211257769397304, + "grad_norm": 0.29693258668815226, + "learning_rate": 7.559839179228112e-06, + "loss": 2.6261, + "step": 54151 + }, + { + "epoch": 2.5211723351258235, + "grad_norm": 0.29797907095719667, + "learning_rate": 7.558407108387494e-06, + "loss": 2.6639, + "step": 54152 + }, + { + "epoch": 2.5212188933119166, + "grad_norm": 0.2908228296915416, + "learning_rate": 7.556975162108792e-06, + "loss": 2.5419, + "step": 54153 + }, + { + "epoch": 2.5212654514980097, + "grad_norm": 0.3186376297327503, + "learning_rate": 7.555543340396176e-06, + "loss": 2.6601, + "step": 54154 + }, + { + "epoch": 2.521312009684103, + "grad_norm": 0.2953986032243986, + "learning_rate": 7.5541116432538585e-06, + "loss": 2.6397, + "step": 54155 + }, + { + "epoch": 2.5213585678701955, + "grad_norm": 0.3097687573163933, + "learning_rate": 7.552680070686041e-06, + "loss": 2.5546, + "step": 54156 + }, + { + "epoch": 2.5214051260562886, + "grad_norm": 0.3063837189929222, + "learning_rate": 7.551248622696927e-06, + "loss": 2.708, + "step": 54157 + }, + { + "epoch": 2.5214516842423818, + "grad_norm": 0.3166708023103996, + "learning_rate": 7.549817299290729e-06, + "loss": 2.6431, + "step": 54158 + }, + { + "epoch": 2.521498242428475, + "grad_norm": 0.29720726353965127, + "learning_rate": 7.548386100471622e-06, + "loss": 2.547, + "step": 54159 + }, + { + "epoch": 2.521544800614568, + "grad_norm": 0.3315703660597058, + "learning_rate": 7.5469550262438185e-06, + "loss": 2.6696, + "step": 54160 + }, + { + "epoch": 2.521591358800661, + "grad_norm": 0.3143815627466971, + "learning_rate": 7.545524076611521e-06, + "loss": 2.6811, + "step": 54161 + }, + { + "epoch": 2.521637916986754, + "grad_norm": 0.30668312306383966, + "learning_rate": 7.544093251578926e-06, + "loss": 2.5592, + "step": 54162 + }, + { + "epoch": 2.5216844751728473, + "grad_norm": 0.2940783761889525, + "learning_rate": 7.5426625511502365e-06, + "loss": 2.6075, + "step": 54163 + }, + { + "epoch": 2.5217310333589404, + "grad_norm": 0.31726423991979613, + "learning_rate": 7.54123197532966e-06, + "loss": 2.7438, + "step": 54164 + }, + { + "epoch": 2.5217775915450336, + "grad_norm": 0.32649607593784746, + "learning_rate": 7.539801524121365e-06, + "loss": 2.5837, + "step": 54165 + }, + { + "epoch": 2.5218241497311267, + "grad_norm": 0.3125136776522396, + "learning_rate": 7.538371197529592e-06, + "loss": 2.5437, + "step": 54166 + }, + { + "epoch": 2.52187070791722, + "grad_norm": 0.29144712407802387, + "learning_rate": 
7.5369409955585e-06, + "loss": 2.5328, + "step": 54167 + }, + { + "epoch": 2.5219172661033125, + "grad_norm": 0.32890068154156, + "learning_rate": 7.535510918212307e-06, + "loss": 2.6652, + "step": 54168 + }, + { + "epoch": 2.5219638242894056, + "grad_norm": 0.30645246524047887, + "learning_rate": 7.534080965495205e-06, + "loss": 2.663, + "step": 54169 + }, + { + "epoch": 2.5220103824754987, + "grad_norm": 0.3092245995351908, + "learning_rate": 7.5326511374113895e-06, + "loss": 2.6088, + "step": 54170 + }, + { + "epoch": 2.522056940661592, + "grad_norm": 0.28468525807292955, + "learning_rate": 7.531221433965075e-06, + "loss": 2.5699, + "step": 54171 + }, + { + "epoch": 2.522103498847685, + "grad_norm": 0.301870048594622, + "learning_rate": 7.529791855160428e-06, + "loss": 2.7014, + "step": 54172 + }, + { + "epoch": 2.522150057033778, + "grad_norm": 0.3025195813376171, + "learning_rate": 7.528362401001659e-06, + "loss": 2.6037, + "step": 54173 + }, + { + "epoch": 2.522196615219871, + "grad_norm": 0.3134179969598909, + "learning_rate": 7.526933071492959e-06, + "loss": 2.5166, + "step": 54174 + }, + { + "epoch": 2.5222431734059643, + "grad_norm": 0.30626286620088605, + "learning_rate": 7.525503866638528e-06, + "loss": 2.6321, + "step": 54175 + }, + { + "epoch": 2.522289731592057, + "grad_norm": 0.30513722648517955, + "learning_rate": 7.524074786442564e-06, + "loss": 2.6343, + "step": 54176 + }, + { + "epoch": 2.52233628977815, + "grad_norm": 0.31600462108685884, + "learning_rate": 7.522645830909259e-06, + "loss": 2.6673, + "step": 54177 + }, + { + "epoch": 2.522382847964243, + "grad_norm": 0.3036413214922428, + "learning_rate": 7.521217000042785e-06, + "loss": 2.5721, + "step": 54178 + }, + { + "epoch": 2.5224294061503363, + "grad_norm": 0.3107913558863607, + "learning_rate": 7.5197882938473796e-06, + "loss": 2.6448, + "step": 54179 + }, + { + "epoch": 2.5224759643364294, + "grad_norm": 0.3217752845533013, + "learning_rate": 7.518359712327183e-06, + "loss": 2.6516, + "step": 54180 + }, + { + "epoch": 2.5225225225225225, + "grad_norm": 0.3137423844670873, + "learning_rate": 7.516931255486437e-06, + "loss": 2.6224, + "step": 54181 + }, + { + "epoch": 2.5225690807086156, + "grad_norm": 0.29793490798857236, + "learning_rate": 7.5155029233293035e-06, + "loss": 2.6153, + "step": 54182 + }, + { + "epoch": 2.5226156388947087, + "grad_norm": 0.30112695962161584, + "learning_rate": 7.514074715859987e-06, + "loss": 2.6401, + "step": 54183 + }, + { + "epoch": 2.522662197080802, + "grad_norm": 0.2934380951574235, + "learning_rate": 7.512646633082687e-06, + "loss": 2.4846, + "step": 54184 + }, + { + "epoch": 2.522708755266895, + "grad_norm": 0.29135971945104255, + "learning_rate": 7.511218675001569e-06, + "loss": 2.5092, + "step": 54185 + }, + { + "epoch": 2.522755313452988, + "grad_norm": 0.2963298416442857, + "learning_rate": 7.509790841620834e-06, + "loss": 2.6181, + "step": 54186 + }, + { + "epoch": 2.522801871639081, + "grad_norm": 0.29789927333083116, + "learning_rate": 7.5083631329446845e-06, + "loss": 2.5133, + "step": 54187 + }, + { + "epoch": 2.522848429825174, + "grad_norm": 0.3139463974195225, + "learning_rate": 7.5069355489773005e-06, + "loss": 2.6361, + "step": 54188 + }, + { + "epoch": 2.522894988011267, + "grad_norm": 0.31297863758241784, + "learning_rate": 7.5055080897228736e-06, + "loss": 2.6275, + "step": 54189 + }, + { + "epoch": 2.52294154619736, + "grad_norm": 0.2997557642417739, + "learning_rate": 7.504080755185605e-06, + "loss": 2.6106, + "step": 54190 + }, + { + "epoch": 
2.5229881043834532, + "grad_norm": 0.3054605151040242, + "learning_rate": 7.502653545369653e-06, + "loss": 2.6159, + "step": 54191 + }, + { + "epoch": 2.5230346625695463, + "grad_norm": 0.3156085144902109, + "learning_rate": 7.501226460279242e-06, + "loss": 2.6167, + "step": 54192 + }, + { + "epoch": 2.5230812207556395, + "grad_norm": 0.3141319985110726, + "learning_rate": 7.499799499918531e-06, + "loss": 2.5981, + "step": 54193 + }, + { + "epoch": 2.5231277789417326, + "grad_norm": 0.3073910745606172, + "learning_rate": 7.498372664291736e-06, + "loss": 2.6489, + "step": 54194 + }, + { + "epoch": 2.5231743371278252, + "grad_norm": 0.30910953996745916, + "learning_rate": 7.496945953403023e-06, + "loss": 2.5937, + "step": 54195 + }, + { + "epoch": 2.5232208953139184, + "grad_norm": 0.3100896086841764, + "learning_rate": 7.495519367256587e-06, + "loss": 2.6299, + "step": 54196 + }, + { + "epoch": 2.5232674535000115, + "grad_norm": 0.32627178579401345, + "learning_rate": 7.4940929058566255e-06, + "loss": 2.5521, + "step": 54197 + }, + { + "epoch": 2.5233140116861046, + "grad_norm": 0.300372183568908, + "learning_rate": 7.492666569207291e-06, + "loss": 2.5731, + "step": 54198 + }, + { + "epoch": 2.5233605698721977, + "grad_norm": 0.2967771077665142, + "learning_rate": 7.491240357312812e-06, + "loss": 2.6029, + "step": 54199 + }, + { + "epoch": 2.523407128058291, + "grad_norm": 0.30211990932466776, + "learning_rate": 7.489814270177348e-06, + "loss": 2.6383, + "step": 54200 + }, + { + "epoch": 2.523453686244384, + "grad_norm": 0.29576686778504235, + "learning_rate": 7.488388307805089e-06, + "loss": 2.635, + "step": 54201 + }, + { + "epoch": 2.523500244430477, + "grad_norm": 0.29872319214455323, + "learning_rate": 7.486962470200221e-06, + "loss": 2.6868, + "step": 54202 + }, + { + "epoch": 2.52354680261657, + "grad_norm": 0.2898014211744458, + "learning_rate": 7.48553675736694e-06, + "loss": 2.5318, + "step": 54203 + }, + { + "epoch": 2.5235933608026633, + "grad_norm": 0.30003519187061517, + "learning_rate": 7.484111169309399e-06, + "loss": 2.6181, + "step": 54204 + }, + { + "epoch": 2.5236399189887564, + "grad_norm": 0.3136835481408019, + "learning_rate": 7.482685706031828e-06, + "loss": 2.5579, + "step": 54205 + }, + { + "epoch": 2.5236864771748495, + "grad_norm": 0.32264454267076836, + "learning_rate": 7.481260367538362e-06, + "loss": 2.6015, + "step": 54206 + }, + { + "epoch": 2.523733035360942, + "grad_norm": 0.2920785454572041, + "learning_rate": 7.479835153833231e-06, + "loss": 2.5879, + "step": 54207 + }, + { + "epoch": 2.5237795935470353, + "grad_norm": 0.3048374766185319, + "learning_rate": 7.478410064920582e-06, + "loss": 2.5475, + "step": 54208 + }, + { + "epoch": 2.5238261517331284, + "grad_norm": 0.29830093406789593, + "learning_rate": 7.476985100804612e-06, + "loss": 2.6294, + "step": 54209 + }, + { + "epoch": 2.5238727099192215, + "grad_norm": 0.294215509993369, + "learning_rate": 7.475560261489511e-06, + "loss": 2.6517, + "step": 54210 + }, + { + "epoch": 2.5239192681053146, + "grad_norm": 0.294579799789808, + "learning_rate": 7.474135546979433e-06, + "loss": 2.5649, + "step": 54211 + }, + { + "epoch": 2.5239658262914078, + "grad_norm": 0.29677868263273566, + "learning_rate": 7.4727109572786004e-06, + "loss": 2.6117, + "step": 54212 + }, + { + "epoch": 2.524012384477501, + "grad_norm": 0.3076794977219032, + "learning_rate": 7.471286492391155e-06, + "loss": 2.6801, + "step": 54213 + }, + { + "epoch": 2.524058942663594, + "grad_norm": 0.31916635104539787, + "learning_rate": 
7.469862152321294e-06, + "loss": 2.6676, + "step": 54214 + }, + { + "epoch": 2.5241055008496867, + "grad_norm": 0.30403243275599, + "learning_rate": 7.4684379370732025e-06, + "loss": 2.5709, + "step": 54215 + }, + { + "epoch": 2.5241520590357798, + "grad_norm": 0.29467083099147773, + "learning_rate": 7.46701384665105e-06, + "loss": 2.6724, + "step": 54216 + }, + { + "epoch": 2.524198617221873, + "grad_norm": 0.3017955394253983, + "learning_rate": 7.4655898810590265e-06, + "loss": 2.6462, + "step": 54217 + }, + { + "epoch": 2.524245175407966, + "grad_norm": 0.3107585924058443, + "learning_rate": 7.464166040301313e-06, + "loss": 2.6585, + "step": 54218 + }, + { + "epoch": 2.524291733594059, + "grad_norm": 0.33205997647284125, + "learning_rate": 7.462742324382066e-06, + "loss": 2.6082, + "step": 54219 + }, + { + "epoch": 2.5243382917801522, + "grad_norm": 0.3322551608196286, + "learning_rate": 7.461318733305494e-06, + "loss": 2.6273, + "step": 54220 + }, + { + "epoch": 2.5243848499662453, + "grad_norm": 0.3121972395349531, + "learning_rate": 7.459895267075756e-06, + "loss": 2.6871, + "step": 54221 + }, + { + "epoch": 2.5244314081523385, + "grad_norm": 0.288348665070813, + "learning_rate": 7.4584719256970306e-06, + "loss": 2.5395, + "step": 54222 + }, + { + "epoch": 2.5244779663384316, + "grad_norm": 0.3123167913428397, + "learning_rate": 7.457048709173514e-06, + "loss": 2.568, + "step": 54223 + }, + { + "epoch": 2.5245245245245247, + "grad_norm": 0.3213320644091246, + "learning_rate": 7.455625617509343e-06, + "loss": 2.7267, + "step": 54224 + }, + { + "epoch": 2.524571082710618, + "grad_norm": 0.3353733898748172, + "learning_rate": 7.454202650708741e-06, + "loss": 2.6923, + "step": 54225 + }, + { + "epoch": 2.524617640896711, + "grad_norm": 0.29866000744601096, + "learning_rate": 7.452779808775856e-06, + "loss": 2.6514, + "step": 54226 + }, + { + "epoch": 2.5246641990828036, + "grad_norm": 0.3109434338849921, + "learning_rate": 7.451357091714867e-06, + "loss": 2.6299, + "step": 54227 + }, + { + "epoch": 2.5247107572688967, + "grad_norm": 0.32645150544728174, + "learning_rate": 7.449934499529953e-06, + "loss": 2.6471, + "step": 54228 + }, + { + "epoch": 2.52475731545499, + "grad_norm": 0.29562944380465583, + "learning_rate": 7.448512032225291e-06, + "loss": 2.4853, + "step": 54229 + }, + { + "epoch": 2.524803873641083, + "grad_norm": 0.30857115223432274, + "learning_rate": 7.4470896898050536e-06, + "loss": 2.6711, + "step": 54230 + }, + { + "epoch": 2.524850431827176, + "grad_norm": 0.3243324445569901, + "learning_rate": 7.445667472273427e-06, + "loss": 2.6618, + "step": 54231 + }, + { + "epoch": 2.524896990013269, + "grad_norm": 0.3182383130357739, + "learning_rate": 7.4442453796345515e-06, + "loss": 2.6558, + "step": 54232 + }, + { + "epoch": 2.5249435481993623, + "grad_norm": 0.3020892207701671, + "learning_rate": 7.442823411892641e-06, + "loss": 2.7092, + "step": 54233 + }, + { + "epoch": 2.524990106385455, + "grad_norm": 0.3244674503353403, + "learning_rate": 7.441401569051848e-06, + "loss": 2.697, + "step": 54234 + }, + { + "epoch": 2.525036664571548, + "grad_norm": 0.30479623057888805, + "learning_rate": 7.4399798511163465e-06, + "loss": 2.5388, + "step": 54235 + }, + { + "epoch": 2.525083222757641, + "grad_norm": 0.3118363344509037, + "learning_rate": 7.438558258090323e-06, + "loss": 2.6462, + "step": 54236 + }, + { + "epoch": 2.5251297809437343, + "grad_norm": 0.31302021884958764, + "learning_rate": 7.437136789977911e-06, + "loss": 2.6613, + "step": 54237 + }, + { + "epoch": 
2.5251763391298274, + "grad_norm": 0.30098741655850364, + "learning_rate": 7.435715446783337e-06, + "loss": 2.5638, + "step": 54238 + }, + { + "epoch": 2.5252228973159205, + "grad_norm": 0.29158969835559234, + "learning_rate": 7.4342942285107295e-06, + "loss": 2.624, + "step": 54239 + }, + { + "epoch": 2.5252694555020136, + "grad_norm": 0.30566538381757696, + "learning_rate": 7.432873135164275e-06, + "loss": 2.613, + "step": 54240 + }, + { + "epoch": 2.5253160136881068, + "grad_norm": 0.3205071070467869, + "learning_rate": 7.431452166748149e-06, + "loss": 2.59, + "step": 54241 + }, + { + "epoch": 2.5253625718742, + "grad_norm": 0.32176706067756977, + "learning_rate": 7.430031323266512e-06, + "loss": 2.6409, + "step": 54242 + }, + { + "epoch": 2.525409130060293, + "grad_norm": 0.3078575735575791, + "learning_rate": 7.42861060472354e-06, + "loss": 2.6896, + "step": 54243 + }, + { + "epoch": 2.525455688246386, + "grad_norm": 0.30919746732897285, + "learning_rate": 7.4271900111234196e-06, + "loss": 2.7549, + "step": 54244 + }, + { + "epoch": 2.5255022464324792, + "grad_norm": 0.29483967738016337, + "learning_rate": 7.4257695424702735e-06, + "loss": 2.5796, + "step": 54245 + }, + { + "epoch": 2.525548804618572, + "grad_norm": 0.31645863434590843, + "learning_rate": 7.424349198768321e-06, + "loss": 2.4807, + "step": 54246 + }, + { + "epoch": 2.525595362804665, + "grad_norm": 0.3151538995801986, + "learning_rate": 7.422928980021704e-06, + "loss": 2.6345, + "step": 54247 + }, + { + "epoch": 2.525641920990758, + "grad_norm": 0.30533361228521794, + "learning_rate": 7.42150888623459e-06, + "loss": 2.6158, + "step": 54248 + }, + { + "epoch": 2.5256884791768512, + "grad_norm": 0.29868044233646807, + "learning_rate": 7.4200889174111666e-06, + "loss": 2.6367, + "step": 54249 + }, + { + "epoch": 2.5257350373629444, + "grad_norm": 0.3166102742707117, + "learning_rate": 7.418669073555568e-06, + "loss": 2.5942, + "step": 54250 + }, + { + "epoch": 2.5257815955490375, + "grad_norm": 0.3006483383268637, + "learning_rate": 7.417249354672001e-06, + "loss": 2.6019, + "step": 54251 + }, + { + "epoch": 2.5258281537351306, + "grad_norm": 0.318423427489947, + "learning_rate": 7.415829760764587e-06, + "loss": 2.6115, + "step": 54252 + }, + { + "epoch": 2.5258747119212237, + "grad_norm": 0.33644162167902497, + "learning_rate": 7.414410291837543e-06, + "loss": 2.8202, + "step": 54253 + }, + { + "epoch": 2.5259212701073164, + "grad_norm": 0.301342737331916, + "learning_rate": 7.412990947894993e-06, + "loss": 2.5501, + "step": 54254 + }, + { + "epoch": 2.5259678282934095, + "grad_norm": 0.31478686919976706, + "learning_rate": 7.411571728941125e-06, + "loss": 2.6364, + "step": 54255 + }, + { + "epoch": 2.5260143864795026, + "grad_norm": 0.33216694496552357, + "learning_rate": 7.410152634980094e-06, + "loss": 2.7413, + "step": 54256 + }, + { + "epoch": 2.5260609446655957, + "grad_norm": 0.3311835109499025, + "learning_rate": 7.4087336660160824e-06, + "loss": 2.6463, + "step": 54257 + }, + { + "epoch": 2.526107502851689, + "grad_norm": 0.32056653481427777, + "learning_rate": 7.407314822053218e-06, + "loss": 2.6486, + "step": 54258 + }, + { + "epoch": 2.526154061037782, + "grad_norm": 0.31356364099699197, + "learning_rate": 7.405896103095716e-06, + "loss": 2.5683, + "step": 54259 + }, + { + "epoch": 2.526200619223875, + "grad_norm": 0.31250170222741147, + "learning_rate": 7.404477509147695e-06, + "loss": 2.6998, + "step": 54260 + }, + { + "epoch": 2.526247177409968, + "grad_norm": 0.3233009720326534, + "learning_rate": 
7.40305904021334e-06, + "loss": 2.6254, + "step": 54261 + }, + { + "epoch": 2.5262937355960613, + "grad_norm": 0.30955265317965264, + "learning_rate": 7.401640696296819e-06, + "loss": 2.5658, + "step": 54262 + }, + { + "epoch": 2.5263402937821544, + "grad_norm": 0.32475435728565744, + "learning_rate": 7.4002224774022636e-06, + "loss": 2.6678, + "step": 54263 + }, + { + "epoch": 2.5263868519682475, + "grad_norm": 0.3221307050407538, + "learning_rate": 7.398804383533886e-06, + "loss": 2.6553, + "step": 54264 + }, + { + "epoch": 2.5264334101543406, + "grad_norm": 0.30420148626002874, + "learning_rate": 7.397386414695795e-06, + "loss": 2.6354, + "step": 54265 + }, + { + "epoch": 2.5264799683404333, + "grad_norm": 0.3027572472477661, + "learning_rate": 7.395968570892198e-06, + "loss": 2.6866, + "step": 54266 + }, + { + "epoch": 2.5265265265265264, + "grad_norm": 0.31061911345148463, + "learning_rate": 7.394550852127225e-06, + "loss": 2.6342, + "step": 54267 + }, + { + "epoch": 2.5265730847126195, + "grad_norm": 0.3026219121728165, + "learning_rate": 7.393133258405049e-06, + "loss": 2.5559, + "step": 54268 + }, + { + "epoch": 2.5266196428987127, + "grad_norm": 0.30622336948655565, + "learning_rate": 7.391715789729831e-06, + "loss": 2.5664, + "step": 54269 + }, + { + "epoch": 2.5266662010848058, + "grad_norm": 0.31271246658966234, + "learning_rate": 7.390298446105726e-06, + "loss": 2.6147, + "step": 54270 + }, + { + "epoch": 2.526712759270899, + "grad_norm": 0.31465891808657964, + "learning_rate": 7.388881227536898e-06, + "loss": 2.56, + "step": 54271 + }, + { + "epoch": 2.526759317456992, + "grad_norm": 0.3064942889697004, + "learning_rate": 7.3874641340275215e-06, + "loss": 2.5817, + "step": 54272 + }, + { + "epoch": 2.5268058756430847, + "grad_norm": 0.30403298723249483, + "learning_rate": 7.386047165581728e-06, + "loss": 2.5566, + "step": 54273 + }, + { + "epoch": 2.526852433829178, + "grad_norm": 0.3032033903142958, + "learning_rate": 7.3846303222036836e-06, + "loss": 2.6716, + "step": 54274 + }, + { + "epoch": 2.526898992015271, + "grad_norm": 0.2962903761800381, + "learning_rate": 7.3832136038975654e-06, + "loss": 2.6592, + "step": 54275 + }, + { + "epoch": 2.526945550201364, + "grad_norm": 0.3060757422713463, + "learning_rate": 7.381797010667491e-06, + "loss": 2.6198, + "step": 54276 + }, + { + "epoch": 2.526992108387457, + "grad_norm": 0.3031236701117901, + "learning_rate": 7.380380542517667e-06, + "loss": 2.5835, + "step": 54277 + }, + { + "epoch": 2.5270386665735503, + "grad_norm": 0.29755086040548523, + "learning_rate": 7.378964199452204e-06, + "loss": 2.5777, + "step": 54278 + }, + { + "epoch": 2.5270852247596434, + "grad_norm": 0.31857300427230706, + "learning_rate": 7.377547981475308e-06, + "loss": 2.6633, + "step": 54279 + }, + { + "epoch": 2.5271317829457365, + "grad_norm": 0.2969792085829526, + "learning_rate": 7.376131888591098e-06, + "loss": 2.6299, + "step": 54280 + }, + { + "epoch": 2.5271783411318296, + "grad_norm": 0.30191323457477004, + "learning_rate": 7.374715920803743e-06, + "loss": 2.7364, + "step": 54281 + }, + { + "epoch": 2.5272248993179227, + "grad_norm": 0.3062185230651865, + "learning_rate": 7.37330007811739e-06, + "loss": 2.6348, + "step": 54282 + }, + { + "epoch": 2.527271457504016, + "grad_norm": 0.29580110549988775, + "learning_rate": 7.371884360536208e-06, + "loss": 2.6272, + "step": 54283 + }, + { + "epoch": 2.527318015690109, + "grad_norm": 0.29695187600846795, + "learning_rate": 7.3704687680643484e-06, + "loss": 2.633, + "step": 54284 + }, + { + "epoch": 
2.527364573876202, + "grad_norm": 0.2997449469946408, + "learning_rate": 7.369053300705969e-06, + "loss": 2.6505, + "step": 54285 + }, + { + "epoch": 2.5274111320622947, + "grad_norm": 0.2944595924142158, + "learning_rate": 7.367637958465212e-06, + "loss": 2.5212, + "step": 54286 + }, + { + "epoch": 2.527457690248388, + "grad_norm": 0.2993421239325764, + "learning_rate": 7.366222741346235e-06, + "loss": 2.6221, + "step": 54287 + }, + { + "epoch": 2.527504248434481, + "grad_norm": 0.3071498675149258, + "learning_rate": 7.364807649353189e-06, + "loss": 2.6509, + "step": 54288 + }, + { + "epoch": 2.527550806620574, + "grad_norm": 0.3068864371538486, + "learning_rate": 7.363392682490239e-06, + "loss": 2.6295, + "step": 54289 + }, + { + "epoch": 2.527597364806667, + "grad_norm": 0.28355806572013786, + "learning_rate": 7.361977840761541e-06, + "loss": 2.6039, + "step": 54290 + }, + { + "epoch": 2.5276439229927603, + "grad_norm": 0.308057646020621, + "learning_rate": 7.360563124171216e-06, + "loss": 2.5444, + "step": 54291 + }, + { + "epoch": 2.5276904811788534, + "grad_norm": 0.311892421486303, + "learning_rate": 7.359148532723459e-06, + "loss": 2.5446, + "step": 54292 + }, + { + "epoch": 2.527737039364946, + "grad_norm": 0.31255759853300924, + "learning_rate": 7.3577340664223836e-06, + "loss": 2.5922, + "step": 54293 + }, + { + "epoch": 2.527783597551039, + "grad_norm": 0.29519114005574526, + "learning_rate": 7.356319725272165e-06, + "loss": 2.5816, + "step": 54294 + }, + { + "epoch": 2.5278301557371323, + "grad_norm": 0.3071722004046856, + "learning_rate": 7.354905509276943e-06, + "loss": 2.6221, + "step": 54295 + }, + { + "epoch": 2.5278767139232254, + "grad_norm": 0.3167700847516883, + "learning_rate": 7.353491418440877e-06, + "loss": 2.6751, + "step": 54296 + }, + { + "epoch": 2.5279232721093186, + "grad_norm": 0.32395264695553233, + "learning_rate": 7.352077452768108e-06, + "loss": 2.6042, + "step": 54297 + }, + { + "epoch": 2.5279698302954117, + "grad_norm": 0.29751085210006034, + "learning_rate": 7.350663612262803e-06, + "loss": 2.5677, + "step": 54298 + }, + { + "epoch": 2.528016388481505, + "grad_norm": 0.3139444312031634, + "learning_rate": 7.349249896929084e-06, + "loss": 2.7178, + "step": 54299 + }, + { + "epoch": 2.528062946667598, + "grad_norm": 0.286493332992928, + "learning_rate": 7.347836306771122e-06, + "loss": 2.6812, + "step": 54300 + }, + { + "epoch": 2.528109504853691, + "grad_norm": 0.2990438008114576, + "learning_rate": 7.346422841793049e-06, + "loss": 2.644, + "step": 54301 + }, + { + "epoch": 2.528156063039784, + "grad_norm": 0.3242172300579122, + "learning_rate": 7.345009501999028e-06, + "loss": 2.6833, + "step": 54302 + }, + { + "epoch": 2.5282026212258772, + "grad_norm": 0.2999076286991059, + "learning_rate": 7.343596287393217e-06, + "loss": 2.6397, + "step": 54303 + }, + { + "epoch": 2.5282491794119704, + "grad_norm": 0.30152108972683306, + "learning_rate": 7.342183197979719e-06, + "loss": 2.5941, + "step": 54304 + }, + { + "epoch": 2.528295737598063, + "grad_norm": 0.3308491292739324, + "learning_rate": 7.34077023376274e-06, + "loss": 2.6216, + "step": 54305 + }, + { + "epoch": 2.528342295784156, + "grad_norm": 0.3022320262856263, + "learning_rate": 7.339357394746371e-06, + "loss": 2.616, + "step": 54306 + }, + { + "epoch": 2.5283888539702493, + "grad_norm": 0.3108626429883636, + "learning_rate": 7.337944680934805e-06, + "loss": 2.5775, + "step": 54307 + }, + { + "epoch": 2.5284354121563424, + "grad_norm": 0.31007773109032866, + "learning_rate": 
7.3365320923321605e-06, + "loss": 2.6538, + "step": 54308 + }, + { + "epoch": 2.5284819703424355, + "grad_norm": 0.29629407010679426, + "learning_rate": 7.335119628942594e-06, + "loss": 2.6439, + "step": 54309 + }, + { + "epoch": 2.5285285285285286, + "grad_norm": 0.3088750789387038, + "learning_rate": 7.333707290770242e-06, + "loss": 2.4758, + "step": 54310 + }, + { + "epoch": 2.5285750867146217, + "grad_norm": 0.32112664744855696, + "learning_rate": 7.332295077819273e-06, + "loss": 2.6062, + "step": 54311 + }, + { + "epoch": 2.5286216449007144, + "grad_norm": 0.30039463008003375, + "learning_rate": 7.330882990093801e-06, + "loss": 2.5256, + "step": 54312 + }, + { + "epoch": 2.5286682030868075, + "grad_norm": 0.31041747306608614, + "learning_rate": 7.329471027597984e-06, + "loss": 2.6836, + "step": 54313 + }, + { + "epoch": 2.5287147612729006, + "grad_norm": 0.31408593601574947, + "learning_rate": 7.328059190335968e-06, + "loss": 2.61, + "step": 54314 + }, + { + "epoch": 2.5287613194589937, + "grad_norm": 0.30450439052330813, + "learning_rate": 7.326647478311888e-06, + "loss": 2.4774, + "step": 54315 + }, + { + "epoch": 2.528807877645087, + "grad_norm": 0.2857956321993923, + "learning_rate": 7.325235891529908e-06, + "loss": 2.6549, + "step": 54316 + }, + { + "epoch": 2.52885443583118, + "grad_norm": 0.3187578509826814, + "learning_rate": 7.323824429994136e-06, + "loss": 2.6239, + "step": 54317 + }, + { + "epoch": 2.528900994017273, + "grad_norm": 0.31822170096881447, + "learning_rate": 7.322413093708758e-06, + "loss": 2.6194, + "step": 54318 + }, + { + "epoch": 2.528947552203366, + "grad_norm": 0.3098816068950298, + "learning_rate": 7.321001882677869e-06, + "loss": 2.6932, + "step": 54319 + }, + { + "epoch": 2.5289941103894593, + "grad_norm": 0.3055971749029365, + "learning_rate": 7.319590796905656e-06, + "loss": 2.6519, + "step": 54320 + }, + { + "epoch": 2.5290406685755524, + "grad_norm": 0.3131915263890019, + "learning_rate": 7.318179836396227e-06, + "loss": 2.6453, + "step": 54321 + }, + { + "epoch": 2.5290872267616455, + "grad_norm": 0.29248370195865553, + "learning_rate": 7.3167690011537395e-06, + "loss": 2.4428, + "step": 54322 + }, + { + "epoch": 2.5291337849477387, + "grad_norm": 0.304861121454379, + "learning_rate": 7.315358291182328e-06, + "loss": 2.6366, + "step": 54323 + }, + { + "epoch": 2.5291803431338318, + "grad_norm": 0.29693086923056283, + "learning_rate": 7.313947706486135e-06, + "loss": 2.5949, + "step": 54324 + }, + { + "epoch": 2.5292269013199244, + "grad_norm": 0.29666346378172354, + "learning_rate": 7.312537247069312e-06, + "loss": 2.5918, + "step": 54325 + }, + { + "epoch": 2.5292734595060176, + "grad_norm": 0.2929307276427374, + "learning_rate": 7.311126912935973e-06, + "loss": 2.611, + "step": 54326 + }, + { + "epoch": 2.5293200176921107, + "grad_norm": 0.2999995108672191, + "learning_rate": 7.309716704090274e-06, + "loss": 2.5994, + "step": 54327 + }, + { + "epoch": 2.529366575878204, + "grad_norm": 0.29691765411441184, + "learning_rate": 7.308306620536348e-06, + "loss": 2.6016, + "step": 54328 + }, + { + "epoch": 2.529413134064297, + "grad_norm": 0.3155603705633833, + "learning_rate": 7.3068966622783505e-06, + "loss": 2.7269, + "step": 54329 + }, + { + "epoch": 2.52945969225039, + "grad_norm": 0.28997989327268087, + "learning_rate": 7.305486829320379e-06, + "loss": 2.6484, + "step": 54330 + }, + { + "epoch": 2.529506250436483, + "grad_norm": 0.2973641380610129, + "learning_rate": 7.304077121666625e-06, + "loss": 2.5471, + "step": 54331 + }, + { + "epoch": 
2.529552808622576, + "grad_norm": 0.28195846369326066, + "learning_rate": 7.302667539321173e-06, + "loss": 2.6129, + "step": 54332 + }, + { + "epoch": 2.529599366808669, + "grad_norm": 0.29592328146425134, + "learning_rate": 7.301258082288209e-06, + "loss": 2.5839, + "step": 54333 + }, + { + "epoch": 2.529645924994762, + "grad_norm": 0.3024269935944891, + "learning_rate": 7.29984875057183e-06, + "loss": 2.6006, + "step": 54334 + }, + { + "epoch": 2.529692483180855, + "grad_norm": 0.2794076876401498, + "learning_rate": 7.2984395441761945e-06, + "loss": 2.4545, + "step": 54335 + }, + { + "epoch": 2.5297390413669483, + "grad_norm": 0.29952490320259095, + "learning_rate": 7.297030463105431e-06, + "loss": 2.5956, + "step": 54336 + }, + { + "epoch": 2.5297855995530414, + "grad_norm": 0.29513948475794, + "learning_rate": 7.2956215073636756e-06, + "loss": 2.5166, + "step": 54337 + }, + { + "epoch": 2.5298321577391345, + "grad_norm": 0.30255540028386774, + "learning_rate": 7.294212676955076e-06, + "loss": 2.6222, + "step": 54338 + }, + { + "epoch": 2.5298787159252276, + "grad_norm": 0.3005043918286671, + "learning_rate": 7.292803971883738e-06, + "loss": 2.6329, + "step": 54339 + }, + { + "epoch": 2.5299252741113207, + "grad_norm": 0.31060761607369297, + "learning_rate": 7.291395392153822e-06, + "loss": 2.6085, + "step": 54340 + }, + { + "epoch": 2.529971832297414, + "grad_norm": 0.28464576659384677, + "learning_rate": 7.289986937769444e-06, + "loss": 2.6485, + "step": 54341 + }, + { + "epoch": 2.530018390483507, + "grad_norm": 0.29481355681484794, + "learning_rate": 7.288578608734753e-06, + "loss": 2.7012, + "step": 54342 + }, + { + "epoch": 2.5300649486696, + "grad_norm": 0.2985743595849043, + "learning_rate": 7.2871704050538726e-06, + "loss": 2.5983, + "step": 54343 + }, + { + "epoch": 2.5301115068556927, + "grad_norm": 0.2950670932902887, + "learning_rate": 7.285762326730955e-06, + "loss": 2.5603, + "step": 54344 + }, + { + "epoch": 2.530158065041786, + "grad_norm": 0.3058949679816527, + "learning_rate": 7.2843543737700915e-06, + "loss": 2.638, + "step": 54345 + }, + { + "epoch": 2.530204623227879, + "grad_norm": 0.3106068279834879, + "learning_rate": 7.282946546175462e-06, + "loss": 2.6719, + "step": 54346 + }, + { + "epoch": 2.530251181413972, + "grad_norm": 0.3071293059585939, + "learning_rate": 7.281538843951163e-06, + "loss": 2.5859, + "step": 54347 + }, + { + "epoch": 2.530297739600065, + "grad_norm": 0.3064757655821789, + "learning_rate": 7.280131267101342e-06, + "loss": 2.6115, + "step": 54348 + }, + { + "epoch": 2.5303442977861583, + "grad_norm": 0.3060513147034252, + "learning_rate": 7.278723815630123e-06, + "loss": 2.514, + "step": 54349 + }, + { + "epoch": 2.5303908559722514, + "grad_norm": 0.3006133141716385, + "learning_rate": 7.277316489541647e-06, + "loss": 2.6014, + "step": 54350 + }, + { + "epoch": 2.5304374141583446, + "grad_norm": 0.29457842094124276, + "learning_rate": 7.275909288840044e-06, + "loss": 2.6121, + "step": 54351 + }, + { + "epoch": 2.5304839723444372, + "grad_norm": 0.3129059892653553, + "learning_rate": 7.274502213529433e-06, + "loss": 2.6107, + "step": 54352 + }, + { + "epoch": 2.5305305305305303, + "grad_norm": 0.3027931654477112, + "learning_rate": 7.273095263613944e-06, + "loss": 2.7202, + "step": 54353 + }, + { + "epoch": 2.5305770887166235, + "grad_norm": 0.2966643079513159, + "learning_rate": 7.271688439097712e-06, + "loss": 2.5419, + "step": 54354 + }, + { + "epoch": 2.5306236469027166, + "grad_norm": 0.29434268955252063, + "learning_rate": 
7.2702817399848634e-06, + "loss": 2.5741, + "step": 54355 + }, + { + "epoch": 2.5306702050888097, + "grad_norm": 0.3079970771520344, + "learning_rate": 7.268875166279532e-06, + "loss": 2.6127, + "step": 54356 + }, + { + "epoch": 2.530716763274903, + "grad_norm": 0.3158393957556606, + "learning_rate": 7.267468717985848e-06, + "loss": 2.6212, + "step": 54357 + }, + { + "epoch": 2.530763321460996, + "grad_norm": 0.29466737371818574, + "learning_rate": 7.266062395107914e-06, + "loss": 2.7782, + "step": 54358 + }, + { + "epoch": 2.530809879647089, + "grad_norm": 0.31292751203694524, + "learning_rate": 7.264656197649905e-06, + "loss": 2.6943, + "step": 54359 + }, + { + "epoch": 2.530856437833182, + "grad_norm": 0.3027001885509952, + "learning_rate": 7.2632501256159e-06, + "loss": 2.5867, + "step": 54360 + }, + { + "epoch": 2.5309029960192753, + "grad_norm": 0.29761877832213424, + "learning_rate": 7.2618441790100525e-06, + "loss": 2.6989, + "step": 54361 + }, + { + "epoch": 2.5309495542053684, + "grad_norm": 0.32095117959049957, + "learning_rate": 7.260438357836474e-06, + "loss": 2.6946, + "step": 54362 + }, + { + "epoch": 2.5309961123914615, + "grad_norm": 0.3168044918273994, + "learning_rate": 7.259032662099307e-06, + "loss": 2.6267, + "step": 54363 + }, + { + "epoch": 2.531042670577554, + "grad_norm": 0.3120288455239373, + "learning_rate": 7.257627091802677e-06, + "loss": 2.7757, + "step": 54364 + }, + { + "epoch": 2.5310892287636473, + "grad_norm": 0.2921386579984839, + "learning_rate": 7.2562216469506836e-06, + "loss": 2.6235, + "step": 54365 + }, + { + "epoch": 2.5311357869497404, + "grad_norm": 0.29898762205737717, + "learning_rate": 7.2548163275474755e-06, + "loss": 2.5894, + "step": 54366 + }, + { + "epoch": 2.5311823451358335, + "grad_norm": 0.30094430469431815, + "learning_rate": 7.253411133597165e-06, + "loss": 2.6287, + "step": 54367 + }, + { + "epoch": 2.5312289033219266, + "grad_norm": 0.32286372652822504, + "learning_rate": 7.252006065103889e-06, + "loss": 2.6751, + "step": 54368 + }, + { + "epoch": 2.5312754615080197, + "grad_norm": 0.3159955753808014, + "learning_rate": 7.25060112207176e-06, + "loss": 2.6401, + "step": 54369 + }, + { + "epoch": 2.531322019694113, + "grad_norm": 0.31523202833605, + "learning_rate": 7.249196304504913e-06, + "loss": 2.6276, + "step": 54370 + }, + { + "epoch": 2.5313685778802055, + "grad_norm": 0.32925919647414975, + "learning_rate": 7.247791612407445e-06, + "loss": 2.5788, + "step": 54371 + }, + { + "epoch": 2.5314151360662986, + "grad_norm": 0.32057354624396955, + "learning_rate": 7.24638704578352e-06, + "loss": 2.502, + "step": 54372 + }, + { + "epoch": 2.5314616942523918, + "grad_norm": 0.30029357074188306, + "learning_rate": 7.244982604637224e-06, + "loss": 2.6353, + "step": 54373 + }, + { + "epoch": 2.531508252438485, + "grad_norm": 0.30282528438058726, + "learning_rate": 7.243578288972691e-06, + "loss": 2.6044, + "step": 54374 + }, + { + "epoch": 2.531554810624578, + "grad_norm": 0.3347158524888293, + "learning_rate": 7.2421740987940414e-06, + "loss": 2.6506, + "step": 54375 + }, + { + "epoch": 2.531601368810671, + "grad_norm": 0.34212503422042917, + "learning_rate": 7.2407700341054045e-06, + "loss": 2.6635, + "step": 54376 + }, + { + "epoch": 2.531647926996764, + "grad_norm": 0.29896859989674673, + "learning_rate": 7.239366094910904e-06, + "loss": 2.5753, + "step": 54377 + }, + { + "epoch": 2.5316944851828573, + "grad_norm": 0.3067279675889942, + "learning_rate": 7.237962281214628e-06, + "loss": 2.6294, + "step": 54378 + }, + { + "epoch": 
2.5317410433689505, + "grad_norm": 0.3108242397232591, + "learning_rate": 7.2365585930207425e-06, + "loss": 2.6676, + "step": 54379 + }, + { + "epoch": 2.5317876015550436, + "grad_norm": 0.3262176912862718, + "learning_rate": 7.235155030333329e-06, + "loss": 2.6365, + "step": 54380 + }, + { + "epoch": 2.5318341597411367, + "grad_norm": 0.3086333815533519, + "learning_rate": 7.233751593156529e-06, + "loss": 2.6722, + "step": 54381 + }, + { + "epoch": 2.53188071792723, + "grad_norm": 0.3166547942953113, + "learning_rate": 7.232348281494456e-06, + "loss": 2.6676, + "step": 54382 + }, + { + "epoch": 2.5319272761133225, + "grad_norm": 0.2971684367841095, + "learning_rate": 7.230945095351233e-06, + "loss": 2.6922, + "step": 54383 + }, + { + "epoch": 2.5319738342994156, + "grad_norm": 0.31266374592703977, + "learning_rate": 7.229542034730952e-06, + "loss": 2.5751, + "step": 54384 + }, + { + "epoch": 2.5320203924855087, + "grad_norm": 0.3158561684475075, + "learning_rate": 7.228139099637776e-06, + "loss": 2.7148, + "step": 54385 + }, + { + "epoch": 2.532066950671602, + "grad_norm": 0.30688311567340365, + "learning_rate": 7.2267362900757865e-06, + "loss": 2.5427, + "step": 54386 + }, + { + "epoch": 2.532113508857695, + "grad_norm": 0.2974081990334931, + "learning_rate": 7.225333606049112e-06, + "loss": 2.74, + "step": 54387 + }, + { + "epoch": 2.532160067043788, + "grad_norm": 0.3086350035461506, + "learning_rate": 7.223931047561866e-06, + "loss": 2.6328, + "step": 54388 + }, + { + "epoch": 2.532206625229881, + "grad_norm": 0.3023144967358126, + "learning_rate": 7.222528614618168e-06, + "loss": 2.6336, + "step": 54389 + }, + { + "epoch": 2.5322531834159743, + "grad_norm": 0.31569888278891234, + "learning_rate": 7.221126307222154e-06, + "loss": 2.6855, + "step": 54390 + }, + { + "epoch": 2.532299741602067, + "grad_norm": 0.29244081700877506, + "learning_rate": 7.219724125377886e-06, + "loss": 2.5769, + "step": 54391 + }, + { + "epoch": 2.53234629978816, + "grad_norm": 0.30914354498407365, + "learning_rate": 7.218322069089544e-06, + "loss": 2.7762, + "step": 54392 + }, + { + "epoch": 2.532392857974253, + "grad_norm": 0.2961273267937652, + "learning_rate": 7.2169201383612e-06, + "loss": 2.5836, + "step": 54393 + }, + { + "epoch": 2.5324394161603463, + "grad_norm": 0.3130120153287305, + "learning_rate": 7.215518333196974e-06, + "loss": 2.6322, + "step": 54394 + }, + { + "epoch": 2.5324859743464394, + "grad_norm": 0.3231508484349099, + "learning_rate": 7.214116653600994e-06, + "loss": 2.6285, + "step": 54395 + }, + { + "epoch": 2.5325325325325325, + "grad_norm": 0.304123742712189, + "learning_rate": 7.212715099577361e-06, + "loss": 2.5783, + "step": 54396 + }, + { + "epoch": 2.5325790907186256, + "grad_norm": 0.3259686342114869, + "learning_rate": 7.211313671130198e-06, + "loss": 2.5498, + "step": 54397 + }, + { + "epoch": 2.5326256489047188, + "grad_norm": 0.3113028721071584, + "learning_rate": 7.209912368263627e-06, + "loss": 2.6598, + "step": 54398 + }, + { + "epoch": 2.532672207090812, + "grad_norm": 0.30993302657590815, + "learning_rate": 7.208511190981731e-06, + "loss": 2.6176, + "step": 54399 + }, + { + "epoch": 2.532718765276905, + "grad_norm": 0.2938939137280702, + "learning_rate": 7.207110139288642e-06, + "loss": 2.6166, + "step": 54400 + }, + { + "epoch": 2.532765323462998, + "grad_norm": 0.3118704961641875, + "learning_rate": 7.205709213188466e-06, + "loss": 2.6534, + "step": 54401 + }, + { + "epoch": 2.532811881649091, + "grad_norm": 0.3366233867158388, + "learning_rate": 
7.204308412685324e-06, + "loss": 2.6697, + "step": 54402 + }, + { + "epoch": 2.532858439835184, + "grad_norm": 0.3160135439011591, + "learning_rate": 7.2029077377833264e-06, + "loss": 2.5361, + "step": 54403 + }, + { + "epoch": 2.532904998021277, + "grad_norm": 0.2964673953004164, + "learning_rate": 7.201507188486556e-06, + "loss": 2.5997, + "step": 54404 + }, + { + "epoch": 2.53295155620737, + "grad_norm": 0.28518231781904857, + "learning_rate": 7.200106764799169e-06, + "loss": 2.7043, + "step": 54405 + }, + { + "epoch": 2.5329981143934632, + "grad_norm": 0.33195765423328805, + "learning_rate": 7.198706466725241e-06, + "loss": 2.639, + "step": 54406 + }, + { + "epoch": 2.5330446725795563, + "grad_norm": 0.33294066431382113, + "learning_rate": 7.197306294268896e-06, + "loss": 2.6097, + "step": 54407 + }, + { + "epoch": 2.5330912307656495, + "grad_norm": 0.3393107401256636, + "learning_rate": 7.195906247434237e-06, + "loss": 2.6804, + "step": 54408 + }, + { + "epoch": 2.5331377889517426, + "grad_norm": 0.2960744938339738, + "learning_rate": 7.194506326225376e-06, + "loss": 2.6863, + "step": 54409 + }, + { + "epoch": 2.5331843471378352, + "grad_norm": 0.30443575195477607, + "learning_rate": 7.19310653064642e-06, + "loss": 2.6722, + "step": 54410 + }, + { + "epoch": 2.5332309053239284, + "grad_norm": 0.29719559393810596, + "learning_rate": 7.191706860701497e-06, + "loss": 2.5823, + "step": 54411 + }, + { + "epoch": 2.5332774635100215, + "grad_norm": 0.29967194047356843, + "learning_rate": 7.190307316394679e-06, + "loss": 2.6809, + "step": 54412 + }, + { + "epoch": 2.5333240216961146, + "grad_norm": 0.30310908143590537, + "learning_rate": 7.188907897730096e-06, + "loss": 2.5607, + "step": 54413 + }, + { + "epoch": 2.5333705798822077, + "grad_norm": 0.30878172988841124, + "learning_rate": 7.187508604711851e-06, + "loss": 2.6673, + "step": 54414 + }, + { + "epoch": 2.533417138068301, + "grad_norm": 0.2969530920181387, + "learning_rate": 7.186109437344046e-06, + "loss": 2.6238, + "step": 54415 + }, + { + "epoch": 2.533463696254394, + "grad_norm": 0.3066081891104706, + "learning_rate": 7.184710395630806e-06, + "loss": 2.6358, + "step": 54416 + }, + { + "epoch": 2.533510254440487, + "grad_norm": 0.29883146401213345, + "learning_rate": 7.183311479576199e-06, + "loss": 2.6462, + "step": 54417 + }, + { + "epoch": 2.53355681262658, + "grad_norm": 0.29985761787891324, + "learning_rate": 7.181912689184383e-06, + "loss": 2.5488, + "step": 54418 + }, + { + "epoch": 2.5336033708126733, + "grad_norm": 0.323064202042107, + "learning_rate": 7.180514024459417e-06, + "loss": 2.6063, + "step": 54419 + }, + { + "epoch": 2.5336499289987664, + "grad_norm": 0.2817848817002953, + "learning_rate": 7.1791154854054244e-06, + "loss": 2.5639, + "step": 54420 + }, + { + "epoch": 2.5336964871848595, + "grad_norm": 0.29604204370279547, + "learning_rate": 7.177717072026513e-06, + "loss": 2.635, + "step": 54421 + }, + { + "epoch": 2.533743045370952, + "grad_norm": 0.3103790621609085, + "learning_rate": 7.17631878432678e-06, + "loss": 2.7547, + "step": 54422 + }, + { + "epoch": 2.5337896035570453, + "grad_norm": 0.29223798711894844, + "learning_rate": 7.174920622310333e-06, + "loss": 2.6229, + "step": 54423 + }, + { + "epoch": 2.5338361617431384, + "grad_norm": 0.30980405995312565, + "learning_rate": 7.17352258598129e-06, + "loss": 2.6355, + "step": 54424 + }, + { + "epoch": 2.5338827199292315, + "grad_norm": 0.33214423269851895, + "learning_rate": 7.172124675343722e-06, + "loss": 2.5775, + "step": 54425 + }, + { + "epoch": 
2.5339292781153246, + "grad_norm": 0.3186018310712397, + "learning_rate": 7.1707268904017525e-06, + "loss": 2.6044, + "step": 54426 + }, + { + "epoch": 2.5339758363014178, + "grad_norm": 0.2786819993140697, + "learning_rate": 7.169329231159483e-06, + "loss": 2.5892, + "step": 54427 + }, + { + "epoch": 2.534022394487511, + "grad_norm": 0.3280956582616801, + "learning_rate": 7.167931697621005e-06, + "loss": 2.6429, + "step": 54428 + }, + { + "epoch": 2.534068952673604, + "grad_norm": 0.2915354407569731, + "learning_rate": 7.1665342897904445e-06, + "loss": 2.572, + "step": 54429 + }, + { + "epoch": 2.5341155108596967, + "grad_norm": 0.32043899798178, + "learning_rate": 7.165137007671863e-06, + "loss": 2.6405, + "step": 54430 + }, + { + "epoch": 2.53416206904579, + "grad_norm": 0.2979486038906769, + "learning_rate": 7.163739851269408e-06, + "loss": 2.5979, + "step": 54431 + }, + { + "epoch": 2.534208627231883, + "grad_norm": 0.31651503855358104, + "learning_rate": 7.162342820587131e-06, + "loss": 2.6279, + "step": 54432 + }, + { + "epoch": 2.534255185417976, + "grad_norm": 0.3012194228755797, + "learning_rate": 7.1609459156291804e-06, + "loss": 2.6477, + "step": 54433 + }, + { + "epoch": 2.534301743604069, + "grad_norm": 0.3160431109088722, + "learning_rate": 7.1595491363996235e-06, + "loss": 2.5917, + "step": 54434 + }, + { + "epoch": 2.5343483017901622, + "grad_norm": 0.31610544584112865, + "learning_rate": 7.158152482902569e-06, + "loss": 2.5981, + "step": 54435 + }, + { + "epoch": 2.5343948599762554, + "grad_norm": 0.3192802194379996, + "learning_rate": 7.156755955142114e-06, + "loss": 2.6392, + "step": 54436 + }, + { + "epoch": 2.5344414181623485, + "grad_norm": 0.30901700000874344, + "learning_rate": 7.155359553122376e-06, + "loss": 2.6578, + "step": 54437 + }, + { + "epoch": 2.5344879763484416, + "grad_norm": 0.29988511839578896, + "learning_rate": 7.15396327684742e-06, + "loss": 2.5781, + "step": 54438 + }, + { + "epoch": 2.5345345345345347, + "grad_norm": 0.3161792382179178, + "learning_rate": 7.152567126321364e-06, + "loss": 2.6429, + "step": 54439 + }, + { + "epoch": 2.534581092720628, + "grad_norm": 0.29670864885221393, + "learning_rate": 7.151171101548304e-06, + "loss": 2.6194, + "step": 54440 + }, + { + "epoch": 2.534627650906721, + "grad_norm": 0.29310137692883703, + "learning_rate": 7.1497752025323324e-06, + "loss": 2.6361, + "step": 54441 + }, + { + "epoch": 2.5346742090928136, + "grad_norm": 0.3195460913395021, + "learning_rate": 7.148379429277557e-06, + "loss": 2.6179, + "step": 54442 + }, + { + "epoch": 2.5347207672789067, + "grad_norm": 0.3021021851051395, + "learning_rate": 7.146983781788052e-06, + "loss": 2.5785, + "step": 54443 + }, + { + "epoch": 2.534767325465, + "grad_norm": 0.3068705516795234, + "learning_rate": 7.145588260067942e-06, + "loss": 2.6495, + "step": 54444 + }, + { + "epoch": 2.534813883651093, + "grad_norm": 0.3162389796032404, + "learning_rate": 7.1441928641212905e-06, + "loss": 2.5324, + "step": 54445 + }, + { + "epoch": 2.534860441837186, + "grad_norm": 0.3216066033394986, + "learning_rate": 7.142797593952233e-06, + "loss": 2.5862, + "step": 54446 + }, + { + "epoch": 2.534907000023279, + "grad_norm": 0.30584960781770165, + "learning_rate": 7.141402449564833e-06, + "loss": 2.6539, + "step": 54447 + }, + { + "epoch": 2.5349535582093723, + "grad_norm": 0.28606782978230083, + "learning_rate": 7.140007430963191e-06, + "loss": 2.6127, + "step": 54448 + }, + { + "epoch": 2.535000116395465, + "grad_norm": 0.30823369541569845, + "learning_rate": 
7.138612538151412e-06, + "loss": 2.6366, + "step": 54449 + }, + { + "epoch": 2.535046674581558, + "grad_norm": 0.2984303405888741, + "learning_rate": 7.137217771133576e-06, + "loss": 2.6896, + "step": 54450 + }, + { + "epoch": 2.535093232767651, + "grad_norm": 0.3009739850998596, + "learning_rate": 7.135823129913793e-06, + "loss": 2.6751, + "step": 54451 + }, + { + "epoch": 2.5351397909537443, + "grad_norm": 0.30604451476942346, + "learning_rate": 7.134428614496142e-06, + "loss": 2.6281, + "step": 54452 + }, + { + "epoch": 2.5351863491398374, + "grad_norm": 0.3224151366986884, + "learning_rate": 7.133034224884716e-06, + "loss": 2.6336, + "step": 54453 + }, + { + "epoch": 2.5352329073259305, + "grad_norm": 0.29079115876555867, + "learning_rate": 7.131639961083608e-06, + "loss": 2.7402, + "step": 54454 + }, + { + "epoch": 2.5352794655120237, + "grad_norm": 0.2991631888515938, + "learning_rate": 7.130245823096932e-06, + "loss": 2.5205, + "step": 54455 + }, + { + "epoch": 2.5353260236981168, + "grad_norm": 0.3005131481853435, + "learning_rate": 7.128851810928733e-06, + "loss": 2.6747, + "step": 54456 + }, + { + "epoch": 2.53537258188421, + "grad_norm": 0.29985731882791944, + "learning_rate": 7.127457924583153e-06, + "loss": 2.5976, + "step": 54457 + }, + { + "epoch": 2.535419140070303, + "grad_norm": 0.30759907663061653, + "learning_rate": 7.126064164064239e-06, + "loss": 2.6024, + "step": 54458 + }, + { + "epoch": 2.535465698256396, + "grad_norm": 0.29062949360044027, + "learning_rate": 7.1246705293761265e-06, + "loss": 2.5169, + "step": 54459 + }, + { + "epoch": 2.5355122564424892, + "grad_norm": 0.2928306727127395, + "learning_rate": 7.123277020522873e-06, + "loss": 2.525, + "step": 54460 + }, + { + "epoch": 2.5355588146285823, + "grad_norm": 0.30158628721644803, + "learning_rate": 7.12188363750857e-06, + "loss": 2.6274, + "step": 54461 + }, + { + "epoch": 2.535605372814675, + "grad_norm": 0.289001362225655, + "learning_rate": 7.120490380337319e-06, + "loss": 2.6912, + "step": 54462 + }, + { + "epoch": 2.535651931000768, + "grad_norm": 0.3166952941727891, + "learning_rate": 7.119097249013207e-06, + "loss": 2.6046, + "step": 54463 + }, + { + "epoch": 2.5356984891868612, + "grad_norm": 0.30535722268068644, + "learning_rate": 7.11770424354033e-06, + "loss": 2.5706, + "step": 54464 + }, + { + "epoch": 2.5357450473729544, + "grad_norm": 0.29150884657737064, + "learning_rate": 7.116311363922751e-06, + "loss": 2.5813, + "step": 54465 + }, + { + "epoch": 2.5357916055590475, + "grad_norm": 0.30940362020249534, + "learning_rate": 7.114918610164578e-06, + "loss": 2.6281, + "step": 54466 + }, + { + "epoch": 2.5358381637451406, + "grad_norm": 0.29644136095422036, + "learning_rate": 7.113525982269892e-06, + "loss": 2.5992, + "step": 54467 + }, + { + "epoch": 2.5358847219312337, + "grad_norm": 0.3158369596355404, + "learning_rate": 7.112133480242783e-06, + "loss": 2.626, + "step": 54468 + }, + { + "epoch": 2.5359312801173264, + "grad_norm": 0.3158079408216895, + "learning_rate": 7.110741104087337e-06, + "loss": 2.7413, + "step": 54469 + }, + { + "epoch": 2.5359778383034195, + "grad_norm": 0.32751756367615953, + "learning_rate": 7.109348853807652e-06, + "loss": 2.6292, + "step": 54470 + }, + { + "epoch": 2.5360243964895126, + "grad_norm": 0.3090428775017261, + "learning_rate": 7.107956729407777e-06, + "loss": 2.6707, + "step": 54471 + }, + { + "epoch": 2.5360709546756057, + "grad_norm": 0.30709316310409596, + "learning_rate": 7.106564730891852e-06, + "loss": 2.6125, + "step": 54472 + }, + { + "epoch": 
2.536117512861699, + "grad_norm": 0.3156441273633252, + "learning_rate": 7.105172858263903e-06, + "loss": 2.6294, + "step": 54473 + }, + { + "epoch": 2.536164071047792, + "grad_norm": 0.3124464866041883, + "learning_rate": 7.103781111528074e-06, + "loss": 2.6534, + "step": 54474 + }, + { + "epoch": 2.536210629233885, + "grad_norm": 0.3298531025273328, + "learning_rate": 7.1023894906884105e-06, + "loss": 2.6942, + "step": 54475 + }, + { + "epoch": 2.536257187419978, + "grad_norm": 0.2942467203554787, + "learning_rate": 7.100997995749009e-06, + "loss": 2.6046, + "step": 54476 + }, + { + "epoch": 2.5363037456060713, + "grad_norm": 0.3000835036982941, + "learning_rate": 7.099606626713956e-06, + "loss": 2.692, + "step": 54477 + }, + { + "epoch": 2.5363503037921644, + "grad_norm": 0.31789855551322327, + "learning_rate": 7.0982153835873255e-06, + "loss": 2.5728, + "step": 54478 + }, + { + "epoch": 2.5363968619782575, + "grad_norm": 0.30544836524431607, + "learning_rate": 7.096824266373209e-06, + "loss": 2.646, + "step": 54479 + }, + { + "epoch": 2.5364434201643506, + "grad_norm": 0.3253752014780873, + "learning_rate": 7.09543327507568e-06, + "loss": 2.6434, + "step": 54480 + }, + { + "epoch": 2.5364899783504433, + "grad_norm": 0.3228602890655117, + "learning_rate": 7.094042409698831e-06, + "loss": 2.5848, + "step": 54481 + }, + { + "epoch": 2.5365365365365364, + "grad_norm": 0.3150446934886455, + "learning_rate": 7.092651670246742e-06, + "loss": 2.6357, + "step": 54482 + }, + { + "epoch": 2.5365830947226295, + "grad_norm": 0.310801905418748, + "learning_rate": 7.091261056723503e-06, + "loss": 2.6717, + "step": 54483 + }, + { + "epoch": 2.5366296529087227, + "grad_norm": 0.3139146565223347, + "learning_rate": 7.089870569133161e-06, + "loss": 2.7153, + "step": 54484 + }, + { + "epoch": 2.536676211094816, + "grad_norm": 0.30819562662529965, + "learning_rate": 7.088480207479847e-06, + "loss": 2.6092, + "step": 54485 + }, + { + "epoch": 2.536722769280909, + "grad_norm": 0.32116873777473576, + "learning_rate": 7.087089971767591e-06, + "loss": 2.5034, + "step": 54486 + }, + { + "epoch": 2.536769327467002, + "grad_norm": 0.2953429840671786, + "learning_rate": 7.085699862000528e-06, + "loss": 2.6002, + "step": 54487 + }, + { + "epoch": 2.5368158856530947, + "grad_norm": 0.31696316581914635, + "learning_rate": 7.084309878182688e-06, + "loss": 2.6548, + "step": 54488 + }, + { + "epoch": 2.536862443839188, + "grad_norm": 0.31275964899964775, + "learning_rate": 7.082920020318173e-06, + "loss": 2.5998, + "step": 54489 + }, + { + "epoch": 2.536909002025281, + "grad_norm": 0.3305929760822265, + "learning_rate": 7.081530288411076e-06, + "loss": 2.6573, + "step": 54490 + }, + { + "epoch": 2.536955560211374, + "grad_norm": 0.31441953607400386, + "learning_rate": 7.08014068246543e-06, + "loss": 2.6524, + "step": 54491 + }, + { + "epoch": 2.537002118397467, + "grad_norm": 0.30432199986105773, + "learning_rate": 7.078751202485367e-06, + "loss": 2.6901, + "step": 54492 + }, + { + "epoch": 2.5370486765835603, + "grad_norm": 0.3146912098766872, + "learning_rate": 7.077361848474934e-06, + "loss": 2.7706, + "step": 54493 + }, + { + "epoch": 2.5370952347696534, + "grad_norm": 0.2996301882056953, + "learning_rate": 7.075972620438209e-06, + "loss": 2.6261, + "step": 54494 + }, + { + "epoch": 2.5371417929557465, + "grad_norm": 0.29132257995810107, + "learning_rate": 7.07458351837928e-06, + "loss": 2.5623, + "step": 54495 + }, + { + "epoch": 2.5371883511418396, + "grad_norm": 0.2969318265898026, + "learning_rate": 
7.073194542302231e-06, + "loss": 2.6258, + "step": 54496 + }, + { + "epoch": 2.5372349093279327, + "grad_norm": 0.3031819465888797, + "learning_rate": 7.071805692211103e-06, + "loss": 2.6378, + "step": 54497 + }, + { + "epoch": 2.537281467514026, + "grad_norm": 0.2967577418904246, + "learning_rate": 7.070416968110017e-06, + "loss": 2.6087, + "step": 54498 + }, + { + "epoch": 2.537328025700119, + "grad_norm": 0.3042420511943093, + "learning_rate": 7.069028370003006e-06, + "loss": 2.6707, + "step": 54499 + }, + { + "epoch": 2.537374583886212, + "grad_norm": 0.29674310502214396, + "learning_rate": 7.067639897894191e-06, + "loss": 2.5599, + "step": 54500 + }, + { + "epoch": 2.5374211420723047, + "grad_norm": 0.30771043482836713, + "learning_rate": 7.066251551787612e-06, + "loss": 2.6473, + "step": 54501 + }, + { + "epoch": 2.537467700258398, + "grad_norm": 0.2964035336781531, + "learning_rate": 7.0648633316873545e-06, + "loss": 2.5733, + "step": 54502 + }, + { + "epoch": 2.537514258444491, + "grad_norm": 0.3012789521348094, + "learning_rate": 7.063475237597506e-06, + "loss": 2.6555, + "step": 54503 + }, + { + "epoch": 2.537560816630584, + "grad_norm": 0.28811948203186866, + "learning_rate": 7.062087269522105e-06, + "loss": 2.5803, + "step": 54504 + }, + { + "epoch": 2.537607374816677, + "grad_norm": 0.299966655538898, + "learning_rate": 7.060699427465267e-06, + "loss": 2.6127, + "step": 54505 + }, + { + "epoch": 2.5376539330027703, + "grad_norm": 0.29917581342203303, + "learning_rate": 7.059311711431038e-06, + "loss": 2.6704, + "step": 54506 + }, + { + "epoch": 2.5377004911888634, + "grad_norm": 0.28912784438797773, + "learning_rate": 7.057924121423498e-06, + "loss": 2.5583, + "step": 54507 + }, + { + "epoch": 2.537747049374956, + "grad_norm": 0.3256746337933449, + "learning_rate": 7.056536657446721e-06, + "loss": 2.632, + "step": 54508 + }, + { + "epoch": 2.537793607561049, + "grad_norm": 0.31237999611329165, + "learning_rate": 7.055149319504778e-06, + "loss": 2.5716, + "step": 54509 + }, + { + "epoch": 2.5378401657471423, + "grad_norm": 0.29758629843405854, + "learning_rate": 7.0537621076017415e-06, + "loss": 2.6017, + "step": 54510 + }, + { + "epoch": 2.5378867239332354, + "grad_norm": 0.30688348843708774, + "learning_rate": 7.052375021741692e-06, + "loss": 2.6554, + "step": 54511 + }, + { + "epoch": 2.5379332821193286, + "grad_norm": 0.3174337222584811, + "learning_rate": 7.0509880619286705e-06, + "loss": 2.5734, + "step": 54512 + }, + { + "epoch": 2.5379798403054217, + "grad_norm": 0.3003850694188251, + "learning_rate": 7.049601228166786e-06, + "loss": 2.5771, + "step": 54513 + }, + { + "epoch": 2.538026398491515, + "grad_norm": 0.29508374014489624, + "learning_rate": 7.048214520460084e-06, + "loss": 2.6113, + "step": 54514 + }, + { + "epoch": 2.538072956677608, + "grad_norm": 0.3124861979915078, + "learning_rate": 7.046827938812639e-06, + "loss": 2.6334, + "step": 54515 + }, + { + "epoch": 2.538119514863701, + "grad_norm": 0.3182121005071524, + "learning_rate": 7.045441483228532e-06, + "loss": 2.7403, + "step": 54516 + }, + { + "epoch": 2.538166073049794, + "grad_norm": 0.31880478348339325, + "learning_rate": 7.044055153711804e-06, + "loss": 2.5865, + "step": 54517 + }, + { + "epoch": 2.5382126312358873, + "grad_norm": 0.31099366906347, + "learning_rate": 7.042668950266562e-06, + "loss": 2.747, + "step": 54518 + }, + { + "epoch": 2.5382591894219804, + "grad_norm": 0.31160531471528263, + "learning_rate": 7.041282872896843e-06, + "loss": 2.6734, + "step": 54519 + }, + { + "epoch": 
2.538305747608073, + "grad_norm": 0.3048957303339372, + "learning_rate": 7.039896921606726e-06, + "loss": 2.6526, + "step": 54520 + }, + { + "epoch": 2.538352305794166, + "grad_norm": 0.2977854023444916, + "learning_rate": 7.03851109640028e-06, + "loss": 2.5161, + "step": 54521 + }, + { + "epoch": 2.5383988639802593, + "grad_norm": 0.3114394027688967, + "learning_rate": 7.0371253972815746e-06, + "loss": 2.5619, + "step": 54522 + }, + { + "epoch": 2.5384454221663524, + "grad_norm": 0.322668595402342, + "learning_rate": 7.035739824254672e-06, + "loss": 2.7545, + "step": 54523 + }, + { + "epoch": 2.5384919803524455, + "grad_norm": 0.32026814703439266, + "learning_rate": 7.034354377323649e-06, + "loss": 2.5962, + "step": 54524 + }, + { + "epoch": 2.5385385385385386, + "grad_norm": 0.3144148769908516, + "learning_rate": 7.032969056492544e-06, + "loss": 2.6098, + "step": 54525 + }, + { + "epoch": 2.5385850967246317, + "grad_norm": 0.3159717695451854, + "learning_rate": 7.0315838617654615e-06, + "loss": 2.5315, + "step": 54526 + }, + { + "epoch": 2.538631654910725, + "grad_norm": 0.2996861731319867, + "learning_rate": 7.030198793146436e-06, + "loss": 2.5994, + "step": 54527 + }, + { + "epoch": 2.5386782130968175, + "grad_norm": 0.2961384073806365, + "learning_rate": 7.028813850639548e-06, + "loss": 2.5855, + "step": 54528 + }, + { + "epoch": 2.5387247712829106, + "grad_norm": 0.302907958422269, + "learning_rate": 7.027429034248867e-06, + "loss": 2.5773, + "step": 54529 + }, + { + "epoch": 2.5387713294690037, + "grad_norm": 0.2992111383270678, + "learning_rate": 7.026044343978422e-06, + "loss": 2.5661, + "step": 54530 + }, + { + "epoch": 2.538817887655097, + "grad_norm": 0.29844153928388967, + "learning_rate": 7.0246597798323265e-06, + "loss": 2.6169, + "step": 54531 + }, + { + "epoch": 2.53886444584119, + "grad_norm": 0.3156459760208995, + "learning_rate": 7.0232753418146165e-06, + "loss": 2.5892, + "step": 54532 + }, + { + "epoch": 2.538911004027283, + "grad_norm": 0.31081451893551737, + "learning_rate": 7.02189102992935e-06, + "loss": 2.6199, + "step": 54533 + }, + { + "epoch": 2.538957562213376, + "grad_norm": 0.30818588577223, + "learning_rate": 7.0205068441806075e-06, + "loss": 2.6169, + "step": 54534 + }, + { + "epoch": 2.5390041203994693, + "grad_norm": 0.3065120083443276, + "learning_rate": 7.019122784572441e-06, + "loss": 2.6048, + "step": 54535 + }, + { + "epoch": 2.5390506785855624, + "grad_norm": 0.30167111610641356, + "learning_rate": 7.0177388511089125e-06, + "loss": 2.5822, + "step": 54536 + }, + { + "epoch": 2.5390972367716556, + "grad_norm": 0.3195178862765892, + "learning_rate": 7.016355043794098e-06, + "loss": 2.7032, + "step": 54537 + }, + { + "epoch": 2.5391437949577487, + "grad_norm": 0.3223830756254849, + "learning_rate": 7.014971362632023e-06, + "loss": 2.6417, + "step": 54538 + }, + { + "epoch": 2.539190353143842, + "grad_norm": 0.29675801042425826, + "learning_rate": 7.013587807626798e-06, + "loss": 2.6504, + "step": 54539 + }, + { + "epoch": 2.5392369113299345, + "grad_norm": 0.3125557444876485, + "learning_rate": 7.012204378782444e-06, + "loss": 2.6336, + "step": 54540 + }, + { + "epoch": 2.5392834695160276, + "grad_norm": 0.2925921370442607, + "learning_rate": 7.010821076103036e-06, + "loss": 2.6516, + "step": 54541 + }, + { + "epoch": 2.5393300277021207, + "grad_norm": 0.31406217918081286, + "learning_rate": 7.009437899592647e-06, + "loss": 2.5499, + "step": 54542 + }, + { + "epoch": 2.539376585888214, + "grad_norm": 0.3312571122678489, + "learning_rate": 
7.0080548492552965e-06, + "loss": 2.7021, + "step": 54543 + }, + { + "epoch": 2.539423144074307, + "grad_norm": 0.32026246165431344, + "learning_rate": 7.006671925095093e-06, + "loss": 2.6543, + "step": 54544 + }, + { + "epoch": 2.5394697022604, + "grad_norm": 0.3163104433173041, + "learning_rate": 7.005289127116049e-06, + "loss": 2.708, + "step": 54545 + }, + { + "epoch": 2.539516260446493, + "grad_norm": 0.29379669147262205, + "learning_rate": 7.0039064553222674e-06, + "loss": 2.4814, + "step": 54546 + }, + { + "epoch": 2.539562818632586, + "grad_norm": 0.30084728236231956, + "learning_rate": 7.002523909717779e-06, + "loss": 2.5731, + "step": 54547 + }, + { + "epoch": 2.539609376818679, + "grad_norm": 0.3023696545972126, + "learning_rate": 7.00114149030664e-06, + "loss": 2.6768, + "step": 54548 + }, + { + "epoch": 2.539655935004772, + "grad_norm": 0.3113348231059159, + "learning_rate": 6.99975919709292e-06, + "loss": 2.6863, + "step": 54549 + }, + { + "epoch": 2.539702493190865, + "grad_norm": 0.31446017099395485, + "learning_rate": 6.998377030080677e-06, + "loss": 2.5982, + "step": 54550 + }, + { + "epoch": 2.5397490513769583, + "grad_norm": 0.30159294747248366, + "learning_rate": 6.996994989273942e-06, + "loss": 2.5946, + "step": 54551 + }, + { + "epoch": 2.5397956095630514, + "grad_norm": 0.3027181742067125, + "learning_rate": 6.99561307467681e-06, + "loss": 2.5803, + "step": 54552 + }, + { + "epoch": 2.5398421677491445, + "grad_norm": 0.2933098049745432, + "learning_rate": 6.994231286293306e-06, + "loss": 2.65, + "step": 54553 + }, + { + "epoch": 2.5398887259352376, + "grad_norm": 0.30219492567821343, + "learning_rate": 6.992849624127495e-06, + "loss": 2.6686, + "step": 54554 + }, + { + "epoch": 2.5399352841213307, + "grad_norm": 0.30150898935424575, + "learning_rate": 6.9914680881834506e-06, + "loss": 2.7162, + "step": 54555 + }, + { + "epoch": 2.539981842307424, + "grad_norm": 0.30193897357529653, + "learning_rate": 6.990086678465185e-06, + "loss": 2.7039, + "step": 54556 + }, + { + "epoch": 2.540028400493517, + "grad_norm": 0.3147658780164319, + "learning_rate": 6.988705394976796e-06, + "loss": 2.6379, + "step": 54557 + }, + { + "epoch": 2.54007495867961, + "grad_norm": 0.30255736122868476, + "learning_rate": 6.987324237722304e-06, + "loss": 2.5457, + "step": 54558 + }, + { + "epoch": 2.5401215168657028, + "grad_norm": 0.29653583831154795, + "learning_rate": 6.9859432067057975e-06, + "loss": 2.7, + "step": 54559 + }, + { + "epoch": 2.540168075051796, + "grad_norm": 0.292453105707496, + "learning_rate": 6.984562301931297e-06, + "loss": 2.6917, + "step": 54560 + }, + { + "epoch": 2.540214633237889, + "grad_norm": 0.3117797822754602, + "learning_rate": 6.983181523402871e-06, + "loss": 2.5985, + "step": 54561 + }, + { + "epoch": 2.540261191423982, + "grad_norm": 0.30447872797537934, + "learning_rate": 6.9818008711245665e-06, + "loss": 2.6959, + "step": 54562 + }, + { + "epoch": 2.540307749610075, + "grad_norm": 0.3028400231324538, + "learning_rate": 6.980420345100442e-06, + "loss": 2.6902, + "step": 54563 + }, + { + "epoch": 2.5403543077961683, + "grad_norm": 0.29383014307746236, + "learning_rate": 6.979039945334542e-06, + "loss": 2.7368, + "step": 54564 + }, + { + "epoch": 2.5404008659822614, + "grad_norm": 0.2949438472444354, + "learning_rate": 6.977659671830933e-06, + "loss": 2.674, + "step": 54565 + }, + { + "epoch": 2.5404474241683546, + "grad_norm": 0.30044673479538814, + "learning_rate": 6.976279524593643e-06, + "loss": 2.5521, + "step": 54566 + }, + { + "epoch": 
2.5404939823544472, + "grad_norm": 0.29811981022975603, + "learning_rate": 6.974899503626736e-06, + "loss": 2.6942, + "step": 54567 + }, + { + "epoch": 2.5405405405405403, + "grad_norm": 0.31762668744024253, + "learning_rate": 6.973519608934276e-06, + "loss": 2.6405, + "step": 54568 + }, + { + "epoch": 2.5405870987266335, + "grad_norm": 0.3039975251299975, + "learning_rate": 6.97213984052027e-06, + "loss": 2.6038, + "step": 54569 + }, + { + "epoch": 2.5406336569127266, + "grad_norm": 0.2963678044316241, + "learning_rate": 6.970760198388815e-06, + "loss": 2.647, + "step": 54570 + }, + { + "epoch": 2.5406802150988197, + "grad_norm": 0.3431562623195897, + "learning_rate": 6.9693806825439185e-06, + "loss": 2.6437, + "step": 54571 + }, + { + "epoch": 2.540726773284913, + "grad_norm": 0.3105114656199382, + "learning_rate": 6.968001292989673e-06, + "loss": 2.5561, + "step": 54572 + }, + { + "epoch": 2.540773331471006, + "grad_norm": 0.2889560101176193, + "learning_rate": 6.966622029730096e-06, + "loss": 2.76, + "step": 54573 + }, + { + "epoch": 2.540819889657099, + "grad_norm": 0.3050022685434889, + "learning_rate": 6.965242892769241e-06, + "loss": 2.7685, + "step": 54574 + }, + { + "epoch": 2.540866447843192, + "grad_norm": 0.3089925884024209, + "learning_rate": 6.96386388211116e-06, + "loss": 2.5015, + "step": 54575 + }, + { + "epoch": 2.5409130060292853, + "grad_norm": 0.32397660386680255, + "learning_rate": 6.962484997759894e-06, + "loss": 2.5947, + "step": 54576 + }, + { + "epoch": 2.5409595642153784, + "grad_norm": 0.30225488177742565, + "learning_rate": 6.961106239719501e-06, + "loss": 2.5866, + "step": 54577 + }, + { + "epoch": 2.5410061224014715, + "grad_norm": 0.31053801540859166, + "learning_rate": 6.9597276079940265e-06, + "loss": 2.7023, + "step": 54578 + }, + { + "epoch": 2.541052680587564, + "grad_norm": 0.3049723071004794, + "learning_rate": 6.958349102587508e-06, + "loss": 2.6388, + "step": 54579 + }, + { + "epoch": 2.5410992387736573, + "grad_norm": 0.3082024258110865, + "learning_rate": 6.956970723503986e-06, + "loss": 2.5435, + "step": 54580 + }, + { + "epoch": 2.5411457969597504, + "grad_norm": 0.28763622445292075, + "learning_rate": 6.955592470747518e-06, + "loss": 2.5218, + "step": 54581 + }, + { + "epoch": 2.5411923551458435, + "grad_norm": 0.3082304738379968, + "learning_rate": 6.954214344322147e-06, + "loss": 2.6221, + "step": 54582 + }, + { + "epoch": 2.5412389133319366, + "grad_norm": 0.3042210065600626, + "learning_rate": 6.952836344231928e-06, + "loss": 2.5909, + "step": 54583 + }, + { + "epoch": 2.5412854715180297, + "grad_norm": 0.3092485401329406, + "learning_rate": 6.9514584704808705e-06, + "loss": 2.5926, + "step": 54584 + }, + { + "epoch": 2.541332029704123, + "grad_norm": 0.31510803355273465, + "learning_rate": 6.95008072307306e-06, + "loss": 2.6057, + "step": 54585 + }, + { + "epoch": 2.5413785878902155, + "grad_norm": 0.29799168314817, + "learning_rate": 6.948703102012516e-06, + "loss": 2.5582, + "step": 54586 + }, + { + "epoch": 2.5414251460763086, + "grad_norm": 0.2887588758812107, + "learning_rate": 6.947325607303284e-06, + "loss": 2.6244, + "step": 54587 + }, + { + "epoch": 2.5414717042624018, + "grad_norm": 0.31426676664068326, + "learning_rate": 6.945948238949407e-06, + "loss": 2.6295, + "step": 54588 + }, + { + "epoch": 2.541518262448495, + "grad_norm": 0.3162705959555611, + "learning_rate": 6.944570996954935e-06, + "loss": 2.554, + "step": 54589 + }, + { + "epoch": 2.541564820634588, + "grad_norm": 0.33721869078100686, + "learning_rate": 
6.943193881323906e-06, + "loss": 2.601, + "step": 54590 + }, + { + "epoch": 2.541611378820681, + "grad_norm": 0.31952355081117023, + "learning_rate": 6.941816892060371e-06, + "loss": 2.5937, + "step": 54591 + }, + { + "epoch": 2.5416579370067742, + "grad_norm": 0.306424752506985, + "learning_rate": 6.940440029168349e-06, + "loss": 2.5272, + "step": 54592 + }, + { + "epoch": 2.5417044951928673, + "grad_norm": 0.30399495383111674, + "learning_rate": 6.939063292651893e-06, + "loss": 2.6753, + "step": 54593 + }, + { + "epoch": 2.5417510533789605, + "grad_norm": 0.29607454476501177, + "learning_rate": 6.937686682515043e-06, + "loss": 2.497, + "step": 54594 + }, + { + "epoch": 2.5417976115650536, + "grad_norm": 0.3054649506981187, + "learning_rate": 6.936310198761842e-06, + "loss": 2.5868, + "step": 54595 + }, + { + "epoch": 2.5418441697511467, + "grad_norm": 0.3041781161228975, + "learning_rate": 6.934933841396341e-06, + "loss": 2.6569, + "step": 54596 + }, + { + "epoch": 2.54189072793724, + "grad_norm": 0.32355963303440105, + "learning_rate": 6.933557610422542e-06, + "loss": 2.6021, + "step": 54597 + }, + { + "epoch": 2.5419372861233325, + "grad_norm": 0.2840356961924677, + "learning_rate": 6.932181505844531e-06, + "loss": 2.5813, + "step": 54598 + }, + { + "epoch": 2.5419838443094256, + "grad_norm": 0.31501606062492243, + "learning_rate": 6.930805527666307e-06, + "loss": 2.6341, + "step": 54599 + }, + { + "epoch": 2.5420304024955187, + "grad_norm": 0.30144502317296473, + "learning_rate": 6.929429675891941e-06, + "loss": 2.6908, + "step": 54600 + }, + { + "epoch": 2.542076960681612, + "grad_norm": 0.30305346617216083, + "learning_rate": 6.928053950525448e-06, + "loss": 2.5589, + "step": 54601 + }, + { + "epoch": 2.542123518867705, + "grad_norm": 0.30069452819082393, + "learning_rate": 6.926678351570876e-06, + "loss": 2.5959, + "step": 54602 + }, + { + "epoch": 2.542170077053798, + "grad_norm": 0.3155026083085067, + "learning_rate": 6.925302879032253e-06, + "loss": 2.5865, + "step": 54603 + }, + { + "epoch": 2.542216635239891, + "grad_norm": 0.3048270276729671, + "learning_rate": 6.923927532913638e-06, + "loss": 2.701, + "step": 54604 + }, + { + "epoch": 2.5422631934259843, + "grad_norm": 0.3049481599607395, + "learning_rate": 6.922552313219038e-06, + "loss": 2.7339, + "step": 54605 + }, + { + "epoch": 2.542309751612077, + "grad_norm": 0.30281435070726015, + "learning_rate": 6.921177219952507e-06, + "loss": 2.7172, + "step": 54606 + }, + { + "epoch": 2.54235630979817, + "grad_norm": 0.30958014465937805, + "learning_rate": 6.919802253118074e-06, + "loss": 2.5727, + "step": 54607 + }, + { + "epoch": 2.542402867984263, + "grad_norm": 0.2916324925390542, + "learning_rate": 6.91842741271978e-06, + "loss": 2.6494, + "step": 54608 + }, + { + "epoch": 2.5424494261703563, + "grad_norm": 0.30123410472902623, + "learning_rate": 6.917052698761667e-06, + "loss": 2.5372, + "step": 54609 + }, + { + "epoch": 2.5424959843564494, + "grad_norm": 0.29666735697327395, + "learning_rate": 6.915678111247736e-06, + "loss": 2.6508, + "step": 54610 + }, + { + "epoch": 2.5425425425425425, + "grad_norm": 0.3017987016506688, + "learning_rate": 6.914303650182069e-06, + "loss": 2.6673, + "step": 54611 + }, + { + "epoch": 2.5425891007286356, + "grad_norm": 0.3050932750606059, + "learning_rate": 6.912929315568656e-06, + "loss": 2.6589, + "step": 54612 + }, + { + "epoch": 2.5426356589147288, + "grad_norm": 0.29249392386955103, + "learning_rate": 6.9115551074115715e-06, + "loss": 2.6593, + "step": 54613 + }, + { + "epoch": 
2.542682217100822, + "grad_norm": 0.3203976233916801, + "learning_rate": 6.910181025714812e-06, + "loss": 2.6445, + "step": 54614 + }, + { + "epoch": 2.542728775286915, + "grad_norm": 0.3124501208765789, + "learning_rate": 6.908807070482431e-06, + "loss": 2.611, + "step": 54615 + }, + { + "epoch": 2.542775333473008, + "grad_norm": 0.31506762236032476, + "learning_rate": 6.907433241718453e-06, + "loss": 2.6445, + "step": 54616 + }, + { + "epoch": 2.542821891659101, + "grad_norm": 0.30391729757389385, + "learning_rate": 6.906059539426918e-06, + "loss": 2.5894, + "step": 54617 + }, + { + "epoch": 2.542868449845194, + "grad_norm": 0.3177141842306445, + "learning_rate": 6.904685963611862e-06, + "loss": 2.7064, + "step": 54618 + }, + { + "epoch": 2.542915008031287, + "grad_norm": 0.3260643617997656, + "learning_rate": 6.903312514277294e-06, + "loss": 2.6422, + "step": 54619 + }, + { + "epoch": 2.54296156621738, + "grad_norm": 0.3164328648435777, + "learning_rate": 6.901939191427259e-06, + "loss": 2.6711, + "step": 54620 + }, + { + "epoch": 2.5430081244034732, + "grad_norm": 0.32427317912767445, + "learning_rate": 6.900565995065788e-06, + "loss": 2.5992, + "step": 54621 + }, + { + "epoch": 2.5430546825895664, + "grad_norm": 0.34478897456273144, + "learning_rate": 6.899192925196918e-06, + "loss": 2.6311, + "step": 54622 + }, + { + "epoch": 2.5431012407756595, + "grad_norm": 0.3135453748388471, + "learning_rate": 6.897819981824649e-06, + "loss": 2.6243, + "step": 54623 + }, + { + "epoch": 2.5431477989617526, + "grad_norm": 0.3078737676884046, + "learning_rate": 6.896447164953057e-06, + "loss": 2.6212, + "step": 54624 + }, + { + "epoch": 2.5431943571478453, + "grad_norm": 0.3106049883605045, + "learning_rate": 6.895074474586122e-06, + "loss": 2.6308, + "step": 54625 + }, + { + "epoch": 2.5432409153339384, + "grad_norm": 0.3119214062361037, + "learning_rate": 6.893701910727917e-06, + "loss": 2.644, + "step": 54626 + }, + { + "epoch": 2.5432874735200315, + "grad_norm": 0.3131801013317937, + "learning_rate": 6.892329473382442e-06, + "loss": 2.5444, + "step": 54627 + }, + { + "epoch": 2.5433340317061246, + "grad_norm": 0.31200204425286326, + "learning_rate": 6.890957162553735e-06, + "loss": 2.6542, + "step": 54628 + }, + { + "epoch": 2.5433805898922177, + "grad_norm": 0.3175108590901864, + "learning_rate": 6.889584978245822e-06, + "loss": 2.6798, + "step": 54629 + }, + { + "epoch": 2.543427148078311, + "grad_norm": 0.2947121313101149, + "learning_rate": 6.888212920462728e-06, + "loss": 2.5826, + "step": 54630 + }, + { + "epoch": 2.543473706264404, + "grad_norm": 0.31927550855665704, + "learning_rate": 6.886840989208487e-06, + "loss": 2.7255, + "step": 54631 + }, + { + "epoch": 2.543520264450497, + "grad_norm": 0.3060216486018373, + "learning_rate": 6.88546918448712e-06, + "loss": 2.6439, + "step": 54632 + }, + { + "epoch": 2.54356682263659, + "grad_norm": 0.32133478552448574, + "learning_rate": 6.884097506302645e-06, + "loss": 2.7335, + "step": 54633 + }, + { + "epoch": 2.5436133808226833, + "grad_norm": 0.3060890505573444, + "learning_rate": 6.882725954659103e-06, + "loss": 2.5507, + "step": 54634 + }, + { + "epoch": 2.5436599390087764, + "grad_norm": 0.2899919882040021, + "learning_rate": 6.8813545295605075e-06, + "loss": 2.6869, + "step": 54635 + }, + { + "epoch": 2.5437064971948695, + "grad_norm": 0.32086201657180335, + "learning_rate": 6.8799832310108895e-06, + "loss": 2.7013, + "step": 54636 + }, + { + "epoch": 2.543753055380962, + "grad_norm": 0.3196632258753878, + "learning_rate": 
6.878612059014283e-06, + "loss": 2.6764, + "step": 54637 + }, + { + "epoch": 2.5437996135670553, + "grad_norm": 0.28750928401564435, + "learning_rate": 6.877241013574681e-06, + "loss": 2.5756, + "step": 54638 + }, + { + "epoch": 2.5438461717531484, + "grad_norm": 0.3014179361475458, + "learning_rate": 6.875870094696152e-06, + "loss": 2.5933, + "step": 54639 + }, + { + "epoch": 2.5438927299392415, + "grad_norm": 0.30566883236689907, + "learning_rate": 6.874499302382681e-06, + "loss": 2.611, + "step": 54640 + }, + { + "epoch": 2.5439392881253347, + "grad_norm": 0.3062963778165598, + "learning_rate": 6.87312863663831e-06, + "loss": 2.6512, + "step": 54641 + }, + { + "epoch": 2.5439858463114278, + "grad_norm": 0.30597893965755196, + "learning_rate": 6.871758097467057e-06, + "loss": 2.707, + "step": 54642 + }, + { + "epoch": 2.544032404497521, + "grad_norm": 0.3189854421435151, + "learning_rate": 6.870387684872942e-06, + "loss": 2.6381, + "step": 54643 + }, + { + "epoch": 2.544078962683614, + "grad_norm": 0.298043353918462, + "learning_rate": 6.869017398860006e-06, + "loss": 2.6338, + "step": 54644 + }, + { + "epoch": 2.5441255208697067, + "grad_norm": 0.2990015817671477, + "learning_rate": 6.86764723943224e-06, + "loss": 2.6586, + "step": 54645 + }, + { + "epoch": 2.5441720790558, + "grad_norm": 0.29748018197781667, + "learning_rate": 6.86627720659368e-06, + "loss": 2.6243, + "step": 54646 + }, + { + "epoch": 2.544218637241893, + "grad_norm": 0.2979143444843442, + "learning_rate": 6.86490730034835e-06, + "loss": 2.5966, + "step": 54647 + }, + { + "epoch": 2.544265195427986, + "grad_norm": 0.2913316443579632, + "learning_rate": 6.8635375207002635e-06, + "loss": 2.6226, + "step": 54648 + }, + { + "epoch": 2.544311753614079, + "grad_norm": 0.3052966782627012, + "learning_rate": 6.862167867653446e-06, + "loss": 2.5669, + "step": 54649 + }, + { + "epoch": 2.5443583118001722, + "grad_norm": 0.29314242016711106, + "learning_rate": 6.860798341211932e-06, + "loss": 2.5799, + "step": 54650 + }, + { + "epoch": 2.5444048699862654, + "grad_norm": 0.3060885071590545, + "learning_rate": 6.8594289413796975e-06, + "loss": 2.5987, + "step": 54651 + }, + { + "epoch": 2.5444514281723585, + "grad_norm": 0.29753644026111625, + "learning_rate": 6.858059668160815e-06, + "loss": 2.6967, + "step": 54652 + }, + { + "epoch": 2.5444979863584516, + "grad_norm": 0.3116459903312429, + "learning_rate": 6.856690521559262e-06, + "loss": 2.6774, + "step": 54653 + }, + { + "epoch": 2.5445445445445447, + "grad_norm": 0.31360902610733643, + "learning_rate": 6.855321501579076e-06, + "loss": 2.6545, + "step": 54654 + }, + { + "epoch": 2.544591102730638, + "grad_norm": 0.3118141913986686, + "learning_rate": 6.853952608224268e-06, + "loss": 2.6244, + "step": 54655 + }, + { + "epoch": 2.544637660916731, + "grad_norm": 0.3115480601373423, + "learning_rate": 6.852583841498855e-06, + "loss": 2.6257, + "step": 54656 + }, + { + "epoch": 2.5446842191028236, + "grad_norm": 0.30927393832318467, + "learning_rate": 6.851215201406874e-06, + "loss": 2.6003, + "step": 54657 + }, + { + "epoch": 2.5447307772889167, + "grad_norm": 0.2976421750012697, + "learning_rate": 6.849846687952316e-06, + "loss": 2.6378, + "step": 54658 + }, + { + "epoch": 2.54477733547501, + "grad_norm": 0.3026196054300638, + "learning_rate": 6.848478301139205e-06, + "loss": 2.6436, + "step": 54659 + }, + { + "epoch": 2.544823893661103, + "grad_norm": 0.3162017274272728, + "learning_rate": 6.847110040971555e-06, + "loss": 2.6631, + "step": 54660 + }, + { + "epoch": 
2.544870451847196, + "grad_norm": 0.3032499587060165, + "learning_rate": 6.8457419074533905e-06, + "loss": 2.6046, + "step": 54661 + }, + { + "epoch": 2.544917010033289, + "grad_norm": 0.3120807628567628, + "learning_rate": 6.844373900588718e-06, + "loss": 2.6416, + "step": 54662 + }, + { + "epoch": 2.5449635682193823, + "grad_norm": 0.3079720244277379, + "learning_rate": 6.843006020381576e-06, + "loss": 2.6988, + "step": 54663 + }, + { + "epoch": 2.545010126405475, + "grad_norm": 0.3075825037870256, + "learning_rate": 6.841638266835931e-06, + "loss": 2.6328, + "step": 54664 + }, + { + "epoch": 2.545056684591568, + "grad_norm": 0.30689843786681964, + "learning_rate": 6.840270639955848e-06, + "loss": 2.5405, + "step": 54665 + }, + { + "epoch": 2.545103242777661, + "grad_norm": 0.3113436396240316, + "learning_rate": 6.838903139745306e-06, + "loss": 2.5862, + "step": 54666 + }, + { + "epoch": 2.5451498009637543, + "grad_norm": 0.3110135388121349, + "learning_rate": 6.837535766208336e-06, + "loss": 2.6592, + "step": 54667 + }, + { + "epoch": 2.5451963591498474, + "grad_norm": 0.31337471455432764, + "learning_rate": 6.836168519348946e-06, + "loss": 2.6076, + "step": 54668 + }, + { + "epoch": 2.5452429173359405, + "grad_norm": 0.2926755093280262, + "learning_rate": 6.834801399171148e-06, + "loss": 2.6096, + "step": 54669 + }, + { + "epoch": 2.5452894755220337, + "grad_norm": 0.30171616799011575, + "learning_rate": 6.833434405678962e-06, + "loss": 2.589, + "step": 54670 + }, + { + "epoch": 2.5453360337081268, + "grad_norm": 0.3075626592686065, + "learning_rate": 6.832067538876374e-06, + "loss": 2.5538, + "step": 54671 + }, + { + "epoch": 2.54538259189422, + "grad_norm": 0.31054698038696044, + "learning_rate": 6.830700798767442e-06, + "loss": 2.7436, + "step": 54672 + }, + { + "epoch": 2.545429150080313, + "grad_norm": 0.2967593274697852, + "learning_rate": 6.829334185356134e-06, + "loss": 2.6173, + "step": 54673 + }, + { + "epoch": 2.545475708266406, + "grad_norm": 0.3165671246880607, + "learning_rate": 6.827967698646481e-06, + "loss": 2.6213, + "step": 54674 + }, + { + "epoch": 2.5455222664524992, + "grad_norm": 0.29620134900736306, + "learning_rate": 6.826601338642485e-06, + "loss": 2.6967, + "step": 54675 + }, + { + "epoch": 2.5455688246385924, + "grad_norm": 0.2947193461935076, + "learning_rate": 6.825235105348171e-06, + "loss": 2.6866, + "step": 54676 + }, + { + "epoch": 2.545615382824685, + "grad_norm": 0.3116525015351663, + "learning_rate": 6.823868998767524e-06, + "loss": 2.7422, + "step": 54677 + }, + { + "epoch": 2.545661941010778, + "grad_norm": 0.2992869061887927, + "learning_rate": 6.8225030189045855e-06, + "loss": 2.6918, + "step": 54678 + }, + { + "epoch": 2.5457084991968713, + "grad_norm": 0.3079166590034631, + "learning_rate": 6.821137165763336e-06, + "loss": 2.6962, + "step": 54679 + }, + { + "epoch": 2.5457550573829644, + "grad_norm": 0.3061887152983592, + "learning_rate": 6.819771439347794e-06, + "loss": 2.5908, + "step": 54680 + }, + { + "epoch": 2.5458016155690575, + "grad_norm": 0.30220326401134967, + "learning_rate": 6.818405839661968e-06, + "loss": 2.6038, + "step": 54681 + }, + { + "epoch": 2.5458481737551506, + "grad_norm": 0.29261994707444594, + "learning_rate": 6.817040366709871e-06, + "loss": 2.6193, + "step": 54682 + }, + { + "epoch": 2.5458947319412437, + "grad_norm": 0.33100630208706927, + "learning_rate": 6.815675020495515e-06, + "loss": 2.5327, + "step": 54683 + }, + { + "epoch": 2.5459412901273364, + "grad_norm": 0.30974294109727857, + "learning_rate": 
6.814309801022872e-06, + "loss": 2.5839, + "step": 54684 + }, + { + "epoch": 2.5459878483134295, + "grad_norm": 0.33321179804418327, + "learning_rate": 6.812944708296004e-06, + "loss": 2.6263, + "step": 54685 + }, + { + "epoch": 2.5460344064995226, + "grad_norm": 0.3144238071449139, + "learning_rate": 6.811579742318875e-06, + "loss": 2.5882, + "step": 54686 + }, + { + "epoch": 2.5460809646856157, + "grad_norm": 0.32490306123652923, + "learning_rate": 6.810214903095507e-06, + "loss": 2.6874, + "step": 54687 + }, + { + "epoch": 2.546127522871709, + "grad_norm": 0.29680738596246536, + "learning_rate": 6.8088501906299e-06, + "loss": 2.6224, + "step": 54688 + }, + { + "epoch": 2.546174081057802, + "grad_norm": 0.31036837976431075, + "learning_rate": 6.8074856049260615e-06, + "loss": 2.706, + "step": 54689 + }, + { + "epoch": 2.546220639243895, + "grad_norm": 0.2948740567516493, + "learning_rate": 6.806121145988004e-06, + "loss": 2.6513, + "step": 54690 + }, + { + "epoch": 2.546267197429988, + "grad_norm": 0.30538720302970374, + "learning_rate": 6.804756813819729e-06, + "loss": 2.5679, + "step": 54691 + }, + { + "epoch": 2.5463137556160813, + "grad_norm": 0.2993626247124608, + "learning_rate": 6.803392608425229e-06, + "loss": 2.544, + "step": 54692 + }, + { + "epoch": 2.5463603138021744, + "grad_norm": 0.2994700693468467, + "learning_rate": 6.802028529808518e-06, + "loss": 2.6401, + "step": 54693 + }, + { + "epoch": 2.5464068719882675, + "grad_norm": 0.3143504599077799, + "learning_rate": 6.800664577973598e-06, + "loss": 2.6622, + "step": 54694 + }, + { + "epoch": 2.5464534301743607, + "grad_norm": 0.300431094555822, + "learning_rate": 6.79930075292447e-06, + "loss": 2.6647, + "step": 54695 + }, + { + "epoch": 2.5464999883604533, + "grad_norm": 0.3188456797704342, + "learning_rate": 6.797937054665149e-06, + "loss": 2.6186, + "step": 54696 + }, + { + "epoch": 2.5465465465465464, + "grad_norm": 0.307261990312841, + "learning_rate": 6.796573483199608e-06, + "loss": 2.6544, + "step": 54697 + }, + { + "epoch": 2.5465931047326396, + "grad_norm": 0.2924440969177336, + "learning_rate": 6.795210038531885e-06, + "loss": 2.6964, + "step": 54698 + }, + { + "epoch": 2.5466396629187327, + "grad_norm": 0.3047137605627894, + "learning_rate": 6.793846720665953e-06, + "loss": 2.5839, + "step": 54699 + }, + { + "epoch": 2.546686221104826, + "grad_norm": 0.29419034616824685, + "learning_rate": 6.792483529605831e-06, + "loss": 2.6504, + "step": 54700 + }, + { + "epoch": 2.546732779290919, + "grad_norm": 0.29627265125711444, + "learning_rate": 6.791120465355505e-06, + "loss": 2.5787, + "step": 54701 + }, + { + "epoch": 2.546779337477012, + "grad_norm": 0.3060818132356623, + "learning_rate": 6.789757527918989e-06, + "loss": 2.6763, + "step": 54702 + }, + { + "epoch": 2.5468258956631047, + "grad_norm": 0.30748828422182056, + "learning_rate": 6.788394717300273e-06, + "loss": 2.6187, + "step": 54703 + }, + { + "epoch": 2.546872453849198, + "grad_norm": 0.2989947155405676, + "learning_rate": 6.7870320335033765e-06, + "loss": 2.5921, + "step": 54704 + }, + { + "epoch": 2.546919012035291, + "grad_norm": 0.32504050643367666, + "learning_rate": 6.7856694765322746e-06, + "loss": 2.5531, + "step": 54705 + }, + { + "epoch": 2.546965570221384, + "grad_norm": 0.30559951545177516, + "learning_rate": 6.78430704639097e-06, + "loss": 2.6303, + "step": 54706 + }, + { + "epoch": 2.547012128407477, + "grad_norm": 0.29479061863250794, + "learning_rate": 6.782944743083469e-06, + "loss": 2.5789, + "step": 54707 + }, + { + "epoch": 
2.5470586865935703, + "grad_norm": 0.30156600377970494, + "learning_rate": 6.78158256661377e-06, + "loss": 2.5634, + "step": 54708 + }, + { + "epoch": 2.5471052447796634, + "grad_norm": 0.3194549094024009, + "learning_rate": 6.7802205169858746e-06, + "loss": 2.6265, + "step": 54709 + }, + { + "epoch": 2.5471518029657565, + "grad_norm": 0.31058781997204465, + "learning_rate": 6.778858594203757e-06, + "loss": 2.6583, + "step": 54710 + }, + { + "epoch": 2.5471983611518496, + "grad_norm": 0.31974565372939406, + "learning_rate": 6.777496798271449e-06, + "loss": 2.6385, + "step": 54711 + }, + { + "epoch": 2.5472449193379427, + "grad_norm": 0.3044843762496756, + "learning_rate": 6.776135129192923e-06, + "loss": 2.6409, + "step": 54712 + }, + { + "epoch": 2.547291477524036, + "grad_norm": 0.33036697830355644, + "learning_rate": 6.7747735869721774e-06, + "loss": 2.5256, + "step": 54713 + }, + { + "epoch": 2.547338035710129, + "grad_norm": 0.3110556999573398, + "learning_rate": 6.773412171613219e-06, + "loss": 2.652, + "step": 54714 + }, + { + "epoch": 2.547384593896222, + "grad_norm": 0.3111613303840017, + "learning_rate": 6.7720508831200345e-06, + "loss": 2.6101, + "step": 54715 + }, + { + "epoch": 2.5474311520823147, + "grad_norm": 0.33706665284418497, + "learning_rate": 6.77068972149662e-06, + "loss": 2.665, + "step": 54716 + }, + { + "epoch": 2.547477710268408, + "grad_norm": 0.3095829694888025, + "learning_rate": 6.769328686746984e-06, + "loss": 2.6296, + "step": 54717 + }, + { + "epoch": 2.547524268454501, + "grad_norm": 0.3292926365052862, + "learning_rate": 6.7679677788751e-06, + "loss": 2.5587, + "step": 54718 + }, + { + "epoch": 2.547570826640594, + "grad_norm": 0.31188146697824587, + "learning_rate": 6.766606997884972e-06, + "loss": 2.6117, + "step": 54719 + }, + { + "epoch": 2.547617384826687, + "grad_norm": 0.30329280702117173, + "learning_rate": 6.76524634378059e-06, + "loss": 2.7052, + "step": 54720 + }, + { + "epoch": 2.5476639430127803, + "grad_norm": 0.31035944341796484, + "learning_rate": 6.763885816565957e-06, + "loss": 2.6613, + "step": 54721 + }, + { + "epoch": 2.5477105011988734, + "grad_norm": 0.3316486224489497, + "learning_rate": 6.7625254162450635e-06, + "loss": 2.6384, + "step": 54722 + }, + { + "epoch": 2.547757059384966, + "grad_norm": 0.32176986069351926, + "learning_rate": 6.761165142821879e-06, + "loss": 2.5991, + "step": 54723 + }, + { + "epoch": 2.547803617571059, + "grad_norm": 0.3002501268240292, + "learning_rate": 6.759804996300439e-06, + "loss": 2.5493, + "step": 54724 + }, + { + "epoch": 2.5478501757571523, + "grad_norm": 0.313024369372237, + "learning_rate": 6.758444976684686e-06, + "loss": 2.6651, + "step": 54725 + }, + { + "epoch": 2.5478967339432455, + "grad_norm": 0.30337592287657933, + "learning_rate": 6.75708508397866e-06, + "loss": 2.6832, + "step": 54726 + }, + { + "epoch": 2.5479432921293386, + "grad_norm": 0.3063441065629964, + "learning_rate": 6.755725318186318e-06, + "loss": 2.6417, + "step": 54727 + }, + { + "epoch": 2.5479898503154317, + "grad_norm": 0.3290843386167111, + "learning_rate": 6.754365679311664e-06, + "loss": 2.6999, + "step": 54728 + }, + { + "epoch": 2.548036408501525, + "grad_norm": 0.3063096025103257, + "learning_rate": 6.753006167358688e-06, + "loss": 2.6722, + "step": 54729 + }, + { + "epoch": 2.548082966687618, + "grad_norm": 0.31855922544598114, + "learning_rate": 6.751646782331383e-06, + "loss": 2.7704, + "step": 54730 + }, + { + "epoch": 2.548129524873711, + "grad_norm": 0.3214632032963617, + "learning_rate": 
6.750287524233728e-06, + "loss": 2.5951, + "step": 54731 + }, + { + "epoch": 2.548176083059804, + "grad_norm": 0.3267518124992745, + "learning_rate": 6.748928393069714e-06, + "loss": 2.5225, + "step": 54732 + }, + { + "epoch": 2.5482226412458973, + "grad_norm": 0.3220114575887556, + "learning_rate": 6.747569388843339e-06, + "loss": 2.5661, + "step": 54733 + }, + { + "epoch": 2.5482691994319904, + "grad_norm": 0.33629068307837645, + "learning_rate": 6.746210511558587e-06, + "loss": 2.6927, + "step": 54734 + }, + { + "epoch": 2.548315757618083, + "grad_norm": 0.311014043351318, + "learning_rate": 6.744851761219456e-06, + "loss": 2.6401, + "step": 54735 + }, + { + "epoch": 2.548362315804176, + "grad_norm": 0.3186298965386468, + "learning_rate": 6.743493137829898e-06, + "loss": 2.6355, + "step": 54736 + }, + { + "epoch": 2.5484088739902693, + "grad_norm": 0.34636276430746393, + "learning_rate": 6.742134641393949e-06, + "loss": 2.6583, + "step": 54737 + }, + { + "epoch": 2.5484554321763624, + "grad_norm": 0.3563632920711793, + "learning_rate": 6.740776271915555e-06, + "loss": 2.652, + "step": 54738 + }, + { + "epoch": 2.5485019903624555, + "grad_norm": 0.30267286572695173, + "learning_rate": 6.739418029398742e-06, + "loss": 2.6489, + "step": 54739 + }, + { + "epoch": 2.5485485485485486, + "grad_norm": 0.301127084534658, + "learning_rate": 6.738059913847461e-06, + "loss": 2.7149, + "step": 54740 + }, + { + "epoch": 2.5485951067346417, + "grad_norm": 0.3197039429510173, + "learning_rate": 6.7367019252657146e-06, + "loss": 2.6249, + "step": 54741 + }, + { + "epoch": 2.548641664920735, + "grad_norm": 0.318598191790939, + "learning_rate": 6.735344063657484e-06, + "loss": 2.5388, + "step": 54742 + }, + { + "epoch": 2.5486882231068275, + "grad_norm": 0.3388730436361529, + "learning_rate": 6.733986329026759e-06, + "loss": 2.6618, + "step": 54743 + }, + { + "epoch": 2.5487347812929206, + "grad_norm": 0.311222693208093, + "learning_rate": 6.732628721377532e-06, + "loss": 2.6251, + "step": 54744 + }, + { + "epoch": 2.5487813394790138, + "grad_norm": 0.31456487024166807, + "learning_rate": 6.731271240713766e-06, + "loss": 2.6582, + "step": 54745 + }, + { + "epoch": 2.548827897665107, + "grad_norm": 0.31274917089281246, + "learning_rate": 6.729913887039457e-06, + "loss": 2.6645, + "step": 54746 + }, + { + "epoch": 2.5488744558512, + "grad_norm": 0.363447393280565, + "learning_rate": 6.728556660358587e-06, + "loss": 2.566, + "step": 54747 + }, + { + "epoch": 2.548921014037293, + "grad_norm": 0.32780275947234394, + "learning_rate": 6.727199560675151e-06, + "loss": 2.6711, + "step": 54748 + }, + { + "epoch": 2.548967572223386, + "grad_norm": 0.3222647078027608, + "learning_rate": 6.7258425879930975e-06, + "loss": 2.6126, + "step": 54749 + }, + { + "epoch": 2.5490141304094793, + "grad_norm": 0.3207941301911718, + "learning_rate": 6.724485742316456e-06, + "loss": 2.5077, + "step": 54750 + }, + { + "epoch": 2.5490606885955724, + "grad_norm": 0.36551856493400814, + "learning_rate": 6.723129023649166e-06, + "loss": 2.7206, + "step": 54751 + }, + { + "epoch": 2.5491072467816656, + "grad_norm": 0.3581024624454569, + "learning_rate": 6.721772431995249e-06, + "loss": 2.6702, + "step": 54752 + }, + { + "epoch": 2.5491538049677587, + "grad_norm": 0.2971343706277949, + "learning_rate": 6.720415967358651e-06, + "loss": 2.6342, + "step": 54753 + }, + { + "epoch": 2.549200363153852, + "grad_norm": 0.3028973155066806, + "learning_rate": 6.719059629743374e-06, + "loss": 2.6181, + "step": 54754 + }, + { + "epoch": 
2.5492469213399445, + "grad_norm": 0.3294124332451608, + "learning_rate": 6.7177034191533925e-06, + "loss": 2.5586, + "step": 54755 + }, + { + "epoch": 2.5492934795260376, + "grad_norm": 0.310125438613607, + "learning_rate": 6.716347335592687e-06, + "loss": 2.6192, + "step": 54756 + }, + { + "epoch": 2.5493400377121307, + "grad_norm": 0.3402638819761211, + "learning_rate": 6.71499137906525e-06, + "loss": 2.6295, + "step": 54757 + }, + { + "epoch": 2.549386595898224, + "grad_norm": 0.29381561161852304, + "learning_rate": 6.713635549575037e-06, + "loss": 2.5647, + "step": 54758 + }, + { + "epoch": 2.549433154084317, + "grad_norm": 0.3126530635268905, + "learning_rate": 6.7122798471260355e-06, + "loss": 2.4838, + "step": 54759 + }, + { + "epoch": 2.54947971227041, + "grad_norm": 0.34770793818279894, + "learning_rate": 6.7109242717222295e-06, + "loss": 2.6282, + "step": 54760 + }, + { + "epoch": 2.549526270456503, + "grad_norm": 0.3370760946031889, + "learning_rate": 6.7095688233676015e-06, + "loss": 2.7148, + "step": 54761 + }, + { + "epoch": 2.549572828642596, + "grad_norm": 0.3126970519083464, + "learning_rate": 6.708213502066118e-06, + "loss": 2.6141, + "step": 54762 + }, + { + "epoch": 2.549619386828689, + "grad_norm": 0.3041602315645261, + "learning_rate": 6.706858307821778e-06, + "loss": 2.6019, + "step": 54763 + }, + { + "epoch": 2.549665945014782, + "grad_norm": 0.32999366660761276, + "learning_rate": 6.705503240638522e-06, + "loss": 2.6873, + "step": 54764 + }, + { + "epoch": 2.549712503200875, + "grad_norm": 0.3248445360645335, + "learning_rate": 6.704148300520369e-06, + "loss": 2.5813, + "step": 54765 + }, + { + "epoch": 2.5497590613869683, + "grad_norm": 0.32560672712409194, + "learning_rate": 6.702793487471265e-06, + "loss": 2.7648, + "step": 54766 + }, + { + "epoch": 2.5498056195730614, + "grad_norm": 0.30419549925003553, + "learning_rate": 6.701438801495197e-06, + "loss": 2.6596, + "step": 54767 + }, + { + "epoch": 2.5498521777591545, + "grad_norm": 0.30127043377645263, + "learning_rate": 6.700084242596138e-06, + "loss": 2.6521, + "step": 54768 + }, + { + "epoch": 2.5498987359452476, + "grad_norm": 0.32255353829546046, + "learning_rate": 6.698729810778065e-06, + "loss": 2.7172, + "step": 54769 + }, + { + "epoch": 2.5499452941313407, + "grad_norm": 0.30768927471917284, + "learning_rate": 6.697375506044968e-06, + "loss": 2.6192, + "step": 54770 + }, + { + "epoch": 2.549991852317434, + "grad_norm": 0.307350879389514, + "learning_rate": 6.6960213284007976e-06, + "loss": 2.6114, + "step": 54771 + }, + { + "epoch": 2.550038410503527, + "grad_norm": 0.3066135077084336, + "learning_rate": 6.694667277849537e-06, + "loss": 2.5421, + "step": 54772 + }, + { + "epoch": 2.55008496868962, + "grad_norm": 0.29677945241849013, + "learning_rate": 6.6933133543951645e-06, + "loss": 2.6441, + "step": 54773 + }, + { + "epoch": 2.5501315268757128, + "grad_norm": 0.30106076746042304, + "learning_rate": 6.691959558041649e-06, + "loss": 2.6443, + "step": 54774 + }, + { + "epoch": 2.550178085061806, + "grad_norm": 0.3192115603910741, + "learning_rate": 6.690605888792961e-06, + "loss": 2.674, + "step": 54775 + }, + { + "epoch": 2.550224643247899, + "grad_norm": 0.31186298681224417, + "learning_rate": 6.6892523466530965e-06, + "loss": 2.7181, + "step": 54776 + }, + { + "epoch": 2.550271201433992, + "grad_norm": 0.31242737563213174, + "learning_rate": 6.687898931625986e-06, + "loss": 2.6576, + "step": 54777 + }, + { + "epoch": 2.5503177596200852, + "grad_norm": 0.3135367348832489, + "learning_rate": 
6.686545643715641e-06, + "loss": 2.7238, + "step": 54778 + }, + { + "epoch": 2.5503643178061783, + "grad_norm": 0.30312124156678955, + "learning_rate": 6.6851924829260006e-06, + "loss": 2.6111, + "step": 54779 + }, + { + "epoch": 2.5504108759922715, + "grad_norm": 0.3105415756070449, + "learning_rate": 6.683839449261076e-06, + "loss": 2.596, + "step": 54780 + }, + { + "epoch": 2.5504574341783646, + "grad_norm": 0.3068704341202659, + "learning_rate": 6.682486542724803e-06, + "loss": 2.6509, + "step": 54781 + }, + { + "epoch": 2.5505039923644572, + "grad_norm": 0.30269315496507265, + "learning_rate": 6.681133763321163e-06, + "loss": 2.6301, + "step": 54782 + }, + { + "epoch": 2.5505505505505504, + "grad_norm": 0.2876293436761545, + "learning_rate": 6.6797811110541396e-06, + "loss": 2.6378, + "step": 54783 + }, + { + "epoch": 2.5505971087366435, + "grad_norm": 0.30338242233180573, + "learning_rate": 6.678428585927671e-06, + "loss": 2.5631, + "step": 54784 + }, + { + "epoch": 2.5506436669227366, + "grad_norm": 0.30868389198579205, + "learning_rate": 6.677076187945763e-06, + "loss": 2.5142, + "step": 54785 + }, + { + "epoch": 2.5506902251088297, + "grad_norm": 0.30949614244449397, + "learning_rate": 6.6757239171123646e-06, + "loss": 2.7018, + "step": 54786 + }, + { + "epoch": 2.550736783294923, + "grad_norm": 0.3225747507333037, + "learning_rate": 6.674371773431443e-06, + "loss": 2.6121, + "step": 54787 + }, + { + "epoch": 2.550783341481016, + "grad_norm": 0.3133273214033729, + "learning_rate": 6.673019756906973e-06, + "loss": 2.5921, + "step": 54788 + }, + { + "epoch": 2.550829899667109, + "grad_norm": 0.31127194237696015, + "learning_rate": 6.671667867542936e-06, + "loss": 2.6333, + "step": 54789 + }, + { + "epoch": 2.550876457853202, + "grad_norm": 0.3148565338872268, + "learning_rate": 6.670316105343261e-06, + "loss": 2.5603, + "step": 54790 + }, + { + "epoch": 2.5509230160392953, + "grad_norm": 0.29276055703714593, + "learning_rate": 6.6689644703119615e-06, + "loss": 2.5978, + "step": 54791 + }, + { + "epoch": 2.5509695742253884, + "grad_norm": 0.3044657820543715, + "learning_rate": 6.667612962452957e-06, + "loss": 2.658, + "step": 54792 + }, + { + "epoch": 2.5510161324114815, + "grad_norm": 0.32571782844024866, + "learning_rate": 6.666261581770267e-06, + "loss": 2.6543, + "step": 54793 + }, + { + "epoch": 2.551062690597574, + "grad_norm": 0.31488148348596, + "learning_rate": 6.6649103282678146e-06, + "loss": 2.5776, + "step": 54794 + }, + { + "epoch": 2.5511092487836673, + "grad_norm": 0.3233667726261945, + "learning_rate": 6.663559201949587e-06, + "loss": 2.6959, + "step": 54795 + }, + { + "epoch": 2.5511558069697604, + "grad_norm": 0.29467504460038846, + "learning_rate": 6.662208202819548e-06, + "loss": 2.629, + "step": 54796 + }, + { + "epoch": 2.5512023651558535, + "grad_norm": 0.32332148549836504, + "learning_rate": 6.660857330881643e-06, + "loss": 2.7648, + "step": 54797 + }, + { + "epoch": 2.5512489233419466, + "grad_norm": 0.3017910479098927, + "learning_rate": 6.659506586139868e-06, + "loss": 2.5294, + "step": 54798 + }, + { + "epoch": 2.5512954815280398, + "grad_norm": 0.3193377851952576, + "learning_rate": 6.6581559685981675e-06, + "loss": 2.6486, + "step": 54799 + }, + { + "epoch": 2.551342039714133, + "grad_norm": 0.30322921167884354, + "learning_rate": 6.6568054782605026e-06, + "loss": 2.5861, + "step": 54800 + }, + { + "epoch": 2.5513885979002255, + "grad_norm": 0.3164355664793599, + "learning_rate": 6.655455115130849e-06, + "loss": 2.6252, + "step": 54801 + }, + { + 
"epoch": 2.5514351560863187, + "grad_norm": 0.30866857197262293, + "learning_rate": 6.6541048792131575e-06, + "loss": 2.6334, + "step": 54802 + }, + { + "epoch": 2.5514817142724118, + "grad_norm": 0.3101674677721237, + "learning_rate": 6.652754770511405e-06, + "loss": 2.7194, + "step": 54803 + }, + { + "epoch": 2.551528272458505, + "grad_norm": 0.29122933022910386, + "learning_rate": 6.651404789029553e-06, + "loss": 2.6706, + "step": 54804 + }, + { + "epoch": 2.551574830644598, + "grad_norm": 0.2920851682638094, + "learning_rate": 6.650054934771538e-06, + "loss": 2.6064, + "step": 54805 + }, + { + "epoch": 2.551621388830691, + "grad_norm": 0.30730589341118125, + "learning_rate": 6.6487052077413625e-06, + "loss": 2.7542, + "step": 54806 + }, + { + "epoch": 2.5516679470167842, + "grad_norm": 0.29619307209053053, + "learning_rate": 6.6473556079429566e-06, + "loss": 2.6498, + "step": 54807 + }, + { + "epoch": 2.5517145052028773, + "grad_norm": 0.3135480797536694, + "learning_rate": 6.646006135380289e-06, + "loss": 2.7289, + "step": 54808 + }, + { + "epoch": 2.5517610633889705, + "grad_norm": 0.2955082867175733, + "learning_rate": 6.644656790057335e-06, + "loss": 2.6468, + "step": 54809 + }, + { + "epoch": 2.5518076215750636, + "grad_norm": 0.30537851701632274, + "learning_rate": 6.643307571978019e-06, + "loss": 2.6251, + "step": 54810 + }, + { + "epoch": 2.5518541797611567, + "grad_norm": 0.2967477669003224, + "learning_rate": 6.641958481146343e-06, + "loss": 2.5405, + "step": 54811 + }, + { + "epoch": 2.55190073794725, + "grad_norm": 0.3188862584478456, + "learning_rate": 6.640609517566243e-06, + "loss": 2.6363, + "step": 54812 + }, + { + "epoch": 2.5519472961333425, + "grad_norm": 0.31496316182663936, + "learning_rate": 6.639260681241677e-06, + "loss": 2.5827, + "step": 54813 + }, + { + "epoch": 2.5519938543194356, + "grad_norm": 0.3194936456219794, + "learning_rate": 6.637911972176608e-06, + "loss": 2.6502, + "step": 54814 + }, + { + "epoch": 2.5520404125055287, + "grad_norm": 0.29033085997631, + "learning_rate": 6.636563390375e-06, + "loss": 2.5722, + "step": 54815 + }, + { + "epoch": 2.552086970691622, + "grad_norm": 0.30048100003029304, + "learning_rate": 6.635214935840806e-06, + "loss": 2.6077, + "step": 54816 + }, + { + "epoch": 2.552133528877715, + "grad_norm": 0.31548892849577964, + "learning_rate": 6.633866608577993e-06, + "loss": 2.6383, + "step": 54817 + }, + { + "epoch": 2.552180087063808, + "grad_norm": 0.3107712380121385, + "learning_rate": 6.632518408590488e-06, + "loss": 2.6605, + "step": 54818 + }, + { + "epoch": 2.552226645249901, + "grad_norm": 0.29324550051463844, + "learning_rate": 6.631170335882292e-06, + "loss": 2.6081, + "step": 54819 + }, + { + "epoch": 2.5522732034359943, + "grad_norm": 0.29267456437578565, + "learning_rate": 6.62982239045733e-06, + "loss": 2.6461, + "step": 54820 + }, + { + "epoch": 2.552319761622087, + "grad_norm": 0.3031159288041035, + "learning_rate": 6.62847457231956e-06, + "loss": 2.5511, + "step": 54821 + }, + { + "epoch": 2.55236631980818, + "grad_norm": 0.3163871057664974, + "learning_rate": 6.627126881472961e-06, + "loss": 2.7709, + "step": 54822 + }, + { + "epoch": 2.552412877994273, + "grad_norm": 0.3163022663642564, + "learning_rate": 6.6257793179214435e-06, + "loss": 2.6351, + "step": 54823 + }, + { + "epoch": 2.5524594361803663, + "grad_norm": 0.30523921560918654, + "learning_rate": 6.624431881669018e-06, + "loss": 2.604, + "step": 54824 + }, + { + "epoch": 2.5525059943664594, + "grad_norm": 0.29407859563756894, + "learning_rate": 
6.623084572719601e-06, + "loss": 2.5343, + "step": 54825 + }, + { + "epoch": 2.5525525525525525, + "grad_norm": 0.30784234672275035, + "learning_rate": 6.621737391077154e-06, + "loss": 2.6433, + "step": 54826 + }, + { + "epoch": 2.5525991107386456, + "grad_norm": 0.3078178428633468, + "learning_rate": 6.620390336745635e-06, + "loss": 2.5787, + "step": 54827 + }, + { + "epoch": 2.5526456689247388, + "grad_norm": 0.30334316411825846, + "learning_rate": 6.619043409728998e-06, + "loss": 2.6273, + "step": 54828 + }, + { + "epoch": 2.552692227110832, + "grad_norm": 0.29541083796111745, + "learning_rate": 6.617696610031199e-06, + "loss": 2.6473, + "step": 54829 + }, + { + "epoch": 2.552738785296925, + "grad_norm": 0.29311286628500083, + "learning_rate": 6.616349937656191e-06, + "loss": 2.5579, + "step": 54830 + }, + { + "epoch": 2.552785343483018, + "grad_norm": 0.3036483063139886, + "learning_rate": 6.615003392607899e-06, + "loss": 2.6301, + "step": 54831 + }, + { + "epoch": 2.5528319016691112, + "grad_norm": 0.29794520570540384, + "learning_rate": 6.6136569748903255e-06, + "loss": 2.6389, + "step": 54832 + }, + { + "epoch": 2.552878459855204, + "grad_norm": 0.31549521307919876, + "learning_rate": 6.612310684507383e-06, + "loss": 2.6459, + "step": 54833 + }, + { + "epoch": 2.552925018041297, + "grad_norm": 0.306791414928054, + "learning_rate": 6.610964521463031e-06, + "loss": 2.533, + "step": 54834 + }, + { + "epoch": 2.55297157622739, + "grad_norm": 0.29159168424624676, + "learning_rate": 6.609618485761237e-06, + "loss": 2.5881, + "step": 54835 + }, + { + "epoch": 2.5530181344134832, + "grad_norm": 0.30611850396333684, + "learning_rate": 6.608272577405911e-06, + "loss": 2.6188, + "step": 54836 + }, + { + "epoch": 2.5530646925995764, + "grad_norm": 0.3248760945939191, + "learning_rate": 6.606926796401059e-06, + "loss": 2.7312, + "step": 54837 + }, + { + "epoch": 2.5531112507856695, + "grad_norm": 0.31762300009616984, + "learning_rate": 6.605581142750577e-06, + "loss": 2.7144, + "step": 54838 + }, + { + "epoch": 2.5531578089717626, + "grad_norm": 0.3090188474586268, + "learning_rate": 6.60423561645846e-06, + "loss": 2.5654, + "step": 54839 + }, + { + "epoch": 2.5532043671578553, + "grad_norm": 0.33338749784344923, + "learning_rate": 6.6028902175286285e-06, + "loss": 2.571, + "step": 54840 + }, + { + "epoch": 2.5532509253439484, + "grad_norm": 0.31093138316449914, + "learning_rate": 6.601544945965038e-06, + "loss": 2.6253, + "step": 54841 + }, + { + "epoch": 2.5532974835300415, + "grad_norm": 0.31236671846716424, + "learning_rate": 6.600199801771634e-06, + "loss": 2.622, + "step": 54842 + }, + { + "epoch": 2.5533440417161346, + "grad_norm": 0.3040251675348592, + "learning_rate": 6.59885478495238e-06, + "loss": 2.683, + "step": 54843 + }, + { + "epoch": 2.5533905999022277, + "grad_norm": 0.28265532939302546, + "learning_rate": 6.5975098955111945e-06, + "loss": 2.5176, + "step": 54844 + }, + { + "epoch": 2.553437158088321, + "grad_norm": 0.29366811641577606, + "learning_rate": 6.596165133452059e-06, + "loss": 2.5306, + "step": 54845 + }, + { + "epoch": 2.553483716274414, + "grad_norm": 0.30590058355478694, + "learning_rate": 6.594820498778892e-06, + "loss": 2.7364, + "step": 54846 + }, + { + "epoch": 2.553530274460507, + "grad_norm": 0.31827430369554305, + "learning_rate": 6.593475991495651e-06, + "loss": 2.7036, + "step": 54847 + }, + { + "epoch": 2.5535768326466, + "grad_norm": 0.3223183151203409, + "learning_rate": 6.592131611606295e-06, + "loss": 2.6829, + "step": 54848 + }, + { + "epoch": 
2.5536233908326933, + "grad_norm": 0.29538920961496445, + "learning_rate": 6.590787359114731e-06, + "loss": 2.7205, + "step": 54849 + }, + { + "epoch": 2.5536699490187864, + "grad_norm": 0.31602832609470277, + "learning_rate": 6.589443234024956e-06, + "loss": 2.7339, + "step": 54850 + }, + { + "epoch": 2.5537165072048795, + "grad_norm": 0.30784693406276153, + "learning_rate": 6.588099236340861e-06, + "loss": 2.6231, + "step": 54851 + }, + { + "epoch": 2.5537630653909726, + "grad_norm": 0.32033209896160947, + "learning_rate": 6.586755366066444e-06, + "loss": 2.5772, + "step": 54852 + }, + { + "epoch": 2.5538096235770653, + "grad_norm": 0.301887492565875, + "learning_rate": 6.585411623205612e-06, + "loss": 2.5447, + "step": 54853 + }, + { + "epoch": 2.5538561817631584, + "grad_norm": 0.30270801955744464, + "learning_rate": 6.584068007762318e-06, + "loss": 2.6108, + "step": 54854 + }, + { + "epoch": 2.5539027399492515, + "grad_norm": 0.30957632516906675, + "learning_rate": 6.582724519740508e-06, + "loss": 2.6024, + "step": 54855 + }, + { + "epoch": 2.5539492981353447, + "grad_norm": 0.29807881960414057, + "learning_rate": 6.581381159144123e-06, + "loss": 2.6642, + "step": 54856 + }, + { + "epoch": 2.5539958563214378, + "grad_norm": 0.29626395632501823, + "learning_rate": 6.580037925977112e-06, + "loss": 2.5529, + "step": 54857 + }, + { + "epoch": 2.554042414507531, + "grad_norm": 0.2731008832956992, + "learning_rate": 6.578694820243419e-06, + "loss": 2.5993, + "step": 54858 + }, + { + "epoch": 2.554088972693624, + "grad_norm": 0.29489224934377345, + "learning_rate": 6.577351841946966e-06, + "loss": 2.5865, + "step": 54859 + }, + { + "epoch": 2.5541355308797167, + "grad_norm": 0.3113882419317496, + "learning_rate": 6.576008991091709e-06, + "loss": 2.6762, + "step": 54860 + }, + { + "epoch": 2.55418208906581, + "grad_norm": 0.32009870706839494, + "learning_rate": 6.5746662676816e-06, + "loss": 2.6394, + "step": 54861 + }, + { + "epoch": 2.554228647251903, + "grad_norm": 0.30459918050557944, + "learning_rate": 6.573323671720544e-06, + "loss": 2.5924, + "step": 54862 + }, + { + "epoch": 2.554275205437996, + "grad_norm": 0.30091117445608423, + "learning_rate": 6.5719812032125295e-06, + "loss": 2.5779, + "step": 54863 + }, + { + "epoch": 2.554321763624089, + "grad_norm": 0.32111185491156485, + "learning_rate": 6.570638862161449e-06, + "loss": 2.6716, + "step": 54864 + }, + { + "epoch": 2.5543683218101823, + "grad_norm": 0.313952335319341, + "learning_rate": 6.569296648571283e-06, + "loss": 2.6736, + "step": 54865 + }, + { + "epoch": 2.5544148799962754, + "grad_norm": 0.3013279520211119, + "learning_rate": 6.567954562445944e-06, + "loss": 2.683, + "step": 54866 + }, + { + "epoch": 2.5544614381823685, + "grad_norm": 0.2782158293019603, + "learning_rate": 6.56661260378938e-06, + "loss": 2.5401, + "step": 54867 + }, + { + "epoch": 2.5545079963684616, + "grad_norm": 0.31753936630601315, + "learning_rate": 6.565270772605531e-06, + "loss": 2.7093, + "step": 54868 + }, + { + "epoch": 2.5545545545545547, + "grad_norm": 0.32514838737078516, + "learning_rate": 6.5639290688983275e-06, + "loss": 2.6935, + "step": 54869 + }, + { + "epoch": 2.554601112740648, + "grad_norm": 0.29907342645096013, + "learning_rate": 6.562587492671718e-06, + "loss": 2.6358, + "step": 54870 + }, + { + "epoch": 2.554647670926741, + "grad_norm": 0.30862874027734377, + "learning_rate": 6.561246043929648e-06, + "loss": 2.6304, + "step": 54871 + }, + { + "epoch": 2.5546942291128336, + "grad_norm": 0.3103127939916123, + "learning_rate": 
6.559904722676024e-06, + "loss": 2.622, + "step": 54872 + }, + { + "epoch": 2.5547407872989267, + "grad_norm": 0.3135237976047801, + "learning_rate": 6.5585635289148075e-06, + "loss": 2.6726, + "step": 54873 + }, + { + "epoch": 2.55478734548502, + "grad_norm": 0.29380812033214765, + "learning_rate": 6.55722246264992e-06, + "loss": 2.5911, + "step": 54874 + }, + { + "epoch": 2.554833903671113, + "grad_norm": 0.295278971300813, + "learning_rate": 6.55588152388531e-06, + "loss": 2.5783, + "step": 54875 + }, + { + "epoch": 2.554880461857206, + "grad_norm": 0.30523927689692476, + "learning_rate": 6.554540712624918e-06, + "loss": 2.7397, + "step": 54876 + }, + { + "epoch": 2.554927020043299, + "grad_norm": 0.29737532597446154, + "learning_rate": 6.553200028872641e-06, + "loss": 2.5223, + "step": 54877 + }, + { + "epoch": 2.5549735782293923, + "grad_norm": 0.2999509845402024, + "learning_rate": 6.551859472632471e-06, + "loss": 2.6866, + "step": 54878 + }, + { + "epoch": 2.555020136415485, + "grad_norm": 0.3012752142011329, + "learning_rate": 6.550519043908299e-06, + "loss": 2.6536, + "step": 54879 + }, + { + "epoch": 2.555066694601578, + "grad_norm": 0.29790785435607237, + "learning_rate": 6.549178742704071e-06, + "loss": 2.6356, + "step": 54880 + }, + { + "epoch": 2.555113252787671, + "grad_norm": 0.3130730904950623, + "learning_rate": 6.547838569023729e-06, + "loss": 2.645, + "step": 54881 + }, + { + "epoch": 2.5551598109737643, + "grad_norm": 0.29386036679760863, + "learning_rate": 6.546498522871197e-06, + "loss": 2.6525, + "step": 54882 + }, + { + "epoch": 2.5552063691598574, + "grad_norm": 0.30049123963180824, + "learning_rate": 6.5451586042504064e-06, + "loss": 2.6755, + "step": 54883 + }, + { + "epoch": 2.5552529273459506, + "grad_norm": 0.2990198299975384, + "learning_rate": 6.543818813165309e-06, + "loss": 2.5653, + "step": 54884 + }, + { + "epoch": 2.5552994855320437, + "grad_norm": 0.312645179790827, + "learning_rate": 6.542479149619812e-06, + "loss": 2.5159, + "step": 54885 + }, + { + "epoch": 2.555346043718137, + "grad_norm": 0.2962209631208984, + "learning_rate": 6.541139613617858e-06, + "loss": 2.5781, + "step": 54886 + }, + { + "epoch": 2.55539260190423, + "grad_norm": 0.30333540776409296, + "learning_rate": 6.539800205163377e-06, + "loss": 2.7459, + "step": 54887 + }, + { + "epoch": 2.555439160090323, + "grad_norm": 0.3057631038804928, + "learning_rate": 6.538460924260298e-06, + "loss": 2.6896, + "step": 54888 + }, + { + "epoch": 2.555485718276416, + "grad_norm": 0.28891621000209605, + "learning_rate": 6.537121770912568e-06, + "loss": 2.5299, + "step": 54889 + }, + { + "epoch": 2.5555322764625092, + "grad_norm": 0.30883139022745415, + "learning_rate": 6.535782745124086e-06, + "loss": 2.6571, + "step": 54890 + }, + { + "epoch": 2.5555788346486024, + "grad_norm": 0.3003349465245214, + "learning_rate": 6.5344438468988134e-06, + "loss": 2.6567, + "step": 54891 + }, + { + "epoch": 2.555625392834695, + "grad_norm": 0.3166462334490804, + "learning_rate": 6.5331050762406475e-06, + "loss": 2.6289, + "step": 54892 + }, + { + "epoch": 2.555671951020788, + "grad_norm": 0.3061672470091005, + "learning_rate": 6.531766433153563e-06, + "loss": 2.5001, + "step": 54893 + }, + { + "epoch": 2.5557185092068813, + "grad_norm": 0.30444909289106475, + "learning_rate": 6.530427917641446e-06, + "loss": 2.5416, + "step": 54894 + }, + { + "epoch": 2.5557650673929744, + "grad_norm": 0.2971791775677883, + "learning_rate": 6.529089529708238e-06, + "loss": 2.6342, + "step": 54895 + }, + { + "epoch": 
2.5558116255790675, + "grad_norm": 0.30900647422516614, + "learning_rate": 6.527751269357874e-06, + "loss": 2.5625, + "step": 54896 + }, + { + "epoch": 2.5558581837651606, + "grad_norm": 0.307826119841721, + "learning_rate": 6.5264131365942894e-06, + "loss": 2.686, + "step": 54897 + }, + { + "epoch": 2.5559047419512537, + "grad_norm": 0.3048229508063225, + "learning_rate": 6.525075131421382e-06, + "loss": 2.6951, + "step": 54898 + }, + { + "epoch": 2.5559513001373464, + "grad_norm": 0.3130403450350496, + "learning_rate": 6.523737253843104e-06, + "loss": 2.6403, + "step": 54899 + }, + { + "epoch": 2.5559978583234395, + "grad_norm": 0.30160033825440863, + "learning_rate": 6.522399503863369e-06, + "loss": 2.6741, + "step": 54900 + }, + { + "epoch": 2.5560444165095326, + "grad_norm": 0.27961996207390905, + "learning_rate": 6.5210618814861065e-06, + "loss": 2.6485, + "step": 54901 + }, + { + "epoch": 2.5560909746956257, + "grad_norm": 0.29377441226578355, + "learning_rate": 6.519724386715259e-06, + "loss": 2.6313, + "step": 54902 + }, + { + "epoch": 2.556137532881719, + "grad_norm": 0.2952620279615628, + "learning_rate": 6.5183870195547115e-06, + "loss": 2.795, + "step": 54903 + }, + { + "epoch": 2.556184091067812, + "grad_norm": 0.29818741223144546, + "learning_rate": 6.517049780008438e-06, + "loss": 2.5674, + "step": 54904 + }, + { + "epoch": 2.556230649253905, + "grad_norm": 0.3136943853104089, + "learning_rate": 6.515712668080321e-06, + "loss": 2.6506, + "step": 54905 + }, + { + "epoch": 2.556277207439998, + "grad_norm": 0.288708988671593, + "learning_rate": 6.514375683774321e-06, + "loss": 2.7445, + "step": 54906 + }, + { + "epoch": 2.5563237656260913, + "grad_norm": 0.3014350843064152, + "learning_rate": 6.513038827094336e-06, + "loss": 2.6247, + "step": 54907 + }, + { + "epoch": 2.5563703238121844, + "grad_norm": 0.3181890656197875, + "learning_rate": 6.511702098044298e-06, + "loss": 2.6411, + "step": 54908 + }, + { + "epoch": 2.5564168819982775, + "grad_norm": 0.30385300925617464, + "learning_rate": 6.510365496628129e-06, + "loss": 2.4675, + "step": 54909 + }, + { + "epoch": 2.5564634401843707, + "grad_norm": 0.31025120716243976, + "learning_rate": 6.50902902284975e-06, + "loss": 2.5251, + "step": 54910 + }, + { + "epoch": 2.5565099983704633, + "grad_norm": 0.2835972451149226, + "learning_rate": 6.507692676713101e-06, + "loss": 2.5571, + "step": 54911 + }, + { + "epoch": 2.5565565565565564, + "grad_norm": 0.3000274221329046, + "learning_rate": 6.50635645822208e-06, + "loss": 2.6089, + "step": 54912 + }, + { + "epoch": 2.5566031147426496, + "grad_norm": 0.3196798945267216, + "learning_rate": 6.5050203673806094e-06, + "loss": 2.7493, + "step": 54913 + }, + { + "epoch": 2.5566496729287427, + "grad_norm": 0.29802287919811676, + "learning_rate": 6.503684404192628e-06, + "loss": 2.588, + "step": 54914 + }, + { + "epoch": 2.556696231114836, + "grad_norm": 0.3104214593879251, + "learning_rate": 6.502348568662053e-06, + "loss": 2.6754, + "step": 54915 + }, + { + "epoch": 2.556742789300929, + "grad_norm": 0.298797460768343, + "learning_rate": 6.501012860792782e-06, + "loss": 2.482, + "step": 54916 + }, + { + "epoch": 2.556789347487022, + "grad_norm": 0.3049600115778891, + "learning_rate": 6.499677280588773e-06, + "loss": 2.6563, + "step": 54917 + }, + { + "epoch": 2.556835905673115, + "grad_norm": 0.31449072802336975, + "learning_rate": 6.498341828053906e-06, + "loss": 2.6812, + "step": 54918 + }, + { + "epoch": 2.556882463859208, + "grad_norm": 0.3082195685214955, + "learning_rate": 
6.497006503192138e-06, + "loss": 2.5492, + "step": 54919 + }, + { + "epoch": 2.556929022045301, + "grad_norm": 0.3085861800906395, + "learning_rate": 6.495671306007362e-06, + "loss": 2.5727, + "step": 54920 + }, + { + "epoch": 2.556975580231394, + "grad_norm": 0.313787118837282, + "learning_rate": 6.494336236503501e-06, + "loss": 2.6244, + "step": 54921 + }, + { + "epoch": 2.557022138417487, + "grad_norm": 0.28963181007297306, + "learning_rate": 6.493001294684481e-06, + "loss": 2.6152, + "step": 54922 + }, + { + "epoch": 2.5570686966035803, + "grad_norm": 0.31454918839738655, + "learning_rate": 6.491666480554215e-06, + "loss": 2.6583, + "step": 54923 + }, + { + "epoch": 2.5571152547896734, + "grad_norm": 0.30365288962275255, + "learning_rate": 6.490331794116633e-06, + "loss": 2.5772, + "step": 54924 + }, + { + "epoch": 2.5571618129757665, + "grad_norm": 0.31948387997212024, + "learning_rate": 6.488997235375632e-06, + "loss": 2.6951, + "step": 54925 + }, + { + "epoch": 2.5572083711618596, + "grad_norm": 0.31220884748003697, + "learning_rate": 6.487662804335132e-06, + "loss": 2.5949, + "step": 54926 + }, + { + "epoch": 2.5572549293479527, + "grad_norm": 0.3170722782894776, + "learning_rate": 6.4863285009990615e-06, + "loss": 2.5799, + "step": 54927 + }, + { + "epoch": 2.557301487534046, + "grad_norm": 0.31628262978902755, + "learning_rate": 6.484994325371324e-06, + "loss": 2.6044, + "step": 54928 + }, + { + "epoch": 2.557348045720139, + "grad_norm": 0.3034389402118834, + "learning_rate": 6.483660277455844e-06, + "loss": 2.6178, + "step": 54929 + }, + { + "epoch": 2.557394603906232, + "grad_norm": 0.2963997065219413, + "learning_rate": 6.482326357256546e-06, + "loss": 2.5757, + "step": 54930 + }, + { + "epoch": 2.5574411620923247, + "grad_norm": 0.31718274560414245, + "learning_rate": 6.48099256477731e-06, + "loss": 2.6343, + "step": 54931 + }, + { + "epoch": 2.557487720278418, + "grad_norm": 0.3089681445259783, + "learning_rate": 6.479658900022095e-06, + "loss": 2.5877, + "step": 54932 + }, + { + "epoch": 2.557534278464511, + "grad_norm": 0.2933910455155555, + "learning_rate": 6.478325362994781e-06, + "loss": 2.6756, + "step": 54933 + }, + { + "epoch": 2.557580836650604, + "grad_norm": 0.2929066868720104, + "learning_rate": 6.476991953699296e-06, + "loss": 2.6128, + "step": 54934 + }, + { + "epoch": 2.557627394836697, + "grad_norm": 0.30882292602574524, + "learning_rate": 6.475658672139551e-06, + "loss": 2.5799, + "step": 54935 + }, + { + "epoch": 2.5576739530227903, + "grad_norm": 0.315656008738675, + "learning_rate": 6.474325518319463e-06, + "loss": 2.6494, + "step": 54936 + }, + { + "epoch": 2.5577205112088834, + "grad_norm": 0.29250161222548904, + "learning_rate": 6.472992492242952e-06, + "loss": 2.541, + "step": 54937 + }, + { + "epoch": 2.557767069394976, + "grad_norm": 0.3016417062791788, + "learning_rate": 6.471659593913903e-06, + "loss": 2.7162, + "step": 54938 + }, + { + "epoch": 2.5578136275810692, + "grad_norm": 0.3097901651605717, + "learning_rate": 6.4703268233362536e-06, + "loss": 2.6341, + "step": 54939 + }, + { + "epoch": 2.5578601857671623, + "grad_norm": 0.3173747818888891, + "learning_rate": 6.468994180513904e-06, + "loss": 2.6152, + "step": 54940 + }, + { + "epoch": 2.5579067439532555, + "grad_norm": 0.3236826017192986, + "learning_rate": 6.467661665450764e-06, + "loss": 2.6534, + "step": 54941 + }, + { + "epoch": 2.5579533021393486, + "grad_norm": 0.2984370524826663, + "learning_rate": 6.4663292781507515e-06, + "loss": 2.594, + "step": 54942 + }, + { + "epoch": 
2.5579998603254417, + "grad_norm": 0.3080939794086324, + "learning_rate": 6.464997018617785e-06, + "loss": 2.6455, + "step": 54943 + }, + { + "epoch": 2.558046418511535, + "grad_norm": 0.30427463687247874, + "learning_rate": 6.463664886855742e-06, + "loss": 2.5847, + "step": 54944 + }, + { + "epoch": 2.558092976697628, + "grad_norm": 0.29799234476953995, + "learning_rate": 6.462332882868572e-06, + "loss": 2.5818, + "step": 54945 + }, + { + "epoch": 2.558139534883721, + "grad_norm": 0.3224581040147823, + "learning_rate": 6.4610010066601556e-06, + "loss": 2.6049, + "step": 54946 + }, + { + "epoch": 2.558186093069814, + "grad_norm": 0.32156620122865104, + "learning_rate": 6.459669258234413e-06, + "loss": 2.6499, + "step": 54947 + }, + { + "epoch": 2.5582326512559073, + "grad_norm": 0.2887983716944551, + "learning_rate": 6.458337637595252e-06, + "loss": 2.6027, + "step": 54948 + }, + { + "epoch": 2.5582792094420004, + "grad_norm": 0.29884081515872224, + "learning_rate": 6.45700614474658e-06, + "loss": 2.7535, + "step": 54949 + }, + { + "epoch": 2.558325767628093, + "grad_norm": 0.3106169914944411, + "learning_rate": 6.455674779692316e-06, + "loss": 2.7013, + "step": 54950 + }, + { + "epoch": 2.558372325814186, + "grad_norm": 0.32575437883295444, + "learning_rate": 6.454343542436347e-06, + "loss": 2.6134, + "step": 54951 + }, + { + "epoch": 2.5584188840002793, + "grad_norm": 0.3017180678219266, + "learning_rate": 6.453012432982591e-06, + "loss": 2.6928, + "step": 54952 + }, + { + "epoch": 2.5584654421863724, + "grad_norm": 0.2971991487935634, + "learning_rate": 6.451681451334951e-06, + "loss": 2.6921, + "step": 54953 + }, + { + "epoch": 2.5585120003724655, + "grad_norm": 0.29953699814054696, + "learning_rate": 6.450350597497334e-06, + "loss": 2.6928, + "step": 54954 + }, + { + "epoch": 2.5585585585585586, + "grad_norm": 0.3006277460037219, + "learning_rate": 6.449019871473649e-06, + "loss": 2.6495, + "step": 54955 + }, + { + "epoch": 2.5586051167446517, + "grad_norm": 0.32017769775690935, + "learning_rate": 6.447689273267815e-06, + "loss": 2.6462, + "step": 54956 + }, + { + "epoch": 2.558651674930745, + "grad_norm": 0.30979735933870045, + "learning_rate": 6.446358802883695e-06, + "loss": 2.6198, + "step": 54957 + }, + { + "epoch": 2.5586982331168375, + "grad_norm": 0.3026668879871997, + "learning_rate": 6.4450284603252475e-06, + "loss": 2.6856, + "step": 54958 + }, + { + "epoch": 2.5587447913029306, + "grad_norm": 0.31476174139815155, + "learning_rate": 6.443698245596336e-06, + "loss": 2.6154, + "step": 54959 + }, + { + "epoch": 2.5587913494890238, + "grad_norm": 0.29691113579305556, + "learning_rate": 6.4423681587008785e-06, + "loss": 2.5257, + "step": 54960 + }, + { + "epoch": 2.558837907675117, + "grad_norm": 0.31440273428130083, + "learning_rate": 6.4410381996427845e-06, + "loss": 2.7147, + "step": 54961 + }, + { + "epoch": 2.55888446586121, + "grad_norm": 0.2986010544940637, + "learning_rate": 6.439708368425951e-06, + "loss": 2.6017, + "step": 54962 + }, + { + "epoch": 2.558931024047303, + "grad_norm": 0.2876853537893117, + "learning_rate": 6.43837866505429e-06, + "loss": 2.499, + "step": 54963 + }, + { + "epoch": 2.558977582233396, + "grad_norm": 0.31675265802040176, + "learning_rate": 6.437049089531672e-06, + "loss": 2.5666, + "step": 54964 + }, + { + "epoch": 2.5590241404194893, + "grad_norm": 0.3214000461854939, + "learning_rate": 6.435719641862048e-06, + "loss": 2.5803, + "step": 54965 + }, + { + "epoch": 2.5590706986055825, + "grad_norm": 0.3102203143712388, + "learning_rate": 
6.434390322049289e-06, + "loss": 2.6452, + "step": 54966 + }, + { + "epoch": 2.5591172567916756, + "grad_norm": 0.2788906557326181, + "learning_rate": 6.433061130097295e-06, + "loss": 2.5379, + "step": 54967 + }, + { + "epoch": 2.5591638149777687, + "grad_norm": 0.3162205131650716, + "learning_rate": 6.431732066009977e-06, + "loss": 2.6255, + "step": 54968 + }, + { + "epoch": 2.559210373163862, + "grad_norm": 0.31195855732106387, + "learning_rate": 6.430403129791246e-06, + "loss": 2.6033, + "step": 54969 + }, + { + "epoch": 2.5592569313499545, + "grad_norm": 0.3259670835115583, + "learning_rate": 6.429074321444967e-06, + "loss": 2.5576, + "step": 54970 + }, + { + "epoch": 2.5593034895360476, + "grad_norm": 0.29029832516734805, + "learning_rate": 6.4277456409750855e-06, + "loss": 2.5517, + "step": 54971 + }, + { + "epoch": 2.5593500477221407, + "grad_norm": 0.2937214702636856, + "learning_rate": 6.426417088385461e-06, + "loss": 2.608, + "step": 54972 + }, + { + "epoch": 2.559396605908234, + "grad_norm": 0.32200344543675646, + "learning_rate": 6.425088663680018e-06, + "loss": 2.5226, + "step": 54973 + }, + { + "epoch": 2.559443164094327, + "grad_norm": 0.29776108443929283, + "learning_rate": 6.423760366862641e-06, + "loss": 2.6153, + "step": 54974 + }, + { + "epoch": 2.55948972228042, + "grad_norm": 0.2882870315341956, + "learning_rate": 6.422432197937234e-06, + "loss": 2.5949, + "step": 54975 + }, + { + "epoch": 2.559536280466513, + "grad_norm": 0.31108585734777244, + "learning_rate": 6.421104156907709e-06, + "loss": 2.6649, + "step": 54976 + }, + { + "epoch": 2.559582838652606, + "grad_norm": 0.2959941257061575, + "learning_rate": 6.4197762437779306e-06, + "loss": 2.5751, + "step": 54977 + }, + { + "epoch": 2.559629396838699, + "grad_norm": 0.3063339381681438, + "learning_rate": 6.418448458551835e-06, + "loss": 2.5772, + "step": 54978 + }, + { + "epoch": 2.559675955024792, + "grad_norm": 0.31392169154612753, + "learning_rate": 6.417120801233284e-06, + "loss": 2.5626, + "step": 54979 + }, + { + "epoch": 2.559722513210885, + "grad_norm": 0.3208786031288911, + "learning_rate": 6.415793271826198e-06, + "loss": 2.7005, + "step": 54980 + }, + { + "epoch": 2.5597690713969783, + "grad_norm": 0.3173410603846878, + "learning_rate": 6.414465870334457e-06, + "loss": 2.676, + "step": 54981 + }, + { + "epoch": 2.5598156295830714, + "grad_norm": 0.3011152524287177, + "learning_rate": 6.413138596761969e-06, + "loss": 2.61, + "step": 54982 + }, + { + "epoch": 2.5598621877691645, + "grad_norm": 0.3011293673296566, + "learning_rate": 6.411811451112626e-06, + "loss": 2.6357, + "step": 54983 + }, + { + "epoch": 2.5599087459552576, + "grad_norm": 0.3053823879441237, + "learning_rate": 6.410484433390334e-06, + "loss": 2.6517, + "step": 54984 + }, + { + "epoch": 2.5599553041413508, + "grad_norm": 0.29440780533154154, + "learning_rate": 6.409157543598959e-06, + "loss": 2.673, + "step": 54985 + }, + { + "epoch": 2.560001862327444, + "grad_norm": 0.30488187353544993, + "learning_rate": 6.407830781742419e-06, + "loss": 2.6074, + "step": 54986 + }, + { + "epoch": 2.560048420513537, + "grad_norm": 0.30109644098134153, + "learning_rate": 6.406504147824599e-06, + "loss": 2.5863, + "step": 54987 + }, + { + "epoch": 2.56009497869963, + "grad_norm": 0.3164590154660032, + "learning_rate": 6.405177641849392e-06, + "loss": 2.6364, + "step": 54988 + }, + { + "epoch": 2.5601415368857228, + "grad_norm": 0.3145864453867944, + "learning_rate": 6.403851263820709e-06, + "loss": 2.5825, + "step": 54989 + }, + { + "epoch": 
2.560188095071816, + "grad_norm": 0.3060771302959177, + "learning_rate": 6.4025250137424e-06, + "loss": 2.5975, + "step": 54990 + }, + { + "epoch": 2.560234653257909, + "grad_norm": 0.29403020933062995, + "learning_rate": 6.4011988916184155e-06, + "loss": 2.5518, + "step": 54991 + }, + { + "epoch": 2.560281211444002, + "grad_norm": 0.30902651332781933, + "learning_rate": 6.3998728974525966e-06, + "loss": 2.6337, + "step": 54992 + }, + { + "epoch": 2.5603277696300952, + "grad_norm": 0.3191370375742353, + "learning_rate": 6.398547031248864e-06, + "loss": 2.5938, + "step": 54993 + }, + { + "epoch": 2.5603743278161883, + "grad_norm": 0.3077935598911563, + "learning_rate": 6.3972212930110965e-06, + "loss": 2.6098, + "step": 54994 + }, + { + "epoch": 2.5604208860022815, + "grad_norm": 0.3041015491412626, + "learning_rate": 6.395895682743186e-06, + "loss": 2.5362, + "step": 54995 + }, + { + "epoch": 2.5604674441883746, + "grad_norm": 0.30914144277245215, + "learning_rate": 6.394570200449029e-06, + "loss": 2.7492, + "step": 54996 + }, + { + "epoch": 2.5605140023744672, + "grad_norm": 0.3086203011968011, + "learning_rate": 6.393244846132524e-06, + "loss": 2.644, + "step": 54997 + }, + { + "epoch": 2.5605605605605604, + "grad_norm": 0.31977836957066497, + "learning_rate": 6.391919619797537e-06, + "loss": 2.5436, + "step": 54998 + }, + { + "epoch": 2.5606071187466535, + "grad_norm": 0.31248105334752047, + "learning_rate": 6.390594521447968e-06, + "loss": 2.603, + "step": 54999 + }, + { + "epoch": 2.5606536769327466, + "grad_norm": 0.3231121644855172, + "learning_rate": 6.389269551087712e-06, + "loss": 2.5473, + "step": 55000 + }, + { + "epoch": 2.5607002351188397, + "grad_norm": 0.32414606924750217, + "learning_rate": 6.38794470872065e-06, + "loss": 2.5711, + "step": 55001 + }, + { + "epoch": 2.560746793304933, + "grad_norm": 0.2935422109044529, + "learning_rate": 6.38661999435069e-06, + "loss": 2.5322, + "step": 55002 + }, + { + "epoch": 2.560793351491026, + "grad_norm": 0.3162740722931634, + "learning_rate": 6.385295407981679e-06, + "loss": 2.6839, + "step": 55003 + }, + { + "epoch": 2.560839909677119, + "grad_norm": 0.31386778787820835, + "learning_rate": 6.383970949617551e-06, + "loss": 2.4658, + "step": 55004 + }, + { + "epoch": 2.560886467863212, + "grad_norm": 0.32000677480852535, + "learning_rate": 6.382646619262156e-06, + "loss": 2.6793, + "step": 55005 + }, + { + "epoch": 2.5609330260493053, + "grad_norm": 0.3091329645811695, + "learning_rate": 6.381322416919399e-06, + "loss": 2.6849, + "step": 55006 + }, + { + "epoch": 2.5609795842353984, + "grad_norm": 0.3151054905403512, + "learning_rate": 6.379998342593169e-06, + "loss": 2.6524, + "step": 55007 + }, + { + "epoch": 2.5610261424214915, + "grad_norm": 0.30713902323895925, + "learning_rate": 6.378674396287343e-06, + "loss": 2.5832, + "step": 55008 + }, + { + "epoch": 2.561072700607584, + "grad_norm": 0.30703231455920665, + "learning_rate": 6.377350578005808e-06, + "loss": 2.596, + "step": 55009 + }, + { + "epoch": 2.5611192587936773, + "grad_norm": 0.2916743049167973, + "learning_rate": 6.376026887752462e-06, + "loss": 2.575, + "step": 55010 + }, + { + "epoch": 2.5611658169797704, + "grad_norm": 0.313949542015873, + "learning_rate": 6.374703325531173e-06, + "loss": 2.6397, + "step": 55011 + }, + { + "epoch": 2.5612123751658635, + "grad_norm": 0.3252397032797239, + "learning_rate": 6.3733798913458264e-06, + "loss": 2.6393, + "step": 55012 + }, + { + "epoch": 2.5612589333519566, + "grad_norm": 0.3082281113403008, + "learning_rate": 
6.372056585200315e-06, + "loss": 2.6741, + "step": 55013 + }, + { + "epoch": 2.5613054915380498, + "grad_norm": 0.30878598466917834, + "learning_rate": 6.370733407098517e-06, + "loss": 2.7387, + "step": 55014 + }, + { + "epoch": 2.561352049724143, + "grad_norm": 0.2971013303327015, + "learning_rate": 6.369410357044337e-06, + "loss": 2.5334, + "step": 55015 + }, + { + "epoch": 2.5613986079102355, + "grad_norm": 0.296740601163696, + "learning_rate": 6.368087435041608e-06, + "loss": 2.5399, + "step": 55016 + }, + { + "epoch": 2.5614451660963287, + "grad_norm": 0.2923952334410872, + "learning_rate": 6.366764641094269e-06, + "loss": 2.6677, + "step": 55017 + }, + { + "epoch": 2.561491724282422, + "grad_norm": 0.3135215393866628, + "learning_rate": 6.365441975206155e-06, + "loss": 2.68, + "step": 55018 + }, + { + "epoch": 2.561538282468515, + "grad_norm": 0.31027152549678966, + "learning_rate": 6.364119437381194e-06, + "loss": 2.5401, + "step": 55019 + }, + { + "epoch": 2.561584840654608, + "grad_norm": 0.29637429386497427, + "learning_rate": 6.36279702762323e-06, + "loss": 2.5741, + "step": 55020 + }, + { + "epoch": 2.561631398840701, + "grad_norm": 0.29914416069489763, + "learning_rate": 6.361474745936163e-06, + "loss": 2.5358, + "step": 55021 + }, + { + "epoch": 2.5616779570267942, + "grad_norm": 0.2930153801362639, + "learning_rate": 6.360152592323864e-06, + "loss": 2.6343, + "step": 55022 + }, + { + "epoch": 2.5617245152128874, + "grad_norm": 0.29505342254475136, + "learning_rate": 6.35883056679023e-06, + "loss": 2.6056, + "step": 55023 + }, + { + "epoch": 2.5617710733989805, + "grad_norm": 0.3060933668003339, + "learning_rate": 6.357508669339119e-06, + "loss": 2.5967, + "step": 55024 + }, + { + "epoch": 2.5618176315850736, + "grad_norm": 0.31355703919685995, + "learning_rate": 6.356186899974414e-06, + "loss": 2.6169, + "step": 55025 + }, + { + "epoch": 2.5618641897711667, + "grad_norm": 0.3101338416427978, + "learning_rate": 6.35486525870001e-06, + "loss": 2.6149, + "step": 55026 + }, + { + "epoch": 2.56191074795726, + "grad_norm": 0.30324803421632635, + "learning_rate": 6.353543745519769e-06, + "loss": 2.6759, + "step": 55027 + }, + { + "epoch": 2.561957306143353, + "grad_norm": 0.29901715372250615, + "learning_rate": 6.352222360437593e-06, + "loss": 2.6199, + "step": 55028 + }, + { + "epoch": 2.5620038643294456, + "grad_norm": 0.3026401544022949, + "learning_rate": 6.350901103457324e-06, + "loss": 2.6319, + "step": 55029 + }, + { + "epoch": 2.5620504225155387, + "grad_norm": 0.3063164686032778, + "learning_rate": 6.349579974582875e-06, + "loss": 2.5545, + "step": 55030 + }, + { + "epoch": 2.562096980701632, + "grad_norm": 0.31353009086401007, + "learning_rate": 6.348258973818094e-06, + "loss": 2.7461, + "step": 55031 + }, + { + "epoch": 2.562143538887725, + "grad_norm": 0.31260628001089275, + "learning_rate": 6.3469381011668886e-06, + "loss": 2.6283, + "step": 55032 + }, + { + "epoch": 2.562190097073818, + "grad_norm": 0.31631338934599473, + "learning_rate": 6.345617356633115e-06, + "loss": 2.7268, + "step": 55033 + }, + { + "epoch": 2.562236655259911, + "grad_norm": 0.3042380759924852, + "learning_rate": 6.344296740220645e-06, + "loss": 2.5948, + "step": 55034 + }, + { + "epoch": 2.5622832134460043, + "grad_norm": 0.2928271277426342, + "learning_rate": 6.342976251933369e-06, + "loss": 2.6735, + "step": 55035 + }, + { + "epoch": 2.562329771632097, + "grad_norm": 0.3095810189549838, + "learning_rate": 6.341655891775155e-06, + "loss": 2.6428, + "step": 55036 + }, + { + "epoch": 
2.56237632981819, + "grad_norm": 0.3149448840643766, + "learning_rate": 6.340335659749896e-06, + "loss": 2.6083, + "step": 55037 + }, + { + "epoch": 2.562422888004283, + "grad_norm": 0.28879305557119467, + "learning_rate": 6.339015555861433e-06, + "loss": 2.4669, + "step": 55038 + }, + { + "epoch": 2.5624694461903763, + "grad_norm": 0.3112463529730496, + "learning_rate": 6.337695580113662e-06, + "loss": 2.5747, + "step": 55039 + }, + { + "epoch": 2.5625160043764694, + "grad_norm": 0.3004974407480163, + "learning_rate": 6.336375732510452e-06, + "loss": 2.6842, + "step": 55040 + }, + { + "epoch": 2.5625625625625625, + "grad_norm": 0.30128725787414457, + "learning_rate": 6.335056013055685e-06, + "loss": 2.713, + "step": 55041 + }, + { + "epoch": 2.5626091207486557, + "grad_norm": 0.31284763831617496, + "learning_rate": 6.333736421753211e-06, + "loss": 2.6574, + "step": 55042 + }, + { + "epoch": 2.5626556789347488, + "grad_norm": 0.30739905283694463, + "learning_rate": 6.332416958606935e-06, + "loss": 2.6135, + "step": 55043 + }, + { + "epoch": 2.562702237120842, + "grad_norm": 0.30257801585850774, + "learning_rate": 6.331097623620696e-06, + "loss": 2.6233, + "step": 55044 + }, + { + "epoch": 2.562748795306935, + "grad_norm": 0.3026002461611152, + "learning_rate": 6.329778416798404e-06, + "loss": 2.5861, + "step": 55045 + }, + { + "epoch": 2.562795353493028, + "grad_norm": 0.3009369060856513, + "learning_rate": 6.328459338143899e-06, + "loss": 2.6419, + "step": 55046 + }, + { + "epoch": 2.5628419116791212, + "grad_norm": 0.3063073978840048, + "learning_rate": 6.327140387661063e-06, + "loss": 2.6592, + "step": 55047 + }, + { + "epoch": 2.562888469865214, + "grad_norm": 0.2916514466261716, + "learning_rate": 6.325821565353767e-06, + "loss": 2.633, + "step": 55048 + }, + { + "epoch": 2.562935028051307, + "grad_norm": 0.29026027406806754, + "learning_rate": 6.324502871225879e-06, + "loss": 2.593, + "step": 55049 + }, + { + "epoch": 2.5629815862374, + "grad_norm": 0.2873757669949071, + "learning_rate": 6.323184305281288e-06, + "loss": 2.561, + "step": 55050 + }, + { + "epoch": 2.5630281444234932, + "grad_norm": 0.292462816411831, + "learning_rate": 6.321865867523835e-06, + "loss": 2.6989, + "step": 55051 + }, + { + "epoch": 2.5630747026095864, + "grad_norm": 0.2925935216159578, + "learning_rate": 6.320547557957407e-06, + "loss": 2.6252, + "step": 55052 + }, + { + "epoch": 2.5631212607956795, + "grad_norm": 0.3118016910194083, + "learning_rate": 6.319229376585867e-06, + "loss": 2.671, + "step": 55053 + }, + { + "epoch": 2.5631678189817726, + "grad_norm": 0.3112256672303153, + "learning_rate": 6.317911323413084e-06, + "loss": 2.6837, + "step": 55054 + }, + { + "epoch": 2.5632143771678653, + "grad_norm": 0.2894907206610842, + "learning_rate": 6.316593398442927e-06, + "loss": 2.5711, + "step": 55055 + }, + { + "epoch": 2.5632609353539584, + "grad_norm": 0.30998288888053854, + "learning_rate": 6.315275601679277e-06, + "loss": 2.6563, + "step": 55056 + }, + { + "epoch": 2.5633074935400515, + "grad_norm": 0.2932466287212545, + "learning_rate": 6.313957933125964e-06, + "loss": 2.5559, + "step": 55057 + }, + { + "epoch": 2.5633540517261446, + "grad_norm": 0.3006164529347588, + "learning_rate": 6.312640392786906e-06, + "loss": 2.4883, + "step": 55058 + }, + { + "epoch": 2.5634006099122377, + "grad_norm": 0.29817840116732386, + "learning_rate": 6.311322980665935e-06, + "loss": 2.6337, + "step": 55059 + }, + { + "epoch": 2.563447168098331, + "grad_norm": 0.28260317164426185, + "learning_rate": 
6.310005696766924e-06, + "loss": 2.556, + "step": 55060 + }, + { + "epoch": 2.563493726284424, + "grad_norm": 0.29739779398284893, + "learning_rate": 6.308688541093744e-06, + "loss": 2.5506, + "step": 55061 + }, + { + "epoch": 2.563540284470517, + "grad_norm": 0.30114181719157435, + "learning_rate": 6.3073715136502565e-06, + "loss": 2.6343, + "step": 55062 + }, + { + "epoch": 2.56358684265661, + "grad_norm": 0.29907846823398165, + "learning_rate": 6.306054614440343e-06, + "loss": 2.7081, + "step": 55063 + }, + { + "epoch": 2.5636334008427033, + "grad_norm": 0.30595137447456955, + "learning_rate": 6.304737843467845e-06, + "loss": 2.5681, + "step": 55064 + }, + { + "epoch": 2.5636799590287964, + "grad_norm": 0.3001040562207001, + "learning_rate": 6.303421200736631e-06, + "loss": 2.5322, + "step": 55065 + }, + { + "epoch": 2.5637265172148895, + "grad_norm": 0.29677531845311755, + "learning_rate": 6.3021046862505765e-06, + "loss": 2.5914, + "step": 55066 + }, + { + "epoch": 2.5637730754009826, + "grad_norm": 0.29878064591076225, + "learning_rate": 6.300788300013538e-06, + "loss": 2.6585, + "step": 55067 + }, + { + "epoch": 2.5638196335870753, + "grad_norm": 0.3033868830347948, + "learning_rate": 6.299472042029381e-06, + "loss": 2.5828, + "step": 55068 + }, + { + "epoch": 2.5638661917731684, + "grad_norm": 0.30542304101802076, + "learning_rate": 6.2981559123019785e-06, + "loss": 2.61, + "step": 55069 + }, + { + "epoch": 2.5639127499592616, + "grad_norm": 0.31736305826360006, + "learning_rate": 6.2968399108351615e-06, + "loss": 2.6588, + "step": 55070 + }, + { + "epoch": 2.5639593081453547, + "grad_norm": 0.3038457149772529, + "learning_rate": 6.295524037632833e-06, + "loss": 2.697, + "step": 55071 + }, + { + "epoch": 2.564005866331448, + "grad_norm": 0.30390500200767706, + "learning_rate": 6.294208292698816e-06, + "loss": 2.5505, + "step": 55072 + }, + { + "epoch": 2.564052424517541, + "grad_norm": 0.3102692312867844, + "learning_rate": 6.292892676037016e-06, + "loss": 2.5751, + "step": 55073 + }, + { + "epoch": 2.564098982703634, + "grad_norm": 0.308323595958637, + "learning_rate": 6.291577187651254e-06, + "loss": 2.6501, + "step": 55074 + }, + { + "epoch": 2.5641455408897267, + "grad_norm": 0.3042693008862379, + "learning_rate": 6.290261827545412e-06, + "loss": 2.7085, + "step": 55075 + }, + { + "epoch": 2.56419209907582, + "grad_norm": 0.2943226692888509, + "learning_rate": 6.288946595723355e-06, + "loss": 2.5527, + "step": 55076 + }, + { + "epoch": 2.564238657261913, + "grad_norm": 0.30810372880370707, + "learning_rate": 6.287631492188922e-06, + "loss": 2.6978, + "step": 55077 + }, + { + "epoch": 2.564285215448006, + "grad_norm": 0.30084066250022407, + "learning_rate": 6.286316516945984e-06, + "loss": 2.6603, + "step": 55078 + }, + { + "epoch": 2.564331773634099, + "grad_norm": 0.30235902290714356, + "learning_rate": 6.285001669998403e-06, + "loss": 2.6181, + "step": 55079 + }, + { + "epoch": 2.5643783318201923, + "grad_norm": 0.28946513555303766, + "learning_rate": 6.2836869513500316e-06, + "loss": 2.5659, + "step": 55080 + }, + { + "epoch": 2.5644248900062854, + "grad_norm": 0.2958657131132892, + "learning_rate": 6.282372361004735e-06, + "loss": 2.6246, + "step": 55081 + }, + { + "epoch": 2.5644714481923785, + "grad_norm": 0.29415846249434924, + "learning_rate": 6.28105789896638e-06, + "loss": 2.5998, + "step": 55082 + }, + { + "epoch": 2.5645180063784716, + "grad_norm": 0.31154627520781325, + "learning_rate": 6.2797435652387885e-06, + "loss": 2.6709, + "step": 55083 + }, + { + "epoch": 
2.5645645645645647, + "grad_norm": 0.29856756534339546, + "learning_rate": 6.278429359825866e-06, + "loss": 2.6196, + "step": 55084 + }, + { + "epoch": 2.564611122750658, + "grad_norm": 0.28138797481441186, + "learning_rate": 6.277115282731422e-06, + "loss": 2.5645, + "step": 55085 + }, + { + "epoch": 2.564657680936751, + "grad_norm": 0.29654276881204733, + "learning_rate": 6.275801333959364e-06, + "loss": 2.6549, + "step": 55086 + }, + { + "epoch": 2.5647042391228436, + "grad_norm": 0.29453968517881296, + "learning_rate": 6.274487513513505e-06, + "loss": 2.7136, + "step": 55087 + }, + { + "epoch": 2.5647507973089367, + "grad_norm": 0.30201977731370183, + "learning_rate": 6.273173821397721e-06, + "loss": 2.6963, + "step": 55088 + }, + { + "epoch": 2.56479735549503, + "grad_norm": 0.3141564487325488, + "learning_rate": 6.2718602576158745e-06, + "loss": 2.6733, + "step": 55089 + }, + { + "epoch": 2.564843913681123, + "grad_norm": 0.296368338916309, + "learning_rate": 6.270546822171785e-06, + "loss": 2.6144, + "step": 55090 + }, + { + "epoch": 2.564890471867216, + "grad_norm": 0.31632782976396245, + "learning_rate": 6.269233515069362e-06, + "loss": 2.6943, + "step": 55091 + }, + { + "epoch": 2.564937030053309, + "grad_norm": 0.3103656635309044, + "learning_rate": 6.267920336312411e-06, + "loss": 2.6982, + "step": 55092 + }, + { + "epoch": 2.5649835882394023, + "grad_norm": 0.30010217482830176, + "learning_rate": 6.266607285904808e-06, + "loss": 2.6412, + "step": 55093 + }, + { + "epoch": 2.5650301464254954, + "grad_norm": 0.29721212506900224, + "learning_rate": 6.265294363850405e-06, + "loss": 2.5663, + "step": 55094 + }, + { + "epoch": 2.565076704611588, + "grad_norm": 0.3160085943027106, + "learning_rate": 6.263981570153055e-06, + "loss": 2.6577, + "step": 55095 + }, + { + "epoch": 2.565123262797681, + "grad_norm": 0.31082528497499984, + "learning_rate": 6.26266890481661e-06, + "loss": 2.6661, + "step": 55096 + }, + { + "epoch": 2.5651698209837743, + "grad_norm": 0.30847710912380255, + "learning_rate": 6.261356367844934e-06, + "loss": 2.5708, + "step": 55097 + }, + { + "epoch": 2.5652163791698674, + "grad_norm": 0.31171854809794003, + "learning_rate": 6.260043959241846e-06, + "loss": 2.6085, + "step": 55098 + }, + { + "epoch": 2.5652629373559606, + "grad_norm": 0.2973171586129447, + "learning_rate": 6.258731679011243e-06, + "loss": 2.6349, + "step": 55099 + }, + { + "epoch": 2.5653094955420537, + "grad_norm": 0.29808752464853255, + "learning_rate": 6.257419527156944e-06, + "loss": 2.6549, + "step": 55100 + }, + { + "epoch": 2.565356053728147, + "grad_norm": 0.3169405056569395, + "learning_rate": 6.256107503682807e-06, + "loss": 2.6914, + "step": 55101 + }, + { + "epoch": 2.56540261191424, + "grad_norm": 0.3184369888678679, + "learning_rate": 6.254795608592695e-06, + "loss": 2.5831, + "step": 55102 + }, + { + "epoch": 2.565449170100333, + "grad_norm": 0.30057344803259656, + "learning_rate": 6.253483841890429e-06, + "loss": 2.6113, + "step": 55103 + }, + { + "epoch": 2.565495728286426, + "grad_norm": 0.2996719069921408, + "learning_rate": 6.252172203579892e-06, + "loss": 2.6049, + "step": 55104 + }, + { + "epoch": 2.5655422864725193, + "grad_norm": 0.29798841522180064, + "learning_rate": 6.250860693664917e-06, + "loss": 2.6093, + "step": 55105 + }, + { + "epoch": 2.5655888446586124, + "grad_norm": 0.3242097087208791, + "learning_rate": 6.249549312149355e-06, + "loss": 2.6442, + "step": 55106 + }, + { + "epoch": 2.565635402844705, + "grad_norm": 0.3166917715834461, + "learning_rate": 
6.248238059037054e-06, + "loss": 2.5965, + "step": 55107 + }, + { + "epoch": 2.565681961030798, + "grad_norm": 0.3047687207152902, + "learning_rate": 6.246926934331865e-06, + "loss": 2.6456, + "step": 55108 + }, + { + "epoch": 2.5657285192168913, + "grad_norm": 0.30352680761428574, + "learning_rate": 6.245615938037635e-06, + "loss": 2.6577, + "step": 55109 + }, + { + "epoch": 2.5657750774029844, + "grad_norm": 0.31305768572512205, + "learning_rate": 6.244305070158219e-06, + "loss": 2.6209, + "step": 55110 + }, + { + "epoch": 2.5658216355890775, + "grad_norm": 0.3027164305236051, + "learning_rate": 6.242994330697438e-06, + "loss": 2.6094, + "step": 55111 + }, + { + "epoch": 2.5658681937751706, + "grad_norm": 0.3102308284706086, + "learning_rate": 6.24168371965918e-06, + "loss": 2.6516, + "step": 55112 + }, + { + "epoch": 2.5659147519612637, + "grad_norm": 0.3071166405112751, + "learning_rate": 6.240373237047259e-06, + "loss": 2.6285, + "step": 55113 + }, + { + "epoch": 2.5659613101473564, + "grad_norm": 0.2960558978288282, + "learning_rate": 6.239062882865532e-06, + "loss": 2.6084, + "step": 55114 + }, + { + "epoch": 2.5660078683334495, + "grad_norm": 0.29581126651906575, + "learning_rate": 6.237752657117851e-06, + "loss": 2.5837, + "step": 55115 + }, + { + "epoch": 2.5660544265195426, + "grad_norm": 0.30312883243129746, + "learning_rate": 6.2364425598080364e-06, + "loss": 2.6408, + "step": 55116 + }, + { + "epoch": 2.5661009847056357, + "grad_norm": 0.28937156780345596, + "learning_rate": 6.235132590939974e-06, + "loss": 2.5845, + "step": 55117 + }, + { + "epoch": 2.566147542891729, + "grad_norm": 0.2979292466420387, + "learning_rate": 6.233822750517476e-06, + "loss": 2.6232, + "step": 55118 + }, + { + "epoch": 2.566194101077822, + "grad_norm": 0.2925043621562825, + "learning_rate": 6.232513038544391e-06, + "loss": 2.7017, + "step": 55119 + }, + { + "epoch": 2.566240659263915, + "grad_norm": 0.30826904311842784, + "learning_rate": 6.23120345502457e-06, + "loss": 2.5877, + "step": 55120 + }, + { + "epoch": 2.566287217450008, + "grad_norm": 0.2967545231511546, + "learning_rate": 6.229893999961861e-06, + "loss": 2.5534, + "step": 55121 + }, + { + "epoch": 2.5663337756361013, + "grad_norm": 0.2991975377473648, + "learning_rate": 6.228584673360094e-06, + "loss": 2.6866, + "step": 55122 + }, + { + "epoch": 2.5663803338221944, + "grad_norm": 0.2974440804455021, + "learning_rate": 6.227275475223132e-06, + "loss": 2.5451, + "step": 55123 + }, + { + "epoch": 2.5664268920082876, + "grad_norm": 0.31186018663981546, + "learning_rate": 6.225966405554784e-06, + "loss": 2.6385, + "step": 55124 + }, + { + "epoch": 2.5664734501943807, + "grad_norm": 0.2906611877203354, + "learning_rate": 6.224657464358929e-06, + "loss": 2.6232, + "step": 55125 + }, + { + "epoch": 2.5665200083804733, + "grad_norm": 0.30354126029222556, + "learning_rate": 6.223348651639388e-06, + "loss": 2.6045, + "step": 55126 + }, + { + "epoch": 2.5665665665665665, + "grad_norm": 0.29416917040585305, + "learning_rate": 6.222039967400001e-06, + "loss": 2.6382, + "step": 55127 + }, + { + "epoch": 2.5666131247526596, + "grad_norm": 0.2969518262370767, + "learning_rate": 6.220731411644626e-06, + "loss": 2.6152, + "step": 55128 + }, + { + "epoch": 2.5666596829387527, + "grad_norm": 0.28934983967220806, + "learning_rate": 6.219422984377072e-06, + "loss": 2.6151, + "step": 55129 + }, + { + "epoch": 2.566706241124846, + "grad_norm": 0.30199321041549737, + "learning_rate": 6.218114685601218e-06, + "loss": 2.5939, + "step": 55130 + }, + { + "epoch": 
2.566752799310939, + "grad_norm": 0.30608432712442923, + "learning_rate": 6.216806515320861e-06, + "loss": 2.6461, + "step": 55131 + }, + { + "epoch": 2.566799357497032, + "grad_norm": 0.29811202448014706, + "learning_rate": 6.215498473539888e-06, + "loss": 2.6028, + "step": 55132 + }, + { + "epoch": 2.566845915683125, + "grad_norm": 0.31762391896612063, + "learning_rate": 6.214190560262101e-06, + "loss": 2.6522, + "step": 55133 + }, + { + "epoch": 2.566892473869218, + "grad_norm": 0.29497120758073886, + "learning_rate": 6.212882775491352e-06, + "loss": 2.6389, + "step": 55134 + }, + { + "epoch": 2.566939032055311, + "grad_norm": 0.2922905681633141, + "learning_rate": 6.211575119231477e-06, + "loss": 2.6909, + "step": 55135 + }, + { + "epoch": 2.566985590241404, + "grad_norm": 0.3141367973981268, + "learning_rate": 6.210267591486324e-06, + "loss": 2.6174, + "step": 55136 + }, + { + "epoch": 2.567032148427497, + "grad_norm": 0.3092054075216844, + "learning_rate": 6.208960192259705e-06, + "loss": 2.5798, + "step": 55137 + }, + { + "epoch": 2.5670787066135903, + "grad_norm": 0.30421510812203295, + "learning_rate": 6.20765292155549e-06, + "loss": 2.5602, + "step": 55138 + }, + { + "epoch": 2.5671252647996834, + "grad_norm": 0.29446924457966966, + "learning_rate": 6.2063457793774924e-06, + "loss": 2.5431, + "step": 55139 + }, + { + "epoch": 2.5671718229857765, + "grad_norm": 0.3019267595364212, + "learning_rate": 6.2050387657295535e-06, + "loss": 2.5826, + "step": 55140 + }, + { + "epoch": 2.5672183811718696, + "grad_norm": 0.30720759676587533, + "learning_rate": 6.203731880615526e-06, + "loss": 2.604, + "step": 55141 + }, + { + "epoch": 2.5672649393579627, + "grad_norm": 0.2941629435051882, + "learning_rate": 6.202425124039202e-06, + "loss": 2.6232, + "step": 55142 + }, + { + "epoch": 2.567311497544056, + "grad_norm": 0.29970124187385516, + "learning_rate": 6.2011184960044765e-06, + "loss": 2.6167, + "step": 55143 + }, + { + "epoch": 2.567358055730149, + "grad_norm": 0.294741143028163, + "learning_rate": 6.199811996515126e-06, + "loss": 2.6257, + "step": 55144 + }, + { + "epoch": 2.567404613916242, + "grad_norm": 0.3028584512952356, + "learning_rate": 6.198505625575035e-06, + "loss": 2.5027, + "step": 55145 + }, + { + "epoch": 2.5674511721023348, + "grad_norm": 0.3084390888310903, + "learning_rate": 6.197199383188001e-06, + "loss": 2.6732, + "step": 55146 + }, + { + "epoch": 2.567497730288428, + "grad_norm": 0.29450184271147434, + "learning_rate": 6.195893269357872e-06, + "loss": 2.603, + "step": 55147 + }, + { + "epoch": 2.567544288474521, + "grad_norm": 0.30785310302650365, + "learning_rate": 6.194587284088488e-06, + "loss": 2.6881, + "step": 55148 + }, + { + "epoch": 2.567590846660614, + "grad_norm": 0.29725346739276376, + "learning_rate": 6.193281427383668e-06, + "loss": 2.5879, + "step": 55149 + }, + { + "epoch": 2.567637404846707, + "grad_norm": 0.30609484642224, + "learning_rate": 6.191975699247254e-06, + "loss": 2.6828, + "step": 55150 + }, + { + "epoch": 2.5676839630328003, + "grad_norm": 0.29675970469507995, + "learning_rate": 6.190670099683082e-06, + "loss": 2.5901, + "step": 55151 + }, + { + "epoch": 2.5677305212188934, + "grad_norm": 0.30649886811671073, + "learning_rate": 6.1893646286949715e-06, + "loss": 2.7614, + "step": 55152 + }, + { + "epoch": 2.567777079404986, + "grad_norm": 0.3153916737025475, + "learning_rate": 6.188059286286757e-06, + "loss": 2.638, + "step": 55153 + }, + { + "epoch": 2.5678236375910792, + "grad_norm": 0.30229006052836443, + "learning_rate": 
6.186754072462286e-06, + "loss": 2.5447, + "step": 55154 + }, + { + "epoch": 2.5678701957771723, + "grad_norm": 0.2965247106128598, + "learning_rate": 6.185448987225351e-06, + "loss": 2.5909, + "step": 55155 + }, + { + "epoch": 2.5679167539632655, + "grad_norm": 0.3117502152650038, + "learning_rate": 6.18414403057983e-06, + "loss": 2.634, + "step": 55156 + }, + { + "epoch": 2.5679633121493586, + "grad_norm": 0.3044564102969447, + "learning_rate": 6.182839202529506e-06, + "loss": 2.5799, + "step": 55157 + }, + { + "epoch": 2.5680098703354517, + "grad_norm": 0.3021314141082958, + "learning_rate": 6.181534503078257e-06, + "loss": 2.6151, + "step": 55158 + }, + { + "epoch": 2.568056428521545, + "grad_norm": 0.3163510396649981, + "learning_rate": 6.180229932229875e-06, + "loss": 2.6128, + "step": 55159 + }, + { + "epoch": 2.568102986707638, + "grad_norm": 0.2896575775705269, + "learning_rate": 6.178925489988202e-06, + "loss": 2.563, + "step": 55160 + }, + { + "epoch": 2.568149544893731, + "grad_norm": 0.29477589386992303, + "learning_rate": 6.177621176357062e-06, + "loss": 2.6351, + "step": 55161 + }, + { + "epoch": 2.568196103079824, + "grad_norm": 0.30120860373695424, + "learning_rate": 6.176316991340292e-06, + "loss": 2.6058, + "step": 55162 + }, + { + "epoch": 2.5682426612659173, + "grad_norm": 0.3207824470452572, + "learning_rate": 6.17501293494171e-06, + "loss": 2.5769, + "step": 55163 + }, + { + "epoch": 2.5682892194520104, + "grad_norm": 0.29275467140345285, + "learning_rate": 6.173709007165157e-06, + "loss": 2.6383, + "step": 55164 + }, + { + "epoch": 2.568335777638103, + "grad_norm": 0.3053573124935315, + "learning_rate": 6.172405208014448e-06, + "loss": 2.5914, + "step": 55165 + }, + { + "epoch": 2.568382335824196, + "grad_norm": 0.3061583634050144, + "learning_rate": 6.171101537493401e-06, + "loss": 2.5185, + "step": 55166 + }, + { + "epoch": 2.5684288940102893, + "grad_norm": 0.31916418079020686, + "learning_rate": 6.169797995605863e-06, + "loss": 2.6578, + "step": 55167 + }, + { + "epoch": 2.5684754521963824, + "grad_norm": 0.30882612256863595, + "learning_rate": 6.168494582355644e-06, + "loss": 2.653, + "step": 55168 + }, + { + "epoch": 2.5685220103824755, + "grad_norm": 0.301466728339071, + "learning_rate": 6.1671912977465885e-06, + "loss": 2.5954, + "step": 55169 + }, + { + "epoch": 2.5685685685685686, + "grad_norm": 0.30018378895776743, + "learning_rate": 6.165888141782483e-06, + "loss": 2.6777, + "step": 55170 + }, + { + "epoch": 2.5686151267546617, + "grad_norm": 0.31002078362531454, + "learning_rate": 6.164585114467203e-06, + "loss": 2.705, + "step": 55171 + }, + { + "epoch": 2.568661684940755, + "grad_norm": 0.2978493413258687, + "learning_rate": 6.163282215804539e-06, + "loss": 2.5433, + "step": 55172 + }, + { + "epoch": 2.5687082431268475, + "grad_norm": 0.30239750791851683, + "learning_rate": 6.161979445798316e-06, + "loss": 2.6249, + "step": 55173 + }, + { + "epoch": 2.5687548013129406, + "grad_norm": 0.2941230961971086, + "learning_rate": 6.16067680445237e-06, + "loss": 2.6099, + "step": 55174 + }, + { + "epoch": 2.5688013594990338, + "grad_norm": 0.33298706571473863, + "learning_rate": 6.159374291770514e-06, + "loss": 2.6912, + "step": 55175 + }, + { + "epoch": 2.568847917685127, + "grad_norm": 0.29227696657590835, + "learning_rate": 6.158071907756579e-06, + "loss": 2.5815, + "step": 55176 + }, + { + "epoch": 2.56889447587122, + "grad_norm": 0.3043563957519986, + "learning_rate": 6.156769652414396e-06, + "loss": 2.7208, + "step": 55177 + }, + { + "epoch": 
2.568941034057313, + "grad_norm": 0.3003972935827677, + "learning_rate": 6.15546752574776e-06, + "loss": 2.627, + "step": 55178 + }, + { + "epoch": 2.5689875922434062, + "grad_norm": 0.29199701394215527, + "learning_rate": 6.154165527760508e-06, + "loss": 2.6744, + "step": 55179 + }, + { + "epoch": 2.5690341504294993, + "grad_norm": 0.30720860241770453, + "learning_rate": 6.1528636584564644e-06, + "loss": 2.5609, + "step": 55180 + }, + { + "epoch": 2.5690807086155925, + "grad_norm": 0.29688857018672954, + "learning_rate": 6.151561917839444e-06, + "loss": 2.6704, + "step": 55181 + }, + { + "epoch": 2.5691272668016856, + "grad_norm": 0.2977215175190522, + "learning_rate": 6.15026030591328e-06, + "loss": 2.6098, + "step": 55182 + }, + { + "epoch": 2.5691738249877787, + "grad_norm": 0.3052065151647173, + "learning_rate": 6.1489588226817606e-06, + "loss": 2.661, + "step": 55183 + }, + { + "epoch": 2.569220383173872, + "grad_norm": 0.29839166661966415, + "learning_rate": 6.147657468148749e-06, + "loss": 2.5993, + "step": 55184 + }, + { + "epoch": 2.5692669413599645, + "grad_norm": 0.304218689577172, + "learning_rate": 6.146356242318019e-06, + "loss": 2.6228, + "step": 55185 + }, + { + "epoch": 2.5693134995460576, + "grad_norm": 0.284536785623088, + "learning_rate": 6.1450551451934344e-06, + "loss": 2.6578, + "step": 55186 + }, + { + "epoch": 2.5693600577321507, + "grad_norm": 0.311093534081228, + "learning_rate": 6.143754176778782e-06, + "loss": 2.6482, + "step": 55187 + }, + { + "epoch": 2.569406615918244, + "grad_norm": 0.30935194519930187, + "learning_rate": 6.142453337077891e-06, + "loss": 2.6085, + "step": 55188 + }, + { + "epoch": 2.569453174104337, + "grad_norm": 0.31133237684323944, + "learning_rate": 6.141152626094582e-06, + "loss": 2.5476, + "step": 55189 + }, + { + "epoch": 2.56949973229043, + "grad_norm": 0.28819890106320895, + "learning_rate": 6.1398520438326775e-06, + "loss": 2.5178, + "step": 55190 + }, + { + "epoch": 2.569546290476523, + "grad_norm": 0.3097249669216128, + "learning_rate": 6.138551590295977e-06, + "loss": 2.7296, + "step": 55191 + }, + { + "epoch": 2.569592848662616, + "grad_norm": 0.3077971980116616, + "learning_rate": 6.137251265488303e-06, + "loss": 2.611, + "step": 55192 + }, + { + "epoch": 2.569639406848709, + "grad_norm": 0.2854332679545527, + "learning_rate": 6.135951069413481e-06, + "loss": 2.6236, + "step": 55193 + }, + { + "epoch": 2.569685965034802, + "grad_norm": 0.30204235237727145, + "learning_rate": 6.134651002075315e-06, + "loss": 2.6579, + "step": 55194 + }, + { + "epoch": 2.569732523220895, + "grad_norm": 0.3003983557992067, + "learning_rate": 6.133351063477644e-06, + "loss": 2.6924, + "step": 55195 + }, + { + "epoch": 2.5697790814069883, + "grad_norm": 0.3105257507630138, + "learning_rate": 6.132051253624238e-06, + "loss": 2.5581, + "step": 55196 + }, + { + "epoch": 2.5698256395930814, + "grad_norm": 0.30702100732921644, + "learning_rate": 6.130751572518967e-06, + "loss": 2.5873, + "step": 55197 + }, + { + "epoch": 2.5698721977791745, + "grad_norm": 0.3066564149761282, + "learning_rate": 6.129452020165594e-06, + "loss": 2.6432, + "step": 55198 + }, + { + "epoch": 2.5699187559652676, + "grad_norm": 0.29520505643590905, + "learning_rate": 6.128152596567977e-06, + "loss": 2.5355, + "step": 55199 + }, + { + "epoch": 2.5699653141513608, + "grad_norm": 0.2965406108300965, + "learning_rate": 6.1268533017299015e-06, + "loss": 2.6426, + "step": 55200 + }, + { + "epoch": 2.570011872337454, + "grad_norm": 0.2922711986140115, + "learning_rate": 
6.125554135655192e-06, + "loss": 2.617, + "step": 55201 + }, + { + "epoch": 2.570058430523547, + "grad_norm": 0.28917091293410957, + "learning_rate": 6.124255098347653e-06, + "loss": 2.6668, + "step": 55202 + }, + { + "epoch": 2.57010498870964, + "grad_norm": 0.29208033899412816, + "learning_rate": 6.122956189811108e-06, + "loss": 2.5837, + "step": 55203 + }, + { + "epoch": 2.570151546895733, + "grad_norm": 0.31805774466199516, + "learning_rate": 6.12165741004937e-06, + "loss": 2.6631, + "step": 55204 + }, + { + "epoch": 2.570198105081826, + "grad_norm": 0.3047553901321941, + "learning_rate": 6.120358759066236e-06, + "loss": 2.6528, + "step": 55205 + }, + { + "epoch": 2.570244663267919, + "grad_norm": 0.2987104382944988, + "learning_rate": 6.119060236865531e-06, + "loss": 2.7355, + "step": 55206 + }, + { + "epoch": 2.570291221454012, + "grad_norm": 0.31818104622552984, + "learning_rate": 6.117761843451053e-06, + "loss": 2.589, + "step": 55207 + }, + { + "epoch": 2.5703377796401052, + "grad_norm": 0.3077392708760516, + "learning_rate": 6.116463578826637e-06, + "loss": 2.5056, + "step": 55208 + }, + { + "epoch": 2.5703843378261984, + "grad_norm": 0.3043561408905964, + "learning_rate": 6.115165442996057e-06, + "loss": 2.6056, + "step": 55209 + }, + { + "epoch": 2.5704308960122915, + "grad_norm": 0.30702099369987174, + "learning_rate": 6.113867435963161e-06, + "loss": 2.6523, + "step": 55210 + }, + { + "epoch": 2.5704774541983846, + "grad_norm": 0.32086283776190533, + "learning_rate": 6.112569557731718e-06, + "loss": 2.6461, + "step": 55211 + }, + { + "epoch": 2.5705240123844773, + "grad_norm": 0.3110685608782638, + "learning_rate": 6.111271808305585e-06, + "loss": 2.5693, + "step": 55212 + }, + { + "epoch": 2.5705705705705704, + "grad_norm": 0.33083760250073135, + "learning_rate": 6.109974187688533e-06, + "loss": 2.6402, + "step": 55213 + }, + { + "epoch": 2.5706171287566635, + "grad_norm": 0.2980653424349308, + "learning_rate": 6.10867669588438e-06, + "loss": 2.6435, + "step": 55214 + }, + { + "epoch": 2.5706636869427566, + "grad_norm": 0.30545482510726685, + "learning_rate": 6.1073793328969394e-06, + "loss": 2.5877, + "step": 55215 + }, + { + "epoch": 2.5707102451288497, + "grad_norm": 0.2963909411523941, + "learning_rate": 6.106082098730015e-06, + "loss": 2.6874, + "step": 55216 + }, + { + "epoch": 2.570756803314943, + "grad_norm": 0.29955991850649594, + "learning_rate": 6.10478499338743e-06, + "loss": 2.7828, + "step": 55217 + }, + { + "epoch": 2.570803361501036, + "grad_norm": 0.2978609543220866, + "learning_rate": 6.1034880168729655e-06, + "loss": 2.5046, + "step": 55218 + }, + { + "epoch": 2.570849919687129, + "grad_norm": 0.3192804241509511, + "learning_rate": 6.102191169190435e-06, + "loss": 2.5712, + "step": 55219 + }, + { + "epoch": 2.570896477873222, + "grad_norm": 0.2998777539706604, + "learning_rate": 6.100894450343653e-06, + "loss": 2.6556, + "step": 55220 + }, + { + "epoch": 2.5709430360593153, + "grad_norm": 0.29802344421220084, + "learning_rate": 6.099597860336415e-06, + "loss": 2.5887, + "step": 55221 + }, + { + "epoch": 2.5709895942454084, + "grad_norm": 0.338796512710493, + "learning_rate": 6.09830139917254e-06, + "loss": 2.6284, + "step": 55222 + }, + { + "epoch": 2.5710361524315015, + "grad_norm": 0.2928785507914382, + "learning_rate": 6.097005066855832e-06, + "loss": 2.6581, + "step": 55223 + }, + { + "epoch": 2.571082710617594, + "grad_norm": 0.2994654277659365, + "learning_rate": 6.095708863390065e-06, + "loss": 2.5533, + "step": 55224 + }, + { + "epoch": 
2.5711292688036873, + "grad_norm": 0.2932455677810013, + "learning_rate": 6.0944127887790914e-06, + "loss": 2.6404, + "step": 55225 + }, + { + "epoch": 2.5711758269897804, + "grad_norm": 0.3220168280168573, + "learning_rate": 6.09311684302668e-06, + "loss": 2.6933, + "step": 55226 + }, + { + "epoch": 2.5712223851758735, + "grad_norm": 0.2959901799070705, + "learning_rate": 6.0918210261366456e-06, + "loss": 2.5368, + "step": 55227 + }, + { + "epoch": 2.5712689433619667, + "grad_norm": 0.3107059362671952, + "learning_rate": 6.090525338112791e-06, + "loss": 2.6247, + "step": 55228 + }, + { + "epoch": 2.5713155015480598, + "grad_norm": 0.3050708331493443, + "learning_rate": 6.089229778958921e-06, + "loss": 2.598, + "step": 55229 + }, + { + "epoch": 2.571362059734153, + "grad_norm": 0.28970388728470664, + "learning_rate": 6.087934348678842e-06, + "loss": 2.6525, + "step": 55230 + }, + { + "epoch": 2.5714086179202456, + "grad_norm": 0.3287286023234023, + "learning_rate": 6.086639047276338e-06, + "loss": 2.6576, + "step": 55231 + }, + { + "epoch": 2.5714551761063387, + "grad_norm": 0.31415977414240287, + "learning_rate": 6.085343874755229e-06, + "loss": 2.6203, + "step": 55232 + }, + { + "epoch": 2.571501734292432, + "grad_norm": 0.310627313321636, + "learning_rate": 6.084048831119304e-06, + "loss": 2.6512, + "step": 55233 + }, + { + "epoch": 2.571548292478525, + "grad_norm": 0.3140816023941911, + "learning_rate": 6.082753916372369e-06, + "loss": 2.6513, + "step": 55234 + }, + { + "epoch": 2.571594850664618, + "grad_norm": 0.29147564660157416, + "learning_rate": 6.081459130518225e-06, + "loss": 2.6247, + "step": 55235 + }, + { + "epoch": 2.571641408850711, + "grad_norm": 0.31092439652304066, + "learning_rate": 6.080164473560684e-06, + "loss": 2.6912, + "step": 55236 + }, + { + "epoch": 2.5716879670368042, + "grad_norm": 0.3147132420576498, + "learning_rate": 6.078869945503513e-06, + "loss": 2.6567, + "step": 55237 + }, + { + "epoch": 2.5717345252228974, + "grad_norm": 0.30278242763050917, + "learning_rate": 6.07757554635055e-06, + "loss": 2.7479, + "step": 55238 + }, + { + "epoch": 2.5717810834089905, + "grad_norm": 0.31067889518807384, + "learning_rate": 6.0762812761055666e-06, + "loss": 2.6552, + "step": 55239 + }, + { + "epoch": 2.5718276415950836, + "grad_norm": 0.2962326517833622, + "learning_rate": 6.074987134772375e-06, + "loss": 2.6351, + "step": 55240 + }, + { + "epoch": 2.5718741997811767, + "grad_norm": 0.3030846551451466, + "learning_rate": 6.073693122354762e-06, + "loss": 2.5776, + "step": 55241 + }, + { + "epoch": 2.57192075796727, + "grad_norm": 0.30053938022181403, + "learning_rate": 6.0723992388565355e-06, + "loss": 2.5997, + "step": 55242 + }, + { + "epoch": 2.571967316153363, + "grad_norm": 0.30381526758308186, + "learning_rate": 6.071105484281497e-06, + "loss": 2.6476, + "step": 55243 + }, + { + "epoch": 2.5720138743394556, + "grad_norm": 0.28329616927414497, + "learning_rate": 6.0698118586334285e-06, + "loss": 2.5542, + "step": 55244 + }, + { + "epoch": 2.5720604325255487, + "grad_norm": 0.33390566056957127, + "learning_rate": 6.068518361916137e-06, + "loss": 2.7261, + "step": 55245 + }, + { + "epoch": 2.572106990711642, + "grad_norm": 0.2966650523492396, + "learning_rate": 6.0672249941334136e-06, + "loss": 2.666, + "step": 55246 + }, + { + "epoch": 2.572153548897735, + "grad_norm": 0.28724093788564803, + "learning_rate": 6.065931755289056e-06, + "loss": 2.5674, + "step": 55247 + }, + { + "epoch": 2.572200107083828, + "grad_norm": 0.2884102869496805, + "learning_rate": 
6.064638645386861e-06, + "loss": 2.659, + "step": 55248 + }, + { + "epoch": 2.572246665269921, + "grad_norm": 0.2881165619911045, + "learning_rate": 6.06334566443063e-06, + "loss": 2.444, + "step": 55249 + }, + { + "epoch": 2.5722932234560143, + "grad_norm": 0.31421294076189565, + "learning_rate": 6.062052812424135e-06, + "loss": 2.6431, + "step": 55250 + }, + { + "epoch": 2.572339781642107, + "grad_norm": 0.29578848223251564, + "learning_rate": 6.060760089371204e-06, + "loss": 2.5765, + "step": 55251 + }, + { + "epoch": 2.5723863398282, + "grad_norm": 0.2962941015186596, + "learning_rate": 6.0594674952756056e-06, + "loss": 2.5251, + "step": 55252 + }, + { + "epoch": 2.572432898014293, + "grad_norm": 0.30836154613716715, + "learning_rate": 6.058175030141139e-06, + "loss": 2.6425, + "step": 55253 + }, + { + "epoch": 2.5724794562003863, + "grad_norm": 0.31818125520669255, + "learning_rate": 6.0568826939716054e-06, + "loss": 2.6349, + "step": 55254 + }, + { + "epoch": 2.5725260143864794, + "grad_norm": 0.309607050252162, + "learning_rate": 6.055590486770784e-06, + "loss": 2.6486, + "step": 55255 + }, + { + "epoch": 2.5725725725725725, + "grad_norm": 0.30693032599748127, + "learning_rate": 6.0542984085424915e-06, + "loss": 2.6483, + "step": 55256 + }, + { + "epoch": 2.5726191307586657, + "grad_norm": 0.303970316801092, + "learning_rate": 6.053006459290478e-06, + "loss": 2.5862, + "step": 55257 + }, + { + "epoch": 2.5726656889447588, + "grad_norm": 0.2988742456649547, + "learning_rate": 6.051714639018585e-06, + "loss": 2.5579, + "step": 55258 + }, + { + "epoch": 2.572712247130852, + "grad_norm": 0.3125114745867169, + "learning_rate": 6.050422947730572e-06, + "loss": 2.5411, + "step": 55259 + }, + { + "epoch": 2.572758805316945, + "grad_norm": 0.28920508400087297, + "learning_rate": 6.049131385430234e-06, + "loss": 2.5996, + "step": 55260 + }, + { + "epoch": 2.572805363503038, + "grad_norm": 0.2916541319808909, + "learning_rate": 6.047839952121365e-06, + "loss": 2.5777, + "step": 55261 + }, + { + "epoch": 2.5728519216891312, + "grad_norm": 0.3104210614483694, + "learning_rate": 6.04654864780777e-06, + "loss": 2.5941, + "step": 55262 + }, + { + "epoch": 2.572898479875224, + "grad_norm": 0.3190716083495666, + "learning_rate": 6.045257472493199e-06, + "loss": 2.6799, + "step": 55263 + }, + { + "epoch": 2.572945038061317, + "grad_norm": 0.29268411825831475, + "learning_rate": 6.043966426181491e-06, + "loss": 2.5838, + "step": 55264 + }, + { + "epoch": 2.57299159624741, + "grad_norm": 0.31772139386215437, + "learning_rate": 6.0426755088764e-06, + "loss": 2.6366, + "step": 55265 + }, + { + "epoch": 2.5730381544335033, + "grad_norm": 0.3380568011267628, + "learning_rate": 6.041384720581728e-06, + "loss": 2.6878, + "step": 55266 + }, + { + "epoch": 2.5730847126195964, + "grad_norm": 0.3117983057062654, + "learning_rate": 6.040094061301254e-06, + "loss": 2.6502, + "step": 55267 + }, + { + "epoch": 2.5731312708056895, + "grad_norm": 0.2845011320696828, + "learning_rate": 6.038803531038784e-06, + "loss": 2.6599, + "step": 55268 + }, + { + "epoch": 2.5731778289917826, + "grad_norm": 0.30881852862857223, + "learning_rate": 6.037513129798095e-06, + "loss": 2.5837, + "step": 55269 + }, + { + "epoch": 2.5732243871778757, + "grad_norm": 0.3268576677599072, + "learning_rate": 6.036222857582957e-06, + "loss": 2.7309, + "step": 55270 + }, + { + "epoch": 2.5732709453639684, + "grad_norm": 0.2875317651181141, + "learning_rate": 6.034932714397195e-06, + "loss": 2.6224, + "step": 55271 + }, + { + "epoch": 
2.5733175035500615, + "grad_norm": 0.31339429192130375, + "learning_rate": 6.0336427002445685e-06, + "loss": 2.6942, + "step": 55272 + }, + { + "epoch": 2.5733640617361546, + "grad_norm": 0.3151492827758672, + "learning_rate": 6.032352815128861e-06, + "loss": 2.6575, + "step": 55273 + }, + { + "epoch": 2.5734106199222477, + "grad_norm": 0.2883589154785855, + "learning_rate": 6.031063059053871e-06, + "loss": 2.6907, + "step": 55274 + }, + { + "epoch": 2.573457178108341, + "grad_norm": 0.3072555542015994, + "learning_rate": 6.029773432023378e-06, + "loss": 2.6374, + "step": 55275 + }, + { + "epoch": 2.573503736294434, + "grad_norm": 0.3114746125636915, + "learning_rate": 6.028483934041168e-06, + "loss": 2.5703, + "step": 55276 + }, + { + "epoch": 2.573550294480527, + "grad_norm": 0.33997324410670515, + "learning_rate": 6.0271945651110386e-06, + "loss": 2.6447, + "step": 55277 + }, + { + "epoch": 2.57359685266662, + "grad_norm": 0.29613218892437543, + "learning_rate": 6.025905325236747e-06, + "loss": 2.5743, + "step": 55278 + }, + { + "epoch": 2.5736434108527133, + "grad_norm": 0.3115321733359922, + "learning_rate": 6.0246162144220965e-06, + "loss": 2.7253, + "step": 55279 + }, + { + "epoch": 2.5736899690388064, + "grad_norm": 0.3112214866518542, + "learning_rate": 6.023327232670861e-06, + "loss": 2.6169, + "step": 55280 + }, + { + "epoch": 2.5737365272248995, + "grad_norm": 0.31064725422066664, + "learning_rate": 6.022038379986828e-06, + "loss": 2.5793, + "step": 55281 + }, + { + "epoch": 2.5737830854109927, + "grad_norm": 0.29056345327744076, + "learning_rate": 6.020749656373792e-06, + "loss": 2.603, + "step": 55282 + }, + { + "epoch": 2.5738296435970853, + "grad_norm": 0.30063908153342317, + "learning_rate": 6.019461061835496e-06, + "loss": 2.6349, + "step": 55283 + }, + { + "epoch": 2.5738762017831784, + "grad_norm": 0.31194482349264885, + "learning_rate": 6.018172596375776e-06, + "loss": 2.5862, + "step": 55284 + }, + { + "epoch": 2.5739227599692716, + "grad_norm": 0.3082299086713006, + "learning_rate": 6.016884259998373e-06, + "loss": 2.5699, + "step": 55285 + }, + { + "epoch": 2.5739693181553647, + "grad_norm": 0.2808548438444095, + "learning_rate": 6.015596052707085e-06, + "loss": 2.5144, + "step": 55286 + }, + { + "epoch": 2.574015876341458, + "grad_norm": 0.30495185757918797, + "learning_rate": 6.014307974505689e-06, + "loss": 2.6354, + "step": 55287 + }, + { + "epoch": 2.574062434527551, + "grad_norm": 0.30162347244420895, + "learning_rate": 6.013020025397964e-06, + "loss": 2.621, + "step": 55288 + }, + { + "epoch": 2.574108992713644, + "grad_norm": 0.2990190992081833, + "learning_rate": 6.011732205387693e-06, + "loss": 2.5171, + "step": 55289 + }, + { + "epoch": 2.5741555508997367, + "grad_norm": 0.3386682165633679, + "learning_rate": 6.010444514478664e-06, + "loss": 2.5447, + "step": 55290 + }, + { + "epoch": 2.57420210908583, + "grad_norm": 0.29450510796756385, + "learning_rate": 6.009156952674633e-06, + "loss": 2.6446, + "step": 55291 + }, + { + "epoch": 2.574248667271923, + "grad_norm": 0.32449448722282215, + "learning_rate": 6.007869519979398e-06, + "loss": 2.7022, + "step": 55292 + }, + { + "epoch": 2.574295225458016, + "grad_norm": 0.3100471561306936, + "learning_rate": 6.0065822163967275e-06, + "loss": 2.6445, + "step": 55293 + }, + { + "epoch": 2.574341783644109, + "grad_norm": 0.29984194906248685, + "learning_rate": 6.005295041930409e-06, + "loss": 2.6863, + "step": 55294 + }, + { + "epoch": 2.5743883418302023, + "grad_norm": 0.32576606861892343, + "learning_rate": 
6.004007996584221e-06, + "loss": 2.7183, + "step": 55295 + }, + { + "epoch": 2.5744349000162954, + "grad_norm": 0.29157560364170027, + "learning_rate": 6.002721080361917e-06, + "loss": 2.6719, + "step": 55296 + }, + { + "epoch": 2.5744814582023885, + "grad_norm": 0.2991759739683212, + "learning_rate": 6.0014342932673164e-06, + "loss": 2.6104, + "step": 55297 + }, + { + "epoch": 2.5745280163884816, + "grad_norm": 0.29703256883302975, + "learning_rate": 6.000147635304154e-06, + "loss": 2.6068, + "step": 55298 + }, + { + "epoch": 2.5745745745745747, + "grad_norm": 0.3005802904721143, + "learning_rate": 5.998861106476228e-06, + "loss": 2.6615, + "step": 55299 + }, + { + "epoch": 2.574621132760668, + "grad_norm": 0.30427907392231834, + "learning_rate": 5.9975747067873064e-06, + "loss": 2.7335, + "step": 55300 + }, + { + "epoch": 2.574667690946761, + "grad_norm": 0.2992390717468405, + "learning_rate": 5.996288436241171e-06, + "loss": 2.5955, + "step": 55301 + }, + { + "epoch": 2.5747142491328536, + "grad_norm": 0.29291292203496105, + "learning_rate": 5.995002294841595e-06, + "loss": 2.6568, + "step": 55302 + }, + { + "epoch": 2.5747608073189467, + "grad_norm": 0.3043317749452495, + "learning_rate": 5.99371628259236e-06, + "loss": 2.6372, + "step": 55303 + }, + { + "epoch": 2.57480736550504, + "grad_norm": 0.3164876537278225, + "learning_rate": 5.992430399497223e-06, + "loss": 2.697, + "step": 55304 + }, + { + "epoch": 2.574853923691133, + "grad_norm": 0.31490229139449, + "learning_rate": 5.991144645559965e-06, + "loss": 2.6056, + "step": 55305 + }, + { + "epoch": 2.574900481877226, + "grad_norm": 0.30770518300230193, + "learning_rate": 5.989859020784366e-06, + "loss": 2.7422, + "step": 55306 + }, + { + "epoch": 2.574947040063319, + "grad_norm": 0.29043623374247207, + "learning_rate": 5.988573525174196e-06, + "loss": 2.5302, + "step": 55307 + }, + { + "epoch": 2.5749935982494123, + "grad_norm": 0.3020184549627381, + "learning_rate": 5.987288158733234e-06, + "loss": 2.6094, + "step": 55308 + }, + { + "epoch": 2.5750401564355054, + "grad_norm": 0.3013199294649323, + "learning_rate": 5.9860029214652225e-06, + "loss": 2.6161, + "step": 55309 + }, + { + "epoch": 2.575086714621598, + "grad_norm": 0.29130987093820315, + "learning_rate": 5.98471781337398e-06, + "loss": 2.5108, + "step": 55310 + }, + { + "epoch": 2.575133272807691, + "grad_norm": 0.30278252758777735, + "learning_rate": 5.983432834463232e-06, + "loss": 2.746, + "step": 55311 + }, + { + "epoch": 2.5751798309937843, + "grad_norm": 0.3166772729866295, + "learning_rate": 5.982147984736797e-06, + "loss": 2.5839, + "step": 55312 + }, + { + "epoch": 2.5752263891798775, + "grad_norm": 0.2892328279253085, + "learning_rate": 5.980863264198405e-06, + "loss": 2.6779, + "step": 55313 + }, + { + "epoch": 2.5752729473659706, + "grad_norm": 0.30352012242202314, + "learning_rate": 5.979578672851843e-06, + "loss": 2.5985, + "step": 55314 + }, + { + "epoch": 2.5753195055520637, + "grad_norm": 0.29476166280100086, + "learning_rate": 5.978294210700886e-06, + "loss": 2.6048, + "step": 55315 + }, + { + "epoch": 2.575366063738157, + "grad_norm": 0.296252545109391, + "learning_rate": 5.977009877749306e-06, + "loss": 2.5178, + "step": 55316 + }, + { + "epoch": 2.57541262192425, + "grad_norm": 0.30287749327323343, + "learning_rate": 5.975725674000854e-06, + "loss": 2.5451, + "step": 55317 + }, + { + "epoch": 2.575459180110343, + "grad_norm": 0.32341606357401764, + "learning_rate": 5.974441599459313e-06, + "loss": 2.6353, + "step": 55318 + }, + { + "epoch": 
2.575505738296436, + "grad_norm": 0.3042980210518417, + "learning_rate": 5.973157654128447e-06, + "loss": 2.6262, + "step": 55319 + }, + { + "epoch": 2.5755522964825293, + "grad_norm": 0.31066415500044037, + "learning_rate": 5.971873838012027e-06, + "loss": 2.6527, + "step": 55320 + }, + { + "epoch": 2.5755988546686224, + "grad_norm": 0.3088649000202471, + "learning_rate": 5.970590151113831e-06, + "loss": 2.6137, + "step": 55321 + }, + { + "epoch": 2.575645412854715, + "grad_norm": 0.3105650612815836, + "learning_rate": 5.96930659343759e-06, + "loss": 2.6355, + "step": 55322 + }, + { + "epoch": 2.575691971040808, + "grad_norm": 0.3324919717460959, + "learning_rate": 5.968023164987119e-06, + "loss": 2.7088, + "step": 55323 + }, + { + "epoch": 2.5757385292269013, + "grad_norm": 0.30088786300260806, + "learning_rate": 5.966739865766141e-06, + "loss": 2.6653, + "step": 55324 + }, + { + "epoch": 2.5757850874129944, + "grad_norm": 0.3024301417048661, + "learning_rate": 5.965456695778465e-06, + "loss": 2.6417, + "step": 55325 + }, + { + "epoch": 2.5758316455990875, + "grad_norm": 0.29529871343371794, + "learning_rate": 5.964173655027827e-06, + "loss": 2.6368, + "step": 55326 + }, + { + "epoch": 2.5758782037851806, + "grad_norm": 0.2894642682480551, + "learning_rate": 5.962890743518001e-06, + "loss": 2.651, + "step": 55327 + }, + { + "epoch": 2.5759247619712737, + "grad_norm": 0.2926449019218935, + "learning_rate": 5.961607961252746e-06, + "loss": 2.602, + "step": 55328 + }, + { + "epoch": 2.5759713201573664, + "grad_norm": 0.2912916060263107, + "learning_rate": 5.960325308235842e-06, + "loss": 2.5807, + "step": 55329 + }, + { + "epoch": 2.5760178783434595, + "grad_norm": 0.2911833454080011, + "learning_rate": 5.959042784471053e-06, + "loss": 2.6203, + "step": 55330 + }, + { + "epoch": 2.5760644365295526, + "grad_norm": 0.31177763778945505, + "learning_rate": 5.9577603899621195e-06, + "loss": 2.6979, + "step": 55331 + }, + { + "epoch": 2.5761109947156458, + "grad_norm": 0.30963840020287825, + "learning_rate": 5.956478124712822e-06, + "loss": 2.657, + "step": 55332 + }, + { + "epoch": 2.576157552901739, + "grad_norm": 0.32355863907279764, + "learning_rate": 5.955195988726925e-06, + "loss": 2.6378, + "step": 55333 + }, + { + "epoch": 2.576204111087832, + "grad_norm": 0.3104031720515154, + "learning_rate": 5.953913982008197e-06, + "loss": 2.5829, + "step": 55334 + }, + { + "epoch": 2.576250669273925, + "grad_norm": 0.30185936091585075, + "learning_rate": 5.952632104560368e-06, + "loss": 2.601, + "step": 55335 + }, + { + "epoch": 2.576297227460018, + "grad_norm": 0.30699639785112665, + "learning_rate": 5.951350356387253e-06, + "loss": 2.7234, + "step": 55336 + }, + { + "epoch": 2.5763437856461113, + "grad_norm": 0.3071916326542719, + "learning_rate": 5.950068737492559e-06, + "loss": 2.6661, + "step": 55337 + }, + { + "epoch": 2.5763903438322044, + "grad_norm": 0.3025502696995899, + "learning_rate": 5.948787247880094e-06, + "loss": 2.5334, + "step": 55338 + }, + { + "epoch": 2.5764369020182976, + "grad_norm": 0.302188397377369, + "learning_rate": 5.9475058875535895e-06, + "loss": 2.5926, + "step": 55339 + }, + { + "epoch": 2.5764834602043907, + "grad_norm": 0.30510816785344425, + "learning_rate": 5.946224656516819e-06, + "loss": 2.5361, + "step": 55340 + }, + { + "epoch": 2.5765300183904833, + "grad_norm": 0.31068448045046276, + "learning_rate": 5.944943554773536e-06, + "loss": 2.6228, + "step": 55341 + }, + { + "epoch": 2.5765765765765765, + "grad_norm": 0.33138605893307754, + "learning_rate": 
5.943662582327503e-06, + "loss": 2.7192, + "step": 55342 + }, + { + "epoch": 2.5766231347626696, + "grad_norm": 0.3169726058020712, + "learning_rate": 5.94238173918249e-06, + "loss": 2.5662, + "step": 55343 + }, + { + "epoch": 2.5766696929487627, + "grad_norm": 0.28423381982859064, + "learning_rate": 5.941101025342238e-06, + "loss": 2.6679, + "step": 55344 + }, + { + "epoch": 2.576716251134856, + "grad_norm": 0.29350889370404826, + "learning_rate": 5.939820440810512e-06, + "loss": 2.6104, + "step": 55345 + }, + { + "epoch": 2.576762809320949, + "grad_norm": 0.2962794713212956, + "learning_rate": 5.938539985591074e-06, + "loss": 2.5904, + "step": 55346 + }, + { + "epoch": 2.576809367507042, + "grad_norm": 0.3071737565552856, + "learning_rate": 5.9372596596876774e-06, + "loss": 2.6615, + "step": 55347 + }, + { + "epoch": 2.576855925693135, + "grad_norm": 0.3127099018618195, + "learning_rate": 5.935979463104085e-06, + "loss": 2.6012, + "step": 55348 + }, + { + "epoch": 2.576902483879228, + "grad_norm": 0.2965085246959139, + "learning_rate": 5.934699395844062e-06, + "loss": 2.6557, + "step": 55349 + }, + { + "epoch": 2.576949042065321, + "grad_norm": 0.3069731461509511, + "learning_rate": 5.9334194579113325e-06, + "loss": 2.5832, + "step": 55350 + }, + { + "epoch": 2.576995600251414, + "grad_norm": 0.28680527244824583, + "learning_rate": 5.932139649309698e-06, + "loss": 2.5893, + "step": 55351 + }, + { + "epoch": 2.577042158437507, + "grad_norm": 0.30027240613772155, + "learning_rate": 5.930859970042885e-06, + "loss": 2.5957, + "step": 55352 + }, + { + "epoch": 2.5770887166236003, + "grad_norm": 0.30748469267158085, + "learning_rate": 5.92958042011465e-06, + "loss": 2.6412, + "step": 55353 + }, + { + "epoch": 2.5771352748096934, + "grad_norm": 0.30634776906476563, + "learning_rate": 5.928300999528757e-06, + "loss": 2.5348, + "step": 55354 + }, + { + "epoch": 2.5771818329957865, + "grad_norm": 0.3040908838165256, + "learning_rate": 5.927021708288966e-06, + "loss": 2.5607, + "step": 55355 + }, + { + "epoch": 2.5772283911818796, + "grad_norm": 0.3078131475460121, + "learning_rate": 5.925742546399027e-06, + "loss": 2.5495, + "step": 55356 + }, + { + "epoch": 2.5772749493679727, + "grad_norm": 0.3155922230506236, + "learning_rate": 5.924463513862683e-06, + "loss": 2.6156, + "step": 55357 + }, + { + "epoch": 2.577321507554066, + "grad_norm": 0.3147143996124699, + "learning_rate": 5.923184610683696e-06, + "loss": 2.6332, + "step": 55358 + }, + { + "epoch": 2.577368065740159, + "grad_norm": 0.30553936523768616, + "learning_rate": 5.9219058368658206e-06, + "loss": 2.526, + "step": 55359 + }, + { + "epoch": 2.577414623926252, + "grad_norm": 0.29745314716293797, + "learning_rate": 5.920627192412808e-06, + "loss": 2.6481, + "step": 55360 + }, + { + "epoch": 2.5774611821123448, + "grad_norm": 0.29968193604486754, + "learning_rate": 5.919348677328412e-06, + "loss": 2.6187, + "step": 55361 + }, + { + "epoch": 2.577507740298438, + "grad_norm": 0.3014074523817639, + "learning_rate": 5.918070291616395e-06, + "loss": 2.6763, + "step": 55362 + }, + { + "epoch": 2.577554298484531, + "grad_norm": 0.30868837827582385, + "learning_rate": 5.916792035280483e-06, + "loss": 2.679, + "step": 55363 + }, + { + "epoch": 2.577600856670624, + "grad_norm": 0.30790234242704756, + "learning_rate": 5.915513908324461e-06, + "loss": 2.5022, + "step": 55364 + }, + { + "epoch": 2.5776474148567172, + "grad_norm": 0.30343348300385675, + "learning_rate": 5.914235910752042e-06, + "loss": 2.6433, + "step": 55365 + }, + { + "epoch": 
2.5776939730428103, + "grad_norm": 0.30922935974893473, + "learning_rate": 5.912958042567018e-06, + "loss": 2.642, + "step": 55366 + }, + { + "epoch": 2.5777405312289035, + "grad_norm": 0.3132990391453844, + "learning_rate": 5.911680303773104e-06, + "loss": 2.595, + "step": 55367 + }, + { + "epoch": 2.577787089414996, + "grad_norm": 0.29828982937820275, + "learning_rate": 5.9104026943740735e-06, + "loss": 2.5737, + "step": 55368 + }, + { + "epoch": 2.5778336476010892, + "grad_norm": 0.3015044950100214, + "learning_rate": 5.909125214373673e-06, + "loss": 2.656, + "step": 55369 + }, + { + "epoch": 2.5778802057871824, + "grad_norm": 0.2944021597172432, + "learning_rate": 5.907847863775634e-06, + "loss": 2.6321, + "step": 55370 + }, + { + "epoch": 2.5779267639732755, + "grad_norm": 0.30969237142292183, + "learning_rate": 5.9065706425837196e-06, + "loss": 2.7294, + "step": 55371 + }, + { + "epoch": 2.5779733221593686, + "grad_norm": 0.3079677506279097, + "learning_rate": 5.9052935508016774e-06, + "loss": 2.5708, + "step": 55372 + }, + { + "epoch": 2.5780198803454617, + "grad_norm": 0.30550314981423804, + "learning_rate": 5.904016588433253e-06, + "loss": 2.6103, + "step": 55373 + }, + { + "epoch": 2.578066438531555, + "grad_norm": 0.3068962953498037, + "learning_rate": 5.9027397554822e-06, + "loss": 2.6212, + "step": 55374 + }, + { + "epoch": 2.578112996717648, + "grad_norm": 0.3309685926594532, + "learning_rate": 5.901463051952266e-06, + "loss": 2.6078, + "step": 55375 + }, + { + "epoch": 2.578159554903741, + "grad_norm": 0.3074843620556775, + "learning_rate": 5.900186477847175e-06, + "loss": 2.5403, + "step": 55376 + }, + { + "epoch": 2.578206113089834, + "grad_norm": 0.30303998814515964, + "learning_rate": 5.8989100331707185e-06, + "loss": 2.7624, + "step": 55377 + }, + { + "epoch": 2.5782526712759273, + "grad_norm": 0.3034369122985782, + "learning_rate": 5.8976337179265874e-06, + "loss": 2.5937, + "step": 55378 + }, + { + "epoch": 2.5782992294620204, + "grad_norm": 0.31056784877684, + "learning_rate": 5.896357532118579e-06, + "loss": 2.6699, + "step": 55379 + }, + { + "epoch": 2.578345787648113, + "grad_norm": 0.29759615551078344, + "learning_rate": 5.895081475750408e-06, + "loss": 2.512, + "step": 55380 + }, + { + "epoch": 2.578392345834206, + "grad_norm": 0.3265573081876355, + "learning_rate": 5.893805548825826e-06, + "loss": 2.6412, + "step": 55381 + }, + { + "epoch": 2.5784389040202993, + "grad_norm": 0.29510601621272653, + "learning_rate": 5.892529751348591e-06, + "loss": 2.5889, + "step": 55382 + }, + { + "epoch": 2.5784854622063924, + "grad_norm": 0.29349597090857676, + "learning_rate": 5.891254083322417e-06, + "loss": 2.5697, + "step": 55383 + }, + { + "epoch": 2.5785320203924855, + "grad_norm": 0.30663861179783025, + "learning_rate": 5.889978544751085e-06, + "loss": 2.5997, + "step": 55384 + }, + { + "epoch": 2.5785785785785786, + "grad_norm": 0.30415335833998236, + "learning_rate": 5.888703135638312e-06, + "loss": 2.6258, + "step": 55385 + }, + { + "epoch": 2.5786251367646718, + "grad_norm": 0.31160455529705183, + "learning_rate": 5.8874278559878485e-06, + "loss": 2.6857, + "step": 55386 + }, + { + "epoch": 2.578671694950765, + "grad_norm": 0.30451386498714644, + "learning_rate": 5.886152705803438e-06, + "loss": 2.6628, + "step": 55387 + }, + { + "epoch": 2.5787182531368575, + "grad_norm": 0.29696522569158695, + "learning_rate": 5.8848776850888356e-06, + "loss": 2.6254, + "step": 55388 + }, + { + "epoch": 2.5787648113229507, + "grad_norm": 0.3172652026909623, + "learning_rate": 
5.883602793847748e-06, + "loss": 2.5169, + "step": 55389 + }, + { + "epoch": 2.5788113695090438, + "grad_norm": 0.313570231462595, + "learning_rate": 5.882328032083961e-06, + "loss": 2.6424, + "step": 55390 + }, + { + "epoch": 2.578857927695137, + "grad_norm": 0.2841085420259454, + "learning_rate": 5.881053399801173e-06, + "loss": 2.5154, + "step": 55391 + }, + { + "epoch": 2.57890448588123, + "grad_norm": 0.3094302918960909, + "learning_rate": 5.879778897003174e-06, + "loss": 2.6064, + "step": 55392 + }, + { + "epoch": 2.578951044067323, + "grad_norm": 0.3251772298584029, + "learning_rate": 5.878504523693662e-06, + "loss": 2.6719, + "step": 55393 + }, + { + "epoch": 2.5789976022534162, + "grad_norm": 0.3022398129024953, + "learning_rate": 5.877230279876389e-06, + "loss": 2.6494, + "step": 55394 + }, + { + "epoch": 2.5790441604395093, + "grad_norm": 0.29162727491907775, + "learning_rate": 5.8759561655551135e-06, + "loss": 2.6297, + "step": 55395 + }, + { + "epoch": 2.5790907186256025, + "grad_norm": 0.2995870970891164, + "learning_rate": 5.874682180733537e-06, + "loss": 2.6169, + "step": 55396 + }, + { + "epoch": 2.5791372768116956, + "grad_norm": 0.3155758028446879, + "learning_rate": 5.873408325415447e-06, + "loss": 2.688, + "step": 55397 + }, + { + "epoch": 2.5791838349977887, + "grad_norm": 0.30378426995005486, + "learning_rate": 5.87213459960454e-06, + "loss": 2.5674, + "step": 55398 + }, + { + "epoch": 2.579230393183882, + "grad_norm": 0.2924750311164143, + "learning_rate": 5.870861003304573e-06, + "loss": 2.6154, + "step": 55399 + }, + { + "epoch": 2.5792769513699745, + "grad_norm": 0.3066452033411457, + "learning_rate": 5.869587536519283e-06, + "loss": 2.5429, + "step": 55400 + }, + { + "epoch": 2.5793235095560676, + "grad_norm": 0.29620650648276875, + "learning_rate": 5.868314199252406e-06, + "loss": 2.6873, + "step": 55401 + }, + { + "epoch": 2.5793700677421607, + "grad_norm": 0.28725532220295463, + "learning_rate": 5.867040991507678e-06, + "loss": 2.5676, + "step": 55402 + }, + { + "epoch": 2.579416625928254, + "grad_norm": 0.3207390713729158, + "learning_rate": 5.865767913288844e-06, + "loss": 2.6386, + "step": 55403 + }, + { + "epoch": 2.579463184114347, + "grad_norm": 0.30098771276283637, + "learning_rate": 5.864494964599615e-06, + "loss": 2.5965, + "step": 55404 + }, + { + "epoch": 2.57950974230044, + "grad_norm": 0.3188277295707598, + "learning_rate": 5.863222145443764e-06, + "loss": 2.6719, + "step": 55405 + }, + { + "epoch": 2.579556300486533, + "grad_norm": 0.32409961840727, + "learning_rate": 5.861949455825e-06, + "loss": 2.7178, + "step": 55406 + }, + { + "epoch": 2.579602858672626, + "grad_norm": 0.29078914268554373, + "learning_rate": 5.860676895747069e-06, + "loss": 2.6, + "step": 55407 + }, + { + "epoch": 2.579649416858719, + "grad_norm": 0.3063677940729326, + "learning_rate": 5.859404465213708e-06, + "loss": 2.6108, + "step": 55408 + }, + { + "epoch": 2.579695975044812, + "grad_norm": 0.29817047124847257, + "learning_rate": 5.8581321642286244e-06, + "loss": 2.6429, + "step": 55409 + }, + { + "epoch": 2.579742533230905, + "grad_norm": 0.31150397866754315, + "learning_rate": 5.856859992795599e-06, + "loss": 2.5855, + "step": 55410 + }, + { + "epoch": 2.5797890914169983, + "grad_norm": 0.30866540214710597, + "learning_rate": 5.8555879509183286e-06, + "loss": 2.6859, + "step": 55411 + }, + { + "epoch": 2.5798356496030914, + "grad_norm": 0.3145438593745432, + "learning_rate": 5.85431603860056e-06, + "loss": 2.6108, + "step": 55412 + }, + { + "epoch": 
2.5798822077891845, + "grad_norm": 0.32254335033921677, + "learning_rate": 5.853044255846024e-06, + "loss": 2.5846, + "step": 55413 + }, + { + "epoch": 2.5799287659752776, + "grad_norm": 0.30883288338067155, + "learning_rate": 5.851772602658456e-06, + "loss": 2.692, + "step": 55414 + }, + { + "epoch": 2.5799753241613708, + "grad_norm": 0.30501186023040106, + "learning_rate": 5.850501079041581e-06, + "loss": 2.6878, + "step": 55415 + }, + { + "epoch": 2.580021882347464, + "grad_norm": 0.31159271494715224, + "learning_rate": 5.849229684999152e-06, + "loss": 2.6226, + "step": 55416 + }, + { + "epoch": 2.580068440533557, + "grad_norm": 0.30430658351367484, + "learning_rate": 5.84795842053486e-06, + "loss": 2.5591, + "step": 55417 + }, + { + "epoch": 2.58011499871965, + "grad_norm": 0.30111872411716145, + "learning_rate": 5.84668728565248e-06, + "loss": 2.6001, + "step": 55418 + }, + { + "epoch": 2.5801615569057432, + "grad_norm": 0.30587916318660746, + "learning_rate": 5.845416280355715e-06, + "loss": 2.5734, + "step": 55419 + }, + { + "epoch": 2.580208115091836, + "grad_norm": 0.3119687314033453, + "learning_rate": 5.844145404648304e-06, + "loss": 2.7018, + "step": 55420 + }, + { + "epoch": 2.580254673277929, + "grad_norm": 0.3087661557746497, + "learning_rate": 5.842874658533986e-06, + "loss": 2.6216, + "step": 55421 + }, + { + "epoch": 2.580301231464022, + "grad_norm": 0.3091852155819768, + "learning_rate": 5.841604042016457e-06, + "loss": 2.5752, + "step": 55422 + }, + { + "epoch": 2.5803477896501152, + "grad_norm": 0.33480615099997707, + "learning_rate": 5.840333555099492e-06, + "loss": 2.5809, + "step": 55423 + }, + { + "epoch": 2.5803943478362084, + "grad_norm": 0.31620775338772855, + "learning_rate": 5.839063197786782e-06, + "loss": 2.7868, + "step": 55424 + }, + { + "epoch": 2.5804409060223015, + "grad_norm": 0.3113546899123481, + "learning_rate": 5.837792970082084e-06, + "loss": 2.6134, + "step": 55425 + }, + { + "epoch": 2.5804874642083946, + "grad_norm": 0.31519625775340054, + "learning_rate": 5.836522871989109e-06, + "loss": 2.5715, + "step": 55426 + }, + { + "epoch": 2.5805340223944873, + "grad_norm": 0.35259790217090337, + "learning_rate": 5.835252903511584e-06, + "loss": 2.7086, + "step": 55427 + }, + { + "epoch": 2.5805805805805804, + "grad_norm": 0.3372350729120137, + "learning_rate": 5.833983064653242e-06, + "loss": 2.6442, + "step": 55428 + }, + { + "epoch": 2.5806271387666735, + "grad_norm": 0.30048238502613256, + "learning_rate": 5.832713355417818e-06, + "loss": 2.6442, + "step": 55429 + }, + { + "epoch": 2.5806736969527666, + "grad_norm": 0.3271401377142589, + "learning_rate": 5.831443775809009e-06, + "loss": 2.6488, + "step": 55430 + }, + { + "epoch": 2.5807202551388597, + "grad_norm": 0.30375659808441713, + "learning_rate": 5.830174325830584e-06, + "loss": 2.609, + "step": 55431 + }, + { + "epoch": 2.580766813324953, + "grad_norm": 0.30931332650525006, + "learning_rate": 5.828905005486235e-06, + "loss": 2.5237, + "step": 55432 + }, + { + "epoch": 2.580813371511046, + "grad_norm": 0.3177394582012579, + "learning_rate": 5.827635814779697e-06, + "loss": 2.7118, + "step": 55433 + }, + { + "epoch": 2.580859929697139, + "grad_norm": 0.3118808143025329, + "learning_rate": 5.826366753714707e-06, + "loss": 2.5655, + "step": 55434 + }, + { + "epoch": 2.580906487883232, + "grad_norm": 0.31206247962638095, + "learning_rate": 5.825097822294956e-06, + "loss": 2.6841, + "step": 55435 + }, + { + "epoch": 2.5809530460693253, + "grad_norm": 0.2972145974659652, + "learning_rate": 
5.823829020524213e-06, + "loss": 2.5864, + "step": 55436 + }, + { + "epoch": 2.5809996042554184, + "grad_norm": 0.3097670878755746, + "learning_rate": 5.822560348406159e-06, + "loss": 2.5852, + "step": 55437 + }, + { + "epoch": 2.5810461624415115, + "grad_norm": 0.2934490143284675, + "learning_rate": 5.821291805944562e-06, + "loss": 2.5616, + "step": 55438 + }, + { + "epoch": 2.581092720627604, + "grad_norm": 0.3055568640373545, + "learning_rate": 5.820023393143109e-06, + "loss": 2.763, + "step": 55439 + }, + { + "epoch": 2.5811392788136973, + "grad_norm": 0.3063958653542186, + "learning_rate": 5.81875511000553e-06, + "loss": 2.5743, + "step": 55440 + }, + { + "epoch": 2.5811858369997904, + "grad_norm": 0.3072623250676045, + "learning_rate": 5.8174869565355615e-06, + "loss": 2.6579, + "step": 55441 + }, + { + "epoch": 2.5812323951858835, + "grad_norm": 0.30270894412158755, + "learning_rate": 5.816218932736911e-06, + "loss": 2.6092, + "step": 55442 + }, + { + "epoch": 2.5812789533719767, + "grad_norm": 0.3135156899923643, + "learning_rate": 5.814951038613303e-06, + "loss": 2.695, + "step": 55443 + }, + { + "epoch": 2.5813255115580698, + "grad_norm": 0.3046551990251144, + "learning_rate": 5.813683274168474e-06, + "loss": 2.5235, + "step": 55444 + }, + { + "epoch": 2.581372069744163, + "grad_norm": 0.30084470594328944, + "learning_rate": 5.812415639406121e-06, + "loss": 2.6934, + "step": 55445 + }, + { + "epoch": 2.5814186279302556, + "grad_norm": 0.2933743623276911, + "learning_rate": 5.811148134329975e-06, + "loss": 2.6926, + "step": 55446 + }, + { + "epoch": 2.5814651861163487, + "grad_norm": 0.30894218952789954, + "learning_rate": 5.809880758943764e-06, + "loss": 2.6852, + "step": 55447 + }, + { + "epoch": 2.581511744302442, + "grad_norm": 0.30243203051642237, + "learning_rate": 5.8086135132511766e-06, + "loss": 2.6627, + "step": 55448 + }, + { + "epoch": 2.581558302488535, + "grad_norm": 0.29314355406625686, + "learning_rate": 5.807346397255981e-06, + "loss": 2.6212, + "step": 55449 + }, + { + "epoch": 2.581604860674628, + "grad_norm": 0.3085697072058493, + "learning_rate": 5.8060794109618456e-06, + "loss": 2.68, + "step": 55450 + }, + { + "epoch": 2.581651418860721, + "grad_norm": 0.3146075692822188, + "learning_rate": 5.804812554372541e-06, + "loss": 2.7641, + "step": 55451 + }, + { + "epoch": 2.5816979770468143, + "grad_norm": 0.31083862530059625, + "learning_rate": 5.803545827491741e-06, + "loss": 2.7449, + "step": 55452 + }, + { + "epoch": 2.5817445352329074, + "grad_norm": 0.2902530286866895, + "learning_rate": 5.802279230323182e-06, + "loss": 2.6151, + "step": 55453 + }, + { + "epoch": 2.5817910934190005, + "grad_norm": 0.2913950287539469, + "learning_rate": 5.801012762870578e-06, + "loss": 2.6479, + "step": 55454 + }, + { + "epoch": 2.5818376516050936, + "grad_norm": 0.2737233227095825, + "learning_rate": 5.799746425137648e-06, + "loss": 2.6019, + "step": 55455 + }, + { + "epoch": 2.5818842097911867, + "grad_norm": 0.3200014527357096, + "learning_rate": 5.798480217128105e-06, + "loss": 2.6937, + "step": 55456 + }, + { + "epoch": 2.58193076797728, + "grad_norm": 0.30345274347203455, + "learning_rate": 5.79721413884568e-06, + "loss": 2.6447, + "step": 55457 + }, + { + "epoch": 2.581977326163373, + "grad_norm": 0.3103081899166645, + "learning_rate": 5.795948190294071e-06, + "loss": 2.5867, + "step": 55458 + }, + { + "epoch": 2.5820238843494656, + "grad_norm": 0.3094739203844719, + "learning_rate": 5.794682371476995e-06, + "loss": 2.6726, + "step": 55459 + }, + { + "epoch": 
2.5820704425355587, + "grad_norm": 0.3179110141053562, + "learning_rate": 5.793416682398167e-06, + "loss": 2.6958, + "step": 55460 + }, + { + "epoch": 2.582117000721652, + "grad_norm": 0.29549476096130656, + "learning_rate": 5.7921511230613125e-06, + "loss": 2.6369, + "step": 55461 + }, + { + "epoch": 2.582163558907745, + "grad_norm": 0.30311145945582696, + "learning_rate": 5.7908856934701494e-06, + "loss": 2.6825, + "step": 55462 + }, + { + "epoch": 2.582210117093838, + "grad_norm": 0.311383713081106, + "learning_rate": 5.789620393628359e-06, + "loss": 2.6127, + "step": 55463 + }, + { + "epoch": 2.582256675279931, + "grad_norm": 0.3140802013124518, + "learning_rate": 5.788355223539699e-06, + "loss": 2.7041, + "step": 55464 + }, + { + "epoch": 2.5823032334660243, + "grad_norm": 0.29763971313325355, + "learning_rate": 5.78709018320785e-06, + "loss": 2.5843, + "step": 55465 + }, + { + "epoch": 2.582349791652117, + "grad_norm": 0.31043967911917736, + "learning_rate": 5.785825272636536e-06, + "loss": 2.613, + "step": 55466 + }, + { + "epoch": 2.58239634983821, + "grad_norm": 0.3080628987872948, + "learning_rate": 5.784560491829466e-06, + "loss": 2.7174, + "step": 55467 + }, + { + "epoch": 2.582442908024303, + "grad_norm": 0.29964412980485944, + "learning_rate": 5.783295840790359e-06, + "loss": 2.67, + "step": 55468 + }, + { + "epoch": 2.5824894662103963, + "grad_norm": 0.2973779452610187, + "learning_rate": 5.782031319522918e-06, + "loss": 2.6845, + "step": 55469 + }, + { + "epoch": 2.5825360243964894, + "grad_norm": 0.29559502513355024, + "learning_rate": 5.780766928030873e-06, + "loss": 2.671, + "step": 55470 + }, + { + "epoch": 2.5825825825825826, + "grad_norm": 0.29371826070932344, + "learning_rate": 5.77950266631791e-06, + "loss": 2.6887, + "step": 55471 + }, + { + "epoch": 2.5826291407686757, + "grad_norm": 0.296268691209635, + "learning_rate": 5.778238534387753e-06, + "loss": 2.6537, + "step": 55472 + }, + { + "epoch": 2.582675698954769, + "grad_norm": 0.29808186310149065, + "learning_rate": 5.776974532244106e-06, + "loss": 2.6065, + "step": 55473 + }, + { + "epoch": 2.582722257140862, + "grad_norm": 0.30606019411656327, + "learning_rate": 5.775710659890682e-06, + "loss": 2.6397, + "step": 55474 + }, + { + "epoch": 2.582768815326955, + "grad_norm": 0.29564864762461046, + "learning_rate": 5.7744469173312e-06, + "loss": 2.6236, + "step": 55475 + }, + { + "epoch": 2.582815373513048, + "grad_norm": 0.3030570304207998, + "learning_rate": 5.7731833045693404e-06, + "loss": 2.7331, + "step": 55476 + }, + { + "epoch": 2.5828619316991412, + "grad_norm": 0.29932445073844455, + "learning_rate": 5.771919821608851e-06, + "loss": 2.6317, + "step": 55477 + }, + { + "epoch": 2.582908489885234, + "grad_norm": 0.2889887926949185, + "learning_rate": 5.7706564684533944e-06, + "loss": 2.5003, + "step": 55478 + }, + { + "epoch": 2.582955048071327, + "grad_norm": 0.3047645227568688, + "learning_rate": 5.76939324510673e-06, + "loss": 2.6123, + "step": 55479 + }, + { + "epoch": 2.58300160625742, + "grad_norm": 0.30959601932113384, + "learning_rate": 5.768130151572526e-06, + "loss": 2.6374, + "step": 55480 + }, + { + "epoch": 2.5830481644435133, + "grad_norm": 0.30178496713948116, + "learning_rate": 5.7668671878545075e-06, + "loss": 2.6406, + "step": 55481 + }, + { + "epoch": 2.5830947226296064, + "grad_norm": 0.2969177235812099, + "learning_rate": 5.765604353956372e-06, + "loss": 2.6222, + "step": 55482 + }, + { + "epoch": 2.5831412808156995, + "grad_norm": 0.3039240239220729, + "learning_rate": 
5.764341649881838e-06, + "loss": 2.652, + "step": 55483 + }, + { + "epoch": 2.5831878390017926, + "grad_norm": 0.3160534871958707, + "learning_rate": 5.7630790756345974e-06, + "loss": 2.6386, + "step": 55484 + }, + { + "epoch": 2.5832343971878857, + "grad_norm": 0.3039281423865665, + "learning_rate": 5.761816631218359e-06, + "loss": 2.6695, + "step": 55485 + }, + { + "epoch": 2.5832809553739784, + "grad_norm": 0.3040403201036851, + "learning_rate": 5.760554316636834e-06, + "loss": 2.5833, + "step": 55486 + }, + { + "epoch": 2.5833275135600715, + "grad_norm": 0.29981916947434967, + "learning_rate": 5.7592921318937235e-06, + "loss": 2.6494, + "step": 55487 + }, + { + "epoch": 2.5833740717461646, + "grad_norm": 0.3156665405365582, + "learning_rate": 5.758030076992738e-06, + "loss": 2.5677, + "step": 55488 + }, + { + "epoch": 2.5834206299322577, + "grad_norm": 0.3144506333197282, + "learning_rate": 5.756768151937559e-06, + "loss": 2.622, + "step": 55489 + }, + { + "epoch": 2.583467188118351, + "grad_norm": 0.3082761487250663, + "learning_rate": 5.755506356731921e-06, + "loss": 2.5538, + "step": 55490 + }, + { + "epoch": 2.583513746304444, + "grad_norm": 0.31023701355371897, + "learning_rate": 5.754244691379502e-06, + "loss": 2.691, + "step": 55491 + }, + { + "epoch": 2.583560304490537, + "grad_norm": 0.2985743539591971, + "learning_rate": 5.752983155884029e-06, + "loss": 2.6596, + "step": 55492 + }, + { + "epoch": 2.58360686267663, + "grad_norm": 0.32412094183153767, + "learning_rate": 5.751721750249184e-06, + "loss": 2.6251, + "step": 55493 + }, + { + "epoch": 2.5836534208627233, + "grad_norm": 0.33006861122595343, + "learning_rate": 5.750460474478675e-06, + "loss": 2.634, + "step": 55494 + }, + { + "epoch": 2.5836999790488164, + "grad_norm": 0.3009141345781506, + "learning_rate": 5.74919932857621e-06, + "loss": 2.6948, + "step": 55495 + }, + { + "epoch": 2.5837465372349095, + "grad_norm": 0.29832664999289127, + "learning_rate": 5.747938312545481e-06, + "loss": 2.7076, + "step": 55496 + }, + { + "epoch": 2.5837930954210027, + "grad_norm": 0.3332842283783399, + "learning_rate": 5.7466774263902014e-06, + "loss": 2.5843, + "step": 55497 + }, + { + "epoch": 2.5838396536070953, + "grad_norm": 0.34379317859733965, + "learning_rate": 5.745416670114057e-06, + "loss": 2.6336, + "step": 55498 + }, + { + "epoch": 2.5838862117931884, + "grad_norm": 0.3500957248050009, + "learning_rate": 5.7441560437207555e-06, + "loss": 2.6228, + "step": 55499 + }, + { + "epoch": 2.5839327699792816, + "grad_norm": 0.2863965557504646, + "learning_rate": 5.742895547213994e-06, + "loss": 2.5868, + "step": 55500 + }, + { + "epoch": 2.5839793281653747, + "grad_norm": 0.2861405235782575, + "learning_rate": 5.741635180597488e-06, + "loss": 2.6251, + "step": 55501 + }, + { + "epoch": 2.584025886351468, + "grad_norm": 0.31993652959135077, + "learning_rate": 5.740374943874899e-06, + "loss": 2.6144, + "step": 55502 + }, + { + "epoch": 2.584072444537561, + "grad_norm": 0.3256041933253337, + "learning_rate": 5.739114837049975e-06, + "loss": 2.6268, + "step": 55503 + }, + { + "epoch": 2.584119002723654, + "grad_norm": 0.3200819926596814, + "learning_rate": 5.737854860126363e-06, + "loss": 2.5477, + "step": 55504 + }, + { + "epoch": 2.5841655609097467, + "grad_norm": 0.29824338988042, + "learning_rate": 5.736595013107804e-06, + "loss": 2.5766, + "step": 55505 + }, + { + "epoch": 2.58421211909584, + "grad_norm": 0.31976565949440744, + "learning_rate": 5.735335295997973e-06, + "loss": 2.6702, + "step": 55506 + }, + { + "epoch": 
2.584258677281933, + "grad_norm": 0.31256811639663235, + "learning_rate": 5.734075708800568e-06, + "loss": 2.5597, + "step": 55507 + }, + { + "epoch": 2.584305235468026, + "grad_norm": 0.30592121242530923, + "learning_rate": 5.732816251519297e-06, + "loss": 2.6032, + "step": 55508 + }, + { + "epoch": 2.584351793654119, + "grad_norm": 0.30806573538492454, + "learning_rate": 5.731556924157844e-06, + "loss": 2.647, + "step": 55509 + }, + { + "epoch": 2.5843983518402123, + "grad_norm": 0.29995863829804664, + "learning_rate": 5.730297726719918e-06, + "loss": 2.6275, + "step": 55510 + }, + { + "epoch": 2.5844449100263054, + "grad_norm": 0.3316296421035061, + "learning_rate": 5.729038659209201e-06, + "loss": 2.6453, + "step": 55511 + }, + { + "epoch": 2.5844914682123985, + "grad_norm": 0.32386535190311266, + "learning_rate": 5.72777972162939e-06, + "loss": 2.6787, + "step": 55512 + }, + { + "epoch": 2.5845380263984916, + "grad_norm": 0.3024299787225423, + "learning_rate": 5.7265209139841905e-06, + "loss": 2.6576, + "step": 55513 + }, + { + "epoch": 2.5845845845845847, + "grad_norm": 0.3220910543881492, + "learning_rate": 5.7252622362772846e-06, + "loss": 2.6692, + "step": 55514 + }, + { + "epoch": 2.584631142770678, + "grad_norm": 0.28965859210593, + "learning_rate": 5.724003688512375e-06, + "loss": 2.5482, + "step": 55515 + }, + { + "epoch": 2.584677700956771, + "grad_norm": 0.3208092139179879, + "learning_rate": 5.722745270693164e-06, + "loss": 2.609, + "step": 55516 + }, + { + "epoch": 2.5847242591428636, + "grad_norm": 0.3120169129746257, + "learning_rate": 5.72148698282331e-06, + "loss": 2.6846, + "step": 55517 + }, + { + "epoch": 2.5847708173289567, + "grad_norm": 0.3090635694300869, + "learning_rate": 5.720228824906548e-06, + "loss": 2.6068, + "step": 55518 + }, + { + "epoch": 2.58481737551505, + "grad_norm": 0.30512407412213777, + "learning_rate": 5.718970796946549e-06, + "loss": 2.6657, + "step": 55519 + }, + { + "epoch": 2.584863933701143, + "grad_norm": 0.2979911970395764, + "learning_rate": 5.717712898947003e-06, + "loss": 2.6926, + "step": 55520 + }, + { + "epoch": 2.584910491887236, + "grad_norm": 0.3034892530274442, + "learning_rate": 5.716455130911608e-06, + "loss": 2.559, + "step": 55521 + }, + { + "epoch": 2.584957050073329, + "grad_norm": 0.30593189637226553, + "learning_rate": 5.7151974928440545e-06, + "loss": 2.6599, + "step": 55522 + }, + { + "epoch": 2.5850036082594223, + "grad_norm": 0.31018839219552663, + "learning_rate": 5.713939984748046e-06, + "loss": 2.6073, + "step": 55523 + }, + { + "epoch": 2.5850501664455154, + "grad_norm": 0.2912466260719446, + "learning_rate": 5.712682606627251e-06, + "loss": 2.6593, + "step": 55524 + }, + { + "epoch": 2.585096724631608, + "grad_norm": 0.2973865051596165, + "learning_rate": 5.711425358485368e-06, + "loss": 2.6227, + "step": 55525 + }, + { + "epoch": 2.5851432828177012, + "grad_norm": 0.3059996199703846, + "learning_rate": 5.710168240326086e-06, + "loss": 2.672, + "step": 55526 + }, + { + "epoch": 2.5851898410037943, + "grad_norm": 0.29170428881929006, + "learning_rate": 5.708911252153099e-06, + "loss": 2.6245, + "step": 55527 + }, + { + "epoch": 2.5852363991898875, + "grad_norm": 0.2941784299922728, + "learning_rate": 5.707654393970091e-06, + "loss": 2.6605, + "step": 55528 + }, + { + "epoch": 2.5852829573759806, + "grad_norm": 0.30671187001680483, + "learning_rate": 5.706397665780772e-06, + "loss": 2.5989, + "step": 55529 + }, + { + "epoch": 2.5853295155620737, + "grad_norm": 0.30637007292885376, + "learning_rate": 
5.705141067588782e-06, + "loss": 2.563, + "step": 55530 + }, + { + "epoch": 2.585376073748167, + "grad_norm": 0.29597437338515287, + "learning_rate": 5.703884599397869e-06, + "loss": 2.6109, + "step": 55531 + }, + { + "epoch": 2.58542263193426, + "grad_norm": 0.3253022825855338, + "learning_rate": 5.702628261211673e-06, + "loss": 2.6952, + "step": 55532 + }, + { + "epoch": 2.585469190120353, + "grad_norm": 0.2856285307374539, + "learning_rate": 5.7013720530339045e-06, + "loss": 2.5596, + "step": 55533 + }, + { + "epoch": 2.585515748306446, + "grad_norm": 0.31112881212039933, + "learning_rate": 5.700115974868242e-06, + "loss": 2.6724, + "step": 55534 + }, + { + "epoch": 2.5855623064925393, + "grad_norm": 0.28261545040223346, + "learning_rate": 5.698860026718372e-06, + "loss": 2.6027, + "step": 55535 + }, + { + "epoch": 2.5856088646786324, + "grad_norm": 0.30866652661145855, + "learning_rate": 5.697604208587998e-06, + "loss": 2.7486, + "step": 55536 + }, + { + "epoch": 2.585655422864725, + "grad_norm": 0.3082693325054623, + "learning_rate": 5.696348520480782e-06, + "loss": 2.6552, + "step": 55537 + }, + { + "epoch": 2.585701981050818, + "grad_norm": 0.2987434490602383, + "learning_rate": 5.695092962400417e-06, + "loss": 2.6533, + "step": 55538 + }, + { + "epoch": 2.5857485392369113, + "grad_norm": 0.2962758752331832, + "learning_rate": 5.693837534350588e-06, + "loss": 2.5704, + "step": 55539 + }, + { + "epoch": 2.5857950974230044, + "grad_norm": 0.28324189732740174, + "learning_rate": 5.692582236334981e-06, + "loss": 2.6014, + "step": 55540 + }, + { + "epoch": 2.5858416556090975, + "grad_norm": 0.3017068483406291, + "learning_rate": 5.691327068357277e-06, + "loss": 2.7167, + "step": 55541 + }, + { + "epoch": 2.5858882137951906, + "grad_norm": 0.29649491759194246, + "learning_rate": 5.690072030421178e-06, + "loss": 2.6521, + "step": 55542 + }, + { + "epoch": 2.5859347719812837, + "grad_norm": 0.30305681158329734, + "learning_rate": 5.688817122530332e-06, + "loss": 2.6949, + "step": 55543 + }, + { + "epoch": 2.5859813301673764, + "grad_norm": 0.3109678885066588, + "learning_rate": 5.687562344688463e-06, + "loss": 2.6358, + "step": 55544 + }, + { + "epoch": 2.5860278883534695, + "grad_norm": 0.30240423148875895, + "learning_rate": 5.686307696899224e-06, + "loss": 2.7197, + "step": 55545 + }, + { + "epoch": 2.5860744465395626, + "grad_norm": 0.2977902420299321, + "learning_rate": 5.685053179166299e-06, + "loss": 2.6599, + "step": 55546 + }, + { + "epoch": 2.5861210047256558, + "grad_norm": 0.30468895768341714, + "learning_rate": 5.683798791493383e-06, + "loss": 2.6433, + "step": 55547 + }, + { + "epoch": 2.586167562911749, + "grad_norm": 0.30300590643197545, + "learning_rate": 5.682544533884149e-06, + "loss": 2.5305, + "step": 55548 + }, + { + "epoch": 2.586214121097842, + "grad_norm": 0.30117091093350434, + "learning_rate": 5.681290406342293e-06, + "loss": 2.6393, + "step": 55549 + }, + { + "epoch": 2.586260679283935, + "grad_norm": 0.30258976418923456, + "learning_rate": 5.6800364088714645e-06, + "loss": 2.6847, + "step": 55550 + }, + { + "epoch": 2.586307237470028, + "grad_norm": 0.2892217868391754, + "learning_rate": 5.67878254147538e-06, + "loss": 2.5501, + "step": 55551 + }, + { + "epoch": 2.5863537956561213, + "grad_norm": 0.297561894019213, + "learning_rate": 5.677528804157695e-06, + "loss": 2.674, + "step": 55552 + }, + { + "epoch": 2.5864003538422145, + "grad_norm": 0.3043196623216283, + "learning_rate": 5.676275196922093e-06, + "loss": 2.5993, + "step": 55553 + }, + { + "epoch": 
2.5864469120283076, + "grad_norm": 0.30183159779474744, + "learning_rate": 5.675021719772261e-06, + "loss": 2.6035, + "step": 55554 + }, + { + "epoch": 2.5864934702144007, + "grad_norm": 0.28542869863673637, + "learning_rate": 5.673768372711879e-06, + "loss": 2.6126, + "step": 55555 + }, + { + "epoch": 2.5865400284004934, + "grad_norm": 0.28937476863461836, + "learning_rate": 5.672515155744607e-06, + "loss": 2.6355, + "step": 55556 + }, + { + "epoch": 2.5865865865865865, + "grad_norm": 0.3080939180894772, + "learning_rate": 5.67126206887415e-06, + "loss": 2.6299, + "step": 55557 + }, + { + "epoch": 2.5866331447726796, + "grad_norm": 0.30015898789987827, + "learning_rate": 5.670009112104163e-06, + "loss": 2.6596, + "step": 55558 + }, + { + "epoch": 2.5866797029587727, + "grad_norm": 0.285270123332804, + "learning_rate": 5.6687562854383315e-06, + "loss": 2.5868, + "step": 55559 + }, + { + "epoch": 2.586726261144866, + "grad_norm": 0.3121099389980491, + "learning_rate": 5.667503588880335e-06, + "loss": 2.6043, + "step": 55560 + }, + { + "epoch": 2.586772819330959, + "grad_norm": 0.2802180620590689, + "learning_rate": 5.666251022433844e-06, + "loss": 2.5345, + "step": 55561 + }, + { + "epoch": 2.586819377517052, + "grad_norm": 0.3035283530286358, + "learning_rate": 5.664998586102554e-06, + "loss": 2.6363, + "step": 55562 + }, + { + "epoch": 2.586865935703145, + "grad_norm": 0.304643504765492, + "learning_rate": 5.663746279890103e-06, + "loss": 2.6183, + "step": 55563 + }, + { + "epoch": 2.586912493889238, + "grad_norm": 0.2970621509131396, + "learning_rate": 5.662494103800209e-06, + "loss": 2.5818, + "step": 55564 + }, + { + "epoch": 2.586959052075331, + "grad_norm": 0.2943857493364309, + "learning_rate": 5.661242057836513e-06, + "loss": 2.612, + "step": 55565 + }, + { + "epoch": 2.587005610261424, + "grad_norm": 0.2993161662411779, + "learning_rate": 5.659990142002708e-06, + "loss": 2.5588, + "step": 55566 + }, + { + "epoch": 2.587052168447517, + "grad_norm": 0.2931472018747798, + "learning_rate": 5.658738356302462e-06, + "loss": 2.6212, + "step": 55567 + }, + { + "epoch": 2.5870987266336103, + "grad_norm": 0.299875923562596, + "learning_rate": 5.6574867007394505e-06, + "loss": 2.579, + "step": 55568 + }, + { + "epoch": 2.5871452848197034, + "grad_norm": 0.29964156781688567, + "learning_rate": 5.656235175317348e-06, + "loss": 2.5547, + "step": 55569 + }, + { + "epoch": 2.5871918430057965, + "grad_norm": 0.29986179636449845, + "learning_rate": 5.654983780039835e-06, + "loss": 2.578, + "step": 55570 + }, + { + "epoch": 2.5872384011918896, + "grad_norm": 0.2978076161745458, + "learning_rate": 5.65373251491057e-06, + "loss": 2.6579, + "step": 55571 + }, + { + "epoch": 2.5872849593779828, + "grad_norm": 0.30953677499315546, + "learning_rate": 5.652481379933228e-06, + "loss": 2.6296, + "step": 55572 + }, + { + "epoch": 2.587331517564076, + "grad_norm": 0.30592426533225053, + "learning_rate": 5.651230375111482e-06, + "loss": 2.664, + "step": 55573 + }, + { + "epoch": 2.587378075750169, + "grad_norm": 0.29931839214085953, + "learning_rate": 5.6499795004490095e-06, + "loss": 2.6853, + "step": 55574 + }, + { + "epoch": 2.587424633936262, + "grad_norm": 0.2811703089381346, + "learning_rate": 5.648728755949489e-06, + "loss": 2.6578, + "step": 55575 + }, + { + "epoch": 2.5874711921223548, + "grad_norm": 0.29115919184303707, + "learning_rate": 5.647478141616558e-06, + "loss": 2.5795, + "step": 55576 + }, + { + "epoch": 2.587517750308448, + "grad_norm": 0.3085541581859805, + "learning_rate": 
5.6462276574539284e-06, + "loss": 2.5778, + "step": 55577 + }, + { + "epoch": 2.587564308494541, + "grad_norm": 0.2952069667821293, + "learning_rate": 5.6449773034652435e-06, + "loss": 2.5883, + "step": 55578 + }, + { + "epoch": 2.587610866680634, + "grad_norm": 0.2952033029630253, + "learning_rate": 5.643727079654177e-06, + "loss": 2.6107, + "step": 55579 + }, + { + "epoch": 2.5876574248667272, + "grad_norm": 0.29909128945146624, + "learning_rate": 5.642476986024403e-06, + "loss": 2.6242, + "step": 55580 + }, + { + "epoch": 2.5877039830528203, + "grad_norm": 0.30873199324697886, + "learning_rate": 5.641227022579593e-06, + "loss": 2.6431, + "step": 55581 + }, + { + "epoch": 2.5877505412389135, + "grad_norm": 0.309666180072346, + "learning_rate": 5.6399771893234035e-06, + "loss": 2.5466, + "step": 55582 + }, + { + "epoch": 2.587797099425006, + "grad_norm": 0.29426017223118706, + "learning_rate": 5.6387274862595265e-06, + "loss": 2.6591, + "step": 55583 + }, + { + "epoch": 2.5878436576110992, + "grad_norm": 0.2837428060019743, + "learning_rate": 5.637477913391603e-06, + "loss": 2.6865, + "step": 55584 + }, + { + "epoch": 2.5878902157971924, + "grad_norm": 0.2812375226099917, + "learning_rate": 5.636228470723315e-06, + "loss": 2.6472, + "step": 55585 + }, + { + "epoch": 2.5879367739832855, + "grad_norm": 0.30487103839436125, + "learning_rate": 5.634979158258319e-06, + "loss": 2.5696, + "step": 55586 + }, + { + "epoch": 2.5879833321693786, + "grad_norm": 0.2941995611652145, + "learning_rate": 5.633729976000296e-06, + "loss": 2.5991, + "step": 55587 + }, + { + "epoch": 2.5880298903554717, + "grad_norm": 0.2982614925608161, + "learning_rate": 5.63248092395291e-06, + "loss": 2.6483, + "step": 55588 + }, + { + "epoch": 2.588076448541565, + "grad_norm": 0.29781818680675587, + "learning_rate": 5.631232002119802e-06, + "loss": 2.6947, + "step": 55589 + }, + { + "epoch": 2.588123006727658, + "grad_norm": 0.28537838596954684, + "learning_rate": 5.629983210504674e-06, + "loss": 2.5301, + "step": 55590 + }, + { + "epoch": 2.588169564913751, + "grad_norm": 0.3063574900042149, + "learning_rate": 5.6287345491111695e-06, + "loss": 2.6677, + "step": 55591 + }, + { + "epoch": 2.588216123099844, + "grad_norm": 0.30268504968756477, + "learning_rate": 5.627486017942956e-06, + "loss": 2.6733, + "step": 55592 + }, + { + "epoch": 2.5882626812859373, + "grad_norm": 0.28860232877865033, + "learning_rate": 5.626237617003699e-06, + "loss": 2.6654, + "step": 55593 + }, + { + "epoch": 2.5883092394720304, + "grad_norm": 0.3085020534575968, + "learning_rate": 5.624989346297066e-06, + "loss": 2.6143, + "step": 55594 + }, + { + "epoch": 2.5883557976581235, + "grad_norm": 0.29627148542849446, + "learning_rate": 5.62374120582671e-06, + "loss": 2.6466, + "step": 55595 + }, + { + "epoch": 2.588402355844216, + "grad_norm": 0.29423238712659355, + "learning_rate": 5.622493195596318e-06, + "loss": 2.5615, + "step": 55596 + }, + { + "epoch": 2.5884489140303093, + "grad_norm": 0.3016267692114465, + "learning_rate": 5.621245315609525e-06, + "loss": 2.6721, + "step": 55597 + }, + { + "epoch": 2.5884954722164024, + "grad_norm": 0.2867796413353278, + "learning_rate": 5.619997565870005e-06, + "loss": 2.6824, + "step": 55598 + }, + { + "epoch": 2.5885420304024955, + "grad_norm": 0.29016508273305525, + "learning_rate": 5.618749946381424e-06, + "loss": 2.7021, + "step": 55599 + }, + { + "epoch": 2.5885885885885886, + "grad_norm": 0.294782795277996, + "learning_rate": 5.617502457147434e-06, + "loss": 2.6486, + "step": 55600 + }, + { + "epoch": 
2.5886351467746818, + "grad_norm": 0.31862896502230204, + "learning_rate": 5.616255098171713e-06, + "loss": 2.7175, + "step": 55601 + }, + { + "epoch": 2.588681704960775, + "grad_norm": 0.3027825757626751, + "learning_rate": 5.615007869457894e-06, + "loss": 2.6093, + "step": 55602 + }, + { + "epoch": 2.5887282631468675, + "grad_norm": 0.28703338535547523, + "learning_rate": 5.6137607710096685e-06, + "loss": 2.6454, + "step": 55603 + }, + { + "epoch": 2.5887748213329607, + "grad_norm": 0.2907873833396579, + "learning_rate": 5.612513802830666e-06, + "loss": 2.6684, + "step": 55604 + }, + { + "epoch": 2.588821379519054, + "grad_norm": 0.32361577106309714, + "learning_rate": 5.611266964924583e-06, + "loss": 2.6593, + "step": 55605 + }, + { + "epoch": 2.588867937705147, + "grad_norm": 0.297085734797792, + "learning_rate": 5.610020257295046e-06, + "loss": 2.6832, + "step": 55606 + }, + { + "epoch": 2.58891449589124, + "grad_norm": 0.31072732878424136, + "learning_rate": 5.608773679945728e-06, + "loss": 2.6383, + "step": 55607 + }, + { + "epoch": 2.588961054077333, + "grad_norm": 0.3086886274223194, + "learning_rate": 5.607527232880289e-06, + "loss": 2.646, + "step": 55608 + }, + { + "epoch": 2.5890076122634262, + "grad_norm": 0.29319011131584716, + "learning_rate": 5.606280916102391e-06, + "loss": 2.6771, + "step": 55609 + }, + { + "epoch": 2.5890541704495194, + "grad_norm": 0.27985646068223147, + "learning_rate": 5.605034729615677e-06, + "loss": 2.587, + "step": 55610 + }, + { + "epoch": 2.5891007286356125, + "grad_norm": 0.29600849203161206, + "learning_rate": 5.60378867342381e-06, + "loss": 2.6918, + "step": 55611 + }, + { + "epoch": 2.5891472868217056, + "grad_norm": 0.3017263927424587, + "learning_rate": 5.602542747530448e-06, + "loss": 2.6615, + "step": 55612 + }, + { + "epoch": 2.5891938450077987, + "grad_norm": 0.3033855073624911, + "learning_rate": 5.60129695193925e-06, + "loss": 2.674, + "step": 55613 + }, + { + "epoch": 2.589240403193892, + "grad_norm": 0.31092835211033637, + "learning_rate": 5.6000512866538845e-06, + "loss": 2.616, + "step": 55614 + }, + { + "epoch": 2.5892869613799845, + "grad_norm": 0.2901882439947024, + "learning_rate": 5.598805751677971e-06, + "loss": 2.6309, + "step": 55615 + }, + { + "epoch": 2.5893335195660776, + "grad_norm": 0.30272458919800493, + "learning_rate": 5.5975603470152125e-06, + "loss": 2.6081, + "step": 55616 + }, + { + "epoch": 2.5893800777521707, + "grad_norm": 0.2873155275966004, + "learning_rate": 5.596315072669217e-06, + "loss": 2.656, + "step": 55617 + }, + { + "epoch": 2.589426635938264, + "grad_norm": 0.31206592272921246, + "learning_rate": 5.595069928643682e-06, + "loss": 2.6392, + "step": 55618 + }, + { + "epoch": 2.589473194124357, + "grad_norm": 0.29778582692260896, + "learning_rate": 5.593824914942236e-06, + "loss": 2.6227, + "step": 55619 + }, + { + "epoch": 2.58951975231045, + "grad_norm": 0.3075824860740352, + "learning_rate": 5.592580031568534e-06, + "loss": 2.6187, + "step": 55620 + }, + { + "epoch": 2.589566310496543, + "grad_norm": 0.3109541495544045, + "learning_rate": 5.591335278526238e-06, + "loss": 2.5943, + "step": 55621 + }, + { + "epoch": 2.589612868682636, + "grad_norm": 0.3064295776065333, + "learning_rate": 5.590090655818997e-06, + "loss": 2.6852, + "step": 55622 + }, + { + "epoch": 2.589659426868729, + "grad_norm": 0.30733983175814544, + "learning_rate": 5.588846163450473e-06, + "loss": 2.7096, + "step": 55623 + }, + { + "epoch": 2.589705985054822, + "grad_norm": 0.31188499792463403, + "learning_rate": 
5.587601801424302e-06, + "loss": 2.6377, + "step": 55624 + }, + { + "epoch": 2.589752543240915, + "grad_norm": 0.30717299350309546, + "learning_rate": 5.586357569744144e-06, + "loss": 2.6559, + "step": 55625 + }, + { + "epoch": 2.5897991014270083, + "grad_norm": 0.29991397669572084, + "learning_rate": 5.58511346841365e-06, + "loss": 2.6547, + "step": 55626 + }, + { + "epoch": 2.5898456596131014, + "grad_norm": 0.29798343719275844, + "learning_rate": 5.5838694974364835e-06, + "loss": 2.5649, + "step": 55627 + }, + { + "epoch": 2.5898922177991945, + "grad_norm": 0.32400077947695527, + "learning_rate": 5.582625656816265e-06, + "loss": 2.7858, + "step": 55628 + }, + { + "epoch": 2.5899387759852877, + "grad_norm": 0.31742543418307506, + "learning_rate": 5.581381946556685e-06, + "loss": 2.5665, + "step": 55629 + }, + { + "epoch": 2.5899853341713808, + "grad_norm": 0.30839505814638457, + "learning_rate": 5.580138366661353e-06, + "loss": 2.5758, + "step": 55630 + }, + { + "epoch": 2.590031892357474, + "grad_norm": 0.286470283385523, + "learning_rate": 5.578894917133959e-06, + "loss": 2.5978, + "step": 55631 + }, + { + "epoch": 2.590078450543567, + "grad_norm": 0.3076241882414007, + "learning_rate": 5.577651597978118e-06, + "loss": 2.6349, + "step": 55632 + }, + { + "epoch": 2.59012500872966, + "grad_norm": 0.31795062098226107, + "learning_rate": 5.576408409197498e-06, + "loss": 2.5915, + "step": 55633 + }, + { + "epoch": 2.5901715669157532, + "grad_norm": 0.298990694563337, + "learning_rate": 5.575165350795741e-06, + "loss": 2.5481, + "step": 55634 + }, + { + "epoch": 2.590218125101846, + "grad_norm": 0.30798103831191165, + "learning_rate": 5.573922422776495e-06, + "loss": 2.6668, + "step": 55635 + }, + { + "epoch": 2.590264683287939, + "grad_norm": 0.2969117076506692, + "learning_rate": 5.572679625143423e-06, + "loss": 2.6228, + "step": 55636 + }, + { + "epoch": 2.590311241474032, + "grad_norm": 0.29640084173963954, + "learning_rate": 5.57143695790015e-06, + "loss": 2.7763, + "step": 55637 + }, + { + "epoch": 2.5903577996601252, + "grad_norm": 0.30102250251298035, + "learning_rate": 5.570194421050329e-06, + "loss": 2.6844, + "step": 55638 + }, + { + "epoch": 2.5904043578462184, + "grad_norm": 0.303812795751115, + "learning_rate": 5.5689520145976116e-06, + "loss": 2.5751, + "step": 55639 + }, + { + "epoch": 2.5904509160323115, + "grad_norm": 0.31323525510569195, + "learning_rate": 5.5677097385456455e-06, + "loss": 2.6761, + "step": 55640 + }, + { + "epoch": 2.5904974742184046, + "grad_norm": 0.29700272196312216, + "learning_rate": 5.566467592898073e-06, + "loss": 2.6182, + "step": 55641 + }, + { + "epoch": 2.5905440324044973, + "grad_norm": 0.30126994957268366, + "learning_rate": 5.565225577658551e-06, + "loss": 2.6209, + "step": 55642 + }, + { + "epoch": 2.5905905905905904, + "grad_norm": 0.29941054130364125, + "learning_rate": 5.563983692830693e-06, + "loss": 2.479, + "step": 55643 + }, + { + "epoch": 2.5906371487766835, + "grad_norm": 0.29551300933913466, + "learning_rate": 5.5627419384181865e-06, + "loss": 2.6501, + "step": 55644 + }, + { + "epoch": 2.5906837069627766, + "grad_norm": 0.29770031005201397, + "learning_rate": 5.561500314424639e-06, + "loss": 2.6084, + "step": 55645 + }, + { + "epoch": 2.5907302651488697, + "grad_norm": 0.301774486030307, + "learning_rate": 5.5602588208537195e-06, + "loss": 2.6697, + "step": 55646 + }, + { + "epoch": 2.590776823334963, + "grad_norm": 0.2969171597809267, + "learning_rate": 5.5590174577090536e-06, + "loss": 2.6181, + "step": 55647 + }, + { + 
"epoch": 2.590823381521056, + "grad_norm": 0.29231696629131537, + "learning_rate": 5.557776224994299e-06, + "loss": 2.6006, + "step": 55648 + }, + { + "epoch": 2.590869939707149, + "grad_norm": 0.29057546496474773, + "learning_rate": 5.556535122713102e-06, + "loss": 2.6191, + "step": 55649 + }, + { + "epoch": 2.590916497893242, + "grad_norm": 0.304697070484646, + "learning_rate": 5.555294150869084e-06, + "loss": 2.7002, + "step": 55650 + }, + { + "epoch": 2.5909630560793353, + "grad_norm": 0.3100337826700385, + "learning_rate": 5.554053309465901e-06, + "loss": 2.6121, + "step": 55651 + }, + { + "epoch": 2.5910096142654284, + "grad_norm": 0.293217808098673, + "learning_rate": 5.5528125985071895e-06, + "loss": 2.6775, + "step": 55652 + }, + { + "epoch": 2.5910561724515215, + "grad_norm": 0.3150728583846222, + "learning_rate": 5.551572017996598e-06, + "loss": 2.704, + "step": 55653 + }, + { + "epoch": 2.591102730637614, + "grad_norm": 0.31454388178334913, + "learning_rate": 5.550331567937766e-06, + "loss": 2.6041, + "step": 55654 + }, + { + "epoch": 2.5911492888237073, + "grad_norm": 0.30111374725470935, + "learning_rate": 5.549091248334337e-06, + "loss": 2.6361, + "step": 55655 + }, + { + "epoch": 2.5911958470098004, + "grad_norm": 0.29498904265833764, + "learning_rate": 5.547851059189929e-06, + "loss": 2.5127, + "step": 55656 + }, + { + "epoch": 2.5912424051958936, + "grad_norm": 0.30447161970688363, + "learning_rate": 5.546611000508217e-06, + "loss": 2.466, + "step": 55657 + }, + { + "epoch": 2.5912889633819867, + "grad_norm": 0.3019544206182489, + "learning_rate": 5.5453710722928095e-06, + "loss": 2.5962, + "step": 55658 + }, + { + "epoch": 2.59133552156808, + "grad_norm": 0.287682825629454, + "learning_rate": 5.544131274547365e-06, + "loss": 2.5754, + "step": 55659 + }, + { + "epoch": 2.591382079754173, + "grad_norm": 0.2974628836936517, + "learning_rate": 5.542891607275513e-06, + "loss": 2.4822, + "step": 55660 + }, + { + "epoch": 2.591428637940266, + "grad_norm": 0.30139396317111095, + "learning_rate": 5.541652070480891e-06, + "loss": 2.5688, + "step": 55661 + }, + { + "epoch": 2.5914751961263587, + "grad_norm": 0.296059149601535, + "learning_rate": 5.540412664167155e-06, + "loss": 2.5867, + "step": 55662 + }, + { + "epoch": 2.591521754312452, + "grad_norm": 0.2782615380711608, + "learning_rate": 5.539173388337915e-06, + "loss": 2.619, + "step": 55663 + }, + { + "epoch": 2.591568312498545, + "grad_norm": 0.29739491469326496, + "learning_rate": 5.5379342429968225e-06, + "loss": 2.7162, + "step": 55664 + }, + { + "epoch": 2.591614870684638, + "grad_norm": 0.2936129900772445, + "learning_rate": 5.536695228147509e-06, + "loss": 2.6173, + "step": 55665 + }, + { + "epoch": 2.591661428870731, + "grad_norm": 0.3165697619495128, + "learning_rate": 5.535456343793621e-06, + "loss": 2.599, + "step": 55666 + }, + { + "epoch": 2.5917079870568243, + "grad_norm": 0.30662841475098557, + "learning_rate": 5.534217589938784e-06, + "loss": 2.6318, + "step": 55667 + }, + { + "epoch": 2.5917545452429174, + "grad_norm": 0.2975964967534014, + "learning_rate": 5.5329789665866495e-06, + "loss": 2.5313, + "step": 55668 + }, + { + "epoch": 2.5918011034290105, + "grad_norm": 0.29963259333834524, + "learning_rate": 5.531740473740821e-06, + "loss": 2.6215, + "step": 55669 + }, + { + "epoch": 2.5918476616151036, + "grad_norm": 0.3029401914704319, + "learning_rate": 5.530502111404973e-06, + "loss": 2.5536, + "step": 55670 + }, + { + "epoch": 2.5918942198011967, + "grad_norm": 0.3076581962652749, + "learning_rate": 
5.5292638795827075e-06, + "loss": 2.6497, + "step": 55671 + }, + { + "epoch": 2.59194077798729, + "grad_norm": 0.3014351964997932, + "learning_rate": 5.528025778277673e-06, + "loss": 2.6722, + "step": 55672 + }, + { + "epoch": 2.591987336173383, + "grad_norm": 0.294869583742002, + "learning_rate": 5.5267878074935e-06, + "loss": 2.5873, + "step": 55673 + }, + { + "epoch": 2.5920338943594756, + "grad_norm": 0.30740125937355856, + "learning_rate": 5.525549967233828e-06, + "loss": 2.6325, + "step": 55674 + }, + { + "epoch": 2.5920804525455687, + "grad_norm": 0.30044749921721237, + "learning_rate": 5.524312257502295e-06, + "loss": 2.5654, + "step": 55675 + }, + { + "epoch": 2.592127010731662, + "grad_norm": 0.28939926725700815, + "learning_rate": 5.523074678302498e-06, + "loss": 2.6186, + "step": 55676 + }, + { + "epoch": 2.592173568917755, + "grad_norm": 0.3106604853897937, + "learning_rate": 5.521837229638121e-06, + "loss": 2.6974, + "step": 55677 + }, + { + "epoch": 2.592220127103848, + "grad_norm": 0.2996280840175596, + "learning_rate": 5.5205999115127585e-06, + "loss": 2.679, + "step": 55678 + }, + { + "epoch": 2.592266685289941, + "grad_norm": 0.30408664266847313, + "learning_rate": 5.519362723930055e-06, + "loss": 2.5605, + "step": 55679 + }, + { + "epoch": 2.5923132434760343, + "grad_norm": 0.31229855512497723, + "learning_rate": 5.518125666893636e-06, + "loss": 2.7274, + "step": 55680 + }, + { + "epoch": 2.592359801662127, + "grad_norm": 0.30989091767140825, + "learning_rate": 5.516888740407145e-06, + "loss": 2.6441, + "step": 55681 + }, + { + "epoch": 2.59240635984822, + "grad_norm": 0.2939020300675206, + "learning_rate": 5.515651944474187e-06, + "loss": 2.5856, + "step": 55682 + }, + { + "epoch": 2.592452918034313, + "grad_norm": 0.3116197478011983, + "learning_rate": 5.514415279098428e-06, + "loss": 2.6797, + "step": 55683 + }, + { + "epoch": 2.5924994762204063, + "grad_norm": 0.3058165482365819, + "learning_rate": 5.513178744283465e-06, + "loss": 2.6602, + "step": 55684 + }, + { + "epoch": 2.5925460344064994, + "grad_norm": 0.3014932117551455, + "learning_rate": 5.511942340032944e-06, + "loss": 2.6128, + "step": 55685 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.31249033537174403, + "learning_rate": 5.510706066350485e-06, + "loss": 2.5121, + "step": 55686 + }, + { + "epoch": 2.5926391507786857, + "grad_norm": 0.30150167204291556, + "learning_rate": 5.509469923239724e-06, + "loss": 2.6264, + "step": 55687 + }, + { + "epoch": 2.592685708964779, + "grad_norm": 0.31431162500236476, + "learning_rate": 5.508233910704291e-06, + "loss": 2.7005, + "step": 55688 + }, + { + "epoch": 2.592732267150872, + "grad_norm": 0.2958986852206176, + "learning_rate": 5.506998028747795e-06, + "loss": 2.6078, + "step": 55689 + }, + { + "epoch": 2.592778825336965, + "grad_norm": 0.3101053448863063, + "learning_rate": 5.505762277373894e-06, + "loss": 2.5079, + "step": 55690 + }, + { + "epoch": 2.592825383523058, + "grad_norm": 0.31309798563913643, + "learning_rate": 5.504526656586184e-06, + "loss": 2.5676, + "step": 55691 + }, + { + "epoch": 2.5928719417091513, + "grad_norm": 0.30129993907456487, + "learning_rate": 5.503291166388308e-06, + "loss": 2.7054, + "step": 55692 + }, + { + "epoch": 2.592918499895244, + "grad_norm": 0.28917776566450354, + "learning_rate": 5.50205580678389e-06, + "loss": 2.5735, + "step": 55693 + }, + { + "epoch": 2.592965058081337, + "grad_norm": 0.31325799753996786, + "learning_rate": 5.50082057777655e-06, + "loss": 2.5977, + "step": 55694 + }, + { + "epoch": 
2.59301161626743, + "grad_norm": 0.290613763153111, + "learning_rate": 5.499585479369917e-06, + "loss": 2.6567, + "step": 55695 + }, + { + "epoch": 2.5930581744535233, + "grad_norm": 0.3090708114675759, + "learning_rate": 5.498350511567635e-06, + "loss": 2.6341, + "step": 55696 + }, + { + "epoch": 2.5931047326396164, + "grad_norm": 0.3014518884937808, + "learning_rate": 5.4971156743732825e-06, + "loss": 2.6234, + "step": 55697 + }, + { + "epoch": 2.5931512908257095, + "grad_norm": 0.28195196336576084, + "learning_rate": 5.4958809677905344e-06, + "loss": 2.5655, + "step": 55698 + }, + { + "epoch": 2.5931978490118026, + "grad_norm": 0.3088544615620993, + "learning_rate": 5.494646391822977e-06, + "loss": 2.5268, + "step": 55699 + }, + { + "epoch": 2.5932444071978957, + "grad_norm": 0.298317867891889, + "learning_rate": 5.493411946474253e-06, + "loss": 2.6145, + "step": 55700 + }, + { + "epoch": 2.5932909653839884, + "grad_norm": 0.3101723192714444, + "learning_rate": 5.4921776317479915e-06, + "loss": 2.6309, + "step": 55701 + }, + { + "epoch": 2.5933375235700815, + "grad_norm": 0.29363046867742926, + "learning_rate": 5.490943447647779e-06, + "loss": 2.7032, + "step": 55702 + }, + { + "epoch": 2.5933840817561746, + "grad_norm": 0.2962333401797507, + "learning_rate": 5.489709394177283e-06, + "loss": 2.5769, + "step": 55703 + }, + { + "epoch": 2.5934306399422677, + "grad_norm": 0.29814451014306953, + "learning_rate": 5.488475471340099e-06, + "loss": 2.6217, + "step": 55704 + }, + { + "epoch": 2.593477198128361, + "grad_norm": 0.2984027356802781, + "learning_rate": 5.487241679139848e-06, + "loss": 2.5924, + "step": 55705 + }, + { + "epoch": 2.593523756314454, + "grad_norm": 0.3022698645045706, + "learning_rate": 5.486008017580163e-06, + "loss": 2.538, + "step": 55706 + }, + { + "epoch": 2.593570314500547, + "grad_norm": 0.28587878590196647, + "learning_rate": 5.484774486664657e-06, + "loss": 2.6114, + "step": 55707 + }, + { + "epoch": 2.59361687268664, + "grad_norm": 0.3010153389341494, + "learning_rate": 5.483541086396954e-06, + "loss": 2.5691, + "step": 55708 + }, + { + "epoch": 2.5936634308727333, + "grad_norm": 0.29830248472465604, + "learning_rate": 5.482307816780679e-06, + "loss": 2.5922, + "step": 55709 + }, + { + "epoch": 2.5937099890588264, + "grad_norm": 0.30270635966884707, + "learning_rate": 5.481074677819431e-06, + "loss": 2.661, + "step": 55710 + }, + { + "epoch": 2.5937565472449196, + "grad_norm": 0.28784524434879566, + "learning_rate": 5.479841669516855e-06, + "loss": 2.6034, + "step": 55711 + }, + { + "epoch": 2.5938031054310127, + "grad_norm": 0.2913141292918252, + "learning_rate": 5.478608791876555e-06, + "loss": 2.5808, + "step": 55712 + }, + { + "epoch": 2.5938496636171053, + "grad_norm": 0.3067087900119691, + "learning_rate": 5.47737604490215e-06, + "loss": 2.5746, + "step": 55713 + }, + { + "epoch": 2.5938962218031985, + "grad_norm": 0.31073664056381795, + "learning_rate": 5.476143428597269e-06, + "loss": 2.5322, + "step": 55714 + }, + { + "epoch": 2.5939427799892916, + "grad_norm": 0.30419627101507857, + "learning_rate": 5.4749109429655e-06, + "loss": 2.5685, + "step": 55715 + }, + { + "epoch": 2.5939893381753847, + "grad_norm": 0.2921886249802624, + "learning_rate": 5.4736785880105e-06, + "loss": 2.5393, + "step": 55716 + }, + { + "epoch": 2.594035896361478, + "grad_norm": 0.2921452748319287, + "learning_rate": 5.472446363735851e-06, + "loss": 2.5494, + "step": 55717 + }, + { + "epoch": 2.594082454547571, + "grad_norm": 0.30494772755200944, + "learning_rate": 
5.471214270145197e-06, + "loss": 2.6388, + "step": 55718 + }, + { + "epoch": 2.594129012733664, + "grad_norm": 0.31726641250568166, + "learning_rate": 5.469982307242138e-06, + "loss": 2.6791, + "step": 55719 + }, + { + "epoch": 2.5941755709197567, + "grad_norm": 0.31643547411600886, + "learning_rate": 5.4687504750302984e-06, + "loss": 2.636, + "step": 55720 + }, + { + "epoch": 2.59422212910585, + "grad_norm": 0.30410782654130253, + "learning_rate": 5.467518773513281e-06, + "loss": 2.6844, + "step": 55721 + }, + { + "epoch": 2.594268687291943, + "grad_norm": 0.308167196826691, + "learning_rate": 5.466287202694725e-06, + "loss": 2.6247, + "step": 55722 + }, + { + "epoch": 2.594315245478036, + "grad_norm": 0.2888030446035094, + "learning_rate": 5.465055762578203e-06, + "loss": 2.6296, + "step": 55723 + }, + { + "epoch": 2.594361803664129, + "grad_norm": 0.32035171601198426, + "learning_rate": 5.463824453167382e-06, + "loss": 2.6026, + "step": 55724 + }, + { + "epoch": 2.5944083618502223, + "grad_norm": 0.3244860997106744, + "learning_rate": 5.4625932744658325e-06, + "loss": 2.6737, + "step": 55725 + }, + { + "epoch": 2.5944549200363154, + "grad_norm": 0.29115128874588186, + "learning_rate": 5.461362226477184e-06, + "loss": 2.6883, + "step": 55726 + }, + { + "epoch": 2.5945014782224085, + "grad_norm": 0.2941037624270879, + "learning_rate": 5.460131309205063e-06, + "loss": 2.6737, + "step": 55727 + }, + { + "epoch": 2.5945480364085016, + "grad_norm": 0.303607204046002, + "learning_rate": 5.458900522653048e-06, + "loss": 2.5737, + "step": 55728 + }, + { + "epoch": 2.5945945945945947, + "grad_norm": 0.3073387793748589, + "learning_rate": 5.457669866824794e-06, + "loss": 2.6413, + "step": 55729 + }, + { + "epoch": 2.594641152780688, + "grad_norm": 0.30587356033012825, + "learning_rate": 5.456439341723868e-06, + "loss": 2.6821, + "step": 55730 + }, + { + "epoch": 2.594687710966781, + "grad_norm": 0.2853595238854884, + "learning_rate": 5.455208947353924e-06, + "loss": 2.5573, + "step": 55731 + }, + { + "epoch": 2.5947342691528736, + "grad_norm": 0.2941295483970693, + "learning_rate": 5.453978683718541e-06, + "loss": 2.6092, + "step": 55732 + }, + { + "epoch": 2.5947808273389668, + "grad_norm": 0.2869226918882964, + "learning_rate": 5.452748550821346e-06, + "loss": 2.6109, + "step": 55733 + }, + { + "epoch": 2.59482738552506, + "grad_norm": 0.2969448623019964, + "learning_rate": 5.451518548665946e-06, + "loss": 2.6376, + "step": 55734 + }, + { + "epoch": 2.594873943711153, + "grad_norm": 0.2934876052504585, + "learning_rate": 5.450288677255949e-06, + "loss": 2.6801, + "step": 55735 + }, + { + "epoch": 2.594920501897246, + "grad_norm": 0.3063170528569876, + "learning_rate": 5.449058936594964e-06, + "loss": 2.626, + "step": 55736 + }, + { + "epoch": 2.594967060083339, + "grad_norm": 0.30922736705247544, + "learning_rate": 5.4478293266866155e-06, + "loss": 2.6056, + "step": 55737 + }, + { + "epoch": 2.5950136182694323, + "grad_norm": 0.3075322092133119, + "learning_rate": 5.446599847534483e-06, + "loss": 2.6328, + "step": 55738 + }, + { + "epoch": 2.5950601764555254, + "grad_norm": 0.2979481914019975, + "learning_rate": 5.445370499142199e-06, + "loss": 2.6399, + "step": 55739 + }, + { + "epoch": 2.595106734641618, + "grad_norm": 0.29108487355519824, + "learning_rate": 5.444141281513365e-06, + "loss": 2.6055, + "step": 55740 + }, + { + "epoch": 2.5951532928277112, + "grad_norm": 0.29650631325677956, + "learning_rate": 5.4429121946515725e-06, + "loss": 2.5996, + "step": 55741 + }, + { + "epoch": 
2.5951998510138043, + "grad_norm": 0.30128563980976236, + "learning_rate": 5.441683238560463e-06, + "loss": 2.6316, + "step": 55742 + }, + { + "epoch": 2.5952464091998975, + "grad_norm": 0.3125570614738066, + "learning_rate": 5.440454413243601e-06, + "loss": 2.7173, + "step": 55743 + }, + { + "epoch": 2.5952929673859906, + "grad_norm": 0.29611651178036996, + "learning_rate": 5.439225718704638e-06, + "loss": 2.6894, + "step": 55744 + }, + { + "epoch": 2.5953395255720837, + "grad_norm": 0.2983372162624088, + "learning_rate": 5.4379971549471445e-06, + "loss": 2.5693, + "step": 55745 + }, + { + "epoch": 2.595386083758177, + "grad_norm": 0.3041665297788536, + "learning_rate": 5.436768721974739e-06, + "loss": 2.6755, + "step": 55746 + }, + { + "epoch": 2.59543264194427, + "grad_norm": 0.2988612915348046, + "learning_rate": 5.43554041979103e-06, + "loss": 2.6356, + "step": 55747 + }, + { + "epoch": 2.595479200130363, + "grad_norm": 0.29573750464728954, + "learning_rate": 5.434312248399614e-06, + "loss": 2.6151, + "step": 55748 + }, + { + "epoch": 2.595525758316456, + "grad_norm": 0.2966430602196242, + "learning_rate": 5.433084207804107e-06, + "loss": 2.5855, + "step": 55749 + }, + { + "epoch": 2.5955723165025493, + "grad_norm": 0.29532839999028304, + "learning_rate": 5.431856298008115e-06, + "loss": 2.5564, + "step": 55750 + }, + { + "epoch": 2.5956188746886424, + "grad_norm": 0.31107583098260005, + "learning_rate": 5.430628519015224e-06, + "loss": 2.628, + "step": 55751 + }, + { + "epoch": 2.595665432874735, + "grad_norm": 0.3010070045271047, + "learning_rate": 5.429400870829049e-06, + "loss": 2.5653, + "step": 55752 + }, + { + "epoch": 2.595711991060828, + "grad_norm": 0.29966077000028296, + "learning_rate": 5.428173353453192e-06, + "loss": 2.6224, + "step": 55753 + }, + { + "epoch": 2.5957585492469213, + "grad_norm": 0.2954883438866305, + "learning_rate": 5.426945966891256e-06, + "loss": 2.6066, + "step": 55754 + }, + { + "epoch": 2.5958051074330144, + "grad_norm": 0.29988417175106263, + "learning_rate": 5.425718711146849e-06, + "loss": 2.6259, + "step": 55755 + }, + { + "epoch": 2.5958516656191075, + "grad_norm": 0.3029961487038622, + "learning_rate": 5.424491586223546e-06, + "loss": 2.5872, + "step": 55756 + }, + { + "epoch": 2.5958982238052006, + "grad_norm": 0.31350682025401383, + "learning_rate": 5.423264592124988e-06, + "loss": 2.6283, + "step": 55757 + }, + { + "epoch": 2.5959447819912937, + "grad_norm": 0.30490972986381104, + "learning_rate": 5.422037728854751e-06, + "loss": 2.5996, + "step": 55758 + }, + { + "epoch": 2.5959913401773864, + "grad_norm": 0.2894462422274103, + "learning_rate": 5.420810996416442e-06, + "loss": 2.6173, + "step": 55759 + }, + { + "epoch": 2.5960378983634795, + "grad_norm": 0.29964614123252703, + "learning_rate": 5.419584394813654e-06, + "loss": 2.5762, + "step": 55760 + }, + { + "epoch": 2.5960844565495726, + "grad_norm": 0.2970565485737462, + "learning_rate": 5.418357924049999e-06, + "loss": 2.6558, + "step": 55761 + }, + { + "epoch": 2.5961310147356658, + "grad_norm": 0.31167540606521454, + "learning_rate": 5.4171315841290704e-06, + "loss": 2.6405, + "step": 55762 + }, + { + "epoch": 2.596177572921759, + "grad_norm": 0.3021494287676478, + "learning_rate": 5.415905375054481e-06, + "loss": 2.5401, + "step": 55763 + }, + { + "epoch": 2.596224131107852, + "grad_norm": 0.3043119869027631, + "learning_rate": 5.4146792968298054e-06, + "loss": 2.5649, + "step": 55764 + }, + { + "epoch": 2.596270689293945, + "grad_norm": 0.3030042708391802, + "learning_rate": 
5.413453349458652e-06, + "loss": 2.6411, + "step": 55765 + }, + { + "epoch": 2.5963172474800382, + "grad_norm": 0.3204478914835158, + "learning_rate": 5.412227532944625e-06, + "loss": 2.6052, + "step": 55766 + }, + { + "epoch": 2.5963638056661313, + "grad_norm": 0.29437664024967747, + "learning_rate": 5.411001847291314e-06, + "loss": 2.4884, + "step": 55767 + }, + { + "epoch": 2.5964103638522245, + "grad_norm": 0.2857537165113038, + "learning_rate": 5.409776292502328e-06, + "loss": 2.6104, + "step": 55768 + }, + { + "epoch": 2.5964569220383176, + "grad_norm": 0.3019233740673018, + "learning_rate": 5.408550868581241e-06, + "loss": 2.6362, + "step": 55769 + }, + { + "epoch": 2.5965034802244107, + "grad_norm": 0.313412843453208, + "learning_rate": 5.40732557553168e-06, + "loss": 2.6924, + "step": 55770 + }, + { + "epoch": 2.596550038410504, + "grad_norm": 0.2866277087520834, + "learning_rate": 5.406100413357207e-06, + "loss": 2.5843, + "step": 55771 + }, + { + "epoch": 2.5965965965965965, + "grad_norm": 0.3077235047310055, + "learning_rate": 5.404875382061453e-06, + "loss": 2.6185, + "step": 55772 + }, + { + "epoch": 2.5966431547826896, + "grad_norm": 0.305457309286596, + "learning_rate": 5.403650481647993e-06, + "loss": 2.6054, + "step": 55773 + }, + { + "epoch": 2.5966897129687827, + "grad_norm": 0.309980856355417, + "learning_rate": 5.402425712120418e-06, + "loss": 2.6201, + "step": 55774 + }, + { + "epoch": 2.596736271154876, + "grad_norm": 0.3006770840824216, + "learning_rate": 5.401201073482337e-06, + "loss": 2.6518, + "step": 55775 + }, + { + "epoch": 2.596782829340969, + "grad_norm": 0.2970961726903933, + "learning_rate": 5.399976565737342e-06, + "loss": 2.5287, + "step": 55776 + }, + { + "epoch": 2.596829387527062, + "grad_norm": 0.31060241724885457, + "learning_rate": 5.3987521888890116e-06, + "loss": 2.5974, + "step": 55777 + }, + { + "epoch": 2.596875945713155, + "grad_norm": 0.30708840255323083, + "learning_rate": 5.39752794294095e-06, + "loss": 2.551, + "step": 55778 + }, + { + "epoch": 2.596922503899248, + "grad_norm": 0.30258228528536507, + "learning_rate": 5.396303827896754e-06, + "loss": 2.6859, + "step": 55779 + }, + { + "epoch": 2.596969062085341, + "grad_norm": 0.303356996150627, + "learning_rate": 5.395079843760009e-06, + "loss": 2.6056, + "step": 55780 + }, + { + "epoch": 2.597015620271434, + "grad_norm": 0.29684361561110034, + "learning_rate": 5.3938559905343185e-06, + "loss": 2.6551, + "step": 55781 + }, + { + "epoch": 2.597062178457527, + "grad_norm": 0.28265979458021706, + "learning_rate": 5.392632268223252e-06, + "loss": 2.603, + "step": 55782 + }, + { + "epoch": 2.5971087366436203, + "grad_norm": 0.28967340540705555, + "learning_rate": 5.391408676830428e-06, + "loss": 2.6549, + "step": 55783 + }, + { + "epoch": 2.5971552948297134, + "grad_norm": 0.31037514243416264, + "learning_rate": 5.3901852163594116e-06, + "loss": 2.6263, + "step": 55784 + }, + { + "epoch": 2.5972018530158065, + "grad_norm": 0.3129529189476891, + "learning_rate": 5.38896188681382e-06, + "loss": 2.6978, + "step": 55785 + }, + { + "epoch": 2.5972484112018996, + "grad_norm": 0.322184859114456, + "learning_rate": 5.387738688197225e-06, + "loss": 2.6133, + "step": 55786 + }, + { + "epoch": 2.5972949693879928, + "grad_norm": 0.28887524264067144, + "learning_rate": 5.386515620513222e-06, + "loss": 2.6941, + "step": 55787 + }, + { + "epoch": 2.597341527574086, + "grad_norm": 0.2994723337716216, + "learning_rate": 5.385292683765397e-06, + "loss": 2.6467, + "step": 55788 + }, + { + "epoch": 
2.597388085760179, + "grad_norm": 0.3172326548199888, + "learning_rate": 5.3840698779573485e-06, + "loss": 2.5936, + "step": 55789 + }, + { + "epoch": 2.597434643946272, + "grad_norm": 0.30916384791569546, + "learning_rate": 5.382847203092667e-06, + "loss": 2.7565, + "step": 55790 + }, + { + "epoch": 2.5974812021323648, + "grad_norm": 0.31025149354123926, + "learning_rate": 5.381624659174922e-06, + "loss": 2.5522, + "step": 55791 + }, + { + "epoch": 2.597527760318458, + "grad_norm": 0.29852028324299806, + "learning_rate": 5.380402246207711e-06, + "loss": 2.6309, + "step": 55792 + }, + { + "epoch": 2.597574318504551, + "grad_norm": 0.29526501575181824, + "learning_rate": 5.379179964194625e-06, + "loss": 2.6209, + "step": 55793 + }, + { + "epoch": 2.597620876690644, + "grad_norm": 0.31819787358679064, + "learning_rate": 5.377957813139262e-06, + "loss": 2.7406, + "step": 55794 + }, + { + "epoch": 2.5976674348767372, + "grad_norm": 0.29908969792417345, + "learning_rate": 5.376735793045179e-06, + "loss": 2.65, + "step": 55795 + }, + { + "epoch": 2.5977139930628304, + "grad_norm": 0.3109578066751395, + "learning_rate": 5.3755139039159964e-06, + "loss": 2.5772, + "step": 55796 + }, + { + "epoch": 2.5977605512489235, + "grad_norm": 0.2977817872982537, + "learning_rate": 5.374292145755266e-06, + "loss": 2.6831, + "step": 55797 + }, + { + "epoch": 2.597807109435016, + "grad_norm": 0.2968610896458845, + "learning_rate": 5.3730705185666086e-06, + "loss": 2.5546, + "step": 55798 + }, + { + "epoch": 2.5978536676211093, + "grad_norm": 0.31285276444884574, + "learning_rate": 5.371849022353586e-06, + "loss": 2.5638, + "step": 55799 + }, + { + "epoch": 2.5979002258072024, + "grad_norm": 0.31210693378541793, + "learning_rate": 5.370627657119786e-06, + "loss": 2.6506, + "step": 55800 + }, + { + "epoch": 2.5979467839932955, + "grad_norm": 0.3079220643668471, + "learning_rate": 5.3694064228688004e-06, + "loss": 2.5863, + "step": 55801 + }, + { + "epoch": 2.5979933421793886, + "grad_norm": 0.2995493817716366, + "learning_rate": 5.3681853196042135e-06, + "loss": 2.6317, + "step": 55802 + }, + { + "epoch": 2.5980399003654817, + "grad_norm": 0.3007932259985924, + "learning_rate": 5.366964347329612e-06, + "loss": 2.5932, + "step": 55803 + }, + { + "epoch": 2.598086458551575, + "grad_norm": 0.3193520734024676, + "learning_rate": 5.36574350604856e-06, + "loss": 2.7432, + "step": 55804 + }, + { + "epoch": 2.598133016737668, + "grad_norm": 0.30531841743862187, + "learning_rate": 5.36452279576466e-06, + "loss": 2.5947, + "step": 55805 + }, + { + "epoch": 2.598179574923761, + "grad_norm": 0.30187927112994395, + "learning_rate": 5.36330221648148e-06, + "loss": 2.6911, + "step": 55806 + }, + { + "epoch": 2.598226133109854, + "grad_norm": 0.29989088541413805, + "learning_rate": 5.36208176820262e-06, + "loss": 2.5898, + "step": 55807 + }, + { + "epoch": 2.5982726912959473, + "grad_norm": 0.29971990819555144, + "learning_rate": 5.360861450931648e-06, + "loss": 2.4941, + "step": 55808 + }, + { + "epoch": 2.5983192494820404, + "grad_norm": 0.3112257731869924, + "learning_rate": 5.359641264672155e-06, + "loss": 2.6563, + "step": 55809 + }, + { + "epoch": 2.5983658076681335, + "grad_norm": 0.31189344789907253, + "learning_rate": 5.3584212094277055e-06, + "loss": 2.644, + "step": 55810 + }, + { + "epoch": 2.598412365854226, + "grad_norm": 0.31047359039082695, + "learning_rate": 5.357201285201907e-06, + "loss": 2.6105, + "step": 55811 + }, + { + "epoch": 2.5984589240403193, + "grad_norm": 0.32357860809477407, + "learning_rate": 
5.355981491998313e-06, + "loss": 2.6457, + "step": 55812 + }, + { + "epoch": 2.5985054822264124, + "grad_norm": 0.3173011155919558, + "learning_rate": 5.35476182982052e-06, + "loss": 2.6458, + "step": 55813 + }, + { + "epoch": 2.5985520404125055, + "grad_norm": 0.32095628816449245, + "learning_rate": 5.353542298672104e-06, + "loss": 2.5793, + "step": 55814 + }, + { + "epoch": 2.5985985985985987, + "grad_norm": 0.3030393591444639, + "learning_rate": 5.352322898556639e-06, + "loss": 2.6094, + "step": 55815 + }, + { + "epoch": 2.5986451567846918, + "grad_norm": 0.295010559268445, + "learning_rate": 5.351103629477716e-06, + "loss": 2.513, + "step": 55816 + }, + { + "epoch": 2.598691714970785, + "grad_norm": 0.316244228622874, + "learning_rate": 5.3498844914389e-06, + "loss": 2.6874, + "step": 55817 + }, + { + "epoch": 2.5987382731568776, + "grad_norm": 0.319056872334737, + "learning_rate": 5.3486654844437714e-06, + "loss": 2.6072, + "step": 55818 + }, + { + "epoch": 2.5987848313429707, + "grad_norm": 0.31382930318537966, + "learning_rate": 5.347446608495909e-06, + "loss": 2.5692, + "step": 55819 + }, + { + "epoch": 2.598831389529064, + "grad_norm": 0.3066635625619796, + "learning_rate": 5.346227863598896e-06, + "loss": 2.6479, + "step": 55820 + }, + { + "epoch": 2.598877947715157, + "grad_norm": 0.31937823792217407, + "learning_rate": 5.345009249756305e-06, + "loss": 2.5924, + "step": 55821 + }, + { + "epoch": 2.59892450590125, + "grad_norm": 0.2885061387701618, + "learning_rate": 5.343790766971718e-06, + "loss": 2.496, + "step": 55822 + }, + { + "epoch": 2.598971064087343, + "grad_norm": 0.30457291456629454, + "learning_rate": 5.342572415248687e-06, + "loss": 2.6363, + "step": 55823 + }, + { + "epoch": 2.5990176222734362, + "grad_norm": 0.34371001393291706, + "learning_rate": 5.341354194590831e-06, + "loss": 2.6013, + "step": 55824 + }, + { + "epoch": 2.5990641804595294, + "grad_norm": 0.3099674580163003, + "learning_rate": 5.340136105001686e-06, + "loss": 2.6907, + "step": 55825 + }, + { + "epoch": 2.5991107386456225, + "grad_norm": 0.2905315202863046, + "learning_rate": 5.338918146484844e-06, + "loss": 2.4927, + "step": 55826 + }, + { + "epoch": 2.5991572968317156, + "grad_norm": 0.31714127538365966, + "learning_rate": 5.337700319043876e-06, + "loss": 2.6654, + "step": 55827 + }, + { + "epoch": 2.5992038550178087, + "grad_norm": 0.32559408265474166, + "learning_rate": 5.336482622682359e-06, + "loss": 2.7295, + "step": 55828 + }, + { + "epoch": 2.599250413203902, + "grad_norm": 0.29953254871163615, + "learning_rate": 5.3352650574038765e-06, + "loss": 2.6416, + "step": 55829 + }, + { + "epoch": 2.5992969713899945, + "grad_norm": 0.2846977993390361, + "learning_rate": 5.33404762321198e-06, + "loss": 2.6619, + "step": 55830 + }, + { + "epoch": 2.5993435295760876, + "grad_norm": 0.2993528685139036, + "learning_rate": 5.332830320110249e-06, + "loss": 2.5714, + "step": 55831 + }, + { + "epoch": 2.5993900877621807, + "grad_norm": 0.3009227314056059, + "learning_rate": 5.3316131481022654e-06, + "loss": 2.5665, + "step": 55832 + }, + { + "epoch": 2.599436645948274, + "grad_norm": 0.3074278856163506, + "learning_rate": 5.330396107191599e-06, + "loss": 2.4905, + "step": 55833 + }, + { + "epoch": 2.599483204134367, + "grad_norm": 0.3104443417610225, + "learning_rate": 5.3291791973818115e-06, + "loss": 2.709, + "step": 55834 + }, + { + "epoch": 2.59952976232046, + "grad_norm": 0.3087205614417182, + "learning_rate": 5.327962418676497e-06, + "loss": 2.5774, + "step": 55835 + }, + { + "epoch": 
2.599576320506553, + "grad_norm": 0.32021452686609025, + "learning_rate": 5.3267457710791945e-06, + "loss": 2.6644, + "step": 55836 + }, + { + "epoch": 2.5996228786926463, + "grad_norm": 0.299250300570849, + "learning_rate": 5.325529254593509e-06, + "loss": 2.6415, + "step": 55837 + }, + { + "epoch": 2.599669436878739, + "grad_norm": 0.3094906305984272, + "learning_rate": 5.324312869222986e-06, + "loss": 2.5855, + "step": 55838 + }, + { + "epoch": 2.599715995064832, + "grad_norm": 0.3092739809764605, + "learning_rate": 5.323096614971202e-06, + "loss": 2.5929, + "step": 55839 + }, + { + "epoch": 2.599762553250925, + "grad_norm": 0.33201595501027353, + "learning_rate": 5.321880491841724e-06, + "loss": 2.6655, + "step": 55840 + }, + { + "epoch": 2.5998091114370183, + "grad_norm": 0.3087029072891289, + "learning_rate": 5.32066449983813e-06, + "loss": 2.6041, + "step": 55841 + }, + { + "epoch": 2.5998556696231114, + "grad_norm": 0.308131084646566, + "learning_rate": 5.319448638963992e-06, + "loss": 2.6183, + "step": 55842 + }, + { + "epoch": 2.5999022278092045, + "grad_norm": 0.3023119582807803, + "learning_rate": 5.3182329092228535e-06, + "loss": 2.6252, + "step": 55843 + }, + { + "epoch": 2.5999487859952977, + "grad_norm": 0.3101565072333419, + "learning_rate": 5.317017310618317e-06, + "loss": 2.5941, + "step": 55844 + }, + { + "epoch": 2.5999953441813908, + "grad_norm": 0.3217045508668162, + "learning_rate": 5.315801843153923e-06, + "loss": 2.6378, + "step": 55845 + }, + { + "epoch": 2.600041902367484, + "grad_norm": 0.30754736018471374, + "learning_rate": 5.3145865068332465e-06, + "loss": 2.6793, + "step": 55846 + }, + { + "epoch": 2.600088460553577, + "grad_norm": 0.3126301801873785, + "learning_rate": 5.313371301659859e-06, + "loss": 2.6437, + "step": 55847 + }, + { + "epoch": 2.60013501873967, + "grad_norm": 0.3087715404857311, + "learning_rate": 5.3121562276373324e-06, + "loss": 2.7118, + "step": 55848 + }, + { + "epoch": 2.6001815769257632, + "grad_norm": 0.316055709050608, + "learning_rate": 5.3109412847692055e-06, + "loss": 2.594, + "step": 55849 + }, + { + "epoch": 2.600228135111856, + "grad_norm": 0.318925372546112, + "learning_rate": 5.309726473059085e-06, + "loss": 2.6837, + "step": 55850 + }, + { + "epoch": 2.600274693297949, + "grad_norm": 0.2970224653972414, + "learning_rate": 5.308511792510506e-06, + "loss": 2.6396, + "step": 55851 + }, + { + "epoch": 2.600321251484042, + "grad_norm": 0.31127581716724473, + "learning_rate": 5.307297243127035e-06, + "loss": 2.7185, + "step": 55852 + }, + { + "epoch": 2.6003678096701353, + "grad_norm": 0.3151460851318722, + "learning_rate": 5.3060828249122505e-06, + "loss": 2.6392, + "step": 55853 + }, + { + "epoch": 2.6004143678562284, + "grad_norm": 0.30994476500853674, + "learning_rate": 5.304868537869706e-06, + "loss": 2.5995, + "step": 55854 + }, + { + "epoch": 2.6004609260423215, + "grad_norm": 0.3273110023363933, + "learning_rate": 5.303654382002981e-06, + "loss": 2.5246, + "step": 55855 + }, + { + "epoch": 2.6005074842284146, + "grad_norm": 0.3450178385769018, + "learning_rate": 5.302440357315608e-06, + "loss": 2.533, + "step": 55856 + }, + { + "epoch": 2.6005540424145073, + "grad_norm": 0.3134214666783488, + "learning_rate": 5.3012264638111875e-06, + "loss": 2.5975, + "step": 55857 + }, + { + "epoch": 2.6006006006006004, + "grad_norm": 0.3031876019997127, + "learning_rate": 5.300012701493256e-06, + "loss": 2.6827, + "step": 55858 + }, + { + "epoch": 2.6006471587866935, + "grad_norm": 0.3091778512807612, + "learning_rate": 
5.298799070365379e-06, + "loss": 2.6321, + "step": 55859 + }, + { + "epoch": 2.6006937169727866, + "grad_norm": 0.32066490459522595, + "learning_rate": 5.2975855704311295e-06, + "loss": 2.6634, + "step": 55860 + }, + { + "epoch": 2.6007402751588797, + "grad_norm": 0.31891739048461826, + "learning_rate": 5.2963722016940555e-06, + "loss": 2.69, + "step": 55861 + }, + { + "epoch": 2.600786833344973, + "grad_norm": 0.3040089322991478, + "learning_rate": 5.295158964157726e-06, + "loss": 2.6998, + "step": 55862 + }, + { + "epoch": 2.600833391531066, + "grad_norm": 0.30677959703580737, + "learning_rate": 5.2939458578257166e-06, + "loss": 2.5816, + "step": 55863 + }, + { + "epoch": 2.600879949717159, + "grad_norm": 0.2896336875663561, + "learning_rate": 5.2927328827015565e-06, + "loss": 2.5141, + "step": 55864 + }, + { + "epoch": 2.600926507903252, + "grad_norm": 0.29929776167578653, + "learning_rate": 5.291520038788822e-06, + "loss": 2.5865, + "step": 55865 + }, + { + "epoch": 2.6009730660893453, + "grad_norm": 0.29885425955851486, + "learning_rate": 5.290307326091071e-06, + "loss": 2.566, + "step": 55866 + }, + { + "epoch": 2.6010196242754384, + "grad_norm": 0.30614735243658553, + "learning_rate": 5.289094744611866e-06, + "loss": 2.6346, + "step": 55867 + }, + { + "epoch": 2.6010661824615315, + "grad_norm": 0.31897989534047033, + "learning_rate": 5.287882294354773e-06, + "loss": 2.6001, + "step": 55868 + }, + { + "epoch": 2.601112740647624, + "grad_norm": 0.3017962303154351, + "learning_rate": 5.2866699753233165e-06, + "loss": 2.5363, + "step": 55869 + }, + { + "epoch": 2.6011592988337173, + "grad_norm": 0.2890220000572146, + "learning_rate": 5.285457787521103e-06, + "loss": 2.6266, + "step": 55870 + }, + { + "epoch": 2.6012058570198104, + "grad_norm": 0.3010636226795604, + "learning_rate": 5.284245730951653e-06, + "loss": 2.6576, + "step": 55871 + }, + { + "epoch": 2.6012524152059036, + "grad_norm": 0.31620821375042707, + "learning_rate": 5.283033805618542e-06, + "loss": 2.6654, + "step": 55872 + }, + { + "epoch": 2.6012989733919967, + "grad_norm": 0.2938378646258069, + "learning_rate": 5.281822011525317e-06, + "loss": 2.5765, + "step": 55873 + }, + { + "epoch": 2.60134553157809, + "grad_norm": 0.2929211956572916, + "learning_rate": 5.280610348675536e-06, + "loss": 2.6055, + "step": 55874 + }, + { + "epoch": 2.601392089764183, + "grad_norm": 0.2873203794366726, + "learning_rate": 5.279398817072762e-06, + "loss": 2.6949, + "step": 55875 + }, + { + "epoch": 2.601438647950276, + "grad_norm": 0.3018492545693593, + "learning_rate": 5.278187416720554e-06, + "loss": 2.7337, + "step": 55876 + }, + { + "epoch": 2.6014852061363687, + "grad_norm": 0.2918361004910636, + "learning_rate": 5.276976147622454e-06, + "loss": 2.5296, + "step": 55877 + }, + { + "epoch": 2.601531764322462, + "grad_norm": 0.30063941806180844, + "learning_rate": 5.275765009782019e-06, + "loss": 2.5687, + "step": 55878 + }, + { + "epoch": 2.601578322508555, + "grad_norm": 0.29825899832408637, + "learning_rate": 5.274554003202814e-06, + "loss": 2.6237, + "step": 55879 + }, + { + "epoch": 2.601624880694648, + "grad_norm": 0.307807179005932, + "learning_rate": 5.27334312788838e-06, + "loss": 2.717, + "step": 55880 + }, + { + "epoch": 2.601671438880741, + "grad_norm": 0.29027263552201016, + "learning_rate": 5.272132383842293e-06, + "loss": 2.5825, + "step": 55881 + }, + { + "epoch": 2.6017179970668343, + "grad_norm": 0.3080409467655547, + "learning_rate": 5.270921771068066e-06, + "loss": 2.4801, + "step": 55882 + }, + { + "epoch": 
2.6017645552529274, + "grad_norm": 0.3093348933362828, + "learning_rate": 5.269711289569301e-06, + "loss": 2.6986, + "step": 55883 + }, + { + "epoch": 2.6018111134390205, + "grad_norm": 0.2949064286762944, + "learning_rate": 5.2685009393495135e-06, + "loss": 2.644, + "step": 55884 + }, + { + "epoch": 2.6018576716251136, + "grad_norm": 0.31862577139519294, + "learning_rate": 5.267290720412271e-06, + "loss": 2.7244, + "step": 55885 + }, + { + "epoch": 2.6019042298112067, + "grad_norm": 0.29953855887838743, + "learning_rate": 5.266080632761122e-06, + "loss": 2.6711, + "step": 55886 + }, + { + "epoch": 2.6019507879973, + "grad_norm": 0.29222281226646424, + "learning_rate": 5.264870676399619e-06, + "loss": 2.6011, + "step": 55887 + }, + { + "epoch": 2.601997346183393, + "grad_norm": 0.302476809474789, + "learning_rate": 5.263660851331315e-06, + "loss": 2.6112, + "step": 55888 + }, + { + "epoch": 2.6020439043694856, + "grad_norm": 0.30517895144659984, + "learning_rate": 5.2624511575597666e-06, + "loss": 2.6294, + "step": 55889 + }, + { + "epoch": 2.6020904625555787, + "grad_norm": 0.2864667205101526, + "learning_rate": 5.2612415950885115e-06, + "loss": 2.5482, + "step": 55890 + }, + { + "epoch": 2.602137020741672, + "grad_norm": 0.2955976921085686, + "learning_rate": 5.260032163921103e-06, + "loss": 2.5845, + "step": 55891 + }, + { + "epoch": 2.602183578927765, + "grad_norm": 0.27745472735920623, + "learning_rate": 5.258822864061092e-06, + "loss": 2.6375, + "step": 55892 + }, + { + "epoch": 2.602230137113858, + "grad_norm": 0.30165450243160064, + "learning_rate": 5.257613695512026e-06, + "loss": 2.6551, + "step": 55893 + }, + { + "epoch": 2.602276695299951, + "grad_norm": 0.3071007225432747, + "learning_rate": 5.256404658277469e-06, + "loss": 2.6526, + "step": 55894 + }, + { + "epoch": 2.6023232534860443, + "grad_norm": 0.2895440756551188, + "learning_rate": 5.255195752360931e-06, + "loss": 2.6271, + "step": 55895 + }, + { + "epoch": 2.602369811672137, + "grad_norm": 0.3022229915153189, + "learning_rate": 5.253986977766012e-06, + "loss": 2.661, + "step": 55896 + }, + { + "epoch": 2.60241636985823, + "grad_norm": 0.3036697341882302, + "learning_rate": 5.252778334496211e-06, + "loss": 2.673, + "step": 55897 + }, + { + "epoch": 2.602462928044323, + "grad_norm": 0.2876892563208048, + "learning_rate": 5.251569822555114e-06, + "loss": 2.5807, + "step": 55898 + }, + { + "epoch": 2.6025094862304163, + "grad_norm": 0.2932003907645239, + "learning_rate": 5.250361441946244e-06, + "loss": 2.6973, + "step": 55899 + }, + { + "epoch": 2.6025560444165095, + "grad_norm": 0.30469856007526286, + "learning_rate": 5.249153192673157e-06, + "loss": 2.6745, + "step": 55900 + }, + { + "epoch": 2.6026026026026026, + "grad_norm": 0.29000622877243826, + "learning_rate": 5.247945074739391e-06, + "loss": 2.6812, + "step": 55901 + }, + { + "epoch": 2.6026491607886957, + "grad_norm": 0.30517579984270743, + "learning_rate": 5.246737088148512e-06, + "loss": 2.6919, + "step": 55902 + }, + { + "epoch": 2.602695718974789, + "grad_norm": 0.28941911880101584, + "learning_rate": 5.245529232904039e-06, + "loss": 2.6277, + "step": 55903 + }, + { + "epoch": 2.602742277160882, + "grad_norm": 0.28198330606278205, + "learning_rate": 5.244321509009526e-06, + "loss": 2.6683, + "step": 55904 + }, + { + "epoch": 2.602788835346975, + "grad_norm": 0.32330728407309406, + "learning_rate": 5.243113916468523e-06, + "loss": 2.6076, + "step": 55905 + }, + { + "epoch": 2.602835393533068, + "grad_norm": 0.29831284655099255, + "learning_rate": 
5.241906455284573e-06, + "loss": 2.7047, + "step": 55906 + }, + { + "epoch": 2.6028819517191613, + "grad_norm": 0.2925659213459927, + "learning_rate": 5.2406991254612295e-06, + "loss": 2.5706, + "step": 55907 + }, + { + "epoch": 2.602928509905254, + "grad_norm": 0.31474573921000953, + "learning_rate": 5.2394919270019994e-06, + "loss": 2.6753, + "step": 55908 + }, + { + "epoch": 2.602975068091347, + "grad_norm": 0.29830100535844045, + "learning_rate": 5.2382848599104695e-06, + "loss": 2.5401, + "step": 55909 + }, + { + "epoch": 2.60302162627744, + "grad_norm": 0.30943071193892696, + "learning_rate": 5.237077924190148e-06, + "loss": 2.6339, + "step": 55910 + }, + { + "epoch": 2.6030681844635333, + "grad_norm": 0.29645800519677457, + "learning_rate": 5.235871119844616e-06, + "loss": 2.6897, + "step": 55911 + }, + { + "epoch": 2.6031147426496264, + "grad_norm": 0.3052769058742646, + "learning_rate": 5.2346644468773745e-06, + "loss": 2.6954, + "step": 55912 + }, + { + "epoch": 2.6031613008357195, + "grad_norm": 0.29595329012547655, + "learning_rate": 5.233457905291988e-06, + "loss": 2.6903, + "step": 55913 + }, + { + "epoch": 2.6032078590218126, + "grad_norm": 0.2991699232282981, + "learning_rate": 5.232251495091988e-06, + "loss": 2.6682, + "step": 55914 + }, + { + "epoch": 2.6032544172079057, + "grad_norm": 0.2970775530972791, + "learning_rate": 5.231045216280928e-06, + "loss": 2.6277, + "step": 55915 + }, + { + "epoch": 2.6033009753939984, + "grad_norm": 0.2882521983233316, + "learning_rate": 5.229839068862341e-06, + "loss": 2.5555, + "step": 55916 + }, + { + "epoch": 2.6033475335800915, + "grad_norm": 0.3072874044462308, + "learning_rate": 5.22863305283976e-06, + "loss": 2.6508, + "step": 55917 + }, + { + "epoch": 2.6033940917661846, + "grad_norm": 0.30293310144694413, + "learning_rate": 5.227427168216731e-06, + "loss": 2.655, + "step": 55918 + }, + { + "epoch": 2.6034406499522778, + "grad_norm": 0.300611614277745, + "learning_rate": 5.22622141499679e-06, + "loss": 2.5521, + "step": 55919 + }, + { + "epoch": 2.603487208138371, + "grad_norm": 0.31417462865572615, + "learning_rate": 5.225015793183485e-06, + "loss": 2.7006, + "step": 55920 + }, + { + "epoch": 2.603533766324464, + "grad_norm": 0.3055234256575399, + "learning_rate": 5.223810302780335e-06, + "loss": 2.5193, + "step": 55921 + }, + { + "epoch": 2.603580324510557, + "grad_norm": 0.3058498565975255, + "learning_rate": 5.222604943790904e-06, + "loss": 2.5875, + "step": 55922 + }, + { + "epoch": 2.60362688269665, + "grad_norm": 0.3117427472277889, + "learning_rate": 5.2213997162187004e-06, + "loss": 2.7015, + "step": 55923 + }, + { + "epoch": 2.6036734408827433, + "grad_norm": 0.30934295943175066, + "learning_rate": 5.220194620067298e-06, + "loss": 2.4985, + "step": 55924 + }, + { + "epoch": 2.6037199990688364, + "grad_norm": 0.2997855959117287, + "learning_rate": 5.2189896553402e-06, + "loss": 2.5181, + "step": 55925 + }, + { + "epoch": 2.6037665572549296, + "grad_norm": 0.31117568645687765, + "learning_rate": 5.217784822040955e-06, + "loss": 2.5501, + "step": 55926 + }, + { + "epoch": 2.6038131154410227, + "grad_norm": 0.31102202579697236, + "learning_rate": 5.216580120173103e-06, + "loss": 2.6154, + "step": 55927 + }, + { + "epoch": 2.6038596736271153, + "grad_norm": 0.2995926834211303, + "learning_rate": 5.215375549740176e-06, + "loss": 2.554, + "step": 55928 + }, + { + "epoch": 2.6039062318132085, + "grad_norm": 0.3069026121511814, + "learning_rate": 5.214171110745719e-06, + "loss": 2.7294, + "step": 55929 + }, + { + "epoch": 
2.6039527899993016, + "grad_norm": 0.3005650543356271, + "learning_rate": 5.212966803193248e-06, + "loss": 2.6253, + "step": 55930 + }, + { + "epoch": 2.6039993481853947, + "grad_norm": 0.3106075303837277, + "learning_rate": 5.211762627086303e-06, + "loss": 2.6673, + "step": 55931 + }, + { + "epoch": 2.604045906371488, + "grad_norm": 0.2990457942126802, + "learning_rate": 5.210558582428426e-06, + "loss": 2.6396, + "step": 55932 + }, + { + "epoch": 2.604092464557581, + "grad_norm": 0.30918442533302526, + "learning_rate": 5.2093546692231485e-06, + "loss": 2.7024, + "step": 55933 + }, + { + "epoch": 2.604139022743674, + "grad_norm": 0.30048652511662094, + "learning_rate": 5.208150887474e-06, + "loss": 2.6405, + "step": 55934 + }, + { + "epoch": 2.6041855809297667, + "grad_norm": 0.2888358690843604, + "learning_rate": 5.206947237184529e-06, + "loss": 2.6675, + "step": 55935 + }, + { + "epoch": 2.60423213911586, + "grad_norm": 0.27675777617553915, + "learning_rate": 5.20574371835823e-06, + "loss": 2.4774, + "step": 55936 + }, + { + "epoch": 2.604278697301953, + "grad_norm": 0.3042825324868016, + "learning_rate": 5.204540330998681e-06, + "loss": 2.5937, + "step": 55937 + }, + { + "epoch": 2.604325255488046, + "grad_norm": 0.3024897247267927, + "learning_rate": 5.203337075109388e-06, + "loss": 2.6909, + "step": 55938 + }, + { + "epoch": 2.604371813674139, + "grad_norm": 0.27822427357034674, + "learning_rate": 5.202133950693883e-06, + "loss": 2.6984, + "step": 55939 + }, + { + "epoch": 2.6044183718602323, + "grad_norm": 0.29231276013890856, + "learning_rate": 5.200930957755701e-06, + "loss": 2.6822, + "step": 55940 + }, + { + "epoch": 2.6044649300463254, + "grad_norm": 0.30987054826703275, + "learning_rate": 5.199728096298378e-06, + "loss": 2.5215, + "step": 55941 + }, + { + "epoch": 2.6045114882324185, + "grad_norm": 0.3020696888295901, + "learning_rate": 5.198525366325446e-06, + "loss": 2.6522, + "step": 55942 + }, + { + "epoch": 2.6045580464185116, + "grad_norm": 0.2968654168538282, + "learning_rate": 5.197322767840418e-06, + "loss": 2.572, + "step": 55943 + }, + { + "epoch": 2.6046046046046047, + "grad_norm": 0.3174670266071174, + "learning_rate": 5.196120300846835e-06, + "loss": 2.5513, + "step": 55944 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 0.296277748835413, + "learning_rate": 5.194917965348223e-06, + "loss": 2.5217, + "step": 55945 + }, + { + "epoch": 2.604697720976791, + "grad_norm": 0.3177967796703292, + "learning_rate": 5.193715761348117e-06, + "loss": 2.5946, + "step": 55946 + }, + { + "epoch": 2.604744279162884, + "grad_norm": 0.3042443462248287, + "learning_rate": 5.192513688850037e-06, + "loss": 2.5505, + "step": 55947 + }, + { + "epoch": 2.6047908373489768, + "grad_norm": 0.28957465354279976, + "learning_rate": 5.191311747857524e-06, + "loss": 2.6142, + "step": 55948 + }, + { + "epoch": 2.60483739553507, + "grad_norm": 0.30934523438223177, + "learning_rate": 5.190109938374083e-06, + "loss": 2.6377, + "step": 55949 + }, + { + "epoch": 2.604883953721163, + "grad_norm": 0.3043067949976136, + "learning_rate": 5.188908260403269e-06, + "loss": 2.6792, + "step": 55950 + }, + { + "epoch": 2.604930511907256, + "grad_norm": 0.2862368034211706, + "learning_rate": 5.187706713948587e-06, + "loss": 2.6912, + "step": 55951 + }, + { + "epoch": 2.6049770700933492, + "grad_norm": 0.3134250597346272, + "learning_rate": 5.186505299013572e-06, + "loss": 2.5992, + "step": 55952 + }, + { + "epoch": 2.6050236282794423, + "grad_norm": 0.2930732558994883, + "learning_rate": 
5.18530401560175e-06, + "loss": 2.5864, + "step": 55953 + }, + { + "epoch": 2.6050701864655355, + "grad_norm": 0.3008308123914247, + "learning_rate": 5.184102863716644e-06, + "loss": 2.5768, + "step": 55954 + }, + { + "epoch": 2.605116744651628, + "grad_norm": 0.2983950862117424, + "learning_rate": 5.182901843361787e-06, + "loss": 2.6053, + "step": 55955 + }, + { + "epoch": 2.6051633028377212, + "grad_norm": 0.299564728974003, + "learning_rate": 5.181700954540691e-06, + "loss": 2.5994, + "step": 55956 + }, + { + "epoch": 2.6052098610238144, + "grad_norm": 0.3100364520190179, + "learning_rate": 5.180500197256888e-06, + "loss": 2.6668, + "step": 55957 + }, + { + "epoch": 2.6052564192099075, + "grad_norm": 0.3184012810102974, + "learning_rate": 5.179299571513902e-06, + "loss": 2.7258, + "step": 55958 + }, + { + "epoch": 2.6053029773960006, + "grad_norm": 0.29447790027527093, + "learning_rate": 5.178099077315257e-06, + "loss": 2.5965, + "step": 55959 + }, + { + "epoch": 2.6053495355820937, + "grad_norm": 0.29195490931098617, + "learning_rate": 5.176898714664474e-06, + "loss": 2.5768, + "step": 55960 + }, + { + "epoch": 2.605396093768187, + "grad_norm": 0.2958214237097273, + "learning_rate": 5.175698483565089e-06, + "loss": 2.6463, + "step": 55961 + }, + { + "epoch": 2.60544265195428, + "grad_norm": 0.30340925842787564, + "learning_rate": 5.174498384020587e-06, + "loss": 2.6224, + "step": 55962 + }, + { + "epoch": 2.605489210140373, + "grad_norm": 0.2901268825798365, + "learning_rate": 5.173298416034544e-06, + "loss": 2.6765, + "step": 55963 + }, + { + "epoch": 2.605535768326466, + "grad_norm": 0.307606923270428, + "learning_rate": 5.172098579610441e-06, + "loss": 2.6635, + "step": 55964 + }, + { + "epoch": 2.6055823265125593, + "grad_norm": 0.31130492213247424, + "learning_rate": 5.170898874751812e-06, + "loss": 2.6746, + "step": 55965 + }, + { + "epoch": 2.6056288846986524, + "grad_norm": 0.29718384743449344, + "learning_rate": 5.169699301462177e-06, + "loss": 2.5655, + "step": 55966 + }, + { + "epoch": 2.605675442884745, + "grad_norm": 0.30108632467639646, + "learning_rate": 5.168499859745057e-06, + "loss": 2.6377, + "step": 55967 + }, + { + "epoch": 2.605722001070838, + "grad_norm": 0.29184232865122095, + "learning_rate": 5.167300549603987e-06, + "loss": 2.5602, + "step": 55968 + }, + { + "epoch": 2.6057685592569313, + "grad_norm": 0.2972043540847757, + "learning_rate": 5.166101371042454e-06, + "loss": 2.5639, + "step": 55969 + }, + { + "epoch": 2.6058151174430244, + "grad_norm": 0.29672521811981944, + "learning_rate": 5.1649023240640135e-06, + "loss": 2.6152, + "step": 55970 + }, + { + "epoch": 2.6058616756291175, + "grad_norm": 0.298721606073769, + "learning_rate": 5.16370340867216e-06, + "loss": 2.5445, + "step": 55971 + }, + { + "epoch": 2.6059082338152106, + "grad_norm": 0.3052646580394725, + "learning_rate": 5.162504624870418e-06, + "loss": 2.6326, + "step": 55972 + }, + { + "epoch": 2.6059547920013038, + "grad_norm": 0.29809724217314815, + "learning_rate": 5.161305972662306e-06, + "loss": 2.5505, + "step": 55973 + }, + { + "epoch": 2.6060013501873964, + "grad_norm": 0.3105987722412769, + "learning_rate": 5.16010745205136e-06, + "loss": 2.6457, + "step": 55974 + }, + { + "epoch": 2.6060479083734895, + "grad_norm": 0.30199488884640896, + "learning_rate": 5.158909063041057e-06, + "loss": 2.6035, + "step": 55975 + }, + { + "epoch": 2.6060944665595827, + "grad_norm": 0.3002165228862286, + "learning_rate": 5.157710805634958e-06, + "loss": 2.5903, + "step": 55976 + }, + { + "epoch": 
2.6061410247456758, + "grad_norm": 0.315383555822937, + "learning_rate": 5.1565126798365505e-06, + "loss": 2.6741, + "step": 55977 + }, + { + "epoch": 2.606187582931769, + "grad_norm": 0.31812813666766576, + "learning_rate": 5.155314685649359e-06, + "loss": 2.6467, + "step": 55978 + }, + { + "epoch": 2.606234141117862, + "grad_norm": 0.3093812610009649, + "learning_rate": 5.154116823076905e-06, + "loss": 2.6323, + "step": 55979 + }, + { + "epoch": 2.606280699303955, + "grad_norm": 0.3066469768716919, + "learning_rate": 5.152919092122693e-06, + "loss": 2.6322, + "step": 55980 + }, + { + "epoch": 2.6063272574900482, + "grad_norm": 0.3231698173519582, + "learning_rate": 5.151721492790262e-06, + "loss": 2.6041, + "step": 55981 + }, + { + "epoch": 2.6063738156761413, + "grad_norm": 0.2900677014219572, + "learning_rate": 5.150524025083087e-06, + "loss": 2.5892, + "step": 55982 + }, + { + "epoch": 2.6064203738622345, + "grad_norm": 0.29202210362012737, + "learning_rate": 5.1493266890047245e-06, + "loss": 2.5266, + "step": 55983 + }, + { + "epoch": 2.6064669320483276, + "grad_norm": 0.3091736983086228, + "learning_rate": 5.148129484558661e-06, + "loss": 2.6778, + "step": 55984 + }, + { + "epoch": 2.6065134902344207, + "grad_norm": 0.3041589088089761, + "learning_rate": 5.146932411748423e-06, + "loss": 2.5036, + "step": 55985 + }, + { + "epoch": 2.606560048420514, + "grad_norm": 0.30656067558318656, + "learning_rate": 5.145735470577517e-06, + "loss": 2.5417, + "step": 55986 + }, + { + "epoch": 2.6066066066066065, + "grad_norm": 0.2921775104196898, + "learning_rate": 5.144538661049458e-06, + "loss": 2.6344, + "step": 55987 + }, + { + "epoch": 2.6066531647926996, + "grad_norm": 0.3056345482885462, + "learning_rate": 5.143341983167754e-06, + "loss": 2.5452, + "step": 55988 + }, + { + "epoch": 2.6066997229787927, + "grad_norm": 0.3065273318898171, + "learning_rate": 5.142145436935941e-06, + "loss": 2.6379, + "step": 55989 + }, + { + "epoch": 2.606746281164886, + "grad_norm": 0.2965914401457677, + "learning_rate": 5.140949022357494e-06, + "loss": 2.5571, + "step": 55990 + }, + { + "epoch": 2.606792839350979, + "grad_norm": 0.3300765757838613, + "learning_rate": 5.139752739435949e-06, + "loss": 2.5406, + "step": 55991 + }, + { + "epoch": 2.606839397537072, + "grad_norm": 0.32341024620540043, + "learning_rate": 5.138556588174803e-06, + "loss": 2.5434, + "step": 55992 + }, + { + "epoch": 2.606885955723165, + "grad_norm": 0.307098010677503, + "learning_rate": 5.137360568577582e-06, + "loss": 2.5962, + "step": 55993 + }, + { + "epoch": 2.606932513909258, + "grad_norm": 0.3007150457046029, + "learning_rate": 5.136164680647793e-06, + "loss": 2.5448, + "step": 55994 + }, + { + "epoch": 2.606979072095351, + "grad_norm": 0.3010237986622173, + "learning_rate": 5.134968924388922e-06, + "loss": 2.6709, + "step": 55995 + }, + { + "epoch": 2.607025630281444, + "grad_norm": 0.30193428113372556, + "learning_rate": 5.1337732998045175e-06, + "loss": 2.6078, + "step": 55996 + }, + { + "epoch": 2.607072188467537, + "grad_norm": 0.3074599961648895, + "learning_rate": 5.132577806898059e-06, + "loss": 2.6456, + "step": 55997 + }, + { + "epoch": 2.6071187466536303, + "grad_norm": 0.3235034383999209, + "learning_rate": 5.131382445673061e-06, + "loss": 2.596, + "step": 55998 + }, + { + "epoch": 2.6071653048397234, + "grad_norm": 0.2956543273955564, + "learning_rate": 5.1301872161330365e-06, + "loss": 2.5682, + "step": 55999 + }, + { + "epoch": 2.6072118630258165, + "grad_norm": 0.2937624884838675, + "learning_rate": 
5.128992118281495e-06, + "loss": 2.697, + "step": 56000 + }, + { + "epoch": 2.6072584212119096, + "grad_norm": 0.2852769823604441, + "learning_rate": 5.127797152121938e-06, + "loss": 2.5364, + "step": 56001 + }, + { + "epoch": 2.6073049793980028, + "grad_norm": 0.30186425774701486, + "learning_rate": 5.126602317657886e-06, + "loss": 2.6591, + "step": 56002 + }, + { + "epoch": 2.607351537584096, + "grad_norm": 0.30382220844304336, + "learning_rate": 5.1254076148928294e-06, + "loss": 2.6727, + "step": 56003 + }, + { + "epoch": 2.607398095770189, + "grad_norm": 0.3032776141972891, + "learning_rate": 5.124213043830279e-06, + "loss": 2.6708, + "step": 56004 + }, + { + "epoch": 2.607444653956282, + "grad_norm": 0.30099955737821593, + "learning_rate": 5.12301860447374e-06, + "loss": 2.5321, + "step": 56005 + }, + { + "epoch": 2.607491212142375, + "grad_norm": 0.2983868006271271, + "learning_rate": 5.121824296826722e-06, + "loss": 2.5746, + "step": 56006 + }, + { + "epoch": 2.607537770328468, + "grad_norm": 0.306470811907398, + "learning_rate": 5.1206301208927335e-06, + "loss": 2.6298, + "step": 56007 + }, + { + "epoch": 2.607584328514561, + "grad_norm": 0.3003909215508907, + "learning_rate": 5.11943607667526e-06, + "loss": 2.6164, + "step": 56008 + }, + { + "epoch": 2.607630886700654, + "grad_norm": 0.29224636113756813, + "learning_rate": 5.118242164177839e-06, + "loss": 2.6297, + "step": 56009 + }, + { + "epoch": 2.6076774448867472, + "grad_norm": 0.2932316110169427, + "learning_rate": 5.117048383403944e-06, + "loss": 2.5222, + "step": 56010 + }, + { + "epoch": 2.6077240030728404, + "grad_norm": 0.2854779417518986, + "learning_rate": 5.1158547343570955e-06, + "loss": 2.4961, + "step": 56011 + }, + { + "epoch": 2.6077705612589335, + "grad_norm": 0.3124639164499043, + "learning_rate": 5.1146612170407835e-06, + "loss": 2.7381, + "step": 56012 + }, + { + "epoch": 2.6078171194450266, + "grad_norm": 0.31501147321764733, + "learning_rate": 5.113467831458524e-06, + "loss": 2.6608, + "step": 56013 + }, + { + "epoch": 2.6078636776311193, + "grad_norm": 0.3377313536542505, + "learning_rate": 5.112274577613812e-06, + "loss": 2.5929, + "step": 56014 + }, + { + "epoch": 2.6079102358172124, + "grad_norm": 0.29934454588806336, + "learning_rate": 5.111081455510164e-06, + "loss": 2.5981, + "step": 56015 + }, + { + "epoch": 2.6079567940033055, + "grad_norm": 0.30387374794213556, + "learning_rate": 5.109888465151052e-06, + "loss": 2.7033, + "step": 56016 + }, + { + "epoch": 2.6080033521893986, + "grad_norm": 0.2938804715976114, + "learning_rate": 5.108695606540009e-06, + "loss": 2.647, + "step": 56017 + }, + { + "epoch": 2.6080499103754917, + "grad_norm": 0.29620659621106193, + "learning_rate": 5.1075028796805145e-06, + "loss": 2.5266, + "step": 56018 + }, + { + "epoch": 2.608096468561585, + "grad_norm": 0.28878690620766456, + "learning_rate": 5.106310284576077e-06, + "loss": 2.5477, + "step": 56019 + }, + { + "epoch": 2.608143026747678, + "grad_norm": 0.29879948253305844, + "learning_rate": 5.105117821230204e-06, + "loss": 2.661, + "step": 56020 + }, + { + "epoch": 2.608189584933771, + "grad_norm": 0.3079406528225213, + "learning_rate": 5.103925489646372e-06, + "loss": 2.5691, + "step": 56021 + }, + { + "epoch": 2.608236143119864, + "grad_norm": 0.29285847578167673, + "learning_rate": 5.10273328982811e-06, + "loss": 2.6215, + "step": 56022 + }, + { + "epoch": 2.6082827013059573, + "grad_norm": 0.31387808559812463, + "learning_rate": 5.101541221778888e-06, + "loss": 2.6905, + "step": 56023 + }, + { + "epoch": 
2.6083292594920504, + "grad_norm": 0.29961609591833516, + "learning_rate": 5.100349285502237e-06, + "loss": 2.5603, + "step": 56024 + }, + { + "epoch": 2.6083758176781435, + "grad_norm": 0.31390154806844933, + "learning_rate": 5.099157481001626e-06, + "loss": 2.7847, + "step": 56025 + }, + { + "epoch": 2.608422375864236, + "grad_norm": 0.28982662968319417, + "learning_rate": 5.097965808280563e-06, + "loss": 2.5512, + "step": 56026 + }, + { + "epoch": 2.6084689340503293, + "grad_norm": 0.28855411539615217, + "learning_rate": 5.096774267342552e-06, + "loss": 2.5756, + "step": 56027 + }, + { + "epoch": 2.6085154922364224, + "grad_norm": 0.2815582666863899, + "learning_rate": 5.095582858191083e-06, + "loss": 2.6054, + "step": 56028 + }, + { + "epoch": 2.6085620504225155, + "grad_norm": 0.32536427478859503, + "learning_rate": 5.094391580829655e-06, + "loss": 2.5712, + "step": 56029 + }, + { + "epoch": 2.6086086086086087, + "grad_norm": 0.30361083130463346, + "learning_rate": 5.093200435261769e-06, + "loss": 2.6009, + "step": 56030 + }, + { + "epoch": 2.6086551667947018, + "grad_norm": 0.31041612536247637, + "learning_rate": 5.092009421490912e-06, + "loss": 2.653, + "step": 56031 + }, + { + "epoch": 2.608701724980795, + "grad_norm": 0.29270730401412065, + "learning_rate": 5.0908185395205755e-06, + "loss": 2.5289, + "step": 56032 + }, + { + "epoch": 2.6087482831668876, + "grad_norm": 0.3043858306806252, + "learning_rate": 5.0896277893542786e-06, + "loss": 2.7132, + "step": 56033 + }, + { + "epoch": 2.6087948413529807, + "grad_norm": 0.29602909270695127, + "learning_rate": 5.08843717099548e-06, + "loss": 2.7079, + "step": 56034 + }, + { + "epoch": 2.608841399539074, + "grad_norm": 0.299550434923349, + "learning_rate": 5.087246684447711e-06, + "loss": 2.622, + "step": 56035 + }, + { + "epoch": 2.608887957725167, + "grad_norm": 0.2855012693725673, + "learning_rate": 5.086056329714428e-06, + "loss": 2.5952, + "step": 56036 + }, + { + "epoch": 2.60893451591126, + "grad_norm": 0.28974844123905996, + "learning_rate": 5.084866106799169e-06, + "loss": 2.6517, + "step": 56037 + }, + { + "epoch": 2.608981074097353, + "grad_norm": 0.2986512450810395, + "learning_rate": 5.083676015705385e-06, + "loss": 2.604, + "step": 56038 + }, + { + "epoch": 2.6090276322834463, + "grad_norm": 0.29871696449355345, + "learning_rate": 5.0824860564365975e-06, + "loss": 2.5782, + "step": 56039 + }, + { + "epoch": 2.6090741904695394, + "grad_norm": 0.3014604795676558, + "learning_rate": 5.0812962289962796e-06, + "loss": 2.665, + "step": 56040 + }, + { + "epoch": 2.6091207486556325, + "grad_norm": 0.2880200307839094, + "learning_rate": 5.080106533387935e-06, + "loss": 2.6227, + "step": 56041 + }, + { + "epoch": 2.6091673068417256, + "grad_norm": 0.2958226159187139, + "learning_rate": 5.078916969615055e-06, + "loss": 2.6503, + "step": 56042 + }, + { + "epoch": 2.6092138650278187, + "grad_norm": 0.31012686595605565, + "learning_rate": 5.077727537681137e-06, + "loss": 2.6906, + "step": 56043 + }, + { + "epoch": 2.609260423213912, + "grad_norm": 0.292452016443257, + "learning_rate": 5.076538237589651e-06, + "loss": 2.6997, + "step": 56044 + }, + { + "epoch": 2.6093069814000045, + "grad_norm": 0.3020163798702113, + "learning_rate": 5.075349069344098e-06, + "loss": 2.6966, + "step": 56045 + }, + { + "epoch": 2.6093535395860976, + "grad_norm": 0.3053536518642722, + "learning_rate": 5.0741600329479725e-06, + "loss": 2.6028, + "step": 56046 + }, + { + "epoch": 2.6094000977721907, + "grad_norm": 0.3022099374299909, + "learning_rate": 
5.072971128404763e-06, + "loss": 2.6342, + "step": 56047 + }, + { + "epoch": 2.609446655958284, + "grad_norm": 0.2888934144129623, + "learning_rate": 5.071782355717964e-06, + "loss": 2.4737, + "step": 56048 + }, + { + "epoch": 2.609493214144377, + "grad_norm": 0.30722429962356945, + "learning_rate": 5.070593714891042e-06, + "loss": 2.5991, + "step": 56049 + }, + { + "epoch": 2.60953977233047, + "grad_norm": 0.3088276315028517, + "learning_rate": 5.069405205927513e-06, + "loss": 2.6308, + "step": 56050 + }, + { + "epoch": 2.609586330516563, + "grad_norm": 0.3008036976951851, + "learning_rate": 5.068216828830852e-06, + "loss": 2.6051, + "step": 56051 + }, + { + "epoch": 2.6096328887026563, + "grad_norm": 0.28845424978398554, + "learning_rate": 5.067028583604544e-06, + "loss": 2.546, + "step": 56052 + }, + { + "epoch": 2.609679446888749, + "grad_norm": 0.30118540472872557, + "learning_rate": 5.0658404702520816e-06, + "loss": 2.643, + "step": 56053 + }, + { + "epoch": 2.609726005074842, + "grad_norm": 0.3057891571345989, + "learning_rate": 5.06465248877695e-06, + "loss": 2.7018, + "step": 56054 + }, + { + "epoch": 2.609772563260935, + "grad_norm": 0.3258560154038519, + "learning_rate": 5.063464639182636e-06, + "loss": 2.6578, + "step": 56055 + }, + { + "epoch": 2.6098191214470283, + "grad_norm": 0.2943726971726635, + "learning_rate": 5.062276921472636e-06, + "loss": 2.5971, + "step": 56056 + }, + { + "epoch": 2.6098656796331214, + "grad_norm": 0.29546049669262925, + "learning_rate": 5.061089335650415e-06, + "loss": 2.4942, + "step": 56057 + }, + { + "epoch": 2.6099122378192146, + "grad_norm": 0.2957972368105768, + "learning_rate": 5.059901881719476e-06, + "loss": 2.5807, + "step": 56058 + }, + { + "epoch": 2.6099587960053077, + "grad_norm": 0.3030979813864281, + "learning_rate": 5.058714559683292e-06, + "loss": 2.6406, + "step": 56059 + }, + { + "epoch": 2.610005354191401, + "grad_norm": 0.305305636854661, + "learning_rate": 5.057527369545356e-06, + "loss": 2.5362, + "step": 56060 + }, + { + "epoch": 2.610051912377494, + "grad_norm": 0.31895370975987714, + "learning_rate": 5.056340311309155e-06, + "loss": 2.5501, + "step": 56061 + }, + { + "epoch": 2.610098470563587, + "grad_norm": 0.3095253362631625, + "learning_rate": 5.055153384978151e-06, + "loss": 2.6919, + "step": 56062 + }, + { + "epoch": 2.61014502874968, + "grad_norm": 0.30206692938910434, + "learning_rate": 5.053966590555864e-06, + "loss": 2.6495, + "step": 56063 + }, + { + "epoch": 2.6101915869357732, + "grad_norm": 0.32192626505066263, + "learning_rate": 5.0527799280457366e-06, + "loss": 2.5965, + "step": 56064 + }, + { + "epoch": 2.610238145121866, + "grad_norm": 0.30980623893728804, + "learning_rate": 5.051593397451293e-06, + "loss": 2.7826, + "step": 56065 + }, + { + "epoch": 2.610284703307959, + "grad_norm": 0.3326353917946477, + "learning_rate": 5.0504069987759865e-06, + "loss": 2.6287, + "step": 56066 + }, + { + "epoch": 2.610331261494052, + "grad_norm": 0.31129658294071916, + "learning_rate": 5.049220732023302e-06, + "loss": 2.6132, + "step": 56067 + }, + { + "epoch": 2.6103778196801453, + "grad_norm": 0.30129952253522096, + "learning_rate": 5.048034597196727e-06, + "loss": 2.7325, + "step": 56068 + }, + { + "epoch": 2.6104243778662384, + "grad_norm": 0.3059146578151697, + "learning_rate": 5.046848594299758e-06, + "loss": 2.5256, + "step": 56069 + }, + { + "epoch": 2.6104709360523315, + "grad_norm": 0.289271999817583, + "learning_rate": 5.045662723335842e-06, + "loss": 2.714, + "step": 56070 + }, + { + "epoch": 
2.6105174942384246, + "grad_norm": 0.2902196479618991, + "learning_rate": 5.044476984308483e-06, + "loss": 2.4096, + "step": 56071 + }, + { + "epoch": 2.6105640524245173, + "grad_norm": 0.3097350175131321, + "learning_rate": 5.043291377221154e-06, + "loss": 2.6765, + "step": 56072 + }, + { + "epoch": 2.6106106106106104, + "grad_norm": 0.30284862857439376, + "learning_rate": 5.042105902077332e-06, + "loss": 2.5536, + "step": 56073 + }, + { + "epoch": 2.6106571687967035, + "grad_norm": 0.3047576736513465, + "learning_rate": 5.040920558880513e-06, + "loss": 2.5634, + "step": 56074 + }, + { + "epoch": 2.6107037269827966, + "grad_norm": 0.29239218868219113, + "learning_rate": 5.0397353476341446e-06, + "loss": 2.6921, + "step": 56075 + }, + { + "epoch": 2.6107502851688897, + "grad_norm": 0.30969061555995553, + "learning_rate": 5.0385502683417405e-06, + "loss": 2.5971, + "step": 56076 + }, + { + "epoch": 2.610796843354983, + "grad_norm": 0.29220995158195323, + "learning_rate": 5.0373653210067425e-06, + "loss": 2.6329, + "step": 56077 + }, + { + "epoch": 2.610843401541076, + "grad_norm": 0.3035380352806348, + "learning_rate": 5.03618050563267e-06, + "loss": 2.6807, + "step": 56078 + }, + { + "epoch": 2.610889959727169, + "grad_norm": 0.2959348504229842, + "learning_rate": 5.034995822222966e-06, + "loss": 2.7956, + "step": 56079 + }, + { + "epoch": 2.610936517913262, + "grad_norm": 0.2978816791191268, + "learning_rate": 5.033811270781119e-06, + "loss": 2.5899, + "step": 56080 + }, + { + "epoch": 2.6109830760993553, + "grad_norm": 0.294099648417511, + "learning_rate": 5.032626851310607e-06, + "loss": 2.5809, + "step": 56081 + }, + { + "epoch": 2.6110296342854484, + "grad_norm": 0.28793930860962097, + "learning_rate": 5.0314425638149034e-06, + "loss": 2.6264, + "step": 56082 + }, + { + "epoch": 2.6110761924715415, + "grad_norm": 0.2916490583591159, + "learning_rate": 5.0302584082975e-06, + "loss": 2.7009, + "step": 56083 + }, + { + "epoch": 2.611122750657634, + "grad_norm": 0.29565852510968726, + "learning_rate": 5.029074384761845e-06, + "loss": 2.6045, + "step": 56084 + }, + { + "epoch": 2.6111693088437273, + "grad_norm": 0.3043049671906841, + "learning_rate": 5.027890493211429e-06, + "loss": 2.5994, + "step": 56085 + }, + { + "epoch": 2.6112158670298204, + "grad_norm": 0.2957800239187414, + "learning_rate": 5.026706733649722e-06, + "loss": 2.5471, + "step": 56086 + }, + { + "epoch": 2.6112624252159136, + "grad_norm": 0.28748639281585164, + "learning_rate": 5.025523106080204e-06, + "loss": 2.7353, + "step": 56087 + }, + { + "epoch": 2.6113089834020067, + "grad_norm": 0.28291996108744505, + "learning_rate": 5.024339610506335e-06, + "loss": 2.6666, + "step": 56088 + }, + { + "epoch": 2.6113555415881, + "grad_norm": 0.2813142533932237, + "learning_rate": 5.0231562469316155e-06, + "loss": 2.517, + "step": 56089 + }, + { + "epoch": 2.611402099774193, + "grad_norm": 0.313172543333267, + "learning_rate": 5.021973015359477e-06, + "loss": 2.5832, + "step": 56090 + }, + { + "epoch": 2.611448657960286, + "grad_norm": 0.30577060792856353, + "learning_rate": 5.020789915793439e-06, + "loss": 2.5669, + "step": 56091 + }, + { + "epoch": 2.6114952161463787, + "grad_norm": 0.29643037662050425, + "learning_rate": 5.0196069482369435e-06, + "loss": 2.6376, + "step": 56092 + }, + { + "epoch": 2.611541774332472, + "grad_norm": 0.30771434689897736, + "learning_rate": 5.0184241126934704e-06, + "loss": 2.6554, + "step": 56093 + }, + { + "epoch": 2.611588332518565, + "grad_norm": 0.2906096596659179, + "learning_rate": 
5.01724140916649e-06, + "loss": 2.6853, + "step": 56094 + }, + { + "epoch": 2.611634890704658, + "grad_norm": 0.2807109702740108, + "learning_rate": 5.016058837659471e-06, + "loss": 2.6161, + "step": 56095 + }, + { + "epoch": 2.611681448890751, + "grad_norm": 0.2815891936910165, + "learning_rate": 5.014876398175899e-06, + "loss": 2.6046, + "step": 56096 + }, + { + "epoch": 2.6117280070768443, + "grad_norm": 0.28692146186470596, + "learning_rate": 5.013694090719229e-06, + "loss": 2.5927, + "step": 56097 + }, + { + "epoch": 2.6117745652629374, + "grad_norm": 0.29543388645167196, + "learning_rate": 5.012511915292928e-06, + "loss": 2.5997, + "step": 56098 + }, + { + "epoch": 2.6118211234490305, + "grad_norm": 0.3018225335229946, + "learning_rate": 5.011329871900478e-06, + "loss": 2.7166, + "step": 56099 + }, + { + "epoch": 2.6118676816351236, + "grad_norm": 0.30902279930982307, + "learning_rate": 5.010147960545342e-06, + "loss": 2.6195, + "step": 56100 + }, + { + "epoch": 2.6119142398212167, + "grad_norm": 0.3052045975678594, + "learning_rate": 5.008966181230984e-06, + "loss": 2.5764, + "step": 56101 + }, + { + "epoch": 2.61196079800731, + "grad_norm": 0.31800245333411076, + "learning_rate": 5.0077845339608905e-06, + "loss": 2.6371, + "step": 56102 + }, + { + "epoch": 2.612007356193403, + "grad_norm": 0.29267051372409436, + "learning_rate": 5.006603018738499e-06, + "loss": 2.6373, + "step": 56103 + }, + { + "epoch": 2.6120539143794956, + "grad_norm": 0.33226767660832063, + "learning_rate": 5.005421635567314e-06, + "loss": 2.7269, + "step": 56104 + }, + { + "epoch": 2.6121004725655887, + "grad_norm": 0.3081923076707288, + "learning_rate": 5.004240384450776e-06, + "loss": 2.5341, + "step": 56105 + }, + { + "epoch": 2.612147030751682, + "grad_norm": 0.299308058163896, + "learning_rate": 5.003059265392357e-06, + "loss": 2.6792, + "step": 56106 + }, + { + "epoch": 2.612193588937775, + "grad_norm": 0.2953848324758426, + "learning_rate": 5.001878278395522e-06, + "loss": 2.5065, + "step": 56107 + }, + { + "epoch": 2.612240147123868, + "grad_norm": 0.3187620578837788, + "learning_rate": 5.000697423463746e-06, + "loss": 2.6845, + "step": 56108 + }, + { + "epoch": 2.612286705309961, + "grad_norm": 0.3095312899663746, + "learning_rate": 4.999516700600504e-06, + "loss": 2.6062, + "step": 56109 + }, + { + "epoch": 2.6123332634960543, + "grad_norm": 0.3058380795312241, + "learning_rate": 4.998336109809232e-06, + "loss": 2.6874, + "step": 56110 + }, + { + "epoch": 2.612379821682147, + "grad_norm": 0.31029067042675185, + "learning_rate": 4.99715565109341e-06, + "loss": 2.6129, + "step": 56111 + }, + { + "epoch": 2.61242637986824, + "grad_norm": 0.29431215924034154, + "learning_rate": 4.995975324456503e-06, + "loss": 2.7237, + "step": 56112 + }, + { + "epoch": 2.6124729380543332, + "grad_norm": 0.29037186967445267, + "learning_rate": 4.9947951299019745e-06, + "loss": 2.6432, + "step": 56113 + }, + { + "epoch": 2.6125194962404263, + "grad_norm": 0.3078802279492739, + "learning_rate": 4.993615067433288e-06, + "loss": 2.6715, + "step": 56114 + }, + { + "epoch": 2.6125660544265195, + "grad_norm": 0.30980363675898537, + "learning_rate": 4.992435137053919e-06, + "loss": 2.6743, + "step": 56115 + }, + { + "epoch": 2.6126126126126126, + "grad_norm": 0.30361480656979584, + "learning_rate": 4.991255338767298e-06, + "loss": 2.4863, + "step": 56116 + }, + { + "epoch": 2.6126591707987057, + "grad_norm": 0.2950508722948174, + "learning_rate": 4.9900756725769325e-06, + "loss": 2.6595, + "step": 56117 + }, + { + "epoch": 
2.612705728984799, + "grad_norm": 0.3137867942570208, + "learning_rate": 4.9888961384862485e-06, + "loss": 2.6851, + "step": 56118 + }, + { + "epoch": 2.612752287170892, + "grad_norm": 0.3011583799719147, + "learning_rate": 4.9877167364987155e-06, + "loss": 2.572, + "step": 56119 + }, + { + "epoch": 2.612798845356985, + "grad_norm": 0.319865605853344, + "learning_rate": 4.986537466617802e-06, + "loss": 2.6626, + "step": 56120 + }, + { + "epoch": 2.612845403543078, + "grad_norm": 0.3032141050845975, + "learning_rate": 4.985358328846967e-06, + "loss": 2.6366, + "step": 56121 + }, + { + "epoch": 2.6128919617291713, + "grad_norm": 0.2904226647265581, + "learning_rate": 4.9841793231896805e-06, + "loss": 2.5592, + "step": 56122 + }, + { + "epoch": 2.612938519915264, + "grad_norm": 0.29968535859017564, + "learning_rate": 4.9830004496493835e-06, + "loss": 2.6029, + "step": 56123 + }, + { + "epoch": 2.612985078101357, + "grad_norm": 0.30183926696601343, + "learning_rate": 4.981821708229545e-06, + "loss": 2.7387, + "step": 56124 + }, + { + "epoch": 2.61303163628745, + "grad_norm": 0.3182682860306139, + "learning_rate": 4.98064309893363e-06, + "loss": 2.5623, + "step": 56125 + }, + { + "epoch": 2.6130781944735433, + "grad_norm": 0.29941941308548403, + "learning_rate": 4.979464621765084e-06, + "loss": 2.5751, + "step": 56126 + }, + { + "epoch": 2.6131247526596364, + "grad_norm": 0.3049745003725268, + "learning_rate": 4.9782862767273786e-06, + "loss": 2.7294, + "step": 56127 + }, + { + "epoch": 2.6131713108457295, + "grad_norm": 0.29988483850568665, + "learning_rate": 4.977108063823976e-06, + "loss": 2.6345, + "step": 56128 + }, + { + "epoch": 2.6132178690318226, + "grad_norm": 0.3039746508550838, + "learning_rate": 4.975929983058303e-06, + "loss": 2.5743, + "step": 56129 + }, + { + "epoch": 2.6132644272179157, + "grad_norm": 0.2963454341733198, + "learning_rate": 4.974752034433866e-06, + "loss": 2.6775, + "step": 56130 + }, + { + "epoch": 2.6133109854040084, + "grad_norm": 0.30350000252453924, + "learning_rate": 4.97357421795408e-06, + "loss": 2.6197, + "step": 56131 + }, + { + "epoch": 2.6133575435901015, + "grad_norm": 0.3002722607264716, + "learning_rate": 4.97239653362242e-06, + "loss": 2.505, + "step": 56132 + }, + { + "epoch": 2.6134041017761946, + "grad_norm": 0.2975260767396131, + "learning_rate": 4.9712189814423436e-06, + "loss": 2.543, + "step": 56133 + }, + { + "epoch": 2.6134506599622878, + "grad_norm": 0.2943466234462807, + "learning_rate": 4.970041561417299e-06, + "loss": 2.6181, + "step": 56134 + }, + { + "epoch": 2.613497218148381, + "grad_norm": 0.3043234430847812, + "learning_rate": 4.968864273550761e-06, + "loss": 2.58, + "step": 56135 + }, + { + "epoch": 2.613543776334474, + "grad_norm": 0.3071812951422439, + "learning_rate": 4.967687117846148e-06, + "loss": 2.6327, + "step": 56136 + }, + { + "epoch": 2.613590334520567, + "grad_norm": 0.29658828297173623, + "learning_rate": 4.966510094306953e-06, + "loss": 2.5486, + "step": 56137 + }, + { + "epoch": 2.61363689270666, + "grad_norm": 0.3055625740531115, + "learning_rate": 4.965333202936606e-06, + "loss": 2.5789, + "step": 56138 + }, + { + "epoch": 2.6136834508927533, + "grad_norm": 0.3001395735897224, + "learning_rate": 4.9641564437385716e-06, + "loss": 2.7155, + "step": 56139 + }, + { + "epoch": 2.6137300090788465, + "grad_norm": 0.30149314224383633, + "learning_rate": 4.962979816716301e-06, + "loss": 2.6635, + "step": 56140 + }, + { + "epoch": 2.6137765672649396, + "grad_norm": 0.3023746109671937, + "learning_rate": 
4.961803321873254e-06, + "loss": 2.6999, + "step": 56141 + }, + { + "epoch": 2.6138231254510327, + "grad_norm": 0.29108188115816236, + "learning_rate": 4.960626959212867e-06, + "loss": 2.6614, + "step": 56142 + }, + { + "epoch": 2.6138696836371254, + "grad_norm": 0.29364081398096925, + "learning_rate": 4.959450728738612e-06, + "loss": 2.6194, + "step": 56143 + }, + { + "epoch": 2.6139162418232185, + "grad_norm": 0.30033899575801426, + "learning_rate": 4.958274630453924e-06, + "loss": 2.5986, + "step": 56144 + }, + { + "epoch": 2.6139628000093116, + "grad_norm": 0.28861798428111046, + "learning_rate": 4.95709866436227e-06, + "loss": 2.6451, + "step": 56145 + }, + { + "epoch": 2.6140093581954047, + "grad_norm": 0.2995686362348706, + "learning_rate": 4.955922830467086e-06, + "loss": 2.6976, + "step": 56146 + }, + { + "epoch": 2.614055916381498, + "grad_norm": 0.29491129967895374, + "learning_rate": 4.954747128771831e-06, + "loss": 2.6596, + "step": 56147 + }, + { + "epoch": 2.614102474567591, + "grad_norm": 0.29484601577102726, + "learning_rate": 4.953571559279968e-06, + "loss": 2.657, + "step": 56148 + }, + { + "epoch": 2.614149032753684, + "grad_norm": 0.2983544537556861, + "learning_rate": 4.952396121994918e-06, + "loss": 2.7208, + "step": 56149 + }, + { + "epoch": 2.6141955909397767, + "grad_norm": 0.2846183973726175, + "learning_rate": 4.95122081692016e-06, + "loss": 2.5716, + "step": 56150 + }, + { + "epoch": 2.61424214912587, + "grad_norm": 0.30143833961645805, + "learning_rate": 4.9500456440591255e-06, + "loss": 2.7003, + "step": 56151 + }, + { + "epoch": 2.614288707311963, + "grad_norm": 0.2963763624249789, + "learning_rate": 4.948870603415268e-06, + "loss": 2.6333, + "step": 56152 + }, + { + "epoch": 2.614335265498056, + "grad_norm": 0.2997953292966627, + "learning_rate": 4.947695694992038e-06, + "loss": 2.5152, + "step": 56153 + }, + { + "epoch": 2.614381823684149, + "grad_norm": 0.29814414208424744, + "learning_rate": 4.946520918792885e-06, + "loss": 2.5842, + "step": 56154 + }, + { + "epoch": 2.6144283818702423, + "grad_norm": 0.2991076360805845, + "learning_rate": 4.94534627482125e-06, + "loss": 2.582, + "step": 56155 + }, + { + "epoch": 2.6144749400563354, + "grad_norm": 0.30088673782325936, + "learning_rate": 4.944171763080596e-06, + "loss": 2.5628, + "step": 56156 + }, + { + "epoch": 2.6145214982424285, + "grad_norm": 0.3085062098747151, + "learning_rate": 4.94299738357435e-06, + "loss": 2.6122, + "step": 56157 + }, + { + "epoch": 2.6145680564285216, + "grad_norm": 0.2916177305351476, + "learning_rate": 4.941823136305968e-06, + "loss": 2.5322, + "step": 56158 + }, + { + "epoch": 2.6146146146146148, + "grad_norm": 0.2867842762435784, + "learning_rate": 4.940649021278898e-06, + "loss": 2.6034, + "step": 56159 + }, + { + "epoch": 2.614661172800708, + "grad_norm": 0.28917786441212867, + "learning_rate": 4.9394750384965826e-06, + "loss": 2.5372, + "step": 56160 + }, + { + "epoch": 2.614707730986801, + "grad_norm": 0.3060861178121286, + "learning_rate": 4.938301187962474e-06, + "loss": 2.5887, + "step": 56161 + }, + { + "epoch": 2.614754289172894, + "grad_norm": 0.300604078168259, + "learning_rate": 4.937127469679997e-06, + "loss": 2.5448, + "step": 56162 + }, + { + "epoch": 2.6148008473589868, + "grad_norm": 0.31814233447381457, + "learning_rate": 4.935953883652633e-06, + "loss": 2.6295, + "step": 56163 + }, + { + "epoch": 2.61484740554508, + "grad_norm": 0.2969677550974139, + "learning_rate": 4.93478042988379e-06, + "loss": 2.6318, + "step": 56164 + }, + { + "epoch": 
2.614893963731173, + "grad_norm": 0.29889085364362616, + "learning_rate": 4.933607108376931e-06, + "loss": 2.6136, + "step": 56165 + }, + { + "epoch": 2.614940521917266, + "grad_norm": 0.28292603383641546, + "learning_rate": 4.9324339191354926e-06, + "loss": 2.6296, + "step": 56166 + }, + { + "epoch": 2.6149870801033592, + "grad_norm": 0.309536849614726, + "learning_rate": 4.931260862162923e-06, + "loss": 2.6506, + "step": 56167 + }, + { + "epoch": 2.6150336382894523, + "grad_norm": 0.30915936639540414, + "learning_rate": 4.930087937462657e-06, + "loss": 2.6028, + "step": 56168 + }, + { + "epoch": 2.6150801964755455, + "grad_norm": 0.2927200579141043, + "learning_rate": 4.92891514503816e-06, + "loss": 2.7288, + "step": 56169 + }, + { + "epoch": 2.615126754661638, + "grad_norm": 0.28551579381732334, + "learning_rate": 4.9277424848928445e-06, + "loss": 2.6878, + "step": 56170 + }, + { + "epoch": 2.6151733128477312, + "grad_norm": 0.2900717226313288, + "learning_rate": 4.9265699570301634e-06, + "loss": 2.604, + "step": 56171 + }, + { + "epoch": 2.6152198710338244, + "grad_norm": 0.3085231967600837, + "learning_rate": 4.925397561453559e-06, + "loss": 2.7153, + "step": 56172 + }, + { + "epoch": 2.6152664292199175, + "grad_norm": 0.2976229710061682, + "learning_rate": 4.924225298166474e-06, + "loss": 2.6223, + "step": 56173 + }, + { + "epoch": 2.6153129874060106, + "grad_norm": 0.3102481194133614, + "learning_rate": 4.923053167172354e-06, + "loss": 2.6109, + "step": 56174 + }, + { + "epoch": 2.6153595455921037, + "grad_norm": 0.2944586614695393, + "learning_rate": 4.921881168474613e-06, + "loss": 2.6468, + "step": 56175 + }, + { + "epoch": 2.615406103778197, + "grad_norm": 0.2835066495331594, + "learning_rate": 4.920709302076731e-06, + "loss": 2.6507, + "step": 56176 + }, + { + "epoch": 2.61545266196429, + "grad_norm": 0.2928109123549468, + "learning_rate": 4.919537567982119e-06, + "loss": 2.662, + "step": 56177 + }, + { + "epoch": 2.615499220150383, + "grad_norm": 0.29585536719548505, + "learning_rate": 4.918365966194216e-06, + "loss": 2.6044, + "step": 56178 + }, + { + "epoch": 2.615545778336476, + "grad_norm": 0.3059055940396192, + "learning_rate": 4.917194496716471e-06, + "loss": 2.611, + "step": 56179 + }, + { + "epoch": 2.6155923365225693, + "grad_norm": 0.3088260794339078, + "learning_rate": 4.916023159552325e-06, + "loss": 2.7203, + "step": 56180 + }, + { + "epoch": 2.6156388947086624, + "grad_norm": 0.301242172006909, + "learning_rate": 4.914851954705202e-06, + "loss": 2.6196, + "step": 56181 + }, + { + "epoch": 2.615685452894755, + "grad_norm": 0.2883299223661595, + "learning_rate": 4.913680882178556e-06, + "loss": 2.585, + "step": 56182 + }, + { + "epoch": 2.615732011080848, + "grad_norm": 0.2925369266983346, + "learning_rate": 4.912509941975807e-06, + "loss": 2.6519, + "step": 56183 + }, + { + "epoch": 2.6157785692669413, + "grad_norm": 0.30307571733341016, + "learning_rate": 4.911339134100401e-06, + "loss": 2.6608, + "step": 56184 + }, + { + "epoch": 2.6158251274530344, + "grad_norm": 0.3067722165695784, + "learning_rate": 4.910168458555775e-06, + "loss": 2.5746, + "step": 56185 + }, + { + "epoch": 2.6158716856391275, + "grad_norm": 0.2881574748510975, + "learning_rate": 4.908997915345359e-06, + "loss": 2.6409, + "step": 56186 + }, + { + "epoch": 2.6159182438252206, + "grad_norm": 0.2960713823043926, + "learning_rate": 4.907827504472601e-06, + "loss": 2.6069, + "step": 56187 + }, + { + "epoch": 2.6159648020113138, + "grad_norm": 0.28774164513206224, + "learning_rate": 
4.906657225940908e-06, + "loss": 2.5727, + "step": 56188 + }, + { + "epoch": 2.6160113601974064, + "grad_norm": 0.30902779451384593, + "learning_rate": 4.905487079753751e-06, + "loss": 2.6742, + "step": 56189 + }, + { + "epoch": 2.6160579183834995, + "grad_norm": 0.3098982561159492, + "learning_rate": 4.904317065914532e-06, + "loss": 2.7152, + "step": 56190 + }, + { + "epoch": 2.6161044765695927, + "grad_norm": 0.28128699585446215, + "learning_rate": 4.9031471844267155e-06, + "loss": 2.5344, + "step": 56191 + }, + { + "epoch": 2.616151034755686, + "grad_norm": 0.298629681925336, + "learning_rate": 4.901977435293714e-06, + "loss": 2.5221, + "step": 56192 + }, + { + "epoch": 2.616197592941779, + "grad_norm": 0.3035587732351877, + "learning_rate": 4.90080781851896e-06, + "loss": 2.5755, + "step": 56193 + }, + { + "epoch": 2.616244151127872, + "grad_norm": 0.28851946295357056, + "learning_rate": 4.8996383341058935e-06, + "loss": 2.5359, + "step": 56194 + }, + { + "epoch": 2.616290709313965, + "grad_norm": 0.30113250627833343, + "learning_rate": 4.8984689820579575e-06, + "loss": 2.5724, + "step": 56195 + }, + { + "epoch": 2.6163372675000582, + "grad_norm": 0.30462101314265816, + "learning_rate": 4.897299762378566e-06, + "loss": 2.6447, + "step": 56196 + }, + { + "epoch": 2.6163838256861514, + "grad_norm": 0.2997548699230893, + "learning_rate": 4.896130675071147e-06, + "loss": 2.617, + "step": 56197 + }, + { + "epoch": 2.6164303838722445, + "grad_norm": 0.29664482957138044, + "learning_rate": 4.894961720139146e-06, + "loss": 2.6278, + "step": 56198 + }, + { + "epoch": 2.6164769420583376, + "grad_norm": 0.2987354957957256, + "learning_rate": 4.893792897585991e-06, + "loss": 2.5774, + "step": 56199 + }, + { + "epoch": 2.6165235002444307, + "grad_norm": 0.2974877853750428, + "learning_rate": 4.892624207415114e-06, + "loss": 2.6803, + "step": 56200 + }, + { + "epoch": 2.616570058430524, + "grad_norm": 0.29452145237876176, + "learning_rate": 4.891455649629928e-06, + "loss": 2.7092, + "step": 56201 + }, + { + "epoch": 2.6166166166166165, + "grad_norm": 0.29629829393562446, + "learning_rate": 4.890287224233892e-06, + "loss": 2.734, + "step": 56202 + }, + { + "epoch": 2.6166631748027096, + "grad_norm": 0.29500754834474263, + "learning_rate": 4.889118931230397e-06, + "loss": 2.6413, + "step": 56203 + }, + { + "epoch": 2.6167097329888027, + "grad_norm": 0.2929262128545291, + "learning_rate": 4.887950770622918e-06, + "loss": 2.6375, + "step": 56204 + }, + { + "epoch": 2.616756291174896, + "grad_norm": 0.2940817198562085, + "learning_rate": 4.886782742414847e-06, + "loss": 2.6564, + "step": 56205 + }, + { + "epoch": 2.616802849360989, + "grad_norm": 0.29559836161495295, + "learning_rate": 4.885614846609621e-06, + "loss": 2.5647, + "step": 56206 + }, + { + "epoch": 2.616849407547082, + "grad_norm": 0.3097630722427768, + "learning_rate": 4.884447083210675e-06, + "loss": 2.6526, + "step": 56207 + }, + { + "epoch": 2.616895965733175, + "grad_norm": 0.2889785017025545, + "learning_rate": 4.883279452221435e-06, + "loss": 2.6306, + "step": 56208 + }, + { + "epoch": 2.616942523919268, + "grad_norm": 0.28626748652650963, + "learning_rate": 4.88211195364533e-06, + "loss": 2.5416, + "step": 56209 + }, + { + "epoch": 2.616989082105361, + "grad_norm": 0.29618763688731914, + "learning_rate": 4.880944587485769e-06, + "loss": 2.6796, + "step": 56210 + }, + { + "epoch": 2.617035640291454, + "grad_norm": 0.29891550071558126, + "learning_rate": 4.879777353746195e-06, + "loss": 2.6142, + "step": 56211 + }, + { + "epoch": 
2.617082198477547, + "grad_norm": 0.30539188035616704, + "learning_rate": 4.878610252430032e-06, + "loss": 2.5831, + "step": 56212 + }, + { + "epoch": 2.6171287566636403, + "grad_norm": 0.2994233927764349, + "learning_rate": 4.877443283540712e-06, + "loss": 2.6854, + "step": 56213 + }, + { + "epoch": 2.6171753148497334, + "grad_norm": 0.2912093634053133, + "learning_rate": 4.8762764470816294e-06, + "loss": 2.606, + "step": 56214 + }, + { + "epoch": 2.6172218730358265, + "grad_norm": 0.2959270068837386, + "learning_rate": 4.875109743056244e-06, + "loss": 2.6233, + "step": 56215 + }, + { + "epoch": 2.6172684312219197, + "grad_norm": 0.28747673602420537, + "learning_rate": 4.8739431714679536e-06, + "loss": 2.558, + "step": 56216 + }, + { + "epoch": 2.6173149894080128, + "grad_norm": 0.28576765413356237, + "learning_rate": 4.8727767323202115e-06, + "loss": 2.611, + "step": 56217 + }, + { + "epoch": 2.617361547594106, + "grad_norm": 0.3034155002551777, + "learning_rate": 4.871610425616413e-06, + "loss": 2.6618, + "step": 56218 + }, + { + "epoch": 2.617408105780199, + "grad_norm": 0.3109137777640012, + "learning_rate": 4.8704442513599955e-06, + "loss": 2.614, + "step": 56219 + }, + { + "epoch": 2.617454663966292, + "grad_norm": 0.2956222056847164, + "learning_rate": 4.869278209554373e-06, + "loss": 2.6094, + "step": 56220 + }, + { + "epoch": 2.617501222152385, + "grad_norm": 0.28715269235017793, + "learning_rate": 4.868112300202976e-06, + "loss": 2.5923, + "step": 56221 + }, + { + "epoch": 2.617547780338478, + "grad_norm": 0.3052090460821839, + "learning_rate": 4.866946523309235e-06, + "loss": 2.6217, + "step": 56222 + }, + { + "epoch": 2.617594338524571, + "grad_norm": 0.28972627485536817, + "learning_rate": 4.8657808788765465e-06, + "loss": 2.687, + "step": 56223 + }, + { + "epoch": 2.617640896710664, + "grad_norm": 0.32731090907186666, + "learning_rate": 4.864615366908348e-06, + "loss": 2.7509, + "step": 56224 + }, + { + "epoch": 2.6176874548967572, + "grad_norm": 0.3094144293770602, + "learning_rate": 4.863449987408053e-06, + "loss": 2.5836, + "step": 56225 + }, + { + "epoch": 2.6177340130828504, + "grad_norm": 0.3129547978877102, + "learning_rate": 4.862284740379086e-06, + "loss": 2.5866, + "step": 56226 + }, + { + "epoch": 2.6177805712689435, + "grad_norm": 0.3018075416183126, + "learning_rate": 4.861119625824867e-06, + "loss": 2.7012, + "step": 56227 + }, + { + "epoch": 2.6178271294550366, + "grad_norm": 0.290832773986476, + "learning_rate": 4.859954643748826e-06, + "loss": 2.6984, + "step": 56228 + }, + { + "epoch": 2.6178736876411293, + "grad_norm": 0.2901272928970646, + "learning_rate": 4.858789794154351e-06, + "loss": 2.6001, + "step": 56229 + }, + { + "epoch": 2.6179202458272224, + "grad_norm": 0.2900405213146043, + "learning_rate": 4.857625077044903e-06, + "loss": 2.633, + "step": 56230 + }, + { + "epoch": 2.6179668040133155, + "grad_norm": 0.2980950092963568, + "learning_rate": 4.856460492423864e-06, + "loss": 2.6787, + "step": 56231 + }, + { + "epoch": 2.6180133621994086, + "grad_norm": 0.2969484576889748, + "learning_rate": 4.855296040294671e-06, + "loss": 2.5775, + "step": 56232 + }, + { + "epoch": 2.6180599203855017, + "grad_norm": 0.2842925929462323, + "learning_rate": 4.854131720660732e-06, + "loss": 2.6152, + "step": 56233 + }, + { + "epoch": 2.618106478571595, + "grad_norm": 0.3057229313319304, + "learning_rate": 4.852967533525471e-06, + "loss": 2.6493, + "step": 56234 + }, + { + "epoch": 2.618153036757688, + "grad_norm": 0.2931111722378693, + "learning_rate": 
4.8518034788923085e-06, + "loss": 2.5868, + "step": 56235 + }, + { + "epoch": 2.618199594943781, + "grad_norm": 0.2856983593329889, + "learning_rate": 4.850639556764647e-06, + "loss": 2.4929, + "step": 56236 + }, + { + "epoch": 2.618246153129874, + "grad_norm": 0.30610339518625873, + "learning_rate": 4.849475767145911e-06, + "loss": 2.6279, + "step": 56237 + }, + { + "epoch": 2.6182927113159673, + "grad_norm": 0.2898364230858924, + "learning_rate": 4.848312110039522e-06, + "loss": 2.562, + "step": 56238 + }, + { + "epoch": 2.6183392695020604, + "grad_norm": 0.29465069371306485, + "learning_rate": 4.847148585448879e-06, + "loss": 2.6379, + "step": 56239 + }, + { + "epoch": 2.6183858276881535, + "grad_norm": 0.29761658390756535, + "learning_rate": 4.8459851933774114e-06, + "loss": 2.5654, + "step": 56240 + }, + { + "epoch": 2.618432385874246, + "grad_norm": 0.2934447878449726, + "learning_rate": 4.844821933828536e-06, + "loss": 2.694, + "step": 56241 + }, + { + "epoch": 2.6184789440603393, + "grad_norm": 0.2960235289082547, + "learning_rate": 4.843658806805646e-06, + "loss": 2.5397, + "step": 56242 + }, + { + "epoch": 2.6185255022464324, + "grad_norm": 0.28755022595119795, + "learning_rate": 4.842495812312181e-06, + "loss": 2.534, + "step": 56243 + }, + { + "epoch": 2.6185720604325256, + "grad_norm": 0.28881022995274724, + "learning_rate": 4.841332950351535e-06, + "loss": 2.6849, + "step": 56244 + }, + { + "epoch": 2.6186186186186187, + "grad_norm": 0.28354877023345837, + "learning_rate": 4.840170220927132e-06, + "loss": 2.7547, + "step": 56245 + }, + { + "epoch": 2.618665176804712, + "grad_norm": 0.3123609234779515, + "learning_rate": 4.839007624042374e-06, + "loss": 2.6283, + "step": 56246 + }, + { + "epoch": 2.618711734990805, + "grad_norm": 0.30528891364809585, + "learning_rate": 4.837845159700682e-06, + "loss": 2.5675, + "step": 56247 + }, + { + "epoch": 2.6187582931768976, + "grad_norm": 0.29049855052201795, + "learning_rate": 4.836682827905475e-06, + "loss": 2.4764, + "step": 56248 + }, + { + "epoch": 2.6188048513629907, + "grad_norm": 0.2818773669956568, + "learning_rate": 4.835520628660145e-06, + "loss": 2.5902, + "step": 56249 + }, + { + "epoch": 2.618851409549084, + "grad_norm": 0.305411610295162, + "learning_rate": 4.834358561968116e-06, + "loss": 2.6916, + "step": 56250 + }, + { + "epoch": 2.618897967735177, + "grad_norm": 0.30363560697500547, + "learning_rate": 4.833196627832792e-06, + "loss": 2.4828, + "step": 56251 + }, + { + "epoch": 2.61894452592127, + "grad_norm": 0.2984129356566392, + "learning_rate": 4.8320348262575865e-06, + "loss": 2.6057, + "step": 56252 + }, + { + "epoch": 2.618991084107363, + "grad_norm": 0.29247076485062407, + "learning_rate": 4.830873157245908e-06, + "loss": 2.6093, + "step": 56253 + }, + { + "epoch": 2.6190376422934563, + "grad_norm": 0.29566310510031757, + "learning_rate": 4.829711620801175e-06, + "loss": 2.643, + "step": 56254 + }, + { + "epoch": 2.6190842004795494, + "grad_norm": 0.29814574246139874, + "learning_rate": 4.82855021692677e-06, + "loss": 2.6691, + "step": 56255 + }, + { + "epoch": 2.6191307586656425, + "grad_norm": 0.3017339836948504, + "learning_rate": 4.8273889456261434e-06, + "loss": 2.7012, + "step": 56256 + }, + { + "epoch": 2.6191773168517356, + "grad_norm": 0.2814970620232998, + "learning_rate": 4.826227806902667e-06, + "loss": 2.5075, + "step": 56257 + }, + { + "epoch": 2.6192238750378287, + "grad_norm": 0.2965184851646514, + "learning_rate": 4.8250668007597585e-06, + "loss": 2.6696, + "step": 56258 + }, + { + "epoch": 
2.619270433223922, + "grad_norm": 0.2906331341350799, + "learning_rate": 4.823905927200834e-06, + "loss": 2.628, + "step": 56259 + }, + { + "epoch": 2.6193169914100145, + "grad_norm": 0.3056643437330911, + "learning_rate": 4.822745186229294e-06, + "loss": 2.6069, + "step": 56260 + }, + { + "epoch": 2.6193635495961076, + "grad_norm": 0.30266854690163353, + "learning_rate": 4.821584577848553e-06, + "loss": 2.5952, + "step": 56261 + }, + { + "epoch": 2.6194101077822007, + "grad_norm": 0.30812630771194927, + "learning_rate": 4.8204241020619935e-06, + "loss": 2.588, + "step": 56262 + }, + { + "epoch": 2.619456665968294, + "grad_norm": 0.3109249494542858, + "learning_rate": 4.81926375887306e-06, + "loss": 2.6183, + "step": 56263 + }, + { + "epoch": 2.619503224154387, + "grad_norm": 0.3007708801847816, + "learning_rate": 4.818103548285124e-06, + "loss": 2.5601, + "step": 56264 + }, + { + "epoch": 2.61954978234048, + "grad_norm": 0.30353097731207507, + "learning_rate": 4.8169434703015996e-06, + "loss": 2.5962, + "step": 56265 + }, + { + "epoch": 2.619596340526573, + "grad_norm": 0.32198490575959715, + "learning_rate": 4.815783524925899e-06, + "loss": 2.6661, + "step": 56266 + }, + { + "epoch": 2.6196428987126663, + "grad_norm": 0.29065107976407406, + "learning_rate": 4.814623712161431e-06, + "loss": 2.5917, + "step": 56267 + }, + { + "epoch": 2.619689456898759, + "grad_norm": 0.297822291642017, + "learning_rate": 4.813464032011572e-06, + "loss": 2.6547, + "step": 56268 + }, + { + "epoch": 2.619736015084852, + "grad_norm": 0.3009012413023779, + "learning_rate": 4.812304484479763e-06, + "loss": 2.4982, + "step": 56269 + }, + { + "epoch": 2.619782573270945, + "grad_norm": 0.305572583230329, + "learning_rate": 4.811145069569378e-06, + "loss": 2.5856, + "step": 56270 + }, + { + "epoch": 2.6198291314570383, + "grad_norm": 0.2801108840308075, + "learning_rate": 4.809985787283833e-06, + "loss": 2.5303, + "step": 56271 + }, + { + "epoch": 2.6198756896431314, + "grad_norm": 0.2958289292737325, + "learning_rate": 4.808826637626529e-06, + "loss": 2.6663, + "step": 56272 + }, + { + "epoch": 2.6199222478292246, + "grad_norm": 0.2891739267846132, + "learning_rate": 4.8076676206008644e-06, + "loss": 2.666, + "step": 56273 + }, + { + "epoch": 2.6199688060153177, + "grad_norm": 0.2909048774276641, + "learning_rate": 4.806508736210252e-06, + "loss": 2.6513, + "step": 56274 + }, + { + "epoch": 2.620015364201411, + "grad_norm": 0.3059113097491858, + "learning_rate": 4.805349984458063e-06, + "loss": 2.6255, + "step": 56275 + }, + { + "epoch": 2.620061922387504, + "grad_norm": 0.28688180094983495, + "learning_rate": 4.804191365347743e-06, + "loss": 2.6551, + "step": 56276 + }, + { + "epoch": 2.620108480573597, + "grad_norm": 0.3077849805335886, + "learning_rate": 4.803032878882657e-06, + "loss": 2.6349, + "step": 56277 + }, + { + "epoch": 2.62015503875969, + "grad_norm": 0.3122354666299713, + "learning_rate": 4.801874525066214e-06, + "loss": 2.6899, + "step": 56278 + }, + { + "epoch": 2.6202015969457833, + "grad_norm": 0.28961296879757015, + "learning_rate": 4.80071630390182e-06, + "loss": 2.5644, + "step": 56279 + }, + { + "epoch": 2.620248155131876, + "grad_norm": 0.2805363370313204, + "learning_rate": 4.7995582153928745e-06, + "loss": 2.5673, + "step": 56280 + }, + { + "epoch": 2.620294713317969, + "grad_norm": 0.2858551374078823, + "learning_rate": 4.7984002595427676e-06, + "loss": 2.6935, + "step": 56281 + }, + { + "epoch": 2.620341271504062, + "grad_norm": 0.28990960342097744, + "learning_rate": 
4.79724243635491e-06, + "loss": 2.5757, + "step": 56282 + }, + { + "epoch": 2.6203878296901553, + "grad_norm": 0.28942746423419174, + "learning_rate": 4.796084745832691e-06, + "loss": 2.6589, + "step": 56283 + }, + { + "epoch": 2.6204343878762484, + "grad_norm": 0.2850219316192148, + "learning_rate": 4.794927187979503e-06, + "loss": 2.6753, + "step": 56284 + }, + { + "epoch": 2.6204809460623415, + "grad_norm": 0.3013349269799452, + "learning_rate": 4.7937697627987495e-06, + "loss": 2.541, + "step": 56285 + }, + { + "epoch": 2.6205275042484346, + "grad_norm": 0.30145523658453927, + "learning_rate": 4.792612470293833e-06, + "loss": 2.5717, + "step": 56286 + }, + { + "epoch": 2.6205740624345273, + "grad_norm": 0.2821682807496149, + "learning_rate": 4.791455310468157e-06, + "loss": 2.6732, + "step": 56287 + }, + { + "epoch": 2.6206206206206204, + "grad_norm": 0.2771165905941581, + "learning_rate": 4.790298283325079e-06, + "loss": 2.554, + "step": 56288 + }, + { + "epoch": 2.6206671788067135, + "grad_norm": 0.2845551806492677, + "learning_rate": 4.7891413888680455e-06, + "loss": 2.6337, + "step": 56289 + }, + { + "epoch": 2.6207137369928066, + "grad_norm": 0.3104301076490983, + "learning_rate": 4.7879846271004166e-06, + "loss": 2.6138, + "step": 56290 + }, + { + "epoch": 2.6207602951788997, + "grad_norm": 0.308511398243642, + "learning_rate": 4.7868279980256005e-06, + "loss": 2.6499, + "step": 56291 + }, + { + "epoch": 2.620806853364993, + "grad_norm": 0.28234900539234725, + "learning_rate": 4.785671501646988e-06, + "loss": 2.5715, + "step": 56292 + }, + { + "epoch": 2.620853411551086, + "grad_norm": 0.3011710101159664, + "learning_rate": 4.784515137967976e-06, + "loss": 2.5038, + "step": 56293 + }, + { + "epoch": 2.620899969737179, + "grad_norm": 0.286787067587392, + "learning_rate": 4.783358906991964e-06, + "loss": 2.6228, + "step": 56294 + }, + { + "epoch": 2.620946527923272, + "grad_norm": 0.2958565117806993, + "learning_rate": 4.782202808722341e-06, + "loss": 2.5811, + "step": 56295 + }, + { + "epoch": 2.6209930861093653, + "grad_norm": 0.3113765097959006, + "learning_rate": 4.78104684316249e-06, + "loss": 2.6176, + "step": 56296 + }, + { + "epoch": 2.6210396442954584, + "grad_norm": 0.3115845097562445, + "learning_rate": 4.779891010315812e-06, + "loss": 2.6735, + "step": 56297 + }, + { + "epoch": 2.6210862024815516, + "grad_norm": 0.3221042811396084, + "learning_rate": 4.7787353101857e-06, + "loss": 2.6501, + "step": 56298 + }, + { + "epoch": 2.6211327606676442, + "grad_norm": 0.30472516843949093, + "learning_rate": 4.77757974277554e-06, + "loss": 2.6695, + "step": 56299 + }, + { + "epoch": 2.6211793188537373, + "grad_norm": 0.2931822952245925, + "learning_rate": 4.776424308088745e-06, + "loss": 2.5945, + "step": 56300 + }, + { + "epoch": 2.6212258770398305, + "grad_norm": 0.292912856110762, + "learning_rate": 4.775269006128663e-06, + "loss": 2.6156, + "step": 56301 + }, + { + "epoch": 2.6212724352259236, + "grad_norm": 0.2907272161009601, + "learning_rate": 4.774113836898736e-06, + "loss": 2.6351, + "step": 56302 + }, + { + "epoch": 2.6213189934120167, + "grad_norm": 0.29119962243407266, + "learning_rate": 4.7729588004023164e-06, + "loss": 2.669, + "step": 56303 + }, + { + "epoch": 2.62136555159811, + "grad_norm": 0.29450680926146683, + "learning_rate": 4.771803896642813e-06, + "loss": 2.634, + "step": 56304 + }, + { + "epoch": 2.621412109784203, + "grad_norm": 0.3248300294442581, + "learning_rate": 4.7706491256236e-06, + "loss": 2.5507, + "step": 56305 + }, + { + "epoch": 
2.621458667970296, + "grad_norm": 0.30613199659308005, + "learning_rate": 4.76949448734808e-06, + "loss": 2.6058, + "step": 56306 + }, + { + "epoch": 2.6215052261563887, + "grad_norm": 0.301959826534086, + "learning_rate": 4.768339981819641e-06, + "loss": 2.6245, + "step": 56307 + }, + { + "epoch": 2.621551784342482, + "grad_norm": 0.3253396174575547, + "learning_rate": 4.7671856090416745e-06, + "loss": 2.6204, + "step": 56308 + }, + { + "epoch": 2.621598342528575, + "grad_norm": 0.2955207993805046, + "learning_rate": 4.766031369017554e-06, + "loss": 2.5822, + "step": 56309 + }, + { + "epoch": 2.621644900714668, + "grad_norm": 0.3233365361738372, + "learning_rate": 4.764877261750677e-06, + "loss": 2.5833, + "step": 56310 + }, + { + "epoch": 2.621691458900761, + "grad_norm": 0.3130865912796681, + "learning_rate": 4.763723287244426e-06, + "loss": 2.6205, + "step": 56311 + }, + { + "epoch": 2.6217380170868543, + "grad_norm": 0.3136944039362453, + "learning_rate": 4.7625694455021905e-06, + "loss": 2.6486, + "step": 56312 + }, + { + "epoch": 2.6217845752729474, + "grad_norm": 0.2799372494182812, + "learning_rate": 4.76141573652737e-06, + "loss": 2.5576, + "step": 56313 + }, + { + "epoch": 2.6218311334590405, + "grad_norm": 0.2847770575887066, + "learning_rate": 4.760262160323314e-06, + "loss": 2.5379, + "step": 56314 + }, + { + "epoch": 2.6218776916451336, + "grad_norm": 0.30281880437107583, + "learning_rate": 4.759108716893451e-06, + "loss": 2.5814, + "step": 56315 + }, + { + "epoch": 2.6219242498312267, + "grad_norm": 0.3086284261141926, + "learning_rate": 4.757955406241133e-06, + "loss": 2.6564, + "step": 56316 + }, + { + "epoch": 2.62197080801732, + "grad_norm": 0.2699897949704606, + "learning_rate": 4.756802228369772e-06, + "loss": 2.5359, + "step": 56317 + }, + { + "epoch": 2.622017366203413, + "grad_norm": 0.29616241495548773, + "learning_rate": 4.755649183282729e-06, + "loss": 2.5626, + "step": 56318 + }, + { + "epoch": 2.6220639243895056, + "grad_norm": 0.29890709446296554, + "learning_rate": 4.7544962709834054e-06, + "loss": 2.5861, + "step": 56319 + }, + { + "epoch": 2.6221104825755988, + "grad_norm": 0.29651121190507, + "learning_rate": 4.7533434914751704e-06, + "loss": 2.574, + "step": 56320 + }, + { + "epoch": 2.622157040761692, + "grad_norm": 0.3018281891339432, + "learning_rate": 4.752190844761429e-06, + "loss": 2.6592, + "step": 56321 + }, + { + "epoch": 2.622203598947785, + "grad_norm": 0.3085596171281884, + "learning_rate": 4.751038330845536e-06, + "loss": 2.6012, + "step": 56322 + }, + { + "epoch": 2.622250157133878, + "grad_norm": 0.3024196981638785, + "learning_rate": 4.749885949730893e-06, + "loss": 2.6235, + "step": 56323 + }, + { + "epoch": 2.622296715319971, + "grad_norm": 0.28643395205910754, + "learning_rate": 4.748733701420876e-06, + "loss": 2.6311, + "step": 56324 + }, + { + "epoch": 2.6223432735060643, + "grad_norm": 0.2886321272992249, + "learning_rate": 4.747581585918864e-06, + "loss": 2.5656, + "step": 56325 + }, + { + "epoch": 2.622389831692157, + "grad_norm": 0.28140353143042984, + "learning_rate": 4.746429603228253e-06, + "loss": 2.6964, + "step": 56326 + }, + { + "epoch": 2.62243638987825, + "grad_norm": 0.3093708064894255, + "learning_rate": 4.745277753352395e-06, + "loss": 2.608, + "step": 56327 + }, + { + "epoch": 2.6224829480643432, + "grad_norm": 0.33256528529099566, + "learning_rate": 4.744126036294705e-06, + "loss": 2.7255, + "step": 56328 + }, + { + "epoch": 2.6225295062504363, + "grad_norm": 0.2907598303894616, + "learning_rate": 
4.742974452058529e-06, + "loss": 2.6079, + "step": 56329 + }, + { + "epoch": 2.6225760644365295, + "grad_norm": 0.30317984507204193, + "learning_rate": 4.741823000647289e-06, + "loss": 2.6981, + "step": 56330 + }, + { + "epoch": 2.6226226226226226, + "grad_norm": 0.2892532423023411, + "learning_rate": 4.7406716820643234e-06, + "loss": 2.6115, + "step": 56331 + }, + { + "epoch": 2.6226691808087157, + "grad_norm": 0.2900923152581223, + "learning_rate": 4.739520496313027e-06, + "loss": 2.6237, + "step": 56332 + }, + { + "epoch": 2.622715738994809, + "grad_norm": 0.28316217649902586, + "learning_rate": 4.738369443396784e-06, + "loss": 2.6683, + "step": 56333 + }, + { + "epoch": 2.622762297180902, + "grad_norm": 0.2845877007883874, + "learning_rate": 4.737218523318965e-06, + "loss": 2.538, + "step": 56334 + }, + { + "epoch": 2.622808855366995, + "grad_norm": 0.299031790210375, + "learning_rate": 4.736067736082961e-06, + "loss": 2.6281, + "step": 56335 + }, + { + "epoch": 2.622855413553088, + "grad_norm": 0.29176917260142277, + "learning_rate": 4.734917081692125e-06, + "loss": 2.5333, + "step": 56336 + }, + { + "epoch": 2.6229019717391813, + "grad_norm": 0.2828442315599902, + "learning_rate": 4.733766560149855e-06, + "loss": 2.5938, + "step": 56337 + }, + { + "epoch": 2.6229485299252744, + "grad_norm": 0.29162729380212804, + "learning_rate": 4.732616171459514e-06, + "loss": 2.6096, + "step": 56338 + }, + { + "epoch": 2.622995088111367, + "grad_norm": 0.28865510455227644, + "learning_rate": 4.731465915624489e-06, + "loss": 2.6052, + "step": 56339 + }, + { + "epoch": 2.62304164629746, + "grad_norm": 0.30302059933856484, + "learning_rate": 4.730315792648155e-06, + "loss": 2.561, + "step": 56340 + }, + { + "epoch": 2.6230882044835533, + "grad_norm": 0.29495591812725147, + "learning_rate": 4.7291658025338855e-06, + "loss": 2.6319, + "step": 56341 + }, + { + "epoch": 2.6231347626696464, + "grad_norm": 0.3067380054309443, + "learning_rate": 4.728015945285042e-06, + "loss": 2.6528, + "step": 56342 + }, + { + "epoch": 2.6231813208557395, + "grad_norm": 0.3234134075015328, + "learning_rate": 4.72686622090503e-06, + "loss": 2.7101, + "step": 56343 + }, + { + "epoch": 2.6232278790418326, + "grad_norm": 0.3152045335530288, + "learning_rate": 4.725716629397192e-06, + "loss": 2.6379, + "step": 56344 + }, + { + "epoch": 2.6232744372279257, + "grad_norm": 0.29839493714634535, + "learning_rate": 4.7245671707649205e-06, + "loss": 2.7016, + "step": 56345 + }, + { + "epoch": 2.6233209954140184, + "grad_norm": 0.30053033327259027, + "learning_rate": 4.7234178450115796e-06, + "loss": 2.6703, + "step": 56346 + }, + { + "epoch": 2.6233675536001115, + "grad_norm": 0.2916936828329559, + "learning_rate": 4.722268652140549e-06, + "loss": 2.5085, + "step": 56347 + }, + { + "epoch": 2.6234141117862046, + "grad_norm": 0.2983905824401262, + "learning_rate": 4.721119592155199e-06, + "loss": 2.6318, + "step": 56348 + }, + { + "epoch": 2.6234606699722978, + "grad_norm": 0.28900641307225067, + "learning_rate": 4.719970665058909e-06, + "loss": 2.6006, + "step": 56349 + }, + { + "epoch": 2.623507228158391, + "grad_norm": 0.28293964004494426, + "learning_rate": 4.7188218708550395e-06, + "loss": 2.5804, + "step": 56350 + }, + { + "epoch": 2.623553786344484, + "grad_norm": 0.30248517874162356, + "learning_rate": 4.717673209546963e-06, + "loss": 2.5981, + "step": 56351 + }, + { + "epoch": 2.623600344530577, + "grad_norm": 0.3105219895957881, + "learning_rate": 4.716524681138057e-06, + "loss": 2.6648, + "step": 56352 + }, + { + "epoch": 
2.6236469027166702, + "grad_norm": 0.2974281155867532, + "learning_rate": 4.715376285631689e-06, + "loss": 2.5944, + "step": 56353 + }, + { + "epoch": 2.6236934609027633, + "grad_norm": 0.29577025134552987, + "learning_rate": 4.714228023031242e-06, + "loss": 2.5866, + "step": 56354 + }, + { + "epoch": 2.6237400190888565, + "grad_norm": 0.29283571240054723, + "learning_rate": 4.713079893340055e-06, + "loss": 2.5454, + "step": 56355 + }, + { + "epoch": 2.6237865772749496, + "grad_norm": 0.2966385295840265, + "learning_rate": 4.711931896561533e-06, + "loss": 2.6687, + "step": 56356 + }, + { + "epoch": 2.6238331354610427, + "grad_norm": 0.29545203087200556, + "learning_rate": 4.710784032699012e-06, + "loss": 2.6075, + "step": 56357 + }, + { + "epoch": 2.6238796936471354, + "grad_norm": 0.28737562458293014, + "learning_rate": 4.709636301755899e-06, + "loss": 2.5804, + "step": 56358 + }, + { + "epoch": 2.6239262518332285, + "grad_norm": 0.3032503686319616, + "learning_rate": 4.7084887037355305e-06, + "loss": 2.5749, + "step": 56359 + }, + { + "epoch": 2.6239728100193216, + "grad_norm": 0.29067335604447037, + "learning_rate": 4.707341238641283e-06, + "loss": 2.5943, + "step": 56360 + }, + { + "epoch": 2.6240193682054147, + "grad_norm": 0.2949994944505007, + "learning_rate": 4.70619390647653e-06, + "loss": 2.5949, + "step": 56361 + }, + { + "epoch": 2.624065926391508, + "grad_norm": 0.2918086979751546, + "learning_rate": 4.7050467072446426e-06, + "loss": 2.539, + "step": 56362 + }, + { + "epoch": 2.624112484577601, + "grad_norm": 0.2855825146522779, + "learning_rate": 4.703899640948972e-06, + "loss": 2.657, + "step": 56363 + }, + { + "epoch": 2.624159042763694, + "grad_norm": 0.30227843052124764, + "learning_rate": 4.7027527075929e-06, + "loss": 2.5412, + "step": 56364 + }, + { + "epoch": 2.6242056009497867, + "grad_norm": 0.2890932474104411, + "learning_rate": 4.701605907179779e-06, + "loss": 2.5871, + "step": 56365 + }, + { + "epoch": 2.62425215913588, + "grad_norm": 0.29258030442839383, + "learning_rate": 4.700459239712984e-06, + "loss": 2.6221, + "step": 56366 + }, + { + "epoch": 2.624298717321973, + "grad_norm": 0.2886130486718459, + "learning_rate": 4.6993127051958856e-06, + "loss": 2.5995, + "step": 56367 + }, + { + "epoch": 2.624345275508066, + "grad_norm": 0.2963399482853167, + "learning_rate": 4.6981663036318294e-06, + "loss": 2.6849, + "step": 56368 + }, + { + "epoch": 2.624391833694159, + "grad_norm": 0.2913537512877099, + "learning_rate": 4.697020035024202e-06, + "loss": 2.64, + "step": 56369 + }, + { + "epoch": 2.6244383918802523, + "grad_norm": 0.2981756928863488, + "learning_rate": 4.6958738993763466e-06, + "loss": 2.6729, + "step": 56370 + }, + { + "epoch": 2.6244849500663454, + "grad_norm": 0.3071081385574123, + "learning_rate": 4.694727896691653e-06, + "loss": 2.5339, + "step": 56371 + }, + { + "epoch": 2.6245315082524385, + "grad_norm": 0.30115838132364764, + "learning_rate": 4.693582026973459e-06, + "loss": 2.6834, + "step": 56372 + }, + { + "epoch": 2.6245780664385316, + "grad_norm": 0.3084791966115565, + "learning_rate": 4.692436290225144e-06, + "loss": 2.5531, + "step": 56373 + }, + { + "epoch": 2.6246246246246248, + "grad_norm": 0.3079614502777645, + "learning_rate": 4.691290686450061e-06, + "loss": 2.5849, + "step": 56374 + }, + { + "epoch": 2.624671182810718, + "grad_norm": 0.28913871735264407, + "learning_rate": 4.6901452156515754e-06, + "loss": 2.5944, + "step": 56375 + }, + { + "epoch": 2.624717740996811, + "grad_norm": 0.2886156790382784, + "learning_rate": 
4.688999877833061e-06, + "loss": 2.6894, + "step": 56376 + }, + { + "epoch": 2.624764299182904, + "grad_norm": 0.3123933526477522, + "learning_rate": 4.687854672997854e-06, + "loss": 2.567, + "step": 56377 + }, + { + "epoch": 2.6248108573689968, + "grad_norm": 0.32045399172535033, + "learning_rate": 4.686709601149336e-06, + "loss": 2.6183, + "step": 56378 + }, + { + "epoch": 2.62485741555509, + "grad_norm": 0.3038523359127064, + "learning_rate": 4.685564662290859e-06, + "loss": 2.4698, + "step": 56379 + }, + { + "epoch": 2.624903973741183, + "grad_norm": 0.31460215749766546, + "learning_rate": 4.684419856425792e-06, + "loss": 2.6704, + "step": 56380 + }, + { + "epoch": 2.624950531927276, + "grad_norm": 0.28628739648215773, + "learning_rate": 4.683275183557473e-06, + "loss": 2.6262, + "step": 56381 + }, + { + "epoch": 2.6249970901133692, + "grad_norm": 0.2895588642416863, + "learning_rate": 4.6821306436892975e-06, + "loss": 2.6159, + "step": 56382 + }, + { + "epoch": 2.6250436482994624, + "grad_norm": 0.2948455632691317, + "learning_rate": 4.680986236824581e-06, + "loss": 2.5417, + "step": 56383 + }, + { + "epoch": 2.6250902064855555, + "grad_norm": 0.30351891399782477, + "learning_rate": 4.679841962966725e-06, + "loss": 2.5777, + "step": 56384 + }, + { + "epoch": 2.625136764671648, + "grad_norm": 0.3051860662841969, + "learning_rate": 4.67869782211906e-06, + "loss": 2.6262, + "step": 56385 + }, + { + "epoch": 2.6251833228577413, + "grad_norm": 0.30376035774392807, + "learning_rate": 4.6775538142849515e-06, + "loss": 2.5959, + "step": 56386 + }, + { + "epoch": 2.6252298810438344, + "grad_norm": 0.3074076591467594, + "learning_rate": 4.676409939467757e-06, + "loss": 2.6885, + "step": 56387 + }, + { + "epoch": 2.6252764392299275, + "grad_norm": 0.2979112484094284, + "learning_rate": 4.675266197670836e-06, + "loss": 2.5626, + "step": 56388 + }, + { + "epoch": 2.6253229974160206, + "grad_norm": 0.2885642845561224, + "learning_rate": 4.67412258889755e-06, + "loss": 2.6226, + "step": 56389 + }, + { + "epoch": 2.6253695556021137, + "grad_norm": 0.3026965045852026, + "learning_rate": 4.672979113151244e-06, + "loss": 2.4991, + "step": 56390 + }, + { + "epoch": 2.625416113788207, + "grad_norm": 0.31058442563475175, + "learning_rate": 4.671835770435273e-06, + "loss": 2.6278, + "step": 56391 + }, + { + "epoch": 2.6254626719743, + "grad_norm": 0.2907357990222988, + "learning_rate": 4.670692560753004e-06, + "loss": 2.6455, + "step": 56392 + }, + { + "epoch": 2.625509230160393, + "grad_norm": 0.3037822361477139, + "learning_rate": 4.669549484107783e-06, + "loss": 2.5909, + "step": 56393 + }, + { + "epoch": 2.625555788346486, + "grad_norm": 0.31024617552521355, + "learning_rate": 4.668406540502968e-06, + "loss": 2.6438, + "step": 56394 + }, + { + "epoch": 2.6256023465325793, + "grad_norm": 0.31418035261245086, + "learning_rate": 4.667263729941923e-06, + "loss": 2.5942, + "step": 56395 + }, + { + "epoch": 2.6256489047186724, + "grad_norm": 0.31342065923805207, + "learning_rate": 4.666121052427979e-06, + "loss": 2.6211, + "step": 56396 + }, + { + "epoch": 2.625695462904765, + "grad_norm": 0.2966248044013614, + "learning_rate": 4.664978507964518e-06, + "loss": 2.6232, + "step": 56397 + }, + { + "epoch": 2.625742021090858, + "grad_norm": 0.30908375983860653, + "learning_rate": 4.663836096554869e-06, + "loss": 2.4618, + "step": 56398 + }, + { + "epoch": 2.6257885792769513, + "grad_norm": 0.3209953080824083, + "learning_rate": 4.662693818202396e-06, + "loss": 2.582, + "step": 56399 + }, + { + "epoch": 
2.6258351374630444, + "grad_norm": 0.2952342476420589, + "learning_rate": 4.661551672910452e-06, + "loss": 2.6442, + "step": 56400 + }, + { + "epoch": 2.6258816956491375, + "grad_norm": 0.29013334275686387, + "learning_rate": 4.660409660682385e-06, + "loss": 2.7138, + "step": 56401 + }, + { + "epoch": 2.6259282538352307, + "grad_norm": 0.3000616604206299, + "learning_rate": 4.659267781521559e-06, + "loss": 2.5415, + "step": 56402 + }, + { + "epoch": 2.6259748120213238, + "grad_norm": 0.2961959456800308, + "learning_rate": 4.658126035431309e-06, + "loss": 2.6234, + "step": 56403 + }, + { + "epoch": 2.626021370207417, + "grad_norm": 0.2859272524186215, + "learning_rate": 4.6569844224149895e-06, + "loss": 2.5976, + "step": 56404 + }, + { + "epoch": 2.6260679283935096, + "grad_norm": 0.31163646206707557, + "learning_rate": 4.655842942475952e-06, + "loss": 2.5624, + "step": 56405 + }, + { + "epoch": 2.6261144865796027, + "grad_norm": 0.29843049661967774, + "learning_rate": 4.65470159561755e-06, + "loss": 2.5984, + "step": 56406 + }, + { + "epoch": 2.626161044765696, + "grad_norm": 0.2951517956967081, + "learning_rate": 4.653560381843131e-06, + "loss": 2.6528, + "step": 56407 + }, + { + "epoch": 2.626207602951789, + "grad_norm": 0.3244097293678699, + "learning_rate": 4.6524193011560524e-06, + "loss": 2.6291, + "step": 56408 + }, + { + "epoch": 2.626254161137882, + "grad_norm": 0.3092266448051265, + "learning_rate": 4.65127835355964e-06, + "loss": 2.5905, + "step": 56409 + }, + { + "epoch": 2.626300719323975, + "grad_norm": 0.2958798497744955, + "learning_rate": 4.650137539057275e-06, + "loss": 2.6057, + "step": 56410 + }, + { + "epoch": 2.6263472775100682, + "grad_norm": 0.3055658774825523, + "learning_rate": 4.648996857652282e-06, + "loss": 2.6659, + "step": 56411 + }, + { + "epoch": 2.6263938356961614, + "grad_norm": 0.29690879570924644, + "learning_rate": 4.647856309348014e-06, + "loss": 2.5348, + "step": 56412 + }, + { + "epoch": 2.6264403938822545, + "grad_norm": 0.2985780188942659, + "learning_rate": 4.6467158941478184e-06, + "loss": 2.6556, + "step": 56413 + }, + { + "epoch": 2.6264869520683476, + "grad_norm": 0.29349595428308317, + "learning_rate": 4.645575612055042e-06, + "loss": 2.5799, + "step": 56414 + }, + { + "epoch": 2.6265335102544407, + "grad_norm": 0.30176054967775096, + "learning_rate": 4.644435463073049e-06, + "loss": 2.6548, + "step": 56415 + }, + { + "epoch": 2.626580068440534, + "grad_norm": 0.2987849277656631, + "learning_rate": 4.643295447205154e-06, + "loss": 2.5345, + "step": 56416 + }, + { + "epoch": 2.6266266266266265, + "grad_norm": 0.3110154698884466, + "learning_rate": 4.64215556445472e-06, + "loss": 2.5868, + "step": 56417 + }, + { + "epoch": 2.6266731848127196, + "grad_norm": 0.291163846242682, + "learning_rate": 4.641015814825095e-06, + "loss": 2.5978, + "step": 56418 + }, + { + "epoch": 2.6267197429988127, + "grad_norm": 0.30946073612630864, + "learning_rate": 4.639876198319615e-06, + "loss": 2.6136, + "step": 56419 + }, + { + "epoch": 2.626766301184906, + "grad_norm": 0.29149840221829204, + "learning_rate": 4.638736714941633e-06, + "loss": 2.6784, + "step": 56420 + }, + { + "epoch": 2.626812859370999, + "grad_norm": 0.2902978277216422, + "learning_rate": 4.6375973646944955e-06, + "loss": 2.6355, + "step": 56421 + }, + { + "epoch": 2.626859417557092, + "grad_norm": 0.29345849257347956, + "learning_rate": 4.636458147581524e-06, + "loss": 2.7376, + "step": 56422 + }, + { + "epoch": 2.626905975743185, + "grad_norm": 0.2900947610938251, + "learning_rate": 
4.6353190636060975e-06, + "loss": 2.6357, + "step": 56423 + }, + { + "epoch": 2.626952533929278, + "grad_norm": 0.29596991756912827, + "learning_rate": 4.6341801127715304e-06, + "loss": 2.6045, + "step": 56424 + }, + { + "epoch": 2.626999092115371, + "grad_norm": 0.29105900465021756, + "learning_rate": 4.633041295081175e-06, + "loss": 2.621, + "step": 56425 + }, + { + "epoch": 2.627045650301464, + "grad_norm": 0.29148890180447373, + "learning_rate": 4.6319026105383746e-06, + "loss": 2.6736, + "step": 56426 + }, + { + "epoch": 2.627092208487557, + "grad_norm": 0.28736125636937904, + "learning_rate": 4.630764059146475e-06, + "loss": 2.7011, + "step": 56427 + }, + { + "epoch": 2.6271387666736503, + "grad_norm": 0.2892371249375108, + "learning_rate": 4.629625640908819e-06, + "loss": 2.5421, + "step": 56428 + }, + { + "epoch": 2.6271853248597434, + "grad_norm": 0.31058391792237916, + "learning_rate": 4.62848735582872e-06, + "loss": 2.6073, + "step": 56429 + }, + { + "epoch": 2.6272318830458365, + "grad_norm": 0.2996805392136349, + "learning_rate": 4.627349203909565e-06, + "loss": 2.6236, + "step": 56430 + }, + { + "epoch": 2.6272784412319297, + "grad_norm": 0.2842189784584992, + "learning_rate": 4.626211185154656e-06, + "loss": 2.6179, + "step": 56431 + }, + { + "epoch": 2.6273249994180228, + "grad_norm": 0.3001628648839245, + "learning_rate": 4.625073299567351e-06, + "loss": 2.6472, + "step": 56432 + }, + { + "epoch": 2.627371557604116, + "grad_norm": 0.30320556297180706, + "learning_rate": 4.623935547150981e-06, + "loss": 2.5966, + "step": 56433 + }, + { + "epoch": 2.627418115790209, + "grad_norm": 0.28003628406215914, + "learning_rate": 4.622797927908906e-06, + "loss": 2.6449, + "step": 56434 + }, + { + "epoch": 2.627464673976302, + "grad_norm": 0.2950475531730232, + "learning_rate": 4.621660441844422e-06, + "loss": 2.5312, + "step": 56435 + }, + { + "epoch": 2.627511232162395, + "grad_norm": 0.2903098964826273, + "learning_rate": 4.620523088960921e-06, + "loss": 2.5846, + "step": 56436 + }, + { + "epoch": 2.627557790348488, + "grad_norm": 0.2865330131835363, + "learning_rate": 4.6193858692617e-06, + "loss": 2.5847, + "step": 56437 + }, + { + "epoch": 2.627604348534581, + "grad_norm": 0.2974156976490814, + "learning_rate": 4.618248782750112e-06, + "loss": 2.6652, + "step": 56438 + }, + { + "epoch": 2.627650906720674, + "grad_norm": 0.2952251377941759, + "learning_rate": 4.617111829429494e-06, + "loss": 2.6535, + "step": 56439 + }, + { + "epoch": 2.6276974649067673, + "grad_norm": 0.2926376427330363, + "learning_rate": 4.615975009303175e-06, + "loss": 2.539, + "step": 56440 + }, + { + "epoch": 2.6277440230928604, + "grad_norm": 0.2860906839856015, + "learning_rate": 4.614838322374515e-06, + "loss": 2.6955, + "step": 56441 + }, + { + "epoch": 2.6277905812789535, + "grad_norm": 0.292536440306074, + "learning_rate": 4.613701768646811e-06, + "loss": 2.6591, + "step": 56442 + }, + { + "epoch": 2.6278371394650466, + "grad_norm": 0.2886316363315344, + "learning_rate": 4.612565348123443e-06, + "loss": 2.547, + "step": 56443 + }, + { + "epoch": 2.6278836976511393, + "grad_norm": 0.28844282206341654, + "learning_rate": 4.611429060807715e-06, + "loss": 2.6377, + "step": 56444 + }, + { + "epoch": 2.6279302558372324, + "grad_norm": 0.2835107110109584, + "learning_rate": 4.610292906702968e-06, + "loss": 2.5375, + "step": 56445 + }, + { + "epoch": 2.6279768140233255, + "grad_norm": 0.2999911028934296, + "learning_rate": 4.6091568858125435e-06, + "loss": 2.7227, + "step": 56446 + }, + { + "epoch": 
2.6280233722094186, + "grad_norm": 0.2993629035261697, + "learning_rate": 4.608020998139767e-06, + "loss": 2.6703, + "step": 56447 + }, + { + "epoch": 2.6280699303955117, + "grad_norm": 0.2944300902712365, + "learning_rate": 4.60688524368798e-06, + "loss": 2.7067, + "step": 56448 + }, + { + "epoch": 2.628116488581605, + "grad_norm": 0.3239235047301254, + "learning_rate": 4.60574962246052e-06, + "loss": 2.5667, + "step": 56449 + }, + { + "epoch": 2.628163046767698, + "grad_norm": 0.3096584631643619, + "learning_rate": 4.604614134460705e-06, + "loss": 2.6101, + "step": 56450 + }, + { + "epoch": 2.628209604953791, + "grad_norm": 0.3025175237295945, + "learning_rate": 4.603478779691878e-06, + "loss": 2.5251, + "step": 56451 + }, + { + "epoch": 2.628256163139884, + "grad_norm": 0.2878826511865995, + "learning_rate": 4.602343558157362e-06, + "loss": 2.4885, + "step": 56452 + }, + { + "epoch": 2.6283027213259773, + "grad_norm": 0.2861757827753631, + "learning_rate": 4.601208469860502e-06, + "loss": 2.6516, + "step": 56453 + }, + { + "epoch": 2.6283492795120704, + "grad_norm": 0.31505638410627346, + "learning_rate": 4.600073514804632e-06, + "loss": 2.7165, + "step": 56454 + }, + { + "epoch": 2.6283958376981635, + "grad_norm": 0.2881018871587323, + "learning_rate": 4.598938692993049e-06, + "loss": 2.6131, + "step": 56455 + }, + { + "epoch": 2.628442395884256, + "grad_norm": 0.2772516192239985, + "learning_rate": 4.597804004429129e-06, + "loss": 2.5193, + "step": 56456 + }, + { + "epoch": 2.6284889540703493, + "grad_norm": 0.3065509562877177, + "learning_rate": 4.596669449116176e-06, + "loss": 2.5509, + "step": 56457 + }, + { + "epoch": 2.6285355122564424, + "grad_norm": 0.2865500404377883, + "learning_rate": 4.595535027057518e-06, + "loss": 2.6419, + "step": 56458 + }, + { + "epoch": 2.6285820704425356, + "grad_norm": 0.2922479174389369, + "learning_rate": 4.594400738256499e-06, + "loss": 2.5595, + "step": 56459 + }, + { + "epoch": 2.6286286286286287, + "grad_norm": 0.3032128069398352, + "learning_rate": 4.593266582716438e-06, + "loss": 2.605, + "step": 56460 + }, + { + "epoch": 2.628675186814722, + "grad_norm": 0.2888095802172057, + "learning_rate": 4.5921325604406655e-06, + "loss": 2.5591, + "step": 56461 + }, + { + "epoch": 2.628721745000815, + "grad_norm": 0.29310043549987996, + "learning_rate": 4.590998671432517e-06, + "loss": 2.5526, + "step": 56462 + }, + { + "epoch": 2.6287683031869076, + "grad_norm": 0.2970141789853631, + "learning_rate": 4.589864915695308e-06, + "loss": 2.7271, + "step": 56463 + }, + { + "epoch": 2.6288148613730007, + "grad_norm": 0.29840839570499017, + "learning_rate": 4.588731293232373e-06, + "loss": 2.5827, + "step": 56464 + }, + { + "epoch": 2.628861419559094, + "grad_norm": 0.3005076081517845, + "learning_rate": 4.587597804047033e-06, + "loss": 2.6581, + "step": 56465 + }, + { + "epoch": 2.628907977745187, + "grad_norm": 0.29846911606944854, + "learning_rate": 4.586464448142624e-06, + "loss": 2.5733, + "step": 56466 + }, + { + "epoch": 2.62895453593128, + "grad_norm": 0.28964481482153515, + "learning_rate": 4.585331225522477e-06, + "loss": 2.5854, + "step": 56467 + }, + { + "epoch": 2.629001094117373, + "grad_norm": 0.3010898136951379, + "learning_rate": 4.584198136189888e-06, + "loss": 2.6704, + "step": 56468 + }, + { + "epoch": 2.6290476523034663, + "grad_norm": 0.2941861793489831, + "learning_rate": 4.583065180148227e-06, + "loss": 2.5934, + "step": 56469 + }, + { + "epoch": 2.6290942104895594, + "grad_norm": 0.2895392644411775, + "learning_rate": 
4.581932357400781e-06, + "loss": 2.673, + "step": 56470 + }, + { + "epoch": 2.6291407686756525, + "grad_norm": 0.3030533586691161, + "learning_rate": 4.580799667950891e-06, + "loss": 2.6007, + "step": 56471 + }, + { + "epoch": 2.6291873268617456, + "grad_norm": 0.2965631849952996, + "learning_rate": 4.579667111801878e-06, + "loss": 2.6895, + "step": 56472 + }, + { + "epoch": 2.6292338850478387, + "grad_norm": 0.30075334910773543, + "learning_rate": 4.578534688957065e-06, + "loss": 2.5554, + "step": 56473 + }, + { + "epoch": 2.629280443233932, + "grad_norm": 0.29410623566369065, + "learning_rate": 4.577402399419783e-06, + "loss": 2.6807, + "step": 56474 + }, + { + "epoch": 2.6293270014200245, + "grad_norm": 0.29548001517103906, + "learning_rate": 4.576270243193359e-06, + "loss": 2.6208, + "step": 56475 + }, + { + "epoch": 2.6293735596061176, + "grad_norm": 0.3131842528503858, + "learning_rate": 4.5751382202810944e-06, + "loss": 2.7593, + "step": 56476 + }, + { + "epoch": 2.6294201177922107, + "grad_norm": 0.29868419813577424, + "learning_rate": 4.574006330686326e-06, + "loss": 2.6295, + "step": 56477 + }, + { + "epoch": 2.629466675978304, + "grad_norm": 0.29788645989251894, + "learning_rate": 4.572874574412372e-06, + "loss": 2.5876, + "step": 56478 + }, + { + "epoch": 2.629513234164397, + "grad_norm": 0.29484226986648404, + "learning_rate": 4.57174295146256e-06, + "loss": 2.616, + "step": 56479 + }, + { + "epoch": 2.62955979235049, + "grad_norm": 0.2991426903894371, + "learning_rate": 4.570611461840213e-06, + "loss": 2.5806, + "step": 56480 + }, + { + "epoch": 2.629606350536583, + "grad_norm": 0.2906652985861864, + "learning_rate": 4.569480105548629e-06, + "loss": 2.6167, + "step": 56481 + }, + { + "epoch": 2.6296529087226763, + "grad_norm": 0.2944119047504898, + "learning_rate": 4.568348882591161e-06, + "loss": 2.5948, + "step": 56482 + }, + { + "epoch": 2.629699466908769, + "grad_norm": 0.29030984135182075, + "learning_rate": 4.5672177929710956e-06, + "loss": 2.5792, + "step": 56483 + }, + { + "epoch": 2.629746025094862, + "grad_norm": 0.31193363576290034, + "learning_rate": 4.566086836691791e-06, + "loss": 2.662, + "step": 56484 + }, + { + "epoch": 2.629792583280955, + "grad_norm": 0.2878272046212779, + "learning_rate": 4.5649560137565325e-06, + "loss": 2.5446, + "step": 56485 + }, + { + "epoch": 2.6298391414670483, + "grad_norm": 0.29933656275231074, + "learning_rate": 4.563825324168652e-06, + "loss": 2.6955, + "step": 56486 + }, + { + "epoch": 2.6298856996531415, + "grad_norm": 0.2933372971383049, + "learning_rate": 4.562694767931475e-06, + "loss": 2.6435, + "step": 56487 + }, + { + "epoch": 2.6299322578392346, + "grad_norm": 0.29676018586521075, + "learning_rate": 4.561564345048313e-06, + "loss": 2.517, + "step": 56488 + }, + { + "epoch": 2.6299788160253277, + "grad_norm": 0.29181588449354146, + "learning_rate": 4.560434055522478e-06, + "loss": 2.7082, + "step": 56489 + }, + { + "epoch": 2.630025374211421, + "grad_norm": 0.3048672133399673, + "learning_rate": 4.5593038993572924e-06, + "loss": 2.6783, + "step": 56490 + }, + { + "epoch": 2.630071932397514, + "grad_norm": 0.2963482514296248, + "learning_rate": 4.558173876556077e-06, + "loss": 2.5507, + "step": 56491 + }, + { + "epoch": 2.630118490583607, + "grad_norm": 0.29253301788236763, + "learning_rate": 4.557043987122139e-06, + "loss": 2.572, + "step": 56492 + }, + { + "epoch": 2.6301650487697, + "grad_norm": 0.30381818137407346, + "learning_rate": 4.555914231058811e-06, + "loss": 2.5438, + "step": 56493 + }, + { + "epoch": 
2.6302116069557933, + "grad_norm": 0.2961509175254036, + "learning_rate": 4.554784608369378e-06, + "loss": 2.7133, + "step": 56494 + }, + { + "epoch": 2.630258165141886, + "grad_norm": 0.2995553547652142, + "learning_rate": 4.553655119057193e-06, + "loss": 2.7111, + "step": 56495 + }, + { + "epoch": 2.630304723327979, + "grad_norm": 0.30512574843188367, + "learning_rate": 4.552525763125537e-06, + "loss": 2.553, + "step": 56496 + }, + { + "epoch": 2.630351281514072, + "grad_norm": 0.3147206974753794, + "learning_rate": 4.551396540577757e-06, + "loss": 2.5944, + "step": 56497 + }, + { + "epoch": 2.6303978397001653, + "grad_norm": 0.2806213568239046, + "learning_rate": 4.550267451417145e-06, + "loss": 2.5217, + "step": 56498 + }, + { + "epoch": 2.6304443978862584, + "grad_norm": 0.2891371174469817, + "learning_rate": 4.54913849564702e-06, + "loss": 2.6537, + "step": 56499 + }, + { + "epoch": 2.6304909560723515, + "grad_norm": 0.2989860832352746, + "learning_rate": 4.548009673270692e-06, + "loss": 2.5288, + "step": 56500 + }, + { + "epoch": 2.6305375142584446, + "grad_norm": 0.2919870059068517, + "learning_rate": 4.546880984291479e-06, + "loss": 2.5796, + "step": 56501 + }, + { + "epoch": 2.6305840724445373, + "grad_norm": 0.2879975359367116, + "learning_rate": 4.5457524287127015e-06, + "loss": 2.5255, + "step": 56502 + }, + { + "epoch": 2.6306306306306304, + "grad_norm": 0.3014033293442714, + "learning_rate": 4.544624006537656e-06, + "loss": 2.5969, + "step": 56503 + }, + { + "epoch": 2.6306771888167235, + "grad_norm": 0.3011166025345367, + "learning_rate": 4.5434957177696576e-06, + "loss": 2.5634, + "step": 56504 + }, + { + "epoch": 2.6307237470028166, + "grad_norm": 0.29731577616140153, + "learning_rate": 4.542367562412025e-06, + "loss": 2.7068, + "step": 56505 + }, + { + "epoch": 2.6307703051889098, + "grad_norm": 0.3074588766446919, + "learning_rate": 4.541239540468073e-06, + "loss": 2.6568, + "step": 56506 + }, + { + "epoch": 2.630816863375003, + "grad_norm": 0.2815065971503896, + "learning_rate": 4.540111651941087e-06, + "loss": 2.5257, + "step": 56507 + }, + { + "epoch": 2.630863421561096, + "grad_norm": 0.3092383063446574, + "learning_rate": 4.538983896834409e-06, + "loss": 2.6268, + "step": 56508 + }, + { + "epoch": 2.630909979747189, + "grad_norm": 0.2991318428506254, + "learning_rate": 4.53785627515132e-06, + "loss": 2.4966, + "step": 56509 + }, + { + "epoch": 2.630956537933282, + "grad_norm": 0.28031237682715326, + "learning_rate": 4.5367287868951615e-06, + "loss": 2.6463, + "step": 56510 + }, + { + "epoch": 2.6310030961193753, + "grad_norm": 0.32038227149365345, + "learning_rate": 4.535601432069214e-06, + "loss": 2.6784, + "step": 56511 + }, + { + "epoch": 2.6310496543054684, + "grad_norm": 0.2870941368046829, + "learning_rate": 4.534474210676804e-06, + "loss": 2.6214, + "step": 56512 + }, + { + "epoch": 2.6310962124915616, + "grad_norm": 0.29537294944344583, + "learning_rate": 4.5333471227212264e-06, + "loss": 2.603, + "step": 56513 + }, + { + "epoch": 2.6311427706776547, + "grad_norm": 0.2938251158384318, + "learning_rate": 4.5322201682057975e-06, + "loss": 2.6772, + "step": 56514 + }, + { + "epoch": 2.6311893288637473, + "grad_norm": 0.29320453551227377, + "learning_rate": 4.53109334713383e-06, + "loss": 2.6498, + "step": 56515 + }, + { + "epoch": 2.6312358870498405, + "grad_norm": 0.2726008034672029, + "learning_rate": 4.529966659508616e-06, + "loss": 2.631, + "step": 56516 + }, + { + "epoch": 2.6312824452359336, + "grad_norm": 0.2998716934201315, + "learning_rate": 
4.528840105333476e-06, + "loss": 2.5945, + "step": 56517 + }, + { + "epoch": 2.6313290034220267, + "grad_norm": 0.28748812373658783, + "learning_rate": 4.527713684611706e-06, + "loss": 2.5343, + "step": 56518 + }, + { + "epoch": 2.63137556160812, + "grad_norm": 0.3035546678089425, + "learning_rate": 4.526587397346615e-06, + "loss": 2.6271, + "step": 56519 + }, + { + "epoch": 2.631422119794213, + "grad_norm": 0.3066961382498231, + "learning_rate": 4.525461243541513e-06, + "loss": 2.5579, + "step": 56520 + }, + { + "epoch": 2.631468677980306, + "grad_norm": 0.2743742382943102, + "learning_rate": 4.5243352231997105e-06, + "loss": 2.6594, + "step": 56521 + }, + { + "epoch": 2.6315152361663987, + "grad_norm": 0.296267784157569, + "learning_rate": 4.523209336324486e-06, + "loss": 2.6728, + "step": 56522 + }, + { + "epoch": 2.631561794352492, + "grad_norm": 0.29919810355996657, + "learning_rate": 4.522083582919178e-06, + "loss": 2.6121, + "step": 56523 + }, + { + "epoch": 2.631608352538585, + "grad_norm": 0.3175180571700967, + "learning_rate": 4.5209579629870655e-06, + "loss": 2.6907, + "step": 56524 + }, + { + "epoch": 2.631654910724678, + "grad_norm": 0.29503794688538004, + "learning_rate": 4.519832476531466e-06, + "loss": 2.6056, + "step": 56525 + }, + { + "epoch": 2.631701468910771, + "grad_norm": 0.2885529013042755, + "learning_rate": 4.518707123555671e-06, + "loss": 2.6652, + "step": 56526 + }, + { + "epoch": 2.6317480270968643, + "grad_norm": 0.29935495629303893, + "learning_rate": 4.517581904062995e-06, + "loss": 2.6306, + "step": 56527 + }, + { + "epoch": 2.6317945852829574, + "grad_norm": 0.29299723372759595, + "learning_rate": 4.516456818056742e-06, + "loss": 2.6354, + "step": 56528 + }, + { + "epoch": 2.6318411434690505, + "grad_norm": 0.3098037294355823, + "learning_rate": 4.515331865540201e-06, + "loss": 2.7356, + "step": 56529 + }, + { + "epoch": 2.6318877016551436, + "grad_norm": 0.30268034883423806, + "learning_rate": 4.514207046516678e-06, + "loss": 2.5516, + "step": 56530 + }, + { + "epoch": 2.6319342598412367, + "grad_norm": 0.2777857350751121, + "learning_rate": 4.513082360989474e-06, + "loss": 2.4778, + "step": 56531 + }, + { + "epoch": 2.63198081802733, + "grad_norm": 0.30198518670174096, + "learning_rate": 4.511957808961892e-06, + "loss": 2.6026, + "step": 56532 + }, + { + "epoch": 2.632027376213423, + "grad_norm": 0.31297247607497697, + "learning_rate": 4.510833390437236e-06, + "loss": 2.6532, + "step": 56533 + }, + { + "epoch": 2.6320739343995156, + "grad_norm": 0.31720464056949105, + "learning_rate": 4.5097091054188075e-06, + "loss": 2.6142, + "step": 56534 + }, + { + "epoch": 2.6321204925856088, + "grad_norm": 0.2926995344386493, + "learning_rate": 4.508584953909889e-06, + "loss": 2.6688, + "step": 56535 + }, + { + "epoch": 2.632167050771702, + "grad_norm": 0.31271680351309666, + "learning_rate": 4.5074609359138044e-06, + "loss": 2.7993, + "step": 56536 + }, + { + "epoch": 2.632213608957795, + "grad_norm": 0.2948568757863247, + "learning_rate": 4.506337051433834e-06, + "loss": 2.48, + "step": 56537 + }, + { + "epoch": 2.632260167143888, + "grad_norm": 0.2960834085005112, + "learning_rate": 4.505213300473282e-06, + "loss": 2.5795, + "step": 56538 + }, + { + "epoch": 2.6323067253299812, + "grad_norm": 0.2943938940807217, + "learning_rate": 4.504089683035451e-06, + "loss": 2.5955, + "step": 56539 + }, + { + "epoch": 2.6323532835160743, + "grad_norm": 0.2962116565092645, + "learning_rate": 4.502966199123626e-06, + "loss": 2.6279, + "step": 56540 + }, + { + "epoch": 
2.632399841702167, + "grad_norm": 0.29796307755523044, + "learning_rate": 4.501842848741128e-06, + "loss": 2.6398, + "step": 56541 + }, + { + "epoch": 2.63244639988826, + "grad_norm": 0.2984452527904921, + "learning_rate": 4.500719631891232e-06, + "loss": 2.6587, + "step": 56542 + }, + { + "epoch": 2.6324929580743532, + "grad_norm": 0.28792127637180054, + "learning_rate": 4.499596548577234e-06, + "loss": 2.5361, + "step": 56543 + }, + { + "epoch": 2.6325395162604464, + "grad_norm": 0.2887608803221406, + "learning_rate": 4.498473598802444e-06, + "loss": 2.5183, + "step": 56544 + }, + { + "epoch": 2.6325860744465395, + "grad_norm": 0.31393411909795493, + "learning_rate": 4.497350782570148e-06, + "loss": 2.6025, + "step": 56545 + }, + { + "epoch": 2.6326326326326326, + "grad_norm": 0.2813741417067709, + "learning_rate": 4.496228099883648e-06, + "loss": 2.5201, + "step": 56546 + }, + { + "epoch": 2.6326791908187257, + "grad_norm": 0.30063822169082977, + "learning_rate": 4.495105550746242e-06, + "loss": 2.6066, + "step": 56547 + }, + { + "epoch": 2.632725749004819, + "grad_norm": 0.2868834258996573, + "learning_rate": 4.4939831351612005e-06, + "loss": 2.5767, + "step": 56548 + }, + { + "epoch": 2.632772307190912, + "grad_norm": 0.2852957114439169, + "learning_rate": 4.4928608531318525e-06, + "loss": 2.6511, + "step": 56549 + }, + { + "epoch": 2.632818865377005, + "grad_norm": 0.29563402234045555, + "learning_rate": 4.491738704661469e-06, + "loss": 2.5723, + "step": 56550 + }, + { + "epoch": 2.632865423563098, + "grad_norm": 0.3133568449258468, + "learning_rate": 4.4906166897533466e-06, + "loss": 2.6032, + "step": 56551 + }, + { + "epoch": 2.6329119817491913, + "grad_norm": 0.3121879718751892, + "learning_rate": 4.489494808410783e-06, + "loss": 2.5786, + "step": 56552 + }, + { + "epoch": 2.6329585399352844, + "grad_norm": 0.30176378770736123, + "learning_rate": 4.488373060637063e-06, + "loss": 2.6003, + "step": 56553 + }, + { + "epoch": 2.633005098121377, + "grad_norm": 0.30198289960046504, + "learning_rate": 4.487251446435497e-06, + "loss": 2.6374, + "step": 56554 + }, + { + "epoch": 2.63305165630747, + "grad_norm": 0.30909244241590356, + "learning_rate": 4.48612996580935e-06, + "loss": 2.6125, + "step": 56555 + }, + { + "epoch": 2.6330982144935633, + "grad_norm": 0.2879669919647645, + "learning_rate": 4.4850086187619374e-06, + "loss": 2.6435, + "step": 56556 + }, + { + "epoch": 2.6331447726796564, + "grad_norm": 0.30055712371329313, + "learning_rate": 4.483887405296538e-06, + "loss": 2.7048, + "step": 56557 + }, + { + "epoch": 2.6331913308657495, + "grad_norm": 0.3024364161127469, + "learning_rate": 4.482766325416443e-06, + "loss": 2.6888, + "step": 56558 + }, + { + "epoch": 2.6332378890518426, + "grad_norm": 0.2989254264925218, + "learning_rate": 4.481645379124943e-06, + "loss": 2.5997, + "step": 56559 + }, + { + "epoch": 2.6332844472379358, + "grad_norm": 0.2913985904253972, + "learning_rate": 4.480524566425343e-06, + "loss": 2.6505, + "step": 56560 + }, + { + "epoch": 2.6333310054240284, + "grad_norm": 0.3031249529078565, + "learning_rate": 4.4794038873208985e-06, + "loss": 2.6397, + "step": 56561 + }, + { + "epoch": 2.6333775636101215, + "grad_norm": 0.3056465230787653, + "learning_rate": 4.478283341814938e-06, + "loss": 2.6399, + "step": 56562 + }, + { + "epoch": 2.6334241217962147, + "grad_norm": 0.3069717501561735, + "learning_rate": 4.477162929910722e-06, + "loss": 2.5668, + "step": 56563 + }, + { + "epoch": 2.6334706799823078, + "grad_norm": 0.30172167679437356, + "learning_rate": 
4.476042651611545e-06, + "loss": 2.6098, + "step": 56564 + }, + { + "epoch": 2.633517238168401, + "grad_norm": 0.30475040139982823, + "learning_rate": 4.474922506920698e-06, + "loss": 2.6599, + "step": 56565 + }, + { + "epoch": 2.633563796354494, + "grad_norm": 0.29566988699253793, + "learning_rate": 4.473802495841473e-06, + "loss": 2.6717, + "step": 56566 + }, + { + "epoch": 2.633610354540587, + "grad_norm": 0.3050219386129598, + "learning_rate": 4.472682618377161e-06, + "loss": 2.7187, + "step": 56567 + }, + { + "epoch": 2.6336569127266802, + "grad_norm": 0.2900965779831297, + "learning_rate": 4.4715628745310154e-06, + "loss": 2.6846, + "step": 56568 + }, + { + "epoch": 2.6337034709127733, + "grad_norm": 0.2977459337463511, + "learning_rate": 4.470443264306373e-06, + "loss": 2.5988, + "step": 56569 + }, + { + "epoch": 2.6337500290988665, + "grad_norm": 0.28520968727915574, + "learning_rate": 4.469323787706481e-06, + "loss": 2.6027, + "step": 56570 + }, + { + "epoch": 2.6337965872849596, + "grad_norm": 0.2903414499152575, + "learning_rate": 4.468204444734641e-06, + "loss": 2.597, + "step": 56571 + }, + { + "epoch": 2.6338431454710527, + "grad_norm": 0.30077372874120606, + "learning_rate": 4.467085235394136e-06, + "loss": 2.5846, + "step": 56572 + }, + { + "epoch": 2.6338897036571454, + "grad_norm": 0.2987706917095536, + "learning_rate": 4.4659661596882505e-06, + "loss": 2.6443, + "step": 56573 + }, + { + "epoch": 2.6339362618432385, + "grad_norm": 0.3007216199859132, + "learning_rate": 4.464847217620266e-06, + "loss": 2.6078, + "step": 56574 + }, + { + "epoch": 2.6339828200293316, + "grad_norm": 0.2937178042468604, + "learning_rate": 4.4637284091934735e-06, + "loss": 2.6494, + "step": 56575 + }, + { + "epoch": 2.6340293782154247, + "grad_norm": 0.29875886474202984, + "learning_rate": 4.462609734411149e-06, + "loss": 2.7337, + "step": 56576 + }, + { + "epoch": 2.634075936401518, + "grad_norm": 0.3195435344402613, + "learning_rate": 4.461491193276579e-06, + "loss": 2.6709, + "step": 56577 + }, + { + "epoch": 2.634122494587611, + "grad_norm": 0.310867530189957, + "learning_rate": 4.4603727857930434e-06, + "loss": 2.6555, + "step": 56578 + }, + { + "epoch": 2.634169052773704, + "grad_norm": 0.2880322676606612, + "learning_rate": 4.459254511963828e-06, + "loss": 2.6918, + "step": 56579 + }, + { + "epoch": 2.634215610959797, + "grad_norm": 0.29123094832651136, + "learning_rate": 4.458136371792226e-06, + "loss": 2.5843, + "step": 56580 + }, + { + "epoch": 2.63426216914589, + "grad_norm": 0.2882699254700249, + "learning_rate": 4.457018365281485e-06, + "loss": 2.6186, + "step": 56581 + }, + { + "epoch": 2.634308727331983, + "grad_norm": 0.30831712211963047, + "learning_rate": 4.455900492434928e-06, + "loss": 2.65, + "step": 56582 + }, + { + "epoch": 2.634355285518076, + "grad_norm": 0.2990205088101497, + "learning_rate": 4.4547827532558096e-06, + "loss": 2.671, + "step": 56583 + }, + { + "epoch": 2.634401843704169, + "grad_norm": 0.29492891971442137, + "learning_rate": 4.453665147747415e-06, + "loss": 2.453, + "step": 56584 + }, + { + "epoch": 2.6344484018902623, + "grad_norm": 0.3140402834828089, + "learning_rate": 4.4525476759130265e-06, + "loss": 2.5627, + "step": 56585 + }, + { + "epoch": 2.6344949600763554, + "grad_norm": 0.30311711813294845, + "learning_rate": 4.451430337755924e-06, + "loss": 2.6603, + "step": 56586 + }, + { + "epoch": 2.6345415182624485, + "grad_norm": 0.30506742474826726, + "learning_rate": 4.4503131332793825e-06, + "loss": 2.6173, + "step": 56587 + }, + { + "epoch": 
2.6345880764485416, + "grad_norm": 0.29388288232146303, + "learning_rate": 4.449196062486693e-06, + "loss": 2.6603, + "step": 56588 + }, + { + "epoch": 2.6346346346346348, + "grad_norm": 0.302317631285704, + "learning_rate": 4.448079125381122e-06, + "loss": 2.6407, + "step": 56589 + }, + { + "epoch": 2.634681192820728, + "grad_norm": 0.29517992610046895, + "learning_rate": 4.446962321965947e-06, + "loss": 2.6447, + "step": 56590 + }, + { + "epoch": 2.634727751006821, + "grad_norm": 0.283862444883008, + "learning_rate": 4.445845652244451e-06, + "loss": 2.6674, + "step": 56591 + }, + { + "epoch": 2.634774309192914, + "grad_norm": 0.30139876047955977, + "learning_rate": 4.4447291162199145e-06, + "loss": 2.6501, + "step": 56592 + }, + { + "epoch": 2.634820867379007, + "grad_norm": 0.3012785075147584, + "learning_rate": 4.443612713895612e-06, + "loss": 2.5393, + "step": 56593 + }, + { + "epoch": 2.6348674255651, + "grad_norm": 0.2844748830687264, + "learning_rate": 4.442496445274807e-06, + "loss": 2.665, + "step": 56594 + }, + { + "epoch": 2.634913983751193, + "grad_norm": 0.30218904149469056, + "learning_rate": 4.441380310360799e-06, + "loss": 2.5454, + "step": 56595 + }, + { + "epoch": 2.634960541937286, + "grad_norm": 0.3100475188149252, + "learning_rate": 4.440264309156844e-06, + "loss": 2.5449, + "step": 56596 + }, + { + "epoch": 2.6350071001233792, + "grad_norm": 0.30445927630509334, + "learning_rate": 4.43914844166623e-06, + "loss": 2.5359, + "step": 56597 + }, + { + "epoch": 2.6350536583094724, + "grad_norm": 0.3006464182595574, + "learning_rate": 4.43803270789222e-06, + "loss": 2.6871, + "step": 56598 + }, + { + "epoch": 2.6351002164955655, + "grad_norm": 0.3096662274854822, + "learning_rate": 4.436917107838095e-06, + "loss": 2.7068, + "step": 56599 + }, + { + "epoch": 2.635146774681658, + "grad_norm": 0.305359032464777, + "learning_rate": 4.435801641507137e-06, + "loss": 2.6532, + "step": 56600 + }, + { + "epoch": 2.6351933328677513, + "grad_norm": 0.301149672068634, + "learning_rate": 4.4346863089026135e-06, + "loss": 2.6456, + "step": 56601 + }, + { + "epoch": 2.6352398910538444, + "grad_norm": 0.30029023863598614, + "learning_rate": 4.43357111002779e-06, + "loss": 2.5948, + "step": 56602 + }, + { + "epoch": 2.6352864492399375, + "grad_norm": 0.2989853018804223, + "learning_rate": 4.432456044885952e-06, + "loss": 2.5807, + "step": 56603 + }, + { + "epoch": 2.6353330074260306, + "grad_norm": 0.30128458562023686, + "learning_rate": 4.431341113480358e-06, + "loss": 2.5325, + "step": 56604 + }, + { + "epoch": 2.6353795656121237, + "grad_norm": 0.2906639924063104, + "learning_rate": 4.430226315814295e-06, + "loss": 2.6583, + "step": 56605 + }, + { + "epoch": 2.635426123798217, + "grad_norm": 0.2896166997847098, + "learning_rate": 4.429111651891033e-06, + "loss": 2.5609, + "step": 56606 + }, + { + "epoch": 2.63547268198431, + "grad_norm": 0.2870671183037337, + "learning_rate": 4.4279971217138224e-06, + "loss": 2.7298, + "step": 56607 + }, + { + "epoch": 2.635519240170403, + "grad_norm": 0.2794119152692468, + "learning_rate": 4.426882725285969e-06, + "loss": 2.5964, + "step": 56608 + }, + { + "epoch": 2.635565798356496, + "grad_norm": 0.30816836748883775, + "learning_rate": 4.4257684626107076e-06, + "loss": 2.6734, + "step": 56609 + }, + { + "epoch": 2.6356123565425893, + "grad_norm": 0.2873668568925604, + "learning_rate": 4.424654333691342e-06, + "loss": 2.6825, + "step": 56610 + }, + { + "epoch": 2.6356589147286824, + "grad_norm": 0.28460908247094774, + "learning_rate": 
4.423540338531118e-06, + "loss": 2.5594, + "step": 56611 + }, + { + "epoch": 2.635705472914775, + "grad_norm": 0.2881048798029201, + "learning_rate": 4.422426477133312e-06, + "loss": 2.5294, + "step": 56612 + }, + { + "epoch": 2.635752031100868, + "grad_norm": 0.2946980809596803, + "learning_rate": 4.4213127495011995e-06, + "loss": 2.6331, + "step": 56613 + }, + { + "epoch": 2.6357985892869613, + "grad_norm": 0.30597940164265314, + "learning_rate": 4.420199155638044e-06, + "loss": 2.6137, + "step": 56614 + }, + { + "epoch": 2.6358451474730544, + "grad_norm": 0.3153672920843325, + "learning_rate": 4.41908569554711e-06, + "loss": 2.8228, + "step": 56615 + }, + { + "epoch": 2.6358917056591475, + "grad_norm": 0.29953934287603334, + "learning_rate": 4.417972369231671e-06, + "loss": 2.7, + "step": 56616 + }, + { + "epoch": 2.6359382638452407, + "grad_norm": 0.2873703185551322, + "learning_rate": 4.416859176694987e-06, + "loss": 2.5882, + "step": 56617 + }, + { + "epoch": 2.6359848220313338, + "grad_norm": 0.30476807955798146, + "learning_rate": 4.415746117940334e-06, + "loss": 2.5519, + "step": 56618 + }, + { + "epoch": 2.636031380217427, + "grad_norm": 0.30610131086266745, + "learning_rate": 4.414633192970985e-06, + "loss": 2.6455, + "step": 56619 + }, + { + "epoch": 2.6360779384035196, + "grad_norm": 0.3000050300129922, + "learning_rate": 4.4135204017901775e-06, + "loss": 2.6468, + "step": 56620 + }, + { + "epoch": 2.6361244965896127, + "grad_norm": 0.30357449565449823, + "learning_rate": 4.4124077444012145e-06, + "loss": 2.5885, + "step": 56621 + }, + { + "epoch": 2.636171054775706, + "grad_norm": 0.29503570367344983, + "learning_rate": 4.411295220807326e-06, + "loss": 2.5197, + "step": 56622 + }, + { + "epoch": 2.636217612961799, + "grad_norm": 0.2889017550982528, + "learning_rate": 4.410182831011811e-06, + "loss": 2.6779, + "step": 56623 + }, + { + "epoch": 2.636264171147892, + "grad_norm": 0.30158076022478836, + "learning_rate": 4.409070575017915e-06, + "loss": 2.72, + "step": 56624 + }, + { + "epoch": 2.636310729333985, + "grad_norm": 0.3160645315147804, + "learning_rate": 4.407958452828898e-06, + "loss": 2.6847, + "step": 56625 + }, + { + "epoch": 2.6363572875200783, + "grad_norm": 0.31078047613588433, + "learning_rate": 4.406846464448034e-06, + "loss": 2.6904, + "step": 56626 + }, + { + "epoch": 2.6364038457061714, + "grad_norm": 0.3052045575692271, + "learning_rate": 4.405734609878587e-06, + "loss": 2.6964, + "step": 56627 + }, + { + "epoch": 2.6364504038922645, + "grad_norm": 0.28359557734528495, + "learning_rate": 4.4046228891238215e-06, + "loss": 2.579, + "step": 56628 + }, + { + "epoch": 2.6364969620783576, + "grad_norm": 0.28132020741572933, + "learning_rate": 4.403511302186991e-06, + "loss": 2.6833, + "step": 56629 + }, + { + "epoch": 2.6365435202644507, + "grad_norm": 0.2921279064161264, + "learning_rate": 4.402399849071359e-06, + "loss": 2.6238, + "step": 56630 + }, + { + "epoch": 2.636590078450544, + "grad_norm": 0.29880334865548003, + "learning_rate": 4.401288529780195e-06, + "loss": 2.7041, + "step": 56631 + }, + { + "epoch": 2.6366366366366365, + "grad_norm": 0.3056621774288718, + "learning_rate": 4.400177344316764e-06, + "loss": 2.6528, + "step": 56632 + }, + { + "epoch": 2.6366831948227296, + "grad_norm": 0.297911381767672, + "learning_rate": 4.399066292684301e-06, + "loss": 2.6439, + "step": 56633 + }, + { + "epoch": 2.6367297530088227, + "grad_norm": 0.30960301962829473, + "learning_rate": 4.397955374886103e-06, + "loss": 2.7037, + "step": 56634 + }, + { + "epoch": 
2.636776311194916, + "grad_norm": 0.29388732215160324, + "learning_rate": 4.396844590925397e-06, + "loss": 2.6433, + "step": 56635 + }, + { + "epoch": 2.636822869381009, + "grad_norm": 0.28671645821121905, + "learning_rate": 4.39573394080548e-06, + "loss": 2.5881, + "step": 56636 + }, + { + "epoch": 2.636869427567102, + "grad_norm": 0.2876126532634947, + "learning_rate": 4.394623424529582e-06, + "loss": 2.5678, + "step": 56637 + }, + { + "epoch": 2.636915985753195, + "grad_norm": 0.2996835599045036, + "learning_rate": 4.393513042100966e-06, + "loss": 2.5647, + "step": 56638 + }, + { + "epoch": 2.636962543939288, + "grad_norm": 0.288578557155395, + "learning_rate": 4.392402793522904e-06, + "loss": 2.6132, + "step": 56639 + }, + { + "epoch": 2.637009102125381, + "grad_norm": 0.28550501547662793, + "learning_rate": 4.391292678798642e-06, + "loss": 2.6938, + "step": 56640 + }, + { + "epoch": 2.637055660311474, + "grad_norm": 0.29598823142120484, + "learning_rate": 4.39018269793145e-06, + "loss": 2.5732, + "step": 56641 + }, + { + "epoch": 2.637102218497567, + "grad_norm": 0.29211868326975726, + "learning_rate": 4.389072850924575e-06, + "loss": 2.6226, + "step": 56642 + }, + { + "epoch": 2.6371487766836603, + "grad_norm": 0.3148727516433387, + "learning_rate": 4.38796313778127e-06, + "loss": 2.602, + "step": 56643 + }, + { + "epoch": 2.6371953348697534, + "grad_norm": 0.29762903014755326, + "learning_rate": 4.386853558504805e-06, + "loss": 2.7105, + "step": 56644 + }, + { + "epoch": 2.6372418930558466, + "grad_norm": 0.2945406875511121, + "learning_rate": 4.385744113098433e-06, + "loss": 2.6628, + "step": 56645 + }, + { + "epoch": 2.6372884512419397, + "grad_norm": 0.3004413810161489, + "learning_rate": 4.384634801565407e-06, + "loss": 2.6185, + "step": 56646 + }, + { + "epoch": 2.637335009428033, + "grad_norm": 0.32113284540605874, + "learning_rate": 4.383525623908991e-06, + "loss": 2.6788, + "step": 56647 + }, + { + "epoch": 2.637381567614126, + "grad_norm": 0.3136450208311155, + "learning_rate": 4.382416580132414e-06, + "loss": 2.6799, + "step": 56648 + }, + { + "epoch": 2.637428125800219, + "grad_norm": 0.2875026476332078, + "learning_rate": 4.381307670238971e-06, + "loss": 2.5265, + "step": 56649 + }, + { + "epoch": 2.637474683986312, + "grad_norm": 0.2970635787133466, + "learning_rate": 4.380198894231874e-06, + "loss": 2.7199, + "step": 56650 + }, + { + "epoch": 2.637521242172405, + "grad_norm": 0.2934659513923276, + "learning_rate": 4.3790902521144205e-06, + "loss": 2.6232, + "step": 56651 + }, + { + "epoch": 2.637567800358498, + "grad_norm": 0.30783627639904715, + "learning_rate": 4.377981743889837e-06, + "loss": 2.6558, + "step": 56652 + }, + { + "epoch": 2.637614358544591, + "grad_norm": 0.30002608552606447, + "learning_rate": 4.376873369561379e-06, + "loss": 2.6692, + "step": 56653 + }, + { + "epoch": 2.637660916730684, + "grad_norm": 0.3010703169972351, + "learning_rate": 4.375765129132309e-06, + "loss": 2.6517, + "step": 56654 + }, + { + "epoch": 2.6377074749167773, + "grad_norm": 0.29893782127109325, + "learning_rate": 4.374657022605866e-06, + "loss": 2.5995, + "step": 56655 + }, + { + "epoch": 2.6377540331028704, + "grad_norm": 0.29854413806027597, + "learning_rate": 4.3735490499853145e-06, + "loss": 2.6918, + "step": 56656 + }, + { + "epoch": 2.6378005912889635, + "grad_norm": 0.30527772283175175, + "learning_rate": 4.372441211273898e-06, + "loss": 2.6278, + "step": 56657 + }, + { + "epoch": 2.6378471494750566, + "grad_norm": 0.30251570529945165, + "learning_rate": 
4.371333506474873e-06, + "loss": 2.6072, + "step": 56658 + }, + { + "epoch": 2.6378937076611493, + "grad_norm": 0.2977568576137868, + "learning_rate": 4.370225935591488e-06, + "loss": 2.5882, + "step": 56659 + }, + { + "epoch": 2.6379402658472424, + "grad_norm": 0.28398908896609376, + "learning_rate": 4.369118498627e-06, + "loss": 2.638, + "step": 56660 + }, + { + "epoch": 2.6379868240333355, + "grad_norm": 0.28842849853748664, + "learning_rate": 4.368011195584643e-06, + "loss": 2.566, + "step": 56661 + }, + { + "epoch": 2.6380333822194286, + "grad_norm": 0.30061653667407334, + "learning_rate": 4.366904026467689e-06, + "loss": 2.6351, + "step": 56662 + }, + { + "epoch": 2.6380799404055217, + "grad_norm": 0.29321593407553326, + "learning_rate": 4.365796991279359e-06, + "loss": 2.5798, + "step": 56663 + }, + { + "epoch": 2.638126498591615, + "grad_norm": 0.28783042067787173, + "learning_rate": 4.364690090022938e-06, + "loss": 2.5931, + "step": 56664 + }, + { + "epoch": 2.638173056777708, + "grad_norm": 0.27994412735775165, + "learning_rate": 4.363583322701653e-06, + "loss": 2.5246, + "step": 56665 + }, + { + "epoch": 2.638219614963801, + "grad_norm": 0.3113023403195318, + "learning_rate": 4.362476689318745e-06, + "loss": 2.6509, + "step": 56666 + }, + { + "epoch": 2.638266173149894, + "grad_norm": 0.3114113611350486, + "learning_rate": 4.361370189877489e-06, + "loss": 2.5997, + "step": 56667 + }, + { + "epoch": 2.6383127313359873, + "grad_norm": 0.30549979181240156, + "learning_rate": 4.360263824381094e-06, + "loss": 2.6309, + "step": 56668 + }, + { + "epoch": 2.6383592895220804, + "grad_norm": 0.31053738855251556, + "learning_rate": 4.359157592832847e-06, + "loss": 2.6287, + "step": 56669 + }, + { + "epoch": 2.6384058477081735, + "grad_norm": 0.29327014896138515, + "learning_rate": 4.358051495235965e-06, + "loss": 2.6544, + "step": 56670 + }, + { + "epoch": 2.638452405894266, + "grad_norm": 0.29946162645458435, + "learning_rate": 4.356945531593709e-06, + "loss": 2.7643, + "step": 56671 + }, + { + "epoch": 2.6384989640803593, + "grad_norm": 0.3152075905508252, + "learning_rate": 4.355839701909314e-06, + "loss": 2.48, + "step": 56672 + }, + { + "epoch": 2.6385455222664524, + "grad_norm": 0.29678588631372427, + "learning_rate": 4.354734006186051e-06, + "loss": 2.6191, + "step": 56673 + }, + { + "epoch": 2.6385920804525456, + "grad_norm": 0.2976223022659815, + "learning_rate": 4.353628444427121e-06, + "loss": 2.539, + "step": 56674 + }, + { + "epoch": 2.6386386386386387, + "grad_norm": 0.3095733810696575, + "learning_rate": 4.3525230166358175e-06, + "loss": 2.7178, + "step": 56675 + }, + { + "epoch": 2.638685196824732, + "grad_norm": 0.27786811390932725, + "learning_rate": 4.351417722815343e-06, + "loss": 2.5836, + "step": 56676 + }, + { + "epoch": 2.638731755010825, + "grad_norm": 0.29755893129003447, + "learning_rate": 4.350312562968978e-06, + "loss": 2.5587, + "step": 56677 + }, + { + "epoch": 2.6387783131969176, + "grad_norm": 0.2876943146135233, + "learning_rate": 4.349207537099942e-06, + "loss": 2.6206, + "step": 56678 + }, + { + "epoch": 2.6388248713830107, + "grad_norm": 0.2872241962208754, + "learning_rate": 4.348102645211482e-06, + "loss": 2.6114, + "step": 56679 + }, + { + "epoch": 2.638871429569104, + "grad_norm": 0.3073533787227442, + "learning_rate": 4.346997887306842e-06, + "loss": 2.6267, + "step": 56680 + }, + { + "epoch": 2.638917987755197, + "grad_norm": 0.2904548772442303, + "learning_rate": 4.345893263389267e-06, + "loss": 2.5984, + "step": 56681 + }, + { + "epoch": 
2.63896454594129, + "grad_norm": 0.29302481113476186, + "learning_rate": 4.344788773462005e-06, + "loss": 2.6253, + "step": 56682 + }, + { + "epoch": 2.639011104127383, + "grad_norm": 0.3016354629314896, + "learning_rate": 4.343684417528288e-06, + "loss": 2.6268, + "step": 56683 + }, + { + "epoch": 2.6390576623134763, + "grad_norm": 0.3130165736635231, + "learning_rate": 4.34258019559135e-06, + "loss": 2.6089, + "step": 56684 + }, + { + "epoch": 2.6391042204995694, + "grad_norm": 0.2899908205512951, + "learning_rate": 4.341476107654446e-06, + "loss": 2.5178, + "step": 56685 + }, + { + "epoch": 2.6391507786856625, + "grad_norm": 0.27677390638408017, + "learning_rate": 4.3403721537208116e-06, + "loss": 2.5336, + "step": 56686 + }, + { + "epoch": 2.6391973368717556, + "grad_norm": 0.30467275741983024, + "learning_rate": 4.339268333793684e-06, + "loss": 2.5725, + "step": 56687 + }, + { + "epoch": 2.6392438950578487, + "grad_norm": 0.286446895637771, + "learning_rate": 4.3381646478763195e-06, + "loss": 2.5816, + "step": 56688 + }, + { + "epoch": 2.639290453243942, + "grad_norm": 0.3030365084885792, + "learning_rate": 4.337061095971922e-06, + "loss": 2.605, + "step": 56689 + }, + { + "epoch": 2.639337011430035, + "grad_norm": 0.2932682021077923, + "learning_rate": 4.335957678083769e-06, + "loss": 2.6449, + "step": 56690 + }, + { + "epoch": 2.6393835696161276, + "grad_norm": 0.29018885684494466, + "learning_rate": 4.334854394215071e-06, + "loss": 2.553, + "step": 56691 + }, + { + "epoch": 2.6394301278022207, + "grad_norm": 0.2772959084052822, + "learning_rate": 4.333751244369083e-06, + "loss": 2.5518, + "step": 56692 + }, + { + "epoch": 2.639476685988314, + "grad_norm": 0.30568175739912257, + "learning_rate": 4.33264822854903e-06, + "loss": 2.6233, + "step": 56693 + }, + { + "epoch": 2.639523244174407, + "grad_norm": 0.29812787650024086, + "learning_rate": 4.331545346758159e-06, + "loss": 2.585, + "step": 56694 + }, + { + "epoch": 2.6395698023605, + "grad_norm": 0.29276163505927316, + "learning_rate": 4.330442598999706e-06, + "loss": 2.5521, + "step": 56695 + }, + { + "epoch": 2.639616360546593, + "grad_norm": 0.2804728780081831, + "learning_rate": 4.329339985276903e-06, + "loss": 2.6237, + "step": 56696 + }, + { + "epoch": 2.6396629187326863, + "grad_norm": 0.2919116248992405, + "learning_rate": 4.328237505592986e-06, + "loss": 2.7061, + "step": 56697 + }, + { + "epoch": 2.639709476918779, + "grad_norm": 0.30073738309448267, + "learning_rate": 4.32713515995119e-06, + "loss": 2.5379, + "step": 56698 + }, + { + "epoch": 2.639756035104872, + "grad_norm": 0.28953387959893984, + "learning_rate": 4.326032948354758e-06, + "loss": 2.5791, + "step": 56699 + }, + { + "epoch": 2.6398025932909652, + "grad_norm": 0.2905666274395836, + "learning_rate": 4.324930870806914e-06, + "loss": 2.6978, + "step": 56700 + }, + { + "epoch": 2.6398491514770583, + "grad_norm": 0.2977812078140378, + "learning_rate": 4.323828927310913e-06, + "loss": 2.656, + "step": 56701 + }, + { + "epoch": 2.6398957096631515, + "grad_norm": 0.27430134918571586, + "learning_rate": 4.322727117869951e-06, + "loss": 2.5325, + "step": 56702 + }, + { + "epoch": 2.6399422678492446, + "grad_norm": 0.2938403259823056, + "learning_rate": 4.321625442487304e-06, + "loss": 2.5588, + "step": 56703 + }, + { + "epoch": 2.6399888260353377, + "grad_norm": 0.2843654681159152, + "learning_rate": 4.320523901166179e-06, + "loss": 2.71, + "step": 56704 + }, + { + "epoch": 2.640035384221431, + "grad_norm": 0.28707040176546594, + "learning_rate": 
4.319422493909814e-06, + "loss": 2.6241, + "step": 56705 + }, + { + "epoch": 2.640081942407524, + "grad_norm": 0.28120444552349516, + "learning_rate": 4.3183212207214494e-06, + "loss": 2.6221, + "step": 56706 + }, + { + "epoch": 2.640128500593617, + "grad_norm": 0.27734196264092464, + "learning_rate": 4.317220081604307e-06, + "loss": 2.5946, + "step": 56707 + }, + { + "epoch": 2.64017505877971, + "grad_norm": 0.2901059116663166, + "learning_rate": 4.316119076561631e-06, + "loss": 2.6075, + "step": 56708 + }, + { + "epoch": 2.6402216169658033, + "grad_norm": 0.2999128750121374, + "learning_rate": 4.315018205596638e-06, + "loss": 2.7117, + "step": 56709 + }, + { + "epoch": 2.640268175151896, + "grad_norm": 0.29013790117454463, + "learning_rate": 4.313917468712564e-06, + "loss": 2.5929, + "step": 56710 + }, + { + "epoch": 2.640314733337989, + "grad_norm": 0.29397772984222703, + "learning_rate": 4.312816865912644e-06, + "loss": 2.6626, + "step": 56711 + }, + { + "epoch": 2.640361291524082, + "grad_norm": 0.3007367040450305, + "learning_rate": 4.3117163972000995e-06, + "loss": 2.5226, + "step": 56712 + }, + { + "epoch": 2.6404078497101753, + "grad_norm": 0.3153690967501017, + "learning_rate": 4.310616062578171e-06, + "loss": 2.6463, + "step": 56713 + }, + { + "epoch": 2.6404544078962684, + "grad_norm": 0.3081986780651903, + "learning_rate": 4.309515862050095e-06, + "loss": 2.6481, + "step": 56714 + }, + { + "epoch": 2.6405009660823615, + "grad_norm": 0.31587107067711606, + "learning_rate": 4.308415795619069e-06, + "loss": 2.6574, + "step": 56715 + }, + { + "epoch": 2.6405475242684546, + "grad_norm": 0.3011879211068668, + "learning_rate": 4.307315863288358e-06, + "loss": 2.6847, + "step": 56716 + }, + { + "epoch": 2.6405940824545473, + "grad_norm": 0.2901653272278634, + "learning_rate": 4.306216065061164e-06, + "loss": 2.642, + "step": 56717 + }, + { + "epoch": 2.6406406406406404, + "grad_norm": 0.30889280891076903, + "learning_rate": 4.305116400940723e-06, + "loss": 2.6454, + "step": 56718 + }, + { + "epoch": 2.6406871988267335, + "grad_norm": 0.30677377565172953, + "learning_rate": 4.304016870930266e-06, + "loss": 2.7243, + "step": 56719 + }, + { + "epoch": 2.6407337570128266, + "grad_norm": 0.3082639368690817, + "learning_rate": 4.302917475033019e-06, + "loss": 2.6156, + "step": 56720 + }, + { + "epoch": 2.6407803151989198, + "grad_norm": 0.30640808067451003, + "learning_rate": 4.3018182132522125e-06, + "loss": 2.6631, + "step": 56721 + }, + { + "epoch": 2.640826873385013, + "grad_norm": 0.2867524578372425, + "learning_rate": 4.300719085591054e-06, + "loss": 2.63, + "step": 56722 + }, + { + "epoch": 2.640873431571106, + "grad_norm": 0.29422843367100904, + "learning_rate": 4.299620092052797e-06, + "loss": 2.7236, + "step": 56723 + }, + { + "epoch": 2.640919989757199, + "grad_norm": 0.31568456602569295, + "learning_rate": 4.298521232640645e-06, + "loss": 2.6438, + "step": 56724 + }, + { + "epoch": 2.640966547943292, + "grad_norm": 0.3015573388316022, + "learning_rate": 4.297422507357829e-06, + "loss": 2.7373, + "step": 56725 + }, + { + "epoch": 2.6410131061293853, + "grad_norm": 0.3116501184613122, + "learning_rate": 4.296323916207579e-06, + "loss": 2.6009, + "step": 56726 + }, + { + "epoch": 2.6410596643154785, + "grad_norm": 0.3013086030874972, + "learning_rate": 4.295225459193125e-06, + "loss": 2.622, + "step": 56727 + }, + { + "epoch": 2.6411062225015716, + "grad_norm": 0.29087196378591434, + "learning_rate": 4.2941271363176605e-06, + "loss": 2.5822, + "step": 56728 + }, + { + "epoch": 
2.6411527806876647, + "grad_norm": 0.3144866215232718, + "learning_rate": 4.293028947584449e-06, + "loss": 2.5694, + "step": 56729 + }, + { + "epoch": 2.6411993388737574, + "grad_norm": 0.31002908116631867, + "learning_rate": 4.291930892996687e-06, + "loss": 2.5515, + "step": 56730 + }, + { + "epoch": 2.6412458970598505, + "grad_norm": 0.28079453822146994, + "learning_rate": 4.290832972557601e-06, + "loss": 2.5726, + "step": 56731 + }, + { + "epoch": 2.6412924552459436, + "grad_norm": 0.29395251565073, + "learning_rate": 4.289735186270421e-06, + "loss": 2.5488, + "step": 56732 + }, + { + "epoch": 2.6413390134320367, + "grad_norm": 0.30988084121800075, + "learning_rate": 4.288637534138362e-06, + "loss": 2.6104, + "step": 56733 + }, + { + "epoch": 2.64138557161813, + "grad_norm": 0.2920340396778423, + "learning_rate": 4.287540016164665e-06, + "loss": 2.6043, + "step": 56734 + }, + { + "epoch": 2.641432129804223, + "grad_norm": 0.30768971702965575, + "learning_rate": 4.286442632352511e-06, + "loss": 2.5528, + "step": 56735 + }, + { + "epoch": 2.641478687990316, + "grad_norm": 0.32708543453130456, + "learning_rate": 4.285345382705158e-06, + "loss": 2.6476, + "step": 56736 + }, + { + "epoch": 2.6415252461764087, + "grad_norm": 0.29731091819359995, + "learning_rate": 4.28424826722581e-06, + "loss": 2.6777, + "step": 56737 + }, + { + "epoch": 2.641571804362502, + "grad_norm": 0.28963380151543455, + "learning_rate": 4.283151285917686e-06, + "loss": 2.6064, + "step": 56738 + }, + { + "epoch": 2.641618362548595, + "grad_norm": 0.3122166035272935, + "learning_rate": 4.282054438784011e-06, + "loss": 2.6332, + "step": 56739 + }, + { + "epoch": 2.641664920734688, + "grad_norm": 0.30319717539066143, + "learning_rate": 4.280957725828005e-06, + "loss": 2.5661, + "step": 56740 + }, + { + "epoch": 2.641711478920781, + "grad_norm": 0.30341376001712017, + "learning_rate": 4.279861147052883e-06, + "loss": 2.6329, + "step": 56741 + }, + { + "epoch": 2.6417580371068743, + "grad_norm": 0.30325628162688745, + "learning_rate": 4.278764702461868e-06, + "loss": 2.6636, + "step": 56742 + }, + { + "epoch": 2.6418045952929674, + "grad_norm": 0.29284334340798246, + "learning_rate": 4.27766839205817e-06, + "loss": 2.6813, + "step": 56743 + }, + { + "epoch": 2.6418511534790605, + "grad_norm": 0.30357682528092994, + "learning_rate": 4.27657221584501e-06, + "loss": 2.6118, + "step": 56744 + }, + { + "epoch": 2.6418977116651536, + "grad_norm": 0.30898652409443367, + "learning_rate": 4.2754761738256045e-06, + "loss": 2.7454, + "step": 56745 + }, + { + "epoch": 2.6419442698512468, + "grad_norm": 0.31744988487672643, + "learning_rate": 4.274380266003175e-06, + "loss": 2.6617, + "step": 56746 + }, + { + "epoch": 2.64199082803734, + "grad_norm": 0.3060156612923395, + "learning_rate": 4.2732844923809415e-06, + "loss": 2.6499, + "step": 56747 + }, + { + "epoch": 2.642037386223433, + "grad_norm": 0.3029851035541688, + "learning_rate": 4.272188852962095e-06, + "loss": 2.6699, + "step": 56748 + }, + { + "epoch": 2.6420839444095257, + "grad_norm": 0.3097172639339934, + "learning_rate": 4.271093347749889e-06, + "loss": 2.7229, + "step": 56749 + }, + { + "epoch": 2.6421305025956188, + "grad_norm": 0.29741385014361504, + "learning_rate": 4.269997976747508e-06, + "loss": 2.6408, + "step": 56750 + }, + { + "epoch": 2.642177060781712, + "grad_norm": 0.3050618189904212, + "learning_rate": 4.268902739958175e-06, + "loss": 2.6473, + "step": 56751 + }, + { + "epoch": 2.642223618967805, + "grad_norm": 0.33190736431158524, + "learning_rate": 
4.2678076373851125e-06, + "loss": 2.6091, + "step": 56752 + }, + { + "epoch": 2.642270177153898, + "grad_norm": 0.29610508738597063, + "learning_rate": 4.266712669031525e-06, + "loss": 2.5824, + "step": 56753 + }, + { + "epoch": 2.6423167353399912, + "grad_norm": 0.33056030732898434, + "learning_rate": 4.265617834900637e-06, + "loss": 2.6505, + "step": 56754 + }, + { + "epoch": 2.6423632935260843, + "grad_norm": 0.29786872255443864, + "learning_rate": 4.2645231349956574e-06, + "loss": 2.6129, + "step": 56755 + }, + { + "epoch": 2.6424098517121775, + "grad_norm": 0.3018002284374482, + "learning_rate": 4.263428569319788e-06, + "loss": 2.6388, + "step": 56756 + }, + { + "epoch": 2.64245640989827, + "grad_norm": 0.2888203521999097, + "learning_rate": 4.262334137876256e-06, + "loss": 2.6298, + "step": 56757 + }, + { + "epoch": 2.6425029680843632, + "grad_norm": 0.2936966595302451, + "learning_rate": 4.261239840668263e-06, + "loss": 2.689, + "step": 56758 + }, + { + "epoch": 2.6425495262704564, + "grad_norm": 0.30549047414566705, + "learning_rate": 4.260145677699024e-06, + "loss": 2.7441, + "step": 56759 + }, + { + "epoch": 2.6425960844565495, + "grad_norm": 0.29954151385267663, + "learning_rate": 4.259051648971762e-06, + "loss": 2.7238, + "step": 56760 + }, + { + "epoch": 2.6426426426426426, + "grad_norm": 0.2836708390530439, + "learning_rate": 4.257957754489661e-06, + "loss": 2.6585, + "step": 56761 + }, + { + "epoch": 2.6426892008287357, + "grad_norm": 0.30331493362001505, + "learning_rate": 4.256863994255961e-06, + "loss": 2.6259, + "step": 56762 + }, + { + "epoch": 2.642735759014829, + "grad_norm": 0.3156586576192901, + "learning_rate": 4.2557703682738534e-06, + "loss": 2.7388, + "step": 56763 + }, + { + "epoch": 2.642782317200922, + "grad_norm": 0.3091455366170154, + "learning_rate": 4.254676876546554e-06, + "loss": 2.7147, + "step": 56764 + }, + { + "epoch": 2.642828875387015, + "grad_norm": 0.2998638833242805, + "learning_rate": 4.253583519077264e-06, + "loss": 2.5983, + "step": 56765 + }, + { + "epoch": 2.642875433573108, + "grad_norm": 0.2985643823428774, + "learning_rate": 4.25249029586921e-06, + "loss": 2.6163, + "step": 56766 + }, + { + "epoch": 2.6429219917592013, + "grad_norm": 0.2974496856443911, + "learning_rate": 4.251397206925583e-06, + "loss": 2.5642, + "step": 56767 + }, + { + "epoch": 2.6429685499452944, + "grad_norm": 0.29083272330526255, + "learning_rate": 4.250304252249609e-06, + "loss": 2.6481, + "step": 56768 + }, + { + "epoch": 2.643015108131387, + "grad_norm": 0.2931017502615782, + "learning_rate": 4.2492114318444745e-06, + "loss": 2.6796, + "step": 56769 + }, + { + "epoch": 2.64306166631748, + "grad_norm": 0.29972171635508427, + "learning_rate": 4.248118745713403e-06, + "loss": 2.6311, + "step": 56770 + }, + { + "epoch": 2.6431082245035733, + "grad_norm": 0.28992067202947996, + "learning_rate": 4.247026193859588e-06, + "loss": 2.6011, + "step": 56771 + }, + { + "epoch": 2.6431547826896664, + "grad_norm": 0.288590058240354, + "learning_rate": 4.245933776286248e-06, + "loss": 2.5383, + "step": 56772 + }, + { + "epoch": 2.6432013408757595, + "grad_norm": 0.29058543652636887, + "learning_rate": 4.244841492996593e-06, + "loss": 2.5224, + "step": 56773 + }, + { + "epoch": 2.6432478990618526, + "grad_norm": 0.29501562548491633, + "learning_rate": 4.243749343993802e-06, + "loss": 2.5931, + "step": 56774 + }, + { + "epoch": 2.6432944572479458, + "grad_norm": 0.28370524586667895, + "learning_rate": 4.242657329281119e-06, + "loss": 2.6528, + "step": 56775 + }, + { + "epoch": 
2.6433410154340384, + "grad_norm": 0.2873506977810507, + "learning_rate": 4.241565448861711e-06, + "loss": 2.6775, + "step": 56776 + }, + { + "epoch": 2.6433875736201315, + "grad_norm": 0.2866262162656139, + "learning_rate": 4.240473702738818e-06, + "loss": 2.5079, + "step": 56777 + }, + { + "epoch": 2.6434341318062247, + "grad_norm": 0.29820380419878906, + "learning_rate": 4.239382090915617e-06, + "loss": 2.635, + "step": 56778 + }, + { + "epoch": 2.643480689992318, + "grad_norm": 0.2904504792524033, + "learning_rate": 4.238290613395318e-06, + "loss": 2.5957, + "step": 56779 + }, + { + "epoch": 2.643527248178411, + "grad_norm": 0.30388685942346694, + "learning_rate": 4.23719927018113e-06, + "loss": 2.556, + "step": 56780 + }, + { + "epoch": 2.643573806364504, + "grad_norm": 0.29517379900533386, + "learning_rate": 4.236108061276267e-06, + "loss": 2.7183, + "step": 56781 + }, + { + "epoch": 2.643620364550597, + "grad_norm": 0.2896060255657224, + "learning_rate": 4.235016986683904e-06, + "loss": 2.6537, + "step": 56782 + }, + { + "epoch": 2.6436669227366902, + "grad_norm": 0.30023494764126984, + "learning_rate": 4.23392604640726e-06, + "loss": 2.6478, + "step": 56783 + }, + { + "epoch": 2.6437134809227834, + "grad_norm": 0.2958960860180819, + "learning_rate": 4.232835240449534e-06, + "loss": 2.569, + "step": 56784 + }, + { + "epoch": 2.6437600391088765, + "grad_norm": 0.2837014357054049, + "learning_rate": 4.231744568813928e-06, + "loss": 2.6831, + "step": 56785 + }, + { + "epoch": 2.6438065972949696, + "grad_norm": 0.2930598605522296, + "learning_rate": 4.230654031503651e-06, + "loss": 2.6654, + "step": 56786 + }, + { + "epoch": 2.6438531554810627, + "grad_norm": 0.28278287487817805, + "learning_rate": 4.229563628521876e-06, + "loss": 2.5932, + "step": 56787 + }, + { + "epoch": 2.6438997136671554, + "grad_norm": 0.2809688266484374, + "learning_rate": 4.228473359871843e-06, + "loss": 2.5728, + "step": 56788 + }, + { + "epoch": 2.6439462718532485, + "grad_norm": 0.29667545287396224, + "learning_rate": 4.227383225556713e-06, + "loss": 2.6188, + "step": 56789 + }, + { + "epoch": 2.6439928300393416, + "grad_norm": 0.29783677720454976, + "learning_rate": 4.2262932255797246e-06, + "loss": 2.6898, + "step": 56790 + }, + { + "epoch": 2.6440393882254347, + "grad_norm": 0.28729263955245576, + "learning_rate": 4.225203359944041e-06, + "loss": 2.6548, + "step": 56791 + }, + { + "epoch": 2.644085946411528, + "grad_norm": 0.2908141998093254, + "learning_rate": 4.224113628652882e-06, + "loss": 2.6292, + "step": 56792 + }, + { + "epoch": 2.644132504597621, + "grad_norm": 0.3037481034886088, + "learning_rate": 4.223024031709438e-06, + "loss": 2.6997, + "step": 56793 + }, + { + "epoch": 2.644179062783714, + "grad_norm": 0.29947283234175, + "learning_rate": 4.221934569116909e-06, + "loss": 2.6515, + "step": 56794 + }, + { + "epoch": 2.644225620969807, + "grad_norm": 0.28593625961017827, + "learning_rate": 4.220845240878496e-06, + "loss": 2.6348, + "step": 56795 + }, + { + "epoch": 2.6442721791559, + "grad_norm": 0.2960597222162683, + "learning_rate": 4.219756046997386e-06, + "loss": 2.655, + "step": 56796 + }, + { + "epoch": 2.644318737341993, + "grad_norm": 0.30377335552680795, + "learning_rate": 4.218666987476783e-06, + "loss": 2.5206, + "step": 56797 + }, + { + "epoch": 2.644365295528086, + "grad_norm": 0.30236550348020474, + "learning_rate": 4.217578062319882e-06, + "loss": 2.6018, + "step": 56798 + }, + { + "epoch": 2.644411853714179, + "grad_norm": 0.30531755021065493, + "learning_rate": 
4.2164892715298885e-06, + "loss": 2.5925, + "step": 56799 + }, + { + "epoch": 2.6444584119002723, + "grad_norm": 0.29708335778575506, + "learning_rate": 4.215400615109971e-06, + "loss": 2.5973, + "step": 56800 + }, + { + "epoch": 2.6445049700863654, + "grad_norm": 0.29775273538121777, + "learning_rate": 4.214312093063361e-06, + "loss": 2.6442, + "step": 56801 + }, + { + "epoch": 2.6445515282724585, + "grad_norm": 0.2948794749578434, + "learning_rate": 4.213223705393215e-06, + "loss": 2.6476, + "step": 56802 + }, + { + "epoch": 2.6445980864585517, + "grad_norm": 0.3019296837853361, + "learning_rate": 4.212135452102766e-06, + "loss": 2.7346, + "step": 56803 + }, + { + "epoch": 2.6446446446446448, + "grad_norm": 0.3022135163535465, + "learning_rate": 4.211047333195178e-06, + "loss": 2.6301, + "step": 56804 + }, + { + "epoch": 2.644691202830738, + "grad_norm": 0.28572237911732473, + "learning_rate": 4.209959348673654e-06, + "loss": 2.6338, + "step": 56805 + }, + { + "epoch": 2.644737761016831, + "grad_norm": 0.3217121399708978, + "learning_rate": 4.208871498541389e-06, + "loss": 2.6624, + "step": 56806 + }, + { + "epoch": 2.644784319202924, + "grad_norm": 0.2756068995921427, + "learning_rate": 4.207783782801577e-06, + "loss": 2.6059, + "step": 56807 + }, + { + "epoch": 2.644830877389017, + "grad_norm": 0.2866618126862804, + "learning_rate": 4.206696201457416e-06, + "loss": 2.43, + "step": 56808 + }, + { + "epoch": 2.64487743557511, + "grad_norm": 0.28835267164311956, + "learning_rate": 4.205608754512081e-06, + "loss": 2.626, + "step": 56809 + }, + { + "epoch": 2.644923993761203, + "grad_norm": 0.28015357920360334, + "learning_rate": 4.204521441968773e-06, + "loss": 2.577, + "step": 56810 + }, + { + "epoch": 2.644970551947296, + "grad_norm": 0.3242028992995253, + "learning_rate": 4.20343426383068e-06, + "loss": 2.7212, + "step": 56811 + }, + { + "epoch": 2.6450171101333892, + "grad_norm": 0.30352825650436505, + "learning_rate": 4.2023472201009995e-06, + "loss": 2.637, + "step": 56812 + }, + { + "epoch": 2.6450636683194824, + "grad_norm": 0.2840024913160623, + "learning_rate": 4.201260310782918e-06, + "loss": 2.5661, + "step": 56813 + }, + { + "epoch": 2.6451102265055755, + "grad_norm": 0.2911530276112687, + "learning_rate": 4.200173535879631e-06, + "loss": 2.6367, + "step": 56814 + }, + { + "epoch": 2.645156784691668, + "grad_norm": 0.2942821154752928, + "learning_rate": 4.1990868953943105e-06, + "loss": 2.6271, + "step": 56815 + }, + { + "epoch": 2.6452033428777613, + "grad_norm": 0.292629018159345, + "learning_rate": 4.19800038933017e-06, + "loss": 2.6741, + "step": 56816 + }, + { + "epoch": 2.6452499010638544, + "grad_norm": 0.29224559057125077, + "learning_rate": 4.196914017690378e-06, + "loss": 2.6571, + "step": 56817 + }, + { + "epoch": 2.6452964592499475, + "grad_norm": 0.3095145254635609, + "learning_rate": 4.195827780478134e-06, + "loss": 2.5824, + "step": 56818 + }, + { + "epoch": 2.6453430174360406, + "grad_norm": 0.3003859931286872, + "learning_rate": 4.194741677696623e-06, + "loss": 2.6629, + "step": 56819 + }, + { + "epoch": 2.6453895756221337, + "grad_norm": 0.3117849446932667, + "learning_rate": 4.193655709349031e-06, + "loss": 2.6428, + "step": 56820 + }, + { + "epoch": 2.645436133808227, + "grad_norm": 0.3066591351540015, + "learning_rate": 4.192569875438557e-06, + "loss": 2.6143, + "step": 56821 + }, + { + "epoch": 2.64548269199432, + "grad_norm": 0.2990091390720599, + "learning_rate": 4.191484175968369e-06, + "loss": 2.6089, + "step": 56822 + }, + { + "epoch": 
2.645529250180413, + "grad_norm": 0.2909420434440999, + "learning_rate": 4.19039861094166e-06, + "loss": 2.5931, + "step": 56823 + }, + { + "epoch": 2.645575808366506, + "grad_norm": 0.3112190931896707, + "learning_rate": 4.18931318036162e-06, + "loss": 2.6389, + "step": 56824 + }, + { + "epoch": 2.6456223665525993, + "grad_norm": 0.29120216120680226, + "learning_rate": 4.188227884231433e-06, + "loss": 2.6708, + "step": 56825 + }, + { + "epoch": 2.6456689247386924, + "grad_norm": 0.2946968598688225, + "learning_rate": 4.187142722554277e-06, + "loss": 2.5085, + "step": 56826 + }, + { + "epoch": 2.645715482924785, + "grad_norm": 0.2896212234669134, + "learning_rate": 4.186057695333362e-06, + "loss": 2.6347, + "step": 56827 + }, + { + "epoch": 2.645762041110878, + "grad_norm": 0.2928076974084801, + "learning_rate": 4.184972802571829e-06, + "loss": 2.5929, + "step": 56828 + }, + { + "epoch": 2.6458085992969713, + "grad_norm": 0.30771845229210976, + "learning_rate": 4.18388804427291e-06, + "loss": 2.695, + "step": 56829 + }, + { + "epoch": 2.6458551574830644, + "grad_norm": 0.2900948869096871, + "learning_rate": 4.182803420439757e-06, + "loss": 2.6816, + "step": 56830 + }, + { + "epoch": 2.6459017156691576, + "grad_norm": 0.28619703398749485, + "learning_rate": 4.181718931075562e-06, + "loss": 2.6199, + "step": 56831 + }, + { + "epoch": 2.6459482738552507, + "grad_norm": 0.287059974851107, + "learning_rate": 4.180634576183507e-06, + "loss": 2.6613, + "step": 56832 + }, + { + "epoch": 2.645994832041344, + "grad_norm": 0.3056731096102557, + "learning_rate": 4.179550355766776e-06, + "loss": 2.699, + "step": 56833 + }, + { + "epoch": 2.646041390227437, + "grad_norm": 0.29615760437595595, + "learning_rate": 4.178466269828557e-06, + "loss": 2.564, + "step": 56834 + }, + { + "epoch": 2.6460879484135296, + "grad_norm": 0.29770140710710064, + "learning_rate": 4.17738231837202e-06, + "loss": 2.6205, + "step": 56835 + }, + { + "epoch": 2.6461345065996227, + "grad_norm": 0.31285352652622583, + "learning_rate": 4.17629850140035e-06, + "loss": 2.6108, + "step": 56836 + }, + { + "epoch": 2.646181064785716, + "grad_norm": 0.2732915164903122, + "learning_rate": 4.175214818916728e-06, + "loss": 2.5597, + "step": 56837 + }, + { + "epoch": 2.646227622971809, + "grad_norm": 0.3039421446781745, + "learning_rate": 4.174131270924342e-06, + "loss": 2.624, + "step": 56838 + }, + { + "epoch": 2.646274181157902, + "grad_norm": 0.2665689143065374, + "learning_rate": 4.17304785742636e-06, + "loss": 2.6678, + "step": 56839 + }, + { + "epoch": 2.646320739343995, + "grad_norm": 0.2954878820923233, + "learning_rate": 4.17196457842598e-06, + "loss": 2.5409, + "step": 56840 + }, + { + "epoch": 2.6463672975300883, + "grad_norm": 0.3029002538775805, + "learning_rate": 4.170881433926349e-06, + "loss": 2.698, + "step": 56841 + }, + { + "epoch": 2.6464138557161814, + "grad_norm": 0.30463408223013994, + "learning_rate": 4.169798423930688e-06, + "loss": 2.474, + "step": 56842 + }, + { + "epoch": 2.6464604139022745, + "grad_norm": 0.3030229541558879, + "learning_rate": 4.168715548442137e-06, + "loss": 2.6674, + "step": 56843 + }, + { + "epoch": 2.6465069720883676, + "grad_norm": 0.30389644456492043, + "learning_rate": 4.167632807463895e-06, + "loss": 2.6462, + "step": 56844 + }, + { + "epoch": 2.6465535302744607, + "grad_norm": 0.3039812887038311, + "learning_rate": 4.166550200999136e-06, + "loss": 2.7078, + "step": 56845 + }, + { + "epoch": 2.646600088460554, + "grad_norm": 0.3045854814095997, + "learning_rate": 4.165467729051037e-06, + 
"loss": 2.6268, + "step": 56846 + }, + { + "epoch": 2.6466466466466465, + "grad_norm": 0.288122359686978, + "learning_rate": 4.164385391622783e-06, + "loss": 2.6192, + "step": 56847 + }, + { + "epoch": 2.6466932048327396, + "grad_norm": 0.2994889491035935, + "learning_rate": 4.163303188717526e-06, + "loss": 2.6583, + "step": 56848 + }, + { + "epoch": 2.6467397630188327, + "grad_norm": 0.2920878322857499, + "learning_rate": 4.162221120338478e-06, + "loss": 2.4518, + "step": 56849 + }, + { + "epoch": 2.646786321204926, + "grad_norm": 0.2974879617803115, + "learning_rate": 4.161139186488782e-06, + "loss": 2.6148, + "step": 56850 + }, + { + "epoch": 2.646832879391019, + "grad_norm": 0.30773625169322827, + "learning_rate": 4.160057387171628e-06, + "loss": 2.6238, + "step": 56851 + }, + { + "epoch": 2.646879437577112, + "grad_norm": 0.3005458466128853, + "learning_rate": 4.15897572239019e-06, + "loss": 2.6487, + "step": 56852 + }, + { + "epoch": 2.646925995763205, + "grad_norm": 0.2906582653221446, + "learning_rate": 4.1578941921476486e-06, + "loss": 2.5559, + "step": 56853 + }, + { + "epoch": 2.646972553949298, + "grad_norm": 0.28364485600709904, + "learning_rate": 4.156812796447157e-06, + "loss": 2.6402, + "step": 56854 + }, + { + "epoch": 2.647019112135391, + "grad_norm": 0.2897415836499648, + "learning_rate": 4.155731535291918e-06, + "loss": 2.634, + "step": 56855 + }, + { + "epoch": 2.647065670321484, + "grad_norm": 0.3045535634797986, + "learning_rate": 4.154650408685084e-06, + "loss": 2.61, + "step": 56856 + }, + { + "epoch": 2.647112228507577, + "grad_norm": 0.29473625943751225, + "learning_rate": 4.153569416629838e-06, + "loss": 2.6439, + "step": 56857 + }, + { + "epoch": 2.6471587866936703, + "grad_norm": 0.2968612069128959, + "learning_rate": 4.152488559129341e-06, + "loss": 2.5897, + "step": 56858 + }, + { + "epoch": 2.6472053448797634, + "grad_norm": 0.2949115708434146, + "learning_rate": 4.1514078361867824e-06, + "loss": 2.7075, + "step": 56859 + }, + { + "epoch": 2.6472519030658566, + "grad_norm": 0.3014957311589076, + "learning_rate": 4.15032724780533e-06, + "loss": 2.637, + "step": 56860 + }, + { + "epoch": 2.6472984612519497, + "grad_norm": 0.3167832428484006, + "learning_rate": 4.1492467939881315e-06, + "loss": 2.6815, + "step": 56861 + }, + { + "epoch": 2.647345019438043, + "grad_norm": 0.3230371142979012, + "learning_rate": 4.1481664747383905e-06, + "loss": 2.6142, + "step": 56862 + }, + { + "epoch": 2.647391577624136, + "grad_norm": 0.3022188799417114, + "learning_rate": 4.14708629005926e-06, + "loss": 2.6702, + "step": 56863 + }, + { + "epoch": 2.647438135810229, + "grad_norm": 0.27758479949728576, + "learning_rate": 4.146006239953914e-06, + "loss": 2.6776, + "step": 56864 + }, + { + "epoch": 2.647484693996322, + "grad_norm": 0.2993916350937627, + "learning_rate": 4.144926324425524e-06, + "loss": 2.5443, + "step": 56865 + }, + { + "epoch": 2.647531252182415, + "grad_norm": 0.3037530958875668, + "learning_rate": 4.143846543477253e-06, + "loss": 2.5811, + "step": 56866 + }, + { + "epoch": 2.647577810368508, + "grad_norm": 0.30254927116498403, + "learning_rate": 4.142766897112277e-06, + "loss": 2.5727, + "step": 56867 + }, + { + "epoch": 2.647624368554601, + "grad_norm": 0.29582366984019115, + "learning_rate": 4.14168738533377e-06, + "loss": 2.6915, + "step": 56868 + }, + { + "epoch": 2.647670926740694, + "grad_norm": 0.2989576939854489, + "learning_rate": 4.140608008144886e-06, + "loss": 2.6454, + "step": 56869 + }, + { + "epoch": 2.6477174849267873, + "grad_norm": 
0.2920782794820364, + "learning_rate": 4.1395287655488e-06, + "loss": 2.6749, + "step": 56870 + }, + { + "epoch": 2.6477640431128804, + "grad_norm": 0.2850249167987985, + "learning_rate": 4.138449657548677e-06, + "loss": 2.6808, + "step": 56871 + }, + { + "epoch": 2.6478106012989735, + "grad_norm": 0.2958353335869952, + "learning_rate": 4.137370684147685e-06, + "loss": 2.6514, + "step": 56872 + }, + { + "epoch": 2.6478571594850666, + "grad_norm": 0.30697360038827, + "learning_rate": 4.136291845349005e-06, + "loss": 2.6168, + "step": 56873 + }, + { + "epoch": 2.6479037176711593, + "grad_norm": 0.31421884039329606, + "learning_rate": 4.135213141155769e-06, + "loss": 2.7057, + "step": 56874 + }, + { + "epoch": 2.6479502758572524, + "grad_norm": 0.296484468071527, + "learning_rate": 4.13413457157118e-06, + "loss": 2.6224, + "step": 56875 + }, + { + "epoch": 2.6479968340433455, + "grad_norm": 0.2863827226988252, + "learning_rate": 4.133056136598384e-06, + "loss": 2.5221, + "step": 56876 + }, + { + "epoch": 2.6480433922294386, + "grad_norm": 0.2882090556993915, + "learning_rate": 4.131977836240547e-06, + "loss": 2.6193, + "step": 56877 + }, + { + "epoch": 2.6480899504155317, + "grad_norm": 0.298963656225963, + "learning_rate": 4.1308996705008374e-06, + "loss": 2.5928, + "step": 56878 + }, + { + "epoch": 2.648136508601625, + "grad_norm": 0.2777197011855905, + "learning_rate": 4.129821639382414e-06, + "loss": 2.5065, + "step": 56879 + }, + { + "epoch": 2.648183066787718, + "grad_norm": 0.287631685967522, + "learning_rate": 4.1287437428884465e-06, + "loss": 2.5104, + "step": 56880 + }, + { + "epoch": 2.648229624973811, + "grad_norm": 0.3051358077544515, + "learning_rate": 4.1276659810221106e-06, + "loss": 2.6233, + "step": 56881 + }, + { + "epoch": 2.648276183159904, + "grad_norm": 0.31603020520514363, + "learning_rate": 4.126588353786542e-06, + "loss": 2.6372, + "step": 56882 + }, + { + "epoch": 2.6483227413459973, + "grad_norm": 0.30100369280219985, + "learning_rate": 4.125510861184923e-06, + "loss": 2.6517, + "step": 56883 + }, + { + "epoch": 2.6483692995320904, + "grad_norm": 0.29394532128892875, + "learning_rate": 4.124433503220404e-06, + "loss": 2.6145, + "step": 56884 + }, + { + "epoch": 2.6484158577181836, + "grad_norm": 0.29615428573818964, + "learning_rate": 4.123356279896156e-06, + "loss": 2.4947, + "step": 56885 + }, + { + "epoch": 2.6484624159042762, + "grad_norm": 0.28498543492451817, + "learning_rate": 4.1222791912153445e-06, + "loss": 2.5143, + "step": 56886 + }, + { + "epoch": 2.6485089740903693, + "grad_norm": 0.30018446444640284, + "learning_rate": 4.12120223718111e-06, + "loss": 2.5846, + "step": 56887 + }, + { + "epoch": 2.6485555322764625, + "grad_norm": 0.3059769178316484, + "learning_rate": 4.120125417796645e-06, + "loss": 2.6937, + "step": 56888 + }, + { + "epoch": 2.6486020904625556, + "grad_norm": 0.3102533339469823, + "learning_rate": 4.119048733065079e-06, + "loss": 2.7225, + "step": 56889 + }, + { + "epoch": 2.6486486486486487, + "grad_norm": 0.3070450170936357, + "learning_rate": 4.117972182989588e-06, + "loss": 2.6951, + "step": 56890 + }, + { + "epoch": 2.648695206834742, + "grad_norm": 0.29352369240349513, + "learning_rate": 4.116895767573325e-06, + "loss": 2.5838, + "step": 56891 + }, + { + "epoch": 2.648741765020835, + "grad_norm": 0.3071441124426259, + "learning_rate": 4.1158194868194555e-06, + "loss": 2.5903, + "step": 56892 + }, + { + "epoch": 2.6487883232069276, + "grad_norm": 0.2906474088494622, + "learning_rate": 4.114743340731137e-06, + "loss": 2.6277, + 
"step": 56893 + }, + { + "epoch": 2.6488348813930207, + "grad_norm": 0.31144599994597155, + "learning_rate": 4.1136673293115324e-06, + "loss": 2.5237, + "step": 56894 + }, + { + "epoch": 2.648881439579114, + "grad_norm": 0.2883742557793664, + "learning_rate": 4.112591452563785e-06, + "loss": 2.6214, + "step": 56895 + }, + { + "epoch": 2.648927997765207, + "grad_norm": 0.2907043227891616, + "learning_rate": 4.111515710491065e-06, + "loss": 2.5193, + "step": 56896 + }, + { + "epoch": 2.6489745559513, + "grad_norm": 0.2895679115352501, + "learning_rate": 4.110440103096519e-06, + "loss": 2.5868, + "step": 56897 + }, + { + "epoch": 2.649021114137393, + "grad_norm": 0.30456454757379386, + "learning_rate": 4.109364630383317e-06, + "loss": 2.6377, + "step": 56898 + }, + { + "epoch": 2.6490676723234863, + "grad_norm": 0.3006073117549578, + "learning_rate": 4.108289292354611e-06, + "loss": 2.5964, + "step": 56899 + }, + { + "epoch": 2.6491142305095794, + "grad_norm": 0.30484441353369396, + "learning_rate": 4.107214089013539e-06, + "loss": 2.6202, + "step": 56900 + }, + { + "epoch": 2.6491607886956725, + "grad_norm": 0.30190016527971913, + "learning_rate": 4.1061390203632925e-06, + "loss": 2.6309, + "step": 56901 + }, + { + "epoch": 2.6492073468817656, + "grad_norm": 0.29629459229056276, + "learning_rate": 4.10506408640699e-06, + "loss": 2.6658, + "step": 56902 + }, + { + "epoch": 2.6492539050678587, + "grad_norm": 0.3012530332622891, + "learning_rate": 4.103989287147819e-06, + "loss": 2.6027, + "step": 56903 + }, + { + "epoch": 2.649300463253952, + "grad_norm": 0.2972401500302787, + "learning_rate": 4.102914622588911e-06, + "loss": 2.5767, + "step": 56904 + }, + { + "epoch": 2.649347021440045, + "grad_norm": 0.2993486223243914, + "learning_rate": 4.1018400927334225e-06, + "loss": 2.7004, + "step": 56905 + }, + { + "epoch": 2.6493935796261376, + "grad_norm": 0.29509463111812867, + "learning_rate": 4.100765697584513e-06, + "loss": 2.6119, + "step": 56906 + }, + { + "epoch": 2.6494401378122308, + "grad_norm": 0.2879571286909394, + "learning_rate": 4.0996914371453475e-06, + "loss": 2.5778, + "step": 56907 + }, + { + "epoch": 2.649486695998324, + "grad_norm": 0.2935910418288438, + "learning_rate": 4.098617311419051e-06, + "loss": 2.5933, + "step": 56908 + }, + { + "epoch": 2.649533254184417, + "grad_norm": 0.29560981631495215, + "learning_rate": 4.097543320408792e-06, + "loss": 2.587, + "step": 56909 + }, + { + "epoch": 2.64957981237051, + "grad_norm": 0.3051140746886319, + "learning_rate": 4.096469464117725e-06, + "loss": 2.6434, + "step": 56910 + }, + { + "epoch": 2.649626370556603, + "grad_norm": 0.28576244962476205, + "learning_rate": 4.095395742548997e-06, + "loss": 2.5349, + "step": 56911 + }, + { + "epoch": 2.6496729287426963, + "grad_norm": 0.28513095944912664, + "learning_rate": 4.0943221557057656e-06, + "loss": 2.6913, + "step": 56912 + }, + { + "epoch": 2.649719486928789, + "grad_norm": 0.29395092130328776, + "learning_rate": 4.093248703591162e-06, + "loss": 2.631, + "step": 56913 + }, + { + "epoch": 2.649766045114882, + "grad_norm": 0.28269085073534217, + "learning_rate": 4.092175386208363e-06, + "loss": 2.6412, + "step": 56914 + }, + { + "epoch": 2.6498126033009752, + "grad_norm": 0.2899992859747395, + "learning_rate": 4.091102203560493e-06, + "loss": 2.6804, + "step": 56915 + }, + { + "epoch": 2.6498591614870683, + "grad_norm": 0.3179598970260184, + "learning_rate": 4.090029155650732e-06, + "loss": 2.6116, + "step": 56916 + }, + { + "epoch": 2.6499057196731615, + "grad_norm": 
0.2991658507546411, + "learning_rate": 4.088956242482206e-06, + "loss": 2.5041, + "step": 56917 + }, + { + "epoch": 2.6499522778592546, + "grad_norm": 0.30379365133713776, + "learning_rate": 4.087883464058068e-06, + "loss": 2.6605, + "step": 56918 + }, + { + "epoch": 2.6499988360453477, + "grad_norm": 0.3000944685277376, + "learning_rate": 4.08681082038147e-06, + "loss": 2.6092, + "step": 56919 + }, + { + "epoch": 2.650045394231441, + "grad_norm": 0.3173563704022479, + "learning_rate": 4.085738311455562e-06, + "loss": 2.6024, + "step": 56920 + }, + { + "epoch": 2.650091952417534, + "grad_norm": 0.32126173001376823, + "learning_rate": 4.084665937283494e-06, + "loss": 2.618, + "step": 56921 + }, + { + "epoch": 2.650138510603627, + "grad_norm": 0.30441429345481286, + "learning_rate": 4.083593697868399e-06, + "loss": 2.6603, + "step": 56922 + }, + { + "epoch": 2.65018506878972, + "grad_norm": 0.2944792694296919, + "learning_rate": 4.082521593213434e-06, + "loss": 2.5073, + "step": 56923 + }, + { + "epoch": 2.6502316269758133, + "grad_norm": 0.3107540943774221, + "learning_rate": 4.081449623321743e-06, + "loss": 2.5973, + "step": 56924 + }, + { + "epoch": 2.650278185161906, + "grad_norm": 0.29020520154833934, + "learning_rate": 4.080377788196482e-06, + "loss": 2.6378, + "step": 56925 + }, + { + "epoch": 2.650324743347999, + "grad_norm": 0.31443970809365196, + "learning_rate": 4.079306087840773e-06, + "loss": 2.6103, + "step": 56926 + }, + { + "epoch": 2.650371301534092, + "grad_norm": 0.30269544131341025, + "learning_rate": 4.078234522257795e-06, + "loss": 2.5065, + "step": 56927 + }, + { + "epoch": 2.6504178597201853, + "grad_norm": 0.28373284922399633, + "learning_rate": 4.0771630914506535e-06, + "loss": 2.6014, + "step": 56928 + }, + { + "epoch": 2.6504644179062784, + "grad_norm": 0.2960300424853327, + "learning_rate": 4.076091795422537e-06, + "loss": 2.6831, + "step": 56929 + }, + { + "epoch": 2.6505109760923715, + "grad_norm": 0.30042438103335023, + "learning_rate": 4.075020634176551e-06, + "loss": 2.6936, + "step": 56930 + }, + { + "epoch": 2.6505575342784646, + "grad_norm": 0.29158912725456393, + "learning_rate": 4.07394960771586e-06, + "loss": 2.574, + "step": 56931 + }, + { + "epoch": 2.6506040924645573, + "grad_norm": 0.2891671812627209, + "learning_rate": 4.072878716043604e-06, + "loss": 2.6092, + "step": 56932 + }, + { + "epoch": 2.6506506506506504, + "grad_norm": 0.28538115022415655, + "learning_rate": 4.071807959162921e-06, + "loss": 2.7113, + "step": 56933 + }, + { + "epoch": 2.6506972088367435, + "grad_norm": 0.2916033607634191, + "learning_rate": 4.070737337076963e-06, + "loss": 2.6454, + "step": 56934 + }, + { + "epoch": 2.6507437670228366, + "grad_norm": 0.30844195290940507, + "learning_rate": 4.069666849788861e-06, + "loss": 2.6158, + "step": 56935 + }, + { + "epoch": 2.6507903252089298, + "grad_norm": 0.29475085562775427, + "learning_rate": 4.068596497301763e-06, + "loss": 2.5677, + "step": 56936 + }, + { + "epoch": 2.650836883395023, + "grad_norm": 0.29707959507943754, + "learning_rate": 4.067526279618811e-06, + "loss": 2.5776, + "step": 56937 + }, + { + "epoch": 2.650883441581116, + "grad_norm": 0.2930739480745956, + "learning_rate": 4.06645619674314e-06, + "loss": 2.5106, + "step": 56938 + }, + { + "epoch": 2.650929999767209, + "grad_norm": 0.2994644696845287, + "learning_rate": 4.065386248677893e-06, + "loss": 2.62, + "step": 56939 + }, + { + "epoch": 2.6509765579533022, + "grad_norm": 0.29533552845645283, + "learning_rate": 4.064316435426224e-06, + "loss": 2.7041, + 
"step": 56940 + }, + { + "epoch": 2.6510231161393953, + "grad_norm": 0.30228301087213666, + "learning_rate": 4.0632467569912455e-06, + "loss": 2.5984, + "step": 56941 + }, + { + "epoch": 2.6510696743254885, + "grad_norm": 0.29866420888830897, + "learning_rate": 4.0621772133761284e-06, + "loss": 2.5268, + "step": 56942 + }, + { + "epoch": 2.6511162325115816, + "grad_norm": 0.29329292624504955, + "learning_rate": 4.061107804583986e-06, + "loss": 2.6248, + "step": 56943 + }, + { + "epoch": 2.6511627906976747, + "grad_norm": 0.29297254613102874, + "learning_rate": 4.0600385306179655e-06, + "loss": 2.6199, + "step": 56944 + }, + { + "epoch": 2.6512093488837674, + "grad_norm": 0.2815078682169225, + "learning_rate": 4.05896939148121e-06, + "loss": 2.5512, + "step": 56945 + }, + { + "epoch": 2.6512559070698605, + "grad_norm": 0.2901555156277925, + "learning_rate": 4.05790038717685e-06, + "loss": 2.5884, + "step": 56946 + }, + { + "epoch": 2.6513024652559536, + "grad_norm": 0.3047021942226845, + "learning_rate": 4.056831517708032e-06, + "loss": 2.5711, + "step": 56947 + }, + { + "epoch": 2.6513490234420467, + "grad_norm": 0.28612508608597104, + "learning_rate": 4.0557627830778825e-06, + "loss": 2.5577, + "step": 56948 + }, + { + "epoch": 2.65139558162814, + "grad_norm": 0.2859663962681737, + "learning_rate": 4.054694183289542e-06, + "loss": 2.5783, + "step": 56949 + }, + { + "epoch": 2.651442139814233, + "grad_norm": 0.29581099189282206, + "learning_rate": 4.053625718346149e-06, + "loss": 2.5832, + "step": 56950 + }, + { + "epoch": 2.651488698000326, + "grad_norm": 0.3032978520616297, + "learning_rate": 4.052557388250839e-06, + "loss": 2.5759, + "step": 56951 + }, + { + "epoch": 2.6515352561864187, + "grad_norm": 0.3120609212057174, + "learning_rate": 4.051489193006741e-06, + "loss": 2.6513, + "step": 56952 + }, + { + "epoch": 2.651581814372512, + "grad_norm": 0.30661289145013565, + "learning_rate": 4.05042113261701e-06, + "loss": 2.6007, + "step": 56953 + }, + { + "epoch": 2.651628372558605, + "grad_norm": 0.29296905860828504, + "learning_rate": 4.04935320708475e-06, + "loss": 2.6886, + "step": 56954 + }, + { + "epoch": 2.651674930744698, + "grad_norm": 0.30241575253240355, + "learning_rate": 4.048285416413122e-06, + "loss": 2.6467, + "step": 56955 + }, + { + "epoch": 2.651721488930791, + "grad_norm": 0.29957470146668974, + "learning_rate": 4.047217760605237e-06, + "loss": 2.5849, + "step": 56956 + }, + { + "epoch": 2.6517680471168843, + "grad_norm": 0.3024378247518757, + "learning_rate": 4.046150239664259e-06, + "loss": 2.6808, + "step": 56957 + }, + { + "epoch": 2.6518146053029774, + "grad_norm": 0.2910135862371541, + "learning_rate": 4.045082853593291e-06, + "loss": 2.6334, + "step": 56958 + }, + { + "epoch": 2.6518611634890705, + "grad_norm": 0.2910023298136258, + "learning_rate": 4.044015602395479e-06, + "loss": 2.6721, + "step": 56959 + }, + { + "epoch": 2.6519077216751636, + "grad_norm": 0.31719705015588395, + "learning_rate": 4.042948486073961e-06, + "loss": 2.6019, + "step": 56960 + }, + { + "epoch": 2.6519542798612568, + "grad_norm": 0.28815497712452887, + "learning_rate": 4.041881504631845e-06, + "loss": 2.6875, + "step": 56961 + }, + { + "epoch": 2.65200083804735, + "grad_norm": 0.2895540673314581, + "learning_rate": 4.040814658072295e-06, + "loss": 2.7275, + "step": 56962 + }, + { + "epoch": 2.652047396233443, + "grad_norm": 0.30627570111847874, + "learning_rate": 4.03974794639842e-06, + "loss": 2.6086, + "step": 56963 + }, + { + "epoch": 2.6520939544195357, + "grad_norm": 
0.32300838142000154, + "learning_rate": 4.0386813696133565e-06, + "loss": 2.6802, + "step": 56964 + }, + { + "epoch": 2.6521405126056288, + "grad_norm": 0.29494370225296734, + "learning_rate": 4.037614927720234e-06, + "loss": 2.6429, + "step": 56965 + }, + { + "epoch": 2.652187070791722, + "grad_norm": 0.28600436130883605, + "learning_rate": 4.036548620722191e-06, + "loss": 2.6368, + "step": 56966 + }, + { + "epoch": 2.652233628977815, + "grad_norm": 0.30047881565343926, + "learning_rate": 4.035482448622335e-06, + "loss": 2.5995, + "step": 56967 + }, + { + "epoch": 2.652280187163908, + "grad_norm": 0.28909623140635543, + "learning_rate": 4.03441641142383e-06, + "loss": 2.5833, + "step": 56968 + }, + { + "epoch": 2.6523267453500012, + "grad_norm": 0.28895263025005674, + "learning_rate": 4.033350509129763e-06, + "loss": 2.5756, + "step": 56969 + }, + { + "epoch": 2.6523733035360944, + "grad_norm": 0.2989650343050983, + "learning_rate": 4.032284741743303e-06, + "loss": 2.578, + "step": 56970 + }, + { + "epoch": 2.6524198617221875, + "grad_norm": 0.2810329018291172, + "learning_rate": 4.0312191092675474e-06, + "loss": 2.6798, + "step": 56971 + }, + { + "epoch": 2.65246641990828, + "grad_norm": 0.3041896355555858, + "learning_rate": 4.030153611705639e-06, + "loss": 2.6276, + "step": 56972 + }, + { + "epoch": 2.6525129780943733, + "grad_norm": 0.2837049515536921, + "learning_rate": 4.029088249060703e-06, + "loss": 2.6314, + "step": 56973 + }, + { + "epoch": 2.6525595362804664, + "grad_norm": 0.2803556690883864, + "learning_rate": 4.0280230213358526e-06, + "loss": 2.4896, + "step": 56974 + }, + { + "epoch": 2.6526060944665595, + "grad_norm": 0.2985789240272604, + "learning_rate": 4.026957928534242e-06, + "loss": 2.6438, + "step": 56975 + }, + { + "epoch": 2.6526526526526526, + "grad_norm": 0.3043100177881896, + "learning_rate": 4.025892970658973e-06, + "loss": 2.719, + "step": 56976 + }, + { + "epoch": 2.6526992108387457, + "grad_norm": 0.30058561628645175, + "learning_rate": 4.024828147713178e-06, + "loss": 2.4818, + "step": 56977 + }, + { + "epoch": 2.652745769024839, + "grad_norm": 0.30784688307896435, + "learning_rate": 4.0237634596999804e-06, + "loss": 2.6158, + "step": 56978 + }, + { + "epoch": 2.652792327210932, + "grad_norm": 0.2921860616547278, + "learning_rate": 4.022698906622507e-06, + "loss": 2.5775, + "step": 56979 + }, + { + "epoch": 2.652838885397025, + "grad_norm": 0.2824000272732401, + "learning_rate": 4.021634488483888e-06, + "loss": 2.6999, + "step": 56980 + }, + { + "epoch": 2.652885443583118, + "grad_norm": 0.29186098573805425, + "learning_rate": 4.0205702052872495e-06, + "loss": 2.5998, + "step": 56981 + }, + { + "epoch": 2.6529320017692113, + "grad_norm": 0.2841926536807442, + "learning_rate": 4.019506057035688e-06, + "loss": 2.6011, + "step": 56982 + }, + { + "epoch": 2.6529785599553044, + "grad_norm": 0.293479989983487, + "learning_rate": 4.018442043732362e-06, + "loss": 2.6247, + "step": 56983 + }, + { + "epoch": 2.653025118141397, + "grad_norm": 0.29601899561514716, + "learning_rate": 4.0173781653803756e-06, + "loss": 2.4961, + "step": 56984 + }, + { + "epoch": 2.65307167632749, + "grad_norm": 0.2912599247363949, + "learning_rate": 4.016314421982853e-06, + "loss": 2.7513, + "step": 56985 + }, + { + "epoch": 2.6531182345135833, + "grad_norm": 0.2909113833207264, + "learning_rate": 4.015250813542926e-06, + "loss": 2.6254, + "step": 56986 + }, + { + "epoch": 2.6531647926996764, + "grad_norm": 0.29970889578680293, + "learning_rate": 4.014187340063686e-06, + "loss": 2.6573, 
+ "step": 56987 + }, + { + "epoch": 2.6532113508857695, + "grad_norm": 0.3077327635332817, + "learning_rate": 4.013124001548296e-06, + "loss": 2.5765, + "step": 56988 + }, + { + "epoch": 2.6532579090718627, + "grad_norm": 0.2989250062790536, + "learning_rate": 4.0120607979998505e-06, + "loss": 2.6324, + "step": 56989 + }, + { + "epoch": 2.6533044672579558, + "grad_norm": 0.2953916165795204, + "learning_rate": 4.0109977294214725e-06, + "loss": 2.5861, + "step": 56990 + }, + { + "epoch": 2.6533510254440484, + "grad_norm": 0.280925889085118, + "learning_rate": 4.009934795816284e-06, + "loss": 2.6628, + "step": 56991 + }, + { + "epoch": 2.6533975836301416, + "grad_norm": 0.31726556298204545, + "learning_rate": 4.0088719971874075e-06, + "loss": 2.7147, + "step": 56992 + }, + { + "epoch": 2.6534441418162347, + "grad_norm": 0.2986382813775295, + "learning_rate": 4.007809333537965e-06, + "loss": 2.6002, + "step": 56993 + }, + { + "epoch": 2.653490700002328, + "grad_norm": 0.31835429809165694, + "learning_rate": 4.006746804871075e-06, + "loss": 2.5798, + "step": 56994 + }, + { + "epoch": 2.653537258188421, + "grad_norm": 0.29563157556015934, + "learning_rate": 4.005684411189836e-06, + "loss": 2.6388, + "step": 56995 + }, + { + "epoch": 2.653583816374514, + "grad_norm": 0.3098906824581665, + "learning_rate": 4.0046221524974e-06, + "loss": 2.7066, + "step": 56996 + }, + { + "epoch": 2.653630374560607, + "grad_norm": 0.3044475802098603, + "learning_rate": 4.00356002879686e-06, + "loss": 2.7045, + "step": 56997 + }, + { + "epoch": 2.6536769327467002, + "grad_norm": 0.30740575021079336, + "learning_rate": 4.002498040091335e-06, + "loss": 2.7499, + "step": 56998 + }, + { + "epoch": 2.6537234909327934, + "grad_norm": 0.2925883620707721, + "learning_rate": 4.001436186383961e-06, + "loss": 2.5477, + "step": 56999 + }, + { + "epoch": 2.6537700491188865, + "grad_norm": 0.29823741187359176, + "learning_rate": 4.00037446767782e-06, + "loss": 2.5257, + "step": 57000 + } + ], + "logging_steps": 1, + "max_steps": 64434, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5095044138663936.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}